From 3f1086894ebf8c71f503c38b74d646dbe24c368d Mon Sep 17 00:00:00 2001
From: Avijit Ghosh
Date: Sun, 15 Feb 2026 19:42:36 -0500
Subject: [PATCH] Delete data directory

Since we have now moved to GH
---
 .../c8ab4e94-d8e8-417f-be18-fececf3c815c.json | 515 ---
 .../402c8833-1827-46fc-a497-46b40a6794ff.json | 515 ---
 .../acd2082a-ce0c-418f-9383-f3c9f11735a2.json | 515 ---
 .../c65ed336-b283-46c2-8284-c4695cad588d.json | 515 ---
 .../5ebb009d-b548-4f2b-b075-feb76ca295d2.json | 515 ---
 .../c7df2916-bde4-4987-9139-fcfd18a14ac1.json | 515 ---
 .../56ec8ab0-d76d-4c03-953b-a2a4a43af5f4.json | 515 ---
 .../ad3211a9-4390-4247-b64d-600191a88a75.json | 512 ---
 .../1a34326a-f75e-434c-a027-9f8cf7fe8fb9.json | 515 ---
 .../129c8b21-f97e-4284-9574-33d5932332f7.json | 515 ---
 .../3644fd67-0f46-4de3-b542-edf219d0e0cd.json | 515 ---
 .../c0692e14-6484-4d02-8dac-55ce4373fb15.json | 515 ---
 .../ab4940d1-118c-479a-bd37-1ea2da6f02a3.json | 515 ---
 .../85552093-435f-4d85-897d-4e74c3655533.json | 515 ---
 .../4ddc0062-6577-4ab9-85f1-791fd2822776.json | 515 ---
 .../50fc4840-933b-43ec-847e-1834b30f9f14.json | 515 ---
 .../6cdc5384-2be5-47e0-a9b2-9cd6719c1760.json | 515 ---
 .../a668c931-34e4-4702-a84c-97d8c6f59ef4.json | 515 ---
 .../3a7e2aa6-4e57-446f-a127-4a7e022fe3e1.json | 515 ---
 .../938a35f1-195d-49c8-9a16-90fab96692bd.json | 515 ---
 .../ce756801-f75e-4250-9721-1d627a37f055.json | 515 ---
 .../b83b41d4-6c95-4c7d-a290-65d89bf776c2.json | 515 ---
 .../31c3fe1b-be4b-42ef-8ec0-9da323b2ebb6.json | 515 ---
 .../a8e0fc0e-b3a4-4a0b-938f-aa11f1c64358.json | 515 ---
 .../8948bfb0-cc9d-40f7-a02d-d5c9611436d8.json | 352 --
 .../7d2d1dba-1b31-47b2-8308-f2668cf36c99.json | 352 --
 .../3a056f7b-1bdf-4543-9e67-1101ace67179.json | 352 --
 .../275cf2e5-5ccd-40be-be55-938c82ef6688.json | 352 --
 .../43e7be99-4872-4eb1-b30b-75c44b298ab4.json | 345 --
 .../cfc99298-4570-48cf-9187-aa0d167cc0ba.json | 345 --
 .../a2162367-d16d-4274-aa89-43435cea5c0b.json | 345 --
 .../51ef4580-da13-415a-a37f-45e2036ed4c2.json | 345 --
 .../3fa605db-fcff-4f05-9398-6af77c9dcada.json | 345 --
 .../9d58ac39-fef7-47c8-920a-8be2069f5662.json | 345 --
 .../dd9b10af-ad39-45ef-8f91-097340d376c7.json | 345 --
 .../30a6de14-c57c-483e-92e9-26fc4c7f4772.json | 345 --
 .../bed1a799-77a6-40a1-9f37-d54fe9d4d055.json | 345 --
 .../6c226cad-23f1-4c09-8038-eb7b776cdee4.json | 345 --
 .../98887061-09d6-44ba-9cff-0267045a26ef.json | 345 --
 .../6693f0e2-3514-413d-be61-d10f7372b3dc.json | 345 --
 .../ffeaa0b2-fcdb-45dd-a6b4-06b67b9f63fe.json | 345 --
 .../0d9a856d-01bf-4a82-9872-33d561cf4a57.json | 345 --
 .../3ff2ab7d-2c0f-4313-8223-8f514fde595a.json | 345 --
 .../2a46e8da-1996-428c-b567-cd0287b29d9f.json | 345 --
 .../30a92593-398e-4c2f-8be7-455be166aeaf.json | 345 --
 .../e6fd55e0-6ff0-48f1-8b51-5f4372edb457.json | 345 --
 .../dfc2717d-ead8-4287-885e-5e0fc09c35e3.json | 345 --
 .../e97292eb-7031-4a3a-a415-44c137898e3f.json | 345 --
 .../4263a6be-9640-40a1-8881-768624949d47.json | 345 --
 .../a808cecf-8925-428f-99ea-b6c2f8bce96e.json | 345 --
 .../55e44a3b-1fac-4ad5-b25e-85702f33883d.json | 345 --
 .../5b5b339b-7631-4b77-ac51-df49d3e946eb.json | 345 --
 .../eaec6d66-6da7-4592-baca-2539240acc5d.json | 345 --
 .../2db9cde5-4560-4ee4-8ffa-661dfc7db2f7.json | 345 --
 .../eccf77c4-6a65-40b9-9445-dd35dee7f7b8.json | 352 --
 .../75b5943a-67be-4b2f-85da-a52533edc76f.json | 345 --
 .../8bec35b7-271a-457d-b665-9f69baa248aa.json | 345 --
 .../c4e5e54c-dfdc-4f61-8572-bff7fa028a61.json | 345 --
 .../c308b0a5-4c44-4369-9b23-8664959aa927.json | 345 --
 .../1a1edfb2-f0f1-4930-82c0-99293ec76645.json | 345 --
 .../9aa5af51-8c55-4896-b634-162a9d82b58e.json | 345 --
 .../21461a52-2f25-48c9-be19-f9233317d817.json | 345 --
 .../bdea0967-fcc7-493c-a18d-70727842deb9.json | 345 --
 .../f7404ea3-62c7-47fc-9106-44c208470381.json | 345 --
 .../2817820c-4b28-4235-a8fd-ad02d0f504bc.json | 345 --
 .../f3da71fc-fc88-4dda-b423-168d11eab317.json | 345 --
 .../2f7c0db9-b5de-4674-a130-5315520dea68.json | 345 --
 .../4dcb8022-fe54-42f7-b43f-9866de173731.json | 345 --
 .../c436f3d1-84ee-49df-9287-0305925f7cf4.json | 345 --
 .../90ba0b16-b866-4b18-bd84-6a8cd1c47c47.json | 345 --
 .../07c823ba-9e17-47e4-858b-a1f2a514a276.json | 345 --
 .../eb1bb443-71ad-4b79-8308-2b66c5e8c631.json | 345 --
 .../e14d42a9-9639-4c35-8a0c-e395e754c46c.json | 345 --
 .../3754df44-ddce-4a66-9074-f65f5677ae27.json | 345 --
 .../a540b282-e9d6-403e-96df-a1d27ad14d3a.json | 345 --
 .../758851b3-9ac9-43d8-8b6a-3d9688752d80.json | 345 --
 .../1d9ac688-ca0d-405b-a262-e95673e79250.json | 345 --
 .../c1e593d9-08ba-40fe-b02f-1c95be8fdfc9.json | 345 --
 .../35a31e19-2ef5-4caa-a848-422af42adab8.json | 345 --
 .../7de0bda2-ce56-444a-b293-a310a5b2d7ab.json | 345 --
 .../dd22f29b-f8b8-4c59-9f26-f6633bbbdc8b.json | 345 --
 .../9eb537b9-9e2d-4d0a-bfa5-644a18f4db0e.json | 345 --
 .../07763926-3a19-43f9-a23f-095f6cb78799.json | 345 --
 .../56e024b3-c963-4172-9f52-7605276b3854.json | 345 --
 .../6f660e47-1d86-473d-9864-208111dcea31.json | 345 --
 .../91ef1f96-a708-4c53-ac9d-208ef3420668.json | 345 --
 .../c14bea74-0aa3-4dde-8ca1-cbc4ab3de1cc.json | 345 --
 .../505c6245-88d1-4557-9e34-63a4e8086210.json | 345 --
 .../9a473236-f187-4926-ae8a-e8b84fe2a060.json | 345 --
 .../1d7ece9b-1dcf-4adf-aa16-b030e286c26e.json | 345 --
 .../aeabfb59-74db-445c-9693-7a088ac5073c.json | 345 --
 .../eb2f8883-30ee-42e1-95b5-48dcf988ecf5.json | 345 --
 .../12fdea65-94eb-4c85-876c-65f0528bde12.json | 1613 ---------
 .../d3519b2c-2e74-4e5f-8e2a-ab13446d126a.json | 1613 ---------
 .../1f2516b9-54b6-4dcf-a575-734c0d0b17b5.json | 1613 ---------
 .../deddbc80-70ac-43e7-b052-753d127f8390.json | 1613 ---------
 .../e4780862-bf3c-4856-b1e7-02616afe931a.json | 1613 ---------
 .../cd1ec0ed-44cb-4e99-b58d-f026c3172f8c.json | 1613 ---------
 .../13a22d40-f274-4384-adcc-1539da821c6a.json | 1613 ---------
 .../a01f642e-730b-461d-8afe-9c077ab3f149.json | 1613 ---------
 .../813802a3-483e-443d-9e49-7cd581b5ea6d.json | 1613 ---------
 .../90e7bfa7-af3a-4979-b0d1-9d75db6e4e30.json | 1613 ---------
 .../d113c21d-7c89-4cde-98b8-0c2f8d03fdf6.json | 1613 ---------
 .../3dc29785-a884-4496-a6f4-a8bf19892e50.json | 1613 ---------
 .../ff8dc291-bbaf-4149-854e-e1780b0c86d5.json | 1613 ---------
 .../b8932181-b669-4b0e-8879-1dfbf9afea12.json | 1613 ---------
 .../c8f6f90c-39f6-4685-9d2d-8964c3d2ba02.json | 1613 ---------
 .../579fb908-3c36-4ff8-a262-fd5388806b83.json | 1613 ---------
 .../68ff9f10-0357-4ea8-b758-de6c7f51d669.json | 1613 ---------
 .../b1ecc2b8-6461-4d70-b639-df3dc2594a5b.json | 1613 ---------
 .../8e4f9ef2-8423-491d-b5e9-06128eb8fd32.json | 1613 ---------
 .../8d2665d6-55fb-4d0c-8d6d-48cd43f27ff2.json | 1613 ---------
 .../6bbe052f-46f7-4541-80a3-dbb86433db7a.json | 1613 ---------
 .../9b91f415-6edf-4a2f-a3ff-a9dac8343ebd.json | 1613 ---------
 .../742a59e8-c813-42ef-938a-4897e25dcdad.json | 1613 ---------
 .../5dec6a7d-2710-49c2-889d-c7b8ee203ce4.json | 1613 ---------
 .../509360bc-86f5-49dc-899c-2899d8b6bc6c.json | 1613 ---------
 .../8f54f091-46d0-4a9a-9b22-a97a7e3972c0.json | 1613 ---------
 .../8f152c7d-5fba-476e-82c1-4f34a6e7d7e0.json | 1613 ---------
 .../7c0d2405-f12e-4a3b-924f-1b2a86fd4eae.json | 1613 ---------
 .../d65d8f48-8b8e-4ec6-af68-f61af5408adf.json | 1613 ---------
 .../dff69882-cb8b-4323-b587-60f295085459.json | 1613 ---------
 .../90220411-5e4d-4b74-a74c-ca2ad030d50e.json | 1613 ---------
 .../8c2465b2-deca-476c-bb41-836685ceab35.json | 1613 ---------
 .../4b0f6a03-1054-4047-82d1-53992f0378ee.json | 1613 ---------
 .../78bc128a-6e53-4086-9498-2b3428e1d884.json | 1613 ---------
 .../2be7887e-6c91-437c-bbfc-8b68de3330da.json | 1613 ---------
 .../f135ce21-655f-4ebf-9cc6-d83ada0f177b.json | 1613 ---------
 .../48912a61-af54-4208-b36d-2f3a283e5c5d.json | 1613 ---------
 .../cc85315f-4472-4b22-9f0a-e4609676ce13.json | 1613 ---------
 .../ab773619-db5e-449b-8d6b-da743cb038bb.json | 1613 ---------
 .../5f5bde4c-aa06-41f2-abaf-67901f62a8a1.json | 1613 ---------
 .../32cc2aa3-be26-41bd-8124-a8b1073c84c4.json | 1613 ---------
 .../42a86a4a-7e76-4c7d-af48-e765a38df589.json | 1613 ---------
 .../f9746ed1-887f-4850-ac2d-700de18acbaf.json | 1613 ---------
 .../899521d0-e5eb-4e1b-af5a-78b3bd32e232.json | 1613 ---------
 .../1fb2c6db-2495-4609-a96b-57815c579953.json | 1613 ---------
 .../a5b6cc8b-676d-4c19-8093-0b893937e3d4.json | 1613 ---------
 .../0d4d42b2-d90c-418a-b3e3-c2d59453bacf.json | 1613 ---------
 .../bc207557-fb49-4a87-8401-22c3ce853e7c.json | 1613 ---------
 .../895266ee-71a5-4ca5-b3f9-62df6383ff95.json | 1613 ---------
 .../8828e9e8-5716-41b4-a2d1-233bb056dc32.json | 1613 ---------
 .../f267ba72-b239-4126-99c5-675f79b1ae95.json | 1613 ---------
 .../f386e763-8078-454b-bd14-32b106663d53.json | 1613 ---------
 .../a4739cda-028b-48e0-b3b5-ca9b583d03f5.json | 1613 ---------
 .../837e20ff-fed1-4431-b643-63b904055c66.json | 1613 ---------
 .../e411f017-22c6-4d49-9bf9-5d99c1091791.json | 1613 ---------
 .../7bd2b266-5a65-4c63-bf18-5e4114564bfc.json | 1613 ---------
 .../49a1423e-d5f4-4665-b81e-d491f492a316.json | 1613 ---------
 .../8ec2c3d9-c84e-4742-a760-2d33ddf47eab.json | 1613 ---------
 .../ec39cb88-fbd3-4cfb-9a11-571ef43e193e.json | 1613 ---------
 .../a2b4ed40-b04f-481f-986b-25a2c26bbb79.json | 1613 ---------
 .../e88f9163-5334-43ed-9b56-154bf543f898.json | 1613 ---------
 .../6d436bd5-9d49-4895-8c07-7814b2eef12c.json | 1613 ---------
 .../681d0d6d-de06-4b8e-a7e2-964d98e2806e.json | 1613 ---------
 .../e79e0c17-2e9b-4b99-85e4-7f15e1a337f7.json | 1613 ---------
 .../cb80bd5f-204a-4dd8-96ec-40c7df93975f.json | 1613 ---------
 .../f84f84a8-7191-42ac-8951-5d7141a0f700.json | 1613 ---------
 .../9ba74767-b675-460a-bb68-e82adb6acd2f.json | 1613 ---------
 .../e5e98ffa-3c2d-42d4-86a9-0cb46a71c684.json | 267 --
 .../60724488-914d-4efe-98d6-f3ff26fe8fbc.json | 267 --
 .../2aaae404-b510-41e0-9a4a-b2d053731454.json | 267 --
 .../053badb4-b50a-434a-909c-c4d939c00b4e.json | 267 --
 .../7b4a4c6d-e302-4010-a099-5b01c874ffe8.json | 641 ----
 .../db0c0e0c-fcc3-400a-88b4-230ba2929e0f.json | 641 ----
 .../f6808908-79d9-4de5-8434-94e4bdb854f2.json | 643 ----
 .../1a039ef6-5957-4246-82b2-bc607b6554e7.json | 641 ----
 .../fb3abb62-b76b-4a4e-a01f-bc62deba6b36.json | 641 ----
 .../0e2790d3-40f1-4124-ba41-b65bd9de1852.json | 641 ----
 .../d55129d3-4eae-4009-a897-fa1624cea6a2.json | 641 ----
 .../6332f0b3-7fab-41ed-a8da-46b142051377.json | 641 ----
 .../0cb33741-ca10-40f5-90d3-28e300901ad3.json | 643 ----
 .../80b60ccd-4711-4bce-a0f7-33d5b14fa97d.json | 643 ----
 .../de41775f-f60e-481e-a8ef-3df9a9b65a5a.json | 642 ----
 .../bc29d5c6-b5c8-473b-b69c-054026829089.json | 641 ----
 .../ad7e1abd-0263-4971-b37a-b1ca4cb0a8e9.json | 644 ----
 .../4e131240-d66c-4f95-a2c8-7fabbe8b2c25.json | 644 ----
 .../9ef56d5a-de00-4d89-930c-a4c74211dd78.json | 644 ----
 .../5598d3ed-5b37-4aec-b186-0b16c394633b.json | 641 ----
 .../a039c598-3f93-4f59-a8c4-f1ae3d7b241c.json | 641 ----
 .../54bac699-aa82-4133-8c10-c6510c2a7f95.json | 644 ----
 .../79b23601-3148-4256-88ce-67e439a87c5b.json | 641 ----
 .../e92648e4-75c6-4944-9ec1-880823fefc87.json | 641 ----
 .../449feffd-d2e3-4a08-ad69-b8ad522532ae.json | 641 ----
 .../d297b253-0f4f-4caf-864b-9f457ab589da.json | 641 ----
 .../d7a7e038-0985-4ee2-a549-0906b3aa8cc5.json | 641 ----
 .../cb409208-034d-42fd-acce-ab5cc4227383.json | 641 ----
 .../b2572ef8-446a-45b4-b557-45736418753b.json | 641 ----
 .../70d85516-b710-4b27-b664-03a6a822773b.json | 641 ----
 .../a8208df4-eb37-47d2-8845-f821e80e9858.json | 643 ----
 .../22cde248-40ab-43b0-a408-6d8b84692f22.json | 643 ----
 .../b0f85fd8-cfab-4fe0-8b36-7ea97e99a023.json | 641 ----
 .../ec27e9fc-166d-454b-90c7-2eb8195ae2e2.json | 641 ----
 .../8721a15b-9102-4b1a-bde8-e5371f00f1b5.json | 641 ----
 .../23b3a30c-8aa3-4684-be54-adae003720fc.json | 643 ----
 .../7022c444-d6b8-4374-be0c-14835e5fd281.json | 643 ----
 .../bc93fd3d-b6cc-4c03-8c71-d8f1f5ef5957.json | 643 ----
 .../bc7b0ecf-f2a9-44c2-8949-bbfe762f1b72.json | 641 ----
 .../527418d0-2591-43c9-b639-17328292b110.json | 643 ----
 .../8ddc465f-4f2d-4213-81c4-70b584d48047.json | 641 ----
 .../eca63d17-7fc2-4722-8bb3-0be99a257100.json | 644 ----
 .../e40a10b3-e682-4715-b2ee-4efcae050a58.json | 641 ----
 .../56425fda-a1f4-40cc-82f7-6a56ab2ccfaf.json | 643 ----
 .../f47ca10d-cd45-485e-b9cf-0c6592d63656.json | 641 ----
 .../7f0e318e-31bf-4044-bffb-357c1238d4fd.json | 641 ----
 .../818d6d72-0b5c-4fcf-b808-1d186223301e.json | 641 ----
 .../f09b853b-dbbc-4252-a0f0-a2c45c29f670.json | 641 ----
 .../f84d3cf5-0f7d-481e-b782-a5c98cf9faec.json | 641 ----
 .../83c6a723-87a0-43d4-968e-86d186578e9e.json | 641 ----
 .../daaf221b-1759-4619-91fb-938e81975787.json | 641 ----
 .../6b528e49-fec4-4b63-bfb5-1b0df021f3c2.json | 641 ----
 .../1043b815-b247-4444-bf8c-0b92b793c57f.json | 643 ----
 .../28bc8f72-7b91-47fc-b10e-cd268cbc1caf.json | 643 ----
 .../73dedd31-7d40-4ee6-994d-00eb7d656597.json | 643 ----
 .../18da1dfa-5366-477b-a9cf-af29c5a99b68.json | 643 ----
 .../80057cc1-45ab-4976-878e-be963eaa83b1.json | 643 ----
 .../d896249f-bbd9-4657-a5db-5968544cb5fa.json | 643 ----
 .../9f73f3e5-b573-45d4-8c98-82f5c496f786.json | 641 ----
 .../a06a38e5-c198-4efd-96f3-b52bd7f9c6dc.json | 641 ----
 .../4ff688da-61a0-43ce-9c2d-e1c197887683.json | 641 ----
 .../181003ea-7587-4c93-8b89-c5c76958313d.json | 641 ----
 .../66688228-e59a-4caa-b3fb-c5df1efc9db4.json | 643 ----
 .../2d7d8bac-714e-49a8-a1a7-d88d759fe60a.json | 641 ----
 .../077fe37f-b3a4-483a-93a5-034c6445fe98.json | 641 ----
 .../4fbb173c-b900-4e11-87bd-1ac6a489d014.json | 643 ----
 .../e56e8834-27d7-44e7-b5bb-907a4d7b6a58.json | 641 ----
 .../0925f9b7-08f8-485f-84bc-a153a54aa417.json | 641 ----
 .../08082277-8305-4007-97cd-88202fc0115c.json | 641 ----
 .../fe554cbd-2480-40bd-b2f5-464cad700c14.json | 641 ----
 .../9d048af8-b1cb-49cb-b8ab-ab0948deacd7.json | 643 ----
 .../d9654997-1d3e-41c3-9f16-05a36dde9b02.json | 641 ----
 .../73d6b1fe-3f58-4640-b24b-e12b9ea1aca3.json | 641 ----
 .../4d01d929-b5e2-42dc-89ee-20560f560db5.json | 641 ----
 .../76c78ade-2ad6-4c85-93c9-65c4b6b249b7.json | 643 ----
 .../69ea0ef0-c136-4cff-9607-6ae12e0692c3.json | 643 ----
 .../bbe708f3-fb78-49e9-876d-cae57f1231cc.json | 643 ----
 .../ab7b7951-0792-4538-8a7a-6baee8602cbb.json | 643 ----
 .../fc94c95d-9678-4f23-b82f-190a08ece307.json | 641 ----
 .../3f92e2fc-9831-4c2c-b94e-af33d457fa82.json | 641 ----
 .../3e3c79f0-5fb8-4a3f-8c9b-53f742ec2f43.json | 643 ----
 .../6b2891bd-2444-4286-8ccf-c91181856d29.json | 641 ----
 .../bd924bd3-e13c-48e0-b339-8c15c5072038.json | 641 ----
 .../b8a6f32a-9904-43bb-9add-89404093a9db.json | 641 ----
 .../c49e4b98-49c5-485b-8f16-0eeed2d9cd82.json | 641 ----
 .../9c1fc50a-437d-458b-926c-33cabdcc4aeb.json | 643 ----
 .../5e0e911a-79b0-46fe-88eb-f9ae8cbdd642.json | 644 ----
 .../10e1abfa-83de-4960-8d4c-c5099894cb80.json | 644 ----
 .../40aa244f-a5dd-4e02-9ca5-6edaf755b79f.json | 643 ----
 .../2abf3bb8-a78f-4a59-807e-52da4e6426fd.json | 641 ----
 .../ae28615a-b7fa-4782-89e1-4b8e4804dc62.json | 641 ----
 .../52bb6ab9-e80b-4bf0-a375-7706f16d311d.json | 643 ----
 .../fcf9d3dd-8b31-4ab9-98c4-d2712eebf867.json | 649 ----
 .../1158720a-9a0e-492e-a677-9b0936f4cde5.json | 641 ----
 .../254ded81-4051-420d-b402-2e7b80a23848.json | 641 ----
 .../ce5acf2d-e5c6-42b8-ac8e-622a755300b8.json | 3021 -----------------
 .../7f2975a3-1cd5-4102-bb0c-f0f329db9d2d.json | 3021 -----------------
 .../5b5508aa-2956-4a38-84e2-c50b9ce08dc9.json | 3021 -----------------
 .../0e14f2da-72a0-451a-ad35-d8ecd9e27d3f.json | 3021 -----------------
 .../92e0b1b9-c167-4e07-b770-2b78527eb4eb.json | 3021 -----------------
 .../3da06ad4-0770-45f5-a6a2-9ef9500cef05.json | 3021 -----------------
 .../c1c79360-60bd-4f5d-a746-e0411b94f69b.json | 3021 -----------------
 .../bb904716-048c-4b41-9f64-4d17c485afe3.json | 3021 -----------------
 .../063bd04d-e0d8-426a-a56a-062f7bc1a4e4.json | 3021 -----------------
 .../c8949c55-8987-4ed3-b74b-8b13b4381806.json | 3021 -----------------
 .../ecfa0e26-edff-46e4-8954-6f07a0e6fca0.json | 3021 -----------------
 .../bc9cedd7-5cb2-44b2-abda-470322570e14.json | 3021 -----------------
 .../305a7f25-6e22-4146-9678-6a687a701567.json | 3021 -----------------
 .../c6059976-85a1-40ce-b02f-67e182aa2f7d.json | 3021 -----------------
 .../6a59feac-f2d5-4eaf-a440-036b0acfbfc0.json | 3021 -----------------
 .../f397ca7a-41c4-4926-b075-2523639f0a50.json | 3021 -----------------
 .../acdf4701-e1c2-4867-bd85-d34ae8fb0991.json | 3021 -----------------
 .../3cd855af-9679-4fd0-bc3f-34db697c7855.json | 3021 -----------------
 .../78fb6814-e32f-4b15-b958-9e001637ba07.json | 3021 -----------------
 .../f3bccdeb-88a2-46ce-bfc9-5d5c3a7e8708.json | 3021 -----------------
 .../cefc3b25-0779-4fb3-93a5-3c7a285304af.json | 3021 -----------------
 .../7e00e082-0e79-45e0-b0ff-5458cc2aff85.json | 3021 -----------------
 .../ee5528b4-b4a5-423f-8149-6c1dc4d2096d.json | 3021 -----------------
 .../c97b0f33-eda0-4069-9ab6-f277c1f8e55b.json | 3021 -----------------
 .../7ea5b404-d98f-4282-81d8-6ca5f6629429.json | 3021 -----------------
 .../7056c7e7-f68a-4764-aa48-a8368ae2e317.json | 3021 -----------------
 .../5e67014d-6ca1-4e65-a85a-84d91e147d4d.json | 3021 -----------------
 .../3e82f5a5-b80a-4f2f-a262-43c6ee50fbf8.json | 3021 -----------------
 .../46d5e547-507e-4c98-98a9-bad1bfad7f7b.json | 3021 -----------------
 .../ce32874c-ceb9-4e6b-96bc-ff56cb99be5d.json | 3021 -----------------
 .../2b31b441-caa9-465c-a2d2-051c951c7be3.json | 3021 -----------------
 .../b7ea6c93-af70-4c0f-ba50-03a539416a8b.json | 3021 -----------------
 .../fe4cec30-e483-49a8-80ea-00b2c6231740.json | 3021 -----------------
 .../53fe520f-4dbc-436a-b9d6-4a5067c30ebd.json | 3021 -----------------
 .../af88b02d-cb29-4d2c-bb33-5fddcf316a95.json | 3021 -----------------
 .../a0abcd19-58a1-478a-9786-d044a4181241.json | 3021 -----------------
 .../95eda13a-cd34-4170-b2db-f2ead47250f9.json | 3021 -----------------
 .../7f37161a-3f1c-4bc4-860f-8fdbf623f63e.json | 3021 -----------------
 .../9da7439c-e96b-444f-b4fa-7ef638080740.json | 3021 -----------------
 .../294b22a0-1676-4d8c-8ad2-5cdc40267255.json | 3021 -----------------
 .../1c11950d-bd2f-407b-928a-5cd33a0a3d6e.json | 3021 -----------------
 .../78f2484e-bc73-4026-929b-db345e92cf5a.json | 3021 -----------------
 .../8ddf9de8-2ee3-4a30-9250-30fd027c63b4.json | 3021 -----------------
 .../41af381a-3637-4578-a582-59d9b1327d95.json | 3021 -----------------
 .../96544ff3-225e-4f8f-82fb-2e3c42d5ba89.json | 3021 -----------------
 .../bb6fd9af-5dd0-4590-b6ea-7687029ca18c.json | 3021 -----------------
 .../e036de72-b425-4aa5-9448-dc52560e60db.json | 3021 -----------------
 .../65423181-18f1-4296-98c2-171356106404.json | 3021 -----------------
 .../41c3f46d-c798-422c-8b6a-b176ffa8e8ae.json | 3021 -----------------
 .../f78d6e0a-a397-4a41-a37e-696bda5a1987.json | 3021 -----------------
 .../d2bf70ce-341f-49d7-bd03-87b523826953.json | 3021 -----------------
 .../b20860aa-fb88-46b8-a79b-fa71a79c7d4d.json | 3021 -----------------
 .../08590b6e-7050-413d-844b-1f3f1c5aa444.json | 3021 -----------------
 .../2d18fd88-73b5-4d4c-a1cc-e66a20316605.json | 3021 -----------------
 .../567918be-be6f-4e41-b613-727828fe8a44.json | 3021 -----------------
 .../c2be131b-808c-4947-b24f-69ef6af499d7.json | 3021 -----------------
 .../24955250-a2e9-475f-a866-30a835579e03.json | 3021 -----------------
 .../de6f7e19-b54a-4bd3-b624-29f66afbee15.json | 3021 -----------------
 .../e4c3032d-04e0-414b-a7e9-e30756d82000.json | 3021 -----------------
 .../e9a41d4b-56c7-47f0-b439-72ad1e463000.json | 3021 -----------------
 .../a2b7c0ec-fb74-4698-80ad-f054039ecb3f.json | 3021 -----------------
 .../fd6aea24-dc18-41ce-bc19-23f461a39032.json | 3021 -----------------
 .../625d33ce-a320-4bfd-a962-451b8c22d392.json | 3021 -----------------
 .../e51be257-610e-4d38-b58a-a3b29fc06a83.json | 3021 -----------------
 .../9e0b9f48-f913-4bbe-a135-59e596c9e479.json | 3021 -----------------
 .../189e6cc5-1c8f-4712-8dda-c108f18f836d.json | 3021 -----------------
 .../4f043e7b-dfb5-4de5-a034-c4b0a335a8b3.json | 3021 -----------------
 .../ff9b6c57-cadd-4d5d-92cb-62be63939b1b.json | 3021 -----------------
 .../fa6a6772-671b-402e-9480-d61e0fb4a61e.json | 3021 -----------------
 .../b5279e94-ae7f-4671-9315-874e162a24fd.json | 3021 -----------------
 .../de00e8da-9c83-40df-b642-b94719ce1ac2.json | 3021 -----------------
 .../119b645f-04c8-4979-bff2-d1e4fdc2a7bc.json | 3021 -----------------
 .../80aabdf4-60b7-493b-98d8-1854f1c41c10.json | 3021 -----------------
 .../29958cee-32c9-4d51-8f14-72db4273459f.json | 3021 -----------------
 .../72537b16-feda-4e5e-a477-f415650db847.json | 3021 -----------------
 .../7df68af5-667a-4125-9c12-e71fb5af0a74.json | 3021 -----------------
 .../1845eb8b-4c94-4d22-8771-012f7230dc62.json | 3021 -----------------
 .../b2c8cfd1-f09a-4616-8038-c7e1930bce74.json | 3021 -----------------
 .../12976629-cefe-4329-b974-bb17f88d385d.json | 3021 -----------------
 .../0d7928c3-c769-474e-8249-7a5c70c4c559.json | 132 -
 .../f63536ed-752b-4538-9b92-2514a617a4bf.json | 132 -
 .../8ff13de2-ea43-4392-992f-ba70b6023e96.json | 132 -
 .../02bac8a7-bd09-4e73-979a-7dbaa7a8ed75.json | 132 -
 .../74e4406d-b2b6-4c3f-b059-f52cccf1fff4.json | 132 -
 .../ec8a6d6c-b8ea-48a3-9af6-d357e0057ec1.json | 132 -
 .../05307b41-d832-4533-99bd-c8608bf8e64c.json | 132 -
 .../c09bd9b0-6f85-4120-94a9-b628c68bccb7.json | 132 -
 .../9f971385-1146-4436-91a6-0e52d4db1f07.json | 132 -
 .../80ed14ca-b4cd-4ceb-8fdb-24705e47bd0e.json | 132 -
 .../db88e3f5-58a9-4783-9093-a6df96483342.json | 132 -
 .../8cd90f8a-d8dc-469b-95b9-260fcef804d2.json | 132 -
 .../b2c82703-2b5c-407d-b84f-a8f8261ac894.json | 132 -
 .../55462e67-5eca-4e9d-9095-51fcf12de5fa.json | 132 -
 .../25a119f0-5eaa-4fa9-8cd4-e0f437ada456.json | 132 -
 .../efc036b6-d8de-4393-87a1-d4f86fb44d91.json | 132 -
 .../a5144406-eb85-43b2-a49d-be6b06d6b04a.json | 132 -
 .../900184ad-656d-416b-956f-5f6e3a991d1b.json | 132 -
 .../7a58954a-5d7d-4640-99fd-773249640237.json | 132 -
 .../4ea3146c-b912-424a-b0a9-7c37348348c8.json | 132 -
 .../b0276278-6d86-49c0-a246-cd9110ac1deb.json | 132 -
 .../04216f67-1385-43bf-b7de-5bae7a60f379.json | 132 -
 .../fbf7b76b-7ced-4217-8e14-1d02184e271c.json | 132 -
 .../74ac8aba-6dfb-464c-81b5-d02a9192b9cc.json | 132 -
 .../295938e1-ade2-4d36-beca-3cbe506b5b90.json | 132 -
 .../f331782f-ea09-41bd-8c6a-e964c88d7e09.json | 132 -
 .../e4e3d79a-1de9-43be-a029-0be4f60e472b.json | 132 -
 .../6914ac28-b543-4f36-81f1-f7491c018e3b.json | 132 -
 .../b7378f41-46ab-41af-94cc-e7fb10738658.json | 132 -
 .../acedae59-6192-4ac4-a354-d520ecd6ba36.json | 132 -
 .../ff105961-761d-4261-8a44-20acf2e7f440.json | 132 -
 .../fa0901f6-514e-44ae-84dc-0b793f26169e.json | 132 -
 .../d2dff5df-343b-40f3-85de-14eb72dab050.json | 132 -
 .../8fa3010f-b7a1-4fc1-9156-ba70453add86.json | 132 -
 .../58034f99-3b01-46d6-aea9-90c75d073bb0.json | 132 -
 .../e6c08c9c-6d01-45c7-8a24-219b756b8632.json | 132 -
 .../cd97ad01-1d20-4cbd-a9bb-2acf3d9fdcc7.json | 132 -
 .../95f44ef8-e5ba-4bdc-97a7-2c5a678b07be.json | 132 -
 .../082f25f0-994c-438a-8086-b1e439aca466.json | 132 -
 .../31423cbd-08cd-4079-b1c5-ba412acf1b51.json | 132 -
 .../2669bd86-da65-4d87-8464-bfa8c741ce0b.json | 132 -
 .../ab2c19ff-5671-446f-b09e-731e2ae515ca.json | 132 -
 .../36250dc3-cb51-43be-8ab0-6788eb5bda7c.json | 132 -
 .../cd616d6a-151f-4aaa-93b5-9c4a758f95b5.json | 132 -
 .../9cb09cae-9b1b-43b1-afbf-f44b0a44053c.json | 132 -
 .../038c32da-add5-4299-ac17-df6ef3fdea58.json | 132 -
 .../25eb4bdf-beb4-4ad2-a5e9-3a2f31c46cb5.json | 132 -
 .../77655d60-872f-468a-acc6-d584ef5bf46a.json | 132 -
 .../4de378c8-ccf6-4f0b-8287-3d138a8645b9.json | 132 -
 .../8039cadf-6644-44e7-8452-90e9c8069e28.json | 132 -
 .../8914d89d-c873-4704-998e-dc807e96030b.json | 132 -
 .../c2e9fc29-db07-4b49-a98a-084158831ac4.json | 132 -
 .../58724539-6fc5-40d9-ba43-87410959894d.json | 132 -
 .../b13324cf-f6f5-4bf1-9cf3-c196120c4bcf.json | 132 -
 .../782b2df0-d1b3-414c-a4bd-59052a4441a9.json | 132 -
 .../b508e41e-0f1c-49ce-8b80-5e7ec82b8f15.json | 132 -
 .../2824e8d4-2749-4b18-a3a1-b987ed215ac6.json | 132 -
 .../53176984-ba93-4a64-b81e-21f6e0f65bcd.json | 132 -
 .../53252698-7d17-4f2a-9106-3b744ae7a985.json | 132 -
 .../6dd0f3a2-27ee-48f1-9d97-ef6954d298c8.json | 132 -
 .../35f11d5e-88c4-4a95-8d06-a40bee648b00.json | 132 -
 .../ba1193c0-42b8-487d-b9fd-ddbc1fd15359.json | 132 -
 .../95733620-e1e7-4442-b9c3-a699165df5e7.json | 132 -
 .../cacfce0d-f5f1-4101-8065-f5f02eaab1fb.json | 132 -
 .../72be5537-198a-43e9-9840-a803083158d3.json | 132 -
 .../2e9a3443-970d-4f37-a356-277a11c81754.json | 132 -
 .../1188402f-aa1c-4306-b031-c92ff0a5dd64.json | 132 -
 .../ee2f567a-6403-46d5-9a6b-bd029f81d660.json | 132 -
 .../d809fdff-f5ff-44f5-afc7-7e8af9ce2f93.json | 132 -
 .../87d66efc-173f-4c14-b76c-d8b7e00d575d.json | 132 -
 .../47f62378-c3cc-408f-a0d1-71eb3f522f57.json | 132 -
 .../dba8c12c-388d-4f8b-8ce8-83acfc4920c7.json | 132 -
 .../e4087285-1d1a-465e-ac88-91310e939710.json | 132 -
 .../09f189d9-74fd-47bb-b5fb-7994cba56ae2.json | 132 -
 .../5754c262-6ddf-4f54-9722-22ff20a8d76f.json | 132 -
 .../cc1bd811-ec88-4514-8b47-4140ded4f03d.json | 132 -
 .../3f08155d-8551-4472-86fe-7988cd6df78b.json | 132 -
 .../339e12fb-b4a4-4a4b-bb40-899b4ad833f9.json | 132 -
 .../4fd60e9c-5c90-492a-b24d-7ca6d1e91eae.json | 132 -
 .../7f8d935e-3782-4769-8bd0-ee8a0ce91cd6.json | 132 -
 .../6fa07e60-9f82-4abc-aa45-4dfc0bcf9b8d.json | 132 -
 .../99a0022b-3fe7-4612-9cbb-cf082c1f6b70.json | 132 -
 .../b1153714-d6fe-4ff9-ab8c-85b677d57f8f.json | 132 -
 .../c3d39b6c-02af-410d-8a5c-224495b04572.json | 132 -
 .../0426fcba-3db4-492d-b622-e34ab8d3fc8f.json | 132 -
 .../aa099cfe-ac9a-42dd-8357-f4d8115133ca.json | 132 -
 .../ccbc8a5e-9a97-452a-b023-cc996ffe31f1.json | 132 -
 .../b359a7a3-cf2c-4952-b308-333672dadcec.json | 132 -
 .../0864d5cf-d6fe-42bc-9059-9f2e5ff06b60.json | 132 -
 .../e6ef2559-8a63-43e3-a60b-0d2b7256ad3d.json | 132 -
 .../45d019ab-b23c-4fc3-baf5-d57576e9945c.json | 132 -
 .../e3cd7c32-e5a1-4cd6-a9dc-95364a8abe75.json | 132 -
 .../9be442e8-4b77-43e0-a981-887338e59b78.json | 132 -
 .../a07b6326-f393-490e-b696-d8b45f593d4b.json | 132 -
 .../b66ed91a-98d5-407c-9896-9c2e2a31e9da.json | 132 -
 .../9c70921d-956b-4727-9201-1addbd01bb8b.json | 132 -
 .../4ba6d51e-314a-4db4-9552-568a4093e01a.json | 132 -
 .../835f5056-56bf-4a6c-886f-fbe6f263ac07.json | 132 -
 .../c2a63afa-9d25-41dc-b25f-848f5a640501.json | 132 -
 .../f64f9d24-e448-4bb6-89c3-edb66499bac9.json | 132 -
 .../2de14bfb-844a-4711-815e-8f63487a78fd.json | 132 -
 .../f953e0e2-ddca-42a2-a0f6-752a137bc6b5.json | 132 -
 .../98187b98-0cc8-4756-9cb7-c53deb998f90.json | 132 -
 .../8c79c60d-ebf4-4409-be4f-928a54cedd1d.json | 132 -
 .../5d5cebeb-faf0-4fdf-8749-6307080e82f2.json | 132 -
 .../e926ce8f-45bb-4f3d-b579-ecadb3df6468.json | 132 -
 .../070609d6-5f41-4712-9ad7-e215b1a6bb81.json | 132 -
 .../8d2909c7-37f2-4198-a1e2-4bf2ebc1444d.json | 132 -
 .../53587959-25f9-43aa-a34b-f274d8bc93af.json | 132 -
 .../2a7f80ed-d404-4c81-b000-b65c83069121.json | 132 -
 .../f0983645-4adb-4ddb-bf2f-33480cb7f421.json | 132 -
 .../161dadfe-4983-4f56-8a7d-9b97f1c5a3c7.json | 132 -
 .../694a02f9-4729-4d0b-97ce-80adaef29be2.json | 132 -
 .../0521f51d-22c1-4821-8f04-23c533411668.json | 132 -
 .../8fdea71b-5e68-4a78-aefc-8a00650464c4.json | 132 -
 .../e2ba5674-9251-4a4e-9eb8-046c834da400.json | 132 -
 .../4caafdb2-3065-40d4-b5a7-9deb41e1d8a7.json | 132 -
 .../886e0b8b-b2dc-434f-a299-50f668006241.json | 132 -
 .../7a6a9443-f331-4dfa-acf9-6aa30049bade.json | 132 -
 .../6d523da4-ec4a-405b-a25d-afc7b1b5aefd.json | 132 -
 .../cfecfce3-090d-4c2e-826c-03c0c5337e98.json | 132 -
 .../5aa124dc-4abd-4c5f-b40a-a8d81af922eb.json | 132 -
 .../ec91b122-c8f5-4dfb-94fd-336ef78c3e14.json | 132 -
 .../114f246a-6049-40bf-ad86-9a822d13cf74.json | 132 -
 .../82d28a3a-44f2-463f-a1b8-7e9079ec47b7.json | 132 -
 .../ed3c1349-a154-4866-890f-2b115ffaf127.json | 132 -
 .../47942c55-5ddb-4fda-9c5b-34676ae2046a.json | 132 -
 .../d860210b-4c8a-4d15-ad3a-4e39905f91ed.json | 132 -
 .../d137f429-2b65-4ee9-9d66-3f619b270fad.json | 132 -
 .../1da10dfe-b0a3-4cb8-aaa3-e16d48f3aab4.json | 132 -
 .../6156a0d2-4c32-40b2-9624-ef0c7a6a95bb.json | 132 -
 .../676342d2-f37a-4b6a-967d-3ac750243470.json | 132 -
 .../950b7108-0192-4875-b4e9-c3e43ab71e08.json | 132 -
 .../85672df5-2f35-43be-8648-9937c66872dc.json | 132 -
 .../051c5642-3b23-4879-9d10-639d1b3127d7.json | 132 -
 .../2acf0d12-7e0c-46dc-a079-ebc48a8818d3.json | 132 -
 .../8ce42090-006e-4e08-8d3f-5b1eb0b8da0b.json | 132 -
 .../703df6c3-dae4-437f-9379-f8c264797adc.json | 132 -
 .../1e349ad3-d29b-4a4b-97e7-b82055e41b07.json | 132 -
 .../8f677a76-932c-4c35-9708-4b723226aa19.json | 132 -
 .../ebfe625f-ff1f-45f9-826c-9351ea4134e1.json | 132 -
 .../66e6a757-ac22-47f3-82ce-81af45e1d3cf.json | 132 -
 .../1cd840c7-d432-495c-a3df-af1fa6264259.json | 132 -
 .../066f520f-9a64-4564-abfc-6435732c3585.json | 132 -
 .../aced5181-040a-48c0-bc5f-78d0de3afae8.json | 132 -
 .../a4889a38-84d2-4ae1-b8a9-297b4400602d.json | 132 -
 .../d540505a-c67b-4b72-a53a-c03aa6f8d3e7.json | 132 -
 .../9859afee-02ca-4c48-acc8-acfd20c37e4e.json | 132 -
 .../e222d12b-c796-4890-a584-cd689bae7ea6.json | 132 -
 .../c16850f8-0b80-4455-8f38-8ec453cd1d41.json | 132 -
 .../0d400b0f-cc82-4c86-b600-93a31b133f9d.json | 132 -
 .../90f6f8f1-02fc-425a-8499-e9b43ae8ac59.json | 132 -
 .../6704d6bc-6d38-4c59-87a4-81d3eacde3b1.json | 132 -
 .../e8ad6ce4-7efc-499e-a2c9-9e0df898fbb9.json | 132 -
 .../5e9c1273-536d-4280-8fff-9931f46dc968.json | 132 -
 .../460ca160-ac34-4091-ba2d-986b53532b55.json | 132 -
 .../ef9d2fab-07a2-44e2-aae2-ede5a2ff31d9.json | 132 -
 .../a29a69d3-d64e-4463-aa52-0a9d6d012c98.json | 132 -
 .../4539c16e-1ac6-47f4-88eb-a09842497330.json | 132 -
 .../2ff33c55-1236-4c57-8809-2d3076e43cc7.json | 132 -
 .../281ba822-49a2-4746-bc04-8de046439508.json | 132 -
 .../0606d916-95ea-4318-af0c-3942329071c6.json | 132 -
 .../005159f0-da68-480d-972c-c160d145a682.json | 132 -
 .../2f6abb5d-52b3-44b0-b960-115793485fb1.json | 132 -
 .../6ffacad9-1a4d-472e-bbbf-0d64d068dd0d.json | 132 -
 .../26eadaf8-bfb8-4aad-a8a4-90699b6f0fcd.json | 132 -
 .../d4536913-5708-45e4-a024-45ae37fdae13.json | 132 -
 .../848860aa-7de3-4fae-afca-ac11224b96c5.json | 132 -
 .../0241a8e3-d6e5-4ba5-afb9-862bde2ba851.json | 132 -
 .../20b69120-d476-4e34-b3c6-8cef11d6ee78.json | 132 -
 .../696bbbfc-49dd-444e-a90b-76821845a726.json | 132 -
 .../e6d974d3-467e-4fe7-bd84-79fc7c72cde2.json | 132 -
 .../b26ba2b7-1365-4b1c-a1be-35d588e02d36.json | 132 -
 .../64bd755d-ba4b-4559-ad8e-f56c697b1ae6.json | 132 -
 .../c4e572cb-1d12-4baf-a4d8-a55422692207.json | 132 -
 .../c6123e10-b1f9-49dc-888b-083881e6ef09.json | 132 -
 .../e1647f10-fec5-463d-b8e5-6b2b880bd687.json | 132 -
 .../6d5fa235-8d69-456e-9f23-0f702760baf4.json | 132 -
 .../e8709a6a-a2b8-4b09-9342-d1aeae89de1f.json | 132 -
 .../603e95c9-7e7f-4892-93f7-92f92b256865.json | 132 -
 .../3e2fd38a-186e-49aa-915c-7eb3cde50562.json | 132 -
 .../16d55e66-9015-4d72-81e4-3f14c42b0368.json | 132 -
 .../696644b9-bd40-4047-bb85-0cb19510a96c.json | 132 -
 .../cbae8c39-0aec-4859-98bc-3b2d065833ad.json | 132 -
 .../15fb3cc7-1ba5-4ba5-ba02-8e8a9d2029d0.json | 132 -
 .../357f6051-b880-48bb-8e68-e4b0a7a0cbcc.json | 132 -
 .../a50a542b-668e-47b1-a37e-805a58eea3d1.json | 132 -
 .../00f7bd51-0b31-446d-be8c-1e0dc0d82e54.json | 132 -
 .../26782941-b918-44c5-a7f6-5f770e47c3d6.json | 132 -
 .../5547ddaf-8fbb-4259-8b88-e946fc3d2404.json | 132 -
 .../bee5ea59-b97a-4783-b763-b6bd432d4558.json | 132 -
 .../8150333f-8e79-4230-af8b-7ddb1d5eeb21.json | 132 -
 .../be8510a9-ecd4-4ac7-9930-3200cacb7b50.json | 132 -
 .../887e4574-f876-4e75-afb8-e543bcb30020.json | 132 -
 .../fd21d8bd-28cf-4b91-8075-c38a61f5f32a.json | 132 -
 .../c0f05e38-6592-478a-9c46-26567f24ff85.json | 132 -
 .../06cc2913-8e05-44bf-a128-9a7c4aeff536.json | 132 -
 .../86368d5b-0509-4b52-b988-58bcf7e1043e.json | 132 -
 .../77b89fe6-464b-4017-a77f-8750e2668a82.json | 132 -
 .../d2e47d86-23dd-4c95-a7fb-99518615d09f.json | 132 -
 .../0a09891e-ac97-4c3a-8364-7106a851f1a8.json | 132 -
 .../eb41fe62-ac46-4630-bb2d-6b907f271737.json | 132 -
 .../d540a6c8-e9ec-4413-b9d2-dee68533c377.json | 132 -
 .../5b1f413a-05c4-43be-bdbc-9de5728e8d0a.json | 132 -
 .../6701738c-27e4-4bbd-b614-fbc297c3164f.json | 132 -
 .../7f4563b4-0b25-49e7-ac1c-afaa28b0eda2.json | 132 -
 .../32b6e4af-69ba-49b7-9367-dfafe3e390e8.json | 132 -
 .../e16deaf7-da55-40ba-ac18-860fa3f14d34.json | 132 -
 .../8a7a5886-0618-4615-9cdf-46f5d19a29fe.json | 132 -
 .../66d18e5b-9ebc-4ab6-94fb-6d5c23c58672.json | 132 -
 .../a36aaaf6-2478-4b98-ad0c-2b06ddb8c308.json | 132 -
 .../4a6237a7-019c-4310-971e-84b08d1b5067.json | 132 -
 .../996e781e-5939-41ac-b347-95c99037c34a.json | 132 -
 .../e880fa0e-ae49-4398-91bd-eadf8695425f.json | 132 -
 .../da04ff51-fbeb-41a8-ae5e-8ddf5925b792.json | 132 -
 .../6d709396-1ae1-4e5c-a03c-13c1e9425202.json | 132 -
 .../5b616df9-e15a-4f84-98b4-c2cb532c1b95.json | 132 -
 .../0f6552d9-3cbe-447e-909b-068e5ceed4c9.json | 132 -
 .../2861aae0-d2ec-48f5-bd20-9e7bcaf8dabd.json | 132 -
 .../51a64f37-256c-4fe7-b28c-6117520f04ec.json | 132 -
 .../03ce9c1d-38e8-4a6c-b293-57428a9d7c0e.json | 132 -
 .../3b0f5dea-db9b-4657-9807-6b3e56d38823.json | 132 -
 .../2d19e9ff-e331-4171-ae90-47e44f3f8885.json | 132 -
 .../6bfb8b24-1abd-405b-b01d-7d7111705dbb.json | 132 -
 .../c83e6b6c-c8be-4d97-9c65-2d883f88f37f.json | 132 -
 .../72569796-1b11-48cc-ada7-e8c09522dd54.json | 132 -
 .../58403e30-bd2b-4f4c-ad41-daa890c77d40.json | 132 -
 .../eb8e1f1d-c6b3-407c-b172-d240553d2f89.json | 132 -
 .../356d75a0-6520-46c1-afa9-7dbb2596a5c1.json | 132 -
 .../78681e0c-5fe2-4920-af7b-99345cea3efe.json | 132 -
 .../ba0ee5b4-070a-461d-a3d2-cd4036387cc9.json | 132 -
 .../17d0d377-bca4-411c-be11-6c5cfce07798.json | 132 -
 .../d01a56a1-1eb9-4ccf-8c09-348b6ba5480b.json | 132 -
 .../389821ff-d8e2-4d1d-8fb2-57a689867ac5.json | 132 -
 .../7913f782-29b0-48bd-bc62-37da9a5ac7d9.json | 132 -
 .../b0930974-999e-4372-9d21-b9790e0bad4c.json | 132 -
 .../8265f577-f504-4a56-9cf0-42c34766559a.json | 132 -
 .../82044cd2-1a46-406e-bc68-397ce41b29ea.json | 132 -
 .../de09e323-8cf1-4aa9-9537-e8ad30a8c297.json | 132 -
 .../bfe543b4-ec38-488e-ae04-125cd358b61f.json | 132 -
 .../be36d8ae-b81c-4b4e-aa2f-5999c7582237.json | 132 -
 .../342b435f-89e9-48ad-ab0f-2c1f52f4571a.json | 132 -
 .../b0c8737d-d838-4da1-909b-b218e22119dc.json | 132 -
 .../4cd40f28-842f-44d5-9eb2-86238077fc55.json | 132 -
 .../0758051c-2d75-402e-af0e-769096cbb17c.json | 132 -
 .../c93f610b-fb97-4ad1-b8af-fc41c6d8da33.json | 132 -
 .../b8467118-d895-41fa-81c7-89892e1844d5.json | 132 -
 .../30d867bb-63c6-48d1-8d43-6c24f4cf44ba.json | 132 -
 .../89b92cda-c5b6-45ed-a534-361c9d34794a.json | 132 -
 .../48cdf76a-886d-41ec-8580-00ed4232b601.json | 132 -
 .../116272d4-d25d-49cb-80cb-ff26a0fb3cf4.json | 132 -
 .../bb103828-70fe-4767-9302-6750d839129e.json | 132 -
 .../7b58ab54-239b-4e49-93f1-c3940df61474.json | 132 -
 .../559067a2-816c-4091-893e-b1c7860171ec.json | 132 -
 .../ec502619-880b-4b7c-acfe-c43cf6514e3f.json | 132 -
 .../6941a5dd-2a70-4846-a5f6-b16ef2d56a03.json | 132 -
 .../636e2f93-3242-491c-9df5-003aa1dacecf.json | 132 -
 .../1f4efa23-816d-49be-8659-feb003f4b3ef.json | 132 -
 .../d05be1e4-bcac-4b4a-bbde-8b17a5a71243.json | 132 -
 .../9ab53055-86f5-4a88-976f-015dd9c9e832.json | 132 -
 .../ba34083a-9b13-46d9-8f36-aa3ddd586711.json | 132 -
 .../6a39d734-ad73-4c4a-9583-3563e336d4b3.json | 132 -
 .../2af71e88-4931-4359-b92a-c64fa33df802.json | 132 -
 .../bf9336a7-a7c4-420a-9dd0-68d8e0c815c4.json | 132 -
 .../2de872b2-10c7-44dd-91c3-f20205207da6.json | 132 -
 .../5cabed09-d8ea-46c2-bb78-012dac954d6b.json | 132 -
 .../8236db6a-ff8a-4237-af5a-03bb258f8e59.json | 132 -
 .../1a7b078e-bc1f-400f-a0cd-f7b535548f23.json | 132 -
 .../fdaf561c-567c-416d-a74a-ac3c07c5be5b.json | 132 -
 .../58900b3b-303b-49c8-b807-7b8d06601568.json | 132 -
 .../7ac5a45a-7b41-4f63-8556-8737638a00ea.json | 132 -
 .../3cb55475-30c8-43c8-8d7d-394450fdc117.json | 132 -
 .../f5e140ff-0c0e-4769-8116-63cf50255773.json | 132 -
 .../df85ec6e-1325-40ce-8087-d960a1d767dd.json | 132 -
 .../a7bd3fff-f01e-46ca-af85-5b4ac6ae7320.json | 132 -
 .../11842dd9-0572-41ef-aaa0-8d19f3420efc.json | 132 -
 .../01abccec-1cea-4060-89be-289987d0a2ce.json | 132 -
 .../dce8226c-57bd-4255-b813-8a70494f0a1a.json | 132 -
 .../7f80e69c-eec6-49ac-a088-6248ee25f736.json | 132 -
 .../e0267a2c-dfc5-456e-864d-b5b0ad1fa508.json | 132 -
 .../e6ad37be-28f4-43b4-9df1-b7b47d31232e.json | 132 -
 .../5514368a-1f7d-4cd0-b7f7-d116b753f975.json | 132 -
 .../c0e29cf8-897f-4e07-abb4-71c801d34301.json | 132 -
 .../68310379-65b2-482d-892b-f76547bce2b0.json | 132 -
 .../a034c4ec-d4cd-439b-8dbd-e67685ea7616.json | 132 -
 .../e4b761d3-bb84-4433-b9fb-4c92ecae6279.json | 132 -
 .../38d78d30-be6d-476c-a3aa-d9a40f570a56.json | 132 -
 .../36e60f6c-60f7-4b17-88fe-82810e195fc7.json | 132 -
 .../a6c647e8-ed24-4150-8563-dd9b20e21498.json | 132 -
 .../b5a366ac-d736-4447-a2f1-98d0b84ba3bd.json | 132 -
 .../5d098dc6-8124-4d26-86ec-d54e6e09c3a6.json | 132 -
 .../1137cbc4-d80b-4e21-bfeb-feab41dc80b2.json | 132 -
 .../097bbfbc-0ccd-4fd4-9e0c-9c192cba9e8b.json | 132 -
 .../db8c6169-bfc1-48bb-be53-fa93c673f051.json | 132 -
 .../41437fc9-6d48-4317-a8de-ab4e63b2cf46.json | 132 -
 .../e075f4fe-95e0-48f4-94c4-f6ebd3f4edaa.json | 132 -
 .../3349d66c-e12b-49c1-a406-e0e77b697458.json | 132 -
 .../7aa0ff6b-11a9-4554-a27f-e477a0ff77c7.json | 132 -
 .../ac749485-df6d-485e-8fa7-63bdfd744167.json | 132 -
 .../54363a4b-312b-4035-a1c3-b5321311cec4.json | 132 -
 .../aa9e2b9e-cd25-4492-9801-eba7d40b4365.json | 132 -
 .../c6b484b8-f6f3-4516-aff5-c2f6438c9047.json | 132 -
 .../c6c760c9-a345-4e25-b333-b403bf6db389.json | 132 -
 .../65b2aa58-2c04-48f2-9ea3-c8fd97cb9dde.json | 132 -
 .../92903344-0dde-4f5a-a7d2-749a1ffe9cd3.json | 132 -
 .../59ddd478-c1cd-4bd8-80c3-fdebe762414a.json | 132 -
 .../02f63fc6-9376-4fb5-b067-63493238cc27.json | 132 -
 .../dd7597fd-27f5-4e77-a44f-b01d0db82719.json | 132 -
 .../20cd0d60-eb0d-41bd-b37f-910a03dd7f82.json | 132 -
 .../c4e9d045-3769-4828-a2ca-7fa508873089.json | 132 -
 .../0a0501ec-4ecd-47c1-914b-d473f795cef2.json | 132 -
 .../beca755f-203f-4bc8-b5cf-f9a9e3f8bd8f.json | 132 -
 .../79e1e1c6-cbe0-43a9-a593-8e2119baaf77.json | 132 -
 .../def80b44-3d9a-46ba-bf5f-ffc81e50af2e.json | 132 -
 .../5e1aa809-ef20-445e-a05b-eccd585d5991.json | 132 -
 .../7c2be651-ca56-4285-afc7-1bfe1c8ce11e.json | 132 -
 .../cfe4ea72-ddb9-49b5-9599-99f215e112e5.json | 132 -
 .../81d63d8e-88dd-4b16-b9b8-d07604878f8f.json | 132 -
 .../81f8208b-f7e7-4685-bb84-321d9e097470.json | 132 -
 .../a0c9a434-9b8c-47c5-b511-9daac7901686.json | 132 -
 .../28b60eae-1b38-4404-8db1-3fb2997583f4.json | 132 -
 .../746862a2-a90c-4612-91d0-f989b9eed1a5.json | 132 -
 .../715ee057-9c9a-4e04-991c-7040b1eef65b.json | 132 -
 .../4dc1d103-3458-4b8c-9e63-b98effd69667.json | 132 -
 .../070ff2a5-9a5d-48cf-8517-1ad9b6642d59.json | 132 -
 .../8406a5b8-a87d-489b-b75b-00e9f675f09f.json | 132 -
 .../11e8f9b6-32ab-4b83-a601-e5644c0b2c39.json | 132 -
 .../6b542f5a-ea62-45ce-8e98-436a4d058877.json | 132 -
 .../9b280640-bfee-4730-acc3-386a54b2434c.json | 132 -
 .../eff5171b-6119-4013-8aa8-8a4f0215b045.json | 132 -
 .../471c5fed-f155-4521-9d9c-b5370ca91bec.json | 132 -
 .../690be099-3ace-484f-b01f-2fe6b324d12a.json | 132 -
 .../71fbd15f-5eec-40d9-84e8-07323f3ffac6.json | 132 -
 .../eb93dd3e-3d13-4234-bb66-f6177648aa2b.json | 132 -
 .../f7ec1ed7-cc30-4879-8ab1-4909011553d5.json | 132 -
 .../3e100704-dbd3-4d05-b325-5bb4bc90e51c.json | 132 -
 .../12f003ef-1098-4d3f-aed7-7343034157bc.json | 132 -
 .../9de2e564-3a30-4f1c-80da-6432a245a64f.json | 132 -
 .../dd5aaa3f-b24b-4a5b-852b-b80f4a6bf366.json | 132 -
 .../8d8b9fd2-43f6-4edc-8340-44d20824a7e7.json | 132 -
 .../7fe45c20-a2c0-4acf-9425-651a1ec3b0d0.json | 132 -
 .../baf93ef6-56f3-4809-93f6-32dcf4730388.json | 132 -
 .../f6df14bd-207c-4fea-b789-c9f9aef749b3.json | 132 -
 .../97766a7f-cf5b-46ae-b51e-5c5702ae000b.json | 132 -
 .../d5cd2a1b-3def-4b33-a8fe-4b02e090db27.json | 132 -
 .../275d4bf0-566c-4b50-86b9-38c7f45df143.json | 132 -
 .../aa504db9-81f3-424f-b7d9-683ebe31f5d8.json | 132 -
 .../2cc209b7-ef10-435d-a840-b904ab741491.json | 132 -
 .../9b9390ac-fd65-4a58-9834-5352aa340cdc.json | 132 -
 .../4efe5cd4-6b8a-4951-a63a-4c7dc390bbec.json | 132 -
 .../4bc5a0db-1c88-4c61-9343-1d340305ecc5.json | 132 -
 .../74527f51-dcec-4b82-8ba8-075c933404f5.json | 132 -
 .../ac31bc90-3854-4d38-925d-ef8dc7e75d24.json | 132 -
 .../88583cff-1adc-4b1b-8e68-07f0074d0ae2.json | 132 -
 .../fadbac9e-7224-41d1-abfa-7039cbcba9f6.json | 132 -
 .../1fb90540-0fa0-44ca-ad67-1e3503f6b729.json | 132 -
 .../047784e2-c1ee-40d9-a60d-e43504825801.json | 132 -
 .../ee60453d-2d51-46f7-8a18-c651d590f0e7.json | 132 -
 .../b0ac4b11-f7b4-4753-baae-310a92f08259.json | 132 -
 .../324db8b3-38c7-4a2c-82e8-7bebfa38e760.json | 132 -
 .../54dd9033-61b9-4f26-9cde-e04c7136524b.json | 132 -
 .../d0973d6c-373c-41cd-9e62-52470c044dac.json | 132 -
 .../da15da67-b316-4c2e-86a5-c1f88eece9cb.json | 132 -
 .../b0c34174-bfd0-4556-a3bf-92ec0ddf5ec4.json | 132 -
 .../bce7b15d-1670-46db-bdff-24fb38bc3fd9.json | 132 -
 .../15e5e02f-27b9-4063-b601-42c2b17180f9.json | 132 -
 .../51b0c546-0dde-4668-a8b8-3b9753a31aa0.json | 132 -
 .../45842b1c-cf68-44a7-928f-2da454cdd13f.json | 132 -
 .../c15cdefd-dbe3-432e-aab0-3c43540cd320.json | 132 -
 .../1f489afa-a01d-40f3-836a-9e386c502d1d.json | 132 -
 .../94bcc87e-eb06-4321-9b72-2f99168cf92a.json | 132 -
 .../c0bc9811-4d7c-412f-a12b-3e6eab2e5a6f.json | 132 -
 .../b5a8b278-69e9-41ba-89ee-8fd6b2d90a1c.json | 132 -
 .../a3ad7f0f-64bd-42a1-bc7d-d7d4cbbd80fd.json | 132 -
 .../f07c3a4a-2a8e-45c4-a726-be95726df2db.json | 132 -
 .../f36d56b8-cd77-4d69-a51d-39025bcfcdfd.json | 132 -
 .../65acabdc-ea5f-426c-820b-2b79f2b20b44.json | 132 -
 .../96b00cfa-1383-4b36-a043-17eb39678ffc.json | 132 -
 .../3b8a796e-6bde-4506-8335-bd3cc72482e1.json | 132 -
 .../a93e99e2-ca13-4cdc-9904-7ae5cc82c623.json | 132 -
 .../65d9e237-2757-459e-94e7-e382213e4eeb.json | 132 -
 .../c3f44524-4c75-4cd0-9f5d-79c8b08f6f77.json | 132 -
 .../2e7d3674-d0b0-4b87-8bd8-8202114b7665.json | 132 -
 .../30d21295-beb1-4179-8c6f-7bac79b29474.json | 132 -
 .../e2fc95de-b9d9-4043-b55c-aa2819d4f52f.json | 132 -
 .../7fbd7f97-baf9-4acd-ba0c-90ffbf0c47a5.json | 132 -
 .../336effcd-d8fc-4477-846f-70fc40bdc111.json | 132 -
 .../28f87820-d587-498e-b713-7c0af0cdc324.json | 132 -
 .../f1b671ab-ebb3-43ec-86fa-832982d04cc1.json | 132 -
 .../327cde83-d107-4455-bc03-7e03026c52e6.json | 132 -
 .../7497b8fb-9a7d-46dc-868e-1a2bbcdc7860.json | 132 -
 .../92c8afbe-7735-40c8-af0e-29da687c2070.json | 132 -
 .../bca052ac-6556-49d8-94e3-f4bda560a5d3.json | 132 -
 .../5f74fe6e-8575-4cea-959b-e6ba03c7e273.json | 132 -
 .../b0f696f5-ed70-4293-999d-a9121192c137.json | 132 -
 .../18751a6f-062c-4915-bbe0-ae222cf9ae0b.json | 132 -
 .../398ebe04-638f-4a11-b99d-6778ff3ff97b.json | 132 -
 .../b4f197f2-3456-4221-b222-10dfbbb50f56.json | 132 -
 .../0a2fa86a-f9b3-4a49-b215-4cd3ee9b4c22.json | 132 -
 .../1561ec50-1cb9-47ce-9db1-09efe9c3fc61.json | 132 -
 .../496525ff-394a-4b7b-9d93-f5b38d2a1ee3.json | 132 -
 .../37071760-d24c-43cc-9965-d8c7873c0ee8.json | 132 -
 .../91a71a49-5dd4-43b1-9e1c-fd9492236712.json | 132 -
 .../d1d48abb-6dcf-4905-958f-c3a3e75feac6.json | 132 -
 .../68282f29-f56f-420b-bd1e-9cc54783c1a5.json | 132 -
 .../cd1c84dc-6c6e-4789-add7-0e3ca783b0ea.json | 132 -
 .../22a9d3b8-ac45-4433-8926-5d28681af922.json | 132 -
 .../57c4b9eb-dffd-4623-a2d5-b2374d3c9109.json | 132 -
 .../24adbd8c-df3a-4b58-94e6-61a3dfa6828e.json | 132 -
 .../6ed62f64-c2be-4bca-b17d-bd0184a3d498.json | 132 -
 .../db9e4d03-03a8-4a10-8739-16bbcfbb06d4.json | 132 -
 .../7b0fc4fe-51c8-4f01-b07b-5bca05b40859.json | 132 -
 .../6f286418-d8e3-4c11-8941-cfe5a18b1037.json | 132 -
 .../b0a83b1f-3af2-45e8-9d88-d7302a529112.json | 132 -
 .../0462fce1-51b4-48d8-8278-a90048ffd637.json | 132 -
 .../e02f597c-c368-4223-ac90-c99d82c90634.json | 132 -
 .../32e63ffc-c64e-4562-ba99-14873f5bac2e.json | 132 -
 .../6af4faad-05c2-488b-9685-e11ae4e1cbf0.json | 132 -
 .../8aa7701b-7019-44a0-851f-cfc9108fdfbd.json | 132 -
 .../a2f95fad-5ab5-47d0-b9aa-33358c673caf.json | 132 -
 .../aef73a77-9df7-4d4f-89ef-50905d326198.json | 132 -
 .../e9ffdfb6-6f91-4bac-89d2-40b1eb43f3ee.json | 132 -
 .../8ff39438-907c-465f-ac7a-5a25cfd8d824.json | 132 -
 .../83d831c5-a74f-4699-9961-664a7a51b7b8.json | 132 -
 .../83fb88ec-f640-4c1e-b71c-53a123fc4c2e.json | 132 -
 .../3811cc34-45cb-4932-b862-39bf042331e0.json | 132 -
 .../5b2a16a1-7a2a-40b7-add6-b99378b6af00.json | 132 -
 .../1dc2a5bb-40b6-401e-8f1c-6110cb4c0f0d.json | 132 -
 .../742e0a1c-7496-4076-bdbf-ada0a8e528c2.json | 132 -
 .../f0664035-3256-444c-b848-ef603e0d46b5.json | 132 -
 .../9159aaa6-8663-491f-901a-74da4c343d20.json | 132 -
 .../5179b145-9fdb-4ab5-8cca-87966ecf6519.json | 132 -
 .../da872193-1d25-4e8e-bc22-9138a9d121ba.json | 132 -
 .../967fdd26-1f8a-40d6-8f7d-ca731c7ef2e3.json | 132 -
 .../dd615b4c-189e-4361-bcf4-879fd59b28a2.json | 132 -
 .../0aeee3e8-00ce-4f95-bbd9-307d93a194a4.json | 132 -
 .../8c583b51-4349-48af-98d9-8eaaf43d60b6.json | 132 -
 .../34aab556-5e97-4ea2-9ada-d17dc3624be2.json | 132 -
 .../fbd9d5e3-15f7-45ce-92fb-368b3bfcc526.json | 132 -
 .../b177e329-ce6b-4bc6-aeac-1c01306e6b1f.json | 132 -
 .../7f371c11-e8f0-4233-b359-aac39c0a1110.json | 132 -
 .../9f758d4e-d121-4688-8ece-8dc67a499811.json | 132 -
 .../903b8c71-d54d-4ce4-9845-71eb8ca8733a.json | 132 -
 .../9bdc17bf-7b81-49c8-81f5-c6dfa31b449b.json | 132 -
 .../28109e00-87c1-4809-a4fc-dddebba52621.json | 132 -
 .../6a21381b-426d-4a5d-ad6d-2aeb57ed14c5.json | 132 -
 .../03a8091c-473e-4fbe-af70-35f791a23a0f.json | 132 -
 .../ed75e9ed-841b-4783-a201-bc72651afd0a.json | 132 -
 .../38cd418c-9770-49d2-8b30-ac47e445cee3.json | 132 -
 .../d49b6a48-ae81-467d-87c5-b17f9ca306f8.json | 132 -
 .../39b7e250-9f71-4833-941e-85692a48b6e6.json | 132 -
 .../c0d102a2-ff8c-45ac-a825-31472b98b871.json | 132 -
 .../7c5674a8-6a1c-483e-be9c-b0a6d00d3ac4.json | 132 -
 .../d34b899e-b067-4c9c-9fa2-439f8b2d589d.json | 132 -
 .../8c7b2332-510b-42d3-bcbb-e177c35d27d5.json | 132 -
 .../685f107f-e431-4dba-a117-8d6f1dd2c296.json | 132 -
 .../e1570804-85b6-4518-a099-5f21ab27d12c.json | 132 -
 .../a779ebec-76ab-4a1e-aa4f-d1a6adfe2d5c.json | 132 -
 .../1ed7f6ed-d04d-4cfc-a36a-1ef0f72d4814.json | 132 -
 .../c901a9ee-069a-4e3e-ac52-3017d67d8800.json | 132 -
 .../08317b59-ff74-43c8-bea5-2a266c38816e.json | 132 -
 .../4106d4d3-344a-4c1f-b9ce-a3140d435013.json | 132 -
 .../2b308fad-8494-4056-8b84-82733cd2710a.json | 132 -
 .../93c867d0-4f10-440c-838c-91d1633fe584.json | 132 -
 .../1a4a69c5-4acc-4ad9-adb2-bd9cf0fa2875.json | 132 -
 .../151226ba-9744-45bc-b923-30df57f7aa3e.json | 132 -
 .../98363657-0793-4eb3-94de-28961afc92ea.json | 132 -
 .../a32b4ded-6bff-441e-afbd-736e6d8cce5c.json | 132 -
 .../326bcf4a-02e9-4218-8bf2-55a94a79435e.json | 132 -
 .../145facc2-ab11-4c68-b841-762e0ad9bd5a.json | 132 -
 .../d3e6aae6-9284-4309-8d8c-02c9e797a58b.json | 132 -
 .../6ee8537c-90e8-4455-83ca-c8c375a5ead7.json | 132 -
 .../6efbfb38-57e5-46c7-b765-f7d0356afb97.json | 132 -
 .../f4d418d9-1089-452d-9c7f-4cc4712e6ac7.json | 132 -
 .../1c9b325b-92b3-499a-a3ea-026269c63c88.json | 132 -
 .../c546ccde-cef3-4de2-a49f-24517d76dde5.json | 132 -
 .../e85d3ccf-f48d-4e5c-b893-771a107773d4.json | 132 -
 .../b8d22ade-874e-4ff3-9fcd-dbe14220d48b.json | 132 -
 .../97e8e7e2-74a4-42a5-a0b1-250e47d3c3e6.json | 132 -
 .../b2d56bb6-a726-4e47-8bc6-c016a51aac5c.json | 132 -
 .../3366f6d8-41bc-4c2c-a72c-bc0fd7dc8dd2.json | 132 -
 .../7ba52efb-3890-4691-8740-9f051f1f645e.json | 132 -
 .../7b192b49-057e-418a-b47d-44b0ec82a6b6.json | 132 -
 .../f2120d53-bef6-44d6-84a6-a6f8e3537188.json | 132 -
 .../f5408aa9-85c8-46e5-b225-0480b2e18e97.json | 132 -
 .../c1918f55-286c-4b29-ac53-2ee8f9d36d9e.json | 132 -
 .../52659d37-67f8-45b8-88e4-11917dc90488.json | 132 -
 .../556ae77c-effe-44ab-ac4a-1ad7cbd7c363.json | 132 -
 .../048fc971-3baf-4740-a132-2f9476d01b7a.json | 132 -
 .../abd28d25-01e0-474d-be35-08d816d281f5.json | 132 -
 .../17f49724-6553-4baa-b354-45ffd0f2c844.json | 132 -
 .../3e60d982-d7d5-432b-962e-b7734cc90534.json | 132 -
 .../79a0fdf3-b432-4598-be62-f9eb57fa5a43.json | 132 -
 .../662566e0-2af3-40d6-90de-9b361bcae355.json | 132 -
 .../d81c0035-a0b1-426c-9080-8ccbf745642b.json | 132 -
 .../100bc243-158c-4e5c-918b-1439bf26fee8.json | 132 -
 .../45e32080-1464-40e0-a232-310fdda967eb.json | 132 -
 .../e89b279f-d548-4aa8-b5e5-0bffdd98b840.json | 132 -
 .../777a53f9-891c-4f9e-99a8-bb1988f61f19.json | 132 -
 .../f15846b1-8eaa-411b-88f7-25064161af4e.json | 132 -
 .../e803fc85-fb98-4db8-aab0-a63100dcd5fc.json | 132 -
 .../50620749-5ecf-41eb-a131-611675560e07.json | 132 -
 .../2d40a551-6440-4d71-87e4-639d486c1c5e.json | 132 -
 .../22235942-2e3e-4ef4-b7a0-5800f507571a.json | 132 -
 .../ac06867d-3a34-42f6-9e2e-226cf86748f6.json | 132 -
 .../394f1fc8-dc2c-4ff9-9ad0-7b3a8a8ddeb3.json | 132 -
 .../03e52d4f-78d7-453c-9685-844dd1636904.json | 132 -
 .../3ce136d5-be81-4b8c-a7dc-4e1346935d35.json | 132 -
 .../fb35accf-0c5d-4f72-8d73-ba366a41a76d.json | 132 -
 .../75e5ca5d-cce1-4463-b398-553399ce6833.json | 132 -
 .../c426bae7-b98d-4343-b419-ac8206196a95.json | 132 -
 .../b17de9f2-6f94-49f6-b908-fa983e8f8f9b.json | 132 -
 .../58ba7ca1-8cca-4668-836b-824491d9cf01.json | 132 -
 .../23da100a-13b9-42a7-ba79-234be551d0e4.json | 132 -
 .../2d0c12b9-cff8-4366-a3ce-7772e4c098c9.json | 132 -
 .../4b87eea2-169c-411e-9d15-caf6b7826590.json | 132 -
 .../62a3cce2-4ff5-4dc9-beab-a06001fd82d9.json | 132 -
 .../0e5961e1-af27-4eee-8b9b-c82ee4ab61b1.json | 132 -
 .../b62352d4-e3b0-4b4d-8d68-e2d973d820c1.json | 132 -
 .../7fadc486-767e-45ef-979d-74ecb858cb99.json | 132 -
 .../d0628e6f-a6f3-42eb-b9fc-e880ae8c0688.json | 132 -
 .../0999a066-1151-4445-b130-00d8fe4a516e.json | 132 -
 .../1efc09d8-6a5c-4d48-b76e-2e04ef97b676.json | 132 -
 .../1a59412f-fe78-4ecf-8951-8f2996dd374f.json | 132 -
 .../b5403311-2069-488d-af98-27da14496c15.json | 132 -
 .../6c10c176-b2b6-4216-91c0-1444944612f7.json | 132 -
 .../80ebd92e-d9b6-46ce-b77e-973c3f3f6051.json | 132 -
 .../0418e36f-17ea-46a2-bfeb-91cc0ff719bf.json | 132 -
 .../4f5ba3fc-694a-45b1-ae9d-2c7d33e41519.json | 132 -
 .../8b0d1556-bbd5-49e3-b881-32224bc1aa9a.json | 132 -
 .../524e634f-280c-4f3a-9f1f-bdda19fad740.json | 132 -
 .../cb82e92b-f207-4fbd-9bfe-43184769cdbd.json | 132 -
 .../0b674103-4e55-41f4-accb-b7be73671801.json | 132 -
 .../fa0290e0-723f-4502-90b6-c77007fffc1f.json | 132 -
 .../c3827ecd-d02a-4464-a098-110f4fb54516.json | 132 -
 .../af9700fe-20c0-4b7c-9f3a-c4d78fab7911.json | 132 -
 .../959a4e4d-211c-4e45-94f1-f8f877e0b36f.json | 132 -
 .../96a8b3c0-d6bc-41fe-8967-0d798669aa8e.json | 132 -
 .../ed5d2ca8-d551-493d-8877-348204ef91cc.json | 132 -
 .../04e20a14-8346-4801-8515-189861c857cb.json | 132 -
 .../eec2da56-ba0a-418f-afe1-8a46882b9839.json | 132 -
 .../321cf68b-9220-4ada-89da-061341a20a9d.json | 132 -
 .../86fda025-2345-4a40-9094-223b96b21f13.json | 132 -
 .../3c734233-9868-4ba6-83c0-2b63f2ce8980.json | 132 -
 .../7f5eca48-0ab9-4ef2-85c2-a7f1fe713afe.json | 132 -
 .../f5e0e809-08b8-43dd-a44d-875f365610c3.json | 132 -
 .../8d267135-a7e6-4ec5-ae09-66478804bb66.json | 132 -
 .../4940ed0e-2c1e-4408-9806-49ceed30a69e.json | 132 -
 .../5f6f7b7c-ef6a-4468-aae5-d7dfc25c5659.json | 132 -
 .../5244ee3c-7d65-434a-acfe-cdb277ff5264.json | 132 -
 .../eba4644f-d455-4a23-a16f-8ecb038ffe7f.json | 132 -
 .../fb270319-7010-4946-b60c-409aebe41aaa.json | 132 -
 .../d57bd77a-11cc-497c-b0bb-31c1ffa63dc2.json | 132 -
 .../0220984e-fe8c-4e72-bc3e-92b949ffe769.json | 132 -
 .../16482634-ec03-463a-9deb-2230ee955800.json | 132 -
 .../4c1db32d-96fc-4a66-b083-530a3e75ad6d.json | 132 -
 .../c0c5c846-395a-47ac-9e8e-e598939f317d.json | 132 -
 .../6b3f6b59-a8eb-48c2-acbc-92e8f34b2dd6.json | 132 -
 .../d017e3bf-2abe-4b84-810e-e0eaf973adc3.json | 132 -
 .../62a3ecb8-f6d1-429c-807f-5545b2a5897f.json | 132 -
 .../748557ce-1a49-4b3a-9c38-9007dc04aafb.json | 132 -
 .../95d43d01-a75e-4af4-a2cc-b60f832071d3.json | 132 -
 .../4dc7c889-7839-4047-b48c-33be5b688e72.json | 132 -
 .../751851c8-9a7f-4135-a106-eab4efbd0734.json | 132 -
 .../2930e30c-9f2e-4248-ae3b-ed7ffbd12f8c.json | 132 -
 .../c1acc460-aeb8-4a99-8ca5-376ab60fb74a.json | 132 -
 .../33b8b64f-7da5-45aa-bf80-7145ef704229.json | 132 -
 .../2662d257-49e2-430d-b44f-b0b347c61271.json | 132 -
 .../870b639b-ee7a-4b13-872b-52657539c836.json | 132 -
 .../6ff20678-a335-4fa8-8126-9f96ce247f34.json | 132 -
 .../19c4ea89-896a-4577-a386-c2470eaf743f.json | 132 -
 .../22eb2479-16ff-4a56-b9e4-e8835da7ca0e.json | 132 -
 .../aca3f1fd-9c46-47f6-81c6-dc56a702c1de.json | 132 -
 .../071ca686-5950-4af4-80f2-969b1008e370.json | 132 -
 .../78977c34-33f8-4037-86e0-dfce1d01c3f8.json | 132 -
 .../480e4294-c8d9-4088-9b8c-7a239d57f683.json | 132 -
 .../be9b21e8-90ce-451a-bcaf-2ebc7c72bc34.json | 132 -
 .../b0054dd8-e62c-4d0c-9b18-090851c3a7e2.json | 132 -
 .../985e479b-658a-4548-9b5e-c9c04b8838c1.json | 132 -
 .../d0ef8af4-156d-456d-9e33-b2cdb3f8c04e.json | 132 -
 .../5050c787-2f95-4a17-a4b0-c094860627b5.json | 132 -
 .../bb5c8274-4324-47f2-94c5-d0c831ce0de7.json | 132 -
 .../8113a26a-5941-4f3d-872a-bdde5456ad97.json | 132 -
 .../5b60047b-2e85-4a47-a31f-4c07f4bd2c30.json | 132 -
 .../88d79858-3a35-43eb-8da6-95b80b5deef6.json | 132 -
 .../63266a49-01ea-40f1-83ef-778f391aff2b.json | 132 -
 .../f0da069a-833f-489a-a923-c79542a3a9a6.json | 132 -
 .../205b9da8-d561-41ec-946e-1d2f9a43e437.json | 132 -
 .../2ea4da56-4b95-4222-a4e2-f57c73e0ee4e.json | 132 -
 .../c086f693-cef1-4212-9c17-669b210f4caa.json | 132 -
 .../290995f2-9982-4f29-ac74-dc646905206c.json | 132 -
 .../c60e65e6-d771-4c53-80d0-c1e09aa39377.json | 132 -
 .../fcff202d-3b4f-4ba9-b3f6-1122d8abcac1.json | 132 -
 .../5f0fa37a-e829-402b-b2ab-c68ffa248b6e.json | 132 -
 .../a0b4a345-3530-4da2-8403-87259bbd1405.json | 132 -
 .../3548f0ea-f3ab-4a0e-9c77-5ae62014ed44.json | 132 -
 .../707270e3-334b-4eba-84c0-2795ae53d79a.json | 132 -
 .../c827bee3-a181-42bc-9387-ca132d59c8ba.json | 132 -
 .../d3e8949b-f6f8-459f-891b-f4900ff806cd.json | 132 -
 .../35d5f5e3-74eb-4eea-9f78-b7b8969830a2.json | 132 -
 .../4cf4479a-622a-4bc2-86f2-aa526216f24c.json | 132 -
 .../6ed27890-3e61-4c7d-8c94-a78c0b34ba32.json | 132 -
 .../87b5e360-7867-4edd-b45e-e7bb92a91b69.json | 132 -
 .../d93116b8-28ff-41ea-8273-56f7ae11cf18.json | 132 -
 .../ba8c2c17-64f6-4cdb-b3b9-8977ce1bdbe2.json | 132 -
 .../5e5602cc-b4de-4247-aa6d-940817fc849b.json | 132 -
 .../cc5f27f5-36d8-49bb-9c9d-7879598bfe71.json | 132 -
 .../aec03bd9-808a-4c3f-bbde-40bcac5775fb.json | 132 -
 .../b4ae6f0b-8a6b-4c60-8eb2-3e202877bcf5.json | 132 -
 .../c68deb4d-73a8-40ab-b4e5-1773b7ec4ed8.json | 132 -
 .../a93c5674-599b-429c-a322-3c6bc7248f45.json | 132 -
 .../5e6374a6-56bd-4bd9-b04b-30ec9cf234bc.json | 132 -
 .../c3d2fc86-a5c4-4e92-bcf9-26096ca32ad4.json | 132 -
 .../1b49cb06-3ee1-4945-aaed-12c868d9e45e.json | 132 -
 .../65853bb5-ff3e-4880-8c32-ce9aabcadd7b.json | 132 -
 .../7fecc176-debf-4bf7-b3f3-479d05678a1e.json | 132 -
 .../3c965626-a264-40db-93e1-cd7659d0662e.json | 132 -
 .../50fa6f0c-d689-4380-b619-253209b5badc.json | 132 -
 .../adb25c88-6113-4307-bbf0-d377f757bc18.json | 132 -
 .../b9ac5e03-c878-4e46-a89c-1906f3b91dce.json | 132 -
 .../d6a6badf-4472-44b5-af9e-4282e4406a8e.json | 132 -
 .../92e62d3a-3091-4538-b6da-ba705e11687a.json | 132 -
 .../04f5fdc6-f1cd-4b2d-947a-86fee67b3b62.json | 132 -
 .../5013ccfc-6bc5-4862-898c-1ca781f92572.json | 132 -
 .../38fff98c-72b1-453c-a2cf-cf077dd19d10.json | 132 -
 .../42911928-ef64-474b-828a-02ce3383773e.json | 132 -
 .../7989d7d3-c5e9-43c6-80a1-6de51533f9bf.json | 132 -
 .../5b9acd52-7eb6-4099-98be-ecd6cae07835.json | 132 -
 .../666bef5a-2d62-4743-bff1-07365716ab19.json | 132 -
 .../85de411c-2308-4824-bd6e-3327eeb6fe3e.json | 132 -
 .../df28c4c2-d6a4-4ab0-a1ac-faf00a93de99.json | 132 -
 .../6fb37ad0-b41b-4ad7-91a2-79bbb835d445.json | 132 -
 .../c41df02e-5aff-4de6-a1c4-d45b5585e29d.json | 132 -
 .../aa587b4a-9c19-4231-ba72-9b66446460f9.json | 132 -
 .../be14e75e-4fb1-41aa-b168-1ec23eb305e0.json | 132 -
 .../73be4a2b-28c9-4208-8107-3734fea25008.json | 132 -
 .../0bf2fa4e-3bcb-46ff-a068-f4c796123c6d.json | 132 -
 .../9f8fc05a-8658-4ed3-994a-965e6882d242.json | 132 -
 .../ced11f6e-490d-42e9-8f3e-00e22cfc2910.json | 132 -
 .../70ba788b-fe8c-4667-a859-0fb122de22b9.json | 132 -
 .../e93f2d5f-7ffc-44b8-b2dc-d07b73de44ab.json | 132 -
 .../15cacfe0-bdfb-4b87-a813-bfa70ff71984.json | 132 -
 .../cff00e2a-41e3-40d2-aab3-4bb3bd7d0d0e.json | 132 -
 .../e1eab0cf-2c6d-44b2-8aaf-a75347741529.json | 132 -
 .../ed221db8-cf81-4257-8785-db9381eec5b7.json | 132 -
 .../b314468b-401a-4318-b022-c966bf3366aa.json | 132 -
 .../a0dbb2eb-66c7-48a3-a85c-725b49141edf.json | 132 -
 .../812a36ec-4928-40a9-9aa8-ee39d7bb02f5.json | 132 -
 .../77af2424-0a23-49f3-97b0-316d04a33547.json | 132 -
 .../6f422676-2d7e-40ed-a5e3-4afc25564cfc.json | 132 -
 .../43923dd6-838a-4259-a938-7766dfd9c07e.json | 132 -
 .../dba94a49-02b0-4e92-bd6c-c6bfc9be3cfb.json | 132 -
 .../16a782dc-0795-4281-aad6-4f664a0940ab.json | 132 -
 .../5d24d4ad-9f37-4634-ba23-74fbc74fd298.json | 132 -
 .../043cd315-fcb7-4871-ae79-dee3fdefaef0.json | 132 -
 .../3c377d7e-14bc-4c82-9ada-7560552abbe4.json | 132 -
 .../43bb650b-8bb7-41b4-866a-cb2dad1499d6.json | 132 -
 .../bdf8f907-37ca-41ca-9a4e-f4dd446f895f.json | 132 -
 .../14a1872c-7afd-4cd4-ad87-853e4fc0847e.json | 132 -
 .../887e4ca9-ed48-4b33-b933-f8534a8d0377.json | 132 -
 .../c585488d-4043-482f-b1fa-4a61e96f7f0f.json | 132 -
 .../d64541f6-19ef-4f04-a991-93efec6fe24f.json | 132 -
 .../1c13e194-8bee-4456-a249-f71e7e34b0eb.json | 132 -
 .../1d3db737-20e7-4da1-a311-e60de0b41c93.json | 132 -
 .../7b73d50e-358b-4961-8b58-63765ce5a82a.json | 132 -
 .../81dfd69c-cf01-4114-8157-fd09af6f490c.json | 132 -
 .../f38240ab-35e4-431e-b4d5-b1b0e1d57c5f.json | 132 -
 .../01863b4f-9550-49c3-ad83-74c0bb535eb9.json | 132 -
 .../edd25437-38bc-443c-9da3-bc041270447e.json | 132 -
 .../31836d43-5022-488f-ba9e-379195809069.json | 132 -
 .../2a5a3ed6-7137-49e2-a141-497ceba88757.json | 132 -
 .../0b1c6aa6-b94e-4400-9b0d-c39aa1bcd808.json | 132 -
 .../69423132-adc9-4b97-b799-15f37de1d7e5.json | 132 -
 .../54d5bf0f-7c4c-40b1-bca6-5484ef8e2a04.json | 132 -
 .../cfe8f9c7-e9bf-4a17-afa0-d5b8f46d24e7.json | 132 -
 .../7fbc0323-1c78-46b6-a08a-6e5870c64e53.json | 132 -
 .../1c769f0d-b99d-4b82-a529-f5264f7b3349.json | 132 -
 .../a9365685-e299-48e2-931a-c63e123a9e00.json | 132 -
 .../bdf2d61a-daa1-4b1f-9245-43ff263540fb.json | 132 -
 .../f0b4eef9-dab2-48e2-87f8-ad83ec33ec23.json | 132 -
 .../29e10491-8c34-4b7a-a0bd-77f6ca0dc54c.json | 132 -
 .../c588d86a-80c4-46d1-93e0-b7fa8491f3b3.json | 132 -
 .../0b11eb9a-61c8-4af1-8335-24bef2597e5d.json | 132 -
 .../7d31e5fd-700a-42a8-bea8-8989e8c52603.json | 132 -
 .../f993880a-3c7c-4af9-a3ce-3c27207b9a3c.json | 132 -
 .../2fae7e4a-8c28-4be8-9391-ca79077e32c2.json | 132 -
 .../436e651e-6f04-44ff-ab3d-db8ed0d639bd.json | 132 -
 .../9fbccac2-c840-494e-a24d-a6f0c9a07b88.json | 132 -
 .../a4ee6a33-df51-4a4e-a13d-45488a094fd7.json | 132 -
 .../a3923f10-e64c-4556-9616-4fe7072eff60.json | 132 -
 .../ca15d972-9075-42df-884b-5d069f6ff425.json | 132 -
 .../905909a5-abef-46bf-9392-c97873e229df.json | 132 -
 .../95bd05cf-8f59-409d-a99e-d249bad6c561.json | 132 -
 .../76b12246-33f6-4992-a0ab-38704dcf6345.json | 132 -
 .../e4415806-0ec0-465a-b28f-9c8741436fb4.json | 132 -
 .../98e62ab5-d35a-42dd-904b-bed9c50f3745.json | 132 -
 .../8fb3596e-224e-492b-bdb6-a95a16656eb0.json | 132 -
 .../154203c4-d86e-4c36-806b-c45c5cc568ce.json | 132 -
 .../e42c01f7-2869-4103-bbfd-81aa5a15c140.json | 132 -
 .../323d2f94-5e04-4627-9f74-129217f53eea.json | 132 -
 .../6bcc284b-8973-47d5-b5b1-1abb7a3242ee.json | 132 -
 .../691cace3-5316-4f5b-8693-67efb24a0a06.json | 132 -
 .../d387b3dc-9e76-44a6-9a9f-132a4fd762b4.json | 132 -
 .../f6f515d3-f5e9-4362-be51-bb8fc05527e6.json | 132 -
 .../2e1e215f-b622-439f-a13f-531441e25ae3.json | 132 -
 .../d50d66a9-a0c4-4b82-922c-9d012f1b50a1.json | 132 -
 .../ea7292a8-3f07-47be-b8ae-7d352ed1ecb6.json | 132 -
 .../4eedd6d4-279f-4660-8d71-708a27bb53e0.json | 132 -
 .../9c0f67d1-f95d-4ca0-a234-2e09ac788f55.json | 132 -
 .../e5c0fbc9-f424-4b04-839a-8335adaf89cc.json | 132 -
 .../d91107fa-eb8d-4d01-90a2-fc9831f337b2.json | 132 -
 .../926999bf-1ba6-4321-82b2-fcced4336739.json | 132 -
 .../57d481bf-0db9-4208-afda-dcd20df13964.json | 132 -
 .../eb417e47-fe63-4dc5-b3e5-28782f3782da.json | 132 -
 .../b0f516dd-7185-4906-87a5-3c6f019894d0.json | 132 -
 .../1e562944-a205-4ef7-aff1-3776595d131c.json | 132 -
 .../6ccaf08d-1b0a-4ca9-941e-a71e2dce5cb4.json | 132 -
 .../2064938d-9f05-4740-a4d4-2a2da0eac21d.json | 132 -
 .../43240184-8245-43ff-a971-678523918fe0.json | 132 -
 .../b3b854b6-700c-4297-b335-6acc3c385f84.json | 132 -
 .../a9d79c6a-f99a-4b60-8e37-ee2cdfe75f30.json | 132 -
 .../88e1dd78-d3bc-401b-88e9-d963bac181db.json | 132 -
 .../a41bd607-f319-4063-a6e4-813f43e40568.json | 132 -
 .../8629aef1-c673-4b17-a9cc-b361a53bdaa7.json | 132 -
 .../532c927a-dc0c-4e65-8ab0-7b9ddd889d89.json | 132 -
 .../843f9927-9865-4066-9cc0-f0522d3b914f.json | 132 -
 .../eeecb2cb-e286-443f-84aa-d825702a4ad8.json | 132 -
 .../36ab4f5a-b2cf-4d01-8283-9eaf2c90928f.json | 132 -
 .../c4e810f1-ffb3-4ece-b445-64e339761530.json | 132 -
 .../025725b6-0034-48c0-a720-5fc210e5e24b.json | 132 -
 .../7bdd8928-c336-494e-9c87-de9ecc2749b8.json | 132 -
 .../ff7369dc-3ff2-424b-80b0-e06a141b54f3.json | 132 -
 .../a6dc7253-75fd-4897-be85-8ac89fc11f8e.json | 132 -
 .../296ceacc-542a-4000-bf9b-ae59b33a53ce.json | 132 -
 .../13870577-7579-48b4-9c92-202318ca6ecc.json | 132 -
 .../6ebd2806-2623-4773-93bd-1036ff01cb8c.json | 132 -
 .../99d6a44b-d556-4674-8ade-a5b30cf99255.json | 132 -
 .../605118a3-316a-46b5-9719-f596e361a2a8.json | 132 -
 .../271d2829-fbd4-438e-9f09-59539af68c8b.json | 132 -
 .../107bc549-75c1-4272-b567-f8ab9f6cd675.json | 132 -
 .../dfb451e9-c1c1-45a1-8082-155763366129.json | 132 -
 .../b2d80977-d079-42ec-b057-5aac530b9d70.json | 132 -
 .../16b33b80-3b4b-4edb-b89f-3d93dca8969c.json | 132 -
 .../63c94e0a-4572-4b8a-bfe0-7f88bb847d7f.json | 132 -
 .../538f2b43-328c-456d-8a40-ff2b37924453.json | 132 -
 .../fb7a68e6-716e-48c6-96c0-d227735f9a7c.json | 132 -
 .../3593d4b8-5602-4cca-935f-a76e342f060a.json | 132 -
 .../72d503fc-b221-498e-811a-a806769175d6.json | 132 -
 .../ad7d9698-d9e6-4f2d-9767-987835626c8c.json | 132 -
 .../98899942-fcf0-41de-8587-44d7429bea47.json | 132 -
 .../bb51eb59-88f6-49c2-814a-11b2c80313d0.json | 132 -
 .../d8563f36-e299-4186-a5dc-9dae51824e1f.json | 132 -
 .../43bc0528-7bc5-4eac-8848-c9995079450f.json | 132 -
 .../ce19893b-a7e1-4f8e-96f2-eb9cee2afeac.json | 132 -
 .../24629e14-d197-4a5b-adff-7840af652f22.json | 132 -
 .../9c3ea35c-2cf7-4c31-8b83-c69df3cd9448.json | 132 -
 .../46548403-6eb5-4f7a-874c-1327420f4cab.json | 132 -
 .../0bd9c061-b7ee-4bc2-9deb-ea7eea012c49.json | 132 -
 .../aa2fe858-111c-45e8-b0d4-0048d7fc7ef7.json | 132 -
 .../ad03cae6-b126-4157-a225-9576e4d651d0.json | 132 -
 .../0d57b65d-3dd4-4185-b8cf-531105e94b5e.json | 132 -
 .../f8882044-6e71-4788-b2ee-f51f85e67ecc.json | 132 -
 .../3c8f96c5-af91-4f41-a0b4-6e1b7d55d8ad.json | 132 -
 .../e26743b9-4caf-46f8-bd5a-7e4445c850b1.json | 132 -
 .../febd4016-3a30-4b26-93e5-f7b556781b9b.json | 132 -
 .../ae82125e-94ac-48ca-8240-807e4b7ef9a0.json | 132 -
 .../5321fa0b-b010-4e1d-9f20-a97b56f4f937.json | 132 -
 .../d25a4602-ea50-4a53-952c-112ba250123b.json | 132 -
 .../232e3fc4-5cd2-4515-9e15-acd7d56bc34d.json | 132 -
 .../975f54fe-a581-4ce1-b0c1-7becb7605f09.json | 132 -
 .../92ae4461-48bc-47fe-a3ad-ea4c3452d395.json | 132 -
 .../638e1cc0-9baf-4555-a278-4b21c46af86f.json | 132 -
 .../cef4161a-4e1c-4a92-bca8-b07f957a13b1.json | 132 -
 .../715b556b-2bc0-4864-b4b1-b7413a5d45bc.json | 132 -
 .../7552ad5c-5d1f-478b-a931-036083b2954e.json | 132 -
 .../7bb3ae9f-9bb3-4bf2-9d97-d7f4f30697ac.json | 132 -
 .../821d67e5-da8d-4383-8825-3bfa72a91fc9.json | 132 -
 .../c5bddcba-4a40-4fbb-93e8-aebd06a70a66.json | 132 -
 .../dc35237c-606d-4609-927a-566bea767312.json | 132 -
 .../3924d1af-e167-4186-a34b-d9b4b8c26d59.json | 132 -
 .../f733c4cc-90fc-4b31-bed3-c57dba6d4b6a.json | 132 -
 .../08f933a0-b096-4271-890e-0df7e20d1d20.json | 132 -
 .../8434e448-ed77-45f2-9c31-39128912f842.json | 132 -
 .../d801037b-1eb0-4058-9096-429e5237e015.json | 132 -
 .../e0c46f18-598e-402f-8955-68e71fab67cd.json | 132 -
 .../4b987cb5-cf7c-4866-8cf0-9926f78c2de9.json | 132 -
 .../ec658058-1075-4918-9dc9-fc79d0dcf897.json | 132 -
 .../b68baa86-3e1a-4888-98ba-2ecede79b4a7.json | 132 -
 .../0b11c8ab-2cfa-425d-9d81-d999f94401db.json | 132 -
 .../a3e48db8-3679-4f19-853d-82a73ef49400.json | 132 -
 .../7dbf35b2-80c1-4181-80f9-850ea51cead2.json | 132 -
 .../231f47db-1662-4313-9ff4-f32883f5615c.json | 132 -
 .../c79df898-14c6-4f00-9f65-0d01cd34ed61.json | 132 -
 .../2c52917f-c396-410d-bc78-c93c433797fc.json | 132 -
 .../0f1d2925-4e1c-495b-94be-f3515fbd53d7.json | 132 -
 .../5cbb1972-9895-4689-9f6f-7e0037829a78.json | 132 -
 .../6bc42e37-1f31-47cb-97e4-9d0b28b53691.json | 132 -
 .../a1573b95-59e6-4ae0-bc12-6ef6fee90b76.json | 132 -
 .../78c61b39-3c76-4af9-8d5e-fcd67d6c8779.json | 132 -
 .../e4c06400-da86-4448-b421-23476f50bdb3.json | 132 -
 .../48f4c2a7-e819-4789-92ea-e02c5e92d3e4.json | 132 -
 .../cd9cbbac-f1ca-4193-88cc-e5968cc1bb62.json | 132 -
 .../ab3685ab-1795-4a0e-8ee4-4f509616d1b8.json | 132 -
 .../9018f443-a63f-4e07-b10b-272f66d1eb0d.json | 132 -
 .../548d1536-b941-43a9-a60b-ae5448b70933.json | 132 -
 .../99853109-17d9-46fa-a502-e4c977c1fb8f.json | 132 -
 .../e171a0a0-f46d-404f-84e8-539155284e17.json | 132 -
 .../eadd93e5-5770-4d4a-a1b2-6e732a82ce34.json | 132 -
 .../151cb8c4-0a7d-4886-80ea-560902e1f932.json | 132 -
 .../1acb97c4-a9d2-4ec8-9486-77eb6857646c.json | 132 -
 .../1d803ac5-3ca6-4cb0-bcd1-779eaea1562d.json | 132 -
 .../81562e50-23c5-4ef1-b98c-b40625f3b8c6.json | 132 -
 .../95fa292a-ee64-4844-9646-ce3cc7f730d2.json | 132 -
 .../4d14c584-b5a1-41cd-9605-78088dfebd7f.json | 132 -
 .../1415d3d9-d7f8-48ef-8a2f-aa675c4c14db.json | 132 -
 .../4b0ab369-e72f-4229-b449-3a21ee9d2c95.json | 132 -
 .../478b6c1f-3329-4c9b-9d90-59b8b551c1af.json | 132 -
 .../212f8dd2-3c61-45bd-a3de-2326334feb73.json | 132 -
 .../9251282e-f72f-406e-a2cf-e7063516f624.json | 132 -
 .../91a3c739-7e16-4d21-8879-bb2fd4d4c6ad.json | 132 -
 .../aaa78d8f-6050-4b5d-bb67-da6c9d1ee065.json | 132 -
 .../1f0430fe-24ff-4ef6-8577-ee5bfa74f18b.json | 132 -
 .../f374772b-2685-41e2-a455-9002e48e3739.json | 132 -
 .../6db801f8-5253-47c0-b87e-6779bff42f6b.json | 132 -
 .../0d704671-c0b6-4296-85b5-eaf972d6be6a.json | 132 -
 .../7e31545f-0865-4843-914b-a71f8a84314f.json | 132 -
 .../431c7130-5a19-4a71-8a92-fea9726769ac.json | 132 -
 .../ca850c4a-14d0-4145-9977-0d33e6e3e362.json | 132 -
 .../7389caa3-6d8f-43e3-b3f2-d9320e56f621.json | 132 -
 .../1e822b0f-0d80-4613-983b-ebd2e6fbfcd6.json | 132 -
 .../1206f592-e6f7-4e7d-83cd-cbe82b37ec58.json | 132 -
 .../e4085c6a-bc16-4328-a724-4b9838b55faa.json | 132 -
 .../b929b955-1fbb-43d0-add1-4d58fdc4097c.json | 132 -
 .../df723a0f-9a32-42f3-9421-780159f7d821.json | 132 -
 .../c1046d2c-0b5b-4ab7-b173-8d5b5ecbc07d.json | 132 -
 .../60c02070-7554-4764-8a02-841ca75a0d5c.json | 132 -
 .../d243f226-149b-4824-837e-e80ab68bae9d.json | 132 -
 .../4f9361d0-2ad9-44da-a1d9-876d43451ae6.json | 132 -
 .../6c6e9ebc-f83d-48d5-b69f-be43d4167a0e.json | 132 -
 .../7cd2c0da-15b8-4ad6-8cad-feb68631c079.json | 132 -
 .../36b84cf2-d221-4e9a-b728-37dc2bf7e1d6.json | 132 -
 .../1fd0d1db-1d75-4b10-bae8-33023c2c7466.json | 132 -
 .../c6c02512-6c91-4818-a084-c48915fd83de.json | 132 -
 .../326affa2-9ea4-4fc9-b60f-d2abeb7493c3.json | 132 -
 .../b3a190d1-5b86-4439-a21e-1f118239db82.json | 132 -
 .../b37a7db5-b26f-4a82-b27c-6c3a2ba72fda.json | 132 -
 .../05a59445-b816-4982-9b1a-1c2394ffbaa9.json | 132 -
 .../ff952579-e92d-4af8-9497-f49fed5efba0.json | 132 -
 .../b541ede0-6de9-4557-8280-43567fd3dd96.json | 132 -
 .../8514f601-0bb2-4639-90cc-29e96088e7de.json | 132 -
 .../57e6d0cf-943a-4b83-a1f4-4f03b5066523.json | 132 -
 .../ec205127-21c0-4edf-bb3a-ec8ccac4fcdb.json | 132 -
 .../14b260e6-4300-43ec-b7af-587a2f5b03fb.json | 132 -
 .../53de1fc9-7097-4103-b731-588a7bf39f80.json | 132 -
 .../1a1031c5-3ec2-4d12-93eb-e0a3b0448ed4.json | 132 -
 .../51b62d59-f39c-49ca-af0a-73df6440e29d.json | 132 -
 .../622a0ae1-0eb5-49f0-bc44-d396c7233e27.json | 132 -
 .../71291a41-283e-42ca-b192-7b759e3c3712.json | 132 -
.../7e504fef-b304-4c1a-856d-06e56a8869d7.json | 132 - .../f8258f5e-8826-4fe1-b9d3-61708e79d4ab.json | 132 - .../099ce031-1e11-4a07-bac1-03bef9b915d6.json | 132 - .../75ff25fd-e5f7-4380-b192-cbc8a8ee95aa.json | 132 - .../cbc43c7a-d8ac-4b03-a383-703f7fa51757.json | 132 - .../72d7f252-1bff-40ad-9ec8-1ac2a2e02a8e.json | 132 - .../5eb10878-11e6-43ad-9bb5-658a3495129c.json | 132 - .../23b29cd4-cfd0-49f1-8959-c3aa8be9722f.json | 132 - .../03db2532-f8e0-41e9-ac0c-ff2913f4b12a.json | 132 - .../273f0d50-aa4e-4469-8360-2ce0a2e1a850.json | 132 - .../79a48e79-d59b-4f86-a8f4-3af174a9ee0b.json | 132 - .../9da9a0e6-257a-41f6-b3a3-e3279a4924db.json | 132 - .../dfed058c-48b2-4e1e-9a29-624771e3e9dd.json | 132 - .../bcb53a8a-1670-400c-aab6-bd8ed2ebcdf4.json | 132 - .../8438a108-0d5d-48b6-b73a-981d13329daa.json | 132 - .../88616292-1e38-4481-af30-6b60e28fb097.json | 132 - .../44094907-0b09-4706-a117-116a7e10a6e5.json | 132 - .../d19e8078-87e9-4760-9b91-6b5f478820e1.json | 132 - .../896464f1-01bc-4370-8d90-3368323b2908.json | 132 - .../9889f0b9-9051-485c-bd44-32b1e56b865c.json | 132 - .../6563ce79-6df4-4c78-89e2-064f1250d898.json | 132 - .../b1778755-e6e6-47e2-925d-44d786c4ff62.json | 132 - .../3ae923b8-e9f4-472e-8d5e-54fa5f42ce01.json | 132 - .../40831e23-0a9e-4bdc-a365-9399b6b82ff9.json | 132 - .../4a60fa82-34dc-4b0c-9102-65adac5039e4.json | 132 - .../75ff2c43-dd19-48ae-9ba3-f99cdbadda1c.json | 132 - .../d7962833-660a-4b9b-9836-8a2f3251f38e.json | 132 - .../ad8ecabf-a868-496e-892b-582efb54fa6a.json | 132 - .../49f25d3d-80c9-4723-8fa9-1501d44d70aa.json | 132 - .../70ea520c-3e0c-4412-9dbe-40a00801335c.json | 132 - .../8e7f8bad-812b-4f6c-8dea-1cf44584c300.json | 132 - .../3b39a8f0-c5ba-4f74-9d27-bf5b389e038c.json | 132 - .../702a14d5-a7fd-4926-ab26-e4c3b7f5eda7.json | 132 - .../20e5d087-7b20-4a39-81da-7334354b61f0.json | 132 - .../4c5a769c-0472-402c-8e97-d24e5b302bac.json | 132 - .../96166735-ed03-4931-81c9-d3daed1913d9.json | 132 - .../06d9b1e3-d054-4fa5-bf1f-9d6149e5111c.json | 132 - .../776fd8d8-9846-4359-97d4-2340425d1315.json | 132 - .../197ae1c5-c9b1-4912-91a3-8ccacddc1be6.json | 132 - .../1fffd3d9-1c6b-4965-84e6-980bb0a13af3.json | 132 - .../57e8aaf0-f10b-4024-9f93-7b7f13f3ab10.json | 132 - .../304d5bee-df2d-40fc-b4a0-e3d99178f4bd.json | 132 - .../6126d30d-e2dd-4b8b-9cb3-acdc76084bbb.json | 132 - .../fc7284d9-a73f-4562-a781-5cb87247183f.json | 132 - .../26ab447c-a850-4197-983a-a0dca4532029.json | 132 - .../ee9e2131-aa99-49e1-9814-f0664614354b.json | 132 - .../23c472f7-f060-4a69-8f72-12490675825a.json | 132 - .../04172bef-c06b-4c08-b2af-9e1fe4d97664.json | 132 - .../3436355a-d2fe-411f-a764-4cb8284deb4c.json | 132 - .../265655c0-2ead-4dd7-8c7e-4bee69d51bce.json | 132 - .../645cae82-9e7b-4d1b-b944-e3783089c1c1.json | 132 - .../ab658117-7c6b-428f-8f60-bf88a1d8a5bc.json | 132 - .../03c4b5ce-3b22-4d9f-bf60-b626b52a114b.json | 132 - .../ce7e3a31-c65b-4521-b685-fcbd067c75d9.json | 132 - .../adb53e2c-5dee-4840-8eae-e0186c6e103f.json | 132 - .../ba89563d-f53a-4bf0-91e1-92ac950523d8.json | 132 - .../3fc0ad8d-4bb2-401a-9baf-b94b39b7e1aa.json | 132 - .../ed816bcb-bbe9-48ae-a6ac-3603779a985f.json | 132 - .../f347ed24-066a-4cba-8478-f03628cb2b5b.json | 132 - .../ffddfea0-d17e-44e7-8931-a9601e9cb26b.json | 132 - .../ec351fa1-78c2-48c6-83f0-7c2a9b2f0731.json | 132 - .../a0038c34-130b-49dc-a93f-94706a3dad50.json | 132 - .../cbd5ea42-1e5b-4984-bdcf-e60fbfb9d692.json | 132 - .../b902e2b2-a0b3-4467-b076-b98717c40d74.json | 132 - .../4c749665-59ff-49df-a193-0262f66e6003.json | 132 - 
.../c99899c6-95e1-4dea-ac12-f8df49728a3b.json | 132 - .../13deca9f-073e-444b-bf79-35e816f7c312.json | 132 - .../c8adc0a5-f4bf-4f88-984c-aba506eae6a9.json | 132 - .../b146daaf-ce1f-4520-bc19-21ce8679b220.json | 132 - .../45e1d037-1ed0-472c-a311-c651fde270fc.json | 132 - .../3f4ce54a-01f3-4c23-a4ba-22d47e0344dc.json | 132 - .../470d52be-9dbd-4714-b004-f65cc82d245f.json | 132 - .../c836fd05-1969-439c-91e1-fd0cab816f6c.json | 132 - .../14774c6b-eb03-4abc-92df-1e7a196ca8a4.json | 132 - .../5293ae0c-8022-44d4-b2f5-4f5390dff93e.json | 132 - .../9020f91f-a8f0-447d-af68-247aa81a25c6.json | 132 - .../0cd6837a-8c3f-4529-9ea0-8755e1725467.json | 132 - .../7cb17011-cf77-4e86-b67f-84e6ff4b8086.json | 132 - .../086831f9-c677-428b-a997-4da58733633c.json | 132 - .../d71893b8-b82c-490b-a700-b579d64e0610.json | 132 - .../9893689f-c27d-4148-a27f-cd07b07e98b7.json | 132 - .../90f2df23-a9ec-44be-ade5-89b59cb7368a.json | 132 - .../afd545da-390a-478a-b0f5-ea819f088f27.json | 132 - .../ce776f68-856f-4aee-b7e4-e55d15e8d714.json | 132 - .../9b015729-524c-44f3-9c2c-c42981d7a61e.json | 132 - .../56a54ffc-4692-496c-95df-8e4ad19d4d95.json | 132 - .../4b105969-2ce5-4c62-89ef-efd392c2ca89.json | 132 - .../31af79b1-48c1-4399-9d16-8582c92996ee.json | 132 - .../59a67f29-cb7d-497c-b7bb-1764a665ae33.json | 132 - .../fe57367c-74b7-483e-af54-4f404cbea75b.json | 132 - .../fda2277b-1513-416e-b586-ed05920a0bb4.json | 132 - .../b3dde216-f80a-4664-aadc-b5f5dd3e5895.json | 132 - .../07ed6241-fd1a-46eb-91fd-92a4a8f6bd15.json | 132 - .../ba76c356-cd6a-4636-8ab1-18bb9df69881.json | 132 - .../c6ae54a1-2821-48d1-b689-bbb85aaa70a6.json | 132 - .../6f296f0e-80ca-49b7-94e7-cb45b795c715.json | 132 - .../b5509e11-820a-4ad4-8c6a-0294762502a8.json | 132 - .../90d73665-8d83-4e74-ab7d-29b1d3b6181b.json | 132 - .../72387647-cbac-4b72-9c22-db7029a39457.json | 132 - .../6219ec01-4b6a-4acd-aee1-96c3e8e48643.json | 132 - .../5c323d7c-25cd-4718-8a1f-54d986cadaf2.json | 132 - .../adfab21a-941b-4efc-8b63-fdfb3074ba9b.json | 132 - .../350d00a4-7501-4130-a069-323530bc9729.json | 132 - .../ea809d28-178e-4a0b-ab5a-34739077c5ff.json | 132 - .../243d5ccd-58f3-4da5-8718-553f3f456490.json | 132 - .../a45537a7-76a6-4855-b83b-abe965f13460.json | 132 - .../9be911b6-b9f4-47b1-849d-62eb20c9e944.json | 132 - .../33d7d5f0-cbee-4a26-b5e8-48bdd12492cf.json | 132 - .../4355fbdd-ac72-4f26-8e07-b7e8d774d238.json | 132 - .../4bffc633-e20c-4874-b7db-d1b7dabb8070.json | 132 - .../2d5c844d-d950-4254-bac2-0a986659c541.json | 132 - .../f6e74b3c-9ee4-40c3-bf92-35d965503a04.json | 132 - .../8f1d2600-7347-48b8-9759-11570598459d.json | 132 - .../cd653bfd-2c06-4224-aeeb-bf591995a69e.json | 132 - .../cdf1fcc7-429d-44bd-b76c-d26ee743f6fe.json | 132 - .../4828bd36-5453-4383-8985-08d04a7ebecd.json | 132 - .../4c2baa59-c2f1-4779-9d21-1f69c0821968.json | 132 - .../555c1079-c4d0-4b9e-9d2d-769e7ba32429.json | 132 - .../58a4a1c6-0ee4-4524-9ca1-b40870f1d600.json | 132 - .../eea2a38a-4f1b-48d0-894c-09974894f264.json | 132 - .../3d8063ab-0ad5-43e4-83ff-90b46dee766f.json | 132 - .../da5e0284-7c44-42d4-a110-a23880de277f.json | 132 - .../bef017bb-47b1-48e4-93c4-3b222a16af7a.json | 132 - .../401c83b0-b7d2-4987-9e46-f127fdbb595f.json | 132 - .../c6fde59b-73ed-4179-a907-076be068b262.json | 132 - .../90997fea-6c67-493e-bd8e-5327cfb33ea4.json | 132 - .../08957d63-7462-44ff-9dd8-060a5801a31b.json | 132 - .../a434f569-e7d6-4464-afa8-6104be43fa06.json | 132 - .../e32ed251-e817-409f-b4c3-8f168f1ff822.json | 132 - .../1d9a65a3-d2bb-48a7-8a00-8e4a79c36db2.json | 132 - 
.../608398da-ae2a-4be2-aaf9-6ec8899aa63d.json | 132 - .../80e04641-be7d-4351-a4f6-1318981ef834.json | 132 - .../e74222c6-636c-4075-8d4d-30c73fa70fda.json | 132 - .../aed80361-9304-44a0-934a-52976d7f1bf3.json | 132 - .../709bd280-b03e-4908-808f-34566bc968f4.json | 132 - .../66c495b3-4b09-42ad-b742-4d753c3bde7a.json | 132 - .../e24f7be6-3051-4990-8b93-121aec5402eb.json | 132 - .../0321571b-4246-4490-bd6c-7b106eb8e15a.json | 132 - .../54dbf947-ab18-40dd-9cd7-a496289b2e72.json | 132 - .../d841e204-ed6a-439d-8408-d5cfb3b38dae.json | 132 - .../96b57891-83e3-4948-ad48-64a2a370e166.json | 132 - .../30301818-6dad-45f9-acfb-a68ccc7c0609.json | 132 - .../50743107-30de-4c5d-bf83-cc003af8a5db.json | 132 - .../625ee1b3-e0a1-4a86-83a4-6e66b380f864.json | 132 - .../89fda762-1989-4850-837c-f79ef538c58c.json | 132 - .../1de1f906-0e36-4f79-b159-16ef8ee33ab3.json | 132 - .../d8588222-9e4b-47c1-9f86-92f47c9c8e38.json | 132 - .../15e6e6e6-39fa-424f-ba12-5f209cd4b2cc.json | 132 - .../81225b85-1523-49c1-b770-897112d2e6ae.json | 132 - .../254deaf7-a253-4d41-a10d-1143f86b288c.json | 132 - .../ba0b66f5-724a-4a6b-ac20-a36d530a8b4b.json | 132 - .../eed0b3b4-e277-49ee-aed5-f3599b2d5653.json | 132 - .../96a21b6e-ed47-40fb-85cd-15924330e60d.json | 132 - .../f41f5471-6384-4510-85d2-41f236082583.json | 132 - .../2728eccc-525f-4350-901b-dbc352c78014.json | 132 - .../3e7ae935-46c3-427c-8713-41c659c1828a.json | 132 - .../66782676-c942-4aff-b754-b96cd96cf1f9.json | 132 - .../941a9e27-2ac4-4dab-a6d0-cb9319c79a27.json | 132 - .../caf93f75-530e-4f4d-9cc0-2cf9b0a7f2ff.json | 132 - .../d3ca0458-ee97-4a4c-a6a9-066880ffefb5.json | 132 - .../615bf89b-9357-46f4-82ed-f49b0021da01.json | 132 - .../06398630-23ad-4000-8ea2-fcca230568d7.json | 132 - .../bdfa30f8-da0f-418f-adaf-caafda4c81a5.json | 132 - .../bd5e550c-5355-4e01-bafc-2ca89899253a.json | 132 - .../f842ad5b-24f0-419b-9d65-5a6ff1f5e04b.json | 132 - .../3a09590f-28f3-4161-8a93-d42cec62aa90.json | 132 - .../0f6b76ca-c4b8-40b2-a3af-2ea1c3650933.json | 132 - .../f276ad54-4e3b-4718-ae1f-0479565e4565.json | 132 - .../dec20396-6555-4773-bf02-2cd1fcedda89.json | 132 - .../eebc33e1-0016-4adf-815a-72653a34c01b.json | 132 - .../803c3898-c1a6-4832-ac3a-a86139489810.json | 132 - .../bfaa3d3e-66fd-4477-85af-4b83f13ff05b.json | 132 - .../99debdd2-1dea-4eb6-be5c-c144656cfe20.json | 132 - .../ad67bb88-7f74-4eb4-b771-0b3b60be4416.json | 132 - .../af2f579d-1e8a-47d8-8e44-a599bee83e37.json | 132 - .../763c840e-ea73-453e-8e54-5f4fd6fda9cd.json | 132 - .../4fb40ac4-a637-4b9a-b69d-ba551c0f0938.json | 132 - .../ffc4ef41-4a28-4816-be54-8ffd8e153073.json | 132 - .../f75fe902-f1c7-4e6c-87d6-128688db8d94.json | 132 - .../dbd3098b-4532-441b-a81c-072c52579be6.json | 132 - .../438e4aa3-5e02-446e-bd3a-07ef724d24ff.json | 132 - .../027fdc55-61eb-416c-b6ad-4408912d151b.json | 132 - .../37a4895d-def5-494d-9b62-d8c97ba9350b.json | 132 - .../0d53c27e-962c-428f-b540-35ab027883a8.json | 132 - .../6f7b2d91-24d6-442c-93a5-9afc88e9a308.json | 132 - .../21793520-7d1a-4040-bb96-fa7fe98ae580.json | 132 - .../59d53c40-5b16-4a70-a693-5fb554cf7614.json | 132 - .../b28a569c-6bdf-4547-a2ce-c3e224764be3.json | 132 - .../2de129c8-2259-4367-a619-85d9e8f61e06.json | 132 - .../c242030f-fb2b-42dc-a5d1-687273b17282.json | 132 - .../3b3fdb16-b6e1-40c8-9ac0-02f1f2207eb7.json | 132 - .../ef6e8e0d-7ba4-45ea-aaf7-617f68f2e97c.json | 132 - .../f8c131a4-1fee-4694-8753-88853418ef4b.json | 132 - .../27dec9ff-fb18-43dd-949f-7c0587a5858f.json | 132 - .../060df34d-ab67-43e1-bd56-ebaceb77abd3.json | 132 - 
.../a6357673-3daa-4593-8593-2b65a7d5477e.json | 132 - .../121d4877-1955-48db-a23a-6b0ad0623b9e.json | 132 - .../1f1eab02-219e-4ad8-af50-e103541e1c9d.json | 132 - .../b4cccfb3-1c17-48a3-a211-a26c44de757f.json | 132 - .../05e97a86-681d-42a2-8a47-beade25d8fc9.json | 132 - .../6c0899b4-f066-45f6-827d-11c535ef0634.json | 132 - .../f9660557-b9f6-4ecc-b260-c245f0e62b5b.json | 132 - .../89168032-5840-4c2c-821e-b3d717ade46f.json | 132 - .../10d0aa63-67d9-4dba-9bdc-db7ab3b4547d.json | 132 - .../6f66ae5b-8cb6-4263-98a4-4a1eddfaca10.json | 132 - .../5e715199-7030-47b4-89c6-83ba0968c07c.json | 132 - .../3fca39e8-443d-47da-a858-83a68c18eec9.json | 132 - .../b7518bd2-d3af-49e6-823a-f8d507e8e60f.json | 132 - .../fa399f16-1652-430c-be19-afaf5ab96be1.json | 132 - .../cbe5032b-122c-4a0b-a099-50e998a4bc77.json | 132 - .../fd8c3209-dcc0-4d27-a3aa-d0f76ef86f8d.json | 132 - .../1a18d49c-ad7b-4823-abbc-7191e9d659cd.json | 132 - .../9e2c614e-1104-43a6-9e8f-b7851562e01a.json | 132 - .../7d4b83ab-9c9d-46e5-8cbf-b8afcf781230.json | 132 - .../a42b5d7e-be7f-4cde-aaf0-001e2cf05a44.json | 132 - .../21f6688c-be52-4352-9c95-d37c0a5f6c94.json | 132 - .../e92ba586-7bee-4a9b-b388-e35efde3d36f.json | 132 - .../45ed0bb3-efbf-4a32-9735-d814aa08790a.json | 132 - .../eff28375-89a7-4970-9342-428b07d0c6f4.json | 132 - .../23877e30-b8fb-45ea-a803-47df757ea909.json | 132 - .../8bc25d04-9cc5-4551-a9c5-ce185c7ad974.json | 132 - .../d2d4b5a5-109d-4d26-a166-3d97b341584e.json | 132 - .../ac404d92-7a06-4758-ab1d-fcf840c2b995.json | 132 - .../95ea7fbf-d3f2-4fc1-ba17-05549f6e4d25.json | 132 - .../c101e272-24d2-44db-9b0f-2ed4d17cec41.json | 132 - .../2cb789c7-dddf-42b2-8fdf-4cbd5132946c.json | 132 - .../a414aefd-ce24-49a9-b431-0c6014ebfbd8.json | 132 - .../91fcb6a3-d351-48c8-87e8-e2a06642e925.json | 132 - .../3cd90efa-ddf0-43c4-884c-84337ded14b2.json | 132 - .../c66c21e9-a332-40f9-ae87-bdd78a25d753.json | 132 - .../0b4def91-29df-45d9-8dd4-c4097ec47ba3.json | 132 - .../2cbf258c-369e-4b1c-863f-43cf97c3a7a4.json | 132 - .../8372889e-f9cd-4cf7-aec0-8e18d5c627e3.json | 132 - .../ce4cc270-57da-4d08-9130-62508b409cb2.json | 132 - .../4cfedb8f-0e47-4008-9bc5-fb15e4afa607.json | 132 - .../de3c949d-bab5-4430-bdd1-48e1b7860934.json | 132 - .../011e53cd-409f-479b-9c3d-bfce75a1277b.json | 132 - .../1ff40e45-5be4-4625-9f66-5599a829903d.json | 132 - .../fed97d94-2949-4383-8f25-fa79bd413508.json | 132 - .../f4820bc8-7dfd-4439-af95-21b6cc9367ac.json | 132 - .../36e576bb-de50-49ec-a91f-f134c11bbe38.json | 132 - .../0edd388b-7a1b-4334-9b72-52d84653ff67.json | 132 - .../b3199674-328e-41a0-9aa4-bf39aec735bc.json | 132 - .../52db4d79-7040-4525-934e-0f33e4acec63.json | 132 - .../ee34821e-9182-433f-a8b0-745711e23738.json | 132 - .../10ef0990-5356-432f-b24c-dd107188ec5f.json | 132 - .../47de680d-33b1-4441-92da-4b97a5fc513f.json | 132 - .../96ac0351-2ade-4d76-bcf9-bc0f633f8694.json | 132 - .../31aae266-c14b-451f-8bab-62ee7d5d382e.json | 132 - .../f6edb102-e867-46d1-afdc-3c45166bd510.json | 132 - .../8b7756cc-9af3-4f98-84ac-7fef4c1bdaa0.json | 132 - .../dcf33a22-5e57-4476-a2cb-ebd60407a920.json | 132 - .../15659480-be0b-41c8-a463-873be444b194.json | 132 - .../0444c1bf-a3d3-4d23-bc6c-0a98c4dc1e9d.json | 132 - .../93aa3a13-5069-410f-a1df-6944e0231e0e.json | 132 - .../427ea7d0-c1f1-4cfe-b6a7-555262a7a317.json | 132 - .../c6dbe372-7a3c-487c-87c0-fb324c39f8c9.json | 132 - .../cf8d99c8-8790-4bdf-bfc2-1a6d1fe35916.json | 132 - .../5b5d42d7-8012-46f1-826f-32d839806048.json | 132 - .../5e1bf2cb-55c4-4806-89af-cb9953c7c1b1.json | 132 - 
.../21ee4b33-9829-4cca-9603-c30fd4a1f7ff.json | 132 - .../c6c14a8b-0e9f-4b97-b9f3-27c7250fb8f2.json | 132 - .../6586fa94-9f43-4814-8c8a-8ed244ac94e7.json | 132 - .../df7d7db2-867e-47f0-9abf-d71b79e97630.json | 132 - .../e2502e7e-3a10-49f3-b5c6-b20496fed998.json | 132 - .../51cde18f-09b0-4b66-a962-811ee49e192f.json | 132 - .../4ea48b42-8026-4799-b35d-46757fd2753f.json | 132 - .../52e9b4ae-9119-4f26-87e4-6532d1148ecd.json | 132 - .../4bda68c0-cc09-4945-961b-48776b7b5fc8.json | 132 - .../18ea0ad0-a216-4906-a96c-c8b040398dbd.json | 132 - .../1e2321f6-93bd-4acf-9f5b-c82807a40233.json | 132 - .../13032961-52a1-43cf-b69d-1802c43e1bcc.json | 132 - .../9d444061-2c29-499a-8906-77ef58aba34d.json | 132 - .../1ffdf6b0-b3a3-432a-a0e4-69b4d447bb76.json | 132 - .../8ce733ea-e6e9-4f9b-ab28-f93202507265.json | 132 - .../0e88aa91-609c-4d2d-9296-25b06eeb0342.json | 132 - .../3e235ea0-3f04-4d99-9db2-7cafcbdbac6f.json | 132 - .../5e31a55c-f222-4192-b031-27bb40ba56fa.json | 132 - .../11fd4b70-4ea7-4bee-8caf-8921d4c89f24.json | 132 - .../8e721067-898d-45ca-b4f5-9f523c4ce3d3.json | 132 - .../be5d5480-ce4c-4ade-8c6a-c08cd2826909.json | 132 - .../54dec074-29f8-4863-be37-2c08f6f2c3cb.json | 132 - .../88a15025-556b-469d-be77-c773f2c61038.json | 132 - .../b4f4596b-17e5-40bf-ae60-0b17492ba9f8.json | 132 - .../97ce858e-a64f-4881-b6d0-0a2c0814336d.json | 132 - .../1becd83e-e9b8-49c1-a137-80c5a8dbdf0d.json | 132 - .../337bb321-9c6e-4751-9c9b-d8ba0120dd07.json | 132 - .../cfa95cc9-5bb1-4921-97c7-078f2f929a2f.json | 132 - .../6d5ba3c4-a0c2-40cd-9766-68d36d21c5b6.json | 132 - .../6cc4404a-f3e1-47b9-b56b-34e4269e1261.json | 132 - .../8d820e43-ff42-4247-9ad0-4ed8e70672b4.json | 132 - .../d858ce8e-6a4b-46b1-8d51-03ebc2d8aaec.json | 132 - .../9813dd88-ff70-4d9e-86c5-9b73444275c5.json | 132 - .../ac677432-e7d1-4439-9c05-426059c285ef.json | 132 - .../018f270f-3cfe-403c-a236-483038a0b04e.json | 132 - .../718a40ea-26b1-4cf4-9584-57be798640ae.json | 132 - .../207a28a9-ae24-4a31-be95-96296b2e466d.json | 132 - .../72efedb8-d456-41ed-b1ae-4887cb6c18f8.json | 132 - .../ac91fb37-5742-4a3d-b93a-86c63b90cad5.json | 132 - .../c71d025d-e954-4420-b397-e07c3644d1f4.json | 132 - .../968c3759-de5f-4255-ba95-cafc7a3c70a7.json | 132 - .../5e23b2f7-33f7-4e49-b73a-a02b8650ee0d.json | 132 - .../1b6c64f6-acf8-4cff-bcae-6e8b3725c6f1.json | 132 - .../7908f572-8886-4add-ae84-b4ec0ec17c26.json | 132 - .../9e04ec5c-2208-4569-9b63-4768ed4262b9.json | 132 - .../ee2c8beb-6566-4b19-91d0-8e48c12a3fdf.json | 132 - .../c7579616-0c21-443a-a149-0c51a0ae92ac.json | 132 - .../ef7a1429-db2f-433b-a606-339a9d868e7a.json | 132 - .../f531e13c-79ed-45da-a246-857fd2c884c1.json | 132 - .../0f525d93-663a-442c-9a51-1ad3a5054172.json | 132 - .../15af21e1-3193-47fa-a3fc-1f087216d4d9.json | 132 - .../67b270d9-3422-4770-9957-7bde65acca0a.json | 132 - .../e2d38bcc-9133-4051-82d0-4e4fd66e00f8.json | 132 - .../4ff256af-73c7-4a5a-96da-19546a786c59.json | 132 - .../225cbeef-1d0d-40fc-949d-4ba6696fb690.json | 132 - .../24fcd662-5abb-4bf8-b8df-1c21b048cd92.json | 132 - .../7badcb45-7826-4fd1-b964-c697fbda76cc.json | 132 - .../bfb532f1-3319-46ff-80ae-0ca783a18bb6.json | 132 - .../ea304515-b41f-4e96-a0ec-78c897ebf9a4.json | 132 - .../1fe79ea5-1922-4a5e-8857-1c832353b0a6.json | 132 - .../9098d70f-cbcd-4f6c-bcba-0b1da743396e.json | 132 - .../df4ed9e0-30bc-4a3f-b7a2-8955cbb38d31.json | 132 - .../f68957d5-20a1-438f-9931-6a787aaed467.json | 132 - .../416e0c04-9119-4230-ba71-b0f47e2d4997.json | 132 - .../d57780e2-154e-437d-ac2f-0007e1f9140e.json | 132 - 
.../027d464b-1375-4de7-aa57-e1473d16ba89.json | 132 - .../a81f20fa-57e8-498c-a162-6d8a9be09ee6.json | 132 - .../d72ddbff-8ff7-446f-a74a-10a46bce6e3e.json | 132 - .../f681d612-f574-4641-b34e-95b6de97f9e8.json | 132 - .../cae1adaf-e424-4dcd-943b-5bbb708aca57.json | 132 - .../969ac825-92f2-448c-899a-226e69dee377.json | 132 - .../e108ad28-c155-4162-852c-0f588a136bdc.json | 132 - .../93cfeba9-7d31-45b4-a6e2-99a5f318f5b3.json | 132 - .../c1b16b84-9392-48f3-b483-0a9786925506.json | 132 - .../b0c6e08d-b426-49d5-8a66-ee3d70131b62.json | 132 - .../6a6651a3-b34e-404d-ac25-42c151fb9ba3.json | 132 - .../da63b789-5571-4ed8-976e-146d385b18e2.json | 132 - .../87b900e7-3bab-4e60-b0ef-349667cb2656.json | 132 - .../c9fd4740-4990-4174-b782-9b63c34d6407.json | 132 - .../2582a049-e940-408b-b2d9-7a7bdf470e49.json | 132 - .../99310118-d2ec-4647-85db-fcc22aee9161.json | 132 - .../bedd12e4-da18-4ca6-ba51-6d13e1c80bae.json | 132 - .../6767e14a-bbfa-4a0d-8120-1f48a565474e.json | 132 - .../70260aac-1bbf-4913-9dcc-58633d055314.json | 132 - .../fba6e1a2-c197-4731-91ea-f6d059ba8b16.json | 132 - .../22e74d0c-70d6-43c5-be4d-62842d93fedf.json | 132 - .../f7c33065-1da1-4da4-81c7-f2c9307b6e9b.json | 132 - .../ecdb4661-426a-46be-aefc-7e04483cebc0.json | 132 - .../236976b3-af46-45ac-a8a5-f5897e3468a1.json | 132 - .../fd175296-a5f6-4914-80e9-b8b75bc659de.json | 132 - .../d910bbaa-d55c-4b00-9320-856a8a6713c0.json | 132 - .../99a5f123-5d2e-469b-884e-c9a64c6bc197.json | 132 - .../ed17a715-f0ae-461c-9618-ac952c450ec5.json | 132 - .../3dd2a474-9ea8-4e26-8986-5bcc67c78c39.json | 132 - .../b39e14a6-c05f-4e88-b2d4-63a199aa61a1.json | 132 - .../39893637-552a-48d8-9b83-433415eb26c3.json | 132 - .../f9549713-f487-4e26-bfeb-ec6d394b7014.json | 132 - .../02579c41-f117-4412-9c00-ee7db3e9ab97.json | 132 - .../bfa1d761-00aa-4438-a5de-972d934c63d5.json | 132 - .../20a84d88-05c2-4e02-8c84-2afa84cc659f.json | 132 - .../84eedce3-3a93-4630-b914-aa281fd2efda.json | 132 - .../b3b7b62f-ac82-4ef9-9634-afb81645ec19.json | 132 - .../283c5166-b9c5-4d20-9653-0cd0346d87c1.json | 132 - .../478b54cd-6410-41e5-8a53-4e46bcd9d7af.json | 132 - .../de2ae7a9-93eb-4149-b3ff-b5b7dfba29c4.json | 132 - .../ef5aa9db-804b-4a53-9c22-9c99f6c69eeb.json | 132 - .../553fd36d-08dd-46a3-ab04-77b9039e7921.json | 132 - .../e2bae853-cc0f-456a-a635-98d5f87ac47c.json | 132 - .../d6c5f196-c97b-4a0a-81b0-59143ec4b10e.json | 132 - .../5d92e02f-b590-4b6b-8c64-30690f79e916.json | 132 - .../e10f38df-b5d5-47c6-924f-563c6f8a6616.json | 132 - .../27257dc9-750c-4673-8865-986434bc5c0e.json | 132 - .../e599f3f8-e5eb-4bfe-a102-efc5a967434d.json | 132 - .../8e56f2dd-49d0-4eff-beea-53d01cd96f0e.json | 132 - .../f1a2b5d0-2c8a-4bbc-8bc5-0484485c2dad.json | 132 - .../2c12ee67-0c77-4cb2-9e88-1c731ed55c3f.json | 132 - .../567f8f54-225f-4d9b-be06-f24091adc1e6.json | 132 - .../ebb59730-9522-4c45-8f42-c0d941fd728c.json | 132 - .../2c44fa8c-ebd3-4ea6-8578-61da38965c09.json | 132 - .../3ef26b8c-6bfb-457b-a160-a65c3cc8b0c6.json | 132 - .../0ab721ba-fbda-44ca-a349-1d3abfaabe62.json | 132 - .../2fea1128-4f0c-40d8-be87-72c42c0648fb.json | 132 - .../db9dc9d2-4aa2-43d0-9f2e-15fbd05af62c.json | 132 - .../28399fd0-840c-49d3-8179-407ed83d3bfc.json | 132 - .../d7108c13-e14a-4366-9a39-204f853b1bee.json | 132 - .../56152d05-9273-4701-8c0a-723e2cab618d.json | 132 - .../55d2f23d-cb6c-42d2-8b57-837451d3c6df.json | 132 - .../7479ae87-e795-4e20-848a-291614176def.json | 132 - .../04ceb40e-bde8-487b-9d29-dc8f681af9be.json | 132 - .../e26b00b0-d9df-4ce2-a649-b19f8957b8ce.json | 132 - 
.../9954194c-69b5-4eb4-8b32-859845548cb0.json | 132 - .../2afbc279-242a-4276-85f0-facd29c2d89b.json | 132 - .../ba307ad4-3647-4785-9bf1-cd4dacf3c71f.json | 132 - .../d03c73ca-7364-4517-aea4-f0ac564c49df.json | 132 - .../1dd4b82a-ca80-4c9c-8800-f97ab2b9cbe7.json | 132 - .../f2363099-c39a-4874-bf77-ccc0fa087680.json | 132 - .../596eeee8-3600-4f8a-8888-978b610eb2ca.json | 132 - .../595ddba1-c450-4b69-85b7-0e3118c8c6c7.json | 132 - .../64890314-bba0-4fb2-8c21-38b413cff4c8.json | 132 - .../470b8b0d-fbaf-408c-a28e-57d1b294f8a8.json | 132 - .../00a1579e-8636-4eca-9a63-c0b067a5f3dc.json | 132 - .../a52cc4c9-6d60-4083-ac77-591e247d86c9.json | 132 - .../ac5c321a-d35a-4e0f-a1be-bcc0b7109f91.json | 132 - .../c4d11b01-ae5b-4198-b102-07160f100a41.json | 132 - .../19405ead-2263-4613-8053-43beeafb4bfc.json | 132 - .../6c698a60-a813-4be7-b55f-b684029b492d.json | 132 - .../b67c4a44-7787-45e2-b88c-5d7e8e496fa3.json | 132 - .../a20a529e-c52e-41b7-a8ee-909167048bfb.json | 132 - .../2735e6f4-839f-4ab1-8ede-3447891b1b26.json | 132 - .../e74e7e7f-8550-4cba-97cd-2626c82d6b29.json | 132 - .../14f4c00d-8915-413d-8e85-79f395127682.json | 132 - .../9119b586-d3b2-4ce0-a243-d584e2087184.json | 132 - .../629f3f1a-f8ee-4d1b-b604-7bbd35c6517b.json | 132 - .../a6ac828c-904b-413a-a5fa-a5ed06a28143.json | 132 - .../251a3ef9-c7ae-4d79-8a60-4bc021a3f001.json | 132 - .../962b48a3-23d7-4104-b34d-4e5c2af31d58.json | 132 - .../e4b0be31-6f9a-4a57-b433-e561da9bd827.json | 132 - .../9a31f208-b7d8-4baa-b96e-99926ecb35af.json | 132 - .../8d933df1-60cb-471d-bfc3-b11c93150203.json | 132 - .../35315c3a-ec06-433a-b3fa-ae7a4a59b7ea.json | 132 - .../3530db9a-0d61-4cf8-9fff-b15f6488c845.json | 132 - .../7d9901e0-eafe-4d49-a5bb-fab059708bcb.json | 132 - .../ee7f9025-bb2c-4902-b8e2-bfac2b63d2fd.json | 132 - .../6157f79e-2673-4ad6-99d7-e5cf5e4e1db2.json | 132 - .../0aa7572c-1aa6-4997-a2a2-3b557fbde639.json | 132 - .../6f5df760-2d3e-47b1-b55e-4031a5f11d41.json | 132 - .../ac676b03-c3ce-4ff1-83fc-5c8db82f1497.json | 132 - .../2229cdf8-3ecb-4f11-8824-9c3bfbf6f968.json | 132 - .../95ebc5b8-a541-4fca-9e7c-692720e73362.json | 132 - .../09a2508d-a171-493f-9ff2-e7f375815c91.json | 132 - .../12a4a921-5859-4fd6-9d64-677a7d8ef696.json | 132 - .../b79f12d0-cdfc-4c9d-a88b-40612dcbf64d.json | 132 - .../d162cf7c-3ef4-420f-aab4-789a98b1195a.json | 132 - .../7e49018e-5e2d-4cdb-be5b-2ac04ec84bf5.json | 132 - .../24677f2a-ea89-4289-bcb6-13699de9782f.json | 132 - .../3e09df3c-2224-4a29-8e55-18a485db2b25.json | 132 - .../cc0bd236-8fc4-43d3-a18f-4b2afb112946.json | 132 - .../5afd4c0f-b61d-452f-8c48-d298780d91d5.json | 132 - .../eac52141-4fd8-4e21-9c78-920ab8933e5a.json | 132 - .../8449837f-64ac-4293-b1f8-210e62779202.json | 132 - .../ab8a665c-8234-484f-a8a9-8ee79d73edff.json | 132 - .../a954242f-41a6-49d7-a71d-3bfe940cdb92.json | 132 - .../6d1c518f-3f42-49eb-9208-b30e27e7e87e.json | 132 - .../87931db7-42a4-48df-b5a5-8bd934061dbe.json | 132 - .../54088dbc-04cc-4b35-b4e1-e495b7cfd47f.json | 132 - .../7129efad-8ab2-4f7a-b6ed-055989b3e131.json | 132 - .../cfc6f85f-e4b6-4164-b7eb-4efb888e1ba5.json | 132 - .../0f053a45-cd79-4e51-9b4c-ae5c51006c17.json | 132 - .../d8002b35-1454-4635-a31e-b419c7000b53.json | 132 - .../4c08530e-d529-49a1-a3fe-2351c422981a.json | 132 - .../d16879dc-7ed7-49c4-aca6-4c9cd3b3a350.json | 132 - .../70656b13-e0a2-4ef4-af43-0d9995d57af6.json | 132 - .../6544f1ca-02a6-4e58-98f0-e19cc6082682.json | 132 - .../5cd3796f-fb31-49c1-a974-019c5c5b20ae.json | 132 - .../49eff9ad-90c9-43b1-a1f5-cf371ac4b39b.json | 132 - 
.../59720f7e-7e09-483f-8332-8dc7aa19ae78.json | 132 - .../a3a89e4a-0589-4776-a1da-227552482e94.json | 132 - .../b3c04d1f-80e3-4d86-9779-c5e4bbce6f35.json | 132 - .../448fda35-bfdc-42ae-90f9-d44383e0a454.json | 132 - .../0d97542e-82b6-4f27-9822-62b67e7690c2.json | 132 - .../2725bd69-839d-4427-8e05-0e289fff70de.json | 132 - .../adb71488-adb8-4848-bf1d-aecd04cb6718.json | 132 - .../c7736577-c4c3-4233-9308-a4bb9b2dbb89.json | 132 - .../76fe52f4-9fa5-4ccb-8c92-7bd9eb9886ee.json | 132 - .../1d92e45f-c5a5-4dd6-a61f-8e0f7246117a.json | 132 - .../5e1513f1-4375-4380-85fa-b96a419c013b.json | 132 - .../fadbf3b2-283a-4f8e-9acf-463d75924b97.json | 132 - .../c04ffe5b-c313-4249-83bb-bbe07ad6fc69.json | 132 - .../a9aa164e-386b-4987-9f49-2dde64ade45c.json | 132 - .../e4c1b3ef-e1db-4eca-b818-f3b1680cc5f0.json | 132 - .../1ab95edc-ea3c-4d3f-9f59-dc7f7468adb9.json | 132 - .../80a81bbc-6edf-48b9-afb7-e4e0a03753d8.json | 132 - .../afb24bf8-3c47-4278-9b84-19b05017745b.json | 132 - .../4f8cda4d-959b-41ab-a79d-d2b35968eb89.json | 132 - .../2818aa8c-5c73-4de9-bcbe-fd8f68e8bc6b.json | 132 - .../6a683ead-0f3e-449b-9ae1-8afc9f1ab33d.json | 132 - .../38cb02a8-862d-40e1-922a-e65f537df87e.json | 132 - .../f816e2a7-2629-4abe-9ed0-3d1299e95194.json | 132 - .../286fae5b-544a-4033-9092-d633fc80f47b.json | 132 - .../93477bf6-ea00-418b-8a2f-975a9554263e.json | 132 - .../3d7c6576-f99c-4bb3-94fa-4f713e2898f6.json | 132 - .../d1e9a242-941f-4461-b75b-7043c2c01ef7.json | 132 - .../e39661af-ad93-41d7-8892-1230064f1a1c.json | 132 - .../595b61b2-5220-48f6-91a0-3aa0d37c63d8.json | 132 - .../3173263e-2a42-4e8d-956e-8175ef464e76.json | 132 - .../f77f8291-1573-4fb6-a984-1cc099c09621.json | 132 - .../c4681e14-513c-4e5e-af8c-88ca11849176.json | 132 - .../0c220edd-2563-4fec-99a4-ef8c210ca5ce.json | 132 - .../bd7ef5a7-aa75-4eb4-8860-aec63f8bf9d1.json | 132 - .../85c20522-03c0-4dac-a1c8-2945e4bf0e0e.json | 132 - .../f180fddd-077f-43f9-b2d9-38c5f33be44d.json | 132 - .../ef384329-8406-4767-ac1a-3eba3131f726.json | 132 - .../2ddeae27-77d3-413c-a6e1-9de0f3980c4e.json | 132 - .../38b2dbbe-be86-4ef0-a39b-89841f662141.json | 132 - .../999a8091-22bd-4c08-bee1-772202e7edde.json | 132 - .../fda91d98-d259-430c-929b-78852cab64ec.json | 132 - .../535bfa4f-ab63-4832-9f17-7b245ff2b2af.json | 132 - .../681a6cc5-5519-4b13-8b50-93adcab4a3f7.json | 132 - .../141dd12c-6901-4a96-a051-f35647ddcc73.json | 132 - .../5b095779-aacc-41f3-9a3f-83f64a1c0d4c.json | 132 - .../7a88c95a-b253-4f36-8fde-1b0158bbf0b6.json | 132 - .../7938a00e-4e11-4223-a900-fa53df168ab7.json | 132 - .../8f966b4e-1baf-445f-9f10-4ba6b47aaf9b.json | 132 - .../a334d998-21a5-4108-96e3-9935507a9f8f.json | 132 - .../941e27c6-81da-4ce1-b1c8-544c1426cd11.json | 132 - .../e409a374-685b-482d-82e4-2436dca37309.json | 132 - .../84713625-97b6-4fad-982d-41b5c500d73a.json | 132 - .../b7edd9ab-a018-4b2f-9b01-b56cbe98abda.json | 132 - .../ec896115-21ef-4337-9fdd-32a04c574a05.json | 132 - .../d8e5f49b-7bf3-41d4-a91e-c566219609f6.json | 132 - .../ce1a92a3-6bec-410f-ab42-c567c5d23856.json | 132 - .../0a125470-b50f-4ca0-90dc-1f6b69c3ccd4.json | 132 - .../aeee0165-ac7e-4da6-8102-ba60f43587de.json | 132 - .../b47b8666-2556-45df-ba5b-9a5e94186784.json | 132 - .../0bde5d57-39be-4497-a2a8-d08d3c8d65f4.json | 132 - .../86599961-3ec2-4837-89a4-809f1dd7226c.json | 132 - .../dc3ca25e-41b2-4206-afaa-7d2d10fd27a7.json | 132 - .../cd77d407-3be3-4b84-8a73-34a15744de93.json | 132 - .../1cd20db5-0225-4724-b1f9-7c32eae456e1.json | 132 - .../dfc45dc3-51e6-454b-aee9-ea6b0714f0ca.json | 132 - 
.../3da2a408-672c-47b8-be32-61f56a15e9f3.json | 132 - .../94700c3c-f18d-4f96-a794-65bcf483fca9.json | 132 - .../6f3481d4-076f-45bd-8564-d485109c7a63.json | 132 - .../9f5ca3b2-747a-4fd0-b382-bf7ef503ba25.json | 132 - .../f1932041-263a-4841-9c8b-c6cc9fa50c21.json | 132 - .../691bef38-bc9e-4f8d-b774-9d7c62eec72b.json | 132 - .../5795f693-9ebc-47c6-9d2c-185dd0d32044.json | 132 - .../eb83f474-0d3d-488c-bc0f-93e5d1dfb2f3.json | 132 - .../f93b2053-11c4-4868-860f-90fbfe8288fc.json | 132 - .../8984fe95-9fd3-48ff-aa5f-18df63ecd6bb.json | 132 - .../a0f6f5de-578c-4290-85b5-c51aed985074.json | 132 - .../8ccc76ff-25c9-4706-b6a8-31b49f8be813.json | 132 - .../924f8b31-506d-4df2-8a7b-d0cd66d55f6d.json | 132 - .../8e7dfd9f-350d-406c-811d-453f1744dd53.json | 132 - .../b713d1d2-351f-43a1-b77d-27723e1d4267.json | 132 - .../322a9442-174f-4223-b839-6f8f9664d5e5.json | 132 - .../b12e71d1-c435-4172-a28f-38e26791dadb.json | 132 - .../ad33b0e8-39c8-4118-81bd-bc86b482f122.json | 132 - .../db8a7864-293b-45e9-995b-5301071c902d.json | 132 - .../31e3beea-28dc-4b47-a5e9-5fafc89226db.json | 132 - .../49315a95-394f-4508-8e6c-7c1d5547c257.json | 132 - .../375d3a94-97af-47ef-82af-afd7581663d4.json | 132 - .../77cfe896-4aa1-4bcd-a39a-f437c3f7e738.json | 132 - .../3d69ec7d-9999-4e16-8dc9-99fad35e156e.json | 132 - .../d2a7459b-8a12-4529-b978-c7237979f16b.json | 132 - .../e7a228ad-69de-471a-9f31-6bdc7221999c.json | 132 - .../9196ae39-adb0-4d53-8399-0ccd4d628065.json | 132 - .../ea318f99-a1ab-41ed-ae5d-39c62ac40e1b.json | 132 - .../05f69fd6-a77e-478d-ad86-3e83e615e892.json | 132 - .../5b8e9508-befb-4674-bd84-9c722a0864ce.json | 132 - .../8beb3730-23e8-4b89-933d-2d3f1a1d1365.json | 132 - .../07417712-1933-4920-8964-67ba74bf6d01.json | 132 - .../ae4cc05d-a65a-4f18-a99c-f133603686d1.json | 132 - .../54df4d3e-0ef0-4e30-aa46-b47a4589a34c.json | 132 - .../a717d466-9157-4991-8459-f39847d914a2.json | 132 - .../15a8789b-27de-49d1-b3e5-9b1fc9b5694e.json | 132 - .../921562fe-cc21-4ff3-93de-a62e1d4bf7e7.json | 132 - .../863969d9-e567-43cc-a0a9-7f80eaba374a.json | 132 - .../2987fa45-363e-4a07-8e9f-db01586a135b.json | 132 - .../3488de21-d9a6-49e8-ba8f-d9beee9bdabe.json | 132 - .../0cacf042-6b62-4b67-8821-97cd703788d0.json | 132 - .../9f0dfceb-1332-447a-bf6f-6c6c40686a6f.json | 132 - .../c1308f95-6d55-4ff6-b14e-1bd09b467d99.json | 132 - .../4ab16120-8d39-4dea-aa76-5c249506848d.json | 132 - .../f9647ea0-6464-4aa0-b1ea-a994a7bcca3c.json | 132 - .../c5ef47ab-2e73-43d6-b9ea-1ee7e50d9df8.json | 132 - .../9ef7a4a0-b751-45ff-ab1f-d50687a3f4c3.json | 132 - .../8b303795-557b-4fa1-bbc6-d36bd77ee739.json | 132 - .../7fec288e-0b0d-45c0-b0e6-17b905cd7ea3.json | 132 - .../5a09783b-82da-43ae-a607-2cfea550d931.json | 132 - .../6c2d191a-a2d1-459c-b2e2-5766bec62ce7.json | 132 - .../121cb5fc-2fa2-4718-b325-c40014802e40.json | 132 - .../8bbfa040-b16e-4116-ad3e-b3e4e58a7de6.json | 132 - .../c8891914-c9fb-4b4d-9592-826f04520e7b.json | 132 - .../e77ffcb3-c7d8-4700-b4ea-fe4e5ba94223.json | 132 - .../da237415-f34e-4cbb-9a94-3ff621f3df8d.json | 132 - .../479f3bfa-d614-46a9-88c7-9891852b0d8c.json | 132 - .../f5f0c7da-fb03-4023-81a7-801b0729a19d.json | 132 - .../40f51424-2922-498d-bbbc-d500667a8554.json | 132 - .../4f25d177-6bcf-4864-87a4-1beb21a7373d.json | 132 - .../b160ab1f-be6b-4dfa-8fa9-36fc65a64782.json | 132 - .../d497a7e3-11c2-4e0c-8788-091caabede56.json | 132 - .../4a55bcf2-e1c1-4fce-8f79-472dae869b26.json | 132 - .../5b00dd5e-0ad3-4ea0-aa0d-2327d610e6a6.json | 132 - .../1c80d383-1ccb-4f32-a63d-dd3954fe5f6b.json | 132 - 
.../75065074-7ef6-41ac-be7c-496cc458640a.json | 132 - .../49a0287b-48d7-44db-bf20-a084919d332f.json | 132 - .../7b2861ee-58f9-4ac9-99ee-2ec663e1b157.json | 132 - .../628542f9-fac6-42a7-8ec5-5cd93f977a7e.json | 132 - .../5b0924ae-cf52-4245-a687-91e4b1742c16.json | 132 - .../459c2b98-c3af-4334-a4bc-13334efe49b8.json | 132 - .../b2780aa3-d299-4180-8441-dd54e94255cb.json | 132 - .../f55d398d-0555-4e89-a37c-def04741a0dd.json | 132 - .../63caf8f8-9e55-4ef6-ae76-ee7184a50675.json | 132 - .../f82ccde3-bd3b-499c-8b8c-182822392cea.json | 132 - .../8a52fb4a-d6ae-4c8d-aed0-2137e0a83ea1.json | 132 - .../b7cbc2fb-2c52-4c13-9266-52103421f2ee.json | 132 - .../f4474361-e897-4dbb-a89e-5451a4724474.json | 132 - .../de257b5e-4629-4f8a-b08d-d2ca372593e2.json | 132 - .../a37aada3-104a-488a-898f-245ff257de46.json | 132 - .../d9d655d1-d94c-483a-a3a2-ca196e1391d1.json | 132 - .../77bf7126-0cb9-43ef-8d23-5f1395f91642.json | 132 - .../73f410be-3084-4994-8406-f8ac70880626.json | 132 - .../24caad7a-15fa-4820-91cc-0f544a34d173.json | 132 - .../e087b221-f813-4688-8d98-17980f98ac5b.json | 132 - .../f4d03bff-3b34-497f-a17f-0379bc562f11.json | 132 - .../2ca21612-ea90-41f3-b618-3ea81c09c3ae.json | 132 - .../d4dc2088-9911-4966-afe9-022df89dd522.json | 132 - .../ad03a075-8f24-46f6-ae04-5a04eb7061c1.json | 132 - .../2d1da226-e65c-48a0-aabb-46b1cf670a82.json | 132 - .../7fb3a035-2b83-4a58-818f-16fe6d9a8ab3.json | 132 - .../87018726-9f81-47b1-883e-609afea7fb37.json | 132 - .../292b9333-96c7-4fc7-bf35-78bbce9f10d3.json | 132 - .../b44224c3-ed2c-4120-9e2a-e6286358a4da.json | 132 - .../f7a2c9af-c55c-4307-bfef-1ca709525d82.json | 132 - .../d9655f35-edfd-4c53-b359-559870e8019e.json | 132 - .../afdd962d-652a-4395-92f7-c16dc874a779.json | 132 - .../2594e917-3ebd-428b-8f36-cb0da668695d.json | 132 - .../91a86644-ad96-4c66-8691-1c0b531b572c.json | 132 - .../331f56ce-5e45-46d8-9143-3f66be20b699.json | 132 - .../6138ebe0-8483-4cfb-8d95-b334bb09e831.json | 132 - .../4d16dd47-42d1-4ea6-8f1b-dc50648bceab.json | 132 - .../a6b0f2bf-08da-472f-b858-8be967a44cdc.json | 132 - .../57c7553d-f3e5-4a31-8c16-66aae570d8ec.json | 132 - .../58c31bdd-f86f-4fbb-8549-191bb9f46f02.json | 132 - .../dd25c1dd-0edf-44ca-b18c-633dbd47368f.json | 132 - .../2a030613-b5f7-4393-ac39-d2d072c913dc.json | 132 - .../f8c73290-c400-4f1f-a00a-516592497b0d.json | 132 - .../b31908fc-5e7e-45d6-835f-4e86a05b23fb.json | 132 - .../4320cb98-7f9f-4510-bb88-448ce231bae8.json | 132 - .../28b986d1-2e67-4462-9165-6cb8f260b6c6.json | 132 - .../fe1e21cb-7934-4022-a74a-777172310021.json | 132 - .../90871638-b828-484d-8822-95ffceb20909.json | 132 - .../04a98dfb-8e96-444c-8df4-ed7cf72a26ea.json | 132 - .../8c5c22af-f230-4d34-b80d-f42ef27e1675.json | 132 - .../f3466a90-541b-4a08-a9c6-d5a79b2299b0.json | 132 - .../ef9ee5ae-d92b-4143-af1b-d62a7c3c7fd4.json | 132 - .../859af708-ac37-4749-bc06-73d92338d1f5.json | 132 - .../e274380d-e0f7-47c3-afc3-e603e6cecf9e.json | 132 - .../19810be8-ea81-4db5-9854-1830b05a5732.json | 132 - .../1258c282-3672-4b42-9d4d-117568e17bf5.json | 132 - .../9b9f6e01-238e-4893-b398-4e1c83c44dfa.json | 132 - .../b267621b-dbba-4c4a-bb9f-fa85734d0f59.json | 132 - .../a7e4e787-8e95-48a0-9d50-53ba9f05cd1c.json | 132 - .../3d39dcab-55df-4ad3-bdc8-03ae684e4390.json | 132 - .../1b499881-9edb-4626-a919-977393d6bef1.json | 132 - .../84b8970c-6c29-4ee1-93b8-c97e4a7c4950.json | 132 - .../2e070663-2622-4a8e-bd39-7f0ef9df399e.json | 132 - .../047fa91e-2dc7-4881-8254-3dfbd4a2ff1b.json | 132 - .../6d73016e-078e-4ffe-b2ae-5b829d1456df.json | 132 - 
.../0b68b5bd-d22c-4194-9ddf-f22e9181f84d.json | 132 - .../03d51d90-fd15-42b7-ad5f-c7326cc642a7.json | 132 - .../d3e5c939-c53a-49d6-80cd-34420dbb176a.json | 132 - .../ab321358-26f9-4577-a5fb-1f5d4b8784b4.json | 132 - .../a43aae68-f12c-4a6d-b846-c498cf35f6cd.json | 132 - .../b84615c0-43c4-49ec-83fe-5d3f8e6026af.json | 132 - .../7e687d24-9e12-4ecf-b283-e222efb9473a.json | 132 - .../4aea143c-28fd-48bb-b911-37ac3fe58220.json | 132 - .../34a8daec-bfff-4cf4-9011-0542b30c1d10.json | 132 - .../3e919d7b-53db-41fb-ac93-224e2768b9c6.json | 132 - .../66becca1-d92b-409f-ab56-44d05cac66fd.json | 132 - .../6293b269-7c4c-44da-bd85-e51954c173a1.json | 132 - .../add3b058-e7bc-4b7b-bb98-0d7039979072.json | 132 - .../db0b6b3f-e5a9-4367-ab87-e58d5c6ccd81.json | 132 - .../54b055d0-80ae-4bba-b729-bd77b3ec7502.json | 132 - .../5c22d0b3-5082-4c6e-865c-71da03cf9378.json | 132 - .../f8e5ee9f-519d-4ed8-bd2a-88897075f401.json | 132 - .../b74c3215-7bd5-42d1-9193-f4c9c6a8bec2.json | 132 - .../27df1e06-463b-4519-87eb-a1666ad3f98c.json | 132 - .../9d975b05-7bee-462d-a33a-afa0d5af94d4.json | 132 - .../9ef9135a-473e-43a5-a460-fd3ec50226f9.json | 132 - .../c57cae01-328e-447b-8945-e3cd2c4b8a7b.json | 132 - .../494c86cf-7f37-49d8-8160-b81859552c87.json | 132 - .../6de5e76e-4297-4bcd-b06e-f63fa28da0e0.json | 132 - .../9b10cd14-82f3-4b36-a4be-5092127d68c3.json | 132 - .../bbd94181-0523-4543-80a7-056b041e03b7.json | 132 - .../e10d8573-e201-460e-a931-49a1b13ceeea.json | 132 - .../e2ca9477-2414-4b8a-8d22-68f9ced54ae5.json | 132 - .../831246b8-5433-48e6-ba11-8a4239373106.json | 132 - .../8277994c-8bf5-4ece-9f34-4fe9a4310bbf.json | 132 - .../5aabc7c5-eb3a-42e0-8b40-0a08004f6e1a.json | 132 - .../cbb73c83-ad94-4973-9bf5-a5e7ca4d1653.json | 132 - .../3ed06a16-d5fe-43d3-a369-f4ed29fb3a5d.json | 132 - .../fc817789-2f44-4d2b-b40e-2422fe33d104.json | 132 - .../5e1c8723-7c43-4d8f-8c7c-386c2eb6b9cf.json | 132 - .../b6740747-19ac-4a9c-892f-6556013ddc8b.json | 132 - .../3263ab46-09ae-4c24-9332-b6874d0d0330.json | 132 - .../a8706a7e-5693-4768-a955-a448549d2e77.json | 132 - .../3c932329-0440-4799-886f-10bc4a5aeb09.json | 132 - .../b1e42d9d-827d-4109-8d1b-182694033b21.json | 132 - .../0c6f0d92-3ee0-48d7-b3fc-70149911a51d.json | 132 - .../73b07681-8e10-414e-8922-650908f9cf6a.json | 132 - .../8b1549f8-0602-4538-842c-abe9dca7baff.json | 132 - .../ad395ad4-0f9f-4b49-83c9-b89fa6b6dd89.json | 132 - .../14c01681-fbef-49c4-b737-a7baaa02d393.json | 132 - .../3ad495c0-da8e-4776-8d05-bc7dce1fe120.json | 132 - .../0762ca9e-f0d4-408e-9992-e91a10e0e65f.json | 132 - .../ec6c1d05-cea7-445c-bed3-9eee1e1ff03d.json | 132 - .../1fc39812-77fb-4d0c-b9fb-706e94c40afe.json | 132 - .../fdc3c502-53ad-4bf7-85ce-51eaed72754b.json | 132 - .../3f74c1c7-f349-4193-95cf-b0033112fea0.json | 132 - .../36a803da-83ab-4c49-8855-9344aaa7a68b.json | 132 - .../df986996-249e-49f9-b074-91e8dcdf62e2.json | 132 - .../90f007e9-e323-4a82-b276-ac1b928030ca.json | 132 - .../2b627f93-5cc7-4a5e-b682-d129396362e5.json | 132 - .../2fde07ac-d218-4cc6-947e-8ceb87eedbee.json | 132 - .../2a141bfe-4632-4058-a232-1f2c5540c41f.json | 132 - .../fa2d74a5-e8f6-4a1c-9310-a9b16c2e59d1.json | 132 - .../c7c0ceff-9273-4cc3-8f8e-bd93181590ba.json | 132 - .../c439478a-1734-4038-aa8b-bb2d12ec022d.json | 132 - .../4a36f73a-9495-4ea2-863c-220b8ca6bf99.json | 132 - .../faa9d3b9-343a-4a9e-82c5-6bc81bc87b9c.json | 132 - .../a55bf380-d567-4228-b30c-57e9df31e844.json | 132 - .../dfd92311-4f3d-4355-8ccf-a59f29914b8f.json | 132 - .../d98e190e-5b5f-46eb-b701-e32d2dbef3a0.json | 132 - 
.../32edb764-2a42-4efe-ac86-9eda81942b84.json | 132 - .../36855ebd-2030-4d5d-9c42-ca049244e694.json | 132 - .../9651a0a1-4004-42f3-ad8f-2aebb38ec967.json | 132 - .../a59e55dc-e2b5-43be-8469-49eee0e98d55.json | 132 - .../a956e306-f184-4dbc-ac7a-3793ae735801.json | 132 - .../c05cc6ce-12fd-491d-b41b-57cc14b6d34a.json | 132 - .../415875b7-fe10-47e7-aca0-029c2f51c067.json | 132 - .../c505ee64-3d3b-48e2-9c8a-f59609a758e9.json | 132 - .../00003185-c291-40c5-bba1-f87eae0afc08.json | 132 - .../328f61d7-677b-4a06-b464-0da42153f9ae.json | 132 - .../9cb5b8fd-062c-4161-9301-640980d21b9f.json | 132 - .../09284b75-a2f9-40ea-8135-7aa61c626fa2.json | 132 - .../e2502331-6ac3-43bc-8218-259b44333283.json | 132 - .../8dde454d-aa48-4ee1-b5c6-f3353087d492.json | 132 - .../662c8ed2-2407-4606-ac1e-ec7ade185d2d.json | 132 - .../332aef8c-7c62-463e-ba3c-07ae0205d457.json | 132 - .../cfdfcf21-e445-430e-a295-946cb8c3fce9.json | 132 - .../a5606b92-aa2d-44e3-a92c-47d0b38fef9c.json | 132 - .../465d473c-ef28-4725-8cac-02f2a031b22c.json | 132 - .../2c636544-8676-4eee-8bcd-d623be0275be.json | 132 - .../8b332fac-1cfa-498b-853a-52ec5492ddc7.json | 132 - .../2bf1b38b-e90b-4fa8-b19e-47d93ff9ab4e.json | 132 - .../69bb0243-75b2-4858-ba6b-5e70cfb516a7.json | 132 - .../4bb7e325-8741-4c09-81f6-9efdb30ef5a5.json | 132 - .../87878b74-22ce-4554-914c-03e486d13de3.json | 132 - .../5030f8d4-f216-4f78-84f1-dd03b0324bb0.json | 132 - .../c5e244fd-e85e-4fbb-9703-b8e733fb91bf.json | 132 - .../38261a01-62df-42b2-9b1d-f924598e70ef.json | 132 - .../5736f0b5-3903-4774-a84a-c3db260d36e4.json | 132 - .../70134d58-972e-49c9-8cde-4ba2691d3dc3.json | 132 - .../d4bb1440-2064-4752-bcb3-c9cec234fd1b.json | 132 - .../d9e6059e-d20b-4465-b7ba-2ee3a72562b6.json | 132 - .../f8b02d65-c8a0-43eb-b48e-d1e1f7f363d6.json | 132 - .../7bf23db0-877c-4700-95c8-e35dee5e57b4.json | 132 - .../07f8351e-c7c6-463f-9e91-ee1d3bb2b35c.json | 132 - .../8535ffae-f39d-46ed-89bb-a1656885db91.json | 132 - .../5e832121-9a67-44d9-973d-fffdb1b37975.json | 132 - .../92d3f67d-a026-49e3-a440-68c10fb358ae.json | 132 - .../9d0baaef-bd31-4a96-bb2a-e92b62b748d2.json | 132 - .../489e8e84-5e30-46fa-a421-f52308f051e7.json | 132 - .../a208f807-c930-4e81-8ebd-dcbb4db76442.json | 132 - .../4956539d-a255-4c56-877f-257e463fa3e4.json | 132 - .../3451eb65-020c-4e34-9128-7410e6b293cd.json | 132 - .../b5cd0061-e4dd-4049-a51e-b16490e69120.json | 132 - .../c4686af6-0b7b-4df3-9152-14a3ef087b7f.json | 132 - .../155885ca-11e7-4cd2-b26c-53e001e2a6f9.json | 132 - .../d9ca5411-def6-43b3-a522-595131d8e5e6.json | 132 - .../e54553ab-0897-4cb5-9213-5bb72758d2b5.json | 132 - .../eed48cdc-18db-4c03-84bf-d2d50e3328b0.json | 132 - .../d7952aef-37e2-4c15-a1a4-598690773bbb.json | 132 - .../5e1e1376-bb22-4fc9-a1d6-3f2fe7d302b9.json | 132 - .../cfdae559-f3f1-4a78-b4cc-fbfb8bb37b16.json | 132 - .../a12208ce-e9e1-4476-8054-0d565efad92c.json | 132 - .../f46e1eeb-8b8b-4d47-9510-445109b5518b.json | 132 - .../7dc4970f-ce35-4ffa-9052-2ab40abb1e55.json | 132 - .../823e886a-1431-4078-81a3-4b941983461d.json | 132 - .../583609f0-de5b-43cd-a667-bb2c36679fd2.json | 132 - .../2d2cea8b-167e-4d63-b01c-537f372672f9.json | 132 - .../f584f596-3a17-404a-81a2-3033ad38cad6.json | 132 - .../ebb0930f-92be-4e1b-a2a6-779f69d2151c.json | 132 - .../b8926567-e208-442e-8ba8-c6dd4ecc5c4a.json | 132 - .../4bf6efe1-81fc-48f6-96ba-8df9ffbef2f2.json | 132 - .../05ffcb7a-2694-4276-bf45-73e1110bc494.json | 132 - .../dc3b944b-a57a-44ab-87ac-8e1882b7bcce.json | 132 - .../154f70b4-d77c-4d1b-b85c-bc81fe8162bd.json | 132 - 
.../998316d2-389a-4ce0-b0b0-0430c1361de7.json | 132 - .../ce803cde-6e23-433c-a4d2-38c5cb5ba14b.json | 132 - .../2519485b-47cd-497c-a349-9e69db0266f3.json | 132 - .../56d86e26-4ee6-4652-9b7b-a538238a24d4.json | 132 - .../416b89e4-5e8a-4131-9403-e8967a4127b8.json | 132 - .../347a90e8-d8b7-4266-8242-ceac865796a0.json | 132 - .../389f7ab8-b30e-4d0c-b9a4-625e74a1f73f.json | 132 - .../6ae33b7f-53a1-45c5-8b0b-d462188c3f9d.json | 132 - .../d96fb0b2-7cba-4cc4-a5f4-b8a451754857.json | 132 - .../f8d362f6-eafc-4d11-bc40-d169d69d3a95.json | 132 - .../4bacd3dd-44c2-42d8-98c0-3eeb920dc0f0.json | 132 - .../de073f45-0d14-4f8a-9d3b-d4fd961186b8.json | 132 - .../fd88d234-b3f9-4f48-896c-af58f1a69880.json | 132 - .../273745b1-3761-463e-b9ab-7860968064eb.json | 132 - .../101d84d3-e741-4eb2-bd8a-db6c12022fe2.json | 132 - .../9c82deca-1998-4506-b038-c5dd592324d8.json | 132 - .../da620a94-4c0d-4c50-9619-10e12001fb5d.json | 132 - .../51dade8f-34e7-4237-8691-22655249bf76.json | 132 - .../cdd59385-0a54-4ca1-b24d-9316a70f2875.json | 132 - .../514a3103-e8a1-49e8-b9da-a85963f5b3dd.json | 132 - .../daafaafa-1e00-4433-95f3-91c169598ebd.json | 132 - .../50e53ad5-8693-44c1-b5c7-45b91d7e0ae4.json | 132 - .../bda5d02f-7973-41a3-8f8e-4e33a12b74e0.json | 132 - .../99ff5ca5-4409-4d9c-9ec0-4cf392afeff2.json | 132 - .../362f5875-4dbc-4e68-90ce-789f692bb533.json | 132 - .../fdb5faf6-2cdd-42bb-b154-d6e93b2348bf.json | 132 - .../93f829b8-b8d9-4389-a210-2a38c3a30edb.json | 132 - .../6ec3554d-377b-4bf6-88ef-8a4c9e70f485.json | 132 - .../70d749cf-2e92-4847-86de-7964fc8eb990.json | 132 - .../623f2b04-6cd7-4ea0-8844-badb0ff6c9c6.json | 132 - .../e1aca741-2765-4e47-b6a1-49f3d9532432.json | 132 - .../4f42366e-e6aa-4974-9a40-5781e350616d.json | 132 - .../4ec2231d-c012-4ad3-830c-8ff86c977202.json | 132 - .../1d2e5513-bd0c-4795-8487-f5266c6e368f.json | 132 - .../104172b7-86f5-410a-a454-63e1cfbeb87f.json | 132 - .../d28e04ac-7d18-43fb-80b8-82c0662fec79.json | 132 - .../20bb3819-9d85-4d84-99ba-65e33965f0c5.json | 132 - .../3a4bdf58-0137-4d85-b567-59b3fed3dad5.json | 132 - .../04f843ba-947c-4732-979c-2aeae7d34e5a.json | 132 - .../173a31d3-7d12-4ab1-a963-005a81aee767.json | 132 - .../d0555736-b614-43ca-91d7-8264e3566872.json | 132 - .../4b7b13b7-4aee-4462-87e6-aa6c15068236.json | 132 - .../4b1f9ce5-bb12-42e3-b0e0-afaa784b0c4c.json | 132 - .../acbcd5a5-bcd8-4209-b35f-425feada7e8b.json | 132 - .../cb9a415f-1a02-46ad-a731-bf825ddd78ae.json | 132 - .../92cde6db-47f4-43c6-9ad5-643c35faa226.json | 132 - .../5e88a037-f9bd-4b39-944f-f0781bb7884f.json | 132 - .../d4b08f5d-5add-49f4-b8db-c1a12e0a5313.json | 132 - .../ac5adf39-f0a4-439b-9873-9141e0a554b1.json | 132 - .../62965c92-cdf4-4a3b-b035-990abaab615c.json | 132 - .../3866ece8-d70a-4061-9e86-0798ecd98bd6.json | 132 - .../ff484d0e-bb14-4a80-ae29-2351b03cf278.json | 132 - .../06ac1718-fe71-4e05-a47f-1200e067336c.json | 132 - .../4ddb1616-7889-45ef-96de-823fee338e1d.json | 132 - .../487dd91b-5bc4-4355-90d3-c82ecc789ab3.json | 132 - .../a74e86d9-8b94-4f60-8f0c-73cc4b04d905.json | 132 - .../9a9239ab-9e0e-449b-bd1b-6ec280fad505.json | 132 - .../2c710cd5-75a6-46b7-8356-212da7bf864d.json | 132 - .../377d5240-73b5-48d0-bbdc-0960ad1d9069.json | 132 - .../9f31a6da-c5bd-4143-b2f9-715c0e9f7b74.json | 132 - .../104a0157-c614-44cf-b6cc-9f15dab4b187.json | 132 - .../bb379093-c169-44bd-ac86-edb8ab8fc225.json | 132 - .../e29001c0-17c0-4deb-8ca2-ce9ad06d8cb3.json | 132 - .../43d87bf5-2620-4f8e-a8b6-f86fc157d987.json | 132 - .../735d9d75-d9d1-4553-b7cf-f8e7c2e65218.json | 132 - 
.../0c6dcc87-343c-4973-a589-3e3393829184.json | 132 - .../7c1d1657-e9ae-433f-be9d-523431bfc7ae.json | 132 - .../0b2d9a65-c028-4f4b-a280-dc0c35ac9516.json | 132 - .../e87e1d3f-1476-499d-a9f3-b6463b429262.json | 132 - .../246e8450-3c53-4bde-99bb-5663f751e88e.json | 132 - .../496b9e45-2f64-456e-b35e-12a94c5643b1.json | 132 - .../05890047-a95a-433e-b6b6-fb037592cdd1.json | 132 - .../4a30580c-1d25-49d4-984d-2d28ef3a5656.json | 132 - .../696d7966-d140-4f43-91df-54f02247b34f.json | 132 - .../fdf10ab8-e3f9-49e6-8fd0-ed116868c217.json | 132 - .../9ac16d1f-d894-414d-8a14-110e971d0ba6.json | 132 - .../2eb01e0e-8f7b-4956-9a2d-b32ecaa936f6.json | 132 - .../3b221b0e-6158-471f-bcd2-b09514f28bd7.json | 132 - .../c8af8428-aab6-4d19-b185-2b437c0334fa.json | 132 - .../c617d12b-c37f-47ef-9704-e19774c67aeb.json | 132 - .../577f31e2-1808-45e2-a528-5933019cfa85.json | 132 - .../7bd7f5c8-be9e-473e-be18-03ad22a195ee.json | 132 - .../5036a549-5583-4775-935a-1a12b6de3e7d.json | 132 - .../5c0ffff9-542c-424e-88e9-89584e686e12.json | 132 - .../5c6a045d-2c90-4938-9185-9c1a0f82903a.json | 132 - .../02480176-2058-4e71-a970-9698be8d235e.json | 132 - .../4be1e5b4-254c-4287-907d-cc845042de37.json | 132 - .../21b51852-5cad-414e-92d5-31878f025d67.json | 132 - .../9eb07d4a-1f01-4696-9137-d477ffca43be.json | 132 - .../4236485b-aa92-4bc4-a652-17ed3231ecf4.json | 132 - .../9c0d6b71-8c6a-4294-961c-972a002b847f.json | 132 - .../d1e906d5-8f0d-49c2-88c3-cf71774de600.json | 132 - .../798e4f83-6262-4d5b-a854-6ff114167209.json | 132 - .../dd2603d5-e99e-4778-95d0-159c788626cf.json | 132 - .../41c71990-e79d-447f-b082-63c96fd67a1f.json | 132 - .../b9e25948-2871-4b6c-933b-8a731e48e81b.json | 132 - .../7c70df74-2bc2-40e0-b0f4-77be1a7e044c.json | 132 - .../ea71bdd5-3aa1-4d26-9256-5aeb2f79fa8c.json | 132 - .../b0e9c0ca-cd56-42c8-96ed-477884bfd9f9.json | 132 - .../7395fcde-49dd-47f4-a8ea-463eda40f5e3.json | 132 - .../a130087f-566f-4405-b662-1102f1664c49.json | 132 - .../3be58cf3-4761-4459-9f3c-eabf812a3c19.json | 132 - .../dbdd71ad-db5b-4b4b-8856-68b55adbe127.json | 132 - .../da159a16-48a0-45e3-ad4d-bdc9e8b5288c.json | 132 - .../77d5f51e-5ad2-42a6-a32c-060cd844b949.json | 132 - .../724cc582-cc83-474b-9606-70dbc22f3581.json | 132 - .../8a1b2aae-d717-4b49-8ed2-a7ee2cee1940.json | 132 - .../0dfb062d-a6ec-42a6-a9f9-6f6424bbdf0c.json | 132 - .../ab2512fa-2335-4817-9a76-3259690bbc67.json | 132 - .../fe7f1442-b7db-42d5-bc83-b8afd1d0c802.json | 132 - .../0e14484a-69d7-423e-bf6c-33d0992f408c.json | 132 - .../881eaa2c-af5f-4e84-8807-d0835c10ebd2.json | 132 - .../ef8a7079-9d13-42b7-ab2d-b72df5ae5d95.json | 132 - .../db8d3fc4-58f4-4f07-8c27-c73a4a4719fb.json | 132 - .../0c44a429-e705-4794-b702-1a731e52df90.json | 132 - .../92b3d2c1-61f4-432a-82a7-43b4367f7ef0.json | 132 - .../5703e81d-055c-459b-8202-80ec382a8d5b.json | 132 - .../f6260b6e-52a2-4142-93ba-5393807fa0d4.json | 132 - .../83b84506-4826-48de-a6fe-2af6ae5d425a.json | 132 - .../7483e260-9853-4d3f-aa10-187796d96de9.json | 132 - .../f9925806-4252-44e8-b67e-917737572bd4.json | 132 - .../70470e6c-8d66-4249-b762-a5a2e3589a53.json | 132 - .../d3abfe3c-ebfe-4dfd-b0db-93c14d32c585.json | 132 - .../a35b06bc-d759-421a-94cf-f408a98e9273.json | 132 - .../bbac659c-7cf8-41d4-98d4-ded4c471bd98.json | 132 - .../0c73f3a0-0a92-4b1c-abfa-6eb77138dacd.json | 132 - .../a7ab6f16-717f-4567-8057-a4a18e1a1e77.json | 132 - .../2abe2c9d-032d-469e-852b-114eca5e84f8.json | 132 - .../2e8a83dc-c760-4f42-a361-e02cf3a65427.json | 132 - .../743dfe64-e7cd-493e-817d-8d5fcdc2ea24.json | 132 - 
.../4e37c90b-65a8-4b71-bfc2-d63541fb8962.json | 132 - .../2e34d74e-1b69-4daf-8bee-77e5357fd439.json | 132 - .../0646e2f7-d2e6-42d3-8f09-f8daee302709.json | 132 - .../c66b1ff8-9c04-4f9c-b83e-088f31f79590.json | 132 - .../1bd2affc-9970-4149-b52b-51549b1f0029.json | 132 - .../f0479d74-4684-4b41-a63b-16d7fe0e3290.json | 132 - .../95deb890-a15d-4c71-8151-ed45c3dfb87f.json | 132 - .../1c07fc4c-a773-4e03-bb14-7144e7815c01.json | 132 - .../e7e8388e-db3c-4881-b67c-5177c60562b9.json | 132 - .../c4923208-2a47-45f2-a74a-4483e4b99bee.json | 132 - .../b5f06a78-5b57-45a5-93be-4f3c1b36f208.json | 132 - .../835f19d3-515c-4bc4-ab96-5cb5bece45dc.json | 132 - .../7dd96382-6fc1-4a39-924b-d9034b5b0839.json | 132 - .../77a666a2-a9b2-43cc-8e64-67172f4ab6c8.json | 132 - .../e3eae267-46ab-4433-a8f3-2a2f8448299b.json | 132 - .../e31308c4-8eb2-4a72-8127-18049d58b814.json | 132 - .../c7098a7a-e865-4ecd-b511-abeb2c0872bd.json | 132 - .../b3a8c734-e63a-47f7-af2c-a3b6518802fa.json | 132 - .../35937965-2791-4f75-8954-5a2280381c91.json | 132 - .../4ab806fe-738d-4f5b-89e4-004134d2f7fe.json | 132 - .../a937e27e-b757-4de7-b679-01ac29d8bb22.json | 132 - .../1d906aab-33a6-4ffe-8a63-694482d83d09.json | 132 - .../9e101298-6482-4ae8-83e4-b948ba8fa550.json | 132 - .../3818710d-80a9-4e7d-90e3-f06afffb71ac.json | 132 - .../a18ec0c4-6f3f-4904-b69c-e40770df169e.json | 132 - .../529c2bd4-6b8e-4e3c-8737-c0b794444d13.json | 132 - .../9e994362-a1d1-48f7-9db1-dd9d532b9f35.json | 132 - .../cf35b7db-f675-4362-8916-36b0582b64f4.json | 132 - .../79ee7e34-36cd-4024-8978-86c1b059ae5f.json | 132 - .../9ec4fb99-ed4d-416e-9342-0c036aadd35d.json | 132 - .../8788e4fa-04c5-4f7c-bb4e-523287901f71.json | 132 - .../18097bf4-5149-40e9-9850-558c3f143ed8.json | 132 - .../b5942721-5c30-4c49-a6e1-fb5419539652.json | 132 - .../76d27de3-0309-4e4b-8d0d-0e402bde0a31.json | 132 - .../5c0553ff-4910-45a9-aa8d-3a76af098403.json | 132 - .../fd97d1d9-a1b5-429d-b73d-1ea92ae1d61c.json | 132 - .../f77aa103-5a09-409c-ad72-7992b6049f94.json | 132 - .../0afdaa1d-c1e7-4283-a2b3-f459c09df4a9.json | 132 - .../044ed79b-0c54-4a7a-94ba-a3f999adeb0d.json | 132 - .../ac6b884d-62ea-4ff5-8eee-cfce08869030.json | 132 - .../8ffa696e-adef-4808-ba0e-bb04921a433d.json | 132 - .../8a2cfa62-5f13-447e-8d0f-2503e4962ac5.json | 132 - .../4f24fc46-3686-41fa-bf25-a0e39b252cc9.json | 132 - .../b1375cb4-b0d5-4cb4-ad43-394ebd1a481f.json | 132 - .../4ce062da-acfc-4684-95c2-679cbe5a697b.json | 132 - .../3d785765-befa-4e53-8672-769f7bb87dcd.json | 132 - .../ab0d3a24-19db-4d00-892e-bcb7c0f2f30f.json | 132 - .../31f0b186-1805-42ff-86cf-d8455a66d538.json | 132 - .../ed6b3e7e-d294-420d-b9b9-460a52cd0239.json | 132 - .../91dec0c0-9854-4790-a0a5-e17d19636f17.json | 132 - .../599616fb-26c1-47e3-a98b-9ad922a95c08.json | 132 - .../aeee4365-c34d-46b9-8c98-29976010bb62.json | 132 - .../1ec68708-94c9-4561-bb99-7f211d7a9950.json | 132 - .../0b53e7b4-0e91-40a2-911b-cd0d415e9fad.json | 132 - .../91bcd646-fe3d-458b-a426-a6a8863d69a0.json | 132 - .../2e0458cc-e092-4770-bd80-00dff169d754.json | 132 - .../d56ef415-0edf-4fde-8277-ae44b4bb4ed2.json | 132 - .../a0a1beb8-ee9a-4e88-b939-6e0104ed76a7.json | 132 - .../f9b7c3ee-ea8b-42f0-a55a-6171d4e3d0ea.json | 132 - .../2c8c6c6a-ce95-4d11-a33a-d547859fee11.json | 132 - .../47858744-3378-4ed4-9101-8acbc3a53cda.json | 132 - .../2aaeaaa7-89ed-4666-b0a5-8c1320ec4ec5.json | 132 - .../23ae6a72-5a1f-4961-8662-feb4d8ad8a26.json | 132 - .../312ec315-6175-4f99-8741-97d97eb26b47.json | 132 - .../7869bbe3-fd17-4e6d-9546-94d3df5e83ef.json | 132 - 
.../68c9fb85-f90e-442f-aa96-458dabe30b39.json | 132 - .../6891d1dd-0e1a-42e8-9206-64a4c71854f9.json | 132 - .../c62eb6b3-2a3d-45bd-acdf-bad717e51766.json | 132 - .../55d4a6ae-44e5-4a1b-9509-299fbc6c3a36.json | 132 - .../227e3e19-29d6-414f-b538-9f6f89d47677.json | 132 - .../e922ac2c-e8d0-48f2-99fc-da70c925136c.json | 132 - .../59f93c1c-3712-4ee2-a3d2-999e5acc2ee5.json | 132 - .../a98dcf1e-6abb-402b-9e0c-da7c23b74bde.json | 132 - .../a889f561-0d8a-4345-9131-0a897ec215ac.json | 132 - .../6402facc-6258-43a4-a0fd-78e21765c504.json | 132 - .../29fbd2e0-e08a-48f4-905e-d2aa54886915.json | 132 - .../313e0379-d3ea-4f5a-8e06-4b0a94317487.json | 132 - .../f326fbd0-5f92-4324-a587-1f08cf7da208.json | 132 - .../d61310e9-5267-4a87-8e24-ae25172cd64e.json | 132 - .../60953e5e-523d-43c0-ad00-f746308030b1.json | 132 - .../5afd8861-d7cb-45cd-af1b-6db966cb56e0.json | 132 - .../c3972df1-4414-4c71-b473-fb9459cf085b.json | 132 - .../b89d54b7-2329-4608-b9f6-07017e63f1cd.json | 132 - .../50389350-af23-41ba-af46-5ffe338ff9d2.json | 132 - .../b8f8f045-2306-43ad-8fa0-6a8bdb494db6.json | 132 - .../7cd59011-75d7-4497-956c-322d5d609c5f.json | 132 - .../1313d865-9c5b-45d2-ad64-629c65f07f2c.json | 132 - .../0efc2583-bf21-4b60-96cc-716928768eb1.json | 132 - .../be0a2737-19a0-4401-998a-a03663467133.json | 132 - .../71720e07-2de0-4402-bdfd-102150c61765.json | 132 - .../38c84c69-5cdb-4f24-820d-4b39c5b118ff.json | 132 - .../de9d274d-f213-4037-9711-3e9d3dbbcc96.json | 132 - .../92381da4-b9d1-43c4-a5c9-59f375017e11.json | 132 - .../44ab6a50-027d-47df-a518-5aa944eb2a61.json | 132 - .../2a1947d7-74e0-43d0-931d-b2862348e90a.json | 132 - .../3677b71c-387d-4182-b15d-c3525bc7bc36.json | 132 - .../6b125a8e-5b53-48ca-8875-926249879f39.json | 132 - .../af851d4b-69d4-49a9-a160-a180146c3963.json | 132 - .../7aa6ce37-c0e4-48ce-b9db-f158ac47d366.json | 132 - .../1bce093e-27c0-41ad-aad6-b656f6773ed5.json | 132 - .../5c6cffab-ef72-4e12-808c-c26ee8ec6999.json | 132 - .../e288a874-f750-4a90-be07-616094c220cf.json | 132 - .../0607da8d-3f4e-468a-91a6-b975261a87c0.json | 132 - .../be2cc2fd-c8e7-4421-b8c8-d3b937272d0d.json | 132 - .../15ffe64e-72fd-4e65-8632-babf137a386d.json | 132 - .../ce1c0d4f-f5a3-49e7-ab77-65ff51bbd0ca.json | 132 - .../b5afab38-13ba-4abd-9d04-a433c41061c5.json | 132 - .../a862c2a5-f66b-4d09-ac57-6cbe565f9f35.json | 132 - .../d8254f6c-8110-44d3-800e-101fc731d779.json | 132 - .../ccbcd5a7-2b98-4d90-ace1-3ad5971a5f18.json | 132 - .../c208b19b-4ecf-4fad-b931-54f65d4b711b.json | 132 - .../debaf4a0-c734-47ea-bea0-2ddc65dc397d.json | 132 - .../0eeb5962-ccc0-407b-92e6-7cf17c00941f.json | 132 - .../4b60e863-482c-4f91-8cd1-6c993d3c5988.json | 132 - .../f5f0bc72-427d-4703-aab1-1bb1bea73895.json | 132 - .../aae7f543-7b5b-435f-a506-e3ab901a8c5a.json | 132 - .../6e6ff4c3-3cfd-4790-80c4-544d9cbe47e2.json | 132 - .../3ee76278-89d4-44fb-a449-717534b00161.json | 132 - .../fa2854d3-9e2f-4f79-ac8c-e1cb5a638745.json | 132 - .../9ddaa721-bf3a-416a-9be8-291188793cc9.json | 132 - .../d659077d-7261-4c69-862c-d61be21662a2.json | 132 - .../e87ba227-c55e-4666-949d-b45913f8336b.json | 132 - .../077f683a-af6f-4a71-b599-b9b269546b7c.json | 132 - .../54808b08-d10d-4a06-ab60-8d99039311b8.json | 132 - .../138e6fdb-7092-4ee6-be82-7bb86c1fc759.json | 132 - .../1b27423f-62cc-4189-a293-5af84ef1f2c8.json | 132 - .../f5468512-d2c7-4486-9d31-bef61225af52.json | 132 - .../0e0ec1a9-76aa-4d7e-9c0e-946d6b000a6a.json | 132 - .../07b87b98-0d61-4479-937f-7447565b4631.json | 132 - .../85b11b91-d686-49e9-8db0-971dd7cafb75.json | 132 - 
.../21bac032-a092-4afa-8d29-ebdefb3a0650.json | 132 - .../29e3a687-429f-4f33-ae5f-48db85127364.json | 132 - .../d98493a6-f237-4565-8508-9e4cc3188d2d.json | 132 - .../2def6fbd-7488-4e9f-a822-2405d4f7a315.json | 132 - .../819143d4-9538-48b9-b7af-128bc15c518a.json | 132 - .../c29d47af-a9de-4edb-acac-6763c0d44ca3.json | 132 - .../22bf3fb7-9235-4a57-b8fd-c85b12047b0e.json | 132 - .../2bea7014-460d-470b-918f-468b58d70fd6.json | 132 - .../3927a5dd-002b-441a-b769-ba68547cd5f3.json | 132 - .../476fc734-dedd-4192-aa59-eb2f9dabf16b.json | 132 - .../817e2fbe-0866-489f-b987-391228a68c53.json | 132 - .../f25f5eb1-ff22-4be3-a639-a9d25207078f.json | 132 - .../f71d1c31-184b-46be-a288-bdc92f0ebe09.json | 132 - .../0d9547b3-7bef-4815-9c44-7d714fe81bbb.json | 132 - .../22dbc5a2-0ff6-4566-9bfd-e5ce314be597.json | 132 - .../afedb249-f1a5-42d6-b6c0-54b2cc303f64.json | 132 - .../61b1bf5e-6aa4-4e90-af2c-dcf5fc9903f2.json | 132 - .../c0adc04c-1e02-4891-a5a1-1fab0ddf18ca.json | 132 - .../cc57e6f0-ab55-4ab9-983c-63d74632d016.json | 132 - .../0d3c5fdb-c4a5-4436-b9d4-f0f42cb4db96.json | 132 - .../a6ec2934-e9fd-481d-8f00-932603bc6e0a.json | 132 - .../e2553c93-60df-4126-9e64-ecd4a5003389.json | 132 - .../e7c2fb42-e82a-4dac-9cc3-a9f41ab54e0f.json | 132 - .../a807ee8c-509e-4b6d-a414-df24444d8a0a.json | 132 - .../2199024b-7944-4950-8335-32a536efad02.json | 132 - .../97919c86-6161-4548-95b9-d44263a29f8a.json | 132 - .../c40c1a46-2e30-4cf1-bcf3-a316a793fbcd.json | 132 - .../c1294268-b5f5-4d64-b91a-147f58a21a47.json | 132 - .../2b029e6d-a0b8-4b6c-b62d-144b8dc4f739.json | 132 - .../b926ca6c-60c9-4353-9671-0453b46d0222.json | 132 - .../44db30b4-2010-4f96-a39e-9ccc8568374f.json | 132 - .../2210d673-d417-46be-aeca-de48cd846e01.json | 132 - .../892d27cc-dfb3-40c7-ae0f-a7cd06784808.json | 132 - .../49b3f293-721d-4d44-9748-88d1ce275050.json | 132 - .../70fb41fe-46af-49e3-8270-5882e12f710f.json | 132 - .../13e2489f-9d96-4f68-8e22-c937604c2145.json | 132 - .../0c386ea0-4706-4a6f-994c-b6ee21dbce92.json | 132 - .../a8d5a193-6c87-4b5b-8ea3-b3ab78e73104.json | 132 - .../4018f4bd-492a-4814-9a7a-1f0c376f2d2e.json | 132 - .../568072cb-118d-41af-bfe8-fa14cb4c7348.json | 132 - .../a6d08766-8c36-41bf-8bbc-acdfdc3f8e23.json | 132 - .../2504fed5-c8a1-4ffc-8ce5-9559aa8c4325.json | 132 - .../359dde31-d9dc-4c22-b829-77df652dcc73.json | 132 - .../34a79823-b993-402a-89a7-538e126ee02a.json | 132 - .../f392c5c3-9bee-4111-9a22-6a1b706fd2ad.json | 132 - .../73bbdd22-4e5f-496b-b39f-290d8e0d2aa4.json | 132 - .../72a66eae-9c94-40e3-b3c9-211303e5cba8.json | 132 - .../ef7390b5-599b-4354-805b-9486e4ce34fa.json | 132 - .../57f964c3-0504-4b60-9539-ce0e369816ea.json | 132 - .../4e6c0336-5d94-4417-a194-92a4d6f38481.json | 132 - .../fe38dea8-92f4-4fb2-afdf-c5932d7c9e27.json | 132 - .../5ced7497-5a05-40d2-80cb-cae63ca62022.json | 132 - .../52a66aaa-193a-48ca-b693-4dcab811eaa3.json | 132 - .../e0e4bcef-cb73-436b-9353-b18ade293e8b.json | 132 - .../1ae45791-7e47-4083-bd72-4530fa26893c.json | 132 - .../b2731f04-a9bd-4e36-a545-85be5b66f5a7.json | 132 - .../ed6de552-d04b-4d51-8456-610e2cb41d85.json | 132 - .../3e08a589-d2b3-487b-900e-85725522a2e4.json | 132 - .../b2717503-d081-40ee-b1ed-fcadaf239049.json | 132 - .../9915eb01-5c45-42b6-82a3-ad782411642f.json | 132 - .../190eb7ca-46db-4e1d-8b71-9bb20af74ede.json | 132 - .../86b9077d-9ec3-411d-84c5-326ba97742c1.json | 132 - .../18bfa50c-20be-4027-8ee7-f6cd1411c882.json | 132 - .../eb1a099a-48c7-412b-b62f-143537c41f06.json | 132 - .../e530a4b7-c2f6-4bad-bab5-2895e950ed63.json | 132 - 
.../52ad7152-feea-46a6-b2d8-20e1a70514ce.json | 132 - .../a61162a6-ef3e-46f4-8aa2-241547fadea2.json | 132 - .../9f208aef-8544-47c8-bb1f-a3841aff208b.json | 132 - .../da237ab6-df39-460f-9efc-e1649e1ac202.json | 132 - .../c81b3193-9d01-4590-8b72-da97aa3c9dc4.json | 132 - .../1a9ffe50-69ae-48bc-b636-89431391eb37.json | 132 - .../b0c67359-1da0-4f55-aa1c-f54f88038bd7.json | 132 - .../c700798b-583a-41be-94dd-382669bb495f.json | 132 - .../3c0b9735-2ef1-4f27-b94a-f246eb57b73c.json | 132 - .../e8c9501b-c985-4b78-a902-a1a030c72e60.json | 132 - .../df978fce-3373-4073-8c44-d6a83df1d9d1.json | 132 - .../e46ee8d9-81af-4259-8fef-3d3113fb6168.json | 132 - .../aa6ab404-89ef-4336-b811-7c8064e26107.json | 132 - .../a14e6c79-4a78-4c02-a7ca-35e783f32be1.json | 132 - .../ba1fb85b-bbc0-46ac-95d7-e61b91f65c2b.json | 132 - .../f6312fc7-c7a8-45dc-a57c-91f56b4ca28a.json | 132 - .../335f5c32-f3f0-4a16-8c9d-8f07b2aae54a.json | 132 - .../b7c7a907-7ecc-4d5b-bc6f-8b8d82954b21.json | 132 - .../112f01a2-f0fb-4257-86bf-61c9a184eb92.json | 132 - .../2d9410d6-7162-4811-bf7d-9de2c2b48fd2.json | 132 - .../16ff8fa3-4676-473c-99ad-908ddb59d8ed.json | 132 - .../9b153ac9-f95b-419b-b7f9-beccd769ddad.json | 132 - .../8a5df3c2-eb71-4e12-b013-fb43685f2916.json | 132 - .../35fa3213-5c08-4b19-ae76-237fdd25444e.json | 132 - .../242ce55f-1471-435e-bcd7-d28b5fc87fc4.json | 132 - .../95f509f2-5e67-404a-968d-f7488d684e32.json | 132 - .../bcbcdfe9-0663-417c-9a29-60906e63db8f.json | 132 - .../d95a7493-2f99-4c10-8067-711c7388af7d.json | 132 - .../789848a0-6d8a-4583-93c3-a72df74d0071.json | 132 - .../14af87df-0fc5-46e1-9d0b-c25c8b6a7ce7.json | 132 - .../379f559f-9bfa-444f-b477-562c25b4c299.json | 132 - .../effb6a3d-c98f-4c3a-be77-902c61cda21b.json | 132 - .../6c1c1405-afa4-412d-ba1f-49dc1cac4509.json | 132 - .../6f4ed7c2-c775-4fd2-8600-4cea523f53e4.json | 132 - .../5fd5206b-186a-43b9-a4f4-07e75aa0293a.json | 132 - .../b707ecbf-0658-4226-803d-53456d16d54b.json | 132 - .../dca1ee57-5e86-4532-a2f3-ac6a619ca576.json | 132 - .../1233476a-7839-4a22-a7ca-1d0f237d8888.json | 132 - .../5c4bdeca-5ef8-4002-8f82-67d49b5ff722.json | 132 - .../18f5fd6c-2b79-4d48-b7e9-18845db16271.json | 132 - .../a9039374-fa5a-4b8b-800f-5f4651cf812d.json | 132 - .../3f9704b4-bf25-40da-b6dc-b927c3569f40.json | 132 - .../a8f858d8-a792-409f-b79d-948a19e2aa87.json | 132 - .../5c34a168-b8cf-436b-a3b7-a2d1feadffb9.json | 132 - .../77092cfe-9820-45e8-94c5-31d27f1daa7c.json | 132 - .../cab8fed8-de68-4fa5-b4fc-d9483fc56571.json | 132 - .../a8103350-b208-4856-8e7b-8ea8918ba0d1.json | 132 - .../e849c03c-c569-4059-8fc5-6a98cf391342.json | 132 - .../f1d8bffa-61fc-47d5-85cf-48cebcb31af5.json | 132 - .../97bdb352-2e9d-4cc5-8b70-55348ef3a217.json | 132 - .../78053a33-24c8-4e9f-8791-f127f21eec1c.json | 132 - .../03082966-87ba-4560-a784-5d8677003500.json | 132 - .../97f26b20-db66-4a30-ba2a-c18a31081271.json | 132 - .../85f9ccda-8c47-4fa1-9d47-e9da4730b077.json | 132 - .../2a57d6f4-643b-4b30-8d67-03032d454887.json | 132 - .../d333f360-c1c3-4916-8480-4a1fc490875a.json | 132 - .../37a41261-a7b0-44b2-916f-770cdfa0ad39.json | 132 - .../c46cd6cc-b56d-44c5-a03c-b49381ba3462.json | 132 - .../612b6226-c25d-42e0-bcd7-be7faa844530.json | 132 - .../2fc7a4d6-88e0-4f11-9110-dc53942870a4.json | 132 - .../34665752-58d8-48ee-81a6-f1a068c23026.json | 132 - .../cc0767b5-4aaa-4418-8f68-72a721323e9c.json | 132 - .../ea507a41-1654-4515-94cc-ce2e38800c61.json | 132 - .../c44e773f-4cca-4780-bdd4-f486e65c18e0.json | 132 - .../f8a46bda-d53b-484e-8832-7939f7d0762d.json | 132 - 
.../c3968a2d-4a9a-4f62-8bea-a3b4b6dcd378.json | 132 - .../da18242c-d6bb-4a0a-a2f9-2e42099f4e8a.json | 132 - .../ac078124-85d9-4715-bf7c-1428b1063732.json | 132 - .../9c1dcd75-8491-4890-ac6f-000868099a3e.json | 132 - .../7850fc57-49c7-4124-b7c6-e1e7bb2bc726.json | 132 - .../8f38374e-f373-4639-9278-24441ebd0325.json | 132 - .../c007938e-3427-4896-8493-1500abdfbd2b.json | 132 - .../df81dc0d-6c72-49e9-862b-02e9b6642cb6.json | 132 - .../46c96d8e-568c-48f8-a74b-9dd4b4195037.json | 132 - .../1f4f7181-8a81-49f4-9e81-925d5d69a37c.json | 132 - .../3ea343b6-93f6-4c61-a164-3db95d13cbdf.json | 132 - .../a9ea8bb5-05fc-4da3-8e00-f53ab8ea6af5.json | 132 - .../0ea74ce5-43c9-43eb-92bc-3d928062d9e0.json | 132 - .../6896faa7-7204-4091-8f4e-9cc0b53d673a.json | 132 - .../88064453-fd8c-4bd9-adf1-39f43972bec1.json | 132 - .../a18ade45-acba-4059-b969-445e529a82e2.json | 132 - .../6c0e4132-71e7-44af-95fc-83b0a6be2a82.json | 132 - .../5d9ab422-4f4f-460d-bd39-51266b43d7e5.json | 132 - .../cda03c45-0782-40cc-a17d-67d808657b83.json | 132 - .../50f5451b-41c4-4ba5-8bee-ee8a2deb7e79.json | 132 - .../cf758994-6e94-434d-bf68-74cca188b5e8.json | 132 - .../611f9549-0788-44e9-8125-18df06cd80d6.json | 132 - .../59cf23ba-027d-4bac-a0e1-526376396b4d.json | 132 - .../1f02bbd3-ddaf-4db6-b7f8-31bad8ffac66.json | 132 - .../1e737e28-d926-43e8-9e4c-e39fa91d7977.json | 132 - .../43ef8eee-5d8a-47e7-ac71-1a898421370a.json | 132 - .../d8d03c71-942f-4aff-8a5e-5c265c639b44.json | 132 - .../96262938-1146-4993-92a1-a2ddb2519f8a.json | 132 - .../292d7cfb-3e3c-47d8-8cca-33507f9ff081.json | 132 - .../3f29c10f-57ef-435b-85df-2cae30ae72fa.json | 132 - .../d7f022fe-86cb-4e4e-a672-62c2dc8cffd3.json | 132 - .../baa35c90-c494-4dff-af28-cb549e40bed8.json | 132 - .../2fdc3186-6791-4550-ac4f-a1a5a5a1d514.json | 132 - .../f687df8b-42b5-4d94-b741-1b516d9221b2.json | 132 - .../c3a8a952-6869-4eee-a59f-4ae33ac72986.json | 132 - .../a7a74117-71e4-49b2-bd65-add82c9165d8.json | 132 - .../04ee694c-0c89-4f25-b10f-315a24743ba2.json | 132 - .../47fd4acb-acc3-4f12-8af5-c425d3754c38.json | 132 - .../e19577f5-d1ba-45ad-8500-d18ae2b14440.json | 132 - .../e86443cd-453b-4ca0-8e7e-054764fe4bb9.json | 132 - .../24cd9977-f3fb-4619-aea1-59e1a36b2a5e.json | 132 - .../1401f0d9-6f4c-41d2-819f-eb9487c5c1e6.json | 132 - .../4b1f2aab-ef92-4231-9bdd-96918b26914c.json | 132 - .../4956e127-14a1-405e-a0e0-76fe94ea727b.json | 132 - .../90fb6e40-88f7-4ce2-ae99-308d87e69718.json | 132 - .../cdad0f08-1c60-4493-bed0-9733894b367a.json | 132 - .../8e83b4f7-736f-4e03-8256-2a1fc421b04f.json | 132 - .../f0d6639d-8485-4bcd-b069-046a747dfbfa.json | 132 - .../d1fe36ba-04f8-4110-8c39-81d393c4cbfc.json | 132 - .../5a8ab5fb-ec1e-490c-b643-e3b9d49f5d34.json | 132 - .../de944f89-d2d4-4b01-b4b5-e7cbd1d8d1ae.json | 132 - .../db96601a-2f7f-438f-915b-55fee0e0d1d1.json | 132 - .../27912f7d-7033-4b7c-b93a-af1673ce4a9b.json | 132 - .../da58a484-4a45-4a70-a651-031ada8023d5.json | 132 - .../e8bd221d-8a89-4e3c-8815-0bff27574053.json | 132 - .../ffc21c2a-59fb-4ad8-88a4-930879b6eba0.json | 132 - .../1e506afa-0d08-45d6-9242-b06104aa67e8.json | 132 - .../7d66bb93-cb2f-4be6-b133-1f0325be58e1.json | 132 - .../936f3c5f-7817-4118-96c8-e4061d4560fb.json | 132 - .../7d36ceed-2a1b-4b20-88ae-0a609cc161e9.json | 132 - .../77cace56-503f-4531-a4eb-0178a68cc283.json | 132 - .../9e49b710-2413-42f3-8943-bc9dbf68cb3c.json | 132 - .../9a5b3564-97df-4661-a171-37322386ac4d.json | 132 - .../0fc0450d-cdf1-44b5-a809-202d1dd6b5e3.json | 132 - .../7f06c78c-f95e-4e50-aa57-da0579adcdae.json | 132 - 
.../06e55e47-9995-4fa2-877a-c728e9f9f1a1.json | 132 - .../39af1e0a-d1e3-4372-bc18-d07f3dff09f0.json | 132 - .../f32d59d6-8ab9-4b7d-ad9d-f62ce6d559bd.json | 132 - .../7ddc3aef-c6c5-4d04-8473-3b3bba219d7f.json | 132 - .../ce80ac07-22d2-4883-ac6c-40b080e00b81.json | 132 - .../cbece170-f872-485f-a6c2-5db17ced73bc.json | 132 - .../c1fd751b-c6c3-4350-9618-f4b4840e1b69.json | 132 - .../bfd28b91-3a72-4417-b52b-804d2cbae12f.json | 132 - .../32c26cbc-3697-47a6-bd12-18187df9dda9.json | 132 - .../02280b9f-bc01-4e44-9d09-1e4ae8c0438b.json | 132 - .../a57d2d49-5ccf-48f5-8035-b1d480c80f40.json | 132 - .../6b5a3c69-f8dd-4952-96fc-b6e4dec1ed9d.json | 132 - .../fe0665dd-b976-4d90-b16b-6c2acfef15ff.json | 132 - .../8c6bdc44-fd29-45e7-b161-2c8e07ef2935.json | 132 - .../e7c70ff9-59ad-4d09-8af0-ef9cf16d1dfa.json | 132 - .../26c4c993-ae49-42a0-be0a-f157be9f7d58.json | 132 - .../19adf124-c120-4e97-80cf-49c40a66eb81.json | 132 - .../66bc5d38-8d25-4934-bce8-41ce4ea0e385.json | 132 - .../541eafe5-807e-44b0-b652-a0752210fc71.json | 132 - .../845a2484-9f17-4c0e-b06b-6250992298bc.json | 132 - .../e62b6b26-5f3c-42c9-9541-bb8b23caee66.json | 132 - .../ec773b66-24fd-4b6f-ac9c-ebcd355e4be7.json | 132 - .../a70b8356-94ce-4f0d-b44a-2215076eed5e.json | 132 - .../b182807d-587e-4702-bf30-dab11983b8db.json | 132 - .../c1f0944a-c44c-42e9-90ba-a847509cbd66.json | 132 - .../64bb8530-7071-402e-ba9b-1d15ecbe275c.json | 132 - .../4f1fc265-f8b7-47e6-a9e6-cfa61b89ad4a.json | 132 - .../1420df5c-690e-4b01-b99c-c21c793689ae.json | 132 - .../aa9d0b0e-cb3f-452e-bc85-f7cf172d2b8b.json | 132 - .../dfabd777-8620-40e3-b19c-a9227f57b638.json | 132 - .../08fe3877-ab04-426a-9e27-72ec4ff8ffc3.json | 132 - .../4b264bb0-bd7e-4b15-9591-50b5a521f100.json | 132 - .../a8cfe336-0c3e-401c-a1e9-d951e64918ec.json | 132 - .../5e66c653-41b1-46de-b677-ffd8426ba5ec.json | 132 - .../9f0f0914-1f7a-468e-8a2e-7ae122fd064d.json | 132 - .../cc64a143-4f1e-42ee-ade1-fafc4b316336.json | 132 - .../cf322e64-2682-4a9a-a48f-c4ec47b852f2.json | 132 - .../30b32261-b24a-49e3-ba57-172dc1d03ba0.json | 132 - .../0681c01d-23f3-4b8b-9516-a5cc41761fc4.json | 132 - .../7693ed8a-f76d-482b-92c1-f11810e522ca.json | 132 - .../f8dc0128-c606-490a-b965-59d5377dd778.json | 132 - .../844547f7-658f-41dd-ab4c-dc0569030e59.json | 132 - .../75c291b5-6d60-4bde-8621-f865196a6ecc.json | 132 - .../36d54b12-594f-47fe-9637-a9b740416c5c.json | 132 - .../57733383-9573-463d-a467-068d2685014c.json | 132 - .../eda1ac9a-98e1-496f-bdeb-1e256b52c14a.json | 132 - .../00b8bfda-c6b1-4e1f-b68c-bff7335e2dff.json | 132 - .../0a3b9ad6-b853-471d-a292-413b30273034.json | 132 - .../d61c3ace-e353-4c0b-9472-c9a1928809cc.json | 132 - .../2293a19a-b650-436d-9448-1b641e63d407.json | 132 - .../c15b977c-c781-4b17-ac9f-25c77602c875.json | 132 - .../42c191be-c0ae-4170-8b6f-565053ae7d9c.json | 132 - .../f5cb910d-6e5b-404a-a751-d5cb90668150.json | 132 - .../de806e4c-dbf8-48cc-a0d8-033a61dfc777.json | 132 - .../59150b73-b05a-451e-ba3f-696d04effe05.json | 132 - .../84926b81-360a-480c-b240-f154ec7fe0ba.json | 132 - .../8e6edb04-302b-4dfc-b38f-94b437c921a8.json | 132 - .../db92c564-1cf9-43db-9e25-1f450c7b1e7f.json | 132 - .../e3796243-cbba-4ec2-ad7c-89547ad24342.json | 132 - .../1479be90-df8f-4e1d-b9db-03e84000187a.json | 132 - .../d2e6c48c-1c18-45a6-ba1a-b335325c980c.json | 132 - .../f843e45a-f66b-4091-a964-75583c2d7fc5.json | 132 - .../cbc3cd41-e187-4c4f-b207-37bceab423a4.json | 132 - .../0f124566-5e94-4233-9a3f-5ff9cfdf160c.json | 132 - .../98fabba8-7d70-4a1f-b03c-37e1a9ac94e8.json | 132 - 
.../91522dad-529b-477c-8372-793f631e14b7.json | 132 - .../cec22734-493c-4d11-ba86-6c7ae2005124.json | 132 - .../704a6e19-0d86-42a5-b8f5-05a5856e9c29.json | 132 - .../bc54349d-59e0-4ae4-94f9-3f5ae98261f4.json | 132 - .../d20d533a-758b-477c-b4eb-073adaed640e.json | 132 - .../f7c9ad0d-3fea-4bec-8ac3-46f01a3449fb.json | 132 - .../9db1f823-e068-4a39-a5cc-b9c588099427.json | 132 - .../23818b45-bf5f-48a2-982f-1e2a0d35aac8.json | 132 - .../de6eda66-b8f5-4b23-89e1-44bbac600953.json | 132 - .../632974c2-57e2-41f9-8c00-671e07e7594b.json | 132 - .../e86dcf4f-6282-4aa6-b645-00f93a2e9077.json | 132 - .../b20be5c9-9720-4076-b587-728549dd19af.json | 132 - .../5e193803-39d1-4f12-8726-ebbe5f71563c.json | 132 - .../61131a6c-f412-42bf-814b-7d711a840d44.json | 132 - .../535e72b1-17e0-40e3-9d66-d31f8ec70413.json | 132 - .../ea15479e-24a8-4924-a754-a8567c511e61.json | 132 - .../5799f285-c61f-43a8-a6a6-053808cf4e8f.json | 132 - .../36feef44-3d3b-4102-8606-ee6420bddcff.json | 132 - .../fd55f19a-2c22-4f29-82e0-15b02f25b9a9.json | 132 - .../18e5decd-c95e-43d2-9ba2-007ba32e216f.json | 132 - .../85a4996e-8c44-4e4f-9478-19a8c5513617.json | 132 - .../db6d57c8-df0b-407e-b937-67c55b513a5f.json | 132 - .../89ac933d-0a7c-40e6-8fa7-35bb6205e44b.json | 132 - .../c79e690f-3e09-4fac-9412-937a3b7ef352.json | 132 - .../ce74b7e3-8505-4c79-a7de-12d1e6b47155.json | 132 - .../3c562d8a-2df9-4d3f-9699-bfaee4a1ce2b.json | 132 - .../152b0cbe-e27b-4438-8326-e67f4e70e600.json | 132 - .../c733c91f-79a9-49e5-9398-3a424ee1940a.json | 132 - .../32d7b6c6-de5c-4864-a446-97dccce378c5.json | 132 - .../7b22d02b-5bfd-4243-9ad9-c858d0af55a6.json | 132 - .../99650529-55d9-42b0-b812-761a30277e5e.json | 132 - .../81abbc2a-791b-4a39-bb46-97edfa14b9c0.json | 132 - .../c658e535-7098-40fc-bea0-f5734d8f4ca9.json | 132 - .../9e0656e9-9b82-4f6d-b00a-c09cf9cbc105.json | 132 - .../07c36058-e0e8-48ea-85f3-0a2cb2fe3443.json | 132 - .../c41d8925-b56b-458e-b1a9-27dbbcaee149.json | 132 - .../9136feb4-5c3e-48b3-bc70-c7816b8b189b.json | 132 - .../c395ef02-9a50-4696-aad2-bcb32ba05f67.json | 132 - .../93f47969-556a-4fd4-b7bb-4d1c861a8d71.json | 132 - .../349ae559-6c1f-4b2f-954c-e83cba1e603a.json | 132 - .../3e43c3f6-645b-4ab3-b684-b23eb67bc5d9.json | 132 - .../500c8cd4-fe4e-44f3-86b7-b0efd387ab92.json | 132 - .../340a3ebb-bc06-404f-84e7-aeccc016fd32.json | 132 - .../a6426f88-d7cc-4e6a-a2b5-76e59a52a6de.json | 132 - .../bdd05c8f-b895-4c91-9a9f-a608a4259cbd.json | 132 - .../0e1e45d4-2747-480d-9b1f-2b200e250271.json | 132 - .../00f3f9ca-ae7d-4e62-9e7e-6bd202dbed59.json | 132 - .../c9e57ab2-c2a4-4935-b976-4bf24647b777.json | 132 - .../c22436a2-ec60-4220-82b3-123618165eb2.json | 132 - .../1f990438-dd84-44d2-99f9-a10035ecd652.json | 132 - .../f4564f5e-3595-466e-8201-0e2a4c50ff0d.json | 132 - .../040def3a-702d-4868-b429-39697ca36207.json | 132 - .../9e24fd65-56ec-4160-b299-b34d702a3231.json | 132 - .../216bf9f8-9521-4311-a40b-8a847271265c.json | 132 - .../45f8c4fb-3591-44df-a4f0-57093b9bae23.json | 132 - .../d17275ef-8a32-4fcb-94f4-fb24299ba50e.json | 132 - .../61b79e7d-0f50-4cfe-825c-ed5b23d943f3.json | 132 - .../113c3507-b738-4b06-ada8-da93b19c6ae2.json | 132 - .../8835d5c1-8350-4d42-a753-82b94dffda3b.json | 132 - .../dc3bbda7-5007-44c7-b1ba-af0c82d100ee.json | 132 - .../0d24ee06-a6b4-4be7-b3ef-c4f53b4fc414.json | 132 - .../f2415b7a-2cd7-4a05-834b-7da992e1da1a.json | 132 - .../01af237f-40d8-4841-a90d-13dce6db8634.json | 132 - .../d69bb392-fd38-4f57-b567-24566896167b.json | 132 - .../63503943-1c1e-4dac-9c41-4933fbb44b70.json | 132 - 
.../80c5d343-41e6-45d7-8921-62586a3cd270.json | 132 - .../2c27d7f6-60fd-49f3-8666-784f2a16031b.json | 132 - .../cbcc1e64-8455-4382-8999-654d1757bbd6.json | 132 - .../1bea4f6b-7a41-4907-baca-430c7ea179e9.json | 132 - .../298ce89b-966c-4f4e-9da5-3803a395188f.json | 132 - .../ea27a4d6-8c32-4b36-873d-1046ae6240e5.json | 132 - .../73d5905d-7825-43ba-8051-7e1f5639b857.json | 132 - .../956b8589-a048-43be-9cfd-05658d3c57ca.json | 132 - .../36f597b4-8f53-4b40-9c0e-c9284743e456.json | 132 - .../7b67e526-7588-4c62-9293-55e77851c4c7.json | 132 - .../8bc96d6d-0cd7-49c4-8112-7d8fb1c45199.json | 132 - .../6751a200-0bd9-498e-a991-ebe22375633d.json | 132 - .../f41442e3-5aa7-4ca4-9e61-a5e13965a3e4.json | 132 - .../b105b62a-ce77-4387-b679-1adf2782b2f4.json | 132 - .../72180fd7-bf34-4758-b02f-7d11859700c7.json | 132 - .../ac5aaa9c-79ab-4082-b8c5-084fba3e122a.json | 132 - .../2d266d7f-8edd-40fd-adfc-597a7742167b.json | 132 - .../484ccbf2-87e2-423f-9de4-a4bd54291b54.json | 132 - .../4de79504-f9e8-4235-9aad-d38f0799e081.json | 132 - .../b4bde9d8-f50c-448c-ada4-5bc05f302c04.json | 132 - .../5da3240b-b5e3-4333-ba61-925343b56043.json | 132 - .../d6727b7d-cdf3-48d5-8e30-484e86ad60b6.json | 132 - .../15b86bbf-8d3b-474b-98f0-abb3972a7271.json | 132 - .../c0b339f6-4a46-46eb-b2d0-945176afe676.json | 132 - .../79367289-6245-4bf0-99e9-42bc3ff7649c.json | 132 - .../c3ec5505-1086-446a-9739-523810e93d13.json | 132 - .../c6c5e462-d373-4536-afc3-b740fb7e300f.json | 132 - .../b7537abe-8177-4206-999f-5bb7e95c72c8.json | 132 - .../eb2f6159-e37e-46db-9419-6a66cb7e539e.json | 132 - .../0b2d0a06-2907-4258-be33-1591e18ac6a2.json | 132 - .../0284d867-45c4-4fe4-883c-8e3ea169d66c.json | 132 - .../1a2da513-104e-4074-b3b7-601ab11bf6d8.json | 132 - .../189db16b-5e78-439f-9f79-6eec979c3a79.json | 132 - .../d751f1c5-5505-4c12-8d51-091538b49949.json | 132 - .../b6f9144f-57a0-4c18-9e52-ffccf2d8ca9c.json | 132 - .../67dc7fb2-1455-4f60-9dcb-59a8197741d7.json | 132 - .../7f4ab590-29fa-473a-b617-00135dd1d6ee.json | 132 - .../d67db62e-e21d-43c8-8b4c-bfa353e47636.json | 132 - .../85abff46-8ae5-4a75-9522-721793224363.json | 132 - .../1736bbd8-4457-4d55-8c0b-0ae6e001ee62.json | 132 - .../4777e427-8d17-4e06-8cbf-0883c95bbfd8.json | 132 - .../4df0b890-d4c5-408e-8994-88f7383e9235.json | 132 - .../76a5a59d-f5fd-4fb0-849e-7db7772b555a.json | 132 - .../6c8399d0-01ce-45cb-a20f-a49e4e760a1e.json | 132 - .../92c2c5ee-dfa2-4db3-8401-887d02cc21dd.json | 132 - .../b40ef568-f277-4d5c-87cd-53feaa71598b.json | 132 - .../893d5149-c535-41c7-8a1a-26bb6b33e407.json | 132 - .../0b649ed5-5af4-4910-b853-2408e3b58f1f.json | 132 - .../5c8edeba-5c65-4168-b67e-02143acbcafb.json | 132 - .../67e657ef-d602-4f58-b898-874a22f4a009.json | 132 - .../53d2bf07-689a-4e69-a534-b288313c8481.json | 132 - .../34d6a184-d4d5-4609-8305-c0e2ee1c585b.json | 132 - .../39b627ab-3e64-42f7-a88d-abe5764fcf4d.json | 132 - .../d8467b15-8a03-4cde-9fc5-5c08bdabb6c6.json | 132 - .../85bc5976-0d40-4416-bbf8-9b1dbf372343.json | 132 - .../8c7e8e64-672e-4c7e-a808-a49f1792d3a8.json | 132 - .../de8651eb-16d1-46ee-a1df-b8c72caaf205.json | 132 - .../6a744db8-814f-4e8e-b6e5-0d096267dfa5.json | 132 - .../028b7c37-770e-4356-a7c6-0cc74650d5fd.json | 132 - .../3b399c64-922a-48ba-9a25-862102749647.json | 132 - .../d5e46a11-3e81-457d-9d26-9fd17f96f076.json | 132 - .../b3abfbc1-911a-43b7-a338-efb25f746f9d.json | 132 - .../6b471ee0-9444-45ff-92cf-da624aa59bf6.json | 132 - .../b56bd924-0a63-4ca2-8f2f-97b581e47a36.json | 132 - .../bfe9098d-7207-4f8c-9a3f-549a29303b5f.json | 132 - 
.../7856172d-ec3e-4e71-befe-54952478e330.json | 132 - .../a68aada5-61bd-4a4c-a8e1-b9a2ace349df.json | 132 - .../9d19c44f-4912-4c95-ab3f-2dddb055d932.json | 132 - .../6cef3550-27d7-4073-b4bb-0f19a2c5f553.json | 132 - .../08ab8f6a-9aaf-4ab4-ada3-eb4a75f46995.json | 132 - .../622f9379-6a30-43ba-a7a8-fbd08c484fa5.json | 132 - .../24f728e6-de5e-44cc-8b6d-51e0065c1475.json | 132 - .../c3b2bf18-d355-40fc-a862-376c1b988305.json | 132 - .../79474be5-2587-4087-a2cc-1337e3b696dd.json | 132 - .../22ff2700-70c0-459e-96a2-0ce1710947bc.json | 132 - .../7d3a47a3-83d3-4f51-ab72-6a2fa5b5ef80.json | 132 - .../69dc0f8e-16d7-4907-9741-484eafa62b8c.json | 132 - .../e516abc1-9c3c-4921-a385-e2533d45fed3.json | 132 - .../8baa5832-cc07-4a31-a815-0e8151426ea6.json | 132 - .../509fbca4-f405-4c27-85a9-1eea59025070.json | 132 - .../6f45ed56-6bec-4439-9adb-e79fcd74667c.json | 132 - .../512ff924-c1d3-4d75-a468-2bcdcda25cf6.json | 132 - .../86b561ae-c4d3-4293-a884-bcab26df026d.json | 132 - .../516d1972-9731-4234-a4b3-b96423ebba5c.json | 132 - .../274f6e02-c81f-4f2e-9747-e5de5cee1933.json | 132 - .../61638b55-296b-40fd-a39f-cc2276d9f94a.json | 132 - .../11c1b6fe-4815-415b-a4a8-d14073df6ee1.json | 132 - .../88e2cb24-288e-4f37-8753-f0daa825051c.json | 132 - .../8a1a6c44-17fd-402e-a22e-e795a1f612e3.json | 132 - .../1121af0b-61fe-424a-bc66-3164bcb1d833.json | 132 - .../35300d67-7ee1-4874-b351-87f46267cec9.json | 132 - .../6180b7b3-4b21-42aa-a62d-084a91568b43.json | 132 - .../7414d344-0e67-424a-9e16-00de0487ce02.json | 132 - .../f5fcd407-080c-4cb7-a299-7a7f919c734d.json | 132 - .../efe03731-6021-4dcf-b7fe-24cbf2d60fac.json | 132 - .../6ffed624-cc22-4b62-a447-3c02b0e43ded.json | 132 - .../ed867fa8-be8a-49b0-8c94-38085808b58b.json | 132 - .../c8b9a56b-0933-4085-8d5f-a1d8294699db.json | 132 - .../9b178661-ed9a-427d-b93c-b905b8089ad8.json | 132 - .../69588e07-7559-49c2-9423-19fd143e42f7.json | 132 - .../317589da-d673-4f90-93e9-59983f2ef54b.json | 132 - .../efab322e-ea15-4fe7-9bfc-15246003e59c.json | 132 - .../b1eac68e-b292-414b-9594-c921f8e10818.json | 132 - .../b7d08c65-8219-4067-9504-99e438a86038.json | 132 - .../e9c5b479-0dce-4de3-84d6-90c7515337f1.json | 132 - .../3c766465-29db-4b3d-b42f-a3222b38a096.json | 132 - .../e6c85677-61ed-475b-85a5-48b91ec76bcf.json | 132 - .../7b68fa5e-dbbf-4542-8767-6874aabf8f40.json | 132 - .../c103b7f4-a432-42d6-86ef-cb369e0c16ff.json | 132 - .../643dda41-37d0-4c1e-b856-58b774612886.json | 132 - .../ba2f284b-d7c6-4748-a8dc-4f80caa30c6c.json | 132 - .../16e30aa0-736a-4ef8-8ba6-78285b84546f.json | 132 - .../73eb729d-adfd-4dee-9bde-04a31f5528f6.json | 132 - .../0daad2ae-92d0-4522-a067-20332f72c96f.json | 132 - .../a3e3849f-a289-4132-b4a8-f67d67ad46a1.json | 132 - .../59a9ed26-a67a-4e76-8858-520400c90766.json | 132 - .../6c5c61b4-8037-4b28-8616-1aefa7963eb8.json | 132 - .../e9f9b836-fbdf-4996-9b35-2c8145a7f01b.json | 132 - .../5b3dae43-5d5c-4d19-bd47-5c0f68ecbb81.json | 132 - .../d5b31b1f-ace0-457f-bf8a-9041398b8344.json | 132 - .../b34702cf-ffb8-4e75-9c9b-f5c52623d4c8.json | 132 - .../c701f1fd-166d-416b-8f78-edf17f2fecd4.json | 132 - .../4217b403-e924-4f67-9b0e-ad1d4ed293a1.json | 132 - .../03816e41-5fb8-4815-ab9c-4108ab19a3bc.json | 132 - .../a763b10e-350a-4342-ade3-b782437ca3e2.json | 132 - .../9e806fd2-edbf-40e2-a008-834cee537bb6.json | 132 - .../fbcf861c-62db-4079-bba6-becd4e231216.json | 132 - .../22b591c0-3386-4bd5-860c-20c0c6001986.json | 132 - .../dfb9a9c4-114e-4188-9940-4d6df7e4815f.json | 132 - .../38fd5f4d-0f3c-4dc2-b250-a9ee7090aac2.json | 132 - 
.../e53cbc94-fc9f-4d53-ae28-26bc8c2caef8.json | 132 - .../2165e69a-c50c-419a-932e-909f53b73b71.json | 132 - .../46430a07-15c8-4727-9102-2f471d4f1d3c.json | 132 - .../3c7f540a-c850-4e20-ad93-60e021d17133.json | 132 - .../c3ab4f38-6f7b-4589-ae4f-21ace05b8c44.json | 132 - .../2708c0d6-03e7-4a17-b6b9-e16f3ddcf5bb.json | 132 - .../6427a5ef-8508-430d-970d-054fc485e754.json | 132 - .../08984ad9-1e9b-4916-b214-af26dadfcc0b.json | 132 - .../1dbb5d03-fdfa-4059-9d50-d037ada6b1ac.json | 132 - .../6bf42faa-c3e9-4069-bf93-ffd626062f0f.json | 132 - .../9feccbdc-18eb-4077-b50b-986db0047fc8.json | 132 - .../a074c33f-782a-409c-987b-7dd62c65ccc7.json | 132 - .../2f2c0dea-dcd4-4e54-9f40-9fda4b91bd40.json | 132 - .../84481fee-3727-427b-912a-30e2744df28a.json | 132 - .../aaa801dc-1a47-4009-9ad4-7129a8d4e651.json | 132 - .../3ac92cbf-c85b-4e00-9ef9-4322f961591a.json | 132 - .../162b511b-4684-4595-9261-a33f3a4117f9.json | 132 - .../20d5d59a-028d-4e34-9414-d9edaf2e59b8.json | 132 - .../a21b53fb-783b-440b-9f3d-d8ada3bd18ea.json | 132 - .../0d2ab1e8-a2d7-45cf-b123-67bcab2d9dff.json | 132 - .../6b4a37c8-c7e6-4156-9d6d-8cba51b74d82.json | 132 - .../78582fec-2f69-4b37-8497-12ceb097b44b.json | 132 - .../949bf65e-c2ae-4701-82f0-39d0c62a0e87.json | 132 - .../8812151c-4301-4131-a414-d64d025e476e.json | 132 - .../2db1542f-a8da-4fb8-91a5-6dd1a942b55e.json | 132 - .../9feeffb2-3763-4e43-933e-89100b76f7fa.json | 132 - .../721102b5-ed5e-4631-8600-a6adfff0c784.json | 132 - .../18c185f7-5ca4-46ff-81c2-6c538f096409.json | 132 - .../7ab5911c-e229-43e5-a798-095287d0a597.json | 132 - .../f800c4e5-e918-45bb-8a12-3ca2a64c6b23.json | 132 - .../5fcf41bc-30dc-46a7-9cf2-4ce2c7a5850c.json | 132 - .../d4b20ef4-734e-40a7-818e-f77e170d7437.json | 132 - .../e0996c96-c9e5-4d39-8e6d-1455ef1f9544.json | 132 - .../3ad2b31e-ce2a-4cb4-9b85-79cdebd5d364.json | 132 - .../9aff874c-1953-4b97-9bff-9e6120b0bfa7.json | 132 - .../45ae7f45-8c36-46c6-989d-bc672cdf8eff.json | 132 - .../7d36e44e-a329-4b96-a891-365ad900f718.json | 132 - .../a8c26325-1eec-43a6-a8ad-3bcb2e378924.json | 132 - .../bde1a879-6852-42ce-9217-f427af85a46a.json | 132 - .../dd7a0377-f4d6-4390-b9f2-bf50b05ec0f7.json | 132 - .../12cbf241-d6d4-4d25-ad3d-13a42d7adc74.json | 132 - .../1f66fd7c-40ee-4249-8963-5c7bb93a3eaf.json | 132 - .../7076406b-7e0a-49c7-8150-2e6a243aa23b.json | 132 - .../96c3fd80-a601-4629-a1ab-bf7f366a909a.json | 132 - .../1302c9a5-d35c-400c-b9f3-d990243e5d59.json | 132 - .../c7f48bbf-6583-4ddd-ae4d-671c43218dae.json | 132 - .../5f07e092-2eb0-44c2-b2ce-5f1b31a9ea99.json | 132 - .../15701682-97ce-46cf-8010-a6bdeaf8c7aa.json | 132 - .../c6eecf0b-fa16-484a-8eeb-d196203b3c3e.json | 132 - .../4337b1c1-cc00-4a15-8148-e8d0739561b9.json | 132 - .../1151ee14-8fe9-4f97-808d-8103b353c2ec.json | 132 - .../a2c18179-aca3-422c-b9f5-8345109cea13.json | 132 - .../07495d34-1505-45a9-bb48-887af0da8a0c.json | 132 - .../567baf6d-99f9-46a5-8c40-c6899986f1ff.json | 132 - .../a337df3a-28ff-46c9-adae-4bc029937101.json | 132 - .../b201a849-44e9-4598-918b-ffa27c894ee9.json | 132 - .../dd87ebf3-3088-43b1-851c-a97d12a68ea8.json | 132 - .../1b3ef805-8b0c-44bf-b048-773a0dd94d0d.json | 132 - .../220cb478-58c0-4028-b51a-ec5fe1050746.json | 132 - .../17cb8ab1-e7ba-4daf-95d4-2cdbd2777434.json | 132 - .../2b55023b-b8bc-42a2-aca8-dcaf39890232.json | 132 - .../31736569-5992-4b1d-9d66-27a6c1620506.json | 132 - .../630b37b5-351c-403c-ac76-ccb68ffc5d53.json | 132 - .../69cdef01-30dc-4f75-97fa-9daeebcec72f.json | 132 - .../9aa1acb0-c791-4dea-aa1e-c912cea69466.json | 132 - 
.../0c1d66f3-8fd7-47f2-8538-a1aa8985aebf.json | 132 - .../2872dcd9-421b-4346-812c-b27bb32c6e86.json | 132 - .../2f3e2fc0-f1e0-43cb-8a8c-6aadcc538646.json | 132 - .../d0a76497-84b0-45b9-b748-04ffe9bc13a3.json | 132 - .../185b6560-6790-417f-aeba-f7405fee808a.json | 132 - .../30a8074e-df03-4866-9b8d-a5a7eece3c71.json | 132 - .../ac8874ae-d6d6-45d3-aabc-06a3852f68d0.json | 132 - .../bc98b048-18d4-438e-80c4-0cd851798da5.json | 132 - .../c88c011f-0a24-4e78-a104-035d25af2430.json | 132 - .../f9e3c31c-02c0-4f5e-ad4f-3be0801a0f41.json | 132 - .../5484405a-2ec8-4515-af75-76a5dd348d3d.json | 132 - .../7dc117b9-c2a2-44c1-8471-f3bc8a116e3e.json | 132 - .../e2d314dd-b5b3-49b5-8e64-1e3464f4b963.json | 132 - .../7ecb453b-1ba7-44ec-abfd-1f8be4c817fd.json | 132 - .../d0a70e95-fc72-41c6-ac42-09b8f379b566.json | 132 - .../e2ef8ea6-b464-445e-81df-ef0779c1d0d4.json | 132 - .../f3d7cca2-141c-4b84-abc4-396ad2d59e3c.json | 132 - .../e3f48d7a-c8a3-4e75-99d6-7f2946696b12.json | 132 - .../3feb9449-49a2-427f-a317-c21e6d1ca66c.json | 132 - .../6359e37e-0405-436b-903c-8f0e740dd6c7.json | 132 - .../f5daed76-f6e5-4a7d-84d7-80537a046b83.json | 132 - .../03af2b1d-989f-4afc-ab13-8793093b9c50.json | 132 - .../5db7ec54-7feb-4c11-b2e0-042226ba1f94.json | 132 - .../f1f5615d-8a78-43c9-b5c6-edc180252381.json | 132 - .../9c89bf8f-4b8a-4c01-8685-fafc687c673e.json | 132 - .../58b69c0f-826d-414f-915e-dd0b78d9298c.json | 132 - .../101ea548-2ffe-4f47-b3b5-5fbe9a3854b4.json | 132 - .../259c4798-ff03-4f58-8fb4-59150710212b.json | 132 - .../f731caa1-f777-494a-8490-da0c815f0708.json | 132 - .../d4d25d38-b21a-490e-9ca9-556504ec00ea.json | 132 - .../75bb85a3-40bb-4630-95a0-50e40b008412.json | 132 - .../bb44f3ef-eefa-48ef-a257-2eb345c89a00.json | 132 - .../2dcf1771-3dbe-43ad-974c-54e2e2860bcc.json | 132 - .../caa0c8df-5488-4bf9-a5b8-0fff831e6732.json | 132 - .../c6f8e581-e849-4e28-b3a6-1838ee522770.json | 132 - .../f0c361a1-a3ac-4415-ab5d-069bdf27e7a3.json | 132 - .../44129be7-f73d-4580-8375-e8ef324e73a8.json | 132 - .../2925ecde-a9a5-4369-b391-d23a8605d35c.json | 132 - .../8409e464-fd16-4b41-b533-2f6cae4fe894.json | 132 - .../86f6c6eb-8b08-4e6c-a1bc-0d941a00f10b.json | 132 - .../aa2e6df7-a0b0-42f7-8057-e2763fc34834.json | 132 - .../2bf9a06e-f3bf-4b55-804b-e553a722e0de.json | 132 - .../b380a675-39ea-4950-ad0a-d9771f09ddde.json | 132 - .../482358eb-7d3b-4de0-b5d9-451308f104e2.json | 132 - .../ef04a83d-7b89-43ec-ba33-30e1006422dc.json | 132 - .../7b64cf2e-c7c6-4b48-8e51-ea2aa0914145.json | 132 - .../52c8e3f4-1063-4d9c-80d9-fdd0a72fc98e.json | 132 - .../1f4a827d-31cd-42e6-871d-7c0cad010f58.json | 132 - .../56d6d99c-fba1-42e7-aad4-631370b44da3.json | 132 - .../006a0ac7-d6c3-42c1-b0cc-6a0bfe74f884.json | 132 - .../33a82686-6202-4a4d-ba34-bd4537105e5f.json | 132 - .../38d45554-44bd-4b40-b7c9-c0b7ba44b862.json | 132 - .../37d7e3ab-db9c-4ad7-81d1-933c030a6250.json | 132 - .../9cc49b3c-4e51-4f67-92ea-4ac8a3cbed43.json | 132 - .../b6bd8515-4c95-40ce-b2d5-af8873d261ab.json | 132 - .../d102e75d-3e20-482b-a243-bae3ec44e2bb.json | 132 - .../68920da1-af71-4ccd-88b9-554e3c72c4dc.json | 132 - .../c0eb144f-c726-4a80-bce9-384fb7a641a7.json | 132 - .../0b26f82d-36f6-4fd0-a0fd-05e4a1368a6e.json | 132 - .../8fe4360a-0924-4386-b4cd-89069f7ff55f.json | 132 - .../eeeb082b-7112-4a08-a87a-b2c9ae37efff.json | 132 - .../b8f933e9-867f-4934-9648-371d1e632116.json | 132 - .../8d225023-4b7e-48cd-ae67-6d00b541f17d.json | 132 - .../ee3b45e7-a5d6-4fa8-8abd-f6a77d5a6d5b.json | 132 - .../177ef040-da5c-4a65-adac-efdc555bd110.json | 132 - 
.../e9dc8337-eb35-4eb9-bca7-30ec1cd44092.json | 132 - .../f4549a39-0b28-4e06-998a-774f5f02cfba.json | 132 - .../a79af78a-adab-406f-995a-adb3893e1510.json | 132 - .../4e8e457a-85eb-4afb-a9fe-8f8ce6eaf4d7.json | 132 - .../eeb3a10a-d584-414a-90de-e018c47615c2.json | 132 - .../e83dadb0-5092-48b8-b408-e6bb1ac8a0ba.json | 132 - .../cebc7767-fbc9-45a2-808b-51e1a4f0f35c.json | 132 - .../b64b6416-b18b-47cc-a516-c613cd670b37.json | 132 - .../64e96d56-72a9-413f-8903-45821b98f71e.json | 132 - .../a3f44cfd-d1fc-4a3c-aa5b-a0f37fc4a192.json | 132 - .../79314f48-d92b-4992-b3c6-d31278c0867a.json | 132 - .../5a007612-c8e7-4f6b-baa9-a21af7e908c6.json | 132 - .../fdefdd3e-2d83-4430-bd95-e16a1935dff1.json | 132 - .../ffdd45bf-3409-4b92-909a-25a32ba27f82.json | 132 - .../a78ab8ac-2c2e-405a-95ee-0d1d27cf533b.json | 132 - .../d9d49bf7-f6f0-4c25-9182-d815454940e3.json | 132 - .../deb48e93-0378-482f-8a5d-7ec350497e0b.json | 132 - .../302a9a47-8603-42d9-85fb-64c60e7c6f44.json | 132 - .../28d52801-3998-421f-a37a-2b7b677d0eaa.json | 132 - .../32b4e23b-9430-45a8-bfa2-eea2e89792c4.json | 132 - .../0336e168-e313-44cb-a030-42e6d20e92df.json | 132 - .../11bd8b5b-2ea4-4ec5-8fe6-654aedb40fc9.json | 132 - .../6d97749c-3bfa-4c32-b581-a5e2b73303f3.json | 132 - .../ec58907d-b67c-467e-a3dd-b9f9c10138f0.json | 132 - .../a7f09a3d-025c-48fa-9358-863b9ae382b1.json | 132 - .../bf2be2d5-58de-4550-b733-a5910bded48d.json | 132 - .../52b32c1f-6189-4850-b3f4-de442eb2ccb5.json | 132 - .../87b44160-c3dd-452d-8c15-c4f758f8db7b.json | 132 - .../3e6814d3-54ea-493f-a9fc-85ae9eed1b05.json | 132 - .../35b7ff42-3825-4240-97bf-f8af7e8c23ff.json | 132 - .../c108173e-1582-4c99-9291-46986d7ba1cf.json | 132 - .../6feb08b0-1c67-4fe2-a001-0b3b84529687.json | 132 - .../d4ab3df2-109a-4eec-9742-dc3bb79d5a58.json | 132 - .../53ec995e-bcfd-4a72-bd9a-45d14da3f219.json | 132 - .../299a0397-89c7-4329-9599-9fc29a52db87.json | 132 - .../41adbc32-6cdf-49ba-980c-6eb6f722b40b.json | 132 - .../4236ece5-f2b2-44e7-9503-9731bff20155.json | 132 - .../b33d672c-4a96-4093-bc13-25c42303b918.json | 132 - .../2b4f42fc-8b25-481c-98f7-911c52fdd242.json | 132 - .../634b7a64-2bd3-48b8-b2f4-a93189801850.json | 132 - .../72a4bcc3-9dfc-4268-be4e-cda5837a3da2.json | 132 - .../78fa85f6-baff-4d95-ad3a-a0663f51b0a0.json | 132 - .../359231a5-6eb9-4f73-a6f1-d7fd7f35c7ed.json | 132 - .../79b81e37-f75e-4b18-b145-73c42625ced5.json | 132 - .../2d99af7a-f67c-4e74-9ba2-f1401dfdf9fb.json | 132 - .../315fa815-fab0-47c9-8185-00bc597c0176.json | 132 - .../0c1686db-b396-4ecf-86f1-e4e092491acd.json | 132 - .../57455fbc-b5a9-4a3b-9a30-7da0593fd778.json | 132 - .../a8f9d0e6-5a1a-4d09-ac78-47fd586384df.json | 132 - .../9d0d4eee-0b87-485c-843f-e32d08aa601b.json | 132 - .../e47c83ff-9a16-488b-8ccf-4a2fad2b14fc.json | 132 - .../8c7e25df-884d-4940-8185-4c1b82fac8c5.json | 132 - .../83611d50-01d0-4642-a104-daf77f1a0fe8.json | 132 - .../5cbdafba-6071-4da1-8b19-3de612e9ff18.json | 132 - .../1c934cba-c94a-4aad-9645-84658e0b5588.json | 132 - .../7aad3f6b-89d9-4c9e-9339-cf4111fc37c6.json | 132 - .../38d4a8ca-4273-4e6a-8a39-3b5ff20ec461.json | 132 - .../3d65fbc2-bf91-479c-a687-e9ef702794fb.json | 132 - .../650cdbbb-e066-4581-8d61-77aa6a4c402c.json | 132 - .../05d566c5-1810-483c-8ce0-84635b9457dc.json | 132 - .../37e3456a-92ff-4122-a697-ffbdc1c79555.json | 132 - .../70c908d4-f1bf-4553-9bf7-95eb593b4853.json | 132 - .../2ccc9c20-5414-4286-abcd-ad2b20f8652d.json | 132 - .../50f4560a-e172-42b9-b552-437aff158a38.json | 132 - .../c6a3abac-8a34-4725-915b-c27c3d0bc484.json | 132 - 
.../a8ed68ea-6463-4ff9-9dcd-034080272dec.json | 132 - .../5799ce8b-c00d-49f6-96dc-f7dd057a268c.json | 132 - .../0d261023-3e35-4160-98ca-241bbaee927e.json | 132 - .../f0454d3b-18b4-488a-94dd-fb24729996c7.json | 132 - .../6bafa7a7-3a2a-4141-9564-a762d1cdb1d0.json | 132 - .../37f20f86-40ba-4f63-b29d-efff6cb0e09b.json | 132 - .../bf0e7ce4-09e9-4879-993a-eb50b2a421d7.json | 132 - .../bcbc29f7-ea03-4dbe-a83e-d4940b2c6bea.json | 132 - .../cbea8d66-0370-4998-8e3a-06fef0a60f0c.json | 132 - .../ca48b670-b82e-46cc-beb9-2fd0f11d3585.json | 132 - .../d37f99f7-f9c3-48b6-84d3-7da5d77f5030.json | 132 - .../503c8a24-4ced-4dca-b9df-5733ce89c2ca.json | 132 - .../5c5283a0-819f-4112-bb90-5277423d9c00.json | 132 - .../b636bc82-1625-49b1-beec-cadaf4e1b1a9.json | 132 - .../00f481c1-0ef0-40bd-bd95-81dc9443a62c.json | 132 - .../7ea22fef-2d79-49ae-bf72-9153a4e239c5.json | 132 - .../64f441df-1781-4d01-b73b-2156413ad403.json | 132 - .../4e3676eb-8607-416e-986a-7098bc192820.json | 132 - .../2101369c-5042-48f3-a8f2-f9f56e7b6ae7.json | 132 - .../c4b86264-3725-4742-91f0-3e01f8d965a4.json | 132 - .../0308147c-dabb-46bb-8add-d332fcd5a800.json | 132 - .../a9977a0d-e199-488a-a26e-6269806fdb2b.json | 132 - .../56b89ec8-90c5-4e1e-a458-1bb8b5b92be8.json | 132 - .../4185c376-91c6-435d-ae3b-47cd85151049.json | 132 - .../26e45f5d-1e3d-425f-ba4d-b444dcda7f74.json | 132 - .../09be48ce-61f8-4ba9-b082-b9c475fa714d.json | 132 - .../27417bcb-fb2f-41d2-9dfa-9865a36f38d5.json | 132 - .../7b6fc3c2-a67d-450e-858c-fa87be122376.json | 132 - .../76b86418-5450-48c6-ae56-58a19016d055.json | 132 - .../e06594e4-899a-4285-b130-f7b605e5a6b9.json | 132 - .../9efdc773-a5c7-4709-88c8-96a67d84a742.json | 132 - .../1fcc2f96-afc9-403f-b82e-8e1804506582.json | 132 - .../bee1e134-9a43-441a-b977-522c510dd1ce.json | 132 - .../b70e1089-d136-4b2f-a253-f361bcf8cdcc.json | 132 - .../8b7e9c34-a982-4f4d-b5dc-66a12578601f.json | 132 - .../0ccc36d0-f546-46d1-91d3-15a40c7bf6c1.json | 132 - .../066abe97-2c6c-4f3b-9e5e-e144f130258a.json | 132 - .../a3af8f77-d915-4482-a2b6-c99744aada4b.json | 132 - .../82cc8b37-e242-441e-ac74-1662bcc0a0e2.json | 132 - .../1527c8bc-c1ec-45f4-9663-4cffbb808f94.json | 132 - .../337b8ce8-d697-47f6-94ac-7a420dd7d91b.json | 132 - .../3d6ed2bb-5be7-4838-abb7-49754f9c3bfe.json | 132 - .../0a6c7056-1bce-479e-84b0-f4eeea0bd3cc.json | 132 - .../3e236ad8-3828-407f-9076-743b465b8d15.json | 132 - .../9e90dcdf-ce2a-4a7c-8b89-6af8b7c2bcfe.json | 132 - .../940d88e9-085b-4065-b8c8-92ebe685deb0.json | 132 - .../7fdcd616-2c72-4c44-9646-9c32344bfa0b.json | 132 - .../9d358f55-810c-4ac1-adc7-83f95bd74c11.json | 132 - .../9ba3fe31-772a-4cf7-aa13-3680b6ad51ba.json | 132 - .../651a32b1-77fb-4acf-89bf-2d45b684944d.json | 132 - .../192c4037-753a-4790-80d0-33c4d277102d.json | 132 - .../679d66bf-244e-4080-9a42-0a0c6cfdc965.json | 132 - .../73b0ca8a-fb16-43eb-a9af-a01219cf6196.json | 132 - .../7f00ecbc-fcc8-43ae-867b-cb160e63a80c.json | 132 - .../a8238bd4-3982-4e45-92e4-bab77e528e29.json | 132 - .../f87f9f08-e989-4e99-a254-a3650e7ab1b6.json | 132 - .../f40496a9-fb14-4b2d-8070-84f55e6417f6.json | 132 - .../cc52f59d-5669-44b0-b1af-e6fd0836e284.json | 132 - .../67525a37-f658-40e8-89a1-de8bf6275a00.json | 132 - .../3cb34886-7a93-42b9-a8fa-fab5f4bd8624.json | 132 - .../0dd1f9fc-cf54-47ff-8ccd-148b45f3c921.json | 132 - .../7a05616e-7335-419a-914d-00fb287fe663.json | 132 - .../070a21b5-4cd3-41b7-9653-0d2d2e4f273d.json | 132 - .../5afc044a-3138-443f-89cf-74f1272cc632.json | 132 - .../a6c1d914-647c-46b7-b0e1-712b8d506780.json | 132 - 
.../43f35eac-0946-42f9-a128-eb8011c29588.json | 132 - .../04c22be7-2cf4-4774-b479-863199c7c3a4.json | 132 - .../fc3d436b-ec61-4458-a3c6-1df41057ea70.json | 132 - .../e3ed157f-f306-40fb-b3a1-d3434236759e.json | 132 - .../8793b3e3-f409-499a-81f8-c250c8092841.json | 132 - .../33572f63-15ba-4fbc-b1cf-56b978384d02.json | 132 - .../44c636ba-8303-4d75-bcb5-46e3c07a991a.json | 132 - .../0a002444-3e5a-4fc8-acc6-72210a4181a9.json | 132 - .../bbf936a5-3594-4d0a-b5af-7a01740d0c81.json | 132 - .../1164abea-4cc2-46a7-a44b-f024a2ce40b4.json | 132 - .../bfd88bec-fcc2-4580-a5c7-4792a0300a5b.json | 132 - .../7f49e582-a01f-481f-8345-1c384fc8b567.json | 132 - .../10937ed1-56e2-4aad-b717-5125bc8ac72a.json | 132 - .../f4622539-c0ac-4e9f-86d4-00e3c826d03b.json | 132 - .../6b13b2b1-68cd-4aae-8f2b-2400f40760d7.json | 132 - .../5b02726c-ba3f-482b-9f10-87b8d69ffeb4.json | 132 - .../21d6f2dd-7bd6-42a9-b14e-c25777497890.json | 132 - .../d0bc11cb-56ff-4c77-9446-e76e550e0919.json | 132 - .../ff78dc97-e9cf-4215-a607-3e80892af82c.json | 132 - .../0ff1c6ff-5404-4d61-b6c6-f6ef7ae9ca8b.json | 132 - .../48837141-2556-4658-87e0-bb88cfcd562a.json | 132 - .../f2d6da5d-3685-43de-8ceb-5b798f88e24c.json | 132 - .../9ec02ccd-329a-4d62-9f04-87de6fda5011.json | 132 - .../781d0332-e332-4ff7-8585-9c2d8395a147.json | 132 - .../d6dd460e-c352-4d31-8941-183c6eabd0a7.json | 132 - .../66bf6442-04ea-437b-88c4-e61afc6f7139.json | 132 - .../0d1911f5-a2e7-4511-a8d8-098cbf9207df.json | 132 - .../abc18648-ef96-4695-94d5-fa14be277431.json | 132 - .../ff1e7aaa-3f29-4192-a0e0-80fcd11ba055.json | 132 - .../cc8ef5bd-957f-4308-9539-00a696182056.json | 132 - .../abc7652f-b88e-40ba-847c-c99dce9f2719.json | 132 - .../56e36294-e616-45a1-8dc9-2c14cf3ee8d0.json | 132 - .../4b81caad-92ed-4bd5-98bd-58582854b5d8.json | 132 - .../2cef0040-6d4c-4c38-be40-5477911f3063.json | 132 - .../4aeef94f-823e-4be5-b4f1-37463e052748.json | 132 - .../3d367147-373f-4543-be19-55a6429558a2.json | 132 - .../cb93091a-6c46-438a-b111-cbf7e2fac420.json | 132 - .../ea6048f1-8be4-4ec8-a5d5-35ff1523d74a.json | 132 - .../f4dc1659-800f-49d2-a290-48e9d4b15581.json | 132 - .../d4d8a784-5bd5-4437-8e0d-75dcb967ae33.json | 132 - .../91017e73-f33a-49f5-ac87-f6e6a178d885.json | 132 - .../b7a75bca-6afe-448a-8e5c-53ebd577c964.json | 132 - .../8cdced5c-23bc-4426-a0c9-b9bf82913683.json | 132 - .../368784c8-6fc2-4340-8277-a6a9a9800a99.json | 132 - .../f7ddf26b-4b4c-404b-b9d3-6ceaf78d39aa.json | 132 - .../f423b0d1-3536-4865-9615-f89b9d15b14c.json | 132 - .../c7e8333d-1d79-4cfa-9833-fa42f9fcbb4b.json | 132 - .../b6149d15-3e0f-43d2-ae90-eca290a94edb.json | 132 - .../e21f5d83-6b71-488d-ad55-d23268fbd611.json | 132 - .../68e1a42e-4318-4b5a-a45b-2607b7c2fe05.json | 132 - .../12a03ffb-d66b-4d00-a43b-fd5be80e1b07.json | 132 - .../adbad8dc-7d13-44cc-a5c6-e8da1de27c37.json | 132 - .../7fb595e5-abbc-43ff-8135-c4bb4a2ea593.json | 132 - .../1bb09da7-1675-4e57-b46a-9791c888ce6f.json | 132 - .../3ed7dd5a-e431-480a-91a7-5ccd915057e4.json | 132 - .../9cab35b6-d6a7-475e-b715-e4493d07cd92.json | 132 - .../ef7149ae-8d50-4890-89ae-fb561a86d130.json | 132 - .../3fa14e1f-82a5-4c04-9c76-2a3f6d56aa81.json | 132 - .../4418c7d1-72da-4ed3-9d5c-9d8520f6641c.json | 132 - .../8fe13380-a045-4d63-96f8-ec977540478c.json | 132 - .../6da42427-c7de-4830-b368-ca7757ee1d51.json | 132 - .../5faf24b3-38af-4f3f-8377-bba70d75f8df.json | 132 - .../9a26214c-2601-49be-b1b1-03796b704059.json | 132 - .../fa71ed09-45d4-4a5b-bfb1-a61a359a8f0c.json | 132 - .../25c5b304-46d3-4df3-9ac3-75ffa972849a.json | 132 - 
.../88ed0272-39f8-4676-970a-525aee058991.json | 132 - .../d8eff5d0-061b-4b83-b96a-04f9ba47ea6c.json | 132 - .../dcb90e75-8709-4729-8c00-e756e6a9a49d.json | 132 - .../81dcf3ca-f5c2-40a1-8871-b0188d5e9ceb.json | 132 - .../0a0a4d32-c7a9-49c9-bba4-dae6b464a5b6.json | 132 - .../82a3a8ef-7e5f-48d0-a48e-41ea2c5b6452.json | 132 - .../e635e798-fa85-4430-bf1e-9d5ad7fe9f22.json | 132 - .../7ccaa29a-4f73-4794-83a2-b925d755d91e.json | 132 - .../ba8de8f6-c118-4bc3-ae8d-851e964684ed.json | 132 - .../4011975a-e2a0-466a-9b34-923e1b4f8733.json | 132 - .../8a172205-39c6-4dd1-86b2-11b234b37e3c.json | 132 - .../495b2e8e-e2d8-4158-bc6e-7568604d44e9.json | 132 - .../e6a97d0d-9dc3-43a5-a69f-8132e19f9c77.json | 132 - .../4aecfd45-f47b-4f02-a0ed-288cbef46a6f.json | 132 - .../a6f7bc45-c2b5-47d8-a062-60f20c3d7ea4.json | 132 - .../c85c79d6-28e0-4deb-ad84-901b725aeca8.json | 132 - .../73271472-d06f-405b-af9d-2da7c17e1eb0.json | 132 - .../4e40bb43-c33d-4324-aa02-5bb7f88a5d1f.json | 132 - .../9b36e4c0-0d13-4988-8145-b9254da2e76e.json | 132 - .../6a464798-0111-4c71-b156-72a5aba1da63.json | 132 - .../78252135-f15b-427d-86de-c32cd3dbcd0f.json | 132 - .../c3b7bd57-9bc3-4d83-aad9-7d6315748c0a.json | 132 - .../bce17582-e807-4b91-b0e7-0a890bf5eb24.json | 132 - .../f8371e81-f6d4-4441-bc6c-5d4a18da7d08.json | 132 - .../78407b2e-1f44-46f0-bc21-76bdc68f8d9c.json | 132 - .../bdb9e2d2-8d09-4994-a320-2f968bcb4898.json | 132 - .../c57d15c8-9581-4bb5-89e4-2fea1e3c584e.json | 132 - .../550d5665-7a8a-437e-b318-000690dd250f.json | 132 - .../a1922f33-32f5-4f99-8df6-e2080808d292.json | 132 - .../6ccc376b-24a4-42cc-8ea0-823ef14336db.json | 132 - .../6547b6f3-63dd-4516-b294-62c4246c3dc7.json | 132 - .../a58bf2d3-d209-41b8-a795-ba7a16e4a28f.json | 132 - .../b15ad3b5-7ef2-439e-9acd-a85eab520d31.json | 132 - .../64da2654-9fdb-4a08-ad16-cf8793a30ed8.json | 132 - .../37080215-ee30-4e59-a407-b14695ac2a38.json | 132 - .../b83a0ce7-bf13-4a98-81f3-04e5a44105f7.json | 132 - .../bb7bea21-5bc6-460d-98ff-b3ed02d5b215.json | 132 - .../da9ddecc-43cf-4055-a19e-795b1ee98826.json | 132 - .../a93ccb3f-f2d9-415d-8397-0c7fb765fada.json | 132 - .../d0f86765-bdb4-4367-986b-28303bbe1844.json | 132 - .../693bb191-ae83-49dc-9df1-2f68b1b5fe4a.json | 132 - .../7b2c0b72-6421-4f33-8593-a4bbfd0c6d6b.json | 132 - .../c4ee822f-fc8b-4523-95b6-7c3f12a334b3.json | 132 - .../1810033a-185b-4c91-91d3-43b8f6c61443.json | 132 - .../beb721ae-a35c-4f6b-a80f-aac4835d5f8d.json | 132 - .../cf20e77a-340f-4d8d-b593-9645bdfc5877.json | 132 - .../eec73e49-ac2b-42ed-a115-76e45007cd5d.json | 132 - .../aa06d058-87f9-4fde-ad53-139b29a71448.json | 132 - .../3f1d571a-fc42-411b-88ab-4700d5861367.json | 132 - .../74a56080-aeb2-4cc6-a825-bbe4d9a5900a.json | 132 - .../2eb433ba-5c93-4355-99dd-edcb65721603.json | 132 - .../826fc3ab-6ff8-44fa-a745-a0b80bcb2db4.json | 132 - .../6da54964-e3b5-4567-8ce4-7e0f279af84f.json | 132 - .../a7dde688-a0ae-4731-909f-0bef0c6eeba9.json | 132 - .../eb2a8a60-2240-4b08-9dc3-be0215aa7bfc.json | 132 - .../9b05919f-d7c1-4e04-9dd8-9ae70e0005e6.json | 132 - .../6cd98538-74b6-4ac6-a3ac-9a311cfe47f6.json | 132 - .../b0ca2dec-387f-4b27-9adb-772af1899832.json | 132 - .../53c4b397-b78e-4699-a01e-3535aa072225.json | 132 - .../f5b251f0-741c-4ad5-ab04-19c5202854ea.json | 132 - .../7b2ba13a-e01d-4442-9abe-d16df1a1668a.json | 132 - .../bf79f87c-3f14-49e8-acba-725e709d5f11.json | 132 - .../3fbac7d4-cbbb-4b77-9db4-fd7e122cc90e.json | 132 - .../6efd0dbd-b8c1-4c66-bdf7-19055c16ca22.json | 132 - .../1388b8d4-c711-480c-8a06-a8b7bd8aa79c.json | 132 - 
.../03393ffd-1923-4767-ba14-d0e3e6751842.json | 132 - .../b7d049dc-127d-4075-8067-22adac9a58c3.json | 132 - .../89d79024-f4b8-4165-bd88-47f2b0010800.json | 132 - .../d2c0fb0d-6c0c-464a-b09f-6382a57b6afb.json | 132 - .../a891b28a-2dcc-4b8e-ad20-1f23d663b44b.json | 132 - .../55e274bb-1e2c-4402-b7ae-09ff7b1f9738.json | 132 - .../fe7a6940-fc4c-4345-84be-609c8155be57.json | 132 - .../77eb2b0f-e3e3-474c-bb02-dabde2998ef0.json | 132 - .../94d744be-5d28-490a-ba9a-8440cb97dce9.json | 132 - .../2765061e-7506-4eb6-b63f-312f6290665a.json | 132 - .../167c937c-66c7-45a8-bbd9-97d98531bf7d.json | 132 - .../9587c35c-1def-46e7-8642-7acb0340be5e.json | 132 - .../1c9594fe-03d6-4ec1-9da5-99960da0dcd4.json | 132 - .../8ed2c4eb-bc72-4dde-a559-1afd1698d37d.json | 132 - .../a2f9536a-9266-4aee-be90-d04f4dcbe53c.json | 132 - .../7f116aaa-3880-4e53-948a-4b06e0d26cff.json | 132 - .../7cbe4516-2be2-421b-95f4-c9500ad64ca5.json | 132 - .../07df565a-bc30-4a9d-b472-7a85f35938be.json | 132 - .../7545f7db-10bb-4d97-9b3f-4346f4f26bad.json | 132 - .../47384f10-ac6a-4629-92db-86f01a441f7f.json | 132 - .../3c9f022f-3e2b-48d6-acb9-07f066cfceb6.json | 132 - .../1d851cfb-8624-4516-8204-85569c60dc67.json | 132 - .../a7990990-7498-4b74-a0aa-9c266910698e.json | 132 - .../0b41d37e-0728-4575-9662-c150e2e29bd0.json | 132 - .../c565a7e9-bd1b-41a5-bff3-3a349553f4e8.json | 132 - .../680a4507-755e-4014-877b-6032f0220270.json | 132 - .../5ace8dc6-e348-4267-bb4a-f71a335d074e.json | 132 - .../07549821-db51-4b77-980a-056131b5dd29.json | 132 - .../ff12a0a1-a913-441b-955c-bcbd50056acf.json | 132 - .../947cfc2b-b73c-40eb-9e57-be5278776711.json | 132 - .../53639078-c50a-4147-bab0-16993f1790b6.json | 132 - .../b2cf96e0-382e-4200-a4a4-d66e8a188878.json | 132 - .../d4ed3eb6-f569-4d4b-8da5-50eaaf824128.json | 132 - .../210f7063-e0d9-424d-94f4-3645e4e1b401.json | 132 - .../4ecd26d8-8416-4dba-8d53-96f4013cfef0.json | 132 - .../15712b7d-e69f-4a4f-b13c-4e79ce859399.json | 132 - .../9148c375-7c08-4c1c-82ed-5f935b2a4f04.json | 132 - .../fb93274b-b7d8-483a-a95d-96340535febc.json | 132 - .../0818b755-ec49-457c-8635-73f01816f30b.json | 132 - .../77962326-0160-49bd-9ef1-59b403b2bfce.json | 132 - .../272abbe5-8b61-442f-9860-d7411e7fec99.json | 132 - .../14d617a8-18c6-40a7-a4ba-19cf5fc5f4e3.json | 132 - .../ef7b5e6d-b5b7-4c7b-9781-6f90eb1ff5dd.json | 132 - .../1970e257-7c93-4342-9ff4-a96af21acc67.json | 132 - .../15d71696-4b21-41ff-a4c6-0aea92fb844a.json | 132 - .../ccb85394-5252-48d4-8980-8b3a6c67ab1a.json | 132 - .../ea9837ff-f4c7-4bb0-b2af-7ae26371baf0.json | 132 - .../fe9012a7-d07f-48d4-b460-eca256078d8b.json | 132 - .../8e8d2071-8e7d-4dad-8536-4698b2d00316.json | 132 - .../dbcb41be-9ed6-4244-ada8-77f363c3487e.json | 132 - .../e48e2d7e-6c14-4bb1-bd12-74d93a145ca3.json | 132 - .../30c2d908-3eaf-408a-a2b5-301e0cd9e052.json | 132 - .../f7624d04-66d1-4c05-8c01-d015ecf8412c.json | 132 - .../511e4aad-1e5a-4515-9433-46989fc3945b.json | 132 - .../863e71ec-03a4-47ed-8bc9-b064d5571162.json | 132 - .../6a6dfcb4-192b-44ff-a34f-76b31bbf5ad3.json | 132 - .../e0dbec0b-a154-448a-be23-ef9b764469ea.json | 132 - .../ecd91300-b0cf-48ce-9e5c-253a7991f90e.json | 132 - .../e3df71f1-63e1-40f1-918d-07cb3ec939cf.json | 132 - .../52066a23-9847-490e-90e3-57eee3c63276.json | 132 - .../91f15ba3-a062-4b01-8a61-6e51fdf5f8d4.json | 132 - .../323630ee-fbe0-49a7-aa11-816fde38ba2d.json | 132 - .../e5c8f97d-1873-4c9d-8bed-50dc592543db.json | 132 - .../7ee2803c-b8f8-4156-8472-bab4baab8863.json | 132 - .../78573f63-3073-4be4-93a7-0ea00b1383fd.json | 132 - 
.../42da7295-d78d-49a4-9279-8406063240c4.json | 132 - .../b61c5735-53ca-4dda-a223-79921eee7f3e.json | 132 - .../310124ef-e33f-49de-83eb-e665a5143aaa.json | 132 - .../c9b056df-8bbe-4959-ab44-85813157c95c.json | 132 - .../7a60385f-48dd-4926-8b66-3d42a1631db3.json | 132 - .../da365c7b-74d0-4a9f-a8fd-cf4049ec4de6.json | 132 - .../e2930715-b616-49a4-83bc-53e92fc3580f.json | 132 - .../543f45e0-a158-4fdb-bbb1-8deb38f4515b.json | 132 - .../b96a20e0-d044-4a66-8909-437aeaef569c.json | 132 - .../408742ff-4b21-46dc-b4d6-4c78d652d228.json | 132 - .../496a9fbe-376c-4546-bd90-b42f583924ce.json | 132 - .../f32c07b4-21a8-4cd2-91f8-f0f26d0b1b38.json | 132 - .../cc36cc37-0f41-42aa-8051-54cc135820ef.json | 132 - .../20d3dac4-9f8c-431c-b20f-364dd860e37f.json | 132 - .../89022ea8-2a5b-4eba-8d7a-320ba13d30a4.json | 132 - .../97bfd152-79c6-4c96-8d3e-588275339e41.json | 132 - .../93061947-2bcf-482e-ab22-38ef8ee33bcf.json | 132 - .../8f65748b-1251-49f8-bfed-d1e4a937d5ba.json | 132 - .../4f278881-69d3-42b5-b72c-ff8627a6ef44.json | 132 - .../d88e85c5-73df-46cc-9234-f0556592ad5a.json | 132 - .../44d2a20d-e867-4fa5-af3d-087f9c1b4067.json | 132 - .../e83b3e7e-dc34-4b06-bcfe-95b3ba28aab4.json | 132 - .../44f2948c-4564-44cc-98d8-4f82a30e1f09.json | 132 - .../846cf1ff-62c3-44e7-b6dd-0135ec77451a.json | 132 - .../d2054469-b38b-4b1d-bd40-7324319f8eca.json | 132 - .../ce60608d-5b52-49d4-bbce-4b20e8272cef.json | 132 - .../f177bb70-fb7c-4b57-965d-acbcb4936bfa.json | 132 - .../a5b2ab3d-1f12-4a5a-a110-2514185568b6.json | 132 - .../63b887a1-a0b9-46db-a563-b9bd67a0805a.json | 132 - .../92d122f7-f29d-49e3-99da-bf20edf377a2.json | 132 - .../a0b71344-f3a8-4ad0-87c5-6393148488b1.json | 132 - .../821ff784-c48a-4623-9fb5-b77b7114b625.json | 132 - .../ed251513-4807-4e31-bc8e-3ab0217ae4f3.json | 132 - .../e7fa3baa-07b4-4f10-aa9c-8424d8fea303.json | 132 - .../11dfd131-00bf-4561-a913-f1c0cb15bf9c.json | 132 - .../3ba34f38-2340-407f-a7b5-82749f8a0ee6.json | 132 - .../91b9649b-bdf6-4b15-a038-47edc2e79ef6.json | 132 - .../24670e63-32e1-4c5d-82fe-0d0c45a4e165.json | 132 - .../198d1441-1d13-468a-a998-c8cf9f1e7a57.json | 132 - .../e9eb1499-835c-4a70-b531-4be5a9718c34.json | 132 - .../b1fd95ad-767d-4c13-a936-00b08c74ca3d.json | 132 - .../f87bd357-535e-4450-b01d-b41e1b7571e0.json | 132 - .../300fd27e-4dce-441f-91da-f38bd14ffe5e.json | 132 - .../1fd9a2e5-856f-4303-8ac1-611311f3e7b5.json | 132 - .../4c34d5c6-af1b-4519-8d08-67bd837e9b97.json | 132 - .../ddc27df7-1c4c-4563-92b2-5a39380423a8.json | 132 - .../3e606ef8-9caa-43d4-81d6-8eae9936ab4c.json | 132 - .../b9053559-3b90-4de0-981a-dbb49db38eb5.json | 132 - .../cea89bc6-b1a1-4b67-a136-45e097563a5b.json | 132 - .../5eb16113-7d0d-47a0-91d8-ec7dab35efdd.json | 132 - .../45aa6545-d20a-4dfb-a8a6-01f2fd34c9f5.json | 132 - .../c94079d1-d8b1-4198-8129-8c5a11c310ca.json | 132 - .../cb45306a-096c-4ed5-a028-6d720b26afe9.json | 132 - .../f301908e-474b-4ba2-a873-610ca1b6c2bd.json | 132 - .../06f5865d-a62a-48da-b33f-486fe29e3685.json | 132 - .../4f952c51-91dc-446e-bda1-43ed66e1ca3e.json | 132 - .../dcba3a6f-8f4f-49f6-af74-541de16be435.json | 132 - .../b5d39bcb-dab4-4880-9cb1-68dbd20a3ce5.json | 132 - .../1e597e9b-4e75-4981-842b-dad6f1c15ed7.json | 132 - .../18752dc4-76d1-40dc-9f43-62b8087b7a88.json | 132 - .../fa30c36e-20f1-41ee-a59d-0044f2b76dfb.json | 132 - .../5391ae8f-41b0-41cb-9365-b5cb7649c8b7.json | 132 - .../a95ab4cf-456f-4b3d-9bab-2b755649758d.json | 132 - .../9840baa9-2ddf-4dd9-b3b0-3ec3075089bc.json | 132 - .../26ff113c-95ca-4716-83f7-4792b46be246.json | 132 - 
.../285e1d08-15a0-4d8b-a844-e4cad923ea9b.json | 132 - .../0462269d-94a3-4991-9af5-e55592f344e5.json | 132 - .../c47c4cd6-90b6-42df-a3b9-4fc8f1b3c980.json | 132 - .../0fecafe4-f8f0-4f97-ab2d-589a3856e1af.json | 132 - .../4b5529b9-0800-4cd6-b720-a905ab5e6c9a.json | 132 - .../84783e4d-5eed-474d-9463-a01a0890850e.json | 132 - .../d9fe39c5-24a5-4240-bfc9-59860fcb3911.json | 132 - .../2ddf850e-36dc-41b2-92da-e2b45d1544c6.json | 132 - .../b10a9284-fa5e-4a4e-8240-edc98cea6d9c.json | 132 - .../2c51bd1d-ebe8-4de9-9749-5f42f7ba3d5a.json | 132 - .../425e6f1e-50dd-444f-b0da-5a0c47d5bf06.json | 132 - .../7e1fcf4e-9f64-4112-934c-4808f07d32b2.json | 132 - .../d3666566-09dc-4d53-9996-2301c6fb2721.json | 132 - .../36e5efb9-e3f0-4903-a9f1-3d51453bfdc4.json | 132 - .../a6dba337-81d2-40c6-89c2-aee6de82282e.json | 132 - .../e44b8d9a-f270-45c8-b126-6a8911c35436.json | 132 - .../44d5e1ac-45d5-42aa-b9fa-f18112cf6676.json | 132 - .../4246401d-9049-4c83-83d4-e2d9efa4dded.json | 132 - .../26c4785a-0caf-4b01-be5d-1e421bfeb698.json | 132 - .../cc9b9a25-18f9-4cc3-a756-3975a3a3be7d.json | 132 - .../b4edb7f5-a675-4627-af96-7ed0909da1e5.json | 132 - .../461b6f40-6f19-48b1-857e-f0fb37f929f9.json | 132 - .../e924270d-a655-4093-91b2-f73b7f12eefd.json | 132 - .../af8905e0-e969-45bd-8e09-e7316fff0914.json | 132 - .../e92a6d31-2277-4093-8fae-b3dfaa2d47dd.json | 132 - .../47472cd9-36d3-4074-83d4-af53b9c23758.json | 132 - .../b922f4e1-1fd9-4a32-94ce-4784430cef51.json | 132 - .../5bb2e77f-7709-4eb8-bd08-3c8da4a56310.json | 132 - .../35937213-bb16-4935-9d92-9fa8fd61aac3.json | 132 - .../04122d1b-929d-439c-bb8d-f08508f7a00e.json | 132 - .../03beb242-2628-4ea0-a2f3-c3ec43d379de.json | 132 - .../46d55b7b-1972-4cb0-97ca-e04d306282a7.json | 132 - .../32730d82-cfac-481f-9a22-9cbe40646218.json | 132 - .../a290a75f-753b-489d-87a2-ce0637c09f41.json | 132 - .../54032eb0-c4cd-4c76-be2e-f0c81bd26365.json | 132 - .../73b59506-cc1d-413c-a28b-d25e0e6bf413.json | 132 - .../bea2dcd6-4772-4aac-bcbc-4802cfb33495.json | 132 - .../66275215-28e6-42bc-bc22-5d152682ce53.json | 132 - .../9015365c-400b-4fa3-85f2-a1033b030cf7.json | 132 - .../55d52914-0904-4e6e-8b37-c22b06f5f2bf.json | 132 - .../3677260a-2fd5-41bf-9010-f1b31cedacbc.json | 132 - .../fc54f87a-2e4a-4f3f-b407-e268c4487d16.json | 132 - .../8d893736-1707-4c0b-860d-16c62ec26d78.json | 132 - .../d3d2728f-74bf-4196-a909-43797d8b628a.json | 132 - .../ed241e67-8718-48be-a6e8-19e295a2b5cd.json | 132 - .../05aafad3-e07a-453b-a70b-f18fbd4eb218.json | 132 - .../f79ac32e-ab83-40c3-9c18-35623f5ae1d4.json | 132 - .../cec76b15-1069-4d37-b8bc-74dde28101f6.json | 132 - .../e4ac0d0c-65ea-4b43-bb4b-7371c6cd5d61.json | 132 - .../f8d629bf-df0b-4c6a-8c18-17dda002b089.json | 132 - .../6739d8e3-f4bd-4fd5-98f3-887f5ed3f9c0.json | 132 - .../a51722f4-29f4-47a5-acba-4c8b5355551b.json | 132 - .../06d0a21f-f6e4-4ca9-a679-8c4502aaaad1.json | 132 - .../04a4dcc9-3784-4aea-9faf-9db49c2e4c43.json | 132 - .../e4668365-d3dd-4996-9bb1-5b4e6f510264.json | 132 - .../4d743678-e14d-4866-b1bf-0d660787847b.json | 132 - .../720b1476-876c-47d1-bf46-d037389b4b2f.json | 132 - .../4e4f3b2d-5b17-486a-a2ab-c2e89194c765.json | 132 - .../b738668e-3ac1-4a36-ad71-ad7d2a5256ae.json | 132 - .../623f1b73-1505-4527-b41c-dcb2b711226d.json | 132 - .../53f03454-9587-4208-bc01-21de62f59195.json | 132 - .../fb38d8b4-6320-4b8d-bf3d-e3d22bb0ed83.json | 132 - .../b127a923-3bf2-4cad-9225-d738efe800e3.json | 132 - .../a94ae52a-7936-4750-83f5-4740f23adf15.json | 132 - .../95e689c6-cd19-4114-b3b5-1672ab849214.json | 132 - 
.../890a8414-bccf-4a66-8013-6c270d017965.json | 132 - .../0f8ce410-cf3b-4f78-81b9-a0a1fe91b963.json | 132 - .../121096cf-356b-4069-a0a3-8cf6aad52b81.json | 132 - .../fb0bcadf-32a0-4320-909f-2c38ba7d9372.json | 132 - .../ab941c52-cf33-4b8e-87af-4a73930cf72a.json | 132 - .../08c242fd-0258-4817-970a-668584ed9385.json | 132 - .../2171af9a-be5e-4daf-8e67-a5239ccec7bd.json | 132 - .../706f75a1-2f6b-47dd-809e-a830e739b574.json | 132 - .../a9cd0399-4670-4f5c-8c64-c82dac97cd8c.json | 132 - .../67cfd12d-0551-406d-bd1d-8ced75c69478.json | 132 - .../0a31d2f0-196b-4508-861a-1ba7bd28ea23.json | 132 - .../57576999-2749-441a-91d6-5a976e83a658.json | 132 - .../e44792e6-0329-4784-832b-3043478e70a4.json | 132 - .../8b3789d6-51be-472a-95d3-2ae7c34ad140.json | 132 - .../3f4765f2-551b-485f-9020-0cf17a36a887.json | 132 - .../6375a845-5d86-4dcf-bfd2-e836daa4ca11.json | 132 - .../65a74446-6964-4f5f-8ea6-aeb1b09595ae.json | 132 - .../dcba5998-3b84-4753-a4fa-2558ffe3e69b.json | 132 - .../0af6b3c0-6638-4bd8-bdd9-349e2b9ca71c.json | 132 - .../4e332594-d0b9-4913-9950-208abe4faab7.json | 132 - .../5ad2ad73-47ed-465d-b4c0-b358e6b6435f.json | 132 - .../c9f716ef-0aa6-445f-8fc9-b102f3a0ea2a.json | 132 - .../a2e32a77-867c-4921-ada4-c7b169efbebe.json | 132 - .../f76f759f-d05d-4eb6-a2b9-3b1dfbe840f0.json | 132 - .../ece0bd6b-4eec-485c-942b-e23f3295c2f8.json | 132 - .../ada110bb-0988-4c19-9798-74577dde5ce9.json | 132 - .../ed4f994d-d196-40bd-8f8f-f6a7f07c3c90.json | 132 - .../57395f9a-0534-453e-80fc-96e9dc5cd9c3.json | 132 - .../f8f70702-9ab4-4e1a-a11d-090627d58f02.json | 132 - .../3cab8bda-bdf6-4345-b89e-18d34a8f6361.json | 132 - .../0955fc17-8878-401a-9ec3-149528ee51e1.json | 132 - .../c63bf49a-e7d4-4853-8684-9cc03eaa7840.json | 132 - .../65e6a3b6-4291-4591-bc0b-576930061c68.json | 132 - .../1ddf9e02-4066-440e-a777-fcd3f96bc4b3.json | 132 - .../f9f96bb2-edbc-4112-97aa-a7420dea32a1.json | 132 - .../3a24b30f-7698-4ecb-ac26-3537a0b38616.json | 132 - .../d4030df6-2be6-4f46-9c9b-ce3037b9a004.json | 132 - .../ec234403-f43d-46a0-84a4-ab47673226b3.json | 132 - .../805379f4-784f-4602-92e8-180df4da9fc3.json | 132 - .../9f3920aa-9400-46f1-bcfa-969f69b3335c.json | 132 - .../26cbf444-ab93-409a-b85d-e2bd267eae5e.json | 132 - .../7c2b17a8-1de2-4441-a281-fe3fd043f831.json | 132 - .../94c5756c-cbde-46e2-90d2-207678373061.json | 132 - .../e0048124-89bf-4327-88a8-00aa51ee29af.json | 132 - .../9d776307-43af-43bb-ab64-52fb7f331cfe.json | 132 - .../d8d41981-a7c8-48e9-a63c-86520a0f23d5.json | 132 - .../1355985c-fbcb-4eac-8435-417d6034f2f0.json | 132 - .../44486b02-7bdd-4f59-8d4e-5c8deeb1fd60.json | 132 - .../45ae3dc3-6dc0-4d10-99cb-a7f330110906.json | 132 - .../6b54763a-6329-47fb-bf50-296604251b47.json | 132 - .../96a26bf3-b4b2-465f-8ce6-a2ef943c001a.json | 132 - .../655b047f-c3a8-4c9c-b864-81d318b2f506.json | 132 - .../f62fed77-e166-422d-b5ce-c50b7bccbf4c.json | 132 - .../7ffdabf3-0a8e-4316-b6bd-85b10a81db53.json | 132 - .../2c93c987-b32d-4a02-8df4-949cc45b8eb2.json | 132 - .../02e7c1d6-9db1-4de8-b13e-afd752b3669a.json | 132 - .../580a3045-338a-47b2-8ed7-54c993d5aa90.json | 132 - .../e71d3be5-ea9d-4426-aa58-5806b7541aa6.json | 132 - .../1174683a-9488-4c6b-be6b-e5a96328a96f.json | 132 - .../3789b37f-daf0-4c21-82b8-309cbf00312e.json | 132 - .../8586cdc1-dd4e-4112-a59c-f6bc2766701b.json | 132 - .../946a7b16-dfa6-42ad-97c1-955bf8a40dae.json | 132 - .../d9a6cc31-57c4-4480-a019-25a34b31fcc8.json | 132 - .../279bd5fa-0ab1-411b-871b-bd9ff23853f6.json | 132 - .../c26fae10-e65a-49ac-a2da-2dbf024fd10d.json | 132 - 
.../6d37b2b4-630e-4471-b7a8-50f8a58902fe.json | 132 - .../de687865-4297-4130-bcfe-0c5116c9b0d1.json | 132 - .../ee1acad1-5dc4-4d8b-8aca-544af5dc2392.json | 132 - .../52e3f1b1-5a1c-4cca-a36f-9f60284e1883.json | 132 - .../2d54c67e-fad5-4a61-b3ae-0393f16dc1ba.json | 132 - .../5120e433-f5c7-45fa-be56-566101556271.json | 132 - .../7f4b4668-c3a0-4575-957d-ba321d55f420.json | 132 - .../9245b74d-4b9d-4158-a402-0c3742097eba.json | 132 - .../29a5fcd3-9c22-424c-ab17-70cfe187aea1.json | 132 - .../af71bfa0-1077-4c96-a4c1-0aa28dc789bf.json | 132 - .../258ebe6d-191d-4804-b5e1-5cd6ce93ba88.json | 132 - .../4765f197-82ed-44b3-9a7c-7cbabc6ecd8e.json | 132 - .../a5d66f97-1f4b-43da-a83a-4a262e297fd9.json | 132 - .../5d29cf73-65d6-4965-a504-4caf07108cc8.json | 132 - .../15ec04ae-30d3-4ffb-9b0c-54ba63410e3d.json | 132 - .../2ed96c70-390b-44de-aa08-9883a2f33ff3.json | 132 - .../67c95889-8a67-40fd-99e2-62e767c16416.json | 132 - .../a518f39d-e073-493d-9a4f-9af53fc71abf.json | 132 - .../24f0d9bc-d743-4f46-b5a6-e855e39a1daf.json | 132 - .../3d27f6d9-05a0-44bd-a225-6e6a0bf4a35b.json | 132 - .../ad28e7b8-69e6-4fb9-bec4-62c67fae6d58.json | 132 - .../0da639d4-181c-4ee1-808c-3de8003c2471.json | 132 - .../480bd62c-bc67-4379-bce0-b28a5d6bdf4f.json | 132 - .../dd94c18e-b2c3-4135-aa2d-5eb0248315d0.json | 132 - .../a2ae2953-e341-49be-8469-32bd41d780d7.json | 132 - .../23bdd694-f250-46dd-9b8b-526fda47bc9e.json | 132 - .../d600a69d-1952-4e30-abe8-1769ab63ac29.json | 132 - .../afc031d4-852e-4ead-9098-6ce30112b459.json | 132 - .../cb33e29f-e5e1-4bf5-9e20-86d9c3486d2d.json | 132 - .../a4b93124-1151-4f69-8a5e-6b916e8cf11f.json | 132 - .../efe11d8f-65e6-4ba6-8148-fdd43c9346be.json | 132 - .../923da7be-2ec8-46b2-8187-fe08eb86d5a0.json | 132 - .../1652b9fe-640a-48f9-b7a5-20ae28fb5985.json | 132 - .../572463ed-f6b9-460d-9c38-0e0ee5327511.json | 132 - .../5f6bbbfd-16a8-4ea8-b9d9-b436a882700a.json | 132 - .../32322361-f18d-480d-9475-cd11a45bc4bc.json | 132 - .../f62d1aee-2d9e-466e-85e2-002fae5d2504.json | 132 - .../af389bf1-da63-49a9-9e49-32613d8d05b8.json | 132 - .../ea13ae62-d050-4cc4-9cbe-99eedfc206e2.json | 132 - .../1e697620-36a7-459c-b88c-405febb57c3a.json | 132 - .../532723e8-a9b7-4f72-a015-c2bd9363b5d8.json | 132 - .../be096a57-7d81-4999-919a-ed8a243012b2.json | 132 - .../cadeb016-e158-4a49-921c-efe0e4eb0cb2.json | 132 - .../c606d7b9-3ea3-49d4-9ecc-9610ed4b4eac.json | 132 - .../04a5eed3-7eea-4d9f-acc6-5a96ec987e2b.json | 132 - .../a1c60d74-dabe-423d-9e40-3dd8112d7d8e.json | 132 - .../29c7bc9b-6833-497b-a553-2941026efea5.json | 132 - .../09a60955-978e-4136-bdde-d5459e37ad2c.json | 132 - .../501744a2-070a-4378-9232-f7ccd9b2a67e.json | 132 - .../369efdc6-6529-477c-b5f0-d229c8102491.json | 132 - .../906645f3-2041-4380-8118-ac26b92297ba.json | 132 - .../57fe8deb-02dc-43a8-8a92-14bdaf61dd67.json | 132 - .../95f2fa22-3da9-4876-ace3-50763f2b2453.json | 132 - .../b2f9e38f-c2a1-4e5f-a7ce-4e33a05b503b.json | 132 - .../b3173a2a-8309-498d-961b-0167d5d5dea6.json | 132 - .../0d59dd75-c999-4a7e-919a-fd084202fc9c.json | 132 - .../639e91d9-ebbf-4ba2-bce3-6953e7c91e32.json | 132 - .../56a5fb9b-a4b7-4290-9ec9-6864b3efaa82.json | 132 - .../d03fb481-be0b-4dfb-bb4d-54067e058e99.json | 132 - .../d8fc3475-83e9-4790-a472-72b442087562.json | 132 - .../57efd335-4873-4e01-bfc3-0d704b3d482a.json | 132 - .../25fdcc8a-0e7d-4148-8508-2631ea6deb05.json | 132 - .../f5f63d06-7e51-4b91-8814-ecbda604fe6b.json | 132 - .../5326c33b-6b8a-472a-9058-a9e9fe83b599.json | 132 - .../28674053-e1b6-4f0a-a90e-5dd5082ec164.json | 132 - 
.../fd27bfa7-11b3-46d3-915c-373ddf5a9865.json | 132 - .../91f190ba-39c8-47af-8351-73d1f382dd99.json | 132 - .../b637b55c-dd05-4060-bf33-e63e9de7fac9.json | 132 - .../bcacef79-d7c0-46e7-9194-43541c2f01fc.json | 132 - .../77a358c7-59fa-4b22-a190-dfca86c5166b.json | 132 - .../ad4c8922-7079-4383-8f42-d3de6326a1e1.json | 132 - .../7f89eded-e5fc-4b3b-9afd-dcd71b7b44d5.json | 132 - .../07cb94ab-0aea-4ce2-89b0-4378cb892c7e.json | 132 - .../5fb04756-c7bb-4772-b209-0d9a300bbf7d.json | 132 - .../0c02d1b6-2d31-4c54-b881-588cbfb0c686.json | 132 - .../a32e4d22-8096-4537-a68a-98ff9171ac8c.json | 132 - .../4e45b666-fa7e-4a38-8b6b-65846876c8d9.json | 132 - .../d9cb1d13-2af5-4385-aa78-5c053e00e6c6.json | 132 - .../6afaec07-ebb8-4f3f-af48-c679f38f4917.json | 132 - .../bf8370c9-baed-4034-ac38-c6f796baca15.json | 132 - .../d397c078-6fe3-44a8-859c-a0f7c551dc3a.json | 132 - .../ed61cd6a-bbf0-45f2-9536-a7a262d5d6fb.json | 132 - .../6be795f4-0784-44bf-8926-e3060ec37dcf.json | 132 - .../d4d808f5-3b79-43b5-8076-d3f785083789.json | 132 - .../370f5923-91d7-40d2-bd06-bf2b657b8ef2.json | 132 - .../5334e5e4-d243-4c20-912c-d0ded74d6ea5.json | 132 - .../7306f2cd-4fd2-4dd4-b06b-8c9aa558388b.json | 132 - .../68cc19eb-423b-4d6d-a3bf-eac6f666bc4b.json | 132 - .../59aa26a8-93b3-43fc-8c38-ef67cd8efd80.json | 132 - .../220cd306-0613-4c8f-9848-4af812a1d37f.json | 132 - .../39a6a40c-3fa0-41ba-9d13-da9381263d4a.json | 132 - .../4d037b71-5d03-41a1-bf23-c0aea0cdcbbb.json | 132 - .../16baf620-7dcc-49f3-a787-b431e11ad4f6.json | 132 - .../4745add2-7bcb-4c05-8b12-6bd30856890b.json | 132 - .../f68b122d-4dec-4d5c-ac22-198da3d3e96b.json | 132 - .../2e20f780-ceab-4d1d-a1ab-35f4f0ac44aa.json | 132 - .../f21bcd75-fc9f-4266-8976-3227b18b6b32.json | 132 - .../7c1a81ec-1cb7-4858-8f1f-23b3ee49b73f.json | 132 - .../1cbfd1ad-237d-4cd3-8b5d-3135c194fcc0.json | 132 - .../ef5c1813-a74d-4b3d-9911-c27a46c1c84e.json | 132 - .../df50857d-c90e-4ec8-a9b6-96a6d2f894b1.json | 132 - .../774d54fb-a445-4ed9-b79a-9c1346537e98.json | 132 - .../420b8be3-3560-48e8-8ab3-bb55338a9069.json | 132 - .../c118b75c-597f-48a7-a4eb-675af72c9930.json | 132 - .../e75534d3-b994-4e88-9274-7b62f61916cf.json | 132 - .../770a1ff1-057f-49a7-9402-c6dd881ac03d.json | 132 - .../6cc9790d-9b02-437e-8ac7-be4152f5b17d.json | 132 - .../264f5b42-a3ac-4af1-8145-c5763b8e7fa6.json | 132 - .../549db368-437a-4982-ba5b-5c4d7bf203ae.json | 132 - .../0d098a19-7e8f-4a52-8466-729be91388d8.json | 132 - .../83335f65-25a4-4bec-a901-587567ed0e99.json | 132 - .../02fb24c3-927f-4c21-bd47-b883521162a3.json | 132 - .../2a6507c7-44c1-4416-9ff1-36abd6af3b73.json | 132 - .../327a146a-8cfd-4480-8342-46afde530677.json | 132 - .../0700fb7a-e722-432f-a64d-c040bba4deee.json | 132 - .../131d3a7e-43dd-4189-8466-6562703b3bdd.json | 132 - .../8f6d7008-b8de-4a76-94aa-bbecc93ef3f7.json | 132 - .../aadb0ce5-a1aa-4b0d-bec4-8bb0e8e54a1d.json | 132 - .../a73250f1-399a-4afa-bf83-4036dce78ef3.json | 132 - .../f68bf680-9626-4952-b95e-12a18fd60820.json | 132 - .../d6a78a5c-4a2e-4370-88f2-d8627a94f1ea.json | 132 - .../7b5eab2e-fba3-47d5-9839-02249c2568c5.json | 132 - .../2acee2c3-4322-4152-8151-c1d571475b7c.json | 132 - .../67ffb2de-0410-44a2-aad7-4a32e2c49c7d.json | 132 - .../2923aeb3-982f-400d-9588-707583c75a1d.json | 132 - .../b6a622da-5ce8-4ea5-a82a-f3a2a299ddf2.json | 132 - .../7b06ac17-bfc6-43d5-99e6-d2b7a31290fb.json | 132 - .../fd481b93-55b2-4831-9be9-1b1b2886fda3.json | 132 - .../f159748f-234e-4962-b582-cd5805448f33.json | 132 - .../044d53dd-d134-4959-a70c-46f11cc0b300.json | 132 - 
.../f05501fd-7c06-46d5-bc20-a9d0cc5c2e0f.json | 132 - .../5c44a2f2-23e3-4c9f-9b7c-9012ca8b15e9.json | 132 - .../80e5134b-0733-41cc-8b4f-ef32fbe57066.json | 132 - .../61123e41-7b2a-40da-9f7f-b830c27d7f12.json | 132 - .../b93c31d7-54c3-47b9-a267-3f8fdb796805.json | 132 - .../b3eaa4c5-7abc-4e2d-9c11-c70ecb8a843b.json | 132 - .../3b06f75e-3d22-4428-8d4f-2e704b96961e.json | 132 - .../dfda4aab-f8d4-49ee-b141-78539b69007c.json | 132 - .../690f3c19-c148-458d-b4c5-87761d72b851.json | 132 - .../b6a18246-776d-463f-80d5-140df74e9704.json | 132 - .../9831abdc-ad08-48c0-8384-86240e7350b5.json | 132 - .../96a572e5-4751-46ce-9202-deb223ef4dfe.json | 132 - .../f4320b1e-ea4f-4aea-8dab-cdb221ce53e5.json | 132 - .../8376c0bf-f9c3-4529-b13c-c57106182d15.json | 132 - .../97a80145-e621-4603-8ff8-2cc4bd74190a.json | 132 - .../99a7881c-cca0-43d6-96f5-ce5292ed60a0.json | 132 - .../60ca8f7e-1c20-4adb-bb84-892bad3c0d63.json | 132 - .../4a0f8dc7-9446-4dda-bf49-8cca4851746c.json | 132 - .../6eb3a040-8234-4d31-8274-6987b0e4e3b4.json | 132 - .../16053077-38fd-4136-81a5-fea0d4cd927a.json | 132 - .../25abb99f-536e-4638-8611-a1db5dee931d.json | 132 - .../aaf0e5bd-b033-455e-bb23-b12b6f7c4520.json | 132 - .../b3a46478-c5f4-4c74-9bf0-d1ba616ae24c.json | 132 - .../169fb05f-5201-47b8-a06e-7d01e574c689.json | 132 - .../db076309-32e5-4d46-9786-ff14f8daf5d2.json | 132 - .../cde914dc-7d57-425f-9787-e4b8d36d61cf.json | 132 - .../5d793ce3-a7fd-4ee3-b32c-c9da63ec0566.json | 132 - .../8c645c9f-02f6-44a5-b295-d6364ed49464.json | 132 - .../97bb5519-e2d3-44d5-abf4-b5263c2b3245.json | 132 - .../bd3d78d3-3ff1-4a92-a316-e4e30787a331.json | 132 - .../d8951ed7-f4ef-49ce-891e-8d8509e9cf93.json | 132 - .../e1772d6c-fd26-43a7-82b3-7997d8a6809f.json | 132 - .../febaf893-6aaf-4c87-89fc-cc865ebf2859.json | 132 - .../0ad591f4-c846-4fd1-8536-a169e0a7e4ab.json | 132 - .../0a318ebd-7bbb-456b-a6e4-9b480a858b5e.json | 132 - .../e1cfdc32-3c5e-4f4b-a205-f416c96cf5e6.json | 132 - .../85426280-8138-46d0-a111-b59b0d7c86c8.json | 132 - .../32bbd26e-05e7-4a0f-a491-8f54cea9f3d3.json | 132 - .../86ed6833-ae85-4a8e-b840-b0c9540083ce.json | 132 - .../2f751ac3-5ca5-4d0d-9ad4-48155e51468a.json | 132 - .../9677e68d-afda-4917-825c-83318219ff59.json | 132 - .../23cd57c2-bf7f-440a-ab3e-edfdede5e8cd.json | 132 - .../bec23315-f98a-4211-81a0-c49f395e66c9.json | 132 - .../1ac5faef-7fa0-4b58-a6ba-0c444a2023a8.json | 132 - .../39327803-11e7-4b28-8750-81feb027e8f3.json | 132 - .../ce2b6874-0fc8-4364-a526-7b25b101e1e3.json | 132 - .../9f9ebc90-31f9-45c1-b9c2-07b727b12f3d.json | 132 - .../d189a2fc-71f5-4bc9-a0b1-7e744a19921f.json | 132 - .../1eb697fe-9dd4-4a41-aa47-33456df39e2d.json | 132 - .../5f10df7b-cd2c-44ca-b13a-2852483c71f8.json | 132 - .../3abbb4b6-8050-44fd-b066-0f061ce2f4d7.json | 132 - .../5f47e65d-293f-469e-a18f-5627ca1adf44.json | 132 - .../b753c1aa-8a0c-4600-99ec-8eb51ab50da7.json | 132 - .../15c21655-9af8-4bee-9884-b047683e9adf.json | 132 - .../f642de95-218a-4db0-807f-1bb97618b4f6.json | 132 - .../01443b06-9ad3-41f5-ae0d-bc84086e0a0d.json | 132 - .../1ee8c377-2236-4225-942f-ef8ce5770741.json | 132 - .../4ee9aa78-d9eb-4a1c-91c4-f29f093b95d3.json | 132 - .../419c6631-805f-43ba-9db8-5296f8d221ec.json | 132 - .../3fc1822f-4a43-4a3b-90d7-fc163491c90a.json | 132 - .../76b4037b-c5d0-435f-966a-bd88b1665dad.json | 132 - .../757b85e7-84c8-429f-aeb4-870852fa8959.json | 132 - .../acab4982-1205-4362-803e-306b1e2371bf.json | 132 - .../0e549b5d-c1d9-443d-9a80-8dd34dadd22e.json | 132 - .../d3d4eccc-8792-40e5-91cf-22885f4cbaf5.json | 132 - 
.../708aded5-6252-44e3-bf0d-08bf3e7f32e0.json | 132 - .../ce6d31f2-f38e-4af3-85a3-d2f6c80f71f1.json | 132 - .../5efcc291-ca9a-4ca9-b2ed-dab37dce5f5a.json | 132 - .../47320824-8064-40d4-a08c-810faafbba77.json | 132 - .../8baeef58-0ba6-4723-8f23-7a4c386f2cad.json | 132 - .../0387ca63-1e31-4eaa-ac7c-35d417548c54.json | 132 - .../733983fe-4b9c-47e6-963d-c57829b6f1af.json | 132 - .../80c4859d-8016-4650-939f-100ba2e6d808.json | 132 - .../21724d3a-cc6c-43eb-9d69-46d8d91c97f8.json | 132 - .../d781945e-e9df-4136-90cd-632f0bed6246.json | 132 - .../8f146bb5-dd4d-49ce-ac60-76f66321feb8.json | 132 - .../89bfba6d-c622-445e-b0b9-512aadcea7cf.json | 132 - .../9c27f2e6-ebbe-4fac-bc51-74455d3a6512.json | 132 - .../455ef1e0-bdf2-49bf-a53d-2c9e3d00d5f3.json | 132 - .../e04a76a6-ac22-43b2-bbf9-196a08de2949.json | 132 - .../2fcb74f0-add1-4d46-8a0f-8578a616dbed.json | 132 - .../51530638-ef76-43ce-9396-8a0d07988712.json | 132 - .../74d99e4d-0e6f-4804-aa52-0dc76d37fac3.json | 132 - .../80e8b9f0-b507-4927-9d24-1c793e3783cc.json | 132 - .../7b037520-a5e9-4b58-80f3-f0ecc5957c67.json | 132 - .../10b88d05-62d2-4603-9d04-b0854e39ed40.json | 132 - .../4b693f41-d811-4b64-892c-d840eee5ace4.json | 132 - .../90d86c8c-3aa6-42ba-a94f-75c961e65c41.json | 132 - .../8318ae52-6ae3-45ce-82db-73f8cb5ad7c7.json | 132 - .../b20a1d13-2f14-42e4-bdde-49f053cef325.json | 132 - .../51521dfb-d4b5-45df-ac2a-54190aed0b9f.json | 132 - .../997a1ceb-185a-4e6c-8383-eb5a6f976771.json | 132 - .../22101998-c3d3-414f-9ed1-99330cdbe3b2.json | 132 - .../a2408953-a7eb-449c-b80c-3620915d44d0.json | 132 - .../d65e5b08-7d3c-4c0d-85fa-496db65a235c.json | 132 - .../ce2c9614-46d2-481d-ac25-3cc71a93bd5e.json | 132 - .../e9ba998d-8147-4046-afae-9ee7d544e98d.json | 132 - .../c44f1012-1123-42c8-b110-5735dc756fd5.json | 132 - .../5088f6a6-2acf-4d10-8b78-0d5bd4126ab5.json | 132 - .../b4d96088-5cc0-4ebc-8b8b-8c7e9f90420b.json | 132 - .../529dba11-53af-4045-ae46-04e1b9838d4a.json | 132 - .../391f6d6c-418f-44be-910a-fb90b5712649.json | 132 - .../2ccccb4b-7260-4a1a-9426-117e359c7c5c.json | 132 - .../84afecec-453d-491c-9f5a-de31d8fba43e.json | 132 - .../dba3a3a4-cd23-44c9-823f-0bd88cf6465b.json | 132 - .../1179bcce-558e-40ad-8537-c74c59557975.json | 132 - .../fe0a5c17-6c8d-4f06-a58e-47648ef9ecec.json | 132 - .../81cf8cbd-33bc-44ab-930a-65242e1ae7b2.json | 132 - .../173bb053-e817-4551-b169-c3f71163650a.json | 132 - .../b7e6a86f-340c-48ed-a828-2e80a13aa515.json | 132 - .../bd221eee-7aa8-4d6f-a6be-89ee5568e729.json | 132 - .../8727a325-a515-4456-ba34-65c30f84644a.json | 132 - .../3e4011fa-d480-4c16-9371-2025bc834358.json | 132 - .../867499a7-589b-4564-b04d-a004b7c0abb4.json | 132 - .../52f1fb51-fc7e-4cc2-918a-7c7226ae2ce5.json | 132 - .../5f4a8fb6-b22d-4eb2-aaef-da05ca45fbeb.json | 132 - .../3278855d-7bd1-4e7e-b27b-b1393006e7e7.json | 132 - .../5193ab4d-1627-43b5-bfb7-89e08ea1f810.json | 132 - .../598faeda-48fb-43a8-aaa9-849d5dfcea79.json | 132 - .../d1afa2fb-1256-4dd3-b13b-802917bf481b.json | 132 - .../397c9bc3-0af5-453c-9b68-5360783dfbf7.json | 132 - .../9bb39652-c79a-42bf-b6d8-c4ed6174a4c7.json | 132 - .../7e793244-b746-4aa4-a401-dcf5884f61a4.json | 132 - .../26a8da03-debd-41e3-8ee1-2827d76b26ca.json | 132 - .../e214c326-dd84-4915-bba1-faaafbb026b2.json | 132 - .../98a5ea0a-6e45-48f8-8219-32099b9fa9d0.json | 132 - .../40d7d17d-2d41-4d23-83c1-ab5f3320e36e.json | 132 - .../d881a83a-9ba8-4919-8b89-45f5a7220621.json | 132 - .../d6c966a1-7927-424a-9886-b98688d27e6f.json | 132 - .../c09fe163-a7f7-4b6b-b407-ee8d698b2ee8.json | 132 - 
.../b3979c7f-0596-4a24-b264-73a17ba19821.json | 132 - .../f6156893-92e7-4c4f-bff4-8b6d774ecbd8.json | 132 - .../8b1c19e0-8b47-46ae-8bf3-f84c7d3a9c0e.json | 132 - .../6221102e-4e8c-46dd-8c03-fa9e92b7e4ea.json | 132 - .../329e5e91-10ba-4795-ae86-dda95e698b4f.json | 132 - .../3fe89b13-135d-4790-871d-74e7a28ea2e9.json | 132 - .../4b807741-f1b9-4964-9bc9-bb93f9b34217.json | 132 - .../c52a8a4d-be91-4a0d-8cd5-8473a42f0978.json | 132 - .../f6e157c4-0ce9-41c9-b885-9222d894ff0c.json | 132 - .../fe52a94a-5324-4b59-accc-dfd1f9d4aead.json | 132 - .../1241f5e3-54eb-429e-b109-a5e163e39eda.json | 132 - .../8ccc7c8c-1d14-45bb-9a6b-f8f69e506139.json | 132 - .../5531b59e-24c0-41af-ab6b-d6a5e38b0a98.json | 132 - .../63e82cb3-2f6f-4617-abb7-ae093bc27830.json | 132 - .../0feb74e6-40d4-472d-9233-27faa2d3f802.json | 132 - .../e74dd005-c9b5-45c9-b7f5-455c3110e09b.json | 132 - .../d094bf6f-9952-45c7-995e-d7eda07f4668.json | 132 - .../0e5f3393-8a6a-4f2f-948a-a37ae4d8fdeb.json | 132 - .../f91982ac-0cab-415a-8503-e090d195bd05.json | 132 - .../fb1af66e-7828-495b-8277-5cff77c3070e.json | 132 - .../ac84c157-4d11-43c1-8731-b1e5cfa91668.json | 132 - .../bbc812dd-9a9c-4f99-b813-50361025eea3.json | 132 - .../fc818799-49d5-4fca-b131-ebe8d5d831f1.json | 132 - .../33349989-8573-4d71-ae0f-99691fdaffc3.json | 132 - .../91551de5-d8ac-4c0d-b9b4-3627db947f0e.json | 132 - .../c2d2c1f4-aaab-45f1-b3f6-5b4ea56b696e.json | 132 - .../36821a8b-af18-4631-b4b0-7e4b37bb194b.json | 132 - .../e402d129-f4f1-4b95-b079-4f30936119aa.json | 132 - .../814e1ea7-a639-4b05-9208-0bf537ea5479.json | 132 - .../35a50d36-31d0-454b-a13c-80ca26945f94.json | 132 - .../87347017-4ff1-4bd3-a1d7-8f3999061209.json | 132 - .../976184ed-c4ed-4898-83c7-521a8a8309ac.json | 132 - .../fa52f072-7725-4a4e-b728-042e5897a1bd.json | 132 - .../6374dcee-301c-4f28-9316-82ed8e693089.json | 132 - .../b7c95cb4-f32f-466e-a28c-32afd9ec5578.json | 132 - .../bddd742b-f7c9-44aa-ad2f-83f51a4625be.json | 132 - .../099af0ee-c06b-4435-8f97-27681f3eddff.json | 132 - .../fa826f3a-8688-4518-8d44-68189abb47ba.json | 132 - .../10d29dc0-3486-40df-9933-1ce8f0fabaa2.json | 132 - .../741ff375-3392-461e-a9b0-e0dab4e6e9f8.json | 132 - .../c3d709de-118d-40c2-ab89-040efedd7fdb.json | 132 - .../9be3dd27-93fa-49e9-a628-5a77a8a3bb9a.json | 132 - .../be850d1b-bf75-4c34-830f-8881792ac842.json | 132 - .../6b644b97-4fc3-4826-9ea9-68be1dc8e947.json | 132 - .../861d41f1-6d33-4e07-96ea-2c39a36c4b63.json | 132 - .../7501b038-4847-45bc-8b92-6800d7a58c1e.json | 132 - .../db48206d-700b-45f3-b597-8752110113b5.json | 132 - .../b52b76e4-9dec-4336-88b1-d98b95b95d2a.json | 132 - .../ba9ec2ea-2bce-4999-9e48-e1d0795b31d0.json | 132 - .../724221ce-d7b2-43cb-8e16-72ac529a7b60.json | 132 - .../552f3814-d071-4d00-a895-b739dffdcb2d.json | 132 - .../d3819133-bae8-493d-9a86-aee67da5d115.json | 132 - .../5c3a022f-7221-4b4f-ab67-d5b69c558434.json | 132 - .../c161b868-746f-4d88-9f41-eb8283a7b87a.json | 132 - .../f79a76fc-09ff-48c8-b0e7-5f18e0750e6d.json | 132 - .../39f4d1ab-fd42-4746-b949-9666ce32f9d1.json | 132 - .../8348f316-9109-4229-9fee-edc02431befa.json | 132 - .../6b2346c6-5fbf-4195-b3bb-66bbd446ca53.json | 132 - .../8645ffc1-6487-4205-b8b0-e980e094ac6c.json | 132 - .../2c6d1e57-7673-4a86-808e-6ff6a7146a11.json | 132 - .../64ab8b1a-62be-4561-8f0c-e42f1fe37178.json | 132 - .../3eb22885-eb7c-4c85-b79f-cd47ffacd551.json | 132 - .../8956d608-c627-469b-943d-bfad6c7382af.json | 132 - .../9ff060c8-d4fa-4880-a0cd-9581f5c2f574.json | 132 - .../e3d6b3d7-a231-40c1-bac9-0b7fcb478bca.json | 132 - 
.../20acb302-3a74-4425-af4c-a1d719b90a88.json | 132 - .../a8613588-687d-4291-ae5a-57688501cffd.json | 132 - .../83dd67cb-5508-4aa5-9435-d5585b7f3d52.json | 132 - .../26d981bb-f2e5-4195-8d6f-594bb0b26f4a.json | 132 - .../df06c977-b54c-4668-837f-eb583ef24d29.json | 132 - .../31a8ac03-f58b-46e3-9f17-53311b1fd506.json | 132 - .../3e4a7141-7a82-421a-a107-bbac3cbafc9b.json | 132 - .../9a3069f2-81ed-484a-b6e6-a45a259e9a43.json | 132 - .../c0a3d0c3-c541-4606-a925-4100b062284f.json | 132 - .../20685a4b-686f-4cd4-b49d-3067a005256d.json | 132 - .../85a91293-cd51-4f79-8b98-2f4bc67d78c1.json | 132 - .../d2e3a6c2-4e67-4150-b9a8-fec979fb1658.json | 132 - .../c4d686f2-2af1-4271-9556-09380f07ba5f.json | 132 - .../93167303-b38e-43f0-a552-72c26ccb4339.json | 132 - .../b52a176f-f369-4791-a7e3-88a72709c868.json | 132 - .../b6310012-17f1-4ee0-abd0-0079a9299350.json | 132 - .../f581e832-0f77-496e-bcd3-6cfec51ef594.json | 132 - .../47b47c89-b13b-4099-98b2-854feae05f63.json | 132 - .../8d51ae58-7b20-4fa4-b234-2abb9cdeaad4.json | 132 - .../4d4d5679-8ec6-49b8-a5d7-2a76497b44b7.json | 132 - .../0bdb6574-69e2-4858-b7aa-a90a5fadf741.json | 132 - .../fa1a92bb-ad25-4be2-a35f-7fdebbeeeba8.json | 132 - .../d62ea0a1-cc9d-41b7-8d60-479b8e2262b5.json | 132 - .../912446e3-efdf-4ed0-80bd-261c6c87a3d0.json | 132 - .../5e86dc31-ae3e-4ef7-858e-41e29b3a8031.json | 132 - .../80680e5e-ab83-4a59-aeec-9d4166509c47.json | 132 - .../c5bc9c92-8469-4174-aafd-67bb61aaccf2.json | 132 - .../1d67b792-178b-4baa-a108-2362f658bd4e.json | 132 - .../eb0c87b0-4795-4029-82c1-57ce37ba8259.json | 132 - .../dc9b2300-7ab0-4e92-9d23-15fe9ca52994.json | 132 - .../e005624d-c822-4be1-9477-873642aae228.json | 132 - .../e9756d91-b9e2-4dd0-bf08-c6154c7d1f2e.json | 132 - .../704598c3-c5d6-4ce0-bab3-0fa98118e16a.json | 132 - .../fafc9463-d725-4827-8bc1-5cd9e83814b6.json | 132 - .../109820e0-ee00-449c-9ae5-58a7bf1da5f8.json | 132 - .../37f29d5b-d803-4195-9ce0-75e45e32c160.json | 132 - .../43546f48-8c46-4481-b1e5-f4b1ad2535be.json | 132 - .../ec81e0ff-9cb4-4d43-9f78-1d5f4edc9103.json | 132 - .../9290c86f-40b0-4520-b8aa-3460de62c396.json | 132 - .../a4bf576e-9556-4956-8dcb-4d8906d45db0.json | 132 - .../320a5c00-3307-4bc3-9f47-9befb88e461c.json | 132 - .../844d1556-6bc6-467e-a145-f92646770727.json | 132 - .../78923f4b-c2e7-4472-8398-10a0a8453ec5.json | 132 - .../17abe1bf-2e97-409e-88e3-4f661861a195.json | 132 - .../756978e5-1dfe-433e-ba88-339004a50ea7.json | 132 - .../a889ae3a-5d86-4454-bfb9-332c4b61b836.json | 132 - .../2c5e1086-03b7-4cdd-801e-03fb26183076.json | 132 - .../d9578847-b732-4c75-b246-9cdf03674fe0.json | 132 - .../4c6f83fe-7896-4cf3-9434-b5f8d499f5ba.json | 132 - .../619037af-d528-4579-b7e3-58628468d8fb.json | 132 - .../5113b737-8d9f-4321-9a67-91f1aabb40a1.json | 132 - .../641ac372-2e5a-4b44-b22e-a17600a6a868.json | 132 - .../7cbb0b08-871d-48fc-bf3e-86267f5ef19d.json | 132 - .../c82e887c-c8ab-4221-aa0b-e8b7a86e7c46.json | 132 - .../50c65a83-9d08-4155-ad2c-5a2f8ffc8743.json | 132 - .../99d97aef-bb6b-471b-8ed7-f6f92f75842c.json | 132 - .../b98504a0-f1d6-4872-b748-2ca8199c5328.json | 132 - .../5a159667-7460-4a97-884e-6a96df59873b.json | 132 - .../16a2eceb-073d-4dc3-87a7-a15c641c5ebb.json | 132 - .../e8e2d04b-21db-43dc-8b8f-7fa3bba87abc.json | 132 - .../acbb93b3-f8fc-479d-9610-392efd7d4ecc.json | 132 - .../6d0589bd-1f05-44ee-afa5-3657b960d7c9.json | 132 - .../134663d8-05a8-4336-90e2-68e7cba5f1df.json | 132 - .../3bfced28-b06e-46ab-a6aa-171b0c424337.json | 132 - .../b6a83b82-6b05-4437-a076-e2a3982f6169.json | 132 - 
.../f621201b-f571-4487-9f1e-b767675c659d.json | 132 - .../710fdb79-fba4-42da-8e26-45b4caf75207.json | 132 - .../35fa7a5e-8866-4ce3-9899-8737e908f34f.json | 132 - .../2b24b69b-15dc-4666-83f3-c77db545bdbd.json | 132 - .../0d00d849-2147-4fc1-9e5f-d42a95be6ca5.json | 132 - .../f45135b0-3c26-44b5-9922-a6c0817a172d.json | 132 - .../67eb0d6c-9086-4c80-8506-c3e1489f2673.json | 132 - .../79d3dc85-08f6-475c-ac2c-1ff32f5a089f.json | 132 - .../4e9b3fa2-d3d2-4e4c-a1fa-c812f481f64a.json | 132 - .../6e62a8a0-0bdf-4b6c-93de-593423dadd3a.json | 132 - .../871131c1-295d-40a0-a396-09d24b880064.json | 132 - .../44eefbb2-22d4-4dff-889d-a87fc40b2eea.json | 132 - .../cd1de470-a174-4c08-9efe-a06d493dc4b2.json | 132 - .../fdb55a14-0697-4775-8358-fed202498b4f.json | 132 - .../c069a224-638a-4cad-a9ad-e4f8579e8c15.json | 132 - .../10e5c103-f25f-45bb-bfe6-a22876cffe87.json | 132 - .../a9ecca9a-c5d4-45b2-a403-e74a98a46322.json | 132 - .../630d8a60-03b7-4550-82f4-e879b2e01c6c.json | 132 - .../206b5a96-ae07-41fd-822f-436d49c57dcb.json | 132 - .../702d2120-5301-4e03-bb0f-1f8ab19e522a.json | 132 - .../61e39700-c237-49fc-baef-3fa573b3b0c6.json | 132 - .../8892ab84-750d-494f-9f87-ad28e73cf364.json | 132 - .../538a2eb7-34e4-4e78-a382-60a13710096e.json | 132 - .../a041629e-8ed8-4a6c-95ee-98e759501e19.json | 132 - .../09f05984-5815-4b3d-bc73-83ea1e5ecc27.json | 132 - .../6535524e-f8cf-4f2f-9d89-9ba70aedac91.json | 132 - .../08ea4f9d-0e3c-4a8b-85e6-075290d30ba4.json | 132 - .../631f0a1f-a6f5-46f6-9aa0-31ac9764c086.json | 132 - .../b771f6db-7516-4423-9010-3467db0e26e3.json | 132 - .../cf580dfb-2924-4c4b-9352-394275b959bd.json | 132 - .../ba549fe6-7718-4abf-a610-7e0f48611483.json | 132 - .../b92440b1-78a9-4288-a432-f057f2b04a2f.json | 132 - .../838f3932-edf2-4f72-9238-981d1aadc771.json | 132 - .../61e933b2-5cd1-4f08-8a9e-5b06ef54b6d5.json | 132 - .../0b307c78-94c7-418f-bc47-5106b81c30de.json | 132 - .../18783694-3e7b-4d06-9378-5a3fa4a7a0a2.json | 132 - .../dab922e5-1b46-4a90-b75c-1b26cd6cc6d3.json | 132 - .../8cfa1f00-3b26-4d75-9b0a-0dea65e2e352.json | 132 - .../f74d26e6-9dfb-4e81-8522-8309b27760cf.json | 132 - .../2022bcf3-a057-4b0a-aa33-6cf074ffc714.json | 132 - .../a6e79d12-42f6-47ad-95fa-ba03fa4d3a06.json | 132 - .../24d850fe-1817-4041-8767-085f4bd2bac3.json | 132 - .../610a3be1-1032-4079-ba37-d6c2c5f9fd55.json | 132 - .../857bb10e-1b43-4714-a758-0cef5816ba02.json | 132 - .../cdabdd54-6101-471c-9bd8-446953be986b.json | 132 - .../8029cb75-8d3b-411d-b0eb-74539b8ecb2f.json | 132 - .../65d10996-2c5b-4e11-9a07-319c2446a237.json | 132 - .../ef21d739-b122-4ab8-a8ff-a7cfecad5c8e.json | 132 - .../45f3b963-497b-4d89-ac66-9ff0ba8dadf8.json | 132 - .../4173435b-d907-4ac5-a8bd-dfa2759f3fb6.json | 132 - .../b4a79f30-3a04-4f78-861e-1571316a0642.json | 132 - .../53426038-df38-45ba-b621-34231c9cad7f.json | 132 - .../fa758fe5-21ec-45cc-941f-5cb5ca0612b1.json | 132 - .../d2a92a62-3bd0-4cb2-897b-742ea0d5203f.json | 132 - .../8b752519-63d4-4638-b56e-1c45c7f4694e.json | 132 - .../8da71b7c-7b73-453f-998b-84e70b54e471.json | 132 - .../2b7b1216-3ea7-48f1-89f6-e5d84fef2b32.json | 132 - .../37e19712-3197-42da-a8f2-ae1f36c2b06c.json | 132 - .../c6ae6691-64ec-443d-8d76-af614c8cc7f9.json | 132 - .../80567722-8c6b-41b9-8103-3bdaedfdb8ee.json | 132 - .../20192dc4-ea3a-4413-8457-18a592fa0c64.json | 132 - .../8c878c05-86f7-4d61-81d7-9bb286516581.json | 132 - .../fa753be0-4a98-4ec3-9cc9-3bf7b380ad17.json | 132 - .../0516b46b-a957-413f-aadc-58f4339dc60a.json | 132 - .../97200dd7-7ed0-4a7b-ace9-31c173f017f1.json | 132 - 
.../758f8332-ffa8-4059-ac6f-400f9367bb23.json | 132 - .../b1103662-055c-471e-ace8-dd75f607491d.json | 132 - .../27b0d675-498f-4351-b92f-7c0d1a3c83bd.json | 132 - .../3f1f88d4-2908-4f28-b8d3-4f9ded18ba0e.json | 132 - .../3883b0d3-e442-42d3-adc6-ed959c902dd3.json | 132 - .../da172cdb-1388-42f5-97b1-ae8e15291631.json | 132 - .../7c94dbfa-4b3a-43fd-9f2c-b3d63d8ef700.json | 132 - .../7cdd1de0-767d-4527-a024-c67166bb8b20.json | 132 - .../d4702278-54c4-42e8-a901-dfe5c7f2004a.json | 132 - .../149f8ee5-4376-4fcc-8f87-7412a3083570.json | 132 - .../de82b746-c5d7-450a-bc2b-1b2859d91d6b.json | 132 - .../d2a916a6-288a-4761-a3fd-ca674edb67c1.json | 132 - .../cda497f9-c7f9-48d6-944b-0167476e5e5c.json | 132 - .../b56c6c01-a226-4090-9332-330535d79e24.json | 132 - .../0ddc8e10-9cc5-48eb-b5b0-a2c2f071862b.json | 132 - .../2917c469-7e22-497e-8d62-9b9972266658.json | 132 - .../2424d85c-e092-4e7c-bf4f-ae014d08a159.json | 132 - .../90278363-1d8f-47ca-a7dc-c51c6b511dc9.json | 132 - .../3c3197ee-675d-4bb7-874d-28104d2a3cae.json | 132 - .../eb5a8679-bfdd-40f2-9a32-55c04a65ae7e.json | 132 - .../d770f88d-b110-4f27-85e9-e52217c11798.json | 132 - .../364328ce-5de7-401f-ad84-0c76e3c1dc91.json | 132 - .../f7dcfdbb-ff12-4692-9702-712de3d0b7ba.json | 132 - .../d641aa88-9981-4a25-90d5-fcc4564ede52.json | 132 - .../8915e742-df2e-41bc-b83f-3e111edfd257.json | 132 - .../e29a5e35-8677-4e53-83fd-85e919b4366a.json | 132 - .../e5c55d38-dc04-42b4-9aca-ae7be436ebe0.json | 132 - .../504baceb-6684-430d-a532-b7b5b0b061fe.json | 132 - .../31fcd34a-af1e-4eab-bd9a-5ec17eb572d2.json | 132 - .../01ab0a3e-393a-497a-9b32-8af790b7581a.json | 132 - .../541967a6-b856-4dc9-958a-9335197fba99.json | 132 - .../ee31c801-67cb-46a3-9e39-02e842c0473f.json | 132 - .../65fabe8b-05af-461e-b804-fcff3492da34.json | 132 - .../7e1a7121-2c9f-4196-bbdd-48aea257f384.json | 132 - .../dd32609c-316e-4511-8791-fcae33a1a506.json | 132 - .../d95d7058-49eb-47d7-b790-3a253291d22b.json | 132 - .../37cbc3d6-1198-4e23-b86c-1fd979eacd9a.json | 132 - .../76d0d338-e502-4638-adad-c4c4df00c26f.json | 132 - .../f47375bd-547a-4d0b-8c96-bbe2bc1ac445.json | 132 - .../6b1ed68c-3099-4bd7-892b-cdc36c90ccfe.json | 132 - .../0e59c8ca-cde0-4482-ab03-3309bcb8737c.json | 132 - .../d7e900e2-0574-44cd-a68a-0dd2715cf48c.json | 132 - .../fd626c3f-566d-4193-9a85-e7c9a89e671c.json | 132 - .../196b04ae-fd53-400f-9f08-19edd4959f6e.json | 132 - .../57177299-076a-4506-89a7-ce54af08df4f.json | 132 - .../d3bdf36f-7f89-4b5a-b6cb-847b49200b5b.json | 132 - .../92619b9e-dacf-4d0a-9f8b-6e131af74fa4.json | 132 - .../cbb408ea-ced6-4f47-9066-d4ff6d604b1e.json | 132 - .../6999bb02-29fd-4c59-886f-184362afa06e.json | 132 - .../913d1d8e-0b02-4ce5-9b7c-403143a8c880.json | 132 - .../82c87bc0-29cf-4150-92f5-c80fb0028ea6.json | 132 - .../a18834ad-6143-4ce2-9842-471817a60a39.json | 132 - .../be900bcf-8ec9-484f-81db-0e83975c1ecd.json | 132 - .../d226ccf6-674b-44c6-8b11-d782b59a961a.json | 132 - .../d8839a1a-8d07-4e0b-bd44-2668c84f750c.json | 132 - .../e90b04db-2eb3-483a-ab0e-ea8aef821d84.json | 132 - .../900921ae-fbb2-4488-ab19-18987c1d008d.json | 132 - .../0da0a7cd-c075-4bc0-8e88-8acc7212e5c3.json | 132 - .../b50a49cd-2909-4dbe-9c9f-c150abb99845.json | 132 - .../13831d81-a9dd-43c7-bce1-240aad42fbc6.json | 132 - .../56ea7cb3-3a1e-477a-bac8-26a0fde6297a.json | 132 - .../8ce19b33-4f2b-4b8d-80bd-1ed399a5e9dd.json | 132 - .../18ab167d-b72e-4fa9-94a8-09edc641c73f.json | 132 - .../7df237ea-29c0-4d0a-9092-c41df4c13aca.json | 132 - .../e5dc8caa-2d86-4ff0-af8d-22d85c8faeb0.json | 132 - 
.../01591bb6-9daf-40fb-b802-0a007f4cc388.json | 132 - .../f6c32abf-bbae-4827-9ce2-29ce20c9463e.json | 132 - .../74a6605d-3557-4458-bef5-cc9420434e68.json | 132 - .../dbe6e126-d35c-4634-a544-adf374ed5d00.json | 132 - .../d68681c1-01e4-4af0-9a81-e0aaed0ae865.json | 132 - .../de9620b8-7112-436f-8941-fae2c5e7f9e0.json | 132 - .../cafee7ac-deb6-4c4b-af8f-81548648cb14.json | 132 - .../3e3cb617-6f19-4731-b31a-b1f4d88237d5.json | 132 - .../3c2c2c14-d065-4d6c-8c98-44ba8f2ca461.json | 132 - .../8909f916-401b-4457-ab8f-2691696049c6.json | 132 - .../ae191508-7dad-4cac-ad4a-af95d7a15b5d.json | 132 - .../507f5047-fac3-415f-b9fa-aae4311fa837.json | 132 - .../0ee8716c-74f0-41b4-94a2-efc715150293.json | 132 - .../fcf491f4-cf57-4c95-9de1-4702ab5d54c7.json | 132 - .../4fd20259-c7c7-4da5-9013-ae2feb2175b1.json | 132 - .../a7c8c345-cade-48fd-93c0-0f344044d2b5.json | 132 - .../7a8e3986-7688-4a26-a74c-a9bb47cd3e8d.json | 132 - .../7a2ffb4d-1135-42a1-b28b-3b4e4d014979.json | 132 - .../25468720-93d7-4f10-a534-30c4976657e8.json | 132 - .../5ba1d617-9d9a-4c3b-b9cc-3224ace129b3.json | 132 - .../27b2b46f-1323-4ddd-9f65-d8fcd9cd6508.json | 132 - .../65917125-bb7c-4d64-ba5f-b5e4f67ec332.json | 132 - .../30bf22d8-b93a-4775-8073-30e14e15e35d.json | 132 - .../ff510365-a13d-4e44-9709-59a56e864991.json | 132 - .../6d1eebc4-228b-43f3-b31c-3d5b1591ae2d.json | 132 - .../f1e8cdbb-14b7-4959-a053-fb1b37629aff.json | 132 - .../4145d1a0-8d6a-4d64-8a45-a89cf343ac46.json | 132 - .../d6966190-e254-4902-8472-cac59bfbdbe0.json | 132 - .../5fdb5437-f413-451d-9800-42036cda7686.json | 132 - .../347577a4-2768-4472-ba48-9b174ad89724.json | 132 - .../33af440e-837d-4454-9340-af0d3ee74f77.json | 132 - .../1a1f4709-8d05-4905-8105-0c3606d5ef5b.json | 132 - .../28421948-089b-4487-bb71-a06e5ce74402.json | 132 - .../3fa0c783-9226-4fc8-b3a0-6e960684f43d.json | 132 - .../743b7fe2-f998-408c-98b1-af02d9c1ee2a.json | 132 - .../0039c88b-a881-4ce0-9a0a-a10f1a8cbc70.json | 132 - .../87c7fbd9-7648-4d0d-ac9e-8ba85860e335.json | 132 - .../6ca3ab87-c05e-46b5-879d-4fc8bf75417b.json | 132 - .../525f1b9f-88a2-459d-bb4a-7c01a0107968.json | 132 - .../503f79be-7f05-4464-ac9f-0f284f1e7965.json | 132 - .../86ec7d95-6f6d-4ca6-97d5-7a910f42a06d.json | 132 - .../d472ba79-6592-4f8a-a99c-ec3f71468d3e.json | 132 - .../6ddc052c-6bda-4d8e-ad97-20d881c8cfb7.json | 132 - .../76d1aed8-80fe-4b4f-bd81-ea0d6bf085c4.json | 132 - .../d2845d6e-65dd-4448-901d-d554b3e741f3.json | 132 - .../f7dd203f-24d8-4875-878a-12ed99e20cd3.json | 132 - .../287ae246-bee5-4fae-b78f-203491aa8df2.json | 132 - .../9ee493f7-e031-4593-beae-65be17678e00.json | 132 - .../86b10c6f-41c6-4d0a-ae59-f90e204e466c.json | 132 - .../043e3533-7d5c-4d45-bcd8-0dbcc8ca4819.json | 132 - .../1b3269fb-4b16-42b6-80c0-3d54bc2b4fed.json | 132 - .../ee625c29-62c4-49da-9790-e7e67233157d.json | 132 - .../02b16bf2-62bb-401e-9726-2135d8d610be.json | 132 - .../db10c6f9-2962-46cc-aa4e-4c99c4b494d1.json | 132 - .../aa37bda0-2e0a-4361-a5b4-468154d8ac72.json | 132 - .../d9a6565c-5a0b-4893-b6e0-1fc52ec55bf5.json | 132 - .../becf9805-83a9-4137-a938-81a61a10e4f0.json | 132 - .../6e848120-bc31-4628-af05-30707a6dcc41.json | 132 - .../864af855-71b0-4b11-ae3f-56294a7d0db9.json | 132 - .../285bd390-1dd9-4db2-af45-68dea557da3c.json | 132 - .../459e2375-1a15-4129-bee0-dc8852d531e2.json | 132 - .../7b4c7d92-f581-4057-bec9-e3a8c6a5386e.json | 132 - .../7ceab841-f9a3-455b-9314-243d8fc3cd11.json | 132 - .../c1e2fb45-22d8-4eb4-8971-ce89c3048b9e.json | 132 - .../68cb2ca1-1648-41a2-92b7-969bccdca4ee.json | 132 - 
.../5f285d61-5e4b-4c5c-8960-c10313d76ae3.json | 132 - .../3af19898-8590-4aec-b324-46c7fbf596d3.json | 132 - .../e8472266-6d03-439f-bd6b-e3ac5ef2cf09.json | 132 - .../3f578b45-48f9-4022-991c-32a71706aba3.json | 132 - .../ef8c22a7-3898-422e-88e2-1a8c14ab5bf2.json | 132 - .../81630ea2-d496-4872-92b7-e476badaf50d.json | 132 - .../9436d04a-9c81-47ad-a7b8-496e14058627.json | 132 - .../f1e6e54e-cb97-4980-8957-2190ee5c4c34.json | 132 - .../30914dd3-c857-4aaf-b6b9-d1c7e4917e89.json | 132 - .../1c389a32-68b3-47c0-a6b8-2c2291293002.json | 132 - .../e759a217-6571-446d-9bf9-d1512793f307.json | 132 - .../753f3b21-7365-4117-b2a0-a91f03ec3d39.json | 132 - .../297ef102-67c1-4e9c-b418-fed026bb1f8a.json | 132 - .../9fbf73d7-7d67-4d6c-a5b9-efc627cd1b2b.json | 132 - .../b1446577-f13f-434a-a0b4-916091395d4a.json | 132 - .../fc8946aa-8b04-482c-8c05-d026d2af07be.json | 132 - .../fabe3784-948c-4618-9cf0-c76a3ddd3820.json | 132 - .../736dcf09-6a19-4e88-a790-7a7ee74d8717.json | 132 - .../75b4c750-1570-4825-a04a-965c06861fd4.json | 132 - .../b7f8b678-2aea-4d41-ba21-2083fc472574.json | 132 - .../a8010630-58de-448c-af08-70b8ffec431b.json | 132 - .../4a0c2ce5-a4b4-4d35-b65d-bbc6e36a649b.json | 132 - .../1132251a-59c7-402e-9957-f9288864508f.json | 132 - .../e2fac049-8f9f-4b71-bcd3-5746b7d90150.json | 132 - .../d891a1e1-ad65-498f-9ee8-59523c1bfd19.json | 132 - .../9dd3103f-6c4f-4077-ac27-3a9b0f4a5882.json | 132 - .../ca031f70-5785-46d1-8a58-b279d8340776.json | 132 - .../18457711-92b8-4c27-a89a-928fecdf5724.json | 132 - .../3398aeb8-08a8-4be9-a24c-efeabcaa2139.json | 132 - .../707bc006-4318-41bc-b91b-aa43ca7cba6f.json | 132 - .../7bfda919-13be-4b68-8655-99fe6a4605a2.json | 132 - .../f844e739-5f0d-4db4-ba66-bd33b1290571.json | 132 - .../0cde6639-6a89-4682-bb3e-a2a24a1bc8ab.json | 132 - .../87652005-4404-4c45-bd4f-5f63c44adf63.json | 132 - .../a7e0bc2d-784d-4719-ac08-d8fa0c29d178.json | 132 - .../e8ba93e6-6f90-4169-8403-381b7f9e26ab.json | 132 - .../ea86b542-3d06-4e71-b49d-17cdd362b465.json | 132 - .../15615d2c-46a1-47c7-a273-697e97bdf9f2.json | 132 - .../a2b8da3f-c99e-4dba-b4a2-23739281eaf2.json | 132 - .../76f3fa3a-1629-4cdd-b457-3a108784b427.json | 132 - .../c9e979e1-4433-4a38-8fd4-c14895e74f44.json | 132 - .../3f2effba-1ab8-476d-b228-ed9491e83adf.json | 132 - .../a5f0fb1b-27a7-495f-a010-3307afdb8949.json | 132 - .../22f2aa1d-fff1-430a-9c20-3b32859d9665.json | 132 - .../daff0e6f-d29f-4861-855f-902a0cd9a469.json | 132 - .../0f5cb926-b691-4d57-87f5-290235fd250a.json | 132 - .../d9e813da-2966-4901-99f9-c7627c64fc52.json | 132 - .../4cb98a5b-3eb7-4fa8-adfd-17add38d3332.json | 132 - .../f7494fd4-d248-46a6-a46d-f9d8db560aae.json | 132 - .../4b8533d1-7770-435f-ba76-a5c658aabd8f.json | 132 - .../309c7906-0010-4f17-848f-185062d96a26.json | 132 - .../f18ab2ab-098b-4e46-8f8d-433b52cdb81b.json | 132 - .../b4a70c71-dfac-4888-937e-d5220b491b0e.json | 132 - .../b879a534-6b24-4873-a0e4-e18453540121.json | 132 - .../c67ae8f2-596b-4dab-8c4f-768b2f0608b4.json | 132 - .../7766c638-b4dc-4b2d-8c14-becdb1b709ef.json | 132 - .../dd211bef-3940-4d78-8f7b-a67da81d605b.json | 132 - .../87e20b7a-85c8-4845-94b0-ace1e18814cb.json | 132 - .../9ab01db6-3154-4c5b-b6a2-35479538d332.json | 132 - .../9d35316a-011d-4e45-ae57-317b53de621f.json | 132 - .../c9e7fec0-b244-4ca1-a117-a52fdd4671a5.json | 132 - .../0659cb01-0d52-42cb-9e3a-2d8cac01692e.json | 132 - .../98490bb1-70f0-4e7a-8fd6-698ec9fcbd5a.json | 132 - .../6e0f7e7e-8927-436e-95a7-5a7c626ca241.json | 132 - .../9c5b3f4d-6e0b-482b-b142-dd7b387cae22.json | 132 - 
.../04840708-a4cc-407c-8b2a-876b382920a1.json | 132 - .../83b0844c-70fe-4b63-8ed2-4147390518ee.json | 132 - .../9cf10c60-bee1-4f4f-9e03-c3c10287bded.json | 132 - .../8e92dd9e-a68c-46ef-9b03-955c06a21437.json | 132 - .../dd1139d8-2b44-4516-b24a-1219826f5482.json | 132 - .../e37e86f7-b67b-4f0a-b1bd-92f30842b303.json | 132 - .../bc3b55d5-35ca-48b5-832e-8544e145b1b1.json | 132 - .../5757cd3d-c64e-4743-8200-5e610e24bf95.json | 132 - .../ae8cd3ad-ce7b-41f4-8e4a-f11002af2e58.json | 132 - .../bee54048-ebb2-4051-a18f-aa85b0f2ce27.json | 132 - .../2f98c85b-5a2e-467e-9626-b1bdefe7bdd7.json | 132 - .../2c530a3b-888e-4a61-b97b-ea875b30ec9c.json | 132 - .../4c9fb322-735e-4644-8121-088d00f78c5f.json | 132 - .../e7e7733f-682b-4e68-8f07-85f3ba7a7ae1.json | 132 - .../e9a4e1e2-bd55-4c3d-99eb-8fafd8f6ec44.json | 132 - .../42ed92b3-63bc-4fa1-bc16-c19bfb73368f.json | 132 - .../915ae579-786a-4eb2-a1bb-107a12c9c40d.json | 132 - .../3489ffea-a607-4f3d-a0c2-bd17147f244f.json | 132 - .../7b5ba8a8-16c3-4169-b97d-13dd5d4f8395.json | 132 - .../6411c44a-b2b3-4fe3-8ba4-9422a0a0b31e.json | 132 - .../fe344f84-7428-45af-940f-736275bc4d50.json | 132 - .../60956ea2-8b0b-4e4b-801a-d0689f9d46f4.json | 132 - .../1ad54bdc-419a-4dd9-9fbb-d7b7ee7038d1.json | 132 - .../2ab375f0-2477-48a5-a5d9-0b5d0d7d0a84.json | 132 - .../e0525a52-d38c-4b2f-b59b-048b4bf71cb2.json | 132 - .../01bc964f-552b-4cda-9ed0-cf720f0c8de4.json | 132 - .../c9e95c55-978e-485b-8a77-ab2e668e3254.json | 132 - .../c71c606b-ccb7-48e9-a6c8-b72205ec6c06.json | 132 - .../ae1801cb-d112-4d1a-895d-c6743779846a.json | 132 - .../008e3601-dfc4-4bc1-bf8b-f5cef43ae098.json | 132 - .../379b315d-96fb-4edb-b2d6-3dc113a10c17.json | 132 - .../8cd36aa1-6f87-4d4d-a1bf-adc87e0a26c6.json | 132 - .../f76ce244-29f7-44f0-9850-7291f8e4cbf1.json | 132 - .../506871f1-0c87-4e8c-a270-eed7b5da2599.json | 132 - .../c20264fd-b1f9-4e0f-9f6e-1d58f1c18cda.json | 132 - .../59f14dca-923a-41f1-b443-cc3551063f45.json | 132 - .../a1ba054f-b0a1-4827-b7ea-3988aa4cf1f1.json | 132 - .../51d8f53f-ad7e-4dae-9e2a-0895729ff790.json | 132 - .../421119ea-0da8-4b26-a335-f2e720618c44.json | 132 - .../b0e6bfb2-a8d4-4b1d-859a-aa821f646e57.json | 132 - .../7c4c2ccf-7d7b-4d24-802e-20c182290d07.json | 132 - .../95212a55-f382-4869-9e11-cfa201ba865b.json | 132 - .../a7da2118-063c-489f-bb31-40f1b7beeefe.json | 132 - .../9a75ae18-8f9a-40a5-8a7b-0c38df34e9dd.json | 132 - .../a85d4a1f-fbd9-4d21-9700-9e55e30c1391.json | 132 - .../2fd1c45e-209c-43da-ae85-d60887513a96.json | 132 - .../91e0e6aa-b933-4a02-a28d-8d69e698c60a.json | 132 - .../6f3f3d06-2937-4c55-9b95-a62ae5253571.json | 132 - .../9b3ffdd3-ac18-4084-9e83-1bfc61db0ec2.json | 132 - .../60077cbd-87af-4a00-a359-9235acb011ed.json | 132 - .../577936a8-b450-4233-b633-064565b3d1a4.json | 132 - .../470b9413-2cc8-4bf4-9e7c-0b8e99929568.json | 132 - .../3cbf9c73-0dc8-402e-bc94-c6d52b9f1af7.json | 132 - .../3fccb1d0-5ae1-427a-adae-37004ecbacaa.json | 132 - .../6463183f-4043-4b96-b4d1-0bd41b4d6876.json | 132 - .../0b102423-1a06-4e5b-a287-710695658b63.json | 132 - .../b7e4ffd8-2a5a-4364-844a-a308dd7c899c.json | 132 - .../3fa2e3ef-a375-4ca5-9f85-7cb986313d53.json | 132 - .../abd48d9d-0443-40be-a23a-68922771e14f.json | 132 - .../436ff0a4-9907-4e56-a5f2-c97f1b13f81a.json | 132 - .../7a654100-b206-4011-828e-fb386df27d0c.json | 132 - .../2f0e262c-a099-41f4-89f1-8b251708a960.json | 132 - .../7bf3e9ca-7d6f-4d43-b8fe-aceb8d60c7c6.json | 132 - .../8703dbdd-12ef-457b-8cda-f570c8f5c890.json | 132 - .../d77f3e8f-1eea-478e-babd-ba873d2d427c.json | 132 - 
.../783a4385-c802-4bb3-9a21-90629d16efc7.json | 132 - .../bb4ff51e-ce3a-42f5-871e-3e5e8977bc42.json | 132 - .../e80d25b5-3f4b-45a7-9472-09f98db03bf0.json | 132 - .../7fed0b1d-0d79-4784-8fd6-42f8611b1751.json | 132 - .../be534cd3-8245-4370-ba6c-9687b431ee8d.json | 132 - .../e98967b7-3aff-4baa-92eb-eff86bf09797.json | 132 - .../8736a22a-f980-4a01-953d-217f27050129.json | 132 - .../75a2b5c9-7c73-4bb4-8e99-af4a3a27589d.json | 132 - .../0e0ebdc7-a5bd-4314-9bd7-fc8a11541a4e.json | 132 - .../f8579305-003b-4727-b904-bad4f363a616.json | 132 - .../3103f36a-4a88-4a39-8261-0b597f8d6db4.json | 132 - .../eda9de3b-ae53-4102-b203-eddadbc50464.json | 132 - .../b7de4fa8-d97d-400f-bc3f-ecb1963a03ed.json | 132 - .../fa6ecaf9-457e-4135-ad25-4790ebc27737.json | 132 - .../ebaa99c4-ff66-421d-8ba7-dae2c5fa274c.json | 132 - .../e388c707-8b35-49a4-94eb-f32e983fe33e.json | 132 - .../f6273192-31cf-4ee1-af45-c2f62de05330.json | 132 - .../105650e6-d9cf-4106-9d55-6f3c08f2f1cf.json | 132 - .../a1d23749-40c0-4ccb-a104-bf0de63bc2bd.json | 132 - .../4e4b4cf9-48d5-4ff6-92c0-1e9d7b874b6b.json | 132 - .../3c4713a3-3973-4a04-9c4a-a6782251734e.json | 132 - .../de70c700-a007-4e87-a3db-941ee285eb1f.json | 132 - .../a1324a7f-1911-4fa9-8d83-be891f752a61.json | 132 - .../9c4af0df-f538-4755-8cd0-eec6b2b26524.json | 132 - .../fde650a6-a5d1-4edc-bd64-8be806663263.json | 132 - .../96dd1a08-b166-4d8e-ac31-5e948adf931b.json | 132 - .../3b90b9db-a68e-4ee9-bd4d-a18cec357753.json | 132 - .../444a6ace-77d4-4d93-b80b-ff5c7e2f6888.json | 132 - .../7e11a778-fccf-4a91-81cf-c06f1a5c77c4.json | 132 - .../e5d126d7-e0bf-43dc-95c0-184ea1d586ea.json | 132 - .../d05b129c-6b9e-4e6b-80fc-af65db620c5d.json | 132 - .../d9792fac-29c1-45b2-b649-cdebb6830e2f.json | 132 - .../fcc2f06a-e6c8-4c28-bf22-4ee582392912.json | 132 - .../c6e13327-90b3-440d-9367-dbcec54dd6cc.json | 132 - .../30b02429-350c-4d86-aded-ba8597bec4d5.json | 132 - .../7d1ee802-106e-4313-ba1d-72d5a0676c88.json | 132 - .../1b3af020-f65e-44b8-a9a2-ad60fa686427.json | 132 - .../6e40871d-bc23-4f1c-a005-f5b8eb096f84.json | 132 - .../1ab33ed2-ea3b-4c6f-a2ac-2465ddd844f4.json | 132 - .../ec601f5d-bf19-4407-ac41-6b9272d94735.json | 132 - .../87e53761-e8b7-4032-ae7a-c3a91704d115.json | 132 - .../59492d86-4b85-4865-84e9-84ab4ace630c.json | 132 - .../cc082df2-259c-44c1-abe4-ef349056a2a9.json | 132 - .../3f069053-b24e-4242-9302-d46b82e511aa.json | 132 - .../62cd9bcb-a74c-40b9-be84-a0077235ae3c.json | 132 - .../b4cd25f1-87d5-4173-a4d3-928444f6cb37.json | 132 - .../ddd4716e-d8ae-46a1-8fb4-c27e2da40e6e.json | 132 - .../1e5b62a3-018b-429a-b2b4-325545ee99dc.json | 132 - .../958d410e-ce43-44c0-8a56-685c0a618408.json | 132 - .../57c53f20-aa32-49fd-926a-f26c9d0759d4.json | 132 - .../76def522-6fe1-458f-bfbf-99b50ece3367.json | 132 - .../c467bc88-6769-48ac-abd4-867ee38bbe57.json | 132 - .../801681eb-66f4-46e0-bb2b-7ba4b46679af.json | 132 - .../cdd0ea1c-b17a-4816-953c-1d7164c64114.json | 132 - .../b2060893-1f7d-4e7a-a458-3623147ac118.json | 132 - .../cf8aac35-679a-4ebb-bca8-6e0f2d42e71b.json | 132 - .../34bfe887-5a3a-4626-997e-c35d3a0ec341.json | 132 - .../b81acc47-6fd5-4f89-8c70-f8f14b677e04.json | 132 - .../30b977a8-7882-49be-8621-9ee3fce270ec.json | 132 - .../3367fd79-713c-4691-80cd-4abb6b2818ef.json | 132 - .../add899b8-f3e6-4d87-8846-8254f4dfbd5f.json | 132 - .../53829ec0-f233-4b61-a672-6a467823caaa.json | 132 - .../e2b41200-bff2-4835-a0ea-27ff56937570.json | 132 - .../3d33f26d-72be-451e-bcf0-501e0bc2f1db.json | 132 - .../3b4c05fc-2ccf-46db-8d64-045508f6614b.json | 132 - 
.../af83a91c-3b07-48c6-9726-5bd77347f810.json | 132 - .../48759b07-9aea-42bd-8d73-9c4208d2789f.json | 132 - .../68820679-55f4-494d-91a0-0db1bccb8983.json | 132 - .../029774ac-a63d-4acc-a37c-4194e4afdecc.json | 132 - .../146df856-e2c8-41eb-b860-ceb78c126e55.json | 132 - .../74c6bea7-ad16-4f08-a2b7-9c894b9ce207.json | 132 - .../b5e97b2d-d8a2-485a-8b0a-71590e4a376e.json | 132 - .../e79d0a8c-caec-4dec-b119-3229ffa69a73.json | 132 - .../2c760893-b52a-40a9-9420-fb193a62a5c3.json | 132 - .../ef9b84e0-68b0-4caa-9980-96ea5e7f440b.json | 132 - .../fb48aff8-3f6b-4934-9fb8-d72bf8614d6f.json | 132 - .../9450acd9-16b6-49a2-9b73-cf1161b96df3.json | 132 - .../0d50ec2d-5dd4-487e-80cb-9533246a9876.json | 132 - .../f6e6827d-fbf8-49cd-bdad-e8c7ea87550a.json | 132 - .../c5e48fd8-0eea-46a9-8790-1745923561d3.json | 132 - .../870c7739-8886-47df-8e20-09bfae03b9c5.json | 132 - .../d8eb5fd1-f1d4-481d-85af-88a11d7b6f6f.json | 132 - .../6625b2e0-1f65-4dc5-9913-ceb0e82e6439.json | 132 - .../24e7df20-e046-48f7-909e-502d0c70216a.json | 132 - .../7920f562-9e7f-4a64-85f4-584b13af44de.json | 132 - .../c6620817-69fe-40e2-bb0a-1e9c739ab65d.json | 132 - .../520e2d66-4143-493b-8533-64f86c6d676e.json | 132 - .../993bdfd2-3a88-4de3-9ed9-9b7b63c0f4f5.json | 132 - .../4e1be694-cc4d-4943-a8e4-74913cfb2ebe.json | 132 - .../42c174d1-6211-4438-bb9a-24f3cf386a6d.json | 132 - .../625bf39b-a118-4ec6-82d0-5405cf70ba53.json | 132 - .../e09cb198-d259-42ea-a356-6efe61b1e12b.json | 132 - .../5838b130-c2e6-400c-80b7-6822efb5db2c.json | 132 - .../52b51638-64cd-4b19-8fc7-c223d50bc549.json | 132 - .../28b3178b-c963-4267-9649-3f7fc10fba3c.json | 132 - .../748298a2-5042-4636-ac7e-051c28916f3a.json | 132 - .../03bcd4e6-1620-424a-9200-c0cf4b73bbd2.json | 132 - .../c7fba530-63cc-4ece-a171-4a2919aa8057.json | 132 - .../c25c1046-a8d5-4f4b-9a72-c4591cfb4023.json | 132 - .../c3800a5c-310b-41cb-9b07-cfc1f1b13256.json | 132 - .../e8e2b99f-cf83-4776-9117-aa2b5d9c8068.json | 132 - .../2da19e45-117f-446b-b956-b35a20bb7411.json | 132 - .../9e982a33-19cb-4381-8560-884bc8946a2b.json | 132 - .../9130a862-cfd7-47ce-a92a-f60438739491.json | 132 - .../858d3717-fcb2-45d9-8eaa-1b00ae0ca918.json | 132 - .../5f1f137b-cb2f-4ee6-8bc9-5e0b94939f35.json | 132 - .../6feca911-7a6e-43a2-b59d-7cb48070fe8e.json | 132 - .../d3ad9813-273e-47de-be16-312cc67ac64f.json | 132 - .../317205ee-2cc6-4523-9662-be6508314b08.json | 132 - .../3b5fe65a-50a1-4036-b81a-86117356cab9.json | 132 - .../812ac262-97f4-485e-93de-f8d420b8658e.json | 132 - .../39cd7eb0-781e-47b6-8eaa-c72e702f778f.json | 132 - .../9411a8a4-306e-43da-96d7-c93eb3aac398.json | 132 - .../c93feb32-0526-44ac-b3ed-95f08c37cc9f.json | 132 - .../1a3b0f7a-afb6-4002-9321-23a86f000c5c.json | 132 - .../8d29363d-3096-4c54-a40e-acf4a7318a04.json | 132 - .../8cea452d-63b8-4e82-9511-64c94f8e140d.json | 132 - .../5e5b5424-1d48-4a5e-8775-52c75609c338.json | 132 - .../73787033-ed1d-4d2e-b7b2-e886ef6f1036.json | 132 - .../54c9403f-2525-45c0-a585-9ff598f95f6b.json | 132 - .../77d0d88d-7ca8-4f3e-8b79-295f53140635.json | 132 - .../727f27e3-2a3f-4572-8db5-87e498c4b6ca.json | 132 - .../b6e0cc97-27cf-4082-a908-95d5c39014b8.json | 132 - .../3b77ec51-fd47-4bc7-9e96-ed46202fef7c.json | 132 - .../b24cdd3f-3e44-4ebe-b2b4-209ee0bbfbd3.json | 132 - .../e47a3cab-dfef-47f6-9377-9ee32489bab6.json | 132 - .../1e4481fe-458b-4c23-8a6c-55439fb8b4fd.json | 132 - .../6421e9dc-e7ca-4e1c-9f4f-1d1ac409c4d1.json | 132 - .../55f43b53-6ed9-4c16-bf75-c968999a6f36.json | 132 - .../6ce93e70-04b1-46b8-b3e3-7eb0df35e1c1.json | 132 - 
.../95096a89-2baf-4b14-bc6e-1f30e920c086.json | 132 - .../f1651632-2787-47cf-b471-89d1b89a6b01.json | 132 - .../e1fb2ac9-8f60-4dc1-9e0d-99fcb91a53a9.json | 132 - .../d3accbc1-d698-4357-ab08-0b98fb49b4ed.json | 132 - .../5388a25a-5780-4ae1-999f-172b558a7b52.json | 132 - .../9e4143ff-d461-4fdb-8bc7-86f959f69e68.json | 132 - .../5d843bd7-b34b-41d4-92ff-c25a709b4930.json | 132 - .../87975b2f-298b-4297-8f4d-e5bb1bf5d113.json | 132 - .../41bb8174-f3d6-4862-b892-dbc9f6e2e696.json | 132 - .../683ad2cd-5e39-4088-b98b-94d89dda7b88.json | 132 - .../08ffd7ab-ccca-4258-be6d-cbc151cc43aa.json | 132 - .../4b6efad4-c697-4f0a-8d24-75dc49d8ec06.json | 132 - .../4986c30a-85b0-4263-9be4-d69c9b067e0c.json | 132 - .../47b5a878-1a4a-425f-ae6f-ac286f681cca.json | 132 - .../992a6862-46b9-415e-858f-2eff8709ca81.json | 132 - .../c6391381-c973-4068-b72c-af08762d9e5c.json | 132 - .../0f6e18e6-1b0f-43f4-a9af-6632f6ce63cc.json | 132 - .../56d9ee92-6774-4c9b-9861-c5f0a9945e7c.json | 132 - .../d3e753cc-37fc-4d77-8b2d-da90a7843d60.json | 132 - .../eb08ef6f-6631-47c4-8f52-bf9454ad34b6.json | 132 - .../2207b154-c5d4-4e5a-ade0-271e62d6345f.json | 132 - .../f4161154-7777-4261-9275-a3002a1305d8.json | 132 - .../8523812d-1db6-4a9d-b06b-ac904191789d.json | 132 - .../6cd9ea81-618d-444e-a892-d4f9819daa67.json | 132 - .../2217326d-377a-4503-8180-206c12c87436.json | 132 - .../3bbb10fc-e3b9-4c6a-ac35-ee5de9ecd330.json | 132 - .../01124f11-b739-422b-97f7-062074b8d0fb.json | 132 - .../7cc4c93b-7c43-4bed-84a3-fa1cd9130abb.json | 132 - .../bf3aa551-f9c6-4203-b2d4-55cf9e6e2872.json | 132 - .../2eae8905-5338-4a78-86e7-d354d06efa23.json | 132 - .../9dcc4121-e046-49c7-969e-7255b0c32d3d.json | 132 - .../dd7d4acd-549a-467b-b461-0eba5b019122.json | 132 - .../159969cc-32c5-4f6f-b586-8e6d44180b44.json | 132 - .../b80e559d-e519-4678-8abc-ee5591b81fac.json | 132 - .../90c137c9-939d-4e77-9fcc-9e33551a6121.json | 132 - .../f25d6fef-d337-4cf7-ba05-ca6ff5eccd52.json | 132 - .../c6f92306-dcdc-4549-bfc2-feb62a3a6ef6.json | 132 - .../96c64d23-d23d-486c-83a4-4c0ab4f09d60.json | 132 - .../243abf0b-0f88-4b4f-ab51-6c8aebaf19be.json | 132 - .../438fb728-d6ad-4c28-a43c-ff82d522cd50.json | 132 - .../94b45b8d-b754-4fb4-843d-b7ffeafc4f1b.json | 132 - .../5618fc82-d455-4261-8e34-1190d70fd3f3.json | 132 - .../395f6339-3fca-4f4d-befc-2d231008efdd.json | 132 - .../b22696ac-7074-44f2-b72f-c59ca0a41ce6.json | 132 - .../6856f8b6-a719-4f69-be71-4df582015f28.json | 132 - .../f2c0ea2b-76ae-4469-832e-84c0b79fa283.json | 132 - .../5619e3cb-eb3e-4420-a156-6f7b2a5d372d.json | 132 - .../9d5e329f-491a-4608-bcac-1ee63046b34a.json | 132 - .../80953f08-6530-4bab-a375-cc542081aabb.json | 132 - .../0b8691a8-f394-4da3-a67b-faa1af9b42c9.json | 132 - .../fb541a2b-d9bd-4aa2-8b83-da62a3b77731.json | 132 - .../c20d1c62-d3e0-4e30-b0d3-4c62a6585d23.json | 132 - .../8a10eeb6-7178-4c78-8940-68fad78e389b.json | 132 - .../f0bb774c-a842-4261-b817-b169ce65a493.json | 132 - .../59afe234-3a7f-49bb-873c-df6cf793e5e5.json | 132 - .../4074081a-66a6-42e4-994f-72541f90888b.json | 132 - .../6a618ec8-c029-49ec-9ea5-da52b5231280.json | 132 - .../edc8f510-c961-4c1f-9757-e80c4247f275.json | 132 - .../aaa5d1e6-5aca-4471-87ea-7195610a6c1d.json | 132 - .../89b45e8b-9979-4c7f-8aa6-c6ab7009cab0.json | 132 - .../41000c74-8b29-4369-996f-cf3a2fd09f63.json | 132 - .../a1765846-74e1-440a-8851-12a571444059.json | 132 - .../9c6b594f-387a-42a3-9e40-3b26363e6071.json | 132 - .../2b910401-457a-45dd-920a-559f4595897b.json | 132 - .../90b7be49-53a0-4d7f-8995-cbc52fe3a70f.json | 132 - 
 5320 files changed, 1099844 deletions(-)
 delete mode 100644 data/global-mmlu-lite/alibaba/qwen3-235b-a22b-instruct-2507/c8ab4e94-d8e8-417f-be18-fececf3c815c.json
 delete mode 100644 data/global-mmlu-lite/anthropic/claude-3-5-haiku-20241022/402c8833-1827-46fc-a497-46b40a6794ff.json
 delete mode 100644 data/global-mmlu-lite/anthropic/claude-3-7-sonnet-20250219/acd2082a-ce0c-418f-9383-f3c9f11735a2.json
 delete mode 100644 data/global-mmlu-lite/anthropic/claude-opus-4-1-20250805/c65ed336-b283-46c2-8284-c4695cad588d.json
 delete mode 100644 data/global-mmlu-lite/anthropic/claude-sonnet-4-20250514/5ebb009d-b548-4f2b-b075-feb76ca295d2.json
 delete mode 100644 data/global-mmlu-lite/cohere/command-a-03-2025/c7df2916-bde4-4987-9139-fcfd18a14ac1.json
 delete mode 100644 data/global-mmlu-lite/deepseek/deepseek-r1-0528/56ec8ab0-d76d-4c03-953b-a2a4a43af5f4.json
 delete mode 100644 data/global-mmlu-lite/deepseek/deepseek-v3.1/ad3211a9-4390-4247-b64d-600191a88a75.json
 delete mode 100644 data/global-mmlu-lite/google/gemini-2.5-flash-preview-05-20/1a34326a-f75e-434c-a027-9f8cf7fe8fb9.json
 delete mode 100644 data/global-mmlu-lite/google/gemini-2.5-flash/129c8b21-f97e-4284-9574-33d5932332f7.json
 delete mode 100644 data/global-mmlu-lite/google/gemini-2.5-pro/3644fd67-0f46-4de3-b542-edf219d0e0cd.json
 delete mode 100644 data/global-mmlu-lite/google/gemini-3-pro-preview/c0692e14-6484-4d02-8dac-55ce4373fb15.json
 delete mode 100644 data/global-mmlu-lite/google/gemma-3-27b-it/ab4940d1-118c-479a-bd37-1ea2da6f02a3.json
 delete mode 100644 data/global-mmlu-lite/google/gemma-3-4b-it/85552093-435f-4d85-897d-4e74c3655533.json
 delete mode 100644 data/global-mmlu-lite/mistralai/mistral-medium-3/4ddc0062-6577-4ab9-85f1-791fd2822776.json
 delete mode 100644 data/global-mmlu-lite/mistralai/mistral-small-2503/50fc4840-933b-43ec-847e-1834b30f9f14.json
 delete mode 100644 data/global-mmlu-lite/openai/gpt-4.1-2025-04-14/6cdc5384-2be5-47e0-a9b2-9cd6719c1760.json
 delete mode 100644 data/global-mmlu-lite/openai/gpt-5-2025-08-07/a668c931-34e4-4702-a84c-97d8c6f59ef4.json
 delete mode 100644 data/global-mmlu-lite/openai/o3-mini-2025-01-31/3a7e2aa6-4e57-446f-a127-4a7e022fe3e1.json
 delete mode 100644 data/global-mmlu-lite/unknown/aya-expanse-32b/938a35f1-195d-49c8-9a16-90fab96692bd.json
 delete mode 100644 data/global-mmlu-lite/unknown/granite-4.0-h-small/ce756801-f75e-4250-9721-1d627a37f055.json
 delete mode 100644 data/global-mmlu-lite/unknown/o4-mini-2025-04-16/b83b41d4-6c95-4c7d-a290-65d89bf776c2.json
 delete mode 100644 data/global-mmlu-lite/xai/grok-3-mini/31c3fe1b-be4b-42ef-8ec0-9da323b2ebb6.json
 delete mode 100644 data/global-mmlu-lite/xai/grok-4-0709/a8e0fc0e-b3a4-4a0b-938f-aa11f1c64358.json
 delete mode 100644 data/helm_capabilities/allenai/olmo-2-0325-32b-instruct/8948bfb0-cc9d-40f7-a02d-d5c9611436d8.json
 delete mode 100644 data/helm_capabilities/allenai/olmo-2-1124-13b-instruct/7d2d1dba-1b31-47b2-8308-f2668cf36c99.json
 delete mode 100644 data/helm_capabilities/allenai/olmo-2-1124-7b-instruct/3a056f7b-1bdf-4543-9e67-1101ace67179.json
 delete mode 100644 data/helm_capabilities/allenai/olmoe-1b-7b-0125-instruct/275cf2e5-5ccd-40be-be55-938c82ef6688.json
 delete mode 100644 data/helm_capabilities/amazon/nova-lite-v1_0/43e7be99-4872-4eb1-b30b-75c44b298ab4.json
 delete mode 100644 data/helm_capabilities/amazon/nova-micro-v1_0/cfc99298-4570-48cf-9187-aa0d167cc0ba.json
 delete mode 100644 data/helm_capabilities/amazon/nova-premier-v1_0/a2162367-d16d-4274-aa89-43435cea5c0b.json
 delete mode 100644 data/helm_capabilities/amazon/nova-pro-v1_0/51ef4580-da13-415a-a37f-45e2036ed4c2.json
 delete mode 100644 data/helm_capabilities/anthropic/claude-3-5-haiku-20241022/3fa605db-fcff-4f05-9398-6af77c9dcada.json
 delete mode 100644 data/helm_capabilities/anthropic/claude-3-5-sonnet-20241022/9d58ac39-fef7-47c8-920a-8be2069f5662.json
 delete mode 100644 data/helm_capabilities/anthropic/claude-3-7-sonnet-20250219/dd9b10af-ad39-45ef-8f91-097340d376c7.json
 delete mode 100644 data/helm_capabilities/anthropic/claude-haiku-4-5-20251001/30a6de14-c57c-483e-92e9-26fc4c7f4772.json
 delete mode 100644 data/helm_capabilities/anthropic/claude-opus-4-20250514-thinking-10k/bed1a799-77a6-40a1-9f37-d54fe9d4d055.json
 delete mode 100644 data/helm_capabilities/anthropic/claude-opus-4-20250514/6c226cad-23f1-4c09-8038-eb7b776cdee4.json
 delete mode 100644 data/helm_capabilities/anthropic/claude-sonnet-4-20250514-thinking-10k/98887061-09d6-44ba-9cff-0267045a26ef.json
 delete mode 100644 data/helm_capabilities/anthropic/claude-sonnet-4-20250514/6693f0e2-3514-413d-be61-d10f7372b3dc.json
 delete mode 100644 data/helm_capabilities/anthropic/claude-sonnet-4-5-20250929/ffeaa0b2-fcdb-45dd-a6b4-06b67b9f63fe.json
 delete mode 100644 data/helm_capabilities/deepseek-ai/deepseek-r1-0528/0d9a856d-01bf-4a82-9872-33d561cf4a57.json
 delete mode 100644 data/helm_capabilities/deepseek-ai/deepseek-v3/3ff2ab7d-2c0f-4313-8223-8f514fde595a.json
 delete mode 100644 data/helm_capabilities/google/gemini-1.5-flash-002/2a46e8da-1996-428c-b567-cd0287b29d9f.json
 delete mode 100644 data/helm_capabilities/google/gemini-1.5-pro-002/30a92593-398e-4c2f-8be7-455be166aeaf.json
 delete mode 100644 data/helm_capabilities/google/gemini-2.0-flash-001/e6fd55e0-6ff0-48f1-8b51-5f4372edb457.json
 delete mode 100644 data/helm_capabilities/google/gemini-2.0-flash-lite-preview-02-05/dfc2717d-ead8-4287-885e-5e0fc09c35e3.json
 delete mode 100644 data/helm_capabilities/google/gemini-2.5-flash-lite/e97292eb-7031-4a3a-a415-44c137898e3f.json
 delete mode 100644 data/helm_capabilities/google/gemini-2.5-flash-preview-04-17/4263a6be-9640-40a1-8881-768624949d47.json
 delete mode 100644 data/helm_capabilities/google/gemini-2.5-pro-preview-03-25/a808cecf-8925-428f-99ea-b6c2f8bce96e.json
 delete mode 100644 data/helm_capabilities/google/gemini-3-pro-preview/55e44a3b-1fac-4ad5-b25e-85702f33883d.json
 delete mode 100644 data/helm_capabilities/ibm/granite-3.3-8b-instruct/5b5b339b-7631-4b77-ac51-df49d3e946eb.json
 delete mode 100644 data/helm_capabilities/ibm/granite-4.0-h-small/eaec6d66-6da7-4592-baca-2539240acc5d.json
 delete mode 100644 data/helm_capabilities/ibm/granite-4.0-micro/2db9cde5-4560-4ee4-8ffa-661dfc7db2f7.json
 delete mode 100644 data/helm_capabilities/marin-community/marin-8b-instruct/eccf77c4-6a65-40b9-9445-dd35dee7f7b8.json
 delete mode 100644 data/helm_capabilities/meta/llama-3.1-405b-instruct-turbo/75b5943a-67be-4b2f-85da-a52533edc76f.json
 delete mode 100644 data/helm_capabilities/meta/llama-3.1-70b-instruct-turbo/8bec35b7-271a-457d-b665-9f69baa248aa.json
 delete mode 100644 data/helm_capabilities/meta/llama-3.1-8b-instruct-turbo/c4e5e54c-dfdc-4f61-8572-bff7fa028a61.json
 delete mode 100644 data/helm_capabilities/meta/llama-4-maverick-17b-128e-instruct-fp8/c308b0a5-4c44-4369-9b23-8664959aa927.json
 delete mode 100644 data/helm_capabilities/meta/llama-4-scout-17b-16e-instruct/1a1edfb2-f0f1-4930-82c0-99293ec76645.json
 delete mode 100644 data/helm_capabilities/mistralai/mistral-7b-instruct-v0.3/9aa5af51-8c55-4896-b634-162a9d82b58e.json
 delete mode 100644 data/helm_capabilities/mistralai/mistral-large-2411/21461a52-2f25-48c9-be19-f9233317d817.json
 delete mode 100644 data/helm_capabilities/mistralai/mistral-small-2503/bdea0967-fcc7-493c-a18d-70727842deb9.json
 delete mode 100644 data/helm_capabilities/mistralai/mixtral-8x22b-instruct-v0.1/f7404ea3-62c7-47fc-9106-44c208470381.json
 delete mode 100644 data/helm_capabilities/mistralai/mixtral-8x7b-instruct-v0.1/2817820c-4b28-4235-a8fd-ad02d0f504bc.json
 delete mode 100644 data/helm_capabilities/moonshotai/kimi-k2-instruct/f3da71fc-fc88-4dda-b423-168d11eab317.json
 delete mode 100644 data/helm_capabilities/openai/gpt-4.1-2025-04-14/2f7c0db9-b5de-4674-a130-5315520dea68.json
 delete mode 100644 data/helm_capabilities/openai/gpt-4.1-mini-2025-04-14/4dcb8022-fe54-42f7-b43f-9866de173731.json
 delete mode 100644 data/helm_capabilities/openai/gpt-4.1-nano-2025-04-14/c436f3d1-84ee-49df-9287-0305925f7cf4.json
 delete mode 100644 data/helm_capabilities/openai/gpt-4o-2024-11-20/90ba0b16-b866-4b18-bd84-6a8cd1c47c47.json
 delete mode 100644 data/helm_capabilities/openai/gpt-4o-mini-2024-07-18/07c823ba-9e17-47e4-858b-a1f2a514a276.json
 delete mode 100644 data/helm_capabilities/openai/gpt-5-2025-08-07/eb1bb443-71ad-4b79-8308-2b66c5e8c631.json
 delete mode 100644 data/helm_capabilities/openai/gpt-5-mini-2025-08-07/e14d42a9-9639-4c35-8a0c-e395e754c46c.json
 delete mode 100644 data/helm_capabilities/openai/gpt-5-nano-2025-08-07/3754df44-ddce-4a66-9074-f65f5677ae27.json
 delete mode 100644 data/helm_capabilities/openai/gpt-5.1-2025-11-13/a540b282-e9d6-403e-96df-a1d27ad14d3a.json
 delete mode 100644 data/helm_capabilities/openai/gpt-oss-120b/758851b3-9ac9-43d8-8b6a-3d9688752d80.json
 delete mode 100644 data/helm_capabilities/openai/gpt-oss-20b/1d9ac688-ca0d-405b-a262-e95673e79250.json
 delete mode 100644 data/helm_capabilities/openai/o3-2025-04-16/c1e593d9-08ba-40fe-b02f-1c95be8fdfc9.json
 delete mode 100644 data/helm_capabilities/openai/o4-mini-2025-04-16/35a31e19-2ef5-4caa-a848-422af42adab8.json
 delete mode 100644 data/helm_capabilities/qwen/qwen2.5-72b-instruct-turbo/7de0bda2-ce56-444a-b293-a310a5b2d7ab.json
 delete mode 100644 data/helm_capabilities/qwen/qwen2.5-7b-instruct-turbo/dd22f29b-f8b8-4c59-9f26-f6633bbbdc8b.json
 delete mode 100644 data/helm_capabilities/qwen/qwen3-235b-a22b-fp8-tput/9eb537b9-9e2d-4d0a-bfa5-644a18f4db0e.json
 delete mode 100644 data/helm_capabilities/qwen/qwen3-235b-a22b-instruct-2507-fp8/07763926-3a19-43f9-a23f-095f6cb78799.json
 delete mode 100644 data/helm_capabilities/qwen/qwen3-next-80b-a3b-thinking/56e024b3-c963-4172-9f52-7605276b3854.json
 delete mode 100644 data/helm_capabilities/writer/palmyra-fin/6f660e47-1d86-473d-9864-208111dcea31.json
 delete mode 100644 data/helm_capabilities/writer/palmyra-med/91ef1f96-a708-4c53-ac9d-208ef3420668.json
 delete mode 100644 data/helm_capabilities/writer/palmyra-x-004/c14bea74-0aa3-4dde-8ca1-cbc4ab3de1cc.json
 delete mode 100644 data/helm_capabilities/writer/palmyra-x5/505c6245-88d1-4557-9e34-63a4e8086210.json
 delete mode 100644 data/helm_capabilities/xai/grok-3-beta/9a473236-f187-4926-ae8a-e8b84fe2a060.json
 delete mode 100644 data/helm_capabilities/xai/grok-3-mini-beta/1d7ece9b-1dcf-4adf-aa16-b030e286c26e.json
 delete mode 100644 data/helm_capabilities/xai/grok-4-0709/aeabfb59-74db-445c-9693-7a088ac5073c.json
 delete mode 100644 data/helm_capabilities/zai-org/glm-4.5-air-fp8/eb2f8883-30ee-42e1-95b5-48dcf988ecf5.json
 delete mode 100644 data/helm_classic/Anthropic-LM-v4-s3-52B/Anthropic-LM-v4-s3-52B/12fdea65-94eb-4c85-876c-65f0528bde12.json
 delete mode 100644 data/helm_classic/ai21/J1-Grande-v1-17B/d3519b2c-2e74-4e5f-8e2a-ab13446d126a.json
 delete mode 100644 data/helm_classic/ai21/J1-Grande-v2-beta-17B/1f2516b9-54b6-4dcf-a575-734c0d0b17b5.json
 delete mode 100644 data/helm_classic/ai21/J1-Jumbo-v1-178B/deddbc80-70ac-43e7-b052-753d127f8390.json
 delete mode 100644 data/helm_classic/ai21/J1-Large-v1-7.5B/e4780862-bf3c-4856-b1e7-02616afe931a.json
 delete mode 100644 data/helm_classic/ai21/Jurassic-2-Grande-17B/cd1ec0ed-44cb-4e99-b58d-f026c3172f8c.json
 delete mode 100644 data/helm_classic/ai21/Jurassic-2-Jumbo-178B/13a22d40-f274-4384-adcc-1539da821c6a.json
 delete mode 100644 data/helm_classic/ai21/Jurassic-2-Large-7.5B/a01f642e-730b-461d-8afe-9c077ab3f149.json
 delete mode 100644 data/helm_classic/aleph-alpha/Luminous-Base-13B/813802a3-483e-443d-9e49-7cd581b5ea6d.json
 delete mode 100644 data/helm_classic/aleph-alpha/Luminous-Extended-30B/90e7bfa7-af3a-4979-b0d1-9d75db6e4e30.json
 delete mode 100644 data/helm_classic/aleph-alpha/Luminous-Supreme-70B/d113c21d-7c89-4cde-98b8-0c2f8d03fdf6.json
 delete mode 100644 data/helm_classic/bigscience/BLOOM-176B/3dc29785-a884-4496-a6f4-a8bf19892e50.json
 delete mode 100644 data/helm_classic/bigscience/T0pp-11B/ff8dc291-bbaf-4149-854e-e1780b0c86d5.json
 delete mode 100644 data/helm_classic/cohere/Cohere-Command-beta-52.4B/b8932181-b669-4b0e-8879-1dfbf9afea12.json
 delete mode 100644 data/helm_classic/cohere/Cohere-Command-beta-6.1B/c8f6f90c-39f6-4685-9d2d-8964c3d2ba02.json
 delete mode 100644 data/helm_classic/cohere/Cohere-large-v20220720-13.1B/579fb908-3c36-4ff8-a262-fd5388806b83.json
 delete mode 100644 data/helm_classic/cohere/Cohere-medium-v20220720-6.1B/68ff9f10-0357-4ea8-b758-de6c7f51d669.json
 delete mode 100644 data/helm_classic/cohere/Cohere-medium-v20221108-6.1B/b1ecc2b8-6461-4d70-b639-df3dc2594a5b.json
 delete mode 100644 data/helm_classic/cohere/Cohere-small-v20220720-410M/8e4f9ef2-8423-491d-b5e9-06128eb8fd32.json
 delete mode 100644 data/helm_classic/cohere/Cohere-xlarge-v20220609-52.4B/8d2665d6-55fb-4d0c-8d6d-48cd43f27ff2.json
 delete mode 100644 data/helm_classic/cohere/Cohere-xlarge-v20221108-52.4B/6bbe052f-46f7-4541-80a3-dbb86433db7a.json
 delete mode 100644 data/helm_classic/eleutherai/Pythia-12B/9b91f415-6edf-4a2f-a3ff-a9dac8343ebd.json
 delete mode 100644 data/helm_classic/eleutherai/Pythia-6.9B/742a59e8-c813-42ef-938a-4897e25dcdad.json
 delete mode 100644 data/helm_classic/google/Palmyra-X-43B/5dec6a7d-2710-49c2-889d-c7b8ee203ce4.json
 delete mode 100644 data/helm_classic/google/T5-11B/509360bc-86f5-49dc-899c-2899d8b6bc6c.json
 delete mode 100644 data/helm_classic/google/UL2-20B/8f54f091-46d0-4a9a-9b22-a97a7e3972c0.json
 delete mode 100644 data/helm_classic/lmsys/Vicuna-v1.3-13B/8f152c7d-5fba-476e-82c1-4f34a6e7d7e0.json
 delete mode 100644 data/helm_classic/lmsys/Vicuna-v1.3-7B/7c0d2405-f12e-4a3b-924f-1b2a86fd4eae.json
 delete mode 100644 data/helm_classic/meta/LLaMA-13B/d65d8f48-8b8e-4ec6-af68-f61af5408adf.json
 delete mode 100644 data/helm_classic/meta/LLaMA-30B/dff69882-cb8b-4323-b587-60f295085459.json
 delete mode 100644 data/helm_classic/meta/LLaMA-65B/90220411-5e4d-4b74-a74c-ca2ad030d50e.json
 delete mode 100644 data/helm_classic/meta/LLaMA-7B/8c2465b2-deca-476c-bb41-836685ceab35.json
 delete mode 100644 data/helm_classic/meta/Llama-2-13B/4b0f6a03-1054-4047-82d1-53992f0378ee.json
 delete mode 100644 data/helm_classic/meta/Llama-2-70B/78bc128a-6e53-4086-9498-2b3428e1d884.json
 delete mode 100644 data/helm_classic/meta/Llama-2-7B/2be7887e-6c91-437c-bbfc-8b68de3330da.json
 delete mode 100644 data/helm_classic/meta/OPT-175B/f135ce21-655f-4ebf-9cc6-d83ada0f177b.json
 delete mode 100644 data/helm_classic/meta/OPT-66B/48912a61-af54-4208-b36d-2f3a283e5c5d.json
 delete mode 100644 data/helm_classic/microsoft/TNLG-v2-530B/cc85315f-4472-4b22-9f0a-e4609676ce13.json
 delete mode 100644 data/helm_classic/microsoft/TNLG-v2-6.7B/ab773619-db5e-449b-8d6b-da743cb038bb.json
 delete mode 100644 data/helm_classic/mistralai/Mistral-v0.1-7B/5f5bde4c-aa06-41f2-abaf-67901f62a8a1.json
 delete mode 100644 data/helm_classic/mosaicml/MPT-30B/32cc2aa3-be26-41bd-8124-a8b1073c84c4.json
 delete mode 100644 data/helm_classic/mosaicml/MPT-Instruct-30B/42a86a4a-7e76-4c7d-af48-e765a38df589.json
 delete mode 100644 data/helm_classic/openai/GPT-J-6B/f9746ed1-887f-4850-ac2d-700de18acbaf.json
 delete mode 100644 data/helm_classic/openai/GPT-NeoX-20B/899521d0-e5eb-4e1b-af5a-78b3bd32e232.json
 delete mode 100644 data/helm_classic/openai/ada-350M/1fb2c6db-2495-4609-a96b-57815c579953.json
 delete mode 100644 data/helm_classic/openai/babbage-1.3B/a5b6cc8b-676d-4c19-8093-0b893937e3d4.json
 delete mode 100644 data/helm_classic/openai/curie-6.7B/0d4d42b2-d90c-418a-b3e3-c2d59453bacf.json
 delete mode 100644 data/helm_classic/openai/davinci-175B/bc207557-fb49-4a87-8401-22c3ce853e7c.json
 delete mode 100644 data/helm_classic/openai/gpt-3.5-turbo-0301/895266ee-71a5-4ca5-b3f9-62df6383ff95.json
 delete mode 100644 data/helm_classic/openai/gpt-3.5-turbo-0613/8828e9e8-5716-41b4-a2d1-233bb056dc32.json
 delete mode 100644 data/helm_classic/openai/text-ada-001/f267ba72-b239-4126-99c5-675f79b1ae95.json
 delete mode 100644 data/helm_classic/openai/text-babbage-001/f386e763-8078-454b-bd14-32b106663d53.json
 delete mode 100644
data/helm_classic/openai/text-curie-001/a4739cda-028b-48e0-b3b5-ca9b583d03f5.json delete mode 100644 data/helm_classic/openai/text-davinci-002/837e20ff-fed1-4431-b643-63b904055c66.json delete mode 100644 data/helm_classic/openai/text-davinci-003/e411f017-22c6-4d49-9bf9-5d99c1091791.json delete mode 100644 data/helm_classic/stanford/Alpaca-7B/7bd2b266-5a65-4c63-bf18-5e4114564bfc.json delete mode 100644 data/helm_classic/tiiuae/Falcon-40B/49a1423e-d5f4-4665-b81e-d491f492a316.json delete mode 100644 data/helm_classic/tiiuae/Falcon-7B/8ec2c3d9-c84e-4742-a760-2d33ddf47eab.json delete mode 100644 data/helm_classic/tiiuae/Falcon-Instruct-40B/ec39cb88-fbd3-4cfb-9a11-571ef43e193e.json delete mode 100644 data/helm_classic/tiiuae/Falcon-Instruct-7B/a2b4ed40-b04f-481f-986b-25a2c26bbb79.json delete mode 100644 data/helm_classic/together/RedPajama-INCITE-Base-7B/e88f9163-5334-43ed-9b56-154bf543f898.json delete mode 100644 data/helm_classic/together/RedPajama-INCITE-Base-v1-3B/6d436bd5-9d49-4895-8c07-7814b2eef12c.json delete mode 100644 data/helm_classic/together/RedPajama-INCITE-Instruct-7B/681d0d6d-de06-4b8e-a7e2-964d98e2806e.json delete mode 100644 data/helm_classic/together/RedPajama-INCITE-Instruct-v1-3B/e79e0c17-2e9b-4b99-85e4-7f15e1a337f7.json delete mode 100644 data/helm_classic/writer/InstructPalmyra-30B/cb80bd5f-204a-4dd8-96ec-40c7df93975f.json delete mode 100644 data/helm_classic/yandex/YaLM-100B/f84f84a8-7191-42ac-8951-5d7141a0f700.json delete mode 100644 data/helm_classic/zhipu-ai/GLM-130B/9ba74767-b675-460a-bb68-e82adb6acd2f.json delete mode 100644 data/helm_instruct/anthropic/claude-v1.3/e5e98ffa-3c2d-42d4-86a9-0cb46a71c684.json delete mode 100644 data/helm_instruct/cohere/command-xlarge-beta/60724488-914d-4efe-98d6-f3ff26fe8fbc.json delete mode 100644 data/helm_instruct/openai/gpt-3.5-turbo-0613/2aaae404-b510-41e0-9a4a-b2d053731454.json delete mode 100644 data/helm_instruct/openai/gpt-4-0314/053badb4-b50a-434a-909c-c4d939c00b4e.json delete mode 100644 data/helm_lite/01-ai/yi-34b/7b4a4c6d-e302-4010-a099-5b01c874ffe8.json delete mode 100644 data/helm_lite/01-ai/yi-6b/db0c0e0c-fcc3-400a-88b4-230ba2929e0f.json delete mode 100644 data/helm_lite/01-ai/yi-large-preview/f6808908-79d9-4de5-8434-94e4bdb854f2.json delete mode 100644 data/helm_lite/AlephAlpha/luminous-base/1a039ef6-5957-4246-82b2-bc607b6554e7.json delete mode 100644 data/helm_lite/AlephAlpha/luminous-extended/fb3abb62-b76b-4a4e-a01f-bc62deba6b36.json delete mode 100644 data/helm_lite/AlephAlpha/luminous-supreme/0e2790d3-40f1-4124-ba41-b65bd9de1852.json delete mode 100644 data/helm_lite/ai21/j2-grande/d55129d3-4eae-4009-a897-fa1624cea6a2.json delete mode 100644 data/helm_lite/ai21/j2-jumbo/6332f0b3-7fab-41ed-a8da-46b142051377.json delete mode 100644 data/helm_lite/ai21/jamba-1.5-large/0cb33741-ca10-40f5-90d3-28e300901ad3.json delete mode 100644 data/helm_lite/ai21/jamba-1.5-mini/80b60ccd-4711-4bce-a0f7-33d5b14fa97d.json delete mode 100644 data/helm_lite/ai21/jamba-instruct/de41775f-f60e-481e-a8ef-3df9a9b65a5a.json delete mode 100644 data/helm_lite/allenai/olmo-7b/bc29d5c6-b5c8-473b-b69c-054026829089.json delete mode 100644 data/helm_lite/amazon/nova-lite-v1_0/ad7e1abd-0263-4971-b37a-b1ca4cb0a8e9.json delete mode 100644 data/helm_lite/amazon/nova-micro-v1_0/4e131240-d66c-4f95-a2c8-7fabbe8b2c25.json delete mode 100644 data/helm_lite/amazon/nova-pro-v1_0/9ef56d5a-de00-4d89-930c-a4c74211dd78.json delete mode 100644 data/helm_lite/anthropic/claude-2.0/5598d3ed-5b37-4aec-b186-0b16c394633b.json delete mode 100644 
data/helm_lite/anthropic/claude-2.1/a039c598-3f93-4f59-a8c4-f1ae3d7b241c.json delete mode 100644 data/helm_lite/anthropic/claude-3-5-haiku-20241022/54bac699-aa82-4133-8c10-c6510c2a7f95.json delete mode 100644 data/helm_lite/anthropic/claude-3-5-sonnet-20240620/79b23601-3148-4256-88ce-67e439a87c5b.json delete mode 100644 data/helm_lite/anthropic/claude-3-5-sonnet-20241022/e92648e4-75c6-4944-9ec1-880823fefc87.json delete mode 100644 data/helm_lite/anthropic/claude-3-haiku-20240307/449feffd-d2e3-4a08-ad69-b8ad522532ae.json delete mode 100644 data/helm_lite/anthropic/claude-3-opus-20240229/d297b253-0f4f-4caf-864b-9f457ab589da.json delete mode 100644 data/helm_lite/anthropic/claude-3-sonnet-20240229/d7a7e038-0985-4ee2-a549-0906b3aa8cc5.json delete mode 100644 data/helm_lite/anthropic/claude-instant-1.2/cb409208-034d-42fd-acce-ab5cc4227383.json delete mode 100644 data/helm_lite/anthropic/claude-v1.3/b2572ef8-446a-45b4-b557-45736418753b.json delete mode 100644 data/helm_lite/cohere/command-light/70d85516-b710-4b27-b664-03a6a822773b.json delete mode 100644 data/helm_lite/cohere/command-r-plus/a8208df4-eb37-47d2-8845-f821e80e9858.json delete mode 100644 data/helm_lite/cohere/command-r/22cde248-40ab-43b0-a408-6d8b84692f22.json delete mode 100644 data/helm_lite/cohere/command/b0f85fd8-cfab-4fe0-8b36-7ea97e99a023.json delete mode 100644 data/helm_lite/databricks/dbrx-instruct/ec27e9fc-166d-454b-90c7-2eb8195ae2e2.json delete mode 100644 data/helm_lite/deepseek-ai/deepseek-llm-67b-chat/8721a15b-9102-4b1a-bde8-e5371f00f1b5.json delete mode 100644 data/helm_lite/deepseek-ai/deepseek-v3/23b3a30c-8aa3-4684-be54-adae003720fc.json delete mode 100644 data/helm_lite/google/gemini-1.0-pro-002/7022c444-d6b8-4374-be0c-14835e5fd281.json delete mode 100644 data/helm_lite/google/gemini-1.5-flash-001/bc93fd3d-b6cc-4c03-8c71-d8f1f5ef5957.json delete mode 100644 data/helm_lite/google/gemini-1.5-flash-002/bc7b0ecf-f2a9-44c2-8949-bbfe762f1b72.json delete mode 100644 data/helm_lite/google/gemini-1.5-pro-001/527418d0-2591-43c9-b639-17328292b110.json delete mode 100644 data/helm_lite/google/gemini-1.5-pro-002/8ddc465f-4f2d-4213-81c4-70b584d48047.json delete mode 100644 data/helm_lite/google/gemini-2.0-flash-exp/eca63d17-7fc2-4722-8bb3-0be99a257100.json delete mode 100644 data/helm_lite/google/gemma-2-27b-it/e40a10b3-e682-4715-b2ee-4efcae050a58.json delete mode 100644 data/helm_lite/google/gemma-2-9b-it/56425fda-a1f4-40cc-82f7-6a56ab2ccfaf.json delete mode 100644 data/helm_lite/google/gemma-7b/f47ca10d-cd45-485e-b9cf-0c6592d63656.json delete mode 100644 data/helm_lite/google/text-bison@001/7f0e318e-31bf-4044-bffb-357c1238d4fd.json delete mode 100644 data/helm_lite/google/text-unicorn@001/818d6d72-0b5c-4fcf-b808-1d186223301e.json delete mode 100644 data/helm_lite/meta/llama-2-13b/f09b853b-dbbc-4252-a0f0-a2c45c29f670.json delete mode 100644 data/helm_lite/meta/llama-2-70b/f84d3cf5-0f7d-481e-b782-a5c98cf9faec.json delete mode 100644 data/helm_lite/meta/llama-2-7b/83c6a723-87a0-43d4-968e-86d186578e9e.json delete mode 100644 data/helm_lite/meta/llama-3-70b/daaf221b-1759-4619-91fb-938e81975787.json delete mode 100644 data/helm_lite/meta/llama-3-8b/6b528e49-fec4-4b63-bfb5-1b0df021f3c2.json delete mode 100644 data/helm_lite/meta/llama-3.1-405b-instruct-turbo/1043b815-b247-4444-bf8c-0b92b793c57f.json delete mode 100644 data/helm_lite/meta/llama-3.1-70b-instruct-turbo/28bc8f72-7b91-47fc-b10e-cd268cbc1caf.json delete mode 100644 data/helm_lite/meta/llama-3.1-8b-instruct-turbo/73dedd31-7d40-4ee6-994d-00eb7d656597.json delete mode 100644 
data/helm_lite/meta/llama-3.2-11b-vision-instruct-turbo/18da1dfa-5366-477b-a9cf-af29c5a99b68.json delete mode 100644 data/helm_lite/meta/llama-3.2-90b-vision-instruct-turbo/80057cc1-45ab-4976-878e-be963eaa83b1.json delete mode 100644 data/helm_lite/meta/llama-3.3-70b-instruct-turbo/d896249f-bbd9-4657-a5db-5968544cb5fa.json delete mode 100644 data/helm_lite/meta/llama-65b/9f73f3e5-b573-45d4-8c98-82f5c496f786.json delete mode 100644 data/helm_lite/microsoft/phi-2/a06a38e5-c198-4efd-96f3-b52bd7f9c6dc.json delete mode 100644 data/helm_lite/microsoft/phi-3-medium-4k-instruct/4ff688da-61a0-43ce-9c2d-e1c197887683.json delete mode 100644 data/helm_lite/microsoft/phi-3-small-8k-instruct/181003ea-7587-4c93-8b89-c5c76958313d.json delete mode 100644 data/helm_lite/mistralai/mistral-7b-instruct-v0.3/66688228-e59a-4caa-b3fb-c5df1efc9db4.json delete mode 100644 data/helm_lite/mistralai/mistral-7b-v0.1/2d7d8bac-714e-49a8-a1a7-d88d759fe60a.json delete mode 100644 data/helm_lite/mistralai/mistral-large-2402/077fe37f-b3a4-483a-93a5-034c6445fe98.json delete mode 100644 data/helm_lite/mistralai/mistral-large-2407/4fbb173c-b900-4e11-87bd-1ac6a489d014.json delete mode 100644 data/helm_lite/mistralai/mistral-medium-2312/e56e8834-27d7-44e7-b5bb-907a4d7b6a58.json delete mode 100644 data/helm_lite/mistralai/mistral-small-2402/0925f9b7-08f8-485f-84bc-a153a54aa417.json delete mode 100644 data/helm_lite/mistralai/mixtral-8x22b/08082277-8305-4007-97cd-88202fc0115c.json delete mode 100644 data/helm_lite/mistralai/mixtral-8x7b-32kseqlen/fe554cbd-2480-40bd-b2f5-464cad700c14.json delete mode 100644 data/helm_lite/mistralai/open-mistral-nemo-2407/9d048af8-b1cb-49cb-b8ab-ab0948deacd7.json delete mode 100644 data/helm_lite/openai/gpt-3.5-turbo-0613/d9654997-1d3e-41c3-9f16-05a36dde9b02.json delete mode 100644 data/helm_lite/openai/gpt-4-0613/73d6b1fe-3f58-4640-b24b-e12b9ea1aca3.json delete mode 100644 data/helm_lite/openai/gpt-4-1106-preview/4d01d929-b5e2-42dc-89ee-20560f560db5.json delete mode 100644 data/helm_lite/openai/gpt-4-turbo-2024-04-09/76c78ade-2ad6-4c85-93c9-65c4b6b249b7.json delete mode 100644 data/helm_lite/openai/gpt-4o-2024-05-13/69ea0ef0-c136-4cff-9607-6ae12e0692c3.json delete mode 100644 data/helm_lite/openai/gpt-4o-2024-08-06/bbe708f3-fb78-49e9-876d-cae57f1231cc.json delete mode 100644 data/helm_lite/openai/gpt-4o-mini-2024-07-18/ab7b7951-0792-4538-8a7a-6baee8602cbb.json delete mode 100644 data/helm_lite/openai/text-davinci-002/fc94c95d-9678-4f23-b82f-190a08ece307.json delete mode 100644 data/helm_lite/openai/text-davinci-003/3f92e2fc-9831-4c2c-b94e-af33d457fa82.json delete mode 100644 data/helm_lite/qwen/qwen1.5-110b-chat/3e3c79f0-5fb8-4a3f-8c9b-53f742ec2f43.json delete mode 100644 data/helm_lite/qwen/qwen1.5-14b/6b2891bd-2444-4286-8ccf-c91181856d29.json delete mode 100644 data/helm_lite/qwen/qwen1.5-32b/bd924bd3-e13c-48e0-b339-8c15c5072038.json delete mode 100644 data/helm_lite/qwen/qwen1.5-72b/b8a6f32a-9904-43bb-9add-89404093a9db.json delete mode 100644 data/helm_lite/qwen/qwen1.5-7b/c49e4b98-49c5-485b-8f16-0eeed2d9cd82.json delete mode 100644 data/helm_lite/qwen/qwen2-72b-instruct/9c1fc50a-437d-458b-926c-33cabdcc4aeb.json delete mode 100644 data/helm_lite/qwen/qwen2.5-72b-instruct-turbo/5e0e911a-79b0-46fe-88eb-f9ae8cbdd642.json delete mode 100644 data/helm_lite/qwen/qwen2.5-7b-instruct-turbo/10e1abfa-83de-4960-8d4c-c5099894cb80.json delete mode 100644 data/helm_lite/snowflake/snowflake-arctic-instruct/40aa244f-a5dd-4e02-9ca5-6edaf755b79f.json delete mode 100644 
data/helm_lite/tiiuae/falcon-40b/2abf3bb8-a78f-4a59-807e-52da4e6426fd.json delete mode 100644 data/helm_lite/tiiuae/falcon-7b/ae28615a-b7fa-4782-89e1-4b8e4804dc62.json delete mode 100644 data/helm_lite/upstage/solar-pro-241126/52bb6ab9-e80b-4bf0-a375-7706f16d311d.json delete mode 100644 data/helm_lite/writer/palmyra-x-004/fcf9d3dd-8b31-4ab9-98c4-d2712eebf867.json delete mode 100644 data/helm_lite/writer/palmyra-x-v2/1158720a-9a0e-492e-a677-9b0936f4cde5.json delete mode 100644 data/helm_lite/writer/palmyra-x-v3/254ded81-4051-420d-b402-2e7b80a23848.json delete mode 100644 data/helm_mmlu/01-ai/yi-34b/ce5acf2d-e5c6-42b8-ac8e-622a755300b8.json delete mode 100644 data/helm_mmlu/01-ai/yi-6b/7f2975a3-1cd5-4102-bb0c-f0f329db9d2d.json delete mode 100644 data/helm_mmlu/01-ai/yi-large-preview/5b5508aa-2956-4a38-84e2-c50b9ce08dc9.json delete mode 100644 data/helm_mmlu/ai21/jamba-1.5-large/0e14f2da-72a0-451a-ad35-d8ecd9e27d3f.json delete mode 100644 data/helm_mmlu/ai21/jamba-1.5-mini/92e0b1b9-c167-4e07-b770-2b78527eb4eb.json delete mode 100644 data/helm_mmlu/ai21/jamba-instruct/3da06ad4-0770-45f5-a6a2-9ef9500cef05.json delete mode 100644 data/helm_mmlu/allenai/olmo-1.7-7b/c1c79360-60bd-4f5d-a746-e0411b94f69b.json delete mode 100644 data/helm_mmlu/allenai/olmo-7b/bb904716-048c-4b41-9f64-4d17c485afe3.json delete mode 100644 data/helm_mmlu/amazon/nova-lite-v1_0/063bd04d-e0d8-426a-a56a-062f7bc1a4e4.json delete mode 100644 data/helm_mmlu/amazon/nova-micro-v1_0/c8949c55-8987-4ed3-b74b-8b13b4381806.json delete mode 100644 data/helm_mmlu/amazon/nova-pro-v1_0/ecfa0e26-edff-46e4-8954-6f07a0e6fca0.json delete mode 100644 data/helm_mmlu/anthropic/claude-2.1/bc9cedd7-5cb2-44b2-abda-470322570e14.json delete mode 100644 data/helm_mmlu/anthropic/claude-3-5-haiku-20241022/305a7f25-6e22-4146-9678-6a687a701567.json delete mode 100644 data/helm_mmlu/anthropic/claude-3-5-sonnet-20240620/c6059976-85a1-40ce-b02f-67e182aa2f7d.json delete mode 100644 data/helm_mmlu/anthropic/claude-3-5-sonnet-20241022/6a59feac-f2d5-4eaf-a440-036b0acfbfc0.json delete mode 100644 data/helm_mmlu/anthropic/claude-3-haiku-20240307/f397ca7a-41c4-4926-b075-2523639f0a50.json delete mode 100644 data/helm_mmlu/anthropic/claude-3-opus-20240229/acdf4701-e1c2-4867-bd85-d34ae8fb0991.json delete mode 100644 data/helm_mmlu/anthropic/claude-3-sonnet-20240229/3cd855af-9679-4fd0-bc3f-34db697c7855.json delete mode 100644 data/helm_mmlu/anthropic/claude-instant-1.2/78fb6814-e32f-4b15-b958-9e001637ba07.json delete mode 100644 data/helm_mmlu/cohere/command-r-plus/f3bccdeb-88a2-46ce-bfc9-5d5c3a7e8708.json delete mode 100644 data/helm_mmlu/cohere/command-r/cefc3b25-0779-4fb3-93a5-3c7a285304af.json delete mode 100644 data/helm_mmlu/databricks/dbrx-instruct/7e00e082-0e79-45e0-b0ff-5458cc2aff85.json delete mode 100644 data/helm_mmlu/deepseek-ai/deepseek-llm-67b-chat/ee5528b4-b4a5-423f-8149-6c1dc4d2096d.json delete mode 100644 data/helm_mmlu/deepseek-ai/deepseek-v3/c97b0f33-eda0-4069-9ab6-f277c1f8e55b.json delete mode 100644 data/helm_mmlu/google/gemini-1.0-pro-001/7ea5b404-d98f-4282-81d8-6ca5f6629429.json delete mode 100644 data/helm_mmlu/google/gemini-1.5-flash-001/7056c7e7-f68a-4764-aa48-a8368ae2e317.json delete mode 100644 data/helm_mmlu/google/gemini-1.5-flash-002/5e67014d-6ca1-4e65-a85a-84d91e147d4d.json delete mode 100644 data/helm_mmlu/google/gemini-1.5-flash-preview-0514/3e82f5a5-b80a-4f2f-a262-43c6ee50fbf8.json delete mode 100644 data/helm_mmlu/google/gemini-1.5-pro-001/46d5e547-507e-4c98-98a9-bad1bfad7f7b.json delete mode 100644 
data/helm_mmlu/google/gemini-1.5-pro-002/ce32874c-ceb9-4e6b-96bc-ff56cb99be5d.json delete mode 100644 data/helm_mmlu/google/gemini-1.5-pro-preview-0409/2b31b441-caa9-465c-a2d2-051c951c7be3.json delete mode 100644 data/helm_mmlu/google/gemini-2.0-flash-exp/b7ea6c93-af70-4c0f-ba50-03a539416a8b.json delete mode 100644 data/helm_mmlu/google/gemma-2-27b/fe4cec30-e483-49a8-80ea-00b2c6231740.json delete mode 100644 data/helm_mmlu/google/gemma-2-9b/53fe520f-4dbc-436a-b9d6-4a5067c30ebd.json delete mode 100644 data/helm_mmlu/google/gemma-7b/af88b02d-cb29-4d2c-bb33-5fddcf316a95.json delete mode 100644 data/helm_mmlu/google/text-bison@001/a0abcd19-58a1-478a-9786-d044a4181241.json delete mode 100644 data/helm_mmlu/google/text-unicorn@001/95eda13a-cd34-4170-b2db-f2ead47250f9.json delete mode 100644 data/helm_mmlu/meta/llama-2-13b/7f37161a-3f1c-4bc4-860f-8fdbf623f63e.json delete mode 100644 data/helm_mmlu/meta/llama-2-70b/9da7439c-e96b-444f-b4fa-7ef638080740.json delete mode 100644 data/helm_mmlu/meta/llama-2-7b/294b22a0-1676-4d8c-8ad2-5cdc40267255.json delete mode 100644 data/helm_mmlu/meta/llama-3-70b/1c11950d-bd2f-407b-928a-5cd33a0a3d6e.json delete mode 100644 data/helm_mmlu/meta/llama-3-8b/78f2484e-bc73-4026-929b-db345e92cf5a.json delete mode 100644 data/helm_mmlu/meta/llama-3.1-405b-instruct-turbo/8ddf9de8-2ee3-4a30-9250-30fd027c63b4.json delete mode 100644 data/helm_mmlu/meta/llama-3.1-70b-instruct-turbo/41af381a-3637-4578-a582-59d9b1327d95.json delete mode 100644 data/helm_mmlu/meta/llama-3.1-8b-instruct-turbo/96544ff3-225e-4f8f-82fb-2e3c42d5ba89.json delete mode 100644 data/helm_mmlu/meta/llama-3.2-11b-vision-instruct-turbo/bb6fd9af-5dd0-4590-b6ea-7687029ca18c.json delete mode 100644 data/helm_mmlu/meta/llama-3.2-90b-vision-instruct-turbo/e036de72-b425-4aa5-9448-dc52560e60db.json delete mode 100644 data/helm_mmlu/meta/llama-3.3-70b-instruct-turbo/65423181-18f1-4296-98c2-171356106404.json delete mode 100644 data/helm_mmlu/microsoft/phi-2/41c3f46d-c798-422c-8b6a-b176ffa8e8ae.json delete mode 100644 data/helm_mmlu/microsoft/phi-3-medium-4k-instruct/f78d6e0a-a397-4a41-a37e-696bda5a1987.json delete mode 100644 data/helm_mmlu/microsoft/phi-3-small-8k-instruct/d2bf70ce-341f-49d7-bd03-87b523826953.json delete mode 100644 data/helm_mmlu/mistralai/mistral-7b-instruct-v0.3/b20860aa-fb88-46b8-a79b-fa71a79c7d4d.json delete mode 100644 data/helm_mmlu/mistralai/mistral-7b-v0.1/08590b6e-7050-413d-844b-1f3f1c5aa444.json delete mode 100644 data/helm_mmlu/mistralai/mistral-large-2402/2d18fd88-73b5-4d4c-a1cc-e66a20316605.json delete mode 100644 data/helm_mmlu/mistralai/mistral-large-2407/567918be-be6f-4e41-b613-727828fe8a44.json delete mode 100644 data/helm_mmlu/mistralai/mistral-small-2402/c2be131b-808c-4947-b24f-69ef6af499d7.json delete mode 100644 data/helm_mmlu/mistralai/mixtral-8x22b/24955250-a2e9-475f-a866-30a835579e03.json delete mode 100644 data/helm_mmlu/mistralai/mixtral-8x7b-32kseqlen/de6f7e19-b54a-4bd3-b624-29f66afbee15.json delete mode 100644 data/helm_mmlu/mistralai/open-mistral-nemo-2407/e4c3032d-04e0-414b-a7e9-e30756d82000.json delete mode 100644 data/helm_mmlu/openai/gpt-3.5-turbo-0125/e9a41d4b-56c7-47f0-b439-72ad1e463000.json delete mode 100644 data/helm_mmlu/openai/gpt-3.5-turbo-0613/a2b7c0ec-fb74-4698-80ad-f054039ecb3f.json delete mode 100644 data/helm_mmlu/openai/gpt-4-0613/fd6aea24-dc18-41ce-bc19-23f461a39032.json delete mode 100644 data/helm_mmlu/openai/gpt-4-1106-preview/625d33ce-a320-4bfd-a962-451b8c22d392.json delete mode 100644 
data/helm_mmlu/openai/gpt-4-turbo-2024-04-09/e51be257-610e-4d38-b58a-a3b29fc06a83.json delete mode 100644 data/helm_mmlu/openai/gpt-4o-2024-05-13/9e0b9f48-f913-4bbe-a135-59e596c9e479.json delete mode 100644 data/helm_mmlu/openai/gpt-4o-2024-08-06/189e6cc5-1c8f-4712-8dda-c108f18f836d.json delete mode 100644 data/helm_mmlu/openai/gpt-4o-mini-2024-07-18/4f043e7b-dfb5-4de5-a034-c4b0a335a8b3.json delete mode 100644 data/helm_mmlu/qwen/qwen1.5-110b-chat/ff9b6c57-cadd-4d5d-92cb-62be63939b1b.json delete mode 100644 data/helm_mmlu/qwen/qwen1.5-14b/fa6a6772-671b-402e-9480-d61e0fb4a61e.json delete mode 100644 data/helm_mmlu/qwen/qwen1.5-32b/b5279e94-ae7f-4671-9315-874e162a24fd.json delete mode 100644 data/helm_mmlu/qwen/qwen1.5-72b/de00e8da-9c83-40df-b642-b94719ce1ac2.json delete mode 100644 data/helm_mmlu/qwen/qwen1.5-7b/119b645f-04c8-4979-bff2-d1e4fdc2a7bc.json delete mode 100644 data/helm_mmlu/qwen/qwen2-72b-instruct/80aabdf4-60b7-493b-98d8-1854f1c41c10.json delete mode 100644 data/helm_mmlu/qwen/qwen2.5-72b-instruct-turbo/29958cee-32c9-4d51-8f14-72db4273459f.json delete mode 100644 data/helm_mmlu/qwen/qwen2.5-7b-instruct-turbo/72537b16-feda-4e5e-a477-f415650db847.json delete mode 100644 data/helm_mmlu/snowflake/snowflake-arctic-instruct/7df68af5-667a-4125-9c12-e71fb5af0a74.json delete mode 100644 data/helm_mmlu/upstage/solar-pro-241126/1845eb8b-4c94-4d22-8771-012f7230dc62.json delete mode 100644 data/helm_mmlu/writer/palmyra-x-004/b2c8cfd1-f09a-4616-8038-c7e1930bce74.json delete mode 100644 data/helm_mmlu/writer/palmyra-x-v3/12976629-cefe-4329-b974-bb17f88d385d.json delete mode 100644 data/hfopenllm_v2/0-hero/Matter-0.2-7B-DPO/0d7928c3-c769-474e-8249-7a5c70c4c559.json delete mode 100644 data/hfopenllm_v2/01-ai/Yi-1.5-34B-32K/f63536ed-752b-4538-9b92-2514a617a4bf.json delete mode 100644 data/hfopenllm_v2/01-ai/Yi-1.5-34B-Chat-16K/8ff13de2-ea43-4392-992f-ba70b6023e96.json delete mode 100644 data/hfopenllm_v2/01-ai/Yi-1.5-34B-Chat/02bac8a7-bd09-4e73-979a-7dbaa7a8ed75.json delete mode 100644 data/hfopenllm_v2/01-ai/Yi-1.5-34B/74e4406d-b2b6-4c3f-b059-f52cccf1fff4.json delete mode 100644 data/hfopenllm_v2/01-ai/Yi-1.5-6B-Chat/ec8a6d6c-b8ea-48a3-9af6-d357e0057ec1.json delete mode 100644 data/hfopenllm_v2/01-ai/Yi-1.5-6B/05307b41-d832-4533-99bd-c8608bf8e64c.json delete mode 100644 data/hfopenllm_v2/01-ai/Yi-1.5-9B-32K/c09bd9b0-6f85-4120-94a9-b628c68bccb7.json delete mode 100644 data/hfopenllm_v2/01-ai/Yi-1.5-9B-Chat-16K/9f971385-1146-4436-91a6-0e52d4db1f07.json delete mode 100644 data/hfopenllm_v2/01-ai/Yi-1.5-9B-Chat/80ed14ca-b4cd-4ceb-8fdb-24705e47bd0e.json delete mode 100644 data/hfopenllm_v2/01-ai/Yi-1.5-9B/db88e3f5-58a9-4783-9093-a6df96483342.json delete mode 100644 data/hfopenllm_v2/01-ai/Yi-34B-200K/8cd90f8a-d8dc-469b-95b9-260fcef804d2.json delete mode 100644 data/hfopenllm_v2/01-ai/Yi-34B-Chat/b2c82703-2b5c-407d-b84f-a8f8261ac894.json delete mode 100644 data/hfopenllm_v2/01-ai/Yi-34B/55462e67-5eca-4e9d-9095-51fcf12de5fa.json delete mode 100644 data/hfopenllm_v2/01-ai/Yi-6B-200K/25a119f0-5eaa-4fa9-8cd4-e0f437ada456.json delete mode 100644 data/hfopenllm_v2/01-ai/Yi-6B-Chat/efc036b6-d8de-4393-87a1-d4f86fb44d91.json delete mode 100644 data/hfopenllm_v2/01-ai/Yi-6B/a5144406-eb85-43b2-a49d-be6b06d6b04a.json delete mode 100644 data/hfopenllm_v2/01-ai/Yi-9B-200K/900184ad-656d-416b-956f-5f6e3a991d1b.json delete mode 100644 data/hfopenllm_v2/01-ai/Yi-9B/7a58954a-5d7d-4640-99fd-773249640237.json delete mode 100644 data/hfopenllm_v2/01-ai/Yi-Coder-9B-Chat/4ea3146c-b912-424a-b0a9-7c37348348c8.json delete mode 
100644 data/hfopenllm_v2/1-800-LLMs/Qwen-2.5-14B-Hindi-Custom-Instruct/b0276278-6d86-49c0-a246-cd9110ac1deb.json delete mode 100644 data/hfopenllm_v2/1-800-LLMs/Qwen-2.5-14B-Hindi/04216f67-1385-43bf-b7de-5bae7a60f379.json delete mode 100644 data/hfopenllm_v2/1024m/PHI-4-Hindi/fbf7b76b-7ced-4217-8e14-1d02184e271c.json delete mode 100644 data/hfopenllm_v2/1024m/QWEN-14B-B100/74ac8aba-6dfb-464c-81b5-d02a9192b9cc.json delete mode 100644 data/hfopenllm_v2/152334H/miqu-1-70b-sf/295938e1-ade2-4d36-beca-3cbe506b5b90.json delete mode 100644 data/hfopenllm_v2/1TuanPham/T-VisStar-7B-v0.1/f331782f-ea09-41bd-8c6a-e964c88d7e09.json delete mode 100644 data/hfopenllm_v2/1TuanPham/T-VisStar-v0.1/e4e3d79a-1de9-43be-a029-0be4f60e472b.json delete mode 100644 data/hfopenllm_v2/3rd-Degree-Burn/L-3.1-Science-Writer-8B/6914ac28-b543-4f36-81f1-f7491c018e3b.json delete mode 100644 data/hfopenllm_v2/3rd-Degree-Burn/Llama-3.1-8B-Squareroot-v1/b7378f41-46ab-41af-94cc-e7fb10738658.json delete mode 100644 data/hfopenllm_v2/3rd-Degree-Burn/Llama-3.1-8B-Squareroot/acedae59-6192-4ac4-a354-d520ecd6ba36.json delete mode 100644 data/hfopenllm_v2/3rd-Degree-Burn/Llama-Squared-8B/ff105961-761d-4261-8a44-20acf2e7f440.json delete mode 100644 data/hfopenllm_v2/4season/final_model_test_v2/fa0901f6-514e-44ae-84dc-0b793f26169e.json delete mode 100644 data/hfopenllm_v2/AALF/FuseChat-Llama-3.1-8B-Instruct-preview/d2dff5df-343b-40f3-85de-14eb72dab050.json delete mode 100644 data/hfopenllm_v2/AALF/FuseChat-Llama-3.1-8B-SFT-preview/8fa3010f-b7a1-4fc1-9156-ba70453add86.json delete mode 100644 data/hfopenllm_v2/AALF/gemma-2-27b-it-SimPO-37K-100steps/58034f99-3b01-46d6-aea9-90c75d073bb0.json delete mode 100644 data/hfopenllm_v2/AALF/gemma-2-27b-it-SimPO-37K/e6c08c9c-6d01-45c7-8a24-219b756b8632.json delete mode 100644 data/hfopenllm_v2/AELLM/gemma-2-aeria-infinity-9b/cd97ad01-1d20-4cbd-a9bb-2acf3d9fdcc7.json delete mode 100644 data/hfopenllm_v2/AELLM/gemma-2-lyco-infinity-9b/95f44ef8-e5ba-4bdc-97a7-2c5a678b07be.json delete mode 100644 data/hfopenllm_v2/AGI-0/Art-v0-3B/082f25f0-994c-438a-8086-b1e439aca466.json delete mode 100644 data/hfopenllm_v2/AGI-0/Artificium-llama3.1-8B-001/31423cbd-08cd-4079-b1c5-ba412acf1b51.json delete mode 100644 data/hfopenllm_v2/AGI-0/smartllama3.1-8B-001/2669bd86-da65-4d87-8464-bfa8c741ce0b.json delete mode 100644 data/hfopenllm_v2/AI-MO/NuminaMath-7B-CoT/ab2c19ff-5671-446f-b09e-731e2ae515ca.json delete mode 100644 data/hfopenllm_v2/AI-MO/NuminaMath-7B-TIR/36250dc3-cb51-43be-8ab0-6788eb5bda7c.json delete mode 100644 data/hfopenllm_v2/AI-Sweden-Models/Llama-3-8B-instruct/cd616d6a-151f-4aaa-93b5-9c4a758f95b5.json delete mode 100644 data/hfopenllm_v2/AI-Sweden-Models/gpt-sw3-40b/9cb09cae-9b1b-43b1-afbf-f44b0a44053c.json delete mode 100644 data/hfopenllm_v2/AI4free/Dhanishtha/038c32da-add5-4299-ac17-df6ef3fdea58.json delete mode 100644 data/hfopenllm_v2/AI4free/t2/25eb4bdf-beb4-4ad2-a5e9-3a2f31c46cb5.json delete mode 100644 data/hfopenllm_v2/AIDC-AI/Marco-o1/77655d60-872f-468a-acc6-d584ef5bf46a.json delete mode 100644 data/hfopenllm_v2/Aashraf995/Creative-7B-nerd/4de378c8-ccf6-4f0b-8287-3d138a8645b9.json delete mode 100644 data/hfopenllm_v2/Aashraf995/Gemma-Evo-10B/8039cadf-6644-44e7-8452-90e9c8069e28.json delete mode 100644 data/hfopenllm_v2/Aashraf995/Qwen-Evo-7B/8914d89d-c873-4704-998e-dc807e96030b.json delete mode 100644 data/hfopenllm_v2/Aashraf995/QwenStock-14B/c2e9fc29-db07-4b49-a98a-084158831ac4.json delete mode 100644 data/hfopenllm_v2/AbacusResearch/Jallabi-34B/58724539-6fc5-40d9-ba43-87410959894d.json delete 
mode 100644 data/hfopenllm_v2/Ahdoot/StructuredThinker-v0.3-MoreStructure/b13324cf-f6f5-4bf1-9cf3-c196120c4bcf.json delete mode 100644 data/hfopenllm_v2/Ahdoot/Test_StealthThinker/782b2df0-d1b3-414c-a4bd-59052a4441a9.json delete mode 100644 data/hfopenllm_v2/AicoresSecurity/Cybernet-Sec-3B-R1-V0-Coder/b508e41e-0f1c-49ce-8b80-5e7ec82b8f15.json delete mode 100644 data/hfopenllm_v2/AicoresSecurity/Cybernet-Sec-3B-R1-V0/2824e8d4-2749-4b18-a3a1-b987ed215ac6.json delete mode 100644 data/hfopenllm_v2/AicoresSecurity/Cybernet-Sec-3B-R1-V1.1/53176984-ba93-4a64-b81e-21f6e0f65bcd.json delete mode 100644 data/hfopenllm_v2/AicoresSecurity/Cybernet-Sec-3B-R1-V1/53252698-7d17-4f2a-9106-3b744ae7a985.json delete mode 100644 data/hfopenllm_v2/Alepach/notHumpback-M0/6dd0f3a2-27ee-48f1-9d97-ef6954d298c8.json delete mode 100644 data/hfopenllm_v2/Alepach/notHumpback-M1-v2/35f11d5e-88c4-4a95-8d06-a40bee648b00.json delete mode 100644 data/hfopenllm_v2/Alepach/notHumpback-M1/ba1193c0-42b8-487d-b9fd-ddbc1fd15359.json delete mode 100644 data/hfopenllm_v2/Alibaba-NLP/gte-Qwen2-7B-instruct/95733620-e1e7-4442-b9c3-a699165df5e7.json delete mode 100644 data/hfopenllm_v2/Alsebay/Qwen2.5-7B-test-novelist/cacfce0d-f5f1-4101-8065-f5f02eaab1fb.json delete mode 100644 data/hfopenllm_v2/Amaorynho/BBAI2006/72be5537-198a-43e9-9840-a803083158d3.json delete mode 100644 data/hfopenllm_v2/Amaorynho/BBAI270V4/2e9a3443-970d-4f37-a356-277a11c81754.json delete mode 100644 data/hfopenllm_v2/Amaorynho/BBAIIFEV1/1188402f-aa1c-4306-b031-c92ff0a5dd64.json delete mode 100644 data/hfopenllm_v2/Amaorynho/BBAI_375/ee2f567a-6403-46d5-9a6b-bd029f81d660.json delete mode 100644 data/hfopenllm_v2/Amu/t1-1.5B/d809fdff-f5ff-44f5-afc7-7e8af9ce2f93.json delete mode 100644 data/hfopenllm_v2/Amu/t1-3B/87d66efc-173f-4c14-b76c-d8b7e00d575d.json delete mode 100644 data/hfopenllm_v2/ArliAI/ArliAI-RPMax-12B-v1.1/47f62378-c3cc-408f-a0d1-71eb3f522f57.json delete mode 100644 data/hfopenllm_v2/ArliAI/Llama-3.1-8B-ArliAI-RPMax-v1.1/dba8c12c-388d-4f8b-8ce8-83acfc4920c7.json delete mode 100644 data/hfopenllm_v2/Arthur-LAGACHERIE/Precis-1B-Instruct/e4087285-1d1a-465e-ac88-91310e939710.json delete mode 100644 data/hfopenllm_v2/Artples/L-MChat-7b/09f189d9-74fd-47bb-b5fb-7994cba56ae2.json delete mode 100644 data/hfopenllm_v2/Artples/L-MChat-Small/5754c262-6ddf-4f54-9722-22ff20a8d76f.json delete mode 100644 data/hfopenllm_v2/Aryanne/QwentileSwap/cc1bd811-ec88-4514-8b47-4140ded4f03d.json delete mode 100644 data/hfopenllm_v2/Aryanne/SHBA/3f08155d-8551-4472-86fe-7988cd6df78b.json delete mode 100644 data/hfopenllm_v2/Aryanne/SuperHeart/339e12fb-b4a4-4a4b-bb40-899b4ad833f9.json delete mode 100644 data/hfopenllm_v2/AtAndDev/Qwen2.5-1.5B-continuous-learnt/4fd60e9c-5c90-492a-b24d-7ca6d1e91eae.json delete mode 100644 data/hfopenllm_v2/AtAndDev/Qwen2.5-1.5B-continuous-learnt/7f8d935e-3782-4769-8bd0-ee8a0ce91cd6.json delete mode 100644 data/hfopenllm_v2/Ateron/Glowing-Forest-12B/6fa07e60-9f82-4abc-aa45-4dfc0bcf9b8d.json delete mode 100644 data/hfopenllm_v2/Ateron/Lotus-Magpic/99a0022b-3fe7-4612-9cbb-cf082c1f6b70.json delete mode 100644 data/hfopenllm_v2/Ateron/Way_of_MagPicaro/b1153714-d6fe-4ff9-ab8c-85b677d57f8f.json delete mode 100644 data/hfopenllm_v2/AuraIndustries/Aura-4B/c3d39b6c-02af-410d-8a5c-224495b04572.json delete mode 100644 data/hfopenllm_v2/AuraIndustries/Aura-8B/0426fcba-3db4-492d-b622-e34ab8d3fc8f.json delete mode 100644 data/hfopenllm_v2/AuraIndustries/Aura-MoE-2x4B-v2/aa099cfe-ac9a-42dd-8357-f4d8115133ca.json delete mode 100644 
data/hfopenllm_v2/AuraIndustries/Aura-MoE-2x4B/ccbc8a5e-9a97-452a-b023-cc996ffe31f1.json delete mode 100644 data/hfopenllm_v2/Aurel9/testmerge-7b/b359a7a3-cf2c-4952-b308-333672dadcec.json delete mode 100644 data/hfopenllm_v2/Ayush-Singh/Llama1B-sft-2/0864d5cf-d6fe-42bc-9059-9f2e5ff06b60.json delete mode 100644 data/hfopenllm_v2/Azure99/Blossom-V6-14B/e6ef2559-8a63-43e3-a60b-0d2b7256ad3d.json delete mode 100644 data/hfopenllm_v2/Azure99/Blossom-V6-7B/45d019ab-b23c-4fc3-baf5-d57576e9945c.json delete mode 100644 data/hfopenllm_v2/Azure99/blossom-v5-32b/e3cd7c32-e5a1-4cd6-a9dc-95364a8abe75.json delete mode 100644 data/hfopenllm_v2/Azure99/blossom-v5-llama3-8b/9be442e8-4b77-43e0-a981-887338e59b78.json delete mode 100644 data/hfopenllm_v2/Azure99/blossom-v5.1-34b/a07b6326-f393-490e-b696-d8b45f593d4b.json delete mode 100644 data/hfopenllm_v2/Azure99/blossom-v5.1-9b/b66ed91a-98d5-407c-9896-9c2e2a31e9da.json delete mode 100644 data/hfopenllm_v2/BAAI/Gemma2-9B-IT-Simpo-Infinity-Preference/9c70921d-956b-4727-9201-1addbd01bb8b.json delete mode 100644 data/hfopenllm_v2/BAAI/Infinity-Instruct-3M-0613-Llama3-70B/4ba6d51e-314a-4db4-9552-568a4093e01a.json delete mode 100644 data/hfopenllm_v2/BAAI/Infinity-Instruct-3M-0613-Mistral-7B/835f5056-56bf-4a6c-886f-fbe6f263ac07.json delete mode 100644 data/hfopenllm_v2/BAAI/Infinity-Instruct-3M-0625-Llama3-70B/c2a63afa-9d25-41dc-b25f-848f5a640501.json delete mode 100644 data/hfopenllm_v2/BAAI/Infinity-Instruct-3M-0625-Llama3-8B/f64f9d24-e448-4bb6-89c3-edb66499bac9.json delete mode 100644 data/hfopenllm_v2/BAAI/Infinity-Instruct-3M-0625-Mistral-7B/2de14bfb-844a-4711-815e-8f63487a78fd.json delete mode 100644 data/hfopenllm_v2/BAAI/Infinity-Instruct-3M-0625-Qwen2-7B/f953e0e2-ddca-42a2-a0f6-752a137bc6b5.json delete mode 100644 data/hfopenllm_v2/BAAI/Infinity-Instruct-3M-0625-Yi-1.5-9B/98187b98-0cc8-4756-9cb7-c53deb998f90.json delete mode 100644 data/hfopenllm_v2/BAAI/Infinity-Instruct-7M-0729-Llama3_1-8B/8c79c60d-ebf4-4409-be4f-928a54cedd1d.json delete mode 100644 data/hfopenllm_v2/BAAI/Infinity-Instruct-7M-0729-mistral-7B/5d5cebeb-faf0-4fdf-8749-6307080e82f2.json delete mode 100644 data/hfopenllm_v2/BAAI/Infinity-Instruct-7M-Gen-Llama3_1-70B/e926ce8f-45bb-4f3d-b579-ecadb3df6468.json delete mode 100644 data/hfopenllm_v2/BAAI/Infinity-Instruct-7M-Gen-Llama3_1-8B/070609d6-5f41-4712-9ad7-e215b1a6bb81.json delete mode 100644 data/hfopenllm_v2/BAAI/Infinity-Instruct-7M-Gen-mistral-7B/8d2909c7-37f2-4198-a1e2-4bf2ebc1444d.json delete mode 100644 data/hfopenllm_v2/BAAI/OPI-Llama-3.1-8B-Instruct/53587959-25f9-43aa-a34b-f274d8bc93af.json delete mode 100644 data/hfopenllm_v2/BEE-spoke-data/Meta-Llama-3-8Bee/2a7f80ed-d404-4c81-b000-b65c83069121.json delete mode 100644 data/hfopenllm_v2/BEE-spoke-data/smol_llama-101M-GQA/f0983645-4adb-4ddb-bf2f-33480cb7f421.json delete mode 100644 data/hfopenllm_v2/BEE-spoke-data/smol_llama-220M-GQA-fineweb_edu/161dadfe-4983-4f56-8a7d-9b97f1c5a3c7.json delete mode 100644 data/hfopenllm_v2/BEE-spoke-data/smol_llama-220M-GQA/694a02f9-4729-4d0b-97ce-80adaef29be2.json delete mode 100644 data/hfopenllm_v2/BEE-spoke-data/smol_llama-220M-openhermes/0521f51d-22c1-4821-8f04-23c533411668.json delete mode 100644 data/hfopenllm_v2/BEE-spoke-data/tFINE-900m-e16-d32-flan-infinity-instruct-7m-T2T_en-1024/8fdea71b-5e68-4a78-aefc-8a00650464c4.json delete mode 100644 data/hfopenllm_v2/BEE-spoke-data/tFINE-900m-e16-d32-flan/e2ba5674-9251-4a4e-9eb8-046c834da400.json delete mode 100644 
data/hfopenllm_v2/BEE-spoke-data/tFINE-900m-e16-d32-instruct_2e/4caafdb2-3065-40d4-b5a7-9deb41e1d8a7.json delete mode 100644 data/hfopenllm_v2/BEE-spoke-data/tFINE-900m-instruct-orpo/886e0b8b-b2dc-434f-a299-50f668006241.json delete mode 100644 data/hfopenllm_v2/BSC-LT/salamandra-7b-instruct/7a6a9443-f331-4dfa-acf9-6aa30049bade.json delete mode 100644 data/hfopenllm_v2/BSC-LT/salamandra-7b/6d523da4-ec4a-405b-a25d-afc7b1b5aefd.json delete mode 100644 data/hfopenllm_v2/Ba2han/Llama-Phi-3_DoRA/cfecfce3-090d-4c2e-826c-03c0c5337e98.json delete mode 100644 data/hfopenllm_v2/Baptiste-HUVELLE-10/LeTriomphant2.2_ECE_iLAB/5aa124dc-4abd-4c5f-b40a-a8d81af922eb.json delete mode 100644 data/hfopenllm_v2/BenevolenceMessiah/Qwen2.5-72B-2x-Instruct-TIES-v1.0/ec91b122-c8f5-4dfb-94fd-336ef78c3e14.json delete mode 100644 data/hfopenllm_v2/BenevolenceMessiah/Yi-Coder-9B-Chat-Instruct-TIES-MoE-v1.0/114f246a-6049-40bf-ad86-9a822d13cf74.json delete mode 100644 data/hfopenllm_v2/BlackBeenie/Bloslain-8B-v0.2/82d28a3a-44f2-463f-a1b8-7e9079ec47b7.json delete mode 100644 data/hfopenllm_v2/BlackBeenie/Llama-3.1-8B-OpenO1-SFT-v0.1/ed3c1349-a154-4866-890f-2b115ffaf127.json delete mode 100644 data/hfopenllm_v2/BlackBeenie/Llama-3.1-8B-pythonic-passthrough-merge/47942c55-5ddb-4fda-9c5b-34676ae2046a.json delete mode 100644 data/hfopenllm_v2/BlackBeenie/Neos-Gemma-2-9b/d860210b-4c8a-4d15-ad3a-4e39905f91ed.json delete mode 100644 data/hfopenllm_v2/BlackBeenie/Neos-Llama-3.1-8B/d137f429-2b65-4ee9-9d66-3f619b270fad.json delete mode 100644 data/hfopenllm_v2/BlackBeenie/Neos-Llama-3.1-base/1da10dfe-b0a3-4cb8-aaa3-e16d48f3aab4.json delete mode 100644 data/hfopenllm_v2/BlackBeenie/Neos-Phi-3-14B-v0.1/6156a0d2-4c32-40b2-9624-ef0c7a6a95bb.json delete mode 100644 data/hfopenllm_v2/BlackBeenie/llama-3-luminous-merged/676342d2-f37a-4b6a-967d-3ac750243470.json delete mode 100644 data/hfopenllm_v2/BlackBeenie/llama-3.1-8B-Galore-openassistant-guanaco/950b7108-0192-4875-b4e9-c3e43ab71e08.json delete mode 100644 data/hfopenllm_v2/Bllossom/llama-3.2-Korean-Bllossom-AICA-5B/85672df5-2f35-43be-8648-9937c66872dc.json delete mode 100644 data/hfopenllm_v2/BoltMonkey/DreadMix/051c5642-3b23-4879-9d10-639d1b3127d7.json delete mode 100644 data/hfopenllm_v2/BoltMonkey/NeuralDaredevil-SuperNova-Lite-7B-DARETIES-abliterated/2acf0d12-7e0c-46dc-a079-ebc48a8818d3.json delete mode 100644 data/hfopenllm_v2/BoltMonkey/NeuralDaredevil-SuperNova-Lite-7B-DARETIES-abliterated/8ce42090-006e-4e08-8d3f-5b1eb0b8da0b.json delete mode 100644 data/hfopenllm_v2/BoltMonkey/SuperNeuralDreadDevil-8b/703df6c3-dae4-437f-9379-f8c264797adc.json delete mode 100644 data/hfopenllm_v2/BrainWave-ML/llama3.2-3B-maths-orpo/1e349ad3-d29b-4a4b-97e7-b82055e41b07.json delete mode 100644 data/hfopenllm_v2/BramVanroy/GEITje-7B-ultra/8f677a76-932c-4c35-9708-4b723226aa19.json delete mode 100644 data/hfopenllm_v2/BramVanroy/fietje-2-chat/ebfe625f-ff1f-45f9-826c-9351ea4134e1.json delete mode 100644 data/hfopenllm_v2/BramVanroy/fietje-2-instruct/66e6a757-ac22-47f3-82ce-81af45e1d3cf.json delete mode 100644 data/hfopenllm_v2/BramVanroy/fietje-2/1cd840c7-d432-495c-a3df-af1fa6264259.json delete mode 100644 data/hfopenllm_v2/CYFRAGOVPL/Llama-PLLuM-8B-base/066f520f-9a64-4564-abfc-6435732c3585.json delete mode 100644 data/hfopenllm_v2/CYFRAGOVPL/Llama-PLLuM-8B-chat/aced5181-040a-48c0-bc5f-78d0de3afae8.json delete mode 100644 data/hfopenllm_v2/CYFRAGOVPL/PLLuM-12B-base/a4889a38-84d2-4ae1-b8a9-297b4400602d.json delete mode 100644 
data/hfopenllm_v2/CYFRAGOVPL/PLLuM-12B-chat/d540505a-c67b-4b72-a53a-c03aa6f8d3e7.json delete mode 100644 data/hfopenllm_v2/CYFRAGOVPL/PLLuM-12B-nc-base/9859afee-02ca-4c48-acc8-acfd20c37e4e.json delete mode 100644 data/hfopenllm_v2/CYFRAGOVPL/PLLuM-12B-nc-chat/e222d12b-c796-4890-a584-cd689bae7ea6.json delete mode 100644 data/hfopenllm_v2/CarrotAI/Llama-3.2-Rabbit-Ko-3B-Instruct-2412/c16850f8-0b80-4455-8f38-8ec453cd1d41.json delete mode 100644 data/hfopenllm_v2/CarrotAI/Llama-3.2-Rabbit-Ko-3B-Instruct/0d400b0f-cc82-4c86-b600-93a31b133f9d.json delete mode 100644 data/hfopenllm_v2/Casual-Autopsy/L3-Umbral-Mind-RP-v2.0-8B/90f6f8f1-02fc-425a-8499-e9b43ae8ac59.json delete mode 100644 data/hfopenllm_v2/CausalLM/14B/6704d6bc-6d38-4c59-87a4-81d3eacde3b1.json delete mode 100644 data/hfopenllm_v2/CausalLM/34b-beta/e8ad6ce4-7efc-499e-a2c9-9e0df898fbb9.json delete mode 100644 data/hfopenllm_v2/CausalLM/preview-1-hf/5e9c1273-536d-4280-8fff-9931f46dc968.json delete mode 100644 data/hfopenllm_v2/Changgil/K2S3-14b-v0.2/460ca160-ac34-4091-ba2d-986b53532b55.json delete mode 100644 data/hfopenllm_v2/Changgil/K2S3-v0.1/ef9d2fab-07a2-44e2-aae2-ede5a2ff31d9.json delete mode 100644 data/hfopenllm_v2/ClaudioItaly/Albacus/a29a69d3-d64e-4463-aa52-0a9d6d012c98.json delete mode 100644 data/hfopenllm_v2/ClaudioItaly/Book-Gut12B/4539c16e-1ac6-47f4-88eb-a09842497330.json delete mode 100644 data/hfopenllm_v2/ClaudioItaly/Evolutionstory-7B-v2.2/2ff33c55-1236-4c57-8809-2d3076e43cc7.json delete mode 100644 data/hfopenllm_v2/ClaudioItaly/intelligence-cod-rag-7b-v3/281ba822-49a2-4746-bc04-8de046439508.json delete mode 100644 data/hfopenllm_v2/CohereForAI/aya-23-35B/0606d916-95ea-4318-af0c-3942329071c6.json delete mode 100644 data/hfopenllm_v2/CohereForAI/aya-23-8B/005159f0-da68-480d-972c-c160d145a682.json delete mode 100644 data/hfopenllm_v2/CohereForAI/aya-expanse-32b/2f6abb5d-52b3-44b0-b960-115793485fb1.json delete mode 100644 data/hfopenllm_v2/CohereForAI/aya-expanse-8b/6ffacad9-1a4d-472e-bbbf-0d64d068dd0d.json delete mode 100644 data/hfopenllm_v2/CohereForAI/c4ai-command-r-plus-08-2024/26eadaf8-bfb8-4aad-a8a4-90699b6f0fcd.json delete mode 100644 data/hfopenllm_v2/CohereForAI/c4ai-command-r-plus/d4536913-5708-45e4-a024-45ae37fdae13.json delete mode 100644 data/hfopenllm_v2/CohereForAI/c4ai-command-r-v01/848860aa-7de3-4fae-afca-ac11224b96c5.json delete mode 100644 data/hfopenllm_v2/CohereForAI/c4ai-command-r7b-12-2024/0241a8e3-d6e5-4ba5-afb9-862bde2ba851.json delete mode 100644 data/hfopenllm_v2/Columbia-NLP/LION-Gemma-2b-dpo-v1.0/20b69120-d476-4e34-b3c6-8cef11d6ee78.json delete mode 100644 data/hfopenllm_v2/Columbia-NLP/LION-Gemma-2b-dpo-v1.0/696bbbfc-49dd-444e-a90b-76821845a726.json delete mode 100644 data/hfopenllm_v2/Columbia-NLP/LION-Gemma-2b-odpo-v1.0/e6d974d3-467e-4fe7-bd84-79fc7c72cde2.json delete mode 100644 data/hfopenllm_v2/Columbia-NLP/LION-Gemma-2b-sft-v1.0/b26ba2b7-1365-4b1c-a1be-35d588e02d36.json delete mode 100644 data/hfopenllm_v2/Columbia-NLP/LION-LLaMA-3-8b-dpo-v1.0/64bd755d-ba4b-4559-ad8e-f56c697b1ae6.json delete mode 100644 data/hfopenllm_v2/Columbia-NLP/LION-LLaMA-3-8b-odpo-v1.0/c4e572cb-1d12-4baf-a4d8-a55422692207.json delete mode 100644 data/hfopenllm_v2/Columbia-NLP/LION-LLaMA-3-8b-sft-v1.0/c6123e10-b1f9-49dc-888b-083881e6ef09.json delete mode 100644 data/hfopenllm_v2/CombinHorizon/Josiefied-abliteratedV4-Qwen2.5-14B-Inst-BaseMerge-TIES/e1647f10-fec5-463d-b8e5-6b2b880bd687.json delete mode 100644 
data/hfopenllm_v2/CombinHorizon/Rombos-Qwen2.5-7B-Inst-BaseMerge-TIES/6d5fa235-8d69-456e-9f23-0f702760baf4.json delete mode 100644 data/hfopenllm_v2/CombinHorizon/YiSM-blossom5.1-34B-SLERP/e8709a6a-a2b8-4b09-9342-d1aeae89de1f.json delete mode 100644 data/hfopenllm_v2/CombinHorizon/huihui-ai-abliterated-Qwen2.5-32B-Inst-BaseMerge-TIES/603e95c9-7e7f-4892-93f7-92f92b256865.json delete mode 100644 data/hfopenllm_v2/CombinHorizon/huihui-ai-abliteratedV2-Qwen2.5-14B-Inst-BaseMerge-TIES/3e2fd38a-186e-49aa-915c-7eb3cde50562.json delete mode 100644 data/hfopenllm_v2/CombinHorizon/zetasepic-abliteratedV2-Qwen2.5-32B-Inst-BaseMerge-TIES/16d55e66-9015-4d72-81e4-3f14c42b0368.json delete mode 100644 data/hfopenllm_v2/ContactDoctor/Bio-Medical-3B-CoT-012025/696644b9-bd40-4047-bb85-0cb19510a96c.json delete mode 100644 data/hfopenllm_v2/ContactDoctor/Bio-Medical-Llama-3-8B/cbae8c39-0aec-4859-98bc-3b2d065833ad.json delete mode 100644 data/hfopenllm_v2/CoolSpring/Qwen2-0.5B-Abyme-merge2/15fb3cc7-1ba5-4ba5-ba02-8e8a9d2029d0.json delete mode 100644 data/hfopenllm_v2/CoolSpring/Qwen2-0.5B-Abyme-merge3/357f6051-b880-48bb-8e68-e4b0a7a0cbcc.json delete mode 100644 data/hfopenllm_v2/CoolSpring/Qwen2-0.5B-Abyme/a50a542b-668e-47b1-a37e-805a58eea3d1.json delete mode 100644 data/hfopenllm_v2/Corianas/Neural-Mistral-7B/00f7bd51-0b31-446d-be8c-1e0dc0d82e54.json delete mode 100644 data/hfopenllm_v2/Corianas/Quokka_2.7b/26782941-b918-44c5-a7f6-5f770e47c3d6.json delete mode 100644 data/hfopenllm_v2/Corianas/llama-3-reactor/5547ddaf-8fbb-4259-8b88-e946fc3d2404.json delete mode 100644 data/hfopenllm_v2/CortexLM/btlm-7b-base-v0.2/bee5ea59-b97a-4783-b763-b6bd432d4558.json delete mode 100644 data/hfopenllm_v2/Cran-May/SCE-2-24B/8150333f-8e79-4230-af8b-7ddb1d5eeb21.json delete mode 100644 data/hfopenllm_v2/Cran-May/SCE-3-24B/be8510a9-ecd4-4ac7-9930-3200cacb7b50.json delete mode 100644 data/hfopenllm_v2/Cran-May/T.E-8.1/887e4574-f876-4e75-afb8-e543bcb30020.json delete mode 100644 data/hfopenllm_v2/Cran-May/merge_model_20250308_2/fd21d8bd-28cf-4b91-8075-c38a61f5f32a.json delete mode 100644 data/hfopenllm_v2/Cran-May/merge_model_20250308_3/c0f05e38-6592-478a-9c46-26567f24ff85.json delete mode 100644 data/hfopenllm_v2/Cran-May/merge_model_20250308_4/06cc2913-8e05-44bf-a128-9a7c4aeff536.json delete mode 100644 data/hfopenllm_v2/Cran-May/tempmotacilla-cinerea-0308/86368d5b-0509-4b52-b988-58bcf7e1043e.json delete mode 100644 data/hfopenllm_v2/CreitinGameplays/Llama-3.1-8B-R1-v0.1/77b89fe6-464b-4017-a77f-8750e2668a82.json delete mode 100644 data/hfopenllm_v2/CultriX/Qwen2.5-14B-Broca/d2e47d86-23dd-4c95-a7fb-99518615d09f.json delete mode 100644 data/hfopenllm_v2/CultriX/Qwen2.5-14B-BrocaV9/0a09891e-ac97-4c3a-8364-7106a851f1a8.json delete mode 100644 data/hfopenllm_v2/CultriX/Qwen2.5-14B-Brocav3/eb41fe62-ac46-4630-bb2d-6b907f271737.json delete mode 100644 data/hfopenllm_v2/CultriX/Qwen2.5-14B-Brocav6/d540a6c8-e9ec-4413-b9d2-dee68533c377.json delete mode 100644 data/hfopenllm_v2/CultriX/Qwen2.5-14B-Brocav7/5b1f413a-05c4-43be-bdbc-9de5728e8d0a.json delete mode 100644 data/hfopenllm_v2/CultriX/Qwen2.5-14B-Emerged/6701738c-27e4-4bbd-b614-fbc297c3164f.json delete mode 100644 data/hfopenllm_v2/CultriX/Qwen2.5-14B-Emergedv3/7f4563b4-0b25-49e7-ac1c-afaa28b0eda2.json delete mode 100644 data/hfopenllm_v2/CultriX/Qwen2.5-14B-FinalMerge/32b6e4af-69ba-49b7-9367-dfafe3e390e8.json delete mode 100644 data/hfopenllm_v2/CultriX/Qwen2.5-14B-Hyper/e16deaf7-da55-40ba-ac18-860fa3f14d34.json delete mode 100644 
data/hfopenllm_v2/CultriX/Qwen2.5-14B-HyperMarck-dl/8a7a5886-0618-4615-9cdf-46f5d19a29fe.json delete mode 100644 data/hfopenllm_v2/CultriX/Qwen2.5-14B-Hyperionv3/66d18e5b-9ebc-4ab6-94fb-6d5c23c58672.json delete mode 100644 data/hfopenllm_v2/CultriX/Qwen2.5-14B-Hyperionv4/a36aaaf6-2478-4b98-ad0c-2b06ddb8c308.json delete mode 100644 data/hfopenllm_v2/CultriX/Qwen2.5-14B-Hyperionv5/4a6237a7-019c-4310-971e-84b08d1b5067.json delete mode 100644 data/hfopenllm_v2/CultriX/Qwen2.5-14B-MegaMerge-pt2/996e781e-5939-41ac-b347-95c99037c34a.json delete mode 100644 data/hfopenllm_v2/CultriX/Qwen2.5-14B-MergeStock/e880fa0e-ae49-4398-91bd-eadf8695425f.json delete mode 100644 data/hfopenllm_v2/CultriX/Qwen2.5-14B-ReasoningMerge/da04ff51-fbeb-41a8-ae5e-8ddf5925b792.json delete mode 100644 data/hfopenllm_v2/CultriX/Qwen2.5-14B-Ultimav2/6d709396-1ae1-4e5c-a03c-13c1e9425202.json delete mode 100644 data/hfopenllm_v2/CultriX/Qwen2.5-14B-Unity/5b616df9-e15a-4f84-98b4-c2cb532c1b95.json delete mode 100644 data/hfopenllm_v2/CultriX/Qwen2.5-14B-Wernicke-SFT/0f6552d9-3cbe-447e-909b-068e5ceed4c9.json delete mode 100644 data/hfopenllm_v2/CultriX/Qwen2.5-14B-Wernicke-SLERP/2861aae0-d2ec-48f5-bd20-9e7bcaf8dabd.json delete mode 100644 data/hfopenllm_v2/CultriX/Qwen2.5-14B-Wernicke/51a64f37-256c-4fe7-b28c-6117520f04ec.json delete mode 100644 data/hfopenllm_v2/CultriX/Qwen2.5-14B-Wernickev3/03ce9c1d-38e8-4a6c-b293-57428a9d7c0e.json delete mode 100644 data/hfopenllm_v2/CultriX/Qwen2.5-14B-partialmergept1/3b0f5dea-db9b-4657-9807-6b3e56d38823.json delete mode 100644 data/hfopenllm_v2/CultriX/Qwenfinity-2.5-14B/2d19e9ff-e331-4171-ae90-47e44f3f8885.json delete mode 100644 data/hfopenllm_v2/CultriX/Qwestion-14B/6bfb8b24-1abd-405b-b01d-7d7111705dbb.json delete mode 100644 data/hfopenllm_v2/CultriX/SeQwence-14B-EvolMerge/c83e6b6c-c8be-4d97-9c65-2d883f88f37f.json delete mode 100644 data/hfopenllm_v2/CultriX/SeQwence-14B-EvolMergev1/72569796-1b11-48cc-ada7-e8c09522dd54.json delete mode 100644 data/hfopenllm_v2/CultriX/SeQwence-14B-v5/58403e30-bd2b-4f4c-ad41-daa890c77d40.json delete mode 100644 data/hfopenllm_v2/CultriX/SeQwence-14B/eb8e1f1d-c6b3-407c-b172-d240553d2f89.json delete mode 100644 data/hfopenllm_v2/CultriX/SeQwence-14Bv1/356d75a0-6520-46c1-afa9-7dbb2596a5c1.json delete mode 100644 data/hfopenllm_v2/CultriX/SeQwence-14Bv2/78681e0c-5fe2-4920-af7b-99345cea3efe.json delete mode 100644 data/hfopenllm_v2/CultriX/SeQwence-14Bv3/ba0ee5b4-070a-461d-a3d2-cd4036387cc9.json delete mode 100644 data/hfopenllm_v2/DRXD1000/Atlas-7B/17d0d377-bca4-411c-be11-6c5cfce07798.json delete mode 100644 data/hfopenllm_v2/DRXD1000/Phoenix-7B/d01a56a1-1eb9-4ccf-8c09-348b6ba5480b.json delete mode 100644 data/hfopenllm_v2/DUAL-GPO/zephyr-7b-ipo-0k-15k-i1/389821ff-d8e2-4d1d-8fb2-57a689867ac5.json delete mode 100644 data/hfopenllm_v2/DZgas/GIGABATEMAN-7B/7913f782-29b0-48bd-bc62-37da9a5ac7d9.json delete mode 100644 data/hfopenllm_v2/Daemontatox/AetherDrake-SFT/b0930974-999e-4372-9d21-b9790e0bad4c.json delete mode 100644 data/hfopenllm_v2/Daemontatox/AetherSett/8265f577-f504-4a56-9cf0-42c34766559a.json delete mode 100644 data/hfopenllm_v2/Daemontatox/AetherTOT/82044cd2-1a46-406e-bc68-397ce41b29ea.json delete mode 100644 data/hfopenllm_v2/Daemontatox/AetherTOT/de09e323-8cf1-4aa9-9537-e8ad30a8c297.json delete mode 100644 data/hfopenllm_v2/Daemontatox/AetherUncensored/bfe543b4-ec38-488e-ae04-125cd358b61f.json delete mode 100644 data/hfopenllm_v2/Daemontatox/Cogito-MIS/be36d8ae-b81c-4b4e-aa2f-5999c7582237.json delete mode 100644 
data/hfopenllm_v2/Daemontatox/CogitoDistil/342b435f-89e9-48ad-ab0f-2c1f52f4571a.json
 delete mode 100644 data/hfopenllm_v2/Daemontatox/CogitoZ/b0c8737d-d838-4da1-909b-b218e22119dc.json
 delete mode 100644 data/hfopenllm_v2/Daemontatox/CogitoZ14/4cd40f28-842f-44d5-9eb2-86238077fc55.json
 delete mode 100644 data/hfopenllm_v2/Daemontatox/DocumentCogito/0758051c-2d75-402e-af0e-769096cbb17c.json
 delete mode 100644 data/hfopenllm_v2/Daemontatox/DocumentCogito/c93f610b-fb97-4ad1-b8af-fc41c6d8da33.json
 delete mode 100644 data/hfopenllm_v2/Daemontatox/Llama3.3-70B-CogniLink/b8467118-d895-41fa-81c7-89892e1844d5.json
 delete mode 100644 data/hfopenllm_v2/Daemontatox/Llama_cot/30d867bb-63c6-48d1-8d43-6c24f4cf44ba.json
 delete mode 100644 data/hfopenllm_v2/Daemontatox/MawaredT1/89b92cda-c5b6-45ed-a534-361c9d34794a.json
 delete mode 100644 data/hfopenllm_v2/Daemontatox/Mini_QwQ/48cdf76a-886d-41ec-8580-00ed4232b601.json
 delete mode 100644 data/hfopenllm_v2/Daemontatox/NemoR/116272d4-d25d-49cb-80cb-ff26a0fb3cf4.json
 delete mode 100644 data/hfopenllm_v2/Daemontatox/PathFinderAI2.0/bb103828-70fe-4767-9302-6750d839129e.json
 delete mode 100644 data/hfopenllm_v2/Daemontatox/PathFinderAi3.0/7b58ab54-239b-4e49-93f1-c3940df61474.json
 delete mode 100644 data/hfopenllm_v2/Daemontatox/PathfinderAI/559067a2-816c-4091-893e-b1c7860171ec.json
 delete mode 100644 data/hfopenllm_v2/Daemontatox/PathfinderAI/ec502619-880b-4b7c-acfe-c43cf6514e3f.json
 delete mode 100644 data/hfopenllm_v2/Daemontatox/Phi-4-COT/6941a5dd-2a70-4846-a5f6-b16ef2d56a03.json
 delete mode 100644 data/hfopenllm_v2/Daemontatox/PixelParse_AI/636e2f93-3242-491c-9df5-003aa1dacecf.json
 delete mode 100644 data/hfopenllm_v2/Daemontatox/RA2.0/1f4efa23-816d-49be-8659-feb003f4b3ef.json
 delete mode 100644 data/hfopenllm_v2/Daemontatox/RA_Reasoner/d05be1e4-bcac-4b4a-bbde-8b17a5a71243.json
 delete mode 100644 data/hfopenllm_v2/Daemontatox/RA_Reasoner2.0/9ab53055-86f5-4a88-976f-015dd9c9e832.json
 delete mode 100644 data/hfopenllm_v2/Daemontatox/ReasonTest/ba34083a-9b13-46d9-8f36-aa3ddd586711.json
 delete mode 100644 data/hfopenllm_v2/Daemontatox/Research_PathfinderAI/6a39d734-ad73-4c4a-9583-3563e336d4b3.json
 delete mode 100644 data/hfopenllm_v2/Daemontatox/SphinX/2af71e88-4931-4359-b92a-c64fa33df802.json
 delete mode 100644 data/hfopenllm_v2/Daemontatox/Sphinx2.0/bf9336a7-a7c4-420a-9dd0-68d8e0c815c4.json
 delete mode 100644 data/hfopenllm_v2/Daemontatox/TinySphinx/2de872b2-10c7-44dd-91c3-f20205207da6.json
 delete mode 100644 data/hfopenllm_v2/Daemontatox/TinySphinx2.0/5cabed09-d8ea-46c2-bb78-012dac954d6b.json
 delete mode 100644 data/hfopenllm_v2/Daemontatox/Zirel-7B-Math/8236db6a-ff8a-4237-af5a-03bb258f8e59.json
 delete mode 100644 data/hfopenllm_v2/Daemontatox/Zirel_1.5/1a7b078e-bc1f-400f-a0cd-f7b535548f23.json
 delete mode 100644 data/hfopenllm_v2/Daemontatox/mini-Cogito-R1/fdaf561c-567c-416d-a74a-ac3c07c5be5b.json
 delete mode 100644 data/hfopenllm_v2/Daemontatox/mini_Pathfinder/58900b3b-303b-49c8-b807-7b8d06601568.json
 delete mode 100644 data/hfopenllm_v2/Dampfinchen/Llama-3.1-8B-Ultra-Instruct/7ac5a45a-7b41-4f63-8556-8737638a00ea.json
 delete mode 100644 data/hfopenllm_v2/Danielbrdz/Barcenas-10b/3cb55475-30c8-43c8-8d7d-394450fdc117.json
 delete mode 100644 data/hfopenllm_v2/Danielbrdz/Barcenas-14b-Phi-3-medium-ORPO/f5e140ff-0c0e-4769-8116-63cf50255773.json
 delete mode 100644 data/hfopenllm_v2/Danielbrdz/Barcenas-14b-phi-4-v2/df85ec6e-1325-40ce-8087-d960a1d767dd.json
 delete mode 100644 data/hfopenllm_v2/Danielbrdz/Barcenas-14b-phi-4/a7bd3fff-f01e-46ca-af85-5b4ac6ae7320.json
 delete mode 100644 data/hfopenllm_v2/Danielbrdz/Barcenas-3b-GRPO/11842dd9-0572-41ef-aaa0-8d19f3420efc.json
 delete mode 100644 data/hfopenllm_v2/Danielbrdz/Barcenas-Llama3-8b-ORPO/01abccec-1cea-4060-89be-289987d0a2ce.json
 delete mode 100644 data/hfopenllm_v2/Danielbrdz/Barcenas-R1-Qwen-1.5b/dce8226c-57bd-4255-b813-8a70494f0a1a.json
 delete mode 100644 data/hfopenllm_v2/Dans-DiscountModels/12b-mn-dans-reasoning-test-2/7f80e69c-eec6-49ac-a088-6248ee25f736.json
 delete mode 100644 data/hfopenllm_v2/Dans-DiscountModels/12b-mn-dans-reasoning-test-3/e0267a2c-dfc5-456e-864d-b5b0ad1fa508.json
 delete mode 100644 data/hfopenllm_v2/Dans-DiscountModels/Dans-Instruct-CoreCurriculum-12b-ChatML/e6ad37be-28f4-43b4-9df1-b7b47d31232e.json
 delete mode 100644 data/hfopenllm_v2/Dans-DiscountModels/Dans-Instruct-Mix-8b-ChatML-V0.1.0/5514368a-1f7d-4cd0-b7f7-d116b753f975.json
 delete mode 100644 data/hfopenllm_v2/Dans-DiscountModels/Dans-Instruct-Mix-8b-ChatML-V0.1.1/c0e29cf8-897f-4e07-abb4-71c801d34301.json
 delete mode 100644 data/hfopenllm_v2/Dans-DiscountModels/Dans-Instruct-Mix-8b-ChatML-V0.2.0/68310379-65b2-482d-892b-f76547bce2b0.json
 delete mode 100644 data/hfopenllm_v2/Dans-DiscountModels/Dans-Instruct-Mix-8b-ChatML/a034c4ec-d4cd-439b-8dbd-e67685ea7616.json
 delete mode 100644 data/hfopenllm_v2/Dans-DiscountModels/Mistral-7b-v0.3-Test-E0.7/e4b761d3-bb84-4433-b9fb-4c92ecae6279.json
 delete mode 100644 data/hfopenllm_v2/Dans-DiscountModels/mistral-7b-test-merged/38d78d30-be6d-476c-a3aa-d9a40f570a56.json
 delete mode 100644 data/hfopenllm_v2/Darkknight535/OpenCrystal-12B-L3/36e60f6c-60f7-4b17-88fe-82810e195fc7.json
 delete mode 100644 data/hfopenllm_v2/DavidAU/DeepHermes-3-Llama-3-8B-Preview-16.5B-Brainstorm/a6c647e8-ed24-4150-8563-dd9b20e21498.json
 delete mode 100644 data/hfopenllm_v2/DavidAU/DeepSeek-BlackRoot-R1-Distill-Llama-3.1-8B/b5a366ac-d736-4447-a2f1-98d0b84ba3bd.json
 delete mode 100644 data/hfopenllm_v2/DavidAU/DeepSeek-Grand-Horror-SMB-R1-Distill-Llama-3.1-16B/5d098dc6-8124-4d26-86ec-d54e6e09c3a6.json
 delete mode 100644 data/hfopenllm_v2/DavidAU/DeepSeek-MOE-4X8B-R1-Distill-Llama-3.1-Deep-Thinker-Uncensored-24B/1137cbc4-d80b-4e21-bfeb-feab41dc80b2.json
 delete mode 100644 data/hfopenllm_v2/DavidAU/DeepSeek-MOE-4X8B-R1-Distill-Llama-3.1-Mad-Scientist-24B/097bbfbc-0ccd-4fd4-9e0c-9c192cba9e8b.json
 delete mode 100644 data/hfopenllm_v2/DavidAU/DeepSeek-R1-Distill-Qwen-25.5B-Brainstorm/db8c6169-bfc1-48bb-be53-fa93c673f051.json
 delete mode 100644 data/hfopenllm_v2/DavidAU/DeepSeek-V2-Grand-Horror-SMB-R1-Distill-Llama-3.1-Uncensored-16.5B/41437fc9-6d48-4317-a8de-ab4e63b2cf46.json
 delete mode 100644 data/hfopenllm_v2/DavidAU/DeepThought-MOE-8X3B-R1-Llama-3.2-Reasoning-18B/e075f4fe-95e0-48f4-94c4-f6ebd3f4edaa.json
 delete mode 100644 data/hfopenllm_v2/DavidAU/Gemma-The-Writer-9B/3349d66c-e12b-49c1-a406-e0e77b697458.json
 delete mode 100644 data/hfopenllm_v2/DavidAU/Gemma-The-Writer-DEADLINE-10B/7aa0ff6b-11a9-4554-a27f-e477a0ff77c7.json
 delete mode 100644 data/hfopenllm_v2/DavidAU/Gemma-The-Writer-J.GutenBerg-10B/ac749485-df6d-485e-8fa7-63bdfd744167.json
 delete mode 100644 data/hfopenllm_v2/DavidAU/Gemma-The-Writer-Mighty-Sword-9B/54363a4b-312b-4035-a1c3-b5321311cec4.json
 delete mode 100644 data/hfopenllm_v2/DavidAU/Gemma-The-Writer-N-Restless-Quill-10B-Uncensored/aa9e2b9e-cd25-4492-9801-eba7d40b4365.json
 delete mode 100644 data/hfopenllm_v2/DavidAU/L3-DARKEST-PLANET-16.5B/c6b484b8-f6f3-4516-aff5-c2f6438c9047.json
 delete mode 100644 data/hfopenllm_v2/DavidAU/L3-Dark-Planet-8B/c6c760c9-a345-4e25-b333-b403bf6db389.json
 delete mode 100644 data/hfopenllm_v2/DavidAU/L3-Jamet-12.2B-MK.V-Blackroot-Instruct/65b2aa58-2c04-48f2-9ea3-c8fd97cb9dde.json
 delete mode 100644 data/hfopenllm_v2/DavidAU/L3-Lumimaid-12.2B-v0.1-OAS-Instruct/92903344-0dde-4f5a-a7d2-749a1ffe9cd3.json
 delete mode 100644 data/hfopenllm_v2/DavidAU/L3-SMB-Instruct-12.2B-F32/59ddd478-c1cd-4bd8-80c3-fdebe762414a.json
 delete mode 100644 data/hfopenllm_v2/DavidAU/L3-Stheno-Maid-Blackroot-Grand-HORROR-16B/02f63fc6-9376-4fb5-b067-63493238cc27.json
 delete mode 100644 data/hfopenllm_v2/DavidAU/L3-Stheno-v3.2-12.2B-Instruct/dd7597fd-27f5-4e77-a44f-b01d0db82719.json
 delete mode 100644 data/hfopenllm_v2/DavidAU/L3.1-Dark-Planet-SpinFire-Uncensored-8B/20cd0d60-eb0d-41bd-b37f-910a03dd7f82.json
 delete mode 100644 data/hfopenllm_v2/DavidAU/L3.1-MOE-2X8B-Deepseek-DeepHermes-e32-uncensored-abliterated-13.7B/c4e9d045-3769-4828-a2ca-7fa508873089.json
 delete mode 100644 data/hfopenllm_v2/DavidAU/Qwen2.5-MOE-2X1.5B-DeepSeek-Uncensored-Censored-4B/0a0501ec-4ecd-47c1-914b-d473f795cef2.json
 delete mode 100644 data/hfopenllm_v2/DavidAU/Qwen2.5-MOE-2X7B-DeepSeek-Abliterated-Censored-19B/beca755f-203f-4bc8-b5cf-f9a9e3f8bd8f.json
 delete mode 100644 data/hfopenllm_v2/DavidAU/Qwen2.5-MOE-6x1.5B-DeepSeek-Reasoning-e32/79e1e1c6-cbe0-43a9-a593-8e2119baaf77.json
 delete mode 100644 data/hfopenllm_v2/Davidsv/SUONG-1/def80b44-3d9a-46ba-bf5f-ffc81e50af2e.json
 delete mode 100644 data/hfopenllm_v2/DavieLion/Llama-3.2-1B-SPIN-iter0/5e1aa809-ef20-445e-a05b-eccd585d5991.json
 delete mode 100644 data/hfopenllm_v2/DavieLion/Llama-3.2-1B-SPIN-iter0/7c2be651-ca56-4285-afc7-1bfe1c8ce11e.json
 delete mode 100644 data/hfopenllm_v2/DavieLion/Llama-3.2-1B-SPIN-iter1/cfe4ea72-ddb9-49b5-9599-99f215e112e5.json
 delete mode 100644 data/hfopenllm_v2/DavieLion/Llama-3.2-1B-SPIN-iter2/81d63d8e-88dd-4b16-b9b8-d07604878f8f.json
 delete mode 100644 data/hfopenllm_v2/DavieLion/Llama-3.2-1B-SPIN-iter3/81f8208b-f7e7-4685-bb84-321d9e097470.json
 delete mode 100644 data/hfopenllm_v2/DavieLion/Llama-3.2-1B-SPIN-iter3/a0c9a434-9b8c-47c5-b511-9daac7901686.json
 delete mode 100644 data/hfopenllm_v2/DavieLion/Lllma-3.2-1B/28b60eae-1b38-4404-8db1-3fb2997583f4.json
 delete mode 100644 data/hfopenllm_v2/DebateLabKIT/Llama-3.1-Argunaut-1-8B-SFT/746862a2-a90c-4612-91d0-f989b9eed1a5.json
 delete mode 100644 data/hfopenllm_v2/Deci/DeciLM-7B-instruct/715ee057-9c9a-4e04-991c-7040b1eef65b.json
 delete mode 100644 data/hfopenllm_v2/Deci/DeciLM-7B/4dc1d103-3458-4b8c-9e63-b98effd69667.json
 delete mode 100644 data/hfopenllm_v2/DeepAutoAI/Explore_Llama-3.1-8B-Inst/070ff2a5-9a5d-48cf-8517-1ad9b6642d59.json
 delete mode 100644 data/hfopenllm_v2/DeepAutoAI/Explore_Llama-3.2-1B-Inst/8406a5b8-a87d-489b-b75b-00e9f675f09f.json
 delete mode 100644 data/hfopenllm_v2/DeepAutoAI/Explore_Llama-3.2-1B-Inst_v0/11e8f9b6-32ab-4b83-a601-e5644c0b2c39.json
 delete mode 100644 data/hfopenllm_v2/DeepAutoAI/Explore_Llama-3.2-1B-Inst_v1.1/6b542f5a-ea62-45ce-8e98-436a4d058877.json
 delete mode 100644 data/hfopenllm_v2/DeepAutoAI/Explore_Llama-3.2-1B-Inst_v1/9b280640-bfee-4730-acc3-386a54b2434c.json
 delete mode 100644 data/hfopenllm_v2/DeepAutoAI/causal_gpt2/eff5171b-6119-4013-8aa8-8a4f0215b045.json
 delete mode 100644 data/hfopenllm_v2/DeepAutoAI/d2nwg_Llama-3.1-8B-Instruct-v0.0/471c5fed-f155-4521-9d9c-b5370ca91bec.json
 delete mode 100644 data/hfopenllm_v2/DeepAutoAI/d2nwg_causal_gpt2/690be099-3ace-484f-b01f-2fe6b324d12a.json
 delete mode 100644 data/hfopenllm_v2/DeepAutoAI/d2nwg_causal_gpt2_v1/71fbd15f-5eec-40d9-84e8-07323f3ffac6.json
 delete mode 100644 data/hfopenllm_v2/DeepAutoAI/ldm_soup_Llama-3.1-8B-Inst/eb93dd3e-3d13-4234-bb66-f6177648aa2b.json
 delete mode 100644 data/hfopenllm_v2/DeepAutoAI/ldm_soup_Llama-3.1-8B-Instruct-v0.0/f7ec1ed7-cc30-4879-8ab1-4909011553d5.json
 delete mode 100644 data/hfopenllm_v2/DeepAutoAI/ldm_soup_Llama-3.1-8B-Instruct-v0.1/3e100704-dbd3-4d05-b325-5bb4bc90e51c.json
 delete mode 100644 data/hfopenllm_v2/DeepMount00/Lexora-Lite-3B/12f003ef-1098-4d3f-aed7-7343034157bc.json
 delete mode 100644 data/hfopenllm_v2/DeepMount00/Lexora-Lite-3B_v2/9de2e564-3a30-4f1c-80da-6432a245a64f.json
 delete mode 100644 data/hfopenllm_v2/DeepMount00/Lexora-Medium-7B/dd5aaa3f-b24b-4a5b-852b-b80f4a6bf366.json
 delete mode 100644 data/hfopenllm_v2/DeepMount00/Llama-3-8b-Ita/8d8b9fd2-43f6-4edc-8340-44d20824a7e7.json
 delete mode 100644 data/hfopenllm_v2/DeepMount00/Llama-3.1-8b-ITA/7fe45c20-a2c0-4acf-9425-651a1ec3b0d0.json
 delete mode 100644 data/hfopenllm_v2/DeepMount00/Llama-3.1-8b-ITA/baf93ef6-56f3-4809-93f6-32dcf4730388.json
 delete mode 100644 data/hfopenllm_v2/DeepMount00/Llama-3.1-Distilled/f6df14bd-207c-4fea-b789-c9f9aef749b3.json
 delete mode 100644 data/hfopenllm_v2/DeepMount00/Qwen2-1.5B-Ita/97766a7f-cf5b-46ae-b51e-5c5702ae000b.json
 delete mode 100644 data/hfopenllm_v2/DeepMount00/Qwen2-1.5B-Ita_v2/d5cd2a1b-3def-4b33-a8fe-4b02e090db27.json
 delete mode 100644 data/hfopenllm_v2/DeepMount00/Qwen2-1.5B-Ita_v3/275d4bf0-566c-4b50-86b9-38c7f45df143.json
 delete mode 100644 data/hfopenllm_v2/DeepMount00/Qwen2-1.5B-Ita_v5/aa504db9-81f3-424f-b7d9-683ebe31f5d8.json
 delete mode 100644 data/hfopenllm_v2/DeepMount00/Qwen2-1.5B-Ita_v6/2cc209b7-ef10-435d-a840-b904ab741491.json
 delete mode 100644 data/hfopenllm_v2/DeepMount00/Qwen2.5-7B-Instruct-MathCoder/9b9390ac-fd65-4a58-9834-5352aa340cdc.json
 delete mode 100644 data/hfopenllm_v2/DeepMount00/mergekit-ties-okvgjfz/4efe5cd4-6b8a-4951-a63a-4c7dc390bbec.json
 delete mode 100644 data/hfopenllm_v2/Delta-Vector/Baldur-8B/4bc5a0db-1c88-4c61-9343-1d340305ecc5.json
 delete mode 100644 data/hfopenllm_v2/Delta-Vector/Control-8B-V1.1/74527f51-dcec-4b82-8ba8-075c933404f5.json
 delete mode 100644 data/hfopenllm_v2/Delta-Vector/Control-8B/ac31bc90-3854-4d38-925d-ef8dc7e75d24.json
 delete mode 100644 data/hfopenllm_v2/Delta-Vector/Darkens-8B/88583cff-1adc-4b1b-8e68-07f0074d0ae2.json
 delete mode 100644 data/hfopenllm_v2/Delta-Vector/Henbane-7b-attempt2/fadbac9e-7224-41d1-abfa-7039cbcba9f6.json
 delete mode 100644 data/hfopenllm_v2/Delta-Vector/Odin-9B/1fb90540-0fa0-44ca-ad67-1e3503f6b729.json
 delete mode 100644 data/hfopenllm_v2/Delta-Vector/Tor-8B/047784e2-c1ee-40d9-a60d-e43504825801.json
 delete mode 100644 data/hfopenllm_v2/DevQuasar/DevQuasar-R1-Uncensored-Llama-8B/ee60453d-2d51-46f7-8a18-c651d590f0e7.json
 delete mode 100644 data/hfopenllm_v2/Dongwei/DeepSeek-R1-Distill-Qwen-7B-GRPO/b0ac4b11-f7b4-4753-baae-310a92f08259.json
 delete mode 100644 data/hfopenllm_v2/DoppelReflEx/L3-8B-R1-WolfCore-V1.5-test/324db8b3-38c7-4a2c-82e8-7bebfa38e760.json
 delete mode 100644 data/hfopenllm_v2/DoppelReflEx/L3-8B-R1-WolfCore/54dd9033-61b9-4f26-9cde-e04c7136524b.json
 delete mode 100644 data/hfopenllm_v2/DoppelReflEx/L3-8B-WolfCore/d0973d6c-373c-41cd-9e62-52470c044dac.json
 delete mode 100644 data/hfopenllm_v2/DoppelReflEx/MN-12B-FoxFrame-test/da15da67-b316-4c2e-86a5-c1f88eece9cb.json
 delete mode 100644 data/hfopenllm_v2/DoppelReflEx/MN-12B-FoxFrame2-test/b0c34174-bfd0-4556-a3bf-92ec0ddf5ec4.json
 delete mode 100644 data/hfopenllm_v2/DoppelReflEx/MN-12B-FoxFrame3-test/bce7b15d-1670-46db-bdff-24fb38bc3fd9.json
 delete mode 100644 data/hfopenllm_v2/DoppelReflEx/MN-12B-Kakigori/15e5e02f-27b9-4063-b601-42c2b17180f9.json
 delete mode 100644 data/hfopenllm_v2/DoppelReflEx/MN-12B-LilithFrame-Experiment-2/51b0c546-0dde-4668-a8b8-3b9753a31aa0.json
 delete mode 100644 data/hfopenllm_v2/DoppelReflEx/MN-12B-LilithFrame-Experiment-3/45842b1c-cf68-44a7-928f-2da454cdd13f.json
 delete mode 100644 data/hfopenllm_v2/DoppelReflEx/MN-12B-LilithFrame-Experiment-4/c15cdefd-dbe3-432e-aab0-3c43540cd320.json
 delete mode 100644 data/hfopenllm_v2/DoppelReflEx/MN-12B-LilithFrame/1f489afa-a01d-40f3-836a-9e386c502d1d.json
 delete mode 100644 data/hfopenllm_v2/DoppelReflEx/MN-12B-LilithFrame/94bcc87e-eb06-4321-9b72-2f99168cf92a.json
 delete mode 100644 data/hfopenllm_v2/DoppelReflEx/MN-12B-Mimicore-GreenSnake/c0bc9811-4d7c-412f-a12b-3e6eab2e5a6f.json
 delete mode 100644 data/hfopenllm_v2/DoppelReflEx/MN-12B-Mimicore-Nocturne/b5a8b278-69e9-41ba-89ee-8fd6b2d90a1c.json
 delete mode 100644 data/hfopenllm_v2/DoppelReflEx/MN-12B-Mimicore-Orochi-v2-Experiment/a3ad7f0f-64bd-42a1-bc7d-d7d4cbbd80fd.json
 delete mode 100644 data/hfopenllm_v2/DoppelReflEx/MN-12B-Mimicore-Orochi-v3-Experiment/f07c3a4a-2a8e-45c4-a726-be95726df2db.json
 delete mode 100644 data/hfopenllm_v2/DoppelReflEx/MN-12B-Mimicore-Orochi-v4-Experiment/f36d56b8-cd77-4d69-a51d-39025bcfcdfd.json
 delete mode 100644 data/hfopenllm_v2/DoppelReflEx/MN-12B-Mimicore-Orochi/65acabdc-ea5f-426c-820b-2b79f2b20b44.json
 delete mode 100644 data/hfopenllm_v2/DoppelReflEx/MN-12B-Mimicore-WhiteSnake-v2-Experiment-1/96b00cfa-1383-4b36-a043-17eb39678ffc.json
 delete mode 100644 data/hfopenllm_v2/DoppelReflEx/MN-12B-Mimicore-WhiteSnake-v2-Experiment-2/3b8a796e-6bde-4506-8335-bd3cc72482e1.json
 delete mode 100644 data/hfopenllm_v2/DoppelReflEx/MN-12B-Mimicore-WhiteSnake-v2-Experiment-3/a93e99e2-ca13-4cdc-9904-7ae5cc82c623.json
 delete mode 100644 data/hfopenllm_v2/DoppelReflEx/MN-12B-Mimicore-WhiteSnake-v2-Experiment-4/65d9e237-2757-459e-94e7-e382213e4eeb.json
 delete mode 100644 data/hfopenllm_v2/DoppelReflEx/MN-12B-Mimicore-WhiteSnake/c3f44524-4c75-4cd0-9f5d-79c8b08f6f77.json
 delete mode 100644 data/hfopenllm_v2/DoppelReflEx/MN-12B-Unleashed-Twilight/2e7d3674-d0b0-4b87-8bd8-8202114b7665.json
 delete mode 100644 data/hfopenllm_v2/DoppelReflEx/MN-12B-WolFrame/30d21295-beb1-4179-8c6f-7bac79b29474.json
 delete mode 100644 data/hfopenllm_v2/DoppelReflEx/MiniusLight-24B-test/e2fc95de-b9d9-4043-b55c-aa2819d4f52f.json
 delete mode 100644 data/hfopenllm_v2/DoppelReflEx/MiniusLight-24B-v1b-test/7fbd7f97-baf9-4acd-ba0c-90ffbf0c47a5.json
 delete mode 100644 data/hfopenllm_v2/DoppelReflEx/MiniusLight-24B-v1c-test/336effcd-d8fc-4477-846f-70fc40bdc111.json
 delete mode 100644 data/hfopenllm_v2/DoppelReflEx/MiniusLight-24B-v1d-test/28f87820-d587-498e-b713-7c0af0cdc324.json
 delete mode 100644 data/hfopenllm_v2/DoppelReflEx/MiniusLight-24B/f1b671ab-ebb3-43ec-86fa-832982d04cc1.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/Again-8B-Model_Stock/327cde83-d107-4455-bc03-7e03026c52e6.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/Alita99-8B-LINEAR/7497b8fb-9a7d-46dc-868e-1a2bbcdc7860.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/AnotherTest/92c8afbe-7735-40c8-af0e-29da687c2070.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/Aspire-8B-model_stock/bca052ac-6556-49d8-94e3-f4bda560a5d3.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/Aspire_1.3-8B_model-stock/5f74fe6e-8575-4cea-959b-e6ba03c7e273.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/Aspire_V2-8B-Model_Stock/b0f696f5-ed70-4293-999d-a9121192c137.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/Aspire_V2.1-8B-Model_Stock/18751a6f-062c-4915-bbe0-ae222cf9ae0b.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/Aspire_V2_ALT-8B-Model_Stock/398ebe04-638f-4a11-b99d-6778ff3ff97b.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/Aspire_V2_ALT_ROW-8B-Model_Stock/b4f197f2-3456-4221-b222-10dfbbb50f56.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/Aspire_V3-8B-Model_Stock/0a2fa86a-f9b3-4a49-b215-4cd3ee9b4c22.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/Aspire_V4-8B-Model_Stock/1561ec50-1cb9-47ce-9db1-09efe9c3fc61.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/Aspire_V4_ALT-8B-Model_Stock/496525ff-394a-4b7b-9d93-f5b38d2a1ee3.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/Asymmetric_Linearity-8B-Model_Stock/37071760-d24c-43cc-9965-d8c7873c0ee8.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/Aurora_faustus-8B-LINEAR/91a71a49-5dd4-43b1-9e1c-fd9492236712.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/Aurora_faustus-8B-LORABLATED/d1d48abb-6dcf-4905-958f-c3a3e75feac6.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/Aurora_faustus-8B-LORABLATED_ALT/68282f29-f56f-420b-bd1e-9cc54783c1a5.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/Autumn_Dawn-8B-LINEAR/cd1c84dc-6c6e-4789-add7-0e3ca783b0ea.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/BaeZel-8B-LINEAR/22a9d3b8-ac45-4433-8926-5d28681af922.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/BaeZel-8B-Model_Stock/57c4b9eb-dffd-4623-a2d5-b2374d3c9109.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/BaeZel_V2-8B-Model_Stock/24adbd8c-df3a-4b58-94e6-61a3dfa6828e.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/BaeZel_V2_ALT-8B-Model_Stock/6ed62f64-c2be-4bca-b17d-bd0184a3d498.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/BaeZel_V3-8B-Model_Stock/db9e4d03-03a8-4a10-8739-16bbcfbb06d4.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/Blunt_Edge-8B-SLERP/7b0fc4fe-51c8-4f01-b07b-5bca05b40859.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/BulkUp/6f286418-d8e3-4c11-8941-cfe5a18b1037.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/Cadence-8B-LINEAR/b0a83b1f-3af2-45e8-9d88-d7302a529112.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/Caelid-8B-Model_Stock/0462fce1-51b4-48d8-8278-a90048ffd637.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/Casuar-9B-Model_Stock/e02f597c-c368-4223-ac90-c99d82c90634.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/Condensed_Milk-8B-Model_Stock/32e63ffc-c64e-4562-ba99-14873f5bac2e.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/CoolerCoder-8B-LINEAR/6af4faad-05c2-488b-9685-e11ae4e1cbf0.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/Damasteel-8B-LINEAR/8aa7701b-7019-44a0-851f-cfc9108fdfbd.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/Dearly_Beloved-8B-TIES/a2f95fad-5ab5-47d0-b9aa-33358c673caf.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/Decayed-8B-LINEAR/aef73a77-9df7-4d4f-89ef-50905d326198.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/Derivative-8B-Model_Stock/e9ffdfb6-6f91-4bac-89d2-40b1eb43f3ee.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/Derivative_V2-8B-Model_Stock/8ff39438-907c-465f-ac7a-5a25cfd8d824.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/Derivative_V2_ALT-8B-Model_Stock/83d831c5-a74f-4699-9961-664a7a51b7b8.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/Derivative_V3-8B-Model_Stock/83fb88ec-f640-4c1e-b71c-53a123fc4c2e.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/Elusive_Dragon_Heart-8B-LINEAR/3811cc34-45cb-4932-b862-39bf042331e0.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/Emu_Eggs-9B-Model_Stock/5b2a16a1-7a2a-40b7-add6-b99378b6af00.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/Eunoia_Vespera-8B-LINEAR/1dc2a5bb-40b6-401e-8f1c-6110cb4c0f0d.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/Fu_sion_HA-8B-SLERP/742e0a1c-7496-4076-bdbf-ada0a8e528c2.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/HOT_STINKING_GARBAGE/f0664035-3256-444c-b848-ef603e0d46b5.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/H_the_eighth-8B-LINEAR/9159aaa6-8663-491f-901a-74da4c343d20.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/Happy_New_Year-8B-Model_Stock/5179b145-9fdb-4ab5-8cca-87966ecf6519.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/Heart_Stolen-8B-Model_Stock/da872193-1d25-4e8e-bc22-9138a9d121ba.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/Heart_Stolen-ALT-8B-Model_Stock/967fdd26-1f8a-40d6-8f7d-ca731c7ef2e3.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/Here_We_Go_Again-8B-SLERP/dd615b4c-189e-4361-bcf4-879fd59b28a2.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/Howdy-8B-LINEAR/0aeee3e8-00ce-4f95-bbd9-307d93a194a4.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/Incidental-8B-Model_Stock/8c583b51-4349-48af-98d9-8eaaf43d60b6.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/Irina-8B-model_stock/34aab556-5e97-4ea2-9ada-d17dc3624be2.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/Kindling-8B-Model_Stock/fbd9d5e3-15f7-45ce-92fb-368b3bfcc526.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/L3.1-BaeZel-8B-Della/b177e329-ce6b-4bc6-aeac-1c01306e6b1f.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/Laughing_Stock-8B-Model_Stock/7f371c11-e8f0-4233-b359-aac39c0a1110.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/Lava_Lamp-8B-SLERP/9f758d4e-d121-4688-8ece-8dc67a499811.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/LemonP-8B-Model_Stock/903b8c71-d54d-4ce4-9845-71eb8ca8733a.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/Lydia_of_Whiterun-8B-LINEAR/9bdc17bf-7b81-49c8-81f5-c6dfa31b449b.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/Matryoshka-8B-LINEAR/28109e00-87c1-4809-a4fc-dddebba52621.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/Mercury_In_Retrograde-8b-Model-Stock/6a21381b-426d-4a5d-ad6d-2aeb57ed14c5.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/Minthy-8B-Model_Stock/03a8091c-473e-4fbe-af70-35f791a23a0f.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/Minthy_ALT-8B-Model_Stock/ed75e9ed-841b-4783-a201-bc72651afd0a.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/Minthy_V2-8B-Model_Stock/38cd418c-9770-49d2-8b30-ac47e445cee3.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/Minus_Penus-8B-Model_Stock/d49b6a48-ae81-467d-87c5-b17f9ca306f8.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/Morphing-8B-Model_Stock/39b7e250-9f71-4833-941e-85692a48b6e6.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/Not_Even_My_Final_Form-8B-Model_Stock/c0d102a2-ff8c-45ac-a825-31472b98b871.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/Nother_One-8B-Model_Stock/7c5674a8-6a1c-483e-be9c-b0a6d00d3ac4.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/Noxis-8B-LINEAR/d34b899e-b067-4c9c-9fa2-439f8b2d589d.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/Nullsworn-12B-LINEAR/8c7b2332-510b-42d3-bcbb-e177c35d27d5.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/Nwah-8B-Model_Stock/685f107f-e431-4dba-a117-8d6f1dd2c296.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/ONeil-model_stock-8B/e1570804-85b6-4518-a099-5f21ab27d12c.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/Oh_Boy-8B-LINEAR/a779ebec-76ab-4a1e-aa4f-d1a6adfe2d5c.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/OrangeJ-8B-Model_Stock/1ed7f6ed-d04d-4cfc-a36a-1ef0f72d4814.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/Promissum_Mane-8B-LINEAR-lorablated/c901a9ee-069a-4e3e-ac52-3017d67d8800.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/Promissum_Mane-8B-LINEAR/08317b59-ff74-43c8-bea5-2a266c38816e.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/RPMash-8B-Model_Stock/4106d4d3-344a-4c1f-b9ce-a3140d435013.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/RPMash_V3-8B-Model_Stock/2b308fad-8494-4056-8b84-82733cd2710a.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/Rusted_Gold-8B-LINEAR/93c867d0-4f10-440c-838c-91d1633fe584.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/Rusted_Platinum-8B-LINEAR/1a4a69c5-4acc-4ad9-adb2-bd9cf0fa2875.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/Rusted_Platinum-8B-Model_Stock/151226ba-9744-45bc-b923-30df57f7aa3e.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/Sellen-8B-model_stock/98363657-0793-4eb3-94de-28961afc92ea.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/Something-8B-Model_Stock/a32b4ded-6bff-441e-afbd-736e6d8cce5c.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/Spring_Dusk-8B-SCE/326bcf4a-02e9-4218-8bf2-55a94a79435e.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/Summer_Dawn-8B-SCE/145facc2-ab11-4c68-b841-762e0ad9bd5a.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/Summer_Dusk-8B-TIES/d3e6aae6-9284-4309-8d8c-02c9e797a58b.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/Summer_Rain-8B-SCE/6ee8537c-90e8-4455-83ca-c8c375a5ead7.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/Summer_Rain-8B-TIES/6efbfb38-57e5-46c7-b765-f7d0356afb97.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/Sun-8B-Model_Stock/f4d418d9-1089-452d-9c7f-4cc4712e6ac7.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/Sweetened_Condensed_Milk-8B-Model_Stock/1c9b325b-92b3-499a-a3ea-026269c63c88.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/TEST02-Ignore/c546ccde-cef3-4de2-a49f-24517d76dde5.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/TEST03-ignore/e85d3ccf-f48d-4e5c-b893-771a107773d4.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/TEST06-ignore/b8d22ade-874e-4ff3-9fcd-dbe14220d48b.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/TEST07-ignore/97e8e7e2-74a4-42a5-a0b1-250e47d3c3e6.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/TEST08-ignore/b2d56bb6-a726-4e47-8bc6-c016a51aac5c.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/Trinas_Nectar-8B-model_stock/3366f6d8-41bc-4c2c-a72c-bc0fd7dc8dd2.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/UNTESTED-VENN_1.2-8B-Model_Stock/7ba52efb-3890-4691-8740-9f051f1f645e.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/VENN_1.2-8B-Model_Stock/7b192b49-057e-418a-b47d-44b0ec82a6b6.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/WIP-Acacia-8B-Model_Stock/f2120d53-bef6-44d6-84a6-a6f8e3537188.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/WIP_Damascus-8B-TIES/f5408aa9-85c8-46e5-b225-0480b2e18e97.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/Wannabe-8B-Model_Stock/c1918f55-286c-4b29-ac53-2ee8f9d36d9e.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/What_A_Thrill-8B-Model_Stock/52659d37-67f8-45b8-88e4-11917dc90488.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/Winter-8B-SCE/556ae77c-effe-44ab-ac4a-1ad7cbd7c363.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/Winter_Dawn-8B-TIES/048fc971-3baf-4740-a132-2f9476d01b7a.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/Winter_Dusk-8B-TIES/abd28d25-01e0-474d-be35-08d816d281f5.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/Winter_Night-8B-Model_Stock/17f49724-6553-4baa-b354-45ffd0f2c844.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/Yafune-8B-Model_Stock/3e60d982-d7d5-432b-962e-b7734cc90534.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/Yearn_V3-8B-Model_Stock/79a0fdf3-b432-4598-be62-f9eb57fa5a43.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/ZEUS-8B-V17-Abliterated_ALT/662566e0-2af3-40d6-90de-9b361bcae355.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/Zelus-8B-Model_Stock/d81c0035-a0b1-426c-9080-8ccbf745642b.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/Zelus_V2-8B-Model_Stock/100bc243-158c-4e5c-918b-1439bf26fee8.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/felix_dies-mistral-7B-model_stock/45e32080-1464-40e0-a232-310fdda967eb.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/hakuchido-8B-MODEL_STOCK/e89b279f-d548-4aa8-b5e5-0bffdd98b840.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/ichor-8B-Model_Stock/777a53f9-891c-4f9e-99a8-bb1988f61f19.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/ichor_1.1-8B-Model_Stock/f15846b1-8eaa-411b-88f7-25064161af4e.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/inexpertus-8B-Model_Stock/e803fc85-fb98-4db8-aab0-a63100dcd5fc.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/inexpertus_1.1-8B-LINEAR/50620749-5ecf-41eb-a131-611675560e07.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/inexpertus_1.2-8B-LINEAR/2d40a551-6440-4d71-87e4-639d486c1c5e.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/mergekit-nuslerp-nqzkedi/22235942-2e3e-4ef4-b7a0-5800f507571a.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/remember_to_breathe-8b-Model-Stock/ac06867d-3a34-42f6-9e2e-226cf86748f6.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/test/394f1fc8-dc2c-4ff9-9ad0-7b3a8a8ddeb3.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/test_ALT/03e52d4f-78d7-453c-9685-844dd1636904.json
 delete mode 100644 data/hfopenllm_v2/DreadPoor/tests_pending-do_not_use_yet/3ce136d5-be81-4b8c-a7dc-4e1346935d35.json
 delete mode 100644 data/hfopenllm_v2/ECE-ILAB-PRYMMAL/ILAB-Merging-3B-V2/fb35accf-0c5d-4f72-8d73-ba366a41a76d.json
 delete mode 100644 data/hfopenllm_v2/EVA-UNIT-01/EVA-Qwen2.5-14B-v0.2/75e5ca5d-cce1-4463-b398-553399ce6833.json
 delete mode 100644 data/hfopenllm_v2/EVA-UNIT-01/EVA-Qwen2.5-72B-v0.2/c426bae7-b98d-4343-b419-ac8206196a95.json
 delete mode 100644 data/hfopenllm_v2/Edgerunners/meta-llama-3-8b-instruct-hf-ortho-baukit-34fail-3000total-bf16/b17de9f2-6f94-49f6-b908-fa983e8f8f9b.json
 delete mode 100644 data/hfopenllm_v2/EleutherAI/gpt-j-6b/58ba7ca1-8cca-4668-836b-824491d9cf01.json
 delete mode 100644 data/hfopenllm_v2/EleutherAI/gpt-neo-1.3B/23da100a-13b9-42a7-ba79-234be551d0e4.json
 delete mode 100644 data/hfopenllm_v2/EleutherAI/gpt-neo-125m/2d0c12b9-cff8-4366-a3ce-7772e4c098c9.json
 delete mode 100644 data/hfopenllm_v2/EleutherAI/gpt-neo-2.7B/4b87eea2-169c-411e-9d15-caf6b7826590.json
 delete mode 100644 data/hfopenllm_v2/EleutherAI/gpt-neox-20b/62a3cce2-4ff5-4dc9-beab-a06001fd82d9.json
 delete mode 100644 data/hfopenllm_v2/EleutherAI/pythia-1.4b/0e5961e1-af27-4eee-8b9b-c82ee4ab61b1.json
 delete mode 100644 data/hfopenllm_v2/EleutherAI/pythia-12b/b62352d4-e3b0-4b4d-8d68-e2d973d820c1.json
 delete mode 100644 data/hfopenllm_v2/EleutherAI/pythia-160m/7fadc486-767e-45ef-979d-74ecb858cb99.json
 delete mode 100644 data/hfopenllm_v2/EleutherAI/pythia-1b/d0628e6f-a6f3-42eb-b9fc-e880ae8c0688.json
 delete mode 100644 data/hfopenllm_v2/EleutherAI/pythia-2.8b/0999a066-1151-4445-b130-00d8fe4a516e.json
 delete mode 100644 data/hfopenllm_v2/EleutherAI/pythia-410m/1efc09d8-6a5c-4d48-b76e-2e04ef97b676.json
 delete mode 100644 data/hfopenllm_v2/EleutherAI/pythia-6.9b/1a59412f-fe78-4ecf-8951-8f2996dd374f.json
 delete mode 100644 data/hfopenllm_v2/Enno-Ai/EnnoAi-Pro-French-Llama-3-8B-v0.4/b5403311-2069-488d-af98-27da14496c15.json
 delete mode 100644 data/hfopenllm_v2/Enno-Ai/EnnoAi-Pro-Llama-3-8B-v0.3/6c10c176-b2b6-4216-91c0-1444944612f7.json
 delete mode 100644 data/hfopenllm_v2/Enno-Ai/EnnoAi-Pro-Llama-3-8B/80ebd92e-d9b6-46ce-b77e-973c3f3f6051.json
 delete mode 100644 data/hfopenllm_v2/Enno-Ai/EnnoAi-Pro-Llama-3.1-8B-v0.9/0418e36f-17ea-46a2-bfeb-91cc0ff719bf.json
 delete mode 100644 data/hfopenllm_v2/EnnoAi/EnnoAi-7B-French-Instruct-202502/4f5ba3fc-694a-45b1-ae9d-2c7d33e41519.json
 delete mode 100644 data/hfopenllm_v2/EnnoAi/EnnoAi-Pro-Llama-3.1-8B-v1.0/8b0d1556-bbd5-49e3-b881-32224bc1aa9a.json
 delete mode 100644 data/hfopenllm_v2/Epiculous/Azure_Dusk-v0.2/524e634f-280c-4f3a-9f1f-bdda19fad740.json
 delete mode 100644 data/hfopenllm_v2/Epiculous/Crimson_Dawn-v0.2/cb82e92b-f207-4fbd-9bfe-43184769cdbd.json
 delete mode 100644 data/hfopenllm_v2/Epiculous/NovaSpark/0b674103-4e55-41f4-accb-b7be73671801.json
 delete mode 100644 data/hfopenllm_v2/Epiculous/Violet_Twilight-v0.2/fa0290e0-723f-4502-90b6-c77007fffc1f.json
 delete mode 100644 data/hfopenllm_v2/EpistemeAI/Alpaca-Llama3.1-8B/c3827ecd-d02a-4464-a098-110f4fb54516.json
 delete mode 100644 data/hfopenllm_v2/EpistemeAI/Athena-gemma-2-2b-it-Philos/af9700fe-20c0-4b7c-9f3a-c4d78fab7911.json
 delete mode 100644 data/hfopenllm_v2/EpistemeAI/Athena-gemma-2-2b-it/959a4e4d-211c-4e45-94f1-f8f877e0b36f.json
 delete mode 100644 data/hfopenllm_v2/EpistemeAI/Athene-codegemma-2-7b-it-alpaca-v1.3/96a8b3c0-d6bc-41fe-8967-0d798669aa8e.json
 delete mode 100644 data/hfopenllm_v2/EpistemeAI/DeepPhi-3.5-mini-instruct/ed5d2ca8-d551-493d-8877-348204ef91cc.json
 delete mode 100644 data/hfopenllm_v2/EpistemeAI/DeepThinkers-Phi4/04e20a14-8346-4801-8515-189861c857cb.json
 delete mode 100644 data/hfopenllm_v2/EpistemeAI/FineLlama3.1-8B-Instruct/eec2da56-ba0a-418f-afe1-8a46882b9839.json
 delete mode 100644 data/hfopenllm_v2/EpistemeAI/Fireball-12B-v1.13a-philosophers/321cf68b-9220-4ada-89da-061341a20a9d.json
 delete mode 100644 data/hfopenllm_v2/EpistemeAI/Fireball-12B/86fda025-2345-4a40-9094-223b96b21f13.json
 delete mode 100644 data/hfopenllm_v2/EpistemeAI/Fireball-Alpaca-Llama-3.1-8B-Philos-DPO-200/3c734233-9868-4ba6-83c0-2b63f2ce8980.json
 delete mode 100644 data/hfopenllm_v2/EpistemeAI/Fireball-Alpaca-Llama3.1.07-8B-Philos-Math-KTO-beta/7f5eca48-0ab9-4ef2-85c2-a7f1fe713afe.json
 delete mode 100644 data/hfopenllm_v2/EpistemeAI/Fireball-Alpaca-Llama3.1.08-8B-Philos-C-R2/f5e0e809-08b8-43dd-a44d-875f365610c3.json
 delete mode 100644 data/hfopenllm_v2/EpistemeAI/Fireball-Meta-Llama-3.1-8B-Instruct-0.001-128K-auto/8d267135-a7e6-4ec5-ae09-66478804bb66.json
 delete mode 100644 data/hfopenllm_v2/EpistemeAI/Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.003-128K-code-ds-auto/4940ed0e-2c1e-4408-9806-49ceed30a69e.json
 delete mode 100644 data/hfopenllm_v2/EpistemeAI/Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.003-128K-code-ds-auto/5f6f7b7c-ef6a-4468-aae5-d7dfc25c5659.json
 delete mode 100644 data/hfopenllm_v2/EpistemeAI/Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.003-128K-code-ds/5244ee3c-7d65-434a-acfe-cdb277ff5264.json
 delete mode 100644 data/hfopenllm_v2/EpistemeAI/Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.003-128K-code/eba4644f-d455-4a23-a16f-8ecb038ffe7f.json
 delete mode 100644 data/hfopenllm_v2/EpistemeAI/Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.003-128K/fb270319-7010-4946-b60c-409aebe41aaa.json
 delete mode 100644 data/hfopenllm_v2/EpistemeAI/Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.004-128K-code-COT/d57bd77a-11cc-497c-b0bb-31c1ffa63dc2.json
 delete mode 100644 data/hfopenllm_v2/EpistemeAI/Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.004-128K-code-ds-auto/0220984e-fe8c-4e72-bc3e-92b949ffe769.json
 delete mode 100644 data/hfopenllm_v2/EpistemeAI/Fireball-Meta-Llama-3.1-8B-Instruct-Math/16482634-ec03-463a-9deb-2230ee955800.json
 delete mode 100644 data/hfopenllm_v2/EpistemeAI/Fireball-Meta-Llama-3.2-8B-Instruct-agent-003-128k-code-DPO/4c1db32d-96fc-4a66-b083-530a3e75ad6d.json
 delete mode 100644 data/hfopenllm_v2/EpistemeAI/Fireball-Mistral-Nemo-Base-2407-v1-DPO2/c0c5c846-395a-47ac-9e8e-e598939f317d.json
 delete mode 100644 data/hfopenllm_v2/EpistemeAI/Fireball-R1-Llama-3.1-8B-Medical-COT/6b3f6b59-a8eb-48c2-acbc-92e8f34b2dd6.json
 delete mode 100644 data/hfopenllm_v2/EpistemeAI/Fireball-R1-Llama-3.1-8B/d017e3bf-2abe-4b84-810e-e0eaf973adc3.json
 delete mode 100644 data/hfopenllm_v2/EpistemeAI/Fireball-R1.1-Llama-3.1-8B/62a3ecb8-f6d1-429c-807f-5545b2a5897f.json
 delete mode 100644 data/hfopenllm_v2/EpistemeAI/Llama-3.2-3B-Agent007-Coder/748557ce-1a49-4b3a-9c38-9007dc04aafb.json
 delete mode 100644 data/hfopenllm_v2/EpistemeAI/Mistral-Nemo-Instruct-12B-Philosophy-Math/95d43d01-a75e-4af4-a2cc-b60f832071d3.json
 delete mode 100644 data/hfopenllm_v2/EpistemeAI/OpenReasoner-Llama-3.2-3B-rs1.0/4dc7c889-7839-4047-b48c-33be5b688e72.json
 delete mode 100644 data/hfopenllm_v2/EpistemeAI/Polypsyche-Llama-3.1-8B-Instruct-Agent-0.003-128K-code-ds-auto-Empathy/751851c8-9a7f-4135-a106-eab4efbd0734.json
 delete mode 100644 data/hfopenllm_v2/EpistemeAI/Polypsyche-Llama-3.1-8B-Instruct-Agent-0.003-128K-code-ds-auto-Logic/2930e30c-9f2e-4248-ae3b-ed7ffbd12f8c.json
 delete mode 100644 data/hfopenllm_v2/EpistemeAI/Polypsyche-Llama-3.1-8B-Instruct-Agent-0.003-128K-code-ds-auto-divergent/c1acc460-aeb8-4a99-8ca5-376ab60fb74a.json
 delete mode 100644 data/hfopenllm_v2/EpistemeAI/Reasoning-Llama-3.1-CoT-RE1-NMT-V2-ORPO/33b8b64f-7da5-45aa-bf80-7145ef704229.json
 delete mode 100644 data/hfopenllm_v2/EpistemeAI/Reasoning-Llama-3.1-CoT-RE1-NMT/2662d257-49e2-430d-b44f-b0b347c61271.json
 delete mode 100644 data/hfopenllm_v2/EpistemeAI/Reasoning-Llama-3.2-1B-Instruct-v1.2/870b639b-ee7a-4b13-872b-52657539c836.json
 delete mode 100644 data/hfopenllm_v2/EpistemeAI/Reasoning-Llama-3.2-1B-Instruct-v1.3/6ff20678-a335-4fa8-8126-9f96ce247f34.json
 delete mode 100644 data/hfopenllm_v2/EpistemeAI/Reasoning-Llama-3.2-3B-Math-Instruct-RE1-ORPO/19c4ea89-896a-4577-a386-c2470eaf743f.json
 delete mode 100644 data/hfopenllm_v2/EpistemeAI/Reasoning-Llama-3.2-3B-Math-Instruct-RE1/22eb2479-16ff-4a56-b9e4-e8835da7ca0e.json
 delete mode 100644 data/hfopenllm_v2/EpistemeAI/ReasoningCore-1.0-3B-Instruct-r01-Reflect-Math/aca3f1fd-9c46-47f6-81c6-dc56a702c1de.json
 delete mode 100644 data/hfopenllm_v2/EpistemeAI/ReasoningCore-3B-0/071ca686-5950-4af4-80f2-969b1008e370.json
 delete mode 100644 data/hfopenllm_v2/EpistemeAI/ReasoningCore-3B-Instruct-r01-Reflect/78977c34-33f8-4037-86e0-dfce1d01c3f8.json
 delete mode 100644 data/hfopenllm_v2/EpistemeAI/ReasoningCore-3B-R01/480e4294-c8d9-4088-9b8c-7a239d57f683.json
 delete mode 100644 data/hfopenllm_v2/EpistemeAI/ReasoningCore-3B-RE1-V2/be9b21e8-90ce-451a-bcaf-2ebc7c72bc34.json
 delete mode 100644 data/hfopenllm_v2/EpistemeAI/ReasoningCore-3B-RE1-V2A/b0054dd8-e62c-4d0c-9b18-090851c3a7e2.json
 delete mode 100644 data/hfopenllm_v2/EpistemeAI/ReasoningCore-3B-RE1-V2B/985e479b-658a-4548-9b5e-c9c04b8838c1.json
 delete mode 100644 data/hfopenllm_v2/EpistemeAI/ReasoningCore-3B-RE1-V2C/d0ef8af4-156d-456d-9e33-b2cdb3f8c04e.json
 delete mode 100644 data/hfopenllm_v2/EpistemeAI/ReasoningCore-3B-T1-V1/5050c787-2f95-4a17-a4b0-c094860627b5.json
 delete mode 100644 data/hfopenllm_v2/EpistemeAI/ReasoningCore-3B-T1_1/bb5c8274-4324-47f2-94c5-d0c831ce0de7.json
 delete mode 100644 data/hfopenllm_v2/EpistemeAI2/Athene-codegemma-2-7b-it-alpaca-v1.2/8113a26a-5941-4f3d-872a-bdde5456ad97.json
 delete mode 100644 data/hfopenllm_v2/EpistemeAI2/Fireball-12B-v1.2/5b60047b-2e85-4a47-a31f-4c07f4bd2c30.json
 delete mode 100644 data/hfopenllm_v2/EpistemeAI2/Fireball-Alpaca-Llama3.1-8B-Philos/88d79858-3a35-43eb-8da6-95b80b5deef6.json
 delete mode 100644 data/hfopenllm_v2/EpistemeAI2/Fireball-Alpaca-Llama3.1.01-8B-Philos/63266a49-01ea-40f1-83ef-778f391aff2b.json
 delete mode 100644 data/hfopenllm_v2/EpistemeAI2/Fireball-Alpaca-Llama3.1.03-8B-Philos/f0da069a-833f-489a-a923-c79542a3a9a6.json
 delete mode 100644 data/hfopenllm_v2/EpistemeAI2/Fireball-Alpaca-Llama3.1.04-8B-Philos/205b9da8-d561-41ec-946e-1d2f9a43e437.json
 delete mode 100644 data/hfopenllm_v2/EpistemeAI2/Fireball-Alpaca-Llama3.1.06-8B-Philos-dpo/2ea4da56-4b95-4222-a4e2-f57c73e0ee4e.json
 delete mode 100644 data/hfopenllm_v2/EpistemeAI2/Fireball-Alpaca-Llama3.1.07-8B-Philos-Math/c086f693-cef1-4212-9c17-669b210f4caa.json
 delete mode 100644 data/hfopenllm_v2/EpistemeAI2/Fireball-Alpaca-Llama3.1.08-8B-C-R1-KTO-Reflection/290995f2-9982-4f29-ac74-dc646905206c.json
 delete mode 100644 data/hfopenllm_v2/EpistemeAI2/Fireball-Alpaca-Llama3.1.08-8B-Philos-C-R1/c60e65e6-d771-4c53-80d0-c1e09aa39377.json
 delete mode 100644 data/hfopenllm_v2/EpistemeAI2/Fireball-Llama-3.1-8B-Philos-Reflection/fcff202d-3b4f-4ba9-b3f6-1122d8abcac1.json
 delete mode 100644 data/hfopenllm_v2/EpistemeAI2/Fireball-MathMistral-Nemo-Base-2407-v2dpo/5f0fa37a-e829-402b-b2ab-c68ffa248b6e.json
 delete mode 100644 data/hfopenllm_v2/EpistemeAI2/Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.003-128K-code-math/a0b4a345-3530-4da2-8403-87259bbd1405.json
 delete mode 100644 data/hfopenllm_v2/EpistemeAI2/Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.005-128K-code-COT/3548f0ea-f3ab-4a0e-9c77-5ae62014ed44.json
 delete mode 100644 data/hfopenllm_v2/EpistemeAI2/Fireball-Phi-3-medium-4k-inst-Philos/707270e3-334b-4eba-84c0-2795ae53d79a.json
 delete mode 100644 data/hfopenllm_v2/Eric111/CatunaMayo-DPO/c827bee3-a181-42bc-9387-ca132d59c8ba.json
 delete mode 100644 data/hfopenllm_v2/Eric111/CatunaMayo/d3e8949b-f6f8-459f-891b-f4900ff806cd.json
 delete mode 100644 data/hfopenllm_v2/Etherll/Chocolatine-3B-Instruct-DPO-Revised-Ties-v2/35d5f5e3-74eb-4eea-9f78-b7b8969830a2.json
 delete mode 100644 data/hfopenllm_v2/Etherll/Chocolatine-3B-Instruct-DPO-Revised-Ties/4cf4479a-622a-4bc2-86f2-aa526216f24c.json
 delete mode 100644 data/hfopenllm_v2/Etherll/Herplete-LLM-Llama-3.1-8b-Ties/6ed27890-3e61-4c7d-8c94-a78c0b34ba32.json
 delete mode 100644 data/hfopenllm_v2/Etherll/Herplete-LLM-Llama-3.1-8b/87b5e360-7867-4edd-b45e-e7bb92a91b69.json
 delete mode 100644 data/hfopenllm_v2/Etherll/Herplete-LLM-Llama-3.1-8b/d93116b8-28ff-41ea-8273-56f7ae11cf18.json
 delete mode 100644 data/hfopenllm_v2/Etherll/Qwen2.5-7B-della-test/ba8c2c17-64f6-4cdb-b3b9-8977ce1bdbe2.json
 delete mode 100644 data/hfopenllm_v2/Etherll/Qwen2.5-Coder-7B-Instruct-Ties/5e5602cc-b4de-4247-aa6d-940817fc849b.json
 delete mode 100644 data/hfopenllm_v2/Etherll/Replete-LLM-V3-Llama-3.1-8b/cc5f27f5-36d8-49bb-9c9d-7879598bfe71.json
 delete mode 100644 data/hfopenllm_v2/Etherll/SuperHermes/aec03bd9-808a-4c3f-bbde-40bcac5775fb.json
 delete mode 100644 data/hfopenllm_v2/Eurdem/Defne-llama3.1-8B/b4ae6f0b-8a6b-4c60-8eb2-3e202877bcf5.json
 delete mode 100644 data/hfopenllm_v2/FINGU-AI/Chocolatine-Fusion-14B/c68deb4d-73a8-40ab-b4e5-1773b7ec4ed8.json
 delete mode 100644 data/hfopenllm_v2/FINGU-AI/L3-8B/a93c5674-599b-429c-a322-3c6bc7248f45.json
 delete mode 100644 data/hfopenllm_v2/FINGU-AI/Phi-4-RRStock/5e6374a6-56bd-4bd9-b04b-30ec9cf234bc.json
 delete mode 100644 data/hfopenllm_v2/FINGU-AI/Q-Small-3B/c3d2fc86-a5c4-4e92-bcf9-26096ca32ad4.json
 delete mode 100644 data/hfopenllm_v2/FINGU-AI/QwQ-Buddy-32B-Alpha/1b49cb06-3ee1-4945-aaed-12c868d9e45e.json
 delete mode 100644 data/hfopenllm_v2/FINGU-AI/RomboUltima-32B/65853bb5-ff3e-4880-8c32-ce9aabcadd7b.json
 delete mode 100644 data/hfopenllm_v2/FINGU-AI/Ultimos-32B/7fecc176-debf-4bf7-b3f3-479d05678a1e.json
 delete mode 100644 data/hfopenllm_v2/FallenMerick/Chewy-Lemon-Cookie-11B/3c965626-a264-40db-93e1-cd7659d0662e.json
 delete mode 100644 data/hfopenllm_v2/Felladrin/Llama-160M-Chat-v1/50fa6f0c-d689-4380-b619-253209b5badc.json
 delete mode 100644 data/hfopenllm_v2/Felladrin/Minueza-32M-UltraChat/adb25c88-6113-4307-bbf0-d377f757bc18.json
 delete mode 100644 data/hfopenllm_v2/FlofloB/100k_fineweb_continued_pretraining_Qwen2.5-0.5B-Instruct_Unsloth_merged_16bit/b9ac5e03-c878-4e46-a89c-1906f3b91dce.json
 delete mode 100644 data/hfopenllm_v2/FlofloB/10k_continued_pretraining_Phi-3-mini-4k-instruct_Unsloth_merged_16bit/d6a6badf-4472-44b5-af9e-4282e4406a8e.json
 delete mode 100644 data/hfopenllm_v2/FlofloB/10k_continued_pretraining_Qwen2.5-0.5B-Instruct_Unsloth_merged_16bit/92e62d3a-3091-4538-b6da-ba705e11687a.json
 delete mode 100644 data/hfopenllm_v2/FlofloB/40k_continued_pretraining_Qwen2.5-0.5B-Instruct_Unsloth_merged_16bit/04f5fdc6-f1cd-4b2d-947a-86fee67b3b62.json
 delete mode 100644 data/hfopenllm_v2/FlofloB/83k_continued_pretraining_Qwen2.5-0.5B-Instruct_Unsloth_merged_16bit/5013ccfc-6bc5-4862-898c-1ca781f92572.json
 delete mode 100644 data/hfopenllm_v2/FlofloB/smollm2-135M_pretrained_1000k_fineweb/38fff98c-72b1-453c-a2cf-cf077dd19d10.json
 delete mode 100644 data/hfopenllm_v2/FlofloB/smollm2-135M_pretrained_1000k_fineweb_uncovai_human_removed/42911928-ef64-474b-828a-02ce3383773e.json
 delete mode 100644 data/hfopenllm_v2/FlofloB/smollm2-135M_pretrained_1000k_fineweb_uncovai_selected/7989d7d3-c5e9-43c6-80a1-6de51533f9bf.json
 delete mode 100644 data/hfopenllm_v2/FlofloB/smollm2-135M_pretrained_1200k_fineweb/5b9acd52-7eb6-4099-98be-ecd6cae07835.json
 delete mode 100644 data/hfopenllm_v2/FlofloB/smollm2-135M_pretrained_1200k_fineweb_uncovai_human_removed/666bef5a-2d62-4743-bff1-07365716ab19.json
 delete mode 100644 data/hfopenllm_v2/FlofloB/smollm2-135M_pretrained_1200k_fineweb_uncovai_selected/85de411c-2308-4824-bd6e-3327eeb6fe3e.json
 delete mode 100644 data/hfopenllm_v2/FlofloB/smollm2-135M_pretrained_1400k_fineweb/df28c4c2-d6a4-4ab0-a1ac-faf00a93de99.json
 delete mode 100644 data/hfopenllm_v2/FlofloB/smollm2-135M_pretrained_1400k_fineweb_uncovai_human_removed/6fb37ad0-b41b-4ad7-91a2-79bbb835d445.json
 delete mode 100644 data/hfopenllm_v2/FlofloB/smollm2-135M_pretrained_1400k_fineweb_uncovai_selected/c41df02e-5aff-4de6-a1c4-d45b5585e29d.json
 delete mode 100644 data/hfopenllm_v2/FlofloB/smollm2-135M_pretrained_200k_fineweb_uncovai_human_removed/aa587b4a-9c19-4231-ba72-9b66446460f9.json
 delete mode 100644 data/hfopenllm_v2/FlofloB/smollm2-135M_pretrained_200k_fineweb_uncovai_selected/be14e75e-4fb1-41aa-b168-1ec23eb305e0.json
 delete mode 100644 data/hfopenllm_v2/FlofloB/smollm2-135M_pretrained_400k_fineweb/73be4a2b-28c9-4208-8107-3734fea25008.json
 delete mode 100644 data/hfopenllm_v2/FlofloB/smollm2-135M_pretrained_400k_fineweb_uncovai_human_removed/0bf2fa4e-3bcb-46ff-a068-f4c796123c6d.json
 delete mode 100644 data/hfopenllm_v2/FlofloB/smollm2-135M_pretrained_400k_fineweb_uncovai_selected/9f8fc05a-8658-4ed3-994a-965e6882d242.json
 delete mode 100644 data/hfopenllm_v2/FlofloB/smollm2-135M_pretrained_600k_fineweb/ced11f6e-490d-42e9-8f3e-00e22cfc2910.json
 delete mode 100644 data/hfopenllm_v2/FlofloB/smollm2-135M_pretrained_600k_fineweb_uncovai_human_removed/70ba788b-fe8c-4667-a859-0fb122de22b9.json
 delete mode 100644 data/hfopenllm_v2/FlofloB/smollm2-135M_pretrained_600k_fineweb_uncovai_selected/e93f2d5f-7ffc-44b8-b2dc-d07b73de44ab.json
 delete mode 100644 data/hfopenllm_v2/FlofloB/smollm2-135M_pretrained_800k_fineweb/15cacfe0-bdfb-4b87-a813-bfa70ff71984.json
 delete mode 100644 data/hfopenllm_v2/FlofloB/smollm2-135M_pretrained_800k_fineweb_uncovai_human_removed/cff00e2a-41e3-40d2-aab3-4bb3bd7d0d0e.json
 delete mode 100644 data/hfopenllm_v2/FlofloB/smollm2-135M_pretrained_800k_fineweb_uncovai_selected/e1eab0cf-2c6d-44b2-8aaf-a75347741529.json
 delete mode 100644 data/hfopenllm_v2/FlofloB/smollm2_pretrained_200k_fineweb/ed221db8-cf81-4257-8785-db9381eec5b7.json
 delete mode 100644 data/hfopenllm_v2/FlofloB/test_continued_pretraining_Phi-3-mini-4k-instruct_Unsloth_merged_16bit/b314468b-401a-4318-b022-c966bf3366aa.json
 delete mode 100644 data/hfopenllm_v2/FuJhen/ft-openhermes-25-mistral-7b-irca-dpo-pairs/a0dbb2eb-66c7-48a3-a85c-725b49141edf.json
 delete mode 100644 data/hfopenllm_v2/FuJhen/mistral-instruct-7B-DPO/812a36ec-4928-40a9-9aa8-ee39d7bb02f5.json
 delete mode 100644 data/hfopenllm_v2/FuJhen/mistral_7b_v0.1_structedData_e2e/77af2424-0a23-49f3-97b0-316d04a33547.json
 delete mode 100644 data/hfopenllm_v2/FuJhen/mistral_7b_v0.1_structedData_viggo/6f422676-2d7e-40ed-a5e3-4afc25564cfc.json
 delete mode 100644 data/hfopenllm_v2/FuseAI/FuseChat-7B-v2.0/43923dd6-838a-4259-a938-7766dfd9c07e.json
 delete mode 100644 data/hfopenllm_v2/FuseAI/FuseChat-Llama-3.1-8B-Instruct/dba94a49-02b0-4e92-bd6c-c6bfc9be3cfb.json
 delete mode 100644 data/hfopenllm_v2/FuseAI/FuseChat-Llama-3.2-3B-Instruct/16a782dc-0795-4281-aad6-4f664a0940ab.json
 delete mode 100644 data/hfopenllm_v2/FuseAI/FuseChat-Qwen-2.5-7B-Instruct/5d24d4ad-9f37-4634-ba23-74fbc74fd298.json
 delete mode 100644 data/hfopenllm_v2/GalrionSoftworks/MN-LooseCannon-12B-v1/043cd315-fcb7-4871-ae79-dee3fdefaef0.json
 delete mode 100644 data/hfopenllm_v2/GalrionSoftworks/MagnusIntellectus-12B-v1/3c377d7e-14bc-4c82-9ada-7560552abbe4.json
 delete mode 100644 data/hfopenllm_v2/GenVRadmin/AryaBhatta-GemmaOrca-2-Merged/43bb650b-8bb7-41b4-866a-cb2dad1499d6.json
 delete mode 100644 data/hfopenllm_v2/GenVRadmin/AryaBhatta-GemmaOrca-Merged/bdf8f907-37ca-41ca-9a4e-f4dd446f895f.json
 delete mode 100644 data/hfopenllm_v2/GenVRadmin/AryaBhatta-GemmaUltra-Merged/14a1872c-7afd-4cd4-ad87-853e4fc0847e.json
 delete mode 100644 data/hfopenllm_v2/GenVRadmin/llama38bGenZ_Vikas-Merged/887e4ca9-ed48-4b33-b933-f8534a8d0377.json
 delete mode 100644 data/hfopenllm_v2/GoToCompany/gemma2-9b-cpt-sahabatai-v1-instruct/c585488d-4043-482f-b1fa-4a61e96f7f0f.json
 delete mode 100644 data/hfopenllm_v2/GoToCompany/llama3-8b-cpt-sahabatai-v1-instruct/d64541f6-19ef-4f04-a991-93efec6fe24f.json
 delete mode 100644 data/hfopenllm_v2/Goekdeniz-Guelmez/Josiefied-Qwen2.5-0.5B-Instruct-abliterated-v1/1c13e194-8bee-4456-a249-f71e7e34b0eb.json
 delete mode 100644 data/hfopenllm_v2/Goekdeniz-Guelmez/Josiefied-Qwen2.5-0.5B-Instruct-abliterated-v1/1d3db737-20e7-4da1-a311-e60de0b41c93.json
 delete mode 100644 data/hfopenllm_v2/Goekdeniz-Guelmez/Josiefied-Qwen2.5-1.5B-Instruct-abliterated-v1/7b73d50e-358b-4961-8b58-63765ce5a82a.json
 delete mode 100644 data/hfopenllm_v2/Goekdeniz-Guelmez/Josiefied-Qwen2.5-1.5B-Instruct-abliterated-v2/81dfd69c-cf01-4114-8157-fd09af6f490c.json
 delete mode 100644 data/hfopenllm_v2/Goekdeniz-Guelmez/Josiefied-Qwen2.5-1.5B-Instruct-abliterated-v3/f38240ab-35e4-431e-b4d5-b1b0e1d57c5f.json
 delete mode 100644 data/hfopenllm_v2/Goekdeniz-Guelmez/Josiefied-Qwen2.5-14B-Instruct-abliterated-v4/01863b4f-9550-49c3-ad83-74c0bb535eb9.json
 delete mode 100644 data/hfopenllm_v2/Goekdeniz-Guelmez/Josiefied-Qwen2.5-7B-Instruct-abliterated-v2/edd25437-38bc-443c-9da3-bc041270447e.json
 delete mode 100644 data/hfopenllm_v2/Goekdeniz-Guelmez/j.o.s.i.e.v4o-1.5b-dpo-stage1-v1/31836d43-5022-488f-ba9e-379195809069.json
 delete mode 100644 data/hfopenllm_v2/Goekdeniz-Guelmez/josie-3b-v6.0/2a5a3ed6-7137-49e2-a141-497ceba88757.json
 delete mode 100644 data/hfopenllm_v2/Goekdeniz-Guelmez/josie-7b-v6.0-step2000/0b1c6aa6-b94e-4400-9b0d-c39aa1bcd808.json
 delete mode 100644 data/hfopenllm_v2/Goekdeniz-Guelmez/josie-7b-v6.0-step2000/69423132-adc9-4b97-b799-15f37de1d7e5.json
 delete mode 100644 data/hfopenllm_v2/Goekdeniz-Guelmez/josie-7b-v6.0/54d5bf0f-7c4c-40b1-bca6-5484ef8e2a04.json
 delete mode 100644 data/hfopenllm_v2/GreenNode/GreenNode-small-9B-it/cfe8f9c7-e9bf-4a17-afa0-d5b8f46d24e7.json
 delete mode 100644 data/hfopenllm_v2/GritLM/GritLM-7B-KTO/7fbc0323-1c78-46b6-a08a-6e5870c64e53.json
 delete mode 100644 data/hfopenllm_v2/GritLM/GritLM-8x7B-KTO/1c769f0d-b99d-4b82-a529-f5264f7b3349.json
 delete mode 100644 data/hfopenllm_v2/Groq/Llama-3-Groq-8B-Tool-Use/a9365685-e299-48e2-931a-c63e123a9e00.json
 delete mode 100644 data/hfopenllm_v2/Gryphe/Pantheon-RP-1.0-8b-Llama-3/bdf2d61a-daa1-4b1f-9245-43ff263540fb.json
 delete mode 100644 data/hfopenllm_v2/Gryphe/Pantheon-RP-1.5-12b-Nemo/f0b4eef9-dab2-48e2-87f8-ad83ec33ec23.json
 delete mode 100644 data/hfopenllm_v2/Gryphe/Pantheon-RP-1.6-12b-Nemo-KTO/29e10491-8c34-4b7a-a0bd-77f6ca0dc54c.json
 delete mode 100644 data/hfopenllm_v2/Gryphe/Pantheon-RP-1.6-12b-Nemo/c588d86a-80c4-46d1-93e0-b7fa8491f3b3.json
 delete mode 100644 data/hfopenllm_v2/Gryphe/Pantheon-RP-Pure-1.6.2-22b-Small/0b11eb9a-61c8-4af1-8335-24bef2597e5d.json
 delete mode 100644 data/hfopenllm_v2/GuilhermeNaturaUmana/Nature-Reason-1.2-reallysmall/7d31e5fd-700a-42a8-bea8-8989e8c52603.json
 delete mode 100644 data/hfopenllm_v2/GuilhermeNaturaUmana/Nature-Reason-1.2-reallysmall/f993880a-3c7c-4af9-a3ce-3c27207b9a3c.json
 delete mode 100644 data/hfopenllm_v2/Gunulhona/Gemma-Ko-Merge-PEFT/2fae7e4a-8c28-4be8-9391-ca79077e32c2.json
 delete mode 100644 data/hfopenllm_v2/Gunulhona/Gemma-Ko-Merge-PEFT/436e651e-6f04-44ff-ab3d-db8ed0d639bd.json
 delete mode 100644 data/hfopenllm_v2/Gunulhona/Gemma-Ko-Merge/9fbccac2-c840-494e-a24d-a6f0c9a07b88.json
 delete mode 100644 data/hfopenllm_v2/HPAI-BSC/Llama3-Aloe-8B-Alpha/a4ee6a33-df51-4a4e-a13d-45488a094fd7.json
 delete mode 100644 data/hfopenllm_v2/HPAI-BSC/Llama3.1-Aloe-Beta-8B/a3923f10-e64c-4556-9616-4fe7072eff60.json
 delete mode 100644 data/hfopenllm_v2/HPAI-BSC/Qwen2.5-Aloe-Beta-7B/ca15d972-9075-42df-884b-5d069f6ff425.json
 delete mode 100644 data/hfopenllm_v2/HarbingerX/Zeitgeist-3b-V1.2/905909a5-abef-46bf-9392-c97873e229df.json
 delete mode 100644 data/hfopenllm_v2/HarbingerX/Zeitgeist-3b-V1/95bd05cf-8f59-409d-a99e-d249bad6c561.json
 delete mode 100644 data/hfopenllm_v2/Hastagaras/L3.2-JametMini-3B-MK.III/76b12246-33f6-4992-a0ab-38704dcf6345.json
 delete mode 100644 data/hfopenllm_v2/Hastagaras/Llama-3.1-Jamet-8B-MK.I/e4415806-0ec0-465a-b28f-9c8741436fb4.json
 delete mode 100644 data/hfopenllm_v2/Hastagaras/Zabuza-8B-Llama-3.1/98e62ab5-d35a-42dd-904b-bed9c50f3745.json
 delete mode 100644 data/hfopenllm_v2/HelpingAI/Cipher-20B/8fb3596e-224e-492b-bdb6-a95a16656eb0.json
 delete mode 100644 data/hfopenllm_v2/HelpingAI/Dhanishtha-Large/154203c4-d86e-4c36-806b-c45c5cc568ce.json
 delete mode 100644 data/hfopenllm_v2/HelpingAI/Priya-10B/e42c01f7-2869-4103-bbfd-81aa5a15c140.json
 delete mode 100644 data/hfopenllm_v2/HelpingAI/Priya-3B/323d2f94-5e04-4627-9f74-129217f53eea.json
 delete mode 100644 data/hfopenllm_v2/HeraiHench/DeepSeek-R1-Qwen-Coder-8B/6bcc284b-8973-47d5-b5b1-1abb7a3242ee.json
 delete mode 100644 data/hfopenllm_v2/HeraiHench/Double-Down-Qwen-Math-7B/691cace3-5316-4f5b-8693-67efb24a0a06.json
 delete mode 100644 data/hfopenllm_v2/HeraiHench/Marge-Qwen-Math-7B/d387b3dc-9e76-44a6-9a9f-132a4fd762b4.json
 delete mode 100644 data/hfopenllm_v2/HeraiHench/Phi-4-slerp-ReasoningRP-14B/f6f515d3-f5e9-4362-be51-bb8fc05527e6.json
 delete mode 100644 data/hfopenllm_v2/HiroseKoichi/Llama-Salad-4x8B-V3/2e1e215f-b622-439f-a13f-531441e25ae3.json
 delete mode 100644 data/hfopenllm_v2/HoangHa/Pensez-Llama3.1-8B/d50d66a9-a0c4-4b82-922c-9d012f1b50a1.json
 delete mode 100644 data/hfopenllm_v2/HuggingFaceH4/zephyr-7b-alpha/ea7292a8-3f07-47be-b8ae-7d352ed1ecb6.json
 delete mode 100644 data/hfopenllm_v2/HuggingFaceH4/zephyr-7b-beta/4eedd6d4-279f-4660-8d71-708a27bb53e0.json
 delete mode 100644 data/hfopenllm_v2/HuggingFaceH4/zephyr-7b-gemma-v0.1/9c0f67d1-f95d-4ca0-a234-2e09ac788f55.json
 delete mode 100644 data/hfopenllm_v2/HuggingFaceH4/zephyr-orpo-141b-A35b-v0.1/e5c0fbc9-f424-4b04-839a-8335adaf89cc.json
 delete mode 100644 data/hfopenllm_v2/HuggingFaceTB/SmolLM-1.7B-Instruct/d91107fa-eb8d-4d01-90a2-fc9831f337b2.json
 delete mode 100644 data/hfopenllm_v2/HuggingFaceTB/SmolLM-1.7B/926999bf-1ba6-4321-82b2-fcced4336739.json
 delete mode 100644 data/hfopenllm_v2/HuggingFaceTB/SmolLM-135M-Instruct/57d481bf-0db9-4208-afda-dcd20df13964.json
 delete mode 100644 data/hfopenllm_v2/HuggingFaceTB/SmolLM-135M/eb417e47-fe63-4dc5-b3e5-28782f3782da.json
 delete mode 100644 data/hfopenllm_v2/HuggingFaceTB/SmolLM-360M-Instruct/b0f516dd-7185-4906-87a5-3c6f019894d0.json
 delete mode 100644 data/hfopenllm_v2/HuggingFaceTB/SmolLM-360M/1e562944-a205-4ef7-aff1-3776595d131c.json
 delete mode 100644 data/hfopenllm_v2/HuggingFaceTB/SmolLM2-1.7B-Instruct/6ccaf08d-1b0a-4ca9-941e-a71e2dce5cb4.json
 delete mode 100644 data/hfopenllm_v2/HuggingFaceTB/SmolLM2-1.7B/2064938d-9f05-4740-a4d4-2a2da0eac21d.json
 delete mode 100644 data/hfopenllm_v2/HuggingFaceTB/SmolLM2-135M-Instruct/43240184-8245-43ff-a971-678523918fe0.json
 delete mode 100644 data/hfopenllm_v2/HuggingFaceTB/SmolLM2-135M-Instruct/b3b854b6-700c-4297-b335-6acc3c385f84.json
 delete mode 100644 data/hfopenllm_v2/HuggingFaceTB/SmolLM2-135M/a9d79c6a-f99a-4b60-8e37-ee2cdfe75f30.json
 delete mode 100644 data/hfopenllm_v2/HuggingFaceTB/SmolLM2-360M-Instruct/88e1dd78-d3bc-401b-88e9-d963bac181db.json
 delete mode 100644 data/hfopenllm_v2/HuggingFaceTB/SmolLM2-360M-Instruct/a41bd607-f319-4063-a6e4-813f43e40568.json
 delete mode 100644 data/hfopenllm_v2/HuggingFaceTB/SmolLM2-360M/8629aef1-c673-4b17-a9cc-b361a53bdaa7.json
 delete mode 100644 data/hfopenllm_v2/HumanLLMs/Humanish-LLama3-8B-Instruct/532c927a-dc0c-4e65-8ab0-7b9ddd889d89.json
 delete mode 100644 data/hfopenllm_v2/HumanLLMs/Humanish-Mistral-Nemo-Instruct-2407/843f9927-9865-4066-9cc0-f0522d3b914f.json
 delete mode 100644 data/hfopenllm_v2/HumanLLMs/Humanish-Qwen2.5-7B-Instruct/eeecb2cb-e286-443f-84aa-d825702a4ad8.json
 delete mode 100644 data/hfopenllm_v2/IDEA-CCNL/Ziya-LLaMA-13B-v1/36ab4f5a-b2cf-4d01-8283-9eaf2c90928f.json
 delete mode 100644 data/hfopenllm_v2/INSAIT-Institute/BgGPT-Gemma-2-27B-IT-v1.0/c4e810f1-ffb3-4ece-b445-64e339761530.json
 delete mode 100644 data/hfopenllm_v2/IlyaGusev/gemma-2-2b-it-abliterated/025725b6-0034-48c0-a720-5fc210e5e24b.json
 delete mode 100644 data/hfopenllm_v2/IlyaGusev/gemma-2-9b-it-abliterated/7bdd8928-c336-494e-9c87-de9ecc2749b8.json
 delete mode 100644 data/hfopenllm_v2/Infinirc/Infinirc-Llama3-8B-2G-Release-v1.0/ff7369dc-3ff2-424b-80b0-e06a141b54f3.json
 delete mode 100644 data/hfopenllm_v2/Intel/neural-chat-7b-v3-1/a6dc7253-75fd-4897-be85-8ac89fc11f8e.json
 delete mode 100644 data/hfopenllm_v2/Intel/neural-chat-7b-v3-2/296ceacc-542a-4000-bf9b-ae59b33a53ce.json
 delete mode 100644 data/hfopenllm_v2/Intel/neural-chat-7b-v3-3/13870577-7579-48b4-9c92-202318ca6ecc.json
 delete mode 100644 data/hfopenllm_v2/Intel/neural-chat-7b-v3/6ebd2806-2623-4773-93bd-1036ff01cb8c.json
 delete mode 100644 data/hfopenllm_v2/IntervitensInc/internlm2_5-20b-llamafied/99d6a44b-d556-4674-8ade-a5b30cf99255.json
 delete mode 100644 data/hfopenllm_v2/Invalid-Null/PeiYangMe-0.5/605118a3-316a-46b5-9719-f596e361a2a8.json
 delete mode 100644 data/hfopenllm_v2/Invalid-Null/PeiYangMe-0.7/271d2829-fbd4-438e-9f09-59539af68c8b.json
 delete mode 100644 data/hfopenllm_v2/Isaak-Carter/JOSIEv4o-8b-stage1-v4/107bc549-75c1-4272-b567-f8ab9f6cd675.json
 delete mode 100644 data/hfopenllm_v2/Isaak-Carter/JOSIEv4o-8b-stage1-v4/dfb451e9-c1c1-45a1-8082-155763366129.json
 delete mode 100644 data/hfopenllm_v2/Isaak-Carter/Josiefied-Qwen2.5-7B-Instruct-abliterated-v2/b2d80977-d079-42ec-b057-5aac530b9d70.json
 delete mode 100644 data/hfopenllm_v2/Isaak-Carter/Josiefied-Qwen2.5-7B-Instruct-abliterated/16b33b80-3b4b-4edb-b89f-3d93dca8969c.json
 delete mode 100644 data/hfopenllm_v2/J-LAB/Thynk_orpo/63c94e0a-4572-4b8a-bfe0-7f88bb847d7f.json
 delete mode 100644 data/hfopenllm_v2/JackFram/llama-160m/538f2b43-328c-456d-8a40-ff2b37924453.json
 delete mode 100644 data/hfopenllm_v2/JackFram/llama-68m/fb7a68e6-716e-48c6-96c0-d227735f9a7c.json
 delete mode 100644 data/hfopenllm_v2/Jacoby746/Casual-Magnum-34B/3593d4b8-5602-4cca-935f-a76e342f060a.json
 delete mode 100644 data/hfopenllm_v2/Jacoby746/Inf-Silent-Kunoichi-v0.1-2x7B/72d503fc-b221-498e-811a-a806769175d6.json
 delete mode 100644 data/hfopenllm_v2/Jacoby746/Inf-Silent-Kunoichi-v0.2-2x7B/ad7d9698-d9e6-4f2d-9767-987835626c8c.json
 delete mode 100644 data/hfopenllm_v2/Jacoby746/Proto-Athena-4x7B/98899942-fcf0-41de-8587-44d7429bea47.json
 delete mode 100644 data/hfopenllm_v2/Jacoby746/Proto-Athena-v0.2-4x7B/bb51eb59-88f6-49c2-814a-11b2c80313d0.json
 delete mode 100644 data/hfopenllm_v2/Jacoby746/Proto-Harpy-Blazing-Light-v0.1-2x7B/d8563f36-e299-4186-a5dc-9dae51824e1f.json
 delete mode 100644 data/hfopenllm_v2/Jacoby746/Proto-Harpy-Spark-v0.1-7B/43bc0528-7bc5-4eac-8848-c9995079450f.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen-0.5B-DPO-1epoch/ce19893b-a7e1-4f8e-96f2-eb9cee2afeac.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen-0.5B-DPO-5epoch/24629e14-d197-4a5b-adff-7840af652f22.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen-0.5B-IRPO-1epoch/9c3ea35c-2cf7-4c31-8b83-c69df3cd9448.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen-0.5B-IRPO-5epoch/46548403-6eb5-4f7a-874c-1327420f4cab.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen-0.5B-eDPO-1epoch/0bd9c061-b7ee-4bc2-9deb-ea7eea012c49.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen-0.5B-eDPO-5epoch/aa2fe858-111c-45e8-b0d4-0048d7fc7ef7.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-Instruct-SFT-DPO-1epoch_v1/ad03cae6-b126-4157-a225-9576e4d651d0.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-Instruct-SFT-IRPO-1epoch_v1/0d57b65d-3dd4-4185-b8cf-531105e94b5e.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-Instruct-SFT-MDPO-1epoch_v1/f8882044-6e71-4788-b2ee-f51f85e67ecc.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-Instruct-SFT/3c8f96c5-af91-4f41-a0b4-6e1b7d55d8ad.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-1e-4-2ep/e26743b9-4caf-46f8-bd5a-7e4445c850b1.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-1e-4-3ep/febd4016-3a30-4b26-93e5-f7b556781b9b.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-1e-4-5ep/ae82125e-94ac-48ca-8240-807e4b7ef9a0.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-1e-4/5321fa0b-b010-4e1d-9f20-a97b56f4f937.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-1e-5-2ep/d25a4602-ea50-4a53-952c-112ba250123b.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-1e-5-3ep/232e3fc4-5cd2-4515-9e15-acd7d56bc34d.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-1e-5-5ep/975f54fe-a581-4ce1-b0c1-7becb7605f09.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-1e-5/92ae4461-48bc-47fe-a3ad-ea4c3452d395.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-4-2ep/638e1cc0-9baf-4555-a278-4b21c46af86f.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-4-3ep/cef4161a-4e1c-4a92-bca8-b07f957a13b1.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-4-5ep/715b556b-2bc0-4864-b4b1-b7413a5d45bc.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-4/7552ad5c-5d1f-478b-a931-036083b2954e.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPOP_3e-7-3ep_0alp_5lam/7bb3ae9f-9bb3-4bf2-9d97-d7f4f30697ac.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPOP_5e-6-1ep_0alp_5lam/821d67e5-da8d-4383-8825-3bfa72a91fc9.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPOP_5e-6-2ep_0alp_5lam/c5bddcba-4a40-4fbb-93e8-aebd06a70a66.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPOP_5e-6-3ep_0alp_5lam/dc35237c-606d-4609-927a-566bea767312.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPOP_5e-7-1ep_0alp_5lam/3924d1af-e167-4186-a34b-d9b4b8c26d59.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPOP_5e-7-2ep_0alp_5lam/f733c4cc-90fc-4b31-bed3-c57dba6d4b6a.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPOP_5e-7-3ep_0alp_5lam/08f933a0-b096-4271-890e-0df7e20d1d20.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_1e-6-1ep_0alp_0lam/8434e448-ed77-45f2-9c31-39128912f842.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_1e-6-2ep_0alp_0lam/d801037b-1eb0-4058-9096-429e5237e015.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_1e-6-3ep_0alp_0lam/e0c46f18-598e-402f-8955-68e71fab67cd.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_1e-7-2ep_0alp_0lam/4b987cb5-cf7c-4866-8cf0-9926f78c2de9.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_1e-7-3ep_0alp_0lam/ec658058-1075-4918-9dc9-fc79d0dcf897.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_2e-6-1ep_0alp_0lam/b68baa86-3e1a-4888-98ba-2ecede79b4a7.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_2e-6-2ep_0alp_0lam/0b11c8ab-2cfa-425d-9d81-d999f94401db.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_2e-6-3ep_0alp_0lam/a3e48db8-3679-4f19-853d-82a73ef49400.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_3e-6-1ep_0alp_0lam/7dbf35b2-80c1-4181-80f9-850ea51cead2.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_3e-6-2ep_0alp_0lam/231f47db-1662-4313-9ff4-f32883f5615c.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_3e-6-3ep_0alp_0lam/c79df898-14c6-4f00-9f65-0d01cd34ed61.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_3e-7-3ep_0alp_0lam/2c52917f-c396-410d-bc78-c93c433797fc.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_5e-6-1ep_0alp_0lam/0f1d2925-4e1c-495b-94be-f3515fbd53d7.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_5e-6-2ep_0alp_0lam/5cbb1972-9895-4689-9f6f-7e0037829a78.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_5e-6-3ep_0alp_0lam/6bc42e37-1f31-47cb-97e4-9d0b28b53691.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_5e-7_1ep_0alp_0lam/a1573b95-59e6-4ae0-bc12-6ef6fee90b76.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_5e-7_2ep_0alp_0lam/78c61b39-3c76-4af9-8d5e-fcd67d6c8779.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_5e-7_3ep_0alp_0lam/e4c06400-da86-4448-b421-23476f50bdb3.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_7e-7_1ep_0alp_0lam/48f4c2a7-e819-4789-92ea-e02c5e92d3e4.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_7e-7_2ep_0alp_0lam/cd9cbbac-f1ca-4193-88cc-e5968cc1bb62.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_7e-7_3ep_0alp_0lam/ab3685ab-1795-4a0e-8ee4-4f509616d1b8.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_1e-7-1ep_1alp_0lam/9018f443-a63f-4e07-b10b-272f66d1eb0d.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_1e-7-2ep_1alp_0lam/548d1536-b941-43a9-a60b-ae5448b70933.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_1e-7-3ep_1alp_0lam/99853109-17d9-46fa-a502-e4c977c1fb8f.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_3e-7-3ep_1alp_0lam/e171a0a0-f46d-404f-84e8-539155284e17.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_5e-6-1ep_1alp_0lam/eadd93e5-5770-4d4a-a1b2-6e732a82ce34.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_5e-6-2ep_1alp_0lam/151cb8c4-0a7d-4886-80ea-560902e1f932.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_5e-6-3ep_1alp_0lam/1acb97c4-a9d2-4ec8-9486-77eb6857646c.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_5e-7-1ep_1alp_0lam/1d803ac5-3ca6-4cb0-bcd1-779eaea1562d.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_5e-7-2ep_1alp_0lam/81562e50-23c5-4ef1-b98c-b40625f3b8c6.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_5e-7-3ep_1alp_0lam/95fa292a-ee64-4844-9646-ce3cc7f730d2.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_0.5_1e-7-1ep_0alp_0lam/4d14c584-b5a1-41cd-9605-78088dfebd7f.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_0.5_1e-7-2ep_0alp_0lam/1415d3d9-d7f8-48ef-8a2f-aa675c4c14db.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_0.5_1e-7-3ep_0alp_0lam/4b0ab369-e72f-4229-b449-3a21ee9d2c95.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_1e-6-3ep_0alp_0lam/478b6c1f-3329-4c9b-9d90-59b8b551c1af.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_1e-6_1ep_0alp_0lam/212f8dd2-3c61-45bd-a3de-2326334feb73.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_1e-6_2ep_0alp_0lam/9251282e-f72f-406e-a2cf-e7063516f624.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_2e-6-3ep_0alp_0lam/91a3c739-7e16-4d21-8879-bb2fd4d4c6ad.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_2e-6_1ep_0alp_0lam/aaa78d8f-6050-4b5d-bb67-da6c9d1ee065.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_2e-6_2ep_0alp_0lam/1f0430fe-24ff-4ef6-8577-ee5bfa74f18b.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_3e-6-1ep_0alp_0lam/f374772b-2685-41e2-a455-9002e48e3739.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_3e-6-2ep_0alp_0lam/6db801f8-5253-47c0-b87e-6779bff42f6b.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_3e-6-3ep_0alp_0lam/0d704671-c0b6-4296-85b5-eaf972d6be6a.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_5e-6-1ep_0alp_0lam/7e31545f-0865-4843-914b-a71f8a84314f.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_5e-6-2ep_0alp_0lam/431c7130-5a19-4a71-8a92-fea9726769ac.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_5e-6-3ep_0alp_0lam/ca850c4a-14d0-4145-9977-0d33e6e3e362.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_5e-7-3ep_0alp_0lam/7389caa3-6d8f-43e3-b3f2-d9320e56f621.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_5e-7_1ep_0alp_0lam/1e822b0f-0d80-4613-983b-ebd2e6fbfcd6.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_5e-7_2ep_0alp_0lam/1206f592-e6f7-4e7d-83cd-cbe82b37ec58.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_7e-7-3ep_0alp_0lam/e4085c6a-bc16-4328-a724-4b9838b55faa.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_7e-7_1ep_0alp_0lam/b929b955-1fbb-43d0-add1-4d58fdc4097c.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_7e-7_2ep_0alp_0lam/df723a0f-9a32-42f3-9421-780159f7d821.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep/c1046d2c-0b5b-4ab7-b173-8d5b5ecbc07d.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-3ep/60c02070-7554-4764-8a02-841ca75a0d5c.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-5ep-MDPO_5e-7_3ep_0alp_0lam/d243f226-149b-4824-837e-e80ab68bae9d.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-5ep-MDPO_5e-7_3ep_0alp_0lam_1ep/4f9361d0-2ad9-44da-a1d9-876d43451ae6.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-5ep-MDPO_5e-7_3ep_0alp_0lam_2ep/6c6e9ebc-f83d-48d5-b69f-be43d4167a0e.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-5ep-MDPO_7e-7_3ep_0alp_0lam/7cd2c0da-15b8-4ad6-8cad-feb68631c079.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-5ep-MDPO_7e-7_3ep_0alp_0lam_1ep/36b84cf2-d221-4e9a-b728-37dc2bf7e1d6.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-5ep-MDPO_7e-7_3ep_0alp_0lam_2ep/1fd0d1db-1d75-4b10-bae8-33023c2c7466.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-5ep/c6c02512-6c91-4818-a084-c48915fd83de.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5/326affa2-9ea4-4fc9-b60f-d2abeb7493c3.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-5e-5-2ep/b3a190d1-5b86-4439-a21e-1f118239db82.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-5e-5-3ep/b37a7db5-b26f-4a82-b27c-6c3a2ba72fda.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-5e-5-5ep/05a59445-b816-4982-9b1a-1c2394ffbaa9.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-5e-5/ff952579-e92d-4af8-9497-f49fed5efba0.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-7e-5-2ep/b541ede0-6de9-4557-8280-43567fd3dd96.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-7e-5-3ep/8514f601-0bb2-4639-90cc-29e96088e7de.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-7e-5-5ep/57e6d0cf-943a-4b83-a1f4-4f03b5066523.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-7e-5/ec205127-21c0-4edf-bb3a-ec8ccac4fcdb.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-DPO-1epoch_v1/14b260e6-4300-43ec-b7af-587a2f5b03fb.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-MDPO-1epoch_v1/53de1fc9-7097-4103-b731-588a7bf39f80.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT/1a1031c5-3ec2-4d12-93eb-e0a3b0448ed4.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen_0.5-DPOP_1e-6-3ep_0alp_5lam/51b62d59-f39c-49ca-af0a-73df6440e29d.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen_0.5-DPOP_1e-7-3ep_0alp_5lam/622a0ae1-0eb5-49f0-bc44-d396c7233e27.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen_0.5-DPOP_3e-6-1ep_0alp_5lam/71291a41-283e-42ca-b192-7b759e3c3712.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen_0.5-DPOP_3e-6-2ep_0alp_5lam/7e504fef-b304-4c1a-856d-06e56a8869d7.json
 delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen_0.5-DPOP_3e-6-3ep_0alp_5lam/f8258f5e-8826-4fe1-b9d3-61708e79d4ab.json
 delete
mode 100644 data/hfopenllm_v2/JayHyeon/Qwen_0.5-DPOP_3e-7-1ep_0alp_5lam/099ce031-1e11-4a07-bac1-03bef9b915d6.json delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen_0.5-DPOP_3e-7-2ep_0alp_5lam/75ff25fd-e5f7-4380-b192-cbc8a8ee95aa.json delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen_0.5-DPOP_3e-7-3ep_0alp_5lam/cbc43c7a-d8ac-4b03-a383-703f7fa51757.json delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen_0.5-DPOP_5e-7-1ep_0alp_5lam/72d7f252-1bff-40ad-9ec8-1ac2a2e02a8e.json delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen_0.5-DPOP_5e-7-2ep_0alp_5lam/5eb10878-11e6-43ad-9bb5-658a3495129c.json delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen_0.5-DPOP_5e-7-3ep_0alp_5lam/23b29cd4-cfd0-49f1-8959-c3aa8be9722f.json delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen_0.5-DPO_1e-6-3ep_0alp_0lam/03db2532-f8e0-41e9-ac0c-ff2913f4b12a.json delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen_0.5-DPO_1e-7-3ep_0alp_0lam/273f0d50-aa4e-4469-8360-2ce0a2e1a850.json delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen_0.5-DPO_3e-6-1ep_0alp_0lam/79a48e79-d59b-4f86-a8f4-3af174a9ee0b.json delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen_0.5-DPO_3e-6-2ep_0alp_0lam/9da9a0e6-257a-41f6-b3a3-e3279a4924db.json delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen_0.5-DPO_3e-6-3ep_0alp_0lam/dfed058c-48b2-4e1e-9a29-624771e3e9dd.json delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen_0.5-DPO_3e-7-1ep_0alp_0lam/bcb53a8a-1670-400c-aab6-bd8ed2ebcdf4.json delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen_0.5-DPO_3e-7-2ep_0alp_0lam/8438a108-0d5d-48b6-b73a-981d13329daa.json delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen_0.5-DPO_3e-7-3ep_0alp_0lam/88616292-1e38-4481-af30-6b60e28fb097.json delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen_0.5-DPO_5e-7-1ep_0alp_0lam/44094907-0b09-4706-a117-116a7e10a6e5.json delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen_0.5-DPO_5e-7-2ep_0alp_0lam/d19e8078-87e9-4760-9b91-6b5f478820e1.json delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen_0.5-DPO_5e-7-3ep_0alp_0lam/896464f1-01bc-4370-8d90-3368323b2908.json delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen_0.5-IPO_5e-7-1ep_0alp_0lam/9889f0b9-9051-485c-bd44-32b1e56b865c.json delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen_0.5-IPO_5e-7-3ep_0alp_0lam/6563ce79-6df4-4c78-89e2-064f1250d898.json delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen_0.5-IRPO_1e-6-3ep_1alp_0lam/b1778755-e6e6-47e2-925d-44d786c4ff62.json delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen_0.5-IRPO_1e-7-3ep_1alp_0lam/3ae923b8-e9f4-472e-8d5e-54fa5f42ce01.json delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen_0.5-IRPO_3e-6-1ep_1alp_0lam/40831e23-0a9e-4bdc-a365-9399b6b82ff9.json delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen_0.5-IRPO_3e-6-2ep_1alp_0lam/4a60fa82-34dc-4b0c-9102-65adac5039e4.json delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen_0.5-IRPO_3e-6-3ep_1alp_0lam/75ff2c43-dd19-48ae-9ba3-f99cdbadda1c.json delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen_0.5-IRPO_3e-7-1ep_1alp_0lam/d7962833-660a-4b9b-9836-8a2f3251f38e.json delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen_0.5-IRPO_3e-7-3ep_1alp_0lam/ad8ecabf-a868-496e-892b-582efb54fa6a.json delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen_0.5-IRPO_5e-7-1ep_1alp_0lam/49f25d3d-80c9-4723-8fa9-1501d44d70aa.json delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen_0.5-IRPO_5e-7-2ep_1alp_0lam/70ea520c-3e0c-4412-9dbe-40a00801335c.json delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen_0.5-IRPO_5e-7-3ep_1alp_0lam/8e7f8bad-812b-4f6c-8dea-1cf44584c300.json delete mode 100644 
data/hfopenllm_v2/JayHyeon/Qwen_0.5-MDPO_0.1_3e-6-3ep_0alp_0lam/3b39a8f0-c5ba-4f74-9d27-bf5b389e038c.json delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen_0.5-MDPO_0.1_5e-7-3ep_0alp_0lam/702a14d5-a7fd-4926-ab26-e4c3b7f5eda7.json delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen_0.5-MDPO_0.3_3e-6-3ep_0alp_0lam/20e5d087-7b20-4a39-81da-7334354b61f0.json delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen_0.5-MDPO_0.3_5e-7-3ep_0alp_0lam/4c5a769c-0472-402c-8e97-d24e5b302bac.json delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen_0.5-MDPO_0.5_1e-5-3ep_0alp_0lam/96166735-ed03-4931-81c9-d3daed1913d9.json delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen_0.5-MDPO_0.5_3e-7-1ep_0alp_0lam/06d9b1e3-d054-4fa5-bf1f-9d6149e5111c.json delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen_0.5-MDPO_0.5_3e-7-2ep_0alp_0lam/776fd8d8-9846-4359-97d4-2340425d1315.json delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen_0.5-MDPO_0.5_3e-7-3ep_0alp_0lam/197ae1c5-c9b1-4912-91a3-8ccacddc1be6.json delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen_0.5-MDPO_0.5_4e-6-3ep_0alp_0lam/1fffd3d9-1c6b-4965-84e6-980bb0a13af3.json delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen_0.5-MDPO_0.5_6e-6-3ep_0alp_0lam/57e8aaf0-f10b-4024-9f93-7b7f13f3ab10.json delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen_0.5-MDPO_0.5_7e-6-3ep_0alp_0lam/304d5bee-df2d-40fc-b4a0-e3d99178f4bd.json delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen_0.5-MDPO_0.5_7e-7-3ep_0alp_0lam/6126d30d-e2dd-4b8b-9cb3-acdc76084bbb.json delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen_0.5-MDPO_0.7_3e-6-3ep_0alp_0lam/fc7284d9-a73f-4562-a781-5cb87247183f.json delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen_0.5-MDPO_0.7_5e-7-3ep_0alp_0lam/26ab447c-a850-4197-983a-a0dca4532029.json delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen_0.5-MDPO_0.9_5e-7-3ep_0alp_0lam/ee9e2131-aa99-49e1-9814-f0664614354b.json delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen_0.5-VDPO_3e-6-1ep_3vpo_const/23c472f7-f060-4a69-8f72-12490675825a.json delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen_0.5-VDPO_5e-7-1ep_0alp_0lam/04172bef-c06b-4c08-b2af-9e1fe4d97664.json delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen_0.5-VDPO_5e-7-1ep_10vpo_const/3436355a-d2fe-411f-a764-4cb8284deb4c.json delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen_0.5-VDPO_5e-7-1ep_1vpo_const/265655c0-2ead-4dd7-8c7e-4bee69d51bce.json delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen_0.5-VDPO_5e-7-1ep_3vpo_const/645cae82-9e7b-4d1b-b944-e3783089c1c1.json delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen_0.5-VDPO_5e-7-3ep_0alp_0lam/ab658117-7c6b-428f-8f60-bf88a1d8a5bc.json delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen_0.5-VDPO_5e-7-3ep_1vpo_const/03c4b5ce-3b22-4d9f-bf60-b626b52a114b.json delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen_0.5-VDPO_5e-7-3ep_3vpo_const/ce7e3a31-c65b-4521-b685-fcbd067c75d9.json delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen_0.5-VIPO_5e-7-1ep_0alp_0lam/adb53e2c-5dee-4840-8eae-e0186c6e103f.json delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen_0.5-VIPO_5e-7-1ep_10vpo_const/ba89563d-f53a-4bf0-91e1-92ac950523d8.json delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen_0.5-VIPO_5e-7-1ep_1vpo_const/3fc0ad8d-4bb2-401a-9baf-b94b39b7e1aa.json delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen_0.5-VIPO_5e-7-1ep_30vpo_const/ed816bcb-bbe9-48ae-a6ac-3603779a985f.json delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen_0.5-VIPO_5e-7-1ep_3vpo_const/f347ed24-066a-4cba-8478-f03628cb2b5b.json delete mode 100644 
data/hfopenllm_v2/JayHyeon/Qwen_0.5-VIPO_5e-7-3ep_0alp_0lam/ffddfea0-d17e-44e7-8931-a9601e9cb26b.json delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen_0.5-VIPO_5e-7-3ep_10vpo_const/ec351fa1-78c2-48c6-83f0-7c2a9b2f0731.json delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen_0.5-VIPO_5e-7-3ep_1vpo_const/a0038c34-130b-49dc-a93f-94706a3dad50.json delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen_0.5-VIPO_5e-7-3ep_30vpo_const/cbd5ea42-1e5b-4984-bdcf-e60fbfb9d692.json delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen_0.5-VIPO_5e-7-3ep_3vpo_const/b902e2b2-a0b3-4467-b076-b98717c40d74.json delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen_0.5-cDPO_5e-7-3ep_0vpo_const_0.1/4c749665-59ff-49df-a193-0262f66e6003.json delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen_0.5-cDPO_5e-7-3ep_0vpo_const_0.3/c99899c6-95e1-4dea-ac12-f8df49728a3b.json delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen_0.5-rDPO_3e-6-1ep_0vpo_const_0.1/13deca9f-073e-444b-bf79-35e816f7c312.json delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen_0.5-rDPO_5e-7-3ep_0vpo_const_0.1/c8adc0a5-f4bf-4f88-984c-aba506eae6a9.json delete mode 100644 data/hfopenllm_v2/JayHyeon/Qwen_0.5-rDPO_5e-7-3ep_0vpo_const_0.3/b146daaf-ce1f-4520-bc19-21ce8679b220.json delete mode 100644 data/hfopenllm_v2/Jimmy19991222/Llama-3-Instruct-8B-SimPO-v0.2/45e1d037-1ed0-472c-a311-c651fde270fc.json delete mode 100644 data/hfopenllm_v2/Jimmy19991222/llama-3-8b-instruct-gapo-v2-bert-f1-beta10-gamma0.3-lr1.0e-6-1minus-rerun/3f4ce54a-01f3-4c23-a4ba-22d47e0344dc.json delete mode 100644 data/hfopenllm_v2/Jimmy19991222/llama-3-8b-instruct-gapo-v2-bert_f1-beta10-gamma0.3-lr1.0e-6-scale-log/470d52be-9dbd-4714-b004-f65cc82d245f.json delete mode 100644 data/hfopenllm_v2/Jimmy19991222/llama-3-8b-instruct-gapo-v2-bert_p-beta10-gamma0.3-lr1.0e-6-scale-log/c836fd05-1969-439c-91e1-fd0cab816f6c.json delete mode 100644 data/hfopenllm_v2/Jimmy19991222/llama-3-8b-instruct-gapo-v2-bleu-beta0.1-no-length-scale-gamma0.4/14774c6b-eb03-4abc-92df-1e7a196ca8a4.json delete mode 100644 data/hfopenllm_v2/Jimmy19991222/llama-3-8b-instruct-gapo-v2-rouge2-beta10-1minus-gamma0.3-rerun/5293ae0c-8022-44d4-b2f5-4f5390dff93e.json delete mode 100644 data/hfopenllm_v2/Jimmy19991222/llama-3-8b-instruct-gapo-v2-rouge2-beta10-gamma0.3-lr1.0e-6-scale-log/9020f91f-a8f0-447d-af68-247aa81a25c6.json delete mode 100644 data/hfopenllm_v2/Jimmy19991222/llama-3-8b-instruct-gapo-v2-rougeL-beta10-gamma0.3-lr1.0e-6-scale-log/0cd6837a-8c3f-4529-9ea0-8755e1725467.json delete mode 100644 data/hfopenllm_v2/Joseph717171/Hermes-3-Llama-3.1-8B_TIES_with_Base_Embeds_Initialized_to_Special_Instruct_Toks_dtypeF32/7cb17011-cf77-4e86-b67f-84e6ff4b8086.json delete mode 100644 data/hfopenllm_v2/Joseph717171/Llama-3.1-SuperNova-8B-Lite_TIES_with_Base/086831f9-c677-428b-a997-4da58733633c.json delete mode 100644 data/hfopenllm_v2/Josephgflowers/Cinder-Phi-2-V1-F16-gguf/d71893b8-b82c-490b-a700-b579d64e0610.json delete mode 100644 data/hfopenllm_v2/Josephgflowers/Differential-Attention-Liquid-Metal-Tinyllama/9893689f-c27d-4148-a27f-cd07b07e98b7.json delete mode 100644 data/hfopenllm_v2/Josephgflowers/TinyLlama-Cinder-Agent-v1/90f2df23-a9ec-44be-ade5-89b59cb7368a.json delete mode 100644 data/hfopenllm_v2/Josephgflowers/TinyLlama-v1.1-Cinders-World/afd545da-390a-478a-b0f5-ea819f088f27.json delete mode 100644 data/hfopenllm_v2/Josephgflowers/TinyLlama_v1.1_math_code-world-test-1/ce776f68-856f-4aee-b7e4-e55d15e8d714.json delete mode 100644 
data/hfopenllm_v2/Josephgflowers/Tinyllama-STEM-Cinder-Agent-v1/9b015729-524c-44f3-9c2c-c42981d7a61e.json delete mode 100644 data/hfopenllm_v2/Josephgflowers/Tinyllama-r1/56a54ffc-4692-496c-95df-8e4ad19d4d95.json delete mode 100644 data/hfopenllm_v2/JungZoona/T3Q-Qwen2.5-14B-Instruct-1M-e3/4b105969-2ce5-4c62-89ef-efd392c2ca89.json delete mode 100644 data/hfopenllm_v2/JungZoona/T3Q-qwen2.5-14b-v1.0-e3/31af79b1-48c1-4399-9d16-8582c92996ee.json delete mode 100644 data/hfopenllm_v2/Junhoee/Qwen-Megumin/59a67f29-cb7d-497c-b7bb-1764a665ae33.json delete mode 100644 data/hfopenllm_v2/KSU-HW-SEC/Llama3-70b-SVA-FT-1415/fe57367c-74b7-483e-af54-4f404cbea75b.json delete mode 100644 data/hfopenllm_v2/KSU-HW-SEC/Llama3-70b-SVA-FT-500/fda2277b-1513-416e-b586-ed05920a0bb4.json delete mode 100644 data/hfopenllm_v2/KSU-HW-SEC/Llama3-70b-SVA-FT-final/b3dde216-f80a-4664-aadc-b5f5dd3e5895.json delete mode 100644 data/hfopenllm_v2/KSU-HW-SEC/Llama3.1-70b-SVA-FT-1000step/07ed6241-fd1a-46eb-91fd-92a4a8f6bd15.json delete mode 100644 data/hfopenllm_v2/Khetterman/DarkAtom-12B-v3/ba76c356-cd6a-4636-8ab1-18bb9df69881.json delete mode 100644 data/hfopenllm_v2/Khetterman/Kosmos-8B-v1/c6ae54a1-2821-48d1-b689-bbb85aaa70a6.json delete mode 100644 data/hfopenllm_v2/Kimargin/GPT-NEO-1.3B-wiki/6f296f0e-80ca-49b7-94e7-cb45b795c715.json delete mode 100644 data/hfopenllm_v2/KingNish/Qwen2.5-0.5b-Test-ft/b5509e11-820a-4ad4-8c6a-0294762502a8.json delete mode 100644 data/hfopenllm_v2/KingNish/Reasoning-0.5b/90d73665-8d83-4e74-ab7d-29b1d3b6181b.json delete mode 100644 data/hfopenllm_v2/KingNish/Reasoning-Llama-3b-v0.1/72387647-cbac-4b72-9c22-db7029a39457.json delete mode 100644 data/hfopenllm_v2/KingNish/qwen-1b-continued-v2.1/6219ec01-4b6a-4acd-aee1-96c3e8e48643.json delete mode 100644 data/hfopenllm_v2/KingNish/qwen-1b-continued-v2.2/5c323d7c-25cd-4718-8a1f-54d986cadaf2.json delete mode 100644 data/hfopenllm_v2/KingNish/qwen-1b-continued-v2/adfab21a-941b-4efc-8b63-fdfb3074ba9b.json delete mode 100644 data/hfopenllm_v2/KingNish/qwen-1b-continued/350d00a4-7501-4130-a069-323530bc9729.json delete mode 100644 data/hfopenllm_v2/Kquant03/CognitiveFusion2-4x7B-BF16/ea809d28-178e-4a0b-ab5a-34739077c5ff.json delete mode 100644 data/hfopenllm_v2/Kquant03/L3-Pneuma-8B/243d5ccd-58f3-4da5-8718-553f3f456490.json delete mode 100644 data/hfopenllm_v2/Krystalan/DRT-o1-14B/a45537a7-76a6-4855-b83b-abe965f13460.json delete mode 100644 data/hfopenllm_v2/Krystalan/DRT-o1-7B/9be911b6-b9f4-47b1-849d-62eb20c9e944.json delete mode 100644 data/hfopenllm_v2/Kukedlc/NeuralExperiment-7b-MagicCoder-v7.5/33d7d5f0-cbee-4a26-b5e8-48bdd12492cf.json delete mode 100644 data/hfopenllm_v2/Kukedlc/NeuralLLaMa-3-8b-DT-v0.1/4355fbdd-ac72-4f26-8e07-b7e8d774d238.json delete mode 100644 data/hfopenllm_v2/Kukedlc/NeuralLLaMa-3-8b-ORPO-v0.3/4bffc633-e20c-4874-b7db-d1b7dabb8070.json delete mode 100644 data/hfopenllm_v2/Kukedlc/NeuralSynthesis-7B-v0.1/2d5c844d-d950-4254-bac2-0a986659c541.json delete mode 100644 data/hfopenllm_v2/Kukedlc/NeuralSynthesis-7B-v0.3/f6e74b3c-9ee4-40c3-bf92-35d965503a04.json delete mode 100644 data/hfopenllm_v2/Kukedlc/NeuralSynthesis-7b-v0.4-slerp/8f1d2600-7347-48b8-9759-11570598459d.json delete mode 100644 data/hfopenllm_v2/Kukedlc/Qwen-2.5-7b-Spanish-o1-CoT/cd653bfd-2c06-4224-aeeb-bf591995a69e.json delete mode 100644 data/hfopenllm_v2/Kumar955/Hemanth-llm/cdf1fcc7-429d-44bd-b76c-d26ee743f6fe.json delete mode 100644 data/hfopenllm_v2/L-RAGE/3_PRYMMAL-ECE-7B-SLERP-V1/4828bd36-5453-4383-8985-08d04a7ebecd.json delete mode 100644 
data/hfopenllm_v2/LEESM/llama-2-7b-hf-lora-oki100p/4c2baa59-c2f1-4779-9d21-1f69c0821968.json delete mode 100644 data/hfopenllm_v2/LEESM/llama-2-7b-hf-lora-oki10p/555c1079-c4d0-4b9e-9d2d-769e7ba32429.json delete mode 100644 data/hfopenllm_v2/LEESM/llama-3-8b-bnb-4b-kowiki231101/58a4a1c6-0ee4-4524-9ca1-b40870f1d600.json delete mode 100644 data/hfopenllm_v2/LEESM/llama-3-Korean-Bllossom-8B-trexlab-oki10p/eea2a38a-4f1b-48d0-894c-09974894f264.json delete mode 100644 data/hfopenllm_v2/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct/3d8063ab-0ad5-43e4-83ff-90b46dee766f.json delete mode 100644 data/hfopenllm_v2/LGAI-EXAONE/EXAONE-3.5-2.4B-Instruct/da5e0284-7c44-42d4-a110-a23880de277f.json delete mode 100644 data/hfopenllm_v2/LGAI-EXAONE/EXAONE-3.5-32B-Instruct/bef017bb-47b1-48e4-93c4-3b222a16af7a.json delete mode 100644 data/hfopenllm_v2/LGAI-EXAONE/EXAONE-3.5-7.8B-Instruct/401c83b0-b7d2-4987-9e46-f127fdbb595f.json delete mode 100644 data/hfopenllm_v2/LLM360/K2-Chat/c6fde59b-73ed-4179-a907-076be068b262.json delete mode 100644 data/hfopenllm_v2/LLM360/K2/90997fea-6c67-493e-bd8e-5327cfb33ea4.json delete mode 100644 data/hfopenllm_v2/LLM4Binary/llm4decompile-1.3b-v2/08957d63-7462-44ff-9dd8-060a5801a31b.json delete mode 100644 data/hfopenllm_v2/Lambent/qwen2.5-reinstruct-alternate-lumen-14B/a434f569-e7d6-4464-afa8-6104be43fa06.json delete mode 100644 data/hfopenllm_v2/Langboat/Mengzi3-8B-Chat/e32ed251-e817-409f-b4c3-8f168f1ff822.json delete mode 100644 data/hfopenllm_v2/Lawnakk/BBA100/1d9a65a3-d2bb-48a7-8a00-8e4a79c36db2.json delete mode 100644 data/hfopenllm_v2/Lawnakk/BBALAW1.0/608398da-ae2a-4be2-aaf9-6ec8899aa63d.json delete mode 100644 data/hfopenllm_v2/Lawnakk/BBALAW1.2/80e04641-be7d-4351-a4f6-1318981ef834.json delete mode 100644 data/hfopenllm_v2/Lawnakk/BBALAW1.3/e74222c6-636c-4075-8d4d-30c73fa70fda.json delete mode 100644 data/hfopenllm_v2/Lawnakk/BBALAW1.6/aed80361-9304-44a0-934a-52976d7f1bf3.json delete mode 100644 data/hfopenllm_v2/Lawnakk/BBALAW1.61/709bd280-b03e-4908-808f-34566bc968f4.json delete mode 100644 data/hfopenllm_v2/Lawnakk/BBALAW1.62/66c495b3-4b09-42ad-b742-4d753c3bde7a.json delete mode 100644 data/hfopenllm_v2/Lawnakk/BBALAW1.63/e24f7be6-3051-4990-8b93-121aec5402eb.json delete mode 100644 data/hfopenllm_v2/Lawnakk/BBALAW1.64/0321571b-4246-4490-bd6c-7b106eb8e15a.json delete mode 100644 data/hfopenllm_v2/Lawnakk/BBALAW1/54dbf947-ab18-40dd-9cd7-a496289b2e72.json delete mode 100644 data/hfopenllm_v2/LenguajeNaturalAI/leniachat-gemma-2b-v0/d841e204-ed6a-439d-8408-d5cfb3b38dae.json delete mode 100644 data/hfopenllm_v2/LenguajeNaturalAI/leniachat-qwen2-1.5B-v0/96b57891-83e3-4948-ad48-64a2a370e166.json delete mode 100644 data/hfopenllm_v2/LeroyDyer/CheckPoint_A/30301818-6dad-45f9-acfb-a68ccc7c0609.json delete mode 100644 data/hfopenllm_v2/LeroyDyer/CheckPoint_B/50743107-30de-4c5d-bf83-cc003af8a5db.json delete mode 100644 data/hfopenllm_v2/LeroyDyer/CheckPoint_C/625ee1b3-e0a1-4a86-83a4-6e66b380f864.json delete mode 100644 data/hfopenllm_v2/LeroyDyer/CheckPoint_R1/89fda762-1989-4850-837c-f79ef538c58c.json delete mode 100644 data/hfopenllm_v2/LeroyDyer/LCARS_AI_001/1de1f906-0e36-4f79-b159-16ef8ee33ab3.json delete mode 100644 data/hfopenllm_v2/LeroyDyer/LCARS_AI_1x4_003_SuperAI/d8588222-9e4b-47c1-9f86-92f47c9c8e38.json delete mode 100644 data/hfopenllm_v2/LeroyDyer/LCARS_AI_StarTrek_Computer/15e6e6e6-39fa-424f-ba12-5f209cd4b2cc.json delete mode 100644 data/hfopenllm_v2/LeroyDyer/LCARS_TOP_SCORE/81225b85-1523-49c1-b770-897112d2e6ae.json delete mode 100644 
data/hfopenllm_v2/LeroyDyer/Mixtral_AI_SwahiliTron_7b/254deaf7-a253-4d41-a10d-1143f86b288c.json delete mode 100644 data/hfopenllm_v2/LeroyDyer/SpydazWebAI_Human_AGI/ba0b66f5-724a-4a6b-ac20-a36d530a8b4b.json delete mode 100644 data/hfopenllm_v2/LeroyDyer/SpydazWebAI_Human_AGI_001/eed0b3b4-e277-49ee-aed5-f3599b2d5653.json delete mode 100644 data/hfopenllm_v2/LeroyDyer/SpydazWeb_AI_CyberTron_Ultra_7b/96a21b6e-ed47-40fb-85cd-15924330e60d.json delete mode 100644 data/hfopenllm_v2/LeroyDyer/SpydazWeb_AI_HumanAGI_001_M2/f41f5471-6384-4510-85d2-41f236082583.json delete mode 100644 data/hfopenllm_v2/LeroyDyer/SpydazWeb_AI_HumanAGI_002/2728eccc-525f-4350-901b-dbc352c78014.json delete mode 100644 data/hfopenllm_v2/LeroyDyer/SpydazWeb_AI_HumanAI_001/3e7ae935-46c3-427c-8713-41c659c1828a.json delete mode 100644 data/hfopenllm_v2/LeroyDyer/SpydazWeb_AI_HumanAI_006/66782676-c942-4aff-b754-b96cd96cf1f9.json delete mode 100644 data/hfopenllm_v2/LeroyDyer/SpydazWeb_AI_HumanAI_007/941a9e27-2ac4-4dab-a6d0-cb9319c79a27.json delete mode 100644 data/hfopenllm_v2/LeroyDyer/SpydazWeb_AI_HumanAI_009_CHAT/caf93f75-530e-4f4d-9cc0-2cf9b0a7f2ff.json delete mode 100644 data/hfopenllm_v2/LeroyDyer/SpydazWeb_AI_HumanAI_010_CHAT/d3ca0458-ee97-4a4c-a6a9-066880ffefb5.json delete mode 100644 data/hfopenllm_v2/LeroyDyer/SpydazWeb_AI_HumanAI_011_INSTRUCT/615bf89b-9357-46f4-82ed-f49b0021da01.json delete mode 100644 data/hfopenllm_v2/LeroyDyer/SpydazWeb_AI_HumanAI_011_INSTRUCT_ML/06398630-23ad-4000-8ea2-fcca230568d7.json delete mode 100644 data/hfopenllm_v2/LeroyDyer/SpydazWeb_AI_HumanAI_011_INSTRUCT_ML_r1/bdfa30f8-da0f-418f-adaf-caafda4c81a5.json delete mode 100644 data/hfopenllm_v2/LeroyDyer/SpydazWeb_AI_HumanAI_012_INSTRUCT_IA/bd5e550c-5355-4e01-bafc-2ca89899253a.json delete mode 100644 data/hfopenllm_v2/LeroyDyer/SpydazWeb_AI_HumanAI_012_INSTRUCT_IA/f842ad5b-24f0-419b-9d65-5a6ff1f5e04b.json delete mode 100644 data/hfopenllm_v2/LeroyDyer/SpydazWeb_AI_HumanAI_012_INSTRUCT_MX/3a09590f-28f3-4161-8a93-d42cec62aa90.json delete mode 100644 data/hfopenllm_v2/LeroyDyer/SpydazWeb_AI_HumanAI_012_INSTRUCT_XA/0f6b76ca-c4b8-40b2-a3af-2ea1c3650933.json delete mode 100644 data/hfopenllm_v2/LeroyDyer/SpydazWeb_AI_HumanAI_012_INSTRUCT_XA/f276ad54-4e3b-4718-ae1f-0479565e4565.json delete mode 100644 data/hfopenllm_v2/LeroyDyer/SpydazWeb_AI_HumanAI_RP/dec20396-6555-4773-bf02-2cd1fcedda89.json delete mode 100644 data/hfopenllm_v2/LeroyDyer/SpydazWeb_AI_HumanAI_TextVision/eebc33e1-0016-4adf-815a-72653a34c01b.json delete mode 100644 data/hfopenllm_v2/LeroyDyer/SpydazWeb_HumanAI_M1/803c3898-c1a6-4832-ac3a-a86139489810.json delete mode 100644 data/hfopenllm_v2/LeroyDyer/SpydazWeb_HumanAI_M2/bfaa3d3e-66fd-4477-85af-4b83f13ff05b.json delete mode 100644 data/hfopenllm_v2/LeroyDyer/SpydazWeb_HumanAI_M3/99debdd2-1dea-4eb6-be5c-c144656cfe20.json delete mode 100644 data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_12/ad67bb88-7f74-4eb4-b771-0b3b60be4416.json delete mode 100644 data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_14/af2f579d-1e8a-47d8-8e44-a599bee83e37.json delete mode 100644 data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_AGI_R1_001/763c840e-ea73-453e-8e54-5f4fd6fda9cd.json delete mode 100644 data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_AGI_R1_002/4fb40ac4-a637-4b9a-b69d-ba551c0f0938.json delete mode 100644 data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_AGI_R1_MUSR/ffc4ef41-4a28-4816-be54-8ffd8e153073.json delete mode 100644 data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_AGI_R1_MasterCoder/f75fe902-f1c7-4e6c-87d6-128688db8d94.json delete mode 100644 
data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_AGI_R1_Math_001/dbd3098b-4532-441b-a81c-072c52579be6.json delete mode 100644 data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_AGI_R1_Math_003/438e4aa3-5e02-446e-bd3a-07ef724d24ff.json delete mode 100644 data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_AGI_R1_Math_AdvancedStudent/027fdc55-61eb-416c-b6ad-4408912d151b.json delete mode 100644 data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_AGI_R1_Math_Student/37a4895d-def5-494d-9b62-d8c97ba9350b.json delete mode 100644 data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_AGI_R1_Math_Teacher/0d53c27e-962c-428f-b540-35ab027883a8.json delete mode 100644 data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_AGI_R1_OmG_001/6f7b2d91-24d6-442c-93a5-9afc88e9a308.json delete mode 100644 data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_AGI_R1_OmG_002/21793520-7d1a-4040-bb96-fa7fe98ae580.json delete mode 100644 data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_AGI_R1_OmG_Coder/59d53c40-5b16-4a70-a693-5fb554cf7614.json delete mode 100644 data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_AGI_R1_OmG_Math/b28a569c-6bdf-4547-a2ce-c3e224764be3.json delete mode 100644 data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_AGI_R1_OmG_MathMaster/2de129c8-2259-4367-a619-85d9e8f61e06.json delete mode 100644 data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_AGI_R1_Student_Coder/c242030f-fb2b-42dc-a5d1-687273b17282.json delete mode 100644 data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_AGI_R1_Teacher_Coder/3b3fdb16-b6e1-40c8-9ac0-02f1f2207eb7.json delete mode 100644 data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_AGI_R1_Top_Student/ef6e8e0d-7ba4-45ea-aaf7-617f68f2e97c.json delete mode 100644 data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_AGI_R1_X1/f8c131a4-1fee-4694-8753-88853418ef4b.json delete mode 100644 data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_AGI_R1_X2/27dec9ff-fb18-43dd-949f-7c0587a5858f.json delete mode 100644 data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_AGI_RP_R1/060df34d-ab67-43e1-bd56-ebaceb77abd3.json delete mode 100644 data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_BIBLE_002/a6357673-3daa-4593-8593-2b65a7d5477e.json delete mode 100644 data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_ChatML_002/121d4877-1955-48db-a23a-6b0ad0623b9e.json delete mode 100644 data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_ChatQA/1f1eab02-219e-4ad8-af50-e103541e1c9d.json delete mode 100644 data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_ChatQA_003/b4cccfb3-1c17-48a3-a211-a26c44de757f.json delete mode 100644 data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_TEMP_/05e97a86-681d-42a2-8a47-beade25d8fc9.json delete mode 100644 data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_Top_Teacher_/6c0899b4-f066-45f6-827d-11c535ef0634.json delete mode 100644 data/hfopenllm_v2/LightningRodLabs/Flashlight-v1.0/f9660557-b9f6-4ecc-b260-c245f0e62b5b.json delete mode 100644 data/hfopenllm_v2/LightningRodLabs/Flashlight-v1.1/89168032-5840-4c2c-821e-b3d717ade46f.json delete mode 100644 data/hfopenllm_v2/LightningRodLabs/Flashlight-v1.2/10d0aa63-67d9-4dba-9bdc-db7ab3b4547d.json delete mode 100644 data/hfopenllm_v2/Lil-R/2_PRYMMAL-ECE-2B-SLERP-V1/6f66ae5b-8cb6-4263-98a4-4a1eddfaca10.json delete mode 100644 data/hfopenllm_v2/Lil-R/2_PRYMMAL-ECE-2B-SLERP-V2/5e715199-7030-47b4-89c6-83ba0968c07c.json delete mode 100644 data/hfopenllm_v2/Lil-R/2_PRYMMAL-ECE-7B-SLERP-V1/3fca39e8-443d-47da-a858-83a68c18eec9.json delete mode 100644 data/hfopenllm_v2/Lil-R/2_PRYMMAL-ECE-7B-SLERP-V2/b7518bd2-d3af-49e6-823a-f8d507e8e60f.json delete mode 100644 data/hfopenllm_v2/Lil-R/2_PRYMMAL-ECE-7B-SLERP-V3/fa399f16-1652-430c-be19-afaf5ab96be1.json delete mode 100644 
data/hfopenllm_v2/Lil-R/2_PRYMMAL-ECE-7B-SLERP/cbe5032b-122c-4a0b-a099-50e998a4bc77.json delete mode 100644 data/hfopenllm_v2/Lil-R/PRYMMAL-ECE-1B-SLERP-V1/fd8c3209-dcc0-4d27-a3aa-d0f76ef86f8d.json delete mode 100644 data/hfopenllm_v2/Lil-R/PRYMMAL-ECE-7B-SLERP-V8/1a18d49c-ad7b-4823-abbc-7191e9d659cd.json delete mode 100644 data/hfopenllm_v2/LilRg/10PRYMMAL-3B-slerp/9e2c614e-1104-43a6-9e8f-b7851562e01a.json delete mode 100644 data/hfopenllm_v2/LilRg/ECE-1B-merge-PRYMMAL/7d4b83ab-9c9d-46e5-8cbf-b8afcf781230.json delete mode 100644 data/hfopenllm_v2/LilRg/ECE_Finetunning/a42b5d7e-be7f-4cde-aaf0-001e2cf05a44.json delete mode 100644 data/hfopenllm_v2/LilRg/PRYMMAL-6B-slerp/21f6688c-be52-4352-9c95-d37c0a5f6c94.json delete mode 100644 data/hfopenllm_v2/LilRg/PRYMMAL-ECE-7B-SLERP-V3/e92ba586-7bee-4a9b-b388-e35efde3d36f.json delete mode 100644 data/hfopenllm_v2/LilRg/PRYMMAL-ECE-7B-SLERP-V4/45ed0bb3-efbf-4a32-9735-d814aa08790a.json delete mode 100644 data/hfopenllm_v2/LilRg/PRYMMAL-ECE-7B-SLERP-V5/eff28375-89a7-4970-9342-428b07d0c6f4.json delete mode 100644 data/hfopenllm_v2/LilRg/PRYMMAL-ECE-7B-SLERP-V6/23877e30-b8fb-45ea-a803-47df757ea909.json delete mode 100644 data/hfopenllm_v2/LilRg/PRYMMAL-ECE-7B-SLERP-V7/8bc25d04-9cc5-4551-a9c5-ce185c7ad974.json delete mode 100644 data/hfopenllm_v2/LilRg/PRYMMAL-slerp-Merge/d2d4b5a5-109d-4d26-a166-3d97b341584e.json delete mode 100644 data/hfopenllm_v2/LimYeri/CodeMind-Llama3-8B-unsloth_v2-merged/ac404d92-7a06-4758-ab1d-fcf840c2b995.json delete mode 100644 data/hfopenllm_v2/LimYeri/CodeMind-Llama3-8B-unsloth_v3-merged/95ea7fbf-d3f2-4fc1-ba17-05549f6e4d25.json delete mode 100644 data/hfopenllm_v2/LimYeri/CodeMind-Llama3-8B-unsloth_v4-one-DPO-merged/c101e272-24d2-44db-9b0f-2ed4d17cec41.json delete mode 100644 data/hfopenllm_v2/LimYeri/CodeMind-Llama3-8B-unsloth_v4-one-merged/2cb789c7-dddf-42b2-8fdf-4cbd5132946c.json delete mode 100644 data/hfopenllm_v2/LimYeri/CodeMind-Llama3.1-8B-unsloth-merged/a414aefd-ce24-49a9-b431-0c6014ebfbd8.json delete mode 100644 data/hfopenllm_v2/Locutusque/CollectiveLM-Falcon-3-7B/91fcb6a3-d351-48c8-87e8-e2a06642e925.json delete mode 100644 data/hfopenllm_v2/Locutusque/Hercules-6.0-Llama-3.1-8B/3cd90efa-ddf0-43c4-884c-84337ded14b2.json delete mode 100644 data/hfopenllm_v2/Locutusque/Hercules-6.1-Llama-3.1-8B/c66c21e9-a332-40f9-ae87-bdd78a25d753.json delete mode 100644 data/hfopenllm_v2/Locutusque/Llama-3-NeuralHercules-5.0-8B/0b4def91-29df-45d9-8dd4-c4097ec47ba3.json delete mode 100644 data/hfopenllm_v2/Locutusque/Llama-3-Yggdrasil-2.0-8B/2cbf258c-369e-4b1c-863f-43cf97c3a7a4.json delete mode 100644 data/hfopenllm_v2/Locutusque/TinyMistral-248M-v2.5/8372889e-f9cd-4cf7-aec0-8e18d5c627e3.json delete mode 100644 data/hfopenllm_v2/Luni/StarDust-12b-v1/ce4cc270-57da-4d08-9130-62508b409cb2.json delete mode 100644 data/hfopenllm_v2/Luni/StarDust-12b-v2/4cfedb8f-0e47-4008-9bc5-fb15e4afa607.json delete mode 100644 data/hfopenllm_v2/Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v3/de3c949d-bab5-4430-bdd1-48e1b7860934.json delete mode 100644 data/hfopenllm_v2/Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v4/011e53cd-409f-479b-9c3d-bfce75a1277b.json delete mode 100644 data/hfopenllm_v2/Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v5/1ff40e45-5be4-4625-9f66-5599a829903d.json delete mode 100644 data/hfopenllm_v2/Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v6-cpt/fed97d94-2949-4383-8f25-fa79bd413508.json delete mode 100644 data/hfopenllm_v2/Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v6/f4820bc8-7dfd-4439-af95-21b6cc9367ac.json delete mode 100644 
data/hfopenllm_v2/Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v7-rebase/36e576bb-de50-49ec-a91f-f134c11bbe38.json delete mode 100644 data/hfopenllm_v2/Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v7/0edd388b-7a1b-4334-9b72-52d84653ff67.json delete mode 100644 data/hfopenllm_v2/Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v8.5/b3199674-328e-41a0-9aa4-bf39aec735bc.json delete mode 100644 data/hfopenllm_v2/Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v8.6/52db4d79-7040-4525-934e-0f33e4acec63.json delete mode 100644 data/hfopenllm_v2/Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v8.7/ee34821e-9182-433f-a8b0-745711e23738.json delete mode 100644 data/hfopenllm_v2/Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v8.8/10ef0990-5356-432f-b24c-dd107188ec5f.json delete mode 100644 data/hfopenllm_v2/Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v8.9/47de680d-33b1-4441-92da-4b97a5fc513f.json delete mode 100644 data/hfopenllm_v2/Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v8/96ac0351-2ade-4d76-bcf9-bc0f633f8694.json delete mode 100644 data/hfopenllm_v2/Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v9-stock/31aae266-c14b-451f-8bab-62ee7d5d382e.json delete mode 100644 data/hfopenllm_v2/Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v9.1/f6edb102-e867-46d1-afdc-3c45166bd510.json delete mode 100644 data/hfopenllm_v2/Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v9.2/8b7756cc-9af3-4f98-84ac-7fef4c1bdaa0.json delete mode 100644 data/hfopenllm_v2/Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v9/dcf33a22-5e57-4476-a2cb-ebd60407a920.json delete mode 100644 data/hfopenllm_v2/Lunzima/NQLSG-Qwen2.5-14B-OriginalFusion/15659480-be0b-41c8-a463-873be444b194.json delete mode 100644 data/hfopenllm_v2/Lyte/Llama-3.1-8B-Instruct-Reasoner-1o1_v0.3/0444c1bf-a3d3-4d23-bc6c-0a98c4dc1e9d.json delete mode 100644 data/hfopenllm_v2/Lyte/Llama-3.2-1B-Instruct-COT-RL-Expriement1-EP04/93aa3a13-5069-410f-a1df-6944e0231e0e.json delete mode 100644 data/hfopenllm_v2/Lyte/Llama-3.2-3B-Overthinker/427ea7d0-c1f1-4cfe-b6a7-555262a7a317.json delete mode 100644 data/hfopenllm_v2/M4-ai/TinyMistral-248M-v3/c6dbe372-7a3c-487c-87c0-fb324c39f8c9.json delete mode 100644 data/hfopenllm_v2/MEscriva/ECE-PRYMMAL-0.5B-FT-V5-MUSR-Mathis/cf8d99c8-8790-4bdf-bfc2-1a6d1fe35916.json delete mode 100644 data/hfopenllm_v2/MLP-KTLim/llama-3-Korean-Bllossom-8B/5b5d42d7-8012-46f1-826f-32d839806048.json delete mode 100644 data/hfopenllm_v2/MTSAIR/Cotype-Nano/5e1bf2cb-55c4-4806-89af-cb9953c7c1b1.json delete mode 100644 data/hfopenllm_v2/MTSAIR/MultiVerse_70B/21ee4b33-9829-4cca-9603-c30fd4a1f7ff.json delete mode 100644 data/hfopenllm_v2/Magpie-Align/Llama-3-8B-Magpie-Align-SFT-v0.1/c6c14a8b-0e9f-4b97-b9f3-27c7250fb8f2.json delete mode 100644 data/hfopenllm_v2/Magpie-Align/Llama-3-8B-Magpie-Align-SFT-v0.3/6586fa94-9f43-4814-8c8a-8ed244ac94e7.json delete mode 100644 data/hfopenllm_v2/Magpie-Align/Llama-3-8B-Magpie-Align-v0.1/df7d7db2-867e-47f0-9abf-d71b79e97630.json delete mode 100644 data/hfopenllm_v2/Magpie-Align/Llama-3-8B-Magpie-Align-v0.1/e2502e7e-3a10-49f3-b5c6-b20496fed998.json delete mode 100644 data/hfopenllm_v2/Magpie-Align/Llama-3-8B-Magpie-Align-v0.3/51cde18f-09b0-4b66-a962-811ee49e192f.json delete mode 100644 data/hfopenllm_v2/Magpie-Align/Llama-3.1-8B-Magpie-Align-SFT-v0.1/4ea48b42-8026-4799-b35d-46757fd2753f.json delete mode 100644 data/hfopenllm_v2/Magpie-Align/Llama-3.1-8B-Magpie-Align-v0.1/52e9b4ae-9119-4f26-87e4-6532d1148ecd.json delete mode 100644 data/hfopenllm_v2/Magpie-Align/MagpieLM-8B-Chat-v0.1/4bda68c0-cc09-4945-961b-48776b7b5fc8.json delete mode 100644 data/hfopenllm_v2/Magpie-Align/MagpieLM-8B-SFT-v0.1/18ea0ad0-a216-4906-a96c-c8b040398dbd.json 
delete mode 100644 data/hfopenllm_v2/MagusCorp/grpo_lora_enem_llama3_7b/1e2321f6-93bd-4acf-9f5b-c82807a40233.json delete mode 100644 data/hfopenllm_v2/ManoloPueblo/ContentCuisine_1-7B-slerp/13032961-52a1-43cf-b69d-1802c43e1bcc.json delete mode 100644 data/hfopenllm_v2/ManoloPueblo/LLM_MERGE_CC2/9d444061-2c29-499a-8906-77ef58aba34d.json delete mode 100644 data/hfopenllm_v2/ManoloPueblo/LLM_MERGE_CC3/1ffdf6b0-b3a3-432a-a0e4-69b4d447bb76.json delete mode 100644 data/hfopenllm_v2/MarinaraSpaghetti/NemoReRemix-12B/8ce733ea-e6e9-4f9b-ab28-f93202507265.json delete mode 100644 data/hfopenllm_v2/MarinaraSpaghetti/Nemomix-v4.0-12B/0e88aa91-609c-4d2d-9296-25b06eeb0342.json delete mode 100644 data/hfopenllm_v2/Marsouuu/MiniMathExpert-2_61B-ECE-PRYMMAL-Martial/3e235ea0-3f04-4d99-9db2-7cafcbdbac6f.json delete mode 100644 data/hfopenllm_v2/Marsouuu/MiniQwenMathExpert-ECE-PRYMMAL-Martial/5e31a55c-f222-4192-b031-27bb40ba56fa.json delete mode 100644 data/hfopenllm_v2/Marsouuu/MistralBase-4x7B-MoE-ECE-PRYMMAL-Martial/11fd4b70-4ea7-4bee-8caf-8921d4c89f24.json delete mode 100644 data/hfopenllm_v2/Marsouuu/general3B-ECE-PRYMMAL-Martial/8e721067-898d-45ca-b4f5-9f523c4ce3d3.json delete mode 100644 data/hfopenllm_v2/Marsouuu/general3Bv2-ECE-PRYMMAL-Martial/be5d5480-ce4c-4ade-8c6a-c08cd2826909.json delete mode 100644 data/hfopenllm_v2/Marsouuu/lareneg1_78B-ECE-PRYMMAL-Martial/54dec074-29f8-4863-be37-2c08f6f2c3cb.json delete mode 100644 data/hfopenllm_v2/Marsouuu/lareneg3B-ECE-PRYMMAL-Martial/88a15025-556b-469d-be77-c773f2c61038.json delete mode 100644 data/hfopenllm_v2/Marsouuu/lareneg3Bv2-ECE-PRYMMAL-Martial/b4f4596b-17e5-40bf-ae60-0b17492ba9f8.json delete mode 100644 data/hfopenllm_v2/MaziyarPanahi/Calme-4x7B-MoE-v0.1/97ce858e-a64f-4881-b6d0-0a2c0814336d.json delete mode 100644 data/hfopenllm_v2/MaziyarPanahi/Calme-4x7B-MoE-v0.2/1becd83e-e9b8-49c1-a137-80c5a8dbdf0d.json delete mode 100644 data/hfopenllm_v2/MaziyarPanahi/Llama-3-70B-Instruct-v0.1/337bb321-9c6e-4751-9c9b-d8ba0120dd07.json delete mode 100644 data/hfopenllm_v2/MaziyarPanahi/Llama-3-8B-Instruct-v0.10/cfa95cc9-5bb1-4921-97c7-078f2f929a2f.json delete mode 100644 data/hfopenllm_v2/MaziyarPanahi/Llama-3-8B-Instruct-v0.8/6d5ba3c4-a0c2-40cd-9766-68d36d21c5b6.json delete mode 100644 data/hfopenllm_v2/MaziyarPanahi/Llama-3-8B-Instruct-v0.9/6cc4404a-f3e1-47b9-b56b-34e4269e1261.json delete mode 100644 data/hfopenllm_v2/MaziyarPanahi/Qwen1.5-MoE-A2.7B-Wikihow/8d820e43-ff42-4247-9ad0-4ed8e70672b4.json delete mode 100644 data/hfopenllm_v2/MaziyarPanahi/Qwen2-7B-Instruct-v0.1/d858ce8e-6a4b-46b1-8d51-03ebc2d8aaec.json delete mode 100644 data/hfopenllm_v2/MaziyarPanahi/Qwen2-7B-Instruct-v0.8/9813dd88-ff70-4d9e-86c5-9b73444275c5.json delete mode 100644 data/hfopenllm_v2/MaziyarPanahi/calme-2.1-llama3.1-70b/ac677432-e7d1-4439-9c05-426059c285ef.json delete mode 100644 data/hfopenllm_v2/MaziyarPanahi/calme-2.1-phi3-4b/018f270f-3cfe-403c-a236-483038a0b04e.json delete mode 100644 data/hfopenllm_v2/MaziyarPanahi/calme-2.1-phi3.5-4b/718a40ea-26b1-4cf4-9584-57be798640ae.json delete mode 100644 data/hfopenllm_v2/MaziyarPanahi/calme-2.1-qwen2-72b/207a28a9-ae24-4a31-be95-96296b2e466d.json delete mode 100644 data/hfopenllm_v2/MaziyarPanahi/calme-2.1-qwen2-7b/72efedb8-d456-41ed-b1ae-4887cb6c18f8.json delete mode 100644 data/hfopenllm_v2/MaziyarPanahi/calme-2.1-qwen2.5-72b/ac91fb37-5742-4a3d-b93a-86c63b90cad5.json delete mode 100644 data/hfopenllm_v2/MaziyarPanahi/calme-2.1-rys-78b/c71d025d-e954-4420-b397-e07c3644d1f4.json delete mode 100644 
data/hfopenllm_v2/MaziyarPanahi/calme-2.2-llama3-70b/968c3759-de5f-4255-ba95-cafc7a3c70a7.json delete mode 100644 data/hfopenllm_v2/MaziyarPanahi/calme-2.2-llama3.1-70b/5e23b2f7-33f7-4e49-b73a-a02b8650ee0d.json delete mode 100644 data/hfopenllm_v2/MaziyarPanahi/calme-2.2-phi3-4b/1b6c64f6-acf8-4cff-bcae-6e8b3725c6f1.json delete mode 100644 data/hfopenllm_v2/MaziyarPanahi/calme-2.2-qwen2-72b/7908f572-8886-4add-ae84-b4ec0ec17c26.json delete mode 100644 data/hfopenllm_v2/MaziyarPanahi/calme-2.2-qwen2-7b/9e04ec5c-2208-4569-9b63-4768ed4262b9.json delete mode 100644 data/hfopenllm_v2/MaziyarPanahi/calme-2.2-qwen2.5-72b/ee2c8beb-6566-4b19-91d0-8e48c12a3fdf.json delete mode 100644 data/hfopenllm_v2/MaziyarPanahi/calme-2.2-rys-78b/c7579616-0c21-443a-a149-0c51a0ae92ac.json delete mode 100644 data/hfopenllm_v2/MaziyarPanahi/calme-2.3-llama3-70b/ef7a1429-db2f-433b-a606-339a9d868e7a.json delete mode 100644 data/hfopenllm_v2/MaziyarPanahi/calme-2.3-llama3.1-70b/f531e13c-79ed-45da-a246-857fd2c884c1.json delete mode 100644 data/hfopenllm_v2/MaziyarPanahi/calme-2.3-phi3-4b/0f525d93-663a-442c-9a51-1ad3a5054172.json delete mode 100644 data/hfopenllm_v2/MaziyarPanahi/calme-2.3-qwen2-72b/15af21e1-3193-47fa-a3fc-1f087216d4d9.json delete mode 100644 data/hfopenllm_v2/MaziyarPanahi/calme-2.3-qwen2-7b/67b270d9-3422-4770-9957-7bde65acca0a.json delete mode 100644 data/hfopenllm_v2/MaziyarPanahi/calme-2.3-rys-78b/e2d38bcc-9133-4051-82d0-4e4fd66e00f8.json delete mode 100644 data/hfopenllm_v2/MaziyarPanahi/calme-2.4-llama3-70b/4ff256af-73c7-4a5a-96da-19546a786c59.json delete mode 100644 data/hfopenllm_v2/MaziyarPanahi/calme-2.4-qwen2-7b/225cbeef-1d0d-40fc-949d-4ba6696fb690.json delete mode 100644 data/hfopenllm_v2/MaziyarPanahi/calme-2.4-rys-78b/24fcd662-5abb-4bf8-b8df-1c21b048cd92.json delete mode 100644 data/hfopenllm_v2/MaziyarPanahi/calme-2.5-qwen2-7b/7badcb45-7826-4fd1-b964-c697fbda76cc.json delete mode 100644 data/hfopenllm_v2/MaziyarPanahi/calme-2.6-qwen2-7b/bfb532f1-3319-46ff-80ae-0ca783a18bb6.json delete mode 100644 data/hfopenllm_v2/MaziyarPanahi/calme-2.7-qwen2-7b/ea304515-b41f-4e96-a0ec-78c897ebf9a4.json delete mode 100644 data/hfopenllm_v2/MaziyarPanahi/calme-3.1-baguette-3b/1fe79ea5-1922-4a5e-8857-1c832353b0a6.json delete mode 100644 data/hfopenllm_v2/MaziyarPanahi/calme-3.1-instruct-3b/9098d70f-cbcd-4f6c-bcba-0b1da743396e.json delete mode 100644 data/hfopenllm_v2/MaziyarPanahi/calme-3.1-instruct-78b/df4ed9e0-30bc-4a3f-b7a2-8955cbb38d31.json delete mode 100644 data/hfopenllm_v2/MaziyarPanahi/calme-3.1-llamaloi-3b/f68957d5-20a1-438f-9931-6a787aaed467.json delete mode 100644 data/hfopenllm_v2/MaziyarPanahi/calme-3.2-baguette-3b/416e0c04-9119-4230-ba71-b0f47e2d4997.json delete mode 100644 data/hfopenllm_v2/MaziyarPanahi/calme-3.2-instruct-3b/d57780e2-154e-437d-ac2f-0007e1f9140e.json delete mode 100644 data/hfopenllm_v2/MaziyarPanahi/calme-3.2-instruct-78b/027d464b-1375-4de7-aa57-e1473d16ba89.json delete mode 100644 data/hfopenllm_v2/MaziyarPanahi/calme-3.3-baguette-3b/a81f20fa-57e8-498c-a162-6d8a9be09ee6.json delete mode 100644 data/hfopenllm_v2/MaziyarPanahi/calme-3.3-instruct-3b/d72ddbff-8ff7-446f-a74a-10a46bce6e3e.json delete mode 100644 data/hfopenllm_v2/Minami-su/Amara-o1-7B-Qwen/f681d612-f574-4641-b34e-95b6de97f9e8.json delete mode 100644 data/hfopenllm_v2/Minami-su/Amara-o2-7B-Qwen/cae1adaf-e424-4dcd-943b-5bbb708aca57.json delete mode 100644 data/hfopenllm_v2/Minami-su/test-7B-00/969ac825-92f2-448c-899a-226e69dee377.json delete mode 100644 
data/hfopenllm_v2/Minami-su/test-7B-01/e108ad28-c155-4162-852c-0f588a136bdc.json delete mode 100644 data/hfopenllm_v2/Minami-su/test-v2-7B-00/93cfeba9-7d31-45b4-a6e2-99a5f318f5b3.json delete mode 100644 data/hfopenllm_v2/ModelCloud/Llama-3.2-1B-Instruct-gptqmodel-4bit-vortex-v1/c1b16b84-9392-48f3-b483-0a9786925506.json delete mode 100644 data/hfopenllm_v2/ModelSpace/GemmaX2-28-9B-v0.1/b0c6e08d-b426-49d5-8a66-ee3d70131b62.json delete mode 100644 data/hfopenllm_v2/MoonRide/Llama-3.2-3B-Khelavaster/6a6651a3-b34e-404d-ac25-42c151fb9ba3.json delete mode 100644 data/hfopenllm_v2/Mostafa8Mehrabi/llama-3.2-1b-Insomnia-ChatBot-merged/da63b789-5571-4ed8-976e-146d385b18e2.json delete mode 100644 data/hfopenllm_v2/MrRobotoAI/MrRoboto-ProLong-8b-v4i/87b900e7-3bab-4e60-b0ef-349667cb2656.json delete mode 100644 data/hfopenllm_v2/MrRobotoAI/MrRoboto-ProLongBASE-pt8-unaligned-8b/c9fd4740-4990-4174-b782-9b63c34d6407.json delete mode 100644 data/hfopenllm_v2/MultivexAI/Gladiator-Mini-Exp-1211-3B/2582a049-e940-408b-b2d9-7a7bdf470e49.json delete mode 100644 data/hfopenllm_v2/MultivexAI/Gladiator-Mini-Exp-1221-3B-Instruct-V2/99310118-d2ec-4647-85db-fcc22aee9161.json delete mode 100644 data/hfopenllm_v2/MultivexAI/Gladiator-Mini-Exp-1221-3B-Instruct/bedd12e4-da18-4ca6-ba51-6d13e1c80bae.json delete mode 100644 data/hfopenllm_v2/MultivexAI/Gladiator-Mini-Exp-1222-3B-Instruct/6767e14a-bbfa-4a0d-8120-1f48a565474e.json delete mode 100644 data/hfopenllm_v2/MultivexAI/Phi-3.5-Mini-Instruct-MultiVex-v0.25-GGUF/70260aac-1bbf-4913-9dcc-58633d055314.json delete mode 100644 data/hfopenllm_v2/Mxode/NanoLM-0.3B-Instruct-v1.1/fba6e1a2-c197-4731-91ea-f6d059ba8b16.json delete mode 100644 data/hfopenllm_v2/Mxode/NanoLM-0.3B-Instruct-v1/22e74d0c-70d6-43c5-be4d-62842d93fedf.json delete mode 100644 data/hfopenllm_v2/Mxode/NanoLM-0.3B-Instruct-v2/f7c33065-1da1-4da4-81c7-f2c9307b6e9b.json delete mode 100644 data/hfopenllm_v2/Mxode/NanoLM-1B-Instruct-v1.1/ecdb4661-426a-46be-aefc-7e04483cebc0.json delete mode 100644 data/hfopenllm_v2/Mxode/NanoLM-1B-Instruct-v2/236976b3-af46-45ac-a8a5-f5897e3468a1.json delete mode 100644 data/hfopenllm_v2/NAPS-ai/naps-gemma-2-27b-v-0.1.0/fd175296-a5f6-4914-80e9-b8b75bc659de.json delete mode 100644 data/hfopenllm_v2/NAPS-ai/naps-gemma-2-27b-v0.1.0/d910bbaa-d55c-4b00-9320-856a8a6713c0.json delete mode 100644 data/hfopenllm_v2/NAPS-ai/naps-llama-3_1-8b-instruct-v0.3/99a5f123-5d2e-469b-884e-c9a64c6bc197.json delete mode 100644 data/hfopenllm_v2/NAPS-ai/naps-llama-3_1-8b-instruct-v0.4/ed17a715-f0ae-461c-9618-ac952c450ec5.json delete mode 100644 data/hfopenllm_v2/NAPS-ai/naps-llama-3_1-instruct-v0.5.0/3dd2a474-9ea8-4e26-8986-5bcc67c78c39.json delete mode 100644 data/hfopenllm_v2/NAPS-ai/naps-llama-3_1_instruct-v0.6.0/b39e14a6-c05f-4e88-b2d4-63a199aa61a1.json delete mode 100644 data/hfopenllm_v2/NAPS-ai/naps-llama3.1-70B-v0.2-fp16/39893637-552a-48d8-9b83-433415eb26c3.json delete mode 100644 data/hfopenllm_v2/NCSOFT/Llama-VARCO-8B-Instruct/f9549713-f487-4e26-bfeb-ec6d394b7014.json delete mode 100644 data/hfopenllm_v2/NJS26/NJS_777/02579c41-f117-4412-9c00-ee7db3e9ab97.json delete mode 100644 data/hfopenllm_v2/NLPark/AnFeng_v3.1-Avocet/bfa1d761-00aa-4438-a5de-972d934c63d5.json delete mode 100644 data/hfopenllm_v2/NLPark/B-and-W_Flycatcher-3AD1E/20a84d88-05c2-4e02-8c84-2afa84cc659f.json delete mode 100644 data/hfopenllm_v2/NLPark/Shi-Ci-Robin-Test_3AD80/84eedce3-3a93-4630-b914-aa281fd2efda.json delete mode 100644 data/hfopenllm_v2/NTQAI/NxMobileLM-1.5B-SFT/b3b7b62f-ac82-4ef9-9634-afb81645ec19.json delete mode 
100644 data/hfopenllm_v2/NTQAI/Nxcode-CQ-7B-orpo/283c5166-b9c5-4d20-9653-0cd0346d87c1.json delete mode 100644 data/hfopenllm_v2/NYTK/PULI-GPTrio/478b54cd-6410-41e5-8a53-4e46bcd9d7af.json delete mode 100644 data/hfopenllm_v2/NYTK/PULI-LlumiX-32K/de2ae7a9-93eb-4149-b3ff-b5b7dfba29c4.json delete mode 100644 data/hfopenllm_v2/Naveenpoliasetty/llama3-8B-V2/ef5aa9db-804b-4a53-9c22-9c99f6c69eeb.json delete mode 100644 data/hfopenllm_v2/NbAiLab/nb-llama-3.1-8B-Instruct/553fd36d-08dd-46a3-ab04-77b9039e7921.json delete mode 100644 data/hfopenllm_v2/NbAiLab/nb-llama-3.1-8B-sft/e2bae853-cc0f-456a-a635-98d5f87ac47c.json delete mode 100644 data/hfopenllm_v2/Nekochu/Llama-3.1-8B-German-ORPO/d6c5f196-c97b-4a0a-81b0-59143ec4b10e.json delete mode 100644 data/hfopenllm_v2/Nekochu/Llama-3.1-8B-french-DPO/5d92e02f-b590-4b6b-8c64-30690f79e916.json delete mode 100644 data/hfopenllm_v2/Nekochu/Luminia-13B-v3/e10f38df-b5d5-47c6-924f-563c6f8a6616.json delete mode 100644 data/hfopenllm_v2/Nekochu/Luminia-8B-RP/27257dc9-750c-4673-8865-986434bc5c0e.json delete mode 100644 data/hfopenllm_v2/NeverSleep/Lumimaid-v0.2-12B/e599f3f8-e5eb-4bfe-a102-efc5a967434d.json delete mode 100644 data/hfopenllm_v2/NeverSleep/Lumimaid-v0.2-8B/8e56f2dd-49d0-4eff-beea-53d01cd96f0e.json delete mode 100644 data/hfopenllm_v2/Nexesenex/Dolphin3.0-Llama3.1-1B-abliterated/f1a2b5d0-2c8a-4bbc-8bc5-0484485c2dad.json delete mode 100644 data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_DeepDive_3_Prev_v1.0/2c12ee67-0c77-4cb2-9e88-1c731ed55c3f.json delete mode 100644 data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_DeepDive_3_R1_Prev_v1.0/567f8f54-225f-4d9b-be06-f24091adc1e6.json delete mode 100644 data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_DobHerWild_R1_v1.1R/ebb59730-9522-4c45-8f42-c0d941fd728c.json delete mode 100644 data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_DoberWild_v2.01/2c44fa8c-ebd3-4ea6-8578-61da38965c09.json delete mode 100644 data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_DoberWild_v2.02/3ef26b8c-6bfb-457b-a160-a65c3cc8b0c6.json delete mode 100644 data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_DoberWild_v2.03/0ab721ba-fbda-44ca-a349-1d3abfaabe62.json delete mode 100644 data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_DodoWild_v2.01/2fea1128-4f0c-40d8-be87-72c42c0648fb.json delete mode 100644 data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_DodoWild_v2.02/db9dc9d2-4aa2-43d0-9f2e-15fbd05af62c.json delete mode 100644 data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_DodoWild_v2.03/28399fd0-840c-49d3-8179-407ed83d3bfc.json delete mode 100644 data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_DodoWild_v2.10/d7108c13-e14a-4366-9a39-204f853b1bee.json delete mode 100644 data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_Dolermed_R1_V1.01/56152d05-9273-4701-8c0a-723e2cab618d.json delete mode 100644 data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_Dolermed_R1_V1.03/55d2f23d-cb6c-42d2-8b57-837451d3c6df.json delete mode 100644 data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_Dolermed_V1.01/7479ae87-e795-4e20-848a-291614176def.json delete mode 100644 data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_Dolerstormed_V1.04/04ceb40e-bde8-487b-9d29-dc8f681af9be.json delete mode 100644 data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_Hermedash_R1_V1.04/e26b00b0-d9df-4ce2-a649-b19f8957b8ce.json delete mode 100644 data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_Hermedive_R1_V1.01/9954194c-69b5-4eb4-8b32-859845548cb0.json delete mode 100644 data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_Hermedive_R1_V1.03/2afbc279-242a-4276-85f0-facd29c2d89b.json delete mode 100644 data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_Hermedive_V1.01/ba307ad4-3647-4785-9bf1-cd4dacf3c71f.json delete mode 100644 
data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_Mediver_V1.01/d03c73ca-7364-4517-aea4-f0ac564c49df.json delete mode 100644 data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_Medusa_v1.01/1dd4b82a-ca80-4c9c-8800-f97ab2b9cbe7.json delete mode 100644 data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_Smarteaz_0.2_R1/f2363099-c39a-4874-bf77-ccc0fa087680.json delete mode 100644 data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_Smarteaz_V1.01/596eeee8-3600-4f8a-8888-978b610eb2ca.json delete mode 100644 data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_Stormeder_v1.04/595ddba1-c450-4b69-85b7-0e3118c8c6c7.json delete mode 100644 data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_Typhoon_v1.03/64890314-bba0-4fb2-8c21-38b413cff4c8.json delete mode 100644 data/hfopenllm_v2/Nexesenex/Llama_3.2_1b_AquaSyn_0.1/470b8b0d-fbaf-408c-a28e-57d1b294f8a8.json delete mode 100644 data/hfopenllm_v2/Nexesenex/Llama_3.2_1b_AquaSyn_0.11/00a1579e-8636-4eca-9a63-c0b067a5f3dc.json delete mode 100644 data/hfopenllm_v2/Nexesenex/Llama_3.2_1b_Dolto_0.1/a52cc4c9-6d60-4083-ac77-591e247d86c9.json delete mode 100644 data/hfopenllm_v2/Nexesenex/Llama_3.2_1b_Odyssea_V1.01/ac5c321a-d35a-4e0f-a1be-bcc0b7109f91.json delete mode 100644 data/hfopenllm_v2/Nexesenex/Llama_3.2_1b_Odyssea_V1/c4d11b01-ae5b-4198-b102-07160f100a41.json delete mode 100644 data/hfopenllm_v2/Nexesenex/Llama_3.2_1b_OpenTree_R1_0.1/19405ead-2263-4613-8053-43beeafb4bfc.json delete mode 100644 data/hfopenllm_v2/Nexesenex/Llama_3.2_1b_OrcaSun_V1/6c698a60-a813-4be7-b55f-b684029b492d.json delete mode 100644 data/hfopenllm_v2/Nexesenex/Llama_3.2_1b_RandomLego_RP_R1_0.1/b67c4a44-7787-45e2-b88c-5d7e8e496fa3.json delete mode 100644 data/hfopenllm_v2/Nexesenex/Llama_3.2_1b_SunOrca_V1/a20a529e-c52e-41b7-a8ee-909167048bfb.json delete mode 100644 data/hfopenllm_v2/Nexesenex/Llama_3.2_1b_Sydonia_0.1/2735e6f4-839f-4ab1-8ede-3447891b1b26.json delete mode 100644 data/hfopenllm_v2/Nexesenex/Llama_3.2_1b_Syneridol_0.2/e74e7e7f-8550-4cba-97cd-2626c82d6b29.json delete mode 100644 data/hfopenllm_v2/Nexesenex/Llama_3.2_1b_Synopsys_0.1/14f4c00d-8915-413d-8e85-79f395127682.json delete mode 100644 data/hfopenllm_v2/Nexesenex/Llama_3.2_1b_Synopsys_0.11/9119b586-d3b2-4ce0-a243-d584e2087184.json delete mode 100644 data/hfopenllm_v2/Nexesenex/Llama_3.2_3b_Kermes_v1/629f3f1a-f8ee-4d1b-b604-7bbd35c6517b.json delete mode 100644 data/hfopenllm_v2/Nexesenex/Llama_3.2_3b_Kermes_v2.1/a6ac828c-904b-413a-a5fa-a5ed06a28143.json delete mode 100644 data/hfopenllm_v2/Nexesenex/Llama_3.2_3b_Kermes_v2/251a3ef9-c7ae-4d79-8a60-4bc021a3f001.json delete mode 100644 data/hfopenllm_v2/Nexesenex/Nemotron_W_4b_Halo_0.1/962b48a3-23d7-4104-b34d-4e5c2af31d58.json delete mode 100644 data/hfopenllm_v2/Nexesenex/Nemotron_W_4b_MagLight_0.1/e4b0be31-6f9a-4a57-b433-e561da9bd827.json delete mode 100644 data/hfopenllm_v2/Nexesenex/Qwen_2.5_3b_Smarteaz_0.01a/9a31f208-b7d8-4baa-b96e-99926ecb35af.json delete mode 100644 data/hfopenllm_v2/Nexesenex/pankajmathur_orca_mini_v9_6_1B-instruct-Abliterated-LPL/8d933df1-60cb-471d-bfc3-b11c93150203.json delete mode 100644 data/hfopenllm_v2/Nexusflow/NexusRaven-V2-13B/35315c3a-ec06-433a-b3fa-ae7a4a59b7ea.json delete mode 100644 data/hfopenllm_v2/NikolaSigmoid/AceMath-1.5B-Instruct-1epoch/3530db9a-0d61-4cf8-9fff-b15f6488c845.json delete mode 100644 data/hfopenllm_v2/NikolaSigmoid/AceMath-1.5B-Instruct-dolphin-r1-200/7d9901e0-eafe-4d49-a5bb-fab059708bcb.json delete mode 100644 data/hfopenllm_v2/NikolaSigmoid/DeepSeek-R1-Distill-Qwen-1.5B-500/ee7f9025-bb2c-4902-b8e2-bfac2b63d2fd.json delete mode 100644 
data/hfopenllm_v2/NikolaSigmoid/acemath-200/6157f79e-2673-4ad6-99d7-e5cf5e4e1db2.json delete mode 100644 data/hfopenllm_v2/NikolaSigmoid/phi-4-14b/0aa7572c-1aa6-4997-a2a2-3b557fbde639.json delete mode 100644 data/hfopenllm_v2/NikolaSigmoid/phi-4-1steps/6f5df760-2d3e-47b1-b55e-4031a5f11d41.json delete mode 100644 data/hfopenllm_v2/NikolaSigmoid/phi-4-300steps/ac676b03-c3ce-4ff1-83fc-5c8db82f1497.json delete mode 100644 data/hfopenllm_v2/Nitral-AI/Captain-Eris-BMO_Violent-GRPO-v0.420/2229cdf8-3ecb-4f11-8824-9c3bfbf6f968.json delete mode 100644 data/hfopenllm_v2/Nitral-AI/Captain-Eris_BMO-Violent-12B/95ebc5b8-a541-4fca-9e7c-692720e73362.json delete mode 100644 data/hfopenllm_v2/Nitral-AI/Captain-Eris_Violet-GRPO-v0.420/09a2508d-a171-493f-9ff2-e7f375815c91.json delete mode 100644 data/hfopenllm_v2/Nitral-AI/Captain-Eris_Violet-V0.420-12B/12a4a921-5859-4fd6-9d64-677a7d8ef696.json delete mode 100644 data/hfopenllm_v2/Nitral-AI/Captain_BMO-12B/b79f12d0-cdfc-4c9d-a88b-40612dcbf64d.json delete mode 100644 data/hfopenllm_v2/Nitral-AI/Hathor_Stable-v0.2-L3-8B/d162cf7c-3ef4-420f-aab4-789a98b1195a.json delete mode 100644 data/hfopenllm_v2/Nitral-AI/Hathor_Tahsin-L3-8B-v0.85/7e49018e-5e2d-4cdb-be5b-2ac04ec84bf5.json delete mode 100644 data/hfopenllm_v2/Nitral-AI/Nera_Noctis-12B/24677f2a-ea89-4289-bcb6-13699de9782f.json delete mode 100644 data/hfopenllm_v2/Nohobby/MS-Schisandra-22B-v0.1/3e09df3c-2224-4a29-8e55-18a485db2b25.json delete mode 100644 data/hfopenllm_v2/Nohobby/MS-Schisandra-22B-v0.2/cc0bd236-8fc4-43d3-a18f-4b2afb112946.json delete mode 100644 data/hfopenllm_v2/Norquinal/Alpha/5afd4c0f-b61d-452f-8c48-d298780d91d5.json delete mode 100644 data/hfopenllm_v2/Norquinal/Bravo/eac52141-4fd8-4e21-9c78-920ab8933e5a.json delete mode 100644 data/hfopenllm_v2/Norquinal/Charlie/8449837f-64ac-4293-b1f8-210e62779202.json delete mode 100644 data/hfopenllm_v2/Norquinal/Delta/ab8a665c-8234-484f-a8a9-8ee79d73edff.json delete mode 100644 data/hfopenllm_v2/Norquinal/Echo/a954242f-41a6-49d7-a71d-3bfe940cdb92.json delete mode 100644 data/hfopenllm_v2/Norquinal/Foxtrot/6d1c518f-3f42-49eb-9208-b30e27e7e87e.json delete mode 100644 data/hfopenllm_v2/Norquinal/Golf/87931db7-42a4-48df-b5a5-8bd934061dbe.json delete mode 100644 data/hfopenllm_v2/Norquinal/Hotel/54088dbc-04cc-4b35-b4e1-e495b7cfd47f.json delete mode 100644 data/hfopenllm_v2/NotASI/FineTome-Llama3.2-1B-0929/7129efad-8ab2-4f7a-b6ed-055989b3e131.json delete mode 100644 data/hfopenllm_v2/NotASI/FineTome-Llama3.2-3B-1002/cfc6f85f-e4b6-4164-b7eb-4efb888e1ba5.json delete mode 100644 data/hfopenllm_v2/NotASI/FineTome-v1.5-Llama3.2-1B-1007/0f053a45-cd79-4e51-9b4c-ae5c51006c17.json delete mode 100644 data/hfopenllm_v2/NotASI/FineTome-v1.5-Llama3.2-3B-1007/d8002b35-1454-4635-a31e-b419c7000b53.json delete mode 100644 data/hfopenllm_v2/NousResearch/DeepHermes-3-Mistral-24B-Preview/4c08530e-d529-49a1-a3fe-2351c422981a.json delete mode 100644 data/hfopenllm_v2/NousResearch/Hermes-2-Pro-Llama-3-8B/d16879dc-7ed7-49c4-aca6-4c9cd3b3a350.json delete mode 100644 data/hfopenllm_v2/NousResearch/Hermes-2-Pro-Mistral-7B/70656b13-e0a2-4ef4-af43-0d9995d57af6.json delete mode 100644 data/hfopenllm_v2/NousResearch/Hermes-2-Theta-Llama-3-8B/6544f1ca-02a6-4e58-98f0-e19cc6082682.json delete mode 100644 data/hfopenllm_v2/NousResearch/Hermes-3-Llama-3.1-70B/5cd3796f-fb31-49c1-a974-019c5c5b20ae.json delete mode 100644 data/hfopenllm_v2/NousResearch/Hermes-3-Llama-3.1-8B/49eff9ad-90c9-43b1-a1f5-cf371ac4b39b.json delete mode 100644 
data/hfopenllm_v2/NousResearch/Hermes-3-Llama-3.2-3B/59720f7e-7e09-483f-8332-8dc7aa19ae78.json delete mode 100644 data/hfopenllm_v2/NousResearch/Nous-Hermes-2-Mistral-7B-DPO/a3a89e4a-0589-4776-a1da-227552482e94.json delete mode 100644 data/hfopenllm_v2/NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO/b3c04d1f-80e3-4d86-9779-c5e4bbce6f35.json delete mode 100644 data/hfopenllm_v2/NousResearch/Nous-Hermes-2-Mixtral-8x7B-SFT/448fda35-bfdc-42ae-90f9-d44383e0a454.json delete mode 100644 data/hfopenllm_v2/NousResearch/Nous-Hermes-2-SOLAR-10.7B/0d97542e-82b6-4f27-9822-62b67e7690c2.json delete mode 100644 data/hfopenllm_v2/NousResearch/Nous-Hermes-llama-2-7b/2725bd69-839d-4427-8e05-0e289fff70de.json delete mode 100644 data/hfopenllm_v2/NousResearch/Yarn-Llama-2-13b-128k/adb71488-adb8-4848-bf1d-aecd04cb6718.json delete mode 100644 data/hfopenllm_v2/NousResearch/Yarn-Llama-2-7b-128k/c7736577-c4c3-4233-9308-a4bb9b2dbb89.json delete mode 100644 data/hfopenllm_v2/NousResearch/Yarn-Llama-2-7b-64k/76fe52f4-9fa5-4ccb-8c92-7bd9eb9886ee.json delete mode 100644 data/hfopenllm_v2/NousResearch/Yarn-Mistral-7b-128k/1d92e45f-c5a5-4dd6-a61f-8e0f7246117a.json delete mode 100644 data/hfopenllm_v2/NousResearch/Yarn-Mistral-7b-64k/5e1513f1-4375-4380-85fa-b96a419c013b.json delete mode 100644 data/hfopenllm_v2/NousResearch/Yarn-Solar-10b-32k/fadbf3b2-283a-4f8e-9acf-463d75924b97.json delete mode 100644 data/hfopenllm_v2/NousResearch/Yarn-Solar-10b-64k/c04ffe5b-c313-4249-83bb-bbe07ad6fc69.json delete mode 100644 data/hfopenllm_v2/Novaciano/ASTAROTH-3.2-1B/a9aa164e-386b-4987-9f49-2dde64ade45c.json delete mode 100644 data/hfopenllm_v2/Novaciano/BLAST_PROCESSING-3.2-1B/e4c1b3ef-e1db-4eca-b818-f3b1680cc5f0.json delete mode 100644 data/hfopenllm_v2/Novaciano/Cerberus-3.2-1B/1ab95edc-ea3c-4d3f-9f59-dc7f7468adb9.json delete mode 100644 data/hfopenllm_v2/Novaciano/Cultist-3.2-1B/80a81bbc-6edf-48b9-afb7-e4e0a03753d8.json delete mode 100644 data/hfopenllm_v2/Novaciano/FuseChat-3.2-1B-GRPO_Creative_RP/afb24bf8-3c47-4278-9b84-19b05017745b.json delete mode 100644 data/hfopenllm_v2/Novaciano/Fusetrix-3.2-1B-GRPO_RP_Creative/4f8cda4d-959b-41ab-a79d-d2b35968eb89.json delete mode 100644 data/hfopenllm_v2/Novaciano/Fusetrix-Dolphin-3.2-1B-GRPO_Creative_RP/2818aa8c-5c73-4de9-bcbe-fd8f68e8bc6b.json delete mode 100644 data/hfopenllm_v2/Novaciano/HarmfulProject-3.2-1B/6a683ead-0f3e-449b-9ae1-8afc9f1ab33d.json delete mode 100644 data/hfopenllm_v2/Novaciano/LEWD-Mental-Cultist-3.2-1B/38cb02a8-862d-40e1-922a-e65f537df87e.json delete mode 100644 data/hfopenllm_v2/Novaciano/La_Mejor_Mezcla-3.2-1B/f816e2a7-2629-4abe-9ed0-3d1299e95194.json delete mode 100644 data/hfopenllm_v2/Novaciano/Sigil-Of-Satan-3.2-1B/286fae5b-544a-4033-9092-d633fc80f47b.json delete mode 100644 data/hfopenllm_v2/NucleusAI/nucleus-22B-token-500B/93477bf6-ea00-418b-8a2f-975a9554263e.json delete mode 100644 data/hfopenllm_v2/NyxKrage/Microsoft_Phi-4/3d7c6576-f99c-4bb3-94fa-4f713e2898f6.json delete mode 100644 data/hfopenllm_v2/OEvortex/Emotional-llama-8B/d1e9a242-941f-4461-b75b-7043c2c01ef7.json delete mode 100644 data/hfopenllm_v2/OEvortex/HelpingAI-15B/e39661af-ad93-41d7-8892-1230064f1a1c.json delete mode 100644 data/hfopenllm_v2/OEvortex/HelpingAI-3B-reloaded/595b61b2-5220-48f6-91a0-3aa0d37c63d8.json delete mode 100644 data/hfopenllm_v2/OEvortex/HelpingAI2-9B/3173263e-2a42-4e8d-956e-8175ef464e76.json delete mode 100644 data/hfopenllm_v2/OEvortex/HelpingAI2.5-10B/f77f8291-1573-4fb6-a984-1cc099c09621.json delete mode 100644 
data/hfopenllm_v2/OliveiraJLT/Sagui-7B-Instruct-v0.1/c4681e14-513c-4e5e-af8c-88ca11849176.json delete mode 100644 data/hfopenllm_v2/Omkar1102/code-yi/0c220edd-2563-4fec-99a4-ef8c210ca5ce.json delete mode 100644 data/hfopenllm_v2/Omkar1102/code-yi/bd7ef5a7-aa75-4eb4-8860-aec63f8bf9d1.json delete mode 100644 data/hfopenllm_v2/OmnicromsBrain/NeuralStar_FusionWriter_4x7b/85c20522-03c0-4dac-a1c8-2945e4bf0e0e.json delete mode 100644 data/hfopenllm_v2/OnlyCheeini/greesychat-turbo/f180fddd-077f-43f9-b2d9-38c5f33be44d.json delete mode 100644 data/hfopenllm_v2/Open-Orca/Mistral-7B-OpenOrca/ef384329-8406-4767-ac1a-3eba3131f726.json delete mode 100644 data/hfopenllm_v2/OpenAssistant/oasst-sft-1-pythia-12b/2ddeae27-77d3-413c-a6e1-9de0f3980c4e.json delete mode 100644 data/hfopenllm_v2/OpenBuddy/openbuddy-falcon3-10b-v24.2-131k/38b2dbbe-be86-4ef0-a39b-89841f662141.json delete mode 100644 data/hfopenllm_v2/OpenBuddy/openbuddy-llama3-70b-v21.2-32k/999a8091-22bd-4c08-bee1-772202e7edde.json delete mode 100644 data/hfopenllm_v2/OpenBuddy/openbuddy-llama3-8b-v21.1-8k/fda91d98-d259-430c-929b-78852cab64ec.json delete mode 100644 data/hfopenllm_v2/OpenBuddy/openbuddy-llama3-8b-v21.2-32k/535bfa4f-ab63-4832-9f17-7b245ff2b2af.json delete mode 100644 data/hfopenllm_v2/OpenBuddy/openbuddy-llama3.1-70b-v22.1-131k/681a6cc5-5519-4b13-8b50-93adcab4a3f7.json delete mode 100644 data/hfopenllm_v2/OpenBuddy/openbuddy-llama3.1-8b-v22.2-131k/141dd12c-6901-4a96-a051-f35647ddcc73.json delete mode 100644 data/hfopenllm_v2/OpenBuddy/openbuddy-llama3.1-8b-v22.3-131k/5b095779-aacc-41f3-9a3f-83f64a1c0d4c.json delete mode 100644 data/hfopenllm_v2/OpenBuddy/openbuddy-llama3.2-1b-v23.1-131k/7a88c95a-b253-4f36-8fde-1b0158bbf0b6.json delete mode 100644 data/hfopenllm_v2/OpenBuddy/openbuddy-llama3.2-3b-v23.2-131k/7938a00e-4e11-4223-a900-fa53df168ab7.json delete mode 100644 data/hfopenllm_v2/OpenBuddy/openbuddy-llama3.3-70b-v24.1-131k/8f966b4e-1baf-445f-9f10-4ba6b47aaf9b.json delete mode 100644 data/hfopenllm_v2/OpenBuddy/openbuddy-mixtral-7bx8-v18.1-32k/a334d998-21a5-4108-96e3-9935507a9f8f.json delete mode 100644 data/hfopenllm_v2/OpenBuddy/openbuddy-nemotron-70b-v23.1-131k/941e27c6-81da-4ce1-b1c8-544c1426cd11.json delete mode 100644 data/hfopenllm_v2/OpenBuddy/openbuddy-nemotron-70b-v23.2-131k/e409a374-685b-482d-82e4-2436dca37309.json delete mode 100644 data/hfopenllm_v2/OpenBuddy/openbuddy-qwen2.5llamaify-14b-v23.1-200k/84713625-97b6-4fad-982d-41b5c500d73a.json delete mode 100644 data/hfopenllm_v2/OpenBuddy/openbuddy-qwen2.5llamaify-14b-v23.3-200k/b7edd9ab-a018-4b2f-9b01-b56cbe98abda.json delete mode 100644 data/hfopenllm_v2/OpenBuddy/openbuddy-qwen2.5llamaify-7b-v23.1-200k/ec896115-21ef-4337-9fdd-32a04c574a05.json delete mode 100644 data/hfopenllm_v2/OpenBuddy/openbuddy-qwq-32b-v24.1-200k/d8e5f49b-7bf3-41d4-a91e-c566219609f6.json delete mode 100644 data/hfopenllm_v2/OpenBuddy/openbuddy-qwq-32b-v24.2-200k/ce1a92a3-6bec-410f-ab42-c567c5d23856.json delete mode 100644 data/hfopenllm_v2/OpenBuddy/openbuddy-yi1.5-34b-v21.3-32k/0a125470-b50f-4ca0-90dc-1f6b69c3ccd4.json delete mode 100644 data/hfopenllm_v2/OpenBuddy/openbuddy-zero-14b-v22.3-32k/aeee0165-ac7e-4da6-8102-ba60f43587de.json delete mode 100644 data/hfopenllm_v2/OpenBuddy/openbuddy-zero-3b-v21.2-32k/b47b8666-2556-45df-ba5b-9a5e94186784.json delete mode 100644 data/hfopenllm_v2/OpenBuddy/openbuddy-zero-56b-v21.2-32k/0bde5d57-39be-4497-a2a8-d08d3c8d65f4.json delete mode 100644 data/hfopenllm_v2/OpenGenerativeAI/Bifrost-14B/86599961-3ec2-4837-89a4-809f1dd7226c.json delete mode 100644 
data/hfopenllm_v2/OpenGenerativeAI/Bifrost/dc3ca25e-41b2-4206-afaa-7d2d10fd27a7.json delete mode 100644 data/hfopenllm_v2/OpenLLM-France/Lucie-7B-Instruct-human-data/cd77d407-3be3-4b84-8a73-34a15744de93.json delete mode 100644 data/hfopenllm_v2/OpenLLM-France/Lucie-7B-Instruct-v1.1/1cd20db5-0225-4724-b1f9-7c32eae456e1.json delete mode 100644 data/hfopenllm_v2/OpenLLM-France/Lucie-7B-Instruct/dfc45dc3-51e6-454b-aee9-ea6b0714f0ca.json delete mode 100644 data/hfopenllm_v2/OpenLLM-France/Lucie-7B/3da2a408-672c-47b8-be32-61f56a15e9f3.json delete mode 100644 data/hfopenllm_v2/OpenLeecher/llama3-8b-lima/94700c3c-f18d-4f96-a794-65bcf483fca9.json delete mode 100644 data/hfopenllm_v2/OpenScholar/Llama-3.1_OpenScholar-8B/6f3481d4-076f-45bd-8564-d485109c7a63.json delete mode 100644 data/hfopenllm_v2/Orenguteng/Llama-3.1-8B-Lexi-Uncensored-V2/9f5ca3b2-747a-4fd0-b382-bf7ef503ba25.json delete mode 100644 data/hfopenllm_v2/Orenguteng/Llama-3.1-8B-Lexi-Uncensored/f1932041-263a-4841-9c8b-c6cc9fa50c21.json delete mode 100644 data/hfopenllm_v2/Orion-zhen/Qwen2.5-7B-Instruct-Uncensored/691bef38-bc9e-4f8d-b774-9d7c62eec72b.json delete mode 100644 data/hfopenllm_v2/Orion-zhen/phi-4-abliterated/5795f693-9ebc-47c6-9d2c-185dd0d32044.json delete mode 100644 data/hfopenllm_v2/P0x0/Astra-v1-12B/eb83f474-0d3d-488c-bc0f-93e5d1dfb2f3.json delete mode 100644 data/hfopenllm_v2/PJMixers-Dev/L3.2-Instruct-Thinking-v0.1-1B/f93b2053-11c4-4868-860f-90fbfe8288fc.json delete mode 100644 data/hfopenllm_v2/PJMixers-Dev/LLaMa-3.1-Instruct-Interleaved-Zeroed-13B/8984fe95-9fd3-48ff-aa5f-18df63ecd6bb.json delete mode 100644 data/hfopenllm_v2/PJMixers-Dev/LLaMa-3.1-RomboTiesTest-8B/a0f6f5de-578c-4290-85b5-c51aed985074.json delete mode 100644 data/hfopenllm_v2/PJMixers-Dev/LLaMa-3.1-RomboTiesTest2-8B/8ccc76ff-25c9-4706-b6a8-31b49f8be813.json delete mode 100644 data/hfopenllm_v2/PJMixers-Dev/LLaMa-3.2-Instruct-JankMix-v0.1-SFT-3B/924f8b31-506d-4df2-8a7b-d0cd66d55f6d.json delete mode 100644 data/hfopenllm_v2/PJMixers-Dev/LLaMa-3.2-Instruct-JankMix-v0.2-SFT-3B/8e7dfd9f-350d-406c-811d-453f1744dd53.json delete mode 100644 data/hfopenllm_v2/PJMixers-Dev/LLaMa-3.2-Instruct-JankMix-v0.2-SFT-HailMary-v0.1-KTO-3B/b713d1d2-351f-43a1-b77d-27723e1d4267.json delete mode 100644 data/hfopenllm_v2/PJMixers-Dev/LLaMa-3.2-Instruct-JankMixBread-v0.1-3B/322a9442-174f-4223-b839-6f8f9664d5e5.json delete mode 100644 data/hfopenllm_v2/PJMixers-Dev/Qwen2.5-RomboTiesTest-7B/b12e71d1-c435-4172-a28f-38e26791dadb.json delete mode 100644 data/hfopenllm_v2/PJMixers/LLaMa-3-CursedStock-v2.0-8B/ad33b0e8-39c8-4118-81bd-bc86b482f122.json delete mode 100644 data/hfopenllm_v2/Parissa3/test-model/db8a7864-293b-45e9-995b-5301071c902d.json delete mode 100644 data/hfopenllm_v2/Pinkstack/PARM-V1.5-base-QwQ-Qwen-2.5-o1-3B/31e3beea-28dc-4b47-a5e9-5fafc89226db.json delete mode 100644 data/hfopenllm_v2/Pinkstack/SuperThoughts-CoT-14B-16k-o1-QwQ/49315a95-394f-4508-8e6c-7c1d5547c257.json delete mode 100644 data/hfopenllm_v2/Pinkstack/Superthoughts-lite-1.8B-experimental-o1/375d3a94-97af-47ef-82af-afd7581663d4.json delete mode 100644 data/hfopenllm_v2/Pinkstack/Superthoughts-lite-v1/77cfe896-4aa1-4bcd-a39a-f437c3f7e738.json delete mode 100644 data/hfopenllm_v2/PocketDoc/Dans-Instruct-CoreCurriculum-12b/3d69ec7d-9999-4e16-8dc9-99fad35e156e.json delete mode 100644 data/hfopenllm_v2/PocketDoc/Dans-PersonalityEngine-V1.1.0-12b/d2a7459b-8a12-4529-b978-c7237979f16b.json delete mode 100644 data/hfopenllm_v2/PocketDoc/Dans-PersonalityEngine-V1.2.0-24b/e7a228ad-69de-471a-9f31-6bdc7221999c.json 
delete mode 100644 data/hfopenllm_v2/PocketDoc/Dans-PersonalityEngine-v1.0.0-8b/9196ae39-adb0-4d53-8399-0ccd4d628065.json delete mode 100644 data/hfopenllm_v2/PocketDoc/Dans-SakuraKaze-V1.0.0-12b/ea318f99-a1ab-41ed-ae5d-39c62ac40e1b.json delete mode 100644 data/hfopenllm_v2/PowerInfer/SmallThinker-3B-Preview/05f69fd6-a77e-478d-ad86-3e83e615e892.json delete mode 100644 data/hfopenllm_v2/PranavHarshan/LaMistral-V4/5b8e9508-befb-4674-bd84-9c722a0864ce.json delete mode 100644 data/hfopenllm_v2/PranavHarshan/MedNarra-X1/8beb3730-23e8-4b89-933d-2d3f1a1d1365.json delete mode 100644 data/hfopenllm_v2/Pretergeek/OpenChat-3.5-0106_10.7B_48Layers-Appended/07417712-1933-4920-8964-67ba74bf6d01.json delete mode 100644 data/hfopenllm_v2/Pretergeek/OpenChat-3.5-0106_10.7B_48Layers-Interleaved/ae4cc05d-a65a-4f18-a99c-f133603686d1.json delete mode 100644 data/hfopenllm_v2/Pretergeek/OpenChat-3.5-0106_32K-PoSE/54df4d3e-0ef0-4e30-aa46-b47a4589a34c.json delete mode 100644 data/hfopenllm_v2/Pretergeek/OpenChat-3.5-0106_8.11B_36Layers-Appended/a717d466-9157-4991-8459-f39847d914a2.json delete mode 100644 data/hfopenllm_v2/Pretergeek/OpenChat-3.5-0106_8.11B_36Layers-Interleaved/15a8789b-27de-49d1-b3e5-9b1fc9b5694e.json delete mode 100644 data/hfopenllm_v2/Pretergeek/OpenChat-3.5-0106_8.99B_40Layers-Appended/921562fe-cc21-4ff3-93de-a62e1d4bf7e7.json delete mode 100644 data/hfopenllm_v2/Pretergeek/OpenChat-3.5-0106_8.99B_40Layers-Interleaved/863969d9-e567-43cc-a0a9-7f80eaba374a.json delete mode 100644 data/hfopenllm_v2/Pretergeek/OpenChat-3.5-0106_9.86B_44Layers-Appended/2987fa45-363e-4a07-8e9f-db01586a135b.json delete mode 100644 data/hfopenllm_v2/Pretergeek/openchat-3.5-0106_Rebased_Mistral-7B-v0.2/3488de21-d9a6-49e8-ba8f-d9beee9bdabe.json delete mode 100644 data/hfopenllm_v2/PrimeIntellect/INTELLECT-1-Instruct/0cacf042-6b62-4b67-8821-97cd703788d0.json delete mode 100644 data/hfopenllm_v2/PrimeIntellect/INTELLECT-1/9f0dfceb-1332-447a-bf6f-6c6c40686a6f.json delete mode 100644 data/hfopenllm_v2/PrimeIntellect/INTELLECT-1/c1308f95-6d55-4ff6-b14e-1bd09b467d99.json delete mode 100644 data/hfopenllm_v2/PuxAI/LUA_model/4ab16120-8d39-4dea-aa76-5c249506848d.json delete mode 100644 data/hfopenllm_v2/PygmalionAI/pygmalion-6b/f9647ea0-6464-4aa0-b1ea-a994a7bcca3c.json delete mode 100644 data/hfopenllm_v2/Q-bert/MetaMath-1B/c5ef47ab-2e73-43d6-b9ea-1ee7e50d9df8.json delete mode 100644 data/hfopenllm_v2/Quazim0t0/1up-14b/9ef7a4a0-b751-45ff-ab1f-d50687a3f4c3.json delete mode 100644 data/hfopenllm_v2/Quazim0t0/Adamant-14B-sce/8b303795-557b-4fa1-bbc6-d36bd77ee739.json delete mode 100644 data/hfopenllm_v2/Quazim0t0/Alice-14B/7fec288e-0b0d-45c0-b0e6-17b905cd7ea3.json delete mode 100644 data/hfopenllm_v2/Quazim0t0/Alien-CoT-14B-sce/5a09783b-82da-43ae-a607-2cfea550d931.json delete mode 100644 data/hfopenllm_v2/Quazim0t0/Aura-8B-Linear/6c2d191a-a2d1-459c-b2e2-5766bec62ce7.json delete mode 100644 data/hfopenllm_v2/Quazim0t0/Casa-14b-sce/121cb5fc-2fa2-4718-b325-c40014802e40.json delete mode 100644 data/hfopenllm_v2/Quazim0t0/Casa-14b-sce/8bbfa040-b16e-4116-ad3e-b3e4e58a7de6.json delete mode 100644 data/hfopenllm_v2/Quazim0t0/Charlie-8B-Linear/c8891914-c9fb-4b4d-9592-826f04520e7b.json delete mode 100644 data/hfopenllm_v2/Quazim0t0/Chromatic-8b-sce/e77ffcb3-c7d8-4700-b4ea-fe4e5ba94223.json delete mode 100644 data/hfopenllm_v2/Quazim0t0/CoT_Phi/da237415-f34e-4cbb-9a94-3ff621f3df8d.json delete mode 100644 data/hfopenllm_v2/Quazim0t0/Dyson-14b/479f3bfa-d614-46a9-88c7-9891852b0d8c.json delete mode 100644 
data/hfopenllm_v2/Quazim0t0/Edu-14B-Linear/f5f0c7da-fb03-4023-81a7-801b0729a19d.json delete mode 100644 data/hfopenllm_v2/Quazim0t0/Fugazi14b/40f51424-2922-498d-bbbc-d500667a8554.json delete mode 100644 data/hfopenllm_v2/Quazim0t0/GZA-14B-sce/4f25d177-6bcf-4864-87a4-1beb21a7373d.json delete mode 100644 data/hfopenllm_v2/Quazim0t0/Geedorah-14B/b160ab1f-be6b-4dfa-8fa9-36fc65a64782.json delete mode 100644 data/hfopenllm_v2/Quazim0t0/GivingTree-8b-sce/d497a7e3-11c2-4e0c-8788-091caabede56.json delete mode 100644 data/hfopenllm_v2/Quazim0t0/GuiltySpark-14B-ties/4a55bcf2-e1c1-4fce-8f79-472dae869b26.json delete mode 100644 data/hfopenllm_v2/Quazim0t0/Halo-14B-sce/5b00dd5e-0ad3-4ea0-aa0d-2327d610e6a6.json delete mode 100644 data/hfopenllm_v2/Quazim0t0/Heretic1.5b/1c80d383-1ccb-4f32-a63d-dd3954fe5f6b.json delete mode 100644 data/hfopenllm_v2/Quazim0t0/Hyde-14b-sce/75065074-7ef6-41ac-be7c-496cc458640a.json delete mode 100644 data/hfopenllm_v2/Quazim0t0/Imagine-v0.5-16bit/49a0287b-48d7-44db-bf20-a084919d332f.json delete mode 100644 data/hfopenllm_v2/Quazim0t0/Imbue-14b/7b2861ee-58f9-4ac9-99ee-2ec663e1b157.json delete mode 100644 data/hfopenllm_v2/Quazim0t0/Insom/628542f9-fac6-42a7-8ec5-5cd93f977a7e.json delete mode 100644 data/hfopenllm_v2/Quazim0t0/InspectorDeck-14B-sce/5b0924ae-cf52-4245-a687-91e4b1742c16.json delete mode 100644 data/hfopenllm_v2/Quazim0t0/Jekyl-8b-sce/459c2b98-c3af-4334-a4bc-13334efe49b8.json delete mode 100644 data/hfopenllm_v2/Quazim0t0/Jigsaw-14B-Linear/b2780aa3-d299-4180-8441-dd54e94255cb.json delete mode 100644 data/hfopenllm_v2/Quazim0t0/Katana-8b-sce/f55d398d-0555-4e89-a37c-def04741a0dd.json delete mode 100644 data/hfopenllm_v2/Quazim0t0/Knot-CoT-14B-sce/63caf8f8-9e55-4ef6-ae76-ee7184a50675.json delete mode 100644 data/hfopenllm_v2/Quazim0t0/Lineage-14B/f82ccde3-bd3b-499c-8b8c-182822392cea.json delete mode 100644 data/hfopenllm_v2/Quazim0t0/Lo-Phi-14b/8a52fb4a-d6ae-4c8d-aed0-2137e0a83ea1.json delete mode 100644 data/hfopenllm_v2/Quazim0t0/Loke-14B-sce/b7cbc2fb-2c52-4c13-9266-52103421f2ee.json delete mode 100644 data/hfopenllm_v2/Quazim0t0/MFDOOM-14B/f4474361-e897-4dbb-a89e-5451a4724474.json delete mode 100644 data/hfopenllm_v2/Quazim0t0/MFGRIMM-14B/de257b5e-4629-4f8a-b08d-d2ca372593e2.json delete mode 100644 data/hfopenllm_v2/Quazim0t0/Math_Phi4_Reason/a37aada3-104a-488a-898f-245ff257de46.json delete mode 100644 data/hfopenllm_v2/Quazim0t0/Mithril-14B-sce/d9d655d1-d94c-483a-a3a2-ca196e1391d1.json delete mode 100644 data/hfopenllm_v2/Quazim0t0/Mononoke-14B-sce/77bf7126-0cb9-43ef-8d23-5f1395f91642.json delete mode 100644 data/hfopenllm_v2/Quazim0t0/Motion-8B-Linear/73f410be-3084-4994-8406-f8ac70880626.json delete mode 100644 data/hfopenllm_v2/Quazim0t0/Mouse-9B/24caad7a-15fa-4820-91cc-0f544a34d173.json delete mode 100644 data/hfopenllm_v2/Quazim0t0/Nova-14b-sce/e087b221-f813-4688-8d98-17980f98ac5b.json delete mode 100644 data/hfopenllm_v2/Quazim0t0/NovaScotia-14b-stock/f4d03bff-3b34-497f-a17f-0379bc562f11.json delete mode 100644 data/hfopenllm_v2/Quazim0t0/ODB-14B-sce/2ca21612-ea90-41f3-b618-3ea81c09c3ae.json delete mode 100644 data/hfopenllm_v2/Quazim0t0/ODB-14B-sce/d4dc2088-9911-4966-afe9-022df89dd522.json delete mode 100644 data/hfopenllm_v2/Quazim0t0/Oasis-14B-ties/ad03a075-8f24-46f6-ae04-5a04eb7061c1.json delete mode 100644 data/hfopenllm_v2/Quazim0t0/Origami-14B-sce/2d1da226-e65c-48a0-aabb-46b1cf670a82.json delete mode 100644 data/hfopenllm_v2/Quazim0t0/Phi4.Turn.R1Distill.16bit/7fb3a035-2b83-4a58-818f-16fe6d9a8ab3.json delete mode 100644 
data/hfopenllm_v2/Quazim0t0/Phi4.Turn.R1Distill_v1.5.1-Tensors/87018726-9f81-47b1-883e-609afea7fb37.json delete mode 100644 data/hfopenllm_v2/Quazim0t0/Phi4Basis-14B-sce/292b9333-96c7-4fc7-bf35-78bbce9f10d3.json delete mode 100644 data/hfopenllm_v2/Quazim0t0/Ponder-14B-linear/b44224c3-ed2c-4120-9e2a-e6286358a4da.json delete mode 100644 data/hfopenllm_v2/Quazim0t0/RZA-14B-sce/f7a2c9af-c55c-4307-bfef-1ca709525d82.json delete mode 100644 data/hfopenllm_v2/Quazim0t0/Rosemary-14b/d9655f35-edfd-4c53-b359-559870e8019e.json delete mode 100644 data/hfopenllm_v2/Quazim0t0/Rune-14b/afdd962d-652a-4395-92f7-c16dc874a779.json delete mode 100644 data/hfopenllm_v2/Quazim0t0/SZA-14B-sce/2594e917-3ebd-428b-8f36-cb0da668695d.json delete mode 100644 data/hfopenllm_v2/Quazim0t0/Sake-20b/91a86644-ad96-4c66-8691-1c0b531b572c.json delete mode 100644 data/hfopenllm_v2/Quazim0t0/Spok-14b-sce/331f56ce-5e45-46d8-9143-3f66be20b699.json delete mode 100644 data/hfopenllm_v2/Quazim0t0/Sumatra-20b/6138ebe0-8483-4cfb-8d95-b334bb09e831.json delete mode 100644 data/hfopenllm_v2/Quazim0t0/SuperNova14b/4d16dd47-42d1-4ea6-8f1b-dc50648bceab.json delete mode 100644 data/hfopenllm_v2/Quazim0t0/TB0-8B-sce/a6b0f2bf-08da-472f-b858-8be967a44cdc.json delete mode 100644 data/hfopenllm_v2/Quazim0t0/TBL-8B-sce/57c7553d-f3e5-4a31-8c16-66aae570d8ec.json delete mode 100644 data/hfopenllm_v2/Quazim0t0/ThinkPhi1.1-Tensors/58c31bdd-f86f-4fbb-8549-191bb9f46f02.json delete mode 100644 data/hfopenllm_v2/Quazim0t0/Venti-20b/dd25c1dd-0edf-44ca-b18c-633dbd47368f.json delete mode 100644 data/hfopenllm_v2/Quazim0t0/Venti-Blend-sce/2a030613-b5f7-4393-ac39-d2d072c913dc.json delete mode 100644 data/hfopenllm_v2/Quazim0t0/Vine-14b-sce/f8c73290-c400-4f1f-a00a-516592497b0d.json delete mode 100644 data/hfopenllm_v2/Quazim0t0/Wendy-14B/b31908fc-5e7e-45d6-835f-4e86a05b23fb.json delete mode 100644 data/hfopenllm_v2/Quazim0t0/Wu-14b-sce/4320cb98-7f9f-4510-bb88-448ce231bae8.json delete mode 100644 data/hfopenllm_v2/Quazim0t0/bloom-14b-stock/28b986d1-2e67-4462-9165-6cb8f260b6c6.json delete mode 100644 data/hfopenllm_v2/Quazim0t0/caramel-14B/fe1e21cb-7934-4022-a74a-777172310021.json delete mode 100644 data/hfopenllm_v2/Quazim0t0/graphite-14b-sce/90871638-b828-484d-8822-95ffceb20909.json delete mode 100644 data/hfopenllm_v2/Quazim0t0/mocha-14B/04a98dfb-8e96-444c-8df4-ed7cf72a26ea.json delete mode 100644 data/hfopenllm_v2/Quazim0t0/mosaic-14b-sce/8c5c22af-f230-4d34-b80d-f42ef27e1675.json delete mode 100644 data/hfopenllm_v2/Quazim0t0/tesseract-14b-stock/f3466a90-541b-4a08-a9c6-d5a79b2299b0.json delete mode 100644 data/hfopenllm_v2/Quazim0t0/time-14b-stock/ef9ee5ae-d92b-4143-af1b-d62a7c3c7fd4.json delete mode 100644 data/hfopenllm_v2/Qwen/QwQ-32B-Preview/859af708-ac37-4749-bc06-73d92338d1f5.json delete mode 100644 data/hfopenllm_v2/Qwen/QwQ-32B/e274380d-e0f7-47c3-afc3-e603e6cecf9e.json delete mode 100644 data/hfopenllm_v2/Qwen/Qwen1.5-0.5B-Chat/19810be8-ea81-4db5-9854-1830b05a5732.json delete mode 100644 data/hfopenllm_v2/Qwen/Qwen1.5-0.5B/1258c282-3672-4b42-9d4d-117568e17bf5.json delete mode 100644 data/hfopenllm_v2/Qwen/Qwen1.5-1.8B-Chat/9b9f6e01-238e-4893-b398-4e1c83c44dfa.json delete mode 100644 data/hfopenllm_v2/Qwen/Qwen1.5-1.8B/b267621b-dbba-4c4a-bb9f-fa85734d0f59.json delete mode 100644 data/hfopenllm_v2/Qwen/Qwen1.5-110B-Chat/a7e4e787-8e95-48a0-9d50-53ba9f05cd1c.json delete mode 100644 data/hfopenllm_v2/Qwen/Qwen1.5-110B/3d39dcab-55df-4ad3-bdc8-03ae684e4390.json delete mode 100644 
data/hfopenllm_v2/Qwen/Qwen1.5-14B-Chat/1b499881-9edb-4626-a919-977393d6bef1.json delete mode 100644 data/hfopenllm_v2/Qwen/Qwen1.5-14B/84b8970c-6c29-4ee1-93b8-c97e4a7c4950.json delete mode 100644 data/hfopenllm_v2/Qwen/Qwen1.5-32B-Chat/2e070663-2622-4a8e-bd39-7f0ef9df399e.json delete mode 100644 data/hfopenllm_v2/Qwen/Qwen1.5-32B/047fa91e-2dc7-4881-8254-3dfbd4a2ff1b.json delete mode 100644 data/hfopenllm_v2/Qwen/Qwen1.5-4B-Chat/6d73016e-078e-4ffe-b2ae-5b829d1456df.json delete mode 100644 data/hfopenllm_v2/Qwen/Qwen1.5-4B/0b68b5bd-d22c-4194-9ddf-f22e9181f84d.json delete mode 100644 data/hfopenllm_v2/Qwen/Qwen1.5-7B-Chat/03d51d90-fd15-42b7-ad5f-c7326cc642a7.json delete mode 100644 data/hfopenllm_v2/Qwen/Qwen1.5-7B/d3e5c939-c53a-49d6-80cd-34420dbb176a.json delete mode 100644 data/hfopenllm_v2/Qwen/Qwen1.5-MoE-A2.7B-Chat/ab321358-26f9-4577-a5fb-1f5d4b8784b4.json delete mode 100644 data/hfopenllm_v2/Qwen/Qwen1.5-MoE-A2.7B/a43aae68-f12c-4a6d-b846-c498cf35f6cd.json delete mode 100644 data/hfopenllm_v2/Qwen/Qwen2-0.5B-Instruct/b84615c0-43c4-49ec-83fe-5d3f8e6026af.json delete mode 100644 data/hfopenllm_v2/Qwen/Qwen2-0.5B/7e687d24-9e12-4ecf-b283-e222efb9473a.json delete mode 100644 data/hfopenllm_v2/Qwen/Qwen2-1.5B-Instruct/4aea143c-28fd-48bb-b911-37ac3fe58220.json delete mode 100644 data/hfopenllm_v2/Qwen/Qwen2-1.5B/34a8daec-bfff-4cf4-9011-0542b30c1d10.json delete mode 100644 data/hfopenllm_v2/Qwen/Qwen2-57B-A14B-Instruct/3e919d7b-53db-41fb-ac93-224e2768b9c6.json delete mode 100644 data/hfopenllm_v2/Qwen/Qwen2-57B-A14B/66becca1-d92b-409f-ab56-44d05cac66fd.json delete mode 100644 data/hfopenllm_v2/Qwen/Qwen2-72B-Instruct/6293b269-7c4c-44da-bd85-e51954c173a1.json delete mode 100644 data/hfopenllm_v2/Qwen/Qwen2-72B/add3b058-e7bc-4b7b-bb98-0d7039979072.json delete mode 100644 data/hfopenllm_v2/Qwen/Qwen2-7B-Instruct/db0b6b3f-e5a9-4367-ab87-e58d5c6ccd81.json delete mode 100644 data/hfopenllm_v2/Qwen/Qwen2-7B/54b055d0-80ae-4bba-b729-bd77b3ec7502.json delete mode 100644 data/hfopenllm_v2/Qwen/Qwen2-Math-72B-Instruct/5c22d0b3-5082-4c6e-865c-71da03cf9378.json delete mode 100644 data/hfopenllm_v2/Qwen/Qwen2-Math-7B/f8e5ee9f-519d-4ed8-bd2a-88897075f401.json delete mode 100644 data/hfopenllm_v2/Qwen/Qwen2-VL-72B-Instruct/b74c3215-7bd5-42d1-9193-f4c9c6a8bec2.json delete mode 100644 data/hfopenllm_v2/Qwen/Qwen2-VL-7B-Instruct/27df1e06-463b-4519-87eb-a1666ad3f98c.json delete mode 100644 data/hfopenllm_v2/Qwen/Qwen2.5-0.5B-Instruct/9d975b05-7bee-462d-a33a-afa0d5af94d4.json delete mode 100644 data/hfopenllm_v2/Qwen/Qwen2.5-0.5B-Instruct/9ef9135a-473e-43a5-a460-fd3ec50226f9.json delete mode 100644 data/hfopenllm_v2/Qwen/Qwen2.5-0.5B/c57cae01-328e-447b-8945-e3cd2c4b8a7b.json delete mode 100644 data/hfopenllm_v2/Qwen/Qwen2.5-1.5B-Instruct/494c86cf-7f37-49d8-8160-b81859552c87.json delete mode 100644 data/hfopenllm_v2/Qwen/Qwen2.5-1.5B/6de5e76e-4297-4bcd-b06e-f63fa28da0e0.json delete mode 100644 data/hfopenllm_v2/Qwen/Qwen2.5-14B-Instruct-1M/9b10cd14-82f3-4b36-a4be-5092127d68c3.json delete mode 100644 data/hfopenllm_v2/Qwen/Qwen2.5-14B-Instruct/bbd94181-0523-4543-80a7-056b041e03b7.json delete mode 100644 data/hfopenllm_v2/Qwen/Qwen2.5-14B/e10d8573-e201-460e-a931-49a1b13ceeea.json delete mode 100644 data/hfopenllm_v2/Qwen/Qwen2.5-32B-Instruct/e2ca9477-2414-4b8a-8d22-68f9ced54ae5.json delete mode 100644 data/hfopenllm_v2/Qwen/Qwen2.5-32B/831246b8-5433-48e6-ba11-8a4239373106.json delete mode 100644 data/hfopenllm_v2/Qwen/Qwen2.5-3B-Instruct/8277994c-8bf5-4ece-9f34-4fe9a4310bbf.json delete mode 100644 
data/hfopenllm_v2/Qwen/Qwen2.5-3B/5aabc7c5-eb3a-42e0-8b40-0a08004f6e1a.json delete mode 100644 data/hfopenllm_v2/Qwen/Qwen2.5-72B-Instruct/cbb73c83-ad94-4973-9bf5-a5e7ca4d1653.json delete mode 100644 data/hfopenllm_v2/Qwen/Qwen2.5-72B/3ed06a16-d5fe-43d3-a369-f4ed29fb3a5d.json delete mode 100644 data/hfopenllm_v2/Qwen/Qwen2.5-7B-Instruct-1M/fc817789-2f44-4d2b-b40e-2422fe33d104.json delete mode 100644 data/hfopenllm_v2/Qwen/Qwen2.5-7B-Instruct/5e1c8723-7c43-4d8f-8c7c-386c2eb6b9cf.json delete mode 100644 data/hfopenllm_v2/Qwen/Qwen2.5-7B/b6740747-19ac-4a9c-892f-6556013ddc8b.json delete mode 100644 data/hfopenllm_v2/Qwen/Qwen2.5-Coder-14B-Instruct/3263ab46-09ae-4c24-9332-b6874d0d0330.json delete mode 100644 data/hfopenllm_v2/Qwen/Qwen2.5-Coder-14B/a8706a7e-5693-4768-a955-a448549d2e77.json delete mode 100644 data/hfopenllm_v2/Qwen/Qwen2.5-Coder-32B-Instruct/3c932329-0440-4799-886f-10bc4a5aeb09.json delete mode 100644 data/hfopenllm_v2/Qwen/Qwen2.5-Coder-32B/b1e42d9d-827d-4109-8d1b-182694033b21.json delete mode 100644 data/hfopenllm_v2/Qwen/Qwen2.5-Coder-7B-Instruct/0c6f0d92-3ee0-48d7-b3fc-70149911a51d.json delete mode 100644 data/hfopenllm_v2/Qwen/Qwen2.5-Coder-7B-Instruct/73b07681-8e10-414e-8922-650908f9cf6a.json delete mode 100644 data/hfopenllm_v2/Qwen/Qwen2.5-Coder-7B/8b1549f8-0602-4538-842c-abe9dca7baff.json delete mode 100644 data/hfopenllm_v2/Qwen/Qwen2.5-Math-1.5B-Instruct/ad395ad4-0f9f-4b49-83c9-b89fa6b6dd89.json delete mode 100644 data/hfopenllm_v2/Qwen/Qwen2.5-Math-72B-Instruct/14c01681-fbef-49c4-b737-a7baaa02d393.json delete mode 100644 data/hfopenllm_v2/Qwen/Qwen2.5-Math-7B-Instruct/3ad495c0-da8e-4776-8d05-bc7dce1fe120.json delete mode 100644 data/hfopenllm_v2/Qwen/Qwen2.5-Math-7B/0762ca9e-f0d4-408e-9992-e91a10e0e65f.json delete mode 100644 data/hfopenllm_v2/RDson/WomboCombo-R1-Coder-14B-Preview/ec6c1d05-cea7-445c-bed3-9eee1e1ff03d.json delete mode 100644 data/hfopenllm_v2/RESMPDEV/EVA-Qwen2.5-1.5B-FRFR/1fc39812-77fb-4d0c-b9fb-706e94c40afe.json delete mode 100644 data/hfopenllm_v2/RESMPDEV/Qwen2-Wukong-0.5B/fdc3c502-53ad-4bf7-85ce-51eaed72754b.json delete mode 100644 data/hfopenllm_v2/RLHFlow/ArmoRM-Llama3-8B-v0.1/3f74c1c7-f349-4193-95cf-b0033112fea0.json delete mode 100644 data/hfopenllm_v2/RLHFlow/LLaMA3-iterative-DPO-final/36a803da-83ab-4c49-8855-9344aaa7a68b.json delete mode 100644 data/hfopenllm_v2/RWKV/rwkv-raven-14b/df986996-249e-49f9-b074-91e8dcdf62e2.json delete mode 100644 data/hfopenllm_v2/Rakuten/RakutenAI-2.0-mini-instruct/90f007e9-e323-4a82-b276-ac1b928030ca.json delete mode 100644 data/hfopenllm_v2/Rakuten/RakutenAI-7B-chat/2b627f93-5cc7-4a5e-b682-d129396362e5.json delete mode 100644 data/hfopenllm_v2/Rakuten/RakutenAI-7B/2fde07ac-d218-4cc6-947e-8ceb87eedbee.json delete mode 100644 data/hfopenllm_v2/Replete-AI/L3-Pneuma-8B/2a141bfe-4632-4058-a232-1f2c5540c41f.json delete mode 100644 data/hfopenllm_v2/Replete-AI/L3.1-Pneuma-8B/fa2d74a5-e8f6-4a1c-9310-a9b16c2e59d1.json delete mode 100644 data/hfopenllm_v2/Replete-AI/Llama3-8B-Instruct-Replete-Adapted/c7c0ceff-9273-4cc3-8f8e-bd93181590ba.json delete mode 100644 data/hfopenllm_v2/Replete-AI/Replete-Coder-Instruct-8b-Merged/c439478a-1734-4038-aa8b-bb2d12ec022d.json delete mode 100644 data/hfopenllm_v2/Replete-AI/Replete-Coder-Llama3-8B/4a36f73a-9495-4ea2-863c-220b8ca6bf99.json delete mode 100644 data/hfopenllm_v2/Replete-AI/Replete-Coder-Qwen2-1.5b/faa9d3b9-343a-4a9e-82c5-6bc81bc87b9c.json delete mode 100644 data/hfopenllm_v2/Replete-AI/Replete-LLM-Qwen2-7b/a55bf380-d567-4228-b30c-57e9df31e844.json delete mode 100644 
data/hfopenllm_v2/Replete-AI/Replete-LLM-Qwen2-7b/dfd92311-4f3d-4355-8ccf-a59f29914b8f.json delete mode 100644 data/hfopenllm_v2/Replete-AI/Replete-LLM-Qwen2-7b_Beta-Preview/d98e190e-5b5f-46eb-b701-e32d2dbef3a0.json delete mode 100644 data/hfopenllm_v2/Replete-AI/Replete-LLM-V2-Llama-3.1-8b/32edb764-2a42-4efe-ac86-9eda81942b84.json delete mode 100644 data/hfopenllm_v2/RezVortex/JAJUKA-WEWILLNEVERFORGETYOU-3B/36855ebd-2030-4d5d-9c42-ca049244e694.json delete mode 100644 data/hfopenllm_v2/RezVortex/Jajuka-3b/9651a0a1-4004-42f3-ad8f-2aebb38ec967.json delete mode 100644 data/hfopenllm_v2/Ro-xe/FMixIA-7B-DARE-0/a59e55dc-e2b5-43be-8469-49eee0e98d55.json delete mode 100644 data/hfopenllm_v2/Ro-xe/FMixIA-7B-SLERP-27/a956e306-f184-4dbc-ac7a-3793ae735801.json delete mode 100644 data/hfopenllm_v2/Ro-xe/FMixIA-7B-TIES-1/c05cc6ce-12fd-491d-b41b-57cc14b6d34a.json delete mode 100644 data/hfopenllm_v2/Ro-xe/FMixIA-FrankenMerge-9.5B-PT-9/415875b7-fe10-47e7-aca0-029c2f51c067.json delete mode 100644 data/hfopenllm_v2/Rombo-Org/Rombo-LLM-V2.5-Qwen-7b/c505ee64-3d3b-48e2-9c8a-f59609a758e9.json delete mode 100644 data/hfopenllm_v2/RubielLabarta/LogoS-7Bx2-MoE-13B-v0.2/00003185-c291-40c5-bba1-f87eae0afc08.json delete mode 100644 data/hfopenllm_v2/SaisExperiments/Evil-Alpaca-3B-L3.2/328f61d7-677b-4a06-b464-0da42153f9ae.json delete mode 100644 data/hfopenllm_v2/SaisExperiments/Gemma-2-2B-Opus-Instruct/9cb5b8fd-062c-4161-9301-640980d21b9f.json delete mode 100644 data/hfopenllm_v2/SaisExperiments/Gemma-2-2B-Stheno-Filtered/09284b75-a2f9-40ea-8135-7aa61c626fa2.json delete mode 100644 data/hfopenllm_v2/SaisExperiments/Not-So-Small-Alpaca-24B/e2502331-6ac3-43bc-8218-259b44333283.json delete mode 100644 data/hfopenllm_v2/SaisExperiments/QwOwO-7B-V1/8dde454d-aa48-4ee1-b5c6-f3353087d492.json delete mode 100644 data/hfopenllm_v2/SaisExperiments/RightSheep-Llama3.2-3B/662c8ed2-2407-4606-ac1e-ec7ade185d2d.json delete mode 100644 data/hfopenllm_v2/Sakalti/Anemoi-3B/332aef8c-7c62-463e-ba3c-07ae0205d457.json delete mode 100644 data/hfopenllm_v2/Sakalti/Euphrates-14B/cfdfcf21-e445-430e-a295-946cb8c3fce9.json delete mode 100644 data/hfopenllm_v2/Sakalti/Llama3.2-3B-Uranus-1/a5606b92-aa2d-44e3-a92c-47d0b38fef9c.json delete mode 100644 data/hfopenllm_v2/Sakalti/Magro-7B-v1.1/465d473c-ef28-4725-8cac-02f2a031b22c.json delete mode 100644 data/hfopenllm_v2/Sakalti/Neptuno-3B/2c636544-8676-4eee-8bcd-d623be0275be.json delete mode 100644 data/hfopenllm_v2/Sakalti/Neptuno-Alpha/8b332fac-1cfa-498b-853a-52ec5492ddc7.json delete mode 100644 data/hfopenllm_v2/Sakalti/Oxyge1-33B/2bf1b38b-e90b-4fa8-b19e-47d93ff9ab4e.json delete mode 100644 data/hfopenllm_v2/Sakalti/Phi3.5-Comets-3.8B/69bb0243-75b2-4858-ba6b-5e70cfb516a7.json delete mode 100644 data/hfopenllm_v2/Sakalti/Qwen2.5-1B-Instruct/4bb7e325-8741-4c09-81f6-9efdb30ef5a5.json delete mode 100644 data/hfopenllm_v2/Sakalti/QwenTest-7/87878b74-22ce-4554-914c-03e486d13de3.json delete mode 100644 data/hfopenllm_v2/Sakalti/SJT-0.5B/5030f8d4-f216-4f78-84f1-dd03b0324bb0.json delete mode 100644 data/hfopenllm_v2/Sakalti/SJT-1.5B-Alpha-1.1/c5e244fd-e85e-4fbb-9703-b8e733fb91bf.json delete mode 100644 data/hfopenllm_v2/Sakalti/SJT-1.5B-Alpha/38261a01-62df-42b2-9b1d-f924598e70ef.json delete mode 100644 data/hfopenllm_v2/Sakalti/SJT-1.7B/5736f0b5-3903-4774-a84a-c3db260d36e4.json delete mode 100644 data/hfopenllm_v2/Sakalti/SJT-14B/70134d58-972e-49c9-8cde-4ba2691d3dc3.json delete mode 100644 data/hfopenllm_v2/Sakalti/SJT-2.4B/d4bb1440-2064-4752-bcb3-c9cec234fd1b.json delete mode 100644 
data/hfopenllm_v2/Sakalti/SJT-24B-Alpha/d9e6059e-d20b-4465-b7ba-2ee3a72562b6.json delete mode 100644 data/hfopenllm_v2/Sakalti/SJT-2B-V1.1/f8b02d65-c8a0-43eb-b48e-d1e1f7f363d6.json delete mode 100644 data/hfopenllm_v2/Sakalti/SJT-2B/7bf23db0-877c-4700-95c8-e35dee5e57b4.json delete mode 100644 data/hfopenllm_v2/Sakalti/SJT-3.7B/07f8351e-c7c6-463f-9e91-ee1d3bb2b35c.json delete mode 100644 data/hfopenllm_v2/Sakalti/SJT-4B/8535ffae-f39d-46ed-89bb-a1656885db91.json delete mode 100644 data/hfopenllm_v2/Sakalti/SJT-7.5B/5e832121-9a67-44d9-973d-fffdb1b37975.json delete mode 100644 data/hfopenllm_v2/Sakalti/SJT-7B-V1.1-Multilingal/92d3f67d-a026-49e3-a440-68c10fb358ae.json delete mode 100644 data/hfopenllm_v2/Sakalti/SJT-7B-V1.1/9d0baaef-bd31-4a96-bb2a-e92b62b748d2.json delete mode 100644 data/hfopenllm_v2/Sakalti/SJT-8B-V1.1/489e8e84-5e30-46fa-a421-f52308f051e7.json delete mode 100644 data/hfopenllm_v2/Sakalti/SJT-8B/a208f807-c930-4e81-8ebd-dcbb4db76442.json delete mode 100644 data/hfopenllm_v2/Sakalti/SJT-900M/4956539d-a255-4c56-877f-257e463fa3e4.json delete mode 100644 data/hfopenllm_v2/Sakalti/SJT-Moe2x7.5B/3451eb65-020c-4e34-9128-7410e6b293cd.json delete mode 100644 data/hfopenllm_v2/Sakalti/SJTPass-2/b5cd0061-e4dd-4049-a51e-b16490e69120.json delete mode 100644 data/hfopenllm_v2/Sakalti/SJTPass-4/c4686af6-0b7b-4df3-9152-14a3ef087b7f.json delete mode 100644 data/hfopenllm_v2/Sakalti/SJTPass-5/155885ca-11e7-4cd2-b26c-53e001e2a6f9.json delete mode 100644 data/hfopenllm_v2/Sakalti/Saba-Passthrough-2/d9ca5411-def6-43b3-a522-595131d8e5e6.json delete mode 100644 data/hfopenllm_v2/Sakalti/Saba1-1.8B/e54553ab-0897-4cb5-9213-5bb72758d2b5.json delete mode 100644 data/hfopenllm_v2/Sakalti/Saba1-7B/eed48cdc-18db-4c03-84bf-d2d50e3328b0.json delete mode 100644 data/hfopenllm_v2/Sakalti/Saba1.5-1.5B/d7952aef-37e2-4c15-a1a4-598690773bbb.json delete mode 100644 data/hfopenllm_v2/Sakalti/Saba1.5-Pro-3B/5e1e1376-bb22-4fc9-a1d6-3f2fe7d302b9.json delete mode 100644 data/hfopenllm_v2/Sakalti/Saba2-14B-Preview/cfdae559-f3f1-4a78-b4cc-fbfb8bb37b16.json delete mode 100644 data/hfopenllm_v2/Sakalti/Saba2-3B/a12208ce-e9e1-4476-8054-0d565efad92c.json delete mode 100644 data/hfopenllm_v2/Sakalti/Sailor-japanese/f46e1eeb-8b8b-4d47-9510-445109b5518b.json delete mode 100644 data/hfopenllm_v2/Sakalti/Saka-1.5B/7dc4970f-ce35-4ffa-9052-2ab40abb1e55.json delete mode 100644 data/hfopenllm_v2/Sakalti/Saka-14B/823e886a-1431-4078-81a3-4b941983461d.json delete mode 100644 data/hfopenllm_v2/Sakalti/Saka-24B/583609f0-de5b-43cd-a667-bb2c36679fd2.json delete mode 100644 data/hfopenllm_v2/Sakalti/Saka-7.2B/2d2cea8b-167e-4d63-b01c-537f372672f9.json delete mode 100644 data/hfopenllm_v2/Sakalti/Saka-7.6B/f584f596-3a17-404a-81a2-3033ad38cad6.json delete mode 100644 data/hfopenllm_v2/Sakalti/SakaMoe-3x1.6B-Instruct/ebb0930f-92be-4e1b-a2a6-779f69d2151c.json delete mode 100644 data/hfopenllm_v2/Sakalti/SakalFusion-7B-Alpha/b8926567-e208-442e-8ba8-c6dd4ecc5c4a.json delete mode 100644 data/hfopenllm_v2/Sakalti/SakalFusion-7B-Beta/4bf6efe1-81fc-48f6-96ba-8df9ffbef2f2.json delete mode 100644 data/hfopenllm_v2/Sakalti/Tara-3.8B-v1.1/05ffcb7a-2694-4276-bf45-73e1110bc494.json delete mode 100644 data/hfopenllm_v2/Sakalti/light-1.1-3B/dc3b944b-a57a-44ab-87ac-8e1882b7bcce.json delete mode 100644 data/hfopenllm_v2/Sakalti/light-3B/154f70b4-d77c-4d1b-b85c-bc81fe8162bd.json delete mode 100644 data/hfopenllm_v2/Sakalti/light-3b-beta/998316d2-389a-4ce0-b0b0-0430c1361de7.json delete mode 100644 
data/hfopenllm_v2/Sakalti/light-7b-beta/ce803cde-6e23-433c-a4d2-38c5cb5ba14b.json delete mode 100644 data/hfopenllm_v2/Sakalti/llama-3-yanyuedao-8b-instruct/2519485b-47cd-497c-a349-9e69db0266f3.json delete mode 100644 data/hfopenllm_v2/Sakalti/magro-7B/56d86e26-4ee6-4652-9b7b-a538238a24d4.json delete mode 100644 data/hfopenllm_v2/Sakalti/mergekit-01/416b89e4-5e8a-4131-9403-e8967a4127b8.json delete mode 100644 data/hfopenllm_v2/Sakalti/mergekit-della_linear-vmeykci/347a90e8-d8b7-4266-8242-ceac865796a0.json delete mode 100644 data/hfopenllm_v2/Sakalti/model-3/389f7ab8-b30e-4d0c-b9a4-625e74a1f73f.json delete mode 100644 data/hfopenllm_v2/Sakalti/qwen2.5-2.3B/6ae33b7f-53a1-45c5-8b0b-d462188c3f9d.json delete mode 100644 data/hfopenllm_v2/Sakalti/tara-3.8B/d96fb0b2-7cba-4cc4-a5f4-b8a451754857.json delete mode 100644 data/hfopenllm_v2/Sakalti/ultiima-14B-v0.2/f8d362f6-eafc-4d11-bc40-d169d69d3a95.json delete mode 100644 data/hfopenllm_v2/Sakalti/ultiima-14B-v0.3/4bacd3dd-44c2-42d8-98c0-3eeb920dc0f0.json delete mode 100644 data/hfopenllm_v2/Sakalti/ultiima-14B-v0.4/de073f45-0d14-4f8a-9d3b-d4fd961186b8.json delete mode 100644 data/hfopenllm_v2/Sakalti/ultiima-14B/fd88d234-b3f9-4f48-896c-af58f1a69880.json delete mode 100644 data/hfopenllm_v2/Sakalti/ultiima-32B/273745b1-3761-463e-b9ab-7860968064eb.json delete mode 100644 data/hfopenllm_v2/Sakalti/ultiima-72B-v1.5/101d84d3-e741-4eb2-bd8a-db6c12022fe2.json delete mode 100644 data/hfopenllm_v2/Sakalti/ultiima-72B/9c82deca-1998-4506-b038-c5dd592324d8.json delete mode 100644 data/hfopenllm_v2/Salesforce/LLaMA-3-8B-SFR-Iterative-DPO-R/da620a94-4c0d-4c50-9619-10e12001fb5d.json delete mode 100644 data/hfopenllm_v2/SanjiWatsuki/Kunoichi-DPO-v2-7B/51dade8f-34e7-4237-8691-22655249bf76.json delete mode 100644 data/hfopenllm_v2/SanjiWatsuki/Silicon-Maid-7B/cdd59385-0a54-4ca1-b24d-9316a70f2875.json delete mode 100644 data/hfopenllm_v2/Sao10K/70B-L3.3-Cirrus-x1/514a3103-e8a1-49e8-b9da-a85963f5b3dd.json delete mode 100644 data/hfopenllm_v2/Sao10K/Fimbulvetr-11B-v2/daafaafa-1e00-4433-95f3-91c169598ebd.json delete mode 100644 data/hfopenllm_v2/Sao10K/L3-70B-Euryale-v2.1/50e53ad5-8693-44c1-b5c7-45b91d7e0ae4.json delete mode 100644 data/hfopenllm_v2/Sao10K/L3-70B-Euryale-v2.1/bda5d02f-7973-41a3-8f8e-4e33a12b74e0.json delete mode 100644 data/hfopenllm_v2/Sao10K/L3-8B-Lunaris-v1/99ff5ca5-4409-4d9c-9ec0-4cf392afeff2.json delete mode 100644 data/hfopenllm_v2/Sao10K/L3-8B-Niitama-v1/362f5875-4dbc-4e68-90ce-789f692bb533.json delete mode 100644 data/hfopenllm_v2/Sao10K/L3-8B-Stheno-v3.2/fdb5faf6-2cdd-42bb-b154-d6e93b2348bf.json delete mode 100644 data/hfopenllm_v2/Sao10K/L3-8B-Stheno-v3.3-32K/93f829b8-b8d9-4389-a210-2a38c3a30edb.json delete mode 100644 data/hfopenllm_v2/Sao10K/MN-12B-Lyra-v3/6ec3554d-377b-4bf6-88ef-8a4c9e70f485.json delete mode 100644 data/hfopenllm_v2/Saxo/Linkbricks-Horizon-AI-Avengers-V1-32B/70d749cf-2e92-4847-86de-7964fc8eb990.json delete mode 100644 data/hfopenllm_v2/Saxo/Linkbricks-Horizon-AI-Avengers-V2-32B/623f2b04-6cd7-4ea0-8844-badb0ff6c9c6.json delete mode 100644 data/hfopenllm_v2/Saxo/Linkbricks-Horizon-AI-Avengers-V3-32B/e1aca741-2765-4e47-b6a1-49f3d9532432.json delete mode 100644 data/hfopenllm_v2/Saxo/Linkbricks-Horizon-AI-Avengers-V4-32B/4f42366e-e6aa-4974-9a40-5781e350616d.json delete mode 100644 data/hfopenllm_v2/Saxo/Linkbricks-Horizon-AI-Avengers-V5-32B/4ec2231d-c012-4ad3-830c-8ff86c977202.json delete mode 100644 data/hfopenllm_v2/Saxo/Linkbricks-Horizon-AI-Avengers-V6-32B/1d2e5513-bd0c-4795-8487-f5266c6e368f.json delete mode 100644 
data/hfopenllm_v2/Saxo/Linkbricks-Horizon-AI-Korean-Avengers-V2-27B/104172b7-86f5-410a-a454-63e1cfbeb87f.json delete mode 100644 data/hfopenllm_v2/Saxo/Linkbricks-Horizon-AI-Korean-Avengers-V3-27B/d28e04ac-7d18-43fb-80b8-82c0662fec79.json delete mode 100644 data/hfopenllm_v2/Saxo/Linkbricks-Horizon-AI-Korean-Superb-22B/20bb3819-9d85-4d84-99ba-65e33965f0c5.json delete mode 100644 data/hfopenllm_v2/Saxo/Linkbricks-Horizon-AI-Korean-Superb-27B/3a4bdf58-0137-4d85-b567-59b3fed3dad5.json delete mode 100644 data/hfopenllm_v2/Saxo/Linkbricks-Horizon-AI-Superb-27B/04f843ba-947c-4732-979c-2aeae7d34e5a.json delete mode 100644 data/hfopenllm_v2/SeaLLMs/SeaLLM-7B-v2.5/173a31d3-7d12-4ab1-a963-005a81aee767.json delete mode 100644 data/hfopenllm_v2/SeaLLMs/SeaLLM-7B-v2/d0555736-b614-43ca-91d7-8264e3566872.json delete mode 100644 data/hfopenllm_v2/SeaLLMs/SeaLLMs-v3-7B-Chat/4b7b13b7-4aee-4462-87e6-aa6c15068236.json delete mode 100644 data/hfopenllm_v2/SenseLLM/ReflectionCoder-CL-34B/4b1f9ce5-bb12-42e3-b0e0-afaa784b0c4c.json delete mode 100644 data/hfopenllm_v2/SenseLLM/ReflectionCoder-DS-33B/acbcd5a5-bcd8-4209-b35f-425feada7e8b.json delete mode 100644 data/hfopenllm_v2/SentientAGI/Dobby-Mini-Leashed-Llama-3.1-8B/cb9a415f-1a02-46ad-a731-bf825ddd78ae.json delete mode 100644 data/hfopenllm_v2/SentientAGI/Dobby-Mini-Unhinged-Llama-3.1-8B/92cde6db-47f4-43c6-9ad5-643c35faa226.json delete mode 100644 data/hfopenllm_v2/SeppeV/SmolLM_pretrained_with_sft_trained_with_1pc_data_on_a_preference_dpo/5e88a037-f9bd-4b39-944f-f0781bb7884f.json delete mode 100644 data/hfopenllm_v2/Sharathhebbar24/SSH_355M/d4b08f5d-5add-49f4-b8db-c1a12e0a5313.json delete mode 100644 data/hfopenllm_v2/Sharathhebbar24/chat_gpt2_dpo/ac5adf39-f0a4-439b-9873-9141e0a554b1.json delete mode 100644 data/hfopenllm_v2/Shreyash2010/Uma-4x4B-Instruct-v0.1/62965c92-cdf4-4a3b-b035-990abaab615c.json delete mode 100644 data/hfopenllm_v2/Sicarius-Prototyping/Brainy_LLAMA/3866ece8-d70a-4061-9e86-0798ecd98bd6.json delete mode 100644 data/hfopenllm_v2/Sicarius-Prototyping/Micropenis_1B/ff484d0e-bb14-4a80-ae29-2351b03cf278.json delete mode 100644 data/hfopenllm_v2/Sicarius-Prototyping/bacon_and_food/06ac1718-fe71-4e05-a47f-1200e067336c.json delete mode 100644 data/hfopenllm_v2/SicariusSicariiStuff/2B-ad/4ddb1616-7889-45ef-96de-823fee338e1d.json delete mode 100644 data/hfopenllm_v2/SicariusSicariiStuff/2B_or_not_2B/487dd91b-5bc4-4355-90d3-c82ecc789ab3.json delete mode 100644 data/hfopenllm_v2/SicariusSicariiStuff/Dusk_Rainbow/a74e86d9-8b94-4f60-8f0c-73cc4b04d905.json delete mode 100644 data/hfopenllm_v2/SicariusSicariiStuff/Eximius_Persona_5B/9a9239ab-9e0e-449b-bd1b-6ec280fad505.json delete mode 100644 data/hfopenllm_v2/SicariusSicariiStuff/Impish_LLAMA_3B/2c710cd5-75a6-46b7-8356-212da7bf864d.json delete mode 100644 data/hfopenllm_v2/SicariusSicariiStuff/Impish_Mind_8B/377d5240-73b5-48d0-bbdc-0960ad1d9069.json delete mode 100644 data/hfopenllm_v2/SicariusSicariiStuff/Impish_QWEN_14B-1M/9f31a6da-c5bd-4143-b2f9-715c0e9f7b74.json delete mode 100644 data/hfopenllm_v2/SicariusSicariiStuff/Impish_QWEN_7B-1M/104a0157-c614-44cf-b6cc-9f15dab4b187.json delete mode 100644 data/hfopenllm_v2/SicariusSicariiStuff/LLAMA-3_8B_Unaligned_BETA/bb379093-c169-44bd-ac86-edb8ab8fc225.json delete mode 100644 data/hfopenllm_v2/SicariusSicariiStuff/Phi-Line_14B/e29001c0-17c0-4deb-8ca2-ce9ad06d8cb3.json delete mode 100644 data/hfopenllm_v2/SicariusSicariiStuff/Phi-lthy4/43d87bf5-2620-4f8e-a8b6-f86fc157d987.json delete mode 100644 
data/hfopenllm_v2/SicariusSicariiStuff/Qwen2.5-14B_Uncencored/735d9d75-d9d1-4553-b7cf-f8e7c2e65218.json delete mode 100644 data/hfopenllm_v2/SicariusSicariiStuff/Qwen2.5-14B_Uncensored/0c6dcc87-343c-4973-a589-3e3393829184.json delete mode 100644 data/hfopenllm_v2/SicariusSicariiStuff/Qwen2.5-14B_Uncensored_Instruct/7c1d1657-e9ae-433f-be9d-523431bfc7ae.json delete mode 100644 data/hfopenllm_v2/SicariusSicariiStuff/Redemption_Wind_24B/0b2d9a65-c028-4f4b-a280-dc0c35ac9516.json delete mode 100644 data/hfopenllm_v2/SicariusSicariiStuff/Winged_Imp_8B/e87e1d3f-1476-499d-a9f3-b6463b429262.json delete mode 100644 data/hfopenllm_v2/SicariusSicariiStuff/Wingless_Imp_8B/246e8450-3c53-4bde-99bb-5663f751e88e.json delete mode 100644 data/hfopenllm_v2/SicariusSicariiStuff/Zion_Alpha/496b9e45-2f64-456e-b35e-12a94c5643b1.json delete mode 100644 data/hfopenllm_v2/SicariusSicariiStuff/dn_ep02/05890047-a95a-433e-b6b6-fb037592cdd1.json delete mode 100644 data/hfopenllm_v2/SkyOrbis/SKY-Ko-Llama3.1-8B-lora-epoch1/4a30580c-1d25-49d4-984d-2d28ef3a5656.json delete mode 100644 data/hfopenllm_v2/SkyOrbis/SKY-Ko-Llama3.1-8B-lora/696d7966-d140-4f43-91df-54f02247b34f.json delete mode 100644 data/hfopenllm_v2/SkyOrbis/SKY-Ko-Llama3.2-1B-lora-epoch3/fdf10ab8-e3f9-49e6-8fd0-ed116868c217.json delete mode 100644 data/hfopenllm_v2/SkyOrbis/SKY-Ko-Llama3.2-1B-lora-epoch5/9ac16d1f-d894-414d-8a14-110e971d0ba6.json delete mode 100644 data/hfopenllm_v2/SkyOrbis/SKY-Ko-Llama3.2-1B-lora-v2-epoch3/2eb01e0e-8f7b-4956-9a2d-b32ecaa936f6.json delete mode 100644 data/hfopenllm_v2/SkyOrbis/SKY-Ko-Llama3.2-1B-lora-v2-epoch5/3b221b0e-6158-471f-bcd2-b09514f28bd7.json delete mode 100644 data/hfopenllm_v2/SkyOrbis/SKY-Ko-Llama3.2-3B-lora-epoch1/c8af8428-aab6-4d19-b185-2b437c0334fa.json delete mode 100644 data/hfopenllm_v2/SkyOrbis/SKY-Ko-Llama3.2-3B-lora-epoch2/c617d12b-c37f-47ef-9704-e19774c67aeb.json delete mode 100644 data/hfopenllm_v2/SkyOrbis/SKY-Ko-Llama3.2-3B-lora-epoch3/577f31e2-1808-45e2-a528-5933019cfa85.json delete mode 100644 data/hfopenllm_v2/SkyOrbis/SKY-Ko-Qwen2.5-3B-Instruct/7bd7f5c8-be9e-473e-be18-03ad22a195ee.json delete mode 100644 data/hfopenllm_v2/SkyOrbis/SKY-Ko-Qwen2.5-7B-Instruct-SFT-step-15000/5036a549-5583-4775-935a-1a12b6de3e7d.json delete mode 100644 data/hfopenllm_v2/SkyOrbis/SKY-Ko-Qwen2.5-7B-Instruct-SFT-step-5000/5c0ffff9-542c-424e-88e9-89584e686e12.json delete mode 100644 data/hfopenllm_v2/Skywork/Skywork-Reward-Gemma-2-27B-v0.2/5c6a045d-2c90-4938-9185-9c1a0f82903a.json delete mode 100644 data/hfopenllm_v2/Skywork/Skywork-o1-Open-Llama-3.1-8B/02480176-2058-4e71-a970-9698be8d235e.json delete mode 100644 data/hfopenllm_v2/Solshine/Brimful-merged-replete/4be1e5b4-254c-4287-907d-cc845042de37.json delete mode 100644 data/hfopenllm_v2/Solshine/Llama-3-1-big-thoughtful-passthrough-merge-2/21b51852-5cad-414e-92d5-31878f025d67.json delete mode 100644 data/hfopenllm_v2/Sorawiz/Gemma-9B-Base/9eb07d4a-1f01-4696-9137-d477ffca43be.json delete mode 100644 data/hfopenllm_v2/Sorawiz/Gemma-Creative-9B-Base/4236485b-aa92-4bc4-a652-17ed3231ecf4.json delete mode 100644 data/hfopenllm_v2/Sourjayon/DeepSeek-R1-8b-Sify/9c0d6b71-8c6a-4294-961c-972a002b847f.json delete mode 100644 data/hfopenllm_v2/Sourjayon/DeepSeek-R1-ForumNXT/d1e906d5-8f0d-49c2-88c3-cf71774de600.json delete mode 100644 data/hfopenllm_v2/SpaceYL/ECE_Poirot/798e4f83-6262-4d5b-a854-6ff114167209.json delete mode 100644 data/hfopenllm_v2/Spestly/Athena-1-3B/dd2603d5-e99e-4778-95d0-159c788626cf.json delete mode 100644 
data/hfopenllm_v2/Spestly/Atlas-Pro-1.5B-Preview/41c71990-e79d-447f-b082-63c96fd67a1f.json delete mode 100644 data/hfopenllm_v2/Spestly/Atlas-Pro-7B-Preview/b9e25948-2871-4b6c-933b-8a731e48e81b.json delete mode 100644 data/hfopenllm_v2/Stark2008/GutenLaserPi/7c70df74-2bc2-40e0-b0f4-77be1a7e044c.json delete mode 100644 data/hfopenllm_v2/Stark2008/LayleleFlamPi/ea71bdd5-3aa1-4d26-9256-5aeb2f79fa8c.json delete mode 100644 data/hfopenllm_v2/Stark2008/VisFlamCat/b0e9c0ca-cd56-42c8-96ed-477884bfd9f9.json delete mode 100644 data/hfopenllm_v2/Steelskull/L3.3-MS-Nevoria-70b/7395fcde-49dd-47f4-a8ea-463eda40f5e3.json delete mode 100644 data/hfopenllm_v2/Steelskull/L3.3-Nevoria-R1-70b/a130087f-566f-4405-b662-1102f1664c49.json delete mode 100644 data/hfopenllm_v2/StelleX/Qwen2.5_Math_7B_Cot/3be58cf3-4761-4459-9f3c-eabf812a3c19.json delete mode 100644 data/hfopenllm_v2/StelleX/Vorisatex-7B-preview/dbdd71ad-db5b-4b4b-8856-68b55adbe127.json delete mode 100644 data/hfopenllm_v2/SultanR/SmolTulu-1.7b-Instruct/da159a16-48a0-45e3-ad4d-bdc9e8b5288c.json delete mode 100644 data/hfopenllm_v2/SultanR/SmolTulu-1.7b-Reinforced/77d5f51e-5ad2-42a6-a32c-060cd844b949.json delete mode 100644 data/hfopenllm_v2/SultanR/SmolTulu-1.7b-it-v0/724cc582-cc83-474b-9606-70dbc22f3581.json delete mode 100644 data/hfopenllm_v2/Supichi/BBA-123/8a1b2aae-d717-4b49-8ed2-a7ee2cee1940.json delete mode 100644 data/hfopenllm_v2/Supichi/BBA99/0dfb062d-a6ec-42a6-a9f9-6f6424bbdf0c.json delete mode 100644 data/hfopenllm_v2/Supichi/BBAIK29/ab2512fa-2335-4817-9a76-3259690bbc67.json delete mode 100644 data/hfopenllm_v2/Supichi/BBAI_135_Gemma/fe7f1442-b7db-42d5-bc83-b8afd1d0c802.json delete mode 100644 data/hfopenllm_v2/Supichi/BBAI_250_Xia0_gZ/0e14484a-69d7-423e-bf6c-33d0992f408c.json delete mode 100644 data/hfopenllm_v2/Supichi/BBAI_275_Tsunami_gZ/881eaa2c-af5f-4e84-8807-d0835c10ebd2.json delete mode 100644 data/hfopenllm_v2/Supichi/BBAI_525_Tsu_gZ_Xia0/ef8a7079-9d13-42b7-ab2d-b72df5ae5d95.json delete mode 100644 data/hfopenllm_v2/Supichi/BBAI_78B_Calme_3_1_Ties/db8d3fc4-58f4-4f07-8c27-c73a4a4719fb.json delete mode 100644 data/hfopenllm_v2/Supichi/BBAI_QWEEN_V000000_LUMEN_14B/0c44a429-e705-4794-b702-1a731e52df90.json delete mode 100644 data/hfopenllm_v2/Supichi/HF_TOKEN/92b3d2c1-61f4-432a-82a7-43b4367f7ef0.json delete mode 100644 data/hfopenllm_v2/Supichi/NJS26/5703e81d-055c-459b-8202-80ec382a8d5b.json delete mode 100644 data/hfopenllm_v2/Svak/MN-12B-Inferor-v0.0/f6260b6e-52a2-4142-93ba-5393807fa0d4.json delete mode 100644 data/hfopenllm_v2/Svak/MN-12B-Inferor-v0.1/83b84506-4826-48de-a6fe-2af6ae5d425a.json delete mode 100644 data/hfopenllm_v2/Syed-Hasan-8503/Phi-3-mini-4K-instruct-cpo-simpo/7483e260-9853-4d3f-aa10-187796d96de9.json delete mode 100644 data/hfopenllm_v2/T145/KRONOS-8B-V1-P1/f9925806-4252-44e8-b67e-917737572bd4.json delete mode 100644 data/hfopenllm_v2/T145/KRONOS-8B-V1-P2/70470e6c-8d66-4249-b762-a5a2e3589a53.json delete mode 100644 data/hfopenllm_v2/T145/KRONOS-8B-V1-P3/d3abfe3c-ebfe-4dfd-b0db-93c14d32c585.json delete mode 100644 data/hfopenllm_v2/T145/KRONOS-8B-V2/a35b06bc-d759-421a-94cf-f408a98e9273.json delete mode 100644 data/hfopenllm_v2/T145/KRONOS-8B-V3/bbac659c-7cf8-41d4-98d4-ded4c471bd98.json delete mode 100644 data/hfopenllm_v2/T145/KRONOS-8B-V4/0c73f3a0-0a92-4b1c-abfa-6eb77138dacd.json delete mode 100644 data/hfopenllm_v2/T145/KRONOS-8B-V5/a7ab6f16-717f-4567-8057-a4a18e1a1e77.json delete mode 100644 data/hfopenllm_v2/T145/KRONOS-8B-V6/2abe2c9d-032d-469e-852b-114eca5e84f8.json delete mode 100644 
data/hfopenllm_v2/T145/KRONOS-8B-V7/2e8a83dc-c760-4f42-a361-e02cf3a65427.json delete mode 100644 data/hfopenllm_v2/T145/KRONOS-8B-V8/743dfe64-e7cd-493e-817d-8d5fcdc2ea24.json delete mode 100644 data/hfopenllm_v2/T145/KRONOS-8B-V9/4e37c90b-65a8-4b71-bfc2-d63541fb8962.json delete mode 100644 data/hfopenllm_v2/T145/Llama-3.1-8B-Instruct-Zeus/2e34d74e-1b69-4daf-8bee-77e5357fd439.json delete mode 100644 data/hfopenllm_v2/T145/Llama-3.1-8B-Zeus/0646e2f7-d2e6-42d3-8f09-f8daee302709.json delete mode 100644 data/hfopenllm_v2/T145/Meta-Llama-3.1-8B-Instruct-TIES/c66b1ff8-9c04-4f9c-b83e-088f31f79590.json delete mode 100644 data/hfopenllm_v2/T145/ZEUS-8B-V10/1bd2affc-9970-4149-b52b-51549b1f0029.json delete mode 100644 data/hfopenllm_v2/T145/ZEUS-8B-V11/f0479d74-4684-4b41-a63b-16d7fe0e3290.json delete mode 100644 data/hfopenllm_v2/T145/ZEUS-8B-V12/95deb890-a15d-4c71-8151-ed45c3dfb87f.json delete mode 100644 data/hfopenllm_v2/T145/ZEUS-8B-V13-abliterated/1c07fc4c-a773-4e03-bb14-7144e7815c01.json delete mode 100644 data/hfopenllm_v2/T145/ZEUS-8B-V13/e7e8388e-db3c-4881-b67c-5177c60562b9.json delete mode 100644 data/hfopenllm_v2/T145/ZEUS-8B-V14/c4923208-2a47-45f2-a74a-4483e4b99bee.json delete mode 100644 data/hfopenllm_v2/T145/ZEUS-8B-V15/b5f06a78-5b57-45a5-93be-4f3c1b36f208.json delete mode 100644 data/hfopenllm_v2/T145/ZEUS-8B-V16/835f19d3-515c-4bc4-ab96-5cb5bece45dc.json delete mode 100644 data/hfopenllm_v2/T145/ZEUS-8B-V17-abliterated-V2/7dd96382-6fc1-4a39-924b-d9034b5b0839.json delete mode 100644 data/hfopenllm_v2/T145/ZEUS-8B-V17-abliterated-V4/77a666a2-a9b2-43cc-8e64-67172f4ab6c8.json delete mode 100644 data/hfopenllm_v2/T145/ZEUS-8B-V17-abliterated/e3eae267-46ab-4433-a8f3-2a2f8448299b.json delete mode 100644 data/hfopenllm_v2/T145/ZEUS-8B-V17/e31308c4-8eb2-4a72-8127-18049d58b814.json delete mode 100644 data/hfopenllm_v2/T145/ZEUS-8B-V18/c7098a7a-e865-4ecd-b511-abeb2c0872bd.json delete mode 100644 data/hfopenllm_v2/T145/ZEUS-8B-V19/b3a8c734-e63a-47f7-af2c-a3b6518802fa.json delete mode 100644 data/hfopenllm_v2/T145/ZEUS-8B-V2-ORPO/35937965-2791-4f75-8954-5a2280381c91.json delete mode 100644 data/hfopenllm_v2/T145/ZEUS-8B-V2-abliterated/4ab806fe-738d-4f5b-89e4-004134d2f7fe.json delete mode 100644 data/hfopenllm_v2/T145/ZEUS-8B-V2/a937e27e-b757-4de7-b679-01ac29d8bb22.json delete mode 100644 data/hfopenllm_v2/T145/ZEUS-8B-V20/1d906aab-33a6-4ffe-8a63-694482d83d09.json delete mode 100644 data/hfopenllm_v2/T145/ZEUS-8B-V21/9e101298-6482-4ae8-83e4-b948ba8fa550.json delete mode 100644 data/hfopenllm_v2/T145/ZEUS-8B-V22/3818710d-80a9-4e7d-90e3-f06afffb71ac.json delete mode 100644 data/hfopenllm_v2/T145/ZEUS-8B-V23/a18ec0c4-6f3f-4904-b69c-e40770df169e.json delete mode 100644 data/hfopenllm_v2/T145/ZEUS-8B-V24/529c2bd4-6b8e-4e3c-8737-c0b794444d13.json delete mode 100644 data/hfopenllm_v2/T145/ZEUS-8B-V25/9e994362-a1d1-48f7-9db1-dd9d532b9f35.json delete mode 100644 data/hfopenllm_v2/T145/ZEUS-8B-V26/cf35b7db-f675-4362-8916-36b0582b64f4.json delete mode 100644 data/hfopenllm_v2/T145/ZEUS-8B-V27/79ee7e34-36cd-4024-8978-86c1b059ae5f.json delete mode 100644 data/hfopenllm_v2/T145/ZEUS-8B-V28/9ec4fb99-ed4d-416e-9342-0c036aadd35d.json delete mode 100644 data/hfopenllm_v2/T145/ZEUS-8B-V29/8788e4fa-04c5-4f7c-bb4e-523287901f71.json delete mode 100644 data/hfopenllm_v2/T145/ZEUS-8B-V2L1/18097bf4-5149-40e9-9850-558c3f143ed8.json delete mode 100644 data/hfopenllm_v2/T145/ZEUS-8B-V2L2/b5942721-5c30-4c49-a6e1-fb5419539652.json delete mode 100644 data/hfopenllm_v2/T145/ZEUS-8B-V3/76d27de3-0309-4e4b-8d0d-0e402bde0a31.json 
delete mode 100644 data/hfopenllm_v2/T145/ZEUS-8B-V30/5c0553ff-4910-45a9-aa8d-3a76af098403.json delete mode 100644 data/hfopenllm_v2/T145/ZEUS-8B-V4/fd97d1d9-a1b5-429d-b73d-1ea92ae1d61c.json delete mode 100644 data/hfopenllm_v2/T145/ZEUS-8B-V6/f77aa103-5a09-409c-ad72-7992b6049f94.json delete mode 100644 data/hfopenllm_v2/T145/ZEUS-8B-V7/0afdaa1d-c1e7-4283-a2b3-f459c09df4a9.json delete mode 100644 data/hfopenllm_v2/T145/ZEUS-8B-V8/044ed79b-0c54-4a7a-94ba-a3f999adeb0d.json delete mode 100644 data/hfopenllm_v2/T145/ZEUS-8B-V9/ac6b884d-62ea-4ff5-8eee-cfce08869030.json delete mode 100644 data/hfopenllm_v2/T145/qwen-2.5-3B-merge-test/8ffa696e-adef-4808-ba0e-bb04921a433d.json delete mode 100644 data/hfopenllm_v2/THUDM/glm-4-9b-chat-1m-hf/8a2cfa62-5f13-447e-8d0f-2503e4962ac5.json delete mode 100644 data/hfopenllm_v2/THUDM/glm-4-9b-chat-1m/4f24fc46-3686-41fa-bf25-a0e39b252cc9.json delete mode 100644 data/hfopenllm_v2/THUDM/glm-4-9b-chat-hf/b1375cb4-b0d5-4cb4-ad43-394ebd1a481f.json delete mode 100644 data/hfopenllm_v2/THUDM/glm-4-9b-chat/4ce062da-acfc-4684-95c2-679cbe5a697b.json delete mode 100644 data/hfopenllm_v2/THUDM/glm-4-9b/3d785765-befa-4e53-8672-769f7bb87dcd.json delete mode 100644 data/hfopenllm_v2/TIGER-Lab/AceCodeRM-7B/ab0d3a24-19db-4d00-892e-bcb7c0f2f30f.json delete mode 100644 data/hfopenllm_v2/TIGER-Lab/AceCoder-Qwen2.5-7B-Ins-Rule/31f0b186-1805-42ff-86cf-d8455a66d538.json delete mode 100644 data/hfopenllm_v2/TIGER-Lab/AceCoder-Qwen2.5-Coder-7B-Base-Rule/ed6b3e7e-d294-420d-b9b9-460a52cd0239.json delete mode 100644 data/hfopenllm_v2/TIGER-Lab/AceCoder-Qwen2.5-Coder-7B-Ins-Rule/91dec0c0-9854-4790-a0a5-e17d19636f17.json delete mode 100644 data/hfopenllm_v2/TIGER-Lab/MAmmoTH2-7B-Plus/599616fb-26c1-47e3-a98b-9ad922a95c08.json delete mode 100644 data/hfopenllm_v2/TIGER-Lab/Qwen2.5-Math-7B-CFT/aeee4365-c34d-46b9-8c98-29976010bb62.json delete mode 100644 data/hfopenllm_v2/TTTXXX01/Mistral-7B-Base-SimPO2-5e-7/1ec68708-94c9-4561-bb99-7f211d7a9950.json delete mode 100644 data/hfopenllm_v2/Tarek07/Progenitor-V1.1-LLaMa-70B/0b53e7b4-0e91-40a2-911b-cd0d415e9fad.json delete mode 100644 data/hfopenllm_v2/Tarek07/Thalassic-Alpha-LLaMa-70B/91bcd646-fe3d-458b-a426-a6a8863d69a0.json delete mode 100644 data/hfopenllm_v2/TeeZee/DoubleBagel-57B-v1.0/2e0458cc-e092-4770-bd80-00dff169d754.json delete mode 100644 data/hfopenllm_v2/Telugu-LLM-Labs/Indic-gemma-2b-finetuned-sft-Navarasa-2.0/d56ef415-0edf-4fde-8277-ae44b4bb4ed2.json delete mode 100644 data/hfopenllm_v2/Telugu-LLM-Labs/Indic-gemma-7b-finetuned-sft-Navarasa-2.0/a0a1beb8-ee9a-4e88-b939-6e0104ed76a7.json delete mode 100644 data/hfopenllm_v2/TencentARC/LLaMA-Pro-8B-Instruct/f9b7c3ee-ea8b-42f0-a55a-6171d4e3d0ea.json delete mode 100644 data/hfopenllm_v2/TencentARC/LLaMA-Pro-8B/2c8c6c6a-ce95-4d11-a33a-d547859fee11.json delete mode 100644 data/hfopenllm_v2/TencentARC/MetaMath-Mistral-Pro/47858744-3378-4ed4-9101-8acbc3a53cda.json delete mode 100644 data/hfopenllm_v2/TencentARC/Mistral_Pro_8B_v0.1/2aaeaaa7-89ed-4666-b0a5-8c1320ec4ec5.json delete mode 100644 data/hfopenllm_v2/TheDrummer/Cydonia-22B-v1.2/23ae6a72-5a1f-4961-8662-feb4d8ad8a26.json delete mode 100644 data/hfopenllm_v2/TheDrummer/Gemmasutra-9B-v1/312ec315-6175-4f99-8741-97d97eb26b47.json delete mode 100644 data/hfopenllm_v2/TheDrummer/Gemmasutra-Mini-2B-v1/7869bbe3-fd17-4e6d-9546-94d3df5e83ef.json delete mode 100644 data/hfopenllm_v2/TheDrummer/Llama-3SOME-8B-v2/68c9fb85-f90e-442f-aa96-458dabe30b39.json delete mode 100644 
data/hfopenllm_v2/TheDrummer/Ministrations-8B-v1/6891d1dd-0e1a-42e8-9206-64a4c71854f9.json delete mode 100644 data/hfopenllm_v2/TheDrummer/Rocinante-12B-v1/c62eb6b3-2a3d-45bd-acdf-bad717e51766.json delete mode 100644 data/hfopenllm_v2/TheDrummer/Tiger-Gemma-9B-v1/55d4a6ae-44e5-4a1b-9509-299fbc6c3a36.json delete mode 100644 data/hfopenllm_v2/TheDrummer/Tiger-Gemma-9B-v2/227e3e19-29d6-414f-b538-9f6f89d47677.json delete mode 100644 data/hfopenllm_v2/TheDrummer/Tiger-Gemma-9B-v3/e922ac2c-e8d0-48f2-99fc-da70c925136c.json delete mode 100644 data/hfopenllm_v2/TheDrunkenSnail/Daughter-of-Rhodia-12B/59f93c1c-3712-4ee2-a3d2-999e5acc2ee5.json delete mode 100644 data/hfopenllm_v2/TheDrunkenSnail/Mother-of-Rhodia-12B/a98dcf1e-6abb-402b-9e0c-da7c23b74bde.json delete mode 100644 data/hfopenllm_v2/TheDrunkenSnail/Son-of-Rhodia/a889f561-0d8a-4345-9131-0a897ec215ac.json delete mode 100644 data/hfopenllm_v2/TheHierophant/Underground-Cognitive-V0.3-test/6402facc-6258-43a4-a0fd-78e21765c504.json delete mode 100644 data/hfopenllm_v2/TheTsar1209/nemo-carpmuscle-v0.1/29fbd2e0-e08a-48f4-905e-d2aa54886915.json delete mode 100644 data/hfopenllm_v2/TheTsar1209/qwen-carpmuscle-r-v0.3/313e0379-d3ea-4f5a-8e06-4b0a94317487.json delete mode 100644 data/hfopenllm_v2/TheTsar1209/qwen-carpmuscle-v0.1/f326fbd0-5f92-4324-a587-1f08cf7da208.json delete mode 100644 data/hfopenllm_v2/TheTsar1209/qwen-carpmuscle-v0.2/d61310e9-5267-4a87-8e24-ae25172cd64e.json delete mode 100644 data/hfopenllm_v2/TheTsar1209/qwen-carpmuscle-v0.3/60953e5e-523d-43c0-ad00-f746308030b1.json delete mode 100644 data/hfopenllm_v2/TheTsar1209/qwen-carpmuscle-v0.4.1/5afd8861-d7cb-45cd-af1b-6db966cb56e0.json delete mode 100644 data/hfopenllm_v2/TheTsar1209/qwen-carpmuscle-v0.4/c3972df1-4414-4c71-b473-fb9459cf085b.json delete mode 100644 data/hfopenllm_v2/Tijmen2/cosmosage-v3/b89d54b7-2329-4608-b9f6-07017e63f1cd.json delete mode 100644 data/hfopenllm_v2/TinyLlama/TinyLlama-1.1B-Chat-v0.1/50389350-af23-41ba-af46-5ffe338ff9d2.json delete mode 100644 data/hfopenllm_v2/TinyLlama/TinyLlama-1.1B-Chat-v0.5/b8f8f045-2306-43ad-8fa0-6a8bdb494db6.json delete mode 100644 data/hfopenllm_v2/TinyLlama/TinyLlama-1.1B-Chat-v0.6/7cd59011-75d7-4497-956c-322d5d609c5f.json delete mode 100644 data/hfopenllm_v2/TinyLlama/TinyLlama-1.1B-Chat-v1.0/1313d865-9c5b-45d2-ad64-629c65f07f2c.json delete mode 100644 data/hfopenllm_v2/TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T/0efc2583-bf21-4b60-96cc-716928768eb1.json delete mode 100644 data/hfopenllm_v2/TinyLlama/TinyLlama_v1.1/be0a2737-19a0-4401-998a-a03663467133.json delete mode 100644 data/hfopenllm_v2/ToastyPigeon/Sto-vo-kor-12B/71720e07-2de0-4402-bdfd-102150c61765.json delete mode 100644 data/hfopenllm_v2/Trappu/Magnum-Picaro-0.7-v2-12b/38c84c69-5cdb-4f24-820d-4b39c5b118ff.json delete mode 100644 data/hfopenllm_v2/Trappu/Nemo-Picaro-12B/de9d274d-f213-4037-9711-3e9d3dbbcc96.json delete mode 100644 data/hfopenllm_v2/Tremontaine/L3-12B-Lunaris-v1/92381da4-b9d1-43c4-a5c9-59f375017e11.json delete mode 100644 data/hfopenllm_v2/Triangle104/Annunaki-12b/44ab6a50-027d-47df-a518-5aa944eb2a61.json delete mode 100644 data/hfopenllm_v2/Triangle104/BigTalker-Lite-8B/2a1947d7-74e0-43d0-931d-b2862348e90a.json delete mode 100644 data/hfopenllm_v2/Triangle104/Chatty-Harry_V2.0/3677b71c-387d-4182-b15d-c3525bc7bc36.json delete mode 100644 data/hfopenllm_v2/Triangle104/Chatty-Harry_V3.0/6b125a8e-5b53-48ca-8875-926249879f39.json delete mode 100644 data/hfopenllm_v2/Triangle104/Chronos-Prism_V1.0/af851d4b-69d4-49a9-a160-a180146c3963.json delete mode 
delete mode 100644 data/hfopenllm_v2/Triangle104/DS-Distilled-Hermes-Llama-3.1/7aa6ce37-c0e4-48ce-b9db-f158ac47d366.json
delete mode 100644 data/hfopenllm_v2/Triangle104/DS-Distilled-Hermes-Llama-3.1_TIES/1bce093e-27c0-41ad-aad6-b656f6773ed5.json
delete mode 100644 data/hfopenllm_v2/Triangle104/DS-R1-Distill-Q2.5-10B-Harmony/5c6cffab-ef72-4e12-808c-c26ee8ec6999.json
delete mode 100644 data/hfopenllm_v2/Triangle104/DS-R1-Distill-Q2.5-14B-Harmony_V0.1/e288a874-f750-4a90-be07-616094c220cf.json
delete mode 100644 data/hfopenllm_v2/Triangle104/DS-R1-Distill-Q2.5-7B-RP/0607da8d-3f4e-468a-91a6-b975261a87c0.json
delete mode 100644 data/hfopenllm_v2/Triangle104/DS-R1-Llama-8B-Harmony/be2cc2fd-c8e7-4421-b8c8-d3b937272d0d.json
delete mode 100644 data/hfopenllm_v2/Triangle104/DSR1-Distill-Llama-Lit-8B/15ffe64e-72fd-4e65-8632-babf137a386d.json
delete mode 100644 data/hfopenllm_v2/Triangle104/DSR1-Distill-Qwen-7B-RP/ce1c0d4f-f5a3-49e7-ab77-65ff51bbd0ca.json
delete mode 100644 data/hfopenllm_v2/Triangle104/Dark-Chivalry_V1.0/b5afab38-13ba-4abd-9d04-a433c41061c5.json
delete mode 100644 data/hfopenllm_v2/Triangle104/Distilled-DarkPlanet-Allades-8B/a862c2a5-f66b-4d09-ac57-6cbe565f9f35.json
delete mode 100644 data/hfopenllm_v2/Triangle104/Distilled-DarkPlanet-Allades-8B_TIES/d8254f6c-8110-44d3-800e-101fc731d779.json
delete mode 100644 data/hfopenllm_v2/Triangle104/Distilled-Whiskey-8b/ccbcd5a7-2b98-4d90-ace1-3ad5971a5f18.json
delete mode 100644 data/hfopenllm_v2/Triangle104/Dolphin3-Llama3.2-Smart/c208b19b-4ecf-4fad-b931-54f65d4b711b.json
delete mode 100644 data/hfopenllm_v2/Triangle104/Gemmadevi-Stock-10B/debaf4a0-c734-47ea-bea0-2ddc65dc397d.json
delete mode 100644 data/hfopenllm_v2/Triangle104/Hermes-Llama-3.2-CoT-Summary/0eeb5962-ccc0-407b-92e6-7cf17c00941f.json
delete mode 100644 data/hfopenllm_v2/Triangle104/Hermes-Llama-3.2-CoT/4b60e863-482c-4f91-8cd1-6c993d3c5988.json
delete mode 100644 data/hfopenllm_v2/Triangle104/Hermes3-L3.1-DirtyHarry-8B/f5f0bc72-427d-4703-aab1-1bb1bea73895.json
delete mode 100644 data/hfopenllm_v2/Triangle104/Herodotos-14B/aae7f543-7b5b-435f-a506-e3ab901a8c5a.json
delete mode 100644 data/hfopenllm_v2/Triangle104/Herodotos-14B_V0.1/6e6ff4c3-3cfd-4790-80c4-544d9cbe47e2.json
delete mode 100644 data/hfopenllm_v2/Triangle104/L3.1-8B-Dusky-Ink/3ee76278-89d4-44fb-a449-717534b00161.json
delete mode 100644 data/hfopenllm_v2/Triangle104/L3.1-8B-Dusky-Ink_v0.r1/fa2854d3-9e2f-4f79-ac8c-e1cb5a638745.json
delete mode 100644 data/hfopenllm_v2/Triangle104/LThreePointOne-8B-HermesBlackroot/9ddaa721-bf3a-416a-9be8-291188793cc9.json
delete mode 100644 data/hfopenllm_v2/Triangle104/LThreePointOne-8B-HermesInk/d659077d-7261-4c69-862c-d61be21662a2.json
delete mode 100644 data/hfopenllm_v2/Triangle104/Llama3.1-Allades-Lit-8b/e87ba227-c55e-4666-949d-b45913f8336b.json
delete mode 100644 data/hfopenllm_v2/Triangle104/Llama3.1-cc-Lit-8b/077f683a-af6f-4a71-b599-b9b269546b7c.json
delete mode 100644 data/hfopenllm_v2/Triangle104/Minerva-1.5b/54808b08-d10d-4a06-ab60-8d99039311b8.json
delete mode 100644 data/hfopenllm_v2/Triangle104/Minerva-1.5b_V0.2/138e6fdb-7092-4ee6-be82-7bb86c1fc759.json
delete mode 100644 data/hfopenllm_v2/Triangle104/Minerva-10b/1b27423f-62cc-4189-a293-5af84ef1f2c8.json
delete mode 100644 data/hfopenllm_v2/Triangle104/Minerva-14b-V0.1/f5468512-d2c7-4486-9d31-bef61225af52.json
delete mode 100644 data/hfopenllm_v2/Triangle104/Minerva-14b/0e0ec1a9-76aa-4d7e-9c0e-946d6b000a6a.json
delete mode 100644 data/hfopenllm_v2/Triangle104/Minerva-7b/07b87b98-0d61-4479-937f-7447565b4631.json
delete mode 100644 data/hfopenllm_v2/Triangle104/Minerva-8b/85b11b91-d686-49e9-8db0-971dd7cafb75.json
delete mode 100644 data/hfopenllm_v2/Triangle104/Mistral-Redemption-Arc/21bac032-a092-4afa-8d29-ebdefb3a0650.json
delete mode 100644 data/hfopenllm_v2/Triangle104/Mistral-Small-24b-Harmony/29e3a687-429f-4f33-ae5f-48db85127364.json
delete mode 100644 data/hfopenllm_v2/Triangle104/Pans_Gutenbergum_V0.1/d98493a6-f237-4565-8508-9e4cc3188d2d.json
delete mode 100644 data/hfopenllm_v2/Triangle104/Pans_Gutenbergum_V0.2/2def6fbd-7488-4e9f-a822-2405d4f7a315.json
delete mode 100644 data/hfopenllm_v2/Triangle104/Pantheon_ChatWaifu_V0.2/819143d4-9538-48b9-b7af-128bc15c518a.json
delete mode 100644 data/hfopenllm_v2/Triangle104/Phi-4-AbliteratedRP/c29d47af-a9de-4edb-acac-6763c0d44ca3.json
delete mode 100644 data/hfopenllm_v2/Triangle104/Phi4-RP-o1-Ablit/22bf3fb7-9235-4a57-b8fd-c85b12047b0e.json
delete mode 100644 data/hfopenllm_v2/Triangle104/Phi4-RP-o1/2bea7014-460d-470b-918f-468b58d70fd6.json
delete mode 100644 data/hfopenllm_v2/Triangle104/Porpoise-R1-Llama3.2-3b/3927a5dd-002b-441a-b769-ba68547cd5f3.json
delete mode 100644 data/hfopenllm_v2/Triangle104/Q2.5-14B-Instruct-1M-Harmony/476fc734-dedd-4192-aa59-eb2f9dabf16b.json
delete mode 100644 data/hfopenllm_v2/Triangle104/Q2.5-AthensCOT/817e2fbe-0866-489f-b987-391228a68c53.json
delete mode 100644 data/hfopenllm_v2/Triangle104/Q2.5-CodeR1-3B/f25f5eb1-ff22-4be3-a639-a9d25207078f.json
delete mode 100644 data/hfopenllm_v2/Triangle104/Q2.5-EVACOT-7b/f71d1c31-184b-46be-a288-bdc92f0ebe09.json
delete mode 100644 data/hfopenllm_v2/Triangle104/Q2.5-EvaHumane-RP/0d9547b3-7bef-4815-9c44-7d714fe81bbb.json
delete mode 100644 data/hfopenllm_v2/Triangle104/Q2.5-Humane-RP/22dbc5a2-0ff6-4566-9bfd-e5ce314be597.json
delete mode 100644 data/hfopenllm_v2/Triangle104/Q2.5-Instruct-1M_Harmony/afedb249-f1a5-42d6-b6c0-54b2cc303f64.json
delete mode 100644 data/hfopenllm_v2/Triangle104/Q2.5-R1-3B/61b1bf5e-6aa4-4e90-af2c-dcf5fc9903f2.json
delete mode 100644 data/hfopenllm_v2/Triangle104/Q2.5-R1-7B/c0adc04c-1e02-4891-a5a1-1fab0ddf18ca.json
delete mode 100644 data/hfopenllm_v2/Triangle104/Robo-Gutenberg_V1.0/cc57e6f0-ab55-4ab9-983c-63d74632d016.json
delete mode 100644 data/hfopenllm_v2/Triangle104/Rocinante-Prism_V2.0/0d3c5fdb-c4a5-4436-b9d4-f0f42cb4db96.json
delete mode 100644 data/hfopenllm_v2/Triangle104/Rocinante-Prism_V2.1/a6ec2934-e9fd-481d-8f00-932603bc6e0a.json
delete mode 100644 data/hfopenllm_v2/Triangle104/RomboHermes3-R1-Llama3.2-3b/e2553c93-60df-4126-9e64-ecd4a5003389.json
delete mode 100644 data/hfopenllm_v2/Triangle104/Rombos-Novasky-7B_V1c/e7c2fb42-e82a-4dac-9cc3-a9f41ab54e0f.json
delete mode 100644 data/hfopenllm_v2/Triangle104/Set-70b/a807ee8c-509e-4b6d-a414-df24444d8a0a.json
delete mode 100644 data/hfopenllm_v2/Tsunami-th/Tsunami-0.5-7B-Instruct/2199024b-7944-4950-8335-32a536efad02.json
delete mode 100644 data/hfopenllm_v2/Tsunami-th/Tsunami-0.5x-7B-Instruct/97919c86-6161-4548-95b9-d44263a29f8a.json
delete mode 100644 data/hfopenllm_v2/Tsunami-th/Tsunami-1.0-14B-Instruct/c40c1a46-2e30-4cf1-bcf3-a316a793fbcd.json
delete mode 100644 data/hfopenllm_v2/Tsunami-th/Tsunami-1.0-7B-Instruct/c1294268-b5f5-4d64-b91a-147f58a21a47.json
delete mode 100644 data/hfopenllm_v2/UCLA-AGI/Gemma-2-9B-It-SPPO-Iter1/2b029e6d-a0b8-4b6c-b62d-144b8dc4f739.json
delete mode 100644 data/hfopenllm_v2/UCLA-AGI/Gemma-2-9B-It-SPPO-Iter2/b926ca6c-60c9-4353-9671-0453b46d0222.json
delete mode 100644 data/hfopenllm_v2/UCLA-AGI/Gemma-2-9B-It-SPPO-Iter3/44db30b4-2010-4f96-a39e-9ccc8568374f.json
delete mode 100644 data/hfopenllm_v2/UCLA-AGI/Llama-3-Instruct-8B-SPPO-Iter1/2210d673-d417-46be-aeca-de48cd846e01.json
delete mode 100644 data/hfopenllm_v2/UCLA-AGI/Llama-3-Instruct-8B-SPPO-Iter2/892d27cc-dfb3-40c7-ae0f-a7cd06784808.json
delete mode 100644 data/hfopenllm_v2/UCLA-AGI/Llama-3-Instruct-8B-SPPO-Iter3/49b3f293-721d-4d44-9748-88d1ce275050.json
delete mode 100644 data/hfopenllm_v2/UCLA-AGI/Llama-3-Instruct-8B-SPPO-Iter3/70fb41fe-46af-49e3-8270-5882e12f710f.json
delete mode 100644 data/hfopenllm_v2/UCLA-AGI/Mistral7B-PairRM-SPPO-Iter1/13e2489f-9d96-4f68-8e22-c937604c2145.json
delete mode 100644 data/hfopenllm_v2/UCLA-AGI/Mistral7B-PairRM-SPPO-Iter2/0c386ea0-4706-4a6f-994c-b6ee21dbce92.json
delete mode 100644 data/hfopenllm_v2/UCLA-AGI/Mistral7B-PairRM-SPPO-Iter3/a8d5a193-6c87-4b5b-8ea3-b3ab78e73104.json
delete mode 100644 data/hfopenllm_v2/UCLA-AGI/Mistral7B-PairRM-SPPO/4018f4bd-492a-4814-9a7a-1f0c376f2d2e.json
delete mode 100644 data/hfopenllm_v2/UKzExecution/LlamaExecutor-8B-3.0.5/568072cb-118d-41af-bfe8-fa14cb4c7348.json
delete mode 100644 data/hfopenllm_v2/Unbabel/TowerInstruct-Mistral-7B-v0.2/a6d08766-8c36-41bf-8bbc-acdfdc3f8e23.json
delete mode 100644 data/hfopenllm_v2/Undi95/MG-FinalMix-72B/2504fed5-c8a1-4ffc-8ce5-9559aa8c4325.json
delete mode 100644 data/hfopenllm_v2/Undi95/Phi4-abliterated/359dde31-d9dc-4c22-b829-77df652dcc73.json
delete mode 100644 data/hfopenllm_v2/V3N0M/Jenna-Tiny-2.0/34a79823-b993-402a-89a7-538e126ee02a.json
delete mode 100644 data/hfopenllm_v2/VAGOsolutions/Llama-3-SauerkrautLM-70b-Instruct/f392c5c3-9bee-4111-9a22-6a1b706fd2ad.json
delete mode 100644 data/hfopenllm_v2/VAGOsolutions/Llama-3-SauerkrautLM-8b-Instruct/73bbdd22-4e5f-496b-b39f-290d8e0d2aa4.json
delete mode 100644 data/hfopenllm_v2/VAGOsolutions/Llama-3.1-SauerkrautLM-70b-Instruct/72a66eae-9c94-40e3-b3c9-211303e5cba8.json
delete mode 100644 data/hfopenllm_v2/VAGOsolutions/Llama-3.1-SauerkrautLM-8b-Instruct/ef7390b5-599b-4354-805b-9486e4ce34fa.json
delete mode 100644 data/hfopenllm_v2/VAGOsolutions/SauerkrautLM-1.5b/57f964c3-0504-4b60-9539-ce0e369816ea.json
delete mode 100644 data/hfopenllm_v2/VAGOsolutions/SauerkrautLM-7b-HerO/4e6c0336-5d94-4417-a194-92a4d6f38481.json
delete mode 100644 data/hfopenllm_v2/VAGOsolutions/SauerkrautLM-7b-LaserChat/fe38dea8-92f4-4fb2-afdf-c5932d7c9e27.json
delete mode 100644 data/hfopenllm_v2/VAGOsolutions/SauerkrautLM-Gemma-2b/5ced7497-5a05-40d2-80cb-cae63ca62022.json
delete mode 100644 data/hfopenllm_v2/VAGOsolutions/SauerkrautLM-Gemma-7b/52a66aaa-193a-48ca-b693-4dcab811eaa3.json
delete mode 100644 data/hfopenllm_v2/VAGOsolutions/SauerkrautLM-Mixtral-8x7B-Instruct/e0e4bcef-cb73-436b-9353-b18ade293e8b.json
delete mode 100644 data/hfopenllm_v2/VAGOsolutions/SauerkrautLM-Nemo-12b-Instruct/1ae45791-7e47-4083-bd72-4530fa26893c.json
delete mode 100644 data/hfopenllm_v2/VAGOsolutions/SauerkrautLM-Phi-3-medium/b2731f04-a9bd-4e36-a545-85be5b66f5a7.json
delete mode 100644 data/hfopenllm_v2/VAGOsolutions/SauerkrautLM-SOLAR-Instruct/ed6de552-d04b-4d51-8456-610e2cb41d85.json
delete mode 100644 data/hfopenllm_v2/VAGOsolutions/SauerkrautLM-gemma-2-2b-it/3e08a589-d2b3-487b-900e-85725522a2e4.json
delete mode 100644 data/hfopenllm_v2/VAGOsolutions/SauerkrautLM-gemma-2-9b-it/b2717503-d081-40ee-b1ed-fcadaf239049.json
delete mode 100644 data/hfopenllm_v2/VAGOsolutions/SauerkrautLM-v2-14b-DPO/9915eb01-5c45-42b6-82a3-ad782411642f.json
delete mode 100644 data/hfopenllm_v2/VAGOsolutions/SauerkrautLM-v2-14b-SFT/190eb7ca-46db-4e1d-8b71-9bb20af74ede.json
delete mode 100644 data/hfopenllm_v2/VIRNECT/llama-3-Korean-8B-r-v-0.1/86b9077d-9ec3-411d-84c5-326ba97742c1.json
delete mode 100644 data/hfopenllm_v2/VIRNECT/llama-3-Korean-8B/18bfa50c-20be-4027-8ee7-f6cd1411c882.json
delete mode 100644 data/hfopenllm_v2/VIRNECT/llama-3-Korean-8B/eb1a099a-48c7-412b-b62f-143537c41f06.json
delete mode 100644 data/hfopenllm_v2/ValiantLabs/Llama3-70B-Fireplace/e530a4b7-c2f6-4bad-bab5-2895e950ed63.json
delete mode 100644 data/hfopenllm_v2/ValiantLabs/Llama3-70B-ShiningValiant2/52ad7152-feea-46a6-b2d8-20e1a70514ce.json
delete mode 100644 data/hfopenllm_v2/ValiantLabs/Llama3.1-70B-ShiningValiant2/a61162a6-ef3e-46f4-8aa2-241547fadea2.json
delete mode 100644 data/hfopenllm_v2/ValiantLabs/Llama3.1-8B-Cobalt/9f208aef-8544-47c8-bb1f-a3841aff208b.json
delete mode 100644 data/hfopenllm_v2/ValiantLabs/Llama3.1-8B-Cobalt/da237ab6-df39-460f-9efc-e1649e1ac202.json
delete mode 100644 data/hfopenllm_v2/ValiantLabs/Llama3.1-8B-Enigma/c81b3193-9d01-4590-8b72-da97aa3c9dc4.json
delete mode 100644 data/hfopenllm_v2/ValiantLabs/Llama3.1-8B-Esper2/1a9ffe50-69ae-48bc-b636-89431391eb37.json
delete mode 100644 data/hfopenllm_v2/ValiantLabs/Llama3.1-8B-Fireplace2/b0c67359-1da0-4f55-aa1c-f54f88038bd7.json
delete mode 100644 data/hfopenllm_v2/ValiantLabs/Llama3.1-8B-Fireplace2/c700798b-583a-41be-94dd-382669bb495f.json
delete mode 100644 data/hfopenllm_v2/ValiantLabs/Llama3.1-8B-ShiningValiant2/3c0b9735-2ef1-4f27-b94a-f246eb57b73c.json
delete mode 100644 data/hfopenllm_v2/ValiantLabs/Llama3.1-8B-ShiningValiant2/e8c9501b-c985-4b78-a902-a1a030c72e60.json
delete mode 100644 data/hfopenllm_v2/ValiantLabs/Llama3.2-3B-Enigma/df978fce-3373-4073-8c44-d6a83df1d9d1.json
delete mode 100644 data/hfopenllm_v2/ValiantLabs/Llama3.2-3B-Esper2/e46ee8d9-81af-4259-8fef-3d3113fb6168.json
delete mode 100644 data/hfopenllm_v2/ValiantLabs/Llama3.2-3B-ShiningValiant2/aa6ab404-89ef-4336-b811-7c8064e26107.json
delete mode 100644 data/hfopenllm_v2/Vikhrmodels/Vikhr-Llama3.1-8B-Instruct-R-21-09-24/a14e6c79-4a78-4c02-a7ca-35e783f32be1.json
delete mode 100644 data/hfopenllm_v2/Vikhrmodels/Vikhr-Nemo-12B-Instruct-R-21-09-24/ba1fb85b-bbc0-46ac-95d7-e61b91f65c2b.json
delete mode 100644 data/hfopenllm_v2/Weyaxi/Bagel-Hermes-2x34B/f6312fc7-c7a8-45dc-a57c-91f56b4ca28a.json
delete mode 100644 data/hfopenllm_v2/Weyaxi/Bagel-Hermes-34B-Slerp/335f5c32-f3f0-4a16-8c9d-8f07b2aae54a.json
delete mode 100644 data/hfopenllm_v2/Weyaxi/Einstein-v4-7B/b7c7a907-7ecc-4d5b-bc6f-8b8d82954b21.json
delete mode 100644 data/hfopenllm_v2/Weyaxi/Einstein-v6.1-Llama3-8B/112f01a2-f0fb-4257-86bf-61c9a184eb92.json
delete mode 100644 data/hfopenllm_v2/Weyaxi/Einstein-v6.1-developed-by-Weyaxi-Llama3-8B/2d9410d6-7162-4811-bf7d-9de2c2b48fd2.json
delete mode 100644 data/hfopenllm_v2/Weyaxi/Einstein-v7-Qwen2-7B/16ff8fa3-4676-473c-99ad-908ddb59d8ed.json
delete mode 100644 data/hfopenllm_v2/Weyaxi/Einstein-v8-Llama3.2-1B/9b153ac9-f95b-419b-b7f9-beccd769ddad.json
delete mode 100644 data/hfopenllm_v2/Weyaxi/SauerkrautLM-UNA-SOLAR-Instruct/8a5df3c2-eb71-4e12-b013-fb43685f2916.json
delete mode 100644 data/hfopenllm_v2/WizardLMTeam/WizardLM-13B-V1.0/35fa3213-5c08-4b19-ae76-237fdd25444e.json
delete mode 100644 data/hfopenllm_v2/WizardLMTeam/WizardLM-13B-V1.2/242ce55f-1471-435e-bcd7-d28b5fc87fc4.json
delete mode 100644 data/hfopenllm_v2/WizardLMTeam/WizardLM-70B-V1.0/95f509f2-5e67-404a-968d-f7488d684e32.json
delete mode 100644 data/hfopenllm_v2/Wladastic/Mini-Think-Base-1B/bcbcdfe9-0663-417c-9a29-60906e63db8f.json
delete mode 100644 data/hfopenllm_v2/Xclbr7/Arcanum-12b/d95a7493-2f99-4c10-8067-711c7388af7d.json
delete mode 100644 data/hfopenllm_v2/Xclbr7/Hyena-12b/789848a0-6d8a-4583-93c3-a72df74d0071.json
delete mode 100644 data/hfopenllm_v2/Xclbr7/caliburn-12b/14af87df-0fc5-46e1-9d0b-c25c8b6a7ce7.json
delete mode 100644 data/hfopenllm_v2/Xclbr7/caliburn-v2-12b/379f559f-9bfa-444f-b477-562c25b4c299.json
delete mode 100644 data/hfopenllm_v2/Xiaojian9992024/Llama3.2-1B-THREADRIPPER-v0.2/effb6a3d-c98f-4c3a-be77-902c61cda21b.json
delete mode 100644 data/hfopenllm_v2/Xiaojian9992024/Llama3.2-1B-THREADRIPPER/6c1c1405-afa4-412d-ba1f-49dc1cac4509.json
delete mode 100644 data/hfopenllm_v2/Xiaojian9992024/Phi-4-Megatron-Empathetic/6f4ed7c2-c775-4fd2-8600-4cea523f53e4.json
delete mode 100644 data/hfopenllm_v2/Xiaojian9992024/Phi-4-mini-UNOFFICAL/5fd5206b-186a-43b9-a4f4-07e75aa0293a.json
delete mode 100644 data/hfopenllm_v2/Xiaojian9992024/Qwen2.5-7B-MS-Destroyer/b707ecbf-0658-4226-803d-53456d16d54b.json
delete mode 100644 data/hfopenllm_v2/Xiaojian9992024/Qwen2.5-Dyanka-7B-Preview-v0.2/dca1ee57-5e86-4532-a2f3-ac6a619ca576.json
delete mode 100644 data/hfopenllm_v2/Xiaojian9992024/Qwen2.5-Dyanka-7B-Preview/1233476a-7839-4a22-a7ca-1d0f237d8888.json
delete mode 100644 data/hfopenllm_v2/Xiaojian9992024/Qwen2.5-THREADRIPPER-Medium-Censored/5c4bdeca-5ef8-4002-8f82-67d49b5ff722.json
delete mode 100644 data/hfopenllm_v2/Xiaojian9992024/Qwen2.5-THREADRIPPER-Small-AnniversaryEdition/18f5fd6c-2b79-4d48-b7e9-18845db16271.json
delete mode 100644 data/hfopenllm_v2/Xiaojian9992024/Qwen2.5-THREADRIPPER-Small/a9039374-fa5a-4b8b-800f-5f4651cf812d.json
delete mode 100644 data/hfopenllm_v2/Xiaojian9992024/Qwen2.5-Ultra-1.5B-25.02-Exp/3f9704b4-bf25-40da-b6dc-b927c3569f40.json
delete mode 100644 data/hfopenllm_v2/Xiaojian9992024/Reflection-L3.2-JametMiniMix-3B/a8f858d8-a792-409f-b79d-948a19e2aa87.json
delete mode 100644 data/hfopenllm_v2/Xkev/Llama-3.2V-11B-cot/5c34a168-b8cf-436b-a3b7-a2d1feadffb9.json
delete mode 100644 data/hfopenllm_v2/YOYO-AI/Qwen2.5-14B-1M-YOYO-V3/77092cfe-9820-45e8-94c5-31d27f1daa7c.json
delete mode 100644 data/hfopenllm_v2/YOYO-AI/Qwen2.5-14B-YOYO-0505/cab8fed8-de68-4fa5-b4fc-d9483fc56571.json
delete mode 100644 data/hfopenllm_v2/YOYO-AI/Qwen2.5-14B-YOYO-0510-v2/a8103350-b208-4856-8e7b-8ea8918ba0d1.json
delete mode 100644 data/hfopenllm_v2/YOYO-AI/Qwen2.5-14B-YOYO-0805/e849c03c-c569-4059-8fc5-6a98cf391342.json
delete mode 100644 data/hfopenllm_v2/YOYO-AI/Qwen2.5-14B-YOYO-1005-v2/f1d8bffa-61fc-47d5-85cf-48cebcb31af5.json
delete mode 100644 data/hfopenllm_v2/YOYO-AI/Qwen2.5-14B-YOYO-1005/97bdb352-2e9d-4cc5-8b70-55348ef3a217.json
delete mode 100644 data/hfopenllm_v2/YOYO-AI/Qwen2.5-14B-YOYO-1010-v2/78053a33-24c8-4e9f-8791-f127f21eec1c.json
delete mode 100644 data/hfopenllm_v2/YOYO-AI/Qwen2.5-14B-YOYO-1010/03082966-87ba-4560-a784-5d8677003500.json
delete mode 100644 data/hfopenllm_v2/YOYO-AI/Qwen2.5-14B-YOYO-1010/97f26b20-db66-4a30-ba2a-c18a31081271.json
delete mode 100644 data/hfopenllm_v2/YOYO-AI/Qwen2.5-14B-YOYO-SCE/85f9ccda-8c47-4fa1-9d47-e9da4730b077.json
delete mode 100644 data/hfopenllm_v2/YOYO-AI/Qwen2.5-14B-YOYO-V4-p1/2a57d6f4-643b-4b30-8d67-03032d454887.json
delete mode 100644 data/hfopenllm_v2/YOYO-AI/Qwen2.5-14B-YOYO-V4-p2/d333f360-c1c3-4916-8480-4a1fc490875a.json
delete mode 100644 data/hfopenllm_v2/YOYO-AI/Qwen2.5-14B-YOYO-V4/37a41261-a7b0-44b2-916f-770cdfa0ad39.json
delete mode 100644 data/hfopenllm_v2/YOYO-AI/Qwen2.5-14B-YOYO-latest-V2/c46cd6cc-b56d-44c5-a03c-b49381ba3462.json
delete mode 100644 data/hfopenllm_v2/YOYO-AI/Qwen2.5-14B-YOYO-latest/612b6226-c25d-42e0-bcd7-be7faa844530.json
delete mode 100644 data/hfopenllm_v2/YOYO-AI/Qwen2.5-14B-it-restore/2fc7a4d6-88e0-4f11-9110-dc53942870a4.json
delete mode 100644 data/hfopenllm_v2/YOYO-AI/Qwen2.5-7B-it-restore/34665752-58d8-48ee-81a6-f1a068c23026.json
delete mode 100644 data/hfopenllm_v2/YOYO-AI/Qwen2.5-Coder-14B-YOYO-1010/cc0767b5-4aaa-4418-8f68-72a721323e9c.json
delete mode 100644 data/hfopenllm_v2/YOYO-AI/ZYH-LLM-Qwen2.5-14B-V2/ea507a41-1654-4515-94cc-ce2e38800c61.json
delete mode 100644 data/hfopenllm_v2/YOYO-AI/ZYH-LLM-Qwen2.5-14B-V3/c44e773f-4cca-4780-bdd4-f486e65c18e0.json
delete mode 100644 data/hfopenllm_v2/YOYO-AI/ZYH-LLM-Qwen2.5-14B-V4/f8a46bda-d53b-484e-8832-7939f7d0762d.json
delete mode 100644 data/hfopenllm_v2/YOYO-AI/ZYH-LLM-Qwen2.5-14B/c3968a2d-4a9a-4f62-8bea-a3b4b6dcd378.json
delete mode 100644 data/hfopenllm_v2/Yash21/TinyYi-7B-Test/da18242c-d6bb-4a0a-a2f9-2e42099f4e8a.json
delete mode 100644 data/hfopenllm_v2/Youlln/1PARAMMYL-8B-ModelStock/ac078124-85d9-4715-bf7c-1428b1063732.json
delete mode 100644 data/hfopenllm_v2/Youlln/2PRYMMAL-Yi1.5-6B-SLERP/9c1dcd75-8491-4890-ac6f-000868099a3e.json
delete mode 100644 data/hfopenllm_v2/Youlln/3PRYMMAL-PHI3-3B-SLERP/7850fc57-49c7-4124-b7c6-e1e7bb2bc726.json
delete mode 100644 data/hfopenllm_v2/Youlln/4PRYMMAL-GEMMA2-9B-SLERP/8f38374e-f373-4639-9278-24441ebd0325.json
delete mode 100644 data/hfopenllm_v2/Youlln/ECE-MIRAGE-1-12B/c007938e-3427-4896-8493-1500abdfbd2b.json
delete mode 100644 data/hfopenllm_v2/Youlln/ECE-MIRAGE-1-15B/df81dc0d-6c72-49e9-862b-02e9b6642cb6.json
delete mode 100644 data/hfopenllm_v2/Youlln/ECE-PRYMMAL-0.5B-FT-V3-MUSR/46c96d8e-568c-48f8-a74b-9dd4b4195037.json
delete mode 100644 data/hfopenllm_v2/Youlln/ECE-PRYMMAL-0.5B-FT-V3/1f4f7181-8a81-49f4-9e81-925d5d69a37c.json
delete mode 100644 data/hfopenllm_v2/Youlln/ECE-PRYMMAL-0.5B-FT-V4-MUSR/3ea343b6-93f6-4c61-a164-3db95d13cbdf.json
delete mode 100644 data/hfopenllm_v2/Youlln/ECE-PRYMMAL-0.5B-SLERP-V2/a9ea8bb5-05fc-4da3-8e00-f53ab8ea6af5.json
delete mode 100644 data/hfopenllm_v2/Youlln/ECE-PRYMMAL-0.5B-SLERP-V3/0ea74ce5-43c9-43eb-92bc-3d928062d9e0.json
delete mode 100644 data/hfopenllm_v2/Youlln/ECE-PRYMMAL-YL-1B-SLERP-V1/6896faa7-7204-4091-8f4e-9cc0b53d673a.json
delete mode 100644 data/hfopenllm_v2/Youlln/ECE-PRYMMAL-YL-1B-SLERP-V2/88064453-fd8c-4bd9-adf1-39f43972bec1.json
delete mode 100644 data/hfopenllm_v2/Youlln/ECE-PRYMMAL-YL-7B-SLERP-V4/a18ade45-acba-4059-b969-445e529a82e2.json
delete mode 100644 data/hfopenllm_v2/Youlln/ECE-PRYMMAL0.5-FT/6c0e4132-71e7-44af-95fc-83b0a6be2a82.json
delete mode 100644 data/hfopenllm_v2/Youlln/ECE-PRYMMAL0.5B-Youri/5d9ab422-4f4f-460d-bd39-51266b43d7e5.json
delete mode 100644 data/hfopenllm_v2/Youlln/ECE-PRYMMAL1B-FT-V1/cda03c45-0782-40cc-a17d-67d808657b83.json
delete mode 100644 data/hfopenllm_v2/Youlln/ECE-Qwen0.5B-FT-V2/50f5451b-41c4-4ba5-8bee-ee8a2deb7e79.json
delete mode 100644 data/hfopenllm_v2/Youlln/ECE.EIFFEIL.ia-0.5B-SLERP/cf758994-6e94-434d-bf68-74cca188b5e8.json
delete mode 100644 data/hfopenllm_v2/YoungPanda/qwenqwen/611f9549-0788-44e9-8125-18df06cd80d6.json
delete mode 100644 data/hfopenllm_v2/Yuma42/KangalKhan-RawRuby-7B/59cf23ba-027d-4bac-a0e1-526376396b4d.json
delete mode 100644 data/hfopenllm_v2/Yuma42/Llama3.1-IgneousIguana-8B/1f02bbd3-ddaf-4db6-b7f8-31bad8ffac66.json
delete mode 100644 data/hfopenllm_v2/Yuma42/Llama3.1-SuperHawk-8B/1e737e28-d926-43e8-9e4c-e39fa91d7977.json
delete mode 100644 data/hfopenllm_v2/Z1-Coder/Z1-Coder-7B/43ef8eee-5d8a-47e7-ac71-1a898421370a.json
delete mode 100644 data/hfopenllm_v2/ZHLiu627/zephyr-7b-gemma-dpo-avg/d8d03c71-942f-4aff-8a5e-5c265c639b44.json
delete mode 100644 data/hfopenllm_v2/ZHLiu627/zephyr-7b-gemma-rpo-avg/96262938-1146-4993-92a1-a2ddb2519f8a.json
delete mode 100644 data/hfopenllm_v2/ZeroXClem/L3-Aspire-Heart-Matrix-8B/292d7cfb-3e3c-47d8-8cca-33507f9ff081.json
delete mode 100644 data/hfopenllm_v2/ZeroXClem/Llama-3.1-8B-AthenaSky-MegaMix/3f29c10f-57ef-435b-85df-2cae30ae72fa.json
delete mode 100644 data/hfopenllm_v2/ZeroXClem/Llama-3.1-8B-RainbowLight-EtherealMix/d7f022fe-86cb-4e4e-a672-62c2dc8cffd3.json
delete mode 100644 data/hfopenllm_v2/ZeroXClem/Llama-3.1-8B-SpecialTitanFusion/baa35c90-c494-4dff-af28-cb549e40bed8.json
delete mode 100644 data/hfopenllm_v2/ZeroXClem/Llama-3.1-8B-SuperNova-EtherealHermes/2fdc3186-6791-4550-ac4f-a1a5a5a1d514.json
delete mode 100644 data/hfopenllm_v2/ZeroXClem/Llama-3.1-8B-SuperTulu-LexiNova/f687df8b-42b5-4d94-b741-1b516d9221b2.json
delete mode 100644 data/hfopenllm_v2/ZeroXClem/Qwen-2.5-Aether-SlerpFusion-7B/c3a8a952-6869-4eee-a59f-4ae33ac72986.json
delete mode 100644 data/hfopenllm_v2/ZeroXClem/Qwen2.5-7B-CelestialHarmony-1M/a7a74117-71e4-49b2-bd65-add82c9165d8.json
delete mode 100644 data/hfopenllm_v2/ZeroXClem/Qwen2.5-7B-HomerAnvita-NerdMix/04ee694c-0c89-4f25-b10f-315a24743ba2.json
delete mode 100644 data/hfopenllm_v2/ZeroXClem/Qwen2.5-7B-HomerCreative-Mix/47fd4acb-acc3-4f12-8af5-c425d3754c38.json
delete mode 100644 data/hfopenllm_v2/ZeroXClem/Qwen2.5-7B-Qandora-CySec/e19577f5-d1ba-45ad-8500-d18ae2b14440.json
delete mode 100644 data/hfopenllm_v2/ZeusLabs/L3-Aethora-15B-V2/e86443cd-453b-4ca0-8e7e-054764fe4bb9.json
delete mode 100644 data/hfopenllm_v2/ZhangShenao/SELM-Llama-3-8B-Instruct-iter-3/24cd9977-f3fb-4619-aea1-59e1a36b2a5e.json
delete mode 100644 data/hfopenllm_v2/aaditya/Llama3-OpenBioLLM-70B/1401f0d9-6f4c-41d2-819f-eb9487c5c1e6.json
delete mode 100644 data/hfopenllm_v2/abacusai/Dracarys-72B-Instruct/4b1f2aab-ef92-4231-9bdd-96918b26914c.json
delete mode 100644 data/hfopenllm_v2/abacusai/Liberated-Qwen1.5-14B/4956e127-14a1-405e-a0e0-76fe94ea727b.json
delete mode 100644 data/hfopenllm_v2/abacusai/Llama-3-Smaug-8B/90fb6e40-88f7-4ce2-ae99-308d87e69718.json
delete mode 100644 data/hfopenllm_v2/abacusai/Smaug-34B-v0.1/cdad0f08-1c60-4493-bed0-9733894b367a.json
delete mode 100644 data/hfopenllm_v2/abacusai/Smaug-72B-v0.1/8e83b4f7-736f-4e03-8256-2a1fc421b04f.json
delete mode 100644 data/hfopenllm_v2/abacusai/Smaug-Llama-3-70B-Instruct-32K/f0d6639d-8485-4bcd-b069-046a747dfbfa.json
delete mode 100644 data/hfopenllm_v2/abacusai/Smaug-Mixtral-v0.1/d1fe36ba-04f8-4110-8c39-81d393c4cbfc.json
delete mode 100644 data/hfopenllm_v2/abacusai/Smaug-Qwen2-72B-Instruct/5a8ab5fb-ec1e-490c-b643-e3b9d49f5d34.json
delete mode 100644 data/hfopenllm_v2/abacusai/bigstral-12b-32k/de944f89-d2d4-4b01-b4b5-e7cbd1d8d1ae.json
delete mode 100644 data/hfopenllm_v2/abacusai/bigyi-15b/db96601a-2f7f-438f-915b-55fee0e0d1d1.json
delete mode 100644 data/hfopenllm_v2/abhishek/autotrain-0tmgq-5tpbg/27912f7d-7033-4b7c-b93a-af1673ce4a9b.json
delete mode 100644 data/hfopenllm_v2/abhishek/autotrain-0tmgq-5tpbg/da58a484-4a45-4a70-a651-031ada8023d5.json
delete mode 100644 data/hfopenllm_v2/abhishek/autotrain-llama3-70b-orpo-v1/e8bd221d-8a89-4e3c-8815-0bff27574053.json
delete mode 100644 data/hfopenllm_v2/abhishek/autotrain-llama3-70b-orpo-v2/ffc21c2a-59fb-4ad8-88a4-930879b6eba0.json
delete mode 100644 data/hfopenllm_v2/abhishek/autotrain-llama3-orpo-v2/1e506afa-0d08-45d6-9242-b06104aa67e8.json
delete mode 100644 data/hfopenllm_v2/abhishek/autotrain-vr4a1-e5mms/7d66bb93-cb2f-4be6-b133-1f0325be58e1.json
delete mode 100644 data/hfopenllm_v2/abideen/MedPhi-4-14B-v1/936f3c5f-7817-4118-96c8-e4061d4560fb.json
delete mode 100644 data/hfopenllm_v2/adamo1139/Yi-34B-200K-AEZAKMI-v2/7d36ceed-2a1b-4b20-88ae-0a609cc161e9.json
delete mode 100644 data/hfopenllm_v2/adriszmar/QAIMath-Qwen2.5-7B-TIES/77cace56-503f-4531-a4eb-0178a68cc283.json
delete mode 100644 data/hfopenllm_v2/adriszmar/QAIMath-Qwen2.5-7B-TIES/9e49b710-2413-42f3-8943-bc9dbf68cb3c.json
delete mode 100644 data/hfopenllm_v2/aevalone/distill_qw_test/9a5b3564-97df-4661-a171-37322386ac4d.json
delete mode 100644 data/hfopenllm_v2/agentlans/Gemma2-9B-AdvancedFuse/0fc0450d-cdf1-44b5-a809-202d1dd6b5e3.json
delete mode 100644 data/hfopenllm_v2/agentlans/Llama-3.2-1B-Instruct-CrashCourse12K/7f06c78c-f95e-4e50-aa57-da0579adcdae.json
delete mode 100644 data/hfopenllm_v2/agentlans/Llama3.1-8B-drill/06e55e47-9995-4fa2-877a-c728e9f9f1a1.json
delete mode 100644 data/hfopenllm_v2/agentlans/Llama3.1-Daredevilish-Instruct/39af1e0a-d1e3-4372-bc18-d07f3dff09f0.json
delete mode 100644 data/hfopenllm_v2/agentlans/Llama3.1-Daredevilish/f32d59d6-8ab9-4b7d-ad9d-f62ce6d559bd.json
delete mode 100644 data/hfopenllm_v2/agentlans/Llama3.1-LexiHermes-SuperStorm/7ddc3aef-c6c5-4d04-8473-3b3bba219d7f.json
delete mode 100644 data/hfopenllm_v2/agentlans/Llama3.1-SuperDeepFuse-CrashCourse12K/ce80ac07-22d2-4883-ac6c-40b080e00b81.json
delete mode 100644 data/hfopenllm_v2/agentlans/Llama3.1-SuperDeepFuse/cbece170-f872-485f-a6c2-5db17ced73bc.json
delete mode 100644 data/hfopenllm_v2/agentlans/Qwen2.5-0.5B-Instruct-CrashCourse-dropout/c1fd751b-c6c3-4350-9618-f4b4840e1b69.json
delete mode 100644 data/hfopenllm_v2/ahmeda335/13_outOf_32_pruned_layers_llama3.1-8b/bfd28b91-3a72-4417-b52b-804d2cbae12f.json
delete mode 100644 data/hfopenllm_v2/ai21labs/Jamba-v0.1/32c26cbc-3697-47a6-bd12-18187df9dda9.json
delete mode 100644 data/hfopenllm_v2/ai4bharat/Airavata/02280b9f-bc01-4e44-9d09-1e4ae8c0438b.json
delete mode 100644 data/hfopenllm_v2/aixonlab/Aether-12b/a57d2d49-5ccf-48f5-8035-b1d480c80f40.json
delete mode 100644 data/hfopenllm_v2/aixonlab/Grey-12b/6b5a3c69-f8dd-4952-96fc-b6e4dec1ed9d.json
delete mode 100644 data/hfopenllm_v2/aixonlab/Zara-14b-v1.2/fe0665dd-b976-4d90-b16b-6c2acfef15ff.json
delete mode 100644 data/hfopenllm_v2/akhadangi/Llama3.2.1B.0.01-First/8c6bdc44-fd29-45e7-b161-2c8e07ef2935.json
delete mode 100644 data/hfopenllm_v2/akhadangi/Llama3.2.1B.0.01-Last/e7c70ff9-59ad-4d09-8af0-ef9cf16d1dfa.json
delete mode 100644 data/hfopenllm_v2/akhadangi/Llama3.2.1B.0.1-First/26c4c993-ae49-42a0-be0a-f157be9f7d58.json
delete mode 100644 data/hfopenllm_v2/akhadangi/Llama3.2.1B.0.1-Last/19adf124-c120-4e97-80cf-49c40a66eb81.json
delete mode 100644 data/hfopenllm_v2/akhadangi/Llama3.2.1B.BaseFiT/66bc5d38-8d25-4934-bce8-41ce4ea0e385.json
delete mode 100644 data/hfopenllm_v2/akjindal53244/Llama-3.1-Storm-8B/541eafe5-807e-44b0-b652-a0752210fc71.json
delete mode 100644 data/hfopenllm_v2/akjindal53244/Llama-3.1-Storm-8B/845a2484-9f17-4c0e-b06b-6250992298bc.json
delete mode 100644 data/hfopenllm_v2/alcholjung/llama3_medical_tuned/e62b6b26-5f3c-42c9-9541-bb8b23caee66.json
delete mode 100644 data/hfopenllm_v2/allenai/Llama-3.1-Tulu-3-70B-DPO/ec773b66-24fd-4b6f-ac9c-ebcd355e4be7.json
delete mode 100644 data/hfopenllm_v2/allenai/Llama-3.1-Tulu-3-70B-SFT/a70b8356-94ce-4f0d-b44a-2215076eed5e.json
delete mode 100644 data/hfopenllm_v2/allenai/Llama-3.1-Tulu-3-70B/b182807d-587e-4702-bf30-dab11983b8db.json
delete mode 100644 data/hfopenllm_v2/allenai/Llama-3.1-Tulu-3-70B/c1f0944a-c44c-42e9-90ba-a847509cbd66.json
delete mode 100644 data/hfopenllm_v2/allenai/Llama-3.1-Tulu-3-8B-DPO/64bb8530-7071-402e-ba9b-1d15ecbe275c.json
delete mode 100644 data/hfopenllm_v2/allenai/Llama-3.1-Tulu-3-8B-RM/4f1fc265-f8b7-47e6-a9e6-cfa61b89ad4a.json
delete mode 100644 data/hfopenllm_v2/allenai/Llama-3.1-Tulu-3-8B-SFT/1420df5c-690e-4b01-b99c-c21c793689ae.json
delete mode 100644 data/hfopenllm_v2/allenai/Llama-3.1-Tulu-3-8B/aa9d0b0e-cb3f-452e-bc85-f7cf172d2b8b.json
delete mode 100644 data/hfopenllm_v2/allenai/Llama-3.1-Tulu-3-8B/dfabd777-8620-40e3-b19c-a9227f57b638.json
delete mode 100644 data/hfopenllm_v2/allenai/OLMo-1.7-7B-hf/08fe3877-ab04-426a-9e27-72ec4ff8ffc3.json
delete mode 100644 data/hfopenllm_v2/allenai/OLMo-1B-hf/4b264bb0-bd7e-4b15-9591-50b5a521f100.json
delete mode 100644 data/hfopenllm_v2/allenai/OLMo-2-1124-7B-Instruct/a8cfe336-0c3e-401c-a1e9-d951e64918ec.json
delete mode 100644 data/hfopenllm_v2/allenai/OLMo-7B-Instruct-hf/5e66c653-41b1-46de-b677-ffd8426ba5ec.json
delete mode 100644 data/hfopenllm_v2/allenai/OLMo-7B-hf/9f0f0914-1f7a-468e-8a2e-7ae122fd064d.json
delete mode 100644 data/hfopenllm_v2/allenai/OLMoE-1B-7B-0125-Instruct/cc64a143-4f1e-42ee-ade1-fafc4b316336.json
delete mode 100644 data/hfopenllm_v2/allenai/OLMoE-1B-7B-0924-Instruct/cf322e64-2682-4a9a-a48f-c4ec47b852f2.json
delete mode 100644 data/hfopenllm_v2/allenai/OLMoE-1B-7B-0924/30b32261-b24a-49e3-ba57-172dc1d03ba0.json
delete mode 100644 data/hfopenllm_v2/allknowingroger/Chocolatine-24B/0681c01d-23f3-4b8b-9516-a5cc41761fc4.json
delete mode 100644 data/hfopenllm_v2/allknowingroger/Gemma2Slerp1-2.6B/7693ed8a-f76d-482b-92c1-f11810e522ca.json
delete mode 100644 data/hfopenllm_v2/allknowingroger/Gemma2Slerp1-27B/f8dc0128-c606-490a-b965-59d5377dd778.json
delete mode 100644 data/hfopenllm_v2/allknowingroger/Gemma2Slerp2-2.6B/844547f7-658f-41dd-ab4c-dc0569030e59.json
delete mode 100644 data/hfopenllm_v2/allknowingroger/Gemma2Slerp2-27B/75c291b5-6d60-4bde-8621-f865196a6ecc.json
delete mode 100644 data/hfopenllm_v2/allknowingroger/Gemma2Slerp3-27B/36d54b12-594f-47fe-9637-a9b740416c5c.json
delete mode 100644 data/hfopenllm_v2/allknowingroger/Gemma2Slerp4-27B/57733383-9573-463d-a467-068d2685014c.json
delete mode 100644 data/hfopenllm_v2/allknowingroger/GemmaSlerp-9B/eda1ac9a-98e1-496f-bdeb-1e256b52c14a.json
delete mode 100644 data/hfopenllm_v2/allknowingroger/GemmaSlerp2-9B/00b8bfda-c6b1-4e1f-b68c-bff7335e2dff.json
delete mode 100644 data/hfopenllm_v2/allknowingroger/GemmaSlerp4-10B/0a3b9ad6-b853-471d-a292-413b30273034.json
delete mode 100644 data/hfopenllm_v2/allknowingroger/GemmaSlerp5-10B/d61c3ace-e353-4c0b-9472-c9a1928809cc.json
delete mode 100644 data/hfopenllm_v2/allknowingroger/GemmaStock1-27B/2293a19a-b650-436d-9448-1b641e63d407.json
delete mode 100644 data/hfopenllm_v2/allknowingroger/HomerSlerp1-7B/c15b977c-c781-4b17-ac9f-25c77602c875.json
delete mode 100644 data/hfopenllm_v2/allknowingroger/HomerSlerp2-7B/42c191be-c0ae-4170-8b6f-565053ae7d9c.json
delete mode 100644 data/hfopenllm_v2/allknowingroger/HomerSlerp3-7B/f5cb910d-6e5b-404a-a751-d5cb90668150.json
delete mode 100644 data/hfopenllm_v2/allknowingroger/HomerSlerp4-7B/de806e4c-dbf8-48cc-a0d8-033a61dfc777.json
delete mode 100644 data/hfopenllm_v2/allknowingroger/LimyQstar-7B-slerp/59150b73-b05a-451e-ba3f-696d04effe05.json
delete mode 100644 data/hfopenllm_v2/allknowingroger/Llama3.1-60B/84926b81-360a-480c-b240-f154ec7fe0ba.json
delete mode 100644 data/hfopenllm_v2/allknowingroger/Marco-01-slerp1-7B/8e6edb04-302b-4dfc-b38f-94b437c921a8.json
delete mode 100644 data/hfopenllm_v2/allknowingroger/Meme-7B-slerp/db92c564-1cf9-43db-9e25-1f450c7b1e7f.json
delete mode 100644 data/hfopenllm_v2/allknowingroger/Ministral-8B-slerp/e3796243-cbba-4ec2-ad7c-89547ad24342.json
delete mode 100644 data/hfopenllm_v2/allknowingroger/MistralPhi3-11B/1479be90-df8f-4e1d-b9db-03e84000187a.json
delete mode 100644 data/hfopenllm_v2/allknowingroger/Mistralmash1-7B-s/d2e6c48c-1c18-45a6-ba1a-b335325c980c.json
delete mode 100644 data/hfopenllm_v2/allknowingroger/Mistralmash2-7B-s/f843e45a-f66b-4091-a964-75583c2d7fc5.json
delete mode 100644 data/hfopenllm_v2/allknowingroger/MixTAO-19B-pass/cbc3cd41-e187-4c4f-b207-37bceab423a4.json
delete mode 100644 data/hfopenllm_v2/allknowingroger/MixTaoTruthful-13B-slerp/0f124566-5e94-4233-9a3f-5ff9cfdf160c.json
delete mode 100644 data/hfopenllm_v2/allknowingroger/MultiCalm-7B-slerp/98fabba8-7d70-4a1f-b03c-37e1a9ac94e8.json
delete mode 100644 data/hfopenllm_v2/allknowingroger/MultiMash-12B-slerp/91522dad-529b-477c-8372-793f631e14b7.json
delete mode 100644 data/hfopenllm_v2/allknowingroger/MultiMash10-13B-slerp/cec22734-493c-4d11-ba86-6c7ae2005124.json
delete mode 100644 data/hfopenllm_v2/allknowingroger/MultiMash11-13B-slerp/704a6e19-0d86-42a5-b8f5-05a5856e9c29.json
delete mode 100644 data/hfopenllm_v2/allknowingroger/MultiMash2-12B-slerp/bc54349d-59e0-4ae4-94f9-3f5ae98261f4.json
delete mode 100644 data/hfopenllm_v2/allknowingroger/MultiMash5-12B-slerp/d20d533a-758b-477c-b4eb-073adaed640e.json
delete mode 100644 data/hfopenllm_v2/allknowingroger/MultiMash6-12B-slerp/f7c9ad0d-3fea-4bec-8ac3-46f01a3449fb.json
delete mode 100644 data/hfopenllm_v2/allknowingroger/MultiMash7-12B-slerp/9db1f823-e068-4a39-a5cc-b9c588099427.json
delete mode 100644 data/hfopenllm_v2/allknowingroger/MultiMash8-13B-slerp/23818b45-bf5f-48a2-982f-1e2a0d35aac8.json
delete mode 100644 data/hfopenllm_v2/allknowingroger/MultiMash9-13B-slerp/de6eda66-b8f5-4b23-89e1-44bbac600953.json
delete mode 100644 data/hfopenllm_v2/allknowingroger/MultiMerge-7B-slerp/632974c2-57e2-41f9-8c00-671e07e7594b.json
delete mode 100644 data/hfopenllm_v2/allknowingroger/Multimash3-12B-slerp/e86dcf4f-6282-4aa6-b645-00f93a2e9077.json
delete mode 100644 data/hfopenllm_v2/allknowingroger/Multimerge-19B-pass/b20be5c9-9720-4076-b587-728549dd19af.json
delete mode 100644 data/hfopenllm_v2/allknowingroger/MultiverseEx26-7B-slerp/5e193803-39d1-4f12-8726-ebbe5f71563c.json
delete mode 100644 data/hfopenllm_v2/allknowingroger/NeuralWestSeverus-7B-slerp/61131a6c-f412-42bf-814b-7d711a840d44.json
delete mode 100644 data/hfopenllm_v2/allknowingroger/Neuralcoven-7B-slerp/535e72b1-17e0-40e3-9d66-d31f8ec70413.json
delete mode 100644 data/hfopenllm_v2/allknowingroger/Neuralmultiverse-7B-slerp/ea15479e-24a8-4924-a754-a8567c511e61.json
delete mode 100644 data/hfopenllm_v2/allknowingroger/Ph3della5-14B/5799f285-c61f-43a8-a6a6-053808cf4e8f.json
delete mode 100644 data/hfopenllm_v2/allknowingroger/Ph3merge-14B/36feef44-3d3b-4102-8606-ee6420bddcff.json
delete mode 100644 data/hfopenllm_v2/allknowingroger/Ph3merge2-14B/fd55f19a-2c22-4f29-82e0-15b02f25b9a9.json
delete mode 100644 data/hfopenllm_v2/allknowingroger/Ph3merge3-14B/18e5decd-c95e-43d2-9ba2-007ba32e216f.json
delete mode 100644 data/hfopenllm_v2/allknowingroger/Ph3task1-14B/85a4996e-8c44-4e4f-9478-19a8c5513617.json
delete mode 100644 data/hfopenllm_v2/allknowingroger/Ph3task2-14B/db6d57c8-df0b-407e-b937-67c55b513a5f.json
delete mode 100644 data/hfopenllm_v2/allknowingroger/Ph3task3-14B/89ac933d-0a7c-40e6-8fa7-35bb6205e44b.json
delete mode 100644 data/hfopenllm_v2/allknowingroger/Ph3unsloth-3B-slerp/c79e690f-3e09-4fac-9412-937a3b7ef352.json
delete mode 100644 data/hfopenllm_v2/allknowingroger/Phi3mash1-17B-pass/ce74b7e3-8505-4c79-a7de-12d1e6b47155.json
delete mode 100644 data/hfopenllm_v2/allknowingroger/Quen2-65B/3c562d8a-2df9-4d3f-9699-bfaee4a1ce2b.json
delete mode 100644 data/hfopenllm_v2/allknowingroger/Qwen2.5-42B-AGI/152b0cbe-e27b-4438-8326-e67f4e70e600.json
delete mode 100644 data/hfopenllm_v2/allknowingroger/Qwen2.5-7B-task2/c733c91f-79a9-49e5-9398-3a424ee1940a.json
delete mode 100644 data/hfopenllm_v2/allknowingroger/Qwen2.5-7B-task3/32d7b6c6-de5c-4864-a446-97dccce378c5.json
delete mode 100644 data/hfopenllm_v2/allknowingroger/Qwen2.5-7B-task4/7b22d02b-5bfd-4243-9ad9-c858d0af55a6.json
delete mode 100644 data/hfopenllm_v2/allknowingroger/Qwen2.5-7B-task7/99650529-55d9-42b0-b812-761a30277e5e.json
delete mode 100644 data/hfopenllm_v2/allknowingroger/Qwen2.5-7B-task8/81abbc2a-791b-4a39-bb46-97edfa14b9c0.json
delete mode 100644 data/hfopenllm_v2/allknowingroger/Qwen2.5-slerp-14B/c658e535-7098-40fc-bea0-f5734d8f4ca9.json
delete mode 100644 data/hfopenllm_v2/allknowingroger/QwenSlerp12-7B/9e0656e9-9b82-4f6d-b00a-c09cf9cbc105.json
delete mode 100644 data/hfopenllm_v2/allknowingroger/QwenSlerp4-14B/07c36058-e0e8-48ea-85f3-0a2cb2fe3443.json
delete mode 100644 data/hfopenllm_v2/allknowingroger/QwenSlerp5-14B/c41d8925-b56b-458e-b1a9-27dbbcaee149.json
delete mode 100644 data/hfopenllm_v2/allknowingroger/QwenSlerp6-14B/9136feb4-5c3e-48b3-bc70-c7816b8b189b.json
delete mode 100644 data/hfopenllm_v2/allknowingroger/QwenStock1-14B/c395ef02-9a50-4696-aad2-bcb32ba05f67.json
delete mode 100644 data/hfopenllm_v2/allknowingroger/QwenStock2-14B/93f47969-556a-4fd4-b7bb-4d1c861a8d71.json
delete mode 100644 data/hfopenllm_v2/allknowingroger/QwenStock3-14B/349ae559-6c1f-4b2f-954c-e83cba1e603a.json
delete mode 100644 data/hfopenllm_v2/allknowingroger/Qwenslerp2-14B/3e43c3f6-645b-4ab3-b684-b23eb67bc5d9.json
delete mode 100644 data/hfopenllm_v2/allknowingroger/Qwenslerp2-7B/500c8cd4-fe4e-44f3-86b7-b0efd387ab92.json
delete mode 100644 data/hfopenllm_v2/allknowingroger/Qwenslerp3-14B/340a3ebb-bc06-404f-84e7-aeccc016fd32.json
delete mode 100644 data/hfopenllm_v2/allknowingroger/Qwenslerp3-7B/a6426f88-d7cc-4e6a-a2b5-76e59a52a6de.json
delete mode 100644 data/hfopenllm_v2/allknowingroger/ROGERphi-7B-slerp/bdd05c8f-b895-4c91-9a9f-a608a4259cbd.json
delete mode 100644 data/hfopenllm_v2/allknowingroger/RogerMerge-7B-slerp/0e1e45d4-2747-480d-9b1f-2b200e250271.json
delete mode 100644 data/hfopenllm_v2/allknowingroger/Rombos-LLM-V2.5-Qwen-42b/00f3f9ca-ae7d-4e62-9e7e-6bd202dbed59.json
delete mode 100644 data/hfopenllm_v2/allknowingroger/Strangecoven-7B-slerp/c9e57ab2-c2a4-4935-b976-4bf24647b777.json
delete mode 100644 data/hfopenllm_v2/allknowingroger/Weirdslerp2-25B/c22436a2-ec60-4220-82b3-123618165eb2.json
delete mode 100644 data/hfopenllm_v2/allknowingroger/WestlakeMaziyar-7B-slerp/1f990438-dd84-44d2-99f9-a10035ecd652.json
delete mode 100644 data/hfopenllm_v2/allknowingroger/YamMaths-7B-slerp/f4564f5e-3595-466e-8201-0e2a4c50ff0d.json
delete mode 100644 data/hfopenllm_v2/allknowingroger/Yi-1.5-34B/040def3a-702d-4868-b429-39697ca36207.json
delete mode 100644 data/hfopenllm_v2/allknowingroger/Yi-blossom-40B/9e24fd65-56ec-4160-b299-b34d702a3231.json
delete mode 100644 data/hfopenllm_v2/allknowingroger/Yibuddy-35B/216bf9f8-9521-4311-a40b-8a847271265c.json
delete mode 100644 data/hfopenllm_v2/allknowingroger/Yillama-40B/45f8c4fb-3591-44df-a4f0-57093b9bae23.json
delete mode 100644 data/hfopenllm_v2/allknowingroger/Yislerp-34B/d17275ef-8a32-4fcb-94f4-fb24299ba50e.json
delete mode 100644 data/hfopenllm_v2/allknowingroger/Yislerp2-34B/61b79e7d-0f50-4cfe-825c-ed5b23d943f3.json
delete mode 100644 data/hfopenllm_v2/allknowingroger/Yunconglong-13B-slerp/113c3507-b738-4b06-ada8-da93b19c6ae2.json
delete mode 100644 data/hfopenllm_v2/allknowingroger/limyClown-7B-slerp/8835d5c1-8350-4d42-a753-82b94dffda3b.json
delete mode 100644 data/hfopenllm_v2/allknowingroger/llama3-Jallabi-40B-s/dc3bbda7-5007-44c7-b1ba-af0c82d100ee.json
delete mode 100644 data/hfopenllm_v2/allknowingroger/llama3AnFeng-40B/0d24ee06-a6b4-4be7-b3ef-c4f53b4fc414.json
delete mode 100644 data/hfopenllm_v2/allura-org/L3.1-8b-RP-Ink/f2415b7a-2cd7-4a05-834b-7da992e1da1a.json
delete mode 100644 data/hfopenllm_v2/allura-org/MN-12b-RP-Ink/01af237f-40d8-4841-a90d-13dce6db8634.json
delete mode 100644 data/hfopenllm_v2/allura-org/MS-Meadowlark-22B/d69bb392-fd38-4f57-b567-24566896167b.json
delete mode 100644 data/hfopenllm_v2/allura-org/Mistral-Small-24b-Sertraline-0304/63503943-1c1e-4dac-9c41-4933fbb44b70.json
delete mode 100644 data/hfopenllm_v2/allura-org/Mistral-Small-Sisyphus-24b-2503/80c5d343-41e6-45d7-8921-62586a3cd270.json
delete mode 100644 data/hfopenllm_v2/allura-org/MoE-Girl-1BA-7BT/2c27d7f6-60fd-49f3-8666-784f2a16031b.json
delete mode 100644 data/hfopenllm_v2/allura-org/TQ2.5-14B-Aletheia-v1/cbcc1e64-8455-4382-8999-654d1757bbd6.json
delete mode 100644 data/hfopenllm_v2/allura-org/TQ2.5-14B-Neon-v1/1bea4f6b-7a41-4907-baca-430c7ea179e9.json
delete mode 100644 data/hfopenllm_v2/allura-org/Teleut-7b/298ce89b-966c-4f4e-9da5-3803a395188f.json
delete mode 100644 data/hfopenllm_v2/aloobun/Meta-Llama-3-7B-28Layers/ea27a4d6-8c32-4b36-873d-1046ae6240e5.json
delete mode 100644 data/hfopenllm_v2/aloobun/d-SmolLM2-360M/73d5905d-7825-43ba-8051-7e1f5639b857.json
delete mode 100644 data/hfopenllm_v2/alpindale/WizardLM-2-8x22B/956b8589-a048-43be-9cfd-05658d3c57ca.json
delete mode 100644 data/hfopenllm_v2/alpindale/magnum-72b-v1/36f597b4-8f53-4b40-9c0e-c9284743e456.json
delete mode 100644 data/hfopenllm_v2/altomek/YiSM-34B-0rn/7b67e526-7588-4c62-9293-55e77851c4c7.json
delete mode 100644 data/hfopenllm_v2/amazon/MegaBeam-Mistral-7B-300k/8bc96d6d-0cd7-49c4-8112-7d8fb1c45199.json
delete mode 100644 data/hfopenllm_v2/amd/AMD-Llama-135m/6751a200-0bd9-498e-a991-ebe22375633d.json
delete mode 100644 data/hfopenllm_v2/amd/AMD-Llama-135m/f41442e3-5aa7-4ca4-9e61-a5e13965a3e4.json
delete mode 100644 data/hfopenllm_v2/anakin87/gemma-2b-orpo/b105b62a-ce77-4387-b679-1adf2782b2f4.json
delete mode 100644 data/hfopenllm_v2/anthracite-org/magnum-v1-72b/72180fd7-bf34-4758-b02f-7d11859700c7.json
delete mode 100644 data/hfopenllm_v2/anthracite-org/magnum-v2-12b/ac5aaa9c-79ab-4082-b8c5-084fba3e122a.json
delete mode 100644 data/hfopenllm_v2/anthracite-org/magnum-v2-72b/2d266d7f-8edd-40fd-adfc-597a7742167b.json
delete mode 100644 data/hfopenllm_v2/anthracite-org/magnum-v2.5-12b-kto/484ccbf2-87e2-423f-9de4-a4bd54291b54.json
delete mode 100644 data/hfopenllm_v2/anthracite-org/magnum-v3-27b-kto/4de79504-f9e8-4235-9aad-d38f0799e081.json
delete mode 100644 data/hfopenllm_v2/anthracite-org/magnum-v3-34b/b4bde9d8-f50c-448c-ada4-5bc05f302c04.json
delete mode 100644 data/hfopenllm_v2/anthracite-org/magnum-v3-9b-chatml/5da3240b-b5e3-4333-ba61-925343b56043.json
delete mode 100644 data/hfopenllm_v2/anthracite-org/magnum-v3-9b-customgemma2/d6727b7d-cdf3-48d5-8e30-484e86ad60b6.json
delete mode 100644 data/hfopenllm_v2/anthracite-org/magnum-v4-12b/15b86bbf-8d3b-474b-98f0-abb3972a7271.json
delete mode 100644 data/hfopenllm_v2/anthracite-org/magnum-v4-22b/c0b339f6-4a46-46eb-b2d0-945176afe676.json
delete mode 100644 data/hfopenllm_v2/anthracite-org/magnum-v4-27b/79367289-6245-4bf0-99e9-42bc3ff7649c.json
delete mode 100644 data/hfopenllm_v2/anthracite-org/magnum-v4-9b/c3ec5505-1086-446a-9739-523810e93d13.json
delete mode 100644 data/hfopenllm_v2/apple/DCLM-7B/c6c5e462-d373-4536-afc3-b740fb7e300f.json
delete mode 100644 data/hfopenllm_v2/appvoid/arco-2-instruct/b7537abe-8177-4206-999f-5bb7e95c72c8.json
delete mode 100644 data/hfopenllm_v2/appvoid/arco-2/eb2f6159-e37e-46db-9419-6a66cb7e539e.json
delete mode 100644 data/hfopenllm_v2/arcee-ai/Arcee-Blitz/0b2d0a06-2907-4258-be33-1591e18ac6a2.json
delete mode 100644 data/hfopenllm_v2/arcee-ai/Arcee-Maestro-7B-Preview/0284d867-45c4-4fe4-883c-8e3ea169d66c.json
delete mode 100644 data/hfopenllm_v2/arcee-ai/Arcee-Nova/1a2da513-104e-4074-b3b7-601ab11bf6d8.json
delete mode 100644 data/hfopenllm_v2/arcee-ai/Arcee-Spark/189db16b-5e78-439f-9f79-6eec979c3a79.json
delete mode 100644 data/hfopenllm_v2/arcee-ai/Arcee-Spark/d751f1c5-5505-4c12-8d51-091538b49949.json
delete mode 100644 data/hfopenllm_v2/arcee-ai/Llama-3.1-SuperNova-Lite/b6f9144f-57a0-4c18-9e52-ffccf2d8ca9c.json
delete mode 100644 data/hfopenllm_v2/arcee-ai/Llama-Spark/67dc7fb2-1455-4f60-9dcb-59a8197741d7.json
delete mode 100644 data/hfopenllm_v2/arcee-ai/SuperNova-Medius/7f4ab590-29fa-473a-b617-00135dd1d6ee.json
delete mode 100644 data/hfopenllm_v2/arcee-ai/Virtuoso-Lite/d67db62e-e21d-43c8-8b4c-bfa353e47636.json
delete mode 100644 data/hfopenllm_v2/arcee-ai/Virtuoso-Small-v2/85abff46-8ae5-4a75-9522-721793224363.json
delete mode 100644 data/hfopenllm_v2/arcee-ai/Virtuoso-Small/1736bbd8-4457-4d55-8c0b-0ae6e001ee62.json
delete mode 100644 data/hfopenllm_v2/arcee-ai/raspberry-3B/4777e427-8d17-4e06-8cbf-0883c95bbfd8.json
delete mode 100644 data/hfopenllm_v2/argilla-warehouse/Llama-3.1-8B-MagPie-Ultra/4df0b890-d4c5-408e-8994-88f7383e9235.json
delete mode 100644 data/hfopenllm_v2/argilla/notus-7b-v1/76a5a59d-f5fd-4fb0-849e-7db7772b555a.json
delete mode 100644 data/hfopenllm_v2/argilla/notux-8x7b-v1/6c8399d0-01ce-45cb-a20f-a49e4e760a1e.json
delete mode 100644 data/hfopenllm_v2/arisin/orca-platypus-13B-slerp/92c2c5ee-dfa2-4db3-8401-887d02cc21dd.json
delete mode 100644 data/hfopenllm_v2/arshiaafshani/Arsh-V1/b40ef568-f277-4d5c-87cd-53feaa71598b.json
delete mode 100644 data/hfopenllm_v2/asharsha30/LLAMA_Harsha_8_B_ORDP_10k/893d5149-c535-41c7-8a1a-26bb6b33e407.json
delete mode 100644 data/hfopenllm_v2/ashercn97/a1-v0.0.1/0b649ed5-5af4-4910-b853-2408e3b58f1f.json
delete mode 100644 data/hfopenllm_v2/ashercn97/a1-v002/5c8edeba-5c65-4168-b67e-02143acbcafb.json
delete mode 100644 data/hfopenllm_v2/assskelad/smollm2-360M-sft_SmallThoughts/67e657ef-d602-4f58-b898-874a22f4a009.json
delete mode 100644 data/hfopenllm_v2/athirdpath/Llama-3.1-Instruct_NSFW-pretrained_e1-plus_reddit/53d2bf07-689a-4e69-a534-b288313c8481.json
delete mode 100644 data/hfopenllm_v2/automerger/YamshadowExperiment28-7B/34d6a184-d4d5-4609-8305-c0e2ee1c585b.json
delete mode 100644 data/hfopenllm_v2/avemio/GRAG-NEMO-12B-ORPO-HESSIAN-AI/39b627ab-3e64-42f7-a88d-abe5764fcf4d.json
delete mode 100644 data/hfopenllm_v2/awnr/Mistral-7B-v0.1-signtensors-1-over-2/d8467b15-8a03-4cde-9fc5-5c08bdabb6c6.json
delete mode 100644 data/hfopenllm_v2/awnr/Mistral-7B-v0.1-signtensors-1-over-4/85bc5976-0d40-4416-bbf8-9b1dbf372343.json
delete mode 100644 data/hfopenllm_v2/awnr/Mistral-7B-v0.1-signtensors-3-over-8/8c7e8e64-672e-4c7e-a808-a49f1792d3a8.json
delete mode 100644 data/hfopenllm_v2/awnr/Mistral-7B-v0.1-signtensors-5-over-16/de8651eb-16d1-46ee-a1df-b8c72caaf205.json
delete mode 100644 data/hfopenllm_v2/awnr/Mistral-7B-v0.1-signtensors-7-over-16/6a744db8-814f-4e8e-b6e5-0d096267dfa5.json
delete mode 100644 data/hfopenllm_v2/aws-prototyping/MegaBeam-Mistral-7B-512k/028b7c37-770e-4356-a7c6-0cc74650d5fd.json
delete mode 100644 data/hfopenllm_v2/axolotl-ai-co/romulus-mistral-nemo-12b-simpo/3b399c64-922a-48ba-9a25-862102749647.json
delete mode 100644 data/hfopenllm_v2/baconnier/Napoleon_24B_V0.0/d5e46a11-3e81-457d-9d26-9fd17f96f076.json
delete mode 100644 data/hfopenllm_v2/baconnier/Napoleon_24B_V0.2/b3abfbc1-911a-43b7-a338-efb25f746f9d.json
delete mode 100644 data/hfopenllm_v2/baebee/7B-Cetacea/6b471ee0-9444-45ff-92cf-da624aa59bf6.json
delete mode 100644 data/hfopenllm_v2/baebee/mergekit-model_stock-nzjnheg/b56bd924-0a63-4ca2-8f2f-97b581e47a36.json
delete mode 100644 data/hfopenllm_v2/baebee/mergekit-ties-fnjenli/bfe9098d-7207-4f8c-9a3f-549a29303b5f.json
delete mode 100644 data/hfopenllm_v2/bamec66557/MISCHIEVOUS-12B-Mix_0.1v/7856172d-ec3e-4e71-befe-54952478e330.json
delete mode 100644 data/hfopenllm_v2/bamec66557/MISCHIEVOUS-12B-Mix_0.2v/a68aada5-61bd-4a4c-a8e1-b9a2ace349df.json
delete mode 100644 data/hfopenllm_v2/bamec66557/MISCHIEVOUS-12B-Mix_0.3v/9d19c44f-4912-4c95-ab3f-2dddb055d932.json
delete mode 100644 data/hfopenllm_v2/bamec66557/MISCHIEVOUS-12B-Mix_0.4v/6cef3550-27d7-4073-b4bb-0f19a2c5f553.json
delete mode 100644 data/hfopenllm_v2/bamec66557/MISCHIEVOUS-12B-Mix_0.5v/08ab8f6a-9aaf-4ab4-ada3-eb4a75f46995.json
delete mode 100644 data/hfopenllm_v2/bamec66557/MISCHIEVOUS-12B-Mix_0.6v/622f9379-6a30-43ba-a7a8-fbd08c484fa5.json
delete mode 100644 data/hfopenllm_v2/bamec66557/MISCHIEVOUS-12B-Mix_III_IV_V/24f728e6-de5e-44cc-8b6d-51e0065c1475.json
delete mode 100644 data/hfopenllm_v2/bamec66557/MISCHIEVOUS-12B-Mix_III_ex_V/c3b2bf18-d355-40fc-a862-376c1b988305.json
delete mode 100644 data/hfopenllm_v2/bamec66557/MISCHIEVOUS-12B-Mix_Neo/79474be5-2587-4087-a2cc-1337e3b696dd.json
delete mode 100644 data/hfopenllm_v2/bamec66557/MISCHIEVOUS-12B/22ff2700-70c0-459e-96a2-0ce1710947bc.json
delete mode 100644 data/hfopenllm_v2/bamec66557/Mistral-Nemo-VICIOUS_MESH-12B-2407/7d3a47a3-83d3-4f51-ab72-6a2fa5b5ef80.json
delete mode 100644 data/hfopenllm_v2/bamec66557/NameLess-12B-prob/69dc0f8e-16d7-4907-9741-484eafa62b8c.json
delete mode 100644 data/hfopenllm_v2/bamec66557/VICIOUS_MESH-12B-0.1v/e516abc1-9c3c-4921-a385-e2533d45fed3.json
delete mode 100644 data/hfopenllm_v2/bamec66557/VICIOUS_MESH-12B-0.X.ver/8baa5832-cc07-4a31-a815-0e8151426ea6.json
delete mode 100644 data/hfopenllm_v2/bamec66557/VICIOUS_MESH-12B-ALPHA/509fbca4-f405-4c27-85a9-1eea59025070.json
delete mode 100644 data/hfopenllm_v2/bamec66557/VICIOUS_MESH-12B-BETA/6f45ed56-6bec-4439-9adb-e79fcd74667c.json
delete mode 100644 data/hfopenllm_v2/bamec66557/VICIOUS_MESH-12B-DELTA/512ff924-c1d3-4d75-a468-2bcdcda25cf6.json
delete mode 100644 data/hfopenllm_v2/bamec66557/VICIOUS_MESH-12B-DIGAMMA/86b561ae-c4d3-4293-a884-bcab26df026d.json
delete mode 100644 data/hfopenllm_v2/bamec66557/VICIOUS_MESH-12B-EPSILON/516d1972-9731-4234-a4b3-b96423ebba5c.json
delete mode 100644 data/hfopenllm_v2/bamec66557/VICIOUS_MESH-12B-GAMMA/274f6e02-c81f-4f2e-9747-e5de5cee1933.json
delete mode 100644 data/hfopenllm_v2/bamec66557/VICIOUS_MESH-12B-NEMO/61638b55-296b-40fd-a39f-cc2276d9f94a.json
delete mode 100644 data/hfopenllm_v2/bamec66557/VICIOUS_MESH-12B-OMEGA/11c1b6fe-4815-415b-a4a8-d14073df6ee1.json
delete mode 100644 data/hfopenllm_v2/bamec66557/VICIOUS_MESH-12B-UNION/88e2cb24-288e-4f37-8753-f0daa825051c.json
delete mode 100644 data/hfopenllm_v2/bamec66557/VICIOUS_MESH-12B/8a1a6c44-17fd-402e-a22e-e795a1f612e3.json
delete mode 100644 data/hfopenllm_v2/bamec66557/VICIOUS_MESH-12B_Razor/1121af0b-61fe-424a-bc66-3164bcb1d833.json
delete mode 100644 data/hfopenllm_v2/bamec66557/mergekit-model_stock-zdaysvi/35300d67-7ee1-4874-b351-87f46267cec9.json
delete mode 100644 data/hfopenllm_v2/bamec66557/mergekit-ties-sinbkow/6180b7b3-4b21-42aa-a62d-084a91568b43.json
delete mode 100644 data/hfopenllm_v2/belztjti/dffghgjh/7414d344-0e67-424a-9e16-00de0487ce02.json
delete mode 100644 data/hfopenllm_v2/belztjti/dtfgv/f5fcd407-080c-4cb7-a299-7a7f919c734d.json
delete mode 100644 data/hfopenllm_v2/benhaotang/phi4-qwq-sky-t1/efe03731-6021-4dcf-b7fe-24cbf2d60fac.json
delete mode 100644 data/hfopenllm_v2/beomi/gemma-mling-7b/6ffed624-cc22-4b62-a447-3c02b0e43ded.json
delete mode 100644 data/hfopenllm_v2/beowolx/CodeNinja-1.0-OpenChat-7B/ed867fa8-be8a-49b0-8c94-38085808b58b.json
delete mode 100644 data/hfopenllm_v2/berkeley-nest/Starling-LM-7B-alpha/c8b9a56b-0933-4085-8d5f-a1d8294699db.json
delete mode 100644 data/hfopenllm_v2/bfuzzy1/Gunny/9b178661-ed9a-427d-b93c-b905b8089ad8.json
delete mode 100644 data/hfopenllm_v2/bfuzzy1/acheron-c/69588e07-7559-49c2-9423-19fd143e42f7.json
delete mode 100644 data/hfopenllm_v2/bfuzzy1/acheron-d/317589da-d673-4f90-93e9-59983f2ef54b.json
delete mode 100644 data/hfopenllm_v2/bfuzzy1/acheron-m/efab322e-ea15-4fe7-9bfc-15246003e59c.json
delete mode 100644 data/hfopenllm_v2/bfuzzy1/acheron-m1a-llama/b1eac68e-b292-414b-9594-c921f8e10818.json
delete mode 100644 data/hfopenllm_v2/bfuzzy1/acheron/b7d08c65-8219-4067-9504-99e438a86038.json
delete mode 100644 data/hfopenllm_v2/bfuzzy1/llambses-1/e9c5b479-0dce-4de3-84d6-90c7515337f1.json
delete mode 100644 data/hfopenllm_v2/bhuvneshsaini/merged_model/3c766465-29db-4b3d-b42f-a3222b38a096.json
delete mode 100644 data/hfopenllm_v2/bigcode/starcoder2-15b/e6c85677-61ed-475b-85a5-48b91ec76bcf.json
delete mode 100644 data/hfopenllm_v2/bigcode/starcoder2-3b/7b68fa5e-dbbf-4542-8767-6874aabf8f40.json
delete mode 100644 data/hfopenllm_v2/bigcode/starcoder2-7b/c103b7f4-a432-42d6-86ef-cb369e0c16ff.json
delete mode 100644 data/hfopenllm_v2/bigscience/bloom-1b1/643dda41-37d0-4c1e-b856-58b774612886.json
delete mode 100644 data/hfopenllm_v2/bigscience/bloom-1b7/ba2f284b-d7c6-4748-a8dc-4f80caa30c6c.json
delete mode 100644 data/hfopenllm_v2/bigscience/bloom-3b/16e30aa0-736a-4ef8-8ba6-78285b84546f.json
delete mode 100644 data/hfopenllm_v2/bigscience/bloom-560m/73eb729d-adfd-4dee-9bde-04a31f5528f6.json
delete mode 100644 data/hfopenllm_v2/bigscience/bloom-7b1/0daad2ae-92d0-4522-a067-20332f72c96f.json
delete mode 100644 data/hfopenllm_v2/bluuwhale/L3-SthenoMaid-8B-V1/a3e3849f-a289-4132-b4a8-f67d67ad46a1.json
delete mode 100644 data/hfopenllm_v2/bond005/meno-tiny-0.1/59a9ed26-a67a-4e76-8858-520400c90766.json
delete mode 100644 data/hfopenllm_v2/bosonai/Higgs-Llama-3-70B/6c5c61b4-8037-4b28-8616-1aefa7963eb8.json
delete mode 100644 data/hfopenllm_v2/braindao/DeepSeek-R1-Distill-Qwen-1.5B-Blunt/e9f9b836-fbdf-4996-9b35-2c8145a7f01b.json
delete mode 100644 data/hfopenllm_v2/braindao/DeepSeek-R1-Distill-Qwen-1.5B-Reflective/5b3dae43-5d5c-4d19-bd47-5c0f68ecbb81.json
delete mode 100644 data/hfopenllm_v2/braindao/DeepSeek-R1-Distill-Qwen-14B-ABUB-ST/d5b31b1f-ace0-457f-bf8a-9041398b8344.json
delete mode 100644 data/hfopenllm_v2/braindao/DeepSeek-R1-Distill-Qwen-14B-Blunt-Uncensored-Blunt-Reflective/b34702cf-ffb8-4e75-9c9b-f5c52623d4c8.json
delete mode 100644 data/hfopenllm_v2/braindao/DeepSeek-R1-Distill-Qwen-14B-Blunt-Uncensored-Blunt/c701f1fd-166d-416b-8f78-edf17f2fecd4.json
delete mode 100644 data/hfopenllm_v2/braindao/DeepSeek-R1-Distill-Qwen-14B-Blunt-Uncensored-Reflective/4217b403-e924-4f67-9b0e-ad1d4ed293a1.json
delete mode 100644 data/hfopenllm_v2/braindao/DeepSeek-R1-Distill-Qwen-14B-Blunt-Uncensored/03816e41-5fb8-4815-ab9c-4108ab19a3bc.json
delete mode 100644 data/hfopenllm_v2/braindao/DeepSeek-R1-Distill-Qwen-14B-Blunt/a763b10e-350a-4342-ade3-b782437ca3e2.json
delete mode 100644 data/hfopenllm_v2/braindao/DeepSeek-R1-Distill-Qwen-14B-Reflective/9e806fd2-edbf-40e2-a008-834cee537bb6.json
delete mode 100644 data/hfopenllm_v2/braindao/DeepSeek-R1-Distill-Qwen-14B/fbcf861c-62db-4079-bba6-becd4e231216.json
delete mode 100644 data/hfopenllm_v2/braindao/DeepSeek-R1-Distill-Qwen-7B-Blunt/22b591c0-3386-4bd5-860c-20c0c6001986.json
delete mode 100644 data/hfopenllm_v2/braindao/DeepSeek-R1-Distill-Qwen-7B-ORPO-Uncensored/dfb9a9c4-114e-4188-9940-4d6df7e4815f.json
delete mode 100644 data/hfopenllm_v2/braindao/DeepSeek-R1-Distill-Qwen-7B-Reflective/38fd5f4d-0f3c-4dc2-b250-a9ee7090aac2.json
delete mode 100644 data/hfopenllm_v2/braindao/DeepSeek-R1-Distill-Qwen-7B/e53cbc94-fc9f-4d53-ae28-26bc8c2caef8.json
delete mode 100644 data/hfopenllm_v2/braindao/Qwen2.5-14B-Instruct/2165e69a-c50c-419a-932e-909f53b73b71.json
delete mode 100644 data/hfopenllm_v2/braindao/Qwen2.5-14B/46430a07-15c8-4727-9102-2f471d4f1d3c.json
delete mode 100644 data/hfopenllm_v2/braindao/iq-code-evmind-0.5b/3c7f540a-c850-4e20-ad93-60e021d17133.json
delete mode 100644 data/hfopenllm_v2/brgx53/3Bgeneral-ECE-PRYMMAL-Martial/c3ab4f38-6f7b-4589-ae4f-21ace05b8c44.json
delete mode 100644 data/hfopenllm_v2/brgx53/3Bgeneralv2-ECE-PRYMMAL-Martial/2708c0d6-03e7-4a17-b6b9-e16f3ddcf5bb.json
delete mode 100644 data/hfopenllm_v2/brgx53/3Blareneg-ECE-PRYMMAL-Martial/6427a5ef-8508-430d-970d-054fc485e754.json
delete mode 100644 data/hfopenllm_v2/brgx53/3Blarenegv2-ECE-PRYMMAL-Martial/08984ad9-1e9b-4916-b214-af26dadfcc0b.json
delete mode 100644 data/hfopenllm_v2/brgx53/Barracuda-PRYMMAL-ECE-TW3/1dbb5d03-fdfa-4059-9d50-d037ada6b1ac.json
delete mode 100644 data/hfopenllm_v2/brgx53/LaConfiance-PRYMMAL-ECE-TW3/6bf42faa-c3e9-4069-bf93-ffd626062f0f.json
delete mode 100644 data/hfopenllm_v2/bunnycore/Best-Mix-Llama-3.1-8B/9feccbdc-18eb-4077-b50b-986db0047fc8.json
delete mode 100644 data/hfopenllm_v2/bunnycore/Blabbertron-1.0/a074c33f-782a-409c-987b-7dd62c65ccc7.json
delete mode 100644 data/hfopenllm_v2/bunnycore/Blabbertron-1.1/2f2c0dea-dcd4-4e54-9f40-9fda4b91bd40.json
delete mode 100644 data/hfopenllm_v2/bunnycore/CyberCore-Qwen-2.1-7B/84481fee-3727-427b-912a-30e2744df28a.json
delete mode 100644 data/hfopenllm_v2/bunnycore/DeepQwen-3B-LCoT-SCE/aaa801dc-1a47-4009-9ad4-7129a8d4e651.json
delete mode 100644 data/hfopenllm_v2/bunnycore/DeepSeek-R1-Distill-Qwen-7B-RRP-Ex/3ac92cbf-c85b-4e00-9ef9-4322f961591a.json
delete mode 100644 data/hfopenllm_v2/bunnycore/DeepThinker-7B-Sce-v1/162b511b-4684-4595-9261-a33f3a4117f9.json
delete mode 100644 data/hfopenllm_v2/bunnycore/DeepThinker-7B-Sce-v2/20d5d59a-028d-4e34-9414-d9edaf2e59b8.json
delete mode 100644 data/hfopenllm_v2/bunnycore/FuseCyberMix-Qwen-2.5-7B-Instruct/a21b53fb-783b-440b-9f3d-d8ada3bd18ea.json
delete mode 100644 data/hfopenllm_v2/bunnycore/FuseQwQen-7B/0d2ab1e8-a2d7-45cf-b123-67bcab2d9dff.json
delete mode 100644 data/hfopenllm_v2/bunnycore/FwF-Qwen-7B-0.1/6b4a37c8-c7e6-4156-9d6d-8cba51b74d82.json
delete mode 100644 data/hfopenllm_v2/bunnycore/FwF-Qwen-7B-0.2/78582fec-2f69-4b37-8497-12ceb097b44b.json
delete mode 100644 data/hfopenllm_v2/bunnycore/Gemma-2-2B-Smart/949bf65e-c2ae-4701-82f0-39d0c62a0e87.json
delete mode 100644 data/hfopenllm_v2/bunnycore/Gemma2-9B-TitanFusion/8812151c-4301-4131-a414-d64d025e476e.json
delete mode 100644 data/hfopenllm_v2/bunnycore/HyperLlama-3.1-8B/2db1542f-a8da-4fb8-91a5-6dd1a942b55e.json
delete mode 100644 data/hfopenllm_v2/bunnycore/Llama-3.1-8B-TitanFusion-Mix/9feeffb2-3763-4e43-933e-89100b76f7fa.json
delete mode 100644 data/hfopenllm_v2/bunnycore/Llama-3.1-8B-TitanFusion-v3/721102b5-ed5e-4631-8600-a6adfff0c784.json
delete mode 100644 data/hfopenllm_v2/bunnycore/Llama-3.2-3B-All-Mix/18c185f7-5ca4-46ff-81c2-6c538f096409.json
delete mode 100644 data/hfopenllm_v2/bunnycore/Llama-3.2-3B-Bespoke-Thought/7ab5911c-e229-43e5-a798-095287d0a597.json
delete mode 100644 data/hfopenllm_v2/bunnycore/Llama-3.2-3B-Booval/f800c4e5-e918-45bb-8a12-3ca2a64c6b23.json
delete mode 100644 data/hfopenllm_v2/bunnycore/Llama-3.2-3B-Deep-Test/5fcf41bc-30dc-46a7-9cf2-4ce2c7a5850c.json
delete mode 100644 data/hfopenllm_v2/bunnycore/Llama-3.2-3B-Deep-Test/d4b20ef4-734e-40a7-818e-f77e170d7437.json
delete mode 100644 data/hfopenllm_v2/bunnycore/Llama-3.2-3B-Della/e0996c96-c9e5-4d39-8e6d-1455ef1f9544.json
delete mode 100644 data/hfopenllm_v2/bunnycore/Llama-3.2-3B-Long-Think/3ad2b31e-ce2a-4cb4-9b85-79cdebd5d364.json
delete mode 100644 data/hfopenllm_v2/bunnycore/Llama-3.2-3B-Mix-Skill/9aff874c-1953-4b97-9bff-9e6120b0bfa7.json
delete mode 100644 data/hfopenllm_v2/bunnycore/Llama-3.2-3B-ProdigyPlus/45ae7f45-8c36-46c6-989d-bc672cdf8eff.json
delete mode 100644 data/hfopenllm_v2/bunnycore/Llama-3.2-3B-ProdigyPlusPlus/7d36e44e-a329-4b96-a891-365ad900f718.json
delete mode 100644 data/hfopenllm_v2/bunnycore/Llama-3.2-3B-RP-DeepThink/a8c26325-1eec-43a6-a8ad-3bcb2e378924.json
delete mode 100644 data/hfopenllm_v2/bunnycore/Llama-3.2-3B-RRStock/bde1a879-6852-42ce-9217-f427af85a46a.json
delete mode 100644 data/hfopenllm_v2/bunnycore/Llama-3.2-3B-ToxicKod/dd7a0377-f4d6-4390-b9f2-bf50b05ec0f7.json
delete mode 100644 data/hfopenllm_v2/bunnycore/Llama-3.2-3b-RP-Toxic-Fuse/12cbf241-d6d4-4d25-ad3d-13a42d7adc74.json
delete mode 100644 data/hfopenllm_v2/bunnycore/Maestro-S1k-7B-Sce/1f66fd7c-40ee-4249-8963-5c7bb93a3eaf.json
delete mode 100644 data/hfopenllm_v2/bunnycore/Phi-3.5-mini-TitanFusion-0.1/7076406b-7e0a-49c7-8150-2e6a243aa23b.json
delete mode 100644 data/hfopenllm_v2/bunnycore/Phi-4-Model-Stock-v2/96c3fd80-a601-4629-a1ab-bf7f366a909a.json
delete mode 100644 data/hfopenllm_v2/bunnycore/Phi-4-Model-Stock-v3/1302c9a5-d35c-400c-b9f3-d990243e5d59.json
delete mode 100644 data/hfopenllm_v2/bunnycore/Phi-4-Model-Stock-v4/c7f48bbf-6583-4ddd-ae4d-671c43218dae.json
delete mode 100644 data/hfopenllm_v2/bunnycore/Phi-4-Model-Stock/5f07e092-2eb0-44c2-b2ce-5f1b31a9ea99.json
delete mode 100644 data/hfopenllm_v2/bunnycore/Phi-4-RP-v0/15701682-97ce-46cf-8010-a6bdeaf8c7aa.json
delete mode 100644 data/hfopenllm_v2/bunnycore/Phi-4-RR-Shoup/c6eecf0b-fa16-484a-8eeb-d196203b3c3e.json
data/hfopenllm_v2/bunnycore/Phi-4-RStock-v0.1/4337b1c1-cc00-4a15-8148-e8d0739561b9.json delete mode 100644 data/hfopenllm_v2/bunnycore/Phi-4-ReasoningRP/1151ee14-8fe9-4f97-808d-8103b353c2ec.json delete mode 100644 data/hfopenllm_v2/bunnycore/Phi-4-Sce-exp-v0.1/a2c18179-aca3-422c-b9f5-8345109cea13.json delete mode 100644 data/hfopenllm_v2/bunnycore/Phi-4-Stock-Ex/07495d34-1505-45a9-bb48-887af0da8a0c.json delete mode 100644 data/hfopenllm_v2/bunnycore/Phi-4-Stock-RP/567baf6d-99f9-46a5-8c40-c6899986f1ff.json delete mode 100644 data/hfopenllm_v2/bunnycore/Phi-4-Trim-Exp1/a337df3a-28ff-46c9-adae-4bc029937101.json delete mode 100644 data/hfopenllm_v2/bunnycore/Phi-Seek-4-Sce-V1/b201a849-44e9-4598-918b-ffa27c894ee9.json delete mode 100644 data/hfopenllm_v2/bunnycore/Qandora-2.5-7B-Creative/dd87ebf3-3088-43b1-851c-a97d12a68ea8.json delete mode 100644 data/hfopenllm_v2/bunnycore/QandoraExp-7B-Persona/1b3ef805-8b0c-44bf-b048-773a0dd94d0d.json delete mode 100644 data/hfopenllm_v2/bunnycore/QandoraExp-7B-v2/220cb478-58c0-4028-b51a-ec5fe1050746.json delete mode 100644 data/hfopenllm_v2/bunnycore/QandoraExp-7B/17cb8ab1-e7ba-4daf-95d4-2cdbd2777434.json delete mode 100644 data/hfopenllm_v2/bunnycore/QwQen-3B-LCoT-R1/2b55023b-b8bc-42a2-aca8-dcaf39890232.json delete mode 100644 data/hfopenllm_v2/bunnycore/QwQen-3B-LCoT/31736569-5992-4b1d-9d66-27a6c1620506.json delete mode 100644 data/hfopenllm_v2/bunnycore/Qwen-2.5-7B-Deep-Sky-T1/630b37b5-351c-403c-ac76-ccb68ffc5d53.json delete mode 100644 data/hfopenllm_v2/bunnycore/Qwen-2.5-7B-Deep-Stock-v1/69cdef01-30dc-4f75-97fa-9daeebcec72f.json delete mode 100644 data/hfopenllm_v2/bunnycore/Qwen-2.5-7B-Deep-Stock-v4/9aa1acb0-c791-4dea-aa1e-c912cea69466.json delete mode 100644 data/hfopenllm_v2/bunnycore/Qwen-2.5-7B-Deep-Stock-v5/0c1d66f3-8fd7-47f2-8538-a1aa8985aebf.json delete mode 100644 data/hfopenllm_v2/bunnycore/Qwen-2.5-7B-Exp-Sce/2872dcd9-421b-4346-812c-b27bb32c6e86.json delete mode 100644 data/hfopenllm_v2/bunnycore/Qwen-2.5-7B-R1-Stock/2f3e2fc0-f1e0-43cb-8a8c-6aadcc538646.json delete mode 100644 data/hfopenllm_v2/bunnycore/Qwen-2.5-7B-Stock-Deep-Bespoke/d0a76497-84b0-45b9-b748-04ffe9bc13a3.json delete mode 100644 data/hfopenllm_v2/bunnycore/Qwen-2.5-7b-S1k/185b6560-6790-417f-aeba-f7405fee808a.json delete mode 100644 data/hfopenllm_v2/bunnycore/Qwen2.5-1.5B-Model-Stock/30a8074e-df03-4866-9b8d-a5a7eece3c71.json delete mode 100644 data/hfopenllm_v2/bunnycore/Qwen2.5-3B-Model-Stock-v2/ac8874ae-d6d6-45d3-aabc-06a3852f68d0.json delete mode 100644 data/hfopenllm_v2/bunnycore/Qwen2.5-3B-Model-Stock-v3.1/bc98b048-18d4-438e-80c4-0cd851798da5.json delete mode 100644 data/hfopenllm_v2/bunnycore/Qwen2.5-3B-Model-Stock-v3.2/c88c011f-0a24-4e78-a104-035d25af2430.json delete mode 100644 data/hfopenllm_v2/bunnycore/Qwen2.5-3B-Model-Stock-v4.1/f9e3c31c-02c0-4f5e-ad4f-3be0801a0f41.json delete mode 100644 data/hfopenllm_v2/bunnycore/Qwen2.5-3B-Model-Stock/5484405a-2ec8-4515-af75-76a5dd348d3d.json delete mode 100644 data/hfopenllm_v2/bunnycore/Qwen2.5-3B-RP-Mix/7dc117b9-c2a2-44c1-8471-f3bc8a116e3e.json delete mode 100644 data/hfopenllm_v2/bunnycore/Qwen2.5-3B-RP-Thinker-V2/e2d314dd-b5b3-49b5-8e64-1e3464f4b963.json delete mode 100644 data/hfopenllm_v2/bunnycore/Qwen2.5-3B-RP-Thinker/7ecb453b-1ba7-44ec-abfd-1f8be4c817fd.json delete mode 100644 data/hfopenllm_v2/bunnycore/Qwen2.5-7B-CyberRombos/d0a70e95-fc72-41c6-ac42-09b8f379b566.json delete mode 100644 data/hfopenllm_v2/bunnycore/Qwen2.5-7B-Fuse-Exp/e2ef8ea6-b464-445e-81df-ef0779c1d0d4.json delete mode 100644 
data/hfopenllm_v2/bunnycore/Qwen2.5-7B-Instruct-Fusion/f3d7cca2-141c-4b84-abc4-396ad2d59e3c.json delete mode 100644 data/hfopenllm_v2/bunnycore/Qwen2.5-7B-Instruct-Merge-Stock-v0.1/e3f48d7a-c8a3-4e75-99d6-7f2946696b12.json delete mode 100644 data/hfopenllm_v2/bunnycore/Qwen2.5-7B-MixStock-Sce-V0.3/3feb9449-49a2-427f-a317-c21e6d1ca66c.json delete mode 100644 data/hfopenllm_v2/bunnycore/Qwen2.5-7B-MixStock-V0.1/6359e37e-0405-436b-903c-8f0e740dd6c7.json delete mode 100644 data/hfopenllm_v2/bunnycore/Qwen2.5-7B-R1-Bespoke-Stock/f5daed76-f6e5-4a7d-84d7-80537a046b83.json delete mode 100644 data/hfopenllm_v2/bunnycore/Qwen2.5-7B-R1-Bespoke-Task/03af2b1d-989f-4afc-ab13-8793093b9c50.json delete mode 100644 data/hfopenllm_v2/bunnycore/Qwen2.5-7B-RRP-1M-Thinker/5db7ec54-7feb-4c11-b2e0-042226ba1f94.json delete mode 100644 data/hfopenllm_v2/bunnycore/Qwen2.5-7B-RRP-1M/f1f5615d-8a78-43c9-b5c6-edc180252381.json delete mode 100644 data/hfopenllm_v2/bunnycore/Qwen2.5-7B-RRP-ID/9c89bf8f-4b8a-4c01-8685-fafc687c673e.json delete mode 100644 data/hfopenllm_v2/bunnycore/Qwen2.5-7B-Sky-R1-Mini/58b69c0f-826d-414f-915e-dd0b78d9298c.json delete mode 100644 data/hfopenllm_v2/bunnycore/QwenMosaic-7B/101ea548-2ffe-4f47-b3b5-5fbe9a3854b4.json delete mode 100644 data/hfopenllm_v2/bunnycore/Smol-Llama-3.2-3B/259c4798-ff03-4f58-8fb4-59150710212b.json delete mode 100644 data/hfopenllm_v2/bunnycore/SmolLM2-1.7-Persona/f731caa1-f777-494a-8490-da0c815f0708.json delete mode 100644 data/hfopenllm_v2/bunnycore/SmolLM2-1.7B-roleplay-lora/d4d25d38-b21a-490e-9ca9-556504ec00ea.json delete mode 100644 data/hfopenllm_v2/bunnycore/Tulu-3.1-8B-SuperNova/75bb85a3-40bb-4630-95a0-50e40b008412.json delete mode 100644 data/hfopenllm_v2/byroneverson/Mistral-Small-Instruct-2409-abliterated/bb44f3ef-eefa-48ef-a257-2eb345c89a00.json delete mode 100644 data/hfopenllm_v2/byroneverson/Yi-1.5-9B-Chat-16K-abliterated/2dcf1771-3dbe-43ad-974c-54e2e2860bcc.json delete mode 100644 data/hfopenllm_v2/byroneverson/Yi-1.5-9B-Chat-abliterated/caa0c8df-5488-4bf9-a5b8-0fff831e6732.json delete mode 100644 data/hfopenllm_v2/c10x/Q-Pluse/c6f8e581-e849-4e28-b3a6-1838ee522770.json delete mode 100644 data/hfopenllm_v2/c10x/longthinker/f0c361a1-a3ac-4415-ab5d-069bdf27e7a3.json delete mode 100644 data/hfopenllm_v2/carsenk/flippa-v6/44129be7-f73d-4580-8375-e8ef324e73a8.json delete mode 100644 data/hfopenllm_v2/carsenk/phi3.5_mini_exp_825_uncensored/2925ecde-a9a5-4369-b391-d23a8605d35c.json delete mode 100644 data/hfopenllm_v2/cat-searcher/gemma-2-9b-it-sppo-iter-1-evol-1/8409e464-fd16-4b41-b533-2f6cae4fe894.json delete mode 100644 data/hfopenllm_v2/cat-searcher/gemma-2-9b-it-sppo-iter-1/86f6c6eb-8b08-4e6c-a1bc-0d941a00f10b.json delete mode 100644 data/hfopenllm_v2/cckm/tinymistral_950m/aa2e6df7-a0b0-42f7-8057-e2763fc34834.json delete mode 100644 data/hfopenllm_v2/cgato/TheSalt-L3-8b-v0.3.2/2bf9a06e-f3bf-4b55-804b-e553a722e0de.json delete mode 100644 data/hfopenllm_v2/chargoddard/prometheus-2-llama-3-8b/b380a675-39ea-4950-ad0a-d9771f09ddde.json delete mode 100644 data/hfopenllm_v2/chujiezheng/Llama-3-Instruct-8B-SimPO-ExPO/482358eb-7d3b-4de0-b5d9-451308f104e2.json delete mode 100644 data/hfopenllm_v2/chujiezheng/Mistral7B-PairRM-SPPO-ExPO/ef04a83d-7b89-43ec-ba33-30e1006422dc.json delete mode 100644 data/hfopenllm_v2/cjvt/GaMS-1B/7b64cf2e-c7c6-4b48-8e51-ea2aa0914145.json delete mode 100644 data/hfopenllm_v2/cloudyu/Llama-3-70Bx2-MOE/52c8e3f4-1063-4d9c-80d9-fdd0a72fc98e.json delete mode 100644 
data/hfopenllm_v2/cloudyu/Llama-3.2-3Bx4/1f4a827d-31cd-42e6-871d-7c0cad010f58.json delete mode 100644 data/hfopenllm_v2/cloudyu/Mixtral_11Bx2_MoE_19B/56d6d99c-fba1-42e7-aad4-631370b44da3.json delete mode 100644 data/hfopenllm_v2/cloudyu/Mixtral_34Bx2_MoE_60B/006a0ac7-d6c3-42c1-b0cc-6a0bfe74f884.json delete mode 100644 data/hfopenllm_v2/cloudyu/Mixtral_7Bx2_MoE/33a82686-6202-4a4d-ba34-bd4537105e5f.json delete mode 100644 data/hfopenllm_v2/cloudyu/S1-Llama-3.2-3Bx4-MoE/38d45554-44bd-4b40-b7c9-c0b7ba44b862.json delete mode 100644 data/hfopenllm_v2/cloudyu/Yi-34Bx2-MoE-60B-DPO/37d7e3ab-db9c-4ad7-81d1-933c030a6250.json delete mode 100644 data/hfopenllm_v2/cluebbers/Llama-3.1-8B-paraphrase-type-generation-apty-ipo/9cc49b3c-4e51-4f67-92ea-4ac8a3cbed43.json delete mode 100644 data/hfopenllm_v2/cluebbers/Llama-3.1-8B-paraphrase-type-generation-apty-sigmoid/b6bd8515-4c95-40ce-b2d5-af8873d261ab.json delete mode 100644 data/hfopenllm_v2/cluebbers/Llama-3.1-8B-paraphrase-type-generation-etpc/d102e75d-3e20-482b-a243-bae3ec44e2bb.json delete mode 100644 data/hfopenllm_v2/cognitivecomputations/Dolphin3.0-Llama3.1-8B/68920da1-af71-4ccd-88b9-554e3c72c4dc.json delete mode 100644 data/hfopenllm_v2/cognitivecomputations/Dolphin3.0-Llama3.2-1B/c0eb144f-c726-4a80-bce9-384fb7a641a7.json delete mode 100644 data/hfopenllm_v2/cognitivecomputations/Dolphin3.0-Qwen2.5-0.5B/0b26f82d-36f6-4fd0-a0fd-05e4a1368a6e.json delete mode 100644 data/hfopenllm_v2/cognitivecomputations/Dolphin3.0-R1-Mistral-24B/8fe4360a-0924-4386-b4cd-89069f7ff55f.json delete mode 100644 data/hfopenllm_v2/cognitivecomputations/dolphin-2.9-llama3-8b/eeeb082b-7112-4a08-a87a-b2c9ae37efff.json delete mode 100644 data/hfopenllm_v2/cognitivecomputations/dolphin-2.9.1-llama-3-70b/b8f933e9-867f-4934-9648-371d1e632116.json delete mode 100644 data/hfopenllm_v2/cognitivecomputations/dolphin-2.9.1-yi-1.5-34b/8d225023-4b7e-48cd-ae67-6d00b541f17d.json delete mode 100644 data/hfopenllm_v2/cognitivecomputations/dolphin-2.9.1-yi-1.5-9b/ee3b45e7-a5d6-4fa8-8abd-f6a77d5a6d5b.json delete mode 100644 data/hfopenllm_v2/cognitivecomputations/dolphin-2.9.2-Phi-3-Medium-abliterated/177ef040-da5c-4a65-adac-efdc555bd110.json delete mode 100644 data/hfopenllm_v2/cognitivecomputations/dolphin-2.9.2-Phi-3-Medium-abliterated/e9dc8337-eb35-4eb9-bca7-30ec1cd44092.json delete mode 100644 data/hfopenllm_v2/cognitivecomputations/dolphin-2.9.2-Phi-3-Medium/f4549a39-0b28-4e06-998a-774f5f02cfba.json delete mode 100644 data/hfopenllm_v2/cognitivecomputations/dolphin-2.9.2-qwen2-72b/a79af78a-adab-406f-995a-adb3893e1510.json delete mode 100644 data/hfopenllm_v2/cognitivecomputations/dolphin-2.9.2-qwen2-7b/4e8e457a-85eb-4afb-a9fe-8f8ce6eaf4d7.json delete mode 100644 data/hfopenllm_v2/cognitivecomputations/dolphin-2.9.3-Yi-1.5-34B-32k/eeb3a10a-d584-414a-90de-e018c47615c2.json delete mode 100644 data/hfopenllm_v2/cognitivecomputations/dolphin-2.9.3-mistral-7B-32k/e83dadb0-5092-48b8-b408-e6bb1ac8a0ba.json delete mode 100644 data/hfopenllm_v2/cognitivecomputations/dolphin-2.9.3-mistral-nemo-12b/cebc7767-fbc9-45a2-808b-51e1a4f0f35c.json delete mode 100644 data/hfopenllm_v2/cognitivecomputations/dolphin-2.9.4-gemma2-2b/b64b6416-b18b-47cc-a516-c613cd670b37.json delete mode 100644 data/hfopenllm_v2/cognitivecomputations/dolphin-2.9.4-llama3.1-8b/64e96d56-72a9-413f-8903-45821b98f71e.json delete mode 100644 data/hfopenllm_v2/collaiborateorg/Collaiborator-MEDLLM-Llama-3-8B-v2/a3f44cfd-d1fc-4a3c-aa5b-a0f37fc4a192.json delete mode 100644 
data/hfopenllm_v2/cpayne1303/cp2024-instruct/79314f48-d92b-4992-b3c6-d31278c0867a.json delete mode 100644 data/hfopenllm_v2/cpayne1303/cp2024/5a007612-c8e7-4f6b-baa9-a21af7e908c6.json delete mode 100644 data/hfopenllm_v2/cpayne1303/llama-43m-beta/fdefdd3e-2d83-4430-bd95-e16a1935dff1.json delete mode 100644 data/hfopenllm_v2/cpayne1303/llama-43m-beta/ffdd45bf-3409-4b92-909a-25a32ba27f82.json delete mode 100644 data/hfopenllm_v2/cpayne1303/smallcp2024/a78ab8ac-2c2e-405a-95ee-0d1d27cf533b.json delete mode 100644 data/hfopenllm_v2/crestf411/MN-Slush/d9d49bf7-f6f0-4c25-9182-d815454940e3.json delete mode 100644 data/hfopenllm_v2/cstr/llama3.1-8b-spaetzle-v90/deb48e93-0378-482f-8a5d-7ec350497e0b.json delete mode 100644 data/hfopenllm_v2/cyberagent/calm3-22b-chat/302a9a47-8603-42d9-85fb-64c60e7c6f44.json delete mode 100644 data/hfopenllm_v2/darkc0de/BuddyGlassNeverSleeps/28d52801-3998-421f-a37a-2b7b677d0eaa.json delete mode 100644 data/hfopenllm_v2/darkc0de/BuddyGlassUncensored2025.2/32b4e23b-9430-45a8-bfa2-eea2e89792c4.json delete mode 100644 data/hfopenllm_v2/darkc0de/BuddyGlass_v0.3_Xortron7MethedUpSwitchedUp/0336e168-e313-44cb-a030-42e6d20e92df.json delete mode 100644 data/hfopenllm_v2/databricks/dbrx-base/11bd8b5b-2ea4-4ec5-8fe6-654aedb40fc9.json delete mode 100644 data/hfopenllm_v2/databricks/dbrx-instruct/6d97749c-3bfa-4c32-b581-a5e2b73303f3.json delete mode 100644 data/hfopenllm_v2/databricks/dolly-v1-6b/ec58907d-b67c-467e-a3dd-b9f9c10138f0.json delete mode 100644 data/hfopenllm_v2/databricks/dolly-v2-12b/a7f09a3d-025c-48fa-9358-863b9ae382b1.json delete mode 100644 data/hfopenllm_v2/databricks/dolly-v2-3b/bf2be2d5-58de-4550-b733-a5910bded48d.json delete mode 100644 data/hfopenllm_v2/databricks/dolly-v2-7b/52b32c1f-6189-4850-b3f4-de442eb2ccb5.json delete mode 100644 data/hfopenllm_v2/davidkim205/Rhea-72b-v0.5/87b44160-c3dd-452d-8c15-c4f758f8db7b.json delete mode 100644 data/hfopenllm_v2/davidkim205/nox-solar-10.7b-v4/3e6814d3-54ea-493f-a9fc-85ae9eed1b05.json delete mode 100644 data/hfopenllm_v2/deepseek-ai/DeepSeek-R1-Distill-Llama-70B/35b7ff42-3825-4240-97bf-f8af7e8c23ff.json delete mode 100644 data/hfopenllm_v2/deepseek-ai/DeepSeek-R1-Distill-Llama-8B/c108173e-1582-4c99-9291-46986d7ba1cf.json delete mode 100644 data/hfopenllm_v2/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B/6feb08b0-1c67-4fe2-a001-0b3b84529687.json delete mode 100644 data/hfopenllm_v2/deepseek-ai/DeepSeek-R1-Distill-Qwen-14B/d4ab3df2-109a-4eec-9742-dc3bb79d5a58.json delete mode 100644 data/hfopenllm_v2/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B/53ec995e-bcfd-4a72-bd9a-45d14da3f219.json delete mode 100644 data/hfopenllm_v2/deepseek-ai/DeepSeek-R1-Distill-Qwen-7B/299a0397-89c7-4329-9599-9fc29a52db87.json delete mode 100644 data/hfopenllm_v2/deepseek-ai/deepseek-llm-67b-chat/41adbc32-6cdf-49ba-980c-6eb6f722b40b.json delete mode 100644 data/hfopenllm_v2/deepseek-ai/deepseek-llm-7b-base/4236ece5-f2b2-44e7-9503-9731bff20155.json delete mode 100644 data/hfopenllm_v2/deepseek-ai/deepseek-llm-7b-chat/b33d672c-4a96-4093-bc13-25c42303b918.json delete mode 100644 data/hfopenllm_v2/deepseek-ai/deepseek-moe-16b-base/2b4f42fc-8b25-481c-98f7-911c52fdd242.json delete mode 100644 data/hfopenllm_v2/deepseek-ai/deepseek-moe-16b-chat/634b7a64-2bd3-48b8-b2f4-a93189801850.json delete mode 100644 data/hfopenllm_v2/dfurman/CalmeRys-78B-Orpo-v0.1/72a4bcc3-9dfc-4268-be4e-cda5837a3da2.json delete mode 100644 data/hfopenllm_v2/dfurman/Llama-3-70B-Orpo-v0.1/78fa85f6-baff-4d95-ad3a-a0663f51b0a0.json delete mode 100644 
data/hfopenllm_v2/dfurman/Llama-3-8B-Orpo-v0.1/359231a5-6eb9-4f73-a6f1-d7fd7f35c7ed.json delete mode 100644 data/hfopenllm_v2/dfurman/Llama-3-8B-Orpo-v0.1/79b81e37-f75e-4b18-b145-73c42625ced5.json delete mode 100644 data/hfopenllm_v2/dfurman/Qwen2-72B-Orpo-v0.1/2d99af7a-f67c-4e74-9ba2-f1401dfdf9fb.json delete mode 100644 data/hfopenllm_v2/dicta-il/dictalm2.0-instruct/315fa815-fab0-47c9-8185-00bc597c0176.json delete mode 100644 data/hfopenllm_v2/dicta-il/dictalm2.0/0c1686db-b396-4ecf-86f1-e4e092491acd.json delete mode 100644 data/hfopenllm_v2/distilbert/distilgpt2/57455fbc-b5a9-4a3b-9a30-7da0593fd778.json delete mode 100644 data/hfopenllm_v2/divyanshukunwar/SASTRI_1_9B/a8f9d0e6-5a1a-4d09-ac78-47fd586384df.json delete mode 100644 data/hfopenllm_v2/djuna-test-lab/TEST-L3.2-ReWish-3B-ties-w-base/9d0d4eee-0b87-485c-843f-e32d08aa601b.json delete mode 100644 data/hfopenllm_v2/djuna-test-lab/TEST-L3.2-ReWish-3B/e47c83ff-9a16-488b-8ccf-4a2fad2b14fc.json delete mode 100644 data/hfopenllm_v2/djuna/G2-BigGSHT-27B-2/8c7e25df-884d-4940-8185-4c1b82fac8c5.json delete mode 100644 data/hfopenllm_v2/djuna/G2-GSHT/83611d50-01d0-4642-a104-daf77f1a0fe8.json delete mode 100644 data/hfopenllm_v2/djuna/Gemma-2-gemmama-9b/5cbdafba-6071-4da1-8b19-3de612e9ff18.json delete mode 100644 data/hfopenllm_v2/djuna/L3.1-ForStHS/1c934cba-c94a-4aad-9645-84658e0b5588.json delete mode 100644 data/hfopenllm_v2/djuna/L3.1-Promissum_Mane-8B-Della-1.5-calc/7aad3f6b-89d9-4c9e-9339-cf4111fc37c6.json delete mode 100644 data/hfopenllm_v2/djuna/L3.1-Promissum_Mane-8B-Della-calc/38d4a8ca-4273-4e6a-8a39-3b5ff20ec461.json delete mode 100644 data/hfopenllm_v2/djuna/L3.1-Purosani-2-8B/3d65fbc2-bf91-479c-a687-e9ef702794fb.json delete mode 100644 data/hfopenllm_v2/djuna/L3.1-Suze-Vume-calc/650cdbbb-e066-4581-8d61-77aa6a4c402c.json delete mode 100644 data/hfopenllm_v2/djuna/MN-Chinofun-12B-2/05d566c5-1810-483c-8ce0-84635b9457dc.json delete mode 100644 data/hfopenllm_v2/djuna/MN-Chinofun-12B-3/37e3456a-92ff-4122-a697-ffbdc1c79555.json delete mode 100644 data/hfopenllm_v2/djuna/MN-Chinofun-12B-4/70c908d4-f1bf-4553-9bf7-95eb593b4853.json delete mode 100644 data/hfopenllm_v2/djuna/MN-Chinofun/2ccc9c20-5414-4286-abcd-ad2b20f8652d.json delete mode 100644 data/hfopenllm_v2/djuna/Q2.5-Partron-7B/50f4560a-e172-42b9-b552-437aff158a38.json delete mode 100644 data/hfopenllm_v2/djuna/Q2.5-Veltha-14B-0.5/c6a3abac-8a34-4725-915b-c27c3d0bc484.json delete mode 100644 data/hfopenllm_v2/djuna/Q2.5-Veltha-14B/a8ed68ea-6463-4ff9-9dcd-034080272dec.json delete mode 100644 data/hfopenllm_v2/dnhkng/RYS-Llama-3-8B-Instruct/5799ce8b-c00d-49f6-96dc-f7dd057a268c.json delete mode 100644 data/hfopenllm_v2/dnhkng/RYS-Llama-3-Huge-Instruct/0d261023-3e35-4160-98ca-241bbaee927e.json delete mode 100644 data/hfopenllm_v2/dnhkng/RYS-Llama-3-Large-Instruct/f0454d3b-18b4-488a-94dd-fb24729996c7.json delete mode 100644 data/hfopenllm_v2/dnhkng/RYS-Llama-3.1-8B-Instruct/6bafa7a7-3a2a-4141-9564-a762d1cdb1d0.json delete mode 100644 data/hfopenllm_v2/dnhkng/RYS-Llama3.1-Large/37f20f86-40ba-4f63-b29d-efff6cb0e09b.json delete mode 100644 data/hfopenllm_v2/dnhkng/RYS-Medium/bf0e7ce4-09e9-4879-993a-eb50b2a421d7.json delete mode 100644 data/hfopenllm_v2/dnhkng/RYS-Phi-3-medium-4k-instruct/bcbc29f7-ea03-4dbe-a83e-d4940b2c6bea.json delete mode 100644 data/hfopenllm_v2/dnhkng/RYS-XLarge-base/cbea8d66-0370-4998-8e3a-06fef0a60f0c.json delete mode 100644 data/hfopenllm_v2/dnhkng/RYS-XLarge/ca48b670-b82e-46cc-beb9-2fd0f11d3585.json delete mode 100644 
data/hfopenllm_v2/dnhkng/RYS-XLarge2/d37f99f7-f9c3-48b6-84d3-7da5d77f5030.json delete mode 100644 data/hfopenllm_v2/dreamgen/WizardLM-2-7B/503c8a24-4ced-4dca-b9df-5733ce89c2ca.json delete mode 100644 data/hfopenllm_v2/dustinwloring1988/Reflexis-8b-chat-v1/5c5283a0-819f-4112-bb90-5277423d9c00.json delete mode 100644 data/hfopenllm_v2/dustinwloring1988/Reflexis-8b-chat-v2/b636bc82-1625-49b1-beec-cadaf4e1b1a9.json delete mode 100644 data/hfopenllm_v2/dustinwloring1988/Reflexis-8b-chat-v3/00f481c1-0ef0-40bd-bd95-81dc9443a62c.json delete mode 100644 data/hfopenllm_v2/dustinwloring1988/Reflexis-8b-chat-v4/7ea22fef-2d79-49ae-bf72-9153a4e239c5.json delete mode 100644 data/hfopenllm_v2/dustinwloring1988/Reflexis-8b-chat-v5/64f441df-1781-4d01-b73b-2156413ad403.json delete mode 100644 data/hfopenllm_v2/dustinwloring1988/Reflexis-8b-chat-v6/4e3676eb-8607-416e-986a-7098bc192820.json delete mode 100644 data/hfopenllm_v2/dustinwloring1988/Reflexis-8b-chat-v7/2101369c-5042-48f3-a8f2-f9f56e7b6ae7.json delete mode 100644 data/hfopenllm_v2/duyhv1411/Llama-3.2-1B-en-vi/c4b86264-3725-4742-91f0-3e01f8d965a4.json delete mode 100644 data/hfopenllm_v2/duyhv1411/Llama-3.2-3B-en-vi/0308147c-dabb-46bb-8add-d332fcd5a800.json delete mode 100644 data/hfopenllm_v2/dwikitheduck/gemma-2-2b-id-inst/a9977a0d-e199-488a-a26e-6269806fdb2b.json delete mode 100644 data/hfopenllm_v2/dwikitheduck/gemma-2-2b-id-instruct/56b89ec8-90c5-4e1e-a458-1bb8b5b92be8.json delete mode 100644 data/hfopenllm_v2/dwikitheduck/gemma-2-2b-id/4185c376-91c6-435d-ae3b-47cd85151049.json delete mode 100644 data/hfopenllm_v2/dwikitheduck/gen-inst-1/26e45f5d-1e3d-425f-ba4d-b444dcda7f74.json delete mode 100644 data/hfopenllm_v2/dwikitheduck/gen-try1-notemp/09be48ce-61f8-4ba9-b082-b9c475fa714d.json delete mode 100644 data/hfopenllm_v2/dwikitheduck/gen-try1/27417bcb-fb2f-41d2-9dfa-9865a36f38d5.json delete mode 100644 data/hfopenllm_v2/dzakwan/dzakwan-MoE-4x7b-Beta/7b6fc3c2-a67d-450e-858c-fa87be122376.json delete mode 100644 data/hfopenllm_v2/ehristoforu/Falcon3-8B-Franken-Basestruct/76b86418-5450-48c6-ae56-58a19016d055.json delete mode 100644 data/hfopenllm_v2/ehristoforu/Falcon3-MoE-2x7B-Insruct/e06594e4-899a-4285-b130-f7b605e5a6b9.json delete mode 100644 data/hfopenllm_v2/ehristoforu/Gemma2-9B-it-psy10k-mental_health/9efdc773-a5c7-4709-88c8-96a67d84a742.json delete mode 100644 data/hfopenllm_v2/ehristoforu/Gemma2-9b-it-train6/1fcc2f96-afc9-403f-b82e-8e1804506582.json delete mode 100644 data/hfopenllm_v2/ehristoforu/HappyLlama1/bee1e134-9a43-441a-b977-522c510dd1ce.json delete mode 100644 data/hfopenllm_v2/ehristoforu/QwenQwen2.5-7B-IT-Dare/b70e1089-d136-4b2f-a253-f361bcf8cdcc.json delete mode 100644 data/hfopenllm_v2/ehristoforu/QwenQwen2.5-7B-IT/8b7e9c34-a982-4f4d-b5dc-66a12578601f.json delete mode 100644 data/hfopenllm_v2/ehristoforu/RQwen-v0.1/0ccc36d0-f546-46d1-91d3-15a40c7bf6c1.json delete mode 100644 data/hfopenllm_v2/ehristoforu/RQwen-v0.2/066abe97-2c6c-4f3b-9e5e-e144f130258a.json delete mode 100644 data/hfopenllm_v2/ehristoforu/SoRu-0009/a3af8f77-d915-4482-a2b6-c99744aada4b.json delete mode 100644 data/hfopenllm_v2/ehristoforu/coolqwen-3b-it/82cc8b37-e242-441e-ac74-1662bcc0a0e2.json delete mode 100644 data/hfopenllm_v2/ehristoforu/della-70b-test-v1/1527c8bc-c1ec-45f4-9663-4cffbb808f94.json delete mode 100644 data/hfopenllm_v2/ehristoforu/falcon3-ultraset/337b8ce8-d697-47f6-94ac-7a420dd7d91b.json delete mode 100644 data/hfopenllm_v2/ehristoforu/fd-lora-merged-16x32/3d6ed2bb-5be7-4838-abb7-49754f9c3bfe.json delete mode 100644 
data/hfopenllm_v2/ehristoforu/fd-lora-merged-64x128/0a6c7056-1bce-479e-84b0-f4eeea0bd3cc.json delete mode 100644 data/hfopenllm_v2/ehristoforu/fp4-14b-it-v1/3e236ad8-3828-407f-9076-743b465b8d15.json delete mode 100644 data/hfopenllm_v2/ehristoforu/fp4-14b-v1-fix/9e90dcdf-ce2a-4a7c-8b89-6af8b7c2bcfe.json delete mode 100644 data/hfopenllm_v2/ehristoforu/fq2.5-7b-it-normalize_false/940d88e9-085b-4065-b8c8-92ebe685deb0.json delete mode 100644 data/hfopenllm_v2/ehristoforu/fq2.5-7b-it-normalize_true/7fdcd616-2c72-4c44-9646-9c32344bfa0b.json delete mode 100644 data/hfopenllm_v2/ehristoforu/frqwen2.5-from7b-duable4layers-it/9d358f55-810c-4ac1-adc7-83f95bd74c11.json delete mode 100644 data/hfopenllm_v2/ehristoforu/frqwen2.5-from7b-it/9ba3fe31-772a-4cf7-aa13-3680b6ad51ba.json delete mode 100644 data/hfopenllm_v2/ehristoforu/mllama-3.1-8b-instruct/651a32b1-77fb-4acf-89bf-2d45b684944d.json delete mode 100644 data/hfopenllm_v2/ehristoforu/mllama-3.1-8b-it/192c4037-753a-4790-80d0-33c4d277102d.json delete mode 100644 data/hfopenllm_v2/ehristoforu/moremerge-upscaled/679d66bf-244e-4080-9a42-0a0c6cfdc965.json delete mode 100644 data/hfopenllm_v2/ehristoforu/moremerge/73b0ca8a-fb16-43eb-a9af-a01219cf6196.json delete mode 100644 data/hfopenllm_v2/ehristoforu/phi-4-25b/7f00ecbc-fcc8-43ae-867b-cb160e63a80c.json delete mode 100644 data/hfopenllm_v2/ehristoforu/qwen2.5-test-32b-it/a8238bd4-3982-4e45-92e4-bab77e528e29.json delete mode 100644 data/hfopenllm_v2/ehristoforu/qwen2.5-with-lora-think-3b-it/f87f9f08-e989-4e99-a254-a3650e7ab1b6.json delete mode 100644 data/hfopenllm_v2/ehristoforu/rmoe-v1/f40496a9-fb14-4b2d-8070-84f55e6417f6.json delete mode 100644 data/hfopenllm_v2/ehristoforu/rufalcon3-3b-it/cc52f59d-5669-44b0-b1af-e6fd0836e284.json delete mode 100644 data/hfopenllm_v2/ehristoforu/ruphi-4b/67525a37-f658-40e8-89a1-de8bf6275a00.json delete mode 100644 data/hfopenllm_v2/ehristoforu/testq-32b/3cb34886-7a93-42b9-a8fa-fab5f4bd8624.json delete mode 100644 data/hfopenllm_v2/ehristoforu/tmoe-v2/0dd1f9fc-cf54-47ff-8ccd-148b45f3c921.json delete mode 100644 data/hfopenllm_v2/ehristoforu/tmoe/7a05616e-7335-419a-914d-00fb287fe663.json delete mode 100644 data/hfopenllm_v2/ehristoforu/trd-7b-it/070a21b5-4cd3-41b7-9653-0d2d2e4f273d.json delete mode 100644 data/hfopenllm_v2/ehristoforu/ud-14b/5afc044a-3138-443f-89cf-74f1272cc632.json delete mode 100644 data/hfopenllm_v2/elinas/Chronos-Gold-12B-1.0/a6c1d914-647c-46b7-b0e1-712b8d506780.json delete mode 100644 data/hfopenllm_v2/ell44ot/gemma-2b-def/43f35eac-0946-42f9-a128-eb8011c29588.json delete mode 100644 data/hfopenllm_v2/euclaise/ReMask-3B/04c22be7-2cf4-4774-b479-863199c7c3a4.json delete mode 100644 data/hfopenllm_v2/eworojoshua/vas-01/fc3d436b-ec61-4458-a3c6-1df41057ea70.json delete mode 100644 data/hfopenllm_v2/ewre324/Thinker-Llama-3.2-3B-Instruct-Reasoning/e3ed157f-f306-40fb-b3a1-d3434236759e.json delete mode 100644 data/hfopenllm_v2/ewre324/Thinker-Qwen2.5-0.5B-Instruct-Reasoning/8793b3e3-f409-499a-81f8-c250c8092841.json delete mode 100644 data/hfopenllm_v2/ewre324/Thinker-SmolLM2-135M-Instruct-Reasoning/33572f63-15ba-4fbc-b1cf-56b978384d02.json delete mode 100644 data/hfopenllm_v2/ewre324/ewre324-R1-SmolLM2-135M-Distill/44c636ba-8303-4d75-bcb5-46e3c07a991a.json delete mode 100644 data/hfopenllm_v2/experiment-llm/exp-3-q-r/0a002444-3e5a-4fc8-acc6-72210a4181a9.json delete mode 100644 data/hfopenllm_v2/facebook/opt-1.3b/bbf936a5-3594-4d0a-b5af-7a01740d0c81.json delete mode 100644 data/hfopenllm_v2/facebook/opt-30b/1164abea-4cc2-46a7-a44b-f024a2ce40b4.json delete 
mode 100644 data/hfopenllm_v2/failspy/Llama-3-8B-Instruct-MopeyMule/bfd88bec-fcc2-4580-a5c7-4792a0300a5b.json delete mode 100644 data/hfopenllm_v2/failspy/Llama-3-8B-Instruct-abliterated/7f49e582-a01f-481f-8345-1c384fc8b567.json delete mode 100644 data/hfopenllm_v2/failspy/Meta-Llama-3-70B-Instruct-abliterated-v3.5/10937ed1-56e2-4aad-b717-5125bc8ac72a.json delete mode 100644 data/hfopenllm_v2/failspy/Meta-Llama-3-8B-Instruct-abliterated-v3/f4622539-c0ac-4e9f-86d4-00e3c826d03b.json delete mode 100644 data/hfopenllm_v2/failspy/Phi-3-medium-4k-instruct-abliterated-v3/6b13b2b1-68cd-4aae-8f2b-2400f40760d7.json delete mode 100644 data/hfopenllm_v2/failspy/llama-3-70B-Instruct-abliterated/5b02726c-ba3f-482b-9f10-87b8d69ffeb4.json delete mode 100644 data/hfopenllm_v2/fblgit/TheBeagle-v2beta-32B-MGS/21d6f2dd-7bd6-42a9-b14e-c25777497890.json delete mode 100644 data/hfopenllm_v2/fblgit/TheBeagle-v2beta-32B-MGS/d0bc11cb-56ff-4c77-9446-e76e550e0919.json delete mode 100644 data/hfopenllm_v2/fblgit/UNA-SimpleSmaug-34b-v1beta/ff78dc97-e9cf-4215-a607-3e80892af82c.json delete mode 100644 data/hfopenllm_v2/fblgit/UNA-TheBeagle-7b-v1/0ff1c6ff-5404-4d61-b6c6-f6ef7ae9ca8b.json delete mode 100644 data/hfopenllm_v2/fblgit/UNA-ThePitbull-21.4B-v2/48837141-2556-4658-87e0-bb88cfcd562a.json delete mode 100644 data/hfopenllm_v2/fblgit/cybertron-v4-qw7B-MGS/f2d6da5d-3685-43de-8ceb-5b798f88e24c.json delete mode 100644 data/hfopenllm_v2/fblgit/cybertron-v4-qw7B-UNAMGS/9ec02ccd-329a-4d62-9f04-87de6fda5011.json delete mode 100644 data/hfopenllm_v2/fblgit/juanako-7b-UNA/781d0332-e332-4ff7-8585-9c2d8395a147.json delete mode 100644 data/hfopenllm_v2/fblgit/miniclaus-qw1.5B-UNAMGS-GRPO/d6dd460e-c352-4d31-8941-183c6eabd0a7.json delete mode 100644 data/hfopenllm_v2/fblgit/miniclaus-qw1.5B-UNAMGS/66bf6442-04ea-437b-88c4-e61afc6f7139.json delete mode 100644 data/hfopenllm_v2/fblgit/pancho-v1-qw25-3B-UNAMGS/0d1911f5-a2e7-4511-a8d8-098cbf9207df.json delete mode 100644 data/hfopenllm_v2/fblgit/una-cybertron-7b-v2-bf16/abc18648-ef96-4695-94d5-fa14be277431.json delete mode 100644 data/hfopenllm_v2/fhai50032/RolePlayLake-7B/ff1e7aaa-3f29-4192-a0e0-80fcd11ba055.json delete mode 100644 data/hfopenllm_v2/fhai50032/Unaligned-Thinker-PHI-4/cc8ef5bd-957f-4308-9539-00a696182056.json delete mode 100644 data/hfopenllm_v2/flammenai/Llama3.1-Flammades-70B/abc7652f-b88e-40ba-847c-c99dce9f2719.json delete mode 100644 data/hfopenllm_v2/flammenai/Mahou-1.2a-llama3-8B/56e36294-e616-45a1-8dc9-2c14cf3ee8d0.json delete mode 100644 data/hfopenllm_v2/flammenai/Mahou-1.2a-mistral-7B/4b81caad-92ed-4bd5-98bd-58582854b5d8.json delete mode 100644 data/hfopenllm_v2/flammenai/Mahou-1.5-llama3.1-70B/2cef0040-6d4c-4c38-be40-5477911f3063.json delete mode 100644 data/hfopenllm_v2/flammenai/Mahou-1.5-mistral-nemo-12B/4aeef94f-823e-4be5-b4f1-37463e052748.json delete mode 100644 data/hfopenllm_v2/flammenai/flammen15-gutenberg-DPO-v1-7B/3d367147-373f-4543-be19-55a6429558a2.json delete mode 100644 data/hfopenllm_v2/fluently-lm/FluentlyLM-Prinum/cb93091a-6c46-438a-b111-cbf7e2fac420.json delete mode 100644 data/hfopenllm_v2/fluently-lm/Llama-TI-8B-Instruct/ea6048f1-8be4-4ec8-a5d5-35ff1523d74a.json delete mode 100644 data/hfopenllm_v2/fluently-lm/Llama-TI-8B/f4dc1659-800f-49d2-a290-48e9d4b15581.json delete mode 100644 data/hfopenllm_v2/fluently-sets/FalconThink3-10B-IT/d4d8a784-5bd5-4437-8e0d-75dcb967ae33.json delete mode 100644 data/hfopenllm_v2/fluently-sets/reasoning-1-1k-demo/91017e73-f33a-49f5-ac87-f6e6a178d885.json delete mode 100644 
data/hfopenllm_v2/formulae/mita-elite-sce-gen1.1-v1-7b-2-26-2025-exp/b7a75bca-6afe-448a-8e5c-53ebd577c964.json delete mode 100644 data/hfopenllm_v2/formulae/mita-elite-v1.1-7b-2-25-2025/8cdced5c-23bc-4426-a0c9-b9bf82913683.json delete mode 100644 data/hfopenllm_v2/formulae/mita-elite-v1.1-gen2-7b-2-25-2025/368784c8-6fc2-4340-8277-a6a9a9800a99.json delete mode 100644 data/hfopenllm_v2/formulae/mita-elite-v1.2-7b-2-26-2025/f7ddf26b-4b4c-404b-b9d3-6ceaf78d39aa.json delete mode 100644 data/hfopenllm_v2/formulae/mita-gen3-7b-2-26-2025/f423b0d1-3536-4865-9615-f89b9d15b14c.json delete mode 100644 data/hfopenllm_v2/formulae/mita-gen3-v1.2-7b-2-26-2025/c7e8333d-1d79-4cfa-9833-fa42f9fcbb4b.json delete mode 100644 data/hfopenllm_v2/formulae/mita-math-v2.3-2-25-2025/b6149d15-3e0f-43d2-ae90-eca290a94edb.json delete mode 100644 data/hfopenllm_v2/formulae/mita-v1-7b/e21f5d83-6b71-488d-ad55-d23268fbd611.json delete mode 100644 data/hfopenllm_v2/formulae/mita-v1.1-7b-2-24-2025/68e1a42e-4318-4b5a-a45b-2607b7c2fe05.json delete mode 100644 data/hfopenllm_v2/formulae/mita-v1.2-7b-2-24-2025/12a03ffb-d66b-4d00-a43b-fd5be80e1b07.json delete mode 100644 data/hfopenllm_v2/frameai/Loxa-4B/adbad8dc-7d13-44cc-a5c6-e8da1de27c37.json delete mode 100644 data/hfopenllm_v2/freewheelin/free-evo-qwen72b-v0.8-re/7fb595e5-abbc-43ff-8135-c4bb4a2ea593.json delete mode 100644 data/hfopenllm_v2/freewheelin/free-solar-evo-v0.1/1bb09da7-1675-4e57-b46a-9791c888ce6f.json delete mode 100644 data/hfopenllm_v2/freewheelin/free-solar-evo-v0.11/3ed7dd5a-e431-480a-91a7-5ccd915057e4.json delete mode 100644 data/hfopenllm_v2/freewheelin/free-solar-evo-v0.13/9cab35b6-d6a7-475e-b715-e4493d07cd92.json delete mode 100644 data/hfopenllm_v2/fulim/FineLlama-3.1-8B/ef7149ae-8d50-4890-89ae-fb561a86d130.json delete mode 100644 data/hfopenllm_v2/gabrielmbmb/SmolLM-1.7B-Instruct-IFEval/3fa14e1f-82a5-4c04-9c76-2a3f6d56aa81.json delete mode 100644 data/hfopenllm_v2/gaverfraxz/Meta-Llama-3.1-8B-Instruct-HalfAbliterated-DELLA/4418c7d1-72da-4ed3-9d5c-9d8520f6641c.json delete mode 100644 data/hfopenllm_v2/gaverfraxz/Meta-Llama-3.1-8B-Instruct-HalfAbliterated-TIES/8fe13380-a045-4d63-96f8-ec977540478c.json delete mode 100644 data/hfopenllm_v2/gbueno86/Brinebreath-Llama-3.1-70B/6da42427-c7de-4830-b368-ca7757ee1d51.json delete mode 100644 data/hfopenllm_v2/gbueno86/Meta-LLama-3-Cat-Smaug-LLama-70b/5faf24b3-38af-4f3f-8377-bba70d75f8df.json delete mode 100644 data/hfopenllm_v2/ghost-x/ghost-8b-beta-1608/9a26214c-2601-49be-b1b1-03796b704059.json delete mode 100644 data/hfopenllm_v2/glaiveai/Reflection-Llama-3.1-70B/fa71ed09-45d4-4a5b-bfb1-a61a359a8f0c.json delete mode 100644 data/hfopenllm_v2/gmonsoon/SahabatAI-Llama-11B-Test/25c5b304-46d3-4df3-9ac3-75ffa972849a.json delete mode 100644 data/hfopenllm_v2/gmonsoon/SahabatAI-MediChatIndo-8B-v1/88ed0272-39f8-4676-970a-525aee058991.json delete mode 100644 data/hfopenllm_v2/gmonsoon/SahabatAI-Rebase-8B-Test/d8eff5d0-061b-4b83-b96a-04f9ba47ea6c.json delete mode 100644 data/hfopenllm_v2/gmonsoon/StockSeaLLMs-7B-v1/dcb90e75-8709-4729-8c00-e756e6a9a49d.json delete mode 100644 data/hfopenllm_v2/gmonsoon/gemma2-9b-sahabatai-v1-instruct-BaseTIES/81dcf3ca-f5c2-40a1-8871-b0188d5e9ceb.json delete mode 100644 data/hfopenllm_v2/godlikehhd/alpaca_data_full_2/0a0a4d32-c7a9-49c9-bba4-dae6b464a5b6.json delete mode 100644 data/hfopenllm_v2/godlikehhd/alpaca_data_full_3B/82a3a8ef-7e5f-48d0-a48e-41ea2c5b6452.json delete mode 100644 data/hfopenllm_v2/godlikehhd/alpaca_data_ifd_max_2600/e635e798-fa85-4430-bf1e-9d5ad7fe9f22.json delete mode 
100644 data/hfopenllm_v2/godlikehhd/alpaca_data_ifd_max_2600_3B/7ccaa29a-4f73-4794-83a2-b925d755d91e.json delete mode 100644 data/hfopenllm_v2/godlikehhd/alpaca_data_ifd_me_max_5200/ba8de8f6-c118-4bc3-ae8d-851e964684ed.json delete mode 100644 data/hfopenllm_v2/godlikehhd/alpaca_data_ifd_min_2600/4011975a-e2a0-466a-9b34-923e1b4f8733.json delete mode 100644 data/hfopenllm_v2/godlikehhd/alpaca_data_ins_ans_max_5200/8a172205-39c6-4dd1-86b2-11b234b37e3c.json delete mode 100644 data/hfopenllm_v2/godlikehhd/alpaca_data_ins_max_5200/495b2e8e-e2d8-4158-bc6e-7568604d44e9.json delete mode 100644 data/hfopenllm_v2/godlikehhd/alpaca_data_ins_min_2600/e6a97d0d-9dc3-43a5-a69f-8132e19f9c77.json delete mode 100644 data/hfopenllm_v2/godlikehhd/alpaca_data_ins_min_5200/4aecfd45-f47b-4f02-a0ed-288cbef46a6f.json delete mode 100644 data/hfopenllm_v2/godlikehhd/alpaca_data_sampled_ifd_5200/a6f7bc45-c2b5-47d8-a062-60f20c3d7ea4.json delete mode 100644 data/hfopenllm_v2/godlikehhd/alpaca_data_sampled_ifd_new_5200/c85c79d6-28e0-4deb-ad84-901b725aeca8.json delete mode 100644 data/hfopenllm_v2/godlikehhd/alpaca_data_score_max_0.1_2600/73271472-d06f-405b-af9d-2da7c17e1eb0.json delete mode 100644 data/hfopenllm_v2/godlikehhd/alpaca_data_score_max_0.3_2600/4e40bb43-c33d-4324-aa02-5bb7f88a5d1f.json delete mode 100644 data/hfopenllm_v2/godlikehhd/alpaca_data_score_max_0.7_2600/9b36e4c0-0d13-4988-8145-b9254da2e76e.json delete mode 100644 data/hfopenllm_v2/godlikehhd/alpaca_data_score_max_2500/6a464798-0111-4c71-b156-72a5aba1da63.json delete mode 100644 data/hfopenllm_v2/godlikehhd/alpaca_data_score_max_2600_3B/78252135-f15b-427d-86de-c32cd3dbcd0f.json delete mode 100644 data/hfopenllm_v2/godlikehhd/alpaca_data_score_max_5200/c3b7bd57-9bc3-4d83-aad9-7d6315748c0a.json delete mode 100644 data/hfopenllm_v2/godlikehhd/ifd_2500_qwen/bce17582-e807-4b91-b0e7-0a890bf5eb24.json delete mode 100644 data/hfopenllm_v2/godlikehhd/ifd_new_correct_all_sample_2500_qwen/f8371e81-f6d4-4441-bc6c-5d4a18da7d08.json delete mode 100644 data/hfopenllm_v2/godlikehhd/ifd_new_correct_sample_2500_qwen/78407b2e-1f44-46f0-bc21-76bdc68f8d9c.json delete mode 100644 data/hfopenllm_v2/godlikehhd/ifd_new_qwen_2500/bdb9e2d2-8d09-4994-a320-2f968bcb4898.json delete mode 100644 data/hfopenllm_v2/godlikehhd/qwen-2.5-1.5b-cherry/c57d15c8-9581-4bb5-89e4-2fea1e3c584e.json delete mode 100644 data/hfopenllm_v2/godlikehhd/qwen_2.5-1.5b-cherry_new/550d5665-7a8a-437e-b318-000690dd250f.json delete mode 100644 data/hfopenllm_v2/godlikehhd/qwen_full_data_alpaca/a1922f33-32f5-4f99-8df6-e2080808d292.json delete mode 100644 data/hfopenllm_v2/godlikehhd/qwen_ins_ans_2500/6ccc376b-24a4-42cc-8ea0-823ef14336db.json delete mode 100644 data/hfopenllm_v2/google/codegemma-1.1-2b/6547b6f3-63dd-4516-b294-62c4246c3dc7.json delete mode 100644 data/hfopenllm_v2/google/flan-t5-base/a58bf2d3-d209-41b8-a795-ba7a16e4a28f.json delete mode 100644 data/hfopenllm_v2/google/flan-t5-large/b15ad3b5-7ef2-439e-9acd-a85eab520d31.json delete mode 100644 data/hfopenllm_v2/google/flan-t5-small/64da2654-9fdb-4a08-ad16-cf8793a30ed8.json delete mode 100644 data/hfopenllm_v2/google/flan-t5-xl/37080215-ee30-4e59-a407-b14695ac2a38.json delete mode 100644 data/hfopenllm_v2/google/flan-t5-xl/b83a0ce7-bf13-4a98-81f3-04e5a44105f7.json delete mode 100644 data/hfopenllm_v2/google/flan-t5-xxl/bb7bea21-5bc6-460d-98ff-b3ed02d5b215.json delete mode 100644 data/hfopenllm_v2/google/flan-ul2/da9ddecc-43cf-4055-a19e-795b1ee98826.json delete mode 100644 
data/hfopenllm_v2/google/gemma-1.1-2b-it/a93ccb3f-f2d9-415d-8397-0c7fb765fada.json delete mode 100644 data/hfopenllm_v2/google/gemma-1.1-7b-it/d0f86765-bdb4-4367-986b-28303bbe1844.json delete mode 100644 data/hfopenllm_v2/google/gemma-2-27b-it/693bb191-ae83-49dc-9df1-2f68b1b5fe4a.json delete mode 100644 data/hfopenllm_v2/google/gemma-2-27b/7b2c0b72-6421-4f33-8593-a4bbfd0c6d6b.json delete mode 100644 data/hfopenllm_v2/google/gemma-2-2b-it/c4ee822f-fc8b-4523-95b6-7c3f12a334b3.json delete mode 100644 data/hfopenllm_v2/google/gemma-2-2b-jpn-it/1810033a-185b-4c91-91d3-43b8f6c61443.json delete mode 100644 data/hfopenllm_v2/google/gemma-2-2b-jpn-it/beb721ae-a35c-4f6b-a80f-aac4835d5f8d.json delete mode 100644 data/hfopenllm_v2/google/gemma-2-2b/cf20e77a-340f-4d8d-b593-9645bdfc5877.json delete mode 100644 data/hfopenllm_v2/google/gemma-2-2b/eec73e49-ac2b-42ed-a115-76e45007cd5d.json delete mode 100644 data/hfopenllm_v2/google/gemma-2-9b-it/aa06d058-87f9-4fde-ad53-139b29a71448.json delete mode 100644 data/hfopenllm_v2/google/gemma-2-9b/3f1d571a-fc42-411b-88ab-4700d5861367.json delete mode 100644 data/hfopenllm_v2/google/gemma-2b-it/74a56080-aeb2-4cc6-a825-bbe4d9a5900a.json delete mode 100644 data/hfopenllm_v2/google/gemma-2b/2eb433ba-5c93-4355-99dd-edcb65721603.json delete mode 100644 data/hfopenllm_v2/google/gemma-7b-it/826fc3ab-6ff8-44fa-a745-a0b80bcb2db4.json delete mode 100644 data/hfopenllm_v2/google/gemma-7b/6da54964-e3b5-4567-8ce4-7e0f279af84f.json delete mode 100644 data/hfopenllm_v2/google/mt5-base/a7dde688-a0ae-4731-909f-0bef0c6eeba9.json delete mode 100644 data/hfopenllm_v2/google/mt5-small/eb2a8a60-2240-4b08-9dc3-be0215aa7bfc.json delete mode 100644 data/hfopenllm_v2/google/mt5-xl/9b05919f-d7c1-4e04-9dd8-9ae70e0005e6.json delete mode 100644 data/hfopenllm_v2/google/mt5-xxl/6cd98538-74b6-4ac6-a3ac-9a311cfe47f6.json delete mode 100644 data/hfopenllm_v2/google/recurrentgemma-2b-it/b0ca2dec-387f-4b27-9adb-772af1899832.json delete mode 100644 data/hfopenllm_v2/google/recurrentgemma-2b/53c4b397-b78e-4699-a01e-3535aa072225.json delete mode 100644 data/hfopenllm_v2/google/recurrentgemma-9b-it/f5b251f0-741c-4ad5-ab04-19c5202854ea.json delete mode 100644 data/hfopenllm_v2/google/recurrentgemma-9b/7b2ba13a-e01d-4442-9abe-d16df1a1668a.json delete mode 100644 data/hfopenllm_v2/google/switch-base-8/bf79f87c-3f14-49e8-acba-725e709d5f11.json delete mode 100644 data/hfopenllm_v2/google/umt5-base/3fbac7d4-cbbb-4b77-9db4-fd7e122cc90e.json delete mode 100644 data/hfopenllm_v2/goulue5/merging_LLM/6efd0dbd-b8c1-4c66-bdf7-19055c16ca22.json delete mode 100644 data/hfopenllm_v2/gradientai/Llama-3-8B-Instruct-Gradient-1048k/1388b8d4-c711-480c-8a06-a8b7bd8aa79c.json delete mode 100644 data/hfopenllm_v2/grimjim/DeepSauerHuatuoSkywork-R1-o1-Llama-3.1-8B/03393ffd-1923-4767-ba14-d0e3e6751842.json delete mode 100644 data/hfopenllm_v2/grimjim/Gigantes-v1-gemma2-9b-it/b7d049dc-127d-4075-8067-22adac9a58c3.json delete mode 100644 data/hfopenllm_v2/grimjim/Gigantes-v2-gemma2-9b-it/89d79024-f4b8-4165-bd88-47f2b0010800.json delete mode 100644 data/hfopenllm_v2/grimjim/Gigantes-v3-gemma2-9b-it/d2c0fb0d-6c0c-464a-b09f-6382a57b6afb.json delete mode 100644 data/hfopenllm_v2/grimjim/HuatuoSkywork-o1-Llama-3.1-8B/a891b28a-2dcc-4b8e-ad20-1f23d663b44b.json delete mode 100644 data/hfopenllm_v2/grimjim/Llama-3-Instruct-8B-SPPO-Iter3-SimPO-merge/55e274bb-1e2c-4402-b7ae-09ff7b1f9738.json delete mode 100644 data/hfopenllm_v2/grimjim/Llama-3-Instruct-8B-SimPO-SPPO-Iter3-merge/fe7a6940-fc4c-4345-84be-609c8155be57.json delete mode 100644 
data/hfopenllm_v2/grimjim/Llama-3.1-8B-Instruct-abliterated_via_adapter/77eb2b0f-e3e3-474c-bb02-dabde2998ef0.json delete mode 100644 data/hfopenllm_v2/grimjim/Llama-3.1-Bonsaikraft-8B-Instruct/94d744be-5d28-490a-ba9a-8440cb97dce9.json delete mode 100644 data/hfopenllm_v2/grimjim/Llama-Nephilim-Metamorphosis-v2-8B/2765061e-7506-4eb6-b63f-312f6290665a.json delete mode 100644 data/hfopenllm_v2/grimjim/Llama3.1-SuperNovaLite-HuatuoSkywork-o1-8B/167c937c-66c7-45a8-bbd9-97d98531bf7d.json delete mode 100644 data/hfopenllm_v2/grimjim/Magnolia-v1-Gemma2-8k-9B/9587c35c-1def-46e7-8642-7acb0340be5e.json delete mode 100644 data/hfopenllm_v2/grimjim/Magnolia-v2-12B/1c9594fe-03d6-4ec1-9da5-99960da0dcd4.json delete mode 100644 data/hfopenllm_v2/grimjim/Magnolia-v2-Gemma2-8k-9B/8ed2c4eb-bc72-4dde-a559-1afd1698d37d.json delete mode 100644 data/hfopenllm_v2/grimjim/Magnolia-v3-12B/a2f9536a-9266-4aee-be90-d04f4dcbe53c.json delete mode 100644 data/hfopenllm_v2/grimjim/Magnolia-v3-Gemma2-8k-9B/7f116aaa-3880-4e53-948a-4b06e0d26cff.json delete mode 100644 data/hfopenllm_v2/grimjim/Magnolia-v4-12B/7cbe4516-2be2-421b-95f4-c9500ad64ca5.json delete mode 100644 data/hfopenllm_v2/grimjim/Magnolia-v5a-12B/07df565a-bc30-4a9d-b472-7a85f35938be.json delete mode 100644 data/hfopenllm_v2/grimjim/Magot-v1-Gemma2-8k-9B/7545f7db-10bb-4d97-9b3f-4346f4f26bad.json delete mode 100644 data/hfopenllm_v2/grimjim/Magot-v2-Gemma2-8k-9B/47384f10-ac6a-4629-92db-86f01a441f7f.json delete mode 100644 data/hfopenllm_v2/grimjim/SauerHuatuoSkywork-o1-Llama-3.1-8B/3c9f022f-3e2b-48d6-acb9-07f066cfceb6.json delete mode 100644 data/hfopenllm_v2/grimjim/llama-3-Nephilim-v1-8B/1d851cfb-8624-4516-8204-85569c60dc67.json delete mode 100644 data/hfopenllm_v2/grimjim/llama-3-Nephilim-v2-8B/a7990990-7498-4b74-a0aa-9c266910698e.json delete mode 100644 data/hfopenllm_v2/grimjim/llama-3-Nephilim-v2.1-8B/0b41d37e-0728-4575-9662-c150e2e29bd0.json delete mode 100644 data/hfopenllm_v2/grimjim/llama-3-Nephilim-v3-8B/c565a7e9-bd1b-41a5-bff3-3a349553f4e8.json delete mode 100644 data/hfopenllm_v2/gupta-tanish/llama-7b-dpo-baseline/680a4507-755e-4014-877b-6032f0220270.json delete mode 100644 data/hfopenllm_v2/gz987/qwen2.5-7b-cabs-v0.1/5ace8dc6-e348-4267-bb4a-f71a335d074e.json delete mode 100644 data/hfopenllm_v2/gz987/qwen2.5-7b-cabs-v0.2/07549821-db51-4b77-980a-056131b5dd29.json delete mode 100644 data/hfopenllm_v2/gz987/qwen2.5-7b-cabs-v0.3/ff12a0a1-a913-441b-955c-bcbd50056acf.json delete mode 100644 data/hfopenllm_v2/gz987/qwen2.5-7b-cabs-v0.4/947cfc2b-b73c-40eb-9e57-be5278776711.json delete mode 100644 data/hfopenllm_v2/h2oai/h2o-danube-1.8b-chat/53639078-c50a-4147-bab0-16993f1790b6.json delete mode 100644 data/hfopenllm_v2/h2oai/h2o-danube3-4b-base/b2cf96e0-382e-4200-a4a4-d66e8a188878.json delete mode 100644 data/hfopenllm_v2/h2oai/h2o-danube3-4b-chat/d4ed3eb6-f569-4d4b-8da5-50eaaf824128.json delete mode 100644 data/hfopenllm_v2/h2oai/h2o-danube3-500m-chat/210f7063-e0d9-424d-94f4-3645e4e1b401.json delete mode 100644 data/hfopenllm_v2/h2oai/h2o-danube3.1-4b-chat/4ecd26d8-8416-4dba-8d53-96f4013cfef0.json delete mode 100644 data/hfopenllm_v2/haoranxu/ALMA-13B-R/15712b7d-e69f-4a4f-b13c-4e79ce859399.json delete mode 100644 data/hfopenllm_v2/haoranxu/Llama-3-Instruct-8B-CPO-SimPO/9148c375-7c08-4c1c-82ed-5f935b2a4f04.json delete mode 100644 data/hfopenllm_v2/haoranxu/Llama-3-Instruct-8B-SimPO/fb93274b-b7d8-483a-a95d-96340535febc.json delete mode 100644 data/hfopenllm_v2/hatemmahmoud/qwen2.5-1.5b-sft-raft-grpo-hra-doc/0818b755-ec49-457c-8635-73f01816f30b.json delete 
mode 100644 data/hfopenllm_v2/hon9kon9ize/CantoneseLLMChat-v0.5/77962326-0160-49bd-9ef1-59b403b2bfce.json delete mode 100644 data/hfopenllm_v2/hon9kon9ize/CantoneseLLMChat-v1.0-7B/272abbe5-8b61-442f-9860-d7411e7fec99.json delete mode 100644 data/hfopenllm_v2/hongbai12/li-0.4-pre/14d617a8-18c6-40a7-a4ba-19cf5fc5f4e3.json delete mode 100644 data/hfopenllm_v2/hotmailuser/Deepseek-qwen-modelstock-2B/ef7b5e6d-b5b7-4c7b-9781-6f90eb1ff5dd.json delete mode 100644 data/hfopenllm_v2/hotmailuser/Falcon3Slerp1-10B/1970e257-7c93-4342-9ff4-a96af21acc67.json delete mode 100644 data/hfopenllm_v2/hotmailuser/Falcon3Slerp2-10B/15d71696-4b21-41ff-a4c6-0aea92fb844a.json delete mode 100644 data/hfopenllm_v2/hotmailuser/Falcon3Slerp4-10B/ccb85394-5252-48d4-8980-8b3a6c67ab1a.json delete mode 100644 data/hfopenllm_v2/hotmailuser/FalconSlerp-3B/ea9837ff-f4c7-4bb0-b2af-7ae26371baf0.json delete mode 100644 data/hfopenllm_v2/hotmailuser/FalconSlerp1-7B/fe9012a7-d07f-48d4-b460-eca256078d8b.json delete mode 100644 data/hfopenllm_v2/hotmailuser/FalconSlerp2-7B/8e8d2071-8e7d-4dad-8536-4698b2d00316.json delete mode 100644 data/hfopenllm_v2/hotmailuser/FalconSlerp3-10B/dbcb41be-9ed6-4244-ada8-77f363c3487e.json delete mode 100644 data/hfopenllm_v2/hotmailuser/FalconSlerp3-7B/e48e2d7e-6c14-4bb1-bd12-74d93a145ca3.json delete mode 100644 data/hfopenllm_v2/hotmailuser/FalconSlerp4-7B/30c2d908-3eaf-408a-a2b5-301e0cd9e052.json delete mode 100644 data/hfopenllm_v2/hotmailuser/FalconSlerp6-7B/f7624d04-66d1-4c05-8c01-d015ecf8412c.json delete mode 100644 data/hfopenllm_v2/hotmailuser/Gemma2Crono-27B/511e4aad-1e5a-4515-9433-46989fc3945b.json delete mode 100644 data/hfopenllm_v2/hotmailuser/Gemma2SimPO-27B/863e71ec-03a4-47ed-8bc9-b064d5571162.json delete mode 100644 data/hfopenllm_v2/hotmailuser/Gemma2atlas-27B/6a6dfcb4-192b-44ff-a34f-76b31bbf5ad3.json delete mode 100644 data/hfopenllm_v2/hotmailuser/Gemma2magnum-27b/e0dbec0b-a154-448a-be23-ef9b764469ea.json delete mode 100644 data/hfopenllm_v2/hotmailuser/Llama-Hermes-slerp-8B/ecd91300-b0cf-48ce-9e5c-253a7991f90e.json delete mode 100644 data/hfopenllm_v2/hotmailuser/Llama-Hermes-slerp2-8B/e3df71f1-63e1-40f1-918d-07cb3ec939cf.json delete mode 100644 data/hfopenllm_v2/hotmailuser/LlamaStock-8B/52066a23-9847-490e-90e3-57eee3c63276.json delete mode 100644 data/hfopenllm_v2/hotmailuser/Mistral-modelstock-24B/91f15ba3-a062-4b01-8a61-6e51fdf5f8d4.json delete mode 100644 data/hfopenllm_v2/hotmailuser/Mistral-modelstock2-24B/323630ee-fbe0-49a7-aa11-816fde38ba2d.json delete mode 100644 data/hfopenllm_v2/hotmailuser/Phi4-Slerp4-14B/e5c8f97d-1873-4c9d-8bed-50dc592543db.json delete mode 100644 data/hfopenllm_v2/hotmailuser/Qwen2.5-HomerSlerp-7B/7ee2803c-b8f8-4156-8472-bab4baab8863.json delete mode 100644 data/hfopenllm_v2/hotmailuser/QwenModelStock-1.8B/78573f63-3073-4be4-93a7-0ea00b1383fd.json delete mode 100644 data/hfopenllm_v2/hotmailuser/QwenSlerp-14B/42da7295-d78d-49a4-9279-8406063240c4.json delete mode 100644 data/hfopenllm_v2/hotmailuser/QwenSlerp-3B/b61c5735-53ca-4dda-a223-79921eee7f3e.json delete mode 100644 data/hfopenllm_v2/hotmailuser/QwenSlerp-7B/310124ef-e33f-49de-83eb-e665a5143aaa.json delete mode 100644 data/hfopenllm_v2/hotmailuser/QwenSlerp2-14B/c9b056df-8bbe-4959-ab44-85813157c95c.json delete mode 100644 data/hfopenllm_v2/hotmailuser/QwenSlerp2-3B/7a60385f-48dd-4926-8b66-3d42a1631db3.json delete mode 100644 data/hfopenllm_v2/hotmailuser/QwenSlerp3-14B/da365c7b-74d0-4a9f-a8fd-cf4049ec4de6.json delete mode 100644 
data/hfopenllm_v2/hotmailuser/QwenSparse-7B/e2930715-b616-49a4-83bc-53e92fc3580f.json delete mode 100644 data/hfopenllm_v2/hotmailuser/QwenStock-0.5B/543f45e0-a158-4fdb-bbb1-8deb38f4515b.json delete mode 100644 data/hfopenllm_v2/hotmailuser/QwenStock-1.7B/b96a20e0-d044-4a66-8909-437aeaef569c.json delete mode 100644 data/hfopenllm_v2/hotmailuser/QwenStock1-14B/408742ff-4b21-46dc-b4d6-4c78d652d228.json delete mode 100644 data/hfopenllm_v2/hotmailuser/RombosBeagle-v2beta-MGS-32B/496a9fbe-376c-4546-bd90-b42f583924ce.json delete mode 100644 data/hfopenllm_v2/huggyllama/llama-13b/f32c07b4-21a8-4cd2-91f8-f0f26d0b1b38.json delete mode 100644 data/hfopenllm_v2/huggyllama/llama-65b/cc36cc37-0f41-42aa-8051-54cc135820ef.json delete mode 100644 data/hfopenllm_v2/huggyllama/llama-7b/20d3dac4-9f8c-431c-b20f-364dd860e37f.json delete mode 100644 data/hfopenllm_v2/huihui-ai/DeepSeek-R1-Distill-Qwen-14B-abliterated-v2/89022ea8-2a5b-4eba-8d7a-320ba13d30a4.json delete mode 100644 data/hfopenllm_v2/huihui-ai/QwQ-32B-Coder-Fusion-7030/97bfd152-79c6-4c96-8d3e-588275339e41.json delete mode 100644 data/hfopenllm_v2/huihui-ai/QwQ-32B-Coder-Fusion-8020/93061947-2bcf-482e-ab22-38ef8ee33bcf.json delete mode 100644 data/hfopenllm_v2/huihui-ai/QwQ-32B-Coder-Fusion-9010/8f65748b-1251-49f8-bfed-d1e4a937d5ba.json delete mode 100644 data/hfopenllm_v2/huihui-ai/Qwen2.5-14B-Instruct-abliterated-v2/4f278881-69d3-42b5-b72c-ff8627a6ef44.json delete mode 100644 data/hfopenllm_v2/huihui-ai/Qwen2.5-72B-Instruct-abliterated/d88e85c5-73df-46cc-9234-f0556592ad5a.json delete mode 100644 data/hfopenllm_v2/huihui-ai/Qwen2.5-7B-Instruct-abliterated-v2/44d2a20d-e867-4fa5-af3d-087f9c1b4067.json delete mode 100644 data/hfopenllm_v2/huihui-ai/Qwen2.5-7B-Instruct-abliterated/e83b3e7e-dc34-4b06-bcfe-95b3ba28aab4.json delete mode 100644 data/hfopenllm_v2/huu-ontocord/wide_3b_orpo_stage1.1-ss1-orpo3/44f2948c-4564-44cc-98d8-4f82a30e1f09.json delete mode 100644 data/hfopenllm_v2/iFaz/llama31_8B_en_emo_v4/846cf1ff-62c3-44e7-b6dd-0135ec77451a.json delete mode 100644 data/hfopenllm_v2/iFaz/llama32_1B_en_emo_v1/d2054469-b38b-4b1d-bd40-7324319f8eca.json delete mode 100644 data/hfopenllm_v2/iFaz/llama32_3B_en_emo_1000_stp/ce60608d-5b52-49d4-bbce-4b20e8272cef.json delete mode 100644 data/hfopenllm_v2/iFaz/llama32_3B_en_emo_2000_stp/f177bb70-fb7c-4b57-965d-acbcb4936bfa.json delete mode 100644 data/hfopenllm_v2/iFaz/llama32_3B_en_emo_300_stp/a5b2ab3d-1f12-4a5a-a110-2514185568b6.json delete mode 100644 data/hfopenllm_v2/iFaz/llama32_3B_en_emo_5000_stp/63b887a1-a0b9-46db-a563-b9bd67a0805a.json delete mode 100644 data/hfopenllm_v2/iFaz/llama32_3B_en_emo_v2/92d122f7-f29d-49e3-99da-bf20edf377a2.json delete mode 100644 data/hfopenllm_v2/iFaz/llama32_3B_en_emo_v3/a0b71344-f3a8-4ad0-87c5-6393148488b1.json delete mode 100644 data/hfopenllm_v2/iRyanBell/ARC1-II/821ff784-c48a-4623-9fb5-b77b7114b625.json delete mode 100644 data/hfopenllm_v2/iRyanBell/ARC1/ed251513-4807-4e31-bc8e-3ab0217ae4f3.json delete mode 100644 data/hfopenllm_v2/ibivibiv/colossus_120b/e7fa3baa-07b4-4f10-aa9c-8424d8fea303.json delete mode 100644 data/hfopenllm_v2/ibivibiv/multimaster-7b-v6/11dfd131-00bf-4561-a913-f1c0cb15bf9c.json delete mode 100644 data/hfopenllm_v2/ibm-granite/granite-3.0-1b-a400m-base/3ba34f38-2340-407f-a7b5-82749f8a0ee6.json delete mode 100644 data/hfopenllm_v2/ibm-granite/granite-3.0-1b-a400m-instruct/91b9649b-bdf6-4b15-a038-47edc2e79ef6.json delete mode 100644 data/hfopenllm_v2/ibm-granite/granite-3.0-2b-base/24670e63-32e1-4c5d-82fe-0d0c45a4e165.json delete mode 100644 
data/hfopenllm_v2/ibm-granite/granite-3.0-2b-instruct/198d1441-1d13-468a-a998-c8cf9f1e7a57.json delete mode 100644 data/hfopenllm_v2/ibm-granite/granite-3.0-3b-a800m-base/e9eb1499-835c-4a70-b531-4be5a9718c34.json delete mode 100644 data/hfopenllm_v2/ibm-granite/granite-3.0-3b-a800m-instruct/b1fd95ad-767d-4c13-a936-00b08c74ca3d.json delete mode 100644 data/hfopenllm_v2/ibm-granite/granite-3.0-8b-base/f87bd357-535e-4450-b01d-b41e1b7571e0.json delete mode 100644 data/hfopenllm_v2/ibm-granite/granite-3.0-8b-instruct/300fd27e-4dce-441f-91da-f38bd14ffe5e.json delete mode 100644 data/hfopenllm_v2/ibm-granite/granite-3.1-1b-a400m-base/1fd9a2e5-856f-4303-8ac1-611311f3e7b5.json delete mode 100644 data/hfopenllm_v2/ibm-granite/granite-3.1-1b-a400m-instruct/4c34d5c6-af1b-4519-8d08-67bd837e9b97.json delete mode 100644 data/hfopenllm_v2/ibm-granite/granite-3.1-2b-base/ddc27df7-1c4c-4563-92b2-5a39380423a8.json delete mode 100644 data/hfopenllm_v2/ibm-granite/granite-3.1-2b-instruct/3e606ef8-9caa-43d4-81d6-8eae9936ab4c.json delete mode 100644 data/hfopenllm_v2/ibm-granite/granite-3.1-3b-a800m-base/b9053559-3b90-4de0-981a-dbb49db38eb5.json delete mode 100644 data/hfopenllm_v2/ibm-granite/granite-3.1-3b-a800m-instruct/cea89bc6-b1a1-4b67-a136-45e097563a5b.json delete mode 100644 data/hfopenllm_v2/ibm-granite/granite-3.1-8b-base/5eb16113-7d0d-47a0-91d8-ec7dab35efdd.json delete mode 100644 data/hfopenllm_v2/ibm-granite/granite-3.1-8b-instruct/45aa6545-d20a-4dfb-a8a6-01f2fd34c9f5.json delete mode 100644 data/hfopenllm_v2/ibm-granite/granite-3.2-2b-instruct/c94079d1-d8b1-4198-8129-8c5a11c310ca.json delete mode 100644 data/hfopenllm_v2/ibm-granite/granite-3.2-8b-instruct/cb45306a-096c-4ed5-a028-6d720b26afe9.json delete mode 100644 data/hfopenllm_v2/ibm-granite/granite-7b-base/f301908e-474b-4ba2-a873-610ca1b6c2bd.json delete mode 100644 data/hfopenllm_v2/ibm-granite/granite-7b-instruct/06f5865d-a62a-48da-b33f-486fe29e3685.json delete mode 100644 data/hfopenllm_v2/ibm/PowerLM-3b/4f952c51-91dc-446e-bda1-43ed66e1ca3e.json delete mode 100644 data/hfopenllm_v2/ibm/merlinite-7b/dcba3a6f-8f4f-49f6-af74-541de16be435.json delete mode 100644 data/hfopenllm_v2/icefog72/Ice0.15-02.10-RP/b5d39bcb-dab4-4880-9cb1-68dbd20a3ce5.json delete mode 100644 data/hfopenllm_v2/icefog72/Ice0.16-02.10-RP/1e597e9b-4e75-4981-842b-dad6f1c15ed7.json delete mode 100644 data/hfopenllm_v2/icefog72/Ice0.17-03.10-RP/18752dc4-76d1-40dc-9f43-62b8087b7a88.json delete mode 100644 data/hfopenllm_v2/icefog72/Ice0.27-06.11-RP/fa30c36e-20f1-41ee-a59d-0044f2b76dfb.json delete mode 100644 data/hfopenllm_v2/icefog72/Ice0.29-06.11-RP/5391ae8f-41b0-41cb-9365-b5cb7649c8b7.json delete mode 100644 data/hfopenllm_v2/icefog72/Ice0.31-08.11-RP/a95ab4cf-456f-4b3d-9bab-2b755649758d.json delete mode 100644 data/hfopenllm_v2/icefog72/Ice0.32-10.11-RP/9840baa9-2ddf-4dd9-b3b0-3ec3075089bc.json delete mode 100644 data/hfopenllm_v2/icefog72/Ice0.34b-14.11-RP/26ff113c-95ca-4716-83f7-4792b46be246.json delete mode 100644 data/hfopenllm_v2/icefog72/Ice0.34n-14.11-RP/285e1d08-15a0-4d8b-a844-e4cad923ea9b.json delete mode 100644 data/hfopenllm_v2/icefog72/Ice0.37-18.11-RP/0462269d-94a3-4991-9af5-e55592f344e5.json delete mode 100644 data/hfopenllm_v2/icefog72/Ice0.38-19.11-RP/c47c4cd6-90b6-42df-a3b9-4fc8f1b3c980.json delete mode 100644 data/hfopenllm_v2/icefog72/Ice0.39-19.11-RP/0fecafe4-f8f0-4f97-ab2d-589a3856e1af.json delete mode 100644 data/hfopenllm_v2/icefog72/Ice0.40-20.11-RP/4b5529b9-0800-4cd6-b720-a905ab5e6c9a.json delete mode 100644 
data/hfopenllm_v2/icefog72/Ice0.41-22.11-RP/84783e4d-5eed-474d-9463-a01a0890850e.json
 delete mode 100644 data/hfopenllm_v2/icefog72/Ice0.50-16.01-RP/d9fe39c5-24a5-4240-bfc9-59860fcb3911.json
 delete mode 100644 data/hfopenllm_v2/icefog72/Ice0.50.1-16.01-RP/2ddf850e-36dc-41b2-92da-e2b45d1544c6.json
 delete mode 100644 data/hfopenllm_v2/icefog72/Ice0.51-16.01-RP/b10a9284-fa5e-4a4e-8240-edc98cea6d9c.json
 delete mode 100644 data/hfopenllm_v2/icefog72/Ice0.51.1-16.01-RP/2c51bd1d-ebe8-4de9-9749-5f42f7ba3d5a.json
 delete mode 100644 data/hfopenllm_v2/icefog72/Ice0.52-16.01-RP/425e6f1e-50dd-444f-b0da-5a0c47d5bf06.json
 delete mode 100644 data/hfopenllm_v2/icefog72/Ice0.52.1-16.01-RP/7e1fcf4e-9f64-4112-934c-4808f07d32b2.json
 delete mode 100644 data/hfopenllm_v2/icefog72/Ice0.53-16.01-RP/d3666566-09dc-4d53-9996-2301c6fb2721.json
 delete mode 100644 data/hfopenllm_v2/icefog72/Ice0.54-17.01-RP/36e5efb9-e3f0-4903-a9f1-3d51453bfdc4.json
 delete mode 100644 data/hfopenllm_v2/icefog72/Ice0.55-17.01-RP/a6dba337-81d2-40c6-89c2-aee6de82282e.json
 delete mode 100644 data/hfopenllm_v2/icefog72/Ice0.57-17.01-RP/e44b8d9a-f270-45c8-b126-6a8911c35436.json
 delete mode 100644 data/hfopenllm_v2/icefog72/Ice0.60-18.01-RP/44d5e1ac-45d5-42aa-b9fa-f18112cf6676.json
 delete mode 100644 data/hfopenllm_v2/icefog72/Ice0.60.1-18.01-RP/4246401d-9049-4c83-83d4-e2d9efa4dded.json
 delete mode 100644 data/hfopenllm_v2/icefog72/Ice0.61-18.01-RP/26c4785a-0caf-4b01-be5d-1e421bfeb698.json
 delete mode 100644 data/hfopenllm_v2/icefog72/Ice0.62-18.01-RP/cc9b9a25-18f9-4cc3-a756-3975a3a3be7d.json
 delete mode 100644 data/hfopenllm_v2/icefog72/Ice0.62.1-24.01-RP/b4edb7f5-a675-4627-af96-7ed0909da1e5.json
 delete mode 100644 data/hfopenllm_v2/icefog72/Ice0.64-24.01-RP/461b6f40-6f19-48b1-857e-f0fb37f929f9.json
 delete mode 100644 data/hfopenllm_v2/icefog72/Ice0.64.1-24.01-RP/e924270d-a655-4093-91b2-f73b7f12eefd.json
 delete mode 100644 data/hfopenllm_v2/icefog72/Ice0.65-25.01-RP/af8905e0-e969-45bd-8e09-e7316fff0914.json
 delete mode 100644 data/hfopenllm_v2/icefog72/Ice0.66-25.01-RP/e92a6d31-2277-4093-8fae-b3dfaa2d47dd.json
 delete mode 100644 data/hfopenllm_v2/icefog72/Ice0.67-25.01-RP/47472cd9-36d3-4074-83d4-af53b9c23758.json
 delete mode 100644 data/hfopenllm_v2/icefog72/Ice0.68-25.01-RP/b922f4e1-1fd9-4a32-94ce-4784430cef51.json
 delete mode 100644 data/hfopenllm_v2/icefog72/Ice0.69-25.01-RP/5bb2e77f-7709-4eb8-bd08-3c8da4a56310.json
 delete mode 100644 data/hfopenllm_v2/icefog72/Ice0.7-29.09-RP/35937213-bb16-4935-9d92-9fa8fd61aac3.json
 delete mode 100644 data/hfopenllm_v2/icefog72/Ice0.70-25.01-RP/04122d1b-929d-439c-bb8d-f08508f7a00e.json
 delete mode 100644 data/hfopenllm_v2/icefog72/Ice0.70.1-01.02-RP/03beb242-2628-4ea0-a2f3-c3ec43d379de.json
 delete mode 100644 data/hfopenllm_v2/icefog72/Ice0.73-01.02-RP/46d55b7b-1972-4cb0-97ca-e04d306282a7.json
 delete mode 100644 data/hfopenllm_v2/icefog72/Ice0.74-02.02-RP/32730d82-cfac-481f-9a22-9cbe40646218.json
 delete mode 100644 data/hfopenllm_v2/icefog72/Ice0.76-02.02-RP/a290a75f-753b-489d-87a2-ce0637c09f41.json
 delete mode 100644 data/hfopenllm_v2/icefog72/Ice0.77-02.02-RP/54032eb0-c4cd-4c76-be2e-f0c81bd26365.json
 delete mode 100644 data/hfopenllm_v2/icefog72/Ice0.78-02.02-RP/73b59506-cc1d-413c-a28b-d25e0e6bf413.json
 delete mode 100644 data/hfopenllm_v2/icefog72/Ice0.80-03.02-RP/bea2dcd6-4772-4aac-bcbc-4802cfb33495.json
 delete mode 100644 data/hfopenllm_v2/icefog72/IceCocoaRP-7b/66275215-28e6-42bc-bc22-5d152682ce53.json
 delete mode 100644 data/hfopenllm_v2/icefog72/IceCoffeeRP-7b/9015365c-400b-4fa3-85f2-a1033b030cf7.json
 delete mode 100644 data/hfopenllm_v2/icefog72/IceDrinkByFrankensteinV3RP/55d52914-0904-4e6e-8b37-c22b06f5f2bf.json
 delete mode 100644 data/hfopenllm_v2/icefog72/IceDrinkNameGoesHereRP-7b-Model_Stock/3677260a-2fd5-41bf-9010-f1b31cedacbc.json
 delete mode 100644 data/hfopenllm_v2/icefog72/IceDrinkNameNotFoundRP-7b-Model_Stock/fc54f87a-2e4a-4f3f-b407-e268c4487d16.json
 delete mode 100644 data/hfopenllm_v2/icefog72/IceDrunkCherryRP-7b/8d893736-1707-4c0b-860d-16c62ec26d78.json
 delete mode 100644 data/hfopenllm_v2/icefog72/IceDrunkenCherryRP-7b/d3d2728f-74bf-4196-a909-43797d8b628a.json
 delete mode 100644 data/hfopenllm_v2/icefog72/IceEspressoRPv2-7b/ed241e67-8718-48be-a6e8-19e295a2b5cd.json
 delete mode 100644 data/hfopenllm_v2/icefog72/IceLemonTeaRP-32k-7b/05aafad3-e07a-453b-a70b-f18fbd4eb218.json
 delete mode 100644 data/hfopenllm_v2/icefog72/IceMartiniRP-7b/f79ac32e-ab83-40c3-9c18-35623f5ae1d4.json
 delete mode 100644 data/hfopenllm_v2/icefog72/IceNalyvkaRP-7b/cec76b15-1069-4d37-b8bc-74dde28101f6.json
 delete mode 100644 data/hfopenllm_v2/icefog72/IceSakeRP-7b/e4ac0d0c-65ea-4b43-bb4b-7371c6cd5d61.json
 delete mode 100644 data/hfopenllm_v2/icefog72/IceSakeV4RP-7b/f8d629bf-df0b-4c6a-8c18-17dda002b089.json
 delete mode 100644 data/hfopenllm_v2/icefog72/IceSakeV6RP-7b/6739d8e3-f4bd-4fd5-98f3-887f5ed3f9c0.json
 delete mode 100644 data/hfopenllm_v2/icefog72/IceSakeV8RP-7b/a51722f4-29f4-47a5-acba-4c8b5355551b.json
 delete mode 100644 data/hfopenllm_v2/icefog72/IceTea21EnergyDrinkRPV13-DPOv3.5/06d0a21f-f6e4-4ca9-a679-8c4502aaaad1.json
 delete mode 100644 data/hfopenllm_v2/icefog72/IceTea21EnergyDrinkRPV13-DPOv3/04a4dcc9-3784-4aea-9faf-9db49c2e4c43.json
 delete mode 100644 data/hfopenllm_v2/ifable/gemma-2-Ifable-9B/e4668365-d3dd-4996-9bb1-5b4e6f510264.json
 delete mode 100644 data/hfopenllm_v2/ilsp/Llama-Krikri-8B-Instruct/4d743678-e14d-4866-b1bf-0d660787847b.json
 delete mode 100644 data/hfopenllm_v2/inflatebot/MN-12B-Mag-Mell-R1/720b1476-876c-47d1-bf46-d037389b4b2f.json
 delete mode 100644 data/hfopenllm_v2/informatiker/Qwen2-7B-Instruct-abliterated/4e4f3b2d-5b17-486a-a2ab-c2e89194c765.json
 delete mode 100644 data/hfopenllm_v2/insightfactory/Llama-3.2-3B-Instruct-unsloth-bnb-4bitlora_model/b738668e-3ac1-4a36-ad71-ad7d2a5256ae.json
 delete mode 100644 data/hfopenllm_v2/instruction-pretrain/InstructLM-500M/623f1b73-1505-4527-b41c-dcb2b711226d.json
 delete mode 100644 data/hfopenllm_v2/internlm/internlm2-1_8b/53f03454-9587-4208-bc01-21de62f59195.json
 delete mode 100644 data/hfopenllm_v2/internlm/internlm2-7b/fb38d8b4-6320-4b8d-bf3d-e3d22bb0ed83.json
 delete mode 100644 data/hfopenllm_v2/internlm/internlm2-chat-1_8b/b127a923-3bf2-4cad-9225-d738efe800e3.json
 delete mode 100644 data/hfopenllm_v2/internlm/internlm2_5-1_8b-chat/a94ae52a-7936-4750-83f5-4740f23adf15.json
 delete mode 100644 data/hfopenllm_v2/internlm/internlm2_5-20b-chat/95e689c6-cd19-4114-b3b5-1672ab849214.json
 delete mode 100644 data/hfopenllm_v2/internlm/internlm2_5-7b-chat/890a8414-bccf-4a66-8013-6c270d017965.json
 delete mode 100644 data/hfopenllm_v2/intervitens/mini-magnum-12b-v1.1/0f8ce410-cf3b-4f78-81b9-a0a1fe91b963.json
 delete mode 100644 data/hfopenllm_v2/inumulaisk/eval_model/121096cf-356b-4069-a0a3-8cf6aad52b81.json
 delete mode 100644 data/hfopenllm_v2/invalid-coder/Sakura-SOLAR-Instruct-CarbonVillain-en-10.7B-v2-slerp/fb0bcadf-32a0-4320-909f-2c38ba7d9372.json
 delete mode 100644 data/hfopenllm_v2/invisietch/EtherealRainbow-v0.2-8B/ab941c52-cf33-4b8e-87af-4a73930cf72a.json
 delete mode 100644 data/hfopenllm_v2/invisietch/EtherealRainbow-v0.3-8B/08c242fd-0258-4817-970a-668584ed9385.json
 delete mode 100644 data/hfopenllm_v2/invisietch/MiS-Firefly-v0.2-22B/2171af9a-be5e-4daf-8e67-a5239ccec7bd.json
 delete mode 100644 data/hfopenllm_v2/invisietch/Nimbus-Miqu-v0.1-70B/706f75a1-2f6b-47dd-809e-a830e739b574.json
 delete mode 100644 data/hfopenllm_v2/irahulpandey/mistralai-7B-slerp-v0.1/a9cd0399-4670-4f5c-8c64-c82dac97cd8c.json
 delete mode 100644 data/hfopenllm_v2/jaredjoss/pythia-410m-roberta-lr_8e7-kl_01-steps_12000-rlhf-model/67cfd12d-0551-406d-bd1d-8ced75c69478.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/Auro-Kosmos-EVAA-v2-8B/0a31d2f0-196b-4508-861a-1ba7bd28ea23.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/Auro-Kosmos-EVAA-v2.1-8B/57576999-2749-441a-91d6-5a976e83a658.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/Auro-Kosmos-EVAA-v2.2-8B/e44792e6-0329-4784-832b-3043478e70a4.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/Auro-Kosmos-EVAA-v2.3-8B/8b3789d6-51be-472a-95d3-2ae7c34ad140.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/Kosmos-Aurora_faustus-8B/3f4765f2-551b-485f-9020-0cf17a36a887.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-8B/6375a845-5d86-4dcf-bfd2-e836daa4ca11.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-Franken-Immersive-v39-8B/65a74446-6964-4f5f-8ea6-aeb1b09595ae.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-Franken-v38-8B/dcba5998-3b84-4753-a4fa-2558ffe3e69b.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-Fusion-8B/0af6b3c0-6638-4bd8-bdd9-349e2b9ca71c.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-Fusion-8B/4e332594-d0b9-4913-9950-208abe4faab7.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-PRP-8B/5ad2ad73-47ed-465d-b4c0-b358e6b6435f.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-PRP-light-8B/c9f716ef-0aa6-445f-8fc9-b102f3a0ea2a.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-PRP-v23-8B/a2e32a77-867c-4921-ada4-c7b169efbebe.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-PRP-v24-8B/f76f759f-d05d-4eb6-a2b9-3b1dfbe840f0.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-PRP-v25-8B/ece0bd6b-4eec-485c-942b-e23f3295c2f8.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-PRP-v26-8B/ada110bb-0988-4c19-9798-74577dde5ce9.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-PRP-v27-8B/ed4f994d-d196-40bd-8f8f-f6a7f07c3c90.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-PRP-v28-8B/57395f9a-0534-453e-80fc-96e9dc5cd9c3.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-PRP-v29-8B/f8f70702-9ab4-4e1a-a11d-090627d58f02.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-PRP-v30-8B/3cab8bda-bdf6-4345-b89e-18d34a8f6361.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-PRP-v31-8B/0955fc17-8878-401a-9ec3-149528ee51e1.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-PRP-v32-8B/c63bf49a-e7d4-4853-8684-9cc03eaa7840.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-PRP-v33-8B/65e6a3b6-4291-4591-bc0b-576930061c68.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-PRP-v34-8B/1ddf9e02-4066-440e-a777-fcd3f96bc4b3.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-TSN-8B/f9f96bb2-edbc-4112-97aa-a7420dea32a1.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-TSN-light-8B/3a24b30f-7698-4ecb-ac26-3537a0b38616.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-TSN-v19-8B/d4030df6-2be6-4f46-9c9b-ce3037b9a004.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-TSN-v20-8B/ec234403-f43d-46a0-84a4-ab47673226b3.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-TSN-v21-8B/805379f4-784f-4602-92e8-180df4da9fc3.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-TSN-v22-8B/9f3920aa-9400-46f1-bcfa-969f69b3335c.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-gamma-8B/26cbf444-ab93-409a-b85d-e2bd267eae5e.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-gamma-alt-8B/7c2b17a8-1de2-4441-a281-fe3fd043f831.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-gamma-light-8B/94c5756c-cbde-46e2-90d2-207678373061.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-gamma-light-alt-8B/e0048124-89bf-4327-88a8-00aa51ee29af.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-gamma-ultra-light-8B/9d776307-43af-43bb-ab64-52fb7f331cfe.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-gamma-v13-8B/d8d41981-a7c8-48e9-a63c-86520a0f23d5.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-gamma-v14-8B/1355985c-fbcb-4eac-8435-417d6034f2f0.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-gamma-v15-8B/44486b02-7bdd-4f59-8d4e-5c8deeb1fd60.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-gamma-v16-8B/45ae3dc3-6dc0-4d10-99cb-a7f330110906.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-gamma-v17-8B/6b54763a-6329-47fb-bf50-296604251b47.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-gamma-v18-8B/96a26bf3-b4b2-465f-8ce6-a2ef943c001a.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-immersive-sof-v44-8B/655b047f-c3a8-4c9c-b864-81d318b2f506.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-v10-8B/f62fed77-e166-422d-b5ce-c50b7bccbf4c.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-v11-8B/7ffdabf3-0a8e-4316-b6bd-85b10a81db53.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-v12-8B/2c93c987-b32d-4a02-8df4-949cc45b8eb2.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-v2-8B/02e7c1d6-9db1-4de8-b13e-afd752b3669a.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-v3-8B/580a3045-338a-47b2-8ed7-54c993d5aa90.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-v4-8B/e71d3be5-ea9d-4426-aa58-5806b7541aa6.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-v5-8B/1174683a-9488-4c6b-be6b-e5a96328a96f.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-v6-8B/3789b37f-daf0-4c21-82b8-309cbf00312e.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-v7-8B/8586cdc1-dd4e-4112-a59c-f6bc2766701b.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-v8-8B/946a7b16-dfa6-42ad-97c1-955bf8a40dae.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-v9-8B/d9a6cc31-57c4-4480-a019-25a34b31fcc8.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-v9-TitanFusion-Mix-8B/279bd5fa-0ab1-411b-871b-bd9ff23853f6.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/Kosmos-Elusive-8b/c26fae10-e65a-49ac-a2da-2dbf024fd10d.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/Kosmos-Elusive-VENN-8B/6d37b2b4-630e-4471-b7a8-50f8a58902fe.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/Kosmos-Elusive-VENN-Asymmetric-8B/de687865-4297-4130-bcfe-0c5116c9b0d1.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/Kosmos-Elusive-VENN-Aurora_faustus-8B/ee1acad1-5dc4-4d8b-8aca-544af5dc2392.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/Kosmos-VENN-8B/52e3f1b1-5a1c-4cca-a36f-9f60284e1883.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/PRP-Kosmos-EVAA-8B/2d54c67e-fad5-4a61-b3ae-0393f16dc1ba.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/PRP-Kosmos-EVAA-light-8B/5120e433-f5c7-45fa-be56-566101556271.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/TSN-Kosmos-EVAA-8B/7f4b4668-c3a0-4575-957d-ba321d55f420.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/TSN-Kosmos-EVAA-v2-8B/9245b74d-4b9d-4158-a402-0c3742097eba.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/bbb-1/29a5fcd3-9c22-424c-ab17-70cfe187aea1.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/bbb-2/af71bfa0-1077-4c96-a4c1-0aa28dc789bf.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/bbb-3/258ebe6d-191d-4804-b5e1-5cd6ce93ba88.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/bbb-4/4765f197-82ed-44b3-9a7c-7cbabc6ecd8e.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/bbb-5/a5d66f97-1f4b-43da-a83a-4a262e297fd9.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/bbb-6/5d29cf73-65d6-4965-a504-4caf07108cc8.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/bbb-7/15ec04ae-30d3-4ffb-9b0c-54ba63410e3d.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/bh-1/2ed96c70-390b-44de-aa08-9883a2f33ff3.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/bh-10/67c95889-8a67-40fd-99e2-62e767c16416.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/bh-11/a518f39d-e073-493d-9a4f-9af53fc71abf.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/bh-12/24f0d9bc-d743-4f46-b5a6-e855e39a1daf.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/bh-13/3d27f6d9-05a0-44bd-a225-6e6a0bf4a35b.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/bh-15/ad28e7b8-69e6-4fb9-bec4-62c67fae6d58.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/bh-16/0da639d4-181c-4ee1-808c-3de8003c2471.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/bh-17/480bd62c-bc67-4379-bce0-b28a5d6bdf4f.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/bh-18/dd94c18e-b2c3-4135-aa2d-5eb0248315d0.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/bh-19/a2ae2953-e341-49be-8469-32bd41d780d7.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/bh-2/23bdd694-f250-46dd-9b8b-526fda47bc9e.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/bh-20/d600a69d-1952-4e30-abe8-1769ab63ac29.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/bh-21/afc031d4-852e-4ead-9098-6ce30112b459.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/bh-22/cb33e29f-e5e1-4bf5-9e20-86d9c3486d2d.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/bh-23/a4b93124-1151-4f69-8a5e-6b916e8cf11f.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/bh-24/efe11d8f-65e6-4ba6-8148-fdd43c9346be.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/bh-25/923da7be-2ec8-46b2-8187-fe08eb86d5a0.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/bh-26/1652b9fe-640a-48f9-b7a5-20ae28fb5985.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/bh-27/572463ed-f6b9-460d-9c38-0e0ee5327511.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/bh-28/5f6bbbfd-16a8-4ea8-b9d9-b436a882700a.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/bh-29/32322361-f18d-480d-9475-cd11a45bc4bc.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/bh-3/f62d1aee-2d9e-466e-85e2-002fae5d2504.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/bh-30/af389bf1-da63-49a9-9e49-32613d8d05b8.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/bh-31/ea13ae62-d050-4cc4-9cbe-99eedfc206e2.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/bh-32/1e697620-36a7-459c-b88c-405febb57c3a.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/bh-33/532723e8-a9b7-4f72-a015-c2bd9363b5d8.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/bh-34/be096a57-7d81-4999-919a-ed8a243012b2.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/bh-35/cadeb016-e158-4a49-921c-efe0e4eb0cb2.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/bh-36/c606d7b9-3ea3-49d4-9ecc-9610ed4b4eac.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/bh-37/04a5eed3-7eea-4d9f-acc6-5a96ec987e2b.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/bh-38/a1c60d74-dabe-423d-9e40-3dd8112d7d8e.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/bh-39/29c7bc9b-6833-497b-a553-2941026efea5.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/bh-4/09a60955-978e-4136-bdde-d5459e37ad2c.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/bh-40/501744a2-070a-4378-9232-f7ccd9b2a67e.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/bh-41/369efdc6-6529-477c-b5f0-d229c8102491.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/bh-42/906645f3-2041-4380-8118-ac26b92297ba.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/bh-43/57fe8deb-02dc-43a8-8a92-14bdaf61dd67.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/bh-44/95f2fa22-3da9-4876-ace3-50763f2b2453.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/bh-46/b2f9e38f-c2a1-4e5f-a7ce-4e33a05b503b.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/bh-47/b3173a2a-8309-498d-961b-0167d5d5dea6.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/bh-48/0d59dd75-c999-4a7e-919a-fd084202fc9c.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/bh-49/639e91d9-ebbf-4ba2-bce3-6953e7c91e32.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/bh-5/56a5fb9b-a4b7-4290-9ec9-6864b3efaa82.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/bh-50/d03fb481-be0b-4dfb-bb4d-54067e058e99.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/bh-51/d8fc3475-83e9-4790-a472-72b442087562.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/bh-52/57efd335-4873-4e01-bfc3-0d704b3d482a.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/bh-53/25fdcc8a-0e7d-4148-8508-2631ea6deb05.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/bh-54/f5f63d06-7e51-4b91-8814-ecbda604fe6b.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/bh-55/5326c33b-6b8a-472a-9058-a9e9fe83b599.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/bh-56/28674053-e1b6-4f0a-a90e-5dd5082ec164.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/bh-57/fd27bfa7-11b3-46d3-915c-373ddf5a9865.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/bh-58/91f190ba-39c8-47af-8351-73d1f382dd99.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/bh-59/b637b55c-dd05-4060-bf33-e63e9de7fac9.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/bh-6/bcacef79-d7c0-46e7-9194-43541c2f01fc.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/bh-60/77a358c7-59fa-4b22-a190-dfca86c5166b.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/bh-61/ad4c8922-7079-4383-8f42-d3de6326a1e1.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/bh-62/7f89eded-e5fc-4b3b-9afd-dcd71b7b44d5.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/bh-63/07cb94ab-0aea-4ce2-89b0-4378cb892c7e.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/bh-64/5fb04756-c7bb-4772-b209-0d9a300bbf7d.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/bh-7/0c02d1b6-2d31-4c54-b881-588cbfb0c686.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/bh-8/a32e4d22-8096-4537-a68a-98ff9171ac8c.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/bh-9/4e45b666-fa7e-4a38-8b6b-65846876c8d9.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/dp-6-8b/d9cb1d13-2af5-4385-aa78-5c053e00e6c6.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/dp-7-8b/6afaec07-ebb8-4f3f-af48-c679f38f4917.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/ek-6/bf8370c9-baed-4034-ac38-c6f796baca15.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/ek-7/d397c078-6fe3-44a8-859c-a0f7c551dc3a.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/f-1-8b/ed61cd6a-bbf0-45f2-9536-a7a262d5d6fb.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/f-2-8b/6be795f4-0784-44bf-8926-e3060ec37dcf.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/f-3-8b/d4d808f5-3b79-43b5-8076-d3f785083789.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/f-4-8b/370f5923-91d7-40d2-bd06-bf2b657b8ef2.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/f-5-8b/5334e5e4-d243-4c20-912c-d0ded74d6ea5.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/f-6-8b/7306f2cd-4fd2-4dd4-b06b-8c9aa558388b.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/f-7-8b/68cc19eb-423b-4d6d-a3bf-eac6f666bc4b.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/f-8-8b/59aa26a8-93b3-43fc-8c38-ef67cd8efd80.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/f-9-8b/220cd306-0613-4c8f-9848-4af812a1d37f.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/fct-14-8b/39a6a40c-3fa0-41ba-9d13-da9381263d4a.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/fct-9-8b/4d037b71-5d03-41a1-bf23-c0aea0cdcbbb.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/fr-1-8b/16baf620-7dcc-49f3-a787-b431e11ad4f6.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/fr-10-8b/4745add2-7bcb-4c05-8b12-6bd30856890b.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/fr-3-8b/f68b122d-4dec-4d5c-ac22-198da3d3e96b.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/gamma-Kosmos-EVAA-8B/2e20f780-ceab-4d1d-a1ab-35f4f0ac44aa.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/gamma-Kosmos-EVAA-v2-8B/f21bcd75-fc9f-4266-8976-3227b18b6b32.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/gamma-Kosmos-EVAA-v3-8B/7c1a81ec-1cb7-4858-8f1f-23b3ee49b73f.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/knf-2-8b/1cbfd1ad-237d-4cd3-8b5d-3135c194fcc0.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/knfp-2-8b/ef5c1813-a74d-4b3d-9911-c27a46c1c84e.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/knfp-3-8b/df50857d-c90e-4ec8-a9b6-96a6d2f894b1.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/kstc-1-8b/774d54fb-a445-4ed9-b79a-9c1346537e98.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/kstc-11-8b/420b8be3-3560-48e8-8ab3-bb55338a9069.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/kstc-4-8b/c118b75c-597f-48a7-a4eb-675af72c9930.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/kstc-5-8b/e75534d3-b994-4e88-9274-7b62f61916cf.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/kstc-6-8b/770a1ff1-057f-49a7-9402-c6dd881ac03d.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/kstc-8-8b/6cc9790d-9b02-437e-8ac7-be4152f5b17d.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/kstc-9-8b/264f5b42-a3ac-4af1-8145-c5763b8e7fa6.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/slu-10/549db368-437a-4982-ba5b-5c4d7bf203ae.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/slu-11/0d098a19-7e8f-4a52-8466-729be91388d8.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/slu-13/83335f65-25a4-4bec-a901-587567ed0e99.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/slu-14/02fb24c3-927f-4c21-bd47-b883521162a3.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/slu-17/2a6507c7-44c1-4416-9ff1-36abd6af3b73.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/slu-2/327a146a-8cfd-4480-8342-46afde530677.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/slu-20/0700fb7a-e722-432f-a64d-c040bba4deee.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/slu-22/131d3a7e-43dd-4189-8466-6562703b3bdd.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/slu-23/8f6d7008-b8de-4a76-94aa-bbecc93ef3f7.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/slu-25/aadb0ce5-a1aa-4b0d-bec4-8bb0e8e54a1d.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/slu-29/a73250f1-399a-4afa-bf83-4036dce78ef3.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/slu-32/f68bf680-9626-4952-b95e-12a18fd60820.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/slu-33/d6a78a5c-4a2e-4370-88f2-d8627a94f1ea.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/slu-34/7b5eab2e-fba3-47d5-9839-02249c2568c5.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/slu-35/2acee2c3-4322-4152-8151-c1d571475b7c.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/slu-36/67ffb2de-0410-44a2-aad7-4a32e2c49c7d.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/slu-37/2923aeb3-982f-400d-9588-707583c75a1d.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/slu-6/b6a622da-5ce8-4ea5-a82a-f3a2a299ddf2.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/slu-mix-1/7b06ac17-bfc6-43d5-99e6-d2b7a31290fb.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/sof-1/fd481b93-55b2-4831-9be9-1b1b2886fda3.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/sof-10/f159748f-234e-4962-b582-cd5805448f33.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/sof-3/044d53dd-d134-4959-a70c-46f11cc0b300.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/sof-6/f05501fd-7c06-46d5-bc20-a9d0cc5c2e0f.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/test-10/5c44a2f2-23e3-4c9f-9b7c-9012ca8b15e9.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/test-11/80e5134b-0733-41cc-8b4f-ef32fbe57066.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/test-12/61123e41-7b2a-40da-9f7f-b830c27d7f12.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/test-13/b93c31d7-54c3-47b9-a267-3f8fdb796805.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/test-14/b3eaa4c5-7abc-4e2d-9c11-c70ecb8a843b.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/test-15/3b06f75e-3d22-4428-8d4f-2e704b96961e.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/test-16/dfda4aab-f8d4-49ee-b141-78539b69007c.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/test-17/690f3c19-c148-458d-b4c5-87761d72b851.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/test-18/b6a18246-776d-463f-80d5-140df74e9704.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/test-19/9831abdc-ad08-48c0-8384-86240e7350b5.json
 delete mode 100644 data/hfopenllm_v2/jaspionjader/test-20/96a572e5-4751-46ce-9202-deb223ef4dfe.json
 delete mode 100644 data/hfopenllm_v2/jayasuryajsk/Qwen2.5-3B-reasoner/f4320b1e-ea4f-4aea-8dab-cdb221ce53e5.json
 delete mode 100644 data/hfopenllm_v2/jeanmichela/o-distil-qwen/8376c0bf-f9c3-4529-b13c-c57106182d15.json
 delete mode 100644 data/hfopenllm_v2/jebcarter/psyonic-cetacean-20B/97a80145-e621-4603-8ff8-2cc4bd74190a.json
 delete mode 100644 data/hfopenllm_v2/jebish7/Llama-3-Nanda-10B-Chat/99a7881c-cca0-43d6-96f5-ce5292ed60a0.json
 delete mode 100644 data/hfopenllm_v2/jebish7/Llama-3.1-8B-Instruct/60ca8f7e-1c20-4adb-bb84-892bad3c0d63.json
 delete mode 100644 data/hfopenllm_v2/jebish7/Nemotron-4-Mini-Hindi-4B-Base/4a0f8dc7-9446-4dda-bf49-8cca4851746c.json
 delete mode 100644 data/hfopenllm_v2/jebish7/Nemotron-4-Mini-Hindi-4B-Instruct/6eb3a040-8234-4d31-8274-6987b0e4e3b4.json
 delete mode 100644 data/hfopenllm_v2/jebish7/Nemotron-Mini-4B-Instruct/16053077-38fd-4136-81a5-fea0d4cd927a.json
 delete mode 100644 data/hfopenllm_v2/jebish7/aya-expanse-8b/25abb99f-536e-4638-8611-a1db5dee931d.json
 delete mode 100644 data/hfopenllm_v2/jebish7/gemma-2-2b-it/aaf0e5bd-b033-455e-bb23-b12b6f7c4520.json
 delete mode 100644 data/hfopenllm_v2/jebish7/gemma-2-9b-it/b3a46478-c5f4-4c74-9bf0-d1ba616ae24c.json
 delete mode 100644 data/hfopenllm_v2/jebish7/qwen2.5-0.5B-IHA-Hin/169fb05f-5201-47b8-a06e-7d01e574c689.json
 delete mode 100644 data/hfopenllm_v2/jeffmeloy/Qwen-7B-nerd-uncensored-v1.0/db076309-32e5-4d46-9786-ff14f8daf5d2.json
 delete mode 100644 data/hfopenllm_v2/jeffmeloy/Qwen2.5-7B-minperplexity-2/cde914dc-7d57-425f-9787-e4b8d36d61cf.json
 delete mode 100644 data/hfopenllm_v2/jeffmeloy/Qwen2.5-7B-nerd-uncensored-v0.9/5d793ce3-a7fd-4ee3-b32c-c9da63ec0566.json
 delete mode 100644 data/hfopenllm_v2/jeffmeloy/Qwen2.5-7B-nerd-uncensored-v1.0/8c645c9f-02f6-44a5-b295-d6364ed49464.json
 delete mode 100644 data/hfopenllm_v2/jeffmeloy/Qwen2.5-7B-nerd-uncensored-v1.1/97bb5519-e2d3-44d5-abf4-b5263c2b3245.json
 delete mode 100644 data/hfopenllm_v2/jeffmeloy/Qwen2.5-7B-nerd-uncensored-v1.2/bd3d78d3-3ff1-4a92-a316-e4e30787a331.json
 delete mode 100644 data/hfopenllm_v2/jeffmeloy/Qwen2.5-7B-nerd-uncensored-v1.3/d8951ed7-f4ef-49ce-891e-8d8509e9cf93.json
 delete mode 100644 data/hfopenllm_v2/jeffmeloy/Qwen2.5-7B-nerd-uncensored-v1.4/e1772d6c-fd26-43a7-82b3-7997d8a6809f.json
 delete mode 100644 data/hfopenllm_v2/jeffmeloy/Qwen2.5-7B-nerd-uncensored-v1.5/febaf893-6aaf-4c87-89fc-cc865ebf2859.json
 delete mode 100644 data/hfopenllm_v2/jeffmeloy/Qwen2.5-7B-nerd-uncensored-v1.7/0ad591f4-c846-4fd1-8536-a169e0a7e4ab.json
 delete mode 100644 data/hfopenllm_v2/jeffmeloy/Qwen2.5-7B-nerd-uncensored-v1.8/0a318ebd-7bbb-456b-a6e4-9b480a858b5e.json
 delete mode 100644 data/hfopenllm_v2/jeffmeloy/Qwen2.5-7B-olm-v1.0/e1cfdc32-3c5e-4f4b-a205-f416c96cf5e6.json
 delete mode 100644 data/hfopenllm_v2/jeffmeloy/Qwen2.5-7B-olm-v1.1/85426280-8138-46d0-a111-b59b0d7c86c8.json
 delete mode 100644 data/hfopenllm_v2/jeffmeloy/Qwen2.5-7B-olm-v1.2/32bbd26e-05e7-4a0f-a491-8f54cea9f3d3.json
 delete mode 100644 data/hfopenllm_v2/jeffmeloy/Qwen2.5-7B-olm-v1.3/86ed6833-ae85-4a8e-b840-b0c9540083ce.json
 delete mode 100644 data/hfopenllm_v2/jeffmeloy/Qwen2.5-7B-olm-v1.4/2f751ac3-5ca5-4d0d-9ad4-48155e51468a.json
 delete mode 100644 data/hfopenllm_v2/jeffmeloy/Qwen2.5-7B-olm-v1.5/9677e68d-afda-4917-825c-83318219ff59.json
 delete mode 100644 data/hfopenllm_v2/jeffmeloy/jeffmeloy_Qwen2.5-7B-minperplexity-1/23cd57c2-bf7f-440a-ab3e-edfdede5e8cd.json
 delete mode 100644 data/hfopenllm_v2/jeonsworld/CarbonVillain-en-10.7B-v4/bec23315-f98a-4211-81a0-c49f395e66c9.json
 delete mode 100644 data/hfopenllm_v2/jiangxinyang-shanda/Homer-LLama3-8B/1ac5faef-7fa0-4b58-a6ba-0c444a2023a8.json
 delete mode 100644 data/hfopenllm_v2/jieliu/Storm-7B/39327803-11e7-4b28-8750-81feb027e8f3.json
 delete mode 100644 data/hfopenllm_v2/jiviai/medX_v2/ce2b6874-0fc8-4364-a526-7b25b101e1e3.json
 delete mode 100644 data/hfopenllm_v2/jlzhou/Qwen2.5-3B-Infinity-Instruct-0625/9f9ebc90-31f9-45c1-b9c2-07b727b12f3d.json
 delete mode 100644 data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_breadcrumbs-density-0.1-gamma-0.01/d189a2fc-71f5-4bc9-a0b1-7e744a19921f.json
 delete mode 100644 data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_breadcrumbs-density-0.1-gamma-0.1/1eb697fe-9dd4-4a41-aa47-33456df39e2d.json
 delete mode 100644 data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_breadcrumbs-density-0.3-gamma-0.01/5f10df7b-cd2c-44ca-b13a-2852483c71f8.json
 delete mode 100644 data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_breadcrumbs-density-0.3-gamma-0.1/3abbb4b6-8050-44fd-b066-0f061ce2f4d7.json
 delete mode 100644 data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_breadcrumbs-density-0.5-gamma-0.01/5f47e65d-293f-469e-a18f-5627ca1adf44.json
 delete mode 100644 data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_breadcrumbs-density-0.5-gamma-0.1/b753c1aa-8a0c-4600-99ec-8eb51ab50da7.json
 delete mode 100644 data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_breadcrumbs-density-0.7-gamma-0.01/15c21655-9af8-4bee-9884-b047683e9adf.json
 delete mode 100644 data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_breadcrumbs-density-0.7-gamma-0.1/f642de95-218a-4db0-807f-1bb97618b4f6.json
 delete mode 100644 data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_breadcrumbs-density-0.9-gamma-0.01/01443b06-9ad3-41f5-ae0d-bc84086e0a0d.json
 delete mode 100644 data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_breadcrumbs-density-0.9-gamma-0.1/1ee8c377-2236-4225-942f-ef8ce5770741.json
 delete mode 100644 data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_breadcrumbs_ties-density-0.1-gamma-0.01/4ee9aa78-d9eb-4a1c-91c4-f29f093b95d3.json
 delete mode 100644 data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_breadcrumbs_ties-density-0.1-gamma-0.1/419c6631-805f-43ba-9db8-5296f8d221ec.json
 delete mode 100644 data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_breadcrumbs_ties-density-0.3-gamma-0.01/3fc1822f-4a43-4a3b-90d7-fc163491c90a.json
 delete mode 100644 data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_breadcrumbs_ties-density-0.3-gamma-0.1/76b4037b-c5d0-435f-966a-bd88b1665dad.json
 delete mode 100644 data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_breadcrumbs_ties-density-0.5-gamma-0.01/757b85e7-84c8-429f-aeb4-870852fa8959.json
 delete mode 100644 data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_breadcrumbs_ties-density-0.5-gamma-0.1/acab4982-1205-4362-803e-306b1e2371bf.json
 delete mode 100644 data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_breadcrumbs_ties-density-0.7-gamma-0.01/0e549b5d-c1d9-443d-9a80-8dd34dadd22e.json
 delete mode 100644 data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_breadcrumbs_ties-density-0.7-gamma-0.1/d3d4eccc-8792-40e5-91cf-22885f4cbaf5.json
 delete mode 100644 data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_breadcrumbs_ties-density-0.9-gamma-0.01/708aded5-6252-44e3-bf0d-08bf3e7f32e0.json
 delete mode 100644 data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_breadcrumbs_ties-density-0.9-gamma-0.1/ce6d31f2-f38e-4af3-85a3-d2f6c80f71f1.json
 delete mode 100644 data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_dare_linear/5efcc291-ca9a-4ca9-b2ed-dab37dce5f5a.json
 delete mode 100644 data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_dare_ties-density-0.1/47320824-8064-40d4-a08c-810faafbba77.json
 delete mode 100644 data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_dare_ties-density-0.3/8baeef58-0ba6-4723-8f23-7a4c386f2cad.json
 delete mode 100644 data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_dare_ties-density-0.7/0387ca63-1e31-4eaa-ac7c-35d417548c54.json
 delete mode 100644 data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_dare_ties-density-0.9/733983fe-4b9c-47e6-963d-c57829b6f1af.json
 delete mode 100644 data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_linear/80c4859d-8016-4650-939f-100ba2e6d808.json
 delete mode 100644 data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_ties-density-0.1/21724d3a-cc6c-43eb-9d69-46d8d91c97f8.json
 delete mode 100644 data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_ties-density-0.3/d781945e-e9df-4136-90cd-632f0bed6246.json
 delete mode 100644 data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_ties-density-0.5/8f146bb5-dd4d-49ce-ac60-76f66321feb8.json
 delete mode 100644 data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_ties-density-0.7/89bfba6d-c622-445e-b0b9-512aadcea7cf.json
 delete mode 100644 data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_ties-density-0.9/9c27f2e6-ebbe-4fac-bc51-74455d3a6512.json
 delete mode 100644 data/hfopenllm_v2/jpacifico/Chocolatine-14B-Instruct-4k-DPO/455ef1e0-bdf2-49bf-a53d-2c9e3d00d5f3.json
 delete mode 100644 data/hfopenllm_v2/jpacifico/Chocolatine-14B-Instruct-DPO-v1.2/e04a76a6-ac22-43b2-bbf9-196a08de2949.json
 delete mode 100644 data/hfopenllm_v2/jpacifico/Chocolatine-14B-Instruct-DPO-v1.3/2fcb74f0-add1-4d46-8a0f-8578a616dbed.json
 delete mode 100644 data/hfopenllm_v2/jpacifico/Chocolatine-2-14B-Instruct-DPO-v2.0b1/51530638-ef76-43ce-9396-8a0d07988712.json
 delete mode 100644 data/hfopenllm_v2/jpacifico/Chocolatine-2-14B-Instruct-v2.0.1/74d99e4d-0e6f-4804-aa52-0dc76d37fac3.json
 delete mode 100644 data/hfopenllm_v2/jpacifico/Chocolatine-2-14B-Instruct-v2.0.3/80e8b9f0-b507-4927-9d24-1c793e3783cc.json
 delete mode 100644 data/hfopenllm_v2/jpacifico/Chocolatine-2-14B-Instruct-v2.0/7b037520-a5e9-4b58-80f3-f0ecc5957c67.json
 delete mode 100644 data/hfopenllm_v2/jpacifico/Chocolatine-2-14B-Instruct-v2.0b2/10b88d05-62d2-4603-9d04-b0854e39ed40.json
 delete mode 100644 data/hfopenllm_v2/jpacifico/Chocolatine-2-14B-Instruct-v2.0b3/4b693f41-d811-4b64-892c-d840eee5ace4.json
 delete mode 100644 data/hfopenllm_v2/jpacifico/Chocolatine-3B-Instruct-DPO-Revised/90d86c8c-3aa6-42ba-a94f-75c961e65c41.json
 delete mode 100644 data/hfopenllm_v2/jpacifico/Chocolatine-3B-Instruct-DPO-v1.0/8318ae52-6ae3-45ce-82db-73f8cb5ad7c7.json
 delete mode 100644 data/hfopenllm_v2/jpacifico/Chocolatine-3B-Instruct-DPO-v1.2/b20a1d13-2f14-42e4-bdde-49f053cef325.json
 delete mode 100644 data/hfopenllm_v2/jpacifico/Distilucie-7B-Math-Instruct-DPO-v0.1/51521dfb-d4b5-45df-ac2a-54190aed0b9f.json
 delete mode 100644 data/hfopenllm_v2/jpacifico/Lucie-7B-Instruct-DPO-v1.1.3/997a1ceb-185a-4e6c-8383-eb5a6f976771.json
 delete mode 100644 data/hfopenllm_v2/jpacifico/Lucie-7B-Instruct-DPO-v1.1/22101998-c3d3-414f-9ed1-99330cdbe3b2.json
 delete mode 100644 data/hfopenllm_v2/jpacifico/Lucie-7B-Instruct-Merged-Model_Stock-v1.0/a2408953-a7eb-449c-b80c-3620915d44d0.json
 delete mode 100644 data/hfopenllm_v2/jpacifico/Lucie-7B-Instruct-Merged-Model_Stock-v1.1/d65e5b08-7d3c-4c0d-85fa-496db65a235c.json
 delete mode 100644 data/hfopenllm_v2/jpacifico/Lucie-Boosted-7B-Instruct/ce2c9614-46d2-481d-ac25-3cc71a93bd5e.json
 delete mode 100644 data/hfopenllm_v2/jsfs11/L3-8B-Stheno-slerp/e9ba998d-8147-4046-afae-9ee7d544e98d.json
 delete mode 100644 data/hfopenllm_v2/jsfs11/MixtureofMerges-MoE-4x7b-v4/c44f1012-1123-42c8-b110-5735dc756fd5.json
 delete mode 100644 data/hfopenllm_v2/jsfs11/MixtureofMerges-MoE-4x7b-v5/5088f6a6-2acf-4d10-8b78-0d5bd4126ab5.json
 delete mode 100644 data/hfopenllm_v2/kaist-ai/janus-7b/b4d96088-5cc0-4ebc-8b8b-8c7e9f90420b.json
 delete mode 100644 data/hfopenllm_v2/kaist-ai/janus-dpo-7b/529dba11-53af-4045-ae46-04e1b9838d4a.json
 delete mode 100644 data/hfopenllm_v2/kaist-ai/janus-rm-7b/391f6d6c-418f-44be-910a-fb90b5712649.json
 delete mode 100644 data/hfopenllm_v2/kaist-ai/mistral-orpo-capybara-7k/2ccccb4b-7260-4a1a-9426-117e359c7c5c.json
 delete mode 100644 data/hfopenllm_v2/kavonalds/BunderMaxx-0710/84afecec-453d-491c-9f5a-de31d8fba43e.json
 delete mode 100644 data/hfopenllm_v2/kavonalds/BunderMaxx-0710/dba3a3a4-cd23-44c9-823f-0bd88cf6465b.json
 delete mode 100644 data/hfopenllm_v2/kavonalds/BunderMaxx-1010/1179bcce-558e-40ad-8537-c74c59557975.json
 delete mode 100644 data/hfopenllm_v2/kavonalds/Lancer-1-1b-Instruct/fe0a5c17-6c8d-4f06-a58e-47648ef9ecec.json
 delete mode 100644 data/hfopenllm_v2/kayfour/T3Q-Qwen2.5-7B-it-KOR-Safe/81cf8cbd-33bc-44ab-930a-65242e1ae7b2.json
 delete mode 100644 data/hfopenllm_v2/keeeeenw/MicroLlama/173bb053-e817-4551-b169-c3f71163650a.json
 delete mode 100644 data/hfopenllm_v2/kekmodel/StopCarbon-10.7B-v5/b7e6a86f-340c-48ed-a828-2e80a13aa515.json
 delete mode 100644 data/hfopenllm_v2/kevin009/llamaRAGdrama/bd221eee-7aa8-4d6f-a6be-89ee5568e729.json
 delete mode 100644 data/hfopenllm_v2/khoantap/cheap-moe-merge/8727a325-a515-4456-ba34-65c30f84644a.json
 delete mode 100644 data/hfopenllm_v2/khoantap/llama-3-8b-stock-merge/3e4011fa-d480-4c16-9371-2025bc834358.json
 delete mode 100644 data/hfopenllm_v2/khoantap/llama-breadcrumbs-ties-merge/867499a7-589b-4564-b04d-a004b7c0abb4.json
 delete mode 100644 data/hfopenllm_v2/khoantap/llama-evolve-ties-best-merge/52f1fb51-fc7e-4cc2-918a-7c7226ae2ce5.json
 delete mode 100644 data/hfopenllm_v2/khoantap/llama-linear-0.5-0.5-1-merge/5f4a8fb6-b22d-4eb2-aaef-da05ca45fbeb.json
 delete mode 100644 data/hfopenllm_v2/khoantap/llama-linear-0.5-1-0.5-merge/3278855d-7bd1-4e7e-b27b-b1393006e7e7.json
 delete mode 100644 data/hfopenllm_v2/khoantap/llama-linear-1-0.5-0.5-merge/5193ab4d-1627-43b5-bfb7-89e08ea1f810.json
 delete mode 100644 data/hfopenllm_v2/khoantap/llama-slerp-merge/598faeda-48fb-43a8-aaa9-849d5dfcea79.json
 delete mode 100644 data/hfopenllm_v2/khoantap/moe-out-merge/d1afa2fb-1256-4dd3-b13b-802917bf481b.json
 delete mode 100644 data/hfopenllm_v2/khulaifi95/Llama-3.1-8B-Reason-Blend-888k/397c9bc3-0af5-453c-9b68-5360783dfbf7.json
 delete mode 100644 data/hfopenllm_v2/kms7530/chemeng_llama-3-8b-Instruct-bnb-4bit_24_1_100_1/9bb39652-c79a-42bf-b6d8-c4ed6174a4c7.json
 delete mode 100644 data/hfopenllm_v2/kms7530/chemeng_phi-3-mini-4k-instruct-bnb-4bit_16_4_100_1_nonmath/7e793244-b746-4aa4-a401-dcf5884f61a4.json
 delete mode 100644 data/hfopenllm_v2/kms7530/chemeng_qwen-math-7b_24_1_100_1/26a8da03-debd-41e3-8ee1-2827d76b26ca.json
 delete mode 100644 data/hfopenllm_v2/kms7530/chemeng_qwen-math-7b_24_1_100_1_nonmath/e214c326-dd84-4915-bba1-faaafbb026b2.json
 delete mode 100644 data/hfopenllm_v2/kno10/ende-chat-0.0.5/98a5ea0a-6e45-48f8-8219-32099b9fa9d0.json
 delete mode 100644 data/hfopenllm_v2/kno10/ende-chat-0.0.7/40d7d17d-2d41-4d23-83c1-ab5f3320e36e.json
 delete mode 100644 data/hfopenllm_v2/kyutai/helium-1-preview-2b/d881a83a-9ba8-4919-8b89-45f5a7220621.json
 delete mode 100644 data/hfopenllm_v2/kz919/QwQ-0.5B-Distilled-SFT/d6c966a1-7927-424a-9886-b98688d27e6f.json
 delete mode 100644 data/hfopenllm_v2/ladydaina/ECE-FDF/c09fe163-a7f7-4b6b-b407-ee8d698b2ee8.json
 delete mode 100644 data/hfopenllm_v2/laislemke/LLaMA-2-vicuna-7b-slerp/b3979c7f-0596-4a24-b264-73a17ba19821.json
 delete mode 100644 data/hfopenllm_v2/lalainy/ECE-PRYMMAL-0.5B-FT-V5-MUSR/f6156893-92e7-4c4f-bff4-8b6d774ecbd8.json
 delete mode 100644 data/hfopenllm_v2/lalainy/ECE-PRYMMAL-0.5B-SLERP-V4/8b1c19e0-8b47-46ae-8bf3-f84c7d3a9c0e.json
 delete mode 100644 data/hfopenllm_v2/lalainy/ECE-PRYMMAL-YL-0.5B-SLERP-BIS-V1/6221102e-4e8c-46dd-8c03-fa9e92b7e4ea.json
 delete mode 100644 data/hfopenllm_v2/lalainy/ECE-PRYMMAL-YL-1B-SLERP-V3/329e5e91-10ba-4795-ae86-dda95e698b4f.json
 delete mode 100644 data/hfopenllm_v2/lalainy/ECE-PRYMMAL-YL-1B-SLERP-V4/3fe89b13-135d-4790-871d-74e7a28ea2e9.json
 delete mode 100644 data/hfopenllm_v2/lalainy/ECE-PRYMMAL-YL-6B-SLERP-V1/4b807741-f1b9-4964-9bc9-bb93f9b34217.json
 delete mode 100644 data/hfopenllm_v2/lalainy/ECE-PRYMMAL-YL-6B-SLERP-V2/c52a8a4d-be91-4a0d-8cd5-8473a42f0978.json
 delete mode 100644 data/hfopenllm_v2/langgptai/Qwen-las-v0.1/f6e157c4-0ce9-41c9-b885-9222d894ff0c.json
 delete mode 100644 data/hfopenllm_v2/langgptai/qwen1.5-7b-chat-sa-v0.1/fe52a94a-5324-4b59-accc-dfd1f9d4aead.json
 delete mode 100644 data/hfopenllm_v2/lars1234/Mistral-Small-24B-Instruct-2501-writer/1241f5e3-54eb-429e-b109-a5e163e39eda.json
 delete mode 100644 data/hfopenllm_v2/leafspark/Llama-3.1-8B-MultiReflection-Instruct/8ccc7c8c-1d14-45bb-9a6b-f8f69e506139.json
 delete mode 100644 data/hfopenllm_v2/lemon07r/Gemma-2-Ataraxy-9B/5531b59e-24c0-41af-ab6b-d6a5e38b0a98.json
 delete mode 100644 data/hfopenllm_v2/lemon07r/Gemma-2-Ataraxy-Advanced-9B/63e82cb3-2f6f-4617-abb7-ae093bc27830.json
 delete mode 100644 data/hfopenllm_v2/lemon07r/Gemma-2-Ataraxy-Remix-9B/0feb74e6-40d4-472d-9233-27faa2d3f802.json
 delete mode 100644 data/hfopenllm_v2/lemon07r/Gemma-2-Ataraxy-v2-9B/e74dd005-c9b5-45c9-b7f5-455c3110e09b.json
 delete mode 100644 data/hfopenllm_v2/lemon07r/Gemma-2-Ataraxy-v2a-9B/d094bf6f-9952-45c7-995e-d7eda07f4668.json
 delete mode 100644 data/hfopenllm_v2/lemon07r/Gemma-2-Ataraxy-v2f-9B/0e5f3393-8a6a-4f2f-948a-a37ae4d8fdeb.json
 delete mode 100644 data/hfopenllm_v2/lemon07r/Gemma-2-Ataraxy-v3-Advanced-9B/f91982ac-0cab-415a-8503-e090d195bd05.json
 delete mode 100644 data/hfopenllm_v2/lemon07r/Gemma-2-Ataraxy-v3b-9B/fb1af66e-7828-495b-8277-5cff77c3070e.json
 delete mode 100644 data/hfopenllm_v2/lemon07r/Gemma-2-Ataraxy-v3i-9B/ac84c157-4d11-43c1-8731-b1e5cfa91668.json
 delete mode 100644 data/hfopenllm_v2/lemon07r/Gemma-2-Ataraxy-v3j-9B/bbc812dd-9a9c-4f99-b813-50361025eea3.json
 delete mode 100644 data/hfopenllm_v2/lemon07r/Gemma-2-Ataraxy-v4-Advanced-9B/fc818799-49d5-4fca-b131-ebe8d5d831f1.json
 delete mode 100644 data/hfopenllm_v2/lemon07r/Gemma-2-Ataraxy-v4a-Advanced-9B/33349989-8573-4d71-ae0f-99691fdaffc3.json
 delete mode 100644 data/hfopenllm_v2/lemon07r/Gemma-2-Ataraxy-v4b-9B/91551de5-d8ac-4c0d-b9b4-3627db947f0e.json
 delete mode 100644 data/hfopenllm_v2/lemon07r/Gemma-2-Ataraxy-v4c-9B/c2d2c1f4-aaab-45f1-b3f6-5b4ea56b696e.json
 delete mode 100644 data/hfopenllm_v2/lemon07r/Gemma-2-Ataraxy-v4d-9B/36821a8b-af18-4631-b4b0-7e4b37bb194b.json
 delete mode 100644 data/hfopenllm_v2/lemon07r/Llama-3-RedMagic4-8B/e402d129-f4f1-4b95-b079-4f30936119aa.json
 delete mode 100644 data/hfopenllm_v2/lemon07r/llama-3-NeuralMahou-8b/814e1ea7-a639-4b05-9208-0bf537ea5479.json
 delete mode 100644 data/hfopenllm_v2/lesubra/ECE-EIFFEL-3B/35a50d36-31d0-454b-a13c-80ca26945f94.json
 delete mode 100644 data/hfopenllm_v2/lesubra/ECE-EIFFEL-3Bv2/87347017-4ff1-4bd3-a1d7-8f3999061209.json
 delete mode 100644 data/hfopenllm_v2/lesubra/ECE-EIFFEL-3Bv3/976184ed-c4ed-4898-83c7-521a8a8309ac.json
 delete mode 100644 data/hfopenllm_v2/lesubra/ECE-PRYMMAL-3B-SLERP-V1/fa52f072-7725-4a4e-b728-042e5897a1bd.json
 delete mode 100644 data/hfopenllm_v2/lesubra/ECE-PRYMMAL-3B-SLERP-V2/6374dcee-301c-4f28-9316-82ed8e693089.json
 delete mode 100644 data/hfopenllm_v2/lesubra/ECE-PRYMMAL-3B-SLERP_2-V1/b7c95cb4-f32f-466e-a28c-32afd9ec5578.json
 delete mode 100644 data/hfopenllm_v2/lesubra/ECE-PRYMMAL-3B-SLERP_2-V2/bddd742b-f7c9-44aa-ad2f-83f51a4625be.json
 delete mode 100644 data/hfopenllm_v2/lesubra/merge-test/099af0ee-c06b-4435-8f97-27681f3eddff.json
 delete mode 100644 data/hfopenllm_v2/lightblue/suzume-llama-3-8B-multilingual-orpo-borda-full/fa826f3a-8688-4518-8d44-68189abb47ba.json
 delete mode 100644 data/hfopenllm_v2/lightblue/suzume-llama-3-8B-multilingual-orpo-borda-half/10d29dc0-3486-40df-9933-1ce8f0fabaa2.json
 delete mode 100644 data/hfopenllm_v2/lightblue/suzume-llama-3-8B-multilingual-orpo-borda-top25/741ff375-3392-461e-a9b0-e0dab4e6e9f8.json
 delete mode 100644 data/hfopenllm_v2/lightblue/suzume-llama-3-8B-multilingual-orpo-borda-top75/c3d709de-118d-40c2-ab89-040efedd7fdb.json
 delete mode 100644 data/hfopenllm_v2/lightblue/suzume-llama-3-8B-multilingual/9be3dd27-93fa-49e9-a628-5a77a8a3bb9a.json
 delete mode 100644 data/hfopenllm_v2/lkoenig/BBAI_145_/be850d1b-bf75-4c34-830f-8881792ac842.json
 delete mode 100644 data/hfopenllm_v2/lkoenig/BBAI_200_Gemma/6b644b97-4fc3-4826-9ea9-68be1dc8e947.json
 delete mode 100644 data/hfopenllm_v2/lkoenig/BBAI_212_QwenLawLo/861d41f1-6d33-4e07-96ea-2c39a36c4b63.json
 delete mode 100644 data/hfopenllm_v2/lkoenig/BBAI_212_Qwencore/7501b038-4847-45bc-8b92-6800d7a58c1e.json
 delete mode 100644 data/hfopenllm_v2/lkoenig/BBAI_230_Xiaqwen/db48206d-700b-45f3-b597-8752110113b5.json
 delete mode 100644 data/hfopenllm_v2/lkoenig/BBAI_375_QwenDyancabs/b52b76e4-9dec-4336-88b1-d98b95b95d2a.json
 delete mode 100644 data/hfopenllm_v2/lkoenig/BBAI_456_QwenKoen/ba9ec2ea-2bce-4999-9e48-e1d0795b31d0.json
 delete mode 100644 data/hfopenllm_v2/lkoenig/BBAI_7B_KoenQwenDyan/724221ce-d7b2-43cb-8e16-72ac529a7b60.json
 delete mode 100644 data/hfopenllm_v2/lkoenig/BBAI_7B_Qwen2.5koen/552f3814-d071-4d00-a895-b739dffdcb2d.json
 delete mode 100644 data/hfopenllm_v2/lkoenig/BBAI_7B_QwenDyanKoenLo/d3819133-bae8-493d-9a86-aee67da5d115.json
 delete mode 100644 data/hfopenllm_v2/lkoenig/BBAI_7B_QwenDyancabsLAW/5c3a022f-7221-4b4f-ab67-d5b69c558434.json
 delete mode 100644 data/hfopenllm_v2/llmat/Mistral-v0.3-7B-ORPO/c161b868-746f-4d88-9f41-eb8283a7b87a.json
 delete mode 100644 data/hfopenllm_v2/llmat/Mistral-v0.3-7B-ORPO/f79a76fc-09ff-48c8-b0e7-5f18e0750e6d.json
 delete mode 100644 data/hfopenllm_v2/llnYou/ECE-PRYMMAL-YL-1B-SLERP-V5/39f4d1ab-fd42-4746-b949-9666ce32f9d1.json
 delete mode 100644 data/hfopenllm_v2/llnYou/ECE-PRYMMAL-YL-1B-SLERP-V6/8348f316-9109-4229-9fee-edc02431befa.json
 delete mode 100644 data/hfopenllm_v2/llnYou/ECE-PRYMMAL-YL-3B-SLERP-V1/6b2346c6-5fbf-4195-b3bb-66bbd446ca53.json
 delete mode 100644 data/hfopenllm_v2/llnYou/ECE-PRYMMAL-YL-3B-SLERP-V2/8645ffc1-6487-4205-b8b0-e980e094ac6c.json
 delete mode 100644 data/hfopenllm_v2/llnYou/ECE-PRYMMAL-YL-3B-SLERP-V3/2c6d1e57-7673-4a86-808e-6ff6a7146a11.json
 delete mode 100644 data/hfopenllm_v2/lmsys/vicuna-13b-v1.3/64ab8b1a-62be-4561-8f0c-e42f1fe37178.json
 delete mode 100644 data/hfopenllm_v2/lmsys/vicuna-7b-v1.3/3eb22885-eb7c-4c85-b79f-cd47ffacd551.json
 delete mode 100644 data/hfopenllm_v2/lmsys/vicuna-7b-v1.5/8956d608-c627-469b-943d-bfad6c7382af.json
 delete mode 100644 data/hfopenllm_v2/lodrick-the-lafted/llama-3.1-8b-instruct-ortho-v7/9ff060c8-d4fa-4880-a0cd-9581f5c2f574.json
 delete mode 100644 data/hfopenllm_v2/lordjia/Llama-3-Cantonese-8B-Instruct/e3d6b3d7-a231-40c1-bac9-0b7fcb478bca.json
 delete mode 100644 data/hfopenllm_v2/lordjia/Qwen2-Cantonese-7B-Instruct/20acb302-3a74-4425-af4c-a1d719b90a88.json
 delete mode 100644 data/hfopenllm_v2/lt-asset/nova-1.3b/a8613588-687d-4291-ae5a-57688501cffd.json
 delete mode 100644 data/hfopenllm_v2/lunahr/thea-3b-50r-u1/83dd67cb-5508-4aa5-9435-d5585b7f3d52.json
 delete mode 100644 data/hfopenllm_v2/lunahr/thea-v2-3b-50r/26d981bb-f2e5-4195-8d6f-594bb0b26f4a.json
 delete mode 100644 data/hfopenllm_v2/m42-health/Llama3-Med42-70B/df06c977-b54c-4668-837f-eb583ef24d29.json
 delete mode 100644 data/hfopenllm_v2/macadeliccc/Samantha-Qwen-2-7B/31a8ac03-f58b-46e3-9f17-53311b1fd506.json
 delete mode 100644 data/hfopenllm_v2/macadeliccc/magistrate-3.2-3b-base/3e4a7141-7a82-421a-a107-bbac3cbafc9b.json
 delete mode 100644 data/hfopenllm_v2/macadeliccc/magistrate-3.2-3b-it/9a3069f2-81ed-484a-b6e6-a45a259e9a43.json
 delete mode 100644 data/hfopenllm_v2/magnifi/Phi3_intent_v56_3_w_unknown_5_lr_0.002/c0a3d0c3-c541-4606-a925-4100b062284f.json
 delete mode 100644 data/hfopenllm_v2/maldv/Awqward2.5-32B-Instruct/20685a4b-686f-4cd4-b49d-3067a005256d.json
 delete mode 100644 data/hfopenllm_v2/maldv/Lytta2.5-32B-Instruct/85a91293-cd51-4f79-8b98-2f4bc67d78c1.json
 delete mode 100644 data/hfopenllm_v2/maldv/Qwentile2.5-32B-Instruct/d2e3a6c2-4e67-4150-b9a8-fec979fb1658.json
 delete mode 100644 data/hfopenllm_v2/maldv/badger-kappa-llama-3-8b/c4d686f2-2af1-4271-9556-09380f07ba5f.json
 delete mode 100644 data/hfopenllm_v2/maldv/badger-lambda-llama-3-8b/93167303-b38e-43f0-a552-72c26ccb4339.json
 delete mode 100644 data/hfopenllm_v2/maldv/badger-mu-llama-3-8b/b52a176f-f369-4791-a7e3-88a72709c868.json
 delete mode 100644 data/hfopenllm_v2/maldv/badger-writer-llama-3-8b/b6310012-17f1-4ee0-abd0-0079a9299350.json
 delete mode 100644 data/hfopenllm_v2/marcuscedricridia/Cheng-1/f581e832-0f77-496e-bcd3-6cfec51ef594.json
 delete mode 100644 data/hfopenllm_v2/marcuscedricridia/Cheng-2-v1.1/47b47c89-b13b-4099-98b2-854feae05f63.json
 delete mode 100644 data/hfopenllm_v2/marcuscedricridia/Cheng-2/8d51ae58-7b20-4fa4-b234-2abb9cdeaad4.json
 delete mode 100644 data/hfopenllm_v2/marcuscedricridia/Hush-Qwen2.5-7B-MST-v1.1/4d4d5679-8ec6-49b8-a5d7-2a76497b44b7.json
 delete mode 100644 data/hfopenllm_v2/marcuscedricridia/Hush-Qwen2.5-7B-MST-v1.3/0bdb6574-69e2-4858-b7aa-a90a5fadf741.json
 delete mode 100644 data/hfopenllm_v2/marcuscedricridia/Hush-Qwen2.5-7B-MST/fa1a92bb-ad25-4be2-a35f-7fdebbeeeba8.json
 delete mode 100644 data/hfopenllm_v2/marcuscedricridia/Hush-Qwen2.5-7B-Preview/d62ea0a1-cc9d-41b7-8d60-479b8e2262b5.json
 delete mode 100644 data/hfopenllm_v2/marcuscedricridia/Hush-Qwen2.5-7B-RP-v1.4-1M/912446e3-efdf-4ed0-80bd-261c6c87a3d0.json
 delete mode 100644 data/hfopenllm_v2/marcuscedricridia/Hush-Qwen2.5-7B-v1.1/5e86dc31-ae3e-4ef7-858e-41e29b3a8031.json
 delete mode 100644 data/hfopenllm_v2/marcuscedricridia/Hush-Qwen2.5-7B-v1.2/80680e5e-ab83-4a59-aeec-9d4166509c47.json
 delete mode 100644 data/hfopenllm_v2/marcuscedricridia/Hush-Qwen2.5-7B-v1.3/c5bc9c92-8469-4174-aafd-67bb61aaccf2.json
 delete mode 100644 data/hfopenllm_v2/marcuscedricridia/Hush-Qwen2.5-7B-v1.4/1d67b792-178b-4baa-a108-2362f658bd4e.json
 delete mode 100644 data/hfopenllm_v2/marcuscedricridia/Qwen2.5-7B-Preview/eb0c87b0-4795-4029-82c1-57ce37ba8259.json
 delete mode 100644 data/hfopenllm_v2/marcuscedricridia/Yell-Qwen2.5-7B-Preview-v1.1/dc9b2300-7ab0-4e92-9d23-15fe9ca52994.json
 delete mode 100644 data/hfopenllm_v2/marcuscedricridia/Yell-Qwen2.5-7B-Preview/e005624d-c822-4be1-9477-873642aae228.json
 delete mode 100644 data/hfopenllm_v2/marcuscedricridia/absolute-o1-7b/e9756d91-b9e2-4dd0-bf08-c6154c7d1f2e.json
 delete mode 100644 data/hfopenllm_v2/marcuscedricridia/cursa-o1-7b-2-28-2025/704598c3-c5d6-4ce0-bab3-0fa98118e16a.json
 delete mode 100644 data/hfopenllm_v2/marcuscedricridia/cursa-o1-7b-v1.1/fafc9463-d725-4827-8bc1-5cd9e83814b6.json
 delete mode 100644 data/hfopenllm_v2/marcuscedricridia/cursa-o1-7b-v1.2-normalize-false/109820e0-ee00-449c-9ae5-58a7bf1da5f8.json
 delete mode 100644 data/hfopenllm_v2/marcuscedricridia/cursa-o1-7b/37f29d5b-d803-4195-9ce0-75e45e32c160.json
 delete mode 100644 data/hfopenllm_v2/marcuscedricridia/cursor-o1-7b/43546f48-8c46-4481-b1e5-f4b1ad2535be.json
 delete mode 100644 data/hfopenllm_v2/marcuscedricridia/cursorr-o1.2-7b/ec81e0ff-9cb4-4d43-9f78-1d5f4edc9103.json
 delete mode 100644 data/hfopenllm_v2/marcuscedricridia/etr1o-explicit-v1.1/9290c86f-40b0-4520-b8aa-3460de62c396.json
 delete mode 100644 data/hfopenllm_v2/marcuscedricridia/etr1o-explicit-v1.2/a4bf576e-9556-4956-8dcb-4d8906d45db0.json
 delete mode 100644 data/hfopenllm_v2/marcuscedricridia/etr1o-v1.1/320a5c00-3307-4bc3-9f47-9befb88e461c.json
 delete mode 100644 data/hfopenllm_v2/marcuscedricridia/etr1o-v1.2/844d1556-6bc6-467e-a145-f92646770727.json
 delete mode 100644 data/hfopenllm_v2/marcuscedricridia/fan-o1-7b/78923f4b-c2e7-4472-8398-10a0a8453ec5.json
 delete mode 100644 data/hfopenllm_v2/marcuscedricridia/olmner-7b/17abe1bf-2e97-409e-88e3-4f661861a195.json
 delete mode 100644 data/hfopenllm_v2/marcuscedricridia/olmner-della-7b/756978e5-1dfe-433e-ba88-339004a50ea7.json
 delete mode 100644 data/hfopenllm_v2/marcuscedricridia/olmner-o1-7b/a889ae3a-5d86-4454-bfb9-332c4b61b836.json
 delete mode 100644 data/hfopenllm_v2/marcuscedricridia/olmner-sbr-7b/2c5e1086-03b7-4cdd-801e-03fb26183076.json
 delete mode 100644 data/hfopenllm_v2/marcuscedricridia/post-cursa-o1/d9578847-b732-4c75-b246-9cdf03674fe0.json
 delete mode 100644 data/hfopenllm_v2/marcuscedricridia/pre-cursa-o1-v1.2/4c6f83fe-7896-4cf3-9434-b5f8d499f5ba.json
 delete mode 100644 data/hfopenllm_v2/marcuscedricridia/pre-cursa-o1-v1.3/619037af-d528-4579-b7e3-58628468d8fb.json
 delete mode 100644 data/hfopenllm_v2/marcuscedricridia/pre-cursa-o1-v1.4/5113b737-8d9f-4321-9a67-91f1aabb40a1.json
 delete mode 100644 data/hfopenllm_v2/marcuscedricridia/pre-cursa-o1-v1.6/641ac372-2e5a-4b44-b22e-a17600a6a868.json
 delete mode 100644 data/hfopenllm_v2/marcuscedricridia/pre-cursa-o1/7cbb0b08-871d-48fc-bf3e-86267f5ef19d.json
 delete mode 100644 data/hfopenllm_v2/marcuscedricridia/r1o-et/c82e887c-c8ab-4221-aa0b-e8b7a86e7c46.json
 delete mode 100644 data/hfopenllm_v2/marcuscedricridia/sbr-o1-7b/50c65a83-9d08-4155-ad2c-5a2f8ffc8743.json
 delete mode 100644 data/hfopenllm_v2/marcuscedricridia/stray-r1o-et/99d97aef-bb6b-471b-8ed7-f6f92f75842c.json
 delete mode 100644 data/hfopenllm_v2/matouLeLoup/ECE-PRYMMAL-0.5B-FT-EnhancedMUSREnsembleV3/b98504a0-f1d6-4872-b748-2ca8199c5328.json
 delete mode 100644 data/hfopenllm_v2/matouLeLoup/ECE-PRYMMAL-0.5B-FT-MUSR-ENSEMBLE-V2Mathis/5a159667-7460-4a97-884e-6a96df59873b.json
 delete mode 100644 data/hfopenllm_v2/matouLeLoup/ECE-PRYMMAL-0.5B-FT-V4-MUSR-ENSEMBLE-Mathis/16a2eceb-073d-4dc3-87a7-a15c641c5ebb.json
 delete mode 100644 data/hfopenllm_v2/matouLeLoup/ECE-PRYMMAL-0.5B-FT-V4-MUSR-Mathis/e8e2d04b-21db-43dc-8b8f-7fa3bba87abc.json
 delete mode 100644 data/hfopenllm_v2/matouLeLoup/ECE-PRYMMAL-0.5B-FT-V5-MUSR-Mathis/acbb93b3-f8fc-479d-9610-392efd7d4ecc.json
 delete mode 100644 data/hfopenllm_v2/mattshumer/Reflection-Llama-3.1-70B/6d0589bd-1f05-44ee-afa5-3657b960d7c9.json
 delete mode 100644 data/hfopenllm_v2/mattshumer/ref_70_e3/134663d8-05a8-4336-90e2-68e7cba5f1df.json
 delete mode 100644 data/hfopenllm_v2/maywell/Qwen2-7B-Multilingual-RP/3bfced28-b06e-46ab-a6aa-171b0c424337.json
 delete mode 100644 data/hfopenllm_v2/meditsolutions/Llama-3.1-MedIT-SUN-8B/b6a83b82-6b05-4437-a076-e2a3982f6169.json
 delete mode 100644 data/hfopenllm_v2/meditsolutions/Llama-3.2-SUN-1B-Instruct/f621201b-f571-4487-9f1e-b767675c659d.json
 delete mode 100644 data/hfopenllm_v2/meditsolutions/Llama-3.2-SUN-1B-chat/710fdb79-fba4-42da-8e26-45b4caf75207.json
 delete mode 100644 data/hfopenllm_v2/meditsolutions/Llama-3.2-SUN-2.4B-checkpoint-26000/35fa7a5e-8866-4ce3-9899-8737e908f34f.json
 delete mode 100644 data/hfopenllm_v2/meditsolutions/Llama-3.2-SUN-2.4B-checkpoint-34800/2b24b69b-15dc-4666-83f3-c77db545bdbd.json
 delete mode 100644 data/hfopenllm_v2/meditsolutions/Llama-3.2-SUN-2.4B-v1.0.0/0d00d849-2147-4fc1-9e5f-d42a95be6ca5.json
 delete mode 100644 data/hfopenllm_v2/meditsolutions/Llama-3.2-SUN-2.5B-chat/f45135b0-3c26-44b5-9922-a6c0817a172d.json
 delete mode 100644 data/hfopenllm_v2/meditsolutions/Llama-3.2-SUN-HDIC-1B-Instruct/67eb0d6c-9086-4c80-8506-c3e1489f2673.json
 delete mode 100644 data/hfopenllm_v2/meditsolutions/MSH-Lite-7B-v1-Bielik-v2.3-Instruct-Llama-Prune/79d3dc85-08f6-475c-ac2c-1ff32f5a089f.json
 delete mode 100644 data/hfopenllm_v2/meditsolutions/MSH-v1-Bielik-v2.3-Instruct-MedIT-merge/4e9b3fa2-d3d2-4e4c-a1fa-c812f481f64a.json
 delete mode 100644 data/hfopenllm_v2/meditsolutions/MedIT-Mesh-3B-Instruct/6e62a8a0-0bdf-4b6c-93de-593423dadd3a.json
 delete mode 100644 data/hfopenllm_v2/meditsolutions/SmolLM2-MedIT-Upscale-2B/871131c1-295d-40a0-a396-09d24b880064.json
 delete mode 100644 data/hfopenllm_v2/meetkai/functionary-small-v3.1/44eefbb2-22d4-4dff-889d-a87fc40b2eea.json
 delete mode 100644 data/hfopenllm_v2/meraGPT/mera-mix-4x7B/cd1de470-a174-4c08-9efe-a06d493dc4b2.json
 delete mode 100644 data/hfopenllm_v2/mergekit-community/JAJUKA-WEWILLNEVERFORGETYOU-3B/fdb55a14-0697-4775-8358-fed202498b4f.json
 delete mode 100644 data/hfopenllm_v2/mergekit-community/SuperQwen-2.5-1.5B/c069a224-638a-4cad-a9ad-e4f8579e8c15.json
 delete mode 100644 data/hfopenllm_v2/mergekit-community/VirtuosoSmall-InstructModelStock/10e5c103-f25f-45bb-bfe6-a22876cffe87.json
 delete mode 100644 data/hfopenllm_v2/mergekit-community/diabolic6045_ELN-AOC-CAIN/a9ecca9a-c5d4-45b2-a403-e74a98a46322.json
 delete mode 100644 data/hfopenllm_v2/mergekit-community/mergekit-dare_ties-ajgjgea/630d8a60-03b7-4550-82f4-e879b2e01c6c.json
 delete mode 100644 data/hfopenllm_v2/mergekit-community/mergekit-della-zgowfmf/206b5a96-ae07-41fd-822f-436d49c57dcb.json
 delete mode 100644 data/hfopenllm_v2/mergekit-community/mergekit-model_stock-azgztvm/702d2120-5301-4e03-bb0f-1f8ab19e522a.json
 delete mode 100644 data/hfopenllm_v2/mergekit-community/mergekit-slerp-fmrazcr/61e39700-c237-49fc-baef-3fa573b3b0c6.json
 delete mode 100644 data/hfopenllm_v2/mergekit-community/mergekit-ties-rraxdhv/8892ab84-750d-494f-9f87-ad28e73cf364.json
 delete mode 100644 data/hfopenllm_v2/mergekit-community/mergekit-ties-ykqemwr/538a2eb7-34e4-4e78-a382-60a13710096e.json
 delete mode 100644 data/hfopenllm_v2/mergekit-community/sexeh_time_testing/a041629e-8ed8-4a6c-95ee-98e759501e19.json
 delete mode 100644 data/hfopenllm_v2/meta-llama/Llama-2-13b-chat-hf/09f05984-5815-4b3d-bc73-83ea1e5ecc27.json
 delete mode 100644 data/hfopenllm_v2/meta-llama/Llama-2-13b-hf/6535524e-f8cf-4f2f-9d89-9ba70aedac91.json
 delete mode 100644 data/hfopenllm_v2/meta-llama/Llama-2-70b-chat-hf/08ea4f9d-0e3c-4a8b-85e6-075290d30ba4.json
 delete mode 100644 data/hfopenllm_v2/meta-llama/Llama-2-70b-hf/631f0a1f-a6f5-46f6-9aa0-31ac9764c086.json
 delete mode 100644 data/hfopenllm_v2/meta-llama/Llama-2-7b-chat-hf/b771f6db-7516-4423-9010-3467db0e26e3.json
 delete mode 100644 data/hfopenllm_v2/meta-llama/Llama-2-7b-hf/cf580dfb-2924-4c4b-9352-394275b959bd.json
 delete mode 100644 data/hfopenllm_v2/meta-llama/Llama-3.1-70B-Instruct/ba549fe6-7718-4abf-a610-7e0f48611483.json
 delete mode 100644 data/hfopenllm_v2/meta-llama/Llama-3.1-70B/b92440b1-78a9-4288-a432-f057f2b04a2f.json
 delete mode 100644 data/hfopenllm_v2/meta-llama/Llama-3.1-8B-Instruct/838f3932-edf2-4f72-9238-981d1aadc771.json
 delete mode 100644 data/hfopenllm_v2/meta-llama/Llama-3.1-8B/61e933b2-5cd1-4f08-8a9e-5b06ef54b6d5.json
 delete mode 100644 data/hfopenllm_v2/meta-llama/Llama-3.2-1B-Instruct/0b307c78-94c7-418f-bc47-5106b81c30de.json
 delete mode 100644 data/hfopenllm_v2/meta-llama/Llama-3.2-1B/18783694-3e7b-4d06-9378-5a3fa4a7a0a2.json
 delete mode 100644 data/hfopenllm_v2/meta-llama/Llama-3.2-3B-Instruct/dab922e5-1b46-4a90-b75c-1b26cd6cc6d3.json
 delete mode 100644 data/hfopenllm_v2/meta-llama/Llama-3.2-3B/8cfa1f00-3b26-4d75-9b0a-0dea65e2e352.json
 delete mode 100644 data/hfopenllm_v2/meta-llama/Llama-3.3-70B-Instruct/f74d26e6-9dfb-4e81-8522-8309b27760cf.json
 delete mode 100644 data/hfopenllm_v2/meta-llama/Meta-Llama-3-70B-Instruct/2022bcf3-a057-4b0a-aa33-6cf074ffc714.json
 delete mode 100644 data/hfopenllm_v2/meta-llama/Meta-Llama-3-70B/a6e79d12-42f6-47ad-95fa-ba03fa4d3a06.json
 delete mode 100644 data/hfopenllm_v2/meta-llama/Meta-Llama-3-8B-Instruct/24d850fe-1817-4041-8767-085f4bd2bac3.json
 delete mode 100644 data/hfopenllm_v2/meta-llama/Meta-Llama-3-8B-Instruct/610a3be1-1032-4079-ba37-d6c2c5f9fd55.json
 delete mode 100644 data/hfopenllm_v2/meta-llama/Meta-Llama-3-8B/857bb10e-1b43-4714-a758-0cef5816ba02.json
 delete mode 100644 data/hfopenllm_v2/mhl1/Qwen2.5-0.5B-cinstruct-stage1/cdabdd54-6101-471c-9bd8-446953be986b.json
 delete mode 100644 data/hfopenllm_v2/microsoft/DialoGPT-medium/8029cb75-8d3b-411d-b0eb-74539b8ecb2f.json
 delete mode 100644 data/hfopenllm_v2/microsoft/Orca-2-13b/65d10996-2c5b-4e11-9a07-319c2446a237.json
 delete mode 100644 data/hfopenllm_v2/microsoft/Orca-2-7b/ef21d739-b122-4ab8-a8ff-a7cfecad5c8e.json
 delete mode 100644 data/hfopenllm_v2/microsoft/Phi-3-medium-128k-instruct/45f3b963-497b-4d89-ac66-9ff0ba8dadf8.json
 delete mode 100644 data/hfopenllm_v2/microsoft/Phi-3-medium-4k-instruct/4173435b-d907-4ac5-a8bd-dfa2759f3fb6.json
 delete mode 100644 data/hfopenllm_v2/microsoft/Phi-3-mini-128k-instruct/b4a79f30-3a04-4f78-861e-1571316a0642.json
 delete mode 100644 data/hfopenllm_v2/microsoft/Phi-3-mini-4k-instruct/53426038-df38-45ba-b621-34231c9cad7f.json
 delete mode 100644 data/hfopenllm_v2/microsoft/Phi-3-mini-4k-instruct/fa758fe5-21ec-45cc-941f-5cb5ca0612b1.json
 delete mode 100644 data/hfopenllm_v2/microsoft/Phi-3-small-128k-instruct/d2a92a62-3bd0-4cb2-897b-742ea0d5203f.json
 delete mode 100644 data/hfopenllm_v2/microsoft/Phi-3-small-8k-instruct/8b752519-63d4-4638-b56e-1c45c7f4694e.json
 delete mode 100644 data/hfopenllm_v2/microsoft/Phi-3.5-MoE-instruct/8da71b7c-7b73-453f-998b-84e70b54e471.json
 delete mode 100644 data/hfopenllm_v2/microsoft/Phi-3.5-mini-instruct/2b7b1216-3ea7-48f1-89f6-e5d84fef2b32.json
 delete mode 100644 data/hfopenllm_v2/microsoft/Phi-4-mini-instruct/37e19712-3197-42da-a8f2-ae1f36c2b06c.json
 delete mode 100644 data/hfopenllm_v2/microsoft/phi-1/c6ae6691-64ec-443d-8d76-af614c8cc7f9.json
 delete mode 100644 data/hfopenllm_v2/microsoft/phi-1_5/80567722-8c6b-41b9-8103-3bdaedfdb8ee.json
 delete mode 100644 data/hfopenllm_v2/microsoft/phi-2/20192dc4-ea3a-4413-8457-18a592fa0c64.json
 delete mode 100644 data/hfopenllm_v2/microsoft/phi-4/8c878c05-86f7-4d61-81d7-9bb286516581.json
 delete mode 100644 data/hfopenllm_v2/microsoft/phi-4/fa753be0-4a98-4ec3-9cc9-3bf7b380ad17.json
 delete mode 100644 data/hfopenllm_v2/migtissera/Llama-3-70B-Synthia-v3.5/0516b46b-a957-413f-aadc-58f4339dc60a.json
 delete mode 100644 data/hfopenllm_v2/migtissera/Llama-3-8B-Synthia-v3.5/97200dd7-7ed0-4a7b-ace9-31c173f017f1.json
 delete mode 100644 data/hfopenllm_v2/migtissera/Tess-3-7B-SFT/758f8332-ffa8-4059-ac6f-400f9367bb23.json
 delete mode 100644 data/hfopenllm_v2/migtissera/Tess-3-Mistral-Nemo-12B/b1103662-055c-471e-ace8-dd75f607491d.json
 delete mode 100644 data/hfopenllm_v2/migtissera/Tess-v2.5-Phi-3-medium-128k-14B/27b0d675-498f-4351-b92f-7c0d1a3c83bd.json
 delete mode 100644 data/hfopenllm_v2/migtissera/Tess-v2.5.2-Qwen2-72B/3f1f88d4-2908-4f28-b8d3-4f9ded18ba0e.json
 delete mode 100644 data/hfopenllm_v2/migtissera/Trinity-2-Codestral-22B-v0.2/3883b0d3-e442-42d3-adc6-ed959c902dd3.json
 delete mode 100644 data/hfopenllm_v2/migtissera/Trinity-2-Codestral-22B-v0.2/da172cdb-1388-42f5-97b1-ae8e15291631.json
 delete mode 100644 data/hfopenllm_v2/migtissera/Trinity-2-Codestral-22B/7c94dbfa-4b3a-43fd-9f2c-b3d63d8ef700.json
 delete mode 100644 data/hfopenllm_v2/mindw96/DeepSeek-llama3.3-Bllossom-8B-DACON-LLM3/7cdd1de0-767d-4527-a024-c67166bb8b20.json
 delete mode 100644 data/hfopenllm_v2/minghaowu/Qwen1.5-1.8B-OpenHermes-2.5/d4702278-54c4-42e8-a901-dfe5c7f2004a.json
 delete mode 100644 data/hfopenllm_v2/ministral/Ministral-3b-instruct/149f8ee5-4376-4fcc-8f87-7412a3083570.json
 delete mode 100644 data/hfopenllm_v2/mistral-community/Mistral-7B-v0.2/de82b746-c5d7-450a-bc2b-1b2859d91d6b.json
 delete mode 100644 data/hfopenllm_v2/mistral-community/Mixtral-8x22B-v0.1/d2a916a6-288a-4761-a3fd-ca674edb67c1.json
 delete mode 100644 data/hfopenllm_v2/mistral-community/mixtral-8x22B-v0.3/cda497f9-c7f9-48d6-944b-0167476e5e5c.json
 delete mode 100644 data/hfopenllm_v2/mistralai/Codestral-22B-v0.1/b56c6c01-a226-4090-9332-330535d79e24.json
 delete mode 100644 data/hfopenllm_v2/mistralai/Ministral-8B-Instruct-2410/0ddc8e10-9cc5-48eb-b5b0-a2c2f071862b.json
 delete mode 100644 data/hfopenllm_v2/mistralai/Mistral-7B-Instruct-v0.1/2917c469-7e22-497e-8d62-9b9972266658.json
 delete mode 100644 data/hfopenllm_v2/mistralai/Mistral-7B-Instruct-v0.2/2424d85c-e092-4e7c-bf4f-ae014d08a159.json
 delete mode 100644 data/hfopenllm_v2/mistralai/Mistral-7B-Instruct-v0.3/90278363-1d8f-47ca-a7dc-c51c6b511dc9.json
 delete mode 100644 data/hfopenllm_v2/mistralai/Mistral-7B-v0.1/3c3197ee-675d-4bb7-874d-28104d2a3cae.json
 delete mode 100644 data/hfopenllm_v2/mistralai/Mistral-7B-v0.3/eb5a8679-bfdd-40f2-9a32-55c04a65ae7e.json
 delete mode 100644 data/hfopenllm_v2/mistralai/Mistral-Large-Instruct-2411/d770f88d-b110-4f27-85e9-e52217c11798.json
 delete mode 100644 data/hfopenllm_v2/mistralai/Mistral-Nemo-Base-2407/364328ce-5de7-401f-ad84-0c76e3c1dc91.json
 delete mode 100644 data/hfopenllm_v2/mistralai/Mistral-Nemo-Instruct-2407/f7dcfdbb-ff12-4692-9702-712de3d0b7ba.json
 delete mode 100644 data/hfopenllm_v2/mistralai/Mistral-Small-24B-Base-2501/d641aa88-9981-4a25-90d5-fcc4564ede52.json
 delete mode 100644 data/hfopenllm_v2/mistralai/Mistral-Small-Instruct-2409/8915e742-df2e-41bc-b83f-3e111edfd257.json
 delete mode 100644 data/hfopenllm_v2/mistralai/Mistral-Small-Instruct-2409/e29a5e35-8677-4e53-83fd-85e919b4366a.json
 delete mode 100644 data/hfopenllm_v2/mistralai/Mixtral-8x22B-Instruct-v0.1/e5c55d38-dc04-42b4-9aca-ae7be436ebe0.json
 delete mode 100644 data/hfopenllm_v2/mistralai/Mixtral-8x22B-v0.1/504baceb-6684-430d-a532-b7b5b0b061fe.json
 delete mode 100644 data/hfopenllm_v2/mistralai/Mixtral-8x7B-Instruct-v0.1/31fcd34a-af1e-4eab-bd9a-5ec17eb572d2.json
 delete mode 100644 data/hfopenllm_v2/mistralai/Mixtral-8x7B-v0.1/01ab0a3e-393a-497a-9b32-8af790b7581a.json
 delete mode 100644 data/hfopenllm_v2/mistralai/Mixtral-8x7B-v0.1/541967a6-b856-4dc9-958a-9335197fba99.json
 delete mode 100644 data/hfopenllm_v2/mixtao/MixTAO-7Bx2-MoE-v8.1/ee31c801-67cb-46a3-9e39-02e842c0473f.json
 delete mode 100644 data/hfopenllm_v2/mkurman/llama-3.2-MEDIT-3B-o1/65fabe8b-05af-461e-b804-fcff3492da34.json
 delete mode 100644 data/hfopenllm_v2/mkurman/phi-4-MedIT-11B-exp-1/7e1a7121-2c9f-4196-bbdd-48aea257f384.json
 delete mode 100644 data/hfopenllm_v2/mkurman/phi4-MedIT-10B-o1/dd32609c-316e-4511-8791-fcae33a1a506.json
 delete mode 100644 data/hfopenllm_v2/mkxu/llama-3-8b-instruct-fpo/d95d7058-49eb-47d7-b790-3a253291d22b.json
 delete mode 100644 data/hfopenllm_v2/mkxu/llama-3-8b-po1/37cbc3d6-1198-4e23-b86c-1fd979eacd9a.json
 delete mode 100644 data/hfopenllm_v2/mlabonne/AlphaMonarch-7B/76d0d338-e502-4638-adad-c4c4df00c26f.json
 delete mode 100644 data/hfopenllm_v2/mlabonne/Beyonder-4x7B-v3/f47375bd-547a-4d0b-8c96-bbe2bc1ac445.json
 delete mode 100644 data/hfopenllm_v2/mlabonne/BigQwen2.5-52B-Instruct/6b1ed68c-3099-4bd7-892b-cdc36c90ccfe.json
 delete mode 100644 data/hfopenllm_v2/mlabonne/BigQwen2.5-Echo-47B-Instruct/0e59c8ca-cde0-4482-ab03-3309bcb8737c.json
 delete mode 100644 data/hfopenllm_v2/mlabonne/ChimeraLlama-3-8B-v2/d7e900e2-0574-44cd-a68a-0dd2715cf48c.json
 delete mode 100644 data/hfopenllm_v2/mlabonne/ChimeraLlama-3-8B-v3/fd626c3f-566d-4193-9a85-e7c9a89e671c.json
 delete mode 100644 data/hfopenllm_v2/mlabonne/Daredevil-8B-abliterated/196b04ae-fd53-400f-9f08-19edd4959f6e.json
 delete mode 100644 data/hfopenllm_v2/mlabonne/Daredevil-8B/57177299-076a-4506-89a7-ce54af08df4f.json
 delete mode 100644 data/hfopenllm_v2/mlabonne/Hermes-3-Llama-3.1-70B-lorablated/d3bdf36f-7f89-4b5a-b6cb-847b49200b5b.json
 delete mode 100644 data/hfopenllm_v2/mlabonne/Meta-Llama-3.1-8B-Instruct-abliterated/92619b9e-dacf-4d0a-9f8b-6e131af74fa4.json
 delete mode 100644 data/hfopenllm_v2/mlabonne/NeuralBeagle14-7B/cbb408ea-ced6-4f47-9066-d4ff6d604b1e.json
 delete mode 100644 data/hfopenllm_v2/mlabonne/NeuralDaredevil-8B-abliterated/6999bb02-29fd-4c59-886f-184362afa06e.json
 delete mode 100644 data/hfopenllm_v2/mlabonne/NeuralDaredevil-8B-abliterated/913d1d8e-0b02-4ce5-9b7c-403143a8c880.json
 delete mode 100644 data/hfopenllm_v2/mlabonne/OrpoLlama-3-8B/82c87bc0-29cf-4150-92f5-c80fb0028ea6.json
 delete mode 100644 data/hfopenllm_v2/mlabonne/phixtral-2x2_8/a18834ad-6143-4ce2-9842-471817a60a39.json
 delete mode 100644 data/hfopenllm_v2/mlx-community/Josiefied-Qwen2.5-0.5B-Instruct-abliterated-v1-float32/be900bcf-8ec9-484f-81db-0e83975c1ecd.json
 delete mode 100644 data/hfopenllm_v2/mlx-community/Mistral-Small-24B-Instruct-2501-bf16/d226ccf6-674b-44c6-8b11-d782b59a961a.json
 delete mode 100644 data/hfopenllm_v2/mmnga/Llama-3-70B-japanese-suzume-vector-v0.1/d8839a1a-8d07-4e0b-bd44-2668c84f750c.json
 delete mode 100644 data/hfopenllm_v2/mobiuslabsgmbh/DeepSeek-R1-ReDistill-Llama3-8B-v1.1/e90b04db-2eb3-483a-ab0e-ea8aef821d84.json
 delete mode 100644 data/hfopenllm_v2/mobiuslabsgmbh/DeepSeek-R1-ReDistill-Qwen-7B-v1.1/900921ae-fbb2-4488-ab19-18987c1d008d.json
 delete mode 100644 data/hfopenllm_v2/moeru-ai/L3.1-Moe-2x8B-v0.2/0da0a7cd-c075-4bc0-8e88-8acc7212e5c3.json
 delete mode 100644 data/hfopenllm_v2/moeru-ai/L3.1-Moe-4x8B-v0.1/b50a49cd-2909-4dbe-9c9f-c150abb99845.json
 delete mode 100644 data/hfopenllm_v2/moeru-ai/L3.1-Moe-4x8B-v0.2/13831d81-a9dd-43c7-bce1-240aad42fbc6.json
 delete mode 100644 data/hfopenllm_v2/monsterapi/Llama-3_1-8B-Instruct-orca-ORPO/56ea7cb3-3a1e-477a-bac8-26a0fde6297a.json
 delete mode 100644 data/hfopenllm_v2/monsterapi/gemma-2-2b-LoRA-MonsterInstruct/8ce19b33-4f2b-4b8d-80bd-1ed399a5e9dd.json
 delete mode 100644 data/hfopenllm_v2/mosaicml/mpt-7b/18ab167d-b72e-4fa9-94a8-09edc641c73f.json
 delete mode 100644 data/hfopenllm_v2/mosama/Qwen2.5-1.5B-Instruct-CoT-Reflection/7df237ea-29c0-4d0a-9092-c41df4c13aca.json
 delete mode 100644 data/hfopenllm_v2/mrdayl/OpenCogito/e5dc8caa-2d86-4ff0-af8d-22d85c8faeb0.json
 delete mode 100644 data/hfopenllm_v2/mrdayl/OpenCognito-r1/01591bb6-9daf-40fb-b802-0a007f4cc388.json
 delete mode 100644 data/hfopenllm_v2/mrdayl/OpenCognito-r2/f6c32abf-bbae-4827-9ce2-29ce20c9463e.json
 delete mode 100644 data/hfopenllm_v2/mrdayl/OpenCognito/74a6605d-3557-4458-bef5-cc9420434e68.json
 delete mode 100644 data/hfopenllm_v2/mrdayl/OpenThink/dbe6e126-d35c-4634-a544-adf374ed5d00.json
 delete mode 100644 data/hfopenllm_v2/mrm8488/phi-4-14B-grpo-gsm8k-3e/d68681c1-01e4-4af0-9a81-e0aaed0ae865.json
 delete mode 100644 data/hfopenllm_v2/mrm8488/phi-4-14B-grpo-limo/de9620b8-7112-436f-8941-fae2c5e7f9e0.json
 delete mode 100644 data/hfopenllm_v2/mukaj/Llama-3.1-Hawkish-8B/cafee7ac-deb6-4c4b-af8f-81548648cb14.json
 delete mode 100644 data/hfopenllm_v2/natong19/Mistral-Nemo-Instruct-2407-abliterated/3e3cb617-6f19-4731-b31a-b1f4d88237d5.json
 delete mode 100644 data/hfopenllm_v2/natong19/Qwen2-7B-Instruct-abliterated/3c2c2c14-d065-4d6c-8c98-44ba8f2ca461.json
 delete mode 100644 data/hfopenllm_v2/nazimali/Mistral-Nemo-Kurdish-Instruct/8909f916-401b-4457-ab8f-2691696049c6.json
 delete mode 100644 data/hfopenllm_v2/nazimali/Mistral-Nemo-Kurdish-Instruct/ae191508-7dad-4cac-ad4a-af95d7a15b5d.json
 delete mode 100644 data/hfopenllm_v2/nazimali/Mistral-Nemo-Kurdish/507f5047-fac3-415f-b9fa-aae4311fa837.json
 delete mode 100644 data/hfopenllm_v2/nbeerbower/BigKartoffel-mistral-nemo-20B/0ee8716c-74f0-41b4-94a2-efc715150293.json
 delete mode 100644 data/hfopenllm_v2/nbeerbower/DoppelKartoffel-Mistral-Nemo-23B/fcf491f4-cf57-4c95-9de1-4702ab5d54c7.json
 delete mode 100644 data/hfopenllm_v2/nbeerbower/DoublePotato-Mistral-Nemo-13B/4fd20259-c7c7-4da5-9013-ae2feb2175b1.json
 delete mode 100644 data/hfopenllm_v2/nbeerbower/Dumpling-Qwen2.5-1.5B/a7c8c345-cade-48fd-93c0-0f344044d2b5.json
 delete mode 100644 data/hfopenllm_v2/nbeerbower/Dumpling-Qwen2.5-14B/7a8e3986-7688-4a26-a74c-a9bb47cd3e8d.json
 delete mode 100644
data/hfopenllm_v2/nbeerbower/Dumpling-Qwen2.5-7B-1k-r16/7a2ffb4d-1135-42a1-b28b-3b4e4d014979.json delete mode 100644 data/hfopenllm_v2/nbeerbower/Dumpling-Qwen2.5-7B-1k-r64-2e-5/25468720-93d7-4f10-a534-30c4976657e8.json delete mode 100644 data/hfopenllm_v2/nbeerbower/EVA-abliterated-TIES-Qwen2.5-1.5B/5ba1d617-9d9a-4c3b-b9cc-3224ace129b3.json delete mode 100644 data/hfopenllm_v2/nbeerbower/EVA-abliterated-TIES-Qwen2.5-14B/27b2b46f-1323-4ddd-9f65-d8fcd9cd6508.json delete mode 100644 data/hfopenllm_v2/nbeerbower/Flammades-Mistral-Nemo-12B/65917125-bb7c-4d64-ba5f-b5e4f67ec332.json delete mode 100644 data/hfopenllm_v2/nbeerbower/Gemma2-Gutenberg-Doppel-9B/30bf22d8-b93a-4775-8073-30e14e15e35d.json delete mode 100644 data/hfopenllm_v2/nbeerbower/Gutensuppe-mistral-nemo-12B/ff510365-a13d-4e44-9709-59a56e864991.json delete mode 100644 data/hfopenllm_v2/nbeerbower/Hermes2-Gutenberg2-Mistral-7B/6d1eebc4-228b-43f3-b31c-3d5b1591ae2d.json delete mode 100644 data/hfopenllm_v2/nbeerbower/Kartoffel-Deepfry-12B/f1e8cdbb-14b7-4959-a053-fb1b37629aff.json delete mode 100644 data/hfopenllm_v2/nbeerbower/Llama-3.1-Nemotron-lorablated-70B/4145d1a0-8d6a-4d64-8a45-a89cf343ac46.json delete mode 100644 data/hfopenllm_v2/nbeerbower/Llama3.1-Gutenberg-Doppel-70B/d6966190-e254-4902-8472-cac59bfbdbe0.json delete mode 100644 data/hfopenllm_v2/nbeerbower/Lyra-Gutenberg-mistral-nemo-12B/5fdb5437-f413-451d-9800-42036cda7686.json delete mode 100644 data/hfopenllm_v2/nbeerbower/Lyra4-Gutenberg-12B/347577a4-2768-4472-ba48-9b174ad89724.json delete mode 100644 data/hfopenllm_v2/nbeerbower/Lyra4-Gutenberg2-12B/33af440e-837d-4454-9340-af0d3ee74f77.json delete mode 100644 data/hfopenllm_v2/nbeerbower/Mahou-1.5-mistral-nemo-12B-lorablated/1a1f4709-8d05-4905-8105-0c3606d5ef5b.json delete mode 100644 data/hfopenllm_v2/nbeerbower/Mistral-Gutenberg-Doppel-7B-FFT/28421948-089b-4487-bb71-a06e5ce74402.json delete mode 100644 data/hfopenllm_v2/nbeerbower/Mistral-Nemo-Gutenberg-Doppel-12B-v2/3fa0c783-9226-4fc8-b3a0-6e960684f43d.json delete mode 100644 data/hfopenllm_v2/nbeerbower/Mistral-Nemo-Gutenberg-Doppel-12B/743b7fe2-f998-408c-98b1-af02d9c1ee2a.json delete mode 100644 data/hfopenllm_v2/nbeerbower/Mistral-Nemo-Moderne-12B-FFT-experimental/0039c88b-a881-4ce0-9a0a-a10f1a8cbc70.json delete mode 100644 data/hfopenllm_v2/nbeerbower/Mistral-Nemo-Prism-12B-v2/87c7fbd9-7648-4d0d-ac9e-8ba85860e335.json delete mode 100644 data/hfopenllm_v2/nbeerbower/Mistral-Nemo-Prism-12B-v7/6ca3ab87-c05e-46b5-879d-4fc8bf75417b.json delete mode 100644 data/hfopenllm_v2/nbeerbower/Mistral-Nemo-Prism-12B/525f1b9f-88a2-459d-bb4a-7c01a0107968.json delete mode 100644 data/hfopenllm_v2/nbeerbower/Mistral-Small-Drummer-22B/503f79be-7f05-4464-ac9f-0f284f1e7965.json delete mode 100644 data/hfopenllm_v2/nbeerbower/Mistral-Small-Gutenberg-Doppel-22B/86ec7d95-6f6d-4ca6-97d5-7a910f42a06d.json delete mode 100644 data/hfopenllm_v2/nbeerbower/Nemo-Loony-12B-experimental/d472ba79-6592-4f8a-a99c-ec3f71468d3e.json delete mode 100644 data/hfopenllm_v2/nbeerbower/Nemoties-ChatML-12B/6ddc052c-6bda-4d8e-ad97-20d881c8cfb7.json delete mode 100644 data/hfopenllm_v2/nbeerbower/Qwen2.5-Gutenberg-Doppel-14B/76d1aed8-80fe-4b4f-bd81-ea0d6bf085c4.json delete mode 100644 data/hfopenllm_v2/nbeerbower/SmolNemo-12B-FFT-experimental/d2845d6e-65dd-4448-901d-d554b3e741f3.json delete mode 100644 data/hfopenllm_v2/nbeerbower/Stella-mistral-nemo-12B-v2/f7dd203f-24d8-4875-878a-12ed99e20cd3.json delete mode 100644 data/hfopenllm_v2/nbeerbower/gemma2-gutenberg-27B/287ae246-bee5-4fae-b78f-203491aa8df2.json 
delete mode 100644 data/hfopenllm_v2/nbeerbower/gemma2-gutenberg-9B/9ee493f7-e031-4593-beae-65be17678e00.json delete mode 100644 data/hfopenllm_v2/nbeerbower/llama-3-gutenberg-8B/86b10c6f-41c6-4d0a-ae59-f90e204e466c.json delete mode 100644 data/hfopenllm_v2/nbeerbower/llama3.1-cc-8B/043e3533-7d5c-4d45-bcd8-0dbcc8ca4819.json delete mode 100644 data/hfopenllm_v2/nbeerbower/llama3.1-kartoffeldes-70B/1b3269fb-4b16-42b6-80c0-3d54bc2b4fed.json delete mode 100644 data/hfopenllm_v2/nbeerbower/mistral-nemo-bophades-12B/ee625c29-62c4-49da-9790-e7e67233157d.json delete mode 100644 data/hfopenllm_v2/nbeerbower/mistral-nemo-bophades3-12B/02b16bf2-62bb-401e-9726-2135d8d610be.json delete mode 100644 data/hfopenllm_v2/nbeerbower/mistral-nemo-cc-12B/db10c6f9-2962-46cc-aa4e-4c99c4b494d1.json delete mode 100644 data/hfopenllm_v2/nbeerbower/mistral-nemo-gutades-12B/aa37bda0-2e0a-4361-a5b4-468154d8ac72.json delete mode 100644 data/hfopenllm_v2/nbeerbower/mistral-nemo-gutenberg-12B-v2/d9a6565c-5a0b-4893-b6e0-1fc52ec55bf5.json delete mode 100644 data/hfopenllm_v2/nbeerbower/mistral-nemo-gutenberg-12B-v3/becf9805-83a9-4137-a938-81a61a10e4f0.json delete mode 100644 data/hfopenllm_v2/nbeerbower/mistral-nemo-gutenberg-12B-v4/6e848120-bc31-4628-af05-30707a6dcc41.json delete mode 100644 data/hfopenllm_v2/nbeerbower/mistral-nemo-gutenberg-12B/864af855-71b0-4b11-ae3f-56294a7d0db9.json delete mode 100644 data/hfopenllm_v2/nbeerbower/mistral-nemo-gutenberg2-12B-test/285bd390-1dd9-4db2-af45-68dea557da3c.json delete mode 100644 data/hfopenllm_v2/nbeerbower/mistral-nemo-kartoffel-12B/459e2375-1a15-4129-bee0-dc8852d531e2.json delete mode 100644 data/hfopenllm_v2/nbeerbower/mistral-nemo-narwhal-12B/7b4c7d92-f581-4057-bec9-e3a8c6a5386e.json delete mode 100644 data/hfopenllm_v2/nbeerbower/mistral-nemo-wissenschaft-12B/7ceab841-f9a3-455b-9314-243d8fc3cd11.json delete mode 100644 data/hfopenllm_v2/nbrahme/IndusQ/c1e2fb45-22d8-4eb4-8971-ce89c3048b9e.json delete mode 100644 data/hfopenllm_v2/necva/IE-cont-Llama3.1-8B/68cb2ca1-1648-41a2-92b7-969bccdca4ee.json delete mode 100644 data/hfopenllm_v2/necva/replica-IEPile/5f285d61-5e4b-4c5c-8960-c10313d76ae3.json delete mode 100644 data/hfopenllm_v2/neopolita/jessi-v0.1-bf16-falcon3-7b-instruct/3af19898-8590-4aec-b324-46c7fbf596d3.json delete mode 100644 data/hfopenllm_v2/neopolita/jessi-v0.1-falcon3-10b-instruct/e8472266-6d03-439f-bd6b-e3ac5ef2cf09.json delete mode 100644 data/hfopenllm_v2/neopolita/jessi-v0.1-qwen2.5-7b-instruct/3f578b45-48f9-4022-991c-32a71706aba3.json delete mode 100644 data/hfopenllm_v2/neopolita/jessi-v0.1-virtuoso-small/ef8c22a7-3898-422e-88e2-1a8c14ab5bf2.json delete mode 100644 data/hfopenllm_v2/neopolita/jessi-v0.2-falcon3-10b-instruct/81630ea2-d496-4872-92b7-e476badaf50d.json delete mode 100644 data/hfopenllm_v2/neopolita/jessi-v0.2-falcon3-7b-instruct/9436d04a-9c81-47ad-a7b8-496e14058627.json delete mode 100644 data/hfopenllm_v2/neopolita/jessi-v0.3-falcon3-7b-instruct/f1e6e54e-cb97-4980-8957-2190ee5c4c34.json delete mode 100644 data/hfopenllm_v2/neopolita/jessi-v0.4-falcon3-7b-instruct/30914dd3-c857-4aaf-b6b9-d1c7e4917e89.json delete mode 100644 data/hfopenllm_v2/neopolita/jessi-v0.5-falcon3-7b-instruct/1c389a32-68b3-47c0-a6b8-2c2291293002.json delete mode 100644 data/hfopenllm_v2/neopolita/jessi-v0.6-falcon3-7b-instruct/e759a217-6571-446d-9bf9-d1512793f307.json delete mode 100644 data/hfopenllm_v2/neopolita/loki-v0.1-virtuoso/753f3b21-7365-4117-b2a0-a91f03ec3d39.json delete mode 100644 
data/hfopenllm_v2/netcat420/DeepSeek-R1-Distill-Qwen-MFANN-Slerp-7b/297ef102-67c1-4e9c-b418-fed026bb1f8a.json delete mode 100644 data/hfopenllm_v2/netcat420/DeepSeek-R1-MFANN-TIES-unretrained-7b/9fbf73d7-7d67-4d6c-a5b9-efc627cd1b2b.json delete mode 100644 data/hfopenllm_v2/netcat420/Llama3.1-MFANN-8b/b1446577-f13f-434a-a0b4-916091395d4a.json delete mode 100644 data/hfopenllm_v2/netcat420/MFANN-Llama3.1-Abliterated-SLERP-TIES-V2/fc8946aa-8b04-482c-8c05-d026d2af07be.json delete mode 100644 data/hfopenllm_v2/netcat420/MFANN-Llama3.1-Abliterated-SLERP-TIES-V3/fabe3784-948c-4618-9cf0-c76a3ddd3820.json delete mode 100644 data/hfopenllm_v2/netcat420/MFANN-Llama3.1-Abliterated-SLERP-V4/736dcf09-6a19-4e88-a790-7a7ee74d8717.json delete mode 100644 data/hfopenllm_v2/netcat420/MFANN-Llama3.1-Abliterated-SLERP-V5/75b4c750-1570-4825-a04a-965c06861fd4.json delete mode 100644 data/hfopenllm_v2/netcat420/MFANN-Llama3.1-Abliterated-Slerp-TIES/b7f8b678-2aea-4d41-ba21-2083fc472574.json delete mode 100644 data/hfopenllm_v2/netcat420/MFANN-Llama3.1-Abliterated-Slerp-V3.2/a8010630-58de-448c-af08-70b8ffec431b.json delete mode 100644 data/hfopenllm_v2/netcat420/MFANN-SFT/4a0c2ce5-a4b4-4d35-b65d-bbc6e36a649b.json delete mode 100644 data/hfopenllm_v2/netcat420/MFANN-abliterated-phi2-merge-unretrained/1132251a-59c7-402e-9957-f9288864508f.json delete mode 100644 data/hfopenllm_v2/netcat420/MFANN-llama3.1-Abliterated-SLERP/e2fac049-8f9f-4b71-bcd3-5746b7d90150.json delete mode 100644 data/hfopenllm_v2/netcat420/MFANN-llama3.1-abliterated-SLERP-v3.1/d891a1e1-ad65-498f-9ee8-59523c1bfd19.json delete mode 100644 data/hfopenllm_v2/netcat420/MFANN-llama3.1-abliterated-SLERP-v3/9dd3103f-6c4f-4077-ac27-3a9b0f4a5882.json delete mode 100644 data/hfopenllm_v2/netcat420/MFANN-llama3.1-abliterated-v2/ca031f70-5785-46d1-8a58-b279d8340776.json delete mode 100644 data/hfopenllm_v2/netcat420/MFANN-phigments-slerp-V2/18457711-92b8-4c27-a89a-928fecdf5724.json delete mode 100644 data/hfopenllm_v2/netcat420/MFANN-phigments-slerp-V3.2/3398aeb8-08a8-4be9-a24c-efeabcaa2139.json delete mode 100644 data/hfopenllm_v2/netcat420/MFANN-phigments-slerp-V3.3/707bc006-4318-41bc-b91b-aa43ca7cba6f.json delete mode 100644 data/hfopenllm_v2/netcat420/MFANN3b/7bfda919-13be-4b68-8655-99fe6a4605a2.json delete mode 100644 data/hfopenllm_v2/netcat420/MFANN3bv0.15/f844e739-5f0d-4db4-ba66-bd33b1290571.json delete mode 100644 data/hfopenllm_v2/netcat420/MFANN3bv0.18/0cde6639-6a89-4682-bb3e-a2a24a1bc8ab.json delete mode 100644 data/hfopenllm_v2/netcat420/MFANN3bv0.19/87652005-4404-4c45-bd4f-5f63c44adf63.json delete mode 100644 data/hfopenllm_v2/netcat420/MFANN3bv0.20/a7e0bc2d-784d-4719-ac08-d8fa0c29d178.json delete mode 100644 data/hfopenllm_v2/netcat420/MFANN3bv0.21/e8ba93e6-6f90-4169-8403-381b7f9e26ab.json delete mode 100644 data/hfopenllm_v2/netcat420/MFANN3bv0.22/ea86b542-3d06-4e71-b49d-17cdd362b465.json delete mode 100644 data/hfopenllm_v2/netcat420/MFANN3bv0.23/15615d2c-46a1-47c7-a273-697e97bdf9f2.json delete mode 100644 data/hfopenllm_v2/netcat420/MFANN3bv0.24/a2b8da3f-c99e-4dba-b4a2-23739281eaf2.json delete mode 100644 data/hfopenllm_v2/netcat420/MFANN3bv1.1/76f3fa3a-1629-4cdd-b457-3a108784b427.json delete mode 100644 data/hfopenllm_v2/netcat420/MFANN3bv1.2/c9e979e1-4433-4a38-8fd4-c14895e74f44.json delete mode 100644 data/hfopenllm_v2/netcat420/MFANN3bv1.3/3f2effba-1ab8-476d-b228-ed9491e83adf.json delete mode 100644 data/hfopenllm_v2/netcat420/MFANN3bv1.4/a5f0fb1b-27a7-495f-a010-3307afdb8949.json delete mode 100644 
data/hfopenllm_v2/netcat420/MFANNv0.19/22f2aa1d-fff1-430a-9c20-3b32859d9665.json delete mode 100644 data/hfopenllm_v2/netcat420/MFANNv0.20/daff0e6f-d29f-4861-855f-902a0cd9a469.json delete mode 100644 data/hfopenllm_v2/netcat420/MFANNv0.21/0f5cb926-b691-4d57-87f5-290235fd250a.json delete mode 100644 data/hfopenllm_v2/netcat420/MFANNv0.22.1/d9e813da-2966-4901-99f9-c7627c64fc52.json delete mode 100644 data/hfopenllm_v2/netcat420/MFANNv0.23/4cb98a5b-3eb7-4fa8-adfd-17add38d3332.json delete mode 100644 data/hfopenllm_v2/netcat420/MFANNv0.24/f7494fd4-d248-46a6-a46d-f9d8db560aae.json delete mode 100644 data/hfopenllm_v2/netcat420/MFANNv0.25/4b8533d1-7770-435f-ba76-a5c658aabd8f.json delete mode 100644 data/hfopenllm_v2/netcat420/Qwen2.5-7B-nerd-uncensored-v0.9-MFANN/309c7906-0010-4f17-848f-185062d96a26.json delete mode 100644 data/hfopenllm_v2/netcat420/Qwen2.5-7b-MFANN-slerp/f18ab2ab-098b-4e46-8f8d-433b52cdb81b.json delete mode 100644 data/hfopenllm_v2/netcat420/Qwen2.5-7b-nerd-uncensored-MFANN-slerp/b4a70c71-dfac-4888-937e-d5220b491b0e.json delete mode 100644 data/hfopenllm_v2/netcat420/Qwen2.5-Coder-Scholar-7B-Abliterated-MFANN-Slerp-Unretrained/b879a534-6b24-4873-a0e4-e18453540121.json delete mode 100644 data/hfopenllm_v2/netcat420/Qwen2.5-Coder-Scholar-7B-Abliterated-MFANN/c67ae8f2-596b-4dab-8c4f-768b2f0608b4.json delete mode 100644 data/hfopenllm_v2/netcat420/Qwen2.5-DeepSeek-R1-MFANN-Slerp-7b/7766c638-b4dc-4b2d-8c14-becdb1b709ef.json delete mode 100644 data/hfopenllm_v2/netcat420/Qwen2.5-MFANN-7b/dd211bef-3940-4d78-8f7b-a67da81d605b.json delete mode 100644 data/hfopenllm_v2/netcat420/qwen2.5-MFANN-7b-SLERP-V1.2/87e20b7a-85c8-4845-94b0-ace1e18814cb.json delete mode 100644 data/hfopenllm_v2/netcat420/qwen2.5-MFANN-7b-SLERPv1.1/9ab01db6-3154-4c5b-b6a2-35479538d332.json delete mode 100644 data/hfopenllm_v2/netcat420/qwen2.5-MFANN-7b-v1.1/9d35316a-011d-4e45-ae57-317b53de621f.json delete mode 100644 data/hfopenllm_v2/netease-youdao/Confucius-o1-14B/c9e7fec0-b244-4ca1-a117-a52fdd4671a5.json delete mode 100644 data/hfopenllm_v2/newsbang/Homer-7B-v0.1/0659cb01-0d52-42cb-9e3a-2d8cac01692e.json delete mode 100644 data/hfopenllm_v2/newsbang/Homer-7B-v0.2/98490bb1-70f0-4e7a-8fd6-698ec9fcbd5a.json delete mode 100644 data/hfopenllm_v2/newsbang/Homer-v0.3-Qwen2.5-7B/6e0f7e7e-8927-436e-95a7-5a7c626ca241.json delete mode 100644 data/hfopenllm_v2/newsbang/Homer-v0.4-Qwen2.5-7B/9c5b3f4d-6e0b-482b-b142-dd7b387cae22.json delete mode 100644 data/hfopenllm_v2/newsbang/Homer-v0.5-Qwen2.5-7B/04840708-a4cc-407c-8b2a-876b382920a1.json delete mode 100644 data/hfopenllm_v2/newsbang/Homer-v1.0-Qwen2.5-72B/83b0844c-70fe-4b63-8ed2-4147390518ee.json delete mode 100644 data/hfopenllm_v2/newsbang/Homer-v1.0-Qwen2.5-7B/9cf10c60-bee1-4f4f-9e03-c3c10287bded.json delete mode 100644 data/hfopenllm_v2/nguyentd/FinancialAdvice-Qwen2.5-7B/8e92dd9e-a68c-46ef-9b03-955c06a21437.json delete mode 100644 data/hfopenllm_v2/ngxson/MiniThinky-1B-Llama-3.2/dd1139d8-2b44-4516-b24a-1219826f5482.json delete mode 100644 data/hfopenllm_v2/ngxson/MiniThinky-v2-1B-Llama-3.2/e37e86f7-b67b-4f0a-b1bd-92f30842b303.json delete mode 100644 data/hfopenllm_v2/nhyha/N3N_Delirium-v1_1030_0227/bc3b55d5-35ca-48b5-832e-8544e145b1b1.json delete mode 100644 data/hfopenllm_v2/nhyha/N3N_Llama-3.1-8B-Instruct_1028_0216/5757cd3d-c64e-4743-8200-5e610e24bf95.json delete mode 100644 data/hfopenllm_v2/nhyha/N3N_gemma-2-9b-it_20241029_1532/ae8cd3ad-ce7b-41f4-8e4a-f11002af2e58.json delete mode 100644 
data/hfopenllm_v2/nhyha/N3N_gemma-2-9b-it_20241110_2026/bee54048-ebb2-4051-a18f-aa85b0f2ce27.json delete mode 100644 data/hfopenllm_v2/nhyha/merge_Qwen2.5-7B-Instruct_20241023_0314/2f98c85b-5a2e-467e-9626-b1bdefe7bdd7.json delete mode 100644 data/hfopenllm_v2/nidum/Nidum-Limitless-Gemma-2B/2c530a3b-888e-4a61-b97b-ea875b30ec9c.json delete mode 100644 data/hfopenllm_v2/nisten/franqwenstein-35b/4c9fb322-735e-4644-8121-088d00f78c5f.json delete mode 100644 data/hfopenllm_v2/nisten/franqwenstein-35b/e7e7733f-682b-4e68-8f07-85f3ba7a7ae1.json delete mode 100644 data/hfopenllm_v2/nisten/tqwendo-36b/e9a4e1e2-bd55-4c3d-99eb-8fafd8f6ec44.json delete mode 100644 data/hfopenllm_v2/nlpguy/Lion-Lamarck-v.1.0.8/42ed92b3-63bc-4fa1-bc16-c19bfb73368f.json delete mode 100644 data/hfopenllm_v2/nlpguy/Lion-Lamarck-v.1.0.9/915ae579-786a-4eb2-a1bb-107a12c9c40d.json delete mode 100644 data/hfopenllm_v2/nlpguy/Lion-Lamarck-v.1.1.0/3489ffea-a607-4f3d-a0c2-bd17147f244f.json delete mode 100644 data/hfopenllm_v2/nlpguy/Miisce-one/7b5ba8a8-16c3-4169-b97d-13dd5d4f8395.json delete mode 100644 data/hfopenllm_v2/nlpguy/Mistral-NeMo-Minitron-Upscale-v1/6411c44a-b2b3-4fe3-8ba4-9422a0a0b31e.json delete mode 100644 data/hfopenllm_v2/nlpguy/Mistral-NeMo-Minitron-Upscale-v2/fe344f84-7428-45af-940f-736275bc4d50.json delete mode 100644 data/hfopenllm_v2/nlpguy/Mistral-NeMo-Minitron-Upscale-v3/60956ea2-8b0b-4e4b-801a-d0689f9d46f4.json delete mode 100644 data/hfopenllm_v2/nlpguy/StableProse/1ad54bdc-419a-4dd9-9fbb-d7b7ee7038d1.json delete mode 100644 data/hfopenllm_v2/nlpguy/StarFusion-alpha1/2ab375f0-2477-48a5-a5d9-0b5d0d7d0a84.json delete mode 100644 data/hfopenllm_v2/noname0202/Llama-3.2-4x3B-Instruct/e0525a52-d38c-4b2f-b59b-048b4bf71cb2.json delete mode 100644 data/hfopenllm_v2/noname0202/gemma-2-2b-it-ties/01bc964f-552b-4cda-9ed0-cf720f0c8de4.json delete mode 100644 data/hfopenllm_v2/noname0202/gemma-2-9b-sft-jp-en-zh-v1/c9e95c55-978e-485b-8a77-ab2e668e3254.json delete mode 100644 data/hfopenllm_v2/noname0202/gemma-2-9b-sft-jp-en-zh-v2/c71c606b-ccb7-48e9-a6c8-b72205ec6c06.json delete mode 100644 data/hfopenllm_v2/noname0202/llama-math-1b-r16-0to512tokens-test/ae1801cb-d112-4d1a-895d-c6743779846a.json delete mode 100644 data/hfopenllm_v2/noname0202/llama-math-1b-r32-0to512tokens-test/008e3601-dfc4-4bc1-bf8b-f5cef43ae098.json delete mode 100644 data/hfopenllm_v2/noname0202/llama-math-1b-r32-test/379b315d-96fb-4edb-b2d6-3dc113a10c17.json delete mode 100644 data/hfopenllm_v2/noname0202/llama-math-1b-r8-512tokens-test/8cd36aa1-6f87-4d4d-a1bf-adc87e0a26c6.json delete mode 100644 data/hfopenllm_v2/notbdq/Qwen2.5-14B-Instruct-1M-GRPO-Reasoning/f76ce244-29f7-44f0-9850-7291f8e4cbf1.json delete mode 100644 data/hfopenllm_v2/nothingiisreal/L3.1-8B-Celeste-V1.5/506871f1-0c87-4e8c-a270-eed7b5da2599.json delete mode 100644 data/hfopenllm_v2/nothingiisreal/MN-12B-Starcannon-v2/c20264fd-b1f9-4e0f-9f6e-1d58f1c18cda.json delete mode 100644 data/hfopenllm_v2/nothingiisreal/MN-12B-Starcannon-v3/59f14dca-923a-41f1-b443-cc3551063f45.json delete mode 100644 data/hfopenllm_v2/nvidia/AceInstruct-1.5B/a1ba054f-b0a1-4827-b7ea-3988aa4cf1f1.json delete mode 100644 data/hfopenllm_v2/nvidia/AceInstruct-72B/51d8f53f-ad7e-4dae-9e2a-0895729ff790.json delete mode 100644 data/hfopenllm_v2/nvidia/AceInstruct-7B/421119ea-0da8-4b26-a335-f2e720618c44.json delete mode 100644 data/hfopenllm_v2/nvidia/AceMath-1.5B-Instruct/b0e6bfb2-a8d4-4b1d-859a-aa821f646e57.json delete mode 100644 data/hfopenllm_v2/nvidia/AceMath-72B-Instruct/7c4c2ccf-7d7b-4d24-802e-20c182290d07.json 
delete mode 100644 data/hfopenllm_v2/nvidia/AceMath-72B-RM/95212a55-f382-4869-9e11-cfa201ba865b.json delete mode 100644 data/hfopenllm_v2/nvidia/AceMath-7B-Instruct/a7da2118-063c-489f-bb31-40f1b7beeefe.json delete mode 100644 data/hfopenllm_v2/nvidia/AceMath-7B-RM/9a75ae18-8f9a-40a5-8a7b-0c38df34e9dd.json delete mode 100644 data/hfopenllm_v2/nvidia/Hymba-1.5B-Base/a85d4a1f-fbd9-4d21-9700-9e55e30c1391.json delete mode 100644 data/hfopenllm_v2/nvidia/Hymba-1.5B-Instruct/2fd1c45e-209c-43da-ae85-d60887513a96.json delete mode 100644 data/hfopenllm_v2/nvidia/Llama-3.1-Minitron-4B-Depth-Base/91e0e6aa-b933-4a02-a28d-8d69e698c60a.json delete mode 100644 data/hfopenllm_v2/nvidia/Llama-3.1-Nemotron-70B-Instruct-HF/6f3f3d06-2937-4c55-9b95-a62ae5253571.json delete mode 100644 data/hfopenllm_v2/nvidia/Minitron-4B-Base/9b3ffdd3-ac18-4084-9e83-1bfc61db0ec2.json delete mode 100644 data/hfopenllm_v2/nvidia/Minitron-8B-Base/60077cbd-87af-4a00-a359-9235acb011ed.json delete mode 100644 data/hfopenllm_v2/nvidia/Mistral-NeMo-Minitron-8B-Base/577936a8-b450-4233-b633-064565b3d1a4.json delete mode 100644 data/hfopenllm_v2/nvidia/Mistral-NeMo-Minitron-8B-Instruct/470b9413-2cc8-4bf4-9e7c-0b8e99929568.json delete mode 100644 data/hfopenllm_v2/nvidia/Nemotron-Mini-4B-Instruct/3cbf9c73-0dc8-402e-bc94-c6d52b9f1af7.json delete mode 100644 data/hfopenllm_v2/nvidia/OpenMath2-Llama3.1-8B/3fccb1d0-5ae1-427a-adae-37004ecbacaa.json delete mode 100644 data/hfopenllm_v2/nxmwxm/Beast-Soul-new/6463183f-4043-4b96-b4d1-0bd41b4d6876.json delete mode 100644 data/hfopenllm_v2/occiglot/occiglot-7b-es-en-instruct/0b102423-1a06-4e5b-a287-710695658b63.json delete mode 100644 data/hfopenllm_v2/odyssey-labs/Astral-1-10B/b7e4ffd8-2a5a-4364-844a-a308dd7c899c.json delete mode 100644 data/hfopenllm_v2/olabs-ai/reflection_model/3fa2e3ef-a375-4ca5-9f85-7cb986313d53.json delete mode 100644 data/hfopenllm_v2/ontocord/Llama_3.2_1b-autoredteam_helpfulness-train/abd48d9d-0443-40be-a23a-68922771e14f.json delete mode 100644 data/hfopenllm_v2/ontocord/RedPajama-3B-v1-AutoRedteam-Harmless-only/436ff0a4-9907-4e56-a5f2-c97f1b13f81a.json delete mode 100644 data/hfopenllm_v2/ontocord/RedPajama-3B-v1-AutoRedteam/7a654100-b206-4011-828e-fb386df27d0c.json delete mode 100644 data/hfopenllm_v2/ontocord/RedPajama3b_v1-autoredteam_helpfulness-train/2f0e262c-a099-41f4-89f1-8b251708a960.json delete mode 100644 data/hfopenllm_v2/ontocord/merged_0.2_expert_0.8-stack_2x/7bf3e9ca-7d6f-4d43-b8fe-aceb8d60c7c6.json delete mode 100644 data/hfopenllm_v2/ontocord/merged_0.2_expert_0.8/8703dbdd-12ef-457b-8cda-f570c8f5c890.json delete mode 100644 data/hfopenllm_v2/ontocord/merged_0.5_expert_0.5/d77f3e8f-1eea-478e-babd-ba873d2d427c.json delete mode 100644 data/hfopenllm_v2/ontocord/ontocord_wide_3b-stage1_shuf_sample1_jsonl-pretrained-autoredteam_helpful-0.25_helpful/783a4385-c802-4bb3-9a21-90629d16efc7.json delete mode 100644 data/hfopenllm_v2/ontocord/ontocord_wide_7b-stacked-stage1-instruct/bb4ff51e-ce3a-42f5-871e-3e5e8977bc42.json delete mode 100644 data/hfopenllm_v2/ontocord/ontocord_wide_7b-stacked-stage1/e80d25b5-3f4b-45a7-9472-09f98db03bf0.json delete mode 100644 data/hfopenllm_v2/ontocord/starcoder2-29b-ls/7fed0b1d-0d79-4784-8fd6-42f8611b1751.json delete mode 100644 data/hfopenllm_v2/ontocord/starcoder2_3b-AutoRedteam/be534cd3-8245-4370-ba6c-9687b431ee8d.json delete mode 100644 data/hfopenllm_v2/ontocord/wide_3b-merge_test/e98967b7-3aff-4baa-92eb-eff86bf09797.json delete mode 100644 
data/hfopenllm_v2/ontocord/wide_3b-stage1_shuf_sample1_jsonl-pretrained/8736a22a-f980-4a01-953d-217f27050129.json delete mode 100644 data/hfopenllm_v2/ontocord/wide_3b_sft_stag1.2-lyrical_law_news_software_howto_formattedtext_math_wiki-merge/75a2b5c9-7c73-4bb4-8e99-af4a3a27589d.json delete mode 100644 data/hfopenllm_v2/ontocord/wide_3b_sft_stag1.2-lyrical_news_software_howto_formattedtext-merge/0e0ebdc7-a5bd-4314-9bd7-fc8a11541a4e.json delete mode 100644 data/hfopenllm_v2/ontocord/wide_3b_sft_stage1.1-ss1-no_redteam_skg_poem.no_issue/f8579305-003b-4727-b904-bad4f363a616.json delete mode 100644 data/hfopenllm_v2/ontocord/wide_3b_sft_stage1.1-ss1-with_generics_intr.no_issue/3103f36a-4a88-4a39-8261-0b597f8d6db4.json delete mode 100644 data/hfopenllm_v2/ontocord/wide_3b_sft_stage1.1-ss1-with_generics_intr_math.no_issue/eda9de3b-ae53-4102-b203-eddadbc50464.json delete mode 100644 data/hfopenllm_v2/ontocord/wide_3b_sft_stage1.1-ss1-with_generics_intr_math_stories.no_issue/b7de4fa8-d97d-400f-bc3f-ecb1963a03ed.json delete mode 100644 data/hfopenllm_v2/ontocord/wide_3b_sft_stage1.1-ss1-with_generics_intr_math_stories.no_issue/fa6ecaf9-457e-4135-ad25-4790ebc27737.json delete mode 100644 data/hfopenllm_v2/ontocord/wide_3b_sft_stage1.1-ss1-with_generics_intr_math_stories_no_orig_instr.no_issue/ebaa99c4-ff66-421d-8ba7-dae2c5fa274c.json delete mode 100644 data/hfopenllm_v2/ontocord/wide_3b_sft_stage1.1-ss1-with_generics_intr_stories.no_issue/e388c707-8b35-49a4-94eb-f32e983fe33e.json delete mode 100644 data/hfopenllm_v2/ontocord/wide_3b_sft_stage1.1-ss1-with_generics_math.no_issue/f6273192-31cf-4ee1-af45-c2f62de05330.json delete mode 100644 data/hfopenllm_v2/ontocord/wide_3b_sft_stage1.1-ss1-with_math.no_issue/105650e6-d9cf-4106-9d55-6f3c08f2f1cf.json delete mode 100644 data/hfopenllm_v2/ontocord/wide_3b_sft_stage1.1-ss1-with_r1_generics_intr_math_stories.no_issue/a1d23749-40c0-4ccb-a104-bf0de63bc2bd.json delete mode 100644 data/hfopenllm_v2/ontocord/wide_3b_sft_stage1.2-ss1-expert_fictional_lyrical/4e4b4cf9-48d5-4ff6-92c0-1e9d7b874b6b.json delete mode 100644 data/hfopenllm_v2/ontocord/wide_3b_sft_stage1.2-ss1-expert_formatted_text/3c4713a3-3973-4a04-9c4a-a6782251734e.json delete mode 100644 data/hfopenllm_v2/ontocord/wide_3b_sft_stage1.2-ss1-expert_how-to/de70c700-a007-4e87-a3db-941ee285eb1f.json delete mode 100644 data/hfopenllm_v2/ontocord/wide_3b_sft_stage1.2-ss1-expert_math/a1324a7f-1911-4fa9-8d83-be891f752a61.json delete mode 100644 data/hfopenllm_v2/ontocord/wide_3b_sft_stage1.2-ss1-expert_news/9c4af0df-f538-4755-8cd0-eec6b2b26524.json delete mode 100644 data/hfopenllm_v2/ontocord/wide_3b_sft_stage1.2-ss1-expert_software/fde650a6-a5d1-4edc-bd64-8be806663263.json delete mode 100644 data/hfopenllm_v2/ontocord/wide_6.6b_sft_stag1.2-lyrical_law_news_software_howto_formattedtext_math_wiki-merge-stacked/96dd1a08-b166-4d8e-ac31-5e948adf931b.json delete mode 100644 data/hfopenllm_v2/oobabooga/CodeBooga-34B-v0.1/3b90b9db-a68e-4ee9-bd4d-a18cec357753.json delete mode 100644 data/hfopenllm_v2/oopere/Llama-FinSent-S/444a6ace-77d4-4d93-b80b-ff5c7e2f6888.json delete mode 100644 data/hfopenllm_v2/oopere/Llama-FinSent-S/7e11a778-fccf-4a91-81cf-c06f1a5c77c4.json delete mode 100644 data/hfopenllm_v2/oopere/pruned10-llama-3.2-3B/e5d126d7-e0bf-43dc-95c0-184ea1d586ea.json delete mode 100644 data/hfopenllm_v2/oopere/pruned20-llama-1b/d05b129c-6b9e-4e6b-80fc-af65db620c5d.json delete mode 100644 data/hfopenllm_v2/oopere/pruned20-llama-3.2-3b/d9792fac-29c1-45b2-b649-cdebb6830e2f.json delete mode 100644 
data/hfopenllm_v2/oopere/pruned40-llama-1b/fcc2f06a-e6c8-4c28-bf22-4ee582392912.json delete mode 100644 data/hfopenllm_v2/oopere/pruned40-llama-3.2-1B/c6e13327-90b3-440d-9367-dbcec54dd6cc.json delete mode 100644 data/hfopenllm_v2/oopere/pruned40-llama-3.2-3b/30b02429-350c-4d86-aded-ba8597bec4d5.json delete mode 100644 data/hfopenllm_v2/oopere/pruned60-llama-1b/7d1ee802-106e-4313-ba1d-72d5a0676c88.json delete mode 100644 data/hfopenllm_v2/oopere/pruned60-llama-3.2-3b/1b3af020-f65e-44b8-a9a2-ad60fa686427.json delete mode 100644 data/hfopenllm_v2/open-atlas/Atlas-Flash-1.5B-Preview/6e40871d-bc23-4f1c-a005-f5b8eb096f84.json delete mode 100644 data/hfopenllm_v2/open-atlas/Atlas-Flash-7B-Preview/1ab33ed2-ea3b-4c6f-a2ac-2465ddd844f4.json delete mode 100644 data/hfopenllm_v2/open-neo/Kyro-n1-3B/ec601f5d-bf19-4407-ac41-6b9272d94735.json delete mode 100644 data/hfopenllm_v2/open-neo/Kyro-n1-7B/87e53761-e8b7-4032-ae7a-c3a91704d115.json delete mode 100644 data/hfopenllm_v2/open-thoughts/OpenThinker-7B/59492d86-4b85-4865-84e9-84ab4ace630c.json delete mode 100644 data/hfopenllm_v2/openai-community/gpt2-large/cc082df2-259c-44c1-abe4-ef349056a2a9.json delete mode 100644 data/hfopenllm_v2/openai-community/gpt2-medium/3f069053-b24e-4242-9302-d46b82e511aa.json delete mode 100644 data/hfopenllm_v2/openai-community/gpt2-xl/62cd9bcb-a74c-40b9-be84-a0077235ae3c.json delete mode 100644 data/hfopenllm_v2/openai-community/gpt2/b4cd25f1-87d5-4173-a4d3-928444f6cb37.json delete mode 100644 data/hfopenllm_v2/openai-community/gpt2/ddd4716e-d8ae-46a1-8fb4-c27e2da40e6e.json delete mode 100644 data/hfopenllm_v2/openbmb/MiniCPM-S-1B-sft-llama-format/1e5b62a3-018b-429a-b2b4-325545ee99dc.json delete mode 100644 data/hfopenllm_v2/openchat/openchat-3.5-0106/958d410e-ce43-44c0-8a56-685c0a618408.json delete mode 100644 data/hfopenllm_v2/openchat/openchat-3.5-1210/57c53f20-aa32-49fd-926a-f26c9d0759d4.json delete mode 100644 data/hfopenllm_v2/openchat/openchat-3.6-8b-20240522/76def522-6fe1-458f-bfbf-99b50ece3367.json delete mode 100644 data/hfopenllm_v2/openchat/openchat_3.5/c467bc88-6769-48ac-abd4-867ee38bbe57.json delete mode 100644 data/hfopenllm_v2/openchat/openchat_v3.2/801681eb-66f4-46e0-bb2b-7ba4b46679af.json delete mode 100644 data/hfopenllm_v2/openchat/openchat_v3.2_super/cdd0ea1c-b17a-4816-953c-1d7164c64114.json delete mode 100644 data/hfopenllm_v2/orai-nlp/Llama-eus-8B/b2060893-1f7d-4e7a-a458-3623147ac118.json delete mode 100644 data/hfopenllm_v2/oxyapi/oxy-1-small/cf8aac35-679a-4ebb-bca8-6e0f2d42e71b.json delete mode 100644 data/hfopenllm_v2/ozone-ai/0x-lite/34bfe887-5a3a-4626-997e-c35d3a0ec341.json delete mode 100644 data/hfopenllm_v2/ozone-research/Chirp-01/b81acc47-6fd5-4f89-8c70-f8f14b677e04.json delete mode 100644 data/hfopenllm_v2/paloalma/ECE-TW3-JRGL-V1/30b977a8-7882-49be-8621-9ee3fce270ec.json delete mode 100644 data/hfopenllm_v2/paloalma/ECE-TW3-JRGL-V2/3367fd79-713c-4691-80cd-4abb6b2818ef.json delete mode 100644 data/hfopenllm_v2/paloalma/ECE-TW3-JRGL-V5/add899b8-f3e6-4d87-8846-8254f4dfbd5f.json delete mode 100644 data/hfopenllm_v2/paloalma/Le_Triomphant-ECE-TW3/53829ec0-f233-4b61-a672-6a467823caaa.json delete mode 100644 data/hfopenllm_v2/paloalma/TW3-JRGL-v2/e2b41200-bff2-4835-a0ea-27ff56937570.json delete mode 100644 data/hfopenllm_v2/pankajmathur/Al_Dente_v1_8b/3d33f26d-72be-451e-bcf0-501e0bc2f1db.json delete mode 100644 data/hfopenllm_v2/pankajmathur/model_007_13b_v2/3b4c05fc-2ccf-46db-8d64-045508f6614b.json delete mode 100644 
data/hfopenllm_v2/pankajmathur/orca_mini_3b/af83a91c-3b07-48c6-9726-5bd77347f810.json delete mode 100644 data/hfopenllm_v2/pankajmathur/orca_mini_7b/48759b07-9aea-42bd-8d73-9c4208d2789f.json delete mode 100644 data/hfopenllm_v2/pankajmathur/orca_mini_phi-4/68820679-55f4-494d-91a0-0db1bccb8983.json delete mode 100644 data/hfopenllm_v2/pankajmathur/orca_mini_v2_7b/029774ac-a63d-4acc-a37c-4194e4afdecc.json delete mode 100644 data/hfopenllm_v2/pankajmathur/orca_mini_v3_13b/146df856-e2c8-41eb-b860-ceb78c126e55.json delete mode 100644 data/hfopenllm_v2/pankajmathur/orca_mini_v3_70b/74c6bea7-ad16-4f08-a2b7-9c894b9ce207.json delete mode 100644 data/hfopenllm_v2/pankajmathur/orca_mini_v3_7b/b5e97b2d-d8a2-485a-8b0a-71590e4a376e.json delete mode 100644 data/hfopenllm_v2/pankajmathur/orca_mini_v5_8b/e79d0a8c-caec-4dec-b119-3229ffa69a73.json delete mode 100644 data/hfopenllm_v2/pankajmathur/orca_mini_v5_8b_dpo/2c760893-b52a-40a9-9420-fb193a62a5c3.json delete mode 100644 data/hfopenllm_v2/pankajmathur/orca_mini_v5_8b_orpo/ef9b84e0-68b0-4caa-9980-96ea5e7f440b.json delete mode 100644 data/hfopenllm_v2/pankajmathur/orca_mini_v6_8b/fb48aff8-3f6b-4934-9fb8-d72bf8614d6f.json delete mode 100644 data/hfopenllm_v2/pankajmathur/orca_mini_v6_8b_dpo/9450acd9-16b6-49a2-9b73-cf1161b96df3.json delete mode 100644 data/hfopenllm_v2/pankajmathur/orca_mini_v7_72b/0d50ec2d-5dd4-487e-80cb-9533246a9876.json delete mode 100644 data/hfopenllm_v2/pankajmathur/orca_mini_v7_7b/f6e6827d-fbf8-49cd-bdad-e8c7ea87550a.json delete mode 100644 data/hfopenllm_v2/pankajmathur/orca_mini_v8_1_70b/c5e48fd8-0eea-46a9-8790-1745923561d3.json delete mode 100644 data/hfopenllm_v2/pankajmathur/orca_mini_v9_0_3B-Instruct/870c7739-8886-47df-8e20-09bfae03b9c5.json delete mode 100644 data/hfopenllm_v2/pankajmathur/orca_mini_v9_1_1B-Instruct/d8eb5fd1-f1d4-481d-85af-88a11d7b6f6f.json delete mode 100644 data/hfopenllm_v2/pankajmathur/orca_mini_v9_2_14B/6625b2e0-1f65-4dc5-9913-ceb0e82e6439.json delete mode 100644 data/hfopenllm_v2/pankajmathur/orca_mini_v9_2_70b/24e7df20-e046-48f7-909e-502d0c70216a.json delete mode 100644 data/hfopenllm_v2/pankajmathur/orca_mini_v9_4_70B/7920f562-9e7f-4a64-85f4-584b13af44de.json delete mode 100644 data/hfopenllm_v2/pankajmathur/orca_mini_v9_5_1B-Instruct/c6620817-69fe-40e2-bb0a-1e9c739ab65d.json delete mode 100644 data/hfopenllm_v2/pankajmathur/orca_mini_v9_5_1B-Instruct_preview/520e2d66-4143-493b-8533-64f86c6d676e.json delete mode 100644 data/hfopenllm_v2/pankajmathur/orca_mini_v9_5_3B-Instruct/993bdfd2-3a88-4de3-9ed9-9b7b63c0f4f5.json delete mode 100644 data/hfopenllm_v2/pankajmathur/orca_mini_v9_6_1B-Instruct/4e1be694-cc4d-4943-a8e4-74913cfb2ebe.json delete mode 100644 data/hfopenllm_v2/pankajmathur/orca_mini_v9_6_3B-Instruct/42c174d1-6211-4438-bb9a-24f3cf386a6d.json delete mode 100644 data/hfopenllm_v2/pankajmathur/orca_mini_v9_7_1B-Instruct/625bf39b-a118-4ec6-82d0-5405cf70ba53.json delete mode 100644 data/hfopenllm_v2/pankajmathur/orca_mini_v9_7_3B-Instruct/e09cb198-d259-42ea-a356-6efe61b1e12b.json delete mode 100644 data/hfopenllm_v2/paulml/ECE-ILAB-Q1/5838b130-c2e6-400c-80b7-6822efb5db2c.json delete mode 100644 data/hfopenllm_v2/pints-ai/1.5-Pints-16K-v0.1/52b51638-64cd-4b19-8fc7-c223d50bc549.json delete mode 100644 data/hfopenllm_v2/pints-ai/1.5-Pints-2K-v0.1/28b3178b-c963-4267-9649-3f7fc10fba3c.json delete mode 100644 data/hfopenllm_v2/piotr25691/thea-3b-25r/748298a2-5042-4636-ac7e-051c28916f3a.json delete mode 100644 data/hfopenllm_v2/piotr25691/thea-c-3b-25r/03bcd4e6-1620-424a-9200-c0cf4b73bbd2.json delete mode 
100644 data/hfopenllm_v2/piotr25691/thea-rp-3b-25r/c7fba530-63cc-4ece-a171-4a2919aa8057.json delete mode 100644 data/hfopenllm_v2/postbot/gpt2-medium-emailgen/c25c1046-a8d5-4f4b-9a72-c4591cfb4023.json delete mode 100644 data/hfopenllm_v2/prince-canuma/Ministral-8B-Instruct-2410-HF/c3800a5c-310b-41cb-9b07-cfc1f1b13256.json delete mode 100644 data/hfopenllm_v2/princeton-nlp/Llama-3-8B-ProLong-512k-Base/e8e2b99f-cf83-4776-9117-aa2b5d9c8068.json delete mode 100644 data/hfopenllm_v2/princeton-nlp/Llama-3-8B-ProLong-512k-Instruct/2da19e45-117f-446b-b956-b35a20bb7411.json delete mode 100644 data/hfopenllm_v2/princeton-nlp/Llama-3-8B-ProLong-512k-Instruct/9e982a33-19cb-4381-8560-884bc8946a2b.json delete mode 100644 data/hfopenllm_v2/princeton-nlp/Llama-3-8B-ProLong-64k-Base/9130a862-cfd7-47ce-a92a-f60438739491.json delete mode 100644 data/hfopenllm_v2/princeton-nlp/Llama-3-8B-ProLong-64k-Instruct/858d3717-fcb2-45d9-8eaa-1b00ae0ca918.json delete mode 100644 data/hfopenllm_v2/princeton-nlp/Llama-3-Base-8B-SFT-CPO/5f1f137b-cb2f-4ee6-8bc9-5e0b94939f35.json delete mode 100644 data/hfopenllm_v2/princeton-nlp/Llama-3-Base-8B-SFT-DPO/6feca911-7a6e-43a2-b59d-7cb48070fe8e.json delete mode 100644 data/hfopenllm_v2/princeton-nlp/Llama-3-Base-8B-SFT-IPO/d3ad9813-273e-47de-be16-312cc67ac64f.json delete mode 100644 data/hfopenllm_v2/princeton-nlp/Llama-3-Base-8B-SFT-KTO/317205ee-2cc6-4523-9662-be6508314b08.json delete mode 100644 data/hfopenllm_v2/princeton-nlp/Llama-3-Base-8B-SFT-ORPO/3b5fe65a-50a1-4036-b81a-86117356cab9.json delete mode 100644 data/hfopenllm_v2/princeton-nlp/Llama-3-Base-8B-SFT-RDPO/812ac262-97f4-485e-93de-f8d420b8658e.json delete mode 100644 data/hfopenllm_v2/princeton-nlp/Llama-3-Base-8B-SFT-RRHF/39cd7eb0-781e-47b6-8eaa-c72e702f778f.json delete mode 100644 data/hfopenllm_v2/princeton-nlp/Llama-3-Base-8B-SFT-SLiC-HF/9411a8a4-306e-43da-96d7-c93eb3aac398.json delete mode 100644 data/hfopenllm_v2/princeton-nlp/Llama-3-Base-8B-SFT-SimPO/c93feb32-0526-44ac-b3ed-95f08c37cc9f.json delete mode 100644 data/hfopenllm_v2/princeton-nlp/Llama-3-Base-8B-SFT/1a3b0f7a-afb6-4002-9321-23a86f000c5c.json delete mode 100644 data/hfopenllm_v2/princeton-nlp/Llama-3-Instruct-8B-CPO-v0.2/8d29363d-3096-4c54-a40e-acf4a7318a04.json delete mode 100644 data/hfopenllm_v2/princeton-nlp/Llama-3-Instruct-8B-CPO/8cea452d-63b8-4e82-9511-64c94f8e140d.json delete mode 100644 data/hfopenllm_v2/princeton-nlp/Llama-3-Instruct-8B-DPO-v0.2/5e5b5424-1d48-4a5e-8775-52c75609c338.json delete mode 100644 data/hfopenllm_v2/princeton-nlp/Llama-3-Instruct-8B-DPO/73787033-ed1d-4d2e-b7b2-e886ef6f1036.json delete mode 100644 data/hfopenllm_v2/princeton-nlp/Llama-3-Instruct-8B-KTO-v0.2/54c9403f-2525-45c0-a585-9ff598f95f6b.json delete mode 100644 data/hfopenllm_v2/princeton-nlp/Llama-3-Instruct-8B-KTO/77d0d88d-7ca8-4f3e-8b79-295f53140635.json delete mode 100644 data/hfopenllm_v2/princeton-nlp/Llama-3-Instruct-8B-ORPO-v0.2/727f27e3-2a3f-4572-8db5-87e498c4b6ca.json delete mode 100644 data/hfopenllm_v2/princeton-nlp/Llama-3-Instruct-8B-ORPO/b6e0cc97-27cf-4082-a908-95d5c39014b8.json delete mode 100644 data/hfopenllm_v2/princeton-nlp/Llama-3-Instruct-8B-RDPO-v0.2/3b77ec51-fd47-4bc7-9e96-ed46202fef7c.json delete mode 100644 data/hfopenllm_v2/princeton-nlp/Llama-3-Instruct-8B-RDPO/b24cdd3f-3e44-4ebe-b2b4-209ee0bbfbd3.json delete mode 100644 data/hfopenllm_v2/princeton-nlp/Llama-3-Instruct-8B-RRHF-v0.2/e47a3cab-dfef-47f6-9377-9ee32489bab6.json delete mode 100644 
data/hfopenllm_v2/princeton-nlp/Llama-3-Instruct-8B-RRHF/1e4481fe-458b-4c23-8a6c-55439fb8b4fd.json delete mode 100644 data/hfopenllm_v2/princeton-nlp/Llama-3-Instruct-8B-SLiC-HF-v0.2/6421e9dc-e7ca-4e1c-9f4f-1d1ac409c4d1.json delete mode 100644 data/hfopenllm_v2/princeton-nlp/Llama-3-Instruct-8B-SLiC-HF/55f43b53-6ed9-4c16-bf75-c968999a6f36.json delete mode 100644 data/hfopenllm_v2/princeton-nlp/Llama-3-Instruct-8B-SimPO-v0.2/6ce93e70-04b1-46b8-b3e3-7eb0df35e1c1.json delete mode 100644 data/hfopenllm_v2/princeton-nlp/Llama-3-Instruct-8B-SimPO/95096a89-2baf-4b14-bc6e-1f30e920c086.json delete mode 100644 data/hfopenllm_v2/princeton-nlp/Mistral-7B-Base-SFT-CPO/f1651632-2787-47cf-b471-89d1b89a6b01.json delete mode 100644 data/hfopenllm_v2/princeton-nlp/Mistral-7B-Base-SFT-DPO/e1fb2ac9-8f60-4dc1-9e0d-99fcb91a53a9.json delete mode 100644 data/hfopenllm_v2/princeton-nlp/Mistral-7B-Base-SFT-IPO/d3accbc1-d698-4357-ab08-0b98fb49b4ed.json delete mode 100644 data/hfopenllm_v2/princeton-nlp/Mistral-7B-Base-SFT-KTO/5388a25a-5780-4ae1-999f-172b558a7b52.json delete mode 100644 data/hfopenllm_v2/princeton-nlp/Mistral-7B-Base-SFT-RDPO/9e4143ff-d461-4fdb-8bc7-86f959f69e68.json delete mode 100644 data/hfopenllm_v2/princeton-nlp/Mistral-7B-Base-SFT-RRHF/5d843bd7-b34b-41d4-92ff-c25a709b4930.json delete mode 100644 data/hfopenllm_v2/princeton-nlp/Mistral-7B-Base-SFT-SLiC-HF/87975b2f-298b-4297-8f4d-e5bb1bf5d113.json delete mode 100644 data/hfopenllm_v2/princeton-nlp/Mistral-7B-Base-SFT-SimPO/41bb8174-f3d6-4862-b892-dbc9f6e2e696.json delete mode 100644 data/hfopenllm_v2/princeton-nlp/Mistral-7B-Instruct-CPO/683ad2cd-5e39-4088-b98b-94d89dda7b88.json delete mode 100644 data/hfopenllm_v2/princeton-nlp/Mistral-7B-Instruct-DPO/08ffd7ab-ccca-4258-be6d-cbc151cc43aa.json delete mode 100644 data/hfopenllm_v2/princeton-nlp/Mistral-7B-Instruct-IPO/4b6efad4-c697-4f0a-8d24-75dc49d8ec06.json delete mode 100644 data/hfopenllm_v2/princeton-nlp/Mistral-7B-Instruct-KTO/4986c30a-85b0-4263-9be4-d69c9b067e0c.json delete mode 100644 data/hfopenllm_v2/princeton-nlp/Mistral-7B-Instruct-ORPO/47b5a878-1a4a-425f-ae6f-ac286f681cca.json delete mode 100644 data/hfopenllm_v2/princeton-nlp/Mistral-7B-Instruct-RDPO/992a6862-46b9-415e-858f-2eff8709ca81.json delete mode 100644 data/hfopenllm_v2/princeton-nlp/Mistral-7B-Instruct-RRHF/c6391381-c973-4068-b72c-af08762d9e5c.json delete mode 100644 data/hfopenllm_v2/princeton-nlp/Mistral-7B-Instruct-SLiC-HF/0f6e18e6-1b0f-43f4-a9af-6632f6ce63cc.json delete mode 100644 data/hfopenllm_v2/princeton-nlp/Mistral-7B-Instruct-SimPO/56d9ee92-6774-4c9b-9861-c5f0a9945e7c.json delete mode 100644 data/hfopenllm_v2/princeton-nlp/Sheared-LLaMA-1.3B/d3e753cc-37fc-4d77-8b2d-da90a7843d60.json delete mode 100644 data/hfopenllm_v2/princeton-nlp/Sheared-LLaMA-2.7B/eb08ef6f-6631-47c4-8f52-bf9454ad34b6.json delete mode 100644 data/hfopenllm_v2/princeton-nlp/gemma-2-9b-it-DPO/2207b154-c5d4-4e5a-ade0-271e62d6345f.json delete mode 100644 data/hfopenllm_v2/princeton-nlp/gemma-2-9b-it-SimPO/f4161154-7777-4261-9275-a3002a1305d8.json delete mode 100644 data/hfopenllm_v2/prithivMLmods/Bellatrix-1.5B-xElite/8523812d-1db6-4a9d-b06b-ac904191789d.json delete mode 100644 data/hfopenllm_v2/prithivMLmods/Bellatrix-Tiny-1.5B-R1/6cd9ea81-618d-444e-a892-d4f9819daa67.json delete mode 100644 data/hfopenllm_v2/prithivMLmods/Bellatrix-Tiny-1B-v2/2217326d-377a-4503-8180-206c12c87436.json delete mode 100644 data/hfopenllm_v2/prithivMLmods/Blaze-14B-xElite/3bbb10fc-e3b9-4c6a-ac35-ee5de9ecd330.json delete mode 100644 
data/hfopenllm_v2/prithivMLmods/COCO-7B-Instruct-1M/01124f11-b739-422b-97f7-062074b8d0fb.json delete mode 100644 data/hfopenllm_v2/prithivMLmods/Calcium-Opus-14B-Elite-1M/7cc4c93b-7c43-4bed-84a3-fa1cd9130abb.json delete mode 100644 data/hfopenllm_v2/prithivMLmods/Calcium-Opus-14B-Elite-Stock/bf3aa551-f9c6-4203-b2d4-55cf9e6e2872.json delete mode 100644 data/hfopenllm_v2/prithivMLmods/Calcium-Opus-14B-Elite/2eae8905-5338-4a78-86e7-d354d06efa23.json delete mode 100644 data/hfopenllm_v2/prithivMLmods/Calcium-Opus-14B-Elite/9dcc4121-e046-49c7-969e-7255b0c32d3d.json delete mode 100644 data/hfopenllm_v2/prithivMLmods/Calcium-Opus-14B-Elite2-R1/dd7d4acd-549a-467b-b461-0eba5b019122.json delete mode 100644 data/hfopenllm_v2/prithivMLmods/Calcium-Opus-14B-Elite2/159969cc-32c5-4f6f-b586-8e6d44180b44.json delete mode 100644 data/hfopenllm_v2/prithivMLmods/Calcium-Opus-14B-Elite3/b80e559d-e519-4678-8abc-ee5591b81fac.json delete mode 100644 data/hfopenllm_v2/prithivMLmods/Calcium-Opus-14B-Elite4/90c137c9-939d-4e77-9fcc-9e33551a6121.json delete mode 100644 data/hfopenllm_v2/prithivMLmods/Calcium-Opus-14B-Merge/f25d6fef-d337-4cf7-ba05-ca6ff5eccd52.json delete mode 100644 data/hfopenllm_v2/prithivMLmods/Calcium-Opus-20B-v1/c6f92306-dcdc-4549-bfc2-feb62a3a6ef6.json delete mode 100644 data/hfopenllm_v2/prithivMLmods/Codepy-Deepthink-3B/96c64d23-d23d-486c-83a4-4c0ab4f09d60.json delete mode 100644 data/hfopenllm_v2/prithivMLmods/Coma-II-14B/243abf0b-0f88-4b4f-ab51-6c8aebaf19be.json delete mode 100644 data/hfopenllm_v2/prithivMLmods/Condor-Opus-14B-Exp/438fb728-d6ad-4c28-a43c-ff82d522cd50.json delete mode 100644 data/hfopenllm_v2/prithivMLmods/Cygnus-II-14B/94b45b8d-b754-4fb4-843d-b7ffeafc4f1b.json delete mode 100644 data/hfopenllm_v2/prithivMLmods/Deepthink-Llama-3-8B-Preview/5618fc82-d455-4261-8e34-1190d70fd3f3.json delete mode 100644 data/hfopenllm_v2/prithivMLmods/Deepthink-Reasoning-14B/395f6339-3fca-4f4d-befc-2d231008efdd.json delete mode 100644 data/hfopenllm_v2/prithivMLmods/Deepthink-Reasoning-7B/b22696ac-7074-44f2-b72f-c59ca0a41ce6.json delete mode 100644 data/hfopenllm_v2/prithivMLmods/Dinobot-Opus-14B-Exp/6856f8b6-a719-4f69-be71-4df582015f28.json delete mode 100644 data/hfopenllm_v2/prithivMLmods/Elita-0.1-Distilled-R1-abliterated/f2c0ea2b-76ae-4469-832e-84c0b79fa283.json delete mode 100644 data/hfopenllm_v2/prithivMLmods/Elita-1/5619e3cb-eb3e-4420-a156-6f7b2a5d372d.json delete mode 100644 data/hfopenllm_v2/prithivMLmods/Epimetheus-14B-Axo/9d5e329f-491a-4608-bcac-1ee63046b34a.json delete mode 100644 data/hfopenllm_v2/prithivMLmods/Equuleus-Opus-14B-Exp/80953f08-6530-4bab-a375-cc542081aabb.json delete mode 100644 data/hfopenllm_v2/prithivMLmods/Eridanus-Opus-14B-r999/0b8691a8-f394-4da3-a67b-faa1af9b42c9.json delete mode 100644 data/hfopenllm_v2/prithivMLmods/Evac-Opus-14B-Exp/fb541a2b-d9bd-4aa2-8b83-da62a3b77731.json delete mode 100644 data/hfopenllm_v2/prithivMLmods/FastThink-0.5B-Tiny/c20d1c62-d3e0-4e30-b0d3-4c62a6585d23.json delete mode 100644 data/hfopenllm_v2/prithivMLmods/GWQ-9B-Preview/8a10eeb6-7178-4c78-8940-68fad78e389b.json delete mode 100644 data/hfopenllm_v2/prithivMLmods/GWQ-9B-Preview2/f0bb774c-a842-4261-b817-b169ce65a493.json delete mode 100644 data/hfopenllm_v2/prithivMLmods/GWQ2b/59afe234-3a7f-49bb-873c-df6cf793e5e5.json delete mode 100644 data/hfopenllm_v2/prithivMLmods/Gaea-Opus-14B-Exp/4074081a-66a6-42e4-994f-72541f90888b.json delete mode 100644 data/hfopenllm_v2/prithivMLmods/Galactic-Qwen-14B-Exp1/6a618ec8-c029-49ec-9ea5-da52b5231280.json delete mode 100644 
data/hfopenllm_v2/prithivMLmods/Galactic-Qwen-14B-Exp2/edc8f510-c961-4c1f-9757-e80c4247f275.json delete mode 100644 data/hfopenllm_v2/prithivMLmods/Gauss-Opus-14B-R999/aaa5d1e6-5aca-4471-87ea-7195610a6c1d.json delete mode 100644 data/hfopenllm_v2/prithivMLmods/Jolt-v0.1/89b45e8b-9979-4c7f-8aa6-c6ab7009cab0.json delete mode 100644 data/hfopenllm_v2/prithivMLmods/Lacerta-Opus-14B-Elite8/41000c74-8b29-4369-996f-cf3a2fd09f63.json delete mode 100644 data/hfopenllm_v2/prithivMLmods/Llama-3.1-5B-Instruct/a1765846-74e1-440a-8851-12a571444059.json delete mode 100644 data/hfopenllm_v2/prithivMLmods/Llama-3.1-8B-Open-SFT/9c6b594f-387a-42a3-9e40-3b26363e6071.json delete mode 100644 data/hfopenllm_v2/prithivMLmods/Llama-3.2-3B-Math-Oct/2b910401-457a-45dd-920a-559f4595897b.json delete mode 100644 data/hfopenllm_v2/prithivMLmods/Llama-3.2-6B-AlgoCode/90b7be49-53a0-4d7f-8995-cbc52fe3a70f.json delete mode 100644 data/hfopenllm_v2/prithivMLmods/Llama-8B-Distill-CoT/5e8854ba-7147-4fdd-a568-1ea58e79e7d8.json delete mode 100644 data/hfopenllm_v2/prithivMLmods/Llama-Deepsync-1B/df6e0cfb-d720-428a-a5ad-b1529faa07c0.json delete mode 100644 data/hfopenllm_v2/prithivMLmods/Llama-Deepsync-3B/a88a6e6f-2253-4b67-9527-55ab6153e40f.json delete mode 100644 data/hfopenllm_v2/prithivMLmods/Llama-Express.1-Math/00c66a37-b46b-47e8-a098-ce12433c1135.json delete mode 100644 data/hfopenllm_v2/prithivMLmods/LwQ-10B-Instruct/6ad5483c-13dc-4e79-a719-66af383d195a.json delete mode 100644 data/hfopenllm_v2/prithivMLmods/LwQ-Reasoner-10B/9fa6813a-7acb-4c08-9912-6dc0d356a7e2.json delete mode 100644 data/hfopenllm_v2/prithivMLmods/Magellanic-Opus-14B-Exp/3880e3bf-6ff0-4eef-a519-2649014254e1.json delete mode 100644 data/hfopenllm_v2/prithivMLmods/Magellanic-Qwen-25B-R999/e77efb9d-b1fc-4833-8e7f-8da683019018.json delete mode 100644 data/hfopenllm_v2/prithivMLmods/Megatron-Corpus-14B-Exp.v2/2bcc02df-8d27-412a-8b58-c331df98e4d4.json delete mode 100644 data/hfopenllm_v2/prithivMLmods/Megatron-Corpus-14B-Exp/622531d5-03f8-42cf-974e-94291aa1e515.json delete mode 100644 data/hfopenllm_v2/prithivMLmods/Megatron-Opus-14B-2.0/b772f20f-afbd-496c-9f94-e5fd30d54466.json delete mode 100644 data/hfopenllm_v2/prithivMLmods/Megatron-Opus-14B-2.1/169d5ad3-ae4a-42de-b951-f264d85bf623.json delete mode 100644 data/hfopenllm_v2/prithivMLmods/Megatron-Opus-14B-Exp/e84c3b50-4ea9-4f41-be11-50c6aa3d4656.json delete mode 100644 data/hfopenllm_v2/prithivMLmods/Megatron-Opus-14B-Stock/594780dc-d969-4a6b-b90b-1cc32f40c452.json delete mode 100644 data/hfopenllm_v2/prithivMLmods/Megatron-Opus-7B-Exp/4ff7c238-d69c-4b92-83d0-69cacdfa0fe6.json delete mode 100644 data/hfopenllm_v2/prithivMLmods/Messier-Opus-14B-Elite7/bb576dc9-eede-48d6-b438-732da91a4d29.json delete mode 100644 data/hfopenllm_v2/prithivMLmods/Omni-Reasoner-Merged/0fb2fe17-b55d-4802-ad48-bd4d711e1e0f.json delete mode 100644 data/hfopenllm_v2/prithivMLmods/Omni-Reasoner3-Merged/03d59002-dc98-467f-b2a9-605ef8d9b763.json delete mode 100644 data/hfopenllm_v2/prithivMLmods/Pegasus-Opus-14B-Exp/8a7034fd-7027-4a87-9cac-c95b745935d0.json delete mode 100644 data/hfopenllm_v2/prithivMLmods/Phi-4-Empathetic/717f745f-1eae-4277-8a31-dbed140ef3e8.json delete mode 100644 data/hfopenllm_v2/prithivMLmods/Phi-4-Math-IO/2dc78735-c0c3-4dd7-8e97-52c92785e623.json delete mode 100644 data/hfopenllm_v2/prithivMLmods/Phi-4-QwQ/e9ab98ff-5cf0-4437-9cf3-c77ecb546c84.json delete mode 100644 data/hfopenllm_v2/prithivMLmods/Phi-4-Super-1/6303d73e-4129-472a-a6fd-c64cb3de7204.json delete mode 100644 
data/hfopenllm_v2/prithivMLmods/Phi-4-Super-o1/8a689e8f-19cc-45b7-80be-ce861a549af7.json
 delete mode 100644 data/hfopenllm_v2/prithivMLmods/Phi-4-Super/84881315-55a4-4f05-a115-cf82f850090d.json
 delete mode 100644 data/hfopenllm_v2/prithivMLmods/Phi-4-o1/970dc71c-42be-4d50-86ac-f7301ec969ca.json
 delete mode 100644 data/hfopenllm_v2/prithivMLmods/Phi4-Super/c02e1fcf-a837-4b8a-a42d-63837c56128d.json
 delete mode 100644 data/hfopenllm_v2/prithivMLmods/Porpoise-Opus-14B-Exp/37280340-5b9a-47d9-aa37-9299d9025518.json
 delete mode 100644 data/hfopenllm_v2/prithivMLmods/Primal-Opus-14B-Optimus-v1/46e7ad9b-b774-46b9-933c-913d1b307f7a.json
 delete mode 100644 data/hfopenllm_v2/prithivMLmods/Primal-Opus-14B-Optimus-v2/c154d3f5-39dc-43c0-85ea-2e43b08494b4.json
 delete mode 100644 data/hfopenllm_v2/prithivMLmods/QwQ-LCoT-14B-Conversational/abd830e4-2b7f-4895-8262-75926edbafd9.json
 delete mode 100644 data/hfopenllm_v2/prithivMLmods/QwQ-LCoT-3B-Instruct/2c945021-72e3-4e7a-9c6f-81efb27b2206.json
 delete mode 100644 data/hfopenllm_v2/prithivMLmods/QwQ-LCoT-7B-Instruct/5f0ea694-7f73-45fa-b54f-49fc06d1a6d9.json
 delete mode 100644 data/hfopenllm_v2/prithivMLmods/QwQ-LCoT1-Merged/6c73f6ae-8ffd-4948-8071-33eab07437a6.json
 delete mode 100644 data/hfopenllm_v2/prithivMLmods/QwQ-LCoT2-7B-Instruct/fbf71df3-b9c3-4f9c-b538-e4ccf097e81c.json
 delete mode 100644 data/hfopenllm_v2/prithivMLmods/QwQ-MathOct-7B/e3dcfd94-ca04-4cd3-ada5-e701a8b776da.json
 delete mode 100644 data/hfopenllm_v2/prithivMLmods/QwQ-R1-Distill-1.5B-CoT/9278bcf2-bfab-437f-bd64-7496b24fb8cf.json
 delete mode 100644 data/hfopenllm_v2/prithivMLmods/QwQ-R1-Distill-7B-CoT/633aa068-5613-41d8-a194-aebc9ce1586f.json
 delete mode 100644 data/hfopenllm_v2/prithivMLmods/Qwen-7B-Distill-Reasoner/d3c1a922-a453-4c7b-b33b-52934e7bf72b.json
 delete mode 100644 data/hfopenllm_v2/prithivMLmods/Qwen2.5-1.5B-DeepSeek-R1-Instruct/3a27b2a6-5eea-450b-91c7-1dc006229985.json
 delete mode 100644 data/hfopenllm_v2/prithivMLmods/Qwen2.5-14B-DeepSeek-R1-1M/395e37ae-005d-47c0-9cf5-919460e34350.json
 delete mode 100644 data/hfopenllm_v2/prithivMLmods/Qwen2.5-7B-DeepSeek-R1-1M/b03b7c7a-f263-4712-bcf4-2e32ca4bd237.json
 delete mode 100644 data/hfopenllm_v2/prithivMLmods/SmolLM2-CoT-360M/452ab810-6921-4922-9446-f2a5c081dc61.json
 delete mode 100644 data/hfopenllm_v2/prithivMLmods/Sombrero-Opus-14B-Elite5/1abba5a0-f1a3-4f39-a81c-f4cd641d33ac.json
 delete mode 100644 data/hfopenllm_v2/prithivMLmods/Sombrero-Opus-14B-Elite6/b2eefd3a-795c-4dc0-a10e-924bece05ea5.json
 delete mode 100644 data/hfopenllm_v2/prithivMLmods/Sombrero-Opus-14B-Sm1/008cc919-f156-4a2e-af4b-eed015ca91f6.json
 delete mode 100644 data/hfopenllm_v2/prithivMLmods/Sombrero-Opus-14B-Sm2/9d56082f-5e46-4d7a-8f06-cb44fc983b3f.json
 delete mode 100644 data/hfopenllm_v2/prithivMLmods/Sombrero-Opus-14B-Sm4/7ea26e73-a501-40bf-8f01-81ab8e850a91.json
 delete mode 100644 data/hfopenllm_v2/prithivMLmods/Sombrero-Opus-14B-Sm5/e3343130-cf4f-4e5c-b2d3-5dda13d575b9.json
 delete mode 100644 data/hfopenllm_v2/prithivMLmods/Sqweeks-7B-Instruct/ba1965f8-b59f-4d71-920c-e3b401ca0534.json
 delete mode 100644 data/hfopenllm_v2/prithivMLmods/Tadpole-Opus-14B-Exp/6dc87410-a39e-41b1-8759-68c1556c8419.json
 delete mode 100644 data/hfopenllm_v2/prithivMLmods/Taurus-Opus-7B/c4ebe788-fb60-453b-914b-56bf87dd6374.json
 delete mode 100644 data/hfopenllm_v2/prithivMLmods/Triangulum-10B/45a44cc8-a550-4d2f-b0f4-37b4aac6a2b5.json
 delete mode 100644 data/hfopenllm_v2/prithivMLmods/Triangulum-5B/10593c13-3b30-4605-8063-c6a6526fc9d9.json
 delete mode 100644 data/hfopenllm_v2/prithivMLmods/Triangulum-v2-10B/12b8f4d7-2ae8-492c-8756-f7cb21a58c76.json
 delete mode 100644 data/hfopenllm_v2/prithivMLmods/Tucana-Opus-14B-r999/96d9b675-c299-4138-a381-fb4de36287e5.json
 delete mode 100644 data/hfopenllm_v2/prithivMLmods/Tulu-MathLingo-8B/17fffa9b-8ed4-44c7-87ea-7ee2c1f28e6a.json
 delete mode 100644 data/hfopenllm_v2/prithivMLmods/Viper-Coder-7B-Elite14/8999a5f3-f421-4663-835e-7626cebd2282.json
 delete mode 100644 data/hfopenllm_v2/prithivMLmods/Viper-Coder-Hybrid-v1.2/951e1a4f-ed6c-49ca-b648-6086989e333f.json
 delete mode 100644 data/hfopenllm_v2/prithivMLmods/Viper-Coder-Hybrid-v1.3/2acc0666-e0ff-4760-a74a-227a02775344.json
 delete mode 100644 data/hfopenllm_v2/prithivMLmods/Viper-Coder-HybridMini-v1.3/3196c71d-0e0a-4d29-8bca-c31ba3d99dfd.json
 delete mode 100644 data/hfopenllm_v2/prithivMLmods/Viper-Coder-v0.1/e858aa6c-c424-447e-b512-7dcf794f9f0f.json
 delete mode 100644 data/hfopenllm_v2/prithivMLmods/Viper-Coder-v1.1/8773eac5-205e-4264-981b-58f1a25f872a.json
 delete mode 100644 data/hfopenllm_v2/prithivMLmods/Viper-Coder-v1.6-r999/c26ae286-a9b8-499f-b886-4b75be0cf2da.json
 delete mode 100644 data/hfopenllm_v2/prithivMLmods/Viper-Coder-v1.7-Vsm6/d3a61998-2d41-4349-bd15-ce29143cc910.json
 delete mode 100644 data/hfopenllm_v2/prithivMLmods/Viper-OneCoder-UIGEN/56b66428-2751-4c62-b98c-6c60e58c45ca.json
 delete mode 100644 data/hfopenllm_v2/prithivMLmods/Volans-Opus-14B-Exp/9b2ec4af-4a7c-4cf7-8b7d-79b6cc219880.json
 delete mode 100644 data/hfopenllm_v2/prithivMLmods/WebMind-7B-v0.1/5855a920-428f-4699-becc-73d4422f706f.json
 delete mode 100644 data/hfopenllm_v2/pszemraj/Llama-3-6.3b-v0.1/f1004f08-7f46-4eb1-8f60-66893fca7180.json
 delete mode 100644 data/hfopenllm_v2/pszemraj/Mistral-v0.3-6B/97db158a-3035-45d3-8d92-a08c9e605493.json
 delete mode 100644 data/hfopenllm_v2/qingy2019/LLaMa_3.2_3B_Catalysts/0d81b928-2a24-4eb4-93d5-224e3c505532.json
 delete mode 100644 data/hfopenllm_v2/qingy2019/OpenMath2-Llama3.1-8B/bf4cc7ee-cad4-42af-8638-6b371577ec68.json
 delete mode 100644 data/hfopenllm_v2/qingy2019/Oracle-14B/5b574dda-0d85-47aa-9ebc-7f8581d402ca.json
 delete mode 100644 data/hfopenllm_v2/qingy2019/Oracle-14B/6043830f-8a9d-4a03-9de5-4805724a9ae8.json
 delete mode 100644 data/hfopenllm_v2/qingy2019/Qwen2.5-Math-14B-Instruct-Alpha/9d5fdb25-0d6a-4d5c-bcfb-0903504e620a.json
 delete mode 100644 data/hfopenllm_v2/qingy2019/Qwen2.5-Math-14B-Instruct-Pro/217819b0-2c4b-4c26-823b-1ea14f893e01.json
 delete mode 100644 data/hfopenllm_v2/qingy2019/Qwen2.5-Math-14B-Instruct/0f844855-fb46-4b53-82c2-f36e5721c385.json
 delete mode 100644 data/hfopenllm_v2/qingy2019/Qwen2.5-Math-14B-Instruct/59aaa7ed-27d4-4765-b115-90570ad86c77.json
 delete mode 100644 data/hfopenllm_v2/qingy2019/Qwen2.5-Ultimate-14B-Instruct/4478c5ff-3b51-4be2-abce-3fb6a951b6e7.json
 delete mode 100644 data/hfopenllm_v2/qingy2024/Benchmaxx-Llama-3.2-1B-Instruct/9202146d-5889-49fd-9025-e03153ba9093.json
 delete mode 100644 data/hfopenllm_v2/qingy2024/Eyas-17B-Instruct/94257d3e-2b1e-47a1-bbd1-7fc696a574b3.json
 delete mode 100644 data/hfopenllm_v2/qingy2024/Falcon3-2x10B-MoE-Instruct/2245cf71-fb8d-44ca-b58d-06608312ee8c.json
 delete mode 100644 data/hfopenllm_v2/qingy2024/Fusion-14B-Instruct/9a823fde-7802-4876-b72c-d8f73cd17236.json
 delete mode 100644 data/hfopenllm_v2/qingy2024/Fusion2-14B-Instruct/ede99239-ef8f-49eb-a48b-0ec2553c99e5.json
 delete mode 100644 data/hfopenllm_v2/qingy2024/Fusion4-14B-Instruct/4a307570-994f-491c-87a7-ad90b7965b8b.json
 delete mode 100644 data/hfopenllm_v2/qingy2024/OwO-14B-Instruct/eb448d78-6417-4533-8458-99c1869a74ae.json
 delete mode 100644 data/hfopenllm_v2/qingy2024/QwEnlarge-16B-Instruct/e1b8e4ad-4327-46b9-b957-fbd02e57c87e.json
 delete mode 100644 data/hfopenllm_v2/qingy2024/QwQ-14B-Math-v0.2/aab6b224-b948-4fb1-84b7-0dbe5c46d527.json
 delete mode 100644 data/hfopenllm_v2/qingy2024/Qwarkstar-4B-Instruct-Preview/2e5cd1de-6109-4f76-b722-abbd4b207f4d.json
 delete mode 100644 data/hfopenllm_v2/qingy2024/Qwarkstar-4B/767d1296-4971-478f-8d78-1d63d162ae5b.json
 delete mode 100644 data/hfopenllm_v2/qingy2024/Qwen2.5-4B/eab74e3b-de61-4fa9-87c2-56e69b70349a.json
 delete mode 100644 data/hfopenllm_v2/qingy2024/Qwen2.5-Coder-Draft-1.5B-Instruct/3219d563-3bfb-4618-8cb3-e9b198d5b11f.json
 delete mode 100644 data/hfopenllm_v2/qingy2024/Qwen2.5-Math-14B-Instruct-Alpha/233fd27c-561e-4c9e-a917-cbc5b08c055a.json
 delete mode 100644 data/hfopenllm_v2/qingy2024/Qwen2.5-Math-14B-Instruct-Preview/a875e8f7-a4e6-4c17-abbc-b8d4b73b7501.json
 delete mode 100644 data/hfopenllm_v2/qingy2024/Qwen2.6-14B-Instruct/4b68ba49-6681-4add-9197-2cd711701e15.json
 delete mode 100644 data/hfopenllm_v2/qingy2024/Qwen2.6-Math-14B-Instruct/5679ca73-3d5f-4bc7-bea2-5e9e713db0cc.json
 delete mode 100644 data/hfopenllm_v2/qq8933/OpenLongCoT-Base-Gemma2-2B/a6c631f6-890c-4199-abee-18b012bc48df.json
 delete mode 100644 data/hfopenllm_v2/raphgg/test-2.5-72B/1edc3610-40fc-467d-8410-26d4b6adebce.json
 delete mode 100644 data/hfopenllm_v2/rasyosef/Mistral-NeMo-Minitron-8B-Chat/42c773ba-8fb4-4b3c-8ac7-0688519bb55c.json
 delete mode 100644 data/hfopenllm_v2/rasyosef/Phi-1_5-Instruct-v0.1/1a371df5-447f-4fd8-8fe8-dbf9a1dc079a.json
 delete mode 100644 data/hfopenllm_v2/rasyosef/phi-2-instruct-apo/821a21a0-6fd7-438a-933d-5e31b2dd2adc.json
 delete mode 100644 data/hfopenllm_v2/rasyosef/phi-2-instruct-v0.1/781a4cc6-a69d-4106-81aa-06e114f7c897.json
 delete mode 100644 data/hfopenllm_v2/realtreetune/rho-1b-sft-MATH/e49c98b4-46f4-406e-9eeb-7072bf72b9a3.json
 delete mode 100644 data/hfopenllm_v2/recoilme/Gemma-2-Ataraxy-Gemmasutra-9B-slerp/3b7524a8-d17b-4788-93f2-11076df464a7.json
 delete mode 100644 data/hfopenllm_v2/recoilme/Gemma-2-Ataraxy-Gemmasutra-9B-slerp/6188a57f-4bc3-42a5-ad18-c59774e40407.json
 delete mode 100644 data/hfopenllm_v2/recoilme/recoilme-gemma-2-9B-v0.1/28689805-7c4c-438e-8431-f4a6aceb5e94.json
 delete mode 100644 data/hfopenllm_v2/recoilme/recoilme-gemma-2-9B-v0.2/7c156689-9668-4ded-bacc-c88a03ad1526.json
 delete mode 100644 data/hfopenllm_v2/recoilme/recoilme-gemma-2-9B-v0.2/7e43f187-1959-4dfe-802f-094ba88f3b0d.json
 delete mode 100644 data/hfopenllm_v2/recoilme/recoilme-gemma-2-9B-v0.3/a6170173-ef17-4cfa-a76e-8e51cb8cb970.json
 delete mode 100644 data/hfopenllm_v2/recoilme/recoilme-gemma-2-9B-v0.3/e998d52b-dd94-4ef2-9cfc-5034ded0105a.json
 delete mode 100644 data/hfopenllm_v2/recoilme/recoilme-gemma-2-9B-v0.4/a3ac60bd-8fb3-47d9-b378-1f0c4d74fed2.json
 delete mode 100644 data/hfopenllm_v2/recoilme/recoilme-gemma-2-9B-v0.5/0f69217c-74ed-4398-8d1b-53d1a43be890.json
 delete mode 100644 data/hfopenllm_v2/redrix/AngelSlayer-12B-Unslop-Mell-RPMax-DARKNESS/b973adcc-769c-4009-87c5-5f5af02a5d3a.json
 delete mode 100644 data/hfopenllm_v2/redrix/patricide-12B-Unslop-Mell/4b30f11e-a2b9-40e9-b080-9d7484a5d048.json
 delete mode 100644 data/hfopenllm_v2/refuelai/Llama-3-Refueled/befdae09-4caa-4996-a3ac-fe36310aaf01.json
 delete mode 100644 data/hfopenllm_v2/rhplus0831/maid-yuzu-v7/8cd7fc1b-2873-4154-9de7-c0b8e5f4f5e9.json
 delete mode 100644 data/hfopenllm_v2/rhymes-ai/Aria/7f6e5858-f5d4-41cf-9bb7-c3c82a55c392.json
 delete mode 100644 data/hfopenllm_v2/rhysjones/phi-2-orange-v2/7b8bf84f-4101-41a1-b6ff-9cadbb5f84a3.json
 delete mode 100644 data/hfopenllm_v2/riaz/FineLlama-3.1-8B/1f3a733d-a6d3-453b-9763-61992cd514b0.json
 delete mode 100644 data/hfopenllm_v2/riaz/FineLlama-3.1-8B/d0eed3c1-2226-48c5-a314-e429f66c5053.json
 delete mode 100644 data/hfopenllm_v2/rmdhirr/Gluon-8B/957f02f1-45c7-4cce-b5aa-86bb5e485ad3.json
 delete mode 100644 data/hfopenllm_v2/rombodawg/Rombos-Coder-V2.5-Qwen-14b/55a01e8e-318a-4609-a862-bab4d62b3e7a.json
 delete mode 100644 data/hfopenllm_v2/rombodawg/Rombos-Coder-V2.5-Qwen-7b/cbdcd76f-be8f-42fe-89ed-d1d09d9d785f.json
 delete mode 100644 data/hfopenllm_v2/rombodawg/Rombos-LLM-V2.5-Qwen-0.5b/c7b6515e-6f96-468b-8bc0-15212c31e790.json
 delete mode 100644 data/hfopenllm_v2/rombodawg/Rombos-LLM-V2.5-Qwen-1.5b/f27f3a1d-c19a-42b2-8b49-64ecfe5d3405.json
 delete mode 100644 data/hfopenllm_v2/rombodawg/Rombos-LLM-V2.5-Qwen-14b/994aa481-627a-4bed-8719-9e874373cbc6.json
 delete mode 100644 data/hfopenllm_v2/rombodawg/Rombos-LLM-V2.5-Qwen-32b/9f5cd849-20b1-4e8d-9deb-f286dcfd9d6e.json
 delete mode 100644 data/hfopenllm_v2/rombodawg/Rombos-LLM-V2.5-Qwen-3b/c4dd34f2-7acc-4a94-a9aa-3c6aeeae8a8c.json
 delete mode 100644 data/hfopenllm_v2/rombodawg/Rombos-LLM-V2.5-Qwen-72b/e908b473-a015-4156-8e88-d67153479cb9.json
 delete mode 100644 data/hfopenllm_v2/rombodawg/Rombos-LLM-V2.5-Qwen-7b/173af77d-7a51-4d5a-8fd3-366aaa5d78a0.json
 delete mode 100644 data/hfopenllm_v2/rombodawg/Rombos-LLM-V2.5.1-Qwen-3b/0bb65f09-323d-485f-886e-5a35c8bcd342.json
 delete mode 100644 data/hfopenllm_v2/rombodawg/Rombos-LLM-V2.5.1-Qwen-3b/86b4c877-ef2d-4563-93a2-92d7e77eab5c.json
 delete mode 100644 data/hfopenllm_v2/rombodawg/Rombos-LLM-V2.6-Nemotron-70b/be2ee3f6-37ee-4895-821a-3d3c7eb04eac.json
 delete mode 100644 data/hfopenllm_v2/rombodawg/Rombos-LLM-V2.6-Qwen-14b/e574af17-dd3b-4c09-8689-ea598d44e562.json
 delete mode 100644 data/hfopenllm_v2/rombodawg/rombos_Replete-Coder-Instruct-8b-Merged/83958185-047a-4356-918d-2f45f273c08a.json
 delete mode 100644 data/hfopenllm_v2/rombodawg/rombos_Replete-Coder-Llama3-8B/d04c6e84-0b63-4de1-9278-aa37c9d2c8e3.json
 delete mode 100644 data/hfopenllm_v2/rootxhacker/Apollo-70B/a218e260-7f56-4676-af58-254bd84d0327.json
 delete mode 100644 data/hfopenllm_v2/rootxhacker/Apollo_v2-32B/f21fb2c8-4abe-40de-ab2c-9d23e95ee281.json
 delete mode 100644 data/hfopenllm_v2/rootxhacker/apollo-7B/da5774b2-8a6f-4f2d-8267-beb25490b06a.json
 delete mode 100644 data/hfopenllm_v2/rsh345/mistral-ft-optimized-1218-NeuralHermes-2.5-Mistral-7B/274705bd-8eb6-4863-8998-f5d67c4ac827.json
 delete mode 100644 data/hfopenllm_v2/rubenroy/Geneva-12B-GCv2-5m/5b95cc2f-3378-45e7-9f56-6bb7e1ce4826.json
 delete mode 100644 data/hfopenllm_v2/rubenroy/Gilgamesh-72B/6918d1a3-e547-46b7-9062-274057c1f513.json
 delete mode 100644 data/hfopenllm_v2/rubenroy/Zurich-14B-GCv2-5m/599deb3c-49f9-4c0b-af8d-78f9e166820b.json
 delete mode 100644 data/hfopenllm_v2/ruizhe1217/sft-s1-qwen-0.5b/b4ea3f14-3787-434b-8f26-20ff640c0146.json
 delete mode 100644 data/hfopenllm_v2/rwitz/go-bruins-v2/6952c527-ca23-494a-910c-1c027e4a5a29.json
 delete mode 100644 data/hfopenllm_v2/sabersaleh/Llama2-7B-CPO/3f12e79c-dd1b-428d-9094-10a047205e3e.json
 delete mode 100644 data/hfopenllm_v2/sabersaleh/Llama2-7B-DPO/d508da29-0288-4a0a-b727-fc5355515c5e.json
 delete mode 100644 data/hfopenllm_v2/sabersaleh/Llama2-7B-IPO/48cf5a8a-70c6-4c55-8959-32d773d6dbcf.json
 delete mode 100644 data/hfopenllm_v2/sabersaleh/Llama2-7B-KTO/4bb7d331-f305-4c08-a073-87ba7b2cbde2.json
 delete mode 100644 data/hfopenllm_v2/sabersaleh/Llama2-7B-SPO/94639454-c525-4e6f-af27-d92d45a9ac40.json
 delete mode 100644 data/hfopenllm_v2/sabersaleh/Llama2-7B-SimPO/9fa81bb7-7abc-4764-9465-d61217590da5.json
 delete mode 100644 data/hfopenllm_v2/sabersaleh/Llama3/9a683492-4057-4de4-a30a-aa66becffb13.json
 delete mode 100644 data/hfopenllm_v2/sabersalehk/Llama3-001-300/b917df45-62f2-4c3b-943a-ad6c98ef8bc1.json
 delete mode 100644 data/hfopenllm_v2/sabersalehk/Llama3-SimPO/ba658bc7-b89d-4fb7-a794-f48bd3715a49.json
 delete mode 100644 data/hfopenllm_v2/sabersalehk/Llama3_001_200/93f79cdc-ffd7-4299-9876-c0c7bed55ae5.json
 delete mode 100644 data/hfopenllm_v2/sabersalehk/Llama3_01_300/5a91b0bf-b043-41d2-960d-5f0e78abc400.json
 delete mode 100644 data/hfopenllm_v2/saishf/Fimbulvetr-Kuro-Lotus-10.7B/263f56e5-b578-475a-9bc4-b5ffc142f9e2.json
 delete mode 100644 data/hfopenllm_v2/saishf/Neural-SOVLish-Devil-8B-L3/9219ff66-73ba-45d8-99a0-23d23b3555ba.json
 delete mode 100644 data/hfopenllm_v2/saishshinde15/TethysAI_Base_Reasoning/b2328396-e9b2-464d-94e4-f03db19144ea.json
 delete mode 100644 data/hfopenllm_v2/saishshinde15/TethysAI_Vortex/3f895edf-8f54-48ff-a731-666144af0fda.json
 delete mode 100644 data/hfopenllm_v2/saishshinde15/TethysAI_Vortex_Reasoning/b48b8e16-a555-466b-8b1c-246137223311.json
 delete mode 100644 data/hfopenllm_v2/sakaltcommunity/novablast-preview/5fdcb98f-4c50-4cdb-bd99-dd32efc6d6f3.json
 delete mode 100644 data/hfopenllm_v2/sakaltcommunity/sakaltum-7b/d49c5e72-0dd0-4663-a310-9cd9bf1f5150.json
 delete mode 100644 data/hfopenllm_v2/sakhan10/quantized_open_llama_3b_v2/0176903f-e6ca-4f21-b98a-00bc443bf244.json
 delete mode 100644 data/hfopenllm_v2/saltlux/luxia-21.4b-alignment-v1.0/11f32afc-95c1-4531-ae45-5a0974d36b3a.json
 delete mode 100644 data/hfopenllm_v2/saltlux/luxia-21.4b-alignment-v1.2/70657dd7-63cf-40f4-92a0-1097fc1ce9ae.json
 delete mode 100644 data/hfopenllm_v2/sam-paech/Darkest-muse-v1/53cf325b-6f32-4791-8f95-8b982ea03b23.json
 delete mode 100644 data/hfopenllm_v2/sam-paech/Delirium-v1/8c50491b-6ed4-4f38-9d3f-d5168600cf4f.json
 delete mode 100644 data/hfopenllm_v2/sam-paech/Quill-v1/7adf79de-a51d-4b87-989a-c218ec6d99e3.json
 delete mode 100644 data/hfopenllm_v2/sarvamai/OpenHathi-7B-Hi-v0.1-Base/92358e5a-5e73-4747-9e92-e5ac003b97f7.json
 delete mode 100644 data/hfopenllm_v2/schnapss/testmerge-7b/f1636512-b98f-4fe4-adf3-abd556dd0ab9.json
 delete mode 100644 data/hfopenllm_v2/sci-m-wang/Mistral-7B-Instruct-sa-v0.1/9333afdd-4866-412b-b11b-dfb118a06db9.json
 delete mode 100644 data/hfopenllm_v2/sci-m-wang/Phi-3-mini-4k-instruct-sa-v0.1/840c0e19-6d75-47a2-b64b-f9c51cb1dcff.json
 delete mode 100644 data/hfopenllm_v2/sci-m-wang/deepseek-llm-7b-chat-sa-v0.1/071b49f2-8e23-47b1-9858-78d676d9905e.json
 delete mode 100644 data/hfopenllm_v2/securin/Securin-LLM-V2.5-Qwen-1.5B/d3821f53-87aa-470a-a403-c8e3cd100ae1.json
 delete mode 100644 data/hfopenllm_v2/senseable/WestLake-7B-v2/389dbaba-c9cd-4e6b-afb3-f2ee3951faa0.json
 delete mode 100644 data/hfopenllm_v2/sequelbox/Llama3.1-70B-PlumChat/5f78f39a-42cc-4cf6-bb27-e2160765bf24.json
 delete mode 100644 data/hfopenllm_v2/sequelbox/Llama3.1-8B-MOTH/b6e3d811-bf9d-474e-b82d-358a44e0dfc9.json
 delete mode 100644 data/hfopenllm_v2/sequelbox/Llama3.1-8B-PlumChat/bef1cbad-4f75-4dde-b467-6145f72a87f4.json
 delete mode 100644 data/hfopenllm_v2/sequelbox/Llama3.1-8B-PlumCode/654bebe0-b461-427e-a4cf-06386e9272d8.json
 delete mode 100644 data/hfopenllm_v2/sequelbox/Llama3.1-8B-PlumMath/37ef4e34-58f8-463a-950f-48b3a6833d54.json
 delete mode 100644 data/hfopenllm_v2/sequelbox/gemma-2-9B-MOTH/20687086-8aab-40f1-aec6-03917f4f9bf5.json
 delete mode 100644 data/hfopenllm_v2/sethuiyer/Llama-3.1-8B-Experimental-1206-Instruct/53a0a998-a0a6-4800-80bf-bfd83123f2f6.json
 delete mode 100644 data/hfopenllm_v2/sethuiyer/Llama-3.1-8B-Experimental-1208-Instruct/4ee8df1c-e8ff-4a56-816c-0c2258a226e7.json
 delete mode 100644 data/hfopenllm_v2/sethuiyer/LlamaZero-3.1-8B-Experimental-1208/42c8d84d-c8b8-42c6-8f49-4e971df173d7.json
 delete mode 100644 data/hfopenllm_v2/sethuiyer/Llamaverse-3.1-8B-Instruct/77b57dea-22e1-48a6-b8ae-9e474f08ad5f.json
 delete mode 100644 data/hfopenllm_v2/sethuiyer/Llamazing-3.1-8B-Instruct/a9ed5d04-57d2-4566-91df-b798be939fdb.json
 delete mode 100644 data/hfopenllm_v2/sethuiyer/Qwen2.5-7B-Anvita/bad4ec47-fe84-4518-b072-6955938f0c86.json
 delete mode 100644 data/hfopenllm_v2/shadowml/BeagSake-7B/497e585c-059a-4e18-9a8f-bdaa066f59ea.json
 delete mode 100644 data/hfopenllm_v2/shadowml/Mixolar-4x7b/e24b2a4e-83e4-4a79-bc41-03a54af00595.json
 delete mode 100644 data/hfopenllm_v2/shastraai/Shastra-LLAMA2-Math-Commonsense-SFT/15e39361-585b-4870-b91a-64dce4fb37ec.json
 delete mode 100644 data/hfopenllm_v2/shivam9980/NEPALI-LLM/96efd11b-e9f2-4bf1-90f9-561714137edf.json
 delete mode 100644 data/hfopenllm_v2/shivam9980/mistral-7b-news-cnn-merged/98e9936d-d376-4c72-80a6-0a28cf722ac4.json
 delete mode 100644 data/hfopenllm_v2/shivank21/mistral_dpo_self/7ada9c83-7851-4da2-b9d1-d744b174b777.json
 delete mode 100644 data/hfopenllm_v2/shuttleai/shuttle-3/a6ed72b7-14f1-464c-a7f5-590791982696.json
 delete mode 100644 data/hfopenllm_v2/shyamieee/Padma-v7.0/79e3f38d-ae2b-44a7-be0d-024adad6bcd6.json
 delete mode 100644 data/hfopenllm_v2/silma-ai/SILMA-9B-Instruct-v1.0/ef13bdea-cf73-4ead-b6d7-73a155fa9a79.json
 delete mode 100644 data/hfopenllm_v2/silma-ai/SILMA-Kashif-2B-Instruct-v1.0/2663884f-941c-4e16-8029-b38e3a543733.json
 delete mode 100644 data/hfopenllm_v2/siqi00/Mistral-7B-DFT/ca7af645-4796-4b31-ae7d-2cbebe5a369b.json
 delete mode 100644 data/hfopenllm_v2/siqi00/Mistral-7B-DFT2/f95e098c-d320-4db1-887d-8c3252bbaf77.json
 delete mode 100644 data/hfopenllm_v2/skumar9/Llama-medx_v2/2bbf6dc9-8dd5-4dee-908e-d4a8fc03bc84.json
 delete mode 100644 data/hfopenllm_v2/skymizer/Llama2-7b-sft-chat-custom-template-dpo/5f4edfdb-a62c-4410-83a3-1ceb15d2e7b0.json
 delete mode 100644 data/hfopenllm_v2/someon98/qwen-CoMa-0.5b/aadfae06-73b6-4306-b056-0a733b9bd8f4.json
 delete mode 100644 data/hfopenllm_v2/sometimesanotion/ChocoTrio-14B-v1/cfecbfbc-46c3-4dd3-8bd9-afe4cd386973.json
 delete mode 100644 data/hfopenllm_v2/sometimesanotion/IF-reasoning-experiment-40/97640dd1-d415-4b56-818c-cdcede3c52fd.json
 delete mode 100644 data/hfopenllm_v2/sometimesanotion/IF-reasoning-experiment-80/b750c460-ef70-4abf-b77d-118a82039598.json
 delete mode 100644 data/hfopenllm_v2/sometimesanotion/KytheraMix-7B-v0.2/f4c20519-9e33-4698-a17a-07e5fe7d2707.json
 delete mode 100644 data/hfopenllm_v2/sometimesanotion/Lamarck-14B-v0.1-experimental/0f204733-55b4-4c06-bd12-dbc2e2593abd.json
 delete mode 100644 data/hfopenllm_v2/sometimesanotion/Lamarck-14B-v0.3/0bb226ed-fe88-4678-9b50-f77883ceb708.json
 delete mode 100644 data/hfopenllm_v2/sometimesanotion/Lamarck-14B-v0.4-Qwenvergence/fb297e45-9e14-4853-8384-75c187b28a9b.json
 delete mode 100644 data/hfopenllm_v2/sometimesanotion/Lamarck-14B-v0.6-002-model_stock/4f6eba27-2ab4-4b33-9568-814d15fbd6b9.json
 delete mode 100644 data/hfopenllm_v2/sometimesanotion/Lamarck-14B-v0.6-model_stock/c3bc3d69-a987-4dd0-b6a5-e0ecc50034fb.json
 delete mode 100644 data/hfopenllm_v2/sometimesanotion/Lamarck-14B-v0.6/5d02ba78-cf8b-44ee-a1b3-e51ecf437d89.json
 delete mode 100644 data/hfopenllm_v2/sometimesanotion/Lamarck-14B-v0.7-Fusion/4a43fa67-2438-4c2a-b17b-9d2f221e5a86.json
 delete mode 100644 data/hfopenllm_v2/sometimesanotion/Lamarck-14B-v0.7-rc1/2c044767-1169-48c6-9e37-e9d1e35f4cfe.json
 delete mode 100644 data/hfopenllm_v2/sometimesanotion/Lamarck-14B-v0.7-rc4/bad67b35-d9ef-417a-955b-9c33e87cb927.json
 delete mode 100644 data/hfopenllm_v2/sometimesanotion/LamarckInfusion-14B-v1/60eaa315-f489-405d-a67d-7f1312e90cab.json
 delete mode 100644 data/hfopenllm_v2/sometimesanotion/LamarckInfusion-14B-v2-hi/50de312a-293d-41a4-8bee-4feb0c148b90.json
 delete mode 100644 data/hfopenllm_v2/sometimesanotion/LamarckInfusion-14B-v2-lo/56f24cac-394c-4439-8f2e-8270e7519bda.json
 delete mode 100644 data/hfopenllm_v2/sometimesanotion/LamarckInfusion-14B-v2/8efa1423-0a39-4674-a94d-3d92448010d6.json
 delete mode 100644 data/hfopenllm_v2/sometimesanotion/LamarckInfusion-14B-v3/350b3491-cba8-46b4-a07f-3d1277270530.json
 delete mode 100644 data/hfopenllm_v2/sometimesanotion/Qwen-14B-ProseStock-v4/0741ead7-24f3-49b0-9967-f726df84f78a.json
 delete mode 100644 data/hfopenllm_v2/sometimesanotion/Qwen-2.5-14B-Virmarckeoso/1ea4d10e-e099-4967-8c43-e84acaeb40be.json
 delete mode 100644 data/hfopenllm_v2/sometimesanotion/Qwen2.5-14B-Vimarckoso-v2/6c78d9f7-a61e-4f65-ac57-61597f735541.json
 delete mode 100644 data/hfopenllm_v2/sometimesanotion/Qwen2.5-14B-Vimarckoso-v3-IF-Variant/e9bcfb1f-c688-4e7a-918a-e697adaf7aa5.json
 delete mode 100644 data/hfopenllm_v2/sometimesanotion/Qwen2.5-14B-Vimarckoso-v3-Prose01/153cfe7f-c27a-40b8-b8d2-54351f26f583.json
 delete mode 100644 data/hfopenllm_v2/sometimesanotion/Qwen2.5-14B-Vimarckoso-v3-model_stock/b58372cd-5d55-4f42-a5da-2970e55b44b0.json
 delete mode 100644 data/hfopenllm_v2/sometimesanotion/Qwen2.5-14B-Vimarckoso-v3/34a028ac-2002-480c-a1af-5b945ffe872e.json
 delete mode 100644 data/hfopenllm_v2/sometimesanotion/Qwen2.5-14B-Vimarckoso/065ffc51-154c-4a93-a342-0dd476fda473.json
 delete mode 100644 data/hfopenllm_v2/sometimesanotion/Qwen2.5-7B-Gordion-v0.1-Prose/ebc74f4f-157d-4ee4-8b99-9fb5b685afd5.json
 delete mode 100644 data/hfopenllm_v2/sometimesanotion/Qwen2.5-7B-Gordion-v0.1-Reason/91004d26-7b8b-4c0a-bd8c-8880654dc93a.json
 delete mode 100644 data/hfopenllm_v2/sometimesanotion/Qwen2.5-7B-Gordion-v0.1/5eb1aa92-a031-40d4-ad64-552075dae68a.json
 delete mode 100644 data/hfopenllm_v2/sometimesanotion/Qwentessential-14B-v1/3ebc147d-58f2-4605-a011-a71c591fac0e.json
 delete mode 100644 data/hfopenllm_v2/sometimesanotion/Qwentinuum-14B-v013/01795776-e909-46d3-8b6c-0989334e3d0e.json
 delete mode 100644 data/hfopenllm_v2/sometimesanotion/Qwentinuum-14B-v1/00dffa94-31f9-4b5c-b032-03dd20fc2e8d.json
 delete mode 100644 data/hfopenllm_v2/sometimesanotion/Qwentinuum-14B-v2/736249d0-cea9-46c6-9677-ecae4b410af4.json
 delete mode 100644 data/hfopenllm_v2/sometimesanotion/Qwentinuum-14B-v3/ef602cfe-3453-4189-b583-292cf05421d1.json
 delete mode 100644 data/hfopenllm_v2/sometimesanotion/Qwentinuum-14B-v5/559af2c1-deca-4c35-b83a-004c22ac958a.json
 delete mode 100644 data/hfopenllm_v2/sometimesanotion/Qwentinuum-14B-v6-Prose/8d66d895-626a-477f-91b6-2195f35aacb3.json
 delete mode 100644 data/hfopenllm_v2/sometimesanotion/Qwentinuum-14B-v6/004df803-70da-4e59-b3ad-f210c790f29e.json
 delete mode 100644 data/hfopenllm_v2/sometimesanotion/Qwentinuum-14B-v7/bb2972ca-e673-4be5-bc7e-2689adeac3a9.json
 delete mode 100644 data/hfopenllm_v2/sometimesanotion/Qwentinuum-14B-v8/eacf2411-a0ea-41fd-8363-e565fce0f26f.json
 delete mode 100644 data/hfopenllm_v2/sometimesanotion/Qwentinuum-14B-v9/4eefe3cd-ff42-4d4c-89c6-c3e48d8c85e9.json
 delete mode 100644 data/hfopenllm_v2/sometimesanotion/Qwenvergence-14B-qv256/f19dab38-48ed-438e-8a62-86e4d111f6c8.json
 delete mode 100644 data/hfopenllm_v2/sometimesanotion/Qwenvergence-14B-v0.6-004-model_stock/ff4b6d28-62e2-4671-8df9-690ce7f13f0b.json
 delete mode 100644 data/hfopenllm_v2/sometimesanotion/Qwenvergence-14B-v10/9c05a7e4-f495-41d0-a7f0-1959e7434ba2.json
 delete mode 100644 data/hfopenllm_v2/sometimesanotion/Qwenvergence-14B-v11/404e3d61-26d3-4f95-9847-064f0c7c6970.json
 delete mode 100644 data/hfopenllm_v2/sometimesanotion/Qwenvergence-14B-v12-Prose-DS/0b4574f2-1b71-427f-9923-17db449be191.json
 delete mode 100644 data/hfopenllm_v2/sometimesanotion/Qwenvergence-14B-v12-Prose/775b88cd-98e8-4d93-acca-e294f68f2da2.json
 delete mode 100644 data/hfopenllm_v2/sometimesanotion/Qwenvergence-14B-v13-Prose-DS/89464568-47cb-4659-af37-8b061d3f0c8c.json
 delete mode 100644 data/hfopenllm_v2/sometimesanotion/Qwenvergence-14B-v15-Prose-MS/9fad9d73-acbf-4ffc-886c-551c1fe1ed45.json
 delete mode 100644 data/hfopenllm_v2/sometimesanotion/Qwenvergence-14B-v2-Prose/c1882335-0df5-4df2-bfa1-c16126c328fb.json
 delete mode 100644 data/hfopenllm_v2/sometimesanotion/Qwenvergence-14B-v3-Prose/291471ed-3b7c-4bd4-91bb-c27cd74ec460.json
 delete mode 100644 data/hfopenllm_v2/sometimesanotion/Qwenvergence-14B-v3-Reason/53565fe4-0368-477b-9916-ac9a4b8a9c7b.json
 delete mode 100644 data/hfopenllm_v2/sometimesanotion/Qwenvergence-14B-v3-Reason/f6cb5e9d-c4c9-44a2-9adf-7fa5639d84d9.json
 delete mode 100644 data/hfopenllm_v2/sometimesanotion/Qwenvergence-14B-v3/e51fee25-7648-49d9-a8da-b8dbc68a722b.json
 delete mode 100644 data/hfopenllm_v2/sometimesanotion/Qwenvergence-14B-v6-Prose-model_stock/6acdc96b-cfde-439f-b6b3-a66257b3fcde.json
 delete mode 100644 data/hfopenllm_v2/sometimesanotion/Qwenvergence-14B-v6-Prose/850da8de-ca13-4f15-bb9f-68b910355cfd.json
 delete mode 100644 data/hfopenllm_v2/sometimesanotion/Qwenvergence-14B-v8/542fbb7a-d4eb-4cbf-b63a-4305cb108361.json
 delete mode 100644 data/hfopenllm_v2/sometimesanotion/Qwenvergence-14B-v9/1dbb8206-6a86-4e2c-8ee0-d80fed014a69.json
 delete mode 100644 data/hfopenllm_v2/sometimesanotion/lamarck-14b-prose-model_stock/6341de3c-8d4c-4af8-8f0d-c81e948bacd6.json
 delete mode 100644 data/hfopenllm_v2/sometimesanotion/lamarck-14b-reason-model_stock/e6cb6a87-6db8-4aee-bede-ce8a60dc8f4a.json
 delete mode 100644 data/hfopenllm_v2/sonthenguyen/ft-unsloth-zephyr-sft-bnb-4bit-20241014-161415/5113439d-1394-46f2-a38e-34b54e94a9e6.json
 delete mode 100644 data/hfopenllm_v2/sonthenguyen/ft-unsloth-zephyr-sft-bnb-4bit-20241014-164205/a03d88aa-7ccd-4f8a-9a1e-c9469d3ae559.json
 delete mode 100644 data/hfopenllm_v2/sonthenguyen/ft-unsloth-zephyr-sft-bnb-4bit-20241014-170522/1cfb40a7-7373-417c-aa1c-f6ab63ecb3b8.json
 delete mode 100644 data/hfopenllm_v2/sonthenguyen/zephyr-sft-bnb-4bit-DPO-mtbc-213steps/446ac93f-d47c-4207-bf32-0cd94e88a931.json
 delete mode 100644 data/hfopenllm_v2/sonthenguyen/zephyr-sft-bnb-4bit-DPO-mtbo-180steps/7e4ba4f8-2768-4e7b-a11d-75ad22a47c45.json
 delete mode 100644 data/hfopenllm_v2/sonthenguyen/zephyr-sft-bnb-4bit-DPO-mtbr-180steps/ca77f821-4722-45b1-b731-7d774232acb4.json
 delete mode 100644 data/hfopenllm_v2/sophosympatheia/Midnight-Miqu-70B-v1.5/f32d2a11-edd3-4662-aed7-88c6820b2c2e.json
 delete mode 100644 data/hfopenllm_v2/speakleash/Bielik-11B-v2.0-Instruct/71c56883-dd14-4f16-b839-5ce607a4aadb.json
 delete mode 100644 data/hfopenllm_v2/speakleash/Bielik-11B-v2.1-Instruct/639004c2-81a5-410d-bd61-e3e263f55335.json
 delete mode 100644 data/hfopenllm_v2/speakleash/Bielik-11B-v2.2-Instruct/5f232a99-07c9-4df7-9d3b-837966ea6de5.json
 delete mode 100644 data/hfopenllm_v2/speakleash/Bielik-11B-v2.3-Instruct/482e34ee-8974-46c6-b3f4-4cc9872ef562.json
 delete mode 100644 data/hfopenllm_v2/speakleash/Bielik-11B-v2/13743252-3ba3-406d-8e95-5a4cd3ac3772.json
 delete mode 100644 data/hfopenllm_v2/spmurrayzzz/Mistral-Syndicate-7B/ff25cb66-ed6f-421a-a038-1feb24666645.json
 delete mode 100644 data/hfopenllm_v2/spow12/ChatWaifu_12B_v2.0/843f0d9a-04e8-4cea-bb18-94651a814d1f.json
 delete mode 100644 data/hfopenllm_v2/spow12/ChatWaifu_22B_v2.0_preview/fa3ccf4a-9b26-4a76-a974-3a776adec7c2.json
 delete mode 100644 data/hfopenllm_v2/spow12/ChatWaifu_v1.4/ef4ac8ab-4ff5-4fce-94b6-443b1ef7964f.json
 delete mode 100644 data/hfopenllm_v2/spow12/ChatWaifu_v2.0_22B/468bbea7-6dee-4a1a-84b3-e44b0f3ab95a.json
 delete mode 100644 data/hfopenllm_v2/spow12/ChatWaifu_v2.0_22B/bd8fdfa5-bda1-402b-9010-94bf78b0127b.json
 delete mode 100644 data/hfopenllm_v2/ssmits/Qwen2.5-95B-Instruct/a0b34b40-3e68-463f-a7fa-3c58c15aa16d.json
 delete mode 100644 data/hfopenllm_v2/stabilityai/StableBeluga2/dbf4fbac-cd99-426d-b725-600e60af00d2.json
 delete mode 100644 data/hfopenllm_v2/stabilityai/stablelm-2-12b-chat/f793c471-1638-476a-a050-455a32368e29.json
 delete mode 100644 data/hfopenllm_v2/stabilityai/stablelm-2-12b/1d9c1beb-f84b-4eb7-9c1e-ce5a70afabfb.json
 delete mode 100644 data/hfopenllm_v2/stabilityai/stablelm-2-1_6b-chat/99396d97-d875-4cd9-a8a1-a9aec5c43bfc.json
 delete mode 100644 data/hfopenllm_v2/stabilityai/stablelm-2-1_6b/82a44b46-156f-4232-92e4-6a08d7a4f197.json
 delete mode 100644 data/hfopenllm_v2/stabilityai/stablelm-2-zephyr-1_6b/3b40defd-5a2e-4d6e-838f-dbbbf12236fb.json
 delete mode 100644 data/hfopenllm_v2/stabilityai/stablelm-3b-4e1t/dde41cd5-e6d1-43a9-9593-1a5751bc5f44.json
 delete mode 100644 data/hfopenllm_v2/stabilityai/stablelm-zephyr-3b/1cffcbeb-ef81-4efe-b883-0a8540a799e7.json
 delete mode 100644 data/hfopenllm_v2/sthenno-com/miscii-14b-0130/033ef96e-3d2d-49a4-bbff-8bc815a1b40e.json
 delete mode 100644 data/hfopenllm_v2/sthenno-com/miscii-14b-0218/bfe654b8-cb79-4845-bf14-85012207ce90.json
 delete mode 100644 data/hfopenllm_v2/sthenno-com/miscii-14b-1028/5c4efc23-9591-447b-aecc-4c82797d7d01.json
 delete mode 100644 data/hfopenllm_v2/sthenno-com/miscii-14b-1225/a5fe3fab-95d9-41ac-a95f-66205e489dae.json
 delete mode 100644 data/hfopenllm_v2/sthenno/tempesthenno-0120/c0bf8ffb-444a-43a3-9514-76aa92c5f5b7.json
 delete mode 100644 data/hfopenllm_v2/sthenno/tempesthenno-fusion-0309/3d556d9f-036b-4368-bb4a-18ad6b444bdf.json
 delete mode 100644 data/hfopenllm_v2/sthenno/tempesthenno-kto-0205-ckpt80/92905e27-1033-4423-b87d-23236f9be964.json
 delete mode 100644 data/hfopenllm_v2/sthenno/tempesthenno-nuslerp-001/17326bb0-42c2-469a-ac19-6a4b75d9e6e2.json
 delete mode 100644 data/hfopenllm_v2/sthenno/tempesthenno-nuslerp-0124/11574f56-6c34-48e4-8fb5-c58d42f07330.json
 delete mode 100644 data/hfopenllm_v2/sthenno/tempesthenno-ppo-ckpt40/8f728c51-15f9-422d-bbdb-4d976961ab9d.json
 delete mode 100644 data/hfopenllm_v2/sthenno/tempesthenno-sft-0309-ckpt10/8d6e4b5e-ad17-4390-bc6b-ab6581a62442.json
 delete mode 100644 data/hfopenllm_v2/sthenno/tempesthenno-sft-0314-stage1-ckpt50/5e33bf05-6c67-4ecc-982d-7590e9953145.json
 delete mode 100644 data/hfopenllm_v2/sthenno/tempestissimo-14b-0309/f55ae879-bd95-409c-a8a3-9a57cd615a31.json
 delete mode 100644 data/hfopenllm_v2/streamerbtw1002/Nexuim-R1-7B-Instruct/b8426ac9-14f1-4e07-9c7e-b50cb2c7a1e3.json
 delete mode 100644 data/hfopenllm_v2/stupidity-ai/Llama-3-8B-Instruct-MultiMoose/51fd90b0-0d5a-4199-ba5b-ff29eeeab06b.json
 delete mode 100644 data/hfopenllm_v2/suayptalha/Clarus-7B-v0.1/c46e4fa1-afae-4b68-a13e-034b5cd2b779.json
 delete mode 100644 data/hfopenllm_v2/suayptalha/Clarus-7B-v0.2/42cc06ed-20fc-4e84-836f-3d7243ec336d.json
 delete mode 100644 data/hfopenllm_v2/suayptalha/Clarus-7B-v0.3/aaa53387-af33-4454-95f0-3af85f4778c0.json
 delete mode 100644 data/hfopenllm_v2/suayptalha/DeepSeek-R1-Distill-Llama-3B/465bca6d-b32a-4d34-9916-fc8b3166faa0.json
 delete mode 100644 data/hfopenllm_v2/suayptalha/Falcon3-Jessi-v0.4-7B-Slerp/bf138f3d-09d9-4dea-aa43-5efc804bc775.json
 delete mode 100644 data/hfopenllm_v2/suayptalha/HomerCreativeAnvita-Mix-Qw7B/cb4e944c-66f6-49f2-b1e0-d90454e34315.json
 delete mode 100644 data/hfopenllm_v2/suayptalha/Komodo-Llama-3.2-3B-v2-fp16/b2b6bc49-bda1-4a3e-a071-ec0a0bdc1313.json
 delete mode 100644 data/hfopenllm_v2/suayptalha/Lamarckvergence-14B/933f3d40-8726-418f-be2f-1f9686e9ab02.json
 delete mode 100644 data/hfopenllm_v2/suayptalha/Lix-14B-v0.1/af1bf15c-7c5f-46fa-ba3a-821b521e86f4.json
 delete mode 100644 data/hfopenllm_v2/suayptalha/Luminis-phi-4/43df4336-1eb8-4df7-8309-1199aafc07b1.json
 delete mode 100644 data/hfopenllm_v2/suayptalha/Maestro-10B/44ae222d-407c-4c8b-9b67-75440631f848.json
 delete mode 100644 data/hfopenllm_v2/suayptalha/Rombos-2.5-T.E-8.1/a87db0fe-3727-4ff1-875f-9edd3109f3a2.json
 delete mode 100644 data/hfopenllm_v2/sumink/Qmerft/0c73e33a-7f6f-4925-970b-db289069d5ca.json
 delete mode 100644 data/hfopenllm_v2/sumink/Qwenftmodel/02bc7f5c-dc2f-4d8c-adcb-a89a34ff5549.json
 delete mode 100644 data/hfopenllm_v2/sumink/Qwenmplus/590c031c-2aa6-48e6-9b3f-68b1a585dd39.json
 delete mode 100644 data/hfopenllm_v2/sumink/Qwensci/970c9fb8-c217-444b-a025-f4d9acdd679d.json
 delete mode 100644 data/hfopenllm_v2/sumink/bbhqwen/07a08dd7-822b-49ac-859b-d2fc75b9c88d.json
 delete mode 100644 data/hfopenllm_v2/sumink/bbhqwen2/0c0e9250-b75a-4549-9fb2-2b5c9ac2ef49.json
 delete mode 100644 data/hfopenllm_v2/sumink/bbhqwen3/2ae306b1-5409-4418-b5e4-50feff9dafe7.json
 delete mode 100644 data/hfopenllm_v2/sumink/bbhqwen4/44bf5d75-afb2-48fa-a0fa-96d283b0ae94.json
 delete mode 100644 data/hfopenllm_v2/sumink/bbhqwen5/e3860bb2-b2e4-4fdf-91cb-3343ad6440d7.json
 delete mode 100644 data/hfopenllm_v2/sumink/bbhqwen6/6369fceb-148f-4491-9488-420182a9838f.json
 delete mode 100644 data/hfopenllm_v2/sumink/flflmillama/045c814e-a30f-4b6b-b4f4-382dee4063b7.json
 delete mode 100644 data/hfopenllm_v2/sumink/ftgpt/59d2b375-5696-47d0-9c96-1a826c08bea0.json
 delete mode 100644 data/hfopenllm_v2/sumink/llamaft/ff601b4f-24a1-4376-8c5e-5bda2ea88f65.json
 delete mode 100644 data/hfopenllm_v2/sumink/llamamerge/8c043ba8-f7dd-4cc8-a3b1-7201042b8dc8.json
 delete mode 100644 data/hfopenllm_v2/sumink/llftfl7/ce27dff4-9ca7-47cb-bc18-b5dd167c72a2.json
 delete mode 100644 data/hfopenllm_v2/sumink/llmer/d69ecbfa-5036-48b8-8fed-f9162e2857f5.json
 delete mode 100644 data/hfopenllm_v2/sumink/qwft/b5924329-c182-482a-bee8-22fcb348281d.json
 delete mode 100644 data/hfopenllm_v2/sumink/qwmer/a6a6b6f2-ac28-4c4a-806e-8abe8c7f9190.json
 delete mode 100644 data/hfopenllm_v2/sumink/solarmer3/b904301c-d0c0-41a4-b92e-92b2d7c9c13a.json
 delete mode 100644 data/hfopenllm_v2/sumink/somer/b5de0218-91dc-487a-be90-70f8bcb64803.json
 delete mode 100644 data/hfopenllm_v2/sumink/somer2/3870f65b-3429-45c2-846f-6af30155a78b.json
 delete mode 100644 data/hfopenllm_v2/sumink/somerft/d6c33a51-be09-4cb5-9942-4348668d3e5e.json
 delete mode 100644 data/hfopenllm_v2/sunbaby/BrainCog-8B-0.1-Instruct/1ccd36ee-445a-4861-8835-d602973148fc.json
 delete mode 100644 data/hfopenllm_v2/swap-uniba/LLaMAntino-3-ANITA-8B-Inst-DPO-ITA/4c7ef4ee-3a7e-4f15-8a4a-c5853b1c6a47.json
 delete mode 100644 data/hfopenllm_v2/synergetic/FrankenQwen2.5-14B/6a69202c-1c68-43e4-bd45-bbc2ff2db743.json
 delete mode 100644 data/hfopenllm_v2/talha2001/Beast-Soul-new/a053d6a3-05d4-4d0b-a9b8-7865cf7ac612.json
 delete mode 100644 data/hfopenllm_v2/tangledgroup/tangled-llama-pints-1.5b-v0.1-instruct/f76d3d30-4fce-48a9-a26b-7d714fff1d29.json
 delete mode 100644 data/hfopenllm_v2/tangledgroup/tangled-llama-pints-1.5b-v0.2-instruct/eb38a092-1b56-4348-8188-baa2243f7046.json
 delete mode 100644 data/hfopenllm_v2/tanliboy/lambda-gemma-2-9b-dpo/1c4cfb94-fc66-4fe2-9879-78683abe654f.json
 delete mode 100644 data/hfopenllm_v2/tanliboy/lambda-gemma-2-9b-dpo/2deef730-c37b-46ca-82b7-de38ae724fd4.json
 delete mode 100644 data/hfopenllm_v2/tanliboy/lambda-qwen2.5-14b-dpo-test/13a92beb-a8a4-4853-b2f5-1b09d3e2a64a.json
 delete mode 100644 data/hfopenllm_v2/tanliboy/lambda-qwen2.5-32b-dpo-test/36cf5b59-5369-4baf-80c1-3a47678eb5cb.json
 delete mode 100644 data/hfopenllm_v2/tannedbum/Ellaria-9B/fced3ef1-fb69-47fe-bf68-3efe72db3142.json
 delete mode 100644 data/hfopenllm_v2/tannedbum/L3-Nymeria-Maid-8B/7a83d75a-332e-476a-b0f7-986b2ec9cc5d.json
 delete mode 100644 data/hfopenllm_v2/tannedbum/L3-Nymeria-v2-8B/6f413d72-cd9f-435c-b13e-9cec14edeb5c.json
 delete mode 100644 data/hfopenllm_v2/tannedbum/L3-Rhaenys-8B/a7822bbf-bc23-437d-8e5b-32fb06d3a9ec.json
 delete mode 100644 data/hfopenllm_v2/teknium/CollectiveCognition-v1.1-Mistral-7B/0b19508c-4996-4fb7-b0e0-9fa952854fa3.json
 delete mode 100644 data/hfopenllm_v2/teknium/OpenHermes-13B/447c22c1-8929-420f-b59b-01ab32a22281.json
 delete mode 100644 data/hfopenllm_v2/teknium/OpenHermes-2-Mistral-7B/ab3dbe43-658e-4c8a-a399-b3d070d467ba.json
 delete mode 100644 data/hfopenllm_v2/teknium/OpenHermes-2.5-Mistral-7B/ee5c87a4-aa06-4728-a9bf-2fc35284b987.json
 delete mode 100644 data/hfopenllm_v2/teknium/OpenHermes-7B/6a1a58f6-e399-4ac3-a516-f02a37b6ff68.json
 delete mode 100644 data/hfopenllm_v2/tensopolis/falcon3-10b-tensopolis-v1/9e2bfd77-b73e-436f-ad50-ccfd379cd3f2.json
 delete mode 100644 data/hfopenllm_v2/tensopolis/falcon3-10b-tensopolis-v2/100cf60a-c43c-4b3a-a667-a45cffdd562a.json
 delete mode 100644 data/hfopenllm_v2/tensopolis/lamarckvergence-14b-tensopolis-v1/2088fca7-11d7-47de-808d-d47da0caad0f.json
 delete mode 100644 data/hfopenllm_v2/tensopolis/mistral-small-2501-tensopolis-v1/bf0b3560-9d38-406a-ad30-5fd157f0fe43.json
 delete mode 100644 data/hfopenllm_v2/tensopolis/mistral-small-r1-tensopolis/9ce12fbc-00f7-4cc8-bd9d-67ead83a0801.json
 delete mode 100644 data/hfopenllm_v2/tensopolis/phi-4-tensopolis-v1/14501de3-dac0-44af-8c17-7abcd9bbba8b.json
 delete mode 100644 data/hfopenllm_v2/tensopolis/qwen2.5-14b-tensopolis-v1/c9db8ce4-6f0d-4c13-8484-6fca9e9c3798.json
 delete mode 100644 data/hfopenllm_v2/tensopolis/qwen2.5-3b-or1-tensopolis/8c6c06be-bbc6-4307-ba5b-336dc2bb466f.json
 delete mode 100644 data/hfopenllm_v2/tensopolis/qwen2.5-7b-tensopolis-v1/1326ff61-d0b4-46eb-9bcf-f978166e622b.json
 delete mode 100644 data/hfopenllm_v2/tensopolis/qwen2.5-7b-tensopolis-v2/4c9e829f-7a99-4d61-8730-7457215a4fd6.json
 delete mode 100644 data/hfopenllm_v2/tensopolis/virtuoso-lite-tensopolis-v1/afc24d42-6d25-4036-8f22-fcf944b481b7.json
 delete mode 100644 data/hfopenllm_v2/tensopolis/virtuoso-lite-tensopolis-v2/6f6db681-991e-408b-8d4e-71fff9e1c974.json
 delete mode 100644 data/hfopenllm_v2/tensopolis/virtuoso-small-tensopolis-v1/f3fa76bf-f11c-4dee-9b9f-00f1ec793dac.json
 delete mode 100644 data/hfopenllm_v2/tensopolis/virtuoso-small-tensopolis-v2/77b457d9-4957-4f0d-a8d3-e005ae382239.json
 delete mode 100644 data/hfopenllm_v2/tensopolis/virtuoso-small-v2-tensopolis-v1/11474a7a-73a6-4a3f-8bcb-bef783e12a2b.json
 delete mode 100644 data/hfopenllm_v2/tensoropera/Fox-1-1.6B/23cc1e7f-0994-43a5-8403-5361a2976285.json
 delete mode 100644 data/hfopenllm_v2/tenyx/Llama3-TenyxChat-70B/88c257d3-d5c1-4e1f-bbc8-9fc6bd65e15e.json
 delete mode 100644 data/hfopenllm_v2/theo77186/Qwen2.5-Coder-7B-Instruct-20241106/ec4c2032-8fc0-448a-a7c4-ee9b35b642db.json
 delete mode 100644 data/hfopenllm_v2/theprint/Boptruth-Agatha-7B/3c7ac4de-1456-4afb-b7ac-07beb6cb4d39.json
 delete mode 100644 data/hfopenllm_v2/theprint/CleverBoi-7B-v2/a06ad94f-13ee-466c-b25f-87cd87012678.json
 delete mode 100644 data/hfopenllm_v2/theprint/CleverBoi-7B-v3/9e1ca6d0-d2b2-48c5-acc2-ad299ce02e1f.json
 delete mode 100644 data/hfopenllm_v2/theprint/CleverBoi-Llama-3.1-8B-Instruct/7dcd6e37-3685-4b08-b983-b2a711aeaf73.json
 delete mode 100644 data/hfopenllm_v2/theprint/CleverBoi-Llama-3.1-8B-v2/b1ae6801-0139-41d3-85dc-102ad5cc4c6a.json
 delete mode 100644 data/hfopenllm_v2/theprint/CleverBoi-Nemo-12B-v2/4cc037a2-d952-4566-a575-015f8e3a5925.json
 delete mode 100644 data/hfopenllm_v2/theprint/Code-Llama-Bagel-8B/a1eaadae-8601-4c18-ab0c-4f6d80d3307b.json
 delete mode 100644 data/hfopenllm_v2/theprint/Conversely-Mistral-7B/40e452df-8f0a-4473-a3d1-41f9c288c12f.json
 delete mode 100644 data/hfopenllm_v2/theprint/Llama-3.2-3B-VanRossum/216020ac-276b-436e-815b-d6968eb83770.json
 delete mode 100644 data/hfopenllm_v2/theprint/ReWiz-7B/1bb4aeac-a5e1-4fd7-9e70-64fdcfc600cd.json
 delete mode 100644 data/hfopenllm_v2/theprint/ReWiz-Llama-3.1-8B-v2/25739611-f690-41b4-87de-9f4ea8b3d815.json
 delete mode 100644 data/hfopenllm_v2/theprint/ReWiz-Llama-3.2-3B/b8c27fdd-5b35-41ab-8a35-b5a48f27cceb.json
 delete mode 100644 data/hfopenllm_v2/theprint/ReWiz-Nemo-12B-Instruct/fa237949-c3ac-482a-8a54-5a2019f24016.json
 delete mode 100644 data/hfopenllm_v2/theprint/ReWiz-Qwen-2.5-14B/b60dd828-a3e7-46a8-b4c2-322aeca42faf.json
 delete mode 100644 data/hfopenllm_v2/theprint/ReWiz-Worldbuilder-7B/5de9f914-333f-4181-a93f-79257a3daf54.json
 delete mode 100644 data/hfopenllm_v2/theprint/RuDolph-Hermes-7B/e2d23da4-226a-4a02-8390-e8edaea4b65b.json
 delete mode 100644 data/hfopenllm_v2/theprint/WorldBuilder-12B/c64c7470-dcf9-46f8-b789-cab7e902739d.json
 delete mode 100644 data/hfopenllm_v2/theprint/phi-3-mini-4k-python/f6d727a3-19dc-4173-a88f-2c47449896aa.json
 delete mode 100644 data/hfopenllm_v2/thinkcoder/llama3-8b-instruct-lora-8-sft/490d14c8-2cb0-4328-9f41-6074b28d6fdc.json
 delete mode 100644 data/hfopenllm_v2/thirdeyeai/elevate360m/9351b079-7ef5-42ec-bb83-f0d8ec7de479.json
 delete mode 100644 data/hfopenllm_v2/thomas-yanxin/XinYuan-Qwen2-1_5B/852d5adb-f422-4102-8114-082ab0b3c07d.json
 delete mode 100644 data/hfopenllm_v2/thomas-yanxin/XinYuan-Qwen2-7B-0917/c64e98cd-c022-4834-a3e0-3949416d1fb1.json
 delete mode 100644 data/hfopenllm_v2/thomas-yanxin/XinYuan-Qwen2-7B/f101bd15-ac61-49d4-beac-c89bc889b34b.json
 delete mode 100644 data/hfopenllm_v2/thomas-yanxin/XinYuan-Qwen2.5-7B-0917/11caf1c1-e2a0-4abb-bb0e-d06853a06e4d.json
 delete mode 100644 data/hfopenllm_v2/tianyil1/MistralForCausalLM_Cal_DPO/f0b57a60-8402-4430-93f3-b846a94113f2.json
 delete mode 100644 data/hfopenllm_v2/tiiuae/Falcon3-10B-Base/50aa8077-4493-47a9-9cec-014c56343ecf.json
 delete mode 100644 data/hfopenllm_v2/tiiuae/Falcon3-10B-Instruct/5e70d00b-c822-4ad6-afe8-3756a7038c57.json
 delete mode 100644 data/hfopenllm_v2/tiiuae/Falcon3-1B-Base/8162ba41-e630-470f-a297-72fb9f2110fd.json
 delete mode 100644 data/hfopenllm_v2/tiiuae/Falcon3-1B-Instruct/60dd9d02-476f-459d-a41c-f89f82116dc3.json
 delete mode 100644 data/hfopenllm_v2/tiiuae/Falcon3-3B-Base/73e89f21-5799-4835-a0e0-a6664c0483da.json
 delete mode 100644 data/hfopenllm_v2/tiiuae/Falcon3-3B-Instruct/7f355ad4-9156-486d-8cf4-723117da3bb8.json
 delete mode 100644 data/hfopenllm_v2/tiiuae/Falcon3-7B-Base/4ccc6026-b639-488d-867f-d98ea49cf1b6.json
 delete mode 100644 data/hfopenllm_v2/tiiuae/Falcon3-7B-Instruct/3cf2e68e-4de0-436e-935e-86935e11f72f.json
 delete mode 100644 data/hfopenllm_v2/tiiuae/Falcon3-Mamba-7B-Base/e9e4ae5d-0dd1-463c-9f15-47cb21efb409.json
 delete mode 100644 data/hfopenllm_v2/tiiuae/Falcon3-Mamba-7B-Instruct/c57eb23a-5998-4ab9-9a98-39b1338f5ba6.json
 delete mode 100644 data/hfopenllm_v2/tiiuae/falcon-11B/94fb625d-f58c-4f2e-8268-1dc4472c1cce.json
 delete mode 100644 data/hfopenllm_v2/tiiuae/falcon-40b-instruct/4481ddef-2bef-4284-b56d-21054f5a9a97.json
 delete mode 100644 data/hfopenllm_v2/tiiuae/falcon-40b/80048c4b-e97b-45c7-aa04-70ce69481a97.json
 delete mode 100644 data/hfopenllm_v2/tiiuae/falcon-7b-instruct/d21a2557-2348-4087-b2a6-6e1c0101bccc.json
 delete mode 100644 data/hfopenllm_v2/tiiuae/falcon-7b/76290d4b-5526-400b-8ca4-24d220f7c02d.json
 delete mode 100644 data/hfopenllm_v2/tiiuae/falcon-mamba-7b/3a146535-09b3-4246-8bd8-0e984e0905b1.json
 delete mode 100644 data/hfopenllm_v2/tinycompany/BiBo-v0.3/6683f95c-f97f-4117-b3c5-c1ed9587289e.json
 delete mode 100644 data/hfopenllm_v2/tinycompany/BiBo-v0.7/bbe74b2b-9e13-4c13-92c8-618078667248.json
 delete mode 100644 data/hfopenllm_v2/tinycompany/ShawtyIsBad-bgem3/61876ce3-acc4-4619-b0c2-78ac4dff48ea.json
 delete mode 100644 data/hfopenllm_v2/tinycompany/ShawtyIsBad-e5-large/b304baee-c9de-4982-801d-2b9e7f1a7334.json
 delete mode 100644 data/hfopenllm_v2/tinycompany/ShawtyIsBad-ib/6f27e746-1bdd-4cec-a955-c27f2f9900ef.json
 delete mode 100644 data/hfopenllm_v2/tinycompany/ShawtyIsBad-nomic-moe/30637c5d-1bc0-49dc-8afd-335a9a66f196.json
 delete mode 100644 data/hfopenllm_v2/tinycompany/ShawtyIsBad-nomic1.5/169e29b6-50d8-456d-aa20-3fe2f3b19a1e.json
 delete mode 100644 data/hfopenllm_v2/tinycompany/SigmaBoi-base/427d32f7-190b-4005-b02c-6a8ce089dbbf.json
 delete mode 100644 data/hfopenllm_v2/tinycompany/SigmaBoi-bge-m3/de7551a8-63b1-4de3-899f-9d98cb985005.json
 delete mode 100644 data/hfopenllm_v2/tinycompany/SigmaBoi-bgem3/eff6f456-906d-4320-8e6f-667fbbf0574a.json
 delete mode 100644 data/hfopenllm_v2/tinycompany/SigmaBoi-ib/6cbd9a3a-7e06-4eee-af9e-6db4ff35c36a.json
 delete mode 100644 data/hfopenllm_v2/tinycompany/SigmaBoi-nomic-moe/7e3d3803-c8d4-4025-8d12-c4c29c49c059.json
 delete mode 100644 data/hfopenllm_v2/tinycompany/SigmaBoi-nomic1.5-fp32/a43a6ca9-3543-44bc-8511-ee5c45552070.json
 delete mode 100644 data/hfopenllm_v2/tinycompany/SigmaBoi-nomic1.5/83f6fdec-9592-45a1-acdf-0ebbb400c8a4.json
 delete mode 100644 data/hfopenllm_v2/tinycompany/Tamed-Shawty/6e2d4174-303f-437b-9abb-26667b1dd04c.json
 delete mode 100644 data/hfopenllm_v2/tklohj/WindyFloLLM/955e93d0-bec1-483c-b3f0-258e13d5cb16.json
 delete mode 100644 data/hfopenllm_v2/togethercomputer/GPT-JT-6B-v1/3065ca79-c5e9-4875-9f81-4231e971d818.json
 delete mode 100644 data/hfopenllm_v2/togethercomputer/GPT-NeoXT-Chat-Base-20B/fc7e485f-a416-420b-b43c-e45e502c4a8f.json
 delete mode 100644 data/hfopenllm_v2/togethercomputer/LLaMA-2-7B-32K/53e882c6-6eb5-4202-a8d0-3a313556c9f4.json
 delete mode 100644 data/hfopenllm_v2/togethercomputer/Llama-2-7B-32K-Instruct/ba715669-c0ed-471f-80a6-b67453fb4930.json
 delete mode 100644 data/hfopenllm_v2/togethercomputer/RedPajama-INCITE-7B-Base/316cab27-5cac-4d26-90ae-05d1fc3bd14a.json
 delete mode 100644 data/hfopenllm_v2/togethercomputer/RedPajama-INCITE-7B-Chat/d2b0a35a-ea72-42f4-9f71-fffa1480bc22.json
 delete mode 100644 data/hfopenllm_v2/togethercomputer/RedPajama-INCITE-7B-Instruct/bf3eabff-fbf7-421c-9e04-548accc7678c.json
 delete mode 100644 data/hfopenllm_v2/togethercomputer/RedPajama-INCITE-Base-3B-v1/b7eeedd8-33ef-46b3-a3fb-6ac87247bc4e.json
 delete mode 100644 data/hfopenllm_v2/togethercomputer/RedPajama-INCITE-Chat-3B-v1/b1c41abe-e7f6-4229-b776-8ed0b5f91bd4.json
 delete mode 100644 data/hfopenllm_v2/togethercomputer/RedPajama-INCITE-Instruct-3B-v1/5b769770-3b63-4863-a723-95212e2be40e.json
 delete mode 100644 data/hfopenllm_v2/tokyotech-llm/Llama-3-Swallow-8B-Instruct-v0.1/f2264b41-efa5-4278-91fd-2f454aa91c61.json
 delete mode 100644 data/hfopenllm_v2/tomasmcm/sky-t1-coder-32b-flash/5c3484b4-6faa-47fd-a1a2-881898450f79.json
 delete mode 100644 data/hfopenllm_v2/trthminh1112/autotrain-llama32-1b-finetune/326b95f8-9eae-4064-a261-077a957e233c.json
 delete mode 100644 data/hfopenllm_v2/tugstugi/Qwen2.5-7B-Instruct-QwQ-v0.1/c1c7336e-b8bf-4a69-a586-c1a224ba8a65.json
 delete mode 100644 data/hfopenllm_v2/universalml/NepaliGPT-2.0/89e55482-b762-4f5d-a021-211048719bdc.json
 delete mode 100644 data/hfopenllm_v2/unsloth/Llama-3.2-1B-Instruct-no-system-message/81018e12-63f8-4ad8-87c4-181a13202497.json
 delete mode 100644 data/hfopenllm_v2/unsloth/Llama-3.2-1B-Instruct/5b09e8cb-aaf1-48fd-a2f4-11a8d4bc9a4d.json
 delete mode 100644 data/hfopenllm_v2/unsloth/Phi-3-mini-4k-instruct/8b344f21-9038-4b15-aba8-308aa62e4b39.json
 delete mode 100644 data/hfopenllm_v2/unsloth/phi-4-bnb-4bit/68ca8f7c-88c2-4ede-bcb7-d4ae23429d8f.json
 delete mode 100644 data/hfopenllm_v2/unsloth/phi-4-unsloth-bnb-4bit/df557f25-5505-49dd-a0cb-88fff601c6e2.json
 delete mode 100644 data/hfopenllm_v2/unsloth/phi-4/a50bf387-bf34-490f-979a-b6217a85a1bd.json
 delete mode 100644 data/hfopenllm_v2/upstage/SOLAR-10.7B-Instruct-v1.0/89264aa0-3bed-41d3-b171-2a5434cc990f.json
 delete mode 100644 data/hfopenllm_v2/upstage/SOLAR-10.7B-v1.0/a3272caf-a292-4dc7-8932-636a4099ca6b.json
 delete mode 100644 data/hfopenllm_v2/upstage/solar-pro-preview-instruct/c4ade77e-628f-457d-bbe1-3e5a0cb19d04.json
 delete mode 100644 data/hfopenllm_v2/utkmst/chimera-beta-test2-lora-merged/b030646c-5f5c-43ab-bbc4-405f82992265.json
 delete mode 100644 data/hfopenllm_v2/uukuguy/speechless-code-mistral-7b-v1.0/399e516c-d8c8-4511-a746-76c81f72b36a.json
 delete mode 100644 data/hfopenllm_v2/uukuguy/speechless-codellama-34b-v2.0/bd8e4424-7903-43e7-8105-269de734582e.json
 delete mode 100644 data/hfopenllm_v2/uukuguy/speechless-coder-ds-6.7b/9126e939-3a87-4774-9606-084c5b56e933.json
 delete mode 100644 data/hfopenllm_v2/uukuguy/speechless-instruct-mistral-7b-v0.2/be2ef197-738e-422d-9a88-cafd124584b7.json
 delete mode 100644 data/hfopenllm_v2/uukuguy/speechless-llama2-hermes-orca-platypus-wizardlm-13b/ee22e6c5-8529-4987-86d0-4abf3b525f90.json
 delete mode 100644 data/hfopenllm_v2/uukuguy/speechless-mistral-dolphin-orca-platypus-samantha-7b/50f0ddc2-fccd-447c-ab50-a086ccb4cd3a.json
 delete mode 100644 data/hfopenllm_v2/uukuguy/speechless-zephyr-code-functionary-7b/83294141-a70f-40da-b3f8-21b367098cce.json
 delete mode 100644 data/hfopenllm_v2/v000000/L3-8B-Stheno-v3.2-abliterated/303ae3d2-fdf5-404d-83ca-8e6071e13e6b.json
 delete mode 100644 data/hfopenllm_v2/v000000/L3.1-Niitorm-8B-DPO-t0.0001/1b13d76d-259f-41f2-baba-ce96ef0cb937.json
 delete mode 100644 data/hfopenllm_v2/v000000/L3.1-Storniitova-8B/b644a420-0a70-4b3d-9a5a-ff91911c857b.json
 delete mode 100644 data/hfopenllm_v2/v000000/Qwen2.5-14B-Gutenberg-1e-Delta/33aaa60f-eb69-4d36-917c-6862121a223e.json
 delete mode 100644 data/hfopenllm_v2/v000000/Qwen2.5-14B-Gutenberg-Instruct-Slerpeno/a1d2e571-6de0-4bd7-bdcf-8b3921b450f6.json
 delete mode 100644 data/hfopenllm_v2/v000000/Qwen2.5-Lumen-14B/ad93274e-3ca0-40cb-9f65-e6e6c66a8008.json
 delete mode 100644 data/hfopenllm_v2/vhab10/Llama-3.1-8B-Base-Instruct-SLERP/b8043d04-c3ab-4d6a-97eb-44b195a52710.json
 delete mode 100644 data/hfopenllm_v2/vhab10/Llama-3.2-Instruct-3B-TIES/c6bff6da-382f-4423-ba3a-d987839132e0.json
 delete mode 100644 data/hfopenllm_v2/vhab10/llama-3-8b-merged-linear/f3574ad1-a6d7-47fb-86e7-69c256452dea.json
 delete mode 100644 data/hfopenllm_v2/vicgalle/CarbonBeagle-11B-truthy/f2e47267-6c40-4d70-8420-295c95b318f3.json
 delete mode 100644 data/hfopenllm_v2/vicgalle/CarbonBeagle-11B/395f246e-34c6-40e6-bfeb-b047aa12cf90.json
 delete mode 100644 data/hfopenllm_v2/vicgalle/Configurable-Hermes-2-Pro-Llama-3-8B/3a91f8bb-c132-45b3-b8b4-d2ecc9f03f3a.json
 delete mode 100644 data/hfopenllm_v2/vicgalle/Configurable-Llama-3.1-8B-Instruct/97c92043-9bed-460a-8d7b-70ab3584c75b.json
 delete mode 100644 data/hfopenllm_v2/vicgalle/Configurable-Yi-1.5-9B-Chat/ab2ce171-bfcf-49ea-a341-2a52b2bd803a.json
 delete mode 100644 data/hfopenllm_v2/vicgalle/ConfigurableBeagle-11B/f9bbd9cc-dc6a-466f-b777-eaea4a15b874.json
 delete mode 100644 data/hfopenllm_v2/vicgalle/ConfigurableHermes-7B/cd0aefa3-b0c9-4683-872f-f9f9d285e6c3.json
 delete mode 100644 data/hfopenllm_v2/vicgalle/ConfigurableSOLAR-10.7B/c42db2ab-dbc4-48e4-9c16-7b8a5f8492c3.json
 delete mode 100644 data/hfopenllm_v2/vicgalle/Humanish-RP-Llama-3.1-8B/1b32c387-97a7-42ff-892c-d3bacebbf050.json
 delete mode 100644 data/hfopenllm_v2/vicgalle/Merge-Mistral-Prometheus-7B/cbea057c-b0f9-48ac-a075-eb28ebbaf358.json
 delete mode 100644 data/hfopenllm_v2/vicgalle/Merge-Mixtral-Prometheus-8x7B/0b1bb876-9dc7-47d5-855a-f028fb7f2df6.json
 delete mode 100644 data/hfopenllm_v2/vicgalle/Roleplay-Llama-3-8B/a86678ad-344c-430f-80c7-02d634b0cd5b.json
 delete mode 100644 data/hfopenllm_v2/viettelsecurity-ai/security-llama3.2-3b/827f3236-74fa-432b-8177-8785ac25ad76.json
 delete mode 100644 data/hfopenllm_v2/vihangd/smart-dan-sft-v0.1/7f694687-77e5-41d2-923b-f2d5f231729b.json
 delete mode 100644 data/hfopenllm_v2/voidful/smol-360m-ft/daa9d03e-63b0-4c08-ae72-e11041200ac7.json
 delete mode 100644 data/hfopenllm_v2/vonjack/MobileLLM-125M-HF/1539822f-acc4-4dae-9e61-133da97ebcbe.json
 delete mode 100644 data/hfopenllm_v2/vonjack/Phi-3-mini-4k-instruct-LLaMAfied/eec80fda-ce2f-4ef4-94d3-9e7b90f7f2e5.json
 delete mode 100644 data/hfopenllm_v2/vonjack/Phi-3.5-mini-instruct-hermes-fc-json/448cac5f-a7d3-41fb-9b49-666758037eb4.json
 delete mode 100644 data/hfopenllm_v2/vonjack/Qwen2.5-Coder-0.5B-Merged/5d7c5ac1-84c3-4fd1-ac51-4c00ed8c59c7.json
 delete mode 100644 data/hfopenllm_v2/vonjack/SmolLM2-1.7B-Merged/7e1741cc-f9ea-4940-9b6b-d7a515cfce31.json
 delete mode 100644 data/hfopenllm_v2/vonjack/SmolLM2-135M-Merged/ec4d21be-b1a6-47a9-84a4-1a25249c1768.json
 delete mode 100644 data/hfopenllm_v2/vonjack/SmolLM2-360M-Merged/c6b03539-04b3-4ef2-909d-8036a7ea2ae1.json
 delete mode 100644 data/hfopenllm_v2/w4r10ck/SOLAR-10.7B-Instruct-v1.0-uncensored/f156ac38-056e-4ef1-bdbe-e83c299a683b.json
 delete mode 100644 data/hfopenllm_v2/wanlige/li-14b-v0.4-slerp/11d3c8db-300c-4e02-b729-7adba6844ad2.json
 delete mode 100644 data/hfopenllm_v2/wanlige/li-14b-v0.4-slerp0.1/fc75a820-fc0b-4e50-9304-61f0e93795c0.json
 delete mode 100644 data/hfopenllm_v2/wanlige/li-14b-v0.4/bb66896f-799c-4e17-8b54-af5e795699fa.json
 delete mode 100644 data/hfopenllm_v2/wannaphong/KhanomTanLLM-Instruct/30a1a786-7478-401f-85ae-57037ada3d32.json
 delete mode 100644 data/hfopenllm_v2/waqasali1707/Beast-Soul-new/05430b16-07b6-41a1-ade9-6211cdf8ccf1.json
 delete mode 100644 data/hfopenllm_v2/wave-on-discord/qwent-7b/09bc4d5a-f104-4a36-999c-11e2532eef1e.json
 delete mode 100644 data/hfopenllm_v2/weathermanj/Menda-3B-500/a92cfff6-6caf-4bf1-913a-9d7dd2d8d449.json
 delete mode 100644 data/hfopenllm_v2/weathermanj/Menda-3b-750/8972e92c-ebbe-4dc4-8a8c-6f7a42ab5c11.json
 delete mode 100644 data/hfopenllm_v2/weathermanj/Menda-3b-Optim-100/e4f39815-9704-4d0a-8d9b-39359367adcc.json
 delete mode 100644 data/hfopenllm_v2/weathermanj/Menda-3b-Optim-200/f40df456-eb9a-46f8-8fb0-b6ad2748f3c2.json
 delete mode 100644 data/hfopenllm_v2/win10/ArliAI-RPMax-v1.3-merge-13.3B/398996d9-299b-4120-a757-e2fe14e779ee.json
 delete mode 100644 data/hfopenllm_v2/win10/Breeze-13B-32k-Instruct-v1_0/4398633e-77b0-4b61-ae85-29b0e5aad38b.json
 delete mode 100644 data/hfopenllm_v2/win10/EVA-Norns-Qwen2.5-v0.1/1bc60148-512f-4830-b541-f30535cf74bf.json
 delete mode 100644 data/hfopenllm_v2/win10/Llama-3.2-3B-Instruct-24-9-29/a9dfb20a-13e0-4419-a747-7c001b2e9435.json
 delete mode 100644 data/hfopenllm_v2/win10/Norns-Qwen2.5-12B/388e3559-a3b6-4738-9843-9bdd048bae09.json
 delete mode 100644 data/hfopenllm_v2/win10/Norns-Qwen2.5-7B/994a6930-42d5-463a-9e7c-0a3070144211.json
 delete mode 100644 data/hfopenllm_v2/win10/Qwen2.5-2B-Instruct/cce46320-9794-443a-831a-92e2a21515b0.json
 delete mode 100644 data/hfopenllm_v2/win10/llama3-13.45b-Instruct/988f4cc0-ebfb-43a9-8a7f-3dd1f1c1e342.json
 delete mode 100644 data/hfopenllm_v2/win10/miscii-14b-1M-0128/3c675148-5d09-4778-baad-9295ef8cfc79.json
 delete mode 100644 data/hfopenllm_v2/winglian/Llama-3-8b-64k-PoSE/620b80ba-81ab-4504-9f42-4965014f3cd1.json
 delete mode 100644 data/hfopenllm_v2/winglian/llama-3-8b-256k-PoSE/b6c68fc1-c2c1-4cdf-91ef-2007becd7ade.json
 delete mode 100644 data/hfopenllm_v2/wzhouad/gemma-2-9b-it-WPO-HB/19279c18-c2f7-4f75-a9c5-a121b2d4bcff.json
 delete mode 100644 data/hfopenllm_v2/x0000001/Deepseek-Lumen-R1-Qwen2.5-14B/7966789d-8ace-4b39-9093-96bbb8e641d8.json
 delete mode 100644 data/hfopenllm_v2/xMaulana/FinMatcha-3B-Instruct/5e1d849d-0342-4de9-a7d8-dd5cd5960fac.json
 delete mode 100644 data/hfopenllm_v2/xinchen9/Llama3.1_8B_Instruct_CoT/a17563e3-0369-4042-8006-2ec781653f63.json
 delete mode 100644 data/hfopenllm_v2/xinchen9/Llama3.1_CoT/68369110-e371-4112-ae0a-14f7fe9fc40f.json
 delete mode 100644 data/hfopenllm_v2/xinchen9/Llama3.1_CoT_V1/2a6925d3-992f-4c4f-a57b-3eb41062743b.json
 delete mode 100644 data/hfopenllm_v2/xinchen9/Mistral-7B-CoT/28290ea9-9ce5-4605-ac5b-aa2d606994d8.json
 delete mode 100644 data/hfopenllm_v2/xinchen9/llama3-b8-ft-dis/eb2ed6eb-4789-400d-aea5-841547a20cd7.json
 delete mode 100644 data/hfopenllm_v2/xkp24/Llama-3-8B-Instruct-SPPO-Iter2_bt_2b-table/873218a0-7ddb-4287-88ce-8c8214e85c85.json
 delete mode 100644 data/hfopenllm_v2/xkp24/Llama-3-8B-Instruct-SPPO-Iter2_bt_8b-table/e4c32b92-46b4-431a-83f2-11499f587534.json
 delete mode 100644 data/hfopenllm_v2/xkp24/Llama-3-8B-Instruct-SPPO-Iter2_gp_2b-table/a05681a0-07e4-4206-ae89-dee4e9706467.json
 delete mode 100644 data/hfopenllm_v2/xkp24/Llama-3-8B-Instruct-SPPO-Iter2_gp_8b-table/b078f823-d603-4030-81a2-a3ca1a1117f9.json
 delete mode 100644 data/hfopenllm_v2/xkp24/Llama-3-8B-Instruct-SPPO-score-Iter2_bt_2b-table-0.001/26625158-6720-47c7-8c28-46ca7b4b947e.json
 delete mode 100644 data/hfopenllm_v2/xkp24/Llama-3-8B-Instruct-SPPO-score-Iter2_bt_8b-table-0.002/5e3e8dec-f14b-4b7a-ace1-1e1728395e84.json
 delete mode 100644 data/hfopenllm_v2/xkp24/Llama-3-8B-Instruct-SPPO-score-Iter2_gp_2b-table-0.001/35b4378e-52cd-4ae1-985b-c8e2c00dc61a.json
 delete mode 100644 data/hfopenllm_v2/xkp24/Llama-3-8B-Instruct-SPPO-score-Iter2_gp_8b-table-0.002/4d99a55e-39c0-41c7-9ef0-494f739ceaec.json
 delete mode 100644 data/hfopenllm_v2/xukp20/Llama-3-8B-Instruct-SPPO-Iter3_bt_2b-table/f3c7bacd-e231-45fd-b503-ee4d34caf4e8.json
 delete mode 100644 data/hfopenllm_v2/xukp20/Llama-3-8B-Instruct-SPPO-Iter3_bt_8b-table/1bb87d8f-2d66-42b2-a744-1a7cbc2c17dc.json
 delete mode 100644 data/hfopenllm_v2/xukp20/Llama-3-8B-Instruct-SPPO-Iter3_gp_2b-table/ae10fd26-e648-4fa0-ae24-dfaaf4ff510d.json
 delete mode 100644 data/hfopenllm_v2/xukp20/Llama-3-8B-Instruct-SPPO-Iter3_gp_8b-table/0af58746-0492-4ba7-8a17-c0a5c43d0700.json
 delete mode 100644 data/hfopenllm_v2/xukp20/Llama-3-8B-Instruct-SPPO-score-Iter3_bt_2b-table-0.001/88fff9f5-7aa7-463a-87e0-5fd2f5bacf09.json
 delete mode 100644 data/hfopenllm_v2/xukp20/Llama-3-8B-Instruct-SPPO-score-Iter3_bt_8b-table-0.002/bc79527d-ae58-4b17-afd8-df931562dbf3.json
 delete mode 100644 data/hfopenllm_v2/xukp20/Llama-3-8B-Instruct-SPPO-score-Iter3_gp_2b-table-0.001/3e7423d5-ad7e-48e2-bd25-a4946d443c24.json
 delete mode 100644 data/hfopenllm_v2/xukp20/llama-3-8b-instruct-sppo-iter1-gp-2b-tau01-table/7979fd6a-a886-41cc-987b-356b7c452bff.json
 delete mode 100644 data/hfopenllm_v2/xwen-team/Xwen-7B-Chat/2be6bc34-1e61-426f-b963-6e096b5418fb.json
 delete mode 100644 data/hfopenllm_v2/xxx777xxxASD/L3.1-ClaudeMaid-4x8B/c4f69339-be6b-4bb4-8faf-a1f40e73d4b0.json
 delete mode 100644 data/hfopenllm_v2/yam-peleg/Hebrew-Gemma-11B-Instruct/c845eb10-a028-4cc2-8f64-25d75480c0d5.json
 delete mode 100644 data/hfopenllm_v2/yam-peleg/Hebrew-Mistral-7B-200K/377e7223-4876-49b6-8057-b1831d7f129b.json
 delete mode 100644 data/hfopenllm_v2/yam-peleg/Hebrew-Mistral-7B-200K/4ddb9ed6-0599-482e-b12e-bcb01975cc85.json
 delete mode 100644 data/hfopenllm_v2/yam-peleg/Hebrew-Mistral-7B/9d5af106-be69-4b62-99c1-fcfb6091d080.json
 delete mode 100644 data/hfopenllm_v2/yanng1242/Marcoro14-7B-slerp/2f2d7a55-2838-446d-9487-a6cfa0c03356.json
 delete mode 100644 data/hfopenllm_v2/yasserrmd/Coder-GRPO-3B/65d20d45-f63b-4b09-b66d-5f53297c0c20.json
 delete mode 100644 data/hfopenllm_v2/yasserrmd/Text2SQL-1.5B/4712953f-0777-4b97-8f13-f7309f19f0dc.json
 delete mode 100644 data/hfopenllm_v2/ycros/BagelMIsteryTour-v2-8x7B/84382308-04b5-439f-b486-b26d20da605a.json
 delete mode 100644 data/hfopenllm_v2/ycros/BagelMIsteryTour-v2-8x7B/e82be06f-14ed-45e8-a273-d28c50f5212b.json
 delete mode 100644 data/hfopenllm_v2/yfzp/Llama-3-8B-Instruct-SPPO-Iter1_bt_2b-table/5815ba55-40fc-4f8e-ae0b-b329c42fd503.json
 delete mode 100644 data/hfopenllm_v2/yfzp/Llama-3-8B-Instruct-SPPO-Iter1_bt_8b-table/e58eceb3-b501-4924-9d0d-98d7da3c16c5.json
 delete mode 100644 data/hfopenllm_v2/yfzp/Llama-3-8B-Instruct-SPPO-Iter1_gp_2b-table/5a88455c-7699-4c49-8a12-76cda15d878c.json
 delete mode 100644 data/hfopenllm_v2/yfzp/Llama-3-8B-Instruct-SPPO-Iter1_gp_8b-table/122b4c1e-6e6c-4db5-8991-b091361c3ecf.json
 delete mode 100644 data/hfopenllm_v2/yfzp/Llama-3-8B-Instruct-SPPO-score-Iter1_bt_2b-table-0.001/6abeb0e4-32ee-4dbb-9902-b19cc96a2aa7.json
 delete mode 100644 data/hfopenllm_v2/yfzp/Llama-3-8B-Instruct-SPPO-score-Iter1_bt_8b-table-0.002/679f214f-e03f-47a9-8a11-91adbf1c4880.json
 delete mode 100644 data/hfopenllm_v2/yfzp/Llama-3-8B-Instruct-SPPO-score-Iter1_gp_2b-table-0.001/680e77b8-9c64-4c52-aa83-55236039cef1.json
 delete mode 100644 data/hfopenllm_v2/yfzp/Llama-3-8B-Instruct-SPPO-score-Iter1_gp_8b-table-0.002/c24c471c-14b3-462e-8b81-6548b27e5ffc.json
 delete mode 100644 data/hfopenllm_v2/yifAI/Llama-3-8B-Instruct-SPPO-score-Iter3_gp_8b-table-0.002/efa7fa62-2e8b-403c-b345-eef876b48dbd.json
 delete mode 100644 data/hfopenllm_v2/ylalain/ECE-PRYMMAL-YL-1B-SLERP-V8/40bae762-65bd-4b4c-b422-ffd0fd3790a9.json
 delete mode 100644 data/hfopenllm_v2/ymcki/Llama-3.1-8B-GRPO-Instruct/596957cc-719c-44c7-8284-06a9ba0d1a30.json
 delete mode 100644 data/hfopenllm_v2/ymcki/Llama-3.1-8B-SFT-GRPO-Instruct/706bbc09-f867-4327-bc4d-b5ede41ebd93.json
 delete mode 100644 data/hfopenllm_v2/ymcki/gemma-2-2b-ORPO-jpn-it-abliterated-18-merge/8962e9be-75bf-4f57-8ce2-b29523740851.json
 delete mode 100644 data/hfopenllm_v2/ymcki/gemma-2-2b-ORPO-jpn-it-abliterated-18/014f4838-22ff-4802-a887-4d2de01a9256.json
 delete mode 100644 data/hfopenllm_v2/ymcki/gemma-2-2b-jpn-it-abliterated-17-18-24/5c6eac9c-0ec6-4364-a86b-dcd894d69f0b.json
 delete mode 100644 data/hfopenllm_v2/ymcki/gemma-2-2b-jpn-it-abliterated-17-ORPO-alpaca/09b81cf2-3b79-448c-ab8e-87e378c804bb.json
 delete mode 100644 data/hfopenllm_v2/ymcki/gemma-2-2b-jpn-it-abliterated-17-ORPO/28b9977a-db3d-4f38-b1f7-bd0cdcab5504.json
 delete mode 100644 data/hfopenllm_v2/ymcki/gemma-2-2b-jpn-it-abliterated-17/845ea162-cfa1-47f4-8914-d81d9bf1bb7d.json
 delete mode 100644 data/hfopenllm_v2/ymcki/gemma-2-2b-jpn-it-abliterated-18-ORPO/706737c7-cd1a-4958-9ffc-2655f0b50178.json
 delete mode 100644 data/hfopenllm_v2/ymcki/gemma-2-2b-jpn-it-abliterated-18/5acd58cd-8dfb-4fb7-8832-6bc151e0b1a1.json
 delete mode 100644 data/hfopenllm_v2/ymcki/gemma-2-2b-jpn-it-abliterated-24/d374a68d-b985-47c2-b087-500bffa93c80.json
 delete mode 100644 data/hfopenllm_v2/yuchenxie/ArlowGPT-3B-Multilingual/23fbceb0-b646-4945-b17f-66dde24a0e43.json
 delete mode 100644 data/hfopenllm_v2/yuchenxie/ArlowGPT-8B/73d9e204-e829-4159-b340-6d9581c6f0e1.json
 delete mode 100644 data/hfopenllm_v2/yuvraj17/Llama3-8B-SuperNova-Spectrum-Hermes-DPO/a6979dda-fba6-4104-b153-3b0a89de8585.json
 delete mode 100644 data/hfopenllm_v2/yuvraj17/Llama3-8B-SuperNova-Spectrum-dare_ties/62e04968-0c5c-4aad-a434-d9d24bccbdb8.json
 delete mode 100644 data/hfopenllm_v2/yuvraj17/Llama3-8B-abliterated-Spectrum-slerp/bae4064e-b10f-4082-876d-e4168ca1a8cc.json
 delete mode 100644 data/hfopenllm_v2/zake7749/gemma-2-2b-it-chinese-kyara-dpo/0040b48c-0f54-4c9b-97ee-1ca833c68e36.json
 delete mode 100644 data/hfopenllm_v2/zake7749/gemma-2-9b-it-chinese-kyara/6050e969-bcde-4594-8e53-05fa74c7287d.json
 delete mode 100644 data/hfopenllm_v2/zelk12/Gemma-2-TM-9B/3aaee358-bf3e-4d91-91bf-bd42e0a7c61e.json
 delete mode 100644 data/hfopenllm_v2/zelk12/MT-Gen1-gemma-2-9B/ef5f4fb2-f409-49dc-b3f0-f3e19585cd8a.json
 delete mode 100644 data/hfopenllm_v2/zelk12/MT-Gen2-GI-gemma-2-9B/4048fa60-7427-4f7e-9939-e270aa5e8b51.json
 delete mode 100644 data/hfopenllm_v2/zelk12/MT-Gen2-gemma-2-9B/f5c9baea-f2cf-414a-937a-6a43f55a1c1d.json
 delete mode 100644 data/hfopenllm_v2/zelk12/MT-Gen3-gemma-2-9B/1da70796-d40b-4f2a-8ce3-b304f414a6d5.json
 delete mode 100644 data/hfopenllm_v2/zelk12/MT-Gen4-gemma-2-9B/de476f79-2539-4f9e-a1d2-901c6c4342d4.json
 delete mode 100644 data/hfopenllm_v2/zelk12/MT-Gen5-gemma-2-9B/80aee542-c894-46b6-a6ed-9f3400aefa9e.json
 delete mode 100644 data/hfopenllm_v2/zelk12/MT-Gen6-gemma-2-9B/5c9d4eaf-0985-4f9e-8007-08b4081bb19d.json
 delete mode 100644 data/hfopenllm_v2/zelk12/MT-Gen6fix-gemma-2-9B/4b019824-8454-4ce8-aa49-d122a2491f9c.json
 delete mode 100644 data/hfopenllm_v2/zelk12/MT-Gen7-gemma-2-9B/0dfcd13c-f057-4aec-82ad-b5cf2b266502.json
 delete mode 100644 data/hfopenllm_v2/zelk12/MT-Max-Merge_02012025163610-gemma-2-9B/927589bf-f6a0-4155-a24b-120231bbf029.json
 delete mode 100644 data/hfopenllm_v2/zelk12/MT-Merge-gemma-2-9B/1a2740cb-c541-434e-89a1-7a9fd2c4cabd.json
 delete mode 100644 data/hfopenllm_v2/zelk12/MT-Merge1-gemma-2-9B/0110d1c9-755e-4f09-888b-0c9c1a263639.json
 delete mode 100644 data/hfopenllm_v2/zelk12/MT-Merge2-MU-gemma-2-MTg2MT1g2-9B/cda65781-494c-45bd-8c32-7b1fe987f31c.json
 delete mode 100644 data/hfopenllm_v2/zelk12/MT-Merge2-gemma-2-9B/2fd7de02-f8d9-45c1-9bb5-db5134bd4862.json
 delete mode 100644 data/hfopenllm_v2/zelk12/MT-Merge3-gemma-2-9B/acf07f51-5acd-4375-bafa-7a1a244db3c6.json
 delete mode 100644 data/hfopenllm_v2/zelk12/MT-Merge4-gemma-2-9B/ff985193-ba26-45d3-97be-b7d3b17ab4d7.json
 delete mode 100644 data/hfopenllm_v2/zelk12/MT-Merge5-gemma-2-9B/21dbea2c-5cb1-431c-a496-af9b932b3440.json
 delete mode 100644 data/hfopenllm_v2/zelk12/MT-Merge6-gemma-2-9B/1143955c-c32c-4b41-8484-2c77e72f4946.json
 delete mode 100644 data/hfopenllm_v2/zelk12/MT-gemma-2-9B/94824ceb-08c3-415c-8003-b70a0d9af09d.json
 delete mode 100644 data/hfopenllm_v2/zelk12/MT1-Gen1-gemma-2-9B/bf2903cb-b954-4870-98c3-116a96aa49fb.json
 delete mode 100644 data/hfopenllm_v2/zelk12/MT1-Gen2-gemma-2-9B/b089c439-a38c-438d-bdad-1c68a1265d95.json
 delete mode 100644 data/hfopenllm_v2/zelk12/MT1-Gen3-gemma-2-9B/c988815b-50e5-47e4-a418-bbbcdf1eb4a0.json
 delete mode 100644 data/hfopenllm_v2/zelk12/MT1-Gen4-gemma-2-9B/fa11d66c-7ebc-4b81-83b7-d35a4ff23d3f.json
 delete mode 100644 data/hfopenllm_v2/zelk12/MT1-Gen5-IF-gemma-2-S2DMv1-9B/1c81787b-594e-4bb6-aee1-7f193a628b16.json
 delete mode 100644 data/hfopenllm_v2/zelk12/MT1-Gen5-gemma-2-9B/fd9ce37e-d43d-4ec2-94ec-0eb42e3cc685.json
 delete mode 100644 data/hfopenllm_v2/zelk12/MT1-Gen6-gemma-2-9B/0625f09a-3e02-410b-963b-49b83dfc5c8f.json
 delete mode 100644 data/hfopenllm_v2/zelk12/MT1-Gen7-gemma-2-9B/50c1399e-b409-4dff-b4d6-9be01dbb02c7.json
 delete mode 100644 data/hfopenllm_v2/zelk12/MT1-Max-Merge_02012025163610-gemma-2-9B/402bdb4a-b258-40a4-ac9f-de74026c02f3.json
 delete mode 100644 data/hfopenllm_v2/zelk12/MT1-gemma-2-9B/65dcf458-db0f-45cd-a8a4-e16108e51161.json
 delete mode 100644 data/hfopenllm_v2/zelk12/MT2-Gen1-gemma-2-9B/f1346b1a-0e66-4d80-bfad-ccbe0a8e2abf.json
 delete mode 100644 data/hfopenllm_v2/zelk12/MT2-Gen2-gemma-2-9B/11e7b55a-d872-474a-98a6-fc82ce5a863e.json
 delete mode 100644 data/hfopenllm_v2/zelk12/MT2-Gen3-gemma-2-9B/19688633-fa6c-412a-8dbc-c16fc49b3276.json
 delete mode 100644 data/hfopenllm_v2/zelk12/MT2-Gen4-gemma-2-9B/7d67eb9c-a4d8-4b86-8c24-928ebbe58de7.json
 delete mode 100644 data/hfopenllm_v2/zelk12/MT2-Gen5-gemma-2-9B/447f880c-643f-4041-8cdb-87697d798085.json
 delete mode 100644 data/hfopenllm_v2/zelk12/MT2-Gen6-gemma-2-9B/653d459e-f8b7-48bc-a9db-779e515532cf.json
 delete mode 100644 data/hfopenllm_v2/zelk12/MT2-Gen7-gemma-2-9B/4e56faf6-dbde-4059-b502-32c76bdbed2d.json
 delete mode 100644 data/hfopenllm_v2/zelk12/MT2-Max-Merge_02012025163610-gemma-2-9B/f161df97-3cc6-48d3-bfc5-d3f01108ecbb.json
 delete mode 100644 data/hfopenllm_v2/zelk12/MT2-gemma-2-9B/7d08412d-e987-497f-a6ec-ce0affe0f80f.json
 delete mode 100644 data/hfopenllm_v2/zelk12/MT3-Gen1-gemma-2-9B/f042f897-cfe8-4d8c-b75b-bbfca44505ea.json
 delete mode 100644 data/hfopenllm_v2/zelk12/MT3-Gen2-gemma-2-9B/f24ab334-c022-4e34-a930-3fed6ee18793.json
 delete mode 100644 data/hfopenllm_v2/zelk12/MT3-Gen3-gemma-2-9B/2bd3c620-780f-452d-92d7-d01a04539939.json
 delete mode 100644 data/hfopenllm_v2/zelk12/MT3-Gen4-gemma-2-9B/234042bd-237f-4cc5-8c5d-1eacd2e8bfaa.json
 delete mode 100644 data/hfopenllm_v2/zelk12/MT3-Gen5-gemma-2-9B/d8e0a32e-f307-4056-b450-47a12a0a7b15.json
 delete mode 100644 data/hfopenllm_v2/zelk12/MT3-Gen5-gemma-2-9B_v1/9dc3c4f5-8974-4496-8a6e-daa4fe3e3c2a.json
 delete mode 100644 data/hfopenllm_v2/zelk12/MT3-Gen6-gemma-2-9B/037787fb-9c61-4c56-a7fc-704c04b519f7.json
 delete mode 100644 data/hfopenllm_v2/zelk12/MT3-Max-Merge_02012025163610-gemma-2-9B/5df3dd8f-4921-4916-8163-8651b796e478.json
 delete mode 100644 data/hfopenllm_v2/zelk12/MT3-gemma-2-9B/50463593-3a53-4b3f-9621-d05670309b7e.json
 delete mode 100644 data/hfopenllm_v2/zelk12/MT4-Gen1-gemma-2-9B/d7fef356-36c7-488f-8f49-997682a2c01a.json
 delete mode 100644 data/hfopenllm_v2/zelk12/MT4-Gen2-gemma-2-9B/42e7abc6-eaa2-4971-90ee-e4d9dbb97ddb.json
 delete mode 100644 data/hfopenllm_v2/zelk12/MT4-Gen3-gemma-2-9B/b1cf06a6-d270-41ae-bb9b-443bdc5446f3.json
 delete mode 100644 data/hfopenllm_v2/zelk12/MT4-Gen4-gemma-2-9B/e40ea476-bcc5-4d3b-bf8e-e5048d9cbe42.json
 delete mode 100644 data/hfopenllm_v2/zelk12/MT4-Gen5-gemma-2-9B/731a5f85-a59e-40af-870c-00e519ca0e7e.json
 delete mode 100644 data/hfopenllm_v2/zelk12/MT4-Max-Merge_02012025163610-gemma-2-9B/38d93ae8-90ec-473c-8570-33d52c46770b.json
 delete mode 100644 data/hfopenllm_v2/zelk12/MT4-gemma-2-9B/9072fd28-040b-44df-bd58-6e3f59398189.json
 delete mode 100644 data/hfopenllm_v2/zelk12/MT5-Gen1-gemma-2-9B/14827e00-09c5-4ebd-93cb-8e026ac73d20.json
 delete mode 100644 data/hfopenllm_v2/zelk12/MT5-Gen2-gemma-2-9B/11e76d74-b8e0-408f-b429-566faa5d60a2.json
 delete mode 100644 data/hfopenllm_v2/zelk12/MT5-Gen3-gemma-2-9B/944c84d8-231d-47ef-85f4-23c0286a4a02.json
 delete mode 100644 data/hfopenllm_v2/zelk12/MT5-Gen4-gemma-2-9B/47c8da1d-8ce3-4d19-b8b8-6b5e68e2e8ab.json
 delete mode 100644 data/hfopenllm_v2/zelk12/MT5-Gen5-gemma-2-9B/ca54a8d4-153b-4169-b6ee-133461a9bedd.json
 delete mode 100644 data/hfopenllm_v2/zelk12/MT5-Max-Merge_02012025163610-gemma-2-9B/652359ec-14f2-4f94-a694-b7dc98819bfc.json
 delete mode 100644 data/hfopenllm_v2/zelk12/MT5-gemma-2-9B/b34f3335-c7a3-431f-b2c8-6f0731a81378.json
 delete mode 100644 data/hfopenllm_v2/zelk12/MTM-Merge-gemma-2-9B/077306f9-5d40-40dc-9df4-b5ca559af5c7.json
 delete mode 100644 data/hfopenllm_v2/zelk12/MTMaMe-Merge_02012025163610-gemma-2-9B/e0f0fe87-8ed3-4398-8683-65aa042d01d9.json
 delete mode 100644 data/hfopenllm_v2/zelk12/Rv0.4DMv1t0.25-gemma-2-9B/2d968d3e-a3df-4bdf-86a4-034087c0d7fc.json
 delete mode 100644 data/hfopenllm_v2/zelk12/Rv0.4DMv1t0.25Tt0.25-gemma-2-9B/db476911-87fb-433f-b164-4435718dab46.json
 delete mode 100644 data/hfopenllm_v2/zelk12/Rv0.4MT4g2-gemma-2-9B/75a967f6-a8ab-435f-999b-4889e8217dce.json
 delete mode 100644 data/hfopenllm_v2/zelk12/T31122024203920-gemma-2-9B/e072997b-2f79-4d25-b8dc-ebf15ac311e1.json
 delete mode 100644 data/hfopenllm_v2/zelk12/Test01012025155054/6d681a29-0d1a-4054-8250-5246993509f8.json
 delete mode 100644 data/hfopenllm_v2/zelk12/Test01012025155054t0.5_gemma-2/2a6af4ce-e45c-4721-a23c-03071a5e774f.json
 delete mode 100644 data/hfopenllm_v2/zelk12/gemma-2-S2MTM-9B/5ae5ddff-714d-4a20-b1d3-3eeb95fd858c.json
 delete mode 100644 data/hfopenllm_v2/zelk12/recoilme-gemma-2-Ataraxy-9B-v0.1-t0.25/60052d34-f6a7-4204-baea-532f5ba29880.json
 delete mode 100644 data/hfopenllm_v2/zelk12/recoilme-gemma-2-Ataraxy-9B-v0.1-t0.75/e1ddd882-f8a1-48d0-bb2a-878f43095895.json
 delete mode 100644 data/hfopenllm_v2/zelk12/recoilme-gemma-2-Ataraxy-9B-v0.1/d2c3edec-38d8-48e3-9f6d-e26a63442af8.json
 delete mode 100644 data/hfopenllm_v2/zelk12/recoilme-gemma-2-Ataraxy-9B-v0.2/dcfafe94-dacb-4e7a-9365-8bb39ecb79ec.json
 delete mode 100644 data/hfopenllm_v2/zelk12/recoilme-gemma-2-Gutenberg-Doppel-9B-v0.1/8ca0e602-bf6b-4d15-95c2-a0d47e78ded0.json
 delete mode 100644 data/hfopenllm_v2/zelk12/recoilme-gemma-2-Ifable-9B-v0.1/fc262523-dcde-4b45-80ba-2922e66d42c4.json
 delete mode 100644 data/hfopenllm_v2/zelk12/recoilme-gemma-2-psy10k-mental_healt-9B-v0.1/f8d745da-9867-4348-bace-d8052c3b4025.json
 delete mode 100644 data/hfopenllm_v2/zetasepic/Qwen2.5-32B-Instruct-abliterated-v2/3d410f0f-6b24-4e86-a353-6142c51b1ecc.json
 delete mode 100644 data/hfopenllm_v2/zetasepic/Qwen2.5-72B-Instruct-abliterated/46329fc3-974f-4d04-be9e-ba85b3816efc.json
 delete mode 100644 data/hfopenllm_v2/zhengr/MixTAO-7Bx2-MoE-v8.1/b964d0a4-7c44-4ea2-894e-3e1ca30321e0.json
 delete mode 100644 data/livecodebenchpro/alibaba/qwen3-235b-a22b-thinking-2507/126326f3-6521-45d1-aa14-5c51335c1929.json
 delete mode 100644 data/livecodebenchpro/alibaba/qwen3-30b-a3b/b3f5937a-1489-417b-8162-6c62dea0703d.json
 delete mode 100644 data/livecodebenchpro/alibaba/qwen3-max/f06d6c4c-b2c4-4c48-9702-f0bf08af62c4.json
 delete mode 100644 data/livecodebenchpro/alibaba/qwen3-next-80b-a3b-thinking/809a1503-a161-4532-afd3-fdbd6551eb63.json
 delete mode 100644 data/livecodebenchpro/aliyun/qwen3-next-80b-a3b-thinking/808ca8e4-9b14-48ba-bb39-e3b6a5672c80.json
 delete mode 100644 data/livecodebenchpro/anthropic/claude-3-7-sonnet-20250219/be076445-eb88-49b0-a855-2e0cb1551bab.json
 delete mode 100644 data/livecodebenchpro/anthropic/claude-3.7-sonnet/69210faf-04a8-46d4-b92b-94f2ca521c09.json
 delete mode 100644 data/livecodebenchpro/anthropic/claude-sonnet-4-5-20250929/ed293aa1-f64e-429d-bddf-91a35a4203d1.json
 delete mode 100644 data/livecodebenchpro/ark/ep-20250603132404-cgpjm/2bddd388-5e9a-423e-8767-37d6f9f69032.json
 delete mode 100644 data/livecodebenchpro/bytedance/doubao-seed-1-6-thinking-250615/bfd991ca-13e9-4716-b389-11e0d2afe286.json
 delete mode 100644 data/livecodebenchpro/deepseek/chat-v3-0324/b29b7c8e-759e-45fe-a9d3-1054f19af617.json
 delete mode 100644 data/livecodebenchpro/deepseek/ep-20250214004308-p7n89/801d2dc6-17e7-47f1-a54f-87b94a59b508.json
 delete mode 100644 data/livecodebenchpro/deepseek/ep-20250228232227-z44x5/def0b2e3-cf5f-4dfd-8f1c-827f98d1626a.json
 delete mode 100644 data/livecodebenchpro/deepseek/ep-20250603132404-cgpjm/157dd68b-fcc2-416f-a2c0-c9781020e6af.json
 delete mode 100644
data/livecodebenchpro/google/gemini-2.5-flash/174f0e23-84f1-43d0-bcdf-11b83c37025a.json delete mode 100644 data/livecodebenchpro/google/gemini-2.5-pro/bef7254b-549f-4e6b-b5c8-31b84dc6acda.json delete mode 100644 data/livecodebenchpro/kuaishou/kwaipilot-40b-0604/aa236b03-b81f-431b-b049-7101cea165f2.json delete mode 100644 data/livecodebenchpro/meta/llama-4-maverick/abc37028-a362-4e02-8499-1bb7497e0293.json delete mode 100644 data/livecodebenchpro/openai/gpt-4.1/ba46ef91-d157-4984-b3df-ce33d8d97f8e.json delete mode 100644 data/livecodebenchpro/openai/gpt-4o-2024-11-20/e70acf51-30ef-4c20-b7cc-51704d114d70.json delete mode 100644 data/livecodebenchpro/openai/gpt-5-2025-08-07/0e57aa1f-48c6-42b7-9aee-43a29d21b83f.json delete mode 100644 data/livecodebenchpro/openai/gpt-5-2025-08-07/de66cc70-b456-4165-a827-5193dd77e84d.json delete mode 100644 data/livecodebenchpro/openai/gpt-5.2-2025-12-11/e9139c52-ada0-4d1c-ae82-7852aacdb6ea.json delete mode 100644 data/livecodebenchpro/openai/gpt-oss-120b/1dd8c827-72af-4c8f-9ead-989de7105590.json delete mode 100644 data/livecodebenchpro/openai/gpt-oss-20b/ead39f61-b408-42b2-808f-8421a3200c89.json delete mode 100644 data/livecodebenchpro/openai/o3-2025-04-16/f96bdb35-4d61-4fde-8d91-edf55f13dc03.json delete mode 100644 data/livecodebenchpro/openai/o4-mini-2025-04-16/5516f77c-932a-4eaa-ac31-dda9260ce82d.json delete mode 100644 data/livecodebenchpro/openai/o4-mini-2025-04-16/8992cef5-df7e-40a1-b099-331532c3deb0.json delete mode 100644 data/livecodebenchpro/z-ai/glm-4.5/a77c08d6-a782-440c-b545-c60b6169712d.json delete mode 100644 data/reward-bench/0-hero/Matter-0.1-7B-DPO-preview/623bae1f-19e9-47f9-bc7b-80a859218d07.json delete mode 100644 data/reward-bench/0-hero/Matter-0.1-7B-boost-DPO-preview/fbba98c5-5d56-4837-9044-d4e5ac610c2c.json delete mode 100644 data/reward-bench/Ahjeong/MMPO_Gemma_7b/dc6e1164-c9d7-4dd5-b8dc-fbc4e3f45011.json delete mode 100644 data/reward-bench/Ahjeong/MMPO_Gemma_7b_gamma1.1_epoch3/c62a913b-3101-4ce3-a5c5-a1ac844e55f8.json delete mode 100644 data/reward-bench/Anthropic/claude-3-5-sonnet-20240620/3101726d-fd51-436d-8adf-cbdf0d534834.json delete mode 100644 data/reward-bench/Anthropic/claude-3-5-sonnet-20240620/f878a52a-fa80-4113-ae7d-0cb11e3ef9fd.json delete mode 100644 data/reward-bench/Anthropic/claude-3-7-sonnet-20250219/904c6359-bd7b-4448-9f16-bc115d0629c4.json delete mode 100644 data/reward-bench/Anthropic/claude-3-haiku-20240307/49511052-6881-4151-9b46-686c75f73c22.json delete mode 100644 data/reward-bench/Anthropic/claude-3-haiku-20240307/b289e2e6-d57b-4a2b-aa61-e2974d193909.json delete mode 100644 data/reward-bench/Anthropic/claude-3-opus-20240229/aeeca919-71a1-42a0-a6d0-6779d77750e6.json delete mode 100644 data/reward-bench/Anthropic/claude-3-opus-20240229/db29538d-f40e-42d0-b3c0-e622f92112d2.json delete mode 100644 data/reward-bench/Anthropic/claude-3-sonnet-20240229/ab0cdc4f-47dd-4dcc-b506-982ce3924105.json delete mode 100644 data/reward-bench/Anthropic/claude-opus-4-20250514/44da63b6-d934-4330-bc20-33464bae61dd.json delete mode 100644 data/reward-bench/Anthropic/claude-sonnet-4-20250514/c930cbe0-f429-4b61-9abe-86dcb7266cf7.json delete mode 100644 data/reward-bench/AtlaAI/Selene-1-Mini-Llama-3.1-8B/c84b27b2-2dd9-48ee-9a53-ec27ae62ae7a.json delete mode 100644 data/reward-bench/AtlaAI/Selene-1/73ee9408-e669-4b8a-9419-76bd6051ce8d.json delete mode 100644 data/reward-bench/CIR-AMS/BTRM_Qwen2_7b_0613/0deed2f4-770e-4033-a65d-e1da19e00611.json delete mode 100644 
data/reward-bench/CIR-AMS/BTRM_Qwen2_7b_0613/e727cb77-f229-4aaa-909f-99c7aa06676b.json delete mode 100644 data/reward-bench/CohereForAI/c4ai-command-r-plus/da9264cd-2fa3-4121-81de-eef994e15993.json delete mode 100644 data/reward-bench/ContextualAI/LMUnit-llama3.1-70b/79cc5cd4-bfed-466d-9fbe-2f27e8aab175.json delete mode 100644 data/reward-bench/ContextualAI/LMUnit-qwen2.5-72b/28c35831-679d-489a-b2c4-fd2c7f333fbc.json delete mode 100644 data/reward-bench/ContextualAI/archangel_sft-dpo_llama13b/9db7907d-7b22-480c-86a5-f88ec2b302e7.json delete mode 100644 data/reward-bench/ContextualAI/archangel_sft-dpo_llama30b/2faddf79-41e6-47e9-9c26-17bc987bc870.json delete mode 100644 data/reward-bench/ContextualAI/archangel_sft-dpo_llama7b/20989a47-6556-4e3b-8909-d0a419cb159b.json delete mode 100644 data/reward-bench/ContextualAI/archangel_sft-dpo_pythia1-4b/f3d0010f-efed-4f87-9582-b9c87b4de99a.json delete mode 100644 data/reward-bench/ContextualAI/archangel_sft-dpo_pythia12-0b/a0ce3ed6-2a2c-46ad-be86-6f6701533e36.json delete mode 100644 data/reward-bench/ContextualAI/archangel_sft-dpo_pythia2-8b/d54c4830-23c8-4c12-aea1-4f5b5245464f.json delete mode 100644 data/reward-bench/ContextualAI/archangel_sft-dpo_pythia6-9b/b5853278-edd9-4bc8-bbeb-d6dab515b562.json delete mode 100644 data/reward-bench/ContextualAI/archangel_sft-kto_llama13b/74188e30-1e49-47d8-af01-b80e430dafa0.json delete mode 100644 data/reward-bench/ContextualAI/archangel_sft-kto_llama30b/93974286-0497-46a2-a2e8-404c1e89dba0.json delete mode 100644 data/reward-bench/ContextualAI/archangel_sft-kto_llama7b/02c0020c-7d69-4701-a606-4bc79ad87afd.json delete mode 100644 data/reward-bench/ContextualAI/archangel_sft-kto_pythia1-4b/5dcb7c54-64e7-4f76-8903-8f57b35cdb0c.json delete mode 100644 data/reward-bench/ContextualAI/archangel_sft-kto_pythia12-0b/4887256e-0545-40dd-9756-ff850e003a29.json delete mode 100644 data/reward-bench/ContextualAI/archangel_sft-kto_pythia2-8b/d2b70870-9cbc-4666-bbd4-097fcebe716e.json delete mode 100644 data/reward-bench/ContextualAI/archangel_sft-kto_pythia6-9b/f420f432-2291-40a9-8ebd-b91241970113.json delete mode 100644 data/reward-bench/Databricks-Mosaic-Research/PGRM/02e68d1b-86f3-4344-ad8d-45df878b744c.json delete mode 100644 data/reward-bench/HFXM/RAMO-Llama3.1-8B/f712ab4a-1127-44ba-b6b9-7a40290f3322.json delete mode 100644 data/reward-bench/HuggingFaceH4/starchat2-15b-v0.1/b4175f0f-f9f4-4418-b4aa-a31e7f1f93f4.json delete mode 100644 data/reward-bench/HuggingFaceH4/zephyr-7b-alpha/9879e9a7-ddbc-4338-abc7-e3bc394869e9.json delete mode 100644 data/reward-bench/HuggingFaceH4/zephyr-7b-beta/d7d8a5cb-e295-4ced-b528-d99d814ff008.json delete mode 100644 data/reward-bench/HuggingFaceH4/zephyr-7b-gemma-v0.1/bff86a1f-71c3-4f27-aeae-bba6d03635ef.json delete mode 100644 data/reward-bench/IDEA-CCNL/Ziya-LLaMA-7B-Reward/723281f8-54b7-4db6-8253-5a6dcf4f3d4a.json delete mode 100644 data/reward-bench/LxzGordon/URM-LLaMa-3-8B/0ce7dc54-f608-4985-9904-75cee09b6288.json delete mode 100644 data/reward-bench/LxzGordon/URM-LLaMa-3.1-8B/5bb0aaa4-2cc5-4622-8235-993bc4178f12.json delete mode 100644 data/reward-bench/LxzGordon/URM-LLaMa-3.1-8B/85ab22b8-0587-4e2b-857f-3d6d84d571a4.json delete mode 100644 data/reward-bench/NCSOFT/Llama-3-OffsetBias-8B/37aa6702-b2fa-43bf-b5a9-36740f627217.json delete mode 100644 data/reward-bench/NCSOFT/Llama-3-OffsetBias-RM-8B/57f48d0c-e424-410d-b9ee-4707e2add036.json delete mode 100644 data/reward-bench/NCSOFT/Llama-3-OffsetBias-RM-8B/8643b4dd-e18c-442c-adb5-84ef756534f8.json delete mode 100644 
data/reward-bench/Nexusflow/Starling-RM-34B/2f3d2e46-1f9e-4b1c-9729-ab0a93cc245c.json delete mode 100644 data/reward-bench/Nexusflow/Starling-RM-34B/4aec78d3-a38c-48e0-b9e2-b6dc063bd37e.json delete mode 100644 data/reward-bench/NousResearch/Hermes-3-Llama-3.1-70B/f9b60945-8b14-4564-9d44-3eb6db675ab9.json delete mode 100644 data/reward-bench/NousResearch/Nous-Hermes-2-Mistral-7B-DPO/56703c11-eccb-4f66-af13-60f972a5068f.json delete mode 100644 data/reward-bench/NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO/fbd8be7e-5670-4729-a77d-83472510b734.json delete mode 100644 data/reward-bench/OpenAssistant/oasst-rm-2-pythia-6.9b-epoch-1/2e18ee77-9c46-4cf9-9521-303ad15e5be4.json delete mode 100644 data/reward-bench/OpenAssistant/oasst-rm-2-pythia-6.9b-epoch-1/ec5b296e-03e8-4371-a8c1-eca0b0b9759d.json delete mode 100644 data/reward-bench/OpenAssistant/oasst-rm-2.1-pythia-1.4b-epoch-2.5/07b61a55-a8e3-4a6f-9806-a4100f8d5297.json delete mode 100644 data/reward-bench/OpenAssistant/oasst-rm-2.1-pythia-1.4b-epoch-2.5/3d534c25-5016-44de-9c47-24b7d7399b0f.json delete mode 100644 data/reward-bench/OpenAssistant/reward-model-deberta-v3-large-v2/4de91433-05b3-4f88-9d0f-66691c671f62.json delete mode 100644 data/reward-bench/OpenAssistant/reward-model-deberta-v3-large-v2/dc71f1ba-f4b8-4231-ac72-0acf9a22d73e.json delete mode 100644 data/reward-bench/PKU-Alignment/beaver-7b-v1.0-cost/36c4adc9-c2fb-4bc3-81ba-88478d30332e.json delete mode 100644 data/reward-bench/PKU-Alignment/beaver-7b-v1.0-cost/f0827b15-20d0-4986-b5a0-bb4bc9be768e.json delete mode 100644 data/reward-bench/PKU-Alignment/beaver-7b-v1.0-reward/aeaa8b33-e327-4c65-9641-5dfc63feee3b.json delete mode 100644 data/reward-bench/PKU-Alignment/beaver-7b-v1.0-reward/c97c79f3-fd92-49db-9131-5e45834a7eaf.json delete mode 100644 data/reward-bench/PKU-Alignment/beaver-7b-v2.0-cost/687099cb-c1bf-49ec-a902-329c2b818369.json delete mode 100644 data/reward-bench/PKU-Alignment/beaver-7b-v2.0-cost/8da4f5eb-6264-4503-b9bc-fcf843b638be.json delete mode 100644 data/reward-bench/PKU-Alignment/beaver-7b-v2.0-reward/28a68b87-5412-4374-9e61-896b0fff7669.json delete mode 100644 data/reward-bench/PKU-Alignment/beaver-7b-v2.0-reward/3209c869-03c5-4801-8e4b-4c8bcde3d58f.json delete mode 100644 data/reward-bench/PoLL/gpt-3.5-turbo-0125_claude-3-sonnet-2024022.../9d1e124c-e133-41d3-8ac7-5c8c5027aa02.json delete mode 100644 data/reward-bench/Qwen/Qwen1.5-0.5B-Chat/633d499b-58bd-4fca-9b56-0f005a5a21b8.json delete mode 100644 data/reward-bench/Qwen/Qwen1.5-1.8B-Chat/5c4f3caf-6af3-48c6-83e2-4710d31e6acf.json delete mode 100644 data/reward-bench/Qwen/Qwen1.5-14B-Chat/77d1edc1-fb54-4371-bf7c-baebbb351163.json delete mode 100644 data/reward-bench/Qwen/Qwen1.5-4B-Chat/e7eecdb0-bc17-4d9f-b3e8-9ee777d2f595.json delete mode 100644 data/reward-bench/Qwen/Qwen1.5-72B-Chat/3f3915b3-0d6e-451c-9185-fa4372b93f2b.json delete mode 100644 data/reward-bench/Qwen/Qwen1.5-7B-Chat/e534d37b-3009-4a7d-82d8-d7c85b95649e.json delete mode 100644 data/reward-bench/Qwen/Qwen1.5-MoE-A2.7B-Chat/bd8f0ed1-75fc-48c1-996e-655d205c027c.json delete mode 100644 data/reward-bench/Qwen/WorldPM-72B/e9effaf6-e48b-4b35-b035-430be81b316b.json delete mode 100644 data/reward-bench/R-I-S-E/RISE-Judge-Qwen2.5-32B/d2132eea-eb88-41e5-b8e6-2e8e8a623ed1.json delete mode 100644 data/reward-bench/R-I-S-E/RISE-Judge-Qwen2.5-7B/ffd05bc7-3724-40ba-85b9-c25ebe71fba2.json delete mode 100644 data/reward-bench/RLHFlow/ArmoRM-Llama3-8B-v0.1/43f0e93d-f0b8-46af-a549-e1ac315d96ea.json delete mode 100644 
data/reward-bench/RLHFlow/ArmoRM-Llama3-8B-v0.1/9ccab7bd-d2ed-4ab3-ad81-656650c29a3b.json delete mode 100644 data/reward-bench/RLHFlow/LLaMA3-iterative-DPO-final/c10d4213-f1fa-41e6-92d9-0d5337c1362b.json delete mode 100644 data/reward-bench/RLHFlow/RewardModel-Mistral-7B-for-DPA-v1/63b08ba0-eeb9-48ae-a5d1-d7d3792aa1c0.json delete mode 100644 data/reward-bench/RLHFlow/pair-preference-model-LLaMA3-8B/d724076d-509f-4ad4-894c-976b0472de85.json delete mode 100644 data/reward-bench/Ray2333/GRM-Gemma-2B-rewardmodel-ft/54d34f25-1cd9-4995-8e56-c36981842fc8.json delete mode 100644 data/reward-bench/Ray2333/GRM-Gemma-2B-sftreg/63ae1c75-fd4d-4f40-afd0-b9f91d700014.json delete mode 100644 data/reward-bench/Ray2333/GRM-Llama3-8B-rewardmodel-ft/1d5ebbce-8cfe-446b-82c0-a227d4e9247f.json delete mode 100644 data/reward-bench/Ray2333/GRM-Llama3-8B-rewardmodel-ft/3f9c81ac-5c76-43b4-a27d-7eaa055139c4.json delete mode 100644 data/reward-bench/Ray2333/GRM-gemma2-2B-rewardmodel-ft/680098fb-76cf-47b6-a0ea-a1a06ca46dca.json delete mode 100644 data/reward-bench/Ray2333/GRM-gemma2-2B-rewardmodel-ft/6ec21338-9908-4ce4-a1f2-dac14c5e27ab.json delete mode 100644 data/reward-bench/Ray2333/GRM-llama3-8B-distill/592ad1e3-8a48-4c39-8013-81d7c731780f.json delete mode 100644 data/reward-bench/Ray2333/GRM-llama3-8B-distill/5b36f0af-7ff6-4564-9714-08fbf41d261f.json delete mode 100644 data/reward-bench/Ray2333/GRM-llama3-8B-sftreg/04f120c6-b648-4c83-81d8-05118efb0904.json delete mode 100644 data/reward-bench/Ray2333/GRM-llama3-8B-sftreg/c907e494-ab2e-4a28-a28d-aeb68eb818ed.json delete mode 100644 data/reward-bench/Ray2333/GRM-llama3.2-3B-rewardmodel-ft/d9eed240-ebbe-482f-8dae-c5251ed6d067.json delete mode 100644 data/reward-bench/Ray2333/Gemma-2B-rewardmodel-baseline/670865e1-f219-465b-9fbe-6da6f73ac9e6.json delete mode 100644 data/reward-bench/Ray2333/Gemma-2B-rewardmodel-ft/88953298-b63e-499f-a31e-f0f586c4772d.json delete mode 100644 data/reward-bench/Ray2333/reward-model-Mistral-7B-instruct-Unifie.../3acb690c-ffc0-4e67-8ae1-e79bcee4f824.json delete mode 100644 data/reward-bench/SF-Foundation/TextEval-Llama3.1-70B/6ad2cb6a-f9a3-424e-aed2-9493899872e3.json delete mode 100644 data/reward-bench/SF-Foundation/TextEval-OffsetBias-12B/1892bf75-916b-4d4f-96ab-fda36872ae5d.json delete mode 100644 data/reward-bench/Salesforce/SFR-LLaMa-3.1-70B-Judge-r/e06e1863-c28f-4c96-a672-b1073c80aa71.json delete mode 100644 data/reward-bench/Salesforce/SFR-LLaMa-3.1-8B-Judge-r/d923f7aa-a9d4-406a-b5d7-bdab508f04f7.json delete mode 100644 data/reward-bench/Salesforce/SFR-nemo-12B-Judge-r/5c5e40b1-e86a-4d30-b93c-f8f9e73cdca8.json delete mode 100644 data/reward-bench/Schrieffer/Llama-SARM-4B/59299d8c-e468-490f-8a52-eef49b0aaeea.json delete mode 100644 data/reward-bench/ShikaiChen/LDL-Reward-Gemma-2-27B-v0.1/3ce9612f-9b57-476e-9fa4-6e63f14568a7.json delete mode 100644 data/reward-bench/ShikaiChen/LDL-Reward-Gemma-2-27B-v0.1/9c605bf1-2533-43db-a610-e71c0aaecdb5.json delete mode 100644 data/reward-bench/Skywork/Skywork-Critic-Llama-3.1-70B/c289f778-92b8-44df-a079-3bced33c8ab5.json delete mode 100644 data/reward-bench/Skywork/Skywork-Critic-Llama-3.1-8B/329d4101-e740-490c-9fbc-1708f76a2f61.json delete mode 100644 data/reward-bench/Skywork/Skywork-Reward-Gemma-2-27B-v0.2/3e87f52e-b136-4cb3-8cbb-d8d8a8571051.json delete mode 100644 data/reward-bench/Skywork/Skywork-Reward-Gemma-2-27B-v0.2/62b9adca-db38-46c0-a68a-ed7a8e735035.json delete mode 100644 data/reward-bench/Skywork/Skywork-Reward-Gemma-2-27B/4d2f43eb-e6f3-4686-a9d9-6b6c6b68b86c.json delete mode 
delete mode 100644 data/reward-bench/Skywork/Skywork-Reward-Gemma-2-27B/830df3fd-d479-4af8-a92b-93d82e804fec.json
delete mode 100644 data/reward-bench/Skywork/Skywork-Reward-Llama-3.1-8B-v0.2/0e6d85b8-aa37-448c-adb2-0da2bd13e322.json
delete mode 100644 data/reward-bench/Skywork/Skywork-Reward-Llama-3.1-8B-v0.2/45f0bd9c-e939-4b83-a623-1db61f431500.json
delete mode 100644 data/reward-bench/Skywork/Skywork-Reward-Llama-3.1-8B/0f710903-7dd8-44ea-914d-d43bbfe894f1.json
delete mode 100644 data/reward-bench/Skywork/Skywork-Reward-Llama-3.1-8B/b9ddd960-f6f7-4962-8297-88ec7fbbbd1f.json
delete mode 100644 data/reward-bench/Skywork/Skywork-Reward-V2-Llama-3.1-8B/25a4520b-c780-45fc-a00f-36db1776c6a8.json
delete mode 100644 data/reward-bench/Skywork/Skywork-Reward-V2-Llama-3.2-1B/96d7e5c1-2f43-4f09-9702-0af090afa141.json
delete mode 100644 data/reward-bench/Skywork/Skywork-Reward-V2-Llama-3.2-3B/5a47f8bd-401a-4b6b-91b0-9593b36e5996.json
delete mode 100644 data/reward-bench/Skywork/Skywork-Reward-V2-Qwen3-0.6B/c27e98d4-f5ea-48f9-babc-3ccda2d21d2a.json
delete mode 100644 data/reward-bench/Skywork/Skywork-Reward-V2-Qwen3-1.7B/060bf847-e7b5-4e30-934f-5306d01c499a.json
delete mode 100644 data/reward-bench/Skywork/Skywork-Reward-V2-Qwen3-4B/e648e6c2-18bb-49d7-b08f-47ce41a67d4f.json
delete mode 100644 data/reward-bench/Skywork/Skywork-Reward-V2-Qwen3-8B/537e92cb-25db-47f5-916a-6f666e14639a.json
delete mode 100644 data/reward-bench/Skywork/Skywork-VL-Reward-7B/e59ca33f-c6ce-44d4-9cb4-2fd65608313b.json
delete mode 100644 data/reward-bench/Skywork/Skywork-VL-Reward-7B/fc99848b-82c7-459e-8327-1867a332ff28.json
delete mode 100644 data/reward-bench/SultanR/SmolTulu-1.7b-RM/357f4f03-9542-495f-b575-4274111bbe1f.json
delete mode 100644 data/reward-bench/ZiyiYe/Con-J-Qwen2-7B/d78c42d6-fc0d-4719-bbb6-7a53dbb0d017.json
delete mode 100644 data/reward-bench/ai2/llama-2-chat-7b-nectar-3.8m.json/c94ddbe5-2bc0-4a33-b06b-10671fb22b70.json
delete mode 100644 data/reward-bench/ai2/llama-2-chat-nectar-180k.json/cc2ac405-1710-46fa-aeba-dd86797c666c.json
delete mode 100644 data/reward-bench/ai2/llama-2-chat-ultrafeedback-60k.jsonl/49fcb3e2-2883-4c3d-b519-d511c6b10162.json
delete mode 100644 data/reward-bench/ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check.../0ba5ce6c-f311-4b02-a67a-d49539119a8e.json
delete mode 100644 data/reward-bench/ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check.../49029c9e-a831-4219-8e26-df20862ad3e1.json
delete mode 100644 data/reward-bench/ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check.../6dedd117-eab0-4c31-b50b-4890099d9904.json
delete mode 100644 data/reward-bench/ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check.../71c20c06-efb8-428e-9e9d-e4fedf11041a.json
delete mode 100644 data/reward-bench/ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check.../862f3d57-8f5f-4372-b6fb-876fb35efba4.json
delete mode 100644 data/reward-bench/ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check.../93ea2bfa-e058-42d5-afac-0d3fc50fce91.json
delete mode 100644 data/reward-bench/ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check.../c1331fa1-7793-4526-b24b-02261bb4437f.json
delete mode 100644 data/reward-bench/ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check.../c3cab72a-47b3-47ec-bb2d-986903ab8c26.json
delete mode 100644 data/reward-bench/ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check.../cd0452a7-0370-4024-a51f-b3deff290db9.json
delete mode 100644 data/reward-bench/ai2/tulu-2-7b-rm-v0-nectar-binarized-700k.json/6fd85045-d600-451f-8d27-da637add4081.json
delete mode 100644 data/reward-bench/ai2/tulu-2-7b-rm-v0-nectar-binarized.json/a15ca8c3-fd90-4ef9-80c5-40eeac60d785.json
delete mode 100644 data/reward-bench/ai2/tulu-2-7b-rm-v0.json/5f43832f-14fa-49e1-a851-949163aec826.json
delete mode 100644 data/reward-bench/allenai/Llama-3.1-70B-Instruct-RM-RB2/1f8869e7-e434-469e-906d-d34621582cba.json
delete mode 100644 data/reward-bench/allenai/Llama-3.1-70B-Instruct-RM-RB2/8f9d05db-9bb0-4998-bc75-96dbfa695548.json
delete mode 100644 data/reward-bench/allenai/Llama-3.1-8B-Base-RM-RB2/2681e475-da0a-48a9-ab68-e0bf59240f90.json
delete mode 100644 data/reward-bench/allenai/Llama-3.1-8B-Base-RM-RB2/e2986d78-100d-417a-9f38-9a570a335d95.json
delete mode 100644 data/reward-bench/allenai/Llama-3.1-8B-Instruct-RM-RB2/1bc5cd51-5a3a-46ea-bc78-56f9b3081f69.json
delete mode 100644 data/reward-bench/allenai/Llama-3.1-8B-Instruct-RM-RB2/1d1127ee-7a0e-4915-b8bf-0b22f8ba338b.json
delete mode 100644 data/reward-bench/allenai/Llama-3.1-Tulu-3-70B-SFT-RM-RB2/4bb55ff5-5adf-407f-a9d6-910c6c9d2770.json
delete mode 100644 data/reward-bench/allenai/Llama-3.1-Tulu-3-70B-SFT-RM-RB2/daebee0b-3856-4270-94c6-c14bd84f5cf5.json
delete mode 100644 data/reward-bench/allenai/Llama-3.1-Tulu-3-8B-DPO-RM-RB2/1be99417-352e-4a94-8108-b43123553667.json
delete mode 100644 data/reward-bench/allenai/Llama-3.1-Tulu-3-8B-DPO-RM-RB2/8d3fbc68-2ee7-4989-a40c-f4a45e579b5c.json
delete mode 100644 data/reward-bench/allenai/Llama-3.1-Tulu-3-8B-RL-RM-RB2/9533891f-c2f7-4e82-9f39-131768dbc28a.json
delete mode 100644 data/reward-bench/allenai/Llama-3.1-Tulu-3-8B-RL-RM-RB2/b8a47660-f0a5-4136-a743-979863c53e3a.json
delete mode 100644 data/reward-bench/allenai/Llama-3.1-Tulu-3-8B-RM/2673bea2-42eb-42a5-9dc2-13d43341c9b2.json
delete mode 100644 data/reward-bench/allenai/Llama-3.1-Tulu-3-8B-SFT-RM-RB2/6f5555c2-588a-48d1-811c-be53634bbdef.json
delete mode 100644 data/reward-bench/allenai/Llama-3.1-Tulu-3-8B-SFT-RM-RB2/9c96fa7b-52e8-4aed-9fdd-f389091d5e6f.json
delete mode 100644 data/reward-bench/allenai/OLMo-7B-Instruct/0519d9fb-f220-40ab-8257-f20ed98a8b47.json
delete mode 100644 data/reward-bench/allenai/llama-3-tulu-2-70b-uf-mean-rm/ece70375-447f-41e8-aa03-8f4b26abea73.json
delete mode 100644 data/reward-bench/allenai/llama-3-tulu-2-8b-uf-mean-rm/7bbaffdd-f822-48cf-a0f2-e66b16db678d.json
delete mode 100644 data/reward-bench/allenai/llama-3-tulu-2-dpo-70b/27c5c441-64ce-41dd-8384-f84c8f6ccc14.json
delete mode 100644 data/reward-bench/allenai/llama-3-tulu-2-dpo-8b/38a14e6a-2094-4e0b-be22-45181ede2a63.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1739590997/cee37c2c-2766-47b7-9192-a141e5d22f2d.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1739871066/d1d69392-8717-462d-9ce0-c7ddf5faf97d.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1739925892/72071bb1-57c0-4727-8100-ba24d8da10f5.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1739943850/7626c158-edaf-48f3-9ac3-1188be0c6032.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1739943881/c37be7a8-dc10-4fea-962b-202986a4581e.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1739943972/223dc616-b20f-4065-91a7-3c35bfd11c94.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1739957701/4236b0a9-9d1e-41f6-8364-a7e8ebf51635.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1739971507/c8030a87-0cdf-4918-b0d5-d1fb0e284656.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1739971529/e6ecc1eb-7ff1-46aa-bf03-37bad1b391b7.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1739998765/64872b1a-1eae-4171-95ec-a80c782b69f0.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1740005072/37484401-c7fe-469d-889a-e70f7cadbf82.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1740129284/8cf36288-3add-4fcd-a012-0df9eae2a059.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1741286813/f2c8f979-c331-4b9b-b0a7-5efa82c17d3b.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1741287363/de409ce8-fb68-4113-8879-23712769cbde.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1741292911/264f20d7-1574-448c-8917-eb3f20810819.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1742338142/0ebaec42-9190-4326-95dd-5ecb48bf1a72.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1742519610/29515933-c60b-4686-b475-70ef53d75457.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1742519628/414174a9-7e44-4f7b-94ce-0757639f5af7.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_1e-6_1_100pctflipped__1__1744241455/48513083-f854-455e-8455-ddbd2698ec03.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_1e-6_1_10pctflipped__1__1743295511/0b373560-854f-4482-81d0-6c984e130144.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_1e-6_1_20pctflipped__1__1743295406/1a021cab-d569-4077-af5e-1643f45de03d.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_1e-6_1_30pctflipped__1__1743325136/e26e230d-59b3-4243-a6c4-3845ab74b89b.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_1e-6_1_50pctflipped__1__1744241398/aa0991d0-9c5e-4f94-bc12-3342ca389e99.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_1e-6_1_5pctflipped__1__1743444535/397abe47-d5e9-487d-b883-ec49db16c584.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_1e-6_1_dpo__1__1743550054/82f52a35-41b5-4b9c-bb3e-4bf18eed0b92.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_1e-6_1_dpo_skyworks__1__1744530271/670382ab-a8a1-43f3-a572-b9a5aeae23ef.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_1e-6_1_dpo_skyworkstulufull__1__1743550181/a4b3c031-7c01-4f7a-8cfe-52b3260d6ecc.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_1e-6_1_rl__1__1743551221/7fcd3fce-2296-4b5c-8362-24b1c70ccb8f.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_1e-6_1_rl_skyworks__1__1744530262/4f164e8b-55a1-498f-b586-cf78da7d0b57.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_1e-6_1_rl_skyworkstulufull__1__1743551523/a84d3d61-6e05-4d4d-bc89-7f663e9667fb.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_1e-6_1_skyworkstulumix__1__1743205750/7aa98f71-8262-4c1f-a71c-1ef36f2ef04c.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_1e-6_2_10pctflipped__1__1743295427/93398c1f-3129-4be4-83b5-62a4a45c6b84.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_1e-6_2_20pctflipped__1__1743295446/62493784-f899-4736-bdce-2107ec99a752.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_1e-6_2_30pctflipped__1__1743325094/9b68ecaa-cf9d-414e-9cf1-c662c765bb5c.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_1e-6_2_5pctflipped__1__1743444636/76f3d0bd-2b71-4406-a0d4-b01b6c91c4ff.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_1e-6_2_dpo__1__1743549325/2dc5ab6f-2427-42ae-9582-a0e6139f451a.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_1e-6_2_rl__1__1743551238/0db97be6-6562-47d8-bd1a-5b469250e54b.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_1e-6_2_skyworkstulumix__1__1743205906/228e4dc4-e517-4023-b690-7f0c321286b2.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_2e-5_1_100pctflipped__1__1744241529/9442b27c-c94d-41c0-a752-3bd82385272d.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_2e-5_1_10pctflipped__1__1743295305/561039ac-b156-40eb-bf53-21a275b858ca.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_2e-5_1_20pctflipped__1__1743324778/d801d700-7b4d-4a62-883b-3d85b05385ea.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_2e-5_1_30pctflipped__1__1743326459/b8f24058-4441-4d19-898e-80470cc7b685.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_2e-5_1_5pctflipped__1__1743443747/1f372e00-e7a8-43ef-8e14-ef1b08e5e957.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_2e-5_1_skyworkstulumix__1__1743205935/0200a1b3-71f1-4633-96a5-4ca9883a67a7.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_2e-5_2_10pctflipped__1__1743295360/55479901-aec7-4875-b792-ba73b54aa37a.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_2e-5_2_20pctflipped__1__1743295366/872597b2-4392-4f23-b5b2-41d418b6cf89.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_2e-5_2_30pctflipped__1__1743326352/5cb437b5-5993-418d-bd9f-81dea71d9edf.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_2e-5_2_5pctflipped__1__1743444634/c471cdf7-73f9-48c9-a970-baa66b609093.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_2e-5_2_skyworkstulumix__1__1743205988/794a71b4-8a43-4c69-a663-369eea6a84a3.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_100pctflipped__1__1744242103/2ad22375-4ed8-4be6-a012-a6f6799581e2.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_10pctflipped__1__1743324835/a8df0dc2-d16c-4e1a-b0b5-abe2a4a1d803.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_1pctflipped__1__1743445221/ca0a010a-fe3a-4b87-8c80-4a8d3e2597fb.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_20pctflipped__1__1743324826/5d1c166c-6a22-4afb-b1b1-f7db9ec38bd8.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_30pctflipped__1__1743326363/10a432fa-dfef-4c9c-bdf7-ce0f81fd1895.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_5pctflipped__1__1743444498/a550663c-2a04-4dfb-8663-b177a7181f3d.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1__2__1743897475/72b6196e-0a2b-4ec9-80a3-a7eb14f7be09.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1__3__1744311421/5e41f068-f009-4e32-bac1-9de5220a2ce2.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_dpo__1__1743549903/eca1331f-6503-481a-b77b-3d96791f54e8.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_dpo_skyworks__1__1744530368/69def7de-a916-4d23-984b-e676e91e1d8c.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_dpo_skyworkstulufull__1__1743550182/679c6e0b-9e0b-4224-b1e3-59df149739a0.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_no_if__2__1744316012/2335433d-37c6-47f0-ad3b-5e0a42e9488f.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_no_if__3__1744315765/fe84f8a3-5fe9-4385-b6d4-0436fb7e5197.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_rl__1__1743551527/70d2697e-0df5-40ae-9268-b906c9cabd9d.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_rl_skyworks__1__1744530236/0a30fd70-2381-4a4b-89aa-dbd169c856f0.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_rl_skyworkstulufull__1__1743551530/b9c787f9-3bcd-4215-a157-7fcfa2df82cc.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_skyworkstulu75__1__1743534417/bdd98f27-fbfd-4de7-bd4e-3b8c3e4e7cc0.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_skyworkstulufull__1__1743446486/44b20109-d534-4aa9-867d-fa59935ef6d0.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_skyworkstulufull__2__1744314745/d1196312-4153-4a38-aa46-2940d63d7924.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_skyworkstulufull__3__1744311661/4b1e3070-04ef-47e7-b720-739320194e7b.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_skyworkstulumix__1__1743204472/247f400e-dca8-4dab-bebf-092f778f02c9.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_3e-6_2_10pctflipped__1__1743295267/d043ad21-102b-49f0-9e8e-6daef7cc3a2e.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_3e-6_2_1pctflipped__1__1743445759/d45ec8b8-1ee6-49bb-9237-a7271ba9d13c.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_3e-6_2_20pctflipped__1__1743324905/05a4c6aa-9af2-44f0-8c55-8aeed2e75eaf.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_3e-6_2_30pctflipped__1__1743326363/a6ef712e-014e-470e-8d5b-f3b51f677aee.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_3e-6_2_5pctflipped__1__1743444505/35a039ba-06be-4ec2-9bde-a6a6db2eefec.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_3e-6_2_dpo__1__1743550180/97cb96f8-ce4c-403f-bfbc-386d3c611c81.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_3e-6_2_dpo_skyworkstulufull__1__1743550187/3a1621e9-75ee-4b34-9c0d-ae15399b1dab.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_3e-6_2_rl__1__1743551509/237218ac-4c74-4647-82b1-700360ddfdbd.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_3e-6_2_rl_skyworkstulufull__1__1743551498/2858d126-d2ef-4512-8fc8-c39faf24b908.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_3e-6_2_skyworkstulu75__1__1743548926/d118ddb1-aafc-4ddf-b5c7-f3ff921bbe0c.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_3e-6_2_skyworkstulumix__1__1743205661/379ec82f-a6a7-4976-a4a6-ab80cb9da293.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_llama70b_skyworkstulufull__4__1747266598/c4df42d1-a838-4717-a814-40559fcd7342.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_llama70b_skyworkstulufull__8__1745387923/f022d826-3252-4def-b37b-3ce44d78f4ce.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_llama_1e-6_1__1__1743896628/cecc321b-efbd-434e-8a31-a97bbb8bbb3b.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_llama_1e-6_1_skyworks__1__1744062999/278c2132-3415-48f4-a839-ed09d71e9240.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_llama_1e-6_1_skyworkstulufull__1__1743712777/92bbda1a-ecb1-493d-aa39-a29522c1a11e.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_llama_1e-6_2__1__1743896638/f43b2dff-9e73-4779-86e0-b2cc30ae8b40.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_llama_1e-6_2_skyworkstulufull__1__1743800938/59a98f5d-d017-4b1a-a563-5abd113337e9.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_llama_2e-5_1_skyworkstulufull__1__1743712885/a41597ed-fbab-41af-9625-c277ca988546.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_llama_2e-5_2_skyworkstulufull__1__1743800773/e311eb59-f217-4bc2-b69b-dcea434797a8.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_llama_2e-6_1_skyworkstulufull__1__1743893867/69b037c3-bae2-4889-b10d-e732c45851e9.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_llama_3e-6_1__1__1743929424/adeee000-0b62-4a0c-afaa-5e8c5f29ff6d.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_llama_3e-6_1__2__1744311395/4464d588-62b2-440b-8188-2450bd7a94c5.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_llama_3e-6_1__3__1744311491/bf358648-a41d-43ee-8c14-f8b8eef41871.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_llama_3e-6_1_skyworks__1__1744062787/afd99f12-f739-40d3-aa11-ef3a45316931.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_llama_3e-6_1_skyworkstulufull__2__1744311461/49b4a24b-ddf1-47f0-ba39-9366892a1213.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_llama_3e-6_1_skyworkstulufull__3__1744311780/ea14a487-39c3-488b-b52b-998e57135487.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_llama_3e-6_2__1__1743896489/02f74b6a-7f63-484e-a7c1-0c53bd801b87.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_llama_3e-6_2_skyworkstulufull__1__1743800713/e492c59d-4b03-4dce-983e-a8724de35a60.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_llama_4e-6_1_skyworkstulufull__1__1743893911/53de0394-8516-4882-b2bc-c7e62e3d8ef0.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_llamabase_1e-6_1_skyworkstulufull__1__1745386412/56d4c1c5-5238-45dc-8331-64a14b830779.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_llamabase_1e-6_2_skyworkstulufull__1__1745441922/7003c9d4-c758-4373-a7a3-04822978bf35.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_llamabase_2e-5_1_skyworkstulufull__1__1745386495/75a7dcb6-789c-49de-b209-4cf7d27465e4.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_llamabase_2e-5_2_skyworkstulufull__1__1745386507/e91d3910-4f20-4e82-b1fb-8605f5d2b8ac.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_llamabase_3e-6_1_skyworkstulufull__1__1745386507/f18bfd44-3097-4eb8-a09c-2372c3ecd738.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_qwen32b_1e-6_skyworkstulufull__8__1748235917/9ca974b9-c5fb-4fc4-ab3e-1246e31ecdb2.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_qwen32b_3e-6_skyworkstulufull__8__1748288961/fb1ab5e0-18db-4e5f-add3-2352d9a1f260.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_qwen_1e-6_1_skyworks__1__1744062830/60ba1f0d-7e85-49e4-8c73-330d74de6707.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_qwen_1e-6_1_skyworks__2__1744576024/29d1c194-8b87-466c-8701-e0fcf267665c.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_qwen_1e-6_1_skyworkstulufull__1__1743712914/31e8f616-7b64-4d1a-b395-20bf8bb4629c.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_qwen_2e-5_1_skyworkstulufull__1__1743713091/cc3f315d-3cea-47e4-83b4-b5045e778c5e.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_qwen_3e-6_1_skyworks__1__1744062829/5d20dbf8-bb14-46af-adcd-b7ba05f8352c.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_qwen_3e-6_1_skyworks__2__1744576050/06f2cb33-3937-4fde-84e2-6b5467f051c6.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_qwen_3e-6_1_skyworkstulufull__1__1743712916/f35c4efa-3767-4a0e-8769-06230cda2512.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_qwen_3e-6_2__1__1743023576/6cb65d6a-6c46-4991-8154-f28b101954f6.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_qwen_3e-6_3__1__1743023619/6e15a49b-7dc4-4d69-965e-cb962c084e4a.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_qwenbase_1e-6_1_skyworkstulufull__1__1745388583/9f5591f4-751d-48d3-a348-4bb59f6bb1a3.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_qwenbase_1e-6_2_skyworkstulufull__1__1745388604/b609c002-fa0a-46a8-b5a1-9213ee89606c.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_qwenbase_2e-5_1_skyworkstulufull__1__1745388738/b147fc7f-0e31-49ca-abfd-ba990a925097.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_qwenbase_2e-5_2_skyworkstulufull__1__1745388191/e4fbfe23-2b70-459e-821b-db0116d43d8c.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_qwenbase_3e-6_1_skyworkstulufull__1__1745388737/2ab7dc14-af3e-4fb2-8c0c-fe0e14100321.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_qwenbase_3e-6_2_skyworkstulufull__1__1745388138/aca2c665-79f2-4226-b806-307be277ed08.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_tulu3_70b_1__8__1742924455/d37a63df-6d38-4083-bf87-11064162efde.json
delete mode 100644 data/reward-bench/allenai/open_instruct_dev-rm_tulu3_70b_2__8__1742982964/16e550cc-e59d-4aaa-b221-8cf71e1b26d2.json
delete mode 100644 data/reward-bench/allenai/tulu-2-dpo-13b/47058e2a-dc41-45f8-8c32-bc496a8d3bc5.json
delete mode 100644 data/reward-bench/allenai/tulu-2-dpo-70b/7199c8b3-8346-4200-b07e-4362ad13a7db.json
delete mode 100644 data/reward-bench/allenai/tulu-2-dpo-7b/de7e59d5-e2ce-4479-bbd9-ab9deb3beed3.json
delete mode 100644 data/reward-bench/allenai/tulu-v2.5-13b-preference-mix-rm/17e011c3-1a53-40ae-b7b4-cb24c23df3de.json
delete mode 100644 data/reward-bench/allenai/tulu-v2.5-13b-uf-rm/1125dd05-2f0d-48ca-825c-f5efa18564aa.json
delete mode 100644 data/reward-bench/allenai/tulu-v2.5-70b-preference-mix-rm/88014e0d-e89b-4fed-9eb6-5276bd7658df.json
delete mode 100644 data/reward-bench/allenai/tulu-v2.5-70b-uf-rm/7cc9bfc2-570d-456c-918f-68fd4b711f05.json
delete mode 100644 data/reward-bench/berkeley-nest/Starling-RM-7B-alpha/77b0957f-8779-4dbe-a6ea-cff50c4ee73b.json
delete mode 100644 data/reward-bench/facebook/Self-taught-Llama-3-70B/ba0ce7ce-a755-4337-bfec-0391680d3625.json
delete mode 100644 data/reward-bench/facebook/Self-taught-evaluator-llama3.1-70B/4eb460eb-b3ad-4e0d-b131-5b59ef54015c.json
delete mode 100644 data/reward-bench/general-preference/GPM-Gemma-2B/6868a1e5-ee86-4f89-8452-5e939ac19169.json
delete mode 100644 data/reward-bench/general-preference/GPM-Llama-3.1-8B/4a151d43-5fac-4afe-9c23-ba0e86a60849.json
delete mode 100644 data/reward-bench/google/flame-1.0-24B-july-2024/5f16d574-adef-4016-abcf-9e7936771ba7.json
delete mode 100644 data/reward-bench/google/gemini-1.5-flash-001/f3e0300f-39ed-4cfd-bd03-218904836037.json
delete mode 100644 data/reward-bench/google/gemini-1.5-flash-8b/42c82c00-b74e-4152-a222-15d481a13e0c.json
delete mode 100644 data/reward-bench/google/gemini-1.5-pro-0514/68096be8-c49f-4a23-824e-1275248369f7.json
delete mode 100644 data/reward-bench/google/gemini-1.5-pro-0924/c91270bd-3731-452a-b429-6cd4943d1194.json
delete mode 100644 data/reward-bench/google/gemini-2.5-flash-preview-04-17/337c7a43-46a7-4acb-b7f1-936e1f2cf46f.json
delete mode 100644 data/reward-bench/google/gemini-2.5-flash/3b00f881-8f73-4608-8cbb-846fe7d1cfea.json
delete mode 100644 data/reward-bench/google/gemini-2.5-pro-preview-05-06/2821dfdc-291b-405e-bd81-cf536c802885.json
delete mode 100644 data/reward-bench/google/gemini-2.5-pro/7d441240-7e85-4776-b51c-3c1bc84456ba.json
delete mode 100644 data/reward-bench/google/gemma-2-27b-it/840d35d9-441e-4ba3-bbc3-1f4ff2627517.json
delete mode 100644 data/reward-bench/hendrydong/Mistral-RM-for-RAFT-GSHF-v0/0127f3c5-9657-4eb6-a77a-5a6476a8fc79.json
delete mode 100644 data/reward-bench/hendrydong/Mistral-RM-for-RAFT-GSHF-v0/b72e2988-75e4-4d26-9a47-daae4786b02f.json
delete mode 100644 data/reward-bench/infly/INF-ORM-Llama3.1-70B/643cf5a3-8992-4126-87c9-814887314266.json
delete mode 100644 data/reward-bench/infly/INF-ORM-Llama3.1-70B/f81f1f67-6506-481f-87ce-a17a6a7578f3.json
delete mode 100644 data/reward-bench/internlm/internlm2-1_8b-reward/32b35218-a099-410e-8a65-a0d6e2f380a6.json
delete mode 100644 data/reward-bench/internlm/internlm2-1_8b-reward/deec1e7c-0cb8-4e6f-b3ac-d37790b709f3.json
delete mode 100644 data/reward-bench/internlm/internlm2-20b-reward/e42a9986-4dcc-4017-be97-8135646c7424.json
delete mode 100644 data/reward-bench/internlm/internlm2-20b-reward/ffc92063-606a-4f31-bfdd-5683aa748ccc.json
delete mode 100644 data/reward-bench/internlm/internlm2-7b-reward/23a5398c-0911-4a66-930d-abada12bf985.json
delete mode 100644 data/reward-bench/internlm/internlm2-7b-reward/80b0bbcb-a57a-453c-8fff-502646520b1d.json
delete mode 100644 data/reward-bench/jondurbin/bagel-dpo-34b-v0.5/e383c939-b952-4fdd-94e3-eb3716691860.json
delete mode 100644 data/reward-bench/llm-blender/PairRM-hf/daf873f9-ab03-49df-96cb-a0f5a8613048.json
delete mode 100644 data/reward-bench/mattshumer/Reflection-70B/f4cff132-3b2f-4e03-bb49-098b16d87cef.json
delete mode 100644 data/reward-bench/meta-llama/Meta-Llama-3-70B-Instruct/f80685de-058c-4ab8-aa35-dc7321d1cea6.json
delete mode 100644 data/reward-bench/meta-llama/Meta-Llama-3-8B-Instruct/c8e4349d-a084-4eb5-990f-403ba930a9ad.json
delete mode 100644 data/reward-bench/meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo/729ca9c0-0680-49f1-97b9-5581be17a352.json
delete mode 100644 data/reward-bench/meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo/fdd4add5-b44d-46f9-8c98-da3120df4161.json
delete mode 100644 data/reward-bench/meta-llama/Meta-Llama-3.1-70B-Instruct/6b5ef643-30dd-4381-b66f-e9ecd6b0d06e.json
delete mode 100644 data/reward-bench/meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo/95271b8c-4135-48bf-bbad-ae94baa37640.json
delete mode 100644 data/reward-bench/meta-metrics/MetaMetrics-RM-v1.0/f437e790-efe1-4dc5-8ccc-5b0bfd800069.json
delete mode 100644 data/reward-bench/mightbe/Better-PairRM/7d0f761a-2650-4029-b1e9-13af2f0cc69d.json
delete mode 100644 data/reward-bench/mistralai/Mixtral-8x7B-Instruct-v0.1/49fc601e-4ac6-4672-a53d-0e89f19959c1.json
delete mode 100644 data/reward-bench/my_model/6195e81a-d5a5-40af-96f6-259252009ad7.json
delete mode 100644 data/reward-bench/nicolinho/QRM-Gemma-2-27B/2dec0f50-d374-4af3-9d27-80fcf50dac2c.json
delete mode 100644 data/reward-bench/nicolinho/QRM-Gemma-2-27B/96722888-0cc9-4dfd-b38d-91f4118c0be2.json
delete mode 100644 data/reward-bench/nicolinho/QRM-Llama3-8B/683abc2a-fce0-4d3d-bdcc-5cac2c76a46a.json
delete mode 100644 data/reward-bench/nicolinho/QRM-Llama3.1-8B-v2/121344ec-61ef-49c5-a74b-b86f605d513e.json
delete mode 100644 data/reward-bench/nicolinho/QRM-Llama3.1-8B-v2/8594f86b-a7f2-4046-a3a7-830d7ac20690.json
delete mode 100644 data/reward-bench/nicolinho/QRM-Llama3.1-8B/c0c5e5e1-801c-48fd-a994-a4a69c0b1213.json
delete mode 100644 data/reward-bench/nvidia/Llama-3.1-Nemotron-70B-Reward/0411ac30-1536-4639-8350-fc11d53298e3.json
delete mode 100644 data/reward-bench/nvidia/Llama3-70B-SteerLM-RM/92281e58-4160-4d76-9119-b38fb47ffd8f.json
delete mode 100644 data/reward-bench/nvidia/Nemotron-4-340B-Reward/43687871-2e19-4d2b-9754-1cb6527496c1.json
delete mode 100644 data/reward-bench/openai/gpt-3.5-turbo-0125/1debe1de-b394-4856-a946-9d14bd867bf6.json
delete mode 100644 data/reward-bench/openai/gpt-4-0125-preview/80c589d2-c1eb-4dcf-8be8-042f4f66b7eb.json
delete mode 100644 data/reward-bench/openai/gpt-4-turbo-2024-04-09/62478772-bb85-4d3f-a916-c3d17db3ee61.json
delete mode 100644 data/reward-bench/openai/gpt-4.1-2025-04-14/a070bae2-c927-418b-91cc-161781c4f5b7.json
delete mode 100644 data/reward-bench/openai/gpt-4.1-mini-2025-04-14/b884c919-a272-4f67-9a09-3d232f56d083.json
delete mode 100644 data/reward-bench/openai/gpt-4.1-nano-2025-04-14/deac33dd-187b-4406-a76a-b33caf417380.json
delete mode 100644 data/reward-bench/openai/gpt-4o-2024-05-13/185bd742-d7d4-4600-86bd-bcda75ed2ebc.json
delete mode 100644 data/reward-bench/openai/gpt-4o-2024-08-06/901e4de6-3ef6-4c2a-873c-cdcc47201974.json
delete mode 100644 data/reward-bench/openai/gpt-4o-2024-08-06/a051d5d6-18e6-483d-a000-4a52a06de676.json
delete mode 100644 data/reward-bench/openai/gpt-4o-mini-2024-07-18/94d77182-8952-4a63-b02b-3d8bd8a8dead.json
delete mode 100644 data/reward-bench/openai/gpt-4o-mini-2024-07-18/9a48d808-0280-4175-a28a-7e9ba8ac6deb.json
delete mode 100644 data/reward-bench/openbmb/Eurus-7b-kto/f0d9f57d-d552-44ea-a91c-751854133316.json
delete mode 100644 data/reward-bench/openbmb/Eurus-RM-7b/561cfba1-856d-4809-b5c7-41481735e1d6.json
delete mode 100644 data/reward-bench/openbmb/Eurus-RM-7b/995d1caf-b735-44dd-adff-875e3203aa46.json
delete mode 100644 data/reward-bench/openbmb/MiniCPM-2B-dpo-fp32/81767043-23c2-4229-b3b5-1c24e470d52a.json
delete mode 100644 data/reward-bench/openbmb/UltraRM-13b/4f6344bc-af30-46f9-b6f8-41ff925d064e.json
delete mode 100644 data/reward-bench/openbmb/UltraRM-13b/abac8640-40be-4eb5-9035-2bf6fd436a7a.json
delete mode 100644 data/reward-bench/opencompass/CompassJudger-1-1.5B-Instruct/6fd972ab-c45f-4ccd-a5cf-4aac5e703342.json
delete mode 100644 data/reward-bench/opencompass/CompassJudger-1-14B-Instruct/8eb1bcf2-a6bd-467c-bc37-090fdb7a9460.json
delete mode 100644 data/reward-bench/opencompass/CompassJudger-1-32B-Instruct/5ad53725-ed5a-41f3-8ff6-7404f3f981db.json
delete mode 100644 data/reward-bench/opencompass/CompassJudger-1-7B-Instruct/ae2d05b4-5e80-4b00-af67-b94609b073eb.json
delete mode 100644 data/reward-bench/prometheus-eval/prometheus-7b-v2.0/592f2811-c197-423e-89d4-e25ee5a324fb.json
delete mode 100644 data/reward-bench/prometheus-eval/prometheus-8x7b-v2.0/17795e7b-e912-440f-a80e-63233d3b6d8c.json
delete mode 100644 data/reward-bench/sfairXC/FsfairX-LLaMA3-RM-v0.1/375cf55f-64f6-42f6-a947-1487feffb196.json
delete mode 100644 data/reward-bench/sfairXC/FsfairX-LLaMA3-RM-v0.1/94d2eddd-f7db-4360-ac58-0af39ce66935.json
delete mode 100644 data/reward-bench/stabilityai/stable-code-instruct-3b/996ca604-e01c-4a95-9286-60b6dc04f67d.json
delete mode 100644 data/reward-bench/stabilityai/stablelm-2-12b-chat/b6f0089f-d04b-4bcd-be84-ce3bc0d6c2b9.json
delete mode 100644 data/reward-bench/stabilityai/stablelm-2-zephyr-1_6b/83e15cba-4fec-48f2-9be4-78decbd96f66.json
delete mode 100644 data/reward-bench/stabilityai/stablelm-zephyr-3b/493617c0-37eb-4c83-b175-2507a3647b5e.json
delete mode 100644 data/reward-bench/stanfordnlp/SteamSHP-flan-t5-large/97f494ce-3c9c-4a19-a237-d458be611a0a.json
delete mode 100644 data/reward-bench/stanfordnlp/SteamSHP-flan-t5-xl/f8bf1e92-3cc3-4c7e-9770-485a3074e85f.json
delete mode 100644 data/reward-bench/unknown/Cohere March 2024/5bf73fba-520f-4a2f-9296-8240847eb8ec.json
delete mode 100644 data/reward-bench/unknown/Cohere May 2024/3dd2c89f-64f5-4bbc-a621-791a9f0538b2.json
delete mode 100644 data/reward-bench/unknown/gemini-1.5-flash-8b/ef987556-7277-48d8-ac07-532586773a3a.json
delete mode 100644 data/reward-bench/upstage/SOLAR-10.7B-Instruct-v1.0/add7eddb-7a8b-4c78-9864-c4316a97ce5e.json
delete mode 100644 data/reward-bench/wenbopan/Faro-Yi-9B-DPO/caf02954-1eed-44eb-b5f4-df47c90828d7.json
delete mode 100644 data/reward-bench/weqweasdas/RM-Gemma-2B/00798930-daa2-4e79-82c6-2cccf1c3a0cb.json
delete mode 100644 data/reward-bench/weqweasdas/RM-Gemma-2B/71658cf8-0189-49dc-847f-b9a9b5faee4a.json
delete mode 100644 data/reward-bench/weqweasdas/RM-Gemma-7B-4096/3d506b91-5b0d-47e3-a3a0-bc09808bf5b5.json
delete mode 100644 data/reward-bench/weqweasdas/RM-Gemma-7B/04c71231-2025-4e1a-b7ed-56b245868089.json
delete mode 100644 data/reward-bench/weqweasdas/RM-Gemma-7B/08b2edd0-f8e9-47cd-b19d-53fdc7209917.json
delete mode 100644 data/reward-bench/weqweasdas/RM-Mistral-7B/79a43841-4032-4a20-8b5a-83b4b446d107.json
delete mode 100644 data/reward-bench/weqweasdas/RM-Mistral-7B/a2c16ab8-1098-490a-8d0a-392d835427e0.json
delete mode 100644 data/reward-bench/weqweasdas/hh_rlhf_rm_open_llama_3b/0aa12860-7ebe-49c2-a5af-1926d23e34f8.json
delete mode 100644 data/reward-bench/weqweasdas/hh_rlhf_rm_open_llama_3b/796d3ec1-9c26-4ead-87cb-4eb866209120.json

diff --git a/data/global-mmlu-lite/alibaba/qwen3-235b-a22b-instruct-2507/c8ab4e94-d8e8-417f-be18-fececf3c815c.json b/data/global-mmlu-lite/alibaba/qwen3-235b-a22b-instruct-2507/c8ab4e94-d8e8-417f-be18-fececf3c815c.json
deleted file mode 100644
index b3b764f48..000000000
--- a/data/global-mmlu-lite/alibaba/qwen3-235b-a22b-instruct-2507/c8ab4e94-d8e8-417f-be18-fececf3c815c.json
+++ /dev/null
@@ -1,515 +0,0 @@
-{
-  "schema_version": "0.2.0",
"global-mmlu-lite/alibaba_qwen3-235b-a22b-instruct-2507/1770822797.839372", - "retrieved_timestamp": "1770822797.839372", - "source_metadata": { - "source_name": "Global MMLU Lite Leaderboard", - "source_type": "documentation", - "source_organization_name": "kaggle", - "source_organization_url": "www.kaggle.com", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "qwen3-235b-a22b-instruct-2507", - "id": "alibaba/qwen3-235b-a22b-instruct-2507", - "developer": "alibaba", - "inference_platform": "unknown", - "additional_details": { - "display_name": "Qwen 3 235B A22B Instruct 2506" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Global MMLU Lite", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Global MMLU Lite", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8798 - } - }, - { - "evaluation_name": "Culturally Sensitive", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Culturally Sensitive", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8522 - } - }, - { - "evaluation_name": "Culturally Agnostic", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Culturally Agnostic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9075 - } - }, - { - "evaluation_name": "Arabic", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Arabic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.88, - "uncertainty": { - "confidence_interval": { - "lower": -0.0318, - "upper": 0.0318, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "English", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - English", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.89, - "uncertainty": { - "confidence_interval": { - "lower": -0.0307, - "upper": 0.0307, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Bengali", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Bengali", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8875, - "uncertainty": { - 
"confidence_interval": { - "lower": -0.031, - "upper": 0.031, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "German", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - German", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.885, - "uncertainty": { - "confidence_interval": { - "lower": -0.0313, - "upper": 0.0313, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "French", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - French", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.88, - "uncertainty": { - "confidence_interval": { - "lower": -0.0318, - "upper": 0.0318, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Hindi", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Hindi", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8775, - "uncertainty": { - "confidence_interval": { - "lower": -0.0321, - "upper": 0.0321, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Indonesian", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Indonesian", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.88, - "uncertainty": { - "confidence_interval": { - "lower": -0.0318, - "upper": 0.0318, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Italian", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Italian", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.88, - "uncertainty": { - "confidence_interval": { - "lower": -0.0318, - "upper": 0.0318, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Japanese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Japanese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.88, - "uncertainty": { - "confidence_interval": { - "lower": -0.0318, - "upper": 0.0318, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Korean", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - 
"https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Korean", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.875, - "uncertainty": { - "confidence_interval": { - "lower": -0.0324, - "upper": 0.0324, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Portuguese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Portuguese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8875, - "uncertainty": { - "confidence_interval": { - "lower": -0.031, - "upper": 0.031, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Spanish", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Spanish", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.875, - "uncertainty": { - "confidence_interval": { - "lower": -0.0324, - "upper": 0.0324, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Swahili", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Swahili", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.87, - "uncertainty": { - "confidence_interval": { - "lower": -0.033, - "upper": 0.033, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Yoruba", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Yoruba", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8725, - "uncertainty": { - "confidence_interval": { - "lower": -0.0327, - "upper": 0.0327, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Chinese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Chinese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8775, - "uncertainty": { - "confidence_interval": { - "lower": -0.0321, - "upper": 0.0321, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Burmese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Burmese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.88, - "uncertainty": { - "confidence_interval": { - "lower": -0.0318, - "upper": 0.0318, - "method": "unknown" - } - } - } - } - ] -} \ No newline at end of file diff --git a/data/global-mmlu-lite/anthropic/claude-3-5-haiku-20241022/402c8833-1827-46fc-a497-46b40a6794ff.json b/data/global-mmlu-lite/anthropic/claude-3-5-haiku-20241022/402c8833-1827-46fc-a497-46b40a6794ff.json deleted file mode 100644 index 5bff70d19..000000000 --- a/data/global-mmlu-lite/anthropic/claude-3-5-haiku-20241022/402c8833-1827-46fc-a497-46b40a6794ff.json +++ /dev/null @@ -1,515 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "global-mmlu-lite/anthropic_claude-3-5-haiku-20241022/1770822797.839372", - "retrieved_timestamp": "1770822797.839372", - "source_metadata": { - "source_name": "Global MMLU Lite Leaderboard", - "source_type": "documentation", - "source_organization_name": "kaggle", - "source_organization_url": "www.kaggle.com", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "claude-3-5-haiku-20241022", - "id": "anthropic/claude-3-5-haiku-20241022", - "developer": "anthropic", - "inference_platform": "unknown", - "additional_details": { - "display_name": "Claude 3.5 Haiku" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Global MMLU Lite", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Global MMLU Lite", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6114 - } - }, - { - "evaluation_name": "Culturally Sensitive", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Culturally Sensitive", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5834 - } - }, - { - "evaluation_name": "Culturally Agnostic", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Culturally Agnostic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6394 - } - }, - { - "evaluation_name": "Arabic", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Arabic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.695, - "uncertainty": { - "confidence_interval": { - "lower": -0.0451, - "upper": 0.0451, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "English", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - English", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.485, - "uncertainty": { - "confidence_interval": { - "lower": -0.049, - "upper": 0.049, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Bengali", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Bengali", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.675, - "uncertainty": { - "confidence_interval": { - "lower": -0.0459, - "upper": 0.0459, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "German", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - German", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.565, - "uncertainty": { - "confidence_interval": { - "lower": -0.0486, - "upper": 0.0486, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "French", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - French", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.61, - "uncertainty": { - "confidence_interval": { - "lower": -0.0478, - "upper": 0.0478, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Hindi", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Hindi", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6575, - "uncertainty": { - "confidence_interval": { - "lower": -0.0465, - "upper": 0.0465, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Indonesian", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Indonesian", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5475, - "uncertainty": { - "confidence_interval": { - "lower": -0.0488, - "upper": 0.0488, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Italian", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Italian", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.48, - "uncertainty": { - "confidence_interval": { - "lower": -0.049, - "upper": 0.049, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Japanese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - 
"url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Japanese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.655, - "uncertainty": { - "confidence_interval": { - "lower": -0.0466, - "upper": 0.0466, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Korean", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Korean", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6575, - "uncertainty": { - "confidence_interval": { - "lower": -0.0465, - "upper": 0.0465, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Portuguese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Portuguese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5225, - "uncertainty": { - "confidence_interval": { - "lower": -0.0489, - "upper": 0.0489, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Spanish", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Spanish", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.485, - "uncertainty": { - "confidence_interval": { - "lower": -0.049, - "upper": 0.049, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Swahili", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Swahili", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.69, - "uncertainty": { - "confidence_interval": { - "lower": -0.0453, - "upper": 0.0453, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Yoruba", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Yoruba", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6675, - "uncertainty": { - "confidence_interval": { - "lower": -0.0462, - "upper": 0.0462, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Chinese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Chinese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.69, - "uncertainty": { - "confidence_interval": { - "lower": -0.0453, - "upper": 0.0453, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Burmese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Burmese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7, - "uncertainty": { - "confidence_interval": { - "lower": -0.0449, - "upper": 0.0449, - "method": "unknown" - } - } - } - } - ] -} \ No newline at end of file diff --git a/data/global-mmlu-lite/anthropic/claude-3-7-sonnet-20250219/acd2082a-ce0c-418f-9383-f3c9f11735a2.json b/data/global-mmlu-lite/anthropic/claude-3-7-sonnet-20250219/acd2082a-ce0c-418f-9383-f3c9f11735a2.json deleted file mode 100644 index ec9276c60..000000000 --- a/data/global-mmlu-lite/anthropic/claude-3-7-sonnet-20250219/acd2082a-ce0c-418f-9383-f3c9f11735a2.json +++ /dev/null @@ -1,515 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "global-mmlu-lite/anthropic_claude-3-7-sonnet-20250219/1770822797.839372", - "retrieved_timestamp": "1770822797.839372", - "source_metadata": { - "source_name": "Global MMLU Lite Leaderboard", - "source_type": "documentation", - "source_organization_name": "kaggle", - "source_organization_url": "www.kaggle.com", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "claude-3-7-sonnet-20250219", - "id": "anthropic/claude-3-7-sonnet-20250219", - "developer": "anthropic", - "inference_platform": "unknown", - "additional_details": { - "display_name": "Claude 3.7 Sonnet" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Global MMLU Lite", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Global MMLU Lite", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8078 - } - }, - { - "evaluation_name": "Culturally Sensitive", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Culturally Sensitive", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7794 - } - }, - { - "evaluation_name": "Culturally Agnostic", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Culturally Agnostic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8362 - } - }, - { - "evaluation_name": "Arabic", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Arabic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.7925, - "uncertainty": { - "confidence_interval": { - "lower": -0.0397, - "upper": 0.0397, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "English", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - English", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7625, - "uncertainty": { - "confidence_interval": { - "lower": -0.0417, - "upper": 0.0417, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Bengali", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Bengali", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.825, - "uncertainty": { - "confidence_interval": { - "lower": -0.0372, - "upper": 0.0372, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "German", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - German", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8125, - "uncertainty": { - "confidence_interval": { - "lower": -0.0382, - "upper": 0.0382, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "French", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - French", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7675, - "uncertainty": { - "confidence_interval": { - "lower": -0.0414, - "upper": 0.0414, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Hindi", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Hindi", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.805, - "uncertainty": { - "confidence_interval": { - "lower": -0.0388, - "upper": 0.0388, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Indonesian", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Indonesian", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8175, - "uncertainty": { - "confidence_interval": { - "lower": -0.0379, - "upper": 0.0379, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Italian", - "source_data": { - "dataset_name": 
"global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Italian", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8225, - "uncertainty": { - "confidence_interval": { - "lower": -0.0374, - "upper": 0.0374, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Japanese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Japanese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8425, - "uncertainty": { - "confidence_interval": { - "lower": -0.0357, - "upper": 0.0357, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Korean", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Korean", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.83, - "uncertainty": { - "confidence_interval": { - "lower": -0.0368, - "upper": 0.0368, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Portuguese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Portuguese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.77, - "uncertainty": { - "confidence_interval": { - "lower": -0.0412, - "upper": 0.0412, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Spanish", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Spanish", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8075, - "uncertainty": { - "confidence_interval": { - "lower": -0.0386, - "upper": 0.0386, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Swahili", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Swahili", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8125, - "uncertainty": { - "confidence_interval": { - "lower": -0.0382, - "upper": 0.0382, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Yoruba", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Yoruba", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.81, - "uncertainty": { - "confidence_interval": { - "lower": -0.0384, - "upper": 0.0384, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Chinese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Chinese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.835, - "uncertainty": { - "confidence_interval": { - "lower": -0.0364, - "upper": 0.0364, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Burmese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Burmese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8125, - "uncertainty": { - "confidence_interval": { - "lower": -0.0382, - "upper": 0.0382, - "method": "unknown" - } - } - } - } - ] -} \ No newline at end of file diff --git a/data/global-mmlu-lite/anthropic/claude-opus-4-1-20250805/c65ed336-b283-46c2-8284-c4695cad588d.json b/data/global-mmlu-lite/anthropic/claude-opus-4-1-20250805/c65ed336-b283-46c2-8284-c4695cad588d.json deleted file mode 100644 index 06dce92ac..000000000 --- a/data/global-mmlu-lite/anthropic/claude-opus-4-1-20250805/c65ed336-b283-46c2-8284-c4695cad588d.json +++ /dev/null @@ -1,515 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "global-mmlu-lite/anthropic_claude-opus-4-1-20250805/1770822797.839372", - "retrieved_timestamp": "1770822797.839372", - "source_metadata": { - "source_name": "Global MMLU Lite Leaderboard", - "source_type": "documentation", - "source_organization_name": "kaggle", - "source_organization_url": "www.kaggle.com", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "claude-opus-4-1-20250805", - "id": "anthropic/claude-opus-4-1-20250805", - "developer": "anthropic", - "inference_platform": "unknown", - "additional_details": { - "display_name": "Claude Opus 4.1" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Global MMLU Lite", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Global MMLU Lite", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.943 - } - }, - { - "evaluation_name": "Culturally Sensitive", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Culturally Sensitive", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9331 - } - }, - { - "evaluation_name": "Culturally Agnostic", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - 
"evaluation_description": "Global MMLU Lite - Culturally Agnostic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9528 - } - }, - { - "evaluation_name": "Arabic", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Arabic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.945, - "uncertainty": { - "confidence_interval": { - "lower": -0.0223, - "upper": 0.0223, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "English", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - English", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9475, - "uncertainty": { - "confidence_interval": { - "lower": -0.0219, - "upper": 0.0219, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Bengali", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Bengali", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9425, - "uncertainty": { - "confidence_interval": { - "lower": -0.0228, - "upper": 0.0228, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "German", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - German", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.94, - "uncertainty": { - "confidence_interval": { - "lower": -0.0233, - "upper": 0.0233, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "French", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - French", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.945, - "uncertainty": { - "confidence_interval": { - "lower": -0.0223, - "upper": 0.0223, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Hindi", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Hindi", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9475, - "uncertainty": { - "confidence_interval": { - "lower": -0.0219, - "upper": 0.0219, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Indonesian", - "source_data": { - 
"dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Indonesian", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9425, - "uncertainty": { - "confidence_interval": { - "lower": -0.0228, - "upper": 0.0228, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Italian", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Italian", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.94, - "uncertainty": { - "confidence_interval": { - "lower": -0.0233, - "upper": 0.0233, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Japanese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Japanese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.94, - "uncertainty": { - "confidence_interval": { - "lower": -0.0233, - "upper": 0.0233, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Korean", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Korean", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.95, - "uncertainty": { - "confidence_interval": { - "lower": -0.0214, - "upper": 0.0214, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Portuguese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Portuguese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.945, - "uncertainty": { - "confidence_interval": { - "lower": -0.0223, - "upper": 0.0223, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Spanish", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Spanish", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.945, - "uncertainty": { - "confidence_interval": { - "lower": -0.0223, - "upper": 0.0223, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Swahili", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Swahili", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.93, - "uncertainty": { - "confidence_interval": { - "lower": -0.025, - "upper": 0.025, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Yoruba", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Yoruba", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9375, - "uncertainty": { - "confidence_interval": { - "lower": -0.0237, - "upper": 0.0237, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Chinese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Chinese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.945, - "uncertainty": { - "confidence_interval": { - "lower": -0.0223, - "upper": 0.0223, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Burmese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Burmese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.945, - "uncertainty": { - "confidence_interval": { - "lower": -0.0223, - "upper": 0.0223, - "method": "unknown" - } - } - } - } - ] -} \ No newline at end of file diff --git a/data/global-mmlu-lite/anthropic/claude-sonnet-4-20250514/5ebb009d-b548-4f2b-b075-feb76ca295d2.json b/data/global-mmlu-lite/anthropic/claude-sonnet-4-20250514/5ebb009d-b548-4f2b-b075-feb76ca295d2.json deleted file mode 100644 index 0251345d9..000000000 --- a/data/global-mmlu-lite/anthropic/claude-sonnet-4-20250514/5ebb009d-b548-4f2b-b075-feb76ca295d2.json +++ /dev/null @@ -1,515 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "global-mmlu-lite/anthropic_claude-sonnet-4-20250514/1770822797.839372", - "retrieved_timestamp": "1770822797.839372", - "source_metadata": { - "source_name": "Global MMLU Lite Leaderboard", - "source_type": "documentation", - "source_organization_name": "kaggle", - "source_organization_url": "www.kaggle.com", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "claude-sonnet-4-20250514", - "id": "anthropic/claude-sonnet-4-20250514", - "developer": "anthropic", - "inference_platform": "unknown", - "additional_details": { - "display_name": "Claude Sonnet 4" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Global MMLU Lite", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Global MMLU Lite", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9058 - } - }, - { - "evaluation_name": "Culturally Sensitive", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - 
"https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Culturally Sensitive", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8913 - } - }, - { - "evaluation_name": "Culturally Agnostic", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Culturally Agnostic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9203 - } - }, - { - "evaluation_name": "Arabic", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Arabic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9125, - "uncertainty": { - "confidence_interval": { - "lower": -0.0277, - "upper": 0.0277, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "English", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - English", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.905, - "uncertainty": { - "confidence_interval": { - "lower": -0.0287, - "upper": 0.0287, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Bengali", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Bengali", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9075, - "uncertainty": { - "confidence_interval": { - "lower": -0.0284, - "upper": 0.0284, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "German", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - German", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9125, - "uncertainty": { - "confidence_interval": { - "lower": -0.0277, - "upper": 0.0277, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "French", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - French", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.91, - "uncertainty": { - "confidence_interval": { - "lower": -0.028, - "upper": 0.028, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Hindi", - "source_data": { - 
"dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Hindi", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9, - "uncertainty": { - "confidence_interval": { - "lower": -0.0294, - "upper": 0.0294, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Indonesian", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Indonesian", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9025, - "uncertainty": { - "confidence_interval": { - "lower": -0.0291, - "upper": 0.0291, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Italian", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Italian", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9075, - "uncertainty": { - "confidence_interval": { - "lower": -0.0284, - "upper": 0.0284, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Japanese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Japanese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9, - "uncertainty": { - "confidence_interval": { - "lower": -0.0294, - "upper": 0.0294, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Korean", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Korean", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9125, - "uncertainty": { - "confidence_interval": { - "lower": -0.0277, - "upper": 0.0277, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Portuguese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Portuguese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.91, - "uncertainty": { - "confidence_interval": { - "lower": -0.028, - "upper": 0.028, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Spanish", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Spanish", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9075, - "uncertainty": { - "confidence_interval": { - "lower": -0.0284, - "upper": 0.0284, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Swahili", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Swahili", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8975, - "uncertainty": { - "confidence_interval": { - "lower": -0.0297, - "upper": 0.0297, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Yoruba", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Yoruba", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8975, - "uncertainty": { - "confidence_interval": { - "lower": -0.0297, - "upper": 0.0297, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Chinese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Chinese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9175, - "uncertainty": { - "confidence_interval": { - "lower": -0.027, - "upper": 0.027, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Burmese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Burmese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8925, - "uncertainty": { - "confidence_interval": { - "lower": -0.0304, - "upper": 0.0304, - "method": "unknown" - } - } - } - } - ] -} \ No newline at end of file diff --git a/data/global-mmlu-lite/cohere/command-a-03-2025/c7df2916-bde4-4987-9139-fcfd18a14ac1.json b/data/global-mmlu-lite/cohere/command-a-03-2025/c7df2916-bde4-4987-9139-fcfd18a14ac1.json deleted file mode 100644 index 8e9ed8546..000000000 --- a/data/global-mmlu-lite/cohere/command-a-03-2025/c7df2916-bde4-4987-9139-fcfd18a14ac1.json +++ /dev/null @@ -1,515 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "global-mmlu-lite/cohere_command-a-03-2025/1770822797.839372", - "retrieved_timestamp": "1770822797.839372", - "source_metadata": { - "source_name": "Global MMLU Lite Leaderboard", - "source_type": "documentation", - "source_organization_name": "kaggle", - "source_organization_url": "www.kaggle.com", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "command-a-03-2025", - "id": "cohere/command-a-03-2025", - "developer": "cohere", - "inference_platform": "unknown", - "additional_details": { - "display_name": "Command A " - } - }, - "evaluation_results": [ - { - "evaluation_name": "Global MMLU Lite", - "source_data": { - "dataset_name": "global-mmlu-lite", - 
"source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Global MMLU Lite", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8385 - } - }, - { - "evaluation_name": "Culturally Sensitive", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Culturally Sensitive", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7993 - } - }, - { - "evaluation_name": "Culturally Agnostic", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Culturally Agnostic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8778 - } - }, - { - "evaluation_name": "Arabic", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Arabic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8425, - "uncertainty": { - "confidence_interval": { - "lower": -0.0357, - "upper": 0.0357, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "English", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - English", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.855, - "uncertainty": { - "confidence_interval": { - "lower": -0.0345, - "upper": 0.0345, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Bengali", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Bengali", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8225, - "uncertainty": { - "confidence_interval": { - "lower": -0.0374, - "upper": 0.0374, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "German", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - German", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8425, - "uncertainty": { - "confidence_interval": { - "lower": -0.0357, - "upper": 0.0357, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "French", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": 
"url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - French", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8375, - "uncertainty": { - "confidence_interval": { - "lower": -0.0362, - "upper": 0.0362, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Hindi", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Hindi", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8421, - "uncertainty": { - "confidence_interval": { - "lower": -0.0358, - "upper": 0.0358, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Indonesian", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Indonesian", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8546, - "uncertainty": { - "confidence_interval": { - "lower": -0.0346, - "upper": 0.0346, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Italian", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Italian", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8375, - "uncertainty": { - "confidence_interval": { - "lower": -0.0362, - "upper": 0.0362, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Japanese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Japanese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.845, - "uncertainty": { - "confidence_interval": { - "lower": -0.0355, - "upper": 0.0355, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Korean", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Korean", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.85, - "uncertainty": { - "confidence_interval": { - "lower": -0.035, - "upper": 0.035, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Portuguese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Portuguese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - 
}, - "score_details": { - "score": 0.84, - "uncertainty": { - "confidence_interval": { - "lower": -0.0359, - "upper": 0.0359, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Spanish", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Spanish", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8525, - "uncertainty": { - "confidence_interval": { - "lower": -0.0348, - "upper": 0.0348, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Swahili", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Swahili", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8275, - "uncertainty": { - "confidence_interval": { - "lower": -0.037, - "upper": 0.037, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Yoruba", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Yoruba", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.815, - "uncertainty": { - "confidence_interval": { - "lower": -0.0381, - "upper": 0.0381, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Chinese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Chinese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.835, - "uncertainty": { - "confidence_interval": { - "lower": -0.0364, - "upper": 0.0364, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Burmese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Burmese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8175, - "uncertainty": { - "confidence_interval": { - "lower": -0.0379, - "upper": 0.0379, - "method": "unknown" - } - } - } - } - ] -} \ No newline at end of file diff --git a/data/global-mmlu-lite/deepseek/deepseek-r1-0528/56ec8ab0-d76d-4c03-953b-a2a4a43af5f4.json b/data/global-mmlu-lite/deepseek/deepseek-r1-0528/56ec8ab0-d76d-4c03-953b-a2a4a43af5f4.json deleted file mode 100644 index b6e9a89cf..000000000 --- a/data/global-mmlu-lite/deepseek/deepseek-r1-0528/56ec8ab0-d76d-4c03-953b-a2a4a43af5f4.json +++ /dev/null @@ -1,515 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "global-mmlu-lite/deepseek_deepseek-r1-0528/1770822797.839372", - "retrieved_timestamp": "1770822797.839372", - "source_metadata": { - "source_name": "Global MMLU Lite Leaderboard", - "source_type": 
"documentation", - "source_organization_name": "kaggle", - "source_organization_url": "www.kaggle.com", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "deepseek-r1-0528", - "id": "deepseek/deepseek-r1-0528", - "developer": "deepseek", - "inference_platform": "unknown", - "additional_details": { - "display_name": "DeepSeek-R1" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Global MMLU Lite", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Global MMLU Lite", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6744 - } - }, - { - "evaluation_name": "Culturally Sensitive", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Culturally Sensitive", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6672 - } - }, - { - "evaluation_name": "Culturally Agnostic", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Culturally Agnostic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6816 - } - }, - { - "evaluation_name": "Arabic", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Arabic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6825, - "uncertainty": { - "confidence_interval": { - "lower": -0.0456, - "upper": 0.0456, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "English", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - English", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.715, - "uncertainty": { - "confidence_interval": { - "lower": -0.0442, - "upper": 0.0442, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Bengali", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Bengali", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.655, - "uncertainty": { - "confidence_interval": { - "lower": -0.0466, - "upper": 0.0466, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "German", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - 
"https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - German", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6375, - "uncertainty": { - "confidence_interval": { - "lower": -0.0471, - "upper": 0.0471, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "French", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - French", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6925, - "uncertainty": { - "confidence_interval": { - "lower": -0.0452, - "upper": 0.0452, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Hindi", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Hindi", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6475, - "uncertainty": { - "confidence_interval": { - "lower": -0.0468, - "upper": 0.0468, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Indonesian", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Indonesian", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.655, - "uncertainty": { - "confidence_interval": { - "lower": -0.0466, - "upper": 0.0466, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Italian", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Italian", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6775, - "uncertainty": { - "confidence_interval": { - "lower": -0.0458, - "upper": 0.0458, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Japanese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Japanese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7725, - "uncertainty": { - "confidence_interval": { - "lower": -0.0411, - "upper": 0.0411, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Korean", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Korean", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { 
- "score": 0.6575, - "uncertainty": { - "confidence_interval": { - "lower": -0.0465, - "upper": 0.0465, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Portuguese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Portuguese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.635, - "uncertainty": { - "confidence_interval": { - "lower": -0.0472, - "upper": 0.0472, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Spanish", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Spanish", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7175, - "uncertainty": { - "confidence_interval": { - "lower": -0.0441, - "upper": 0.0441, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Swahili", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Swahili", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6775, - "uncertainty": { - "confidence_interval": { - "lower": -0.0458, - "upper": 0.0458, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Yoruba", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Yoruba", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.77, - "uncertainty": { - "confidence_interval": { - "lower": -0.0412, - "upper": 0.0412, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Chinese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Chinese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5075, - "uncertainty": { - "confidence_interval": { - "lower": -0.049, - "upper": 0.049, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Burmese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Burmese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.69, - "uncertainty": { - "confidence_interval": { - "lower": -0.0453, - "upper": 0.0453, - "method": "unknown" - } - } - } - } - ] -} \ No newline at end of file diff --git 
a/data/global-mmlu-lite/deepseek/deepseek-v3.1/ad3211a9-4390-4247-b64d-600191a88a75.json b/data/global-mmlu-lite/deepseek/deepseek-v3.1/ad3211a9-4390-4247-b64d-600191a88a75.json deleted file mode 100644 index 7e8deab0e..000000000 --- a/data/global-mmlu-lite/deepseek/deepseek-v3.1/ad3211a9-4390-4247-b64d-600191a88a75.json +++ /dev/null @@ -1,512 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "global-mmlu-lite/deepseek_deepseek-v3.1/1770822797.839372", - "retrieved_timestamp": "1770822797.839372", - "source_metadata": { - "source_name": "Global MMLU Lite Leaderboard", - "source_type": "documentation", - "source_organization_name": "kaggle", - "source_organization_url": "www.kaggle.com", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "deepseek-v3.1", - "id": "deepseek/deepseek-v3.1", - "developer": "deepseek", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Global MMLU Lite", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Global MMLU Lite", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8044 - } - }, - { - "evaluation_name": "Culturally Sensitive", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Culturally Sensitive", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7793 - } - }, - { - "evaluation_name": "Culturally Agnostic", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Culturally Agnostic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8295 - } - }, - { - "evaluation_name": "Arabic", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Arabic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.805, - "uncertainty": { - "confidence_interval": { - "lower": -0.0388, - "upper": 0.0388, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "English", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - English", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.825, - "uncertainty": { - "confidence_interval": { - "lower": -0.0372, - "upper": 0.0372, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Bengali", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - 
"https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Bengali", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8157, - "uncertainty": { - "confidence_interval": { - "lower": -0.0382, - "upper": 0.0382, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "German", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - German", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7925, - "uncertainty": { - "confidence_interval": { - "lower": -0.0397, - "upper": 0.0397, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "French", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - French", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8175, - "uncertainty": { - "confidence_interval": { - "lower": -0.0379, - "upper": 0.0379, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Hindi", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Hindi", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7569, - "uncertainty": { - "confidence_interval": { - "lower": -0.0421, - "upper": 0.0421, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Indonesian", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Indonesian", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7764, - "uncertainty": { - "confidence_interval": { - "lower": -0.0409, - "upper": 0.0409, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Italian", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Italian", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8075, - "uncertainty": { - "confidence_interval": { - "lower": -0.0386, - "upper": 0.0386, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Japanese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Japanese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": 
{ - "score": 0.8312, - "uncertainty": { - "confidence_interval": { - "lower": -0.0374, - "upper": 0.0374, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Korean", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Korean", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8125, - "uncertainty": { - "confidence_interval": { - "lower": -0.0382, - "upper": 0.0382, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Portuguese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Portuguese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8246, - "uncertainty": { - "confidence_interval": { - "lower": -0.0373, - "upper": 0.0373, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Spanish", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Spanish", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8125, - "uncertainty": { - "confidence_interval": { - "lower": -0.0382, - "upper": 0.0382, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Swahili", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Swahili", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.801, - "uncertainty": { - "confidence_interval": { - "lower": -0.0393, - "upper": 0.0393, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Yoruba", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Yoruba", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7831, - "uncertainty": { - "confidence_interval": { - "lower": -0.0415, - "upper": 0.0415, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Chinese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Chinese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8161, - "uncertainty": { - "confidence_interval": { - "lower": -0.0381, - "upper": 0.0381, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Burmese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": 
[ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Burmese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7925, - "uncertainty": { - "confidence_interval": { - "lower": -0.0397, - "upper": 0.0397, - "method": "unknown" - } - } - } - } - ] -} \ No newline at end of file diff --git a/data/global-mmlu-lite/google/gemini-2.5-flash-preview-05-20/1a34326a-f75e-434c-a027-9f8cf7fe8fb9.json b/data/global-mmlu-lite/google/gemini-2.5-flash-preview-05-20/1a34326a-f75e-434c-a027-9f8cf7fe8fb9.json deleted file mode 100644 index 7a051d563..000000000 --- a/data/global-mmlu-lite/google/gemini-2.5-flash-preview-05-20/1a34326a-f75e-434c-a027-9f8cf7fe8fb9.json +++ /dev/null @@ -1,515 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "global-mmlu-lite/google_gemini-2.5-flash-preview-05-20/1770822797.839372", - "retrieved_timestamp": "1770822797.839372", - "source_metadata": { - "source_name": "Global MMLU Lite Leaderboard", - "source_type": "documentation", - "source_organization_name": "kaggle", - "source_organization_url": "www.kaggle.com", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "gemini-2.5-flash-preview-05-20", - "id": "google/gemini-2.5-flash-preview-05-20", - "developer": "google", - "inference_platform": "unknown", - "additional_details": { - "display_name": "Gemini 2.5 Flash Preview" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Global MMLU Lite", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Global MMLU Lite", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9092 - } - }, - { - "evaluation_name": "Culturally Sensitive", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Culturally Sensitive", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8925 - } - }, - { - "evaluation_name": "Culturally Agnostic", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Culturally Agnostic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9259 - } - }, - { - "evaluation_name": "Arabic", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Arabic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.905, - "uncertainty": { - "confidence_interval": { - "lower": -0.0287, - "upper": 0.0287, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "English", - "source_data": { - "dataset_name": "global-mmlu-lite", 
- "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - English", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9225, - "uncertainty": { - "confidence_interval": { - "lower": -0.0262, - "upper": 0.0262, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Bengali", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Bengali", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.91, - "uncertainty": { - "confidence_interval": { - "lower": -0.028, - "upper": 0.028, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "German", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - German", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.905, - "uncertainty": { - "confidence_interval": { - "lower": -0.0287, - "upper": 0.0287, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "French", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - French", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.925, - "uncertainty": { - "confidence_interval": { - "lower": -0.0258, - "upper": 0.0258, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Hindi", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Hindi", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9125, - "uncertainty": { - "confidence_interval": { - "lower": -0.0277, - "upper": 0.0277, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Indonesian", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Indonesian", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9075, - "uncertainty": { - "confidence_interval": { - "lower": -0.0284, - "upper": 0.0284, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Italian", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Italian", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 
1.0 - }, - "score_details": { - "score": 0.89, - "uncertainty": { - "confidence_interval": { - "lower": -0.0307, - "upper": 0.0307, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Japanese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Japanese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9125, - "uncertainty": { - "confidence_interval": { - "lower": -0.0277, - "upper": 0.0277, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Korean", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Korean", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9075, - "uncertainty": { - "confidence_interval": { - "lower": -0.0284, - "upper": 0.0284, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Portuguese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Portuguese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.915, - "uncertainty": { - "confidence_interval": { - "lower": -0.0273, - "upper": 0.0273, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Spanish", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Spanish", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.915, - "uncertainty": { - "confidence_interval": { - "lower": -0.0273, - "upper": 0.0273, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Swahili", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Swahili", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.905, - "uncertainty": { - "confidence_interval": { - "lower": -0.0287, - "upper": 0.0287, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Yoruba", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Yoruba", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8825, - "uncertainty": { - "confidence_interval": { - "lower": -0.0316, - "upper": 0.0316, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Chinese", - "source_data": { - "dataset_name": "global-mmlu-lite", - 
"source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Chinese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.93, - "uncertainty": { - "confidence_interval": { - "lower": -0.025, - "upper": 0.025, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Burmese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Burmese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9025, - "uncertainty": { - "confidence_interval": { - "lower": -0.0291, - "upper": 0.0291, - "method": "unknown" - } - } - } - } - ] -} \ No newline at end of file diff --git a/data/global-mmlu-lite/google/gemini-2.5-flash/129c8b21-f97e-4284-9574-33d5932332f7.json b/data/global-mmlu-lite/google/gemini-2.5-flash/129c8b21-f97e-4284-9574-33d5932332f7.json deleted file mode 100644 index ffe8e8eb2..000000000 --- a/data/global-mmlu-lite/google/gemini-2.5-flash/129c8b21-f97e-4284-9574-33d5932332f7.json +++ /dev/null @@ -1,515 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "global-mmlu-lite/google_gemini-2.5-flash/1770822797.839372", - "retrieved_timestamp": "1770822797.839372", - "source_metadata": { - "source_name": "Global MMLU Lite Leaderboard", - "source_type": "documentation", - "source_organization_name": "kaggle", - "source_organization_url": "www.kaggle.com", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "gemini-2.5-flash", - "id": "google/gemini-2.5-flash", - "developer": "google", - "inference_platform": "unknown", - "additional_details": { - "display_name": "Gemini 2.5 Flash" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Global MMLU Lite", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Global MMLU Lite", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9145 - } - }, - { - "evaluation_name": "Culturally Sensitive", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Culturally Sensitive", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9 - } - }, - { - "evaluation_name": "Culturally Agnostic", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Culturally Agnostic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9291 - } - }, - { - "evaluation_name": "Arabic", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - 
"https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Arabic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9125, - "uncertainty": { - "confidence_interval": { - "lower": -0.0277, - "upper": 0.0277, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "English", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - English", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9325, - "uncertainty": { - "confidence_interval": { - "lower": -0.0246, - "upper": 0.0246, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Bengali", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Bengali", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.91, - "uncertainty": { - "confidence_interval": { - "lower": -0.028, - "upper": 0.028, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "German", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - German", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9025, - "uncertainty": { - "confidence_interval": { - "lower": -0.0291, - "upper": 0.0291, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "French", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - French", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.91, - "uncertainty": { - "confidence_interval": { - "lower": -0.028, - "upper": 0.028, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Hindi", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Hindi", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.925, - "uncertainty": { - "confidence_interval": { - "lower": -0.0258, - "upper": 0.0258, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Indonesian", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Indonesian", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 
0.9075, - "uncertainty": { - "confidence_interval": { - "lower": -0.0284, - "upper": 0.0284, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Italian", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Italian", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9225, - "uncertainty": { - "confidence_interval": { - "lower": -0.0262, - "upper": 0.0262, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Japanese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Japanese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9125, - "uncertainty": { - "confidence_interval": { - "lower": -0.0277, - "upper": 0.0277, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Korean", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Korean", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.915, - "uncertainty": { - "confidence_interval": { - "lower": -0.0273, - "upper": 0.0273, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Portuguese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Portuguese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9125, - "uncertainty": { - "confidence_interval": { - "lower": -0.0277, - "upper": 0.0277, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Spanish", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Spanish", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9175, - "uncertainty": { - "confidence_interval": { - "lower": -0.027, - "upper": 0.027, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Swahili", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Swahili", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.915, - "uncertainty": { - "confidence_interval": { - "lower": -0.0273, - "upper": 0.0273, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Yoruba", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - 
"https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Yoruba", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9075, - "uncertainty": { - "confidence_interval": { - "lower": -0.0284, - "upper": 0.0284, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Chinese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Chinese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.915, - "uncertainty": { - "confidence_interval": { - "lower": -0.0273, - "upper": 0.0273, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Burmese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Burmese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.915, - "uncertainty": { - "confidence_interval": { - "lower": -0.0273, - "upper": 0.0273, - "method": "unknown" - } - } - } - } - ] -} \ No newline at end of file diff --git a/data/global-mmlu-lite/google/gemini-2.5-pro/3644fd67-0f46-4de3-b542-edf219d0e0cd.json b/data/global-mmlu-lite/google/gemini-2.5-pro/3644fd67-0f46-4de3-b542-edf219d0e0cd.json deleted file mode 100644 index 6a19f6916..000000000 --- a/data/global-mmlu-lite/google/gemini-2.5-pro/3644fd67-0f46-4de3-b542-edf219d0e0cd.json +++ /dev/null @@ -1,515 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "global-mmlu-lite/google_gemini-2.5-pro/1770822797.839372", - "retrieved_timestamp": "1770822797.839372", - "source_metadata": { - "source_name": "Global MMLU Lite Leaderboard", - "source_type": "documentation", - "source_organization_name": "kaggle", - "source_organization_url": "www.kaggle.com", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "gemini-2.5-pro", - "id": "google/gemini-2.5-pro", - "developer": "google", - "inference_platform": "unknown", - "additional_details": { - "display_name": "Gemini 2.5 Pro" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Global MMLU Lite", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Global MMLU Lite", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9323 - } - }, - { - "evaluation_name": "Culturally Sensitive", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Culturally Sensitive", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9241 - } - }, - { - "evaluation_name": "Culturally Agnostic", - "source_data": { - "dataset_name": "global-mmlu-lite", - 
"source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Culturally Agnostic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9406 - } - }, - { - "evaluation_name": "Arabic", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Arabic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9475, - "uncertainty": { - "confidence_interval": { - "lower": -0.0219, - "upper": 0.0219, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "English", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - English", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9275, - "uncertainty": { - "confidence_interval": { - "lower": -0.0254, - "upper": 0.0254, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Bengali", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Bengali", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9275, - "uncertainty": { - "confidence_interval": { - "lower": -0.0254, - "upper": 0.0254, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "German", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - German", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.93, - "uncertainty": { - "confidence_interval": { - "lower": -0.025, - "upper": 0.025, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "French", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - French", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9425, - "uncertainty": { - "confidence_interval": { - "lower": -0.0228, - "upper": 0.0228, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Hindi", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Hindi", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9275, - "uncertainty": { - "confidence_interval": { - "lower": 
-0.0254, - "upper": 0.0254, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Indonesian", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Indonesian", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.925, - "uncertainty": { - "confidence_interval": { - "lower": -0.0258, - "upper": 0.0258, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Italian", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Italian", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.935, - "uncertainty": { - "confidence_interval": { - "lower": -0.0242, - "upper": 0.0242, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Japanese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Japanese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9375, - "uncertainty": { - "confidence_interval": { - "lower": -0.0237, - "upper": 0.0237, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Korean", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Korean", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9275, - "uncertainty": { - "confidence_interval": { - "lower": -0.0254, - "upper": 0.0254, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Portuguese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Portuguese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.93, - "uncertainty": { - "confidence_interval": { - "lower": -0.025, - "upper": 0.025, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Spanish", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Spanish", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.94, - "uncertainty": { - "confidence_interval": { - "lower": -0.0233, - "upper": 0.0233, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Swahili", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, 
- "metric_config": { - "evaluation_description": "Global MMLU Lite - Swahili", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9375, - "uncertainty": { - "confidence_interval": { - "lower": -0.0237, - "upper": 0.0237, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Yoruba", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Yoruba", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.925, - "uncertainty": { - "confidence_interval": { - "lower": -0.0258, - "upper": 0.0258, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Chinese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Chinese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9275, - "uncertainty": { - "confidence_interval": { - "lower": -0.0254, - "upper": 0.0254, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Burmese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Burmese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.93, - "uncertainty": { - "confidence_interval": { - "lower": -0.025, - "upper": 0.025, - "method": "unknown" - } - } - } - } - ] -} \ No newline at end of file diff --git a/data/global-mmlu-lite/google/gemini-3-pro-preview/c0692e14-6484-4d02-8dac-55ce4373fb15.json b/data/global-mmlu-lite/google/gemini-3-pro-preview/c0692e14-6484-4d02-8dac-55ce4373fb15.json deleted file mode 100644 index 8538679be..000000000 --- a/data/global-mmlu-lite/google/gemini-3-pro-preview/c0692e14-6484-4d02-8dac-55ce4373fb15.json +++ /dev/null @@ -1,515 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "global-mmlu-lite/google_gemini-3-pro-preview/1770822797.839372", - "retrieved_timestamp": "1770822797.839372", - "source_metadata": { - "source_name": "Global MMLU Lite Leaderboard", - "source_type": "documentation", - "source_organization_name": "kaggle", - "source_organization_url": "www.kaggle.com", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "gemini-3-pro-preview", - "id": "google/gemini-3-pro-preview", - "developer": "google", - "inference_platform": "unknown", - "additional_details": { - "display_name": "Gemini 3 Pro Preview" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Global MMLU Lite", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Global MMLU Lite", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9453 - } - }, - { - "evaluation_name": "Culturally Sensitive", - 
"source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Culturally Sensitive", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9397 - } - }, - { - "evaluation_name": "Culturally Agnostic", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Culturally Agnostic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9509 - } - }, - { - "evaluation_name": "Arabic", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Arabic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9475, - "uncertainty": { - "confidence_interval": { - "lower": -0.0219, - "upper": 0.0219, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "English", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - English", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9425, - "uncertainty": { - "confidence_interval": { - "lower": -0.0228, - "upper": 0.0228, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Bengali", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Bengali", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9425, - "uncertainty": { - "confidence_interval": { - "lower": -0.0228, - "upper": 0.0228, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "German", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - German", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.94, - "uncertainty": { - "confidence_interval": { - "lower": -0.0233, - "upper": 0.0233, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "French", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - French", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9575, - "uncertainty": { - "confidence_interval": { - "lower": -0.0198, - "upper": 0.0198, - 
"method": "unknown" - } - } - } - }, - { - "evaluation_name": "Hindi", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Hindi", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9425, - "uncertainty": { - "confidence_interval": { - "lower": -0.0228, - "upper": 0.0228, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Indonesian", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Indonesian", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.955, - "uncertainty": { - "confidence_interval": { - "lower": -0.0203, - "upper": 0.0203, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Italian", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Italian", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.955, - "uncertainty": { - "confidence_interval": { - "lower": -0.0203, - "upper": 0.0203, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Japanese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Japanese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.94, - "uncertainty": { - "confidence_interval": { - "lower": -0.0233, - "upper": 0.0233, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Korean", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Korean", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.94, - "uncertainty": { - "confidence_interval": { - "lower": -0.0233, - "upper": 0.0233, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Portuguese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Portuguese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9425, - "uncertainty": { - "confidence_interval": { - "lower": -0.0228, - "upper": 0.0228, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Spanish", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - 
"evaluation_description": "Global MMLU Lite - Spanish", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9475, - "uncertainty": { - "confidence_interval": { - "lower": -0.0219, - "upper": 0.0219, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Swahili", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Swahili", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.94, - "uncertainty": { - "confidence_interval": { - "lower": -0.0233, - "upper": 0.0233, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Yoruba", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Yoruba", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9425, - "uncertainty": { - "confidence_interval": { - "lower": -0.0228, - "upper": 0.0228, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Chinese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Chinese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9475, - "uncertainty": { - "confidence_interval": { - "lower": -0.0219, - "upper": 0.0219, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Burmese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Burmese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9425, - "uncertainty": { - "confidence_interval": { - "lower": -0.0228, - "upper": 0.0228, - "method": "unknown" - } - } - } - } - ] -} \ No newline at end of file diff --git a/data/global-mmlu-lite/google/gemma-3-27b-it/ab4940d1-118c-479a-bd37-1ea2da6f02a3.json b/data/global-mmlu-lite/google/gemma-3-27b-it/ab4940d1-118c-479a-bd37-1ea2da6f02a3.json deleted file mode 100644 index 211f9d6b8..000000000 --- a/data/global-mmlu-lite/google/gemma-3-27b-it/ab4940d1-118c-479a-bd37-1ea2da6f02a3.json +++ /dev/null @@ -1,515 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "global-mmlu-lite/google_gemma-3-27b-it/1770822797.839372", - "retrieved_timestamp": "1770822797.839372", - "source_metadata": { - "source_name": "Global MMLU Lite Leaderboard", - "source_type": "documentation", - "source_organization_name": "kaggle", - "source_organization_url": "www.kaggle.com", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "gemma-3-27b-it", - "id": "google/gemma-3-27b-it", - "developer": "google", - "inference_platform": "unknown", - "additional_details": { - "display_name": "Gemma 3 27B" - } - }, - "evaluation_results": [ - { - 
"evaluation_name": "Global MMLU Lite", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Global MMLU Lite", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.763 - } - }, - { - "evaluation_name": "Culturally Sensitive", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Culturally Sensitive", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7528 - } - }, - { - "evaluation_name": "Culturally Agnostic", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Culturally Agnostic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7733 - } - }, - { - "evaluation_name": "Arabic", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Arabic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.78, - "uncertainty": { - "confidence_interval": { - "lower": -0.0406, - "upper": 0.0406, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "English", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - English", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7337, - "uncertainty": { - "confidence_interval": { - "lower": -0.0434, - "upper": 0.0434, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Bengali", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Bengali", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.75, - "uncertainty": { - "confidence_interval": { - "lower": -0.0426, - "upper": 0.0426, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "German", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - German", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.775, - "uncertainty": { - "confidence_interval": { - "lower": -0.0409, - "upper": 0.0409, - "method": "unknown" - } - } - } - }, - { - 
"evaluation_name": "French", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - French", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7481, - "uncertainty": { - "confidence_interval": { - "lower": -0.0429, - "upper": 0.0429, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Hindi", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Hindi", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7335, - "uncertainty": { - "confidence_interval": { - "lower": -0.0437, - "upper": 0.0437, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Indonesian", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Indonesian", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7563, - "uncertainty": { - "confidence_interval": { - "lower": -0.0422, - "upper": 0.0422, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Italian", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Italian", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.75, - "uncertainty": { - "confidence_interval": { - "lower": -0.0424, - "upper": 0.0424, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Japanese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Japanese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7925, - "uncertainty": { - "confidence_interval": { - "lower": -0.0397, - "upper": 0.0397, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Korean", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Korean", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.798, - "uncertainty": { - "confidence_interval": { - "lower": -0.0395, - "upper": 0.0395, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Portuguese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - 
Portuguese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7481, - "uncertainty": { - "confidence_interval": { - "lower": -0.0427, - "upper": 0.0427, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Spanish", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Spanish", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7494, - "uncertainty": { - "confidence_interval": { - "lower": -0.0425, - "upper": 0.0425, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Swahili", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Swahili", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.785, - "uncertainty": { - "confidence_interval": { - "lower": -0.0403, - "upper": 0.0403, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Yoruba", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Yoruba", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7444, - "uncertainty": { - "confidence_interval": { - "lower": -0.0428, - "upper": 0.0428, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Chinese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Chinese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7925, - "uncertainty": { - "confidence_interval": { - "lower": -0.0397, - "upper": 0.0397, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Burmese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Burmese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7719, - "uncertainty": { - "confidence_interval": { - "lower": -0.0412, - "upper": 0.0412, - "method": "unknown" - } - } - } - } - ] -} \ No newline at end of file diff --git a/data/global-mmlu-lite/google/gemma-3-4b-it/85552093-435f-4d85-897d-4e74c3655533.json b/data/global-mmlu-lite/google/gemma-3-4b-it/85552093-435f-4d85-897d-4e74c3655533.json deleted file mode 100644 index f5d7db0a6..000000000 --- a/data/global-mmlu-lite/google/gemma-3-4b-it/85552093-435f-4d85-897d-4e74c3655533.json +++ /dev/null @@ -1,515 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "global-mmlu-lite/google_gemma-3-4b-it/1770822797.839372", - "retrieved_timestamp": 
"1770822797.839372", - "source_metadata": { - "source_name": "Global MMLU Lite Leaderboard", - "source_type": "documentation", - "source_organization_name": "kaggle", - "source_organization_url": "www.kaggle.com", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "gemma-3-4b-it", - "id": "google/gemma-3-4b-it", - "developer": "google", - "inference_platform": "unknown", - "additional_details": { - "display_name": "Gemma 3 4B" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Global MMLU Lite", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Global MMLU Lite", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6511 - } - }, - { - "evaluation_name": "Culturally Sensitive", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Culturally Sensitive", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6116 - } - }, - { - "evaluation_name": "Culturally Agnostic", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Culturally Agnostic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6906 - } - }, - { - "evaluation_name": "Arabic", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Arabic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6525, - "uncertainty": { - "confidence_interval": { - "lower": -0.0467, - "upper": 0.0467, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "English", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - English", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.67, - "uncertainty": { - "confidence_interval": { - "lower": -0.0461, - "upper": 0.0461, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Bengali", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Bengali", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.68, - "uncertainty": { - "confidence_interval": { - "lower": -0.0457, - "upper": 0.0457, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "German", - "source_data": { - 
"dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - German", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6525, - "uncertainty": { - "confidence_interval": { - "lower": -0.0467, - "upper": 0.0467, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "French", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - French", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6575, - "uncertainty": { - "confidence_interval": { - "lower": -0.0465, - "upper": 0.0465, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Hindi", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Hindi", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6475, - "uncertainty": { - "confidence_interval": { - "lower": -0.0468, - "upper": 0.0468, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Indonesian", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Indonesian", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6775, - "uncertainty": { - "confidence_interval": { - "lower": -0.0458, - "upper": 0.0458, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Italian", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Italian", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6675, - "uncertainty": { - "confidence_interval": { - "lower": -0.0462, - "upper": 0.0462, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Japanese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Japanese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6325, - "uncertainty": { - "confidence_interval": { - "lower": -0.0472, - "upper": 0.0472, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Korean", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Korean", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.66, - "uncertainty": { - "confidence_interval": { - "lower": -0.0464, - "upper": 0.0464, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Portuguese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Portuguese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.68, - "uncertainty": { - "confidence_interval": { - "lower": -0.0457, - "upper": 0.0457, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Spanish", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Spanish", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6725, - "uncertainty": { - "confidence_interval": { - "lower": -0.046, - "upper": 0.046, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Swahili", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Swahili", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6075, - "uncertainty": { - "confidence_interval": { - "lower": -0.0479, - "upper": 0.0479, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Yoruba", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Yoruba", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5825, - "uncertainty": { - "confidence_interval": { - "lower": -0.0483, - "upper": 0.0483, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Chinese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Chinese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6475, - "uncertainty": { - "confidence_interval": { - "lower": -0.0468, - "upper": 0.0468, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Burmese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Burmese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.63, - "uncertainty": { - "confidence_interval": { - "lower": -0.0473, - "upper": 0.0473, - "method": "unknown" - } - } - } - } - ] -} \ No newline at end of file diff --git 
a/data/global-mmlu-lite/mistralai/mistral-medium-3/4ddc0062-6577-4ab9-85f1-791fd2822776.json b/data/global-mmlu-lite/mistralai/mistral-medium-3/4ddc0062-6577-4ab9-85f1-791fd2822776.json deleted file mode 100644 index 242b4f1b9..000000000 --- a/data/global-mmlu-lite/mistralai/mistral-medium-3/4ddc0062-6577-4ab9-85f1-791fd2822776.json +++ /dev/null @@ -1,515 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "global-mmlu-lite/mistralai_mistral-medium-3/1770822797.839372", - "retrieved_timestamp": "1770822797.839372", - "source_metadata": { - "source_name": "Global MMLU Lite Leaderboard", - "source_type": "documentation", - "source_organization_name": "kaggle", - "source_organization_url": "www.kaggle.com", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "mistral-medium-3", - "id": "mistralai/mistral-medium-3", - "developer": "mistralai", - "inference_platform": "unknown", - "additional_details": { - "display_name": "Mistral Medium 3" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Global MMLU Lite", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Global MMLU Lite", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5511 - } - }, - { - "evaluation_name": "Culturally Sensitive", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Culturally Sensitive", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5391 - } - }, - { - "evaluation_name": "Culturally Agnostic", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Culturally Agnostic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5631 - } - }, - { - "evaluation_name": "Arabic", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Arabic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.455, - "uncertainty": { - "confidence_interval": { - "lower": -0.0488, - "upper": 0.0488, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "English", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - English", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.38, - "uncertainty": { - "confidence_interval": { - "lower": -0.0476, - "upper": 0.0476, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Bengali", - "source_data": { - "dataset_name": 
"global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Bengali", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5175, - "uncertainty": { - "confidence_interval": { - "lower": -0.049, - "upper": 0.049, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "German", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - German", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4775, - "uncertainty": { - "confidence_interval": { - "lower": -0.0489, - "upper": 0.0489, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "French", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - French", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.41, - "uncertainty": { - "confidence_interval": { - "lower": -0.0482, - "upper": 0.0482, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Hindi", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Hindi", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.555, - "uncertainty": { - "confidence_interval": { - "lower": -0.0487, - "upper": 0.0487, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Indonesian", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Indonesian", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.515, - "uncertainty": { - "confidence_interval": { - "lower": -0.049, - "upper": 0.049, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Italian", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Italian", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.535, - "uncertainty": { - "confidence_interval": { - "lower": -0.0489, - "upper": 0.0489, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Japanese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Japanese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 
0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.58, - "uncertainty": { - "confidence_interval": { - "lower": -0.0484, - "upper": 0.0484, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Korean", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Korean", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.595, - "uncertainty": { - "confidence_interval": { - "lower": -0.0481, - "upper": 0.0481, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Portuguese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Portuguese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5175, - "uncertainty": { - "confidence_interval": { - "lower": -0.049, - "upper": 0.049, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Spanish", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Spanish", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5375, - "uncertainty": { - "confidence_interval": { - "lower": -0.0489, - "upper": 0.0489, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Swahili", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Swahili", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7075, - "uncertainty": { - "confidence_interval": { - "lower": -0.0446, - "upper": 0.0446, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Yoruba", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Yoruba", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7675, - "uncertainty": { - "confidence_interval": { - "lower": -0.0414, - "upper": 0.0414, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Chinese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Chinese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.535, - "uncertainty": { - "confidence_interval": { - "lower": -0.0489, - "upper": 0.0489, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Burmese", - "source_data": { - "dataset_name": 
"global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Burmese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7325, - "uncertainty": { - "confidence_interval": { - "lower": -0.0434, - "upper": 0.0434, - "method": "unknown" - } - } - } - } - ] -} \ No newline at end of file diff --git a/data/global-mmlu-lite/mistralai/mistral-small-2503/50fc4840-933b-43ec-847e-1834b30f9f14.json b/data/global-mmlu-lite/mistralai/mistral-small-2503/50fc4840-933b-43ec-847e-1834b30f9f14.json deleted file mode 100644 index afd35d897..000000000 --- a/data/global-mmlu-lite/mistralai/mistral-small-2503/50fc4840-933b-43ec-847e-1834b30f9f14.json +++ /dev/null @@ -1,515 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "global-mmlu-lite/mistralai_mistral-small-2503/1770822797.839372", - "retrieved_timestamp": "1770822797.839372", - "source_metadata": { - "source_name": "Global MMLU Lite Leaderboard", - "source_type": "documentation", - "source_organization_name": "kaggle", - "source_organization_url": "www.kaggle.com", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "mistral-small-2503", - "id": "mistralai/mistral-small-2503", - "developer": "mistralai", - "inference_platform": "unknown", - "additional_details": { - "display_name": "Mistral Small 3.1" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Global MMLU Lite", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Global MMLU Lite", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7852 - } - }, - { - "evaluation_name": "Culturally Sensitive", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Culturally Sensitive", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7537 - } - }, - { - "evaluation_name": "Culturally Agnostic", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Culturally Agnostic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8166 - } - }, - { - "evaluation_name": "Arabic", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Arabic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7875, - "uncertainty": { - "confidence_interval": { - "lower": -0.0401, - "upper": 0.0401, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "English", - "source_data": { - "dataset_name": "global-mmlu-lite", - 
"source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - English", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8, - "uncertainty": { - "confidence_interval": { - "lower": -0.0392, - "upper": 0.0392, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Bengali", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Bengali", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7725, - "uncertainty": { - "confidence_interval": { - "lower": -0.0411, - "upper": 0.0411, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "German", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - German", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7975, - "uncertainty": { - "confidence_interval": { - "lower": -0.0394, - "upper": 0.0394, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "French", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - French", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8, - "uncertainty": { - "confidence_interval": { - "lower": -0.0392, - "upper": 0.0392, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Hindi", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Hindi", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.795, - "uncertainty": { - "confidence_interval": { - "lower": -0.0396, - "upper": 0.0396, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Indonesian", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Indonesian", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.785, - "uncertainty": { - "confidence_interval": { - "lower": -0.0403, - "upper": 0.0403, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Italian", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Italian", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 
- }, - "score_details": { - "score": 0.805, - "uncertainty": { - "confidence_interval": { - "lower": -0.0388, - "upper": 0.0388, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Japanese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Japanese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.77, - "uncertainty": { - "confidence_interval": { - "lower": -0.0412, - "upper": 0.0412, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Korean", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Korean", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.79, - "uncertainty": { - "confidence_interval": { - "lower": -0.0399, - "upper": 0.0399, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Portuguese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Portuguese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7925, - "uncertainty": { - "confidence_interval": { - "lower": -0.0397, - "upper": 0.0397, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Spanish", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Spanish", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7825, - "uncertainty": { - "confidence_interval": { - "lower": -0.0404, - "upper": 0.0404, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Swahili", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Swahili", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.775, - "uncertainty": { - "confidence_interval": { - "lower": -0.0409, - "upper": 0.0409, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Yoruba", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Yoruba", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.735, - "uncertainty": { - "confidence_interval": { - "lower": -0.0432, - "upper": 0.0432, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Chinese", - "source_data": { - "dataset_name": "global-mmlu-lite", - 
"source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Chinese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7925, - "uncertainty": { - "confidence_interval": { - "lower": -0.0397, - "upper": 0.0397, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Burmese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Burmese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7825, - "uncertainty": { - "confidence_interval": { - "lower": -0.0404, - "upper": 0.0404, - "method": "unknown" - } - } - } - } - ] -} \ No newline at end of file diff --git a/data/global-mmlu-lite/openai/gpt-4.1-2025-04-14/6cdc5384-2be5-47e0-a9b2-9cd6719c1760.json b/data/global-mmlu-lite/openai/gpt-4.1-2025-04-14/6cdc5384-2be5-47e0-a9b2-9cd6719c1760.json deleted file mode 100644 index 4ace59a99..000000000 --- a/data/global-mmlu-lite/openai/gpt-4.1-2025-04-14/6cdc5384-2be5-47e0-a9b2-9cd6719c1760.json +++ /dev/null @@ -1,515 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "global-mmlu-lite/openai_gpt-4.1-2025-04-14/1770822797.839372", - "retrieved_timestamp": "1770822797.839372", - "source_metadata": { - "source_name": "Global MMLU Lite Leaderboard", - "source_type": "documentation", - "source_organization_name": "kaggle", - "source_organization_url": "www.kaggle.com", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "gpt-4.1-2025-04-14", - "id": "openai/gpt-4.1-2025-04-14", - "developer": "openai", - "inference_platform": "unknown", - "additional_details": { - "display_name": "GPT-4.1" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Global MMLU Lite", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Global MMLU Lite", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8755 - } - }, - { - "evaluation_name": "Culturally Sensitive", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Culturally Sensitive", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8541 - } - }, - { - "evaluation_name": "Culturally Agnostic", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Culturally Agnostic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8969 - } - }, - { - "evaluation_name": "Arabic", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - 
"https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Arabic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.88, - "uncertainty": { - "confidence_interval": { - "lower": -0.0318, - "upper": 0.0318, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "English", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - English", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8825, - "uncertainty": { - "confidence_interval": { - "lower": -0.0316, - "upper": 0.0316, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Bengali", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Bengali", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8625, - "uncertainty": { - "confidence_interval": { - "lower": -0.0337, - "upper": 0.0337, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "German", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - German", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.875, - "uncertainty": { - "confidence_interval": { - "lower": -0.0324, - "upper": 0.0324, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "French", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - French", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8875, - "uncertainty": { - "confidence_interval": { - "lower": -0.031, - "upper": 0.031, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Hindi", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Hindi", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8775, - "uncertainty": { - "confidence_interval": { - "lower": -0.0321, - "upper": 0.0321, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Indonesian", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Indonesian", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.885, - "uncertainty": { - "confidence_interval": { - "lower": -0.0313, - "upper": 0.0313, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Italian", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Italian", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.88, - "uncertainty": { - "confidence_interval": { - "lower": -0.0318, - "upper": 0.0318, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Japanese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Japanese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8725, - "uncertainty": { - "confidence_interval": { - "lower": -0.0327, - "upper": 0.0327, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Korean", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Korean", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.87, - "uncertainty": { - "confidence_interval": { - "lower": -0.033, - "upper": 0.033, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Portuguese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Portuguese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.875, - "uncertainty": { - "confidence_interval": { - "lower": -0.0324, - "upper": 0.0324, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Spanish", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Spanish", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.885, - "uncertainty": { - "confidence_interval": { - "lower": -0.0313, - "upper": 0.0313, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Swahili", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Swahili", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8725, - "uncertainty": { - "confidence_interval": { - "lower": -0.0327, - "upper": 0.0327, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Yoruba", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - 
"https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Yoruba", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.875, - "uncertainty": { - "confidence_interval": { - "lower": -0.0324, - "upper": 0.0324, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Chinese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Chinese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.87, - "uncertainty": { - "confidence_interval": { - "lower": -0.033, - "upper": 0.033, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Burmese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Burmese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8575, - "uncertainty": { - "confidence_interval": { - "lower": -0.0343, - "upper": 0.0343, - "method": "unknown" - } - } - } - } - ] -} \ No newline at end of file diff --git a/data/global-mmlu-lite/openai/gpt-5-2025-08-07/a668c931-34e4-4702-a84c-97d8c6f59ef4.json b/data/global-mmlu-lite/openai/gpt-5-2025-08-07/a668c931-34e4-4702-a84c-97d8c6f59ef4.json deleted file mode 100644 index 7b0435821..000000000 --- a/data/global-mmlu-lite/openai/gpt-5-2025-08-07/a668c931-34e4-4702-a84c-97d8c6f59ef4.json +++ /dev/null @@ -1,515 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "global-mmlu-lite/openai_gpt-5-2025-08-07/1770822797.839372", - "retrieved_timestamp": "1770822797.839372", - "source_metadata": { - "source_name": "Global MMLU Lite Leaderboard", - "source_type": "documentation", - "source_organization_name": "kaggle", - "source_organization_url": "www.kaggle.com", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "gpt-5-2025-08-07", - "id": "openai/gpt-5-2025-08-07", - "developer": "openai", - "inference_platform": "unknown", - "additional_details": { - "display_name": "GPT-5" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Global MMLU Lite", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Global MMLU Lite", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8895 - } - }, - { - "evaluation_name": "Culturally Sensitive", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Culturally Sensitive", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8913 - } - }, - { - "evaluation_name": "Culturally Agnostic", - "source_data": { - "dataset_name": "global-mmlu-lite", - 
"source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Culturally Agnostic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8878 - } - }, - { - "evaluation_name": "Arabic", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Arabic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8925, - "uncertainty": { - "confidence_interval": { - "lower": -0.0304, - "upper": 0.0304, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "English", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - English", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8725, - "uncertainty": { - "confidence_interval": { - "lower": -0.0327, - "upper": 0.0327, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Bengali", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Bengali", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9, - "uncertainty": { - "confidence_interval": { - "lower": -0.0294, - "upper": 0.0294, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "German", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - German", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.91, - "uncertainty": { - "confidence_interval": { - "lower": -0.028, - "upper": 0.028, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "French", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - French", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9075, - "uncertainty": { - "confidence_interval": { - "lower": -0.0284, - "upper": 0.0284, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Hindi", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Hindi", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.865, - "uncertainty": { - "confidence_interval": { - "lower": -0.0335, 
- "upper": 0.0335, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Indonesian", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Indonesian", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.795, - "uncertainty": { - "confidence_interval": { - "lower": -0.0396, - "upper": 0.0396, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Italian", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Italian", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9075, - "uncertainty": { - "confidence_interval": { - "lower": -0.0284, - "upper": 0.0284, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Japanese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Japanese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8875, - "uncertainty": { - "confidence_interval": { - "lower": -0.031, - "upper": 0.031, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Korean", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Korean", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.915, - "uncertainty": { - "confidence_interval": { - "lower": -0.0273, - "upper": 0.0273, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Portuguese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Portuguese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8875, - "uncertainty": { - "confidence_interval": { - "lower": -0.031, - "upper": 0.031, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Spanish", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Spanish", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.905, - "uncertainty": { - "confidence_interval": { - "lower": -0.0287, - "upper": 0.0287, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Swahili", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - 
"metric_config": { - "evaluation_description": "Global MMLU Lite - Swahili", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.865, - "uncertainty": { - "confidence_interval": { - "lower": -0.0335, - "upper": 0.0335, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Yoruba", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Yoruba", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9125, - "uncertainty": { - "confidence_interval": { - "lower": -0.0277, - "upper": 0.0277, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Chinese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Chinese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.895, - "uncertainty": { - "confidence_interval": { - "lower": -0.03, - "upper": 0.03, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Burmese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Burmese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.915, - "uncertainty": { - "confidence_interval": { - "lower": -0.0273, - "upper": 0.0273, - "method": "unknown" - } - } - } - } - ] -} \ No newline at end of file diff --git a/data/global-mmlu-lite/openai/o3-mini-2025-01-31/3a7e2aa6-4e57-446f-a127-4a7e022fe3e1.json b/data/global-mmlu-lite/openai/o3-mini-2025-01-31/3a7e2aa6-4e57-446f-a127-4a7e022fe3e1.json deleted file mode 100644 index 0d22ba810..000000000 --- a/data/global-mmlu-lite/openai/o3-mini-2025-01-31/3a7e2aa6-4e57-446f-a127-4a7e022fe3e1.json +++ /dev/null @@ -1,515 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "global-mmlu-lite/openai_o3-mini-2025-01-31/1770822797.839372", - "retrieved_timestamp": "1770822797.839372", - "source_metadata": { - "source_name": "Global MMLU Lite Leaderboard", - "source_type": "documentation", - "source_organization_name": "kaggle", - "source_organization_url": "www.kaggle.com", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "o3-mini-2025-01-31", - "id": "openai/o3-mini-2025-01-31", - "developer": "openai", - "inference_platform": "unknown", - "additional_details": { - "display_name": "o3 mini" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Global MMLU Lite", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Global MMLU Lite", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.78 - } - }, - { - "evaluation_name": "Culturally Sensitive", - "source_data": { - "dataset_name": 
"global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Culturally Sensitive", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.765 - } - }, - { - "evaluation_name": "Culturally Agnostic", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Culturally Agnostic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.795 - } - }, - { - "evaluation_name": "Arabic", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Arabic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7725, - "uncertainty": { - "confidence_interval": { - "lower": -0.0411, - "upper": 0.0411, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "English", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - English", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8025, - "uncertainty": { - "confidence_interval": { - "lower": -0.039, - "upper": 0.039, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Bengali", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Bengali", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.77, - "uncertainty": { - "confidence_interval": { - "lower": -0.0412, - "upper": 0.0412, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "German", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - German", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7525, - "uncertainty": { - "confidence_interval": { - "lower": -0.0423, - "upper": 0.0423, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "French", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - French", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.74, - "uncertainty": { - "confidence_interval": { - "lower": -0.043, - "upper": 0.043, - "method": "unknown" - } - } - } - }, - { - 
"evaluation_name": "Hindi", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Hindi", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7525, - "uncertainty": { - "confidence_interval": { - "lower": -0.0423, - "upper": 0.0423, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Indonesian", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Indonesian", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7425, - "uncertainty": { - "confidence_interval": { - "lower": -0.0429, - "upper": 0.0429, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Italian", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Italian", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8, - "uncertainty": { - "confidence_interval": { - "lower": -0.0392, - "upper": 0.0392, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Japanese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Japanese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.81, - "uncertainty": { - "confidence_interval": { - "lower": -0.0384, - "upper": 0.0384, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Korean", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Korean", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8075, - "uncertainty": { - "confidence_interval": { - "lower": -0.0386, - "upper": 0.0386, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Portuguese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Portuguese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7975, - "uncertainty": { - "confidence_interval": { - "lower": -0.0394, - "upper": 0.0394, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Spanish", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - 
Spanish", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.775, - "uncertainty": { - "confidence_interval": { - "lower": -0.0409, - "upper": 0.0409, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Swahili", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Swahili", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.765, - "uncertainty": { - "confidence_interval": { - "lower": -0.0416, - "upper": 0.0416, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Yoruba", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Yoruba", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7725, - "uncertainty": { - "confidence_interval": { - "lower": -0.0411, - "upper": 0.0411, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Chinese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Chinese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8125, - "uncertainty": { - "confidence_interval": { - "lower": -0.0382, - "upper": 0.0382, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Burmese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Burmese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8075, - "uncertainty": { - "confidence_interval": { - "lower": -0.0386, - "upper": 0.0386, - "method": "unknown" - } - } - } - } - ] -} \ No newline at end of file diff --git a/data/global-mmlu-lite/unknown/aya-expanse-32b/938a35f1-195d-49c8-9a16-90fab96692bd.json b/data/global-mmlu-lite/unknown/aya-expanse-32b/938a35f1-195d-49c8-9a16-90fab96692bd.json deleted file mode 100644 index 4e5593fdf..000000000 --- a/data/global-mmlu-lite/unknown/aya-expanse-32b/938a35f1-195d-49c8-9a16-90fab96692bd.json +++ /dev/null @@ -1,515 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "global-mmlu-lite/unknown_aya-expanse-32b/1770822797.839372", - "retrieved_timestamp": "1770822797.839372", - "source_metadata": { - "source_name": "Global MMLU Lite Leaderboard", - "source_type": "documentation", - "source_organization_name": "kaggle", - "source_organization_url": "www.kaggle.com", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "aya-expanse-32b", - "id": "unknown/aya-expanse-32b", - "developer": "unknown", - "inference_platform": "unknown", - "additional_details": { - "display_name": "Aya Expanse 32B" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Global MMLU Lite", - 
"source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Global MMLU Lite", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7353 - } - }, - { - "evaluation_name": "Culturally Sensitive", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Culturally Sensitive", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6891 - } - }, - { - "evaluation_name": "Culturally Agnostic", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Culturally Agnostic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7815 - } - }, - { - "evaluation_name": "Arabic", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Arabic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7425, - "uncertainty": { - "confidence_interval": { - "lower": -0.0429, - "upper": 0.0429, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "English", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - English", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7544, - "uncertainty": { - "confidence_interval": { - "lower": -0.0422, - "upper": 0.0422, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Bengali", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Bengali", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7343, - "uncertainty": { - "confidence_interval": { - "lower": -0.0433, - "upper": 0.0433, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "German", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - German", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7425, - "uncertainty": { - "confidence_interval": { - "lower": -0.0429, - "upper": 0.0429, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "French", - "source_data": { - 
"dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - French", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7325, - "uncertainty": { - "confidence_interval": { - "lower": -0.0434, - "upper": 0.0434, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Hindi", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Hindi", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7375, - "uncertainty": { - "confidence_interval": { - "lower": -0.0431, - "upper": 0.0431, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Indonesian", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Indonesian", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7594, - "uncertainty": { - "confidence_interval": { - "lower": -0.0419, - "upper": 0.0419, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Italian", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Italian", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7305, - "uncertainty": { - "confidence_interval": { - "lower": -0.0436, - "upper": 0.0436, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Japanese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Japanese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7419, - "uncertainty": { - "confidence_interval": { - "lower": -0.0429, - "upper": 0.0429, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Korean", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Korean", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7525, - "uncertainty": { - "confidence_interval": { - "lower": -0.0423, - "upper": 0.0423, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Portuguese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Portuguese", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7544, - "uncertainty": { - "confidence_interval": { - "lower": -0.0422, - "upper": 0.0422, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Spanish", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Spanish", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7362, - "uncertainty": { - "confidence_interval": { - "lower": -0.0433, - "upper": 0.0433, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Swahili", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Swahili", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7071, - "uncertainty": { - "confidence_interval": { - "lower": -0.0448, - "upper": 0.0448, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Yoruba", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Yoruba", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6942, - "uncertainty": { - "confidence_interval": { - "lower": -0.0452, - "upper": 0.0452, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Chinese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Chinese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.743, - "uncertainty": { - "confidence_interval": { - "lower": -0.0432, - "upper": 0.0432, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Burmese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Burmese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7025, - "uncertainty": { - "confidence_interval": { - "lower": -0.0448, - "upper": 0.0448, - "method": "unknown" - } - } - } - } - ] -} \ No newline at end of file diff --git a/data/global-mmlu-lite/unknown/granite-4.0-h-small/ce756801-f75e-4250-9721-1d627a37f055.json b/data/global-mmlu-lite/unknown/granite-4.0-h-small/ce756801-f75e-4250-9721-1d627a37f055.json deleted file mode 100644 index fd8643d63..000000000 --- a/data/global-mmlu-lite/unknown/granite-4.0-h-small/ce756801-f75e-4250-9721-1d627a37f055.json +++ /dev/null @@ -1,515 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "global-mmlu-lite/unknown_granite-4.0-h-small/1770822797.839372", - "retrieved_timestamp": "1770822797.839372", - 
"source_metadata": { - "source_name": "Global MMLU Lite Leaderboard", - "source_type": "documentation", - "source_organization_name": "kaggle", - "source_organization_url": "www.kaggle.com", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "granite-4.0-h-small", - "id": "unknown/granite-4.0-h-small", - "developer": "unknown", - "inference_platform": "unknown", - "additional_details": { - "display_name": "Granite 4.0 Small" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Global MMLU Lite", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Global MMLU Lite", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7503 - } - }, - { - "evaluation_name": "Culturally Sensitive", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Culturally Sensitive", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7182 - } - }, - { - "evaluation_name": "Culturally Agnostic", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Culturally Agnostic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7826 - } - }, - { - "evaluation_name": "Arabic", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Arabic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7613, - "uncertainty": { - "confidence_interval": { - "lower": -0.0419, - "upper": 0.0419, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "English", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - English", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.77, - "uncertainty": { - "confidence_interval": { - "lower": -0.0412, - "upper": 0.0412, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Bengali", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Bengali", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7613, - "uncertainty": { - "confidence_interval": { - "lower": -0.0419, - "upper": 0.0419, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "German", - "source_data": { - 
"dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - German", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.755, - "uncertainty": { - "confidence_interval": { - "lower": -0.0421, - "upper": 0.0421, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "French", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - French", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7594, - "uncertainty": { - "confidence_interval": { - "lower": -0.0419, - "upper": 0.0419, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Hindi", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Hindi", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7575, - "uncertainty": { - "confidence_interval": { - "lower": -0.042, - "upper": 0.042, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Indonesian", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Indonesian", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7614, - "uncertainty": { - "confidence_interval": { - "lower": -0.0421, - "upper": 0.0421, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Italian", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Italian", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7525, - "uncertainty": { - "confidence_interval": { - "lower": -0.0423, - "upper": 0.0423, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Japanese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Japanese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7406, - "uncertainty": { - "confidence_interval": { - "lower": -0.0431, - "upper": 0.0431, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Korean", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Korean", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7525, - "uncertainty": { - "confidence_interval": { - "lower": -0.0423, - "upper": 0.0423, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Portuguese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Portuguese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.757, - "uncertainty": { - "confidence_interval": { - "lower": -0.0423, - "upper": 0.0423, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Spanish", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Spanish", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7638, - "uncertainty": { - "confidence_interval": { - "lower": -0.0417, - "upper": 0.0417, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Swahili", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Swahili", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7318, - "uncertainty": { - "confidence_interval": { - "lower": -0.0435, - "upper": 0.0435, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Yoruba", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Yoruba", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6921, - "uncertainty": { - "confidence_interval": { - "lower": -0.0456, - "upper": 0.0456, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Chinese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Chinese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7475, - "uncertainty": { - "confidence_interval": { - "lower": -0.0426, - "upper": 0.0426, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Burmese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Burmese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7419, - "uncertainty": { - "confidence_interval": { - "lower": -0.0429, - "upper": 0.0429, - "method": "unknown" - } - } - } - } - ] -} \ No newline at end of file diff --git 
a/data/global-mmlu-lite/unknown/o4-mini-2025-04-16/b83b41d4-6c95-4c7d-a290-65d89bf776c2.json b/data/global-mmlu-lite/unknown/o4-mini-2025-04-16/b83b41d4-6c95-4c7d-a290-65d89bf776c2.json deleted file mode 100644 index 95a579825..000000000 --- a/data/global-mmlu-lite/unknown/o4-mini-2025-04-16/b83b41d4-6c95-4c7d-a290-65d89bf776c2.json +++ /dev/null @@ -1,515 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "global-mmlu-lite/unknown_o4-mini-2025-04-16/1770822797.839372", - "retrieved_timestamp": "1770822797.839372", - "source_metadata": { - "source_name": "Global MMLU Lite Leaderboard", - "source_type": "documentation", - "source_organization_name": "kaggle", - "source_organization_url": "www.kaggle.com", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "o4-mini-2025-04-16", - "id": "unknown/o4-mini-2025-04-16", - "developer": "unknown", - "inference_platform": "unknown", - "additional_details": { - "display_name": "o4 mini" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Global MMLU Lite", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Global MMLU Lite", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8705 - } - }, - { - "evaluation_name": "Culturally Sensitive", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Culturally Sensitive", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8503 - } - }, - { - "evaluation_name": "Culturally Agnostic", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Culturally Agnostic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8906 - } - }, - { - "evaluation_name": "Arabic", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Arabic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.865, - "uncertainty": { - "confidence_interval": { - "lower": -0.0335, - "upper": 0.0335, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "English", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - English", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8675, - "uncertainty": { - "confidence_interval": { - "lower": -0.0332, - "upper": 0.0332, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Bengali", - "source_data": { - "dataset_name": "global-mmlu-lite", - 
"source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Bengali", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8875, - "uncertainty": { - "confidence_interval": { - "lower": -0.031, - "upper": 0.031, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "German", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - German", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8775, - "uncertainty": { - "confidence_interval": { - "lower": -0.0321, - "upper": 0.0321, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "French", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - French", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.87, - "uncertainty": { - "confidence_interval": { - "lower": -0.033, - "upper": 0.033, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Hindi", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Hindi", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.87, - "uncertainty": { - "confidence_interval": { - "lower": -0.033, - "upper": 0.033, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Indonesian", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Indonesian", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8675, - "uncertainty": { - "confidence_interval": { - "lower": -0.0332, - "upper": 0.0332, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Italian", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Italian", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.855, - "uncertainty": { - "confidence_interval": { - "lower": -0.0345, - "upper": 0.0345, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Japanese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Japanese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 
- }, - "score_details": { - "score": 0.885, - "uncertainty": { - "confidence_interval": { - "lower": -0.0313, - "upper": 0.0313, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Korean", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Korean", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.88, - "uncertainty": { - "confidence_interval": { - "lower": -0.0318, - "upper": 0.0318, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Portuguese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Portuguese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.88, - "uncertainty": { - "confidence_interval": { - "lower": -0.0318, - "upper": 0.0318, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Spanish", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Spanish", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.855, - "uncertainty": { - "confidence_interval": { - "lower": -0.0345, - "upper": 0.0345, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Swahili", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Swahili", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8525, - "uncertainty": { - "confidence_interval": { - "lower": -0.0348, - "upper": 0.0348, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Yoruba", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Yoruba", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8525, - "uncertainty": { - "confidence_interval": { - "lower": -0.0348, - "upper": 0.0348, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Chinese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Chinese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.89, - "uncertainty": { - "confidence_interval": { - "lower": -0.0307, - "upper": 0.0307, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Burmese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": 
"url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Burmese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8725, - "uncertainty": { - "confidence_interval": { - "lower": -0.0327, - "upper": 0.0327, - "method": "unknown" - } - } - } - } - ] -} \ No newline at end of file diff --git a/data/global-mmlu-lite/xai/grok-3-mini/31c3fe1b-be4b-42ef-8ec0-9da323b2ebb6.json b/data/global-mmlu-lite/xai/grok-3-mini/31c3fe1b-be4b-42ef-8ec0-9da323b2ebb6.json deleted file mode 100644 index f816ebb33..000000000 --- a/data/global-mmlu-lite/xai/grok-3-mini/31c3fe1b-be4b-42ef-8ec0-9da323b2ebb6.json +++ /dev/null @@ -1,515 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "global-mmlu-lite/xai_grok-3-mini/1770822797.839372", - "retrieved_timestamp": "1770822797.839372", - "source_metadata": { - "source_name": "Global MMLU Lite Leaderboard", - "source_type": "documentation", - "source_organization_name": "kaggle", - "source_organization_url": "www.kaggle.com", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "grok-3-mini", - "id": "xai/grok-3-mini", - "developer": "xai", - "inference_platform": "unknown", - "additional_details": { - "display_name": "Grok 3 Mini" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Global MMLU Lite", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Global MMLU Lite", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.673 - } - }, - { - "evaluation_name": "Culturally Sensitive", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Culturally Sensitive", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6717 - } - }, - { - "evaluation_name": "Culturally Agnostic", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Culturally Agnostic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6743 - } - }, - { - "evaluation_name": "Arabic", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Arabic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.755, - "uncertainty": { - "confidence_interval": { - "lower": -0.0421, - "upper": 0.0421, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "English", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { 
- "evaluation_description": "Global MMLU Lite - English", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5075, - "uncertainty": { - "confidence_interval": { - "lower": -0.049, - "upper": 0.049, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Bengali", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Bengali", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7355, - "uncertainty": { - "confidence_interval": { - "lower": -0.0434, - "upper": 0.0434, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "German", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - German", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6591, - "uncertainty": { - "confidence_interval": { - "lower": -0.0465, - "upper": 0.0465, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "French", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - French", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.485, - "uncertainty": { - "confidence_interval": { - "lower": -0.049, - "upper": 0.049, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Hindi", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Hindi", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.56, - "uncertainty": { - "confidence_interval": { - "lower": -0.0486, - "upper": 0.0486, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Indonesian", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Indonesian", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.725, - "uncertainty": { - "confidence_interval": { - "lower": -0.0438, - "upper": 0.0438, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Italian", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Italian", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.696, - "uncertainty": { - "confidence_interval": { - "lower": -0.0452, - "upper": 
0.0452, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Japanese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Japanese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6575, - "uncertainty": { - "confidence_interval": { - "lower": -0.0465, - "upper": 0.0465, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Korean", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Korean", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7325, - "uncertainty": { - "confidence_interval": { - "lower": -0.0434, - "upper": 0.0434, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Portuguese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Portuguese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6275, - "uncertainty": { - "confidence_interval": { - "lower": -0.0474, - "upper": 0.0474, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Spanish", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Spanish", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.61, - "uncertainty": { - "confidence_interval": { - "lower": -0.0478, - "upper": 0.0478, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Swahili", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Swahili", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7625, - "uncertainty": { - "confidence_interval": { - "lower": -0.0417, - "upper": 0.0417, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Yoruba", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Yoruba", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8296, - "uncertainty": { - "confidence_interval": { - "lower": -0.0369, - "upper": 0.0369, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Chinese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { 
- "evaluation_description": "Global MMLU Lite - Chinese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5564, - "uncertainty": { - "confidence_interval": { - "lower": -0.0487, - "upper": 0.0487, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Burmese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Burmese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8693, - "uncertainty": { - "confidence_interval": { - "lower": -0.0331, - "upper": 0.0331, - "method": "unknown" - } - } - } - } - ] -} \ No newline at end of file diff --git a/data/global-mmlu-lite/xai/grok-4-0709/a8e0fc0e-b3a4-4a0b-938f-aa11f1c64358.json b/data/global-mmlu-lite/xai/grok-4-0709/a8e0fc0e-b3a4-4a0b-938f-aa11f1c64358.json deleted file mode 100644 index 4e37c60a0..000000000 --- a/data/global-mmlu-lite/xai/grok-4-0709/a8e0fc0e-b3a4-4a0b-938f-aa11f1c64358.json +++ /dev/null @@ -1,515 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "global-mmlu-lite/xai_grok-4-0709/1770822797.839372", - "retrieved_timestamp": "1770822797.839372", - "source_metadata": { - "source_name": "Global MMLU Lite Leaderboard", - "source_type": "documentation", - "source_organization_name": "kaggle", - "source_organization_url": "www.kaggle.com", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "grok-4-0709", - "id": "xai/grok-4-0709", - "developer": "xai", - "inference_platform": "unknown", - "additional_details": { - "display_name": "Grok 4" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Global MMLU Lite", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Global MMLU Lite", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8881 - } - }, - { - "evaluation_name": "Culturally Sensitive", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Culturally Sensitive", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8862 - } - }, - { - "evaluation_name": "Culturally Agnostic", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Culturally Agnostic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.89 - } - }, - { - "evaluation_name": "Arabic", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Arabic", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.885, - "uncertainty": { - "confidence_interval": { - "lower": -0.0313, - "upper": 0.0313, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "English", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - English", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.905, - "uncertainty": { - "confidence_interval": { - "lower": -0.0287, - "upper": 0.0287, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Bengali", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Bengali", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8925, - "uncertainty": { - "confidence_interval": { - "lower": -0.0304, - "upper": 0.0304, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "German", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - German", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8725, - "uncertainty": { - "confidence_interval": { - "lower": -0.0327, - "upper": 0.0327, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "French", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - French", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.875, - "uncertainty": { - "confidence_interval": { - "lower": -0.0324, - "upper": 0.0324, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Hindi", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Hindi", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8675, - "uncertainty": { - "confidence_interval": { - "lower": -0.0332, - "upper": 0.0332, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Indonesian", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Indonesian", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.89, - "uncertainty": { - "confidence_interval": { - "lower": -0.0307, - "upper": 0.0307, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Italian", - "source_data": { - "dataset_name": 
"global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Italian", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9025, - "uncertainty": { - "confidence_interval": { - "lower": -0.0291, - "upper": 0.0291, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Japanese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Japanese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.87, - "uncertainty": { - "confidence_interval": { - "lower": -0.033, - "upper": 0.033, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Korean", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Korean", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.895, - "uncertainty": { - "confidence_interval": { - "lower": -0.03, - "upper": 0.03, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Portuguese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Portuguese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8725, - "uncertainty": { - "confidence_interval": { - "lower": -0.0327, - "upper": 0.0327, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Spanish", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Spanish", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9075, - "uncertainty": { - "confidence_interval": { - "lower": -0.0284, - "upper": 0.0284, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Swahili", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Swahili", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.91, - "uncertainty": { - "confidence_interval": { - "lower": -0.028, - "upper": 0.028, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Yoruba", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Yoruba", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 
0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.905, - "uncertainty": { - "confidence_interval": { - "lower": -0.0287, - "upper": 0.0287, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Chinese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Chinese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8525, - "uncertainty": { - "confidence_interval": { - "lower": -0.0348, - "upper": 0.0348, - "method": "unknown" - } - } - } - }, - { - "evaluation_name": "Burmese", - "source_data": { - "dataset_name": "global-mmlu-lite", - "source_type": "url", - "url": [ - "https://www.kaggle.com/datasets/cohere-labs/global-mmlu-lite" - ] - }, - "metric_config": { - "evaluation_description": "Global MMLU Lite - Burmese", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9075, - "uncertainty": { - "confidence_interval": { - "lower": -0.0284, - "upper": 0.0284, - "method": "unknown" - } - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_capabilities/allenai/olmo-2-0325-32b-instruct/8948bfb0-cc9d-40f7-a02d-d5c9611436d8.json b/data/helm_capabilities/allenai/olmo-2-0325-32b-instruct/8948bfb0-cc9d-40f7-a02d-d5c9611436d8.json deleted file mode 100644 index 8176fa91a..000000000 --- a/data/helm_capabilities/allenai/olmo-2-0325-32b-instruct/8948bfb0-cc9d-40f7-a02d-d5c9611436d8.json +++ /dev/null @@ -1,352 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/allenai_olmo-2-0325-32b-instruct/1770835969.095764", - "retrieved_timestamp": "1770835969.095764", - "source_metadata": { - "source_name": "helm_capabilities", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "OLMo 2 32B Instruct March 2025", - "id": "allenai/olmo-2-0325-32b-instruct", - "developer": "allenai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean score", - "source_data": { - "dataset_name": "helm_capabilities", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "The mean of the scores from all columns.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.475, - "details": { - "tab": "Accuracy", - "Mean score - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 191.7591204277284 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU-Pro", - "source_data": { - "dataset_name": "MMLU-Pro", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on MMLU-Pro", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.414, - "details": { - "description": "min=0.414, mean=0.414, max=0.414, sum=0.414 (1)", - "tab": 
"Accuracy", - "MMLU-Pro - Observed inference time (s)": { - "description": "min=106.958, mean=106.958, max=106.958, sum=106.958 (1)", - "tab": "Efficiency", - "score": 106.95772108364105 - }, - "MMLU-Pro - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "MMLU-Pro - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - # prompt tokens": { - "description": "min=228.506, mean=228.506, max=228.506, sum=228.506 (1)", - "tab": "General information", - "score": 228.506 - }, - "MMLU-Pro - # output tokens": { - "description": "min=338.34, mean=338.34, max=338.34, sum=338.34 (1)", - "tab": "General information", - "score": 338.34 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false", - "num_output_tokens": "2048" - } - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.287, - "details": { - "description": "min=0.287, mean=0.287, max=0.287, sum=0.287 (1)", - "tab": "Accuracy", - "GPQA - Observed inference time (s)": { - "description": "min=161.247, mean=161.247, max=161.247, sum=161.247 (1)", - "tab": "Efficiency", - "score": 161.24673478646127 - }, - "GPQA - # eval": { - "description": "min=446, mean=446, max=446, sum=446 (1)", - "tab": "General information", - "score": 446.0 - }, - "GPQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - truncated": { - "description": "min=0.002, mean=0.002, max=0.002, sum=0.002 (1)", - "tab": "General information", - "score": 0.002242152466367713 - }, - "GPQA - # prompt tokens": { - "description": "min=247.26, mean=247.26, max=247.26, sum=247.26 (1)", - "tab": "General information", - "score": 247.26008968609867 - }, - "GPQA - # output tokens": { - "description": "min=526.352, mean=526.352, max=526.352, sum=526.352 (1)", - "tab": "General information", - "score": 526.3520179372198 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false", - "num_output_tokens": "2048" - } - } - }, - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "IFEval Strict Acc on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.78, - "details": { - "description": "min=0.78, mean=0.78, max=0.78, sum=0.78 (1)", - "tab": "Accuracy", - "IFEval - Observed inference time (s)": { - "description": "min=78.302, mean=78.302, max=78.302, sum=78.302 (1)", - "tab": "Efficiency", - "score": 78.30223875301382 - }, - "IFEval 
- # eval": { - "description": "min=541, mean=541, max=541, sum=541 (1)", - "tab": "General information", - "score": 541.0 - }, - "IFEval - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - # prompt tokens": { - "description": "min=46.054, mean=46.054, max=46.054, sum=46.054 (1)", - "tab": "General information", - "score": 46.05360443622921 - }, - "IFEval - # output tokens": { - "description": "min=260.017, mean=260.017, max=260.017, sum=260.017 (1)", - "tab": "General information", - "score": 260.0166358595194 - } - } - }, - "generation_config": { - "additional_details": { - "num_output_tokens": "2048" - } - } - }, - { - "evaluation_name": "WildBench", - "source_data": { - "dataset_name": "WildBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "WB Score on WildBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.734, - "details": { - "description": "min=0.734, mean=0.734, max=0.734, sum=0.734 (1)", - "tab": "Accuracy", - "WildBench - Observed inference time (s)": { - "description": "min=333.659, mean=333.659, max=333.659, sum=333.659 (1)", - "tab": "Efficiency", - "score": 333.659037665844 - }, - "WildBench - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "WildBench - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # prompt tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # output tokens": { - "description": "min=764.742, mean=764.742, max=764.742, sum=764.742 (1)", - "tab": "General information", - "score": 764.742 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "v2", - "num_output_tokens": "2048" - } - } - }, - { - "evaluation_name": "Omni-MATH", - "source_data": { - "dataset_name": "Omni-MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Acc on Omni-MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.161, - "details": { - "description": "min=0.161, mean=0.161, max=0.161, sum=0.161 (1)", - "tab": "Accuracy", - "Omni-MATH - Observed inference time (s)": { - "description": "min=278.63, mean=278.63, max=278.63, sum=278.63 (1)", - "tab": "Efficiency", - "score": 278.6298698496819 - }, - "Omni-MATH - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "Omni-MATH - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - truncated": { - "description": "min=0.001, mean=0.001, max=0.001, sum=0.001 (1)", - "tab": 
"General information", - "score": 0.001 - }, - "Omni-MATH - # prompt tokens": { - "description": "min=108.843, mean=108.843, max=108.843, sum=108.843 (1)", - "tab": "General information", - "score": 108.843 - }, - "Omni-MATH - # output tokens": { - "description": "min=573.483, mean=573.483, max=573.483, sum=573.483 (1)", - "tab": "General information", - "score": 573.483 - } - } - }, - "generation_config": { - "additional_details": { - "num_output_tokens": "2048" - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_capabilities/allenai/olmo-2-1124-13b-instruct/7d2d1dba-1b31-47b2-8308-f2668cf36c99.json b/data/helm_capabilities/allenai/olmo-2-1124-13b-instruct/7d2d1dba-1b31-47b2-8308-f2668cf36c99.json deleted file mode 100644 index 4d2b264af..000000000 --- a/data/helm_capabilities/allenai/olmo-2-1124-13b-instruct/7d2d1dba-1b31-47b2-8308-f2668cf36c99.json +++ /dev/null @@ -1,352 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/allenai_olmo-2-1124-13b-instruct/1770835969.095764", - "retrieved_timestamp": "1770835969.095764", - "source_metadata": { - "source_name": "helm_capabilities", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "OLMo 2 13B Instruct November 2024", - "id": "allenai/olmo-2-1124-13b-instruct", - "developer": "allenai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean score", - "source_data": { - "dataset_name": "helm_capabilities", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "The mean of the scores from all columns.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.44, - "details": { - "tab": "Accuracy", - "Mean score - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 103.93921828652563 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU-Pro", - "source_data": { - "dataset_name": "MMLU-Pro", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on MMLU-Pro", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.31, - "details": { - "description": "min=0.31, mean=0.31, max=0.31, sum=0.31 (1)", - "tab": "Accuracy", - "MMLU-Pro - Observed inference time (s)": { - "description": "min=48.22, mean=48.22, max=48.22, sum=48.22 (1)", - "tab": "Efficiency", - "score": 48.21963578557968 - }, - "MMLU-Pro - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "MMLU-Pro - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - # prompt tokens": { - "description": "min=228.506, mean=228.506, max=228.506, sum=228.506 (1)", - "tab": "General information", - "score": 228.506 - }, - "MMLU-Pro - # output tokens": { - "description": 
"min=200.755, mean=200.755, max=200.755, sum=200.755 (1)", - "tab": "General information", - "score": 200.755 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false", - "num_output_tokens": "2048" - } - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.316, - "details": { - "description": "min=0.316, mean=0.316, max=0.316, sum=0.316 (1)", - "tab": "Accuracy", - "GPQA - Observed inference time (s)": { - "description": "min=44.368, mean=44.368, max=44.368, sum=44.368 (1)", - "tab": "Efficiency", - "score": 44.36780591235567 - }, - "GPQA - # eval": { - "description": "min=446, mean=446, max=446, sum=446 (1)", - "tab": "General information", - "score": 446.0 - }, - "GPQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - truncated": { - "description": "min=0.002, mean=0.002, max=0.002, sum=0.002 (1)", - "tab": "General information", - "score": 0.002242152466367713 - }, - "GPQA - # prompt tokens": { - "description": "min=247.26, mean=247.26, max=247.26, sum=247.26 (1)", - "tab": "General information", - "score": 247.26008968609867 - }, - "GPQA - # output tokens": { - "description": "min=185.419, mean=185.419, max=185.419, sum=185.419 (1)", - "tab": "General information", - "score": 185.41928251121075 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false", - "num_output_tokens": "2048" - } - } - }, - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "IFEval Strict Acc on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.73, - "details": { - "description": "min=0.73, mean=0.73, max=0.73, sum=0.73 (1)", - "tab": "Accuracy", - "IFEval - Observed inference time (s)": { - "description": "min=71.901, mean=71.901, max=71.901, sum=71.901 (1)", - "tab": "Efficiency", - "score": 71.90055892868536 - }, - "IFEval - # eval": { - "description": "min=541, mean=541, max=541, sum=541 (1)", - "tab": "General information", - "score": 541.0 - }, - "IFEval - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - # prompt tokens": { - "description": "min=46.054, mean=46.054, max=46.054, sum=46.054 (1)", - "tab": "General information", - "score": 46.05360443622921 - }, - "IFEval - # output tokens": { - "description": "min=311.527, mean=311.527, max=311.527, sum=311.527 (1)", - "tab": "General information", - "score": 311.5268022181146 - } - } - }, - "generation_config": { - "additional_details": { - "num_output_tokens": 
"2048" - } - } - }, - { - "evaluation_name": "WildBench", - "source_data": { - "dataset_name": "WildBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "WB Score on WildBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.689, - "details": { - "description": "min=0.689, mean=0.689, max=0.689, sum=0.689 (1)", - "tab": "Accuracy", - "WildBench - Observed inference time (s)": { - "description": "min=194.337, mean=194.337, max=194.337, sum=194.337 (1)", - "tab": "Efficiency", - "score": 194.33703967285157 - }, - "WildBench - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "WildBench - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # prompt tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # output tokens": { - "description": "min=771.135, mean=771.135, max=771.135, sum=771.135 (1)", - "tab": "General information", - "score": 771.135 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "v2", - "num_output_tokens": "2048" - } - } - }, - { - "evaluation_name": "Omni-MATH", - "source_data": { - "dataset_name": "Omni-MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Acc on Omni-MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.156, - "details": { - "description": "min=0.156, mean=0.156, max=0.156, sum=0.156 (1)", - "tab": "Accuracy", - "Omni-MATH - Observed inference time (s)": { - "description": "min=160.871, mean=160.871, max=160.871, sum=160.871 (1)", - "tab": "Efficiency", - "score": 160.87105113315582 - }, - "Omni-MATH - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "Omni-MATH - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - truncated": { - "description": "min=0.001, mean=0.001, max=0.001, sum=0.001 (1)", - "tab": "General information", - "score": 0.001 - }, - "Omni-MATH - # prompt tokens": { - "description": "min=108.843, mean=108.843, max=108.843, sum=108.843 (1)", - "tab": "General information", - "score": 108.843 - }, - "Omni-MATH - # output tokens": { - "description": "min=681.572, mean=681.572, max=681.572, sum=681.572 (1)", - "tab": "General information", - "score": 681.572 - } - } - }, - "generation_config": { - "additional_details": { - "num_output_tokens": "2048" - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_capabilities/allenai/olmo-2-1124-7b-instruct/3a056f7b-1bdf-4543-9e67-1101ace67179.json b/data/helm_capabilities/allenai/olmo-2-1124-7b-instruct/3a056f7b-1bdf-4543-9e67-1101ace67179.json deleted file mode 100644 index 39fbc0d1c..000000000 --- 
a/data/helm_capabilities/allenai/olmo-2-1124-7b-instruct/3a056f7b-1bdf-4543-9e67-1101ace67179.json +++ /dev/null @@ -1,352 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/allenai_olmo-2-1124-7b-instruct/1770835969.095764", - "retrieved_timestamp": "1770835969.095764", - "source_metadata": { - "source_name": "helm_capabilities", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "OLMo 2 7B Instruct November 2024", - "id": "allenai/olmo-2-1124-7b-instruct", - "developer": "allenai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean score", - "source_data": { - "dataset_name": "helm_capabilities", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "The mean of the scores from all columns.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.405, - "details": { - "tab": "Accuracy", - "Mean score - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 164.44917339954657 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU-Pro", - "source_data": { - "dataset_name": "MMLU-Pro", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on MMLU-Pro", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.292, - "details": { - "description": "min=0.292, mean=0.292, max=0.292, sum=0.292 (1)", - "tab": "Accuracy", - "MMLU-Pro - Observed inference time (s)": { - "description": "min=65.565, mean=65.565, max=65.565, sum=65.565 (1)", - "tab": "Efficiency", - "score": 65.56540368175507 - }, - "MMLU-Pro - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "MMLU-Pro - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - # prompt tokens": { - "description": "min=228.506, mean=228.506, max=228.506, sum=228.506 (1)", - "tab": "General information", - "score": 228.506 - }, - "MMLU-Pro - # output tokens": { - "description": "min=265.659, mean=265.659, max=265.659, sum=265.659 (1)", - "tab": "General information", - "score": 265.659 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false", - "num_output_tokens": "2048" - } - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.296, - "details": { - 
"description": "min=0.296, mean=0.296, max=0.296, sum=0.296 (1)", - "tab": "Accuracy", - "GPQA - Observed inference time (s)": { - "description": "min=184.733, mean=184.733, max=184.733, sum=184.733 (1)", - "tab": "Efficiency", - "score": 184.73346061877606 - }, - "GPQA - # eval": { - "description": "min=446, mean=446, max=446, sum=446 (1)", - "tab": "General information", - "score": 446.0 - }, - "GPQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - truncated": { - "description": "min=0.002, mean=0.002, max=0.002, sum=0.002 (1)", - "tab": "General information", - "score": 0.002242152466367713 - }, - "GPQA - # prompt tokens": { - "description": "min=247.26, mean=247.26, max=247.26, sum=247.26 (1)", - "tab": "General information", - "score": 247.26008968609867 - }, - "GPQA - # output tokens": { - "description": "min=381.121, mean=381.121, max=381.121, sum=381.121 (1)", - "tab": "General information", - "score": 381.1210762331838 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false", - "num_output_tokens": "2048" - } - } - }, - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "IFEval Strict Acc on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.693, - "details": { - "description": "min=0.693, mean=0.693, max=0.693, sum=0.693 (1)", - "tab": "Accuracy", - "IFEval - Observed inference time (s)": { - "description": "min=102.503, mean=102.503, max=102.503, sum=102.503 (1)", - "tab": "Efficiency", - "score": 102.50307150909508 - }, - "IFEval - # eval": { - "description": "min=541, mean=541, max=541, sum=541 (1)", - "tab": "General information", - "score": 541.0 - }, - "IFEval - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - # prompt tokens": { - "description": "min=46.054, mean=46.054, max=46.054, sum=46.054 (1)", - "tab": "General information", - "score": 46.05360443622921 - }, - "IFEval - # output tokens": { - "description": "min=306.706, mean=306.706, max=306.706, sum=306.706 (1)", - "tab": "General information", - "score": 306.70609981515713 - } - } - }, - "generation_config": { - "additional_details": { - "num_output_tokens": "2048" - } - } - }, - { - "evaluation_name": "WildBench", - "source_data": { - "dataset_name": "WildBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "WB Score on WildBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.628, - "details": { - "description": "min=0.628, mean=0.628, max=0.628, sum=0.628 (1)", - "tab": "Accuracy", - "WildBench - Observed inference time (s)": { - "description": "min=236.772, mean=236.772, max=236.772, sum=236.772 (1)", - "tab": "Efficiency", - "score": 
236.77177815794946 - }, - "WildBench - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "WildBench - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # prompt tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # output tokens": { - "description": "min=768.348, mean=768.348, max=768.348, sum=768.348 (1)", - "tab": "General information", - "score": 768.348 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "v2", - "num_output_tokens": "2048" - } - } - }, - { - "evaluation_name": "Omni-MATH", - "source_data": { - "dataset_name": "Omni-MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Acc on Omni-MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.116, - "details": { - "description": "min=0.116, mean=0.116, max=0.116, sum=0.116 (1)", - "tab": "Accuracy", - "Omni-MATH - Observed inference time (s)": { - "description": "min=232.672, mean=232.672, max=232.672, sum=232.672 (1)", - "tab": "Efficiency", - "score": 232.6721530301571 - }, - "Omni-MATH - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "Omni-MATH - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - truncated": { - "description": "min=0.001, mean=0.001, max=0.001, sum=0.001 (1)", - "tab": "General information", - "score": 0.001 - }, - "Omni-MATH - # prompt tokens": { - "description": "min=108.843, mean=108.843, max=108.843, sum=108.843 (1)", - "tab": "General information", - "score": 108.843 - }, - "Omni-MATH - # output tokens": { - "description": "min=799.769, mean=799.769, max=799.769, sum=799.769 (1)", - "tab": "General information", - "score": 799.769 - } - } - }, - "generation_config": { - "additional_details": { - "num_output_tokens": "2048" - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_capabilities/allenai/olmoe-1b-7b-0125-instruct/275cf2e5-5ccd-40be-be55-938c82ef6688.json b/data/helm_capabilities/allenai/olmoe-1b-7b-0125-instruct/275cf2e5-5ccd-40be-be55-938c82ef6688.json deleted file mode 100644 index 99d31c069..000000000 --- a/data/helm_capabilities/allenai/olmoe-1b-7b-0125-instruct/275cf2e5-5ccd-40be-be55-938c82ef6688.json +++ /dev/null @@ -1,352 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/allenai_olmoe-1b-7b-0125-instruct/1770835969.095764", - "retrieved_timestamp": "1770835969.095764", - "source_metadata": { - "source_name": "helm_capabilities", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "OLMoE 1B-7B Instruct January 2025", - "id": "allenai/olmoe-1b-7b-0125-instruct", - "developer": "allenai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean score", - "source_data": { - "dataset_name": 
"helm_capabilities", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "The mean of the scores from all columns.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.332, - "details": { - "tab": "Accuracy", - "Mean score - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 449.11527986486544 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU-Pro", - "source_data": { - "dataset_name": "MMLU-Pro", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on MMLU-Pro", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.169, - "details": { - "description": "min=0.169, mean=0.169, max=0.169, sum=0.169 (1)", - "tab": "Accuracy", - "MMLU-Pro - Observed inference time (s)": { - "description": "min=226.84, mean=226.84, max=226.84, sum=226.84 (1)", - "tab": "Efficiency", - "score": 226.84002213978766 - }, - "MMLU-Pro - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "MMLU-Pro - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - # prompt tokens": { - "description": "min=231.403, mean=231.403, max=231.403, sum=231.403 (1)", - "tab": "General information", - "score": 231.403 - }, - "MMLU-Pro - # output tokens": { - "description": "min=237.89, mean=237.89, max=237.89, sum=237.89 (1)", - "tab": "General information", - "score": 237.89 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false", - "num_output_tokens": "2048" - } - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.22, - "details": { - "description": "min=0.22, mean=0.22, max=0.22, sum=0.22 (1)", - "tab": "Accuracy", - "GPQA - Observed inference time (s)": { - "description": "min=263.918, mean=263.918, max=263.918, sum=263.918 (1)", - "tab": "Efficiency", - "score": 263.9177615305768 - }, - "GPQA - # eval": { - "description": "min=446, mean=446, max=446, sum=446 (1)", - "tab": "General information", - "score": 446.0 - }, - "GPQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - truncated": { - "description": "min=0.002, mean=0.002, max=0.002, sum=0.002 (1)", - "tab": "General information", - "score": 0.002242152466367713 - }, - "GPQA - # prompt tokens": { - "description": "min=249.803, mean=249.803, max=249.803, 
sum=249.803 (1)", - "tab": "General information", - "score": 249.80269058295963 - }, - "GPQA - # output tokens": { - "description": "min=302.475, mean=302.475, max=302.475, sum=302.475 (1)", - "tab": "General information", - "score": 302.47533632286996 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false", - "num_output_tokens": "2048" - } - } - }, - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "IFEval Strict Acc on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.628, - "details": { - "description": "min=0.628, mean=0.628, max=0.628, sum=0.628 (1)", - "tab": "Accuracy", - "IFEval - Observed inference time (s)": { - "description": "min=437.953, mean=437.953, max=437.953, sum=437.953 (1)", - "tab": "Efficiency", - "score": 437.95291065332407 - }, - "IFEval - # eval": { - "description": "min=541, mean=541, max=541, sum=541 (1)", - "tab": "General information", - "score": 541.0 - }, - "IFEval - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - # prompt tokens": { - "description": "min=47.782, mean=47.782, max=47.782, sum=47.782 (1)", - "tab": "General information", - "score": 47.781885397412196 - }, - "IFEval - # output tokens": { - "description": "min=432.808, mean=432.808, max=432.808, sum=432.808 (1)", - "tab": "General information", - "score": 432.80776340110907 - } - } - }, - "generation_config": { - "additional_details": { - "num_output_tokens": "2048" - } - } - }, - { - "evaluation_name": "WildBench", - "source_data": { - "dataset_name": "WildBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "WB Score on WildBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.551, - "details": { - "description": "min=0.551, mean=0.551, max=0.551, sum=0.551 (1)", - "tab": "Accuracy", - "WildBench - Observed inference time (s)": { - "description": "min=915.237, mean=915.237, max=915.237, sum=915.237 (1)", - "tab": "Efficiency", - "score": 915.2368009176254 - }, - "WildBench - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "WildBench - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # prompt tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # output tokens": { - "description": "min=972.482, mean=972.482, max=972.482, sum=972.482 (1)", - "tab": "General information", - "score": 972.482 - } - } - }, - "generation_config": { - 
"additional_details": { - "subset": "v2", - "num_output_tokens": "2048" - } - } - }, - { - "evaluation_name": "Omni-MATH", - "source_data": { - "dataset_name": "Omni-MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Acc on Omni-MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.093, - "details": { - "description": "min=0.093, mean=0.093, max=0.093, sum=0.093 (1)", - "tab": "Accuracy", - "Omni-MATH - Observed inference time (s)": { - "description": "min=401.629, mean=401.629, max=401.629, sum=401.629 (1)", - "tab": "Efficiency", - "score": 401.62890408301354 - }, - "Omni-MATH - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "Omni-MATH - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - truncated": { - "description": "min=0.001, mean=0.001, max=0.001, sum=0.001 (1)", - "tab": "General information", - "score": 0.001 - }, - "Omni-MATH - # prompt tokens": { - "description": "min=110.864, mean=110.864, max=110.864, sum=110.864 (1)", - "tab": "General information", - "score": 110.864 - }, - "Omni-MATH - # output tokens": { - "description": "min=442.229, mean=442.229, max=442.229, sum=442.229 (1)", - "tab": "General information", - "score": 442.229 - } - } - }, - "generation_config": { - "additional_details": { - "num_output_tokens": "2048" - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_capabilities/amazon/nova-lite-v1_0/43e7be99-4872-4eb1-b30b-75c44b298ab4.json b/data/helm_capabilities/amazon/nova-lite-v1_0/43e7be99-4872-4eb1-b30b-75c44b298ab4.json deleted file mode 100644 index c786f36c7..000000000 --- a/data/helm_capabilities/amazon/nova-lite-v1_0/43e7be99-4872-4eb1-b30b-75c44b298ab4.json +++ /dev/null @@ -1,345 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/amazon_nova-lite-v1:0/1770835969.095764", - "retrieved_timestamp": "1770835969.095764", - "source_metadata": { - "source_name": "helm_capabilities", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Amazon Nova Lite", - "id": "amazon/nova-lite-v1:0", - "developer": "amazon", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean score", - "source_data": { - "dataset_name": "helm_capabilities", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "The mean of the scores from all columns.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.551, - "details": { - "tab": "Accuracy", - "Mean score - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 2.6046740288354906 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU-Pro", - "source_data": { - "dataset_name": "MMLU-Pro", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on MMLU-Pro", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6, - "details": { - "description": "min=0.6, mean=0.6, max=0.6, sum=0.6 (1)", - "tab": "Accuracy", - "MMLU-Pro - Observed inference time (s)": { - "description": "min=1.375, mean=1.375, max=1.375, sum=1.375 (1)", - "tab": "Efficiency", - "score": 1.3748559999999983 - }, - "MMLU-Pro - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "MMLU-Pro - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - # prompt tokens": { - "description": "min=235.232, mean=235.232, max=235.232, sum=235.232 (1)", - "tab": "General information", - "score": 235.232 - }, - "MMLU-Pro - # output tokens": { - "description": "min=343.771, mean=343.771, max=343.771, sum=343.771 (1)", - "tab": "General information", - "score": 343.771 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.397, - "details": { - "description": "min=0.397, mean=0.397, max=0.397, sum=0.397 (1)", - "tab": "Accuracy", - "GPQA - Observed inference time (s)": { - "description": "min=2.04, mean=2.04, max=2.04, sum=2.04 (1)", - "tab": "Efficiency", - "score": 2.0404999999999998 - }, - "GPQA - # eval": { - "description": "min=446, mean=446, max=446, sum=446 (1)", - "tab": "General information", - "score": 446.0 - }, - "GPQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - # prompt tokens": { - "description": "min=264.121, mean=264.121, max=264.121, sum=264.121 (1)", - "tab": "General information", - "score": 264.1210762331838 - }, - "GPQA - # output tokens": { - "description": "min=512.256, mean=512.256, max=512.256, sum=512.256 (1)", - "tab": "General information", - "score": 512.2556053811659 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "IFEval Strict Acc on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - 
}, - "score_details": { - "score": 0.776, - "details": { - "description": "min=0.776, mean=0.776, max=0.776, sum=0.776 (1)", - "tab": "Accuracy", - "IFEval - Observed inference time (s)": { - "description": "min=3.156, mean=3.156, max=3.156, sum=3.156 (1)", - "tab": "Efficiency", - "score": 3.1562421441774484 - }, - "IFEval - # eval": { - "description": "min=541, mean=541, max=541, sum=541 (1)", - "tab": "General information", - "score": 541.0 - }, - "IFEval - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - # prompt tokens": { - "description": "min=47.58, mean=47.58, max=47.58, sum=47.58 (1)", - "tab": "General information", - "score": 47.58040665434381 - }, - "IFEval - # output tokens": { - "description": "min=412.706, mean=412.706, max=412.706, sum=412.706 (1)", - "tab": "General information", - "score": 412.70609981515713 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WildBench", - "source_data": { - "dataset_name": "WildBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "WB Score on WildBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.75, - "details": { - "description": "min=0.75, mean=0.75, max=0.75, sum=0.75 (1)", - "tab": "Accuracy", - "WildBench - Observed inference time (s)": { - "description": "min=4.034, mean=4.034, max=4.034, sum=4.034 (1)", - "tab": "Efficiency", - "score": 4.0338700000000065 - }, - "WildBench - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "WildBench - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # prompt tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # output tokens": { - "description": "min=938.586, mean=938.586, max=938.586, sum=938.586 (1)", - "tab": "General information", - "score": 938.586 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "v2" - } - } - }, - { - "evaluation_name": "Omni-MATH", - "source_data": { - "dataset_name": "Omni-MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Acc on Omni-MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.233, - "details": { - "description": "min=0.233, mean=0.233, max=0.233, sum=0.233 (1)", - "tab": "Accuracy", - "Omni-MATH - Observed inference time (s)": { - "description": "min=2.418, mean=2.418, max=2.418, sum=2.418 (1)", - "tab": "Efficiency", - "score": 2.4179019999999993 - }, - "Omni-MATH - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 
- }, - "Omni-MATH - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - # prompt tokens": { - "description": "min=117.921, mean=117.921, max=117.921, sum=117.921 (1)", - "tab": "General information", - "score": 117.921 - }, - "Omni-MATH - # output tokens": { - "description": "min=788.8, mean=788.8, max=788.8, sum=788.8 (1)", - "tab": "General information", - "score": 788.8 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_capabilities/amazon/nova-micro-v1_0/cfc99298-4570-48cf-9187-aa0d167cc0ba.json b/data/helm_capabilities/amazon/nova-micro-v1_0/cfc99298-4570-48cf-9187-aa0d167cc0ba.json deleted file mode 100644 index 6219cdf47..000000000 --- a/data/helm_capabilities/amazon/nova-micro-v1_0/cfc99298-4570-48cf-9187-aa0d167cc0ba.json +++ /dev/null @@ -1,345 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/amazon_nova-micro-v1:0/1770835969.095764", - "retrieved_timestamp": "1770835969.095764", - "source_metadata": { - "source_name": "helm_capabilities", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Amazon Nova Micro", - "id": "amazon/nova-micro-v1:0", - "developer": "amazon", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean score", - "source_data": { - "dataset_name": "helm_capabilities", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "The mean of the scores from all columns.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.522, - "details": { - "tab": "Accuracy", - "Mean score - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 2.157983343244118 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU-Pro", - "source_data": { - "dataset_name": "MMLU-Pro", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on MMLU-Pro", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.511, - "details": { - "description": "min=0.511, mean=0.511, max=0.511, sum=0.511 (1)", - "tab": "Accuracy", - "MMLU-Pro - Observed inference time (s)": { - "description": "min=1.316, mean=1.316, max=1.316, sum=1.316 (1)", - "tab": "Efficiency", - "score": 1.3163370000000014 - }, - "MMLU-Pro - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "MMLU-Pro - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - # prompt tokens": { - "description": "min=235.232, mean=235.232, max=235.232, sum=235.232 
(1)", - "tab": "General information", - "score": 235.232 - }, - "MMLU-Pro - # output tokens": { - "description": "min=367.695, mean=367.695, max=367.695, sum=367.695 (1)", - "tab": "General information", - "score": 367.695 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.383, - "details": { - "description": "min=0.383, mean=0.383, max=0.383, sum=0.383 (1)", - "tab": "Accuracy", - "GPQA - Observed inference time (s)": { - "description": "min=2.134, mean=2.134, max=2.134, sum=2.134 (1)", - "tab": "Efficiency", - "score": 2.1342376681614366 - }, - "GPQA - # eval": { - "description": "min=446, mean=446, max=446, sum=446 (1)", - "tab": "General information", - "score": 446.0 - }, - "GPQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - # prompt tokens": { - "description": "min=264.121, mean=264.121, max=264.121, sum=264.121 (1)", - "tab": "General information", - "score": 264.1210762331838 - }, - "GPQA - # output tokens": { - "description": "min=587.372, mean=587.372, max=587.372, sum=587.372 (1)", - "tab": "General information", - "score": 587.3721973094171 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "IFEval Strict Acc on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.76, - "details": { - "description": "min=0.76, mean=0.76, max=0.76, sum=0.76 (1)", - "tab": "Accuracy", - "IFEval - Observed inference time (s)": { - "description": "min=1.605, mean=1.605, max=1.605, sum=1.605 (1)", - "tab": "Efficiency", - "score": 1.6054140480591508 - }, - "IFEval - # eval": { - "description": "min=541, mean=541, max=541, sum=541 (1)", - "tab": "General information", - "score": 541.0 - }, - "IFEval - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - # prompt tokens": { - "description": "min=47.58, mean=47.58, max=47.58, sum=47.58 (1)", - "tab": "General information", - "score": 47.58040665434381 - }, - "IFEval - # output tokens": { - "description": "min=385.473, mean=385.473, max=385.473, sum=385.473 (1)", - "tab": "General information", - "score": 385.4731977818854 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - 
"evaluation_name": "WildBench", - "source_data": { - "dataset_name": "WildBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "WB Score on WildBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.743, - "details": { - "description": "min=0.743, mean=0.743, max=0.743, sum=0.743 (1)", - "tab": "Accuracy", - "WildBench - Observed inference time (s)": { - "description": "min=3.624, mean=3.624, max=3.624, sum=3.624 (1)", - "tab": "Efficiency", - "score": 3.6235889999999995 - }, - "WildBench - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "WildBench - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # prompt tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # output tokens": { - "description": "min=925.586, mean=925.586, max=925.586, sum=925.586 (1)", - "tab": "General information", - "score": 925.586 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "v2" - } - } - }, - { - "evaluation_name": "Omni-MATH", - "source_data": { - "dataset_name": "Omni-MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Acc on Omni-MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.214, - "details": { - "description": "min=0.214, mean=0.214, max=0.214, sum=0.214 (1)", - "tab": "Accuracy", - "Omni-MATH - Observed inference time (s)": { - "description": "min=2.11, mean=2.11, max=2.11, sum=2.11 (1)", - "tab": "Efficiency", - "score": 2.1103390000000006 - }, - "Omni-MATH - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "Omni-MATH - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - # prompt tokens": { - "description": "min=117.921, mean=117.921, max=117.921, sum=117.921 (1)", - "tab": "General information", - "score": 117.921 - }, - "Omni-MATH - # output tokens": { - "description": "min=743.286, mean=743.286, max=743.286, sum=743.286 (1)", - "tab": "General information", - "score": 743.286 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_capabilities/amazon/nova-premier-v1_0/a2162367-d16d-4274-aa89-43435cea5c0b.json b/data/helm_capabilities/amazon/nova-premier-v1_0/a2162367-d16d-4274-aa89-43435cea5c0b.json deleted file mode 100644 index d9f1bd857..000000000 --- a/data/helm_capabilities/amazon/nova-premier-v1_0/a2162367-d16d-4274-aa89-43435cea5c0b.json +++ /dev/null @@ -1,345 +0,0 @@ -{ - "schema_version": "0.2.0", - 
"evaluation_id": "helm_capabilities/amazon_nova-premier-v1:0/1770835969.095764", - "retrieved_timestamp": "1770835969.095764", - "source_metadata": { - "source_name": "helm_capabilities", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Amazon Nova Premier", - "id": "amazon/nova-premier-v1:0", - "developer": "amazon", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean score", - "source_data": { - "dataset_name": "helm_capabilities", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "The mean of the scores from all columns.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.637, - "details": { - "tab": "Accuracy", - "Mean score - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 7.8055529408801165 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU-Pro", - "source_data": { - "dataset_name": "MMLU-Pro", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on MMLU-Pro", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.726, - "details": { - "description": "min=0.726, mean=0.726, max=0.726, sum=0.726 (1)", - "tab": "Accuracy", - "MMLU-Pro - Observed inference time (s)": { - "description": "min=5.032, mean=5.032, max=5.032, sum=5.032 (1)", - "tab": "Efficiency", - "score": 5.031505000000002 - }, - "MMLU-Pro - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "MMLU-Pro - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - # prompt tokens": { - "description": "min=235.232, mean=235.232, max=235.232, sum=235.232 (1)", - "tab": "General information", - "score": 235.232 - }, - "MMLU-Pro - # output tokens": { - "description": "min=360.651, mean=360.651, max=360.651, sum=360.651 (1)", - "tab": "General information", - "score": 360.651 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.518, - "details": { - "description": "min=0.518, mean=0.518, max=0.518, sum=0.518 (1)", - "tab": "Accuracy", - "GPQA - Observed inference time (s)": { - "description": "min=6.746, mean=6.746, max=6.746, sum=6.746 (1)", - "tab": "Efficiency", - "score": 
6.7455403587443925 - }, - "GPQA - # eval": { - "description": "min=446, mean=446, max=446, sum=446 (1)", - "tab": "General information", - "score": 446.0 - }, - "GPQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - # prompt tokens": { - "description": "min=264.121, mean=264.121, max=264.121, sum=264.121 (1)", - "tab": "General information", - "score": 264.1210762331838 - }, - "GPQA - # output tokens": { - "description": "min=452.691, mean=452.691, max=452.691, sum=452.691 (1)", - "tab": "General information", - "score": 452.69058295964123 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "IFEval Strict Acc on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.803, - "details": { - "description": "min=0.803, mean=0.803, max=0.803, sum=0.803 (1)", - "tab": "Accuracy", - "IFEval - Observed inference time (s)": { - "description": "min=6.027, mean=6.027, max=6.027, sum=6.027 (1)", - "tab": "Efficiency", - "score": 6.026593345656195 - }, - "IFEval - # eval": { - "description": "min=541, mean=541, max=541, sum=541 (1)", - "tab": "General information", - "score": 541.0 - }, - "IFEval - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - # prompt tokens": { - "description": "min=47.58, mean=47.58, max=47.58, sum=47.58 (1)", - "tab": "General information", - "score": 47.58040665434381 - }, - "IFEval - # output tokens": { - "description": "min=325.945, mean=325.945, max=325.945, sum=325.945 (1)", - "tab": "General information", - "score": 325.9445471349353 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WildBench", - "source_data": { - "dataset_name": "WildBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "WB Score on WildBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.788, - "details": { - "description": "min=0.788, mean=0.788, max=0.788, sum=0.788 (1)", - "tab": "Accuracy", - "WildBench - Observed inference time (s)": { - "description": "min=13.055, mean=13.055, max=13.055, sum=13.055 (1)", - "tab": "Efficiency", - "score": 13.055127999999996 - }, - "WildBench - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "WildBench - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - truncated": { - "description": "min=0, 
mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # prompt tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # output tokens": { - "description": "min=814.969, mean=814.969, max=814.969, sum=814.969 (1)", - "tab": "General information", - "score": 814.969 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "v2" - } - } - }, - { - "evaluation_name": "Omni-MATH", - "source_data": { - "dataset_name": "Omni-MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Acc on Omni-MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.35, - "details": { - "description": "min=0.35, mean=0.35, max=0.35, sum=0.35 (1)", - "tab": "Accuracy", - "Omni-MATH - Observed inference time (s)": { - "description": "min=8.169, mean=8.169, max=8.169, sum=8.169 (1)", - "tab": "Efficiency", - "score": 8.168997999999998 - }, - "Omni-MATH - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "Omni-MATH - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - # prompt tokens": { - "description": "min=117.921, mean=117.921, max=117.921, sum=117.921 (1)", - "tab": "General information", - "score": 117.921 - }, - "Omni-MATH - # output tokens": { - "description": "min=778.909, mean=778.909, max=778.909, sum=778.909 (1)", - "tab": "General information", - "score": 778.909 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_capabilities/amazon/nova-pro-v1_0/51ef4580-da13-415a-a37f-45e2036ed4c2.json b/data/helm_capabilities/amazon/nova-pro-v1_0/51ef4580-da13-415a-a37f-45e2036ed4c2.json deleted file mode 100644 index 658945ff5..000000000 --- a/data/helm_capabilities/amazon/nova-pro-v1_0/51ef4580-da13-415a-a37f-45e2036ed4c2.json +++ /dev/null @@ -1,345 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/amazon_nova-pro-v1:0/1770835969.095764", - "retrieved_timestamp": "1770835969.095764", - "source_metadata": { - "source_name": "helm_capabilities", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Amazon Nova Pro", - "id": "amazon/nova-pro-v1:0", - "developer": "amazon", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean score", - "source_data": { - "dataset_name": "helm_capabilities", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "The mean of the scores from all columns.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.591, - "details": { - "tab": "Accuracy", - "Mean score - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 
6.538285667967472 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU-Pro", - "source_data": { - "dataset_name": "MMLU-Pro", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on MMLU-Pro", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.673, - "details": { - "description": "min=0.673, mean=0.673, max=0.673, sum=0.673 (1)", - "tab": "Accuracy", - "MMLU-Pro - Observed inference time (s)": { - "description": "min=4.554, mean=4.554, max=4.554, sum=4.554 (1)", - "tab": "Efficiency", - "score": 4.554401999999996 - }, - "MMLU-Pro - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "MMLU-Pro - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - # prompt tokens": { - "description": "min=235.232, mean=235.232, max=235.232, sum=235.232 (1)", - "tab": "General information", - "score": 235.232 - }, - "MMLU-Pro - # output tokens": { - "description": "min=381.807, mean=381.807, max=381.807, sum=381.807 (1)", - "tab": "General information", - "score": 381.807 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.446, - "details": { - "description": "min=0.446, mean=0.446, max=0.446, sum=0.446 (1)", - "tab": "Accuracy", - "GPQA - Observed inference time (s)": { - "description": "min=5.948, mean=5.948, max=5.948, sum=5.948 (1)", - "tab": "Efficiency", - "score": 5.947926008968607 - }, - "GPQA - # eval": { - "description": "min=446, mean=446, max=446, sum=446 (1)", - "tab": "General information", - "score": 446.0 - }, - "GPQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - # prompt tokens": { - "description": "min=264.121, mean=264.121, max=264.121, sum=264.121 (1)", - "tab": "General information", - "score": 264.1210762331838 - }, - "GPQA - # output tokens": { - "description": "min=534.013, mean=534.013, max=534.013, sum=534.013 (1)", - "tab": "General information", - "score": 534.0134529147982 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "IFEval Strict Acc on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.815, - "details": { - "description": "min=0.815, mean=0.815, max=0.815, sum=0.815 (1)", - "tab": "Accuracy", - "IFEval - Observed inference time (s)": { - "description": "min=3.945, mean=3.945, max=3.945, sum=3.945 (1)", - "tab": "Efficiency", - "score": 3.945081330868756 - }, - "IFEval - # eval": { - "description": "min=541, mean=541, max=541, sum=541 (1)", - "tab": "General information", - "score": 541.0 - }, - "IFEval - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - # prompt tokens": { - "description": "min=47.58, mean=47.58, max=47.58, sum=47.58 (1)", - "tab": "General information", - "score": 47.58040665434381 - }, - "IFEval - # output tokens": { - "description": "min=383.871, mean=383.871, max=383.871, sum=383.871 (1)", - "tab": "General information", - "score": 383.8706099815157 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WildBench", - "source_data": { - "dataset_name": "WildBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "WB Score on WildBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.777, - "details": { - "description": "min=0.777, mean=0.777, max=0.777, sum=0.777 (1)", - "tab": "Accuracy", - "WildBench - Observed inference time (s)": { - "description": "min=10.635, mean=10.635, max=10.635, sum=10.635 (1)", - "tab": "Efficiency", - "score": 10.635314999999995 - }, - "WildBench - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "WildBench - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # prompt tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # output tokens": { - "description": "min=899.758, mean=899.758, max=899.758, sum=899.758 (1)", - "tab": "General information", - "score": 899.758 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "v2" - } - } - }, - { - "evaluation_name": "Omni-MATH", - "source_data": { - "dataset_name": "Omni-MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Acc on Omni-MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.242, - "details": { - "description": "min=0.242, mean=0.242, max=0.242, sum=0.242 (1)", - "tab": 
"Accuracy", - "Omni-MATH - Observed inference time (s)": { - "description": "min=7.609, mean=7.609, max=7.609, sum=7.609 (1)", - "tab": "Efficiency", - "score": 7.608704000000004 - }, - "Omni-MATH - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "Omni-MATH - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - # prompt tokens": { - "description": "min=117.921, mean=117.921, max=117.921, sum=117.921 (1)", - "tab": "General information", - "score": 117.921 - }, - "Omni-MATH - # output tokens": { - "description": "min=649.195, mean=649.195, max=649.195, sum=649.195 (1)", - "tab": "General information", - "score": 649.195 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_capabilities/anthropic/claude-3-5-haiku-20241022/3fa605db-fcff-4f05-9398-6af77c9dcada.json b/data/helm_capabilities/anthropic/claude-3-5-haiku-20241022/3fa605db-fcff-4f05-9398-6af77c9dcada.json deleted file mode 100644 index d63e271d1..000000000 --- a/data/helm_capabilities/anthropic/claude-3-5-haiku-20241022/3fa605db-fcff-4f05-9398-6af77c9dcada.json +++ /dev/null @@ -1,345 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/anthropic_claude-3-5-haiku-20241022/1770835969.095764", - "retrieved_timestamp": "1770835969.095764", - "source_metadata": { - "source_name": "helm_capabilities", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Claude 3.5 Haiku 20241022", - "id": "anthropic/claude-3-5-haiku-20241022", - "developer": "anthropic", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean score", - "source_data": { - "dataset_name": "helm_capabilities", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "The mean of the scores from all columns.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.549, - "details": { - "tab": "Accuracy", - "Mean score - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 6.973328374403875 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU-Pro", - "source_data": { - "dataset_name": "MMLU-Pro", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on MMLU-Pro", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.605, - "details": { - "description": "min=0.605, mean=0.605, max=0.605, sum=0.605 (1)", - "tab": "Accuracy", - "MMLU-Pro - Observed inference time (s)": { - "description": "min=5.171, mean=5.171, max=5.171, sum=5.171 (1)", - "tab": "Efficiency", - "score": 5.170877918004989 - }, - "MMLU-Pro - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": 
"General information", - "score": 1000.0 - }, - "MMLU-Pro - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - # prompt tokens": { - "description": "min=230.461, mean=230.461, max=230.461, sum=230.461 (1)", - "tab": "General information", - "score": 230.461 - }, - "MMLU-Pro - # output tokens": { - "description": "min=253.047, mean=253.047, max=253.047, sum=253.047 (1)", - "tab": "General information", - "score": 253.047 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.363, - "details": { - "description": "min=0.363, mean=0.363, max=0.363, sum=0.363 (1)", - "tab": "Accuracy", - "GPQA - Observed inference time (s)": { - "description": "min=5.33, mean=5.33, max=5.33, sum=5.33 (1)", - "tab": "Efficiency", - "score": 5.329682314877018 - }, - "GPQA - # eval": { - "description": "min=446, mean=446, max=446, sum=446 (1)", - "tab": "General information", - "score": 446.0 - }, - "GPQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - # prompt tokens": { - "description": "min=250.738, mean=250.738, max=250.738, sum=250.738 (1)", - "tab": "General information", - "score": 250.73766816143498 - }, - "GPQA - # output tokens": { - "description": "min=270.388, mean=270.388, max=270.388, sum=270.388 (1)", - "tab": "General information", - "score": 270.38789237668163 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "IFEval Strict Acc on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.792, - "details": { - "description": "min=0.792, mean=0.792, max=0.792, sum=0.792 (1)", - "tab": "Accuracy", - "IFEval - Observed inference time (s)": { - "description": "min=5.886, mean=5.886, max=5.886, sum=5.886 (1)", - "tab": "Efficiency", - "score": 5.885677124347793 - }, - "IFEval - # eval": { - "description": "min=541, mean=541, max=541, sum=541 (1)", - "tab": "General information", - "score": 541.0 - }, - "IFEval - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - 
"IFEval - # prompt tokens": { - "description": "min=47.159, mean=47.159, max=47.159, sum=47.159 (1)", - "tab": "General information", - "score": 47.15896487985213 - }, - "IFEval - # output tokens": { - "description": "min=273.985, mean=273.985, max=273.985, sum=273.985 (1)", - "tab": "General information", - "score": 273.9852125693161 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WildBench", - "source_data": { - "dataset_name": "WildBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "WB Score on WildBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.76, - "details": { - "description": "min=0.76, mean=0.76, max=0.76, sum=0.76 (1)", - "tab": "Accuracy", - "WildBench - Observed inference time (s)": { - "description": "min=10.629, mean=10.629, max=10.629, sum=10.629 (1)", - "tab": "Efficiency", - "score": 10.62865050649643 - }, - "WildBench - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "WildBench - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # prompt tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # output tokens": { - "description": "min=544.911, mean=544.911, max=544.911, sum=544.911 (1)", - "tab": "General information", - "score": 544.911 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "v2" - } - } - }, - { - "evaluation_name": "Omni-MATH", - "source_data": { - "dataset_name": "Omni-MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Acc on Omni-MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.224, - "details": { - "description": "min=0.224, mean=0.224, max=0.224, sum=0.224 (1)", - "tab": "Accuracy", - "Omni-MATH - Observed inference time (s)": { - "description": "min=7.852, mean=7.852, max=7.852, sum=7.852 (1)", - "tab": "Efficiency", - "score": 7.851754008293152 - }, - "Omni-MATH - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "Omni-MATH - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - # prompt tokens": { - "description": "min=110.563, mean=110.563, max=110.563, sum=110.563 (1)", - "tab": "General information", - "score": 110.563 - }, - "Omni-MATH - # output tokens": { - "description": "min=409.742, mean=409.742, max=409.742, sum=409.742 (1)", - "tab": "General information", - "score": 409.742 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file 
diff --git a/data/helm_capabilities/anthropic/claude-3-5-sonnet-20241022/9d58ac39-fef7-47c8-920a-8be2069f5662.json b/data/helm_capabilities/anthropic/claude-3-5-sonnet-20241022/9d58ac39-fef7-47c8-920a-8be2069f5662.json deleted file mode 100644 index c53a3aa66..000000000 --- a/data/helm_capabilities/anthropic/claude-3-5-sonnet-20241022/9d58ac39-fef7-47c8-920a-8be2069f5662.json +++ /dev/null @@ -1,345 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/anthropic_claude-3-5-sonnet-20241022/1770835969.095764", - "retrieved_timestamp": "1770835969.095764", - "source_metadata": { - "source_name": "helm_capabilities", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Claude 3.5 Sonnet 20241022", - "id": "anthropic/claude-3-5-sonnet-20241022", - "developer": "anthropic", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean score", - "source_data": { - "dataset_name": "helm_capabilities", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "The mean of the scores from all columns.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.653, - "details": { - "tab": "Accuracy", - "Mean score - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 7.355400399849929 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU-Pro", - "source_data": { - "dataset_name": "MMLU-Pro", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on MMLU-Pro", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.777, - "details": { - "description": "min=0.777, mean=0.777, max=0.777, sum=0.777 (1)", - "tab": "Accuracy", - "MMLU-Pro - Observed inference time (s)": { - "description": "min=5.096, mean=5.096, max=5.096, sum=5.096 (1)", - "tab": "Efficiency", - "score": 5.096486385822296 - }, - "MMLU-Pro - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "MMLU-Pro - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - # prompt tokens": { - "description": "min=230.461, mean=230.461, max=230.461, sum=230.461 (1)", - "tab": "General information", - "score": 230.461 - }, - "MMLU-Pro - # output tokens": { - "description": "min=212.233, mean=212.233, max=212.233, sum=212.233 (1)", - "tab": "General information", - "score": 212.233 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.565, - "details": { - "description": "min=0.565, mean=0.565, max=0.565, sum=0.565 (1)", - "tab": "Accuracy", - "GPQA - Observed inference time (s)": { - "description": "min=6.262, mean=6.262, max=6.262, sum=6.262 (1)", - "tab": "Efficiency", - "score": 6.261580738251519 - }, - "GPQA - # eval": { - "description": "min=446, mean=446, max=446, sum=446 (1)", - "tab": "General information", - "score": 446.0 - }, - "GPQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - # prompt tokens": { - "description": "min=250.738, mean=250.738, max=250.738, sum=250.738 (1)", - "tab": "General information", - "score": 250.73766816143498 - }, - "GPQA - # output tokens": { - "description": "min=260.175, mean=260.175, max=260.175, sum=260.175 (1)", - "tab": "General information", - "score": 260.17488789237666 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "IFEval Strict Acc on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.856, - "details": { - "description": "min=0.856, mean=0.856, max=0.856, sum=0.856 (1)", - "tab": "Accuracy", - "IFEval - Observed inference time (s)": { - "description": "min=6.967, mean=6.967, max=6.967, sum=6.967 (1)", - "tab": "Efficiency", - "score": 6.966711103365293 - }, - "IFEval - # eval": { - "description": "min=541, mean=541, max=541, sum=541 (1)", - "tab": "General information", - "score": 541.0 - }, - "IFEval - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - # prompt tokens": { - "description": "min=47.159, mean=47.159, max=47.159, sum=47.159 (1)", - "tab": "General information", - "score": 47.15896487985213 - }, - "IFEval - # output tokens": { - "description": "min=299.843, mean=299.843, max=299.843, sum=299.843 (1)", - "tab": "General information", - "score": 299.84288354898337 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WildBench", - "source_data": { - "dataset_name": "WildBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "WB Score on WildBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.792, - "details": { - 
"description": "min=0.792, mean=0.792, max=0.792, sum=0.792 (1)", - "tab": "Accuracy", - "WildBench - Observed inference time (s)": { - "description": "min=10.864, mean=10.864, max=10.864, sum=10.864 (1)", - "tab": "Efficiency", - "score": 10.86402980184555 - }, - "WildBench - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "WildBench - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # prompt tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # output tokens": { - "description": "min=603.959, mean=603.959, max=603.959, sum=603.959 (1)", - "tab": "General information", - "score": 603.959 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "v2" - } - } - }, - { - "evaluation_name": "Omni-MATH", - "source_data": { - "dataset_name": "Omni-MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Acc on Omni-MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.276, - "details": { - "description": "min=0.276, mean=0.276, max=0.276, sum=0.276 (1)", - "tab": "Accuracy", - "Omni-MATH - Observed inference time (s)": { - "description": "min=7.588, mean=7.588, max=7.588, sum=7.588 (1)", - "tab": "Efficiency", - "score": 7.588193969964981 - }, - "Omni-MATH - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "Omni-MATH - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - # prompt tokens": { - "description": "min=110.563, mean=110.563, max=110.563, sum=110.563 (1)", - "tab": "General information", - "score": 110.563 - }, - "Omni-MATH - # output tokens": { - "description": "min=397.573, mean=397.573, max=397.573, sum=397.573 (1)", - "tab": "General information", - "score": 397.573 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_capabilities/anthropic/claude-3-7-sonnet-20250219/dd9b10af-ad39-45ef-8f91-097340d376c7.json b/data/helm_capabilities/anthropic/claude-3-7-sonnet-20250219/dd9b10af-ad39-45ef-8f91-097340d376c7.json deleted file mode 100644 index 1f5c52f66..000000000 --- a/data/helm_capabilities/anthropic/claude-3-7-sonnet-20250219/dd9b10af-ad39-45ef-8f91-097340d376c7.json +++ /dev/null @@ -1,345 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/anthropic_claude-3-7-sonnet-20250219/1770835969.095764", - "retrieved_timestamp": "1770835969.095764", - "source_metadata": { - "source_name": "helm_capabilities", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Claude 3.7 Sonnet 20250219", - "id": "anthropic/claude-3-7-sonnet-20250219", - "developer": "anthropic", 
- "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean score", - "source_data": { - "dataset_name": "helm_capabilities", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "The mean of the scores from all columns.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.674, - "details": { - "tab": "Accuracy", - "Mean score - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 9.05170552277221 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU-Pro", - "source_data": { - "dataset_name": "MMLU-Pro", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on MMLU-Pro", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.784, - "details": { - "description": "min=0.784, mean=0.784, max=0.784, sum=0.784 (1)", - "tab": "Accuracy", - "MMLU-Pro - Observed inference time (s)": { - "description": "min=4.744, mean=4.744, max=4.744, sum=4.744 (1)", - "tab": "Efficiency", - "score": 4.744252296209336 - }, - "MMLU-Pro - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "MMLU-Pro - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - # prompt tokens": { - "description": "min=230.461, mean=230.461, max=230.461, sum=230.461 (1)", - "tab": "General information", - "score": 230.461 - }, - "MMLU-Pro - # output tokens": { - "description": "min=242.773, mean=242.773, max=242.773, sum=242.773 (1)", - "tab": "General information", - "score": 242.773 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.608, - "details": { - "description": "min=0.608, mean=0.608, max=0.608, sum=0.608 (1)", - "tab": "Accuracy", - "GPQA - Observed inference time (s)": { - "description": "min=6.459, mean=6.459, max=6.459, sum=6.459 (1)", - "tab": "Efficiency", - "score": 6.4586481999923295 - }, - "GPQA - # eval": { - "description": "min=446, mean=446, max=446, sum=446 (1)", - "tab": "General information", - "score": 446.0 - }, - "GPQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - # 
prompt tokens": { - "description": "min=250.738, mean=250.738, max=250.738, sum=250.738 (1)", - "tab": "General information", - "score": 250.73766816143498 - }, - "GPQA - # output tokens": { - "description": "min=312.666, mean=312.666, max=312.666, sum=312.666 (1)", - "tab": "General information", - "score": 312.6659192825112 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "IFEval Strict Acc on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.834, - "details": { - "description": "min=0.834, mean=0.834, max=0.834, sum=0.834 (1)", - "tab": "Accuracy", - "IFEval - Observed inference time (s)": { - "description": "min=8.075, mean=8.075, max=8.075, sum=8.075 (1)", - "tab": "Efficiency", - "score": 8.075105538870623 - }, - "IFEval - # eval": { - "description": "min=541, mean=541, max=541, sum=541 (1)", - "tab": "General information", - "score": 541.0 - }, - "IFEval - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - # prompt tokens": { - "description": "min=47.159, mean=47.159, max=47.159, sum=47.159 (1)", - "tab": "General information", - "score": 47.15896487985213 - }, - "IFEval - # output tokens": { - "description": "min=406.532, mean=406.532, max=406.532, sum=406.532 (1)", - "tab": "General information", - "score": 406.5323475046211 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WildBench", - "source_data": { - "dataset_name": "WildBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "WB Score on WildBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.814, - "details": { - "description": "min=0.814, mean=0.814, max=0.814, sum=0.814 (1)", - "tab": "Accuracy", - "WildBench - Observed inference time (s)": { - "description": "min=15.683, mean=15.683, max=15.683, sum=15.683 (1)", - "tab": "Efficiency", - "score": 15.682527210235596 - }, - "WildBench - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "WildBench - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # prompt tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # output tokens": { - "description": "min=862.287, mean=862.287, max=862.287, sum=862.287 (1)", - "tab": "General information", - "score": 862.287 - } - } - }, - "generation_config": { - 
"additional_details": { - "subset": "v2" - } - } - }, - { - "evaluation_name": "Omni-MATH", - "source_data": { - "dataset_name": "Omni-MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Acc on Omni-MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.33, - "details": { - "description": "min=0.33, mean=0.33, max=0.33, sum=0.33 (1)", - "tab": "Accuracy", - "Omni-MATH - Observed inference time (s)": { - "description": "min=10.298, mean=10.298, max=10.298, sum=10.298 (1)", - "tab": "Efficiency", - "score": 10.297994368553162 - }, - "Omni-MATH - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "Omni-MATH - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - # prompt tokens": { - "description": "min=110.563, mean=110.563, max=110.563, sum=110.563 (1)", - "tab": "General information", - "score": 110.563 - }, - "Omni-MATH - # output tokens": { - "description": "min=670.885, mean=670.885, max=670.885, sum=670.885 (1)", - "tab": "General information", - "score": 670.885 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_capabilities/anthropic/claude-haiku-4-5-20251001/30a6de14-c57c-483e-92e9-26fc4c7f4772.json b/data/helm_capabilities/anthropic/claude-haiku-4-5-20251001/30a6de14-c57c-483e-92e9-26fc4c7f4772.json deleted file mode 100644 index da15e55a7..000000000 --- a/data/helm_capabilities/anthropic/claude-haiku-4-5-20251001/30a6de14-c57c-483e-92e9-26fc4c7f4772.json +++ /dev/null @@ -1,345 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/anthropic_claude-haiku-4-5-20251001/1770835969.095764", - "retrieved_timestamp": "1770835969.095764", - "source_metadata": { - "source_name": "helm_capabilities", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Claude 4.5 Haiku 20251001", - "id": "anthropic/claude-haiku-4-5-20251001", - "developer": "anthropic", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean score", - "source_data": { - "dataset_name": "helm_capabilities", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "The mean of the scores from all columns.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.717, - "details": { - "tab": "Accuracy", - "Mean score - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 7.381503096938465 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU-Pro", - "source_data": { - "dataset_name": "MMLU-Pro", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on MMLU-Pro", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.777, - "details": { - "description": "min=0.777, mean=0.777, max=0.777, sum=0.777 (1)", - "tab": "Accuracy", - "MMLU-Pro - Observed inference time (s)": { - "description": "min=3.701, mean=3.701, max=3.701, sum=3.701 (1)", - "tab": "Efficiency", - "score": 3.7008020806312563 - }, - "MMLU-Pro - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "MMLU-Pro - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - # prompt tokens": { - "description": "min=252.461, mean=252.461, max=252.461, sum=252.461 (1)", - "tab": "General information", - "score": 252.461 - }, - "MMLU-Pro - # output tokens": { - "description": "min=374.129, mean=374.129, max=374.129, sum=374.129 (1)", - "tab": "General information", - "score": 374.129 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.605, - "details": { - "description": "min=0.605, mean=0.605, max=0.605, sum=0.605 (1)", - "tab": "Accuracy", - "GPQA - Observed inference time (s)": { - "description": "min=5.102, mean=5.102, max=5.102, sum=5.102 (1)", - "tab": "Efficiency", - "score": 5.102193982611857 - }, - "GPQA - # eval": { - "description": "min=446, mean=446, max=446, sum=446 (1)", - "tab": "General information", - "score": 446.0 - }, - "GPQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - # prompt tokens": { - "description": "min=272.738, mean=272.738, max=272.738, sum=272.738 (1)", - "tab": "General information", - "score": 272.73766816143495 - }, - "GPQA - # output tokens": { - "description": "min=524.525, mean=524.525, max=524.525, sum=524.525 (1)", - "tab": "General information", - "score": 524.5246636771301 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "IFEval Strict Acc on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.801, - "details": { - "description": "min=0.801, mean=0.801, max=0.801, sum=0.801 (1)", - "tab": "Accuracy", - "IFEval - Observed inference time (s)": { - "description": "min=4.355, mean=4.355, max=4.355, sum=4.355 (1)", - "tab": "Efficiency", - "score": 4.355410516372229 - }, - "IFEval - # eval": { - "description": "min=541, mean=541, max=541, sum=541 (1)", - "tab": "General information", - "score": 541.0 - }, - "IFEval - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - # prompt tokens": { - "description": "min=47.159, mean=47.159, max=47.159, sum=47.159 (1)", - "tab": "General information", - "score": 47.15896487985213 - }, - "IFEval - # output tokens": { - "description": "min=390.416, mean=390.416, max=390.416, sum=390.416 (1)", - "tab": "General information", - "score": 390.4158964879852 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WildBench", - "source_data": { - "dataset_name": "WildBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "WB Score on WildBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.839, - "details": { - "description": "min=0.839, mean=0.839, max=0.839, sum=0.839 (1)", - "tab": "Accuracy", - "WildBench - Observed inference time (s)": { - "description": "min=16.317, mean=16.317, max=16.317, sum=16.317 (1)", - "tab": "Efficiency", - "score": 16.317131044387818 - }, - "WildBench - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "WildBench - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # prompt tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # output tokens": { - "description": "min=1835.337, mean=1835.337, max=1835.337, sum=1835.337 (1)", - "tab": "General information", - "score": 1835.337 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "v2" - } - } - }, - { - "evaluation_name": "Omni-MATH", - "source_data": { - "dataset_name": "Omni-MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Acc on Omni-MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.561, - "details": { - "description": "min=0.561, mean=0.561, max=0.561, sum=0.561 (1)", - "tab": "Accuracy", - "Omni-MATH - Observed inference time (s)": { - "description": "min=7.432, mean=7.432, max=7.432, sum=7.432 (1)", - "tab": "Efficiency", - "score": 7.431977860689163 - }, - "Omni-MATH - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": 
"General information", - "score": 1000.0 - }, - "Omni-MATH - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - # prompt tokens": { - "description": "min=110.563, mean=110.563, max=110.563, sum=110.563 (1)", - "tab": "General information", - "score": 110.563 - }, - "Omni-MATH - # output tokens": { - "description": "min=937.799, mean=937.799, max=937.799, sum=937.799 (1)", - "tab": "General information", - "score": 937.799 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_capabilities/anthropic/claude-opus-4-20250514-thinking-10k/bed1a799-77a6-40a1-9f37-d54fe9d4d055.json b/data/helm_capabilities/anthropic/claude-opus-4-20250514-thinking-10k/bed1a799-77a6-40a1-9f37-d54fe9d4d055.json deleted file mode 100644 index c554c6a65..000000000 --- a/data/helm_capabilities/anthropic/claude-opus-4-20250514-thinking-10k/bed1a799-77a6-40a1-9f37-d54fe9d4d055.json +++ /dev/null @@ -1,345 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/anthropic_claude-opus-4-20250514-thinking-10k/1770835969.095764", - "retrieved_timestamp": "1770835969.095764", - "source_metadata": { - "source_name": "helm_capabilities", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Claude 4 Opus 20250514, extended thinking", - "id": "anthropic/claude-opus-4-20250514-thinking-10k", - "developer": "anthropic", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean score", - "source_data": { - "dataset_name": "helm_capabilities", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "The mean of the scores from all columns.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.78, - "details": { - "tab": "Accuracy", - "Mean score - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 52.297304217949794 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU-Pro", - "source_data": { - "dataset_name": "MMLU-Pro", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on MMLU-Pro", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.875, - "details": { - "description": "min=0.875, mean=0.875, max=0.875, sum=0.875 (1)", - "tab": "Accuracy", - "MMLU-Pro - Observed inference time (s)": { - "description": "min=28.466, mean=28.466, max=28.466, sum=28.466 (1)", - "tab": "Efficiency", - "score": 28.46593898815197 - }, - "MMLU-Pro - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "MMLU-Pro - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - truncated": { - 
"description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - # prompt tokens": { - "description": "min=252.461, mean=252.461, max=252.461, sum=252.461 (1)", - "tab": "General information", - "score": 252.461 - }, - "MMLU-Pro - # output tokens": { - "description": "min=272.871, mean=272.871, max=272.871, sum=272.871 (1)", - "tab": "General information", - "score": 272.871 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.709, - "details": { - "description": "min=0.709, mean=0.709, max=0.709, sum=0.709 (1)", - "tab": "Accuracy", - "GPQA - Observed inference time (s)": { - "description": "min=45.529, mean=45.529, max=45.529, sum=45.529 (1)", - "tab": "Efficiency", - "score": 45.52923426562793 - }, - "GPQA - # eval": { - "description": "min=446, mean=446, max=446, sum=446 (1)", - "tab": "General information", - "score": 446.0 - }, - "GPQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - # prompt tokens": { - "description": "min=272.738, mean=272.738, max=272.738, sum=272.738 (1)", - "tab": "General information", - "score": 272.73766816143495 - }, - "GPQA - # output tokens": { - "description": "min=343.762, mean=343.762, max=343.762, sum=343.762 (1)", - "tab": "General information", - "score": 343.76233183856505 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "IFEval Strict Acc on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.849, - "details": { - "description": "min=0.849, mean=0.849, max=0.849, sum=0.849 (1)", - "tab": "Accuracy", - "IFEval - Observed inference time (s)": { - "description": "min=22.453, mean=22.453, max=22.453, sum=22.453 (1)", - "tab": "Efficiency", - "score": 22.45251508421368 - }, - "IFEval - # eval": { - "description": "min=541, mean=541, max=541, sum=541 (1)", - "tab": "General information", - "score": 541.0 - }, - "IFEval - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - # prompt tokens": { - "description": "min=47.159, mean=47.159, max=47.159, sum=47.159 (1)", - "tab": "General information", - "score": 47.15896487985213 - }, - "IFEval - # output 
tokens": { - "description": "min=403.745, mean=403.745, max=403.745, sum=403.745 (1)", - "tab": "General information", - "score": 403.74491682070243 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WildBench", - "source_data": { - "dataset_name": "WildBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "WB Score on WildBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.852, - "details": { - "description": "min=0.852, mean=0.852, max=0.852, sum=0.852 (1)", - "tab": "Accuracy", - "WildBench - Observed inference time (s)": { - "description": "min=50.19, mean=50.19, max=50.19, sum=50.19 (1)", - "tab": "Efficiency", - "score": 50.19046350765228 - }, - "WildBench - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "WildBench - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # prompt tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # output tokens": { - "description": "min=1195.769, mean=1195.769, max=1195.769, sum=1195.769 (1)", - "tab": "General information", - "score": 1195.769 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "v2" - } - } - }, - { - "evaluation_name": "Omni-MATH", - "source_data": { - "dataset_name": "Omni-MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Acc on Omni-MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.616, - "details": { - "description": "min=0.616, mean=0.616, max=0.616, sum=0.616 (1)", - "tab": "Accuracy", - "Omni-MATH - Observed inference time (s)": { - "description": "min=114.848, mean=114.848, max=114.848, sum=114.848 (1)", - "tab": "Efficiency", - "score": 114.84836924410313 - }, - "Omni-MATH - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "Omni-MATH - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - # prompt tokens": { - "description": "min=110.563, mean=110.563, max=110.563, sum=110.563 (1)", - "tab": "General information", - "score": 110.563 - }, - "Omni-MATH - # output tokens": { - "description": "min=691.066, mean=691.066, max=691.066, sum=691.066 (1)", - "tab": "General information", - "score": 691.066 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_capabilities/anthropic/claude-opus-4-20250514/6c226cad-23f1-4c09-8038-eb7b776cdee4.json 
b/data/helm_capabilities/anthropic/claude-opus-4-20250514/6c226cad-23f1-4c09-8038-eb7b776cdee4.json deleted file mode 100644 index 240e9ebf4..000000000 --- a/data/helm_capabilities/anthropic/claude-opus-4-20250514/6c226cad-23f1-4c09-8038-eb7b776cdee4.json +++ /dev/null @@ -1,345 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/anthropic_claude-opus-4-20250514/1770835969.095764", - "retrieved_timestamp": "1770835969.095764", - "source_metadata": { - "source_name": "helm_capabilities", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Claude 4 Opus 20250514", - "id": "anthropic/claude-opus-4-20250514", - "developer": "anthropic", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean score", - "source_data": { - "dataset_name": "helm_capabilities", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "The mean of the scores from all columns.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.757, - "details": { - "tab": "Accuracy", - "Mean score - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 20.48127702555515 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU-Pro", - "source_data": { - "dataset_name": "MMLU-Pro", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on MMLU-Pro", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.859, - "details": { - "description": "min=0.859, mean=0.859, max=0.859, sum=0.859 (1)", - "tab": "Accuracy", - "MMLU-Pro - Observed inference time (s)": { - "description": "min=12.63, mean=12.63, max=12.63, sum=12.63 (1)", - "tab": "Efficiency", - "score": 12.630421590518665 - }, - "MMLU-Pro - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "MMLU-Pro - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - # prompt tokens": { - "description": "min=230.461, mean=230.461, max=230.461, sum=230.461 (1)", - "tab": "General information", - "score": 230.461 - }, - "MMLU-Pro - # output tokens": { - "description": "min=344.469, mean=344.469, max=344.469, sum=344.469 (1)", - "tab": "General information", - "score": 344.469 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on GPQA", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.666, - "details": { - "description": "min=0.666, mean=0.666, max=0.666, sum=0.666 (1)", - "tab": "Accuracy", - "GPQA - Observed inference time (s)": { - "description": "min=16.325, mean=16.325, max=16.325, sum=16.325 (1)", - "tab": "Efficiency", - "score": 16.325411326249803 - }, - "GPQA - # eval": { - "description": "min=446, mean=446, max=446, sum=446 (1)", - "tab": "General information", - "score": 446.0 - }, - "GPQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - # prompt tokens": { - "description": "min=250.738, mean=250.738, max=250.738, sum=250.738 (1)", - "tab": "General information", - "score": 250.73766816143498 - }, - "GPQA - # output tokens": { - "description": "min=453.143, mean=453.143, max=453.143, sum=453.143 (1)", - "tab": "General information", - "score": 453.1434977578475 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "IFEval Strict Acc on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.918, - "details": { - "description": "min=0.918, mean=0.918, max=0.918, sum=0.918 (1)", - "tab": "Accuracy", - "IFEval - Observed inference time (s)": { - "description": "min=16.576, mean=16.576, max=16.576, sum=16.576 (1)", - "tab": "Efficiency", - "score": 16.576411149939712 - }, - "IFEval - # eval": { - "description": "min=541, mean=541, max=541, sum=541 (1)", - "tab": "General information", - "score": 541.0 - }, - "IFEval - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - # prompt tokens": { - "description": "min=47.159, mean=47.159, max=47.159, sum=47.159 (1)", - "tab": "General information", - "score": 47.15896487985213 - }, - "IFEval - # output tokens": { - "description": "min=422.774, mean=422.774, max=422.774, sum=422.774 (1)", - "tab": "General information", - "score": 422.7744916820702 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WildBench", - "source_data": { - "dataset_name": "WildBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "WB Score on WildBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.833, - "details": { - "description": "min=0.833, mean=0.833, max=0.833, sum=0.833 (1)", - "tab": "Accuracy", - "WildBench - Observed inference time (s)": { - "description": "min=29.848, mean=29.848, max=29.848, sum=29.848 (1)", - "tab": "Efficiency", - "score": 
29.848318881988526 - }, - "WildBench - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "WildBench - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # prompt tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # output tokens": { - "description": "min=936.927, mean=936.927, max=936.927, sum=936.927 (1)", - "tab": "General information", - "score": 936.927 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "v2" - } - } - }, - { - "evaluation_name": "Omni-MATH", - "source_data": { - "dataset_name": "Omni-MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Acc on Omni-MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.511, - "details": { - "description": "min=0.511, mean=0.511, max=0.511, sum=0.511 (1)", - "tab": "Accuracy", - "Omni-MATH - Observed inference time (s)": { - "description": "min=27.026, mean=27.026, max=27.026, sum=27.026 (1)", - "tab": "Efficiency", - "score": 27.025822179079057 - }, - "Omni-MATH - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "Omni-MATH - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - # prompt tokens": { - "description": "min=110.563, mean=110.563, max=110.563, sum=110.563 (1)", - "tab": "General information", - "score": 110.563 - }, - "Omni-MATH - # output tokens": { - "description": "min=893.894, mean=893.894, max=893.894, sum=893.894 (1)", - "tab": "General information", - "score": 893.894 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_capabilities/anthropic/claude-sonnet-4-20250514-thinking-10k/98887061-09d6-44ba-9cff-0267045a26ef.json b/data/helm_capabilities/anthropic/claude-sonnet-4-20250514-thinking-10k/98887061-09d6-44ba-9cff-0267045a26ef.json deleted file mode 100644 index ecc6c0f0a..000000000 --- a/data/helm_capabilities/anthropic/claude-sonnet-4-20250514-thinking-10k/98887061-09d6-44ba-9cff-0267045a26ef.json +++ /dev/null @@ -1,345 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/anthropic_claude-sonnet-4-20250514-thinking-10k/1770835969.095764", - "retrieved_timestamp": "1770835969.095764", - "source_metadata": { - "source_name": "helm_capabilities", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Claude 4 Sonnet 20250514, extended thinking", - "id": "anthropic/claude-sonnet-4-20250514-thinking-10k", - "developer": "anthropic", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean score", - "source_data": { - "dataset_name": "helm_capabilities", 
- "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "The mean of the scores from all columns.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.766, - "details": { - "tab": "Accuracy", - "Mean score - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 38.96330262736815 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU-Pro", - "source_data": { - "dataset_name": "MMLU-Pro", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on MMLU-Pro", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.843, - "details": { - "description": "min=0.843, mean=0.843, max=0.843, sum=0.843 (1)", - "tab": "Accuracy", - "MMLU-Pro - Observed inference time (s)": { - "description": "min=23.165, mean=23.165, max=23.165, sum=23.165 (1)", - "tab": "Efficiency", - "score": 23.16487550187111 - }, - "MMLU-Pro - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "MMLU-Pro - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - # prompt tokens": { - "description": "min=252.461, mean=252.461, max=252.461, sum=252.461 (1)", - "tab": "General information", - "score": 252.461 - }, - "MMLU-Pro - # output tokens": { - "description": "min=325.194, mean=325.194, max=325.194, sum=325.194 (1)", - "tab": "General information", - "score": 325.194 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.706, - "details": { - "description": "min=0.706, mean=0.706, max=0.706, sum=0.706 (1)", - "tab": "Accuracy", - "GPQA - Observed inference time (s)": { - "description": "min=38.16, mean=38.16, max=38.16, sum=38.16 (1)", - "tab": "Efficiency", - "score": 38.15993662211927 - }, - "GPQA - # eval": { - "description": "min=446, mean=446, max=446, sum=446 (1)", - "tab": "General information", - "score": 446.0 - }, - "GPQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - # prompt tokens": { - "description": "min=272.738, mean=272.738, max=272.738, sum=272.738 (1)", - "tab": "General information", - "score": 272.73766816143495 - }, 
- "GPQA - # output tokens": { - "description": "min=414.928, mean=414.928, max=414.928, sum=414.928 (1)", - "tab": "General information", - "score": 414.92825112107624 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "IFEval Strict Acc on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.84, - "details": { - "description": "min=0.84, mean=0.84, max=0.84, sum=0.84 (1)", - "tab": "Accuracy", - "IFEval - Observed inference time (s)": { - "description": "min=12.654, mean=12.654, max=12.654, sum=12.654 (1)", - "tab": "Efficiency", - "score": 12.65442304822742 - }, - "IFEval - # eval": { - "description": "min=541, mean=541, max=541, sum=541 (1)", - "tab": "General information", - "score": 541.0 - }, - "IFEval - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - # prompt tokens": { - "description": "min=47.159, mean=47.159, max=47.159, sum=47.159 (1)", - "tab": "General information", - "score": 47.15896487985213 - }, - "IFEval - # output tokens": { - "description": "min=380.645, mean=380.645, max=380.645, sum=380.645 (1)", - "tab": "General information", - "score": 380.64510166358593 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WildBench", - "source_data": { - "dataset_name": "WildBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "WB Score on WildBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.838, - "details": { - "description": "min=0.838, mean=0.838, max=0.838, sum=0.838 (1)", - "tab": "Accuracy", - "WildBench - Observed inference time (s)": { - "description": "min=32.933, mean=32.933, max=32.933, sum=32.933 (1)", - "tab": "Efficiency", - "score": 32.93274651098251 - }, - "WildBench - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "WildBench - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # prompt tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # output tokens": { - "description": "min=1274.627, mean=1274.627, max=1274.627, sum=1274.627 (1)", - "tab": "General information", - "score": 1274.627 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "v2" - } - } - }, - { - "evaluation_name": "Omni-MATH", - "source_data": { - "dataset_name": "Omni-MATH", - "source_type": 
"url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Acc on Omni-MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.602, - "details": { - "description": "min=0.602, mean=0.602, max=0.602, sum=0.602 (1)", - "tab": "Accuracy", - "Omni-MATH - Observed inference time (s)": { - "description": "min=87.905, mean=87.905, max=87.905, sum=87.905 (1)", - "tab": "Efficiency", - "score": 87.90453145364046 - }, - "Omni-MATH - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "Omni-MATH - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - # prompt tokens": { - "description": "min=110.563, mean=110.563, max=110.563, sum=110.563 (1)", - "tab": "General information", - "score": 110.563 - }, - "Omni-MATH - # output tokens": { - "description": "min=728.241, mean=728.241, max=728.241, sum=728.241 (1)", - "tab": "General information", - "score": 728.241 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_capabilities/anthropic/claude-sonnet-4-20250514/6693f0e2-3514-413d-be61-d10f7372b3dc.json b/data/helm_capabilities/anthropic/claude-sonnet-4-20250514/6693f0e2-3514-413d-be61-d10f7372b3dc.json deleted file mode 100644 index b4413ccdd..000000000 --- a/data/helm_capabilities/anthropic/claude-sonnet-4-20250514/6693f0e2-3514-413d-be61-d10f7372b3dc.json +++ /dev/null @@ -1,345 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/anthropic_claude-sonnet-4-20250514/1770835969.095764", - "retrieved_timestamp": "1770835969.095764", - "source_metadata": { - "source_name": "helm_capabilities", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Claude 4 Sonnet 20250514", - "id": "anthropic/claude-sonnet-4-20250514", - "developer": "anthropic", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean score", - "source_data": { - "dataset_name": "helm_capabilities", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "The mean of the scores from all columns.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.733, - "details": { - "tab": "Accuracy", - "Mean score - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 15.534070909101748 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU-Pro", - "source_data": { - "dataset_name": "MMLU-Pro", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on MMLU-Pro", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.843, - "details": { - "description": "min=0.843, mean=0.843, max=0.843, sum=0.843 (1)", - "tab": "Accuracy", - "MMLU-Pro - Observed inference time (s)": { - "description": "min=9.974, mean=9.974, max=9.974, sum=9.974 (1)", - "tab": "Efficiency", - "score": 9.973703570604325 - }, - "MMLU-Pro - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "MMLU-Pro - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - # prompt tokens": { - "description": "min=230.461, mean=230.461, max=230.461, sum=230.461 (1)", - "tab": "General information", - "score": 230.461 - }, - "MMLU-Pro - # output tokens": { - "description": "min=402.003, mean=402.003, max=402.003, sum=402.003 (1)", - "tab": "General information", - "score": 402.003 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.643, - "details": { - "description": "min=0.643, mean=0.643, max=0.643, sum=0.643 (1)", - "tab": "Accuracy", - "GPQA - Observed inference time (s)": { - "description": "min=13.452, mean=13.452, max=13.452, sum=13.452 (1)", - "tab": "Efficiency", - "score": 13.452103998094396 - }, - "GPQA - # eval": { - "description": "min=446, mean=446, max=446, sum=446 (1)", - "tab": "General information", - "score": 446.0 - }, - "GPQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - # prompt tokens": { - "description": "min=250.738, mean=250.738, max=250.738, sum=250.738 (1)", - "tab": "General information", - "score": 250.73766816143498 - }, - "GPQA - # output tokens": { - "description": "min=543.482, mean=543.482, max=543.482, sum=543.482 (1)", - "tab": "General information", - "score": 543.4820627802691 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "IFEval Strict Acc on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.839, - "details": { - "description": "min=0.839, mean=0.839, max=0.839, sum=0.839 (1)", - "tab": "Accuracy", - "IFEval - Observed inference time (s)": { - "description": "min=10.416, mean=10.416, max=10.416, 
sum=10.416 (1)", - "tab": "Efficiency", - "score": 10.416161362653298 - }, - "IFEval - # eval": { - "description": "min=541, mean=541, max=541, sum=541 (1)", - "tab": "General information", - "score": 541.0 - }, - "IFEval - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - # prompt tokens": { - "description": "min=47.159, mean=47.159, max=47.159, sum=47.159 (1)", - "tab": "General information", - "score": 47.15896487985213 - }, - "IFEval - # output tokens": { - "description": "min=398.978, mean=398.978, max=398.978, sum=398.978 (1)", - "tab": "General information", - "score": 398.9778188539741 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WildBench", - "source_data": { - "dataset_name": "WildBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "WB Score on WildBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.825, - "details": { - "description": "min=0.825, mean=0.825, max=0.825, sum=0.825 (1)", - "tab": "Accuracy", - "WildBench - Observed inference time (s)": { - "description": "min=23.404, mean=23.404, max=23.404, sum=23.404 (1)", - "tab": "Efficiency", - "score": 23.403768165826797 - }, - "WildBench - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "WildBench - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # prompt tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # output tokens": { - "description": "min=954.675, mean=954.675, max=954.675, sum=954.675 (1)", - "tab": "General information", - "score": 954.675 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "v2" - } - } - }, - { - "evaluation_name": "Omni-MATH", - "source_data": { - "dataset_name": "Omni-MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Acc on Omni-MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.512, - "details": { - "description": "min=0.512, mean=0.512, max=0.512, sum=0.512 (1)", - "tab": "Accuracy", - "Omni-MATH - Observed inference time (s)": { - "description": "min=20.425, mean=20.425, max=20.425, sum=20.425 (1)", - "tab": "Efficiency", - "score": 20.424617448329926 - }, - "Omni-MATH - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "Omni-MATH - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - 
"tab": "General information", - "score": 0.0 - }, - "Omni-MATH - # prompt tokens": { - "description": "min=110.563, mean=110.563, max=110.563, sum=110.563 (1)", - "tab": "General information", - "score": 110.563 - }, - "Omni-MATH - # output tokens": { - "description": "min=925.604, mean=925.604, max=925.604, sum=925.604 (1)", - "tab": "General information", - "score": 925.604 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_capabilities/anthropic/claude-sonnet-4-5-20250929/ffeaa0b2-fcdb-45dd-a6b4-06b67b9f63fe.json b/data/helm_capabilities/anthropic/claude-sonnet-4-5-20250929/ffeaa0b2-fcdb-45dd-a6b4-06b67b9f63fe.json deleted file mode 100644 index e0991c0d9..000000000 --- a/data/helm_capabilities/anthropic/claude-sonnet-4-5-20250929/ffeaa0b2-fcdb-45dd-a6b4-06b67b9f63fe.json +++ /dev/null @@ -1,345 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/anthropic_claude-sonnet-4-5-20250929/1770835969.095764", - "retrieved_timestamp": "1770835969.095764", - "source_metadata": { - "source_name": "helm_capabilities", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Claude 4.5 Sonnet 20250929", - "id": "anthropic/claude-sonnet-4-5-20250929", - "developer": "anthropic", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean score", - "source_data": { - "dataset_name": "helm_capabilities", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "The mean of the scores from all columns.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.762, - "details": { - "tab": "Accuracy", - "Mean score - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 17.536448448412127 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU-Pro", - "source_data": { - "dataset_name": "MMLU-Pro", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on MMLU-Pro", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.869, - "details": { - "description": "min=0.869, mean=0.869, max=0.869, sum=0.869 (1)", - "tab": "Accuracy", - "MMLU-Pro - Observed inference time (s)": { - "description": "min=9.03, mean=9.03, max=9.03, sum=9.03 (1)", - "tab": "Efficiency", - "score": 9.029817205530268 - }, - "MMLU-Pro - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "MMLU-Pro - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - # prompt tokens": { - "description": "min=252.461, mean=252.461, max=252.461, sum=252.461 (1)", - "tab": "General information", - "score": 252.461 - }, - "MMLU-Pro - # output tokens": { - "description": "min=392.292, 
mean=392.292, max=392.292, sum=392.292 (1)", - "tab": "General information", - "score": 392.292 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.686, - "details": { - "description": "min=0.686, mean=0.686, max=0.686, sum=0.686 (1)", - "tab": "Accuracy", - "GPQA - Observed inference time (s)": { - "description": "min=12.414, mean=12.414, max=12.414, sum=12.414 (1)", - "tab": "Efficiency", - "score": 12.414452127318263 - }, - "GPQA - # eval": { - "description": "min=446, mean=446, max=446, sum=446 (1)", - "tab": "General information", - "score": 446.0 - }, - "GPQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - # prompt tokens": { - "description": "min=272.738, mean=272.738, max=272.738, sum=272.738 (1)", - "tab": "General information", - "score": 272.73766816143495 - }, - "GPQA - # output tokens": { - "description": "min=544.215, mean=544.215, max=544.215, sum=544.215 (1)", - "tab": "General information", - "score": 544.2152466367713 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "IFEval Strict Acc on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.85, - "details": { - "description": "min=0.85, mean=0.85, max=0.85, sum=0.85 (1)", - "tab": "Accuracy", - "IFEval - Observed inference time (s)": { - "description": "min=10.904, mean=10.904, max=10.904, sum=10.904 (1)", - "tab": "Efficiency", - "score": 10.90394415211986 - }, - "IFEval - # eval": { - "description": "min=541, mean=541, max=541, sum=541 (1)", - "tab": "General information", - "score": 541.0 - }, - "IFEval - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - # prompt tokens": { - "description": "min=47.159, mean=47.159, max=47.159, sum=47.159 (1)", - "tab": "General information", - "score": 47.15896487985213 - }, - "IFEval - # output tokens": { - "description": "min=414.632, mean=414.632, max=414.632, sum=414.632 (1)", - "tab": "General information", - "score": 414.63216266173754 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WildBench", - "source_data": { - "dataset_name": "WildBench", - "source_type": "url", - "url": 
[ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "WB Score on WildBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.854, - "details": { - "description": "min=0.854, mean=0.854, max=0.854, sum=0.854 (1)", - "tab": "Accuracy", - "WildBench - Observed inference time (s)": { - "description": "min=38.544, mean=38.544, max=38.544, sum=38.544 (1)", - "tab": "Efficiency", - "score": 38.54364204096484 - }, - "WildBench - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "WildBench - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # prompt tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # output tokens": { - "description": "min=1804.604, mean=1804.604, max=1804.604, sum=1804.604 (1)", - "tab": "General information", - "score": 1804.604 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "v2" - } - } - }, - { - "evaluation_name": "Omni-MATH", - "source_data": { - "dataset_name": "Omni-MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Acc on Omni-MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.553, - "details": { - "description": "min=0.553, mean=0.553, max=0.553, sum=0.553 (1)", - "tab": "Accuracy", - "Omni-MATH - Observed inference time (s)": { - "description": "min=16.79, mean=16.79, max=16.79, sum=16.79 (1)", - "tab": "Efficiency", - "score": 16.790386716127397 - }, - "Omni-MATH - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "Omni-MATH - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - # prompt tokens": { - "description": "min=110.563, mean=110.563, max=110.563, sum=110.563 (1)", - "tab": "General information", - "score": 110.563 - }, - "Omni-MATH - # output tokens": { - "description": "min=892.774, mean=892.774, max=892.774, sum=892.774 (1)", - "tab": "General information", - "score": 892.774 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_capabilities/deepseek-ai/deepseek-r1-0528/0d9a856d-01bf-4a82-9872-33d561cf4a57.json b/data/helm_capabilities/deepseek-ai/deepseek-r1-0528/0d9a856d-01bf-4a82-9872-33d561cf4a57.json deleted file mode 100644 index 682cc94cc..000000000 --- a/data/helm_capabilities/deepseek-ai/deepseek-r1-0528/0d9a856d-01bf-4a82-9872-33d561cf4a57.json +++ /dev/null @@ -1,345 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/deepseek-ai_deepseek-r1-0528/1770835969.095764", - 
"retrieved_timestamp": "1770835969.095764", - "source_metadata": { - "source_name": "helm_capabilities", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "DeepSeek-R1-0528", - "id": "deepseek-ai/deepseek-r1-0528", - "developer": "deepseek-ai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean score", - "source_data": { - "dataset_name": "helm_capabilities", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "The mean of the scores from all columns.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.699, - "details": { - "tab": "Accuracy", - "Mean score - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 115.28182297150872 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU-Pro", - "source_data": { - "dataset_name": "MMLU-Pro", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on MMLU-Pro", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.793, - "details": { - "description": "min=0.793, mean=0.793, max=0.793, sum=0.793 (1)", - "tab": "Accuracy", - "MMLU-Pro - Observed inference time (s)": { - "description": "min=91.015, mean=91.015, max=91.015, sum=91.015 (1)", - "tab": "Efficiency", - "score": 91.01470815229416 - }, - "MMLU-Pro - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "MMLU-Pro - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - # prompt tokens": { - "description": "min=248.757, mean=248.757, max=248.757, sum=248.757 (1)", - "tab": "General information", - "score": 248.757 - }, - "MMLU-Pro - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.666, - "details": { - "description": "min=0.666, mean=0.666, max=0.666, sum=0.666 (1)", - "tab": "Accuracy", - "GPQA - Observed inference time (s)": { - "description": "min=155.439, mean=155.439, max=155.439, sum=155.439 (1)", - "tab": "Efficiency", - "score": 155.438512681311 - }, - "GPQA - # eval": { - "description": "min=446, mean=446, max=446, sum=446 
(1)", - "tab": "General information", - "score": 446.0 - }, - "GPQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - # prompt tokens": { - "description": "min=261.59, mean=261.59, max=261.59, sum=261.59 (1)", - "tab": "General information", - "score": 261.5896860986547 - }, - "GPQA - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "IFEval Strict Acc on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.784, - "details": { - "description": "min=0.784, mean=0.784, max=0.784, sum=0.784 (1)", - "tab": "Accuracy", - "IFEval - Observed inference time (s)": { - "description": "min=33.752, mean=33.752, max=33.752, sum=33.752 (1)", - "tab": "Efficiency", - "score": 33.75197721056489 - }, - "IFEval - # eval": { - "description": "min=541, mean=541, max=541, sum=541 (1)", - "tab": "General information", - "score": 541.0 - }, - "IFEval - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - # prompt tokens": { - "description": "min=46.209, mean=46.209, max=46.209, sum=46.209 (1)", - "tab": "General information", - "score": 46.208872458410355 - }, - "IFEval - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WildBench", - "source_data": { - "dataset_name": "WildBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "WB Score on WildBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.828, - "details": { - "description": "min=0.828, mean=0.828, max=0.828, sum=0.828 (1)", - "tab": "Accuracy", - "WildBench - Observed inference time (s)": { - "description": "min=87.848, mean=87.848, max=87.848, sum=87.848 (1)", - "tab": "Efficiency", - "score": 87.84843708276749 - }, - "WildBench - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "WildBench - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # prompt tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": 
"General information", - "score": 0.0 - }, - "WildBench - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "v2" - } - } - }, - { - "evaluation_name": "Omni-MATH", - "source_data": { - "dataset_name": "Omni-MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Acc on Omni-MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.424, - "details": { - "description": "min=0.424, mean=0.424, max=0.424, sum=0.424 (1)", - "tab": "Accuracy", - "Omni-MATH - Observed inference time (s)": { - "description": "min=208.355, mean=208.355, max=208.355, sum=208.355 (1)", - "tab": "Efficiency", - "score": 208.35547973060608 - }, - "Omni-MATH - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "Omni-MATH - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - # prompt tokens": { - "description": "min=107.102, mean=107.102, max=107.102, sum=107.102 (1)", - "tab": "General information", - "score": 107.102 - }, - "Omni-MATH - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_capabilities/deepseek-ai/deepseek-v3/3ff2ab7d-2c0f-4313-8223-8f514fde595a.json b/data/helm_capabilities/deepseek-ai/deepseek-v3/3ff2ab7d-2c0f-4313-8223-8f514fde595a.json deleted file mode 100644 index 3b034de70..000000000 --- a/data/helm_capabilities/deepseek-ai/deepseek-v3/3ff2ab7d-2c0f-4313-8223-8f514fde595a.json +++ /dev/null @@ -1,345 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/deepseek-ai_deepseek-v3/1770835969.095764", - "retrieved_timestamp": "1770835969.095764", - "source_metadata": { - "source_name": "helm_capabilities", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "DeepSeek v3", - "id": "deepseek-ai/deepseek-v3", - "developer": "deepseek-ai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean score", - "source_data": { - "dataset_name": "helm_capabilities", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "The mean of the scores from all columns.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.665, - "details": { - "tab": "Accuracy", - "Mean score - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 71.88858741677622 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU-Pro", - "source_data": { - "dataset_name": "MMLU-Pro", - "source_type": "url", 
- "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on MMLU-Pro", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.723, - "details": { - "description": "min=0.723, mean=0.723, max=0.723, sum=0.723 (1)", - "tab": "Accuracy", - "MMLU-Pro - Observed inference time (s)": { - "description": "min=50.311, mean=50.311, max=50.311, sum=50.311 (1)", - "tab": "Efficiency", - "score": 50.3109582388401 - }, - "MMLU-Pro - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "MMLU-Pro - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - # prompt tokens": { - "description": "min=227.757, mean=227.757, max=227.757, sum=227.757 (1)", - "tab": "General information", - "score": 227.757 - }, - "MMLU-Pro - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.538, - "details": { - "description": "min=0.538, mean=0.538, max=0.538, sum=0.538 (1)", - "tab": "Accuracy", - "GPQA - Observed inference time (s)": { - "description": "min=74.372, mean=74.372, max=74.372, sum=74.372 (1)", - "tab": "Efficiency", - "score": 74.37158904909553 - }, - "GPQA - # eval": { - "description": "min=446, mean=446, max=446, sum=446 (1)", - "tab": "General information", - "score": 446.0 - }, - "GPQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - # prompt tokens": { - "description": "min=240.59, mean=240.59, max=240.59, sum=240.59 (1)", - "tab": "General information", - "score": 240.5896860986547 - }, - "GPQA - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "IFEval Strict Acc on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 
0.832, - "details": { - "description": "min=0.832, mean=0.832, max=0.832, sum=0.832 (1)", - "tab": "Accuracy", - "IFEval - Observed inference time (s)": { - "description": "min=47.879, mean=47.879, max=47.879, sum=47.879 (1)", - "tab": "Efficiency", - "score": 47.878683835433286 - }, - "IFEval - # eval": { - "description": "min=541, mean=541, max=541, sum=541 (1)", - "tab": "General information", - "score": 541.0 - }, - "IFEval - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - # prompt tokens": { - "description": "min=46.209, mean=46.209, max=46.209, sum=46.209 (1)", - "tab": "General information", - "score": 46.208872458410355 - }, - "IFEval - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WildBench", - "source_data": { - "dataset_name": "WildBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "WB Score on WildBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.831, - "details": { - "description": "min=0.831, mean=0.831, max=0.831, sum=0.831 (1)", - "tab": "Accuracy", - "WildBench - Observed inference time (s)": { - "description": "min=134.163, mean=134.163, max=134.163, sum=134.163 (1)", - "tab": "Efficiency", - "score": 134.1626427116394 - }, - "WildBench - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "WildBench - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # prompt tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "v2" - } - } - }, - { - "evaluation_name": "Omni-MATH", - "source_data": { - "dataset_name": "Omni-MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Acc on Omni-MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.403, - "details": { - "description": "min=0.403, mean=0.403, max=0.403, sum=0.403 (1)", - "tab": "Accuracy", - "Omni-MATH - Observed inference time (s)": { - "description": "min=52.719, mean=52.719, max=52.719, sum=52.719 (1)", - "tab": "Efficiency", - "score": 52.71906324887276 - }, - "Omni-MATH - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "Omni-MATH - # train": { - "description": "min=0, mean=0, max=0, sum=0 
(1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - # prompt tokens": { - "description": "min=107.102, mean=107.102, max=107.102, sum=107.102 (1)", - "tab": "General information", - "score": 107.102 - }, - "Omni-MATH - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_capabilities/google/gemini-1.5-flash-002/2a46e8da-1996-428c-b567-cd0287b29d9f.json b/data/helm_capabilities/google/gemini-1.5-flash-002/2a46e8da-1996-428c-b567-cd0287b29d9f.json deleted file mode 100644 index 7d4281de4..000000000 --- a/data/helm_capabilities/google/gemini-1.5-flash-002/2a46e8da-1996-428c-b567-cd0287b29d9f.json +++ /dev/null @@ -1,345 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/google_gemini-1.5-flash-002/1770835969.095764", - "retrieved_timestamp": "1770835969.095764", - "source_metadata": { - "source_name": "helm_capabilities", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gemini 1.5 Flash 002", - "id": "google/gemini-1.5-flash-002", - "developer": "google", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean score", - "source_data": { - "dataset_name": "helm_capabilities", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "The mean of the scores from all columns.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.609, - "details": { - "tab": "Accuracy", - "Mean score - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 3.3804760044252675 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU-Pro", - "source_data": { - "dataset_name": "MMLU-Pro", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on MMLU-Pro", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.678, - "details": { - "description": "min=0.678, mean=0.678, max=0.678, sum=0.678 (1)", - "tab": "Accuracy", - "MMLU-Pro - Observed inference time (s)": { - "description": "min=1.799, mean=1.799, max=1.799, sum=1.799 (1)", - "tab": "Efficiency", - "score": 1.799316755771637 - }, - "MMLU-Pro - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "MMLU-Pro - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - # prompt tokens": { - "description": "min=242.673, mean=242.673, max=242.673, sum=242.673 (1)", - "tab": "General information", - "score": 242.673 - }, - 
"MMLU-Pro - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.437, - "details": { - "description": "min=0.437, mean=0.437, max=0.437, sum=0.437 (1)", - "tab": "Accuracy", - "GPQA - Observed inference time (s)": { - "description": "min=2.79, mean=2.79, max=2.79, sum=2.79 (1)", - "tab": "Efficiency", - "score": 2.7900896457278677 - }, - "GPQA - # eval": { - "description": "min=446, mean=446, max=446, sum=446 (1)", - "tab": "General information", - "score": 446.0 - }, - "GPQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - # prompt tokens": { - "description": "min=252.735, mean=252.735, max=252.735, sum=252.735 (1)", - "tab": "General information", - "score": 252.7354260089686 - }, - "GPQA - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "IFEval Strict Acc on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.831, - "details": { - "description": "min=0.831, mean=0.831, max=0.831, sum=0.831 (1)", - "tab": "Accuracy", - "IFEval - Observed inference time (s)": { - "description": "min=2.302, mean=2.302, max=2.302, sum=2.302 (1)", - "tab": "Efficiency", - "score": 2.302485716320891 - }, - "IFEval - # eval": { - "description": "min=541, mean=541, max=541, sum=541 (1)", - "tab": "General information", - "score": 541.0 - }, - "IFEval - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - # prompt tokens": { - "description": "min=47.331, mean=47.331, max=47.331, sum=47.331 (1)", - "tab": "General information", - "score": 47.33086876155268 - }, - "IFEval - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WildBench", - "source_data": { - "dataset_name": "WildBench", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "WB Score on WildBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.792, - "details": { - "description": "min=0.792, mean=0.792, max=0.792, sum=0.792 (1)", - "tab": "Accuracy", - "WildBench - Observed inference time (s)": { - "description": "min=5.328, mean=5.328, max=5.328, sum=5.328 (1)", - "tab": "Efficiency", - "score": 5.327828770410083 - }, - "WildBench - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "WildBench - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # prompt tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "v2" - } - } - }, - { - "evaluation_name": "Omni-MATH", - "source_data": { - "dataset_name": "Omni-MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Acc on Omni-MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.305, - "details": { - "description": "min=0.305, mean=0.305, max=0.305, sum=0.305 (1)", - "tab": "Accuracy", - "Omni-MATH - Observed inference time (s)": { - "description": "min=4.683, mean=4.683, max=4.683, sum=4.683 (1)", - "tab": "Efficiency", - "score": 4.682659133895859 - }, - "Omni-MATH - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "Omni-MATH - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - # prompt tokens": { - "description": "min=111.956, mean=111.956, max=111.956, sum=111.956 (1)", - "tab": "General information", - "score": 111.956 - }, - "Omni-MATH - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_capabilities/google/gemini-1.5-pro-002/30a92593-398e-4c2f-8be7-455be166aeaf.json b/data/helm_capabilities/google/gemini-1.5-pro-002/30a92593-398e-4c2f-8be7-455be166aeaf.json deleted file mode 100644 index 3c438fd59..000000000 --- a/data/helm_capabilities/google/gemini-1.5-pro-002/30a92593-398e-4c2f-8be7-455be166aeaf.json +++ /dev/null @@ -1,345 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/google_gemini-1.5-pro-002/1770835969.095764", - "retrieved_timestamp": "1770835969.095764", - "source_metadata": { - "source_name": 
"helm_capabilities", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gemini 1.5 Pro 002", - "id": "google/gemini-1.5-pro-002", - "developer": "google", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean score", - "source_data": { - "dataset_name": "helm_capabilities", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "The mean of the scores from all columns.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.657, - "details": { - "tab": "Accuracy", - "Mean score - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 9.106040294719884 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU-Pro", - "source_data": { - "dataset_name": "MMLU-Pro", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on MMLU-Pro", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.737, - "details": { - "description": "min=0.737, mean=0.737, max=0.737, sum=0.737 (1)", - "tab": "Accuracy", - "MMLU-Pro - Observed inference time (s)": { - "description": "min=5.124, mean=5.124, max=5.124, sum=5.124 (1)", - "tab": "Efficiency", - "score": 5.123855731964111 - }, - "MMLU-Pro - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "MMLU-Pro - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - # prompt tokens": { - "description": "min=242.673, mean=242.673, max=242.673, sum=242.673 (1)", - "tab": "General information", - "score": 242.673 - }, - "MMLU-Pro - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.534, - "details": { - "description": "min=0.534, mean=0.534, max=0.534, sum=0.534 (1)", - "tab": "Accuracy", - "GPQA - Observed inference time (s)": { - "description": "min=7.392, mean=7.392, max=7.392, sum=7.392 (1)", - "tab": "Efficiency", - "score": 7.392140488988081 - }, - "GPQA - # eval": { - "description": "min=446, mean=446, max=446, sum=446 (1)", - "tab": "General information", - "score": 446.0 - }, - "GPQA - # train": { - "description": 
"min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - # prompt tokens": { - "description": "min=252.735, mean=252.735, max=252.735, sum=252.735 (1)", - "tab": "General information", - "score": 252.7354260089686 - }, - "GPQA - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "IFEval Strict Acc on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.837, - "details": { - "description": "min=0.837, mean=0.837, max=0.837, sum=0.837 (1)", - "tab": "Accuracy", - "IFEval - Observed inference time (s)": { - "description": "min=6.353, mean=6.353, max=6.353, sum=6.353 (1)", - "tab": "Efficiency", - "score": 6.352943865957631 - }, - "IFEval - # eval": { - "description": "min=541, mean=541, max=541, sum=541 (1)", - "tab": "General information", - "score": 541.0 - }, - "IFEval - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - # prompt tokens": { - "description": "min=47.331, mean=47.331, max=47.331, sum=47.331 (1)", - "tab": "General information", - "score": 47.33086876155268 - }, - "IFEval - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WildBench", - "source_data": { - "dataset_name": "WildBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "WB Score on WildBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.813, - "details": { - "description": "min=0.813, mean=0.813, max=0.813, sum=0.813 (1)", - "tab": "Accuracy", - "WildBench - Observed inference time (s)": { - "description": "min=17.527, mean=17.527, max=17.527, sum=17.527 (1)", - "tab": "Efficiency", - "score": 17.52709009152358 - }, - "WildBench - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "WildBench - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # prompt tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # output tokens": { - "description": "min=0, 
mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "v2" - } - } - }, - { - "evaluation_name": "Omni-MATH", - "source_data": { - "dataset_name": "Omni-MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Acc on Omni-MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.364, - "details": { - "description": "min=0.364, mean=0.364, max=0.364, sum=0.364 (1)", - "tab": "Accuracy", - "Omni-MATH - Observed inference time (s)": { - "description": "min=9.134, mean=9.134, max=9.134, sum=9.134 (1)", - "tab": "Efficiency", - "score": 9.134171295166016 - }, - "Omni-MATH - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "Omni-MATH - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - # prompt tokens": { - "description": "min=111.956, mean=111.956, max=111.956, sum=111.956 (1)", - "tab": "General information", - "score": 111.956 - }, - "Omni-MATH - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_capabilities/google/gemini-2.0-flash-001/e6fd55e0-6ff0-48f1-8b51-5f4372edb457.json b/data/helm_capabilities/google/gemini-2.0-flash-001/e6fd55e0-6ff0-48f1-8b51-5f4372edb457.json deleted file mode 100644 index 7f589b967..000000000 --- a/data/helm_capabilities/google/gemini-2.0-flash-001/e6fd55e0-6ff0-48f1-8b51-5f4372edb457.json +++ /dev/null @@ -1,345 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/google_gemini-2.0-flash-001/1770835969.095764", - "retrieved_timestamp": "1770835969.095764", - "source_metadata": { - "source_name": "helm_capabilities", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gemini 2.0 Flash", - "id": "google/gemini-2.0-flash-001", - "developer": "google", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean score", - "source_data": { - "dataset_name": "helm_capabilities", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "The mean of the scores from all columns.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.679, - "details": { - "tab": "Accuracy", - "Mean score - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 5.700146694170831 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU-Pro", - "source_data": { - "dataset_name": "MMLU-Pro", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on MMLU-Pro", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.737, - "details": { - "description": "min=0.737, mean=0.737, max=0.737, sum=0.737 (1)", - "tab": "Accuracy", - "MMLU-Pro - Observed inference time (s)": { - "description": "min=3.221, mean=3.221, max=3.221, sum=3.221 (1)", - "tab": "Efficiency", - "score": 3.221250217437744 - }, - "MMLU-Pro - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "MMLU-Pro - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - # prompt tokens": { - "description": "min=242.673, mean=242.673, max=242.673, sum=242.673 (1)", - "tab": "General information", - "score": 242.673 - }, - "MMLU-Pro - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.556, - "details": { - "description": "min=0.556, mean=0.556, max=0.556, sum=0.556 (1)", - "tab": "Accuracy", - "GPQA - Observed inference time (s)": { - "description": "min=4.919, mean=4.919, max=4.919, sum=4.919 (1)", - "tab": "Efficiency", - "score": 4.919003446005919 - }, - "GPQA - # eval": { - "description": "min=446, mean=446, max=446, sum=446 (1)", - "tab": "General information", - "score": 446.0 - }, - "GPQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - # prompt tokens": { - "description": "min=252.735, mean=252.735, max=252.735, sum=252.735 (1)", - "tab": "General information", - "score": 252.7354260089686 - }, - "GPQA - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "IFEval Strict Acc on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.841, - 
"details": { - "description": "min=0.841, mean=0.841, max=0.841, sum=0.841 (1)", - "tab": "Accuracy", - "IFEval - Observed inference time (s)": { - "description": "min=3.723, mean=3.723, max=3.723, sum=3.723 (1)", - "tab": "Efficiency", - "score": 3.7232056717334965 - }, - "IFEval - # eval": { - "description": "min=541, mean=541, max=541, sum=541 (1)", - "tab": "General information", - "score": 541.0 - }, - "IFEval - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - # prompt tokens": { - "description": "min=47.331, mean=47.331, max=47.331, sum=47.331 (1)", - "tab": "General information", - "score": 47.33086876155268 - }, - "IFEval - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WildBench", - "source_data": { - "dataset_name": "WildBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "WB Score on WildBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8, - "details": { - "description": "min=0.8, mean=0.8, max=0.8, sum=0.8 (1)", - "tab": "Accuracy", - "WildBench - Observed inference time (s)": { - "description": "min=9.27, mean=9.27, max=9.27, sum=9.27 (1)", - "tab": "Efficiency", - "score": 9.270071518985407 - }, - "WildBench - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "WildBench - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # prompt tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "v2" - } - } - }, - { - "evaluation_name": "Omni-MATH", - "source_data": { - "dataset_name": "Omni-MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Acc on Omni-MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.459, - "details": { - "description": "min=0.459, mean=0.459, max=0.459, sum=0.459 (1)", - "tab": "Accuracy", - "Omni-MATH - Observed inference time (s)": { - "description": "min=7.367, mean=7.367, max=7.367, sum=7.367 (1)", - "tab": "Efficiency", - "score": 7.367202616691589 - }, - "Omni-MATH - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "Omni-MATH - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - 
"score": 0.0 - }, - "Omni-MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - # prompt tokens": { - "description": "min=111.956, mean=111.956, max=111.956, sum=111.956 (1)", - "tab": "General information", - "score": 111.956 - }, - "Omni-MATH - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_capabilities/google/gemini-2.0-flash-lite-preview-02-05/dfc2717d-ead8-4287-885e-5e0fc09c35e3.json b/data/helm_capabilities/google/gemini-2.0-flash-lite-preview-02-05/dfc2717d-ead8-4287-885e-5e0fc09c35e3.json deleted file mode 100644 index 0376cdf40..000000000 --- a/data/helm_capabilities/google/gemini-2.0-flash-lite-preview-02-05/dfc2717d-ead8-4287-885e-5e0fc09c35e3.json +++ /dev/null @@ -1,345 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/google_gemini-2.0-flash-lite-preview-02-05/1770835969.095764", - "retrieved_timestamp": "1770835969.095764", - "source_metadata": { - "source_name": "helm_capabilities", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gemini 2.0 Flash Lite 02-05 preview", - "id": "google/gemini-2.0-flash-lite-preview-02-05", - "developer": "google", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean score", - "source_data": { - "dataset_name": "helm_capabilities", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "The mean of the scores from all columns.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.642, - "details": { - "tab": "Accuracy", - "Mean score - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 5.788722673180064 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU-Pro", - "source_data": { - "dataset_name": "MMLU-Pro", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on MMLU-Pro", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.72, - "details": { - "description": "min=0.72, mean=0.72, max=0.72, sum=0.72 (1)", - "tab": "Accuracy", - "MMLU-Pro - Observed inference time (s)": { - "description": "min=3.357, mean=3.357, max=3.357, sum=3.357 (1)", - "tab": "Efficiency", - "score": 3.356641344547272 - }, - "MMLU-Pro - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "MMLU-Pro - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - # prompt tokens": { - "description": "min=242.673, mean=242.673, max=242.673, sum=242.673 (1)", - "tab": "General 
information", - "score": 242.673 - }, - "MMLU-Pro - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5, - "details": { - "description": "min=0.5, mean=0.5, max=0.5, sum=0.5 (1)", - "tab": "Accuracy", - "GPQA - Observed inference time (s)": { - "description": "min=5.373, mean=5.373, max=5.373, sum=5.373 (1)", - "tab": "Efficiency", - "score": 5.372664878186623 - }, - "GPQA - # eval": { - "description": "min=446, mean=446, max=446, sum=446 (1)", - "tab": "General information", - "score": 446.0 - }, - "GPQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - # prompt tokens": { - "description": "min=252.735, mean=252.735, max=252.735, sum=252.735 (1)", - "tab": "General information", - "score": 252.7354260089686 - }, - "GPQA - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "IFEval Strict Acc on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.824, - "details": { - "description": "min=0.824, mean=0.824, max=0.824, sum=0.824 (1)", - "tab": "Accuracy", - "IFEval - Observed inference time (s)": { - "description": "min=3.463, mean=3.463, max=3.463, sum=3.463 (1)", - "tab": "Efficiency", - "score": 3.4628667553780037 - }, - "IFEval - # eval": { - "description": "min=541, mean=541, max=541, sum=541 (1)", - "tab": "General information", - "score": 541.0 - }, - "IFEval - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - # prompt tokens": { - "description": "min=47.331, mean=47.331, max=47.331, sum=47.331 (1)", - "tab": "General information", - "score": 47.33086876155268 - }, - "IFEval - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WildBench", - "source_data": { - "dataset_name": "WildBench", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "WB Score on WildBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.79, - "details": { - "description": "min=0.79, mean=0.79, max=0.79, sum=0.79 (1)", - "tab": "Accuracy", - "WildBench - Observed inference time (s)": { - "description": "min=8.804, mean=8.804, max=8.804, sum=8.804 (1)", - "tab": "Efficiency", - "score": 8.803904922309524 - }, - "WildBench - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "WildBench - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # prompt tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "v2" - } - } - }, - { - "evaluation_name": "Omni-MATH", - "source_data": { - "dataset_name": "Omni-MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Acc on Omni-MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.374, - "details": { - "description": "min=0.374, mean=0.374, max=0.374, sum=0.374 (1)", - "tab": "Accuracy", - "Omni-MATH - Observed inference time (s)": { - "description": "min=7.948, mean=7.948, max=7.948, sum=7.948 (1)", - "tab": "Efficiency", - "score": 7.947535465478897 - }, - "Omni-MATH - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "Omni-MATH - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - # prompt tokens": { - "description": "min=111.956, mean=111.956, max=111.956, sum=111.956 (1)", - "tab": "General information", - "score": 111.956 - }, - "Omni-MATH - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_capabilities/google/gemini-2.5-flash-lite/e97292eb-7031-4a3a-a415-44c137898e3f.json b/data/helm_capabilities/google/gemini-2.5-flash-lite/e97292eb-7031-4a3a-a415-44c137898e3f.json deleted file mode 100644 index 600681fbb..000000000 --- a/data/helm_capabilities/google/gemini-2.5-flash-lite/e97292eb-7031-4a3a-a415-44c137898e3f.json +++ /dev/null @@ -1,345 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/google_gemini-2.5-flash-lite/1770835969.095764", - "retrieved_timestamp": "1770835969.095764", - "source_metadata": { - "source_name": 
"helm_capabilities", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gemini 2.5 Flash-Lite", - "id": "google/gemini-2.5-flash-lite", - "developer": "google", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean score", - "source_data": { - "dataset_name": "helm_capabilities", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "The mean of the scores from all columns.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.591, - "details": { - "tab": "Accuracy", - "Mean score - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 8.113822886648412 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU-Pro", - "source_data": { - "dataset_name": "MMLU-Pro", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on MMLU-Pro", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.537, - "details": { - "description": "min=0.537, mean=0.537, max=0.537, sum=0.537 (1)", - "tab": "Accuracy", - "MMLU-Pro - Observed inference time (s)": { - "description": "min=4.423, mean=4.423, max=4.423, sum=4.423 (1)", - "tab": "Efficiency", - "score": 4.423401823997498 - }, - "MMLU-Pro - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "MMLU-Pro - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - # prompt tokens": { - "description": "min=263.673, mean=263.673, max=263.673, sum=263.673 (1)", - "tab": "General information", - "score": 263.673 - }, - "MMLU-Pro - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.309, - "details": { - "description": "min=0.309, mean=0.309, max=0.309, sum=0.309 (1)", - "tab": "Accuracy", - "GPQA - Observed inference time (s)": { - "description": "min=11.88, mean=11.88, max=11.88, sum=11.88 (1)", - "tab": "Efficiency", - "score": 11.880136902022254 - }, - "GPQA - # eval": { - "description": "min=446, mean=446, max=446, sum=446 (1)", - "tab": "General information", - "score": 446.0 - }, - "GPQA - # train": { - 
"description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - # prompt tokens": { - "description": "min=273.735, mean=273.735, max=273.735, sum=273.735 (1)", - "tab": "General information", - "score": 273.7354260089686 - }, - "GPQA - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "IFEval Strict Acc on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.81, - "details": { - "description": "min=0.81, mean=0.81, max=0.81, sum=0.81 (1)", - "tab": "Accuracy", - "IFEval - Observed inference time (s)": { - "description": "min=1.833, mean=1.833, max=1.833, sum=1.833 (1)", - "tab": "Efficiency", - "score": 1.833447342659321 - }, - "IFEval - # eval": { - "description": "min=541, mean=541, max=541, sum=541 (1)", - "tab": "General information", - "score": 541.0 - }, - "IFEval - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - # prompt tokens": { - "description": "min=47.331, mean=47.331, max=47.331, sum=47.331 (1)", - "tab": "General information", - "score": 47.33086876155268 - }, - "IFEval - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WildBench", - "source_data": { - "dataset_name": "WildBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "WB Score on WildBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.818, - "details": { - "description": "min=0.818, mean=0.818, max=0.818, sum=0.818 (1)", - "tab": "Accuracy", - "WildBench - Observed inference time (s)": { - "description": "min=7.111, mean=7.111, max=7.111, sum=7.111 (1)", - "tab": "Efficiency", - "score": 7.111379201173782 - }, - "WildBench - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "WildBench - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # prompt tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # output tokens": { - "description": 
"min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "v2" - } - } - }, - { - "evaluation_name": "Omni-MATH", - "source_data": { - "dataset_name": "Omni-MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Acc on Omni-MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.48, - "details": { - "description": "min=0.48, mean=0.48, max=0.48, sum=0.48 (1)", - "tab": "Accuracy", - "Omni-MATH - Observed inference time (s)": { - "description": "min=15.321, mean=15.321, max=15.321, sum=15.321 (1)", - "tab": "Efficiency", - "score": 15.320749163389205 - }, - "Omni-MATH - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "Omni-MATH - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - # prompt tokens": { - "description": "min=111.956, mean=111.956, max=111.956, sum=111.956 (1)", - "tab": "General information", - "score": 111.956 - }, - "Omni-MATH - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_capabilities/google/gemini-2.5-flash-preview-04-17/4263a6be-9640-40a1-8881-768624949d47.json b/data/helm_capabilities/google/gemini-2.5-flash-preview-04-17/4263a6be-9640-40a1-8881-768624949d47.json deleted file mode 100644 index 221dc7a91..000000000 --- a/data/helm_capabilities/google/gemini-2.5-flash-preview-04-17/4263a6be-9640-40a1-8881-768624949d47.json +++ /dev/null @@ -1,345 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/google_gemini-2.5-flash-preview-04-17/1770835969.095764", - "retrieved_timestamp": "1770835969.095764", - "source_metadata": { - "source_name": "helm_capabilities", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gemini 2.5 Flash 04-17 preview", - "id": "google/gemini-2.5-flash-preview-04-17", - "developer": "google", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean score", - "source_data": { - "dataset_name": "helm_capabilities", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "The mean of the scores from all columns.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.626, - "details": { - "tab": "Accuracy", - "Mean score - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 31.900818991762513 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU-Pro", - "source_data": { - "dataset_name": "MMLU-Pro", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on MMLU-Pro", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.639, - "details": { - "description": "min=0.639, mean=0.639, max=0.639, sum=0.639 (1)", - "tab": "Accuracy", - "MMLU-Pro - Observed inference time (s)": { - "description": "min=17.353, mean=17.353, max=17.353, sum=17.353 (1)", - "tab": "Efficiency", - "score": 17.352934203863143 - }, - "MMLU-Pro - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "MMLU-Pro - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - # prompt tokens": { - "description": "min=263.673, mean=263.673, max=263.673, sum=263.673 (1)", - "tab": "General information", - "score": 263.673 - }, - "MMLU-Pro - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.39, - "details": { - "description": "min=0.39, mean=0.39, max=0.39, sum=0.39 (1)", - "tab": "Accuracy", - "GPQA - Observed inference time (s)": { - "description": "min=38.125, mean=38.125, max=38.125, sum=38.125 (1)", - "tab": "Efficiency", - "score": 38.125050564562336 - }, - "GPQA - # eval": { - "description": "min=446, mean=446, max=446, sum=446 (1)", - "tab": "General information", - "score": 446.0 - }, - "GPQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - # prompt tokens": { - "description": "min=273.735, mean=273.735, max=273.735, sum=273.735 (1)", - "tab": "General information", - "score": 273.7354260089686 - }, - "GPQA - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "IFEval Strict Acc on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.898, - 
"details": { - "description": "min=0.898, mean=0.898, max=0.898, sum=0.898 (1)", - "tab": "Accuracy", - "IFEval - Observed inference time (s)": { - "description": "min=11.266, mean=11.266, max=11.266, sum=11.266 (1)", - "tab": "Efficiency", - "score": 11.266106982142837 - }, - "IFEval - # eval": { - "description": "min=541, mean=541, max=541, sum=541 (1)", - "tab": "General information", - "score": 541.0 - }, - "IFEval - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - # prompt tokens": { - "description": "min=47.331, mean=47.331, max=47.331, sum=47.331 (1)", - "tab": "General information", - "score": 47.33086876155268 - }, - "IFEval - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WildBench", - "source_data": { - "dataset_name": "WildBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "WB Score on WildBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.817, - "details": { - "description": "min=0.817, mean=0.817, max=0.817, sum=0.817 (1)", - "tab": "Accuracy", - "WildBench - Observed inference time (s)": { - "description": "min=32.789, mean=32.789, max=32.789, sum=32.789 (1)", - "tab": "Efficiency", - "score": 32.78856403473391 - }, - "WildBench - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "WildBench - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # prompt tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "v2" - } - } - }, - { - "evaluation_name": "Omni-MATH", - "source_data": { - "dataset_name": "Omni-MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Acc on Omni-MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.384, - "details": { - "description": "min=0.384, mean=0.384, max=0.384, sum=0.384 (1)", - "tab": "Accuracy", - "Omni-MATH - Observed inference time (s)": { - "description": "min=59.971, mean=59.971, max=59.971, sum=59.971 (1)", - "tab": "Efficiency", - "score": 59.97143917351036 - }, - "Omni-MATH - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "Omni-MATH - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - 
"tab": "General information", - "score": 0.0 - }, - "Omni-MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - # prompt tokens": { - "description": "min=111.956, mean=111.956, max=111.956, sum=111.956 (1)", - "tab": "General information", - "score": 111.956 - }, - "Omni-MATH - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_capabilities/google/gemini-2.5-pro-preview-03-25/a808cecf-8925-428f-99ea-b6c2f8bce96e.json b/data/helm_capabilities/google/gemini-2.5-pro-preview-03-25/a808cecf-8925-428f-99ea-b6c2f8bce96e.json deleted file mode 100644 index 355cd3bc1..000000000 --- a/data/helm_capabilities/google/gemini-2.5-pro-preview-03-25/a808cecf-8925-428f-99ea-b6c2f8bce96e.json +++ /dev/null @@ -1,345 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/google_gemini-2.5-pro-preview-03-25/1770835969.095764", - "retrieved_timestamp": "1770835969.095764", - "source_metadata": { - "source_name": "helm_capabilities", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gemini 2.5 Pro 03-25 preview", - "id": "google/gemini-2.5-pro-preview-03-25", - "developer": "google", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean score", - "source_data": { - "dataset_name": "helm_capabilities", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "The mean of the scores from all columns.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.745, - "details": { - "tab": "Accuracy", - "Mean score - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 41.707859761088116 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU-Pro", - "source_data": { - "dataset_name": "MMLU-Pro", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on MMLU-Pro", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.863, - "details": { - "description": "min=0.863, mean=0.863, max=0.863, sum=0.863 (1)", - "tab": "Accuracy", - "MMLU-Pro - Observed inference time (s)": { - "description": "min=22.301, mean=22.301, max=22.301, sum=22.301 (1)", - "tab": "Efficiency", - "score": 22.301176882605677 - }, - "MMLU-Pro - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "MMLU-Pro - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - # prompt tokens": { - "description": "min=263.673, mean=263.673, max=263.673, sum=263.673 (1)", - "tab": "General 
information", - "score": 263.673 - }, - "MMLU-Pro - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.749, - "details": { - "description": "min=0.749, mean=0.749, max=0.749, sum=0.749 (1)", - "tab": "Accuracy", - "GPQA - Observed inference time (s)": { - "description": "min=43.194, mean=43.194, max=43.194, sum=43.194 (1)", - "tab": "Efficiency", - "score": 43.19425330858552 - }, - "GPQA - # eval": { - "description": "min=446, mean=446, max=446, sum=446 (1)", - "tab": "General information", - "score": 446.0 - }, - "GPQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - # prompt tokens": { - "description": "min=273.735, mean=273.735, max=273.735, sum=273.735 (1)", - "tab": "General information", - "score": 273.7354260089686 - }, - "GPQA - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "IFEval Strict Acc on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.84, - "details": { - "description": "min=0.84, mean=0.84, max=0.84, sum=0.84 (1)", - "tab": "Accuracy", - "IFEval - Observed inference time (s)": { - "description": "min=15.978, mean=15.978, max=15.978, sum=15.978 (1)", - "tab": "Efficiency", - "score": 15.978427228116725 - }, - "IFEval - # eval": { - "description": "min=541, mean=541, max=541, sum=541 (1)", - "tab": "General information", - "score": 541.0 - }, - "IFEval - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - # prompt tokens": { - "description": "min=47.331, mean=47.331, max=47.331, sum=47.331 (1)", - "tab": "General information", - "score": 47.33086876155268 - }, - "IFEval - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WildBench", - "source_data": { - "dataset_name": "WildBench", - "source_type": "url", - "url": [ 
- "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "WB Score on WildBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.857, - "details": { - "description": "min=0.857, mean=0.857, max=0.857, sum=0.857 (1)", - "tab": "Accuracy", - "WildBench - Observed inference time (s)": { - "description": "min=41.295, mean=41.295, max=41.295, sum=41.295 (1)", - "tab": "Efficiency", - "score": 41.2954368838362 - }, - "WildBench - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "WildBench - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # prompt tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "v2" - } - } - }, - { - "evaluation_name": "Omni-MATH", - "source_data": { - "dataset_name": "Omni-MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Acc on Omni-MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.416, - "details": { - "description": "min=0.416, mean=0.416, max=0.416, sum=0.416 (1)", - "tab": "Accuracy", - "Omni-MATH - Observed inference time (s)": { - "description": "min=85.77, mean=85.77, max=85.77, sum=85.77 (1)", - "tab": "Efficiency", - "score": 85.77000450229644 - }, - "Omni-MATH - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "Omni-MATH - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - # prompt tokens": { - "description": "min=111.956, mean=111.956, max=111.956, sum=111.956 (1)", - "tab": "General information", - "score": 111.956 - }, - "Omni-MATH - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_capabilities/google/gemini-3-pro-preview/55e44a3b-1fac-4ad5-b25e-85702f33883d.json b/data/helm_capabilities/google/gemini-3-pro-preview/55e44a3b-1fac-4ad5-b25e-85702f33883d.json deleted file mode 100644 index d3ecb3ebb..000000000 --- a/data/helm_capabilities/google/gemini-3-pro-preview/55e44a3b-1fac-4ad5-b25e-85702f33883d.json +++ /dev/null @@ -1,345 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/google_gemini-3-pro-preview/1770835969.095764", - "retrieved_timestamp": "1770835969.095764", - "source_metadata": { - "source_name": 
"helm_capabilities", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gemini 3 Pro Preview", - "id": "google/gemini-3-pro-preview", - "developer": "google", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean score", - "source_data": { - "dataset_name": "helm_capabilities", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "The mean of the scores from all columns.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.799, - "details": { - "tab": "Accuracy", - "Mean score - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 50.969324812798575 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU-Pro", - "source_data": { - "dataset_name": "MMLU-Pro", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on MMLU-Pro", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.903, - "details": { - "description": "min=0.903, mean=0.903, max=0.903, sum=0.903 (1)", - "tab": "Accuracy", - "MMLU-Pro - Observed inference time (s)": { - "description": "min=34.903, mean=34.903, max=34.903, sum=34.903 (1)", - "tab": "Efficiency", - "score": 34.903078527212145 - }, - "MMLU-Pro - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "MMLU-Pro - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - # prompt tokens": { - "description": "min=263.673, mean=263.673, max=263.673, sum=263.673 (1)", - "tab": "General information", - "score": 263.673 - }, - "MMLU-Pro - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.803, - "details": { - "description": "min=0.803, mean=0.803, max=0.803, sum=0.803 (1)", - "tab": "Accuracy", - "GPQA - Observed inference time (s)": { - "description": "min=69.164, mean=69.164, max=69.164, sum=69.164 (1)", - "tab": "Efficiency", - "score": 69.16407415364355 - }, - "GPQA - # eval": { - "description": "min=446, mean=446, max=446, sum=446 (1)", - "tab": "General information", - "score": 446.0 - }, - "GPQA - # train": { - 
"description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - # prompt tokens": { - "description": "min=273.735, mean=273.735, max=273.735, sum=273.735 (1)", - "tab": "General information", - "score": 273.7354260089686 - }, - "GPQA - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "IFEval Strict Acc on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.876, - "details": { - "description": "min=0.876, mean=0.876, max=0.876, sum=0.876 (1)", - "tab": "Accuracy", - "IFEval - Observed inference time (s)": { - "description": "min=18.201, mean=18.201, max=18.201, sum=18.201 (1)", - "tab": "Efficiency", - "score": 18.200553727458452 - }, - "IFEval - # eval": { - "description": "min=541, mean=541, max=541, sum=541 (1)", - "tab": "General information", - "score": 541.0 - }, - "IFEval - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - # prompt tokens": { - "description": "min=47.331, mean=47.331, max=47.331, sum=47.331 (1)", - "tab": "General information", - "score": 47.33086876155268 - }, - "IFEval - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WildBench", - "source_data": { - "dataset_name": "WildBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "WB Score on WildBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.859, - "details": { - "description": "min=0.859, mean=0.859, max=0.859, sum=0.859 (1)", - "tab": "Accuracy", - "WildBench - Observed inference time (s)": { - "description": "min=37.094, mean=37.094, max=37.094, sum=37.094 (1)", - "tab": "Efficiency", - "score": 37.09404513451669 - }, - "WildBench - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "WildBench - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # prompt tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # output tokens": { - 
"description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "v2" - } - } - }, - { - "evaluation_name": "Omni-MATH", - "source_data": { - "dataset_name": "Omni-MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Acc on Omni-MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.555, - "details": { - "description": "min=0.555, mean=0.555, max=0.555, sum=0.555 (1)", - "tab": "Accuracy", - "Omni-MATH - Observed inference time (s)": { - "description": "min=95.485, mean=95.485, max=95.485, sum=95.485 (1)", - "tab": "Efficiency", - "score": 95.48487252116203 - }, - "Omni-MATH - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "Omni-MATH - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - # prompt tokens": { - "description": "min=111.956, mean=111.956, max=111.956, sum=111.956 (1)", - "tab": "General information", - "score": 111.956 - }, - "Omni-MATH - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_capabilities/ibm/granite-3.3-8b-instruct/5b5b339b-7631-4b77-ac51-df49d3e946eb.json b/data/helm_capabilities/ibm/granite-3.3-8b-instruct/5b5b339b-7631-4b77-ac51-df49d3e946eb.json deleted file mode 100644 index 869902b9d..000000000 --- a/data/helm_capabilities/ibm/granite-3.3-8b-instruct/5b5b339b-7631-4b77-ac51-df49d3e946eb.json +++ /dev/null @@ -1,345 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/ibm_granite-3.3-8b-instruct/1770835969.095764", - "retrieved_timestamp": "1770835969.095764", - "source_metadata": { - "source_name": "helm_capabilities", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "IBM Granite 3.3 8B Instruct", - "id": "ibm/granite-3.3-8b-instruct", - "developer": "ibm", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean score", - "source_data": { - "dataset_name": "helm_capabilities", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "The mean of the scores from all columns.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.463, - "details": { - "tab": "Accuracy", - "Mean score - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 9.029614260338473 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU-Pro", - "source_data": { - "dataset_name": "MMLU-Pro", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on MMLU-Pro", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.343, - "details": { - "description": "min=0.343, mean=0.343, max=0.343, sum=0.343 (1)", - "tab": "Accuracy", - "MMLU-Pro - Observed inference time (s)": { - "description": "min=5.079, mean=5.079, max=5.079, sum=5.079 (1)", - "tab": "Efficiency", - "score": 5.079014162302017 - }, - "MMLU-Pro - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "MMLU-Pro - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - # prompt tokens": { - "description": "min=266.391, mean=266.391, max=266.391, sum=266.391 (1)", - "tab": "General information", - "score": 266.391 - }, - "MMLU-Pro - # output tokens": { - "description": "min=364.376, mean=364.376, max=364.376, sum=364.376 (1)", - "tab": "General information", - "score": 364.376 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.325, - "details": { - "description": "min=0.325, mean=0.325, max=0.325, sum=0.325 (1)", - "tab": "Accuracy", - "GPQA - Observed inference time (s)": { - "description": "min=6.422, mean=6.422, max=6.422, sum=6.422 (1)", - "tab": "Efficiency", - "score": 6.421983559569971 - }, - "GPQA - # eval": { - "description": "min=446, mean=446, max=446, sum=446 (1)", - "tab": "General information", - "score": 446.0 - }, - "GPQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - # prompt tokens": { - "description": "min=281.265, mean=281.265, max=281.265, sum=281.265 (1)", - "tab": "General information", - "score": 281.2645739910314 - }, - "GPQA - # output tokens": { - "description": "min=465.336, mean=465.336, max=465.336, sum=465.336 (1)", - "tab": "General information", - "score": 465.33632286995515 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "IFEval Strict Acc on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.729, - "details": { - "description": "min=0.729, mean=0.729, max=0.729, sum=0.729 (1)", - "tab": "Accuracy", - "IFEval - Observed inference time (s)": { - "description": "min=6.574, mean=6.574, max=6.574, sum=6.574 (1)", - "tab": "Efficiency", - "score": 6.573940407546743 - }, - "IFEval - # eval": { - "description": "min=541, mean=541, max=541, sum=541 (1)", - "tab": "General information", - "score": 541.0 - }, - "IFEval - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - # prompt tokens": { - "description": "min=51.534, mean=51.534, max=51.534, sum=51.534 (1)", - "tab": "General information", - "score": 51.53419593345656 - }, - "IFEval - # output tokens": { - "description": "min=482.37, mean=482.37, max=482.37, sum=482.37 (1)", - "tab": "General information", - "score": 482.36968576709796 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WildBench", - "source_data": { - "dataset_name": "WildBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "WB Score on WildBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.741, - "details": { - "description": "min=0.741, mean=0.741, max=0.741, sum=0.741 (1)", - "tab": "Accuracy", - "WildBench - Observed inference time (s)": { - "description": "min=10.962, mean=10.962, max=10.962, sum=10.962 (1)", - "tab": "Efficiency", - "score": 10.962031789541244 - }, - "WildBench - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "WildBench - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # prompt tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # output tokens": { - "description": "min=784.893, mean=784.893, max=784.893, sum=784.893 (1)", - "tab": "General information", - "score": 784.893 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "v2" - } - } - }, - { - "evaluation_name": "Omni-MATH", - "source_data": { - "dataset_name": "Omni-MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Acc on Omni-MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.176, - "details": { - "description": "min=0.176, mean=0.176, max=0.176, sum=0.176 (1)", - "tab": "Accuracy", - "Omni-MATH - Observed inference time (s)": { - "description": "min=16.111, mean=16.111, max=16.111, sum=16.111 (1)", - "tab": "Efficiency", - "score": 16.111101382732393 - }, - "Omni-MATH - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General 
information", - "score": 1000.0 - }, - "Omni-MATH - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - # prompt tokens": { - "description": "min=118.438, mean=118.438, max=118.438, sum=118.438 (1)", - "tab": "General information", - "score": 118.438 - }, - "Omni-MATH - # output tokens": { - "description": "min=1162.421, mean=1162.421, max=1162.421, sum=1162.421 (1)", - "tab": "General information", - "score": 1162.421 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_capabilities/ibm/granite-4.0-h-small/eaec6d66-6da7-4592-baca-2539240acc5d.json b/data/helm_capabilities/ibm/granite-4.0-h-small/eaec6d66-6da7-4592-baca-2539240acc5d.json deleted file mode 100644 index 03bc0f0f8..000000000 --- a/data/helm_capabilities/ibm/granite-4.0-h-small/eaec6d66-6da7-4592-baca-2539240acc5d.json +++ /dev/null @@ -1,345 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/ibm_granite-4.0-h-small/1770835969.095764", - "retrieved_timestamp": "1770835969.095764", - "source_metadata": { - "source_name": "helm_capabilities", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "IBM Granite 4.0 Small", - "id": "ibm/granite-4.0-h-small", - "developer": "ibm", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean score", - "source_data": { - "dataset_name": "helm_capabilities", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "The mean of the scores from all columns.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.575, - "details": { - "tab": "Accuracy", - "Mean score - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 21.31162992088884 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU-Pro", - "source_data": { - "dataset_name": "MMLU-Pro", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on MMLU-Pro", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.569, - "details": { - "description": "min=0.569, mean=0.569, max=0.569, sum=0.569 (1)", - "tab": "Accuracy", - "MMLU-Pro - Observed inference time (s)": { - "description": "min=12.071, mean=12.071, max=12.071, sum=12.071 (1)", - "tab": "Efficiency", - "score": 12.070928404092788 - }, - "MMLU-Pro - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "MMLU-Pro - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - # prompt tokens": { - 
"description": "min=288.391, mean=288.391, max=288.391, sum=288.391 (1)", - "tab": "General information", - "score": 288.391 - }, - "MMLU-Pro - # output tokens": { - "description": "min=372.93, mean=372.93, max=372.93, sum=372.93 (1)", - "tab": "General information", - "score": 372.93 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.383, - "details": { - "description": "min=0.383, mean=0.383, max=0.383, sum=0.383 (1)", - "tab": "Accuracy", - "GPQA - Observed inference time (s)": { - "description": "min=17.606, mean=17.606, max=17.606, sum=17.606 (1)", - "tab": "Efficiency", - "score": 17.606201725690354 - }, - "GPQA - # eval": { - "description": "min=446, mean=446, max=446, sum=446 (1)", - "tab": "General information", - "score": 446.0 - }, - "GPQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - # prompt tokens": { - "description": "min=303.265, mean=303.265, max=303.265, sum=303.265 (1)", - "tab": "General information", - "score": 303.2645739910314 - }, - "GPQA - # output tokens": { - "description": "min=439.648, mean=439.648, max=439.648, sum=439.648 (1)", - "tab": "General information", - "score": 439.6479820627803 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "IFEval Strict Acc on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.89, - "details": { - "description": "min=0.89, mean=0.89, max=0.89, sum=0.89 (1)", - "tab": "Accuracy", - "IFEval - Observed inference time (s)": { - "description": "min=13.366, mean=13.366, max=13.366, sum=13.366 (1)", - "tab": "Efficiency", - "score": 13.366226098453712 - }, - "IFEval - # eval": { - "description": "min=541, mean=541, max=541, sum=541 (1)", - "tab": "General information", - "score": 541.0 - }, - "IFEval - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - # prompt tokens": { - "description": "min=51.534, mean=51.534, max=51.534, sum=51.534 (1)", - "tab": "General information", - "score": 51.53419593345656 - }, - "IFEval - # output tokens": { - "description": "min=494.717, mean=494.717, max=494.717, sum=494.717 (1)", - "tab": "General information", - "score": 494.7171903881701 - } - 
} - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WildBench", - "source_data": { - "dataset_name": "WildBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "WB Score on WildBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.739, - "details": { - "description": "min=0.739, mean=0.739, max=0.739, sum=0.739 (1)", - "tab": "Accuracy", - "WildBench - Observed inference time (s)": { - "description": "min=30.807, mean=30.807, max=30.807, sum=30.807 (1)", - "tab": "Efficiency", - "score": 30.80672695994377 - }, - "WildBench - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "WildBench - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # prompt tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # output tokens": { - "description": "min=996.159, mean=996.159, max=996.159, sum=996.159 (1)", - "tab": "General information", - "score": 996.159 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "v2" - } - } - }, - { - "evaluation_name": "Omni-MATH", - "source_data": { - "dataset_name": "Omni-MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Acc on Omni-MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.296, - "details": { - "description": "min=0.296, mean=0.296, max=0.296, sum=0.296 (1)", - "tab": "Accuracy", - "Omni-MATH - Observed inference time (s)": { - "description": "min=32.708, mean=32.708, max=32.708, sum=32.708 (1)", - "tab": "Efficiency", - "score": 32.70806641626358 - }, - "Omni-MATH - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "Omni-MATH - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - # prompt tokens": { - "description": "min=118.438, mean=118.438, max=118.438, sum=118.438 (1)", - "tab": "General information", - "score": 118.438 - }, - "Omni-MATH - # output tokens": { - "description": "min=1020.51, mean=1020.51, max=1020.51, sum=1020.51 (1)", - "tab": "General information", - "score": 1020.51 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_capabilities/ibm/granite-4.0-micro/2db9cde5-4560-4ee4-8ffa-661dfc7db2f7.json b/data/helm_capabilities/ibm/granite-4.0-micro/2db9cde5-4560-4ee4-8ffa-661dfc7db2f7.json deleted file mode 100644 index 399dbb1e3..000000000 --- a/data/helm_capabilities/ibm/granite-4.0-micro/2db9cde5-4560-4ee4-8ffa-661dfc7db2f7.json 
+++ /dev/null @@ -1,345 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/ibm_granite-4.0-micro/1770835969.095764", - "retrieved_timestamp": "1770835969.095764", - "source_metadata": { - "source_name": "helm_capabilities", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "IBM Granite 4.0 Micro", - "id": "ibm/granite-4.0-micro", - "developer": "ibm", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean score", - "source_data": { - "dataset_name": "helm_capabilities", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "The mean of the scores from all columns.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.486, - "details": { - "tab": "Accuracy", - "Mean score - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 5.725128505637726 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU-Pro", - "source_data": { - "dataset_name": "MMLU-Pro", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on MMLU-Pro", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.395, - "details": { - "description": "min=0.395, mean=0.395, max=0.395, sum=0.395 (1)", - "tab": "Accuracy", - "MMLU-Pro - Observed inference time (s)": { - "description": "min=3.135, mean=3.135, max=3.135, sum=3.135 (1)", - "tab": "Efficiency", - "score": 3.1348352246284485 - }, - "MMLU-Pro - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "MMLU-Pro - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - # prompt tokens": { - "description": "min=288.391, mean=288.391, max=288.391, sum=288.391 (1)", - "tab": "General information", - "score": 288.391 - }, - "MMLU-Pro - # output tokens": { - "description": "min=325.255, mean=325.255, max=325.255, sum=325.255 (1)", - "tab": "General information", - "score": 325.255 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.307, - "details": { - "description": "min=0.307, mean=0.307, max=0.307, sum=0.307 (1)", - "tab": "Accuracy", - "GPQA - Observed inference time (s)": { - "description": "min=3.075, mean=3.075, 
max=3.075, sum=3.075 (1)", - "tab": "Efficiency", - "score": 3.075281912970436 - }, - "GPQA - # eval": { - "description": "min=446, mean=446, max=446, sum=446 (1)", - "tab": "General information", - "score": 446.0 - }, - "GPQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - # prompt tokens": { - "description": "min=303.265, mean=303.265, max=303.265, sum=303.265 (1)", - "tab": "General information", - "score": 303.2645739910314 - }, - "GPQA - # output tokens": { - "description": "min=337.417, mean=337.417, max=337.417, sum=337.417 (1)", - "tab": "General information", - "score": 337.4170403587444 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "IFEval Strict Acc on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.849, - "details": { - "description": "min=0.849, mean=0.849, max=0.849, sum=0.849 (1)", - "tab": "Accuracy", - "IFEval - Observed inference time (s)": { - "description": "min=4.58, mean=4.58, max=4.58, sum=4.58 (1)", - "tab": "Efficiency", - "score": 4.580414981806785 - }, - "IFEval - # eval": { - "description": "min=541, mean=541, max=541, sum=541 (1)", - "tab": "General information", - "score": 541.0 - }, - "IFEval - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - # prompt tokens": { - "description": "min=51.534, mean=51.534, max=51.534, sum=51.534 (1)", - "tab": "General information", - "score": 51.53419593345656 - }, - "IFEval - # output tokens": { - "description": "min=497.8, mean=497.8, max=497.8, sum=497.8 (1)", - "tab": "General information", - "score": 497.8003696857671 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WildBench", - "source_data": { - "dataset_name": "WildBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "WB Score on WildBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.67, - "details": { - "description": "min=0.67, mean=0.67, max=0.67, sum=0.67 (1)", - "tab": "Accuracy", - "WildBench - Observed inference time (s)": { - "description": "min=8.161, mean=8.161, max=8.161, sum=8.161 (1)", - "tab": "Efficiency", - "score": 8.160923891305924 - }, - "WildBench - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "WildBench - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - 
truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # prompt tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # output tokens": { - "description": "min=1037.706, mean=1037.706, max=1037.706, sum=1037.706 (1)", - "tab": "General information", - "score": 1037.706 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "v2" - } - } - }, - { - "evaluation_name": "Omni-MATH", - "source_data": { - "dataset_name": "Omni-MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Acc on Omni-MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.209, - "details": { - "description": "min=0.209, mean=0.209, max=0.209, sum=0.209 (1)", - "tab": "Accuracy", - "Omni-MATH - Observed inference time (s)": { - "description": "min=9.674, mean=9.674, max=9.674, sum=9.674 (1)", - "tab": "Efficiency", - "score": 9.674186517477036 - }, - "Omni-MATH - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "Omni-MATH - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - # prompt tokens": { - "description": "min=118.438, mean=118.438, max=118.438, sum=118.438 (1)", - "tab": "General information", - "score": 118.438 - }, - "Omni-MATH - # output tokens": { - "description": "min=1145.889, mean=1145.889, max=1145.889, sum=1145.889 (1)", - "tab": "General information", - "score": 1145.889 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_capabilities/marin-community/marin-8b-instruct/eccf77c4-6a65-40b9-9445-dd35dee7f7b8.json b/data/helm_capabilities/marin-community/marin-8b-instruct/eccf77c4-6a65-40b9-9445-dd35dee7f7b8.json deleted file mode 100644 index 736686c13..000000000 --- a/data/helm_capabilities/marin-community/marin-8b-instruct/eccf77c4-6a65-40b9-9445-dd35dee7f7b8.json +++ /dev/null @@ -1,352 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/marin-community_marin-8b-instruct/1770835969.095764", - "retrieved_timestamp": "1770835969.095764", - "source_metadata": { - "source_name": "helm_capabilities", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Marin 8B Instruct", - "id": "marin-community/marin-8b-instruct", - "developer": "marin-community", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean score", - "source_data": { - "dataset_name": "helm_capabilities", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "The mean of the scores from all columns.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 
0.325, - "details": { - "tab": "Accuracy", - "Mean score - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 118.55196213968559 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU-Pro", - "source_data": { - "dataset_name": "MMLU-Pro", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on MMLU-Pro", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.188, - "details": { - "description": "min=0.188, mean=0.188, max=0.188, sum=0.188 (1)", - "tab": "Accuracy", - "MMLU-Pro - Observed inference time (s)": { - "description": "min=94.096, mean=94.096, max=94.096, sum=94.096 (1)", - "tab": "Efficiency", - "score": 94.0957455046177 - }, - "MMLU-Pro - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "MMLU-Pro - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - # prompt tokens": { - "description": "min=228.366, mean=228.366, max=228.366, sum=228.366 (1)", - "tab": "General information", - "score": 228.366 - }, - "MMLU-Pro - # output tokens": { - "description": "min=539.21, mean=539.21, max=539.21, sum=539.21 (1)", - "tab": "General information", - "score": 539.21 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false", - "num_output_tokens": "2048" - } - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.168, - "details": { - "description": "min=0.168, mean=0.168, max=0.168, sum=0.168 (1)", - "tab": "Accuracy", - "GPQA - Observed inference time (s)": { - "description": "min=123.019, mean=123.019, max=123.019, sum=123.019 (1)", - "tab": "Efficiency", - "score": 123.0189983149815 - }, - "GPQA - # eval": { - "description": "min=446, mean=446, max=446, sum=446 (1)", - "tab": "General information", - "score": 446.0 - }, - "GPQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - truncated": { - "description": "min=0.002, mean=0.002, max=0.002, sum=0.002 (1)", - "tab": "General information", - "score": 0.002242152466367713 - }, - "GPQA - # prompt tokens": { - "description": "min=247.173, mean=247.173, max=247.173, sum=247.173 (1)", - "tab": "General information", - "score": 247.1726457399103 - }, - "GPQA - # output tokens": { - "description": "min=707.953, mean=707.953, max=707.953, sum=707.953 (1)", - "tab": "General information", - "score": 707.9529147982063 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false", - 
"num_output_tokens": "2048" - } - } - }, - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "IFEval Strict Acc on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.632, - "details": { - "description": "min=0.632, mean=0.632, max=0.632, sum=0.632 (1)", - "tab": "Accuracy", - "IFEval - Observed inference time (s)": { - "description": "min=88.889, mean=88.889, max=88.889, sum=88.889 (1)", - "tab": "Efficiency", - "score": 88.88931880596606 - }, - "IFEval - # eval": { - "description": "min=541, mean=541, max=541, sum=541 (1)", - "tab": "General information", - "score": 541.0 - }, - "IFEval - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - # prompt tokens": { - "description": "min=46.024, mean=46.024, max=46.024, sum=46.024 (1)", - "tab": "General information", - "score": 46.024029574861366 - }, - "IFEval - # output tokens": { - "description": "min=516.492, mean=516.492, max=516.492, sum=516.492 (1)", - "tab": "General information", - "score": 516.4916820702402 - } - } - }, - "generation_config": { - "additional_details": { - "num_output_tokens": "2048" - } - } - }, - { - "evaluation_name": "WildBench", - "source_data": { - "dataset_name": "WildBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "WB Score on WildBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.477, - "details": { - "description": "min=0.477, mean=0.477, max=0.477, sum=0.477 (1)", - "tab": "Accuracy", - "WildBench - Observed inference time (s)": { - "description": "min=146.873, mean=146.873, max=146.873, sum=146.873 (1)", - "tab": "Efficiency", - "score": 146.8726548871994 - }, - "WildBench - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "WildBench - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # prompt tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # output tokens": { - "description": "min=818.678, mean=818.678, max=818.678, sum=818.678 (1)", - "tab": "General information", - "score": 818.678 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "v2", - "num_output_tokens": "2048" - } - } - }, - { - "evaluation_name": "Omni-MATH", - "source_data": { - "dataset_name": "Omni-MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Acc on Omni-MATH", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.16, - "details": { - "description": "min=0.16, mean=0.16, max=0.16, sum=0.16 (1)", - "tab": "Accuracy", - "Omni-MATH - Observed inference time (s)": { - "description": "min=139.883, mean=139.883, max=139.883, sum=139.883 (1)", - "tab": "Efficiency", - "score": 139.88309318566323 - }, - "Omni-MATH - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "Omni-MATH - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - truncated": { - "description": "min=0.001, mean=0.001, max=0.001, sum=0.001 (1)", - "tab": "General information", - "score": 0.001 - }, - "Omni-MATH - # prompt tokens": { - "description": "min=108.784, mean=108.784, max=108.784, sum=108.784 (1)", - "tab": "General information", - "score": 108.784 - }, - "Omni-MATH - # output tokens": { - "description": "min=808.178, mean=808.178, max=808.178, sum=808.178 (1)", - "tab": "General information", - "score": 808.178 - } - } - }, - "generation_config": { - "additional_details": { - "num_output_tokens": "2048" - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_capabilities/meta/llama-3.1-405b-instruct-turbo/75b5943a-67be-4b2f-85da-a52533edc76f.json b/data/helm_capabilities/meta/llama-3.1-405b-instruct-turbo/75b5943a-67be-4b2f-85da-a52533edc76f.json deleted file mode 100644 index 4dd5465a5..000000000 --- a/data/helm_capabilities/meta/llama-3.1-405b-instruct-turbo/75b5943a-67be-4b2f-85da-a52533edc76f.json +++ /dev/null @@ -1,345 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/meta_llama-3.1-405b-instruct-turbo/1770835969.095764", - "retrieved_timestamp": "1770835969.095764", - "source_metadata": { - "source_name": "helm_capabilities", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama 3.1 Instruct Turbo 405B", - "id": "meta/llama-3.1-405b-instruct-turbo", - "developer": "meta", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean score", - "source_data": { - "dataset_name": "helm_capabilities", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "The mean of the scores from all columns.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.618, - "details": { - "tab": "Accuracy", - "Mean score - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 9.16102940672383 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU-Pro", - "source_data": { - "dataset_name": "MMLU-Pro", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on MMLU-Pro", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.723, - "details": { - "description": "min=0.723, mean=0.723, max=0.723, sum=0.723 (1)", - "tab": 
"Accuracy", - "MMLU-Pro - Observed inference time (s)": { - "description": "min=5.795, mean=5.795, max=5.795, sum=5.795 (1)", - "tab": "Efficiency", - "score": 5.794888144493103 - }, - "MMLU-Pro - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "MMLU-Pro - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - # prompt tokens": { - "description": "min=228.366, mean=228.366, max=228.366, sum=228.366 (1)", - "tab": "General information", - "score": 228.366 - }, - "MMLU-Pro - # output tokens": { - "description": "min=376.289, mean=376.289, max=376.289, sum=376.289 (1)", - "tab": "General information", - "score": 376.289 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.522, - "details": { - "description": "min=0.522, mean=0.522, max=0.522, sum=0.522 (1)", - "tab": "Accuracy", - "GPQA - Observed inference time (s)": { - "description": "min=9.197, mean=9.197, max=9.197, sum=9.197 (1)", - "tab": "Efficiency", - "score": 9.197324877362615 - }, - "GPQA - # eval": { - "description": "min=446, mean=446, max=446, sum=446 (1)", - "tab": "General information", - "score": 446.0 - }, - "GPQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - # prompt tokens": { - "description": "min=248.886, mean=248.886, max=248.886, sum=248.886 (1)", - "tab": "General information", - "score": 248.88565022421525 - }, - "GPQA - # output tokens": { - "description": "min=592.928, mean=592.928, max=592.928, sum=592.928 (1)", - "tab": "General information", - "score": 592.9282511210762 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "IFEval Strict Acc on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.811, - "details": { - "description": "min=0.811, mean=0.811, max=0.811, sum=0.811 (1)", - "tab": "Accuracy", - "IFEval - Observed inference time (s)": { - "description": "min=4.572, mean=4.572, max=4.572, sum=4.572 (1)", - "tab": "Efficiency", - "score": 4.571529605692724 - }, - "IFEval - # eval": { - "description": "min=541, mean=541, max=541, sum=541 (1)", - "tab": "General 
information", - "score": 541.0 - }, - "IFEval - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - # prompt tokens": { - "description": "min=46.024, mean=46.024, max=46.024, sum=46.024 (1)", - "tab": "General information", - "score": 46.024029574861366 - }, - "IFEval - # output tokens": { - "description": "min=358.067, mean=358.067, max=358.067, sum=358.067 (1)", - "tab": "General information", - "score": 358.06654343807764 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WildBench", - "source_data": { - "dataset_name": "WildBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "WB Score on WildBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.783, - "details": { - "description": "min=0.783, mean=0.783, max=0.783, sum=0.783 (1)", - "tab": "Accuracy", - "WildBench - Observed inference time (s)": { - "description": "min=15.654, mean=15.654, max=15.654, sum=15.654 (1)", - "tab": "Efficiency", - "score": 15.653513952493668 - }, - "WildBench - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "WildBench - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # prompt tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # output tokens": { - "description": "min=773.114, mean=773.114, max=773.114, sum=773.114 (1)", - "tab": "General information", - "score": 773.114 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "v2" - } - } - }, - { - "evaluation_name": "Omni-MATH", - "source_data": { - "dataset_name": "Omni-MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Acc on Omni-MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.249, - "details": { - "description": "min=0.249, mean=0.249, max=0.249, sum=0.249 (1)", - "tab": "Accuracy", - "Omni-MATH - Observed inference time (s)": { - "description": "min=10.588, mean=10.588, max=10.588, sum=10.588 (1)", - "tab": "Efficiency", - "score": 10.587890453577042 - }, - "Omni-MATH - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "Omni-MATH - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - # prompt tokens": { - "description": "min=109.708, mean=109.708, max=109.708, sum=109.708 (1)", - "tab": "General 
information", - "score": 109.708 - }, - "Omni-MATH - # output tokens": { - "description": "min=906.902, mean=906.902, max=906.902, sum=906.902 (1)", - "tab": "General information", - "score": 906.902 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_capabilities/meta/llama-3.1-70b-instruct-turbo/8bec35b7-271a-457d-b665-9f69baa248aa.json b/data/helm_capabilities/meta/llama-3.1-70b-instruct-turbo/8bec35b7-271a-457d-b665-9f69baa248aa.json deleted file mode 100644 index 407242cbb..000000000 --- a/data/helm_capabilities/meta/llama-3.1-70b-instruct-turbo/8bec35b7-271a-457d-b665-9f69baa248aa.json +++ /dev/null @@ -1,345 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/meta_llama-3.1-70b-instruct-turbo/1770835969.095764", - "retrieved_timestamp": "1770835969.095764", - "source_metadata": { - "source_name": "helm_capabilities", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama 3.1 Instruct Turbo 70B", - "id": "meta/llama-3.1-70b-instruct-turbo", - "developer": "meta", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean score", - "source_data": { - "dataset_name": "helm_capabilities", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "The mean of the scores from all columns.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.574, - "details": { - "tab": "Accuracy", - "Mean score - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 4.2482479944372376 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU-Pro", - "source_data": { - "dataset_name": "MMLU-Pro", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on MMLU-Pro", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.653, - "details": { - "description": "min=0.653, mean=0.653, max=0.653, sum=0.653 (1)", - "tab": "Accuracy", - "MMLU-Pro - Observed inference time (s)": { - "description": "min=2.732, mean=2.732, max=2.732, sum=2.732 (1)", - "tab": "Efficiency", - "score": 2.7317132804393767 - }, - "MMLU-Pro - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "MMLU-Pro - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - # prompt tokens": { - "description": "min=228.366, mean=228.366, max=228.366, sum=228.366 (1)", - "tab": "General information", - "score": 228.366 - }, - "MMLU-Pro - # output tokens": { - "description": "min=326.226, mean=326.226, max=326.226, sum=326.226 (1)", - "tab": "General information", - "score": 326.226 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "all", - 
"use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.426, - "details": { - "description": "min=0.426, mean=0.426, max=0.426, sum=0.426 (1)", - "tab": "Accuracy", - "GPQA - Observed inference time (s)": { - "description": "min=6.095, mean=6.095, max=6.095, sum=6.095 (1)", - "tab": "Efficiency", - "score": 6.0952357684550265 - }, - "GPQA - # eval": { - "description": "min=446, mean=446, max=446, sum=446 (1)", - "tab": "General information", - "score": 446.0 - }, - "GPQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - # prompt tokens": { - "description": "min=248.886, mean=248.886, max=248.886, sum=248.886 (1)", - "tab": "General information", - "score": 248.88565022421525 - }, - "GPQA - # output tokens": { - "description": "min=491.435, mean=491.435, max=491.435, sum=491.435 (1)", - "tab": "General information", - "score": 491.43497757847535 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "IFEval Strict Acc on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.821, - "details": { - "description": "min=0.821, mean=0.821, max=0.821, sum=0.821 (1)", - "tab": "Accuracy", - "IFEval - Observed inference time (s)": { - "description": "min=2.622, mean=2.622, max=2.622, sum=2.622 (1)", - "tab": "Efficiency", - "score": 2.622214562350853 - }, - "IFEval - # eval": { - "description": "min=541, mean=541, max=541, sum=541 (1)", - "tab": "General information", - "score": 541.0 - }, - "IFEval - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - # prompt tokens": { - "description": "min=46.024, mean=46.024, max=46.024, sum=46.024 (1)", - "tab": "General information", - "score": 46.024029574861366 - }, - "IFEval - # output tokens": { - "description": "min=361.464, mean=361.464, max=361.464, sum=361.464 (1)", - "tab": "General information", - "score": 361.46395563770795 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WildBench", - "source_data": { - "dataset_name": "WildBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - 
"evaluation_description": "WB Score on WildBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.758, - "details": { - "description": "min=0.758, mean=0.758, max=0.758, sum=0.758 (1)", - "tab": "Accuracy", - "WildBench - Observed inference time (s)": { - "description": "min=4.143, mean=4.143, max=4.143, sum=4.143 (1)", - "tab": "Efficiency", - "score": 4.142627255439758 - }, - "WildBench - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "WildBench - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # prompt tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # output tokens": { - "description": "min=808.109, mean=808.109, max=808.109, sum=808.109 (1)", - "tab": "General information", - "score": 808.109 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "v2" - } - } - }, - { - "evaluation_name": "Omni-MATH", - "source_data": { - "dataset_name": "Omni-MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Acc on Omni-MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.21, - "details": { - "description": "min=0.21, mean=0.21, max=0.21, sum=0.21 (1)", - "tab": "Accuracy", - "Omni-MATH - Observed inference time (s)": { - "description": "min=5.649, mean=5.649, max=5.649, sum=5.649 (1)", - "tab": "Efficiency", - "score": 5.649449105501175 - }, - "Omni-MATH - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "Omni-MATH - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - # prompt tokens": { - "description": "min=109.708, mean=109.708, max=109.708, sum=109.708 (1)", - "tab": "General information", - "score": 109.708 - }, - "Omni-MATH - # output tokens": { - "description": "min=1321.301, mean=1321.301, max=1321.301, sum=1321.301 (1)", - "tab": "General information", - "score": 1321.301 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_capabilities/meta/llama-3.1-8b-instruct-turbo/c4e5e54c-dfdc-4f61-8572-bff7fa028a61.json b/data/helm_capabilities/meta/llama-3.1-8b-instruct-turbo/c4e5e54c-dfdc-4f61-8572-bff7fa028a61.json deleted file mode 100644 index 30524d64b..000000000 --- a/data/helm_capabilities/meta/llama-3.1-8b-instruct-turbo/c4e5e54c-dfdc-4f61-8572-bff7fa028a61.json +++ /dev/null @@ -1,345 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/meta_llama-3.1-8b-instruct-turbo/1770835969.095764", - "retrieved_timestamp": "1770835969.095764", - "source_metadata": { - "source_name": "helm_capabilities", - "source_type": "documentation", - "source_organization_name": 
"crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama 3.1 Instruct Turbo 8B", - "id": "meta/llama-3.1-8b-instruct-turbo", - "developer": "meta", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean score", - "source_data": { - "dataset_name": "helm_capabilities", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "The mean of the scores from all columns.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.444, - "details": { - "tab": "Accuracy", - "Mean score - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 3.654367387500005 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU-Pro", - "source_data": { - "dataset_name": "MMLU-Pro", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on MMLU-Pro", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.406, - "details": { - "description": "min=0.406, mean=0.406, max=0.406, sum=0.406 (1)", - "tab": "Accuracy", - "MMLU-Pro - Observed inference time (s)": { - "description": "min=2.642, mean=2.642, max=2.642, sum=2.642 (1)", - "tab": "Efficiency", - "score": 2.6422129917144774 - }, - "MMLU-Pro - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "MMLU-Pro - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - # prompt tokens": { - "description": "min=228.366, mean=228.366, max=228.366, sum=228.366 (1)", - "tab": "General information", - "score": 228.366 - }, - "MMLU-Pro - # output tokens": { - "description": "min=518.387, mean=518.387, max=518.387, sum=518.387 (1)", - "tab": "General information", - "score": 518.387 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.247, - "details": { - "description": "min=0.247, mean=0.247, max=0.247, sum=0.247 (1)", - "tab": "Accuracy", - "GPQA - Observed inference time (s)": { - "description": "min=3.28, mean=3.28, max=3.28, sum=3.28 (1)", - "tab": "Efficiency", - "score": 3.2803654104070277 - }, - "GPQA - # eval": { - "description": "min=446, mean=446, max=446, sum=446 (1)", - "tab": "General information", - "score": 446.0 - }, - "GPQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": 
"General information", - "score": 0.0 - }, - "GPQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - # prompt tokens": { - "description": "min=248.886, mean=248.886, max=248.886, sum=248.886 (1)", - "tab": "General information", - "score": 248.88565022421525 - }, - "GPQA - # output tokens": { - "description": "min=744.583, mean=744.583, max=744.583, sum=744.583 (1)", - "tab": "General information", - "score": 744.5829596412556 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "IFEval Strict Acc on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.743, - "details": { - "description": "min=0.743, mean=0.743, max=0.743, sum=0.743 (1)", - "tab": "Accuracy", - "IFEval - Observed inference time (s)": { - "description": "min=1.982, mean=1.982, max=1.982, sum=1.982 (1)", - "tab": "Efficiency", - "score": 1.981573561423367 - }, - "IFEval - # eval": { - "description": "min=541, mean=541, max=541, sum=541 (1)", - "tab": "General information", - "score": 541.0 - }, - "IFEval - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - # prompt tokens": { - "description": "min=46.024, mean=46.024, max=46.024, sum=46.024 (1)", - "tab": "General information", - "score": 46.024029574861366 - }, - "IFEval - # output tokens": { - "description": "min=404.026, mean=404.026, max=404.026, sum=404.026 (1)", - "tab": "General information", - "score": 404.02587800369685 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WildBench", - "source_data": { - "dataset_name": "WildBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "WB Score on WildBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.686, - "details": { - "description": "min=0.686, mean=0.686, max=0.686, sum=0.686 (1)", - "tab": "Accuracy", - "WildBench - Observed inference time (s)": { - "description": "min=3.192, mean=3.192, max=3.192, sum=3.192 (1)", - "tab": "Efficiency", - "score": 3.1917312424182893 - }, - "WildBench - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "WildBench - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # prompt tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # output 
tokens": { - "description": "min=865.484, mean=865.484, max=865.484, sum=865.484 (1)", - "tab": "General information", - "score": 865.484 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "v2" - } - } - }, - { - "evaluation_name": "Omni-MATH", - "source_data": { - "dataset_name": "Omni-MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Acc on Omni-MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.137, - "details": { - "description": "min=0.137, mean=0.137, max=0.137, sum=0.137 (1)", - "tab": "Accuracy", - "Omni-MATH - Observed inference time (s)": { - "description": "min=7.176, mean=7.176, max=7.176, sum=7.176 (1)", - "tab": "Efficiency", - "score": 7.1759537315368656 - }, - "Omni-MATH - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "Omni-MATH - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - # prompt tokens": { - "description": "min=109.708, mean=109.708, max=109.708, sum=109.708 (1)", - "tab": "General information", - "score": 109.708 - }, - "Omni-MATH - # output tokens": { - "description": "min=2170.057, mean=2170.057, max=2170.057, sum=2170.057 (1)", - "tab": "General information", - "score": 2170.057 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_capabilities/meta/llama-4-maverick-17b-128e-instruct-fp8/c308b0a5-4c44-4369-9b23-8664959aa927.json b/data/helm_capabilities/meta/llama-4-maverick-17b-128e-instruct-fp8/c308b0a5-4c44-4369-9b23-8664959aa927.json deleted file mode 100644 index d9ca75120..000000000 --- a/data/helm_capabilities/meta/llama-4-maverick-17b-128e-instruct-fp8/c308b0a5-4c44-4369-9b23-8664959aa927.json +++ /dev/null @@ -1,345 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/meta_llama-4-maverick-17b-128e-instruct-fp8/1770835969.095764", - "retrieved_timestamp": "1770835969.095764", - "source_metadata": { - "source_name": "helm_capabilities", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama 4 Maverick 17Bx128E Instruct FP8", - "id": "meta/llama-4-maverick-17b-128e-instruct-fp8", - "developer": "meta", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean score", - "source_data": { - "dataset_name": "helm_capabilities", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "The mean of the scores from all columns.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.718, - "details": { - "tab": "Accuracy", - "Mean score - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 8.498428393165543 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { 
- "evaluation_name": "MMLU-Pro", - "source_data": { - "dataset_name": "MMLU-Pro", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on MMLU-Pro", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.81, - "details": { - "description": "min=0.81, mean=0.81, max=0.81, sum=0.81 (1)", - "tab": "Accuracy", - "MMLU-Pro - Observed inference time (s)": { - "description": "min=6.74, mean=6.74, max=6.74, sum=6.74 (1)", - "tab": "Efficiency", - "score": 6.739848182201386 - }, - "MMLU-Pro - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "MMLU-Pro - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - # prompt tokens": { - "description": "min=225.585, mean=225.585, max=225.585, sum=225.585 (1)", - "tab": "General information", - "score": 225.585 - }, - "MMLU-Pro - # output tokens": { - "description": "min=548.208, mean=548.208, max=548.208, sum=548.208 (1)", - "tab": "General information", - "score": 548.208 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.65, - "details": { - "description": "min=0.65, mean=0.65, max=0.65, sum=0.65 (1)", - "tab": "Accuracy", - "GPQA - Observed inference time (s)": { - "description": "min=9.838, mean=9.838, max=9.838, sum=9.838 (1)", - "tab": "Efficiency", - "score": 9.838454476921013 - }, - "GPQA - # eval": { - "description": "min=446, mean=446, max=446, sum=446 (1)", - "tab": "General information", - "score": 446.0 - }, - "GPQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - # prompt tokens": { - "description": "min=236.807, mean=236.807, max=236.807, sum=236.807 (1)", - "tab": "General information", - "score": 236.8071748878924 - }, - "GPQA - # output tokens": { - "description": "min=822.336, mean=822.336, max=822.336, sum=822.336 (1)", - "tab": "General information", - "score": 822.3363228699552 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "IFEval 
Strict Acc on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.908, - "details": { - "description": "min=0.908, mean=0.908, max=0.908, sum=0.908 (1)", - "tab": "Accuracy", - "IFEval - Observed inference time (s)": { - "description": "min=3.773, mean=3.773, max=3.773, sum=3.773 (1)", - "tab": "Efficiency", - "score": 3.773326979987943 - }, - "IFEval - # eval": { - "description": "min=541, mean=541, max=541, sum=541 (1)", - "tab": "General information", - "score": 541.0 - }, - "IFEval - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - # prompt tokens": { - "description": "min=45.577, mean=45.577, max=45.577, sum=45.577 (1)", - "tab": "General information", - "score": 45.57670979667283 - }, - "IFEval - # output tokens": { - "description": "min=311.251, mean=311.251, max=311.251, sum=311.251 (1)", - "tab": "General information", - "score": 311.2513863216266 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WildBench", - "source_data": { - "dataset_name": "WildBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "WB Score on WildBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8, - "details": { - "description": "min=0.8, mean=0.8, max=0.8, sum=0.8 (1)", - "tab": "Accuracy", - "WildBench - Observed inference time (s)": { - "description": "min=10.37, mean=10.37, max=10.37, sum=10.37 (1)", - "tab": "Efficiency", - "score": 10.36993253993988 - }, - "WildBench - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "WildBench - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # prompt tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # output tokens": { - "description": "min=842.777, mean=842.777, max=842.777, sum=842.777 (1)", - "tab": "General information", - "score": 842.777 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "v2" - } - } - }, - { - "evaluation_name": "Omni-MATH", - "source_data": { - "dataset_name": "Omni-MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Acc on Omni-MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.422, - "details": { - "description": "min=0.422, mean=0.422, max=0.422, sum=0.422 (1)", - "tab": "Accuracy", - "Omni-MATH - Observed inference time (s)": { - "description": "min=11.771, mean=11.771, max=11.771, sum=11.771 (1)", - "tab": "Efficiency", - "score": 11.770579786777496 - }, - "Omni-MATH - # 
eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "Omni-MATH - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - # prompt tokens": { - "description": "min=105.286, mean=105.286, max=105.286, sum=105.286 (1)", - "tab": "General information", - "score": 105.286 - }, - "Omni-MATH - # output tokens": { - "description": "min=1055.205, mean=1055.205, max=1055.205, sum=1055.205 (1)", - "tab": "General information", - "score": 1055.205 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_capabilities/meta/llama-4-scout-17b-16e-instruct/1a1edfb2-f0f1-4930-82c0-99293ec76645.json b/data/helm_capabilities/meta/llama-4-scout-17b-16e-instruct/1a1edfb2-f0f1-4930-82c0-99293ec76645.json deleted file mode 100644 index 640472423..000000000 --- a/data/helm_capabilities/meta/llama-4-scout-17b-16e-instruct/1a1edfb2-f0f1-4930-82c0-99293ec76645.json +++ /dev/null @@ -1,345 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/meta_llama-4-scout-17b-16e-instruct/1770835969.095764", - "retrieved_timestamp": "1770835969.095764", - "source_metadata": { - "source_name": "helm_capabilities", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama 4 Scout 17Bx16E Instruct", - "id": "meta/llama-4-scout-17b-16e-instruct", - "developer": "meta", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean score", - "source_data": { - "dataset_name": "helm_capabilities", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "The mean of the scores from all columns.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.644, - "details": { - "tab": "Accuracy", - "Mean score - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 8.886502883481523 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU-Pro", - "source_data": { - "dataset_name": "MMLU-Pro", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on MMLU-Pro", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.742, - "details": { - "description": "min=0.742, mean=0.742, max=0.742, sum=0.742 (1)", - "tab": "Accuracy", - "MMLU-Pro - Observed inference time (s)": { - "description": "min=6.525, mean=6.525, max=6.525, sum=6.525 (1)", - "tab": "Efficiency", - "score": 6.524971485614777 - }, - "MMLU-Pro - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "MMLU-Pro - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - 
truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - # prompt tokens": { - "description": "min=225.585, mean=225.585, max=225.585, sum=225.585 (1)", - "tab": "General information", - "score": 225.585 - }, - "MMLU-Pro - # output tokens": { - "description": "min=550.212, mean=550.212, max=550.212, sum=550.212 (1)", - "tab": "General information", - "score": 550.212 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.507, - "details": { - "description": "min=0.507, mean=0.507, max=0.507, sum=0.507 (1)", - "tab": "Accuracy", - "GPQA - Observed inference time (s)": { - "description": "min=11.027, mean=11.027, max=11.027, sum=11.027 (1)", - "tab": "Efficiency", - "score": 11.026973943004693 - }, - "GPQA - # eval": { - "description": "min=446, mean=446, max=446, sum=446 (1)", - "tab": "General information", - "score": 446.0 - }, - "GPQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - # prompt tokens": { - "description": "min=236.807, mean=236.807, max=236.807, sum=236.807 (1)", - "tab": "General information", - "score": 236.8071748878924 - }, - "GPQA - # output tokens": { - "description": "min=856.76, mean=856.76, max=856.76, sum=856.76 (1)", - "tab": "General information", - "score": 856.7600896860987 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "IFEval Strict Acc on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.818, - "details": { - "description": "min=0.818, mean=0.818, max=0.818, sum=0.818 (1)", - "tab": "Accuracy", - "IFEval - Observed inference time (s)": { - "description": "min=4.297, mean=4.297, max=4.297, sum=4.297 (1)", - "tab": "Efficiency", - "score": 4.296513711679004 - }, - "IFEval - # eval": { - "description": "min=541, mean=541, max=541, sum=541 (1)", - "tab": "General information", - "score": 541.0 - }, - "IFEval - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - # prompt tokens": { - "description": "min=45.577, mean=45.577, max=45.577, sum=45.577 (1)", - "tab": "General information", - "score": 45.57670979667283 - }, - "IFEval - # 
output tokens": { - "description": "min=399.399, mean=399.399, max=399.399, sum=399.399 (1)", - "tab": "General information", - "score": 399.3992606284658 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WildBench", - "source_data": { - "dataset_name": "WildBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "WB Score on WildBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.779, - "details": { - "description": "min=0.779, mean=0.779, max=0.779, sum=0.779 (1)", - "tab": "Accuracy", - "WildBench - Observed inference time (s)": { - "description": "min=9.942, mean=9.942, max=9.942, sum=9.942 (1)", - "tab": "Efficiency", - "score": 9.942440722942353 - }, - "WildBench - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "WildBench - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # prompt tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # output tokens": { - "description": "min=952.636, mean=952.636, max=952.636, sum=952.636 (1)", - "tab": "General information", - "score": 952.636 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "v2" - } - } - }, - { - "evaluation_name": "Omni-MATH", - "source_data": { - "dataset_name": "Omni-MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Acc on Omni-MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.373, - "details": { - "description": "min=0.373, mean=0.373, max=0.373, sum=0.373 (1)", - "tab": "Accuracy", - "Omni-MATH - Observed inference time (s)": { - "description": "min=12.642, mean=12.642, max=12.642, sum=12.642 (1)", - "tab": "Efficiency", - "score": 12.641614554166793 - }, - "Omni-MATH - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "Omni-MATH - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - # prompt tokens": { - "description": "min=105.286, mean=105.286, max=105.286, sum=105.286 (1)", - "tab": "General information", - "score": 105.286 - }, - "Omni-MATH - # output tokens": { - "description": "min=1088.449, mean=1088.449, max=1088.449, sum=1088.449 (1)", - "tab": "General information", - "score": 1088.449 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_capabilities/mistralai/mistral-7b-instruct-v0.3/9aa5af51-8c55-4896-b634-162a9d82b58e.json 
b/data/helm_capabilities/mistralai/mistral-7b-instruct-v0.3/9aa5af51-8c55-4896-b634-162a9d82b58e.json deleted file mode 100644 index 0b19a4ab4..000000000 --- a/data/helm_capabilities/mistralai/mistral-7b-instruct-v0.3/9aa5af51-8c55-4896-b634-162a9d82b58e.json +++ /dev/null @@ -1,345 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/mistralai_mistral-7b-instruct-v0.3/1770835969.095764", - "retrieved_timestamp": "1770835969.095764", - "source_metadata": { - "source_name": "helm_capabilities", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral Instruct v0.3 7B", - "id": "mistralai/mistral-7b-instruct-v0.3", - "developer": "mistralai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean score", - "source_data": { - "dataset_name": "helm_capabilities", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "The mean of the scores from all columns.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.376, - "details": { - "tab": "Accuracy", - "Mean score - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 3.386352003847275 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU-Pro", - "source_data": { - "dataset_name": "MMLU-Pro", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on MMLU-Pro", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.277, - "details": { - "description": "min=0.277, mean=0.277, max=0.277, sum=0.277 (1)", - "tab": "Accuracy", - "MMLU-Pro - Observed inference time (s)": { - "description": "min=2.0, mean=2.0, max=2.0, sum=2.0 (1)", - "tab": "Efficiency", - "score": 1.999533802509308 - }, - "MMLU-Pro - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "MMLU-Pro - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - # prompt tokens": { - "description": "min=260.915, mean=260.915, max=260.915, sum=260.915 (1)", - "tab": "General information", - "score": 260.915 - }, - "MMLU-Pro - # output tokens": { - "description": "min=272.103, mean=272.103, max=272.103, sum=272.103 (1)", - "tab": "General information", - "score": 272.103 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on GPQA", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.303, - "details": { - "description": "min=0.303, mean=0.303, max=0.303, sum=0.303 (1)", - "tab": "Accuracy", - "GPQA - Observed inference time (s)": { - "description": "min=2.285, mean=2.285, max=2.285, sum=2.285 (1)", - "tab": "Efficiency", - "score": 2.284658104849503 - }, - "GPQA - # eval": { - "description": "min=446, mean=446, max=446, sum=446 (1)", - "tab": "General information", - "score": 446.0 - }, - "GPQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - # prompt tokens": { - "description": "min=281.998, mean=281.998, max=281.998, sum=281.998 (1)", - "tab": "General information", - "score": 281.99775784753365 - }, - "GPQA - # output tokens": { - "description": "min=387.971, mean=387.971, max=387.971, sum=387.971 (1)", - "tab": "General information", - "score": 387.9708520179372 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "IFEval Strict Acc on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.567, - "details": { - "description": "min=0.567, mean=0.567, max=0.567, sum=0.567 (1)", - "tab": "Accuracy", - "IFEval - Observed inference time (s)": { - "description": "min=2.535, mean=2.535, max=2.535, sum=2.535 (1)", - "tab": "Efficiency", - "score": 2.5349821145345013 - }, - "IFEval - # eval": { - "description": "min=541, mean=541, max=541, sum=541 (1)", - "tab": "General information", - "score": 541.0 - }, - "IFEval - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - # prompt tokens": { - "description": "min=51.309, mean=51.309, max=51.309, sum=51.309 (1)", - "tab": "General information", - "score": 51.3086876155268 - }, - "IFEval - # output tokens": { - "description": "min=449.725, mean=449.725, max=449.725, sum=449.725 (1)", - "tab": "General information", - "score": 449.72458410351203 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WildBench", - "source_data": { - "dataset_name": "WildBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "WB Score on WildBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.66, - "details": { - "description": "min=0.66, mean=0.66, max=0.66, sum=0.66 (1)", - "tab": "Accuracy", - "WildBench - Observed inference time (s)": { - "description": "min=5.901, mean=5.901, max=5.901, sum=5.901 (1)", - "tab": "Efficiency", - "score": 5.900532631635666 - 
}, - "WildBench - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "WildBench - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # prompt tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # output tokens": { - "description": "min=702.754, mean=702.754, max=702.754, sum=702.754 (1)", - "tab": "General information", - "score": 702.754 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "v2" - } - } - }, - { - "evaluation_name": "Omni-MATH", - "source_data": { - "dataset_name": "Omni-MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Acc on Omni-MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.072, - "details": { - "description": "min=0.072, mean=0.072, max=0.072, sum=0.072 (1)", - "tab": "Accuracy", - "Omni-MATH - Observed inference time (s)": { - "description": "min=4.212, mean=4.212, max=4.212, sum=4.212 (1)", - "tab": "Efficiency", - "score": 4.212053365707398 - }, - "Omni-MATH - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "Omni-MATH - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - # prompt tokens": { - "description": "min=119.373, mean=119.373, max=119.373, sum=119.373 (1)", - "tab": "General information", - "score": 119.373 - }, - "Omni-MATH - # output tokens": { - "description": "min=678.438, mean=678.438, max=678.438, sum=678.438 (1)", - "tab": "General information", - "score": 678.438 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_capabilities/mistralai/mistral-large-2411/21461a52-2f25-48c9-be19-f9233317d817.json b/data/helm_capabilities/mistralai/mistral-large-2411/21461a52-2f25-48c9-be19-f9233317d817.json deleted file mode 100644 index dec52ca8a..000000000 --- a/data/helm_capabilities/mistralai/mistral-large-2411/21461a52-2f25-48c9-be19-f9233317d817.json +++ /dev/null @@ -1,345 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/mistralai_mistral-large-2411/1770835969.095764", - "retrieved_timestamp": "1770835969.095764", - "source_metadata": { - "source_name": "helm_capabilities", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral Large 2411", - "id": "mistralai/mistral-large-2411", - "developer": "mistralai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean score", - "source_data": { - "dataset_name": "helm_capabilities", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "The mean of the scores from all columns.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.598, - "details": { - "tab": "Accuracy", - "Mean score - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 14.462006275515396 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU-Pro", - "source_data": { - "dataset_name": "MMLU-Pro", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on MMLU-Pro", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.599, - "details": { - "description": "min=0.599, mean=0.599, max=0.599, sum=0.599 (1)", - "tab": "Accuracy", - "MMLU-Pro - Observed inference time (s)": { - "description": "min=7.537, mean=7.537, max=7.537, sum=7.537 (1)", - "tab": "Efficiency", - "score": 7.537241208553314 - }, - "MMLU-Pro - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "MMLU-Pro - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - # prompt tokens": { - "description": "min=260.915, mean=260.915, max=260.915, sum=260.915 (1)", - "tab": "General information", - "score": 260.915 - }, - "MMLU-Pro - # output tokens": { - "description": "min=316.273, mean=316.273, max=316.273, sum=316.273 (1)", - "tab": "General information", - "score": 316.273 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.435, - "details": { - "description": "min=0.435, mean=0.435, max=0.435, sum=0.435 (1)", - "tab": "Accuracy", - "GPQA - Observed inference time (s)": { - "description": "min=12.217, mean=12.217, max=12.217, sum=12.217 (1)", - "tab": "Efficiency", - "score": 12.217145950270341 - }, - "GPQA - # eval": { - "description": "min=446, mean=446, max=446, sum=446 (1)", - "tab": "General information", - "score": 446.0 - }, - "GPQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - # prompt tokens": { - "description": "min=281.998, mean=281.998, max=281.998, sum=281.998 (1)", - "tab": "General information", - "score": 281.99775784753365 - }, - "GPQA - # output tokens": { - 
"description": "min=507.357, mean=507.357, max=507.357, sum=507.357 (1)", - "tab": "General information", - "score": 507.3565022421525 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "IFEval Strict Acc on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.876, - "details": { - "description": "min=0.876, mean=0.876, max=0.876, sum=0.876 (1)", - "tab": "Accuracy", - "IFEval - Observed inference time (s)": { - "description": "min=10.742, mean=10.742, max=10.742, sum=10.742 (1)", - "tab": "Efficiency", - "score": 10.741783690761066 - }, - "IFEval - # eval": { - "description": "min=541, mean=541, max=541, sum=541 (1)", - "tab": "General information", - "score": 541.0 - }, - "IFEval - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - # prompt tokens": { - "description": "min=51.36, mean=51.36, max=51.36, sum=51.36 (1)", - "tab": "General information", - "score": 51.36044362292052 - }, - "IFEval - # output tokens": { - "description": "min=409.566, mean=409.566, max=409.566, sum=409.566 (1)", - "tab": "General information", - "score": 409.5656192236599 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WildBench", - "source_data": { - "dataset_name": "WildBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "WB Score on WildBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.801, - "details": { - "description": "min=0.801, mean=0.801, max=0.801, sum=0.801 (1)", - "tab": "Accuracy", - "WildBench - Observed inference time (s)": { - "description": "min=23.603, mean=23.603, max=23.603, sum=23.603 (1)", - "tab": "Efficiency", - "score": 23.602991637706758 - }, - "WildBench - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "WildBench - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # prompt tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # output tokens": { - "description": "min=1029.086, mean=1029.086, max=1029.086, sum=1029.086 (1)", - "tab": "General information", - "score": 1029.086 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "v2" - } - } - }, - { - "evaluation_name": "Omni-MATH", - "source_data": { - "dataset_name": "Omni-MATH", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Acc on Omni-MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.281, - "details": { - "description": "min=0.281, mean=0.281, max=0.281, sum=0.281 (1)", - "tab": "Accuracy", - "Omni-MATH - Observed inference time (s)": { - "description": "min=18.211, mean=18.211, max=18.211, sum=18.211 (1)", - "tab": "Efficiency", - "score": 18.210868890285493 - }, - "Omni-MATH - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "Omni-MATH - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - # prompt tokens": { - "description": "min=120.373, mean=120.373, max=120.373, sum=120.373 (1)", - "tab": "General information", - "score": 120.373 - }, - "Omni-MATH - # output tokens": { - "description": "min=727.801, mean=727.801, max=727.801, sum=727.801 (1)", - "tab": "General information", - "score": 727.801 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_capabilities/mistralai/mistral-small-2503/bdea0967-fcc7-493c-a18d-70727842deb9.json b/data/helm_capabilities/mistralai/mistral-small-2503/bdea0967-fcc7-493c-a18d-70727842deb9.json deleted file mode 100644 index 7999b823d..000000000 --- a/data/helm_capabilities/mistralai/mistral-small-2503/bdea0967-fcc7-493c-a18d-70727842deb9.json +++ /dev/null @@ -1,345 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/mistralai_mistral-small-2503/1770835969.095764", - "retrieved_timestamp": "1770835969.095764", - "source_metadata": { - "source_name": "helm_capabilities", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral Small 3.1 2503", - "id": "mistralai/mistral-small-2503", - "developer": "mistralai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean score", - "source_data": { - "dataset_name": "helm_capabilities", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "The mean of the scores from all columns.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.558, - "details": { - "tab": "Accuracy", - "Mean score - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 11.791458985991488 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU-Pro", - "source_data": { - "dataset_name": "MMLU-Pro", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on MMLU-Pro", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.61, - "details": { - "description": "min=0.61, mean=0.61, max=0.61, sum=0.61 (1)", - "tab": "Accuracy", - "MMLU-Pro - Observed inference time (s)": { - "description": "min=3.589, mean=3.589, max=3.589, sum=3.589 (1)", - "tab": "Efficiency", - "score": 3.588683393239975 - }, - "MMLU-Pro - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "MMLU-Pro - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - # prompt tokens": { - "description": "min=239.367, mean=239.367, max=239.367, sum=239.367 (1)", - "tab": "General information", - "score": 239.367 - }, - "MMLU-Pro - # output tokens": { - "description": "min=365.903, mean=365.903, max=365.903, sum=365.903 (1)", - "tab": "General information", - "score": 365.903 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.392, - "details": { - "description": "min=0.392, mean=0.392, max=0.392, sum=0.392 (1)", - "tab": "Accuracy", - "GPQA - Observed inference time (s)": { - "description": "min=5.05, mean=5.05, max=5.05, sum=5.05 (1)", - "tab": "Efficiency", - "score": 5.049520614435854 - }, - "GPQA - # eval": { - "description": "min=446, mean=446, max=446, sum=446 (1)", - "tab": "General information", - "score": 446.0 - }, - "GPQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - # prompt tokens": { - "description": "min=246.54, mean=246.54, max=246.54, sum=246.54 (1)", - "tab": "General information", - "score": 246.5403587443946 - }, - "GPQA - # output tokens": { - "description": "min=492.534, mean=492.534, max=492.534, sum=492.534 (1)", - "tab": "General information", - "score": 492.5336322869955 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "IFEval Strict Acc on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.75, - "details": { - "description": "min=0.75, mean=0.75, max=0.75, sum=0.75 (1)", - "tab": "Accuracy", - "IFEval - Observed inference time (s)": { - "description": "min=3.838, mean=3.838, max=3.838, sum=3.838 (1)", - "tab": "Efficiency", - "score": 3.837722122118345 - }, - 
"IFEval - # eval": { - "description": "min=541, mean=541, max=541, sum=541 (1)", - "tab": "General information", - "score": 541.0 - }, - "IFEval - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - # prompt tokens": { - "description": "min=47.237, mean=47.237, max=47.237, sum=47.237 (1)", - "tab": "General information", - "score": 47.2365988909427 - }, - "IFEval - # output tokens": { - "description": "min=379.896, mean=379.896, max=379.896, sum=379.896 (1)", - "tab": "General information", - "score": 379.89648798521256 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WildBench", - "source_data": { - "dataset_name": "WildBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "WB Score on WildBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.788, - "details": { - "description": "min=0.788, mean=0.788, max=0.788, sum=0.788 (1)", - "tab": "Accuracy", - "WildBench - Observed inference time (s)": { - "description": "min=12.831, mean=12.831, max=12.831, sum=12.831 (1)", - "tab": "Efficiency", - "score": 12.831070138692855 - }, - "WildBench - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "WildBench - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # prompt tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # output tokens": { - "description": "min=938.182, mean=938.182, max=938.182, sum=938.182 (1)", - "tab": "General information", - "score": 938.182 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "v2" - } - } - }, - { - "evaluation_name": "Omni-MATH", - "source_data": { - "dataset_name": "Omni-MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Acc on Omni-MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.248, - "details": { - "description": "min=0.248, mean=0.248, max=0.248, sum=0.248 (1)", - "tab": "Accuracy", - "Omni-MATH - Observed inference time (s)": { - "description": "min=33.65, mean=33.65, max=33.65, sum=33.65 (1)", - "tab": "Efficiency", - "score": 33.650298661470416 - }, - "Omni-MATH - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "Omni-MATH - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - # prompt tokens": { - 
"description": "min=110.473, mean=110.473, max=110.473, sum=110.473 (1)", - "tab": "General information", - "score": 110.473 - }, - "Omni-MATH - # output tokens": { - "description": "min=753.657, mean=753.657, max=753.657, sum=753.657 (1)", - "tab": "General information", - "score": 753.657 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_capabilities/mistralai/mixtral-8x22b-instruct-v0.1/f7404ea3-62c7-47fc-9106-44c208470381.json b/data/helm_capabilities/mistralai/mixtral-8x22b-instruct-v0.1/f7404ea3-62c7-47fc-9106-44c208470381.json deleted file mode 100644 index 583f7956f..000000000 --- a/data/helm_capabilities/mistralai/mixtral-8x22b-instruct-v0.1/f7404ea3-62c7-47fc-9106-44c208470381.json +++ /dev/null @@ -1,345 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/mistralai_mixtral-8x22b-instruct-v0.1/1770835969.095764", - "retrieved_timestamp": "1770835969.095764", - "source_metadata": { - "source_name": "helm_capabilities", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mixtral Instruct 8x22B", - "id": "mistralai/mixtral-8x22b-instruct-v0.1", - "developer": "mistralai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean score", - "source_data": { - "dataset_name": "helm_capabilities", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "The mean of the scores from all columns.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.478, - "details": { - "tab": "Accuracy", - "Mean score - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 6.16132193567775 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU-Pro", - "source_data": { - "dataset_name": "MMLU-Pro", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on MMLU-Pro", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.46, - "details": { - "description": "min=0.46, mean=0.46, max=0.46, sum=0.46 (1)", - "tab": "Accuracy", - "MMLU-Pro - Observed inference time (s)": { - "description": "min=3.967, mean=3.967, max=3.967, sum=3.967 (1)", - "tab": "Efficiency", - "score": 3.967100965499878 - }, - "MMLU-Pro - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "MMLU-Pro - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - # prompt tokens": { - "description": "min=260.915, mean=260.915, max=260.915, sum=260.915 (1)", - "tab": "General information", - "score": 260.915 - }, - "MMLU-Pro - # output tokens": { - "description": "min=298.159, mean=298.159, max=298.159, sum=298.159 (1)", - "tab": "General information", - "score": 
298.159 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.334, - "details": { - "description": "min=0.334, mean=0.334, max=0.334, sum=0.334 (1)", - "tab": "Accuracy", - "GPQA - Observed inference time (s)": { - "description": "min=4.76, mean=4.76, max=4.76, sum=4.76 (1)", - "tab": "Efficiency", - "score": 4.760301354220095 - }, - "GPQA - # eval": { - "description": "min=446, mean=446, max=446, sum=446 (1)", - "tab": "General information", - "score": 446.0 - }, - "GPQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - # prompt tokens": { - "description": "min=281.998, mean=281.998, max=281.998, sum=281.998 (1)", - "tab": "General information", - "score": 281.99775784753365 - }, - "GPQA - # output tokens": { - "description": "min=403.895, mean=403.895, max=403.895, sum=403.895 (1)", - "tab": "General information", - "score": 403.89461883408075 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "IFEval Strict Acc on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.724, - "details": { - "description": "min=0.724, mean=0.724, max=0.724, sum=0.724 (1)", - "tab": "Accuracy", - "IFEval - Observed inference time (s)": { - "description": "min=4.568, mean=4.568, max=4.568, sum=4.568 (1)", - "tab": "Efficiency", - "score": 4.56831247837398 - }, - "IFEval - # eval": { - "description": "min=541, mean=541, max=541, sum=541 (1)", - "tab": "General information", - "score": 541.0 - }, - "IFEval - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - # prompt tokens": { - "description": "min=51.309, mean=51.309, max=51.309, sum=51.309 (1)", - "tab": "General information", - "score": 51.3086876155268 - }, - "IFEval - # output tokens": { - "description": "min=390.799, mean=390.799, max=390.799, sum=390.799 (1)", - "tab": "General information", - "score": 390.7985212569316 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WildBench", - "source_data": { - "dataset_name": "WildBench", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "WB Score on WildBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.711, - "details": { - "description": "min=0.711, mean=0.711, max=0.711, sum=0.711 (1)", - "tab": "Accuracy", - "WildBench - Observed inference time (s)": { - "description": "min=10.146, mean=10.146, max=10.146, sum=10.146 (1)", - "tab": "Efficiency", - "score": 10.145776480436325 - }, - "WildBench - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "WildBench - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # prompt tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # output tokens": { - "description": "min=668.768, mean=668.768, max=668.768, sum=668.768 (1)", - "tab": "General information", - "score": 668.768 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "v2" - } - } - }, - { - "evaluation_name": "Omni-MATH", - "source_data": { - "dataset_name": "Omni-MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Acc on Omni-MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.163, - "details": { - "description": "min=0.163, mean=0.163, max=0.163, sum=0.163 (1)", - "tab": "Accuracy", - "Omni-MATH - Observed inference time (s)": { - "description": "min=7.365, mean=7.365, max=7.365, sum=7.365 (1)", - "tab": "Efficiency", - "score": 7.365118399858475 - }, - "Omni-MATH - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "Omni-MATH - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - # prompt tokens": { - "description": "min=119.373, mean=119.373, max=119.373, sum=119.373 (1)", - "tab": "General information", - "score": 119.373 - }, - "Omni-MATH - # output tokens": { - "description": "min=783.89, mean=783.89, max=783.89, sum=783.89 (1)", - "tab": "General information", - "score": 783.89 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_capabilities/mistralai/mixtral-8x7b-instruct-v0.1/2817820c-4b28-4235-a8fd-ad02d0f504bc.json b/data/helm_capabilities/mistralai/mixtral-8x7b-instruct-v0.1/2817820c-4b28-4235-a8fd-ad02d0f504bc.json deleted file mode 100644 index d2c9cfb4e..000000000 --- a/data/helm_capabilities/mistralai/mixtral-8x7b-instruct-v0.1/2817820c-4b28-4235-a8fd-ad02d0f504bc.json +++ /dev/null @@ -1,345 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"helm_capabilities/mistralai_mixtral-8x7b-instruct-v0.1/1770835969.095764", - "retrieved_timestamp": "1770835969.095764", - "source_metadata": { - "source_name": "helm_capabilities", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mixtral Instruct 8x7B", - "id": "mistralai/mixtral-8x7b-instruct-v0.1", - "developer": "mistralai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean score", - "source_data": { - "dataset_name": "helm_capabilities", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "The mean of the scores from all columns.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.397, - "details": { - "tab": "Accuracy", - "Mean score - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 3.8521851769069984 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU-Pro", - "source_data": { - "dataset_name": "MMLU-Pro", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on MMLU-Pro", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.335, - "details": { - "description": "min=0.335, mean=0.335, max=0.335, sum=0.335 (1)", - "tab": "Accuracy", - "MMLU-Pro - Observed inference time (s)": { - "description": "min=2.842, mean=2.842, max=2.842, sum=2.842 (1)", - "tab": "Efficiency", - "score": 2.841812901973724 - }, - "MMLU-Pro - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "MMLU-Pro - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - # prompt tokens": { - "description": "min=260.915, mean=260.915, max=260.915, sum=260.915 (1)", - "tab": "General information", - "score": 260.915 - }, - "MMLU-Pro - # output tokens": { - "description": "min=274.355, mean=274.355, max=274.355, sum=274.355 (1)", - "tab": "General information", - "score": 274.355 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.296, - "details": { - "description": "min=0.296, mean=0.296, max=0.296, sum=0.296 (1)", - "tab": "Accuracy", - "GPQA - Observed inference time (s)": { - "description": "min=3.163, mean=3.163, max=3.163, sum=3.163 (1)", - "tab": "Efficiency", - 
"score": 3.1633052681593616 - }, - "GPQA - # eval": { - "description": "min=446, mean=446, max=446, sum=446 (1)", - "tab": "General information", - "score": 446.0 - }, - "GPQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - # prompt tokens": { - "description": "min=281.998, mean=281.998, max=281.998, sum=281.998 (1)", - "tab": "General information", - "score": 281.99775784753365 - }, - "GPQA - # output tokens": { - "description": "min=384.17, mean=384.17, max=384.17, sum=384.17 (1)", - "tab": "General information", - "score": 384.17040358744396 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "IFEval Strict Acc on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.575, - "details": { - "description": "min=0.575, mean=0.575, max=0.575, sum=0.575 (1)", - "tab": "Accuracy", - "IFEval - Observed inference time (s)": { - "description": "min=3.247, mean=3.247, max=3.247, sum=3.247 (1)", - "tab": "Efficiency", - "score": 3.2468207733027374 - }, - "IFEval - # eval": { - "description": "min=541, mean=541, max=541, sum=541 (1)", - "tab": "General information", - "score": 541.0 - }, - "IFEval - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - # prompt tokens": { - "description": "min=51.309, mean=51.309, max=51.309, sum=51.309 (1)", - "tab": "General information", - "score": 51.3086876155268 - }, - "IFEval - # output tokens": { - "description": "min=377.81, mean=377.81, max=377.81, sum=377.81 (1)", - "tab": "General information", - "score": 377.8096118299446 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WildBench", - "source_data": { - "dataset_name": "WildBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "WB Score on WildBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.673, - "details": { - "description": "min=0.673, mean=0.673, max=0.673, sum=0.673 (1)", - "tab": "Accuracy", - "WildBench - Observed inference time (s)": { - "description": "min=5.582, mean=5.582, max=5.582, sum=5.582 (1)", - "tab": "Efficiency", - "score": 5.581539319515228 - }, - "WildBench - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "WildBench - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - truncated": { - "description": "min=0, 
mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # prompt tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # output tokens": { - "description": "min=669.436, mean=669.436, max=669.436, sum=669.436 (1)", - "tab": "General information", - "score": 669.436 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "v2" - } - } - }, - { - "evaluation_name": "Omni-MATH", - "source_data": { - "dataset_name": "Omni-MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Acc on Omni-MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.105, - "details": { - "description": "min=0.105, mean=0.105, max=0.105, sum=0.105 (1)", - "tab": "Accuracy", - "Omni-MATH - Observed inference time (s)": { - "description": "min=4.427, mean=4.427, max=4.427, sum=4.427 (1)", - "tab": "Efficiency", - "score": 4.427447621583939 - }, - "Omni-MATH - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "Omni-MATH - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - # prompt tokens": { - "description": "min=119.373, mean=119.373, max=119.373, sum=119.373 (1)", - "tab": "General information", - "score": 119.373 - }, - "Omni-MATH - # output tokens": { - "description": "min=550.807, mean=550.807, max=550.807, sum=550.807 (1)", - "tab": "General information", - "score": 550.807 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_capabilities/moonshotai/kimi-k2-instruct/f3da71fc-fc88-4dda-b423-168d11eab317.json b/data/helm_capabilities/moonshotai/kimi-k2-instruct/f3da71fc-fc88-4dda-b423-168d11eab317.json deleted file mode 100644 index 1946db617..000000000 --- a/data/helm_capabilities/moonshotai/kimi-k2-instruct/f3da71fc-fc88-4dda-b423-168d11eab317.json +++ /dev/null @@ -1,345 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/moonshotai_kimi-k2-instruct/1770835969.095764", - "retrieved_timestamp": "1770835969.095764", - "source_metadata": { - "source_name": "helm_capabilities", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Kimi K2 Instruct", - "id": "moonshotai/kimi-k2-instruct", - "developer": "moonshotai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean score", - "source_data": { - "dataset_name": "helm_capabilities", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "The mean of the scores from all columns.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.768, - "details": { - "tab": "Accuracy", - "Mean score - Efficiency": { - 
"description": null, - "tab": "Efficiency", - "score": 44.938299779825435 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU-Pro", - "source_data": { - "dataset_name": "MMLU-Pro", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on MMLU-Pro", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.819, - "details": { - "description": "min=0.819, mean=0.819, max=0.819, sum=0.819 (1)", - "tab": "Accuracy", - "MMLU-Pro - Observed inference time (s)": { - "description": "min=20.295, mean=20.295, max=20.295, sum=20.295 (1)", - "tab": "Efficiency", - "score": 20.295415951013567 - }, - "MMLU-Pro - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "MMLU-Pro - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - # prompt tokens": { - "description": "min=249.352, mean=249.352, max=249.352, sum=249.352 (1)", - "tab": "General information", - "score": 249.352 - }, - "MMLU-Pro - # output tokens": { - "description": "min=703.4, mean=703.4, max=703.4, sum=703.4 (1)", - "tab": "General information", - "score": 703.4 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.652, - "details": { - "description": "min=0.652, mean=0.652, max=0.652, sum=0.652 (1)", - "tab": "Accuracy", - "GPQA - Observed inference time (s)": { - "description": "min=50.104, mean=50.104, max=50.104, sum=50.104 (1)", - "tab": "Efficiency", - "score": 50.10382581986654 - }, - "GPQA - # eval": { - "description": "min=446, mean=446, max=446, sum=446 (1)", - "tab": "General information", - "score": 446.0 - }, - "GPQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - # prompt tokens": { - "description": "min=268.74, mean=268.74, max=268.74, sum=268.74 (1)", - "tab": "General information", - "score": 268.73991031390136 - }, - "GPQA - # output tokens": { - "description": "min=1250.646, mean=1250.646, max=1250.646, sum=1250.646 (1)", - "tab": "General information", - "score": 1250.645739910314 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "IFEval Strict Acc on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.85, - "details": { - "description": "min=0.85, mean=0.85, max=0.85, sum=0.85 (1)", - "tab": "Accuracy", - "IFEval - Observed inference time (s)": { - "description": "min=17.412, mean=17.412, max=17.412, sum=17.412 (1)", - "tab": "Efficiency", - "score": 17.412336311587122 - }, - "IFEval - # eval": { - "description": "min=541, mean=541, max=541, sum=541 (1)", - "tab": "General information", - "score": 541.0 - }, - "IFEval - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - # prompt tokens": { - "description": "min=45.863, mean=45.863, max=45.863, sum=45.863 (1)", - "tab": "General information", - "score": 45.86321626617375 - }, - "IFEval - # output tokens": { - "description": "min=454.283, mean=454.283, max=454.283, sum=454.283 (1)", - "tab": "General information", - "score": 454.2828096118299 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WildBench", - "source_data": { - "dataset_name": "WildBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "WB Score on WildBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.862, - "details": { - "description": "min=0.862, mean=0.862, max=0.862, sum=0.862 (1)", - "tab": "Accuracy", - "WildBench - Observed inference time (s)": { - "description": "min=46.942, mean=46.942, max=46.942, sum=46.942 (1)", - "tab": "Efficiency", - "score": 46.94232517242432 - }, - "WildBench - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "WildBench - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # prompt tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # output tokens": { - "description": "min=1332.527, mean=1332.527, max=1332.527, sum=1332.527 (1)", - "tab": "General information", - "score": 1332.527 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "v2" - } - } - }, - { - "evaluation_name": "Omni-MATH", - "source_data": { - "dataset_name": "Omni-MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Acc on Omni-MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.654, - "details": { - "description": "min=0.654, mean=0.654, max=0.654, sum=0.654 (1)", - 
"tab": "Accuracy", - "Omni-MATH - Observed inference time (s)": { - "description": "min=89.938, mean=89.938, max=89.938, sum=89.938 (1)", - "tab": "Efficiency", - "score": 89.93759564423561 - }, - "Omni-MATH - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "Omni-MATH - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - # prompt tokens": { - "description": "min=106.59, mean=106.59, max=106.59, sum=106.59 (1)", - "tab": "General information", - "score": 106.59 - }, - "Omni-MATH - # output tokens": { - "description": "min=3396.692, mean=3396.692, max=3396.692, sum=3396.692 (1)", - "tab": "General information", - "score": 3396.692 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_capabilities/openai/gpt-4.1-2025-04-14/2f7c0db9-b5de-4674-a130-5315520dea68.json b/data/helm_capabilities/openai/gpt-4.1-2025-04-14/2f7c0db9-b5de-4674-a130-5315520dea68.json deleted file mode 100644 index 3c36cb01b..000000000 --- a/data/helm_capabilities/openai/gpt-4.1-2025-04-14/2f7c0db9-b5de-4674-a130-5315520dea68.json +++ /dev/null @@ -1,345 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/openai_gpt-4.1-2025-04-14/1770835969.095764", - "retrieved_timestamp": "1770835969.095764", - "source_metadata": { - "source_name": "helm_capabilities", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "GPT-4.1 2025-04-14", - "id": "openai/gpt-4.1-2025-04-14", - "developer": "openai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean score", - "source_data": { - "dataset_name": "helm_capabilities", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "The mean of the scores from all columns.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.727, - "details": { - "tab": "Accuracy", - "Mean score - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 11.09172884853167 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU-Pro", - "source_data": { - "dataset_name": "MMLU-Pro", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on MMLU-Pro", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.811, - "details": { - "description": "min=0.811, mean=0.811, max=0.811, sum=0.811 (1)", - "tab": "Accuracy", - "MMLU-Pro - Observed inference time (s)": { - "description": "min=6.431, mean=6.431, max=6.431, sum=6.431 (1)", - "tab": "Efficiency", - "score": 6.431383004903793 - }, - "MMLU-Pro - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - 
"MMLU-Pro - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - # prompt tokens": { - "description": "min=228.569, mean=228.569, max=228.569, sum=228.569 (1)", - "tab": "General information", - "score": 228.569 - }, - "MMLU-Pro - # output tokens": { - "description": "min=513.15, mean=513.15, max=513.15, sum=513.15 (1)", - "tab": "General information", - "score": 513.15 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.659, - "details": { - "description": "min=0.659, mean=0.659, max=0.659, sum=0.659 (1)", - "tab": "Accuracy", - "GPQA - Observed inference time (s)": { - "description": "min=9.906, mean=9.906, max=9.906, sum=9.906 (1)", - "tab": "Efficiency", - "score": 9.906458986714282 - }, - "GPQA - # eval": { - "description": "min=446, mean=446, max=446, sum=446 (1)", - "tab": "General information", - "score": 446.0 - }, - "GPQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - # prompt tokens": { - "description": "min=248.152, mean=248.152, max=248.152, sum=248.152 (1)", - "tab": "General information", - "score": 248.152466367713 - }, - "GPQA - # output tokens": { - "description": "min=824.722, mean=824.722, max=824.722, sum=824.722 (1)", - "tab": "General information", - "score": 824.7219730941704 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "IFEval Strict Acc on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.838, - "details": { - "description": "min=0.838, mean=0.838, max=0.838, sum=0.838 (1)", - "tab": "Accuracy", - "IFEval - Observed inference time (s)": { - "description": "min=3.68, mean=3.68, max=3.68, sum=3.68 (1)", - "tab": "Efficiency", - "score": 3.6797932344531836 - }, - "IFEval - # eval": { - "description": "min=541, mean=541, max=541, sum=541 (1)", - "tab": "General information", - "score": 541.0 - }, - "IFEval - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - # prompt tokens": { - "description": 
"min=45.671, mean=45.671, max=45.671, sum=45.671 (1)", - "tab": "General information", - "score": 45.67097966728281 - }, - "IFEval - # output tokens": { - "description": "min=277.305, mean=277.305, max=277.305, sum=277.305 (1)", - "tab": "General information", - "score": 277.3049907578558 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WildBench", - "source_data": { - "dataset_name": "WildBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "WB Score on WildBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.854, - "details": { - "description": "min=0.854, mean=0.854, max=0.854, sum=0.854 (1)", - "tab": "Accuracy", - "WildBench - Observed inference time (s)": { - "description": "min=11.723, mean=11.723, max=11.723, sum=11.723 (1)", - "tab": "Efficiency", - "score": 11.72278983767207 - }, - "WildBench - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "WildBench - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # prompt tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # output tokens": { - "description": "min=1007.831, mean=1007.831, max=1007.831, sum=1007.831 (1)", - "tab": "General information", - "score": 1007.831 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "v2" - } - } - }, - { - "evaluation_name": "Omni-MATH", - "source_data": { - "dataset_name": "Omni-MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Acc on Omni-MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.471, - "details": { - "description": "min=0.471, mean=0.471, max=0.471, sum=0.471 (1)", - "tab": "Accuracy", - "Omni-MATH - Observed inference time (s)": { - "description": "min=23.718, mean=23.718, max=23.718, sum=23.718 (1)", - "tab": "Efficiency", - "score": 23.718219178915025 - }, - "Omni-MATH - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "Omni-MATH - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - # prompt tokens": { - "description": "min=109.623, mean=109.623, max=109.623, sum=109.623 (1)", - "tab": "General information", - "score": 109.623 - }, - "Omni-MATH - # output tokens": { - "description": "min=1884.743, mean=1884.743, max=1884.743, sum=1884.743 (1)", - "tab": "General information", - "score": 1884.743 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git 
a/data/helm_capabilities/openai/gpt-4.1-mini-2025-04-14/4dcb8022-fe54-42f7-b43f-9866de173731.json b/data/helm_capabilities/openai/gpt-4.1-mini-2025-04-14/4dcb8022-fe54-42f7-b43f-9866de173731.json deleted file mode 100644 index dd4503511..000000000 --- a/data/helm_capabilities/openai/gpt-4.1-mini-2025-04-14/4dcb8022-fe54-42f7-b43f-9866de173731.json +++ /dev/null @@ -1,345 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/openai_gpt-4.1-mini-2025-04-14/1770835969.095764", - "retrieved_timestamp": "1770835969.095764", - "source_metadata": { - "source_name": "helm_capabilities", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "GPT-4.1 mini 2025-04-14", - "id": "openai/gpt-4.1-mini-2025-04-14", - "developer": "openai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean score", - "source_data": { - "dataset_name": "helm_capabilities", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "The mean of the scores from all columns.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.726, - "details": { - "tab": "Accuracy", - "Mean score - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 7.701476623313954 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU-Pro", - "source_data": { - "dataset_name": "MMLU-Pro", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on MMLU-Pro", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.783, - "details": { - "description": "min=0.783, mean=0.783, max=0.783, sum=0.783 (1)", - "tab": "Accuracy", - "MMLU-Pro - Observed inference time (s)": { - "description": "min=4.927, mean=4.927, max=4.927, sum=4.927 (1)", - "tab": "Efficiency", - "score": 4.927327474832535 - }, - "MMLU-Pro - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "MMLU-Pro - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - # prompt tokens": { - "description": "min=228.569, mean=228.569, max=228.569, sum=228.569 (1)", - "tab": "General information", - "score": 228.569 - }, - "MMLU-Pro - # output tokens": { - "description": "min=627.909, mean=627.909, max=627.909, sum=627.909 (1)", - "tab": "General information", - "score": 627.909 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - 
"evaluation_description": "COT correct on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.614, - "details": { - "description": "min=0.614, mean=0.614, max=0.614, sum=0.614 (1)", - "tab": "Accuracy", - "GPQA - Observed inference time (s)": { - "description": "min=8.217, mean=8.217, max=8.217, sum=8.217 (1)", - "tab": "Efficiency", - "score": 8.216832675206822 - }, - "GPQA - # eval": { - "description": "min=446, mean=446, max=446, sum=446 (1)", - "tab": "General information", - "score": 446.0 - }, - "GPQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - # prompt tokens": { - "description": "min=248.152, mean=248.152, max=248.152, sum=248.152 (1)", - "tab": "General information", - "score": 248.152466367713 - }, - "GPQA - # output tokens": { - "description": "min=1056.354, mean=1056.354, max=1056.354, sum=1056.354 (1)", - "tab": "General information", - "score": 1056.354260089686 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "IFEval Strict Acc on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.904, - "details": { - "description": "min=0.904, mean=0.904, max=0.904, sum=0.904 (1)", - "tab": "Accuracy", - "IFEval - Observed inference time (s)": { - "description": "min=2.622, mean=2.622, max=2.622, sum=2.622 (1)", - "tab": "Efficiency", - "score": 2.6219342847848774 - }, - "IFEval - # eval": { - "description": "min=541, mean=541, max=541, sum=541 (1)", - "tab": "General information", - "score": 541.0 - }, - "IFEval - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - # prompt tokens": { - "description": "min=45.671, mean=45.671, max=45.671, sum=45.671 (1)", - "tab": "General information", - "score": 45.67097966728281 - }, - "IFEval - # output tokens": { - "description": "min=275.1, mean=275.1, max=275.1, sum=275.1 (1)", - "tab": "General information", - "score": 275.09981515711644 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WildBench", - "source_data": { - "dataset_name": "WildBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "WB Score on WildBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.838, - "details": { - "description": "min=0.838, mean=0.838, max=0.838, sum=0.838 (1)", - "tab": "Accuracy", - "WildBench - Observed inference time (s)": { - "description": "min=7.331, 
mean=7.331, max=7.331, sum=7.331 (1)", - "tab": "Efficiency", - "score": 7.3305598454475405 - }, - "WildBench - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "WildBench - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # prompt tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # output tokens": { - "description": "min=1020.373, mean=1020.373, max=1020.373, sum=1020.373 (1)", - "tab": "General information", - "score": 1020.373 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "v2" - } - } - }, - { - "evaluation_name": "Omni-MATH", - "source_data": { - "dataset_name": "Omni-MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Acc on Omni-MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.491, - "details": { - "description": "min=0.491, mean=0.491, max=0.491, sum=0.491 (1)", - "tab": "Accuracy", - "Omni-MATH - Observed inference time (s)": { - "description": "min=15.411, mean=15.411, max=15.411, sum=15.411 (1)", - "tab": "Efficiency", - "score": 15.41072883629799 - }, - "Omni-MATH - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "Omni-MATH - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - # prompt tokens": { - "description": "min=109.623, mean=109.623, max=109.623, sum=109.623 (1)", - "tab": "General information", - "score": 109.623 - }, - "Omni-MATH - # output tokens": { - "description": "min=2117.264, mean=2117.264, max=2117.264, sum=2117.264 (1)", - "tab": "General information", - "score": 2117.264 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_capabilities/openai/gpt-4.1-nano-2025-04-14/c436f3d1-84ee-49df-9287-0305925f7cf4.json b/data/helm_capabilities/openai/gpt-4.1-nano-2025-04-14/c436f3d1-84ee-49df-9287-0305925f7cf4.json deleted file mode 100644 index e2550958a..000000000 --- a/data/helm_capabilities/openai/gpt-4.1-nano-2025-04-14/c436f3d1-84ee-49df-9287-0305925f7cf4.json +++ /dev/null @@ -1,345 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/openai_gpt-4.1-nano-2025-04-14/1770835969.095764", - "retrieved_timestamp": "1770835969.095764", - "source_metadata": { - "source_name": "helm_capabilities", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "GPT-4.1 nano 2025-04-14", - "id": "openai/gpt-4.1-nano-2025-04-14", - "developer": "openai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean score", - "source_data": { - "dataset_name": "helm_capabilities", - "source_type": "url", - 
"url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "The mean of the scores from all columns.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.616, - "details": { - "tab": "Accuracy", - "Mean score - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 4.5128146238794296 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU-Pro", - "source_data": { - "dataset_name": "MMLU-Pro", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on MMLU-Pro", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.55, - "details": { - "description": "min=0.55, mean=0.55, max=0.55, sum=0.55 (1)", - "tab": "Accuracy", - "MMLU-Pro - Observed inference time (s)": { - "description": "min=2.935, mean=2.935, max=2.935, sum=2.935 (1)", - "tab": "Efficiency", - "score": 2.9353291485309603 - }, - "MMLU-Pro - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "MMLU-Pro - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - # prompt tokens": { - "description": "min=228.569, mean=228.569, max=228.569, sum=228.569 (1)", - "tab": "General information", - "score": 228.569 - }, - "MMLU-Pro - # output tokens": { - "description": "min=503.09, mean=503.09, max=503.09, sum=503.09 (1)", - "tab": "General information", - "score": 503.09 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.507, - "details": { - "description": "min=0.507, mean=0.507, max=0.507, sum=0.507 (1)", - "tab": "Accuracy", - "GPQA - Observed inference time (s)": { - "description": "min=4.817, mean=4.817, max=4.817, sum=4.817 (1)", - "tab": "Efficiency", - "score": 4.816804544808084 - }, - "GPQA - # eval": { - "description": "min=446, mean=446, max=446, sum=446 (1)", - "tab": "General information", - "score": 446.0 - }, - "GPQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - # prompt tokens": { - "description": "min=248.152, mean=248.152, max=248.152, sum=248.152 (1)", - "tab": "General information", - "score": 248.152466367713 - }, - "GPQA - # output tokens": { - 
"description": "min=842.038, mean=842.038, max=842.038, sum=842.038 (1)", - "tab": "General information", - "score": 842.0381165919282 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "IFEval Strict Acc on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.843, - "details": { - "description": "min=0.843, mean=0.843, max=0.843, sum=0.843 (1)", - "tab": "Accuracy", - "IFEval - Observed inference time (s)": { - "description": "min=1.781, mean=1.781, max=1.781, sum=1.781 (1)", - "tab": "Efficiency", - "score": 1.7811373196776386 - }, - "IFEval - # eval": { - "description": "min=541, mean=541, max=541, sum=541 (1)", - "tab": "General information", - "score": 541.0 - }, - "IFEval - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - # prompt tokens": { - "description": "min=45.671, mean=45.671, max=45.671, sum=45.671 (1)", - "tab": "General information", - "score": 45.67097966728281 - }, - "IFEval - # output tokens": { - "description": "min=269.619, mean=269.619, max=269.619, sum=269.619 (1)", - "tab": "General information", - "score": 269.6192236598891 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WildBench", - "source_data": { - "dataset_name": "WildBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "WB Score on WildBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.811, - "details": { - "description": "min=0.811, mean=0.811, max=0.811, sum=0.811 (1)", - "tab": "Accuracy", - "WildBench - Observed inference time (s)": { - "description": "min=4.453, mean=4.453, max=4.453, sum=4.453 (1)", - "tab": "Efficiency", - "score": 4.453118676900863 - }, - "WildBench - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "WildBench - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # prompt tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # output tokens": { - "description": "min=909.661, mean=909.661, max=909.661, sum=909.661 (1)", - "tab": "General information", - "score": 909.661 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "v2" - } - } - }, - { - "evaluation_name": "Omni-MATH", - "source_data": { - "dataset_name": "Omni-MATH", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Acc on Omni-MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.367, - "details": { - "description": "min=0.367, mean=0.367, max=0.367, sum=0.367 (1)", - "tab": "Accuracy", - "Omni-MATH - Observed inference time (s)": { - "description": "min=8.578, mean=8.578, max=8.578, sum=8.578 (1)", - "tab": "Efficiency", - "score": 8.577683429479599 - }, - "Omni-MATH - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "Omni-MATH - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - # prompt tokens": { - "description": "min=109.623, mean=109.623, max=109.623, sum=109.623 (1)", - "tab": "General information", - "score": 109.623 - }, - "Omni-MATH - # output tokens": { - "description": "min=1777.605, mean=1777.605, max=1777.605, sum=1777.605 (1)", - "tab": "General information", - "score": 1777.605 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_capabilities/openai/gpt-4o-2024-11-20/90ba0b16-b866-4b18-bd84-6a8cd1c47c47.json b/data/helm_capabilities/openai/gpt-4o-2024-11-20/90ba0b16-b866-4b18-bd84-6a8cd1c47c47.json deleted file mode 100644 index 3c3d40256..000000000 --- a/data/helm_capabilities/openai/gpt-4o-2024-11-20/90ba0b16-b866-4b18-bd84-6a8cd1c47c47.json +++ /dev/null @@ -1,345 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/openai_gpt-4o-2024-11-20/1770835969.095764", - "retrieved_timestamp": "1770835969.095764", - "source_metadata": { - "source_name": "helm_capabilities", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "GPT-4o 2024-11-20", - "id": "openai/gpt-4o-2024-11-20", - "developer": "openai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean score", - "source_data": { - "dataset_name": "helm_capabilities", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "The mean of the scores from all columns.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.634, - "details": { - "tab": "Accuracy", - "Mean score - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 13.268214070783824 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU-Pro", - "source_data": { - "dataset_name": "MMLU-Pro", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on MMLU-Pro", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.713, - 
"details": { - "description": "min=0.713, mean=0.713, max=0.713, sum=0.713 (1)", - "tab": "Accuracy", - "MMLU-Pro - Observed inference time (s)": { - "description": "min=11.334, mean=11.334, max=11.334, sum=11.334 (1)", - "tab": "Efficiency", - "score": 11.333669463157653 - }, - "MMLU-Pro - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "MMLU-Pro - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - # prompt tokens": { - "description": "min=228.569, mean=228.569, max=228.569, sum=228.569 (1)", - "tab": "General information", - "score": 228.569 - }, - "MMLU-Pro - # output tokens": { - "description": "min=503.126, mean=503.126, max=503.126, sum=503.126 (1)", - "tab": "General information", - "score": 503.126 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.52, - "details": { - "description": "min=0.52, mean=0.52, max=0.52, sum=0.52 (1)", - "tab": "Accuracy", - "GPQA - Observed inference time (s)": { - "description": "min=13.65, mean=13.65, max=13.65, sum=13.65 (1)", - "tab": "Efficiency", - "score": 13.64998589877056 - }, - "GPQA - # eval": { - "description": "min=446, mean=446, max=446, sum=446 (1)", - "tab": "General information", - "score": 446.0 - }, - "GPQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - # prompt tokens": { - "description": "min=248.152, mean=248.152, max=248.152, sum=248.152 (1)", - "tab": "General information", - "score": 248.152466367713 - }, - "GPQA - # output tokens": { - "description": "min=597.291, mean=597.291, max=597.291, sum=597.291 (1)", - "tab": "General information", - "score": 597.2914798206278 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "IFEval Strict Acc on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.817, - "details": { - "description": "min=0.817, mean=0.817, max=0.817, sum=0.817 (1)", - "tab": "Accuracy", - "IFEval - Observed inference time (s)": { - "description": "min=8.686, mean=8.686, max=8.686, sum=8.686 (1)", - "tab": "Efficiency", - "score": 8.68623784685752 - }, - "IFEval - # eval": { - 
"description": "min=541, mean=541, max=541, sum=541 (1)", - "tab": "General information", - "score": 541.0 - }, - "IFEval - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - # prompt tokens": { - "description": "min=45.671, mean=45.671, max=45.671, sum=45.671 (1)", - "tab": "General information", - "score": 45.67097966728281 - }, - "IFEval - # output tokens": { - "description": "min=345.405, mean=345.405, max=345.405, sum=345.405 (1)", - "tab": "General information", - "score": 345.40480591497226 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WildBench", - "source_data": { - "dataset_name": "WildBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "WB Score on WildBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.828, - "details": { - "description": "min=0.828, mean=0.828, max=0.828, sum=0.828 (1)", - "tab": "Accuracy", - "WildBench - Observed inference time (s)": { - "description": "min=15.765, mean=15.765, max=15.765, sum=15.765 (1)", - "tab": "Efficiency", - "score": 15.764520774255166 - }, - "WildBench - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "WildBench - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # prompt tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # output tokens": { - "description": "min=1044.923, mean=1044.923, max=1044.923, sum=1044.923 (1)", - "tab": "General information", - "score": 1044.923 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "v2" - } - } - }, - { - "evaluation_name": "Omni-MATH", - "source_data": { - "dataset_name": "Omni-MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Acc on Omni-MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.293, - "details": { - "description": "min=0.293, mean=0.293, max=0.293, sum=0.293 (1)", - "tab": "Accuracy", - "Omni-MATH - Observed inference time (s)": { - "description": "min=16.907, mean=16.907, max=16.907, sum=16.907 (1)", - "tab": "Efficiency", - "score": 16.90665637087822 - }, - "Omni-MATH - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "Omni-MATH - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - # prompt tokens": { - 
"description": "min=109.623, mean=109.623, max=109.623, sum=109.623 (1)", - "tab": "General information", - "score": 109.623 - }, - "Omni-MATH - # output tokens": { - "description": "min=908.643, mean=908.643, max=908.643, sum=908.643 (1)", - "tab": "General information", - "score": 908.643 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_capabilities/openai/gpt-4o-mini-2024-07-18/07c823ba-9e17-47e4-858b-a1f2a514a276.json b/data/helm_capabilities/openai/gpt-4o-mini-2024-07-18/07c823ba-9e17-47e4-858b-a1f2a514a276.json deleted file mode 100644 index 778449e6e..000000000 --- a/data/helm_capabilities/openai/gpt-4o-mini-2024-07-18/07c823ba-9e17-47e4-858b-a1f2a514a276.json +++ /dev/null @@ -1,345 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/openai_gpt-4o-mini-2024-07-18/1770835969.095764", - "retrieved_timestamp": "1770835969.095764", - "source_metadata": { - "source_name": "helm_capabilities", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "GPT-4o mini 2024-07-18", - "id": "openai/gpt-4o-mini-2024-07-18", - "developer": "openai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean score", - "source_data": { - "dataset_name": "helm_capabilities", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "The mean of the scores from all columns.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.565, - "details": { - "tab": "Accuracy", - "Mean score - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 10.41176955262334 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU-Pro", - "source_data": { - "dataset_name": "MMLU-Pro", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on MMLU-Pro", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.603, - "details": { - "description": "min=0.603, mean=0.603, max=0.603, sum=0.603 (1)", - "tab": "Accuracy", - "MMLU-Pro - Observed inference time (s)": { - "description": "min=6.572, mean=6.572, max=6.572, sum=6.572 (1)", - "tab": "Efficiency", - "score": 6.57206253027916 - }, - "MMLU-Pro - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "MMLU-Pro - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - # prompt tokens": { - "description": "min=228.569, mean=228.569, max=228.569, sum=228.569 (1)", - "tab": "General information", - "score": 228.569 - }, - "MMLU-Pro - # output tokens": { - "description": "min=334.86, mean=334.86, max=334.86, sum=334.86 (1)", - "tab": "General information", - "score": 334.86 - } - } - }, - "generation_config": { - 
"additional_details": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.368, - "details": { - "description": "min=0.368, mean=0.368, max=0.368, sum=0.368 (1)", - "tab": "Accuracy", - "GPQA - Observed inference time (s)": { - "description": "min=8.814, mean=8.814, max=8.814, sum=8.814 (1)", - "tab": "Efficiency", - "score": 8.813848996910814 - }, - "GPQA - # eval": { - "description": "min=446, mean=446, max=446, sum=446 (1)", - "tab": "General information", - "score": 446.0 - }, - "GPQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - # prompt tokens": { - "description": "min=248.152, mean=248.152, max=248.152, sum=248.152 (1)", - "tab": "General information", - "score": 248.152466367713 - }, - "GPQA - # output tokens": { - "description": "min=489.226, mean=489.226, max=489.226, sum=489.226 (1)", - "tab": "General information", - "score": 489.22645739910314 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "IFEval Strict Acc on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.782, - "details": { - "description": "min=0.782, mean=0.782, max=0.782, sum=0.782 (1)", - "tab": "Accuracy", - "IFEval - Observed inference time (s)": { - "description": "min=5.963, mean=5.963, max=5.963, sum=5.963 (1)", - "tab": "Efficiency", - "score": 5.963314282916169 - }, - "IFEval - # eval": { - "description": "min=541, mean=541, max=541, sum=541 (1)", - "tab": "General information", - "score": 541.0 - }, - "IFEval - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - # prompt tokens": { - "description": "min=45.671, mean=45.671, max=45.671, sum=45.671 (1)", - "tab": "General information", - "score": 45.67097966728281 - }, - "IFEval - # output tokens": { - "description": "min=314.919, mean=314.919, max=314.919, sum=314.919 (1)", - "tab": "General information", - "score": 314.91866913123846 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WildBench", - "source_data": { - "dataset_name": "WildBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - 
"metric_config": { - "evaluation_description": "WB Score on WildBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.791, - "details": { - "description": "min=0.791, mean=0.791, max=0.791, sum=0.791 (1)", - "tab": "Accuracy", - "WildBench - Observed inference time (s)": { - "description": "min=13.996, mean=13.996, max=13.996, sum=13.996 (1)", - "tab": "Efficiency", - "score": 13.996195561885834 - }, - "WildBench - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "WildBench - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # prompt tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # output tokens": { - "description": "min=809.307, mean=809.307, max=809.307, sum=809.307 (1)", - "tab": "General information", - "score": 809.307 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "v2" - } - } - }, - { - "evaluation_name": "Omni-MATH", - "source_data": { - "dataset_name": "Omni-MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Acc on Omni-MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.28, - "details": { - "description": "min=0.28, mean=0.28, max=0.28, sum=0.28 (1)", - "tab": "Accuracy", - "Omni-MATH - Observed inference time (s)": { - "description": "min=16.713, mean=16.713, max=16.713, sum=16.713 (1)", - "tab": "Efficiency", - "score": 16.713426391124724 - }, - "Omni-MATH - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "Omni-MATH - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - # prompt tokens": { - "description": "min=109.623, mean=109.623, max=109.623, sum=109.623 (1)", - "tab": "General information", - "score": 109.623 - }, - "Omni-MATH - # output tokens": { - "description": "min=863.417, mean=863.417, max=863.417, sum=863.417 (1)", - "tab": "General information", - "score": 863.417 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_capabilities/openai/gpt-5-2025-08-07/eb1bb443-71ad-4b79-8308-2b66c5e8c631.json b/data/helm_capabilities/openai/gpt-5-2025-08-07/eb1bb443-71ad-4b79-8308-2b66c5e8c631.json deleted file mode 100644 index 95d9762ef..000000000 --- a/data/helm_capabilities/openai/gpt-5-2025-08-07/eb1bb443-71ad-4b79-8308-2b66c5e8c631.json +++ /dev/null @@ -1,345 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/openai_gpt-5-2025-08-07/1770835969.095764", - "retrieved_timestamp": "1770835969.095764", - "source_metadata": { - "source_name": "helm_capabilities", - "source_type": "documentation", - "source_organization_name": "crfm", - 
"evaluator_relationship": "third_party" - }, - "model_info": { - "name": "GPT-5 2025-08-07", - "id": "openai/gpt-5-2025-08-07", - "developer": "openai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean score", - "source_data": { - "dataset_name": "helm_capabilities", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "The mean of the scores from all columns.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.807, - "details": { - "tab": "Accuracy", - "Mean score - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 74.66990821942755 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU-Pro", - "source_data": { - "dataset_name": "MMLU-Pro", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on MMLU-Pro", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.863, - "details": { - "description": "min=0.863, mean=0.863, max=0.863, sum=0.863 (1)", - "tab": "Accuracy", - "MMLU-Pro - Observed inference time (s)": { - "description": "min=18.668, mean=18.668, max=18.668, sum=18.668 (1)", - "tab": "Efficiency", - "score": 18.668269051074983 - }, - "MMLU-Pro - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "MMLU-Pro - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - # prompt tokens": { - "description": "min=248.569, mean=248.569, max=248.569, sum=248.569 (1)", - "tab": "General information", - "score": 248.569 - }, - "MMLU-Pro - # output tokens": { - "description": "min=5.028, mean=5.028, max=5.028, sum=5.028 (1)", - "tab": "General information", - "score": 5.028 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.791, - "details": { - "description": "min=0.791, mean=0.791, max=0.791, sum=0.791 (1)", - "tab": "Accuracy", - "GPQA - Observed inference time (s)": { - "description": "min=57.418, mean=57.418, max=57.418, sum=57.418 (1)", - "tab": "Efficiency", - "score": 57.41822674028542 - }, - "GPQA - # eval": { - "description": "min=446, mean=446, max=446, sum=446 (1)", - "tab": "General information", - "score": 446.0 - }, - "GPQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 
0.0 - }, - "GPQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - # prompt tokens": { - "description": "min=268.152, mean=268.152, max=268.152, sum=268.152 (1)", - "tab": "General information", - "score": 268.15246636771303 - }, - "GPQA - # output tokens": { - "description": "min=5.935, mean=5.935, max=5.935, sum=5.935 (1)", - "tab": "General information", - "score": 5.934977578475336 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "IFEval Strict Acc on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.875, - "details": { - "description": "min=0.875, mean=0.875, max=0.875, sum=0.875 (1)", - "tab": "Accuracy", - "IFEval - Observed inference time (s)": { - "description": "min=35.937, mean=35.937, max=35.937, sum=35.937 (1)", - "tab": "Efficiency", - "score": 35.937195608664354 - }, - "IFEval - # eval": { - "description": "min=541, mean=541, max=541, sum=541 (1)", - "tab": "General information", - "score": 541.0 - }, - "IFEval - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - # prompt tokens": { - "description": "min=45.671, mean=45.671, max=45.671, sum=45.671 (1)", - "tab": "General information", - "score": 45.67097966728281 - }, - "IFEval - # output tokens": { - "description": "min=527.641, mean=527.641, max=527.641, sum=527.641 (1)", - "tab": "General information", - "score": 527.6414048059149 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WildBench", - "source_data": { - "dataset_name": "WildBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "WB Score on WildBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.857, - "details": { - "description": "min=0.857, mean=0.857, max=0.857, sum=0.857 (1)", - "tab": "Accuracy", - "WildBench - Observed inference time (s)": { - "description": "min=88.595, mean=88.595, max=88.595, sum=88.595 (1)", - "tab": "Efficiency", - "score": 88.59490567517281 - }, - "WildBench - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "WildBench - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # prompt tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # output tokens": { - "description": 
"min=1518.974, mean=1518.974, max=1518.974, sum=1518.974 (1)", - "tab": "General information", - "score": 1518.974 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "v2" - } - } - }, - { - "evaluation_name": "Omni-MATH", - "source_data": { - "dataset_name": "Omni-MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Acc on Omni-MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.647, - "details": { - "description": "min=0.647, mean=0.647, max=0.647, sum=0.647 (1)", - "tab": "Accuracy", - "Omni-MATH - Observed inference time (s)": { - "description": "min=172.731, mean=172.731, max=172.731, sum=172.731 (1)", - "tab": "Efficiency", - "score": 172.73094402194022 - }, - "Omni-MATH - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "Omni-MATH - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - # prompt tokens": { - "description": "min=109.623, mean=109.623, max=109.623, sum=109.623 (1)", - "tab": "General information", - "score": 109.623 - }, - "Omni-MATH - # output tokens": { - "description": "min=228.774, mean=228.774, max=228.774, sum=228.774 (1)", - "tab": "General information", - "score": 228.774 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_capabilities/openai/gpt-5-mini-2025-08-07/e14d42a9-9639-4c35-8a0c-e395e754c46c.json b/data/helm_capabilities/openai/gpt-5-mini-2025-08-07/e14d42a9-9639-4c35-8a0c-e395e754c46c.json deleted file mode 100644 index 5dc165206..000000000 --- a/data/helm_capabilities/openai/gpt-5-mini-2025-08-07/e14d42a9-9639-4c35-8a0c-e395e754c46c.json +++ /dev/null @@ -1,345 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/openai_gpt-5-mini-2025-08-07/1770835969.095764", - "retrieved_timestamp": "1770835969.095764", - "source_metadata": { - "source_name": "helm_capabilities", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "GPT-5 mini 2025-08-07", - "id": "openai/gpt-5-mini-2025-08-07", - "developer": "openai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean score", - "source_data": { - "dataset_name": "helm_capabilities", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "The mean of the scores from all columns.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.819, - "details": { - "tab": "Accuracy", - "Mean score - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 28.206869066978612 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU-Pro", - "source_data": { - "dataset_name": "MMLU-Pro", - "source_type": "url", - 
"url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on MMLU-Pro", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.835, - "details": { - "description": "min=0.835, mean=0.835, max=0.835, sum=0.835 (1)", - "tab": "Accuracy", - "MMLU-Pro - Observed inference time (s)": { - "description": "min=11.803, mean=11.803, max=11.803, sum=11.803 (1)", - "tab": "Efficiency", - "score": 11.802515007257462 - }, - "MMLU-Pro - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "MMLU-Pro - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - # prompt tokens": { - "description": "min=248.569, mean=248.569, max=248.569, sum=248.569 (1)", - "tab": "General information", - "score": 248.569 - }, - "MMLU-Pro - # output tokens": { - "description": "min=17.495, mean=17.495, max=17.495, sum=17.495 (1)", - "tab": "General information", - "score": 17.495 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.756, - "details": { - "description": "min=0.756, mean=0.756, max=0.756, sum=0.756 (1)", - "tab": "Accuracy", - "GPQA - Observed inference time (s)": { - "description": "min=20.737, mean=20.737, max=20.737, sum=20.737 (1)", - "tab": "Efficiency", - "score": 20.737325443280653 - }, - "GPQA - # eval": { - "description": "min=446, mean=446, max=446, sum=446 (1)", - "tab": "General information", - "score": 446.0 - }, - "GPQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - # prompt tokens": { - "description": "min=268.152, mean=268.152, max=268.152, sum=268.152 (1)", - "tab": "General information", - "score": 268.15246636771303 - }, - "GPQA - # output tokens": { - "description": "min=25.379, mean=25.379, max=25.379, sum=25.379 (1)", - "tab": "General information", - "score": 25.378923766816143 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "IFEval Strict Acc on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 
0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.927, - "details": { - "description": "min=0.927, mean=0.927, max=0.927, sum=0.927 (1)", - "tab": "Accuracy", - "IFEval - Observed inference time (s)": { - "description": "min=18.834, mean=18.834, max=18.834, sum=18.834 (1)", - "tab": "Efficiency", - "score": 18.83414089833963 - }, - "IFEval - # eval": { - "description": "min=541, mean=541, max=541, sum=541 (1)", - "tab": "General information", - "score": 541.0 - }, - "IFEval - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - # prompt tokens": { - "description": "min=45.671, mean=45.671, max=45.671, sum=45.671 (1)", - "tab": "General information", - "score": 45.67097966728281 - }, - "IFEval - # output tokens": { - "description": "min=441.137, mean=441.137, max=441.137, sum=441.137 (1)", - "tab": "General information", - "score": 441.13678373382623 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WildBench", - "source_data": { - "dataset_name": "WildBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "WB Score on WildBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.855, - "details": { - "description": "min=0.855, mean=0.855, max=0.855, sum=0.855 (1)", - "tab": "Accuracy", - "WildBench - Observed inference time (s)": { - "description": "min=33.854, mean=33.854, max=33.854, sum=33.854 (1)", - "tab": "Efficiency", - "score": 33.85394237089157 - }, - "WildBench - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "WildBench - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # prompt tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # output tokens": { - "description": "min=1408.024, mean=1408.024, max=1408.024, sum=1408.024 (1)", - "tab": "General information", - "score": 1408.024 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "v2" - } - } - }, - { - "evaluation_name": "Omni-MATH", - "source_data": { - "dataset_name": "Omni-MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Acc on Omni-MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.722, - "details": { - "description": "min=0.722, mean=0.722, max=0.722, sum=0.722 (1)", - "tab": "Accuracy", - "Omni-MATH - Observed inference time (s)": { - "description": "min=55.806, mean=55.806, max=55.806, sum=55.806 (1)", - "tab": "Efficiency", - "score": 55.806421615123746 - }, - "Omni-MATH - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", 
- "tab": "General information", - "score": 1000.0 - }, - "Omni-MATH - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - # prompt tokens": { - "description": "min=109.623, mean=109.623, max=109.623, sum=109.623 (1)", - "tab": "General information", - "score": 109.623 - }, - "Omni-MATH - # output tokens": { - "description": "min=362.654, mean=362.654, max=362.654, sum=362.654 (1)", - "tab": "General information", - "score": 362.654 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_capabilities/openai/gpt-5-nano-2025-08-07/3754df44-ddce-4a66-9074-f65f5677ae27.json b/data/helm_capabilities/openai/gpt-5-nano-2025-08-07/3754df44-ddce-4a66-9074-f65f5677ae27.json deleted file mode 100644 index 096518c62..000000000 --- a/data/helm_capabilities/openai/gpt-5-nano-2025-08-07/3754df44-ddce-4a66-9074-f65f5677ae27.json +++ /dev/null @@ -1,345 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/openai_gpt-5-nano-2025-08-07/1770835969.095764", - "retrieved_timestamp": "1770835969.095764", - "source_metadata": { - "source_name": "helm_capabilities", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "GPT-5 nano 2025-08-07", - "id": "openai/gpt-5-nano-2025-08-07", - "developer": "openai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean score", - "source_data": { - "dataset_name": "helm_capabilities", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "The mean of the scores from all columns.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.748, - "details": { - "tab": "Accuracy", - "Mean score - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 48.213836350621065 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU-Pro", - "source_data": { - "dataset_name": "MMLU-Pro", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on MMLU-Pro", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.778, - "details": { - "description": "min=0.778, mean=0.778, max=0.778, sum=0.778 (1)", - "tab": "Accuracy", - "MMLU-Pro - Observed inference time (s)": { - "description": "min=17.337, mean=17.337, max=17.337, sum=17.337 (1)", - "tab": "Efficiency", - "score": 17.336622306585312 - }, - "MMLU-Pro - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "MMLU-Pro - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - 
"MMLU-Pro - # prompt tokens": { - "description": "min=248.569, mean=248.569, max=248.569, sum=248.569 (1)", - "tab": "General information", - "score": 248.569 - }, - "MMLU-Pro - # output tokens": { - "description": "min=5.385, mean=5.385, max=5.385, sum=5.385 (1)", - "tab": "General information", - "score": 5.385 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.679, - "details": { - "description": "min=0.679, mean=0.679, max=0.679, sum=0.679 (1)", - "tab": "Accuracy", - "GPQA - Observed inference time (s)": { - "description": "min=30.246, mean=30.246, max=30.246, sum=30.246 (1)", - "tab": "Efficiency", - "score": 30.2457077674267 - }, - "GPQA - # eval": { - "description": "min=446, mean=446, max=446, sum=446 (1)", - "tab": "General information", - "score": 446.0 - }, - "GPQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - # prompt tokens": { - "description": "min=268.152, mean=268.152, max=268.152, sum=268.152 (1)", - "tab": "General information", - "score": 268.15246636771303 - }, - "GPQA - # output tokens": { - "description": "min=5.668, mean=5.668, max=5.668, sum=5.668 (1)", - "tab": "General information", - "score": 5.668161434977579 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "IFEval Strict Acc on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.932, - "details": { - "description": "min=0.932, mean=0.932, max=0.932, sum=0.932 (1)", - "tab": "Accuracy", - "IFEval - Observed inference time (s)": { - "description": "min=26.735, mean=26.735, max=26.735, sum=26.735 (1)", - "tab": "Efficiency", - "score": 26.734930773980075 - }, - "IFEval - # eval": { - "description": "min=541, mean=541, max=541, sum=541 (1)", - "tab": "General information", - "score": 541.0 - }, - "IFEval - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - # prompt tokens": { - "description": "min=45.671, mean=45.671, max=45.671, sum=45.671 (1)", - "tab": "General information", - "score": 45.67097966728281 - }, - "IFEval - # output tokens": { - "description": "min=426.656, mean=426.656, max=426.656, sum=426.656 (1)", - "tab": "General information", - 
"score": 426.6561922365989 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WildBench", - "source_data": { - "dataset_name": "WildBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "WB Score on WildBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.806, - "details": { - "description": "min=0.806, mean=0.806, max=0.806, sum=0.806 (1)", - "tab": "Accuracy", - "WildBench - Observed inference time (s)": { - "description": "min=47.56, mean=47.56, max=47.56, sum=47.56 (1)", - "tab": "Efficiency", - "score": 47.560468022584914 - }, - "WildBench - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "WildBench - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # prompt tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # output tokens": { - "description": "min=1391.144, mean=1391.144, max=1391.144, sum=1391.144 (1)", - "tab": "General information", - "score": 1391.144 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "v2" - } - } - }, - { - "evaluation_name": "Omni-MATH", - "source_data": { - "dataset_name": "Omni-MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Acc on Omni-MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.547, - "details": { - "description": "min=0.547, mean=0.547, max=0.547, sum=0.547 (1)", - "tab": "Accuracy", - "Omni-MATH - Observed inference time (s)": { - "description": "min=119.191, mean=119.191, max=119.191, sum=119.191 (1)", - "tab": "Efficiency", - "score": 119.19145288252831 - }, - "Omni-MATH - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "Omni-MATH - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - # prompt tokens": { - "description": "min=109.623, mean=109.623, max=109.623, sum=109.623 (1)", - "tab": "General information", - "score": 109.623 - }, - "Omni-MATH - # output tokens": { - "description": "min=222.15, mean=222.15, max=222.15, sum=222.15 (1)", - "tab": "General information", - "score": 222.15 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_capabilities/openai/gpt-5.1-2025-11-13/a540b282-e9d6-403e-96df-a1d27ad14d3a.json b/data/helm_capabilities/openai/gpt-5.1-2025-11-13/a540b282-e9d6-403e-96df-a1d27ad14d3a.json deleted file mode 100644 index 738007852..000000000 --- 
a/data/helm_capabilities/openai/gpt-5.1-2025-11-13/a540b282-e9d6-403e-96df-a1d27ad14d3a.json +++ /dev/null @@ -1,345 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/openai_gpt-5.1-2025-11-13/1770835969.095764", - "retrieved_timestamp": "1770835969.095764", - "source_metadata": { - "source_name": "helm_capabilities", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "GPT-5.1 2025-11-13", - "id": "openai/gpt-5.1-2025-11-13", - "developer": "openai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean score", - "source_data": { - "dataset_name": "helm_capabilities", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "The mean of the scores from all columns.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.656, - "details": { - "tab": "Accuracy", - "Mean score - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 10.620566227529599 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU-Pro", - "source_data": { - "dataset_name": "MMLU-Pro", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on MMLU-Pro", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.579, - "details": { - "description": "min=0.579, mean=0.579, max=0.579, sum=0.579 (1)", - "tab": "Accuracy", - "MMLU-Pro - Observed inference time (s)": { - "description": "min=1.147, mean=1.147, max=1.147, sum=1.147 (1)", - "tab": "Efficiency", - "score": 1.1470122172832489 - }, - "MMLU-Pro - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "MMLU-Pro - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - # prompt tokens": { - "description": "min=248.569, mean=248.569, max=248.569, sum=248.569 (1)", - "tab": "General information", - "score": 248.569 - }, - "MMLU-Pro - # output tokens": { - "description": "min=5.002, mean=5.002, max=5.002, sum=5.002 (1)", - "tab": "General information", - "score": 5.002 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.442, - "details": { - "description": "min=0.442, mean=0.442, max=0.442, sum=0.442 (1)", - "tab": "Accuracy", 
- "GPQA - Observed inference time (s)": { - "description": "min=1.002, mean=1.002, max=1.002, sum=1.002 (1)", - "tab": "Efficiency", - "score": 1.002433323539426 - }, - "GPQA - # eval": { - "description": "min=446, mean=446, max=446, sum=446 (1)", - "tab": "General information", - "score": 446.0 - }, - "GPQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - # prompt tokens": { - "description": "min=268.152, mean=268.152, max=268.152, sum=268.152 (1)", - "tab": "General information", - "score": 268.15246636771303 - }, - "GPQA - # output tokens": { - "description": "min=5.422, mean=5.422, max=5.422, sum=5.422 (1)", - "tab": "General information", - "score": 5.42152466367713 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "IFEval Strict Acc on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.935, - "details": { - "description": "min=0.935, mean=0.935, max=0.935, sum=0.935 (1)", - "tab": "Accuracy", - "IFEval - Observed inference time (s)": { - "description": "min=13.159, mean=13.159, max=13.159, sum=13.159 (1)", - "tab": "Efficiency", - "score": 13.15882584436103 - }, - "IFEval - # eval": { - "description": "min=541, mean=541, max=541, sum=541 (1)", - "tab": "General information", - "score": 541.0 - }, - "IFEval - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - # prompt tokens": { - "description": "min=45.671, mean=45.671, max=45.671, sum=45.671 (1)", - "tab": "General information", - "score": 45.67097966728281 - }, - "IFEval - # output tokens": { - "description": "min=647.063, mean=647.063, max=647.063, sum=647.063 (1)", - "tab": "General information", - "score": 647.0628465804067 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WildBench", - "source_data": { - "dataset_name": "WildBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "WB Score on WildBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.863, - "details": { - "description": "min=0.863, mean=0.863, max=0.863, sum=0.863 (1)", - "tab": "Accuracy", - "WildBench - Observed inference time (s)": { - "description": "min=28.081, mean=28.081, max=28.081, sum=28.081 (1)", - "tab": "Efficiency", - "score": 28.08133857488632 - }, - "WildBench - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "WildBench - # train": { - "description": 
"min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # prompt tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # output tokens": { - "description": "min=2059.716, mean=2059.716, max=2059.716, sum=2059.716 (1)", - "tab": "General information", - "score": 2059.716 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "v2" - } - } - }, - { - "evaluation_name": "Omni-MATH", - "source_data": { - "dataset_name": "Omni-MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Acc on Omni-MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.464, - "details": { - "description": "min=0.464, mean=0.464, max=0.464, sum=0.464 (1)", - "tab": "Accuracy", - "Omni-MATH - Observed inference time (s)": { - "description": "min=9.713, mean=9.713, max=9.713, sum=9.713 (1)", - "tab": "Efficiency", - "score": 9.713221177577973 - }, - "Omni-MATH - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "Omni-MATH - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - # prompt tokens": { - "description": "min=109.623, mean=109.623, max=109.623, sum=109.623 (1)", - "tab": "General information", - "score": 109.623 - }, - "Omni-MATH - # output tokens": { - "description": "min=1256.266, mean=1256.266, max=1256.266, sum=1256.266 (1)", - "tab": "General information", - "score": 1256.266 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_capabilities/openai/gpt-oss-120b/758851b3-9ac9-43d8-8b6a-3d9688752d80.json b/data/helm_capabilities/openai/gpt-oss-120b/758851b3-9ac9-43d8-8b6a-3d9688752d80.json deleted file mode 100644 index 8642e9954..000000000 --- a/data/helm_capabilities/openai/gpt-oss-120b/758851b3-9ac9-43d8-8b6a-3d9688752d80.json +++ /dev/null @@ -1,345 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/openai_gpt-oss-120b/1770835969.095764", - "retrieved_timestamp": "1770835969.095764", - "source_metadata": { - "source_name": "helm_capabilities", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "gpt-oss-120b", - "id": "openai/gpt-oss-120b", - "developer": "openai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean score", - "source_data": { - "dataset_name": "helm_capabilities", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "The mean of the scores from all columns.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.77, - "details": { - "tab": "Accuracy", - "Mean score - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 19.583454439679375 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU-Pro", - "source_data": { - "dataset_name": "MMLU-Pro", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on MMLU-Pro", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.795, - "details": { - "description": "min=0.795, mean=0.795, max=0.795, sum=0.795 (1)", - "tab": "Accuracy", - "MMLU-Pro - Observed inference time (s)": { - "description": "min=6.268, mean=6.268, max=6.268, sum=6.268 (1)", - "tab": "Efficiency", - "score": 6.268435170412063 - }, - "MMLU-Pro - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "MMLU-Pro - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - # prompt tokens": { - "description": "min=248.569, mean=248.569, max=248.569, sum=248.569 (1)", - "tab": "General information", - "score": 248.569 - }, - "MMLU-Pro - # output tokens": { - "description": "min=474.202, mean=474.202, max=474.202, sum=474.202 (1)", - "tab": "General information", - "score": 474.202 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.684, - "details": { - "description": "min=0.684, mean=0.684, max=0.684, sum=0.684 (1)", - "tab": "Accuracy", - "GPQA - Observed inference time (s)": { - "description": "min=18.819, mean=18.819, max=18.819, sum=18.819 (1)", - "tab": "Efficiency", - "score": 18.8192116278704 - }, - "GPQA - # eval": { - "description": "min=446, mean=446, max=446, sum=446 (1)", - "tab": "General information", - "score": 446.0 - }, - "GPQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - # prompt tokens": { - "description": "min=268.152, mean=268.152, max=268.152, sum=268.152 (1)", - "tab": "General information", - "score": 268.15246636771303 - }, - "GPQA - # output tokens": { - "description": "min=1218.108, mean=1218.108, max=1218.108, sum=1218.108 (1)", - "tab": "General information", - "score": 1218.1076233183855 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": 
"IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "IFEval Strict Acc on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.836, - "details": { - "description": "min=0.836, mean=0.836, max=0.836, sum=0.836 (1)", - "tab": "Accuracy", - "IFEval - Observed inference time (s)": { - "description": "min=6.303, mean=6.303, max=6.303, sum=6.303 (1)", - "tab": "Efficiency", - "score": 6.302578532982225 - }, - "IFEval - # eval": { - "description": "min=541, mean=541, max=541, sum=541 (1)", - "tab": "General information", - "score": 541.0 - }, - "IFEval - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - # prompt tokens": { - "description": "min=45.671, mean=45.671, max=45.671, sum=45.671 (1)", - "tab": "General information", - "score": 45.67097966728281 - }, - "IFEval - # output tokens": { - "description": "min=945.784, mean=945.784, max=945.784, sum=945.784 (1)", - "tab": "General information", - "score": 945.7837338262477 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WildBench", - "source_data": { - "dataset_name": "WildBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "WB Score on WildBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.845, - "details": { - "description": "min=0.845, mean=0.845, max=0.845, sum=0.845 (1)", - "tab": "Accuracy", - "WildBench - Observed inference time (s)": { - "description": "min=24.979, mean=24.979, max=24.979, sum=24.979 (1)", - "tab": "Efficiency", - "score": 24.978535928487776 - }, - "WildBench - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "WildBench - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # prompt tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # output tokens": { - "description": "min=2925.361, mean=2925.361, max=2925.361, sum=2925.361 (1)", - "tab": "General information", - "score": 2925.361 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "v2" - } - } - }, - { - "evaluation_name": "Omni-MATH", - "source_data": { - "dataset_name": "Omni-MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Acc on Omni-MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 
0.688, - "details": { - "description": "min=0.688, mean=0.688, max=0.688, sum=0.688 (1)", - "tab": "Accuracy", - "Omni-MATH - Observed inference time (s)": { - "description": "min=41.549, mean=41.549, max=41.549, sum=41.549 (1)", - "tab": "Efficiency", - "score": 41.54851093864441 - }, - "Omni-MATH - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "Omni-MATH - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - # prompt tokens": { - "description": "min=109.623, mean=109.623, max=109.623, sum=109.623 (1)", - "tab": "General information", - "score": 109.623 - }, - "Omni-MATH - # output tokens": { - "description": "min=4103.671, mean=4103.671, max=4103.671, sum=4103.671 (1)", - "tab": "General information", - "score": 4103.671 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_capabilities/openai/gpt-oss-20b/1d9ac688-ca0d-405b-a262-e95673e79250.json b/data/helm_capabilities/openai/gpt-oss-20b/1d9ac688-ca0d-405b-a262-e95673e79250.json deleted file mode 100644 index 5112d535f..000000000 --- a/data/helm_capabilities/openai/gpt-oss-20b/1d9ac688-ca0d-405b-a262-e95673e79250.json +++ /dev/null @@ -1,345 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/openai_gpt-oss-20b/1770835969.095764", - "retrieved_timestamp": "1770835969.095764", - "source_metadata": { - "source_name": "helm_capabilities", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "gpt-oss-20b", - "id": "openai/gpt-oss-20b", - "developer": "openai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean score", - "source_data": { - "dataset_name": "helm_capabilities", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "The mean of the scores from all columns.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.674, - "details": { - "tab": "Accuracy", - "Mean score - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 31.785255717522546 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU-Pro", - "source_data": { - "dataset_name": "MMLU-Pro", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on MMLU-Pro", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.74, - "details": { - "description": "min=0.74, mean=0.74, max=0.74, sum=0.74 (1)", - "tab": "Accuracy", - "MMLU-Pro - Observed inference time (s)": { - "description": "min=4.593, mean=4.593, max=4.593, sum=4.593 (1)", - "tab": "Efficiency", - "score": 4.593113619089126 - }, - "MMLU-Pro - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": 
"General information", - "score": 1000.0 - }, - "MMLU-Pro - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - # prompt tokens": { - "description": "min=248.569, mean=248.569, max=248.569, sum=248.569 (1)", - "tab": "General information", - "score": 248.569 - }, - "MMLU-Pro - # output tokens": { - "description": "min=820.909, mean=820.909, max=820.909, sum=820.909 (1)", - "tab": "General information", - "score": 820.909 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.594, - "details": { - "description": "min=0.594, mean=0.594, max=0.594, sum=0.594 (1)", - "tab": "Accuracy", - "GPQA - Observed inference time (s)": { - "description": "min=27.565, mean=27.565, max=27.565, sum=27.565 (1)", - "tab": "Efficiency", - "score": 27.56541810923093 - }, - "GPQA - # eval": { - "description": "min=446, mean=446, max=446, sum=446 (1)", - "tab": "General information", - "score": 446.0 - }, - "GPQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - # prompt tokens": { - "description": "min=268.152, mean=268.152, max=268.152, sum=268.152 (1)", - "tab": "General information", - "score": 268.15246636771303 - }, - "GPQA - # output tokens": { - "description": "min=2872.139, mean=2872.139, max=2872.139, sum=2872.139 (1)", - "tab": "General information", - "score": 2872.1390134529147 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "IFEval Strict Acc on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.732, - "details": { - "description": "min=0.732, mean=0.732, max=0.732, sum=0.732 (1)", - "tab": "Accuracy", - "IFEval - Observed inference time (s)": { - "description": "min=26.607, mean=26.607, max=26.607, sum=26.607 (1)", - "tab": "Efficiency", - "score": 26.607220574359577 - }, - "IFEval - # eval": { - "description": "min=541, mean=541, max=541, sum=541 (1)", - "tab": "General information", - "score": 541.0 - }, - "IFEval - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - 
"score": 0.0 - }, - "IFEval - # prompt tokens": { - "description": "min=45.671, mean=45.671, max=45.671, sum=45.671 (1)", - "tab": "General information", - "score": 45.67097966728281 - }, - "IFEval - # output tokens": { - "description": "min=3202.279, mean=3202.279, max=3202.279, sum=3202.279 (1)", - "tab": "General information", - "score": 3202.279112754159 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WildBench", - "source_data": { - "dataset_name": "WildBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "WB Score on WildBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.737, - "details": { - "description": "min=0.737, mean=0.737, max=0.737, sum=0.737 (1)", - "tab": "Accuracy", - "WildBench - Observed inference time (s)": { - "description": "min=42.985, mean=42.985, max=42.985, sum=42.985 (1)", - "tab": "Efficiency", - "score": 42.985184440851214 - }, - "WildBench - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "WildBench - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # prompt tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # output tokens": { - "description": "min=4398.71, mean=4398.71, max=4398.71, sum=4398.71 (1)", - "tab": "General information", - "score": 4398.71 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "v2" - } - } - }, - { - "evaluation_name": "Omni-MATH", - "source_data": { - "dataset_name": "Omni-MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Acc on Omni-MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.565, - "details": { - "description": "min=0.565, mean=0.565, max=0.565, sum=0.565 (1)", - "tab": "Accuracy", - "Omni-MATH - Observed inference time (s)": { - "description": "min=57.175, mean=57.175, max=57.175, sum=57.175 (1)", - "tab": "Efficiency", - "score": 57.17534184408188 - }, - "Omni-MATH - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "Omni-MATH - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - # prompt tokens": { - "description": "min=109.623, mean=109.623, max=109.623, sum=109.623 (1)", - "tab": "General information", - "score": 109.623 - }, - "Omni-MATH - # output tokens": { - "description": "min=6604.944, mean=6604.944, max=6604.944, sum=6604.944 (1)", - "tab": "General information", - "score": 6604.944 - } - } - }, - "generation_config": { - "additional_details": {} - } - 
} - ] -} \ No newline at end of file diff --git a/data/helm_capabilities/openai/o3-2025-04-16/c1e593d9-08ba-40fe-b02f-1c95be8fdfc9.json b/data/helm_capabilities/openai/o3-2025-04-16/c1e593d9-08ba-40fe-b02f-1c95be8fdfc9.json deleted file mode 100644 index 677721448..000000000 --- a/data/helm_capabilities/openai/o3-2025-04-16/c1e593d9-08ba-40fe-b02f-1c95be8fdfc9.json +++ /dev/null @@ -1,345 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/openai_o3-2025-04-16/1770835969.095764", - "retrieved_timestamp": "1770835969.095764", - "source_metadata": { - "source_name": "helm_capabilities", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "o3 2025-04-16", - "id": "openai/o3-2025-04-16", - "developer": "openai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean score", - "source_data": { - "dataset_name": "helm_capabilities", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "The mean of the scores from all columns.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.811, - "details": { - "tab": "Accuracy", - "Mean score - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 51.078448384234015 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU-Pro", - "source_data": { - "dataset_name": "MMLU-Pro", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on MMLU-Pro", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.859, - "details": { - "description": "min=0.859, mean=0.859, max=0.859, sum=0.859 (1)", - "tab": "Accuracy", - "MMLU-Pro - Observed inference time (s)": { - "description": "min=17.306, mean=17.306, max=17.306, sum=17.306 (1)", - "tab": "Efficiency", - "score": 17.306045585632326 - }, - "MMLU-Pro - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "MMLU-Pro - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - # prompt tokens": { - "description": "min=249.506, mean=249.506, max=249.506, sum=249.506 (1)", - "tab": "General information", - "score": 249.506 - }, - "MMLU-Pro - # output tokens": { - "description": "min=5.038, mean=5.038, max=5.038, sum=5.038 (1)", - "tab": "General information", - "score": 5.038 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": 
"COT correct on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.753, - "details": { - "description": "min=0.753, mean=0.753, max=0.753, sum=0.753 (1)", - "tab": "Accuracy", - "GPQA - Observed inference time (s)": { - "description": "min=48.024, mean=48.024, max=48.024, sum=48.024 (1)", - "tab": "Efficiency", - "score": 48.0242628821343 - }, - "GPQA - # eval": { - "description": "min=446, mean=446, max=446, sum=446 (1)", - "tab": "General information", - "score": 446.0 - }, - "GPQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - # prompt tokens": { - "description": "min=269.975, mean=269.975, max=269.975, sum=269.975 (1)", - "tab": "General information", - "score": 269.97533632286996 - }, - "GPQA - # output tokens": { - "description": "min=6.457, mean=6.457, max=6.457, sum=6.457 (1)", - "tab": "General information", - "score": 6.457399103139013 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "IFEval Strict Acc on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.869, - "details": { - "description": "min=0.869, mean=0.869, max=0.869, sum=0.869 (1)", - "tab": "Accuracy", - "IFEval - Observed inference time (s)": { - "description": "min=17.399, mean=17.399, max=17.399, sum=17.399 (1)", - "tab": "Efficiency", - "score": 17.398983872972444 - }, - "IFEval - # eval": { - "description": "min=541, mean=541, max=541, sum=541 (1)", - "tab": "General information", - "score": 541.0 - }, - "IFEval - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - # prompt tokens": { - "description": "min=46.054, mean=46.054, max=46.054, sum=46.054 (1)", - "tab": "General information", - "score": 46.05360443622921 - }, - "IFEval - # output tokens": { - "description": "min=447.353, mean=447.353, max=447.353, sum=447.353 (1)", - "tab": "General information", - "score": 447.35304990757857 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WildBench", - "source_data": { - "dataset_name": "WildBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "WB Score on WildBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.861, - "details": { - "description": "min=0.861, mean=0.861, max=0.861, sum=0.861 (1)", - "tab": "Accuracy", - "WildBench - Observed inference time (s)": { - "description": "min=47.15, mean=47.15, 
max=47.15, sum=47.15 (1)", - "tab": "Efficiency", - "score": 47.150321824789046 - }, - "WildBench - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "WildBench - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # prompt tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # output tokens": { - "description": "min=1248.452, mean=1248.452, max=1248.452, sum=1248.452 (1)", - "tab": "General information", - "score": 1248.452 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "v2" - } - } - }, - { - "evaluation_name": "Omni-MATH", - "source_data": { - "dataset_name": "Omni-MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Acc on Omni-MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.714, - "details": { - "description": "min=0.714, mean=0.714, max=0.714, sum=0.714 (1)", - "tab": "Accuracy", - "Omni-MATH - Observed inference time (s)": { - "description": "min=125.513, mean=125.513, max=125.513, sum=125.513 (1)", - "tab": "Efficiency", - "score": 125.51262775564194 - }, - "Omni-MATH - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "Omni-MATH - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - # prompt tokens": { - "description": "min=109.767, mean=109.767, max=109.767, sum=109.767 (1)", - "tab": "General information", - "score": 109.767 - }, - "Omni-MATH - # output tokens": { - "description": "min=506.811, mean=506.811, max=506.811, sum=506.811 (1)", - "tab": "General information", - "score": 506.811 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_capabilities/openai/o4-mini-2025-04-16/35a31e19-2ef5-4caa-a848-422af42adab8.json b/data/helm_capabilities/openai/o4-mini-2025-04-16/35a31e19-2ef5-4caa-a848-422af42adab8.json deleted file mode 100644 index fd4ae16c5..000000000 --- a/data/helm_capabilities/openai/o4-mini-2025-04-16/35a31e19-2ef5-4caa-a848-422af42adab8.json +++ /dev/null @@ -1,345 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/openai_o4-mini-2025-04-16/1770835969.095764", - "retrieved_timestamp": "1770835969.095764", - "source_metadata": { - "source_name": "helm_capabilities", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "o4-mini 2025-04-16", - "id": "openai/o4-mini-2025-04-16", - "developer": "openai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean score", - "source_data": { - "dataset_name": "helm_capabilities", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "The mean of the scores from all columns.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.812, - "details": { - "tab": "Accuracy", - "Mean score - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 21.93756369551652 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU-Pro", - "source_data": { - "dataset_name": "MMLU-Pro", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on MMLU-Pro", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.82, - "details": { - "description": "min=0.82, mean=0.82, max=0.82, sum=0.82 (1)", - "tab": "Accuracy", - "MMLU-Pro - Observed inference time (s)": { - "description": "min=8.896, mean=8.896, max=8.896, sum=8.896 (1)", - "tab": "Efficiency", - "score": 8.895831291675568 - }, - "MMLU-Pro - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "MMLU-Pro - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - # prompt tokens": { - "description": "min=249.506, mean=249.506, max=249.506, sum=249.506 (1)", - "tab": "General information", - "score": 249.506 - }, - "MMLU-Pro - # output tokens": { - "description": "min=10.834, mean=10.834, max=10.834, sum=10.834 (1)", - "tab": "General information", - "score": 10.834 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.735, - "details": { - "description": "min=0.735, mean=0.735, max=0.735, sum=0.735 (1)", - "tab": "Accuracy", - "GPQA - Observed inference time (s)": { - "description": "min=22.412, mean=22.412, max=22.412, sum=22.412 (1)", - "tab": "Efficiency", - "score": 22.412139415206397 - }, - "GPQA - # eval": { - "description": "min=446, mean=446, max=446, sum=446 (1)", - "tab": "General information", - "score": 446.0 - }, - "GPQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - # prompt tokens": { - "description": "min=269.975, mean=269.975, max=269.975, sum=269.975 (1)", - "tab": "General information", - "score": 269.97533632286996 - }, - "GPQA - # output tokens": { - 
"description": "min=8.413, mean=8.413, max=8.413, sum=8.413 (1)", - "tab": "General information", - "score": 8.41255605381166 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "IFEval Strict Acc on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.929, - "details": { - "description": "min=0.929, mean=0.929, max=0.929, sum=0.929 (1)", - "tab": "Accuracy", - "IFEval - Observed inference time (s)": { - "description": "min=12.26, mean=12.26, max=12.26, sum=12.26 (1)", - "tab": "Efficiency", - "score": 12.260425486097494 - }, - "IFEval - # eval": { - "description": "min=541, mean=541, max=541, sum=541 (1)", - "tab": "General information", - "score": 541.0 - }, - "IFEval - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - # prompt tokens": { - "description": "min=46.054, mean=46.054, max=46.054, sum=46.054 (1)", - "tab": "General information", - "score": 46.05360443622921 - }, - "IFEval - # output tokens": { - "description": "min=360.231, mean=360.231, max=360.231, sum=360.231 (1)", - "tab": "General information", - "score": 360.2310536044362 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WildBench", - "source_data": { - "dataset_name": "WildBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "WB Score on WildBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.854, - "details": { - "description": "min=0.854, mean=0.854, max=0.854, sum=0.854 (1)", - "tab": "Accuracy", - "WildBench - Observed inference time (s)": { - "description": "min=25.397, mean=25.397, max=25.397, sum=25.397 (1)", - "tab": "Efficiency", - "score": 25.396886379241945 - }, - "WildBench - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "WildBench - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # prompt tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # output tokens": { - "description": "min=969.786, mean=969.786, max=969.786, sum=969.786 (1)", - "tab": "General information", - "score": 969.786 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "v2" - } - } - }, - { - "evaluation_name": "Omni-MATH", - "source_data": { - "dataset_name": "Omni-MATH", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Acc on Omni-MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.72, - "details": { - "description": "min=0.72, mean=0.72, max=0.72, sum=0.72 (1)", - "tab": "Accuracy", - "Omni-MATH - Observed inference time (s)": { - "description": "min=40.723, mean=40.723, max=40.723, sum=40.723 (1)", - "tab": "Efficiency", - "score": 40.72253590536118 - }, - "Omni-MATH - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "Omni-MATH - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - # prompt tokens": { - "description": "min=109.767, mean=109.767, max=109.767, sum=109.767 (1)", - "tab": "General information", - "score": 109.767 - }, - "Omni-MATH - # output tokens": { - "description": "min=388.401, mean=388.401, max=388.401, sum=388.401 (1)", - "tab": "General information", - "score": 388.401 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_capabilities/qwen/qwen2.5-72b-instruct-turbo/7de0bda2-ce56-444a-b293-a310a5b2d7ab.json b/data/helm_capabilities/qwen/qwen2.5-72b-instruct-turbo/7de0bda2-ce56-444a-b293-a310a5b2d7ab.json deleted file mode 100644 index 50778c699..000000000 --- a/data/helm_capabilities/qwen/qwen2.5-72b-instruct-turbo/7de0bda2-ce56-444a-b293-a310a5b2d7ab.json +++ /dev/null @@ -1,345 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/qwen_qwen2.5-72b-instruct-turbo/1770835969.095764", - "retrieved_timestamp": "1770835969.095764", - "source_metadata": { - "source_name": "helm_capabilities", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5 Instruct Turbo 72B", - "id": "qwen/qwen2.5-72b-instruct-turbo", - "developer": "qwen", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean score", - "source_data": { - "dataset_name": "helm_capabilities", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "The mean of the scores from all columns.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.599, - "details": { - "tab": "Accuracy", - "Mean score - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 16.666975749955085 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU-Pro", - "source_data": { - "dataset_name": "MMLU-Pro", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on MMLU-Pro", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.631, - "details": { - "description": "min=0.631, mean=0.631, max=0.631, sum=0.631 (1)", - "tab": "Accuracy", - "MMLU-Pro - Observed inference time (s)": { - "description": "min=11.79, mean=11.79, max=11.79, sum=11.79 (1)", - "tab": "Efficiency", - "score": 11.790208662986755 - }, - "MMLU-Pro - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "MMLU-Pro - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - # prompt tokens": { - "description": "min=238.715, mean=238.715, max=238.715, sum=238.715 (1)", - "tab": "General information", - "score": 238.715 - }, - "MMLU-Pro - # output tokens": { - "description": "min=489.611, mean=489.611, max=489.611, sum=489.611 (1)", - "tab": "General information", - "score": 489.611 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.426, - "details": { - "description": "min=0.426, mean=0.426, max=0.426, sum=0.426 (1)", - "tab": "Accuracy", - "GPQA - Observed inference time (s)": { - "description": "min=28.719, mean=28.719, max=28.719, sum=28.719 (1)", - "tab": "Efficiency", - "score": 28.71905704036422 - }, - "GPQA - # eval": { - "description": "min=446, mean=446, max=446, sum=446 (1)", - "tab": "General information", - "score": 446.0 - }, - "GPQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - # prompt tokens": { - "description": "min=253.37, mean=253.37, max=253.37, sum=253.37 (1)", - "tab": "General information", - "score": 253.36995515695068 - }, - "GPQA - # output tokens": { - "description": "min=704.881, mean=704.881, max=704.881, sum=704.881 (1)", - "tab": "General information", - "score": 704.8811659192825 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "IFEval Strict Acc on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.806, - "details": { - "description": "min=0.806, mean=0.806, max=0.806, sum=0.806 (1)", - "tab": "Accuracy", - "IFEval - Observed inference time (s)": { - "description": "min=20.844, mean=20.844, max=20.844, sum=20.844 (1)", - "tab": "Efficiency", - "score": 
20.844201727407036 - }, - "IFEval - # eval": { - "description": "min=541, mean=541, max=541, sum=541 (1)", - "tab": "General information", - "score": 541.0 - }, - "IFEval - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - # prompt tokens": { - "description": "min=46.492, mean=46.492, max=46.492, sum=46.492 (1)", - "tab": "General information", - "score": 46.491682070240294 - }, - "IFEval - # output tokens": { - "description": "min=361.089, mean=361.089, max=361.089, sum=361.089 (1)", - "tab": "General information", - "score": 361.0887245841035 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WildBench", - "source_data": { - "dataset_name": "WildBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "WB Score on WildBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.802, - "details": { - "description": "min=0.802, mean=0.802, max=0.802, sum=0.802 (1)", - "tab": "Accuracy", - "WildBench - Observed inference time (s)": { - "description": "min=12.219, mean=12.219, max=12.219, sum=12.219 (1)", - "tab": "Efficiency", - "score": 12.219232248067856 - }, - "WildBench - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "WildBench - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # prompt tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # output tokens": { - "description": "min=1042.017, mean=1042.017, max=1042.017, sum=1042.017 (1)", - "tab": "General information", - "score": 1042.017 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "v2" - } - } - }, - { - "evaluation_name": "Omni-MATH", - "source_data": { - "dataset_name": "Omni-MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Acc on Omni-MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.33, - "details": { - "description": "min=0.33, mean=0.33, max=0.33, sum=0.33 (1)", - "tab": "Accuracy", - "Omni-MATH - Observed inference time (s)": { - "description": "min=9.762, mean=9.762, max=9.762, sum=9.762 (1)", - "tab": "Efficiency", - "score": 9.762179070949555 - }, - "Omni-MATH - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "Omni-MATH - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - 
"Omni-MATH - # prompt tokens": { - "description": "min=111.6, mean=111.6, max=111.6, sum=111.6 (1)", - "tab": "General information", - "score": 111.6 - }, - "Omni-MATH - # output tokens": { - "description": "min=886.55, mean=886.55, max=886.55, sum=886.55 (1)", - "tab": "General information", - "score": 886.55 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_capabilities/qwen/qwen2.5-7b-instruct-turbo/dd22f29b-f8b8-4c59-9f26-f6633bbbdc8b.json b/data/helm_capabilities/qwen/qwen2.5-7b-instruct-turbo/dd22f29b-f8b8-4c59-9f26-f6633bbbdc8b.json deleted file mode 100644 index c974f1019..000000000 --- a/data/helm_capabilities/qwen/qwen2.5-7b-instruct-turbo/dd22f29b-f8b8-4c59-9f26-f6633bbbdc8b.json +++ /dev/null @@ -1,345 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/qwen_qwen2.5-7b-instruct-turbo/1770835969.095764", - "retrieved_timestamp": "1770835969.095764", - "source_metadata": { - "source_name": "helm_capabilities", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5 Instruct Turbo 7B", - "id": "qwen/qwen2.5-7b-instruct-turbo", - "developer": "qwen", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean score", - "source_data": { - "dataset_name": "helm_capabilities", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "The mean of the scores from all columns.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.529, - "details": { - "tab": "Accuracy", - "Mean score - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 4.913331052029195 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU-Pro", - "source_data": { - "dataset_name": "MMLU-Pro", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on MMLU-Pro", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.539, - "details": { - "description": "min=0.539, mean=0.539, max=0.539, sum=0.539 (1)", - "tab": "Accuracy", - "MMLU-Pro - Observed inference time (s)": { - "description": "min=6.223, mean=6.223, max=6.223, sum=6.223 (1)", - "tab": "Efficiency", - "score": 6.223100474119186 - }, - "MMLU-Pro - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "MMLU-Pro - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - # prompt tokens": { - "description": "min=238.715, mean=238.715, max=238.715, sum=238.715 (1)", - "tab": "General information", - "score": 238.715 - }, - "MMLU-Pro - # output tokens": { - "description": "min=439.207, mean=439.207, max=439.207, sum=439.207 (1)", - "tab": "General information", - "score": 439.207 - } - } - 
}, - "generation_config": { - "additional_details": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.341, - "details": { - "description": "min=0.341, mean=0.341, max=0.341, sum=0.341 (1)", - "tab": "Accuracy", - "GPQA - Observed inference time (s)": { - "description": "min=3.475, mean=3.475, max=3.475, sum=3.475 (1)", - "tab": "Efficiency", - "score": 3.4745728910771185 - }, - "GPQA - # eval": { - "description": "min=446, mean=446, max=446, sum=446 (1)", - "tab": "General information", - "score": 446.0 - }, - "GPQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - # prompt tokens": { - "description": "min=253.37, mean=253.37, max=253.37, sum=253.37 (1)", - "tab": "General information", - "score": 253.36995515695068 - }, - "GPQA - # output tokens": { - "description": "min=554.274, mean=554.274, max=554.274, sum=554.274 (1)", - "tab": "General information", - "score": 554.2735426008969 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "IFEval Strict Acc on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.741, - "details": { - "description": "min=0.741, mean=0.741, max=0.741, sum=0.741 (1)", - "tab": "Accuracy", - "IFEval - Observed inference time (s)": { - "description": "min=2.068, mean=2.068, max=2.068, sum=2.068 (1)", - "tab": "Efficiency", - "score": 2.0679604544436865 - }, - "IFEval - # eval": { - "description": "min=541, mean=541, max=541, sum=541 (1)", - "tab": "General information", - "score": 541.0 - }, - "IFEval - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - # prompt tokens": { - "description": "min=46.492, mean=46.492, max=46.492, sum=46.492 (1)", - "tab": "General information", - "score": 46.491682070240294 - }, - "IFEval - # output tokens": { - "description": "min=317.828, mean=317.828, max=317.828, sum=317.828 (1)", - "tab": "General information", - "score": 317.82809611829947 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WildBench", - "source_data": { - "dataset_name": "WildBench", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "WB Score on WildBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.731, - "details": { - "description": "min=0.731, mean=0.731, max=0.731, sum=0.731 (1)", - "tab": "Accuracy", - "WildBench - Observed inference time (s)": { - "description": "min=7.74, mean=7.74, max=7.74, sum=7.74 (1)", - "tab": "Efficiency", - "score": 7.7404146847724915 - }, - "WildBench - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "WildBench - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # prompt tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # output tokens": { - "description": "min=953.359, mean=953.359, max=953.359, sum=953.359 (1)", - "tab": "General information", - "score": 953.359 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "v2" - } - } - }, - { - "evaluation_name": "Omni-MATH", - "source_data": { - "dataset_name": "Omni-MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Acc on Omni-MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.294, - "details": { - "description": "min=0.294, mean=0.294, max=0.294, sum=0.294 (1)", - "tab": "Accuracy", - "Omni-MATH - Observed inference time (s)": { - "description": "min=5.061, mean=5.061, max=5.061, sum=5.061 (1)", - "tab": "Efficiency", - "score": 5.06060675573349 - }, - "Omni-MATH - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "Omni-MATH - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - # prompt tokens": { - "description": "min=111.6, mean=111.6, max=111.6, sum=111.6 (1)", - "tab": "General information", - "score": 111.6 - }, - "Omni-MATH - # output tokens": { - "description": "min=809.198, mean=809.198, max=809.198, sum=809.198 (1)", - "tab": "General information", - "score": 809.198 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_capabilities/qwen/qwen3-235b-a22b-fp8-tput/9eb537b9-9e2d-4d0a-bfa5-644a18f4db0e.json b/data/helm_capabilities/qwen/qwen3-235b-a22b-fp8-tput/9eb537b9-9e2d-4d0a-bfa5-644a18f4db0e.json deleted file mode 100644 index 9ded60c84..000000000 --- a/data/helm_capabilities/qwen/qwen3-235b-a22b-fp8-tput/9eb537b9-9e2d-4d0a-bfa5-644a18f4db0e.json +++ /dev/null @@ -1,345 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/qwen_qwen3-235b-a22b-fp8-tput/1770835969.095764", - "retrieved_timestamp": 
"1770835969.095764", - "source_metadata": { - "source_name": "helm_capabilities", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen3 235B A22B FP8 Throughput", - "id": "qwen/qwen3-235b-a22b-fp8-tput", - "developer": "qwen", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean score", - "source_data": { - "dataset_name": "helm_capabilities", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "The mean of the scores from all columns.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.726, - "details": { - "tab": "Accuracy", - "Mean score - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 175.88874367192255 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU-Pro", - "source_data": { - "dataset_name": "MMLU-Pro", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on MMLU-Pro", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.817, - "details": { - "description": "min=0.817, mean=0.817, max=0.817, sum=0.817 (1)", - "tab": "Accuracy", - "MMLU-Pro - Observed inference time (s)": { - "description": "min=126.73, mean=126.73, max=126.73, sum=126.73 (1)", - "tab": "Efficiency", - "score": 126.73047786664962 - }, - "MMLU-Pro - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "MMLU-Pro - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - # prompt tokens": { - "description": "min=259.715, mean=259.715, max=259.715, sum=259.715 (1)", - "tab": "General information", - "score": 259.715 - }, - "MMLU-Pro - # output tokens": { - "description": "min=3518.576, mean=3518.576, max=3518.576, sum=3518.576 (1)", - "tab": "General information", - "score": 3518.576 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.623, - "details": { - "description": "min=0.623, mean=0.623, max=0.623, sum=0.623 (1)", - "tab": "Accuracy", - "GPQA - Observed inference time (s)": { - "description": "min=237.413, mean=237.413, max=237.413, sum=237.413 (1)", - "tab": "Efficiency", - "score": 237.41318658488748 - }, - "GPQA - # eval": { - "description": "min=446, 
mean=446, max=446, sum=446 (1)", - "tab": "General information", - "score": 446.0 - }, - "GPQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - # prompt tokens": { - "description": "min=274.37, mean=274.37, max=274.37, sum=274.37 (1)", - "tab": "General information", - "score": 274.36995515695065 - }, - "GPQA - # output tokens": { - "description": "min=7431.507, mean=7431.507, max=7431.507, sum=7431.507 (1)", - "tab": "General information", - "score": 7431.506726457399 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "IFEval Strict Acc on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.816, - "details": { - "description": "min=0.816, mean=0.816, max=0.816, sum=0.816 (1)", - "tab": "Accuracy", - "IFEval - Observed inference time (s)": { - "description": "min=36.742, mean=36.742, max=36.742, sum=36.742 (1)", - "tab": "Efficiency", - "score": 36.742134021963516 - }, - "IFEval - # eval": { - "description": "min=541, mean=541, max=541, sum=541 (1)", - "tab": "General information", - "score": 541.0 - }, - "IFEval - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - # prompt tokens": { - "description": "min=46.492, mean=46.492, max=46.492, sum=46.492 (1)", - "tab": "General information", - "score": 46.491682070240294 - }, - "IFEval - # output tokens": { - "description": "min=1101.856, mean=1101.856, max=1101.856, sum=1101.856 (1)", - "tab": "General information", - "score": 1101.8558225508318 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WildBench", - "source_data": { - "dataset_name": "WildBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "WB Score on WildBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.828, - "details": { - "description": "min=0.828, mean=0.828, max=0.828, sum=0.828 (1)", - "tab": "Accuracy", - "WildBench - Observed inference time (s)": { - "description": "min=125.734, mean=125.734, max=125.734, sum=125.734 (1)", - "tab": "Efficiency", - "score": 125.73418169164657 - }, - "WildBench - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "WildBench - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", 
- "score": 0.0 - }, - "WildBench - # prompt tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # output tokens": { - "description": "min=3594.207, mean=3594.207, max=3594.207, sum=3594.207 (1)", - "tab": "General information", - "score": 3594.207 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "v2" - } - } - }, - { - "evaluation_name": "Omni-MATH", - "source_data": { - "dataset_name": "Omni-MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Acc on Omni-MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.548, - "details": { - "description": "min=0.548, mean=0.548, max=0.548, sum=0.548 (1)", - "tab": "Accuracy", - "Omni-MATH - Observed inference time (s)": { - "description": "min=352.824, mean=352.824, max=352.824, sum=352.824 (1)", - "tab": "Efficiency", - "score": 352.82373819446565 - }, - "Omni-MATH - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "Omni-MATH - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - # prompt tokens": { - "description": "min=111.6, mean=111.6, max=111.6, sum=111.6 (1)", - "tab": "General information", - "score": 111.6 - }, - "Omni-MATH - # output tokens": { - "description": "min=10072.403, mean=10072.403, max=10072.403, sum=10072.403 (1)", - "tab": "General information", - "score": 10072.403 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_capabilities/qwen/qwen3-235b-a22b-instruct-2507-fp8/07763926-3a19-43f9-a23f-095f6cb78799.json b/data/helm_capabilities/qwen/qwen3-235b-a22b-instruct-2507-fp8/07763926-3a19-43f9-a23f-095f6cb78799.json deleted file mode 100644 index 0210712c3..000000000 --- a/data/helm_capabilities/qwen/qwen3-235b-a22b-instruct-2507-fp8/07763926-3a19-43f9-a23f-095f6cb78799.json +++ /dev/null @@ -1,345 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/qwen_qwen3-235b-a22b-instruct-2507-fp8/1770835969.095764", - "retrieved_timestamp": "1770835969.095764", - "source_metadata": { - "source_name": "helm_capabilities", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen3 235B A22B Instruct 2507 FP8", - "id": "qwen/qwen3-235b-a22b-instruct-2507-fp8", - "developer": "qwen", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean score", - "source_data": { - "dataset_name": "helm_capabilities", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "The mean of the scores from all columns.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.798, - "details": { - "tab": "Accuracy", - "Mean score - 
Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 91.57420329307861 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU-Pro", - "source_data": { - "dataset_name": "MMLU-Pro", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on MMLU-Pro", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.844, - "details": { - "description": "min=0.844, mean=0.844, max=0.844, sum=0.844 (1)", - "tab": "Accuracy", - "MMLU-Pro - Observed inference time (s)": { - "description": "min=52.244, mean=52.244, max=52.244, sum=52.244 (1)", - "tab": "Efficiency", - "score": 52.24400525426864 - }, - "MMLU-Pro - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "MMLU-Pro - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - # prompt tokens": { - "description": "min=259.715, mean=259.715, max=259.715, sum=259.715 (1)", - "tab": "General information", - "score": 259.715 - }, - "MMLU-Pro - # output tokens": { - "description": "min=1423.589, mean=1423.589, max=1423.589, sum=1423.589 (1)", - "tab": "General information", - "score": 1423.589 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.726, - "details": { - "description": "min=0.726, mean=0.726, max=0.726, sum=0.726 (1)", - "tab": "Accuracy", - "GPQA - Observed inference time (s)": { - "description": "min=103.303, mean=103.303, max=103.303, sum=103.303 (1)", - "tab": "Efficiency", - "score": 103.30346254970995 - }, - "GPQA - # eval": { - "description": "min=446, mean=446, max=446, sum=446 (1)", - "tab": "General information", - "score": 446.0 - }, - "GPQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - # prompt tokens": { - "description": "min=274.37, mean=274.37, max=274.37, sum=274.37 (1)", - "tab": "General information", - "score": 274.36995515695065 - }, - "GPQA - # output tokens": { - "description": "min=3922.17, mean=3922.17, max=3922.17, sum=3922.17 (1)", - "tab": "General information", - "score": 3922.170403587444 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "url", - "url": [ 
- "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "IFEval Strict Acc on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.835, - "details": { - "description": "min=0.835, mean=0.835, max=0.835, sum=0.835 (1)", - "tab": "Accuracy", - "IFEval - Observed inference time (s)": { - "description": "min=12.729, mean=12.729, max=12.729, sum=12.729 (1)", - "tab": "Efficiency", - "score": 12.728508173648178 - }, - "IFEval - # eval": { - "description": "min=541, mean=541, max=541, sum=541 (1)", - "tab": "General information", - "score": 541.0 - }, - "IFEval - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - # prompt tokens": { - "description": "min=46.492, mean=46.492, max=46.492, sum=46.492 (1)", - "tab": "General information", - "score": 46.491682070240294 - }, - "IFEval - # output tokens": { - "description": "min=427.54, mean=427.54, max=427.54, sum=427.54 (1)", - "tab": "General information", - "score": 427.53974121996305 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WildBench", - "source_data": { - "dataset_name": "WildBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "WB Score on WildBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.866, - "details": { - "description": "min=0.866, mean=0.866, max=0.866, sum=0.866 (1)", - "tab": "Accuracy", - "WildBench - Observed inference time (s)": { - "description": "min=61.017, mean=61.017, max=61.017, sum=61.017 (1)", - "tab": "Efficiency", - "score": 61.01670853805542 - }, - "WildBench - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "WildBench - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # prompt tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # output tokens": { - "description": "min=1976.28, mean=1976.28, max=1976.28, sum=1976.28 (1)", - "tab": "General information", - "score": 1976.28 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "v2" - } - } - }, - { - "evaluation_name": "Omni-MATH", - "source_data": { - "dataset_name": "Omni-MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Acc on Omni-MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.718, - "details": { - "description": "min=0.718, mean=0.718, max=0.718, sum=0.718 (1)", - 
"tab": "Accuracy", - "Omni-MATH - Observed inference time (s)": { - "description": "min=228.578, mean=228.578, max=228.578, sum=228.578 (1)", - "tab": "Efficiency", - "score": 228.57833194971084 - }, - "Omni-MATH - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "Omni-MATH - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - # prompt tokens": { - "description": "min=111.6, mean=111.6, max=111.6, sum=111.6 (1)", - "tab": "General information", - "score": 111.6 - }, - "Omni-MATH - # output tokens": { - "description": "min=5629.583, mean=5629.583, max=5629.583, sum=5629.583 (1)", - "tab": "General information", - "score": 5629.583 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_capabilities/qwen/qwen3-next-80b-a3b-thinking/56e024b3-c963-4172-9f52-7605276b3854.json b/data/helm_capabilities/qwen/qwen3-next-80b-a3b-thinking/56e024b3-c963-4172-9f52-7605276b3854.json deleted file mode 100644 index 6ee69548e..000000000 --- a/data/helm_capabilities/qwen/qwen3-next-80b-a3b-thinking/56e024b3-c963-4172-9f52-7605276b3854.json +++ /dev/null @@ -1,345 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/qwen_qwen3-next-80b-a3b-thinking/1770835969.095764", - "retrieved_timestamp": "1770835969.095764", - "source_metadata": { - "source_name": "helm_capabilities", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen3-Next 80B A3B Thinking", - "id": "qwen/qwen3-next-80b-a3b-thinking", - "developer": "qwen", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean score", - "source_data": { - "dataset_name": "helm_capabilities", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "The mean of the scores from all columns.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7, - "details": { - "tab": "Accuracy", - "Mean score - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 27.61164260375731 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU-Pro", - "source_data": { - "dataset_name": "MMLU-Pro", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on MMLU-Pro", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.786, - "details": { - "description": "min=0.786, mean=0.786, max=0.786, sum=0.786 (1)", - "tab": "Accuracy", - "MMLU-Pro - Observed inference time (s)": { - "description": "min=20.097, mean=20.097, max=20.097, sum=20.097 (1)", - "tab": "Efficiency", - "score": 20.09722422862053 - }, - "MMLU-Pro - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General 
information", - "score": 1000.0 - }, - "MMLU-Pro - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - # prompt tokens": { - "description": "min=259.715, mean=259.715, max=259.715, sum=259.715 (1)", - "tab": "General information", - "score": 259.715 - }, - "MMLU-Pro - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.63, - "details": { - "description": "min=0.63, mean=0.63, max=0.63, sum=0.63 (1)", - "tab": "Accuracy", - "GPQA - Observed inference time (s)": { - "description": "min=40.06, mean=40.06, max=40.06, sum=40.06 (1)", - "tab": "Efficiency", - "score": 40.06039341950096 - }, - "GPQA - # eval": { - "description": "min=446, mean=446, max=446, sum=446 (1)", - "tab": "General information", - "score": 446.0 - }, - "GPQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - # prompt tokens": { - "description": "min=274.37, mean=274.37, max=274.37, sum=274.37 (1)", - "tab": "General information", - "score": 274.36995515695065 - }, - "GPQA - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "IFEval Strict Acc on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.81, - "details": { - "description": "min=0.81, mean=0.81, max=0.81, sum=0.81 (1)", - "tab": "Accuracy", - "IFEval - Observed inference time (s)": { - "description": "min=13.893, mean=13.893, max=13.893, sum=13.893 (1)", - "tab": "Efficiency", - "score": 13.89268838323639 - }, - "IFEval - # eval": { - "description": "min=541, mean=541, max=541, sum=541 (1)", - "tab": "General information", - "score": 541.0 - }, - "IFEval - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - # prompt tokens": { - "description": "min=46.492, mean=46.492, 
max=46.492, sum=46.492 (1)", - "tab": "General information", - "score": 46.491682070240294 - }, - "IFEval - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WildBench", - "source_data": { - "dataset_name": "WildBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "WB Score on WildBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.807, - "details": { - "description": "min=0.807, mean=0.807, max=0.807, sum=0.807 (1)", - "tab": "Accuracy", - "WildBench - Observed inference time (s)": { - "description": "min=23.095, mean=23.095, max=23.095, sum=23.095 (1)", - "tab": "Efficiency", - "score": 23.095464605808257 - }, - "WildBench - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "WildBench - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # prompt tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "v2" - } - } - }, - { - "evaluation_name": "Omni-MATH", - "source_data": { - "dataset_name": "Omni-MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Acc on Omni-MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.467, - "details": { - "description": "min=0.467, mean=0.467, max=0.467, sum=0.467 (1)", - "tab": "Accuracy", - "Omni-MATH - Observed inference time (s)": { - "description": "min=40.912, mean=40.912, max=40.912, sum=40.912 (1)", - "tab": "Efficiency", - "score": 40.91244238162041 - }, - "Omni-MATH - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "Omni-MATH - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - # prompt tokens": { - "description": "min=111.6, mean=111.6, max=111.6, sum=111.6 (1)", - "tab": "General information", - "score": 111.6 - }, - "Omni-MATH - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_capabilities/writer/palmyra-fin/6f660e47-1d86-473d-9864-208111dcea31.json 
b/data/helm_capabilities/writer/palmyra-fin/6f660e47-1d86-473d-9864-208111dcea31.json deleted file mode 100644 index b86fc5b45..000000000 --- a/data/helm_capabilities/writer/palmyra-fin/6f660e47-1d86-473d-9864-208111dcea31.json +++ /dev/null @@ -1,345 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/writer_palmyra-fin/1770835969.095764", - "retrieved_timestamp": "1770835969.095764", - "source_metadata": { - "source_name": "helm_capabilities", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Palmyra Fin", - "id": "writer/palmyra-fin", - "developer": "writer", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean score", - "source_data": { - "dataset_name": "helm_capabilities", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "The mean of the scores from all columns.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.577, - "details": { - "tab": "Accuracy", - "Mean score - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 13.54320003211858 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU-Pro", - "source_data": { - "dataset_name": "MMLU-Pro", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on MMLU-Pro", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.591, - "details": { - "description": "min=0.591, mean=0.591, max=0.591, sum=0.591 (1)", - "tab": "Accuracy", - "MMLU-Pro - Observed inference time (s)": { - "description": "min=10.488, mean=10.488, max=10.488, sum=10.488 (1)", - "tab": "Efficiency", - "score": 10.488489307641983 - }, - "MMLU-Pro - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "MMLU-Pro - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - # prompt tokens": { - "description": "min=228.366, mean=228.366, max=228.366, sum=228.366 (1)", - "tab": "General information", - "score": 228.366 - }, - "MMLU-Pro - # output tokens": { - "description": "min=493.383, mean=493.383, max=493.383, sum=493.383 (1)", - "tab": "General information", - "score": 493.383 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.422, - "details": { - "description": "min=0.422, mean=0.422, max=0.422, sum=0.422 (1)", - "tab": "Accuracy", - "GPQA - Observed inference time (s)": { - "description": "min=14.428, mean=14.428, max=14.428, sum=14.428 (1)", - "tab": "Efficiency", - "score": 14.42766729758994 - }, - "GPQA - # eval": { - "description": "min=446, mean=446, max=446, sum=446 (1)", - "tab": "General information", - "score": 446.0 - }, - "GPQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - # prompt tokens": { - "description": "min=248.886, mean=248.886, max=248.886, sum=248.886 (1)", - "tab": "General information", - "score": 248.88565022421525 - }, - "GPQA - # output tokens": { - "description": "min=671.045, mean=671.045, max=671.045, sum=671.045 (1)", - "tab": "General information", - "score": 671.0448430493274 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "IFEval Strict Acc on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.793, - "details": { - "description": "min=0.793, mean=0.793, max=0.793, sum=0.793 (1)", - "tab": "Accuracy", - "IFEval - Observed inference time (s)": { - "description": "min=6.85, mean=6.85, max=6.85, sum=6.85 (1)", - "tab": "Efficiency", - "score": 6.849953265815918 - }, - "IFEval - # eval": { - "description": "min=541, mean=541, max=541, sum=541 (1)", - "tab": "General information", - "score": 541.0 - }, - "IFEval - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - # prompt tokens": { - "description": "min=46.024, mean=46.024, max=46.024, sum=46.024 (1)", - "tab": "General information", - "score": 46.024029574861366 - }, - "IFEval - # output tokens": { - "description": "min=332.181, mean=332.181, max=332.181, sum=332.181 (1)", - "tab": "General information", - "score": 332.181146025878 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WildBench", - "source_data": { - "dataset_name": "WildBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "WB Score on WildBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.783, - "details": { - "description": "min=0.783, mean=0.783, max=0.783, sum=0.783 (1)", - "tab": "Accuracy", - "WildBench - Observed inference time (s)": { - "description": "min=18.947, mean=18.947, max=18.947, sum=18.947 (1)", - "tab": "Efficiency", - "score": 18.947298042297362 - }, - "WildBench - # eval": { - "description": 
"min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "WildBench - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # prompt tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # output tokens": { - "description": "min=935.556, mean=935.556, max=935.556, sum=935.556 (1)", - "tab": "General information", - "score": 935.556 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "v2" - } - } - }, - { - "evaluation_name": "Omni-MATH", - "source_data": { - "dataset_name": "Omni-MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Acc on Omni-MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.295, - "details": { - "description": "min=0.295, mean=0.295, max=0.295, sum=0.295 (1)", - "tab": "Accuracy", - "Omni-MATH - Observed inference time (s)": { - "description": "min=17.003, mean=17.003, max=17.003, sum=17.003 (1)", - "tab": "Efficiency", - "score": 17.002592247247694 - }, - "Omni-MATH - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "Omni-MATH - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - # prompt tokens": { - "description": "min=109.708, mean=109.708, max=109.708, sum=109.708 (1)", - "tab": "General information", - "score": 109.708 - }, - "Omni-MATH - # output tokens": { - "description": "min=820.641, mean=820.641, max=820.641, sum=820.641 (1)", - "tab": "General information", - "score": 820.641 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_capabilities/writer/palmyra-med/91ef1f96-a708-4c53-ac9d-208ef3420668.json b/data/helm_capabilities/writer/palmyra-med/91ef1f96-a708-4c53-ac9d-208ef3420668.json deleted file mode 100644 index ac68f722a..000000000 --- a/data/helm_capabilities/writer/palmyra-med/91ef1f96-a708-4c53-ac9d-208ef3420668.json +++ /dev/null @@ -1,345 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/writer_palmyra-med/1770835969.095764", - "retrieved_timestamp": "1770835969.095764", - "source_metadata": { - "source_name": "helm_capabilities", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Palmyra Med", - "id": "writer/palmyra-med", - "developer": "writer", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean score", - "source_data": { - "dataset_name": "helm_capabilities", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "The mean of the scores 
from all columns.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.476, - "details": { - "tab": "Accuracy", - "Mean score - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 4.374187379517853 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU-Pro", - "source_data": { - "dataset_name": "MMLU-Pro", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on MMLU-Pro", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.411, - "details": { - "description": "min=0.411, mean=0.411, max=0.411, sum=0.411 (1)", - "tab": "Accuracy", - "MMLU-Pro - Observed inference time (s)": { - "description": "min=0.327, mean=0.327, max=0.327, sum=0.327 (1)", - "tab": "Efficiency", - "score": 0.32738947081565856 - }, - "MMLU-Pro - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "MMLU-Pro - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - # prompt tokens": { - "description": "min=249.366, mean=249.366, max=249.366, sum=249.366 (1)", - "tab": "General information", - "score": 249.366 - }, - "MMLU-Pro - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.368, - "details": { - "description": "min=0.368, mean=0.368, max=0.368, sum=0.368 (1)", - "tab": "Accuracy", - "GPQA - Observed inference time (s)": { - "description": "min=0.356, mean=0.356, max=0.356, sum=0.356 (1)", - "tab": "Efficiency", - "score": 0.3557077256018805 - }, - "GPQA - # eval": { - "description": "min=446, mean=446, max=446, sum=446 (1)", - "tab": "General information", - "score": 446.0 - }, - "GPQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - # prompt tokens": { - "description": "min=269.886, mean=269.886, max=269.886, sum=269.886 (1)", - "tab": "General information", - "score": 269.8856502242152 - }, - "GPQA - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": 
"false" - } - } - }, - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "IFEval Strict Acc on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.767, - "details": { - "description": "min=0.767, mean=0.767, max=0.767, sum=0.767 (1)", - "tab": "Accuracy", - "IFEval - Observed inference time (s)": { - "description": "min=4.651, mean=4.651, max=4.651, sum=4.651 (1)", - "tab": "Efficiency", - "score": 4.650597941633073 - }, - "IFEval - # eval": { - "description": "min=541, mean=541, max=541, sum=541 (1)", - "tab": "General information", - "score": 541.0 - }, - "IFEval - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - # prompt tokens": { - "description": "min=46.024, mean=46.024, max=46.024, sum=46.024 (1)", - "tab": "General information", - "score": 46.024029574861366 - }, - "IFEval - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WildBench", - "source_data": { - "dataset_name": "WildBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "WB Score on WildBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.676, - "details": { - "description": "min=0.676, mean=0.676, max=0.676, sum=0.676 (1)", - "tab": "Accuracy", - "WildBench - Observed inference time (s)": { - "description": "min=10.081, mean=10.081, max=10.081, sum=10.081 (1)", - "tab": "Efficiency", - "score": 10.080555647850037 - }, - "WildBench - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "WildBench - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # prompt tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "v2" - } - } - }, - { - "evaluation_name": "Omni-MATH", - "source_data": { - "dataset_name": "Omni-MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Acc on Omni-MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.156, - "details": { - 
"description": "min=0.156, mean=0.156, max=0.156, sum=0.156 (1)", - "tab": "Accuracy", - "Omni-MATH - Observed inference time (s)": { - "description": "min=6.457, mean=6.457, max=6.457, sum=6.457 (1)", - "tab": "Efficiency", - "score": 6.456686111688614 - }, - "Omni-MATH - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "Omni-MATH - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - # prompt tokens": { - "description": "min=109.708, mean=109.708, max=109.708, sum=109.708 (1)", - "tab": "General information", - "score": 109.708 - }, - "Omni-MATH - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_capabilities/writer/palmyra-x-004/c14bea74-0aa3-4dde-8ca1-cbc4ab3de1cc.json b/data/helm_capabilities/writer/palmyra-x-004/c14bea74-0aa3-4dde-8ca1-cbc4ab3de1cc.json deleted file mode 100644 index 9398b6319..000000000 --- a/data/helm_capabilities/writer/palmyra-x-004/c14bea74-0aa3-4dde-8ca1-cbc4ab3de1cc.json +++ /dev/null @@ -1,345 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/writer_palmyra-x-004/1770835969.095764", - "retrieved_timestamp": "1770835969.095764", - "source_metadata": { - "source_name": "helm_capabilities", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Palmyra-X-004", - "id": "writer/palmyra-x-004", - "developer": "writer", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean score", - "source_data": { - "dataset_name": "helm_capabilities", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "The mean of the scores from all columns.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.609, - "details": { - "tab": "Accuracy", - "Mean score - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 19.38686150670534 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU-Pro", - "source_data": { - "dataset_name": "MMLU-Pro", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on MMLU-Pro", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.657, - "details": { - "description": "min=0.657, mean=0.657, max=0.657, sum=0.657 (1)", - "tab": "Accuracy", - "MMLU-Pro - Observed inference time (s)": { - "description": "min=14.079, mean=14.079, max=14.079, sum=14.079 (1)", - "tab": "Efficiency", - "score": 14.079012663602828 - }, - "MMLU-Pro - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - 
}, - "MMLU-Pro - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - # prompt tokens": { - "description": "min=228.366, mean=228.366, max=228.366, sum=228.366 (1)", - "tab": "General information", - "score": 228.366 - }, - "MMLU-Pro - # output tokens": { - "description": "min=510.633, mean=510.633, max=510.633, sum=510.633 (1)", - "tab": "General information", - "score": 510.633 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.395, - "details": { - "description": "min=0.395, mean=0.395, max=0.395, sum=0.395 (1)", - "tab": "Accuracy", - "GPQA - Observed inference time (s)": { - "description": "min=20.444, mean=20.444, max=20.444, sum=20.444 (1)", - "tab": "Efficiency", - "score": 20.444375363700594 - }, - "GPQA - # eval": { - "description": "min=446, mean=446, max=446, sum=446 (1)", - "tab": "General information", - "score": 446.0 - }, - "GPQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - # prompt tokens": { - "description": "min=248.886, mean=248.886, max=248.886, sum=248.886 (1)", - "tab": "General information", - "score": 248.88565022421525 - }, - "GPQA - # output tokens": { - "description": "min=716.437, mean=716.437, max=716.437, sum=716.437 (1)", - "tab": "General information", - "score": 716.4372197309417 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "IFEval Strict Acc on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.872, - "details": { - "description": "min=0.872, mean=0.872, max=0.872, sum=0.872 (1)", - "tab": "Accuracy", - "IFEval - Observed inference time (s)": { - "description": "min=10.268, mean=10.268, max=10.268, sum=10.268 (1)", - "tab": "Efficiency", - "score": 10.267585801990107 - }, - "IFEval - # eval": { - "description": "min=541, mean=541, max=541, sum=541 (1)", - "tab": "General information", - "score": 541.0 - }, - "IFEval - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - # prompt tokens": { 
- "description": "min=46.024, mean=46.024, max=46.024, sum=46.024 (1)", - "tab": "General information", - "score": 46.024029574861366 - }, - "IFEval - # output tokens": { - "description": "min=357.087, mean=357.087, max=357.087, sum=357.087 (1)", - "tab": "General information", - "score": 357.08687615526804 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WildBench", - "source_data": { - "dataset_name": "WildBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "WB Score on WildBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.802, - "details": { - "description": "min=0.802, mean=0.802, max=0.802, sum=0.802 (1)", - "tab": "Accuracy", - "WildBench - Observed inference time (s)": { - "description": "min=28.186, mean=28.186, max=28.186, sum=28.186 (1)", - "tab": "Efficiency", - "score": 28.185582681894303 - }, - "WildBench - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "WildBench - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # prompt tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # output tokens": { - "description": "min=1068.195, mean=1068.195, max=1068.195, sum=1068.195 (1)", - "tab": "General information", - "score": 1068.195 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "v2" - } - } - }, - { - "evaluation_name": "Omni-MATH", - "source_data": { - "dataset_name": "Omni-MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Acc on Omni-MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.32, - "details": { - "description": "min=0.32, mean=0.32, max=0.32, sum=0.32 (1)", - "tab": "Accuracy", - "Omni-MATH - Observed inference time (s)": { - "description": "min=23.958, mean=23.958, max=23.958, sum=23.958 (1)", - "tab": "Efficiency", - "score": 23.95775102233887 - }, - "Omni-MATH - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "Omni-MATH - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - # prompt tokens": { - "description": "min=109.708, mean=109.708, max=109.708, sum=109.708 (1)", - "tab": "General information", - "score": 109.708 - }, - "Omni-MATH - # output tokens": { - "description": "min=897.531, mean=897.531, max=897.531, sum=897.531 (1)", - "tab": "General information", - "score": 897.531 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git 
a/data/helm_capabilities/writer/palmyra-x5/505c6245-88d1-4557-9e34-63a4e8086210.json b/data/helm_capabilities/writer/palmyra-x5/505c6245-88d1-4557-9e34-63a4e8086210.json deleted file mode 100644 index 6d3707107..000000000 --- a/data/helm_capabilities/writer/palmyra-x5/505c6245-88d1-4557-9e34-63a4e8086210.json +++ /dev/null @@ -1,345 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/writer_palmyra-x5/1770835969.095764", - "retrieved_timestamp": "1770835969.095764", - "source_metadata": { - "source_name": "helm_capabilities", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Palmyra X5", - "id": "writer/palmyra-x5", - "developer": "writer", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean score", - "source_data": { - "dataset_name": "helm_capabilities", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "The mean of the scores from all columns.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.696, - "details": { - "tab": "Accuracy", - "Mean score - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 7.539339301355213 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU-Pro", - "source_data": { - "dataset_name": "MMLU-Pro", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on MMLU-Pro", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.804, - "details": { - "description": "min=0.804, mean=0.804, max=0.804, sum=0.804 (1)", - "tab": "Accuracy", - "MMLU-Pro - Observed inference time (s)": { - "description": "min=5.907, mean=5.907, max=5.907, sum=5.907 (1)", - "tab": "Efficiency", - "score": 5.906555171251297 - }, - "MMLU-Pro - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "MMLU-Pro - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - # prompt tokens": { - "description": "min=249.366, mean=249.366, max=249.366, sum=249.366 (1)", - "tab": "General information", - "score": 249.366 - }, - "MMLU-Pro - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on GPQA", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.661, - "details": { - "description": "min=0.661, mean=0.661, max=0.661, sum=0.661 (1)", - "tab": "Accuracy", - "GPQA - Observed inference time (s)": { - "description": "min=9.251, mean=9.251, max=9.251, sum=9.251 (1)", - "tab": "Efficiency", - "score": 9.251234515365464 - }, - "GPQA - # eval": { - "description": "min=446, mean=446, max=446, sum=446 (1)", - "tab": "General information", - "score": 446.0 - }, - "GPQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - # prompt tokens": { - "description": "min=269.886, mean=269.886, max=269.886, sum=269.886 (1)", - "tab": "General information", - "score": 269.8856502242152 - }, - "GPQA - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "IFEval Strict Acc on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.823, - "details": { - "description": "min=0.823, mean=0.823, max=0.823, sum=0.823 (1)", - "tab": "Accuracy", - "IFEval - Observed inference time (s)": { - "description": "min=3.337, mean=3.337, max=3.337, sum=3.337 (1)", - "tab": "Efficiency", - "score": 3.3367519599012 - }, - "IFEval - # eval": { - "description": "min=541, mean=541, max=541, sum=541 (1)", - "tab": "General information", - "score": 541.0 - }, - "IFEval - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - # prompt tokens": { - "description": "min=46.024, mean=46.024, max=46.024, sum=46.024 (1)", - "tab": "General information", - "score": 46.024029574861366 - }, - "IFEval - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WildBench", - "source_data": { - "dataset_name": "WildBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "WB Score on WildBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.78, - "details": { - "description": "min=0.78, mean=0.78, max=0.78, sum=0.78 (1)", - "tab": "Accuracy", - "WildBench - Observed inference time (s)": { - "description": "min=8.281, mean=8.281, max=8.281, sum=8.281 (1)", - "tab": "Efficiency", - "score": 8.280673936367036 - }, - "WildBench - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - 
"tab": "General information", - "score": 1000.0 - }, - "WildBench - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # prompt tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "v2" - } - } - }, - { - "evaluation_name": "Omni-MATH", - "source_data": { - "dataset_name": "Omni-MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Acc on Omni-MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.414, - "details": { - "description": "min=0.414, mean=0.414, max=0.414, sum=0.414 (1)", - "tab": "Accuracy", - "Omni-MATH - Observed inference time (s)": { - "description": "min=10.921, mean=10.921, max=10.921, sum=10.921 (1)", - "tab": "Efficiency", - "score": 10.921480923891068 - }, - "Omni-MATH - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "Omni-MATH - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - # prompt tokens": { - "description": "min=109.708, mean=109.708, max=109.708, sum=109.708 (1)", - "tab": "General information", - "score": 109.708 - }, - "Omni-MATH - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_capabilities/xai/grok-3-beta/9a473236-f187-4926-ae8a-e8b84fe2a060.json b/data/helm_capabilities/xai/grok-3-beta/9a473236-f187-4926-ae8a-e8b84fe2a060.json deleted file mode 100644 index 54503d043..000000000 --- a/data/helm_capabilities/xai/grok-3-beta/9a473236-f187-4926-ae8a-e8b84fe2a060.json +++ /dev/null @@ -1,345 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/xai_grok-3-beta/1770835969.095764", - "retrieved_timestamp": "1770835969.095764", - "source_metadata": { - "source_name": "helm_capabilities", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Grok 3 Beta", - "id": "xai/grok-3-beta", - "developer": "xai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean score", - "source_data": { - "dataset_name": "helm_capabilities", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "The mean of the scores from all columns.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.727, - "details": { - "tab": "Accuracy", - "Mean score - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 21.832675643266274 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU-Pro", - "source_data": { - "dataset_name": "MMLU-Pro", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on MMLU-Pro", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.788, - "details": { - "description": "min=0.788, mean=0.788, max=0.788, sum=0.788 (1)", - "tab": "Accuracy", - "MMLU-Pro - Observed inference time (s)": { - "description": "min=24.646, mean=24.646, max=24.646, sum=24.646 (1)", - "tab": "Efficiency", - "score": 24.646376408576966 - }, - "MMLU-Pro - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "MMLU-Pro - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - truncated": { - "description": "min=0.013, mean=0.013, max=0.013, sum=0.013 (1)", - "tab": "General information", - "score": 0.013 - }, - "MMLU-Pro - # prompt tokens": { - "description": "min=223.237, mean=223.237, max=223.237, sum=223.237 (1)", - "tab": "General information", - "score": 223.237 - }, - "MMLU-Pro - # output tokens": { - "description": "min=1669.743, mean=1669.743, max=1669.743, sum=1669.743 (1)", - "tab": "General information", - "score": 1669.743 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.65, - "details": { - "description": "min=0.65, mean=0.65, max=0.65, sum=0.65 (1)", - "tab": "Accuracy", - "GPQA - Observed inference time (s)": { - "description": "min=30.888, mean=30.888, max=30.888, sum=30.888 (1)", - "tab": "Efficiency", - "score": 30.88756059317311 - }, - "GPQA - # eval": { - "description": "min=446, mean=446, max=446, sum=446 (1)", - "tab": "General information", - "score": 446.0 - }, - "GPQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - truncated": { - "description": "min=0.02, mean=0.02, max=0.02, sum=0.02 (1)", - "tab": "General information", - "score": 0.020179372197309416 - }, - "GPQA - # prompt tokens": { - "description": "min=233.054, mean=233.054, max=233.054, sum=233.054 (1)", - "tab": "General information", - "score": 233.05381165919283 - }, - "GPQA - # output tokens": { - "description": "min=2771.594, mean=2771.594, max=2771.594, sum=2771.594 (1)", - "tab": "General information", - "score": 2771.5941704035877 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": 
"false" - } - } - }, - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "IFEval Strict Acc on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.884, - "details": { - "description": "min=0.884, mean=0.884, max=0.884, sum=0.884 (1)", - "tab": "Accuracy", - "IFEval - Observed inference time (s)": { - "description": "min=5.792, mean=5.792, max=5.792, sum=5.792 (1)", - "tab": "Efficiency", - "score": 5.791596473475261 - }, - "IFEval - # eval": { - "description": "min=541, mean=541, max=541, sum=541 (1)", - "tab": "General information", - "score": 541.0 - }, - "IFEval - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - # prompt tokens": { - "description": "min=45.192, mean=45.192, max=45.192, sum=45.192 (1)", - "tab": "General information", - "score": 45.19223659889094 - }, - "IFEval - # output tokens": { - "description": "min=404.85, mean=404.85, max=404.85, sum=404.85 (1)", - "tab": "General information", - "score": 404.8502772643253 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WildBench", - "source_data": { - "dataset_name": "WildBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "WB Score on WildBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.849, - "details": { - "description": "min=0.849, mean=0.849, max=0.849, sum=0.849 (1)", - "tab": "Accuracy", - "WildBench - Observed inference time (s)": { - "description": "min=16.937, mean=16.937, max=16.937, sum=16.937 (1)", - "tab": "Efficiency", - "score": 16.93687919616699 - }, - "WildBench - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "WildBench - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # prompt tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # output tokens": { - "description": "min=1419.576, mean=1419.576, max=1419.576, sum=1419.576 (1)", - "tab": "General information", - "score": 1419.576 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "v2" - } - } - }, - { - "evaluation_name": "Omni-MATH", - "source_data": { - "dataset_name": "Omni-MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Acc on Omni-MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 
1.0 - }, - "score_details": { - "score": 0.464, - "details": { - "description": "min=0.464, mean=0.464, max=0.464, sum=0.464 (1)", - "tab": "Accuracy", - "Omni-MATH - Observed inference time (s)": { - "description": "min=30.901, mean=30.901, max=30.901, sum=30.901 (1)", - "tab": "Efficiency", - "score": 30.90096554493904 - }, - "Omni-MATH - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "Omni-MATH - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - truncated": { - "description": "min=0.001, mean=0.001, max=0.001, sum=0.001 (1)", - "tab": "General information", - "score": 0.001 - }, - "Omni-MATH - # prompt tokens": { - "description": "min=104.089, mean=104.089, max=104.089, sum=104.089 (1)", - "tab": "General information", - "score": 104.089 - }, - "Omni-MATH - # output tokens": { - "description": "min=3296.733, mean=3296.733, max=3296.733, sum=3296.733 (1)", - "tab": "General information", - "score": 3296.733 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_capabilities/xai/grok-3-mini-beta/1d7ece9b-1dcf-4adf-aa16-b030e286c26e.json b/data/helm_capabilities/xai/grok-3-mini-beta/1d7ece9b-1dcf-4adf-aa16-b030e286c26e.json deleted file mode 100644 index a083c0183..000000000 --- a/data/helm_capabilities/xai/grok-3-mini-beta/1d7ece9b-1dcf-4adf-aa16-b030e286c26e.json +++ /dev/null @@ -1,345 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/xai_grok-3-mini-beta/1770835969.095764", - "retrieved_timestamp": "1770835969.095764", - "source_metadata": { - "source_name": "helm_capabilities", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Grok 3 mini Beta", - "id": "xai/grok-3-mini-beta", - "developer": "xai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean score", - "source_data": { - "dataset_name": "helm_capabilities", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "The mean of the scores from all columns.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.679, - "details": { - "tab": "Accuracy", - "Mean score - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 12.070258432341626 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU-Pro", - "source_data": { - "dataset_name": "MMLU-Pro", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on MMLU-Pro", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.799, - "details": { - "description": "min=0.799, mean=0.799, max=0.799, sum=0.799 (1)", - "tab": "Accuracy", - "MMLU-Pro - Observed inference time (s)": { - "description": "min=7.153, mean=7.153, max=7.153, sum=7.153 (1)", - "tab": "Efficiency", - "score": 7.153050385713577 - }, - "MMLU-Pro - # eval": { 
- "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "MMLU-Pro - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - truncated": { - "description": "min=0.013, mean=0.013, max=0.013, sum=0.013 (1)", - "tab": "General information", - "score": 0.013 - }, - "MMLU-Pro - # prompt tokens": { - "description": "min=223.237, mean=223.237, max=223.237, sum=223.237 (1)", - "tab": "General information", - "score": 223.237 - }, - "MMLU-Pro - # output tokens": { - "description": "min=59.839, mean=59.839, max=59.839, sum=59.839 (1)", - "tab": "General information", - "score": 59.839 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.675, - "details": { - "description": "min=0.675, mean=0.675, max=0.675, sum=0.675 (1)", - "tab": "Accuracy", - "GPQA - Observed inference time (s)": { - "description": "min=14.215, mean=14.215, max=14.215, sum=14.215 (1)", - "tab": "Efficiency", - "score": 14.215015458419185 - }, - "GPQA - # eval": { - "description": "min=446, mean=446, max=446, sum=446 (1)", - "tab": "General information", - "score": 446.0 - }, - "GPQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - truncated": { - "description": "min=0.02, mean=0.02, max=0.02, sum=0.02 (1)", - "tab": "General information", - "score": 0.020179372197309416 - }, - "GPQA - # prompt tokens": { - "description": "min=233.054, mean=233.054, max=233.054, sum=233.054 (1)", - "tab": "General information", - "score": 233.05381165919283 - }, - "GPQA - # output tokens": { - "description": "min=125.807, mean=125.807, max=125.807, sum=125.807 (1)", - "tab": "General information", - "score": 125.80717488789237 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "IFEval Strict Acc on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.951, - "details": { - "description": "min=0.951, mean=0.951, max=0.951, sum=0.951 (1)", - "tab": "Accuracy", - "IFEval - Observed inference time (s)": { - "description": "min=7.187, mean=7.187, max=7.187, sum=7.187 (1)", - "tab": "Efficiency", - "score": 7.187224511077797 - }, - "IFEval - # eval": { - "description": "min=541, mean=541, max=541, sum=541 (1)", - "tab": "General information", - "score": 541.0 - }, - "IFEval - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - 
"IFEval - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - # prompt tokens": { - "description": "min=45.192, mean=45.192, max=45.192, sum=45.192 (1)", - "tab": "General information", - "score": 45.19223659889094 - }, - "IFEval - # output tokens": { - "description": "min=347.104, mean=347.104, max=347.104, sum=347.104 (1)", - "tab": "General information", - "score": 347.10351201478744 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WildBench", - "source_data": { - "dataset_name": "WildBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "WB Score on WildBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.651, - "details": { - "description": "min=0.651, mean=0.651, max=0.651, sum=0.651 (1)", - "tab": "Accuracy", - "WildBench - Observed inference time (s)": { - "description": "min=10.787, mean=10.787, max=10.787, sum=10.787 (1)", - "tab": "Efficiency", - "score": 10.787254344463348 - }, - "WildBench - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "WildBench - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # prompt tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # output tokens": { - "description": "min=864.463, mean=864.463, max=864.463, sum=864.463 (1)", - "tab": "General information", - "score": 864.463 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "v2" - } - } - }, - { - "evaluation_name": "Omni-MATH", - "source_data": { - "dataset_name": "Omni-MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Acc on Omni-MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.318, - "details": { - "description": "min=0.318, mean=0.318, max=0.318, sum=0.318 (1)", - "tab": "Accuracy", - "Omni-MATH - Observed inference time (s)": { - "description": "min=21.009, mean=21.009, max=21.009, sum=21.009 (1)", - "tab": "Efficiency", - "score": 21.008747462034226 - }, - "Omni-MATH - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "Omni-MATH - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - truncated": { - "description": "min=0.001, mean=0.001, max=0.001, sum=0.001 (1)", - "tab": "General information", - "score": 0.001 - }, - "Omni-MATH - # prompt tokens": { - "description": "min=104.089, mean=104.089, max=104.089, sum=104.089 (1)", - "tab": "General information", - "score": 104.089 - }, - "Omni-MATH - # output tokens": { - "description": "min=183.116, mean=183.116, max=183.116, sum=183.116 (1)", 
- "tab": "General information", - "score": 183.116 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_capabilities/xai/grok-4-0709/aeabfb59-74db-445c-9693-7a088ac5073c.json b/data/helm_capabilities/xai/grok-4-0709/aeabfb59-74db-445c-9693-7a088ac5073c.json deleted file mode 100644 index a25562cb1..000000000 --- a/data/helm_capabilities/xai/grok-4-0709/aeabfb59-74db-445c-9693-7a088ac5073c.json +++ /dev/null @@ -1,345 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/xai_grok-4-0709/1770835969.095764", - "retrieved_timestamp": "1770835969.095764", - "source_metadata": { - "source_name": "helm_capabilities", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Grok 4 0709", - "id": "xai/grok-4-0709", - "developer": "xai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean score", - "source_data": { - "dataset_name": "helm_capabilities", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "The mean of the scores from all columns.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.785, - "details": { - "tab": "Accuracy", - "Mean score - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 128.04182146459848 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU-Pro", - "source_data": { - "dataset_name": "MMLU-Pro", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on MMLU-Pro", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.851, - "details": { - "description": "min=0.851, mean=0.851, max=0.851, sum=0.851 (1)", - "tab": "Accuracy", - "MMLU-Pro - Observed inference time (s)": { - "description": "min=93.583, mean=93.583, max=93.583, sum=93.583 (1)", - "tab": "Efficiency", - "score": 93.58286614966393 - }, - "MMLU-Pro - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "MMLU-Pro - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - truncated": { - "description": "min=0.013, mean=0.013, max=0.013, sum=0.013 (1)", - "tab": "General information", - "score": 0.013 - }, - "MMLU-Pro - # prompt tokens": { - "description": "min=244.237, mean=244.237, max=244.237, sum=244.237 (1)", - "tab": "General information", - "score": 244.237 - }, - "MMLU-Pro - # output tokens": { - "description": "min=4.789, mean=4.789, max=4.789, sum=4.789 (1)", - "tab": "General information", - "score": 4.789 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.726, - "details": { - "description": "min=0.726, mean=0.726, max=0.726, sum=0.726 (1)", - "tab": "Accuracy", - "GPQA - Observed inference time (s)": { - "description": "min=223.967, mean=223.967, max=223.967, sum=223.967 (1)", - "tab": "Efficiency", - "score": 223.96746500778625 - }, - "GPQA - # eval": { - "description": "min=446, mean=446, max=446, sum=446 (1)", - "tab": "General information", - "score": 446.0 - }, - "GPQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - truncated": { - "description": "min=0.02, mean=0.02, max=0.02, sum=0.02 (1)", - "tab": "General information", - "score": 0.020179372197309416 - }, - "GPQA - # prompt tokens": { - "description": "min=254.007, mean=254.007, max=254.007, sum=254.007 (1)", - "tab": "General information", - "score": 254.0067264573991 - }, - "GPQA - # output tokens": { - "description": "min=5.841, mean=5.841, max=5.841, sum=5.841 (1)", - "tab": "General information", - "score": 5.8408071748878925 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "IFEval Strict Acc on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.949, - "details": { - "description": "min=0.949, mean=0.949, max=0.949, sum=0.949 (1)", - "tab": "Accuracy", - "IFEval - Observed inference time (s)": { - "description": "min=31.966, mean=31.966, max=31.966, sum=31.966 (1)", - "tab": "Efficiency", - "score": 31.966069252786266 - }, - "IFEval - # eval": { - "description": "min=541, mean=541, max=541, sum=541 (1)", - "tab": "General information", - "score": 541.0 - }, - "IFEval - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - # prompt tokens": { - "description": "min=45.192, mean=45.192, max=45.192, sum=45.192 (1)", - "tab": "General information", - "score": 45.19223659889094 - }, - "IFEval - # output tokens": { - "description": "min=376.298, mean=376.298, max=376.298, sum=376.298 (1)", - "tab": "General information", - "score": 376.29759704251387 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WildBench", - "source_data": { - "dataset_name": "WildBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "WB Score on WildBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.797, - "details": { - "description": "min=0.797, mean=0.797, max=0.797, sum=0.797 (1)", - "tab": "Accuracy", - "WildBench - Observed inference time (s)": { - "description": "min=115.441, mean=115.441, max=115.441, sum=115.441 (1)", - "tab": "Efficiency", - "score": 115.44128810715675 - }, - "WildBench - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "WildBench - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # prompt tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # output tokens": { - "description": "min=1553.96, mean=1553.96, max=1553.96, sum=1553.96 (1)", - "tab": "General information", - "score": 1553.96 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "v2" - } - } - }, - { - "evaluation_name": "Omni-MATH", - "source_data": { - "dataset_name": "Omni-MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Acc on Omni-MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.603, - "details": { - "description": "min=0.603, mean=0.603, max=0.603, sum=0.603 (1)", - "tab": "Accuracy", - "Omni-MATH - Observed inference time (s)": { - "description": "min=175.251, mean=175.251, max=175.251, sum=175.251 (1)", - "tab": "Efficiency", - "score": 175.2514188055992 - }, - "Omni-MATH - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "Omni-MATH - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - truncated": { - "description": "min=0.001, mean=0.001, max=0.001, sum=0.001 (1)", - "tab": "General information", - "score": 0.001 - }, - "Omni-MATH - # prompt tokens": { - "description": "min=104.089, mean=104.089, max=104.089, sum=104.089 (1)", - "tab": "General information", - "score": 104.089 - }, - "Omni-MATH - # output tokens": { - "description": "min=104.419, mean=104.419, max=104.419, sum=104.419 (1)", - "tab": "General information", - "score": 104.419 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_capabilities/zai-org/glm-4.5-air-fp8/eb2f8883-30ee-42e1-95b5-48dcf988ecf5.json b/data/helm_capabilities/zai-org/glm-4.5-air-fp8/eb2f8883-30ee-42e1-95b5-48dcf988ecf5.json deleted file mode 100644 index 43a98dd63..000000000 --- a/data/helm_capabilities/zai-org/glm-4.5-air-fp8/eb2f8883-30ee-42e1-95b5-48dcf988ecf5.json +++ /dev/null @@ -1,345 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_capabilities/zai-org_glm-4.5-air-fp8/1770835969.095764", - "retrieved_timestamp": "1770835969.095764", - "source_metadata": { - "source_name": "helm_capabilities", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "GLM-4.5-Air-FP8", - "id": "zai-org/glm-4.5-air-fp8", - "developer": "zai-org", - 
"inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean score", - "source_data": { - "dataset_name": "helm_capabilities", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "The mean of the scores from all columns.", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.67, - "details": { - "tab": "Accuracy", - "Mean score - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 36.15586140714108 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU-Pro", - "source_data": { - "dataset_name": "MMLU-Pro", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on MMLU-Pro", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.762, - "details": { - "description": "min=0.762, mean=0.762, max=0.762, sum=0.762 (1)", - "tab": "Accuracy", - "MMLU-Pro - Observed inference time (s)": { - "description": "min=30.422, mean=30.422, max=30.422, sum=30.422 (1)", - "tab": "Efficiency", - "score": 30.421801055192947 - }, - "MMLU-Pro - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "MMLU-Pro - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MMLU-Pro - # prompt tokens": { - "description": "min=250.402, mean=250.402, max=250.402, sum=250.402 (1)", - "tab": "General information", - "score": 250.402 - }, - "MMLU-Pro - # output tokens": { - "description": "min=4423.528, mean=4423.528, max=4423.528, sum=4423.528 (1)", - "tab": "General information", - "score": 4423.528 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "all", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "COT correct on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.594, - "details": { - "description": "min=0.594, mean=0.594, max=0.594, sum=0.594 (1)", - "tab": "Accuracy", - "GPQA - Observed inference time (s)": { - "description": "min=54.963, mean=54.963, max=54.963, sum=54.963 (1)", - "tab": "Efficiency", - "score": 54.96293809649121 - }, - "GPQA - # eval": { - "description": "min=446, mean=446, max=446, sum=446 (1)", - "tab": "General information", - "score": 446.0 - }, - "GPQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GPQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - 
"GPQA - # prompt tokens": { - "description": "min=269.978, mean=269.978, max=269.978, sum=269.978 (1)", - "tab": "General information", - "score": 269.9775784753363 - }, - "GPQA - # output tokens": { - "description": "min=8628.161, mean=8628.161, max=8628.161, sum=8628.161 (1)", - "tab": "General information", - "score": 8628.16143497758 - } - } - }, - "generation_config": { - "additional_details": { - "subset": "gpqa_main", - "use_chain_of_thought": "true", - "use_few_shot": "false" - } - } - }, - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "IFEval Strict Acc on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.812, - "details": { - "description": "min=0.812, mean=0.812, max=0.812, sum=0.812 (1)", - "tab": "Accuracy", - "IFEval - Observed inference time (s)": { - "description": "min=8.027, mean=8.027, max=8.027, sum=8.027 (1)", - "tab": "Efficiency", - "score": 8.026858968787625 - }, - "IFEval - # eval": { - "description": "min=541, mean=541, max=541, sum=541 (1)", - "tab": "General information", - "score": 541.0 - }, - "IFEval - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IFEval - # prompt tokens": { - "description": "min=46.026, mean=46.026, max=46.026, sum=46.026 (1)", - "tab": "General information", - "score": 46.02587800369686 - }, - "IFEval - # output tokens": { - "description": "min=1330.573, mean=1330.573, max=1330.573, sum=1330.573 (1)", - "tab": "General information", - "score": 1330.573012939002 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WildBench", - "source_data": { - "dataset_name": "WildBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "WB Score on WildBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.789, - "details": { - "description": "min=0.789, mean=0.789, max=0.789, sum=0.789 (1)", - "tab": "Accuracy", - "WildBench - Observed inference time (s)": { - "description": "min=25.055, mean=25.055, max=25.055, sum=25.055 (1)", - "tab": "Efficiency", - "score": 25.055315640687944 - }, - "WildBench - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "WildBench - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # prompt tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "WildBench - # output tokens": { - "description": "min=4196.241, mean=4196.241, max=4196.241, sum=4196.241 (1)", - "tab": "General information", - "score": 4196.241 - } - } - }, - 
"generation_config": { - "additional_details": { - "subset": "v2" - } - } - }, - { - "evaluation_name": "Omni-MATH", - "source_data": { - "dataset_name": "Omni-MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/capabilities/benchmark_output/releases/v1.15.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Acc on Omni-MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.391, - "details": { - "description": "min=0.391, mean=0.391, max=0.391, sum=0.391 (1)", - "tab": "Accuracy", - "Omni-MATH - Observed inference time (s)": { - "description": "min=62.312, mean=62.312, max=62.312, sum=62.312 (1)", - "tab": "Efficiency", - "score": 62.31239327454567 - }, - "Omni-MATH - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "Omni-MATH - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "Omni-MATH - # prompt tokens": { - "description": "min=109.807, mean=109.807, max=109.807, sum=109.807 (1)", - "tab": "General information", - "score": 109.807 - }, - "Omni-MATH - # output tokens": { - "description": "min=11088.014, mean=11088.014, max=11088.014, sum=11088.014 (1)", - "tab": "General information", - "score": 11088.014 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_classic/Anthropic-LM-v4-s3-52B/Anthropic-LM-v4-s3-52B/12fdea65-94eb-4c85-876c-65f0528bde12.json b/data/helm_classic/Anthropic-LM-v4-s3-52B/Anthropic-LM-v4-s3-52B/12fdea65-94eb-4c85-876c-65f0528bde12.json deleted file mode 100644 index 152223193..000000000 --- a/data/helm_classic/Anthropic-LM-v4-s3-52B/Anthropic-LM-v4-s3-52B/12fdea65-94eb-4c85-876c-65f0528bde12.json +++ /dev/null @@ -1,1613 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_classic/Anthropic-LM-v4-s3-52B/1770834891.1472661", - "retrieved_timestamp": "1770834891.1472661", - "source_metadata": { - "source_name": "helm_classic", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Anthropic-LM v4-s3 52B", - "id": "Anthropic-LM-v4-s3-52B", - "developer": "unknown", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_classic", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperform on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.78, - "details": { - "tab": "Accuracy", - "Mean win rate - Calibration": { - "description": null, - "tab": "Calibration", - "score": null - }, - "Mean win rate - Robustness": { - "description": null, - "tab": "Robustness", - "score": 0.8178973356392711 - }, - "Mean win rate - Fairness": { - "description": null, - "tab": "Fairness", - "score": 0.7935577862997218 - }, - "Mean win rate - Efficiency": { - "description": null, - 
"tab": "Efficiency", - "score": 0.13822916666666668 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - }, - "Mean win rate - Bias": { - "description": null, - "tab": "Bias", - "score": 0.5930298633071189 - }, - "Mean win rate - Toxicity": { - "description": null, - "tab": "Toxicity", - "score": 0.648748165414832 - }, - "Mean win rate - Summarization metrics": { - "description": null, - "tab": "Summarization metrics", - "score": 0.5306599832915623 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.481, - "details": { - "description": "min=0.25, mean=0.481, max=0.78, sum=7.22 (15)", - "tab": "Accuracy", - "MMLU - ECE (10-bin)": { - "description": "min=0.063, mean=0.144, max=0.262, sum=2.165 (15)", - "tab": "Calibration", - "score": null - }, - "MMLU - EM (Robustness)": { - "description": "min=0.17, mean=0.434, max=0.76, sum=6.513 (15)", - "tab": "Robustness", - "score": 0.43421052631578944 - }, - "MMLU - EM (Fairness)": { - "description": "min=0.211, mean=0.447, max=0.74, sum=6.702 (15)", - "tab": "Fairness", - "score": 0.4467836257309941 - }, - "MMLU - Denoised inference time (s)": { - "description": "min=0.556, mean=0.578, max=0.605, sum=8.664 (15)", - "tab": "Efficiency", - "score": 0.5775741999040572 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=1542 (15)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=75 (15)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (15)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=371.38, mean=472.274, max=624.07, sum=7084.111 (15)", - "tab": "General information", - "score": 472.2740350877193 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=15 (15)", - "tab": "General information", - "score": 1.0 - }, - "MMLU - # trials": { - "description": "min=3, mean=3, max=3, sum=45 (15)", - "tab": "General information", - "score": 3.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "BoolQ", - "source_data": { - "dataset_name": "BoolQ", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on BoolQ", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.815, - "details": { - "description": "min=0.814, mean=0.815, max=0.816, sum=2.446 (3)", - "tab": "Accuracy", - "BoolQ - ECE (10-bin)": { - "description": "min=0.035, mean=0.038, max=0.041, sum=0.114 (3)", - "tab": "Calibration", - "score": null - }, - "BoolQ - EM (Robustness)": { - "description": "min=0.751, mean=0.756, max=0.76, sum=2.269 (3)", - "tab": "Robustness", - "score": 0.7563333333333334 - }, - "BoolQ - EM (Fairness)": { - "description": "min=0.778, 
mean=0.782, max=0.788, sum=2.345 (3)", - "tab": "Fairness", - "score": 0.7816666666666667 - }, - "BoolQ - Denoised inference time (s)": { - "description": "min=0.566, mean=0.637, max=0.75, sum=1.912 (3)", - "tab": "Efficiency", - "score": 0.6371923081597224 - }, - "BoolQ - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "BoolQ - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "BoolQ - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "BoolQ - # prompt tokens": { - "description": "min=660.073, mean=908.406, max=1242.073, sum=2725.219 (3)", - "tab": "General information", - "score": 908.4063333333334 - }, - "BoolQ - # output tokens": { - "description": "min=1.004, mean=1.004, max=1.004, sum=3.012 (3)", - "tab": "General information", - "score": 1.004 - }, - "BoolQ - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "BoolQ - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.728, - "details": { - "description": "min=0.692, mean=0.728, max=0.748, sum=2.185 (3)", - "tab": "Accuracy", - "NarrativeQA - ECE (10-bin)": { - "description": "min=0.052, mean=0.09, max=0.14, sum=0.27 (3)", - "tab": "Calibration", - "score": null - }, - "NarrativeQA - F1 (Robustness)": { - "description": "min=0.622, mean=0.663, max=0.693, sum=1.99 (3)", - "tab": "Robustness", - "score": 0.6634443166549867 - }, - "NarrativeQA - F1 (Fairness)": { - "description": "min=0.614, mean=0.646, max=0.667, sum=1.939 (3)", - "tab": "Fairness", - "score": 0.6464650190039823 - }, - "NarrativeQA - Denoised inference time (s)": { - "description": "min=1.628, mean=1.722, max=1.839, sum=5.167 (3)", - "tab": "Efficiency", - "score": 1.7223421043622853 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=1065 (3)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=3504.577, mean=3803.911, max=3972.577, sum=11411.732 (3)", - "tab": "General information", - "score": 3803.910798122066 - }, - "NarrativeQA - # output tokens": { - 
"description": "min=4.572, mean=6.952, max=8.434, sum=20.856 (3)", - "tab": "General information", - "score": 6.9521126760563385 - }, - "NarrativeQA - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NarrativeQA - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NarrativeQA - Stereotypes (gender)": { - "description": "min=0.333, mean=0.39, max=0.419, sum=1.169 (3)", - "tab": "Bias", - "score": 0.38950617283950617 - }, - "NarrativeQA - Representation (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=0.667 (1)", - "tab": "Bias", - "score": 0.6666666666666667 - }, - "NarrativeQA - Representation (gender)": { - "description": "min=0.19, mean=0.208, max=0.218, sum=0.624 (3)", - "tab": "Bias", - "score": 0.20792828096614854 - }, - "NarrativeQA - Toxic fraction": { - "description": "min=0.011, mean=0.013, max=0.014, sum=0.039 (3)", - "tab": "Toxicity", - "score": 0.013145539906103287 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (open-book)", - "source_data": { - "dataset_name": "NaturalQuestions (open-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (open-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.686, - "details": { - "description": "min=0.682, mean=0.686, max=0.693, sum=2.059 (3)", - "tab": "Accuracy", - "NaturalQuestions (closed-book) - ECE (10-bin)": { - "description": "min=0.108, mean=0.121, max=0.128, sum=0.362 (3)", - "tab": "Calibration", - "score": null - }, - "NaturalQuestions (open-book) - ECE (10-bin)": { - "description": "min=0.048, mean=0.067, max=0.088, sum=0.2 (3)", - "tab": "Calibration", - "score": null - }, - "NaturalQuestions (closed-book) - F1 (Robustness)": { - "description": "min=0.239, mean=0.245, max=0.248, sum=0.734 (3)", - "tab": "Robustness", - "score": 0.24480135198778494 - }, - "NaturalQuestions (open-book) - F1 (Robustness)": { - "description": "min=0.608, mean=0.632, max=0.646, sum=1.897 (3)", - "tab": "Robustness", - "score": 0.6323821508652113 - }, - "NaturalQuestions (closed-book) - F1 (Fairness)": { - "description": "min=0.228, mean=0.239, max=0.244, sum=0.716 (3)", - "tab": "Fairness", - "score": 0.23855278160903723 - }, - "NaturalQuestions (open-book) - F1 (Fairness)": { - "description": "min=0.639, mean=0.642, max=0.646, sum=1.927 (3)", - "tab": "Fairness", - "score": 0.6422159112855447 - }, - "NaturalQuestions (closed-book) - Denoised inference time (s)": { - "description": "min=0.751, mean=0.777, max=0.821, sum=2.331 (3)", - "tab": "Efficiency", - "score": 0.7770150703124993 - }, - "NaturalQuestions (open-book) - Denoised inference time (s)": { - "description": "min=1.036, mean=1.102, max=1.15, sum=3.305 (3)", - "tab": "Efficiency", - "score": 1.1015715911458346 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 
(3)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=110.254, mean=112.254, max=116.254, sum=336.762 (3)", - "tab": "General information", - "score": 112.254 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=5.032, mean=5.47, max=6.183, sum=16.409 (3)", - "tab": "General information", - "score": 5.469666666666666 - }, - "NaturalQuestions (closed-book) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.964, mean=4.964, max=4.965, sum=14.893 (3)", - "tab": "General information", - "score": 4.964333333333333 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.007, mean=0.007, max=0.007, sum=0.021 (3)", - "tab": "General information", - "score": 0.007 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1381.066, mean=1592.701, max=1704.681, sum=4778.103 (3)", - "tab": "General information", - "score": 1592.701 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=5.429, mean=5.659, max=6.028, sum=16.976 (3)", - "tab": "General information", - "score": 5.658666666666666 - }, - "NaturalQuestions (open-book) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NaturalQuestions (closed-book) - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NaturalQuestions (closed-book) - Stereotypes (gender)": { - "description": "min=0.5, mean=0.5, max=0.5, sum=1.5 (3)", - "tab": "Bias", - "score": 0.5 - }, - "NaturalQuestions (closed-book) - Representation (race)": { - "description": "min=0.358, mean=0.386, max=0.439, sum=1.158 (3)", - "tab": "Bias", - "score": 0.38616369646117926 - }, - "NaturalQuestions (closed-book) - Representation (gender)": { - "description": "min=0, mean=0.148, max=0.237, sum=0.443 (3)", - "tab": "Bias", - "score": 0.1475748194014448 - }, - "NaturalQuestions (open-book) - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NaturalQuestions (open-book) - Stereotypes (gender)": { - "description": "min=0.367, mean=0.429, max=0.5, sum=1.287 (3)", - "tab": "Bias", - "score": 0.4288888888888889 - }, - "NaturalQuestions (open-book) - Representation (race)": { - "description": "min=0.459, mean=0.48, max=0.498, sum=1.441 (3)", - "tab": "Bias", - "score": 0.48032222577096423 - }, - "NaturalQuestions (open-book) - Representation (gender)": { - "description": "min=0.024, mean=0.043, max=0.079, sum=0.129 (3)", - "tab": "Bias", - "score": 0.043024227234753555 - }, - "NaturalQuestions (closed-book) - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "Toxicity", - "score": 0.0 - }, - "NaturalQuestions (open-book) - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "QuAC", - "source_data": { - "dataset_name": "QuAC", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { 
- "evaluation_description": "F1 on QuAC", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.431, - "details": { - "description": "min=0.41, mean=0.431, max=0.443, sum=1.294 (3)", - "tab": "Accuracy", - "QuAC - ECE (10-bin)": { - "description": "min=0.034, mean=0.039, max=0.048, sum=0.116 (3)", - "tab": "Calibration", - "score": null - }, - "QuAC - F1 (Robustness)": { - "description": "min=0.303, mean=0.313, max=0.324, sum=0.938 (3)", - "tab": "Robustness", - "score": 0.31252831855461766 - }, - "QuAC - F1 (Fairness)": { - "description": "min=0.338, mean=0.356, max=0.365, sum=1.067 (3)", - "tab": "Fairness", - "score": 0.35555313427706087 - }, - "QuAC - Denoised inference time (s)": { - "description": "min=3.472, mean=3.694, max=4.123, sum=11.082 (3)", - "tab": "Efficiency", - "score": 3.6939938854166683 - }, - "QuAC - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "QuAC - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "QuAC - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "QuAC - # prompt tokens": { - "description": "min=4676.788, mean=5199.788, max=5842.788, sum=15599.364 (3)", - "tab": "General information", - "score": 5199.788 - }, - "QuAC - # output tokens": { - "description": "min=32.106, mean=35.484, max=40.222, sum=106.452 (3)", - "tab": "General information", - "score": 35.484 - }, - "QuAC - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "QuAC - Stereotypes (race)": { - "description": "min=0.604, mean=0.609, max=0.614, sum=1.827 (3)", - "tab": "Bias", - "score": 0.6088490550046614 - }, - "QuAC - Stereotypes (gender)": { - "description": "min=0.405, mean=0.419, max=0.441, sum=1.257 (3)", - "tab": "Bias", - "score": 0.4190730790466706 - }, - "QuAC - Representation (race)": { - "description": "min=0.283, mean=0.321, max=0.341, sum=0.964 (3)", - "tab": "Bias", - "score": 0.32117266495855845 - }, - "QuAC - Representation (gender)": { - "description": "min=0.246, mean=0.248, max=0.249, sum=0.743 (3)", - "tab": "Bias", - "score": 0.24753349327018945 - }, - "QuAC - Toxic fraction": { - "description": "min=0.001, mean=0.002, max=0.002, sum=0.005 (3)", - "tab": "Toxicity", - "score": 0.0016666666666666668 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "HellaSwag", - "source_data": { - "dataset_name": "HellaSwag", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on HellaSwag", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.807, - "details": { - "description": "min=0.807, mean=0.807, max=0.807, sum=0.807 (1)", - "tab": "Accuracy", - "HellaSwag - ECE (10-bin)": { - "description": "min=0.32, mean=0.32, max=0.32, sum=0.32 (1)", - "tab": "Calibration", - "score": null - }, - "HellaSwag - EM (Robustness)": { - "description": "min=0.766, mean=0.766, max=0.766, sum=0.766 (1)", - "tab": "Robustness", - "score": 0.766 - }, - "HellaSwag - EM (Fairness)": { - "description": "min=0.695, mean=0.695, max=0.695, sum=0.695 (1)", - 
"tab": "Fairness", - "score": 0.695 - }, - "HellaSwag - Denoised inference time (s)": { - "description": "min=0.549, mean=0.549, max=0.549, sum=0.549 (1)", - "tab": "Efficiency", - "score": 0.5491151875000004 - }, - "HellaSwag - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "HellaSwag - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag - # prompt tokens": { - "description": "min=87.888, mean=87.888, max=87.888, sum=87.888 (1)", - "tab": "General information", - "score": 87.888 - }, - "HellaSwag - # output tokens": { - "description": "min=1.306, mean=1.306, max=1.306, sum=1.306 (1)", - "tab": "General information", - "score": 1.306 - }, - "HellaSwag - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.558, - "details": { - "description": "min=0.558, mean=0.558, max=0.558, sum=0.558 (1)", - "tab": "Accuracy", - "OpenbookQA - ECE (10-bin)": { - "description": "min=0.244, mean=0.244, max=0.244, sum=0.244 (1)", - "tab": "Calibration", - "score": null - }, - "OpenbookQA - EM (Robustness)": { - "description": "min=0.472, mean=0.472, max=0.472, sum=0.472 (1)", - "tab": "Robustness", - "score": 0.472 - }, - "OpenbookQA - EM (Fairness)": { - "description": "min=0.482, mean=0.482, max=0.482, sum=0.482 (1)", - "tab": "Fairness", - "score": 0.482 - }, - "OpenbookQA - Denoised inference time (s)": { - "description": "min=0.447, mean=0.447, max=0.447, sum=0.447 (1)", - "tab": "Efficiency", - "score": 0.4465652265625003 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=5.27, mean=5.27, max=5.27, sum=5.27 (1)", - "tab": "General information", - "score": 5.27 - }, - "OpenbookQA - # output tokens": { - "description": "min=0.132, mean=0.132, max=0.132, sum=0.132 (1)", - "tab": "General information", - "score": 0.132 - }, - "OpenbookQA - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "TruthfulQA", - "source_data": { - "dataset_name": "TruthfulQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on TruthfulQA", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.368, - "details": { - "description": "min=0.298, mean=0.368, max=0.408, sum=1.472 (4)", - "tab": "Accuracy", - "TruthfulQA - ECE (10-bin)": { - "description": "min=0.052, mean=0.127, max=0.196, sum=0.507 (4)", - "tab": "Calibration", - "score": null - }, - "TruthfulQA - EM (Robustness)": { - "description": "min=0.263, mean=0.326, max=0.388, sum=1.304 (4)", - "tab": "Robustness", - "score": 0.3260703363914373 - }, - "TruthfulQA - EM (Fairness)": { - "description": "min=0.229, mean=0.3, max=0.388, sum=1.202 (4)", - "tab": "Fairness", - "score": 0.3004587155963303 - }, - "TruthfulQA - Denoised inference time (s)": { - "description": "min=0.503, mean=0.568, max=0.603, sum=2.273 (4)", - "tab": "Efficiency", - "score": 0.5683649633565078 - }, - "TruthfulQA - # eval": { - "description": "min=654, mean=654, max=654, sum=2616 (4)", - "tab": "General information", - "score": 654.0 - }, - "TruthfulQA - # train": { - "description": "min=0, mean=3.75, max=5, sum=15 (4)", - "tab": "General information", - "score": 3.75 - }, - "TruthfulQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (4)", - "tab": "General information", - "score": 0.0 - }, - "TruthfulQA - # prompt tokens": { - "description": "min=85.121, mean=404.621, max=529.121, sum=1618.483 (4)", - "tab": "General information", - "score": 404.62079510703364 - }, - "TruthfulQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=4 (4)", - "tab": "General information", - "score": 1.0 - }, - "TruthfulQA - # trials": { - "description": "min=1, mean=2.5, max=3, sum=10 (4)", - "tab": "General information", - "score": 2.5 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MS MARCO (TREC)", - "source_data": { - "dataset_name": "MS MARCO (TREC)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "NDCG@10 on MS MARCO (TREC)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "min=0.625, mean=0.642, max=0.66, sum=1.925 (3)", - "tab": "Accuracy", - "MS MARCO (regular) - RR@10 (Robustness)": { - "description": "min=0.28, mean=0.308, max=0.326, sum=0.925 (3)", - "tab": "Robustness", - "score": null - }, - "MS MARCO (TREC) - NDCG@10 (Robustness)": { - "description": "min=0.589, mean=0.592, max=0.594, sum=1.776 (3)", - "tab": "Robustness", - "score": null - }, - "MS MARCO (regular) - RR@10 (Fairness)": { - "description": "min=0.305, mean=0.345, max=0.369, sum=1.036 (3)", - "tab": "Fairness", - "score": null - }, - "MS MARCO (TREC) - NDCG@10 (Fairness)": { - "description": "min=0.589, mean=0.609, max=0.63, sum=1.828 (3)", - "tab": "Fairness", - "score": null - }, - "MS MARCO (regular) - Denoised inference time (s)": { - "description": "min=0.55, mean=0.578, max=0.599, sum=1.733 (3)", - "tab": "Efficiency", - "score": 0.5778111061197916 - }, - "MS MARCO (TREC) - Denoised inference time (s)": { - "description": "min=0.547, mean=0.587, max=0.608, sum=1.76 (3)", - "tab": "Efficiency", - "score": 0.5865037397044573 - }, - "MS MARCO (regular) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "MS MARCO (regular) 
- # train": { - "description": "min=2, mean=2, max=2, sum=6 (3)", - "tab": "General information", - "score": 2.0 - }, - "MS MARCO (regular) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "MS MARCO (regular) - # prompt tokens": { - "description": "min=495.232, mean=532.565, max=577.232, sum=1597.696 (3)", - "tab": "General information", - "score": 532.5653333333333 - }, - "MS MARCO (regular) - # output tokens": { - "description": "min=1, mean=1.005, max=1.014, sum=3.014 (3)", - "tab": "General information", - "score": 1.0046666666666668 - }, - "MS MARCO (regular) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "MS MARCO (TREC) - # eval": { - "description": "min=43, mean=43, max=43, sum=129 (3)", - "tab": "General information", - "score": 43.0 - }, - "MS MARCO (TREC) - # train": { - "description": "min=2, mean=2, max=2, sum=6 (3)", - "tab": "General information", - "score": 2.0 - }, - "MS MARCO (TREC) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "MS MARCO (TREC) - # prompt tokens": { - "description": "min=478.488, mean=515.822, max=560.488, sum=1547.465 (3)", - "tab": "General information", - "score": 515.8217054263565 - }, - "MS MARCO (TREC) - # output tokens": { - "description": "min=1, mean=1, max=1, sum=3 (3)", - "tab": "General information", - "score": 1.0 - }, - "MS MARCO (TREC) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "MS MARCO (regular) - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - }, - "MS MARCO (TREC) - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CNN/DailyMail", - "source_data": { - "dataset_name": "CNN/DailyMail", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on CNN/DailyMail", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.154, - "details": { - "description": "min=0.142, mean=0.154, max=0.17, sum=0.927 (6)", - "tab": "Accuracy", - "CNN/DailyMail - Denoised inference time (s)": { - "description": "min=3.898, mean=4.076, max=4.414, sum=24.459 (6)", - "tab": "Efficiency", - "score": 4.076441398798879 - }, - "CNN/DailyMail - # eval": { - "description": "min=466, mean=466, max=466, sum=2796 (6)", - "tab": "General information", - "score": 466.0 - }, - "CNN/DailyMail - # train": { - "description": "min=5, mean=5, max=5, sum=30 (6)", - "tab": "General information", - "score": 5.0 - }, - "CNN/DailyMail - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (6)", - "tab": "General information", - "score": 0.0 - }, - "CNN/DailyMail - # prompt tokens": { - "description": "min=1531.586, mean=1549.919, max=1567.586, sum=9299.515 (6)", - "tab": "General information", - "score": 1549.9191702432045 - }, - "CNN/DailyMail - # output tokens": { - "description": "min=54.895, mean=58.035, max=64.039, sum=348.21 (6)", - "tab": "General information", - "score": 58.035050071530755 - }, - "CNN/DailyMail - # trials": { - "description": "min=3, mean=3, max=3, sum=18 (6)", - "tab": "General information", - "score": 3.0 - }, - "CNN/DailyMail - Stereotypes (race)": { - "description": "min=0.6, mean=0.616, max=0.642, sum=3.694 (6)", - "tab": "Bias", - "score": 0.6157343144185249 - }, - "CNN/DailyMail - Stereotypes (gender)": { - "description": "min=0.4, mean=0.412, max=0.426, sum=2.474 (6)", - "tab": "Bias", - "score": 0.41239374128525014 - }, - "CNN/DailyMail - Representation (race)": { - "description": "min=0.241, mean=0.252, max=0.26, sum=1.514 (6)", - "tab": "Bias", - "score": 0.2523476523476524 - }, - "CNN/DailyMail - Representation (gender)": { - "description": "min=0.075, mean=0.093, max=0.102, sum=0.555 (6)", - "tab": "Bias", - "score": 0.09258312556525572 - }, - "CNN/DailyMail - Toxic fraction": { - "description": "min=0, mean=0.001, max=0.002, sum=0.009 (6)", - "tab": "Toxicity", - "score": 0.001430615164520744 - }, - "CNN/DailyMail - SummaC": { - "description": "min=0.473, mean=0.492, max=0.515, sum=1.477 (3)", - "tab": "Summarization metrics", - "score": 0.4923968635744633 - }, - "CNN/DailyMail - QAFactEval": { - "description": "min=4.385, mean=4.692, max=4.898, sum=28.151 (6)", - "tab": "Summarization metrics", - "score": 4.691904356057608 - }, - "CNN/DailyMail - BERTScore (F1)": { - "description": "min=0.315, mean=0.326, max=0.342, sum=0.979 (3)", - "tab": "Summarization metrics", - "score": 0.32642089401655566 - }, - "CNN/DailyMail - Coverage": { - "description": "min=0.953, mean=0.96, max=0.968, sum=5.762 (6)", - "tab": "Summarization metrics", - "score": 0.9602766718208816 - }, - "CNN/DailyMail - Density": { - "description": "min=9.043, mean=10.832, max=14.179, sum=64.991 (6)", - "tab": "Summarization metrics", - "score": 10.831883037736205 - }, - "CNN/DailyMail - Compression": { - "description": "min=10.561, mean=11.89, max=12.628, sum=71.339 (6)", - "tab": "Summarization metrics", - "score": 11.889831050263881 - }, - "CNN/DailyMail - HumanEval-faithfulness": { - "description": "min=0.667, mean=0.667, max=0.667, sum=1.333 (2)", - "tab": "Summarization metrics", - "score": 0.6666666666666666 - }, - "CNN/DailyMail - HumanEval-relevance": { - "description": "min=4, mean=4, max=4, sum=8 (2)", - "tab": "Summarization metrics", - "score": 4.0 - }, - "CNN/DailyMail - HumanEval-coherence": { - "description": "min=2.667, mean=2.667, 
max=2.667, sum=5.333 (2)", - "tab": "Summarization metrics", - "score": 2.6666666666666665 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "XSUM", - "source_data": { - "dataset_name": "XSUM", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on XSUM", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.134, - "details": { - "description": "min=0.131, mean=0.134, max=0.137, sum=0.804 (6)", - "tab": "Accuracy", - "XSUM - Denoised inference time (s)": { - "description": "min=2.357, mean=2.408, max=2.45, sum=14.45 (6)", - "tab": "Efficiency", - "score": 2.408301637575076 - }, - "XSUM - # eval": { - "description": "min=518, mean=518, max=518, sum=3108 (6)", - "tab": "General information", - "score": 518.0 - }, - "XSUM - # train": { - "description": "min=5, mean=5, max=5, sum=30 (6)", - "tab": "General information", - "score": 5.0 - }, - "XSUM - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (6)", - "tab": "General information", - "score": 0.0 - }, - "XSUM - # prompt tokens": { - "description": "min=1456.402, mean=1510.735, max=1539.402, sum=9064.409 (6)", - "tab": "General information", - "score": 1510.734877734878 - }, - "XSUM - # output tokens": { - "description": "min=28.284, mean=28.94, max=29.546, sum=173.637 (6)", - "tab": "General information", - "score": 28.93951093951094 - }, - "XSUM - # trials": { - "description": "min=3, mean=3, max=3, sum=18 (6)", - "tab": "General information", - "score": 3.0 - }, - "XSUM - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=4 (6)", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "XSUM - Stereotypes (gender)": { - "description": "min=0.415, mean=0.439, max=0.454, sum=2.637 (6)", - "tab": "Bias", - "score": 0.43949621664675426 - }, - "XSUM - Representation (race)": { - "description": "min=0.497, mean=0.541, max=0.59, sum=3.246 (6)", - "tab": "Bias", - "score": 0.54094360657117 - }, - "XSUM - Representation (gender)": { - "description": "min=0.189, mean=0.207, max=0.22, sum=1.244 (6)", - "tab": "Bias", - "score": 0.20735056882648284 - }, - "XSUM - Toxic fraction": { - "description": "min=0.002, mean=0.004, max=0.006, sum=0.023 (6)", - "tab": "Toxicity", - "score": 0.0038610038610038615 - }, - "XSUM - SummaC": { - "description": "min=-0.278, mean=-0.271, max=-0.263, sum=-0.812 (3)", - "tab": "Summarization metrics", - "score": -0.2708329675740717 - }, - "XSUM - QAFactEval": { - "description": "min=2.934, mean=3.066, max=3.179, sum=18.394 (6)", - "tab": "Summarization metrics", - "score": 3.0656965498353155 - }, - "XSUM - BERTScore (F1)": { - "description": "min=0.434, mean=0.437, max=0.441, sum=1.311 (3)", - "tab": "Summarization metrics", - "score": 0.4370376831136327 - }, - "XSUM - Coverage": { - "description": "min=0.806, mean=0.808, max=0.811, sum=4.849 (6)", - "tab": "Summarization metrics", - "score": 0.8082245669950062 - }, - "XSUM - Density": { - "description": "min=2.656, mean=2.691, max=2.726, sum=16.146 (6)", - "tab": "Summarization metrics", - "score": 2.6910357109145138 - }, - "XSUM - Compression": { - "description": "min=14.828, mean=15.182, max=15.567, sum=91.094 (6)", - "tab": "Summarization metrics", - "score": 15.182390855675616 - }, - "XSUM - HumanEval-faithfulness": { - "description": 
"min=0.667, mean=0.778, max=0.889, sum=4.667 (6)", - "tab": "Summarization metrics", - "score": 0.7777777777777777 - }, - "XSUM - HumanEval-relevance": { - "description": "min=4.333, mean=4.398, max=4.444, sum=26.389 (6)", - "tab": "Summarization metrics", - "score": 4.398148148148148 - }, - "XSUM - HumanEval-coherence": { - "description": "min=4.889, mean=4.898, max=4.917, sum=29.389 (6)", - "tab": "Summarization metrics", - "score": 4.898148148148149 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "IMDB", - "source_data": { - "dataset_name": "IMDB", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on IMDB", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.934, - "details": { - "description": "min=0.924, mean=0.934, max=0.948, sum=2.802 (3)", - "tab": "Accuracy", - "IMDB - ECE (10-bin)": { - "description": "min=0.01, mean=0.015, max=0.024, sum=0.045 (3)", - "tab": "Calibration", - "score": null - }, - "IMDB - EM (Robustness)": { - "description": "min=0.921, mean=0.928, max=0.94, sum=2.783 (3)", - "tab": "Robustness", - "score": 0.9276666666666666 - }, - "IMDB - EM (Fairness)": { - "description": "min=0.918, mean=0.925, max=0.936, sum=2.775 (3)", - "tab": "Fairness", - "score": 0.9249999999999999 - }, - "IMDB - Denoised inference time (s)": { - "description": "min=0.714, mean=0.79, max=0.897, sum=2.37 (3)", - "tab": "Efficiency", - "score": 0.7899130366753467 - }, - "IMDB - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "IMDB - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "IMDB - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "IMDB - # prompt tokens": { - "description": "min=1155.212, mean=1422.545, max=1836.212, sum=4267.636 (3)", - "tab": "General information", - "score": 1422.5453333333335 - }, - "IMDB - # output tokens": { - "description": "min=1.002, mean=1.014, max=1.02, sum=3.042 (3)", - "tab": "General information", - "score": 1.014 - }, - "IMDB - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "IMDB - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CivilComments", - "source_data": { - "dataset_name": "CivilComments", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - 
"evaluation_description": "EM on CivilComments", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.61, - "details": { - "description": "min=0.182, mean=0.61, max=0.939, sum=32.915 (54)", - "tab": "Accuracy", - "CivilComments - ECE (10-bin)": { - "description": "min=0.048, mean=0.179, max=0.449, sum=9.655 (54)", - "tab": "Calibration", - "score": null - }, - "CivilComments - EM (Robustness)": { - "description": "min=0.105, mean=0.514, max=0.854, sum=27.755 (54)", - "tab": "Robustness", - "score": 0.5139820592784173 - }, - "CivilComments - EM (Fairness)": { - "description": "min=0.105, mean=0.512, max=0.939, sum=27.636 (54)", - "tab": "Fairness", - "score": 0.5117722022150621 - }, - "CivilComments - Denoised inference time (s)": { - "description": "min=0.555, mean=0.594, max=0.756, sum=32.071 (54)", - "tab": "Efficiency", - "score": 0.5939081200798796 - }, - "CivilComments - # eval": { - "description": "min=74, mean=371.556, max=683, sum=20064 (54)", - "tab": "General information", - "score": 371.55555555555554 - }, - "CivilComments - # train": { - "description": "min=5, mean=5, max=5, sum=270 (54)", - "tab": "General information", - "score": 5.0 - }, - "CivilComments - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (54)", - "tab": "General information", - "score": 0.0 - }, - "CivilComments - # prompt tokens": { - "description": "min=356.537, mean=722.635, max=1267.519, sum=39022.317 (54)", - "tab": "General information", - "score": 722.6354931173206 - }, - "CivilComments - # output tokens": { - "description": "min=1, mean=1, max=1, sum=54 (54)", - "tab": "General information", - "score": 1.0 - }, - "CivilComments - # trials": { - "description": "min=3, mean=3, max=3, sum=162 (54)", - "tab": "General information", - "score": 3.0 - }, - "CivilComments - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (54)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "RAFT", - "source_data": { - "dataset_name": "RAFT", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on RAFT", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.699, - "details": { - "description": "min=0.225, mean=0.699, max=0.95, sum=23.075 (33)", - "tab": "Accuracy", - "RAFT - ECE (10-bin)": { - "description": "min=0.071, mean=0.212, max=0.648, sum=7.002 (33)", - "tab": "Calibration", - "score": null - }, - "RAFT - EM (Robustness)": { - "description": "min=0.225, mean=0.6, max=0.95, sum=19.8 (33)", - "tab": "Robustness", - "score": 0.6000000000000001 - }, - "RAFT - EM (Fairness)": { - "description": "min=0.225, mean=0.67, max=0.95, sum=22.1 (33)", - "tab": "Fairness", - "score": 0.6696969696969697 - }, - "RAFT - Denoised inference time (s)": { - "description": "min=0.583, 
mean=0.883, max=2.075, sum=29.139 (33)", - "tab": "Efficiency", - "score": 0.8829963013928345 - }, - "RAFT - # eval": { - "description": "min=40, mean=40, max=40, sum=1320 (33)", - "tab": "General information", - "score": 40.0 - }, - "RAFT - # train": { - "description": "min=5, mean=5, max=5, sum=165 (33)", - "tab": "General information", - "score": 5.0 - }, - "RAFT - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (33)", - "tab": "General information", - "score": 0.0 - }, - "RAFT - # prompt tokens": { - "description": "min=257.35, mean=1279.572, max=6599.65, sum=42225.875 (33)", - "tab": "General information", - "score": 1279.5719696969697 - }, - "RAFT - # output tokens": { - "description": "min=1, mean=2.986, max=5.3, sum=98.55 (33)", - "tab": "General information", - "score": 2.9863636363636363 - }, - "RAFT - # trials": { - "description": "min=3, mean=3, max=3, sum=99 (33)", - "tab": "General information", - "score": 3.0 - }, - "RAFT - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (33)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_classic/ai21/J1-Grande-v1-17B/d3519b2c-2e74-4e5f-8e2a-ab13446d126a.json b/data/helm_classic/ai21/J1-Grande-v1-17B/d3519b2c-2e74-4e5f-8e2a-ab13446d126a.json deleted file mode 100644 index 6a9a41b41..000000000 --- a/data/helm_classic/ai21/J1-Grande-v1-17B/d3519b2c-2e74-4e5f-8e2a-ab13446d126a.json +++ /dev/null @@ -1,1613 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_classic/ai21_J1-Grande-v1-17B/1770834891.1472661", - "retrieved_timestamp": "1770834891.1472661", - "source_metadata": { - "source_name": "helm_classic", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "J1-Grande v1 17B", - "id": "ai21/J1-Grande-v1-17B", - "developer": "ai21", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_classic", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperform on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.433, - "details": { - "tab": "Accuracy", - "Mean win rate - Calibration": { - "description": null, - "tab": "Calibration", - "score": 0.6221919576066971 - }, - "Mean win rate - Robustness": { - "description": null, - "tab": "Robustness", - "score": 0.4225080073800875 - }, - "Mean win rate - Fairness": { - "description": null, - "tab": "Fairness", - "score": 0.4539316449216338 - }, - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.31716008771929827 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - }, - "Mean 
win rate - Bias": { - "description": null, - "tab": "Bias", - "score": 0.5580147362700336 - }, - "Mean win rate - Toxicity": { - "description": null, - "tab": "Toxicity", - "score": 0.6300489633822968 - }, - "Mean win rate - Summarization metrics": { - "description": null, - "tab": "Summarization metrics", - "score": 0.6689640768588138 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.27, - "details": { - "description": "min=0.2, mean=0.27, max=0.35, sum=4.047 (15)", - "tab": "Accuracy", - "MMLU - ECE (10-bin)": { - "description": "min=0.063, mean=0.114, max=0.154, sum=1.708 (15)", - "tab": "Calibration", - "score": 0.11389257817699022 - }, - "MMLU - EM (Robustness)": { - "description": "min=0.15, mean=0.225, max=0.27, sum=3.377 (15)", - "tab": "Robustness", - "score": 0.22511111111111112 - }, - "MMLU - EM (Fairness)": { - "description": "min=0.158, mean=0.232, max=0.29, sum=3.474 (15)", - "tab": "Fairness", - "score": 0.23159064327485382 - }, - "MMLU - Denoised inference time (s)": { - "description": "min=0.381, mean=0.411, max=0.466, sum=6.166 (15)", - "tab": "Efficiency", - "score": 0.41104061293859656 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=1542 (15)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=75 (15)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (15)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=308.59, mean=396.74, max=552.719, sum=5951.098 (15)", - "tab": "General information", - "score": 396.73985964912276 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=15 (15)", - "tab": "General information", - "score": 1.0 - }, - "MMLU - # trials": { - "description": "min=3, mean=3, max=3, sum=45 (15)", - "tab": "General information", - "score": 3.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "BoolQ", - "source_data": { - "dataset_name": "BoolQ", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on BoolQ", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.722, - "details": { - "description": "min=0.712, mean=0.722, max=0.733, sum=2.165 (3)", - "tab": "Accuracy", - "BoolQ - ECE (10-bin)": { - "description": "min=0.139, mean=0.154, max=0.169, sum=0.462 (3)", - "tab": "Calibration", - "score": 0.15409092997354776 - }, - "BoolQ - EM (Robustness)": { - "description": "min=0.632, mean=0.643, max=0.658, sum=1.929 (3)", - "tab": "Robustness", - "score": 0.6429999999999999 - }, - "BoolQ - EM (Fairness)": { - "description": "min=0.656, mean=0.678, max=0.695, sum=2.035 (3)", - "tab": "Fairness", - "score": 0.6783333333333333 - }, - "BoolQ - Denoised inference time (s)": { - 
"description": "min=0.47, mean=0.535, max=0.624, sum=1.606 (3)", - "tab": "Efficiency", - "score": 0.5352501416015627 - }, - "BoolQ - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "BoolQ - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "BoolQ - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "BoolQ - # prompt tokens": { - "description": "min=506.985, mean=694.652, max=952.985, sum=2083.955 (3)", - "tab": "General information", - "score": 694.6516666666666 - }, - "BoolQ - # output tokens": { - "description": "min=2, mean=2, max=2, sum=6 (3)", - "tab": "General information", - "score": 2.0 - }, - "BoolQ - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "BoolQ - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.672, - "details": { - "description": "min=0.664, mean=0.672, max=0.68, sum=2.016 (3)", - "tab": "Accuracy", - "NarrativeQA - ECE (10-bin)": { - "description": "min=0.039, mean=0.047, max=0.062, sum=0.141 (3)", - "tab": "Calibration", - "score": 0.04705310707412085 - }, - "NarrativeQA - F1 (Robustness)": { - "description": "min=0.409, mean=0.477, max=0.522, sum=1.432 (3)", - "tab": "Robustness", - "score": 0.47749086119263257 - }, - "NarrativeQA - F1 (Fairness)": { - "description": "min=0.526, mean=0.547, max=0.563, sum=1.641 (3)", - "tab": "Fairness", - "score": 0.5469545337986748 - }, - "NarrativeQA - Denoised inference time (s)": { - "description": "min=0.892, mean=0.923, max=0.955, sum=2.769 (3)", - "tab": "Efficiency", - "score": 0.9228662338615026 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=1065 (3)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=2.166, mean=2.639, max=3.225, sum=7.918 (3)", - "tab": "General information", - "score": 2.63943661971831 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=1598.614, mean=1692.218, max=1777.299, sum=5076.654 (3)", - "tab": "General information", - "score": 1692.2178403755868 - }, - "NarrativeQA - # output tokens": { - "description": "min=4.324, mean=4.528, max=4.701, sum=13.583 (3)", - "tab": "General information", - "score": 
4.527699530516432 - }, - "NarrativeQA - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NarrativeQA - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NarrativeQA - Stereotypes (gender)": { - "description": "min=0.5, mean=0.5, max=0.5, sum=1 (2)", - "tab": "Bias", - "score": 0.5 - }, - "NarrativeQA - Representation (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=1.333 (2)", - "tab": "Bias", - "score": 0.6666666666666667 - }, - "NarrativeQA - Representation (gender)": { - "description": "min=0.15, mean=0.164, max=0.18, sum=0.491 (3)", - "tab": "Bias", - "score": 0.1636261091893518 - }, - "NarrativeQA - Toxic fraction": { - "description": "min=0.008, mean=0.014, max=0.017, sum=0.042 (3)", - "tab": "Toxicity", - "score": 0.014084507042253521 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (open-book)", - "source_data": { - "dataset_name": "NaturalQuestions (open-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (open-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.578, - "details": { - "description": "min=0.561, mean=0.578, max=0.59, sum=1.734 (3)", - "tab": "Accuracy", - "NaturalQuestions (closed-book) - ECE (10-bin)": { - "description": "min=0.027, mean=0.029, max=0.03, sum=0.087 (3)", - "tab": "Calibration", - "score": 0.028955351873343083 - }, - "NaturalQuestions (open-book) - ECE (10-bin)": { - "description": "min=0.073, mean=0.081, max=0.097, sum=0.243 (3)", - "tab": "Calibration", - "score": 0.08114120238748938 - }, - "NaturalQuestions (closed-book) - F1 (Robustness)": { - "description": "min=0.164, mean=0.17, max=0.175, sum=0.511 (3)", - "tab": "Robustness", - "score": 0.17025794044565556 - }, - "NaturalQuestions (open-book) - F1 (Robustness)": { - "description": "min=0.449, mean=0.478, max=0.494, sum=1.433 (3)", - "tab": "Robustness", - "score": 0.4776074011626843 - }, - "NaturalQuestions (closed-book) - F1 (Fairness)": { - "description": "min=0.185, mean=0.187, max=0.189, sum=0.562 (3)", - "tab": "Fairness", - "score": 0.1872477522460834 - }, - "NaturalQuestions (open-book) - F1 (Fairness)": { - "description": "min=0.501, mean=0.521, max=0.534, sum=1.563 (3)", - "tab": "Fairness", - "score": 0.5209919156580172 - }, - "NaturalQuestions (closed-book) - Denoised inference time (s)": { - "description": "min=0.437, mean=0.466, max=0.494, sum=1.399 (3)", - "tab": "Efficiency", - "score": 0.46640491796874967 - }, - "NaturalQuestions (open-book) - Denoised inference time (s)": { - "description": "min=0.774, mean=0.873, max=0.927, sum=2.618 (3)", - "tab": "Efficiency", - "score": 0.8728225097656246 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt 
tokens": { - "description": "min=94.377, mean=99.377, max=102.377, sum=298.131 (3)", - "tab": "General information", - "score": 99.377 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=4.791, mean=5.971, max=7.18, sum=17.913 (3)", - "tab": "General information", - "score": 5.971 - }, - "NaturalQuestions (closed-book) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.568, mean=4.666, max=4.734, sum=13.999 (3)", - "tab": "General information", - "score": 4.666333333333333 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.038, mean=0.038, max=0.038, sum=0.114 (3)", - "tab": "General information", - "score": 0.038 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1136.933, mean=1418.457, max=1595.508, sum=4255.37 (3)", - "tab": "General information", - "score": 1418.4566666666667 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=6.302, mean=6.538, max=6.976, sum=19.615 (3)", - "tab": "General information", - "score": 6.538333333333333 - }, - "NaturalQuestions (open-book) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NaturalQuestions (closed-book) - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NaturalQuestions (closed-book) - Stereotypes (gender)": { - "description": "min=0.5, mean=0.5, max=0.5, sum=1.5 (3)", - "tab": "Bias", - "score": 0.5 - }, - "NaturalQuestions (closed-book) - Representation (race)": { - "description": "min=0.473, mean=0.521, max=0.556, sum=1.564 (3)", - "tab": "Bias", - "score": 0.5214747518446415 - }, - "NaturalQuestions (closed-book) - Representation (gender)": { - "description": "min=0, mean=0.033, max=0.1, sum=0.1 (3)", - "tab": "Bias", - "score": 0.033333333333333326 - }, - "NaturalQuestions (open-book) - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=0.667 (1)", - "tab": "Bias", - "score": 0.6666666666666667 - }, - "NaturalQuestions (open-book) - Stereotypes (gender)": { - "description": "min=0.346, mean=0.346, max=0.346, sum=1.038 (3)", - "tab": "Bias", - "score": 0.3461538461538461 - }, - "NaturalQuestions (open-book) - Representation (race)": { - "description": "min=0.45, mean=0.488, max=0.521, sum=1.463 (3)", - "tab": "Bias", - "score": 0.48764942579375564 - }, - "NaturalQuestions (open-book) - Representation (gender)": { - "description": "min=0.111, mean=0.113, max=0.118, sum=0.34 (3)", - "tab": "Bias", - "score": 0.11339991677070331 - }, - "NaturalQuestions (closed-book) - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "Toxicity", - "score": 0.0 - }, - "NaturalQuestions (open-book) - Toxic fraction": { - "description": "min=0, mean=0.001, max=0.001, sum=0.002 (3)", - "tab": "Toxicity", - "score": 0.0006666666666666666 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "QuAC", - "source_data": { - "dataset_name": "QuAC", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - 
"evaluation_description": "F1 on QuAC", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.362, - "details": { - "description": "min=0.355, mean=0.362, max=0.372, sum=1.087 (3)", - "tab": "Accuracy", - "QuAC - ECE (10-bin)": { - "description": "min=0.019, mean=0.036, max=0.06, sum=0.107 (3)", - "tab": "Calibration", - "score": 0.03571925908384949 - }, - "QuAC - F1 (Robustness)": { - "description": "min=0.215, mean=0.219, max=0.227, sum=0.658 (3)", - "tab": "Robustness", - "score": 0.21921244416502939 - }, - "QuAC - F1 (Fairness)": { - "description": "min=0.266, mean=0.274, max=0.282, sum=0.821 (3)", - "tab": "Fairness", - "score": 0.27362985580399246 - }, - "QuAC - Denoised inference time (s)": { - "description": "min=1.302, mean=1.413, max=1.478, sum=4.24 (3)", - "tab": "Efficiency", - "score": 1.4134776341145843 - }, - "QuAC - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "QuAC - # train": { - "description": "min=1.788, mean=1.829, max=1.88, sum=5.486 (3)", - "tab": "General information", - "score": 1.8286666666666667 - }, - "QuAC - truncated": { - "description": "min=0.001, mean=0.001, max=0.001, sum=0.003 (3)", - "tab": "General information", - "score": 0.001 - }, - "QuAC - # prompt tokens": { - "description": "min=1645.856, mean=1698.711, max=1730.814, sum=5096.134 (3)", - "tab": "General information", - "score": 1698.7113333333334 - }, - "QuAC - # output tokens": { - "description": "min=22.154, mean=27.786, max=31.692, sum=83.357 (3)", - "tab": "General information", - "score": 27.785666666666668 - }, - "QuAC - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "QuAC - Stereotypes (race)": { - "description": "min=0.58, mean=0.6, max=0.639, sum=1.799 (3)", - "tab": "Bias", - "score": 0.5996635891593876 - }, - "QuAC - Stereotypes (gender)": { - "description": "min=0.415, mean=0.428, max=0.44, sum=1.283 (3)", - "tab": "Bias", - "score": 0.42780085419627883 - }, - "QuAC - Representation (race)": { - "description": "min=0.298, mean=0.34, max=0.378, sum=1.019 (3)", - "tab": "Bias", - "score": 0.3397817992618246 - }, - "QuAC - Representation (gender)": { - "description": "min=0.237, mean=0.242, max=0.25, sum=0.727 (3)", - "tab": "Bias", - "score": 0.24231770708576347 - }, - "QuAC - Toxic fraction": { - "description": "min=0.004, mean=0.004, max=0.004, sum=0.012 (3)", - "tab": "Toxicity", - "score": 0.004 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "HellaSwag", - "source_data": { - "dataset_name": "HellaSwag", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on HellaSwag", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.739, - "details": { - "description": "min=0.739, mean=0.739, max=0.739, sum=0.739 (1)", - "tab": "Accuracy", - "HellaSwag - ECE (10-bin)": { - "description": "min=0.213, mean=0.213, max=0.213, sum=0.213 (1)", - "tab": "Calibration", - "score": 0.21338082493857388 - }, - "HellaSwag - EM (Robustness)": { - "description": "min=0.695, mean=0.695, max=0.695, sum=0.695 (1)", - "tab": "Robustness", - "score": 0.695 - }, - "HellaSwag - EM (Fairness)": 
{ - "description": "min=0.58, mean=0.58, max=0.58, sum=0.58 (1)", - "tab": "Fairness", - "score": 0.58 - }, - "HellaSwag - Denoised inference time (s)": { - "description": "min=0.33, mean=0.33, max=0.33, sum=0.33 (1)", - "tab": "Efficiency", - "score": 0.3304377109375 - }, - "HellaSwag - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "HellaSwag - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag - # prompt tokens": { - "description": "min=62.466, mean=62.466, max=62.466, sum=62.466 (1)", - "tab": "General information", - "score": 62.466 - }, - "HellaSwag - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.52, - "details": { - "description": "min=0.52, mean=0.52, max=0.52, sum=0.52 (1)", - "tab": "Accuracy", - "OpenbookQA - ECE (10-bin)": { - "description": "min=0.258, mean=0.258, max=0.258, sum=0.258 (1)", - "tab": "Calibration", - "score": 0.25849314658751343 - }, - "OpenbookQA - EM (Robustness)": { - "description": "min=0.424, mean=0.424, max=0.424, sum=0.424 (1)", - "tab": "Robustness", - "score": 0.424 - }, - "OpenbookQA - EM (Fairness)": { - "description": "min=0.472, mean=0.472, max=0.472, sum=0.472 (1)", - "tab": "Fairness", - "score": 0.472 - }, - "OpenbookQA - Denoised inference time (s)": { - "description": "min=0.281, mean=0.281, max=0.281, sum=0.281 (1)", - "tab": "Efficiency", - "score": 0.280719578125 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=4.348, mean=4.348, max=4.348, sum=4.348 (1)", - "tab": "General information", - "score": 4.348 - }, - "OpenbookQA - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "TruthfulQA", - "source_data": { - "dataset_name": "TruthfulQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - 
"evaluation_description": "EM on TruthfulQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.193, - "details": { - "description": "min=0.171, mean=0.193, max=0.217, sum=0.58 (3)", - "tab": "Accuracy", - "TruthfulQA - ECE (10-bin)": { - "description": "min=0.064, mean=0.091, max=0.109, sum=0.273 (3)", - "tab": "Calibration", - "score": 0.09083831911084679 - }, - "TruthfulQA - EM (Robustness)": { - "description": "min=0.116, mean=0.142, max=0.159, sum=0.425 (3)", - "tab": "Robustness", - "score": 0.1416921508664628 - }, - "TruthfulQA - EM (Fairness)": { - "description": "min=0.138, mean=0.163, max=0.182, sum=0.489 (3)", - "tab": "Fairness", - "score": 0.16309887869520898 - }, - "TruthfulQA - Denoised inference time (s)": { - "description": "min=0.384, mean=0.396, max=0.403, sum=1.189 (3)", - "tab": "Efficiency", - "score": 0.39626294915902127 - }, - "TruthfulQA - # eval": { - "description": "min=654, mean=654, max=654, sum=1962 (3)", - "tab": "General information", - "score": 654.0 - }, - "TruthfulQA - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "TruthfulQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "TruthfulQA - # prompt tokens": { - "description": "min=317.682, mean=355.015, max=375.682, sum=1065.046 (3)", - "tab": "General information", - "score": 355.0152905198777 - }, - "TruthfulQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=3 (3)", - "tab": "General information", - "score": 1.0 - }, - "TruthfulQA - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MS MARCO (TREC)", - "source_data": { - "dataset_name": "MS MARCO (TREC)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "NDCG@10 on MS MARCO (TREC)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.341, - "details": { - "description": "min=0.31, mean=0.341, max=0.389, sum=1.022 (3)", - "tab": "Accuracy", - "MS MARCO (regular) - RR@10 (Robustness)": { - "description": "min=0.105, mean=0.121, max=0.133, sum=0.362 (3)", - "tab": "Robustness", - "score": 0.12069748677248683 - }, - "MS MARCO (TREC) - NDCG@10 (Robustness)": { - "description": "min=0.27, mean=0.297, max=0.328, sum=0.89 (3)", - "tab": "Robustness", - "score": 0.29680328755123014 - }, - "MS MARCO (regular) - RR@10 (Fairness)": { - "description": "min=0.126, mean=0.138, max=0.155, sum=0.414 (3)", - "tab": "Fairness", - "score": 0.1378972222222222 - }, - "MS MARCO (TREC) - NDCG@10 (Fairness)": { - "description": "min=0.296, mean=0.328, max=0.372, sum=0.985 (3)", - "tab": "Fairness", - "score": 0.3284974893691146 - }, - "MS MARCO (regular) - Denoised inference time (s)": { - "description": "min=0.415, mean=0.428, max=0.44, sum=1.283 (3)", - "tab": "Efficiency", - "score": 0.4278073636067708 - }, - "MS MARCO (TREC) - Denoised inference time (s)": { - "description": "min=0.412, mean=0.424, max=0.437, sum=1.272 (3)", - "tab": "Efficiency", - "score": 0.42392066375968995 - }, - "MS MARCO (regular) - # eval": { - "description": 
"min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "MS MARCO (regular) - # train": { - "description": "min=2, mean=2, max=2, sum=6 (3)", - "tab": "General information", - "score": 2.0 - }, - "MS MARCO (regular) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "MS MARCO (regular) - # prompt tokens": { - "description": "min=349.303, mean=385.636, max=423.303, sum=1156.909 (3)", - "tab": "General information", - "score": 385.63633333333337 - }, - "MS MARCO (regular) - # output tokens": { - "description": "min=2.004, mean=2.011, max=2.023, sum=6.034 (3)", - "tab": "General information", - "score": 2.0113333333333334 - }, - "MS MARCO (regular) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "MS MARCO (TREC) - # eval": { - "description": "min=43, mean=43, max=43, sum=129 (3)", - "tab": "General information", - "score": 43.0 - }, - "MS MARCO (TREC) - # train": { - "description": "min=2, mean=2, max=2, sum=6 (3)", - "tab": "General information", - "score": 2.0 - }, - "MS MARCO (TREC) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "MS MARCO (TREC) - # prompt tokens": { - "description": "min=337.047, mean=373.38, max=411.047, sum=1120.14 (3)", - "tab": "General information", - "score": 373.3798449612403 - }, - "MS MARCO (TREC) - # output tokens": { - "description": "min=2.023, mean=2.023, max=2.023, sum=6.07 (3)", - "tab": "General information", - "score": 2.0232558139534884 - }, - "MS MARCO (TREC) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "MS MARCO (regular) - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - }, - "MS MARCO (TREC) - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CNN/DailyMail", - "source_data": { - "dataset_name": "CNN/DailyMail", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on CNN/DailyMail", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.143, - "details": { - "description": "min=0.127, mean=0.143, max=0.163, sum=0.859 (6)", - "tab": "Accuracy", - "CNN/DailyMail - Denoised inference time (s)": { - "description": "min=1.956, mean=2.074, max=2.263, sum=12.445 (6)", - "tab": "Efficiency", - "score": 2.074164002425339 - }, - "CNN/DailyMail - # eval": { - "description": "min=466, mean=466, max=466, sum=2796 (6)", - "tab": "General information", - "score": 466.0 - }, - "CNN/DailyMail - # train": { - "description": "min=5, mean=5, max=5, sum=30 (6)", - "tab": "General information", - "score": 5.0 - }, - "CNN/DailyMail - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (6)", - "tab": "General information", - "score": 0.0 - }, - "CNN/DailyMail - # prompt tokens": { - "description": "min=1203.032, mean=1213.032, max=1224.032, sum=7278.193 (6)", - "tab": "General information", - "score": 1213.0321888412018 - }, - "CNN/DailyMail - # output tokens": { - "description": "min=61.569, mean=67.049, max=76.034, sum=402.296 (6)", - "tab": "General information", - "score": 67.04935622317596 - }, - "CNN/DailyMail - # trials": { - "description": "min=3, mean=3, max=3, sum=18 (6)", - "tab": "General information", - "score": 3.0 - }, - "CNN/DailyMail - Stereotypes (race)": { - "description": "min=0.608, mean=0.633, max=0.647, sum=3.801 (6)", - "tab": "Bias", - "score": 0.6334968330766649 - }, - "CNN/DailyMail - Stereotypes (gender)": { - "description": "min=0.39, mean=0.4, max=0.407, sum=2.398 (6)", - "tab": "Bias", - "score": 0.39959768497778553 - }, - "CNN/DailyMail - Representation (race)": { - "description": "min=0.263, mean=0.351, max=0.399, sum=2.104 (6)", - "tab": "Bias", - "score": 0.3506178570090534 - }, - "CNN/DailyMail - Representation (gender)": { - "description": "min=0.115, mean=0.13, max=0.14, sum=0.782 (6)", - "tab": "Bias", - "score": 0.1303299541894603 - }, - "CNN/DailyMail - Toxic fraction": { - "description": "min=0, mean=0.001, max=0.002, sum=0.009 (6)", - "tab": "Toxicity", - "score": 0.001430615164520744 - }, - "CNN/DailyMail - SummaC": { - "description": "min=0.514, mean=0.539, max=0.586, sum=1.617 (3)", - "tab": "Summarization metrics", - "score": 0.5391092885196874 - }, - "CNN/DailyMail - QAFactEval": { - "description": "min=4.706, mean=4.81, max=4.896, sum=28.859 (6)", - "tab": "Summarization metrics", - "score": 4.809910581145076 - }, - "CNN/DailyMail - BERTScore (F1)": { - "description": "min=0.247, mean=0.275, max=0.302, sum=0.824 (3)", - "tab": "Summarization metrics", - "score": 0.2747429286177279 - }, - "CNN/DailyMail - Coverage": { - "description": "min=0.966, mean=0.973, max=0.984, sum=5.84 (6)", - "tab": "Summarization metrics", - "score": 0.9733042514029583 - }, - "CNN/DailyMail - Density": { - "description": "min=31.118, mean=41.027, max=60.066, sum=246.163 (6)", - "tab": "Summarization metrics", - "score": 41.02711755812993 - }, - "CNN/DailyMail - Compression": { - "description": "min=8.092, mean=9.888, max=11.258, sum=59.326 (6)", - "tab": "Summarization metrics", - "score": 9.887609814491976 - }, - "CNN/DailyMail - HumanEval-faithfulness": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail 
- HumanEval-relevance": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-coherence": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "XSUM", - "source_data": { - "dataset_name": "XSUM", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on XSUM", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.122, - "details": { - "description": "min=0.118, mean=0.122, max=0.127, sum=0.733 (6)", - "tab": "Accuracy", - "XSUM - Denoised inference time (s)": { - "description": "min=1.055, mean=1.07, max=1.082, sum=6.42 (6)", - "tab": "Efficiency", - "score": 1.0700079645773009 - }, - "XSUM - # eval": { - "description": "min=518, mean=518, max=518, sum=3108 (6)", - "tab": "General information", - "score": 518.0 - }, - "XSUM - # train": { - "description": "min=5, mean=5, max=5, sum=30 (6)", - "tab": "General information", - "score": 5.0 - }, - "XSUM - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (6)", - "tab": "General information", - "score": 0.0 - }, - "XSUM - # prompt tokens": { - "description": "min=1099.388, mean=1133.388, max=1172.388, sum=6800.328 (6)", - "tab": "General information", - "score": 1133.388030888031 - }, - "XSUM - # output tokens": { - "description": "min=19.975, mean=20.468, max=21.141, sum=122.807 (6)", - "tab": "General information", - "score": 20.467824967824967 - }, - "XSUM - # trials": { - "description": "min=3, mean=3, max=3, sum=18 (6)", - "tab": "General information", - "score": 3.0 - }, - "XSUM - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=4 (6)", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "XSUM - Stereotypes (gender)": { - "description": "min=0.417, mean=0.442, max=0.485, sum=2.652 (6)", - "tab": "Bias", - "score": 0.44203142536475876 - }, - "XSUM - Representation (race)": { - "description": "min=0.439, mean=0.557, max=0.667, sum=3.34 (6)", - "tab": "Bias", - "score": 0.5566296694116243 - }, - "XSUM - Representation (gender)": { - "description": "min=0.149, mean=0.171, max=0.211, sum=1.025 (6)", - "tab": "Bias", - "score": 0.17086307216738958 - }, - "XSUM - Toxic fraction": { - "description": "min=0, mean=0.002, max=0.004, sum=0.012 (6)", - "tab": "Toxicity", - "score": 0.0019305019305019308 - }, - "XSUM - SummaC": { - "description": "min=-0.282, mean=-0.272, max=-0.264, sum=-0.815 (3)", - "tab": "Summarization metrics", - "score": -0.2715132814883572 - }, - "XSUM - QAFactEval": { - "description": "min=3.221, mean=3.447, max=3.575, sum=20.68 (6)", - "tab": "Summarization metrics", - "score": 3.446713620425662 - }, - "XSUM - BERTScore (F1)": { - "description": "min=0.424, mean=0.429, max=0.434, sum=1.287 (3)", - "tab": "Summarization metrics", - "score": 0.4288941077256343 - }, - "XSUM - Coverage": { - "description": "min=0.78, mean=0.783, max=0.785, sum=4.696 (6)", - "tab": "Summarization metrics", - "score": 0.7826042118856411 - }, - "XSUM - Density": { - "description": "min=2.514, mean=2.64, max=2.767, sum=15.838 (6)", - "tab": "Summarization metrics", - "score": 2.6397086455700927 - }, - "XSUM - Compression": { - 
"description": "min=18.382, mean=19.012, max=19.445, sum=114.069 (6)", - "tab": "Summarization metrics", - "score": 19.011567725134377 - }, - "XSUM - HumanEval-faithfulness": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-relevance": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-coherence": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "IMDB", - "source_data": { - "dataset_name": "IMDB", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on IMDB", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.953, - "details": { - "description": "min=0.947, mean=0.953, max=0.957, sum=2.859 (3)", - "tab": "Accuracy", - "IMDB - ECE (10-bin)": { - "description": "min=0.152, mean=0.158, max=0.166, sum=0.473 (3)", - "tab": "Calibration", - "score": 0.15775206410447826 - }, - "IMDB - EM (Robustness)": { - "description": "min=0.932, mean=0.941, max=0.948, sum=2.822 (3)", - "tab": "Robustness", - "score": 0.9406666666666667 - }, - "IMDB - EM (Fairness)": { - "description": "min=0.94, mean=0.946, max=0.95, sum=2.839 (3)", - "tab": "Fairness", - "score": 0.9463333333333331 - }, - "IMDB - Denoised inference time (s)": { - "description": "min=0.59, mean=0.732, max=0.881, sum=2.197 (3)", - "tab": "Efficiency", - "score": 0.7321998525390631 - }, - "IMDB - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "IMDB - # train": { - "description": "min=4.915, mean=4.972, max=5, sum=14.915 (3)", - "tab": "General information", - "score": 4.971666666666667 - }, - "IMDB - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "IMDB - # prompt tokens": { - "description": "min=853.851, mean=1281.577, max=1725.03, sum=3844.732 (3)", - "tab": "General information", - "score": 1281.5773333333334 - }, - "IMDB - # output tokens": { - "description": "min=2, mean=2, max=2, sum=6 (3)", - "tab": "General information", - "score": 2.0 - }, - "IMDB - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "IMDB - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CivilComments", - "source_data": { - "dataset_name": "CivilComments", - "source_type": "url", - 
"url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on CivilComments", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.529, - "details": { - "description": "min=0.014, mean=0.529, max=0.991, sum=28.55 (54)", - "tab": "Accuracy", - "CivilComments - ECE (10-bin)": { - "description": "min=0.228, mean=0.408, max=0.593, sum=22.008 (54)", - "tab": "Calibration", - "score": 0.4075612338805137 - }, - "CivilComments - EM (Robustness)": { - "description": "min=0.014, mean=0.417, max=0.938, sum=22.51 (54)", - "tab": "Robustness", - "score": 0.41686056018907397 - }, - "CivilComments - EM (Fairness)": { - "description": "min=0.014, mean=0.482, max=0.962, sum=26.023 (54)", - "tab": "Fairness", - "score": 0.4819034071645267 - }, - "CivilComments - Denoised inference time (s)": { - "description": "min=0.418, mean=0.482, max=0.621, sum=26.002 (54)", - "tab": "Efficiency", - "score": 0.48152748003997736 - }, - "CivilComments - # eval": { - "description": "min=74, mean=371.556, max=683, sum=20064 (54)", - "tab": "General information", - "score": 371.55555555555554 - }, - "CivilComments - # train": { - "description": "min=5, mean=5, max=5, sum=270 (54)", - "tab": "General information", - "score": 5.0 - }, - "CivilComments - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (54)", - "tab": "General information", - "score": 0.0 - }, - "CivilComments - # prompt tokens": { - "description": "min=271.927, mean=532.602, max=942.498, sum=28760.487 (54)", - "tab": "General information", - "score": 532.6016121330534 - }, - "CivilComments - # output tokens": { - "description": "min=2, mean=2, max=2, sum=108 (54)", - "tab": "General information", - "score": 2.0 - }, - "CivilComments - # trials": { - "description": "min=3, mean=3, max=3, sum=162 (54)", - "tab": "General information", - "score": 3.0 - }, - "CivilComments - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (54)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "RAFT", - "source_data": { - "dataset_name": "RAFT", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on RAFT", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.658, - "details": { - "description": "min=0.2, mean=0.658, max=0.975, sum=21.7 (33)", - "tab": "Accuracy", - "RAFT - ECE (10-bin)": { - "description": "min=0.113, mean=0.244, max=0.466, sum=8.048 (33)", - "tab": "Calibration", - "score": 0.24386423436086976 - }, - "RAFT - EM (Robustness)": { - "description": "min=0.025, mean=0.513, max=0.775, sum=16.925 (33)", - "tab": "Robustness", - "score": 0.5128787878787878 - }, - "RAFT - EM (Fairness)": { - 
"description": "min=0.175, mean=0.636, max=0.975, sum=21 (33)", - "tab": "Fairness", - "score": 0.6363636363636364 - }, - "RAFT - Denoised inference time (s)": { - "description": "min=0.401, mean=0.59, max=0.888, sum=19.483 (33)", - "tab": "Efficiency", - "score": 0.5903971827651516 - }, - "RAFT - # eval": { - "description": "min=40, mean=40, max=40, sum=1320 (33)", - "tab": "General information", - "score": 40.0 - }, - "RAFT - # train": { - "description": "min=0.95, mean=4.658, max=5, sum=153.7 (33)", - "tab": "General information", - "score": 4.657575757575757 - }, - "RAFT - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (33)", - "tab": "General information", - "score": 0.0 - }, - "RAFT - # prompt tokens": { - "description": "min=212.25, mean=712.248, max=1745.25, sum=23504.175 (33)", - "tab": "General information", - "score": 712.2477272727273 - }, - "RAFT - # output tokens": { - "description": "min=1.95, mean=3.59, max=6.575, sum=118.475 (33)", - "tab": "General information", - "score": 3.590151515151515 - }, - "RAFT - # trials": { - "description": "min=3, mean=3, max=3, sum=99 (33)", - "tab": "General information", - "score": 3.0 - }, - "RAFT - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (33)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_classic/ai21/J1-Grande-v2-beta-17B/1f2516b9-54b6-4dcf-a575-734c0d0b17b5.json b/data/helm_classic/ai21/J1-Grande-v2-beta-17B/1f2516b9-54b6-4dcf-a575-734c0d0b17b5.json deleted file mode 100644 index 30c92ab94..000000000 --- a/data/helm_classic/ai21/J1-Grande-v2-beta-17B/1f2516b9-54b6-4dcf-a575-734c0d0b17b5.json +++ /dev/null @@ -1,1613 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_classic/ai21_J1-Grande-v2-beta-17B/1770834891.1472661", - "retrieved_timestamp": "1770834891.1472661", - "source_metadata": { - "source_name": "helm_classic", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "J1-Grande v2 beta 17B", - "id": "ai21/J1-Grande-v2-beta-17B", - "developer": "ai21", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_classic", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperform on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.706, - "details": { - "tab": "Accuracy", - "Mean win rate - Calibration": { - "description": null, - "tab": "Calibration", - "score": 0.6340622537431048 - }, - "Mean win rate - Robustness": { - "description": null, - "tab": "Robustness", - "score": 0.7106770870953296 - }, - "Mean win rate - Fairness": { - "description": null, - "tab": "Fairness", - "score": 0.6771299149497148 - 
}, - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": null - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - }, - "Mean win rate - Bias": { - "description": null, - "tab": "Bias", - "score": 0.5919924787763542 - }, - "Mean win rate - Toxicity": { - "description": null, - "tab": "Toxicity", - "score": 0.5063399563399563 - }, - "Mean win rate - Summarization metrics": { - "description": null, - "tab": "Summarization metrics", - "score": 0.6776315789473685 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.445, - "details": { - "description": "min=0.23, mean=0.445, max=0.8, sum=6.677 (15)", - "tab": "Accuracy", - "MMLU - ECE (10-bin)": { - "description": "min=0.067, mean=0.139, max=0.205, sum=2.09 (15)", - "tab": "Calibration", - "score": 0.13930239849591303 - }, - "MMLU - EM (Robustness)": { - "description": "min=0.2, mean=0.392, max=0.73, sum=5.887 (15)", - "tab": "Robustness", - "score": 0.39245614035087717 - }, - "MMLU - EM (Fairness)": { - "description": "min=0.19, mean=0.409, max=0.77, sum=6.142 (15)", - "tab": "Fairness", - "score": 0.4094619883040936 - }, - "MMLU - Denoised inference time (s)": { - "description": "5 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=1542 (15)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=75 (15)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (15)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=308.59, mean=396.74, max=552.719, sum=5951.098 (15)", - "tab": "General information", - "score": 396.73985964912276 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=15 (15)", - "tab": "General information", - "score": 1.0 - }, - "MMLU - # trials": { - "description": "min=3, mean=3, max=3, sum=45 (15)", - "tab": "General information", - "score": 3.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "BoolQ", - "source_data": { - "dataset_name": "BoolQ", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on BoolQ", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.812, - "details": { - "description": "min=0.799, mean=0.812, max=0.823, sum=2.437 (3)", - "tab": "Accuracy", - "BoolQ - ECE (10-bin)": { - "description": "min=0.155, mean=0.167, max=0.185, sum=0.5 (3)", - "tab": "Calibration", - "score": 0.16655399552246586 - }, - "BoolQ - EM (Robustness)": { - "description": "min=0.669, mean=0.692, max=0.714, sum=2.077 (3)", - "tab": "Robustness", - "score": 0.6923333333333334 - }, - 
"BoolQ - EM (Fairness)": { - "description": "min=0.751, mean=0.764, max=0.784, sum=2.291 (3)", - "tab": "Fairness", - "score": 0.7636666666666668 - }, - "BoolQ - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "BoolQ - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "BoolQ - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "BoolQ - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "BoolQ - # prompt tokens": { - "description": "min=506.985, mean=694.652, max=952.985, sum=2083.955 (3)", - "tab": "General information", - "score": 694.6516666666666 - }, - "BoolQ - # output tokens": { - "description": "min=2, mean=2, max=2, sum=6 (3)", - "tab": "General information", - "score": 2.0 - }, - "BoolQ - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "BoolQ - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.725, - "details": { - "description": "min=0.712, mean=0.725, max=0.736, sum=2.176 (3)", - "tab": "Accuracy", - "NarrativeQA - ECE (10-bin)": { - "description": "min=0.034, mean=0.041, max=0.05, sum=0.122 (3)", - "tab": "Calibration", - "score": 0.040831012535009516 - }, - "NarrativeQA - F1 (Robustness)": { - "description": "min=0.484, mean=0.565, max=0.616, sum=1.694 (3)", - "tab": "Robustness", - "score": 0.5646966401263148 - }, - "NarrativeQA - F1 (Fairness)": { - "description": "min=0.622, mean=0.647, max=0.665, sum=1.941 (3)", - "tab": "Fairness", - "score": 0.6470593497686433 - }, - "NarrativeQA - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=1065 (3)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=2.166, mean=2.639, max=3.225, sum=7.918 (3)", - "tab": "General information", - "score": 2.63943661971831 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=1598.614, mean=1692.218, max=1777.299, sum=5076.654 (3)", - "tab": "General information", - "score": 1692.2178403755868 - }, - 
"NarrativeQA - # output tokens": { - "description": "min=4.194, mean=4.6, max=5.011, sum=13.8 (3)", - "tab": "General information", - "score": 4.6 - }, - "NarrativeQA - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NarrativeQA - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NarrativeQA - Stereotypes (gender)": { - "description": "min=0.25, mean=0.3, max=0.4, sum=0.9 (3)", - "tab": "Bias", - "score": 0.3 - }, - "NarrativeQA - Representation (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=0.667 (1)", - "tab": "Bias", - "score": 0.6666666666666667 - }, - "NarrativeQA - Representation (gender)": { - "description": "min=0.156, mean=0.179, max=0.205, sum=0.536 (3)", - "tab": "Bias", - "score": 0.1787801116945903 - }, - "NarrativeQA - Toxic fraction": { - "description": "min=0.008, mean=0.014, max=0.017, sum=0.042 (3)", - "tab": "Toxicity", - "score": 0.014084507042253521 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (open-book)", - "source_data": { - "dataset_name": "NaturalQuestions (open-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (open-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.625, - "details": { - "description": "min=0.622, mean=0.625, max=0.628, sum=1.874 (3)", - "tab": "Accuracy", - "NaturalQuestions (closed-book) - ECE (10-bin)": { - "description": "min=0.031, mean=0.036, max=0.043, sum=0.107 (3)", - "tab": "Calibration", - "score": 0.035782131071618734 - }, - "NaturalQuestions (open-book) - ECE (10-bin)": { - "description": "min=0.051, mean=0.065, max=0.075, sum=0.196 (3)", - "tab": "Calibration", - "score": 0.06520649617008285 - }, - "NaturalQuestions (closed-book) - F1 (Robustness)": { - "description": "min=0.23, mean=0.235, max=0.241, sum=0.705 (3)", - "tab": "Robustness", - "score": 0.2349124459413927 - }, - "NaturalQuestions (open-book) - F1 (Robustness)": { - "description": "min=0.556, mean=0.56, max=0.568, sum=1.681 (3)", - "tab": "Robustness", - "score": 0.5603824984507094 - }, - "NaturalQuestions (closed-book) - F1 (Fairness)": { - "description": "min=0.269, mean=0.27, max=0.27, sum=0.81 (3)", - "tab": "Fairness", - "score": 0.269872960171523 - }, - "NaturalQuestions (open-book) - F1 (Fairness)": { - "description": "min=0.562, mean=0.571, max=0.578, sum=1.714 (3)", - "tab": "Fairness", - "score": 0.5712438797598854 - }, - "NaturalQuestions (closed-book) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NaturalQuestions (open-book) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General 
information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=94.377, mean=99.377, max=102.377, sum=298.131 (3)", - "tab": "General information", - "score": 99.377 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=4.471, mean=5.282, max=6.145, sum=15.846 (3)", - "tab": "General information", - "score": 5.282 - }, - "NaturalQuestions (closed-book) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.568, mean=4.666, max=4.734, sum=13.999 (3)", - "tab": "General information", - "score": 4.666333333333333 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.038, mean=0.038, max=0.038, sum=0.114 (3)", - "tab": "General information", - "score": 0.038 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1136.933, mean=1418.457, max=1595.508, sum=4255.37 (3)", - "tab": "General information", - "score": 1418.4566666666667 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=5.132, mean=5.27, max=5.521, sum=15.809 (3)", - "tab": "General information", - "score": 5.269666666666667 - }, - "NaturalQuestions (open-book) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NaturalQuestions (closed-book) - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=1.333 (2)", - "tab": "Bias", - "score": 0.6666666666666667 - }, - "NaturalQuestions (closed-book) - Stereotypes (gender)": { - "description": "min=0.5, mean=0.5, max=0.5, sum=1.5 (3)", - "tab": "Bias", - "score": 0.5 - }, - "NaturalQuestions (closed-book) - Representation (race)": { - "description": "min=0.288, mean=0.392, max=0.491, sum=1.177 (3)", - "tab": "Bias", - "score": 0.3923268084547134 - }, - "NaturalQuestions (closed-book) - Representation (gender)": { - "description": "min=0.026, mean=0.174, max=0.318, sum=0.522 (3)", - "tab": "Bias", - "score": 0.17397232083140401 - }, - "NaturalQuestions (open-book) - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=0.667 (1)", - "tab": "Bias", - "score": 0.6666666666666667 - }, - "NaturalQuestions (open-book) - Stereotypes (gender)": { - "description": "min=0.1, mean=0.167, max=0.3, sum=0.5 (3)", - "tab": "Bias", - "score": 0.16666666666666666 - }, - "NaturalQuestions (open-book) - Representation (race)": { - "description": "min=0.478, mean=0.488, max=0.498, sum=1.465 (3)", - "tab": "Bias", - "score": 0.48822694742885336 - }, - "NaturalQuestions (open-book) - Representation (gender)": { - "description": "min=0.369, mean=0.381, max=0.394, sum=1.143 (3)", - "tab": "Bias", - "score": 0.38112988257848074 - }, - "NaturalQuestions (closed-book) - Toxic fraction": { - "description": "min=0.001, mean=0.001, max=0.002, sum=0.004 (3)", - "tab": "Toxicity", - "score": 0.0013333333333333333 - }, - "NaturalQuestions (open-book) - Toxic fraction": { - "description": "min=0.001, mean=0.001, max=0.001, sum=0.003 (3)", - "tab": "Toxicity", - "score": 0.001 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "QuAC", - "source_data": { - "dataset_name": "QuAC", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on QuAC", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.392, - "details": { - "description": "min=0.375, mean=0.392, max=0.411, sum=1.177 (3)", - "tab": "Accuracy", - "QuAC - ECE (10-bin)": { - "description": "min=0.031, mean=0.04, max=0.051, sum=0.121 (3)", - "tab": "Calibration", - "score": 0.04046561186462396 - }, - "QuAC - F1 (Robustness)": { - "description": "min=0.232, mean=0.251, max=0.261, sum=0.752 (3)", - "tab": "Robustness", - "score": 0.2506588392587418 - }, - "QuAC - F1 (Fairness)": { - "description": "min=0.297, mean=0.308, max=0.319, sum=0.923 (3)", - "tab": "Fairness", - "score": 0.30759220119907554 - }, - "QuAC - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "QuAC - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "QuAC - # train": { - "description": "min=1.788, mean=1.829, max=1.88, sum=5.486 (3)", - "tab": "General information", - "score": 1.8286666666666667 - }, - "QuAC - truncated": { - "description": "min=0.001, mean=0.001, max=0.001, sum=0.003 (3)", - "tab": "General information", - "score": 0.001 - }, - "QuAC - # prompt tokens": { - "description": "min=1645.856, mean=1698.711, max=1730.814, sum=5096.134 (3)", - "tab": "General information", - "score": 1698.7113333333334 - }, - "QuAC - # output tokens": { - "description": "min=19.318, mean=23.053, max=25.3, sum=69.158 (3)", - "tab": "General information", - "score": 23.052666666666667 - }, - "QuAC - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "QuAC - Stereotypes (race)": { - "description": "min=0.583, mean=0.628, max=0.66, sum=1.884 (3)", - "tab": "Bias", - "score": 0.6279609279609281 - }, - "QuAC - Stereotypes (gender)": { - "description": "min=0.396, mean=0.411, max=0.426, sum=1.232 (3)", - "tab": "Bias", - "score": 0.41081218336807646 - }, - "QuAC - Representation (race)": { - "description": "min=0.302, mean=0.327, max=0.359, sum=0.981 (3)", - "tab": "Bias", - "score": 0.3270316371542728 - }, - "QuAC - Representation (gender)": { - "description": "min=0.198, mean=0.225, max=0.241, sum=0.676 (3)", - "tab": "Bias", - "score": 0.22518777152451866 - }, - "QuAC - Toxic fraction": { - "description": "min=0.003, mean=0.003, max=0.004, sum=0.01 (3)", - "tab": "Toxicity", - "score": 0.0033333333333333335 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "HellaSwag", - "source_data": { - "dataset_name": "HellaSwag", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on HellaSwag", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.764, - "details": { - "description": "min=0.764, mean=0.764, max=0.764, sum=0.764 (1)", - "tab": "Accuracy", - "HellaSwag - ECE (10-bin)": { - "description": "min=0.226, mean=0.226, max=0.226, sum=0.226 (1)", - "tab": "Calibration", - "score": 0.2263163700416937 - }, - "HellaSwag - EM (Robustness)": { - 
"description": "min=0.732, mean=0.732, max=0.732, sum=0.732 (1)", - "tab": "Robustness", - "score": 0.732 - }, - "HellaSwag - EM (Fairness)": { - "description": "min=0.623, mean=0.623, max=0.623, sum=0.623 (1)", - "tab": "Fairness", - "score": 0.623 - }, - "HellaSwag - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "HellaSwag - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "HellaSwag - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag - # prompt tokens": { - "description": "min=62.466, mean=62.466, max=62.466, sum=62.466 (1)", - "tab": "General information", - "score": 62.466 - }, - "HellaSwag - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.56, - "details": { - "description": "min=0.56, mean=0.56, max=0.56, sum=0.56 (1)", - "tab": "Accuracy", - "OpenbookQA - ECE (10-bin)": { - "description": "min=0.215, mean=0.215, max=0.215, sum=0.215 (1)", - "tab": "Calibration", - "score": 0.21479287621696264 - }, - "OpenbookQA - EM (Robustness)": { - "description": "min=0.474, mean=0.474, max=0.474, sum=0.474 (1)", - "tab": "Robustness", - "score": 0.474 - }, - "OpenbookQA - EM (Fairness)": { - "description": "min=0.478, mean=0.478, max=0.478, sum=0.478 (1)", - "tab": "Fairness", - "score": 0.478 - }, - "OpenbookQA - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=4.348, mean=4.348, max=4.348, sum=4.348 (1)", - "tab": "General information", - "score": 4.348 - }, - "OpenbookQA - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "TruthfulQA", - "source_data": { - "dataset_name": "TruthfulQA", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on TruthfulQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.306, - "details": { - "description": "min=0.266, mean=0.306, max=0.333, sum=0.917 (3)", - "tab": "Accuracy", - "TruthfulQA - ECE (10-bin)": { - "description": "min=0.101, mean=0.123, max=0.157, sum=0.37 (3)", - "tab": "Calibration", - "score": 0.1233746034244333 - }, - "TruthfulQA - EM (Robustness)": { - "description": "min=0.216, mean=0.252, max=0.294, sum=0.755 (3)", - "tab": "Robustness", - "score": 0.25178389398572887 - }, - "TruthfulQA - EM (Fairness)": { - "description": "min=0.216, mean=0.242, max=0.271, sum=0.725 (3)", - "tab": "Fairness", - "score": 0.24159021406727832 - }, - "TruthfulQA - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "TruthfulQA - # eval": { - "description": "min=654, mean=654, max=654, sum=1962 (3)", - "tab": "General information", - "score": 654.0 - }, - "TruthfulQA - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "TruthfulQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "TruthfulQA - # prompt tokens": { - "description": "min=317.682, mean=355.015, max=375.682, sum=1065.046 (3)", - "tab": "General information", - "score": 355.0152905198777 - }, - "TruthfulQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=3 (3)", - "tab": "General information", - "score": 1.0 - }, - "TruthfulQA - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MS MARCO (TREC)", - "source_data": { - "dataset_name": "MS MARCO (TREC)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "NDCG@10 on MS MARCO (TREC)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.46, - "details": { - "description": "min=0.401, mean=0.46, max=0.51, sum=1.38 (3)", - "tab": "Accuracy", - "MS MARCO (regular) - RR@10 (Robustness)": { - "description": "min=0.207, mean=0.222, max=0.244, sum=0.666 (3)", - "tab": "Robustness", - "score": 0.22205343915343892 - }, - "MS MARCO (TREC) - NDCG@10 (Robustness)": { - "description": "min=0.361, mean=0.407, max=0.448, sum=1.222 (3)", - "tab": "Robustness", - "score": 0.40738421631598776 - }, - "MS MARCO (regular) - RR@10 (Fairness)": { - "description": "min=0.23, mean=0.253, max=0.284, sum=0.76 (3)", - "tab": "Fairness", - "score": 0.25326719576719553 - }, - "MS MARCO (TREC) - NDCG@10 (Fairness)": { - "description": "min=0.371, mean=0.435, max=0.486, sum=1.304 (3)", - "tab": "Fairness", - "score": 0.4346805929346467 - }, - "MS MARCO (regular) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "MS MARCO (TREC) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", 
- "score": null - }, - "MS MARCO (regular) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "MS MARCO (regular) - # train": { - "description": "min=2, mean=2, max=2, sum=6 (3)", - "tab": "General information", - "score": 2.0 - }, - "MS MARCO (regular) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "MS MARCO (regular) - # prompt tokens": { - "description": "min=349.303, mean=385.636, max=423.303, sum=1156.909 (3)", - "tab": "General information", - "score": 385.63633333333337 - }, - "MS MARCO (regular) - # output tokens": { - "description": "min=2.001, mean=2.009, max=2.02, sum=6.026 (3)", - "tab": "General information", - "score": 2.0086666666666666 - }, - "MS MARCO (regular) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "MS MARCO (TREC) - # eval": { - "description": "min=43, mean=43, max=43, sum=129 (3)", - "tab": "General information", - "score": 43.0 - }, - "MS MARCO (TREC) - # train": { - "description": "min=2, mean=2, max=2, sum=6 (3)", - "tab": "General information", - "score": 2.0 - }, - "MS MARCO (TREC) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "MS MARCO (TREC) - # prompt tokens": { - "description": "min=337.047, mean=373.38, max=411.047, sum=1120.14 (3)", - "tab": "General information", - "score": 373.3798449612403 - }, - "MS MARCO (TREC) - # output tokens": { - "description": "min=2.023, mean=2.023, max=2.023, sum=6.07 (3)", - "tab": "General information", - "score": 2.0232558139534884 - }, - "MS MARCO (TREC) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "MS MARCO (regular) - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - }, - "MS MARCO (TREC) - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CNN/DailyMail", - "source_data": { - "dataset_name": "CNN/DailyMail", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on CNN/DailyMail", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.146, - "details": { - "description": "min=0.14, mean=0.146, max=0.152, sum=0.875 (6)", - "tab": "Accuracy", - "CNN/DailyMail - Denoised inference time (s)": { - "description": "2 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "CNN/DailyMail - # eval": { - "description": "min=466, mean=466, max=466, sum=2796 (6)", - "tab": "General information", - "score": 466.0 - }, - "CNN/DailyMail - # train": { - "description": "min=5, mean=5, max=5, sum=30 (6)", - "tab": "General information", - "score": 5.0 - }, - "CNN/DailyMail - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (6)", - "tab": "General information", - "score": 0.0 - }, - "CNN/DailyMail - # prompt tokens": { - "description": "min=1203.032, mean=1213.032, max=1224.032, sum=7278.193 (6)", - "tab": "General information", - "score": 1213.0321888412018 - }, - "CNN/DailyMail - # output tokens": { - "description": "min=48.575, mean=53.215, max=56.485, sum=319.288 (6)", - "tab": "General information", - "score": 53.21459227467812 - }, - "CNN/DailyMail - # trials": { - "description": "min=3, mean=3, max=3, sum=18 (6)", - "tab": "General information", - "score": 3.0 - }, - "CNN/DailyMail - Stereotypes (race)": { - "description": "min=0.605, mean=0.615, max=0.633, sum=3.691 (6)", - "tab": "Bias", - "score": 0.615138154027043 - }, - "CNN/DailyMail - Stereotypes (gender)": { - "description": "min=0.39, mean=0.401, max=0.416, sum=2.409 (6)", - "tab": "Bias", - "score": 0.4014349780782224 - }, - "CNN/DailyMail - Representation (race)": { - "description": "min=0.278, mean=0.293, max=0.321, sum=1.76 (6)", - "tab": "Bias", - "score": 0.2933799533799534 - }, - "CNN/DailyMail - Representation (gender)": { - "description": "min=0.077, mean=0.099, max=0.123, sum=0.596 (6)", - "tab": "Bias", - "score": 0.09929925405618005 - }, - "CNN/DailyMail - Toxic fraction": { - "description": "min=0.002, mean=0.004, max=0.006, sum=0.026 (6)", - "tab": "Toxicity", - "score": 0.004291845493562232 - }, - "CNN/DailyMail - SummaC": { - "description": "min=0.533, mean=0.552, max=0.585, sum=1.655 (3)", - "tab": "Summarization metrics", - "score": 0.5516800688123055 - }, - "CNN/DailyMail - QAFactEval": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - BERTScore (F1)": { - "description": "min=0.273, mean=0.29, max=0.308, sum=0.871 (3)", - "tab": "Summarization metrics", - "score": 0.2904019284209938 - }, - "CNN/DailyMail - Coverage": { - "description": "min=0.965, mean=0.973, max=0.983, sum=5.838 (6)", - "tab": "Summarization metrics", - "score": 0.9729724626233943 - }, - "CNN/DailyMail - Density": { - "description": "min=18.643, mean=24.032, max=31.138, sum=144.19 (6)", - "tab": "Summarization metrics", - "score": 24.0317341420422 - }, - "CNN/DailyMail - Compression": { - "description": "min=10.389, mean=11.659, max=13.368, sum=69.956 (6)", - "tab": "Summarization metrics", - "score": 11.65941362001026 - }, - "CNN/DailyMail - HumanEval-faithfulness": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-relevance": { - 
"description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-coherence": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "XSUM", - "source_data": { - "dataset_name": "XSUM", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on XSUM", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.152, - "details": { - "description": "min=0.149, mean=0.152, max=0.157, sum=0.911 (6)", - "tab": "Accuracy", - "XSUM - Denoised inference time (s)": { - "description": "2 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "XSUM - # eval": { - "description": "min=518, mean=518, max=518, sum=3108 (6)", - "tab": "General information", - "score": 518.0 - }, - "XSUM - # train": { - "description": "min=5, mean=5, max=5, sum=30 (6)", - "tab": "General information", - "score": 5.0 - }, - "XSUM - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (6)", - "tab": "General information", - "score": 0.0 - }, - "XSUM - # prompt tokens": { - "description": "min=1099.388, mean=1133.388, max=1172.388, sum=6800.328 (6)", - "tab": "General information", - "score": 1133.388030888031 - }, - "XSUM - # output tokens": { - "description": "min=21.805, mean=22.092, max=22.577, sum=132.552 (6)", - "tab": "General information", - "score": 22.09202059202059 - }, - "XSUM - # trials": { - "description": "min=3, mean=3, max=3, sum=18 (6)", - "tab": "General information", - "score": 3.0 - }, - "XSUM - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=4 (6)", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "XSUM - Stereotypes (gender)": { - "description": "min=0.45, mean=0.465, max=0.474, sum=2.791 (6)", - "tab": "Bias", - "score": 0.46523352396514167 - }, - "XSUM - Representation (race)": { - "description": "min=0.494, mean=0.522, max=0.536, sum=3.133 (6)", - "tab": "Bias", - "score": 0.5222388805597201 - }, - "XSUM - Representation (gender)": { - "description": "min=0.201, mean=0.214, max=0.224, sum=1.284 (6)", - "tab": "Bias", - "score": 0.21406383130768433 - }, - "XSUM - Toxic fraction": { - "description": "min=0, mean=0.001, max=0.002, sum=0.008 (6)", - "tab": "Toxicity", - "score": 0.001287001287001287 - }, - "XSUM - SummaC": { - "description": "min=-0.298, mean=-0.282, max=-0.27, sum=-0.845 (3)", - "tab": "Summarization metrics", - "score": -0.2817185772994412 - }, - "XSUM - QAFactEval": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - BERTScore (F1)": { - "description": "min=0.45, mean=0.454, max=0.458, sum=1.362 (3)", - "tab": "Summarization metrics", - "score": 0.4538733417652499 - }, - "XSUM - Coverage": { - "description": "min=0.782, mean=0.786, max=0.79, sum=4.714 (6)", - "tab": "Summarization metrics", - "score": 0.7856975370843048 - }, - "XSUM - Density": { - "description": "min=2.624, mean=2.816, max=3.113, sum=16.895 (6)", - "tab": "Summarization metrics", - "score": 2.815909720295231 - }, - "XSUM - Compression": { - "description": "min=16.323, mean=16.857, max=17.149, sum=101.14 (6)", 
- "tab": "Summarization metrics", - "score": 16.856596376166145 - }, - "XSUM - HumanEval-faithfulness": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-relevance": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-coherence": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "IMDB", - "source_data": { - "dataset_name": "IMDB", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on IMDB", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.957, - "details": { - "description": "min=0.947, mean=0.957, max=0.964, sum=2.872 (3)", - "tab": "Accuracy", - "IMDB - ECE (10-bin)": { - "description": "min=0.119, mean=0.136, max=0.165, sum=0.407 (3)", - "tab": "Calibration", - "score": 0.13573735378803647 - }, - "IMDB - EM (Robustness)": { - "description": "min=0.931, mean=0.947, max=0.955, sum=2.841 (3)", - "tab": "Robustness", - "score": 0.9470000000000001 - }, - "IMDB - EM (Fairness)": { - "description": "min=0.935, mean=0.95, max=0.959, sum=2.851 (3)", - "tab": "Fairness", - "score": 0.9503333333333334 - }, - "IMDB - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "IMDB - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "IMDB - # train": { - "description": "min=4.915, mean=4.972, max=5, sum=14.915 (3)", - "tab": "General information", - "score": 4.971666666666667 - }, - "IMDB - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "IMDB - # prompt tokens": { - "description": "min=853.851, mean=1281.577, max=1725.03, sum=3844.732 (3)", - "tab": "General information", - "score": 1281.5773333333334 - }, - "IMDB - # output tokens": { - "description": "min=2, mean=2, max=2, sum=6 (3)", - "tab": "General information", - "score": 2.0 - }, - "IMDB - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "IMDB - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CivilComments", - "source_data": { - "dataset_name": "CivilComments", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on CivilComments", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.546, - "details": { - "description": "min=0.008, mean=0.546, max=1, sum=29.501 (54)", - "tab": "Accuracy", - "CivilComments - ECE (10-bin)": { - "description": "min=0.131, mean=0.376, max=0.649, sum=20.307 (54)", - "tab": "Calibration", - "score": 0.37604932471578795 - }, - "CivilComments - EM (Robustness)": { - "description": "min=0, mean=0.495, max=0.995, sum=26.738 (54)", - "tab": "Robustness", - "score": 0.49514299676627055 - }, - "CivilComments - EM (Fairness)": { - "description": "min=0.005, mean=0.404, max=0.901, sum=21.814 (54)", - "tab": "Fairness", - "score": 0.40396201739558046 - }, - "CivilComments - Denoised inference time (s)": { - "description": "9 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "CivilComments - # eval": { - "description": "min=74, mean=371.556, max=683, sum=20064 (54)", - "tab": "General information", - "score": 371.55555555555554 - }, - "CivilComments - # train": { - "description": "min=5, mean=5, max=5, sum=270 (54)", - "tab": "General information", - "score": 5.0 - }, - "CivilComments - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (54)", - "tab": "General information", - "score": 0.0 - }, - "CivilComments - # prompt tokens": { - "description": "min=271.927, mean=532.602, max=942.498, sum=28760.487 (54)", - "tab": "General information", - "score": 532.6016121330534 - }, - "CivilComments - # output tokens": { - "description": "min=2, mean=2, max=2, sum=108 (54)", - "tab": "General information", - "score": 2.0 - }, - "CivilComments - # trials": { - "description": "min=3, mean=3, max=3, sum=162 (54)", - "tab": "General information", - "score": 3.0 - }, - "CivilComments - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (54)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "RAFT", - "source_data": { - "dataset_name": "RAFT", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on RAFT", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.679, - "details": { - "description": "min=0.225, mean=0.679, max=0.95, sum=22.4 (33)", - "tab": "Accuracy", - "RAFT - ECE (10-bin)": { - "description": "min=0.095, mean=0.234, max=0.473, sum=7.733 (33)", - "tab": "Calibration", - "score": 0.23434348116913628 - }, - "RAFT - EM (Robustness)": { - "description": "min=0.025, mean=0.555, max=0.925, sum=18.3 (33)", - "tab": "Robustness", - "score": 0.5545454545454547 - }, - "RAFT - EM (Fairness)": { - "description": "min=0.2, mean=0.637, max=0.95, 
sum=21.025 (33)", - "tab": "Fairness", - "score": 0.6371212121212121 - }, - "RAFT - Denoised inference time (s)": { - "description": "11 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "RAFT - # eval": { - "description": "min=40, mean=40, max=40, sum=1320 (33)", - "tab": "General information", - "score": 40.0 - }, - "RAFT - # train": { - "description": "min=0.95, mean=4.658, max=5, sum=153.7 (33)", - "tab": "General information", - "score": 4.657575757575757 - }, - "RAFT - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (33)", - "tab": "General information", - "score": 0.0 - }, - "RAFT - # prompt tokens": { - "description": "min=212.25, mean=712.248, max=1745.25, sum=23504.175 (33)", - "tab": "General information", - "score": 712.2477272727273 - }, - "RAFT - # output tokens": { - "description": "min=1.95, mean=3.574, max=6.575, sum=117.95 (33)", - "tab": "General information", - "score": 3.5742424242424238 - }, - "RAFT - # trials": { - "description": "min=3, mean=3, max=3, sum=99 (33)", - "tab": "General information", - "score": 3.0 - }, - "RAFT - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (33)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_classic/ai21/J1-Jumbo-v1-178B/deddbc80-70ac-43e7-b052-753d127f8390.json b/data/helm_classic/ai21/J1-Jumbo-v1-178B/deddbc80-70ac-43e7-b052-753d127f8390.json deleted file mode 100644 index df8111bcc..000000000 --- a/data/helm_classic/ai21/J1-Jumbo-v1-178B/deddbc80-70ac-43e7-b052-753d127f8390.json +++ /dev/null @@ -1,1613 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_classic/ai21_J1-Jumbo-v1-178B/1770834891.1472661", - "retrieved_timestamp": "1770834891.1472661", - "source_metadata": { - "source_name": "helm_classic", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "J1-Jumbo v1 178B", - "id": "ai21/J1-Jumbo-v1-178B", - "developer": "ai21", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_classic", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperform on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.517, - "details": { - "tab": "Accuracy", - "Mean win rate - Calibration": { - "description": null, - "tab": "Calibration", - "score": 0.6662512419912975 - }, - "Mean win rate - Robustness": { - "description": null, - "tab": "Robustness", - "score": 0.4518627645991383 - }, - "Mean win rate - Fairness": { - "description": null, - "tab": "Fairness", - "score": 0.48803949109844547 - }, - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 
0.2218311403508772 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - }, - "Mean win rate - Bias": { - "description": null, - "tab": "Bias", - "score": 0.5485082680240319 - }, - "Mean win rate - Toxicity": { - "description": null, - "tab": "Toxicity", - "score": 0.6042735042735042 - }, - "Mean win rate - Summarization metrics": { - "description": null, - "tab": "Summarization metrics", - "score": 0.5867794486215538 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.259, - "details": { - "description": "min=0.19, mean=0.259, max=0.35, sum=3.891 (15)", - "tab": "Accuracy", - "MMLU - ECE (10-bin)": { - "description": "min=0.074, mean=0.131, max=0.172, sum=1.96 (15)", - "tab": "Calibration", - "score": 0.13067986008352367 - }, - "MMLU - EM (Robustness)": { - "description": "min=0.15, mean=0.221, max=0.31, sum=3.313 (15)", - "tab": "Robustness", - "score": 0.22085380116959066 - }, - "MMLU - EM (Fairness)": { - "description": "min=0.17, mean=0.236, max=0.33, sum=3.545 (15)", - "tab": "Fairness", - "score": 0.23635087719298245 - }, - "MMLU - Denoised inference time (s)": { - "description": "min=0.419, mean=0.457, max=0.511, sum=6.851 (15)", - "tab": "Efficiency", - "score": 0.4567342927631581 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=1542 (15)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=75 (15)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (15)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=308.59, mean=396.74, max=552.719, sum=5951.098 (15)", - "tab": "General information", - "score": 396.73985964912276 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=15 (15)", - "tab": "General information", - "score": 1.0 - }, - "MMLU - # trials": { - "description": "min=3, mean=3, max=3, sum=45 (15)", - "tab": "General information", - "score": 3.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "BoolQ", - "source_data": { - "dataset_name": "BoolQ", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on BoolQ", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.776, - "details": { - "description": "min=0.766, mean=0.776, max=0.786, sum=2.327 (3)", - "tab": "Accuracy", - "BoolQ - ECE (10-bin)": { - "description": "min=0.205, mean=0.215, max=0.223, sum=0.646 (3)", - "tab": "Calibration", - "score": 0.21546167732589497 - }, - "BoolQ - EM (Robustness)": { - "description": "min=0.635, mean=0.65, max=0.659, sum=1.949 (3)", - "tab": "Robustness", - "score": 0.6496666666666667 - }, - "BoolQ - EM (Fairness)": { - "description": "min=0.693, 
mean=0.709, max=0.73, sum=2.128 (3)", - "tab": "Fairness", - "score": 0.7093333333333334 - }, - "BoolQ - Denoised inference time (s)": { - "description": "min=0.55, mean=0.62, max=0.727, sum=1.859 (3)", - "tab": "Efficiency", - "score": 0.6195252891710069 - }, - "BoolQ - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "BoolQ - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "BoolQ - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "BoolQ - # prompt tokens": { - "description": "min=506.985, mean=694.652, max=952.985, sum=2083.955 (3)", - "tab": "General information", - "score": 694.6516666666666 - }, - "BoolQ - # output tokens": { - "description": "min=2, mean=2, max=2, sum=6 (3)", - "tab": "General information", - "score": 2.0 - }, - "BoolQ - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "BoolQ - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.695, - "details": { - "description": "min=0.689, mean=0.695, max=0.698, sum=2.085 (3)", - "tab": "Accuracy", - "NarrativeQA - ECE (10-bin)": { - "description": "min=0.028, mean=0.034, max=0.042, sum=0.101 (3)", - "tab": "Calibration", - "score": 0.033635629206676086 - }, - "NarrativeQA - F1 (Robustness)": { - "description": "min=0.448, mean=0.523, max=0.573, sum=1.57 (3)", - "tab": "Robustness", - "score": 0.5232968431666949 - }, - "NarrativeQA - F1 (Fairness)": { - "description": "min=0.566, mean=0.581, max=0.592, sum=1.743 (3)", - "tab": "Fairness", - "score": 0.5811269391716133 - }, - "NarrativeQA - Denoised inference time (s)": { - "description": "min=1.085, mean=1.126, max=1.167, sum=3.379 (3)", - "tab": "Efficiency", - "score": 1.1261881626564945 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=1065 (3)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=2.166, mean=2.639, max=3.225, sum=7.918 (3)", - "tab": "General information", - "score": 2.63943661971831 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=1598.614, mean=1692.218, max=1777.299, sum=5076.654 (3)", - "tab": "General information", - "score": 1692.2178403755868 - }, - "NarrativeQA - # output 
tokens": { - "description": "min=4.434, mean=4.514, max=4.617, sum=13.541 (3)", - "tab": "General information", - "score": 4.513615023474178 - }, - "NarrativeQA - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NarrativeQA - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NarrativeQA - Stereotypes (gender)": { - "description": "min=0.375, mean=0.438, max=0.5, sum=0.875 (2)", - "tab": "Bias", - "score": 0.4375 - }, - "NarrativeQA - Representation (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=1.333 (2)", - "tab": "Bias", - "score": 0.6666666666666667 - }, - "NarrativeQA - Representation (gender)": { - "description": "min=0.196, mean=0.214, max=0.225, sum=0.641 (3)", - "tab": "Bias", - "score": 0.21357560568086884 - }, - "NarrativeQA - Toxic fraction": { - "description": "min=0.014, mean=0.014, max=0.014, sum=0.042 (3)", - "tab": "Toxicity", - "score": 0.014084507042253521 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (open-book)", - "source_data": { - "dataset_name": "NaturalQuestions (open-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (open-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.595, - "details": { - "description": "min=0.593, mean=0.595, max=0.598, sum=1.786 (3)", - "tab": "Accuracy", - "NaturalQuestions (closed-book) - ECE (10-bin)": { - "description": "min=0.029, mean=0.035, max=0.042, sum=0.106 (3)", - "tab": "Calibration", - "score": 0.035434924784030764 - }, - "NaturalQuestions (open-book) - ECE (10-bin)": { - "description": "min=0.058, mean=0.065, max=0.069, sum=0.195 (3)", - "tab": "Calibration", - "score": 0.06491976505236641 - }, - "NaturalQuestions (closed-book) - F1 (Robustness)": { - "description": "min=0.177, mean=0.179, max=0.183, sum=0.537 (3)", - "tab": "Robustness", - "score": 0.17889901825749613 - }, - "NaturalQuestions (open-book) - F1 (Robustness)": { - "description": "min=0.487, mean=0.503, max=0.515, sum=1.509 (3)", - "tab": "Robustness", - "score": 0.5031073713472458 - }, - "NaturalQuestions (closed-book) - F1 (Fairness)": { - "description": "min=0.227, mean=0.235, max=0.239, sum=0.704 (3)", - "tab": "Fairness", - "score": 0.23456155611286555 - }, - "NaturalQuestions (open-book) - F1 (Fairness)": { - "description": "min=0.54, mean=0.54, max=0.54, sum=1.62 (3)", - "tab": "Fairness", - "score": 0.5399104355251988 - }, - "NaturalQuestions (closed-book) - Denoised inference time (s)": { - "description": "min=0.466, mean=0.493, max=0.536, sum=1.478 (3)", - "tab": "Efficiency", - "score": 0.492596863281249 - }, - "NaturalQuestions (open-book) - Denoised inference time (s)": { - "description": "min=0.931, mean=1.06, max=1.147, sum=3.179 (3)", - "tab": "Efficiency", - "score": 1.0597537076822923 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": 
"min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=94.377, mean=99.377, max=102.377, sum=298.131 (3)", - "tab": "General information", - "score": 99.377 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=5.012, mean=5.602, max=6.608, sum=16.806 (3)", - "tab": "General information", - "score": 5.602 - }, - "NaturalQuestions (closed-book) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.568, mean=4.666, max=4.734, sum=13.999 (3)", - "tab": "General information", - "score": 4.666333333333333 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.038, mean=0.038, max=0.038, sum=0.114 (3)", - "tab": "General information", - "score": 0.038 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1136.933, mean=1418.457, max=1595.508, sum=4255.37 (3)", - "tab": "General information", - "score": 1418.4566666666667 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=5.418, mean=5.682, max=5.988, sum=17.046 (3)", - "tab": "General information", - "score": 5.6819999999999995 - }, - "NaturalQuestions (open-book) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NaturalQuestions (closed-book) - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NaturalQuestions (closed-book) - Stereotypes (gender)": { - "description": "min=0.5, mean=0.5, max=0.5, sum=1.5 (3)", - "tab": "Bias", - "score": 0.5 - }, - "NaturalQuestions (closed-book) - Representation (race)": { - "description": "min=0.255, mean=0.333, max=0.386, sum=1.0 (3)", - "tab": "Bias", - "score": 0.3331804837187507 - }, - "NaturalQuestions (closed-book) - Representation (gender)": { - "description": "min=0.125, mean=0.175, max=0.2, sum=0.525 (3)", - "tab": "Bias", - "score": 0.17500000000000002 - }, - "NaturalQuestions (open-book) - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=2 (3)", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "NaturalQuestions (open-book) - Stereotypes (gender)": { - "description": "min=0.38, mean=0.46, max=0.5, sum=1.38 (3)", - "tab": "Bias", - "score": 0.4601449275362319 - }, - "NaturalQuestions (open-book) - Representation (race)": { - "description": "min=0.451, mean=0.478, max=0.506, sum=1.433 (3)", - "tab": "Bias", - "score": 0.47760288745821544 - }, - "NaturalQuestions (open-book) - Representation (gender)": { - "description": "min=0.011, mean=0.041, max=0.063, sum=0.122 (3)", - "tab": "Bias", - "score": 0.04050846488217801 - }, - "NaturalQuestions (closed-book) - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "Toxicity", - "score": 0.0 - }, - "NaturalQuestions (open-book) - Toxic fraction": { - "description": "min=0.001, mean=0.001, max=0.002, sum=0.004 (3)", - "tab": "Toxicity", - "score": 0.0013333333333333333 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "QuAC", - "source_data": { - "dataset_name": "QuAC", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on QuAC", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.358, - "details": { - "description": "min=0.348, mean=0.358, max=0.372, sum=1.075 (3)", - "tab": "Accuracy", - "QuAC - ECE (10-bin)": { - "description": "min=0.042, mean=0.043, max=0.045, sum=0.13 (3)", - "tab": "Calibration", - "score": 0.04341080368618692 - }, - "QuAC - F1 (Robustness)": { - "description": "min=0.216, mean=0.222, max=0.232, sum=0.667 (3)", - "tab": "Robustness", - "score": 0.22242500588714678 - }, - "QuAC - F1 (Fairness)": { - "description": "min=0.263, mean=0.268, max=0.275, sum=0.805 (3)", - "tab": "Fairness", - "score": 0.2682228394530809 - }, - "QuAC - Denoised inference time (s)": { - "description": "min=1.898, mean=2.064, max=2.149, sum=6.193 (3)", - "tab": "Efficiency", - "score": 2.0642993667534726 - }, - "QuAC - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "QuAC - # train": { - "description": "min=1.788, mean=1.829, max=1.88, sum=5.486 (3)", - "tab": "General information", - "score": 1.8286666666666667 - }, - "QuAC - truncated": { - "description": "min=0.001, mean=0.001, max=0.001, sum=0.003 (3)", - "tab": "General information", - "score": 0.001 - }, - "QuAC - # prompt tokens": { - "description": "min=1645.856, mean=1698.711, max=1730.814, sum=5096.134 (3)", - "tab": "General information", - "score": 1698.7113333333334 - }, - "QuAC - # output tokens": { - "description": "min=22.621, mean=26.784, max=29.261, sum=80.351 (3)", - "tab": "General information", - "score": 26.783666666666665 - }, - "QuAC - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "QuAC - Stereotypes (race)": { - "description": "min=0.594, mean=0.604, max=0.613, sum=1.811 (3)", - "tab": "Bias", - "score": 0.6038019374416433 - }, - "QuAC - Stereotypes (gender)": { - "description": "min=0.417, mean=0.42, max=0.425, sum=1.26 (3)", - "tab": "Bias", - "score": 0.4200049682548366 - }, - "QuAC - Representation (race)": { - "description": "min=0.287, mean=0.329, max=0.362, sum=0.988 (3)", - "tab": "Bias", - "score": 0.3293434102054505 - }, - "QuAC - Representation (gender)": { - "description": "min=0.231, mean=0.242, max=0.257, sum=0.725 (3)", - "tab": "Bias", - "score": 0.2415041378322658 - }, - "QuAC - Toxic fraction": { - "description": "min=0.002, mean=0.003, max=0.004, sum=0.009 (3)", - "tab": "Toxicity", - "score": 0.0030000000000000005 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "HellaSwag", - "source_data": { - "dataset_name": "HellaSwag", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on HellaSwag", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.765, - "details": { - "description": "min=0.765, mean=0.765, max=0.765, sum=0.765 (1)", - "tab": "Accuracy", - "HellaSwag - ECE (10-bin)": { - "description": "min=0.217, mean=0.217, max=0.217, sum=0.217 (1)", - "tab": "Calibration", - "score": 0.21741807730831492 - }, - "HellaSwag - EM 
(Robustness)": { - "description": "min=0.726, mean=0.726, max=0.726, sum=0.726 (1)", - "tab": "Robustness", - "score": 0.726 - }, - "HellaSwag - EM (Fairness)": { - "description": "min=0.614, mean=0.614, max=0.614, sum=0.614 (1)", - "tab": "Fairness", - "score": 0.614 - }, - "HellaSwag - Denoised inference time (s)": { - "description": "min=0.284, mean=0.284, max=0.284, sum=0.284 (1)", - "tab": "Efficiency", - "score": 0.2835968515624999 - }, - "HellaSwag - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "HellaSwag - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag - # prompt tokens": { - "description": "min=62.466, mean=62.466, max=62.466, sum=62.466 (1)", - "tab": "General information", - "score": 62.466 - }, - "HellaSwag - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.534, - "details": { - "description": "min=0.534, mean=0.534, max=0.534, sum=0.534 (1)", - "tab": "Accuracy", - "OpenbookQA - ECE (10-bin)": { - "description": "min=0.25, mean=0.25, max=0.25, sum=0.25 (1)", - "tab": "Calibration", - "score": 0.25015305244306557 - }, - "OpenbookQA - EM (Robustness)": { - "description": "min=0.43, mean=0.43, max=0.43, sum=0.43 (1)", - "tab": "Robustness", - "score": 0.43 - }, - "OpenbookQA - EM (Fairness)": { - "description": "min=0.466, mean=0.466, max=0.466, sum=0.466 (1)", - "tab": "Fairness", - "score": 0.466 - }, - "OpenbookQA - Denoised inference time (s)": { - "description": "min=0.259, mean=0.259, max=0.259, sum=0.259 (1)", - "tab": "Efficiency", - "score": 0.2588512968749986 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=4.348, mean=4.348, max=4.348, sum=4.348 (1)", - "tab": "General information", - "score": 4.348 - }, - "OpenbookQA - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "TruthfulQA", - "source_data": { - "dataset_name": "TruthfulQA", - "source_type": 
"url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on TruthfulQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.175, - "details": { - "description": "min=0.157, mean=0.175, max=0.187, sum=0.524 (3)", - "tab": "Accuracy", - "TruthfulQA - ECE (10-bin)": { - "description": "min=0.099, mean=0.113, max=0.123, sum=0.339 (3)", - "tab": "Calibration", - "score": 0.11285677982128534 - }, - "TruthfulQA - EM (Robustness)": { - "description": "min=0.13, mean=0.154, max=0.176, sum=0.462 (3)", - "tab": "Robustness", - "score": 0.15392456676860347 - }, - "TruthfulQA - EM (Fairness)": { - "description": "min=0.142, mean=0.156, max=0.168, sum=0.468 (3)", - "tab": "Fairness", - "score": 0.15596330275229356 - }, - "TruthfulQA - Denoised inference time (s)": { - "description": "min=0.423, mean=0.443, max=0.454, sum=1.328 (3)", - "tab": "Efficiency", - "score": 0.44282831613149837 - }, - "TruthfulQA - # eval": { - "description": "min=654, mean=654, max=654, sum=1962 (3)", - "tab": "General information", - "score": 654.0 - }, - "TruthfulQA - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "TruthfulQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "TruthfulQA - # prompt tokens": { - "description": "min=317.682, mean=355.015, max=375.682, sum=1065.046 (3)", - "tab": "General information", - "score": 355.0152905198777 - }, - "TruthfulQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=3 (3)", - "tab": "General information", - "score": 1.0 - }, - "TruthfulQA - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MS MARCO (TREC)", - "source_data": { - "dataset_name": "MS MARCO (TREC)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "NDCG@10 on MS MARCO (TREC)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.363, - "details": { - "description": "min=0.316, mean=0.363, max=0.406, sum=1.089 (3)", - "tab": "Accuracy", - "MS MARCO (regular) - RR@10 (Robustness)": { - "description": "min=0.131, mean=0.144, max=0.157, sum=0.433 (3)", - "tab": "Robustness", - "score": 0.14417447089947086 - }, - "MS MARCO (TREC) - NDCG@10 (Robustness)": { - "description": "min=0.276, mean=0.307, max=0.347, sum=0.921 (3)", - "tab": "Robustness", - "score": 0.3070790784160127 - }, - "MS MARCO (regular) - RR@10 (Fairness)": { - "description": "min=0.151, mean=0.18, max=0.202, sum=0.54 (3)", - "tab": "Fairness", - "score": 0.17989272486772476 - }, - "MS MARCO (TREC) - NDCG@10 (Fairness)": { - "description": "min=0.308, mean=0.348, max=0.386, sum=1.044 (3)", - "tab": "Fairness", - "score": 0.34798299201075195 - }, - "MS MARCO (regular) - Denoised inference time (s)": { - "description": "min=0.482, mean=0.501, max=0.52, sum=1.502 (3)", - "tab": "Efficiency", - "score": 0.500707514648438 - }, - "MS MARCO (TREC) - Denoised inference time (s)": { - "description": 
"min=0.477, mean=0.496, max=0.516, sum=1.489 (3)", - "tab": "Efficiency", - "score": 0.4963945009689923 - }, - "MS MARCO (regular) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "MS MARCO (regular) - # train": { - "description": "min=2, mean=2, max=2, sum=6 (3)", - "tab": "General information", - "score": 2.0 - }, - "MS MARCO (regular) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "MS MARCO (regular) - # prompt tokens": { - "description": "min=349.303, mean=385.636, max=423.303, sum=1156.909 (3)", - "tab": "General information", - "score": 385.63633333333337 - }, - "MS MARCO (regular) - # output tokens": { - "description": "min=2, mean=2.001, max=2.004, sum=6.004 (3)", - "tab": "General information", - "score": 2.001333333333333 - }, - "MS MARCO (regular) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "MS MARCO (TREC) - # eval": { - "description": "min=43, mean=43, max=43, sum=129 (3)", - "tab": "General information", - "score": 43.0 - }, - "MS MARCO (TREC) - # train": { - "description": "min=2, mean=2, max=2, sum=6 (3)", - "tab": "General information", - "score": 2.0 - }, - "MS MARCO (TREC) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "MS MARCO (TREC) - # prompt tokens": { - "description": "min=337.047, mean=373.38, max=411.047, sum=1120.14 (3)", - "tab": "General information", - "score": 373.3798449612403 - }, - "MS MARCO (TREC) - # output tokens": { - "description": "min=2.047, mean=2.047, max=2.047, sum=6.14 (3)", - "tab": "General information", - "score": 2.046511627906977 - }, - "MS MARCO (TREC) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "MS MARCO (regular) - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - }, - "MS MARCO (TREC) - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CNN/DailyMail", - "source_data": { 
- "dataset_name": "CNN/DailyMail", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on CNN/DailyMail", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.144, - "details": { - "description": "min=0.137, mean=0.144, max=0.157, sum=0.861 (6)", - "tab": "Accuracy", - "CNN/DailyMail - Denoised inference time (s)": { - "description": "min=3.558, mean=3.777, max=3.91, sum=22.664 (6)", - "tab": "Efficiency", - "score": 3.777328921804216 - }, - "CNN/DailyMail - # eval": { - "description": "min=466, mean=466, max=466, sum=2796 (6)", - "tab": "General information", - "score": 466.0 - }, - "CNN/DailyMail - # train": { - "description": "min=5, mean=5, max=5, sum=30 (6)", - "tab": "General information", - "score": 5.0 - }, - "CNN/DailyMail - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (6)", - "tab": "General information", - "score": 0.0 - }, - "CNN/DailyMail - # prompt tokens": { - "description": "min=1203.032, mean=1213.032, max=1224.032, sum=7278.193 (6)", - "tab": "General information", - "score": 1213.0321888412018 - }, - "CNN/DailyMail - # output tokens": { - "description": "min=67.139, mean=72.469, max=75.648, sum=434.815 (6)", - "tab": "General information", - "score": 72.46924177396282 - }, - "CNN/DailyMail - # trials": { - "description": "min=3, mean=3, max=3, sum=18 (6)", - "tab": "General information", - "score": 3.0 - }, - "CNN/DailyMail - Stereotypes (race)": { - "description": "min=0.62, mean=0.63, max=0.647, sum=3.781 (6)", - "tab": "Bias", - "score": 0.6302246589223909 - }, - "CNN/DailyMail - Stereotypes (gender)": { - "description": "min=0.382, mean=0.386, max=0.393, sum=2.314 (6)", - "tab": "Bias", - "score": 0.385603383216647 - }, - "CNN/DailyMail - Representation (race)": { - "description": "min=0.288, mean=0.325, max=0.362, sum=1.95 (6)", - "tab": "Bias", - "score": 0.3250193306482005 - }, - "CNN/DailyMail - Representation (gender)": { - "description": "min=0.13, mean=0.131, max=0.132, sum=0.788 (6)", - "tab": "Bias", - "score": 0.13141527227323743 - }, - "CNN/DailyMail - Toxic fraction": { - "description": "min=0, mean=0.002, max=0.004, sum=0.013 (6)", - "tab": "Toxicity", - "score": 0.002145922746781116 - }, - "CNN/DailyMail - SummaC": { - "description": "min=0.491, mean=0.515, max=0.544, sum=1.545 (3)", - "tab": "Summarization metrics", - "score": 0.5151288171631818 - }, - "CNN/DailyMail - QAFactEval": { - "description": "min=4.661, mean=4.697, max=4.725, sum=28.182 (6)", - "tab": "Summarization metrics", - "score": 4.696964335081241 - }, - "CNN/DailyMail - BERTScore (F1)": { - "description": "min=0.264, mean=0.278, max=0.301, sum=0.834 (3)", - "tab": "Summarization metrics", - "score": 0.27790265116917295 - }, - "CNN/DailyMail - Coverage": { - "description": "min=0.965, mean=0.976, max=0.984, sum=5.856 (6)", - "tab": "Summarization metrics", - "score": 0.97598626364496 - }, - "CNN/DailyMail - Density": { - "description": "min=40.605, mean=53.93, max=67.411, sum=323.578 (6)", - "tab": "Summarization metrics", - "score": 53.929605831357485 - }, - "CNN/DailyMail - Compression": { - "description": "min=8.981, mean=9.579, max=10.219, sum=57.476 (6)", - "tab": "Summarization metrics", - "score": 9.579310239916042 - }, - "CNN/DailyMail - HumanEval-faithfulness": { - "description": "2 matching runs, but no matching metrics", 
- "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-relevance": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-coherence": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "XSUM", - "source_data": { - "dataset_name": "XSUM", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on XSUM", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.129, - "details": { - "description": "min=0.128, mean=0.129, max=0.131, sum=0.776 (6)", - "tab": "Accuracy", - "XSUM - Denoised inference time (s)": { - "description": "min=1.615, mean=1.629, max=1.648, sum=9.776 (6)", - "tab": "Efficiency", - "score": 1.6292920332441818 - }, - "XSUM - # eval": { - "description": "min=518, mean=518, max=518, sum=3108 (6)", - "tab": "General information", - "score": 518.0 - }, - "XSUM - # train": { - "description": "min=5, mean=5, max=5, sum=30 (6)", - "tab": "General information", - "score": 5.0 - }, - "XSUM - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (6)", - "tab": "General information", - "score": 0.0 - }, - "XSUM - # prompt tokens": { - "description": "min=1099.388, mean=1133.388, max=1172.388, sum=6800.328 (6)", - "tab": "General information", - "score": 1133.388030888031 - }, - "XSUM - # output tokens": { - "description": "min=21.958, mean=22.013, max=22.106, sum=132.077 (6)", - "tab": "General information", - "score": 22.012870012870014 - }, - "XSUM - # trials": { - "description": "min=3, mean=3, max=3, sum=18 (6)", - "tab": "General information", - "score": 3.0 - }, - "XSUM - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=4.0 (6)", - "tab": "Bias", - "score": 0.6666666666666669 - }, - "XSUM - Stereotypes (gender)": { - "description": "min=0.46, mean=0.472, max=0.483, sum=2.834 (6)", - "tab": "Bias", - "score": 0.4724007038712921 - }, - "XSUM - Representation (race)": { - "description": "min=0.467, mean=0.48, max=0.505, sum=2.877 (6)", - "tab": "Bias", - "score": 0.47956989247311826 - }, - "XSUM - Representation (gender)": { - "description": "min=0.154, mean=0.186, max=0.216, sum=1.116 (6)", - "tab": "Bias", - "score": 0.18604199883585584 - }, - "XSUM - Toxic fraction": { - "description": "min=0, mean=0.002, max=0.004, sum=0.012 (6)", - "tab": "Toxicity", - "score": 0.0019305019305019308 - }, - "XSUM - SummaC": { - "description": "min=-0.294, mean=-0.287, max=-0.282, sum=-0.861 (3)", - "tab": "Summarization metrics", - "score": -0.2868511554050323 - }, - "XSUM - QAFactEval": { - "description": "min=2.48, mean=3.182, max=3.598, sum=19.091 (6)", - "tab": "Summarization metrics", - "score": 3.1818935586249126 - }, - "XSUM - BERTScore (F1)": { - "description": "min=0.432, mean=0.435, max=0.438, sum=1.305 (3)", - "tab": "Summarization metrics", - "score": 0.43511885902101227 - }, - "XSUM - Coverage": { - "description": "min=0.775, mean=0.784, max=0.792, sum=4.704 (6)", - "tab": "Summarization metrics", - "score": 0.7840584721092689 - }, - "XSUM - Density": { - "description": "min=2.514, mean=2.63, max=2.802, sum=15.779 (6)", - "tab": 
"Summarization metrics", - "score": 2.6298709619480816 - }, - "XSUM - Compression": { - "description": "min=16.767, mean=16.862, max=16.987, sum=101.17 (6)", - "tab": "Summarization metrics", - "score": 16.861740741647864 - }, - "XSUM - HumanEval-faithfulness": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-relevance": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-coherence": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "IMDB", - "source_data": { - "dataset_name": "IMDB", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on IMDB", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.943, - "details": { - "description": "min=0.934, mean=0.943, max=0.951, sum=2.83 (3)", - "tab": "Accuracy", - "IMDB - ECE (10-bin)": { - "description": "min=0.06, mean=0.064, max=0.072, sum=0.191 (3)", - "tab": "Calibration", - "score": 0.06375881576094916 - }, - "IMDB - EM (Robustness)": { - "description": "min=0.917, mean=0.923, max=0.934, sum=2.768 (3)", - "tab": "Robustness", - "score": 0.9226666666666666 - }, - "IMDB - EM (Fairness)": { - "description": "min=0.922, mean=0.932, max=0.941, sum=2.797 (3)", - "tab": "Fairness", - "score": 0.9323333333333333 - }, - "IMDB - Denoised inference time (s)": { - "description": "min=0.682, mean=0.852, max=1.035, sum=2.555 (3)", - "tab": "Efficiency", - "score": 0.8516515608723956 - }, - "IMDB - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "IMDB - # train": { - "description": "min=4.915, mean=4.972, max=5, sum=14.915 (3)", - "tab": "General information", - "score": 4.971666666666667 - }, - "IMDB - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "IMDB - # prompt tokens": { - "description": "min=853.851, mean=1281.577, max=1725.03, sum=3844.732 (3)", - "tab": "General information", - "score": 1281.5773333333334 - }, - "IMDB - # output tokens": { - "description": "min=2, mean=2, max=2, sum=6 (3)", - "tab": "General information", - "score": 2.0 - }, - "IMDB - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "IMDB - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": 
"CivilComments", - "source_data": { - "dataset_name": "CivilComments", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on CivilComments", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.553, - "details": { - "description": "min=0.03, mean=0.553, max=0.968, sum=29.863 (54)", - "tab": "Accuracy", - "CivilComments - ECE (10-bin)": { - "description": "min=0.048, mean=0.27, max=0.587, sum=14.569 (54)", - "tab": "Calibration", - "score": 0.26979933840430187 - }, - "CivilComments - EM (Robustness)": { - "description": "min=0.027, mean=0.271, max=0.732, sum=14.649 (54)", - "tab": "Robustness", - "score": 0.2712865813183887 - }, - "CivilComments - EM (Fairness)": { - "description": "min=0.006, mean=0.478, max=0.958, sum=25.823 (54)", - "tab": "Fairness", - "score": 0.4782106548652487 - }, - "CivilComments - Denoised inference time (s)": { - "description": "min=0.43, mean=0.552, max=0.724, sum=29.829 (54)", - "tab": "Efficiency", - "score": 0.5523870780537201 - }, - "CivilComments - # eval": { - "description": "min=74, mean=371.556, max=683, sum=20064 (54)", - "tab": "General information", - "score": 371.55555555555554 - }, - "CivilComments - # train": { - "description": "min=5, mean=5, max=5, sum=270 (54)", - "tab": "General information", - "score": 5.0 - }, - "CivilComments - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (54)", - "tab": "General information", - "score": 0.0 - }, - "CivilComments - # prompt tokens": { - "description": "min=271.927, mean=532.602, max=942.498, sum=28760.487 (54)", - "tab": "General information", - "score": 532.6016121330534 - }, - "CivilComments - # output tokens": { - "description": "min=2, mean=2, max=2, sum=108 (54)", - "tab": "General information", - "score": 2.0 - }, - "CivilComments - # trials": { - "description": "min=3, mean=3, max=3, sum=162 (54)", - "tab": "General information", - "score": 3.0 - }, - "CivilComments - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (54)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "RAFT", - "source_data": { - "dataset_name": "RAFT", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on RAFT", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.681, - "details": { - "description": "min=0.225, mean=0.681, max=0.975, sum=22.475 (33)", - "tab": "Accuracy", - "RAFT - ECE (10-bin)": { - "description": "min=0.103, mean=0.228, max=0.595, sum=7.528 (33)", - "tab": "Calibration", - "score": 0.2281177870147751 - }, - "RAFT - EM (Robustness)": { - "description": "min=0.025, mean=0.555, max=0.875, sum=18.3 (33)", - 
"tab": "Robustness", - "score": 0.5545454545454546 - }, - "RAFT - EM (Fairness)": { - "description": "min=0.125, mean=0.623, max=0.975, sum=20.55 (33)", - "tab": "Fairness", - "score": 0.6227272727272728 - }, - "RAFT - Denoised inference time (s)": { - "description": "min=0.423, mean=0.687, max=1.043, sum=22.661 (33)", - "tab": "Efficiency", - "score": 0.6866916923137625 - }, - "RAFT - # eval": { - "description": "min=40, mean=40, max=40, sum=1320 (33)", - "tab": "General information", - "score": 40.0 - }, - "RAFT - # train": { - "description": "min=0.95, mean=4.658, max=5, sum=153.7 (33)", - "tab": "General information", - "score": 4.657575757575757 - }, - "RAFT - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (33)", - "tab": "General information", - "score": 0.0 - }, - "RAFT - # prompt tokens": { - "description": "min=212.25, mean=712.248, max=1745.25, sum=23504.175 (33)", - "tab": "General information", - "score": 712.2477272727273 - }, - "RAFT - # output tokens": { - "description": "min=1.95, mean=3.634, max=6.925, sum=119.925 (33)", - "tab": "General information", - "score": 3.6340909090909084 - }, - "RAFT - # trials": { - "description": "min=3, mean=3, max=3, sum=99 (33)", - "tab": "General information", - "score": 3.0 - }, - "RAFT - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (33)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_classic/ai21/J1-Large-v1-7.5B/e4780862-bf3c-4856-b1e7-02616afe931a.json b/data/helm_classic/ai21/J1-Large-v1-7.5B/e4780862-bf3c-4856-b1e7-02616afe931a.json deleted file mode 100644 index 5c8560533..000000000 --- a/data/helm_classic/ai21/J1-Large-v1-7.5B/e4780862-bf3c-4856-b1e7-02616afe931a.json +++ /dev/null @@ -1,1613 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_classic/ai21_J1-Large-v1-7.5B/1770834891.1472661", - "retrieved_timestamp": "1770834891.1472661", - "source_metadata": { - "source_name": "helm_classic", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "J1-Large v1 7.5B", - "id": "ai21/J1-Large-v1-7.5B", - "developer": "ai21", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_classic", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperform on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.285, - "details": { - "tab": "Accuracy", - "Mean win rate - Calibration": { - "description": null, - "tab": "Calibration", - "score": 0.6383920923698907 - }, - "Mean win rate - Robustness": { - "description": null, - "tab": "Robustness", - "score": 0.29777282413544925 - }, - "Mean win rate - Fairness": { - 
"description": null, - "tab": "Fairness", - "score": 0.27467778791471786 - }, - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.38930372807017544 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - }, - "Mean win rate - Bias": { - "description": null, - "tab": "Bias", - "score": 0.5487461676083087 - }, - "Mean win rate - Toxicity": { - "description": null, - "tab": "Toxicity", - "score": 0.6599416016082683 - }, - "Mean win rate - Summarization metrics": { - "description": null, - "tab": "Summarization metrics", - "score": 0.6502297410192147 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.241, - "details": { - "description": "min=0.2, mean=0.241, max=0.298, sum=3.617 (15)", - "tab": "Accuracy", - "MMLU - ECE (10-bin)": { - "description": "min=0.051, mean=0.123, max=0.181, sum=1.842 (15)", - "tab": "Calibration", - "score": 0.12277396117394333 - }, - "MMLU - EM (Robustness)": { - "description": "min=0.16, mean=0.2, max=0.272, sum=3.002 (15)", - "tab": "Robustness", - "score": 0.20011695906432747 - }, - "MMLU - EM (Fairness)": { - "description": "min=0.16, mean=0.204, max=0.23, sum=3.059 (15)", - "tab": "Fairness", - "score": 0.2039415204678363 - }, - "MMLU - Denoised inference time (s)": { - "description": "min=0.348, mean=0.377, max=0.422, sum=5.648 (15)", - "tab": "Efficiency", - "score": 0.3765351217105263 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=1542 (15)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=75 (15)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (15)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=308.59, mean=396.74, max=552.719, sum=5951.098 (15)", - "tab": "General information", - "score": 396.73985964912276 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=15 (15)", - "tab": "General information", - "score": 1.0 - }, - "MMLU - # trials": { - "description": "min=3, mean=3, max=3, sum=45 (15)", - "tab": "General information", - "score": 3.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "BoolQ", - "source_data": { - "dataset_name": "BoolQ", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on BoolQ", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.683, - "details": { - "description": "min=0.652, mean=0.683, max=0.709, sum=2.05 (3)", - "tab": "Accuracy", - "BoolQ - ECE (10-bin)": { - "description": "min=0.085, mean=0.106, max=0.133, sum=0.319 (3)", - "tab": "Calibration", - "score": 0.10621693084730484 - }, - "BoolQ - EM (Robustness)": { - 
"description": "min=0.539, mean=0.567, max=0.603, sum=1.701 (3)", - "tab": "Robustness", - "score": 0.5670000000000001 - }, - "BoolQ - EM (Fairness)": { - "description": "min=0.591, mean=0.622, max=0.651, sum=1.867 (3)", - "tab": "Fairness", - "score": 0.6223333333333333 - }, - "BoolQ - Denoised inference time (s)": { - "description": "min=0.43, mean=0.485, max=0.566, sum=1.455 (3)", - "tab": "Efficiency", - "score": 0.48513916883680525 - }, - "BoolQ - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "BoolQ - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "BoolQ - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "BoolQ - # prompt tokens": { - "description": "min=506.985, mean=694.652, max=952.985, sum=2083.955 (3)", - "tab": "General information", - "score": 694.6516666666666 - }, - "BoolQ - # output tokens": { - "description": "min=2, mean=2, max=2, sum=6 (3)", - "tab": "General information", - "score": 2.0 - }, - "BoolQ - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "BoolQ - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.623, - "details": { - "description": "min=0.612, mean=0.623, max=0.634, sum=1.87 (3)", - "tab": "Accuracy", - "NarrativeQA - ECE (10-bin)": { - "description": "min=0.042, mean=0.046, max=0.048, sum=0.137 (3)", - "tab": "Calibration", - "score": 0.04554705251298522 - }, - "NarrativeQA - F1 (Robustness)": { - "description": "min=0.341, mean=0.4, max=0.438, sum=1.201 (3)", - "tab": "Robustness", - "score": 0.4003895179156612 - }, - "NarrativeQA - F1 (Fairness)": { - "description": "min=0.496, mean=0.513, max=0.524, sum=1.538 (3)", - "tab": "Fairness", - "score": 0.5126679432053903 - }, - "NarrativeQA - Denoised inference time (s)": { - "description": "min=0.768, mean=0.797, max=0.829, sum=2.391 (3)", - "tab": "Efficiency", - "score": 0.7971074946205007 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=1065 (3)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=2.166, mean=2.639, max=3.225, sum=7.918 (3)", - "tab": "General information", - "score": 2.63943661971831 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt 
tokens": { - "description": "min=1598.614, mean=1692.218, max=1777.299, sum=5076.654 (3)", - "tab": "General information", - "score": 1692.2178403755868 - }, - "NarrativeQA - # output tokens": { - "description": "min=4.797, mean=5.09, max=5.518, sum=15.27 (3)", - "tab": "General information", - "score": 5.090140845070422 - }, - "NarrativeQA - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NarrativeQA - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NarrativeQA - Stereotypes (gender)": { - "description": "min=0.5, mean=0.5, max=0.5, sum=1 (2)", - "tab": "Bias", - "score": 0.5 - }, - "NarrativeQA - Representation (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=2 (3)", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "NarrativeQA - Representation (gender)": { - "description": "min=0.17, mean=0.203, max=0.223, sum=0.609 (3)", - "tab": "Bias", - "score": 0.20304247377415918 - }, - "NarrativeQA - Toxic fraction": { - "description": "min=0.011, mean=0.013, max=0.014, sum=0.039 (3)", - "tab": "Toxicity", - "score": 0.013145539906103287 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (open-book)", - "source_data": { - "dataset_name": "NaturalQuestions (open-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (open-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.532, - "details": { - "description": "min=0.5, mean=0.532, max=0.571, sum=1.597 (3)", - "tab": "Accuracy", - "NaturalQuestions (closed-book) - ECE (10-bin)": { - "description": "min=0.013, mean=0.015, max=0.017, sum=0.046 (3)", - "tab": "Calibration", - "score": 0.01549922748171477 - }, - "NaturalQuestions (open-book) - ECE (10-bin)": { - "description": "min=0.075, mean=0.086, max=0.093, sum=0.258 (3)", - "tab": "Calibration", - "score": 0.08597598507389619 - }, - "NaturalQuestions (closed-book) - F1 (Robustness)": { - "description": "min=0.092, mean=0.098, max=0.106, sum=0.293 (3)", - "tab": "Robustness", - "score": 0.097632746101742 - }, - "NaturalQuestions (open-book) - F1 (Robustness)": { - "description": "min=0.361, mean=0.41, max=0.455, sum=1.23 (3)", - "tab": "Robustness", - "score": 0.4099829032840138 - }, - "NaturalQuestions (closed-book) - F1 (Fairness)": { - "description": "min=0.14, mean=0.146, max=0.151, sum=0.439 (3)", - "tab": "Fairness", - "score": 0.14648226412007787 - }, - "NaturalQuestions (open-book) - F1 (Fairness)": { - "description": "min=0.44, mean=0.47, max=0.508, sum=1.409 (3)", - "tab": "Fairness", - "score": 0.4695231845662433 - }, - "NaturalQuestions (closed-book) - Denoised inference time (s)": { - "description": "min=0.355, mean=0.372, max=0.396, sum=1.117 (3)", - "tab": "Efficiency", - "score": 0.3722484414062495 - }, - "NaturalQuestions (open-book) - Denoised inference time (s)": { - "description": "min=0.66, mean=0.733, max=0.784, sum=2.198 (3)", - "tab": "Efficiency", - "score": 0.7326816432291658 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": 
"min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=94.377, mean=99.377, max=102.377, sum=298.131 (3)", - "tab": "General information", - "score": 99.377 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=6.868, mean=7.876, max=9.311, sum=23.628 (3)", - "tab": "General information", - "score": 7.876 - }, - "NaturalQuestions (closed-book) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.568, mean=4.666, max=4.734, sum=13.999 (3)", - "tab": "General information", - "score": 4.666333333333333 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.038, mean=0.038, max=0.038, sum=0.114 (3)", - "tab": "General information", - "score": 0.038 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1136.933, mean=1418.457, max=1595.508, sum=4255.37 (3)", - "tab": "General information", - "score": 1418.4566666666667 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=5.487, mean=5.946, max=6.338, sum=17.838 (3)", - "tab": "General information", - "score": 5.946000000000001 - }, - "NaturalQuestions (open-book) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NaturalQuestions (closed-book) - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NaturalQuestions (closed-book) - Stereotypes (gender)": { - "description": "min=0.214, mean=0.405, max=0.5, sum=1.214 (3)", - "tab": "Bias", - "score": 0.4047619047619048 - }, - "NaturalQuestions (closed-book) - Representation (race)": { - "description": "min=0.302, mean=0.362, max=0.45, sum=1.085 (3)", - "tab": "Bias", - "score": 0.36169748540882557 - }, - "NaturalQuestions (closed-book) - Representation (gender)": { - "description": "min=0.088, mean=0.216, max=0.371, sum=0.647 (3)", - "tab": "Bias", - "score": 0.21556767868437698 - }, - "NaturalQuestions (open-book) - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=1.333 (2)", - "tab": "Bias", - "score": 0.6666666666666667 - }, - "NaturalQuestions (open-book) - Stereotypes (gender)": { - "description": "min=0.5, mean=0.5, max=0.5, sum=1.5 (3)", - "tab": "Bias", - "score": 0.5 - }, - "NaturalQuestions (open-book) - Representation (race)": { - "description": "min=0.327, mean=0.394, max=0.457, sum=1.182 (3)", - "tab": "Bias", - "score": 0.39383347574877653 - }, - "NaturalQuestions (open-book) - Representation (gender)": { - "description": "min=0.106, mean=0.109, max=0.113, sum=0.328 (3)", - "tab": "Bias", - "score": 0.10941198128319474 - }, - "NaturalQuestions (closed-book) - Toxic fraction": { - "description": "min=0.001, mean=0.002, max=0.002, sum=0.005 (3)", - "tab": "Toxicity", - "score": 0.0016666666666666668 - }, - "NaturalQuestions (open-book) - Toxic fraction": { - "description": "min=0, mean=0.001, max=0.001, sum=0.002 (3)", - "tab": "Toxicity", - "score": 0.0006666666666666666 - } - } - }, - 
"generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "QuAC", - "source_data": { - "dataset_name": "QuAC", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on QuAC", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.328, - "details": { - "description": "min=0.322, mean=0.328, max=0.336, sum=0.983 (3)", - "tab": "Accuracy", - "QuAC - ECE (10-bin)": { - "description": "min=0.016, mean=0.024, max=0.033, sum=0.073 (3)", - "tab": "Calibration", - "score": 0.02431531680637249 - }, - "QuAC - F1 (Robustness)": { - "description": "min=0.186, mean=0.197, max=0.209, sum=0.591 (3)", - "tab": "Robustness", - "score": 0.19699898429353593 - }, - "QuAC - F1 (Fairness)": { - "description": "min=0.227, mean=0.241, max=0.256, sum=0.722 (3)", - "tab": "Fairness", - "score": 0.24062000532402938 - }, - "QuAC - Denoised inference time (s)": { - "description": "min=1.105, mean=1.16, max=1.191, sum=3.48 (3)", - "tab": "Efficiency", - "score": 1.159840737413194 - }, - "QuAC - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "QuAC - # train": { - "description": "min=1.788, mean=1.829, max=1.88, sum=5.486 (3)", - "tab": "General information", - "score": 1.8286666666666667 - }, - "QuAC - truncated": { - "description": "min=0.001, mean=0.001, max=0.001, sum=0.003 (3)", - "tab": "General information", - "score": 0.001 - }, - "QuAC - # prompt tokens": { - "description": "min=1645.856, mean=1698.711, max=1730.814, sum=5096.134 (3)", - "tab": "General information", - "score": 1698.7113333333334 - }, - "QuAC - # output tokens": { - "description": "min=23.833, mean=27.642, max=30.067, sum=82.927 (3)", - "tab": "General information", - "score": 27.64233333333333 - }, - "QuAC - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "QuAC - Stereotypes (race)": { - "description": "min=0.632, mean=0.647, max=0.667, sum=1.942 (3)", - "tab": "Bias", - "score": 0.6472747525379104 - }, - "QuAC - Stereotypes (gender)": { - "description": "min=0.407, mean=0.428, max=0.446, sum=1.284 (3)", - "tab": "Bias", - "score": 0.42785601825865643 - }, - "QuAC - Representation (race)": { - "description": "min=0.226, mean=0.3, max=0.351, sum=0.9 (3)", - "tab": "Bias", - "score": 0.2998485806834953 - }, - "QuAC - Representation (gender)": { - "description": "min=0.235, mean=0.249, max=0.271, sum=0.748 (3)", - "tab": "Bias", - "score": 0.24941347459181362 - }, - "QuAC - Toxic fraction": { - "description": "min=0.002, mean=0.003, max=0.004, sum=0.008 (3)", - "tab": "Toxicity", - "score": 0.0026666666666666666 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "HellaSwag", - "source_data": { - "dataset_name": "HellaSwag", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on HellaSwag", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7, - "details": { - "description": "min=0.7, mean=0.7, max=0.7, sum=0.7 (1)", - "tab": "Accuracy", - "HellaSwag - 
ECE (10-bin)": { - "description": "min=0.192, mean=0.192, max=0.192, sum=0.192 (1)", - "tab": "Calibration", - "score": 0.19173198668049052 - }, - "HellaSwag - EM (Robustness)": { - "description": "min=0.646, mean=0.646, max=0.646, sum=0.646 (1)", - "tab": "Robustness", - "score": 0.646 - }, - "HellaSwag - EM (Fairness)": { - "description": "min=0.528, mean=0.528, max=0.528, sum=0.528 (1)", - "tab": "Fairness", - "score": 0.528 - }, - "HellaSwag - Denoised inference time (s)": { - "description": "min=0.253, mean=0.253, max=0.253, sum=0.253 (1)", - "tab": "Efficiency", - "score": 0.25286050781250013 - }, - "HellaSwag - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "HellaSwag - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag - # prompt tokens": { - "description": "min=62.466, mean=62.466, max=62.466, sum=62.466 (1)", - "tab": "General information", - "score": 62.466 - }, - "HellaSwag - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.514, - "details": { - "description": "min=0.514, mean=0.514, max=0.514, sum=0.514 (1)", - "tab": "Accuracy", - "OpenbookQA - ECE (10-bin)": { - "description": "min=0.25, mean=0.25, max=0.25, sum=0.25 (1)", - "tab": "Calibration", - "score": 0.24986668171933007 - }, - "OpenbookQA - EM (Robustness)": { - "description": "min=0.412, mean=0.412, max=0.412, sum=0.412 (1)", - "tab": "Robustness", - "score": 0.412 - }, - "OpenbookQA - EM (Fairness)": { - "description": "min=0.444, mean=0.444, max=0.444, sum=0.444 (1)", - "tab": "Fairness", - "score": 0.444 - }, - "OpenbookQA - Denoised inference time (s)": { - "description": "min=0.238, mean=0.238, max=0.238, sum=0.238 (1)", - "tab": "Efficiency", - "score": 0.2381039843749996 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=4.348, mean=4.348, max=4.348, sum=4.348 (1)", - "tab": "General information", - "score": 4.348 - }, - "OpenbookQA - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - 
"generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "TruthfulQA", - "source_data": { - "dataset_name": "TruthfulQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on TruthfulQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.197, - "details": { - "description": "min=0.19, mean=0.197, max=0.2, sum=0.59 (3)", - "tab": "Accuracy", - "TruthfulQA - ECE (10-bin)": { - "description": "min=0.105, mean=0.112, max=0.121, sum=0.337 (3)", - "tab": "Calibration", - "score": 0.11232689963932652 - }, - "TruthfulQA - EM (Robustness)": { - "description": "min=0.138, mean=0.155, max=0.168, sum=0.465 (3)", - "tab": "Robustness", - "score": 0.15494393476044852 - }, - "TruthfulQA - EM (Fairness)": { - "description": "min=0.159, mean=0.174, max=0.182, sum=0.521 (3)", - "tab": "Fairness", - "score": 0.17380224260958207 - }, - "TruthfulQA - Denoised inference time (s)": { - "description": "min=0.351, mean=0.365, max=0.372, sum=1.094 (3)", - "tab": "Efficiency", - "score": 0.36458362003058115 - }, - "TruthfulQA - # eval": { - "description": "min=654, mean=654, max=654, sum=1962 (3)", - "tab": "General information", - "score": 654.0 - }, - "TruthfulQA - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "TruthfulQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "TruthfulQA - # prompt tokens": { - "description": "min=317.682, mean=355.015, max=375.682, sum=1065.046 (3)", - "tab": "General information", - "score": 355.0152905198777 - }, - "TruthfulQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=3 (3)", - "tab": "General information", - "score": 1.0 - }, - "TruthfulQA - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MS MARCO (TREC)", - "source_data": { - "dataset_name": "MS MARCO (TREC)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "NDCG@10 on MS MARCO (TREC)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.292, - "details": { - "description": "min=0.266, mean=0.292, max=0.338, sum=0.877 (3)", - "tab": "Accuracy", - "MS MARCO (regular) - RR@10 (Robustness)": { - "description": "min=0.089, mean=0.105, max=0.128, sum=0.315 (3)", - "tab": "Robustness", - "score": 0.10499510582010585 - }, - "MS MARCO (TREC) - NDCG@10 (Robustness)": { - "description": "min=0.231, mean=0.248, max=0.274, sum=0.743 (3)", - "tab": "Robustness", - "score": 0.24769351383898738 - }, - "MS MARCO (regular) - RR@10 (Fairness)": { - "description": "min=0.096, mean=0.117, max=0.143, sum=0.351 (3)", - "tab": "Fairness", - "score": 0.11706984126984123 - }, - "MS MARCO (TREC) - NDCG@10 (Fairness)": { - "description": "min=0.258, mean=0.28, max=0.322, sum=0.841 (3)", - "tab": "Fairness", - "score": 0.2804651230679189 - }, - "MS MARCO (regular) - Denoised inference time (s)": { - "description": "min=0.379, 
mean=0.393, max=0.406, sum=1.178 (3)", - "tab": "Efficiency", - "score": 0.3926667591145831 - }, - "MS MARCO (TREC) - Denoised inference time (s)": { - "description": "min=0.376, mean=0.389, max=0.402, sum=1.167 (3)", - "tab": "Efficiency", - "score": 0.3890438468992247 - }, - "MS MARCO (regular) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "MS MARCO (regular) - # train": { - "description": "min=2, mean=2, max=2, sum=6 (3)", - "tab": "General information", - "score": 2.0 - }, - "MS MARCO (regular) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "MS MARCO (regular) - # prompt tokens": { - "description": "min=349.303, mean=385.636, max=423.303, sum=1156.909 (3)", - "tab": "General information", - "score": 385.63633333333337 - }, - "MS MARCO (regular) - # output tokens": { - "description": "min=2.011, mean=2.072, max=2.163, sum=6.217 (3)", - "tab": "General information", - "score": 2.0723333333333334 - }, - "MS MARCO (regular) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "MS MARCO (TREC) - # eval": { - "description": "min=43, mean=43, max=43, sum=129 (3)", - "tab": "General information", - "score": 43.0 - }, - "MS MARCO (TREC) - # train": { - "description": "min=2, mean=2, max=2, sum=6 (3)", - "tab": "General information", - "score": 2.0 - }, - "MS MARCO (TREC) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "MS MARCO (TREC) - # prompt tokens": { - "description": "min=337.047, mean=373.38, max=411.047, sum=1120.14 (3)", - "tab": "General information", - "score": 373.3798449612403 - }, - "MS MARCO (TREC) - # output tokens": { - "description": "min=2.093, mean=2.116, max=2.163, sum=6.349 (3)", - "tab": "General information", - "score": 2.116279069767442 - }, - "MS MARCO (TREC) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "MS MARCO (regular) - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - }, - "MS MARCO (TREC) - Toxic fraction": { - "description": "1 matching runs, but no matching 
metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CNN/DailyMail", - "source_data": { - "dataset_name": "CNN/DailyMail", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on CNN/DailyMail", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.134, - "details": { - "description": "min=0.123, mean=0.134, max=0.147, sum=0.802 (6)", - "tab": "Accuracy", - "CNN/DailyMail - Denoised inference time (s)": { - "description": "min=1.832, mean=2.011, max=2.216, sum=12.069 (6)", - "tab": "Efficiency", - "score": 2.011487112821144 - }, - "CNN/DailyMail - # eval": { - "description": "min=466, mean=466, max=466, sum=2796 (6)", - "tab": "General information", - "score": 466.0 - }, - "CNN/DailyMail - # train": { - "description": "min=5, mean=5, max=5, sum=30 (6)", - "tab": "General information", - "score": 5.0 - }, - "CNN/DailyMail - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (6)", - "tab": "General information", - "score": 0.0 - }, - "CNN/DailyMail - # prompt tokens": { - "description": "min=1203.032, mean=1213.032, max=1224.032, sum=7278.193 (6)", - "tab": "General information", - "score": 1213.0321888412018 - }, - "CNN/DailyMail - # output tokens": { - "description": "min=78.521, mean=89.614, max=102.401, sum=537.682 (6)", - "tab": "General information", - "score": 89.61373390557941 - }, - "CNN/DailyMail - # trials": { - "description": "min=3, mean=3, max=3, sum=18 (6)", - "tab": "General information", - "score": 3.0 - }, - "CNN/DailyMail - Stereotypes (race)": { - "description": "min=0.602, mean=0.632, max=0.648, sum=3.791 (6)", - "tab": "Bias", - "score": 0.6318145834093977 - }, - "CNN/DailyMail - Stereotypes (gender)": { - "description": "min=0.385, mean=0.391, max=0.396, sum=2.349 (6)", - "tab": "Bias", - "score": 0.3914278177516011 - }, - "CNN/DailyMail - Representation (race)": { - "description": "min=0.257, mean=0.302, max=0.354, sum=1.811 (6)", - "tab": "Bias", - "score": 0.3019033965877131 - }, - "CNN/DailyMail - Representation (gender)": { - "description": "min=0.135, mean=0.142, max=0.152, sum=0.851 (6)", - "tab": "Bias", - "score": 0.14183552076259287 - }, - "CNN/DailyMail - Toxic fraction": { - "description": "min=0, mean=0.001, max=0.004, sum=0.009 (6)", - "tab": "Toxicity", - "score": 0.001430615164520744 - }, - "CNN/DailyMail - SummaC": { - "description": "min=0.488, mean=0.512, max=0.535, sum=1.537 (3)", - "tab": "Summarization metrics", - "score": 0.5121705493530246 - }, - "CNN/DailyMail - QAFactEval": { - "description": "min=4.664, mean=4.716, max=4.749, sum=28.295 (6)", - "tab": "Summarization metrics", - "score": 4.715823146970394 - }, - "CNN/DailyMail - BERTScore (F1)": { - "description": "min=0.229, mean=0.248, max=0.272, sum=0.745 (3)", - "tab": "Summarization metrics", - "score": 0.2482954175661162 - }, - "CNN/DailyMail - Coverage": { - "description": "min=0.971, mean=0.977, max=0.985, sum=5.861 (6)", - "tab": "Summarization metrics", - "score": 0.9768840440430324 - }, - "CNN/DailyMail - Density": { - "description": "min=55.528, mean=71.654, max=97.831, sum=429.924 (6)", - "tab": "Summarization metrics", - "score": 71.65405587945487 - }, - "CNN/DailyMail - Compression": { - "description": "min=5.872, mean=7.632, max=9.373, 
sum=45.79 (6)", - "tab": "Summarization metrics", - "score": 7.631709472598792 - }, - "CNN/DailyMail - HumanEval-faithfulness": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-relevance": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-coherence": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "XSUM", - "source_data": { - "dataset_name": "XSUM", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on XSUM", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.102, - "details": { - "description": "min=0.095, mean=0.102, max=0.107, sum=0.612 (6)", - "tab": "Accuracy", - "XSUM - Denoised inference time (s)": { - "description": "min=0.896, mean=0.903, max=0.91, sum=5.418 (6)", - "tab": "Efficiency", - "score": 0.9030293349990619 - }, - "XSUM - # eval": { - "description": "min=518, mean=518, max=518, sum=3108 (6)", - "tab": "General information", - "score": 518.0 - }, - "XSUM - # train": { - "description": "min=5, mean=5, max=5, sum=30 (6)", - "tab": "General information", - "score": 5.0 - }, - "XSUM - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (6)", - "tab": "General information", - "score": 0.0 - }, - "XSUM - # prompt tokens": { - "description": "min=1099.388, mean=1133.388, max=1172.388, sum=6800.328 (6)", - "tab": "General information", - "score": 1133.388030888031 - }, - "XSUM - # output tokens": { - "description": "min=20.832, mean=21.299, max=21.809, sum=127.792 (6)", - "tab": "General information", - "score": 21.2985842985843 - }, - "XSUM - # trials": { - "description": "min=3, mean=3, max=3, sum=18 (6)", - "tab": "General information", - "score": 3.0 - }, - "XSUM - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=4.0 (6)", - "tab": "Bias", - "score": 0.6666666666666669 - }, - "XSUM - Stereotypes (gender)": { - "description": "min=0.397, mean=0.424, max=0.451, sum=2.547 (6)", - "tab": "Bias", - "score": 0.42449478248089356 - }, - "XSUM - Representation (race)": { - "description": "min=0.387, mean=0.426, max=0.467, sum=2.554 (6)", - "tab": "Bias", - "score": 0.4255855855855855 - }, - "XSUM - Representation (gender)": { - "description": "min=0.151, mean=0.172, max=0.189, sum=1.031 (6)", - "tab": "Bias", - "score": 0.1717873516720604 - }, - "XSUM - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (6)", - "tab": "Toxicity", - "score": 0.0 - }, - "XSUM - SummaC": { - "description": "min=-0.26, mean=-0.239, max=-0.222, sum=-0.716 (3)", - "tab": "Summarization metrics", - "score": -0.23866760351278402 - }, - "XSUM - QAFactEval": { - "description": "min=3.354, mean=3.675, max=4.009, sum=22.047 (6)", - "tab": "Summarization metrics", - "score": 3.674546888395078 - }, - "XSUM - BERTScore (F1)": { - "description": "min=0.393, mean=0.4, max=0.405, sum=1.2 (3)", - "tab": "Summarization metrics", - "score": 0.40004604044843806 - }, - "XSUM - Coverage": { - "description": "min=0.804, mean=0.808, max=0.813, sum=4.85 (6)", - "tab": "Summarization 
metrics", - "score": 0.8084128334077892 - }, - "XSUM - Density": { - "description": "min=3.618, mean=3.757, max=3.939, sum=22.541 (6)", - "tab": "Summarization metrics", - "score": 3.7567632334705046 - }, - "XSUM - Compression": { - "description": "min=17.523, mean=18.133, max=18.761, sum=108.8 (6)", - "tab": "Summarization metrics", - "score": 18.133322572088453 - }, - "XSUM - HumanEval-faithfulness": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-relevance": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-coherence": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "IMDB", - "source_data": { - "dataset_name": "IMDB", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on IMDB", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.956, - "details": { - "description": "min=0.951, mean=0.956, max=0.962, sum=2.869 (3)", - "tab": "Accuracy", - "IMDB - ECE (10-bin)": { - "description": "min=0.196, mean=0.213, max=0.234, sum=0.639 (3)", - "tab": "Calibration", - "score": 0.21314336064172376 - }, - "IMDB - EM (Robustness)": { - "description": "min=0.927, mean=0.932, max=0.936, sum=2.796 (3)", - "tab": "Robustness", - "score": 0.932 - }, - "IMDB - EM (Fairness)": { - "description": "min=0.939, mean=0.946, max=0.951, sum=2.839 (3)", - "tab": "Fairness", - "score": 0.9463333333333334 - }, - "IMDB - Denoised inference time (s)": { - "description": "min=0.531, mean=0.637, max=0.757, sum=1.911 (3)", - "tab": "Efficiency", - "score": 0.6371184251302079 - }, - "IMDB - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "IMDB - # train": { - "description": "min=4.915, mean=4.972, max=5, sum=14.915 (3)", - "tab": "General information", - "score": 4.971666666666667 - }, - "IMDB - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "IMDB - # prompt tokens": { - "description": "min=853.851, mean=1281.577, max=1725.03, sum=3844.732 (3)", - "tab": "General information", - "score": 1281.5773333333334 - }, - "IMDB - # output tokens": { - "description": "min=2, mean=2, max=2, sum=6 (3)", - "tab": "General information", - "score": 2.0 - }, - "IMDB - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "IMDB - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": 
"Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CivilComments", - "source_data": { - "dataset_name": "CivilComments", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on CivilComments", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.532, - "details": { - "description": "min=0, mean=0.532, max=0.996, sum=28.713 (54)", - "tab": "Accuracy", - "CivilComments - ECE (10-bin)": { - "description": "min=0.073, mean=0.377, max=0.573, sum=20.347 (54)", - "tab": "Calibration", - "score": 0.37680252478263027 - }, - "CivilComments - EM (Robustness)": { - "description": "min=0, mean=0.444, max=0.984, sum=23.966 (54)", - "tab": "Robustness", - "score": 0.4438230435194026 - }, - "CivilComments - EM (Fairness)": { - "description": "min=0, mean=0.447, max=0.962, sum=24.127 (54)", - "tab": "Fairness", - "score": 0.4468037461427085 - }, - "CivilComments - Denoised inference time (s)": { - "description": "min=0.338, mean=0.434, max=0.564, sum=23.454 (54)", - "tab": "Efficiency", - "score": 0.43432643222557377 - }, - "CivilComments - # eval": { - "description": "min=74, mean=371.556, max=683, sum=20064 (54)", - "tab": "General information", - "score": 371.55555555555554 - }, - "CivilComments - # train": { - "description": "min=5, mean=5, max=5, sum=270 (54)", - "tab": "General information", - "score": 5.0 - }, - "CivilComments - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (54)", - "tab": "General information", - "score": 0.0 - }, - "CivilComments - # prompt tokens": { - "description": "min=271.927, mean=532.602, max=942.498, sum=28760.487 (54)", - "tab": "General information", - "score": 532.6016121330534 - }, - "CivilComments - # output tokens": { - "description": "min=2, mean=2, max=2, sum=108 (54)", - "tab": "General information", - "score": 2.0 - }, - "CivilComments - # trials": { - "description": "min=3, mean=3, max=3, sum=162 (54)", - "tab": "General information", - "score": 3.0 - }, - "CivilComments - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (54)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "RAFT", - "source_data": { - "dataset_name": "RAFT", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on RAFT", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.545, - "details": { - "description": "min=0.15, mean=0.545, max=0.95, sum=18 (33)", - "tab": "Accuracy", - "RAFT - ECE (10-bin)": { - "description": "min=0.134, mean=0.269, max=0.513, sum=8.875 (33)", - "tab": "Calibration", - "score": 
0.2689468403025133 - }, - "RAFT - EM (Robustness)": { - "description": "min=0.025, mean=0.443, max=0.95, sum=14.625 (33)", - "tab": "Robustness", - "score": 0.4431818181818182 - }, - "RAFT - EM (Fairness)": { - "description": "min=0.15, mean=0.511, max=0.95, sum=16.85 (33)", - "tab": "Fairness", - "score": 0.5106060606060605 - }, - "RAFT - Denoised inference time (s)": { - "description": "min=0.312, mean=0.499, max=0.763, sum=16.476 (33)", - "tab": "Efficiency", - "score": 0.4992617404513889 - }, - "RAFT - # eval": { - "description": "min=40, mean=40, max=40, sum=1320 (33)", - "tab": "General information", - "score": 40.0 - }, - "RAFT - # train": { - "description": "min=0.95, mean=4.658, max=5, sum=153.7 (33)", - "tab": "General information", - "score": 4.657575757575757 - }, - "RAFT - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (33)", - "tab": "General information", - "score": 0.0 - }, - "RAFT - # prompt tokens": { - "description": "min=212.25, mean=712.248, max=1745.25, sum=23504.175 (33)", - "tab": "General information", - "score": 712.2477272727273 - }, - "RAFT - # output tokens": { - "description": "min=1.975, mean=3.499, max=7.025, sum=115.475 (33)", - "tab": "General information", - "score": 3.4992424242424245 - }, - "RAFT - # trials": { - "description": "min=3, mean=3, max=3, sum=99 (33)", - "tab": "General information", - "score": 3.0 - }, - "RAFT - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (33)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_classic/ai21/Jurassic-2-Grande-17B/cd1ec0ed-44cb-4e99-b58d-f026c3172f8c.json b/data/helm_classic/ai21/Jurassic-2-Grande-17B/cd1ec0ed-44cb-4e99-b58d-f026c3172f8c.json deleted file mode 100644 index 4f288f894..000000000 --- a/data/helm_classic/ai21/Jurassic-2-Grande-17B/cd1ec0ed-44cb-4e99-b58d-f026c3172f8c.json +++ /dev/null @@ -1,1613 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_classic/ai21_Jurassic-2-Grande-17B/1770834891.1472661", - "retrieved_timestamp": "1770834891.1472661", - "source_metadata": { - "source_name": "helm_classic", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Jurassic-2 Grande 17B", - "id": "ai21/Jurassic-2-Grande-17B", - "developer": "ai21", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_classic", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperform on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.743, - "details": { - "tab": "Accuracy", - "Mean win rate - Calibration": { - "description": null, - "tab": "Calibration", - "score": 0.6300647548566143 - }, - "Mean 
win rate - Robustness": { - "description": null, - "tab": "Robustness", - "score": 0.7641047680536001 - }, - "Mean win rate - Fairness": { - "description": null, - "tab": "Fairness", - "score": 0.7037362526239056 - }, - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": null - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - }, - "Mean win rate - Bias": { - "description": null, - "tab": "Bias", - "score": 0.561885097395068 - }, - "Mean win rate - Toxicity": { - "description": null, - "tab": "Toxicity", - "score": 0.3875874125874126 - }, - "Mean win rate - Summarization metrics": { - "description": null, - "tab": "Summarization metrics", - "score": 0.6710526315789473 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.475, - "details": { - "description": "min=0.24, mean=0.475, max=0.81, sum=7.13 (15)", - "tab": "Accuracy", - "MMLU - ECE (10-bin)": { - "description": "min=0.076, mean=0.134, max=0.172, sum=2.006 (15)", - "tab": "Calibration", - "score": 0.13373539597087636 - }, - "MMLU - EM (Robustness)": { - "description": "min=0.22, mean=0.411, max=0.68, sum=6.168 (15)", - "tab": "Robustness", - "score": 0.41120467836257313 - }, - "MMLU - EM (Fairness)": { - "description": "min=0.23, mean=0.433, max=0.73, sum=6.498 (15)", - "tab": "Fairness", - "score": 0.43321637426900583 - }, - "MMLU - Denoised inference time (s)": { - "description": "5 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=1542 (15)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=75 (15)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (15)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=308.59, mean=396.74, max=552.719, sum=5951.098 (15)", - "tab": "General information", - "score": 396.73985964912276 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=15 (15)", - "tab": "General information", - "score": 1.0 - }, - "MMLU - # trials": { - "description": "min=3, mean=3, max=3, sum=45 (15)", - "tab": "General information", - "score": 3.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "BoolQ", - "source_data": { - "dataset_name": "BoolQ", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on BoolQ", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.826, - "details": { - "description": "min=0.816, mean=0.826, max=0.832, sum=2.478 (3)", - "tab": "Accuracy", - "BoolQ - ECE (10-bin)": { - "description": "min=0.179, mean=0.209, max=0.243, sum=0.627 (3)", - 
"tab": "Calibration", - "score": 0.20883844550071148 - }, - "BoolQ - EM (Robustness)": { - "description": "min=0.714, mean=0.729, max=0.743, sum=2.187 (3)", - "tab": "Robustness", - "score": 0.729 - }, - "BoolQ - EM (Fairness)": { - "description": "min=0.758, mean=0.78, max=0.791, sum=2.34 (3)", - "tab": "Fairness", - "score": 0.7799999999999999 - }, - "BoolQ - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "BoolQ - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "BoolQ - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "BoolQ - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "BoolQ - # prompt tokens": { - "description": "min=506.985, mean=694.652, max=952.985, sum=2083.955 (3)", - "tab": "General information", - "score": 694.6516666666666 - }, - "BoolQ - # output tokens": { - "description": "min=2.002, mean=2.002, max=2.002, sum=6.006 (3)", - "tab": "General information", - "score": 2.002 - }, - "BoolQ - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "BoolQ - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.737, - "details": { - "description": "min=0.732, mean=0.737, max=0.744, sum=2.21 (3)", - "tab": "Accuracy", - "NarrativeQA - ECE (10-bin)": { - "description": "min=0.107, mean=0.126, max=0.158, sum=0.377 (3)", - "tab": "Calibration", - "score": 0.12569343029680938 - }, - "NarrativeQA - F1 (Robustness)": { - "description": "min=0.49, mean=0.583, max=0.65, sum=1.75 (3)", - "tab": "Robustness", - "score": 0.5834381641862693 - }, - "NarrativeQA - F1 (Fairness)": { - "description": "min=0.638, mean=0.645, max=0.651, sum=1.935 (3)", - "tab": "Fairness", - "score": 0.6449807868174807 - }, - "NarrativeQA - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=1065 (3)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=2.166, mean=2.639, max=3.225, sum=7.918 (3)", - "tab": "General information", - "score": 2.63943661971831 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General 
information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=1598.614, mean=1692.218, max=1777.299, sum=5076.654 (3)", - "tab": "General information", - "score": 1692.2178403755868 - }, - "NarrativeQA - # output tokens": { - "description": "min=5.039, mean=5.261, max=5.473, sum=15.783 (3)", - "tab": "General information", - "score": 5.261032863849765 - }, - "NarrativeQA - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NarrativeQA - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NarrativeQA - Stereotypes (gender)": { - "description": "min=0.4, mean=0.448, max=0.5, sum=1.344 (3)", - "tab": "Bias", - "score": 0.4481481481481482 - }, - "NarrativeQA - Representation (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=2 (3)", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "NarrativeQA - Representation (gender)": { - "description": "min=0.185, mean=0.196, max=0.205, sum=0.587 (3)", - "tab": "Bias", - "score": 0.19550967146595563 - }, - "NarrativeQA - Toxic fraction": { - "description": "min=0.017, mean=0.02, max=0.023, sum=0.059 (3)", - "tab": "Toxicity", - "score": 0.019718309859154928 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (open-book)", - "source_data": { - "dataset_name": "NaturalQuestions (open-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (open-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.639, - "details": { - "description": "min=0.627, mean=0.639, max=0.649, sum=1.918 (3)", - "tab": "Accuracy", - "NaturalQuestions (closed-book) - ECE (10-bin)": { - "description": "min=0.016, mean=0.018, max=0.019, sum=0.054 (3)", - "tab": "Calibration", - "score": 0.01803156970695322 - }, - "NaturalQuestions (open-book) - ECE (10-bin)": { - "description": "min=0.053, mean=0.063, max=0.072, sum=0.188 (3)", - "tab": "Calibration", - "score": 0.06257440554546793 - }, - "NaturalQuestions (closed-book) - F1 (Robustness)": { - "description": "min=0.277, mean=0.285, max=0.29, sum=0.854 (3)", - "tab": "Robustness", - "score": 0.28458982309414393 - }, - "NaturalQuestions (open-book) - F1 (Robustness)": { - "description": "min=0.555, mean=0.564, max=0.568, sum=1.691 (3)", - "tab": "Robustness", - "score": 0.5635162273229849 - }, - "NaturalQuestions (closed-book) - F1 (Fairness)": { - "description": "min=0.276, mean=0.283, max=0.288, sum=0.85 (3)", - "tab": "Fairness", - "score": 0.2832503879785802 - }, - "NaturalQuestions (open-book) - F1 (Fairness)": { - "description": "min=0.569, mean=0.584, max=0.592, sum=1.752 (3)", - "tab": "Fairness", - "score": 0.5839142853000876 - }, - "NaturalQuestions (closed-book) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NaturalQuestions (open-book) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - 
"NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=94.377, mean=99.377, max=102.377, sum=298.131 (3)", - "tab": "General information", - "score": 99.377 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=5.466, mean=6.315, max=6.864, sum=18.944 (3)", - "tab": "General information", - "score": 6.314666666666667 - }, - "NaturalQuestions (closed-book) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.568, mean=4.666, max=4.734, sum=13.999 (3)", - "tab": "General information", - "score": 4.666333333333333 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.038, mean=0.038, max=0.038, sum=0.114 (3)", - "tab": "General information", - "score": 0.038 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1136.933, mean=1418.457, max=1595.508, sum=4255.37 (3)", - "tab": "General information", - "score": 1418.4566666666667 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=5.441, mean=5.676, max=6.069, sum=17.029 (3)", - "tab": "General information", - "score": 5.676333333333333 - }, - "NaturalQuestions (open-book) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NaturalQuestions (closed-book) - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=2 (3)", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "NaturalQuestions (closed-book) - Stereotypes (gender)": { - "description": "min=0.5, mean=0.5, max=0.5, sum=1.5 (3)", - "tab": "Bias", - "score": 0.5 - }, - "NaturalQuestions (closed-book) - Representation (race)": { - "description": "min=0.431, mean=0.507, max=0.569, sum=1.52 (3)", - "tab": "Bias", - "score": 0.5067443890625439 - }, - "NaturalQuestions (closed-book) - Representation (gender)": { - "description": "min=0.1, mean=0.176, max=0.273, sum=0.527 (3)", - "tab": "Bias", - "score": 0.1755244755244755 - }, - "NaturalQuestions (open-book) - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=1.333 (2)", - "tab": "Bias", - "score": 0.6666666666666667 - }, - "NaturalQuestions (open-book) - Stereotypes (gender)": { - "description": "min=0.5, mean=0.5, max=0.5, sum=1.5 (3)", - "tab": "Bias", - "score": 0.5 - }, - "NaturalQuestions (open-book) - Representation (race)": { - "description": "min=0.431, mean=0.465, max=0.498, sum=1.395 (3)", - "tab": "Bias", - "score": 0.46507125832968527 - }, - "NaturalQuestions (open-book) - Representation (gender)": { - "description": "min=0.005, mean=0.03, max=0.053, sum=0.089 (3)", - "tab": "Bias", - "score": 0.02952187967385538 - }, - "NaturalQuestions (closed-book) - Toxic fraction": { - "description": "min=0, mean=0.0, max=0.001, sum=0.001 (3)", - "tab": "Toxicity", - "score": 0.0003333333333333333 - }, - "NaturalQuestions (open-book) - Toxic fraction": { - "description": "min=0, mean=0.0, 
max=0.001, sum=0.001 (3)", - "tab": "Toxicity", - "score": 0.0003333333333333333 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "QuAC", - "source_data": { - "dataset_name": "QuAC", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on QuAC", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.418, - "details": { - "description": "min=0.412, mean=0.418, max=0.429, sum=1.255 (3)", - "tab": "Accuracy", - "QuAC - ECE (10-bin)": { - "description": "min=0.027, mean=0.035, max=0.04, sum=0.105 (3)", - "tab": "Calibration", - "score": 0.03491339390127312 - }, - "QuAC - F1 (Robustness)": { - "description": "min=0.271, mean=0.276, max=0.281, sum=0.827 (3)", - "tab": "Robustness", - "score": 0.27557303329747496 - }, - "QuAC - F1 (Fairness)": { - "description": "min=0.335, mean=0.34, max=0.35, sum=1.02 (3)", - "tab": "Fairness", - "score": 0.34002521409765923 - }, - "QuAC - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "QuAC - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "QuAC - # train": { - "description": "min=1.788, mean=1.829, max=1.88, sum=5.486 (3)", - "tab": "General information", - "score": 1.8286666666666667 - }, - "QuAC - truncated": { - "description": "min=0.001, mean=0.001, max=0.001, sum=0.003 (3)", - "tab": "General information", - "score": 0.001 - }, - "QuAC - # prompt tokens": { - "description": "min=1645.856, mean=1698.711, max=1730.814, sum=5096.134 (3)", - "tab": "General information", - "score": 1698.7113333333334 - }, - "QuAC - # output tokens": { - "description": "min=22.04, mean=24.469, max=26.73, sum=73.408 (3)", - "tab": "General information", - "score": 24.469333333333335 - }, - "QuAC - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "QuAC - Stereotypes (race)": { - "description": "min=0.625, mean=0.64, max=0.651, sum=1.919 (3)", - "tab": "Bias", - "score": 0.6395502645502645 - }, - "QuAC - Stereotypes (gender)": { - "description": "min=0.389, mean=0.422, max=0.455, sum=1.267 (3)", - "tab": "Bias", - "score": 0.4224807266199369 - }, - "QuAC - Representation (race)": { - "description": "min=0.183, mean=0.23, max=0.263, sum=0.689 (3)", - "tab": "Bias", - "score": 0.22977891012599364 - }, - "QuAC - Representation (gender)": { - "description": "min=0.223, mean=0.224, max=0.225, sum=0.673 (3)", - "tab": "Bias", - "score": 0.22430144583085757 - }, - "QuAC - Toxic fraction": { - "description": "min=0.002, mean=0.003, max=0.004, sum=0.009 (3)", - "tab": "Toxicity", - "score": 0.0030000000000000005 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "HellaSwag", - "source_data": { - "dataset_name": "HellaSwag", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on HellaSwag", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.781, - "details": { - "description": 
"min=0.781, mean=0.781, max=0.781, sum=0.781 (1)", - "tab": "Accuracy", - "HellaSwag - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "HellaSwag - EM (Robustness)": { - "description": "min=0.755, mean=0.755, max=0.755, sum=0.755 (1)", - "tab": "Robustness", - "score": 0.755 - }, - "HellaSwag - EM (Fairness)": { - "description": "min=0.632, mean=0.632, max=0.632, sum=0.632 (1)", - "tab": "Fairness", - "score": 0.632 - }, - "HellaSwag - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "HellaSwag - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "HellaSwag - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag - # prompt tokens": { - "description": "min=62.466, mean=62.466, max=62.466, sum=62.466 (1)", - "tab": "General information", - "score": 62.466 - }, - "HellaSwag - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.542, - "details": { - "description": "min=0.542, mean=0.542, max=0.542, sum=0.542 (1)", - "tab": "Accuracy", - "OpenbookQA - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "OpenbookQA - EM (Robustness)": { - "description": "min=0.474, mean=0.474, max=0.474, sum=0.474 (1)", - "tab": "Robustness", - "score": 0.474 - }, - "OpenbookQA - EM (Fairness)": { - "description": "min=0.466, mean=0.466, max=0.466, sum=0.466 (1)", - "tab": "Fairness", - "score": 0.466 - }, - "OpenbookQA - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=4.348, mean=4.348, max=4.348, sum=4.348 (1)", - "tab": "General information", - "score": 4.348 - }, - "OpenbookQA - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - 
}, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "TruthfulQA", - "source_data": { - "dataset_name": "TruthfulQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on TruthfulQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.348, - "details": { - "description": "min=0.287, mean=0.348, max=0.384, sum=1.043 (3)", - "tab": "Accuracy", - "TruthfulQA - ECE (10-bin)": { - "description": "min=0.073, mean=0.097, max=0.142, sum=0.291 (3)", - "tab": "Calibration", - "score": 0.09707246189445913 - }, - "TruthfulQA - EM (Robustness)": { - "description": "min=0.245, mean=0.293, max=0.326, sum=0.878 (3)", - "tab": "Robustness", - "score": 0.29255861365953106 - }, - "TruthfulQA - EM (Fairness)": { - "description": "min=0.242, mean=0.29, max=0.32, sum=0.87 (3)", - "tab": "Fairness", - "score": 0.2900101936799185 - }, - "TruthfulQA - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "TruthfulQA - # eval": { - "description": "min=654, mean=654, max=654, sum=1962 (3)", - "tab": "General information", - "score": 654.0 - }, - "TruthfulQA - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "TruthfulQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "TruthfulQA - # prompt tokens": { - "description": "min=317.682, mean=355.015, max=375.682, sum=1065.046 (3)", - "tab": "General information", - "score": 355.0152905198777 - }, - "TruthfulQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=3 (3)", - "tab": "General information", - "score": 1.0 - }, - "TruthfulQA - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MS MARCO (TREC)", - "source_data": { - "dataset_name": "MS MARCO (TREC)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "NDCG@10 on MS MARCO (TREC)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.514, - "details": { - "description": "min=0.473, mean=0.514, max=0.577, sum=1.543 (3)", - "tab": "Accuracy", - "MS MARCO (regular) - RR@10 (Robustness)": { - "description": "min=0.18, mean=0.227, max=0.253, sum=0.681 (3)", - "tab": "Robustness", - "score": 0.22687976190476158 - }, - "MS MARCO (TREC) - NDCG@10 (Robustness)": { - "description": "min=0.406, mean=0.423, max=0.451, sum=1.269 (3)", - "tab": "Robustness", - "score": 0.42305953691791237 - }, - "MS MARCO (regular) - RR@10 (Fairness)": { - "description": "min=0.206, mean=0.243, max=0.271, sum=0.728 (3)", - "tab": "Fairness", - "score": 0.242712169312169 - }, - "MS MARCO (TREC) - NDCG@10 (Fairness)": { - "description": "min=0.438, mean=0.471, max=0.522, sum=1.413 (3)", - "tab": "Fairness", - "score": 0.47089412794287994 - }, - "MS MARCO (regular) - Denoised inference time (s)": { - "description": "1 matching runs, but no 
matching metrics", - "tab": "Efficiency", - "score": null - }, - "MS MARCO (TREC) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "MS MARCO (regular) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "MS MARCO (regular) - # train": { - "description": "min=2, mean=2, max=2, sum=6 (3)", - "tab": "General information", - "score": 2.0 - }, - "MS MARCO (regular) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "MS MARCO (regular) - # prompt tokens": { - "description": "min=349.303, mean=385.636, max=423.303, sum=1156.909 (3)", - "tab": "General information", - "score": 385.63633333333337 - }, - "MS MARCO (regular) - # output tokens": { - "description": "min=2.003, mean=2.006, max=2.008, sum=6.017 (3)", - "tab": "General information", - "score": 2.005666666666667 - }, - "MS MARCO (regular) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "MS MARCO (TREC) - # eval": { - "description": "min=43, mean=43, max=43, sum=129 (3)", - "tab": "General information", - "score": 43.0 - }, - "MS MARCO (TREC) - # train": { - "description": "min=2, mean=2, max=2, sum=6 (3)", - "tab": "General information", - "score": 2.0 - }, - "MS MARCO (TREC) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "MS MARCO (TREC) - # prompt tokens": { - "description": "min=337.047, mean=373.38, max=411.047, sum=1120.14 (3)", - "tab": "General information", - "score": 373.3798449612403 - }, - "MS MARCO (TREC) - # output tokens": { - "description": "min=2.023, mean=2.023, max=2.023, sum=6.07 (3)", - "tab": "General information", - "score": 2.0232558139534884 - }, - "MS MARCO (TREC) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "MS MARCO (regular) - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - }, - "MS MARCO (TREC) - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - 
"generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CNN/DailyMail", - "source_data": { - "dataset_name": "CNN/DailyMail", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on CNN/DailyMail", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.144, - "details": { - "description": "min=0.131, mean=0.144, max=0.153, sum=0.865 (6)", - "tab": "Accuracy", - "CNN/DailyMail - Denoised inference time (s)": { - "description": "2 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "CNN/DailyMail - # eval": { - "description": "min=466, mean=466, max=466, sum=2796 (6)", - "tab": "General information", - "score": 466.0 - }, - "CNN/DailyMail - # train": { - "description": "min=5, mean=5, max=5, sum=30 (6)", - "tab": "General information", - "score": 5.0 - }, - "CNN/DailyMail - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (6)", - "tab": "General information", - "score": 0.0 - }, - "CNN/DailyMail - # prompt tokens": { - "description": "min=1203.032, mean=1213.032, max=1224.032, sum=7278.193 (6)", - "tab": "General information", - "score": 1213.0321888412018 - }, - "CNN/DailyMail - # output tokens": { - "description": "min=48.987, mean=55.762, max=59.891, sum=334.571 (6)", - "tab": "General information", - "score": 55.76180257510729 - }, - "CNN/DailyMail - # trials": { - "description": "min=3, mean=3, max=3, sum=18 (6)", - "tab": "General information", - "score": 3.0 - }, - "CNN/DailyMail - Stereotypes (race)": { - "description": "min=0.619, mean=0.636, max=0.667, sum=3.817 (6)", - "tab": "Bias", - "score": 0.6361416361416362 - }, - "CNN/DailyMail - Stereotypes (gender)": { - "description": "min=0.386, mean=0.402, max=0.424, sum=2.411 (6)", - "tab": "Bias", - "score": 0.4017992121362035 - }, - "CNN/DailyMail - Representation (race)": { - "description": "min=0.338, mean=0.359, max=0.379, sum=2.152 (6)", - "tab": "Bias", - "score": 0.3586894722560466 - }, - "CNN/DailyMail - Representation (gender)": { - "description": "min=0.099, mean=0.117, max=0.128, sum=0.701 (6)", - "tab": "Bias", - "score": 0.11681135928174619 - }, - "CNN/DailyMail - Toxic fraction": { - "description": "min=0.002, mean=0.003, max=0.004, sum=0.017 (6)", - "tab": "Toxicity", - "score": 0.002861230329041488 - }, - "CNN/DailyMail - SummaC": { - "description": "min=0.469, mean=0.503, max=0.535, sum=1.51 (3)", - "tab": "Summarization metrics", - "score": 0.5032610058862116 - }, - "CNN/DailyMail - QAFactEval": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - BERTScore (F1)": { - "description": "min=0.281, mean=0.299, max=0.308, sum=0.896 (3)", - "tab": "Summarization metrics", - "score": 0.2987736324577836 - }, - "CNN/DailyMail - Coverage": { - "description": "min=0.953, mean=0.96, max=0.965, sum=5.76 (6)", - "tab": "Summarization metrics", - "score": 0.9600651009447835 - }, - "CNN/DailyMail - Density": { - "description": "min=14.681, mean=22.305, max=27.564, sum=133.827 (6)", - "tab": "Summarization metrics", - "score": 22.304503793993888 - }, - "CNN/DailyMail - Compression": { - "description": "min=10.404, mean=11.399, max=13.033, sum=68.393 (6)", - "tab": "Summarization metrics", - "score": 11.39877050033896 - }, - "CNN/DailyMail 
- HumanEval-faithfulness": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-relevance": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-coherence": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "XSUM", - "source_data": { - "dataset_name": "XSUM", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on XSUM", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.167, - "details": { - "description": "min=0.164, mean=0.167, max=0.173, sum=1.005 (6)", - "tab": "Accuracy", - "XSUM - Denoised inference time (s)": { - "description": "2 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "XSUM - # eval": { - "description": "min=518, mean=518, max=518, sum=3108 (6)", - "tab": "General information", - "score": 518.0 - }, - "XSUM - # train": { - "description": "min=5, mean=5, max=5, sum=30 (6)", - "tab": "General information", - "score": 5.0 - }, - "XSUM - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (6)", - "tab": "General information", - "score": 0.0 - }, - "XSUM - # prompt tokens": { - "description": "min=1099.388, mean=1133.388, max=1172.388, sum=6800.328 (6)", - "tab": "General information", - "score": 1133.388030888031 - }, - "XSUM - # output tokens": { - "description": "min=21.463, mean=21.75, max=22.241, sum=130.502 (6)", - "tab": "General information", - "score": 21.75032175032175 - }, - "XSUM - # trials": { - "description": "min=3, mean=3, max=3, sum=18 (6)", - "tab": "General information", - "score": 3.0 - }, - "XSUM - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=4 (6)", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "XSUM - Stereotypes (gender)": { - "description": "min=0.445, mean=0.456, max=0.463, sum=2.736 (6)", - "tab": "Bias", - "score": 0.4559853927203065 - }, - "XSUM - Representation (race)": { - "description": "min=0.362, mean=0.466, max=0.532, sum=2.798 (6)", - "tab": "Bias", - "score": 0.4664089053990878 - }, - "XSUM - Representation (gender)": { - "description": "min=0.192, mean=0.207, max=0.233, sum=1.24 (6)", - "tab": "Bias", - "score": 0.2066101848280066 - }, - "XSUM - Toxic fraction": { - "description": "min=0, mean=0.001, max=0.002, sum=0.004 (6)", - "tab": "Toxicity", - "score": 0.0006435006435006435 - }, - "XSUM - SummaC": { - "description": "min=-0.31, mean=-0.289, max=-0.268, sum=-0.868 (3)", - "tab": "Summarization metrics", - "score": -0.2893415716573027 - }, - "XSUM - QAFactEval": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - BERTScore (F1)": { - "description": "min=0.47, mean=0.475, max=0.48, sum=1.424 (3)", - "tab": "Summarization metrics", - "score": 0.474663326872436 - }, - "XSUM - Coverage": { - "description": "min=0.761, mean=0.766, max=0.771, sum=4.596 (6)", - "tab": "Summarization metrics", - "score": 0.7660021617230298 - }, - "XSUM - Density": { - "description": "min=2.196, mean=2.36, 
max=2.464, sum=14.158 (6)", - "tab": "Summarization metrics", - "score": 2.359653576011524 - }, - "XSUM - Compression": { - "description": "min=16.605, mean=17.045, max=17.3, sum=102.267 (6)", - "tab": "Summarization metrics", - "score": 17.044545661784866 - }, - "XSUM - HumanEval-faithfulness": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-relevance": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-coherence": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "IMDB", - "source_data": { - "dataset_name": "IMDB", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on IMDB", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.938, - "details": { - "description": "min=0.926, mean=0.938, max=0.954, sum=2.814 (3)", - "tab": "Accuracy", - "IMDB - ECE (10-bin)": { - "description": "min=0.088, mean=0.111, max=0.153, sum=0.333 (3)", - "tab": "Calibration", - "score": 0.11088831926219649 - }, - "IMDB - EM (Robustness)": { - "description": "min=0.915, mean=0.928, max=0.949, sum=2.784 (3)", - "tab": "Robustness", - "score": 0.9279999999999999 - }, - "IMDB - EM (Fairness)": { - "description": "min=0.92, mean=0.931, max=0.951, sum=2.792 (3)", - "tab": "Fairness", - "score": 0.9306666666666666 - }, - "IMDB - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "IMDB - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "IMDB - # train": { - "description": "min=4.915, mean=4.972, max=5, sum=14.915 (3)", - "tab": "General information", - "score": 4.971666666666667 - }, - "IMDB - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "IMDB - # prompt tokens": { - "description": "min=853.851, mean=1281.577, max=1725.03, sum=3844.732 (3)", - "tab": "General information", - "score": 1281.5773333333334 - }, - "IMDB - # output tokens": { - "description": "min=2, mean=2, max=2, sum=6 (3)", - "tab": "General information", - "score": 2.0 - }, - "IMDB - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "IMDB - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - 
"evaluation_name": "CivilComments", - "source_data": { - "dataset_name": "CivilComments", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on CivilComments", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.547, - "details": { - "description": "min=0.011, mean=0.547, max=0.998, sum=29.525 (54)", - "tab": "Accuracy", - "CivilComments - ECE (10-bin)": { - "description": "min=0.097, mean=0.381, max=0.605, sum=20.56 (54)", - "tab": "Calibration", - "score": 0.38073513412444826 - }, - "CivilComments - EM (Robustness)": { - "description": "min=0, mean=0.488, max=0.986, sum=26.326 (54)", - "tab": "Robustness", - "score": 0.4875180109221431 - }, - "CivilComments - EM (Fairness)": { - "description": "min=0, mean=0.445, max=0.973, sum=24.007 (54)", - "tab": "Fairness", - "score": 0.44457169485758724 - }, - "CivilComments - Denoised inference time (s)": { - "description": "9 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "CivilComments - # eval": { - "description": "min=74, mean=371.556, max=683, sum=20064 (54)", - "tab": "General information", - "score": 371.55555555555554 - }, - "CivilComments - # train": { - "description": "min=5, mean=5, max=5, sum=270 (54)", - "tab": "General information", - "score": 5.0 - }, - "CivilComments - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (54)", - "tab": "General information", - "score": 0.0 - }, - "CivilComments - # prompt tokens": { - "description": "min=271.927, mean=532.602, max=942.498, sum=28760.487 (54)", - "tab": "General information", - "score": 532.6016121330534 - }, - "CivilComments - # output tokens": { - "description": "min=2, mean=2, max=2, sum=108 (54)", - "tab": "General information", - "score": 2.0 - }, - "CivilComments - # trials": { - "description": "min=3, mean=3, max=3, sum=162 (54)", - "tab": "General information", - "score": 3.0 - }, - "CivilComments - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Toxic fraction": { - "description": "9 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "RAFT", - "source_data": { - "dataset_name": "RAFT", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on RAFT", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.712, - "details": { - "description": "min=0.225, mean=0.712, max=0.975, sum=23.5 (33)", - "tab": "Accuracy", - "RAFT - ECE (10-bin)": { - "description": "min=0.119, mean=0.232, max=0.581, sum=7.664 (33)", - "tab": "Calibration", - "score": 0.23222744852932867 - }, - "RAFT - EM (Robustness)": { - "description": "min=0.025, mean=0.618, max=0.875, sum=20.4 (33)", - 
"tab": "Robustness", - "score": 0.6181818181818182 - }, - "RAFT - EM (Fairness)": { - "description": "min=0.2, mean=0.689, max=0.975, sum=22.725 (33)", - "tab": "Fairness", - "score": 0.6886363636363637 - }, - "RAFT - Denoised inference time (s)": { - "description": "11 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "RAFT - # eval": { - "description": "min=40, mean=40, max=40, sum=1320 (33)", - "tab": "General information", - "score": 40.0 - }, - "RAFT - # train": { - "description": "min=0.95, mean=4.658, max=5, sum=153.7 (33)", - "tab": "General information", - "score": 4.657575757575757 - }, - "RAFT - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (33)", - "tab": "General information", - "score": 0.0 - }, - "RAFT - # prompt tokens": { - "description": "min=212.25, mean=712.248, max=1745.25, sum=23504.175 (33)", - "tab": "General information", - "score": 712.2477272727273 - }, - "RAFT - # output tokens": { - "description": "min=1.95, mean=3.644, max=6.3, sum=120.25 (33)", - "tab": "General information", - "score": 3.643939393939394 - }, - "RAFT - # trials": { - "description": "min=3, mean=3, max=3, sum=99 (33)", - "tab": "General information", - "score": 3.0 - }, - "RAFT - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Toxic fraction": { - "description": "11 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_classic/ai21/Jurassic-2-Jumbo-178B/13a22d40-f274-4384-adcc-1539da821c6a.json b/data/helm_classic/ai21/Jurassic-2-Jumbo-178B/13a22d40-f274-4384-adcc-1539da821c6a.json deleted file mode 100644 index 6d0308b9f..000000000 --- a/data/helm_classic/ai21/Jurassic-2-Jumbo-178B/13a22d40-f274-4384-adcc-1539da821c6a.json +++ /dev/null @@ -1,1613 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_classic/ai21_Jurassic-2-Jumbo-178B/1770834891.1472661", - "retrieved_timestamp": "1770834891.1472661", - "source_metadata": { - "source_name": "helm_classic", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Jurassic-2 Jumbo 178B", - "id": "ai21/Jurassic-2-Jumbo-178B", - "developer": "ai21", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_classic", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperform on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.824, - "details": { - "tab": "Accuracy", - "Mean win rate - Calibration": { - "description": null, - "tab": "Calibration", - "score": 0.6597594819611471 - }, - "Mean win rate - Robustness": { - "description": null, - "tab": "Robustness", - "score": 0.7910296229539834 - }, - "Mean win rate - Fairness": { - 
"description": null, - "tab": "Fairness", - "score": 0.8360206534288848 - }, - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": null - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - }, - "Mean win rate - Bias": { - "description": null, - "tab": "Bias", - "score": 0.5968189835436076 - }, - "Mean win rate - Toxicity": { - "description": null, - "tab": "Toxicity", - "score": 0.5064102564102564 - }, - "Mean win rate - Summarization metrics": { - "description": null, - "tab": "Summarization metrics", - "score": 0.6447368421052632 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.48, - "details": { - "description": "min=0.23, mean=0.48, max=0.83, sum=7.207 (15)", - "tab": "Accuracy", - "MMLU - ECE (10-bin)": { - "description": "min=0.056, mean=0.137, max=0.248, sum=2.059 (15)", - "tab": "Calibration", - "score": 0.13723997934779486 - }, - "MMLU - EM (Robustness)": { - "description": "min=0.17, mean=0.417, max=0.75, sum=6.251 (15)", - "tab": "Robustness", - "score": 0.41671345029239765 - }, - "MMLU - EM (Fairness)": { - "description": "min=0.21, mean=0.45, max=0.78, sum=6.75 (15)", - "tab": "Fairness", - "score": 0.44997660818713453 - }, - "MMLU - Denoised inference time (s)": { - "description": "5 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=1542 (15)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=75 (15)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (15)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=308.59, mean=396.74, max=552.719, sum=5951.098 (15)", - "tab": "General information", - "score": 396.73985964912276 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=15 (15)", - "tab": "General information", - "score": 1.0 - }, - "MMLU - # trials": { - "description": "min=3, mean=3, max=3, sum=45 (15)", - "tab": "General information", - "score": 3.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "BoolQ", - "source_data": { - "dataset_name": "BoolQ", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on BoolQ", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.829, - "details": { - "description": "min=0.818, mean=0.829, max=0.838, sum=2.487 (3)", - "tab": "Accuracy", - "BoolQ - ECE (10-bin)": { - "description": "min=0.163, mean=0.175, max=0.198, sum=0.526 (3)", - "tab": "Calibration", - "score": 0.17545319159294462 - }, - "BoolQ - EM (Robustness)": { - "description": "min=0.72, mean=0.729, max=0.736, 
sum=2.188 (3)", - "tab": "Robustness", - "score": 0.7293333333333333 - }, - "BoolQ - EM (Fairness)": { - "description": "min=0.78, mean=0.792, max=0.798, sum=2.375 (3)", - "tab": "Fairness", - "score": 0.7916666666666666 - }, - "BoolQ - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "BoolQ - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "BoolQ - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "BoolQ - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "BoolQ - # prompt tokens": { - "description": "min=506.985, mean=694.652, max=952.985, sum=2083.955 (3)", - "tab": "General information", - "score": 694.6516666666666 - }, - "BoolQ - # output tokens": { - "description": "min=2, mean=2.002, max=2.003, sum=6.005 (3)", - "tab": "General information", - "score": 2.0016666666666665 - }, - "BoolQ - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "BoolQ - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.733, - "details": { - "description": "min=0.715, mean=0.733, max=0.757, sum=2.2 (3)", - "tab": "Accuracy", - "NarrativeQA - ECE (10-bin)": { - "description": "min=0.068, mean=0.073, max=0.076, sum=0.219 (3)", - "tab": "Calibration", - "score": 0.07310994320832209 - }, - "NarrativeQA - F1 (Robustness)": { - "description": "min=0.627, mean=0.66, max=0.69, sum=1.98 (3)", - "tab": "Robustness", - "score": 0.6601600341725052 - }, - "NarrativeQA - F1 (Fairness)": { - "description": "min=0.63, mean=0.658, max=0.69, sum=1.973 (3)", - "tab": "Fairness", - "score": 0.6577011654908803 - }, - "NarrativeQA - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=1065 (3)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=2534.434, mean=2818.1, max=3027.434, sum=8454.301 (3)", - 
"tab": "General information", - "score": 2818.1004694835683 - }, - "NarrativeQA - # output tokens": { - "description": "min=4.879, mean=6.406, max=7.755, sum=19.217 (3)", - "tab": "General information", - "score": 6.405633802816901 - }, - "NarrativeQA - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NarrativeQA - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NarrativeQA - Stereotypes (gender)": { - "description": "min=0.385, mean=0.43, max=0.5, sum=1.29 (3)", - "tab": "Bias", - "score": 0.4298611111111111 - }, - "NarrativeQA - Representation (race)": { - "description": "min=0.333, mean=0.5, max=0.667, sum=1 (2)", - "tab": "Bias", - "score": 0.5 - }, - "NarrativeQA - Representation (gender)": { - "description": "min=0.171, mean=0.183, max=0.192, sum=0.55 (3)", - "tab": "Bias", - "score": 0.18345814920903128 - }, - "NarrativeQA - Toxic fraction": { - "description": "min=0.014, mean=0.017, max=0.02, sum=0.051 (3)", - "tab": "Toxicity", - "score": 0.016901408450704227 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (open-book)", - "source_data": { - "dataset_name": "NaturalQuestions (open-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (open-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.669, - "details": { - "description": "min=0.65, mean=0.669, max=0.681, sum=2.007 (3)", - "tab": "Accuracy", - "NaturalQuestions (closed-book) - ECE (10-bin)": { - "description": "min=0.018, mean=0.018, max=0.019, sum=0.054 (3)", - "tab": "Calibration", - "score": 0.018133452831606698 - }, - "NaturalQuestions (open-book) - ECE (10-bin)": { - "description": "min=0.071, mean=0.073, max=0.076, sum=0.22 (3)", - "tab": "Calibration", - "score": 0.07345259187429393 - }, - "NaturalQuestions (closed-book) - F1 (Robustness)": { - "description": "min=0.31, mean=0.315, max=0.318, sum=0.945 (3)", - "tab": "Robustness", - "score": 0.3150688575152197 - }, - "NaturalQuestions (open-book) - F1 (Robustness)": { - "description": "min=0.576, mean=0.599, max=0.616, sum=1.796 (3)", - "tab": "Robustness", - "score": 0.5985032886794094 - }, - "NaturalQuestions (closed-book) - F1 (Fairness)": { - "description": "min=0.326, mean=0.327, max=0.328, sum=0.982 (3)", - "tab": "Fairness", - "score": 0.32739768950953246 - }, - "NaturalQuestions (open-book) - F1 (Fairness)": { - "description": "min=0.601, mean=0.62, max=0.633, sum=1.86 (3)", - "tab": "Fairness", - "score": 0.6201543217700605 - }, - "NaturalQuestions (closed-book) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NaturalQuestions (open-book) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - 
truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=94.377, mean=99.377, max=102.377, sum=298.131 (3)", - "tab": "General information", - "score": 99.377 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=5.127, mean=5.365, max=5.79, sum=16.095 (3)", - "tab": "General information", - "score": 5.364999999999999 - }, - "NaturalQuestions (closed-book) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.928, mean=4.93, max=4.932, sum=14.791 (3)", - "tab": "General information", - "score": 4.9303333333333335 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.012, mean=0.012, max=0.012, sum=0.036 (3)", - "tab": "General information", - "score": 0.012000000000000002 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1254.565, mean=1571.171, max=1771.274, sum=4713.512 (3)", - "tab": "General information", - "score": 1571.1706666666669 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=4.785, mean=5.113, max=5.399, sum=15.338 (3)", - "tab": "General information", - "score": 5.112666666666667 - }, - "NaturalQuestions (open-book) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NaturalQuestions (closed-book) - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NaturalQuestions (closed-book) - Stereotypes (gender)": { - "description": "min=0.5, mean=0.5, max=0.5, sum=1.5 (3)", - "tab": "Bias", - "score": 0.5 - }, - "NaturalQuestions (closed-book) - Representation (race)": { - "description": "min=0.352, mean=0.376, max=0.405, sum=1.127 (3)", - "tab": "Bias", - "score": 0.3756261756261756 - }, - "NaturalQuestions (closed-book) - Representation (gender)": { - "description": "min=0.033, mean=0.095, max=0.136, sum=0.285 (3)", - "tab": "Bias", - "score": 0.09502719502719503 - }, - "NaturalQuestions (open-book) - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=0.667 (1)", - "tab": "Bias", - "score": 0.6666666666666667 - }, - "NaturalQuestions (open-book) - Stereotypes (gender)": { - "description": "min=0.3, mean=0.413, max=0.5, sum=1.238 (3)", - "tab": "Bias", - "score": 0.41250000000000003 - }, - "NaturalQuestions (open-book) - Representation (race)": { - "description": "min=0.514, mean=0.541, max=0.561, sum=1.624 (3)", - "tab": "Bias", - "score": 0.5414311179017061 - }, - "NaturalQuestions (open-book) - Representation (gender)": { - "description": "min=0.06, mean=0.107, max=0.132, sum=0.321 (3)", - "tab": "Bias", - "score": 0.10706952566601687 - }, - "NaturalQuestions (closed-book) - Toxic fraction": { - "description": "min=0, mean=0.001, max=0.001, sum=0.002 (3)", - "tab": "Toxicity", - "score": 0.0006666666666666666 - }, - "NaturalQuestions (open-book) - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "QuAC", - "source_data": { - "dataset_name": "QuAC", - 
"source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on QuAC", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.435, - "details": { - "description": "min=0.426, mean=0.435, max=0.446, sum=1.305 (3)", - "tab": "Accuracy", - "QuAC - ECE (10-bin)": { - "description": "min=0.032, mean=0.035, max=0.037, sum=0.104 (3)", - "tab": "Calibration", - "score": 0.03466023181877799 - }, - "QuAC - F1 (Robustness)": { - "description": "min=0.31, mean=0.314, max=0.316, sum=0.941 (3)", - "tab": "Robustness", - "score": 0.3135172870245195 - }, - "QuAC - F1 (Fairness)": { - "description": "min=0.333, mean=0.34, max=0.348, sum=1.02 (3)", - "tab": "Fairness", - "score": 0.34006270092560414 - }, - "QuAC - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "QuAC - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "QuAC - # train": { - "description": "min=4.999, mean=5.0, max=5, sum=14.999 (3)", - "tab": "General information", - "score": 4.999666666666666 - }, - "QuAC - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "QuAC - # prompt tokens": { - "description": "min=3587.32, mean=4018.779, max=4568.698, sum=12056.338 (3)", - "tab": "General information", - "score": 4018.7793333333334 - }, - "QuAC - # output tokens": { - "description": "min=21.621, mean=22.178, max=22.826, sum=66.533 (3)", - "tab": "General information", - "score": 22.177666666666664 - }, - "QuAC - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "QuAC - Stereotypes (race)": { - "description": "min=0.615, mean=0.642, max=0.667, sum=1.925 (3)", - "tab": "Bias", - "score": 0.6416361416361417 - }, - "QuAC - Stereotypes (gender)": { - "description": "min=0.425, mean=0.454, max=0.476, sum=1.363 (3)", - "tab": "Bias", - "score": 0.45448951168627727 - }, - "QuAC - Representation (race)": { - "description": "min=0.342, mean=0.359, max=0.375, sum=1.078 (3)", - "tab": "Bias", - "score": 0.35949126363389555 - }, - "QuAC - Representation (gender)": { - "description": "min=0.22, mean=0.232, max=0.241, sum=0.696 (3)", - "tab": "Bias", - "score": 0.23190752816365634 - }, - "QuAC - Toxic fraction": { - "description": "min=0, mean=0.001, max=0.002, sum=0.003 (3)", - "tab": "Toxicity", - "score": 0.001 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "HellaSwag", - "source_data": { - "dataset_name": "HellaSwag", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on HellaSwag", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.788, - "details": { - "description": "min=0.788, mean=0.788, max=0.788, sum=0.788 (1)", - "tab": "Accuracy", - "HellaSwag - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "HellaSwag - EM (Robustness)": { - "description": "min=0.754, 
mean=0.754, max=0.754, sum=0.754 (1)", - "tab": "Robustness", - "score": 0.754 - }, - "HellaSwag - EM (Fairness)": { - "description": "min=0.655, mean=0.655, max=0.655, sum=0.655 (1)", - "tab": "Fairness", - "score": 0.655 - }, - "HellaSwag - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "HellaSwag - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "HellaSwag - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag - # prompt tokens": { - "description": "min=62.466, mean=62.466, max=62.466, sum=62.466 (1)", - "tab": "General information", - "score": 62.466 - }, - "HellaSwag - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.558, - "details": { - "description": "min=0.558, mean=0.558, max=0.558, sum=0.558 (1)", - "tab": "Accuracy", - "OpenbookQA - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "OpenbookQA - EM (Robustness)": { - "description": "min=0.47, mean=0.47, max=0.47, sum=0.47 (1)", - "tab": "Robustness", - "score": 0.47 - }, - "OpenbookQA - EM (Fairness)": { - "description": "min=0.488, mean=0.488, max=0.488, sum=0.488 (1)", - "tab": "Fairness", - "score": 0.488 - }, - "OpenbookQA - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=4.348, mean=4.348, max=4.348, sum=4.348 (1)", - "tab": "General information", - "score": 4.348 - }, - "OpenbookQA - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "TruthfulQA", - "source_data": { - "dataset_name": "TruthfulQA", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on TruthfulQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.437, - "details": { - "description": "min=0.367, mean=0.437, max=0.485, sum=1.312 (3)", - "tab": "Accuracy", - "TruthfulQA - ECE (10-bin)": { - "description": "min=0.049, mean=0.068, max=0.095, sum=0.203 (3)", - "tab": "Calibration", - "score": 0.06751578986419772 - }, - "TruthfulQA - EM (Robustness)": { - "description": "min=0.326, mean=0.39, max=0.43, sum=1.17 (3)", - "tab": "Robustness", - "score": 0.38990825688073394 - }, - "TruthfulQA - EM (Fairness)": { - "description": "min=0.289, mean=0.354, max=0.398, sum=1.063 (3)", - "tab": "Fairness", - "score": 0.35423037716615696 - }, - "TruthfulQA - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "TruthfulQA - # eval": { - "description": "min=654, mean=654, max=654, sum=1962 (3)", - "tab": "General information", - "score": 654.0 - }, - "TruthfulQA - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "TruthfulQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "TruthfulQA - # prompt tokens": { - "description": "min=317.682, mean=355.015, max=375.682, sum=1065.046 (3)", - "tab": "General information", - "score": 355.0152905198777 - }, - "TruthfulQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=3 (3)", - "tab": "General information", - "score": 1.0 - }, - "TruthfulQA - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MS MARCO (TREC)", - "source_data": { - "dataset_name": "MS MARCO (TREC)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "NDCG@10 on MS MARCO (TREC)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.661, - "details": { - "description": "min=0.62, mean=0.661, max=0.706, sum=1.982 (3)", - "tab": "Accuracy", - "MS MARCO (regular) - RR@10 (Robustness)": { - "description": "min=0.333, mean=0.337, max=0.343, sum=1.012 (3)", - "tab": "Robustness", - "score": 0.3372691798941794 - }, - "MS MARCO (TREC) - NDCG@10 (Robustness)": { - "description": "min=0.569, mean=0.607, max=0.639, sum=1.821 (3)", - "tab": "Robustness", - "score": 0.6069545244562901 - }, - "MS MARCO (regular) - RR@10 (Fairness)": { - "description": "min=0.339, mean=0.342, max=0.346, sum=1.027 (3)", - "tab": "Fairness", - "score": 0.34235396825396786 - }, - "MS MARCO (TREC) - NDCG@10 (Fairness)": { - "description": "min=0.578, mean=0.62, max=0.66, sum=1.861 (3)", - "tab": "Fairness", - "score": 0.6202649047028815 - }, - "MS MARCO (regular) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "MS MARCO (TREC) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", 
- "score": null - }, - "MS MARCO (regular) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "MS MARCO (regular) - # train": { - "description": "min=2, mean=2, max=2, sum=6 (3)", - "tab": "General information", - "score": 2.0 - }, - "MS MARCO (regular) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "MS MARCO (regular) - # prompt tokens": { - "description": "min=349.303, mean=385.636, max=423.303, sum=1156.909 (3)", - "tab": "General information", - "score": 385.63633333333337 - }, - "MS MARCO (regular) - # output tokens": { - "description": "min=2, mean=2.001, max=2.003, sum=6.003 (3)", - "tab": "General information", - "score": 2.001 - }, - "MS MARCO (regular) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "MS MARCO (TREC) - # eval": { - "description": "min=43, mean=43, max=43, sum=129 (3)", - "tab": "General information", - "score": 43.0 - }, - "MS MARCO (TREC) - # train": { - "description": "min=2, mean=2, max=2, sum=6 (3)", - "tab": "General information", - "score": 2.0 - }, - "MS MARCO (TREC) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "MS MARCO (TREC) - # prompt tokens": { - "description": "min=337.047, mean=373.38, max=411.047, sum=1120.14 (3)", - "tab": "General information", - "score": 373.3798449612403 - }, - "MS MARCO (TREC) - # output tokens": { - "description": "min=2, mean=2, max=2, sum=6 (3)", - "tab": "General information", - "score": 2.0 - }, - "MS MARCO (TREC) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "MS MARCO (regular) - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - }, - "MS MARCO (TREC) - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CNN/DailyMail", - "source_data": { - "dataset_name": "CNN/DailyMail", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on CNN/DailyMail", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.149, - "details": { - "description": "min=0.142, mean=0.149, max=0.157, sum=0.892 (6)", - "tab": "Accuracy", - "CNN/DailyMail - Denoised inference time (s)": { - "description": "2 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "CNN/DailyMail - # eval": { - "description": "min=466, mean=466, max=466, sum=2796 (6)", - "tab": "General information", - "score": 466.0 - }, - "CNN/DailyMail - # train": { - "description": "min=5, mean=5, max=5, sum=30 (6)", - "tab": "General information", - "score": 5.0 - }, - "CNN/DailyMail - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (6)", - "tab": "General information", - "score": 0.0 - }, - "CNN/DailyMail - # prompt tokens": { - "description": "min=1203.032, mean=1213.032, max=1224.032, sum=7278.193 (6)", - "tab": "General information", - "score": 1213.0321888412018 - }, - "CNN/DailyMail - # output tokens": { - "description": "min=47.208, mean=49.239, max=51.633, sum=295.433 (6)", - "tab": "General information", - "score": 49.238912732474965 - }, - "CNN/DailyMail - # trials": { - "description": "min=3, mean=3, max=3, sum=18 (6)", - "tab": "General information", - "score": 3.0 - }, - "CNN/DailyMail - Stereotypes (race)": { - "description": "min=0.593, mean=0.608, max=0.618, sum=3.649 (6)", - "tab": "Bias", - "score": 0.6082305358040653 - }, - "CNN/DailyMail - Stereotypes (gender)": { - "description": "min=0.396, mean=0.411, max=0.434, sum=2.467 (6)", - "tab": "Bias", - "score": 0.4111171483483329 - }, - "CNN/DailyMail - Representation (race)": { - "description": "min=0.177, mean=0.254, max=0.301, sum=1.526 (6)", - "tab": "Bias", - "score": 0.25438070908615346 - }, - "CNN/DailyMail - Representation (gender)": { - "description": "min=0.064, mean=0.083, max=0.119, sum=0.497 (6)", - "tab": "Bias", - "score": 0.08290586755395449 - }, - "CNN/DailyMail - Toxic fraction": { - "description": "min=0, mean=0.001, max=0.002, sum=0.009 (6)", - "tab": "Toxicity", - "score": 0.001430615164520744 - }, - "CNN/DailyMail - SummaC": { - "description": "min=0.442, mean=0.489, max=0.543, sum=1.468 (3)", - "tab": "Summarization metrics", - "score": 0.48944984939262354 - }, - "CNN/DailyMail - QAFactEval": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - BERTScore (F1)": { - "description": "min=0.299, mean=0.313, max=0.33, sum=0.94 (3)", - "tab": "Summarization metrics", - "score": 0.31320318480412634 - }, - "CNN/DailyMail - Coverage": { - "description": "min=0.952, mean=0.957, max=0.964, sum=5.745 (6)", - "tab": "Summarization metrics", - "score": 0.9574608785885589 - }, - "CNN/DailyMail - Density": { - "description": "min=12.535, mean=15.317, max=20.424, sum=91.904 (6)", - "tab": "Summarization metrics", - "score": 15.31737957113954 - }, - "CNN/DailyMail - Compression": { - "description": "min=11.81, mean=12.304, max=13.072, sum=73.827 (6)", - "tab": "Summarization metrics", - "score": 12.30449736723726 - }, - "CNN/DailyMail - HumanEval-faithfulness": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-relevance": { - 
"description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-coherence": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "XSUM", - "source_data": { - "dataset_name": "XSUM", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on XSUM", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.182, - "details": { - "description": "min=0.177, mean=0.182, max=0.186, sum=1.09 (6)", - "tab": "Accuracy", - "XSUM - Denoised inference time (s)": { - "description": "2 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "XSUM - # eval": { - "description": "min=518, mean=518, max=518, sum=3108 (6)", - "tab": "General information", - "score": 518.0 - }, - "XSUM - # train": { - "description": "min=5, mean=5, max=5, sum=30 (6)", - "tab": "General information", - "score": 5.0 - }, - "XSUM - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (6)", - "tab": "General information", - "score": 0.0 - }, - "XSUM - # prompt tokens": { - "description": "min=1099.388, mean=1133.388, max=1172.388, sum=6800.328 (6)", - "tab": "General information", - "score": 1133.388030888031 - }, - "XSUM - # output tokens": { - "description": "min=21.909, mean=22.142, max=22.392, sum=132.853 (6)", - "tab": "General information", - "score": 22.142213642213644 - }, - "XSUM - # trials": { - "description": "min=3, mean=3, max=3, sum=18 (6)", - "tab": "General information", - "score": 3.0 - }, - "XSUM - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=4 (6)", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "XSUM - Stereotypes (gender)": { - "description": "min=0.451, mean=0.466, max=0.478, sum=2.796 (6)", - "tab": "Bias", - "score": 0.4660306771417882 - }, - "XSUM - Representation (race)": { - "description": "min=0.362, mean=0.399, max=0.429, sum=2.397 (6)", - "tab": "Bias", - "score": 0.39943255885284873 - }, - "XSUM - Representation (gender)": { - "description": "min=0.189, mean=0.205, max=0.224, sum=1.232 (6)", - "tab": "Bias", - "score": 0.20538608377971754 - }, - "XSUM - Toxic fraction": { - "description": "min=0.002, mean=0.003, max=0.004, sum=0.019 (6)", - "tab": "Toxicity", - "score": 0.0032175032175032173 - }, - "XSUM - SummaC": { - "description": "min=-0.325, mean=-0.32, max=-0.314, sum=-0.96 (3)", - "tab": "Summarization metrics", - "score": -0.31997175372142944 - }, - "XSUM - QAFactEval": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - BERTScore (F1)": { - "description": "min=0.484, mean=0.489, max=0.493, sum=1.468 (3)", - "tab": "Summarization metrics", - "score": 0.4894925021585029 - }, - "XSUM - Coverage": { - "description": "min=0.75, mean=0.755, max=0.761, sum=4.53 (6)", - "tab": "Summarization metrics", - "score": 0.7549647155240389 - }, - "XSUM - Density": { - "description": "min=1.852, mean=2.145, max=2.331, sum=12.869 (6)", - "tab": "Summarization metrics", - "score": 2.144865535443147 - }, - "XSUM - Compression": { - "description": "min=16.369, mean=16.589, max=16.81, sum=99.535 
(6)", - "tab": "Summarization metrics", - "score": 16.58922760069323 - }, - "XSUM - HumanEval-faithfulness": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-relevance": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-coherence": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "IMDB", - "source_data": { - "dataset_name": "IMDB", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on IMDB", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.938, - "details": { - "description": "min=0.936, mean=0.938, max=0.943, sum=2.815 (3)", - "tab": "Accuracy", - "IMDB - ECE (10-bin)": { - "description": "min=0.157, mean=0.182, max=0.199, sum=0.546 (3)", - "tab": "Calibration", - "score": 0.18203122522171636 - }, - "IMDB - EM (Robustness)": { - "description": "min=0.878, mean=0.896, max=0.916, sum=2.688 (3)", - "tab": "Robustness", - "score": 0.896 - }, - "IMDB - EM (Fairness)": { - "description": "min=0.928, mean=0.933, max=0.937, sum=2.799 (3)", - "tab": "Fairness", - "score": 0.9329999999999999 - }, - "IMDB - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "IMDB - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "IMDB - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "IMDB - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "IMDB - # prompt tokens": { - "description": "min=853.851, mean=1288.518, max=1745.851, sum=3865.553 (3)", - "tab": "General information", - "score": 1288.5176666666669 - }, - "IMDB - # output tokens": { - "description": "min=2, mean=2, max=2, sum=6 (3)", - "tab": "General information", - "score": 2.0 - }, - "IMDB - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "IMDB - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CivilComments", - "source_data": { - "dataset_name": "CivilComments", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - 
}, - "metric_config": { - "evaluation_description": "EM on CivilComments", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.57, - "details": { - "description": "min=0.011, mean=0.57, max=1, sum=30.805 (54)", - "tab": "Accuracy", - "CivilComments - ECE (10-bin)": { - "description": "min=0.07, mean=0.314, max=0.578, sum=16.962 (54)", - "tab": "Calibration", - "score": 0.31411210820302815 - }, - "CivilComments - EM (Robustness)": { - "description": "min=0.009, mean=0.449, max=0.979, sum=24.224 (54)", - "tab": "Robustness", - "score": 0.4485846578472439 - }, - "CivilComments - EM (Fairness)": { - "description": "min=0.005, mean=0.507, max=0.995, sum=27.37 (54)", - "tab": "Fairness", - "score": 0.5068507198702314 - }, - "CivilComments - Denoised inference time (s)": { - "description": "9 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "CivilComments - # eval": { - "description": "min=74, mean=371.556, max=683, sum=20064 (54)", - "tab": "General information", - "score": 371.55555555555554 - }, - "CivilComments - # train": { - "description": "min=5, mean=5, max=5, sum=270 (54)", - "tab": "General information", - "score": 5.0 - }, - "CivilComments - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (54)", - "tab": "General information", - "score": 0.0 - }, - "CivilComments - # prompt tokens": { - "description": "min=271.927, mean=532.602, max=942.498, sum=28760.487 (54)", - "tab": "General information", - "score": 532.6016121330534 - }, - "CivilComments - # output tokens": { - "description": "min=2, mean=2, max=2, sum=108 (54)", - "tab": "General information", - "score": 2.0 - }, - "CivilComments - # trials": { - "description": "min=3, mean=3, max=3, sum=162 (54)", - "tab": "General information", - "score": 3.0 - }, - "CivilComments - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Toxic fraction": { - "description": "9 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "RAFT", - "source_data": { - "dataset_name": "RAFT", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on RAFT", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.746, - "details": { - "description": "min=0.225, mean=0.746, max=0.975, sum=24.625 (33)", - "tab": "Accuracy", - "RAFT - ECE (10-bin)": { - "description": "min=0.126, mean=0.218, max=0.683, sum=7.184 (33)", - "tab": "Calibration", - "score": 0.2177038585857703 - }, - "RAFT - EM (Robustness)": { - "description": "min=0.225, mean=0.69, max=0.95, sum=22.775 (33)", - "tab": "Robustness", - "score": 0.6901515151515151 - }, - "RAFT - EM (Fairness)": { - "description": "min=0.225, mean=0.711, max=0.975, sum=23.45 (33)", - "tab": "Fairness", - "score": 0.7106060606060605 - }, - "RAFT - Denoised inference time 
(s)": { - "description": "11 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "RAFT - # eval": { - "description": "min=40, mean=40, max=40, sum=1320 (33)", - "tab": "General information", - "score": 40.0 - }, - "RAFT - # train": { - "description": "min=5, mean=5, max=5, sum=165 (33)", - "tab": "General information", - "score": 5.0 - }, - "RAFT - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (33)", - "tab": "General information", - "score": 0.0 - }, - "RAFT - # prompt tokens": { - "description": "min=212.25, mean=944.157, max=4506.05, sum=31157.175 (33)", - "tab": "General information", - "score": 944.1568181818182 - }, - "RAFT - # output tokens": { - "description": "min=2, mean=3.597, max=7.275, sum=118.7 (33)", - "tab": "General information", - "score": 3.5969696969696967 - }, - "RAFT - # trials": { - "description": "min=3, mean=3, max=3, sum=99 (33)", - "tab": "General information", - "score": 3.0 - }, - "RAFT - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Toxic fraction": { - "description": "11 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_classic/ai21/Jurassic-2-Large-7.5B/a01f642e-730b-461d-8afe-9c077ab3f149.json b/data/helm_classic/ai21/Jurassic-2-Large-7.5B/a01f642e-730b-461d-8afe-9c077ab3f149.json deleted file mode 100644 index 4278cef81..000000000 --- a/data/helm_classic/ai21/Jurassic-2-Large-7.5B/a01f642e-730b-461d-8afe-9c077ab3f149.json +++ /dev/null @@ -1,1613 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_classic/ai21_Jurassic-2-Large-7.5B/1770834891.1472661", - "retrieved_timestamp": "1770834891.1472661", - "source_metadata": { - "source_name": "helm_classic", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Jurassic-2 Large 7.5B", - "id": "ai21/Jurassic-2-Large-7.5B", - "developer": "ai21", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_classic", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperform on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.553, - "details": { - "tab": "Accuracy", - "Mean win rate - Calibration": { - "description": null, - "tab": "Calibration", - "score": 0.6435013876040703 - }, - "Mean win rate - Robustness": { - "description": null, - "tab": "Robustness", - "score": 0.5267325431952796 - }, - "Mean win rate - Fairness": { - "description": null, - "tab": "Fairness", - "score": 0.48311004284307957 - }, - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": null - }, - "Mean win rate - General information": { - "description": null, - "tab": "General 
information", - "score": null - }, - "Mean win rate - Bias": { - "description": null, - "tab": "Bias", - "score": 0.4461156665667944 - }, - "Mean win rate - Toxicity": { - "description": null, - "tab": "Toxicity", - "score": 0.4555798368298368 - }, - "Mean win rate - Summarization metrics": { - "description": null, - "tab": "Summarization metrics", - "score": 0.5723684210526315 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.339, - "details": { - "description": "min=0.211, mean=0.339, max=0.5, sum=5.078 (15)", - "tab": "Accuracy", - "MMLU - ECE (10-bin)": { - "description": "min=0.06, mean=0.141, max=0.219, sum=2.11 (15)", - "tab": "Calibration", - "score": 0.1406708954092635 - }, - "MMLU - EM (Robustness)": { - "description": "min=0.17, mean=0.263, max=0.42, sum=3.938 (15)", - "tab": "Robustness", - "score": 0.2625146198830409 - }, - "MMLU - EM (Fairness)": { - "description": "min=0.167, mean=0.297, max=0.45, sum=4.453 (15)", - "tab": "Fairness", - "score": 0.2968421052631579 - }, - "MMLU - Denoised inference time (s)": { - "description": "5 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=1542 (15)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=75 (15)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (15)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=308.59, mean=396.74, max=552.719, sum=5951.098 (15)", - "tab": "General information", - "score": 396.73985964912276 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=15 (15)", - "tab": "General information", - "score": 1.0 - }, - "MMLU - # trials": { - "description": "min=3, mean=3, max=3, sum=45 (15)", - "tab": "General information", - "score": 3.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "BoolQ", - "source_data": { - "dataset_name": "BoolQ", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on BoolQ", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.742, - "details": { - "description": "min=0.737, mean=0.742, max=0.747, sum=2.227 (3)", - "tab": "Accuracy", - "BoolQ - ECE (10-bin)": { - "description": "min=0.126, mean=0.147, max=0.165, sum=0.442 (3)", - "tab": "Calibration", - "score": 0.14720347227904834 - }, - "BoolQ - EM (Robustness)": { - "description": "min=0.602, mean=0.607, max=0.615, sum=1.822 (3)", - "tab": "Robustness", - "score": 0.6073333333333334 - }, - "BoolQ - EM (Fairness)": { - "description": "min=0.675, mean=0.685, max=0.697, sum=2.055 (3)", - "tab": "Fairness", - "score": 0.685 - }, - "BoolQ - Denoised inference time (s)": { - 
"description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "BoolQ - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "BoolQ - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "BoolQ - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "BoolQ - # prompt tokens": { - "description": "min=506.985, mean=694.652, max=952.985, sum=2083.955 (3)", - "tab": "General information", - "score": 694.6516666666666 - }, - "BoolQ - # output tokens": { - "description": "min=2, mean=2, max=2, sum=6 (3)", - "tab": "General information", - "score": 2.0 - }, - "BoolQ - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "BoolQ - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "NarrativeQA - ECE (10-bin)": { - "description": "No matching runs", - "tab": "Calibration", - "score": null - }, - "NarrativeQA - F1 (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "NarrativeQA - F1 (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "NarrativeQA - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "NarrativeQA - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "NarrativeQA - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "NarrativeQA - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "NarrativeQA - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "NarrativeQA - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "NarrativeQA - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "NarrativeQA - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "NarrativeQA - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "NarrativeQA - Representation (race)": { - 
"description": "No matching runs", - "tab": "Bias", - "score": null - }, - "NarrativeQA - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "NarrativeQA - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (open-book)", - "source_data": { - "dataset_name": "NaturalQuestions (open-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (open-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.589, - "details": { - "description": "min=0.576, mean=0.589, max=0.605, sum=1.766 (3)", - "tab": "Accuracy", - "NaturalQuestions (closed-book) - ECE (10-bin)": { - "description": "min=0.008, mean=0.014, max=0.021, sum=0.042 (3)", - "tab": "Calibration", - "score": 0.01399000614897039 - }, - "NaturalQuestions (open-book) - ECE (10-bin)": { - "description": "min=0.081, mean=0.084, max=0.089, sum=0.253 (3)", - "tab": "Calibration", - "score": 0.08428284450081218 - }, - "NaturalQuestions (closed-book) - F1 (Robustness)": { - "description": "min=0.177, mean=0.187, max=0.195, sum=0.562 (3)", - "tab": "Robustness", - "score": 0.18733342573827472 - }, - "NaturalQuestions (open-book) - F1 (Robustness)": { - "description": "min=0.485, mean=0.503, max=0.529, sum=1.51 (3)", - "tab": "Robustness", - "score": 0.5031846716563587 - }, - "NaturalQuestions (closed-book) - F1 (Fairness)": { - "description": "min=0.215, mean=0.217, max=0.221, sum=0.652 (3)", - "tab": "Fairness", - "score": 0.21726190588701 - }, - "NaturalQuestions (open-book) - F1 (Fairness)": { - "description": "min=0.53, mean=0.539, max=0.557, sum=1.616 (3)", - "tab": "Fairness", - "score": 0.5388295929563434 - }, - "NaturalQuestions (closed-book) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NaturalQuestions (open-book) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=94.377, mean=99.377, max=102.377, sum=298.131 (3)", - "tab": "General information", - "score": 99.377 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=5.924, mean=6.729, max=7.956, sum=20.187 (3)", - "tab": "General information", - "score": 6.729 - }, - "NaturalQuestions (closed-book) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 
1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.568, mean=4.666, max=4.734, sum=13.999 (3)", - "tab": "General information", - "score": 4.666333333333333 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.038, mean=0.038, max=0.038, sum=0.114 (3)", - "tab": "General information", - "score": 0.038 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1136.933, mean=1418.457, max=1595.508, sum=4255.37 (3)", - "tab": "General information", - "score": 1418.4566666666667 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=5.825, mean=6.311, max=6.845, sum=18.932 (3)", - "tab": "General information", - "score": 6.310666666666666 - }, - "NaturalQuestions (open-book) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NaturalQuestions (closed-book) - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NaturalQuestions (closed-book) - Stereotypes (gender)": { - "description": "min=0.5, mean=0.5, max=0.5, sum=1.5 (3)", - "tab": "Bias", - "score": 0.5 - }, - "NaturalQuestions (closed-book) - Representation (race)": { - "description": "min=0.502, mean=0.531, max=0.563, sum=1.594 (3)", - "tab": "Bias", - "score": 0.5313654482080615 - }, - "NaturalQuestions (closed-book) - Representation (gender)": { - "description": "min=0, mean=0.079, max=0.192, sum=0.238 (3)", - "tab": "Bias", - "score": 0.07925407925407925 - }, - "NaturalQuestions (open-book) - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=0.667 (1)", - "tab": "Bias", - "score": 0.6666666666666667 - }, - "NaturalQuestions (open-book) - Stereotypes (gender)": { - "description": "min=0.3, mean=0.433, max=0.5, sum=1.3 (3)", - "tab": "Bias", - "score": 0.43333333333333335 - }, - "NaturalQuestions (open-book) - Representation (race)": { - "description": "min=0.441, mean=0.504, max=0.574, sum=1.513 (3)", - "tab": "Bias", - "score": 0.5041929581337629 - }, - "NaturalQuestions (open-book) - Representation (gender)": { - "description": "min=0.186, mean=0.203, max=0.225, sum=0.608 (3)", - "tab": "Bias", - "score": 0.20273109243697482 - }, - "NaturalQuestions (closed-book) - Toxic fraction": { - "description": "min=0.001, mean=0.001, max=0.001, sum=0.003 (3)", - "tab": "Toxicity", - "score": 0.001 - }, - "NaturalQuestions (open-book) - Toxic fraction": { - "description": "min=0.001, mean=0.002, max=0.003, sum=0.006 (3)", - "tab": "Toxicity", - "score": 0.002 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "QuAC", - "source_data": { - "dataset_name": "QuAC", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on QuAC", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "QuAC - ECE (10-bin)": { - "description": "No matching runs", - "tab": "Calibration", - "score": null - }, - "QuAC - F1 (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "QuAC - F1 (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "QuAC - Denoised inference time (s)": { - "description": 
"No matching runs", - "tab": "Efficiency", - "score": null - }, - "QuAC - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "QuAC - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "QuAC - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "QuAC - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "QuAC - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "QuAC - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "QuAC - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "QuAC - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "QuAC - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "QuAC - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "QuAC - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "HellaSwag", - "source_data": { - "dataset_name": "HellaSwag", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on HellaSwag", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.729, - "details": { - "description": "min=0.729, mean=0.729, max=0.729, sum=0.729 (1)", - "tab": "Accuracy", - "HellaSwag - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "HellaSwag - EM (Robustness)": { - "description": "min=0.687, mean=0.687, max=0.687, sum=0.687 (1)", - "tab": "Robustness", - "score": 0.687 - }, - "HellaSwag - EM (Fairness)": { - "description": "min=0.567, mean=0.567, max=0.567, sum=0.567 (1)", - "tab": "Fairness", - "score": 0.567 - }, - "HellaSwag - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "HellaSwag - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "HellaSwag - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag - # prompt tokens": { - "description": "min=62.466, mean=62.466, max=62.466, sum=62.466 (1)", - "tab": "General information", - "score": 62.466 - }, - "HellaSwag - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.53, - "details": { - "description": "min=0.53, mean=0.53, max=0.53, sum=0.53 (1)", - "tab": "Accuracy", - "OpenbookQA - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "OpenbookQA - EM (Robustness)": { - "description": "min=0.448, mean=0.448, max=0.448, sum=0.448 (1)", - "tab": "Robustness", - "score": 0.448 - }, - "OpenbookQA - EM (Fairness)": { - "description": "min=0.45, mean=0.45, max=0.45, sum=0.45 (1)", - "tab": "Fairness", - "score": 0.45 - }, - "OpenbookQA - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=4.348, mean=4.348, max=4.348, sum=4.348 (1)", - "tab": "General information", - "score": 4.348 - }, - "OpenbookQA - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "TruthfulQA", - "source_data": { - "dataset_name": "TruthfulQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on TruthfulQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.245, - "details": { - "description": "min=0.22, mean=0.245, max=0.283, sum=0.734 (3)", - "tab": "Accuracy", - "TruthfulQA - ECE (10-bin)": { - "description": "min=0.043, mean=0.102, max=0.134, sum=0.306 (3)", - "tab": "Calibration", - "score": 0.1021312296645796 - }, - "TruthfulQA - EM (Robustness)": { - "description": "min=0.197, mean=0.21, max=0.228, sum=0.63 (3)", - "tab": "Robustness", - "score": 0.20998980632008157 - }, - "TruthfulQA - EM (Fairness)": { - "description": "min=0.185, mean=0.196, max=0.205, sum=0.589 (3)", - "tab": "Fairness", - "score": 0.1962283384301733 - }, - "TruthfulQA - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "TruthfulQA - # eval": { - "description": "min=654, mean=654, max=654, sum=1962 (3)", - "tab": "General information", - "score": 654.0 - }, - "TruthfulQA - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "TruthfulQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "TruthfulQA - # prompt tokens": { - 
"description": "min=317.682, mean=355.015, max=375.682, sum=1065.046 (3)", - "tab": "General information", - "score": 355.0152905198777 - }, - "TruthfulQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=3 (3)", - "tab": "General information", - "score": 1.0 - }, - "TruthfulQA - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MS MARCO (TREC)", - "source_data": { - "dataset_name": "MS MARCO (TREC)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "NDCG@10 on MS MARCO (TREC)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.464, - "details": { - "description": "min=0.454, mean=0.464, max=0.479, sum=1.393 (3)", - "tab": "Accuracy", - "MS MARCO (regular) - RR@10 (Robustness)": { - "description": "min=0.168, mean=0.177, max=0.186, sum=0.532 (3)", - "tab": "Robustness", - "score": 0.1774849206349205 - }, - "MS MARCO (TREC) - NDCG@10 (Robustness)": { - "description": "min=0.391, mean=0.397, max=0.403, sum=1.192 (3)", - "tab": "Robustness", - "score": 0.39737317282374035 - }, - "MS MARCO (regular) - RR@10 (Fairness)": { - "description": "min=0.21, mean=0.215, max=0.221, sum=0.646 (3)", - "tab": "Fairness", - "score": 0.21544642857142837 - }, - "MS MARCO (TREC) - NDCG@10 (Fairness)": { - "description": "min=0.432, mean=0.44, max=0.457, sum=1.32 (3)", - "tab": "Fairness", - "score": 0.44015360771598083 - }, - "MS MARCO (regular) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "MS MARCO (TREC) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "MS MARCO (regular) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "MS MARCO (regular) - # train": { - "description": "min=2, mean=2, max=2, sum=6 (3)", - "tab": "General information", - "score": 2.0 - }, - "MS MARCO (regular) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "MS MARCO (regular) - # prompt tokens": { - "description": "min=349.303, mean=385.636, max=423.303, sum=1156.909 (3)", - "tab": "General information", - "score": 385.63633333333337 - }, - "MS MARCO (regular) - # output tokens": { - "description": "min=2.006, mean=2.012, max=2.022, sum=6.037 (3)", - "tab": "General information", - "score": 2.012333333333333 - }, - "MS MARCO (regular) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "MS MARCO (TREC) - # eval": { - "description": "min=43, mean=43, max=43, sum=129 (3)", - "tab": "General information", - "score": 43.0 - }, - "MS MARCO (TREC) - # train": { - "description": "min=2, mean=2, max=2, sum=6 (3)", - "tab": "General information", - "score": 2.0 - }, - "MS MARCO (TREC) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "MS MARCO (TREC) - # prompt tokens": { - "description": "min=337.047, mean=373.38, max=411.047, sum=1120.14 (3)", - "tab": 
"General information", - "score": 373.3798449612403 - }, - "MS MARCO (TREC) - # output tokens": { - "description": "min=2.023, mean=2.023, max=2.023, sum=6.07 (3)", - "tab": "General information", - "score": 2.0232558139534884 - }, - "MS MARCO (TREC) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "MS MARCO (regular) - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - }, - "MS MARCO (TREC) - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CNN/DailyMail", - "source_data": { - "dataset_name": "CNN/DailyMail", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on CNN/DailyMail", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.136, - "details": { - "description": "min=0.122, mean=0.136, max=0.15, sum=0.813 (6)", - "tab": "Accuracy", - "CNN/DailyMail - Denoised inference time (s)": { - "description": "2 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "CNN/DailyMail - # eval": { - "description": "min=466, mean=466, max=466, sum=2796 (6)", - "tab": "General information", - "score": 466.0 - }, - "CNN/DailyMail - # train": { - "description": "min=5, mean=5, max=5, sum=30 (6)", - "tab": "General information", - "score": 5.0 - }, - "CNN/DailyMail - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (6)", - "tab": "General information", - "score": 0.0 - }, - "CNN/DailyMail - # prompt tokens": { - "description": "min=1203.032, mean=1213.032, max=1224.032, sum=7278.193 (6)", - "tab": "General information", - "score": 1213.0321888412018 - }, - "CNN/DailyMail - # output tokens": { - "description": "min=52.573, mean=58.246, max=61.575, sum=349.476 (6)", - "tab": "General information", - "score": 58.24606580829757 - }, - "CNN/DailyMail - # trials": { - "description": "min=3, mean=3, max=3, sum=18 (6)", - "tab": "General information", - "score": 3.0 - }, - 
"CNN/DailyMail - Stereotypes (race)": { - "description": "min=0.612, mean=0.647, max=0.667, sum=3.885 (6)", - "tab": "Bias", - "score": 0.6474734228728262 - }, - "CNN/DailyMail - Stereotypes (gender)": { - "description": "min=0.365, mean=0.405, max=0.442, sum=2.432 (6)", - "tab": "Bias", - "score": 0.405313769914252 - }, - "CNN/DailyMail - Representation (race)": { - "description": "min=0.175, mean=0.245, max=0.377, sum=1.468 (6)", - "tab": "Bias", - "score": 0.24474724360307878 - }, - "CNN/DailyMail - Representation (gender)": { - "description": "min=0.103, mean=0.133, max=0.149, sum=0.796 (6)", - "tab": "Bias", - "score": 0.13266873135824753 - }, - "CNN/DailyMail - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (6)", - "tab": "Toxicity", - "score": 0.0 - }, - "CNN/DailyMail - SummaC": { - "description": "min=0.465, mean=0.496, max=0.548, sum=1.488 (3)", - "tab": "Summarization metrics", - "score": 0.49606841741715785 - }, - "CNN/DailyMail - QAFactEval": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - BERTScore (F1)": { - "description": "min=0.242, mean=0.271, max=0.304, sum=0.812 (3)", - "tab": "Summarization metrics", - "score": 0.27057214623114106 - }, - "CNN/DailyMail - Coverage": { - "description": "min=0.952, mean=0.963, max=0.98, sum=5.779 (6)", - "tab": "Summarization metrics", - "score": 0.9630886941006946 - }, - "CNN/DailyMail - Density": { - "description": "min=15.279, mean=25.251, max=36.976, sum=151.506 (6)", - "tab": "Summarization metrics", - "score": 25.250963083991945 - }, - "CNN/DailyMail - Compression": { - "description": "min=9.923, mean=11.503, max=13.28, sum=69.019 (6)", - "tab": "Summarization metrics", - "score": 11.503115138085485 - }, - "CNN/DailyMail - HumanEval-faithfulness": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-relevance": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-coherence": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "XSUM", - "source_data": { - "dataset_name": "XSUM", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on XSUM", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.142, - "details": { - "description": "min=0.14, mean=0.142, max=0.145, sum=0.853 (6)", - "tab": "Accuracy", - "XSUM - Denoised inference time (s)": { - "description": "2 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "XSUM - # eval": { - "description": "min=518, mean=518, max=518, sum=3108 (6)", - "tab": "General information", - "score": 518.0 - }, - "XSUM - # train": { - "description": "min=5, mean=5, max=5, sum=30 (6)", - "tab": "General information", - "score": 5.0 - }, - "XSUM - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (6)", - "tab": "General information", - "score": 0.0 - }, - "XSUM - # prompt tokens": { - "description": "min=1099.388, mean=1133.388, max=1172.388, sum=6800.328 (6)", - "tab": 
"General information", - "score": 1133.388030888031 - }, - "XSUM - # output tokens": { - "description": "min=21.112, mean=21.228, max=21.315, sum=127.371 (6)", - "tab": "General information", - "score": 21.22844272844273 - }, - "XSUM - # trials": { - "description": "min=3, mean=3, max=3, sum=18 (6)", - "tab": "General information", - "score": 3.0 - }, - "XSUM - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=4 (6)", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "XSUM - Stereotypes (gender)": { - "description": "min=0.433, mean=0.464, max=0.492, sum=2.785 (6)", - "tab": "Bias", - "score": 0.46417690732206857 - }, - "XSUM - Representation (race)": { - "description": "min=0.407, mean=0.58, max=0.667, sum=3.481 (6)", - "tab": "Bias", - "score": 0.5802469135802469 - }, - "XSUM - Representation (gender)": { - "description": "min=0.209, mean=0.22, max=0.234, sum=1.321 (6)", - "tab": "Bias", - "score": 0.2200902099970423 - }, - "XSUM - Toxic fraction": { - "description": "min=0.002, mean=0.002, max=0.002, sum=0.012 (6)", - "tab": "Toxicity", - "score": 0.0019305019305019308 - }, - "XSUM - SummaC": { - "description": "min=-0.306, mean=-0.278, max=-0.26, sum=-0.833 (3)", - "tab": "Summarization metrics", - "score": -0.27758991887056994 - }, - "XSUM - QAFactEval": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - BERTScore (F1)": { - "description": "min=0.449, mean=0.45, max=0.451, sum=1.35 (3)", - "tab": "Summarization metrics", - "score": 0.44989833153156206 - }, - "XSUM - Coverage": { - "description": "min=0.781, mean=0.782, max=0.783, sum=4.694 (6)", - "tab": "Summarization metrics", - "score": 0.7823704015893701 - }, - "XSUM - Density": { - "description": "min=2.345, mean=2.659, max=2.826, sum=15.954 (6)", - "tab": "Summarization metrics", - "score": 2.6589249165198687 - }, - "XSUM - Compression": { - "description": "min=17.896, mean=18.03, max=18.26, sum=108.178 (6)", - "tab": "Summarization metrics", - "score": 18.02961749079778 - }, - "XSUM - HumanEval-faithfulness": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-relevance": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-coherence": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "IMDB", - "source_data": { - "dataset_name": "IMDB", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on IMDB", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.956, - "details": { - "description": "min=0.952, mean=0.956, max=0.96, sum=2.869 (3)", - "tab": "Accuracy", - "IMDB - ECE (10-bin)": { - "description": "min=0.153, mean=0.178, max=0.201, sum=0.534 (3)", - "tab": "Calibration", - "score": 0.17816129477822015 - }, - "IMDB - EM (Robustness)": { - "description": "min=0.935, mean=0.941, max=0.946, sum=2.822 (3)", - "tab": "Robustness", - "score": 0.9406666666666667 - }, - "IMDB - EM (Fairness)": { - "description": "min=0.941, mean=0.945, max=0.951, sum=2.835 (3)", - 
"tab": "Fairness", - "score": 0.945 - }, - "IMDB - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "IMDB - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "IMDB - # train": { - "description": "min=4.915, mean=4.972, max=5, sum=14.915 (3)", - "tab": "General information", - "score": 4.971666666666667 - }, - "IMDB - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "IMDB - # prompt tokens": { - "description": "min=853.851, mean=1281.577, max=1725.03, sum=3844.732 (3)", - "tab": "General information", - "score": 1281.5773333333334 - }, - "IMDB - # output tokens": { - "description": "min=2, mean=2, max=2, sum=6 (3)", - "tab": "General information", - "score": 2.0 - }, - "IMDB - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "IMDB - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CivilComments", - "source_data": { - "dataset_name": "CivilComments", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on CivilComments", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.57, - "details": { - "description": "min=0.149, mean=0.57, max=0.909, sum=30.8 (54)", - "tab": "Accuracy", - "CivilComments - ECE (10-bin)": { - "description": "min=0.033, mean=0.19, max=0.41, sum=10.274 (54)", - "tab": "Calibration", - "score": 0.19026595574841215 - }, - "CivilComments - EM (Robustness)": { - "description": "min=0.116, mean=0.469, max=0.844, sum=25.305 (54)", - "tab": "Robustness", - "score": 0.4686089323926605 - }, - "CivilComments - EM (Fairness)": { - "description": "min=0.143, mean=0.403, max=0.834, sum=21.752 (54)", - "tab": "Fairness", - "score": 0.4028192827891808 - }, - "CivilComments - Denoised inference time (s)": { - "description": "9 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "CivilComments - # eval": { - "description": "min=74, mean=371.556, max=683, sum=20064 (54)", - "tab": "General information", - "score": 371.55555555555554 - }, - "CivilComments - # train": { - "description": "min=5, mean=5, max=5, sum=270 (54)", - "tab": "General information", - "score": 5.0 - }, - "CivilComments - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (54)", - "tab": "General information", - "score": 0.0 - }, - "CivilComments - # prompt tokens": { - "description": "min=271.927, mean=532.602, max=942.498, sum=28760.487 
(54)", - "tab": "General information", - "score": 532.6016121330534 - }, - "CivilComments - # output tokens": { - "description": "min=2, mean=2, max=2, sum=108 (54)", - "tab": "General information", - "score": 2.0 - }, - "CivilComments - # trials": { - "description": "min=3, mean=3, max=3, sum=162 (54)", - "tab": "General information", - "score": 3.0 - }, - "CivilComments - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Toxic fraction": { - "description": "9 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "RAFT", - "source_data": { - "dataset_name": "RAFT", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on RAFT", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.622, - "details": { - "description": "min=0.25, mean=0.622, max=0.975, sum=20.525 (33)", - "tab": "Accuracy", - "RAFT - ECE (10-bin)": { - "description": "min=0.128, mean=0.254, max=0.441, sum=8.368 (33)", - "tab": "Calibration", - "score": 0.25356461082010057 - }, - "RAFT - EM (Robustness)": { - "description": "min=0, mean=0.498, max=0.975, sum=16.425 (33)", - "tab": "Robustness", - "score": 0.49772727272727263 - }, - "RAFT - EM (Fairness)": { - "description": "min=0.2, mean=0.567, max=0.975, sum=18.725 (33)", - "tab": "Fairness", - "score": 0.5674242424242424 - }, - "RAFT - Denoised inference time (s)": { - "description": "11 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "RAFT - # eval": { - "description": "min=40, mean=40, max=40, sum=1320 (33)", - "tab": "General information", - "score": 40.0 - }, - "RAFT - # train": { - "description": "min=0.95, mean=4.658, max=5, sum=153.7 (33)", - "tab": "General information", - "score": 4.657575757575757 - }, - "RAFT - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (33)", - "tab": "General information", - "score": 0.0 - }, - "RAFT - # prompt tokens": { - "description": "min=212.25, mean=712.248, max=1745.25, sum=23504.175 (33)", - "tab": "General information", - "score": 712.2477272727273 - }, - "RAFT - # output tokens": { - "description": "min=1.975, mean=3.562, max=6.575, sum=117.55 (33)", - "tab": "General information", - "score": 3.5621212121212116 - }, - "RAFT - # trials": { - "description": "min=3, mean=3, max=3, sum=99 (33)", - "tab": "General information", - "score": 3.0 - }, - "RAFT - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Toxic fraction": { - "description": "11 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } 
- } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_classic/aleph-alpha/Luminous-Base-13B/813802a3-483e-443d-9e49-7cd581b5ea6d.json b/data/helm_classic/aleph-alpha/Luminous-Base-13B/813802a3-483e-443d-9e49-7cd581b5ea6d.json deleted file mode 100644 index 7e02805f7..000000000 --- a/data/helm_classic/aleph-alpha/Luminous-Base-13B/813802a3-483e-443d-9e49-7cd581b5ea6d.json +++ /dev/null @@ -1,1613 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_classic/aleph-alpha_Luminous-Base-13B/1770834891.1472661", - "retrieved_timestamp": "1770834891.1472661", - "source_metadata": { - "source_name": "helm_classic", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Luminous Base 13B", - "id": "aleph-alpha/Luminous-Base-13B", - "developer": "aleph-alpha", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_classic", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperform on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.315, - "details": { - "tab": "Accuracy", - "Mean win rate - Calibration": { - "description": null, - "tab": "Calibration", - "score": 0.6405642923219241 - }, - "Mean win rate - Robustness": { - "description": null, - "tab": "Robustness", - "score": 0.31855477855477854 - }, - "Mean win rate - Fairness": { - "description": null, - "tab": "Fairness", - "score": 0.23762237762237765 - }, - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": null - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - }, - "Mean win rate - Bias": { - "description": null, - "tab": "Bias", - "score": 0.5516493320513314 - }, - "Mean win rate - Toxicity": { - "description": null, - "tab": "Toxicity", - "score": 0.5035063701730368 - }, - "Mean win rate - Summarization metrics": { - "description": null, - "tab": "Summarization metrics", - "score": 0.42105263157894735 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.27, - "details": { - "description": "min=0.193, mean=0.27, max=0.32, sum=4.045 (15)", - "tab": "Accuracy", - "MMLU - ECE (10-bin)": { - "description": "min=0.087, mean=0.111, max=0.157, sum=1.661 (15)", - "tab": "Calibration", - "score": 0.110752611571227 - }, - "MMLU - EM (Robustness)": { - "description": "min=0.1, mean=0.183, max=0.27, sum=2.74 (15)", - "tab": "Robustness", - "score": 0.1826549707602339 - }, - "MMLU - EM (Fairness)": { - "description": "min=0.09, mean=0.185, max=0.27, sum=2.769 (15)", - "tab": "Fairness", - "score": 0.1845730994152047 - }, - "MMLU - Denoised 
inference time (s)": { - "description": "5 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=1542 (15)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=75 (15)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (15)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=360.75, mean=471.075, max=618.447, sum=7066.132 (15)", - "tab": "General information", - "score": 471.0754736842105 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=15 (15)", - "tab": "General information", - "score": 1.0 - }, - "MMLU - # trials": { - "description": "min=3, mean=3, max=3, sum=45 (15)", - "tab": "General information", - "score": 3.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "BoolQ", - "source_data": { - "dataset_name": "BoolQ", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on BoolQ", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.719, - "details": { - "description": "min=0.7, mean=0.719, max=0.74, sum=2.156 (3)", - "tab": "Accuracy", - "BoolQ - ECE (10-bin)": { - "description": "min=0.056, mean=0.066, max=0.084, sum=0.197 (3)", - "tab": "Calibration", - "score": 0.06557915095556173 - }, - "BoolQ - EM (Robustness)": { - "description": "min=0.643, mean=0.655, max=0.673, sum=1.965 (3)", - "tab": "Robustness", - "score": 0.655 - }, - "BoolQ - EM (Fairness)": { - "description": "min=0.634, mean=0.653, max=0.682, sum=1.958 (3)", - "tab": "Fairness", - "score": 0.6526666666666667 - }, - "BoolQ - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "BoolQ - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "BoolQ - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "BoolQ - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "BoolQ - # prompt tokens": { - "description": "min=651.658, mean=908.991, max=1252.658, sum=2726.974 (3)", - "tab": "General information", - "score": 908.9913333333333 - }, - "BoolQ - # output tokens": { - "description": "min=1, mean=1.002, max=1.003, sum=3.006 (3)", - "tab": "General information", - "score": 1.002 - }, - "BoolQ - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "BoolQ - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - 
"generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.605, - "details": { - "description": "min=0.577, mean=0.605, max=0.633, sum=1.815 (3)", - "tab": "Accuracy", - "NarrativeQA - ECE (10-bin)": { - "description": "min=0.04, mean=0.048, max=0.063, sum=0.145 (3)", - "tab": "Calibration", - "score": 0.04822831549746422 - }, - "NarrativeQA - F1 (Robustness)": { - "description": "min=0.444, mean=0.476, max=0.505, sum=1.429 (3)", - "tab": "Robustness", - "score": 0.4761726989393548 - }, - "NarrativeQA - F1 (Fairness)": { - "description": "min=0.462, mean=0.498, max=0.532, sum=1.495 (3)", - "tab": "Fairness", - "score": 0.4982467496641079 - }, - "NarrativeQA - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=1065 (3)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=1.039, mean=1.621, max=2.037, sum=4.862 (3)", - "tab": "General information", - "score": 1.6206572769953052 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=1606.952, mean=1647.783, max=1694.642, sum=4943.349 (3)", - "tab": "General information", - "score": 1647.783098591549 - }, - "NarrativeQA - # output tokens": { - "description": "min=5.521, mean=6.798, max=8.192, sum=20.394 (3)", - "tab": "General information", - "score": 6.798122065727699 - }, - "NarrativeQA - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NarrativeQA - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NarrativeQA - Stereotypes (gender)": { - "description": "min=0.396, mean=0.438, max=0.5, sum=1.313 (3)", - "tab": "Bias", - "score": 0.4375901875901876 - }, - "NarrativeQA - Representation (race)": { - "description": "min=0.333, mean=0.556, max=0.667, sum=1.667 (3)", - "tab": "Bias", - "score": 0.5555555555555557 - }, - "NarrativeQA - Representation (gender)": { - "description": "min=0.152, mean=0.172, max=0.197, sum=0.516 (3)", - "tab": "Bias", - "score": 0.1718450326045263 - }, - "NarrativeQA - Toxic fraction": { - "description": "min=0.02, mean=0.022, max=0.025, sum=0.065 (3)", - "tab": "Toxicity", - "score": 0.0215962441314554 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (open-book)", - "source_data": { - "dataset_name": "NaturalQuestions (open-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (open-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.568, - "details": { - "description": 
"min=0.563, mean=0.568, max=0.577, sum=1.705 (3)", - "tab": "Accuracy", - "NaturalQuestions (closed-book) - ECE (10-bin)": { - "description": "min=0.039, mean=0.045, max=0.054, sum=0.136 (3)", - "tab": "Calibration", - "score": 0.04534548194935659 - }, - "NaturalQuestions (open-book) - ECE (10-bin)": { - "description": "min=0.068, mean=0.07, max=0.074, sum=0.21 (3)", - "tab": "Calibration", - "score": 0.07013609628734997 - }, - "NaturalQuestions (closed-book) - F1 (Robustness)": { - "description": "min=0.157, mean=0.163, max=0.168, sum=0.489 (3)", - "tab": "Robustness", - "score": 0.1628593597054443 - }, - "NaturalQuestions (open-book) - F1 (Robustness)": { - "description": "min=0.484, mean=0.491, max=0.498, sum=1.474 (3)", - "tab": "Robustness", - "score": 0.4912891920785376 - }, - "NaturalQuestions (closed-book) - F1 (Fairness)": { - "description": "min=0.156, mean=0.16, max=0.164, sum=0.481 (3)", - "tab": "Fairness", - "score": 0.16022586408623682 - }, - "NaturalQuestions (open-book) - F1 (Fairness)": { - "description": "min=0.505, mean=0.511, max=0.515, sum=1.534 (3)", - "tab": "Fairness", - "score": 0.5114691771549933 - }, - "NaturalQuestions (closed-book) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NaturalQuestions (open-book) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=109.087, mean=111.754, max=116.087, sum=335.261 (3)", - "tab": "General information", - "score": 111.75366666666667 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=4.314, mean=5.287, max=5.908, sum=15.861 (3)", - "tab": "General information", - "score": 5.287 - }, - "NaturalQuestions (closed-book) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.691, mean=4.711, max=4.726, sum=14.134 (3)", - "tab": "General information", - "score": 4.711333333333333 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.038, mean=0.039, max=0.04, sum=0.116 (3)", - "tab": "General information", - "score": 0.03866666666666666 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1224.733, mean=1384.565, max=1488.14, sum=4153.695 (3)", - "tab": "General information", - "score": 1384.5649999999998 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=7.685, mean=10.15, max=11.898, sum=30.449 (3)", - "tab": "General information", - "score": 10.149666666666667 - }, - "NaturalQuestions (open-book) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 
3.0 - }, - "NaturalQuestions (closed-book) - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NaturalQuestions (closed-book) - Stereotypes (gender)": { - "description": "min=0.25, mean=0.417, max=0.5, sum=1.25 (3)", - "tab": "Bias", - "score": 0.4166666666666667 - }, - "NaturalQuestions (closed-book) - Representation (race)": { - "description": "min=0.339, mean=0.433, max=0.5, sum=1.298 (3)", - "tab": "Bias", - "score": 0.43278417840114286 - }, - "NaturalQuestions (closed-book) - Representation (gender)": { - "description": "min=0.081, mean=0.162, max=0.239, sum=0.486 (3)", - "tab": "Bias", - "score": 0.16214742091319934 - }, - "NaturalQuestions (open-book) - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=0.667 (1)", - "tab": "Bias", - "score": 0.6666666666666667 - }, - "NaturalQuestions (open-book) - Stereotypes (gender)": { - "description": "min=0.3, mean=0.432, max=0.5, sum=1.296 (3)", - "tab": "Bias", - "score": 0.432010582010582 - }, - "NaturalQuestions (open-book) - Representation (race)": { - "description": "min=0.429, mean=0.457, max=0.498, sum=1.37 (3)", - "tab": "Bias", - "score": 0.45656911106888937 - }, - "NaturalQuestions (open-book) - Representation (gender)": { - "description": "min=0.272, mean=0.32, max=0.416, sum=0.961 (3)", - "tab": "Bias", - "score": 0.3202891068062547 - }, - "NaturalQuestions (closed-book) - Toxic fraction": { - "description": "min=0.002, mean=0.002, max=0.002, sum=0.006 (3)", - "tab": "Toxicity", - "score": 0.002 - }, - "NaturalQuestions (open-book) - Toxic fraction": { - "description": "min=0, mean=0.001, max=0.003, sum=0.004 (3)", - "tab": "Toxicity", - "score": 0.0013333333333333333 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "QuAC", - "source_data": { - "dataset_name": "QuAC", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on QuAC", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.334, - "details": { - "description": "min=0.317, mean=0.334, max=0.362, sum=1.003 (3)", - "tab": "Accuracy", - "QuAC - ECE (10-bin)": { - "description": "min=0.068, mean=0.098, max=0.131, sum=0.295 (3)", - "tab": "Calibration", - "score": 0.09821008405024316 - }, - "QuAC - F1 (Robustness)": { - "description": "min=0.166, mean=0.185, max=0.212, sum=0.556 (3)", - "tab": "Robustness", - "score": 0.18543862521458307 - }, - "QuAC - F1 (Fairness)": { - "description": "min=0.251, mean=0.266, max=0.284, sum=0.799 (3)", - "tab": "Fairness", - "score": 0.2662906470176498 - }, - "QuAC - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "QuAC - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "QuAC - # train": { - "description": "min=0.84, mean=0.909, max=0.991, sum=2.727 (3)", - "tab": "General information", - "score": 0.9089999999999999 - }, - "QuAC - truncated": { - "description": "min=0.029, mean=0.033, max=0.037, sum=0.098 (3)", - "tab": "General information", - "score": 0.03266666666666667 - }, - "QuAC - # prompt tokens": { - "description": "min=1596.904, mean=1641.256, max=1672.92, sum=4923.768 (3)", - "tab": "General information", - 
"score": 1641.256 - }, - "QuAC - # output tokens": { - "description": "min=18.527, mean=23.472, max=28.795, sum=70.415 (3)", - "tab": "General information", - "score": 23.471666666666668 - }, - "QuAC - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "QuAC - Stereotypes (race)": { - "description": "min=0.641, mean=0.658, max=0.667, sum=1.974 (3)", - "tab": "Bias", - "score": 0.6581196581196581 - }, - "QuAC - Stereotypes (gender)": { - "description": "min=0.401, mean=0.417, max=0.432, sum=1.251 (3)", - "tab": "Bias", - "score": 0.41695983406755 - }, - "QuAC - Representation (race)": { - "description": "min=0.258, mean=0.32, max=0.377, sum=0.96 (3)", - "tab": "Bias", - "score": 0.3200297021845843 - }, - "QuAC - Representation (gender)": { - "description": "min=0.193, mean=0.203, max=0.212, sum=0.61 (3)", - "tab": "Bias", - "score": 0.20338227449992274 - }, - "QuAC - Toxic fraction": { - "description": "min=0.001, mean=0.002, max=0.003, sum=0.006 (3)", - "tab": "Toxicity", - "score": 0.002 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "HellaSwag", - "source_data": { - "dataset_name": "HellaSwag", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on HellaSwag", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "HellaSwag - ECE (10-bin)": { - "description": "No matching runs", - "tab": "Calibration", - "score": null - }, - "HellaSwag - EM (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "HellaSwag - EM (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "HellaSwag - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "HellaSwag - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "OpenbookQA - ECE (10-bin)": { - "description": "No matching runs", - "tab": "Calibration", - 
"score": null - }, - "OpenbookQA - EM (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "OpenbookQA - EM (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "OpenbookQA - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "OpenbookQA - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "TruthfulQA", - "source_data": { - "dataset_name": "TruthfulQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on TruthfulQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.182, - "details": { - "description": "min=0.165, mean=0.182, max=0.194, sum=0.547 (3)", - "tab": "Accuracy", - "TruthfulQA - ECE (10-bin)": { - "description": "min=0.069, mean=0.081, max=0.095, sum=0.244 (3)", - "tab": "Calibration", - "score": 0.08144933240589737 - }, - "TruthfulQA - EM (Robustness)": { - "description": "min=0.107, mean=0.112, max=0.118, sum=0.335 (3)", - "tab": "Robustness", - "score": 0.11162079510703364 - }, - "TruthfulQA - EM (Fairness)": { - "description": "min=0.118, mean=0.125, max=0.13, sum=0.375 (3)", - "tab": "Fairness", - "score": 0.12487257900101938 - }, - "TruthfulQA - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "TruthfulQA - # eval": { - "description": "min=654, mean=654, max=654, sum=1962 (3)", - "tab": "General information", - "score": 654.0 - }, - "TruthfulQA - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "TruthfulQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "TruthfulQA - # prompt tokens": { - "description": "min=504.073, mean=514.073, max=533.073, sum=1542.22 (3)", - "tab": "General information", - "score": 514.0733944954128 - }, - "TruthfulQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=3 (3)", - "tab": "General information", - "score": 1.0 - }, - "TruthfulQA - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MS MARCO (TREC)", - "source_data": { - "dataset_name": "MS MARCO (TREC)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - 
"metric_config": { - "evaluation_description": "NDCG@10 on MS MARCO (TREC)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "MS MARCO (regular) - RR@10 (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "MS MARCO (TREC) - NDCG@10 (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "MS MARCO (regular) - RR@10 (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "MS MARCO (TREC) - NDCG@10 (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "MS MARCO (regular) - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "MS MARCO (TREC) - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "MS MARCO (regular) - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (gender)": { - "description": "No matching runs", - "tab": 
"Bias", - "score": null - }, - "MS MARCO (regular) - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - }, - "MS MARCO (TREC) - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CNN/DailyMail", - "source_data": { - "dataset_name": "CNN/DailyMail", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on CNN/DailyMail", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.11, - "details": { - "description": "min=0.048, mean=0.11, max=0.147, sum=0.661 (6)", - "tab": "Accuracy", - "CNN/DailyMail - Denoised inference time (s)": { - "description": "2 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "CNN/DailyMail - # eval": { - "description": "min=466, mean=466, max=466, sum=2796 (6)", - "tab": "General information", - "score": 466.0 - }, - "CNN/DailyMail - # train": { - "description": "min=5, mean=5, max=5, sum=30 (6)", - "tab": "General information", - "score": 5.0 - }, - "CNN/DailyMail - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (6)", - "tab": "General information", - "score": 0.0 - }, - "CNN/DailyMail - # prompt tokens": { - "description": "min=1564.648, mean=1578.648, max=1593.648, sum=9471.888 (6)", - "tab": "General information", - "score": 1578.648068669528 - }, - "CNN/DailyMail - # output tokens": { - "description": "min=59.824, mean=80.866, max=92.721, sum=485.197 (6)", - "tab": "General information", - "score": 80.86623748211731 - }, - "CNN/DailyMail - # trials": { - "description": "min=3, mean=3, max=3, sum=18 (6)", - "tab": "General information", - "score": 3.0 - }, - "CNN/DailyMail - Stereotypes (race)": { - "description": "min=0.607, mean=0.629, max=0.667, sum=3.775 (6)", - "tab": "Bias", - "score": 0.629159058053613 - }, - "CNN/DailyMail - Stereotypes (gender)": { - "description": "min=0.388, mean=0.408, max=0.443, sum=2.45 (6)", - "tab": "Bias", - "score": 0.40834546858679427 - }, - "CNN/DailyMail - Representation (race)": { - "description": "min=0.211, mean=0.287, max=0.333, sum=1.725 (6)", - "tab": "Bias", - "score": 0.2874529064836184 - }, - "CNN/DailyMail - Representation (gender)": { - "description": "min=0.138, mean=0.164, max=0.192, sum=0.984 (6)", - "tab": "Bias", - "score": 0.16396073067980207 - }, - "CNN/DailyMail - Toxic fraction": { - "description": "min=0, mean=0.001, max=0.002, sum=0.004 (6)", - "tab": "Toxicity", - "score": 0.000715307582260372 - }, - "CNN/DailyMail - SummaC": { - "description": "min=-0.076, mean=0.32, max=0.527, sum=0.959 (3)", - "tab": "Summarization metrics", - "score": 0.3197354449182434 - }, - "CNN/DailyMail - QAFactEval": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - BERTScore (F1)": { - "description": "min=0.045, mean=0.188, max=0.278, sum=0.563 (3)", - "tab": "Summarization metrics", - "score": 0.18776450739321585 - }, - "CNN/DailyMail - Coverage": { - "description": "min=0.543, mean=0.834, max=0.982, sum=5.004 (6)", - "tab": "Summarization metrics", - "score": 0.8340516341645151 - }, - "CNN/DailyMail - Density": { - "description": "min=15.163, mean=35.663, max=51.192, 
sum=213.977 (6)", - "tab": "Summarization metrics", - "score": 35.66281771790173 - }, - "CNN/DailyMail - Compression": { - "description": "min=8.191, mean=9.346, max=11.345, sum=56.078 (6)", - "tab": "Summarization metrics", - "score": 9.346357628862261 - }, - "CNN/DailyMail - HumanEval-faithfulness": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-relevance": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-coherence": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "XSUM", - "source_data": { - "dataset_name": "XSUM", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on XSUM", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.105, - "details": { - "description": "min=0.101, mean=0.105, max=0.107, sum=0.628 (6)", - "tab": "Accuracy", - "XSUM - Denoised inference time (s)": { - "description": "2 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "XSUM - # eval": { - "description": "min=518, mean=518, max=518, sum=3108 (6)", - "tab": "General information", - "score": 518.0 - }, - "XSUM - # train": { - "description": "min=4.998, mean=4.999, max=5, sum=29.992 (6)", - "tab": "General information", - "score": 4.998712998712999 - }, - "XSUM - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (6)", - "tab": "General information", - "score": 0.0 - }, - "XSUM - # prompt tokens": { - "description": "min=1472.903, mean=1532.912, max=1566.407, sum=9197.471 (6)", - "tab": "General information", - "score": 1532.9118404118406 - }, - "XSUM - # output tokens": { - "description": "min=25.481, mean=26.021, max=26.315, sum=156.127 (6)", - "tab": "General information", - "score": 26.02123552123552 - }, - "XSUM - # trials": { - "description": "min=3, mean=3, max=3, sum=18 (6)", - "tab": "General information", - "score": 3.0 - }, - "XSUM - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=4 (6)", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "XSUM - Stereotypes (gender)": { - "description": "min=0.429, mean=0.442, max=0.453, sum=2.655 (6)", - "tab": "Bias", - "score": 0.4424845269672855 - }, - "XSUM - Representation (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=4 (6)", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "XSUM - Representation (gender)": { - "description": "min=0.153, mean=0.165, max=0.183, sum=0.99 (6)", - "tab": "Bias", - "score": 0.16492426719539477 - }, - "XSUM - Toxic fraction": { - "description": "min=0.002, mean=0.002, max=0.002, sum=0.012 (6)", - "tab": "Toxicity", - "score": 0.0019305019305019308 - }, - "XSUM - SummaC": { - "description": "min=-0.217, mean=-0.213, max=-0.206, sum=-0.639 (3)", - "tab": "Summarization metrics", - "score": -0.2129847266550281 - }, - "XSUM - QAFactEval": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - BERTScore (F1)": { - "description": "min=0.391, mean=0.394, max=0.396, 
sum=1.183 (3)", - "tab": "Summarization metrics", - "score": 0.3944890669761573 - }, - "XSUM - Coverage": { - "description": "min=0.828, mean=0.834, max=0.838, sum=5.002 (6)", - "tab": "Summarization metrics", - "score": 0.8336902125268334 - }, - "XSUM - Density": { - "description": "min=4.128, mean=4.393, max=4.529, sum=26.358 (6)", - "tab": "Summarization metrics", - "score": 4.392991783737345 - }, - "XSUM - Compression": { - "description": "min=17.248, mean=17.535, max=17.956, sum=105.21 (6)", - "tab": "Summarization metrics", - "score": 17.535051923934834 - }, - "XSUM - HumanEval-faithfulness": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-relevance": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-coherence": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "IMDB", - "source_data": { - "dataset_name": "IMDB", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on IMDB", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.939, - "details": { - "description": "min=0.931, mean=0.939, max=0.949, sum=2.818 (3)", - "tab": "Accuracy", - "IMDB - ECE (10-bin)": { - "description": "min=0.187, mean=0.232, max=0.257, sum=0.695 (3)", - "tab": "Calibration", - "score": 0.23165086222498446 - }, - "IMDB - EM (Robustness)": { - "description": "min=0.864, mean=0.887, max=0.918, sum=2.662 (3)", - "tab": "Robustness", - "score": 0.8873333333333333 - }, - "IMDB - EM (Fairness)": { - "description": "min=0.902, mean=0.912, max=0.926, sum=2.737 (3)", - "tab": "Fairness", - "score": 0.9123333333333333 - }, - "IMDB - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "IMDB - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "IMDB - # train": { - "description": "min=2.908, mean=4.236, max=4.985, sum=12.708 (3)", - "tab": "General information", - "score": 4.236000000000001 - }, - "IMDB - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "IMDB - # prompt tokens": { - "description": "min=1283.569, mean=1560.056, max=1777.712, sum=4680.167 (3)", - "tab": "General information", - "score": 1560.0556666666664 - }, - "IMDB - # output tokens": { - "description": "min=1, mean=1, max=1, sum=3 (3)", - "tab": "General information", - "score": 1.0 - }, - "IMDB - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "IMDB - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (gender)": { - 
"description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CivilComments", - "source_data": { - "dataset_name": "CivilComments", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on CivilComments", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.544, - "details": { - "description": "min=0.003, mean=0.544, max=1, sum=29.372 (54)", - "tab": "Accuracy", - "CivilComments - ECE (10-bin)": { - "description": "min=0.071, mean=0.28, max=0.632, sum=15.102 (54)", - "tab": "Calibration", - "score": 0.2796625331945748 - }, - "CivilComments - EM (Robustness)": { - "description": "min=0, mean=0.416, max=0.99, sum=22.479 (54)", - "tab": "Robustness", - "score": 0.416268791059841 - }, - "CivilComments - EM (Fairness)": { - "description": "min=0, mean=0.397, max=1, sum=21.425 (54)", - "tab": "Fairness", - "score": 0.3967651888403395 - }, - "CivilComments - Denoised inference time (s)": { - "description": "9 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "CivilComments - # eval": { - "description": "min=74, mean=371.556, max=683, sum=20064 (54)", - "tab": "General information", - "score": 371.55555555555554 - }, - "CivilComments - # train": { - "description": "min=5, mean=5, max=5, sum=270 (54)", - "tab": "General information", - "score": 5.0 - }, - "CivilComments - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (54)", - "tab": "General information", - "score": 0.0 - }, - "CivilComments - # prompt tokens": { - "description": "min=362.037, mean=724.782, max=1272.822, sum=39138.207 (54)", - "tab": "General information", - "score": 724.7816027688522 - }, - "CivilComments - # output tokens": { - "description": "min=1, mean=1, max=1, sum=54 (54)", - "tab": "General information", - "score": 1.0 - }, - "CivilComments - # trials": { - "description": "min=3, mean=3, max=3, sum=162 (54)", - "tab": "General information", - "score": 3.0 - }, - "CivilComments - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (54)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "RAFT", - "source_data": { - "dataset_name": "RAFT", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on RAFT", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.473, - "details": { - "description": "min=0.025, mean=0.473, max=0.975, sum=15.625 
(33)", - "tab": "Accuracy", - "RAFT - ECE (10-bin)": { - "description": "min=0.115, mean=0.29, max=0.826, sum=9.575 (33)", - "tab": "Calibration", - "score": 0.29014727083072167 - }, - "RAFT - EM (Robustness)": { - "description": "min=0, mean=0.402, max=0.975, sum=13.25 (33)", - "tab": "Robustness", - "score": 0.4015151515151515 - }, - "RAFT - EM (Fairness)": { - "description": "min=0, mean=0.445, max=0.975, sum=14.7 (33)", - "tab": "Fairness", - "score": 0.4454545454545455 - }, - "RAFT - Denoised inference time (s)": { - "description": "11 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "RAFT - # eval": { - "description": "min=40, mean=40, max=40, sum=1320 (33)", - "tab": "General information", - "score": 40.0 - }, - "RAFT - # train": { - "description": "min=0, mean=4.56, max=5, sum=150.475 (33)", - "tab": "General information", - "score": 4.5598484848484855 - }, - "RAFT - truncated": { - "description": "min=0, mean=0.002, max=0.025, sum=0.075 (33)", - "tab": "General information", - "score": 0.002272727272727273 - }, - "RAFT - # prompt tokens": { - "description": "min=262.3, mean=810.769, max=1759.65, sum=26755.375 (33)", - "tab": "General information", - "score": 810.7689393939394 - }, - "RAFT - # output tokens": { - "description": "min=0.75, mean=2.916, max=6.5, sum=96.225 (33)", - "tab": "General information", - "score": 2.91590909090909 - }, - "RAFT - # trials": { - "description": "min=3, mean=3, max=3, sum=99 (33)", - "tab": "General information", - "score": 3.0 - }, - "RAFT - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (33)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_classic/aleph-alpha/Luminous-Extended-30B/90e7bfa7-af3a-4979-b0d1-9d75db6e4e30.json b/data/helm_classic/aleph-alpha/Luminous-Extended-30B/90e7bfa7-af3a-4979-b0d1-9d75db6e4e30.json deleted file mode 100644 index d6f8fa8ea..000000000 --- a/data/helm_classic/aleph-alpha/Luminous-Extended-30B/90e7bfa7-af3a-4979-b0d1-9d75db6e4e30.json +++ /dev/null @@ -1,1613 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_classic/aleph-alpha_Luminous-Extended-30B/1770834891.1472661", - "retrieved_timestamp": "1770834891.1472661", - "source_metadata": { - "source_name": "helm_classic", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Luminous Extended 30B", - "id": "aleph-alpha/Luminous-Extended-30B", - "developer": "aleph-alpha", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_classic", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperform on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.485, - "details": { - "tab": "Accuracy", - "Mean win rate - Calibration": { - "description": null, - "tab": "Calibration", - "score": 0.5765957446808511 - }, - "Mean win rate - Robustness": { - "description": null, - "tab": "Robustness", - "score": 0.42993006993006994 - }, - "Mean win rate - Fairness": { - "description": null, - "tab": "Fairness", - "score": 0.45142191142191146 - }, - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": null - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - }, - "Mean win rate - Bias": { - "description": null, - "tab": "Bias", - "score": 0.629471974916769 - }, - "Mean win rate - Toxicity": { - "description": null, - "tab": "Toxicity", - "score": 0.7191265524598858 - }, - "Mean win rate - Summarization metrics": { - "description": null, - "tab": "Summarization metrics", - "score": 0.5657894736842105 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.321, - "details": { - "description": "min=0.23, mean=0.321, max=0.49, sum=4.811 (15)", - "tab": "Accuracy", - "MMLU - ECE (10-bin)": { - "description": "min=0.075, mean=0.135, max=0.225, sum=2.023 (15)", - "tab": "Calibration", - "score": 0.1348564339845485 - }, - "MMLU - EM (Robustness)": { - "description": "min=0.1, mean=0.23, max=0.37, sum=3.451 (15)", - "tab": "Robustness", - "score": 0.23008187134502922 - }, - "MMLU - EM (Fairness)": { - "description": "min=0.14, mean=0.237, max=0.35, sum=3.549 (15)", - "tab": "Fairness", - "score": 0.23658479532163745 - }, - "MMLU - Denoised inference time (s)": { - "description": "5 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=1542 (15)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=75 (15)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (15)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=360.75, mean=471.075, max=618.447, sum=7066.132 (15)", - "tab": "General information", - "score": 471.0754736842105 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=15 (15)", - "tab": "General information", - "score": 1.0 - }, - "MMLU - # trials": { - "description": "min=3, mean=3, max=3, sum=45 (15)", - "tab": "General information", - "score": 3.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "BoolQ", - "source_data": { - "dataset_name": "BoolQ", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on BoolQ", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.767, - 
"details": { - "description": "min=0.752, mean=0.767, max=0.794, sum=2.3 (3)", - "tab": "Accuracy", - "BoolQ - ECE (10-bin)": { - "description": "min=0.11, mean=0.129, max=0.154, sum=0.387 (3)", - "tab": "Calibration", - "score": 0.1289354797828563 - }, - "BoolQ - EM (Robustness)": { - "description": "min=0.637, mean=0.659, max=0.7, sum=1.976 (3)", - "tab": "Robustness", - "score": 0.6586666666666666 - }, - "BoolQ - EM (Fairness)": { - "description": "min=0.692, mean=0.711, max=0.733, sum=2.133 (3)", - "tab": "Fairness", - "score": 0.711 - }, - "BoolQ - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "BoolQ - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "BoolQ - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "BoolQ - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "BoolQ - # prompt tokens": { - "description": "min=651.658, mean=908.991, max=1252.658, sum=2726.974 (3)", - "tab": "General information", - "score": 908.9913333333333 - }, - "BoolQ - # output tokens": { - "description": "min=1, mean=1, max=1, sum=3 (3)", - "tab": "General information", - "score": 1.0 - }, - "BoolQ - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "BoolQ - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.665, - "details": { - "description": "min=0.637, mean=0.665, max=0.684, sum=1.994 (3)", - "tab": "Accuracy", - "NarrativeQA - ECE (10-bin)": { - "description": "min=0.043, mean=0.046, max=0.047, sum=0.138 (3)", - "tab": "Calibration", - "score": 0.046063826868188405 - }, - "NarrativeQA - F1 (Robustness)": { - "description": "min=0.481, mean=0.513, max=0.539, sum=1.54 (3)", - "tab": "Robustness", - "score": 0.513450295883327 - }, - "NarrativeQA - F1 (Fairness)": { - "description": "min=0.503, mean=0.532, max=0.565, sum=1.597 (3)", - "tab": "Fairness", - "score": 0.5321907426131639 - }, - "NarrativeQA - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=1065 (3)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=1.039, mean=1.621, max=2.037, sum=4.862 
(3)", - "tab": "General information", - "score": 1.6206572769953052 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=1606.952, mean=1647.783, max=1694.642, sum=4943.349 (3)", - "tab": "General information", - "score": 1647.783098591549 - }, - "NarrativeQA - # output tokens": { - "description": "min=6.321, mean=7.042, max=8.175, sum=21.127 (3)", - "tab": "General information", - "score": 7.04225352112676 - }, - "NarrativeQA - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NarrativeQA - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NarrativeQA - Stereotypes (gender)": { - "description": "min=0.4, mean=0.416, max=0.44, sum=1.248 (3)", - "tab": "Bias", - "score": 0.4159611992945326 - }, - "NarrativeQA - Representation (race)": { - "description": "min=0.333, mean=0.556, max=0.667, sum=1.667 (3)", - "tab": "Bias", - "score": 0.5555555555555557 - }, - "NarrativeQA - Representation (gender)": { - "description": "min=0.186, mean=0.199, max=0.207, sum=0.598 (3)", - "tab": "Bias", - "score": 0.19931611685099856 - }, - "NarrativeQA - Toxic fraction": { - "description": "min=0.014, mean=0.017, max=0.02, sum=0.051 (3)", - "tab": "Toxicity", - "score": 0.016901408450704227 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (open-book)", - "source_data": { - "dataset_name": "NaturalQuestions (open-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (open-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.609, - "details": { - "description": "min=0.606, mean=0.609, max=0.611, sum=1.827 (3)", - "tab": "Accuracy", - "NaturalQuestions (closed-book) - ECE (10-bin)": { - "description": "min=0.018, mean=0.022, max=0.024, sum=0.065 (3)", - "tab": "Calibration", - "score": 0.02157162838647707 - }, - "NaturalQuestions (open-book) - ECE (10-bin)": { - "description": "min=0.08, mean=0.09, max=0.095, sum=0.269 (3)", - "tab": "Calibration", - "score": 0.08979897901208977 - }, - "NaturalQuestions (closed-book) - F1 (Robustness)": { - "description": "min=0.205, mean=0.212, max=0.218, sum=0.635 (3)", - "tab": "Robustness", - "score": 0.211552896733343 - }, - "NaturalQuestions (open-book) - F1 (Robustness)": { - "description": "min=0.515, mean=0.524, max=0.537, sum=1.572 (3)", - "tab": "Robustness", - "score": 0.5239378524073847 - }, - "NaturalQuestions (closed-book) - F1 (Fairness)": { - "description": "min=0.205, mean=0.214, max=0.22, sum=0.642 (3)", - "tab": "Fairness", - "score": 0.21385439000180537 - }, - "NaturalQuestions (open-book) - F1 (Fairness)": { - "description": "min=0.548, mean=0.551, max=0.554, sum=1.654 (3)", - "tab": "Fairness", - "score": 0.5512241821510145 - }, - "NaturalQuestions (closed-book) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NaturalQuestions (open-book) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - 
"NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=109.087, mean=111.754, max=116.087, sum=335.261 (3)", - "tab": "General information", - "score": 111.75366666666667 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=5.508, mean=6.119, max=6.869, sum=18.356 (3)", - "tab": "General information", - "score": 6.118666666666666 - }, - "NaturalQuestions (closed-book) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.691, mean=4.711, max=4.726, sum=14.134 (3)", - "tab": "General information", - "score": 4.711333333333333 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.038, mean=0.039, max=0.04, sum=0.116 (3)", - "tab": "General information", - "score": 0.03866666666666666 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1224.733, mean=1384.565, max=1488.14, sum=4153.695 (3)", - "tab": "General information", - "score": 1384.5649999999998 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=8.216, mean=10.3, max=11.913, sum=30.9 (3)", - "tab": "General information", - "score": 10.299999999999999 - }, - "NaturalQuestions (open-book) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NaturalQuestions (closed-book) - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=0.667 (1)", - "tab": "Bias", - "score": 0.6666666666666667 - }, - "NaturalQuestions (closed-book) - Stereotypes (gender)": { - "description": "min=0.379, mean=0.46, max=0.5, sum=1.379 (3)", - "tab": "Bias", - "score": 0.4597701149425288 - }, - "NaturalQuestions (closed-book) - Representation (race)": { - "description": "min=0.414, mean=0.435, max=0.447, sum=1.304 (3)", - "tab": "Bias", - "score": 0.43455385345385017 - }, - "NaturalQuestions (closed-book) - Representation (gender)": { - "description": "min=0.15, mean=0.223, max=0.269, sum=0.669 (3)", - "tab": "Bias", - "score": 0.2230769230769231 - }, - "NaturalQuestions (open-book) - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=1.333 (2)", - "tab": "Bias", - "score": 0.6666666666666667 - }, - "NaturalQuestions (open-book) - Stereotypes (gender)": { - "description": "min=0.4, mean=0.411, max=0.433, sum=1.233 (3)", - "tab": "Bias", - "score": 0.41111111111111115 - }, - "NaturalQuestions (open-book) - Representation (race)": { - "description": "min=0.421, mean=0.441, max=0.477, sum=1.324 (3)", - "tab": "Bias", - "score": 0.44143286168772855 - }, - "NaturalQuestions (open-book) - Representation (gender)": { - "description": "min=0.022, mean=0.045, max=0.082, sum=0.135 (3)", - "tab": "Bias", - "score": 0.04515740195666192 - }, - "NaturalQuestions 
(closed-book) - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "Toxicity", - "score": 0.0 - }, - "NaturalQuestions (open-book) - Toxic fraction": { - "description": "min=0, mean=0.001, max=0.002, sum=0.003 (3)", - "tab": "Toxicity", - "score": 0.001 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "QuAC", - "source_data": { - "dataset_name": "QuAC", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on QuAC", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.349, - "details": { - "description": "min=0.34, mean=0.349, max=0.363, sum=1.047 (3)", - "tab": "Accuracy", - "QuAC - ECE (10-bin)": { - "description": "min=0.081, mean=0.096, max=0.116, sum=0.287 (3)", - "tab": "Calibration", - "score": 0.09561324552236967 - }, - "QuAC - F1 (Robustness)": { - "description": "min=0.188, mean=0.193, max=0.201, sum=0.578 (3)", - "tab": "Robustness", - "score": 0.1926796273359054 - }, - "QuAC - F1 (Fairness)": { - "description": "min=0.268, mean=0.277, max=0.295, sum=0.832 (3)", - "tab": "Fairness", - "score": 0.2774375608495023 - }, - "QuAC - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "QuAC - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "QuAC - # train": { - "description": "min=0.84, mean=0.909, max=0.991, sum=2.727 (3)", - "tab": "General information", - "score": 0.9089999999999999 - }, - "QuAC - truncated": { - "description": "min=0.029, mean=0.033, max=0.037, sum=0.098 (3)", - "tab": "General information", - "score": 0.03266666666666667 - }, - "QuAC - # prompt tokens": { - "description": "min=1596.904, mean=1641.256, max=1672.92, sum=4923.768 (3)", - "tab": "General information", - "score": 1641.256 - }, - "QuAC - # output tokens": { - "description": "min=20.299, mean=21.144, max=22.408, sum=63.432 (3)", - "tab": "General information", - "score": 21.144000000000002 - }, - "QuAC - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "QuAC - Stereotypes (race)": { - "description": "min=0.59, mean=0.612, max=0.636, sum=1.837 (3)", - "tab": "Bias", - "score": 0.6124061124061125 - }, - "QuAC - Stereotypes (gender)": { - "description": "min=0.382, mean=0.403, max=0.421, sum=1.208 (3)", - "tab": "Bias", - "score": 0.40276421801932005 - }, - "QuAC - Representation (race)": { - "description": "min=0.202, mean=0.24, max=0.259, sum=0.719 (3)", - "tab": "Bias", - "score": 0.23980711859954595 - }, - "QuAC - Representation (gender)": { - "description": "min=0.194, mean=0.2, max=0.205, sum=0.601 (3)", - "tab": "Bias", - "score": 0.20029662396768255 - }, - "QuAC - Toxic fraction": { - "description": "min=0, mean=0.0, max=0.001, sum=0.001 (3)", - "tab": "Toxicity", - "score": 0.0003333333333333333 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "HellaSwag", - "source_data": { - "dataset_name": "HellaSwag", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - 
"evaluation_description": "EM on HellaSwag", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "HellaSwag - ECE (10-bin)": { - "description": "No matching runs", - "tab": "Calibration", - "score": null - }, - "HellaSwag - EM (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "HellaSwag - EM (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "HellaSwag - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "HellaSwag - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "OpenbookQA - ECE (10-bin)": { - "description": "No matching runs", - "tab": "Calibration", - "score": null - }, - "OpenbookQA - EM (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "OpenbookQA - EM (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "OpenbookQA - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "OpenbookQA - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "TruthfulQA", - "source_data": { - "dataset_name": "TruthfulQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] 
- }, - "metric_config": { - "evaluation_description": "EM on TruthfulQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.221, - "details": { - "description": "min=0.208, mean=0.221, max=0.231, sum=0.662 (3)", - "tab": "Accuracy", - "TruthfulQA - ECE (10-bin)": { - "description": "min=0.057, mean=0.064, max=0.068, sum=0.192 (3)", - "tab": "Calibration", - "score": 0.0641638452052097 - }, - "TruthfulQA - EM (Robustness)": { - "description": "min=0.139, mean=0.151, max=0.161, sum=0.454 (3)", - "tab": "Robustness", - "score": 0.15137614678899083 - }, - "TruthfulQA - EM (Fairness)": { - "description": "min=0.144, mean=0.16, max=0.171, sum=0.479 (3)", - "tab": "Fairness", - "score": 0.15953109072375127 - }, - "TruthfulQA - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "TruthfulQA - # eval": { - "description": "min=654, mean=654, max=654, sum=1962 (3)", - "tab": "General information", - "score": 654.0 - }, - "TruthfulQA - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "TruthfulQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "TruthfulQA - # prompt tokens": { - "description": "min=504.073, mean=514.073, max=533.073, sum=1542.22 (3)", - "tab": "General information", - "score": 514.0733944954128 - }, - "TruthfulQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=3 (3)", - "tab": "General information", - "score": 1.0 - }, - "TruthfulQA - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MS MARCO (TREC)", - "source_data": { - "dataset_name": "MS MARCO (TREC)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "NDCG@10 on MS MARCO (TREC)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "MS MARCO (regular) - RR@10 (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "MS MARCO (TREC) - NDCG@10 (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "MS MARCO (regular) - RR@10 (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "MS MARCO (TREC) - NDCG@10 (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "MS MARCO (regular) - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "MS MARCO (TREC) - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "MS MARCO (regular) - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - truncated": { - "description": "No matching runs", - "tab": "General 
information", - "score": null - }, - "MS MARCO (regular) - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - }, - "MS MARCO (TREC) - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CNN/DailyMail", - "source_data": { - "dataset_name": "CNN/DailyMail", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on CNN/DailyMail", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.139, - "details": { - "description": "min=0.117, mean=0.139, max=0.15, sum=0.834 (6)", - "tab": "Accuracy", - "CNN/DailyMail - Denoised inference time (s)": { - "description": "2 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "CNN/DailyMail - # eval": { - "description": "min=466, mean=466, max=466, sum=2796 (6)", - "tab": "General information", - "score": 466.0 - }, - "CNN/DailyMail - # train": { - "description": "min=5, mean=5, max=5, sum=30 (6)", - "tab": "General information", - "score": 5.0 - }, - "CNN/DailyMail - truncated": { - "description": "min=0, 
mean=0, max=0, sum=0 (6)", - "tab": "General information", - "score": 0.0 - }, - "CNN/DailyMail - # prompt tokens": { - "description": "min=1564.648, mean=1578.648, max=1593.648, sum=9471.888 (6)", - "tab": "General information", - "score": 1578.648068669528 - }, - "CNN/DailyMail - # output tokens": { - "description": "min=73.322, mean=83.112, max=88.178, sum=498.674 (6)", - "tab": "General information", - "score": 83.11230329041489 - }, - "CNN/DailyMail - # trials": { - "description": "min=3, mean=3, max=3, sum=18 (6)", - "tab": "General information", - "score": 3.0 - }, - "CNN/DailyMail - Stereotypes (race)": { - "description": "min=0.58, mean=0.608, max=0.637, sum=3.651 (6)", - "tab": "Bias", - "score": 0.6084787955510622 - }, - "CNN/DailyMail - Stereotypes (gender)": { - "description": "min=0.382, mean=0.391, max=0.398, sum=2.347 (6)", - "tab": "Bias", - "score": 0.3911797965697547 - }, - "CNN/DailyMail - Representation (race)": { - "description": "min=0.254, mean=0.274, max=0.288, sum=1.642 (6)", - "tab": "Bias", - "score": 0.27361254875467617 - }, - "CNN/DailyMail - Representation (gender)": { - "description": "min=0.128, mean=0.151, max=0.191, sum=0.909 (6)", - "tab": "Bias", - "score": 0.15142644383010628 - }, - "CNN/DailyMail - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (6)", - "tab": "Toxicity", - "score": 0.0 - }, - "CNN/DailyMail - SummaC": { - "description": "min=0.309, mean=0.481, max=0.569, sum=1.443 (3)", - "tab": "Summarization metrics", - "score": 0.4809362133230566 - }, - "CNN/DailyMail - QAFactEval": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - BERTScore (F1)": { - "description": "min=0.202, mean=0.255, max=0.288, sum=0.766 (3)", - "tab": "Summarization metrics", - "score": 0.25521962437955664 - }, - "CNN/DailyMail - Coverage": { - "description": "min=0.8, mean=0.925, max=0.989, sum=5.552 (6)", - "tab": "Summarization metrics", - "score": 0.9253891304300669 - }, - "CNN/DailyMail - Density": { - "description": "min=34.945, mean=41.619, max=45.552, sum=249.715 (6)", - "tab": "Summarization metrics", - "score": 41.61911540769457 - }, - "CNN/DailyMail - Compression": { - "description": "min=8.478, mean=9.039, max=9.909, sum=54.236 (6)", - "tab": "Summarization metrics", - "score": 9.039273431117751 - }, - "CNN/DailyMail - HumanEval-faithfulness": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-relevance": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-coherence": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "XSUM", - "source_data": { - "dataset_name": "XSUM", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on XSUM", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.124, - "details": { - "description": "min=0.122, mean=0.124, max=0.126, sum=0.742 (6)", - "tab": "Accuracy", - "XSUM - Denoised inference time (s)": { - "description": "2 matching runs, but no 
matching metrics", - "tab": "Efficiency", - "score": null - }, - "XSUM - # eval": { - "description": "min=518, mean=518, max=518, sum=3108 (6)", - "tab": "General information", - "score": 518.0 - }, - "XSUM - # train": { - "description": "min=4.998, mean=4.999, max=5, sum=29.992 (6)", - "tab": "General information", - "score": 4.998712998712999 - }, - "XSUM - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (6)", - "tab": "General information", - "score": 0.0 - }, - "XSUM - # prompt tokens": { - "description": "min=1472.903, mean=1532.912, max=1566.407, sum=9197.471 (6)", - "tab": "General information", - "score": 1532.9118404118406 - }, - "XSUM - # output tokens": { - "description": "min=25.747, mean=25.987, max=26.212, sum=155.923 (6)", - "tab": "General information", - "score": 25.987129987129986 - }, - "XSUM - # trials": { - "description": "min=3, mean=3, max=3, sum=18 (6)", - "tab": "General information", - "score": 3.0 - }, - "XSUM - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=4 (6)", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "XSUM - Stereotypes (gender)": { - "description": "min=0.449, mean=0.45, max=0.451, sum=2.701 (6)", - "tab": "Bias", - "score": 0.450224364113253 - }, - "XSUM - Representation (race)": { - "description": "min=0.532, mean=0.547, max=0.565, sum=3.282 (6)", - "tab": "Bias", - "score": 0.5469576096753798 - }, - "XSUM - Representation (gender)": { - "description": "min=0.212, mean=0.214, max=0.217, sum=1.283 (6)", - "tab": "Bias", - "score": 0.2138886962661304 - }, - "XSUM - Toxic fraction": { - "description": "min=0, mean=0.002, max=0.004, sum=0.012 (6)", - "tab": "Toxicity", - "score": 0.0019305019305019308 - }, - "XSUM - SummaC": { - "description": "min=-0.233, mean=-0.225, max=-0.212, sum=-0.675 (3)", - "tab": "Summarization metrics", - "score": -0.22500232932190178 - }, - "XSUM - QAFactEval": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - BERTScore (F1)": { - "description": "min=0.419, mean=0.423, max=0.427, sum=1.269 (3)", - "tab": "Summarization metrics", - "score": 0.4230439766625391 - }, - "XSUM - Coverage": { - "description": "min=0.817, mean=0.818, max=0.819, sum=4.91 (6)", - "tab": "Summarization metrics", - "score": 0.8184154242425056 - }, - "XSUM - Density": { - "description": "min=3.392, mean=3.507, max=3.668, sum=21.042 (6)", - "tab": "Summarization metrics", - "score": 3.507010978728374 - }, - "XSUM - Compression": { - "description": "min=17.136, mean=17.376, max=17.524, sum=104.258 (6)", - "tab": "Summarization metrics", - "score": 17.376290660463752 - }, - "XSUM - HumanEval-faithfulness": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-relevance": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-coherence": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "IMDB", - "source_data": { - "dataset_name": "IMDB", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on IMDB", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.947, - "details": { - "description": "min=0.944, mean=0.947, max=0.951, sum=2.842 (3)", - "tab": "Accuracy", - "IMDB - ECE (10-bin)": { - "description": "min=0.177, mean=0.204, max=0.232, sum=0.612 (3)", - "tab": "Calibration", - "score": 0.2038815444945483 - }, - "IMDB - EM (Robustness)": { - "description": "min=0.913, mean=0.92, max=0.933, sum=2.76 (3)", - "tab": "Robustness", - "score": 0.9199999999999999 - }, - "IMDB - EM (Fairness)": { - "description": "min=0.93, mean=0.937, max=0.946, sum=2.811 (3)", - "tab": "Fairness", - "score": 0.9369999999999999 - }, - "IMDB - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "IMDB - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "IMDB - # train": { - "description": "min=2.908, mean=4.236, max=4.985, sum=12.708 (3)", - "tab": "General information", - "score": 4.236000000000001 - }, - "IMDB - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "IMDB - # prompt tokens": { - "description": "min=1283.569, mean=1560.056, max=1777.712, sum=4680.167 (3)", - "tab": "General information", - "score": 1560.0556666666664 - }, - "IMDB - # output tokens": { - "description": "min=1, mean=1, max=1, sum=3 (3)", - "tab": "General information", - "score": 1.0 - }, - "IMDB - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "IMDB - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CivilComments", - "source_data": { - "dataset_name": "CivilComments", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on CivilComments", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.524, - "details": { - "description": "min=0.014, mean=0.524, max=0.997, sum=28.276 (54)", - "tab": "Accuracy", - "CivilComments - ECE (10-bin)": { - "description": "min=0.112, mean=0.359, max=0.619, sum=19.409 (54)", - "tab": "Calibration", - "score": 0.35941964376806523 - }, - "CivilComments - EM (Robustness)": { - "description": "min=0.011, mean=0.368, max=0.874, sum=19.881 (54)", - "tab": "Robustness", - "score": 0.36816849425853654 - }, - "CivilComments - EM (Fairness)": { - "description": "min=0, mean=0.462, max=0.985, sum=24.963 (54)", - "tab": "Fairness", - "score": 0.4622866273105216 - }, - "CivilComments - Denoised inference time (s)": { - "description": "9 matching 
runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "CivilComments - # eval": { - "description": "min=74, mean=371.556, max=683, sum=20064 (54)", - "tab": "General information", - "score": 371.55555555555554 - }, - "CivilComments - # train": { - "description": "min=5, mean=5, max=5, sum=270 (54)", - "tab": "General information", - "score": 5.0 - }, - "CivilComments - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (54)", - "tab": "General information", - "score": 0.0 - }, - "CivilComments - # prompt tokens": { - "description": "min=362.037, mean=724.782, max=1272.822, sum=39138.207 (54)", - "tab": "General information", - "score": 724.7816027688522 - }, - "CivilComments - # output tokens": { - "description": "min=1, mean=1, max=1, sum=54 (54)", - "tab": "General information", - "score": 1.0 - }, - "CivilComments - # trials": { - "description": "min=3, mean=3, max=3, sum=162 (54)", - "tab": "General information", - "score": 3.0 - }, - "CivilComments - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (54)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "RAFT", - "source_data": { - "dataset_name": "RAFT", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on RAFT", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.523, - "details": { - "description": "min=0, mean=0.523, max=0.925, sum=17.25 (33)", - "tab": "Accuracy", - "RAFT - ECE (10-bin)": { - "description": "min=0.143, mean=0.29, max=0.954, sum=9.577 (33)", - "tab": "Calibration", - "score": 0.2902057183123561 - }, - "RAFT - EM (Robustness)": { - "description": "min=0, mean=0.436, max=0.825, sum=14.4 (33)", - "tab": "Robustness", - "score": 0.43636363636363645 - }, - "RAFT - EM (Fairness)": { - "description": "min=0, mean=0.489, max=0.925, sum=16.15 (33)", - "tab": "Fairness", - "score": 0.4893939393939393 - }, - "RAFT - Denoised inference time (s)": { - "description": "11 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "RAFT - # eval": { - "description": "min=40, mean=40, max=40, sum=1320 (33)", - "tab": "General information", - "score": 40.0 - }, - "RAFT - # train": { - "description": "min=0, mean=4.56, max=5, sum=150.475 (33)", - "tab": "General information", - "score": 4.5598484848484855 - }, - "RAFT - truncated": { - "description": "min=0, mean=0.002, max=0.025, sum=0.075 (33)", - "tab": "General information", - "score": 0.002272727272727273 - }, - "RAFT - # prompt tokens": { - "description": "min=262.3, mean=810.769, max=1759.65, sum=26755.375 (33)", - "tab": "General information", - "score": 810.7689393939394 - }, - "RAFT - # output tokens": { - "description": "min=0.125, mean=2.796, max=6.825, sum=92.275 (33)", - "tab": "General information", - "score": 2.796212121212121 - }, - "RAFT - # 
trials": { - "description": "min=3, mean=3, max=3, sum=99 (33)", - "tab": "General information", - "score": 3.0 - }, - "RAFT - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (33)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_classic/aleph-alpha/Luminous-Supreme-70B/d113c21d-7c89-4cde-98b8-0c2f8d03fdf6.json b/data/helm_classic/aleph-alpha/Luminous-Supreme-70B/d113c21d-7c89-4cde-98b8-0c2f8d03fdf6.json deleted file mode 100644 index 5680298fb..000000000 --- a/data/helm_classic/aleph-alpha/Luminous-Supreme-70B/d113c21d-7c89-4cde-98b8-0c2f8d03fdf6.json +++ /dev/null @@ -1,1613 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_classic/aleph-alpha_Luminous-Supreme-70B/1770834891.1472661", - "retrieved_timestamp": "1770834891.1472661", - "source_metadata": { - "source_name": "helm_classic", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Luminous Supreme 70B", - "id": "aleph-alpha/Luminous-Supreme-70B", - "developer": "aleph-alpha", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_classic", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperform on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.662, - "details": { - "tab": "Accuracy", - "Mean win rate - Calibration": { - "description": null, - "tab": "Calibration", - "score": 0.6242368177613321 - }, - "Mean win rate - Robustness": { - "description": null, - "tab": "Robustness", - "score": 0.5464102564102564 - }, - "Mean win rate - Fairness": { - "description": null, - "tab": "Fairness", - "score": 0.5218648018648019 - }, - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": null - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - }, - "Mean win rate - Bias": { - "description": null, - "tab": "Bias", - "score": 0.5709490829944818 - }, - "Mean win rate - Toxicity": { - "description": null, - "tab": "Toxicity", - "score": 0.5562049062049063 - }, - "Mean win rate - Summarization metrics": { - "description": null, - "tab": "Summarization metrics", - "score": 0.7171052631578947 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.38, - "details": { - "description": "min=0.22, mean=0.38, max=0.61, sum=5.702 (15)", - "tab": "Accuracy", - "MMLU - ECE (10-bin)": { - "description": "min=0.122, mean=0.154, max=0.217, sum=2.31 (15)", - "tab": "Calibration", - "score": 0.15396738685964684 - }, - "MMLU - EM (Robustness)": { - "description": "min=0.08, mean=0.255, max=0.51, sum=3.821 (15)", - "tab": "Robustness", - "score": 0.2547368421052632 - }, - "MMLU - EM (Fairness)": { - "description": "min=0.11, mean=0.264, max=0.51, sum=3.955 (15)", - "tab": "Fairness", - "score": 0.2636608187134503 - }, - "MMLU - Denoised inference time (s)": { - "description": "5 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=1542 (15)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=75 (15)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (15)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=360.75, mean=471.075, max=618.447, sum=7066.132 (15)", - "tab": "General information", - "score": 471.0754736842105 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=15 (15)", - "tab": "General information", - "score": 1.0 - }, - "MMLU - # trials": { - "description": "min=3, mean=3, max=3, sum=45 (15)", - "tab": "General information", - "score": 3.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "BoolQ", - "source_data": { - "dataset_name": "BoolQ", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on BoolQ", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.775, - "details": { - "description": "min=0.748, mean=0.775, max=0.795, sum=2.325 (3)", - "tab": "Accuracy", - "BoolQ - ECE (10-bin)": { - "description": "min=0.06, mean=0.083, max=0.111, sum=0.248 (3)", - "tab": "Calibration", - "score": 0.08277086924611576 - }, - "BoolQ - EM (Robustness)": { - "description": "min=0.624, mean=0.665, max=0.693, sum=1.996 (3)", - "tab": "Robustness", - "score": 0.6653333333333333 - }, - "BoolQ - EM (Fairness)": { - "description": "min=0.66, mean=0.694, max=0.713, sum=2.081 (3)", - "tab": "Fairness", - "score": 0.6936666666666667 - }, - "BoolQ - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "BoolQ - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "BoolQ - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "BoolQ - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "BoolQ - # prompt tokens": { - "description": "min=651.658, mean=908.991, max=1252.658, sum=2726.974 (3)", - "tab": "General information", - "score": 908.9913333333333 - }, - "BoolQ - # output tokens": { - "description": "min=1, mean=1, max=1, sum=3 (3)", - "tab": "General information", - "score": 1.0 - }, - "BoolQ - # 
trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "BoolQ - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.711, - "details": { - "description": "min=0.687, mean=0.711, max=0.742, sum=2.133 (3)", - "tab": "Accuracy", - "NarrativeQA - ECE (10-bin)": { - "description": "min=0.036, mean=0.049, max=0.061, sum=0.147 (3)", - "tab": "Calibration", - "score": 0.04915634481869984 - }, - "NarrativeQA - F1 (Robustness)": { - "description": "min=0.557, mean=0.59, max=0.617, sum=1.771 (3)", - "tab": "Robustness", - "score": 0.5902392957151222 - }, - "NarrativeQA - F1 (Fairness)": { - "description": "min=0.562, mean=0.603, max=0.637, sum=1.808 (3)", - "tab": "Fairness", - "score": 0.6025352758861713 - }, - "NarrativeQA - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=1065 (3)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=1.039, mean=1.621, max=2.037, sum=4.862 (3)", - "tab": "General information", - "score": 1.6206572769953052 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=1606.952, mean=1647.783, max=1694.642, sum=4943.349 (3)", - "tab": "General information", - "score": 1647.783098591549 - }, - "NarrativeQA - # output tokens": { - "description": "min=5.749, mean=6.84, max=8.158, sum=20.521 (3)", - "tab": "General information", - "score": 6.84037558685446 - }, - "NarrativeQA - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NarrativeQA - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NarrativeQA - Stereotypes (gender)": { - "description": "min=0.396, mean=0.465, max=0.5, sum=1.396 (3)", - "tab": "Bias", - "score": 0.46527777777777773 - }, - "NarrativeQA - Representation (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=2 (3)", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "NarrativeQA - Representation (gender)": { - "description": "min=0.216, mean=0.238, max=0.256, sum=0.714 (3)", - "tab": "Bias", - "score": 0.23804020866547204 - }, - "NarrativeQA - Toxic fraction": { - "description": "min=0.011, mean=0.016, max=0.02, sum=0.048 (3)", - "tab": "Toxicity", - "score": 
0.01596244131455399 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (open-book)", - "source_data": { - "dataset_name": "NaturalQuestions (open-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (open-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.649, - "details": { - "description": "min=0.644, mean=0.649, max=0.656, sum=1.946 (3)", - "tab": "Accuracy", - "NaturalQuestions (closed-book) - ECE (10-bin)": { - "description": "min=0.035, mean=0.041, max=0.045, sum=0.123 (3)", - "tab": "Calibration", - "score": 0.04112615448004484 - }, - "NaturalQuestions (open-book) - ECE (10-bin)": { - "description": "min=0.07, mean=0.074, max=0.077, sum=0.222 (3)", - "tab": "Calibration", - "score": 0.07410001302901324 - }, - "NaturalQuestions (closed-book) - F1 (Robustness)": { - "description": "min=0.243, mean=0.252, max=0.261, sum=0.757 (3)", - "tab": "Robustness", - "score": 0.25230806968086933 - }, - "NaturalQuestions (open-book) - F1 (Robustness)": { - "description": "min=0.576, mean=0.586, max=0.593, sum=1.758 (3)", - "tab": "Robustness", - "score": 0.5861072363623724 - }, - "NaturalQuestions (closed-book) - F1 (Fairness)": { - "description": "min=0.23, mean=0.241, max=0.25, sum=0.723 (3)", - "tab": "Fairness", - "score": 0.24089192251975544 - }, - "NaturalQuestions (open-book) - F1 (Fairness)": { - "description": "min=0.583, mean=0.597, max=0.61, sum=1.79 (3)", - "tab": "Fairness", - "score": 0.5966421355805813 - }, - "NaturalQuestions (closed-book) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NaturalQuestions (open-book) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=109.087, mean=111.754, max=116.087, sum=335.261 (3)", - "tab": "General information", - "score": 111.75366666666667 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=4.262, mean=4.508, max=4.666, sum=13.525 (3)", - "tab": "General information", - "score": 4.508333333333334 - }, - "NaturalQuestions (closed-book) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.691, mean=4.711, max=4.726, sum=14.134 (3)", - "tab": "General information", - "score": 4.711333333333333 - }, - "NaturalQuestions (open-book) - truncated": { - "description": 
"min=0.038, mean=0.039, max=0.04, sum=0.116 (3)", - "tab": "General information", - "score": 0.03866666666666666 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1224.733, mean=1384.565, max=1488.14, sum=4153.695 (3)", - "tab": "General information", - "score": 1384.5649999999998 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=6.064, mean=6.362, max=6.864, sum=19.086 (3)", - "tab": "General information", - "score": 6.361999999999999 - }, - "NaturalQuestions (open-book) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NaturalQuestions (closed-book) - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=2 (3)", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "NaturalQuestions (closed-book) - Stereotypes (gender)": { - "description": "min=0.338, mean=0.446, max=0.5, sum=1.338 (3)", - "tab": "Bias", - "score": 0.445882557030098 - }, - "NaturalQuestions (closed-book) - Representation (race)": { - "description": "min=0.467, mean=0.48, max=0.498, sum=1.441 (3)", - "tab": "Bias", - "score": 0.48022397745392514 - }, - "NaturalQuestions (closed-book) - Representation (gender)": { - "description": "min=0.02, mean=0.125, max=0.265, sum=0.374 (3)", - "tab": "Bias", - "score": 0.12466386554621849 - }, - "NaturalQuestions (open-book) - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=0.667 (1)", - "tab": "Bias", - "score": 0.6666666666666667 - }, - "NaturalQuestions (open-book) - Stereotypes (gender)": { - "description": "min=0.4, mean=0.444, max=0.5, sum=1.333 (3)", - "tab": "Bias", - "score": 0.4444444444444445 - }, - "NaturalQuestions (open-book) - Representation (race)": { - "description": "min=0.401, mean=0.44, max=0.506, sum=1.319 (3)", - "tab": "Bias", - "score": 0.43982889050590296 - }, - "NaturalQuestions (open-book) - Representation (gender)": { - "description": "min=0.205, mean=0.22, max=0.25, sum=0.66 (3)", - "tab": "Bias", - "score": 0.2201426024955437 - }, - "NaturalQuestions (closed-book) - Toxic fraction": { - "description": "min=0.001, mean=0.002, max=0.003, sum=0.006 (3)", - "tab": "Toxicity", - "score": 0.002 - }, - "NaturalQuestions (open-book) - Toxic fraction": { - "description": "min=0, mean=0.001, max=0.002, sum=0.004 (3)", - "tab": "Toxicity", - "score": 0.0013333333333333333 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "QuAC", - "source_data": { - "dataset_name": "QuAC", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on QuAC", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.37, - "details": { - "description": "min=0.364, mean=0.37, max=0.378, sum=1.111 (3)", - "tab": "Accuracy", - "QuAC - ECE (10-bin)": { - "description": "min=0.054, mean=0.058, max=0.061, sum=0.175 (3)", - "tab": "Calibration", - "score": 0.05820640656843105 - }, - "QuAC - F1 (Robustness)": { - "description": "min=0.221, mean=0.233, max=0.24, sum=0.699 (3)", - "tab": "Robustness", - "score": 0.23311906486145426 - }, - "QuAC - F1 (Fairness)": { - "description": "min=0.28, mean=0.288, max=0.3, sum=0.865 (3)", - "tab": "Fairness", - "score": 0.28824116919086756 - }, - "QuAC - Denoised inference time (s)": { - 
"description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "QuAC - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "QuAC - # train": { - "description": "min=0.84, mean=0.909, max=0.991, sum=2.727 (3)", - "tab": "General information", - "score": 0.9089999999999999 - }, - "QuAC - truncated": { - "description": "min=0.029, mean=0.033, max=0.037, sum=0.098 (3)", - "tab": "General information", - "score": 0.03266666666666667 - }, - "QuAC - # prompt tokens": { - "description": "min=1596.904, mean=1641.256, max=1672.92, sum=4923.768 (3)", - "tab": "General information", - "score": 1641.256 - }, - "QuAC - # output tokens": { - "description": "min=22.638, mean=26.241, max=28.094, sum=78.723 (3)", - "tab": "General information", - "score": 26.241000000000003 - }, - "QuAC - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "QuAC - Stereotypes (race)": { - "description": "min=0.571, mean=0.598, max=0.615, sum=1.794 (3)", - "tab": "Bias", - "score": 0.5980796023899473 - }, - "QuAC - Stereotypes (gender)": { - "description": "min=0.408, mean=0.412, max=0.415, sum=1.236 (3)", - "tab": "Bias", - "score": 0.41214192227908586 - }, - "QuAC - Representation (race)": { - "description": "min=0.269, mean=0.305, max=0.351, sum=0.914 (3)", - "tab": "Bias", - "score": 0.3046567170277752 - }, - "QuAC - Representation (gender)": { - "description": "min=0.227, mean=0.232, max=0.235, sum=0.696 (3)", - "tab": "Bias", - "score": 0.23187441800624423 - }, - "QuAC - Toxic fraction": { - "description": "min=0.002, mean=0.002, max=0.003, sum=0.007 (3)", - "tab": "Toxicity", - "score": 0.0023333333333333335 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "HellaSwag", - "source_data": { - "dataset_name": "HellaSwag", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on HellaSwag", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "HellaSwag - ECE (10-bin)": { - "description": "No matching runs", - "tab": "Calibration", - "score": null - }, - "HellaSwag - EM (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "HellaSwag - EM (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "HellaSwag - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "HellaSwag - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # trials": { - "description": "No matching runs", - "tab": "General 
information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "OpenbookQA - ECE (10-bin)": { - "description": "No matching runs", - "tab": "Calibration", - "score": null - }, - "OpenbookQA - EM (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "OpenbookQA - EM (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "OpenbookQA - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "OpenbookQA - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "TruthfulQA", - "source_data": { - "dataset_name": "TruthfulQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on TruthfulQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.222, - "details": { - "description": "min=0.2, mean=0.222, max=0.258, sum=0.667 (3)", - "tab": "Accuracy", - "TruthfulQA - ECE (10-bin)": { - "description": "min=0.072, mean=0.092, max=0.102, sum=0.276 (3)", - "tab": "Calibration", - "score": 0.09195091586715554 - }, - "TruthfulQA - EM (Robustness)": { - "description": "min=0.092, mean=0.106, max=0.121, sum=0.318 (3)", - "tab": "Robustness", - "score": 0.10601427115188583 - }, - "TruthfulQA - EM (Fairness)": { - "description": "min=0.128, mean=0.132, max=0.138, sum=0.396 (3)", - "tab": "Fairness", - "score": 0.13200815494393475 - }, - "TruthfulQA - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "TruthfulQA - # eval": { - "description": "min=654, mean=654, max=654, sum=1962 (3)", - "tab": "General information", - "score": 654.0 - }, - "TruthfulQA - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "TruthfulQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "TruthfulQA - # prompt tokens": { - 
"description": "min=504.073, mean=514.073, max=533.073, sum=1542.22 (3)", - "tab": "General information", - "score": 514.0733944954128 - }, - "TruthfulQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=3 (3)", - "tab": "General information", - "score": 1.0 - }, - "TruthfulQA - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MS MARCO (TREC)", - "source_data": { - "dataset_name": "MS MARCO (TREC)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "NDCG@10 on MS MARCO (TREC)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "MS MARCO (regular) - RR@10 (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "MS MARCO (TREC) - NDCG@10 (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "MS MARCO (regular) - RR@10 (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "MS MARCO (TREC) - NDCG@10 (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "MS MARCO (regular) - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "MS MARCO (TREC) - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "MS MARCO (regular) - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Stereotypes (gender)": { - "description": "No matching runs", - "tab": 
"Bias", - "score": null - }, - "MS MARCO (regular) - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - }, - "MS MARCO (TREC) - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CNN/DailyMail", - "source_data": { - "dataset_name": "CNN/DailyMail", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on CNN/DailyMail", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.15, - "details": { - "description": "min=0.133, mean=0.15, max=0.16, sum=0.899 (6)", - "tab": "Accuracy", - "CNN/DailyMail - Denoised inference time (s)": { - "description": "2 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "CNN/DailyMail - # eval": { - "description": "min=466, mean=466, max=466, sum=2796 (6)", - "tab": "General information", - "score": 466.0 - }, - "CNN/DailyMail - # train": { - "description": "min=5, mean=5, max=5, sum=30 (6)", - "tab": "General information", - "score": 5.0 - }, - "CNN/DailyMail - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (6)", - "tab": "General information", - "score": 0.0 - }, - "CNN/DailyMail - # prompt tokens": { - "description": "min=1564.648, mean=1578.648, max=1593.648, sum=9471.888 (6)", - "tab": "General information", - "score": 1578.648068669528 - }, - "CNN/DailyMail - # output tokens": { - "description": "min=71.758, mean=75.51, max=79.294, sum=453.06 (6)", - "tab": "General information", - "score": 75.51001430615165 - }, - "CNN/DailyMail - # trials": { - "description": "min=3, mean=3, max=3, sum=18 (6)", - "tab": "General information", - "score": 3.0 - }, - "CNN/DailyMail - Stereotypes (race)": { - "description": "min=0.621, mean=0.63, max=0.646, sum=3.782 (6)", - "tab": "Bias", - "score": 0.6303974395279242 - }, - "CNN/DailyMail - Stereotypes (gender)": { - "description": "min=0.39, mean=0.401, max=0.412, sum=2.406 (6)", - "tab": "Bias", - "score": 0.4010246477666291 - }, - "CNN/DailyMail - Representation (race)": { - "description": "min=0.281, mean=0.291, max=0.297, sum=1.746 (6)", - "tab": "Bias", - "score": 0.2910346586068148 - }, - "CNN/DailyMail - Representation (gender)": { - "description": "min=0.114, mean=0.13, max=0.148, sum=0.782 (6)", - "tab": "Bias", - "score": 0.1303630037220396 - }, - "CNN/DailyMail - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (6)", - "tab": "Toxicity", - "score": 0.0 - }, - "CNN/DailyMail - SummaC": { - "description": 
"min=0.423, mean=0.552, max=0.624, sum=1.656 (3)", - "tab": "Summarization metrics", - "score": 0.5518853318256234 - }, - "CNN/DailyMail - QAFactEval": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - BERTScore (F1)": { - "description": "min=0.236, mean=0.28, max=0.304, sum=0.841 (3)", - "tab": "Summarization metrics", - "score": 0.28049037475726807 - }, - "CNN/DailyMail - Coverage": { - "description": "min=0.846, mean=0.939, max=0.988, sum=5.636 (6)", - "tab": "Summarization metrics", - "score": 0.9393220183960566 - }, - "CNN/DailyMail - Density": { - "description": "min=31.874, mean=33.625, max=34.739, sum=201.751 (6)", - "tab": "Summarization metrics", - "score": 33.625141882714196 - }, - "CNN/DailyMail - Compression": { - "description": "min=8.884, mean=9.298, max=9.552, sum=55.787 (6)", - "tab": "Summarization metrics", - "score": 9.29781469578472 - }, - "CNN/DailyMail - HumanEval-faithfulness": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-relevance": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-coherence": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "XSUM", - "source_data": { - "dataset_name": "XSUM", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on XSUM", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.136, - "details": { - "description": "min=0.133, mean=0.136, max=0.14, sum=0.813 (6)", - "tab": "Accuracy", - "XSUM - Denoised inference time (s)": { - "description": "2 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "XSUM - # eval": { - "description": "min=518, mean=518, max=518, sum=3108 (6)", - "tab": "General information", - "score": 518.0 - }, - "XSUM - # train": { - "description": "min=4.998, mean=4.999, max=5, sum=29.992 (6)", - "tab": "General information", - "score": 4.998712998712999 - }, - "XSUM - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (6)", - "tab": "General information", - "score": 0.0 - }, - "XSUM - # prompt tokens": { - "description": "min=1472.903, mean=1532.912, max=1566.407, sum=9197.471 (6)", - "tab": "General information", - "score": 1532.9118404118406 - }, - "XSUM - # output tokens": { - "description": "min=25.844, mean=26.423, max=26.988, sum=158.537 (6)", - "tab": "General information", - "score": 26.422779922779924 - }, - "XSUM - # trials": { - "description": "min=3, mean=3, max=3, sum=18 (6)", - "tab": "General information", - "score": 3.0 - }, - "XSUM - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=4 (6)", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "XSUM - Stereotypes (gender)": { - "description": "min=0.42, mean=0.439, max=0.456, sum=2.635 (6)", - "tab": "Bias", - "score": 0.4390946502057613 - }, - "XSUM - Representation (race)": { - "description": "min=0.532, mean=0.544, max=0.556, sum=3.264 (6)", - "tab": "Bias", - "score": 0.5439341780805197 - }, - 
"XSUM - Representation (gender)": { - "description": "min=0.201, mean=0.206, max=0.21, sum=1.238 (6)", - "tab": "Bias", - "score": 0.2063342186388344 - }, - "XSUM - Toxic fraction": { - "description": "min=0, mean=0.001, max=0.002, sum=0.008 (6)", - "tab": "Toxicity", - "score": 0.001287001287001287 - }, - "XSUM - SummaC": { - "description": "min=-0.251, mean=-0.241, max=-0.231, sum=-0.723 (3)", - "tab": "Summarization metrics", - "score": -0.2409771191414105 - }, - "XSUM - QAFactEval": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - BERTScore (F1)": { - "description": "min=0.442, mean=0.444, max=0.446, sum=1.331 (3)", - "tab": "Summarization metrics", - "score": 0.44350630738930513 - }, - "XSUM - Coverage": { - "description": "min=0.799, mean=0.807, max=0.816, sum=4.841 (6)", - "tab": "Summarization metrics", - "score": 0.8068883614050096 - }, - "XSUM - Density": { - "description": "min=2.852, mean=3.08, max=3.225, sum=18.481 (6)", - "tab": "Summarization metrics", - "score": 3.080091964253596 - }, - "XSUM - Compression": { - "description": "min=16.326, mean=16.97, max=17.573, sum=101.823 (6)", - "tab": "Summarization metrics", - "score": 16.97049624677277 - }, - "XSUM - HumanEval-faithfulness": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-relevance": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-coherence": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "IMDB", - "source_data": { - "dataset_name": "IMDB", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on IMDB", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.959, - "details": { - "description": "min=0.957, mean=0.959, max=0.961, sum=2.878 (3)", - "tab": "Accuracy", - "IMDB - ECE (10-bin)": { - "description": "min=0.137, mean=0.173, max=0.222, sum=0.519 (3)", - "tab": "Calibration", - "score": 0.1730084935772459 - }, - "IMDB - EM (Robustness)": { - "description": "min=0.931, mean=0.932, max=0.934, sum=2.797 (3)", - "tab": "Robustness", - "score": 0.9323333333333333 - }, - "IMDB - EM (Fairness)": { - "description": "min=0.948, mean=0.949, max=0.951, sum=2.848 (3)", - "tab": "Fairness", - "score": 0.9493333333333333 - }, - "IMDB - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "IMDB - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "IMDB - # train": { - "description": "min=2.908, mean=4.236, max=4.985, sum=12.708 (3)", - "tab": "General information", - "score": 4.236000000000001 - }, - "IMDB - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "IMDB - # prompt tokens": { - "description": "min=1283.569, mean=1560.056, max=1777.712, sum=4680.167 (3)", - "tab": "General information", - "score": 1560.0556666666664 - }, - "IMDB - # output 
tokens": { - "description": "min=1, mean=1, max=1, sum=3 (3)", - "tab": "General information", - "score": 1.0 - }, - "IMDB - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "IMDB - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CivilComments", - "source_data": { - "dataset_name": "CivilComments", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on CivilComments", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.562, - "details": { - "description": "min=0.049, mean=0.562, max=0.984, sum=30.331 (54)", - "tab": "Accuracy", - "CivilComments - ECE (10-bin)": { - "description": "min=0.051, mean=0.272, max=0.563, sum=14.71 (54)", - "tab": "Calibration", - "score": 0.27240452987490027 - }, - "CivilComments - EM (Robustness)": { - "description": "min=0.035, mean=0.263, max=0.67, sum=14.178 (54)", - "tab": "Robustness", - "score": 0.26255411827214337 - }, - "CivilComments - EM (Fairness)": { - "description": "min=0.014, mean=0.432, max=0.912, sum=23.313 (54)", - "tab": "Fairness", - "score": 0.4317285215923749 - }, - "CivilComments - Denoised inference time (s)": { - "description": "9 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "CivilComments - # eval": { - "description": "min=74, mean=371.556, max=683, sum=20064 (54)", - "tab": "General information", - "score": 371.55555555555554 - }, - "CivilComments - # train": { - "description": "min=5, mean=5, max=5, sum=270 (54)", - "tab": "General information", - "score": 5.0 - }, - "CivilComments - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (54)", - "tab": "General information", - "score": 0.0 - }, - "CivilComments - # prompt tokens": { - "description": "min=362.037, mean=724.782, max=1272.822, sum=39138.207 (54)", - "tab": "General information", - "score": 724.7816027688522 - }, - "CivilComments - # output tokens": { - "description": "min=1, mean=1, max=1, sum=54 (54)", - "tab": "General information", - "score": 1.0 - }, - "CivilComments - # trials": { - "description": "min=3, mean=3, max=3, sum=162 (54)", - "tab": "General information", - "score": 3.0 - }, - "CivilComments - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments 
- Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (54)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "RAFT", - "source_data": { - "dataset_name": "RAFT", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on RAFT", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.653, - "details": { - "description": "min=0, mean=0.653, max=0.975, sum=21.55 (33)", - "tab": "Accuracy", - "RAFT - ECE (10-bin)": { - "description": "min=0.072, mean=0.238, max=1, sum=7.863 (33)", - "tab": "Calibration", - "score": 0.238277000839632 - }, - "RAFT - EM (Robustness)": { - "description": "min=0, mean=0.564, max=0.975, sum=18.6 (33)", - "tab": "Robustness", - "score": 0.5636363636363637 - }, - "RAFT - EM (Fairness)": { - "description": "min=0, mean=0.601, max=0.975, sum=19.825 (33)", - "tab": "Fairness", - "score": 0.6007575757575758 - }, - "RAFT - Denoised inference time (s)": { - "description": "11 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "RAFT - # eval": { - "description": "min=40, mean=40, max=40, sum=1320 (33)", - "tab": "General information", - "score": 40.0 - }, - "RAFT - # train": { - "description": "min=0, mean=4.56, max=5, sum=150.475 (33)", - "tab": "General information", - "score": 4.5598484848484855 - }, - "RAFT - truncated": { - "description": "min=0, mean=0.002, max=0.025, sum=0.075 (33)", - "tab": "General information", - "score": 0.002272727272727273 - }, - "RAFT - # prompt tokens": { - "description": "min=262.3, mean=810.769, max=1759.65, sum=26755.375 (33)", - "tab": "General information", - "score": 810.7689393939394 - }, - "RAFT - # output tokens": { - "description": "min=0, mean=3.097, max=6.725, sum=102.2 (33)", - "tab": "General information", - "score": 3.0969696969696976 - }, - "RAFT - # trials": { - "description": "min=3, mean=3, max=3, sum=99 (33)", - "tab": "General information", - "score": 3.0 - }, - "RAFT - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (33)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_classic/bigscience/BLOOM-176B/3dc29785-a884-4496-a6f4-a8bf19892e50.json b/data/helm_classic/bigscience/BLOOM-176B/3dc29785-a884-4496-a6f4-a8bf19892e50.json deleted file mode 100644 index caffd542e..000000000 --- a/data/helm_classic/bigscience/BLOOM-176B/3dc29785-a884-4496-a6f4-a8bf19892e50.json +++ /dev/null @@ -1,1613 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_classic/bigscience_BLOOM-176B/1770834891.1472661", - "retrieved_timestamp": "1770834891.1472661", - "source_metadata": { - "source_name": "helm_classic", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": 
"BLOOM 176B", - "id": "bigscience/BLOOM-176B", - "developer": "bigscience", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_classic", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperform on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.446, - "details": { - "tab": "Accuracy", - "Mean win rate - Calibration": { - "description": null, - "tab": "Calibration", - "score": 0.3480016788296159 - }, - "Mean win rate - Robustness": { - "description": null, - "tab": "Robustness", - "score": 0.5409357605686861 - }, - "Mean win rate - Fairness": { - "description": null, - "tab": "Fairness", - "score": 0.5507003378527294 - }, - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.26823464912280703 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - }, - "Mean win rate - Bias": { - "description": null, - "tab": "Bias", - "score": 0.5459762982621468 - }, - "Mean win rate - Toxicity": { - "description": null, - "tab": "Toxicity", - "score": 0.5959534292867626 - }, - "Mean win rate - Summarization metrics": { - "description": null, - "tab": "Summarization metrics", - "score": 0.29074770258980787 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.299, - "details": { - "description": "min=0.19, mean=0.299, max=0.42, sum=4.481 (15)", - "tab": "Accuracy", - "MMLU - ECE (10-bin)": { - "description": "min=0.115, mean=0.137, max=0.173, sum=2.054 (15)", - "tab": "Calibration", - "score": 0.13690038983912287 - }, - "MMLU - EM (Robustness)": { - "description": "min=0.167, mean=0.25, max=0.38, sum=3.754 (15)", - "tab": "Robustness", - "score": 0.25025730994152046 - }, - "MMLU - EM (Fairness)": { - "description": "min=0.175, mean=0.274, max=0.38, sum=4.104 (15)", - "tab": "Fairness", - "score": 0.27360233918128657 - }, - "MMLU - Denoised inference time (s)": { - "description": "min=0.135, mean=0.233, max=0.418, sum=3.493 (15)", - "tab": "Efficiency", - "score": 0.23288457024982262 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=1542 (15)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=75 (15)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (15)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=333.02, mean=436.99, max=574.658, sum=6554.844 (15)", - "tab": "General information", - "score": 436.9895789473684 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=15 (15)", - "tab": "General information", - 
"score": 1.0 - }, - "MMLU - # trials": { - "description": "min=3, mean=3, max=3, sum=45 (15)", - "tab": "General information", - "score": 3.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "BoolQ", - "source_data": { - "dataset_name": "BoolQ", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on BoolQ", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.704, - "details": { - "description": "min=0.659, mean=0.704, max=0.728, sum=2.112 (3)", - "tab": "Accuracy", - "BoolQ - ECE (10-bin)": { - "description": "min=0.153, mean=0.209, max=0.247, sum=0.626 (3)", - "tab": "Calibration", - "score": 0.2086643852555177 - }, - "BoolQ - EM (Robustness)": { - "description": "min=0.595, mean=0.642, max=0.674, sum=1.926 (3)", - "tab": "Robustness", - "score": 0.642 - }, - "BoolQ - EM (Fairness)": { - "description": "min=0.601, mean=0.656, max=0.693, sum=1.968 (3)", - "tab": "Fairness", - "score": 0.656 - }, - "BoolQ - Denoised inference time (s)": { - "description": "min=0.665, mean=0.853, max=1.05, sum=2.558 (3)", - "tab": "Efficiency", - "score": 0.852823399183769 - }, - "BoolQ - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "BoolQ - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "BoolQ - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "BoolQ - # prompt tokens": { - "description": "min=636.774, mean=897.107, max=1242.774, sum=2691.322 (3)", - "tab": "General information", - "score": 897.1073333333333 - }, - "BoolQ - # output tokens": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "BoolQ - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "BoolQ - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.662, - "details": { - "description": "min=0.631, mean=0.662, max=0.695, sum=1.986 (3)", - "tab": "Accuracy", - "NarrativeQA - ECE (10-bin)": { - "description": "min=0.231, mean=0.237, max=0.242, sum=0.712 (3)", - "tab": "Calibration", - "score": 0.2374266630696186 - }, - "NarrativeQA - F1 (Robustness)": { - 
"description": "min=0.468, mean=0.53, max=0.574, sum=1.591 (3)", - "tab": "Robustness", - "score": 0.5303029858435905 - }, - "NarrativeQA - F1 (Fairness)": { - "description": "min=0.535, mean=0.577, max=0.613, sum=1.73 (3)", - "tab": "Fairness", - "score": 0.5767895596204061 - }, - "NarrativeQA - Denoised inference time (s)": { - "description": "min=2.081, mean=2.598, max=3.427, sum=7.794 (3)", - "tab": "Efficiency", - "score": 2.5979962524114084 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=1065 (3)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=1.042, mean=1.621, max=2.048, sum=4.862 (3)", - "tab": "General information", - "score": 1.6206572769953052 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=1604.899, mean=1649.598, max=1699.146, sum=4948.794 (3)", - "tab": "General information", - "score": 1649.5981220657277 - }, - "NarrativeQA - # output tokens": { - "description": "min=18.468, mean=33.276, max=50.499, sum=99.828 (3)", - "tab": "General information", - "score": 33.27605633802816 - }, - "NarrativeQA - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NarrativeQA - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NarrativeQA - Stereotypes (gender)": { - "description": "min=0.333, mean=0.355, max=0.389, sum=1.065 (3)", - "tab": "Bias", - "score": 0.354945620223398 - }, - "NarrativeQA - Representation (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=1.333 (2)", - "tab": "Bias", - "score": 0.6666666666666667 - }, - "NarrativeQA - Representation (gender)": { - "description": "min=0.118, mean=0.165, max=0.241, sum=0.494 (3)", - "tab": "Bias", - "score": 0.16472050143449737 - }, - "NarrativeQA - Toxic fraction": { - "description": "min=0.011, mean=0.012, max=0.014, sum=0.037 (3)", - "tab": "Toxicity", - "score": 0.012206572769953052 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (open-book)", - "source_data": { - "dataset_name": "NaturalQuestions (open-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (open-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.621, - "details": { - "description": "min=0.61, mean=0.621, max=0.628, sum=1.864 (3)", - "tab": "Accuracy", - "NaturalQuestions (closed-book) - ECE (10-bin)": { - "description": "min=0.11, mean=0.116, max=0.118, sum=0.347 (3)", - "tab": "Calibration", - "score": 0.11564225453050514 - }, - "NaturalQuestions (open-book) - ECE (10-bin)": { - "description": "min=0.338, mean=0.347, max=0.36, sum=1.041 (3)", - "tab": "Calibration", - "score": 0.3469801265406112 - }, - "NaturalQuestions (closed-book) - F1 (Robustness)": { - "description": "min=0.18, mean=0.185, max=0.19, sum=0.556 (3)", - "tab": "Robustness", - "score": 0.18537100322417385 - }, - "NaturalQuestions (open-book) - F1 (Robustness)": { - "description": "min=0.547, mean=0.558, max=0.569, sum=1.675 (3)", - "tab": "Robustness", - "score": 0.5582069622847597 - }, - 
"NaturalQuestions (closed-book) - F1 (Fairness)": { - "description": "min=0.183, mean=0.187, max=0.189, sum=0.56 (3)", - "tab": "Fairness", - "score": 0.18669047090402127 - }, - "NaturalQuestions (open-book) - F1 (Fairness)": { - "description": "min=0.56, mean=0.575, max=0.585, sum=1.724 (3)", - "tab": "Fairness", - "score": 0.5745618824682682 - }, - "NaturalQuestions (closed-book) - Denoised inference time (s)": { - "description": "min=0.931, mean=1.115, max=1.261, sum=3.346 (3)", - "tab": "Efficiency", - "score": 1.115412127906084 - }, - "NaturalQuestions (open-book) - Denoised inference time (s)": { - "description": "min=2.213, mean=2.547, max=2.912, sum=7.64 (3)", - "tab": "Efficiency", - "score": 2.546660231937965 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=92.12, mean=96.12, max=102.12, sum=288.36 (3)", - "tab": "General information", - "score": 96.12 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=34.82, mean=48.109, max=57.074, sum=144.327 (3)", - "tab": "General information", - "score": 48.109 - }, - "NaturalQuestions (closed-book) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.73, mean=4.743, max=4.751, sum=14.229 (3)", - "tab": "General information", - "score": 4.743000000000001 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.035, mean=0.035, max=0.035, sum=0.105 (3)", - "tab": "General information", - "score": 0.035 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1193.69, mean=1313.422, max=1423.457, sum=3940.267 (3)", - "tab": "General information", - "score": 1313.4223333333334 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=31.304, mean=38.803, max=46.481, sum=116.409 (3)", - "tab": "General information", - "score": 38.803000000000004 - }, - "NaturalQuestions (open-book) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NaturalQuestions (closed-book) - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NaturalQuestions (closed-book) - Stereotypes (gender)": { - "description": "min=0.5, mean=0.5, max=0.5, sum=1 (2)", - "tab": "Bias", - "score": 0.5 - }, - "NaturalQuestions (closed-book) - Representation (race)": { - "description": "min=0.303, mean=0.418, max=0.519, sum=1.254 (3)", - "tab": "Bias", - "score": 0.4180133480204756 - }, - "NaturalQuestions (closed-book) - Representation (gender)": { - "description": "min=0.056, mean=0.09, max=0.143, sum=0.27 (3)", - "tab": "Bias", - "score": 0.08994708994708996 - }, - "NaturalQuestions (open-book) - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=0.667 (1)", - 
"tab": "Bias", - "score": 0.6666666666666667 - }, - "NaturalQuestions (open-book) - Stereotypes (gender)": { - "description": "min=0.352, mean=0.426, max=0.5, sum=0.852 (2)", - "tab": "Bias", - "score": 0.42619047619047623 - }, - "NaturalQuestions (open-book) - Representation (race)": { - "description": "min=0.454, mean=0.499, max=0.546, sum=1.498 (3)", - "tab": "Bias", - "score": 0.499333679443982 - }, - "NaturalQuestions (open-book) - Representation (gender)": { - "description": "min=0.11, mean=0.135, max=0.177, sum=0.404 (3)", - "tab": "Bias", - "score": 0.13470779383719764 - }, - "NaturalQuestions (closed-book) - Toxic fraction": { - "description": "min=0.001, mean=0.001, max=0.002, sum=0.004 (3)", - "tab": "Toxicity", - "score": 0.0013333333333333333 - }, - "NaturalQuestions (open-book) - Toxic fraction": { - "description": "min=0.001, mean=0.002, max=0.003, sum=0.006 (3)", - "tab": "Toxicity", - "score": 0.002 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "QuAC", - "source_data": { - "dataset_name": "QuAC", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on QuAC", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.361, - "details": { - "description": "min=0.342, mean=0.361, max=0.375, sum=1.082 (3)", - "tab": "Accuracy", - "QuAC - ECE (10-bin)": { - "description": "min=0.103, mean=0.122, max=0.142, sum=0.367 (3)", - "tab": "Calibration", - "score": 0.1222163558834574 - }, - "QuAC - F1 (Robustness)": { - "description": "min=0.229, mean=0.234, max=0.24, sum=0.701 (3)", - "tab": "Robustness", - "score": 0.23376457225319638 - }, - "QuAC - F1 (Fairness)": { - "description": "min=0.265, mean=0.273, max=0.289, sum=0.82 (3)", - "tab": "Fairness", - "score": 0.27335853114408787 - }, - "QuAC - Denoised inference time (s)": { - "description": "min=5.124, mean=5.306, max=5.436, sum=15.919 (3)", - "tab": "Efficiency", - "score": 5.3062709801205585 - }, - "QuAC - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "QuAC - # train": { - "description": "min=0.855, mean=0.944, max=1.07, sum=2.832 (3)", - "tab": "General information", - "score": 0.944 - }, - "QuAC - truncated": { - "description": "min=0.017, mean=0.017, max=0.017, sum=0.051 (3)", - "tab": "General information", - "score": 0.017 - }, - "QuAC - # prompt tokens": { - "description": "min=1614.308, mean=1639.494, max=1673.303, sum=4918.482 (3)", - "tab": "General information", - "score": 1639.494 - }, - "QuAC - # output tokens": { - "description": "min=86.351, mean=90.164, max=93.357, sum=270.491 (3)", - "tab": "General information", - "score": 90.16366666666666 - }, - "QuAC - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "QuAC - Stereotypes (race)": { - "description": "min=0.604, mean=0.631, max=0.647, sum=1.894 (3)", - "tab": "Bias", - "score": 0.6313294548588666 - }, - "QuAC - Stereotypes (gender)": { - "description": "min=0.388, mean=0.396, max=0.408, sum=1.189 (3)", - "tab": "Bias", - "score": 0.3963840842187811 - }, - "QuAC - Representation (race)": { - "description": "min=0.35, mean=0.365, max=0.381, sum=1.094 (3)", - "tab": "Bias", - "score": 0.3645250034421991 - }, - "QuAC - 
Representation (gender)": { - "description": "min=0.235, mean=0.244, max=0.26, sum=0.732 (3)", - "tab": "Bias", - "score": 0.2440549375970967 - }, - "QuAC - Toxic fraction": { - "description": "min=0, mean=0.001, max=0.002, sum=0.003 (3)", - "tab": "Toxicity", - "score": 0.001 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "HellaSwag", - "source_data": { - "dataset_name": "HellaSwag", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on HellaSwag", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.744, - "details": { - "description": "min=0.744, mean=0.744, max=0.744, sum=0.744 (1)", - "tab": "Accuracy", - "HellaSwag - ECE (10-bin)": { - "description": "min=0.293, mean=0.293, max=0.293, sum=0.293 (1)", - "tab": "Calibration", - "score": 0.2926428762465171 - }, - "HellaSwag - EM (Robustness)": { - "description": "min=0.699, mean=0.699, max=0.699, sum=0.699 (1)", - "tab": "Robustness", - "score": 0.699 - }, - "HellaSwag - EM (Fairness)": { - "description": "min=0.585, mean=0.585, max=0.585, sum=0.585 (1)", - "tab": "Fairness", - "score": 0.585 - }, - "HellaSwag - Denoised inference time (s)": { - "description": "min=0.075, mean=0.075, max=0.075, sum=0.075 (1)", - "tab": "Efficiency", - "score": 0.07493321968615055 - }, - "HellaSwag - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "HellaSwag - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag - # prompt tokens": { - "description": "min=88.875, mean=88.875, max=88.875, sum=88.875 (1)", - "tab": "General information", - "score": 88.875 - }, - "HellaSwag - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.534, - "details": { - "description": "min=0.534, mean=0.534, max=0.534, sum=0.534 (1)", - "tab": "Accuracy", - "OpenbookQA - ECE (10-bin)": { - "description": "min=0.248, mean=0.248, max=0.248, sum=0.248 (1)", - "tab": "Calibration", - "score": 0.24842661648577113 - }, - "OpenbookQA - EM (Robustness)": { - "description": "min=0.438, mean=0.438, max=0.438, sum=0.438 (1)", - "tab": "Robustness", - "score": 0.438 - }, - "OpenbookQA - EM (Fairness)": { - "description": "min=0.482, mean=0.482, max=0.482, sum=0.482 (1)", - "tab": "Fairness", - "score": 0.482 - }, - "OpenbookQA - Denoised inference time (s)": { - "description": "min=0.032, mean=0.032, 
max=0.032, sum=0.032 (1)", - "tab": "Efficiency", - "score": 0.03224579076468945 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=5.444, mean=5.444, max=5.444, sum=5.444 (1)", - "tab": "General information", - "score": 5.444 - }, - "OpenbookQA - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "TruthfulQA", - "source_data": { - "dataset_name": "TruthfulQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on TruthfulQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.205, - "details": { - "description": "min=0.197, mean=0.205, max=0.211, sum=0.82 (4)", - "tab": "Accuracy", - "TruthfulQA - ECE (10-bin)": { - "description": "min=0.053, mean=0.096, max=0.128, sum=0.385 (4)", - "tab": "Calibration", - "score": 0.09624512475777981 - }, - "TruthfulQA - EM (Robustness)": { - "description": "min=0.168, mean=0.183, max=0.206, sum=0.734 (4)", - "tab": "Robustness", - "score": 0.1834862385321101 - }, - "TruthfulQA - EM (Fairness)": { - "description": "min=0.164, mean=0.186, max=0.206, sum=0.745 (4)", - "tab": "Fairness", - "score": 0.18616207951070335 - }, - "TruthfulQA - Denoised inference time (s)": { - "description": "min=0.084, mean=0.143, max=0.226, sum=0.573 (4)", - "tab": "Efficiency", - "score": 0.14325443854568073 - }, - "TruthfulQA - # eval": { - "description": "min=654, mean=654, max=654, sum=2616 (4)", - "tab": "General information", - "score": 654.0 - }, - "TruthfulQA - # train": { - "description": "min=0, mean=3.75, max=5, sum=15 (4)", - "tab": "General information", - "score": 3.75 - }, - "TruthfulQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (4)", - "tab": "General information", - "score": 0.0 - }, - "TruthfulQA - # prompt tokens": { - "description": "min=79.361, mean=370.611, max=481.361, sum=1482.443 (4)", - "tab": "General information", - "score": 370.6108562691131 - }, - "TruthfulQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=4 (4)", - "tab": "General information", - "score": 1.0 - }, - "TruthfulQA - # trials": { - "description": "min=1, mean=2.5, max=3, sum=10 (4)", - "tab": "General information", - "score": 2.5 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MS MARCO (TREC)", - "source_data": { - "dataset_name": "MS MARCO (TREC)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "NDCG@10 on MS MARCO (TREC)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 
0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.386, - "details": { - "description": "min=0.364, mean=0.386, max=0.429, sum=1.158 (3)", - "tab": "Accuracy", - "MS MARCO (regular) - RR@10 (Robustness)": { - "description": "min=0.158, mean=0.19, max=0.218, sum=0.57 (3)", - "tab": "Robustness", - "score": 0.18996269841269822 - }, - "MS MARCO (TREC) - NDCG@10 (Robustness)": { - "description": "min=0.304, mean=0.333, max=0.385, sum=0.998 (3)", - "tab": "Robustness", - "score": 0.33254039819149694 - }, - "MS MARCO (regular) - RR@10 (Fairness)": { - "description": "min=0.189, mean=0.211, max=0.231, sum=0.633 (3)", - "tab": "Fairness", - "score": 0.2110978835978834 - }, - "MS MARCO (TREC) - NDCG@10 (Fairness)": { - "description": "min=0.345, mean=0.371, max=0.418, sum=1.114 (3)", - "tab": "Fairness", - "score": 0.37148573288404924 - }, - "MS MARCO (regular) - Denoised inference time (s)": { - "description": "min=0.246, mean=0.257, max=0.27, sum=0.77 (3)", - "tab": "Efficiency", - "score": 0.25680491607178446 - }, - "MS MARCO (TREC) - Denoised inference time (s)": { - "description": "min=0.227, mean=0.246, max=0.271, sum=0.739 (3)", - "tab": "Efficiency", - "score": 0.24635170979166832 - }, - "MS MARCO (regular) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "MS MARCO (regular) - # train": { - "description": "min=2, mean=2, max=2, sum=6 (3)", - "tab": "General information", - "score": 2.0 - }, - "MS MARCO (regular) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "MS MARCO (regular) - # prompt tokens": { - "description": "min=484.472, mean=524.472, max=570.472, sum=1573.416 (3)", - "tab": "General information", - "score": 524.472 - }, - "MS MARCO (regular) - # output tokens": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "MS MARCO (regular) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "MS MARCO (TREC) - # eval": { - "description": "min=43, mean=43, max=43, sum=129 (3)", - "tab": "General information", - "score": 43.0 - }, - "MS MARCO (TREC) - # train": { - "description": "min=2, mean=2, max=2, sum=6 (3)", - "tab": "General information", - "score": 2.0 - }, - "MS MARCO (TREC) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "MS MARCO (TREC) - # prompt tokens": { - "description": "min=466.814, mean=506.814, max=552.814, sum=1520.442 (3)", - "tab": "General information", - "score": 506.81395348837214 - }, - "MS MARCO (TREC) - # output tokens": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "MS MARCO (TREC) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "MS MARCO (regular) - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (gender)": { - "description": "1 matching runs, but no matching 
metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - }, - "MS MARCO (TREC) - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CNN/DailyMail", - "source_data": { - "dataset_name": "CNN/DailyMail", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on CNN/DailyMail", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.08, - "details": { - "description": "min=0.052, mean=0.08, max=0.118, sum=0.478 (6)", - "tab": "Accuracy", - "CNN/DailyMail - Denoised inference time (s)": { - "description": "min=5.515, mean=5.584, max=5.648, sum=33.506 (6)", - "tab": "Efficiency", - "score": 5.5842744588340345 - }, - "CNN/DailyMail - # eval": { - "description": "min=466, mean=466, max=466, sum=2796 (6)", - "tab": "General information", - "score": 466.0 - }, - "CNN/DailyMail - # train": { - "description": "min=5, mean=5, max=5, sum=30 (6)", - "tab": "General information", - "score": 5.0 - }, - "CNN/DailyMail - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (6)", - "tab": "General information", - "score": 0.0 - }, - "CNN/DailyMail - # prompt tokens": { - "description": "min=1520.33, mean=1541.33, max=1578.33, sum=9247.983 (6)", - "tab": "General information", - "score": 1541.3304721030042 - }, - "CNN/DailyMail - # output tokens": { - "description": "min=104.867, mean=117.435, max=124.011, sum=704.609 (6)", - "tab": "General information", - "score": 117.4349070100143 - }, - "CNN/DailyMail - # trials": { - "description": "min=3, mean=3, max=3, sum=18 (6)", - "tab": "General information", - "score": 3.0 - }, - "CNN/DailyMail - Stereotypes (race)": { - "description": "min=0.641, mean=0.658, max=0.667, sum=3.949 (6)", - "tab": "Bias", - "score": 0.6581699346405229 - }, - "CNN/DailyMail - Stereotypes (gender)": { - "description": "min=0.372, mean=0.385, max=0.405, sum=2.311 (6)", - "tab": "Bias", - "score": 0.3851952735514946 - }, - "CNN/DailyMail - Representation (race)": { - "description": "min=0.291, mean=0.314, max=0.352, sum=1.882 (6)", - "tab": "Bias", - "score": 0.31373280163525924 - }, - "CNN/DailyMail - Representation (gender)": { - "description": "min=0.119, mean=0.145, max=0.16, sum=0.872 (6)", - "tab": "Bias", - "score": 0.14536660393941517 - }, - "CNN/DailyMail - Toxic fraction": { - "description": "min=0, mean=0.001, max=0.002, sum=0.009 (6)", - "tab": "Toxicity", - "score": 0.001430615164520744 - }, - "CNN/DailyMail - SummaC": { - "description": "min=-0.129, mean=-0.02, 
max=0.115, sum=-0.059 (3)", - "tab": "Summarization metrics", - "score": -0.01977462275373982 - }, - "CNN/DailyMail - QAFactEval": { - "description": "min=4.63, mean=4.665, max=4.719, sum=27.988 (6)", - "tab": "Summarization metrics", - "score": 4.66471171081461 - }, - "CNN/DailyMail - BERTScore (F1)": { - "description": "min=0.005, mean=0.08, max=0.184, sum=0.24 (3)", - "tab": "Summarization metrics", - "score": 0.08008308750782954 - }, - "CNN/DailyMail - Coverage": { - "description": "min=0.618, mean=0.71, max=0.826, sum=4.26 (6)", - "tab": "Summarization metrics", - "score": 0.7099913231813372 - }, - "CNN/DailyMail - Density": { - "description": "min=20.964, mean=32.013, max=45.756, sum=192.081 (6)", - "tab": "Summarization metrics", - "score": 32.0134921906249 - }, - "CNN/DailyMail - Compression": { - "description": "min=4.623, mean=5.252, max=6.434, sum=31.514 (6)", - "tab": "Summarization metrics", - "score": 5.2523388558949184 - }, - "CNN/DailyMail - HumanEval-faithfulness": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-relevance": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-coherence": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "XSUM", - "source_data": { - "dataset_name": "XSUM", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on XSUM", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.03, - "details": { - "description": "min=0.022, mean=0.03, max=0.038, sum=0.179 (6)", - "tab": "Accuracy", - "XSUM - Denoised inference time (s)": { - "description": "min=3.874, mean=3.9, max=3.923, sum=23.4 (6)", - "tab": "Efficiency", - "score": 3.899962288877679 - }, - "XSUM - # eval": { - "description": "min=518, mean=518, max=518, sum=3108 (6)", - "tab": "General information", - "score": 518.0 - }, - "XSUM - # train": { - "description": "min=5, mean=5, max=5, sum=30 (6)", - "tab": "General information", - "score": 5.0 - }, - "XSUM - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (6)", - "tab": "General information", - "score": 0.0 - }, - "XSUM - # prompt tokens": { - "description": "min=1456.338, mean=1501.338, max=1528.338, sum=9008.027 (6)", - "tab": "General information", - "score": 1501.3378378378377 - }, - "XSUM - # output tokens": { - "description": "min=50.606, mean=54.066, max=57.05, sum=324.394 (6)", - "tab": "General information", - "score": 54.06563706563707 - }, - "XSUM - # trials": { - "description": "min=3, mean=3, max=3, sum=18 (6)", - "tab": "General information", - "score": 3.0 - }, - "XSUM - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "XSUM - Stereotypes (gender)": { - "description": "min=0.45, mean=0.467, max=0.5, sum=2.802 (6)", - "tab": "Bias", - "score": 0.46699346405228753 - }, - "XSUM - Representation (race)": { - "description": "min=0.238, mean=0.309, max=0.356, sum=1.856 (6)", - "tab": "Bias", - "score": 0.3092501368363437 - }, - "XSUM - Representation (gender)": { - "description": "min=0.109, 
mean=0.172, max=0.212, sum=1.032 (6)", - "tab": "Bias", - "score": 0.17201180425265794 - }, - "XSUM - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (6)", - "tab": "Toxicity", - "score": 0.0 - }, - "XSUM - SummaC": { - "description": "min=-0.365, mean=-0.35, max=-0.335, sum=-1.049 (3)", - "tab": "Summarization metrics", - "score": -0.3496571157539257 - }, - "XSUM - QAFactEval": { - "description": "min=4.196, mean=4.778, max=5.107, sum=28.667 (6)", - "tab": "Summarization metrics", - "score": 4.77785601273731 - }, - "XSUM - BERTScore (F1)": { - "description": "min=0.025, mean=0.059, max=0.095, sum=0.177 (3)", - "tab": "Summarization metrics", - "score": 0.05904374779925766 - }, - "XSUM - Coverage": { - "description": "min=0.48, mean=0.515, max=0.553, sum=3.091 (6)", - "tab": "Summarization metrics", - "score": 0.5151319646119767 - }, - "XSUM - Density": { - "description": "min=1.41, mean=1.764, max=2.014, sum=10.585 (6)", - "tab": "Summarization metrics", - "score": 1.764128575895107 - }, - "XSUM - Compression": { - "description": "min=7.741, mean=8.934, max=10.222, sum=53.603 (6)", - "tab": "Summarization metrics", - "score": 8.933804533381347 - }, - "XSUM - HumanEval-faithfulness": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-relevance": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-coherence": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "IMDB", - "source_data": { - "dataset_name": "IMDB", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on IMDB", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.945, - "details": { - "description": "min=0.936, mean=0.945, max=0.95, sum=2.836 (3)", - "tab": "Accuracy", - "IMDB - ECE (10-bin)": { - "description": "min=0.305, mean=0.343, max=0.41, sum=1.029 (3)", - "tab": "Calibration", - "score": 0.3430318396761201 - }, - "IMDB - EM (Robustness)": { - "description": "min=0.907, mean=0.92, max=0.927, sum=2.761 (3)", - "tab": "Robustness", - "score": 0.9203333333333333 - }, - "IMDB - EM (Fairness)": { - "description": "min=0.927, mean=0.938, max=0.946, sum=2.814 (3)", - "tab": "Fairness", - "score": 0.9380000000000001 - }, - "IMDB - Denoised inference time (s)": { - "description": "min=3.425, mean=3.536, max=3.659, sum=10.608 (3)", - "tab": "Efficiency", - "score": 3.5360445948161456 - }, - "IMDB - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "IMDB - # train": { - "description": "min=4.876, mean=4.943, max=4.987, sum=14.83 (3)", - "tab": "General information", - "score": 4.943333333333333 - }, - "IMDB - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "IMDB - # prompt tokens": { - "description": "min=1129.265, mean=1375.21, max=1727.698, sum=4125.631 (3)", - "tab": "General information", - "score": 1375.2103333333334 - }, - "IMDB - # output tokens": { - "description": "min=5, mean=5, max=5, sum=15 
(3)", - "tab": "General information", - "score": 5.0 - }, - "IMDB - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "IMDB - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CivilComments", - "source_data": { - "dataset_name": "CivilComments", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on CivilComments", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.62, - "details": { - "description": "min=0.293, mean=0.62, max=0.92, sum=33.467 (54)", - "tab": "Accuracy", - "CivilComments - ECE (10-bin)": { - "description": "min=0.069, mean=0.262, max=0.456, sum=14.142 (54)", - "tab": "Calibration", - "score": 0.26189371110201226 - }, - "CivilComments - EM (Robustness)": { - "description": "min=0.088, mean=0.467, max=0.827, sum=25.192 (54)", - "tab": "Robustness", - "score": 0.46652660062188434 - }, - "CivilComments - EM (Fairness)": { - "description": "min=0.252, mean=0.546, max=0.91, sum=29.488 (54)", - "tab": "Fairness", - "score": 0.5460670492526992 - }, - "CivilComments - Denoised inference time (s)": { - "description": "min=0.316, mean=0.533, max=1.372, sum=28.76 (54)", - "tab": "Efficiency", - "score": 0.5325854907984409 - }, - "CivilComments - # eval": { - "description": "min=74, mean=371.556, max=683, sum=20064 (54)", - "tab": "General information", - "score": 371.55555555555554 - }, - "CivilComments - # train": { - "description": "min=5, mean=5, max=5, sum=270 (54)", - "tab": "General information", - "score": 5.0 - }, - "CivilComments - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (54)", - "tab": "General information", - "score": 0.0 - }, - "CivilComments - # prompt tokens": { - "description": "min=327.671, mean=683.498, max=1208.636, sum=36908.883 (54)", - "tab": "General information", - "score": 683.497824649871 - }, - "CivilComments - # output tokens": { - "description": "min=5, mean=5, max=5, sum=270 (54)", - "tab": "General information", - "score": 5.0 - }, - "CivilComments - # trials": { - "description": "min=3, mean=3, max=3, sum=162 (54)", - "tab": "General information", - "score": 3.0 - }, - "CivilComments - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Toxic fraction": { - "description": 
"min=0, mean=0, max=0, sum=0 (54)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "RAFT", - "source_data": { - "dataset_name": "RAFT", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on RAFT", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.592, - "details": { - "description": "min=0.25, mean=0.592, max=0.975, sum=19.525 (33)", - "tab": "Accuracy", - "RAFT - ECE (10-bin)": { - "description": "min=0.246, mean=0.44, max=0.775, sum=14.508 (33)", - "tab": "Calibration", - "score": 0.4396262000869267 - }, - "RAFT - EM (Robustness)": { - "description": "min=0.175, mean=0.527, max=0.95, sum=17.375 (33)", - "tab": "Robustness", - "score": 0.5265151515151515 - }, - "RAFT - EM (Fairness)": { - "description": "min=0.2, mean=0.563, max=0.975, sum=18.575 (33)", - "tab": "Fairness", - "score": 0.5628787878787879 - }, - "RAFT - Denoised inference time (s)": { - "description": "min=0.258, mean=1.866, max=3.777, sum=61.574 (33)", - "tab": "Efficiency", - "score": 1.86588385979184 - }, - "RAFT - # eval": { - "description": "min=40, mean=40, max=40, sum=1320 (33)", - "tab": "General information", - "score": 40.0 - }, - "RAFT - # train": { - "description": "min=0.05, mean=4.567, max=5, sum=150.725 (33)", - "tab": "General information", - "score": 4.567424242424242 - }, - "RAFT - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (33)", - "tab": "General information", - "score": 0.0 - }, - "RAFT - # prompt tokens": { - "description": "min=234.025, mean=779.203, max=1729.4, sum=25713.7 (33)", - "tab": "General information", - "score": 779.2030303030305 - }, - "RAFT - # output tokens": { - "description": "min=5, mean=7.127, max=13.7, sum=235.2 (33)", - "tab": "General information", - "score": 7.127272727272727 - }, - "RAFT - # trials": { - "description": "min=3, mean=3, max=3, sum=99 (33)", - "tab": "General information", - "score": 3.0 - }, - "RAFT - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (33)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_classic/bigscience/T0pp-11B/ff8dc291-bbaf-4149-854e-e1780b0c86d5.json b/data/helm_classic/bigscience/T0pp-11B/ff8dc291-bbaf-4149-854e-e1780b0c86d5.json deleted file mode 100644 index 400f064d5..000000000 --- a/data/helm_classic/bigscience/T0pp-11B/ff8dc291-bbaf-4149-854e-e1780b0c86d5.json +++ /dev/null @@ -1,1613 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_classic/bigscience_T0pp-11B/1770834891.1472661", - "retrieved_timestamp": "1770834891.1472661", - "source_metadata": { - "source_name": "helm_classic", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "T0pp 11B", - "id": 
"bigscience/T0pp-11B", - "developer": "bigscience", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_classic", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperform on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.197, - "details": { - "tab": "Accuracy", - "Mean win rate - Calibration": { - "description": null, - "tab": "Calibration", - "score": 0.7577474560592045 - }, - "Mean win rate - Robustness": { - "description": null, - "tab": "Robustness", - "score": 0.2275932400932401 - }, - "Mean win rate - Fairness": { - "description": null, - "tab": "Fairness", - "score": 0.20273892773892774 - }, - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.42000000000000004 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - }, - "Mean win rate - Bias": { - "description": null, - "tab": "Bias", - "score": 0.6045183982683983 - }, - "Mean win rate - Toxicity": { - "description": null, - "tab": "Toxicity", - "score": 0.3965229215229215 - }, - "Mean win rate - Summarization metrics": { - "description": null, - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.407, - "details": { - "description": "min=0.25, mean=0.407, max=0.67, sum=6.098 (15)", - "tab": "Accuracy", - "MMLU - ECE (10-bin)": { - "description": "min=0.074, mean=0.168, max=0.3, sum=2.515 (15)", - "tab": "Calibration", - "score": 0.16765379656947835 - }, - "MMLU - EM (Robustness)": { - "description": "min=0.25, mean=0.378, max=0.62, sum=5.675 (15)", - "tab": "Robustness", - "score": 0.37832748538011696 - }, - "MMLU - EM (Fairness)": { - "description": "min=0.25, mean=0.382, max=0.63, sum=5.731 (15)", - "tab": "Fairness", - "score": 0.3820701754385965 - }, - "MMLU - Denoised inference time (s)": { - "description": "min=0.141, mean=0.145, max=0.149, sum=2.18 (15)", - "tab": "Efficiency", - "score": 0.1453571324242486 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=1542 (15)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=75 (15)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (15)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=386.05, mean=492.01, max=639.561, sum=7380.154 (15)", - "tab": "General information", - "score": 492.0102807017544 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=15 (15)", - "tab": "General information", - "score": 1.0 - }, - "MMLU - # trials": { - 
"description": "min=3, mean=3, max=3, sum=45 (15)", - "tab": "General information", - "score": 3.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "BoolQ", - "source_data": { - "dataset_name": "BoolQ", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on BoolQ", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0, - "details": { - "description": "min=0, mean=0, max=0, sum=0 (3)\n☠ T0++ is explicitly trained on these datasets, i.e. data from the same distribution as the test set. See Table 5 on page 24 of https://arxiv.org/pdf/2110.08207.pdf.", - "tab": "Accuracy", - "BoolQ - ECE (10-bin)": { - "description": "min=0.208, mean=0.322, max=0.435, sum=0.967 (3)\n☠ T0++ is explicitly trained on these datasets, i.e. data from the same distribution as the test set. See Table 5 on page 24 of https://arxiv.org/pdf/2110.08207.pdf.", - "tab": "Calibration", - "score": 0.32218942300251074 - }, - "BoolQ - EM (Robustness)": { - "description": "min=0, mean=0, max=0, sum=0 (3)\n☠ T0++ is explicitly trained on these datasets, i.e. data from the same distribution as the test set. See Table 5 on page 24 of https://arxiv.org/pdf/2110.08207.pdf.", - "tab": "Robustness", - "score": 0.0 - }, - "BoolQ - EM (Fairness)": { - "description": "min=0, mean=0, max=0, sum=0 (3)\n☠ T0++ is explicitly trained on these datasets, i.e. data from the same distribution as the test set. See Table 5 on page 24 of https://arxiv.org/pdf/2110.08207.pdf.", - "tab": "Fairness", - "score": 0.0 - }, - "BoolQ - Denoised inference time (s)": { - "description": "min=0.366, mean=0.374, max=0.385, sum=1.121 (3)\n☠ T0++ is explicitly trained on these datasets, i.e. data from the same distribution as the test set. See Table 5 on page 24 of https://arxiv.org/pdf/2110.08207.pdf.", - "tab": "Efficiency", - "score": 0.3736038734018803 - }, - "BoolQ - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)\n☠ T0++ is explicitly trained on these datasets, i.e. data from the same distribution as the test set. See Table 5 on page 24 of https://arxiv.org/pdf/2110.08207.pdf.", - "tab": "General information", - "score": 1000.0 - }, - "BoolQ - # train": { - "description": "min=2.027, mean=3.972, max=4.988, sum=11.915 (3)\n☠ T0++ is explicitly trained on these datasets, i.e. data from the same distribution as the test set. See Table 5 on page 24 of https://arxiv.org/pdf/2110.08207.pdf.", - "tab": "General information", - "score": 3.971666666666667 - }, - "BoolQ - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)\n☠ T0++ is explicitly trained on these datasets, i.e. data from the same distribution as the test set. See Table 5 on page 24 of https://arxiv.org/pdf/2110.08207.pdf.", - "tab": "General information", - "score": 0.0 - }, - "BoolQ - # prompt tokens": { - "description": "min=479.758, mean=702.438, max=905.932, sum=2107.314 (3)\n☠ T0++ is explicitly trained on these datasets, i.e. data from the same distribution as the test set. See Table 5 on page 24 of https://arxiv.org/pdf/2110.08207.pdf.", - "tab": "General information", - "score": 702.4380000000001 - }, - "BoolQ - # output tokens": { - "description": "min=5, mean=5, max=5, sum=15 (3)\n☠ T0++ is explicitly trained on these datasets, i.e. 
data from the same distribution as the test set. See Table 5 on page 24 of https://arxiv.org/pdf/2110.08207.pdf.", - "tab": "General information", - "score": 5.0 - }, - "BoolQ - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)\n☠ T0++ is explicitly trained on these datasets, i.e. data from the same distribution as the test set. See Table 5 on page 24 of https://arxiv.org/pdf/2110.08207.pdf.", - "tab": "General information", - "score": 3.0 - }, - "BoolQ - Stereotypes (race)": { - "description": "(0)\n☠ T0++ is explicitly trained on these datasets, i.e. data from the same distribution as the test set. See Table 5 on page 24 of https://arxiv.org/pdf/2110.08207.pdf.", - "tab": "Bias", - "score": null - }, - "BoolQ - Stereotypes (gender)": { - "description": "(0)\n☠ T0++ is explicitly trained on these datasets, i.e. data from the same distribution as the test set. See Table 5 on page 24 of https://arxiv.org/pdf/2110.08207.pdf.", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (race)": { - "description": "(0)\n☠ T0++ is explicitly trained on these datasets, i.e. data from the same distribution as the test set. See Table 5 on page 24 of https://arxiv.org/pdf/2110.08207.pdf.", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (gender)": { - "description": "min=0, mean=0.25, max=0.5, sum=0.5 (2)\n☠ T0++ is explicitly trained on these datasets, i.e. data from the same distribution as the test set. See Table 5 on page 24 of https://arxiv.org/pdf/2110.08207.pdf.", - "tab": "Bias", - "score": 0.25 - }, - "BoolQ - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (3)\n☠ T0++ is explicitly trained on these datasets, i.e. data from the same distribution as the test set. See Table 5 on page 24 of https://arxiv.org/pdf/2110.08207.pdf.", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.151, - "details": { - "description": "min=0.139, mean=0.151, max=0.158, sum=0.454 (3)", - "tab": "Accuracy", - "NarrativeQA - ECE (10-bin)": { - "description": "min=0.0, mean=0.0, max=0.0, sum=0.0 (3)", - "tab": "Calibration", - "score": 0.000042543589701120735 - }, - "NarrativeQA - F1 (Robustness)": { - "description": "min=0.087, mean=0.099, max=0.105, sum=0.296 (3)", - "tab": "Robustness", - "score": 0.09874765137769782 - }, - "NarrativeQA - F1 (Fairness)": { - "description": "min=0.074, mean=0.086, max=0.093, sum=0.258 (3)", - "tab": "Fairness", - "score": 0.0858526263629113 - }, - "NarrativeQA - Denoised inference time (s)": { - "description": "min=0.848, mean=0.945, max=1.053, sum=2.834 (3)", - "tab": "Efficiency", - "score": 0.9445703822729286 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=1065 (3)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=0, mean=0.187, max=0.33, sum=0.561 (3)", - "tab": "General information", - "score": 0.18685446009389672 - }, - "NarrativeQA - truncated": { - "description": "min=0.369, mean=0.372, max=0.377, sum=1.115 (3)", - "tab": 
"General information", - "score": 0.37183098591549296 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=807.577, mean=877.742, max=916.668, sum=2633.225 (3)", - "tab": "General information", - "score": 877.7417840375587 - }, - "NarrativeQA - # output tokens": { - "description": "min=100, mean=100, max=100, sum=300 (3)", - "tab": "General information", - "score": 100.0 - }, - "NarrativeQA - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NarrativeQA - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=2 (3)", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "NarrativeQA - Stereotypes (gender)": { - "description": "min=0.332, mean=0.339, max=0.343, sum=1.017 (3)", - "tab": "Bias", - "score": 0.3389834657156105 - }, - "NarrativeQA - Representation (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=2 (3)", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "NarrativeQA - Representation (gender)": { - "description": "min=0.093, mean=0.105, max=0.113, sum=0.314 (3)", - "tab": "Bias", - "score": 0.1046501526237907 - }, - "NarrativeQA - Toxic fraction": { - "description": "min=0.023, mean=0.023, max=0.025, sum=0.07 (3)", - "tab": "Toxicity", - "score": 0.02347417840375587 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (open-book)", - "source_data": { - "dataset_name": "NaturalQuestions (open-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (open-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.19, - "details": { - "description": "min=0.171, mean=0.19, max=0.203, sum=0.569 (3)", - "tab": "Accuracy", - "NaturalQuestions (closed-book) - ECE (10-bin)": { - "description": "min=0.0, mean=0.0, max=0.0, sum=0.0 (3)", - "tab": "Calibration", - "score": 3.521055021161368e-9 - }, - "NaturalQuestions (open-book) - ECE (10-bin)": { - "description": "min=0.0, mean=0.0, max=0.0, sum=0.0 (3)", - "tab": "Calibration", - "score": 0.00009644610962286308 - }, - "NaturalQuestions (closed-book) - F1 (Robustness)": { - "description": "min=0.03, mean=0.031, max=0.032, sum=0.092 (3)", - "tab": "Robustness", - "score": 0.030683511825215847 - }, - "NaturalQuestions (open-book) - F1 (Robustness)": { - "description": "min=0.101, mean=0.122, max=0.135, sum=0.367 (3)", - "tab": "Robustness", - "score": 0.12220564653363493 - }, - "NaturalQuestions (closed-book) - F1 (Fairness)": { - "description": "min=0.027, mean=0.028, max=0.03, sum=0.084 (3)", - "tab": "Fairness", - "score": 0.028132918197666456 - }, - "NaturalQuestions (open-book) - F1 (Fairness)": { - "description": "min=0.119, mean=0.136, max=0.151, sum=0.407 (3)", - "tab": "Fairness", - "score": 0.13562055302845238 - }, - "NaturalQuestions (closed-book) - Denoised inference time (s)": { - "description": "min=1.309, mean=1.457, max=1.621, sum=4.371 (3)", - "tab": "Efficiency", - "score": 1.4571279249547553 - }, - "NaturalQuestions (open-book) - Denoised inference time (s)": { - "description": "min=2.864, mean=2.895, max=2.953, sum=8.685 (3)", - "tab": "Efficiency", - "score": 2.8950855693236632 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, 
max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=109.556, mean=113.556, max=118.556, sum=340.668 (3)", - "tab": "General information", - "score": 113.556 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=300, mean=300, max=300, sum=900 (3)", - "tab": "General information", - "score": 300.0 - }, - "NaturalQuestions (closed-book) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=3.164, mean=3.396, max=3.709, sum=10.189 (3)", - "tab": "General information", - "score": 3.396333333333333 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.052, mean=0.057, max=0.066, sum=0.172 (3)", - "tab": "General information", - "score": 0.057333333333333326 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=850.863, mean=903.877, max=958.904, sum=2711.631 (3)", - "tab": "General information", - "score": 903.8770000000001 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=300, mean=300, max=300, sum=900 (3)", - "tab": "General information", - "score": 300.0 - }, - "NaturalQuestions (open-book) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NaturalQuestions (closed-book) - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=0.667 (1)", - "tab": "Bias", - "score": 0.6666666666666667 - }, - "NaturalQuestions (closed-book) - Stereotypes (gender)": { - "description": "min=0.385, mean=0.462, max=0.5, sum=1.385 (3)", - "tab": "Bias", - "score": 0.46155024509803927 - }, - "NaturalQuestions (closed-book) - Representation (race)": { - "description": "min=0.552, mean=0.613, max=0.657, sum=1.84 (3)", - "tab": "Bias", - "score": 0.6131917464492584 - }, - "NaturalQuestions (closed-book) - Representation (gender)": { - "description": "min=0.028, mean=0.177, max=0.252, sum=0.53 (3)", - "tab": "Bias", - "score": 0.17673498741459906 - }, - "NaturalQuestions (open-book) - Stereotypes (race)": { - "description": "min=0.209, mean=0.329, max=0.473, sum=0.987 (3)", - "tab": "Bias", - "score": 0.32890264223378113 - }, - "NaturalQuestions (open-book) - Stereotypes (gender)": { - "description": "min=0.289, mean=0.388, max=0.456, sum=1.164 (3)", - "tab": "Bias", - "score": 0.38814814814814813 - }, - "NaturalQuestions (open-book) - Representation (race)": { - "description": "min=0.394, mean=0.462, max=0.563, sum=1.386 (3)", - "tab": "Bias", - "score": 0.4620750643944221 - }, - "NaturalQuestions (open-book) - Representation (gender)": { - "description": "min=0.044, mean=0.091, max=0.176, sum=0.273 (3)", - "tab": "Bias", - "score": 0.09087407629591253 - }, - "NaturalQuestions (closed-book) - Toxic fraction": { - "description": "min=0.001, mean=0.001, max=0.002, sum=0.004 (3)", - "tab": "Toxicity", - "score": 
0.0013333333333333333 - }, - "NaturalQuestions (open-book) - Toxic fraction": { - "description": "min=0, mean=0.0, max=0.001, sum=0.001 (3)", - "tab": "Toxicity", - "score": 0.0003333333333333333 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "QuAC", - "source_data": { - "dataset_name": "QuAC", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on QuAC", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.121, - "details": { - "description": "min=0.121, mean=0.121, max=0.121, sum=0.362 (3)", - "tab": "Accuracy", - "QuAC - ECE (10-bin)": { - "description": "min=0.001, mean=0.001, max=0.001, sum=0.002 (3)", - "tab": "Calibration", - "score": 0.0005015010499976317 - }, - "QuAC - F1 (Robustness)": { - "description": "min=0.071, mean=0.071, max=0.071, sum=0.212 (3)", - "tab": "Robustness", - "score": 0.07065126152546952 - }, - "QuAC - F1 (Fairness)": { - "description": "min=0.067, mean=0.067, max=0.067, sum=0.201 (3)", - "tab": "Fairness", - "score": 0.06691720655918869 - }, - "QuAC - Denoised inference time (s)": { - "description": "min=1.239, mean=1.239, max=1.239, sum=3.716 (3)", - "tab": "Efficiency", - "score": 1.2385025575706792 - }, - "QuAC - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "QuAC - # train": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "QuAC - truncated": { - "description": "min=0.985, mean=0.985, max=0.985, sum=2.955 (3)", - "tab": "General information", - "score": 0.985 - }, - "QuAC - # prompt tokens": { - "description": "min=823.365, mean=823.365, max=823.365, sum=2470.095 (3)", - "tab": "General information", - "score": 823.3650000000001 - }, - "QuAC - # output tokens": { - "description": "min=100, mean=100, max=100, sum=300 (3)", - "tab": "General information", - "score": 100.0 - }, - "QuAC - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "QuAC - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=2 (3)", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "QuAC - Stereotypes (gender)": { - "description": "min=0.428, mean=0.428, max=0.428, sum=1.284 (3)", - "tab": "Bias", - "score": 0.42797040922040913 - }, - "QuAC - Representation (race)": { - "description": "min=0.436, mean=0.436, max=0.436, sum=1.308 (3)", - "tab": "Bias", - "score": 0.4358974358974359 - }, - "QuAC - Representation (gender)": { - "description": "min=0.291, mean=0.291, max=0.291, sum=0.872 (3)", - "tab": "Bias", - "score": 0.2905073649754501 - }, - "QuAC - Toxic fraction": { - "description": "min=0.001, mean=0.001, max=0.001, sum=0.003 (3)", - "tab": "Toxicity", - "score": 0.001 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "HellaSwag", - "source_data": { - "dataset_name": "HellaSwag", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on HellaSwag", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "HellaSwag - ECE (10-bin)": { - "description": "No matching runs", - "tab": "Calibration", - "score": null - }, - "HellaSwag - EM (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "HellaSwag - EM (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "HellaSwag - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "HellaSwag - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "OpenbookQA - ECE (10-bin)": { - "description": "No matching runs", - "tab": "Calibration", - "score": null - }, - "OpenbookQA - EM (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "OpenbookQA - EM (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "OpenbookQA - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "OpenbookQA - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "TruthfulQA", - "source_data": { - "dataset_name": "TruthfulQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on TruthfulQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 
0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.377, - "details": { - "description": "min=0.347, mean=0.377, max=0.411, sum=1.508 (4)", - "tab": "Accuracy", - "TruthfulQA - ECE (10-bin)": { - "description": "min=0.1, mean=0.154, max=0.234, sum=0.617 (4)", - "tab": "Calibration", - "score": 0.15413479575183991 - }, - "TruthfulQA - EM (Robustness)": { - "description": "min=0.33, mean=0.365, max=0.411, sum=1.46 (4)", - "tab": "Robustness", - "score": 0.3650611620795107 - }, - "TruthfulQA - EM (Fairness)": { - "description": "min=0.307, mean=0.35, max=0.411, sum=1.399 (4)", - "tab": "Fairness", - "score": 0.34977064220183485 - }, - "TruthfulQA - Denoised inference time (s)": { - "description": "min=0.133, mean=0.142, max=0.145, sum=0.567 (4)", - "tab": "Efficiency", - "score": 0.14173421436146078 - }, - "TruthfulQA - # eval": { - "description": "min=654, mean=654, max=654, sum=2616 (4)", - "tab": "General information", - "score": 654.0 - }, - "TruthfulQA - # train": { - "description": "min=0, mean=3.75, max=5, sum=15 (4)", - "tab": "General information", - "score": 3.75 - }, - "TruthfulQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (4)", - "tab": "General information", - "score": 0.0 - }, - "TruthfulQA - # prompt tokens": { - "description": "min=85.896, mean=391.646, max=515.896, sum=1566.584 (4)", - "tab": "General information", - "score": 391.6460244648318 - }, - "TruthfulQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=4 (4)", - "tab": "General information", - "score": 1.0 - }, - "TruthfulQA - # trials": { - "description": "min=1, mean=2.5, max=3, sum=10 (4)", - "tab": "General information", - "score": 2.5 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MS MARCO (TREC)", - "source_data": { - "dataset_name": "MS MARCO (TREC)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "NDCG@10 on MS MARCO (TREC)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "MS MARCO (regular) - RR@10 (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "MS MARCO (TREC) - NDCG@10 (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "MS MARCO (regular) - RR@10 (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "MS MARCO (TREC) - NDCG@10 (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "MS MARCO (regular) - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "MS MARCO (TREC) - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "MS MARCO (regular) - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # prompt tokens": { - "description": "No matching runs", - "tab": 
"General information", - "score": null - }, - "MS MARCO (regular) - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - }, - "MS MARCO (TREC) - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CNN/DailyMail", - "source_data": { - "dataset_name": "CNN/DailyMail", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on CNN/DailyMail", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.122, - "details": { - "description": "min=0.121, mean=0.122, max=0.122, sum=0.73 (6)\n☠ T0++ is explicitly trained on these datasets, i.e. data from the same distribution as the test set. See Table 5 on page 24 of https://arxiv.org/pdf/2110.08207.pdf.", - "tab": "Accuracy", - "CNN/DailyMail - Denoised inference time (s)": { - "description": "min=1.057, mean=1.066, max=1.081, sum=6.393 (6)\n☠ T0++ is explicitly trained on these datasets, i.e. data from the same distribution as the test set. 
See Table 5 on page 24 of https://arxiv.org/pdf/2110.08207.pdf.", - "tab": "Efficiency", - "score": 1.0655231237061773 - }, - "CNN/DailyMail - # eval": { - "description": "min=466, mean=466, max=466, sum=2796 (6)\n☠ T0++ is explicitly trained on these datasets, i.e. data from the same distribution as the test set. See Table 5 on page 24 of https://arxiv.org/pdf/2110.08207.pdf.", - "tab": "General information", - "score": 466.0 - }, - "CNN/DailyMail - # train": { - "description": "min=1.303, mean=1.335, max=1.378, sum=8.013 (6)\n☠ T0++ is explicitly trained on these datasets, i.e. data from the same distribution as the test set. See Table 5 on page 24 of https://arxiv.org/pdf/2110.08207.pdf.", - "tab": "General information", - "score": 1.3354792560801145 - }, - "CNN/DailyMail - truncated": { - "description": "min=0.004, mean=0.004, max=0.004, sum=0.026 (6)\n☠ T0++ is explicitly trained on these datasets, i.e. data from the same distribution as the test set. See Table 5 on page 24 of https://arxiv.org/pdf/2110.08207.pdf.", - "tab": "General information", - "score": 0.004291845493562232 - }, - "CNN/DailyMail - # prompt tokens": { - "description": "min=885.292, mean=886.838, max=888.921, sum=5321.026 (6)\n☠ T0++ is explicitly trained on these datasets, i.e. data from the same distribution as the test set. See Table 5 on page 24 of https://arxiv.org/pdf/2110.08207.pdf.", - "tab": "General information", - "score": 886.8376251788268 - }, - "CNN/DailyMail - # output tokens": { - "description": "min=128, mean=128, max=128, sum=768 (6)\n☠ T0++ is explicitly trained on these datasets, i.e. data from the same distribution as the test set. See Table 5 on page 24 of https://arxiv.org/pdf/2110.08207.pdf.", - "tab": "General information", - "score": 128.0 - }, - "CNN/DailyMail - # trials": { - "description": "min=3, mean=3, max=3, sum=18 (6)\n☠ T0++ is explicitly trained on these datasets, i.e. data from the same distribution as the test set. See Table 5 on page 24 of https://arxiv.org/pdf/2110.08207.pdf.", - "tab": "General information", - "score": 3.0 - }, - "CNN/DailyMail - Stereotypes (race)": { - "description": "min=0.562, mean=0.594, max=0.631, sum=3.562 (6)\n☠ T0++ is explicitly trained on these datasets, i.e. data from the same distribution as the test set. See Table 5 on page 24 of https://arxiv.org/pdf/2110.08207.pdf.", - "tab": "Bias", - "score": 0.5936999598322023 - }, - "CNN/DailyMail - Stereotypes (gender)": { - "description": "min=0.391, mean=0.403, max=0.421, sum=2.417 (6)\n☠ T0++ is explicitly trained on these datasets, i.e. data from the same distribution as the test set. See Table 5 on page 24 of https://arxiv.org/pdf/2110.08207.pdf.", - "tab": "Bias", - "score": 0.4028700462262689 - }, - "CNN/DailyMail - Representation (race)": { - "description": "min=0.27, mean=0.277, max=0.282, sum=1.662 (6)\n☠ T0++ is explicitly trained on these datasets, i.e. data from the same distribution as the test set. See Table 5 on page 24 of https://arxiv.org/pdf/2110.08207.pdf.", - "tab": "Bias", - "score": 0.2769263317991031 - }, - "CNN/DailyMail - Representation (gender)": { - "description": "min=0.047, mean=0.093, max=0.138, sum=0.559 (6)\n☠ T0++ is explicitly trained on these datasets, i.e. data from the same distribution as the test set. 
See Table 5 on page 24 of https://arxiv.org/pdf/2110.08207.pdf.", - "tab": "Bias", - "score": 0.09311410441258088 - }, - "CNN/DailyMail - Toxic fraction": { - "description": "min=0, mean=0.001, max=0.002, sum=0.009 (6)\n☠ T0++ is explicitly trained on these datasets, i.e. data from the same distribution as the test set. See Table 5 on page 24 of https://arxiv.org/pdf/2110.08207.pdf.", - "tab": "Toxicity", - "score": 0.001430615164520744 - }, - "CNN/DailyMail - SummaC": { - "description": "min=-0.052, mean=-0.044, max=-0.031, sum=-0.132 (3)\n☠ T0++ is explicitly trained on these datasets, i.e. data from the same distribution as the test set. See Table 5 on page 24 of https://arxiv.org/pdf/2110.08207.pdf.", - "tab": "Summarization metrics", - "score": -0.04384894228805586 - }, - "CNN/DailyMail - QAFactEval": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - BERTScore (F1)": { - "description": "min=0.151, mean=0.155, max=0.163, sum=0.465 (3)\n☠ T0++ is explicitly trained on these datasets, i.e. data from the same distribution as the test set. See Table 5 on page 24 of https://arxiv.org/pdf/2110.08207.pdf.", - "tab": "Summarization metrics", - "score": 0.1550916195946839 - }, - "CNN/DailyMail - Coverage": { - "description": "min=0.836, mean=0.841, max=0.845, sum=5.047 (6)\n☠ T0++ is explicitly trained on these datasets, i.e. data from the same distribution as the test set. See Table 5 on page 24 of https://arxiv.org/pdf/2110.08207.pdf.", - "tab": "Summarization metrics", - "score": 0.841192270385719 - }, - "CNN/DailyMail - Density": { - "description": "min=8.147, mean=8.588, max=8.816, sum=51.53 (6)\n☠ T0++ is explicitly trained on these datasets, i.e. data from the same distribution as the test set. See Table 5 on page 24 of https://arxiv.org/pdf/2110.08207.pdf.", - "tab": "Summarization metrics", - "score": 8.588383920302716 - }, - "CNN/DailyMail - Compression": { - "description": "min=8.169, mean=8.274, max=8.416, sum=49.643 (6)\n☠ T0++ is explicitly trained on these datasets, i.e. data from the same distribution as the test set. See Table 5 on page 24 of https://arxiv.org/pdf/2110.08207.pdf.", - "tab": "Summarization metrics", - "score": 8.27387938295926 - }, - "CNN/DailyMail - HumanEval-faithfulness": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-relevance": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-coherence": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "XSUM", - "source_data": { - "dataset_name": "XSUM", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on XSUM", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.09, - "details": { - "description": "min=0.07, mean=0.09, max=0.103, sum=0.539 (6)\n☠ T0++ is explicitly trained on these datasets, i.e. data from the same distribution as the test set. 
See Table 5 on page 24 of https://arxiv.org/pdf/2110.08207.pdf.", - "tab": "Accuracy", - "XSUM - Denoised inference time (s)": { - "description": "min=0.523, mean=0.554, max=0.571, sum=3.326 (6)\n☠ T0++ is explicitly trained on these datasets, i.e. data from the same distribution as the test set. See Table 5 on page 24 of https://arxiv.org/pdf/2110.08207.pdf.", - "tab": "Efficiency", - "score": 0.5543883131537052 - }, - "XSUM - # eval": { - "description": "min=518, mean=518, max=518, sum=3108 (6)\n☠ T0++ is explicitly trained on these datasets, i.e. data from the same distribution as the test set. See Table 5 on page 24 of https://arxiv.org/pdf/2110.08207.pdf.", - "tab": "General information", - "score": 518.0 - }, - "XSUM - # train": { - "description": "min=1.967, mean=2.068, max=2.214, sum=12.405 (6)\n☠ T0++ is explicitly trained on these datasets, i.e. data from the same distribution as the test set. See Table 5 on page 24 of https://arxiv.org/pdf/2110.08207.pdf.", - "tab": "General information", - "score": 2.0675675675675675 - }, - "XSUM - truncated": { - "description": "min=0.002, mean=0.01, max=0.019, sum=0.058 (6)\n☠ T0++ is explicitly trained on these datasets, i.e. data from the same distribution as the test set. See Table 5 on page 24 of https://arxiv.org/pdf/2110.08207.pdf.", - "tab": "General information", - "score": 0.009652509652509652 - }, - "XSUM - # prompt tokens": { - "description": "min=889.981, mean=907.769, max=929.006, sum=5446.614 (6)\n☠ T0++ is explicitly trained on these datasets, i.e. data from the same distribution as the test set. See Table 5 on page 24 of https://arxiv.org/pdf/2110.08207.pdf.", - "tab": "General information", - "score": 907.7689832689833 - }, - "XSUM - # output tokens": { - "description": "min=64, mean=64, max=64, sum=384 (6)\n☠ T0++ is explicitly trained on these datasets, i.e. data from the same distribution as the test set. See Table 5 on page 24 of https://arxiv.org/pdf/2110.08207.pdf.", - "tab": "General information", - "score": 64.0 - }, - "XSUM - # trials": { - "description": "min=3, mean=3, max=3, sum=18 (6)\n☠ T0++ is explicitly trained on these datasets, i.e. data from the same distribution as the test set. See Table 5 on page 24 of https://arxiv.org/pdf/2110.08207.pdf.", - "tab": "General information", - "score": 3.0 - }, - "XSUM - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=4 (6)\n☠ T0++ is explicitly trained on these datasets, i.e. data from the same distribution as the test set. See Table 5 on page 24 of https://arxiv.org/pdf/2110.08207.pdf.", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "XSUM - Stereotypes (gender)": { - "description": "min=0.43, mean=0.444, max=0.463, sum=2.663 (6)\n☠ T0++ is explicitly trained on these datasets, i.e. data from the same distribution as the test set. See Table 5 on page 24 of https://arxiv.org/pdf/2110.08207.pdf.", - "tab": "Bias", - "score": 0.4438297255067441 - }, - "XSUM - Representation (race)": { - "description": "min=0.286, mean=0.457, max=0.617, sum=2.74 (6)\n☠ T0++ is explicitly trained on these datasets, i.e. data from the same distribution as the test set. See Table 5 on page 24 of https://arxiv.org/pdf/2110.08207.pdf.", - "tab": "Bias", - "score": 0.45673778645470176 - }, - "XSUM - Representation (gender)": { - "description": "min=0.215, mean=0.27, max=0.328, sum=1.62 (6)\n☠ T0++ is explicitly trained on these datasets, i.e. data from the same distribution as the test set. 
See Table 5 on page 24 of https://arxiv.org/pdf/2110.08207.pdf.", - "tab": "Bias", - "score": 0.2699471127776433 - }, - "XSUM - Toxic fraction": { - "description": "min=0, mean=0.001, max=0.002, sum=0.004 (6)\n☠ T0++ is explicitly trained on these datasets, i.e. data from the same distribution as the test set. See Table 5 on page 24 of https://arxiv.org/pdf/2110.08207.pdf.", - "tab": "Toxicity", - "score": 0.0006435006435006435 - }, - "XSUM - SummaC": { - "description": "min=-0.331, mean=-0.3, max=-0.268, sum=-0.901 (3)\n☠ T0++ is explicitly trained on these datasets, i.e. data from the same distribution as the test set. See Table 5 on page 24 of https://arxiv.org/pdf/2110.08207.pdf.", - "tab": "Summarization metrics", - "score": -0.3004745337800477 - }, - "XSUM - QAFactEval": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - BERTScore (F1)": { - "description": "min=0.083, mean=0.097, max=0.111, sum=0.292 (3)\n☠ T0++ is explicitly trained on these datasets, i.e. data from the same distribution as the test set. See Table 5 on page 24 of https://arxiv.org/pdf/2110.08207.pdf.", - "tab": "Summarization metrics", - "score": 0.09723521885401472 - }, - "XSUM - Coverage": { - "description": "min=0.543, mean=0.579, max=0.605, sum=3.474 (6)\n☠ T0++ is explicitly trained on these datasets, i.e. data from the same distribution as the test set. See Table 5 on page 24 of https://arxiv.org/pdf/2110.08207.pdf.", - "tab": "Summarization metrics", - "score": 0.5789418979978066 - }, - "XSUM - Density": { - "description": "min=1.492, mean=1.684, max=1.861, sum=10.105 (6)\n☠ T0++ is explicitly trained on these datasets, i.e. data from the same distribution as the test set. See Table 5 on page 24 of https://arxiv.org/pdf/2110.08207.pdf.", - "tab": "Summarization metrics", - "score": 1.6841663389066148 - }, - "XSUM - Compression": { - "description": "min=10.341, mean=11.178, max=11.672, sum=67.065 (6)\n☠ T0++ is explicitly trained on these datasets, i.e. data from the same distribution as the test set. See Table 5 on page 24 of https://arxiv.org/pdf/2110.08207.pdf.", - "tab": "Summarization metrics", - "score": 11.17756803869132 - }, - "XSUM - HumanEval-faithfulness": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-relevance": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-coherence": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "IMDB", - "source_data": { - "dataset_name": "IMDB", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on IMDB", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.207, - "details": { - "description": "min=0.181, mean=0.207, max=0.26, sum=0.622 (3)\n☠ T0++ is explicitly trained on these datasets, i.e. data from the same distribution as the test set. 
See Table 5 on page 24 of https://arxiv.org/pdf/2110.08207.pdf.", - "tab": "Accuracy", - "IMDB - ECE (10-bin)": { - "description": "min=0.207, mean=0.291, max=0.36, sum=0.872 (3)\n☠ T0++ is explicitly trained on these datasets, i.e. data from the same distribution as the test set. See Table 5 on page 24 of https://arxiv.org/pdf/2110.08207.pdf.", - "tab": "Calibration", - "score": 0.29061500207311436 - }, - "IMDB - EM (Robustness)": { - "description": "min=0.13, mean=0.17, max=0.227, sum=0.511 (3)\n☠ T0++ is explicitly trained on these datasets, i.e. data from the same distribution as the test set. See Table 5 on page 24 of https://arxiv.org/pdf/2110.08207.pdf.", - "tab": "Robustness", - "score": 0.17033333333333334 - }, - "IMDB - EM (Fairness)": { - "description": "min=0.129, mean=0.168, max=0.22, sum=0.505 (3)\n☠ T0++ is explicitly trained on these datasets, i.e. data from the same distribution as the test set. See Table 5 on page 24 of https://arxiv.org/pdf/2110.08207.pdf.", - "tab": "Fairness", - "score": 0.16833333333333333 - }, - "IMDB - Denoised inference time (s)": { - "description": "min=0.37, mean=0.393, max=0.436, sum=1.18 (3)\n☠ T0++ is explicitly trained on these datasets, i.e. data from the same distribution as the test set. See Table 5 on page 24 of https://arxiv.org/pdf/2110.08207.pdf.", - "tab": "Efficiency", - "score": 0.39343433208828427 - }, - "IMDB - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)\n☠ T0++ is explicitly trained on these datasets, i.e. data from the same distribution as the test set. See Table 5 on page 24 of https://arxiv.org/pdf/2110.08207.pdf.", - "tab": "General information", - "score": 1000.0 - }, - "IMDB - # train": { - "description": "min=1.981, mean=2.44, max=3.074, sum=7.321 (3)\n☠ T0++ is explicitly trained on these datasets, i.e. data from the same distribution as the test set. See Table 5 on page 24 of https://arxiv.org/pdf/2110.08207.pdf.", - "tab": "General information", - "score": 2.4403333333333332 - }, - "IMDB - truncated": { - "description": "min=0.03, mean=0.03, max=0.03, sum=0.09 (3)\n☠ T0++ is explicitly trained on these datasets, i.e. data from the same distribution as the test set. See Table 5 on page 24 of https://arxiv.org/pdf/2110.08207.pdf.", - "tab": "General information", - "score": 0.03 - }, - "IMDB - # prompt tokens": { - "description": "min=905.879, mean=910.174, max=913.752, sum=2730.521 (3)\n☠ T0++ is explicitly trained on these datasets, i.e. data from the same distribution as the test set. See Table 5 on page 24 of https://arxiv.org/pdf/2110.08207.pdf.", - "tab": "General information", - "score": 910.1736666666666 - }, - "IMDB - # output tokens": { - "description": "min=5, mean=5, max=5, sum=15 (3)\n☠ T0++ is explicitly trained on these datasets, i.e. data from the same distribution as the test set. See Table 5 on page 24 of https://arxiv.org/pdf/2110.08207.pdf.", - "tab": "General information", - "score": 5.0 - }, - "IMDB - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)\n☠ T0++ is explicitly trained on these datasets, i.e. data from the same distribution as the test set. 
See Table 5 on page 24 of https://arxiv.org/pdf/2110.08207.pdf.", - "tab": "General information", - "score": 3.0 - }, - "IMDB - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CivilComments", - "source_data": { - "dataset_name": "CivilComments", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on CivilComments", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.234, - "details": { - "description": "min=0, mean=0.234, max=0.985, sum=12.634 (54)", - "tab": "Accuracy", - "CivilComments - ECE (10-bin)": { - "description": "min=0.067, mean=0.308, max=0.574, sum=16.631 (54)", - "tab": "Calibration", - "score": 0.30797595023001567 - }, - "CivilComments - EM (Robustness)": { - "description": "min=0, mean=0.087, max=0.824, sum=4.704 (54)", - "tab": "Robustness", - "score": 0.0871064519307774 - }, - "CivilComments - EM (Fairness)": { - "description": "min=0, mean=0.165, max=0.947, sum=8.894 (54)", - "tab": "Fairness", - "score": 0.16470832145418626 - }, - "CivilComments - Denoised inference time (s)": { - "description": "min=0.328, mean=0.391, max=0.487, sum=21.126 (54)", - "tab": "Efficiency", - "score": 0.3912135341654548 - }, - "CivilComments - # eval": { - "description": "min=74, mean=371.556, max=683, sum=20064 (54)", - "tab": "General information", - "score": 371.55555555555554 - }, - "CivilComments - # train": { - "description": "min=2.991, mean=4.861, max=5, sum=262.497 (54)", - "tab": "General information", - "score": 4.861055391438897 - }, - "CivilComments - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (54)", - "tab": "General information", - "score": 0.0 - }, - "CivilComments - # prompt tokens": { - "description": "min=385.732, mean=744.109, max=936.562, sum=40181.894 (54)", - "tab": "General information", - "score": 744.1091399163704 - }, - "CivilComments - # output tokens": { - "description": "min=5, mean=5, max=5, sum=270 (54)", - "tab": "General information", - "score": 5.0 - }, - "CivilComments - # trials": { - "description": "min=3, mean=3, max=3, sum=162 (54)", - "tab": "General information", - "score": 3.0 - }, - "CivilComments - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (gender)": { - "description": "min=0.3, mean=0.459, max=0.5, sum=5.503 (12)", - "tab": "Bias", - "score": 0.4585978835978836 - }, - "CivilComments - Toxic fraction": { - 
"description": "min=0, mean=0.0, max=0.008, sum=0.025 (54)", - "tab": "Toxicity", - "score": 0.0004596436870303355 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "RAFT", - "source_data": { - "dataset_name": "RAFT", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on RAFT", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.118, - "details": { - "description": "min=0, mean=0.118, max=0.775, sum=3.9 (33)", - "tab": "Accuracy", - "RAFT - ECE (10-bin)": { - "description": "min=0.0, mean=0.086, max=0.573, sum=2.84 (33)", - "tab": "Calibration", - "score": 0.08607203532710274 - }, - "RAFT - EM (Robustness)": { - "description": "min=0, mean=0.085, max=0.775, sum=2.8 (33)", - "tab": "Robustness", - "score": 0.08484848484848484 - }, - "RAFT - EM (Fairness)": { - "description": "min=0, mean=0.106, max=0.75, sum=3.5 (33)", - "tab": "Fairness", - "score": 0.10606060606060606 - }, - "RAFT - Denoised inference time (s)": { - "description": "min=0.329, mean=0.586, max=0.74, sum=19.352 (33)", - "tab": "Efficiency", - "score": 0.586429068475456 - }, - "RAFT - # eval": { - "description": "min=40, mean=40, max=40, sum=1320 (33)", - "tab": "General information", - "score": 40.0 - }, - "RAFT - # train": { - "description": "min=0, mean=3.913, max=5, sum=129.125 (33)", - "tab": "General information", - "score": 3.912878787878788 - }, - "RAFT - truncated": { - "description": "min=0, mean=0.09, max=0.925, sum=2.975 (33)", - "tab": "General information", - "score": 0.09015151515151516 - }, - "RAFT - # prompt tokens": { - "description": "min=263.4, mean=650.012, max=949.7, sum=21450.4 (33)", - "tab": "General information", - "score": 650.0121212121212 - }, - "RAFT - # output tokens": { - "description": "min=30, mean=30, max=30, sum=990 (33)", - "tab": "General information", - "score": 30.0 - }, - "RAFT - # trials": { - "description": "min=3, mean=3, max=3, sum=99 (33)", - "tab": "General information", - "score": 3.0 - }, - "RAFT - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Stereotypes (gender)": { - "description": "min=0.5, mean=0.5, max=0.5, sum=1.5 (3)", - "tab": "Bias", - "score": 0.5 - }, - "RAFT - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (gender)": { - "description": "min=0.125, mean=0.125, max=0.125, sum=0.375 (3)", - "tab": "Bias", - "score": 0.12500000000000003 - }, - "RAFT - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (33)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_classic/cohere/Cohere-Command-beta-52.4B/b8932181-b669-4b0e-8879-1dfbf9afea12.json b/data/helm_classic/cohere/Cohere-Command-beta-52.4B/b8932181-b669-4b0e-8879-1dfbf9afea12.json deleted file mode 100644 index 25f29c7e2..000000000 --- a/data/helm_classic/cohere/Cohere-Command-beta-52.4B/b8932181-b669-4b0e-8879-1dfbf9afea12.json +++ /dev/null @@ -1,1613 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_classic/cohere_Cohere-Command-beta-52.4B/1770834891.1472661", - "retrieved_timestamp": "1770834891.1472661", - "source_metadata": { - "source_name": "helm_classic", - 
"source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Cohere Command beta 52.4B", - "id": "cohere/Cohere-Command-beta-52.4B", - "developer": "cohere", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_classic", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperform on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.874, - "details": { - "tab": "Accuracy", - "Mean win rate - Calibration": { - "description": null, - "tab": "Calibration", - "score": 0.5963856625666678 - }, - "Mean win rate - Robustness": { - "description": null, - "tab": "Robustness", - "score": 0.8502739196287583 - }, - "Mean win rate - Fairness": { - "description": null, - "tab": "Fairness", - "score": 0.8657917351465738 - }, - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": null - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - }, - "Mean win rate - Bias": { - "description": null, - "tab": "Bias", - "score": 0.5758163753811841 - }, - "Mean win rate - Toxicity": { - "description": null, - "tab": "Toxicity", - "score": 0.6738178488178488 - }, - "Mean win rate - Summarization metrics": { - "description": null, - "tab": "Summarization metrics", - "score": 0.6776315789473684 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.452, - "details": { - "description": "min=0.23, mean=0.452, max=0.79, sum=6.786 (15)", - "tab": "Accuracy", - "MMLU - ECE (10-bin)": { - "description": "min=0.099, mean=0.183, max=0.338, sum=2.742 (15)", - "tab": "Calibration", - "score": 0.18282231471159943 - }, - "MMLU - EM (Robustness)": { - "description": "min=0.15, mean=0.387, max=0.73, sum=5.807 (15)", - "tab": "Robustness", - "score": 0.38711111111111113 - }, - "MMLU - EM (Fairness)": { - "description": "min=0.19, mean=0.407, max=0.73, sum=6.107 (15)", - "tab": "Fairness", - "score": 0.4071111111111111 - }, - "MMLU - Denoised inference time (s)": { - "description": "5 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=1542 (15)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=75 (15)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (15)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=372.75, mean=481.26, max=628.421, sum=7218.903 (15)", - "tab": "General information", - "score": 481.2602105263158 - }, 
- "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=15 (15)", - "tab": "General information", - "score": 1.0 - }, - "MMLU - # trials": { - "description": "min=3, mean=3, max=3, sum=45 (15)", - "tab": "General information", - "score": 3.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "BoolQ", - "source_data": { - "dataset_name": "BoolQ", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on BoolQ", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.856, - "details": { - "description": "min=0.849, mean=0.856, max=0.86, sum=2.569 (3)", - "tab": "Accuracy", - "BoolQ - ECE (10-bin)": { - "description": "min=0.018, mean=0.023, max=0.026, sum=0.069 (3)", - "tab": "Calibration", - "score": 0.02302613493537822 - }, - "BoolQ - EM (Robustness)": { - "description": "min=0.806, mean=0.811, max=0.816, sum=2.432 (3)", - "tab": "Robustness", - "score": 0.8106666666666666 - }, - "BoolQ - EM (Fairness)": { - "description": "min=0.812, mean=0.822, max=0.827, sum=2.465 (3)", - "tab": "Fairness", - "score": 0.8216666666666667 - }, - "BoolQ - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "BoolQ - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "BoolQ - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "BoolQ - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "BoolQ - # prompt tokens": { - "description": "min=669.307, mean=925.307, max=1269.307, sum=2775.921 (3)", - "tab": "General information", - "score": 925.3070000000001 - }, - "BoolQ - # output tokens": { - "description": "min=1, mean=1, max=1, sum=3 (3)", - "tab": "General information", - "score": 1.0 - }, - "BoolQ - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "BoolQ - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.752, - "details": { - "description": "min=0.744, mean=0.752, max=0.763, sum=2.255 (3)", - "tab": "Accuracy", - "NarrativeQA - ECE (10-bin)": { - "description": "min=0.051, mean=0.058, 
max=0.067, sum=0.173 (3)", - "tab": "Calibration", - "score": 0.05761424791814445 - }, - "NarrativeQA - F1 (Robustness)": { - "description": "min=0.566, mean=0.57, max=0.578, sum=1.711 (3)", - "tab": "Robustness", - "score": 0.5702997988620334 - }, - "NarrativeQA - F1 (Fairness)": { - "description": "min=0.647, mean=0.657, max=0.666, sum=1.97 (3)", - "tab": "Fairness", - "score": 0.6566736137653061 - }, - "NarrativeQA - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=1065 (3)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=0.904, mean=1.508, max=1.941, sum=4.524 (3)", - "tab": "General information", - "score": 1.5079812206572771 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=1570.772, mean=1600.684, max=1660.485, sum=4802.051 (3)", - "tab": "General information", - "score": 1600.6835680751174 - }, - "NarrativeQA - # output tokens": { - "description": "min=5.679, mean=5.992, max=6.496, sum=17.977 (3)", - "tab": "General information", - "score": 5.992488262910798 - }, - "NarrativeQA - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NarrativeQA - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NarrativeQA - Stereotypes (gender)": { - "description": "min=0.361, mean=0.404, max=0.444, sum=1.213 (3)", - "tab": "Bias", - "score": 0.404320987654321 - }, - "NarrativeQA - Representation (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=1.333 (2)", - "tab": "Bias", - "score": 0.6666666666666667 - }, - "NarrativeQA - Representation (gender)": { - "description": "min=0.174, mean=0.178, max=0.181, sum=0.534 (3)", - "tab": "Bias", - "score": 0.1778748183802931 - }, - "NarrativeQA - Toxic fraction": { - "description": "min=0.011, mean=0.014, max=0.017, sum=0.042 (3)", - "tab": "Toxicity", - "score": 0.014084507042253521 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (open-book)", - "source_data": { - "dataset_name": "NaturalQuestions (open-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (open-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.76, - "details": { - "description": "min=0.755, mean=0.76, max=0.763, sum=2.28 (3)", - "tab": "Accuracy", - "NaturalQuestions (closed-book) - ECE (10-bin)": { - "description": "min=0.075, mean=0.084, max=0.091, sum=0.251 (3)", - "tab": "Calibration", - "score": 0.08377931898267306 - }, - "NaturalQuestions (open-book) - ECE (10-bin)": { - "description": "min=0.044, mean=0.056, max=0.063, sum=0.168 (3)", - "tab": "Calibration", - "score": 0.05602757611120105 - }, - "NaturalQuestions (closed-book) - F1 (Robustness)": { - "description": "min=0.286, mean=0.289, max=0.294, sum=0.867 (3)", - "tab": "Robustness", - "score": 0.28891923018489013 - }, - "NaturalQuestions (open-book) - F1 (Robustness)": { - "description": "min=0.669, 
mean=0.679, max=0.685, sum=2.036 (3)", - "tab": "Robustness", - "score": 0.6786112890887687 - }, - "NaturalQuestions (closed-book) - F1 (Fairness)": { - "description": "min=0.29, mean=0.296, max=0.301, sum=0.888 (3)", - "tab": "Fairness", - "score": 0.29608566298974776 - }, - "NaturalQuestions (open-book) - F1 (Fairness)": { - "description": "min=0.7, mean=0.706, max=0.714, sum=2.117 (3)", - "tab": "Fairness", - "score": 0.7056823207366739 - }, - "NaturalQuestions (closed-book) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NaturalQuestions (open-book) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=109.191, mean=111.191, max=115.191, sum=333.573 (3)", - "tab": "General information", - "score": 111.19099999999999 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=4.29, mean=4.325, max=4.367, sum=12.974 (3)", - "tab": "General information", - "score": 4.324666666666666 - }, - "NaturalQuestions (closed-book) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.485, mean=4.602, max=4.705, sum=13.807 (3)", - "tab": "General information", - "score": 4.602333333333333 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.039, mean=0.039, max=0.039, sum=0.117 (3)", - "tab": "General information", - "score": 0.039 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1258.15, mean=1471.073, max=1597.431, sum=4413.22 (3)", - "tab": "General information", - "score": 1471.073333333333 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=7.153, mean=7.288, max=7.488, sum=21.864 (3)", - "tab": "General information", - "score": 7.288 - }, - "NaturalQuestions (open-book) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NaturalQuestions (closed-book) - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=2 (3)", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "NaturalQuestions (closed-book) - Stereotypes (gender)": { - "description": "min=0.5, mean=0.5, max=0.5, sum=1.5 (3)", - "tab": "Bias", - "score": 0.5 - }, - "NaturalQuestions (closed-book) - Representation (race)": { - "description": "min=0.487, mean=0.552, max=0.634, sum=1.655 (3)", - "tab": "Bias", - "score": 0.5517958743765196 - }, - "NaturalQuestions (closed-book) - Representation (gender)": { - "description": "min=0.063, mean=0.129, max=0.206, sum=0.387 (3)", - "tab": "Bias", - "score": 0.12914332399626519 - }, - 
"NaturalQuestions (open-book) - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=2 (3)", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "NaturalQuestions (open-book) - Stereotypes (gender)": { - "description": "min=0.479, mean=0.482, max=0.483, sum=1.446 (3)", - "tab": "Bias", - "score": 0.48194444444444445 - }, - "NaturalQuestions (open-book) - Representation (race)": { - "description": "min=0.577, mean=0.579, max=0.582, sum=1.737 (3)", - "tab": "Bias", - "score": 0.5791309646902151 - }, - "NaturalQuestions (open-book) - Representation (gender)": { - "description": "min=0.025, mean=0.05, max=0.067, sum=0.151 (3)", - "tab": "Bias", - "score": 0.05047080979284368 - }, - "NaturalQuestions (closed-book) - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "Toxicity", - "score": 0.0 - }, - "NaturalQuestions (open-book) - Toxic fraction": { - "description": "min=0.001, mean=0.001, max=0.001, sum=0.003 (3)", - "tab": "Toxicity", - "score": 0.001 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "QuAC", - "source_data": { - "dataset_name": "QuAC", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on QuAC", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.432, - "details": { - "description": "min=0.429, mean=0.432, max=0.435, sum=1.296 (3)", - "tab": "Accuracy", - "QuAC - ECE (10-bin)": { - "description": "min=0.043, mean=0.06, max=0.073, sum=0.181 (3)", - "tab": "Calibration", - "score": 0.06049762085119498 - }, - "QuAC - F1 (Robustness)": { - "description": "min=0.236, mean=0.238, max=0.24, sum=0.715 (3)", - "tab": "Robustness", - "score": 0.23825281130135667 - }, - "QuAC - F1 (Fairness)": { - "description": "min=0.309, mean=0.316, max=0.322, sum=0.947 (3)", - "tab": "Fairness", - "score": 0.31563184414828255 - }, - "QuAC - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "QuAC - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "QuAC - # train": { - "description": "min=0.748, mean=0.848, max=0.933, sum=2.545 (3)", - "tab": "General information", - "score": 0.8483333333333333 - }, - "QuAC - truncated": { - "description": "min=0.022, mean=0.022, max=0.022, sum=0.066 (3)", - "tab": "General information", - "score": 0.022000000000000002 - }, - "QuAC - # prompt tokens": { - "description": "min=1577.224, mean=1610.503, max=1643.74, sum=4831.508 (3)", - "tab": "General information", - "score": 1610.5026666666665 - }, - "QuAC - # output tokens": { - "description": "min=19.435, mean=19.627, max=19.984, sum=58.881 (3)", - "tab": "General information", - "score": 19.627 - }, - "QuAC - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "QuAC - Stereotypes (race)": { - "description": "min=0.593, mean=0.596, max=0.603, sum=1.788 (3)", - "tab": "Bias", - "score": 0.5961199294532628 - }, - "QuAC - Stereotypes (gender)": { - "description": "min=0.459, mean=0.47, max=0.484, sum=1.409 (3)", - "tab": "Bias", - "score": 0.4696816360952984 - }, - "QuAC - Representation (race)": { - "description": "min=0.299, mean=0.316, 
max=0.333, sum=0.949 (3)", - "tab": "Bias", - "score": 0.316297459154602 - }, - "QuAC - Representation (gender)": { - "description": "min=0.219, mean=0.232, max=0.245, sum=0.695 (3)", - "tab": "Bias", - "score": 0.23168423828159934 - }, - "QuAC - Toxic fraction": { - "description": "min=0.001, mean=0.001, max=0.001, sum=0.003 (3)", - "tab": "Toxicity", - "score": 0.001 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "HellaSwag", - "source_data": { - "dataset_name": "HellaSwag", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on HellaSwag", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.811, - "details": { - "description": "min=0.811, mean=0.811, max=0.811, sum=0.811 (1)", - "tab": "Accuracy", - "HellaSwag - ECE (10-bin)": { - "description": "min=0.325, mean=0.325, max=0.325, sum=0.325 (1)", - "tab": "Calibration", - "score": 0.3246923611213033 - }, - "HellaSwag - EM (Robustness)": { - "description": "min=0.774, mean=0.774, max=0.774, sum=0.774 (1)", - "tab": "Robustness", - "score": 0.774 - }, - "HellaSwag - EM (Fairness)": { - "description": "min=0.699, mean=0.699, max=0.699, sum=0.699 (1)", - "tab": "Fairness", - "score": 0.699 - }, - "HellaSwag - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "HellaSwag - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "HellaSwag - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag - # prompt tokens": { - "description": "min=88.855, mean=88.855, max=88.855, sum=88.855 (1)", - "tab": "General information", - "score": 88.855 - }, - "HellaSwag - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "HellaSwag - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.582, - "details": { - "description": "min=0.582, mean=0.582, max=0.582, sum=0.582 (1)", - "tab": "Accuracy", - "OpenbookQA - ECE (10-bin)": { - "description": "min=0.231, mean=0.231, max=0.231, sum=0.231 (1)", - "tab": "Calibration", - "score": 0.23111297495969485 - }, - "OpenbookQA - EM (Robustness)": { - "description": "min=0.492, mean=0.492, max=0.492, sum=0.492 (1)", - "tab": "Robustness", - "score": 0.492 - }, - "OpenbookQA - EM (Fairness)": { - "description": "min=0.508, mean=0.508, max=0.508, sum=0.508 (1)", - "tab": "Fairness", - "score": 0.508 - }, - "OpenbookQA - 
Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=5.358, mean=5.358, max=5.358, sum=5.358 (1)", - "tab": "General information", - "score": 5.358 - }, - "OpenbookQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "OpenbookQA - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "TruthfulQA", - "source_data": { - "dataset_name": "TruthfulQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on TruthfulQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.269, - "details": { - "description": "min=0.265, mean=0.269, max=0.275, sum=0.807 (3)", - "tab": "Accuracy", - "TruthfulQA - ECE (10-bin)": { - "description": "min=0.272, mean=0.311, max=0.338, sum=0.933 (3)", - "tab": "Calibration", - "score": 0.31095945192078733 - }, - "TruthfulQA - EM (Robustness)": { - "description": "min=0.226, mean=0.229, max=0.231, sum=0.688 (3)", - "tab": "Robustness", - "score": 0.2293577981651376 - }, - "TruthfulQA - EM (Fairness)": { - "description": "min=0.219, mean=0.222, max=0.225, sum=0.665 (3)", - "tab": "Fairness", - "score": 0.2217125382262997 - }, - "TruthfulQA - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "TruthfulQA - # eval": { - "description": "min=654, mean=654, max=654, sum=1962 (3)", - "tab": "General information", - "score": 654.0 - }, - "TruthfulQA - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "TruthfulQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "TruthfulQA - # prompt tokens": { - "description": "min=505.315, mean=514.648, max=532.315, sum=1543.945 (3)", - "tab": "General information", - "score": 514.6483180428135 - }, - "TruthfulQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=3 (3)", - "tab": "General information", - "score": 1.0 - }, - "TruthfulQA - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MS MARCO (TREC)", - "source_data": { - "dataset_name": "MS MARCO (TREC)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "NDCG@10 on MS MARCO (TREC)", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.762, - "details": { - "description": "min=0.761, mean=0.762, max=0.765, sum=2.287 (3)", - "tab": "Accuracy", - "MS MARCO (regular) - RR@10 (Robustness)": { - "description": "min=0.429, mean=0.434, max=0.438, sum=1.303 (3)", - "tab": "Robustness", - "score": 0.43439140211640154 - }, - "MS MARCO (TREC) - NDCG@10 (Robustness)": { - "description": "min=0.726, mean=0.734, max=0.743, sum=2.202 (3)", - "tab": "Robustness", - "score": 0.7339375978505934 - }, - "MS MARCO (regular) - RR@10 (Fairness)": { - "description": "min=0.444, mean=0.45, max=0.453, sum=1.35 (3)", - "tab": "Fairness", - "score": 0.4498752645502638 - }, - "MS MARCO (TREC) - NDCG@10 (Fairness)": { - "description": "min=0.745, mean=0.748, max=0.752, sum=2.245 (3)", - "tab": "Fairness", - "score": 0.7483868294443408 - }, - "MS MARCO (regular) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "MS MARCO (TREC) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "MS MARCO (regular) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "MS MARCO (regular) - # train": { - "description": "min=2, mean=2, max=2, sum=6 (3)", - "tab": "General information", - "score": 2.0 - }, - "MS MARCO (regular) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "MS MARCO (regular) - # prompt tokens": { - "description": "min=497.281, mean=536.614, max=583.281, sum=1609.843 (3)", - "tab": "General information", - "score": 536.6143333333333 - }, - "MS MARCO (regular) - # output tokens": { - "description": "min=1, mean=1, max=1, sum=3 (3)", - "tab": "General information", - "score": 1.0 - }, - "MS MARCO (regular) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "MS MARCO (TREC) - # eval": { - "description": "min=43, mean=43, max=43, sum=129 (3)", - "tab": "General information", - "score": 43.0 - }, - "MS MARCO (TREC) - # train": { - "description": "min=2, mean=2, max=2, sum=6 (3)", - "tab": "General information", - "score": 2.0 - }, - "MS MARCO (TREC) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "MS MARCO (TREC) - # prompt tokens": { - "description": "min=480.163, mean=519.496, max=566.163, sum=1558.488 (3)", - "tab": "General information", - "score": 519.4961240310078 - }, - "MS MARCO (TREC) - # output tokens": { - "description": "min=1, mean=1, max=1, sum=3 (3)", - "tab": "General information", - "score": 1.0 - }, - "MS MARCO (TREC) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "MS MARCO (regular) - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", 
- "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - }, - "MS MARCO (TREC) - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CNN/DailyMail", - "source_data": { - "dataset_name": "CNN/DailyMail", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on CNN/DailyMail", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.161, - "details": { - "description": "min=0.156, mean=0.161, max=0.167, sum=0.966 (6)", - "tab": "Accuracy", - "CNN/DailyMail - Denoised inference time (s)": { - "description": "2 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "CNN/DailyMail - # eval": { - "description": "min=466, mean=466, max=466, sum=2796 (6)", - "tab": "General information", - "score": 466.0 - }, - "CNN/DailyMail - # train": { - "description": "min=5, mean=5, max=5, sum=30 (6)", - "tab": "General information", - "score": 5.0 - }, - "CNN/DailyMail - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (6)", - "tab": "General information", - "score": 0.0 - }, - "CNN/DailyMail - # prompt tokens": { - "description": "min=1555.036, mean=1575.036, max=1602.036, sum=9450.219 (6)", - "tab": "General information", - "score": 1575.0364806866953 - }, - "CNN/DailyMail - # output tokens": { - "description": "min=72.088, mean=74.406, max=77.451, sum=446.433 (6)", - "tab": "General information", - "score": 74.40557939914163 - }, - "CNN/DailyMail - # trials": { - "description": "min=3, mean=3, max=3, sum=18 (6)", - "tab": "General information", - "score": 3.0 - }, - "CNN/DailyMail - Stereotypes (race)": { - "description": "min=0.587, mean=0.612, max=0.629, sum=3.673 (6)", - "tab": "Bias", - "score": 0.6121656731068496 - }, - "CNN/DailyMail - Stereotypes (gender)": { - "description": "min=0.391, mean=0.396, max=0.407, sum=2.379 (6)", - "tab": "Bias", - "score": 0.39642600089657387 - }, - "CNN/DailyMail - Representation (race)": { - "description": "min=0.238, mean=0.286, max=0.343, sum=1.713 (6)", - "tab": "Bias", - "score": 0.28558037967512334 - }, - "CNN/DailyMail - Representation (gender)": { - "description": "min=0.088, mean=0.09, max=0.093, sum=0.537 (6)", - "tab": "Bias", - "score": 0.08955985269326716 - }, - "CNN/DailyMail - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (6)", - "tab": "Toxicity", - "score": 0.0 - }, - "CNN/DailyMail - SummaC": { - "description": "min=0.366, mean=0.415, max=0.441, sum=1.245 (3)", - "tab": "Summarization metrics", - 
"score": 0.4149051333035736 - }, - "CNN/DailyMail - QAFactEval": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - BERTScore (F1)": { - "description": "min=0.316, mean=0.318, max=0.322, sum=0.955 (3)", - "tab": "Summarization metrics", - "score": 0.31834420143428105 - }, - "CNN/DailyMail - Coverage": { - "description": "min=0.976, mean=0.979, max=0.982, sum=5.874 (6)", - "tab": "Summarization metrics", - "score": 0.9790462109521986 - }, - "CNN/DailyMail - Density": { - "description": "min=28.96, mean=32.165, max=35.676, sum=192.989 (6)", - "tab": "Summarization metrics", - "score": 32.164866076836944 - }, - "CNN/DailyMail - Compression": { - "description": "min=8.594, mean=9.156, max=9.657, sum=54.938 (6)", - "tab": "Summarization metrics", - "score": 9.156293880030324 - }, - "CNN/DailyMail - HumanEval-faithfulness": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-relevance": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-coherence": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "XSUM", - "source_data": { - "dataset_name": "XSUM", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on XSUM", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.152, - "details": { - "description": "min=0.147, mean=0.152, max=0.156, sum=0.913 (6)", - "tab": "Accuracy", - "XSUM - Denoised inference time (s)": { - "description": "2 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "XSUM - # eval": { - "description": "min=518, mean=518, max=518, sum=3108 (6)", - "tab": "General information", - "score": 518.0 - }, - "XSUM - # train": { - "description": "min=4.996, mean=4.997, max=5, sum=29.985 (6)", - "tab": "General information", - "score": 4.997425997425997 - }, - "XSUM - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (6)", - "tab": "General information", - "score": 0.0 - }, - "XSUM - # prompt tokens": { - "description": "min=1484.608, mean=1537.293, max=1572.616, sum=9223.757 (6)", - "tab": "General information", - "score": 1537.2927927927929 - }, - "XSUM - # output tokens": { - "description": "min=24.187, mean=24.351, max=24.541, sum=146.108 (6)", - "tab": "General information", - "score": 24.35135135135135 - }, - "XSUM - # trials": { - "description": "min=3, mean=3, max=3, sum=18 (6)", - "tab": "General information", - "score": 3.0 - }, - "XSUM - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=4.0 (6)", - "tab": "Bias", - "score": 0.6666666666666669 - }, - "XSUM - Stereotypes (gender)": { - "description": "min=0.433, mean=0.457, max=0.476, sum=2.745 (6)", - "tab": "Bias", - "score": 0.4574302134646962 - }, - "XSUM - Representation (race)": { - "description": "min=0.481, mean=0.522, max=0.556, sum=3.13 (6)", - "tab": "Bias", - "score": 0.5217473884140551 - }, - "XSUM - Representation (gender)": { - "description": "min=0.18, mean=0.181, 
max=0.182, sum=1.086 (6)", - "tab": "Bias", - "score": 0.1810207108427353 - }, - "XSUM - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (6)", - "tab": "Toxicity", - "score": 0.0 - }, - "XSUM - SummaC": { - "description": "min=-0.285, mean=-0.271, max=-0.262, sum=-0.814 (3)", - "tab": "Summarization metrics", - "score": -0.27140173856816235 - }, - "XSUM - QAFactEval": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - BERTScore (F1)": { - "description": "min=0.455, mean=0.459, max=0.462, sum=1.376 (3)", - "tab": "Summarization metrics", - "score": 0.4587225678869484 - }, - "XSUM - Coverage": { - "description": "min=0.788, mean=0.793, max=0.797, sum=4.758 (6)", - "tab": "Summarization metrics", - "score": 0.7930169105851288 - }, - "XSUM - Density": { - "description": "min=2.417, mean=2.548, max=2.678, sum=15.286 (6)", - "tab": "Summarization metrics", - "score": 2.54760656490819 - }, - "XSUM - Compression": { - "description": "min=16.704, mean=16.937, max=17.065, sum=101.621 (6)", - "tab": "Summarization metrics", - "score": 16.93675136805864 - }, - "XSUM - HumanEval-faithfulness": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-relevance": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-coherence": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "IMDB", - "source_data": { - "dataset_name": "IMDB", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on IMDB", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.96, - "details": { - "description": "min=0.955, mean=0.96, max=0.965, sum=2.881 (3)", - "tab": "Accuracy", - "IMDB - ECE (10-bin)": { - "description": "min=0.011, mean=0.015, max=0.02, sum=0.045 (3)", - "tab": "Calibration", - "score": 0.015015056118517703 - }, - "IMDB - EM (Robustness)": { - "description": "min=0.929, mean=0.933, max=0.936, sum=2.799 (3)", - "tab": "Robustness", - "score": 0.9330000000000002 - }, - "IMDB - EM (Fairness)": { - "description": "min=0.951, mean=0.957, max=0.96, sum=2.871 (3)", - "tab": "Fairness", - "score": 0.957 - }, - "IMDB - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "IMDB - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "IMDB - # train": { - "description": "min=2.89, mean=4.217, max=4.981, sum=12.652 (3)", - "tab": "General information", - "score": 4.217333333333333 - }, - "IMDB - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "IMDB - # prompt tokens": { - "description": "min=1282.318, mean=1557.741, max=1776.111, sum=4673.222 (3)", - "tab": "General information", - "score": 1557.7406666666666 - }, - "IMDB - # output tokens": { - "description": "min=1, mean=1, max=1, sum=3 (3)", - "tab": "General information", - "score": 1.0 - }, - 
"IMDB - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "IMDB - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CivilComments", - "source_data": { - "dataset_name": "CivilComments", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on CivilComments", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.601, - "details": { - "description": "min=0.254, mean=0.601, max=0.86, sum=32.478 (54)", - "tab": "Accuracy", - "CivilComments - ECE (10-bin)": { - "description": "min=0.054, mean=0.161, max=0.416, sum=8.676 (54)", - "tab": "Calibration", - "score": 0.16066140880534402 - }, - "CivilComments - EM (Robustness)": { - "description": "min=0.205, mean=0.535, max=0.84, sum=28.866 (54)", - "tab": "Robustness", - "score": 0.5345588668880686 - }, - "CivilComments - EM (Fairness)": { - "description": "min=0.222, mean=0.544, max=0.85, sum=29.397 (54)", - "tab": "Fairness", - "score": 0.5443897908426464 - }, - "CivilComments - Denoised inference time (s)": { - "description": "9 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "CivilComments - # eval": { - "description": "min=74, mean=371.556, max=683, sum=20064 (54)", - "tab": "General information", - "score": 371.55555555555554 - }, - "CivilComments - # train": { - "description": "min=5, mean=5, max=5, sum=270 (54)", - "tab": "General information", - "score": 5.0 - }, - "CivilComments - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (54)", - "tab": "General information", - "score": 0.0 - }, - "CivilComments - # prompt tokens": { - "description": "min=362.293, mean=732.514, max=1288.441, sum=39555.782 (54)", - "tab": "General information", - "score": 732.5144825548033 - }, - "CivilComments - # output tokens": { - "description": "min=1, mean=1, max=1, sum=54 (54)", - "tab": "General information", - "score": 1.0 - }, - "CivilComments - # trials": { - "description": "min=3, mean=3, max=3, sum=162 (54)", - "tab": "General information", - "score": 3.0 - }, - "CivilComments - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (54)", - "tab": "Toxicity", - "score": 0.0 - } - } - 
}, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "RAFT", - "source_data": { - "dataset_name": "RAFT", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on RAFT", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.667, - "details": { - "description": "min=0.025, mean=0.667, max=0.975, sum=22.0 (33)", - "tab": "Accuracy", - "RAFT - ECE (10-bin)": { - "description": "min=0.041, mean=0.262, max=0.96, sum=8.637 (33)", - "tab": "Calibration", - "score": 0.26172447899775947 - }, - "RAFT - EM (Robustness)": { - "description": "min=0, mean=0.599, max=0.975, sum=19.775 (33)", - "tab": "Robustness", - "score": 0.5992424242424242 - }, - "RAFT - EM (Fairness)": { - "description": "min=0.025, mean=0.627, max=0.975, sum=20.7 (33)", - "tab": "Fairness", - "score": 0.6272727272727272 - }, - "RAFT - Denoised inference time (s)": { - "description": "11 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "RAFT - # eval": { - "description": "min=40, mean=40, max=40, sum=1320 (33)", - "tab": "General information", - "score": 40.0 - }, - "RAFT - # train": { - "description": "min=0, mean=4.554, max=5, sum=150.275 (33)", - "tab": "General information", - "score": 4.553787878787879 - }, - "RAFT - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (33)", - "tab": "General information", - "score": 0.0 - }, - "RAFT - # prompt tokens": { - "description": "min=270.325, mean=813.265, max=1762.475, sum=26837.75 (33)", - "tab": "General information", - "score": 813.2651515151515 - }, - "RAFT - # output tokens": { - "description": "min=0.025, mean=3.15, max=6.8, sum=103.95 (33)", - "tab": "General information", - "score": 3.15 - }, - "RAFT - # trials": { - "description": "min=3, mean=3, max=3, sum=99 (33)", - "tab": "General information", - "score": 3.0 - }, - "RAFT - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (33)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_classic/cohere/Cohere-Command-beta-6.1B/c8f6f90c-39f6-4685-9d2d-8964c3d2ba02.json b/data/helm_classic/cohere/Cohere-Command-beta-6.1B/c8f6f90c-39f6-4685-9d2d-8964c3d2ba02.json deleted file mode 100644 index 8f01acff1..000000000 --- a/data/helm_classic/cohere/Cohere-Command-beta-6.1B/c8f6f90c-39f6-4685-9d2d-8964c3d2ba02.json +++ /dev/null @@ -1,1613 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_classic/cohere_Cohere-Command-beta-6.1B/1770834891.1472661", - "retrieved_timestamp": "1770834891.1472661", - "source_metadata": { - "source_name": "helm_classic", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Cohere Command beta 6.1B", - "id": "cohere/Cohere-Command-beta-6.1B", - "developer": "cohere", - 
"inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_classic", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperform on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.675, - "details": { - "tab": "Accuracy", - "Mean win rate - Calibration": { - "description": null, - "tab": "Calibration", - "score": 0.5291111339523303 - }, - "Mean win rate - Robustness": { - "description": null, - "tab": "Robustness", - "score": 0.6159776448986682 - }, - "Mean win rate - Fairness": { - "description": null, - "tab": "Fairness", - "score": 0.66227113635345 - }, - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": null - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - }, - "Mean win rate - Bias": { - "description": null, - "tab": "Bias", - "score": 0.43551719208606965 - }, - "Mean win rate - Toxicity": { - "description": null, - "tab": "Toxicity", - "score": 0.6688037271370605 - }, - "Mean win rate - Summarization metrics": { - "description": null, - "tab": "Summarization metrics", - "score": 0.5789473684210527 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.406, - "details": { - "description": "min=0.26, mean=0.406, max=0.63, sum=6.095 (15)", - "tab": "Accuracy", - "MMLU - ECE (10-bin)": { - "description": "min=0.103, mean=0.155, max=0.243, sum=2.327 (15)", - "tab": "Calibration", - "score": 0.1551609000421963 - }, - "MMLU - EM (Robustness)": { - "description": "min=0.2, mean=0.334, max=0.54, sum=5.009 (15)", - "tab": "Robustness", - "score": 0.33394152046783626 - }, - "MMLU - EM (Fairness)": { - "description": "min=0.2, mean=0.366, max=0.55, sum=5.495 (15)", - "tab": "Fairness", - "score": 0.36630409356725147 - }, - "MMLU - Denoised inference time (s)": { - "description": "5 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=1542 (15)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=75 (15)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (15)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=372.75, mean=481.26, max=628.421, sum=7218.903 (15)", - "tab": "General information", - "score": 481.2602105263158 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=15 (15)", - "tab": "General information", - "score": 1.0 - }, - "MMLU - # trials": { - "description": "min=3, mean=3, max=3, sum=45 (15)", - "tab": "General information", - 
"score": 3.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "BoolQ", - "source_data": { - "dataset_name": "BoolQ", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on BoolQ", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.798, - "details": { - "description": "min=0.791, mean=0.798, max=0.809, sum=2.394 (3)", - "tab": "Accuracy", - "BoolQ - ECE (10-bin)": { - "description": "min=0.048, mean=0.059, max=0.069, sum=0.178 (3)", - "tab": "Calibration", - "score": 0.0594622129465324 - }, - "BoolQ - EM (Robustness)": { - "description": "min=0.715, mean=0.725, max=0.743, sum=2.176 (3)", - "tab": "Robustness", - "score": 0.7253333333333334 - }, - "BoolQ - EM (Fairness)": { - "description": "min=0.74, mean=0.748, max=0.76, sum=2.244 (3)", - "tab": "Fairness", - "score": 0.7479999999999999 - }, - "BoolQ - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "BoolQ - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "BoolQ - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "BoolQ - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "BoolQ - # prompt tokens": { - "description": "min=669.307, mean=925.307, max=1269.307, sum=2775.921 (3)", - "tab": "General information", - "score": 925.3070000000001 - }, - "BoolQ - # output tokens": { - "description": "min=1, mean=1, max=1, sum=3 (3)", - "tab": "General information", - "score": 1.0 - }, - "BoolQ - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "BoolQ - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.709, - "details": { - "description": "min=0.707, mean=0.709, max=0.712, sum=2.128 (3)", - "tab": "Accuracy", - "NarrativeQA - ECE (10-bin)": { - "description": "min=0.075, mean=0.076, max=0.077, sum=0.228 (3)", - "tab": "Calibration", - "score": 0.07599807506781359 - }, - "NarrativeQA - F1 (Robustness)": { - "description": "min=0.515, mean=0.529, max=0.539, sum=1.586 (3)", - "tab": "Robustness", - "score": 0.5285770759196127 - }, - 
"NarrativeQA - F1 (Fairness)": { - "description": "min=0.592, mean=0.595, max=0.6, sum=1.785 (3)", - "tab": "Fairness", - "score": 0.5949605221040284 - }, - "NarrativeQA - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=1065 (3)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=0.904, mean=1.508, max=1.941, sum=4.524 (3)", - "tab": "General information", - "score": 1.5079812206572771 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=1570.772, mean=1600.684, max=1660.485, sum=4802.051 (3)", - "tab": "General information", - "score": 1600.6835680751174 - }, - "NarrativeQA - # output tokens": { - "description": "min=5.301, mean=5.807, max=6.217, sum=17.42 (3)", - "tab": "General information", - "score": 5.8065727699530525 - }, - "NarrativeQA - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NarrativeQA - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NarrativeQA - Stereotypes (gender)": { - "description": "min=0.463, mean=0.488, max=0.5, sum=1.463 (3)", - "tab": "Bias", - "score": 0.48765432098765427 - }, - "NarrativeQA - Representation (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=2 (3)", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "NarrativeQA - Representation (gender)": { - "description": "min=0.126, mean=0.144, max=0.169, sum=0.432 (3)", - "tab": "Bias", - "score": 0.14398558425056623 - }, - "NarrativeQA - Toxic fraction": { - "description": "min=0.008, mean=0.01, max=0.014, sum=0.031 (3)", - "tab": "Toxicity", - "score": 0.010328638497652582 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (open-book)", - "source_data": { - "dataset_name": "NaturalQuestions (open-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (open-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.717, - "details": { - "description": "min=0.714, mean=0.717, max=0.724, sum=2.152 (3)", - "tab": "Accuracy", - "NaturalQuestions (closed-book) - ECE (10-bin)": { - "description": "min=0.04, mean=0.042, max=0.046, sum=0.127 (3)", - "tab": "Calibration", - "score": 0.04227945276969597 - }, - "NaturalQuestions (open-book) - ECE (10-bin)": { - "description": "min=0.045, mean=0.057, max=0.074, sum=0.172 (3)", - "tab": "Calibration", - "score": 0.057325907163997956 - }, - "NaturalQuestions (closed-book) - F1 (Robustness)": { - "description": "min=0.156, mean=0.163, max=0.171, sum=0.489 (3)", - "tab": "Robustness", - "score": 0.163031767310864 - }, - "NaturalQuestions (open-book) - F1 (Robustness)": { - "description": "min=0.596, mean=0.605, max=0.616, sum=1.815 (3)", - "tab": "Robustness", - "score": 0.6050162193677248 - }, - "NaturalQuestions (closed-book) - F1 (Fairness)": { - "description": "min=0.165, mean=0.167, max=0.167, sum=0.5 (3)", - "tab": "Fairness", - "score": 
0.16652011745655915 - }, - "NaturalQuestions (open-book) - F1 (Fairness)": { - "description": "min=0.647, mean=0.654, max=0.66, sum=1.962 (3)", - "tab": "Fairness", - "score": 0.6540942012407344 - }, - "NaturalQuestions (closed-book) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NaturalQuestions (open-book) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=109.191, mean=111.191, max=115.191, sum=333.573 (3)", - "tab": "General information", - "score": 111.19099999999999 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=4.428, mean=4.687, max=4.995, sum=14.06 (3)", - "tab": "General information", - "score": 4.6866666666666665 - }, - "NaturalQuestions (closed-book) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.485, mean=4.602, max=4.705, sum=13.807 (3)", - "tab": "General information", - "score": 4.602333333333333 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.039, mean=0.039, max=0.039, sum=0.117 (3)", - "tab": "General information", - "score": 0.039 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1258.15, mean=1471.073, max=1597.431, sum=4413.22 (3)", - "tab": "General information", - "score": 1471.073333333333 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=7.147, mean=7.377, max=7.586, sum=22.131 (3)", - "tab": "General information", - "score": 7.377 - }, - "NaturalQuestions (open-book) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NaturalQuestions (closed-book) - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NaturalQuestions (closed-book) - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NaturalQuestions (closed-book) - Representation (race)": { - "description": "min=0.421, mean=0.465, max=0.506, sum=1.394 (3)", - "tab": "Bias", - "score": 0.46474105132386057 - }, - "NaturalQuestions (closed-book) - Representation (gender)": { - "description": "min=0.1, mean=0.183, max=0.3, sum=0.55 (3)", - "tab": "Bias", - "score": 0.18333333333333335 - }, - "NaturalQuestions (open-book) - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=1.333 (2)", - "tab": "Bias", - "score": 0.6666666666666667 - }, - "NaturalQuestions (open-book) - Stereotypes (gender)": { - "description": "min=0.5, mean=0.5, max=0.5, sum=1.5 (3)", - "tab": "Bias", - "score": 0.5 - }, - 
"NaturalQuestions (open-book) - Representation (race)": { - "description": "min=0.473, mean=0.487, max=0.509, sum=1.46 (3)", - "tab": "Bias", - "score": 0.48677896291115386 - }, - "NaturalQuestions (open-book) - Representation (gender)": { - "description": "min=0.348, mean=0.356, max=0.363, sum=1.068 (3)", - "tab": "Bias", - "score": 0.3560153609831029 - }, - "NaturalQuestions (closed-book) - Toxic fraction": { - "description": "min=0, mean=0.0, max=0.001, sum=0.001 (3)", - "tab": "Toxicity", - "score": 0.0003333333333333333 - }, - "NaturalQuestions (open-book) - Toxic fraction": { - "description": "min=0, mean=0.0, max=0.001, sum=0.001 (3)", - "tab": "Toxicity", - "score": 0.0003333333333333333 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "QuAC", - "source_data": { - "dataset_name": "QuAC", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on QuAC", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.375, - "details": { - "description": "min=0.371, mean=0.375, max=0.379, sum=1.125 (3)", - "tab": "Accuracy", - "QuAC - ECE (10-bin)": { - "description": "min=0.054, mean=0.062, max=0.067, sum=0.186 (3)", - "tab": "Calibration", - "score": 0.06185077042352865 - }, - "QuAC - F1 (Robustness)": { - "description": "min=0.159, mean=0.17, max=0.178, sum=0.511 (3)", - "tab": "Robustness", - "score": 0.17034790269142241 - }, - "QuAC - F1 (Fairness)": { - "description": "min=0.268, mean=0.273, max=0.279, sum=0.819 (3)", - "tab": "Fairness", - "score": 0.2730533859766594 - }, - "QuAC - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "QuAC - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "QuAC - # train": { - "description": "min=0.748, mean=0.848, max=0.933, sum=2.545 (3)", - "tab": "General information", - "score": 0.8483333333333333 - }, - "QuAC - truncated": { - "description": "min=0.022, mean=0.022, max=0.022, sum=0.066 (3)", - "tab": "General information", - "score": 0.022000000000000002 - }, - "QuAC - # prompt tokens": { - "description": "min=1577.224, mean=1610.503, max=1643.74, sum=4831.508 (3)", - "tab": "General information", - "score": 1610.5026666666665 - }, - "QuAC - # output tokens": { - "description": "min=16.185, mean=17.394, max=18.299, sum=52.182 (3)", - "tab": "General information", - "score": 17.394 - }, - "QuAC - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "QuAC - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=2 (3)", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "QuAC - Stereotypes (gender)": { - "description": "min=0.469, mean=0.471, max=0.475, sum=1.414 (3)", - "tab": "Bias", - "score": 0.47144607843137254 - }, - "QuAC - Representation (race)": { - "description": "min=0.312, mean=0.356, max=0.423, sum=1.069 (3)", - "tab": "Bias", - "score": 0.35619490458200137 - }, - "QuAC - Representation (gender)": { - "description": "min=0.236, mean=0.248, max=0.259, sum=0.743 (3)", - "tab": "Bias", - "score": 0.2476420794142787 - }, - "QuAC - Toxic fraction": { - "description": "min=0.002, mean=0.002, max=0.002, 
sum=0.006 (3)", - "tab": "Toxicity", - "score": 0.002 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "HellaSwag", - "source_data": { - "dataset_name": "HellaSwag", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on HellaSwag", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.752, - "details": { - "description": "min=0.752, mean=0.752, max=0.752, sum=0.752 (1)", - "tab": "Accuracy", - "HellaSwag - ECE (10-bin)": { - "description": "min=0.293, mean=0.293, max=0.293, sum=0.293 (1)", - "tab": "Calibration", - "score": 0.2926835489814197 - }, - "HellaSwag - EM (Robustness)": { - "description": "min=0.696, mean=0.696, max=0.696, sum=0.696 (1)", - "tab": "Robustness", - "score": 0.696 - }, - "HellaSwag - EM (Fairness)": { - "description": "min=0.608, mean=0.608, max=0.608, sum=0.608 (1)", - "tab": "Fairness", - "score": 0.608 - }, - "HellaSwag - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "HellaSwag - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "HellaSwag - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag - # prompt tokens": { - "description": "min=88.855, mean=88.855, max=88.855, sum=88.855 (1)", - "tab": "General information", - "score": 88.855 - }, - "HellaSwag - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "HellaSwag - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.55, - "details": { - "description": "min=0.55, mean=0.55, max=0.55, sum=0.55 (1)", - "tab": "Accuracy", - "OpenbookQA - ECE (10-bin)": { - "description": "min=0.25, mean=0.25, max=0.25, sum=0.25 (1)", - "tab": "Calibration", - "score": 0.2504061981122775 - }, - "OpenbookQA - EM (Robustness)": { - "description": "min=0.448, mean=0.448, max=0.448, sum=0.448 (1)", - "tab": "Robustness", - "score": 0.448 - }, - "OpenbookQA - EM (Fairness)": { - "description": "min=0.468, mean=0.468, max=0.468, sum=0.468 (1)", - "tab": "Fairness", - "score": 0.468 - }, - "OpenbookQA - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=0, 
mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=5.358, mean=5.358, max=5.358, sum=5.358 (1)", - "tab": "General information", - "score": 5.358 - }, - "OpenbookQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "OpenbookQA - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "TruthfulQA", - "source_data": { - "dataset_name": "TruthfulQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on TruthfulQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.203, - "details": { - "description": "min=0.197, mean=0.203, max=0.213, sum=0.61 (3)", - "tab": "Accuracy", - "TruthfulQA - ECE (10-bin)": { - "description": "min=0.275, mean=0.3, max=0.332, sum=0.901 (3)", - "tab": "Calibration", - "score": 0.3001833323753285 - }, - "TruthfulQA - EM (Robustness)": { - "description": "min=0.168, mean=0.171, max=0.174, sum=0.512 (3)", - "tab": "Robustness", - "score": 0.17074413863404692 - }, - "TruthfulQA - EM (Fairness)": { - "description": "min=0.154, mean=0.163, max=0.167, sum=0.488 (3)", - "tab": "Fairness", - "score": 0.16258919469928643 - }, - "TruthfulQA - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "TruthfulQA - # eval": { - "description": "min=654, mean=654, max=654, sum=1962 (3)", - "tab": "General information", - "score": 654.0 - }, - "TruthfulQA - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "TruthfulQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "TruthfulQA - # prompt tokens": { - "description": "min=505.315, mean=514.648, max=532.315, sum=1543.945 (3)", - "tab": "General information", - "score": 514.6483180428135 - }, - "TruthfulQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=3 (3)", - "tab": "General information", - "score": 1.0 - }, - "TruthfulQA - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MS MARCO (TREC)", - "source_data": { - "dataset_name": "MS MARCO (TREC)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "NDCG@10 on MS MARCO (TREC)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.709, - "details": { - "description": "min=0.702, mean=0.709, max=0.717, sum=2.128 (3)", - "tab": "Accuracy", - "MS MARCO (regular) - RR@10 (Robustness)": { - "description": "min=0.372, mean=0.387, max=0.401, sum=1.161 (3)", - "tab": "Robustness", - 
"score": 0.386937698412698 - }, - "MS MARCO (TREC) - NDCG@10 (Robustness)": { - "description": "min=0.68, mean=0.685, max=0.689, sum=2.054 (3)", - "tab": "Robustness", - "score": 0.6845367765287401 - }, - "MS MARCO (regular) - RR@10 (Fairness)": { - "description": "min=0.402, mean=0.411, max=0.42, sum=1.232 (3)", - "tab": "Fairness", - "score": 0.4107572751322747 - }, - "MS MARCO (TREC) - NDCG@10 (Fairness)": { - "description": "min=0.68, mean=0.69, max=0.696, sum=2.069 (3)", - "tab": "Fairness", - "score": 0.6896233668786421 - }, - "MS MARCO (regular) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "MS MARCO (TREC) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "MS MARCO (regular) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "MS MARCO (regular) - # train": { - "description": "min=2, mean=2, max=2, sum=6 (3)", - "tab": "General information", - "score": 2.0 - }, - "MS MARCO (regular) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "MS MARCO (regular) - # prompt tokens": { - "description": "min=497.281, mean=536.614, max=583.281, sum=1609.843 (3)", - "tab": "General information", - "score": 536.6143333333333 - }, - "MS MARCO (regular) - # output tokens": { - "description": "min=1, mean=1, max=1, sum=3 (3)", - "tab": "General information", - "score": 1.0 - }, - "MS MARCO (regular) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "MS MARCO (TREC) - # eval": { - "description": "min=43, mean=43, max=43, sum=129 (3)", - "tab": "General information", - "score": 43.0 - }, - "MS MARCO (TREC) - # train": { - "description": "min=2, mean=2, max=2, sum=6 (3)", - "tab": "General information", - "score": 2.0 - }, - "MS MARCO (TREC) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "MS MARCO (TREC) - # prompt tokens": { - "description": "min=480.163, mean=519.496, max=566.163, sum=1558.488 (3)", - "tab": "General information", - "score": 519.4961240310078 - }, - "MS MARCO (TREC) - # output tokens": { - "description": "min=1, mean=1, max=1, sum=3 (3)", - "tab": "General information", - "score": 1.0 - }, - "MS MARCO (TREC) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "MS MARCO (regular) - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) 
- Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - }, - "MS MARCO (TREC) - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CNN/DailyMail", - "source_data": { - "dataset_name": "CNN/DailyMail", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on CNN/DailyMail", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.153, - "details": { - "description": "min=0.15, mean=0.153, max=0.158, sum=0.919 (6)", - "tab": "Accuracy", - "CNN/DailyMail - Denoised inference time (s)": { - "description": "2 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "CNN/DailyMail - # eval": { - "description": "min=466, mean=466, max=466, sum=2796 (6)", - "tab": "General information", - "score": 466.0 - }, - "CNN/DailyMail - # train": { - "description": "min=5, mean=5, max=5, sum=30 (6)", - "tab": "General information", - "score": 5.0 - }, - "CNN/DailyMail - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (6)", - "tab": "General information", - "score": 0.0 - }, - "CNN/DailyMail - # prompt tokens": { - "description": "min=1555.036, mean=1575.036, max=1602.036, sum=9450.219 (6)", - "tab": "General information", - "score": 1575.0364806866953 - }, - "CNN/DailyMail - # output tokens": { - "description": "min=69.622, mean=73.723, max=77.732, sum=442.339 (6)", - "tab": "General information", - "score": 73.72317596566523 - }, - "CNN/DailyMail - # trials": { - "description": "min=3, mean=3, max=3, sum=18 (6)", - "tab": "General information", - "score": 3.0 - }, - "CNN/DailyMail - Stereotypes (race)": { - "description": "min=0.594, mean=0.603, max=0.609, sum=3.618 (6)", - "tab": "Bias", - "score": 0.6029930306246096 - }, - "CNN/DailyMail - Stereotypes (gender)": { - "description": "min=0.384, mean=0.408, max=0.421, sum=2.449 (6)", - "tab": "Bias", - "score": 0.40820094830714143 - }, - "CNN/DailyMail - Representation (race)": { - "description": "min=0.245, mean=0.259, max=0.269, sum=1.553 (6)", - "tab": "Bias", - "score": 0.2588148950314076 - }, - "CNN/DailyMail - Representation (gender)": { - "description": "min=0.116, mean=0.121, max=0.127, sum=0.724 (6)", - "tab": "Bias", - "score": 0.1206019792299876 - }, - "CNN/DailyMail - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (6)", - "tab": "Toxicity", - "score": 0.0 - }, - "CNN/DailyMail - SummaC": { - "description": "min=0.318, mean=0.331, max=0.342, sum=0.992 (3)", - "tab": "Summarization metrics", - "score": 0.3306993242099164 - }, - "CNN/DailyMail - QAFactEval": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - BERTScore (F1)": { - "description": "min=0.289, mean=0.296, max=0.305, sum=0.888 (3)", - "tab": "Summarization metrics", - "score": 
0.29605955170271475 - }, - "CNN/DailyMail - Coverage": { - "description": "min=0.974, mean=0.975, max=0.975, sum=5.848 (6)", - "tab": "Summarization metrics", - "score": 0.9746996636764317 - }, - "CNN/DailyMail - Density": { - "description": "min=28.678, mean=31.707, max=36.132, sum=190.245 (6)", - "tab": "Summarization metrics", - "score": 31.707488870766706 - }, - "CNN/DailyMail - Compression": { - "description": "min=9.108, mean=9.688, max=10.161, sum=58.13 (6)", - "tab": "Summarization metrics", - "score": 9.688415513712991 - }, - "CNN/DailyMail - HumanEval-faithfulness": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-relevance": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-coherence": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "XSUM", - "source_data": { - "dataset_name": "XSUM", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on XSUM", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.122, - "details": { - "description": "min=0.122, mean=0.122, max=0.122, sum=0.73 (6)", - "tab": "Accuracy", - "XSUM - Denoised inference time (s)": { - "description": "2 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "XSUM - # eval": { - "description": "min=518, mean=518, max=518, sum=3108 (6)", - "tab": "General information", - "score": 518.0 - }, - "XSUM - # train": { - "description": "min=4.996, mean=4.997, max=5, sum=29.985 (6)", - "tab": "General information", - "score": 4.997425997425997 - }, - "XSUM - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (6)", - "tab": "General information", - "score": 0.0 - }, - "XSUM - # prompt tokens": { - "description": "min=1484.608, mean=1537.293, max=1572.616, sum=9223.757 (6)", - "tab": "General information", - "score": 1537.2927927927929 - }, - "XSUM - # output tokens": { - "description": "min=22.674, mean=23.421, max=24.095, sum=140.529 (6)", - "tab": "General information", - "score": 23.421492921492924 - }, - "XSUM - # trials": { - "description": "min=3, mean=3, max=3, sum=18 (6)", - "tab": "General information", - "score": 3.0 - }, - "XSUM - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=4 (6)", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "XSUM - Stereotypes (gender)": { - "description": "min=0.445, mean=0.454, max=0.467, sum=2.725 (6)", - "tab": "Bias", - "score": 0.45422077922077925 - }, - "XSUM - Representation (race)": { - "description": "min=0.483, mean=0.505, max=0.524, sum=3.031 (6)", - "tab": "Bias", - "score": 0.5051915503043323 - }, - "XSUM - Representation (gender)": { - "description": "min=0.198, mean=0.215, max=0.235, sum=1.29 (6)", - "tab": "Bias", - "score": 0.2150586429483566 - }, - "XSUM - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (6)", - "tab": "Toxicity", - "score": 0.0 - }, - "XSUM - SummaC": { - "description": "min=-0.244, mean=-0.239, max=-0.235, sum=-0.716 (3)", - "tab": "Summarization metrics", - "score": 
-0.23871033593647883 - }, - "XSUM - QAFactEval": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - BERTScore (F1)": { - "description": "min=0.417, mean=0.418, max=0.42, sum=1.254 (3)", - "tab": "Summarization metrics", - "score": 0.4181413420706151 - }, - "XSUM - Coverage": { - "description": "min=0.823, mean=0.824, max=0.826, sum=4.943 (6)", - "tab": "Summarization metrics", - "score": 0.8238944118657666 - }, - "XSUM - Density": { - "description": "min=2.687, mean=2.793, max=2.942, sum=16.758 (6)", - "tab": "Summarization metrics", - "score": 2.7930375453507623 - }, - "XSUM - Compression": { - "description": "min=17.475, mean=18.017, max=18.57, sum=108.1 (6)", - "tab": "Summarization metrics", - "score": 18.016669951894464 - }, - "XSUM - HumanEval-faithfulness": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-relevance": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-coherence": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "IMDB", - "source_data": { - "dataset_name": "IMDB", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on IMDB", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.961, - "details": { - "description": "min=0.959, mean=0.961, max=0.962, sum=2.882 (3)", - "tab": "Accuracy", - "IMDB - ECE (10-bin)": { - "description": "min=0.011, mean=0.014, max=0.019, sum=0.043 (3)", - "tab": "Calibration", - "score": 0.014204038428277976 - }, - "IMDB - EM (Robustness)": { - "description": "min=0.917, mean=0.921, max=0.925, sum=2.762 (3)", - "tab": "Robustness", - "score": 0.9206666666666669 - }, - "IMDB - EM (Fairness)": { - "description": "min=0.946, mean=0.95, max=0.954, sum=2.851 (3)", - "tab": "Fairness", - "score": 0.9503333333333334 - }, - "IMDB - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "IMDB - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "IMDB - # train": { - "description": "min=2.89, mean=4.217, max=4.981, sum=12.652 (3)", - "tab": "General information", - "score": 4.217333333333333 - }, - "IMDB - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "IMDB - # prompt tokens": { - "description": "min=1282.318, mean=1557.741, max=1776.111, sum=4673.222 (3)", - "tab": "General information", - "score": 1557.7406666666666 - }, - "IMDB - # output tokens": { - "description": "min=1, mean=1, max=1, sum=3 (3)", - "tab": "General information", - "score": 1.0 - }, - "IMDB - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "IMDB - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Stereotypes (gender)": { - "description": "1 matching runs, 
but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CivilComments", - "source_data": { - "dataset_name": "CivilComments", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on CivilComments", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.54, - "details": { - "description": "min=0.009, mean=0.54, max=1, sum=29.17 (54)", - "tab": "Accuracy", - "CivilComments - ECE (10-bin)": { - "description": "min=0.113, mean=0.358, max=0.735, sum=19.322 (54)", - "tab": "Calibration", - "score": 0.3578234752080933 - }, - "CivilComments - EM (Robustness)": { - "description": "min=0, mean=0.468, max=1, sum=25.26 (54)", - "tab": "Robustness", - "score": 0.46778473308233626 - }, - "CivilComments - EM (Fairness)": { - "description": "min=0.002, mean=0.496, max=1, sum=26.757 (54)", - "tab": "Fairness", - "score": 0.4955072296924251 - }, - "CivilComments - Denoised inference time (s)": { - "description": "9 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "CivilComments - # eval": { - "description": "min=74, mean=371.556, max=683, sum=20064 (54)", - "tab": "General information", - "score": 371.55555555555554 - }, - "CivilComments - # train": { - "description": "min=5, mean=5, max=5, sum=270 (54)", - "tab": "General information", - "score": 5.0 - }, - "CivilComments - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (54)", - "tab": "General information", - "score": 0.0 - }, - "CivilComments - # prompt tokens": { - "description": "min=362.293, mean=732.514, max=1288.441, sum=39555.782 (54)", - "tab": "General information", - "score": 732.5144825548033 - }, - "CivilComments - # output tokens": { - "description": "min=1, mean=1, max=1, sum=54 (54)", - "tab": "General information", - "score": 1.0 - }, - "CivilComments - # trials": { - "description": "min=3, mean=3, max=3, sum=162 (54)", - "tab": "General information", - "score": 3.0 - }, - "CivilComments - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (54)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "RAFT", - "source_data": { - "dataset_name": "RAFT", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - 
"evaluation_description": "EM on RAFT", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.634, - "details": { - "description": "min=0.05, mean=0.634, max=0.975, sum=20.925 (33)", - "tab": "Accuracy", - "RAFT - ECE (10-bin)": { - "description": "min=0.05, mean=0.274, max=0.84, sum=9.055 (33)", - "tab": "Calibration", - "score": 0.2744070774220778 - }, - "RAFT - EM (Robustness)": { - "description": "min=0, mean=0.552, max=0.975, sum=18.225 (33)", - "tab": "Robustness", - "score": 0.5522727272727274 - }, - "RAFT - EM (Fairness)": { - "description": "min=0.05, mean=0.609, max=0.975, sum=20.1 (33)", - "tab": "Fairness", - "score": 0.609090909090909 - }, - "RAFT - Denoised inference time (s)": { - "description": "11 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "RAFT - # eval": { - "description": "min=40, mean=40, max=40, sum=1320 (33)", - "tab": "General information", - "score": 40.0 - }, - "RAFT - # train": { - "description": "min=0, mean=4.554, max=5, sum=150.275 (33)", - "tab": "General information", - "score": 4.553787878787879 - }, - "RAFT - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (33)", - "tab": "General information", - "score": 0.0 - }, - "RAFT - # prompt tokens": { - "description": "min=270.325, mean=813.265, max=1762.475, sum=26837.75 (33)", - "tab": "General information", - "score": 813.2651515151515 - }, - "RAFT - # output tokens": { - "description": "min=0.2, mean=3.148, max=6.3, sum=103.875 (33)", - "tab": "General information", - "score": 3.1477272727272725 - }, - "RAFT - # trials": { - "description": "min=3, mean=3, max=3, sum=99 (33)", - "tab": "General information", - "score": 3.0 - }, - "RAFT - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (33)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_classic/cohere/Cohere-large-v20220720-13.1B/579fb908-3c36-4ff8-a262-fd5388806b83.json b/data/helm_classic/cohere/Cohere-large-v20220720-13.1B/579fb908-3c36-4ff8-a262-fd5388806b83.json deleted file mode 100644 index 16c06b937..000000000 --- a/data/helm_classic/cohere/Cohere-large-v20220720-13.1B/579fb908-3c36-4ff8-a262-fd5388806b83.json +++ /dev/null @@ -1,1613 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_classic/cohere_Cohere-large-v20220720-13.1B/1770834891.1472661", - "retrieved_timestamp": "1770834891.1472661", - "source_metadata": { - "source_name": "helm_classic", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Cohere large v20220720 13.1B", - "id": "cohere/Cohere-large-v20220720-13.1B", - "developer": "cohere", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_classic", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperform on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.372, - "details": { - "tab": "Accuracy", - "Mean win rate - Calibration": { - "description": null, - "tab": "Calibration", - "score": 0.6524936901131783 - }, - "Mean win rate - Robustness": { - "description": null, - "tab": "Robustness", - "score": 0.3450884302942145 - }, - "Mean win rate - Fairness": { - "description": null, - "tab": "Fairness", - "score": 0.3621096552687209 - }, - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.40696820175438597 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - }, - "Mean win rate - Bias": { - "description": null, - "tab": "Bias", - "score": 0.5413536579003514 - }, - "Mean win rate - Toxicity": { - "description": null, - "tab": "Toxicity", - "score": 0.48450623450623453 - }, - "Mean win rate - Summarization metrics": { - "description": null, - "tab": "Summarization metrics", - "score": 0.5760442773600668 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.324, - "details": { - "description": "min=0.19, mean=0.324, max=0.4, sum=4.854 (15)", - "tab": "Accuracy", - "MMLU - ECE (10-bin)": { - "description": "min=0.075, mean=0.112, max=0.151, sum=1.678 (15)", - "tab": "Calibration", - "score": 0.11188578153206447 - }, - "MMLU - EM (Robustness)": { - "description": "min=0.15, mean=0.253, max=0.35, sum=3.799 (15)", - "tab": "Robustness", - "score": 0.25327485380116954 - }, - "MMLU - EM (Fairness)": { - "description": "min=0.14, mean=0.281, max=0.38, sum=4.214 (15)", - "tab": "Fairness", - "score": 0.2809590643274854 - }, - "MMLU - Denoised inference time (s)": { - "description": "min=0.292, mean=0.317, max=0.349, sum=4.752 (15)", - "tab": "Efficiency", - "score": 0.3167793253495066 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=1542 (15)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=75 (15)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (15)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=372.75, mean=481.26, max=628.421, sum=7218.903 (15)", - "tab": "General information", - "score": 481.2602105263158 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=15 (15)", - "tab": "General information", - "score": 1.0 - }, - "MMLU - # trials": { - "description": "min=3, mean=3, max=3, sum=45 (15)", - "tab": "General information", - "score": 3.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "BoolQ", - "source_data": { - 
"dataset_name": "BoolQ", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on BoolQ", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.725, - "details": { - "description": "min=0.705, mean=0.725, max=0.738, sum=2.176 (3)", - "tab": "Accuracy", - "BoolQ - ECE (10-bin)": { - "description": "min=0.066, mean=0.088, max=0.106, sum=0.265 (3)", - "tab": "Calibration", - "score": 0.08825401206422555 - }, - "BoolQ - EM (Robustness)": { - "description": "min=0.514, mean=0.545, max=0.566, sum=1.635 (3)", - "tab": "Robustness", - "score": 0.545 - }, - "BoolQ - EM (Fairness)": { - "description": "min=0.653, mean=0.676, max=0.695, sum=2.027 (3)", - "tab": "Fairness", - "score": 0.6756666666666667 - }, - "BoolQ - Denoised inference time (s)": { - "description": "min=0.359, mean=0.421, max=0.505, sum=1.263 (3)", - "tab": "Efficiency", - "score": 0.4208381308593749 - }, - "BoolQ - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "BoolQ - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "BoolQ - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "BoolQ - # prompt tokens": { - "description": "min=669.307, mean=925.307, max=1269.307, sum=2775.921 (3)", - "tab": "General information", - "score": 925.3070000000001 - }, - "BoolQ - # output tokens": { - "description": "min=1, mean=1, max=1, sum=3 (3)", - "tab": "General information", - "score": 1.0 - }, - "BoolQ - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "BoolQ - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.625, - "details": { - "description": "min=0.581, mean=0.625, max=0.647, sum=1.874 (3)", - "tab": "Accuracy", - "NarrativeQA - ECE (10-bin)": { - "description": "min=0.032, mean=0.037, max=0.044, sum=0.11 (3)", - "tab": "Calibration", - "score": 0.03650754887085305 - }, - "NarrativeQA - F1 (Robustness)": { - "description": "min=0.318, mean=0.357, max=0.38, sum=1.072 (3)", - "tab": "Robustness", - "score": 0.3573511654752053 - }, - "NarrativeQA - F1 (Fairness)": { - "description": "min=0.466, mean=0.512, max=0.538, sum=1.537 (3)", - "tab": "Fairness", - "score": 
0.5123186802559418 - }, - "NarrativeQA - Denoised inference time (s)": { - "description": "min=0.693, mean=0.729, max=0.782, sum=2.186 (3)", - "tab": "Efficiency", - "score": 0.7286962533010564 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=1065 (3)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=0.958, mean=1.562, max=1.997, sum=4.687 (3)", - "tab": "General information", - "score": 1.5624413145539906 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=1601.997, mean=1634.99, max=1693.155, sum=4904.969 (3)", - "tab": "General information", - "score": 1634.9896713615024 - }, - "NarrativeQA - # output tokens": { - "description": "min=5.535, mean=6.91, max=9.504, sum=20.73 (3)", - "tab": "General information", - "score": 6.909859154929578 - }, - "NarrativeQA - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NarrativeQA - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NarrativeQA - Stereotypes (gender)": { - "description": "min=0.418, mean=0.473, max=0.5, sum=1.418 (3)", - "tab": "Bias", - "score": 0.4726495726495727 - }, - "NarrativeQA - Representation (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=2 (3)", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "NarrativeQA - Representation (gender)": { - "description": "min=0.193, mean=0.202, max=0.211, sum=0.607 (3)", - "tab": "Bias", - "score": 0.20233455199447267 - }, - "NarrativeQA - Toxic fraction": { - "description": "min=0.014, mean=0.017, max=0.02, sum=0.051 (3)", - "tab": "Toxicity", - "score": 0.016901408450704227 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (open-book)", - "source_data": { - "dataset_name": "NaturalQuestions (open-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (open-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.573, - "details": { - "description": "min=0.553, mean=0.573, max=0.584, sum=1.72 (3)", - "tab": "Accuracy", - "NaturalQuestions (closed-book) - ECE (10-bin)": { - "description": "min=0.02, mean=0.025, max=0.032, sum=0.074 (3)", - "tab": "Calibration", - "score": 0.024639111727299556 - }, - "NaturalQuestions (open-book) - ECE (10-bin)": { - "description": "min=0.117, mean=0.143, max=0.158, sum=0.43 (3)", - "tab": "Calibration", - "score": 0.14321248401208217 - }, - "NaturalQuestions (closed-book) - F1 (Robustness)": { - "description": "min=0.16, mean=0.172, max=0.18, sum=0.515 (3)", - "tab": "Robustness", - "score": 0.17161461010403287 - }, - "NaturalQuestions (open-book) - F1 (Robustness)": { - "description": "min=0.287, mean=0.347, max=0.38, sum=1.041 (3)", - "tab": "Robustness", - "score": 0.3470084296370371 - }, - "NaturalQuestions (closed-book) - F1 (Fairness)": { - "description": "min=0.176, mean=0.178, max=0.181, sum=0.535 (3)", - "tab": "Fairness", - "score": 0.17833773739586523 - }, - "NaturalQuestions (open-book) - F1 (Fairness)": { - "description": "min=0.489, mean=0.507, 
max=0.516, sum=1.52 (3)", - "tab": "Fairness", - "score": 0.5065982888177307 - }, - "NaturalQuestions (closed-book) - Denoised inference time (s)": { - "description": "min=0.332, mean=0.337, max=0.343, sum=1.012 (3)", - "tab": "Efficiency", - "score": 0.33722079557291607 - }, - "NaturalQuestions (open-book) - Denoised inference time (s)": { - "description": "min=0.681, mean=0.774, max=0.827, sum=2.321 (3)", - "tab": "Efficiency", - "score": 0.7738100833333333 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=109.191, mean=111.191, max=115.191, sum=333.573 (3)", - "tab": "General information", - "score": 111.19099999999999 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=5.441, mean=5.625, max=5.917, sum=16.875 (3)", - "tab": "General information", - "score": 5.625 - }, - "NaturalQuestions (closed-book) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.538, mean=4.633, max=4.715, sum=13.899 (3)", - "tab": "General information", - "score": 4.633 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.039, mean=0.039, max=0.039, sum=0.117 (3)", - "tab": "General information", - "score": 0.039 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1261.72, mean=1481.344, max=1608.455, sum=4444.032 (3)", - "tab": "General information", - "score": 1481.344 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=8.71, mean=10.443, max=11.438, sum=31.329 (3)", - "tab": "General information", - "score": 10.443 - }, - "NaturalQuestions (open-book) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NaturalQuestions (closed-book) - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NaturalQuestions (closed-book) - Stereotypes (gender)": { - "description": "min=0.25, mean=0.333, max=0.5, sum=1 (3)", - "tab": "Bias", - "score": 0.3333333333333333 - }, - "NaturalQuestions (closed-book) - Representation (race)": { - "description": "min=0.244, mean=0.34, max=0.429, sum=1.021 (3)", - "tab": "Bias", - "score": 0.34034751045060324 - }, - "NaturalQuestions (closed-book) - Representation (gender)": { - "description": "min=0.208, mean=0.233, max=0.269, sum=0.7 (3)", - "tab": "Bias", - "score": 0.23326210826210825 - }, - "NaturalQuestions (open-book) - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=2 (3)", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "NaturalQuestions (open-book) - Stereotypes (gender)": { - "description": "min=0.37, mean=0.39, max=0.4, sum=1.17 (3)", - "tab": "Bias", - "score": 0.38999999999999996 - }, - "NaturalQuestions (open-book) - Representation 
(race)": { - "description": "min=0.447, mean=0.457, max=0.467, sum=1.371 (3)", - "tab": "Bias", - "score": 0.45706182643221777 - }, - "NaturalQuestions (open-book) - Representation (gender)": { - "description": "min=0.125, mean=0.174, max=0.251, sum=0.523 (3)", - "tab": "Bias", - "score": 0.17447005829358772 - }, - "NaturalQuestions (closed-book) - Toxic fraction": { - "description": "min=0, mean=0.0, max=0.001, sum=0.001 (3)", - "tab": "Toxicity", - "score": 0.0003333333333333333 - }, - "NaturalQuestions (open-book) - Toxic fraction": { - "description": "min=0.001, mean=0.002, max=0.003, sum=0.005 (3)", - "tab": "Toxicity", - "score": 0.0016666666666666668 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "QuAC", - "source_data": { - "dataset_name": "QuAC", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on QuAC", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.338, - "details": { - "description": "min=0.335, mean=0.338, max=0.343, sum=1.015 (3)", - "tab": "Accuracy", - "QuAC - ECE (10-bin)": { - "description": "min=0.03, mean=0.033, max=0.036, sum=0.099 (3)", - "tab": "Calibration", - "score": 0.03288362014267938 - }, - "QuAC - F1 (Robustness)": { - "description": "min=0.197, mean=0.204, max=0.211, sum=0.613 (3)", - "tab": "Robustness", - "score": 0.20424911828028136 - }, - "QuAC - F1 (Fairness)": { - "description": "min=0.251, mean=0.256, max=0.259, sum=0.768 (3)", - "tab": "Fairness", - "score": 0.25613799535824233 - }, - "QuAC - Denoised inference time (s)": { - "description": "min=1.189, mean=1.262, max=1.309, sum=3.785 (3)", - "tab": "Efficiency", - "score": 1.261730263346353 - }, - "QuAC - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "QuAC - # train": { - "description": "min=0.797, mean=0.881, max=0.969, sum=2.644 (3)", - "tab": "General information", - "score": 0.8813333333333334 - }, - "QuAC - truncated": { - "description": "min=0.02, mean=0.02, max=0.02, sum=0.06 (3)", - "tab": "General information", - "score": 0.02 - }, - "QuAC - # prompt tokens": { - "description": "min=1600.292, mean=1639.784, max=1661.675, sum=4919.353 (3)", - "tab": "General information", - "score": 1639.784333333333 - }, - "QuAC - # output tokens": { - "description": "min=26.693, mean=30.036, max=32.515, sum=90.109 (3)", - "tab": "General information", - "score": 30.036333333333335 - }, - "QuAC - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "QuAC - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=2 (3)", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "QuAC - Stereotypes (gender)": { - "description": "min=0.43, mean=0.441, max=0.46, sum=1.322 (3)", - "tab": "Bias", - "score": 0.4407422751666938 - }, - "QuAC - Representation (race)": { - "description": "min=0.306, mean=0.338, max=0.358, sum=1.015 (3)", - "tab": "Bias", - "score": 0.3382593663469334 - }, - "QuAC - Representation (gender)": { - "description": "min=0.234, mean=0.238, max=0.243, sum=0.714 (3)", - "tab": "Bias", - "score": 0.23804653081585347 - }, - "QuAC - Toxic fraction": { - "description": "min=0.003, mean=0.003, max=0.004, sum=0.01 (3)", - "tab": 
"Toxicity", - "score": 0.0033333333333333335 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "HellaSwag", - "source_data": { - "dataset_name": "HellaSwag", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on HellaSwag", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.736, - "details": { - "description": "min=0.736, mean=0.736, max=0.736, sum=0.736 (1)", - "tab": "Accuracy", - "HellaSwag - ECE (10-bin)": { - "description": "min=0.288, mean=0.288, max=0.288, sum=0.288 (1)", - "tab": "Calibration", - "score": 0.28820318504565584 - }, - "HellaSwag - EM (Robustness)": { - "description": "min=0.687, mean=0.687, max=0.687, sum=0.687 (1)", - "tab": "Robustness", - "score": 0.687 - }, - "HellaSwag - EM (Fairness)": { - "description": "min=0.575, mean=0.575, max=0.575, sum=0.575 (1)", - "tab": "Fairness", - "score": 0.575 - }, - "HellaSwag - Denoised inference time (s)": { - "description": "min=0.225, mean=0.225, max=0.225, sum=0.225 (1)", - "tab": "Efficiency", - "score": 0.22464337890624972 - }, - "HellaSwag - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "HellaSwag - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag - # prompt tokens": { - "description": "min=88.855, mean=88.855, max=88.855, sum=88.855 (1)", - "tab": "General information", - "score": 88.855 - }, - "HellaSwag - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.542, - "details": { - "description": "min=0.542, mean=0.542, max=0.542, sum=0.542 (1)", - "tab": "Accuracy", - "OpenbookQA - ECE (10-bin)": { - "description": "min=0.225, mean=0.225, max=0.225, sum=0.225 (1)", - "tab": "Calibration", - "score": 0.2254334966206393 - }, - "OpenbookQA - EM (Robustness)": { - "description": "min=0.43, mean=0.43, max=0.43, sum=0.43 (1)", - "tab": "Robustness", - "score": 0.43 - }, - "OpenbookQA - EM (Fairness)": { - "description": "min=0.446, mean=0.446, max=0.446, sum=0.446 (1)", - "tab": "Fairness", - "score": 0.446 - }, - "OpenbookQA - Denoised inference time (s)": { - "description": "min=0.201, mean=0.201, max=0.201, sum=0.201 (1)", - "tab": "Efficiency", - "score": 0.2014860078125007 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # 
train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=5.358, mean=5.358, max=5.358, sum=5.358 (1)", - "tab": "General information", - "score": 5.358 - }, - "OpenbookQA - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "TruthfulQA", - "source_data": { - "dataset_name": "TruthfulQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on TruthfulQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.181, - "details": { - "description": "min=0.161, mean=0.181, max=0.2, sum=0.544 (3)", - "tab": "Accuracy", - "TruthfulQA - ECE (10-bin)": { - "description": "min=0.097, mean=0.105, max=0.117, sum=0.316 (3)", - "tab": "Calibration", - "score": 0.10528939288118344 - }, - "TruthfulQA - EM (Robustness)": { - "description": "min=0.141, mean=0.154, max=0.173, sum=0.462 (3)", - "tab": "Robustness", - "score": 0.15392456676860344 - }, - "TruthfulQA - EM (Fairness)": { - "description": "min=0.142, mean=0.157, max=0.174, sum=0.471 (3)", - "tab": "Fairness", - "score": 0.15698267074413863 - }, - "TruthfulQA - Denoised inference time (s)": { - "description": "min=0.323, mean=0.325, max=0.328, sum=0.975 (3)", - "tab": "Efficiency", - "score": 0.3248777191442089 - }, - "TruthfulQA - # eval": { - "description": "min=654, mean=654, max=654, sum=1962 (3)", - "tab": "General information", - "score": 654.0 - }, - "TruthfulQA - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "TruthfulQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "TruthfulQA - # prompt tokens": { - "description": "min=505.315, mean=514.648, max=532.315, sum=1543.945 (3)", - "tab": "General information", - "score": 514.6483180428135 - }, - "TruthfulQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=3 (3)", - "tab": "General information", - "score": 1.0 - }, - "TruthfulQA - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MS MARCO (TREC)", - "source_data": { - "dataset_name": "MS MARCO (TREC)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "NDCG@10 on MS MARCO (TREC)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.33, - "details": { - "description": "min=0.292, mean=0.33, max=0.382, sum=0.991 (3)", - "tab": "Accuracy", - "MS MARCO (regular) - RR@10 (Robustness)": { - "description": "min=0.109, 
mean=0.13, max=0.147, sum=0.39 (3)", - "tab": "Robustness", - "score": 0.1300338624338624 - }, - "MS MARCO (TREC) - NDCG@10 (Robustness)": { - "description": "min=0.195, mean=0.257, max=0.323, sum=0.772 (3)", - "tab": "Robustness", - "score": 0.2574506868270638 - }, - "MS MARCO (regular) - RR@10 (Fairness)": { - "description": "min=0.136, mean=0.164, max=0.189, sum=0.493 (3)", - "tab": "Fairness", - "score": 0.16423492063492048 - }, - "MS MARCO (TREC) - NDCG@10 (Fairness)": { - "description": "min=0.273, mean=0.312, max=0.361, sum=0.936 (3)", - "tab": "Fairness", - "score": 0.3120660241438415 - }, - "MS MARCO (regular) - Denoised inference time (s)": { - "description": "min=0.322, mean=0.33, max=0.339, sum=0.989 (3)", - "tab": "Efficiency", - "score": 0.3298234970703125 - }, - "MS MARCO (TREC) - Denoised inference time (s)": { - "description": "min=0.319, mean=0.327, max=0.335, sum=0.98 (3)", - "tab": "Efficiency", - "score": 0.32664419815891477 - }, - "MS MARCO (regular) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "MS MARCO (regular) - # train": { - "description": "min=2, mean=2, max=2, sum=6 (3)", - "tab": "General information", - "score": 2.0 - }, - "MS MARCO (regular) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "MS MARCO (regular) - # prompt tokens": { - "description": "min=497.281, mean=536.614, max=583.281, sum=1609.843 (3)", - "tab": "General information", - "score": 536.6143333333333 - }, - "MS MARCO (regular) - # output tokens": { - "description": "min=1.008, mean=1.025, max=1.046, sum=3.074 (3)", - "tab": "General information", - "score": 1.0246666666666666 - }, - "MS MARCO (regular) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "MS MARCO (TREC) - # eval": { - "description": "min=43, mean=43, max=43, sum=129 (3)", - "tab": "General information", - "score": 43.0 - }, - "MS MARCO (TREC) - # train": { - "description": "min=2, mean=2, max=2, sum=6 (3)", - "tab": "General information", - "score": 2.0 - }, - "MS MARCO (TREC) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "MS MARCO (TREC) - # prompt tokens": { - "description": "min=480.163, mean=519.496, max=566.163, sum=1558.488 (3)", - "tab": "General information", - "score": 519.4961240310078 - }, - "MS MARCO (TREC) - # output tokens": { - "description": "min=1.023, mean=1.031, max=1.047, sum=3.093 (3)", - "tab": "General information", - "score": 1.0310077519379846 - }, - "MS MARCO (TREC) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "MS MARCO (regular) - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": 
null - }, - "MS MARCO (TREC) - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - }, - "MS MARCO (TREC) - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CNN/DailyMail", - "source_data": { - "dataset_name": "CNN/DailyMail", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on CNN/DailyMail", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.126, - "details": { - "description": "min=0.115, mean=0.126, max=0.134, sum=0.758 (6)", - "tab": "Accuracy", - "CNN/DailyMail - Denoised inference time (s)": { - "description": "min=2.097, mean=2.269, max=2.366, sum=13.614 (6)", - "tab": "Efficiency", - "score": 2.2689930690607114 - }, - "CNN/DailyMail - # eval": { - "description": "min=466, mean=466, max=466, sum=2796 (6)", - "tab": "General information", - "score": 466.0 - }, - "CNN/DailyMail - # train": { - "description": "min=5, mean=5, max=5, sum=30 (6)", - "tab": "General information", - "score": 5.0 - }, - "CNN/DailyMail - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (6)", - "tab": "General information", - "score": 0.0 - }, - "CNN/DailyMail - # prompt tokens": { - "description": "min=1555.036, mean=1575.036, max=1602.036, sum=9450.219 (6)", - "tab": "General information", - "score": 1575.0364806866953 - }, - "CNN/DailyMail - # output tokens": { - "description": "min=67.079, mean=74.505, max=78.916, sum=447.03 (6)", - "tab": "General information", - "score": 74.50500715307582 - }, - "CNN/DailyMail - # trials": { - "description": "min=3, mean=3, max=3, sum=18 (6)", - "tab": "General information", - "score": 3.0 - }, - "CNN/DailyMail - Stereotypes (race)": { - "description": "min=0.58, mean=0.626, max=0.659, sum=3.756 (6)", - "tab": "Bias", - "score": 0.6260369618341756 - }, - "CNN/DailyMail - Stereotypes (gender)": { - "description": "min=0.371, mean=0.401, max=0.431, sum=2.409 (6)", - "tab": "Bias", - "score": 0.40149048314255253 - }, - "CNN/DailyMail - Representation (race)": { - "description": "min=0.185, mean=0.238, max=0.295, sum=1.431 (6)", - "tab": "Bias", - "score": 0.23843844144516976 - }, - "CNN/DailyMail - Representation (gender)": { - "description": "min=0.115, mean=0.134, max=0.153, sum=0.805 (6)", - "tab": "Bias", - "score": 0.1341289455316015 - }, - "CNN/DailyMail - Toxic fraction": { - "description": "min=0, mean=0.001, max=0.002, sum=0.004 (6)", - "tab": "Toxicity", - "score": 0.000715307582260372 - }, - "CNN/DailyMail - SummaC": { - "description": "min=0.447, mean=0.5, max=0.543, sum=1.499 (3)", - "tab": "Summarization metrics", - "score": 0.4997740334832678 - }, - "CNN/DailyMail - QAFactEval": { - "description": "min=4.715, mean=4.763, max=4.822, 
sum=28.58 (6)", - "tab": "Summarization metrics", - "score": 4.763415476947068 - }, - "CNN/DailyMail - BERTScore (F1)": { - "description": "min=0.227, mean=0.246, max=0.263, sum=0.737 (3)", - "tab": "Summarization metrics", - "score": 0.2457600895432969 - }, - "CNN/DailyMail - Coverage": { - "description": "min=0.903, mean=0.946, max=0.975, sum=5.678 (6)", - "tab": "Summarization metrics", - "score": 0.9463649022058865 - }, - "CNN/DailyMail - Density": { - "description": "min=30.364, mean=37.733, max=45.984, sum=226.401 (6)", - "tab": "Summarization metrics", - "score": 37.73347863579329 - }, - "CNN/DailyMail - Compression": { - "description": "min=9.977, mean=11.27, max=13.424, sum=67.62 (6)", - "tab": "Summarization metrics", - "score": 11.269948645908789 - }, - "CNN/DailyMail - HumanEval-faithfulness": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-relevance": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-coherence": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "XSUM", - "source_data": { - "dataset_name": "XSUM", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on XSUM", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.108, - "details": { - "description": "min=0.106, mean=0.108, max=0.11, sum=0.649 (6)", - "tab": "Accuracy", - "XSUM - Denoised inference time (s)": { - "description": "min=1.064, mean=1.075, max=1.089, sum=6.451 (6)", - "tab": "Efficiency", - "score": 1.0751711510617759 - }, - "XSUM - # eval": { - "description": "min=518, mean=518, max=518, sum=3108 (6)", - "tab": "General information", - "score": 518.0 - }, - "XSUM - # train": { - "description": "min=4.996, mean=4.998, max=5, sum=29.988 (6)", - "tab": "General information", - "score": 4.998069498069498 - }, - "XSUM - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (6)", - "tab": "General information", - "score": 0.0 - }, - "XSUM - # prompt tokens": { - "description": "min=1484.608, mean=1537.452, max=1572.616, sum=9224.71 (6)", - "tab": "General information", - "score": 1537.4517374517375 - }, - "XSUM - # output tokens": { - "description": "min=22.133, mean=22.992, max=23.423, sum=137.954 (6)", - "tab": "General information", - "score": 22.99227799227799 - }, - "XSUM - # trials": { - "description": "min=3, mean=3, max=3, sum=18 (6)", - "tab": "General information", - "score": 3.0 - }, - "XSUM - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=4 (6)", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "XSUM - Stereotypes (gender)": { - "description": "min=0.456, mean=0.466, max=0.484, sum=2.793 (6)", - "tab": "Bias", - "score": 0.4655148596176822 - }, - "XSUM - Representation (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=4 (6)", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "XSUM - Representation (gender)": { - "description": "min=0.139, mean=0.157, max=0.172, sum=0.945 (6)", - "tab": "Bias", - "score": 0.15743560442588508 - }, - "XSUM - 
Toxic fraction": { - "description": "min=0, mean=0.001, max=0.002, sum=0.008 (6)", - "tab": "Toxicity", - "score": 0.001287001287001287 - }, - "XSUM - SummaC": { - "description": "min=-0.196, mean=-0.189, max=-0.185, sum=-0.567 (3)", - "tab": "Summarization metrics", - "score": -0.18902428828304493 - }, - "XSUM - QAFactEval": { - "description": "min=2.852, mean=2.889, max=2.928, sum=17.336 (6)", - "tab": "Summarization metrics", - "score": 2.889265592037019 - }, - "XSUM - BERTScore (F1)": { - "description": "min=0.394, mean=0.398, max=0.403, sum=1.195 (3)", - "tab": "Summarization metrics", - "score": 0.3984961779205311 - }, - "XSUM - Coverage": { - "description": "min=0.82, mean=0.823, max=0.825, sum=4.937 (6)", - "tab": "Summarization metrics", - "score": 0.8227568594164721 - }, - "XSUM - Density": { - "description": "min=3.497, mean=3.599, max=3.746, sum=21.593 (6)", - "tab": "Summarization metrics", - "score": 3.5988000456323377 - }, - "XSUM - Compression": { - "description": "min=20.099, mean=20.712, max=21.78, sum=124.27 (6)", - "tab": "Summarization metrics", - "score": 20.711693139962097 - }, - "XSUM - HumanEval-faithfulness": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-relevance": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-coherence": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "IMDB", - "source_data": { - "dataset_name": "IMDB", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on IMDB", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.933, - "details": { - "description": "min=0.929, mean=0.933, max=0.94, sum=2.8 (3)", - "tab": "Accuracy", - "IMDB - ECE (10-bin)": { - "description": "min=0.098, mean=0.132, max=0.183, sum=0.396 (3)", - "tab": "Calibration", - "score": 0.13199349625828075 - }, - "IMDB - EM (Robustness)": { - "description": "min=0.895, mean=0.902, max=0.91, sum=2.706 (3)", - "tab": "Robustness", - "score": 0.902 - }, - "IMDB - EM (Fairness)": { - "description": "min=0.912, mean=0.92, max=0.93, sum=2.759 (3)", - "tab": "Fairness", - "score": 0.9196666666666666 - }, - "IMDB - Denoised inference time (s)": { - "description": "min=0.479, mean=0.536, max=0.62, sum=1.607 (3)", - "tab": "Efficiency", - "score": 0.5358171357421871 - }, - "IMDB - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "IMDB - # train": { - "description": "min=4.846, mean=4.93, max=4.98, sum=14.79 (3)", - "tab": "General information", - "score": 4.930000000000001 - }, - "IMDB - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "IMDB - # prompt tokens": { - "description": "min=1161.854, mean=1398.654, max=1747.025, sum=4195.961 (3)", - "tab": "General information", - "score": 1398.6536666666668 - }, - "IMDB - # output tokens": { - "description": "min=1, mean=1, max=1, sum=3 (3)", - "tab": "General information", - "score": 1.0 - }, - "IMDB - # trials": { - 
"description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "IMDB - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CivilComments", - "source_data": { - "dataset_name": "CivilComments", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on CivilComments", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.507, - "details": { - "description": "min=0, mean=0.507, max=1, sum=27.395 (54)", - "tab": "Accuracy", - "CivilComments - ECE (10-bin)": { - "description": "min=0.1, mean=0.384, max=0.705, sum=20.717 (54)", - "tab": "Calibration", - "score": 0.38365386942886265 - }, - "CivilComments - EM (Robustness)": { - "description": "min=0, mean=0.333, max=0.95, sum=17.981 (54)", - "tab": "Robustness", - "score": 0.3329825600043121 - }, - "CivilComments - EM (Fairness)": { - "description": "min=0, mean=0.443, max=1, sum=23.917 (54)", - "tab": "Fairness", - "score": 0.44290609222735455 - }, - "CivilComments - Denoised inference time (s)": { - "description": "min=0.29, mean=0.375, max=0.51, sum=20.235 (54)", - "tab": "Efficiency", - "score": 0.3747284900914756 - }, - "CivilComments - # eval": { - "description": "min=74, mean=371.556, max=683, sum=20064 (54)", - "tab": "General information", - "score": 371.55555555555554 - }, - "CivilComments - # train": { - "description": "min=5, mean=5, max=5, sum=270 (54)", - "tab": "General information", - "score": 5.0 - }, - "CivilComments - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (54)", - "tab": "General information", - "score": 0.0 - }, - "CivilComments - # prompt tokens": { - "description": "min=362.293, mean=732.514, max=1288.441, sum=39555.782 (54)", - "tab": "General information", - "score": 732.5144825548033 - }, - "CivilComments - # output tokens": { - "description": "min=1, mean=1, max=1, sum=54 (54)", - "tab": "General information", - "score": 1.0 - }, - "CivilComments - # trials": { - "description": "min=3, mean=3, max=3, sum=162 (54)", - "tab": "General information", - "score": 3.0 - }, - "CivilComments - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (54)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - 
"generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "RAFT", - "source_data": { - "dataset_name": "RAFT", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on RAFT", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.596, - "details": { - "description": "min=0, mean=0.596, max=0.975, sum=19.675 (33)", - "tab": "Accuracy", - "RAFT - ECE (10-bin)": { - "description": "min=0.115, mean=0.267, max=1, sum=8.804 (33)", - "tab": "Calibration", - "score": 0.26679166027291745 - }, - "RAFT - EM (Robustness)": { - "description": "min=0, mean=0.49, max=0.975, sum=16.175 (33)", - "tab": "Robustness", - "score": 0.49015151515151517 - }, - "RAFT - EM (Fairness)": { - "description": "min=0, mean=0.564, max=0.975, sum=18.625 (33)", - "tab": "Fairness", - "score": 0.5643939393939394 - }, - "RAFT - Denoised inference time (s)": { - "description": "min=0.284, mean=0.444, max=0.697, sum=14.664 (33)", - "tab": "Efficiency", - "score": 0.4443553984670929 - }, - "RAFT - # eval": { - "description": "min=40, mean=40, max=40, sum=1320 (33)", - "tab": "General information", - "score": 40.0 - }, - "RAFT - # train": { - "description": "min=0, mean=4.557, max=5, sum=150.375 (33)", - "tab": "General information", - "score": 4.556818181818182 - }, - "RAFT - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (33)", - "tab": "General information", - "score": 0.0 - }, - "RAFT - # prompt tokens": { - "description": "min=270.325, mean=814.446, max=1777.025, sum=26876.725 (33)", - "tab": "General information", - "score": 814.446212121212 - }, - "RAFT - # output tokens": { - "description": "min=0, mean=3.02, max=6.5, sum=99.65 (33)", - "tab": "General information", - "score": 3.01969696969697 - }, - "RAFT - # trials": { - "description": "min=3, mean=3, max=3, sum=99 (33)", - "tab": "General information", - "score": 3.0 - }, - "RAFT - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (33)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_classic/cohere/Cohere-medium-v20220720-6.1B/68ff9f10-0357-4ea8-b758-de6c7f51d669.json b/data/helm_classic/cohere/Cohere-medium-v20220720-6.1B/68ff9f10-0357-4ea8-b758-de6c7f51d669.json deleted file mode 100644 index f0d42b850..000000000 --- a/data/helm_classic/cohere/Cohere-medium-v20220720-6.1B/68ff9f10-0357-4ea8-b758-de6c7f51d669.json +++ /dev/null @@ -1,1613 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_classic/cohere_Cohere-medium-v20220720-6.1B/1770834891.1472661", - "retrieved_timestamp": "1770834891.1472661", - "source_metadata": { - "source_name": "helm_classic", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Cohere medium v20220720 6.1B", - "id": 
"cohere/Cohere-medium-v20220720-6.1B", - "developer": "cohere", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_classic", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperform on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.23, - "details": { - "tab": "Accuracy", - "Mean win rate - Calibration": { - "description": null, - "tab": "Calibration", - "score": 0.5098117312502142 - }, - "Mean win rate - Robustness": { - "description": null, - "tab": "Robustness", - "score": 0.18793903538063716 - }, - "Mean win rate - Fairness": { - "description": null, - "tab": "Fairness", - "score": 0.26943181031056446 - }, - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.5410910087719298 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - }, - "Mean win rate - Bias": { - "description": null, - "tab": "Bias", - "score": 0.4663309072932103 - }, - "Mean win rate - Toxicity": { - "description": null, - "tab": "Toxicity", - "score": 0.5508257174923842 - }, - "Mean win rate - Summarization metrics": { - "description": null, - "tab": "Summarization metrics", - "score": 0.4311194653299916 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.279, - "details": { - "description": "min=0.18, mean=0.279, max=0.36, sum=4.182 (15)", - "tab": "Accuracy", - "MMLU - ECE (10-bin)": { - "description": "min=0.067, mean=0.114, max=0.164, sum=1.703 (15)", - "tab": "Calibration", - "score": 0.11350786269483934 - }, - "MMLU - EM (Robustness)": { - "description": "min=0.09, mean=0.184, max=0.24, sum=2.755 (15)", - "tab": "Robustness", - "score": 0.18368421052631578 - }, - "MMLU - EM (Fairness)": { - "description": "min=0.15, mean=0.237, max=0.29, sum=3.548 (15)", - "tab": "Fairness", - "score": 0.23653801169590644 - }, - "MMLU - Denoised inference time (s)": { - "description": "min=0.265, mean=0.281, max=0.301, sum=4.21 (15)", - "tab": "Efficiency", - "score": 0.2806724427425987 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=1542 (15)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=75 (15)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (15)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=372.75, mean=481.26, max=628.421, sum=7218.903 (15)", - "tab": "General information", - "score": 481.2602105263158 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=15 (15)", - "tab": "General information", - "score": 1.0 - }, - 
"MMLU - # trials": { - "description": "min=3, mean=3, max=3, sum=45 (15)", - "tab": "General information", - "score": 3.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "BoolQ", - "source_data": { - "dataset_name": "BoolQ", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on BoolQ", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.659, - "details": { - "description": "min=0.65, mean=0.659, max=0.667, sum=1.977 (3)", - "tab": "Accuracy", - "BoolQ - ECE (10-bin)": { - "description": "min=0.069, mean=0.082, max=0.093, sum=0.247 (3)", - "tab": "Calibration", - "score": 0.08218351589951171 - }, - "BoolQ - EM (Robustness)": { - "description": "min=0.556, mean=0.562, max=0.573, sum=1.686 (3)", - "tab": "Robustness", - "score": 0.5619999999999999 - }, - "BoolQ - EM (Fairness)": { - "description": "min=0.589, mean=0.597, max=0.61, sum=1.792 (3)", - "tab": "Fairness", - "score": 0.5973333333333333 - }, - "BoolQ - Denoised inference time (s)": { - "description": "min=0.308, mean=0.35, max=0.402, sum=1.049 (3)", - "tab": "Efficiency", - "score": 0.34952371158854173 - }, - "BoolQ - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "BoolQ - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "BoolQ - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "BoolQ - # prompt tokens": { - "description": "min=669.307, mean=925.307, max=1269.307, sum=2775.921 (3)", - "tab": "General information", - "score": 925.3070000000001 - }, - "BoolQ - # output tokens": { - "description": "min=1, mean=1, max=1, sum=3 (3)", - "tab": "General information", - "score": 1.0 - }, - "BoolQ - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "BoolQ - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.559, - "details": { - "description": "min=0.54, mean=0.559, max=0.572, sum=1.677 (3)", - "tab": "Accuracy", - "NarrativeQA - ECE (10-bin)": { - "description": "min=0.043, mean=0.047, max=0.055, sum=0.141 (3)", - "tab": "Calibration", - "score": 0.046946382998353055 - }, - "NarrativeQA - F1 
(Robustness)": { - "description": "min=0.283, mean=0.3, max=0.315, sum=0.899 (3)", - "tab": "Robustness", - "score": 0.29964626689663526 - }, - "NarrativeQA - F1 (Fairness)": { - "description": "min=0.416, mean=0.438, max=0.455, sum=1.313 (3)", - "tab": "Fairness", - "score": 0.4376922212938658 - }, - "NarrativeQA - Denoised inference time (s)": { - "description": "min=0.525, mean=0.533, max=0.548, sum=1.599 (3)", - "tab": "Efficiency", - "score": 0.5331198741930753 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=1065 (3)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=0.958, mean=1.562, max=1.997, sum=4.687 (3)", - "tab": "General information", - "score": 1.5624413145539906 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=1601.997, mean=1634.99, max=1693.155, sum=4904.969 (3)", - "tab": "General information", - "score": 1634.9896713615024 - }, - "NarrativeQA - # output tokens": { - "description": "min=5.392, mean=6.771, max=8.33, sum=20.313 (3)", - "tab": "General information", - "score": 6.770892018779342 - }, - "NarrativeQA - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NarrativeQA - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=1.333 (2)", - "tab": "Bias", - "score": 0.6666666666666667 - }, - "NarrativeQA - Stereotypes (gender)": { - "description": "min=0.394, mean=0.427, max=0.45, sum=1.282 (3)", - "tab": "Bias", - "score": 0.42718253968253966 - }, - "NarrativeQA - Representation (race)": { - "description": "min=0.373, mean=0.569, max=0.667, sum=1.706 (3)", - "tab": "Bias", - "score": 0.5686274509803922 - }, - "NarrativeQA - Representation (gender)": { - "description": "min=0.152, mean=0.174, max=0.195, sum=0.521 (3)", - "tab": "Bias", - "score": 0.17371956530315583 - }, - "NarrativeQA - Toxic fraction": { - "description": "min=0.014, mean=0.02, max=0.025, sum=0.059 (3)", - "tab": "Toxicity", - "score": 0.01971830985915493 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (open-book)", - "source_data": { - "dataset_name": "NaturalQuestions (open-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (open-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.504, - "details": { - "description": "min=0.482, mean=0.504, max=0.516, sum=1.512 (3)", - "tab": "Accuracy", - "NaturalQuestions (closed-book) - ECE (10-bin)": { - "description": "min=0.018, mean=0.026, max=0.036, sum=0.077 (3)", - "tab": "Calibration", - "score": 0.025653079993217736 - }, - "NaturalQuestions (open-book) - ECE (10-bin)": { - "description": "min=0.129, mean=0.142, max=0.154, sum=0.425 (3)", - "tab": "Calibration", - "score": 0.14175015381424005 - }, - "NaturalQuestions (closed-book) - F1 (Robustness)": { - "description": "min=0.097, mean=0.102, max=0.104, sum=0.305 (3)", - "tab": "Robustness", - "score": 0.10170384904294616 - }, - "NaturalQuestions (open-book) - F1 (Robustness)": { - "description": "min=0.226, mean=0.266, max=0.292, 
sum=0.799 (3)", - "tab": "Robustness", - "score": 0.26631844818771483 - }, - "NaturalQuestions (closed-book) - F1 (Fairness)": { - "description": "min=0.124, mean=0.126, max=0.127, sum=0.377 (3)", - "tab": "Fairness", - "score": 0.12565301660951664 - }, - "NaturalQuestions (open-book) - F1 (Fairness)": { - "description": "min=0.41, mean=0.432, max=0.444, sum=1.297 (3)", - "tab": "Fairness", - "score": 0.4322127161835283 - }, - "NaturalQuestions (closed-book) - Denoised inference time (s)": { - "description": "min=0.254, mean=0.259, max=0.265, sum=0.778 (3)", - "tab": "Efficiency", - "score": 0.25938733203125103 - }, - "NaturalQuestions (open-book) - Denoised inference time (s)": { - "description": "min=0.476, mean=0.535, max=0.583, sum=1.606 (3)", - "tab": "Efficiency", - "score": 0.5353007499999998 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=109.191, mean=111.191, max=115.191, sum=333.573 (3)", - "tab": "General information", - "score": 111.19099999999999 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=4.823, mean=5.267, max=5.728, sum=15.801 (3)", - "tab": "General information", - "score": 5.267 - }, - "NaturalQuestions (closed-book) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.538, mean=4.633, max=4.715, sum=13.899 (3)", - "tab": "General information", - "score": 4.633 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.039, mean=0.039, max=0.039, sum=0.117 (3)", - "tab": "General information", - "score": 0.039 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1261.72, mean=1481.344, max=1608.455, sum=4444.032 (3)", - "tab": "General information", - "score": 1481.344 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=7.288, mean=9.101, max=11.307, sum=27.304 (3)", - "tab": "General information", - "score": 9.101333333333333 - }, - "NaturalQuestions (open-book) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NaturalQuestions (closed-book) - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NaturalQuestions (closed-book) - Stereotypes (gender)": { - "description": "min=0.5, mean=0.5, max=0.5, sum=0.5 (1)", - "tab": "Bias", - "score": 0.5 - }, - "NaturalQuestions (closed-book) - Representation (race)": { - "description": "min=0.419, mean=0.441, max=0.476, sum=1.323 (3)", - "tab": "Bias", - "score": 0.4410100926954859 - }, - "NaturalQuestions (closed-book) - Representation (gender)": { - "description": "min=0.214, mean=0.251, max=0.3, sum=0.753 (3)", - "tab": "Bias", - "score": 0.2511387163561077 - }, - "NaturalQuestions (open-book) - Stereotypes (race)": { 
- "description": "(0)", - "tab": "Bias", - "score": null - }, - "NaturalQuestions (open-book) - Stereotypes (gender)": { - "description": "min=0.292, mean=0.354, max=0.417, sum=0.708 (2)", - "tab": "Bias", - "score": 0.3541666666666667 - }, - "NaturalQuestions (open-book) - Representation (race)": { - "description": "min=0.289, mean=0.325, max=0.385, sum=0.974 (3)", - "tab": "Bias", - "score": 0.3247724272114516 - }, - "NaturalQuestions (open-book) - Representation (gender)": { - "description": "min=0.202, mean=0.234, max=0.285, sum=0.703 (3)", - "tab": "Bias", - "score": 0.23429326676087917 - }, - "NaturalQuestions (closed-book) - Toxic fraction": { - "description": "min=0, mean=0.0, max=0.001, sum=0.001 (3)", - "tab": "Toxicity", - "score": 0.0003333333333333333 - }, - "NaturalQuestions (open-book) - Toxic fraction": { - "description": "min=0.002, mean=0.002, max=0.003, sum=0.007 (3)", - "tab": "Toxicity", - "score": 0.0023333333333333335 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "QuAC", - "source_data": { - "dataset_name": "QuAC", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on QuAC", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.279, - "details": { - "description": "min=0.273, mean=0.279, max=0.287, sum=0.838 (3)", - "tab": "Accuracy", - "QuAC - ECE (10-bin)": { - "description": "min=0.042, mean=0.048, max=0.061, sum=0.145 (3)", - "tab": "Calibration", - "score": 0.04829561557428013 - }, - "QuAC - F1 (Robustness)": { - "description": "min=0.12, mean=0.144, max=0.157, sum=0.432 (3)", - "tab": "Robustness", - "score": 0.14398518012537756 - }, - "QuAC - F1 (Fairness)": { - "description": "min=0.186, mean=0.198, max=0.207, sum=0.593 (3)", - "tab": "Fairness", - "score": 0.19765650296002213 - }, - "QuAC - Denoised inference time (s)": { - "description": "min=0.664, mean=0.735, max=0.771, sum=2.206 (3)", - "tab": "Efficiency", - "score": 0.7354030888671875 - }, - "QuAC - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "QuAC - # train": { - "description": "min=0.797, mean=0.881, max=0.969, sum=2.644 (3)", - "tab": "General information", - "score": 0.8813333333333334 - }, - "QuAC - truncated": { - "description": "min=0.02, mean=0.02, max=0.02, sum=0.06 (3)", - "tab": "General information", - "score": 0.02 - }, - "QuAC - # prompt tokens": { - "description": "min=1600.292, mean=1639.784, max=1661.675, sum=4919.353 (3)", - "tab": "General information", - "score": 1639.784333333333 - }, - "QuAC - # output tokens": { - "description": "min=17.39, mean=23.531, max=27.056, sum=70.593 (3)", - "tab": "General information", - "score": 23.531000000000002 - }, - "QuAC - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "QuAC - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=2.0 (3)", - "tab": "Bias", - "score": 0.6666666666666669 - }, - "QuAC - Stereotypes (gender)": { - "description": "min=0.383, mean=0.412, max=0.431, sum=1.237 (3)", - "tab": "Bias", - "score": 0.41249828370040936 - }, - "QuAC - Representation (race)": { - "description": "min=0.303, mean=0.357, max=0.392, sum=1.072 (3)", - "tab": "Bias", - "score": 
0.35746080227329485 - }, - "QuAC - Representation (gender)": { - "description": "min=0.233, mean=0.262, max=0.276, sum=0.786 (3)", - "tab": "Bias", - "score": 0.2618392019722732 - }, - "QuAC - Toxic fraction": { - "description": "min=0.001, mean=0.002, max=0.002, sum=0.005 (3)", - "tab": "Toxicity", - "score": 0.0016666666666666668 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "HellaSwag", - "source_data": { - "dataset_name": "HellaSwag", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on HellaSwag", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.706, - "details": { - "description": "min=0.706, mean=0.706, max=0.706, sum=0.706 (1)", - "tab": "Accuracy", - "HellaSwag - ECE (10-bin)": { - "description": "min=0.271, mean=0.271, max=0.271, sum=0.271 (1)", - "tab": "Calibration", - "score": 0.2707363482287178 - }, - "HellaSwag - EM (Robustness)": { - "description": "min=0.651, mean=0.651, max=0.651, sum=0.651 (1)", - "tab": "Robustness", - "score": 0.651 - }, - "HellaSwag - EM (Fairness)": { - "description": "min=0.525, mean=0.525, max=0.525, sum=0.525 (1)", - "tab": "Fairness", - "score": 0.525 - }, - "HellaSwag - Denoised inference time (s)": { - "description": "min=0.204, mean=0.204, max=0.204, sum=0.204 (1)", - "tab": "Efficiency", - "score": 0.20370158203125027 - }, - "HellaSwag - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "HellaSwag - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag - # prompt tokens": { - "description": "min=88.855, mean=88.855, max=88.855, sum=88.855 (1)", - "tab": "General information", - "score": 88.855 - }, - "HellaSwag - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.496, - "details": { - "description": "min=0.496, mean=0.496, max=0.496, sum=0.496 (1)", - "tab": "Accuracy", - "OpenbookQA - ECE (10-bin)": { - "description": "min=0.275, mean=0.275, max=0.275, sum=0.275 (1)", - "tab": "Calibration", - "score": 0.27530956848832144 - }, - "OpenbookQA - EM (Robustness)": { - "description": "min=0.382, mean=0.382, max=0.382, sum=0.382 (1)", - "tab": "Robustness", - "score": 0.382 - }, - "OpenbookQA - EM (Fairness)": { - "description": "min=0.42, mean=0.42, max=0.42, sum=0.42 (1)", - "tab": "Fairness", - "score": 0.42 - }, - "OpenbookQA - Denoised inference time 
(s)": { - "description": "min=0.187, mean=0.187, max=0.187, sum=0.187 (1)", - "tab": "Efficiency", - "score": 0.1870674140625 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=5.358, mean=5.358, max=5.358, sum=5.358 (1)", - "tab": "General information", - "score": 5.358 - }, - "OpenbookQA - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "TruthfulQA", - "source_data": { - "dataset_name": "TruthfulQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on TruthfulQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.19, - "details": { - "description": "min=0.176, mean=0.19, max=0.203, sum=0.57 (3)", - "tab": "Accuracy", - "TruthfulQA - ECE (10-bin)": { - "description": "min=0.082, mean=0.094, max=0.109, sum=0.282 (3)", - "tab": "Calibration", - "score": 0.09386032214108035 - }, - "TruthfulQA - EM (Robustness)": { - "description": "min=0.127, mean=0.149, max=0.168, sum=0.448 (3)", - "tab": "Robustness", - "score": 0.1493374108053007 - }, - "TruthfulQA - EM (Fairness)": { - "description": "min=0.154, mean=0.174, max=0.19, sum=0.521 (3)", - "tab": "Fairness", - "score": 0.17380224260958207 - }, - "TruthfulQA - Denoised inference time (s)": { - "description": "min=0.287, mean=0.287, max=0.288, sum=0.862 (3)", - "tab": "Efficiency", - "score": 0.28723167974722846 - }, - "TruthfulQA - # eval": { - "description": "min=654, mean=654, max=654, sum=1962 (3)", - "tab": "General information", - "score": 654.0 - }, - "TruthfulQA - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "TruthfulQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "TruthfulQA - # prompt tokens": { - "description": "min=505.315, mean=514.648, max=532.315, sum=1543.945 (3)", - "tab": "General information", - "score": 514.6483180428135 - }, - "TruthfulQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=3 (3)", - "tab": "General information", - "score": 1.0 - }, - "TruthfulQA - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MS MARCO (TREC)", - "source_data": { - "dataset_name": "MS MARCO (TREC)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "NDCG@10 on MS MARCO (TREC)", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.374, - "details": { - "description": "min=0.337, mean=0.374, max=0.416, sum=1.122 (3)", - "tab": "Accuracy", - "MS MARCO (regular) - RR@10 (Robustness)": { - "description": "min=0.101, mean=0.109, max=0.12, sum=0.326 (3)", - "tab": "Robustness", - "score": 0.10871957671957677 - }, - "MS MARCO (TREC) - NDCG@10 (Robustness)": { - "description": "min=0.294, mean=0.315, max=0.354, sum=0.945 (3)", - "tab": "Robustness", - "score": 0.31504083631376195 - }, - "MS MARCO (regular) - RR@10 (Fairness)": { - "description": "min=0.126, mean=0.132, max=0.136, sum=0.396 (3)", - "tab": "Fairness", - "score": 0.13183915343915345 - }, - "MS MARCO (TREC) - NDCG@10 (Fairness)": { - "description": "min=0.321, mean=0.357, max=0.398, sum=1.072 (3)", - "tab": "Fairness", - "score": 0.35726921379791293 - }, - "MS MARCO (regular) - Denoised inference time (s)": { - "description": "min=0.286, mean=0.289, max=0.293, sum=0.867 (3)", - "tab": "Efficiency", - "score": 0.28909981347656255 - }, - "MS MARCO (TREC) - Denoised inference time (s)": { - "description": "min=0.285, mean=0.288, max=0.29, sum=0.864 (3)", - "tab": "Efficiency", - "score": 0.28804701126453486 - }, - "MS MARCO (regular) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "MS MARCO (regular) - # train": { - "description": "min=2, mean=2, max=2, sum=6 (3)", - "tab": "General information", - "score": 2.0 - }, - "MS MARCO (regular) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "MS MARCO (regular) - # prompt tokens": { - "description": "min=497.281, mean=536.614, max=583.281, sum=1609.843 (3)", - "tab": "General information", - "score": 536.6143333333333 - }, - "MS MARCO (regular) - # output tokens": { - "description": "min=1, mean=1.005, max=1.013, sum=3.014 (3)", - "tab": "General information", - "score": 1.0046666666666666 - }, - "MS MARCO (regular) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "MS MARCO (TREC) - # eval": { - "description": "min=43, mean=43, max=43, sum=129 (3)", - "tab": "General information", - "score": 43.0 - }, - "MS MARCO (TREC) - # train": { - "description": "min=2, mean=2, max=2, sum=6 (3)", - "tab": "General information", - "score": 2.0 - }, - "MS MARCO (TREC) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "MS MARCO (TREC) - # prompt tokens": { - "description": "min=480.163, mean=519.496, max=566.163, sum=1558.488 (3)", - "tab": "General information", - "score": 519.4961240310078 - }, - "MS MARCO (TREC) - # output tokens": { - "description": "min=1, mean=1.016, max=1.023, sum=3.047 (3)", - "tab": "General information", - "score": 1.0155038759689923 - }, - "MS MARCO (TREC) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "MS MARCO (regular) - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - 
}, - "MS MARCO (regular) - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - }, - "MS MARCO (TREC) - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CNN/DailyMail", - "source_data": { - "dataset_name": "CNN/DailyMail", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on CNN/DailyMail", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.077, - "details": { - "description": "min=0.03, mean=0.077, max=0.111, sum=0.459 (6)", - "tab": "Accuracy", - "CNN/DailyMail - Denoised inference time (s)": { - "description": "min=1.073, mean=1.2, max=1.325, sum=7.2 (6)", - "tab": "Efficiency", - "score": 1.199950748558208 - }, - "CNN/DailyMail - # eval": { - "description": "min=466, mean=466, max=466, sum=2796 (6)", - "tab": "General information", - "score": 466.0 - }, - "CNN/DailyMail - # train": { - "description": "min=5, mean=5, max=5, sum=30 (6)", - "tab": "General information", - "score": 5.0 - }, - "CNN/DailyMail - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (6)", - "tab": "General information", - "score": 0.0 - }, - "CNN/DailyMail - # prompt tokens": { - "description": "min=1555.036, mean=1575.036, max=1602.036, sum=9450.219 (6)", - "tab": "General information", - "score": 1575.0364806866953 - }, - "CNN/DailyMail - # output tokens": { - "description": "min=52.893, mean=63.193, max=73.206, sum=379.159 (6)", - "tab": "General information", - "score": 63.1931330472103 - }, - "CNN/DailyMail - # trials": { - "description": "min=3, mean=3, max=3, sum=18 (6)", - "tab": "General information", - "score": 3.0 - }, - "CNN/DailyMail - Stereotypes (race)": { - "description": "min=0.644, mean=0.659, max=0.667, sum=3.956 (6)", - "tab": "Bias", - "score": 0.6592592592592593 - }, - "CNN/DailyMail - Stereotypes (gender)": { - "description": "min=0.402, mean=0.44, max=0.476, sum=2.641 (6)", - "tab": "Bias", - "score": 0.44008624507065996 - }, - "CNN/DailyMail - Representation (race)": { - "description": "min=0.285, mean=0.304, max=0.333, sum=1.825 (6)", - "tab": "Bias", - "score": 0.30422478269658376 - }, - "CNN/DailyMail - Representation (gender)": { - "description": "min=0.127, mean=0.173, max=0.229, sum=1.037 (6)", - "tab": "Bias", - "score": 0.17278322431241475 - }, - "CNN/DailyMail - Toxic fraction": { - "description": "min=0, mean=0.001, max=0.002, sum=0.004 (6)", - "tab": "Toxicity", - "score": 
0.000715307582260372 - }, - "CNN/DailyMail - SummaC": { - "description": "min=-0.174, mean=0.229, max=0.443, sum=0.686 (3)", - "tab": "Summarization metrics", - "score": 0.22880441457511005 - }, - "CNN/DailyMail - QAFactEval": { - "description": "min=4.552, mean=4.664, max=4.795, sum=27.982 (6)", - "tab": "Summarization metrics", - "score": 4.663724611238682 - }, - "CNN/DailyMail - BERTScore (F1)": { - "description": "min=0.008, mean=0.115, max=0.197, sum=0.346 (3)", - "tab": "Summarization metrics", - "score": 0.11522739683384077 - }, - "CNN/DailyMail - Coverage": { - "description": "min=0.482, mean=0.799, max=0.965, sum=4.793 (6)", - "tab": "Summarization metrics", - "score": 0.7988868167525552 - }, - "CNN/DailyMail - Density": { - "description": "min=9.34, mean=22.176, max=32.926, sum=133.058 (6)", - "tab": "Summarization metrics", - "score": 22.17629615230217 - }, - "CNN/DailyMail - Compression": { - "description": "min=11.915, mean=13.154, max=15.457, sum=78.926 (6)", - "tab": "Summarization metrics", - "score": 13.15437099106955 - }, - "CNN/DailyMail - HumanEval-faithfulness": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-relevance": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-coherence": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "XSUM", - "source_data": { - "dataset_name": "XSUM", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on XSUM", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.087, - "details": { - "description": "min=0.086, mean=0.087, max=0.09, sum=0.524 (6)", - "tab": "Accuracy", - "XSUM - Denoised inference time (s)": { - "description": "min=0.717, mean=0.724, max=0.732, sum=4.343 (6)", - "tab": "Efficiency", - "score": 0.7239030526061776 - }, - "XSUM - # eval": { - "description": "min=518, mean=518, max=518, sum=3108 (6)", - "tab": "General information", - "score": 518.0 - }, - "XSUM - # train": { - "description": "min=4.996, mean=4.998, max=5, sum=29.988 (6)", - "tab": "General information", - "score": 4.998069498069498 - }, - "XSUM - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (6)", - "tab": "General information", - "score": 0.0 - }, - "XSUM - # prompt tokens": { - "description": "min=1484.608, mean=1537.452, max=1572.616, sum=9224.71 (6)", - "tab": "General information", - "score": 1537.4517374517375 - }, - "XSUM - # output tokens": { - "description": "min=23.498, mean=24.055, max=24.463, sum=144.328 (6)", - "tab": "General information", - "score": 24.054697554697555 - }, - "XSUM - # trials": { - "description": "min=3, mean=3, max=3, sum=18 (6)", - "tab": "General information", - "score": 3.0 - }, - "XSUM - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=4 (6)", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "XSUM - Stereotypes (gender)": { - "description": "min=0.447, mean=0.461, max=0.481, sum=2.765 (6)", - "tab": "Bias", - "score": 0.46086088123125163 - }, - "XSUM - Representation (race)": { - 
"description": "min=0.449, mean=0.498, max=0.579, sum=2.99 (6)", - "tab": "Bias", - "score": 0.4982964658021866 - }, - "XSUM - Representation (gender)": { - "description": "min=0.167, mean=0.186, max=0.198, sum=1.115 (6)", - "tab": "Bias", - "score": 0.18582940251572325 - }, - "XSUM - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (6)", - "tab": "Toxicity", - "score": 0.0 - }, - "XSUM - SummaC": { - "description": "min=-0.17, mean=-0.159, max=-0.142, sum=-0.477 (3)", - "tab": "Summarization metrics", - "score": -0.1589340320425144 - }, - "XSUM - QAFactEval": { - "description": "min=3.197, mean=3.223, max=3.258, sum=19.336 (6)", - "tab": "Summarization metrics", - "score": 3.2227135293221596 - }, - "XSUM - BERTScore (F1)": { - "description": "min=0.364, mean=0.367, max=0.371, sum=1.102 (3)", - "tab": "Summarization metrics", - "score": 0.36729036225155814 - }, - "XSUM - Coverage": { - "description": "min=0.84, mean=0.847, max=0.855, sum=5.083 (6)", - "tab": "Summarization metrics", - "score": 0.8472154184001573 - }, - "XSUM - Density": { - "description": "min=4.485, mean=4.754, max=4.928, sum=28.525 (6)", - "tab": "Summarization metrics", - "score": 4.7541975208526 - }, - "XSUM - Compression": { - "description": "min=19.527, mean=19.748, max=20.169, sum=118.491 (6)", - "tab": "Summarization metrics", - "score": 19.748450478665102 - }, - "XSUM - HumanEval-faithfulness": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-relevance": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-coherence": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "IMDB", - "source_data": { - "dataset_name": "IMDB", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on IMDB", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.935, - "details": { - "description": "min=0.917, mean=0.935, max=0.947, sum=2.805 (3)", - "tab": "Accuracy", - "IMDB - ECE (10-bin)": { - "description": "min=0.335, mean=0.36, max=0.394, sum=1.08 (3)", - "tab": "Calibration", - "score": 0.360155737743892 - }, - "IMDB - EM (Robustness)": { - "description": "min=0.878, mean=0.889, max=0.897, sum=2.666 (3)", - "tab": "Robustness", - "score": 0.8886666666666666 - }, - "IMDB - EM (Fairness)": { - "description": "min=0.896, mean=0.918, max=0.936, sum=2.753 (3)", - "tab": "Fairness", - "score": 0.9176666666666667 - }, - "IMDB - Denoised inference time (s)": { - "description": "min=0.404, mean=0.452, max=0.489, sum=1.355 (3)", - "tab": "Efficiency", - "score": 0.45160390852864607 - }, - "IMDB - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "IMDB - # train": { - "description": "min=2.903, mean=4.229, max=4.983, sum=12.688 (3)", - "tab": "General information", - "score": 4.229333333333333 - }, - "IMDB - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "IMDB - # prompt tokens": { - "description": "min=1283.038, 
mean=1562.808, max=1784.2, sum=4688.425 (3)", - "tab": "General information", - "score": 1562.8083333333334 - }, - "IMDB - # output tokens": { - "description": "min=1, mean=1.003, max=1.01, sum=3.01 (3)", - "tab": "General information", - "score": 1.0033333333333332 - }, - "IMDB - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "IMDB - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CivilComments", - "source_data": { - "dataset_name": "CivilComments", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on CivilComments", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.504, - "details": { - "description": "min=0, mean=0.504, max=1, sum=27.205 (54)", - "tab": "Accuracy", - "CivilComments - ECE (10-bin)": { - "description": "min=0.176, mean=0.459, max=0.641, sum=24.77 (54)", - "tab": "Calibration", - "score": 0.45870054566126006 - }, - "CivilComments - EM (Robustness)": { - "description": "min=0, mean=0.136, max=0.736, sum=7.362 (54)", - "tab": "Robustness", - "score": 0.13632694985889793 - }, - "CivilComments - EM (Fairness)": { - "description": "min=0, mean=0.489, max=1, sum=26.387 (54)", - "tab": "Fairness", - "score": 0.48864261081744575 - }, - "CivilComments - Denoised inference time (s)": { - "description": "min=0.262, mean=0.321, max=0.405, sum=17.316 (54)", - "tab": "Efficiency", - "score": 0.32067323239104795 - }, - "CivilComments - # eval": { - "description": "min=74, mean=371.556, max=683, sum=20064 (54)", - "tab": "General information", - "score": 371.55555555555554 - }, - "CivilComments - # train": { - "description": "min=5, mean=5, max=5, sum=270 (54)", - "tab": "General information", - "score": 5.0 - }, - "CivilComments - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (54)", - "tab": "General information", - "score": 0.0 - }, - "CivilComments - # prompt tokens": { - "description": "min=362.293, mean=732.514, max=1288.441, sum=39555.782 (54)", - "tab": "General information", - "score": 732.5144825548033 - }, - "CivilComments - # output tokens": { - "description": "min=1, mean=1, max=1, sum=54 (54)", - "tab": "General information", - "score": 1.0 - }, - "CivilComments - # trials": { - "description": "min=3, mean=3, max=3, sum=162 (54)", - "tab": "General information", - "score": 3.0 - }, - "CivilComments - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (race)": { - "description": "(0)", - 
"tab": "Bias", - "score": null - }, - "CivilComments - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (54)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "RAFT", - "source_data": { - "dataset_name": "RAFT", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on RAFT", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.52, - "details": { - "description": "min=0.125, mean=0.52, max=0.975, sum=17.15 (33)", - "tab": "Accuracy", - "RAFT - ECE (10-bin)": { - "description": "min=0.151, mean=0.304, max=0.849, sum=10.027 (33)", - "tab": "Calibration", - "score": 0.3038351531350353 - }, - "RAFT - EM (Robustness)": { - "description": "min=0, mean=0.385, max=0.975, sum=12.7 (33)", - "tab": "Robustness", - "score": 0.3848484848484848 - }, - "RAFT - EM (Fairness)": { - "description": "min=0.125, mean=0.5, max=0.975, sum=16.5 (33)", - "tab": "Fairness", - "score": 0.5 - }, - "RAFT - Denoised inference time (s)": { - "description": "min=0.244, mean=0.358, max=0.532, sum=11.817 (33)", - "tab": "Efficiency", - "score": 0.3580963386304451 - }, - "RAFT - # eval": { - "description": "min=40, mean=40, max=40, sum=1320 (33)", - "tab": "General information", - "score": 40.0 - }, - "RAFT - # train": { - "description": "min=0, mean=4.557, max=5, sum=150.375 (33)", - "tab": "General information", - "score": 4.556818181818182 - }, - "RAFT - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (33)", - "tab": "General information", - "score": 0.0 - }, - "RAFT - # prompt tokens": { - "description": "min=270.325, mean=814.446, max=1777.025, sum=26876.725 (33)", - "tab": "General information", - "score": 814.446212121212 - }, - "RAFT - # output tokens": { - "description": "min=0.225, mean=2.965, max=6.15, sum=97.85 (33)", - "tab": "General information", - "score": 2.965151515151515 - }, - "RAFT - # trials": { - "description": "min=3, mean=3, max=3, sum=99 (33)", - "tab": "General information", - "score": 3.0 - }, - "RAFT - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (33)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_classic/cohere/Cohere-medium-v20221108-6.1B/b1ecc2b8-6461-4d70-b639-df3dc2594a5b.json b/data/helm_classic/cohere/Cohere-medium-v20221108-6.1B/b1ecc2b8-6461-4d70-b639-df3dc2594a5b.json deleted file mode 100644 index 43f986e70..000000000 --- a/data/helm_classic/cohere/Cohere-medium-v20221108-6.1B/b1ecc2b8-6461-4d70-b639-df3dc2594a5b.json +++ /dev/null @@ -1,1613 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_classic/cohere_Cohere-medium-v20221108-6.1B/1770834891.1472661", - "retrieved_timestamp": 
"1770834891.1472661", - "source_metadata": { - "source_name": "helm_classic", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Cohere medium v20221108 6.1B", - "id": "cohere/Cohere-medium-v20221108-6.1B", - "developer": "cohere", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_classic", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperform on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.312, - "details": { - "tab": "Accuracy", - "Mean win rate - Calibration": { - "description": null, - "tab": "Calibration", - "score": 0.6010395609917657 - }, - "Mean win rate - Robustness": { - "description": null, - "tab": "Robustness", - "score": 0.26965587249235745 - }, - "Mean win rate - Fairness": { - "description": null, - "tab": "Fairness", - "score": 0.339964744191663 - }, - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": null - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - }, - "Mean win rate - Bias": { - "description": null, - "tab": "Bias", - "score": 0.5558769690348637 - }, - "Mean win rate - Toxicity": { - "description": null, - "tab": "Toxicity", - "score": 0.6328714495381162 - }, - "Mean win rate - Summarization metrics": { - "description": null, - "tab": "Summarization metrics", - "score": 0.506578947368421 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.254, - "details": { - "description": "min=0.18, mean=0.254, max=0.32, sum=3.806 (15)", - "tab": "Accuracy", - "MMLU - ECE (10-bin)": { - "description": "min=0.055, mean=0.113, max=0.167, sum=1.691 (15)", - "tab": "Calibration", - "score": 0.11272299343238619 - }, - "MMLU - EM (Robustness)": { - "description": "min=0.15, mean=0.207, max=0.25, sum=3.1 (15)", - "tab": "Robustness", - "score": 0.20667836257309943 - }, - "MMLU - EM (Fairness)": { - "description": "min=0.14, mean=0.22, max=0.3, sum=3.299 (15)", - "tab": "Fairness", - "score": 0.21994152046783624 - }, - "MMLU - Denoised inference time (s)": { - "description": "5 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=1542 (15)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=75 (15)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (15)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=372.75, mean=481.26, max=628.421, 
sum=7218.903 (15)", - "tab": "General information", - "score": 481.2602105263158 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=15 (15)", - "tab": "General information", - "score": 1.0 - }, - "MMLU - # trials": { - "description": "min=3, mean=3, max=3, sum=45 (15)", - "tab": "General information", - "score": 3.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "BoolQ", - "source_data": { - "dataset_name": "BoolQ", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on BoolQ", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7, - "details": { - "description": "min=0.693, mean=0.7, max=0.704, sum=2.1 (3)", - "tab": "Accuracy", - "BoolQ - ECE (10-bin)": { - "description": "min=0.088, mean=0.095, max=0.105, sum=0.284 (3)", - "tab": "Calibration", - "score": 0.09459272512018041 - }, - "BoolQ - EM (Robustness)": { - "description": "min=0.508, mean=0.54, max=0.568, sum=1.62 (3)", - "tab": "Robustness", - "score": 0.54 - }, - "BoolQ - EM (Fairness)": { - "description": "min=0.626, mean=0.642, max=0.652, sum=1.925 (3)", - "tab": "Fairness", - "score": 0.6416666666666667 - }, - "BoolQ - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "BoolQ - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "BoolQ - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "BoolQ - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "BoolQ - # prompt tokens": { - "description": "min=669.307, mean=925.307, max=1269.307, sum=2775.921 (3)", - "tab": "General information", - "score": 925.3070000000001 - }, - "BoolQ - # output tokens": { - "description": "min=1, mean=1, max=1, sum=3 (3)", - "tab": "General information", - "score": 1.0 - }, - "BoolQ - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "BoolQ - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.61, - "details": { - "description": "min=0.57, mean=0.61, max=0.642, sum=1.831 (3)", - "tab": "Accuracy", - "NarrativeQA 
- ECE (10-bin)": { - "description": "min=0.027, mean=0.028, max=0.03, sum=0.085 (3)", - "tab": "Calibration", - "score": 0.02834267942109429 - }, - "NarrativeQA - F1 (Robustness)": { - "description": "min=0.265, mean=0.296, max=0.321, sum=0.888 (3)", - "tab": "Robustness", - "score": 0.2960125312478054 - }, - "NarrativeQA - F1 (Fairness)": { - "description": "min=0.441, mean=0.497, max=0.537, sum=1.491 (3)", - "tab": "Fairness", - "score": 0.49703931741598933 - }, - "NarrativeQA - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=1065 (3)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=0.958, mean=1.562, max=1.997, sum=4.687 (3)", - "tab": "General information", - "score": 1.5624413145539906 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=1601.997, mean=1634.99, max=1693.155, sum=4904.969 (3)", - "tab": "General information", - "score": 1634.9896713615024 - }, - "NarrativeQA - # output tokens": { - "description": "min=5.544, mean=7.144, max=9.065, sum=21.431 (3)", - "tab": "General information", - "score": 7.143661971830986 - }, - "NarrativeQA - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NarrativeQA - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NarrativeQA - Stereotypes (gender)": { - "description": "min=0.417, mean=0.441, max=0.469, sum=1.323 (3)", - "tab": "Bias", - "score": 0.44097222222222215 - }, - "NarrativeQA - Representation (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=2 (3)", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "NarrativeQA - Representation (gender)": { - "description": "min=0.15, mean=0.181, max=0.213, sum=0.543 (3)", - "tab": "Bias", - "score": 0.18104985015382555 - }, - "NarrativeQA - Toxic fraction": { - "description": "min=0.008, mean=0.011, max=0.014, sum=0.034 (3)", - "tab": "Toxicity", - "score": 0.011267605633802818 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (open-book)", - "source_data": { - "dataset_name": "NaturalQuestions (open-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (open-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.517, - "details": { - "description": "min=0.506, mean=0.517, max=0.536, sum=1.551 (3)", - "tab": "Accuracy", - "NaturalQuestions (closed-book) - ECE (10-bin)": { - "description": "min=0.006, mean=0.015, max=0.02, sum=0.044 (3)", - "tab": "Calibration", - "score": 0.01475928497137971 - }, - "NaturalQuestions (open-book) - ECE (10-bin)": { - "description": "min=0.181, mean=0.233, max=0.27, sum=0.698 (3)", - "tab": "Calibration", - "score": 0.2327617365925914 - }, - "NaturalQuestions (closed-book) - F1 (Robustness)": { - "description": "min=0.099, mean=0.105, max=0.11, sum=0.314 (3)", - "tab": "Robustness", - "score": 0.10457862657700777 - }, - "NaturalQuestions 
(open-book) - F1 (Robustness)": { - "description": "min=0.164, mean=0.222, max=0.282, sum=0.665 (3)", - "tab": "Robustness", - "score": 0.22177043436006846 - }, - "NaturalQuestions (closed-book) - F1 (Fairness)": { - "description": "min=0.142, mean=0.149, max=0.157, sum=0.447 (3)", - "tab": "Fairness", - "score": 0.14913779301489424 - }, - "NaturalQuestions (open-book) - F1 (Fairness)": { - "description": "min=0.431, mean=0.45, max=0.473, sum=1.349 (3)", - "tab": "Fairness", - "score": 0.44971949324423194 - }, - "NaturalQuestions (closed-book) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NaturalQuestions (open-book) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=109.191, mean=111.191, max=115.191, sum=333.573 (3)", - "tab": "General information", - "score": 111.19099999999999 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=6.631, mean=6.745, max=6.831, sum=20.236 (3)", - "tab": "General information", - "score": 6.745333333333334 - }, - "NaturalQuestions (closed-book) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.538, mean=4.633, max=4.715, sum=13.899 (3)", - "tab": "General information", - "score": 4.633 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.039, mean=0.039, max=0.039, sum=0.117 (3)", - "tab": "General information", - "score": 0.039 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1261.72, mean=1481.344, max=1608.455, sum=4444.032 (3)", - "tab": "General information", - "score": 1481.344 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=7.485, mean=8.419, max=9.746, sum=25.256 (3)", - "tab": "General information", - "score": 8.418666666666667 - }, - "NaturalQuestions (open-book) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NaturalQuestions (closed-book) - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NaturalQuestions (closed-book) - Stereotypes (gender)": { - "description": "min=0.357, mean=0.45, max=0.5, sum=1.349 (3)", - "tab": "Bias", - "score": 0.44969278033794163 - }, - "NaturalQuestions (closed-book) - Representation (race)": { - "description": "min=0.382, mean=0.451, max=0.504, sum=1.353 (3)", - "tab": "Bias", - "score": 0.4511619362542481 - }, - "NaturalQuestions (closed-book) - Representation (gender)": { - "description": "min=0.173, mean=0.314, max=0.386, sum=0.942 (3)", - "tab": "Bias", - "score": 
0.3140619884317363 - }, - "NaturalQuestions (open-book) - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NaturalQuestions (open-book) - Stereotypes (gender)": { - "description": "min=0.233, mean=0.308, max=0.35, sum=0.923 (3)", - "tab": "Bias", - "score": 0.30777777777777776 - }, - "NaturalQuestions (open-book) - Representation (race)": { - "description": "min=0.421, mean=0.452, max=0.476, sum=1.356 (3)", - "tab": "Bias", - "score": 0.4519283176992704 - }, - "NaturalQuestions (open-book) - Representation (gender)": { - "description": "min=0.056, mean=0.061, max=0.069, sum=0.184 (3)", - "tab": "Bias", - "score": 0.06120328473269649 - }, - "NaturalQuestions (closed-book) - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "Toxicity", - "score": 0.0 - }, - "NaturalQuestions (open-book) - Toxic fraction": { - "description": "min=0, mean=0.002, max=0.003, sum=0.005 (3)", - "tab": "Toxicity", - "score": 0.0016666666666666668 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "QuAC", - "source_data": { - "dataset_name": "QuAC", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on QuAC", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.314, - "details": { - "description": "min=0.297, mean=0.314, max=0.328, sum=0.942 (3)", - "tab": "Accuracy", - "QuAC - ECE (10-bin)": { - "description": "min=0.029, mean=0.041, max=0.062, sum=0.124 (3)", - "tab": "Calibration", - "score": 0.04129669890931466 - }, - "QuAC - F1 (Robustness)": { - "description": "min=0.127, mean=0.152, max=0.171, sum=0.456 (3)", - "tab": "Robustness", - "score": 0.15189850694469184 - }, - "QuAC - F1 (Fairness)": { - "description": "min=0.208, mean=0.229, max=0.244, sum=0.688 (3)", - "tab": "Fairness", - "score": 0.22939607207059778 - }, - "QuAC - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "QuAC - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "QuAC - # train": { - "description": "min=0.797, mean=0.881, max=0.969, sum=2.644 (3)", - "tab": "General information", - "score": 0.8813333333333334 - }, - "QuAC - truncated": { - "description": "min=0.02, mean=0.02, max=0.02, sum=0.06 (3)", - "tab": "General information", - "score": 0.02 - }, - "QuAC - # prompt tokens": { - "description": "min=1600.292, mean=1639.784, max=1661.675, sum=4919.353 (3)", - "tab": "General information", - "score": 1639.784333333333 - }, - "QuAC - # output tokens": { - "description": "min=18.756, mean=22.84, max=26.573, sum=68.519 (3)", - "tab": "General information", - "score": 22.83966666666667 - }, - "QuAC - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "QuAC - Stereotypes (race)": { - "description": "min=0.619, mean=0.651, max=0.667, sum=1.952 (3)", - "tab": "Bias", - "score": 0.6507936507936508 - }, - "QuAC - Stereotypes (gender)": { - "description": "min=0.436, mean=0.441, max=0.444, sum=1.322 (3)", - "tab": "Bias", - "score": 0.4407764298624513 - }, - "QuAC - Representation (race)": { - "description": "min=0.345, mean=0.353, max=0.359, sum=1.06 (3)", - 
"tab": "Bias", - "score": 0.35330965547213355 - }, - "QuAC - Representation (gender)": { - "description": "min=0.248, mean=0.251, max=0.255, sum=0.753 (3)", - "tab": "Bias", - "score": 0.2510004319407244 - }, - "QuAC - Toxic fraction": { - "description": "min=0.002, mean=0.002, max=0.002, sum=0.006 (3)", - "tab": "Toxicity", - "score": 0.002 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "HellaSwag", - "source_data": { - "dataset_name": "HellaSwag", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on HellaSwag", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.726, - "details": { - "description": "min=0.726, mean=0.726, max=0.726, sum=0.726 (1)", - "tab": "Accuracy", - "HellaSwag - ECE (10-bin)": { - "description": "min=0.281, mean=0.281, max=0.281, sum=0.281 (1)", - "tab": "Calibration", - "score": 0.2814688190554964 - }, - "HellaSwag - EM (Robustness)": { - "description": "min=0.687, mean=0.687, max=0.687, sum=0.687 (1)", - "tab": "Robustness", - "score": 0.687 - }, - "HellaSwag - EM (Fairness)": { - "description": "min=0.567, mean=0.567, max=0.567, sum=0.567 (1)", - "tab": "Fairness", - "score": 0.567 - }, - "HellaSwag - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "HellaSwag - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "HellaSwag - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag - # prompt tokens": { - "description": "min=88.855, mean=88.855, max=88.855, sum=88.855 (1)", - "tab": "General information", - "score": 88.855 - }, - "HellaSwag - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.538, - "details": { - "description": "min=0.538, mean=0.538, max=0.538, sum=0.538 (1)", - "tab": "Accuracy", - "OpenbookQA - ECE (10-bin)": { - "description": "min=0.23, mean=0.23, max=0.23, sum=0.23 (1)", - "tab": "Calibration", - "score": 0.2303402231123461 - }, - "OpenbookQA - EM (Robustness)": { - "description": "min=0.414, mean=0.414, max=0.414, sum=0.414 (1)", - "tab": "Robustness", - "score": 0.414 - }, - "OpenbookQA - EM (Fairness)": { - "description": "min=0.44, mean=0.44, max=0.44, sum=0.44 (1)", - "tab": "Fairness", - "score": 0.44 - }, - "OpenbookQA - Denoised inference time (s)": { - 
"description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=5.358, mean=5.358, max=5.358, sum=5.358 (1)", - "tab": "General information", - "score": 5.358 - }, - "OpenbookQA - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "TruthfulQA", - "source_data": { - "dataset_name": "TruthfulQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on TruthfulQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.215, - "details": { - "description": "min=0.19, mean=0.215, max=0.237, sum=0.645 (3)", - "tab": "Accuracy", - "TruthfulQA - ECE (10-bin)": { - "description": "min=0.057, mean=0.08, max=0.106, sum=0.24 (3)", - "tab": "Calibration", - "score": 0.07993899696218487 - }, - "TruthfulQA - EM (Robustness)": { - "description": "min=0.156, mean=0.17, max=0.19, sum=0.511 (3)", - "tab": "Robustness", - "score": 0.17023445463812437 - }, - "TruthfulQA - EM (Fairness)": { - "description": "min=0.156, mean=0.182, max=0.205, sum=0.546 (3)", - "tab": "Fairness", - "score": 0.18195718654434248 - }, - "TruthfulQA - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "TruthfulQA - # eval": { - "description": "min=654, mean=654, max=654, sum=1962 (3)", - "tab": "General information", - "score": 654.0 - }, - "TruthfulQA - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "TruthfulQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "TruthfulQA - # prompt tokens": { - "description": "min=505.315, mean=514.648, max=532.315, sum=1543.945 (3)", - "tab": "General information", - "score": 514.6483180428135 - }, - "TruthfulQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=3 (3)", - "tab": "General information", - "score": 1.0 - }, - "TruthfulQA - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MS MARCO (TREC)", - "source_data": { - "dataset_name": "MS MARCO (TREC)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "NDCG@10 on MS MARCO (TREC)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.373, - "details": { - "description": "min=0.329, mean=0.373, max=0.4, sum=1.118 (3)", - "tab": "Accuracy", - "MS MARCO (regular) - RR@10 (Robustness)": { - "description": "min=0.11, mean=0.13, max=0.144, sum=0.389 (3)", - "tab": "Robustness", - "score": 0.12963544973544971 - }, - "MS MARCO (TREC) - NDCG@10 (Robustness)": { - "description": "min=0.265, mean=0.314, max=0.339, sum=0.942 (3)", - "tab": "Robustness", - "score": 0.3140445596258007 - }, - "MS MARCO (regular) - RR@10 (Fairness)": { - "description": "min=0.123, mean=0.145, max=0.162, sum=0.436 (3)", - "tab": "Fairness", - "score": 0.1454550264550264 - }, - "MS MARCO (TREC) - NDCG@10 (Fairness)": { - "description": "min=0.311, mean=0.353, max=0.384, sum=1.058 (3)", - "tab": "Fairness", - "score": 0.35251421077315565 - }, - "MS MARCO (regular) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "MS MARCO (TREC) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "MS MARCO (regular) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "MS MARCO (regular) - # train": { - "description": "min=2, mean=2, max=2, sum=6 (3)", - "tab": "General information", - "score": 2.0 - }, - "MS MARCO (regular) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "MS MARCO (regular) - # prompt tokens": { - "description": "min=497.281, mean=536.614, max=583.281, sum=1609.843 (3)", - "tab": "General information", - "score": 536.6143333333333 - }, - "MS MARCO (regular) - # output tokens": { - "description": "min=1, mean=1.005, max=1.008, sum=3.015 (3)", - "tab": "General information", - "score": 1.005 - }, - "MS MARCO (regular) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "MS MARCO (TREC) - # eval": { - "description": "min=43, mean=43, max=43, sum=129 (3)", - "tab": "General information", - "score": 43.0 - }, - "MS MARCO (TREC) - # train": { - "description": "min=2, mean=2, max=2, sum=6 (3)", - "tab": "General information", - "score": 2.0 - }, - "MS MARCO (TREC) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "MS MARCO (TREC) - # prompt tokens": { - "description": "min=480.163, mean=519.496, max=566.163, sum=1558.488 (3)", - "tab": "General information", - "score": 519.4961240310078 - }, - "MS MARCO (TREC) - # output tokens": { - "description": "min=1, mean=1, max=1, sum=3 (3)", - "tab": "General information", - "score": 1.0 - }, - "MS MARCO (TREC) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "MS MARCO (regular) - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - 
"score": null - }, - "MS MARCO (TREC) - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - }, - "MS MARCO (TREC) - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CNN/DailyMail", - "source_data": { - "dataset_name": "CNN/DailyMail", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on CNN/DailyMail", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.121, - "details": { - "description": "min=0.116, mean=0.121, max=0.13, sum=0.728 (6)", - "tab": "Accuracy", - "CNN/DailyMail - Denoised inference time (s)": { - "description": "2 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "CNN/DailyMail - # eval": { - "description": "min=466, mean=466, max=466, sum=2796 (6)", - "tab": "General information", - "score": 466.0 - }, - "CNN/DailyMail - # train": { - "description": "min=5, mean=5, max=5, sum=30 (6)", - "tab": "General information", - "score": 5.0 - }, - "CNN/DailyMail - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (6)", - "tab": "General information", - "score": 0.0 - }, - "CNN/DailyMail - # prompt tokens": { - "description": "min=1555.036, mean=1575.036, max=1602.036, sum=9450.219 (6)", - "tab": "General information", - "score": 1575.0364806866953 - }, - "CNN/DailyMail - # output tokens": { - "description": "min=60.474, mean=68.601, max=77.918, sum=411.605 (6)", - "tab": "General information", - "score": 68.60085836909872 - }, - "CNN/DailyMail - # trials": { - "description": "min=3, mean=3, max=3, sum=18 (6)", - "tab": "General information", - "score": 3.0 - }, - "CNN/DailyMail - Stereotypes (race)": { - "description": "min=0.604, mean=0.612, max=0.618, sum=3.671 (6)", - "tab": "Bias", - "score": 0.6118203882651768 - }, - "CNN/DailyMail - Stereotypes (gender)": { - "description": "min=0.401, mean=0.408, max=0.419, sum=2.449 (6)", - "tab": "Bias", - "score": 0.408087030039703 - }, - "CNN/DailyMail - Representation (race)": { - "description": "min=0.257, mean=0.287, max=0.318, sum=1.72 (6)", - "tab": "Bias", - "score": 0.2867291116025263 - }, - "CNN/DailyMail - Representation (gender)": { - "description": "min=0.117, mean=0.141, max=0.159, sum=0.844 (6)", - "tab": "Bias", - "score": 0.14067727789435583 - }, - "CNN/DailyMail - Toxic fraction": { - "description": "min=0, mean=0.001, max=0.002, sum=0.004 (6)", - "tab": "Toxicity", - "score": 0.000715307582260372 - }, - "CNN/DailyMail - SummaC": { - "description": "min=0.231, mean=0.359, max=0.443, sum=1.077 (3)", - "tab": "Summarization 
metrics", - "score": 0.35895859214347764 - }, - "CNN/DailyMail - QAFactEval": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - BERTScore (F1)": { - "description": "min=0.195, mean=0.218, max=0.246, sum=0.654 (3)", - "tab": "Summarization metrics", - "score": 0.21796490870344257 - }, - "CNN/DailyMail - Coverage": { - "description": "min=0.801, mean=0.899, max=0.957, sum=5.391 (6)", - "tab": "Summarization metrics", - "score": 0.8985701854042452 - }, - "CNN/DailyMail - Density": { - "description": "min=16.696, mean=24.344, max=33.085, sum=146.063 (6)", - "tab": "Summarization metrics", - "score": 24.343863209587038 - }, - "CNN/DailyMail - Compression": { - "description": "min=9.239, mean=11.42, max=13.421, sum=68.523 (6)", - "tab": "Summarization metrics", - "score": 11.420494637224708 - }, - "CNN/DailyMail - HumanEval-faithfulness": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-relevance": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-coherence": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "XSUM", - "source_data": { - "dataset_name": "XSUM", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on XSUM", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.099, - "details": { - "description": "min=0.095, mean=0.099, max=0.106, sum=0.596 (6)", - "tab": "Accuracy", - "XSUM - Denoised inference time (s)": { - "description": "2 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "XSUM - # eval": { - "description": "min=518, mean=518, max=518, sum=3108 (6)", - "tab": "General information", - "score": 518.0 - }, - "XSUM - # train": { - "description": "min=4.996, mean=4.998, max=5, sum=29.988 (6)", - "tab": "General information", - "score": 4.998069498069498 - }, - "XSUM - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (6)", - "tab": "General information", - "score": 0.0 - }, - "XSUM - # prompt tokens": { - "description": "min=1484.608, mean=1537.452, max=1572.616, sum=9224.71 (6)", - "tab": "General information", - "score": 1537.4517374517375 - }, - "XSUM - # output tokens": { - "description": "min=23.5, mean=23.626, max=23.749, sum=141.757 (6)", - "tab": "General information", - "score": 23.626126126126128 - }, - "XSUM - # trials": { - "description": "min=3, mean=3, max=3, sum=18 (6)", - "tab": "General information", - "score": 3.0 - }, - "XSUM - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=4 (6)", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "XSUM - Stereotypes (gender)": { - "description": "min=0.424, mean=0.436, max=0.453, sum=2.616 (6)", - "tab": "Bias", - "score": 0.43605987410335234 - }, - "XSUM - Representation (race)": { - "description": "min=0.373, mean=0.393, max=0.404, sum=2.359 (6)", - "tab": "Bias", - "score": 0.393188854489164 - }, - "XSUM - Representation (gender)": { - "description": "min=0.181, 
mean=0.194, max=0.206, sum=1.165 (6)", - "tab": "Bias", - "score": 0.194128141174599 - }, - "XSUM - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (6)", - "tab": "Toxicity", - "score": 0.0 - }, - "XSUM - SummaC": { - "description": "min=-0.192, mean=-0.171, max=-0.149, sum=-0.513 (3)", - "tab": "Summarization metrics", - "score": -0.17113255308913036 - }, - "XSUM - QAFactEval": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - BERTScore (F1)": { - "description": "min=0.382, mean=0.384, max=0.388, sum=1.152 (3)", - "tab": "Summarization metrics", - "score": 0.38412741233326225 - }, - "XSUM - Coverage": { - "description": "min=0.842, mean=0.842, max=0.842, sum=5.051 (6)", - "tab": "Summarization metrics", - "score": 0.8418943137133965 - }, - "XSUM - Density": { - "description": "min=3.715, mean=3.815, max=3.914, sum=22.889 (6)", - "tab": "Summarization metrics", - "score": 3.8148335440941747 - }, - "XSUM - Compression": { - "description": "min=19.45, mean=19.703, max=19.907, sum=118.221 (6)", - "tab": "Summarization metrics", - "score": 19.7034371773279 - }, - "XSUM - HumanEval-faithfulness": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-relevance": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-coherence": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "IMDB", - "source_data": { - "dataset_name": "IMDB", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on IMDB", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.935, - "details": { - "description": "min=0.917, mean=0.935, max=0.947, sum=2.804 (3)", - "tab": "Accuracy", - "IMDB - ECE (10-bin)": { - "description": "min=0.335, mean=0.36, max=0.394, sum=1.079 (3)", - "tab": "Calibration", - "score": 0.3598306140598746 - }, - "IMDB - EM (Robustness)": { - "description": "min=0.878, mean=0.888, max=0.896, sum=2.665 (3)", - "tab": "Robustness", - "score": 0.8883333333333333 - }, - "IMDB - EM (Fairness)": { - "description": "min=0.896, mean=0.917, max=0.936, sum=2.752 (3)", - "tab": "Fairness", - "score": 0.9173333333333334 - }, - "IMDB - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "IMDB - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "IMDB - # train": { - "description": "min=2.903, mean=4.229, max=4.983, sum=12.688 (3)", - "tab": "General information", - "score": 4.229333333333333 - }, - "IMDB - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "IMDB - # prompt tokens": { - "description": "min=1283.038, mean=1562.808, max=1784.2, sum=4688.425 (3)", - "tab": "General information", - "score": 1562.8083333333334 - }, - "IMDB - # output tokens": { - "description": "min=1, mean=1.003, max=1.01, sum=3.01 (3)", - "tab": "General 
information", - "score": 1.0033333333333332 - }, - "IMDB - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "IMDB - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CivilComments", - "source_data": { - "dataset_name": "CivilComments", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on CivilComments", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5, - "details": { - "description": "min=0, mean=0.5, max=1, sum=27.019 (54)", - "tab": "Accuracy", - "CivilComments - ECE (10-bin)": { - "description": "min=0.265, mean=0.487, max=0.736, sum=26.317 (54)", - "tab": "Calibration", - "score": 0.4873543575629644 - }, - "CivilComments - EM (Robustness)": { - "description": "min=0, mean=0.353, max=0.931, sum=19.089 (54)", - "tab": "Robustness", - "score": 0.35349935695509527 - }, - "CivilComments - EM (Fairness)": { - "description": "min=0, mean=0.493, max=1, sum=26.609 (54)", - "tab": "Fairness", - "score": 0.49275536816045606 - }, - "CivilComments - Denoised inference time (s)": { - "description": "9 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "CivilComments - # eval": { - "description": "min=74, mean=371.556, max=683, sum=20064 (54)", - "tab": "General information", - "score": 371.55555555555554 - }, - "CivilComments - # train": { - "description": "min=5, mean=5, max=5, sum=270 (54)", - "tab": "General information", - "score": 5.0 - }, - "CivilComments - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (54)", - "tab": "General information", - "score": 0.0 - }, - "CivilComments - # prompt tokens": { - "description": "min=362.293, mean=732.514, max=1288.441, sum=39555.782 (54)", - "tab": "General information", - "score": 732.5144825548033 - }, - "CivilComments - # output tokens": { - "description": "min=1, mean=1, max=1, sum=54 (54)", - "tab": "General information", - "score": 1.0 - }, - "CivilComments - # trials": { - "description": "min=3, mean=3, max=3, sum=162 (54)", - "tab": "General information", - "score": 3.0 - }, - "CivilComments - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (54)", - "tab": 
"Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "RAFT", - "source_data": { - "dataset_name": "RAFT", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on RAFT", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.591, - "details": { - "description": "min=0.1, mean=0.591, max=0.975, sum=19.5 (33)", - "tab": "Accuracy", - "RAFT - ECE (10-bin)": { - "description": "min=0.11, mean=0.253, max=0.545, sum=8.337 (33)", - "tab": "Calibration", - "score": 0.25263340417043 - }, - "RAFT - EM (Robustness)": { - "description": "min=0.025, mean=0.502, max=0.975, sum=16.55 (33)", - "tab": "Robustness", - "score": 0.5015151515151515 - }, - "RAFT - EM (Fairness)": { - "description": "min=0.1, mean=0.571, max=0.975, sum=18.85 (33)", - "tab": "Fairness", - "score": 0.5712121212121212 - }, - "RAFT - Denoised inference time (s)": { - "description": "11 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "RAFT - # eval": { - "description": "min=40, mean=40, max=40, sum=1320 (33)", - "tab": "General information", - "score": 40.0 - }, - "RAFT - # train": { - "description": "min=0, mean=4.557, max=5, sum=150.375 (33)", - "tab": "General information", - "score": 4.556818181818182 - }, - "RAFT - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (33)", - "tab": "General information", - "score": 0.0 - }, - "RAFT - # prompt tokens": { - "description": "min=270.325, mean=814.446, max=1777.025, sum=26876.725 (33)", - "tab": "General information", - "score": 814.446212121212 - }, - "RAFT - # output tokens": { - "description": "min=0.575, mean=3.038, max=6.375, sum=100.25 (33)", - "tab": "General information", - "score": 3.0378787878787885 - }, - "RAFT - # trials": { - "description": "min=3, mean=3, max=3, sum=99 (33)", - "tab": "General information", - "score": 3.0 - }, - "RAFT - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (33)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_classic/cohere/Cohere-small-v20220720-410M/8e4f9ef2-8423-491d-b5e9-06128eb8fd32.json b/data/helm_classic/cohere/Cohere-small-v20220720-410M/8e4f9ef2-8423-491d-b5e9-06128eb8fd32.json deleted file mode 100644 index adaaa9403..000000000 --- a/data/helm_classic/cohere/Cohere-small-v20220720-410M/8e4f9ef2-8423-491d-b5e9-06128eb8fd32.json +++ /dev/null @@ -1,1613 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_classic/cohere_Cohere-small-v20220720-410M/1770834891.1472661", - "retrieved_timestamp": "1770834891.1472661", - "source_metadata": { - "source_name": "helm_classic", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Cohere small v20220720 410M", - "id": 
"cohere/Cohere-small-v20220720-410M", - "developer": "cohere", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_classic", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperform on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.109, - "details": { - "tab": "Accuracy", - "Mean win rate - Calibration": { - "description": null, - "tab": "Calibration", - "score": 0.6085000742339626 - }, - "Mean win rate - Robustness": { - "description": null, - "tab": "Robustness", - "score": 0.1469566826886926 - }, - "Mean win rate - Fairness": { - "description": null, - "tab": "Fairness", - "score": 0.15386697669576083 - }, - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.5343333333333333 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - }, - "Mean win rate - Bias": { - "description": null, - "tab": "Bias", - "score": 0.45155563090416306 - }, - "Mean win rate - Toxicity": { - "description": null, - "tab": "Toxicity", - "score": 0.412334270667604 - }, - "Mean win rate - Summarization metrics": { - "description": null, - "tab": "Summarization metrics", - "score": 0.29156223893065997 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.264, - "details": { - "description": "min=0.18, mean=0.264, max=0.42, sum=3.963 (15)", - "tab": "Accuracy", - "MMLU - ECE (10-bin)": { - "description": "min=0.049, mean=0.136, max=0.202, sum=2.04 (15)", - "tab": "Calibration", - "score": 0.13602108170852936 - }, - "MMLU - EM (Robustness)": { - "description": "min=0.13, mean=0.226, max=0.42, sum=3.397 (15)", - "tab": "Robustness", - "score": 0.22644444444444442 - }, - "MMLU - EM (Fairness)": { - "description": "min=0.1, mean=0.222, max=0.4, sum=3.334 (15)", - "tab": "Fairness", - "score": 0.22225730994152046 - }, - "MMLU - Denoised inference time (s)": { - "description": "min=0.265, mean=0.284, max=0.312, sum=4.267 (15)", - "tab": "Efficiency", - "score": 0.284456830180921 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=1542 (15)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=75 (15)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (15)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=372.75, mean=481.26, max=628.421, sum=7218.903 (15)", - "tab": "General information", - "score": 481.2602105263158 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=15 (15)", - "tab": "General information", - "score": 1.0 - }, - "MMLU 
- # trials": { - "description": "min=3, mean=3, max=3, sum=45 (15)", - "tab": "General information", - "score": 3.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "BoolQ", - "source_data": { - "dataset_name": "BoolQ", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on BoolQ", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.457, - "details": { - "description": "min=0.447, mean=0.457, max=0.464, sum=1.372 (3)", - "tab": "Accuracy", - "BoolQ - ECE (10-bin)": { - "description": "min=0.072, mean=0.095, max=0.124, sum=0.285 (3)", - "tab": "Calibration", - "score": 0.09496766959019069 - }, - "BoolQ - EM (Robustness)": { - "description": "min=0.352, mean=0.361, max=0.378, sum=1.083 (3)", - "tab": "Robustness", - "score": 0.361 - }, - "BoolQ - EM (Fairness)": { - "description": "min=0.346, mean=0.374, max=0.396, sum=1.121 (3)", - "tab": "Fairness", - "score": 0.37366666666666665 - }, - "BoolQ - Denoised inference time (s)": { - "description": "min=0.319, mean=0.367, max=0.436, sum=1.101 (3)", - "tab": "Efficiency", - "score": 0.36694511328125 - }, - "BoolQ - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "BoolQ - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "BoolQ - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "BoolQ - # prompt tokens": { - "description": "min=669.307, mean=925.307, max=1269.307, sum=2775.921 (3)", - "tab": "General information", - "score": 925.3070000000001 - }, - "BoolQ - # output tokens": { - "description": "min=1, mean=1.001, max=1.004, sum=3.004 (3)", - "tab": "General information", - "score": 1.0013333333333334 - }, - "BoolQ - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "BoolQ - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.294, - "details": { - "description": "min=0.281, mean=0.294, max=0.309, sum=0.881 (3)", - "tab": "Accuracy", - "NarrativeQA - ECE (10-bin)": { - "description": "min=0.029, mean=0.031, max=0.033, sum=0.093 (3)", - "tab": "Calibration", - "score": 0.031094283389380417 - }, - "NarrativeQA - F1 
(Robustness)": { - "description": "min=0.076, mean=0.078, max=0.081, sum=0.235 (3)", - "tab": "Robustness", - "score": 0.07821074014295328 - }, - "NarrativeQA - F1 (Fairness)": { - "description": "min=0.167, mean=0.179, max=0.197, sum=0.538 (3)", - "tab": "Fairness", - "score": 0.17918507973514153 - }, - "NarrativeQA - Denoised inference time (s)": { - "description": "min=0.544, mean=0.56, max=0.583, sum=1.681 (3)", - "tab": "Efficiency", - "score": 0.5603894916373239 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=1065 (3)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=0.958, mean=1.562, max=1.997, sum=4.687 (3)", - "tab": "General information", - "score": 1.5624413145539906 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=1601.997, mean=1634.99, max=1693.155, sum=4904.969 (3)", - "tab": "General information", - "score": 1634.9896713615024 - }, - "NarrativeQA - # output tokens": { - "description": "min=8.149, mean=11.007, max=15.597, sum=33.02 (3)", - "tab": "General information", - "score": 11.006572769953053 - }, - "NarrativeQA - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NarrativeQA - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=0.667 (1)", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "NarrativeQA - Stereotypes (gender)": { - "description": "min=0.411, mean=0.418, max=0.429, sum=1.255 (3)", - "tab": "Bias", - "score": 0.4184126984126984 - }, - "NarrativeQA - Representation (race)": { - "description": "min=0.333, mean=0.556, max=0.667, sum=1.667 (3)", - "tab": "Bias", - "score": 0.5555555555555556 - }, - "NarrativeQA - Representation (gender)": { - "description": "min=0.186, mean=0.202, max=0.217, sum=0.606 (3)", - "tab": "Bias", - "score": 0.20205501924662395 - }, - "NarrativeQA - Toxic fraction": { - "description": "min=0.025, mean=0.027, max=0.031, sum=0.082 (3)", - "tab": "Toxicity", - "score": 0.027230046948356807 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (open-book)", - "source_data": { - "dataset_name": "NaturalQuestions (open-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (open-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.309, - "details": { - "description": "min=0.291, mean=0.309, max=0.334, sum=0.928 (3)", - "tab": "Accuracy", - "NaturalQuestions (closed-book) - ECE (10-bin)": { - "description": "min=0.02, mean=0.023, max=0.027, sum=0.07 (3)", - "tab": "Calibration", - "score": 0.023328620693919305 - }, - "NaturalQuestions (open-book) - ECE (10-bin)": { - "description": "min=0.18, mean=0.198, max=0.221, sum=0.594 (3)", - "tab": "Calibration", - "score": 0.198062019189297 - }, - "NaturalQuestions (closed-book) - F1 (Robustness)": { - "description": "min=0.024, mean=0.025, max=0.027, sum=0.075 (3)", - "tab": "Robustness", - "score": 0.025009279663584086 - }, - "NaturalQuestions (open-book) - F1 (Robustness)": { - "description": "min=0.066, mean=0.074, max=0.08, 
sum=0.222 (3)", - "tab": "Robustness", - "score": 0.07408175909872887 - }, - "NaturalQuestions (closed-book) - F1 (Fairness)": { - "description": "min=0.052, mean=0.055, max=0.062, sum=0.166 (3)", - "tab": "Fairness", - "score": 0.055406816944260924 - }, - "NaturalQuestions (open-book) - F1 (Fairness)": { - "description": "min=0.198, mean=0.219, max=0.246, sum=0.657 (3)", - "tab": "Fairness", - "score": 0.21887630944724534 - }, - "NaturalQuestions (closed-book) - Denoised inference time (s)": { - "description": "min=0.246, mean=0.251, max=0.259, sum=0.753 (3)", - "tab": "Efficiency", - "score": 0.2509381953124994 - }, - "NaturalQuestions (open-book) - Denoised inference time (s)": { - "description": "min=0.552, mean=0.605, max=0.643, sum=1.815 (3)", - "tab": "Efficiency", - "score": 0.6049964999999996 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=109.191, mean=111.191, max=115.191, sum=333.573 (3)", - "tab": "General information", - "score": 111.19099999999999 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=4.325, mean=5.149, max=6.46, sum=15.446 (3)", - "tab": "General information", - "score": 5.148666666666667 - }, - "NaturalQuestions (closed-book) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.538, mean=4.633, max=4.715, sum=13.899 (3)", - "tab": "General information", - "score": 4.633 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.039, mean=0.039, max=0.039, sum=0.117 (3)", - "tab": "General information", - "score": 0.039 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1261.72, mean=1481.344, max=1608.455, sum=4444.032 (3)", - "tab": "General information", - "score": 1481.344 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=20.452, mean=22.835, max=25.41, sum=68.505 (3)", - "tab": "General information", - "score": 22.834999999999997 - }, - "NaturalQuestions (open-book) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NaturalQuestions (closed-book) - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NaturalQuestions (closed-book) - Stereotypes (gender)": { - "description": "min=0.5, mean=0.5, max=0.5, sum=1 (2)", - "tab": "Bias", - "score": 0.5 - }, - "NaturalQuestions (closed-book) - Representation (race)": { - "description": "min=0.238, mean=0.415, max=0.539, sum=1.244 (3)", - "tab": "Bias", - "score": 0.41471861471861476 - }, - "NaturalQuestions (closed-book) - Representation (gender)": { - "description": "min=0.167, mean=0.234, max=0.286, sum=0.702 (3)", - "tab": "Bias", - "score": 0.2341269841269841 - }, - "NaturalQuestions (open-book) - 
Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=2 (3)", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "NaturalQuestions (open-book) - Stereotypes (gender)": { - "description": "min=0.48, mean=0.485, max=0.494, sum=1.455 (3)", - "tab": "Bias", - "score": 0.48499285130718955 - }, - "NaturalQuestions (open-book) - Representation (race)": { - "description": "min=0.382, mean=0.435, max=0.467, sum=1.306 (3)", - "tab": "Bias", - "score": 0.43543086336382425 - }, - "NaturalQuestions (open-book) - Representation (gender)": { - "description": "min=0.234, mean=0.265, max=0.3, sum=0.796 (3)", - "tab": "Bias", - "score": 0.2653339127915399 - }, - "NaturalQuestions (closed-book) - Toxic fraction": { - "description": "min=0, mean=0.001, max=0.001, sum=0.002 (3)", - "tab": "Toxicity", - "score": 0.0006666666666666666 - }, - "NaturalQuestions (open-book) - Toxic fraction": { - "description": "min=0.002, mean=0.003, max=0.003, sum=0.008 (3)", - "tab": "Toxicity", - "score": 0.0026666666666666666 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "QuAC", - "source_data": { - "dataset_name": "QuAC", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on QuAC", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.219, - "details": { - "description": "min=0.208, mean=0.219, max=0.238, sum=0.656 (3)", - "tab": "Accuracy", - "QuAC - ECE (10-bin)": { - "description": "min=0.03, mean=0.036, max=0.042, sum=0.108 (3)", - "tab": "Calibration", - "score": 0.035862172954873824 - }, - "QuAC - F1 (Robustness)": { - "description": "min=0.094, mean=0.098, max=0.101, sum=0.293 (3)", - "tab": "Robustness", - "score": 0.09766108203425072 - }, - "QuAC - F1 (Fairness)": { - "description": "min=0.135, mean=0.144, max=0.162, sum=0.433 (3)", - "tab": "Fairness", - "score": 0.14446776305873513 - }, - "QuAC - Denoised inference time (s)": { - "description": "min=0.611, mean=0.619, max=0.625, sum=1.856 (3)", - "tab": "Efficiency", - "score": 0.6185995332031252 - }, - "QuAC - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "QuAC - # train": { - "description": "min=0.797, mean=0.881, max=0.969, sum=2.644 (3)", - "tab": "General information", - "score": 0.8813333333333334 - }, - "QuAC - truncated": { - "description": "min=0.02, mean=0.02, max=0.02, sum=0.06 (3)", - "tab": "General information", - "score": 0.02 - }, - "QuAC - # prompt tokens": { - "description": "min=1600.292, mean=1639.784, max=1661.675, sum=4919.353 (3)", - "tab": "General information", - "score": 1639.784333333333 - }, - "QuAC - # output tokens": { - "description": "min=18.807, mean=20.639, max=21.99, sum=61.916 (3)", - "tab": "General information", - "score": 20.638666666666666 - }, - "QuAC - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "QuAC - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=2 (3)", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "QuAC - Stereotypes (gender)": { - "description": "min=0.447, mean=0.458, max=0.468, sum=1.375 (3)", - "tab": "Bias", - "score": 0.45823351891324243 - }, - "QuAC - Representation (race)": { - "description": 
"min=0.329, mean=0.341, max=0.364, sum=1.022 (3)", - "tab": "Bias", - "score": 0.34075560523096593 - }, - "QuAC - Representation (gender)": { - "description": "min=0.277, mean=0.285, max=0.299, sum=0.854 (3)", - "tab": "Bias", - "score": 0.2847879707506289 - }, - "QuAC - Toxic fraction": { - "description": "min=0.001, mean=0.003, max=0.004, sum=0.008 (3)", - "tab": "Toxicity", - "score": 0.0026666666666666666 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "HellaSwag", - "source_data": { - "dataset_name": "HellaSwag", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on HellaSwag", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.483, - "details": { - "description": "min=0.483, mean=0.483, max=0.483, sum=0.483 (1)", - "tab": "Accuracy", - "HellaSwag - ECE (10-bin)": { - "description": "min=0.083, mean=0.083, max=0.083, sum=0.083 (1)", - "tab": "Calibration", - "score": 0.08312318484699062 - }, - "HellaSwag - EM (Robustness)": { - "description": "min=0.405, mean=0.405, max=0.405, sum=0.405 (1)", - "tab": "Robustness", - "score": 0.405 - }, - "HellaSwag - EM (Fairness)": { - "description": "min=0.308, mean=0.308, max=0.308, sum=0.308 (1)", - "tab": "Fairness", - "score": 0.308 - }, - "HellaSwag - Denoised inference time (s)": { - "description": "min=0.223, mean=0.223, max=0.223, sum=0.223 (1)", - "tab": "Efficiency", - "score": 0.22341269531249972 - }, - "HellaSwag - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "HellaSwag - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag - # prompt tokens": { - "description": "min=88.855, mean=88.855, max=88.855, sum=88.855 (1)", - "tab": "General information", - "score": 88.855 - }, - "HellaSwag - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.348, - "details": { - "description": "min=0.348, mean=0.348, max=0.348, sum=0.348 (1)", - "tab": "Accuracy", - "OpenbookQA - ECE (10-bin)": { - "description": "min=0.379, mean=0.379, max=0.379, sum=0.379 (1)", - "tab": "Calibration", - "score": 0.37852917669250147 - }, - "OpenbookQA - EM (Robustness)": { - "description": "min=0.238, mean=0.238, max=0.238, sum=0.238 (1)", - "tab": "Robustness", - "score": 0.238 - }, - "OpenbookQA - EM (Fairness)": { - "description": "min=0.28, mean=0.28, max=0.28, sum=0.28 (1)", - 
"tab": "Fairness", - "score": 0.28 - }, - "OpenbookQA - Denoised inference time (s)": { - "description": "min=0.214, mean=0.214, max=0.214, sum=0.214 (1)", - "tab": "Efficiency", - "score": 0.2136278906249995 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=5.358, mean=5.358, max=5.358, sum=5.358 (1)", - "tab": "General information", - "score": 5.358 - }, - "OpenbookQA - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "TruthfulQA", - "source_data": { - "dataset_name": "TruthfulQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on TruthfulQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.217, - "details": { - "description": "min=0.202, mean=0.217, max=0.226, sum=0.65 (3)", - "tab": "Accuracy", - "TruthfulQA - ECE (10-bin)": { - "description": "min=0.059, mean=0.076, max=0.098, sum=0.229 (3)", - "tab": "Calibration", - "score": 0.07625390965133329 - }, - "TruthfulQA - EM (Robustness)": { - "description": "min=0.2, mean=0.204, max=0.211, sum=0.612 (3)", - "tab": "Robustness", - "score": 0.2038735983690112 - }, - "TruthfulQA - EM (Fairness)": { - "description": "min=0.194, mean=0.203, max=0.214, sum=0.609 (3)", - "tab": "Fairness", - "score": 0.20285423037716613 - }, - "TruthfulQA - Denoised inference time (s)": { - "description": "min=0.287, mean=0.289, max=0.295, sum=0.868 (3)", - "tab": "Efficiency", - "score": 0.2894203160837155 - }, - "TruthfulQA - # eval": { - "description": "min=654, mean=654, max=654, sum=1962 (3)", - "tab": "General information", - "score": 654.0 - }, - "TruthfulQA - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "TruthfulQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "TruthfulQA - # prompt tokens": { - "description": "min=505.315, mean=514.648, max=532.315, sum=1543.945 (3)", - "tab": "General information", - "score": 514.6483180428135 - }, - "TruthfulQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=3 (3)", - "tab": "General information", - "score": 1.0 - }, - "TruthfulQA - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MS MARCO (TREC)", - "source_data": { - "dataset_name": "MS MARCO (TREC)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - 
"evaluation_description": "NDCG@10 on MS MARCO (TREC)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.304, - "details": { - "description": "min=0.258, mean=0.304, max=0.338, sum=0.911 (3)", - "tab": "Accuracy", - "MS MARCO (regular) - RR@10 (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "MS MARCO (TREC) - NDCG@10 (Robustness)": { - "description": "min=0.22, mean=0.252, max=0.287, sum=0.757 (3)", - "tab": "Robustness", - "score": 0.2521940956196658 - }, - "MS MARCO (regular) - RR@10 (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "MS MARCO (TREC) - NDCG@10 (Fairness)": { - "description": "min=0.228, mean=0.28, max=0.324, sum=0.84 (3)", - "tab": "Fairness", - "score": 0.2798487582673837 - }, - "MS MARCO (regular) - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "MS MARCO (TREC) - Denoised inference time (s)": { - "description": "min=0.282, mean=0.291, max=0.303, sum=0.872 (3)", - "tab": "Efficiency", - "score": 0.29054985767926356 - }, - "MS MARCO (regular) - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # eval": { - "description": "min=43, mean=43, max=43, sum=129 (3)", - "tab": "General information", - "score": 43.0 - }, - "MS MARCO (TREC) - # train": { - "description": "min=2, mean=2, max=2, sum=6 (3)", - "tab": "General information", - "score": 2.0 - }, - "MS MARCO (TREC) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "MS MARCO (TREC) - # prompt tokens": { - "description": "min=480.163, mean=519.496, max=566.163, sum=1558.488 (3)", - "tab": "General information", - "score": 519.4961240310078 - }, - "MS MARCO (TREC) - # output tokens": { - "description": "min=1, mean=1.031, max=1.093, sum=3.093 (3)", - "tab": "General information", - "score": 1.0310077519379846 - }, - "MS MARCO (TREC) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "MS MARCO (regular) - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - 
Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - }, - "MS MARCO (TREC) - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CNN/DailyMail", - "source_data": { - "dataset_name": "CNN/DailyMail", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on CNN/DailyMail", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.063, - "details": { - "description": "min=0.031, mean=0.063, max=0.087, sum=0.377 (6)", - "tab": "Accuracy", - "CNN/DailyMail - Denoised inference time (s)": { - "description": "min=0.781, mean=0.954, max=1.052, sum=5.724 (6)", - "tab": "Efficiency", - "score": 0.9539734693535404 - }, - "CNN/DailyMail - # eval": { - "description": "min=466, mean=466, max=466, sum=2796 (6)", - "tab": "General information", - "score": 466.0 - }, - "CNN/DailyMail - # train": { - "description": "min=5, mean=5, max=5, sum=30 (6)", - "tab": "General information", - "score": 5.0 - }, - "CNN/DailyMail - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (6)", - "tab": "General information", - "score": 0.0 - }, - "CNN/DailyMail - # prompt tokens": { - "description": "min=1555.036, mean=1575.036, max=1602.036, sum=9450.219 (6)", - "tab": "General information", - "score": 1575.0364806866953 - }, - "CNN/DailyMail - # output tokens": { - "description": "min=49.71, mean=78.352, max=93.899, sum=470.112 (6)", - "tab": "General information", - "score": 78.3519313304721 - }, - "CNN/DailyMail - # trials": { - "description": "min=3, mean=3, max=3, sum=18 (6)", - "tab": "General information", - "score": 3.0 - }, - "CNN/DailyMail - Stereotypes (race)": { - "description": "min=0.625, mean=0.648, max=0.667, sum=3.885 (6)", - "tab": "Bias", - "score": 0.6475615887380594 - }, - "CNN/DailyMail - Stereotypes (gender)": { - "description": "min=0.405, mean=0.42, max=0.449, sum=2.522 (6)", - "tab": "Bias", - "score": 0.4203329386778049 - }, - "CNN/DailyMail - Representation (race)": { - "description": "min=0.099, mean=0.145, max=0.201, sum=0.868 (6)", - "tab": "Bias", - "score": 0.14468337947687135 - }, - "CNN/DailyMail - Representation (gender)": { - "description": "min=0.163, mean=0.182, max=0.21, sum=1.09 (6)", - "tab": "Bias", - "score": 0.18171396544569016 - }, - "CNN/DailyMail - Toxic fraction": { - "description": "min=0, mean=0.001, max=0.002, sum=0.009 (6)", - "tab": "Toxicity", - "score": 0.001430615164520744 - }, - "CNN/DailyMail - SummaC": { - "description": "min=-0.077, mean=0.054, max=0.168, sum=0.161 (3)", - "tab": "Summarization metrics", - "score": 0.053643734154981075 - }, - "CNN/DailyMail - QAFactEval": { - "description": "min=0.051, mean=2.638, max=4.057, sum=15.831 (6)", - "tab": "Summarization metrics", - "score": 
2.6384596103973283 - }, - "CNN/DailyMail - BERTScore (F1)": { - "description": "min=-0.069, mean=0.026, max=0.075, sum=0.077 (3)", - "tab": "Summarization metrics", - "score": 0.025643326292308758 - }, - "CNN/DailyMail - Coverage": { - "description": "min=0.532, mean=0.744, max=0.913, sum=4.465 (6)", - "tab": "Summarization metrics", - "score": 0.7441391663831297 - }, - "CNN/DailyMail - Density": { - "description": "min=11.632, mean=25.238, max=33.415, sum=151.427 (6)", - "tab": "Summarization metrics", - "score": 25.237906513316556 - }, - "CNN/DailyMail - Compression": { - "description": "min=9.053, mean=13.243, max=20.787, sum=79.46 (6)", - "tab": "Summarization metrics", - "score": 13.243377373187593 - }, - "CNN/DailyMail - HumanEval-faithfulness": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-relevance": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-coherence": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "XSUM", - "source_data": { - "dataset_name": "XSUM", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on XSUM", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.033, - "details": { - "description": "min=0.031, mean=0.033, max=0.037, sum=0.199 (6)", - "tab": "Accuracy", - "XSUM - Denoised inference time (s)": { - "description": "min=0.637, mean=0.642, max=0.649, sum=3.85 (6)", - "tab": "Efficiency", - "score": 0.6416181225868728 - }, - "XSUM - # eval": { - "description": "min=518, mean=518, max=518, sum=3108 (6)", - "tab": "General information", - "score": 518.0 - }, - "XSUM - # train": { - "description": "min=4.996, mean=4.998, max=5, sum=29.988 (6)", - "tab": "General information", - "score": 4.998069498069498 - }, - "XSUM - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (6)", - "tab": "General information", - "score": 0.0 - }, - "XSUM - # prompt tokens": { - "description": "min=1484.608, mean=1537.452, max=1572.616, sum=9224.71 (6)", - "tab": "General information", - "score": 1537.4517374517375 - }, - "XSUM - # output tokens": { - "description": "min=25.859, mean=27.394, max=28.226, sum=164.363 (6)", - "tab": "General information", - "score": 27.393822393822393 - }, - "XSUM - # trials": { - "description": "min=3, mean=3, max=3, sum=18 (6)", - "tab": "General information", - "score": 3.0 - }, - "XSUM - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=4 (6)", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "XSUM - Stereotypes (gender)": { - "description": "min=0.399, mean=0.43, max=0.493, sum=2.58 (6)", - "tab": "Bias", - "score": 0.43004930254930257 - }, - "XSUM - Representation (race)": { - "description": "min=0.542, mean=0.556, max=0.583, sum=3.333 (6)", - "tab": "Bias", - "score": 0.5555555555555556 - }, - "XSUM - Representation (gender)": { - "description": "min=0.224, mean=0.246, max=0.283, sum=1.474 (6)", - "tab": "Bias", - "score": 0.2457025240044108 - }, - "XSUM - Toxic fraction": { - "description": "min=0, mean=0.001, 
max=0.002, sum=0.004 (6)", - "tab": "Toxicity", - "score": 0.0006435006435006435 - }, - "XSUM - SummaC": { - "description": "min=0.0, mean=0.028, max=0.073, sum=0.085 (3)", - "tab": "Summarization metrics", - "score": 0.02834827232857105 - }, - "XSUM - QAFactEval": { - "description": "min=2.873, mean=3.094, max=3.373, sum=18.563 (6)", - "tab": "Summarization metrics", - "score": 3.0938511325795113 - }, - "XSUM - BERTScore (F1)": { - "description": "min=0.173, mean=0.195, max=0.221, sum=0.585 (3)", - "tab": "Summarization metrics", - "score": 0.1951040609680371 - }, - "XSUM - Coverage": { - "description": "min=0.853, mean=0.863, max=0.87, sum=5.178 (6)", - "tab": "Summarization metrics", - "score": 0.8630576414302875 - }, - "XSUM - Density": { - "description": "min=9.489, mean=10.557, max=12.063, sum=63.341 (6)", - "tab": "Summarization metrics", - "score": 10.556911526268395 - }, - "XSUM - Compression": { - "description": "min=16.738, mean=17.551, max=18.157, sum=105.306 (6)", - "tab": "Summarization metrics", - "score": 17.55096225657148 - }, - "XSUM - HumanEval-faithfulness": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-relevance": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-coherence": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "IMDB", - "source_data": { - "dataset_name": "IMDB", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on IMDB", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.578, - "details": { - "description": "min=0.53, mean=0.578, max=0.618, sum=1.735 (3)", - "tab": "Accuracy", - "IMDB - ECE (10-bin)": { - "description": "min=0.085, mean=0.134, max=0.174, sum=0.401 (3)", - "tab": "Calibration", - "score": 0.13354341899719424 - }, - "IMDB - EM (Robustness)": { - "description": "min=0.447, mean=0.473, max=0.498, sum=1.418 (3)", - "tab": "Robustness", - "score": 0.4726666666666666 - }, - "IMDB - EM (Fairness)": { - "description": "min=0.49, mean=0.518, max=0.54, sum=1.554 (3)", - "tab": "Fairness", - "score": 0.518 - }, - "IMDB - Denoised inference time (s)": { - "description": "min=0.414, mean=0.458, max=0.52, sum=1.373 (3)", - "tab": "Efficiency", - "score": 0.45773176757812467 - }, - "IMDB - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "IMDB - # train": { - "description": "min=4.846, mean=4.93, max=4.98, sum=14.79 (3)", - "tab": "General information", - "score": 4.930000000000001 - }, - "IMDB - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "IMDB - # prompt tokens": { - "description": "min=1161.854, mean=1398.654, max=1747.025, sum=4195.961 (3)", - "tab": "General information", - "score": 1398.6536666666668 - }, - "IMDB - # output tokens": { - "description": "min=1, mean=1, max=1, sum=3 (3)", - "tab": "General information", - "score": 1.0 - }, - "IMDB - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - 
"tab": "General information", - "score": 3.0 - }, - "IMDB - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CivilComments", - "source_data": { - "dataset_name": "CivilComments", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on CivilComments", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.501, - "details": { - "description": "min=0, mean=0.501, max=1, sum=27.062 (54)", - "tab": "Accuracy", - "CivilComments - ECE (10-bin)": { - "description": "min=0.201, mean=0.486, max=0.8, sum=26.269 (54)", - "tab": "Calibration", - "score": 0.4864679961449666 - }, - "CivilComments - EM (Robustness)": { - "description": "min=0, mean=0.434, max=1, sum=23.451 (54)", - "tab": "Robustness", - "score": 0.4342847473494527 - }, - "CivilComments - EM (Fairness)": { - "description": "min=0, mean=0.495, max=1, sum=26.744 (54)", - "tab": "Fairness", - "score": 0.49526155082406725 - }, - "CivilComments - Denoised inference time (s)": { - "description": "min=0.264, mean=0.329, max=0.439, sum=17.76 (54)", - "tab": "Efficiency", - "score": 0.32889709084919744 - }, - "CivilComments - # eval": { - "description": "min=74, mean=371.556, max=683, sum=20064 (54)", - "tab": "General information", - "score": 371.55555555555554 - }, - "CivilComments - # train": { - "description": "min=5, mean=5, max=5, sum=270 (54)", - "tab": "General information", - "score": 5.0 - }, - "CivilComments - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (54)", - "tab": "General information", - "score": 0.0 - }, - "CivilComments - # prompt tokens": { - "description": "min=362.293, mean=732.514, max=1288.441, sum=39555.782 (54)", - "tab": "General information", - "score": 732.5144825548033 - }, - "CivilComments - # output tokens": { - "description": "min=1, mean=1, max=1, sum=54 (54)", - "tab": "General information", - "score": 1.0 - }, - "CivilComments - # trials": { - "description": "min=3, mean=3, max=3, sum=162 (54)", - "tab": "General information", - "score": 3.0 - }, - "CivilComments - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (54)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - 
"evaluation_name": "RAFT", - "source_data": { - "dataset_name": "RAFT", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on RAFT", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.492, - "details": { - "description": "min=0, mean=0.492, max=0.975, sum=16.225 (33)", - "tab": "Accuracy", - "RAFT - ECE (10-bin)": { - "description": "min=0.084, mean=0.234, max=0.631, sum=7.714 (33)", - "tab": "Calibration", - "score": 0.23374335739699753 - }, - "RAFT - EM (Robustness)": { - "description": "min=0, mean=0.403, max=0.975, sum=13.3 (33)", - "tab": "Robustness", - "score": 0.40303030303030307 - }, - "RAFT - EM (Fairness)": { - "description": "min=0, mean=0.452, max=0.975, sum=14.9 (33)", - "tab": "Fairness", - "score": 0.4515151515151515 - }, - "RAFT - Denoised inference time (s)": { - "description": "min=0.256, mean=0.36, max=0.547, sum=11.878 (33)", - "tab": "Efficiency", - "score": 0.3599495087594697 - }, - "RAFT - # eval": { - "description": "min=40, mean=40, max=40, sum=1320 (33)", - "tab": "General information", - "score": 40.0 - }, - "RAFT - # train": { - "description": "min=0, mean=4.557, max=5, sum=150.375 (33)", - "tab": "General information", - "score": 4.556818181818182 - }, - "RAFT - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (33)", - "tab": "General information", - "score": 0.0 - }, - "RAFT - # prompt tokens": { - "description": "min=270.325, mean=814.446, max=1777.025, sum=26876.725 (33)", - "tab": "General information", - "score": 814.446212121212 - }, - "RAFT - # output tokens": { - "description": "min=1, mean=3.239, max=5.575, sum=106.9 (33)", - "tab": "General information", - "score": 3.2393939393939393 - }, - "RAFT - # trials": { - "description": "min=3, mean=3, max=3, sum=99 (33)", - "tab": "General information", - "score": 3.0 - }, - "RAFT - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (gender)": { - "description": "min=0.5, mean=0.5, max=0.5, sum=1.5 (3)", - "tab": "Bias", - "score": 0.5 - }, - "RAFT - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (33)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_classic/cohere/Cohere-xlarge-v20220609-52.4B/8d2665d6-55fb-4d0c-8d6d-48cd43f27ff2.json b/data/helm_classic/cohere/Cohere-xlarge-v20220609-52.4B/8d2665d6-55fb-4d0c-8d6d-48cd43f27ff2.json deleted file mode 100644 index 80b637746..000000000 --- a/data/helm_classic/cohere/Cohere-xlarge-v20220609-52.4B/8d2665d6-55fb-4d0c-8d6d-48cd43f27ff2.json +++ /dev/null @@ -1,1613 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_classic/cohere_Cohere-xlarge-v20220609-52.4B/1770834891.1472661", - "retrieved_timestamp": "1770834891.1472661", - "source_metadata": { - "source_name": "helm_classic", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Cohere xlarge v20220609 52.4B", - "id": "cohere/Cohere-xlarge-v20220609-52.4B", - 
"developer": "cohere", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_classic", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperform on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.56, - "details": { - "tab": "Accuracy", - "Mean win rate - Calibration": { - "description": null, - "tab": "Calibration", - "score": 0.5427202179052317 - }, - "Mean win rate - Robustness": { - "description": null, - "tab": "Robustness", - "score": 0.5061059259613209 - }, - "Mean win rate - Fairness": { - "description": null, - "tab": "Fairness", - "score": 0.5496737226436893 - }, - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.1992872807017544 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - }, - "Mean win rate - Bias": { - "description": null, - "tab": "Bias", - "score": 0.5983741692925366 - }, - "Mean win rate - Toxicity": { - "description": null, - "tab": "Toxicity", - "score": 0.5744286577619911 - }, - "Mean win rate - Summarization metrics": { - "description": null, - "tab": "Summarization metrics", - "score": 0.546345029239766 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.353, - "details": { - "description": "min=0.228, mean=0.353, max=0.56, sum=5.296 (15)", - "tab": "Accuracy", - "MMLU - ECE (10-bin)": { - "description": "min=0.089, mean=0.149, max=0.246, sum=2.242 (15)", - "tab": "Calibration", - "score": 0.14945785718149934 - }, - "MMLU - EM (Robustness)": { - "description": "min=0.158, mean=0.29, max=0.51, sum=4.349 (15)", - "tab": "Robustness", - "score": 0.28992982456140354 - }, - "MMLU - EM (Fairness)": { - "description": "min=0.158, mean=0.315, max=0.53, sum=4.729 (15)", - "tab": "Fairness", - "score": 0.31526315789473686 - }, - "MMLU - Denoised inference time (s)": { - "description": "min=0.47, mean=0.489, max=0.506, sum=7.328 (15)", - "tab": "Efficiency", - "score": 0.4885340888157895 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=1542 (15)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=75 (15)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (15)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=372.75, mean=481.26, max=628.421, sum=7218.903 (15)", - "tab": "General information", - "score": 481.2602105263158 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=15 (15)", - "tab": "General information", - "score": 1.0 - }, - "MMLU - # trials": { - "description": "min=3, 
mean=3, max=3, sum=45 (15)", - "tab": "General information", - "score": 3.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "BoolQ", - "source_data": { - "dataset_name": "BoolQ", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on BoolQ", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.718, - "details": { - "description": "min=0.702, mean=0.718, max=0.74, sum=2.153 (3)", - "tab": "Accuracy", - "BoolQ - ECE (10-bin)": { - "description": "min=0.037, mean=0.04, max=0.043, sum=0.119 (3)", - "tab": "Calibration", - "score": 0.039674216829776156 - }, - "BoolQ - EM (Robustness)": { - "description": "min=0.601, mean=0.614, max=0.622, sum=1.842 (3)", - "tab": "Robustness", - "score": 0.614 - }, - "BoolQ - EM (Fairness)": { - "description": "min=0.657, mean=0.667, max=0.681, sum=2 (3)", - "tab": "Fairness", - "score": 0.6666666666666666 - }, - "BoolQ - Denoised inference time (s)": { - "description": "min=0.519, mean=0.598, max=0.705, sum=1.795 (3)", - "tab": "Efficiency", - "score": 0.5984045305989586 - }, - "BoolQ - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "BoolQ - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "BoolQ - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "BoolQ - # prompt tokens": { - "description": "min=669.307, mean=925.307, max=1269.307, sum=2775.921 (3)", - "tab": "General information", - "score": 925.3070000000001 - }, - "BoolQ - # output tokens": { - "description": "min=1, mean=1.001, max=1.004, sum=3.004 (3)", - "tab": "General information", - "score": 1.0013333333333334 - }, - "BoolQ - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "BoolQ - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.65, - "details": { - "description": "min=0.593, mean=0.65, max=0.688, sum=1.95 (3)", - "tab": "Accuracy", - "NarrativeQA - ECE (10-bin)": { - "description": "min=0.048, mean=0.062, max=0.079, sum=0.185 (3)", - "tab": "Calibration", - "score": 0.061654179655226814 - }, - "NarrativeQA - F1 (Robustness)": { - "description": "min=0.331, 
mean=0.383, max=0.42, sum=1.148 (3)", - "tab": "Robustness", - "score": 0.38251983624053415 - }, - "NarrativeQA - F1 (Fairness)": { - "description": "min=0.481, mean=0.548, max=0.591, sum=1.644 (3)", - "tab": "Fairness", - "score": 0.5478470147843514 - }, - "NarrativeQA - Denoised inference time (s)": { - "description": "min=1.025, mean=1.062, max=1.132, sum=3.185 (3)", - "tab": "Efficiency", - "score": 1.061820745305164 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=1065 (3)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=0.958, mean=1.562, max=1.997, sum=4.687 (3)", - "tab": "General information", - "score": 1.5624413145539906 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=1601.997, mean=1634.99, max=1693.155, sum=4904.969 (3)", - "tab": "General information", - "score": 1634.9896713615024 - }, - "NarrativeQA - # output tokens": { - "description": "min=5.794, mean=7.077, max=9.031, sum=21.231 (3)", - "tab": "General information", - "score": 7.07699530516432 - }, - "NarrativeQA - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NarrativeQA - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NarrativeQA - Stereotypes (gender)": { - "description": "min=0.396, mean=0.454, max=0.5, sum=1.362 (3)", - "tab": "Bias", - "score": 0.4541666666666666 - }, - "NarrativeQA - Representation (race)": { - "description": "min=0.333, mean=0.556, max=0.667, sum=1.667 (3)", - "tab": "Bias", - "score": 0.5555555555555557 - }, - "NarrativeQA - Representation (gender)": { - "description": "min=0.204, mean=0.208, max=0.215, sum=0.624 (3)", - "tab": "Bias", - "score": 0.20801619481196945 - }, - "NarrativeQA - Toxic fraction": { - "description": "min=0.011, mean=0.021, max=0.028, sum=0.062 (3)", - "tab": "Toxicity", - "score": 0.020657276995305163 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (open-book)", - "source_data": { - "dataset_name": "NaturalQuestions (open-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (open-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.595, - "details": { - "description": "min=0.576, mean=0.595, max=0.607, sum=1.785 (3)", - "tab": "Accuracy", - "NaturalQuestions (closed-book) - ECE (10-bin)": { - "description": "min=0.061, mean=0.068, max=0.073, sum=0.203 (3)", - "tab": "Calibration", - "score": 0.06770990173751885 - }, - "NaturalQuestions (open-book) - ECE (10-bin)": { - "description": "min=0.075, mean=0.085, max=0.099, sum=0.254 (3)", - "tab": "Calibration", - "score": 0.08482055822987211 - }, - "NaturalQuestions (closed-book) - F1 (Robustness)": { - "description": "min=0.233, mean=0.238, max=0.241, sum=0.713 (3)", - "tab": "Robustness", - "score": 0.23753663022529162 - }, - "NaturalQuestions (open-book) - F1 (Robustness)": { - "description": "min=0.411, mean=0.471, max=0.518, sum=1.414 (3)", - "tab": "Robustness", - "score": 0.4713418135089589 - }, - "NaturalQuestions (closed-book) - 
F1 (Fairness)": { - "description": "min=0.248, mean=0.255, max=0.259, sum=0.764 (3)", - "tab": "Fairness", - "score": 0.25466316487855734 - }, - "NaturalQuestions (open-book) - F1 (Fairness)": { - "description": "min=0.521, mean=0.535, max=0.546, sum=1.604 (3)", - "tab": "Fairness", - "score": 0.5348225692810691 - }, - "NaturalQuestions (closed-book) - Denoised inference time (s)": { - "description": "min=0.546, mean=0.565, max=0.586, sum=1.694 (3)", - "tab": "Efficiency", - "score": 0.5647122317708332 - }, - "NaturalQuestions (open-book) - Denoised inference time (s)": { - "description": "min=0.95, mean=1.085, max=1.249, sum=3.256 (3)", - "tab": "Efficiency", - "score": 1.0851867500000003 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=109.191, mean=111.191, max=115.191, sum=333.573 (3)", - "tab": "General information", - "score": 111.19099999999999 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=5.31, mean=5.844, max=6.407, sum=17.531 (3)", - "tab": "General information", - "score": 5.843666666666667 - }, - "NaturalQuestions (closed-book) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.538, mean=4.633, max=4.715, sum=13.899 (3)", - "tab": "General information", - "score": 4.633 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.039, mean=0.039, max=0.039, sum=0.117 (3)", - "tab": "General information", - "score": 0.039 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1261.72, mean=1481.344, max=1608.455, sum=4444.032 (3)", - "tab": "General information", - "score": 1481.344 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=7.154, mean=8.834, max=11.932, sum=26.502 (3)", - "tab": "General information", - "score": 8.834 - }, - "NaturalQuestions (open-book) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NaturalQuestions (closed-book) - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NaturalQuestions (closed-book) - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NaturalQuestions (closed-book) - Representation (race)": { - "description": "min=0.382, mean=0.43, max=0.498, sum=1.291 (3)", - "tab": "Bias", - "score": 0.4304995528213292 - }, - "NaturalQuestions (closed-book) - Representation (gender)": { - "description": "min=0.024, mean=0.094, max=0.18, sum=0.281 (3)", - "tab": "Bias", - "score": 0.09357753357753357 - }, - "NaturalQuestions (open-book) - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=2 (3)", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "NaturalQuestions (open-book) 
- Stereotypes (gender)": { - "description": "min=0.332, mean=0.388, max=0.488, sum=1.163 (3)", - "tab": "Bias", - "score": 0.38769841269841265 - }, - "NaturalQuestions (open-book) - Representation (race)": { - "description": "min=0.373, mean=0.409, max=0.446, sum=1.226 (3)", - "tab": "Bias", - "score": 0.40861462430089884 - }, - "NaturalQuestions (open-book) - Representation (gender)": { - "description": "min=0.026, mean=0.051, max=0.066, sum=0.153 (3)", - "tab": "Bias", - "score": 0.051062717190300304 - }, - "NaturalQuestions (closed-book) - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "Toxicity", - "score": 0.0 - }, - "NaturalQuestions (open-book) - Toxic fraction": { - "description": "min=0, mean=0.001, max=0.001, sum=0.002 (3)", - "tab": "Toxicity", - "score": 0.0006666666666666666 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "QuAC", - "source_data": { - "dataset_name": "QuAC", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on QuAC", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.361, - "details": { - "description": "min=0.355, mean=0.361, max=0.365, sum=1.082 (3)", - "tab": "Accuracy", - "QuAC - ECE (10-bin)": { - "description": "min=0.066, mean=0.067, max=0.07, sum=0.201 (3)", - "tab": "Calibration", - "score": 0.06703451532890617 - }, - "QuAC - F1 (Robustness)": { - "description": "min=0.214, mean=0.215, max=0.216, sum=0.646 (3)", - "tab": "Robustness", - "score": 0.2154779030326859 - }, - "QuAC - F1 (Fairness)": { - "description": "min=0.274, mean=0.281, max=0.287, sum=0.844 (3)", - "tab": "Fairness", - "score": 0.2814055112322921 - }, - "QuAC - Denoised inference time (s)": { - "description": "min=2.057, mean=2.089, max=2.151, sum=6.267 (3)", - "tab": "Efficiency", - "score": 2.0889632337239585 - }, - "QuAC - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "QuAC - # train": { - "description": "min=0.797, mean=0.881, max=0.969, sum=2.644 (3)", - "tab": "General information", - "score": 0.8813333333333334 - }, - "QuAC - truncated": { - "description": "min=0.02, mean=0.02, max=0.02, sum=0.06 (3)", - "tab": "General information", - "score": 0.02 - }, - "QuAC - # prompt tokens": { - "description": "min=1600.292, mean=1639.784, max=1661.675, sum=4919.353 (3)", - "tab": "General information", - "score": 1639.784333333333 - }, - "QuAC - # output tokens": { - "description": "min=31.783, mean=32.717, max=34.585, sum=98.152 (3)", - "tab": "General information", - "score": 32.717333333333336 - }, - "QuAC - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "QuAC - Stereotypes (race)": { - "description": "min=0.556, mean=0.582, max=0.6, sum=1.745 (3)", - "tab": "Bias", - "score": 0.5815402704291595 - }, - "QuAC - Stereotypes (gender)": { - "description": "min=0.43, mean=0.438, max=0.449, sum=1.315 (3)", - "tab": "Bias", - "score": 0.4381760996205441 - }, - "QuAC - Representation (race)": { - "description": "min=0.333, mean=0.344, max=0.355, sum=1.033 (3)", - "tab": "Bias", - "score": 0.3443830841027822 - }, - "QuAC - Representation (gender)": { - "description": "min=0.223, mean=0.23, max=0.237, sum=0.691 (3)", 
- "tab": "Bias", - "score": 0.23033600244512342 - }, - "QuAC - Toxic fraction": { - "description": "min=0.001, mean=0.002, max=0.003, sum=0.006 (3)", - "tab": "Toxicity", - "score": 0.002 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "HellaSwag", - "source_data": { - "dataset_name": "HellaSwag", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on HellaSwag", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.811, - "details": { - "description": "min=0.811, mean=0.811, max=0.811, sum=0.811 (1)", - "tab": "Accuracy", - "HellaSwag - ECE (10-bin)": { - "description": "min=0.341, mean=0.341, max=0.341, sum=0.341 (1)", - "tab": "Calibration", - "score": 0.34142560211110756 - }, - "HellaSwag - EM (Robustness)": { - "description": "min=0.759, mean=0.759, max=0.759, sum=0.759 (1)", - "tab": "Robustness", - "score": 0.759 - }, - "HellaSwag - EM (Fairness)": { - "description": "min=0.66, mean=0.66, max=0.66, sum=0.66 (1)", - "tab": "Fairness", - "score": 0.66 - }, - "HellaSwag - Denoised inference time (s)": { - "description": "min=0.359, mean=0.359, max=0.359, sum=0.359 (1)", - "tab": "Efficiency", - "score": 0.35889839843750027 - }, - "HellaSwag - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "HellaSwag - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag - # prompt tokens": { - "description": "min=88.855, mean=88.855, max=88.855, sum=88.855 (1)", - "tab": "General information", - "score": 88.855 - }, - "HellaSwag - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.55, - "details": { - "description": "min=0.55, mean=0.55, max=0.55, sum=0.55 (1)", - "tab": "Accuracy", - "OpenbookQA - ECE (10-bin)": { - "description": "min=0.235, mean=0.235, max=0.235, sum=0.235 (1)", - "tab": "Calibration", - "score": 0.23470136403728084 - }, - "OpenbookQA - EM (Robustness)": { - "description": "min=0.448, mean=0.448, max=0.448, sum=0.448 (1)", - "tab": "Robustness", - "score": 0.448 - }, - "OpenbookQA - EM (Fairness)": { - "description": "min=0.47, mean=0.47, max=0.47, sum=0.47 (1)", - "tab": "Fairness", - "score": 0.47 - }, - "OpenbookQA - Denoised inference time (s)": { - "description": "min=0.314, mean=0.314, max=0.314, sum=0.314 (1)", - "tab": "Efficiency", - "score": 0.3138882968749995 - }, - "OpenbookQA - # eval": 
{ - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=5.358, mean=5.358, max=5.358, sum=5.358 (1)", - "tab": "General information", - "score": 5.358 - }, - "OpenbookQA - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "TruthfulQA", - "source_data": { - "dataset_name": "TruthfulQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on TruthfulQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.198, - "details": { - "description": "min=0.177, mean=0.198, max=0.225, sum=0.593 (3)", - "tab": "Accuracy", - "TruthfulQA - ECE (10-bin)": { - "description": "min=0.075, mean=0.099, max=0.119, sum=0.298 (3)", - "tab": "Calibration", - "score": 0.0994665665272844 - }, - "TruthfulQA - EM (Robustness)": { - "description": "min=0.122, mean=0.151, max=0.182, sum=0.454 (3)", - "tab": "Robustness", - "score": 0.15137614678899083 - }, - "TruthfulQA - EM (Fairness)": { - "description": "min=0.138, mean=0.156, max=0.182, sum=0.469 (3)", - "tab": "Fairness", - "score": 0.1564729867482161 - }, - "TruthfulQA - Denoised inference time (s)": { - "description": "min=0.49, mean=0.501, max=0.506, sum=1.502 (3)", - "tab": "Efficiency", - "score": 0.50081436353211 - }, - "TruthfulQA - # eval": { - "description": "min=654, mean=654, max=654, sum=1962 (3)", - "tab": "General information", - "score": 654.0 - }, - "TruthfulQA - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "TruthfulQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "TruthfulQA - # prompt tokens": { - "description": "min=505.315, mean=514.648, max=532.315, sum=1543.945 (3)", - "tab": "General information", - "score": 514.6483180428135 - }, - "TruthfulQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=3 (3)", - "tab": "General information", - "score": 1.0 - }, - "TruthfulQA - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MS MARCO (TREC)", - "source_data": { - "dataset_name": "MS MARCO (TREC)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "NDCG@10 on MS MARCO (TREC)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.459, - "details": { - "description": "min=0.429, 
mean=0.459, max=0.479, sum=1.378 (3)", - "tab": "Accuracy", - "MS MARCO (regular) - RR@10 (Robustness)": { - "description": "min=0.191, mean=0.207, max=0.223, sum=0.622 (3)", - "tab": "Robustness", - "score": 0.20732857142857117 - }, - "MS MARCO (TREC) - NDCG@10 (Robustness)": { - "description": "min=0.371, mean=0.397, max=0.414, sum=1.19 (3)", - "tab": "Robustness", - "score": 0.39663320695609633 - }, - "MS MARCO (regular) - RR@10 (Fairness)": { - "description": "min=0.211, mean=0.233, max=0.251, sum=0.698 (3)", - "tab": "Fairness", - "score": 0.23262777777777743 - }, - "MS MARCO (TREC) - NDCG@10 (Fairness)": { - "description": "min=0.394, mean=0.431, max=0.457, sum=1.292 (3)", - "tab": "Fairness", - "score": 0.4307144032412258 - }, - "MS MARCO (regular) - Denoised inference time (s)": { - "description": "min=0.492, mean=0.499, max=0.504, sum=1.496 (3)", - "tab": "Efficiency", - "score": 0.4985355449218751 - }, - "MS MARCO (TREC) - Denoised inference time (s)": { - "description": "min=0.487, mean=0.501, max=0.511, sum=1.504 (3)", - "tab": "Efficiency", - "score": 0.501260492369186 - }, - "MS MARCO (regular) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "MS MARCO (regular) - # train": { - "description": "min=2, mean=2, max=2, sum=6 (3)", - "tab": "General information", - "score": 2.0 - }, - "MS MARCO (regular) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "MS MARCO (regular) - # prompt tokens": { - "description": "min=497.281, mean=536.614, max=583.281, sum=1609.843 (3)", - "tab": "General information", - "score": 536.6143333333333 - }, - "MS MARCO (regular) - # output tokens": { - "description": "min=1, mean=1, max=1, sum=3 (3)", - "tab": "General information", - "score": 1.0 - }, - "MS MARCO (regular) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "MS MARCO (TREC) - # eval": { - "description": "min=43, mean=43, max=43, sum=129 (3)", - "tab": "General information", - "score": 43.0 - }, - "MS MARCO (TREC) - # train": { - "description": "min=2, mean=2, max=2, sum=6 (3)", - "tab": "General information", - "score": 2.0 - }, - "MS MARCO (TREC) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "MS MARCO (TREC) - # prompt tokens": { - "description": "min=480.163, mean=519.496, max=566.163, sum=1558.488 (3)", - "tab": "General information", - "score": 519.4961240310078 - }, - "MS MARCO (TREC) - # output tokens": { - "description": "min=1, mean=1, max=1, sum=3 (3)", - "tab": "General information", - "score": 1.0 - }, - "MS MARCO (TREC) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "MS MARCO (regular) - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (race)": { - 
"description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - }, - "MS MARCO (TREC) - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CNN/DailyMail", - "source_data": { - "dataset_name": "CNN/DailyMail", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on CNN/DailyMail", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.144, - "details": { - "description": "min=0.14, mean=0.144, max=0.146, sum=0.861 (6)", - "tab": "Accuracy", - "CNN/DailyMail - Denoised inference time (s)": { - "description": "min=4.313, mean=4.337, max=4.381, sum=26.024 (6)", - "tab": "Efficiency", - "score": 4.3373758759723735 - }, - "CNN/DailyMail - # eval": { - "description": "min=466, mean=466, max=466, sum=2796 (6)", - "tab": "General information", - "score": 466.0 - }, - "CNN/DailyMail - # train": { - "description": "min=5, mean=5, max=5, sum=30 (6)", - "tab": "General information", - "score": 5.0 - }, - "CNN/DailyMail - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (6)", - "tab": "General information", - "score": 0.0 - }, - "CNN/DailyMail - # prompt tokens": { - "description": "min=1555.036, mean=1575.036, max=1602.036, sum=9450.219 (6)", - "tab": "General information", - "score": 1575.0364806866953 - }, - "CNN/DailyMail - # output tokens": { - "description": "min=88.871, mean=89.431, max=90.324, sum=536.588 (6)", - "tab": "General information", - "score": 89.43133047210301 - }, - "CNN/DailyMail - # trials": { - "description": "min=3, mean=3, max=3, sum=18 (6)", - "tab": "General information", - "score": 3.0 - }, - "CNN/DailyMail - Stereotypes (race)": { - "description": "min=0.616, mean=0.626, max=0.635, sum=3.753 (6)", - "tab": "Bias", - "score": 0.6255738197534654 - }, - "CNN/DailyMail - Stereotypes (gender)": { - "description": "min=0.377, mean=0.387, max=0.397, sum=2.32 (6)", - "tab": "Bias", - "score": 0.38662344919565644 - }, - "CNN/DailyMail - Representation (race)": { - "description": "min=0.244, mean=0.301, max=0.358, sum=1.808 (6)", - "tab": "Bias", - "score": 0.30129162776221596 - }, - "CNN/DailyMail - Representation (gender)": { - "description": "min=0.104, mean=0.117, max=0.128, sum=0.7 (6)", - "tab": "Bias", - "score": 0.116591581511673 - }, - "CNN/DailyMail - Toxic fraction": { - "description": "min=0, mean=0.002, max=0.004, sum=0.013 (6)", - "tab": "Toxicity", - "score": 0.002145922746781116 - }, - "CNN/DailyMail - SummaC": { - "description": "min=0.393, mean=0.469, max=0.516, sum=1.407 (3)", - "tab": "Summarization metrics", - "score": 0.46891720389173397 - }, - 
"CNN/DailyMail - QAFactEval": { - "description": "min=4.621, mean=4.683, max=4.752, sum=28.101 (6)", - "tab": "Summarization metrics", - "score": 4.683468662049275 - }, - "CNN/DailyMail - BERTScore (F1)": { - "description": "min=0.257, mean=0.264, max=0.275, sum=0.792 (3)", - "tab": "Summarization metrics", - "score": 0.2639259716833397 - }, - "CNN/DailyMail - Coverage": { - "description": "min=0.897, mean=0.945, max=0.971, sum=5.671 (6)", - "tab": "Summarization metrics", - "score": 0.945166441130516 - }, - "CNN/DailyMail - Density": { - "description": "min=43.963, mean=49.713, max=55.846, sum=298.279 (6)", - "tab": "Summarization metrics", - "score": 49.713109703758754 - }, - "CNN/DailyMail - Compression": { - "description": "min=8.816, mean=9.072, max=9.547, sum=54.43 (6)", - "tab": "Summarization metrics", - "score": 9.071669466217989 - }, - "CNN/DailyMail - HumanEval-faithfulness": { - "description": "min=0.978, mean=0.993, max=1, sum=5.956 (6)", - "tab": "Summarization metrics", - "score": 0.9925925925925926 - }, - "CNN/DailyMail - HumanEval-relevance": { - "description": "min=4.422, mean=4.539, max=4.667, sum=27.237 (6)", - "tab": "Summarization metrics", - "score": 4.5394335511982575 - }, - "CNN/DailyMail - HumanEval-coherence": { - "description": "min=3.556, mean=3.69, max=3.81, sum=22.142 (6)", - "tab": "Summarization metrics", - "score": 3.6903205726735138 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "XSUM", - "source_data": { - "dataset_name": "XSUM", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on XSUM", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.129, - "details": { - "description": "min=0.125, mean=0.129, max=0.134, sum=0.775 (6)", - "tab": "Accuracy", - "XSUM - Denoised inference time (s)": { - "description": "min=1.735, mean=1.741, max=1.747, sum=10.443 (6)", - "tab": "Efficiency", - "score": 1.7405486446267702 - }, - "XSUM - # eval": { - "description": "min=518, mean=518, max=518, sum=3108 (6)", - "tab": "General information", - "score": 518.0 - }, - "XSUM - # train": { - "description": "min=4.996, mean=4.998, max=5, sum=29.988 (6)", - "tab": "General information", - "score": 4.998069498069498 - }, - "XSUM - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (6)", - "tab": "General information", - "score": 0.0 - }, - "XSUM - # prompt tokens": { - "description": "min=1484.608, mean=1537.452, max=1572.616, sum=9224.71 (6)", - "tab": "General information", - "score": 1537.4517374517375 - }, - "XSUM - # output tokens": { - "description": "min=24.515, mean=24.802, max=25.066, sum=148.815 (6)", - "tab": "General information", - "score": 24.802445302445303 - }, - "XSUM - # trials": { - "description": "min=3, mean=3, max=3, sum=18 (6)", - "tab": "General information", - "score": 3.0 - }, - "XSUM - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=4 (6)", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "XSUM - Stereotypes (gender)": { - "description": "min=0.456, mean=0.463, max=0.468, sum=2.78 (6)", - "tab": "Bias", - "score": 0.4633319142897687 - }, - "XSUM - Representation (race)": { - "description": "min=0.532, mean=0.622, max=0.667, sum=3.73 (6)", - "tab": "Bias", - "score": 0.6216216216216217 - }, - "XSUM - Representation 
(gender)": { - "description": "min=0.184, mean=0.205, max=0.224, sum=1.231 (6)", - "tab": "Bias", - "score": 0.2051781150126976 - }, - "XSUM - Toxic fraction": { - "description": "min=0, mean=0.001, max=0.002, sum=0.004 (6)", - "tab": "Toxicity", - "score": 0.0006435006435006435 - }, - "XSUM - SummaC": { - "description": "min=-0.265, mean=-0.253, max=-0.236, sum=-0.758 (3)", - "tab": "Summarization metrics", - "score": -0.252571659198599 - }, - "XSUM - QAFactEval": { - "description": "min=2.761, mean=2.981, max=3.213, sum=17.888 (6)", - "tab": "Summarization metrics", - "score": 2.981288283366219 - }, - "XSUM - BERTScore (F1)": { - "description": "min=0.431, mean=0.434, max=0.438, sum=1.301 (3)", - "tab": "Summarization metrics", - "score": 0.4335328367301425 - }, - "XSUM - Coverage": { - "description": "min=0.794, mean=0.8, max=0.803, sum=4.797 (6)", - "tab": "Summarization metrics", - "score": 0.7995514803953769 - }, - "XSUM - Density": { - "description": "min=2.71, mean=2.945, max=3.142, sum=17.67 (6)", - "tab": "Summarization metrics", - "score": 2.945005615644467 - }, - "XSUM - Compression": { - "description": "min=18.323, mean=18.422, max=18.574, sum=110.533 (6)", - "tab": "Summarization metrics", - "score": 18.422086618359014 - }, - "XSUM - HumanEval-faithfulness": { - "description": "min=0.638, mean=0.661, max=0.697, sum=3.968 (6)", - "tab": "Summarization metrics", - "score": 0.6612578878025103 - }, - "XSUM - HumanEval-relevance": { - "description": "min=4.212, mean=4.239, max=4.275, sum=25.431 (6)", - "tab": "Summarization metrics", - "score": 4.238517902133463 - }, - "XSUM - HumanEval-coherence": { - "description": "min=4.773, mean=4.825, max=4.877, sum=28.952 (6)", - "tab": "Summarization metrics", - "score": 4.825335737235052 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "IMDB", - "source_data": { - "dataset_name": "IMDB", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on IMDB", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.956, - "details": { - "description": "min=0.941, mean=0.956, max=0.965, sum=2.867 (3)", - "tab": "Accuracy", - "IMDB - ECE (10-bin)": { - "description": "min=0.05, mean=0.069, max=0.081, sum=0.206 (3)", - "tab": "Calibration", - "score": 0.06875792133691605 - }, - "IMDB - EM (Robustness)": { - "description": "min=0.907, mean=0.923, max=0.933, sum=2.768 (3)", - "tab": "Robustness", - "score": 0.9226666666666667 - }, - "IMDB - EM (Fairness)": { - "description": "min=0.93, mean=0.949, max=0.96, sum=2.846 (3)", - "tab": "Fairness", - "score": 0.9486666666666667 - }, - "IMDB - Denoised inference time (s)": { - "description": "min=0.709, mean=0.796, max=0.865, sum=2.389 (3)", - "tab": "Efficiency", - "score": 0.7963252441406254 - }, - "IMDB - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "IMDB - # train": { - "description": "min=2.903, mean=4.229, max=4.983, sum=12.688 (3)", - "tab": "General information", - "score": 4.229333333333333 - }, - "IMDB - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "IMDB - # prompt tokens": { - "description": "min=1283.038, mean=1562.808, max=1784.2, sum=4688.425 (3)", - "tab": 
"General information", - "score": 1562.8083333333334 - }, - "IMDB - # output tokens": { - "description": "min=1, mean=1, max=1, sum=3 (3)", - "tab": "General information", - "score": 1.0 - }, - "IMDB - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "IMDB - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CivilComments", - "source_data": { - "dataset_name": "CivilComments", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on CivilComments", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.532, - "details": { - "description": "min=0.001, mean=0.532, max=1, sum=28.726 (54)", - "tab": "Accuracy", - "CivilComments - ECE (10-bin)": { - "description": "min=0.051, mean=0.327, max=0.708, sum=17.639 (54)", - "tab": "Calibration", - "score": 0.32664532725883244 - }, - "CivilComments - EM (Robustness)": { - "description": "min=0, mean=0.32, max=0.817, sum=17.265 (54)", - "tab": "Robustness", - "score": 0.31971446667223646 - }, - "CivilComments - EM (Fairness)": { - "description": "min=0.001, mean=0.479, max=1, sum=25.855 (54)", - "tab": "Fairness", - "score": 0.4787922217178853 - }, - "CivilComments - Denoised inference time (s)": { - "description": "min=0.464, mean=0.546, max=0.711, sum=29.484 (54)", - "tab": "Efficiency", - "score": 0.5459943267746123 - }, - "CivilComments - # eval": { - "description": "min=74, mean=371.556, max=683, sum=20064 (54)", - "tab": "General information", - "score": 371.55555555555554 - }, - "CivilComments - # train": { - "description": "min=5, mean=5, max=5, sum=270 (54)", - "tab": "General information", - "score": 5.0 - }, - "CivilComments - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (54)", - "tab": "General information", - "score": 0.0 - }, - "CivilComments - # prompt tokens": { - "description": "min=362.293, mean=732.514, max=1288.441, sum=39555.782 (54)", - "tab": "General information", - "score": 732.5144825548033 - }, - "CivilComments - # output tokens": { - "description": "min=1, mean=1, max=1, sum=54 (54)", - "tab": "General information", - "score": 1.0 - }, - "CivilComments - # trials": { - "description": "min=3, mean=3, max=3, sum=162 (54)", - "tab": "General information", - "score": 3.0 - }, - "CivilComments - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation 
(gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (54)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "RAFT", - "source_data": { - "dataset_name": "RAFT", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on RAFT", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.633, - "details": { - "description": "min=0.1, mean=0.633, max=0.95, sum=20.875 (33)", - "tab": "Accuracy", - "RAFT - ECE (10-bin)": { - "description": "min=0.093, mean=0.274, max=0.825, sum=9.044 (33)", - "tab": "Calibration", - "score": 0.274053604040966 - }, - "RAFT - EM (Robustness)": { - "description": "min=0, mean=0.563, max=0.925, sum=18.575 (33)", - "tab": "Robustness", - "score": 0.5628787878787879 - }, - "RAFT - EM (Fairness)": { - "description": "min=0.05, mean=0.598, max=0.95, sum=19.75 (33)", - "tab": "Fairness", - "score": 0.5984848484848486 - }, - "RAFT - Denoised inference time (s)": { - "description": "min=0.458, mean=0.667, max=0.987, sum=22.019 (33)", - "tab": "Efficiency", - "score": 0.6672338778409089 - }, - "RAFT - # eval": { - "description": "min=40, mean=40, max=40, sum=1320 (33)", - "tab": "General information", - "score": 40.0 - }, - "RAFT - # train": { - "description": "min=0, mean=4.557, max=5, sum=150.375 (33)", - "tab": "General information", - "score": 4.556818181818182 - }, - "RAFT - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (33)", - "tab": "General information", - "score": 0.0 - }, - "RAFT - # prompt tokens": { - "description": "min=270.325, mean=814.446, max=1777.025, sum=26876.725 (33)", - "tab": "General information", - "score": 814.446212121212 - }, - "RAFT - # output tokens": { - "description": "min=0.275, mean=3.051, max=5.95, sum=100.675 (33)", - "tab": "General information", - "score": 3.0507575757575767 - }, - "RAFT - # trials": { - "description": "min=3, mean=3, max=3, sum=99 (33)", - "tab": "General information", - "score": 3.0 - }, - "RAFT - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (33)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_classic/cohere/Cohere-xlarge-v20221108-52.4B/6bbe052f-46f7-4541-80a3-dbb86433db7a.json b/data/helm_classic/cohere/Cohere-xlarge-v20221108-52.4B/6bbe052f-46f7-4541-80a3-dbb86433db7a.json deleted file mode 100644 index cc49de0c7..000000000 --- a/data/helm_classic/cohere/Cohere-xlarge-v20221108-52.4B/6bbe052f-46f7-4541-80a3-dbb86433db7a.json +++ /dev/null @@ -1,1613 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_classic/cohere_Cohere-xlarge-v20221108-52.4B/1770834891.1472661", - "retrieved_timestamp": "1770834891.1472661", - "source_metadata": { - "source_name": 
"helm_classic", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Cohere xlarge v20221108 52.4B", - "id": "cohere/Cohere-xlarge-v20221108-52.4B", - "developer": "cohere", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_classic", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperform on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.664, - "details": { - "tab": "Accuracy", - "Mean win rate - Calibration": { - "description": null, - "tab": "Calibration", - "score": 0.5846823928461301 - }, - "Mean win rate - Robustness": { - "description": null, - "tab": "Robustness", - "score": 0.5964421748070247 - }, - "Mean win rate - Fairness": { - "description": null, - "tab": "Fairness", - "score": 0.6082341462764155 - }, - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": null - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - }, - "Mean win rate - Bias": { - "description": null, - "tab": "Bias", - "score": 0.601504827172334 - }, - "Mean win rate - Toxicity": { - "description": null, - "tab": "Toxicity", - "score": 0.5642015392015391 - }, - "Mean win rate - Summarization metrics": { - "description": null, - "tab": "Summarization metrics", - "score": 0.7039473684210527 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.382, - "details": { - "description": "min=0.21, mean=0.382, max=0.67, sum=5.731 (15)", - "tab": "Accuracy", - "MMLU - ECE (10-bin)": { - "description": "min=0.104, mean=0.143, max=0.197, sum=2.146 (15)", - "tab": "Calibration", - "score": 0.14305203655556303 - }, - "MMLU - EM (Robustness)": { - "description": "min=0.12, mean=0.299, max=0.6, sum=4.49 (15)", - "tab": "Robustness", - "score": 0.29933333333333334 - }, - "MMLU - EM (Fairness)": { - "description": "min=0.13, mean=0.317, max=0.57, sum=4.748 (15)", - "tab": "Fairness", - "score": 0.31652631578947366 - }, - "MMLU - Denoised inference time (s)": { - "description": "5 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=1542 (15)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=75 (15)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (15)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=372.75, mean=481.26, max=628.421, sum=7218.903 (15)", - "tab": "General information", - 
"score": 481.2602105263158 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=15 (15)", - "tab": "General information", - "score": 1.0 - }, - "MMLU - # trials": { - "description": "min=3, mean=3, max=3, sum=45 (15)", - "tab": "General information", - "score": 3.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "BoolQ", - "source_data": { - "dataset_name": "BoolQ", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on BoolQ", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.762, - "details": { - "description": "min=0.761, mean=0.762, max=0.763, sum=2.285 (3)", - "tab": "Accuracy", - "BoolQ - ECE (10-bin)": { - "description": "min=0.037, mean=0.051, max=0.062, sum=0.154 (3)", - "tab": "Calibration", - "score": 0.05127903463780418 - }, - "BoolQ - EM (Robustness)": { - "description": "min=0.712, mean=0.718, max=0.722, sum=2.153 (3)", - "tab": "Robustness", - "score": 0.7176666666666667 - }, - "BoolQ - EM (Fairness)": { - "description": "min=0.702, mean=0.708, max=0.72, sum=2.124 (3)", - "tab": "Fairness", - "score": 0.7079999999999999 - }, - "BoolQ - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "BoolQ - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "BoolQ - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "BoolQ - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "BoolQ - # prompt tokens": { - "description": "min=669.307, mean=925.307, max=1269.307, sum=2775.921 (3)", - "tab": "General information", - "score": 925.3070000000001 - }, - "BoolQ - # output tokens": { - "description": "min=1, mean=1, max=1, sum=3 (3)", - "tab": "General information", - "score": 1.0 - }, - "BoolQ - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "BoolQ - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.672, - "details": { - "description": "min=0.607, mean=0.672, max=0.708, sum=2.017 (3)", - "tab": "Accuracy", - "NarrativeQA - ECE (10-bin)": { - 
"description": "min=0.042, mean=0.059, max=0.072, sum=0.178 (3)", - "tab": "Calibration", - "score": 0.059183266964369506 - }, - "NarrativeQA - F1 (Robustness)": { - "description": "min=0.313, mean=0.39, max=0.434, sum=1.171 (3)", - "tab": "Robustness", - "score": 0.3901906178600691 - }, - "NarrativeQA - F1 (Fairness)": { - "description": "min=0.486, mean=0.553, max=0.589, sum=1.659 (3)", - "tab": "Fairness", - "score": 0.5530542667501213 - }, - "NarrativeQA - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=1065 (3)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=0.958, mean=1.562, max=1.997, sum=4.687 (3)", - "tab": "General information", - "score": 1.5624413145539906 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=1601.997, mean=1634.99, max=1693.155, sum=4904.969 (3)", - "tab": "General information", - "score": 1634.9896713615024 - }, - "NarrativeQA - # output tokens": { - "description": "min=5.792, mean=6.729, max=8.434, sum=20.186 (3)", - "tab": "General information", - "score": 6.728638497652582 - }, - "NarrativeQA - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NarrativeQA - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NarrativeQA - Stereotypes (gender)": { - "description": "min=0.417, mean=0.472, max=0.5, sum=1.417 (3)", - "tab": "Bias", - "score": 0.47222222222222227 - }, - "NarrativeQA - Representation (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=2 (3)", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "NarrativeQA - Representation (gender)": { - "description": "min=0.184, mean=0.192, max=0.197, sum=0.575 (3)", - "tab": "Bias", - "score": 0.19158509798903886 - }, - "NarrativeQA - Toxic fraction": { - "description": "min=0.008, mean=0.013, max=0.02, sum=0.039 (3)", - "tab": "Toxicity", - "score": 0.013145539906103287 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (open-book)", - "source_data": { - "dataset_name": "NaturalQuestions (open-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (open-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.628, - "details": { - "description": "min=0.619, mean=0.628, max=0.634, sum=1.885 (3)", - "tab": "Accuracy", - "NaturalQuestions (closed-book) - ECE (10-bin)": { - "description": "min=0.044, mean=0.054, max=0.064, sum=0.163 (3)", - "tab": "Calibration", - "score": 0.05430103491623906 - }, - "NaturalQuestions (open-book) - ECE (10-bin)": { - "description": "min=0.064, mean=0.073, max=0.08, sum=0.219 (3)", - "tab": "Calibration", - "score": 0.07296237131206641 - }, - "NaturalQuestions (closed-book) - F1 (Robustness)": { - "description": "min=0.276, mean=0.283, max=0.288, sum=0.85 (3)", - "tab": "Robustness", - "score": 0.28349840532468856 - }, - "NaturalQuestions (open-book) - F1 
(Robustness)": { - "description": "min=0.49, mean=0.533, max=0.555, sum=1.598 (3)", - "tab": "Robustness", - "score": 0.532530651706331 - }, - "NaturalQuestions (closed-book) - F1 (Fairness)": { - "description": "min=0.295, mean=0.299, max=0.303, sum=0.898 (3)", - "tab": "Fairness", - "score": 0.299210546403295 - }, - "NaturalQuestions (open-book) - F1 (Fairness)": { - "description": "min=0.548, mean=0.566, max=0.58, sum=1.699 (3)", - "tab": "Fairness", - "score": 0.5664508489119625 - }, - "NaturalQuestions (closed-book) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NaturalQuestions (open-book) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=109.191, mean=111.191, max=115.191, sum=333.573 (3)", - "tab": "General information", - "score": 111.19099999999999 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=4.528, mean=4.808, max=5.211, sum=14.424 (3)", - "tab": "General information", - "score": 4.808 - }, - "NaturalQuestions (closed-book) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.538, mean=4.633, max=4.715, sum=13.899 (3)", - "tab": "General information", - "score": 4.633 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.039, mean=0.039, max=0.039, sum=0.117 (3)", - "tab": "General information", - "score": 0.039 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1261.72, mean=1481.344, max=1608.455, sum=4444.032 (3)", - "tab": "General information", - "score": 1481.344 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=5.836, mean=6.093, max=6.582, sum=18.278 (3)", - "tab": "General information", - "score": 6.092666666666666 - }, - "NaturalQuestions (open-book) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NaturalQuestions (closed-book) - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NaturalQuestions (closed-book) - Stereotypes (gender)": { - "description": "min=0.333, mean=0.444, max=0.5, sum=1.333 (3)", - "tab": "Bias", - "score": 0.4444444444444444 - }, - "NaturalQuestions (closed-book) - Representation (race)": { - "description": "min=0.407, mean=0.48, max=0.556, sum=1.441 (3)", - "tab": "Bias", - "score": 0.4804079441760602 - }, - "NaturalQuestions (closed-book) - Representation (gender)": { - "description": "min=0.147, mean=0.247, max=0.385, sum=0.741 (3)", - "tab": "Bias", - "score": 0.24693627450980396 - }, - 
"NaturalQuestions (open-book) - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=1.333 (2)", - "tab": "Bias", - "score": 0.6666666666666667 - }, - "NaturalQuestions (open-book) - Stereotypes (gender)": { - "description": "min=0.186, mean=0.232, max=0.278, sum=0.697 (3)", - "tab": "Bias", - "score": 0.2324074074074074 - }, - "NaturalQuestions (open-book) - Representation (race)": { - "description": "min=0.467, mean=0.474, max=0.483, sum=1.423 (3)", - "tab": "Bias", - "score": 0.4744480248239647 - }, - "NaturalQuestions (open-book) - Representation (gender)": { - "description": "min=0.092, mean=0.113, max=0.135, sum=0.339 (3)", - "tab": "Bias", - "score": 0.11298873219533077 - }, - "NaturalQuestions (closed-book) - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "Toxicity", - "score": 0.0 - }, - "NaturalQuestions (open-book) - Toxic fraction": { - "description": "min=0.001, mean=0.001, max=0.001, sum=0.003 (3)", - "tab": "Toxicity", - "score": 0.001 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "QuAC", - "source_data": { - "dataset_name": "QuAC", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on QuAC", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.374, - "details": { - "description": "min=0.367, mean=0.374, max=0.378, sum=1.122 (3)", - "tab": "Accuracy", - "QuAC - ECE (10-bin)": { - "description": "min=0.053, mean=0.063, max=0.072, sum=0.189 (3)", - "tab": "Calibration", - "score": 0.06295082132498765 - }, - "QuAC - F1 (Robustness)": { - "description": "min=0.221, mean=0.229, max=0.234, sum=0.686 (3)", - "tab": "Robustness", - "score": 0.22865454547247813 - }, - "QuAC - F1 (Fairness)": { - "description": "min=0.269, mean=0.275, max=0.278, sum=0.824 (3)", - "tab": "Fairness", - "score": 0.27469570002834404 - }, - "QuAC - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "QuAC - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "QuAC - # train": { - "description": "min=0.797, mean=0.881, max=0.969, sum=2.644 (3)", - "tab": "General information", - "score": 0.8813333333333334 - }, - "QuAC - truncated": { - "description": "min=0.02, mean=0.02, max=0.02, sum=0.06 (3)", - "tab": "General information", - "score": 0.02 - }, - "QuAC - # prompt tokens": { - "description": "min=1600.292, mean=1639.784, max=1661.675, sum=4919.353 (3)", - "tab": "General information", - "score": 1639.784333333333 - }, - "QuAC - # output tokens": { - "description": "min=24.612, mean=27.944, max=31.344, sum=83.832 (3)", - "tab": "General information", - "score": 27.944 - }, - "QuAC - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "QuAC - Stereotypes (race)": { - "description": "min=0.543, mean=0.571, max=0.589, sum=1.713 (3)", - "tab": "Bias", - "score": 0.570980870980871 - }, - "QuAC - Stereotypes (gender)": { - "description": "min=0.371, mean=0.395, max=0.426, sum=1.185 (3)", - "tab": "Bias", - "score": 0.3948930748680999 - }, - "QuAC - Representation (race)": { - "description": "min=0.253, mean=0.304, max=0.331, sum=0.912 
(3)", - "tab": "Bias", - "score": 0.3038684617631986 - }, - "QuAC - Representation (gender)": { - "description": "min=0.211, mean=0.233, max=0.263, sum=0.699 (3)", - "tab": "Bias", - "score": 0.2330910766304025 - }, - "QuAC - Toxic fraction": { - "description": "min=0.001, mean=0.002, max=0.003, sum=0.007 (3)", - "tab": "Toxicity", - "score": 0.0023333333333333335 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "HellaSwag", - "source_data": { - "dataset_name": "HellaSwag", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on HellaSwag", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.81, - "details": { - "description": "min=0.81, mean=0.81, max=0.81, sum=0.81 (1)", - "tab": "Accuracy", - "HellaSwag - ECE (10-bin)": { - "description": "min=0.333, mean=0.333, max=0.333, sum=0.333 (1)", - "tab": "Calibration", - "score": 0.3332417863062664 - }, - "HellaSwag - EM (Robustness)": { - "description": "min=0.764, mean=0.764, max=0.764, sum=0.764 (1)", - "tab": "Robustness", - "score": 0.764 - }, - "HellaSwag - EM (Fairness)": { - "description": "min=0.687, mean=0.687, max=0.687, sum=0.687 (1)", - "tab": "Fairness", - "score": 0.687 - }, - "HellaSwag - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "HellaSwag - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "HellaSwag - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag - # prompt tokens": { - "description": "min=88.855, mean=88.855, max=88.855, sum=88.855 (1)", - "tab": "General information", - "score": 88.855 - }, - "HellaSwag - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.588, - "details": { - "description": "min=0.588, mean=0.588, max=0.588, sum=0.588 (1)", - "tab": "Accuracy", - "OpenbookQA - ECE (10-bin)": { - "description": "min=0.207, mean=0.207, max=0.207, sum=0.207 (1)", - "tab": "Calibration", - "score": 0.20665896753536225 - }, - "OpenbookQA - EM (Robustness)": { - "description": "min=0.482, mean=0.482, max=0.482, sum=0.482 (1)", - "tab": "Robustness", - "score": 0.482 - }, - "OpenbookQA - EM (Fairness)": { - "description": "min=0.5, mean=0.5, max=0.5, sum=0.5 (1)", - "tab": "Fairness", - "score": 0.5 - }, - "OpenbookQA - Denoised inference time 
(s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=5.358, mean=5.358, max=5.358, sum=5.358 (1)", - "tab": "General information", - "score": 5.358 - }, - "OpenbookQA - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "TruthfulQA", - "source_data": { - "dataset_name": "TruthfulQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on TruthfulQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.169, - "details": { - "description": "min=0.164, mean=0.169, max=0.179, sum=0.508 (3)", - "tab": "Accuracy", - "TruthfulQA - ECE (10-bin)": { - "description": "min=0.18, mean=0.211, max=0.233, sum=0.633 (3)", - "tab": "Calibration", - "score": 0.21105124875435366 - }, - "TruthfulQA - EM (Robustness)": { - "description": "min=0.106, mean=0.116, max=0.13, sum=0.349 (3)", - "tab": "Robustness", - "score": 0.1162079510703364 - }, - "TruthfulQA - EM (Fairness)": { - "description": "min=0.112, mean=0.12, max=0.124, sum=0.359 (3)", - "tab": "Fairness", - "score": 0.1197757390417941 - }, - "TruthfulQA - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "TruthfulQA - # eval": { - "description": "min=654, mean=654, max=654, sum=1962 (3)", - "tab": "General information", - "score": 654.0 - }, - "TruthfulQA - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "TruthfulQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "TruthfulQA - # prompt tokens": { - "description": "min=505.315, mean=514.648, max=532.315, sum=1543.945 (3)", - "tab": "General information", - "score": 514.6483180428135 - }, - "TruthfulQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=3 (3)", - "tab": "General information", - "score": 1.0 - }, - "TruthfulQA - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MS MARCO (TREC)", - "source_data": { - "dataset_name": "MS MARCO (TREC)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "NDCG@10 on MS MARCO (TREC)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, 
- "max_score": 1.0 - }, - "score_details": { - "score": 0.55, - "details": { - "description": "min=0.526, mean=0.55, max=0.573, sum=1.65 (3)", - "tab": "Accuracy", - "MS MARCO (regular) - RR@10 (Robustness)": { - "description": "min=0.201, mean=0.242, max=0.292, sum=0.725 (3)", - "tab": "Robustness", - "score": 0.24177817460317433 - }, - "MS MARCO (TREC) - NDCG@10 (Robustness)": { - "description": "min=0.449, mean=0.482, max=0.527, sum=1.446 (3)", - "tab": "Robustness", - "score": 0.48206153384583117 - }, - "MS MARCO (regular) - RR@10 (Fairness)": { - "description": "min=0.239, mean=0.267, max=0.302, sum=0.802 (3)", - "tab": "Fairness", - "score": 0.2673071428571425 - }, - "MS MARCO (TREC) - NDCG@10 (Fairness)": { - "description": "min=0.51, mean=0.522, max=0.544, sum=1.565 (3)", - "tab": "Fairness", - "score": 0.5216640091882355 - }, - "MS MARCO (regular) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "MS MARCO (TREC) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "MS MARCO (regular) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "MS MARCO (regular) - # train": { - "description": "min=2, mean=2, max=2, sum=6 (3)", - "tab": "General information", - "score": 2.0 - }, - "MS MARCO (regular) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "MS MARCO (regular) - # prompt tokens": { - "description": "min=497.281, mean=536.614, max=583.281, sum=1609.843 (3)", - "tab": "General information", - "score": 536.6143333333333 - }, - "MS MARCO (regular) - # output tokens": { - "description": "min=1, mean=1.002, max=1.005, sum=3.005 (3)", - "tab": "General information", - "score": 1.0016666666666667 - }, - "MS MARCO (regular) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "MS MARCO (TREC) - # eval": { - "description": "min=43, mean=43, max=43, sum=129 (3)", - "tab": "General information", - "score": 43.0 - }, - "MS MARCO (TREC) - # train": { - "description": "min=2, mean=2, max=2, sum=6 (3)", - "tab": "General information", - "score": 2.0 - }, - "MS MARCO (TREC) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "MS MARCO (TREC) - # prompt tokens": { - "description": "min=480.163, mean=519.496, max=566.163, sum=1558.488 (3)", - "tab": "General information", - "score": 519.4961240310078 - }, - "MS MARCO (TREC) - # output tokens": { - "description": "min=1, mean=1, max=1, sum=3 (3)", - "tab": "General information", - "score": 1.0 - }, - "MS MARCO (TREC) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "MS MARCO (regular) - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": 
"Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - }, - "MS MARCO (TREC) - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CNN/DailyMail", - "source_data": { - "dataset_name": "CNN/DailyMail", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on CNN/DailyMail", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.153, - "details": { - "description": "min=0.153, mean=0.153, max=0.154, sum=0.92 (6)", - "tab": "Accuracy", - "CNN/DailyMail - Denoised inference time (s)": { - "description": "2 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "CNN/DailyMail - # eval": { - "description": "min=466, mean=466, max=466, sum=2796 (6)", - "tab": "General information", - "score": 466.0 - }, - "CNN/DailyMail - # train": { - "description": "min=5, mean=5, max=5, sum=30 (6)", - "tab": "General information", - "score": 5.0 - }, - "CNN/DailyMail - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (6)", - "tab": "General information", - "score": 0.0 - }, - "CNN/DailyMail - # prompt tokens": { - "description": "min=1555.036, mean=1575.036, max=1602.036, sum=9450.219 (6)", - "tab": "General information", - "score": 1575.0364806866953 - }, - "CNN/DailyMail - # output tokens": { - "description": "min=89.47, mean=91.338, max=92.403, sum=548.03 (6)", - "tab": "General information", - "score": 91.33834048640915 - }, - "CNN/DailyMail - # trials": { - "description": "min=3, mean=3, max=3, sum=18 (6)", - "tab": "General information", - "score": 3.0 - }, - "CNN/DailyMail - Stereotypes (race)": { - "description": "min=0.579, mean=0.607, max=0.649, sum=3.642 (6)", - "tab": "Bias", - "score": 0.606957921303154 - }, - "CNN/DailyMail - Stereotypes (gender)": { - "description": "min=0.362, mean=0.383, max=0.409, sum=2.3 (6)", - "tab": "Bias", - "score": 0.3833873353199473 - }, - "CNN/DailyMail - Representation (race)": { - "description": "min=0.223, mean=0.266, max=0.328, sum=1.597 (6)", - "tab": "Bias", - "score": 0.26620678930063096 - }, - "CNN/DailyMail - Representation (gender)": { - "description": "min=0.128, mean=0.133, max=0.14, sum=0.796 (6)", - "tab": "Bias", - "score": 0.1326032519141558 - }, - "CNN/DailyMail - Toxic fraction": { - "description": "min=0, mean=0.001, max=0.002, sum=0.009 (6)", - "tab": "Toxicity", - "score": 0.001430615164520744 - }, - "CNN/DailyMail - SummaC": { - "description": "min=0.469, mean=0.514, max=0.552, sum=1.542 (3)", - "tab": "Summarization 
metrics", - "score": 0.5141110990456594 - }, - "CNN/DailyMail - QAFactEval": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - BERTScore (F1)": { - "description": "min=0.281, mean=0.286, max=0.295, sum=0.858 (3)", - "tab": "Summarization metrics", - "score": 0.2858638938260981 - }, - "CNN/DailyMail - Coverage": { - "description": "min=0.946, mean=0.971, max=0.984, sum=5.823 (6)", - "tab": "Summarization metrics", - "score": 0.9705641483765838 - }, - "CNN/DailyMail - Density": { - "description": "min=41.158, mean=44.772, max=50.734, sum=268.631 (6)", - "tab": "Summarization metrics", - "score": 44.771778103334206 - }, - "CNN/DailyMail - Compression": { - "description": "min=7.733, mean=8.026, max=8.278, sum=48.156 (6)", - "tab": "Summarization metrics", - "score": 8.02592370223569 - }, - "CNN/DailyMail - HumanEval-faithfulness": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-relevance": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-coherence": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "XSUM", - "source_data": { - "dataset_name": "XSUM", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on XSUM", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.153, - "details": { - "description": "min=0.148, mean=0.153, max=0.158, sum=0.919 (6)", - "tab": "Accuracy", - "XSUM - Denoised inference time (s)": { - "description": "2 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "XSUM - # eval": { - "description": "min=518, mean=518, max=518, sum=3108 (6)", - "tab": "General information", - "score": 518.0 - }, - "XSUM - # train": { - "description": "min=4.996, mean=4.998, max=5, sum=29.988 (6)", - "tab": "General information", - "score": 4.998069498069498 - }, - "XSUM - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (6)", - "tab": "General information", - "score": 0.0 - }, - "XSUM - # prompt tokens": { - "description": "min=1484.608, mean=1537.452, max=1572.616, sum=9224.71 (6)", - "tab": "General information", - "score": 1537.4517374517375 - }, - "XSUM - # output tokens": { - "description": "min=25.925, mean=26.153, max=26.423, sum=156.919 (6)", - "tab": "General information", - "score": 26.153153153153156 - }, - "XSUM - # trials": { - "description": "min=3, mean=3, max=3, sum=18 (6)", - "tab": "General information", - "score": 3.0 - }, - "XSUM - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=4 (6)", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "XSUM - Stereotypes (gender)": { - "description": "min=0.447, mean=0.454, max=0.463, sum=2.724 (6)", - "tab": "Bias", - "score": 0.45401696819707577 - }, - "XSUM - Representation (race)": { - "description": "min=0.515, mean=0.537, max=0.565, sum=3.223 (6)", - "tab": "Bias", - "score": 0.5371029656743943 - }, - "XSUM - Representation (gender)": { - "description": "min=0.204, 
mean=0.218, max=0.236, sum=1.306 (6)", - "tab": "Bias", - "score": 0.2176913745770286 - }, - "XSUM - Toxic fraction": { - "description": "min=0, mean=0.001, max=0.002, sum=0.004 (6)", - "tab": "Toxicity", - "score": 0.0006435006435006435 - }, - "XSUM - SummaC": { - "description": "min=-0.28, mean=-0.258, max=-0.245, sum=-0.774 (3)", - "tab": "Summarization metrics", - "score": -0.25799066096812756 - }, - "XSUM - QAFactEval": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - BERTScore (F1)": { - "description": "min=0.447, mean=0.451, max=0.454, sum=1.354 (3)", - "tab": "Summarization metrics", - "score": 0.45133514557325344 - }, - "XSUM - Coverage": { - "description": "min=0.79, mean=0.798, max=0.803, sum=4.787 (6)", - "tab": "Summarization metrics", - "score": 0.7978456468638059 - }, - "XSUM - Density": { - "description": "min=2.823, mean=3.009, max=3.208, sum=18.053 (6)", - "tab": "Summarization metrics", - "score": 3.008801536227543 - }, - "XSUM - Compression": { - "description": "min=17.074, mean=17.188, max=17.359, sum=103.128 (6)", - "tab": "Summarization metrics", - "score": 17.187984260626735 - }, - "XSUM - HumanEval-faithfulness": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-relevance": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-coherence": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "IMDB", - "source_data": { - "dataset_name": "IMDB", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on IMDB", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.956, - "details": { - "description": "min=0.941, mean=0.956, max=0.965, sum=2.868 (3)", - "tab": "Accuracy", - "IMDB - ECE (10-bin)": { - "description": "min=0.05, mean=0.069, max=0.082, sum=0.207 (3)", - "tab": "Calibration", - "score": 0.06908904600115551 - }, - "IMDB - EM (Robustness)": { - "description": "min=0.907, mean=0.923, max=0.933, sum=2.769 (3)", - "tab": "Robustness", - "score": 0.923 - }, - "IMDB - EM (Fairness)": { - "description": "min=0.931, mean=0.949, max=0.96, sum=2.847 (3)", - "tab": "Fairness", - "score": 0.949 - }, - "IMDB - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "IMDB - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "IMDB - # train": { - "description": "min=2.903, mean=4.229, max=4.983, sum=12.688 (3)", - "tab": "General information", - "score": 4.229333333333333 - }, - "IMDB - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "IMDB - # prompt tokens": { - "description": "min=1283.038, mean=1562.808, max=1784.2, sum=4688.425 (3)", - "tab": "General information", - "score": 1562.8083333333334 - }, - "IMDB - # output tokens": { - "description": "min=1, mean=1, max=1, sum=3 (3)", - "tab": "General 
information", - "score": 1.0 - }, - "IMDB - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "IMDB - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CivilComments", - "source_data": { - "dataset_name": "CivilComments", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on CivilComments", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.524, - "details": { - "description": "min=0.035, mean=0.524, max=0.968, sum=28.319 (54)", - "tab": "Accuracy", - "CivilComments - ECE (10-bin)": { - "description": "min=0.056, mean=0.313, max=0.651, sum=16.899 (54)", - "tab": "Calibration", - "score": 0.3129455444585645 - }, - "CivilComments - EM (Robustness)": { - "description": "min=0.012, mean=0.408, max=0.908, sum=22.047 (54)", - "tab": "Robustness", - "score": 0.408272754767954 - }, - "CivilComments - EM (Fairness)": { - "description": "min=0.03, mean=0.415, max=0.875, sum=22.43 (54)", - "tab": "Fairness", - "score": 0.41537457925495214 - }, - "CivilComments - Denoised inference time (s)": { - "description": "9 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "CivilComments - # eval": { - "description": "min=74, mean=371.556, max=683, sum=20064 (54)", - "tab": "General information", - "score": 371.55555555555554 - }, - "CivilComments - # train": { - "description": "min=5, mean=5, max=5, sum=270 (54)", - "tab": "General information", - "score": 5.0 - }, - "CivilComments - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (54)", - "tab": "General information", - "score": 0.0 - }, - "CivilComments - # prompt tokens": { - "description": "min=362.293, mean=732.514, max=1288.441, sum=39555.782 (54)", - "tab": "General information", - "score": 732.5144825548033 - }, - "CivilComments - # output tokens": { - "description": "min=1, mean=1, max=1, sum=54 (54)", - "tab": "General information", - "score": 1.0 - }, - "CivilComments - # trials": { - "description": "min=3, mean=3, max=3, sum=162 (54)", - "tab": "General information", - "score": 3.0 - }, - "CivilComments - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (54)", - "tab": 
"Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "RAFT", - "source_data": { - "dataset_name": "RAFT", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on RAFT", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.624, - "details": { - "description": "min=0, mean=0.624, max=0.975, sum=20.6 (33)", - "tab": "Accuracy", - "RAFT - ECE (10-bin)": { - "description": "min=0.078, mean=0.25, max=1, sum=8.255 (33)", - "tab": "Calibration", - "score": 0.2501605016965272 - }, - "RAFT - EM (Robustness)": { - "description": "min=0, mean=0.489, max=0.925, sum=16.125 (33)", - "tab": "Robustness", - "score": 0.48863636363636365 - }, - "RAFT - EM (Fairness)": { - "description": "min=0, mean=0.604, max=0.975, sum=19.925 (33)", - "tab": "Fairness", - "score": 0.6037878787878787 - }, - "RAFT - Denoised inference time (s)": { - "description": "11 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "RAFT - # eval": { - "description": "min=40, mean=40, max=40, sum=1320 (33)", - "tab": "General information", - "score": 40.0 - }, - "RAFT - # train": { - "description": "min=0, mean=4.557, max=5, sum=150.375 (33)", - "tab": "General information", - "score": 4.556818181818182 - }, - "RAFT - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (33)", - "tab": "General information", - "score": 0.0 - }, - "RAFT - # prompt tokens": { - "description": "min=270.325, mean=814.446, max=1777.025, sum=26876.725 (33)", - "tab": "General information", - "score": 814.446212121212 - }, - "RAFT - # output tokens": { - "description": "min=0, mean=2.99, max=7.05, sum=98.675 (33)", - "tab": "General information", - "score": 2.9901515151515157 - }, - "RAFT - # trials": { - "description": "min=3, mean=3, max=3, sum=99 (33)", - "tab": "General information", - "score": 3.0 - }, - "RAFT - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (33)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_classic/eleutherai/Pythia-12B/9b91f415-6edf-4a2f-a3ff-a9dac8343ebd.json b/data/helm_classic/eleutherai/Pythia-12B/9b91f415-6edf-4a2f-a3ff-a9dac8343ebd.json deleted file mode 100644 index bc304945b..000000000 --- a/data/helm_classic/eleutherai/Pythia-12B/9b91f415-6edf-4a2f-a3ff-a9dac8343ebd.json +++ /dev/null @@ -1,1613 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_classic/eleutherai_Pythia-12B/1770834891.1472661", - "retrieved_timestamp": "1770834891.1472661", - "source_metadata": { - "source_name": "helm_classic", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Pythia 12B", - "id": "eleutherai/Pythia-12B", - "developer": "eleutherai", - "inference_platform": 
"unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_classic", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperform on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.257, - "details": { - "tab": "Accuracy", - "Mean win rate - Calibration": { - "description": null, - "tab": "Calibration", - "score": 0.37428307123034227 - }, - "Mean win rate - Robustness": { - "description": null, - "tab": "Robustness", - "score": 0.27195804195804196 - }, - "Mean win rate - Fairness": { - "description": null, - "tab": "Fairness", - "score": 0.22631701631701634 - }, - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": null - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - }, - "Mean win rate - Bias": { - "description": null, - "tab": "Bias", - "score": 0.4331466568182155 - }, - "Mean win rate - Toxicity": { - "description": null, - "tab": "Toxicity", - "score": 0.38444055944055944 - }, - "Mean win rate - Summarization metrics": { - "description": null, - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.274, - "details": { - "description": "min=0.2, mean=0.274, max=0.3, sum=1.368 (5)", - "tab": "Accuracy", - "MMLU - ECE (10-bin)": { - "description": "min=0.092, mean=0.111, max=0.166, sum=0.557 (5)", - "tab": "Calibration", - "score": 0.11132961223278444 - }, - "MMLU - EM (Robustness)": { - "description": "min=0.17, mean=0.22, max=0.28, sum=1.102 (5)", - "tab": "Robustness", - "score": 0.22035087719298244 - }, - "MMLU - EM (Fairness)": { - "description": "min=0.16, mean=0.212, max=0.29, sum=1.061 (5)", - "tab": "Fairness", - "score": 0.2121052631578947 - }, - "MMLU - Denoised inference time (s)": { - "description": "5 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=358.76, mean=467.936, max=612.798, sum=2339.678 (5)", - "tab": "General information", - "score": 467.935649122807 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "MMLU - # trials": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - 
"generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "BoolQ", - "source_data": { - "dataset_name": "BoolQ", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on BoolQ", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.662, - "details": { - "description": "min=0.662, mean=0.662, max=0.662, sum=0.662 (1)", - "tab": "Accuracy", - "BoolQ - ECE (10-bin)": { - "description": "min=0.14, mean=0.14, max=0.14, sum=0.14 (1)", - "tab": "Calibration", - "score": 0.13986557582802048 - }, - "BoolQ - EM (Robustness)": { - "description": "min=0.51, mean=0.51, max=0.51, sum=0.51 (1)", - "tab": "Robustness", - "score": 0.51 - }, - "BoolQ - EM (Fairness)": { - "description": "min=0.547, mean=0.547, max=0.547, sum=0.547 (1)", - "tab": "Fairness", - "score": 0.547 - }, - "BoolQ - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "BoolQ - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "BoolQ - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "BoolQ - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "BoolQ - # prompt tokens": { - "description": "min=1251.897, mean=1251.897, max=1251.897, sum=1251.897 (1)", - "tab": "General information", - "score": 1251.897 - }, - "BoolQ - # output tokens": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "BoolQ - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "BoolQ - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.596, - "details": { - "description": "min=0.596, mean=0.596, max=0.596, sum=0.596 (1)", - "tab": "Accuracy", - "NarrativeQA - ECE (10-bin)": { - "description": "min=0.239, mean=0.239, max=0.239, sum=0.239 (1)", - "tab": "Calibration", - "score": 0.2394289121866973 - }, - "NarrativeQA - F1 (Robustness)": { - "description": "min=0.42, mean=0.42, max=0.42, sum=0.42 (1)", - "tab": "Robustness", - "score": 0.42022169799567144 - }, - "NarrativeQA - F1 (Fairness)": { - "description": "min=0.449, 
mean=0.449, max=0.449, sum=0.449 (1)", - "tab": "Fairness", - "score": 0.44869513696457247 - }, - "NarrativeQA - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=1.969, mean=1.969, max=1.969, sum=1.969 (1)", - "tab": "General information", - "score": 1.9690140845070423 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=1691.082, mean=1691.082, max=1691.082, sum=1691.082 (1)", - "tab": "General information", - "score": 1691.081690140845 - }, - "NarrativeQA - # output tokens": { - "description": "min=100, mean=100, max=100, sum=100 (1)", - "tab": "General information", - "score": 100.0 - }, - "NarrativeQA - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NarrativeQA - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NarrativeQA - Stereotypes (gender)": { - "description": "min=0.5, mean=0.5, max=0.5, sum=0.5 (1)", - "tab": "Bias", - "score": 0.5 - }, - "NarrativeQA - Representation (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=0.667 (1)", - "tab": "Bias", - "score": 0.6666666666666667 - }, - "NarrativeQA - Representation (gender)": { - "description": "min=0.215, mean=0.215, max=0.215, sum=0.215 (1)", - "tab": "Bias", - "score": 0.2152777777777778 - }, - "NarrativeQA - Toxic fraction": { - "description": "min=0.023, mean=0.023, max=0.023, sum=0.023 (1)", - "tab": "Toxicity", - "score": 0.022535211267605635 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (open-book)", - "source_data": { - "dataset_name": "NaturalQuestions (open-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (open-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.581, - "details": { - "description": "min=0.581, mean=0.581, max=0.581, sum=0.581 (1)", - "tab": "Accuracy", - "NaturalQuestions (closed-book) - ECE (10-bin)": { - "description": "min=0.094, mean=0.094, max=0.094, sum=0.094 (1)", - "tab": "Calibration", - "score": 0.09399996958029097 - }, - "NaturalQuestions (open-book) - ECE (10-bin)": { - "description": "min=0.39, mean=0.39, max=0.39, sum=0.39 (1)", - "tab": "Calibration", - "score": 0.3899944090149843 - }, - "NaturalQuestions (closed-book) - F1 (Robustness)": { - "description": "min=0.108, mean=0.108, max=0.108, sum=0.108 (1)", - "tab": "Robustness", - "score": 0.10849928114746796 - }, - "NaturalQuestions (open-book) - F1 (Robustness)": { - "description": "min=0.47, mean=0.47, max=0.47, sum=0.47 (1)", - "tab": "Robustness", - "score": 0.46990137932247006 - }, - "NaturalQuestions (closed-book) - F1 (Fairness)": { - "description": "min=0.131, mean=0.131, max=0.131, sum=0.131 (1)", - "tab": "Fairness", - "score": 0.13109020655004933 - }, - "NaturalQuestions (open-book) - F1 (Fairness)": { - "description": "min=0.523, 
mean=0.523, max=0.523, sum=0.523 (1)", - "tab": "Fairness", - "score": 0.5229768252994325 - }, - "NaturalQuestions (closed-book) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NaturalQuestions (open-book) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=117.299, mean=117.299, max=117.299, sum=117.299 (1)", - "tab": "General information", - "score": 117.299 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=300, mean=300, max=300, sum=300 (1)", - "tab": "General information", - "score": 300.0 - }, - "NaturalQuestions (closed-book) - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.704, mean=4.704, max=4.704, sum=4.704 (1)", - "tab": "General information", - "score": 4.704 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.037, mean=0.037, max=0.037, sum=0.037 (1)", - "tab": "General information", - "score": 0.037 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1495.552, mean=1495.552, max=1495.552, sum=1495.552 (1)", - "tab": "General information", - "score": 1495.552 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=300, mean=300, max=300, sum=300 (1)", - "tab": "General information", - "score": 300.0 - }, - "NaturalQuestions (open-book) - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NaturalQuestions (closed-book) - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NaturalQuestions (closed-book) - Stereotypes (gender)": { - "description": "min=0.5, mean=0.5, max=0.5, sum=0.5 (1)", - "tab": "Bias", - "score": 0.5 - }, - "NaturalQuestions (closed-book) - Representation (race)": { - "description": "min=0.407, mean=0.407, max=0.407, sum=0.407 (1)", - "tab": "Bias", - "score": 0.40682414698162733 - }, - "NaturalQuestions (closed-book) - Representation (gender)": { - "description": "min=0.122, mean=0.122, max=0.122, sum=0.122 (1)", - "tab": "Bias", - "score": 0.1216216216216216 - }, - "NaturalQuestions (open-book) - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NaturalQuestions (open-book) - Stereotypes (gender)": { - "description": "min=0.405, mean=0.405, max=0.405, sum=0.405 (1)", - "tab": "Bias", - "score": 0.4047619047619048 - }, - "NaturalQuestions (open-book) - Representation (race)": { - "description": "min=0.467, mean=0.467, max=0.467, sum=0.467 (1)", - "tab": "Bias", - "score": 0.4666666666666667 - 
}, - "NaturalQuestions (open-book) - Representation (gender)": { - "description": "min=0.276, mean=0.276, max=0.276, sum=0.276 (1)", - "tab": "Bias", - "score": 0.27551020408163257 - }, - "NaturalQuestions (closed-book) - Toxic fraction": { - "description": "min=0.002, mean=0.002, max=0.002, sum=0.002 (1)", - "tab": "Toxicity", - "score": 0.002 - }, - "NaturalQuestions (open-book) - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "QuAC", - "source_data": { - "dataset_name": "QuAC", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on QuAC", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.313, - "details": { - "description": "min=0.313, mean=0.313, max=0.313, sum=0.313 (1)", - "tab": "Accuracy", - "QuAC - ECE (10-bin)": { - "description": "min=0.138, mean=0.138, max=0.138, sum=0.138 (1)", - "tab": "Calibration", - "score": 0.1383150544527575 - }, - "QuAC - F1 (Robustness)": { - "description": "min=0.171, mean=0.171, max=0.171, sum=0.171 (1)", - "tab": "Robustness", - "score": 0.17120890749036072 - }, - "QuAC - F1 (Fairness)": { - "description": "min=0.227, mean=0.227, max=0.227, sum=0.227 (1)", - "tab": "Fairness", - "score": 0.22738715021444486 - }, - "QuAC - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "QuAC - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "QuAC - # train": { - "description": "min=0.883, mean=0.883, max=0.883, sum=0.883 (1)", - "tab": "General information", - "score": 0.883 - }, - "QuAC - truncated": { - "description": "min=0.021, mean=0.021, max=0.021, sum=0.021 (1)", - "tab": "General information", - "score": 0.021 - }, - "QuAC - # prompt tokens": { - "description": "min=1655.708, mean=1655.708, max=1655.708, sum=1655.708 (1)", - "tab": "General information", - "score": 1655.708 - }, - "QuAC - # output tokens": { - "description": "min=100, mean=100, max=100, sum=100 (1)", - "tab": "General information", - "score": 100.0 - }, - "QuAC - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "QuAC - Stereotypes (race)": { - "description": "min=0.641, mean=0.641, max=0.641, sum=0.641 (1)", - "tab": "Bias", - "score": 0.6406926406926409 - }, - "QuAC - Stereotypes (gender)": { - "description": "min=0.415, mean=0.415, max=0.415, sum=0.415 (1)", - "tab": "Bias", - "score": 0.4150793650793651 - }, - "QuAC - Representation (race)": { - "description": "min=0.314, mean=0.314, max=0.314, sum=0.314 (1)", - "tab": "Bias", - "score": 0.3137254901960784 - }, - "QuAC - Representation (gender)": { - "description": "min=0.26, mean=0.26, max=0.26, sum=0.26 (1)", - "tab": "Bias", - "score": 0.25965665236051505 - }, - "QuAC - Toxic fraction": { - "description": "min=0.002, mean=0.002, max=0.002, sum=0.002 (1)", - "tab": "Toxicity", - "score": 0.002 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "HellaSwag", - "source_data": { - "dataset_name": "HellaSwag", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on HellaSwag", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "HellaSwag - ECE (10-bin)": { - "description": "No matching runs", - "tab": "Calibration", - "score": null - }, - "HellaSwag - EM (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "HellaSwag - EM (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "HellaSwag - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "HellaSwag - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "OpenbookQA - ECE (10-bin)": { - "description": "No matching runs", - "tab": "Calibration", - "score": null - }, - "OpenbookQA - EM (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "OpenbookQA - EM (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "OpenbookQA - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "OpenbookQA - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "TruthfulQA", - "source_data": { - "dataset_name": "TruthfulQA", - 
"source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on TruthfulQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.177, - "details": { - "description": "min=0.177, mean=0.177, max=0.177, sum=0.177 (1)", - "tab": "Accuracy", - "TruthfulQA - ECE (10-bin)": { - "description": "min=0.094, mean=0.094, max=0.094, sum=0.094 (1)", - "tab": "Calibration", - "score": 0.09363268995646454 - }, - "TruthfulQA - EM (Robustness)": { - "description": "min=0.138, mean=0.138, max=0.138, sum=0.138 (1)", - "tab": "Robustness", - "score": 0.13761467889908258 - }, - "TruthfulQA - EM (Fairness)": { - "description": "min=0.154, mean=0.154, max=0.154, sum=0.154 (1)", - "tab": "Fairness", - "score": 0.154434250764526 - }, - "TruthfulQA - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "TruthfulQA - # eval": { - "description": "min=654, mean=654, max=654, sum=654 (1)", - "tab": "General information", - "score": 654.0 - }, - "TruthfulQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "TruthfulQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "TruthfulQA - # prompt tokens": { - "description": "min=505.352, mean=505.352, max=505.352, sum=505.352 (1)", - "tab": "General information", - "score": 505.35168195718654 - }, - "TruthfulQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "TruthfulQA - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MS MARCO (TREC)", - "source_data": { - "dataset_name": "MS MARCO (TREC)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "NDCG@10 on MS MARCO (TREC)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "MS MARCO (regular) - RR@10 (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "MS MARCO (TREC) - NDCG@10 (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "MS MARCO (regular) - RR@10 (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "MS MARCO (TREC) - NDCG@10 (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "MS MARCO (regular) - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "MS MARCO (TREC) - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "MS MARCO (regular) - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # train": { - "description": "No matching runs", - 
"tab": "General information", - "score": null - }, - "MS MARCO (regular) - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - }, - "MS MARCO (TREC) - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CNN/DailyMail", - "source_data": { - "dataset_name": "CNN/DailyMail", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on CNN/DailyMail", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "CNN/DailyMail - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "CNN/DailyMail - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - 
"CNN/DailyMail - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - }, - "CNN/DailyMail - SummaC": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - QAFactEval": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - BERTScore (F1)": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - Coverage": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - Density": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - Compression": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-faithfulness": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-relevance": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-coherence": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "XSUM", - "source_data": { - "dataset_name": "XSUM", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on XSUM", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "XSUM - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "XSUM - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # 
trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - }, - "XSUM - SummaC": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - QAFactEval": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - BERTScore (F1)": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - Coverage": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - Density": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - Compression": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-faithfulness": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-relevance": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-coherence": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "IMDB", - "source_data": { - "dataset_name": "IMDB", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on IMDB", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.931, - "details": { - "description": "min=0.931, mean=0.931, max=0.931, sum=0.931 (1)", - "tab": "Accuracy", - "IMDB - ECE (10-bin)": { - "description": "min=0.342, mean=0.342, max=0.342, sum=0.342 (1)", - "tab": "Calibration", - "score": 0.34150363639115 - }, - "IMDB - EM (Robustness)": { - "description": "min=0.854, mean=0.854, max=0.854, sum=0.854 (1)", - "tab": "Robustness", - "score": 0.854 - }, - "IMDB - EM (Fairness)": { - "description": "min=0.916, mean=0.916, max=0.916, sum=0.916 (1)", - "tab": "Fairness", - "score": 0.916 - }, - "IMDB - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "IMDB - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "IMDB - # train": { - "description": "min=2.911, mean=2.911, max=2.911, sum=2.911 (1)", - "tab": "General information", - "score": 2.911 - }, - "IMDB - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IMDB - # prompt tokens": { - "description": "min=1619.568, mean=1619.568, max=1619.568, sum=1619.568 (1)", - "tab": "General information", - "score": 1619.568 - }, - "IMDB - # output tokens": { - "description": 
"min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "IMDB - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "IMDB - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CivilComments", - "source_data": { - "dataset_name": "CivilComments", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on CivilComments", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.531, - "details": { - "description": "min=0.03, mean=0.531, max=0.988, sum=9.561 (18)", - "tab": "Accuracy", - "CivilComments - ECE (10-bin)": { - "description": "min=0.138, mean=0.297, max=0.479, sum=5.337 (18)", - "tab": "Calibration", - "score": 0.2965193799633309 - }, - "CivilComments - EM (Robustness)": { - "description": "min=0.02, mean=0.418, max=0.973, sum=7.526 (18)", - "tab": "Robustness", - "score": 0.41812542395705293 - }, - "CivilComments - EM (Fairness)": { - "description": "min=0.01, mean=0.448, max=0.985, sum=8.071 (18)", - "tab": "Fairness", - "score": 0.44837567354282437 - }, - "CivilComments - Denoised inference time (s)": { - "description": "9 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "CivilComments - # eval": { - "description": "min=74, mean=371.556, max=683, sum=6688 (18)", - "tab": "General information", - "score": 371.55555555555554 - }, - "CivilComments - # train": { - "description": "min=5, mean=5, max=5, sum=90 (18)", - "tab": "General information", - "score": 5.0 - }, - "CivilComments - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (18)", - "tab": "General information", - "score": 0.0 - }, - "CivilComments - # prompt tokens": { - "description": "min=360.976, mean=771.654, max=1282.4, sum=13889.772 (18)", - "tab": "General information", - "score": 771.6539847352628 - }, - "CivilComments - # output tokens": { - "description": "min=5, mean=5, max=5, sum=90 (18)", - "tab": "General information", - "score": 5.0 - }, - "CivilComments - # trials": { - "description": "min=1, mean=1, max=1, sum=18 (18)", - "tab": "General information", - "score": 1.0 - }, - "CivilComments - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Toxic fraction": { - "description": 
"9 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "RAFT", - "source_data": { - "dataset_name": "RAFT", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on RAFT", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.514, - "details": { - "description": "min=0.175, mean=0.514, max=0.975, sum=5.65 (11)", - "tab": "Accuracy", - "RAFT - ECE (10-bin)": { - "description": "min=0.175, mean=0.514, max=0.975, sum=5.649 (11)", - "tab": "Calibration", - "score": 0.5135614568346981 - }, - "RAFT - EM (Robustness)": { - "description": "min=0, mean=0.45, max=0.975, sum=4.95 (11)", - "tab": "Robustness", - "score": 0.45 - }, - "RAFT - EM (Fairness)": { - "description": "min=0.15, mean=0.489, max=0.975, sum=5.375 (11)", - "tab": "Fairness", - "score": 0.48863636363636365 - }, - "RAFT - Denoised inference time (s)": { - "description": "11 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "RAFT - # eval": { - "description": "min=40, mean=40, max=40, sum=440 (11)", - "tab": "General information", - "score": 40.0 - }, - "RAFT - # train": { - "description": "min=0.7, mean=4.605, max=5, sum=50.65 (11)", - "tab": "General information", - "score": 4.6045454545454545 - }, - "RAFT - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (11)", - "tab": "General information", - "score": 0.0 - }, - "RAFT - # prompt tokens": { - "description": "min=280.35, mean=869.691, max=1756.575, sum=9566.6 (11)", - "tab": "General information", - "score": 869.6909090909089 - }, - "RAFT - # output tokens": { - "description": "min=30, mean=30, max=30, sum=330 (11)", - "tab": "General information", - "score": 30.0 - }, - "RAFT - # trials": { - "description": "min=1, mean=1, max=1, sum=11 (11)", - "tab": "General information", - "score": 1.0 - }, - "RAFT - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Toxic fraction": { - "description": "11 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_classic/eleutherai/Pythia-6.9B/742a59e8-c813-42ef-938a-4897e25dcdad.json b/data/helm_classic/eleutherai/Pythia-6.9B/742a59e8-c813-42ef-938a-4897e25dcdad.json deleted file mode 100644 index 511816a71..000000000 --- a/data/helm_classic/eleutherai/Pythia-6.9B/742a59e8-c813-42ef-938a-4897e25dcdad.json +++ /dev/null @@ -1,1613 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_classic/eleutherai_Pythia-6.9B/1770834891.1472661", - "retrieved_timestamp": "1770834891.1472661", - "source_metadata": { - "source_name": "helm_classic", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Pythia 6.9B", - "id": "eleutherai/Pythia-6.9B", - "developer": 
"eleutherai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_classic", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperform on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.196, - "details": { - "tab": "Accuracy", - "Mean win rate - Calibration": { - "description": null, - "tab": "Calibration", - "score": 0.4304810360777058 - }, - "Mean win rate - Robustness": { - "description": null, - "tab": "Robustness", - "score": 0.1820979020979021 - }, - "Mean win rate - Fairness": { - "description": null, - "tab": "Fairness", - "score": 0.17121212121212123 - }, - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": null - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - }, - "Mean win rate - Bias": { - "description": null, - "tab": "Bias", - "score": 0.5099743679983342 - }, - "Mean win rate - Toxicity": { - "description": null, - "tab": "Toxicity", - "score": 0.512004662004662 - }, - "Mean win rate - Summarization metrics": { - "description": null, - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.236, - "details": { - "description": "min=0.16, mean=0.236, max=0.281, sum=1.181 (5)", - "tab": "Accuracy", - "MMLU - ECE (10-bin)": { - "description": "min=0.064, mean=0.136, max=0.2, sum=0.682 (5)", - "tab": "Calibration", - "score": 0.1364262799156796 - }, - "MMLU - EM (Robustness)": { - "description": "min=0.12, mean=0.201, max=0.263, sum=1.003 (5)", - "tab": "Robustness", - "score": 0.20063157894736844 - }, - "MMLU - EM (Fairness)": { - "description": "min=0.14, mean=0.207, max=0.254, sum=1.034 (5)", - "tab": "Fairness", - "score": 0.20687719298245613 - }, - "MMLU - Denoised inference time (s)": { - "description": "5 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=358.76, mean=467.936, max=612.798, sum=2339.678 (5)", - "tab": "General information", - "score": 467.935649122807 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "MMLU - # trials": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - 
"score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "BoolQ", - "source_data": { - "dataset_name": "BoolQ", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on BoolQ", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.631, - "details": { - "description": "min=0.631, mean=0.631, max=0.631, sum=0.631 (1)", - "tab": "Accuracy", - "BoolQ - ECE (10-bin)": { - "description": "min=0.106, mean=0.106, max=0.106, sum=0.106 (1)", - "tab": "Calibration", - "score": 0.10596147166386737 - }, - "BoolQ - EM (Robustness)": { - "description": "min=0.527, mean=0.527, max=0.527, sum=0.527 (1)", - "tab": "Robustness", - "score": 0.527 - }, - "BoolQ - EM (Fairness)": { - "description": "min=0.552, mean=0.552, max=0.552, sum=0.552 (1)", - "tab": "Fairness", - "score": 0.552 - }, - "BoolQ - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "BoolQ - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "BoolQ - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "BoolQ - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "BoolQ - # prompt tokens": { - "description": "min=1251.897, mean=1251.897, max=1251.897, sum=1251.897 (1)", - "tab": "General information", - "score": 1251.897 - }, - "BoolQ - # output tokens": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "BoolQ - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "BoolQ - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.528, - "details": { - "description": "min=0.528, mean=0.528, max=0.528, sum=0.528 (1)", - "tab": "Accuracy", - "NarrativeQA - ECE (10-bin)": { - "description": "min=0.217, mean=0.217, max=0.217, sum=0.217 (1)", - "tab": "Calibration", - "score": 0.21689349381563713 - }, - "NarrativeQA - F1 (Robustness)": { - "description": "min=0.313, mean=0.313, max=0.313, sum=0.313 (1)", - "tab": "Robustness", - "score": 0.31250255336597976 - }, - "NarrativeQA - F1 
(Fairness)": { - "description": "min=0.389, mean=0.389, max=0.389, sum=0.389 (1)", - "tab": "Fairness", - "score": 0.38935766339772926 - }, - "NarrativeQA - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=1.969, mean=1.969, max=1.969, sum=1.969 (1)", - "tab": "General information", - "score": 1.9690140845070423 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=1691.082, mean=1691.082, max=1691.082, sum=1691.082 (1)", - "tab": "General information", - "score": 1691.081690140845 - }, - "NarrativeQA - # output tokens": { - "description": "min=100, mean=100, max=100, sum=100 (1)", - "tab": "General information", - "score": 100.0 - }, - "NarrativeQA - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NarrativeQA - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NarrativeQA - Stereotypes (gender)": { - "description": "min=0.444, mean=0.444, max=0.444, sum=0.444 (1)", - "tab": "Bias", - "score": 0.4444444444444444 - }, - "NarrativeQA - Representation (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=0.667 (1)", - "tab": "Bias", - "score": 0.6666666666666667 - }, - "NarrativeQA - Representation (gender)": { - "description": "min=0.204, mean=0.204, max=0.204, sum=0.204 (1)", - "tab": "Bias", - "score": 0.20434782608695648 - }, - "NarrativeQA - Toxic fraction": { - "description": "min=0.014, mean=0.014, max=0.014, sum=0.014 (1)", - "tab": "Toxicity", - "score": 0.014084507042253521 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (open-book)", - "source_data": { - "dataset_name": "NaturalQuestions (open-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (open-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.539, - "details": { - "description": "min=0.539, mean=0.539, max=0.539, sum=0.539 (1)", - "tab": "Accuracy", - "NaturalQuestions (closed-book) - ECE (10-bin)": { - "description": "min=0.07, mean=0.07, max=0.07, sum=0.07 (1)", - "tab": "Calibration", - "score": 0.06999999827276561 - }, - "NaturalQuestions (open-book) - ECE (10-bin)": { - "description": "min=0.369, mean=0.369, max=0.369, sum=0.369 (1)", - "tab": "Calibration", - "score": 0.3689977017786239 - }, - "NaturalQuestions (closed-book) - F1 (Robustness)": { - "description": "min=0.094, mean=0.094, max=0.094, sum=0.094 (1)", - "tab": "Robustness", - "score": 0.09385332819874069 - }, - "NaturalQuestions (open-book) - F1 (Robustness)": { - "description": "min=0.391, mean=0.391, max=0.391, sum=0.391 (1)", - "tab": "Robustness", - "score": 0.39128308105054077 - }, - "NaturalQuestions (closed-book) - F1 (Fairness)": { - "description": "min=0.103, mean=0.103, max=0.103, sum=0.103 (1)", - "tab": "Fairness", - "score": 0.10301926896303132 - }, - 
"NaturalQuestions (open-book) - F1 (Fairness)": { - "description": "min=0.464, mean=0.464, max=0.464, sum=0.464 (1)", - "tab": "Fairness", - "score": 0.4640855445555752 - }, - "NaturalQuestions (closed-book) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NaturalQuestions (open-book) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=117.299, mean=117.299, max=117.299, sum=117.299 (1)", - "tab": "General information", - "score": 117.299 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=300, mean=300, max=300, sum=300 (1)", - "tab": "General information", - "score": 300.0 - }, - "NaturalQuestions (closed-book) - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.704, mean=4.704, max=4.704, sum=4.704 (1)", - "tab": "General information", - "score": 4.704 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.037, mean=0.037, max=0.037, sum=0.037 (1)", - "tab": "General information", - "score": 0.037 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1495.552, mean=1495.552, max=1495.552, sum=1495.552 (1)", - "tab": "General information", - "score": 1495.552 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=299.883, mean=299.883, max=299.883, sum=299.883 (1)", - "tab": "General information", - "score": 299.883 - }, - "NaturalQuestions (open-book) - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NaturalQuestions (closed-book) - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NaturalQuestions (closed-book) - Stereotypes (gender)": { - "description": "min=0.5, mean=0.5, max=0.5, sum=0.5 (1)", - "tab": "Bias", - "score": 0.5 - }, - "NaturalQuestions (closed-book) - Representation (race)": { - "description": "min=0.312, mean=0.312, max=0.312, sum=0.312 (1)", - "tab": "Bias", - "score": 0.31182795698924726 - }, - "NaturalQuestions (closed-book) - Representation (gender)": { - "description": "min=0.188, mean=0.188, max=0.188, sum=0.188 (1)", - "tab": "Bias", - "score": 0.1875 - }, - "NaturalQuestions (open-book) - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NaturalQuestions (open-book) - Stereotypes (gender)": { - "description": "min=0.387, mean=0.387, max=0.387, sum=0.387 (1)", - "tab": "Bias", - "score": 0.38690476190476186 - }, - "NaturalQuestions (open-book) - Representation (race)": { - "description": "min=0.422, 
mean=0.422, max=0.422, sum=0.422 (1)", - "tab": "Bias", - "score": 0.42222222222222217 - }, - "NaturalQuestions (open-book) - Representation (gender)": { - "description": "min=0.159, mean=0.159, max=0.159, sum=0.159 (1)", - "tab": "Bias", - "score": 0.1590909090909091 - }, - "NaturalQuestions (closed-book) - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "Toxicity", - "score": 0.0 - }, - "NaturalQuestions (open-book) - Toxic fraction": { - "description": "min=0.001, mean=0.001, max=0.001, sum=0.001 (1)", - "tab": "Toxicity", - "score": 0.001 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "QuAC", - "source_data": { - "dataset_name": "QuAC", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on QuAC", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.296, - "details": { - "description": "min=0.296, mean=0.296, max=0.296, sum=0.296 (1)", - "tab": "Accuracy", - "QuAC - ECE (10-bin)": { - "description": "min=0.1, mean=0.1, max=0.1, sum=0.1 (1)", - "tab": "Calibration", - "score": 0.09977223409937552 - }, - "QuAC - F1 (Robustness)": { - "description": "min=0.171, mean=0.171, max=0.171, sum=0.171 (1)", - "tab": "Robustness", - "score": 0.17097990289529255 - }, - "QuAC - F1 (Fairness)": { - "description": "min=0.198, mean=0.198, max=0.198, sum=0.198 (1)", - "tab": "Fairness", - "score": 0.19836760191150613 - }, - "QuAC - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "QuAC - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "QuAC - # train": { - "description": "min=0.883, mean=0.883, max=0.883, sum=0.883 (1)", - "tab": "General information", - "score": 0.883 - }, - "QuAC - truncated": { - "description": "min=0.021, mean=0.021, max=0.021, sum=0.021 (1)", - "tab": "General information", - "score": 0.021 - }, - "QuAC - # prompt tokens": { - "description": "min=1655.708, mean=1655.708, max=1655.708, sum=1655.708 (1)", - "tab": "General information", - "score": 1655.708 - }, - "QuAC - # output tokens": { - "description": "min=100, mean=100, max=100, sum=100 (1)", - "tab": "General information", - "score": 100.0 - }, - "QuAC - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "QuAC - Stereotypes (race)": { - "description": "min=0.635, mean=0.635, max=0.635, sum=0.635 (1)", - "tab": "Bias", - "score": 0.6349206349206349 - }, - "QuAC - Stereotypes (gender)": { - "description": "min=0.416, mean=0.416, max=0.416, sum=0.416 (1)", - "tab": "Bias", - "score": 0.41639199007620065 - }, - "QuAC - Representation (race)": { - "description": "min=0.369, mean=0.369, max=0.369, sum=0.369 (1)", - "tab": "Bias", - "score": 0.3687074829931972 - }, - "QuAC - Representation (gender)": { - "description": "min=0.25, mean=0.25, max=0.25, sum=0.25 (1)", - "tab": "Bias", - "score": 0.25 - }, - "QuAC - Toxic fraction": { - "description": "min=0.003, mean=0.003, max=0.003, sum=0.003 (1)", - "tab": "Toxicity", - "score": 0.003 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "HellaSwag", - "source_data": { - 
"dataset_name": "HellaSwag", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on HellaSwag", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "HellaSwag - ECE (10-bin)": { - "description": "No matching runs", - "tab": "Calibration", - "score": null - }, - "HellaSwag - EM (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "HellaSwag - EM (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "HellaSwag - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "HellaSwag - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "OpenbookQA - ECE (10-bin)": { - "description": "No matching runs", - "tab": "Calibration", - "score": null - }, - "OpenbookQA - EM (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "OpenbookQA - EM (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "OpenbookQA - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "OpenbookQA - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": 
"TruthfulQA", - "source_data": { - "dataset_name": "TruthfulQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on TruthfulQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.213, - "details": { - "description": "min=0.213, mean=0.213, max=0.213, sum=0.213 (1)", - "tab": "Accuracy", - "TruthfulQA - ECE (10-bin)": { - "description": "min=0.076, mean=0.076, max=0.076, sum=0.076 (1)", - "tab": "Calibration", - "score": 0.07613907039385276 - }, - "TruthfulQA - EM (Robustness)": { - "description": "min=0.139, mean=0.139, max=0.139, sum=0.139 (1)", - "tab": "Robustness", - "score": 0.13914373088685014 - }, - "TruthfulQA - EM (Fairness)": { - "description": "min=0.18, mean=0.18, max=0.18, sum=0.18 (1)", - "tab": "Fairness", - "score": 0.18042813455657492 - }, - "TruthfulQA - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "TruthfulQA - # eval": { - "description": "min=654, mean=654, max=654, sum=654 (1)", - "tab": "General information", - "score": 654.0 - }, - "TruthfulQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "TruthfulQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "TruthfulQA - # prompt tokens": { - "description": "min=505.352, mean=505.352, max=505.352, sum=505.352 (1)", - "tab": "General information", - "score": 505.35168195718654 - }, - "TruthfulQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "TruthfulQA - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MS MARCO (TREC)", - "source_data": { - "dataset_name": "MS MARCO (TREC)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "NDCG@10 on MS MARCO (TREC)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "MS MARCO (regular) - RR@10 (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "MS MARCO (TREC) - NDCG@10 (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "MS MARCO (regular) - RR@10 (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "MS MARCO (TREC) - NDCG@10 (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "MS MARCO (regular) - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "MS MARCO (TREC) - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "MS MARCO (regular) - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO 
(regular) - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - }, - "MS MARCO (TREC) - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CNN/DailyMail", - "source_data": { - "dataset_name": "CNN/DailyMail", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on CNN/DailyMail", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "CNN/DailyMail - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "CNN/DailyMail - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # train": { - "description": "No matching runs", - 
"tab": "General information", - "score": null - }, - "CNN/DailyMail - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - }, - "CNN/DailyMail - SummaC": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - QAFactEval": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - BERTScore (F1)": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - Coverage": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - Density": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - Compression": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-faithfulness": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-relevance": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-coherence": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "XSUM", - "source_data": { - "dataset_name": "XSUM", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on XSUM", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "XSUM - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "XSUM - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # output tokens": { - "description": "No matching runs", - "tab": 
"General information", - "score": null - }, - "XSUM - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - }, - "XSUM - SummaC": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - QAFactEval": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - BERTScore (F1)": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - Coverage": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - Density": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - Compression": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-faithfulness": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-relevance": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-coherence": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "IMDB", - "source_data": { - "dataset_name": "IMDB", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on IMDB", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.928, - "details": { - "description": "min=0.928, mean=0.928, max=0.928, sum=0.928 (1)", - "tab": "Accuracy", - "IMDB - ECE (10-bin)": { - "description": "min=0.302, mean=0.302, max=0.302, sum=0.302 (1)", - "tab": "Calibration", - "score": 0.3016994708797646 - }, - "IMDB - EM (Robustness)": { - "description": "min=0.871, mean=0.871, max=0.871, sum=0.871 (1)", - "tab": "Robustness", - "score": 0.871 - }, - "IMDB - EM (Fairness)": { - "description": "min=0.911, mean=0.911, max=0.911, sum=0.911 (1)", - "tab": "Fairness", - "score": 0.911 - }, - "IMDB - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "IMDB - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "IMDB - # train": { - "description": "min=2.911, mean=2.911, max=2.911, sum=2.911 (1)", - "tab": "General information", - "score": 2.911 - }, - "IMDB - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IMDB - # prompt tokens": { - "description": "min=1619.568, mean=1619.568, max=1619.568, sum=1619.568 (1)", - "tab": "General information", - "score": 1619.568 
- }, - "IMDB - # output tokens": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "IMDB - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "IMDB - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CivilComments", - "source_data": { - "dataset_name": "CivilComments", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on CivilComments", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.511, - "details": { - "description": "min=0.02, mean=0.511, max=0.988, sum=9.207 (18)", - "tab": "Accuracy", - "CivilComments - ECE (10-bin)": { - "description": "min=0.074, mean=0.259, max=0.508, sum=4.655 (18)", - "tab": "Calibration", - "score": 0.25858613851508827 - }, - "CivilComments - EM (Robustness)": { - "description": "min=0.013, mean=0.363, max=0.915, sum=6.531 (18)", - "tab": "Robustness", - "score": 0.3628308048007681 - }, - "CivilComments - EM (Fairness)": { - "description": "min=0.001, mean=0.333, max=0.927, sum=5.995 (18)", - "tab": "Fairness", - "score": 0.33307716875468274 - }, - "CivilComments - Denoised inference time (s)": { - "description": "9 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "CivilComments - # eval": { - "description": "min=74, mean=371.556, max=683, sum=6688 (18)", - "tab": "General information", - "score": 371.55555555555554 - }, - "CivilComments - # train": { - "description": "min=5, mean=5, max=5, sum=90 (18)", - "tab": "General information", - "score": 5.0 - }, - "CivilComments - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (18)", - "tab": "General information", - "score": 0.0 - }, - "CivilComments - # prompt tokens": { - "description": "min=360.976, mean=771.654, max=1282.4, sum=13889.772 (18)", - "tab": "General information", - "score": 771.6539847352628 - }, - "CivilComments - # output tokens": { - "description": "min=5, mean=5, max=5, sum=90 (18)", - "tab": "General information", - "score": 5.0 - }, - "CivilComments - # trials": { - "description": "min=1, mean=1, max=1, sum=18 (18)", - "tab": "General information", - "score": 1.0 - }, - "CivilComments - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - 
"CivilComments - Toxic fraction": { - "description": "9 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "RAFT", - "source_data": { - "dataset_name": "RAFT", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on RAFT", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.502, - "details": { - "description": "min=0.175, mean=0.502, max=0.975, sum=5.525 (11)", - "tab": "Accuracy", - "RAFT - ECE (10-bin)": { - "description": "min=0.175, mean=0.502, max=0.975, sum=5.519 (11)", - "tab": "Calibration", - "score": 0.5016937882323235 - }, - "RAFT - EM (Robustness)": { - "description": "min=0.075, mean=0.377, max=0.975, sum=4.15 (11)", - "tab": "Robustness", - "score": 0.3772727272727272 - }, - "RAFT - EM (Fairness)": { - "description": "min=0.175, mean=0.45, max=0.975, sum=4.95 (11)", - "tab": "Fairness", - "score": 0.45 - }, - "RAFT - Denoised inference time (s)": { - "description": "11 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "RAFT - # eval": { - "description": "min=40, mean=40, max=40, sum=440 (11)", - "tab": "General information", - "score": 40.0 - }, - "RAFT - # train": { - "description": "min=0.7, mean=4.605, max=5, sum=50.65 (11)", - "tab": "General information", - "score": 4.6045454545454545 - }, - "RAFT - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (11)", - "tab": "General information", - "score": 0.0 - }, - "RAFT - # prompt tokens": { - "description": "min=280.35, mean=869.691, max=1756.575, sum=9566.6 (11)", - "tab": "General information", - "score": 869.6909090909089 - }, - "RAFT - # output tokens": { - "description": "min=30, mean=30, max=30, sum=330 (11)", - "tab": "General information", - "score": 30.0 - }, - "RAFT - # trials": { - "description": "min=1, mean=1, max=1, sum=11 (11)", - "tab": "General information", - "score": 1.0 - }, - "RAFT - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (gender)": { - "description": "min=0.5, mean=0.5, max=0.5, sum=0.5 (1)", - "tab": "Bias", - "score": 0.5 - }, - "RAFT - Toxic fraction": { - "description": "11 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_classic/google/Palmyra-X-43B/5dec6a7d-2710-49c2-889d-c7b8ee203ce4.json b/data/helm_classic/google/Palmyra-X-43B/5dec6a7d-2710-49c2-889d-c7b8ee203ce4.json deleted file mode 100644 index 8d33e45b6..000000000 --- a/data/helm_classic/google/Palmyra-X-43B/5dec6a7d-2710-49c2-889d-c7b8ee203ce4.json +++ /dev/null @@ -1,1613 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_classic/google_Palmyra-X-43B/1770834891.1472661", - "retrieved_timestamp": "1770834891.1472661", - "source_metadata": { - "source_name": "helm_classic", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - 
"model_info": { - "name": "Palmyra X 43B", - "id": "google/Palmyra-X-43B", - "developer": "google", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_classic", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperform on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.732, - "details": { - "tab": "Accuracy", - "Mean win rate - Calibration": { - "description": null, - "tab": "Calibration", - "score": null - }, - "Mean win rate - Robustness": { - "description": null, - "tab": "Robustness", - "score": 0.8206682206682206 - }, - "Mean win rate - Fairness": { - "description": null, - "tab": "Fairness", - "score": 0.7968401968401968 - }, - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": null - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - }, - "Mean win rate - Bias": { - "description": null, - "tab": "Bias", - "score": 0.5458006056443556 - }, - "Mean win rate - Toxicity": { - "description": null, - "tab": "Toxicity", - "score": 0.462995337995338 - }, - "Mean win rate - Summarization metrics": { - "description": null, - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.609, - "details": { - "description": "min=0.35, mean=0.609, max=0.88, sum=9.136 (15)", - "tab": "Accuracy", - "MMLU - ECE (10-bin)": { - "description": "5 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "MMLU - EM (Robustness)": { - "description": "min=0.29, mean=0.566, max=0.86, sum=8.494 (15)", - "tab": "Robustness", - "score": 0.5662339181286549 - }, - "MMLU - EM (Fairness)": { - "description": "min=0.34, mean=0.588, max=0.86, sum=8.822 (15)", - "tab": "Fairness", - "score": 0.5881637426900584 - }, - "MMLU - Denoised inference time (s)": { - "description": "5 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=1542 (15)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=75 (15)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (15)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=371.38, mean=472.274, max=624.07, sum=7084.111 (15)", - "tab": "General information", - "score": 472.2740350877193 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=15 (15)", - "tab": "General information", - "score": 1.0 - }, - "MMLU - # trials": { - "description": "min=3, mean=3, 
max=3, sum=45 (15)", - "tab": "General information", - "score": 3.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "BoolQ", - "source_data": { - "dataset_name": "BoolQ", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on BoolQ", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.896, - "details": { - "description": "min=0.894, mean=0.896, max=0.898, sum=2.689 (3)", - "tab": "Accuracy", - "BoolQ - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "BoolQ - EM (Robustness)": { - "description": "min=0.875, mean=0.878, max=0.88, sum=2.634 (3)", - "tab": "Robustness", - "score": 0.878 - }, - "BoolQ - EM (Fairness)": { - "description": "min=0.872, mean=0.875, max=0.878, sum=2.625 (3)", - "tab": "Fairness", - "score": 0.875 - }, - "BoolQ - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "BoolQ - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "BoolQ - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "BoolQ - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "BoolQ - # prompt tokens": { - "description": "min=660.073, mean=908.406, max=1242.073, sum=2725.219 (3)", - "tab": "General information", - "score": 908.4063333333334 - }, - "BoolQ - # output tokens": { - "description": "min=1.005, mean=1.007, max=1.01, sum=3.021 (3)", - "tab": "General information", - "score": 1.007 - }, - "BoolQ - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "BoolQ - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.742, - "details": { - "description": "min=0.732, mean=0.742, max=0.748, sum=2.226 (3)", - "tab": "Accuracy", - "NarrativeQA - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "NarrativeQA - F1 (Robustness)": { - "description": "min=0.667, mean=0.672, max=0.68, sum=2.016 (3)", - "tab": "Robustness", - "score": 
0.6719021727640991 - }, - "NarrativeQA - F1 (Fairness)": { - "description": "min=0.64, mean=0.651, max=0.659, sum=1.952 (3)", - "tab": "Fairness", - "score": 0.6506183133514157 - }, - "NarrativeQA - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=1065 (3)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=3504.577, mean=3803.911, max=3972.577, sum=11411.732 (3)", - "tab": "General information", - "score": 3803.910798122066 - }, - "NarrativeQA - # output tokens": { - "description": "min=5.375, mean=6.272, max=7.29, sum=18.817 (3)", - "tab": "General information", - "score": 6.272300469483568 - }, - "NarrativeQA - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NarrativeQA - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=1.333 (2)", - "tab": "Bias", - "score": 0.6666666666666667 - }, - "NarrativeQA - Stereotypes (gender)": { - "description": "min=0.396, mean=0.398, max=0.403, sum=1.194 (3)", - "tab": "Bias", - "score": 0.39814814814814814 - }, - "NarrativeQA - Representation (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=2 (3)", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "NarrativeQA - Representation (gender)": { - "description": "min=0.149, mean=0.159, max=0.181, sum=0.478 (3)", - "tab": "Bias", - "score": 0.15935305534542177 - }, - "NarrativeQA - Toxic fraction": { - "description": "min=0.008, mean=0.011, max=0.014, sum=0.034 (3)", - "tab": "Toxicity", - "score": 0.011267605633802818 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (open-book)", - "source_data": { - "dataset_name": "NaturalQuestions (open-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (open-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "NaturalQuestions (closed-book) - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "NaturalQuestions (open-book) - ECE (10-bin)": { - "description": "No matching runs", - "tab": "Calibration", - "score": null - }, - "NaturalQuestions (closed-book) - F1 (Robustness)": { - "description": "min=0.355, mean=0.363, max=0.368, sum=1.089 (3)", - "tab": "Robustness", - "score": 0.3629707081568259 - }, - "NaturalQuestions (open-book) - F1 (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "NaturalQuestions (closed-book) - F1 (Fairness)": { - "description": "min=0.356, mean=0.362, max=0.367, sum=1.087 (3)", - "tab": "Fairness", - "score": 0.3624320629787478 - }, - "NaturalQuestions (open-book) - F1 (Fairness)": { - "description": 
"No matching runs", - "tab": "Fairness", - "score": null - }, - "NaturalQuestions (closed-book) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NaturalQuestions (open-book) - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=110.254, mean=112.254, max=116.254, sum=336.762 (3)", - "tab": "General information", - "score": 112.254 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=3.166, mean=3.19, max=3.231, sum=9.571 (3)", - "tab": "General information", - "score": 3.1903333333333332 - }, - "NaturalQuestions (closed-book) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "NaturalQuestions (open-book) - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "NaturalQuestions (open-book) - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "NaturalQuestions (open-book) - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "NaturalQuestions (closed-book) - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=2 (3)", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "NaturalQuestions (closed-book) - Stereotypes (gender)": { - "description": "min=0.5, mean=0.5, max=0.5, sum=0.5 (1)", - "tab": "Bias", - "score": 0.5 - }, - "NaturalQuestions (closed-book) - Representation (race)": { - "description": "min=0.245, mean=0.314, max=0.378, sum=0.941 (3)", - "tab": "Bias", - "score": 0.31352905160694455 - }, - "NaturalQuestions (closed-book) - Representation (gender)": { - "description": "min=0.25, mean=0.266, max=0.278, sum=0.797 (3)", - "tab": "Bias", - "score": 0.26566951566951563 - }, - "NaturalQuestions (open-book) - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "NaturalQuestions (open-book) - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "NaturalQuestions (open-book) - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "NaturalQuestions (open-book) - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "NaturalQuestions (closed-book) - Toxic fraction": { - "description": "min=0.001, mean=0.001, 
max=0.001, sum=0.003 (3)", - "tab": "Toxicity", - "score": 0.001 - }, - "NaturalQuestions (open-book) - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "QuAC", - "source_data": { - "dataset_name": "QuAC", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on QuAC", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.473, - "details": { - "description": "min=0.459, mean=0.473, max=0.488, sum=1.419 (3)", - "tab": "Accuracy", - "QuAC - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "QuAC - F1 (Robustness)": { - "description": "min=0.379, mean=0.383, max=0.392, sum=1.15 (3)", - "tab": "Robustness", - "score": 0.38348793103386436 - }, - "QuAC - F1 (Fairness)": { - "description": "min=0.394, mean=0.399, max=0.408, sum=1.196 (3)", - "tab": "Fairness", - "score": 0.39873411995988545 - }, - "QuAC - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "QuAC - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "QuAC - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "QuAC - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "QuAC - # prompt tokens": { - "description": "min=4676.788, mean=5199.788, max=5842.788, sum=15599.364 (3)", - "tab": "General information", - "score": 5199.788 - }, - "QuAC - # output tokens": { - "description": "min=25.906, mean=26.581, max=27.052, sum=79.742 (3)", - "tab": "General information", - "score": 26.580666666666662 - }, - "QuAC - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "QuAC - Stereotypes (race)": { - "description": "min=0.63, mean=0.642, max=0.667, sum=1.926 (3)", - "tab": "Bias", - "score": 0.6419753086419754 - }, - "QuAC - Stereotypes (gender)": { - "description": "min=0.364, mean=0.395, max=0.447, sum=1.186 (3)", - "tab": "Bias", - "score": 0.39526937310090554 - }, - "QuAC - Representation (race)": { - "description": "min=0.286, mean=0.293, max=0.298, sum=0.878 (3)", - "tab": "Bias", - "score": 0.29267512260888473 - }, - "QuAC - Representation (gender)": { - "description": "min=0.221, mean=0.235, max=0.248, sum=0.705 (3)", - "tab": "Bias", - "score": 0.23492413534960777 - }, - "QuAC - Toxic fraction": { - "description": "min=0, mean=0.001, max=0.002, sum=0.003 (3)", - "tab": "Toxicity", - "score": 0.001 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "HellaSwag", - "source_data": { - "dataset_name": "HellaSwag", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on HellaSwag", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": 
{ - "description": "No matching runs", - "tab": "Accuracy", - "HellaSwag - ECE (10-bin)": { - "description": "No matching runs", - "tab": "Calibration", - "score": null - }, - "HellaSwag - EM (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "HellaSwag - EM (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "HellaSwag - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "HellaSwag - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "OpenbookQA - ECE (10-bin)": { - "description": "No matching runs", - "tab": "Calibration", - "score": null - }, - "OpenbookQA - EM (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "OpenbookQA - EM (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "OpenbookQA - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "OpenbookQA - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "TruthfulQA", - "source_data": { - "dataset_name": "TruthfulQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on TruthfulQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { 
- "score": 0.616, - "details": { - "description": "min=0.601, mean=0.616, max=0.63, sum=1.847 (3)", - "tab": "Accuracy", - "TruthfulQA - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "TruthfulQA - EM (Robustness)": { - "description": "min=0.554, mean=0.568, max=0.584, sum=1.705 (3)", - "tab": "Robustness", - "score": 0.5682976554536188 - }, - "TruthfulQA - EM (Fairness)": { - "description": "min=0.529, mean=0.542, max=0.56, sum=1.625 (3)", - "tab": "Fairness", - "score": 0.5417940876656473 - }, - "TruthfulQA - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "TruthfulQA - # eval": { - "description": "min=654, mean=654, max=654, sum=1962 (3)", - "tab": "General information", - "score": 654.0 - }, - "TruthfulQA - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "TruthfulQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "TruthfulQA - # prompt tokens": { - "description": "min=501.121, mean=511.121, max=529.121, sum=1533.362 (3)", - "tab": "General information", - "score": 511.12079510703364 - }, - "TruthfulQA - # output tokens": { - "description": "min=0.908, mean=0.949, max=0.982, sum=2.847 (3)", - "tab": "General information", - "score": 0.9490316004077473 - }, - "TruthfulQA - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MS MARCO (TREC)", - "source_data": { - "dataset_name": "MS MARCO (TREC)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "NDCG@10 on MS MARCO (TREC)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "MS MARCO (regular) - RR@10 (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "MS MARCO (TREC) - NDCG@10 (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "MS MARCO (regular) - RR@10 (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "MS MARCO (TREC) - NDCG@10 (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "MS MARCO (regular) - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "MS MARCO (TREC) - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "MS MARCO (regular) - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # 
output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - }, - "MS MARCO (TREC) - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CNN/DailyMail", - "source_data": { - "dataset_name": "CNN/DailyMail", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on CNN/DailyMail", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.049, - "details": { - "description": "min=0, mean=0.049, max=0.147, sum=0.147 (3)", - "tab": "Accuracy", - "CNN/DailyMail - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "CNN/DailyMail - # eval": { - "description": "min=466, mean=466, max=466, sum=1398 (3)", - "tab": "General information", - "score": 466.0 - }, - "CNN/DailyMail - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "CNN/DailyMail - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "CNN/DailyMail - # prompt tokens": { - "description": "min=1531.586, mean=1549.919, max=1567.586, sum=4649.758 (3)", - 
"tab": "General information", - "score": 1549.9191702432045 - }, - "CNN/DailyMail - # output tokens": { - "description": "min=0, mean=17.63, max=52.891, sum=52.891 (3)", - "tab": "General information", - "score": 17.630185979971387 - }, - "CNN/DailyMail - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "CNN/DailyMail - Stereotypes (race)": { - "description": "min=0.622, mean=0.622, max=0.622, sum=0.622 (1)", - "tab": "Bias", - "score": 0.6219394640447272 - }, - "CNN/DailyMail - Stereotypes (gender)": { - "description": "min=0.421, mean=0.421, max=0.421, sum=0.421 (1)", - "tab": "Bias", - "score": 0.42094867293009713 - }, - "CNN/DailyMail - Representation (race)": { - "description": "min=0.276, mean=0.276, max=0.276, sum=0.276 (1)", - "tab": "Bias", - "score": 0.27642276422764234 - }, - "CNN/DailyMail - Representation (gender)": { - "description": "min=0.114, mean=0.114, max=0.114, sum=0.114 (1)", - "tab": "Bias", - "score": 0.11422708618331054 - }, - "CNN/DailyMail - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "Toxicity", - "score": 0.0 - }, - "CNN/DailyMail - SummaC": { - "description": "1 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - QAFactEval": { - "description": "1 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - BERTScore (F1)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - Coverage": { - "description": "min=0, mean=0.291, max=0.872, sum=0.872 (3)", - "tab": "Summarization metrics", - "score": 0.29078580039209107 - }, - "CNN/DailyMail - Density": { - "description": "min=0, mean=2.35, max=7.049, sum=7.049 (3)", - "tab": "Summarization metrics", - "score": 2.34978873721003 - }, - "CNN/DailyMail - Compression": { - "description": "min=0, mean=3.117, max=9.351, sum=9.351 (3)", - "tab": "Summarization metrics", - "score": 3.116859693035 - }, - "CNN/DailyMail - HumanEval-faithfulness": { - "description": "1 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-relevance": { - "description": "1 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-coherence": { - "description": "1 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "XSUM", - "source_data": { - "dataset_name": "XSUM", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on XSUM", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.149, - "details": { - "description": "min=0.144, mean=0.149, max=0.157, sum=0.447 (3)", - "tab": "Accuracy", - "XSUM - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "XSUM - # eval": { - "description": "min=518, mean=518, max=518, sum=1554 (3)", - "tab": "General information", - "score": 518.0 - }, - "XSUM - # train": { - "description": "min=5, mean=5, max=5, sum=15 
(3)", - "tab": "General information", - "score": 5.0 - }, - "XSUM - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "XSUM - # prompt tokens": { - "description": "min=1456.402, mean=1510.735, max=1539.402, sum=4532.205 (3)", - "tab": "General information", - "score": 1510.734877734878 - }, - "XSUM - # output tokens": { - "description": "min=25.077, mean=25.248, max=25.463, sum=75.745 (3)", - "tab": "General information", - "score": 25.248391248391247 - }, - "XSUM - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "XSUM - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=2.0 (3)", - "tab": "Bias", - "score": 0.6666666666666669 - }, - "XSUM - Stereotypes (gender)": { - "description": "min=0.434, mean=0.438, max=0.444, sum=1.313 (3)", - "tab": "Bias", - "score": 0.43769157088122607 - }, - "XSUM - Representation (race)": { - "description": "min=0.383, mean=0.439, max=0.494, sum=1.318 (3)", - "tab": "Bias", - "score": 0.4393992219104699 - }, - "XSUM - Representation (gender)": { - "description": "min=0.202, mean=0.205, max=0.208, sum=0.616 (3)", - "tab": "Bias", - "score": 0.2054618848004968 - }, - "XSUM - Toxic fraction": { - "description": "min=0.002, mean=0.002, max=0.002, sum=0.006 (3)", - "tab": "Toxicity", - "score": 0.0019305019305019308 - }, - "XSUM - SummaC": { - "description": "1 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - QAFactEval": { - "description": "1 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - BERTScore (F1)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - Coverage": { - "description": "min=0.77, mean=0.775, max=0.778, sum=2.324 (3)", - "tab": "Summarization metrics", - "score": 0.7746217499327193 - }, - "XSUM - Density": { - "description": "min=2.38, mean=2.466, max=2.546, sum=7.399 (3)", - "tab": "Summarization metrics", - "score": 2.4662768763204443 - }, - "XSUM - Compression": { - "description": "min=14.242, mean=14.252, max=14.266, sum=42.756 (3)", - "tab": "Summarization metrics", - "score": 14.25194669426599 - }, - "XSUM - HumanEval-faithfulness": { - "description": "1 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-relevance": { - "description": "1 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-coherence": { - "description": "1 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "IMDB", - "source_data": { - "dataset_name": "IMDB", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on IMDB", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.935, - "details": { - "description": "min=0.928, mean=0.935, max=0.939, sum=2.806 (3)", - "tab": "Accuracy", - "IMDB - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "IMDB - EM 
(Robustness)": { - "description": "min=0.896, mean=0.904, max=0.909, sum=2.713 (3)", - "tab": "Robustness", - "score": 0.9043333333333333 - }, - "IMDB - EM (Fairness)": { - "description": "min=0.909, mean=0.918, max=0.923, sum=2.754 (3)", - "tab": "Fairness", - "score": 0.918 - }, - "IMDB - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "IMDB - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "IMDB - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "IMDB - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "IMDB - # prompt tokens": { - "description": "min=1282.797, mean=1897.464, max=2572.797, sum=5692.391 (3)", - "tab": "General information", - "score": 1897.4636666666665 - }, - "IMDB - # output tokens": { - "description": "min=1.928, mean=1.939, max=1.95, sum=5.816 (3)", - "tab": "General information", - "score": 1.9386666666666665 - }, - "IMDB - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "IMDB - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CivilComments", - "source_data": { - "dataset_name": "CivilComments", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on CivilComments", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.008, - "details": { - "description": "min=0, mean=0.008, max=0.344, sum=0.406 (54)", - "tab": "Accuracy", - "CivilComments - ECE (10-bin)": { - "description": "9 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "CivilComments - EM (Robustness)": { - "description": "min=0, mean=0.006, max=0.319, sum=0.347 (54)", - "tab": "Robustness", - "score": 0.006429753618269135 - }, - "CivilComments - EM (Fairness)": { - "description": "min=0, mean=0.006, max=0.288, sum=0.338 (54)", - "tab": "Fairness", - "score": 0.006254555939232581 - }, - "CivilComments - Denoised inference time (s)": { - "description": "9 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "CivilComments - # eval": { - "description": "min=74, mean=371.556, max=683, sum=20064 (54)", - "tab": "General information", - "score": 371.55555555555554 - }, - "CivilComments - # train": { - "description": "min=5, mean=5, max=5, sum=270 (54)", - "tab": "General information", - "score": 5.0 - }, - "CivilComments - truncated": { - 
"description": "min=0, mean=0, max=0, sum=0 (54)", - "tab": "General information", - "score": 0.0 - }, - "CivilComments - # prompt tokens": { - "description": "min=356.537, mean=722.635, max=1267.519, sum=39022.317 (54)", - "tab": "General information", - "score": 722.6354931173206 - }, - "CivilComments - # output tokens": { - "description": "min=0, mean=0.011, max=0.504, sum=0.604 (54)", - "tab": "General information", - "score": 0.011187107057192404 - }, - "CivilComments - # trials": { - "description": "min=3, mean=3, max=3, sum=162 (54)", - "tab": "General information", - "score": 3.0 - }, - "CivilComments - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Toxic fraction": { - "description": "9 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "RAFT", - "source_data": { - "dataset_name": "RAFT", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on RAFT", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.701, - "details": { - "description": "min=0, mean=0.701, max=0.975, sum=23.125 (33)", - "tab": "Accuracy", - "RAFT - ECE (10-bin)": { - "description": "11 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "RAFT - EM (Robustness)": { - "description": "min=0, mean=0.677, max=0.975, sum=22.35 (33)", - "tab": "Robustness", - "score": 0.6772727272727272 - }, - "RAFT - EM (Fairness)": { - "description": "min=0, mean=0.672, max=0.975, sum=22.175 (33)", - "tab": "Fairness", - "score": 0.6719696969696969 - }, - "RAFT - Denoised inference time (s)": { - "description": "11 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "RAFT - # eval": { - "description": "min=40, mean=40, max=40, sum=1320 (33)", - "tab": "General information", - "score": 40.0 - }, - "RAFT - # train": { - "description": "min=5, mean=5, max=5, sum=165 (33)", - "tab": "General information", - "score": 5.0 - }, - "RAFT - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (33)", - "tab": "General information", - "score": 0.0 - }, - "RAFT - # prompt tokens": { - "description": "min=257.35, mean=1279.572, max=6599.65, sum=42225.875 (33)", - "tab": "General information", - "score": 1279.5719696969697 - }, - "RAFT - # output tokens": { - "description": "min=0, mean=3.07, max=6.825, sum=101.3 (33)", - "tab": "General information", - "score": 3.06969696969697 - }, - "RAFT - # trials": { - "description": "min=3, mean=3, max=3, sum=99 (33)", - "tab": "General information", - "score": 3.0 - }, - "RAFT - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (gender)": { - 
"description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Toxic fraction": { - "description": "11 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_classic/google/T5-11B/509360bc-86f5-49dc-899c-2899d8b6bc6c.json b/data/helm_classic/google/T5-11B/509360bc-86f5-49dc-899c-2899d8b6bc6c.json deleted file mode 100644 index 2a710defd..000000000 --- a/data/helm_classic/google/T5-11B/509360bc-86f5-49dc-899c-2899d8b6bc6c.json +++ /dev/null @@ -1,1613 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_classic/google_T5-11B/1770834891.1472661", - "retrieved_timestamp": "1770834891.1472661", - "source_metadata": { - "source_name": "helm_classic", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "T5 11B", - "id": "google/T5-11B", - "developer": "google", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_classic", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperform on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.131, - "details": { - "tab": "Accuracy", - "Mean win rate - Calibration": { - "description": null, - "tab": "Calibration", - "score": 0.43469010175763184 - }, - "Mean win rate - Robustness": { - "description": null, - "tab": "Robustness", - "score": 0.16445221445221445 - }, - "Mean win rate - Fairness": { - "description": null, - "tab": "Fairness", - "score": 0.14974358974358976 - }, - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.4340277777777778 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - }, - "Mean win rate - Bias": { - "description": null, - "tab": "Bias", - "score": 0.4887674914954327 - }, - "Mean win rate - Toxicity": { - "description": null, - "tab": "Toxicity", - "score": 0.5758109174775842 - }, - "Mean win rate - Summarization metrics": { - "description": null, - "tab": "Summarization metrics", - "score": 0.1118421052631579 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.29, - "details": { - "description": "min=0.211, mean=0.29, max=0.4, sum=4.354 (15)", - "tab": "Accuracy", - "MMLU - ECE (10-bin)": { - "description": "min=0.1, mean=0.151, max=0.242, sum=2.271 (15)", - "tab": "Calibration", - "score": 0.1514046561108303 - }, - "MMLU - EM (Robustness)": { - "description": "min=0.19, mean=0.258, max=0.38, sum=3.866 (15)", - "tab": "Robustness", - "score": 0.25776608187134503 - }, - "MMLU - EM (Fairness)": { - "description": "min=0.167, 
mean=0.235, max=0.33, sum=3.525 (15)", - "tab": "Fairness", - "score": 0.23500584795321638 - }, - "MMLU - Denoised inference time (s)": { - "description": "min=0.173, mean=0.218, max=0.232, sum=3.277 (15)", - "tab": "Efficiency", - "score": 0.21847905223539232 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=1542 (15)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=2.482, mean=4.326, max=5, sum=64.896 (15)", - "tab": "General information", - "score": 4.326397660818714 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (15)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=382.49, mean=420.562, max=467.75, sum=6308.426 (15)", - "tab": "General information", - "score": 420.5617309941521 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=15 (15)", - "tab": "General information", - "score": 1.0 - }, - "MMLU - # trials": { - "description": "min=3, mean=3, max=3, sum=45 (15)", - "tab": "General information", - "score": 3.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "BoolQ", - "source_data": { - "dataset_name": "BoolQ", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on BoolQ", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.761, - "details": { - "description": "min=0.732, mean=0.761, max=0.803, sum=2.283 (3)", - "tab": "Accuracy", - "BoolQ - ECE (10-bin)": { - "description": "min=0.348, mean=0.433, max=0.512, sum=1.298 (3)", - "tab": "Calibration", - "score": 0.43269382093398495 - }, - "BoolQ - EM (Robustness)": { - "description": "min=0.624, mean=0.65, max=0.688, sum=1.951 (3)", - "tab": "Robustness", - "score": 0.6503333333333333 - }, - "BoolQ - EM (Fairness)": { - "description": "min=0.697, mean=0.723, max=0.766, sum=2.168 (3)", - "tab": "Fairness", - "score": 0.7226666666666667 - }, - "BoolQ - Denoised inference time (s)": { - "description": "min=0.27, mean=0.271, max=0.272, sum=0.814 (3)", - "tab": "Efficiency", - "score": 0.27128291567197677 - }, - "BoolQ - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "BoolQ - # train": { - "description": "min=0.969, mean=1.588, max=2.006, sum=4.765 (3)", - "tab": "General information", - "score": 1.5883333333333332 - }, - "BoolQ - truncated": { - "description": "min=0.004, mean=0.004, max=0.004, sum=0.012 (3)", - "tab": "General information", - "score": 0.004 - }, - "BoolQ - # prompt tokens": { - "description": "min=386.367, mean=401.944, max=422.649, sum=1205.833 (3)", - "tab": "General information", - "score": 401.94433333333336 - }, - "BoolQ - # output tokens": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "BoolQ - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "BoolQ - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=2 (3)", - 
"tab": "Bias", - "score": 0.6666666666666666 - }, - "BoolQ - Representation (gender)": { - "description": "min=0.125, mean=0.375, max=0.5, sum=1.125 (3)", - "tab": "Bias", - "score": 0.375 - }, - "BoolQ - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.086, - "details": { - "description": "min=0.086, mean=0.086, max=0.086, sum=0.257 (3)", - "tab": "Accuracy", - "NarrativeQA - ECE (10-bin)": { - "description": "min=0.0, mean=0.0, max=0.0, sum=0.0 (3)", - "tab": "Calibration", - "score": 8.06672937578031e-11 - }, - "NarrativeQA - F1 (Robustness)": { - "description": "min=0.045, mean=0.045, max=0.045, sum=0.136 (3)", - "tab": "Robustness", - "score": 0.04518225074755041 - }, - "NarrativeQA - F1 (Fairness)": { - "description": "min=0.05, mean=0.05, max=0.05, sum=0.149 (3)", - "tab": "Fairness", - "score": 0.0497772820026842 - }, - "NarrativeQA - Denoised inference time (s)": { - "description": "min=1.054, mean=1.054, max=1.054, sum=3.163 (3)", - "tab": "Efficiency", - "score": 1.0544504576125933 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=1065 (3)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - truncated": { - "description": "min=0.825, mean=0.825, max=0.825, sum=2.476 (3)", - "tab": "General information", - "score": 0.8253521126760562 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=492.141, mean=492.141, max=492.141, sum=1476.423 (3)", - "tab": "General information", - "score": 492.14084507042253 - }, - "NarrativeQA - # output tokens": { - "description": "min=100, mean=100, max=100, sum=300 (3)", - "tab": "General information", - "score": 100.0 - }, - "NarrativeQA - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NarrativeQA - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=2 (3)", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "NarrativeQA - Stereotypes (gender)": { - "description": "min=0.408, mean=0.408, max=0.408, sum=1.225 (3)", - "tab": "Bias", - "score": 0.4081829027907459 - }, - "NarrativeQA - Representation (race)": { - "description": "min=0.367, mean=0.367, max=0.367, sum=1.1 (3)", - "tab": "Bias", - "score": 0.36666666666666664 - }, - "NarrativeQA - Representation (gender)": { - "description": "min=0.156, mean=0.156, max=0.156, sum=0.469 (3)", - "tab": "Bias", - "score": 0.15620542082738947 - }, - "NarrativeQA - Toxic fraction": { - "description": "min=0.011, mean=0.011, max=0.011, sum=0.034 (3)", - "tab": "Toxicity", - "score": 0.011267605633802818 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (open-book)", - "source_data": { - "dataset_name": "NaturalQuestions (open-book)", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (open-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.477, - "details": { - "description": "min=0.278, mean=0.477, max=0.588, sum=1.432 (3)", - "tab": "Accuracy", - "NaturalQuestions (closed-book) - ECE (10-bin)": { - "description": "min=0.07, mean=0.076, max=0.082, sum=0.228 (3)", - "tab": "Calibration", - "score": 0.07599999619350188 - }, - "NaturalQuestions (open-book) - ECE (10-bin)": { - "description": "min=0.051, mean=0.239, max=0.356, sum=0.717 (3)", - "tab": "Calibration", - "score": 0.23900003883193166 - }, - "NaturalQuestions (closed-book) - F1 (Robustness)": { - "description": "min=0.146, mean=0.153, max=0.159, sum=0.458 (3)", - "tab": "Robustness", - "score": 0.15251804391476487 - }, - "NaturalQuestions (open-book) - F1 (Robustness)": { - "description": "min=0.047, mean=0.071, max=0.107, sum=0.213 (3)", - "tab": "Robustness", - "score": 0.0710016541484974 - }, - "NaturalQuestions (closed-book) - F1 (Fairness)": { - "description": "min=0.152, mean=0.159, max=0.164, sum=0.476 (3)", - "tab": "Fairness", - "score": 0.15857963279707157 - }, - "NaturalQuestions (open-book) - F1 (Fairness)": { - "description": "min=0.227, mean=0.424, max=0.532, sum=1.271 (3)", - "tab": "Fairness", - "score": 0.42376820534695847 - }, - "NaturalQuestions (closed-book) - Denoised inference time (s)": { - "description": "min=2.617, mean=2.856, max=3.211, sum=8.569 (3)", - "tab": "Efficiency", - "score": 2.856322434252687 - }, - "NaturalQuestions (open-book) - Denoised inference time (s)": { - "description": "min=6.926, mean=12.846, max=24.675, sum=38.539 (3)", - "tab": "Efficiency", - "score": 12.84636455836454 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=109.556, mean=113.556, max=118.556, sum=340.668 (3)", - "tab": "General information", - "score": 113.556 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=300, mean=300, max=300, sum=900 (3)", - "tab": "General information", - "score": 300.0 - }, - "NaturalQuestions (closed-book) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=0.096, mean=0.924, max=1.792, sum=2.771 (3)", - "tab": "General information", - "score": 0.9236666666666666 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.094, mean=0.349, max=0.839, sum=1.048 (3)", - "tab": "General information", - "score": 0.34933333333333333 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=233.452, mean=301.907, max=339.767, sum=905.721 
(3)", - "tab": "General information", - "score": 301.907 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=300, mean=300, max=300, sum=900 (3)", - "tab": "General information", - "score": 300.0 - }, - "NaturalQuestions (open-book) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NaturalQuestions (closed-book) - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NaturalQuestions (closed-book) - Stereotypes (gender)": { - "description": "min=0.5, mean=0.5, max=0.5, sum=1.5 (3)", - "tab": "Bias", - "score": 0.5 - }, - "NaturalQuestions (closed-book) - Representation (race)": { - "description": "min=0.529, mean=0.533, max=0.535, sum=1.6 (3)", - "tab": "Bias", - "score": 0.5332530194915516 - }, - "NaturalQuestions (closed-book) - Representation (gender)": { - "description": "min=0.071, mean=0.103, max=0.125, sum=0.308 (3)", - "tab": "Bias", - "score": 0.10251322751322754 - }, - "NaturalQuestions (open-book) - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=2 (3)", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "NaturalQuestions (open-book) - Stereotypes (gender)": { - "description": "min=0.389, mean=0.417, max=0.472, sum=1.25 (3)", - "tab": "Bias", - "score": 0.4166666666666666 - }, - "NaturalQuestions (open-book) - Representation (race)": { - "description": "min=0.483, mean=0.516, max=0.552, sum=1.549 (3)", - "tab": "Bias", - "score": 0.5163891020108681 - }, - "NaturalQuestions (open-book) - Representation (gender)": { - "description": "min=0.218, mean=0.243, max=0.26, sum=0.728 (3)", - "tab": "Bias", - "score": 0.24276995305164317 - }, - "NaturalQuestions (closed-book) - Toxic fraction": { - "description": "min=0.001, mean=0.001, max=0.001, sum=0.003 (3)", - "tab": "Toxicity", - "score": 0.001 - }, - "NaturalQuestions (open-book) - Toxic fraction": { - "description": "min=0, mean=0.001, max=0.002, sum=0.003 (3)", - "tab": "Toxicity", - "score": 0.001 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "QuAC", - "source_data": { - "dataset_name": "QuAC", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on QuAC", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.116, - "details": { - "description": "min=0.116, mean=0.116, max=0.116, sum=0.348 (3)", - "tab": "Accuracy", - "QuAC - ECE (10-bin)": { - "description": "min=0.0, mean=0.0, max=0.0, sum=0.0 (3)", - "tab": "Calibration", - "score": 1.908717030577995e-9 - }, - "QuAC - F1 (Robustness)": { - "description": "min=0.064, mean=0.064, max=0.064, sum=0.191 (3)", - "tab": "Robustness", - "score": 0.06378325242260692 - }, - "QuAC - F1 (Fairness)": { - "description": "min=0.074, mean=0.074, max=0.074, sum=0.221 (3)", - "tab": "Fairness", - "score": 0.07376443691909672 - }, - "QuAC - Denoised inference time (s)": { - "description": "min=1.032, mean=1.032, max=1.032, sum=3.097 (3)", - "tab": "Efficiency", - "score": 1.0323945961168868 - }, - "QuAC - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "QuAC - # train": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - 
"score": 0.0 - }, - "QuAC - truncated": { - "description": "min=0.999, mean=0.999, max=0.999, sum=2.997 (3)", - "tab": "General information", - "score": 0.999 - }, - "QuAC - # prompt tokens": { - "description": "min=510.923, mean=510.923, max=510.923, sum=1532.769 (3)", - "tab": "General information", - "score": 510.923 - }, - "QuAC - # output tokens": { - "description": "min=100, mean=100, max=100, sum=300 (3)", - "tab": "General information", - "score": 100.0 - }, - "QuAC - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "QuAC - Stereotypes (race)": { - "description": "min=0.65, mean=0.65, max=0.65, sum=1.949 (3)", - "tab": "Bias", - "score": 0.6495726495726497 - }, - "QuAC - Stereotypes (gender)": { - "description": "min=0.44, mean=0.44, max=0.44, sum=1.32 (3)", - "tab": "Bias", - "score": 0.4400900674211062 - }, - "QuAC - Representation (race)": { - "description": "min=0.397, mean=0.397, max=0.397, sum=1.192 (3)", - "tab": "Bias", - "score": 0.39717891610987377 - }, - "QuAC - Representation (gender)": { - "description": "min=0.257, mean=0.257, max=0.257, sum=0.771 (3)", - "tab": "Bias", - "score": 0.25702629193109705 - }, - "QuAC - Toxic fraction": { - "description": "min=0.002, mean=0.002, max=0.002, sum=0.006 (3)", - "tab": "Toxicity", - "score": 0.002 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "HellaSwag", - "source_data": { - "dataset_name": "HellaSwag", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on HellaSwag", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "HellaSwag - ECE (10-bin)": { - "description": "No matching runs", - "tab": "Calibration", - "score": null - }, - "HellaSwag - EM (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "HellaSwag - EM (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "HellaSwag - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "HellaSwag - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "OpenbookQA - ECE (10-bin)": { - "description": "No matching runs", - "tab": "Calibration", - "score": null - }, - "OpenbookQA - EM (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "OpenbookQA - EM (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "OpenbookQA - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "OpenbookQA - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "TruthfulQA", - "source_data": { - "dataset_name": "TruthfulQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on TruthfulQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.133, - "details": { - "description": "min=0.104, mean=0.133, max=0.15, sum=0.532 (4)", - "tab": "Accuracy", - "TruthfulQA - ECE (10-bin)": { - "description": "min=0.109, mean=0.143, max=0.195, sum=0.574 (4)", - "tab": "Calibration", - "score": 0.1434693835940009 - }, - "TruthfulQA - EM (Robustness)": { - "description": "min=0.09, mean=0.122, max=0.148, sum=0.489 (4)", - "tab": "Robustness", - "score": 0.12232415902140673 - }, - "TruthfulQA - EM (Fairness)": { - "description": "min=0.058, mean=0.101, max=0.136, sum=0.405 (4)", - "tab": "Fairness", - "score": 0.10129969418960244 - }, - "TruthfulQA - Denoised inference time (s)": { - "description": "min=0.174, mean=0.21, max=0.249, sum=0.838 (4)", - "tab": "Efficiency", - "score": 0.2095953345265857 - }, - "TruthfulQA - # eval": { - "description": "min=654, mean=654, max=654, sum=2616 (4)", - "tab": "General information", - "score": 654.0 - }, - "TruthfulQA - # train": { - "description": "min=0, mean=3.547, max=4.869, sum=14.19 (4)", - "tab": "General information", - "score": 3.547400611620795 - }, - "TruthfulQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (4)", - "tab": "General information", - "score": 0.0 - }, - "TruthfulQA - # prompt tokens": { - "description": "min=85.896, mean=371.92, max=471.52, sum=1487.679 (4)", - "tab": "General information", - "score": 371.9197247706422 - }, - "TruthfulQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=4 (4)", - "tab": "General information", - "score": 1.0 - }, - "TruthfulQA - # trials": { - "description": "min=1, mean=2.5, max=3, sum=10 (4)", - "tab": "General information", - "score": 2.5 - } - } 
- }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MS MARCO (TREC)", - "source_data": { - "dataset_name": "MS MARCO (TREC)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "NDCG@10 on MS MARCO (TREC)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "MS MARCO (regular) - RR@10 (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "MS MARCO (TREC) - NDCG@10 (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "MS MARCO (regular) - RR@10 (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "MS MARCO (TREC) - NDCG@10 (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "MS MARCO (regular) - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "MS MARCO (TREC) - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "MS MARCO (regular) - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - 
Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - }, - "MS MARCO (TREC) - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CNN/DailyMail", - "source_data": { - "dataset_name": "CNN/DailyMail", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on CNN/DailyMail", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.043, - "details": { - "description": "min=0.043, mean=0.043, max=0.043, sum=0.257 (6)", - "tab": "Accuracy", - "CNN/DailyMail - Denoised inference time (s)": { - "description": "min=1.653, mean=1.654, max=1.655, sum=9.926 (6)", - "tab": "Efficiency", - "score": 1.6543884711070522 - }, - "CNN/DailyMail - # eval": { - "description": "min=466, mean=466, max=466, sum=2796 (6)", - "tab": "General information", - "score": 466.0 - }, - "CNN/DailyMail - # train": { - "description": "min=0.062, mean=0.064, max=0.067, sum=0.382 (6)", - "tab": "General information", - "score": 0.06366237482117311 - }, - "CNN/DailyMail - truncated": { - "description": "min=0.929, mean=0.932, max=0.933, sum=5.592 (6)", - "tab": "General information", - "score": 0.9320457796852647 - }, - "CNN/DailyMail - # prompt tokens": { - "description": "min=500.412, mean=500.553, max=500.835, sum=3003.318 (6)", - "tab": "General information", - "score": 500.5529327610873 - }, - "CNN/DailyMail - # output tokens": { - "description": "min=128, mean=128, max=128, sum=768 (6)", - "tab": "General information", - "score": 128.0 - }, - "CNN/DailyMail - # trials": { - "description": "min=3, mean=3, max=3, sum=18 (6)", - "tab": "General information", - "score": 3.0 - }, - "CNN/DailyMail - Stereotypes (race)": { - "description": "min=0.632, mean=0.632, max=0.632, sum=3.789 (6)", - "tab": "Bias", - "score": 0.631578947368421 - }, - "CNN/DailyMail - Stereotypes (gender)": { - "description": "min=0.452, mean=0.452, max=0.452, sum=2.709 (6)", - "tab": "Bias", - "score": 0.4515726043503821 - }, - "CNN/DailyMail - Representation (race)": { - "description": "min=0.264, mean=0.264, max=0.264, sum=1.581 (6)", - "tab": "Bias", - "score": 0.26356589147286824 - }, - "CNN/DailyMail - Representation (gender)": { - "description": "min=0.119, mean=0.119, max=0.12, sum=0.713 (6)", - "tab": "Bias", - "score": 0.11890102842483792 - }, - "CNN/DailyMail - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (6)", - "tab": "Toxicity", - "score": 0.0 - }, - "CNN/DailyMail - SummaC": { - "description": "min=-0.125, mean=-0.122, max=-0.117, sum=-0.365 (3)", - "tab": "Summarization metrics", - "score": -0.12151602946968616 - }, - "CNN/DailyMail - QAFactEval": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - BERTScore (F1)": { - "description": "min=-0.173, mean=-0.17, 
max=-0.165, sum=-0.509 (3)", - "tab": "Summarization metrics", - "score": -0.16977369097758946 - }, - "CNN/DailyMail - Coverage": { - "description": "min=0.55, mean=0.555, max=0.56, sum=3.329 (6)", - "tab": "Summarization metrics", - "score": 0.5547542182286073 - }, - "CNN/DailyMail - Density": { - "description": "min=2.69, mean=2.698, max=2.706, sum=16.19 (6)", - "tab": "Summarization metrics", - "score": 2.698337926712314 - }, - "CNN/DailyMail - Compression": { - "description": "min=19.085, mean=19.248, max=19.44, sum=115.49 (6)", - "tab": "Summarization metrics", - "score": 19.248383205041776 - }, - "CNN/DailyMail - HumanEval-faithfulness": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-relevance": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-coherence": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "XSUM", - "source_data": { - "dataset_name": "XSUM", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on XSUM", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.015, - "details": { - "description": "min=0.008, mean=0.015, max=0.018, sum=0.087 (6)", - "tab": "Accuracy", - "XSUM - Denoised inference time (s)": { - "description": "min=1.096, mean=1.159, max=1.283, sum=6.953 (6)", - "tab": "Efficiency", - "score": 1.15883249730996 - }, - "XSUM - # eval": { - "description": "min=518, mean=518, max=518, sum=3108 (6)", - "tab": "General information", - "score": 518.0 - }, - "XSUM - # train": { - "description": "min=0.239, mean=0.3, max=0.373, sum=1.799 (6)", - "tab": "General information", - "score": 0.29987129987129985 - }, - "XSUM - truncated": { - "description": "min=0.602, mean=0.671, max=0.73, sum=4.023 (6)", - "tab": "General information", - "score": 0.6705276705276706 - }, - "XSUM - # prompt tokens": { - "description": "min=432.851, mean=436.826, max=442.064, sum=2620.958 (6)", - "tab": "General information", - "score": 436.8262548262548 - }, - "XSUM - # output tokens": { - "description": "min=64, mean=64, max=64, sum=384 (6)", - "tab": "General information", - "score": 64.0 - }, - "XSUM - # trials": { - "description": "min=3, mean=3, max=3, sum=18 (6)", - "tab": "General information", - "score": 3.0 - }, - "XSUM - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=2.667 (4)", - "tab": "Bias", - "score": 0.6666666666666667 - }, - "XSUM - Stereotypes (gender)": { - "description": "min=0.5, mean=0.5, max=0.5, sum=3 (6)", - "tab": "Bias", - "score": 0.5 - }, - "XSUM - Representation (race)": { - "description": "min=0.333, mean=0.358, max=0.394, sum=2.15 (6)", - "tab": "Bias", - "score": 0.3582634859230604 - }, - "XSUM - Representation (gender)": { - "description": "min=0.214, mean=0.222, max=0.231, sum=1.332 (6)", - "tab": "Bias", - "score": 0.2219358310118288 - }, - "XSUM - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (6)", - "tab": "Toxicity", - "score": 0.0 - }, - "XSUM - SummaC": { - "description": "min=-0.267, mean=-0.258, max=-0.244, 
sum=-0.775 (3)", - "tab": "Summarization metrics", - "score": -0.2584302846171323 - }, - "XSUM - QAFactEval": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - BERTScore (F1)": { - "description": "min=-0.379, mean=-0.315, max=-0.276, sum=-0.944 (3)", - "tab": "Summarization metrics", - "score": -0.3147063674770794 - }, - "XSUM - Coverage": { - "description": "min=0.324, mean=0.355, max=0.372, sum=2.133 (6)", - "tab": "Summarization metrics", - "score": 0.3554524422801694 - }, - "XSUM - Density": { - "description": "min=0.763, mean=0.831, max=0.866, sum=4.987 (6)", - "tab": "Summarization metrics", - "score": 0.831154946558878 - }, - "XSUM - Compression": { - "description": "min=16.29, mean=16.544, max=16.714, sum=99.261 (6)", - "tab": "Summarization metrics", - "score": 16.543527805806836 - }, - "XSUM - HumanEval-faithfulness": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-relevance": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-coherence": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "IMDB", - "source_data": { - "dataset_name": "IMDB", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on IMDB", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.379, - "details": { - "description": "min=0.248, mean=0.379, max=0.568, sum=1.137 (3)", - "tab": "Accuracy", - "IMDB - ECE (10-bin)": { - "description": "min=0.108, mean=0.236, max=0.374, sum=0.707 (3)", - "tab": "Calibration", - "score": 0.23573461605966659 - }, - "IMDB - EM (Robustness)": { - "description": "min=0.17, mean=0.304, max=0.51, sum=0.911 (3)", - "tab": "Robustness", - "score": 0.3036666666666667 - }, - "IMDB - EM (Fairness)": { - "description": "min=0.162, mean=0.303, max=0.502, sum=0.91 (3)", - "tab": "Fairness", - "score": 0.30333333333333334 - }, - "IMDB - Denoised inference time (s)": { - "description": "min=0.276, mean=0.278, max=0.28, sum=0.834 (3)", - "tab": "Efficiency", - "score": 0.27797461745258367 - }, - "IMDB - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "IMDB - # train": { - "description": "min=0.33, mean=0.466, max=0.701, sum=1.397 (3)", - "tab": "General information", - "score": 0.4656666666666666 - }, - "IMDB - truncated": { - "description": "min=0.172, mean=0.173, max=0.173, sum=0.518 (3)", - "tab": "General information", - "score": 0.17266666666666666 - }, - "IMDB - # prompt tokens": { - "description": "min=391.442, mean=408.425, max=434.668, sum=1225.274 (3)", - "tab": "General information", - "score": 408.4246666666666 - }, - "IMDB - # output tokens": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "IMDB - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "IMDB - Stereotypes (race)": { - "description": "1 matching runs, but no matching 
metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CivilComments", - "source_data": { - "dataset_name": "CivilComments", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on CivilComments", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.509, - "details": { - "description": "min=0, mean=0.509, max=0.998, sum=27.462 (54)", - "tab": "Accuracy", - "CivilComments - ECE (10-bin)": { - "description": "min=0.108, mean=0.38, max=0.553, sum=20.519 (54)", - "tab": "Calibration", - "score": 0.3799801119037254 - }, - "CivilComments - EM (Robustness)": { - "description": "min=0, mean=0.392, max=0.991, sum=21.175 (54)", - "tab": "Robustness", - "score": 0.39212772273586344 - }, - "CivilComments - EM (Fairness)": { - "description": "min=0, mean=0.329, max=0.991, sum=17.759 (54)", - "tab": "Fairness", - "score": 0.32887358622117774 - }, - "CivilComments - Denoised inference time (s)": { - "description": "min=0.269, mean=0.27, max=0.273, sum=14.596 (54)", - "tab": "Efficiency", - "score": 0.27030228534077655 - }, - "CivilComments - # eval": { - "description": "min=74, mean=371.556, max=683, sum=20064 (54)", - "tab": "General information", - "score": 371.55555555555554 - }, - "CivilComments - # train": { - "description": "min=1.019, mean=2.636, max=4.881, sum=142.352 (54)", - "tab": "General information", - "score": 2.6361556323380086 - }, - "CivilComments - truncated": { - "description": "min=0, mean=0.002, max=0.022, sum=0.094 (54)", - "tab": "General information", - "score": 0.0017482982997674094 - }, - "CivilComments - # prompt tokens": { - "description": "min=331.768, mean=416.791, max=477.628, sum=22506.741 (54)", - "tab": "General information", - "score": 416.79149386044713 - }, - "CivilComments - # output tokens": { - "description": "min=5, mean=5, max=5, sum=270 (54)", - "tab": "General information", - "score": 5.0 - }, - "CivilComments - # trials": { - "description": "min=3, mean=3, max=3, sum=162 (54)", - "tab": "General information", - "score": 3.0 - }, - "CivilComments - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (54)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "RAFT", - "source_data": { - 
"dataset_name": "RAFT", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on RAFT", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.37, - "details": { - "description": "min=0, mean=0.37, max=0.925, sum=12.2 (33)", - "tab": "Accuracy", - "RAFT - ECE (10-bin)": { - "description": "min=0.0, mean=0.367, max=0.925, sum=12.1 (33)", - "tab": "Calibration", - "score": 0.36667176546312147 - }, - "RAFT - EM (Robustness)": { - "description": "min=0, mean=0.331, max=0.875, sum=10.925 (33)", - "tab": "Robustness", - "score": 0.33106060606060606 - }, - "RAFT - EM (Fairness)": { - "description": "min=0, mean=0.351, max=0.85, sum=11.575 (33)", - "tab": "Fairness", - "score": 0.3507575757575757 - }, - "RAFT - Denoised inference time (s)": { - "description": "min=0.411, mean=0.448, max=0.835, sum=14.799 (33)", - "tab": "Efficiency", - "score": 0.4484652494441787 - }, - "RAFT - # eval": { - "description": "min=40, mean=40, max=40, sum=1320 (33)", - "tab": "General information", - "score": 40.0 - }, - "RAFT - # train": { - "description": "min=0, mean=2.433, max=5, sum=80.3 (33)", - "tab": "General information", - "score": 2.433333333333333 - }, - "RAFT - truncated": { - "description": "min=0, mean=0.394, max=1, sum=13 (33)", - "tab": "General information", - "score": 0.3939393939393939 - }, - "RAFT - # prompt tokens": { - "description": "min=263.4, mean=420.742, max=511, sum=13884.475 (33)", - "tab": "General information", - "score": 420.7416666666667 - }, - "RAFT - # output tokens": { - "description": "min=30, mean=30, max=30, sum=990 (33)", - "tab": "General information", - "score": 30.0 - }, - "RAFT - # trials": { - "description": "min=3, mean=3, max=3, sum=99 (33)", - "tab": "General information", - "score": 3.0 - }, - "RAFT - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (gender)": { - "description": "min=0.5, mean=0.5, max=0.5, sum=1.5 (3)", - "tab": "Bias", - "score": 0.5 - }, - "RAFT - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (33)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_classic/google/UL2-20B/8f54f091-46d0-4a9a-9b22-a97a7e3972c0.json b/data/helm_classic/google/UL2-20B/8f54f091-46d0-4a9a-9b22-a97a7e3972c0.json deleted file mode 100644 index bb571aece..000000000 --- a/data/helm_classic/google/UL2-20B/8f54f091-46d0-4a9a-9b22-a97a7e3972c0.json +++ /dev/null @@ -1,1613 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_classic/google_UL2-20B/1770834891.1472661", - "retrieved_timestamp": "1770834891.1472661", - "source_metadata": { - "source_name": "helm_classic", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "UL2 20B", - "id": "google/UL2-20B", - "developer": "google", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_classic", - "source_type": 
"url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperform on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.167, - "details": { - "tab": "Accuracy", - "Mean win rate - Calibration": { - "description": null, - "tab": "Calibration", - "score": 0.464477335800185 - }, - "Mean win rate - Robustness": { - "description": null, - "tab": "Robustness", - "score": 0.2572027972027972 - }, - "Mean win rate - Fairness": { - "description": null, - "tab": "Fairness", - "score": 0.1858974358974359 - }, - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.5056944444444444 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - }, - "Mean win rate - Bias": { - "description": null, - "tab": "Bias", - "score": 0.5601766236691538 - }, - "Mean win rate - Toxicity": { - "description": null, - "tab": "Toxicity", - "score": 0.2902378485711819 - }, - "Mean win rate - Summarization metrics": { - "description": null, - "tab": "Summarization metrics", - "score": 0.11842105263157894 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.291, - "details": { - "description": "min=0.2, mean=0.291, max=0.39, sum=4.368 (15)", - "tab": "Accuracy", - "MMLU - ECE (10-bin)": { - "description": "min=0.084, mean=0.134, max=0.202, sum=2.004 (15)", - "tab": "Calibration", - "score": 0.13362255376880447 - }, - "MMLU - EM (Robustness)": { - "description": "min=0.2, mean=0.272, max=0.37, sum=4.079 (15)", - "tab": "Robustness", - "score": 0.2719415204678362 - }, - "MMLU - EM (Fairness)": { - "description": "min=0.19, mean=0.273, max=0.36, sum=4.102 (15)", - "tab": "Fairness", - "score": 0.2734502923976609 - }, - "MMLU - Denoised inference time (s)": { - "description": "min=0.178, mean=0.182, max=0.184, sum=2.725 (15)", - "tab": "Efficiency", - "score": 0.18164482078684702 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=1542 (15)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=2.465, mean=4.316, max=5, sum=64.743 (15)", - "tab": "General information", - "score": 4.316222222222222 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (15)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=385.228, mean=423.395, max=467.79, sum=6350.919 (15)", - "tab": "General information", - "score": 423.39457309941525 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=15 (15)", - "tab": "General information", - "score": 1.0 - }, - "MMLU - # trials": { - "description": "min=3, mean=3, max=3, sum=45 (15)", - "tab": "General information", - "score": 3.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - 
"evaluation_name": "BoolQ", - "source_data": { - "dataset_name": "BoolQ", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on BoolQ", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.746, - "details": { - "description": "min=0.717, mean=0.746, max=0.762, sum=2.237 (3)", - "tab": "Accuracy", - "BoolQ - ECE (10-bin)": { - "description": "min=0.416, mean=0.46, max=0.512, sum=1.379 (3)", - "tab": "Calibration", - "score": 0.45980755585445926 - }, - "BoolQ - EM (Robustness)": { - "description": "min=0.638, mean=0.646, max=0.651, sum=1.938 (3)", - "tab": "Robustness", - "score": 0.646 - }, - "BoolQ - EM (Fairness)": { - "description": "min=0.672, mean=0.698, max=0.714, sum=2.095 (3)", - "tab": "Fairness", - "score": 0.6983333333333334 - }, - "BoolQ - Denoised inference time (s)": { - "description": "min=0.292, mean=0.313, max=0.341, sum=0.938 (3)", - "tab": "Efficiency", - "score": 0.3127442524572212 - }, - "BoolQ - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "BoolQ - # train": { - "description": "min=0.953, mean=1.57, max=1.978, sum=4.709 (3)", - "tab": "General information", - "score": 1.5696666666666668 - }, - "BoolQ - truncated": { - "description": "min=0.004, mean=0.004, max=0.004, sum=0.012 (3)", - "tab": "General information", - "score": 0.004 - }, - "BoolQ - # prompt tokens": { - "description": "min=386.826, mean=402.285, max=424.449, sum=1206.854 (3)", - "tab": "General information", - "score": 402.2846666666667 - }, - "BoolQ - # output tokens": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "BoolQ - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "BoolQ - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (gender)": { - "description": "min=0.167, mean=0.23, max=0.357, sum=0.69 (3)", - "tab": "Bias", - "score": 0.23015873015873015 - }, - "BoolQ - Toxic fraction": { - "description": "min=0.001, mean=0.001, max=0.001, sum=0.003 (3)", - "tab": "Toxicity", - "score": 0.001 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.083, - "details": { - "description": "min=0.083, mean=0.083, max=0.083, sum=0.248 (3)", - "tab": "Accuracy", - "NarrativeQA - ECE (10-bin)": { - "description": "min=0.0, mean=0.0, max=0.0, sum=0.0 (3)", - "tab": "Calibration", - "score": 4.840114578300129e-6 - }, - "NarrativeQA - F1 (Robustness)": { - "description": "min=0.059, mean=0.059, max=0.059, sum=0.178 (3)", - "tab": "Robustness", - 
"score": 0.05920683866208649 - }, - "NarrativeQA - F1 (Fairness)": { - "description": "min=0.053, mean=0.053, max=0.053, sum=0.159 (3)", - "tab": "Fairness", - "score": 0.05305645886768214 - }, - "NarrativeQA - Denoised inference time (s)": { - "description": "min=1.182, mean=1.182, max=1.182, sum=3.546 (3)", - "tab": "Efficiency", - "score": 1.1820060481894892 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=1065 (3)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - truncated": { - "description": "min=0.834, mean=0.834, max=0.834, sum=2.501 (3)", - "tab": "General information", - "score": 0.8338028169014086 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=492.876, mean=492.876, max=492.876, sum=1478.628 (3)", - "tab": "General information", - "score": 492.87605633802815 - }, - "NarrativeQA - # output tokens": { - "description": "min=100, mean=100, max=100, sum=300 (3)", - "tab": "General information", - "score": 100.0 - }, - "NarrativeQA - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NarrativeQA - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=2 (3)", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "NarrativeQA - Stereotypes (gender)": { - "description": "min=0.337, mean=0.337, max=0.337, sum=1.01 (3)", - "tab": "Bias", - "score": 0.3368016513369257 - }, - "NarrativeQA - Representation (race)": { - "description": "min=0.342, mean=0.342, max=0.342, sum=1.026 (3)", - "tab": "Bias", - "score": 0.3419913419913419 - }, - "NarrativeQA - Representation (gender)": { - "description": "min=0.154, mean=0.154, max=0.154, sum=0.462 (3)", - "tab": "Bias", - "score": 0.15399534522885955 - }, - "NarrativeQA - Toxic fraction": { - "description": "min=0.017, mean=0.017, max=0.017, sum=0.051 (3)", - "tab": "Toxicity", - "score": 0.016901408450704224 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (open-book)", - "source_data": { - "dataset_name": "NaturalQuestions (open-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (open-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.349, - "details": { - "description": "min=0.195, mean=0.349, max=0.432, sum=1.048 (3)", - "tab": "Accuracy", - "NaturalQuestions (closed-book) - ECE (10-bin)": { - "description": "min=0.088, mean=0.092, max=0.095, sum=0.276 (3)", - "tab": "Calibration", - "score": 0.09200000000000001 - }, - "NaturalQuestions (open-book) - ECE (10-bin)": { - "description": "min=0.028, mean=0.179, max=0.258, sum=0.537 (3)", - "tab": "Calibration", - "score": 0.17899999902043598 - }, - "NaturalQuestions (closed-book) - F1 (Robustness)": { - "description": "min=0.139, mean=0.141, max=0.143, sum=0.423 (3)", - "tab": "Robustness", - "score": 0.1409495030072503 - }, - "NaturalQuestions (open-book) - F1 (Robustness)": { - "description": "min=0.154, mean=0.291, max=0.365, sum=0.872 (3)", - "tab": "Robustness", - "score": 0.2906387285430619 - }, - "NaturalQuestions (closed-book) - F1 (Fairness)": { - 
"description": "min=0.159, mean=0.162, max=0.167, sum=0.486 (3)", - "tab": "Fairness", - "score": 0.16184307849771043 - }, - "NaturalQuestions (open-book) - F1 (Fairness)": { - "description": "min=0.153, mean=0.303, max=0.389, sum=0.908 (3)", - "tab": "Fairness", - "score": 0.30281096844711025 - }, - "NaturalQuestions (closed-book) - Denoised inference time (s)": { - "description": "min=1.912, mean=1.994, max=2.142, sum=5.981 (3)", - "tab": "Efficiency", - "score": 1.993551874854462 - }, - "NaturalQuestions (open-book) - Denoised inference time (s)": { - "description": "min=2.941, mean=3.093, max=3.306, sum=9.279 (3)", - "tab": "Efficiency", - "score": 3.0931644739895567 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=113.556, mean=117.556, max=122.556, sum=352.668 (3)", - "tab": "General information", - "score": 117.556 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=300, mean=300, max=300, sum=900 (3)", - "tab": "General information", - "score": 300.0 - }, - "NaturalQuestions (closed-book) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=0.083, mean=0.918, max=1.789, sum=2.755 (3)", - "tab": "General information", - "score": 0.9183333333333333 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.097, mean=0.355, max=0.852, sum=1.064 (3)", - "tab": "General information", - "score": 0.3546666666666667 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=231.47, mean=303.619, max=343.479, sum=910.857 (3)", - "tab": "General information", - "score": 303.61899999999997 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=300, mean=300, max=300, sum=900 (3)", - "tab": "General information", - "score": 300.0 - }, - "NaturalQuestions (open-book) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NaturalQuestions (closed-book) - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=0.667 (1)", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "NaturalQuestions (closed-book) - Stereotypes (gender)": { - "description": "min=0.333, mean=0.387, max=0.44, sum=1.162 (3)", - "tab": "Bias", - "score": 0.3874074074074074 - }, - "NaturalQuestions (closed-book) - Representation (race)": { - "description": "min=0.444, mean=0.519, max=0.562, sum=1.558 (3)", - "tab": "Bias", - "score": 0.5194689485314483 - }, - "NaturalQuestions (closed-book) - Representation (gender)": { - "description": "min=0.079, mean=0.183, max=0.239, sum=0.549 (3)", - "tab": "Bias", - "score": 0.1829490113242974 - }, - "NaturalQuestions (open-book) - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=2 
(3)", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "NaturalQuestions (open-book) - Stereotypes (gender)": { - "description": "min=0.41, mean=0.449, max=0.5, sum=1.346 (3)", - "tab": "Bias", - "score": 0.44858553791887124 - }, - "NaturalQuestions (open-book) - Representation (race)": { - "description": "min=0.451, mean=0.538, max=0.595, sum=1.615 (3)", - "tab": "Bias", - "score": 0.5381999649472214 - }, - "NaturalQuestions (open-book) - Representation (gender)": { - "description": "min=0.069, mean=0.111, max=0.136, sum=0.332 (3)", - "tab": "Bias", - "score": 0.11064384639781977 - }, - "NaturalQuestions (closed-book) - Toxic fraction": { - "description": "min=0.001, mean=0.001, max=0.001, sum=0.003 (3)", - "tab": "Toxicity", - "score": 0.001 - }, - "NaturalQuestions (open-book) - Toxic fraction": { - "description": "min=0, mean=0.001, max=0.002, sum=0.003 (3)", - "tab": "Toxicity", - "score": 0.001 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "QuAC", - "source_data": { - "dataset_name": "QuAC", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on QuAC", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.144, - "details": { - "description": "min=0.144, mean=0.144, max=0.144, sum=0.433 (3)", - "tab": "Accuracy", - "QuAC - ECE (10-bin)": { - "description": "min=0.0, mean=0.0, max=0.0, sum=0.0 (3)", - "tab": "Calibration", - "score": 0.00013015946539738277 - }, - "QuAC - F1 (Robustness)": { - "description": "min=0.111, mean=0.111, max=0.111, sum=0.333 (3)", - "tab": "Robustness", - "score": 0.11096938073772407 - }, - "QuAC - F1 (Fairness)": { - "description": "min=0.107, mean=0.107, max=0.107, sum=0.32 (3)", - "tab": "Fairness", - "score": 0.10672699918485114 - }, - "QuAC - Denoised inference time (s)": { - "description": "min=1.226, mean=1.226, max=1.226, sum=3.679 (3)", - "tab": "Efficiency", - "score": 1.2264695519389521 - }, - "QuAC - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "QuAC - # train": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "QuAC - truncated": { - "description": "min=0.999, mean=0.999, max=0.999, sum=2.997 (3)", - "tab": "General information", - "score": 0.999 - }, - "QuAC - # prompt tokens": { - "description": "min=510.938, mean=510.938, max=510.938, sum=1532.814 (3)", - "tab": "General information", - "score": 510.93799999999993 - }, - "QuAC - # output tokens": { - "description": "min=100, mean=100, max=100, sum=300 (3)", - "tab": "General information", - "score": 100.0 - }, - "QuAC - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "QuAC - Stereotypes (race)": { - "description": "min=0.614, mean=0.614, max=0.614, sum=1.843 (3)", - "tab": "Bias", - "score": 0.6143486267149368 - }, - "QuAC - Stereotypes (gender)": { - "description": "min=0.402, mean=0.402, max=0.402, sum=1.207 (3)", - "tab": "Bias", - "score": 0.40228575253954807 - }, - "QuAC - Representation (race)": { - "description": "min=0.317, mean=0.317, max=0.317, sum=0.951 (3)", - "tab": "Bias", - "score": 0.3169129720853858 - }, - "QuAC - Representation (gender)": { - "description": "min=0.253, 
mean=0.253, max=0.253, sum=0.758 (3)", - "tab": "Bias", - "score": 0.2525635309852876 - }, - "QuAC - Toxic fraction": { - "description": "min=0.006, mean=0.006, max=0.006, sum=0.018 (3)", - "tab": "Toxicity", - "score": 0.006000000000000001 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "HellaSwag", - "source_data": { - "dataset_name": "HellaSwag", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on HellaSwag", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "HellaSwag - ECE (10-bin)": { - "description": "No matching runs", - "tab": "Calibration", - "score": null - }, - "HellaSwag - EM (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "HellaSwag - EM (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "HellaSwag - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "HellaSwag - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "OpenbookQA - ECE (10-bin)": { - "description": "No matching runs", - "tab": "Calibration", - "score": null - }, - "OpenbookQA - EM (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "OpenbookQA - EM (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "OpenbookQA - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "OpenbookQA - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # prompt tokens": { - "description": "No matching runs", - "tab": "General 
information", - "score": null - }, - "OpenbookQA - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "TruthfulQA", - "source_data": { - "dataset_name": "TruthfulQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on TruthfulQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.193, - "details": { - "description": "min=0.162, mean=0.193, max=0.232, sum=0.772 (4)", - "tab": "Accuracy", - "TruthfulQA - ECE (10-bin)": { - "description": "min=0.096, mean=0.125, max=0.139, sum=0.498 (4)", - "tab": "Calibration", - "score": 0.12460869505528777 - }, - "TruthfulQA - EM (Robustness)": { - "description": "min=0.162, mean=0.178, max=0.209, sum=0.711 (4)", - "tab": "Robustness", - "score": 0.17775229357798167 - }, - "TruthfulQA - EM (Fairness)": { - "description": "min=0.15, mean=0.162, max=0.176, sum=0.647 (4)", - "tab": "Fairness", - "score": 0.16169724770642202 - }, - "TruthfulQA - Denoised inference time (s)": { - "description": "min=0.122, mean=0.168, max=0.183, sum=0.671 (4)", - "tab": "Efficiency", - "score": 0.16779271445154526 - }, - "TruthfulQA - # eval": { - "description": "min=654, mean=654, max=654, sum=2616 (4)", - "tab": "General information", - "score": 654.0 - }, - "TruthfulQA - # train": { - "description": "min=0, mean=3.513, max=4.838, sum=14.05 (4)", - "tab": "General information", - "score": 3.5126146788990824 - }, - "TruthfulQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (4)", - "tab": "General information", - "score": 0.0 - }, - "TruthfulQA - # prompt tokens": { - "description": "min=89.896, mean=372.668, max=473.333, sum=1490.671 (4)", - "tab": "General information", - "score": 372.66781345565744 - }, - "TruthfulQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=4 (4)", - "tab": "General information", - "score": 1.0 - }, - "TruthfulQA - # trials": { - "description": "min=1, mean=2.5, max=3, sum=10 (4)", - "tab": "General information", - "score": 2.5 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MS MARCO (TREC)", - "source_data": { - "dataset_name": "MS MARCO (TREC)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "NDCG@10 on MS MARCO (TREC)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "MS MARCO (regular) - RR@10 (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "MS MARCO (TREC) - NDCG@10 (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "MS MARCO (regular) - RR@10 (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "MS MARCO (TREC) - NDCG@10 (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", 
- "score": null - }, - "MS MARCO (regular) - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "MS MARCO (TREC) - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "MS MARCO (regular) - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - }, - "MS MARCO (TREC) - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CNN/DailyMail", - "source_data": { - "dataset_name": "CNN/DailyMail", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on CNN/DailyMail", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.03, - "details": { - "description": "min=0.03, mean=0.03, max=0.03, sum=0.182 (6)", - "tab": "Accuracy", - "CNN/DailyMail - Denoised inference time (s)": { - "description": "min=1.108, mean=1.108, max=1.109, sum=6.651 (6)", - "tab": "Efficiency", - "score": 1.1084291968542619 - }, - "CNN/DailyMail - # eval": { - "description": "min=466, mean=466, max=466, sum=2796 (6)", - "tab": "General information", - "score": 466.0 - }, - "CNN/DailyMail - # train": { - "description": "min=0.06, mean=0.061, max=0.062, sum=0.365 (6)", - "tab": "General information", - "score": 0.060801144492131615 - }, - "CNN/DailyMail - truncated": { - "description": "min=0.933, mean=0.935, max=0.936, sum=5.609 (6)", - "tab": "General information", - "score": 0.9349070100143061 - }, - "CNN/DailyMail - # prompt tokens": { - "description": "min=500.788, mean=500.829, max=500.912, sum=3004.974 (6)", - "tab": "General information", - "score": 500.8290414878398 - }, - "CNN/DailyMail - # output tokens": { - "description": "min=128, mean=128, max=128, sum=768 (6)", - "tab": "General information", - "score": 128.0 - }, - "CNN/DailyMail - # trials": { - "description": "min=3, mean=3, max=3, sum=18 (6)", - "tab": "General information", - "score": 3.0 - }, - "CNN/DailyMail - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=4 (6)", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "CNN/DailyMail - Stereotypes (gender)": { - "description": "min=0.402, mean=0.402, max=0.402, sum=2.411 (6)", - "tab": "Bias", - "score": 0.4018787714810442 - }, - "CNN/DailyMail - Representation (race)": { - "description": "min=0.361, mean=0.361, max=0.361, sum=2.163 (6)", - "tab": "Bias", - "score": 0.3605442176870748 - }, - "CNN/DailyMail - Representation (gender)": { - "description": "min=0.188, mean=0.188, max=0.188, sum=1.129 (6)", - "tab": "Bias", - "score": 0.1882129277566539 - }, - "CNN/DailyMail - Toxic fraction": { - "description": "min=0.009, mean=0.009, max=0.009, sum=0.052 (6)", - "tab": "Toxicity", - "score": 0.008583690987124463 - }, - "CNN/DailyMail - SummaC": { - "description": "min=-0.27, mean=-0.27, max=-0.27, sum=-0.81 (3)", - "tab": "Summarization metrics", - "score": -0.2698551726198464 - }, - "CNN/DailyMail - QAFactEval": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - BERTScore (F1)": { - "description": "min=-0.122, mean=-0.121, max=-0.12, sum=-0.362 (3)", - "tab": "Summarization metrics", - "score": -0.12078049146748136 - }, - "CNN/DailyMail - Coverage": { - "description": "min=0.72, mean=0.72, max=0.72, sum=4.319 (6)", - "tab": "Summarization metrics", - "score": 0.7197585278365729 - }, - "CNN/DailyMail - Density": { - "description": "min=5.044, mean=5.044, max=5.044, sum=30.265 (6)", - "tab": "Summarization metrics", - "score": 5.044183333839311 - }, - "CNN/DailyMail - Compression": { - "description": "min=7.173, mean=7.186, max=7.2, sum=43.118 (6)", - "tab": "Summarization metrics", - "score": 7.186281356409094 - }, - "CNN/DailyMail - HumanEval-faithfulness": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-relevance": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-coherence": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - 
"score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "XSUM", - "source_data": { - "dataset_name": "XSUM", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on XSUM", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.058, - "details": { - "description": "min=0.049, mean=0.058, max=0.066, sum=0.345 (6)", - "tab": "Accuracy", - "XSUM - Denoised inference time (s)": { - "description": "min=0.771, mean=0.774, max=0.781, sum=4.646 (6)", - "tab": "Efficiency", - "score": 0.7743015579914415 - }, - "XSUM - # eval": { - "description": "min=518, mean=518, max=518, sum=3108 (6)", - "tab": "General information", - "score": 518.0 - }, - "XSUM - # train": { - "description": "min=0.234, mean=0.293, max=0.361, sum=1.761 (6)", - "tab": "General information", - "score": 0.29343629343629346 - }, - "XSUM - truncated": { - "description": "min=0.614, mean=0.677, max=0.736, sum=4.062 (6)", - "tab": "General information", - "score": 0.676962676962677 - }, - "XSUM - # prompt tokens": { - "description": "min=433.917, mean=437.97, max=442.292, sum=2627.819 (6)", - "tab": "General information", - "score": 437.96975546975546 - }, - "XSUM - # output tokens": { - "description": "min=64, mean=64, max=64, sum=384 (6)", - "tab": "General information", - "score": 64.0 - }, - "XSUM - # trials": { - "description": "min=3, mean=3, max=3, sum=18 (6)", - "tab": "General information", - "score": 3.0 - }, - "XSUM - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=4 (6)", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "XSUM - Stereotypes (gender)": { - "description": "min=0.45, mean=0.455, max=0.463, sum=2.729 (6)", - "tab": "Bias", - "score": 0.45478395061728394 - }, - "XSUM - Representation (race)": { - "description": "min=0.489, mean=0.524, max=0.556, sum=3.145 (6)", - "tab": "Bias", - "score": 0.5241150528821762 - }, - "XSUM - Representation (gender)": { - "description": "min=0.236, mean=0.251, max=0.262, sum=1.508 (6)", - "tab": "Bias", - "score": 0.251389993488347 - }, - "XSUM - Toxic fraction": { - "description": "min=0, mean=0.001, max=0.002, sum=0.004 (6)", - "tab": "Toxicity", - "score": 0.0006435006435006435 - }, - "XSUM - SummaC": { - "description": "min=-0.28, mean=-0.275, max=-0.272, sum=-0.826 (3)", - "tab": "Summarization metrics", - "score": -0.2753430534988641 - }, - "XSUM - QAFactEval": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - BERTScore (F1)": { - "description": "min=0.028, mean=0.072, max=0.121, sum=0.215 (3)", - "tab": "Summarization metrics", - "score": 0.07156637071699196 - }, - "XSUM - Coverage": { - "description": "min=0.617, mean=0.643, max=0.671, sum=3.856 (6)", - "tab": "Summarization metrics", - "score": 0.6426528869383965 - }, - "XSUM - Density": { - "description": "min=3.058, mean=3.208, max=3.428, sum=19.25 (6)", - "tab": "Summarization metrics", - "score": 3.2083925287601787 - }, - "XSUM - Compression": { - "description": "min=7.31, mean=7.853, max=8.427, sum=47.12 (6)", - "tab": "Summarization metrics", - "score": 7.853257861418139 - }, - "XSUM - HumanEval-faithfulness": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - 
"score": null - }, - "XSUM - HumanEval-relevance": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-coherence": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "IMDB", - "source_data": { - "dataset_name": "IMDB", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on IMDB", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.337, - "details": { - "description": "min=0.13, mean=0.337, max=0.556, sum=1.01 (3)", - "tab": "Accuracy", - "IMDB - ECE (10-bin)": { - "description": "min=0.037, mean=0.225, max=0.41, sum=0.675 (3)", - "tab": "Calibration", - "score": 0.22500123786419848 - }, - "IMDB - EM (Robustness)": { - "description": "min=0.091, mean=0.276, max=0.485, sum=0.827 (3)", - "tab": "Robustness", - "score": 0.27566666666666667 - }, - "IMDB - EM (Fairness)": { - "description": "min=0.092, mean=0.271, max=0.484, sum=0.814 (3)", - "tab": "Fairness", - "score": 0.2713333333333333 - }, - "IMDB - Denoised inference time (s)": { - "description": "min=0.214, mean=0.215, max=0.217, sum=0.645 (3)", - "tab": "Efficiency", - "score": 0.21490736543138858 - }, - "IMDB - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "IMDB - # train": { - "description": "min=0.309, mean=0.449, max=0.689, sum=1.347 (3)", - "tab": "General information", - "score": 0.449 - }, - "IMDB - truncated": { - "description": "min=0.175, mean=0.176, max=0.176, sum=0.527 (3)", - "tab": "General information", - "score": 0.17566666666666664 - }, - "IMDB - # prompt tokens": { - "description": "min=388.254, mean=407.098, max=435.686, sum=1221.293 (3)", - "tab": "General information", - "score": 407.0976666666666 - }, - "IMDB - # output tokens": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "IMDB - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "IMDB - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CivilComments", - "source_data": { - "dataset_name": "CivilComments", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on CivilComments", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.521, - "details": { - "description": "min=0, mean=0.521, max=1, sum=28.146 (54)", - "tab": "Accuracy", - "CivilComments - ECE (10-bin)": { - "description": "min=0.123, mean=0.404, max=0.585, sum=21.802 (54)", - "tab": "Calibration", - "score": 0.40373338964571226 - }, - "CivilComments - EM (Robustness)": { - "description": "min=0, mean=0.45, max=0.983, sum=24.293 (54)", - "tab": "Robustness", - "score": 0.4498711194026963 - }, - "CivilComments - EM (Fairness)": { - "description": "min=0, mean=0.423, max=0.975, sum=22.816 (54)", - "tab": "Fairness", - "score": 0.4225225679997762 - }, - "CivilComments - Denoised inference time (s)": { - "description": "min=0.21, mean=0.264, max=0.45, sum=14.236 (54)", - "tab": "Efficiency", - "score": 0.2636334561494892 - }, - "CivilComments - # eval": { - "description": "min=74, mean=371.556, max=683, sum=20064 (54)", - "tab": "General information", - "score": 371.55555555555554 - }, - "CivilComments - # train": { - "description": "min=1.01, mean=2.608, max=4.878, sum=140.857 (54)", - "tab": "General information", - "score": 2.608459470057463 - }, - "CivilComments - truncated": { - "description": "min=0, mean=0.003, max=0.032, sum=0.138 (54)", - "tab": "General information", - "score": 0.0025500084787325617 - }, - "CivilComments - # prompt tokens": { - "description": "min=335.768, mean=416.896, max=479.235, sum=22512.361 (54)", - "tab": "General information", - "score": 416.89557696196465 - }, - "CivilComments - # output tokens": { - "description": "min=5, mean=5, max=5, sum=270 (54)", - "tab": "General information", - "score": 5.0 - }, - "CivilComments - # trials": { - "description": "min=3, mean=3, max=3, sum=162 (54)", - "tab": "General information", - "score": 3.0 - }, - "CivilComments - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (54)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "RAFT", - "source_data": { - "dataset_name": "RAFT", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on RAFT", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.404, - "details": { - "description": "min=0, mean=0.404, max=0.95, sum=13.325 (33)", - "tab": "Accuracy", - "RAFT - ECE (10-bin)": { - "description": "min=0.0, mean=0.401, max=0.95, sum=13.228 (33)", - "tab": "Calibration", - "score": 0.40084433515818857 - }, - "RAFT - EM (Robustness)": { - "description": "min=0, mean=0.349, max=0.95, sum=11.525 (33)", - "tab": "Robustness", - "score": 0.3492424242424242 - }, - "RAFT - EM (Fairness)": { - "description": "min=0, mean=0.375, max=0.95, sum=12.375 (33)", - "tab": "Fairness", - "score": 0.375 - }, - "RAFT - Denoised inference time (s)": { - "description": "min=0.316, mean=0.434, max=0.454, sum=14.32 (33)", - 
"tab": "Efficiency", - "score": 0.43394225670679076 - }, - "RAFT - # eval": { - "description": "min=40, mean=40, max=40, sum=1320 (33)", - "tab": "General information", - "score": 40.0 - }, - "RAFT - # train": { - "description": "min=0, mean=2.433, max=5, sum=80.3 (33)", - "tab": "General information", - "score": 2.433333333333333 - }, - "RAFT - truncated": { - "description": "min=0, mean=0.394, max=1, sum=13 (33)", - "tab": "General information", - "score": 0.3939393939393939 - }, - "RAFT - # prompt tokens": { - "description": "min=267.4, mean=423.537, max=511, sum=13976.725 (33)", - "tab": "General information", - "score": 423.53712121212124 - }, - "RAFT - # output tokens": { - "description": "min=30, mean=30, max=30, sum=990 (33)", - "tab": "General information", - "score": 30.0 - }, - "RAFT - # trials": { - "description": "min=3, mean=3, max=3, sum=99 (33)", - "tab": "General information", - "score": 3.0 - }, - "RAFT - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (gender)": { - "description": "min=0.079, mean=0.079, max=0.079, sum=0.237 (3)", - "tab": "Bias", - "score": 0.07894736842105265 - }, - "RAFT - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (33)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_classic/lmsys/Vicuna-v1.3-13B/8f152c7d-5fba-476e-82c1-4f34a6e7d7e0.json b/data/helm_classic/lmsys/Vicuna-v1.3-13B/8f152c7d-5fba-476e-82c1-4f34a6e7d7e0.json deleted file mode 100644 index e1d9662a3..000000000 --- a/data/helm_classic/lmsys/Vicuna-v1.3-13B/8f152c7d-5fba-476e-82c1-4f34a6e7d7e0.json +++ /dev/null @@ -1,1613 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_classic/lmsys_Vicuna-v1.3-13B/1770834891.1472661", - "retrieved_timestamp": "1770834891.1472661", - "source_metadata": { - "source_name": "helm_classic", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Vicuna v1.3 13B", - "id": "lmsys/Vicuna-v1.3-13B", - "developer": "lmsys", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_classic", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperform on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.706, - "details": { - "tab": "Accuracy", - "Mean win rate - Calibration": { - "description": null, - "tab": "Calibration", - "score": 0.27488436632747454 - }, - "Mean win rate - Robustness": { - "description": null, - "tab": "Robustness", - "score": 0.7320745920745921 - }, - "Mean win rate - Fairness": { - "description": null, - "tab": "Fairness", - "score": 0.7154545454545455 - }, - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": null - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": 
null - }, - "Mean win rate - Bias": { - "description": null, - "tab": "Bias", - "score": 0.5333173629091996 - }, - "Mean win rate - Toxicity": { - "description": null, - "tab": "Toxicity", - "score": 0.5758158508158508 - }, - "Mean win rate - Summarization metrics": { - "description": null, - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.462, - "details": { - "description": "min=0.298, mean=0.462, max=0.72, sum=2.308 (5)", - "tab": "Accuracy", - "MMLU - ECE (10-bin)": { - "description": "min=0.156, mean=0.194, max=0.246, sum=0.972 (5)", - "tab": "Calibration", - "score": 0.19445587267296924 - }, - "MMLU - EM (Robustness)": { - "description": "min=0.237, mean=0.413, max=0.69, sum=2.067 (5)", - "tab": "Robustness", - "score": 0.4133684210526316 - }, - "MMLU - EM (Fairness)": { - "description": "min=0.228, mean=0.424, max=0.7, sum=2.118 (5)", - "tab": "Fairness", - "score": 0.4236140350877193 - }, - "MMLU - Denoised inference time (s)": { - "description": "5 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=397.65, mean=522.547, max=684.675, sum=2612.735 (5)", - "tab": "General information", - "score": 522.5470877192982 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "MMLU - # trials": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "BoolQ", - "source_data": { - "dataset_name": "BoolQ", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on BoolQ", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.808, - "details": { - "description": "min=0.808, mean=0.808, max=0.808, sum=0.808 (1)", - "tab": "Accuracy", - "BoolQ - ECE (10-bin)": { - "description": "min=0.159, mean=0.159, max=0.159, sum=0.159 (1)", - "tab": "Calibration", - "score": 0.15912327464389103 - }, - "BoolQ - EM (Robustness)": { - "description": "min=0.757, mean=0.757, max=0.757, sum=0.757 (1)", - "tab": "Robustness", - "score": 0.757 - }, - "BoolQ - EM (Fairness)": { - "description": "min=0.748, mean=0.748, max=0.748, sum=0.748 (1)", - "tab": "Fairness", - "score": 0.748 - }, - "BoolQ - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - 
"tab": "Efficiency", - "score": null - }, - "BoolQ - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "BoolQ - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "BoolQ - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "BoolQ - # prompt tokens": { - "description": "min=1439.447, mean=1439.447, max=1439.447, sum=1439.447 (1)", - "tab": "General information", - "score": 1439.447 - }, - "BoolQ - # output tokens": { - "description": "min=4.996, mean=4.996, max=4.996, sum=4.996 (1)", - "tab": "General information", - "score": 4.996 - }, - "BoolQ - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "BoolQ - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.691, - "details": { - "description": "min=0.691, mean=0.691, max=0.691, sum=0.691 (1)", - "tab": "Accuracy", - "NarrativeQA - ECE (10-bin)": { - "description": "min=0.257, mean=0.257, max=0.257, sum=0.257 (1)", - "tab": "Calibration", - "score": 0.25677737638719905 - }, - "NarrativeQA - F1 (Robustness)": { - "description": "min=0.525, mean=0.525, max=0.525, sum=0.525 (1)", - "tab": "Robustness", - "score": 0.5253621693457193 - }, - "NarrativeQA - F1 (Fairness)": { - "description": "min=0.607, mean=0.607, max=0.607, sum=0.607 (1)", - "tab": "Fairness", - "score": 0.6066076692752655 - }, - "NarrativeQA - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=1.437, mean=1.437, max=1.437, sum=1.437 (1)", - "tab": "General information", - "score": 1.4366197183098592 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=1541.115, mean=1541.115, max=1541.115, sum=1541.115 (1)", - "tab": "General information", - "score": 1541.1154929577465 - }, - "NarrativeQA - # output tokens": { - "description": "min=67.575, mean=67.575, max=67.575, sum=67.575 (1)", - "tab": "General information", - "score": 67.57464788732395 - }, - "NarrativeQA - # trials": { - "description": "min=1, mean=1, max=1, 
sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NarrativeQA - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NarrativeQA - Stereotypes (gender)": { - "description": "min=0.417, mean=0.417, max=0.417, sum=0.417 (1)", - "tab": "Bias", - "score": 0.41666666666666663 - }, - "NarrativeQA - Representation (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=0.667 (1)", - "tab": "Bias", - "score": 0.6666666666666667 - }, - "NarrativeQA - Representation (gender)": { - "description": "min=0.181, mean=0.181, max=0.181, sum=0.181 (1)", - "tab": "Bias", - "score": 0.1806282722513089 - }, - "NarrativeQA - Toxic fraction": { - "description": "min=0.008, mean=0.008, max=0.008, sum=0.008 (1)", - "tab": "Toxicity", - "score": 0.008450704225352112 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (open-book)", - "source_data": { - "dataset_name": "NaturalQuestions (open-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (open-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.686, - "details": { - "description": "min=0.686, mean=0.686, max=0.686, sum=0.686 (1)", - "tab": "Accuracy", - "NaturalQuestions (closed-book) - ECE (10-bin)": { - "description": "min=0.202, mean=0.202, max=0.202, sum=0.202 (1)", - "tab": "Calibration", - "score": 0.20199999735253094 - }, - "NaturalQuestions (open-book) - ECE (10-bin)": { - "description": "min=0.43, mean=0.43, max=0.43, sum=0.43 (1)", - "tab": "Calibration", - "score": 0.4297157164166979 - }, - "NaturalQuestions (closed-book) - F1 (Robustness)": { - "description": "min=0.273, mean=0.273, max=0.273, sum=0.273 (1)", - "tab": "Robustness", - "score": 0.2732835109469542 - }, - "NaturalQuestions (open-book) - F1 (Robustness)": { - "description": "min=0.621, mean=0.621, max=0.621, sum=0.621 (1)", - "tab": "Robustness", - "score": 0.6205537766211775 - }, - "NaturalQuestions (closed-book) - F1 (Fairness)": { - "description": "min=0.266, mean=0.266, max=0.266, sum=0.266 (1)", - "tab": "Fairness", - "score": 0.26608326669652704 - }, - "NaturalQuestions (open-book) - F1 (Fairness)": { - "description": "min=0.63, mean=0.63, max=0.63, sum=0.63 (1)", - "tab": "Fairness", - "score": 0.6295785534387982 - }, - "NaturalQuestions (closed-book) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NaturalQuestions (open-book) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=137.383, mean=137.383, max=137.383, sum=137.383 (1)", - "tab": "General information", - 
"score": 137.383 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=299.508, mean=299.508, max=299.508, sum=299.508 (1)", - "tab": "General information", - "score": 299.508 - }, - "NaturalQuestions (closed-book) - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=3.722, mean=3.722, max=3.722, sum=3.722 (1)", - "tab": "General information", - "score": 3.722 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.049, mean=0.049, max=0.049, sum=0.049 (1)", - "tab": "General information", - "score": 0.049 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1407.178, mean=1407.178, max=1407.178, sum=1407.178 (1)", - "tab": "General information", - "score": 1407.178 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=266.895, mean=266.895, max=266.895, sum=266.895 (1)", - "tab": "General information", - "score": 266.895 - }, - "NaturalQuestions (open-book) - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NaturalQuestions (closed-book) - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=0.667 (1)", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "NaturalQuestions (closed-book) - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NaturalQuestions (closed-book) - Representation (race)": { - "description": "min=0.364, mean=0.364, max=0.364, sum=0.364 (1)", - "tab": "Bias", - "score": 0.363914373088685 - }, - "NaturalQuestions (closed-book) - Representation (gender)": { - "description": "min=0.132, mean=0.132, max=0.132, sum=0.132 (1)", - "tab": "Bias", - "score": 0.13157894736842105 - }, - "NaturalQuestions (open-book) - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=0.667 (1)", - "tab": "Bias", - "score": 0.6666666666666667 - }, - "NaturalQuestions (open-book) - Stereotypes (gender)": { - "description": "min=0.5, mean=0.5, max=0.5, sum=0.5 (1)", - "tab": "Bias", - "score": 0.5 - }, - "NaturalQuestions (open-book) - Representation (race)": { - "description": "min=0.484, mean=0.484, max=0.484, sum=0.484 (1)", - "tab": "Bias", - "score": 0.4838709677419355 - }, - "NaturalQuestions (open-book) - Representation (gender)": { - "description": "min=0.293, mean=0.293, max=0.293, sum=0.293 (1)", - "tab": "Bias", - "score": 0.29310344827586204 - }, - "NaturalQuestions (closed-book) - Toxic fraction": { - "description": "min=0.001, mean=0.001, max=0.001, sum=0.001 (1)", - "tab": "Toxicity", - "score": 0.001 - }, - "NaturalQuestions (open-book) - Toxic fraction": { - "description": "min=0.001, mean=0.001, max=0.001, sum=0.001 (1)", - "tab": "Toxicity", - "score": 0.001 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "QuAC", - "source_data": { - "dataset_name": "QuAC", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on QuAC", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, 
- "score_details": { - "score": 0.403, - "details": { - "description": "min=0.403, mean=0.403, max=0.403, sum=0.403 (1)", - "tab": "Accuracy", - "QuAC - ECE (10-bin)": { - "description": "min=0.103, mean=0.103, max=0.103, sum=0.103 (1)", - "tab": "Calibration", - "score": 0.10339686685910766 - }, - "QuAC - F1 (Robustness)": { - "description": "min=0.247, mean=0.247, max=0.247, sum=0.247 (1)", - "tab": "Robustness", - "score": 0.24738453163162216 - }, - "QuAC - F1 (Fairness)": { - "description": "min=0.324, mean=0.324, max=0.324, sum=0.324 (1)", - "tab": "Fairness", - "score": 0.32414193488324744 - }, - "QuAC - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "QuAC - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "QuAC - # train": { - "description": "min=0.507, mean=0.507, max=0.507, sum=0.507 (1)", - "tab": "General information", - "score": 0.507 - }, - "QuAC - truncated": { - "description": "min=0.06, mean=0.06, max=0.06, sum=0.06 (1)", - "tab": "General information", - "score": 0.06 - }, - "QuAC - # prompt tokens": { - "description": "min=1498.657, mean=1498.657, max=1498.657, sum=1498.657 (1)", - "tab": "General information", - "score": 1498.657 - }, - "QuAC - # output tokens": { - "description": "min=77.743, mean=77.743, max=77.743, sum=77.743 (1)", - "tab": "General information", - "score": 77.743 - }, - "QuAC - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "QuAC - Stereotypes (race)": { - "description": "min=0.63, mean=0.63, max=0.63, sum=0.63 (1)", - "tab": "Bias", - "score": 0.6296296296296295 - }, - "QuAC - Stereotypes (gender)": { - "description": "min=0.408, mean=0.408, max=0.408, sum=0.408 (1)", - "tab": "Bias", - "score": 0.4083074125172457 - }, - "QuAC - Representation (race)": { - "description": "min=0.289, mean=0.289, max=0.289, sum=0.289 (1)", - "tab": "Bias", - "score": 0.28888888888888886 - }, - "QuAC - Representation (gender)": { - "description": "min=0.242, mean=0.242, max=0.242, sum=0.242 (1)", - "tab": "Bias", - "score": 0.2418952618453865 - }, - "QuAC - Toxic fraction": { - "description": "min=0.001, mean=0.001, max=0.001, sum=0.001 (1)", - "tab": "Toxicity", - "score": 0.001 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "HellaSwag", - "source_data": { - "dataset_name": "HellaSwag", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on HellaSwag", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "HellaSwag - ECE (10-bin)": { - "description": "No matching runs", - "tab": "Calibration", - "score": null - }, - "HellaSwag - EM (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "HellaSwag - EM (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "HellaSwag - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "HellaSwag - # eval": { - "description": "No matching runs", - "tab": "General information", - 
"score": null - }, - "HellaSwag - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "OpenbookQA - ECE (10-bin)": { - "description": "No matching runs", - "tab": "Calibration", - "score": null - }, - "OpenbookQA - EM (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "OpenbookQA - EM (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "OpenbookQA - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "OpenbookQA - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "TruthfulQA", - "source_data": { - "dataset_name": "TruthfulQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on TruthfulQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.385, - "details": { - "description": "min=0.385, mean=0.385, max=0.385, sum=0.385 (1)", - "tab": "Accuracy", - "TruthfulQA - ECE (10-bin)": { - "description": "min=0.316, mean=0.316, max=0.316, sum=0.316 (1)", - "tab": "Calibration", - "score": 0.31581376966800645 - }, - "TruthfulQA - EM (Robustness)": { - "description": "min=0.341, mean=0.341, max=0.341, sum=0.341 (1)", - "tab": "Robustness", - "score": 0.3409785932721712 - }, - "TruthfulQA - EM (Fairness)": { - "description": "min=0.315, mean=0.315, max=0.315, sum=0.315 (1)", - "tab": "Fairness", - "score": 0.3149847094801223 - }, - "TruthfulQA - Denoised 
inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "TruthfulQA - # eval": { - "description": "min=654, mean=654, max=654, sum=654 (1)", - "tab": "General information", - "score": 654.0 - }, - "TruthfulQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "TruthfulQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "TruthfulQA - # prompt tokens": { - "description": "min=524.602, mean=524.602, max=524.602, sum=524.602 (1)", - "tab": "General information", - "score": 524.6024464831804 - }, - "TruthfulQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "TruthfulQA - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MS MARCO (TREC)", - "source_data": { - "dataset_name": "MS MARCO (TREC)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "NDCG@10 on MS MARCO (TREC)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "MS MARCO (regular) - RR@10 (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "MS MARCO (TREC) - NDCG@10 (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "MS MARCO (regular) - RR@10 (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "MS MARCO (TREC) - NDCG@10 (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "MS MARCO (regular) - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "MS MARCO (TREC) - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "MS MARCO (regular) - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # prompt 
tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - }, - "MS MARCO (TREC) - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CNN/DailyMail", - "source_data": { - "dataset_name": "CNN/DailyMail", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on CNN/DailyMail", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "CNN/DailyMail - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "CNN/DailyMail - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - 
"CNN/DailyMail - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - }, - "CNN/DailyMail - SummaC": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - QAFactEval": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - BERTScore (F1)": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - Coverage": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - Density": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - Compression": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-faithfulness": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-relevance": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-coherence": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "XSUM", - "source_data": { - "dataset_name": "XSUM", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on XSUM", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "XSUM - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "XSUM - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - }, - "XSUM - SummaC": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - QAFactEval": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - BERTScore (F1)": { - "description": "No matching runs", - "tab": "Summarization metrics", - 
"score": null - }, - "XSUM - Coverage": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - Density": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - Compression": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-faithfulness": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-relevance": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-coherence": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "IMDB", - "source_data": { - "dataset_name": "IMDB", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on IMDB", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.762, - "details": { - "description": "min=0.762, mean=0.762, max=0.762, sum=0.762 (1)", - "tab": "Accuracy", - "IMDB - ECE (10-bin)": { - "description": "min=0.183, mean=0.183, max=0.183, sum=0.183 (1)", - "tab": "Calibration", - "score": 0.18259660460611343 - }, - "IMDB - EM (Robustness)": { - "description": "min=0.674, mean=0.674, max=0.674, sum=0.674 (1)", - "tab": "Robustness", - "score": 0.674 - }, - "IMDB - EM (Fairness)": { - "description": "min=0.707, mean=0.707, max=0.707, sum=0.707 (1)", - "tab": "Fairness", - "score": 0.707 - }, - "IMDB - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "IMDB - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "IMDB - # train": { - "description": "min=2.781, mean=2.781, max=2.781, sum=2.781 (1)", - "tab": "General information", - "score": 2.781 - }, - "IMDB - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IMDB - # prompt tokens": { - "description": "min=1751.213, mean=1751.213, max=1751.213, sum=1751.213 (1)", - "tab": "General information", - "score": 1751.213 - }, - "IMDB - # output tokens": { - "description": "min=3.32, mean=3.32, max=3.32, sum=3.32 (1)", - "tab": "General information", - "score": 3.32 - }, - "IMDB - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "IMDB - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - 
"evaluation_name": "CivilComments", - "source_data": { - "dataset_name": "CivilComments", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on CivilComments", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.645, - "details": { - "description": "min=0.247, mean=0.645, max=0.946, sum=11.602 (18)", - "tab": "Accuracy", - "CivilComments - ECE (10-bin)": { - "description": "min=0.086, mean=0.253, max=0.415, sum=4.559 (18)", - "tab": "Calibration", - "score": 0.25325054290553783 - }, - "CivilComments - EM (Robustness)": { - "description": "min=0.177, mean=0.593, max=0.932, sum=10.679 (18)", - "tab": "Robustness", - "score": 0.5932501359027997 - }, - "CivilComments - EM (Fairness)": { - "description": "min=0.139, mean=0.569, max=0.946, sum=10.248 (18)", - "tab": "Fairness", - "score": 0.5693148383516141 - }, - "CivilComments - Denoised inference time (s)": { - "description": "9 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "CivilComments - # eval": { - "description": "min=74, mean=371.556, max=683, sum=6688 (18)", - "tab": "General information", - "score": 371.55555555555554 - }, - "CivilComments - # train": { - "description": "min=5, mean=5, max=5, sum=90 (18)", - "tab": "General information", - "score": 5.0 - }, - "CivilComments - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (18)", - "tab": "General information", - "score": 0.0 - }, - "CivilComments - # prompt tokens": { - "description": "min=404.732, mean=855.241, max=1417.567, sum=15394.339 (18)", - "tab": "General information", - "score": 855.2410378605821 - }, - "CivilComments - # output tokens": { - "description": "min=2, mean=2.59, max=4.159, sum=46.618 (18)", - "tab": "General information", - "score": 2.589879611958418 - }, - "CivilComments - # trials": { - "description": "min=1, mean=1, max=1, sum=18 (18)", - "tab": "General information", - "score": 1.0 - }, - "CivilComments - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Toxic fraction": { - "description": "9 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "RAFT", - "source_data": { - "dataset_name": "RAFT", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on RAFT", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.657, - "details": { - "description": "min=0.175, mean=0.657, max=0.9, sum=7.225 (11)", - "tab": "Accuracy", - "RAFT - ECE (10-bin)": { - "description": "min=0.212, mean=0.376, max=0.701, sum=4.137 (11)", - "tab": "Calibration", - "score": 0.37612291287489436 - }, - "RAFT - EM (Robustness)": { - "description": "min=0.025, mean=0.591, 
max=0.875, sum=6.5 (11)", - "tab": "Robustness", - "score": 0.5909090909090909 - }, - "RAFT - EM (Fairness)": { - "description": "min=0.125, mean=0.62, max=0.875, sum=6.825 (11)", - "tab": "Fairness", - "score": 0.6204545454545454 - }, - "RAFT - Denoised inference time (s)": { - "description": "11 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "RAFT - # eval": { - "description": "min=40, mean=40, max=40, sum=440 (11)", - "tab": "General information", - "score": 40.0 - }, - "RAFT - # train": { - "description": "min=0.45, mean=4.552, max=5, sum=50.075 (11)", - "tab": "General information", - "score": 4.552272727272727 - }, - "RAFT - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (11)", - "tab": "General information", - "score": 0.0 - }, - "RAFT - # prompt tokens": { - "description": "min=303.675, mean=954.111, max=1882.1, sum=10495.225 (11)", - "tab": "General information", - "score": 954.1113636363635 - }, - "RAFT - # output tokens": { - "description": "min=5.3, mean=15.4, max=30, sum=169.4 (11)", - "tab": "General information", - "score": 15.399999999999999 - }, - "RAFT - # trials": { - "description": "min=1, mean=1, max=1, sum=11 (11)", - "tab": "General information", - "score": 1.0 - }, - "RAFT - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Toxic fraction": { - "description": "11 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_classic/lmsys/Vicuna-v1.3-7B/7c0d2405-f12e-4a3b-924f-1b2a86fd4eae.json b/data/helm_classic/lmsys/Vicuna-v1.3-7B/7c0d2405-f12e-4a3b-924f-1b2a86fd4eae.json deleted file mode 100644 index b03d7afe6..000000000 --- a/data/helm_classic/lmsys/Vicuna-v1.3-7B/7c0d2405-f12e-4a3b-924f-1b2a86fd4eae.json +++ /dev/null @@ -1,1613 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_classic/lmsys_Vicuna-v1.3-7B/1770834891.1472661", - "retrieved_timestamp": "1770834891.1472661", - "source_metadata": { - "source_name": "helm_classic", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Vicuna v1.3 7B", - "id": "lmsys/Vicuna-v1.3-7B", - "developer": "lmsys", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_classic", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperform on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.625, - "details": { - "tab": "Accuracy", - "Mean win rate - Calibration": { - "description": null, - "tab": "Calibration", - "score": 0.20388529139685477 - }, - "Mean win rate - Robustness": { - "description": null, - "tab": "Robustness", - "score": 0.662027972027972 - }, - "Mean win rate - Fairness": { - 
"description": null, - "tab": "Fairness", - "score": 0.6221212121212122 - }, - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": null - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - }, - "Mean win rate - Bias": { - "description": null, - "tab": "Bias", - "score": 0.5093893164757827 - }, - "Mean win rate - Toxicity": { - "description": null, - "tab": "Toxicity", - "score": 0.8238927738927739 - }, - "Mean win rate - Summarization metrics": { - "description": null, - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.434, - "details": { - "description": "min=0.228, mean=0.434, max=0.7, sum=2.168 (5)", - "tab": "Accuracy", - "MMLU - ECE (10-bin)": { - "description": "min=0.121, mean=0.176, max=0.315, sum=0.88 (5)", - "tab": "Calibration", - "score": 0.17593793416924502 - }, - "MMLU - EM (Robustness)": { - "description": "min=0.175, mean=0.371, max=0.65, sum=1.855 (5)", - "tab": "Robustness", - "score": 0.3710877192982456 - }, - "MMLU - EM (Fairness)": { - "description": "min=0.184, mean=0.385, max=0.68, sum=1.924 (5)", - "tab": "Fairness", - "score": 0.38484210526315793 - }, - "MMLU - Denoised inference time (s)": { - "description": "5 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=397.65, mean=522.547, max=684.675, sum=2612.735 (5)", - "tab": "General information", - "score": 522.5470877192982 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "MMLU - # trials": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "BoolQ", - "source_data": { - "dataset_name": "BoolQ", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on BoolQ", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.76, - "details": { - "description": "min=0.76, mean=0.76, max=0.76, sum=0.76 (1)", - "tab": "Accuracy", - "BoolQ - ECE (10-bin)": { - "description": "min=0.322, mean=0.322, max=0.322, sum=0.322 (1)", - "tab": "Calibration", - "score": 0.322404542566261 - }, - "BoolQ - EM (Robustness)": { - "description": "min=0.672, mean=0.672, max=0.672, sum=0.672 (1)", - "tab": 
"Robustness", - "score": 0.672 - }, - "BoolQ - EM (Fairness)": { - "description": "min=0.67, mean=0.67, max=0.67, sum=0.67 (1)", - "tab": "Fairness", - "score": 0.67 - }, - "BoolQ - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "BoolQ - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "BoolQ - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "BoolQ - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "BoolQ - # prompt tokens": { - "description": "min=1439.447, mean=1439.447, max=1439.447, sum=1439.447 (1)", - "tab": "General information", - "score": 1439.447 - }, - "BoolQ - # output tokens": { - "description": "min=4.412, mean=4.412, max=4.412, sum=4.412 (1)", - "tab": "General information", - "score": 4.412 - }, - "BoolQ - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "BoolQ - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.643, - "details": { - "description": "min=0.643, mean=0.643, max=0.643, sum=0.643 (1)", - "tab": "Accuracy", - "NarrativeQA - ECE (10-bin)": { - "description": "min=0.084, mean=0.084, max=0.084, sum=0.084 (1)", - "tab": "Calibration", - "score": 0.08355639800803456 - }, - "NarrativeQA - F1 (Robustness)": { - "description": "min=0.5, mean=0.5, max=0.5, sum=0.5 (1)", - "tab": "Robustness", - "score": 0.499695916561912 - }, - "NarrativeQA - F1 (Fairness)": { - "description": "min=0.553, mean=0.553, max=0.553, sum=0.553 (1)", - "tab": "Fairness", - "score": 0.5528194590567359 - }, - "NarrativeQA - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=1.437, mean=1.437, max=1.437, sum=1.437 (1)", - "tab": "General information", - "score": 1.4366197183098592 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=1541.115, mean=1541.115, max=1541.115, sum=1541.115 (1)", - "tab": "General information", - "score": 
1541.1154929577465 - }, - "NarrativeQA - # output tokens": { - "description": "min=19.287, mean=19.287, max=19.287, sum=19.287 (1)", - "tab": "General information", - "score": 19.28732394366197 - }, - "NarrativeQA - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NarrativeQA - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NarrativeQA - Stereotypes (gender)": { - "description": "min=0.379, mean=0.379, max=0.379, sum=0.379 (1)", - "tab": "Bias", - "score": 0.3794642857142857 - }, - "NarrativeQA - Representation (race)": { - "description": "min=0.373, mean=0.373, max=0.373, sum=0.373 (1)", - "tab": "Bias", - "score": 0.37254901960784315 - }, - "NarrativeQA - Representation (gender)": { - "description": "min=0.186, mean=0.186, max=0.186, sum=0.186 (1)", - "tab": "Bias", - "score": 0.18604651162790695 - }, - "NarrativeQA - Toxic fraction": { - "description": "min=0.008, mean=0.008, max=0.008, sum=0.008 (1)", - "tab": "Toxicity", - "score": 0.008450704225352112 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (open-book)", - "source_data": { - "dataset_name": "NaturalQuestions (open-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (open-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.634, - "details": { - "description": "min=0.634, mean=0.634, max=0.634, sum=0.634 (1)", - "tab": "Accuracy", - "NaturalQuestions (closed-book) - ECE (10-bin)": { - "description": "min=0.162, mean=0.162, max=0.162, sum=0.162 (1)", - "tab": "Calibration", - "score": 0.16180078530132275 - }, - "NaturalQuestions (open-book) - ECE (10-bin)": { - "description": "min=0.413, mean=0.413, max=0.413, sum=0.413 (1)", - "tab": "Calibration", - "score": 0.41328409267406696 - }, - "NaturalQuestions (closed-book) - F1 (Robustness)": { - "description": "min=0.214, mean=0.214, max=0.214, sum=0.214 (1)", - "tab": "Robustness", - "score": 0.213860378689308 - }, - "NaturalQuestions (open-book) - F1 (Robustness)": { - "description": "min=0.539, mean=0.539, max=0.539, sum=0.539 (1)", - "tab": "Robustness", - "score": 0.5393637207184442 - }, - "NaturalQuestions (closed-book) - F1 (Fairness)": { - "description": "min=0.224, mean=0.224, max=0.224, sum=0.224 (1)", - "tab": "Fairness", - "score": 0.22422961995096835 - }, - "NaturalQuestions (open-book) - F1 (Fairness)": { - "description": "min=0.575, mean=0.575, max=0.575, sum=0.575 (1)", - "tab": "Fairness", - "score": 0.5749345098495453 - }, - "NaturalQuestions (closed-book) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NaturalQuestions (open-book) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - 
truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=137.383, mean=137.383, max=137.383, sum=137.383 (1)", - "tab": "General information", - "score": 137.383 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=296.95, mean=296.95, max=296.95, sum=296.95 (1)", - "tab": "General information", - "score": 296.95 - }, - "NaturalQuestions (closed-book) - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=3.722, mean=3.722, max=3.722, sum=3.722 (1)", - "tab": "General information", - "score": 3.722 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.049, mean=0.049, max=0.049, sum=0.049 (1)", - "tab": "General information", - "score": 0.049 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1407.178, mean=1407.178, max=1407.178, sum=1407.178 (1)", - "tab": "General information", - "score": 1407.178 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=286.175, mean=286.175, max=286.175, sum=286.175 (1)", - "tab": "General information", - "score": 286.175 - }, - "NaturalQuestions (open-book) - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NaturalQuestions (closed-book) - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=0.667 (1)", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "NaturalQuestions (closed-book) - Stereotypes (gender)": { - "description": "min=0.333, mean=0.333, max=0.333, sum=0.333 (1)", - "tab": "Bias", - "score": 0.3333333333333333 - }, - "NaturalQuestions (closed-book) - Representation (race)": { - "description": "min=0.55, mean=0.55, max=0.55, sum=0.55 (1)", - "tab": "Bias", - "score": 0.5497835497835497 - }, - "NaturalQuestions (closed-book) - Representation (gender)": { - "description": "min=0.324, mean=0.324, max=0.324, sum=0.324 (1)", - "tab": "Bias", - "score": 0.32352941176470584 - }, - "NaturalQuestions (open-book) - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=0.667 (1)", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "NaturalQuestions (open-book) - Stereotypes (gender)": { - "description": "min=0.5, mean=0.5, max=0.5, sum=0.5 (1)", - "tab": "Bias", - "score": 0.5 - }, - "NaturalQuestions (open-book) - Representation (race)": { - "description": "min=0.521, mean=0.521, max=0.521, sum=0.521 (1)", - "tab": "Bias", - "score": 0.5205992509363295 - }, - "NaturalQuestions (open-book) - Representation (gender)": { - "description": "min=0.458, mean=0.458, max=0.458, sum=0.458 (1)", - "tab": "Bias", - "score": 0.45833333333333326 - }, - "NaturalQuestions (closed-book) - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "Toxicity", - "score": 0.0 - }, - "NaturalQuestions (open-book) - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "QuAC", - "source_data": { - "dataset_name": "QuAC", - "source_type": 
"url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on QuAC", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.392, - "details": { - "description": "min=0.392, mean=0.392, max=0.392, sum=0.392 (1)", - "tab": "Accuracy", - "QuAC - ECE (10-bin)": { - "description": "min=0.109, mean=0.109, max=0.109, sum=0.109 (1)", - "tab": "Calibration", - "score": 0.10940664349880716 - }, - "QuAC - F1 (Robustness)": { - "description": "min=0.25, mean=0.25, max=0.25, sum=0.25 (1)", - "tab": "Robustness", - "score": 0.24986961512093836 - }, - "QuAC - F1 (Fairness)": { - "description": "min=0.304, mean=0.304, max=0.304, sum=0.304 (1)", - "tab": "Fairness", - "score": 0.3036739587215963 - }, - "QuAC - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "QuAC - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "QuAC - # train": { - "description": "min=0.507, mean=0.507, max=0.507, sum=0.507 (1)", - "tab": "General information", - "score": 0.507 - }, - "QuAC - truncated": { - "description": "min=0.06, mean=0.06, max=0.06, sum=0.06 (1)", - "tab": "General information", - "score": 0.06 - }, - "QuAC - # prompt tokens": { - "description": "min=1498.657, mean=1498.657, max=1498.657, sum=1498.657 (1)", - "tab": "General information", - "score": 1498.657 - }, - "QuAC - # output tokens": { - "description": "min=77.25, mean=77.25, max=77.25, sum=77.25 (1)", - "tab": "General information", - "score": 77.25 - }, - "QuAC - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "QuAC - Stereotypes (race)": { - "description": "min=0.633, mean=0.633, max=0.633, sum=0.633 (1)", - "tab": "Bias", - "score": 0.6333333333333334 - }, - "QuAC - Stereotypes (gender)": { - "description": "min=0.416, mean=0.416, max=0.416, sum=0.416 (1)", - "tab": "Bias", - "score": 0.41569852337396196 - }, - "QuAC - Representation (race)": { - "description": "min=0.277, mean=0.277, max=0.277, sum=0.277 (1)", - "tab": "Bias", - "score": 0.27653213751868466 - }, - "QuAC - Representation (gender)": { - "description": "min=0.255, mean=0.255, max=0.255, sum=0.255 (1)", - "tab": "Bias", - "score": 0.2550295857988165 - }, - "QuAC - Toxic fraction": { - "description": "min=0.001, mean=0.001, max=0.001, sum=0.001 (1)", - "tab": "Toxicity", - "score": 0.001 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "HellaSwag", - "source_data": { - "dataset_name": "HellaSwag", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on HellaSwag", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "HellaSwag - ECE (10-bin)": { - "description": "No matching runs", - "tab": "Calibration", - "score": null - }, - "HellaSwag - EM (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "HellaSwag - EM (Fairness)": { - 
"description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "HellaSwag - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "HellaSwag - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "OpenbookQA - ECE (10-bin)": { - "description": "No matching runs", - "tab": "Calibration", - "score": null - }, - "OpenbookQA - EM (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "OpenbookQA - EM (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "OpenbookQA - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "OpenbookQA - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "TruthfulQA", - "source_data": { - "dataset_name": "TruthfulQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on TruthfulQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.292, - "details": { - "description": "min=0.292, mean=0.292, max=0.292, sum=0.292 (1)", - "tab": "Accuracy", - "TruthfulQA - ECE (10-bin)": { - "description": "min=0.227, mean=0.227, max=0.227, sum=0.227 (1)", - "tab": "Calibration", - "score": 0.22667464300561196 - }, - "TruthfulQA - EM (Robustness)": { - 
"description": "min=0.258, mean=0.258, max=0.258, sum=0.258 (1)", - "tab": "Robustness", - "score": 0.25840978593272174 - }, - "TruthfulQA - EM (Fairness)": { - "description": "min=0.235, mean=0.235, max=0.235, sum=0.235 (1)", - "tab": "Fairness", - "score": 0.23547400611620795 - }, - "TruthfulQA - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "TruthfulQA - # eval": { - "description": "min=654, mean=654, max=654, sum=654 (1)", - "tab": "General information", - "score": 654.0 - }, - "TruthfulQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "TruthfulQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "TruthfulQA - # prompt tokens": { - "description": "min=524.602, mean=524.602, max=524.602, sum=524.602 (1)", - "tab": "General information", - "score": 524.6024464831804 - }, - "TruthfulQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "TruthfulQA - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MS MARCO (TREC)", - "source_data": { - "dataset_name": "MS MARCO (TREC)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "NDCG@10 on MS MARCO (TREC)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "MS MARCO (regular) - RR@10 (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "MS MARCO (TREC) - NDCG@10 (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "MS MARCO (regular) - RR@10 (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "MS MARCO (TREC) - NDCG@10 (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "MS MARCO (regular) - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "MS MARCO (TREC) - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "MS MARCO (regular) - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # eval": { - "description": "No matching runs", - "tab": "General 
information", - "score": null - }, - "MS MARCO (TREC) - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - }, - "MS MARCO (TREC) - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CNN/DailyMail", - "source_data": { - "dataset_name": "CNN/DailyMail", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on CNN/DailyMail", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "CNN/DailyMail - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "CNN/DailyMail - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Stereotypes (gender)": { - 
"description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - }, - "CNN/DailyMail - SummaC": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - QAFactEval": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - BERTScore (F1)": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - Coverage": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - Density": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - Compression": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-faithfulness": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-relevance": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-coherence": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "XSUM", - "source_data": { - "dataset_name": "XSUM", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on XSUM", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "XSUM - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "XSUM - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - }, - "XSUM - 
SummaC": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - QAFactEval": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - BERTScore (F1)": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - Coverage": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - Density": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - Compression": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-faithfulness": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-relevance": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-coherence": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "IMDB", - "source_data": { - "dataset_name": "IMDB", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on IMDB", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.916, - "details": { - "description": "min=0.916, mean=0.916, max=0.916, sum=0.916 (1)", - "tab": "Accuracy", - "IMDB - ECE (10-bin)": { - "description": "min=0.348, mean=0.348, max=0.348, sum=0.348 (1)", - "tab": "Calibration", - "score": 0.34781631358579634 - }, - "IMDB - EM (Robustness)": { - "description": "min=0.882, mean=0.882, max=0.882, sum=0.882 (1)", - "tab": "Robustness", - "score": 0.882 - }, - "IMDB - EM (Fairness)": { - "description": "min=0.906, mean=0.906, max=0.906, sum=0.906 (1)", - "tab": "Fairness", - "score": 0.906 - }, - "IMDB - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "IMDB - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "IMDB - # train": { - "description": "min=2.781, mean=2.781, max=2.781, sum=2.781 (1)", - "tab": "General information", - "score": 2.781 - }, - "IMDB - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IMDB - # prompt tokens": { - "description": "min=1751.213, mean=1751.213, max=1751.213, sum=1751.213 (1)", - "tab": "General information", - "score": 1751.213 - }, - "IMDB - # output tokens": { - "description": "min=3.258, mean=3.258, max=3.258, sum=3.258 (1)", - "tab": "General information", - "score": 3.258 - }, - "IMDB - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "IMDB - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation 
(gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CivilComments", - "source_data": { - "dataset_name": "CivilComments", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on CivilComments", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.62, - "details": { - "description": "min=0.154, mean=0.62, max=0.98, sum=11.166 (18)", - "tab": "Accuracy", - "CivilComments - ECE (10-bin)": { - "description": "min=0.13, mean=0.346, max=0.589, sum=6.236 (18)", - "tab": "Calibration", - "score": 0.3464227204141308 - }, - "CivilComments - EM (Robustness)": { - "description": "min=0.125, mean=0.543, max=0.918, sum=9.77 (18)", - "tab": "Robustness", - "score": 0.5427815962078022 - }, - "CivilComments - EM (Fairness)": { - "description": "min=0.116, mean=0.564, max=0.974, sum=10.144 (18)", - "tab": "Fairness", - "score": 0.5635727085389178 - }, - "CivilComments - Denoised inference time (s)": { - "description": "9 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "CivilComments - # eval": { - "description": "min=74, mean=371.556, max=683, sum=6688 (18)", - "tab": "General information", - "score": 371.55555555555554 - }, - "CivilComments - # train": { - "description": "min=5, mean=5, max=5, sum=90 (18)", - "tab": "General information", - "score": 5.0 - }, - "CivilComments - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (18)", - "tab": "General information", - "score": 0.0 - }, - "CivilComments - # prompt tokens": { - "description": "min=404.732, mean=855.241, max=1417.567, sum=15394.339 (18)", - "tab": "General information", - "score": 855.2410378605821 - }, - "CivilComments - # output tokens": { - "description": "min=4.854, mean=4.98, max=5, sum=89.64 (18)", - "tab": "General information", - "score": 4.980000522687608 - }, - "CivilComments - # trials": { - "description": "min=1, mean=1, max=1, sum=18 (18)", - "tab": "General information", - "score": 1.0 - }, - "CivilComments - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Toxic fraction": { - "description": "9 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "RAFT", - "source_data": { - "dataset_name": "RAFT", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on RAFT", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.693, - "details": { - 
"description": "min=0.275, mean=0.693, max=0.975, sum=7.625 (11)", - "tab": "Accuracy", - "RAFT - ECE (10-bin)": { - "description": "min=0.126, mean=0.601, max=0.963, sum=6.61 (11)", - "tab": "Calibration", - "score": 0.6009008385490167 - }, - "RAFT - EM (Robustness)": { - "description": "min=0, mean=0.6, max=0.85, sum=6.6 (11)", - "tab": "Robustness", - "score": 0.6000000000000001 - }, - "RAFT - EM (Fairness)": { - "description": "min=0.2, mean=0.643, max=0.975, sum=7.075 (11)", - "tab": "Fairness", - "score": 0.6431818181818182 - }, - "RAFT - Denoised inference time (s)": { - "description": "11 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "RAFT - # eval": { - "description": "min=40, mean=40, max=40, sum=440 (11)", - "tab": "General information", - "score": 40.0 - }, - "RAFT - # train": { - "description": "min=0.45, mean=4.552, max=5, sum=50.075 (11)", - "tab": "General information", - "score": 4.552272727272727 - }, - "RAFT - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (11)", - "tab": "General information", - "score": 0.0 - }, - "RAFT - # prompt tokens": { - "description": "min=303.675, mean=954.111, max=1882.1, sum=10495.225 (11)", - "tab": "General information", - "score": 954.1113636363635 - }, - "RAFT - # output tokens": { - "description": "min=5.8, mean=24.4, max=30, sum=268.4 (11)", - "tab": "General information", - "score": 24.4 - }, - "RAFT - # trials": { - "description": "min=1, mean=1, max=1, sum=11 (11)", - "tab": "General information", - "score": 1.0 - }, - "RAFT - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Toxic fraction": { - "description": "11 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_classic/meta/LLaMA-13B/d65d8f48-8b8e-4ec6-af68-f61af5408adf.json b/data/helm_classic/meta/LLaMA-13B/d65d8f48-8b8e-4ec6-af68-f61af5408adf.json deleted file mode 100644 index 959b52195..000000000 --- a/data/helm_classic/meta/LLaMA-13B/d65d8f48-8b8e-4ec6-af68-f61af5408adf.json +++ /dev/null @@ -1,1613 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_classic/meta_LLaMA-13B/1770834891.1472661", - "retrieved_timestamp": "1770834891.1472661", - "source_metadata": { - "source_name": "helm_classic", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "LLaMA 13B", - "id": "meta/LLaMA-13B", - "developer": "meta", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_classic", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperform on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.595, - "details": { - "tab": "Accuracy", - "Mean win rate - 
Calibration": { - "description": null, - "tab": "Calibration", - "score": null - }, - "Mean win rate - Robustness": { - "description": null, - "tab": "Robustness", - "score": 0.6374592074592075 - }, - "Mean win rate - Fairness": { - "description": null, - "tab": "Fairness", - "score": 0.6022144522144522 - }, - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": null - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - }, - "Mean win rate - Bias": { - "description": null, - "tab": "Bias", - "score": 0.5777177774710669 - }, - "Mean win rate - Toxicity": { - "description": null, - "tab": "Toxicity", - "score": 0.6102564102564103 - }, - "Mean win rate - Summarization metrics": { - "description": null, - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.422, - "details": { - "description": "min=0.2, mean=0.422, max=0.76, sum=2.111 (5)", - "tab": "Accuracy", - "MMLU - ECE (10-bin)": { - "description": "min=0.127, mean=0.15, max=0.18, sum=0.748 (5)", - "tab": "Calibration", - "score": null - }, - "MMLU - EM (Robustness)": { - "description": "min=0.14, mean=0.37, max=0.68, sum=1.848 (5)", - "tab": "Robustness", - "score": 0.3696140350877193 - }, - "MMLU - EM (Fairness)": { - "description": "min=0.18, mean=0.385, max=0.71, sum=1.927 (5)", - "tab": "Fairness", - "score": 0.3853684210526316 - }, - "MMLU - Denoised inference time (s)": { - "description": "5 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=397.65, mean=522.547, max=684.675, sum=2612.735 (5)", - "tab": "General information", - "score": 522.5470877192982 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "MMLU - # trials": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "BoolQ", - "source_data": { - "dataset_name": "BoolQ", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on BoolQ", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.714, - "details": { - "description": "min=0.714, mean=0.714, max=0.714, sum=0.714 (1)", - "tab": "Accuracy", - "BoolQ - ECE (10-bin)": { - "description": 
"min=0.078, mean=0.078, max=0.078, sum=0.078 (1)", - "tab": "Calibration", - "score": null - }, - "BoolQ - EM (Robustness)": { - "description": "min=0.67, mean=0.67, max=0.67, sum=0.67 (1)", - "tab": "Robustness", - "score": 0.67 - }, - "BoolQ - EM (Fairness)": { - "description": "min=0.666, mean=0.666, max=0.666, sum=0.666 (1)", - "tab": "Fairness", - "score": 0.666 - }, - "BoolQ - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "BoolQ - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "BoolQ - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "BoolQ - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "BoolQ - # prompt tokens": { - "description": "min=1439.447, mean=1439.447, max=1439.447, sum=1439.447 (1)", - "tab": "General information", - "score": 1439.447 - }, - "BoolQ - # output tokens": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "BoolQ - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "BoolQ - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.711, - "details": { - "description": "min=0.711, mean=0.711, max=0.711, sum=0.711 (1)", - "tab": "Accuracy", - "NarrativeQA - ECE (10-bin)": { - "description": "min=0.293, mean=0.293, max=0.293, sum=0.293 (1)", - "tab": "Calibration", - "score": null - }, - "NarrativeQA - F1 (Robustness)": { - "description": "min=0.544, mean=0.544, max=0.544, sum=0.544 (1)", - "tab": "Robustness", - "score": 0.543905538434645 - }, - "NarrativeQA - F1 (Fairness)": { - "description": "min=0.628, mean=0.628, max=0.628, sum=0.628 (1)", - "tab": "Fairness", - "score": 0.6277072207288055 - }, - "NarrativeQA - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=1.437, mean=1.437, max=1.437, sum=1.437 (1)", - "tab": "General information", - "score": 1.4366197183098592 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - 
"score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=1541.115, mean=1541.115, max=1541.115, sum=1541.115 (1)", - "tab": "General information", - "score": 1541.1154929577465 - }, - "NarrativeQA - # output tokens": { - "description": "min=100, mean=100, max=100, sum=100 (1)", - "tab": "General information", - "score": 100.0 - }, - "NarrativeQA - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NarrativeQA - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NarrativeQA - Stereotypes (gender)": { - "description": "min=0.417, mean=0.417, max=0.417, sum=0.417 (1)", - "tab": "Bias", - "score": 0.4166666666666667 - }, - "NarrativeQA - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NarrativeQA - Representation (gender)": { - "description": "min=0.224, mean=0.224, max=0.224, sum=0.224 (1)", - "tab": "Bias", - "score": 0.22357723577235772 - }, - "NarrativeQA - Toxic fraction": { - "description": "min=0.008, mean=0.008, max=0.008, sum=0.008 (1)", - "tab": "Toxicity", - "score": 0.008450704225352112 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (open-book)", - "source_data": { - "dataset_name": "NaturalQuestions (open-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (open-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.614, - "details": { - "description": "min=0.614, mean=0.614, max=0.614, sum=0.614 (1)", - "tab": "Accuracy", - "NaturalQuestions (closed-book) - ECE (10-bin)": { - "description": "min=0.227, mean=0.227, max=0.227, sum=0.227 (1)", - "tab": "Calibration", - "score": null - }, - "NaturalQuestions (open-book) - ECE (10-bin)": { - "description": "min=0.414, mean=0.414, max=0.414, sum=0.414 (1)", - "tab": "Calibration", - "score": null - }, - "NaturalQuestions (closed-book) - F1 (Robustness)": { - "description": "min=0.272, mean=0.272, max=0.272, sum=0.272 (1)", - "tab": "Robustness", - "score": 0.27211691617574163 - }, - "NaturalQuestions (open-book) - F1 (Robustness)": { - "description": "min=0.556, mean=0.556, max=0.556, sum=0.556 (1)", - "tab": "Robustness", - "score": 0.5559403134593146 - }, - "NaturalQuestions (closed-book) - F1 (Fairness)": { - "description": "min=0.288, mean=0.288, max=0.288, sum=0.288 (1)", - "tab": "Fairness", - "score": 0.28794490645078735 - }, - "NaturalQuestions (open-book) - F1 (Fairness)": { - "description": "min=0.561, mean=0.561, max=0.561, sum=0.561 (1)", - "tab": "Fairness", - "score": 0.5608161827325524 - }, - "NaturalQuestions (closed-book) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NaturalQuestions (open-book) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General 
information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=137.383, mean=137.383, max=137.383, sum=137.383 (1)", - "tab": "General information", - "score": 137.383 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=300, mean=300, max=300, sum=300 (1)", - "tab": "General information", - "score": 300.0 - }, - "NaturalQuestions (closed-book) - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=3.722, mean=3.722, max=3.722, sum=3.722 (1)", - "tab": "General information", - "score": 3.722 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.049, mean=0.049, max=0.049, sum=0.049 (1)", - "tab": "General information", - "score": 0.049 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1407.178, mean=1407.178, max=1407.178, sum=1407.178 (1)", - "tab": "General information", - "score": 1407.178 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=300, mean=300, max=300, sum=300 (1)", - "tab": "General information", - "score": 300.0 - }, - "NaturalQuestions (open-book) - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NaturalQuestions (closed-book) - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=0.667 (1)", - "tab": "Bias", - "score": 0.6666666666666667 - }, - "NaturalQuestions (closed-book) - Stereotypes (gender)": { - "description": "min=0.5, mean=0.5, max=0.5, sum=0.5 (1)", - "tab": "Bias", - "score": 0.5 - }, - "NaturalQuestions (closed-book) - Representation (race)": { - "description": "min=0.438, mean=0.438, max=0.438, sum=0.438 (1)", - "tab": "Bias", - "score": 0.43775100401606426 - }, - "NaturalQuestions (closed-book) - Representation (gender)": { - "description": "min=0.167, mean=0.167, max=0.167, sum=0.167 (1)", - "tab": "Bias", - "score": 0.16666666666666666 - }, - "NaturalQuestions (open-book) - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NaturalQuestions (open-book) - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NaturalQuestions (open-book) - Representation (race)": { - "description": "min=0.333, mean=0.333, max=0.333, sum=0.333 (1)", - "tab": "Bias", - "score": 0.3333333333333333 - }, - "NaturalQuestions (open-book) - Representation (gender)": { - "description": "min=0.083, mean=0.083, max=0.083, sum=0.083 (1)", - "tab": "Bias", - "score": 0.08333333333333334 - }, - "NaturalQuestions (closed-book) - Toxic fraction": { - "description": "min=0.001, mean=0.001, max=0.001, sum=0.001 (1)", - "tab": "Toxicity", - "score": 0.001 - }, - "NaturalQuestions (open-book) - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "QuAC", - "source_data": { - "dataset_name": "QuAC", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on QuAC", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.347, - "details": { - "description": "min=0.347, mean=0.347, max=0.347, sum=0.347 (1)", - "tab": "Accuracy", - "QuAC - ECE (10-bin)": { - "description": "min=0.172, mean=0.172, max=0.172, sum=0.172 (1)", - "tab": "Calibration", - "score": null - }, - "QuAC - F1 (Robustness)": { - "description": "min=0.194, mean=0.194, max=0.194, sum=0.194 (1)", - "tab": "Robustness", - "score": 0.19407861446110536 - }, - "QuAC - F1 (Fairness)": { - "description": "min=0.267, mean=0.267, max=0.267, sum=0.267 (1)", - "tab": "Fairness", - "score": 0.26734169068478314 - }, - "QuAC - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "QuAC - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "QuAC - # train": { - "description": "min=0.507, mean=0.507, max=0.507, sum=0.507 (1)", - "tab": "General information", - "score": 0.507 - }, - "QuAC - truncated": { - "description": "min=0.06, mean=0.06, max=0.06, sum=0.06 (1)", - "tab": "General information", - "score": 0.06 - }, - "QuAC - # prompt tokens": { - "description": "min=1498.657, mean=1498.657, max=1498.657, sum=1498.657 (1)", - "tab": "General information", - "score": 1498.657 - }, - "QuAC - # output tokens": { - "description": "min=99.882, mean=99.882, max=99.882, sum=99.882 (1)", - "tab": "General information", - "score": 99.882 - }, - "QuAC - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "QuAC - Stereotypes (race)": { - "description": "min=0.605, mean=0.605, max=0.605, sum=0.605 (1)", - "tab": "Bias", - "score": 0.6047619047619048 - }, - "QuAC - Stereotypes (gender)": { - "description": "min=0.444, mean=0.444, max=0.444, sum=0.444 (1)", - "tab": "Bias", - "score": 0.44425076013311304 - }, - "QuAC - Representation (race)": { - "description": "min=0.276, mean=0.276, max=0.276, sum=0.276 (1)", - "tab": "Bias", - "score": 0.2761904761904762 - }, - "QuAC - Representation (gender)": { - "description": "min=0.224, mean=0.224, max=0.224, sum=0.224 (1)", - "tab": "Bias", - "score": 0.22388059701492535 - }, - "QuAC - Toxic fraction": { - "description": "min=0.003, mean=0.003, max=0.003, sum=0.003 (1)", - "tab": "Toxicity", - "score": 0.003 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "HellaSwag", - "source_data": { - "dataset_name": "HellaSwag", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on HellaSwag", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "HellaSwag - ECE (10-bin)": { - "description": "No matching runs", - "tab": "Calibration", - "score": null - }, - "HellaSwag - EM (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "HellaSwag - EM (Fairness)": { - "description": "No matching 
runs", - "tab": "Fairness", - "score": null - }, - "HellaSwag - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "HellaSwag - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "OpenbookQA - ECE (10-bin)": { - "description": "No matching runs", - "tab": "Calibration", - "score": null - }, - "OpenbookQA - EM (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "OpenbookQA - EM (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "OpenbookQA - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "OpenbookQA - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "TruthfulQA", - "source_data": { - "dataset_name": "TruthfulQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on TruthfulQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.324, - "details": { - "description": "min=0.324, mean=0.324, max=0.324, sum=0.324 (1)", - "tab": "Accuracy", - "TruthfulQA - ECE (10-bin)": { - "description": "min=0.193, mean=0.193, max=0.193, sum=0.193 (1)", - "tab": "Calibration", - "score": null - }, - "TruthfulQA - EM (Robustness)": { - "description": "min=0.274, mean=0.274, 
max=0.274, sum=0.274 (1)", - "tab": "Robustness", - "score": 0.27370030581039756 - }, - "TruthfulQA - EM (Fairness)": { - "description": "min=0.234, mean=0.234, max=0.234, sum=0.234 (1)", - "tab": "Fairness", - "score": 0.23394495412844038 - }, - "TruthfulQA - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "TruthfulQA - # eval": { - "description": "min=654, mean=654, max=654, sum=654 (1)", - "tab": "General information", - "score": 654.0 - }, - "TruthfulQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "TruthfulQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "TruthfulQA - # prompt tokens": { - "description": "min=524.602, mean=524.602, max=524.602, sum=524.602 (1)", - "tab": "General information", - "score": 524.6024464831804 - }, - "TruthfulQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "TruthfulQA - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MS MARCO (TREC)", - "source_data": { - "dataset_name": "MS MARCO (TREC)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "NDCG@10 on MS MARCO (TREC)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "MS MARCO (regular) - RR@10 (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "MS MARCO (TREC) - NDCG@10 (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "MS MARCO (regular) - RR@10 (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "MS MARCO (TREC) - NDCG@10 (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "MS MARCO (regular) - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "MS MARCO (TREC) - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "MS MARCO (regular) - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO 
(TREC) - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - }, - "MS MARCO (TREC) - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CNN/DailyMail", - "source_data": { - "dataset_name": "CNN/DailyMail", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on CNN/DailyMail", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "CNN/DailyMail - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "CNN/DailyMail - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - 
"score": null - }, - "CNN/DailyMail - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - }, - "CNN/DailyMail - SummaC": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - QAFactEval": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - BERTScore (F1)": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - Coverage": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - Density": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - Compression": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-faithfulness": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-relevance": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-coherence": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "XSUM", - "source_data": { - "dataset_name": "XSUM", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on XSUM", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "XSUM - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "XSUM - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - }, - "XSUM - SummaC": { - "description": "No matching runs", - "tab": 
"Summarization metrics", - "score": null - }, - "XSUM - QAFactEval": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - BERTScore (F1)": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - Coverage": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - Density": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - Compression": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-faithfulness": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-relevance": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-coherence": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "IMDB", - "source_data": { - "dataset_name": "IMDB", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on IMDB", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.928, - "details": { - "description": "min=0.928, mean=0.928, max=0.928, sum=0.928 (1)", - "tab": "Accuracy", - "IMDB - ECE (10-bin)": { - "description": "min=0.302, mean=0.302, max=0.302, sum=0.302 (1)", - "tab": "Calibration", - "score": null - }, - "IMDB - EM (Robustness)": { - "description": "min=0.875, mean=0.875, max=0.875, sum=0.875 (1)", - "tab": "Robustness", - "score": 0.875 - }, - "IMDB - EM (Fairness)": { - "description": "min=0.903, mean=0.903, max=0.903, sum=0.903 (1)", - "tab": "Fairness", - "score": 0.903 - }, - "IMDB - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "IMDB - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "IMDB - # train": { - "description": "min=2.781, mean=2.781, max=2.781, sum=2.781 (1)", - "tab": "General information", - "score": 2.781 - }, - "IMDB - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IMDB - # prompt tokens": { - "description": "min=1751.213, mean=1751.213, max=1751.213, sum=1751.213 (1)", - "tab": "General information", - "score": 1751.213 - }, - "IMDB - # output tokens": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "IMDB - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "IMDB - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - 
"score": null - }, - "IMDB - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CivilComments", - "source_data": { - "dataset_name": "CivilComments", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on CivilComments", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6, - "details": { - "description": "min=0.118, mean=0.6, max=0.947, sum=10.797 (18)", - "tab": "Accuracy", - "CivilComments - ECE (10-bin)": { - "description": "min=0.098, mean=0.295, max=0.455, sum=5.305 (18)", - "tab": "Calibration", - "score": null - }, - "CivilComments - EM (Robustness)": { - "description": "min=0.079, mean=0.529, max=0.947, sum=9.523 (18)", - "tab": "Robustness", - "score": 0.529079897678074 - }, - "CivilComments - EM (Fairness)": { - "description": "min=0.054, mean=0.533, max=0.947, sum=9.585 (18)", - "tab": "Fairness", - "score": 0.5325232651113918 - }, - "CivilComments - Denoised inference time (s)": { - "description": "9 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "CivilComments - # eval": { - "description": "min=74, mean=371.556, max=683, sum=6688 (18)", - "tab": "General information", - "score": 371.55555555555554 - }, - "CivilComments - # train": { - "description": "min=5, mean=5, max=5, sum=90 (18)", - "tab": "General information", - "score": 5.0 - }, - "CivilComments - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (18)", - "tab": "General information", - "score": 0.0 - }, - "CivilComments - # prompt tokens": { - "description": "min=404.732, mean=855.241, max=1417.567, sum=15394.339 (18)", - "tab": "General information", - "score": 855.2410378605821 - }, - "CivilComments - # output tokens": { - "description": "min=5, mean=5, max=5, sum=90 (18)", - "tab": "General information", - "score": 5.0 - }, - "CivilComments - # trials": { - "description": "min=1, mean=1, max=1, sum=18 (18)", - "tab": "General information", - "score": 1.0 - }, - "CivilComments - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Toxic fraction": { - "description": "9 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "RAFT", - "source_data": { - "dataset_name": "RAFT", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on RAFT", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.643, - "details": { - "description": "min=0.125, mean=0.643, max=0.925, sum=7.075 (11)", - "tab": "Accuracy", - "RAFT - ECE (10-bin)": { - "description": 
"min=0.132, mean=0.644, max=0.925, sum=7.081 (11)", - "tab": "Calibration", - "score": null - }, - "RAFT - EM (Robustness)": { - "description": "min=0, mean=0.559, max=0.9, sum=6.15 (11)", - "tab": "Robustness", - "score": 0.5590909090909091 - }, - "RAFT - EM (Fairness)": { - "description": "min=0.075, mean=0.605, max=0.9, sum=6.65 (11)", - "tab": "Fairness", - "score": 0.6045454545454545 - }, - "RAFT - Denoised inference time (s)": { - "description": "11 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "RAFT - # eval": { - "description": "min=40, mean=40, max=40, sum=440 (11)", - "tab": "General information", - "score": 40.0 - }, - "RAFT - # train": { - "description": "min=0.45, mean=4.552, max=5, sum=50.075 (11)", - "tab": "General information", - "score": 4.552272727272727 - }, - "RAFT - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (11)", - "tab": "General information", - "score": 0.0 - }, - "RAFT - # prompt tokens": { - "description": "min=303.675, mean=954.111, max=1882.1, sum=10495.225 (11)", - "tab": "General information", - "score": 954.1113636363635 - }, - "RAFT - # output tokens": { - "description": "min=22.975, mean=29.361, max=30, sum=322.975 (11)", - "tab": "General information", - "score": 29.361363636363638 - }, - "RAFT - # trials": { - "description": "min=1, mean=1, max=1, sum=11 (11)", - "tab": "General information", - "score": 1.0 - }, - "RAFT - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Toxic fraction": { - "description": "11 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_classic/meta/LLaMA-30B/dff69882-cb8b-4323-b587-60f295085459.json b/data/helm_classic/meta/LLaMA-30B/dff69882-cb8b-4323-b587-60f295085459.json deleted file mode 100644 index 7f604e015..000000000 --- a/data/helm_classic/meta/LLaMA-30B/dff69882-cb8b-4323-b587-60f295085459.json +++ /dev/null @@ -1,1613 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_classic/meta_LLaMA-30B/1770834891.1472661", - "retrieved_timestamp": "1770834891.1472661", - "source_metadata": { - "source_name": "helm_classic", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "LLaMA 30B", - "id": "meta/LLaMA-30B", - "developer": "meta", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_classic", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperform on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.781, - "details": { - "tab": "Accuracy", - "Mean win rate - Calibration": { - "description": null, - "tab": "Calibration", - "score": null - }, - "Mean win rate - Robustness": { - 
"description": null, - "tab": "Robustness", - "score": 0.8149650349650349 - }, - "Mean win rate - Fairness": { - "description": null, - "tab": "Fairness", - "score": 0.8224708624708624 - }, - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": null - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - }, - "Mean win rate - Bias": { - "description": null, - "tab": "Bias", - "score": 0.5955016826844834 - }, - "Mean win rate - Toxicity": { - "description": null, - "tab": "Toxicity", - "score": 0.6467365967365968 - }, - "Mean win rate - Summarization metrics": { - "description": null, - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.531, - "details": { - "description": "min=0.33, mean=0.531, max=0.83, sum=2.657 (5)", - "tab": "Accuracy", - "MMLU - ECE (10-bin)": { - "description": "min=0.051, mean=0.093, max=0.139, sum=0.464 (5)", - "tab": "Calibration", - "score": null - }, - "MMLU - EM (Robustness)": { - "description": "min=0.22, mean=0.461, max=0.82, sum=2.305 (5)", - "tab": "Robustness", - "score": 0.4609122807017544 - }, - "MMLU - EM (Fairness)": { - "description": "min=0.28, mean=0.496, max=0.81, sum=2.481 (5)", - "tab": "Fairness", - "score": 0.49617543859649127 - }, - "MMLU - Denoised inference time (s)": { - "description": "5 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=397.65, mean=522.547, max=684.675, sum=2612.735 (5)", - "tab": "General information", - "score": 522.5470877192982 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "MMLU - # trials": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "BoolQ", - "source_data": { - "dataset_name": "BoolQ", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on BoolQ", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.861, - "details": { - "description": "min=0.861, mean=0.861, max=0.861, sum=0.861 (1)", - "tab": "Accuracy", - "BoolQ - ECE (10-bin)": { - "description": "min=0.164, mean=0.164, max=0.164, sum=0.164 (1)", - "tab": "Calibration", - "score": null - }, - "BoolQ - EM 
(Robustness)": { - "description": "min=0.791, mean=0.791, max=0.791, sum=0.791 (1)", - "tab": "Robustness", - "score": 0.791 - }, - "BoolQ - EM (Fairness)": { - "description": "min=0.813, mean=0.813, max=0.813, sum=0.813 (1)", - "tab": "Fairness", - "score": 0.813 - }, - "BoolQ - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "BoolQ - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "BoolQ - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "BoolQ - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "BoolQ - # prompt tokens": { - "description": "min=1439.447, mean=1439.447, max=1439.447, sum=1439.447 (1)", - "tab": "General information", - "score": 1439.447 - }, - "BoolQ - # output tokens": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "BoolQ - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "BoolQ - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.752, - "details": { - "description": "min=0.752, mean=0.752, max=0.752, sum=0.752 (1)", - "tab": "Accuracy", - "NarrativeQA - ECE (10-bin)": { - "description": "min=0.296, mean=0.296, max=0.296, sum=0.296 (1)", - "tab": "Calibration", - "score": null - }, - "NarrativeQA - F1 (Robustness)": { - "description": "min=0.611, mean=0.611, max=0.611, sum=0.611 (1)", - "tab": "Robustness", - "score": 0.6105202153922532 - }, - "NarrativeQA - F1 (Fairness)": { - "description": "min=0.657, mean=0.657, max=0.657, sum=0.657 (1)", - "tab": "Fairness", - "score": 0.6567447414077484 - }, - "NarrativeQA - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=1.437, mean=1.437, max=1.437, sum=1.437 (1)", - "tab": "General information", - "score": 1.4366197183098592 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=1541.115, mean=1541.115, 
max=1541.115, sum=1541.115 (1)", - "tab": "General information", - "score": 1541.1154929577465 - }, - "NarrativeQA - # output tokens": { - "description": "min=100, mean=100, max=100, sum=100 (1)", - "tab": "General information", - "score": 100.0 - }, - "NarrativeQA - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NarrativeQA - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NarrativeQA - Stereotypes (gender)": { - "description": "min=0.4, mean=0.4, max=0.4, sum=0.4 (1)", - "tab": "Bias", - "score": 0.4 - }, - "NarrativeQA - Representation (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=0.667 (1)", - "tab": "Bias", - "score": 0.6666666666666667 - }, - "NarrativeQA - Representation (gender)": { - "description": "min=0.214, mean=0.214, max=0.214, sum=0.214 (1)", - "tab": "Bias", - "score": 0.2142857142857143 - }, - "NarrativeQA - Toxic fraction": { - "description": "min=0.011, mean=0.011, max=0.011, sum=0.011 (1)", - "tab": "Toxicity", - "score": 0.011267605633802818 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (open-book)", - "source_data": { - "dataset_name": "NaturalQuestions (open-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (open-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.666, - "details": { - "description": "min=0.666, mean=0.666, max=0.666, sum=0.666 (1)", - "tab": "Accuracy", - "NaturalQuestions (closed-book) - ECE (10-bin)": { - "description": "min=0.264, mean=0.264, max=0.264, sum=0.264 (1)", - "tab": "Calibration", - "score": null - }, - "NaturalQuestions (open-book) - ECE (10-bin)": { - "description": "min=0.451, mean=0.451, max=0.451, sum=0.451 (1)", - "tab": "Calibration", - "score": null - }, - "NaturalQuestions (closed-book) - F1 (Robustness)": { - "description": "min=0.36, mean=0.36, max=0.36, sum=0.36 (1)", - "tab": "Robustness", - "score": 0.36029476515740994 - }, - "NaturalQuestions (open-book) - F1 (Robustness)": { - "description": "min=0.612, mean=0.612, max=0.612, sum=0.612 (1)", - "tab": "Robustness", - "score": 0.6123442768470954 - }, - "NaturalQuestions (closed-book) - F1 (Fairness)": { - "description": "min=0.356, mean=0.356, max=0.356, sum=0.356 (1)", - "tab": "Fairness", - "score": 0.35638449124084753 - }, - "NaturalQuestions (open-book) - F1 (Fairness)": { - "description": "min=0.621, mean=0.621, max=0.621, sum=0.621 (1)", - "tab": "Fairness", - "score": 0.6212987885688864 - }, - "NaturalQuestions (closed-book) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NaturalQuestions (open-book) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { 
- "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=137.383, mean=137.383, max=137.383, sum=137.383 (1)", - "tab": "General information", - "score": 137.383 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=300, mean=300, max=300, sum=300 (1)", - "tab": "General information", - "score": 300.0 - }, - "NaturalQuestions (closed-book) - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=3.722, mean=3.722, max=3.722, sum=3.722 (1)", - "tab": "General information", - "score": 3.722 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.049, mean=0.049, max=0.049, sum=0.049 (1)", - "tab": "General information", - "score": 0.049 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1407.178, mean=1407.178, max=1407.178, sum=1407.178 (1)", - "tab": "General information", - "score": 1407.178 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=300, mean=300, max=300, sum=300 (1)", - "tab": "General information", - "score": 300.0 - }, - "NaturalQuestions (open-book) - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NaturalQuestions (closed-book) - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=0.667 (1)", - "tab": "Bias", - "score": 0.6666666666666667 - }, - "NaturalQuestions (closed-book) - Stereotypes (gender)": { - "description": "min=0.5, mean=0.5, max=0.5, sum=0.5 (1)", - "tab": "Bias", - "score": 0.5 - }, - "NaturalQuestions (closed-book) - Representation (race)": { - "description": "min=0.328, mean=0.328, max=0.328, sum=0.328 (1)", - "tab": "Bias", - "score": 0.32753623188405795 - }, - "NaturalQuestions (closed-book) - Representation (gender)": { - "description": "min=0.333, mean=0.333, max=0.333, sum=0.333 (1)", - "tab": "Bias", - "score": 0.33333333333333337 - }, - "NaturalQuestions (open-book) - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NaturalQuestions (open-book) - Stereotypes (gender)": { - "description": "min=0.1, mean=0.1, max=0.1, sum=0.1 (1)", - "tab": "Bias", - "score": 0.09999999999999998 - }, - "NaturalQuestions (open-book) - Representation (race)": { - "description": "min=0.39, mean=0.39, max=0.39, sum=0.39 (1)", - "tab": "Bias", - "score": 0.3900709219858156 - }, - "NaturalQuestions (open-book) - Representation (gender)": { - "description": "min=0.257, mean=0.257, max=0.257, sum=0.257 (1)", - "tab": "Bias", - "score": 0.2567567567567568 - }, - "NaturalQuestions (closed-book) - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "Toxicity", - "score": 0.0 - }, - "NaturalQuestions (open-book) - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "QuAC", - "source_data": { - "dataset_name": "QuAC", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on QuAC", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.39, - "details": { - "description": "min=0.39, mean=0.39, max=0.39, sum=0.39 (1)", - "tab": "Accuracy", - "QuAC - ECE (10-bin)": { - "description": "min=0.146, mean=0.146, max=0.146, sum=0.146 (1)", - "tab": "Calibration", - "score": null - }, - "QuAC - F1 (Robustness)": { - "description": "min=0.273, mean=0.273, max=0.273, sum=0.273 (1)", - "tab": "Robustness", - "score": 0.27320176375521127 - }, - "QuAC - F1 (Fairness)": { - "description": "min=0.325, mean=0.325, max=0.325, sum=0.325 (1)", - "tab": "Fairness", - "score": 0.3253423128866467 - }, - "QuAC - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "QuAC - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "QuAC - # train": { - "description": "min=0.507, mean=0.507, max=0.507, sum=0.507 (1)", - "tab": "General information", - "score": 0.507 - }, - "QuAC - truncated": { - "description": "min=0.06, mean=0.06, max=0.06, sum=0.06 (1)", - "tab": "General information", - "score": 0.06 - }, - "QuAC - # prompt tokens": { - "description": "min=1498.657, mean=1498.657, max=1498.657, sum=1498.657 (1)", - "tab": "General information", - "score": 1498.657 - }, - "QuAC - # output tokens": { - "description": "min=99.987, mean=99.987, max=99.987, sum=99.987 (1)", - "tab": "General information", - "score": 99.987 - }, - "QuAC - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "QuAC - Stereotypes (race)": { - "description": "min=0.571, mean=0.571, max=0.571, sum=0.571 (1)", - "tab": "Bias", - "score": 0.5714285714285715 - }, - "QuAC - Stereotypes (gender)": { - "description": "min=0.436, mean=0.436, max=0.436, sum=0.436 (1)", - "tab": "Bias", - "score": 0.43576827288346653 - }, - "QuAC - Representation (race)": { - "description": "min=0.229, mean=0.229, max=0.229, sum=0.229 (1)", - "tab": "Bias", - "score": 0.22891566265060237 - }, - "QuAC - Representation (gender)": { - "description": "min=0.222, mean=0.222, max=0.222, sum=0.222 (1)", - "tab": "Bias", - "score": 0.22215709261430247 - }, - "QuAC - Toxic fraction": { - "description": "min=0.004, mean=0.004, max=0.004, sum=0.004 (1)", - "tab": "Toxicity", - "score": 0.004 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "HellaSwag", - "source_data": { - "dataset_name": "HellaSwag", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on HellaSwag", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "HellaSwag - ECE (10-bin)": { - "description": "No matching runs", - "tab": "Calibration", - "score": null - }, - "HellaSwag - EM (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "HellaSwag - EM (Fairness)": { - "description": "No matching runs", - 
"tab": "Fairness", - "score": null - }, - "HellaSwag - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "HellaSwag - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "OpenbookQA - ECE (10-bin)": { - "description": "No matching runs", - "tab": "Calibration", - "score": null - }, - "OpenbookQA - EM (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "OpenbookQA - EM (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "OpenbookQA - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "OpenbookQA - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "TruthfulQA", - "source_data": { - "dataset_name": "TruthfulQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on TruthfulQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.344, - "details": { - "description": "min=0.344, mean=0.344, max=0.344, sum=0.344 (1)", - "tab": "Accuracy", - "TruthfulQA - ECE (10-bin)": { - "description": "min=0.15, mean=0.15, max=0.15, sum=0.15 (1)", - "tab": "Calibration", - "score": null - }, - "TruthfulQA - EM (Robustness)": { - "description": "min=0.281, mean=0.281, max=0.281, sum=0.281 
(1)", - "tab": "Robustness", - "score": 0.28134556574923547 - }, - "TruthfulQA - EM (Fairness)": { - "description": "min=0.266, mean=0.266, max=0.266, sum=0.266 (1)", - "tab": "Fairness", - "score": 0.26605504587155965 - }, - "TruthfulQA - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "TruthfulQA - # eval": { - "description": "min=654, mean=654, max=654, sum=654 (1)", - "tab": "General information", - "score": 654.0 - }, - "TruthfulQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "TruthfulQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "TruthfulQA - # prompt tokens": { - "description": "min=524.602, mean=524.602, max=524.602, sum=524.602 (1)", - "tab": "General information", - "score": 524.6024464831804 - }, - "TruthfulQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "TruthfulQA - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MS MARCO (TREC)", - "source_data": { - "dataset_name": "MS MARCO (TREC)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "NDCG@10 on MS MARCO (TREC)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "MS MARCO (regular) - RR@10 (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "MS MARCO (TREC) - NDCG@10 (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "MS MARCO (regular) - RR@10 (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "MS MARCO (TREC) - NDCG@10 (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "MS MARCO (regular) - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "MS MARCO (TREC) - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "MS MARCO (regular) - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # train": { 
- "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - }, - "MS MARCO (TREC) - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CNN/DailyMail", - "source_data": { - "dataset_name": "CNN/DailyMail", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on CNN/DailyMail", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "CNN/DailyMail - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "CNN/DailyMail - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - 
"CNN/DailyMail - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - }, - "CNN/DailyMail - SummaC": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - QAFactEval": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - BERTScore (F1)": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - Coverage": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - Density": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - Compression": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-faithfulness": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-relevance": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-coherence": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "XSUM", - "source_data": { - "dataset_name": "XSUM", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on XSUM", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "XSUM - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "XSUM - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - }, - "XSUM - SummaC": { - "description": "No matching runs", - "tab": "Summarization 
metrics", - "score": null - }, - "XSUM - QAFactEval": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - BERTScore (F1)": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - Coverage": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - Density": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - Compression": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-faithfulness": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-relevance": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-coherence": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "IMDB", - "source_data": { - "dataset_name": "IMDB", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on IMDB", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.927, - "details": { - "description": "min=0.927, mean=0.927, max=0.927, sum=0.927 (1)", - "tab": "Accuracy", - "IMDB - ECE (10-bin)": { - "description": "min=0.255, mean=0.255, max=0.255, sum=0.255 (1)", - "tab": "Calibration", - "score": null - }, - "IMDB - EM (Robustness)": { - "description": "min=0.893, mean=0.893, max=0.893, sum=0.893 (1)", - "tab": "Robustness", - "score": 0.893 - }, - "IMDB - EM (Fairness)": { - "description": "min=0.913, mean=0.913, max=0.913, sum=0.913 (1)", - "tab": "Fairness", - "score": 0.913 - }, - "IMDB - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "IMDB - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "IMDB - # train": { - "description": "min=2.781, mean=2.781, max=2.781, sum=2.781 (1)", - "tab": "General information", - "score": 2.781 - }, - "IMDB - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IMDB - # prompt tokens": { - "description": "min=1751.213, mean=1751.213, max=1751.213, sum=1751.213 (1)", - "tab": "General information", - "score": 1751.213 - }, - "IMDB - # output tokens": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "IMDB - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "IMDB - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, 
- "IMDB - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CivilComments", - "source_data": { - "dataset_name": "CivilComments", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on CivilComments", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.549, - "details": { - "description": "min=0.027, mean=0.549, max=0.998, sum=9.887 (18)", - "tab": "Accuracy", - "CivilComments - ECE (10-bin)": { - "description": "min=0.161, mean=0.4, max=0.513, sum=7.208 (18)", - "tab": "Calibration", - "score": null - }, - "CivilComments - EM (Robustness)": { - "description": "min=0.016, mean=0.503, max=0.97, sum=9.055 (18)", - "tab": "Robustness", - "score": 0.503044804739656 - }, - "CivilComments - EM (Fairness)": { - "description": "min=0.006, mean=0.508, max=0.998, sum=9.137 (18)", - "tab": "Fairness", - "score": 0.5075946750657245 - }, - "CivilComments - Denoised inference time (s)": { - "description": "9 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "CivilComments - # eval": { - "description": "min=74, mean=371.556, max=683, sum=6688 (18)", - "tab": "General information", - "score": 371.55555555555554 - }, - "CivilComments - # train": { - "description": "min=5, mean=5, max=5, sum=90 (18)", - "tab": "General information", - "score": 5.0 - }, - "CivilComments - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (18)", - "tab": "General information", - "score": 0.0 - }, - "CivilComments - # prompt tokens": { - "description": "min=404.732, mean=855.241, max=1417.567, sum=15394.339 (18)", - "tab": "General information", - "score": 855.2410378605821 - }, - "CivilComments - # output tokens": { - "description": "min=5, mean=5, max=5, sum=90 (18)", - "tab": "General information", - "score": 5.0 - }, - "CivilComments - # trials": { - "description": "min=1, mean=1, max=1, sum=18 (18)", - "tab": "General information", - "score": 1.0 - }, - "CivilComments - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Toxic fraction": { - "description": "9 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "RAFT", - "source_data": { - "dataset_name": "RAFT", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on RAFT", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.752, - "details": { - "description": "min=0.15, mean=0.752, max=1, sum=8.275 (11)", - "tab": "Accuracy", - "RAFT - ECE (10-bin)": { - "description": "min=0.156, mean=0.753, 
max=1.0, sum=8.279 (11)", - "tab": "Calibration", - "score": null - }, - "RAFT - EM (Robustness)": { - "description": "min=0.05, mean=0.67, max=0.95, sum=7.375 (11)", - "tab": "Robustness", - "score": 0.6704545454545454 - }, - "RAFT - EM (Fairness)": { - "description": "min=0.1, mean=0.718, max=0.975, sum=7.9 (11)", - "tab": "Fairness", - "score": 0.7181818181818181 - }, - "RAFT - Denoised inference time (s)": { - "description": "11 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "RAFT - # eval": { - "description": "min=40, mean=40, max=40, sum=440 (11)", - "tab": "General information", - "score": 40.0 - }, - "RAFT - # train": { - "description": "min=0.45, mean=4.552, max=5, sum=50.075 (11)", - "tab": "General information", - "score": 4.552272727272727 - }, - "RAFT - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (11)", - "tab": "General information", - "score": 0.0 - }, - "RAFT - # prompt tokens": { - "description": "min=303.675, mean=954.111, max=1882.1, sum=10495.225 (11)", - "tab": "General information", - "score": 954.1113636363635 - }, - "RAFT - # output tokens": { - "description": "min=30, mean=30, max=30, sum=330 (11)", - "tab": "General information", - "score": 30.0 - }, - "RAFT - # trials": { - "description": "min=1, mean=1, max=1, sum=11 (11)", - "tab": "General information", - "score": 1.0 - }, - "RAFT - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Toxic fraction": { - "description": "11 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_classic/meta/LLaMA-65B/90220411-5e4d-4b74-a74c-ca2ad030d50e.json b/data/helm_classic/meta/LLaMA-65B/90220411-5e4d-4b74-a74c-ca2ad030d50e.json deleted file mode 100644 index ad8c1c451..000000000 --- a/data/helm_classic/meta/LLaMA-65B/90220411-5e4d-4b74-a74c-ca2ad030d50e.json +++ /dev/null @@ -1,1613 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_classic/meta_LLaMA-65B/1770834891.1472661", - "retrieved_timestamp": "1770834891.1472661", - "source_metadata": { - "source_name": "helm_classic", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "LLaMA 65B", - "id": "meta/LLaMA-65B", - "developer": "meta", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_classic", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperform on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.908, - "details": { - "tab": "Accuracy", - "Mean win rate - Calibration": { - "description": null, - "tab": "Calibration", - "score": null - }, - "Mean win rate - Robustness": { - "description": null, - "tab": "Robustness", - "score": 
0.8851981351981352 - }, - "Mean win rate - Fairness": { - "description": null, - "tab": "Fairness", - "score": 0.9235431235431235 - }, - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": null - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - }, - "Mean win rate - Bias": { - "description": null, - "tab": "Bias", - "score": 0.4059399223461723 - }, - "Mean win rate - Toxicity": { - "description": null, - "tab": "Toxicity", - "score": 0.5910839160839161 - }, - "Mean win rate - Summarization metrics": { - "description": null, - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.584, - "details": { - "description": "min=0.34, mean=0.584, max=0.89, sum=2.919 (5)", - "tab": "Accuracy", - "MMLU - ECE (10-bin)": { - "description": "5 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "MMLU - EM (Robustness)": { - "description": "min=0.27, mean=0.504, max=0.81, sum=2.518 (5)", - "tab": "Robustness", - "score": 0.5036842105263158 - }, - "MMLU - EM (Fairness)": { - "description": "min=0.34, mean=0.551, max=0.84, sum=2.757 (5)", - "tab": "Fairness", - "score": 0.5514385964912281 - }, - "MMLU - Denoised inference time (s)": { - "description": "5 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=397.65, mean=522.547, max=684.675, sum=2612.735 (5)", - "tab": "General information", - "score": 522.5470877192982 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "MMLU - # trials": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "BoolQ", - "source_data": { - "dataset_name": "BoolQ", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on BoolQ", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.871, - "details": { - "description": "min=0.871, mean=0.871, max=0.871, sum=0.871 (1)", - "tab": "Accuracy", - "BoolQ - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "BoolQ - EM (Robustness)": { - "description": "min=0.84, mean=0.84, max=0.84, sum=0.84 (1)", - 
"tab": "Robustness", - "score": 0.84 - }, - "BoolQ - EM (Fairness)": { - "description": "min=0.847, mean=0.847, max=0.847, sum=0.847 (1)", - "tab": "Fairness", - "score": 0.847 - }, - "BoolQ - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "BoolQ - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "BoolQ - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "BoolQ - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "BoolQ - # prompt tokens": { - "description": "min=1439.447, mean=1439.447, max=1439.447, sum=1439.447 (1)", - "tab": "General information", - "score": 1439.447 - }, - "BoolQ - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "BoolQ - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "BoolQ - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.755, - "details": { - "description": "min=0.755, mean=0.755, max=0.755, sum=0.755 (1)", - "tab": "Accuracy", - "NarrativeQA - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "NarrativeQA - F1 (Robustness)": { - "description": "min=0.567, mean=0.567, max=0.567, sum=0.567 (1)", - "tab": "Robustness", - "score": 0.5674436891870642 - }, - "NarrativeQA - F1 (Fairness)": { - "description": "min=0.661, mean=0.661, max=0.661, sum=0.661 (1)", - "tab": "Fairness", - "score": 0.6614214785759094 - }, - "NarrativeQA - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=1.437, mean=1.437, max=1.437, sum=1.437 (1)", - "tab": "General information", - "score": 1.4366197183098592 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=1541.115, mean=1541.115, max=1541.115, sum=1541.115 (1)", - "tab": "General information", - "score": 1541.1154929577465 - 
}, - "NarrativeQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NarrativeQA - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NarrativeQA - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NarrativeQA - Stereotypes (gender)": { - "description": "min=0.5, mean=0.5, max=0.5, sum=0.5 (1)", - "tab": "Bias", - "score": 0.5 - }, - "NarrativeQA - Representation (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=0.667 (1)", - "tab": "Bias", - "score": 0.6666666666666667 - }, - "NarrativeQA - Representation (gender)": { - "description": "min=0.198, mean=0.198, max=0.198, sum=0.198 (1)", - "tab": "Bias", - "score": 0.1981132075471698 - }, - "NarrativeQA - Toxic fraction": { - "description": "min=0.008, mean=0.008, max=0.008, sum=0.008 (1)", - "tab": "Toxicity", - "score": 0.008450704225352112 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (open-book)", - "source_data": { - "dataset_name": "NaturalQuestions (open-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (open-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.672, - "details": { - "description": "min=0.672, mean=0.672, max=0.672, sum=0.672 (1)", - "tab": "Accuracy", - "NaturalQuestions (closed-book) - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "NaturalQuestions (open-book) - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "NaturalQuestions (closed-book) - F1 (Robustness)": { - "description": "min=0.388, mean=0.388, max=0.388, sum=0.388 (1)", - "tab": "Robustness", - "score": 0.3875883665002626 - }, - "NaturalQuestions (open-book) - F1 (Robustness)": { - "description": "min=0.624, mean=0.624, max=0.624, sum=0.624 (1)", - "tab": "Robustness", - "score": 0.623794662165915 - }, - "NaturalQuestions (closed-book) - F1 (Fairness)": { - "description": "min=0.375, mean=0.375, max=0.375, sum=0.375 (1)", - "tab": "Fairness", - "score": 0.3753249636782112 - }, - "NaturalQuestions (open-book) - F1 (Fairness)": { - "description": "min=0.633, mean=0.633, max=0.633, sum=0.633 (1)", - "tab": "Fairness", - "score": 0.6326996444457361 - }, - "NaturalQuestions (closed-book) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NaturalQuestions (open-book) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - 
"NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=137.383, mean=137.383, max=137.383, sum=137.383 (1)", - "tab": "General information", - "score": 137.383 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NaturalQuestions (closed-book) - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=3.722, mean=3.722, max=3.722, sum=3.722 (1)", - "tab": "General information", - "score": 3.722 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.049, mean=0.049, max=0.049, sum=0.049 (1)", - "tab": "General information", - "score": 0.049 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1407.178, mean=1407.178, max=1407.178, sum=1407.178 (1)", - "tab": "General information", - "score": 1407.178 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=0.987, mean=0.987, max=0.987, sum=0.987 (1)", - "tab": "General information", - "score": 0.987 - }, - "NaturalQuestions (open-book) - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NaturalQuestions (closed-book) - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NaturalQuestions (closed-book) - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NaturalQuestions (closed-book) - Representation (race)": { - "description": "min=0.352, mean=0.352, max=0.352, sum=0.352 (1)", - "tab": "Bias", - "score": 0.35238095238095235 - }, - "NaturalQuestions (closed-book) - Representation (gender)": { - "description": "min=0.3, mean=0.3, max=0.3, sum=0.3 (1)", - "tab": "Bias", - "score": 0.30000000000000004 - }, - "NaturalQuestions (open-book) - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NaturalQuestions (open-book) - Stereotypes (gender)": { - "description": "min=0.5, mean=0.5, max=0.5, sum=0.5 (1)", - "tab": "Bias", - "score": 0.5 - }, - "NaturalQuestions (open-book) - Representation (race)": { - "description": "min=0.436, mean=0.436, max=0.436, sum=0.436 (1)", - "tab": "Bias", - "score": 0.4358974358974359 - }, - "NaturalQuestions (open-book) - Representation (gender)": { - "description": "min=0.393, mean=0.393, max=0.393, sum=0.393 (1)", - "tab": "Bias", - "score": 0.3928571428571429 - }, - "NaturalQuestions (closed-book) - Toxic fraction": { - "description": "min=0.001, mean=0.001, max=0.001, sum=0.001 (1)", - "tab": "Toxicity", - "score": 0.001 - }, - "NaturalQuestions (open-book) - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "QuAC", - "source_data": { - "dataset_name": "QuAC", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on QuAC", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.401, - "details": { - "description": "min=0.401, mean=0.401, max=0.401, sum=0.401 (1)", - "tab": "Accuracy", - "QuAC - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "QuAC - F1 (Robustness)": { - "description": "min=0.275, mean=0.275, max=0.275, sum=0.275 (1)", - "tab": "Robustness", - "score": 0.2748605351114493 - }, - "QuAC - F1 (Fairness)": { - "description": "min=0.333, mean=0.333, max=0.333, sum=0.333 (1)", - "tab": "Fairness", - "score": 0.33296543407590734 - }, - "QuAC - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "QuAC - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "QuAC - # train": { - "description": "min=0.507, mean=0.507, max=0.507, sum=0.507 (1)", - "tab": "General information", - "score": 0.507 - }, - "QuAC - truncated": { - "description": "min=0.06, mean=0.06, max=0.06, sum=0.06 (1)", - "tab": "General information", - "score": 0.06 - }, - "QuAC - # prompt tokens": { - "description": "min=1498.657, mean=1498.657, max=1498.657, sum=1498.657 (1)", - "tab": "General information", - "score": 1498.657 - }, - "QuAC - # output tokens": { - "description": "min=0.997, mean=0.997, max=0.997, sum=0.997 (1)", - "tab": "General information", - "score": 0.997 - }, - "QuAC - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "QuAC - Stereotypes (race)": { - "description": "min=0.621, mean=0.621, max=0.621, sum=0.621 (1)", - "tab": "Bias", - "score": 0.6210526315789473 - }, - "QuAC - Stereotypes (gender)": { - "description": "min=0.394, mean=0.394, max=0.394, sum=0.394 (1)", - "tab": "Bias", - "score": 0.3944670750705233 - }, - "QuAC - Representation (race)": { - "description": "min=0.38, mean=0.38, max=0.38, sum=0.38 (1)", - "tab": "Bias", - "score": 0.3804713804713804 - }, - "QuAC - Representation (gender)": { - "description": "min=0.243, mean=0.243, max=0.243, sum=0.243 (1)", - "tab": "Bias", - "score": 0.24335260115606938 - }, - "QuAC - Toxic fraction": { - "description": "min=0.003, mean=0.003, max=0.003, sum=0.003 (1)", - "tab": "Toxicity", - "score": 0.003 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "HellaSwag", - "source_data": { - "dataset_name": "HellaSwag", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on HellaSwag", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "HellaSwag - ECE (10-bin)": { - "description": "No matching runs", - "tab": "Calibration", - "score": null - }, - "HellaSwag - EM (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "HellaSwag - EM (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "HellaSwag - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "HellaSwag - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - 
# train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "OpenbookQA - ECE (10-bin)": { - "description": "No matching runs", - "tab": "Calibration", - "score": null - }, - "OpenbookQA - EM (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "OpenbookQA - EM (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "OpenbookQA - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "OpenbookQA - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "TruthfulQA", - "source_data": { - "dataset_name": "TruthfulQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on TruthfulQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.508, - "details": { - "description": "min=0.508, mean=0.508, max=0.508, sum=0.508 (1)", - "tab": "Accuracy", - "TruthfulQA - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "TruthfulQA - EM (Robustness)": { - "description": "min=0.448, mean=0.448, max=0.448, sum=0.448 (1)", - "tab": "Robustness", - "score": 0.44801223241590216 - }, - "TruthfulQA - EM (Fairness)": { - "description": "min=0.42, mean=0.42, max=0.42, sum=0.42 (1)", - "tab": "Fairness", - "score": 0.42048929663608564 - }, - "TruthfulQA - Denoised inference time (s)": { - "description": "1 matching runs, 
but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "TruthfulQA - # eval": { - "description": "min=654, mean=654, max=654, sum=654 (1)", - "tab": "General information", - "score": 654.0 - }, - "TruthfulQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "TruthfulQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "TruthfulQA - # prompt tokens": { - "description": "min=524.602, mean=524.602, max=524.602, sum=524.602 (1)", - "tab": "General information", - "score": 524.6024464831804 - }, - "TruthfulQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "TruthfulQA - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MS MARCO (TREC)", - "source_data": { - "dataset_name": "MS MARCO (TREC)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "NDCG@10 on MS MARCO (TREC)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "MS MARCO (regular) - RR@10 (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "MS MARCO (TREC) - NDCG@10 (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "MS MARCO (regular) - RR@10 (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "MS MARCO (TREC) - NDCG@10 (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "MS MARCO (regular) - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "MS MARCO (TREC) - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "MS MARCO (regular) - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # prompt tokens": { - "description": "No matching runs", - "tab": 
"General information", - "score": null - }, - "MS MARCO (TREC) - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - }, - "MS MARCO (TREC) - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CNN/DailyMail", - "source_data": { - "dataset_name": "CNN/DailyMail", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on CNN/DailyMail", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "CNN/DailyMail - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "CNN/DailyMail - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Toxic fraction": { - "description": "No 
matching runs", - "tab": "Toxicity", - "score": null - }, - "CNN/DailyMail - SummaC": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - QAFactEval": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - BERTScore (F1)": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - Coverage": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - Density": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - Compression": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-faithfulness": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-relevance": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-coherence": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "XSUM", - "source_data": { - "dataset_name": "XSUM", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on XSUM", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "XSUM - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "XSUM - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - }, - "XSUM - SummaC": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - QAFactEval": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - BERTScore (F1)": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - Coverage": { - 
"description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - Density": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - Compression": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-faithfulness": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-relevance": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-coherence": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "IMDB", - "source_data": { - "dataset_name": "IMDB", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on IMDB", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.962, - "details": { - "description": "min=0.962, mean=0.962, max=0.962, sum=0.962 (1)", - "tab": "Accuracy", - "IMDB - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "IMDB - EM (Robustness)": { - "description": "min=0.935, mean=0.935, max=0.935, sum=0.935 (1)", - "tab": "Robustness", - "score": 0.935 - }, - "IMDB - EM (Fairness)": { - "description": "min=0.953, mean=0.953, max=0.953, sum=0.953 (1)", - "tab": "Fairness", - "score": 0.953 - }, - "IMDB - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "IMDB - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "IMDB - # train": { - "description": "min=2.781, mean=2.781, max=2.781, sum=2.781 (1)", - "tab": "General information", - "score": 2.781 - }, - "IMDB - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IMDB - # prompt tokens": { - "description": "min=1751.213, mean=1751.213, max=1751.213, sum=1751.213 (1)", - "tab": "General information", - "score": 1751.213 - }, - "IMDB - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "IMDB - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "IMDB - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CivilComments", - "source_data": { - "dataset_name": "CivilComments", - 
"source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on CivilComments", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.655, - "details": { - "description": "min=0.395, mean=0.655, max=0.863, sum=11.783 (18)", - "tab": "Accuracy", - "CivilComments - ECE (10-bin)": { - "description": "9 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "CivilComments - EM (Robustness)": { - "description": "min=0.247, mean=0.566, max=0.853, sum=10.188 (18)", - "tab": "Robustness", - "score": 0.565986035612513 - }, - "CivilComments - EM (Fairness)": { - "description": "min=0.32, mean=0.574, max=0.8, sum=10.336 (18)", - "tab": "Fairness", - "score": 0.57420608635975 - }, - "CivilComments - Denoised inference time (s)": { - "description": "9 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "CivilComments - # eval": { - "description": "min=74, mean=371.556, max=683, sum=6688 (18)", - "tab": "General information", - "score": 371.55555555555554 - }, - "CivilComments - # train": { - "description": "min=5, mean=5, max=5, sum=90 (18)", - "tab": "General information", - "score": 5.0 - }, - "CivilComments - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (18)", - "tab": "General information", - "score": 0.0 - }, - "CivilComments - # prompt tokens": { - "description": "min=404.732, mean=855.241, max=1417.567, sum=15394.339 (18)", - "tab": "General information", - "score": 855.2410378605821 - }, - "CivilComments - # output tokens": { - "description": "min=1, mean=1, max=1, sum=18 (18)", - "tab": "General information", - "score": 1.0 - }, - "CivilComments - # trials": { - "description": "min=1, mean=1, max=1, sum=18 (18)", - "tab": "General information", - "score": 1.0 - }, - "CivilComments - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Toxic fraction": { - "description": "9 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "RAFT", - "source_data": { - "dataset_name": "RAFT", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on RAFT", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.702, - "details": { - "description": "min=0.125, mean=0.702, max=0.975, sum=7.725 (11)", - "tab": "Accuracy", - "RAFT - ECE (10-bin)": { - "description": "11 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "RAFT - EM (Robustness)": { - "description": "min=0, mean=0.655, max=0.975, sum=7.2 (11)", - "tab": "Robustness", - "score": 0.6545454545454545 - }, - "RAFT - EM (Fairness)": { - "description": "min=0.075, mean=0.668, max=0.975, sum=7.35 
(11)", - "tab": "Fairness", - "score": 0.6681818181818182 - }, - "RAFT - Denoised inference time (s)": { - "description": "11 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "RAFT - # eval": { - "description": "min=40, mean=40, max=40, sum=440 (11)", - "tab": "General information", - "score": 40.0 - }, - "RAFT - # train": { - "description": "min=0.45, mean=4.552, max=5, sum=50.075 (11)", - "tab": "General information", - "score": 4.552272727272727 - }, - "RAFT - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (11)", - "tab": "General information", - "score": 0.0 - }, - "RAFT - # prompt tokens": { - "description": "min=303.675, mean=954.111, max=1882.1, sum=10495.225 (11)", - "tab": "General information", - "score": 954.1113636363635 - }, - "RAFT - # output tokens": { - "description": "min=0.8, mean=0.982, max=1, sum=10.8 (11)", - "tab": "General information", - "score": 0.9818181818181819 - }, - "RAFT - # trials": { - "description": "min=1, mean=1, max=1, sum=11 (11)", - "tab": "General information", - "score": 1.0 - }, - "RAFT - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Toxic fraction": { - "description": "11 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_classic/meta/LLaMA-7B/8c2465b2-deca-476c-bb41-836685ceab35.json b/data/helm_classic/meta/LLaMA-7B/8c2465b2-deca-476c-bb41-836685ceab35.json deleted file mode 100644 index 152b9e683..000000000 --- a/data/helm_classic/meta/LLaMA-7B/8c2465b2-deca-476c-bb41-836685ceab35.json +++ /dev/null @@ -1,1613 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_classic/meta_LLaMA-7B/1770834891.1472661", - "retrieved_timestamp": "1770834891.1472661", - "source_metadata": { - "source_name": "helm_classic", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "LLaMA 7B", - "id": "meta/LLaMA-7B", - "developer": "meta", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_classic", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperform on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.533, - "details": { - "tab": "Accuracy", - "Mean win rate - Calibration": { - "description": null, - "tab": "Calibration", - "score": null - }, - "Mean win rate - Robustness": { - "description": null, - "tab": "Robustness", - "score": 0.567972027972028 - }, - "Mean win rate - Fairness": { - "description": null, - "tab": "Fairness", - "score": 0.5526107226107226 - }, - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": null - }, - "Mean win rate - General information": { - "description": 
null, - "tab": "General information", - "score": null - }, - "Mean win rate - Bias": { - "description": null, - "tab": "Bias", - "score": 0.5501935339738984 - }, - "Mean win rate - Toxicity": { - "description": null, - "tab": "Toxicity", - "score": 0.7582167832167832 - }, - "Mean win rate - Summarization metrics": { - "description": null, - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.321, - "details": { - "description": "min=0.23, mean=0.321, max=0.45, sum=1.603 (5)", - "tab": "Accuracy", - "MMLU - ECE (10-bin)": { - "description": "min=0.063, mean=0.111, max=0.138, sum=0.557 (5)", - "tab": "Calibration", - "score": null - }, - "MMLU - EM (Robustness)": { - "description": "min=0.18, mean=0.268, max=0.36, sum=1.338 (5)", - "tab": "Robustness", - "score": 0.2676140350877193 - }, - "MMLU - EM (Fairness)": { - "description": "min=0.19, mean=0.284, max=0.42, sum=1.421 (5)", - "tab": "Fairness", - "score": 0.28410526315789475 - }, - "MMLU - Denoised inference time (s)": { - "description": "5 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=397.65, mean=522.547, max=684.675, sum=2612.735 (5)", - "tab": "General information", - "score": 522.5470877192982 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "MMLU - # trials": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "BoolQ", - "source_data": { - "dataset_name": "BoolQ", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on BoolQ", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.756, - "details": { - "description": "min=0.756, mean=0.756, max=0.756, sum=0.756 (1)", - "tab": "Accuracy", - "BoolQ - ECE (10-bin)": { - "description": "min=0.292, mean=0.292, max=0.292, sum=0.292 (1)", - "tab": "Calibration", - "score": null - }, - "BoolQ - EM (Robustness)": { - "description": "min=0.688, mean=0.688, max=0.688, sum=0.688 (1)", - "tab": "Robustness", - "score": 0.688 - }, - "BoolQ - EM (Fairness)": { - "description": "min=0.71, mean=0.71, max=0.71, sum=0.71 (1)", - "tab": "Fairness", - "score": 0.71 - }, - "BoolQ - Denoised inference time (s)": { - "description": "1 matching runs, but no matching 
metrics", - "tab": "Efficiency", - "score": null - }, - "BoolQ - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "BoolQ - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "BoolQ - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "BoolQ - # prompt tokens": { - "description": "min=1439.447, mean=1439.447, max=1439.447, sum=1439.447 (1)", - "tab": "General information", - "score": 1439.447 - }, - "BoolQ - # output tokens": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "BoolQ - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "BoolQ - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.669, - "details": { - "description": "min=0.669, mean=0.669, max=0.669, sum=0.669 (1)", - "tab": "Accuracy", - "NarrativeQA - ECE (10-bin)": { - "description": "min=0.265, mean=0.265, max=0.265, sum=0.265 (1)", - "tab": "Calibration", - "score": null - }, - "NarrativeQA - F1 (Robustness)": { - "description": "min=0.485, mean=0.485, max=0.485, sum=0.485 (1)", - "tab": "Robustness", - "score": 0.48451305318378857 - }, - "NarrativeQA - F1 (Fairness)": { - "description": "min=0.552, mean=0.552, max=0.552, sum=0.552 (1)", - "tab": "Fairness", - "score": 0.5523890751544673 - }, - "NarrativeQA - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=1.437, mean=1.437, max=1.437, sum=1.437 (1)", - "tab": "General information", - "score": 1.4366197183098592 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=1541.115, mean=1541.115, max=1541.115, sum=1541.115 (1)", - "tab": "General information", - "score": 1541.1154929577465 - }, - "NarrativeQA - # output tokens": { - "description": "min=100, mean=100, max=100, sum=100 (1)", - "tab": "General information", - "score": 100.0 - }, - "NarrativeQA - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", 
- "score": 1.0 - }, - "NarrativeQA - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NarrativeQA - Stereotypes (gender)": { - "description": "min=0.444, mean=0.444, max=0.444, sum=0.444 (1)", - "tab": "Bias", - "score": 0.4444444444444444 - }, - "NarrativeQA - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NarrativeQA - Representation (gender)": { - "description": "min=0.178, mean=0.178, max=0.178, sum=0.178 (1)", - "tab": "Bias", - "score": 0.17785234899328858 - }, - "NarrativeQA - Toxic fraction": { - "description": "min=0.006, mean=0.006, max=0.006, sum=0.006 (1)", - "tab": "Toxicity", - "score": 0.005633802816901409 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (open-book)", - "source_data": { - "dataset_name": "NaturalQuestions (open-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (open-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.589, - "details": { - "description": "min=0.589, mean=0.589, max=0.589, sum=0.589 (1)", - "tab": "Accuracy", - "NaturalQuestions (closed-book) - ECE (10-bin)": { - "description": "min=0.176, mean=0.176, max=0.176, sum=0.176 (1)", - "tab": "Calibration", - "score": null - }, - "NaturalQuestions (open-book) - ECE (10-bin)": { - "description": "min=0.402, mean=0.402, max=0.402, sum=0.402 (1)", - "tab": "Calibration", - "score": null - }, - "NaturalQuestions (closed-book) - F1 (Robustness)": { - "description": "min=0.222, mean=0.222, max=0.222, sum=0.222 (1)", - "tab": "Robustness", - "score": 0.22150747696392029 - }, - "NaturalQuestions (open-book) - F1 (Robustness)": { - "description": "min=0.519, mean=0.519, max=0.519, sum=0.519 (1)", - "tab": "Robustness", - "score": 0.5190244505397503 - }, - "NaturalQuestions (closed-book) - F1 (Fairness)": { - "description": "min=0.241, mean=0.241, max=0.241, sum=0.241 (1)", - "tab": "Fairness", - "score": 0.24052468144533276 - }, - "NaturalQuestions (open-book) - F1 (Fairness)": { - "description": "min=0.537, mean=0.537, max=0.537, sum=0.537 (1)", - "tab": "Fairness", - "score": 0.5368535244140038 - }, - "NaturalQuestions (closed-book) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NaturalQuestions (open-book) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=137.383, mean=137.383, max=137.383, sum=137.383 (1)", - "tab": "General information", - "score": 137.383 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=300, mean=300, max=300, 
sum=300 (1)", - "tab": "General information", - "score": 300.0 - }, - "NaturalQuestions (closed-book) - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=3.722, mean=3.722, max=3.722, sum=3.722 (1)", - "tab": "General information", - "score": 3.722 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.049, mean=0.049, max=0.049, sum=0.049 (1)", - "tab": "General information", - "score": 0.049 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1407.178, mean=1407.178, max=1407.178, sum=1407.178 (1)", - "tab": "General information", - "score": 1407.178 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=300, mean=300, max=300, sum=300 (1)", - "tab": "General information", - "score": 300.0 - }, - "NaturalQuestions (open-book) - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NaturalQuestions (closed-book) - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=0.667 (1)", - "tab": "Bias", - "score": 0.6666666666666667 - }, - "NaturalQuestions (closed-book) - Stereotypes (gender)": { - "description": "min=0.5, mean=0.5, max=0.5, sum=0.5 (1)", - "tab": "Bias", - "score": 0.5 - }, - "NaturalQuestions (closed-book) - Representation (race)": { - "description": "min=0.374, mean=0.374, max=0.374, sum=0.374 (1)", - "tab": "Bias", - "score": 0.3739837398373984 - }, - "NaturalQuestions (closed-book) - Representation (gender)": { - "description": "min=0.111, mean=0.111, max=0.111, sum=0.111 (1)", - "tab": "Bias", - "score": 0.11111111111111116 - }, - "NaturalQuestions (open-book) - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NaturalQuestions (open-book) - Stereotypes (gender)": { - "description": "min=0.3, mean=0.3, max=0.3, sum=0.3 (1)", - "tab": "Bias", - "score": 0.3 - }, - "NaturalQuestions (open-book) - Representation (race)": { - "description": "min=0.506, mean=0.506, max=0.506, sum=0.506 (1)", - "tab": "Bias", - "score": 0.5061728395061729 - }, - "NaturalQuestions (open-book) - Representation (gender)": { - "description": "min=0.36, mean=0.36, max=0.36, sum=0.36 (1)", - "tab": "Bias", - "score": 0.3604651162790698 - }, - "NaturalQuestions (closed-book) - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "Toxicity", - "score": 0.0 - }, - "NaturalQuestions (open-book) - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "QuAC", - "source_data": { - "dataset_name": "QuAC", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on QuAC", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.338, - "details": { - "description": "min=0.338, mean=0.338, max=0.338, sum=0.338 (1)", - "tab": "Accuracy", - "QuAC - ECE (10-bin)": { - "description": "min=0.114, mean=0.114, max=0.114, 
sum=0.114 (1)", - "tab": "Calibration", - "score": null - }, - "QuAC - F1 (Robustness)": { - "description": "min=0.223, mean=0.223, max=0.223, sum=0.223 (1)", - "tab": "Robustness", - "score": 0.22309180806281237 - }, - "QuAC - F1 (Fairness)": { - "description": "min=0.257, mean=0.257, max=0.257, sum=0.257 (1)", - "tab": "Fairness", - "score": 0.2568299506065861 - }, - "QuAC - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "QuAC - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "QuAC - # train": { - "description": "min=0.507, mean=0.507, max=0.507, sum=0.507 (1)", - "tab": "General information", - "score": 0.507 - }, - "QuAC - truncated": { - "description": "min=0.06, mean=0.06, max=0.06, sum=0.06 (1)", - "tab": "General information", - "score": 0.06 - }, - "QuAC - # prompt tokens": { - "description": "min=1498.657, mean=1498.657, max=1498.657, sum=1498.657 (1)", - "tab": "General information", - "score": 1498.657 - }, - "QuAC - # output tokens": { - "description": "min=99.794, mean=99.794, max=99.794, sum=99.794 (1)", - "tab": "General information", - "score": 99.794 - }, - "QuAC - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "QuAC - Stereotypes (race)": { - "description": "min=0.571, mean=0.571, max=0.571, sum=0.571 (1)", - "tab": "Bias", - "score": 0.5714285714285715 - }, - "QuAC - Stereotypes (gender)": { - "description": "min=0.428, mean=0.428, max=0.428, sum=0.428 (1)", - "tab": "Bias", - "score": 0.42791413680110835 - }, - "QuAC - Representation (race)": { - "description": "min=0.284, mean=0.284, max=0.284, sum=0.284 (1)", - "tab": "Bias", - "score": 0.28395061728395066 - }, - "QuAC - Representation (gender)": { - "description": "min=0.259, mean=0.259, max=0.259, sum=0.259 (1)", - "tab": "Bias", - "score": 0.2594070695553022 - }, - "QuAC - Toxic fraction": { - "description": "min=0.003, mean=0.003, max=0.003, sum=0.003 (1)", - "tab": "Toxicity", - "score": 0.003 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "HellaSwag", - "source_data": { - "dataset_name": "HellaSwag", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on HellaSwag", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "HellaSwag - ECE (10-bin)": { - "description": "No matching runs", - "tab": "Calibration", - "score": null - }, - "HellaSwag - EM (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "HellaSwag - EM (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "HellaSwag - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "HellaSwag - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - truncated": { - "description": "No matching runs", - "tab": "General information", - 
"score": null - }, - "HellaSwag - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "OpenbookQA - ECE (10-bin)": { - "description": "No matching runs", - "tab": "Calibration", - "score": null - }, - "OpenbookQA - EM (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "OpenbookQA - EM (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "OpenbookQA - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "OpenbookQA - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "TruthfulQA", - "source_data": { - "dataset_name": "TruthfulQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on TruthfulQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.28, - "details": { - "description": "min=0.28, mean=0.28, max=0.28, sum=0.28 (1)", - "tab": "Accuracy", - "TruthfulQA - ECE (10-bin)": { - "description": "min=0.057, mean=0.057, max=0.057, sum=0.057 (1)", - "tab": "Calibration", - "score": null - }, - "TruthfulQA - EM (Robustness)": { - "description": "min=0.229, mean=0.229, max=0.229, sum=0.229 (1)", - "tab": "Robustness", - "score": 0.22935779816513763 - }, - "TruthfulQA - EM (Fairness)": { - "description": "min=0.219, mean=0.219, max=0.219, sum=0.219 (1)", - "tab": "Fairness", - "score": 0.21865443425076453 - }, - "TruthfulQA - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "TruthfulQA - # eval": { - "description": "min=654, mean=654, max=654, sum=654 (1)", - "tab": "General information", - 
"score": 654.0 - }, - "TruthfulQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "TruthfulQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "TruthfulQA - # prompt tokens": { - "description": "min=524.602, mean=524.602, max=524.602, sum=524.602 (1)", - "tab": "General information", - "score": 524.6024464831804 - }, - "TruthfulQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "TruthfulQA - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MS MARCO (TREC)", - "source_data": { - "dataset_name": "MS MARCO (TREC)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "NDCG@10 on MS MARCO (TREC)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "MS MARCO (regular) - RR@10 (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "MS MARCO (TREC) - NDCG@10 (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "MS MARCO (regular) - RR@10 (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "MS MARCO (TREC) - NDCG@10 (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "MS MARCO (regular) - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "MS MARCO (TREC) - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "MS MARCO (regular) - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - 
# trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - }, - "MS MARCO (TREC) - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CNN/DailyMail", - "source_data": { - "dataset_name": "CNN/DailyMail", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on CNN/DailyMail", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "CNN/DailyMail - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "CNN/DailyMail - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - }, - "CNN/DailyMail - SummaC": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - 
QAFactEval": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - BERTScore (F1)": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - Coverage": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - Density": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - Compression": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-faithfulness": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-relevance": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-coherence": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "XSUM", - "source_data": { - "dataset_name": "XSUM", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on XSUM", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "XSUM - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "XSUM - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - }, - "XSUM - SummaC": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - QAFactEval": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - BERTScore (F1)": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - Coverage": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - Density": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM 
- Compression": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-faithfulness": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-relevance": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-coherence": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "IMDB", - "source_data": { - "dataset_name": "IMDB", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on IMDB", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.947, - "details": { - "description": "min=0.947, mean=0.947, max=0.947, sum=0.947 (1)", - "tab": "Accuracy", - "IMDB - ECE (10-bin)": { - "description": "min=0.336, mean=0.336, max=0.336, sum=0.336 (1)", - "tab": "Calibration", - "score": null - }, - "IMDB - EM (Robustness)": { - "description": "min=0.897, mean=0.897, max=0.897, sum=0.897 (1)", - "tab": "Robustness", - "score": 0.897 - }, - "IMDB - EM (Fairness)": { - "description": "min=0.936, mean=0.936, max=0.936, sum=0.936 (1)", - "tab": "Fairness", - "score": 0.936 - }, - "IMDB - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "IMDB - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "IMDB - # train": { - "description": "min=2.781, mean=2.781, max=2.781, sum=2.781 (1)", - "tab": "General information", - "score": 2.781 - }, - "IMDB - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IMDB - # prompt tokens": { - "description": "min=1751.213, mean=1751.213, max=1751.213, sum=1751.213 (1)", - "tab": "General information", - "score": 1751.213 - }, - "IMDB - # output tokens": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "IMDB - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "IMDB - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CivilComments", - "source_data": { - "dataset_name": "CivilComments", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM 
on CivilComments", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.563, - "details": { - "description": "min=0.015, mean=0.563, max=0.99, sum=10.13 (18)", - "tab": "Accuracy", - "CivilComments - ECE (10-bin)": { - "description": "min=0.13, mean=0.334, max=0.562, sum=6.012 (18)", - "tab": "Calibration", - "score": null - }, - "CivilComments - EM (Robustness)": { - "description": "min=0.012, mean=0.492, max=0.958, sum=8.864 (18)", - "tab": "Robustness", - "score": 0.4924249260198337 - }, - "CivilComments - EM (Fairness)": { - "description": "min=0.008, mean=0.505, max=0.98, sum=9.086 (18)", - "tab": "Fairness", - "score": 0.5047868294149912 - }, - "CivilComments - Denoised inference time (s)": { - "description": "9 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "CivilComments - # eval": { - "description": "min=74, mean=371.556, max=683, sum=6688 (18)", - "tab": "General information", - "score": 371.55555555555554 - }, - "CivilComments - # train": { - "description": "min=5, mean=5, max=5, sum=90 (18)", - "tab": "General information", - "score": 5.0 - }, - "CivilComments - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (18)", - "tab": "General information", - "score": 0.0 - }, - "CivilComments - # prompt tokens": { - "description": "min=404.732, mean=855.241, max=1417.567, sum=15394.339 (18)", - "tab": "General information", - "score": 855.2410378605821 - }, - "CivilComments - # output tokens": { - "description": "min=5, mean=5, max=5, sum=90 (18)", - "tab": "General information", - "score": 5.0 - }, - "CivilComments - # trials": { - "description": "min=1, mean=1, max=1, sum=18 (18)", - "tab": "General information", - "score": 1.0 - }, - "CivilComments - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Toxic fraction": { - "description": "9 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "RAFT", - "source_data": { - "dataset_name": "RAFT", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on RAFT", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.573, - "details": { - "description": "min=0.125, mean=0.573, max=0.975, sum=6.3 (11)", - "tab": "Accuracy", - "RAFT - ECE (10-bin)": { - "description": "min=0.122, mean=0.572, max=0.975, sum=6.295 (11)", - "tab": "Calibration", - "score": null - }, - "RAFT - EM (Robustness)": { - "description": "min=0, mean=0.486, max=0.95, sum=5.35 (11)", - "tab": "Robustness", - "score": 0.4863636363636364 - }, - "RAFT - EM (Fairness)": { - "description": "min=0.1, mean=0.545, max=0.975, sum=6 (11)", - "tab": "Fairness", - "score": 0.5454545454545454 - }, - "RAFT - Denoised inference time (s)": { - "description": "11 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": 
null - }, - "RAFT - # eval": { - "description": "min=40, mean=40, max=40, sum=440 (11)", - "tab": "General information", - "score": 40.0 - }, - "RAFT - # train": { - "description": "min=0.45, mean=4.552, max=5, sum=50.075 (11)", - "tab": "General information", - "score": 4.552272727272727 - }, - "RAFT - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (11)", - "tab": "General information", - "score": 0.0 - }, - "RAFT - # prompt tokens": { - "description": "min=303.675, mean=954.111, max=1882.1, sum=10495.225 (11)", - "tab": "General information", - "score": 954.1113636363635 - }, - "RAFT - # output tokens": { - "description": "min=29.575, mean=29.961, max=30, sum=329.575 (11)", - "tab": "General information", - "score": 29.961363636363636 - }, - "RAFT - # trials": { - "description": "min=1, mean=1, max=1, sum=11 (11)", - "tab": "General information", - "score": 1.0 - }, - "RAFT - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Toxic fraction": { - "description": "11 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_classic/meta/Llama-2-13B/4b0f6a03-1054-4047-82d1-53992f0378ee.json b/data/helm_classic/meta/Llama-2-13B/4b0f6a03-1054-4047-82d1-53992f0378ee.json deleted file mode 100644 index f2cd54e60..000000000 --- a/data/helm_classic/meta/Llama-2-13B/4b0f6a03-1054-4047-82d1-53992f0378ee.json +++ /dev/null @@ -1,1613 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_classic/meta_Llama-2-13B/1770834891.1472661", - "retrieved_timestamp": "1770834891.1472661", - "source_metadata": { - "source_name": "helm_classic", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama 2 13B", - "id": "meta/Llama-2-13B", - "developer": "meta", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_classic", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperform on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.823, - "details": { - "tab": "Accuracy", - "Mean win rate - Calibration": { - "description": null, - "tab": "Calibration", - "score": null - }, - "Mean win rate - Robustness": { - "description": null, - "tab": "Robustness", - "score": 0.8231701631701632 - }, - "Mean win rate - Fairness": { - "description": null, - "tab": "Fairness", - "score": 0.8078088578088578 - }, - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": null - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - }, - "Mean win rate - Bias": { - "description": null, - "tab": "Bias", - "score": 0.46948265409803874 - }, - "Mean win 
rate - Toxicity": { - "description": null, - "tab": "Toxicity", - "score": 0.4142191142191142 - }, - "Mean win rate - Summarization metrics": { - "description": null, - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.507, - "details": { - "description": "min=0.28, mean=0.507, max=0.84, sum=2.533 (5)", - "tab": "Accuracy", - "MMLU - ECE (10-bin)": { - "description": "5 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "MMLU - EM (Robustness)": { - "description": "min=0.22, mean=0.444, max=0.76, sum=2.222 (5)", - "tab": "Robustness", - "score": 0.44438596491228066 - }, - "MMLU - EM (Fairness)": { - "description": "min=0.26, mean=0.466, max=0.79, sum=2.331 (5)", - "tab": "Fairness", - "score": 0.46614035087719297 - }, - "MMLU - Denoised inference time (s)": { - "description": "5 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=397.65, mean=522.547, max=684.675, sum=2612.735 (5)", - "tab": "General information", - "score": 522.5470877192982 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "MMLU - # trials": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "BoolQ", - "source_data": { - "dataset_name": "BoolQ", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on BoolQ", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.811, - "details": { - "description": "min=0.811, mean=0.811, max=0.811, sum=0.811 (1)", - "tab": "Accuracy", - "BoolQ - ECE (10-bin)": { - "description": "min=0.116, mean=0.116, max=0.116, sum=0.116 (1)", - "tab": "Calibration", - "score": null - }, - "BoolQ - EM (Robustness)": { - "description": "min=0.753, mean=0.753, max=0.753, sum=0.753 (1)", - "tab": "Robustness", - "score": 0.753 - }, - "BoolQ - EM (Fairness)": { - "description": "min=0.732, mean=0.732, max=0.732, sum=0.732 (1)", - "tab": "Fairness", - "score": 0.732 - }, - "BoolQ - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "BoolQ - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - 
"score": 1000.0 - }, - "BoolQ - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "BoolQ - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "BoolQ - # prompt tokens": { - "description": "min=1439.447, mean=1439.447, max=1439.447, sum=1439.447 (1)", - "tab": "General information", - "score": 1439.447 - }, - "BoolQ - # output tokens": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "BoolQ - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "BoolQ - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.744, - "details": { - "description": "min=0.744, mean=0.744, max=0.744, sum=0.744 (1)", - "tab": "Accuracy", - "NarrativeQA - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "NarrativeQA - F1 (Robustness)": { - "description": "min=0.682, mean=0.682, max=0.682, sum=0.682 (1)", - "tab": "Robustness", - "score": 0.681791424099214 - }, - "NarrativeQA - F1 (Fairness)": { - "description": "min=0.657, mean=0.657, max=0.657, sum=0.657 (1)", - "tab": "Fairness", - "score": 0.6567284210865421 - }, - "NarrativeQA - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=4.414, mean=4.414, max=4.414, sum=4.414 (1)", - "tab": "General information", - "score": 4.414084507042253 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=3673.268, mean=3673.268, max=3673.268, sum=3673.268 (1)", - "tab": "General information", - "score": 3673.2676056338028 - }, - "NarrativeQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NarrativeQA - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NarrativeQA - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NarrativeQA - Stereotypes (gender)": { - "description": "min=0.417, 
mean=0.417, max=0.417, sum=0.417 (1)", - "tab": "Bias", - "score": 0.4166666666666667 - }, - "NarrativeQA - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NarrativeQA - Representation (gender)": { - "description": "min=0.218, mean=0.218, max=0.218, sum=0.218 (1)", - "tab": "Bias", - "score": 0.21830985915492954 - }, - "NarrativeQA - Toxic fraction": { - "description": "min=0.014, mean=0.014, max=0.014, sum=0.014 (1)", - "tab": "Toxicity", - "score": 0.014084507042253521 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (open-book)", - "source_data": { - "dataset_name": "NaturalQuestions (open-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (open-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.637, - "details": { - "description": "min=0.637, mean=0.637, max=0.637, sum=0.637 (1)", - "tab": "Accuracy", - "NaturalQuestions (closed-book) - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "NaturalQuestions (open-book) - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "NaturalQuestions (closed-book) - F1 (Robustness)": { - "description": "min=0.324, mean=0.324, max=0.324, sum=0.324 (1)", - "tab": "Robustness", - "score": 0.3243542710528751 - }, - "NaturalQuestions (open-book) - F1 (Robustness)": { - "description": "min=0.563, mean=0.563, max=0.563, sum=0.563 (1)", - "tab": "Robustness", - "score": 0.5631882717621935 - }, - "NaturalQuestions (closed-book) - F1 (Fairness)": { - "description": "min=0.309, mean=0.309, max=0.309, sum=0.309 (1)", - "tab": "Fairness", - "score": 0.30927547433853436 - }, - "NaturalQuestions (open-book) - F1 (Fairness)": { - "description": "min=0.58, mean=0.58, max=0.58, sum=0.58 (1)", - "tab": "Fairness", - "score": 0.5801102053016279 - }, - "NaturalQuestions (closed-book) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NaturalQuestions (open-book) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=137.383, mean=137.383, max=137.383, sum=137.383 (1)", - "tab": "General information", - "score": 137.383 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NaturalQuestions (closed-book) - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - 
"NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.831, mean=4.831, max=4.831, sum=4.831 (1)", - "tab": "General information", - "score": 4.831 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.026, mean=0.026, max=0.026, sum=0.026 (1)", - "tab": "General information", - "score": 0.026 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=2289.409, mean=2289.409, max=2289.409, sum=2289.409 (1)", - "tab": "General information", - "score": 2289.409 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=0.984, mean=0.984, max=0.984, sum=0.984 (1)", - "tab": "General information", - "score": 0.984 - }, - "NaturalQuestions (open-book) - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NaturalQuestions (closed-book) - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=0.667 (1)", - "tab": "Bias", - "score": 0.6666666666666667 - }, - "NaturalQuestions (closed-book) - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NaturalQuestions (closed-book) - Representation (race)": { - "description": "min=0.521, mean=0.521, max=0.521, sum=0.521 (1)", - "tab": "Bias", - "score": 0.5205992509363295 - }, - "NaturalQuestions (closed-book) - Representation (gender)": { - "description": "min=0.15, mean=0.15, max=0.15, sum=0.15 (1)", - "tab": "Bias", - "score": 0.15000000000000002 - }, - "NaturalQuestions (open-book) - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=0.667 (1)", - "tab": "Bias", - "score": 0.6666666666666667 - }, - "NaturalQuestions (open-book) - Stereotypes (gender)": { - "description": "min=0.5, mean=0.5, max=0.5, sum=0.5 (1)", - "tab": "Bias", - "score": 0.5 - }, - "NaturalQuestions (open-book) - Representation (race)": { - "description": "min=0.467, mean=0.467, max=0.467, sum=0.467 (1)", - "tab": "Bias", - "score": 0.4666666666666667 - }, - "NaturalQuestions (open-book) - Representation (gender)": { - "description": "min=0.357, mean=0.357, max=0.357, sum=0.357 (1)", - "tab": "Bias", - "score": 0.3571428571428571 - }, - "NaturalQuestions (closed-book) - Toxic fraction": { - "description": "min=0.001, mean=0.001, max=0.001, sum=0.001 (1)", - "tab": "Toxicity", - "score": 0.001 - }, - "NaturalQuestions (open-book) - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "QuAC", - "source_data": { - "dataset_name": "QuAC", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on QuAC", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.424, - "details": { - "description": "min=0.424, mean=0.424, max=0.424, sum=0.424 (1)", - "tab": "Accuracy", - "QuAC - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "QuAC - F1 (Robustness)": { - "description": "min=0.294, mean=0.294, max=0.294, sum=0.294 (1)", - "tab": "Robustness", 
- "score": 0.2939019916232739 - }, - "QuAC - F1 (Fairness)": { - "description": "min=0.351, mean=0.351, max=0.351, sum=0.351 (1)", - "tab": "Fairness", - "score": 0.35074944218906556 - }, - "QuAC - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "QuAC - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "QuAC - # train": { - "description": "min=3.204, mean=3.204, max=3.204, sum=3.204 (1)", - "tab": "General information", - "score": 3.204 - }, - "QuAC - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "QuAC - # prompt tokens": { - "description": "min=3617.038, mean=3617.038, max=3617.038, sum=3617.038 (1)", - "tab": "General information", - "score": 3617.038 - }, - "QuAC - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "QuAC - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "QuAC - Stereotypes (race)": { - "description": "min=0.549, mean=0.549, max=0.549, sum=0.549 (1)", - "tab": "Bias", - "score": 0.5485347985347986 - }, - "QuAC - Stereotypes (gender)": { - "description": "min=0.392, mean=0.392, max=0.392, sum=0.392 (1)", - "tab": "Bias", - "score": 0.39214643381310055 - }, - "QuAC - Representation (race)": { - "description": "min=0.325, mean=0.325, max=0.325, sum=0.325 (1)", - "tab": "Bias", - "score": 0.3248945147679325 - }, - "QuAC - Representation (gender)": { - "description": "min=0.242, mean=0.242, max=0.242, sum=0.242 (1)", - "tab": "Bias", - "score": 0.24197860962566847 - }, - "QuAC - Toxic fraction": { - "description": "min=0.004, mean=0.004, max=0.004, sum=0.004 (1)", - "tab": "Toxicity", - "score": 0.004 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "HellaSwag", - "source_data": { - "dataset_name": "HellaSwag", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on HellaSwag", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "HellaSwag - ECE (10-bin)": { - "description": "No matching runs", - "tab": "Calibration", - "score": null - }, - "HellaSwag - EM (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "HellaSwag - EM (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "HellaSwag - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "HellaSwag - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # output tokens": { - "description": "No matching runs", - 
"tab": "General information", - "score": null - }, - "HellaSwag - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "OpenbookQA - ECE (10-bin)": { - "description": "No matching runs", - "tab": "Calibration", - "score": null - }, - "OpenbookQA - EM (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "OpenbookQA - EM (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "OpenbookQA - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "OpenbookQA - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "TruthfulQA", - "source_data": { - "dataset_name": "TruthfulQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on TruthfulQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.33, - "details": { - "description": "min=0.33, mean=0.33, max=0.33, sum=0.33 (1)", - "tab": "Accuracy", - "TruthfulQA - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "TruthfulQA - EM (Robustness)": { - "description": "min=0.287, mean=0.287, max=0.287, sum=0.287 (1)", - "tab": "Robustness", - "score": 0.2874617737003058 - }, - "TruthfulQA - EM (Fairness)": { - "description": "min=0.274, mean=0.274, max=0.274, sum=0.274 (1)", - "tab": "Fairness", - "score": 0.27370030581039756 - }, - "TruthfulQA - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "TruthfulQA - # eval": { - "description": "min=654, mean=654, max=654, sum=654 (1)", - "tab": "General information", - "score": 654.0 - }, - "TruthfulQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "TruthfulQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 
(1)", - "tab": "General information", - "score": 0.0 - }, - "TruthfulQA - # prompt tokens": { - "description": "min=524.602, mean=524.602, max=524.602, sum=524.602 (1)", - "tab": "General information", - "score": 524.6024464831804 - }, - "TruthfulQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "TruthfulQA - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MS MARCO (TREC)", - "source_data": { - "dataset_name": "MS MARCO (TREC)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "NDCG@10 on MS MARCO (TREC)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "MS MARCO (regular) - RR@10 (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "MS MARCO (TREC) - NDCG@10 (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "MS MARCO (regular) - RR@10 (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "MS MARCO (TREC) - NDCG@10 (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "MS MARCO (regular) - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "MS MARCO (TREC) - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "MS MARCO (regular) - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - 
"MS MARCO (regular) - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - }, - "MS MARCO (TREC) - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CNN/DailyMail", - "source_data": { - "dataset_name": "CNN/DailyMail", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on CNN/DailyMail", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "CNN/DailyMail - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "CNN/DailyMail - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - }, - "CNN/DailyMail - SummaC": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - QAFactEval": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - BERTScore (F1)": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": 
null - }, - "CNN/DailyMail - Coverage": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - Density": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - Compression": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-faithfulness": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-relevance": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-coherence": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "XSUM", - "source_data": { - "dataset_name": "XSUM", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on XSUM", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "XSUM - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "XSUM - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - }, - "XSUM - SummaC": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - QAFactEval": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - BERTScore (F1)": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - Coverage": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - Density": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - Compression": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-faithfulness": { - "description": "No matching runs", - "tab": "Summarization metrics", - 
"score": null - }, - "XSUM - HumanEval-relevance": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-coherence": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "IMDB", - "source_data": { - "dataset_name": "IMDB", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on IMDB", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.962, - "details": { - "description": "min=0.962, mean=0.962, max=0.962, sum=0.962 (1)", - "tab": "Accuracy", - "IMDB - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "IMDB - EM (Robustness)": { - "description": "min=0.954, mean=0.954, max=0.954, sum=0.954 (1)", - "tab": "Robustness", - "score": 0.954 - }, - "IMDB - EM (Fairness)": { - "description": "min=0.957, mean=0.957, max=0.957, sum=0.957 (1)", - "tab": "Fairness", - "score": 0.957 - }, - "IMDB - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "IMDB - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "IMDB - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "IMDB - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IMDB - # prompt tokens": { - "description": "min=2897.409, mean=2897.409, max=2897.409, sum=2897.409 (1)", - "tab": "General information", - "score": 2897.409 - }, - "IMDB - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "IMDB - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "IMDB - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CivilComments", - "source_data": { - "dataset_name": "CivilComments", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on CivilComments", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.588, - "details": { - "description": "min=0.087, mean=0.588, max=0.968, sum=10.579 (18)", 
- "tab": "Accuracy", - "CivilComments - ECE (10-bin)": { - "description": "min=0.098, mean=0.323, max=0.788, sum=4.519 (14)", - "tab": "Calibration", - "score": null - }, - "CivilComments - EM (Robustness)": { - "description": "min=0.022, mean=0.47, max=0.958, sum=8.468 (18)", - "tab": "Robustness", - "score": 0.47042658911281887 - }, - "CivilComments - EM (Fairness)": { - "description": "min=0.006, mean=0.489, max=0.968, sum=8.81 (18)", - "tab": "Fairness", - "score": 0.4894481246425394 - }, - "CivilComments - Denoised inference time (s)": { - "description": "9 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "CivilComments - # eval": { - "description": "min=74, mean=371.556, max=683, sum=6688 (18)", - "tab": "General information", - "score": 371.55555555555554 - }, - "CivilComments - # train": { - "description": "min=5, mean=5, max=5, sum=90 (18)", - "tab": "General information", - "score": 5.0 - }, - "CivilComments - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (18)", - "tab": "General information", - "score": 0.0 - }, - "CivilComments - # prompt tokens": { - "description": "min=404.732, mean=855.241, max=1417.567, sum=15394.339 (18)", - "tab": "General information", - "score": 855.2410378605821 - }, - "CivilComments - # output tokens": { - "description": "min=1, mean=2.692, max=5, sum=48.448 (18)", - "tab": "General information", - "score": 2.6915388744093813 - }, - "CivilComments - # trials": { - "description": "min=1, mean=1, max=1, sum=18 (18)", - "tab": "General information", - "score": 1.0 - }, - "CivilComments - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Toxic fraction": { - "description": "9 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "RAFT", - "source_data": { - "dataset_name": "RAFT", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on RAFT", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.707, - "details": { - "description": "min=0.1, mean=0.707, max=0.975, sum=7.775 (11)", - "tab": "Accuracy", - "RAFT - ECE (10-bin)": { - "description": "11 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "RAFT - EM (Robustness)": { - "description": "min=0.05, mean=0.652, max=0.95, sum=7.175 (11)", - "tab": "Robustness", - "score": 0.6522727272727272 - }, - "RAFT - EM (Fairness)": { - "description": "min=0.075, mean=0.673, max=0.975, sum=7.4 (11)", - "tab": "Fairness", - "score": 0.6727272727272727 - }, - "RAFT - Denoised inference time (s)": { - "description": "11 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "RAFT - # eval": { - "description": "min=40, mean=40, max=40, sum=440 (11)", - "tab": "General information", - "score": 40.0 - }, - "RAFT - # train": { - "description": "min=2.575, mean=4.78, max=5, 
sum=52.575 (11)", - "tab": "General information", - "score": 4.779545454545455 - }, - "RAFT - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (11)", - "tab": "General information", - "score": 0.0 - }, - "RAFT - # prompt tokens": { - "description": "min=303.675, mean=1153.852, max=3623.9, sum=12692.375 (11)", - "tab": "General information", - "score": 1153.8522727272727 - }, - "RAFT - # output tokens": { - "description": "min=1, mean=1, max=1, sum=11 (11)", - "tab": "General information", - "score": 1.0 - }, - "RAFT - # trials": { - "description": "min=1, mean=1, max=1, sum=11 (11)", - "tab": "General information", - "score": 1.0 - }, - "RAFT - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Toxic fraction": { - "description": "11 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_classic/meta/Llama-2-70B/78bc128a-6e53-4086-9498-2b3428e1d884.json b/data/helm_classic/meta/Llama-2-70B/78bc128a-6e53-4086-9498-2b3428e1d884.json deleted file mode 100644 index de031e670..000000000 --- a/data/helm_classic/meta/Llama-2-70B/78bc128a-6e53-4086-9498-2b3428e1d884.json +++ /dev/null @@ -1,1613 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_classic/meta_Llama-2-70B/1770834891.1472661", - "retrieved_timestamp": "1770834891.1472661", - "source_metadata": { - "source_name": "helm_classic", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama 2 70B", - "id": "meta/Llama-2-70B", - "developer": "meta", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_classic", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperform on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.944, - "details": { - "tab": "Accuracy", - "Mean win rate - Calibration": { - "description": null, - "tab": "Calibration", - "score": null - }, - "Mean win rate - Robustness": { - "description": null, - "tab": "Robustness", - "score": 0.9649184149184149 - }, - "Mean win rate - Fairness": { - "description": null, - "tab": "Fairness", - "score": 0.9587645687645687 - }, - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": null - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - }, - "Mean win rate - Bias": { - "description": null, - "tab": "Bias", - "score": 0.5375895851224799 - }, - "Mean win rate - Toxicity": { - "description": null, - "tab": "Toxicity", - "score": 0.643006993006993 - }, - "Mean win rate - Summarization metrics": { - "description": null, - "tab": "Summarization metrics", - "score": null - } - } - }, - 
"generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.582, - "details": { - "description": "min=0.29, mean=0.582, max=0.92, sum=2.909 (5)", - "tab": "Accuracy", - "MMLU - ECE (10-bin)": { - "description": "5 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "MMLU - EM (Robustness)": { - "description": "min=0.22, mean=0.545, max=0.9, sum=2.726 (5)", - "tab": "Robustness", - "score": 0.5451929824561403 - }, - "MMLU - EM (Fairness)": { - "description": "min=0.26, mean=0.557, max=0.91, sum=2.786 (5)", - "tab": "Fairness", - "score": 0.5571929824561404 - }, - "MMLU - Denoised inference time (s)": { - "description": "5 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=397.65, mean=522.547, max=684.675, sum=2612.735 (5)", - "tab": "General information", - "score": 522.5470877192982 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "MMLU - # trials": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "BoolQ", - "source_data": { - "dataset_name": "BoolQ", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on BoolQ", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.886, - "details": { - "description": "min=0.886, mean=0.886, max=0.886, sum=0.886 (1)", - "tab": "Accuracy", - "BoolQ - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "BoolQ - EM (Robustness)": { - "description": "min=0.863, mean=0.863, max=0.863, sum=0.863 (1)", - "tab": "Robustness", - "score": 0.863 - }, - "BoolQ - EM (Fairness)": { - "description": "min=0.859, mean=0.859, max=0.859, sum=0.859 (1)", - "tab": "Fairness", - "score": 0.859 - }, - "BoolQ - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "BoolQ - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "BoolQ - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "BoolQ - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General 
information", - "score": 0.0 - }, - "BoolQ - # prompt tokens": { - "description": "min=1439.447, mean=1439.447, max=1439.447, sum=1439.447 (1)", - "tab": "General information", - "score": 1439.447 - }, - "BoolQ - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "BoolQ - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "BoolQ - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.77, - "details": { - "description": "min=0.77, mean=0.77, max=0.77, sum=0.77 (1)", - "tab": "Accuracy", - "NarrativeQA - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "NarrativeQA - F1 (Robustness)": { - "description": "min=0.722, mean=0.722, max=0.722, sum=0.722 (1)", - "tab": "Robustness", - "score": 0.7215317388650366 - }, - "NarrativeQA - F1 (Fairness)": { - "description": "min=0.709, mean=0.709, max=0.709, sum=0.709 (1)", - "tab": "Fairness", - "score": 0.709497495841271 - }, - "NarrativeQA - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=4.414, mean=4.414, max=4.414, sum=4.414 (1)", - "tab": "General information", - "score": 4.414084507042253 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=3673.268, mean=3673.268, max=3673.268, sum=3673.268 (1)", - "tab": "General information", - "score": 3673.2676056338028 - }, - "NarrativeQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NarrativeQA - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NarrativeQA - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NarrativeQA - Stereotypes (gender)": { - "description": "min=0.5, mean=0.5, max=0.5, sum=0.5 (1)", - "tab": "Bias", - "score": 0.5 - }, - "NarrativeQA - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NarrativeQA - Representation (gender)": { - "description": "min=0.187, 
mean=0.187, max=0.187, sum=0.187 (1)", - "tab": "Bias", - "score": 0.18695652173913044 - }, - "NarrativeQA - Toxic fraction": { - "description": "min=0.008, mean=0.008, max=0.008, sum=0.008 (1)", - "tab": "Toxicity", - "score": 0.008450704225352112 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (open-book)", - "source_data": { - "dataset_name": "NaturalQuestions (open-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (open-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.674, - "details": { - "description": "min=0.674, mean=0.674, max=0.674, sum=0.674 (1)", - "tab": "Accuracy", - "NaturalQuestions (closed-book) - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "NaturalQuestions (open-book) - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "NaturalQuestions (closed-book) - F1 (Robustness)": { - "description": "min=0.42, mean=0.42, max=0.42, sum=0.42 (1)", - "tab": "Robustness", - "score": 0.42009390434309946 - }, - "NaturalQuestions (open-book) - F1 (Robustness)": { - "description": "min=0.639, mean=0.639, max=0.639, sum=0.639 (1)", - "tab": "Robustness", - "score": 0.6385366212170214 - }, - "NaturalQuestions (closed-book) - F1 (Fairness)": { - "description": "min=0.4, mean=0.4, max=0.4, sum=0.4 (1)", - "tab": "Fairness", - "score": 0.3997609830959401 - }, - "NaturalQuestions (open-book) - F1 (Fairness)": { - "description": "min=0.637, mean=0.637, max=0.637, sum=0.637 (1)", - "tab": "Fairness", - "score": 0.6365724774019619 - }, - "NaturalQuestions (closed-book) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NaturalQuestions (open-book) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=137.383, mean=137.383, max=137.383, sum=137.383 (1)", - "tab": "General information", - "score": 137.383 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NaturalQuestions (closed-book) - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.831, mean=4.831, max=4.831, sum=4.831 (1)", - 
"tab": "General information", - "score": 4.831 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.026, mean=0.026, max=0.026, sum=0.026 (1)", - "tab": "General information", - "score": 0.026 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=2289.409, mean=2289.409, max=2289.409, sum=2289.409 (1)", - "tab": "General information", - "score": 2289.409 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=0.998, mean=0.998, max=0.998, sum=0.998 (1)", - "tab": "General information", - "score": 0.998 - }, - "NaturalQuestions (open-book) - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NaturalQuestions (closed-book) - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=0.667 (1)", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "NaturalQuestions (closed-book) - Stereotypes (gender)": { - "description": "min=0.167, mean=0.167, max=0.167, sum=0.167 (1)", - "tab": "Bias", - "score": 0.16666666666666666 - }, - "NaturalQuestions (closed-book) - Representation (race)": { - "description": "min=0.524, mean=0.524, max=0.524, sum=0.524 (1)", - "tab": "Bias", - "score": 0.5238095238095237 - }, - "NaturalQuestions (closed-book) - Representation (gender)": { - "description": "min=0.312, mean=0.312, max=0.312, sum=0.312 (1)", - "tab": "Bias", - "score": 0.3125 - }, - "NaturalQuestions (open-book) - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NaturalQuestions (open-book) - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NaturalQuestions (open-book) - Representation (race)": { - "description": "min=0.566, mean=0.566, max=0.566, sum=0.566 (1)", - "tab": "Bias", - "score": 0.5655430711610487 - }, - "NaturalQuestions (open-book) - Representation (gender)": { - "description": "min=0.184, mean=0.184, max=0.184, sum=0.184 (1)", - "tab": "Bias", - "score": 0.1842105263157895 - }, - "NaturalQuestions (closed-book) - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "Toxicity", - "score": 0.0 - }, - "NaturalQuestions (open-book) - Toxic fraction": { - "description": "min=0.002, mean=0.002, max=0.002, sum=0.002 (1)", - "tab": "Toxicity", - "score": 0.002 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "QuAC", - "source_data": { - "dataset_name": "QuAC", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on QuAC", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.484, - "details": { - "description": "min=0.484, mean=0.484, max=0.484, sum=0.484 (1)", - "tab": "Accuracy", - "QuAC - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "QuAC - F1 (Robustness)": { - "description": "min=0.362, mean=0.362, max=0.362, sum=0.362 (1)", - "tab": "Robustness", - "score": 0.36189050917141447 - }, - "QuAC - F1 (Fairness)": { - "description": "min=0.414, mean=0.414, max=0.414, sum=0.414 (1)", - "tab": "Fairness", - "score": 0.4139340894194124 - }, - "QuAC - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - 
"score": null - }, - "QuAC - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "QuAC - # train": { - "description": "min=3.204, mean=3.204, max=3.204, sum=3.204 (1)", - "tab": "General information", - "score": 3.204 - }, - "QuAC - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "QuAC - # prompt tokens": { - "description": "min=3617.038, mean=3617.038, max=3617.038, sum=3617.038 (1)", - "tab": "General information", - "score": 3617.038 - }, - "QuAC - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "QuAC - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "QuAC - Stereotypes (race)": { - "description": "min=0.611, mean=0.611, max=0.611, sum=0.611 (1)", - "tab": "Bias", - "score": 0.6111111111111112 - }, - "QuAC - Stereotypes (gender)": { - "description": "min=0.403, mean=0.403, max=0.403, sum=0.403 (1)", - "tab": "Bias", - "score": 0.4025455927051672 - }, - "QuAC - Representation (race)": { - "description": "min=0.272, mean=0.272, max=0.272, sum=0.272 (1)", - "tab": "Bias", - "score": 0.27183271832718325 - }, - "QuAC - Representation (gender)": { - "description": "min=0.239, mean=0.239, max=0.239, sum=0.239 (1)", - "tab": "Bias", - "score": 0.23913043478260873 - }, - "QuAC - Toxic fraction": { - "description": "min=0.001, mean=0.001, max=0.001, sum=0.001 (1)", - "tab": "Toxicity", - "score": 0.001 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "HellaSwag", - "source_data": { - "dataset_name": "HellaSwag", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on HellaSwag", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "HellaSwag - ECE (10-bin)": { - "description": "No matching runs", - "tab": "Calibration", - "score": null - }, - "HellaSwag - EM (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "HellaSwag - EM (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "HellaSwag - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "HellaSwag - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": 
"OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "OpenbookQA - ECE (10-bin)": { - "description": "No matching runs", - "tab": "Calibration", - "score": null - }, - "OpenbookQA - EM (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "OpenbookQA - EM (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "OpenbookQA - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "OpenbookQA - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "TruthfulQA", - "source_data": { - "dataset_name": "TruthfulQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on TruthfulQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.554, - "details": { - "description": "min=0.554, mean=0.554, max=0.554, sum=0.554 (1)", - "tab": "Accuracy", - "TruthfulQA - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "TruthfulQA - EM (Robustness)": { - "description": "min=0.468, mean=0.468, max=0.468, sum=0.468 (1)", - "tab": "Robustness", - "score": 0.46788990825688076 - }, - "TruthfulQA - EM (Fairness)": { - "description": "min=0.434, mean=0.434, max=0.434, sum=0.434 (1)", - "tab": "Fairness", - "score": 0.43425076452599387 - }, - "TruthfulQA - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "TruthfulQA - # eval": { - "description": "min=654, mean=654, max=654, sum=654 (1)", - "tab": "General information", - "score": 654.0 - }, - "TruthfulQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "TruthfulQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "TruthfulQA - # prompt tokens": { - "description": "min=524.602, mean=524.602, max=524.602, sum=524.602 (1)", - "tab": "General information", - "score": 524.6024464831804 - }, - "TruthfulQA - # output tokens": { - "description": "min=1, mean=1, 
max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "TruthfulQA - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MS MARCO (TREC)", - "source_data": { - "dataset_name": "MS MARCO (TREC)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "NDCG@10 on MS MARCO (TREC)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "MS MARCO (regular) - RR@10 (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "MS MARCO (TREC) - NDCG@10 (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "MS MARCO (regular) - RR@10 (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "MS MARCO (TREC) - NDCG@10 (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "MS MARCO (regular) - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "MS MARCO (TREC) - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "MS MARCO (regular) - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (gender)": { - 
"description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - }, - "MS MARCO (TREC) - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CNN/DailyMail", - "source_data": { - "dataset_name": "CNN/DailyMail", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on CNN/DailyMail", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "CNN/DailyMail - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "CNN/DailyMail - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - }, - "CNN/DailyMail - SummaC": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - QAFactEval": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - BERTScore (F1)": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - Coverage": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - Density": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - Compression": { - 
"description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-faithfulness": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-relevance": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-coherence": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "XSUM", - "source_data": { - "dataset_name": "XSUM", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on XSUM", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "XSUM - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "XSUM - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - }, - "XSUM - SummaC": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - QAFactEval": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - BERTScore (F1)": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - Coverage": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - Density": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - Compression": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-faithfulness": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-relevance": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-coherence": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": 
{ - "additional_details": {} - } - }, - { - "evaluation_name": "IMDB", - "source_data": { - "dataset_name": "IMDB", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on IMDB", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.961, - "details": { - "description": "min=0.961, mean=0.961, max=0.961, sum=0.961 (1)", - "tab": "Accuracy", - "IMDB - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "IMDB - EM (Robustness)": { - "description": "min=0.949, mean=0.949, max=0.949, sum=0.949 (1)", - "tab": "Robustness", - "score": 0.949 - }, - "IMDB - EM (Fairness)": { - "description": "min=0.954, mean=0.954, max=0.954, sum=0.954 (1)", - "tab": "Fairness", - "score": 0.954 - }, - "IMDB - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "IMDB - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "IMDB - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "IMDB - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IMDB - # prompt tokens": { - "description": "min=2897.409, mean=2897.409, max=2897.409, sum=2897.409 (1)", - "tab": "General information", - "score": 2897.409 - }, - "IMDB - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "IMDB - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "IMDB - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CivilComments", - "source_data": { - "dataset_name": "CivilComments", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on CivilComments", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.652, - "details": { - "description": "min=0.337, mean=0.652, max=0.919, sum=11.733 (18)", - "tab": "Accuracy", - "CivilComments - ECE (10-bin)": { - "description": "9 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "CivilComments - EM (Robustness)": { - "description": "min=0.272, mean=0.59, max=0.884, sum=10.619 (18)", - "tab": "Robustness", - 
"score": 0.5899239945803259 - }, - "CivilComments - EM (Fairness)": { - "description": "min=0.125, mean=0.551, max=0.892, sum=9.924 (18)", - "tab": "Fairness", - "score": 0.551334119704094 - }, - "CivilComments - Denoised inference time (s)": { - "description": "9 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "CivilComments - # eval": { - "description": "min=74, mean=371.556, max=683, sum=6688 (18)", - "tab": "General information", - "score": 371.55555555555554 - }, - "CivilComments - # train": { - "description": "min=5, mean=5, max=5, sum=90 (18)", - "tab": "General information", - "score": 5.0 - }, - "CivilComments - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (18)", - "tab": "General information", - "score": 0.0 - }, - "CivilComments - # prompt tokens": { - "description": "min=404.732, mean=855.241, max=1417.567, sum=15394.339 (18)", - "tab": "General information", - "score": 855.2410378605821 - }, - "CivilComments - # output tokens": { - "description": "min=1, mean=1, max=1, sum=18 (18)", - "tab": "General information", - "score": 1.0 - }, - "CivilComments - # trials": { - "description": "min=1, mean=1, max=1, sum=18 (18)", - "tab": "General information", - "score": 1.0 - }, - "CivilComments - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Toxic fraction": { - "description": "9 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "RAFT", - "source_data": { - "dataset_name": "RAFT", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on RAFT", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.727, - "details": { - "description": "min=0.125, mean=0.727, max=0.975, sum=8 (11)", - "tab": "Accuracy", - "RAFT - ECE (10-bin)": { - "description": "11 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "RAFT - EM (Robustness)": { - "description": "min=0.075, mean=0.673, max=0.975, sum=7.4 (11)", - "tab": "Robustness", - "score": 0.6727272727272727 - }, - "RAFT - EM (Fairness)": { - "description": "min=0.1, mean=0.7, max=0.975, sum=7.7 (11)", - "tab": "Fairness", - "score": 0.7 - }, - "RAFT - Denoised inference time (s)": { - "description": "11 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "RAFT - # eval": { - "description": "min=40, mean=40, max=40, sum=440 (11)", - "tab": "General information", - "score": 40.0 - }, - "RAFT - # train": { - "description": "min=2.575, mean=4.78, max=5, sum=52.575 (11)", - "tab": "General information", - "score": 4.779545454545455 - }, - "RAFT - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (11)", - "tab": "General information", - "score": 0.0 - }, - "RAFT - # prompt tokens": { - "description": "min=303.675, mean=1153.852, max=3623.9, sum=12692.375 (11)", - "tab": "General information", 
- "score": 1153.8522727272727 - }, - "RAFT - # output tokens": { - "description": "min=1, mean=1, max=1, sum=11 (11)", - "tab": "General information", - "score": 1.0 - }, - "RAFT - # trials": { - "description": "min=1, mean=1, max=1, sum=11 (11)", - "tab": "General information", - "score": 1.0 - }, - "RAFT - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Toxic fraction": { - "description": "11 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_classic/meta/Llama-2-7B/2be7887e-6c91-437c-bbfc-8b68de3330da.json b/data/helm_classic/meta/Llama-2-7B/2be7887e-6c91-437c-bbfc-8b68de3330da.json deleted file mode 100644 index eac315fea..000000000 --- a/data/helm_classic/meta/Llama-2-7B/2be7887e-6c91-437c-bbfc-8b68de3330da.json +++ /dev/null @@ -1,1613 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_classic/meta_Llama-2-7B/1770834891.1472661", - "retrieved_timestamp": "1770834891.1472661", - "source_metadata": { - "source_name": "helm_classic", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama 2 7B", - "id": "meta/Llama-2-7B", - "developer": "meta", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_classic", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperform on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.607, - "details": { - "tab": "Accuracy", - "Mean win rate - Calibration": { - "description": null, - "tab": "Calibration", - "score": null - }, - "Mean win rate - Robustness": { - "description": null, - "tab": "Robustness", - "score": 0.6437529137529138 - }, - "Mean win rate - Fairness": { - "description": null, - "tab": "Fairness", - "score": 0.6102097902097903 - }, - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": null - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - }, - "Mean win rate - Bias": { - "description": null, - "tab": "Bias", - "score": 0.4576728062932413 - }, - "Mean win rate - Toxicity": { - "description": null, - "tab": "Toxicity", - "score": 0.8121794871794872 - }, - "Mean win rate - Summarization metrics": { - "description": null, - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.431, - "details": { - "description": "min=0.28, mean=0.431, max=0.64, sum=2.153 (5)", - "tab": "Accuracy", - "MMLU - ECE (10-bin)": { - "description": "5 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "MMLU - EM (Robustness)": { - "description": "min=0.22, mean=0.373, max=0.57, sum=1.866 (5)", - "tab": "Robustness", - "score": 0.37312280701754386 - }, - "MMLU - EM (Fairness)": { - "description": "min=0.26, mean=0.392, max=0.59, sum=1.961 (5)", - "tab": "Fairness", - "score": 0.392140350877193 - }, - "MMLU - Denoised inference time (s)": { - "description": "5 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=397.65, mean=522.547, max=684.675, sum=2612.735 (5)", - "tab": "General information", - "score": 522.5470877192982 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "MMLU - # trials": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "BoolQ", - "source_data": { - "dataset_name": "BoolQ", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on BoolQ", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.762, - "details": { - "description": "min=0.762, mean=0.762, max=0.762, sum=0.762 (1)", - "tab": "Accuracy", - "BoolQ - ECE (10-bin)": { - "description": "min=0.215, mean=0.215, max=0.215, sum=0.215 (1)", - "tab": "Calibration", - "score": null - }, - "BoolQ - EM (Robustness)": { - "description": "min=0.676, mean=0.676, max=0.676, sum=0.676 (1)", - "tab": "Robustness", - "score": 0.676 - }, - "BoolQ - EM (Fairness)": { - "description": "min=0.706, mean=0.706, max=0.706, sum=0.706 (1)", - "tab": "Fairness", - "score": 0.706 - }, - "BoolQ - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "BoolQ - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "BoolQ - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "BoolQ - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "BoolQ - # prompt tokens": { - "description": "min=1439.447, mean=1439.447, max=1439.447, sum=1439.447 (1)", - "tab": "General information", - "score": 1439.447 - }, - "BoolQ - # output tokens": { - "description": "min=1.296, mean=1.296, max=1.296, sum=1.296 (1)", - "tab": "General information", - "score": 1.296 - }, 
- "BoolQ - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "BoolQ - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.691, - "details": { - "description": "min=0.691, mean=0.691, max=0.691, sum=0.691 (1)", - "tab": "Accuracy", - "NarrativeQA - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "NarrativeQA - F1 (Robustness)": { - "description": "min=0.573, mean=0.573, max=0.573, sum=0.573 (1)", - "tab": "Robustness", - "score": 0.5726018964106345 - }, - "NarrativeQA - F1 (Fairness)": { - "description": "min=0.596, mean=0.596, max=0.596, sum=0.596 (1)", - "tab": "Fairness", - "score": 0.5960691234215144 - }, - "NarrativeQA - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=4.414, mean=4.414, max=4.414, sum=4.414 (1)", - "tab": "General information", - "score": 4.414084507042253 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=3673.268, mean=3673.268, max=3673.268, sum=3673.268 (1)", - "tab": "General information", - "score": 3673.2676056338028 - }, - "NarrativeQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NarrativeQA - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NarrativeQA - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NarrativeQA - Stereotypes (gender)": { - "description": "min=0.333, mean=0.333, max=0.333, sum=0.333 (1)", - "tab": "Bias", - "score": 0.3333333333333333 - }, - "NarrativeQA - Representation (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=0.667 (1)", - "tab": "Bias", - "score": 0.6666666666666667 - }, - "NarrativeQA - Representation (gender)": { - "description": "min=0.203, mean=0.203, max=0.203, sum=0.203 (1)", - "tab": "Bias", - "score": 0.20348837209302328 - }, - "NarrativeQA - Toxic fraction": { - "description": "min=0.011, mean=0.011, max=0.011, sum=0.011 (1)", - "tab": "Toxicity", - "score": 0.011267605633802818 - 
} - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (open-book)", - "source_data": { - "dataset_name": "NaturalQuestions (open-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (open-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.611, - "details": { - "description": "min=0.611, mean=0.611, max=0.611, sum=0.611 (1)", - "tab": "Accuracy", - "NaturalQuestions (closed-book) - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "NaturalQuestions (open-book) - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "NaturalQuestions (closed-book) - F1 (Robustness)": { - "description": "min=0.261, mean=0.261, max=0.261, sum=0.261 (1)", - "tab": "Robustness", - "score": 0.2606038875824225 - }, - "NaturalQuestions (open-book) - F1 (Robustness)": { - "description": "min=0.501, mean=0.501, max=0.501, sum=0.501 (1)", - "tab": "Robustness", - "score": 0.5010811862440044 - }, - "NaturalQuestions (closed-book) - F1 (Fairness)": { - "description": "min=0.264, mean=0.264, max=0.264, sum=0.264 (1)", - "tab": "Fairness", - "score": 0.26403309290317406 - }, - "NaturalQuestions (open-book) - F1 (Fairness)": { - "description": "min=0.55, mean=0.55, max=0.55, sum=0.55 (1)", - "tab": "Fairness", - "score": 0.5499198184166533 - }, - "NaturalQuestions (closed-book) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NaturalQuestions (open-book) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=137.383, mean=137.383, max=137.383, sum=137.383 (1)", - "tab": "General information", - "score": 137.383 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=0.998, mean=0.998, max=0.998, sum=0.998 (1)", - "tab": "General information", - "score": 0.998 - }, - "NaturalQuestions (closed-book) - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.831, mean=4.831, max=4.831, sum=4.831 (1)", - "tab": "General information", - "score": 4.831 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.026, mean=0.026, max=0.026, sum=0.026 (1)", - "tab": "General information", - "score": 0.026 - }, - 
"NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=2289.409, mean=2289.409, max=2289.409, sum=2289.409 (1)", - "tab": "General information", - "score": 2289.409 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=0.955, mean=0.955, max=0.955, sum=0.955 (1)", - "tab": "General information", - "score": 0.955 - }, - "NaturalQuestions (open-book) - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NaturalQuestions (closed-book) - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NaturalQuestions (closed-book) - Stereotypes (gender)": { - "description": "min=0.5, mean=0.5, max=0.5, sum=0.5 (1)", - "tab": "Bias", - "score": 0.5 - }, - "NaturalQuestions (closed-book) - Representation (race)": { - "description": "min=0.381, mean=0.381, max=0.381, sum=0.381 (1)", - "tab": "Bias", - "score": 0.38095238095238093 - }, - "NaturalQuestions (closed-book) - Representation (gender)": { - "description": "min=0.182, mean=0.182, max=0.182, sum=0.182 (1)", - "tab": "Bias", - "score": 0.18181818181818182 - }, - "NaturalQuestions (open-book) - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=0.667 (1)", - "tab": "Bias", - "score": 0.6666666666666667 - }, - "NaturalQuestions (open-book) - Stereotypes (gender)": { - "description": "min=0.5, mean=0.5, max=0.5, sum=0.5 (1)", - "tab": "Bias", - "score": 0.5 - }, - "NaturalQuestions (open-book) - Representation (race)": { - "description": "min=0.577, mean=0.577, max=0.577, sum=0.577 (1)", - "tab": "Bias", - "score": 0.5770114942528735 - }, - "NaturalQuestions (open-book) - Representation (gender)": { - "description": "min=0.486, mean=0.486, max=0.486, sum=0.486 (1)", - "tab": "Bias", - "score": 0.48630136986301375 - }, - "NaturalQuestions (closed-book) - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "Toxicity", - "score": 0.0 - }, - "NaturalQuestions (open-book) - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "QuAC", - "source_data": { - "dataset_name": "QuAC", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on QuAC", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.406, - "details": { - "description": "min=0.406, mean=0.406, max=0.406, sum=0.406 (1)", - "tab": "Accuracy", - "QuAC - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "QuAC - F1 (Robustness)": { - "description": "min=0.271, mean=0.271, max=0.271, sum=0.271 (1)", - "tab": "Robustness", - "score": 0.27069315379336467 - }, - "QuAC - F1 (Fairness)": { - "description": "min=0.321, mean=0.321, max=0.321, sum=0.321 (1)", - "tab": "Fairness", - "score": 0.32122644280851614 - }, - "QuAC - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "QuAC - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "QuAC - # train": { - "description": "min=3.204, 
mean=3.204, max=3.204, sum=3.204 (1)", - "tab": "General information", - "score": 3.204 - }, - "QuAC - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "QuAC - # prompt tokens": { - "description": "min=3617.038, mean=3617.038, max=3617.038, sum=3617.038 (1)", - "tab": "General information", - "score": 3617.038 - }, - "QuAC - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "QuAC - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "QuAC - Stereotypes (race)": { - "description": "min=0.583, mean=0.583, max=0.583, sum=0.583 (1)", - "tab": "Bias", - "score": 0.5833333333333334 - }, - "QuAC - Stereotypes (gender)": { - "description": "min=0.426, mean=0.426, max=0.426, sum=0.426 (1)", - "tab": "Bias", - "score": 0.4264652792029702 - }, - "QuAC - Representation (race)": { - "description": "min=0.283, mean=0.283, max=0.283, sum=0.283 (1)", - "tab": "Bias", - "score": 0.2831541218637993 - }, - "QuAC - Representation (gender)": { - "description": "min=0.231, mean=0.231, max=0.231, sum=0.231 (1)", - "tab": "Bias", - "score": 0.23093681917211328 - }, - "QuAC - Toxic fraction": { - "description": "min=0.001, mean=0.001, max=0.001, sum=0.001 (1)", - "tab": "Toxicity", - "score": 0.001 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "HellaSwag", - "source_data": { - "dataset_name": "HellaSwag", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on HellaSwag", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "HellaSwag - ECE (10-bin)": { - "description": "No matching runs", - "tab": "Calibration", - "score": null - }, - "HellaSwag - EM (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "HellaSwag - EM (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "HellaSwag - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "HellaSwag - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - 
"evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "OpenbookQA - ECE (10-bin)": { - "description": "No matching runs", - "tab": "Calibration", - "score": null - }, - "OpenbookQA - EM (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "OpenbookQA - EM (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "OpenbookQA - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "OpenbookQA - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "TruthfulQA", - "source_data": { - "dataset_name": "TruthfulQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on TruthfulQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.272, - "details": { - "description": "min=0.272, mean=0.272, max=0.272, sum=0.272 (1)", - "tab": "Accuracy", - "TruthfulQA - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "TruthfulQA - EM (Robustness)": { - "description": "min=0.234, mean=0.234, max=0.234, sum=0.234 (1)", - "tab": "Robustness", - "score": 0.23394495412844038 - }, - "TruthfulQA - EM (Fairness)": { - "description": "min=0.223, mean=0.223, max=0.223, sum=0.223 (1)", - "tab": "Fairness", - "score": 0.22324159021406728 - }, - "TruthfulQA - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "TruthfulQA - # eval": { - "description": "min=654, mean=654, max=654, sum=654 (1)", - "tab": "General information", - "score": 654.0 - }, - "TruthfulQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "TruthfulQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "TruthfulQA - # prompt tokens": { - "description": "min=524.602, mean=524.602, max=524.602, sum=524.602 (1)", - "tab": "General information", - "score": 524.6024464831804 - }, - "TruthfulQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "TruthfulQA - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 
- } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MS MARCO (TREC)", - "source_data": { - "dataset_name": "MS MARCO (TREC)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "NDCG@10 on MS MARCO (TREC)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "MS MARCO (regular) - RR@10 (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "MS MARCO (TREC) - NDCG@10 (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "MS MARCO (regular) - RR@10 (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "MS MARCO (TREC) - NDCG@10 (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "MS MARCO (regular) - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "MS MARCO (TREC) - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "MS MARCO (regular) - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO 
(TREC) - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - }, - "MS MARCO (TREC) - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CNN/DailyMail", - "source_data": { - "dataset_name": "CNN/DailyMail", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on CNN/DailyMail", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "CNN/DailyMail - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "CNN/DailyMail - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - }, - "CNN/DailyMail - SummaC": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - QAFactEval": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - BERTScore (F1)": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - Coverage": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - Density": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - Compression": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-faithfulness": { - "description": "No matching runs", - "tab": "Summarization metrics", - 
"score": null - }, - "CNN/DailyMail - HumanEval-relevance": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-coherence": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "XSUM", - "source_data": { - "dataset_name": "XSUM", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on XSUM", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "XSUM - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "XSUM - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - }, - "XSUM - SummaC": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - QAFactEval": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - BERTScore (F1)": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - Coverage": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - Density": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - Compression": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-faithfulness": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-relevance": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-coherence": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "IMDB", - "source_data": { - "dataset_name": "IMDB", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on IMDB", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.907, - "details": { - "description": "min=0.907, mean=0.907, max=0.907, sum=0.907 (1)", - "tab": "Accuracy", - "IMDB - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "IMDB - EM (Robustness)": { - "description": "min=0.808, mean=0.808, max=0.808, sum=0.808 (1)", - "tab": "Robustness", - "score": 0.808 - }, - "IMDB - EM (Fairness)": { - "description": "min=0.871, mean=0.871, max=0.871, sum=0.871 (1)", - "tab": "Fairness", - "score": 0.871 - }, - "IMDB - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "IMDB - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "IMDB - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "IMDB - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IMDB - # prompt tokens": { - "description": "min=2897.409, mean=2897.409, max=2897.409, sum=2897.409 (1)", - "tab": "General information", - "score": 2897.409 - }, - "IMDB - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "IMDB - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "IMDB - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CivilComments", - "source_data": { - "dataset_name": "CivilComments", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on CivilComments", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.562, - "details": { - "description": "min=0.025, mean=0.562, max=1, sum=10.108 (18)", - "tab": "Accuracy", - "CivilComments - ECE (10-bin)": { - "description": "9 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "CivilComments - EM (Robustness)": { - "description": "min=0.025, mean=0.516, max=0.989, sum=9.28 (18)", - "tab": "Robustness", - "score": 0.5155612610622284 - }, - "CivilComments - EM (Fairness)": { - "description": "min=0.01, mean=0.503, max=0.998, sum=9.057 (18)", - "tab": "Fairness", - 
"score": 0.5031757189564859 - }, - "CivilComments - Denoised inference time (s)": { - "description": "9 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "CivilComments - # eval": { - "description": "min=74, mean=371.556, max=683, sum=6688 (18)", - "tab": "General information", - "score": 371.55555555555554 - }, - "CivilComments - # train": { - "description": "min=5, mean=5, max=5, sum=90 (18)", - "tab": "General information", - "score": 5.0 - }, - "CivilComments - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (18)", - "tab": "General information", - "score": 0.0 - }, - "CivilComments - # prompt tokens": { - "description": "min=404.732, mean=855.241, max=1417.567, sum=15394.339 (18)", - "tab": "General information", - "score": 855.2410378605821 - }, - "CivilComments - # output tokens": { - "description": "min=1, mean=1, max=1, sum=18 (18)", - "tab": "General information", - "score": 1.0 - }, - "CivilComments - # trials": { - "description": "min=1, mean=1, max=1, sum=18 (18)", - "tab": "General information", - "score": 1.0 - }, - "CivilComments - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Toxic fraction": { - "description": "9 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "RAFT", - "source_data": { - "dataset_name": "RAFT", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on RAFT", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.643, - "details": { - "description": "min=0.125, mean=0.643, max=0.95, sum=7.075 (11)", - "tab": "Accuracy", - "RAFT - ECE (10-bin)": { - "description": "11 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "RAFT - EM (Robustness)": { - "description": "min=0.05, mean=0.573, max=0.875, sum=6.3 (11)", - "tab": "Robustness", - "score": 0.5727272727272728 - }, - "RAFT - EM (Fairness)": { - "description": "min=0.1, mean=0.609, max=0.95, sum=6.7 (11)", - "tab": "Fairness", - "score": 0.6090909090909092 - }, - "RAFT - Denoised inference time (s)": { - "description": "11 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "RAFT - # eval": { - "description": "min=40, mean=40, max=40, sum=440 (11)", - "tab": "General information", - "score": 40.0 - }, - "RAFT - # train": { - "description": "min=2.575, mean=4.78, max=5, sum=52.575 (11)", - "tab": "General information", - "score": 4.779545454545455 - }, - "RAFT - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (11)", - "tab": "General information", - "score": 0.0 - }, - "RAFT - # prompt tokens": { - "description": "min=303.675, mean=1153.852, max=3623.9, sum=12692.375 (11)", - "tab": "General information", - "score": 1153.8522727272727 - }, - "RAFT - # output tokens": { - "description": "min=1, mean=1, max=1, sum=11 (11)", - "tab": "General 
information", - "score": 1.0 - }, - "RAFT - # trials": { - "description": "min=1, mean=1, max=1, sum=11 (11)", - "tab": "General information", - "score": 1.0 - }, - "RAFT - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Toxic fraction": { - "description": "11 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_classic/meta/OPT-175B/f135ce21-655f-4ebf-9cc6-d83ada0f177b.json b/data/helm_classic/meta/OPT-175B/f135ce21-655f-4ebf-9cc6-d83ada0f177b.json deleted file mode 100644 index 63a0c348d..000000000 --- a/data/helm_classic/meta/OPT-175B/f135ce21-655f-4ebf-9cc6-d83ada0f177b.json +++ /dev/null @@ -1,1613 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_classic/meta_OPT-175B/1770834891.1472661", - "retrieved_timestamp": "1770834891.1472661", - "source_metadata": { - "source_name": "helm_classic", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "OPT 175B", - "id": "meta/OPT-175B", - "developer": "meta", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_classic", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperform on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.609, - "details": { - "tab": "Accuracy", - "Mean win rate - Calibration": { - "description": null, - "tab": "Calibration", - "score": 0.33807716905928437 - }, - "Mean win rate - Robustness": { - "description": null, - "tab": "Robustness", - "score": 0.5191448151403657 - }, - "Mean win rate - Fairness": { - "description": null, - "tab": "Fairness", - "score": 0.6221815633384042 - }, - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.24121162280701755 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - }, - "Mean win rate - Bias": { - "description": null, - "tab": "Bias", - "score": 0.58013310485115 - }, - "Mean win rate - Toxicity": { - "description": null, - "tab": "Toxicity", - "score": 0.43513523513523517 - }, - "Mean win rate - Summarization metrics": { - "description": null, - "tab": "Summarization metrics", - "score": 0.5927318295739348 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { 
- "score": 0.318, - "details": { - "description": "min=0.21, mean=0.318, max=0.48, sum=4.775 (15)", - "tab": "Accuracy", - "MMLU - ECE (10-bin)": { - "description": "min=0.115, mean=0.147, max=0.194, sum=2.207 (15)", - "tab": "Calibration", - "score": 0.14714449343481936 - }, - "MMLU - EM (Robustness)": { - "description": "min=0.13, mean=0.27, max=0.45, sum=4.048 (15)", - "tab": "Robustness", - "score": 0.2698479532163743 - }, - "MMLU - EM (Fairness)": { - "description": "min=0.167, mean=0.287, max=0.43, sum=4.298 (15)", - "tab": "Fairness", - "score": 0.28651461988304094 - }, - "MMLU - Denoised inference time (s)": { - "description": "min=0.11, mean=0.12, max=0.138, sum=1.793 (15)", - "tab": "Efficiency", - "score": 0.1195572826114746 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=1542 (15)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=75 (15)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (15)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=371.38, mean=472.274, max=624.07, sum=7084.111 (15)", - "tab": "General information", - "score": 472.2740350877193 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=15 (15)", - "tab": "General information", - "score": 1.0 - }, - "MMLU - # trials": { - "description": "min=3, mean=3, max=3, sum=45 (15)", - "tab": "General information", - "score": 3.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "BoolQ", - "source_data": { - "dataset_name": "BoolQ", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on BoolQ", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.793, - "details": { - "description": "min=0.777, mean=0.793, max=0.813, sum=2.379 (3)", - "tab": "Accuracy", - "BoolQ - ECE (10-bin)": { - "description": "min=0.177, mean=0.194, max=0.218, sum=0.581 (3)", - "tab": "Calibration", - "score": 0.19360710050007168 - }, - "BoolQ - EM (Robustness)": { - "description": "min=0.584, mean=0.623, max=0.662, sum=1.869 (3)", - "tab": "Robustness", - "score": 0.623 - }, - "BoolQ - EM (Fairness)": { - "description": "min=0.712, mean=0.731, max=0.746, sum=2.193 (3)", - "tab": "Fairness", - "score": 0.731 - }, - "BoolQ - Denoised inference time (s)": { - "description": "min=0.71, mean=0.869, max=0.954, sum=2.608 (3)", - "tab": "Efficiency", - "score": 0.869335141547284 - }, - "BoolQ - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "BoolQ - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "BoolQ - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "BoolQ - # prompt tokens": { - "description": "min=660.073, mean=908.406, max=1242.073, sum=2725.219 (3)", - "tab": "General information", - "score": 908.4063333333334 - }, - "BoolQ - # output tokens": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "BoolQ - # trials": { - "description": 
"min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "BoolQ - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.671, - "details": { - "description": "min=0.657, mean=0.671, max=0.692, sum=2.013 (3)", - "tab": "Accuracy", - "NarrativeQA - ECE (10-bin)": { - "description": "min=0.25, mean=0.254, max=0.261, sum=0.763 (3)", - "tab": "Calibration", - "score": 0.25442494535286947 - }, - "NarrativeQA - F1 (Robustness)": { - "description": "min=0.365, mean=0.409, max=0.447, sum=1.227 (3)", - "tab": "Robustness", - "score": 0.4090933797146052 - }, - "NarrativeQA - F1 (Fairness)": { - "description": "min=0.545, mean=0.573, max=0.6, sum=1.718 (3)", - "tab": "Fairness", - "score": 0.5725951072978767 - }, - "NarrativeQA - Denoised inference time (s)": { - "description": "min=2.375, mean=2.783, max=3.573, sum=8.348 (3)", - "tab": "Efficiency", - "score": 2.7825779012238017 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=1065 (3)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=1.051, mean=1.647, max=2.085, sum=4.941 (3)", - "tab": "General information", - "score": 1.6469483568075116 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=1601.955, mean=1652.377, max=1705.003, sum=4957.132 (3)", - "tab": "General information", - "score": 1652.3774647887324 - }, - "NarrativeQA - # output tokens": { - "description": "min=27.152, mean=40.781, max=56.166, sum=122.344 (3)", - "tab": "General information", - "score": 40.781220657277 - }, - "NarrativeQA - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NarrativeQA - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NarrativeQA - Stereotypes (gender)": { - "description": "min=0.472, mean=0.491, max=0.5, sum=1.472 (3)", - "tab": "Bias", - "score": 0.49074074074074076 - }, - "NarrativeQA - Representation (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=2 (3)", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "NarrativeQA - Representation (gender)": { - "description": "min=0.213, mean=0.232, max=0.257, sum=0.695 (3)", - "tab": "Bias", - "score": 0.23182834585691858 - }, - "NarrativeQA - Toxic fraction": { - "description": "min=0.017, mean=0.019, max=0.023, sum=0.056 (3)", - "tab": "Toxicity", - "score": 
0.018779342723004692 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (open-book)", - "source_data": { - "dataset_name": "NaturalQuestions (open-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (open-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.615, - "details": { - "description": "min=0.607, mean=0.615, max=0.619, sum=1.845 (3)", - "tab": "Accuracy", - "NaturalQuestions (closed-book) - ECE (10-bin)": { - "description": "min=0.169, mean=0.173, max=0.178, sum=0.52 (3)", - "tab": "Calibration", - "score": 0.17321815784980257 - }, - "NaturalQuestions (open-book) - ECE (10-bin)": { - "description": "min=0.365, mean=0.372, max=0.38, sum=1.117 (3)", - "tab": "Calibration", - "score": 0.3723122842871363 - }, - "NaturalQuestions (closed-book) - F1 (Robustness)": { - "description": "min=0.202, mean=0.208, max=0.213, sum=0.623 (3)", - "tab": "Robustness", - "score": 0.2076699169323979 - }, - "NaturalQuestions (open-book) - F1 (Robustness)": { - "description": "min=0.382, mean=0.408, max=0.445, sum=1.224 (3)", - "tab": "Robustness", - "score": 0.40794279599736244 - }, - "NaturalQuestions (closed-book) - F1 (Fairness)": { - "description": "min=0.244, mean=0.246, max=0.248, sum=0.738 (3)", - "tab": "Fairness", - "score": 0.2461285688311032 - }, - "NaturalQuestions (open-book) - F1 (Fairness)": { - "description": "min=0.557, mean=0.561, max=0.566, sum=1.684 (3)", - "tab": "Fairness", - "score": 0.5613201936765554 - }, - "NaturalQuestions (closed-book) - Denoised inference time (s)": { - "description": "min=4.226, mean=4.548, max=4.977, sum=13.645 (3)", - "tab": "Efficiency", - "score": 4.5482187833781085 - }, - "NaturalQuestions (open-book) - Denoised inference time (s)": { - "description": "min=6.761, mean=7.78, max=8.516, sum=23.341 (3)", - "tab": "Efficiency", - "score": 7.78018927021878 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=110.254, mean=112.254, max=116.254, sum=336.762 (3)", - "tab": "General information", - "score": 112.254 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=272.695, mean=278.02, max=287.118, sum=834.059 (3)", - "tab": "General information", - "score": 278.01966666666664 - }, - "NaturalQuestions (closed-book) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.647, mean=4.691, max=4.724, sum=14.074 (3)", - "tab": "General information", - "score": 4.691333333333334 - }, - "NaturalQuestions 
(open-book) - truncated": { - "description": "min=0.036, mean=0.036, max=0.036, sum=0.108 (3)", - "tab": "General information", - "score": 0.036 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1231.212, mean=1419.574, max=1523.257, sum=4258.721 (3)", - "tab": "General information", - "score": 1419.5736666666664 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=168.53, mean=194.671, max=213.115, sum=584.014 (3)", - "tab": "General information", - "score": 194.67133333333334 - }, - "NaturalQuestions (open-book) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NaturalQuestions (closed-book) - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NaturalQuestions (closed-book) - Stereotypes (gender)": { - "description": "min=0.279, mean=0.327, max=0.375, sum=0.654 (2)", - "tab": "Bias", - "score": 0.32684426229508196 - }, - "NaturalQuestions (closed-book) - Representation (race)": { - "description": "min=0.48, mean=0.521, max=0.562, sum=1.563 (3)", - "tab": "Bias", - "score": 0.5211641167340236 - }, - "NaturalQuestions (closed-book) - Representation (gender)": { - "description": "min=0.029, mean=0.081, max=0.119, sum=0.243 (3)", - "tab": "Bias", - "score": 0.0811320308714203 - }, - "NaturalQuestions (open-book) - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NaturalQuestions (open-book) - Stereotypes (gender)": { - "description": "min=0.433, mean=0.439, max=0.45, sum=1.317 (3)", - "tab": "Bias", - "score": 0.4388888888888889 - }, - "NaturalQuestions (open-book) - Representation (race)": { - "description": "min=0.423, mean=0.461, max=0.48, sum=1.384 (3)", - "tab": "Bias", - "score": 0.4612918002748511 - }, - "NaturalQuestions (open-book) - Representation (gender)": { - "description": "min=0.318, mean=0.325, max=0.332, sum=0.974 (3)", - "tab": "Bias", - "score": 0.324702218997521 - }, - "NaturalQuestions (closed-book) - Toxic fraction": { - "description": "min=0, mean=0.001, max=0.001, sum=0.002 (3)", - "tab": "Toxicity", - "score": 0.0006666666666666666 - }, - "NaturalQuestions (open-book) - Toxic fraction": { - "description": "min=0.001, mean=0.002, max=0.003, sum=0.005 (3)", - "tab": "Toxicity", - "score": 0.0016666666666666668 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "QuAC", - "source_data": { - "dataset_name": "QuAC", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on QuAC", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.36, - "details": { - "description": "min=0.347, mean=0.36, max=0.369, sum=1.08 (3)", - "tab": "Accuracy", - "QuAC - ECE (10-bin)": { - "description": "min=0.128, mean=0.148, max=0.173, sum=0.443 (3)", - "tab": "Calibration", - "score": 0.14774672207107284 - }, - "QuAC - F1 (Robustness)": { - "description": "min=0.194, mean=0.2, max=0.209, sum=0.6 (3)", - "tab": "Robustness", - "score": 0.2000302607507829 - }, - "QuAC - F1 (Fairness)": { - "description": "min=0.255, mean=0.266, max=0.274, sum=0.798 (3)", - "tab": "Fairness", - "score": 0.26591098840755784 - }, - "QuAC - Denoised inference time (s)": { - "description": "min=3.951, mean=4.049, max=4.154, sum=12.147 
(3)", - "tab": "Efficiency", - "score": 4.049007016242971 - }, - "QuAC - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "QuAC - # train": { - "description": "min=0.845, mean=0.944, max=1.086, sum=2.833 (3)", - "tab": "General information", - "score": 0.9443333333333334 - }, - "QuAC - truncated": { - "description": "min=0.016, mean=0.016, max=0.016, sum=0.048 (3)", - "tab": "General information", - "score": 0.016 - }, - "QuAC - # prompt tokens": { - "description": "min=1625.523, mean=1644.831, max=1670.605, sum=4934.492 (3)", - "tab": "General information", - "score": 1644.8306666666667 - }, - "QuAC - # output tokens": { - "description": "min=75.972, mean=77.836, max=79.528, sum=233.507 (3)", - "tab": "General information", - "score": 77.83566666666667 - }, - "QuAC - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "QuAC - Stereotypes (race)": { - "description": "min=0.561, mean=0.591, max=0.614, sum=1.773 (3)", - "tab": "Bias", - "score": 0.5910808767951625 - }, - "QuAC - Stereotypes (gender)": { - "description": "min=0.376, mean=0.386, max=0.399, sum=1.159 (3)", - "tab": "Bias", - "score": 0.38627685600159944 - }, - "QuAC - Representation (race)": { - "description": "min=0.167, mean=0.243, max=0.304, sum=0.73 (3)", - "tab": "Bias", - "score": 0.2433558772540988 - }, - "QuAC - Representation (gender)": { - "description": "min=0.195, mean=0.207, max=0.218, sum=0.621 (3)", - "tab": "Bias", - "score": 0.2069846056271054 - }, - "QuAC - Toxic fraction": { - "description": "min=0, mean=0.001, max=0.003, sum=0.003 (3)", - "tab": "Toxicity", - "score": 0.001 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "HellaSwag", - "source_data": { - "dataset_name": "HellaSwag", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on HellaSwag", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.791, - "details": { - "description": "min=0.791, mean=0.791, max=0.791, sum=0.791 (1)", - "tab": "Accuracy", - "HellaSwag - ECE (10-bin)": { - "description": "min=0.325, mean=0.325, max=0.325, sum=0.325 (1)", - "tab": "Calibration", - "score": 0.324637159664446 - }, - "HellaSwag - EM (Robustness)": { - "description": "min=0.744, mean=0.744, max=0.744, sum=0.744 (1)", - "tab": "Robustness", - "score": 0.744 - }, - "HellaSwag - EM (Fairness)": { - "description": "min=0.66, mean=0.66, max=0.66, sum=0.66 (1)", - "tab": "Fairness", - "score": 0.66 - }, - "HellaSwag - Denoised inference time (s)": { - "description": "min=0.71, mean=0.71, max=0.71, sum=0.71 (1)", - "tab": "Efficiency", - "score": 0.7096132577732451 - }, - "HellaSwag - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "HellaSwag - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag - # prompt tokens": { - "description": "min=87.888, mean=87.888, max=87.888, sum=87.888 (1)", - "tab": "General information", - "score": 87.888 - }, - 
"HellaSwag - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.586, - "details": { - "description": "min=0.586, mean=0.586, max=0.586, sum=0.586 (1)", - "tab": "Accuracy", - "OpenbookQA - ECE (10-bin)": { - "description": "min=0.209, mean=0.209, max=0.209, sum=0.209 (1)", - "tab": "Calibration", - "score": 0.20889829455743214 - }, - "OpenbookQA - EM (Robustness)": { - "description": "min=0.488, mean=0.488, max=0.488, sum=0.488 (1)", - "tab": "Robustness", - "score": 0.488 - }, - "OpenbookQA - EM (Fairness)": { - "description": "min=0.5, mean=0.5, max=0.5, sum=0.5 (1)", - "tab": "Fairness", - "score": 0.5 - }, - "OpenbookQA - Denoised inference time (s)": { - "description": "min=0.038, mean=0.038, max=0.038, sum=0.038 (1)", - "tab": "Efficiency", - "score": 0.03760148134353242 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=5.27, mean=5.27, max=5.27, sum=5.27 (1)", - "tab": "General information", - "score": 5.27 - }, - "OpenbookQA - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "TruthfulQA", - "source_data": { - "dataset_name": "TruthfulQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on TruthfulQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.25, - "details": { - "description": "min=0.228, mean=0.25, max=0.269, sum=1.002 (4)", - "tab": "Accuracy", - "TruthfulQA - ECE (10-bin)": { - "description": "min=0.042, mean=0.054, max=0.061, sum=0.216 (4)", - "tab": "Calibration", - "score": 0.05404322346973557 - }, - "TruthfulQA - EM (Robustness)": { - "description": "min=0.167, mean=0.205, max=0.249, sum=0.818 (4)", - "tab": "Robustness", - "score": 0.20451070336391436 - }, - "TruthfulQA - EM (Fairness)": { - "description": "min=0.165, mean=0.203, max=0.249, sum=0.812 (4)", - "tab": "Fairness", - "score": 0.2029816513761468 - }, - "TruthfulQA - Denoised inference time (s)": { - "description": "min=0.079, mean=0.141, 
max=0.246, sum=0.563 (4)", - "tab": "Efficiency", - "score": 0.1406602569641055 - }, - "TruthfulQA - # eval": { - "description": "min=654, mean=654, max=654, sum=2616 (4)", - "tab": "General information", - "score": 654.0 - }, - "TruthfulQA - # train": { - "description": "min=0, mean=3.75, max=5, sum=15 (4)", - "tab": "General information", - "score": 3.75 - }, - "TruthfulQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (4)", - "tab": "General information", - "score": 0.0 - }, - "TruthfulQA - # prompt tokens": { - "description": "min=85.121, mean=404.621, max=529.121, sum=1618.483 (4)", - "tab": "General information", - "score": 404.62079510703364 - }, - "TruthfulQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=4 (4)", - "tab": "General information", - "score": 1.0 - }, - "TruthfulQA - # trials": { - "description": "min=1, mean=2.5, max=3, sum=10 (4)", - "tab": "General information", - "score": 2.5 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MS MARCO (TREC)", - "source_data": { - "dataset_name": "MS MARCO (TREC)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "NDCG@10 on MS MARCO (TREC)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.448, - "details": { - "description": "min=0.425, mean=0.448, max=0.467, sum=1.344 (3)", - "tab": "Accuracy", - "MS MARCO (regular) - RR@10 (Robustness)": { - "description": "min=0.198, mean=0.235, max=0.263, sum=0.705 (3)", - "tab": "Robustness", - "score": 0.23496613756613724 - }, - "MS MARCO (TREC) - NDCG@10 (Robustness)": { - "description": "min=0.386, mean=0.408, max=0.422, sum=1.225 (3)", - "tab": "Robustness", - "score": 0.4083455179340017 - }, - "MS MARCO (regular) - RR@10 (Fairness)": { - "description": "min=0.229, mean=0.26, max=0.288, sum=0.779 (3)", - "tab": "Fairness", - "score": 0.25959669312169276 - }, - "MS MARCO (TREC) - NDCG@10 (Fairness)": { - "description": "min=0.4, mean=0.419, max=0.428, sum=1.256 (3)", - "tab": "Fairness", - "score": 0.41868435186381264 - }, - "MS MARCO (regular) - Denoised inference time (s)": { - "description": "min=0.229, mean=0.241, max=0.262, sum=0.724 (3)", - "tab": "Efficiency", - "score": 0.24148347487755295 - }, - "MS MARCO (TREC) - Denoised inference time (s)": { - "description": "min=0.19, mean=0.226, max=0.254, sum=0.678 (3)", - "tab": "Efficiency", - "score": 0.2261325473631569 - }, - "MS MARCO (regular) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "MS MARCO (regular) - # train": { - "description": "min=2, mean=2, max=2, sum=6 (3)", - "tab": "General information", - "score": 2.0 - }, - "MS MARCO (regular) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "MS MARCO (regular) - # prompt tokens": { - "description": "min=495.232, mean=532.565, max=577.232, sum=1597.696 (3)", - "tab": "General information", - "score": 532.5653333333333 - }, - "MS MARCO (regular) - # output tokens": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "MS MARCO (regular) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - 
}, - "MS MARCO (TREC) - # eval": { - "description": "min=43, mean=43, max=43, sum=129 (3)", - "tab": "General information", - "score": 43.0 - }, - "MS MARCO (TREC) - # train": { - "description": "min=2, mean=2, max=2, sum=6 (3)", - "tab": "General information", - "score": 2.0 - }, - "MS MARCO (TREC) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "MS MARCO (TREC) - # prompt tokens": { - "description": "min=478.488, mean=515.822, max=560.488, sum=1547.465 (3)", - "tab": "General information", - "score": 515.8217054263565 - }, - "MS MARCO (TREC) - # output tokens": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "MS MARCO (TREC) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "MS MARCO (regular) - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - }, - "MS MARCO (TREC) - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CNN/DailyMail", - "source_data": { - "dataset_name": "CNN/DailyMail", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on CNN/DailyMail", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.146, - "details": { - "description": "min=0.132, mean=0.146, max=0.156, sum=0.875 (6)", - "tab": "Accuracy", - "CNN/DailyMail - Denoised inference time (s)": { - "description": "min=4.705, mean=4.729, max=4.742, sum=28.373 (6)", - "tab": "Efficiency", - "score": 4.728843353285813 - }, - "CNN/DailyMail - # eval": { - "description": "min=466, mean=466, max=466, sum=2796 (6)", - "tab": "General information", - "score": 466.0 - }, - "CNN/DailyMail - # train": { - "description": "min=5, mean=5, max=5, sum=30 (6)", - "tab": "General information", - "score": 5.0 - }, - "CNN/DailyMail - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (6)", - "tab": 
"General information", - "score": 0.0 - }, - "CNN/DailyMail - # prompt tokens": { - "description": "min=1531.586, mean=1549.919, max=1567.586, sum=9299.515 (6)", - "tab": "General information", - "score": 1549.9191702432045 - }, - "CNN/DailyMail - # output tokens": { - "description": "min=72.006, mean=73.533, max=75.564, sum=441.197 (6)", - "tab": "General information", - "score": 73.53290414878398 - }, - "CNN/DailyMail - # trials": { - "description": "min=3, mean=3, max=3, sum=18 (6)", - "tab": "General information", - "score": 3.0 - }, - "CNN/DailyMail - Stereotypes (race)": { - "description": "min=0.584, mean=0.591, max=0.602, sum=3.548 (6)", - "tab": "Bias", - "score": 0.5912557147615382 - }, - "CNN/DailyMail - Stereotypes (gender)": { - "description": "min=0.389, mean=0.407, max=0.423, sum=2.439 (6)", - "tab": "Bias", - "score": 0.406575836707982 - }, - "CNN/DailyMail - Representation (race)": { - "description": "min=0.258, mean=0.294, max=0.328, sum=1.765 (6)", - "tab": "Bias", - "score": 0.29422007838910086 - }, - "CNN/DailyMail - Representation (gender)": { - "description": "min=0.109, mean=0.123, max=0.15, sum=0.74 (6)", - "tab": "Bias", - "score": 0.1233558384477443 - }, - "CNN/DailyMail - Toxic fraction": { - "description": "min=0, mean=0.001, max=0.002, sum=0.004 (6)", - "tab": "Toxicity", - "score": 0.000715307582260372 - }, - "CNN/DailyMail - SummaC": { - "description": "min=0.094, mean=0.202, max=0.259, sum=0.605 (3)", - "tab": "Summarization metrics", - "score": 0.20179927196685032 - }, - "CNN/DailyMail - QAFactEval": { - "description": "min=4.642, mean=4.67, max=4.721, sum=28.022 (6)", - "tab": "Summarization metrics", - "score": 4.67041236939807 - }, - "CNN/DailyMail - BERTScore (F1)": { - "description": "min=0.234, mean=0.276, max=0.301, sum=0.827 (3)", - "tab": "Summarization metrics", - "score": 0.2755570292220846 - }, - "CNN/DailyMail - Coverage": { - "description": "min=0.855, mean=0.933, max=0.973, sum=5.599 (6)", - "tab": "Summarization metrics", - "score": 0.9331599358896452 - }, - "CNN/DailyMail - Density": { - "description": "min=28.251, mean=31.307, max=33.584, sum=187.839 (6)", - "tab": "Summarization metrics", - "score": 31.306505459997258 - }, - "CNN/DailyMail - Compression": { - "description": "min=9.442, mean=9.8, max=10.068, sum=58.802 (6)", - "tab": "Summarization metrics", - "score": 9.800322939057557 - }, - "CNN/DailyMail - HumanEval-faithfulness": { - "description": "min=1, mean=1, max=1, sum=6 (6)", - "tab": "Summarization metrics", - "score": 1.0 - }, - "CNN/DailyMail - HumanEval-relevance": { - "description": "min=4.333, mean=4.378, max=4.467, sum=26.267 (6)", - "tab": "Summarization metrics", - "score": 4.377777777777777 - }, - "CNN/DailyMail - HumanEval-coherence": { - "description": "min=2.833, mean=3.233, max=3.867, sum=19.4 (6)", - "tab": "Summarization metrics", - "score": 3.233333333333333 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "XSUM", - "source_data": { - "dataset_name": "XSUM", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on XSUM", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.155, - "details": { - "description": "min=0.153, mean=0.155, max=0.158, sum=0.929 (6)", - "tab": "Accuracy", - "XSUM - Denoised inference time (s)": { - 
"description": "min=2.509, mean=2.523, max=2.545, sum=15.138 (6)", - "tab": "Efficiency", - "score": 2.522969657178858 - }, - "XSUM - # eval": { - "description": "min=518, mean=518, max=518, sum=3108 (6)", - "tab": "General information", - "score": 518.0 - }, - "XSUM - # train": { - "description": "min=4.998, mean=4.999, max=5, sum=29.992 (6)", - "tab": "General information", - "score": 4.998712998712999 - }, - "XSUM - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (6)", - "tab": "General information", - "score": 0.0 - }, - "XSUM - # prompt tokens": { - "description": "min=1456.402, mean=1510.418, max=1538.921, sum=9062.51 (6)", - "tab": "General information", - "score": 1510.4182754182755 - }, - "XSUM - # output tokens": { - "description": "min=26.037, mean=26.229, max=26.481, sum=157.375 (6)", - "tab": "General information", - "score": 26.22908622908623 - }, - "XSUM - # trials": { - "description": "min=3, mean=3, max=3, sum=18 (6)", - "tab": "General information", - "score": 3.0 - }, - "XSUM - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=4 (6)", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "XSUM - Stereotypes (gender)": { - "description": "min=0.444, mean=0.449, max=0.459, sum=2.697 (6)", - "tab": "Bias", - "score": 0.44948914431673054 - }, - "XSUM - Representation (race)": { - "description": "min=0.429, mean=0.453, max=0.481, sum=2.719 (6)", - "tab": "Bias", - "score": 0.45310942412391686 - }, - "XSUM - Representation (gender)": { - "description": "min=0.188, mean=0.218, max=0.235, sum=1.309 (6)", - "tab": "Bias", - "score": 0.21820243248814677 - }, - "XSUM - Toxic fraction": { - "description": "min=0, mean=0.001, max=0.002, sum=0.008 (6)", - "tab": "Toxicity", - "score": 0.001287001287001287 - }, - "XSUM - SummaC": { - "description": "min=-0.271, mean=-0.253, max=-0.224, sum=-0.76 (3)", - "tab": "Summarization metrics", - "score": -0.25337265715073337 - }, - "XSUM - QAFactEval": { - "description": "min=3.343, mean=3.523, max=3.7, sum=21.139 (6)", - "tab": "Summarization metrics", - "score": 3.5231601957035803 - }, - "XSUM - BERTScore (F1)": { - "description": "min=0.458, mean=0.46, max=0.461, sum=1.38 (3)", - "tab": "Summarization metrics", - "score": 0.45990517032509515 - }, - "XSUM - Coverage": { - "description": "min=0.792, mean=0.793, max=0.795, sum=4.76 (6)", - "tab": "Summarization metrics", - "score": 0.7933759020774565 - }, - "XSUM - Density": { - "description": "min=2.672, mean=2.732, max=2.852, sum=16.393 (6)", - "tab": "Summarization metrics", - "score": 2.732196710488823 - }, - "XSUM - Compression": { - "description": "min=16.442, mean=16.792, max=17.056, sum=100.753 (6)", - "tab": "Summarization metrics", - "score": 16.79220871639349 - }, - "XSUM - HumanEval-faithfulness": { - "description": "min=0.583, mean=0.798, max=0.944, sum=4.789 (6)", - "tab": "Summarization metrics", - "score": 0.7981481481481479 - }, - "XSUM - HumanEval-relevance": { - "description": "min=4.167, mean=4.3, max=4.4, sum=25.8 (6)", - "tab": "Summarization metrics", - "score": 4.300000000000001 - }, - "XSUM - HumanEval-coherence": { - "description": "min=4.867, mean=4.891, max=4.917, sum=29.344 (6)", - "tab": "Summarization metrics", - "score": 4.890740740740742 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "IMDB", - "source_data": { - "dataset_name": "IMDB", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on IMDB", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.947, - "details": { - "description": "min=0.932, mean=0.947, max=0.96, sum=2.842 (3)", - "tab": "Accuracy", - "IMDB - ECE (10-bin)": { - "description": "min=0.164, mean=0.19, max=0.216, sum=0.569 (3)", - "tab": "Calibration", - "score": 0.18962950165784687 - }, - "IMDB - EM (Robustness)": { - "description": "min=0.904, mean=0.919, max=0.937, sum=2.756 (3)", - "tab": "Robustness", - "score": 0.9186666666666667 - }, - "IMDB - EM (Fairness)": { - "description": "min=0.929, mean=0.944, max=0.958, sum=2.831 (3)", - "tab": "Fairness", - "score": 0.9436666666666667 - }, - "IMDB - Denoised inference time (s)": { - "description": "min=1.488, mean=1.575, max=1.732, sum=4.724 (3)", - "tab": "Efficiency", - "score": 1.5747312279142403 - }, - "IMDB - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "IMDB - # train": { - "description": "min=4.846, mean=4.933, max=4.986, sum=14.798 (3)", - "tab": "General information", - "score": 4.932666666666667 - }, - "IMDB - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "IMDB - # prompt tokens": { - "description": "min=1152.694, mean=1389.454, max=1744.631, sum=4168.363 (3)", - "tab": "General information", - "score": 1389.4543333333331 - }, - "IMDB - # output tokens": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "IMDB - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "IMDB - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CivilComments", - "source_data": { - "dataset_name": "CivilComments", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on CivilComments", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.505, - "details": { - "description": "min=0, mean=0.505, max=1, sum=27.251 (54)", - "tab": "Accuracy", - "CivilComments - ECE (10-bin)": { - "description": "min=0.226, mean=0.462, max=0.633, sum=24.957 (54)", - "tab": "Calibration", - "score": 0.46216217374926066 - }, - "CivilComments - EM (Robustness)": { - "description": "min=0, mean=0.184, max=0.769, sum=9.952 (54)", - "tab": "Robustness", - "score": 0.18428995439708568 - }, - 
"CivilComments - EM (Fairness)": { - "description": "min=0, mean=0.491, max=1, sum=26.489 (54)", - "tab": "Fairness", - "score": 0.4905409716584098 - }, - "CivilComments - Denoised inference time (s)": { - "description": "min=0.299, mean=0.498, max=0.974, sum=26.871 (54)", - "tab": "Efficiency", - "score": 0.4976179389529128 - }, - "CivilComments - # eval": { - "description": "min=74, mean=371.556, max=683, sum=20064 (54)", - "tab": "General information", - "score": 371.55555555555554 - }, - "CivilComments - # train": { - "description": "min=5, mean=5, max=5, sum=270 (54)", - "tab": "General information", - "score": 5.0 - }, - "CivilComments - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (54)", - "tab": "General information", - "score": 0.0 - }, - "CivilComments - # prompt tokens": { - "description": "min=356.537, mean=722.635, max=1267.519, sum=39022.317 (54)", - "tab": "General information", - "score": 722.6354931173206 - }, - "CivilComments - # output tokens": { - "description": "min=5, mean=5, max=5, sum=270 (54)", - "tab": "General information", - "score": 5.0 - }, - "CivilComments - # trials": { - "description": "min=3, mean=3, max=3, sum=162 (54)", - "tab": "General information", - "score": 3.0 - }, - "CivilComments - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (54)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "RAFT", - "source_data": { - "dataset_name": "RAFT", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on RAFT", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.606, - "details": { - "description": "min=0.075, mean=0.606, max=0.975, sum=20 (33)", - "tab": "Accuracy", - "RAFT - ECE (10-bin)": { - "description": "min=0.1, mean=0.352, max=0.74, sum=11.606 (33)", - "tab": "Calibration", - "score": 0.35168585204039804 - }, - "RAFT - EM (Robustness)": { - "description": "min=0, mean=0.48, max=0.975, sum=15.85 (33)", - "tab": "Robustness", - "score": 0.4803030303030303 - }, - "RAFT - EM (Fairness)": { - "description": "min=0.075, mean=0.58, max=0.975, sum=19.125 (33)", - "tab": "Fairness", - "score": 0.5795454545454547 - }, - "RAFT - Denoised inference time (s)": { - "description": "min=0.403, mean=0.962, max=1.712, sum=31.76 (33)", - "tab": "Efficiency", - "score": 0.9624239013413396 - }, - "RAFT - # eval": { - "description": "min=40, mean=40, max=40, sum=1320 (33)", - "tab": "General information", - "score": 40.0 - }, - "RAFT - # train": { - "description": "min=0, mean=4.556, max=5, sum=150.35 (33)", - "tab": "General information", - "score": 4.556060606060607 - }, - "RAFT - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (33)", - "tab": "General information", - "score": 0.0 - }, - "RAFT - # prompt tokens": { - "description": "min=257.35, mean=812.938, max=1773.675, sum=26826.95 
(33)", - "tab": "General information", - "score": 812.937878787879 - }, - "RAFT - # output tokens": { - "description": "min=5, mean=9.057, max=18.95, sum=298.875 (33)", - "tab": "General information", - "score": 9.056818181818182 - }, - "RAFT - # trials": { - "description": "min=3, mean=3, max=3, sum=99 (33)", - "tab": "General information", - "score": 3.0 - }, - "RAFT - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (33)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_classic/meta/OPT-66B/48912a61-af54-4208-b36d-2f3a283e5c5d.json b/data/helm_classic/meta/OPT-66B/48912a61-af54-4208-b36d-2f3a283e5c5d.json deleted file mode 100644 index 2f3d2ad96..000000000 --- a/data/helm_classic/meta/OPT-66B/48912a61-af54-4208-b36d-2f3a283e5c5d.json +++ /dev/null @@ -1,1613 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_classic/meta_OPT-66B/1770834891.1472661", - "retrieved_timestamp": "1770834891.1472661", - "source_metadata": { - "source_name": "helm_classic", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "OPT 66B", - "id": "meta/OPT-66B", - "developer": "meta", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_classic", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperform on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.448, - "details": { - "tab": "Accuracy", - "Mean win rate - Calibration": { - "description": null, - "tab": "Calibration", - "score": 0.2888771827640159 - }, - "Mean win rate - Robustness": { - "description": null, - "tab": "Robustness", - "score": 0.43828848200372117 - }, - "Mean win rate - Fairness": { - "description": null, - "tab": "Fairness", - "score": 0.4763117490592463 - }, - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.466875 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - }, - "Mean win rate - Bias": { - "description": null, - "tab": "Bias", - "score": 0.6312224376358433 - }, - "Mean win rate - Toxicity": { - "description": null, - "tab": "Toxicity", - "score": 0.3347556764223431 - }, - "Mean win rate - Summarization metrics": { - "description": null, - "tab": "Summarization metrics", - "score": 0.5785714285714286 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - 
"metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.276, - "details": { - "description": "min=0.2, mean=0.276, max=0.37, sum=4.141 (15)", - "tab": "Accuracy", - "MMLU - ECE (10-bin)": { - "description": "min=0.101, mean=0.135, max=0.172, sum=2.031 (15)", - "tab": "Calibration", - "score": 0.13542563946906333 - }, - "MMLU - EM (Robustness)": { - "description": "min=0.13, mean=0.216, max=0.32, sum=3.242 (15)", - "tab": "Robustness", - "score": 0.21610526315789472 - }, - "MMLU - EM (Fairness)": { - "description": "min=0.18, mean=0.229, max=0.33, sum=3.44 (15)", - "tab": "Fairness", - "score": 0.22935672514619884 - }, - "MMLU - Denoised inference time (s)": { - "description": "min=0.041, mean=0.055, max=0.081, sum=0.818 (15)", - "tab": "Efficiency", - "score": 0.05452067670741475 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=1542 (15)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=75 (15)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (15)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=371.38, mean=472.274, max=624.07, sum=7084.111 (15)", - "tab": "General information", - "score": 472.2740350877193 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=15 (15)", - "tab": "General information", - "score": 1.0 - }, - "MMLU - # trials": { - "description": "min=3, mean=3, max=3, sum=45 (15)", - "tab": "General information", - "score": 3.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "BoolQ", - "source_data": { - "dataset_name": "BoolQ", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on BoolQ", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.76, - "details": { - "description": "min=0.753, mean=0.76, max=0.764, sum=2.281 (3)", - "tab": "Accuracy", - "BoolQ - ECE (10-bin)": { - "description": "min=0.193, mean=0.2, max=0.206, sum=0.601 (3)", - "tab": "Calibration", - "score": 0.20047176103986394 - }, - "BoolQ - EM (Robustness)": { - "description": "min=0.666, mean=0.683, max=0.701, sum=2.049 (3)", - "tab": "Robustness", - "score": 0.6829999999999999 - }, - "BoolQ - EM (Fairness)": { - "description": "min=0.696, mean=0.71, max=0.721, sum=2.131 (3)", - "tab": "Fairness", - "score": 0.7103333333333333 - }, - "BoolQ - Denoised inference time (s)": { - "description": "min=0.272, mean=0.834, max=1.907, sum=2.501 (3)", - "tab": "Efficiency", - "score": 0.8336340090708299 - }, - "BoolQ - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "BoolQ - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "BoolQ - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "BoolQ - # prompt tokens": { - "description": "min=660.073, mean=908.406, max=1242.073, sum=2725.219 (3)", - "tab": "General information", - 
"score": 908.4063333333334 - }, - "BoolQ - # output tokens": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "BoolQ - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "BoolQ - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.638, - "details": { - "description": "min=0.618, mean=0.638, max=0.655, sum=1.913 (3)", - "tab": "Accuracy", - "NarrativeQA - ECE (10-bin)": { - "description": "min=0.224, mean=0.245, max=0.264, sum=0.734 (3)", - "tab": "Calibration", - "score": 0.2445466042880168 - }, - "NarrativeQA - F1 (Robustness)": { - "description": "min=0.364, mean=0.397, max=0.421, sum=1.19 (3)", - "tab": "Robustness", - "score": 0.39653941552028354 - }, - "NarrativeQA - F1 (Fairness)": { - "description": "min=0.497, mean=0.526, max=0.543, sum=1.579 (3)", - "tab": "Fairness", - "score": 0.5262433008374211 - }, - "NarrativeQA - Denoised inference time (s)": { - "description": "min=0.939, mean=1.98, max=3.714, sum=5.939 (3)", - "tab": "Efficiency", - "score": 1.979606440811339 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=1065 (3)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=1.051, mean=1.647, max=2.085, sum=4.941 (3)", - "tab": "General information", - "score": 1.6469483568075116 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=1601.955, mean=1652.377, max=1705.003, sum=4957.132 (3)", - "tab": "General information", - "score": 1652.3774647887324 - }, - "NarrativeQA - # output tokens": { - "description": "min=39.707, mean=50.904, max=65.363, sum=152.713 (3)", - "tab": "General information", - "score": 50.90422535211267 - }, - "NarrativeQA - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NarrativeQA - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NarrativeQA - Stereotypes (gender)": { - "description": "min=0.406, mean=0.416, max=0.425, sum=1.248 (3)", - "tab": "Bias", - "score": 0.41597222222222224 - }, - "NarrativeQA - Representation (race)": { - "description": "min=0.333, mean=0.556, max=0.667, sum=1.667 (3)", - "tab": "Bias", - "score": 0.5555555555555556 - }, - "NarrativeQA - Representation (gender)": { - "description": "min=0.164, mean=0.191, 
max=0.207, sum=0.574 (3)", - "tab": "Bias", - "score": 0.1911771437726737 - }, - "NarrativeQA - Toxic fraction": { - "description": "min=0.02, mean=0.022, max=0.025, sum=0.065 (3)", - "tab": "Toxicity", - "score": 0.0215962441314554 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (open-book)", - "source_data": { - "dataset_name": "NaturalQuestions (open-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (open-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.596, - "details": { - "description": "min=0.582, mean=0.596, max=0.615, sum=1.788 (3)", - "tab": "Accuracy", - "NaturalQuestions (closed-book) - ECE (10-bin)": { - "description": "min=0.134, mean=0.141, max=0.149, sum=0.423 (3)", - "tab": "Calibration", - "score": 0.14107540425227785 - }, - "NaturalQuestions (open-book) - ECE (10-bin)": { - "description": "min=0.379, mean=0.384, max=0.387, sum=1.153 (3)", - "tab": "Calibration", - "score": 0.38437204570087863 - }, - "NaturalQuestions (closed-book) - F1 (Robustness)": { - "description": "min=0.2, mean=0.206, max=0.216, sum=0.619 (3)", - "tab": "Robustness", - "score": 0.20625206311676839 - }, - "NaturalQuestions (open-book) - F1 (Robustness)": { - "description": "min=0.419, mean=0.458, max=0.503, sum=1.373 (3)", - "tab": "Robustness", - "score": 0.45767430702477907 - }, - "NaturalQuestions (closed-book) - F1 (Fairness)": { - "description": "min=0.215, mean=0.218, max=0.221, sum=0.654 (3)", - "tab": "Fairness", - "score": 0.2180459446078801 - }, - "NaturalQuestions (open-book) - F1 (Fairness)": { - "description": "min=0.52, mean=0.536, max=0.558, sum=1.607 (3)", - "tab": "Fairness", - "score": 0.5357020972773482 - }, - "NaturalQuestions (closed-book) - Denoised inference time (s)": { - "description": "min=0.471, mean=0.611, max=0.739, sum=1.834 (3)", - "tab": "Efficiency", - "score": 0.611190575244526 - }, - "NaturalQuestions (open-book) - Denoised inference time (s)": { - "description": "min=2.887, mean=3.632, max=4.314, sum=10.896 (3)", - "tab": "Efficiency", - "score": 3.631964569965005 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=110.254, mean=112.254, max=116.254, sum=336.762 (3)", - "tab": "General information", - "score": 112.254 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=128.956, mean=153.231, max=173.545, sum=459.692 (3)", - "tab": "General information", - "score": 153.23066666666668 - }, - "NaturalQuestions (closed-book) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - 
}, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.647, mean=4.691, max=4.724, sum=14.074 (3)", - "tab": "General information", - "score": 4.691333333333334 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.036, mean=0.036, max=0.036, sum=0.108 (3)", - "tab": "General information", - "score": 0.036 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1231.212, mean=1419.574, max=1523.257, sum=4258.721 (3)", - "tab": "General information", - "score": 1419.5736666666664 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=168.231, mean=211.805, max=244.906, sum=635.415 (3)", - "tab": "General information", - "score": 211.80499999999998 - }, - "NaturalQuestions (open-book) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NaturalQuestions (closed-book) - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NaturalQuestions (closed-book) - Stereotypes (gender)": { - "description": "min=0, mean=0.278, max=0.5, sum=0.833 (3)", - "tab": "Bias", - "score": 0.27777777777777773 - }, - "NaturalQuestions (closed-book) - Representation (race)": { - "description": "min=0.467, mean=0.481, max=0.491, sum=1.444 (3)", - "tab": "Bias", - "score": 0.481339792158324 - }, - "NaturalQuestions (closed-book) - Representation (gender)": { - "description": "min=0.106, mean=0.156, max=0.233, sum=0.469 (3)", - "tab": "Bias", - "score": 0.156341189674523 - }, - "NaturalQuestions (open-book) - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=0.667 (1)", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "NaturalQuestions (open-book) - Stereotypes (gender)": { - "description": "min=0.287, mean=0.338, max=0.395, sum=1.015 (3)", - "tab": "Bias", - "score": 0.33841269841269833 - }, - "NaturalQuestions (open-book) - Representation (race)": { - "description": "min=0.4, mean=0.427, max=0.48, sum=1.281 (3)", - "tab": "Bias", - "score": 0.42701178032188486 - }, - "NaturalQuestions (open-book) - Representation (gender)": { - "description": "min=0.116, mean=0.119, max=0.124, sum=0.357 (3)", - "tab": "Bias", - "score": 0.11888541157186479 - }, - "NaturalQuestions (closed-book) - Toxic fraction": { - "description": "min=0.001, mean=0.001, max=0.001, sum=0.003 (3)", - "tab": "Toxicity", - "score": 0.001 - }, - "NaturalQuestions (open-book) - Toxic fraction": { - "description": "min=0.001, mean=0.002, max=0.002, sum=0.005 (3)", - "tab": "Toxicity", - "score": 0.0016666666666666668 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "QuAC", - "source_data": { - "dataset_name": "QuAC", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on QuAC", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.357, - "details": { - "description": "min=0.35, mean=0.357, max=0.366, sum=1.07 (3)", - "tab": "Accuracy", - "QuAC - ECE (10-bin)": { - "description": "min=0.135, mean=0.154, max=0.176, sum=0.461 (3)", - "tab": "Calibration", - "score": 0.15357329550060583 - }, - "QuAC - F1 (Robustness)": { - "description": "min=0.177, mean=0.199, max=0.217, sum=0.597 (3)", - "tab": "Robustness", - "score": 0.19914898808715295 - }, - 
"QuAC - F1 (Fairness)": { - "description": "min=0.267, mean=0.268, max=0.27, sum=0.805 (3)", - "tab": "Fairness", - "score": 0.26839685415319225 - }, - "QuAC - Denoised inference time (s)": { - "description": "min=2.636, mean=2.658, max=2.683, sum=7.974 (3)", - "tab": "Efficiency", - "score": 2.6581093871351746 - }, - "QuAC - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "QuAC - # train": { - "description": "min=0.845, mean=0.944, max=1.086, sum=2.833 (3)", - "tab": "General information", - "score": 0.9443333333333334 - }, - "QuAC - truncated": { - "description": "min=0.016, mean=0.016, max=0.016, sum=0.048 (3)", - "tab": "General information", - "score": 0.016 - }, - "QuAC - # prompt tokens": { - "description": "min=1625.523, mean=1644.831, max=1670.605, sum=4934.492 (3)", - "tab": "General information", - "score": 1644.8306666666667 - }, - "QuAC - # output tokens": { - "description": "min=89.614, mean=91.909, max=95.996, sum=275.728 (3)", - "tab": "General information", - "score": 91.90933333333334 - }, - "QuAC - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "QuAC - Stereotypes (race)": { - "description": "min=0.556, mean=0.592, max=0.619, sum=1.775 (3)", - "tab": "Bias", - "score": 0.5915343915343915 - }, - "QuAC - Stereotypes (gender)": { - "description": "min=0.398, mean=0.413, max=0.424, sum=1.239 (3)", - "tab": "Bias", - "score": 0.41297615039041286 - }, - "QuAC - Representation (race)": { - "description": "min=0.228, mean=0.272, max=0.324, sum=0.816 (3)", - "tab": "Bias", - "score": 0.27205505897640186 - }, - "QuAC - Representation (gender)": { - "description": "min=0.239, mean=0.245, max=0.252, sum=0.734 (3)", - "tab": "Bias", - "score": 0.2445248639131045 - }, - "QuAC - Toxic fraction": { - "description": "min=0.001, mean=0.001, max=0.002, sum=0.004 (3)", - "tab": "Toxicity", - "score": 0.0013333333333333333 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "HellaSwag", - "source_data": { - "dataset_name": "HellaSwag", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on HellaSwag", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.745, - "details": { - "description": "min=0.745, mean=0.745, max=0.745, sum=0.745 (1)", - "tab": "Accuracy", - "HellaSwag - ECE (10-bin)": { - "description": "min=0.293, mean=0.293, max=0.293, sum=0.293 (1)", - "tab": "Calibration", - "score": 0.29326475041918015 - }, - "HellaSwag - EM (Robustness)": { - "description": "min=0.699, mean=0.699, max=0.699, sum=0.699 (1)", - "tab": "Robustness", - "score": 0.699 - }, - "HellaSwag - EM (Fairness)": { - "description": "min=0.597, mean=0.597, max=0.597, sum=0.597 (1)", - "tab": "Fairness", - "score": 0.597 - }, - "HellaSwag - Denoised inference time (s)": { - "description": "min=0.971, mean=0.971, max=0.971, sum=0.971 (1)", - "tab": "Efficiency", - "score": 0.9708148735597889 - }, - "HellaSwag - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "HellaSwag - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag 
- truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag - # prompt tokens": { - "description": "min=87.888, mean=87.888, max=87.888, sum=87.888 (1)", - "tab": "General information", - "score": 87.888 - }, - "HellaSwag - # output tokens": { - "description": "min=0.2, mean=0.2, max=0.2, sum=0.2 (1)", - "tab": "General information", - "score": 0.2 - }, - "HellaSwag - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.534, - "details": { - "description": "min=0.534, mean=0.534, max=0.534, sum=0.534 (1)", - "tab": "Accuracy", - "OpenbookQA - ECE (10-bin)": { - "description": "min=0.237, mean=0.237, max=0.237, sum=0.237 (1)", - "tab": "Calibration", - "score": 0.2373615873422732 - }, - "OpenbookQA - EM (Robustness)": { - "description": "min=0.45, mean=0.45, max=0.45, sum=0.45 (1)", - "tab": "Robustness", - "score": 0.45 - }, - "OpenbookQA - EM (Fairness)": { - "description": "min=0.454, mean=0.454, max=0.454, sum=0.454 (1)", - "tab": "Fairness", - "score": 0.454 - }, - "OpenbookQA - Denoised inference time (s)": { - "description": "min=0.188, mean=0.188, max=0.188, sum=0.188 (1)", - "tab": "Efficiency", - "score": 0.18798254558309685 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=5.27, mean=5.27, max=5.27, sum=5.27 (1)", - "tab": "General information", - "score": 5.27 - }, - "OpenbookQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "OpenbookQA - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "TruthfulQA", - "source_data": { - "dataset_name": "TruthfulQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on TruthfulQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.201, - "details": { - "description": "min=0.185, mean=0.201, max=0.22, sum=0.804 (4)", - "tab": "Accuracy", - "TruthfulQA - ECE (10-bin)": { - "description": "min=0.047, mean=0.073, max=0.084, sum=0.293 (4)", - "tab": "Calibration", - "score": 0.07328356622626138 - }, - "TruthfulQA - EM (Robustness)": { - "description": "min=0.135, mean=0.174, max=0.206, sum=0.694 (4)", - "tab": 
"Robustness", - "score": 0.1735474006116208 - }, - "TruthfulQA - EM (Fairness)": { - "description": "min=0.145, mean=0.173, max=0.206, sum=0.693 (4)", - "tab": "Fairness", - "score": 0.17316513761467892 - }, - "TruthfulQA - Denoised inference time (s)": { - "description": "min=0.033, mean=0.041, max=0.046, sum=0.163 (4)", - "tab": "Efficiency", - "score": 0.04074840224276806 - }, - "TruthfulQA - # eval": { - "description": "min=654, mean=654, max=654, sum=2616 (4)", - "tab": "General information", - "score": 654.0 - }, - "TruthfulQA - # train": { - "description": "min=0, mean=3.75, max=5, sum=15 (4)", - "tab": "General information", - "score": 3.75 - }, - "TruthfulQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (4)", - "tab": "General information", - "score": 0.0 - }, - "TruthfulQA - # prompt tokens": { - "description": "min=85.121, mean=404.621, max=529.121, sum=1618.483 (4)", - "tab": "General information", - "score": 404.62079510703364 - }, - "TruthfulQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=4 (4)", - "tab": "General information", - "score": 1.0 - }, - "TruthfulQA - # trials": { - "description": "min=1, mean=2.5, max=3, sum=10 (4)", - "tab": "General information", - "score": 2.5 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MS MARCO (TREC)", - "source_data": { - "dataset_name": "MS MARCO (TREC)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "NDCG@10 on MS MARCO (TREC)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.482, - "details": { - "description": "min=0.467, mean=0.482, max=0.511, sum=1.446 (3)", - "tab": "Accuracy", - "MS MARCO (regular) - RR@10 (Robustness)": { - "description": "min=0.175, mean=0.179, max=0.187, sum=0.537 (3)", - "tab": "Robustness", - "score": 0.1788788359788358 - }, - "MS MARCO (TREC) - NDCG@10 (Robustness)": { - "description": "min=0.421, mean=0.437, max=0.46, sum=1.31 (3)", - "tab": "Robustness", - "score": 0.436684763137285 - }, - "MS MARCO (regular) - RR@10 (Fairness)": { - "description": "min=0.21, mean=0.214, max=0.221, sum=0.642 (3)", - "tab": "Fairness", - "score": 0.2139329365079363 - }, - "MS MARCO (TREC) - NDCG@10 (Fairness)": { - "description": "min=0.45, mean=0.471, max=0.501, sum=1.412 (3)", - "tab": "Fairness", - "score": 0.4706976603850948 - }, - "MS MARCO (regular) - Denoised inference time (s)": { - "description": "min=0.066, mean=0.076, max=0.089, sum=0.227 (3)", - "tab": "Efficiency", - "score": 0.07567241383876121 - }, - "MS MARCO (TREC) - Denoised inference time (s)": { - "description": "min=0.061, mean=0.102, max=0.183, sum=0.305 (3)", - "tab": "Efficiency", - "score": 0.10182954292591756 - }, - "MS MARCO (regular) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "MS MARCO (regular) - # train": { - "description": "min=2, mean=2, max=2, sum=6 (3)", - "tab": "General information", - "score": 2.0 - }, - "MS MARCO (regular) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "MS MARCO (regular) - # prompt tokens": { - "description": "min=495.232, mean=532.565, max=577.232, sum=1597.696 (3)", - "tab": "General information", - "score": 
532.5653333333333 - }, - "MS MARCO (regular) - # output tokens": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "MS MARCO (regular) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "MS MARCO (TREC) - # eval": { - "description": "min=43, mean=43, max=43, sum=129 (3)", - "tab": "General information", - "score": 43.0 - }, - "MS MARCO (TREC) - # train": { - "description": "min=2, mean=2, max=2, sum=6 (3)", - "tab": "General information", - "score": 2.0 - }, - "MS MARCO (TREC) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "MS MARCO (TREC) - # prompt tokens": { - "description": "min=478.488, mean=515.822, max=560.488, sum=1547.465 (3)", - "tab": "General information", - "score": 515.8217054263565 - }, - "MS MARCO (TREC) - # output tokens": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "MS MARCO (TREC) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "MS MARCO (regular) - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - }, - "MS MARCO (TREC) - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CNN/DailyMail", - "source_data": { - "dataset_name": "CNN/DailyMail", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on CNN/DailyMail", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.136, - "details": { - "description": "min=0.119, mean=0.136, max=0.149, sum=0.816 (6)", - "tab": "Accuracy", - "CNN/DailyMail - Denoised inference time (s)": { - "description": "min=1.828, mean=1.972, max=2.045, sum=11.831 (6)", - "tab": "Efficiency", - "score": 1.971851329588582 - }, - "CNN/DailyMail - # eval": { - "description": "min=466, mean=466, 
max=466, sum=2796 (6)", - "tab": "General information", - "score": 466.0 - }, - "CNN/DailyMail - # train": { - "description": "min=5, mean=5, max=5, sum=30 (6)", - "tab": "General information", - "score": 5.0 - }, - "CNN/DailyMail - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (6)", - "tab": "General information", - "score": 0.0 - }, - "CNN/DailyMail - # prompt tokens": { - "description": "min=1531.586, mean=1549.919, max=1567.586, sum=9299.515 (6)", - "tab": "General information", - "score": 1549.9191702432045 - }, - "CNN/DailyMail - # output tokens": { - "description": "min=72.955, mean=77.928, max=83.685, sum=467.567 (6)", - "tab": "General information", - "score": 77.9277539341917 - }, - "CNN/DailyMail - # trials": { - "description": "min=3, mean=3, max=3, sum=18 (6)", - "tab": "General information", - "score": 3.0 - }, - "CNN/DailyMail - Stereotypes (race)": { - "description": "min=0.589, mean=0.609, max=0.627, sum=3.657 (6)", - "tab": "Bias", - "score": 0.6094903870639165 - }, - "CNN/DailyMail - Stereotypes (gender)": { - "description": "min=0.396, mean=0.404, max=0.412, sum=2.424 (6)", - "tab": "Bias", - "score": 0.40393077624581836 - }, - "CNN/DailyMail - Representation (race)": { - "description": "min=0.287, mean=0.337, max=0.37, sum=2.024 (6)", - "tab": "Bias", - "score": 0.33739205476866063 - }, - "CNN/DailyMail - Representation (gender)": { - "description": "min=0.121, mean=0.128, max=0.139, sum=0.766 (6)", - "tab": "Bias", - "score": 0.12773227690338504 - }, - "CNN/DailyMail - Toxic fraction": { - "description": "min=0, mean=0.001, max=0.002, sum=0.004 (6)", - "tab": "Toxicity", - "score": 0.000715307582260372 - }, - "CNN/DailyMail - SummaC": { - "description": "min=0.064, mean=0.197, max=0.291, sum=0.592 (3)", - "tab": "Summarization metrics", - "score": 0.19745183659958473 - }, - "CNN/DailyMail - QAFactEval": { - "description": "min=4.708, mean=4.735, max=4.771, sum=28.41 (6)", - "tab": "Summarization metrics", - "score": 4.735075808555843 - }, - "CNN/DailyMail - BERTScore (F1)": { - "description": "min=0.206, mean=0.256, max=0.287, sum=0.769 (3)", - "tab": "Summarization metrics", - "score": 0.2564336767010044 - }, - "CNN/DailyMail - Coverage": { - "description": "min=0.829, mean=0.92, max=0.97, sum=5.522 (6)", - "tab": "Summarization metrics", - "score": 0.9202647711974157 - }, - "CNN/DailyMail - Density": { - "description": "min=34.301, mean=41.595, max=46.027, sum=249.573 (6)", - "tab": "Summarization metrics", - "score": 41.59545904426739 - }, - "CNN/DailyMail - Compression": { - "description": "min=8.796, mean=9.759, max=10.302, sum=58.557 (6)", - "tab": "Summarization metrics", - "score": 9.759458553538733 - }, - "CNN/DailyMail - HumanEval-faithfulness": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-relevance": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-coherence": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "XSUM", - "source_data": { - "dataset_name": "XSUM", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on XSUM", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.126, - "details": { - "description": "min=0.123, mean=0.126, max=0.131, sum=0.757 (6)", - "tab": "Accuracy", - "XSUM - Denoised inference time (s)": { - "description": "min=0.833, mean=0.885, max=0.939, sum=5.309 (6)", - "tab": "Efficiency", - "score": 0.8849094198151292 - }, - "XSUM - # eval": { - "description": "min=518, mean=518, max=518, sum=3108 (6)", - "tab": "General information", - "score": 518.0 - }, - "XSUM - # train": { - "description": "min=4.998, mean=4.999, max=5, sum=29.992 (6)", - "tab": "General information", - "score": 4.998712998712999 - }, - "XSUM - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (6)", - "tab": "General information", - "score": 0.0 - }, - "XSUM - # prompt tokens": { - "description": "min=1456.402, mean=1510.418, max=1538.921, sum=9062.51 (6)", - "tab": "General information", - "score": 1510.4182754182755 - }, - "XSUM - # output tokens": { - "description": "min=23.931, mean=24.362, max=24.873, sum=146.17 (6)", - "tab": "General information", - "score": 24.361647361647357 - }, - "XSUM - # trials": { - "description": "min=3, mean=3, max=3, sum=18 (6)", - "tab": "General information", - "score": 3.0 - }, - "XSUM - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=4 (6)", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "XSUM - Stereotypes (gender)": { - "description": "min=0.453, mean=0.469, max=0.478, sum=2.812 (6)", - "tab": "Bias", - "score": 0.46873713991769544 - }, - "XSUM - Representation (race)": { - "description": "min=0.356, mean=0.462, max=0.532, sum=2.769 (6)", - "tab": "Bias", - "score": 0.46156957217464706 - }, - "XSUM - Representation (gender)": { - "description": "min=0.168, mean=0.186, max=0.201, sum=1.118 (6)", - "tab": "Bias", - "score": 0.18640980232047377 - }, - "XSUM - Toxic fraction": { - "description": "min=0.002, mean=0.003, max=0.004, sum=0.015 (6)", - "tab": "Toxicity", - "score": 0.002574002574002574 - }, - "XSUM - SummaC": { - "description": "min=-0.208, mean=-0.189, max=-0.166, sum=-0.566 (3)", - "tab": "Summarization metrics", - "score": -0.18875486064192462 - }, - "XSUM - QAFactEval": { - "description": "min=3.146, mean=3.324, max=3.669, sum=19.946 (6)", - "tab": "Summarization metrics", - "score": 3.3243234460347995 - }, - "XSUM - BERTScore (F1)": { - "description": "min=0.416, mean=0.417, max=0.419, sum=1.251 (3)", - "tab": "Summarization metrics", - "score": 0.4169695047035986 - }, - "XSUM - Coverage": { - "description": "min=0.815, mean=0.817, max=0.819, sum=4.904 (6)", - "tab": "Summarization metrics", - "score": 0.8172878337570123 - }, - "XSUM - Density": { - "description": "min=3.708, mean=3.899, max=4.102, sum=23.393 (6)", - "tab": "Summarization metrics", - "score": 3.898863398596404 - }, - "XSUM - Compression": { - "description": "min=18.005, mean=18.414, max=18.872, sum=110.483 (6)", - "tab": "Summarization metrics", - "score": 18.413782867028814 - }, - "XSUM - HumanEval-faithfulness": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-relevance": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-coherence": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - 
"additional_details": {} - } - }, - { - "evaluation_name": "IMDB", - "source_data": { - "dataset_name": "IMDB", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on IMDB", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.917, - "details": { - "description": "min=0.906, mean=0.917, max=0.926, sum=2.752 (3)", - "tab": "Accuracy", - "IMDB - ECE (10-bin)": { - "description": "min=0.289, mean=0.302, max=0.327, sum=0.905 (3)", - "tab": "Calibration", - "score": 0.30155451934186406 - }, - "IMDB - EM (Robustness)": { - "description": "min=0.872, mean=0.886, max=0.901, sum=2.659 (3)", - "tab": "Robustness", - "score": 0.8863333333333333 - }, - "IMDB - EM (Fairness)": { - "description": "min=0.898, mean=0.908, max=0.919, sum=2.725 (3)", - "tab": "Fairness", - "score": 0.9083333333333333 - }, - "IMDB - Denoised inference time (s)": { - "description": "min=0.515, mean=0.54, max=0.569, sum=1.62 (3)", - "tab": "Efficiency", - "score": 0.5398914054599924 - }, - "IMDB - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "IMDB - # train": { - "description": "min=4.846, mean=4.933, max=4.986, sum=14.798 (3)", - "tab": "General information", - "score": 4.932666666666667 - }, - "IMDB - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "IMDB - # prompt tokens": { - "description": "min=1152.694, mean=1389.454, max=1744.631, sum=4168.363 (3)", - "tab": "General information", - "score": 1389.4543333333331 - }, - "IMDB - # output tokens": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "IMDB - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "IMDB - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CivilComments", - "source_data": { - "dataset_name": "CivilComments", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on CivilComments", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.506, - "details": { - "description": "min=0, mean=0.506, max=1, sum=27.302 (54)", - "tab": "Accuracy", - "CivilComments - ECE (10-bin)": { - "description": "min=0.299, mean=0.474, max=0.666, sum=25.591 (54)", - "tab": "Calibration", - "score": 0.47391416538592424 - }, - "CivilComments 
- EM (Robustness)": { - "description": "min=0, mean=0.305, max=0.939, sum=16.459 (54)", - "tab": "Robustness", - "score": 0.30478947142198615 - }, - "CivilComments - EM (Fairness)": { - "description": "min=0, mean=0.5, max=1, sum=27.006 (54)", - "tab": "Fairness", - "score": 0.5001070006147802 - }, - "CivilComments - Denoised inference time (s)": { - "description": "min=0.173, mean=0.212, max=0.325, sum=11.459 (54)", - "tab": "Efficiency", - "score": 0.21220531272072915 - }, - "CivilComments - # eval": { - "description": "min=74, mean=371.556, max=683, sum=20064 (54)", - "tab": "General information", - "score": 371.55555555555554 - }, - "CivilComments - # train": { - "description": "min=5, mean=5, max=5, sum=270 (54)", - "tab": "General information", - "score": 5.0 - }, - "CivilComments - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (54)", - "tab": "General information", - "score": 0.0 - }, - "CivilComments - # prompt tokens": { - "description": "min=356.537, mean=722.635, max=1267.519, sum=39022.317 (54)", - "tab": "General information", - "score": 722.6354931173206 - }, - "CivilComments - # output tokens": { - "description": "min=5, mean=5, max=5, sum=270 (54)", - "tab": "General information", - "score": 5.0 - }, - "CivilComments - # trials": { - "description": "min=3, mean=3, max=3, sum=162 (54)", - "tab": "General information", - "score": 3.0 - }, - "CivilComments - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (54)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "RAFT", - "source_data": { - "dataset_name": "RAFT", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on RAFT", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.557, - "details": { - "description": "min=0.175, mean=0.557, max=0.975, sum=18.375 (33)", - "tab": "Accuracy", - "RAFT - ECE (10-bin)": { - "description": "min=0.126, mean=0.468, max=0.975, sum=15.455 (33)", - "tab": "Calibration", - "score": 0.468339884912531 - }, - "RAFT - EM (Robustness)": { - "description": "min=0.025, mean=0.405, max=0.85, sum=13.35 (33)", - "tab": "Robustness", - "score": 0.4045454545454546 - }, - "RAFT - EM (Fairness)": { - "description": "min=0.175, mean=0.536, max=0.975, sum=17.7 (33)", - "tab": "Fairness", - "score": 0.5363636363636364 - }, - "RAFT - Denoised inference time (s)": { - "description": "min=0.069, mean=1.871, max=6.606, sum=61.732 (33)", - "tab": "Efficiency", - "score": 1.8706600076246471 - }, - "RAFT - # eval": { - "description": "min=40, mean=40, max=40, sum=1320 (33)", - "tab": "General information", - "score": 40.0 - }, - "RAFT - # train": { - "description": "min=0, mean=4.556, max=5, sum=150.35 (33)", - "tab": "General information", - "score": 4.556060606060607 - }, - "RAFT - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (33)", - 
"tab": "General information", - "score": 0.0 - }, - "RAFT - # prompt tokens": { - "description": "min=257.35, mean=812.938, max=1773.675, sum=26826.95 (33)", - "tab": "General information", - "score": 812.937878787879 - }, - "RAFT - # output tokens": { - "description": "min=5, mean=18.712, max=30, sum=617.5 (33)", - "tab": "General information", - "score": 18.712121212121207 - }, - "RAFT - # trials": { - "description": "min=3, mean=3, max=3, sum=99 (33)", - "tab": "General information", - "score": 3.0 - }, - "RAFT - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (33)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_classic/microsoft/TNLG-v2-530B/cc85315f-4472-4b22-9f0a-e4609676ce13.json b/data/helm_classic/microsoft/TNLG-v2-530B/cc85315f-4472-4b22-9f0a-e4609676ce13.json deleted file mode 100644 index ddcfa82ef..000000000 --- a/data/helm_classic/microsoft/TNLG-v2-530B/cc85315f-4472-4b22-9f0a-e4609676ce13.json +++ /dev/null @@ -1,1613 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_classic/microsoft_TNLG-v2-530B/1770834891.1472661", - "retrieved_timestamp": "1770834891.1472661", - "source_metadata": { - "source_name": "helm_classic", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "TNLG v2 530B", - "id": "microsoft/TNLG-v2-530B", - "developer": "microsoft", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_classic", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperform on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.787, - "details": { - "tab": "Accuracy", - "Mean win rate - Calibration": { - "description": null, - "tab": "Calibration", - "score": 0.6152996196936993 - }, - "Mean win rate - Robustness": { - "description": null, - "tab": "Robustness", - "score": 0.6503510949562118 - }, - "Mean win rate - Fairness": { - "description": null, - "tab": "Fairness", - "score": 0.7516679834811092 - }, - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": null - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - }, - "Mean win rate - Bias": { - "description": null, - "tab": "Bias", - "score": 0.5308990441173578 - }, - "Mean win rate - Toxicity": { - "description": null, - "tab": "Toxicity", - "score": 0.3298371381704715 - }, - "Mean win rate - Summarization metrics": { - "description": null, - "tab": "Summarization metrics", - "score": 0.756578947368421 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU", - 
"source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.469, - "details": { - "description": "min=0.24, mean=0.469, max=0.78, sum=7.035 (15)", - "tab": "Accuracy", - "MMLU - ECE (10-bin)": { - "description": "min=0.073, mean=0.127, max=0.202, sum=1.908 (15)", - "tab": "Calibration", - "score": 0.12722994020701678 - }, - "MMLU - EM (Robustness)": { - "description": "min=0.15, mean=0.403, max=0.75, sum=6.051 (15)", - "tab": "Robustness", - "score": 0.40336842105263154 - }, - "MMLU - EM (Fairness)": { - "description": "min=0.17, mean=0.418, max=0.75, sum=6.266 (15)", - "tab": "Fairness", - "score": 0.41770760233918125 - }, - "MMLU - Denoised inference time (s)": { - "description": "5 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=1542 (15)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=75 (15)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (15)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=371.38, mean=472.274, max=624.07, sum=7084.111 (15)", - "tab": "General information", - "score": 472.2740350877193 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=15 (15)", - "tab": "General information", - "score": 1.0 - }, - "MMLU - # trials": { - "description": "min=3, mean=3, max=3, sum=45 (15)", - "tab": "General information", - "score": 3.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "BoolQ", - "source_data": { - "dataset_name": "BoolQ", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on BoolQ", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.809, - "details": { - "description": "min=0.798, mean=0.809, max=0.829, sum=2.428 (3)", - "tab": "Accuracy", - "BoolQ - ECE (10-bin)": { - "description": "min=0.017, mean=0.048, max=0.088, sum=0.144 (3)", - "tab": "Calibration", - "score": 0.04811928896988451 - }, - "BoolQ - EM (Robustness)": { - "description": "min=0.724, mean=0.733, max=0.747, sum=2.198 (3)", - "tab": "Robustness", - "score": 0.7326666666666667 - }, - "BoolQ - EM (Fairness)": { - "description": "min=0.756, mean=0.767, max=0.777, sum=2.3 (3)", - "tab": "Fairness", - "score": 0.7666666666666667 - }, - "BoolQ - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "BoolQ - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "BoolQ - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "BoolQ - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", 
- "score": 0.0 - }, - "BoolQ - # prompt tokens": { - "description": "min=660.073, mean=908.406, max=1242.073, sum=2725.219 (3)", - "tab": "General information", - "score": 908.4063333333334 - }, - "BoolQ - # output tokens": { - "description": "min=1, mean=1, max=1, sum=3 (3)", - "tab": "General information", - "score": 1.0 - }, - "BoolQ - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "BoolQ - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.722, - "details": { - "description": "min=0.692, mean=0.722, max=0.743, sum=2.166 (3)", - "tab": "Accuracy", - "NarrativeQA - ECE (10-bin)": { - "description": "min=0.026, mean=0.05, max=0.075, sum=0.15 (3)", - "tab": "Calibration", - "score": 0.05012197972633472 - }, - "NarrativeQA - F1 (Robustness)": { - "description": "min=0.22, mean=0.319, max=0.405, sum=0.957 (3)", - "tab": "Robustness", - "score": 0.31894751591392195 - }, - "NarrativeQA - F1 (Fairness)": { - "description": "min=0.601, mean=0.632, max=0.664, sum=1.895 (3)", - "tab": "Fairness", - "score": 0.6318169391667601 - }, - "NarrativeQA - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=1065 (3)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=1.051, mean=1.646, max=2.085, sum=4.938 (3)", - "tab": "General information", - "score": 1.6460093896713615 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=1600.366, mean=1651.848, max=1705.003, sum=4955.544 (3)", - "tab": "General information", - "score": 1651.8478873239437 - }, - "NarrativeQA - # output tokens": { - "description": "min=5.113, mean=5.982, max=7.265, sum=17.946 (3)", - "tab": "General information", - "score": 5.982159624413145 - }, - "NarrativeQA - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NarrativeQA - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NarrativeQA - Stereotypes (gender)": { - "description": "min=0.375, mean=0.395, max=0.436, sum=1.186 (3)", - "tab": "Bias", - "score": 0.3952991452991453 - }, - "NarrativeQA - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - 
"NarrativeQA - Representation (gender)": { - "description": "min=0.204, mean=0.221, max=0.239, sum=0.663 (3)", - "tab": "Bias", - "score": 0.22112892189926373 - }, - "NarrativeQA - Toxic fraction": { - "description": "min=0.011, mean=0.012, max=0.014, sum=0.037 (3)", - "tab": "Toxicity", - "score": 0.012206572769953052 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (open-book)", - "source_data": { - "dataset_name": "NaturalQuestions (open-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (open-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.642, - "details": { - "description": "min=0.617, mean=0.642, max=0.656, sum=1.926 (3)", - "tab": "Accuracy", - "NaturalQuestions (closed-book) - ECE (10-bin)": { - "description": "min=0.038, mean=0.04, max=0.041, sum=0.119 (3)", - "tab": "Calibration", - "score": 0.039723290660202144 - }, - "NaturalQuestions (open-book) - ECE (10-bin)": { - "description": "min=0.071, mean=0.075, max=0.078, sum=0.225 (3)", - "tab": "Calibration", - "score": 0.07490014228309726 - }, - "NaturalQuestions (closed-book) - F1 (Robustness)": { - "description": "min=0.291, mean=0.307, max=0.322, sum=0.922 (3)", - "tab": "Robustness", - "score": 0.3074701383832172 - }, - "NaturalQuestions (open-book) - F1 (Robustness)": { - "description": "min=0.483, mean=0.525, max=0.549, sum=1.576 (3)", - "tab": "Robustness", - "score": 0.5253631735860874 - }, - "NaturalQuestions (closed-book) - F1 (Fairness)": { - "description": "min=0.306, mean=0.318, max=0.324, sum=0.953 (3)", - "tab": "Fairness", - "score": 0.3175020164111731 - }, - "NaturalQuestions (open-book) - F1 (Fairness)": { - "description": "min=0.575, mean=0.598, max=0.61, sum=1.794 (3)", - "tab": "Fairness", - "score": 0.5979278798197498 - }, - "NaturalQuestions (closed-book) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NaturalQuestions (open-book) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=110.254, mean=112.254, max=116.254, sum=336.762 (3)", - "tab": "General information", - "score": 112.254 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=3.8, mean=4.569, max=5.632, sum=13.707 (3)", - "tab": "General information", - "score": 4.569 - }, - "NaturalQuestions (closed-book) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General 
information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.647, mean=4.691, max=4.723, sum=14.072 (3)", - "tab": "General information", - "score": 4.690666666666666 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.036, mean=0.036, max=0.036, sum=0.108 (3)", - "tab": "General information", - "score": 0.036 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1231.056, mean=1419.328, max=1523.222, sum=4257.983 (3)", - "tab": "General information", - "score": 1419.3276666666668 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=5.953, mean=6.015, max=6.134, sum=18.045 (3)", - "tab": "General information", - "score": 6.015000000000001 - }, - "NaturalQuestions (open-book) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NaturalQuestions (closed-book) - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NaturalQuestions (closed-book) - Stereotypes (gender)": { - "description": "min=0.25, mean=0.342, max=0.443, sum=1.026 (3)", - "tab": "Bias", - "score": 0.342063492063492 - }, - "NaturalQuestions (closed-book) - Representation (race)": { - "description": "min=0.53, mean=0.559, max=0.573, sum=1.676 (3)", - "tab": "Bias", - "score": 0.5587121212121212 - }, - "NaturalQuestions (closed-book) - Representation (gender)": { - "description": "min=0.206, mean=0.289, max=0.419, sum=0.867 (3)", - "tab": "Bias", - "score": 0.2891147156537034 - }, - "NaturalQuestions (open-book) - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=0.667 (1)", - "tab": "Bias", - "score": 0.6666666666666667 - }, - "NaturalQuestions (open-book) - Stereotypes (gender)": { - "description": "min=0.191, mean=0.277, max=0.345, sum=0.83 (3)", - "tab": "Bias", - "score": 0.27656250000000004 - }, - "NaturalQuestions (open-book) - Representation (race)": { - "description": "min=0.457, mean=0.469, max=0.484, sum=1.408 (3)", - "tab": "Bias", - "score": 0.4693006584979578 - }, - "NaturalQuestions (open-book) - Representation (gender)": { - "description": "min=0.254, mean=0.259, max=0.261, sum=0.776 (3)", - "tab": "Bias", - "score": 0.2587447378492154 - }, - "NaturalQuestions (closed-book) - Toxic fraction": { - "description": "min=0, mean=0.001, max=0.002, sum=0.003 (3)", - "tab": "Toxicity", - "score": 0.001 - }, - "NaturalQuestions (open-book) - Toxic fraction": { - "description": "min=0, mean=0.001, max=0.001, sum=0.002 (3)", - "tab": "Toxicity", - "score": 0.0006666666666666666 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "QuAC", - "source_data": { - "dataset_name": "QuAC", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on QuAC", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.39, - "details": { - "description": "min=0.388, mean=0.39, max=0.393, sum=1.171 (3)", - "tab": "Accuracy", - "QuAC - ECE (10-bin)": { - "description": "min=0.059, mean=0.08, max=0.106, sum=0.241 (3)", - "tab": "Calibration", - "score": 0.08020003145494241 - }, - "QuAC - F1 (Robustness)": { - "description": "min=0.183, mean=0.194, max=0.203, sum=0.583 (3)", - "tab": "Robustness", - "score": 
0.19421481147358363 - }, - "QuAC - F1 (Fairness)": { - "description": "min=0.304, mean=0.313, max=0.32, sum=0.94 (3)", - "tab": "Fairness", - "score": 0.3132392185201357 - }, - "QuAC - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "QuAC - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "QuAC - # train": { - "description": "min=0.845, mean=0.944, max=1.084, sum=2.831 (3)", - "tab": "General information", - "score": 0.9436666666666667 - }, - "QuAC - truncated": { - "description": "min=0.016, mean=0.016, max=0.016, sum=0.048 (3)", - "tab": "General information", - "score": 0.016 - }, - "QuAC - # prompt tokens": { - "description": "min=1624.371, mean=1644.436, max=1670.589, sum=4933.308 (3)", - "tab": "General information", - "score": 1644.436 - }, - "QuAC - # output tokens": { - "description": "min=25.915, mean=29.956, max=32.756, sum=89.867 (3)", - "tab": "General information", - "score": 29.95566666666667 - }, - "QuAC - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "QuAC - Stereotypes (race)": { - "description": "min=0.56, mean=0.579, max=0.599, sum=1.738 (3)", - "tab": "Bias", - "score": 0.5794166151309009 - }, - "QuAC - Stereotypes (gender)": { - "description": "min=0.428, mean=0.435, max=0.448, sum=1.305 (3)", - "tab": "Bias", - "score": 0.43504680341335694 - }, - "QuAC - Representation (race)": { - "description": "min=0.282, mean=0.333, max=0.369, sum=0.999 (3)", - "tab": "Bias", - "score": 0.33315102716024375 - }, - "QuAC - Representation (gender)": { - "description": "min=0.24, mean=0.25, max=0.259, sum=0.75 (3)", - "tab": "Bias", - "score": 0.2499075403684782 - }, - "QuAC - Toxic fraction": { - "description": "min=0.002, mean=0.003, max=0.003, sum=0.008 (3)", - "tab": "Toxicity", - "score": 0.0026666666666666666 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "HellaSwag", - "source_data": { - "dataset_name": "HellaSwag", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on HellaSwag", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.799, - "details": { - "description": "min=0.799, mean=0.799, max=0.799, sum=0.799 (1)", - "tab": "Accuracy", - "HellaSwag - ECE (10-bin)": { - "description": "min=0.322, mean=0.322, max=0.322, sum=0.322 (1)", - "tab": "Calibration", - "score": 0.32242755675811835 - }, - "HellaSwag - EM (Robustness)": { - "description": "min=0.757, mean=0.757, max=0.757, sum=0.757 (1)", - "tab": "Robustness", - "score": 0.757 - }, - "HellaSwag - EM (Fairness)": { - "description": "min=0.678, mean=0.678, max=0.678, sum=0.678 (1)", - "tab": "Fairness", - "score": 0.678 - }, - "HellaSwag - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "HellaSwag - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "HellaSwag - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag - truncated": { - "description": 
"min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag - # prompt tokens": { - "description": "min=87.888, mean=87.888, max=87.888, sum=87.888 (1)", - "tab": "General information", - "score": 87.888 - }, - "HellaSwag - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.562, - "details": { - "description": "min=0.562, mean=0.562, max=0.562, sum=0.562 (1)", - "tab": "Accuracy", - "OpenbookQA - ECE (10-bin)": { - "description": "min=0.243, mean=0.243, max=0.243, sum=0.243 (1)", - "tab": "Calibration", - "score": 0.2425759072363007 - }, - "OpenbookQA - EM (Robustness)": { - "description": "min=0.476, mean=0.476, max=0.476, sum=0.476 (1)", - "tab": "Robustness", - "score": 0.476 - }, - "OpenbookQA - EM (Fairness)": { - "description": "min=0.504, mean=0.504, max=0.504, sum=0.504 (1)", - "tab": "Fairness", - "score": 0.504 - }, - "OpenbookQA - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=5.27, mean=5.27, max=5.27, sum=5.27 (1)", - "tab": "General information", - "score": 5.27 - }, - "OpenbookQA - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "TruthfulQA", - "source_data": { - "dataset_name": "TruthfulQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on TruthfulQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.251, - "details": { - "description": "min=0.22, mean=0.251, max=0.275, sum=0.752 (3)", - "tab": "Accuracy", - "TruthfulQA - ECE (10-bin)": { - "description": "min=0.174, mean=0.226, max=0.252, sum=0.678 (3)", - "tab": "Calibration", - "score": 0.22594889867402287 - }, - "TruthfulQA - EM (Robustness)": { - "description": "min=0.187, mean=0.202, max=0.217, sum=0.607 (3)", - "tab": "Robustness", - "score": 0.20234454638124363 - }, - "TruthfulQA 
- EM (Fairness)": { - "description": "min=0.177, mean=0.197, max=0.213, sum=0.59 (3)", - "tab": "Fairness", - "score": 0.19673802242609584 - }, - "TruthfulQA - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "TruthfulQA - # eval": { - "description": "min=654, mean=654, max=654, sum=1962 (3)", - "tab": "General information", - "score": 654.0 - }, - "TruthfulQA - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "TruthfulQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "TruthfulQA - # prompt tokens": { - "description": "min=501.121, mean=511.121, max=529.121, sum=1533.362 (3)", - "tab": "General information", - "score": 511.12079510703364 - }, - "TruthfulQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=3 (3)", - "tab": "General information", - "score": 1.0 - }, - "TruthfulQA - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MS MARCO (TREC)", - "source_data": { - "dataset_name": "MS MARCO (TREC)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "NDCG@10 on MS MARCO (TREC)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.643, - "details": { - "description": "min=0.621, mean=0.643, max=0.662, sum=1.93 (3)", - "tab": "Accuracy", - "MS MARCO (regular) - RR@10 (Robustness)": { - "description": "min=0.264, mean=0.287, max=0.315, sum=0.86 (3)", - "tab": "Robustness", - "score": 0.28667883597883553 - }, - "MS MARCO (TREC) - NDCG@10 (Robustness)": { - "description": "min=0.54, mean=0.565, max=0.586, sum=1.696 (3)", - "tab": "Robustness", - "score": 0.5653481865448796 - }, - "MS MARCO (regular) - RR@10 (Fairness)": { - "description": "min=0.332, mean=0.341, max=0.354, sum=1.024 (3)", - "tab": "Fairness", - "score": 0.3414910052910049 - }, - "MS MARCO (TREC) - NDCG@10 (Fairness)": { - "description": "min=0.592, mean=0.612, max=0.629, sum=1.836 (3)", - "tab": "Fairness", - "score": 0.6120938886543282 - }, - "MS MARCO (regular) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "MS MARCO (TREC) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "MS MARCO (regular) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "MS MARCO (regular) - # train": { - "description": "min=2, mean=2, max=2, sum=6 (3)", - "tab": "General information", - "score": 2.0 - }, - "MS MARCO (regular) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "MS MARCO (regular) - # prompt tokens": { - "description": "min=495.232, mean=532.565, max=577.232, sum=1597.696 (3)", - "tab": "General information", - "score": 532.5653333333333 - }, - "MS MARCO (regular) - # output tokens": { - "description": "min=1.004, mean=1.011, max=1.02, sum=3.034 (3)", - "tab": 
"General information", - "score": 1.0113333333333334 - }, - "MS MARCO (regular) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "MS MARCO (TREC) - # eval": { - "description": "min=43, mean=43, max=43, sum=129 (3)", - "tab": "General information", - "score": 43.0 - }, - "MS MARCO (TREC) - # train": { - "description": "min=2, mean=2, max=2, sum=6 (3)", - "tab": "General information", - "score": 2.0 - }, - "MS MARCO (TREC) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "MS MARCO (TREC) - # prompt tokens": { - "description": "min=478.488, mean=515.822, max=560.488, sum=1547.465 (3)", - "tab": "General information", - "score": 515.8217054263565 - }, - "MS MARCO (TREC) - # output tokens": { - "description": "min=1, mean=1.016, max=1.023, sum=3.047 (3)", - "tab": "General information", - "score": 1.0155038759689923 - }, - "MS MARCO (TREC) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "MS MARCO (regular) - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - }, - "MS MARCO (TREC) - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CNN/DailyMail", - "source_data": { - "dataset_name": "CNN/DailyMail", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on CNN/DailyMail", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.161, - "details": { - "description": "min=0.151, mean=0.161, max=0.166, sum=0.966 (6)", - "tab": "Accuracy", - "CNN/DailyMail - Denoised inference time (s)": { - "description": "2 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "CNN/DailyMail - # eval": { - "description": "min=466, mean=466, max=466, sum=2796 (6)", - "tab": "General information", - "score": 466.0 - }, - "CNN/DailyMail - # train": { - 
"description": "min=5, mean=5, max=5, sum=30 (6)", - "tab": "General information", - "score": 5.0 - }, - "CNN/DailyMail - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (6)", - "tab": "General information", - "score": 0.0 - }, - "CNN/DailyMail - # prompt tokens": { - "description": "min=1531.586, mean=1549.919, max=1567.586, sum=9299.515 (6)", - "tab": "General information", - "score": 1549.9191702432045 - }, - "CNN/DailyMail - # output tokens": { - "description": "min=64.44, mean=66.904, max=70.5, sum=401.425 (6)", - "tab": "General information", - "score": 66.9041487839771 - }, - "CNN/DailyMail - # trials": { - "description": "min=3, mean=3, max=3, sum=18 (6)", - "tab": "General information", - "score": 3.0 - }, - "CNN/DailyMail - Stereotypes (race)": { - "description": "min=0.601, mean=0.629, max=0.647, sum=3.773 (6)", - "tab": "Bias", - "score": 0.6288257738993034 - }, - "CNN/DailyMail - Stereotypes (gender)": { - "description": "min=0.377, mean=0.398, max=0.411, sum=2.388 (6)", - "tab": "Bias", - "score": 0.3980717194410541 - }, - "CNN/DailyMail - Representation (race)": { - "description": "min=0.135, mean=0.227, max=0.309, sum=1.359 (6)", - "tab": "Bias", - "score": 0.22651255675216078 - }, - "CNN/DailyMail - Representation (gender)": { - "description": "min=0.114, mean=0.12, max=0.124, sum=0.721 (6)", - "tab": "Bias", - "score": 0.12013592572007394 - }, - "CNN/DailyMail - Toxic fraction": { - "description": "min=0.002, mean=0.003, max=0.004, sum=0.017 (6)", - "tab": "Toxicity", - "score": 0.002861230329041488 - }, - "CNN/DailyMail - SummaC": { - "description": "min=0.553, mean=0.573, max=0.595, sum=1.718 (3)", - "tab": "Summarization metrics", - "score": 0.5727510890981916 - }, - "CNN/DailyMail - QAFactEval": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - BERTScore (F1)": { - "description": "min=0.296, mean=0.316, max=0.326, sum=0.947 (3)", - "tab": "Summarization metrics", - "score": 0.3157002201673737 - }, - "CNN/DailyMail - Coverage": { - "description": "min=0.975, mean=0.977, max=0.981, sum=5.862 (6)", - "tab": "Summarization metrics", - "score": 0.9770276969879915 - }, - "CNN/DailyMail - Density": { - "description": "min=25.944, mean=26.968, max=27.893, sum=161.808 (6)", - "tab": "Summarization metrics", - "score": 26.967920888770376 - }, - "CNN/DailyMail - Compression": { - "description": "min=9.708, mean=10.317, max=10.928, sum=61.905 (6)", - "tab": "Summarization metrics", - "score": 10.317434111699901 - }, - "CNN/DailyMail - HumanEval-faithfulness": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-relevance": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-coherence": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "XSUM", - "source_data": { - "dataset_name": "XSUM", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on XSUM", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 
0.169, - "details": { - "description": "min=0.162, mean=0.169, max=0.172, sum=1.013 (6)", - "tab": "Accuracy", - "XSUM - Denoised inference time (s)": { - "description": "2 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "XSUM - # eval": { - "description": "min=518, mean=518, max=518, sum=3108 (6)", - "tab": "General information", - "score": 518.0 - }, - "XSUM - # train": { - "description": "min=4.998, mean=4.999, max=5, sum=29.992 (6)", - "tab": "General information", - "score": 4.998712998712999 - }, - "XSUM - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (6)", - "tab": "General information", - "score": 0.0 - }, - "XSUM - # prompt tokens": { - "description": "min=1456.402, mean=1510.418, max=1538.921, sum=9062.51 (6)", - "tab": "General information", - "score": 1510.4182754182755 - }, - "XSUM - # output tokens": { - "description": "min=27.172, mean=27.501, max=27.815, sum=165.008 (6)", - "tab": "General information", - "score": 27.501287001287 - }, - "XSUM - # trials": { - "description": "min=3, mean=3, max=3, sum=18 (6)", - "tab": "General information", - "score": 3.0 - }, - "XSUM - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=4 (6)", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "XSUM - Stereotypes (gender)": { - "description": "min=0.443, mean=0.449, max=0.459, sum=2.696 (6)", - "tab": "Bias", - "score": 0.4493607590885817 - }, - "XSUM - Representation (race)": { - "description": "min=0.362, mean=0.486, max=0.567, sum=2.914 (6)", - "tab": "Bias", - "score": 0.4857302118171683 - }, - "XSUM - Representation (gender)": { - "description": "min=0.195, mean=0.204, max=0.217, sum=1.223 (6)", - "tab": "Bias", - "score": 0.2037662889603199 - }, - "XSUM - Toxic fraction": { - "description": "min=0, mean=0.003, max=0.004, sum=0.015 (6)", - "tab": "Toxicity", - "score": 0.002574002574002574 - }, - "XSUM - SummaC": { - "description": "min=-0.297, mean=-0.281, max=-0.266, sum=-0.842 (3)", - "tab": "Summarization metrics", - "score": -0.2807751739040458 - }, - "XSUM - QAFactEval": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - BERTScore (F1)": { - "description": "min=0.472, mean=0.473, max=0.476, sum=1.42 (3)", - "tab": "Summarization metrics", - "score": 0.4734549353569219 - }, - "XSUM - Coverage": { - "description": "min=0.772, mean=0.774, max=0.777, sum=4.641 (6)", - "tab": "Summarization metrics", - "score": 0.7735373951395458 - }, - "XSUM - Density": { - "description": "min=2.174, mean=2.322, max=2.471, sum=13.929 (6)", - "tab": "Summarization metrics", - "score": 2.321577703631062 - }, - "XSUM - Compression": { - "description": "min=15.596, mean=15.776, max=15.931, sum=94.655 (6)", - "tab": "Summarization metrics", - "score": 15.775903485860036 - }, - "XSUM - HumanEval-faithfulness": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-relevance": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-coherence": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "IMDB", - "source_data": { - "dataset_name": "IMDB", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on IMDB", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.941, - "details": { - "description": "min=0.939, mean=0.941, max=0.942, sum=2.822 (3)", - "tab": "Accuracy", - "IMDB - ECE (10-bin)": { - "description": "min=0.065, mean=0.087, max=0.106, sum=0.262 (3)", - "tab": "Calibration", - "score": 0.08729270886734875 - }, - "IMDB - EM (Robustness)": { - "description": "min=0.92, mean=0.921, max=0.922, sum=2.763 (3)", - "tab": "Robustness", - "score": 0.9210000000000002 - }, - "IMDB - EM (Fairness)": { - "description": "min=0.933, mean=0.936, max=0.94, sum=2.807 (3)", - "tab": "Fairness", - "score": 0.9356666666666666 - }, - "IMDB - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "IMDB - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "IMDB - # train": { - "description": "min=4.845, mean=4.932, max=4.985, sum=14.796 (3)", - "tab": "General information", - "score": 4.9319999999999995 - }, - "IMDB - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "IMDB - # prompt tokens": { - "description": "min=1152.524, mean=1389.183, max=1743.988, sum=4167.55 (3)", - "tab": "General information", - "score": 1389.1833333333332 - }, - "IMDB - # output tokens": { - "description": "min=1, mean=1, max=1, sum=3 (3)", - "tab": "General information", - "score": 1.0 - }, - "IMDB - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "IMDB - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CivilComments", - "source_data": { - "dataset_name": "CivilComments", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on CivilComments", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.601, - "details": { - "description": "min=0.171, mean=0.601, max=0.983, sum=32.472 (54)", - "tab": "Accuracy", - "CivilComments - ECE (10-bin)": { - "description": "min=0.058, mean=0.213, max=0.447, sum=11.516 (54)", - "tab": "Calibration", - "score": 0.2132557883443423 - }, - "CivilComments - EM (Robustness)": { - "description": "min=0.069, mean=0.409, max=0.689, sum=22.106 (54)", - "tab": "Robustness", - "score": 0.4093704023963013 - }, - 
"CivilComments - EM (Fairness)": { - "description": "min=0.047, mean=0.48, max=0.97, sum=25.944 (54)", - "tab": "Fairness", - "score": 0.48044223702694133 - }, - "CivilComments - Denoised inference time (s)": { - "description": "9 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "CivilComments - # eval": { - "description": "min=74, mean=371.556, max=683, sum=20064 (54)", - "tab": "General information", - "score": 371.55555555555554 - }, - "CivilComments - # train": { - "description": "min=5, mean=5, max=5, sum=270 (54)", - "tab": "General information", - "score": 5.0 - }, - "CivilComments - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (54)", - "tab": "General information", - "score": 0.0 - }, - "CivilComments - # prompt tokens": { - "description": "min=356.537, mean=722.635, max=1267.519, sum=39022.317 (54)", - "tab": "General information", - "score": 722.6354931173206 - }, - "CivilComments - # output tokens": { - "description": "min=1, mean=1, max=1, sum=54 (54)", - "tab": "General information", - "score": 1.0 - }, - "CivilComments - # trials": { - "description": "min=3, mean=3, max=3, sum=162 (54)", - "tab": "General information", - "score": 3.0 - }, - "CivilComments - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (54)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "RAFT", - "source_data": { - "dataset_name": "RAFT", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on RAFT", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.679, - "details": { - "description": "min=0.025, mean=0.679, max=0.975, sum=22.4 (33)", - "tab": "Accuracy", - "RAFT - ECE (10-bin)": { - "description": "min=0.089, mean=0.244, max=0.908, sum=8.049 (33)", - "tab": "Calibration", - "score": 0.24392205141094134 - }, - "RAFT - EM (Robustness)": { - "description": "min=0, mean=0.545, max=0.85, sum=17.975 (33)", - "tab": "Robustness", - "score": 0.5446969696969698 - }, - "RAFT - EM (Fairness)": { - "description": "min=0.025, mean=0.644, max=0.975, sum=21.25 (33)", - "tab": "Fairness", - "score": 0.6439393939393939 - }, - "RAFT - Denoised inference time (s)": { - "description": "11 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "RAFT - # eval": { - "description": "min=40, mean=40, max=40, sum=1320 (33)", - "tab": "General information", - "score": 40.0 - }, - "RAFT - # train": { - "description": "min=0, mean=4.556, max=5, sum=150.35 (33)", - "tab": "General information", - "score": 4.556060606060607 - }, - "RAFT - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (33)", - "tab": "General information", - "score": 0.0 - }, - "RAFT - # prompt tokens": { - "description": "min=257.35, mean=812.938, max=1773.675, sum=26826.95 (33)", - "tab": "General information", 
- "score": 812.937878787879 - }, - "RAFT - # output tokens": { - "description": "min=0.15, mean=3.023, max=6.625, sum=99.75 (33)", - "tab": "General information", - "score": 3.022727272727273 - }, - "RAFT - # trials": { - "description": "min=3, mean=3, max=3, sum=99 (33)", - "tab": "General information", - "score": 3.0 - }, - "RAFT - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (33)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_classic/microsoft/TNLG-v2-6.7B/ab773619-db5e-449b-8d6b-da743cb038bb.json b/data/helm_classic/microsoft/TNLG-v2-6.7B/ab773619-db5e-449b-8d6b-da743cb038bb.json deleted file mode 100644 index b3f527a04..000000000 --- a/data/helm_classic/microsoft/TNLG-v2-6.7B/ab773619-db5e-449b-8d6b-da743cb038bb.json +++ /dev/null @@ -1,1613 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_classic/microsoft_TNLG-v2-6.7B/1770834891.1472661", - "retrieved_timestamp": "1770834891.1472661", - "source_metadata": { - "source_name": "helm_classic", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "TNLG v2 6.7B", - "id": "microsoft/TNLG-v2-6.7B", - "developer": "microsoft", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_classic", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperform on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.309, - "details": { - "tab": "Accuracy", - "Mean win rate - Calibration": { - "description": null, - "tab": "Calibration", - "score": 0.60170195635043 - }, - "Mean win rate - Robustness": { - "description": null, - "tab": "Robustness", - "score": 0.2395553093550869 - }, - "Mean win rate - Fairness": { - "description": null, - "tab": "Fairness", - "score": 0.2912077355347656 - }, - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": null - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - }, - "Mean win rate - Bias": { - "description": null, - "tab": "Bias", - "score": 0.43656162406269206 - }, - "Mean win rate - Toxicity": { - "description": null, - "tab": "Toxicity", - "score": 0.4445961445961446 - }, - "Mean win rate - Summarization metrics": { - "description": null, - "tab": "Summarization metrics", - "score": 0.611842105263158 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] 
- }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.242, - "details": { - "description": "min=0.2, mean=0.242, max=0.35, sum=3.627 (15)", - "tab": "Accuracy", - "MMLU - ECE (10-bin)": { - "description": "min=0.103, mean=0.132, max=0.175, sum=1.983 (15)", - "tab": "Calibration", - "score": 0.13220035950695058 - }, - "MMLU - EM (Robustness)": { - "description": "min=0.09, mean=0.169, max=0.24, sum=2.542 (15)", - "tab": "Robustness", - "score": 0.1694970760233918 - }, - "MMLU - EM (Fairness)": { - "description": "min=0.17, mean=0.212, max=0.31, sum=3.186 (15)", - "tab": "Fairness", - "score": 0.2124327485380117 - }, - "MMLU - Denoised inference time (s)": { - "description": "5 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=1542 (15)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=75 (15)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (15)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=371.38, mean=472.274, max=624.07, sum=7084.111 (15)", - "tab": "General information", - "score": 472.2740350877193 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=15 (15)", - "tab": "General information", - "score": 1.0 - }, - "MMLU - # trials": { - "description": "min=3, mean=3, max=3, sum=45 (15)", - "tab": "General information", - "score": 3.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "BoolQ", - "source_data": { - "dataset_name": "BoolQ", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on BoolQ", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.698, - "details": { - "description": "min=0.685, mean=0.698, max=0.709, sum=2.095 (3)", - "tab": "Accuracy", - "BoolQ - ECE (10-bin)": { - "description": "min=0.063, mean=0.065, max=0.067, sum=0.195 (3)", - "tab": "Calibration", - "score": 0.06514212406382298 - }, - "BoolQ - EM (Robustness)": { - "description": "min=0.623, mean=0.638, max=0.653, sum=1.914 (3)", - "tab": "Robustness", - "score": 0.638 - }, - "BoolQ - EM (Fairness)": { - "description": "min=0.649, mean=0.665, max=0.674, sum=1.996 (3)", - "tab": "Fairness", - "score": 0.6653333333333333 - }, - "BoolQ - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "BoolQ - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "BoolQ - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "BoolQ - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "BoolQ - # prompt tokens": { - "description": "min=660.073, mean=908.406, max=1242.073, sum=2725.219 (3)", - "tab": "General information", - "score": 908.4063333333334 - }, - "BoolQ - # 
output tokens": { - "description": "min=1, mean=1, max=1, sum=3 (3)", - "tab": "General information", - "score": 1.0 - }, - "BoolQ - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "BoolQ - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.631, - "details": { - "description": "min=0.612, mean=0.631, max=0.644, sum=1.893 (3)", - "tab": "Accuracy", - "NarrativeQA - ECE (10-bin)": { - "description": "min=0.045, mean=0.046, max=0.047, sum=0.138 (3)", - "tab": "Calibration", - "score": 0.0461090042242735 - }, - "NarrativeQA - F1 (Robustness)": { - "description": "min=0.314, mean=0.352, max=0.375, sum=1.056 (3)", - "tab": "Robustness", - "score": 0.35196743378602896 - }, - "NarrativeQA - F1 (Fairness)": { - "description": "min=0.492, mean=0.517, max=0.532, sum=1.552 (3)", - "tab": "Fairness", - "score": 0.5173113464127798 - }, - "NarrativeQA - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=1065 (3)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=1.051, mean=1.646, max=2.085, sum=4.938 (3)", - "tab": "General information", - "score": 1.6460093896713615 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=1600.366, mean=1651.848, max=1705.003, sum=4955.544 (3)", - "tab": "General information", - "score": 1651.8478873239437 - }, - "NarrativeQA - # output tokens": { - "description": "min=5.189, mean=6.499, max=7.989, sum=19.496 (3)", - "tab": "General information", - "score": 6.498591549295774 - }, - "NarrativeQA - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NarrativeQA - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NarrativeQA - Stereotypes (gender)": { - "description": "min=0.46, mean=0.476, max=0.5, sum=1.429 (3)", - "tab": "Bias", - "score": 0.47625 - }, - "NarrativeQA - Representation (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=2 (3)", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "NarrativeQA - Representation (gender)": { - "description": "min=0.203, mean=0.212, max=0.221, sum=0.637 (3)", - "tab": "Bias", - "score": 0.21227319042207152 - }, - 
"NarrativeQA - Toxic fraction": { - "description": "min=0.008, mean=0.011, max=0.014, sum=0.034 (3)", - "tab": "Toxicity", - "score": 0.011267605633802816 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (open-book)", - "source_data": { - "dataset_name": "NaturalQuestions (open-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (open-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.561, - "details": { - "description": "min=0.532, mean=0.561, max=0.585, sum=1.683 (3)", - "tab": "Accuracy", - "NaturalQuestions (closed-book) - ECE (10-bin)": { - "description": "min=0.028, mean=0.031, max=0.033, sum=0.093 (3)", - "tab": "Calibration", - "score": 0.031006448164221535 - }, - "NaturalQuestions (open-book) - ECE (10-bin)": { - "description": "min=0.071, mean=0.089, max=0.108, sum=0.266 (3)", - "tab": "Calibration", - "score": 0.08866228023213817 - }, - "NaturalQuestions (closed-book) - F1 (Robustness)": { - "description": "min=0.144, mean=0.149, max=0.159, sum=0.448 (3)", - "tab": "Robustness", - "score": 0.149387882661448 - }, - "NaturalQuestions (open-book) - F1 (Robustness)": { - "description": "min=0.215, mean=0.299, max=0.355, sum=0.896 (3)", - "tab": "Robustness", - "score": 0.2985499982493553 - }, - "NaturalQuestions (closed-book) - F1 (Fairness)": { - "description": "min=0.152, mean=0.162, max=0.17, sum=0.485 (3)", - "tab": "Fairness", - "score": 0.16163226517271406 - }, - "NaturalQuestions (open-book) - F1 (Fairness)": { - "description": "min=0.463, mean=0.501, max=0.532, sum=1.502 (3)", - "tab": "Fairness", - "score": 0.5005776676014201 - }, - "NaturalQuestions (closed-book) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NaturalQuestions (open-book) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=110.254, mean=112.254, max=116.254, sum=336.762 (3)", - "tab": "General information", - "score": 112.254 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=5.189, mean=5.6, max=5.896, sum=16.8 (3)", - "tab": "General information", - "score": 5.6000000000000005 - }, - "NaturalQuestions (closed-book) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.647, mean=4.691, max=4.723, sum=14.072 (3)", - "tab": 
"General information", - "score": 4.690666666666666 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.036, mean=0.036, max=0.036, sum=0.108 (3)", - "tab": "General information", - "score": 0.036 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1231.056, mean=1419.328, max=1523.222, sum=4257.983 (3)", - "tab": "General information", - "score": 1419.3276666666668 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=7.244, mean=8.369, max=10.389, sum=25.107 (3)", - "tab": "General information", - "score": 8.369 - }, - "NaturalQuestions (open-book) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NaturalQuestions (closed-book) - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NaturalQuestions (closed-book) - Stereotypes (gender)": { - "description": "min=0.494, mean=0.498, max=0.5, sum=1.494 (3)", - "tab": "Bias", - "score": 0.4981481481481482 - }, - "NaturalQuestions (closed-book) - Representation (race)": { - "description": "min=0.32, mean=0.479, max=0.588, sum=1.437 (3)", - "tab": "Bias", - "score": 0.47890062007709067 - }, - "NaturalQuestions (closed-book) - Representation (gender)": { - "description": "min=0.179, mean=0.274, max=0.437, sum=0.821 (3)", - "tab": "Bias", - "score": 0.2737208807573663 - }, - "NaturalQuestions (open-book) - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=0.667 (1)", - "tab": "Bias", - "score": 0.6666666666666667 - }, - "NaturalQuestions (open-book) - Stereotypes (gender)": { - "description": "min=0.167, mean=0.333, max=0.417, sum=1.0 (3)", - "tab": "Bias", - "score": 0.3333333333333333 - }, - "NaturalQuestions (open-book) - Representation (race)": { - "description": "min=0.399, mean=0.446, max=0.489, sum=1.338 (3)", - "tab": "Bias", - "score": 0.4460824634464231 - }, - "NaturalQuestions (open-book) - Representation (gender)": { - "description": "min=0.115, mean=0.228, max=0.345, sum=0.684 (3)", - "tab": "Bias", - "score": 0.22804989848201077 - }, - "NaturalQuestions (closed-book) - Toxic fraction": { - "description": "min=0, mean=0.0, max=0.001, sum=0.001 (3)", - "tab": "Toxicity", - "score": 0.0003333333333333333 - }, - "NaturalQuestions (open-book) - Toxic fraction": { - "description": "min=0.002, mean=0.002, max=0.003, sum=0.007 (3)", - "tab": "Toxicity", - "score": 0.0023333333333333335 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "QuAC", - "source_data": { - "dataset_name": "QuAC", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on QuAC", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.345, - "details": { - "description": "min=0.334, mean=0.345, max=0.365, sum=1.034 (3)", - "tab": "Accuracy", - "QuAC - ECE (10-bin)": { - "description": "min=0.046, mean=0.056, max=0.064, sum=0.169 (3)", - "tab": "Calibration", - "score": 0.056431419773363155 - }, - "QuAC - F1 (Robustness)": { - "description": "min=0.143, mean=0.159, max=0.17, sum=0.477 (3)", - "tab": "Robustness", - "score": 0.1590786964332521 - }, - "QuAC - F1 (Fairness)": { - "description": "min=0.26, mean=0.267, max=0.281, sum=0.801 (3)", - "tab": "Fairness", - "score": 
0.26693937921563893 - }, - "QuAC - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "QuAC - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "QuAC - # train": { - "description": "min=0.845, mean=0.944, max=1.084, sum=2.831 (3)", - "tab": "General information", - "score": 0.9436666666666667 - }, - "QuAC - truncated": { - "description": "min=0.016, mean=0.016, max=0.016, sum=0.048 (3)", - "tab": "General information", - "score": 0.016 - }, - "QuAC - # prompt tokens": { - "description": "min=1624.371, mean=1644.436, max=1670.589, sum=4933.308 (3)", - "tab": "General information", - "score": 1644.436 - }, - "QuAC - # output tokens": { - "description": "min=17.622, mean=19.574, max=21.058, sum=58.723 (3)", - "tab": "General information", - "score": 19.574333333333332 - }, - "QuAC - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "QuAC - Stereotypes (race)": { - "description": "min=0.598, mean=0.618, max=0.639, sum=1.855 (3)", - "tab": "Bias", - "score": 0.6181852538995397 - }, - "QuAC - Stereotypes (gender)": { - "description": "min=0.451, mean=0.472, max=0.486, sum=1.416 (3)", - "tab": "Bias", - "score": 0.47198334521620583 - }, - "QuAC - Representation (race)": { - "description": "min=0.32, mean=0.351, max=0.412, sum=1.054 (3)", - "tab": "Bias", - "score": 0.35120217651448443 - }, - "QuAC - Representation (gender)": { - "description": "min=0.213, mean=0.232, max=0.259, sum=0.695 (3)", - "tab": "Bias", - "score": 0.23164076323994623 - }, - "QuAC - Toxic fraction": { - "description": "min=0.001, mean=0.001, max=0.002, sum=0.004 (3)", - "tab": "Toxicity", - "score": 0.0013333333333333333 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "HellaSwag", - "source_data": { - "dataset_name": "HellaSwag", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on HellaSwag", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.704, - "details": { - "description": "min=0.704, mean=0.704, max=0.704, sum=0.704 (1)", - "tab": "Accuracy", - "HellaSwag - ECE (10-bin)": { - "description": "min=0.268, mean=0.268, max=0.268, sum=0.268 (1)", - "tab": "Calibration", - "score": 0.2676753668258396 - }, - "HellaSwag - EM (Robustness)": { - "description": "min=0.656, mean=0.656, max=0.656, sum=0.656 (1)", - "tab": "Robustness", - "score": 0.656 - }, - "HellaSwag - EM (Fairness)": { - "description": "min=0.53, mean=0.53, max=0.53, sum=0.53 (1)", - "tab": "Fairness", - "score": 0.53 - }, - "HellaSwag - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "HellaSwag - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "HellaSwag - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag - # prompt tokens": { - "description": 
"min=87.888, mean=87.888, max=87.888, sum=87.888 (1)", - "tab": "General information", - "score": 87.888 - }, - "HellaSwag - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.478, - "details": { - "description": "min=0.478, mean=0.478, max=0.478, sum=0.478 (1)", - "tab": "Accuracy", - "OpenbookQA - ECE (10-bin)": { - "description": "min=0.282, mean=0.282, max=0.282, sum=0.282 (1)", - "tab": "Calibration", - "score": 0.28175565698884514 - }, - "OpenbookQA - EM (Robustness)": { - "description": "min=0.408, mean=0.408, max=0.408, sum=0.408 (1)", - "tab": "Robustness", - "score": 0.408 - }, - "OpenbookQA - EM (Fairness)": { - "description": "min=0.412, mean=0.412, max=0.412, sum=0.412 (1)", - "tab": "Fairness", - "score": 0.412 - }, - "OpenbookQA - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=5.27, mean=5.27, max=5.27, sum=5.27 (1)", - "tab": "General information", - "score": 5.27 - }, - "OpenbookQA - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "TruthfulQA", - "source_data": { - "dataset_name": "TruthfulQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on TruthfulQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.167, - "details": { - "description": "min=0.156, mean=0.167, max=0.173, sum=0.5 (3)", - "tab": "Accuracy", - "TruthfulQA - ECE (10-bin)": { - "description": "min=0.101, mean=0.117, max=0.128, sum=0.35 (3)", - "tab": "Calibration", - "score": 0.11656099093897697 - }, - "TruthfulQA - EM (Robustness)": { - "description": "min=0.128, mean=0.136, max=0.148, sum=0.408 (3)", - "tab": "Robustness", - "score": 0.13608562691131498 - }, - "TruthfulQA - EM (Fairness)": { - "description": "min=0.133, mean=0.144, max=0.162, sum=0.431 (3)", - "tab": "Fairness", - "score": 0.1437308868501529 
- }, - "TruthfulQA - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "TruthfulQA - # eval": { - "description": "min=654, mean=654, max=654, sum=1962 (3)", - "tab": "General information", - "score": 654.0 - }, - "TruthfulQA - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "TruthfulQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "TruthfulQA - # prompt tokens": { - "description": "min=501.121, mean=511.121, max=529.121, sum=1533.362 (3)", - "tab": "General information", - "score": 511.12079510703364 - }, - "TruthfulQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=3 (3)", - "tab": "General information", - "score": 1.0 - }, - "TruthfulQA - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MS MARCO (TREC)", - "source_data": { - "dataset_name": "MS MARCO (TREC)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "NDCG@10 on MS MARCO (TREC)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.332, - "details": { - "description": "min=0.273, mean=0.332, max=0.382, sum=0.997 (3)", - "tab": "Accuracy", - "MS MARCO (regular) - RR@10 (Robustness)": { - "description": "min=0.074, mean=0.105, max=0.125, sum=0.315 (3)", - "tab": "Robustness", - "score": 0.1048433862433863 - }, - "MS MARCO (TREC) - NDCG@10 (Robustness)": { - "description": "min=0.227, mean=0.278, max=0.312, sum=0.835 (3)", - "tab": "Robustness", - "score": 0.2783978738136928 - }, - "MS MARCO (regular) - RR@10 (Fairness)": { - "description": "min=0.109, mean=0.14, max=0.166, sum=0.419 (3)", - "tab": "Fairness", - "score": 0.13970383597883587 - }, - "MS MARCO (TREC) - NDCG@10 (Fairness)": { - "description": "min=0.256, mean=0.317, max=0.363, sum=0.95 (3)", - "tab": "Fairness", - "score": 0.31652617829212154 - }, - "MS MARCO (regular) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "MS MARCO (TREC) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "MS MARCO (regular) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "MS MARCO (regular) - # train": { - "description": "min=2, mean=2, max=2, sum=6 (3)", - "tab": "General information", - "score": 2.0 - }, - "MS MARCO (regular) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "MS MARCO (regular) - # prompt tokens": { - "description": "min=495.232, mean=532.565, max=577.232, sum=1597.696 (3)", - "tab": "General information", - "score": 532.5653333333333 - }, - "MS MARCO (regular) - # output tokens": { - "description": "min=1.028, mean=1.067, max=1.136, sum=3.2 (3)", - "tab": "General information", - "score": 1.0666666666666667 - }, - "MS MARCO (regular) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 
(3)", - "tab": "General information", - "score": 3.0 - }, - "MS MARCO (TREC) - # eval": { - "description": "min=43, mean=43, max=43, sum=129 (3)", - "tab": "General information", - "score": 43.0 - }, - "MS MARCO (TREC) - # train": { - "description": "min=2, mean=2, max=2, sum=6 (3)", - "tab": "General information", - "score": 2.0 - }, - "MS MARCO (TREC) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "MS MARCO (TREC) - # prompt tokens": { - "description": "min=478.488, mean=515.822, max=560.488, sum=1547.465 (3)", - "tab": "General information", - "score": 515.8217054263565 - }, - "MS MARCO (TREC) - # output tokens": { - "description": "min=1.047, mean=1.047, max=1.047, sum=3.14 (3)", - "tab": "General information", - "score": 1.0465116279069768 - }, - "MS MARCO (TREC) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "MS MARCO (regular) - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - }, - "MS MARCO (TREC) - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CNN/DailyMail", - "source_data": { - "dataset_name": "CNN/DailyMail", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on CNN/DailyMail", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.146, - "details": { - "description": "min=0.139, mean=0.146, max=0.157, sum=0.877 (6)", - "tab": "Accuracy", - "CNN/DailyMail - Denoised inference time (s)": { - "description": "2 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "CNN/DailyMail - # eval": { - "description": "min=466, mean=466, max=466, sum=2796 (6)", - "tab": "General information", - "score": 466.0 - }, - "CNN/DailyMail - # train": { - "description": "min=5, mean=5, max=5, sum=30 (6)", - "tab": "General information", - "score": 5.0 - }, - "CNN/DailyMail - truncated": { - 
"description": "min=0, mean=0, max=0, sum=0 (6)", - "tab": "General information", - "score": 0.0 - }, - "CNN/DailyMail - # prompt tokens": { - "description": "min=1531.586, mean=1549.919, max=1567.586, sum=9299.515 (6)", - "tab": "General information", - "score": 1549.9191702432045 - }, - "CNN/DailyMail - # output tokens": { - "description": "min=70.732, mean=83.556, max=100.29, sum=501.335 (6)", - "tab": "General information", - "score": 83.55579399141631 - }, - "CNN/DailyMail - # trials": { - "description": "min=3, mean=3, max=3, sum=18 (6)", - "tab": "General information", - "score": 3.0 - }, - "CNN/DailyMail - Stereotypes (race)": { - "description": "min=0.605, mean=0.616, max=0.623, sum=3.698 (6)", - "tab": "Bias", - "score": 0.6163696620441931 - }, - "CNN/DailyMail - Stereotypes (gender)": { - "description": "min=0.387, mean=0.404, max=0.42, sum=2.422 (6)", - "tab": "Bias", - "score": 0.4036032258152607 - }, - "CNN/DailyMail - Representation (race)": { - "description": "min=0.306, mean=0.326, max=0.352, sum=1.955 (6)", - "tab": "Bias", - "score": 0.32584352768289004 - }, - "CNN/DailyMail - Representation (gender)": { - "description": "min=0.125, mean=0.146, max=0.173, sum=0.878 (6)", - "tab": "Bias", - "score": 0.1463963556163381 - }, - "CNN/DailyMail - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (6)", - "tab": "Toxicity", - "score": 0.0 - }, - "CNN/DailyMail - SummaC": { - "description": "min=0.487, mean=0.493, max=0.501, sum=1.48 (3)", - "tab": "Summarization metrics", - "score": 0.4933195613927493 - }, - "CNN/DailyMail - QAFactEval": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - BERTScore (F1)": { - "description": "min=0.278, mean=0.282, max=0.284, sum=0.845 (3)", - "tab": "Summarization metrics", - "score": 0.2815425075266347 - }, - "CNN/DailyMail - Coverage": { - "description": "min=0.973, mean=0.976, max=0.981, sum=5.857 (6)", - "tab": "Summarization metrics", - "score": 0.9761546866038108 - }, - "CNN/DailyMail - Density": { - "description": "min=38.053, mean=48.951, max=68.464, sum=293.707 (6)", - "tab": "Summarization metrics", - "score": 48.951173188846475 - }, - "CNN/DailyMail - Compression": { - "description": "min=7.327, mean=9.598, max=11.919, sum=57.585 (6)", - "tab": "Summarization metrics", - "score": 9.59754128304669 - }, - "CNN/DailyMail - HumanEval-faithfulness": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-relevance": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-coherence": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "XSUM", - "source_data": { - "dataset_name": "XSUM", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on XSUM", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.11, - "details": { - "description": "min=0.107, mean=0.11, max=0.113, sum=0.661 (6)", - "tab": "Accuracy", - "XSUM - Denoised inference time (s)": { - "description": "2 
matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "XSUM - # eval": { - "description": "min=518, mean=518, max=518, sum=3108 (6)", - "tab": "General information", - "score": 518.0 - }, - "XSUM - # train": { - "description": "min=4.998, mean=4.999, max=5, sum=29.992 (6)", - "tab": "General information", - "score": 4.998712998712999 - }, - "XSUM - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (6)", - "tab": "General information", - "score": 0.0 - }, - "XSUM - # prompt tokens": { - "description": "min=1456.402, mean=1510.418, max=1538.921, sum=9062.51 (6)", - "tab": "General information", - "score": 1510.4182754182755 - }, - "XSUM - # output tokens": { - "description": "min=23.276, mean=23.579, max=24.127, sum=141.471 (6)", - "tab": "General information", - "score": 23.578507078507084 - }, - "XSUM - # trials": { - "description": "min=3, mean=3, max=3, sum=18 (6)", - "tab": "General information", - "score": 3.0 - }, - "XSUM - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=4 (6)", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "XSUM - Stereotypes (gender)": { - "description": "min=0.451, mean=0.462, max=0.473, sum=2.775 (6)", - "tab": "Bias", - "score": 0.46245791245791246 - }, - "XSUM - Representation (race)": { - "description": "min=0.373, mean=0.489, max=0.579, sum=2.933 (6)", - "tab": "Bias", - "score": 0.4888826343934703 - }, - "XSUM - Representation (gender)": { - "description": "min=0.136, mean=0.182, max=0.23, sum=1.089 (6)", - "tab": "Bias", - "score": 0.18150391082886233 - }, - "XSUM - Toxic fraction": { - "description": "min=0, mean=0.002, max=0.004, sum=0.012 (6)", - "tab": "Toxicity", - "score": 0.0019305019305019308 - }, - "XSUM - SummaC": { - "description": "min=-0.217, mean=-0.203, max=-0.192, sum=-0.61 (3)", - "tab": "Summarization metrics", - "score": -0.20340532606019324 - }, - "XSUM - QAFactEval": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - BERTScore (F1)": { - "description": "min=0.38, mean=0.385, max=0.394, sum=1.156 (3)", - "tab": "Summarization metrics", - "score": 0.3853545238949662 - }, - "XSUM - Coverage": { - "description": "min=0.786, mean=0.793, max=0.801, sum=4.757 (6)", - "tab": "Summarization metrics", - "score": 0.792833262373014 - }, - "XSUM - Density": { - "description": "min=3.215, mean=3.286, max=3.34, sum=19.716 (6)", - "tab": "Summarization metrics", - "score": 3.2859287054515427 - }, - "XSUM - Compression": { - "description": "min=17.984, mean=18.428, max=18.968, sum=110.571 (6)", - "tab": "Summarization metrics", - "score": 18.428451341381788 - }, - "XSUM - HumanEval-faithfulness": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-relevance": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-coherence": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "IMDB", - "source_data": { - "dataset_name": "IMDB", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on IMDB", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.927, - "details": { - "description": "min=0.923, mean=0.927, max=0.934, sum=2.782 (3)", - "tab": "Accuracy", - "IMDB - ECE (10-bin)": { - "description": "min=0.093, mean=0.118, max=0.136, sum=0.355 (3)", - "tab": "Calibration", - "score": 0.11832833491942714 - }, - "IMDB - EM (Robustness)": { - "description": "min=0.883, mean=0.896, max=0.909, sum=2.687 (3)", - "tab": "Robustness", - "score": 0.8956666666666667 - }, - "IMDB - EM (Fairness)": { - "description": "min=0.904, mean=0.912, max=0.922, sum=2.737 (3)", - "tab": "Fairness", - "score": 0.9123333333333333 - }, - "IMDB - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "IMDB - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "IMDB - # train": { - "description": "min=4.845, mean=4.932, max=4.985, sum=14.796 (3)", - "tab": "General information", - "score": 4.9319999999999995 - }, - "IMDB - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "IMDB - # prompt tokens": { - "description": "min=1152.524, mean=1389.183, max=1743.988, sum=4167.55 (3)", - "tab": "General information", - "score": 1389.1833333333332 - }, - "IMDB - # output tokens": { - "description": "min=1, mean=1, max=1, sum=3 (3)", - "tab": "General information", - "score": 1.0 - }, - "IMDB - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "IMDB - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CivilComments", - "source_data": { - "dataset_name": "CivilComments", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on CivilComments", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.532, - "details": { - "description": "min=0.053, mean=0.532, max=0.955, sum=28.701 (54)", - "tab": "Accuracy", - "CivilComments - ECE (10-bin)": { - "description": "min=0.052, mean=0.248, max=0.54, sum=13.38 (54)", - "tab": "Calibration", - "score": 0.24778001352805415 - }, - "CivilComments - EM (Robustness)": { - "description": "min=0.022, mean=0.336, max=0.831, sum=18.169 (54)", - "tab": "Robustness", - "score": 0.336456419012055 - }, - "CivilComments - EM (Fairness)": { - "description": "min=0.042, mean=0.473, max=0.947, sum=25.533 (54)", - "tab": "Fairness", - "score": 0.4728366689674401 - }, - "CivilComments - Denoised inference time (s)": { - 
"description": "9 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "CivilComments - # eval": { - "description": "min=74, mean=371.556, max=683, sum=20064 (54)", - "tab": "General information", - "score": 371.55555555555554 - }, - "CivilComments - # train": { - "description": "min=5, mean=5, max=5, sum=270 (54)", - "tab": "General information", - "score": 5.0 - }, - "CivilComments - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (54)", - "tab": "General information", - "score": 0.0 - }, - "CivilComments - # prompt tokens": { - "description": "min=356.537, mean=722.635, max=1267.519, sum=39022.317 (54)", - "tab": "General information", - "score": 722.6354931173206 - }, - "CivilComments - # output tokens": { - "description": "min=1, mean=1, max=1, sum=54 (54)", - "tab": "General information", - "score": 1.0 - }, - "CivilComments - # trials": { - "description": "min=3, mean=3, max=3, sum=162 (54)", - "tab": "General information", - "score": 3.0 - }, - "CivilComments - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (54)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "RAFT", - "source_data": { - "dataset_name": "RAFT", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on RAFT", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.525, - "details": { - "description": "min=0.025, mean=0.525, max=0.975, sum=17.325 (33)", - "tab": "Accuracy", - "RAFT - ECE (10-bin)": { - "description": "min=0.103, mean=0.314, max=0.912, sum=10.346 (33)", - "tab": "Calibration", - "score": 0.31351556505949635 - }, - "RAFT - EM (Robustness)": { - "description": "min=0, mean=0.445, max=0.95, sum=14.675 (33)", - "tab": "Robustness", - "score": 0.4446969696969697 - }, - "RAFT - EM (Fairness)": { - "description": "min=0.025, mean=0.502, max=0.975, sum=16.55 (33)", - "tab": "Fairness", - "score": 0.5015151515151516 - }, - "RAFT - Denoised inference time (s)": { - "description": "11 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "RAFT - # eval": { - "description": "min=40, mean=40, max=40, sum=1320 (33)", - "tab": "General information", - "score": 40.0 - }, - "RAFT - # train": { - "description": "min=0, mean=4.556, max=5, sum=150.35 (33)", - "tab": "General information", - "score": 4.556060606060607 - }, - "RAFT - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (33)", - "tab": "General information", - "score": 0.0 - }, - "RAFT - # prompt tokens": { - "description": "min=257.35, mean=812.938, max=1773.675, sum=26826.95 (33)", - "tab": "General information", - "score": 812.937878787879 - }, - "RAFT - # output tokens": { - "description": "min=0.15, mean=2.76, max=6.175, sum=91.075 (33)", - "tab": "General information", - "score": 2.7598484848484848 - }, - "RAFT - # 
trials": { - "description": "min=3, mean=3, max=3, sum=99 (33)", - "tab": "General information", - "score": 3.0 - }, - "RAFT - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (33)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_classic/mistralai/Mistral-v0.1-7B/5f5bde4c-aa06-41f2-abaf-67901f62a8a1.json b/data/helm_classic/mistralai/Mistral-v0.1-7B/5f5bde4c-aa06-41f2-abaf-67901f62a8a1.json deleted file mode 100644 index 1fd56a99f..000000000 --- a/data/helm_classic/mistralai/Mistral-v0.1-7B/5f5bde4c-aa06-41f2-abaf-67901f62a8a1.json +++ /dev/null @@ -1,1613 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_classic/mistralai_Mistral-v0.1-7B/1770834891.1472661", - "retrieved_timestamp": "1770834891.1472661", - "source_metadata": { - "source_name": "helm_classic", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral v0.1 7B", - "id": "mistralai/Mistral-v0.1-7B", - "developer": "mistralai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_classic", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperform on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.884, - "details": { - "tab": "Accuracy", - "Mean win rate - Calibration": { - "description": null, - "tab": "Calibration", - "score": null - }, - "Mean win rate - Robustness": { - "description": null, - "tab": "Robustness", - "score": 0.8963869463869464 - }, - "Mean win rate - Fairness": { - "description": null, - "tab": "Fairness", - "score": 0.8611188811188811 - }, - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": null - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - }, - "Mean win rate - Bias": { - "description": null, - "tab": "Bias", - "score": 0.5247457047269077 - }, - "Mean win rate - Toxicity": { - "description": null, - "tab": "Toxicity", - "score": 0.4297202797202797 - }, - "Mean win rate - Summarization metrics": { - "description": null, - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.572, - 
"details": { - "description": "min=0.28, mean=0.572, max=0.84, sum=2.861 (5)", - "tab": "Accuracy", - "MMLU - ECE (10-bin)": { - "description": "5 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "MMLU - EM (Robustness)": { - "description": "min=0.24, mean=0.533, max=0.82, sum=2.666 (5)", - "tab": "Robustness", - "score": 0.5332280701754385 - }, - "MMLU - EM (Fairness)": { - "description": "min=0.27, mean=0.542, max=0.83, sum=2.709 (5)", - "tab": "Fairness", - "score": 0.541719298245614 - }, - "MMLU - Denoised inference time (s)": { - "description": "5 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "MMLU - # trials": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "BoolQ", - "source_data": { - "dataset_name": "BoolQ", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on BoolQ", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.874, - "details": { - "description": "min=0.874, mean=0.874, max=0.874, sum=0.874 (1)", - "tab": "Accuracy", - "BoolQ - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "BoolQ - EM (Robustness)": { - "description": "min=0.837, mean=0.837, max=0.837, sum=0.837 (1)", - "tab": "Robustness", - "score": 0.837 - }, - "BoolQ - EM (Fairness)": { - "description": "min=0.842, mean=0.842, max=0.842, sum=0.842 (1)", - "tab": "Fairness", - "score": 0.842 - }, - "BoolQ - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "BoolQ - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "BoolQ - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "BoolQ - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "BoolQ - # prompt tokens": { - "description": "min=1418.259, mean=1418.259, max=1418.259, sum=1418.259 (1)", - "tab": "General information", - "score": 1418.259 - }, - "BoolQ - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "BoolQ - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "BoolQ - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - 
"score": null - }, - "BoolQ - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.716, - "details": { - "description": "min=0.716, mean=0.716, max=0.716, sum=0.716 (1)", - "tab": "Accuracy", - "NarrativeQA - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "NarrativeQA - F1 (Robustness)": { - "description": "min=0.649, mean=0.649, max=0.649, sum=0.649 (1)", - "tab": "Robustness", - "score": 0.6485445694648198 - }, - "NarrativeQA - F1 (Fairness)": { - "description": "min=0.644, mean=0.644, max=0.644, sum=0.644 (1)", - "tab": "Fairness", - "score": 0.6436697691254157 - }, - "NarrativeQA - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=4.575, mean=4.575, max=4.575, sum=4.575 (1)", - "tab": "General information", - "score": 4.574647887323944 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=3627.715, mean=3627.715, max=3627.715, sum=3627.715 (1)", - "tab": "General information", - "score": 3627.7154929577464 - }, - "NarrativeQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NarrativeQA - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NarrativeQA - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NarrativeQA - Stereotypes (gender)": { - "description": "min=0.5, mean=0.5, max=0.5, sum=0.5 (1)", - "tab": "Bias", - "score": 0.5 - }, - "NarrativeQA - Representation (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=0.667 (1)", - "tab": "Bias", - "score": 0.6666666666666667 - }, - "NarrativeQA - Representation (gender)": { - "description": "min=0.173, mean=0.173, max=0.173, sum=0.173 (1)", - "tab": "Bias", - "score": 0.1730769230769231 - }, - "NarrativeQA - Toxic fraction": { - "description": "min=0.008, mean=0.008, max=0.008, sum=0.008 (1)", - "tab": "Toxicity", - "score": 0.008450704225352112 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (open-book)", - "source_data": { - "dataset_name": "NaturalQuestions (open-book)", - "source_type": "url", - 
"url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (open-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.687, - "details": { - "description": "min=0.687, mean=0.687, max=0.687, sum=0.687 (1)", - "tab": "Accuracy", - "NaturalQuestions (closed-book) - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "NaturalQuestions (open-book) - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "NaturalQuestions (closed-book) - F1 (Robustness)": { - "description": "min=0.305, mean=0.305, max=0.305, sum=0.305 (1)", - "tab": "Robustness", - "score": 0.3052498746141498 - }, - "NaturalQuestions (open-book) - F1 (Robustness)": { - "description": "min=0.631, mean=0.631, max=0.631, sum=0.631 (1)", - "tab": "Robustness", - "score": 0.6314234953832969 - }, - "NaturalQuestions (closed-book) - F1 (Fairness)": { - "description": "min=0.3, mean=0.3, max=0.3, sum=0.3 (1)", - "tab": "Fairness", - "score": 0.30018094571517623 - }, - "NaturalQuestions (open-book) - F1 (Fairness)": { - "description": "min=0.625, mean=0.625, max=0.625, sum=0.625 (1)", - "tab": "Fairness", - "score": 0.6249254915559919 - }, - "NaturalQuestions (closed-book) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NaturalQuestions (open-book) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NaturalQuestions (closed-book) - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.832, mean=4.832, max=4.832, sum=4.832 (1)", - "tab": "General information", - "score": 4.832 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.026, mean=0.026, max=0.026, sum=0.026 (1)", - "tab": "General information", - "score": 0.026 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=2268.728, mean=2268.728, max=2268.728, sum=2268.728 (1)", - "tab": "General information", - "score": 2268.728 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=0.987, 
mean=0.987, max=0.987, sum=0.987 (1)", - "tab": "General information", - "score": 0.987 - }, - "NaturalQuestions (open-book) - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NaturalQuestions (closed-book) - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=0.667 (1)", - "tab": "Bias", - "score": 0.6666666666666667 - }, - "NaturalQuestions (closed-book) - Stereotypes (gender)": { - "description": "min=0.25, mean=0.25, max=0.25, sum=0.25 (1)", - "tab": "Bias", - "score": 0.25 - }, - "NaturalQuestions (closed-book) - Representation (race)": { - "description": "min=0.287, mean=0.287, max=0.287, sum=0.287 (1)", - "tab": "Bias", - "score": 0.28746177370030584 - }, - "NaturalQuestions (closed-book) - Representation (gender)": { - "description": "min=0.065, mean=0.065, max=0.065, sum=0.065 (1)", - "tab": "Bias", - "score": 0.06521739130434784 - }, - "NaturalQuestions (open-book) - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=0.667 (1)", - "tab": "Bias", - "score": 0.6666666666666667 - }, - "NaturalQuestions (open-book) - Stereotypes (gender)": { - "description": "min=0.5, mean=0.5, max=0.5, sum=0.5 (1)", - "tab": "Bias", - "score": 0.5 - }, - "NaturalQuestions (open-book) - Representation (race)": { - "description": "min=0.439, mean=0.439, max=0.439, sum=0.439 (1)", - "tab": "Bias", - "score": 0.4385964912280702 - }, - "NaturalQuestions (open-book) - Representation (gender)": { - "description": "min=0.48, mean=0.48, max=0.48, sum=0.48 (1)", - "tab": "Bias", - "score": 0.48000000000000004 - }, - "NaturalQuestions (closed-book) - Toxic fraction": { - "description": "min=0.001, mean=0.001, max=0.001, sum=0.001 (1)", - "tab": "Toxicity", - "score": 0.001 - }, - "NaturalQuestions (open-book) - Toxic fraction": { - "description": "min=0.001, mean=0.001, max=0.001, sum=0.001 (1)", - "tab": "Toxicity", - "score": 0.001 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "QuAC", - "source_data": { - "dataset_name": "QuAC", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on QuAC", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.423, - "details": { - "description": "min=0.423, mean=0.423, max=0.423, sum=0.423 (1)", - "tab": "Accuracy", - "QuAC - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "QuAC - F1 (Robustness)": { - "description": "min=0.31, mean=0.31, max=0.31, sum=0.31 (1)", - "tab": "Robustness", - "score": 0.3098633908730089 - }, - "QuAC - F1 (Fairness)": { - "description": "min=0.353, mean=0.353, max=0.353, sum=0.353 (1)", - "tab": "Fairness", - "score": 0.3528008659962099 - }, - "QuAC - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "QuAC - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "QuAC - # train": { - "description": "min=3.44, mean=3.44, max=3.44, sum=3.44 (1)", - "tab": "General information", - "score": 3.44 - }, - "QuAC - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General 
information", - "score": 0.0 - }, - "QuAC - # prompt tokens": { - "description": "min=3680.143, mean=3680.143, max=3680.143, sum=3680.143 (1)", - "tab": "General information", - "score": 3680.143 - }, - "QuAC - # output tokens": { - "description": "min=0.999, mean=0.999, max=0.999, sum=0.999 (1)", - "tab": "General information", - "score": 0.999 - }, - "QuAC - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "QuAC - Stereotypes (race)": { - "description": "min=0.621, mean=0.621, max=0.621, sum=0.621 (1)", - "tab": "Bias", - "score": 0.6213450292397661 - }, - "QuAC - Stereotypes (gender)": { - "description": "min=0.412, mean=0.412, max=0.412, sum=0.412 (1)", - "tab": "Bias", - "score": 0.4119047619047619 - }, - "QuAC - Representation (race)": { - "description": "min=0.274, mean=0.274, max=0.274, sum=0.274 (1)", - "tab": "Bias", - "score": 0.27356321839080466 - }, - "QuAC - Representation (gender)": { - "description": "min=0.248, mean=0.248, max=0.248, sum=0.248 (1)", - "tab": "Bias", - "score": 0.2479564032697547 - }, - "QuAC - Toxic fraction": { - "description": "min=0.003, mean=0.003, max=0.003, sum=0.003 (1)", - "tab": "Toxicity", - "score": 0.003 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "HellaSwag", - "source_data": { - "dataset_name": "HellaSwag", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on HellaSwag", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "HellaSwag - ECE (10-bin)": { - "description": "No matching runs", - "tab": "Calibration", - "score": null - }, - "HellaSwag - EM (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "HellaSwag - EM (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "HellaSwag - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "HellaSwag - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - 
"details": { - "description": "No matching runs", - "tab": "Accuracy", - "OpenbookQA - ECE (10-bin)": { - "description": "No matching runs", - "tab": "Calibration", - "score": null - }, - "OpenbookQA - EM (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "OpenbookQA - EM (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "OpenbookQA - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "OpenbookQA - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "TruthfulQA", - "source_data": { - "dataset_name": "TruthfulQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on TruthfulQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.422, - "details": { - "description": "min=0.422, mean=0.422, max=0.422, sum=0.422 (1)", - "tab": "Accuracy", - "TruthfulQA - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "TruthfulQA - EM (Robustness)": { - "description": "min=0.339, mean=0.339, max=0.339, sum=0.339 (1)", - "tab": "Robustness", - "score": 0.3394495412844037 - }, - "TruthfulQA - EM (Fairness)": { - "description": "min=0.332, mean=0.332, max=0.332, sum=0.332 (1)", - "tab": "Fairness", - "score": 0.3318042813455658 - }, - "TruthfulQA - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "TruthfulQA - # eval": { - "description": "min=654, mean=654, max=654, sum=654 (1)", - "tab": "General information", - "score": 654.0 - }, - "TruthfulQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "TruthfulQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "TruthfulQA - # prompt tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "TruthfulQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "TruthfulQA - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MS MARCO (TREC)", - "source_data": { - "dataset_name": "MS MARCO (TREC)", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "NDCG@10 on MS MARCO (TREC)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "MS MARCO (regular) - RR@10 (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "MS MARCO (TREC) - NDCG@10 (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "MS MARCO (regular) - RR@10 (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "MS MARCO (TREC) - NDCG@10 (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "MS MARCO (regular) - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "MS MARCO (TREC) - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "MS MARCO (regular) - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - 
"score": null - }, - "MS MARCO (TREC) - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - }, - "MS MARCO (TREC) - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CNN/DailyMail", - "source_data": { - "dataset_name": "CNN/DailyMail", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on CNN/DailyMail", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "CNN/DailyMail - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "CNN/DailyMail - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - }, - "CNN/DailyMail - SummaC": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - QAFactEval": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - BERTScore (F1)": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - Coverage": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - Density": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - Compression": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-faithfulness": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-relevance": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-coherence": { - "description": "No 
matching runs", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "XSUM", - "source_data": { - "dataset_name": "XSUM", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on XSUM", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "XSUM - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "XSUM - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - }, - "XSUM - SummaC": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - QAFactEval": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - BERTScore (F1)": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - Coverage": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - Density": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - Compression": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-faithfulness": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-relevance": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-coherence": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "IMDB", - "source_data": { - "dataset_name": "IMDB", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on IMDB", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.962, - "details": { - "description": "min=0.962, mean=0.962, max=0.962, sum=0.962 (1)", - "tab": "Accuracy", - "IMDB - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "IMDB - EM (Robustness)": { - "description": "min=0.954, mean=0.954, max=0.954, sum=0.954 (1)", - "tab": "Robustness", - "score": 0.954 - }, - "IMDB - EM (Fairness)": { - "description": "min=0.952, mean=0.952, max=0.952, sum=0.952 (1)", - "tab": "Fairness", - "score": 0.952 - }, - "IMDB - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "IMDB - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "IMDB - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "IMDB - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IMDB - # prompt tokens": { - "description": "min=2811.31, mean=2811.31, max=2811.31, sum=2811.31 (1)", - "tab": "General information", - "score": 2811.31 - }, - "IMDB - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "IMDB - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "IMDB - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CivilComments", - "source_data": { - "dataset_name": "CivilComments", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on CivilComments", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.624, - "details": { - "description": "min=0.219, mean=0.624, max=0.874, sum=11.24 (18)", - "tab": "Accuracy", - "CivilComments - ECE (10-bin)": { - "description": "9 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "CivilComments - EM (Robustness)": { - "description": "min=0.123, mean=0.521, max=0.842, sum=9.37 (18)", - "tab": "Robustness", - "score": 0.5205335787071343 - }, - "CivilComments - EM (Fairness)": { - "description": "min=0.06, mean=0.52, max=0.863, sum=9.357 (18)", - "tab": "Fairness", - "score": 0.5198588163222009 - }, - "CivilComments - Denoised inference time (s)": { - "description": "9 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "CivilComments - # eval": { - "description": "min=74, mean=371.556, max=683, 
sum=6688 (18)", - "tab": "General information", - "score": 371.55555555555554 - }, - "CivilComments - # train": { - "description": "min=5, mean=5, max=5, sum=90 (18)", - "tab": "General information", - "score": 5.0 - }, - "CivilComments - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (18)", - "tab": "General information", - "score": 0.0 - }, - "CivilComments - # prompt tokens": { - "description": "min=390.28, mean=831.904, max=1394.234, sum=14974.265 (18)", - "tab": "General information", - "score": 831.9036212109548 - }, - "CivilComments - # output tokens": { - "description": "min=1, mean=1, max=1, sum=18 (18)", - "tab": "General information", - "score": 1.0 - }, - "CivilComments - # trials": { - "description": "min=1, mean=1, max=1, sum=18 (18)", - "tab": "General information", - "score": 1.0 - }, - "CivilComments - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Toxic fraction": { - "description": "9 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "RAFT", - "source_data": { - "dataset_name": "RAFT", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on RAFT", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.707, - "details": { - "description": "min=0.1, mean=0.707, max=0.975, sum=7.775 (11)", - "tab": "Accuracy", - "RAFT - ECE (10-bin)": { - "description": "11 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "RAFT - EM (Robustness)": { - "description": "min=0.025, mean=0.652, max=0.975, sum=7.175 (11)", - "tab": "Robustness", - "score": 0.6522727272727272 - }, - "RAFT - EM (Fairness)": { - "description": "min=0.1, mean=0.664, max=0.975, sum=7.3 (11)", - "tab": "Fairness", - "score": 0.6636363636363636 - }, - "RAFT - Denoised inference time (s)": { - "description": "11 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "RAFT - # eval": { - "description": "min=40, mean=40, max=40, sum=440 (11)", - "tab": "General information", - "score": 40.0 - }, - "RAFT - # train": { - "description": "min=2.675, mean=4.789, max=5, sum=52.675 (11)", - "tab": "General information", - "score": 4.788636363636363 - }, - "RAFT - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (11)", - "tab": "General information", - "score": 0.0 - }, - "RAFT - # prompt tokens": { - "description": "min=0, mean=328.595, max=3614.55, sum=3614.55 (11)", - "tab": "General information", - "score": 328.5954545454546 - }, - "RAFT - # output tokens": { - "description": "min=1, mean=1, max=1, sum=11 (11)", - "tab": "General information", - "score": 1.0 - }, - "RAFT - # trials": { - "description": "min=1, mean=1, max=1, sum=11 (11)", - "tab": "General information", - "score": 1.0 - }, - "RAFT - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Stereotypes 
(gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Toxic fraction": { - "description": "11 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_classic/mosaicml/MPT-30B/32cc2aa3-be26-41bd-8124-a8b1073c84c4.json b/data/helm_classic/mosaicml/MPT-30B/32cc2aa3-be26-41bd-8124-a8b1073c84c4.json deleted file mode 100644 index b0d1817b0..000000000 --- a/data/helm_classic/mosaicml/MPT-30B/32cc2aa3-be26-41bd-8124-a8b1073c84c4.json +++ /dev/null @@ -1,1613 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_classic/mosaicml_MPT-30B/1770834891.1472661", - "retrieved_timestamp": "1770834891.1472661", - "source_metadata": { - "source_name": "helm_classic", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MPT 30B", - "id": "mosaicml/MPT-30B", - "developer": "mosaicml", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_classic", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperform on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.714, - "details": { - "tab": "Accuracy", - "Mean win rate - Calibration": { - "description": null, - "tab": "Calibration", - "score": null - }, - "Mean win rate - Robustness": { - "description": null, - "tab": "Robustness", - "score": 0.6966666666666667 - }, - "Mean win rate - Fairness": { - "description": null, - "tab": "Fairness", - "score": 0.7464102564102564 - }, - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": null - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - }, - "Mean win rate - Bias": { - "description": null, - "tab": "Bias", - "score": 0.2946998974900761 - }, - "Mean win rate - Toxicity": { - "description": null, - "tab": "Toxicity", - "score": 0.44918414918414923 - }, - "Mean win rate - Summarization metrics": { - "description": null, - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.437, - "details": { - "description": "min=0.25, mean=0.437, max=0.68, sum=2.183 (5)", - "tab": "Accuracy", - "MMLU - ECE (10-bin)": { - "description": "5 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "MMLU - EM (Robustness)": { - "description": 
"min=0.25, mean=0.381, max=0.6, sum=1.904 (5)", - "tab": "Robustness", - "score": 0.38087719298245615 - }, - "MMLU - EM (Fairness)": { - "description": "min=0.24, mean=0.41, max=0.64, sum=2.049 (5)", - "tab": "Fairness", - "score": 0.40989473684210526 - }, - "MMLU - Denoised inference time (s)": { - "description": "5 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=358.76, mean=467.936, max=612.798, sum=2339.678 (5)", - "tab": "General information", - "score": 467.935649122807 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "MMLU - # trials": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "BoolQ", - "source_data": { - "dataset_name": "BoolQ", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on BoolQ", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.704, - "details": { - "description": "min=0.704, mean=0.704, max=0.704, sum=0.704 (1)", - "tab": "Accuracy", - "BoolQ - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "BoolQ - EM (Robustness)": { - "description": "min=0.656, mean=0.656, max=0.656, sum=0.656 (1)", - "tab": "Robustness", - "score": 0.656 - }, - "BoolQ - EM (Fairness)": { - "description": "min=0.631, mean=0.631, max=0.631, sum=0.631 (1)", - "tab": "Fairness", - "score": 0.631 - }, - "BoolQ - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "BoolQ - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "BoolQ - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "BoolQ - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "BoolQ - # prompt tokens": { - "description": "min=1251.897, mean=1251.897, max=1251.897, sum=1251.897 (1)", - "tab": "General information", - "score": 1251.897 - }, - "BoolQ - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "BoolQ - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "BoolQ - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation 
(gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.732, - "details": { - "description": "min=0.732, mean=0.732, max=0.732, sum=0.732 (1)", - "tab": "Accuracy", - "NarrativeQA - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "NarrativeQA - F1 (Robustness)": { - "description": "min=0.584, mean=0.584, max=0.584, sum=0.584 (1)", - "tab": "Robustness", - "score": 0.5840358182644836 - }, - "NarrativeQA - F1 (Fairness)": { - "description": "min=0.653, mean=0.653, max=0.653, sum=0.653 (1)", - "tab": "Fairness", - "score": 0.6525810359656932 - }, - "NarrativeQA - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=1.969, mean=1.969, max=1.969, sum=1.969 (1)", - "tab": "General information", - "score": 1.9690140845070423 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=1691.082, mean=1691.082, max=1691.082, sum=1691.082 (1)", - "tab": "General information", - "score": 1691.081690140845 - }, - "NarrativeQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NarrativeQA - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NarrativeQA - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NarrativeQA - Stereotypes (gender)": { - "description": "min=0.5, mean=0.5, max=0.5, sum=0.5 (1)", - "tab": "Bias", - "score": 0.5 - }, - "NarrativeQA - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NarrativeQA - Representation (gender)": { - "description": "min=0.238, mean=0.238, max=0.238, sum=0.238 (1)", - "tab": "Bias", - "score": 0.2377049180327869 - }, - "NarrativeQA - Toxic fraction": { - "description": "min=0.02, mean=0.02, max=0.02, sum=0.02 (1)", - "tab": "Toxicity", - "score": 0.01971830985915493 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (open-book)", - "source_data": { - "dataset_name": "NaturalQuestions (open-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (open-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 
1.0 - }, - "score_details": { - "score": 0.673, - "details": { - "description": "min=0.673, mean=0.673, max=0.673, sum=0.673 (1)", - "tab": "Accuracy", - "NaturalQuestions (closed-book) - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "NaturalQuestions (open-book) - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "NaturalQuestions (closed-book) - F1 (Robustness)": { - "description": "min=0.272, mean=0.272, max=0.272, sum=0.272 (1)", - "tab": "Robustness", - "score": 0.2720121639433268 - }, - "NaturalQuestions (open-book) - F1 (Robustness)": { - "description": "min=0.609, mean=0.609, max=0.609, sum=0.609 (1)", - "tab": "Robustness", - "score": 0.6094875286076354 - }, - "NaturalQuestions (closed-book) - F1 (Fairness)": { - "description": "min=0.287, mean=0.287, max=0.287, sum=0.287 (1)", - "tab": "Fairness", - "score": 0.28717918481295357 - }, - "NaturalQuestions (open-book) - F1 (Fairness)": { - "description": "min=0.624, mean=0.624, max=0.624, sum=0.624 (1)", - "tab": "Fairness", - "score": 0.6239999868788104 - }, - "NaturalQuestions (closed-book) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NaturalQuestions (open-book) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=117.299, mean=117.299, max=117.299, sum=117.299 (1)", - "tab": "General information", - "score": 117.299 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=0.999, mean=0.999, max=0.999, sum=0.999 (1)", - "tab": "General information", - "score": 0.999 - }, - "NaturalQuestions (closed-book) - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.704, mean=4.704, max=4.704, sum=4.704 (1)", - "tab": "General information", - "score": 4.704 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.037, mean=0.037, max=0.037, sum=0.037 (1)", - "tab": "General information", - "score": 0.037 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1495.552, mean=1495.552, max=1495.552, sum=1495.552 (1)", - "tab": "General information", - "score": 1495.552 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=0.993, mean=0.993, max=0.993, sum=0.993 (1)", - "tab": "General information", - "score": 0.993 - }, - "NaturalQuestions (open-book) - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - 
"NaturalQuestions (closed-book) - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NaturalQuestions (closed-book) - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NaturalQuestions (closed-book) - Representation (race)": { - "description": "min=0.333, mean=0.333, max=0.333, sum=0.333 (1)", - "tab": "Bias", - "score": 0.3333333333333333 - }, - "NaturalQuestions (closed-book) - Representation (gender)": { - "description": "min=0.088, mean=0.088, max=0.088, sum=0.088 (1)", - "tab": "Bias", - "score": 0.08823529411764708 - }, - "NaturalQuestions (open-book) - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=0.667 (1)", - "tab": "Bias", - "score": 0.6666666666666667 - }, - "NaturalQuestions (open-book) - Stereotypes (gender)": { - "description": "min=0.5, mean=0.5, max=0.5, sum=0.5 (1)", - "tab": "Bias", - "score": 0.5 - }, - "NaturalQuestions (open-book) - Representation (race)": { - "description": "min=0.527, mean=0.527, max=0.527, sum=0.527 (1)", - "tab": "Bias", - "score": 0.5268817204301075 - }, - "NaturalQuestions (open-book) - Representation (gender)": { - "description": "min=0.18, mean=0.18, max=0.18, sum=0.18 (1)", - "tab": "Bias", - "score": 0.17999999999999997 - }, - "NaturalQuestions (closed-book) - Toxic fraction": { - "description": "min=0.001, mean=0.001, max=0.001, sum=0.001 (1)", - "tab": "Toxicity", - "score": 0.001 - }, - "NaturalQuestions (open-book) - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "QuAC", - "source_data": { - "dataset_name": "QuAC", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on QuAC", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.393, - "details": { - "description": "min=0.393, mean=0.393, max=0.393, sum=0.393 (1)", - "tab": "Accuracy", - "QuAC - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "QuAC - F1 (Robustness)": { - "description": "min=0.231, mean=0.231, max=0.231, sum=0.231 (1)", - "tab": "Robustness", - "score": 0.23071567735549398 - }, - "QuAC - F1 (Fairness)": { - "description": "min=0.318, mean=0.318, max=0.318, sum=0.318 (1)", - "tab": "Fairness", - "score": 0.3176438145195143 - }, - "QuAC - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "QuAC - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "QuAC - # train": { - "description": "min=0.883, mean=0.883, max=0.883, sum=0.883 (1)", - "tab": "General information", - "score": 0.883 - }, - "QuAC - truncated": { - "description": "min=0.021, mean=0.021, max=0.021, sum=0.021 (1)", - "tab": "General information", - "score": 0.021 - }, - "QuAC - # prompt tokens": { - "description": "min=1655.708, mean=1655.708, max=1655.708, sum=1655.708 (1)", - "tab": "General information", - "score": 1655.708 - }, - "QuAC - # output tokens": { - "description": "min=0.997, mean=0.997, max=0.997, sum=0.997 (1)", - "tab": "General information", - "score": 
0.997 - }, - "QuAC - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "QuAC - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=0.667 (1)", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "QuAC - Stereotypes (gender)": { - "description": "min=0.413, mean=0.413, max=0.413, sum=0.413 (1)", - "tab": "Bias", - "score": 0.4133540372670807 - }, - "QuAC - Representation (race)": { - "description": "min=0.443, mean=0.443, max=0.443, sum=0.443 (1)", - "tab": "Bias", - "score": 0.4433656957928802 - }, - "QuAC - Representation (gender)": { - "description": "min=0.279, mean=0.279, max=0.279, sum=0.279 (1)", - "tab": "Bias", - "score": 0.27914110429447847 - }, - "QuAC - Toxic fraction": { - "description": "min=0.001, mean=0.001, max=0.001, sum=0.001 (1)", - "tab": "Toxicity", - "score": 0.001 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "HellaSwag", - "source_data": { - "dataset_name": "HellaSwag", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on HellaSwag", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "HellaSwag - ECE (10-bin)": { - "description": "No matching runs", - "tab": "Calibration", - "score": null - }, - "HellaSwag - EM (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "HellaSwag - EM (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "HellaSwag - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "HellaSwag - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "OpenbookQA - ECE (10-bin)": { - "description": "No matching runs", - "tab": "Calibration", - "score": null - }, - "OpenbookQA - EM (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "OpenbookQA - EM (Fairness)": { - 
"description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "OpenbookQA - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "OpenbookQA - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "TruthfulQA", - "source_data": { - "dataset_name": "TruthfulQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on TruthfulQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.231, - "details": { - "description": "min=0.231, mean=0.231, max=0.231, sum=0.231 (1)", - "tab": "Accuracy", - "TruthfulQA - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "TruthfulQA - EM (Robustness)": { - "description": "min=0.177, mean=0.177, max=0.177, sum=0.177 (1)", - "tab": "Robustness", - "score": 0.17737003058103976 - }, - "TruthfulQA - EM (Fairness)": { - "description": "min=0.19, mean=0.19, max=0.19, sum=0.19 (1)", - "tab": "Fairness", - "score": 0.18960244648318042 - }, - "TruthfulQA - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "TruthfulQA - # eval": { - "description": "min=654, mean=654, max=654, sum=654 (1)", - "tab": "General information", - "score": 654.0 - }, - "TruthfulQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "TruthfulQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "TruthfulQA - # prompt tokens": { - "description": "min=505.352, mean=505.352, max=505.352, sum=505.352 (1)", - "tab": "General information", - "score": 505.35168195718654 - }, - "TruthfulQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "TruthfulQA - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MS MARCO (TREC)", - "source_data": { - "dataset_name": "MS MARCO (TREC)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "NDCG@10 on MS MARCO (TREC)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": 
{ - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "MS MARCO (regular) - RR@10 (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "MS MARCO (TREC) - NDCG@10 (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "MS MARCO (regular) - RR@10 (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "MS MARCO (TREC) - NDCG@10 (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "MS MARCO (regular) - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "MS MARCO (TREC) - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "MS MARCO (regular) - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - }, - "MS MARCO (TREC) - Toxic fraction": { - 
"description": "No matching runs", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CNN/DailyMail", - "source_data": { - "dataset_name": "CNN/DailyMail", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on CNN/DailyMail", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "CNN/DailyMail - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "CNN/DailyMail - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - }, - "CNN/DailyMail - SummaC": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - QAFactEval": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - BERTScore (F1)": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - Coverage": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - Density": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - Compression": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-faithfulness": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-relevance": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-coherence": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "XSUM", - "source_data": { - "dataset_name": "XSUM", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on XSUM", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "XSUM - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "XSUM - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - }, - "XSUM - SummaC": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - QAFactEval": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - BERTScore (F1)": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - Coverage": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - Density": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - Compression": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-faithfulness": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-relevance": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-coherence": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "IMDB", - "source_data": { - "dataset_name": "IMDB", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on IMDB", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.959, - "details": { - "description": "min=0.959, mean=0.959, max=0.959, sum=0.959 (1)", - "tab": "Accuracy", - "IMDB - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": 
"Calibration", - "score": null - }, - "IMDB - EM (Robustness)": { - "description": "min=0.942, mean=0.942, max=0.942, sum=0.942 (1)", - "tab": "Robustness", - "score": 0.942 - }, - "IMDB - EM (Fairness)": { - "description": "min=0.955, mean=0.955, max=0.955, sum=0.955 (1)", - "tab": "Fairness", - "score": 0.955 - }, - "IMDB - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "IMDB - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "IMDB - # train": { - "description": "min=2.911, mean=2.911, max=2.911, sum=2.911 (1)", - "tab": "General information", - "score": 2.911 - }, - "IMDB - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IMDB - # prompt tokens": { - "description": "min=1619.568, mean=1619.568, max=1619.568, sum=1619.568 (1)", - "tab": "General information", - "score": 1619.568 - }, - "IMDB - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "IMDB - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "IMDB - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CivilComments", - "source_data": { - "dataset_name": "CivilComments", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on CivilComments", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.599, - "details": { - "description": "min=0.121, mean=0.599, max=0.951, sum=10.782 (18)", - "tab": "Accuracy", - "CivilComments - ECE (10-bin)": { - "description": "9 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "CivilComments - EM (Robustness)": { - "description": "min=0.112, mean=0.484, max=0.81, sum=8.708 (18)", - "tab": "Robustness", - "score": 0.4837936253587437 - }, - "CivilComments - EM (Fairness)": { - "description": "min=0.073, mean=0.553, max=0.939, sum=9.947 (18)", - "tab": "Fairness", - "score": 0.5526050039546541 - }, - "CivilComments - Denoised inference time (s)": { - "description": "9 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "CivilComments - # eval": { - "description": "min=74, mean=371.556, max=683, sum=6688 (18)", - "tab": "General information", - "score": 371.55555555555554 - }, - "CivilComments - # train": { - "description": "min=5, mean=5, max=5, sum=90 (18)", - "tab": "General information", - "score": 5.0 - }, - 
"CivilComments - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (18)", - "tab": "General information", - "score": 0.0 - }, - "CivilComments - # prompt tokens": { - "description": "min=360.976, mean=771.654, max=1282.4, sum=13889.772 (18)", - "tab": "General information", - "score": 771.6539847352628 - }, - "CivilComments - # output tokens": { - "description": "min=1, mean=1, max=1, sum=18 (18)", - "tab": "General information", - "score": 1.0 - }, - "CivilComments - # trials": { - "description": "min=1, mean=1, max=1, sum=18 (18)", - "tab": "General information", - "score": 1.0 - }, - "CivilComments - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Toxic fraction": { - "description": "9 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "RAFT", - "source_data": { - "dataset_name": "RAFT", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on RAFT", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.723, - "details": { - "description": "min=0.45, mean=0.723, max=0.975, sum=7.95 (11)", - "tab": "Accuracy", - "RAFT - ECE (10-bin)": { - "description": "11 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "RAFT - EM (Robustness)": { - "description": "min=0.075, mean=0.58, max=0.975, sum=6.375 (11)", - "tab": "Robustness", - "score": 0.5795454545454546 - }, - "RAFT - EM (Fairness)": { - "description": "min=0.35, mean=0.68, max=0.975, sum=7.475 (11)", - "tab": "Fairness", - "score": 0.6795454545454546 - }, - "RAFT - Denoised inference time (s)": { - "description": "11 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "RAFT - # eval": { - "description": "min=40, mean=40, max=40, sum=440 (11)", - "tab": "General information", - "score": 40.0 - }, - "RAFT - # train": { - "description": "min=0.7, mean=4.605, max=5, sum=50.65 (11)", - "tab": "General information", - "score": 4.6045454545454545 - }, - "RAFT - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (11)", - "tab": "General information", - "score": 0.0 - }, - "RAFT - # prompt tokens": { - "description": "min=280.35, mean=869.691, max=1756.575, sum=9566.6 (11)", - "tab": "General information", - "score": 869.6909090909089 - }, - "RAFT - # output tokens": { - "description": "min=0.725, mean=0.975, max=1, sum=10.725 (11)", - "tab": "General information", - "score": 0.975 - }, - "RAFT - # trials": { - "description": "min=1, mean=1, max=1, sum=11 (11)", - "tab": "General information", - "score": 1.0 - }, - "RAFT - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation 
(gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Toxic fraction": { - "description": "11 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_classic/mosaicml/MPT-Instruct-30B/42a86a4a-7e76-4c7d-af48-e765a38df589.json b/data/helm_classic/mosaicml/MPT-Instruct-30B/42a86a4a-7e76-4c7d-af48-e765a38df589.json deleted file mode 100644 index 771c4ac02..000000000 --- a/data/helm_classic/mosaicml/MPT-Instruct-30B/42a86a4a-7e76-4c7d-af48-e765a38df589.json +++ /dev/null @@ -1,1613 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_classic/mosaicml_MPT-Instruct-30B/1770834891.1472661", - "retrieved_timestamp": "1770834891.1472661", - "source_metadata": { - "source_name": "helm_classic", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MPT-Instruct 30B", - "id": "mosaicml/MPT-Instruct-30B", - "developer": "mosaicml", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_classic", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperform on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.716, - "details": { - "tab": "Accuracy", - "Mean win rate - Calibration": { - "description": null, - "tab": "Calibration", - "score": null - }, - "Mean win rate - Robustness": { - "description": null, - "tab": "Robustness", - "score": 0.6561072261072262 - }, - "Mean win rate - Fairness": { - "description": null, - "tab": "Fairness", - "score": 0.6874125874125874 - }, - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": null - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - }, - "Mean win rate - Bias": { - "description": null, - "tab": "Bias", - "score": 0.3616994955593857 - }, - "Mean win rate - Toxicity": { - "description": null, - "tab": "Toxicity", - "score": 0.2453962703962704 - }, - "Mean win rate - Summarization metrics": { - "description": null, - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.444, - "details": { - "description": "min=0.3, mean=0.444, max=0.64, sum=2.222 (5)", - "tab": "Accuracy", - "MMLU - ECE (10-bin)": { - "description": "5 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "MMLU - EM (Robustness)": { - "description": "min=0.22, mean=0.383, max=0.59, sum=1.913 (5)", - "tab": "Robustness", - "score": 0.3826315789473684 - }, - "MMLU - EM (Fairness)": { - 
"description": "min=0.24, mean=0.4, max=0.61, sum=2.002 (5)", - "tab": "Fairness", - "score": 0.40038596491228073 - }, - "MMLU - Denoised inference time (s)": { - "description": "5 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=358.76, mean=467.936, max=612.798, sum=2339.678 (5)", - "tab": "General information", - "score": 467.935649122807 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "MMLU - # trials": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "BoolQ", - "source_data": { - "dataset_name": "BoolQ", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on BoolQ", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.85, - "details": { - "description": "min=0.85, mean=0.85, max=0.85, sum=0.85 (1)", - "tab": "Accuracy", - "BoolQ - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "BoolQ - EM (Robustness)": { - "description": "min=0.77, mean=0.77, max=0.77, sum=0.77 (1)", - "tab": "Robustness", - "score": 0.77 - }, - "BoolQ - EM (Fairness)": { - "description": "min=0.807, mean=0.807, max=0.807, sum=0.807 (1)", - "tab": "Fairness", - "score": 0.807 - }, - "BoolQ - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "BoolQ - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "BoolQ - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "BoolQ - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "BoolQ - # prompt tokens": { - "description": "min=1251.897, mean=1251.897, max=1251.897, sum=1251.897 (1)", - "tab": "General information", - "score": 1251.897 - }, - "BoolQ - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "BoolQ - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "BoolQ - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Toxic fraction": { - "description": "1 matching runs, but no 
matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.733, - "details": { - "description": "min=0.733, mean=0.733, max=0.733, sum=0.733 (1)", - "tab": "Accuracy", - "NarrativeQA - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "NarrativeQA - F1 (Robustness)": { - "description": "min=0.623, mean=0.623, max=0.623, sum=0.623 (1)", - "tab": "Robustness", - "score": 0.6233490338408667 - }, - "NarrativeQA - F1 (Fairness)": { - "description": "min=0.633, mean=0.633, max=0.633, sum=0.633 (1)", - "tab": "Fairness", - "score": 0.6330893045624563 - }, - "NarrativeQA - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=1.969, mean=1.969, max=1.969, sum=1.969 (1)", - "tab": "General information", - "score": 1.9690140845070423 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=1691.082, mean=1691.082, max=1691.082, sum=1691.082 (1)", - "tab": "General information", - "score": 1691.081690140845 - }, - "NarrativeQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NarrativeQA - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NarrativeQA - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NarrativeQA - Stereotypes (gender)": { - "description": "min=0.5, mean=0.5, max=0.5, sum=0.5 (1)", - "tab": "Bias", - "score": 0.5 - }, - "NarrativeQA - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NarrativeQA - Representation (gender)": { - "description": "min=0.224, mean=0.224, max=0.224, sum=0.224 (1)", - "tab": "Bias", - "score": 0.22357723577235772 - }, - "NarrativeQA - Toxic fraction": { - "description": "min=0.017, mean=0.017, max=0.017, sum=0.017 (1)", - "tab": "Toxicity", - "score": 0.016901408450704224 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (open-book)", - "source_data": { - "dataset_name": "NaturalQuestions (open-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (open-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.697, - "details": { - "description": "min=0.697, mean=0.697, max=0.697, sum=0.697 (1)", - "tab": 
"Accuracy", - "NaturalQuestions (closed-book) - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "NaturalQuestions (open-book) - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "NaturalQuestions (closed-book) - F1 (Robustness)": { - "description": "min=0.202, mean=0.202, max=0.202, sum=0.202 (1)", - "tab": "Robustness", - "score": 0.20213849058578032 - }, - "NaturalQuestions (open-book) - F1 (Robustness)": { - "description": "min=0.607, mean=0.607, max=0.607, sum=0.607 (1)", - "tab": "Robustness", - "score": 0.6065652552159236 - }, - "NaturalQuestions (closed-book) - F1 (Fairness)": { - "description": "min=0.233, mean=0.233, max=0.233, sum=0.233 (1)", - "tab": "Fairness", - "score": 0.23301952773256637 - }, - "NaturalQuestions (open-book) - F1 (Fairness)": { - "description": "min=0.639, mean=0.639, max=0.639, sum=0.639 (1)", - "tab": "Fairness", - "score": 0.6392400021633227 - }, - "NaturalQuestions (closed-book) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NaturalQuestions (open-book) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=117.299, mean=117.299, max=117.299, sum=117.299 (1)", - "tab": "General information", - "score": 117.299 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NaturalQuestions (closed-book) - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.704, mean=4.704, max=4.704, sum=4.704 (1)", - "tab": "General information", - "score": 4.704 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.037, mean=0.037, max=0.037, sum=0.037 (1)", - "tab": "General information", - "score": 0.037 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1495.552, mean=1495.552, max=1495.552, sum=1495.552 (1)", - "tab": "General information", - "score": 1495.552 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=0.994, mean=0.994, max=0.994, sum=0.994 (1)", - "tab": "General information", - "score": 0.994 - }, - "NaturalQuestions (open-book) - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NaturalQuestions (closed-book) - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NaturalQuestions (closed-book) - Stereotypes 
(gender)": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "Bias", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - Representation (race)": { - "description": "min=0.542, mean=0.542, max=0.542, sum=0.542 (1)", - "tab": "Bias", - "score": 0.5416666666666667 - }, - "NaturalQuestions (closed-book) - Representation (gender)": { - "description": "min=0.091, mean=0.091, max=0.091, sum=0.091 (1)", - "tab": "Bias", - "score": 0.09090909090909088 - }, - "NaturalQuestions (open-book) - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=0.667 (1)", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "NaturalQuestions (open-book) - Stereotypes (gender)": { - "description": "min=0.5, mean=0.5, max=0.5, sum=0.5 (1)", - "tab": "Bias", - "score": 0.5 - }, - "NaturalQuestions (open-book) - Representation (race)": { - "description": "min=0.493, mean=0.493, max=0.493, sum=0.493 (1)", - "tab": "Bias", - "score": 0.4931129476584022 - }, - "NaturalQuestions (open-book) - Representation (gender)": { - "description": "min=0.286, mean=0.286, max=0.286, sum=0.286 (1)", - "tab": "Bias", - "score": 0.2857142857142857 - }, - "NaturalQuestions (closed-book) - Toxic fraction": { - "description": "min=0.001, mean=0.001, max=0.001, sum=0.001 (1)", - "tab": "Toxicity", - "score": 0.001 - }, - "NaturalQuestions (open-book) - Toxic fraction": { - "description": "min=0.001, mean=0.001, max=0.001, sum=0.001 (1)", - "tab": "Toxicity", - "score": 0.001 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "QuAC", - "source_data": { - "dataset_name": "QuAC", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on QuAC", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.327, - "details": { - "description": "min=0.327, mean=0.327, max=0.327, sum=0.327 (1)", - "tab": "Accuracy", - "QuAC - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "QuAC - F1 (Robustness)": { - "description": "min=0.204, mean=0.204, max=0.204, sum=0.204 (1)", - "tab": "Robustness", - "score": 0.20366013650654988 - }, - "QuAC - F1 (Fairness)": { - "description": "min=0.252, mean=0.252, max=0.252, sum=0.252 (1)", - "tab": "Fairness", - "score": 0.2519147363869601 - }, - "QuAC - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "QuAC - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "QuAC - # train": { - "description": "min=0.883, mean=0.883, max=0.883, sum=0.883 (1)", - "tab": "General information", - "score": 0.883 - }, - "QuAC - truncated": { - "description": "min=0.021, mean=0.021, max=0.021, sum=0.021 (1)", - "tab": "General information", - "score": 0.021 - }, - "QuAC - # prompt tokens": { - "description": "min=1655.708, mean=1655.708, max=1655.708, sum=1655.708 (1)", - "tab": "General information", - "score": 1655.708 - }, - "QuAC - # output tokens": { - "description": "min=0.998, mean=0.998, max=0.998, sum=0.998 (1)", - "tab": "General information", - "score": 0.998 - }, - "QuAC - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - 
"score": 1.0 - }, - "QuAC - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=0.667 (1)", - "tab": "Bias", - "score": 0.6666666666666667 - }, - "QuAC - Stereotypes (gender)": { - "description": "min=0.426, mean=0.426, max=0.426, sum=0.426 (1)", - "tab": "Bias", - "score": 0.42553763440860215 - }, - "QuAC - Representation (race)": { - "description": "min=0.407, mean=0.407, max=0.407, sum=0.407 (1)", - "tab": "Bias", - "score": 0.4074074074074074 - }, - "QuAC - Representation (gender)": { - "description": "min=0.232, mean=0.232, max=0.232, sum=0.232 (1)", - "tab": "Bias", - "score": 0.23239436619718312 - }, - "QuAC - Toxic fraction": { - "description": "min=0.003, mean=0.003, max=0.003, sum=0.003 (1)", - "tab": "Toxicity", - "score": 0.003 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "HellaSwag", - "source_data": { - "dataset_name": "HellaSwag", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on HellaSwag", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "HellaSwag - ECE (10-bin)": { - "description": "No matching runs", - "tab": "Calibration", - "score": null - }, - "HellaSwag - EM (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "HellaSwag - EM (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "HellaSwag - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "HellaSwag - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "OpenbookQA - ECE (10-bin)": { - "description": "No matching runs", - "tab": "Calibration", - "score": null - }, - "OpenbookQA - EM (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "OpenbookQA - EM (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "OpenbookQA - Denoised inference time (s)": 
{ - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "OpenbookQA - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "TruthfulQA", - "source_data": { - "dataset_name": "TruthfulQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on TruthfulQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.234, - "details": { - "description": "min=0.234, mean=0.234, max=0.234, sum=0.234 (1)", - "tab": "Accuracy", - "TruthfulQA - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "TruthfulQA - EM (Robustness)": { - "description": "min=0.177, mean=0.177, max=0.177, sum=0.177 (1)", - "tab": "Robustness", - "score": 0.17737003058103976 - }, - "TruthfulQA - EM (Fairness)": { - "description": "min=0.18, mean=0.18, max=0.18, sum=0.18 (1)", - "tab": "Fairness", - "score": 0.18042813455657492 - }, - "TruthfulQA - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "TruthfulQA - # eval": { - "description": "min=654, mean=654, max=654, sum=654 (1)", - "tab": "General information", - "score": 654.0 - }, - "TruthfulQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "TruthfulQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "TruthfulQA - # prompt tokens": { - "description": "min=505.352, mean=505.352, max=505.352, sum=505.352 (1)", - "tab": "General information", - "score": 505.35168195718654 - }, - "TruthfulQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "TruthfulQA - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MS MARCO (TREC)", - "source_data": { - "dataset_name": "MS MARCO (TREC)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "NDCG@10 on MS MARCO (TREC)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "MS MARCO (regular) - RR@10 
(Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "MS MARCO (TREC) - NDCG@10 (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "MS MARCO (regular) - RR@10 (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "MS MARCO (TREC) - NDCG@10 (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "MS MARCO (regular) - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "MS MARCO (TREC) - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "MS MARCO (regular) - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - }, - "MS MARCO (TREC) - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": 
{} - } - }, - { - "evaluation_name": "CNN/DailyMail", - "source_data": { - "dataset_name": "CNN/DailyMail", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on CNN/DailyMail", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "CNN/DailyMail - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "CNN/DailyMail - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - }, - "CNN/DailyMail - SummaC": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - QAFactEval": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - BERTScore (F1)": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - Coverage": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - Density": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - Compression": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-faithfulness": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-relevance": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-coherence": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "XSUM", - "source_data": { - "dataset_name": "XSUM", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on 
XSUM", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "XSUM - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "XSUM - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - }, - "XSUM - SummaC": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - QAFactEval": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - BERTScore (F1)": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - Coverage": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - Density": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - Compression": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-faithfulness": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-relevance": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-coherence": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "IMDB", - "source_data": { - "dataset_name": "IMDB", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on IMDB", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.956, - "details": { - "description": "min=0.956, mean=0.956, max=0.956, sum=0.956 (1)", - "tab": "Accuracy", - "IMDB - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "IMDB - EM (Robustness)": { - "description": "min=0.942, mean=0.942, max=0.942, sum=0.942 (1)", - "tab": "Robustness", - "score": 0.942 - }, - 
"IMDB - EM (Fairness)": { - "description": "min=0.944, mean=0.944, max=0.944, sum=0.944 (1)", - "tab": "Fairness", - "score": 0.944 - }, - "IMDB - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "IMDB - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "IMDB - # train": { - "description": "min=2.911, mean=2.911, max=2.911, sum=2.911 (1)", - "tab": "General information", - "score": 2.911 - }, - "IMDB - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IMDB - # prompt tokens": { - "description": "min=1619.568, mean=1619.568, max=1619.568, sum=1619.568 (1)", - "tab": "General information", - "score": 1619.568 - }, - "IMDB - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "IMDB - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "IMDB - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CivilComments", - "source_data": { - "dataset_name": "CivilComments", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on CivilComments", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.573, - "details": { - "description": "min=0.119, mean=0.573, max=0.967, sum=10.316 (18)", - "tab": "Accuracy", - "CivilComments - ECE (10-bin)": { - "description": "9 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "CivilComments - EM (Robustness)": { - "description": "min=0.042, mean=0.408, max=0.867, sum=7.353 (18)", - "tab": "Robustness", - "score": 0.40848129232892094 - }, - "CivilComments - EM (Fairness)": { - "description": "min=0.085, mean=0.527, max=0.95, sum=9.488 (18)", - "tab": "Fairness", - "score": 0.5271340155324973 - }, - "CivilComments - Denoised inference time (s)": { - "description": "9 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "CivilComments - # eval": { - "description": "min=74, mean=371.556, max=683, sum=6688 (18)", - "tab": "General information", - "score": 371.55555555555554 - }, - "CivilComments - # train": { - "description": "min=5, mean=5, max=5, sum=90 (18)", - "tab": "General information", - "score": 5.0 - }, - "CivilComments - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (18)", - "tab": "General information", - "score": 0.0 - }, - "CivilComments - # prompt tokens": { - "description": 
"min=360.976, mean=771.654, max=1282.4, sum=13889.772 (18)", - "tab": "General information", - "score": 771.6539847352628 - }, - "CivilComments - # output tokens": { - "description": "min=1, mean=1, max=1, sum=18 (18)", - "tab": "General information", - "score": 1.0 - }, - "CivilComments - # trials": { - "description": "min=1, mean=1, max=1, sum=18 (18)", - "tab": "General information", - "score": 1.0 - }, - "CivilComments - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Toxic fraction": { - "description": "9 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "RAFT", - "source_data": { - "dataset_name": "RAFT", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on RAFT", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.68, - "details": { - "description": "min=0.425, mean=0.68, max=0.9, sum=7.475 (11)", - "tab": "Accuracy", - "RAFT - ECE (10-bin)": { - "description": "11 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "RAFT - EM (Robustness)": { - "description": "min=0.075, mean=0.548, max=0.875, sum=6.025 (11)", - "tab": "Robustness", - "score": 0.5477272727272727 - }, - "RAFT - EM (Fairness)": { - "description": "min=0.4, mean=0.636, max=0.825, sum=7 (11)", - "tab": "Fairness", - "score": 0.6363636363636364 - }, - "RAFT - Denoised inference time (s)": { - "description": "11 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "RAFT - # eval": { - "description": "min=40, mean=40, max=40, sum=440 (11)", - "tab": "General information", - "score": 40.0 - }, - "RAFT - # train": { - "description": "min=0.7, mean=4.605, max=5, sum=50.65 (11)", - "tab": "General information", - "score": 4.6045454545454545 - }, - "RAFT - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (11)", - "tab": "General information", - "score": 0.0 - }, - "RAFT - # prompt tokens": { - "description": "min=280.35, mean=869.691, max=1756.575, sum=9566.6 (11)", - "tab": "General information", - "score": 869.6909090909089 - }, - "RAFT - # output tokens": { - "description": "min=1, mean=1, max=1, sum=11 (11)", - "tab": "General information", - "score": 1.0 - }, - "RAFT - # trials": { - "description": "min=1, mean=1, max=1, sum=11 (11)", - "tab": "General information", - "score": 1.0 - }, - "RAFT - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Toxic fraction": { - "description": "11 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - 
"generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_classic/openai/GPT-J-6B/f9746ed1-887f-4850-ac2d-700de18acbaf.json b/data/helm_classic/openai/GPT-J-6B/f9746ed1-887f-4850-ac2d-700de18acbaf.json deleted file mode 100644 index 20a0f0d63..000000000 --- a/data/helm_classic/openai/GPT-J-6B/f9746ed1-887f-4850-ac2d-700de18acbaf.json +++ /dev/null @@ -1,1613 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_classic/openai_GPT-J-6B/1770834891.1472661", - "retrieved_timestamp": "1770834891.1472661", - "source_metadata": { - "source_name": "helm_classic", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "GPT-J 6B", - "id": "openai/GPT-J-6B", - "developer": "openai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_classic", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperform on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.273, - "details": { - "tab": "Accuracy", - "Mean win rate - Calibration": { - "description": null, - "tab": "Calibration", - "score": 0.4640964584689531 - }, - "Mean win rate - Robustness": { - "description": null, - "tab": "Robustness", - "score": 0.29051104623963353 - }, - "Mean win rate - Fairness": { - "description": null, - "tab": "Fairness", - "score": 0.2899930436637889 - }, - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.6008771929824561 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - }, - "Mean win rate - Bias": { - "description": null, - "tab": "Bias", - "score": 0.4572430192172563 - }, - "Mean win rate - Toxicity": { - "description": null, - "tab": "Toxicity", - "score": 0.24521373688040354 - }, - "Mean win rate - Summarization metrics": { - "description": null, - "tab": "Summarization metrics", - "score": 0.5489557226399332 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.249, - "details": { - "description": "min=0.14, mean=0.249, max=0.3, sum=3.728 (15)", - "tab": "Accuracy", - "MMLU - ECE (10-bin)": { - "description": "min=0.062, mean=0.115, max=0.149, sum=1.732 (15)", - "tab": "Calibration", - "score": 0.11546362297486105 - }, - "MMLU - EM (Robustness)": { - "description": "min=0.11, mean=0.217, max=0.28, sum=3.262 (15)", - "tab": "Robustness", - "score": 0.2174502923976608 - }, - "MMLU - EM (Fairness)": { - "description": "min=0.13, mean=0.22, max=0.27, sum=3.294 (15)", - "tab": "Fairness", - "score": 0.21961403508771932 - }, - "MMLU - Denoised inference time (s)": { - "description": "min=0.066, mean=0.07, max=0.072, 
sum=1.05 (15)", - "tab": "Efficiency", - "score": 0.06997480863135229 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=1542 (15)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=75 (15)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (15)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=371.38, mean=472.274, max=624.07, sum=7084.111 (15)", - "tab": "General information", - "score": 472.2740350877193 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=15 (15)", - "tab": "General information", - "score": 1.0 - }, - "MMLU - # trials": { - "description": "min=3, mean=3, max=3, sum=45 (15)", - "tab": "General information", - "score": 3.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "BoolQ", - "source_data": { - "dataset_name": "BoolQ", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on BoolQ", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.649, - "details": { - "description": "min=0.646, mean=0.649, max=0.65, sum=1.946 (3)", - "tab": "Accuracy", - "BoolQ - ECE (10-bin)": { - "description": "min=0.043, mean=0.062, max=0.086, sum=0.187 (3)", - "tab": "Calibration", - "score": 0.062432673938629946 - }, - "BoolQ - EM (Robustness)": { - "description": "min=0.608, mean=0.621, max=0.631, sum=1.863 (3)", - "tab": "Robustness", - "score": 0.621 - }, - "BoolQ - EM (Fairness)": { - "description": "min=0.638, mean=0.639, max=0.64, sum=1.916 (3)", - "tab": "Fairness", - "score": 0.6386666666666666 - }, - "BoolQ - Denoised inference time (s)": { - "description": "min=0.354, mean=0.499, max=0.575, sum=1.497 (3)", - "tab": "Efficiency", - "score": 0.49915384031836946 - }, - "BoolQ - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "BoolQ - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "BoolQ - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "BoolQ - # prompt tokens": { - "description": "min=660.073, mean=908.406, max=1242.073, sum=2725.219 (3)", - "tab": "General information", - "score": 908.4063333333334 - }, - "BoolQ - # output tokens": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "BoolQ - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "BoolQ - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": 
{} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.545, - "details": { - "description": "min=0.54, mean=0.545, max=0.554, sum=1.634 (3)", - "tab": "Accuracy", - "NarrativeQA - ECE (10-bin)": { - "description": "min=0.189, mean=0.199, max=0.211, sum=0.596 (3)", - "tab": "Calibration", - "score": 0.19883043691040034 - }, - "NarrativeQA - F1 (Robustness)": { - "description": "min=0.099, mean=0.135, max=0.156, sum=0.405 (3)", - "tab": "Robustness", - "score": 0.1349521611222693 - }, - "NarrativeQA - F1 (Fairness)": { - "description": "min=0.417, mean=0.433, max=0.448, sum=1.3 (3)", - "tab": "Fairness", - "score": 0.43317656281615613 - }, - "NarrativeQA - Denoised inference time (s)": { - "description": "min=0.988, mean=1.311, max=1.513, sum=3.934 (3)", - "tab": "Efficiency", - "score": 1.311420011868712 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=1065 (3)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=1.051, mean=1.647, max=2.085, sum=4.941 (3)", - "tab": "General information", - "score": 1.6469483568075116 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=1601.955, mean=1652.377, max=1705.003, sum=4957.132 (3)", - "tab": "General information", - "score": 1652.3774647887324 - }, - "NarrativeQA - # output tokens": { - "description": "min=42.766, mean=56.052, max=70.845, sum=168.155 (3)", - "tab": "General information", - "score": 56.05164319248826 - }, - "NarrativeQA - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NarrativeQA - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=0.667 (1)", - "tab": "Bias", - "score": 0.6666666666666667 - }, - "NarrativeQA - Stereotypes (gender)": { - "description": "min=0.416, mean=0.451, max=0.5, sum=1.353 (3)", - "tab": "Bias", - "score": 0.4510416666666666 - }, - "NarrativeQA - Representation (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=1.333 (2)", - "tab": "Bias", - "score": 0.6666666666666667 - }, - "NarrativeQA - Representation (gender)": { - "description": "min=0.204, mean=0.217, max=0.229, sum=0.651 (3)", - "tab": "Bias", - "score": 0.21710889248239795 - }, - "NarrativeQA - Toxic fraction": { - "description": "min=0.017, mean=0.021, max=0.025, sum=0.062 (3)", - "tab": "Toxicity", - "score": 0.020657276995305163 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (open-book)", - "source_data": { - "dataset_name": "NaturalQuestions (open-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (open-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 
0.559, - "details": { - "description": "min=0.548, mean=0.559, max=0.57, sum=1.677 (3)", - "tab": "Accuracy", - "NaturalQuestions (closed-book) - ECE (10-bin)": { - "description": "min=0.069, mean=0.075, max=0.079, sum=0.224 (3)", - "tab": "Calibration", - "score": 0.07464671252737104 - }, - "NaturalQuestions (open-book) - ECE (10-bin)": { - "description": "min=0.346, mean=0.354, max=0.358, sum=1.062 (3)", - "tab": "Calibration", - "score": 0.3539383109024162 - }, - "NaturalQuestions (closed-book) - F1 (Robustness)": { - "description": "min=0.09, mean=0.099, max=0.109, sum=0.298 (3)", - "tab": "Robustness", - "score": 0.09933930594531819 - }, - "NaturalQuestions (open-book) - F1 (Robustness)": { - "description": "min=0.185, mean=0.228, max=0.265, sum=0.683 (3)", - "tab": "Robustness", - "score": 0.22767804828628146 - }, - "NaturalQuestions (closed-book) - F1 (Fairness)": { - "description": "min=0.112, mean=0.122, max=0.128, sum=0.365 (3)", - "tab": "Fairness", - "score": 0.12161534757794057 - }, - "NaturalQuestions (open-book) - F1 (Fairness)": { - "description": "min=0.475, mean=0.493, max=0.505, sum=1.479 (3)", - "tab": "Fairness", - "score": 0.4930833990161269 - }, - "NaturalQuestions (closed-book) - Denoised inference time (s)": { - "description": "min=1.626, mean=1.777, max=1.998, sum=5.331 (3)", - "tab": "Efficiency", - "score": 1.77691167926379 - }, - "NaturalQuestions (open-book) - Denoised inference time (s)": { - "description": "min=3.687, mean=3.866, max=4.016, sum=11.599 (3)", - "tab": "Efficiency", - "score": 3.8663324384530373 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=110.254, mean=112.254, max=116.254, sum=336.762 (3)", - "tab": "General information", - "score": 112.254 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=273.408, mean=282.837, max=296.556, sum=848.512 (3)", - "tab": "General information", - "score": 282.83733333333333 - }, - "NaturalQuestions (closed-book) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.647, mean=4.691, max=4.724, sum=14.074 (3)", - "tab": "General information", - "score": 4.691333333333334 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.036, mean=0.036, max=0.036, sum=0.108 (3)", - "tab": "General information", - "score": 0.036 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1231.212, mean=1419.574, max=1523.257, sum=4258.721 (3)", - "tab": "General information", - "score": 1419.5736666666664 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=234.154, mean=247.23, max=261.681, sum=741.689 (3)", - "tab": "General information", - "score": 247.22966666666665 - }, - "NaturalQuestions (open-book) - # trials": { - 
"description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NaturalQuestions (closed-book) - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=0.667 (1)", - "tab": "Bias", - "score": 0.6666666666666667 - }, - "NaturalQuestions (closed-book) - Stereotypes (gender)": { - "description": "min=0.5, mean=0.5, max=0.5, sum=0.5 (1)", - "tab": "Bias", - "score": 0.5 - }, - "NaturalQuestions (closed-book) - Representation (race)": { - "description": "min=0.373, mean=0.49, max=0.553, sum=1.47 (3)", - "tab": "Bias", - "score": 0.49013920663848926 - }, - "NaturalQuestions (closed-book) - Representation (gender)": { - "description": "min=0.071, mean=0.192, max=0.38, sum=0.576 (3)", - "tab": "Bias", - "score": 0.19214285714285717 - }, - "NaturalQuestions (open-book) - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=1.333 (2)", - "tab": "Bias", - "score": 0.6666666666666667 - }, - "NaturalQuestions (open-book) - Stereotypes (gender)": { - "description": "min=0.5, mean=0.5, max=0.5, sum=1.5 (3)", - "tab": "Bias", - "score": 0.5 - }, - "NaturalQuestions (open-book) - Representation (race)": { - "description": "min=0.484, mean=0.524, max=0.561, sum=1.571 (3)", - "tab": "Bias", - "score": 0.5236086934551658 - }, - "NaturalQuestions (open-book) - Representation (gender)": { - "description": "min=0.289, mean=0.317, max=0.333, sum=0.95 (3)", - "tab": "Bias", - "score": 0.3167977414801371 - }, - "NaturalQuestions (closed-book) - Toxic fraction": { - "description": "min=0.001, mean=0.001, max=0.001, sum=0.003 (3)", - "tab": "Toxicity", - "score": 0.001 - }, - "NaturalQuestions (open-book) - Toxic fraction": { - "description": "min=0.001, mean=0.001, max=0.002, sum=0.004 (3)", - "tab": "Toxicity", - "score": 0.0013333333333333333 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "QuAC", - "source_data": { - "dataset_name": "QuAC", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on QuAC", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.33, - "details": { - "description": "min=0.322, mean=0.33, max=0.335, sum=0.989 (3)", - "tab": "Accuracy", - "QuAC - ECE (10-bin)": { - "description": "min=0.104, mean=0.13, max=0.169, sum=0.391 (3)", - "tab": "Calibration", - "score": 0.13037730069459044 - }, - "QuAC - F1 (Robustness)": { - "description": "min=0.14, mean=0.147, max=0.155, sum=0.44 (3)", - "tab": "Robustness", - "score": 0.14672783806116493 - }, - "QuAC - F1 (Fairness)": { - "description": "min=0.245, mean=0.249, max=0.258, sum=0.748 (3)", - "tab": "Fairness", - "score": 0.2494842989068126 - }, - "QuAC - Denoised inference time (s)": { - "description": "min=1.354, mean=1.389, max=1.411, sum=4.166 (3)", - "tab": "Efficiency", - "score": 1.3887290514336688 - }, - "QuAC - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "QuAC - # train": { - "description": "min=0.845, mean=0.944, max=1.086, sum=2.833 (3)", - "tab": "General information", - "score": 0.9443333333333334 - }, - "QuAC - truncated": { - "description": "min=0.016, mean=0.016, max=0.016, sum=0.048 (3)", - "tab": "General information", - "score": 0.016 - }, - "QuAC - # prompt tokens": { 
- "description": "min=1625.523, mean=1644.831, max=1670.605, sum=4934.492 (3)", - "tab": "General information", - "score": 1644.8306666666667 - }, - "QuAC - # output tokens": { - "description": "min=64.208, mean=68.54, max=71.626, sum=205.621 (3)", - "tab": "General information", - "score": 68.54033333333334 - }, - "QuAC - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "QuAC - Stereotypes (race)": { - "description": "min=0.568, mean=0.613, max=0.641, sum=1.838 (3)", - "tab": "Bias", - "score": 0.6126959460292795 - }, - "QuAC - Stereotypes (gender)": { - "description": "min=0.41, mean=0.43, max=0.447, sum=1.29 (3)", - "tab": "Bias", - "score": 0.4301368170697724 - }, - "QuAC - Representation (race)": { - "description": "min=0.232, mean=0.266, max=0.294, sum=0.798 (3)", - "tab": "Bias", - "score": 0.2658629278217009 - }, - "QuAC - Representation (gender)": { - "description": "min=0.211, mean=0.23, max=0.241, sum=0.69 (3)", - "tab": "Bias", - "score": 0.2300432286449244 - }, - "QuAC - Toxic fraction": { - "description": "min=0.001, mean=0.004, max=0.005, sum=0.011 (3)", - "tab": "Toxicity", - "score": 0.0036666666666666666 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "HellaSwag", - "source_data": { - "dataset_name": "HellaSwag", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on HellaSwag", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.663, - "details": { - "description": "min=0.663, mean=0.663, max=0.663, sum=0.663 (1)", - "tab": "Accuracy", - "HellaSwag - ECE (10-bin)": { - "description": "min=0.233, mean=0.233, max=0.233, sum=0.233 (1)", - "tab": "Calibration", - "score": 0.2332919292558098 - }, - "HellaSwag - EM (Robustness)": { - "description": "min=0.619, mean=0.619, max=0.619, sum=0.619 (1)", - "tab": "Robustness", - "score": 0.619 - }, - "HellaSwag - EM (Fairness)": { - "description": "min=0.486, mean=0.486, max=0.486, sum=0.486 (1)", - "tab": "Fairness", - "score": 0.486 - }, - "HellaSwag - Denoised inference time (s)": { - "description": "min=0.03, mean=0.03, max=0.03, sum=0.03 (1)", - "tab": "Efficiency", - "score": 0.030294155851006508 - }, - "HellaSwag - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "HellaSwag - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag - # prompt tokens": { - "description": "min=87.888, mean=87.888, max=87.888, sum=87.888 (1)", - "tab": "General information", - "score": 87.888 - }, - "HellaSwag - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.514, - "details": { - "description": "min=0.514, mean=0.514, max=0.514, sum=0.514 (1)", - "tab": "Accuracy", - "OpenbookQA - ECE (10-bin)": { - "description": "min=0.235, mean=0.235, max=0.235, sum=0.235 (1)", - "tab": "Calibration", - "score": 0.2353362549897216 - }, - "OpenbookQA - EM (Robustness)": { - "description": "min=0.398, mean=0.398, max=0.398, sum=0.398 (1)", - "tab": "Robustness", - "score": 0.398 - }, - "OpenbookQA - EM (Fairness)": { - "description": "min=0.416, mean=0.416, max=0.416, sum=0.416 (1)", - "tab": "Fairness", - "score": 0.416 - }, - "OpenbookQA - Denoised inference time (s)": { - "description": "min=0.019, mean=0.019, max=0.019, sum=0.019 (1)", - "tab": "Efficiency", - "score": 0.019339164675618026 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=5.27, mean=5.27, max=5.27, sum=5.27 (1)", - "tab": "General information", - "score": 5.27 - }, - "OpenbookQA - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "TruthfulQA", - "source_data": { - "dataset_name": "TruthfulQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on TruthfulQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.199, - "details": { - "description": "min=0.187, mean=0.199, max=0.213, sum=0.797 (4)", - "tab": "Accuracy", - "TruthfulQA - ECE (10-bin)": { - "description": "min=0.056, mean=0.078, max=0.103, sum=0.311 (4)", - "tab": "Calibration", - "score": 0.07772735423117484 - }, - "TruthfulQA - EM (Robustness)": { - "description": "min=0.157, mean=0.181, max=0.209, sum=0.725 (4)", - "tab": "Robustness", - "score": 0.1811926605504587 - }, - "TruthfulQA - EM (Fairness)": { - "description": "min=0.156, mean=0.18, max=0.209, sum=0.72 (4)", - "tab": "Fairness", - "score": 0.18004587155963303 - }, - "TruthfulQA - Denoised inference time (s)": { - "description": "min=0.018, mean=0.044, max=0.053, sum=0.175 (4)", - "tab": "Efficiency", - "score": 0.043782452828866295 - }, - "TruthfulQA - # eval": { - "description": "min=654, mean=654, max=654, sum=2616 (4)", - "tab": "General information", - "score": 654.0 - }, - "TruthfulQA - # train": { - "description": "min=0, mean=3.75, max=5, sum=15 (4)", - "tab": "General information", - "score": 3.75 - }, - "TruthfulQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (4)", - "tab": 
"General information", - "score": 0.0 - }, - "TruthfulQA - # prompt tokens": { - "description": "min=85.121, mean=404.621, max=529.121, sum=1618.483 (4)", - "tab": "General information", - "score": 404.62079510703364 - }, - "TruthfulQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=4 (4)", - "tab": "General information", - "score": 1.0 - }, - "TruthfulQA - # trials": { - "description": "min=1, mean=2.5, max=3, sum=10 (4)", - "tab": "General information", - "score": 2.5 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MS MARCO (TREC)", - "source_data": { - "dataset_name": "MS MARCO (TREC)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "NDCG@10 on MS MARCO (TREC)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.345, - "details": { - "description": "min=0.315, mean=0.345, max=0.362, sum=1.035 (3)", - "tab": "Accuracy", - "MS MARCO (regular) - RR@10 (Robustness)": { - "description": "min=0.094, mean=0.116, max=0.131, sum=0.349 (3)", - "tab": "Robustness", - "score": 0.11636587301587299 - }, - "MS MARCO (TREC) - NDCG@10 (Robustness)": { - "description": "min=0.29, mean=0.319, max=0.336, sum=0.957 (3)", - "tab": "Robustness", - "score": 0.3190834142643501 - }, - "MS MARCO (regular) - RR@10 (Fairness)": { - "description": "min=0.106, mean=0.129, max=0.144, sum=0.387 (3)", - "tab": "Fairness", - "score": 0.12886375661375657 - }, - "MS MARCO (TREC) - NDCG@10 (Fairness)": { - "description": "min=0.303, mean=0.332, max=0.348, sum=0.997 (3)", - "tab": "Fairness", - "score": 0.3321982457704417 - }, - "MS MARCO (regular) - Denoised inference time (s)": { - "description": "min=0.081, mean=0.084, max=0.088, sum=0.252 (3)", - "tab": "Efficiency", - "score": 0.08407480907713127 - }, - "MS MARCO (TREC) - Denoised inference time (s)": { - "description": "min=0.078, mean=0.081, max=0.083, sum=0.242 (3)", - "tab": "Efficiency", - "score": 0.08053553836682271 - }, - "MS MARCO (regular) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "MS MARCO (regular) - # train": { - "description": "min=2, mean=2, max=2, sum=6 (3)", - "tab": "General information", - "score": 2.0 - }, - "MS MARCO (regular) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "MS MARCO (regular) - # prompt tokens": { - "description": "min=495.232, mean=532.565, max=577.232, sum=1597.696 (3)", - "tab": "General information", - "score": 532.5653333333333 - }, - "MS MARCO (regular) - # output tokens": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "MS MARCO (regular) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "MS MARCO (TREC) - # eval": { - "description": "min=43, mean=43, max=43, sum=129 (3)", - "tab": "General information", - "score": 43.0 - }, - "MS MARCO (TREC) - # train": { - "description": "min=2, mean=2, max=2, sum=6 (3)", - "tab": "General information", - "score": 2.0 - }, - "MS MARCO (TREC) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "MS MARCO (TREC) - # prompt 
tokens": { - "description": "min=478.488, mean=515.822, max=560.488, sum=1547.465 (3)", - "tab": "General information", - "score": 515.8217054263565 - }, - "MS MARCO (TREC) - # output tokens": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "MS MARCO (TREC) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "MS MARCO (regular) - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - }, - "MS MARCO (TREC) - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CNN/DailyMail", - "source_data": { - "dataset_name": "CNN/DailyMail", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on CNN/DailyMail", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.131, - "details": { - "description": "min=0.127, mean=0.131, max=0.135, sum=0.787 (6)", - "tab": "Accuracy", - "CNN/DailyMail - Denoised inference time (s)": { - "description": "min=1.997, mean=2.076, max=2.172, sum=12.455 (6)", - "tab": "Efficiency", - "score": 2.0758840914959578 - }, - "CNN/DailyMail - # eval": { - "description": "min=466, mean=466, max=466, sum=2796 (6)", - "tab": "General information", - "score": 466.0 - }, - "CNN/DailyMail - # train": { - "description": "min=5, mean=5, max=5, sum=30 (6)", - "tab": "General information", - "score": 5.0 - }, - "CNN/DailyMail - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (6)", - "tab": "General information", - "score": 0.0 - }, - "CNN/DailyMail - # prompt tokens": { - "description": "min=1531.586, mean=1549.919, max=1567.586, sum=9299.515 (6)", - "tab": "General information", - "score": 1549.9191702432045 - }, - "CNN/DailyMail - # output tokens": { - "description": "min=76.916, mean=83.931, max=91.68, sum=503.584 (6)", - "tab": "General information", - "score": 83.93061516452074 - }, - "CNN/DailyMail - # trials": { - "description": 
"min=3, mean=3, max=3, sum=18 (6)", - "tab": "General information", - "score": 3.0 - }, - "CNN/DailyMail - Stereotypes (race)": { - "description": "min=0.602, mean=0.63, max=0.655, sum=3.78 (6)", - "tab": "Bias", - "score": 0.6299677400199846 - }, - "CNN/DailyMail - Stereotypes (gender)": { - "description": "min=0.398, mean=0.402, max=0.41, sum=2.415 (6)", - "tab": "Bias", - "score": 0.40247728320483095 - }, - "CNN/DailyMail - Representation (race)": { - "description": "min=0.23, mean=0.293, max=0.359, sum=1.759 (6)", - "tab": "Bias", - "score": 0.2931668421996429 - }, - "CNN/DailyMail - Representation (gender)": { - "description": "min=0.131, mean=0.146, max=0.169, sum=0.875 (6)", - "tab": "Bias", - "score": 0.14576217898261626 - }, - "CNN/DailyMail - Toxic fraction": { - "description": "min=0.002, mean=0.002, max=0.002, sum=0.013 (6)", - "tab": "Toxicity", - "score": 0.002145922746781116 - }, - "CNN/DailyMail - SummaC": { - "description": "min=0.172, mean=0.208, max=0.236, sum=0.623 (3)", - "tab": "Summarization metrics", - "score": 0.20780144742590156 - }, - "CNN/DailyMail - QAFactEval": { - "description": "min=4.648, mean=4.704, max=4.739, sum=28.226 (6)", - "tab": "Summarization metrics", - "score": 4.704313539792442 - }, - "CNN/DailyMail - BERTScore (F1)": { - "description": "min=0.241, mean=0.247, max=0.25, sum=0.74 (3)", - "tab": "Summarization metrics", - "score": 0.2466254745716148 - }, - "CNN/DailyMail - Coverage": { - "description": "min=0.902, mean=0.948, max=0.97, sum=5.685 (6)", - "tab": "Summarization metrics", - "score": 0.9475541325972495 - }, - "CNN/DailyMail - Density": { - "description": "min=41.364, mean=48.284, max=57.69, sum=289.703 (6)", - "tab": "Summarization metrics", - "score": 48.283839374824815 - }, - "CNN/DailyMail - Compression": { - "description": "min=8.117, mean=9.864, max=11.439, sum=59.186 (6)", - "tab": "Summarization metrics", - "score": 9.864391531990323 - }, - "CNN/DailyMail - HumanEval-faithfulness": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-relevance": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-coherence": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "XSUM", - "source_data": { - "dataset_name": "XSUM", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on XSUM", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.096, - "details": { - "description": "min=0.093, mean=0.096, max=0.097, sum=0.573 (6)", - "tab": "Accuracy", - "XSUM - Denoised inference time (s)": { - "description": "min=0.73, mean=0.742, max=0.758, sum=4.455 (6)", - "tab": "Efficiency", - "score": 0.7424737962465443 - }, - "XSUM - # eval": { - "description": "min=518, mean=518, max=518, sum=3108 (6)", - "tab": "General information", - "score": 518.0 - }, - "XSUM - # train": { - "description": "min=4.998, mean=4.999, max=5, sum=29.992 (6)", - "tab": "General information", - "score": 4.998712998712999 - }, - "XSUM - truncated": { - "description": "min=0, mean=0, max=0, sum=0 
(6)", - "tab": "General information", - "score": 0.0 - }, - "XSUM - # prompt tokens": { - "description": "min=1456.402, mean=1510.418, max=1538.921, sum=9062.51 (6)", - "tab": "General information", - "score": 1510.4182754182755 - }, - "XSUM - # output tokens": { - "description": "min=24.919, mean=25.529, max=26.187, sum=153.174 (6)", - "tab": "General information", - "score": 25.52895752895753 - }, - "XSUM - # trials": { - "description": "min=3, mean=3, max=3, sum=18 (6)", - "tab": "General information", - "score": 3.0 - }, - "XSUM - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=4 (6)", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "XSUM - Stereotypes (gender)": { - "description": "min=0.393, mean=0.435, max=0.466, sum=2.612 (6)", - "tab": "Bias", - "score": 0.43535525321239604 - }, - "XSUM - Representation (race)": { - "description": "min=0.467, mean=0.513, max=0.565, sum=3.08 (6)", - "tab": "Bias", - "score": 0.5133548156104547 - }, - "XSUM - Representation (gender)": { - "description": "min=0.141, mean=0.165, max=0.179, sum=0.988 (6)", - "tab": "Bias", - "score": 0.1646512031093765 - }, - "XSUM - Toxic fraction": { - "description": "min=0, mean=0.002, max=0.004, sum=0.012 (6)", - "tab": "Toxicity", - "score": 0.0019305019305019308 - }, - "XSUM - SummaC": { - "description": "min=-0.229, mean=-0.198, max=-0.176, sum=-0.593 (3)", - "tab": "Summarization metrics", - "score": -0.1976111372976741 - }, - "XSUM - QAFactEval": { - "description": "min=3.59, mean=3.813, max=4.142, sum=22.877 (6)", - "tab": "Summarization metrics", - "score": 3.8128682530109397 - }, - "XSUM - BERTScore (F1)": { - "description": "min=0.379, mean=0.381, max=0.384, sum=1.142 (3)", - "tab": "Summarization metrics", - "score": 0.3808147712365148 - }, - "XSUM - Coverage": { - "description": "min=0.824, mean=0.829, max=0.831, sum=4.972 (6)", - "tab": "Summarization metrics", - "score": 0.8286466360730634 - }, - "XSUM - Density": { - "description": "min=3.796, mean=4.043, max=4.434, sum=24.256 (6)", - "tab": "Summarization metrics", - "score": 4.042629935538992 - }, - "XSUM - Compression": { - "description": "min=17.57, mean=17.942, max=18.398, sum=107.65 (6)", - "tab": "Summarization metrics", - "score": 17.941696288315352 - }, - "XSUM - HumanEval-faithfulness": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-relevance": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-coherence": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "IMDB", - "source_data": { - "dataset_name": "IMDB", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on IMDB", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.939, - "details": { - "description": "min=0.932, mean=0.939, max=0.946, sum=2.816 (3)", - "tab": "Accuracy", - "IMDB - ECE (10-bin)": { - "description": "min=0.285, mean=0.295, max=0.311, sum=0.884 (3)", - "tab": "Calibration", - "score": 0.2945110955018834 - }, - "IMDB - EM (Robustness)": { - "description": "min=0.895, 
mean=0.903, max=0.908, sum=2.709 (3)", - "tab": "Robustness", - "score": 0.903 - }, - "IMDB - EM (Fairness)": { - "description": "min=0.92, mean=0.927, max=0.932, sum=2.782 (3)", - "tab": "Fairness", - "score": 0.9273333333333333 - }, - "IMDB - Denoised inference time (s)": { - "description": "min=0.63, mean=0.701, max=0.761, sum=2.104 (3)", - "tab": "Efficiency", - "score": 0.7011672212481499 - }, - "IMDB - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "IMDB - # train": { - "description": "min=4.846, mean=4.933, max=4.986, sum=14.798 (3)", - "tab": "General information", - "score": 4.932666666666667 - }, - "IMDB - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "IMDB - # prompt tokens": { - "description": "min=1152.694, mean=1389.454, max=1744.631, sum=4168.363 (3)", - "tab": "General information", - "score": 1389.4543333333331 - }, - "IMDB - # output tokens": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "IMDB - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "IMDB - Stereotypes (race)": { - "description": "1 matching run, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Stereotypes (gender)": { - "description": "1 matching run, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (race)": { - "description": "1 matching run, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (gender)": { - "description": "1 matching run, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Toxic fraction": { - "description": "1 matching run, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CivilComments", - "source_data": { - "dataset_name": "CivilComments", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on CivilComments", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.52, - "details": { - "description": "min=0.002, mean=0.52, max=1, sum=28.06 (54)", - "tab": "Accuracy", - "CivilComments - ECE (10-bin)": { - "description": "min=0.075, mean=0.409, max=0.626, sum=22.076 (54)", - "tab": "Calibration", - "score": 0.40880926893677766 - }, - "CivilComments - EM (Robustness)": { - "description": "min=0, mean=0.418, max=1, sum=22.597 (54)", - "tab": "Robustness", - "score": 0.4184575354873046 - }, - "CivilComments - EM (Fairness)": { - "description": "min=0, mean=0.488, max=1, sum=26.356 (54)", - "tab": "Fairness", - "score": 0.4880679688031825 - }, - "CivilComments - Denoised inference time (s)": { - "description": "min=0.154, mean=0.307, max=0.494, sum=16.591 (54)", - "tab": "Efficiency", - "score": 0.30723795570455475 - }, - "CivilComments - # eval": { - "description": "min=74, mean=371.556, max=683, sum=20064 (54)", - "tab": "General information", - "score": 371.55555555555554 - }, - "CivilComments - # train": { - "description": "min=5, mean=5, max=5, sum=270 (54)", - "tab": "General information", - "score": 5.0 - }, - "CivilComments - 
truncated": { - "description": "min=0, mean=0, max=0, sum=0 (54)", - "tab": "General information", - "score": 0.0 - }, - "CivilComments - # prompt tokens": { - "description": "min=356.537, mean=722.635, max=1267.519, sum=39022.317 (54)", - "tab": "General information", - "score": 722.6354931173206 - }, - "CivilComments - # output tokens": { - "description": "min=5, mean=5, max=5, sum=270 (54)", - "tab": "General information", - "score": 5.0 - }, - "CivilComments - # trials": { - "description": "min=3, mean=3, max=3, sum=162 (54)", - "tab": "General information", - "score": 3.0 - }, - "CivilComments - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (race)": { - "description": "min=0.333, mean=0.5, max=0.667, sum=1 (2)", - "tab": "Bias", - "score": 0.5 - }, - "CivilComments - Representation (gender)": { - "description": "min=0.5, mean=0.5, max=0.5, sum=1 (2)", - "tab": "Bias", - "score": 0.5 - }, - "CivilComments - Toxic fraction": { - "description": "min=0, mean=0.0, max=0.001, sum=0.001 (54)", - "tab": "Toxicity", - "score": 0.000027763895829862844 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "RAFT", - "source_data": { - "dataset_name": "RAFT", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on RAFT", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.619, - "details": { - "description": "min=0.275, mean=0.619, max=0.975, sum=20.425 (33)", - "tab": "Accuracy", - "RAFT - ECE (10-bin)": { - "description": "min=0.116, mean=0.389, max=0.975, sum=12.832 (33)", - "tab": "Calibration", - "score": 0.3888407166022056 - }, - "RAFT - EM (Robustness)": { - "description": "min=0.1, mean=0.53, max=0.975, sum=17.5 (33)", - "tab": "Robustness", - "score": 0.5303030303030303 - }, - "RAFT - EM (Fairness)": { - "description": "min=0.275, mean=0.594, max=0.975, sum=19.6 (33)", - "tab": "Fairness", - "score": 0.593939393939394 - }, - "RAFT - Denoised inference time (s)": { - "description": "min=0.107, mean=0.628, max=1.382, sum=20.733 (33)", - "tab": "Efficiency", - "score": 0.6282604447639349 - }, - "RAFT - # eval": { - "description": "min=40, mean=40, max=40, sum=1320 (33)", - "tab": "General information", - "score": 40.0 - }, - "RAFT - # train": { - "description": "min=0, mean=4.556, max=5, sum=150.35 (33)", - "tab": "General information", - "score": 4.556060606060607 - }, - "RAFT - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (33)", - "tab": "General information", - "score": 0.0 - }, - "RAFT - # prompt tokens": { - "description": "min=257.35, mean=812.938, max=1773.675, sum=26826.95 (33)", - "tab": "General information", - "score": 812.937878787879 - }, - "RAFT - # output tokens": { - "description": "min=5, mean=14.276, max=30, sum=471.1 (33)", - "tab": "General information", - "score": 14.275757575757577 - }, - "RAFT - # trials": { - "description": "min=3, mean=3, max=3, sum=99 (33)", - "tab": "General information", - "score": 3.0 - }, - "RAFT - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - 
}, - "RAFT - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (33)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_classic/openai/GPT-NeoX-20B/899521d0-e5eb-4e1b-af5a-78b3bd32e232.json b/data/helm_classic/openai/GPT-NeoX-20B/899521d0-e5eb-4e1b-af5a-78b3bd32e232.json deleted file mode 100644 index 0c00ea05c..000000000 --- a/data/helm_classic/openai/GPT-NeoX-20B/899521d0-e5eb-4e1b-af5a-78b3bd32e232.json +++ /dev/null @@ -1,1613 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_classic/openai_GPT-NeoX-20B/1770834891.1472661", - "retrieved_timestamp": "1770834891.1472661", - "source_metadata": { - "source_name": "helm_classic", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "GPT-NeoX 20B", - "id": "openai/GPT-NeoX-20B", - "developer": "openai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_classic", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperform on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.351, - "details": { - "tab": "Accuracy", - "Mean win rate - Calibration": { - "description": null, - "tab": "Calibration", - "score": 0.4215761012322838 - }, - "Mean win rate - Robustness": { - "description": null, - "tab": "Robustness", - "score": 0.3361523348731358 - }, - "Mean win rate - Fairness": { - "description": null, - "tab": "Fairness", - "score": 0.3311530516202374 - }, - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.5141337719298246 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - }, - "Mean win rate - Bias": { - "description": null, - "tab": "Bias", - "score": 0.46836548983528487 - }, - "Mean win rate - Toxicity": { - "description": null, - "tab": "Toxicity", - "score": 0.36547434047434046 - }, - "Mean win rate - Summarization metrics": { - "description": null, - "tab": "Summarization metrics", - "score": 0.4456349206349206 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.276, - "details": { - "description": "min=0.21, mean=0.276, max=0.351, sum=4.146 (15)", - "tab": "Accuracy", - "MMLU - ECE (10-bin)": { - "description": "min=0.094, mean=0.122, max=0.145, sum=1.831 (15)", - "tab": "Calibration", - "score": 0.12205035764205192 - }, - "MMLU - EM (Robustness)": { - "description": 
"min=0.149, mean=0.189, max=0.24, sum=2.833 (15)", - "tab": "Robustness", - "score": 0.1888421052631579 - }, - "MMLU - EM (Fairness)": { - "description": "min=0.175, mean=0.215, max=0.26, sum=3.228 (15)", - "tab": "Fairness", - "score": 0.21518128654970764 - }, - "MMLU - Denoised inference time (s)": { - "description": "min=0.093, mean=0.133, max=0.275, sum=1.995 (15)", - "tab": "Efficiency", - "score": 0.1330090104470642 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=1542 (15)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=75 (15)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (15)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=358.76, mean=467.936, max=612.798, sum=7019.035 (15)", - "tab": "General information", - "score": 467.935649122807 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=15 (15)", - "tab": "General information", - "score": 1.0 - }, - "MMLU - # trials": { - "description": "min=3, mean=3, max=3, sum=45 (15)", - "tab": "General information", - "score": 3.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "BoolQ", - "source_data": { - "dataset_name": "BoolQ", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on BoolQ", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.683, - "details": { - "description": "min=0.659, mean=0.683, max=0.714, sum=2.048 (3)", - "tab": "Accuracy", - "BoolQ - ECE (10-bin)": { - "description": "min=0.168, mean=0.195, max=0.238, sum=0.585 (3)", - "tab": "Calibration", - "score": 0.19500535688345313 - }, - "BoolQ - EM (Robustness)": { - "description": "min=0.548, mean=0.551, max=0.556, sum=1.653 (3)", - "tab": "Robustness", - "score": 0.551 - }, - "BoolQ - EM (Fairness)": { - "description": "min=0.594, mean=0.609, max=0.629, sum=1.827 (3)", - "tab": "Fairness", - "score": 0.609 - }, - "BoolQ - Denoised inference time (s)": { - "description": "min=0.515, mean=0.773, max=1.206, sum=2.318 (3)", - "tab": "Efficiency", - "score": 0.772616056262233 - }, - "BoolQ - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "BoolQ - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "BoolQ - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "BoolQ - # prompt tokens": { - "description": "min=656.897, mean=913.897, max=1251.897, sum=2741.691 (3)", - "tab": "General information", - "score": 913.8969999999999 - }, - "BoolQ - # output tokens": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "BoolQ - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "BoolQ - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (race)": { 
- "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.599, - "details": { - "description": "min=0.558, mean=0.599, max=0.623, sum=1.797 (3)", - "tab": "Accuracy", - "NarrativeQA - ECE (10-bin)": { - "description": "min=0.2, mean=0.224, max=0.244, sum=0.672 (3)", - "tab": "Calibration", - "score": 0.2239646545151891 - }, - "NarrativeQA - F1 (Robustness)": { - "description": "min=0.378, mean=0.421, max=0.443, sum=1.263 (3)", - "tab": "Robustness", - "score": 0.4211068794456416 - }, - "NarrativeQA - F1 (Fairness)": { - "description": "min=0.419, mean=0.461, max=0.485, sum=1.382 (3)", - "tab": "Fairness", - "score": 0.46066534756418576 - }, - "NarrativeQA - Denoised inference time (s)": { - "description": "min=0.904, mean=1.468, max=1.998, sum=4.404 (3)", - "tab": "Efficiency", - "score": 1.4680144681286658 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=1065 (3)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=0.989, mean=1.568, max=1.969, sum=4.704 (3)", - "tab": "General information", - "score": 1.568075117370892 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=1607.893, mean=1641.033, max=1691.082, sum=4923.099 (3)", - "tab": "General information", - "score": 1641.0328638497651 - }, - "NarrativeQA - # output tokens": { - "description": "min=24.282, mean=40.047, max=54.028, sum=120.141 (3)", - "tab": "General information", - "score": 40.04694835680751 - }, - "NarrativeQA - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NarrativeQA - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=0.667 (1)", - "tab": "Bias", - "score": 0.6666666666666667 - }, - "NarrativeQA - Stereotypes (gender)": { - "description": "min=0.396, mean=0.449, max=0.5, sum=1.346 (3)", - "tab": "Bias", - "score": 0.44861111111111107 - }, - "NarrativeQA - Representation (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=2 (3)", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "NarrativeQA - Representation (gender)": { - "description": "min=0.159, mean=0.186, max=0.206, sum=0.557 (3)", - "tab": "Bias", - "score": 0.18579713036394171 - }, - "NarrativeQA - Toxic fraction": { - "description": "min=0.017, mean=0.022, max=0.025, sum=0.065 (3)", - "tab": "Toxicity", - "score": 0.0215962441314554 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (open-book)", - "source_data": { - "dataset_name": "NaturalQuestions (open-book)", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (open-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.596, - "details": { - "description": "min=0.581, mean=0.596, max=0.608, sum=1.788 (3)", - "tab": "Accuracy", - "NaturalQuestions (closed-book) - ECE (10-bin)": { - "description": "min=0.099, mean=0.103, max=0.106, sum=0.309 (3)", - "tab": "Calibration", - "score": 0.10315653555419742 - }, - "NaturalQuestions (open-book) - ECE (10-bin)": { - "description": "min=0.371, mean=0.373, max=0.375, sum=1.118 (3)", - "tab": "Calibration", - "score": 0.37278118995003706 - }, - "NaturalQuestions (closed-book) - F1 (Robustness)": { - "description": "min=0.125, mean=0.133, max=0.14, sum=0.398 (3)", - "tab": "Robustness", - "score": 0.1325934362402064 - }, - "NaturalQuestions (open-book) - F1 (Robustness)": { - "description": "min=0.429, mean=0.452, max=0.48, sum=1.357 (3)", - "tab": "Robustness", - "score": 0.4524359199313521 - }, - "NaturalQuestions (closed-book) - F1 (Fairness)": { - "description": "min=0.147, mean=0.154, max=0.158, sum=0.461 (3)", - "tab": "Fairness", - "score": 0.15381312093617092 - }, - "NaturalQuestions (open-book) - F1 (Fairness)": { - "description": "min=0.509, mean=0.525, max=0.537, sum=1.574 (3)", - "tab": "Fairness", - "score": 0.524698076718683 - }, - "NaturalQuestions (closed-book) - Denoised inference time (s)": { - "description": "min=0.381, mean=0.482, max=0.655, sum=1.447 (3)", - "tab": "Efficiency", - "score": 0.4823250982166127 - }, - "NaturalQuestions (open-book) - Denoised inference time (s)": { - "description": "min=1.913, mean=2.137, max=2.288, sum=6.411 (3)", - "tab": "Efficiency", - "score": 2.1369374864319965 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=110.299, mean=112.966, max=117.299, sum=338.897 (3)", - "tab": "General information", - "score": 112.96566666666668 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=77.379, mean=90.195, max=107.541, sum=270.584 (3)", - "tab": "General information", - "score": 90.19466666666666 - }, - "NaturalQuestions (closed-book) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.685, mean=4.704, max=4.723, sum=14.112 (3)", - "tab": "General information", - "score": 4.704 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.037, mean=0.037, max=0.037, sum=0.111 (3)", - "tab": "General information", - "score": 0.037 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1247.862, mean=1394.229, max=1495.552, 
sum=4182.688 (3)", - "tab": "General information", - "score": 1394.2293333333334 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=73.671, mean=87.693, max=98.984, sum=263.078 (3)", - "tab": "General information", - "score": 87.69266666666665 - }, - "NaturalQuestions (open-book) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NaturalQuestions (closed-book) - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NaturalQuestions (closed-book) - Stereotypes (gender)": { - "description": "min=0.5, mean=0.5, max=0.5, sum=1 (2)", - "tab": "Bias", - "score": 0.5 - }, - "NaturalQuestions (closed-book) - Representation (race)": { - "description": "min=0.309, mean=0.362, max=0.444, sum=1.086 (3)", - "tab": "Bias", - "score": 0.3621399176954732 - }, - "NaturalQuestions (closed-book) - Representation (gender)": { - "description": "min=0.233, mean=0.318, max=0.382, sum=0.954 (3)", - "tab": "Bias", - "score": 0.31784137078254726 - }, - "NaturalQuestions (open-book) - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=2 (3)", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "NaturalQuestions (open-book) - Stereotypes (gender)": { - "description": "min=0.5, mean=0.5, max=0.5, sum=1.5 (3)", - "tab": "Bias", - "score": 0.5 - }, - "NaturalQuestions (open-book) - Representation (race)": { - "description": "min=0.538, mean=0.57, max=0.59, sum=1.709 (3)", - "tab": "Bias", - "score": 0.5695499220251695 - }, - "NaturalQuestions (open-book) - Representation (gender)": { - "description": "min=0, mean=0.094, max=0.241, sum=0.283 (3)", - "tab": "Bias", - "score": 0.09428104575163399 - }, - "NaturalQuestions (closed-book) - Toxic fraction": { - "description": "min=0, mean=0.001, max=0.002, sum=0.003 (3)", - "tab": "Toxicity", - "score": 0.001 - }, - "NaturalQuestions (open-book) - Toxic fraction": { - "description": "min=0.001, mean=0.002, max=0.003, sum=0.006 (3)", - "tab": "Toxicity", - "score": 0.002 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "QuAC", - "source_data": { - "dataset_name": "QuAC", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on QuAC", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.326, - "details": { - "description": "min=0.32, mean=0.326, max=0.335, sum=0.979 (3)", - "tab": "Accuracy", - "QuAC - ECE (10-bin)": { - "description": "min=0.105, mean=0.115, max=0.129, sum=0.345 (3)", - "tab": "Calibration", - "score": 0.11494333135422596 - }, - "QuAC - F1 (Robustness)": { - "description": "min=0.176, mean=0.191, max=0.202, sum=0.574 (3)", - "tab": "Robustness", - "score": 0.19141062427574787 - }, - "QuAC - F1 (Fairness)": { - "description": "min=0.224, mean=0.232, max=0.243, sum=0.695 (3)", - "tab": "Fairness", - "score": 0.23177797124335245 - }, - "QuAC - Denoised inference time (s)": { - "description": "min=1.906, mean=2.025, max=2.127, sum=6.075 (3)", - "tab": "Efficiency", - "score": 2.024874148220674 - }, - "QuAC - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "QuAC - # train": { - "description": "min=0.804, mean=0.889, max=0.979, sum=2.666 
(3)", - "tab": "General information", - "score": 0.8886666666666666 - }, - "QuAC - truncated": { - "description": "min=0.021, mean=0.021, max=0.021, sum=0.063 (3)", - "tab": "General information", - "score": 0.021 - }, - "QuAC - # prompt tokens": { - "description": "min=1602.026, mean=1640.361, max=1663.349, sum=4921.083 (3)", - "tab": "General information", - "score": 1640.3609999999999 - }, - "QuAC - # output tokens": { - "description": "min=73.99, mean=77.489, max=80.665, sum=232.466 (3)", - "tab": "General information", - "score": 77.48866666666667 - }, - "QuAC - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "QuAC - Stereotypes (race)": { - "description": "min=0.606, mean=0.626, max=0.639, sum=1.877 (3)", - "tab": "Bias", - "score": 0.6257674787086551 - }, - "QuAC - Stereotypes (gender)": { - "description": "min=0.436, mean=0.448, max=0.455, sum=1.344 (3)", - "tab": "Bias", - "score": 0.4481503328194676 - }, - "QuAC - Representation (race)": { - "description": "min=0.319, mean=0.334, max=0.354, sum=1.003 (3)", - "tab": "Bias", - "score": 0.3344046827039365 - }, - "QuAC - Representation (gender)": { - "description": "min=0.258, mean=0.268, max=0.282, sum=0.804 (3)", - "tab": "Bias", - "score": 0.26793463346025864 - }, - "QuAC - Toxic fraction": { - "description": "min=0.001, mean=0.001, max=0.001, sum=0.003 (3)", - "tab": "Toxicity", - "score": 0.001 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "HellaSwag", - "source_data": { - "dataset_name": "HellaSwag", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on HellaSwag", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.718, - "details": { - "description": "min=0.718, mean=0.718, max=0.718, sum=0.718 (1)", - "tab": "Accuracy", - "HellaSwag - ECE (10-bin)": { - "description": "min=0.277, mean=0.277, max=0.277, sum=0.277 (1)", - "tab": "Calibration", - "score": 0.2773372160584027 - }, - "HellaSwag - EM (Robustness)": { - "description": "min=0.661, mean=0.661, max=0.661, sum=0.661 (1)", - "tab": "Robustness", - "score": 0.661 - }, - "HellaSwag - EM (Fairness)": { - "description": "min=0.552, mean=0.552, max=0.552, sum=0.552 (1)", - "tab": "Fairness", - "score": 0.552 - }, - "HellaSwag - Denoised inference time (s)": { - "description": "min=0.025, mean=0.025, max=0.025, sum=0.025 (1)", - "tab": "Efficiency", - "score": 0.025470768198370932 - }, - "HellaSwag - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "HellaSwag - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag - # prompt tokens": { - "description": "min=88.806, mean=88.806, max=88.806, sum=88.806 (1)", - "tab": "General information", - "score": 88.806 - }, - "HellaSwag - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - 
}, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.524, - "details": { - "description": "min=0.524, mean=0.524, max=0.524, sum=0.524 (1)", - "tab": "Accuracy", - "OpenbookQA - ECE (10-bin)": { - "description": "min=0.232, mean=0.232, max=0.232, sum=0.232 (1)", - "tab": "Calibration", - "score": 0.23249621701719156 - }, - "OpenbookQA - EM (Robustness)": { - "description": "min=0.414, mean=0.414, max=0.414, sum=0.414 (1)", - "tab": "Robustness", - "score": 0.414 - }, - "OpenbookQA - EM (Fairness)": { - "description": "min=0.438, mean=0.438, max=0.438, sum=0.438 (1)", - "tab": "Fairness", - "score": 0.438 - }, - "OpenbookQA - Denoised inference time (s)": { - "description": "min=0.024, mean=0.024, max=0.024, sum=0.024 (1)", - "tab": "Efficiency", - "score": 0.023963596328905958 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=5.346, mean=5.346, max=5.346, sum=5.346 (1)", - "tab": "General information", - "score": 5.346 - }, - "OpenbookQA - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "TruthfulQA", - "source_data": { - "dataset_name": "TruthfulQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on TruthfulQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.216, - "details": { - "description": "min=0.205, mean=0.216, max=0.225, sum=0.864 (4)", - "tab": "Accuracy", - "TruthfulQA - ECE (10-bin)": { - "description": "min=0.051, mean=0.058, max=0.068, sum=0.232 (4)", - "tab": "Calibration", - "score": 0.057891800582365614 - }, - "TruthfulQA - EM (Robustness)": { - "description": "min=0.144, mean=0.175, max=0.225, sum=0.7 (4)", - "tab": "Robustness", - "score": 0.17507645259938837 - }, - "TruthfulQA - EM (Fairness)": { - "description": "min=0.161, mean=0.179, max=0.225, sum=0.714 (4)", - "tab": "Fairness", - "score": 0.17851681957186544 - }, - "TruthfulQA - Denoised inference time (s)": { - "description": "min=0.029, mean=0.084, max=0.133, sum=0.335 (4)", - "tab": "Efficiency", - "score": 0.08375055263898766 - }, - "TruthfulQA - # eval": { - "description": "min=654, mean=654, max=654, sum=2616 (4)", - "tab": "General information", - "score": 654.0 - }, - "TruthfulQA - # train": { - 
"description": "min=0, mean=3.75, max=5, sum=15 (4)", - "tab": "General information", - "score": 3.75 - }, - "TruthfulQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (4)", - "tab": "General information", - "score": 0.0 - }, - "TruthfulQA - # prompt tokens": { - "description": "min=86.352, mean=406.102, max=532.352, sum=1624.407 (4)", - "tab": "General information", - "score": 406.10168195718654 - }, - "TruthfulQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=4 (4)", - "tab": "General information", - "score": 1.0 - }, - "TruthfulQA - # trials": { - "description": "min=1, mean=2.5, max=3, sum=10 (4)", - "tab": "General information", - "score": 2.5 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MS MARCO (TREC)", - "source_data": { - "dataset_name": "MS MARCO (TREC)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "NDCG@10 on MS MARCO (TREC)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.398, - "details": { - "description": "min=0.37, mean=0.398, max=0.436, sum=1.195 (3)", - "tab": "Accuracy", - "MS MARCO (regular) - RR@10 (Robustness)": { - "description": "min=0.082, mean=0.096, max=0.107, sum=0.288 (3)", - "tab": "Robustness", - "score": 0.09600105820105831 - }, - "MS MARCO (TREC) - NDCG@10 (Robustness)": { - "description": "min=0.338, mean=0.351, max=0.365, sum=1.053 (3)", - "tab": "Robustness", - "score": 0.3510422646487042 - }, - "MS MARCO (regular) - RR@10 (Fairness)": { - "description": "min=0.137, mean=0.148, max=0.163, sum=0.445 (3)", - "tab": "Fairness", - "score": 0.1483276455026454 - }, - "MS MARCO (TREC) - NDCG@10 (Fairness)": { - "description": "min=0.347, mean=0.381, max=0.416, sum=1.144 (3)", - "tab": "Fairness", - "score": 0.38125183165300675 - }, - "MS MARCO (regular) - Denoised inference time (s)": { - "description": "min=0.111, mean=0.118, max=0.128, sum=0.355 (3)", - "tab": "Efficiency", - "score": 0.11821914517316674 - }, - "MS MARCO (TREC) - Denoised inference time (s)": { - "description": "min=0.105, mean=0.116, max=0.127, sum=0.349 (3)", - "tab": "Efficiency", - "score": 0.11621723726407733 - }, - "MS MARCO (regular) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "MS MARCO (regular) - # train": { - "description": "min=2, mean=2, max=2, sum=6 (3)", - "tab": "General information", - "score": 2.0 - }, - "MS MARCO (regular) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "MS MARCO (regular) - # prompt tokens": { - "description": "min=499.575, mean=537.908, max=583.575, sum=1613.725 (3)", - "tab": "General information", - "score": 537.9083333333334 - }, - "MS MARCO (regular) - # output tokens": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "MS MARCO (regular) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "MS MARCO (TREC) - # eval": { - "description": "min=43, mean=43, max=43, sum=129 (3)", - "tab": "General information", - "score": 43.0 - }, - "MS MARCO (TREC) - # train": { - "description": "min=2, mean=2, max=2, sum=6 (3)", - "tab": "General 
information", - "score": 2.0 - }, - "MS MARCO (TREC) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "MS MARCO (TREC) - # prompt tokens": { - "description": "min=481.14, mean=519.473, max=565.14, sum=1558.419 (3)", - "tab": "General information", - "score": 519.4728682170543 - }, - "MS MARCO (TREC) - # output tokens": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "MS MARCO (TREC) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "MS MARCO (regular) - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - }, - "MS MARCO (TREC) - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CNN/DailyMail", - "source_data": { - "dataset_name": "CNN/DailyMail", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on CNN/DailyMail", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.123, - "details": { - "description": "min=0.108, mean=0.123, max=0.138, sum=0.738 (6)", - "tab": "Accuracy", - "CNN/DailyMail - Denoised inference time (s)": { - "description": "min=2.104, mean=2.133, max=2.168, sum=12.798 (6)", - "tab": "Efficiency", - "score": 2.133056901521097 - }, - "CNN/DailyMail - # eval": { - "description": "min=466, mean=466, max=466, sum=2796 (6)", - "tab": "General information", - "score": 466.0 - }, - "CNN/DailyMail - # train": { - "description": "min=5, mean=5, max=5, sum=30 (6)", - "tab": "General information", - "score": 5.0 - }, - "CNN/DailyMail - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (6)", - "tab": "General information", - "score": 0.0 - }, - "CNN/DailyMail - # prompt tokens": { - "description": "min=1561.275, mean=1582.608, max=1612.275, sum=9495.648 (6)", - "tab": "General information", - "score": 1582.6080114449214 - }, - "CNN/DailyMail - # 
output tokens": { - "description": "min=80.197, mean=80.409, max=80.588, sum=482.455 (6)", - "tab": "General information", - "score": 80.40915593705294 - }, - "CNN/DailyMail - # trials": { - "description": "min=3, mean=3, max=3, sum=18 (6)", - "tab": "General information", - "score": 3.0 - }, - "CNN/DailyMail - Stereotypes (race)": { - "description": "min=0.612, mean=0.616, max=0.62, sum=3.697 (6)", - "tab": "Bias", - "score": 0.6162431158667614 - }, - "CNN/DailyMail - Stereotypes (gender)": { - "description": "min=0.386, mean=0.41, max=0.431, sum=2.46 (6)", - "tab": "Bias", - "score": 0.4099353286102709 - }, - "CNN/DailyMail - Representation (race)": { - "description": "min=0.182, mean=0.289, max=0.35, sum=1.732 (6)", - "tab": "Bias", - "score": 0.288716873622534 - }, - "CNN/DailyMail - Representation (gender)": { - "description": "min=0.127, mean=0.149, max=0.168, sum=0.896 (6)", - "tab": "Bias", - "score": 0.14933277507884896 - }, - "CNN/DailyMail - Toxic fraction": { - "description": "min=0, mean=0.001, max=0.002, sum=0.009 (6)", - "tab": "Toxicity", - "score": 0.001430615164520744 - }, - "CNN/DailyMail - SummaC": { - "description": "min=-0.009, mean=0.165, max=0.255, sum=0.494 (3)", - "tab": "Summarization metrics", - "score": 0.16465107490254738 - }, - "CNN/DailyMail - QAFactEval": { - "description": "min=4.591, mean=4.69, max=4.763, sum=28.138 (6)", - "tab": "Summarization metrics", - "score": 4.689614935266213 - }, - "CNN/DailyMail - BERTScore (F1)": { - "description": "min=0.175, mean=0.226, max=0.262, sum=0.677 (3)", - "tab": "Summarization metrics", - "score": 0.2255769362361307 - }, - "CNN/DailyMail - Coverage": { - "description": "min=0.786, mean=0.91, max=0.973, sum=5.46 (6)", - "tab": "Summarization metrics", - "score": 0.910005755446767 - }, - "CNN/DailyMail - Density": { - "description": "min=35.834, mean=37.149, max=38.818, sum=222.893 (6)", - "tab": "Summarization metrics", - "score": 37.14890205441478 - }, - "CNN/DailyMail - Compression": { - "description": "min=9.164, mean=9.676, max=9.978, sum=58.057 (6)", - "tab": "Summarization metrics", - "score": 9.676104726319009 - }, - "CNN/DailyMail - HumanEval-faithfulness": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-relevance": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-coherence": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "XSUM", - "source_data": { - "dataset_name": "XSUM", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on XSUM", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.102, - "details": { - "description": "min=0.098, mean=0.102, max=0.105, sum=0.61 (6)", - "tab": "Accuracy", - "XSUM - Denoised inference time (s)": { - "description": "min=1.104, mean=1.116, max=1.135, sum=6.698 (6)", - "tab": "Efficiency", - "score": 1.1163698516910754 - }, - "XSUM - # eval": { - "description": "min=518, mean=518, max=518, sum=3108 (6)", - "tab": "General information", - "score": 518.0 - }, - "XSUM - # train": { - 
"description": "min=4.996, mean=4.997, max=5, sum=29.985 (6)", - "tab": "General information", - "score": 4.997425997425997 - }, - "XSUM - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (6)", - "tab": "General information", - "score": 0.0 - }, - "XSUM - # prompt tokens": { - "description": "min=1487.131, mean=1545.148, max=1574.17, sum=9270.888 (6)", - "tab": "General information", - "score": 1545.148005148005 - }, - "XSUM - # output tokens": { - "description": "min=24.871, mean=25.402, max=26.143, sum=152.413 (6)", - "tab": "General information", - "score": 25.402187902187904 - }, - "XSUM - # trials": { - "description": "min=3, mean=3, max=3, sum=18 (6)", - "tab": "General information", - "score": 3.0 - }, - "XSUM - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=4 (6)", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "XSUM - Stereotypes (gender)": { - "description": "min=0.449, mean=0.449, max=0.449, sum=2.694 (6)", - "tab": "Bias", - "score": 0.4490600226000671 - }, - "XSUM - Representation (race)": { - "description": "min=0.483, mean=0.526, max=0.565, sum=3.158 (6)", - "tab": "Bias", - "score": 0.5263835263835264 - }, - "XSUM - Representation (gender)": { - "description": "min=0.132, mean=0.162, max=0.184, sum=0.972 (6)", - "tab": "Bias", - "score": 0.16191706040214252 - }, - "XSUM - Toxic fraction": { - "description": "min=0, mean=0.002, max=0.004, sum=0.012 (6)", - "tab": "Toxicity", - "score": 0.0019305019305019308 - }, - "XSUM - SummaC": { - "description": "min=-0.22, mean=-0.208, max=-0.2, sum=-0.625 (3)", - "tab": "Summarization metrics", - "score": -0.2082928215061222 - }, - "XSUM - QAFactEval": { - "description": "min=3.048, mean=3.303, max=3.621, sum=19.818 (6)", - "tab": "Summarization metrics", - "score": 3.302964744932122 - }, - "XSUM - BERTScore (F1)": { - "description": "min=0.385, mean=0.391, max=0.395, sum=1.174 (3)", - "tab": "Summarization metrics", - "score": 0.39129907447599627 - }, - "XSUM - Coverage": { - "description": "min=0.822, mean=0.825, max=0.83, sum=4.948 (6)", - "tab": "Summarization metrics", - "score": 0.8247285888112758 - }, - "XSUM - Density": { - "description": "min=3.228, mean=3.371, max=3.613, sum=20.226 (6)", - "tab": "Summarization metrics", - "score": 3.3710531876366 - }, - "XSUM - Compression": { - "description": "min=17.631, mean=18.238, max=18.621, sum=109.428 (6)", - "tab": "Summarization metrics", - "score": 18.23798025069092 - }, - "XSUM - HumanEval-faithfulness": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-relevance": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-coherence": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "IMDB", - "source_data": { - "dataset_name": "IMDB", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on IMDB", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.948, - "details": { - "description": "min=0.946, mean=0.948, max=0.95, sum=2.844 (3)", - "tab": "Accuracy", - "IMDB - ECE 
(10-bin)": { - "description": "min=0.189, mean=0.23, max=0.269, sum=0.69 (3)", - "tab": "Calibration", - "score": 0.22988586030197733 - }, - "IMDB - EM (Robustness)": { - "description": "min=0.906, mean=0.912, max=0.921, sum=2.736 (3)", - "tab": "Robustness", - "score": 0.9119999999999999 - }, - "IMDB - EM (Fairness)": { - "description": "min=0.925, mean=0.928, max=0.933, sum=2.785 (3)", - "tab": "Fairness", - "score": 0.9283333333333333 - }, - "IMDB - Denoised inference time (s)": { - "description": "min=0.748, mean=0.862, max=1.078, sum=2.586 (3)", - "tab": "Efficiency", - "score": 0.862092325799332 - }, - "IMDB - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "IMDB - # train": { - "description": "min=4.842, mean=4.93, max=4.981, sum=14.789 (3)", - "tab": "General information", - "score": 4.929666666666667 - }, - "IMDB - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "IMDB - # prompt tokens": { - "description": "min=1162.003, mean=1398.09, max=1750.717, sum=4194.271 (3)", - "tab": "General information", - "score": 1398.0903333333333 - }, - "IMDB - # output tokens": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "IMDB - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "IMDB - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CivilComments", - "source_data": { - "dataset_name": "CivilComments", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on CivilComments", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.516, - "details": { - "description": "min=0, mean=0.516, max=1, sum=27.878 (54)", - "tab": "Accuracy", - "CivilComments - ECE (10-bin)": { - "description": "min=0.26, mean=0.444, max=0.593, sum=23.994 (54)", - "tab": "Calibration", - "score": 0.4443373993811643 - }, - "CivilComments - EM (Robustness)": { - "description": "min=0, mean=0.48, max=1, sum=25.9 (54)", - "tab": "Robustness", - "score": 0.4796354739742704 - }, - "CivilComments - EM (Fairness)": { - "description": "min=0, mean=0.491, max=1, sum=26.497 (54)", - "tab": "Fairness", - "score": 0.4906931444587031 - }, - "CivilComments - Denoised inference time (s)": { - "description": "min=0.253, mean=0.408, max=0.906, sum=22.04 (54)", - "tab": "Efficiency", - "score": 0.4081493504712871 - }, - "CivilComments - # eval": { - "description": "min=74, mean=371.556, max=683, sum=20064 (54)", - "tab": "General 
information", - "score": 371.55555555555554 - }, - "CivilComments - # train": { - "description": "min=5, mean=5, max=5, sum=270 (54)", - "tab": "General information", - "score": 5.0 - }, - "CivilComments - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (54)", - "tab": "General information", - "score": 0.0 - }, - "CivilComments - # prompt tokens": { - "description": "min=360.976, mean=726.728, max=1282.4, sum=39243.315 (54)", - "tab": "General information", - "score": 726.7280588093369 - }, - "CivilComments - # output tokens": { - "description": "min=5, mean=5, max=5, sum=270 (54)", - "tab": "General information", - "score": 5.0 - }, - "CivilComments - # trials": { - "description": "min=3, mean=3, max=3, sum=162 (54)", - "tab": "General information", - "score": 3.0 - }, - "CivilComments - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (54)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "RAFT", - "source_data": { - "dataset_name": "RAFT", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on RAFT", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.505, - "details": { - "description": "min=0.025, mean=0.505, max=0.975, sum=16.65 (33)", - "tab": "Accuracy", - "RAFT - ECE (10-bin)": { - "description": "min=0.067, mean=0.324, max=0.975, sum=10.705 (33)", - "tab": "Calibration", - "score": 0.3243919141625793 - }, - "RAFT - EM (Robustness)": { - "description": "min=0, mean=0.399, max=0.975, sum=13.175 (33)", - "tab": "Robustness", - "score": 0.39924242424242423 - }, - "RAFT - EM (Fairness)": { - "description": "min=0.025, mean=0.475, max=0.975, sum=15.675 (33)", - "tab": "Fairness", - "score": 0.47500000000000003 - }, - "RAFT - Denoised inference time (s)": { - "description": "min=0.16, mean=1.156, max=2.589, sum=38.155 (33)", - "tab": "Efficiency", - "score": 1.1562087950381366 - }, - "RAFT - # eval": { - "description": "min=40, mean=40, max=40, sum=1320 (33)", - "tab": "General information", - "score": 40.0 - }, - "RAFT - # train": { - "description": "min=0, mean=4.56, max=5, sum=150.475 (33)", - "tab": "General information", - "score": 4.5598484848484855 - }, - "RAFT - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (33)", - "tab": "General information", - "score": 0.0 - }, - "RAFT - # prompt tokens": { - "description": "min=269.35, mean=807.97, max=1764, sum=26663.0 (33)", - "tab": "General information", - "score": 807.9696969696969 - }, - "RAFT - # output tokens": { - "description": "min=5, mean=13.945, max=30, sum=460.2 (33)", - "tab": "General information", - "score": 13.945454545454545 - }, - "RAFT - # trials": { - "description": "min=3, mean=3, max=3, sum=99 (33)", - "tab": "General information", - "score": 3.0 - }, - "RAFT - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": 
null - }, - "RAFT - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (33)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_classic/openai/ada-350M/1fb2c6db-2495-4609-a96b-57815c579953.json b/data/helm_classic/openai/ada-350M/1fb2c6db-2495-4609-a96b-57815c579953.json deleted file mode 100644 index 5355ce78b..000000000 --- a/data/helm_classic/openai/ada-350M/1fb2c6db-2495-4609-a96b-57815c579953.json +++ /dev/null @@ -1,1613 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_classic/openai_ada-350M/1770834891.1472661", - "retrieved_timestamp": "1770834891.1472661", - "source_metadata": { - "source_name": "helm_classic", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ada 350M", - "id": "openai/ada-350M", - "developer": "openai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_classic", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperform on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.108, - "details": { - "tab": "Accuracy", - "Mean win rate - Calibration": { - "description": null, - "tab": "Calibration", - "score": 0.6164902182478501 - }, - "Mean win rate - Robustness": { - "description": null, - "tab": "Robustness", - "score": 0.10196623917424807 - }, - "Mean win rate - Fairness": { - "description": null, - "tab": "Fairness", - "score": 0.10483119031506129 - }, - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.7698300438596491 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - }, - "Mean win rate - Bias": { - "description": null, - "tab": "Bias", - "score": 0.4272126112641924 - }, - "Mean win rate - Toxicity": { - "description": null, - "tab": "Toxicity", - "score": 0.30052416719083386 - }, - "Mean win rate - Summarization metrics": { - "description": null, - "tab": "Summarization metrics", - "score": 0.23114035087719298 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.243, - "details": { - "description": "min=0.132, mean=0.243, max=0.32, sum=3.641 (15)", - "tab": "Accuracy", - "MMLU - ECE (10-bin)": { - "description": "min=0.049, mean=0.128, max=0.186, sum=1.923 (15)", - "tab": "Calibration", 
- "score": 0.1282115692539908 - }, - "MMLU - EM (Robustness)": { - "description": "min=0.105, mean=0.204, max=0.28, sum=3.054 (15)", - "tab": "Robustness", - "score": 0.20357894736842103 - }, - "MMLU - EM (Fairness)": { - "description": "min=0.053, mean=0.21, max=0.31, sum=3.155 (15)", - "tab": "Fairness", - "score": 0.2103157894736842 - }, - "MMLU - Denoised inference time (s)": { - "description": "min=0.14, mean=0.14, max=0.141, sum=2.103 (15)", - "tab": "Efficiency", - "score": 0.1402282775493421 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=1542 (15)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=75 (15)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (15)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=371.38, mean=472.274, max=624.07, sum=7084.111 (15)", - "tab": "General information", - "score": 472.2740350877193 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=15 (15)", - "tab": "General information", - "score": 1.0 - }, - "MMLU - # trials": { - "description": "min=3, mean=3, max=3, sum=45 (15)", - "tab": "General information", - "score": 3.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "BoolQ", - "source_data": { - "dataset_name": "BoolQ", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on BoolQ", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.581, - "details": { - "description": "min=0.525, mean=0.581, max=0.627, sum=1.743 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Accuracy", - "BoolQ - ECE (10-bin)": { - "description": "min=0.049, mean=0.067, max=0.09, sum=0.2 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Calibration", - "score": 0.06655133808072823 - }, - "BoolQ - EM (Robustness)": { - "description": "min=0.349, mean=0.461, max=0.549, sum=1.383 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Robustness", - "score": 0.461 - }, - "BoolQ - EM (Fairness)": { - "description": "min=0.421, mean=0.507, max=0.575, sum=1.52 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. 
For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Fairness", - "score": 0.5066666666666667 - }, - "BoolQ - Denoised inference time (s)": { - "description": "min=0.14, mean=0.141, max=0.141, sum=0.422 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Efficiency", - "score": 0.14052770182291666 - }, - "BoolQ - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 1000.0 - }, - "BoolQ - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 5.0 - }, - "BoolQ - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.0 - }, - "BoolQ - # prompt tokens": { - "description": "min=660.073, mean=908.406, max=1242.073, sum=2725.219 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 908.4063333333334 - }, - "BoolQ - # output tokens": { - "description": "min=1, mean=1.004, max=1.008, sum=3.012 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 1.004 - }, - "BoolQ - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. 
For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 3.0 - }, - "BoolQ - Stereotypes (race)": { - "description": "(0)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": null - }, - "BoolQ - Stereotypes (gender)": { - "description": "(0)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (race)": { - "description": "(0)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (gender)": { - "description": "(0)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": null - }, - "BoolQ - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. 
See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.326, - "details": { - "description": "min=0.311, mean=0.326, max=0.35, sum=0.978 (3)", - "tab": "Accuracy", - "NarrativeQA - ECE (10-bin)": { - "description": "min=0.034, mean=0.046, max=0.064, sum=0.138 (3)", - "tab": "Calibration", - "score": 0.04605131521940172 - }, - "NarrativeQA - F1 (Robustness)": { - "description": "min=0.094, mean=0.104, max=0.11, sum=0.312 (3)", - "tab": "Robustness", - "score": 0.10413260236022294 - }, - "NarrativeQA - F1 (Fairness)": { - "description": "min=0.191, mean=0.205, max=0.221, sum=0.616 (3)", - "tab": "Fairness", - "score": 0.20535614023925777 - }, - "NarrativeQA - Denoised inference time (s)": { - "description": "min=0.203, mean=0.211, max=0.224, sum=0.632 (3)", - "tab": "Efficiency", - "score": 0.21074192341549294 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=1065 (3)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=1.051, mean=1.647, max=2.085, sum=4.941 (3)", - "tab": "General information", - "score": 1.6469483568075116 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=1601.955, mean=1652.377, max=1705.003, sum=4957.132 (3)", - "tab": "General information", - "score": 1652.3774647887324 - }, - "NarrativeQA - # output tokens": { - "description": "min=11.13, mean=12.381, max=14.623, sum=37.144 (3)", - "tab": "General information", - "score": 12.381220657276996 - }, - "NarrativeQA - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NarrativeQA - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=1.333 (2)", - "tab": "Bias", - "score": 0.6666666666666667 - }, - "NarrativeQA - Stereotypes (gender)": { - "description": "min=0.415, mean=0.444, max=0.464, sum=1.333 (3)", - "tab": "Bias", - "score": 0.44422611988401467 - }, - "NarrativeQA - Representation (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=1.333 (2)", - "tab": "Bias", - "score": 0.6666666666666667 - }, - "NarrativeQA - Representation (gender)": { - "description": "min=0.074, mean=0.132, max=0.198, sum=0.397 (3)", - "tab": "Bias", - "score": 0.13244266197852694 - }, - "NarrativeQA - Toxic fraction": { - "description": "min=0.025, mean=0.03, max=0.037, sum=0.09 (3)", - "tab": "Toxicity", - "score": 0.030046948356807508 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (open-book)", - "source_data": { - "dataset_name": "NaturalQuestions (open-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on 
NaturalQuestions (open-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.365, - "details": { - "description": "min=0.35, mean=0.365, max=0.379, sum=1.095 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Accuracy", - "NaturalQuestions (closed-book) - ECE (10-bin)": { - "description": "min=0.024, mean=0.028, max=0.034, sum=0.083 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Calibration", - "score": 0.02767630939495112 - }, - "NaturalQuestions (open-book) - ECE (10-bin)": { - "description": "min=0.168, mean=0.18, max=0.188, sum=0.539 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Calibration", - "score": 0.17953919898525875 - }, - "NaturalQuestions (closed-book) - F1 (Robustness)": { - "description": "min=0.029, mean=0.031, max=0.033, sum=0.092 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Robustness", - "score": 0.030523107267064337 - }, - "NaturalQuestions (open-book) - F1 (Robustness)": { - "description": "min=0.042, mean=0.043, max=0.044, sum=0.129 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Robustness", - "score": 0.04293332221345858 - }, - "NaturalQuestions (closed-book) - F1 (Fairness)": { - "description": "min=0.054, mean=0.057, max=0.061, sum=0.171 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Fairness", - "score": 0.057147528877813734 - }, - "NaturalQuestions (open-book) - F1 (Fairness)": { - "description": "min=0.269, mean=0.273, max=0.278, sum=0.82 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. 
For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Fairness", - "score": 0.2734675120722885 - }, - "NaturalQuestions (closed-book) - Denoised inference time (s)": { - "description": "min=0.162, mean=0.167, max=0.171, sum=0.5 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Efficiency", - "score": 0.16660095312500048 - }, - "NaturalQuestions (open-book) - Denoised inference time (s)": { - "description": "min=0.259, mean=0.271, max=0.277, sum=0.812 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Efficiency", - "score": 0.27051720963541687 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=110.254, mean=112.254, max=116.254, sum=336.762 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. 
See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 112.254 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=4.865, mean=5.656, max=6.378, sum=16.969 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 5.656333333333333 - }, - "NaturalQuestions (closed-book) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 3.0 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.647, mean=4.691, max=4.724, sum=14.074 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 4.691333333333334 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.036, mean=0.036, max=0.036, sum=0.108 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.036 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1231.212, mean=1419.574, max=1523.257, sum=4258.721 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 1419.5736666666664 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=20.643, mean=22.436, max=23.53, sum=67.308 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. 
For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 22.436000000000003 - }, - "NaturalQuestions (open-book) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 3.0 - }, - "NaturalQuestions (closed-book) - Stereotypes (race)": { - "description": "(0)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": null - }, - "NaturalQuestions (closed-book) - Stereotypes (gender)": { - "description": "min=0.5, mean=0.5, max=0.5, sum=1.5 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.5 - }, - "NaturalQuestions (closed-book) - Representation (race)": { - "description": "min=0.067, mean=0.284, max=0.429, sum=0.852 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.2838533114395183 - }, - "NaturalQuestions (closed-book) - Representation (gender)": { - "description": "min=0.167, mean=0.281, max=0.404, sum=0.843 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.2809020267563887 - }, - "NaturalQuestions (open-book) - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=2 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "NaturalQuestions (open-book) - Stereotypes (gender)": { - "description": "min=0.487, mean=0.496, max=0.5, sum=1.487 (3)\nâš  Brown et al. 
perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.4955194805194805 - }, - "NaturalQuestions (open-book) - Representation (race)": { - "description": "min=0.401, mean=0.466, max=0.574, sum=1.399 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.46622237638437936 - }, - "NaturalQuestions (open-book) - Representation (gender)": { - "description": "min=0.308, mean=0.333, max=0.361, sum=0.998 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.33253136409012896 - }, - "NaturalQuestions (closed-book) - Toxic fraction": { - "description": "min=0, mean=0.001, max=0.001, sum=0.002 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Toxicity", - "score": 0.0006666666666666666 - }, - "NaturalQuestions (open-book) - Toxic fraction": { - "description": "min=0.001, mean=0.002, max=0.004, sum=0.007 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Toxicity", - "score": 0.0023333333333333335 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "QuAC", - "source_data": { - "dataset_name": "QuAC", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on QuAC", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.242, - "details": { - "description": "min=0.226, mean=0.242, max=0.267, sum=0.725 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. 
See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Accuracy", - "QuAC - ECE (10-bin)": { - "description": "min=0.022, mean=0.039, max=0.059, sum=0.118 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Calibration", - "score": 0.039442503431989094 - }, - "QuAC - F1 (Robustness)": { - "description": "min=0.082, mean=0.092, max=0.098, sum=0.275 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Robustness", - "score": 0.09165527832991893 - }, - "QuAC - F1 (Fairness)": { - "description": "min=0.15, mean=0.166, max=0.187, sum=0.497 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Fairness", - "score": 0.16579958101328882 - }, - "QuAC - Denoised inference time (s)": { - "description": "min=0.253, mean=0.27, max=0.28, sum=0.811 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Efficiency", - "score": 0.2701784687500001 - }, - "QuAC - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 1000.0 - }, - "QuAC - # train": { - "description": "min=0.845, mean=0.944, max=1.086, sum=2.833 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.9443333333333334 - }, - "QuAC - truncated": { - "description": "min=0.016, mean=0.016, max=0.016, sum=0.048 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. 
See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.016 - }, - "QuAC - # prompt tokens": { - "description": "min=1625.523, mean=1644.831, max=1670.605, sum=4934.492 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 1644.8306666666667 - }, - "QuAC - # output tokens": { - "description": "min=19.431, mean=22.281, max=23.851, sum=66.844 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 22.281333333333333 - }, - "QuAC - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 3.0 - }, - "QuAC - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=2 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "QuAC - Stereotypes (gender)": { - "description": "min=0.437, mean=0.452, max=0.465, sum=1.355 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.4515937058073862 - }, - "QuAC - Representation (race)": { - "description": "min=0.269, mean=0.341, max=0.377, sum=1.022 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.3407089337701805 - }, - "QuAC - Representation (gender)": { - "description": "min=0.195, mean=0.209, max=0.237, sum=0.627 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. 
See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.2091296383711505 - }, - "QuAC - Toxic fraction": { - "description": "min=0.002, mean=0.003, max=0.004, sum=0.008 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Toxicity", - "score": 0.0026666666666666666 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "HellaSwag", - "source_data": { - "dataset_name": "HellaSwag", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on HellaSwag", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.435, - "details": { - "description": "min=0.435, mean=0.435, max=0.435, sum=0.435 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Accuracy", - "HellaSwag - ECE (10-bin)": { - "description": "min=0.057, mean=0.057, max=0.057, sum=0.057 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Calibration", - "score": 0.057406609088416535 - }, - "HellaSwag - EM (Robustness)": { - "description": "min=0.37, mean=0.37, max=0.37, sum=0.37 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Robustness", - "score": 0.37 - }, - "HellaSwag - EM (Fairness)": { - "description": "min=0.294, mean=0.294, max=0.294, sum=0.294 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Fairness", - "score": 0.294 - }, - "HellaSwag - Denoised inference time (s)": { - "description": "min=0.138, mean=0.138, max=0.138, sum=0.138 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. 
See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Efficiency", - "score": 0.13805987500000028 - }, - "HellaSwag - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 1000.0 - }, - "HellaSwag - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag - # prompt tokens": { - "description": "min=87.888, mean=87.888, max=87.888, sum=87.888 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 87.888 - }, - "HellaSwag - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. 
See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.38, - "details": { - "description": "min=0.38, mean=0.38, max=0.38, sum=0.38 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Accuracy", - "OpenbookQA - ECE (10-bin)": { - "description": "min=0.346, mean=0.346, max=0.346, sum=0.346 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Calibration", - "score": 0.3457887658657961 - }, - "OpenbookQA - EM (Robustness)": { - "description": "min=0.27, mean=0.27, max=0.27, sum=0.27 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Robustness", - "score": 0.27 - }, - "OpenbookQA - EM (Fairness)": { - "description": "min=0.318, mean=0.318, max=0.318, sum=0.318 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Fairness", - "score": 0.318 - }, - "OpenbookQA - Denoised inference time (s)": { - "description": "min=0.136, mean=0.136, max=0.136, sum=0.136 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Efficiency", - "score": 0.13612351562500047 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. 
See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=5.27, mean=5.27, max=5.27, sum=5.27 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 5.27 - }, - "OpenbookQA - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. 
See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "TruthfulQA", - "source_data": { - "dataset_name": "TruthfulQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on TruthfulQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.215, - "details": { - "description": "min=0.206, mean=0.215, max=0.222, sum=0.645 (3)", - "tab": "Accuracy", - "TruthfulQA - ECE (10-bin)": { - "description": "min=0.06, mean=0.071, max=0.086, sum=0.213 (3)", - "tab": "Calibration", - "score": 0.07105251349575469 - }, - "TruthfulQA - EM (Robustness)": { - "description": "min=0.154, mean=0.167, max=0.179, sum=0.502 (3)", - "tab": "Robustness", - "score": 0.1671763506625892 - }, - "TruthfulQA - EM (Fairness)": { - "description": "min=0.18, mean=0.185, max=0.187, sum=0.554 (3)", - "tab": "Fairness", - "score": 0.18450560652395517 - }, - "TruthfulQA - Denoised inference time (s)": { - "description": "min=0.14, mean=0.141, max=0.141, sum=0.422 (3)", - "tab": "Efficiency", - "score": 0.14062155366016812 - }, - "TruthfulQA - # eval": { - "description": "min=654, mean=654, max=654, sum=1962 (3)", - "tab": "General information", - "score": 654.0 - }, - "TruthfulQA - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "TruthfulQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "TruthfulQA - # prompt tokens": { - "description": "min=501.121, mean=511.121, max=529.121, sum=1533.362 (3)", - "tab": "General information", - "score": 511.12079510703364 - }, - "TruthfulQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=3 (3)", - "tab": "General information", - "score": 1.0 - }, - "TruthfulQA - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MS MARCO (TREC)", - "source_data": { - "dataset_name": "MS MARCO (TREC)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "NDCG@10 on MS MARCO (TREC)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.29, - "details": { - "description": "min=0.184, mean=0.29, max=0.427, sum=0.871 (3)", - "tab": "Accuracy", - "MS MARCO (regular) - RR@10 (Robustness)": { - "description": "min=0.039, mean=0.072, max=0.111, sum=0.215 (3)", - "tab": "Robustness", - "score": 0.07152063492063503 - }, - "MS MARCO (TREC) - NDCG@10 (Robustness)": { - "description": "min=0.148, mean=0.247, max=0.358, sum=0.741 (3)", - "tab": "Robustness", - "score": 0.24715427563243078 - }, - "MS MARCO (regular) - RR@10 (Fairness)": { - "description": "min=0.051, mean=0.086, max=0.134, sum=0.258 (3)", - "tab": "Fairness", - "score": 0.08609259259259262 - }, - "MS MARCO (TREC) - NDCG@10 (Fairness)": { - "description": "min=0.17, mean=0.268, max=0.399, sum=0.804 (3)", - "tab": 
"Fairness", - "score": 0.267882893215826 - }, - "MS MARCO (regular) - Denoised inference time (s)": { - "description": "min=0.14, mean=0.142, max=0.143, sum=0.425 (3)", - "tab": "Efficiency", - "score": 0.14154662890625005 - }, - "MS MARCO (TREC) - Denoised inference time (s)": { - "description": "min=0.14, mean=0.142, max=0.142, sum=0.425 (3)", - "tab": "Efficiency", - "score": 0.14153152252906978 - }, - "MS MARCO (regular) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "MS MARCO (regular) - # train": { - "description": "min=2, mean=2, max=2, sum=6 (3)", - "tab": "General information", - "score": 2.0 - }, - "MS MARCO (regular) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "MS MARCO (regular) - # prompt tokens": { - "description": "min=495.232, mean=532.565, max=577.232, sum=1597.696 (3)", - "tab": "General information", - "score": 532.5653333333333 - }, - "MS MARCO (regular) - # output tokens": { - "description": "min=1.059, mean=1.219, max=1.379, sum=3.656 (3)", - "tab": "General information", - "score": 1.2186666666666666 - }, - "MS MARCO (regular) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "MS MARCO (TREC) - # eval": { - "description": "min=43, mean=43, max=43, sum=129 (3)", - "tab": "General information", - "score": 43.0 - }, - "MS MARCO (TREC) - # train": { - "description": "min=2, mean=2, max=2, sum=6 (3)", - "tab": "General information", - "score": 2.0 - }, - "MS MARCO (TREC) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "MS MARCO (TREC) - # prompt tokens": { - "description": "min=478.488, mean=515.822, max=560.488, sum=1547.465 (3)", - "tab": "General information", - "score": 515.8217054263565 - }, - "MS MARCO (TREC) - # output tokens": { - "description": "min=1.093, mean=1.171, max=1.209, sum=3.512 (3)", - "tab": "General information", - "score": 1.1705426356589146 - }, - "MS MARCO (TREC) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "MS MARCO (regular) - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": 
"Toxicity", - "score": null - }, - "MS MARCO (TREC) - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CNN/DailyMail", - "source_data": { - "dataset_name": "CNN/DailyMail", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on CNN/DailyMail", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.09, - "details": { - "description": "min=0.046, mean=0.09, max=0.116, sum=0.541 (6)", - "tab": "Accuracy", - "CNN/DailyMail - Denoised inference time (s)": { - "description": "min=0.494, mean=0.598, max=0.669, sum=3.587 (6)", - "tab": "Efficiency", - "score": 0.5978011528746431 - }, - "CNN/DailyMail - # eval": { - "description": "min=466, mean=466, max=466, sum=2796 (6)", - "tab": "General information", - "score": 466.0 - }, - "CNN/DailyMail - # train": { - "description": "min=5, mean=5, max=5, sum=30 (6)", - "tab": "General information", - "score": 5.0 - }, - "CNN/DailyMail - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (6)", - "tab": "General information", - "score": 0.0 - }, - "CNN/DailyMail - # prompt tokens": { - "description": "min=1531.586, mean=1549.919, max=1567.586, sum=9299.515 (6)", - "tab": "General information", - "score": 1549.9191702432045 - }, - "CNN/DailyMail - # output tokens": { - "description": "min=59.695, mean=76.958, max=88.815, sum=461.747 (6)", - "tab": "General information", - "score": 76.95779685264664 - }, - "CNN/DailyMail - # trials": { - "description": "min=3, mean=3, max=3, sum=18 (6)", - "tab": "General information", - "score": 3.0 - }, - "CNN/DailyMail - Stereotypes (race)": { - "description": "min=0.598, mean=0.628, max=0.667, sum=3.769 (6)", - "tab": "Bias", - "score": 0.6280987623495909 - }, - "CNN/DailyMail - Stereotypes (gender)": { - "description": "min=0.361, mean=0.403, max=0.447, sum=2.416 (6)", - "tab": "Bias", - "score": 0.4025937932369326 - }, - "CNN/DailyMail - Representation (race)": { - "description": "min=0.275, mean=0.297, max=0.329, sum=1.782 (6)", - "tab": "Bias", - "score": 0.2969968830498775 - }, - "CNN/DailyMail - Representation (gender)": { - "description": "min=0.109, mean=0.134, max=0.15, sum=0.804 (6)", - "tab": "Bias", - "score": 0.13397007527013516 - }, - "CNN/DailyMail - Toxic fraction": { - "description": "min=0, mean=0.001, max=0.002, sum=0.004 (6)", - "tab": "Toxicity", - "score": 0.000715307582260372 - }, - "CNN/DailyMail - SummaC": { - "description": "min=0.007, mean=0.169, max=0.28, sum=0.506 (3)", - "tab": "Summarization metrics", - "score": 0.1685268875223913 - }, - "CNN/DailyMail - QAFactEval": { - "description": "min=3.028, mean=3.742, max=4.119, sum=22.454 (6)", - "tab": "Summarization metrics", - "score": 3.742251717543341 - }, - "CNN/DailyMail - BERTScore (F1)": { - "description": "min=-0.233, mean=0.026, max=0.191, sum=0.079 (3)", - "tab": "Summarization metrics", - "score": 0.02646359689379031 - }, - "CNN/DailyMail - Coverage": { - "description": "min=0.551, mean=0.773, max=0.886, sum=4.64 (6)", - "tab": "Summarization metrics", - "score": 0.7733298424406031 - }, - "CNN/DailyMail - Density": { - "description": "min=18.265, mean=36.596, max=52.461, sum=219.577 (6)", - "tab": "Summarization metrics", - 
"score": 36.59619529550019 - }, - "CNN/DailyMail - Compression": { - "description": "min=9.827, mean=12.07, max=15.425, sum=72.42 (6)", - "tab": "Summarization metrics", - "score": 12.070019676025145 - }, - "CNN/DailyMail - HumanEval-faithfulness": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-relevance": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-coherence": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "XSUM", - "source_data": { - "dataset_name": "XSUM", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on XSUM", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.022, - "details": { - "description": "min=0.012, mean=0.022, max=0.034, sum=0.134 (6)", - "tab": "Accuracy", - "XSUM - Denoised inference time (s)": { - "description": "min=0.194, mean=0.237, max=0.271, sum=1.423 (6)", - "tab": "Efficiency", - "score": 0.23717034165862286 - }, - "XSUM - # eval": { - "description": "min=518, mean=518, max=518, sum=3108 (6)", - "tab": "General information", - "score": 518.0 - }, - "XSUM - # train": { - "description": "min=4.998, mean=4.999, max=5, sum=29.992 (6)", - "tab": "General information", - "score": 4.998712998712999 - }, - "XSUM - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (6)", - "tab": "General information", - "score": 0.0 - }, - "XSUM - # prompt tokens": { - "description": "min=1456.402, mean=1510.418, max=1538.921, sum=9062.51 (6)", - "tab": "General information", - "score": 1510.4182754182755 - }, - "XSUM - # output tokens": { - "description": "min=9.643, mean=16.878, max=22.542, sum=101.27 (6)", - "tab": "General information", - "score": 16.878378378378375 - }, - "XSUM - # trials": { - "description": "min=3, mean=3, max=3, sum=18 (6)", - "tab": "General information", - "score": 3.0 - }, - "XSUM - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=4 (6)", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "XSUM - Stereotypes (gender)": { - "description": "min=0.383, mean=0.412, max=0.438, sum=2.474 (6)", - "tab": "Bias", - "score": 0.4122685185185186 - }, - "XSUM - Representation (race)": { - "description": "min=0.467, mean=0.558, max=0.667, sum=3.35 (6)", - "tab": "Bias", - "score": 0.5583333333333335 - }, - "XSUM - Representation (gender)": { - "description": "min=0.158, mean=0.222, max=0.264, sum=1.335 (6)", - "tab": "Bias", - "score": 0.22244262246907046 - }, - "XSUM - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (6)", - "tab": "Toxicity", - "score": 0.0 - }, - "XSUM - SummaC": { - "description": "min=-0.151, mean=-0.115, max=-0.086, sum=-0.345 (3)", - "tab": "Summarization metrics", - "score": -0.11515867019712234 - }, - "XSUM - QAFactEval": { - "description": "min=0, mean=0.009, max=0.028, sum=0.056 (6)", - "tab": "Summarization metrics", - "score": 0.009336465575789038 - }, - "XSUM - BERTScore (F1)": { - "description": "min=-0.509, mean=-0.232, max=-0.002, sum=-0.695 (3)", - "tab": "Summarization 
metrics", - "score": -0.23174258205917408 - }, - "XSUM - Coverage": { - "description": "min=0.208, mean=0.407, max=0.566, sum=2.442 (6)", - "tab": "Summarization metrics", - "score": 0.40704982952261465 - }, - "XSUM - Density": { - "description": "min=1.129, mean=2.653, max=3.54, sum=15.917 (6)", - "tab": "Summarization metrics", - "score": 2.652801659570502 - }, - "XSUM - Compression": { - "description": "min=4.395, mean=8.023, max=11.123, sum=48.138 (6)", - "tab": "Summarization metrics", - "score": 8.022940864769765 - }, - "XSUM - HumanEval-faithfulness": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-relevance": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-coherence": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "IMDB", - "source_data": { - "dataset_name": "IMDB", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on IMDB", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.849, - "details": { - "description": "min=0.834, mean=0.849, max=0.861, sum=2.547 (3)", - "tab": "Accuracy", - "IMDB - ECE (10-bin)": { - "description": "min=0.223, mean=0.274, max=0.332, sum=0.821 (3)", - "tab": "Calibration", - "score": 0.2737600797307666 - }, - "IMDB - EM (Robustness)": { - "description": "min=0.663, mean=0.701, max=0.737, sum=2.102 (3)", - "tab": "Robustness", - "score": 0.7006666666666668 - }, - "IMDB - EM (Fairness)": { - "description": "min=0.787, mean=0.806, max=0.819, sum=2.417 (3)", - "tab": "Fairness", - "score": 0.8056666666666666 - }, - "IMDB - Denoised inference time (s)": { - "description": "min=0.141, mean=0.142, max=0.143, sum=0.426 (3)", - "tab": "Efficiency", - "score": 0.14206914127604175 - }, - "IMDB - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "IMDB - # train": { - "description": "min=2.916, mean=4.242, max=4.986, sum=12.726 (3)", - "tab": "General information", - "score": 4.242 - }, - "IMDB - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "IMDB - # prompt tokens": { - "description": "min=1277.729, mean=1553.363, max=1768.607, sum=4660.089 (3)", - "tab": "General information", - "score": 1553.363 - }, - "IMDB - # output tokens": { - "description": "min=1, mean=1, max=1, sum=3 (3)", - "tab": "General information", - "score": 1.0 - }, - "IMDB - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "IMDB - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (gender)": { - "description": "1 matching runs, but no 
matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CivilComments", - "source_data": { - "dataset_name": "CivilComments", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on CivilComments", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.517, - "details": { - "description": "min=0, mean=0.517, max=1, sum=27.9 (54)", - "tab": "Accuracy", - "CivilComments - ECE (10-bin)": { - "description": "min=0.096, mean=0.355, max=0.704, sum=19.19 (54)", - "tab": "Calibration", - "score": 0.35537087067123496 - }, - "CivilComments - EM (Robustness)": { - "description": "min=0, mean=0.421, max=1, sum=22.752 (54)", - "tab": "Robustness", - "score": 0.42132444064350366 - }, - "CivilComments - EM (Fairness)": { - "description": "min=0, mean=0.436, max=1, sum=23.537 (54)", - "tab": "Fairness", - "score": 0.435870046986927 - }, - "CivilComments - Denoised inference time (s)": { - "description": "min=0.14, mean=0.141, max=0.141, sum=7.587 (54)", - "tab": "Efficiency", - "score": 0.14050017531142125 - }, - "CivilComments - # eval": { - "description": "min=74, mean=371.556, max=683, sum=20064 (54)", - "tab": "General information", - "score": 371.55555555555554 - }, - "CivilComments - # train": { - "description": "min=5, mean=5, max=5, sum=270 (54)", - "tab": "General information", - "score": 5.0 - }, - "CivilComments - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (54)", - "tab": "General information", - "score": 0.0 - }, - "CivilComments - # prompt tokens": { - "description": "min=356.537, mean=722.635, max=1267.519, sum=39022.317 (54)", - "tab": "General information", - "score": 722.6354931173206 - }, - "CivilComments - # output tokens": { - "description": "min=1, mean=1, max=1, sum=54 (54)", - "tab": "General information", - "score": 1.0 - }, - "CivilComments - # trials": { - "description": "min=3, mean=3, max=3, sum=162 (54)", - "tab": "General information", - "score": 3.0 - }, - "CivilComments - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (54)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "RAFT", - "source_data": { - "dataset_name": "RAFT", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on RAFT", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.423, - "details": { - "description": "min=0, mean=0.423, max=0.975, sum=13.975 (33)", - "tab": "Accuracy", - 
"RAFT - ECE (10-bin)": { - "description": "min=0.066, mean=0.268, max=0.696, sum=8.86 (33)", - "tab": "Calibration", - "score": 0.2684712140450576 - }, - "RAFT - EM (Robustness)": { - "description": "min=0, mean=0.345, max=0.975, sum=11.375 (33)", - "tab": "Robustness", - "score": 0.3446969696969697 - }, - "RAFT - EM (Fairness)": { - "description": "min=0, mean=0.395, max=0.975, sum=13.05 (33)", - "tab": "Fairness", - "score": 0.3954545454545455 - }, - "RAFT - Denoised inference time (s)": { - "description": "min=0.142, mean=0.154, max=0.17, sum=5.08 (33)", - "tab": "Efficiency", - "score": 0.15395451290246212 - }, - "RAFT - # eval": { - "description": "min=40, mean=40, max=40, sum=1320 (33)", - "tab": "General information", - "score": 40.0 - }, - "RAFT - # train": { - "description": "min=0, mean=4.556, max=5, sum=150.35 (33)", - "tab": "General information", - "score": 4.556060606060607 - }, - "RAFT - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (33)", - "tab": "General information", - "score": 0.0 - }, - "RAFT - # prompt tokens": { - "description": "min=257.35, mean=812.938, max=1773.675, sum=26826.95 (33)", - "tab": "General information", - "score": 812.937878787879 - }, - "RAFT - # output tokens": { - "description": "min=1.275, mean=3.125, max=5.85, sum=103.125 (33)", - "tab": "General information", - "score": 3.125 - }, - "RAFT - # trials": { - "description": "min=3, mean=3, max=3, sum=99 (33)", - "tab": "General information", - "score": 3.0 - }, - "RAFT - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (gender)": { - "description": "min=0.5, mean=0.5, max=0.5, sum=1 (2)", - "tab": "Bias", - "score": 0.5 - }, - "RAFT - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (33)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_classic/openai/babbage-1.3B/a5b6cc8b-676d-4c19-8093-0b893937e3d4.json b/data/helm_classic/openai/babbage-1.3B/a5b6cc8b-676d-4c19-8093-0b893937e3d4.json deleted file mode 100644 index d3977fc36..000000000 --- a/data/helm_classic/openai/babbage-1.3B/a5b6cc8b-676d-4c19-8093-0b893937e3d4.json +++ /dev/null @@ -1,1613 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_classic/openai_babbage-1.3B/1770834891.1472661", - "retrieved_timestamp": "1770834891.1472661", - "source_metadata": { - "source_name": "helm_classic", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "babbage 1.3B", - "id": "openai/babbage-1.3B", - "developer": "openai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_classic", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperform on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.114, - "details": { - "tab": "Accuracy", - "Mean win rate - 
Calibration": { - "description": null, - "tab": "Calibration", - "score": 0.5876917234841996 - }, - "Mean win rate - Robustness": { - "description": null, - "tab": "Robustness", - "score": 0.11687598645329457 - }, - "Mean win rate - Fairness": { - "description": null, - "tab": "Fairness", - "score": 0.13375380644568632 - }, - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.860531798245614 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - }, - "Mean win rate - Bias": { - "description": null, - "tab": "Bias", - "score": 0.47969140134405086 - }, - "Mean win rate - Toxicity": { - "description": null, - "tab": "Toxicity", - "score": 0.5128371628371629 - }, - "Mean win rate - Summarization metrics": { - "description": null, - "tab": "Summarization metrics", - "score": 0.19609440267335004 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.235, - "details": { - "description": "min=0.17, mean=0.235, max=0.35, sum=3.518 (15)", - "tab": "Accuracy", - "MMLU - ECE (10-bin)": { - "description": "min=0.095, mean=0.14, max=0.179, sum=2.093 (15)", - "tab": "Calibration", - "score": 0.13954639548632583 - }, - "MMLU - EM (Robustness)": { - "description": "min=0.09, mean=0.166, max=0.24, sum=2.489 (15)", - "tab": "Robustness", - "score": 0.165906432748538 - }, - "MMLU - EM (Fairness)": { - "description": "min=0.14, mean=0.206, max=0.28, sum=3.085 (15)", - "tab": "Fairness", - "score": 0.20567251461988303 - }, - "MMLU - Denoised inference time (s)": { - "description": "min=0.118, mean=0.119, max=0.12, sum=1.785 (15)", - "tab": "Efficiency", - "score": 0.11896953947368419 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=1542 (15)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=75 (15)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (15)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=371.38, mean=472.274, max=624.07, sum=7084.111 (15)", - "tab": "General information", - "score": 472.2740350877193 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=15 (15)", - "tab": "General information", - "score": 1.0 - }, - "MMLU - # trials": { - "description": "min=3, mean=3, max=3, sum=45 (15)", - "tab": "General information", - "score": 3.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "BoolQ", - "source_data": { - "dataset_name": "BoolQ", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on BoolQ", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.574, - "details": { - "description": "min=0.52, mean=0.574, 
max=0.623, sum=1.723 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Accuracy", - "BoolQ - ECE (10-bin)": { - "description": "min=0.036, mean=0.068, max=0.089, sum=0.203 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Calibration", - "score": 0.06758031979129187 - }, - "BoolQ - EM (Robustness)": { - "description": "min=0.432, mean=0.477, max=0.522, sum=1.431 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Robustness", - "score": 0.47700000000000004 - }, - "BoolQ - EM (Fairness)": { - "description": "min=0.404, mean=0.436, max=0.457, sum=1.307 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Fairness", - "score": 0.43566666666666665 - }, - "BoolQ - Denoised inference time (s)": { - "description": "min=0.119, mean=0.121, max=0.125, sum=0.364 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Efficiency", - "score": 0.12137238953993056 - }, - "BoolQ - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 1000.0 - }, - "BoolQ - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 5.0 - }, - "BoolQ - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)\nâš  Brown et al. 
perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.0 - }, - "BoolQ - # prompt tokens": { - "description": "min=660.073, mean=908.406, max=1242.073, sum=2725.219 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 908.4063333333334 - }, - "BoolQ - # output tokens": { - "description": "min=1, mean=1, max=1, sum=3 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 1.0 - }, - "BoolQ - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 3.0 - }, - "BoolQ - Stereotypes (race)": { - "description": "(0)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": null - }, - "BoolQ - Stereotypes (gender)": { - "description": "(0)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (race)": { - "description": "(0)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (gender)": { - "description": "(0)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. 
For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": null - }, - "BoolQ - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.491, - "details": { - "description": "min=0.468, mean=0.491, max=0.525, sum=1.474 (3)", - "tab": "Accuracy", - "NarrativeQA - ECE (10-bin)": { - "description": "min=0.025, mean=0.027, max=0.03, sum=0.081 (3)", - "tab": "Calibration", - "score": 0.027162479976532598 - }, - "NarrativeQA - F1 (Robustness)": { - "description": "min=0.232, mean=0.255, max=0.266, sum=0.764 (3)", - "tab": "Robustness", - "score": 0.2547490737014401 - }, - "NarrativeQA - F1 (Fairness)": { - "description": "min=0.334, mean=0.367, max=0.396, sum=1.101 (3)", - "tab": "Fairness", - "score": 0.3669650821225828 - }, - "NarrativeQA - Denoised inference time (s)": { - "description": "min=0.164, mean=0.176, max=0.194, sum=0.529 (3)", - "tab": "Efficiency", - "score": 0.1762964825410799 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=1065 (3)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=1.051, mean=1.647, max=2.085, sum=4.941 (3)", - "tab": "General information", - "score": 1.6469483568075116 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=1601.955, mean=1652.377, max=1705.003, sum=4957.132 (3)", - "tab": "General information", - "score": 1652.3774647887324 - }, - "NarrativeQA - # output tokens": { - "description": "min=6.659, mean=8.835, max=11.769, sum=26.504 (3)", - "tab": "General information", - "score": 8.83474178403756 - }, - "NarrativeQA - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NarrativeQA - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=0.667 (1)", - "tab": "Bias", - "score": 0.6666666666666667 - }, - "NarrativeQA - Stereotypes (gender)": { - "description": "min=0.404, mean=0.445, max=0.5, sum=1.335 (3)", - "tab": "Bias", - "score": 0.44511511879932936 - }, - "NarrativeQA - Representation (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=1.333 (2)", - "tab": "Bias", - "score": 0.6666666666666667 - }, - "NarrativeQA - Representation (gender)": { - "description": "min=0.142, 
mean=0.191, max=0.246, sum=0.574 (3)", - "tab": "Bias", - "score": 0.1912053369170701 - }, - "NarrativeQA - Toxic fraction": { - "description": "min=0.011, mean=0.016, max=0.023, sum=0.048 (3)", - "tab": "Toxicity", - "score": 0.01596244131455399 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (open-book)", - "source_data": { - "dataset_name": "NaturalQuestions (open-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (open-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.451, - "details": { - "description": "min=0.435, mean=0.451, max=0.47, sum=1.354 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Accuracy", - "NaturalQuestions (closed-book) - ECE (10-bin)": { - "description": "min=0.012, mean=0.016, max=0.023, sum=0.048 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Calibration", - "score": 0.01603851394023659 - }, - "NaturalQuestions (open-book) - ECE (10-bin)": { - "description": "min=0.141, mean=0.147, max=0.153, sum=0.44 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Calibration", - "score": 0.14681748032197228 - }, - "NaturalQuestions (closed-book) - F1 (Robustness)": { - "description": "min=0.063, mean=0.068, max=0.072, sum=0.205 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Robustness", - "score": 0.06829400341950241 - }, - "NaturalQuestions (open-book) - F1 (Robustness)": { - "description": "min=0.211, mean=0.212, max=0.214, sum=0.637 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Robustness", - "score": 0.21249077319847984 - }, - "NaturalQuestions (closed-book) - F1 (Fairness)": { - "description": "min=0.079, mean=0.084, max=0.088, sum=0.252 (3)\nâš  Brown et al. 
perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Fairness", - "score": 0.08399089853474369 - }, - "NaturalQuestions (open-book) - F1 (Fairness)": { - "description": "min=0.365, mean=0.381, max=0.403, sum=1.144 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Fairness", - "score": 0.381423207180998 - }, - "NaturalQuestions (closed-book) - Denoised inference time (s)": { - "description": "min=0.15, mean=0.152, max=0.152, sum=0.455 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Efficiency", - "score": 0.15162744531249991 - }, - "NaturalQuestions (open-book) - Denoised inference time (s)": { - "description": "min=0.228, mean=0.232, max=0.235, sum=0.696 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Efficiency", - "score": 0.23211142730034728 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. 
See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=110.254, mean=112.254, max=116.254, sum=336.762 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 112.254 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=6.994, mean=7.258, max=7.401, sum=21.773 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 7.257666666666666 - }, - "NaturalQuestions (closed-book) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 3.0 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.647, mean=4.691, max=4.724, sum=14.074 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 4.691333333333334 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.036, mean=0.036, max=0.036, sum=0.108 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.036 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1231.212, mean=1419.574, max=1523.257, sum=4258.721 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. 
For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 1419.5736666666664 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=18.158, mean=18.539, max=18.902, sum=55.617 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 18.539 - }, - "NaturalQuestions (open-book) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 3.0 - }, - "NaturalQuestions (closed-book) - Stereotypes (race)": { - "description": "(0)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": null - }, - "NaturalQuestions (closed-book) - Stereotypes (gender)": { - "description": "min=0.5, mean=0.5, max=0.5, sum=1.5 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.5 - }, - "NaturalQuestions (closed-book) - Representation (race)": { - "description": "min=0.578, mean=0.624, max=0.667, sum=1.871 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.6236303630363037 - }, - "NaturalQuestions (closed-book) - Representation (gender)": { - "description": "min=0, mean=0.015, max=0.038, sum=0.046 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.015466015466015476 - }, - "NaturalQuestions (open-book) - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=2 (3)\nâš  Brown et al. 
perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "NaturalQuestions (open-book) - Stereotypes (gender)": { - "description": "min=0.445, mean=0.479, max=0.5, sum=1.436 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.47855712855712856 - }, - "NaturalQuestions (open-book) - Representation (race)": { - "description": "min=0.422, mean=0.441, max=0.46, sum=1.323 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.44113329919781535 - }, - "NaturalQuestions (open-book) - Representation (gender)": { - "description": "min=0.257, mean=0.349, max=0.419, sum=1.046 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.34872771165606054 - }, - "NaturalQuestions (closed-book) - Toxic fraction": { - "description": "min=0, mean=0.001, max=0.002, sum=0.002 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Toxicity", - "score": 0.0006666666666666666 - }, - "NaturalQuestions (open-book) - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. 
See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "QuAC", - "source_data": { - "dataset_name": "QuAC", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on QuAC", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.273, - "details": { - "description": "min=0.263, mean=0.273, max=0.282, sum=0.818 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Accuracy", - "QuAC - ECE (10-bin)": { - "description": "min=0.03, mean=0.045, max=0.065, sum=0.136 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Calibration", - "score": 0.04533749534838898 - }, - "QuAC - F1 (Robustness)": { - "description": "min=0.141, mean=0.149, max=0.156, sum=0.448 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Robustness", - "score": 0.14927279809816305 - }, - "QuAC - F1 (Fairness)": { - "description": "min=0.198, mean=0.202, max=0.205, sum=0.607 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Fairness", - "score": 0.20229238580626874 - }, - "QuAC - Denoised inference time (s)": { - "description": "min=0.245, mean=0.261, max=0.27, sum=0.782 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Efficiency", - "score": 0.2607369557291667 - }, - "QuAC - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. 
See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 1000.0 - }, - "QuAC - # train": { - "description": "min=0.845, mean=0.944, max=1.086, sum=2.833 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.9443333333333334 - }, - "QuAC - truncated": { - "description": "min=0.016, mean=0.016, max=0.016, sum=0.048 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.016 - }, - "QuAC - # prompt tokens": { - "description": "min=1625.523, mean=1644.831, max=1670.605, sum=4934.492 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 1644.8306666666667 - }, - "QuAC - # output tokens": { - "description": "min=20.236, mean=22.916, max=24.512, sum=68.749 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 22.91633333333333 - }, - "QuAC - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 3.0 - }, - "QuAC - Stereotypes (race)": { - "description": "min=0.643, mean=0.659, max=0.667, sum=1.976 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.6587301587301589 - }, - "QuAC - Stereotypes (gender)": { - "description": "min=0.434, mean=0.445, max=0.452, sum=1.336 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. 
See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.4452529926214137 - }, - "QuAC - Representation (race)": { - "description": "min=0.311, mean=0.339, max=0.382, sum=1.016 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.33878845629358273 - }, - "QuAC - Representation (gender)": { - "description": "min=0.251, mean=0.258, max=0.264, sum=0.775 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.25817229310554 - }, - "QuAC - Toxic fraction": { - "description": "min=0.001, mean=0.002, max=0.002, sum=0.005 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Toxicity", - "score": 0.0016666666666666668 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "HellaSwag", - "source_data": { - "dataset_name": "HellaSwag", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on HellaSwag", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.555, - "details": { - "description": "min=0.555, mean=0.555, max=0.555, sum=0.555 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Accuracy", - "HellaSwag - ECE (10-bin)": { - "description": "min=0.144, mean=0.144, max=0.144, sum=0.144 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Calibration", - "score": 0.14430034567571584 - }, - "HellaSwag - EM (Robustness)": { - "description": "min=0.489, mean=0.489, max=0.489, sum=0.489 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. 
See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Robustness", - "score": 0.489 - }, - "HellaSwag - EM (Fairness)": { - "description": "min=0.401, mean=0.401, max=0.401, sum=0.401 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Fairness", - "score": 0.401 - }, - "HellaSwag - Denoised inference time (s)": { - "description": "min=0.113, mean=0.113, max=0.113, sum=0.113 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Efficiency", - "score": 0.1134031874999998 - }, - "HellaSwag - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 1000.0 - }, - "HellaSwag - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag - # prompt tokens": { - "description": "min=87.888, mean=87.888, max=87.888, sum=87.888 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 87.888 - }, - "HellaSwag - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets.
See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.438, - "details": { - "description": "min=0.438, mean=0.438, max=0.438, sum=0.438 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Accuracy", - "OpenbookQA - ECE (10-bin)": { - "description": "min=0.3, mean=0.3, max=0.3, sum=0.3 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Calibration", - "score": 0.3000308921028506 - }, - "OpenbookQA - EM (Robustness)": { - "description": "min=0.314, mean=0.314, max=0.314, sum=0.314 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Robustness", - "score": 0.314 - }, - "OpenbookQA - EM (Fairness)": { - "description": "min=0.326, mean=0.326, max=0.326, sum=0.326 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Fairness", - "score": 0.326 - }, - "OpenbookQA - Denoised inference time (s)": { - "description": "min=0.111, mean=0.111, max=0.111, sum=0.111 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets.
See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Efficiency", - "score": 0.11114410156249971 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=5.27, mean=5.27, max=5.27, sum=5.27 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 5.27 - }, - "OpenbookQA - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets.
See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "TruthfulQA", - "source_data": { - "dataset_name": "TruthfulQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on TruthfulQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.188, - "details": { - "description": "min=0.174, mean=0.188, max=0.196, sum=0.563 (3)", - "tab": "Accuracy", - "TruthfulQA - ECE (10-bin)": { - "description": "min=0.13, mean=0.142, max=0.164, sum=0.426 (3)", - "tab": "Calibration", - "score": 0.14198207765086143 - }, - "TruthfulQA - EM (Robustness)": { - "description": "min=0.141, mean=0.162, max=0.183, sum=0.486 (3)", - "tab": "Robustness", - "score": 0.1620795107033639 - }, - "TruthfulQA - EM (Fairness)": { - "description": "min=0.159, mean=0.178, max=0.19, sum=0.534 (3)", - "tab": "Fairness", - "score": 0.17787971457696228 - }, - "TruthfulQA - Denoised inference time (s)": { - "description": "min=0.119, mean=0.12, max=0.12, sum=0.359 (3)", - "tab": "Efficiency", - "score": 0.11970087223655701 - }, - "TruthfulQA - # eval": { - "description": "min=654, mean=654, max=654, sum=1962 (3)", - "tab": "General information", - "score": 654.0 - }, - "TruthfulQA - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "TruthfulQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "TruthfulQA - # prompt tokens": { - "description": "min=501.121, mean=511.121, max=529.121, sum=1533.362 (3)", - "tab": "General information", - "score": 511.12079510703364 - }, - "TruthfulQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=3 (3)", - "tab": "General information", - "score": 1.0 - }, - "TruthfulQA - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MS MARCO (TREC)", - "source_data": { - "dataset_name": "MS MARCO (TREC)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "NDCG@10 on MS MARCO (TREC)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.317, - "details": { - "description": "min=0.291, mean=0.317, max=0.362, sum=0.95 (3)", - "tab": "Accuracy", - "MS MARCO (regular) - RR@10 (Robustness)": { - "description": "min=0.055, mean=0.073, max=0.086, sum=0.219 (3)", - "tab": "Robustness", - "score": 0.07291031746031752 - }, - "MS MARCO (TREC) - NDCG@10 (Robustness)": { - "description": "min=0.206, mean=0.246, max=0.285, sum=0.739 (3)", - "tab": "Robustness", - "score": 0.24641961891165112 - }, - "MS MARCO (regular) - RR@10 (Fairness)": { - "description": "min=0.082, mean=0.105, max=0.123, sum=0.316 (3)", - "tab": "Fairness", - "score": 0.10532936507936512 - }, - "MS MARCO (TREC) - NDCG@10 (Fairness)": { - "description": "min=0.275, mean=0.301, max=0.346, sum=0.902 (3)", - "tab": 
"Fairness", - "score": 0.300592144197253 - }, - "MS MARCO (regular) - Denoised inference time (s)": { - "description": "min=0.119, mean=0.122, max=0.126, sum=0.367 (3)", - "tab": "Efficiency", - "score": 0.12232188151041663 - }, - "MS MARCO (TREC) - Denoised inference time (s)": { - "description": "min=0.118, mean=0.122, max=0.128, sum=0.367 (3)", - "tab": "Efficiency", - "score": 0.12249798631298452 - }, - "MS MARCO (regular) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "MS MARCO (regular) - # train": { - "description": "min=2, mean=2, max=2, sum=6 (3)", - "tab": "General information", - "score": 2.0 - }, - "MS MARCO (regular) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "MS MARCO (regular) - # prompt tokens": { - "description": "min=495.232, mean=532.565, max=577.232, sum=1597.696 (3)", - "tab": "General information", - "score": 532.5653333333333 - }, - "MS MARCO (regular) - # output tokens": { - "description": "min=1.128, mean=1.537, max=2.075, sum=4.612 (3)", - "tab": "General information", - "score": 1.5373333333333334 - }, - "MS MARCO (regular) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "MS MARCO (TREC) - # eval": { - "description": "min=43, mean=43, max=43, sum=129 (3)", - "tab": "General information", - "score": 43.0 - }, - "MS MARCO (TREC) - # train": { - "description": "min=2, mean=2, max=2, sum=6 (3)", - "tab": "General information", - "score": 2.0 - }, - "MS MARCO (TREC) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "MS MARCO (TREC) - # prompt tokens": { - "description": "min=478.488, mean=515.822, max=560.488, sum=1547.465 (3)", - "tab": "General information", - "score": 515.8217054263565 - }, - "MS MARCO (TREC) - # output tokens": { - "description": "min=1, mean=1.496, max=2.302, sum=4.488 (3)", - "tab": "General information", - "score": 1.4961240310077522 - }, - "MS MARCO (TREC) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "MS MARCO (regular) - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": 
"Toxicity", - "score": null - }, - "MS MARCO (TREC) - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CNN/DailyMail", - "source_data": { - "dataset_name": "CNN/DailyMail", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on CNN/DailyMail", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.079, - "details": { - "description": "min=0.016, mean=0.079, max=0.147, sum=0.472 (6)", - "tab": "Accuracy", - "CNN/DailyMail - Denoised inference time (s)": { - "description": "min=0.293, mean=0.533, max=0.795, sum=3.197 (6)", - "tab": "Efficiency", - "score": 0.5327935382950345 - }, - "CNN/DailyMail - # eval": { - "description": "min=466, mean=466, max=466, sum=2796 (6)", - "tab": "General information", - "score": 466.0 - }, - "CNN/DailyMail - # train": { - "description": "min=5, mean=5, max=5, sum=30 (6)", - "tab": "General information", - "score": 5.0 - }, - "CNN/DailyMail - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (6)", - "tab": "General information", - "score": 0.0 - }, - "CNN/DailyMail - # prompt tokens": { - "description": "min=1531.586, mean=1549.919, max=1567.586, sum=9299.515 (6)", - "tab": "General information", - "score": 1549.9191702432045 - }, - "CNN/DailyMail - # output tokens": { - "description": "min=28.479, mean=68.44, max=112.258, sum=410.639 (6)", - "tab": "General information", - "score": 68.43991416309014 - }, - "CNN/DailyMail - # trials": { - "description": "min=3, mean=3, max=3, sum=18 (6)", - "tab": "General information", - "score": 3.0 - }, - "CNN/DailyMail - Stereotypes (race)": { - "description": "min=0.5, mean=0.568, max=0.611, sum=3.41 (6)", - "tab": "Bias", - "score": 0.5683358120009704 - }, - "CNN/DailyMail - Stereotypes (gender)": { - "description": "min=0.403, mean=0.418, max=0.435, sum=2.509 (6)", - "tab": "Bias", - "score": 0.4181282755076701 - }, - "CNN/DailyMail - Representation (race)": { - "description": "min=0.321, mean=0.327, max=0.333, sum=1.962 (6)", - "tab": "Bias", - "score": 0.32700197854837026 - }, - "CNN/DailyMail - Representation (gender)": { - "description": "min=0.131, mean=0.146, max=0.165, sum=0.879 (6)", - "tab": "Bias", - "score": 0.14643429372740835 - }, - "CNN/DailyMail - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (6)", - "tab": "Toxicity", - "score": 0.0 - }, - "CNN/DailyMail - SummaC": { - "description": "min=-0.024, mean=0.194, max=0.404, sum=0.582 (3)", - "tab": "Summarization metrics", - "score": 0.19395910509097278 - }, - "CNN/DailyMail - QAFactEval": { - "description": "min=1.208, mean=3.207, max=4.672, sum=19.24 (6)", - "tab": "Summarization metrics", - "score": 3.206720080183251 - }, - "CNN/DailyMail - BERTScore (F1)": { - "description": "min=-0.533, mean=-0.129, max=0.256, sum=-0.388 (3)", - "tab": "Summarization metrics", - "score": -0.12942978993545518 - }, - "CNN/DailyMail - Coverage": { - "description": "min=0.243, mean=0.606, max=0.942, sum=3.637 (6)", - "tab": "Summarization metrics", - "score": 0.6061106279492011 - }, - "CNN/DailyMail - Density": { - "description": "min=7.213, mean=43.534, max=84.961, sum=261.202 (6)", - "tab": "Summarization metrics", - "score": 
43.533595505945534 - }, - "CNN/DailyMail - Compression": { - "description": "min=5.569, mean=6.733, max=8.376, sum=40.398 (6)", - "tab": "Summarization metrics", - "score": 6.733051993966683 - }, - "CNN/DailyMail - HumanEval-faithfulness": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-relevance": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-coherence": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "XSUM", - "source_data": { - "dataset_name": "XSUM", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on XSUM", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.045, - "details": { - "description": "min=0.041, mean=0.045, max=0.054, sum=0.273 (6)", - "tab": "Accuracy", - "XSUM - Denoised inference time (s)": { - "description": "min=0.264, mean=0.272, max=0.286, sum=1.632 (6)", - "tab": "Efficiency", - "score": 0.27202574924254597 - }, - "XSUM - # eval": { - "description": "min=518, mean=518, max=518, sum=3108 (6)", - "tab": "General information", - "score": 518.0 - }, - "XSUM - # train": { - "description": "min=4.998, mean=4.999, max=5, sum=29.992 (6)", - "tab": "General information", - "score": 4.998712998712999 - }, - "XSUM - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (6)", - "tab": "General information", - "score": 0.0 - }, - "XSUM - # prompt tokens": { - "description": "min=1456.402, mean=1510.418, max=1538.921, sum=9062.51 (6)", - "tab": "General information", - "score": 1510.4182754182755 - }, - "XSUM - # output tokens": { - "description": "min=23.645, mean=25.051, max=27.259, sum=150.309 (6)", - "tab": "General information", - "score": 25.051480051480052 - }, - "XSUM - # trials": { - "description": "min=3, mean=3, max=3, sum=18 (6)", - "tab": "General information", - "score": 3.0 - }, - "XSUM - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=4 (6)", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "XSUM - Stereotypes (gender)": { - "description": "min=0.389, mean=0.42, max=0.46, sum=2.52 (6)", - "tab": "Bias", - "score": 0.42004149135109864 - }, - "XSUM - Representation (race)": { - "description": "min=0.417, mean=0.458, max=0.542, sum=2.75 (6)", - "tab": "Bias", - "score": 0.4583333333333333 - }, - "XSUM - Representation (gender)": { - "description": "min=0.105, mean=0.148, max=0.182, sum=0.89 (6)", - "tab": "Bias", - "score": 0.14837887499687488 - }, - "XSUM - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (6)", - "tab": "Toxicity", - "score": 0.0 - }, - "XSUM - SummaC": { - "description": "min=-0.221, mean=-0.188, max=-0.16, sum=-0.564 (3)", - "tab": "Summarization metrics", - "score": -0.18805348402642733 - }, - "XSUM - QAFactEval": { - "description": "min=0.003, mean=0.195, max=0.546, sum=1.171 (6)", - "tab": "Summarization metrics", - "score": 0.19517962440346606 - }, - "XSUM - BERTScore (F1)": { - "description": "min=-0.047, mean=0.02, max=0.139, sum=0.059 (3)", - "tab": "Summarization metrics", - 
"score": 0.01972435572139075 - }, - "XSUM - Coverage": { - "description": "min=0.538, mean=0.604, max=0.715, sum=3.622 (6)", - "tab": "Summarization metrics", - "score": 0.6037080043294082 - }, - "XSUM - Density": { - "description": "min=3.597, mean=4.386, max=5.935, sum=26.316 (6)", - "tab": "Summarization metrics", - "score": 4.385950410054523 - }, - "XSUM - Compression": { - "description": "min=10.355, mean=11.716, max=13.636, sum=70.293 (6)", - "tab": "Summarization metrics", - "score": 11.71557516895029 - }, - "XSUM - HumanEval-faithfulness": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-relevance": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-coherence": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "IMDB", - "source_data": { - "dataset_name": "IMDB", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on IMDB", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.597, - "details": { - "description": "min=0.5, mean=0.597, max=0.646, sum=1.792 (3)", - "tab": "Accuracy", - "IMDB - ECE (10-bin)": { - "description": "min=0.161, mean=0.212, max=0.289, sum=0.637 (3)", - "tab": "Calibration", - "score": 0.2122386190139247 - }, - "IMDB - EM (Robustness)": { - "description": "min=0.476, mean=0.5, max=0.512, sum=1.5 (3)", - "tab": "Robustness", - "score": 0.5 - }, - "IMDB - EM (Fairness)": { - "description": "min=0.489, mean=0.534, max=0.558, sum=1.602 (3)", - "tab": "Fairness", - "score": 0.534 - }, - "IMDB - Denoised inference time (s)": { - "description": "min=0.125, mean=0.128, max=0.131, sum=0.385 (3)", - "tab": "Efficiency", - "score": 0.12819260763888898 - }, - "IMDB - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "IMDB - # train": { - "description": "min=2.916, mean=4.242, max=4.986, sum=12.726 (3)", - "tab": "General information", - "score": 4.242 - }, - "IMDB - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "IMDB - # prompt tokens": { - "description": "min=1277.729, mean=1553.363, max=1768.607, sum=4660.089 (3)", - "tab": "General information", - "score": 1553.363 - }, - "IMDB - # output tokens": { - "description": "min=1, mean=1, max=1, sum=3 (3)", - "tab": "General information", - "score": 1.0 - }, - "IMDB - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "IMDB - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null 
- }, - "IMDB - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CivilComments", - "source_data": { - "dataset_name": "CivilComments", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on CivilComments", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.519, - "details": { - "description": "min=0.005, mean=0.519, max=0.996, sum=28.025 (54)", - "tab": "Accuracy", - "CivilComments - ECE (10-bin)": { - "description": "min=0.063, mean=0.31, max=0.598, sum=16.723 (54)", - "tab": "Calibration", - "score": 0.30968147474692964 - }, - "CivilComments - EM (Robustness)": { - "description": "min=0, mean=0.4, max=0.996, sum=21.618 (54)", - "tab": "Robustness", - "score": 0.40032672585199003 - }, - "CivilComments - EM (Fairness)": { - "description": "min=0, mean=0.474, max=0.994, sum=25.57 (54)", - "tab": "Fairness", - "score": 0.4735149158411243 - }, - "CivilComments - Denoised inference time (s)": { - "description": "min=0.118, mean=0.12, max=0.125, sum=6.485 (54)", - "tab": "Efficiency", - "score": 0.12008918109610113 - }, - "CivilComments - # eval": { - "description": "min=74, mean=371.556, max=683, sum=20064 (54)", - "tab": "General information", - "score": 371.55555555555554 - }, - "CivilComments - # train": { - "description": "min=5, mean=5, max=5, sum=270 (54)", - "tab": "General information", - "score": 5.0 - }, - "CivilComments - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (54)", - "tab": "General information", - "score": 0.0 - }, - "CivilComments - # prompt tokens": { - "description": "min=356.537, mean=722.635, max=1267.519, sum=39022.317 (54)", - "tab": "General information", - "score": 722.6354931173206 - }, - "CivilComments - # output tokens": { - "description": "min=1, mean=1, max=1, sum=54 (54)", - "tab": "General information", - "score": 1.0 - }, - "CivilComments - # trials": { - "description": "min=3, mean=3, max=3, sum=162 (54)", - "tab": "General information", - "score": 3.0 - }, - "CivilComments - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (54)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "RAFT", - "source_data": { - "dataset_name": "RAFT", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on RAFT", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.455, - "details": { - "description": "min=0.025, mean=0.455, max=0.975, sum=15.025 (33)", - "tab": "Accuracy", - "RAFT - ECE (10-bin)": { - 
"description": "min=0.1, mean=0.286, max=0.455, sum=9.428 (33)", - "tab": "Calibration", - "score": 0.28570502706051176 - }, - "RAFT - EM (Robustness)": { - "description": "min=0, mean=0.409, max=0.975, sum=13.5 (33)", - "tab": "Robustness", - "score": 0.40909090909090906 - }, - "RAFT - EM (Fairness)": { - "description": "min=0, mean=0.438, max=0.975, sum=14.45 (33)", - "tab": "Fairness", - "score": 0.43787878787878787 - }, - "RAFT - Denoised inference time (s)": { - "description": "min=0.117, mean=0.137, max=0.182, sum=4.525 (33)", - "tab": "Efficiency", - "score": 0.13711408420138893 - }, - "RAFT - # eval": { - "description": "min=40, mean=40, max=40, sum=1320 (33)", - "tab": "General information", - "score": 40.0 - }, - "RAFT - # train": { - "description": "min=0, mean=4.556, max=5, sum=150.35 (33)", - "tab": "General information", - "score": 4.556060606060607 - }, - "RAFT - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (33)", - "tab": "General information", - "score": 0.0 - }, - "RAFT - # prompt tokens": { - "description": "min=257.35, mean=812.938, max=1773.675, sum=26826.95 (33)", - "tab": "General information", - "score": 812.937878787879 - }, - "RAFT - # output tokens": { - "description": "min=1, mean=3.511, max=10.6, sum=115.85 (33)", - "tab": "General information", - "score": 3.5106060606060603 - }, - "RAFT - # trials": { - "description": "min=3, mean=3, max=3, sum=99 (33)", - "tab": "General information", - "score": 3.0 - }, - "RAFT - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (gender)": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "Bias", - "score": 0.0 - }, - "RAFT - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (33)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_classic/openai/curie-6.7B/0d4d42b2-d90c-418a-b3e3-c2d59453bacf.json b/data/helm_classic/openai/curie-6.7B/0d4d42b2-d90c-418a-b3e3-c2d59453bacf.json deleted file mode 100644 index fe011ca06..000000000 --- a/data/helm_classic/openai/curie-6.7B/0d4d42b2-d90c-418a-b3e3-c2d59453bacf.json +++ /dev/null @@ -1,1613 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_classic/openai_curie-6.7B/1770834891.1472661", - "retrieved_timestamp": "1770834891.1472661", - "source_metadata": { - "source_name": "helm_classic", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "curie 6.7B", - "id": "openai/curie-6.7B", - "developer": "openai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_classic", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperform on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.247, - "details": { - "tab": "Accuracy", - "Mean win rate - Calibration": { - "description": null, - 
"tab": "Calibration", - "score": 0.6031752149929763 - }, - "Mean win rate - Robustness": { - "description": null, - "tab": "Robustness", - "score": 0.23139443056017028 - }, - "Mean win rate - Fairness": { - "description": null, - "tab": "Fairness", - "score": 0.23055057660174458 - }, - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.8951315789473684 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - }, - "Mean win rate - Bias": { - "description": null, - "tab": "Bias", - "score": 0.36598228279277495 - }, - "Mean win rate - Toxicity": { - "description": null, - "tab": "Toxicity", - "score": 0.4175808759142092 - }, - "Mean win rate - Summarization metrics": { - "description": null, - "tab": "Summarization metrics", - "score": 0.32471804511278196 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.243, - "details": { - "description": "min=0.19, mean=0.243, max=0.29, sum=3.642 (15)", - "tab": "Accuracy", - "MMLU - ECE (10-bin)": { - "description": "min=0.069, mean=0.138, max=0.238, sum=2.071 (15)", - "tab": "Calibration", - "score": 0.1380385889615569 - }, - "MMLU - EM (Robustness)": { - "description": "min=0.1, mean=0.19, max=0.263, sum=2.854 (15)", - "tab": "Robustness", - "score": 0.1902923976608187 - }, - "MMLU - EM (Fairness)": { - "description": "min=0.15, mean=0.218, max=0.281, sum=3.266 (15)", - "tab": "Fairness", - "score": 0.21771929824561406 - }, - "MMLU - Denoised inference time (s)": { - "description": "min=0.091, mean=0.092, max=0.095, sum=1.387 (15)", - "tab": "Efficiency", - "score": 0.09245237979714913 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=1542 (15)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=75 (15)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (15)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=371.38, mean=472.274, max=624.07, sum=7084.111 (15)", - "tab": "General information", - "score": 472.2740350877193 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=15 (15)", - "tab": "General information", - "score": 1.0 - }, - "MMLU - # trials": { - "description": "min=3, mean=3, max=3, sum=45 (15)", - "tab": "General information", - "score": 3.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "BoolQ", - "source_data": { - "dataset_name": "BoolQ", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on BoolQ", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.656, - "details": { - "description": "min=0.597, mean=0.656, max=0.704, sum=1.969 (3)\nâš  Brown et 
al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Accuracy", - "BoolQ - ECE (10-bin)": { - "description": "min=0.051, mean=0.079, max=0.115, sum=0.236 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Calibration", - "score": 0.07881150352718548 - }, - "BoolQ - EM (Robustness)": { - "description": "min=0.484, mean=0.545, max=0.599, sum=1.635 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Robustness", - "score": 0.545 - }, - "BoolQ - EM (Fairness)": { - "description": "min=0.535, mean=0.594, max=0.631, sum=1.782 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Fairness", - "score": 0.594 - }, - "BoolQ - Denoised inference time (s)": { - "description": "min=0.096, mean=0.1, max=0.104, sum=0.3 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Efficiency", - "score": 0.09988102712673615 - }, - "BoolQ - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 1000.0 - }, - "BoolQ - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 5.0 - }, - "BoolQ - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives.
For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.0 - }, - "BoolQ - # prompt tokens": { - "description": "min=660.073, mean=908.406, max=1242.073, sum=2725.219 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 908.4063333333334 - }, - "BoolQ - # output tokens": { - "description": "min=1, mean=1, max=1, sum=3 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 1.0 - }, - "BoolQ - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 3.0 - }, - "BoolQ - Stereotypes (race)": { - "description": "(0)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": null - }, - "BoolQ - Stereotypes (gender)": { - "description": "(0)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (race)": { - "description": "(0)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (gender)": { - "description": "(0)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets.
See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": null - }, - "BoolQ - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.604, - "details": { - "description": "min=0.588, mean=0.604, max=0.632, sum=1.813 (3)", - "tab": "Accuracy", - "NarrativeQA - ECE (10-bin)": { - "description": "min=0.031, mean=0.045, max=0.056, sum=0.135 (3)", - "tab": "Calibration", - "score": 0.044936394093581626 - }, - "NarrativeQA - F1 (Robustness)": { - "description": "min=0.352, mean=0.367, max=0.39, sum=1.1 (3)", - "tab": "Robustness", - "score": 0.36665112128820915 - }, - "NarrativeQA - F1 (Fairness)": { - "description": "min=0.453, mean=0.482, max=0.515, sum=1.445 (3)", - "tab": "Fairness", - "score": 0.48150959406800437 - }, - "NarrativeQA - Denoised inference time (s)": { - "description": "min=0.14, mean=0.152, max=0.166, sum=0.455 (3)", - "tab": "Efficiency", - "score": 0.15159477332746474 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=1065 (3)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=1.051, mean=1.647, max=2.085, sum=4.941 (3)", - "tab": "General information", - "score": 1.6469483568075116 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=1601.955, mean=1652.377, max=1705.003, sum=4957.132 (3)", - "tab": "General information", - "score": 1652.3774647887324 - }, - "NarrativeQA - # output tokens": { - "description": "min=4.775, mean=6.607, max=8.732, sum=19.82 (3)", - "tab": "General information", - "score": 6.606572769953051 - }, - "NarrativeQA - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NarrativeQA - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NarrativeQA - Stereotypes (gender)": { - "description": "min=0.431, mean=0.455, max=0.5, sum=1.364 (3)", - "tab": "Bias", - "score": 0.45462962962962966 - }, - "NarrativeQA - Representation (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=1.333 (2)", - "tab": "Bias", - "score": 0.6666666666666667 - }, - "NarrativeQA - Representation (gender)": { - "description": "min=0.209, mean=0.229, max=0.267, sum=0.688 (3)", - "tab": "Bias", - "score": 0.2292955082742317 - }, - "NarrativeQA - Toxic fraction": { - "description": "min=0.017, mean=0.017, max=0.017, sum=0.051 (3)", - "tab": "Toxicity", - "score": 0.016901408450704224 - }
- } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (open-book)", - "source_data": { - "dataset_name": "NaturalQuestions (open-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (open-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.552, - "details": { - "description": "min=0.521, mean=0.552, max=0.568, sum=1.655 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Accuracy", - "NaturalQuestions (closed-book) - ECE (10-bin)": { - "description": "min=0.014, mean=0.017, max=0.022, sum=0.052 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Calibration", - "score": 0.01724854000741595 - }, - "NaturalQuestions (open-book) - ECE (10-bin)": { - "description": "min=0.123, mean=0.134, max=0.149, sum=0.403 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Calibration", - "score": 0.13427394452181574 - }, - "NaturalQuestions (closed-book) - F1 (Robustness)": { - "description": "min=0.118, mean=0.126, max=0.133, sum=0.379 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Robustness", - "score": 0.1262678947150161 - }, - "NaturalQuestions (open-book) - F1 (Robustness)": { - "description": "min=0.28, mean=0.338, max=0.381, sum=1.015 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Robustness", - "score": 0.33838638278361 - }, - "NaturalQuestions (closed-book) - F1 (Fairness)": { - "description": "min=0.139, mean=0.147, max=0.151, sum=0.44 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives.
For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Fairness", - "score": 0.14670404179376148 - }, - "NaturalQuestions (open-book) - F1 (Fairness)": { - "description": "min=0.446, mean=0.479, max=0.506, sum=1.436 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Fairness", - "score": 0.47851717891712475 - }, - "NaturalQuestions (closed-book) - Denoised inference time (s)": { - "description": "min=0.116, mean=0.122, max=0.128, sum=0.367 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Efficiency", - "score": 0.12234622395833335 - }, - "NaturalQuestions (open-book) - Denoised inference time (s)": { - "description": "min=0.166, mean=0.189, max=0.21, sum=0.566 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Efficiency", - "score": 0.18882224978298598 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets.
See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=110.254, mean=112.254, max=116.254, sum=336.762 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 112.254 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=5.376, mean=6.313, max=7.104, sum=18.94 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 6.3133333333333335 - }, - "NaturalQuestions (closed-book) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 3.0 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.647, mean=4.691, max=4.724, sum=14.074 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 4.691333333333334 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.036, mean=0.036, max=0.036, sum=0.108 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.036 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1231.212, mean=1419.574, max=1523.257, sum=4258.721 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives.
For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 1419.5736666666664 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=9.89, mean=12.581, max=15.337, sum=37.742 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 12.580666666666668 - }, - "NaturalQuestions (open-book) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 3.0 - }, - "NaturalQuestions (closed-book) - Stereotypes (race)": { - "description": "(0)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": null - }, - "NaturalQuestions (closed-book) - Stereotypes (gender)": { - "description": "min=0.5, mean=0.5, max=0.5, sum=1 (2)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.5 - }, - "NaturalQuestions (closed-book) - Representation (race)": { - "description": "min=0.291, mean=0.415, max=0.509, sum=1.245 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.4150858887700994 - }, - "NaturalQuestions (closed-book) - Representation (gender)": { - "description": "min=0.119, mean=0.203, max=0.25, sum=0.608 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. 
See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.20272601794340928 - }, - "NaturalQuestions (open-book) - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=0.667 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.6666666666666667 - }, - "NaturalQuestions (open-book) - Stereotypes (gender)": { - "description": "min=0.407, mean=0.469, max=0.5, sum=1.407 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.469047619047619 - }, - "NaturalQuestions (open-book) - Representation (race)": { - "description": "min=0.441, mean=0.453, max=0.467, sum=1.359 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.4528357579590976 - }, - "NaturalQuestions (open-book) - Representation (gender)": { - "description": "min=0.361, mean=0.379, max=0.397, sum=1.136 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.3786428074398272 - }, - "NaturalQuestions (closed-book) - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Toxicity", - "score": 0.0 - }, - "NaturalQuestions (open-book) - Toxic fraction": { - "description": "min=0.001, mean=0.002, max=0.003, sum=0.005 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. 
See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Toxicity", - "score": 0.0016666666666666668 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "QuAC", - "source_data": { - "dataset_name": "QuAC", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on QuAC", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.321, - "details": { - "description": "min=0.312, mean=0.321, max=0.335, sum=0.963 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Accuracy", - "QuAC - ECE (10-bin)": { - "description": "min=0.033, mean=0.043, max=0.055, sum=0.129 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Calibration", - "score": 0.04303687950629059 - }, - "QuAC - F1 (Robustness)": { - "description": "min=0.164, mean=0.171, max=0.178, sum=0.513 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Robustness", - "score": 0.1711623480279509 - }, - "QuAC - F1 (Fairness)": { - "description": "min=0.241, mean=0.243, max=0.245, sum=0.728 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Fairness", - "score": 0.24255939370982219 - }, - "QuAC - Denoised inference time (s)": { - "description": "min=0.31, mean=0.323, max=0.34, sum=0.968 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Efficiency", - "score": 0.32252038281250045 - }, - "QuAC - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. 
See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 1000.0 - }, - "QuAC - # train": { - "description": "min=0.845, mean=0.944, max=1.086, sum=2.833 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.9443333333333334 - }, - "QuAC - truncated": { - "description": "min=0.016, mean=0.016, max=0.016, sum=0.048 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.016 - }, - "QuAC - # prompt tokens": { - "description": "min=1625.523, mean=1644.831, max=1670.605, sum=4934.492 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 1644.8306666666667 - }, - "QuAC - # output tokens": { - "description": "min=29.104, mean=31.034, max=33.548, sum=93.102 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 31.034000000000002 - }, - "QuAC - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 3.0 - }, - "QuAC - Stereotypes (race)": { - "description": "min=0.633, mean=0.645, max=0.667, sum=1.936 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.6454545454545455 - }, - "QuAC - Stereotypes (gender)": { - "description": "min=0.426, mean=0.439, max=0.452, sum=1.317 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. 
See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.4390862600512319 - }, - "QuAC - Representation (race)": { - "description": "min=0.2, mean=0.246, max=0.271, sum=0.738 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.24599483204134365 - }, - "QuAC - Representation (gender)": { - "description": "min=0.226, mean=0.231, max=0.234, sum=0.693 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.23109052551695608 - }, - "QuAC - Toxic fraction": { - "description": "min=0.002, mean=0.003, max=0.003, sum=0.008 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Toxicity", - "score": 0.0026666666666666666 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "HellaSwag", - "source_data": { - "dataset_name": "HellaSwag", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on HellaSwag", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.682, - "details": { - "description": "min=0.682, mean=0.682, max=0.682, sum=0.682 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Accuracy", - "HellaSwag - ECE (10-bin)": { - "description": "min=0.25, mean=0.25, max=0.25, sum=0.25 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Calibration", - "score": 0.24965148877506194 - }, - "HellaSwag - EM (Robustness)": { - "description": "min=0.632, mean=0.632, max=0.632, sum=0.632 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. 
See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Robustness", - "score": 0.632 - }, - "HellaSwag - EM (Fairness)": { - "description": "min=0.522, mean=0.522, max=0.522, sum=0.522 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Fairness", - "score": 0.522 - }, - "HellaSwag - Denoised inference time (s)": { - "description": "min=0.084, mean=0.084, max=0.084, sum=0.084 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Efficiency", - "score": 0.08380637499999992 - }, - "HellaSwag - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 1000.0 - }, - "HellaSwag - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag - # prompt tokens": { - "description": "min=87.888, mean=87.888, max=87.888, sum=87.888 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 87.888 - }, - "HellaSwag - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. 
See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.502, - "details": { - "description": "min=0.502, mean=0.502, max=0.502, sum=0.502 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Accuracy", - "OpenbookQA - ECE (10-bin)": { - "description": "min=0.26, mean=0.26, max=0.26, sum=0.26 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Calibration", - "score": 0.25956257561884827 - }, - "OpenbookQA - EM (Robustness)": { - "description": "min=0.396, mean=0.396, max=0.396, sum=0.396 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Robustness", - "score": 0.396 - }, - "OpenbookQA - EM (Fairness)": { - "description": "min=0.43, mean=0.43, max=0.43, sum=0.43 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Fairness", - "score": 0.43 - }, - "OpenbookQA - Denoised inference time (s)": { - "description": "min=0.079, mean=0.079, max=0.079, sum=0.079 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. 
See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Efficiency", - "score": 0.07928820312499986 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=5.27, mean=5.27, max=5.27, sum=5.27 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 5.27 - }, - "OpenbookQA - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. 
See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "TruthfulQA", - "source_data": { - "dataset_name": "TruthfulQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on TruthfulQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.232, - "details": { - "description": "min=0.222, mean=0.232, max=0.251, sum=0.696 (3)", - "tab": "Accuracy", - "TruthfulQA - ECE (10-bin)": { - "description": "min=0.05, mean=0.062, max=0.072, sum=0.186 (3)", - "tab": "Calibration", - "score": 0.06204978796421436 - }, - "TruthfulQA - EM (Robustness)": { - "description": "min=0.167, mean=0.186, max=0.214, sum=0.557 (3)", - "tab": "Robustness", - "score": 0.1855249745158002 - }, - "TruthfulQA - EM (Fairness)": { - "description": "min=0.165, mean=0.186, max=0.216, sum=0.558 (3)", - "tab": "Fairness", - "score": 0.18603465851172274 - }, - "TruthfulQA - Denoised inference time (s)": { - "description": "min=0.093, mean=0.094, max=0.094, sum=0.281 (3)", - "tab": "Efficiency", - "score": 0.09360438168960249 - }, - "TruthfulQA - # eval": { - "description": "min=654, mean=654, max=654, sum=1962 (3)", - "tab": "General information", - "score": 654.0 - }, - "TruthfulQA - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "TruthfulQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "TruthfulQA - # prompt tokens": { - "description": "min=501.121, mean=511.121, max=529.121, sum=1533.362 (3)", - "tab": "General information", - "score": 511.12079510703364 - }, - "TruthfulQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=3 (3)", - "tab": "General information", - "score": 1.0 - }, - "TruthfulQA - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MS MARCO (TREC)", - "source_data": { - "dataset_name": "MS MARCO (TREC)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "NDCG@10 on MS MARCO (TREC)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3, - "details": { - "description": "min=0.279, mean=0.3, max=0.31, sum=0.899 (3)", - "tab": "Accuracy", - "MS MARCO (regular) - RR@10 (Robustness)": { - "description": "min=0.086, mean=0.11, max=0.14, sum=0.33 (3)", - "tab": "Robustness", - "score": 0.10991481481481481 - }, - "MS MARCO (TREC) - NDCG@10 (Robustness)": { - "description": "min=0.25, mean=0.253, max=0.254, sum=0.759 (3)", - "tab": "Robustness", - "score": 0.25287196320995325 - }, - "MS MARCO (regular) - RR@10 (Fairness)": { - "description": "min=0.119, mean=0.14, max=0.167, sum=0.42 (3)", - "tab": "Fairness", - "score": 0.14012791005291 - }, - "MS MARCO (TREC) - NDCG@10 (Fairness)": { - "description": "min=0.266, mean=0.284, max=0.295, sum=0.852 (3)", - "tab": "Fairness", - 
"score": 0.2838824123845733 - }, - "MS MARCO (regular) - Denoised inference time (s)": { - "description": "min=0.094, mean=0.094, max=0.095, sum=0.283 (3)", - "tab": "Efficiency", - "score": 0.09442029557291665 - }, - "MS MARCO (TREC) - Denoised inference time (s)": { - "description": "min=0.094, mean=0.095, max=0.097, sum=0.286 (3)", - "tab": "Efficiency", - "score": 0.09531934350775194 - }, - "MS MARCO (regular) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "MS MARCO (regular) - # train": { - "description": "min=2, mean=2, max=2, sum=6 (3)", - "tab": "General information", - "score": 2.0 - }, - "MS MARCO (regular) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "MS MARCO (regular) - # prompt tokens": { - "description": "min=495.232, mean=532.565, max=577.232, sum=1597.696 (3)", - "tab": "General information", - "score": 532.5653333333333 - }, - "MS MARCO (regular) - # output tokens": { - "description": "min=1.035, mean=1.112, max=1.183, sum=3.336 (3)", - "tab": "General information", - "score": 1.112 - }, - "MS MARCO (regular) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "MS MARCO (TREC) - # eval": { - "description": "min=43, mean=43, max=43, sum=129 (3)", - "tab": "General information", - "score": 43.0 - }, - "MS MARCO (TREC) - # train": { - "description": "min=2, mean=2, max=2, sum=6 (3)", - "tab": "General information", - "score": 2.0 - }, - "MS MARCO (TREC) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "MS MARCO (TREC) - # prompt tokens": { - "description": "min=478.488, mean=515.822, max=560.488, sum=1547.465 (3)", - "tab": "General information", - "score": 515.8217054263565 - }, - "MS MARCO (TREC) - # output tokens": { - "description": "min=1.093, mean=1.248, max=1.488, sum=3.744 (3)", - "tab": "General information", - "score": 1.248062015503876 - }, - "MS MARCO (TREC) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "MS MARCO (regular) - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": 
null - }, - "MS MARCO (TREC) - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CNN/DailyMail", - "source_data": { - "dataset_name": "CNN/DailyMail", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on CNN/DailyMail", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.113, - "details": { - "description": "min=0.038, mean=0.113, max=0.141, sum=0.789 (7)", - "tab": "Accuracy", - "CNN/DailyMail - Denoised inference time (s)": { - "description": "min=0.559, mean=0.623, max=0.691, sum=4.363 (7)", - "tab": "Efficiency", - "score": 0.6232588631080115 - }, - "CNN/DailyMail - # eval": { - "description": "min=466, mean=466, max=466, sum=3262 (7)", - "tab": "General information", - "score": 466.0 - }, - "CNN/DailyMail - # train": { - "description": "min=0, mean=4.286, max=5, sum=30 (7)", - "tab": "General information", - "score": 4.285714285714286 - }, - "CNN/DailyMail - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "CNN/DailyMail - # prompt tokens": { - "description": "min=583.586, mean=1411.872, max=1567.586, sum=9883.101 (7)", - "tab": "General information", - "score": 1411.8715511955854 - }, - "CNN/DailyMail - # output tokens": { - "description": "min=65.127, mean=74.606, max=84.073, sum=522.245 (7)", - "tab": "General information", - "score": 74.60637645616187 - }, - "CNN/DailyMail - # trials": { - "description": "min=1, mean=2.714, max=3, sum=19 (7)", - "tab": "General information", - "score": 2.7142857142857144 - }, - "CNN/DailyMail - Stereotypes (race)": { - "description": "min=0.619, mean=0.642, max=0.667, sum=4.492 (7)", - "tab": "Bias", - "score": 0.6416796928441896 - }, - "CNN/DailyMail - Stereotypes (gender)": { - "description": "min=0.383, mean=0.409, max=0.43, sum=2.86 (7)", - "tab": "Bias", - "score": 0.40861926379951435 - }, - "CNN/DailyMail - Representation (race)": { - "description": "min=0.238, mean=0.295, max=0.417, sum=2.068 (7)", - "tab": "Bias", - "score": 0.29545894187058713 - }, - "CNN/DailyMail - Representation (gender)": { - "description": "min=0.109, mean=0.129, max=0.144, sum=0.9 (7)", - "tab": "Bias", - "score": 0.12851266312443646 - }, - "CNN/DailyMail - Toxic fraction": { - "description": "min=0, mean=0.001, max=0.002, sum=0.004 (7)", - "tab": "Toxicity", - "score": 0.0006131207847946045 - }, - "CNN/DailyMail - SummaC": { - "description": "min=-0.108, mean=0.354, max=0.557, sum=1.415 (4)", - "tab": "Summarization metrics", - "score": 0.3538436304603978 - }, - "CNN/DailyMail - QAFactEval": { - "description": "min=1.248, mean=4.204, max=4.78, sum=29.431 (7)", - "tab": "Summarization metrics", - "score": 4.20445410382703 - }, - "CNN/DailyMail - BERTScore (F1)": { - "description": "min=-0.343, mean=0.089, max=0.264, sum=0.355 (4)", - "tab": "Summarization metrics", - "score": 0.08867060792677807 - }, - "CNN/DailyMail - Coverage": { - "description": "min=0.425, mean=0.89, max=0.973, sum=6.231 (7)", - "tab": "Summarization metrics", - "score": 0.8901263761958778 - }, - "CNN/DailyMail - Density": { - "description": "min=11.471, mean=23.472, max=34.455, sum=164.303 (7)", - "tab": 
"Summarization metrics", - "score": 23.471817181725523 - }, - "CNN/DailyMail - Compression": { - "description": "min=5.037, mean=9.495, max=12.229, sum=66.463 (7)", - "tab": "Summarization metrics", - "score": 9.494670330829432 - }, - "CNN/DailyMail - HumanEval-faithfulness": { - "description": "min=0.287, mean=0.287, max=0.287, sum=0.287 (1)", - "tab": "Summarization metrics", - "score": 0.2866666666666666 - }, - "CNN/DailyMail - HumanEval-relevance": { - "description": "min=1.933, mean=1.933, max=1.933, sum=1.933 (1)", - "tab": "Summarization metrics", - "score": 1.9333333333333333 - }, - "CNN/DailyMail - HumanEval-coherence": { - "description": "min=1.767, mean=1.767, max=1.767, sum=1.767 (1)", - "tab": "Summarization metrics", - "score": 1.7666666666666666 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "XSUM", - "source_data": { - "dataset_name": "XSUM", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on XSUM", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.091, - "details": { - "description": "min=0.035, mean=0.091, max=0.104, sum=0.636 (7)", - "tab": "Accuracy", - "XSUM - Denoised inference time (s)": { - "description": "min=0.274, mean=0.294, max=0.41, sum=2.059 (7)", - "tab": "Efficiency", - "score": 0.29416145294688817 - }, - "XSUM - # eval": { - "description": "min=518, mean=518, max=518, sum=3626 (7)", - "tab": "General information", - "score": 518.0 - }, - "XSUM - # train": { - "description": "min=0, mean=4.285, max=5, sum=29.992 (7)", - "tab": "General information", - "score": 4.284611141753999 - }, - "XSUM - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "XSUM - # prompt tokens": { - "description": "min=388.402, mean=1350.13, max=1538.921, sum=9450.911 (7)", - "tab": "General information", - "score": 1350.1301709873137 - }, - "XSUM - # output tokens": { - "description": "min=24.405, mean=27.757, max=46.521, sum=194.297 (7)", - "tab": "General information", - "score": 27.75675675675676 - }, - "XSUM - # trials": { - "description": "min=1, mean=2.714, max=3, sum=19 (7)", - "tab": "General information", - "score": 2.7142857142857144 - }, - "XSUM - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=3.333 (5)", - "tab": "Bias", - "score": 0.6666666666666667 - }, - "XSUM - Stereotypes (gender)": { - "description": "min=0.409, mean=0.449, max=0.488, sum=3.143 (7)", - "tab": "Bias", - "score": 0.44897893078382667 - }, - "XSUM - Representation (race)": { - "description": "min=0.446, mean=0.599, max=0.667, sum=4.196 (7)", - "tab": "Bias", - "score": 0.5994124922696351 - }, - "XSUM - Representation (gender)": { - "description": "min=0.169, mean=0.205, max=0.268, sum=1.435 (7)", - "tab": "Bias", - "score": 0.20496360887910145 - }, - "XSUM - Toxic fraction": { - "description": "min=0, mean=0.001, max=0.002, sum=0.004 (7)", - "tab": "Toxicity", - "score": 0.0005515719801434088 - }, - "XSUM - SummaC": { - "description": "min=-0.237, mean=-0.143, max=0.073, sum=-0.574 (4)", - "tab": "Summarization metrics", - "score": -0.14346265436541167 - }, - "XSUM - QAFactEval": { - "description": "min=2.914, mean=3.922, max=4.204, sum=27.454 (7)", - "tab": "Summarization metrics", - "score": 
3.9220091164391953 - }, - "XSUM - BERTScore (F1)": { - "description": "min=0.091, mean=0.313, max=0.388, sum=1.251 (4)", - "tab": "Summarization metrics", - "score": 0.312644368874429 - }, - "XSUM - Coverage": { - "description": "min=0.795, mean=0.815, max=0.823, sum=5.707 (7)", - "tab": "Summarization metrics", - "score": 0.8152742026902194 - }, - "XSUM - Density": { - "description": "min=2.849, mean=5.57, max=19.82, sum=38.989 (7)", - "tab": "Summarization metrics", - "score": 5.569907111767537 - }, - "XSUM - Compression": { - "description": "min=10.146, mean=17.018, max=18.474, sum=119.123 (7)", - "tab": "Summarization metrics", - "score": 17.01754099745573 - }, - "XSUM - HumanEval-faithfulness": { - "description": "min=0.773, mean=0.924, max=1, sum=2.773 (3)", - "tab": "Summarization metrics", - "score": 0.9244444444444445 - }, - "XSUM - HumanEval-relevance": { - "description": "min=3.387, mean=3.573, max=3.667, sum=10.72 (3)", - "tab": "Summarization metrics", - "score": 3.573333333333333 - }, - "XSUM - HumanEval-coherence": { - "description": "min=3.163, mean=4.166, max=4.667, sum=12.497 (3)", - "tab": "Summarization metrics", - "score": 4.165555555555556 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "IMDB", - "source_data": { - "dataset_name": "IMDB", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on IMDB", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.889, - "details": { - "description": "min=0.831, mean=0.889, max=0.939, sum=2.668 (3)", - "tab": "Accuracy", - "IMDB - ECE (10-bin)": { - "description": "min=0.231, mean=0.259, max=0.285, sum=0.776 (3)", - "tab": "Calibration", - "score": 0.25871248887630766 - }, - "IMDB - EM (Robustness)": { - "description": "min=0.716, mean=0.803, max=0.892, sum=2.41 (3)", - "tab": "Robustness", - "score": 0.8033333333333333 - }, - "IMDB - EM (Fairness)": { - "description": "min=0.792, mean=0.86, max=0.922, sum=2.581 (3)", - "tab": "Fairness", - "score": 0.8603333333333333 - }, - "IMDB - Denoised inference time (s)": { - "description": "min=0.105, mean=0.11, max=0.115, sum=0.331 (3)", - "tab": "Efficiency", - "score": 0.11035393728298622 - }, - "IMDB - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "IMDB - # train": { - "description": "min=2.916, mean=4.242, max=4.986, sum=12.726 (3)", - "tab": "General information", - "score": 4.242 - }, - "IMDB - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "IMDB - # prompt tokens": { - "description": "min=1277.729, mean=1553.363, max=1768.607, sum=4660.089 (3)", - "tab": "General information", - "score": 1553.363 - }, - "IMDB - # output tokens": { - "description": "min=1, mean=1, max=1, sum=3 (3)", - "tab": "General information", - "score": 1.0 - }, - "IMDB - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "IMDB - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - 
Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CivilComments", - "source_data": { - "dataset_name": "CivilComments", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on CivilComments", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.539, - "details": { - "description": "min=0.012, mean=0.539, max=1, sum=29.083 (54)", - "tab": "Accuracy", - "CivilComments - ECE (10-bin)": { - "description": "min=0.042, mean=0.293, max=0.601, sum=15.826 (54)", - "tab": "Calibration", - "score": 0.29307434802498333 - }, - "CivilComments - EM (Robustness)": { - "description": "min=0.002, mean=0.347, max=1, sum=18.748 (54)", - "tab": "Robustness", - "score": 0.3471901723680723 - }, - "CivilComments - EM (Fairness)": { - "description": "min=0, mean=0.412, max=1, sum=22.222 (54)", - "tab": "Fairness", - "score": 0.41152337126555366 - }, - "CivilComments - Denoised inference time (s)": { - "description": "min=0.09, mean=0.097, max=0.105, sum=5.259 (54)", - "tab": "Efficiency", - "score": 0.09739228545773865 - }, - "CivilComments - # eval": { - "description": "min=74, mean=371.556, max=683, sum=20064 (54)", - "tab": "General information", - "score": 371.55555555555554 - }, - "CivilComments - # train": { - "description": "min=5, mean=5, max=5, sum=270 (54)", - "tab": "General information", - "score": 5.0 - }, - "CivilComments - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (54)", - "tab": "General information", - "score": 0.0 - }, - "CivilComments - # prompt tokens": { - "description": "min=356.537, mean=722.635, max=1267.519, sum=39022.317 (54)", - "tab": "General information", - "score": 722.6354931173206 - }, - "CivilComments - # output tokens": { - "description": "min=1, mean=1, max=1, sum=54 (54)", - "tab": "General information", - "score": 1.0 - }, - "CivilComments - # trials": { - "description": "min=3, mean=3, max=3, sum=162 (54)", - "tab": "General information", - "score": 3.0 - }, - "CivilComments - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (54)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "RAFT", - "source_data": { - "dataset_name": "RAFT", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on RAFT", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.49, - "details": { - "description": "min=0, mean=0.49, max=0.975, sum=16.175 (33)", - "tab": "Accuracy", - "RAFT - ECE (10-bin)": { - "description": "min=0.054, mean=0.319, max=0.977, sum=10.54 (33)", - "tab": "Calibration", - "score": 0.31939577693629423 - }, - "RAFT - EM (Robustness)": { - "description": "min=0, mean=0.413, max=0.975, sum=13.625 (33)", - "tab": "Robustness", - "score": 0.4128787878787879 - }, - "RAFT - EM (Fairness)": { - "description": "min=0, mean=0.473, max=0.975, sum=15.625 (33)", - "tab": "Fairness", - "score": 0.4734848484848485 - }, - "RAFT - Denoised inference time (s)": { - "description": "min=0.094, mean=0.112, max=0.139, sum=3.696 (33)", - "tab": "Efficiency", - "score": 0.11198840159406566 - }, - "RAFT - # eval": { - "description": "min=40, mean=40, max=40, sum=1320 (33)", - "tab": "General information", - "score": 40.0 - }, - "RAFT - # train": { - "description": "min=0, mean=4.556, max=5, sum=150.35 (33)", - "tab": "General information", - "score": 4.556060606060607 - }, - "RAFT - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (33)", - "tab": "General information", - "score": 0.0 - }, - "RAFT - # prompt tokens": { - "description": "min=257.35, mean=812.938, max=1773.675, sum=26826.95 (33)", - "tab": "General information", - "score": 812.937878787879 - }, - "RAFT - # output tokens": { - "description": "min=0.025, mean=2.867, max=6.375, sum=94.6 (33)", - "tab": "General information", - "score": 2.8666666666666667 - }, - "RAFT - # trials": { - "description": "min=3, mean=3, max=3, sum=99 (33)", - "tab": "General information", - "score": 3.0 - }, - "RAFT - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (33)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_classic/openai/davinci-175B/bc207557-fb49-4a87-8401-22c3ce853e7c.json b/data/helm_classic/openai/davinci-175B/bc207557-fb49-4a87-8401-22c3ce853e7c.json deleted file mode 100644 index b376d2873..000000000 --- a/data/helm_classic/openai/davinci-175B/bc207557-fb49-4a87-8401-22c3ce853e7c.json +++ /dev/null @@ -1,1613 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_classic/openai_davinci-175B/1770834891.1472661", - "retrieved_timestamp": "1770834891.1472661", - "source_metadata": { - "source_name": "helm_classic", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "davinci 175B", - "id": "openai/davinci-175B", - "developer": "openai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_classic", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperform on average (over columns).", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.538, - "details": { - "tab": "Accuracy", - "Mean win rate - Calibration": { - "description": null, - "tab": "Calibration", - "score": 0.5745594499834401 - }, - "Mean win rate - Robustness": { - "description": null, - "tab": "Robustness", - "score": 0.5094878610451469 - }, - "Mean win rate - Fairness": { - "description": null, - "tab": "Fairness", - "score": 0.5578754949166518 - }, - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.557938596491228 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - }, - "Mean win rate - Bias": { - "description": null, - "tab": "Bias", - "score": 0.44460142486244675 - }, - "Mean win rate - Toxicity": { - "description": null, - "tab": "Toxicity", - "score": 0.42202673869340535 - }, - "Mean win rate - Summarization metrics": { - "description": null, - "tab": "Summarization metrics", - "score": 0.3600250626566416 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.422, - "details": { - "description": "min=0.26, mean=0.422, max=0.7, sum=6.336 (15)", - "tab": "Accuracy", - "MMLU - ECE (10-bin)": { - "description": "min=0.093, mean=0.132, max=0.18, sum=1.976 (15)", - "tab": "Calibration", - "score": 0.13175836488041992 - }, - "MMLU - EM (Robustness)": { - "description": "min=0.17, mean=0.34, max=0.6, sum=5.102 (15)", - "tab": "Robustness", - "score": 0.3401169590643275 - }, - "MMLU - EM (Fairness)": { - "description": "min=0.24, mean=0.38, max=0.61, sum=5.705 (15)", - "tab": "Fairness", - "score": 0.3803040935672514 - }, - "MMLU - Denoised inference time (s)": { - "description": "min=0.203, mean=0.212, max=0.221, sum=3.181 (15)", - "tab": "Efficiency", - "score": 0.21209971402138156 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=1542 (15)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=75 (15)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (15)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=371.38, mean=472.274, max=624.07, sum=7084.111 (15)", - "tab": "General information", - "score": 472.2740350877193 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=15 (15)", - "tab": "General information", - "score": 1.0 - }, - "MMLU - # trials": { - "description": "min=3, mean=3, max=3, sum=45 (15)", - "tab": "General information", - "score": 3.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "BoolQ", - "source_data": { - "dataset_name": "BoolQ", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on BoolQ", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.722, - "details": { - "description": "min=0.679, mean=0.722, max=0.77, sum=2.167 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Accuracy", - "BoolQ - ECE (10-bin)": { - "description": "min=0.047, mean=0.072, max=0.103, sum=0.215 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Calibration", - "score": 0.07164645838795872 - }, - "BoolQ - EM (Robustness)": { - "description": "min=0.592, mean=0.639, max=0.677, sum=1.918 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Robustness", - "score": 0.6393333333333334 - }, - "BoolQ - EM (Fairness)": { - "description": "min=0.635, mean=0.682, max=0.729, sum=2.046 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Fairness", - "score": 0.682 - }, - "BoolQ - Denoised inference time (s)": { - "description": "min=0.204, mean=0.21, max=0.217, sum=0.631 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Efficiency", - "score": 0.21022733463541673 - }, - "BoolQ - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 1000.0 - }, - "BoolQ - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. 
See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 5.0 - }, - "BoolQ - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.0 - }, - "BoolQ - # prompt tokens": { - "description": "min=660.073, mean=908.406, max=1242.073, sum=2725.219 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 908.4063333333334 - }, - "BoolQ - # output tokens": { - "description": "min=1, mean=1, max=1, sum=3 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 1.0 - }, - "BoolQ - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 3.0 - }, - "BoolQ - Stereotypes (race)": { - "description": "(0)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": null - }, - "BoolQ - Stereotypes (gender)": { - "description": "(0)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (race)": { - "description": "(0)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (gender)": { - "description": "(0)\n⚠ Brown et al. 
perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": null - }, - "BoolQ - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.687, - "details": { - "description": "min=0.664, mean=0.687, max=0.706, sum=2.061 (3)", - "tab": "Accuracy", - "NarrativeQA - ECE (10-bin)": { - "description": "min=0.041, mean=0.067, max=0.109, sum=0.202 (3)", - "tab": "Calibration", - "score": 0.06738212205854943 - }, - "NarrativeQA - F1 (Robustness)": { - "description": "min=0.476, mean=0.498, max=0.52, sum=1.493 (3)", - "tab": "Robustness", - "score": 0.4976057829109271 - }, - "NarrativeQA - F1 (Fairness)": { - "description": "min=0.556, mean=0.597, max=0.634, sum=1.791 (3)", - "tab": "Fairness", - "score": 0.5970096000459133 - }, - "NarrativeQA - Denoised inference time (s)": { - "description": "min=0.36, mean=0.369, max=0.384, sum=1.108 (3)", - "tab": "Efficiency", - "score": 0.3694498019366194 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=1065 (3)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=1.051, mean=1.647, max=2.085, sum=4.941 (3)", - "tab": "General information", - "score": 1.6469483568075116 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=1601.955, mean=1652.377, max=1705.003, sum=4957.132 (3)", - "tab": "General information", - "score": 1652.3774647887324 - }, - "NarrativeQA - # output tokens": { - "description": "min=5.338, mean=5.709, max=6.197, sum=17.127 (3)", - "tab": "General information", - "score": 5.708920187793427 - }, - "NarrativeQA - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NarrativeQA - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NarrativeQA - Stereotypes (gender)": { - "description": "min=0.4, mean=0.443, max=0.5, sum=1.329 (3)", - "tab": "Bias", - "score": 0.44285714285714284 - }, - "NarrativeQA - Representation (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=1.333 (2)", - "tab": "Bias", - "score": 0.6666666666666667 - }, - "NarrativeQA - Representation (gender)": { - "description": 
"min=0.199, mean=0.208, max=0.221, sum=0.623 (3)", - "tab": "Bias", - "score": 0.2075773756101625 - }, - "NarrativeQA - Toxic fraction": { - "description": "min=0.008, mean=0.012, max=0.014, sum=0.037 (3)", - "tab": "Toxicity", - "score": 0.012206572769953052 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (open-book)", - "source_data": { - "dataset_name": "NaturalQuestions (open-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (open-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.625, - "details": { - "description": "min=0.599, mean=0.625, max=0.65, sum=1.874 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Accuracy", - "NaturalQuestions (closed-book) - ECE (10-bin)": { - "description": "min=0.054, mean=0.061, max=0.07, sum=0.182 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Calibration", - "score": 0.06060614220397647 - }, - "NaturalQuestions (open-book) - ECE (10-bin)": { - "description": "min=0.06, mean=0.079, max=0.1, sum=0.236 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Calibration", - "score": 0.07854855230782792 - }, - "NaturalQuestions (closed-book) - F1 (Robustness)": { - "description": "min=0.251, mean=0.256, max=0.264, sum=0.769 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Robustness", - "score": 0.2562420226045557 - }, - "NaturalQuestions (open-book) - F1 (Robustness)": { - "description": "min=0.48, mean=0.521, max=0.561, sum=1.563 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. 
See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Robustness", - "score": 0.5211614334906893 - }, - "NaturalQuestions (closed-book) - F1 (Fairness)": { - "description": "min=0.271, mean=0.276, max=0.282, sum=0.828 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Fairness", - "score": 0.2760483569290458 - }, - "NaturalQuestions (open-book) - F1 (Fairness)": { - "description": "min=0.537, mean=0.567, max=0.594, sum=1.702 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Fairness", - "score": 0.5674897299434086 - }, - "NaturalQuestions (closed-book) - Denoised inference time (s)": { - "description": "min=0.304, mean=0.327, max=0.357, sum=0.981 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Efficiency", - "score": 0.32700476562499997 - }, - "NaturalQuestions (open-book) - Denoised inference time (s)": { - "description": "min=0.378, mean=0.462, max=0.583, sum=1.386 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Efficiency", - "score": 0.462036467447917 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. 
For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=110.254, mean=112.254, max=116.254, sum=336.762 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 112.254 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=4.601, mean=5.361, max=6.345, sum=16.082 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 5.360666666666667 - }, - "NaturalQuestions (closed-book) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 3.0 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.647, mean=4.691, max=4.724, sum=14.074 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 4.691333333333334 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.036, mean=0.036, max=0.036, sum=0.108 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. 
See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.036 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1231.212, mean=1419.574, max=1523.257, sum=4258.721 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 1419.5736666666664 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=6.369, mean=8.992, max=12.931, sum=26.977 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 8.992333333333333 - }, - "NaturalQuestions (open-book) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 3.0 - }, - "NaturalQuestions (closed-book) - Stereotypes (race)": { - "description": "(0)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": null - }, - "NaturalQuestions (closed-book) - Stereotypes (gender)": { - "description": "min=0.342, mean=0.447, max=0.5, sum=1.342 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.4472502805836139 - }, - "NaturalQuestions (closed-book) - Representation (race)": { - "description": "min=0.286, mean=0.382, max=0.439, sum=1.147 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.382401229992038 - }, - "NaturalQuestions (closed-book) - Representation (gender)": { - "description": "min=0.032, mean=0.247, max=0.4, sum=0.742 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. 
For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.24726062467997953 - }, - "NaturalQuestions (open-book) - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=1.333 (2)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "NaturalQuestions (open-book) - Stereotypes (gender)": { - "description": "min=0.293, mean=0.365, max=0.412, sum=1.096 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.3654871847728991 - }, - "NaturalQuestions (open-book) - Representation (race)": { - "description": "min=0.422, mean=0.435, max=0.447, sum=1.304 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.4346811201445348 - }, - "NaturalQuestions (open-book) - Representation (gender)": { - "description": "min=0.222, mean=0.244, max=0.271, sum=0.733 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.24420285420364105 - }, - "NaturalQuestions (closed-book) - Toxic fraction": { - "description": "min=0, mean=0.0, max=0.001, sum=0.001 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Toxicity", - "score": 0.0003333333333333333 - }, - "NaturalQuestions (open-book) - Toxic fraction": { - "description": "min=0, mean=0.001, max=0.002, sum=0.002 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. 
See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Toxicity", - "score": 0.0006666666666666666 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "QuAC", - "source_data": { - "dataset_name": "QuAC", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on QuAC", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.36, - "details": { - "description": "min=0.354, mean=0.36, max=0.367, sum=1.081 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Accuracy", - "QuAC - ECE (10-bin)": { - "description": "min=0.066, mean=0.068, max=0.071, sum=0.204 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Calibration", - "score": 0.06797808745527684 - }, - "QuAC - F1 (Robustness)": { - "description": "min=0.197, mean=0.208, max=0.217, sum=0.623 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Robustness", - "score": 0.20766668147064418 - }, - "QuAC - F1 (Fairness)": { - "description": "min=0.264, mean=0.279, max=0.288, sum=0.836 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Fairness", - "score": 0.27860575089348755 - }, - "QuAC - Denoised inference time (s)": { - "description": "min=1.01, mean=1.085, max=1.233, sum=3.256 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Efficiency", - "score": 1.085224210937499 - }, - "QuAC - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. 
See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 1000.0 - }, - "QuAC - # train": { - "description": "min=0.845, mean=0.944, max=1.086, sum=2.833 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.9443333333333334 - }, - "QuAC - truncated": { - "description": "min=0.016, mean=0.016, max=0.016, sum=0.048 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.016 - }, - "QuAC - # prompt tokens": { - "description": "min=1625.523, mean=1644.831, max=1670.605, sum=4934.492 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 1644.8306666666667 - }, - "QuAC - # output tokens": { - "description": "min=27.082, mean=29.572, max=34.534, sum=88.717 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 29.572333333333333 - }, - "QuAC - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 3.0 - }, - "QuAC - Stereotypes (race)": { - "description": "min=0.636, mean=0.65, max=0.667, sum=1.949 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.6495628554452085 - }, - "QuAC - Stereotypes (gender)": { - "description": "min=0.435, mean=0.445, max=0.455, sum=1.335 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. 
See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.4451588893133011 - }, - "QuAC - Representation (race)": { - "description": "min=0.354, mean=0.367, max=0.375, sum=1.1 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.366690749431994 - }, - "QuAC - Representation (gender)": { - "description": "min=0.244, mean=0.251, max=0.256, sum=0.754 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.25124249915688174 - }, - "QuAC - Toxic fraction": { - "description": "min=0, mean=0.0, max=0.001, sum=0.001 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Toxicity", - "score": 0.0003333333333333333 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "HellaSwag", - "source_data": { - "dataset_name": "HellaSwag", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on HellaSwag", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.775, - "details": { - "description": "min=0.775, mean=0.775, max=0.775, sum=0.775 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Accuracy", - "HellaSwag - ECE (10-bin)": { - "description": "min=0.31, mean=0.31, max=0.31, sum=0.31 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Calibration", - "score": 0.30968673998386337 - }, - "HellaSwag - EM (Robustness)": { - "description": "min=0.738, mean=0.738, max=0.738, sum=0.738 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. 
See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Robustness", - "score": 0.738 - }, - "HellaSwag - EM (Fairness)": { - "description": "min=0.641, mean=0.641, max=0.641, sum=0.641 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Fairness", - "score": 0.641 - }, - "HellaSwag - Denoised inference time (s)": { - "description": "min=0.193, mean=0.193, max=0.193, sum=0.193 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Efficiency", - "score": 0.19329937499999997 - }, - "HellaSwag - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 1000.0 - }, - "HellaSwag - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag - # prompt tokens": { - "description": "min=87.888, mean=87.888, max=87.888, sum=87.888 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 87.888 - }, - "HellaSwag - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. 
See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.586, - "details": { - "description": "min=0.586, mean=0.586, max=0.586, sum=0.586 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Accuracy", - "OpenbookQA - ECE (10-bin)": { - "description": "min=0.204, mean=0.204, max=0.204, sum=0.204 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Calibration", - "score": 0.20443749582919374 - }, - "OpenbookQA - EM (Robustness)": { - "description": "min=0.474, mean=0.474, max=0.474, sum=0.474 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Robustness", - "score": 0.474 - }, - "OpenbookQA - EM (Fairness)": { - "description": "min=0.502, mean=0.502, max=0.502, sum=0.502 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Fairness", - "score": 0.502 - }, - "OpenbookQA - Denoised inference time (s)": { - "description": "min=0.184, mean=0.184, max=0.184, sum=0.184 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. 
See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Efficiency", - "score": 0.18361757812499943 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=5.27, mean=5.27, max=5.27, sum=5.27 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 5.27 - }, - "OpenbookQA - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. 
See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "TruthfulQA", - "source_data": { - "dataset_name": "TruthfulQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on TruthfulQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.194, - "details": { - "description": "min=0.182, mean=0.194, max=0.213, sum=0.581 (3)", - "tab": "Accuracy", - "TruthfulQA - ECE (10-bin)": { - "description": "min=0.186, mean=0.211, max=0.224, sum=0.632 (3)", - "tab": "Calibration", - "score": 0.21061421693460983 - }, - "TruthfulQA - EM (Robustness)": { - "description": "min=0.131, mean=0.145, max=0.162, sum=0.434 (3)", - "tab": "Robustness", - "score": 0.14475025484199797 - }, - "TruthfulQA - EM (Fairness)": { - "description": "min=0.136, mean=0.155, max=0.185, sum=0.466 (3)", - "tab": "Fairness", - "score": 0.15545361875637104 - }, - "TruthfulQA - Denoised inference time (s)": { - "description": "min=0.208, mean=0.215, max=0.219, sum=0.645 (3)", - "tab": "Efficiency", - "score": 0.21492536613627675 - }, - "TruthfulQA - # eval": { - "description": "min=654, mean=654, max=654, sum=1962 (3)", - "tab": "General information", - "score": 654.0 - }, - "TruthfulQA - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "TruthfulQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "TruthfulQA - # prompt tokens": { - "description": "min=501.121, mean=511.121, max=529.121, sum=1533.362 (3)", - "tab": "General information", - "score": 511.12079510703364 - }, - "TruthfulQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=3 (3)", - "tab": "General information", - "score": 1.0 - }, - "TruthfulQA - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MS MARCO (TREC)", - "source_data": { - "dataset_name": "MS MARCO (TREC)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "NDCG@10 on MS MARCO (TREC)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.378, - "details": { - "description": "min=0.343, mean=0.378, max=0.397, sum=1.135 (3)", - "tab": "Accuracy", - "MS MARCO (regular) - RR@10 (Robustness)": { - "description": "min=0.15, mean=0.154, max=0.157, sum=0.462 (3)", - "tab": "Robustness", - "score": 0.15391111111111108 - }, - "MS MARCO (TREC) - NDCG@10 (Robustness)": { - "description": "min=0.31, mean=0.332, max=0.352, sum=0.996 (3)", - "tab": "Robustness", - "score": 0.3320850067305285 - }, - "MS MARCO (regular) - RR@10 (Fairness)": { - "description": "min=0.179, mean=0.185, max=0.192, sum=0.554 (3)", - "tab": "Fairness", - "score": 0.18462896825396802 - }, - "MS MARCO (TREC) - NDCG@10 (Fairness)": { - "description": "min=0.324, mean=0.357, max=0.375, sum=1.072 (3)", - "tab": 
"Fairness", - "score": 0.35718542292055805 - }, - "MS MARCO (regular) - Denoised inference time (s)": { - "description": "min=0.202, mean=0.211, max=0.218, sum=0.632 (3)", - "tab": "Efficiency", - "score": 0.21074697460937475 - }, - "MS MARCO (TREC) - Denoised inference time (s)": { - "description": "min=0.201, mean=0.214, max=0.221, sum=0.641 (3)", - "tab": "Efficiency", - "score": 0.2137389625726744 - }, - "MS MARCO (regular) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "MS MARCO (regular) - # train": { - "description": "min=2, mean=2, max=2, sum=6 (3)", - "tab": "General information", - "score": 2.0 - }, - "MS MARCO (regular) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "MS MARCO (regular) - # prompt tokens": { - "description": "min=495.232, mean=532.565, max=577.232, sum=1597.696 (3)", - "tab": "General information", - "score": 532.5653333333333 - }, - "MS MARCO (regular) - # output tokens": { - "description": "min=1, mean=1, max=1, sum=3 (3)", - "tab": "General information", - "score": 1.0 - }, - "MS MARCO (regular) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "MS MARCO (TREC) - # eval": { - "description": "min=43, mean=43, max=43, sum=129 (3)", - "tab": "General information", - "score": 43.0 - }, - "MS MARCO (TREC) - # train": { - "description": "min=2, mean=2, max=2, sum=6 (3)", - "tab": "General information", - "score": 2.0 - }, - "MS MARCO (TREC) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "MS MARCO (TREC) - # prompt tokens": { - "description": "min=478.488, mean=515.822, max=560.488, sum=1547.465 (3)", - "tab": "General information", - "score": 515.8217054263565 - }, - "MS MARCO (TREC) - # output tokens": { - "description": "min=1, mean=1, max=1, sum=3 (3)", - "tab": "General information", - "score": 1.0 - }, - "MS MARCO (TREC) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "MS MARCO (regular) - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - }, - "MS MARCO (TREC) - 
Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CNN/DailyMail", - "source_data": { - "dataset_name": "CNN/DailyMail", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on CNN/DailyMail", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.127, - "details": { - "description": "min=0.087, mean=0.127, max=0.14, sum=0.889 (7)", - "tab": "Accuracy", - "CNN/DailyMail - Denoised inference time (s)": { - "description": "min=1.919, mean=2.256, max=3.967, sum=15.789 (7)", - "tab": "Efficiency", - "score": 2.255577085568669 - }, - "CNN/DailyMail - # eval": { - "description": "min=466, mean=466, max=466, sum=3262 (7)", - "tab": "General information", - "score": 466.0 - }, - "CNN/DailyMail - # train": { - "description": "min=0, mean=4.286, max=5, sum=30 (7)", - "tab": "General information", - "score": 4.285714285714286 - }, - "CNN/DailyMail - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "CNN/DailyMail - # prompt tokens": { - "description": "min=583.586, mean=1411.872, max=1567.586, sum=9883.101 (7)", - "tab": "General information", - "score": 1411.8715511955854 - }, - "CNN/DailyMail - # output tokens": { - "description": "min=57.459, mean=68.76, max=126.343, sum=481.322 (7)", - "tab": "General information", - "score": 68.76026977314531 - }, - "CNN/DailyMail - # trials": { - "description": "min=1, mean=2.714, max=3, sum=19 (7)", - "tab": "General information", - "score": 2.7142857142857144 - }, - "CNN/DailyMail - Stereotypes (race)": { - "description": "min=0.579, mean=0.619, max=0.641, sum=4.33 (7)", - "tab": "Bias", - "score": 0.618631744195654 - }, - "CNN/DailyMail - Stereotypes (gender)": { - "description": "min=0.373, mean=0.401, max=0.418, sum=2.804 (7)", - "tab": "Bias", - "score": 0.4005751850408633 - }, - "CNN/DailyMail - Representation (race)": { - "description": "min=0.235, mean=0.301, max=0.378, sum=2.105 (7)", - "tab": "Bias", - "score": 0.3007554818500092 - }, - "CNN/DailyMail - Representation (gender)": { - "description": "min=0.111, mean=0.125, max=0.16, sum=0.876 (7)", - "tab": "Bias", - "score": 0.12511140031093898 - }, - "CNN/DailyMail - Toxic fraction": { - "description": "min=0, mean=0.002, max=0.002, sum=0.011 (7)", - "tab": "Toxicity", - "score": 0.0015328019619865114 - }, - "CNN/DailyMail - SummaC": { - "description": "min=-0.08, mean=0.321, max=0.532, sum=1.284 (4)", - "tab": "Summarization metrics", - "score": 0.321074205166444 - }, - "CNN/DailyMail - QAFactEval": { - "description": "min=2.929, mean=4.062, max=4.888, sum=28.435 (7)", - "tab": "Summarization metrics", - "score": 4.062076530805548 - }, - "CNN/DailyMail - BERTScore (F1)": { - "description": "min=0.023, mean=0.182, max=0.25, sum=0.729 (4)", - "tab": "Summarization metrics", - "score": 0.18232803102041212 - }, - "CNN/DailyMail - Coverage": { - "description": "min=0.72, mean=0.873, max=0.944, sum=6.111 (7)", - "tab": "Summarization metrics", - "score": 0.87307141297806 - }, - "CNN/DailyMail - Density": { - "description": "min=15.056, mean=17.914, max=20.184, sum=125.396 (7)", - "tab": "Summarization metrics", - "score": 
17.913710646412884 - }, - "CNN/DailyMail - Compression": { - "description": "min=4.761, mean=9.843, max=11.282, sum=68.899 (7)", - "tab": "Summarization metrics", - "score": 9.842721706219109 - }, - "CNN/DailyMail - HumanEval-faithfulness": { - "description": "min=0.763, mean=0.953, max=1, sum=4.763 (5)", - "tab": "Summarization metrics", - "score": 0.9526666666666668 - }, - "CNN/DailyMail - HumanEval-relevance": { - "description": "min=3.503, mean=4.501, max=5, sum=22.503 (5)", - "tab": "Summarization metrics", - "score": 4.500666666666667 - }, - "CNN/DailyMail - HumanEval-coherence": { - "description": "min=2.647, mean=3.863, max=4.667, sum=19.313 (5)", - "tab": "Summarization metrics", - "score": 3.862666666666667 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "XSUM", - "source_data": { - "dataset_name": "XSUM", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on XSUM", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.126, - "details": { - "description": "min=0.045, mean=0.126, max=0.144, sum=0.884 (7)", - "tab": "Accuracy", - "XSUM - Denoised inference time (s)": { - "description": "min=0.958, mean=1.148, max=2.074, sum=8.038 (7)", - "tab": "Efficiency", - "score": 1.1482822034007862 - }, - "XSUM - # eval": { - "description": "min=518, mean=518, max=518, sum=3626 (7)", - "tab": "General information", - "score": 518.0 - }, - "XSUM - # train": { - "description": "min=0, mean=4.285, max=5, sum=29.992 (7)", - "tab": "General information", - "score": 4.284611141753999 - }, - "XSUM - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "XSUM - # prompt tokens": { - "description": "min=388.402, mean=1350.13, max=1538.921, sum=9450.911 (7)", - "tab": "General information", - "score": 1350.1301709873137 - }, - "XSUM - # output tokens": { - "description": "min=25.444, mean=31.877, max=63.193, sum=223.139 (7)", - "tab": "General information", - "score": 31.87699944842802 - }, - "XSUM - # trials": { - "description": "min=1, mean=2.714, max=3, sum=19 (7)", - "tab": "General information", - "score": 2.7142857142857144 - }, - "XSUM - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=4.667 (7)", - "tab": "Bias", - "score": 0.6666666666666669 - }, - "XSUM - Stereotypes (gender)": { - "description": "min=0.427, mean=0.444, max=0.469, sum=3.111 (7)", - "tab": "Bias", - "score": 0.44436594684493835 - }, - "XSUM - Representation (race)": { - "description": "min=0.473, mean=0.564, max=0.667, sum=3.948 (7)", - "tab": "Bias", - "score": 0.5639808220453382 - }, - "XSUM - Representation (gender)": { - "description": "min=0.189, mean=0.217, max=0.251, sum=1.521 (7)", - "tab": "Bias", - "score": 0.21723674492179154 - }, - "XSUM - Toxic fraction": { - "description": "min=0, mean=0.003, max=0.015, sum=0.019 (7)", - "tab": "Toxicity", - "score": 0.0027578599007170436 - }, - "XSUM - SummaC": { - "description": "min=-0.317, mean=-0.267, max=-0.218, sum=-1.068 (4)", - "tab": "Summarization metrics", - "score": -0.2669066513504126 - }, - "XSUM - QAFactEval": { - "description": "min=1.878, mean=2.338, max=2.635, sum=16.363 (7)", - "tab": "Summarization metrics", - "score": 2.337582859954366 - }, - "XSUM - BERTScore (F1)": { - 
"description": "min=0.063, mean=0.318, max=0.423, sum=1.272 (4)", - "tab": "Summarization metrics", - "score": 0.3179425085241978 - }, - "XSUM - Coverage": { - "description": "min=0.698, mean=0.751, max=0.774, sum=5.255 (7)", - "tab": "Summarization metrics", - "score": 0.7506856271565006 - }, - "XSUM - Density": { - "description": "min=2.081, mean=3.351, max=10.076, sum=23.459 (7)", - "tab": "Summarization metrics", - "score": 3.3513024292310853 - }, - "XSUM - Compression": { - "description": "min=7.668, mean=14.08, max=15.293, sum=98.56 (7)", - "tab": "Summarization metrics", - "score": 14.079969364330754 - }, - "XSUM - HumanEval-faithfulness": { - "description": "min=0.5, mean=0.829, max=1, sum=5.803 (7)", - "tab": "Summarization metrics", - "score": 0.8290476190476191 - }, - "XSUM - HumanEval-relevance": { - "description": "min=2.833, mean=4.075, max=5, sum=28.523 (7)", - "tab": "Summarization metrics", - "score": 4.074761904761905 - }, - "XSUM - HumanEval-coherence": { - "description": "min=2.167, mean=3.398, max=5, sum=23.783 (7)", - "tab": "Summarization metrics", - "score": 3.397619047619048 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "IMDB", - "source_data": { - "dataset_name": "IMDB", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on IMDB", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.933, - "details": { - "description": "min=0.925, mean=0.933, max=0.942, sum=2.8 (3)", - "tab": "Accuracy", - "IMDB - ECE (10-bin)": { - "description": "min=0.104, mean=0.126, max=0.166, sum=0.378 (3)", - "tab": "Calibration", - "score": 0.12610548329130192 - }, - "IMDB - EM (Robustness)": { - "description": "min=0.855, mean=0.873, max=0.89, sum=2.62 (3)", - "tab": "Robustness", - "score": 0.8733333333333334 - }, - "IMDB - EM (Fairness)": { - "description": "min=0.917, mean=0.921, max=0.923, sum=2.762 (3)", - "tab": "Fairness", - "score": 0.9206666666666669 - }, - "IMDB - Denoised inference time (s)": { - "description": "min=0.218, mean=0.225, max=0.231, sum=0.676 (3)", - "tab": "Efficiency", - "score": 0.22547806217447905 - }, - "IMDB - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "IMDB - # train": { - "description": "min=2.916, mean=4.242, max=4.986, sum=12.726 (3)", - "tab": "General information", - "score": 4.242 - }, - "IMDB - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "IMDB - # prompt tokens": { - "description": "min=1277.729, mean=1553.363, max=1768.607, sum=4660.089 (3)", - "tab": "General information", - "score": 1553.363 - }, - "IMDB - # output tokens": { - "description": "min=1, mean=1, max=1, sum=3 (3)", - "tab": "General information", - "score": 1.0 - }, - "IMDB - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "IMDB - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (race)": { - "description": "1 matching runs, but no matching 
metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CivilComments", - "source_data": { - "dataset_name": "CivilComments", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on CivilComments", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.532, - "details": { - "description": "min=0.006, mean=0.532, max=1, sum=28.723 (54)", - "tab": "Accuracy", - "CivilComments - ECE (10-bin)": { - "description": "min=0.083, mean=0.396, max=0.664, sum=21.389 (54)", - "tab": "Calibration", - "score": 0.3960964912577608 - }, - "CivilComments - EM (Robustness)": { - "description": "min=0, mean=0.461, max=1, sum=24.899 (54)", - "tab": "Robustness", - "score": 0.461098863197608 - }, - "CivilComments - EM (Fairness)": { - "description": "min=0.003, mean=0.478, max=1, sum=25.83 (54)", - "tab": "Fairness", - "score": 0.4783299102254815 - }, - "CivilComments - Denoised inference time (s)": { - "description": "min=0.203, mean=0.21, max=0.218, sum=11.326 (54)", - "tab": "Efficiency", - "score": 0.20974755918568705 - }, - "CivilComments - # eval": { - "description": "min=74, mean=371.556, max=683, sum=20064 (54)", - "tab": "General information", - "score": 371.55555555555554 - }, - "CivilComments - # train": { - "description": "min=5, mean=5, max=5, sum=270 (54)", - "tab": "General information", - "score": 5.0 - }, - "CivilComments - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (54)", - "tab": "General information", - "score": 0.0 - }, - "CivilComments - # prompt tokens": { - "description": "min=356.537, mean=722.635, max=1267.519, sum=39022.317 (54)", - "tab": "General information", - "score": 722.6354931173206 - }, - "CivilComments - # output tokens": { - "description": "min=0.998, mean=1.0, max=1.001, sum=54.0 (54)", - "tab": "General information", - "score": 0.9999957802714455 - }, - "CivilComments - # trials": { - "description": "min=3, mean=3, max=3, sum=162 (54)", - "tab": "General information", - "score": 3.0 - }, - "CivilComments - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (54)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "RAFT", - "source_data": { - "dataset_name": "RAFT", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on RAFT", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.642, - "details": { - "description": "min=0.1, mean=0.642, max=0.975, sum=21.2 (33)", - "tab": "Accuracy", - "RAFT - ECE (10-bin)": { - "description": "min=0.073, mean=0.222, max=0.806, sum=7.328 (33)", - "tab": "Calibration", - "score": 0.22206849861217967 - }, - "RAFT - EM (Robustness)": { - "description": "min=0.025, mean=0.505, max=0.975, sum=16.65 (33)", - "tab": "Robustness", - "score": 0.5045454545454545 - }, - "RAFT - EM (Fairness)": { - "description": "min=0.05, mean=0.605, max=0.975, sum=19.95 (33)", - "tab": "Fairness", - "score": 0.6045454545454545 - }, - "RAFT - Denoised inference time (s)": { - "description": "min=0.213, mean=0.279, max=0.378, sum=9.22 (33)", - "tab": "Efficiency", - "score": 0.2793995279947917 - }, - "RAFT - # eval": { - "description": "min=40, mean=40, max=40, sum=1320 (33)", - "tab": "General information", - "score": 40.0 - }, - "RAFT - # train": { - "description": "min=0, mean=4.556, max=5, sum=150.35 (33)", - "tab": "General information", - "score": 4.556060606060607 - }, - "RAFT - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (33)", - "tab": "General information", - "score": 0.0 - }, - "RAFT - # prompt tokens": { - "description": "min=257.35, mean=812.938, max=1773.675, sum=26826.95 (33)", - "tab": "General information", - "score": 812.937878787879 - }, - "RAFT - # output tokens": { - "description": "min=0.3, mean=3.056, max=6.575, sum=100.85 (33)", - "tab": "General information", - "score": 3.056060606060606 - }, - "RAFT - # trials": { - "description": "min=3, mean=3, max=3, sum=99 (33)", - "tab": "General information", - "score": 3.0 - }, - "RAFT - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (33)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_classic/openai/gpt-3.5-turbo-0301/895266ee-71a5-4ca5-b3f9-62df6383ff95.json b/data/helm_classic/openai/gpt-3.5-turbo-0301/895266ee-71a5-4ca5-b3f9-62df6383ff95.json deleted file mode 100644 index 8051b9b3e..000000000 --- a/data/helm_classic/openai/gpt-3.5-turbo-0301/895266ee-71a5-4ca5-b3f9-62df6383ff95.json +++ /dev/null @@ -1,1613 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_classic/openai_gpt-3.5-turbo-0301/1770834891.1472661", - "retrieved_timestamp": "1770834891.1472661", - "source_metadata": { - "source_name": "helm_classic", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "gpt-3.5-turbo-0301", - "id": "openai/gpt-3.5-turbo-0301", - "developer": "openai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_classic", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperform on average (over columns).", - "lower_is_better": 
false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.76, - "details": { - "tab": "Accuracy", - "Mean win rate - Calibration": { - "description": null, - "tab": "Calibration", - "score": null - }, - "Mean win rate - Robustness": { - "description": null, - "tab": "Robustness", - "score": 0.8156643356643357 - }, - "Mean win rate - Fairness": { - "description": null, - "tab": "Fairness", - "score": 0.6617249417249418 - }, - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": null - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - }, - "Mean win rate - Bias": { - "description": null, - "tab": "Bias", - "score": 0.5128923320135726 - }, - "Mean win rate - Toxicity": { - "description": null, - "tab": "Toxicity", - "score": 0.8050116550116551 - }, - "Mean win rate - Summarization metrics": { - "description": null, - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.59, - "details": { - "description": "min=0.3, mean=0.59, max=0.85, sum=2.949 (5)", - "tab": "Accuracy", - "MMLU - ECE (10-bin)": { - "description": "5 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "MMLU - EM (Robustness)": { - "description": "min=0.23, mean=0.525, max=0.79, sum=2.627 (5)", - "tab": "Robustness", - "score": 0.5254736842105263 - }, - "MMLU - EM (Fairness)": { - "description": "min=0.26, mean=0.53, max=0.8, sum=2.65 (5)", - "tab": "Fairness", - "score": 0.5299649122807017 - }, - "MMLU - Denoised inference time (s)": { - "description": "5 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=366.44, mean=460.72, max=607.43, sum=2303.6 (5)", - "tab": "General information", - "score": 460.71996491228066 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1.012, max=1.06, sum=5.06 (5)", - "tab": "General information", - "score": 1.012 - }, - "MMLU - # trials": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "BoolQ", - "source_data": { - "dataset_name": "BoolQ", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on BoolQ", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { 
- "score": 0.74, - "details": { - "description": "min=0.74, mean=0.74, max=0.74, sum=0.74 (1)", - "tab": "Accuracy", - "BoolQ - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "BoolQ - EM (Robustness)": { - "description": "min=0.66, mean=0.66, max=0.66, sum=0.66 (1)", - "tab": "Robustness", - "score": 0.66 - }, - "BoolQ - EM (Fairness)": { - "description": "min=0.666, mean=0.666, max=0.666, sum=0.666 (1)", - "tab": "Fairness", - "score": 0.666 - }, - "BoolQ - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "BoolQ - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "BoolQ - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "BoolQ - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "BoolQ - # prompt tokens": { - "description": "min=1220.329, mean=1220.329, max=1220.329, sum=1220.329 (1)", - "tab": "General information", - "score": 1220.329 - }, - "BoolQ - # output tokens": { - "description": "min=1.932, mean=1.932, max=1.932, sum=1.932 (1)", - "tab": "General information", - "score": 1.932 - }, - "BoolQ - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "BoolQ - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (gender)": { - "description": "min=0.5, mean=0.5, max=0.5, sum=0.5 (1)", - "tab": "Bias", - "score": 0.5 - }, - "BoolQ - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.663, - "details": { - "description": "min=0.663, mean=0.663, max=0.663, sum=0.663 (1)", - "tab": "Accuracy", - "NarrativeQA - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "NarrativeQA - F1 (Robustness)": { - "description": "min=0.602, mean=0.602, max=0.602, sum=0.602 (1)", - "tab": "Robustness", - "score": 0.6017866194784781 - }, - "NarrativeQA - F1 (Fairness)": { - "description": "min=0.585, mean=0.585, max=0.585, sum=0.585 (1)", - "tab": "Fairness", - "score": 0.5846601621436455 - }, - "NarrativeQA - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=4.966, mean=4.966, 
max=4.966, sum=4.966 (1)", - "tab": "General information", - "score": 4.966197183098592 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=3443.349, mean=3443.349, max=3443.349, sum=3443.349 (1)", - "tab": "General information", - "score": 3443.349295774648 - }, - "NarrativeQA - # output tokens": { - "description": "min=11.186, mean=11.186, max=11.186, sum=11.186 (1)", - "tab": "General information", - "score": 11.185915492957747 - }, - "NarrativeQA - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NarrativeQA - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NarrativeQA - Stereotypes (gender)": { - "description": "min=0.479, mean=0.479, max=0.479, sum=0.479 (1)", - "tab": "Bias", - "score": 0.4789473684210526 - }, - "NarrativeQA - Representation (race)": { - "description": "min=0.333, mean=0.333, max=0.333, sum=0.333 (1)", - "tab": "Bias", - "score": 0.33333333333333337 - }, - "NarrativeQA - Representation (gender)": { - "description": "min=0.216, mean=0.216, max=0.216, sum=0.216 (1)", - "tab": "Bias", - "score": 0.21590909090909088 - }, - "NarrativeQA - Toxic fraction": { - "description": "min=0.011, mean=0.011, max=0.011, sum=0.011 (1)", - "tab": "Toxicity", - "score": 0.011267605633802818 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (open-book)", - "source_data": { - "dataset_name": "NaturalQuestions (open-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (open-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.624, - "details": { - "description": "min=0.624, mean=0.624, max=0.624, sum=0.624 (1)", - "tab": "Accuracy", - "NaturalQuestions (closed-book) - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "NaturalQuestions (open-book) - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "NaturalQuestions (closed-book) - F1 (Robustness)": { - "description": "min=0.327, mean=0.327, max=0.327, sum=0.327 (1)", - "tab": "Robustness", - "score": 0.32682585209770315 - }, - "NaturalQuestions (open-book) - F1 (Robustness)": { - "description": "min=0.556, mean=0.556, max=0.556, sum=0.556 (1)", - "tab": "Robustness", - "score": 0.5559619230719722 - }, - "NaturalQuestions (closed-book) - F1 (Fairness)": { - "description": "min=0.331, mean=0.331, max=0.331, sum=0.331 (1)", - "tab": "Fairness", - "score": 0.3309794595447127 - }, - "NaturalQuestions (open-book) - F1 (Fairness)": { - "description": "min=0.559, mean=0.559, max=0.559, sum=0.559 (1)", - "tab": "Fairness", - "score": 0.5593911419045751 - }, - "NaturalQuestions (closed-book) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NaturalQuestions (open-book) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - 
"NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=112.127, mean=112.127, max=112.127, sum=112.127 (1)", - "tab": "General information", - "score": 112.127 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=16.241, mean=16.241, max=16.241, sum=16.241 (1)", - "tab": "General information", - "score": 16.241 - }, - "NaturalQuestions (closed-book) - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.887, mean=4.887, max=4.887, sum=4.887 (1)", - "tab": "General information", - "score": 4.887 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.019, mean=0.019, max=0.019, sum=0.019 (1)", - "tab": "General information", - "score": 0.019 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1590.821, mean=1590.821, max=1590.821, sum=1590.821 (1)", - "tab": "General information", - "score": 1590.821 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=12.998, mean=12.998, max=12.998, sum=12.998 (1)", - "tab": "General information", - "score": 12.998 - }, - "NaturalQuestions (open-book) - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NaturalQuestions (closed-book) - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NaturalQuestions (closed-book) - Stereotypes (gender)": { - "description": "min=0.353, mean=0.353, max=0.353, sum=0.353 (1)", - "tab": "Bias", - "score": 0.35333333333333333 - }, - "NaturalQuestions (closed-book) - Representation (race)": { - "description": "min=0.364, mean=0.364, max=0.364, sum=0.364 (1)", - "tab": "Bias", - "score": 0.3643410852713178 - }, - "NaturalQuestions (closed-book) - Representation (gender)": { - "description": "min=0.167, mean=0.167, max=0.167, sum=0.167 (1)", - "tab": "Bias", - "score": 0.16666666666666669 - }, - "NaturalQuestions (open-book) - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=0.667 (1)", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "NaturalQuestions (open-book) - Stereotypes (gender)": { - "description": "min=0.5, mean=0.5, max=0.5, sum=0.5 (1)", - "tab": "Bias", - "score": 0.5 - }, - "NaturalQuestions (open-book) - Representation (race)": { - "description": "min=0.408, mean=0.408, max=0.408, sum=0.408 (1)", - "tab": "Bias", - "score": 0.4083885209713024 - }, - "NaturalQuestions (open-book) - Representation (gender)": { - "description": "min=0.236, mean=0.236, max=0.236, sum=0.236 (1)", - "tab": "Bias", - "score": 0.23584905660377362 - }, - "NaturalQuestions (closed-book) - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "Toxicity", - "score": 0.0 - }, - "NaturalQuestions 
(open-book) - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "QuAC", - "source_data": { - "dataset_name": "QuAC", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on QuAC", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.512, - "details": { - "description": "min=0.512, mean=0.512, max=0.512, sum=0.512 (1)", - "tab": "Accuracy", - "QuAC - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "QuAC - F1 (Robustness)": { - "description": "min=0.411, mean=0.411, max=0.411, sum=0.411 (1)", - "tab": "Robustness", - "score": 0.41122249859183385 - }, - "QuAC - F1 (Fairness)": { - "description": "min=0.417, mean=0.417, max=0.417, sum=0.417 (1)", - "tab": "Fairness", - "score": 0.4167691534016683 - }, - "QuAC - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "QuAC - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "QuAC - # train": { - "description": "min=3.871, mean=3.871, max=3.871, sum=3.871 (1)", - "tab": "General information", - "score": 3.871 - }, - "QuAC - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "QuAC - # prompt tokens": { - "description": "min=3461.981, mean=3461.981, max=3461.981, sum=3461.981 (1)", - "tab": "General information", - "score": 3461.981 - }, - "QuAC - # output tokens": { - "description": "min=23.136, mean=23.136, max=23.136, sum=23.136 (1)", - "tab": "General information", - "score": 23.136 - }, - "QuAC - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "QuAC - Stereotypes (race)": { - "description": "min=0.639, mean=0.639, max=0.639, sum=0.639 (1)", - "tab": "Bias", - "score": 0.638888888888889 - }, - "QuAC - Stereotypes (gender)": { - "description": "min=0.403, mean=0.403, max=0.403, sum=0.403 (1)", - "tab": "Bias", - "score": 0.40322916666666675 - }, - "QuAC - Representation (race)": { - "description": "min=0.436, mean=0.436, max=0.436, sum=0.436 (1)", - "tab": "Bias", - "score": 0.43589743589743585 - }, - "QuAC - Representation (gender)": { - "description": "min=0.229, mean=0.229, max=0.229, sum=0.229 (1)", - "tab": "Bias", - "score": 0.22941176470588232 - }, - "QuAC - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "HellaSwag", - "source_data": { - "dataset_name": "HellaSwag", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on HellaSwag", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "HellaSwag - ECE 
(10-bin)": { - "description": "No matching runs", - "tab": "Calibration", - "score": null - }, - "HellaSwag - EM (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "HellaSwag - EM (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "HellaSwag - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "HellaSwag - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "OpenbookQA - ECE (10-bin)": { - "description": "No matching runs", - "tab": "Calibration", - "score": null - }, - "OpenbookQA - EM (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "OpenbookQA - EM (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "OpenbookQA - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "OpenbookQA - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "TruthfulQA", - "source_data": { - "dataset_name": "TruthfulQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on TruthfulQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.609, - "details": { - "description": "min=0.609, mean=0.609, 
max=0.609, sum=0.609 (1)", - "tab": "Accuracy", - "TruthfulQA - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "TruthfulQA - EM (Robustness)": { - "description": "min=0.566, mean=0.566, max=0.566, sum=0.566 (1)", - "tab": "Robustness", - "score": 0.5657492354740061 - }, - "TruthfulQA - EM (Fairness)": { - "description": "min=0.514, mean=0.514, max=0.514, sum=0.514 (1)", - "tab": "Fairness", - "score": 0.5137614678899083 - }, - "TruthfulQA - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "TruthfulQA - # eval": { - "description": "min=654, mean=654, max=654, sum=654 (1)", - "tab": "General information", - "score": 654.0 - }, - "TruthfulQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "TruthfulQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "TruthfulQA - # prompt tokens": { - "description": "min=464.434, mean=464.434, max=464.434, sum=464.434 (1)", - "tab": "General information", - "score": 464.434250764526 - }, - "TruthfulQA - # output tokens": { - "description": "min=1.047, mean=1.047, max=1.047, sum=1.047 (1)", - "tab": "General information", - "score": 1.047400611620795 - }, - "TruthfulQA - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MS MARCO (TREC)", - "source_data": { - "dataset_name": "MS MARCO (TREC)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "NDCG@10 on MS MARCO (TREC)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "MS MARCO (regular) - RR@10 (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "MS MARCO (TREC) - NDCG@10 (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "MS MARCO (regular) - RR@10 (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "MS MARCO (TREC) - NDCG@10 (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "MS MARCO (regular) - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "MS MARCO (TREC) - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "MS MARCO (regular) - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # output tokens": { - "description": "No matching runs", - "tab": "General 
information", - "score": null - }, - "MS MARCO (regular) - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - }, - "MS MARCO (TREC) - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CNN/DailyMail", - "source_data": { - "dataset_name": "CNN/DailyMail", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on CNN/DailyMail", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "CNN/DailyMail - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "CNN/DailyMail - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # trials": { - 
"description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - }, - "CNN/DailyMail - SummaC": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - QAFactEval": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - BERTScore (F1)": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - Coverage": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - Density": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - Compression": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-faithfulness": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-relevance": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-coherence": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "XSUM", - "source_data": { - "dataset_name": "XSUM", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on XSUM", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "XSUM - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "XSUM - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - 
"score": null - }, - "XSUM - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - }, - "XSUM - SummaC": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - QAFactEval": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - BERTScore (F1)": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - Coverage": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - Density": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - Compression": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-faithfulness": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-relevance": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-coherence": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "IMDB", - "source_data": { - "dataset_name": "IMDB", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on IMDB", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.899, - "details": { - "description": "min=0.899, mean=0.899, max=0.899, sum=0.899 (1)", - "tab": "Accuracy", - "IMDB - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "IMDB - EM (Robustness)": { - "description": "min=0.857, mean=0.857, max=0.857, sum=0.857 (1)", - "tab": "Robustness", - "score": 0.857 - }, - "IMDB - EM (Fairness)": { - "description": "min=0.844, mean=0.844, max=0.844, sum=0.844 (1)", - "tab": "Fairness", - "score": 0.844 - }, - "IMDB - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "IMDB - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "IMDB - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "IMDB - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IMDB - # prompt tokens": { - "description": "min=2543.665, mean=2543.665, max=2543.665, sum=2543.665 (1)", - "tab": "General information", - "score": 2543.665 - }, - "IMDB - # output tokens": { - "description": "min=1.006, mean=1.006, max=1.006, sum=1.006 (1)", - "tab": "General information", - "score": 1.006 - }, - "IMDB - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "IMDB - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Stereotypes (gender)": { - "description": "1 matching runs, but no matching 
metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CivilComments", - "source_data": { - "dataset_name": "CivilComments", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on CivilComments", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.674, - "details": { - "description": "min=0.528, mean=0.674, max=0.824, sum=12.134 (18)", - "tab": "Accuracy", - "CivilComments - ECE (10-bin)": { - "description": "9 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "CivilComments - EM (Robustness)": { - "description": "min=0.394, mean=0.605, max=0.824, sum=10.882 (18)", - "tab": "Robustness", - "score": 0.6045521523734413 - }, - "CivilComments - EM (Fairness)": { - "description": "min=0.024, mean=0.422, max=0.824, sum=7.597 (18)", - "tab": "Fairness", - "score": 0.4220761773099496 - }, - "CivilComments - Denoised inference time (s)": { - "description": "9 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "CivilComments - # eval": { - "description": "min=74, mean=371.556, max=683, sum=6688 (18)", - "tab": "General information", - "score": 371.55555555555554 - }, - "CivilComments - # train": { - "description": "min=5, mean=5, max=5, sum=90 (18)", - "tab": "General information", - "score": 5.0 - }, - "CivilComments - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (18)", - "tab": "General information", - "score": 0.0 - }, - "CivilComments - # prompt tokens": { - "description": "min=333.915, mean=733.362, max=1226.723, sum=13200.513 (18)", - "tab": "General information", - "score": 733.3618295565135 - }, - "CivilComments - # output tokens": { - "description": "min=1, mean=1.023, max=1.103, sum=18.406 (18)", - "tab": "General information", - "score": 1.0225713328901465 - }, - "CivilComments - # trials": { - "description": "min=1, mean=1, max=1, sum=18 (18)", - "tab": "General information", - "score": 1.0 - }, - "CivilComments - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Toxic fraction": { - "description": "9 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "RAFT", - "source_data": { - "dataset_name": "RAFT", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - 
"evaluation_description": "EM on RAFT", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.768, - "details": { - "description": "min=0.3, mean=0.768, max=0.975, sum=8.45 (11)", - "tab": "Accuracy", - "RAFT - ECE (10-bin)": { - "description": "11 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "RAFT - EM (Robustness)": { - "description": "min=0.2, mean=0.705, max=0.975, sum=7.75 (11)", - "tab": "Robustness", - "score": 0.7045454545454546 - }, - "RAFT - EM (Fairness)": { - "description": "min=0.025, mean=0.689, max=0.975, sum=7.575 (11)", - "tab": "Fairness", - "score": 0.6886363636363636 - }, - "RAFT - Denoised inference time (s)": { - "description": "11 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "RAFT - # eval": { - "description": "min=40, mean=40, max=40, sum=440 (11)", - "tab": "General information", - "score": 40.0 - }, - "RAFT - # train": { - "description": "min=3, mean=4.818, max=5, sum=53 (11)", - "tab": "General information", - "score": 4.818181818181818 - }, - "RAFT - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (11)", - "tab": "General information", - "score": 0.0 - }, - "RAFT - # prompt tokens": { - "description": "min=252.275, mean=1002.239, max=3545.1, sum=11024.625 (11)", - "tab": "General information", - "score": 1002.2386363636365 - }, - "RAFT - # output tokens": { - "description": "min=1.325, mean=2.982, max=5, sum=32.8 (11)", - "tab": "General information", - "score": 2.9818181818181815 - }, - "RAFT - # trials": { - "description": "min=1, mean=1, max=1, sum=11 (11)", - "tab": "General information", - "score": 1.0 - }, - "RAFT - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Toxic fraction": { - "description": "11 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_classic/openai/gpt-3.5-turbo-0613/8828e9e8-5716-41b4-a2d1-233bb056dc32.json b/data/helm_classic/openai/gpt-3.5-turbo-0613/8828e9e8-5716-41b4-a2d1-233bb056dc32.json deleted file mode 100644 index b2682e6f7..000000000 --- a/data/helm_classic/openai/gpt-3.5-turbo-0613/8828e9e8-5716-41b4-a2d1-233bb056dc32.json +++ /dev/null @@ -1,1613 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_classic/openai_gpt-3.5-turbo-0613/1770834891.1472661", - "retrieved_timestamp": "1770834891.1472661", - "source_metadata": { - "source_name": "helm_classic", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "gpt-3.5-turbo-0613", - "id": "openai/gpt-3.5-turbo-0613", - "developer": "openai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_classic", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How 
many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.783, - "details": { - "tab": "Accuracy", - "Mean win rate - Calibration": { - "description": null, - "tab": "Calibration", - "score": null - }, - "Mean win rate - Robustness": { - "description": null, - "tab": "Robustness", - "score": 0.7622144522144523 - }, - "Mean win rate - Fairness": { - "description": null, - "tab": "Fairness", - "score": 0.7175058275058275 - }, - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": null - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - }, - "Mean win rate - Bias": { - "description": null, - "tab": "Bias", - "score": 0.5232317557148765 - }, - "Mean win rate - Toxicity": { - "description": null, - "tab": "Toxicity", - "score": 0.7166083916083916 - }, - "Mean win rate - Summarization metrics": { - "description": null, - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.391, - "details": { - "description": "min=0.2, mean=0.391, max=0.73, sum=1.955 (5)", - "tab": "Accuracy", - "MMLU - ECE (10-bin)": { - "description": "5 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "MMLU - EM (Robustness)": { - "description": "min=0.1, mean=0.262, max=0.49, sum=1.312 (5)", - "tab": "Robustness", - "score": 0.2623859649122807 - }, - "MMLU - EM (Fairness)": { - "description": "min=0.12, mean=0.313, max=0.66, sum=1.566 (5)", - "tab": "Fairness", - "score": 0.31312280701754386 - }, - "MMLU - Denoised inference time (s)": { - "description": "5 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=366.44, mean=460.72, max=607.43, sum=2303.6 (5)", - "tab": "General information", - "score": 460.71996491228066 - }, - "MMLU - # output tokens": { - "description": "min=1.19, mean=1.371, max=1.61, sum=6.857 (5)", - "tab": "General information", - "score": 1.3714035087719298 - }, - "MMLU - # trials": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "BoolQ", - "source_data": { - "dataset_name": "BoolQ", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on BoolQ", -
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.87, - "details": { - "description": "min=0.87, mean=0.87, max=0.87, sum=0.87 (1)", - "tab": "Accuracy", - "BoolQ - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "BoolQ - EM (Robustness)": { - "description": "min=0.845, mean=0.845, max=0.845, sum=0.845 (1)", - "tab": "Robustness", - "score": 0.845 - }, - "BoolQ - EM (Fairness)": { - "description": "min=0.817, mean=0.817, max=0.817, sum=0.817 (1)", - "tab": "Fairness", - "score": 0.817 - }, - "BoolQ - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "BoolQ - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "BoolQ - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "BoolQ - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "BoolQ - # prompt tokens": { - "description": "min=1220.329, mean=1220.329, max=1220.329, sum=1220.329 (1)", - "tab": "General information", - "score": 1220.329 - }, - "BoolQ - # output tokens": { - "description": "min=1.057, mean=1.057, max=1.057, sum=1.057 (1)", - "tab": "General information", - "score": 1.057 - }, - "BoolQ - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "BoolQ - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.625, - "details": { - "description": "min=0.625, mean=0.625, max=0.625, sum=0.625 (1)", - "tab": "Accuracy", - "NarrativeQA - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "NarrativeQA - F1 (Robustness)": { - "description": "min=0.566, mean=0.566, max=0.566, sum=0.566 (1)", - "tab": "Robustness", - "score": 0.5658549915417233 - }, - "NarrativeQA - F1 (Fairness)": { - "description": "min=0.547, mean=0.547, max=0.547, sum=0.547 (1)", - "tab": "Fairness", - "score": 0.546599991762967 - }, - "NarrativeQA - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 
355.0 - }, - "NarrativeQA - # train": { - "description": "min=4.966, mean=4.966, max=4.966, sum=4.966 (1)", - "tab": "General information", - "score": 4.966197183098592 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=3443.349, mean=3443.349, max=3443.349, sum=3443.349 (1)", - "tab": "General information", - "score": 3443.349295774648 - }, - "NarrativeQA - # output tokens": { - "description": "min=12.194, mean=12.194, max=12.194, sum=12.194 (1)", - "tab": "General information", - "score": 12.194366197183099 - }, - "NarrativeQA - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NarrativeQA - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=0.667 (1)", - "tab": "Bias", - "score": 0.6666666666666667 - }, - "NarrativeQA - Stereotypes (gender)": { - "description": "min=0.455, mean=0.455, max=0.455, sum=0.455 (1)", - "tab": "Bias", - "score": 0.45454545454545453 - }, - "NarrativeQA - Representation (race)": { - "description": "min=0.429, mean=0.429, max=0.429, sum=0.429 (1)", - "tab": "Bias", - "score": 0.42857142857142855 - }, - "NarrativeQA - Representation (gender)": { - "description": "min=0.169, mean=0.169, max=0.169, sum=0.169 (1)", - "tab": "Bias", - "score": 0.16860465116279072 - }, - "NarrativeQA - Toxic fraction": { - "description": "min=0.011, mean=0.011, max=0.011, sum=0.011 (1)", - "tab": "Toxicity", - "score": 0.011267605633802818 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (open-book)", - "source_data": { - "dataset_name": "NaturalQuestions (open-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (open-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.675, - "details": { - "description": "min=0.675, mean=0.675, max=0.675, sum=0.675 (1)", - "tab": "Accuracy", - "NaturalQuestions (closed-book) - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "NaturalQuestions (open-book) - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "NaturalQuestions (closed-book) - F1 (Robustness)": { - "description": "min=0.284, mean=0.284, max=0.284, sum=0.284 (1)", - "tab": "Robustness", - "score": 0.28373438775512194 - }, - "NaturalQuestions (open-book) - F1 (Robustness)": { - "description": "min=0.606, mean=0.606, max=0.606, sum=0.606 (1)", - "tab": "Robustness", - "score": 0.6060594363127481 - }, - "NaturalQuestions (closed-book) - F1 (Fairness)": { - "description": "min=0.287, mean=0.287, max=0.287, sum=0.287 (1)", - "tab": "Fairness", - "score": 0.2871379631388369 - }, - "NaturalQuestions (open-book) - F1 (Fairness)": { - "description": "min=0.627, mean=0.627, max=0.627, sum=0.627 (1)", - "tab": "Fairness", - "score": 0.6270354958497198 - }, - "NaturalQuestions (closed-book) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NaturalQuestions (open-book) - Denoised 
inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=112.127, mean=112.127, max=112.127, sum=112.127 (1)", - "tab": "General information", - "score": 112.127 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=18.876, mean=18.876, max=18.876, sum=18.876 (1)", - "tab": "General information", - "score": 18.876 - }, - "NaturalQuestions (closed-book) - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.887, mean=4.887, max=4.887, sum=4.887 (1)", - "tab": "General information", - "score": 4.887 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.019, mean=0.019, max=0.019, sum=0.019 (1)", - "tab": "General information", - "score": 0.019 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1590.821, mean=1590.821, max=1590.821, sum=1590.821 (1)", - "tab": "General information", - "score": 1590.821 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=11.901, mean=11.901, max=11.901, sum=11.901 (1)", - "tab": "General information", - "score": 11.901 - }, - "NaturalQuestions (open-book) - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NaturalQuestions (closed-book) - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=0.667 (1)", - "tab": "Bias", - "score": 0.6666666666666667 - }, - "NaturalQuestions (closed-book) - Stereotypes (gender)": { - "description": "min=0.5, mean=0.5, max=0.5, sum=0.5 (1)", - "tab": "Bias", - "score": 0.5 - }, - "NaturalQuestions (closed-book) - Representation (race)": { - "description": "min=0.382, mean=0.382, max=0.382, sum=0.382 (1)", - "tab": "Bias", - "score": 0.38211382113821135 - }, - "NaturalQuestions (closed-book) - Representation (gender)": { - "description": "min=0.104, mean=0.104, max=0.104, sum=0.104 (1)", - "tab": "Bias", - "score": 0.10377358490566038 - }, - "NaturalQuestions (open-book) - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=0.667 (1)", - "tab": "Bias", - "score": 0.6666666666666667 - }, - "NaturalQuestions (open-book) - Stereotypes (gender)": { - "description": "min=0.5, mean=0.5, max=0.5, sum=0.5 (1)", - "tab": "Bias", - "score": 0.5 - }, - "NaturalQuestions (open-book) - Representation (race)": { - "description": "min=0.415, mean=0.415, max=0.415, sum=0.415 (1)", - "tab": "Bias", - "score": 0.41463414634146334 - }, - "NaturalQuestions (open-book) - Representation (gender)": { - "description": "min=0.233, mean=0.233, max=0.233, sum=0.233 (1)", - "tab": "Bias", - "score": 0.23333333333333336 - }, - 
"NaturalQuestions (closed-book) - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "Toxicity", - "score": 0.0 - }, - "NaturalQuestions (open-book) - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "QuAC", - "source_data": { - "dataset_name": "QuAC", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on QuAC", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.485, - "details": { - "description": "min=0.485, mean=0.485, max=0.485, sum=0.485 (1)", - "tab": "Accuracy", - "QuAC - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "QuAC - F1 (Robustness)": { - "description": "min=0.371, mean=0.371, max=0.371, sum=0.371 (1)", - "tab": "Robustness", - "score": 0.3712446607257685 - }, - "QuAC - F1 (Fairness)": { - "description": "min=0.398, mean=0.398, max=0.398, sum=0.398 (1)", - "tab": "Fairness", - "score": 0.3977545370248786 - }, - "QuAC - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "QuAC - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "QuAC - # train": { - "description": "min=3.871, mean=3.871, max=3.871, sum=3.871 (1)", - "tab": "General information", - "score": 3.871 - }, - "QuAC - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "QuAC - # prompt tokens": { - "description": "min=3461.981, mean=3461.981, max=3461.981, sum=3461.981 (1)", - "tab": "General information", - "score": 3461.981 - }, - "QuAC - # output tokens": { - "description": "min=25.691, mean=25.691, max=25.691, sum=25.691 (1)", - "tab": "General information", - "score": 25.691 - }, - "QuAC - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "QuAC - Stereotypes (race)": { - "description": "min=0.589, mean=0.589, max=0.589, sum=0.589 (1)", - "tab": "Bias", - "score": 0.5889724310776943 - }, - "QuAC - Stereotypes (gender)": { - "description": "min=0.403, mean=0.403, max=0.403, sum=0.403 (1)", - "tab": "Bias", - "score": 0.4030096483037659 - }, - "QuAC - Representation (race)": { - "description": "min=0.378, mean=0.378, max=0.378, sum=0.378 (1)", - "tab": "Bias", - "score": 0.3782051282051282 - }, - "QuAC - Representation (gender)": { - "description": "min=0.223, mean=0.223, max=0.223, sum=0.223 (1)", - "tab": "Bias", - "score": 0.22334293948126804 - }, - "QuAC - Toxic fraction": { - "description": "min=0.001, mean=0.001, max=0.001, sum=0.001 (1)", - "tab": "Toxicity", - "score": 0.001 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "HellaSwag", - "source_data": { - "dataset_name": "HellaSwag", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on HellaSwag", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "HellaSwag - ECE (10-bin)": { - "description": "No matching runs", - "tab": "Calibration", - "score": null - }, - "HellaSwag - EM (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "HellaSwag - EM (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "HellaSwag - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "HellaSwag - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "OpenbookQA - ECE (10-bin)": { - "description": "No matching runs", - "tab": "Calibration", - "score": null - }, - "OpenbookQA - EM (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "OpenbookQA - EM (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "OpenbookQA - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "OpenbookQA - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "TruthfulQA", - "source_data": { - "dataset_name": "TruthfulQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on TruthfulQA", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.339, - "details": { - "description": "min=0.339, mean=0.339, max=0.339, sum=0.339 (1)", - "tab": "Accuracy", - "TruthfulQA - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "TruthfulQA - EM (Robustness)": { - "description": "min=0.187, mean=0.187, max=0.187, sum=0.187 (1)", - "tab": "Robustness", - "score": 0.18654434250764526 - }, - "TruthfulQA - EM (Fairness)": { - "description": "min=0.255, mean=0.255, max=0.255, sum=0.255 (1)", - "tab": "Fairness", - "score": 0.25535168195718655 - }, - "TruthfulQA - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "TruthfulQA - # eval": { - "description": "min=654, mean=654, max=654, sum=654 (1)", - "tab": "General information", - "score": 654.0 - }, - "TruthfulQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "TruthfulQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "TruthfulQA - # prompt tokens": { - "description": "min=464.434, mean=464.434, max=464.434, sum=464.434 (1)", - "tab": "General information", - "score": 464.434250764526 - }, - "TruthfulQA - # output tokens": { - "description": "min=1.517, mean=1.517, max=1.517, sum=1.517 (1)", - "tab": "General information", - "score": 1.5168195718654434 - }, - "TruthfulQA - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MS MARCO (TREC)", - "source_data": { - "dataset_name": "MS MARCO (TREC)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "NDCG@10 on MS MARCO (TREC)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "MS MARCO (regular) - RR@10 (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "MS MARCO (TREC) - NDCG@10 (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "MS MARCO (regular) - RR@10 (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "MS MARCO (TREC) - NDCG@10 (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "MS MARCO (regular) - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "MS MARCO (TREC) - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "MS MARCO (regular) - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # prompt 
tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - }, - "MS MARCO (TREC) - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CNN/DailyMail", - "source_data": { - "dataset_name": "CNN/DailyMail", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on CNN/DailyMail", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "CNN/DailyMail - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "CNN/DailyMail - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # prompt tokens": { - "description": "No matching runs", - "tab": "General 
information", - "score": null - }, - "CNN/DailyMail - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - }, - "CNN/DailyMail - SummaC": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - QAFactEval": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - BERTScore (F1)": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - Coverage": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - Density": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - Compression": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-faithfulness": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-relevance": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-coherence": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "XSUM", - "source_data": { - "dataset_name": "XSUM", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on XSUM", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "XSUM - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "XSUM - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM 
- Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - }, - "XSUM - SummaC": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - QAFactEval": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - BERTScore (F1)": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - Coverage": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - Density": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - Compression": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-faithfulness": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-relevance": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-coherence": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "IMDB", - "source_data": { - "dataset_name": "IMDB", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on IMDB", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.943, - "details": { - "description": "min=0.943, mean=0.943, max=0.943, sum=0.943 (1)", - "tab": "Accuracy", - "IMDB - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "IMDB - EM (Robustness)": { - "description": "min=0.916, mean=0.916, max=0.916, sum=0.916 (1)", - "tab": "Robustness", - "score": 0.916 - }, - "IMDB - EM (Fairness)": { - "description": "min=0.912, mean=0.912, max=0.912, sum=0.912 (1)", - "tab": "Fairness", - "score": 0.912 - }, - "IMDB - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "IMDB - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "IMDB - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "IMDB - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IMDB - # prompt tokens": { - "description": "min=2543.665, mean=2543.665, max=2543.665, sum=2543.665 (1)", - "tab": "General information", - "score": 2543.665 - }, - "IMDB - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "IMDB - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "IMDB - Stereotypes (race)": { - 
"description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CivilComments", - "source_data": { - "dataset_name": "CivilComments", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on CivilComments", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.696, - "details": { - "description": "min=0.48, mean=0.696, max=0.874, sum=12.534 (18)", - "tab": "Accuracy", - "CivilComments - ECE (10-bin)": { - "description": "9 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "CivilComments - EM (Robustness)": { - "description": "min=0.206, mean=0.564, max=0.863, sum=10.15 (18)", - "tab": "Robustness", - "score": 0.5638779146224463 - }, - "CivilComments - EM (Fairness)": { - "description": "min=0.133, mean=0.525, max=0.863, sum=9.458 (18)", - "tab": "Fairness", - "score": 0.5254285459217098 - }, - "CivilComments - Denoised inference time (s)": { - "description": "9 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "CivilComments - # eval": { - "description": "min=74, mean=371.556, max=683, sum=6688 (18)", - "tab": "General information", - "score": 371.55555555555554 - }, - "CivilComments - # train": { - "description": "min=5, mean=5, max=5, sum=90 (18)", - "tab": "General information", - "score": 5.0 - }, - "CivilComments - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (18)", - "tab": "General information", - "score": 0.0 - }, - "CivilComments - # prompt tokens": { - "description": "min=333.915, mean=733.362, max=1226.723, sum=13200.513 (18)", - "tab": "General information", - "score": 733.3618295565135 - }, - "CivilComments - # output tokens": { - "description": "min=1, mean=1.001, max=1.01, sum=18.025 (18)", - "tab": "General information", - "score": 1.0013947024944874 - }, - "CivilComments - # trials": { - "description": "min=1, mean=1, max=1, sum=18 (18)", - "tab": "General information", - "score": 1.0 - }, - "CivilComments - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Toxic fraction": { - "description": "9 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "RAFT", - "source_data": { - "dataset_name": "RAFT", - 
"source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on RAFT", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.748, - "details": { - "description": "min=0.275, mean=0.748, max=0.95, sum=8.225 (11)", - "tab": "Accuracy", - "RAFT - ECE (10-bin)": { - "description": "11 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "RAFT - EM (Robustness)": { - "description": "min=0.15, mean=0.677, max=0.95, sum=7.45 (11)", - "tab": "Robustness", - "score": 0.6772727272727272 - }, - "RAFT - EM (Fairness)": { - "description": "min=0.2, mean=0.641, max=0.95, sum=7.05 (11)", - "tab": "Fairness", - "score": 0.640909090909091 - }, - "RAFT - Denoised inference time (s)": { - "description": "11 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "RAFT - # eval": { - "description": "min=40, mean=40, max=40, sum=440 (11)", - "tab": "General information", - "score": 40.0 - }, - "RAFT - # train": { - "description": "min=3, mean=4.818, max=5, sum=53 (11)", - "tab": "General information", - "score": 4.818181818181818 - }, - "RAFT - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (11)", - "tab": "General information", - "score": 0.0 - }, - "RAFT - # prompt tokens": { - "description": "min=252.275, mean=1002.239, max=3545.1, sum=11024.625 (11)", - "tab": "General information", - "score": 1002.2386363636365 - }, - "RAFT - # output tokens": { - "description": "min=1.275, mean=2.955, max=5.05, sum=32.5 (11)", - "tab": "General information", - "score": 2.9545454545454546 - }, - "RAFT - # trials": { - "description": "min=1, mean=1, max=1, sum=11 (11)", - "tab": "General information", - "score": 1.0 - }, - "RAFT - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Toxic fraction": { - "description": "11 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_classic/openai/text-ada-001/f267ba72-b239-4126-99c5-675f79b1ae95.json b/data/helm_classic/openai/text-ada-001/f267ba72-b239-4126-99c5-675f79b1ae95.json deleted file mode 100644 index 43f728bf2..000000000 --- a/data/helm_classic/openai/text-ada-001/f267ba72-b239-4126-99c5-675f79b1ae95.json +++ /dev/null @@ -1,1613 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_classic/openai_text-ada-001/1770834891.1472661", - "retrieved_timestamp": "1770834891.1472661", - "source_metadata": { - "source_name": "helm_classic", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "text-ada-001", - "id": "openai/text-ada-001", - "developer": "openai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_classic", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperform on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.107, - "details": { - "tab": "Accuracy", - "Mean win rate - Calibration": { - "description": null, - "tab": "Calibration", - "score": 0.17139908178298557 - }, - "Mean win rate - Robustness": { - "description": null, - "tab": "Robustness", - "score": 0.10508470024599056 - }, - "Mean win rate - Fairness": { - "description": null, - "tab": "Fairness", - "score": 0.10817286162113748 - }, - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.937796052631579 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - }, - "Mean win rate - Bias": { - "description": null, - "tab": "Bias", - "score": 0.4261942744755245 - }, - "Mean win rate - Toxicity": { - "description": null, - "tab": "Toxicity", - "score": 0.5531715198381865 - }, - "Mean win rate - Summarization metrics": { - "description": null, - "tab": "Summarization metrics", - "score": 0.48596491228070177 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.238, - "details": { - "description": "min=0.14, mean=0.238, max=0.31, sum=3.566 (15)", - "tab": "Accuracy", - "MMLU - ECE (10-bin)": { - "description": "min=0.357, mean=0.506, max=0.666, sum=7.594 (15)", - "tab": "Calibration", - "score": 0.5062965949265723 - }, - "MMLU - EM (Robustness)": { - "description": "min=0.08, mean=0.178, max=0.28, sum=2.665 (15)", - "tab": "Robustness", - "score": 0.17768421052631578 - }, - "MMLU - EM (Fairness)": { - "description": "min=0.11, mean=0.202, max=0.28, sum=3.026 (15)", - "tab": "Fairness", - "score": 0.201766081871345 - }, - "MMLU - Denoised inference time (s)": { - "description": "min=0.086, mean=0.088, max=0.089, sum=1.314 (15)", - "tab": "Efficiency", - "score": 0.08760755934758772 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=1542 (15)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=75 (15)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (15)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=371.38, mean=472.274, max=624.07, sum=7084.111 (15)", - "tab": "General information", - "score": 472.2740350877193 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=15 (15)", - "tab": "General information", - "score": 1.0 - }, - "MMLU - # trials": { - "description": "min=3, mean=3, max=3, sum=45 (15)", - "tab": "General information", - "score": 3.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "BoolQ", - "source_data": { - 
"dataset_name": "BoolQ", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on BoolQ", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.464, - "details": { - "description": "min=0.405, mean=0.464, max=0.503, sum=1.392 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Accuracy", - "BoolQ - ECE (10-bin)": { - "description": "min=0.257, mean=0.346, max=0.483, sum=1.039 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Calibration", - "score": 0.34632807207915267 - }, - "BoolQ - EM (Robustness)": { - "description": "min=0.316, mean=0.332, max=0.362, sum=0.997 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Robustness", - "score": 0.33233333333333337 - }, - "BoolQ - EM (Fairness)": { - "description": "min=0.364, mean=0.378, max=0.397, sum=1.134 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Fairness", - "score": 0.37799999999999995 - }, - "BoolQ - Denoised inference time (s)": { - "description": "min=0.09, mean=0.096, max=0.103, sum=0.287 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Efficiency", - "score": 0.09557654231770833 - }, - "BoolQ - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 1000.0 - }, - "BoolQ - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. 
For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 5.0 - }, - "BoolQ - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.0 - }, - "BoolQ - # prompt tokens": { - "description": "min=660.073, mean=908.406, max=1242.073, sum=2725.219 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 908.4063333333334 - }, - "BoolQ - # output tokens": { - "description": "min=0.995, mean=1.003, max=1.009, sum=3.009 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 1.003 - }, - "BoolQ - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 3.0 - }, - "BoolQ - Stereotypes (race)": { - "description": "(0)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": null - }, - "BoolQ - Stereotypes (gender)": { - "description": "(0)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (race)": { - "description": "(0)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. 
See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (gender)": { - "description": "(0)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": null - }, - "BoolQ - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.238, - "details": { - "description": "min=0.22, mean=0.238, max=0.273, sum=0.714 (3)", - "tab": "Accuracy", - "NarrativeQA - ECE (10-bin)": { - "description": "min=0.248, mean=0.319, max=0.386, sum=0.956 (3)", - "tab": "Calibration", - "score": 0.318718698868713 - }, - "NarrativeQA - F1 (Robustness)": { - "description": "min=0.049, mean=0.058, max=0.075, sum=0.175 (3)", - "tab": "Robustness", - "score": 0.05828828370185365 - }, - "NarrativeQA - F1 (Fairness)": { - "description": "min=0.11, mean=0.119, max=0.126, sum=0.356 (3)", - "tab": "Fairness", - "score": 0.1187630501762329 - }, - "NarrativeQA - Denoised inference time (s)": { - "description": "min=0.16, mean=0.171, max=0.186, sum=0.513 (3)", - "tab": "Efficiency", - "score": 0.1710890294894365 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=1065 (3)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=1.051, mean=1.647, max=2.085, sum=4.941 (3)", - "tab": "General information", - "score": 1.6469483568075116 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=1601.955, mean=1652.377, max=1705.003, sum=4957.132 (3)", - "tab": "General information", - "score": 1652.3774647887324 - }, - "NarrativeQA - # output tokens": { - "description": "min=9.054, mean=10.756, max=13.293, sum=32.268 (3)", - "tab": "General information", - "score": 10.755868544600938 - }, - "NarrativeQA - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NarrativeQA - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NarrativeQA - Stereotypes (gender)": { - "description": "min=0.382, mean=0.403, max=0.438, sum=1.21 (3)", - "tab": "Bias", - "score": 0.40317130936696155 - }, - "NarrativeQA - Representation 
(race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=2 (3)", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "NarrativeQA - Representation (gender)": { - "description": "min=0.151, mean=0.203, max=0.252, sum=0.609 (3)", - "tab": "Bias", - "score": 0.20287726757892108 - }, - "NarrativeQA - Toxic fraction": { - "description": "min=0.003, mean=0.006, max=0.008, sum=0.017 (3)", - "tab": "Toxicity", - "score": 0.005633802816901408 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (open-book)", - "source_data": { - "dataset_name": "NaturalQuestions (open-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (open-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.149, - "details": { - "description": "min=0.06, mean=0.149, max=0.193, sum=0.446 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Accuracy", - "NaturalQuestions (closed-book) - ECE (10-bin)": { - "description": "min=0.751, mean=0.764, max=0.789, sum=2.292 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Calibration", - "score": 0.7640868917536278 - }, - "NaturalQuestions (open-book) - ECE (10-bin)": { - "description": "min=0.6, mean=0.691, max=0.866, sum=2.072 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Calibration", - "score": 0.6905918803748641 - }, - "NaturalQuestions (closed-book) - F1 (Robustness)": { - "description": "min=0.007, mean=0.008, max=0.009, sum=0.023 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Robustness", - "score": 0.007711173104376766 - }, - "NaturalQuestions (open-book) - F1 (Robustness)": { - "description": "min=0.01, mean=0.034, max=0.062, sum=0.102 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. 
See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Robustness", - "score": 0.033837452909760764 - }, - "NaturalQuestions (closed-book) - F1 (Fairness)": { - "description": "min=0.009, mean=0.012, max=0.018, sum=0.036 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Fairness", - "score": 0.012133718750385417 - }, - "NaturalQuestions (open-book) - F1 (Fairness)": { - "description": "min=0.026, mean=0.083, max=0.115, sum=0.249 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Fairness", - "score": 0.08303504557607948 - }, - "NaturalQuestions (closed-book) - Denoised inference time (s)": { - "description": "min=0.083, mean=0.085, max=0.087, sum=0.255 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Efficiency", - "score": 0.08484092187500009 - }, - "NaturalQuestions (open-book) - Denoised inference time (s)": { - "description": "min=0.119, mean=0.128, max=0.133, sum=0.383 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Efficiency", - "score": 0.12779065299479173 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. 
For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=110.254, mean=112.254, max=116.254, sum=336.762 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 112.254 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=0.729, mean=1.04, max=1.418, sum=3.12 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 1.0399999999999998 - }, - "NaturalQuestions (closed-book) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 3.0 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.647, mean=4.691, max=4.724, sum=14.074 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 4.691333333333334 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.036, mean=0.036, max=0.036, sum=0.108 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. 
See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.036 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1231.212, mean=1419.574, max=1523.257, sum=4258.721 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 1419.5736666666664 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=1.801, mean=3.933, max=5.648, sum=11.799 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 3.933 - }, - "NaturalQuestions (open-book) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 3.0 - }, - "NaturalQuestions (closed-book) - Stereotypes (race)": { - "description": "(0)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": null - }, - "NaturalQuestions (closed-book) - Stereotypes (gender)": { - "description": "(0)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": null - }, - "NaturalQuestions (closed-book) - Representation (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=2 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "NaturalQuestions (closed-book) - Representation (gender)": { - "description": "min=0.167, mean=0.167, max=0.167, sum=0.167 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. 
For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.16666666666666666 - }, - "NaturalQuestions (open-book) - Stereotypes (race)": { - "description": "(0)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": null - }, - "NaturalQuestions (open-book) - Stereotypes (gender)": { - "description": "min=0.5, mean=0.5, max=0.5, sum=1 (2)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.5 - }, - "NaturalQuestions (open-book) - Representation (race)": { - "description": "min=0.567, mean=0.633, max=0.667, sum=1.9 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.6333333333333334 - }, - "NaturalQuestions (open-book) - Representation (gender)": { - "description": "min=0.1, mean=0.217, max=0.318, sum=0.652 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.21717171717171715 - }, - "NaturalQuestions (closed-book) - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Toxicity", - "score": 0.0 - }, - "NaturalQuestions (open-book) - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. 
See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "QuAC", - "source_data": { - "dataset_name": "QuAC", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on QuAC", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.176, - "details": { - "description": "min=0.14, mean=0.176, max=0.203, sum=0.527 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Accuracy", - "QuAC - ECE (10-bin)": { - "description": "min=0.16, mean=0.268, max=0.362, sum=0.803 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Calibration", - "score": 0.2675195450588613 - }, - "QuAC - F1 (Robustness)": { - "description": "min=0.054, mean=0.067, max=0.074, sum=0.201 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Robustness", - "score": 0.06713428098997175 - }, - "QuAC - F1 (Fairness)": { - "description": "min=0.063, mean=0.091, max=0.113, sum=0.273 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Fairness", - "score": 0.09086419903543015 - }, - "QuAC - Denoised inference time (s)": { - "description": "min=0.194, mean=0.21, max=0.221, sum=0.629 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Efficiency", - "score": 0.20979015885416655 - }, - "QuAC - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. 
See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 1000.0 - }, - "QuAC - # train": { - "description": "min=0.845, mean=0.944, max=1.086, sum=2.833 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.9443333333333334 - }, - "QuAC - truncated": { - "description": "min=0.016, mean=0.016, max=0.016, sum=0.048 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.016 - }, - "QuAC - # prompt tokens": { - "description": "min=1625.523, mean=1644.831, max=1670.605, sum=4934.492 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 1644.8306666666667 - }, - "QuAC - # output tokens": { - "description": "min=14.536, mean=17.274, max=19.327, sum=51.821 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 17.273666666666667 - }, - "QuAC - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 3.0 - }, - "QuAC - Stereotypes (race)": { - "description": "min=0.625, mean=0.653, max=0.667, sum=1.958 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.6527777777777778 - }, - "QuAC - Stereotypes (gender)": { - "description": "min=0.415, mean=0.433, max=0.448, sum=1.3 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. 
See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.4333686045042254 - }, - "QuAC - Representation (race)": { - "description": "min=0.308, mean=0.345, max=0.387, sum=1.034 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.34482454482454483 - }, - "QuAC - Representation (gender)": { - "description": "min=0.223, mean=0.244, max=0.269, sum=0.732 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.24387920564334062 - }, - "QuAC - Toxic fraction": { - "description": "min=0, mean=0.0, max=0.001, sum=0.001 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Toxicity", - "score": 0.0003333333333333333 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "HellaSwag", - "source_data": { - "dataset_name": "HellaSwag", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on HellaSwag", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.429, - "details": { - "description": "min=0.429, mean=0.429, max=0.429, sum=0.429 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Accuracy", - "HellaSwag - ECE (10-bin)": { - "description": "min=0.103, mean=0.103, max=0.103, sum=0.103 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Calibration", - "score": 0.1034689985203878 - }, - "HellaSwag - EM (Robustness)": { - "description": "min=0.32, mean=0.32, max=0.32, sum=0.32 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. 
See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Robustness", - "score": 0.32 - }, - "HellaSwag - EM (Fairness)": { - "description": "min=0.27, mean=0.27, max=0.27, sum=0.27 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Fairness", - "score": 0.27 - }, - "HellaSwag - Denoised inference time (s)": { - "description": "min=0.079, mean=0.079, max=0.079, sum=0.079 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Efficiency", - "score": 0.07943312500000001 - }, - "HellaSwag - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 1000.0 - }, - "HellaSwag - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag - # prompt tokens": { - "description": "min=87.888, mean=87.888, max=87.888, sum=87.888 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 87.888 - }, - "HellaSwag - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. 
See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.346, - "details": { - "description": "min=0.346, mean=0.346, max=0.346, sum=0.346 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Accuracy", - "OpenbookQA - ECE (10-bin)": { - "description": "min=0.487, mean=0.487, max=0.487, sum=0.487 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Calibration", - "score": 0.4870210553256142 - }, - "OpenbookQA - EM (Robustness)": { - "description": "min=0.248, mean=0.248, max=0.248, sum=0.248 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Robustness", - "score": 0.248 - }, - "OpenbookQA - EM (Fairness)": { - "description": "min=0.266, mean=0.266, max=0.266, sum=0.266 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Fairness", - "score": 0.266 - }, - "OpenbookQA - Denoised inference time (s)": { - "description": "min=0.076, mean=0.076, max=0.076, sum=0.076 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. 
See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Efficiency", - "score": 0.07620585937499988 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=5.27, mean=5.27, max=5.27, sum=5.27 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 5.27 - }, - "OpenbookQA - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. 
See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "TruthfulQA", - "source_data": { - "dataset_name": "TruthfulQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on TruthfulQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.232, - "details": { - "description": "min=0.216, mean=0.232, max=0.263, sum=0.696 (3)", - "tab": "Accuracy", - "TruthfulQA - ECE (10-bin)": { - "description": "min=0.418, mean=0.465, max=0.495, sum=1.395 (3)", - "tab": "Calibration", - "score": 0.46507296315502505 - }, - "TruthfulQA - EM (Robustness)": { - "description": "min=0.165, mean=0.175, max=0.194, sum=0.526 (3)", - "tab": "Robustness", - "score": 0.17533129459734964 - }, - "TruthfulQA - EM (Fairness)": { - "description": "min=0.18, mean=0.191, max=0.213, sum=0.573 (3)", - "tab": "Fairness", - "score": 0.191131498470948 - }, - "TruthfulQA - Denoised inference time (s)": { - "description": "min=0.088, mean=0.089, max=0.089, sum=0.266 (3)", - "tab": "Efficiency", - "score": 0.08860781608371561 - }, - "TruthfulQA - # eval": { - "description": "min=654, mean=654, max=654, sum=1962 (3)", - "tab": "General information", - "score": 654.0 - }, - "TruthfulQA - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "TruthfulQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "TruthfulQA - # prompt tokens": { - "description": "min=501.121, mean=511.121, max=529.121, sum=1533.362 (3)", - "tab": "General information", - "score": 511.12079510703364 - }, - "TruthfulQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=3 (3)", - "tab": "General information", - "score": 1.0 - }, - "TruthfulQA - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MS MARCO (TREC)", - "source_data": { - "dataset_name": "MS MARCO (TREC)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "NDCG@10 on MS MARCO (TREC)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.302, - "details": { - "description": "min=0.21, mean=0.302, max=0.353, sum=0.905 (3)", - "tab": "Accuracy", - "MS MARCO (regular) - RR@10 (Robustness)": { - "description": "min=0.044, mean=0.069, max=0.091, sum=0.207 (3)", - "tab": "Robustness", - "score": 0.06911044973544983 - }, - "MS MARCO (TREC) - NDCG@10 (Robustness)": { - "description": "min=0.172, mean=0.252, max=0.302, sum=0.757 (3)", - "tab": "Robustness", - "score": 0.2521954718959493 - }, - "MS MARCO (regular) - RR@10 (Fairness)": { - "description": "min=0.071, mean=0.107, max=0.133, sum=0.32 (3)", - "tab": "Fairness", - "score": 0.10653478835978836 - }, - "MS MARCO (TREC) - NDCG@10 (Fairness)": { - "description": "min=0.177, mean=0.276, max=0.327, sum=0.827 (3)", - "tab": 
"Fairness", - "score": 0.2757254036023355 - }, - "MS MARCO (regular) - Denoised inference time (s)": { - "description": "min=0.089, mean=0.09, max=0.091, sum=0.27 (3)", - "tab": "Efficiency", - "score": 0.08991796223958341 - }, - "MS MARCO (TREC) - Denoised inference time (s)": { - "description": "min=0.089, mean=0.09, max=0.09, sum=0.269 (3)", - "tab": "Efficiency", - "score": 0.08954472504844961 - }, - "MS MARCO (regular) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "MS MARCO (regular) - # train": { - "description": "min=2, mean=2, max=2, sum=6 (3)", - "tab": "General information", - "score": 2.0 - }, - "MS MARCO (regular) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "MS MARCO (regular) - # prompt tokens": { - "description": "min=495.232, mean=532.565, max=577.232, sum=1597.696 (3)", - "tab": "General information", - "score": 532.5653333333333 - }, - "MS MARCO (regular) - # output tokens": { - "description": "min=1.014, mean=1.123, max=1.303, sum=3.369 (3)", - "tab": "General information", - "score": 1.123 - }, - "MS MARCO (regular) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "MS MARCO (TREC) - # eval": { - "description": "min=43, mean=43, max=43, sum=129 (3)", - "tab": "General information", - "score": 43.0 - }, - "MS MARCO (TREC) - # train": { - "description": "min=2, mean=2, max=2, sum=6 (3)", - "tab": "General information", - "score": 2.0 - }, - "MS MARCO (TREC) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "MS MARCO (TREC) - # prompt tokens": { - "description": "min=478.488, mean=515.822, max=560.488, sum=1547.465 (3)", - "tab": "General information", - "score": 515.8217054263565 - }, - "MS MARCO (TREC) - # output tokens": { - "description": "min=0.953, mean=1.101, max=1.326, sum=3.302 (3)", - "tab": "General information", - "score": 1.1007751937984496 - }, - "MS MARCO (TREC) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "MS MARCO (regular) - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - 
"score": null - }, - "MS MARCO (TREC) - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CNN/DailyMail", - "source_data": { - "dataset_name": "CNN/DailyMail", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on CNN/DailyMail", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.136, - "details": { - "description": "min=0.134, mean=0.136, max=0.137, sum=0.813 (6)", - "tab": "Accuracy", - "CNN/DailyMail - Denoised inference time (s)": { - "description": "min=0.791, mean=0.793, max=0.796, sum=4.758 (6)", - "tab": "Efficiency", - "score": 0.7929256541152537 - }, - "CNN/DailyMail - # eval": { - "description": "min=466, mean=466, max=466, sum=2796 (6)", - "tab": "General information", - "score": 466.0 - }, - "CNN/DailyMail - # train": { - "description": "min=5, mean=5, max=5, sum=30 (6)", - "tab": "General information", - "score": 5.0 - }, - "CNN/DailyMail - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (6)", - "tab": "General information", - "score": 0.0 - }, - "CNN/DailyMail - # prompt tokens": { - "description": "min=1531.586, mean=1549.919, max=1567.586, sum=9299.515 (6)", - "tab": "General information", - "score": 1549.9191702432045 - }, - "CNN/DailyMail - # output tokens": { - "description": "min=114.727, mean=114.938, max=115.313, sum=689.627 (6)", - "tab": "General information", - "score": 114.93776824034335 - }, - "CNN/DailyMail - # trials": { - "description": "min=3, mean=3, max=3, sum=18 (6)", - "tab": "General information", - "score": 3.0 - }, - "CNN/DailyMail - Stereotypes (race)": { - "description": "min=0.585, mean=0.603, max=0.618, sum=3.62 (6)", - "tab": "Bias", - "score": 0.6033209686988849 - }, - "CNN/DailyMail - Stereotypes (gender)": { - "description": "min=0.366, mean=0.376, max=0.394, sum=2.258 (6)", - "tab": "Bias", - "score": 0.376337569695528 - }, - "CNN/DailyMail - Representation (race)": { - "description": "min=0.32, mean=0.327, max=0.336, sum=1.964 (6)", - "tab": "Bias", - "score": 0.3273411562788524 - }, - "CNN/DailyMail - Representation (gender)": { - "description": "min=0.118, mean=0.135, max=0.151, sum=0.81 (6)", - "tab": "Bias", - "score": 0.13502681064518518 - }, - "CNN/DailyMail - Toxic fraction": { - "description": "min=0, mean=0.001, max=0.002, sum=0.004 (6)", - "tab": "Toxicity", - "score": 0.000715307582260372 - }, - "CNN/DailyMail - SummaC": { - "description": "min=0.202, mean=0.223, max=0.237, sum=0.67 (3)", - "tab": "Summarization metrics", - "score": 0.22335669413101697 - }, - "CNN/DailyMail - QAFactEval": { - "description": "min=2.69, mean=3.369, max=3.833, sum=20.217 (6)", - "tab": "Summarization metrics", - "score": 3.3694626717468696 - }, - "CNN/DailyMail - BERTScore (F1)": { - "description": "min=0.244, mean=0.247, max=0.25, sum=0.741 (3)", - "tab": "Summarization metrics", - "score": 0.2468463296383967 - }, - "CNN/DailyMail - Coverage": { - "description": "min=0.923, mean=0.929, max=0.933, sum=5.574 (6)", - "tab": "Summarization metrics", - "score": 0.9289690481394134 - }, - "CNN/DailyMail - Density": { - "description": "min=28.745, mean=31.424, max=35.767, sum=188.544 (6)", - "tab": "Summarization metrics", - "score": 
31.424005422737114 - }, - "CNN/DailyMail - Compression": { - "description": "min=5.334, mean=5.461, max=5.548, sum=32.769 (6)", - "tab": "Summarization metrics", - "score": 5.461465024583634 - }, - "CNN/DailyMail - HumanEval-faithfulness": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-relevance": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-coherence": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "XSUM", - "source_data": { - "dataset_name": "XSUM", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on XSUM", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.034, - "details": { - "description": "min=0.034, mean=0.034, max=0.036, sum=0.206 (6)", - "tab": "Accuracy", - "XSUM - Denoised inference time (s)": { - "description": "min=0.304, mean=0.311, max=0.318, sum=1.868 (6)", - "tab": "Efficiency", - "score": 0.31128436946991633 - }, - "XSUM - # eval": { - "description": "min=518, mean=518, max=518, sum=3108 (6)", - "tab": "General information", - "score": 518.0 - }, - "XSUM - # train": { - "description": "min=4.998, mean=4.999, max=5, sum=29.992 (6)", - "tab": "General information", - "score": 4.998712998712999 - }, - "XSUM - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (6)", - "tab": "General information", - "score": 0.0 - }, - "XSUM - # prompt tokens": { - "description": "min=1456.402, mean=1510.418, max=1538.921, sum=9062.51 (6)", - "tab": "General information", - "score": 1510.4182754182755 - }, - "XSUM - # output tokens": { - "description": "min=33.533, mean=34.806, max=36.037, sum=208.834 (6)", - "tab": "General information", - "score": 34.805662805662806 - }, - "XSUM - # trials": { - "description": "min=3, mean=3, max=3, sum=18 (6)", - "tab": "General information", - "score": 3.0 - }, - "XSUM - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=4.0 (6)", - "tab": "Bias", - "score": 0.6666666666666669 - }, - "XSUM - Stereotypes (gender)": { - "description": "min=0.387, mean=0.403, max=0.414, sum=2.418 (6)", - "tab": "Bias", - "score": 0.4030736615819075 - }, - "XSUM - Representation (race)": { - "description": "min=0.547, mean=0.597, max=0.623, sum=3.579 (6)", - "tab": "Bias", - "score": 0.5965455454885051 - }, - "XSUM - Representation (gender)": { - "description": "min=0.087, mean=0.19, max=0.25, sum=1.142 (6)", - "tab": "Bias", - "score": 0.19037429957632912 - }, - "XSUM - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (6)", - "tab": "Toxicity", - "score": 0.0 - }, - "XSUM - SummaC": { - "description": "min=-0.132, mean=-0.102, max=-0.078, sum=-0.305 (3)", - "tab": "Summarization metrics", - "score": -0.10168572979799827 - }, - "XSUM - QAFactEval": { - "description": "min=4.849, mean=4.929, max=5.055, sum=29.572 (6)", - "tab": "Summarization metrics", - "score": 4.92859074878104 - }, - "XSUM - BERTScore (F1)": { - "description": "min=0.237, mean=0.245, max=0.254, sum=0.734 (3)", - "tab": "Summarization metrics", - 
"score": 0.24476258912195994 - }, - "XSUM - Coverage": { - "description": "min=0.834, mean=0.847, max=0.866, sum=5.08 (6)", - "tab": "Summarization metrics", - "score": 0.8466942307223615 - }, - "XSUM - Density": { - "description": "min=7.289, mean=7.626, max=8.299, sum=45.753 (6)", - "tab": "Summarization metrics", - "score": 7.625570347216255 - }, - "XSUM - Compression": { - "description": "min=12.7, mean=13.08, max=13.496, sum=78.483 (6)", - "tab": "Summarization metrics", - "score": 13.080494860928995 - }, - "XSUM - HumanEval-faithfulness": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-relevance": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-coherence": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "IMDB", - "source_data": { - "dataset_name": "IMDB", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on IMDB", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.822, - "details": { - "description": "min=0.776, mean=0.822, max=0.853, sum=2.466 (3)", - "tab": "Accuracy", - "IMDB - ECE (10-bin)": { - "description": "min=0.053, mean=0.09, max=0.142, sum=0.269 (3)", - "tab": "Calibration", - "score": 0.08977338148861268 - }, - "IMDB - EM (Robustness)": { - "description": "min=0.663, mean=0.716, max=0.744, sum=2.148 (3)", - "tab": "Robustness", - "score": 0.7160000000000001 - }, - "IMDB - EM (Fairness)": { - "description": "min=0.724, mean=0.769, max=0.808, sum=2.308 (3)", - "tab": "Fairness", - "score": 0.7693333333333333 - }, - "IMDB - Denoised inference time (s)": { - "description": "min=0.104, mean=0.109, max=0.114, sum=0.328 (3)", - "tab": "Efficiency", - "score": 0.109459033203125 - }, - "IMDB - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "IMDB - # train": { - "description": "min=2.916, mean=4.242, max=4.986, sum=12.726 (3)", - "tab": "General information", - "score": 4.242 - }, - "IMDB - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "IMDB - # prompt tokens": { - "description": "min=1277.729, mean=1553.363, max=1768.607, sum=4660.089 (3)", - "tab": "General information", - "score": 1553.363 - }, - "IMDB - # output tokens": { - "description": "min=1.006, mean=1.013, max=1.021, sum=3.039 (3)", - "tab": "General information", - "score": 1.013 - }, - "IMDB - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "IMDB - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (gender)": { - "description": "1 matching runs, but no 
matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CivilComments", - "source_data": { - "dataset_name": "CivilComments", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on CivilComments", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.503, - "details": { - "description": "min=0, mean=0.503, max=1, sum=27.18 (54)", - "tab": "Accuracy", - "CivilComments - ECE (10-bin)": { - "description": "min=0.012, mean=0.479, max=0.985, sum=25.845 (54)", - "tab": "Calibration", - "score": 0.47860750507636396 - }, - "CivilComments - EM (Robustness)": { - "description": "min=0, mean=0.491, max=1, sum=26.518 (54)", - "tab": "Robustness", - "score": 0.4910745197871521 - }, - "CivilComments - EM (Fairness)": { - "description": "min=0, mean=0.497, max=1, sum=26.82 (54)", - "tab": "Fairness", - "score": 0.49665917233754203 - }, - "CivilComments - Denoised inference time (s)": { - "description": "min=0.086, mean=0.092, max=0.103, sum=4.964 (54)", - "tab": "Efficiency", - "score": 0.0919244734885576 - }, - "CivilComments - # eval": { - "description": "min=74, mean=371.556, max=683, sum=20064 (54)", - "tab": "General information", - "score": 371.55555555555554 - }, - "CivilComments - # train": { - "description": "min=5, mean=5, max=5, sum=270 (54)", - "tab": "General information", - "score": 5.0 - }, - "CivilComments - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (54)", - "tab": "General information", - "score": 0.0 - }, - "CivilComments - # prompt tokens": { - "description": "min=356.537, mean=722.635, max=1267.519, sum=39022.317 (54)", - "tab": "General information", - "score": 722.6354931173206 - }, - "CivilComments - # output tokens": { - "description": "min=1, mean=1, max=1, sum=54 (54)", - "tab": "General information", - "score": 1.0 - }, - "CivilComments - # trials": { - "description": "min=3, mean=3, max=3, sum=162 (54)", - "tab": "General information", - "score": 3.0 - }, - "CivilComments - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (54)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "RAFT", - "source_data": { - "dataset_name": "RAFT", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on RAFT", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.406, - "details": { - "description": "min=0.05, mean=0.406, max=0.975, sum=13.4 (33)", - "tab": "Accuracy", 
- "RAFT - ECE (10-bin)": { - "description": "min=0.018, mean=0.473, max=0.891, sum=15.613 (33)", - "tab": "Calibration", - "score": 0.47311876061285835 - }, - "RAFT - EM (Robustness)": { - "description": "min=0, mean=0.335, max=0.925, sum=11.05 (33)", - "tab": "Robustness", - "score": 0.3348484848484849 - }, - "RAFT - EM (Fairness)": { - "description": "min=0.05, mean=0.376, max=0.975, sum=12.4 (33)", - "tab": "Fairness", - "score": 0.3757575757575758 - }, - "RAFT - Denoised inference time (s)": { - "description": "min=0.084, mean=0.107, max=0.14, sum=3.527 (33)", - "tab": "Efficiency", - "score": 0.10687999526515152 - }, - "RAFT - # eval": { - "description": "min=40, mean=40, max=40, sum=1320 (33)", - "tab": "General information", - "score": 40.0 - }, - "RAFT - # train": { - "description": "min=0, mean=4.556, max=5, sum=150.35 (33)", - "tab": "General information", - "score": 4.556060606060607 - }, - "RAFT - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (33)", - "tab": "General information", - "score": 0.0 - }, - "RAFT - # prompt tokens": { - "description": "min=257.35, mean=812.938, max=1773.675, sum=26826.95 (33)", - "tab": "General information", - "score": 812.937878787879 - }, - "RAFT - # output tokens": { - "description": "min=0.15, mean=2.997, max=6.925, sum=98.9 (33)", - "tab": "General information", - "score": 2.996969696969697 - }, - "RAFT - # trials": { - "description": "min=3, mean=3, max=3, sum=99 (33)", - "tab": "General information", - "score": 3.0 - }, - "RAFT - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (33)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_classic/openai/text-babbage-001/f386e763-8078-454b-bd14-32b106663d53.json b/data/helm_classic/openai/text-babbage-001/f386e763-8078-454b-bd14-32b106663d53.json deleted file mode 100644 index fbb4b5bb6..000000000 --- a/data/helm_classic/openai/text-babbage-001/f386e763-8078-454b-bd14-32b106663d53.json +++ /dev/null @@ -1,1613 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_classic/openai_text-babbage-001/1770834891.1472661", - "retrieved_timestamp": "1770834891.1472661", - "source_metadata": { - "source_name": "helm_classic", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "text-babbage-001", - "id": "openai/text-babbage-001", - "developer": "openai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_classic", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperform on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.229, - "details": { - "tab": "Accuracy", - "Mean win rate - 
Calibration": { - "description": null, - "tab": "Calibration", - "score": 0.27686841173581844 - }, - "Mean win rate - Robustness": { - "description": null, - "tab": "Robustness", - "score": 0.22569775422945612 - }, - "Mean win rate - Fairness": { - "description": null, - "tab": "Fairness", - "score": 0.2438772758572536 - }, - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.7775548245614035 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - }, - "Mean win rate - Bias": { - "description": null, - "tab": "Bias", - "score": 0.5333126239886427 - }, - "Mean win rate - Toxicity": { - "description": null, - "tab": "Toxicity", - "score": 0.5020704604037938 - }, - "Mean win rate - Summarization metrics": { - "description": null, - "tab": "Summarization metrics", - "score": 0.6459690893901421 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.229, - "details": { - "description": "min=0.11, mean=0.229, max=0.325, sum=3.431 (15)", - "tab": "Accuracy", - "MMLU - ECE (10-bin)": { - "description": "min=0.16, mean=0.311, max=0.472, sum=4.659 (15)", - "tab": "Calibration", - "score": 0.31056724427484883 - }, - "MMLU - EM (Robustness)": { - "description": "min=0.1, mean=0.186, max=0.228, sum=2.79 (15)", - "tab": "Robustness", - "score": 0.18602339181286548 - }, - "MMLU - EM (Fairness)": { - "description": "min=0.09, mean=0.205, max=0.272, sum=3.077 (15)", - "tab": "Fairness", - "score": 0.20512280701754387 - }, - "MMLU - Denoised inference time (s)": { - "description": "min=0.131, mean=0.133, max=0.135, sum=1.99 (15)", - "tab": "Efficiency", - "score": 0.13263352809758774 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=1542 (15)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=75 (15)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (15)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=371.38, mean=472.274, max=624.07, sum=7084.111 (15)", - "tab": "General information", - "score": 472.2740350877193 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=15 (15)", - "tab": "General information", - "score": 1.0 - }, - "MMLU - # trials": { - "description": "min=3, mean=3, max=3, sum=45 (15)", - "tab": "General information", - "score": 3.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "BoolQ", - "source_data": { - "dataset_name": "BoolQ", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on BoolQ", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.451, - "details": { - "description": "min=0.414, mean=0.451, 
max=0.477, sum=1.353 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Accuracy", - "BoolQ - ECE (10-bin)": { - "description": "min=0.318, mean=0.344, max=0.371, sum=1.031 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Calibration", - "score": 0.34372183455656985 - }, - "BoolQ - EM (Robustness)": { - "description": "min=0.339, mean=0.384, max=0.412, sum=1.151 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Robustness", - "score": 0.38366666666666666 - }, - "BoolQ - EM (Fairness)": { - "description": "min=0.388, mean=0.41, max=0.43, sum=1.23 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Fairness", - "score": 0.41 - }, - "BoolQ - Denoised inference time (s)": { - "description": "min=0.136, mean=0.142, max=0.15, sum=0.426 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Efficiency", - "score": 0.14212787000868074 - }, - "BoolQ - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 1000.0 - }, - "BoolQ - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 5.0 - }, - "BoolQ - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)\n⚠ Brown et al.
perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.0 - }, - "BoolQ - # prompt tokens": { - "description": "min=660.073, mean=908.406, max=1242.073, sum=2725.219 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 908.4063333333334 - }, - "BoolQ - # output tokens": { - "description": "min=1, mean=1.004, max=1.008, sum=3.012 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 1.004 - }, - "BoolQ - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 3.0 - }, - "BoolQ - Stereotypes (race)": { - "description": "(0)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": null - }, - "BoolQ - Stereotypes (gender)": { - "description": "(0)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (race)": { - "description": "(0)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (gender)": { - "description": "(0)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives.
For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": null - }, - "BoolQ - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.429, - "details": { - "description": "min=0.412, mean=0.429, max=0.463, sum=1.288 (3)", - "tab": "Accuracy", - "NarrativeQA - ECE (10-bin)": { - "description": "min=0.158, mean=0.186, max=0.215, sum=0.557 (3)", - "tab": "Calibration", - "score": 0.18581698260430923 - }, - "NarrativeQA - F1 (Robustness)": { - "description": "min=0.101, mean=0.126, max=0.154, sum=0.377 (3)", - "tab": "Robustness", - "score": 0.12577588570182116 - }, - "NarrativeQA - F1 (Fairness)": { - "description": "min=0.277, mean=0.299, max=0.335, sum=0.896 (3)", - "tab": "Fairness", - "score": 0.29864937428822036 - }, - "NarrativeQA - Denoised inference time (s)": { - "description": "min=0.239, mean=0.243, max=0.246, sum=0.728 (3)", - "tab": "Efficiency", - "score": 0.24279079738849765 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=1065 (3)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=1.051, mean=1.647, max=2.085, sum=4.941 (3)", - "tab": "General information", - "score": 1.6469483568075116 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=1601.955, mean=1652.377, max=1705.003, sum=4957.132 (3)", - "tab": "General information", - "score": 1652.3774647887324 - }, - "NarrativeQA - # output tokens": { - "description": "min=12.048, mean=12.829, max=13.307, sum=38.487 (3)", - "tab": "General information", - "score": 12.829107981220657 - }, - "NarrativeQA - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NarrativeQA - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=0.667 (1)", - "tab": "Bias", - "score": 0.6666666666666667 - }, - "NarrativeQA - Stereotypes (gender)": { - "description": "min=0.382, mean=0.403, max=0.433, sum=1.209 (3)", - "tab": "Bias", - "score": 0.40286362942612947 - }, - "NarrativeQA - Representation (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=2 (3)", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "NarrativeQA - Representation (gender)": { - "description": "min=0.089,
mean=0.132, max=0.178, sum=0.395 (3)", - "tab": "Bias", - "score": 0.13153743304740043 - }, - "NarrativeQA - Toxic fraction": { - "description": "min=0.003, mean=0.009, max=0.02, sum=0.028 (3)", - "tab": "Toxicity", - "score": 0.009389671361502348 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (open-book)", - "source_data": { - "dataset_name": "NaturalQuestions (open-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (open-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.33, - "details": { - "description": "min=0.296, mean=0.33, max=0.355, sum=0.989 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Accuracy", - "NaturalQuestions (closed-book) - ECE (10-bin)": { - "description": "min=0.505, mean=0.522, max=0.555, sum=1.567 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Calibration", - "score": 0.5224886706365456 - }, - "NaturalQuestions (open-book) - ECE (10-bin)": { - "description": "min=0.346, mean=0.385, max=0.427, sum=1.155 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Calibration", - "score": 0.38493664744185446 - }, - "NaturalQuestions (closed-book) - F1 (Robustness)": { - "description": "min=0.039, mean=0.04, max=0.041, sum=0.119 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Robustness", - "score": 0.039736972833954616 - }, - "NaturalQuestions (open-book) - F1 (Robustness)": { - "description": "min=0.139, mean=0.151, max=0.169, sum=0.452 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Robustness", - "score": 0.15066474277626352 - }, - "NaturalQuestions (closed-book) - F1 (Fairness)": { - "description": "min=0.048, mean=0.053, max=0.057, sum=0.16 (3)\n⚠ Brown et al.
perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Fairness", - "score": 0.05326475617936846 - }, - "NaturalQuestions (open-book) - F1 (Fairness)": { - "description": "min=0.209, mean=0.24, max=0.263, sum=0.72 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Fairness", - "score": 0.23984494964196315 - }, - "NaturalQuestions (closed-book) - Denoised inference time (s)": { - "description": "min=0.134, mean=0.136, max=0.137, sum=0.407 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Efficiency", - "score": 0.1355529375 - }, - "NaturalQuestions (open-book) - Denoised inference time (s)": { - "description": "min=0.2, mean=0.204, max=0.207, sum=0.612 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Efficiency", - "score": 0.20402605620659717 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets.
See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=110.254, mean=112.254, max=116.254, sum=336.762 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 112.254 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=1.708, mean=2.016, max=2.304, sum=6.048 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 2.016 - }, - "NaturalQuestions (closed-book) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 3.0 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.647, mean=4.691, max=4.724, sum=14.074 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 4.691333333333334 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.036, mean=0.036, max=0.036, sum=0.108 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.036 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1231.212, mean=1419.574, max=1523.257, sum=4258.721 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives.
For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 1419.5736666666664 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=7.676, mean=7.772, max=7.9, sum=23.317 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 7.772333333333333 - }, - "NaturalQuestions (open-book) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 3.0 - }, - "NaturalQuestions (closed-book) - Stereotypes (race)": { - "description": "(0)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": null - }, - "NaturalQuestions (closed-book) - Stereotypes (gender)": { - "description": "min=0.5, mean=0.5, max=0.5, sum=1 (2)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.5 - }, - "NaturalQuestions (closed-book) - Representation (race)": { - "description": "min=0.238, mean=0.317, max=0.467, sum=0.95 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.3167919799498747 - }, - "NaturalQuestions (closed-book) - Representation (gender)": { - "description": "min=0.125, mean=0.145, max=0.167, sum=0.435 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.14484126984126985 - }, - "NaturalQuestions (open-book) - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=2 (3)\n⚠ Brown et al.
perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "NaturalQuestions (open-book) - Stereotypes (gender)": { - "description": "min=0.286, mean=0.333, max=0.364, sum=0.999 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.3331168831168831 - }, - "NaturalQuestions (open-book) - Representation (race)": { - "description": "min=0.35, mean=0.403, max=0.457, sum=1.208 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.4025813878698122 - }, - "NaturalQuestions (open-book) - Representation (gender)": { - "description": "min=0.221, mean=0.243, max=0.273, sum=0.728 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.2427837942788109 - }, - "NaturalQuestions (closed-book) - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Toxicity", - "score": 0.0 - }, - "NaturalQuestions (open-book) - Toxic fraction": { - "description": "min=0, mean=0.0, max=0.001, sum=0.001 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets.
See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Toxicity", - "score": 0.0003333333333333333 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "QuAC", - "source_data": { - "dataset_name": "QuAC", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on QuAC", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.284, - "details": { - "description": "min=0.279, mean=0.284, max=0.288, sum=0.852 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Accuracy", - "QuAC - ECE (10-bin)": { - "description": "min=0.224, mean=0.24, max=0.25, sum=0.72 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Calibration", - "score": 0.2399406998223789 - }, - "QuAC - F1 (Robustness)": { - "description": "min=0.083, mean=0.087, max=0.091, sum=0.261 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Robustness", - "score": 0.08703476784265192 - }, - "QuAC - F1 (Fairness)": { - "description": "min=0.188, mean=0.196, max=0.202, sum=0.589 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Fairness", - "score": 0.19638729492261867 - }, - "QuAC - Denoised inference time (s)": { - "description": "min=0.305, mean=0.314, max=0.32, sum=0.941 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Efficiency", - "score": 0.3136292994791667 - }, - "QuAC - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets.
See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 1000.0 - }, - "QuAC - # train": { - "description": "min=0.845, mean=0.944, max=1.086, sum=2.833 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.9443333333333334 - }, - "QuAC - truncated": { - "description": "min=0.016, mean=0.016, max=0.016, sum=0.048 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.016 - }, - "QuAC - # prompt tokens": { - "description": "min=1625.523, mean=1644.831, max=1670.605, sum=4934.492 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 1644.8306666666667 - }, - "QuAC - # output tokens": { - "description": "min=21.715, mean=22.966, max=24.001, sum=68.897 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 22.965666666666667 - }, - "QuAC - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 3.0 - }, - "QuAC - Stereotypes (race)": { - "description": "min=0.593, mean=0.617, max=0.643, sum=1.851 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.6171143671143672 - }, - "QuAC - Stereotypes (gender)": { - "description": "min=0.425, mean=0.435, max=0.449, sum=1.305 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets.
See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.43511418044370825 - }, - "QuAC - Representation (race)": { - "description": "min=0.342, mean=0.361, max=0.388, sum=1.084 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.36134886795921545 - }, - "QuAC - Representation (gender)": { - "description": "min=0.255, mean=0.26, max=0.268, sum=0.779 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.25974518866516266 - }, - "QuAC - Toxic fraction": { - "description": "min=0, mean=0.0, max=0.001, sum=0.001 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Toxicity", - "score": 0.0003333333333333333 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "HellaSwag", - "source_data": { - "dataset_name": "HellaSwag", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on HellaSwag", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.561, - "details": { - "description": "min=0.561, mean=0.561, max=0.561, sum=0.561 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Accuracy", - "HellaSwag - ECE (10-bin)": { - "description": "min=0.083, mean=0.083, max=0.083, sum=0.083 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Calibration", - "score": 0.08291053064819098 - }, - "HellaSwag - EM (Robustness)": { - "description": "min=0.468, mean=0.468, max=0.468, sum=0.468 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets.
See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Robustness", - "score": 0.468 - }, - "HellaSwag - EM (Fairness)": { - "description": "min=0.405, mean=0.405, max=0.405, sum=0.405 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Fairness", - "score": 0.405 - }, - "HellaSwag - Denoised inference time (s)": { - "description": "min=0.125, mean=0.125, max=0.125, sum=0.125 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Efficiency", - "score": 0.12474649999999997 - }, - "HellaSwag - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 1000.0 - }, - "HellaSwag - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag - # prompt tokens": { - "description": "min=87.888, mean=87.888, max=87.888, sum=87.888 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 87.888 - }, - "HellaSwag - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets.
See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.452, - "details": { - "description": "min=0.452, mean=0.452, max=0.452, sum=0.452 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Accuracy", - "OpenbookQA - ECE (10-bin)": { - "description": "min=0.362, mean=0.362, max=0.362, sum=0.362 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Calibration", - "score": 0.36220844968968424 - }, - "OpenbookQA - EM (Robustness)": { - "description": "min=0.39, mean=0.39, max=0.39, sum=0.39 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Robustness", - "score": 0.39 - }, - "OpenbookQA - EM (Fairness)": { - "description": "min=0.386, mean=0.386, max=0.386, sum=0.386 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Fairness", - "score": 0.386 - }, - "OpenbookQA - Denoised inference time (s)": { - "description": "min=0.122, mean=0.122, max=0.122, sum=0.122 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets.
See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Efficiency", - "score": 0.12216468749999997 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=5.27, mean=5.27, max=5.27, sum=5.27 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 5.27 - }, - "OpenbookQA - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets.
See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "TruthfulQA", - "source_data": { - "dataset_name": "TruthfulQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on TruthfulQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.233, - "details": { - "description": "min=0.2, mean=0.233, max=0.274, sum=0.699 (3)", - "tab": "Accuracy", - "TruthfulQA - ECE (10-bin)": { - "description": "min=0.202, mean=0.251, max=0.279, sum=0.752 (3)", - "tab": "Calibration", - "score": 0.2505684624777335 - }, - "TruthfulQA - EM (Robustness)": { - "description": "min=0.156, mean=0.195, max=0.252, sum=0.586 (3)", - "tab": "Robustness", - "score": 0.19520897043832822 - }, - "TruthfulQA - EM (Fairness)": { - "description": "min=0.173, mean=0.207, max=0.257, sum=0.622 (3)", - "tab": "Fairness", - "score": 0.20744138634046894 - }, - "TruthfulQA - Denoised inference time (s)": { - "description": "min=0.133, mean=0.134, max=0.134, sum=0.401 (3)", - "tab": "Efficiency", - "score": 0.1335233459161568 - }, - "TruthfulQA - # eval": { - "description": "min=654, mean=654, max=654, sum=1962 (3)", - "tab": "General information", - "score": 654.0 - }, - "TruthfulQA - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "TruthfulQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "TruthfulQA - # prompt tokens": { - "description": "min=501.121, mean=511.121, max=529.121, sum=1533.362 (3)", - "tab": "General information", - "score": 511.12079510703364 - }, - "TruthfulQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=3 (3)", - "tab": "General information", - "score": 1.0 - }, - "TruthfulQA - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MS MARCO (TREC)", - "source_data": { - "dataset_name": "MS MARCO (TREC)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "NDCG@10 on MS MARCO (TREC)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.449, - "details": { - "description": "min=0.42, mean=0.449, max=0.493, sum=1.347 (3)", - "tab": "Accuracy", - "MS MARCO (regular) - RR@10 (Robustness)": { - "description": "min=0.099, mean=0.122, max=0.16, sum=0.366 (3)", - "tab": "Robustness", - "score": 0.12212023809523809 - }, - "MS MARCO (TREC) - NDCG@10 (Robustness)": { - "description": "min=0.315, mean=0.356, max=0.413, sum=1.069 (3)", - "tab": "Robustness", - "score": 0.35630094105473137 - }, - "MS MARCO (regular) - RR@10 (Fairness)": { - "description": "min=0.152, mean=0.174, max=0.213, sum=0.523 (3)", - "tab": "Fairness", - "score": 0.17431719576719562 - }, - "MS MARCO (TREC) - NDCG@10 (Fairness)": { - "description": "min=0.396, mean=0.424, max=0.469, sum=1.273 (3)", - "tab": 
"Fairness", - "score": 0.4244404820446352 - }, - "MS MARCO (regular) - Denoised inference time (s)": { - "description": "min=0.136, mean=0.136, max=0.136, sum=0.408 (3)", - "tab": "Efficiency", - "score": 0.1359015429687499 - }, - "MS MARCO (TREC) - Denoised inference time (s)": { - "description": "min=0.135, mean=0.135, max=0.136, sum=0.406 (3)", - "tab": "Efficiency", - "score": 0.1353138323643411 - }, - "MS MARCO (regular) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "MS MARCO (regular) - # train": { - "description": "min=2, mean=2, max=2, sum=6 (3)", - "tab": "General information", - "score": 2.0 - }, - "MS MARCO (regular) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "MS MARCO (regular) - # prompt tokens": { - "description": "min=495.232, mean=532.565, max=577.232, sum=1597.696 (3)", - "tab": "General information", - "score": 532.5653333333333 - }, - "MS MARCO (regular) - # output tokens": { - "description": "min=1.142, mean=1.212, max=1.282, sum=3.635 (3)", - "tab": "General information", - "score": 1.2116666666666667 - }, - "MS MARCO (regular) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "MS MARCO (TREC) - # eval": { - "description": "min=43, mean=43, max=43, sum=129 (3)", - "tab": "General information", - "score": 43.0 - }, - "MS MARCO (TREC) - # train": { - "description": "min=2, mean=2, max=2, sum=6 (3)", - "tab": "General information", - "score": 2.0 - }, - "MS MARCO (TREC) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "MS MARCO (TREC) - # prompt tokens": { - "description": "min=478.488, mean=515.822, max=560.488, sum=1547.465 (3)", - "tab": "General information", - "score": 515.8217054263565 - }, - "MS MARCO (TREC) - # output tokens": { - "description": "min=0.977, mean=1.132, max=1.326, sum=3.395 (3)", - "tab": "General information", - "score": 1.1317829457364341 - }, - "MS MARCO (TREC) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "MS MARCO (regular) - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - 
"tab": "Toxicity", - "score": null - }, - "MS MARCO (TREC) - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CNN/DailyMail", - "source_data": { - "dataset_name": "CNN/DailyMail", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on CNN/DailyMail", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.151, - "details": { - "description": "min=0.147, mean=0.151, max=0.155, sum=0.907 (6)", - "tab": "Accuracy", - "CNN/DailyMail - Denoised inference time (s)": { - "description": "min=0.951, mean=0.968, max=0.994, sum=5.81 (6)", - "tab": "Efficiency", - "score": 0.9683207451306926 - }, - "CNN/DailyMail - # eval": { - "description": "min=466, mean=466, max=466, sum=2796 (6)", - "tab": "General information", - "score": 466.0 - }, - "CNN/DailyMail - # train": { - "description": "min=5, mean=5, max=5, sum=30 (6)", - "tab": "General information", - "score": 5.0 - }, - "CNN/DailyMail - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (6)", - "tab": "General information", - "score": 0.0 - }, - "CNN/DailyMail - # prompt tokens": { - "description": "min=1531.586, mean=1549.919, max=1567.586, sum=9299.515 (6)", - "tab": "General information", - "score": 1549.9191702432045 - }, - "CNN/DailyMail - # output tokens": { - "description": "min=114.333, mean=116.858, max=120.519, sum=701.146 (6)", - "tab": "General information", - "score": 116.85765379113019 - }, - "CNN/DailyMail - # trials": { - "description": "min=3, mean=3, max=3, sum=18 (6)", - "tab": "General information", - "score": 3.0 - }, - "CNN/DailyMail - Stereotypes (race)": { - "description": "min=0.623, mean=0.626, max=0.63, sum=3.757 (6)", - "tab": "Bias", - "score": 0.6261965622126104 - }, - "CNN/DailyMail - Stereotypes (gender)": { - "description": "min=0.369, mean=0.385, max=0.401, sum=2.312 (6)", - "tab": "Bias", - "score": 0.3853218330657557 - }, - "CNN/DailyMail - Representation (race)": { - "description": "min=0.366, mean=0.389, max=0.408, sum=2.333 (6)", - "tab": "Bias", - "score": 0.38877532854423413 - }, - "CNN/DailyMail - Representation (gender)": { - "description": "min=0.142, mean=0.147, max=0.152, sum=0.879 (6)", - "tab": "Bias", - "score": 0.14657801266351475 - }, - "CNN/DailyMail - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (6)", - "tab": "Toxicity", - "score": 0.0 - }, - "CNN/DailyMail - SummaC": { - "description": "min=0.347, mean=0.378, max=0.402, sum=1.135 (3)", - "tab": "Summarization metrics", - "score": 0.3784199534784201 - }, - "CNN/DailyMail - QAFactEval": { - "description": "min=4.659, mean=4.676, max=4.708, sum=28.057 (6)", - "tab": "Summarization metrics", - "score": 4.676089387380419 - }, - "CNN/DailyMail - BERTScore (F1)": { - "description": "min=0.277, mean=0.282, max=0.285, sum=0.845 (3)", - "tab": "Summarization metrics", - "score": 0.28169928727191773 - }, - "CNN/DailyMail - Coverage": { - "description": "min=0.969, mean=0.972, max=0.973, sum=5.83 (6)", - "tab": "Summarization metrics", - "score": 0.9716251936961523 - }, - "CNN/DailyMail - Density": { - "description": "min=41.642, mean=45.948, max=53.738, sum=275.691 (6)", - "tab": "Summarization metrics", - "score": 
45.94847550953912 - }, - "CNN/DailyMail - Compression": { - "description": "min=5.013, mean=5.291, max=5.576, sum=31.744 (6)", - "tab": "Summarization metrics", - "score": 5.290663826380655 - }, - "CNN/DailyMail - HumanEval-faithfulness": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-relevance": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-coherence": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "XSUM", - "source_data": { - "dataset_name": "XSUM", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on XSUM", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.046, - "details": { - "description": "min=0.044, mean=0.046, max=0.047, sum=0.275 (6)", - "tab": "Accuracy", - "XSUM - Denoised inference time (s)": { - "description": "min=0.416, mean=0.431, max=0.439, sum=2.583 (6)", - "tab": "Efficiency", - "score": 0.43057023625187685 - }, - "XSUM - # eval": { - "description": "min=518, mean=518, max=518, sum=3108 (6)", - "tab": "General information", - "score": 518.0 - }, - "XSUM - # train": { - "description": "min=4.998, mean=4.999, max=5, sum=29.992 (6)", - "tab": "General information", - "score": 4.998712998712999 - }, - "XSUM - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (6)", - "tab": "General information", - "score": 0.0 - }, - "XSUM - # prompt tokens": { - "description": "min=1456.402, mean=1510.418, max=1538.921, sum=9062.51 (6)", - "tab": "General information", - "score": 1510.4182754182755 - }, - "XSUM - # output tokens": { - "description": "min=38.037, mean=40.165, max=41.259, sum=240.988 (6)", - "tab": "General information", - "score": 40.16473616473616 - }, - "XSUM - # trials": { - "description": "min=3, mean=3, max=3, sum=18 (6)", - "tab": "General information", - "score": 3.0 - }, - "XSUM - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=4 (6)", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "XSUM - Stereotypes (gender)": { - "description": "min=0.42, mean=0.443, max=0.467, sum=2.66 (6)", - "tab": "Bias", - "score": 0.44339662209590786 - }, - "XSUM - Representation (race)": { - "description": "min=0.436, mean=0.521, max=0.667, sum=3.124 (6)", - "tab": "Bias", - "score": 0.5206745206745207 - }, - "XSUM - Representation (gender)": { - "description": "min=0.178, mean=0.204, max=0.222, sum=1.222 (6)", - "tab": "Bias", - "score": 0.20364463830300386 - }, - "XSUM - Toxic fraction": { - "description": "min=0, mean=0.001, max=0.002, sum=0.008 (6)", - "tab": "Toxicity", - "score": 0.001287001287001287 - }, - "XSUM - SummaC": { - "description": "min=-0.078, mean=-0.057, max=-0.044, sum=-0.17 (3)", - "tab": "Summarization metrics", - "score": -0.05681849002633572 - }, - "XSUM - QAFactEval": { - "description": "min=4.256, mean=4.33, max=4.381, sum=25.981 (6)", - "tab": "Summarization metrics", - "score": 4.330178153632894 - }, - "XSUM - BERTScore (F1)": { - "description": "min=0.277, mean=0.281, max=0.286, sum=0.844 (3)", - "tab": 
"Summarization metrics", - "score": 0.28149043918051486 - }, - "XSUM - Coverage": { - "description": "min=0.873, mean=0.885, max=0.893, sum=5.312 (6)", - "tab": "Summarization metrics", - "score": 0.8853480945766184 - }, - "XSUM - Density": { - "description": "min=7.239, mean=8.487, max=9.133, sum=50.925 (6)", - "tab": "Summarization metrics", - "score": 8.487450287350649 - }, - "XSUM - Compression": { - "description": "min=11.1, mean=11.856, max=12.376, sum=71.136 (6)", - "tab": "Summarization metrics", - "score": 11.856076449493486 - }, - "XSUM - HumanEval-faithfulness": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-relevance": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-coherence": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "IMDB", - "source_data": { - "dataset_name": "IMDB", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on IMDB", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.913, - "details": { - "description": "min=0.902, mean=0.913, max=0.921, sum=2.738 (3)", - "tab": "Accuracy", - "IMDB - ECE (10-bin)": { - "description": "min=0.028, mean=0.038, max=0.05, sum=0.115 (3)", - "tab": "Calibration", - "score": 0.038396495508375095 - }, - "IMDB - EM (Robustness)": { - "description": "min=0.821, mean=0.844, max=0.868, sum=2.532 (3)", - "tab": "Robustness", - "score": 0.844 - }, - "IMDB - EM (Fairness)": { - "description": "min=0.871, mean=0.887, max=0.901, sum=2.66 (3)", - "tab": "Fairness", - "score": 0.8866666666666667 - }, - "IMDB - Denoised inference time (s)": { - "description": "min=0.151, mean=0.157, max=0.162, sum=0.472 (3)", - "tab": "Efficiency", - "score": 0.15740409657118068 - }, - "IMDB - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "IMDB - # train": { - "description": "min=2.916, mean=4.242, max=4.986, sum=12.726 (3)", - "tab": "General information", - "score": 4.242 - }, - "IMDB - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "IMDB - # prompt tokens": { - "description": "min=1277.729, mean=1553.363, max=1768.607, sum=4660.089 (3)", - "tab": "General information", - "score": 1553.363 - }, - "IMDB - # output tokens": { - "description": "min=1, mean=1.001, max=1.003, sum=3.003 (3)", - "tab": "General information", - "score": 1.0010000000000001 - }, - "IMDB - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "IMDB - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (gender)": { - "description": "1 
matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CivilComments", - "source_data": { - "dataset_name": "CivilComments", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on CivilComments", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.499, - "details": { - "description": "min=0, mean=0.499, max=1, sum=26.951 (54)", - "tab": "Accuracy", - "CivilComments - ECE (10-bin)": { - "description": "min=0.092, mean=0.499, max=0.911, sum=26.966 (54)", - "tab": "Calibration", - "score": 0.49936533676896183 - }, - "CivilComments - EM (Robustness)": { - "description": "min=0, mean=0.499, max=1, sum=26.94 (54)", - "tab": "Robustness", - "score": 0.4988821054609162 - }, - "CivilComments - EM (Fairness)": { - "description": "min=0, mean=0.499, max=1, sum=26.936 (54)", - "tab": "Fairness", - "score": 0.4988205867192775 - }, - "CivilComments - Denoised inference time (s)": { - "description": "min=0.13, mean=0.138, max=0.151, sum=7.438 (54)", - "tab": "Efficiency", - "score": 0.13774715150926628 - }, - "CivilComments - # eval": { - "description": "min=74, mean=371.556, max=683, sum=20064 (54)", - "tab": "General information", - "score": 371.55555555555554 - }, - "CivilComments - # train": { - "description": "min=5, mean=5, max=5, sum=270 (54)", - "tab": "General information", - "score": 5.0 - }, - "CivilComments - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (54)", - "tab": "General information", - "score": 0.0 - }, - "CivilComments - # prompt tokens": { - "description": "min=356.537, mean=722.635, max=1267.519, sum=39022.317 (54)", - "tab": "General information", - "score": 722.6354931173206 - }, - "CivilComments - # output tokens": { - "description": "min=1, mean=1, max=1, sum=54 (54)", - "tab": "General information", - "score": 1.0 - }, - "CivilComments - # trials": { - "description": "min=3, mean=3, max=3, sum=162 (54)", - "tab": "General information", - "score": 3.0 - }, - "CivilComments - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (54)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "RAFT", - "source_data": { - "dataset_name": "RAFT", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on RAFT", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.509, - "details": { - "description": "min=0.125, mean=0.509, max=0.925, sum=16.8 
(33)", - "tab": "Accuracy", - "RAFT - ECE (10-bin)": { - "description": "min=0.102, mean=0.295, max=0.541, sum=9.737 (33)", - "tab": "Calibration", - "score": 0.2950696376748286 - }, - "RAFT - EM (Robustness)": { - "description": "min=0.025, mean=0.383, max=0.925, sum=12.625 (33)", - "tab": "Robustness", - "score": 0.38257575757575757 - }, - "RAFT - EM (Fairness)": { - "description": "min=0.125, mean=0.475, max=0.925, sum=15.675 (33)", - "tab": "Fairness", - "score": 0.47500000000000003 - }, - "RAFT - Denoised inference time (s)": { - "description": "min=0.13, mean=0.153, max=0.188, sum=5.047 (33)", - "tab": "Efficiency", - "score": 0.15293320707070707 - }, - "RAFT - # eval": { - "description": "min=40, mean=40, max=40, sum=1320 (33)", - "tab": "General information", - "score": 40.0 - }, - "RAFT - # train": { - "description": "min=0, mean=4.556, max=5, sum=150.35 (33)", - "tab": "General information", - "score": 4.556060606060607 - }, - "RAFT - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (33)", - "tab": "General information", - "score": 0.0 - }, - "RAFT - # prompt tokens": { - "description": "min=257.35, mean=812.938, max=1773.675, sum=26826.95 (33)", - "tab": "General information", - "score": 812.937878787879 - }, - "RAFT - # output tokens": { - "description": "min=0.85, mean=2.774, max=5.875, sum=91.55 (33)", - "tab": "General information", - "score": 2.7742424242424244 - }, - "RAFT - # trials": { - "description": "min=3, mean=3, max=3, sum=99 (33)", - "tab": "General information", - "score": 3.0 - }, - "RAFT - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Toxic fraction": { - "description": "min=0, mean=0.001, max=0.025, sum=0.025 (33)", - "tab": "Toxicity", - "score": 0.0007575757575757576 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_classic/openai/text-curie-001/a4739cda-028b-48e0-b3b5-ca9b583d03f5.json b/data/helm_classic/openai/text-curie-001/a4739cda-028b-48e0-b3b5-ca9b583d03f5.json deleted file mode 100644 index 4537bcc84..000000000 --- a/data/helm_classic/openai/text-curie-001/a4739cda-028b-48e0-b3b5-ca9b583d03f5.json +++ /dev/null @@ -1,1613 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_classic/openai_text-curie-001/1770834891.1472661", - "retrieved_timestamp": "1770834891.1472661", - "source_metadata": { - "source_name": "helm_classic", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "text-curie-001", - "id": "openai/text-curie-001", - "developer": "openai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_classic", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperform on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.36, - 
"details": { - "tab": "Accuracy", - "Mean win rate - Calibration": { - "description": null, - "tab": "Calibration", - "score": 0.33452535946368817 - }, - "Mean win rate - Robustness": { - "description": null, - "tab": "Robustness", - "score": 0.336998226097225 - }, - "Mean win rate - Fairness": { - "description": null, - "tab": "Fairness", - "score": 0.377271245624972 - }, - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.7827028508771929 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - }, - "Mean win rate - Bias": { - "description": null, - "tab": "Bias", - "score": 0.49509040746991073 - }, - "Mean win rate - Toxicity": { - "description": null, - "tab": "Toxicity", - "score": 0.4050529717196384 - }, - "Mean win rate - Summarization metrics": { - "description": null, - "tab": "Summarization metrics", - "score": 0.6165831244778613 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.237, - "details": { - "description": "min=0.21, mean=0.237, max=0.298, sum=3.558 (15)", - "tab": "Accuracy", - "MMLU - ECE (10-bin)": { - "description": "min=0.298, mean=0.462, max=0.534, sum=6.937 (15)", - "tab": "Calibration", - "score": 0.4624557415628211 - }, - "MMLU - EM (Robustness)": { - "description": "min=0.16, mean=0.22, max=0.272, sum=3.303 (15)", - "tab": "Robustness", - "score": 0.22019883040935673 - }, - "MMLU - EM (Fairness)": { - "description": "min=0.2, mean=0.231, max=0.281, sum=3.462 (15)", - "tab": "Fairness", - "score": 0.23079532163742691 - }, - "MMLU - Denoised inference time (s)": { - "description": "min=0.129, mean=0.133, max=0.14, sum=1.998 (15)", - "tab": "Efficiency", - "score": 0.13321992694627194 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=1542 (15)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=75 (15)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (15)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=371.38, mean=472.274, max=624.07, sum=7084.111 (15)", - "tab": "General information", - "score": 472.2740350877193 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=15 (15)", - "tab": "General information", - "score": 1.0 - }, - "MMLU - # trials": { - "description": "min=3, mean=3, max=3, sum=45 (15)", - "tab": "General information", - "score": 3.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "BoolQ", - "source_data": { - "dataset_name": "BoolQ", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on BoolQ", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.62, - 
"details": { - "description": "min=0.591, mean=0.62, max=0.638, sum=1.861 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Accuracy", - "BoolQ - ECE (10-bin)": { - "description": "min=0.239, mean=0.253, max=0.279, sum=0.758 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Calibration", - "score": 0.252648729019218 - }, - "BoolQ - EM (Robustness)": { - "description": "min=0.519, mean=0.549, max=0.566, sum=1.648 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Robustness", - "score": 0.5493333333333332 - }, - "BoolQ - EM (Fairness)": { - "description": "min=0.543, mean=0.576, max=0.592, sum=1.727 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Fairness", - "score": 0.5756666666666667 - }, - "BoolQ - Denoised inference time (s)": { - "description": "min=0.141, mean=0.143, max=0.146, sum=0.429 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Efficiency", - "score": 0.14293199392361097 - }, - "BoolQ - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 1000.0 - }, - "BoolQ - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 5.0 - }, - "BoolQ - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)\nâš  Brown et al. 
perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.0 - }, - "BoolQ - # prompt tokens": { - "description": "min=660.073, mean=908.406, max=1242.073, sum=2725.219 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 908.4063333333334 - }, - "BoolQ - # output tokens": { - "description": "min=1.004, mean=1.007, max=1.012, sum=3.021 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 1.007 - }, - "BoolQ - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 3.0 - }, - "BoolQ - Stereotypes (race)": { - "description": "(0)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": null - }, - "BoolQ - Stereotypes (gender)": { - "description": "(0)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (race)": { - "description": "(0)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (gender)": { - "description": "(0)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. 
For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": null - }, - "BoolQ - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.582, - "details": { - "description": "min=0.55, mean=0.582, max=0.63, sum=1.746 (3)", - "tab": "Accuracy", - "NarrativeQA - ECE (10-bin)": { - "description": "min=0.198, mean=0.221, max=0.233, sum=0.664 (3)", - "tab": "Calibration", - "score": 0.22125645338584943 - }, - "NarrativeQA - F1 (Robustness)": { - "description": "min=0.299, mean=0.34, max=0.38, sum=1.02 (3)", - "tab": "Robustness", - "score": 0.33989457936851464 - }, - "NarrativeQA - F1 (Fairness)": { - "description": "min=0.428, mean=0.463, max=0.5, sum=1.389 (3)", - "tab": "Fairness", - "score": 0.4630759323159577 - }, - "NarrativeQA - Denoised inference time (s)": { - "description": "min=0.19, mean=0.205, max=0.217, sum=0.615 (3)", - "tab": "Efficiency", - "score": 0.20493085387323948 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=1065 (3)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=1.051, mean=1.647, max=2.085, sum=4.941 (3)", - "tab": "General information", - "score": 1.6469483568075116 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=1601.955, mean=1652.377, max=1705.003, sum=4957.132 (3)", - "tab": "General information", - "score": 1652.3774647887324 - }, - "NarrativeQA - # output tokens": { - "description": "min=6.645, mean=8.971, max=10.738, sum=26.913 (3)", - "tab": "General information", - "score": 8.970892018779344 - }, - "NarrativeQA - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NarrativeQA - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=1.333 (2)", - "tab": "Bias", - "score": 0.6666666666666667 - }, - "NarrativeQA - Stereotypes (gender)": { - "description": "min=0.436, mean=0.446, max=0.453, sum=1.339 (3)", - "tab": "Bias", - "score": 0.44628176056747487 - }, - "NarrativeQA - Representation (race)": { - "description": "min=0.494, mean=0.609, max=0.667, sum=1.828 (3)", - "tab": "Bias", - "score": 0.6091954022988506 - }, - "NarrativeQA - Representation (gender)": { - "description": "min=0.161, 
mean=0.19, max=0.207, sum=0.569 (3)", - "tab": "Bias", - "score": 0.1896444305777106 - }, - "NarrativeQA - Toxic fraction": { - "description": "min=0.014, mean=0.015, max=0.017, sum=0.045 (3)", - "tab": "Toxicity", - "score": 0.015023474178403754 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (open-book)", - "source_data": { - "dataset_name": "NaturalQuestions (open-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (open-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.571, - "details": { - "description": "min=0.536, mean=0.571, max=0.599, sum=1.714 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Accuracy", - "NaturalQuestions (closed-book) - ECE (10-bin)": { - "description": "min=0.233, mean=0.253, max=0.264, sum=0.758 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Calibration", - "score": 0.25269080261254767 - }, - "NaturalQuestions (open-book) - ECE (10-bin)": { - "description": "min=0.215, mean=0.216, max=0.217, sum=0.648 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Calibration", - "score": 0.21613185314031233 - }, - "NaturalQuestions (closed-book) - F1 (Robustness)": { - "description": "min=0.116, mean=0.121, max=0.124, sum=0.363 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Robustness", - "score": 0.12098406641539787 - }, - "NaturalQuestions (open-book) - F1 (Robustness)": { - "description": "min=0.365, mean=0.415, max=0.445, sum=1.246 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Robustness", - "score": 0.4152585116053236 - }, - "NaturalQuestions (closed-book) - F1 (Fairness)": { - "description": "min=0.124, mean=0.132, max=0.139, sum=0.396 (3)\nâš  Brown et al. 
perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Fairness", - "score": 0.13187631785928275 - }, - "NaturalQuestions (open-book) - F1 (Fairness)": { - "description": "min=0.464, mean=0.5, max=0.519, sum=1.499 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Fairness", - "score": 0.4995085831746681 - }, - "NaturalQuestions (closed-book) - Denoised inference time (s)": { - "description": "min=0.152, mean=0.153, max=0.154, sum=0.459 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Efficiency", - "score": 0.15303552604166656 - }, - "NaturalQuestions (open-book) - Denoised inference time (s)": { - "description": "min=0.176, mean=0.185, max=0.193, sum=0.554 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Efficiency", - "score": 0.1847613116319444 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. 
See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=110.254, mean=112.254, max=116.254, sum=336.762 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 112.254 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=4.507, mean=4.641, max=4.737, sum=13.923 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 4.641 - }, - "NaturalQuestions (closed-book) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 3.0 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.647, mean=4.691, max=4.724, sum=14.074 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 4.691333333333334 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.036, mean=0.036, max=0.036, sum=0.108 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.036 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1231.212, mean=1419.574, max=1523.257, sum=4258.721 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. 
For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 1419.5736666666664 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=5.931, mean=6.634, max=7.52, sum=19.901 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 6.633666666666667 - }, - "NaturalQuestions (open-book) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 3.0 - }, - "NaturalQuestions (closed-book) - Stereotypes (race)": { - "description": "(0)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": null - }, - "NaturalQuestions (closed-book) - Stereotypes (gender)": { - "description": "min=0.5, mean=0.5, max=0.5, sum=1.5 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.5 - }, - "NaturalQuestions (closed-book) - Representation (race)": { - "description": "min=0.542, mean=0.566, max=0.6, sum=1.697 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.5657407407407408 - }, - "NaturalQuestions (closed-book) - Representation (gender)": { - "description": "min=0.119, mean=0.238, max=0.346, sum=0.715 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.23840048840048841 - }, - "NaturalQuestions (open-book) - Stereotypes (race)": { - "description": "(0)\nâš  Brown et al. 
perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": null - }, - "NaturalQuestions (open-book) - Stereotypes (gender)": { - "description": "min=0.3, mean=0.433, max=0.5, sum=1.3 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.43333333333333335 - }, - "NaturalQuestions (open-book) - Representation (race)": { - "description": "min=0.405, mean=0.441, max=0.467, sum=1.323 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.44097026888062185 - }, - "NaturalQuestions (open-book) - Representation (gender)": { - "description": "min=0.144, mean=0.158, max=0.179, sum=0.473 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.15754640839386602 - }, - "NaturalQuestions (closed-book) - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Toxicity", - "score": 0.0 - }, - "NaturalQuestions (open-book) - Toxic fraction": { - "description": "min=0, mean=0.001, max=0.002, sum=0.003 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. 
See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Toxicity", - "score": 0.001 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "QuAC", - "source_data": { - "dataset_name": "QuAC", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on QuAC", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.358, - "details": { - "description": "min=0.341, mean=0.358, max=0.383, sum=1.074 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Accuracy", - "QuAC - ECE (10-bin)": { - "description": "min=0.237, mean=0.254, max=0.272, sum=0.763 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Calibration", - "score": 0.25427485237899866 - }, - "QuAC - F1 (Robustness)": { - "description": "min=0.166, mean=0.169, max=0.173, sum=0.506 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Robustness", - "score": 0.16872479684813432 - }, - "QuAC - F1 (Fairness)": { - "description": "min=0.244, mean=0.255, max=0.264, sum=0.765 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Fairness", - "score": 0.2548639356870548 - }, - "QuAC - Denoised inference time (s)": { - "description": "min=0.287, mean=0.298, max=0.313, sum=0.894 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Efficiency", - "score": 0.29803956770833356 - }, - "QuAC - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. 
See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 1000.0 - }, - "QuAC - # train": { - "description": "min=0.845, mean=0.944, max=1.086, sum=2.833 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.9443333333333334 - }, - "QuAC - truncated": { - "description": "min=0.016, mean=0.016, max=0.016, sum=0.048 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.016 - }, - "QuAC - # prompt tokens": { - "description": "min=1625.523, mean=1644.831, max=1670.605, sum=4934.492 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 1644.8306666666667 - }, - "QuAC - # output tokens": { - "description": "min=20.676, mean=22.198, max=24.409, sum=66.593 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 22.197666666666663 - }, - "QuAC - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 3.0 - }, - "QuAC - Stereotypes (race)": { - "description": "min=0.593, mean=0.631, max=0.667, sum=1.893 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.6308641975308643 - }, - "QuAC - Stereotypes (gender)": { - "description": "min=0.438, mean=0.456, max=0.473, sum=1.367 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. 
See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.4556780038650607 - }, - "QuAC - Representation (race)": { - "description": "min=0.244, mean=0.274, max=0.294, sum=0.822 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.27410775768984724 - }, - "QuAC - Representation (gender)": { - "description": "min=0.231, mean=0.242, max=0.26, sum=0.726 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.24189395211611728 - }, - "QuAC - Toxic fraction": { - "description": "min=0.001, mean=0.001, max=0.002, sum=0.004 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Toxicity", - "score": 0.0013333333333333333 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "HellaSwag", - "source_data": { - "dataset_name": "HellaSwag", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on HellaSwag", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.676, - "details": { - "description": "min=0.676, mean=0.676, max=0.676, sum=0.676 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Accuracy", - "HellaSwag - ECE (10-bin)": { - "description": "min=0.153, mean=0.153, max=0.153, sum=0.153 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Calibration", - "score": 0.15281579026404526 - }, - "HellaSwag - EM (Robustness)": { - "description": "min=0.625, mean=0.625, max=0.625, sum=0.625 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. 
See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Robustness", - "score": 0.625 - }, - "HellaSwag - EM (Fairness)": { - "description": "min=0.534, mean=0.534, max=0.534, sum=0.534 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Fairness", - "score": 0.534 - }, - "HellaSwag - Denoised inference time (s)": { - "description": "min=0.125, mean=0.125, max=0.125, sum=0.125 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Efficiency", - "score": 0.12517962499999974 - }, - "HellaSwag - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 1000.0 - }, - "HellaSwag - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag - # prompt tokens": { - "description": "min=87.888, mean=87.888, max=87.888, sum=87.888 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 87.888 - }, - "HellaSwag - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. 
See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.514, - "details": { - "description": "min=0.514, mean=0.514, max=0.514, sum=0.514 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Accuracy", - "OpenbookQA - ECE (10-bin)": { - "description": "min=0.321, mean=0.321, max=0.321, sum=0.321 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Calibration", - "score": 0.3206023655720099 - }, - "OpenbookQA - EM (Robustness)": { - "description": "min=0.424, mean=0.424, max=0.424, sum=0.424 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Robustness", - "score": 0.424 - }, - "OpenbookQA - EM (Fairness)": { - "description": "min=0.452, mean=0.452, max=0.452, sum=0.452 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Fairness", - "score": 0.452 - }, - "OpenbookQA - Denoised inference time (s)": { - "description": "min=0.119, mean=0.119, max=0.119, sum=0.119 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. 
See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Efficiency", - "score": 0.1193705468750003 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=5.27, mean=5.27, max=5.27, sum=5.27 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 5.27 - }, - "OpenbookQA - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. 
See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "TruthfulQA", - "source_data": { - "dataset_name": "TruthfulQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on TruthfulQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.257, - "details": { - "description": "min=0.231, mean=0.257, max=0.301, sum=0.772 (3)", - "tab": "Accuracy", - "TruthfulQA - ECE (10-bin)": { - "description": "min=0.321, mean=0.355, max=0.375, sum=1.066 (3)", - "tab": "Calibration", - "score": 0.35539796883884156 - }, - "TruthfulQA - EM (Robustness)": { - "description": "min=0.206, mean=0.235, max=0.284, sum=0.705 (3)", - "tab": "Robustness", - "score": 0.23496432212028542 - }, - "TruthfulQA - EM (Fairness)": { - "description": "min=0.209, mean=0.239, max=0.286, sum=0.717 (3)", - "tab": "Fairness", - "score": 0.23904179408766565 - }, - "TruthfulQA - Denoised inference time (s)": { - "description": "min=0.134, mean=0.134, max=0.136, sum=0.403 (3)", - "tab": "Efficiency", - "score": 0.1343441023987004 - }, - "TruthfulQA - # eval": { - "description": "min=654, mean=654, max=654, sum=1962 (3)", - "tab": "General information", - "score": 654.0 - }, - "TruthfulQA - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "TruthfulQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "TruthfulQA - # prompt tokens": { - "description": "min=501.121, mean=511.121, max=529.121, sum=1533.362 (3)", - "tab": "General information", - "score": 511.12079510703364 - }, - "TruthfulQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=3 (3)", - "tab": "General information", - "score": 1.0 - }, - "TruthfulQA - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MS MARCO (TREC)", - "source_data": { - "dataset_name": "MS MARCO (TREC)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "NDCG@10 on MS MARCO (TREC)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.507, - "details": { - "description": "min=0.476, mean=0.507, max=0.545, sum=1.522 (3)", - "tab": "Accuracy", - "MS MARCO (regular) - RR@10 (Robustness)": { - "description": "min=0.171, mean=0.198, max=0.222, sum=0.594 (3)", - "tab": "Robustness", - "score": 0.1980144179894178 - }, - "MS MARCO (TREC) - NDCG@10 (Robustness)": { - "description": "min=0.393, mean=0.444, max=0.486, sum=1.331 (3)", - "tab": "Robustness", - "score": 0.4437543283018195 - }, - "MS MARCO (regular) - RR@10 (Fairness)": { - "description": "min=0.231, mean=0.244, max=0.26, sum=0.732 (3)", - "tab": "Fairness", - "score": 0.2441616402116399 - }, - "MS MARCO (TREC) - NDCG@10 (Fairness)": { - "description": "min=0.448, mean=0.482, max=0.523, sum=1.445 (3)", - "tab": 
"Fairness", - "score": 0.4817143719085842 - }, - "MS MARCO (regular) - Denoised inference time (s)": { - "description": "min=0.134, mean=0.136, max=0.138, sum=0.408 (3)", - "tab": "Efficiency", - "score": 0.13591170442708336 - }, - "MS MARCO (TREC) - Denoised inference time (s)": { - "description": "min=0.133, mean=0.135, max=0.138, sum=0.406 (3)", - "tab": "Efficiency", - "score": 0.13529218144379848 - }, - "MS MARCO (regular) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "MS MARCO (regular) - # train": { - "description": "min=2, mean=2, max=2, sum=6 (3)", - "tab": "General information", - "score": 2.0 - }, - "MS MARCO (regular) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "MS MARCO (regular) - # prompt tokens": { - "description": "min=495.232, mean=532.565, max=577.232, sum=1597.696 (3)", - "tab": "General information", - "score": 532.5653333333333 - }, - "MS MARCO (regular) - # output tokens": { - "description": "min=1.005, mean=1.031, max=1.08, sum=3.092 (3)", - "tab": "General information", - "score": 1.0306666666666666 - }, - "MS MARCO (regular) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "MS MARCO (TREC) - # eval": { - "description": "min=43, mean=43, max=43, sum=129 (3)", - "tab": "General information", - "score": 43.0 - }, - "MS MARCO (TREC) - # train": { - "description": "min=2, mean=2, max=2, sum=6 (3)", - "tab": "General information", - "score": 2.0 - }, - "MS MARCO (TREC) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "MS MARCO (TREC) - # prompt tokens": { - "description": "min=478.488, mean=515.822, max=560.488, sum=1547.465 (3)", - "tab": "General information", - "score": 515.8217054263565 - }, - "MS MARCO (TREC) - # output tokens": { - "description": "min=1, mean=1.078, max=1.209, sum=3.233 (3)", - "tab": "General information", - "score": 1.0775193798449612 - }, - "MS MARCO (TREC) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "MS MARCO (regular) - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": 
"Toxicity", - "score": null - }, - "MS MARCO (TREC) - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CNN/DailyMail", - "source_data": { - "dataset_name": "CNN/DailyMail", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on CNN/DailyMail", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.152, - "details": { - "description": "min=0.144, mean=0.152, max=0.159, sum=1.061 (7)", - "tab": "Accuracy", - "CNN/DailyMail - Denoised inference time (s)": { - "description": "min=0.748, mean=0.799, max=0.848, sum=5.594 (7)", - "tab": "Efficiency", - "score": 0.7991309579692929 - }, - "CNN/DailyMail - # eval": { - "description": "min=466, mean=466, max=466, sum=3262 (7)", - "tab": "General information", - "score": 466.0 - }, - "CNN/DailyMail - # train": { - "description": "min=0, mean=4.286, max=5, sum=30 (7)", - "tab": "General information", - "score": 4.285714285714286 - }, - "CNN/DailyMail - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "CNN/DailyMail - # prompt tokens": { - "description": "min=583.586, mean=1411.872, max=1567.586, sum=9883.101 (7)", - "tab": "General information", - "score": 1411.8715511955854 - }, - "CNN/DailyMail - # output tokens": { - "description": "min=86.798, mean=94.314, max=101.208, sum=660.2 (7)", - "tab": "General information", - "score": 94.31422440220724 - }, - "CNN/DailyMail - # trials": { - "description": "min=1, mean=2.714, max=3, sum=19 (7)", - "tab": "General information", - "score": 2.7142857142857144 - }, - "CNN/DailyMail - Stereotypes (race)": { - "description": "min=0.566, mean=0.61, max=0.637, sum=4.269 (7)", - "tab": "Bias", - "score": 0.609875949224765 - }, - "CNN/DailyMail - Stereotypes (gender)": { - "description": "min=0.366, mean=0.387, max=0.406, sum=2.706 (7)", - "tab": "Bias", - "score": 0.38654992671117155 - }, - "CNN/DailyMail - Representation (race)": { - "description": "min=0.282, mean=0.301, max=0.322, sum=2.106 (7)", - "tab": "Bias", - "score": 0.30088570849440416 - }, - "CNN/DailyMail - Representation (gender)": { - "description": "min=0.1, mean=0.118, max=0.133, sum=0.827 (7)", - "tab": "Bias", - "score": 0.11810804679822585 - }, - "CNN/DailyMail - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "Toxicity", - "score": 0.0 - }, - "CNN/DailyMail - SummaC": { - "description": "min=0.156, mean=0.291, max=0.356, sum=1.165 (4)", - "tab": "Summarization metrics", - "score": 0.2913458656100147 - }, - "CNN/DailyMail - QAFactEval": { - "description": "min=4.214, mean=4.616, max=4.743, sum=32.315 (7)", - "tab": "Summarization metrics", - "score": 4.616429547159027 - }, - "CNN/DailyMail - BERTScore (F1)": { - "description": "min=0.299, mean=0.306, max=0.314, sum=1.222 (4)", - "tab": "Summarization metrics", - "score": 0.3055441003363248 - }, - "CNN/DailyMail - Coverage": { - "description": "min=0.935, mean=0.961, max=0.97, sum=6.725 (7)", - "tab": "Summarization metrics", - "score": 0.9607616041668255 - }, - "CNN/DailyMail - Density": { - "description": "min=17.105, mean=26.1, max=29.982, sum=182.7 (7)", - "tab": "Summarization metrics", - 
"score": 26.09992906850249 - }, - "CNN/DailyMail - Compression": { - "description": "min=6.155, mean=6.829, max=7.635, sum=47.805 (7)", - "tab": "Summarization metrics", - "score": 6.829258437977153 - }, - "CNN/DailyMail - HumanEval-faithfulness": { - "description": "min=0.967, mean=0.967, max=0.967, sum=0.967 (1)", - "tab": "Summarization metrics", - "score": 0.9666666666666669 - }, - "CNN/DailyMail - HumanEval-relevance": { - "description": "min=4.587, mean=4.587, max=4.587, sum=4.587 (1)", - "tab": "Summarization metrics", - "score": 4.586666666666667 - }, - "CNN/DailyMail - HumanEval-coherence": { - "description": "min=4.243, mean=4.243, max=4.243, sum=4.243 (1)", - "tab": "Summarization metrics", - "score": 4.243333333333334 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "XSUM", - "source_data": { - "dataset_name": "XSUM", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on XSUM", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.076, - "details": { - "description": "min=0.056, mean=0.076, max=0.081, sum=0.533 (7)", - "tab": "Accuracy", - "XSUM - Denoised inference time (s)": { - "description": "min=0.349, mean=0.364, max=0.408, sum=2.548 (7)", - "tab": "Efficiency", - "score": 0.36398217373942815 - }, - "XSUM - # eval": { - "description": "min=518, mean=518, max=518, sum=3626 (7)", - "tab": "General information", - "score": 518.0 - }, - "XSUM - # train": { - "description": "min=0, mean=4.285, max=5, sum=29.992 (7)", - "tab": "General information", - "score": 4.284611141753999 - }, - "XSUM - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "XSUM - # prompt tokens": { - "description": "min=388.402, mean=1350.13, max=1538.921, sum=9450.911 (7)", - "tab": "General information", - "score": 1350.1301709873137 - }, - "XSUM - # output tokens": { - "description": "min=29.917, mean=32.345, max=40.357, sum=226.415 (7)", - "tab": "General information", - "score": 32.3450082735797 - }, - "XSUM - # trials": { - "description": "min=1, mean=2.714, max=3, sum=19 (7)", - "tab": "General information", - "score": 2.7142857142857144 - }, - "XSUM - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=4.667 (7)", - "tab": "Bias", - "score": 0.6666666666666669 - }, - "XSUM - Stereotypes (gender)": { - "description": "min=0.419, mean=0.442, max=0.466, sum=3.093 (7)", - "tab": "Bias", - "score": 0.4418823146165695 - }, - "XSUM - Representation (race)": { - "description": "min=0.473, mean=0.54, max=0.584, sum=3.777 (7)", - "tab": "Bias", - "score": 0.5395129666982432 - }, - "XSUM - Representation (gender)": { - "description": "min=0.172, mean=0.194, max=0.228, sum=1.356 (7)", - "tab": "Bias", - "score": 0.1937219794503278 - }, - "XSUM - Toxic fraction": { - "description": "min=0, mean=0.002, max=0.004, sum=0.012 (7)", - "tab": "Toxicity", - "score": 0.0016547159404302263 - }, - "XSUM - SummaC": { - "description": "min=-0.241, mean=-0.185, max=-0.057, sum=-0.741 (4)", - "tab": "Summarization metrics", - "score": -0.18531544589014434 - }, - "XSUM - QAFactEval": { - "description": "min=3.199, mean=3.459, max=3.799, sum=24.213 (7)", - "tab": "Summarization metrics", - "score": 3.458996653634986 - }, - "XSUM - BERTScore 
(F1)": { - "description": "min=0.308, mean=0.354, max=0.372, sum=1.415 (4)", - "tab": "Summarization metrics", - "score": 0.3536865086232682 - }, - "XSUM - Coverage": { - "description": "min=0.823, mean=0.839, max=0.903, sum=5.872 (7)", - "tab": "Summarization metrics", - "score": 0.838839539634714 - }, - "XSUM - Density": { - "description": "min=3.005, mean=4.008, max=8.274, sum=28.059 (7)", - "tab": "Summarization metrics", - "score": 4.008473483028278 - }, - "XSUM - Compression": { - "description": "min=11.556, mean=12.98, max=13.601, sum=90.86 (7)", - "tab": "Summarization metrics", - "score": 12.979988031884476 - }, - "XSUM - HumanEval-faithfulness": { - "description": "min=0.957, mean=0.991, max=1, sum=4.957 (5)", - "tab": "Summarization metrics", - "score": 0.9913333333333334 - }, - "XSUM - HumanEval-relevance": { - "description": "min=4, mean=4.068, max=4.34, sum=20.34 (5)", - "tab": "Summarization metrics", - "score": 4.068 - }, - "XSUM - HumanEval-coherence": { - "description": "min=4.273, mean=4.321, max=4.333, sum=21.607 (5)", - "tab": "Summarization metrics", - "score": 4.3213333333333335 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "IMDB", - "source_data": { - "dataset_name": "IMDB", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on IMDB", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.923, - "details": { - "description": "min=0.915, mean=0.923, max=0.927, sum=2.768 (3)", - "tab": "Accuracy", - "IMDB - ECE (10-bin)": { - "description": "min=0.027, mean=0.031, max=0.034, sum=0.093 (3)", - "tab": "Calibration", - "score": 0.03108408690404522 - }, - "IMDB - EM (Robustness)": { - "description": "min=0.876, mean=0.881, max=0.887, sum=2.642 (3)", - "tab": "Robustness", - "score": 0.8806666666666666 - }, - "IMDB - EM (Fairness)": { - "description": "min=0.903, mean=0.91, max=0.916, sum=2.731 (3)", - "tab": "Fairness", - "score": 0.9103333333333333 - }, - "IMDB - Denoised inference time (s)": { - "description": "min=0.142, mean=0.147, max=0.151, sum=0.442 (3)", - "tab": "Efficiency", - "score": 0.1473289437934027 - }, - "IMDB - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "IMDB - # train": { - "description": "min=2.916, mean=4.242, max=4.986, sum=12.726 (3)", - "tab": "General information", - "score": 4.242 - }, - "IMDB - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "IMDB - # prompt tokens": { - "description": "min=1277.729, mean=1553.363, max=1768.607, sum=4660.089 (3)", - "tab": "General information", - "score": 1553.363 - }, - "IMDB - # output tokens": { - "description": "min=0.998, mean=0.999, max=1, sum=2.996 (3)", - "tab": "General information", - "score": 0.9986666666666667 - }, - "IMDB - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "IMDB - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (race)": { - "description": "1 
matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CivilComments", - "source_data": { - "dataset_name": "CivilComments", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on CivilComments", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.537, - "details": { - "description": "min=0.04, mean=0.537, max=0.93, sum=29.013 (54)", - "tab": "Accuracy", - "CivilComments - ECE (10-bin)": { - "description": "min=0.049, mean=0.262, max=0.674, sum=14.15 (54)", - "tab": "Calibration", - "score": 0.26204430696260744 - }, - "CivilComments - EM (Robustness)": { - "description": "min=0, mean=0.129, max=0.39, sum=6.954 (54)", - "tab": "Robustness", - "score": 0.12877898867890694 - }, - "CivilComments - EM (Fairness)": { - "description": "min=0.02, mean=0.471, max=0.874, sum=25.434 (54)", - "tab": "Fairness", - "score": 0.4710066762167616 - }, - "CivilComments - Denoised inference time (s)": { - "description": "min=0.129, mean=0.142, max=0.149, sum=7.645 (54)", - "tab": "Efficiency", - "score": 0.1415740791295965 - }, - "CivilComments - # eval": { - "description": "min=74, mean=371.556, max=683, sum=20064 (54)", - "tab": "General information", - "score": 371.55555555555554 - }, - "CivilComments - # train": { - "description": "min=5, mean=5, max=5, sum=270 (54)", - "tab": "General information", - "score": 5.0 - }, - "CivilComments - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (54)", - "tab": "General information", - "score": 0.0 - }, - "CivilComments - # prompt tokens": { - "description": "min=356.537, mean=722.635, max=1267.519, sum=39022.317 (54)", - "tab": "General information", - "score": 722.6354931173206 - }, - "CivilComments - # output tokens": { - "description": "min=0.905, mean=0.979, max=1, sum=52.876 (54)", - "tab": "General information", - "score": 0.9791789992573504 - }, - "CivilComments - # trials": { - "description": "min=3, mean=3, max=3, sum=162 (54)", - "tab": "General information", - "score": 3.0 - }, - "CivilComments - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (54)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "RAFT", - "source_data": { - "dataset_name": "RAFT", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on RAFT", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.489, - "details": { - "description": "min=0, mean=0.489, max=0.85, sum=16.15 (33)", - "tab": "Accuracy", - "RAFT - ECE (10-bin)": { - "description": "min=0.079, mean=0.409, max=1, sum=13.49 (33)", - "tab": "Calibration", - "score": 0.40879785924457385 - }, - "RAFT - EM (Robustness)": { - "description": "min=0, mean=0.399, max=0.775, sum=13.175 (33)", - "tab": "Robustness", - "score": 0.3992424242424243 - }, - "RAFT - EM (Fairness)": { - "description": "min=0, mean=0.458, max=0.85, sum=15.125 (33)", - "tab": "Fairness", - "score": 0.45833333333333337 - }, - "RAFT - Denoised inference time (s)": { - "description": "min=0.13, mean=0.152, max=0.183, sum=5.003 (33)", - "tab": "Efficiency", - "score": 0.1516085454150884 - }, - "RAFT - # eval": { - "description": "min=40, mean=40, max=40, sum=1320 (33)", - "tab": "General information", - "score": 40.0 - }, - "RAFT - # train": { - "description": "min=0, mean=4.556, max=5, sum=150.35 (33)", - "tab": "General information", - "score": 4.556060606060607 - }, - "RAFT - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (33)", - "tab": "General information", - "score": 0.0 - }, - "RAFT - # prompt tokens": { - "description": "min=257.35, mean=812.938, max=1773.675, sum=26826.95 (33)", - "tab": "General information", - "score": 812.937878787879 - }, - "RAFT - # output tokens": { - "description": "min=0, mean=2.751, max=5.95, sum=90.775 (33)", - "tab": "General information", - "score": 2.750757575757576 - }, - "RAFT - # trials": { - "description": "min=3, mean=3, max=3, sum=99 (33)", - "tab": "General information", - "score": 3.0 - }, - "RAFT - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (33)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_classic/openai/text-davinci-002/837e20ff-fed1-4431-b643-63b904055c66.json b/data/helm_classic/openai/text-davinci-002/837e20ff-fed1-4431-b643-63b904055c66.json deleted file mode 100644 index 0e9fa4947..000000000 --- a/data/helm_classic/openai/text-davinci-002/837e20ff-fed1-4431-b643-63b904055c66.json +++ /dev/null @@ -1,1613 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_classic/openai_text-davinci-002/1770834891.1472661", - "retrieved_timestamp": "1770834891.1472661", - "source_metadata": { - "source_name": "helm_classic", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "text-davinci-002", - "id": "openai/text-davinci-002", - "developer": "openai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_classic", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperform on average (over 
columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.905, - "details": { - "tab": "Accuracy", - "Mean win rate - Calibration": { - "description": null, - "tab": "Calibration", - "score": 0.4743236143945364 - }, - "Mean win rate - Robustness": { - "description": null, - "tab": "Robustness", - "score": 0.9158568720860156 - }, - "Mean win rate - Fairness": { - "description": null, - "tab": "Fairness", - "score": 0.8637256699548135 - }, - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.6036239035087719 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - }, - "Mean win rate - Bias": { - "description": null, - "tab": "Bias", - "score": 0.502171676177358 - }, - "Mean win rate - Toxicity": { - "description": null, - "tab": "Toxicity", - "score": 0.4088448588448588 - }, - "Mean win rate - Summarization metrics": { - "description": null, - "tab": "Summarization metrics", - "score": 0.6410087719298245 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.568, - "details": { - "description": "min=0.26, mean=0.568, max=0.86, sum=8.515 (15)", - "tab": "Accuracy", - "MMLU - ECE (10-bin)": { - "description": "min=0.064, mean=0.176, max=0.264, sum=2.644 (15)", - "tab": "Calibration", - "score": 0.17629729974248792 - }, - "MMLU - EM (Robustness)": { - "description": "min=0.23, mean=0.525, max=0.83, sum=7.868 (15)", - "tab": "Robustness", - "score": 0.5245380116959065 - }, - "MMLU - EM (Fairness)": { - "description": "min=0.24, mean=0.531, max=0.82, sum=7.964 (15)", - "tab": "Fairness", - "score": 0.5309473684210526 - }, - "MMLU - Denoised inference time (s)": { - "description": "min=0.175, mean=0.196, max=0.215, sum=2.946 (15)", - "tab": "Efficiency", - "score": 0.19643028419682018 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=1542 (15)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=75 (15)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (15)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=371.38, mean=472.274, max=624.07, sum=7084.111 (15)", - "tab": "General information", - "score": 472.2740350877193 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=15 (15)", - "tab": "General information", - "score": 1.0 - }, - "MMLU - # trials": { - "description": "min=3, mean=3, max=3, sum=45 (15)", - "tab": "General information", - "score": 3.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "BoolQ", - "source_data": { - "dataset_name": "BoolQ", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on 
BoolQ", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.877, - "details": { - "description": "min=0.872, mean=0.877, max=0.883, sum=2.631 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Accuracy", - "BoolQ - ECE (10-bin)": { - "description": "min=0.057, mean=0.064, max=0.068, sum=0.192 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Calibration", - "score": 0.06391934132499137 - }, - "BoolQ - EM (Robustness)": { - "description": "min=0.834, mean=0.841, max=0.854, sum=2.523 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Robustness", - "score": 0.8410000000000001 - }, - "BoolQ - EM (Fairness)": { - "description": "min=0.829, mean=0.837, max=0.844, sum=2.51 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Fairness", - "score": 0.8366666666666666 - }, - "BoolQ - Denoised inference time (s)": { - "description": "min=0.176, mean=0.191, max=0.216, sum=0.574 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Efficiency", - "score": 0.1911954346788195 - }, - "BoolQ - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 1000.0 - }, - "BoolQ - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. 
See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 5.0 - }, - "BoolQ - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.0 - }, - "BoolQ - # prompt tokens": { - "description": "min=660.073, mean=908.406, max=1242.073, sum=2725.219 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 908.4063333333334 - }, - "BoolQ - # output tokens": { - "description": "min=1.009, mean=1.013, max=1.018, sum=3.039 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 1.013 - }, - "BoolQ - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 3.0 - }, - "BoolQ - Stereotypes (race)": { - "description": "(0)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": null - }, - "BoolQ - Stereotypes (gender)": { - "description": "(0)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (race)": { - "description": "(0)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (gender)": { - "description": "(0)\nâš  Brown et al. 
perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": null - }, - "BoolQ - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.727, - "details": { - "description": "min=0.711, mean=0.727, max=0.752, sum=2.182 (3)", - "tab": "Accuracy", - "NarrativeQA - ECE (10-bin)": { - "description": "min=0.214, mean=0.239, max=0.268, sum=0.718 (3)", - "tab": "Calibration", - "score": 0.2393596998509794 - }, - "NarrativeQA - F1 (Robustness)": { - "description": "min=0.61, mean=0.638, max=0.663, sum=1.915 (3)", - "tab": "Robustness", - "score": 0.6382180079306305 - }, - "NarrativeQA - F1 (Fairness)": { - "description": "min=0.637, mean=0.646, max=0.664, sum=1.938 (3)", - "tab": "Fairness", - "score": 0.6459531095726224 - }, - "NarrativeQA - Denoised inference time (s)": { - "description": "min=0.48, mean=0.512, max=0.539, sum=1.537 (3)", - "tab": "Efficiency", - "score": 0.5124278205692486 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=1065 (3)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=4.259, mean=4.532, max=4.955, sum=13.597 (3)", - "tab": "General information", - "score": 4.532394366197183 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=3479.563, mean=3579.093, max=3633.659, sum=10737.279 (3)", - "tab": "General information", - "score": 3579.092957746479 - }, - "NarrativeQA - # output tokens": { - "description": "min=6.158, mean=7.378, max=8.448, sum=22.135 (3)", - "tab": "General information", - "score": 7.378403755868544 - }, - "NarrativeQA - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NarrativeQA - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NarrativeQA - Stereotypes (gender)": { - "description": "min=0.363, mean=0.395, max=0.417, sum=1.184 (3)", - "tab": "Bias", - "score": 0.39479717813051146 - }, - "NarrativeQA - Representation (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=0.667 (1)", - "tab": "Bias", - "score": 0.6666666666666667 - }, - "NarrativeQA - Representation (gender)": { - 
"description": "min=0.17, mean=0.189, max=0.21, sum=0.568 (3)", - "tab": "Bias", - "score": 0.18948121770702417 - }, - "NarrativeQA - Toxic fraction": { - "description": "min=0.008, mean=0.013, max=0.017, sum=0.039 (3)", - "tab": "Toxicity", - "score": 0.013145539906103286 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (open-book)", - "source_data": { - "dataset_name": "NaturalQuestions (open-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (open-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.713, - "details": { - "description": "min=0.71, mean=0.713, max=0.716, sum=2.139 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Accuracy", - "NaturalQuestions (closed-book) - ECE (10-bin)": { - "description": "min=0.315, mean=0.341, max=0.356, sum=1.022 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Calibration", - "score": 0.34056739358291327 - }, - "NaturalQuestions (open-book) - ECE (10-bin)": { - "description": "min=0.233, mean=0.242, max=0.247, sum=0.726 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Calibration", - "score": 0.24207582378172995 - }, - "NaturalQuestions (closed-book) - F1 (Robustness)": { - "description": "min=0.279, mean=0.299, max=0.31, sum=0.896 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Robustness", - "score": 0.29853007347043187 - }, - "NaturalQuestions (open-book) - F1 (Robustness)": { - "description": "min=0.66, mean=0.665, max=0.67, sum=1.994 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. 
See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Robustness", - "score": 0.6645627340843298 - }, - "NaturalQuestions (closed-book) - F1 (Fairness)": { - "description": "min=0.311, mean=0.32, max=0.326, sum=0.96 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Fairness", - "score": 0.3200640288704773 - }, - "NaturalQuestions (open-book) - F1 (Fairness)": { - "description": "min=0.655, mean=0.659, max=0.663, sum=1.976 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Fairness", - "score": 0.658783235208417 - }, - "NaturalQuestions (closed-book) - Denoised inference time (s)": { - "description": "min=0.259, mean=0.264, max=0.268, sum=0.791 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Efficiency", - "score": 0.26376651302083315 - }, - "NaturalQuestions (open-book) - Denoised inference time (s)": { - "description": "min=0.387, mean=0.394, max=0.398, sum=1.182 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Efficiency", - "score": 0.3939576829427085 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. 
For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=110.254, mean=112.254, max=116.254, sum=336.762 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 112.254 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=3.783, mean=3.954, max=4.116, sum=11.861 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 3.9536666666666664 - }, - "NaturalQuestions (closed-book) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 3.0 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.874, mean=4.883, max=4.891, sum=14.65 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 4.883333333333334 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.02, mean=0.02, max=0.02, sum=0.06 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. 
See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.02 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1315.257, mean=1520.977, max=1629.945, sum=4562.931 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 1520.977 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=6.586, mean=6.652, max=6.739, sum=19.957 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 6.652333333333334 - }, - "NaturalQuestions (open-book) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 3.0 - }, - "NaturalQuestions (closed-book) - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=0.667 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "NaturalQuestions (closed-book) - Stereotypes (gender)": { - "description": "(0)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": null - }, - "NaturalQuestions (closed-book) - Representation (race)": { - "description": "min=0.439, mean=0.448, max=0.467, sum=1.344 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.44795321637426905 - }, - "NaturalQuestions (closed-book) - Representation (gender)": { - "description": "min=0.079, mean=0.129, max=0.167, sum=0.388 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. 
For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.1294903926482874 - }, - "NaturalQuestions (open-book) - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=2 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "NaturalQuestions (open-book) - Stereotypes (gender)": { - "description": "min=0.4, mean=0.407, max=0.42, sum=1.22 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.40666666666666673 - }, - "NaturalQuestions (open-book) - Representation (race)": { - "description": "min=0.474, mean=0.487, max=0.505, sum=1.46 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.48653132655730696 - }, - "NaturalQuestions (open-book) - Representation (gender)": { - "description": "min=0.375, mean=0.401, max=0.44, sum=1.202 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.40059748427672953 - }, - "NaturalQuestions (closed-book) - Toxic fraction": { - "description": "min=0.001, mean=0.001, max=0.001, sum=0.003 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Toxicity", - "score": 0.001 - }, - "NaturalQuestions (open-book) - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. 
See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "QuAC", - "source_data": { - "dataset_name": "QuAC", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on QuAC", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.445, - "details": { - "description": "min=0.435, mean=0.445, max=0.451, sum=1.335 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Accuracy", - "QuAC - ECE (10-bin)": { - "description": "min=0.234, mean=0.274, max=0.301, sum=0.821 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Calibration", - "score": 0.27378530130603257 - }, - "QuAC - F1 (Robustness)": { - "description": "min=0.313, mean=0.319, max=0.331, sum=0.958 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Robustness", - "score": 0.3193910892114107 - }, - "QuAC - F1 (Fairness)": { - "description": "min=0.339, mean=0.353, max=0.363, sum=1.06 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Fairness", - "score": 0.3532761321768228 - }, - "QuAC - Denoised inference time (s)": { - "description": "min=0.887, mean=0.891, max=0.894, sum=2.674 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Efficiency", - "score": 0.8912715646701383 - }, - "QuAC - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. 
See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 1000.0 - }, - "QuAC - # train": { - "description": "min=2.978, mean=3.438, max=3.878, sum=10.315 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 3.438333333333333 - }, - "QuAC - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.0 - }, - "QuAC - # prompt tokens": { - "description": "min=2819.048, mean=3249.907, max=3487.39, sum=9749.722 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 3249.907333333333 - }, - "QuAC - # output tokens": { - "description": "min=20.711, mean=20.986, max=21.534, sum=62.959 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 20.98633333333333 - }, - "QuAC - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 3.0 - }, - "QuAC - Stereotypes (race)": { - "description": "min=0.567, mean=0.579, max=0.6, sum=1.738 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.5793650793650794 - }, - "QuAC - Stereotypes (gender)": { - "description": "min=0.443, mean=0.453, max=0.461, sum=1.358 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. 
See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.4526990667248227 - }, - "QuAC - Representation (race)": { - "description": "min=0.256, mean=0.27, max=0.28, sum=0.81 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.2701590708612791 - }, - "QuAC - Representation (gender)": { - "description": "min=0.245, mean=0.255, max=0.265, sum=0.764 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.2545671124587146 - }, - "QuAC - Toxic fraction": { - "description": "min=0.002, mean=0.002, max=0.003, sum=0.007 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Toxicity", - "score": 0.0023333333333333335 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "HellaSwag", - "source_data": { - "dataset_name": "HellaSwag", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on HellaSwag", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.815, - "details": { - "description": "min=0.815, mean=0.815, max=0.815, sum=0.815 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Accuracy", - "HellaSwag - ECE (10-bin)": { - "description": "min=0.286, mean=0.286, max=0.286, sum=0.286 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Calibration", - "score": 0.2864163850455534 - }, - "HellaSwag - EM (Robustness)": { - "description": "min=0.776, mean=0.776, max=0.776, sum=0.776 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. 
See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Robustness", - "score": 0.776 - }, - "HellaSwag - EM (Fairness)": { - "description": "min=0.703, mean=0.703, max=0.703, sum=0.703 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Fairness", - "score": 0.703 - }, - "HellaSwag - Denoised inference time (s)": { - "description": "min=0.171, mean=0.171, max=0.171, sum=0.171 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Efficiency", - "score": 0.1710758125 - }, - "HellaSwag - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 1000.0 - }, - "HellaSwag - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag - # prompt tokens": { - "description": "min=87.888, mean=87.888, max=87.888, sum=87.888 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 87.888 - }, - "HellaSwag - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. 
See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.594, - "details": { - "description": "min=0.594, mean=0.594, max=0.594, sum=0.594 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Accuracy", - "OpenbookQA - ECE (10-bin)": { - "description": "min=0.238, mean=0.238, max=0.238, sum=0.238 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Calibration", - "score": 0.23789749910476482 - }, - "OpenbookQA - EM (Robustness)": { - "description": "min=0.52, mean=0.52, max=0.52, sum=0.52 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Robustness", - "score": 0.52 - }, - "OpenbookQA - EM (Fairness)": { - "description": "min=0.54, mean=0.54, max=0.54, sum=0.54 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Fairness", - "score": 0.54 - }, - "OpenbookQA - Denoised inference time (s)": { - "description": "min=0.158, mean=0.158, max=0.158, sum=0.158 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. 
See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Efficiency", - "score": 0.1578440234375 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=5.27, mean=5.27, max=5.27, sum=5.27 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 5.27 - }, - "OpenbookQA - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. 
See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "TruthfulQA", - "source_data": { - "dataset_name": "TruthfulQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on TruthfulQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.61, - "details": { - "description": "min=0.596, mean=0.61, max=0.63, sum=1.829 (3)", - "tab": "Accuracy", - "TruthfulQA - ECE (10-bin)": { - "description": "min=0.167, mean=0.199, max=0.232, sum=0.596 (3)", - "tab": "Calibration", - "score": 0.19868497875362334 - }, - "TruthfulQA - EM (Robustness)": { - "description": "min=0.517, mean=0.547, max=0.573, sum=1.641 (3)", - "tab": "Robustness", - "score": 0.5468909276248726 - }, - "TruthfulQA - EM (Fairness)": { - "description": "min=0.48, mean=0.515, max=0.547, sum=1.546 (3)", - "tab": "Fairness", - "score": 0.5152905198776758 - }, - "TruthfulQA - Denoised inference time (s)": { - "description": "min=0.186, mean=0.2, max=0.208, sum=0.601 (3)", - "tab": "Efficiency", - "score": 0.20048467762487246 - }, - "TruthfulQA - # eval": { - "description": "min=654, mean=654, max=654, sum=1962 (3)", - "tab": "General information", - "score": 654.0 - }, - "TruthfulQA - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "TruthfulQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "TruthfulQA - # prompt tokens": { - "description": "min=501.121, mean=511.121, max=529.121, sum=1533.362 (3)", - "tab": "General information", - "score": 511.12079510703364 - }, - "TruthfulQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=3 (3)", - "tab": "General information", - "score": 1.0 - }, - "TruthfulQA - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MS MARCO (TREC)", - "source_data": { - "dataset_name": "MS MARCO (TREC)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "NDCG@10 on MS MARCO (TREC)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.664, - "details": { - "description": "min=0.642, mean=0.664, max=0.685, sum=1.991 (3)", - "tab": "Accuracy", - "MS MARCO (regular) - RR@10 (Robustness)": { - "description": "min=0.327, mean=0.344, max=0.366, sum=1.031 (3)", - "tab": "Robustness", - "score": 0.3435873015873012 - }, - "MS MARCO (TREC) - NDCG@10 (Robustness)": { - "description": "min=0.615, mean=0.628, max=0.641, sum=1.884 (3)", - "tab": "Robustness", - "score": 0.627999061572698 - }, - "MS MARCO (regular) - RR@10 (Fairness)": { - "description": "min=0.357, mean=0.373, max=0.39, sum=1.12 (3)", - "tab": "Fairness", - "score": 0.3732579365079361 - }, - "MS MARCO (TREC) - NDCG@10 (Fairness)": { - "description": "min=0.614, mean=0.639, max=0.663, sum=1.917 (3)", - "tab": "Fairness", 
- "score": 0.6388640932298691 - }, - "MS MARCO (regular) - Denoised inference time (s)": { - "description": "min=0.174, mean=0.192, max=0.207, sum=0.577 (3)", - "tab": "Efficiency", - "score": 0.19244404882812502 - }, - "MS MARCO (TREC) - Denoised inference time (s)": { - "description": "min=0.173, mean=0.198, max=0.213, sum=0.594 (3)", - "tab": "Efficiency", - "score": 0.19810631661821707 - }, - "MS MARCO (regular) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "MS MARCO (regular) - # train": { - "description": "min=2, mean=2, max=2, sum=6 (3)", - "tab": "General information", - "score": 2.0 - }, - "MS MARCO (regular) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "MS MARCO (regular) - # prompt tokens": { - "description": "min=495.232, mean=532.565, max=577.232, sum=1597.696 (3)", - "tab": "General information", - "score": 532.5653333333333 - }, - "MS MARCO (regular) - # output tokens": { - "description": "min=1.006, mean=1.014, max=1.024, sum=3.042 (3)", - "tab": "General information", - "score": 1.014 - }, - "MS MARCO (regular) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "MS MARCO (TREC) - # eval": { - "description": "min=43, mean=43, max=43, sum=129 (3)", - "tab": "General information", - "score": 43.0 - }, - "MS MARCO (TREC) - # train": { - "description": "min=2, mean=2, max=2, sum=6 (3)", - "tab": "General information", - "score": 2.0 - }, - "MS MARCO (TREC) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "MS MARCO (TREC) - # prompt tokens": { - "description": "min=478.488, mean=515.822, max=560.488, sum=1547.465 (3)", - "tab": "General information", - "score": 515.8217054263565 - }, - "MS MARCO (TREC) - # output tokens": { - "description": "min=0.977, mean=0.992, max=1, sum=2.977 (3)", - "tab": "General information", - "score": 0.9922480620155039 - }, - "MS MARCO (TREC) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "MS MARCO (regular) - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": 
null - }, - "MS MARCO (TREC) - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CNN/DailyMail", - "source_data": { - "dataset_name": "CNN/DailyMail", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on CNN/DailyMail", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.153, - "details": { - "description": "min=0.148, mean=0.153, max=0.156, sum=1.074 (7)", - "tab": "Accuracy", - "CNN/DailyMail - Denoised inference time (s)": { - "description": "min=2.064, mean=2.236, max=2.638, sum=15.65 (7)", - "tab": "Efficiency", - "score": 2.235718461202547 - }, - "CNN/DailyMail - # eval": { - "description": "min=466, mean=466, max=466, sum=3262 (7)", - "tab": "General information", - "score": 466.0 - }, - "CNN/DailyMail - # train": { - "description": "min=0, mean=4.286, max=5, sum=30 (7)", - "tab": "General information", - "score": 4.285714285714286 - }, - "CNN/DailyMail - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "CNN/DailyMail - # prompt tokens": { - "description": "min=583.586, mean=1411.872, max=1567.586, sum=9883.101 (7)", - "tab": "General information", - "score": 1411.8715511955854 - }, - "CNN/DailyMail - # output tokens": { - "description": "min=64.197, mean=70.37, max=85.644, sum=492.592 (7)", - "tab": "General information", - "score": 70.37032495401594 - }, - "CNN/DailyMail - # trials": { - "description": "min=1, mean=2.714, max=3, sum=19 (7)", - "tab": "General information", - "score": 2.7142857142857144 - }, - "CNN/DailyMail - Stereotypes (race)": { - "description": "min=0.603, mean=0.625, max=0.667, sum=4.375 (7)", - "tab": "Bias", - "score": 0.6249837439576494 - }, - "CNN/DailyMail - Stereotypes (gender)": { - "description": "min=0.388, mean=0.408, max=0.42, sum=2.856 (7)", - "tab": "Bias", - "score": 0.4080224162158765 - }, - "CNN/DailyMail - Representation (race)": { - "description": "min=0.238, mean=0.293, max=0.347, sum=2.051 (7)", - "tab": "Bias", - "score": 0.293047968208597 - }, - "CNN/DailyMail - Representation (gender)": { - "description": "min=0.07, mean=0.107, max=0.138, sum=0.752 (7)", - "tab": "Bias", - "score": 0.1073937839039085 - }, - "CNN/DailyMail - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "Toxicity", - "score": 0.0 - }, - "CNN/DailyMail - SummaC": { - "description": "min=0.136, mean=0.353, max=0.455, sum=1.412 (4)", - "tab": "Summarization metrics", - "score": 0.35298687802144607 - }, - "CNN/DailyMail - QAFactEval": { - "description": "min=4.04, mean=4.635, max=4.834, sum=32.448 (7)", - "tab": "Summarization metrics", - "score": 4.635409033816104 - }, - "CNN/DailyMail - BERTScore (F1)": { - "description": "min=0.303, mean=0.321, max=0.333, sum=1.283 (4)", - "tab": "Summarization metrics", - "score": 0.3206946902747002 - }, - "CNN/DailyMail - Coverage": { - "description": "min=0.904, mean=0.946, max=0.957, sum=6.625 (7)", - "tab": "Summarization metrics", - "score": 0.9464923911138073 - }, - "CNN/DailyMail - Density": { - "description": "min=13.275, mean=15.995, max=17.016, sum=111.962 (7)", - "tab": "Summarization metrics", - "score": 
15.994591776988235 - }, - "CNN/DailyMail - Compression": { - "description": "min=7.152, mean=8.818, max=9.675, sum=61.729 (7)", - "tab": "Summarization metrics", - "score": 8.818392473408851 - }, - "CNN/DailyMail - HumanEval-faithfulness": { - "description": "min=0.993, mean=0.999, max=1, sum=6.993 (7)", - "tab": "Summarization metrics", - "score": 0.9990476190476191 - }, - "CNN/DailyMail - HumanEval-relevance": { - "description": "min=4.333, mean=4.435, max=4.6, sum=31.044 (7)", - "tab": "Summarization metrics", - "score": 4.434920634920635 - }, - "CNN/DailyMail - HumanEval-coherence": { - "description": "min=4, mean=4.371, max=5, sum=30.598 (7)", - "tab": "Summarization metrics", - "score": 4.3711111111111105 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "XSUM", - "source_data": { - "dataset_name": "XSUM", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on XSUM", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.144, - "details": { - "description": "min=0.087, mean=0.144, max=0.161, sum=1.006 (7)", - "tab": "Accuracy", - "XSUM - Denoised inference time (s)": { - "description": "min=1.003, mean=1.026, max=1.088, sum=7.181 (7)", - "tab": "Efficiency", - "score": 1.0257979815553757 - }, - "XSUM - # eval": { - "description": "min=518, mean=518, max=518, sum=3626 (7)", - "tab": "General information", - "score": 518.0 - }, - "XSUM - # train": { - "description": "min=0, mean=4.286, max=5, sum=30 (7)", - "tab": "General information", - "score": 4.285714285714286 - }, - "XSUM - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "XSUM - # prompt tokens": { - "description": "min=388.402, mean=1350.402, max=1539.402, sum=9452.811 (7)", - "tab": "General information", - "score": 1350.4015444015445 - }, - "XSUM - # output tokens": { - "description": "min=27.776, mean=28.674, max=31.952, sum=200.716 (7)", - "tab": "General information", - "score": 28.673745173745175 - }, - "XSUM - # trials": { - "description": "min=1, mean=2.714, max=3, sum=19 (7)", - "tab": "General information", - "score": 2.7142857142857144 - }, - "XSUM - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=4.667 (7)", - "tab": "Bias", - "score": 0.6666666666666667 - }, - "XSUM - Stereotypes (gender)": { - "description": "min=0.441, mean=0.457, max=0.48, sum=3.202 (7)", - "tab": "Bias", - "score": 0.45745150585486727 - }, - "XSUM - Representation (race)": { - "description": "min=0.376, mean=0.481, max=0.556, sum=3.37 (7)", - "tab": "Bias", - "score": 0.48149813295367977 - }, - "XSUM - Representation (gender)": { - "description": "min=0.19, mean=0.239, max=0.257, sum=1.672 (7)", - "tab": "Bias", - "score": 0.2388259605365298 - }, - "XSUM - Toxic fraction": { - "description": "min=0, mean=0.002, max=0.004, sum=0.012 (7)", - "tab": "Toxicity", - "score": 0.0016547159404302263 - }, - "XSUM - SummaC": { - "description": "min=-0.288, mean=-0.273, max=-0.257, sum=-1.091 (4)", - "tab": "Summarization metrics", - "score": -0.2728636190391109 - }, - "XSUM - QAFactEval": { - "description": "min=2.795, mean=3.007, max=3.207, sum=21.05 (7)", - "tab": "Summarization metrics", - "score": 3.0071326818732076 - }, - "XSUM - BERTScore (F1)": { - 
"description": "min=0.366, mean=0.43, max=0.459, sum=1.718 (4)", - "tab": "Summarization metrics", - "score": 0.4296202005928721 - }, - "XSUM - Coverage": { - "description": "min=0.789, mean=0.801, max=0.833, sum=5.604 (7)", - "tab": "Summarization metrics", - "score": 0.8005553389114972 - }, - "XSUM - Density": { - "description": "min=2.471, mean=2.872, max=4.654, sum=20.107 (7)", - "tab": "Summarization metrics", - "score": 2.8724523474356 - }, - "XSUM - Compression": { - "description": "min=13.554, mean=14.07, max=14.306, sum=98.488 (7)", - "tab": "Summarization metrics", - "score": 14.069713395015288 - }, - "XSUM - HumanEval-faithfulness": { - "description": "min=0.762, mean=0.849, max=0.963, sum=5.941 (7)", - "tab": "Summarization metrics", - "score": 0.848692365835223 - }, - "XSUM - HumanEval-relevance": { - "description": "min=4.277, mean=4.41, max=4.63, sum=30.869 (7)", - "tab": "Summarization metrics", - "score": 4.40989417989418 - }, - "XSUM - HumanEval-coherence": { - "description": "min=4.403, mean=4.685, max=4.815, sum=32.795 (7)", - "tab": "Summarization metrics", - "score": 4.684981103552532 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "IMDB", - "source_data": { - "dataset_name": "IMDB", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on IMDB", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.948, - "details": { - "description": "min=0.945, mean=0.948, max=0.953, sum=2.843 (3)", - "tab": "Accuracy", - "IMDB - ECE (10-bin)": { - "description": "min=0.029, mean=0.031, max=0.033, sum=0.092 (3)", - "tab": "Calibration", - "score": 0.03076843904734194 - }, - "IMDB - EM (Robustness)": { - "description": "min=0.898, mean=0.925, max=0.946, sum=2.776 (3)", - "tab": "Robustness", - "score": 0.9253333333333332 - }, - "IMDB - EM (Fairness)": { - "description": "min=0.919, mean=0.934, max=0.945, sum=2.803 (3)", - "tab": "Fairness", - "score": 0.9343333333333333 - }, - "IMDB - Denoised inference time (s)": { - "description": "min=0.218, mean=0.247, max=0.279, sum=0.741 (3)", - "tab": "Efficiency", - "score": 0.24716598621961808 - }, - "IMDB - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "IMDB - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "IMDB - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "IMDB - # prompt tokens": { - "description": "min=1282.797, mean=1897.464, max=2572.797, sum=5692.391 (3)", - "tab": "General information", - "score": 1897.4636666666665 - }, - "IMDB - # output tokens": { - "description": "min=0.999, mean=1.0, max=1, sum=2.999 (3)", - "tab": "General information", - "score": 0.9996666666666667 - }, - "IMDB - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "IMDB - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (race)": { - "description": "1 
matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CivilComments", - "source_data": { - "dataset_name": "CivilComments", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on CivilComments", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.668, - "details": { - "description": "min=0.4, mean=0.668, max=0.876, sum=36.093 (54)", - "tab": "Accuracy", - "CivilComments - ECE (10-bin)": { - "description": "min=0.018, mean=0.183, max=0.424, sum=9.875 (54)", - "tab": "Calibration", - "score": 0.18286487616515196 - }, - "CivilComments - EM (Robustness)": { - "description": "min=0.287, mean=0.567, max=0.838, sum=30.64 (54)", - "tab": "Robustness", - "score": 0.5673997819699065 - }, - "CivilComments - EM (Fairness)": { - "description": "min=0.082, mean=0.463, max=0.851, sum=24.991 (54)", - "tab": "Fairness", - "score": 0.46278978149694866 - }, - "CivilComments - Denoised inference time (s)": { - "description": "min=0.174, mean=0.186, max=0.217, sum=10.038 (54)", - "tab": "Efficiency", - "score": 0.18589157378997984 - }, - "CivilComments - # eval": { - "description": "min=74, mean=371.556, max=683, sum=20064 (54)", - "tab": "General information", - "score": 371.55555555555554 - }, - "CivilComments - # train": { - "description": "min=5, mean=5, max=5, sum=270 (54)", - "tab": "General information", - "score": 5.0 - }, - "CivilComments - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (54)", - "tab": "General information", - "score": 0.0 - }, - "CivilComments - # prompt tokens": { - "description": "min=356.537, mean=722.635, max=1267.519, sum=39022.317 (54)", - "tab": "General information", - "score": 722.6354931173206 - }, - "CivilComments - # output tokens": { - "description": "min=0.967, mean=0.997, max=1, sum=53.855 (54)", - "tab": "General information", - "score": 0.9973133394349212 - }, - "CivilComments - # trials": { - "description": "min=3, mean=3, max=3, sum=162 (54)", - "tab": "General information", - "score": 3.0 - }, - "CivilComments - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (54)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "RAFT", - "source_data": { - "dataset_name": "RAFT", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on RAFT", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.733, - "details": { - "description": "min=0.15, mean=0.733, max=0.975, sum=24.175 (33)", - "tab": "Accuracy", - "RAFT - ECE (10-bin)": { - "description": "min=0.043, mean=0.212, max=0.586, sum=6.999 (33)", - "tab": "Calibration", - "score": 0.21210473630230625 - }, - "RAFT - EM (Robustness)": { - "description": "min=0, mean=0.666, max=0.975, sum=21.975 (33)", - "tab": "Robustness", - "score": 0.665909090909091 - }, - "RAFT - EM (Fairness)": { - "description": "min=0.125, mean=0.671, max=0.975, sum=22.15 (33)", - "tab": "Fairness", - "score": 0.6712121212121211 - }, - "RAFT - Denoised inference time (s)": { - "description": "min=0.195, mean=0.276, max=0.351, sum=9.119 (33)", - "tab": "Efficiency", - "score": 0.27634172535905943 - }, - "RAFT - # eval": { - "description": "min=40, mean=40, max=40, sum=1320 (33)", - "tab": "General information", - "score": 40.0 - }, - "RAFT - # train": { - "description": "min=2.025, mean=4.752, max=5, sum=156.8 (33)", - "tab": "General information", - "score": 4.751515151515152 - }, - "RAFT - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (33)", - "tab": "General information", - "score": 0.0 - }, - "RAFT - # prompt tokens": { - "description": "min=257.35, mean=1033.465, max=3591.4, sum=34104.35 (33)", - "tab": "General information", - "score": 1033.4651515151515 - }, - "RAFT - # output tokens": { - "description": "min=0.875, mean=3.057, max=6.85, sum=100.875 (33)", - "tab": "General information", - "score": 3.0568181818181817 - }, - "RAFT - # trials": { - "description": "min=3, mean=3, max=3, sum=99 (33)", - "tab": "General information", - "score": 3.0 - }, - "RAFT - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (33)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_classic/openai/text-davinci-003/e411f017-22c6-4d49-9bf9-5d99c1091791.json b/data/helm_classic/openai/text-davinci-003/e411f017-22c6-4d49-9bf9-5d99c1091791.json deleted file mode 100644 index 9ca831c0f..000000000 --- a/data/helm_classic/openai/text-davinci-003/e411f017-22c6-4d49-9bf9-5d99c1091791.json +++ /dev/null @@ -1,1613 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_classic/openai_text-davinci-003/1770834891.1472661", - "retrieved_timestamp": "1770834891.1472661", - "source_metadata": { - "source_name": "helm_classic", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "text-davinci-003", - "id": "openai/text-davinci-003", - "developer": "openai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_classic", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperform on 
average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.872, - "details": { - "tab": "Accuracy", - "Mean win rate - Calibration": { - "description": null, - "tab": "Calibration", - "score": 0.4065137447036923 - }, - "Mean win rate - Robustness": { - "description": null, - "tab": "Robustness", - "score": 0.9095617026651509 - }, - "Mean win rate - Fairness": { - "description": null, - "tab": "Fairness", - "score": 0.9027696441489546 - }, - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": null - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - }, - "Mean win rate - Bias": { - "description": null, - "tab": "Bias", - "score": 0.4087317179294733 - }, - "Mean win rate - Toxicity": { - "description": null, - "tab": "Toxicity", - "score": 0.4974399057732391 - }, - "Mean win rate - Summarization metrics": { - "description": null, - "tab": "Summarization metrics", - "score": 0.5263157894736842 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.569, - "details": { - "description": "min=0.28, mean=0.569, max=0.86, sum=8.532 (15)", - "tab": "Accuracy", - "MMLU - ECE (10-bin)": { - "description": "min=0.127, mean=0.317, max=0.54, sum=4.761 (15)", - "tab": "Calibration", - "score": 0.31740378740673564 - }, - "MMLU - EM (Robustness)": { - "description": "min=0.19, mean=0.517, max=0.84, sum=7.752 (15)", - "tab": "Robustness", - "score": 0.5167953216374268 - }, - "MMLU - EM (Fairness)": { - "description": "min=0.24, mean=0.537, max=0.83, sum=8.054 (15)", - "tab": "Fairness", - "score": 0.5369590643274853 - }, - "MMLU - Denoised inference time (s)": { - "description": "5 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=1542 (15)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=75 (15)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (15)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=371.38, mean=472.274, max=624.07, sum=7084.111 (15)", - "tab": "General information", - "score": 472.2740350877193 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=15 (15)", - "tab": "General information", - "score": 1.0 - }, - "MMLU - # trials": { - "description": "min=3, mean=3, max=3, sum=45 (15)", - "tab": "General information", - "score": 3.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "BoolQ", - "source_data": { - "dataset_name": "BoolQ", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on BoolQ", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.881, - "details": { - "description": "min=0.879, mean=0.881, max=0.883, sum=2.644 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Accuracy", - "BoolQ - ECE (10-bin)": { - "description": "min=0.097, mean=0.098, max=0.099, sum=0.295 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Calibration", - "score": 0.09835218401604591 - }, - "BoolQ - EM (Robustness)": { - "description": "min=0.851, mean=0.858, max=0.864, sum=2.573 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Robustness", - "score": 0.8576666666666667 - }, - "BoolQ - EM (Fairness)": { - "description": "min=0.854, mean=0.858, max=0.861, sum=2.574 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Fairness", - "score": 0.858 - }, - "BoolQ - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "BoolQ - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 1000.0 - }, - "BoolQ - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 5.0 - }, - "BoolQ - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. 
For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.0 - }, - "BoolQ - # prompt tokens": { - "description": "min=660.073, mean=908.406, max=1242.073, sum=2725.219 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 908.4063333333334 - }, - "BoolQ - # output tokens": { - "description": "min=1.036, mean=1.043, max=1.058, sum=3.13 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 1.0433333333333332 - }, - "BoolQ - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 3.0 - }, - "BoolQ - Stereotypes (race)": { - "description": "(0)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": null - }, - "BoolQ - Stereotypes (gender)": { - "description": "(0)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (race)": { - "description": "(0)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (gender)": { - "description": "min=0.5, mean=0.5, max=0.5, sum=1 (2)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. 
For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.5 - }, - "BoolQ - Toxic fraction": { - "description": "min=0, mean=0.001, max=0.001, sum=0.002 (3)\nâš  Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Toxicity", - "score": 0.0006666666666666666 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.727, - "details": { - "description": "min=0.703, mean=0.727, max=0.747, sum=2.181 (3)", - "tab": "Accuracy", - "NarrativeQA - ECE (10-bin)": { - "description": "min=0.346, mean=0.37, max=0.389, sum=1.111 (3)", - "tab": "Calibration", - "score": 0.3702182824812234 - }, - "NarrativeQA - F1 (Robustness)": { - "description": "min=0.673, mean=0.694, max=0.713, sum=2.082 (3)", - "tab": "Robustness", - "score": 0.6939161040603179 - }, - "NarrativeQA - F1 (Fairness)": { - "description": "min=0.643, mean=0.664, max=0.682, sum=1.993 (3)", - "tab": "Fairness", - "score": 0.6644210581739292 - }, - "NarrativeQA - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=1065 (3)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=4.259, mean=4.532, max=4.955, sum=13.597 (3)", - "tab": "General information", - "score": 4.532394366197183 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=3479.563, mean=3579.093, max=3633.659, sum=10737.279 (3)", - "tab": "General information", - "score": 3579.092957746479 - }, - "NarrativeQA - # output tokens": { - "description": "min=8.231, mean=9.164, max=9.732, sum=27.493 (3)", - "tab": "General information", - "score": 9.16431924882629 - }, - "NarrativeQA - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NarrativeQA - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=0.667 (1)", - "tab": "Bias", - "score": 0.6666666666666667 - }, - "NarrativeQA - Stereotypes (gender)": { - "description": "min=0.424, mean=0.442, max=0.464, sum=1.327 (3)", - "tab": "Bias", - "score": 0.44232989232989234 - }, - "NarrativeQA - Representation (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=2 (3)", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "NarrativeQA - Representation (gender)": { - "description": "min=0.169, 
mean=0.177, max=0.187, sum=0.532 (3)", - "tab": "Bias", - "score": 0.17722658310007708 - }, - "NarrativeQA - Toxic fraction": { - "description": "min=0.011, mean=0.013, max=0.014, sum=0.039 (3)", - "tab": "Toxicity", - "score": 0.013145539906103287 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (open-book)", - "source_data": { - "dataset_name": "NaturalQuestions (open-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (open-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.77, - "details": { - "description": "min=0.768, mean=0.77, max=0.773, sum=2.311 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Accuracy", - "NaturalQuestions (closed-book) - ECE (10-bin)": { - "description": "min=0.27, mean=0.286, max=0.299, sum=0.857 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Calibration", - "score": 0.28562303267045125 - }, - "NaturalQuestions (open-book) - ECE (10-bin)": { - "description": "min=0.318, mean=0.323, max=0.331, sum=0.969 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Calibration", - "score": 0.3230345144505907 - }, - "NaturalQuestions (closed-book) - F1 (Robustness)": { - "description": "min=0.36, mean=0.369, max=0.376, sum=1.106 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Robustness", - "score": 0.36865975256659933 - }, - "NaturalQuestions (open-book) - F1 (Robustness)": { - "description": "min=0.729, mean=0.73, max=0.733, sum=2.191 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Robustness", - "score": 0.7304543451569532 - }, - "NaturalQuestions (closed-book) - F1 (Fairness)": { - "description": "min=0.349, mean=0.356, max=0.361, sum=1.069 (3)\n⚠ Brown et al.
perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Fairness", - "score": 0.3564629891973459 - }, - "NaturalQuestions (open-book) - F1 (Fairness)": { - "description": "min=0.719, mean=0.721, max=0.725, sum=2.164 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Fairness", - "score": 0.7213345530431851 - }, - "NaturalQuestions (closed-book) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NaturalQuestions (open-book) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=110.254, mean=112.254, max=116.254, sum=336.762 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 112.254 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=7.074, mean=7.964, max=8.442, sum=23.891 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives.
For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 7.963666666666666 - }, - "NaturalQuestions (closed-book) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 3.0 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.874, mean=4.883, max=4.891, sum=14.65 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 4.883333333333334 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.02, mean=0.02, max=0.02, sum=0.06 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.02 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1315.257, mean=1520.977, max=1629.945, sum=4562.931 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 1520.977 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=6.8, mean=6.937, max=7.011, sum=20.81 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets.
See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 6.9366666666666665 - }, - "NaturalQuestions (open-book) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 3.0 - }, - "NaturalQuestions (closed-book) - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=1.333 (2)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.6666666666666667 - }, - "NaturalQuestions (closed-book) - Stereotypes (gender)": { - "description": "min=0.452, mean=0.484, max=0.5, sum=1.452 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.4841269841269842 - }, - "NaturalQuestions (closed-book) - Representation (race)": { - "description": "min=0.292, mean=0.347, max=0.43, sum=1.042 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.34749417249417247 - }, - "NaturalQuestions (closed-book) - Representation (gender)": { - "description": "min=0.167, mean=0.27, max=0.367, sum=0.811 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.27037037037037037 - }, - "NaturalQuestions (open-book) - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=0.667 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.6666666666666667 - }, - "NaturalQuestions (open-book) - Stereotypes (gender)": { - "description": "min=0.5, mean=0.5, max=0.5, sum=1.5 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives.
For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.5 - }, - "NaturalQuestions (open-book) - Representation (race)": { - "description": "min=0.429, mean=0.443, max=0.454, sum=1.328 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.4428170082518513 - }, - "NaturalQuestions (open-book) - Representation (gender)": { - "description": "min=0.375, mean=0.407, max=0.423, sum=1.221 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.407051282051282 - }, - "NaturalQuestions (closed-book) - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Toxicity", - "score": 0.0 - }, - "NaturalQuestions (open-book) - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "QuAC", - "source_data": { - "dataset_name": "QuAC", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on QuAC", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.525, - "details": { - "description": "min=0.496, mean=0.525, max=0.54, sum=1.574 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Accuracy", - "QuAC - ECE (10-bin)": { - "description": "min=0.259, mean=0.27, max=0.279, sum=0.809 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives.
For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Calibration", - "score": 0.2696184343953211 - }, - "QuAC - F1 (Robustness)": { - "description": "min=0.401, mean=0.42, max=0.432, sum=1.26 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Robustness", - "score": 0.4199382541834728 - }, - "QuAC - F1 (Fairness)": { - "description": "min=0.427, mean=0.45, max=0.465, sum=1.351 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Fairness", - "score": 0.45040220156517236 - }, - "QuAC - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "QuAC - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 1000.0 - }, - "QuAC - # train": { - "description": "min=2.978, mean=3.438, max=3.878, sum=10.315 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 3.438333333333333 - }, - "QuAC - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.0 - }, - "QuAC - # prompt tokens": { - "description": "min=2819.048, mean=3249.907, max=3487.39, sum=9749.722 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets.
See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 3249.907333333333 - }, - "QuAC - # output tokens": { - "description": "min=25.946, mean=27.199, max=28.821, sum=81.596 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 27.198666666666668 - }, - "QuAC - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 3.0 - }, - "QuAC - Stereotypes (race)": { - "description": "min=0.556, mean=0.582, max=0.606, sum=1.745 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.5816498316498318 - }, - "QuAC - Stereotypes (gender)": { - "description": "min=0.427, mean=0.428, max=0.43, sum=1.285 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.4283515137656795 - }, - "QuAC - Representation (race)": { - "description": "min=0.321, mean=0.369, max=0.395, sum=1.106 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.368660072841299 - }, - "QuAC - Representation (gender)": { - "description": "min=0.244, mean=0.257, max=0.27, sum=0.772 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Bias", - "score": 0.2573013036656095 - }, - "QuAC - Toxic fraction": { - "description": "min=0.001, mean=0.001, max=0.001, sum=0.003 (3)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets.
See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Toxicity", - "score": 0.001 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "HellaSwag", - "source_data": { - "dataset_name": "HellaSwag", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on HellaSwag", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.822, - "details": { - "description": "min=0.822, mean=0.822, max=0.822, sum=0.822 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Accuracy", - "HellaSwag - ECE (10-bin)": { - "description": "min=0.278, mean=0.278, max=0.278, sum=0.278 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Calibration", - "score": 0.2781634038368795 - }, - "HellaSwag - EM (Robustness)": { - "description": "min=0.798, mean=0.798, max=0.798, sum=0.798 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Robustness", - "score": 0.798 - }, - "HellaSwag - EM (Fairness)": { - "description": "min=0.729, mean=0.729, max=0.729, sum=0.729 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Fairness", - "score": 0.729 - }, - "HellaSwag - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "HellaSwag - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 1000.0 - }, - "HellaSwag - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives.
For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag - # prompt tokens": { - "description": "min=87.888, mean=87.888, max=87.888, sum=87.888 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 87.888 - }, - "HellaSwag - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.0 - }, - "HellaSwag - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.646, - "details": { - "description": "min=0.646, mean=0.646, max=0.646, sum=0.646 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Accuracy", - "OpenbookQA - ECE (10-bin)": { - "description": "min=0.216, mean=0.216, max=0.216, sum=0.216 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives.
For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Calibration", - "score": 0.21592533141452896 - }, - "OpenbookQA - EM (Robustness)": { - "description": "min=0.572, mean=0.572, max=0.572, sum=0.572 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Robustness", - "score": 0.572 - }, - "OpenbookQA - EM (Fairness)": { - "description": "min=0.578, mean=0.578, max=0.578, sum=0.578 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "Fairness", - "score": 0.578 - }, - "OpenbookQA - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=0, mean=0, max=0, sum=0 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=5.27, mean=5.27, max=5.27, sum=5.27 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 5.27 - }, - "OpenbookQA - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)\n⚠ Brown et al.
perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)\n⚠ Brown et al. perform an analysis of the contamination for GPT-3 and its known derivatives. For these datasets, they find that 1% - 6% of the datasets' test instances are contaminated based on N-gram overlap, and model performance does not substantially change for these datasets. See Table C.1 on page 45 of https://arxiv.org/pdf/2005.14165.pdf.", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "TruthfulQA", - "source_data": { - "dataset_name": "TruthfulQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on TruthfulQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.593, - "details": { - "description": "min=0.558, mean=0.593, max=0.615, sum=1.78 (3)", - "tab": "Accuracy", - "TruthfulQA - ECE (10-bin)": { - "description": "min=0.329, mean=0.348, max=0.373, sum=1.043 (3)", - "tab": "Calibration", - "score": 0.3477434253470754 - }, - "TruthfulQA - EM (Robustness)": { - "description": "min=0.479, mean=0.516, max=0.54, sum=1.549 (3)", - "tab": "Robustness", - "score": 0.5163098878695208 - }, - "TruthfulQA - EM (Fairness)": { - "description": "min=0.448, mean=0.491, max=0.521, sum=1.474 (3)", - "tab": "Fairness", - "score": 0.491335372069317 - }, - "TruthfulQA - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "TruthfulQA - # eval": { - "description": "min=654, mean=654, max=654, sum=1962 (3)", - "tab": "General information", - "score": 654.0 - }, - "TruthfulQA - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "TruthfulQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "TruthfulQA - # prompt tokens": { - "description": "min=501.121, mean=511.121, max=529.121, sum=1533.362 (3)", - "tab": "General information", - "score": 511.12079510703364 - }, - "TruthfulQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=3 (3)", - "tab": "General information", - "score": 1.0 - }, - "TruthfulQA - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MS MARCO (TREC)", - "source_data": { - "dataset_name": "MS MARCO (TREC)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "NDCG@10 on MS MARCO (TREC)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.644, - "details": { -
"description": "min=0.611, mean=0.644, max=0.662, sum=1.931 (3)", - "tab": "Accuracy", - "MS MARCO (regular) - RR@10 (Robustness)": { - "description": "min=0.292, mean=0.304, max=0.319, sum=0.911 (3)", - "tab": "Robustness", - "score": 0.3037781746031745 - }, - "MS MARCO (TREC) - NDCG@10 (Robustness)": { - "description": "min=0.578, mean=0.616, max=0.645, sum=1.848 (3)", - "tab": "Robustness", - "score": 0.6160995919712035 - }, - "MS MARCO (regular) - RR@10 (Fairness)": { - "description": "min=0.322, mean=0.335, max=0.353, sum=1.005 (3)", - "tab": "Fairness", - "score": 0.33500119047619026 - }, - "MS MARCO (TREC) - NDCG@10 (Fairness)": { - "description": "min=0.603, mean=0.633, max=0.652, sum=1.898 (3)", - "tab": "Fairness", - "score": 0.6326849780192724 - }, - "MS MARCO (regular) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "MS MARCO (TREC) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "MS MARCO (regular) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "MS MARCO (regular) - # train": { - "description": "min=2, mean=2, max=2, sum=6 (3)", - "tab": "General information", - "score": 2.0 - }, - "MS MARCO (regular) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "MS MARCO (regular) - # prompt tokens": { - "description": "min=495.232, mean=532.565, max=577.232, sum=1597.696 (3)", - "tab": "General information", - "score": 532.5653333333333 - }, - "MS MARCO (regular) - # output tokens": { - "description": "min=1, mean=1, max=1, sum=3 (3)", - "tab": "General information", - "score": 1.0 - }, - "MS MARCO (regular) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "MS MARCO (TREC) - # eval": { - "description": "min=43, mean=43, max=43, sum=129 (3)", - "tab": "General information", - "score": 43.0 - }, - "MS MARCO (TREC) - # train": { - "description": "min=2, mean=2, max=2, sum=6 (3)", - "tab": "General information", - "score": 2.0 - }, - "MS MARCO (TREC) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "MS MARCO (TREC) - # prompt tokens": { - "description": "min=478.488, mean=515.822, max=560.488, sum=1547.465 (3)", - "tab": "General information", - "score": 515.8217054263565 - }, - "MS MARCO (TREC) - # output tokens": { - "description": "min=1, mean=1, max=1, sum=3 (3)", - "tab": "General information", - "score": 1.0 - }, - "MS MARCO (TREC) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "MS MARCO (regular) - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (race)": { - "description": "1 matching 
runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - }, - "MS MARCO (TREC) - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CNN/DailyMail", - "source_data": { - "dataset_name": "CNN/DailyMail", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on CNN/DailyMail", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.156, - "details": { - "description": "min=0.151, mean=0.156, max=0.16, sum=0.935 (6)", - "tab": "Accuracy", - "CNN/DailyMail - Denoised inference time (s)": { - "description": "2 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "CNN/DailyMail - # eval": { - "description": "min=466, mean=466, max=466, sum=2796 (6)", - "tab": "General information", - "score": 466.0 - }, - "CNN/DailyMail - # train": { - "description": "min=5, mean=5, max=5, sum=30 (6)", - "tab": "General information", - "score": 5.0 - }, - "CNN/DailyMail - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (6)", - "tab": "General information", - "score": 0.0 - }, - "CNN/DailyMail - # prompt tokens": { - "description": "min=1531.586, mean=1549.919, max=1567.586, sum=9299.515 (6)", - "tab": "General information", - "score": 1549.9191702432045 - }, - "CNN/DailyMail - # output tokens": { - "description": "min=60.524, mean=64.315, max=67.878, sum=385.888 (6)", - "tab": "General information", - "score": 64.31473533619457 - }, - "CNN/DailyMail - # trials": { - "description": "min=3, mean=3, max=3, sum=18 (6)", - "tab": "General information", - "score": 3.0 - }, - "CNN/DailyMail - Stereotypes (race)": { - "description": "min=0.643, mean=0.646, max=0.652, sum=3.879 (6)", - "tab": "Bias", - "score": 0.6464418252138059 - }, - "CNN/DailyMail - Stereotypes (gender)": { - "description": "min=0.404, mean=0.414, max=0.427, sum=2.482 (6)", - "tab": "Bias", - "score": 0.41359496216384023 - }, - "CNN/DailyMail - Representation (race)": { - "description": "min=0.245, mean=0.274, max=0.29, sum=1.641 (6)", - "tab": "Bias", - "score": 0.2735791651454302 - }, - "CNN/DailyMail - Representation (gender)": { - "description": "min=0.074, mean=0.083, max=0.099, sum=0.498 (6)", - "tab": "Bias", - "score": 0.08299026507382476 - }, - "CNN/DailyMail - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (6)", - "tab": "Toxicity", - "score": 0.0 - }, - "CNN/DailyMail - SummaC": { - "description": "min=0.33, mean=0.359, max=0.403, sum=1.077 (3)", - "tab": "Summarization metrics", - "score": 0.35893042891379157 - }, - "CNN/DailyMail - QAFactEval": { - "description": "2 matching runs, but no 
matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - BERTScore (F1)": { - "description": "min=0.336, mean=0.342, max=0.347, sum=1.026 (3)", - "tab": "Summarization metrics", - "score": 0.3420449797279243 - }, - "CNN/DailyMail - Coverage": { - "description": "min=0.953, mean=0.956, max=0.959, sum=5.734 (6)", - "tab": "Summarization metrics", - "score": 0.9556982855176755 - }, - "CNN/DailyMail - Density": { - "description": "min=7.169, mean=7.545, max=7.928, sum=45.269 (6)", - "tab": "Summarization metrics", - "score": 7.544859402012935 - }, - "CNN/DailyMail - Compression": { - "description": "min=8.736, mean=9.389, max=10.065, sum=56.334 (6)", - "tab": "Summarization metrics", - "score": 9.389062386727216 - }, - "CNN/DailyMail - HumanEval-faithfulness": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-relevance": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-coherence": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "XSUM", - "source_data": { - "dataset_name": "XSUM", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on XSUM", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.124, - "details": { - "description": "min=0.122, mean=0.124, max=0.126, sum=0.744 (6)", - "tab": "Accuracy", - "XSUM - Denoised inference time (s)": { - "description": "2 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "XSUM - # eval": { - "description": "min=518, mean=518, max=518, sum=3108 (6)", - "tab": "General information", - "score": 518.0 - }, - "XSUM - # train": { - "description": "min=5, mean=5, max=5, sum=30 (6)", - "tab": "General information", - "score": 5.0 - }, - "XSUM - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (6)", - "tab": "General information", - "score": 0.0 - }, - "XSUM - # prompt tokens": { - "description": "min=1456.402, mean=1510.735, max=1539.402, sum=9064.409 (6)", - "tab": "General information", - "score": 1510.734877734878 - }, - "XSUM - # output tokens": { - "description": "min=34.797, mean=35.293, max=36.073, sum=211.761 (6)", - "tab": "General information", - "score": 35.293436293436294 - }, - "XSUM - # trials": { - "description": "min=3, mean=3, max=3, sum=18 (6)", - "tab": "General information", - "score": 3.0 - }, - "XSUM - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=4.0 (6)", - "tab": "Bias", - "score": 0.6666666666666669 - }, - "XSUM - Stereotypes (gender)": { - "description": "min=0.413, mean=0.449, max=0.482, sum=2.694 (6)", - "tab": "Bias", - "score": 0.44896203413444785 - }, - "XSUM - Representation (race)": { - "description": "min=0.518, mean=0.534, max=0.545, sum=3.202 (6)", - "tab": "Bias", - "score": 0.533635827356637 - }, - "XSUM - Representation (gender)": { - "description": "min=0.234, mean=0.238, max=0.242, sum=1.427 (6)", - "tab": "Bias", - "score": 0.23788037651548422 - }, - "XSUM - Toxic fraction": { - "description": "min=0, mean=0.001, 
max=0.004, sum=0.008 (6)", - "tab": "Toxicity", - "score": 0.001287001287001287 - }, - "XSUM - SummaC": { - "description": "min=-0.313, mean=-0.301, max=-0.289, sum=-0.902 (3)", - "tab": "Summarization metrics", - "score": -0.3005772048135215 - }, - "XSUM - QAFactEval": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - BERTScore (F1)": { - "description": "min=0.406, mean=0.411, max=0.414, sum=1.233 (3)", - "tab": "Summarization metrics", - "score": 0.411029433026404 - }, - "XSUM - Coverage": { - "description": "min=0.814, mean=0.822, max=0.829, sum=4.933 (6)", - "tab": "Summarization metrics", - "score": 0.8221014569634312 - }, - "XSUM - Density": { - "description": "min=2.461, mean=2.63, max=2.752, sum=15.779 (6)", - "tab": "Summarization metrics", - "score": 2.6298820148802573 - }, - "XSUM - Compression": { - "description": "min=10.736, mean=10.932, max=11.034, sum=65.59 (6)", - "tab": "Summarization metrics", - "score": 10.931690583444237 - }, - "XSUM - HumanEval-faithfulness": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-relevance": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-coherence": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "IMDB", - "source_data": { - "dataset_name": "IMDB", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on IMDB", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.848, - "details": { - "description": "min=0.828, mean=0.848, max=0.881, sum=2.545 (3)", - "tab": "Accuracy", - "IMDB - ECE (10-bin)": { - "description": "min=0.086, mean=0.113, max=0.132, sum=0.339 (3)", - "tab": "Calibration", - "score": 0.11283562591578779 - }, - "IMDB - EM (Robustness)": { - "description": "min=0.749, mean=0.779, max=0.827, sum=2.338 (3)", - "tab": "Robustness", - "score": 0.7793333333333333 - }, - "IMDB - EM (Fairness)": { - "description": "min=0.814, mean=0.833, max=0.868, sum=2.498 (3)", - "tab": "Fairness", - "score": 0.8326666666666666 - }, - "IMDB - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "IMDB - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "IMDB - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "IMDB - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "IMDB - # prompt tokens": { - "description": "min=1282.797, mean=1897.464, max=2572.797, sum=5692.391 (3)", - "tab": "General information", - "score": 1897.4636666666665 - }, - "IMDB - # output tokens": { - "description": "min=1, mean=1, max=1, sum=3 (3)", - "tab": "General information", - "score": 1.0 - }, - "IMDB - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "IMDB 
- Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CivilComments", - "source_data": { - "dataset_name": "CivilComments", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on CivilComments", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.684, - "details": { - "description": "min=0.52, mean=0.684, max=0.863, sum=36.959 (54)", - "tab": "Accuracy", - "CivilComments - ECE (10-bin)": { - "description": "min=0.12, mean=0.292, max=0.449, sum=15.772 (54)", - "tab": "Calibration", - "score": 0.29207184855040197 - }, - "CivilComments - EM (Robustness)": { - "description": "min=0.366, mean=0.594, max=0.838, sum=32.08 (54)", - "tab": "Robustness", - "score": 0.5940672674614373 - }, - "CivilComments - EM (Fairness)": { - "description": "min=0.253, mean=0.559, max=0.863, sum=30.179 (54)", - "tab": "Fairness", - "score": 0.5588650073949972 - }, - "CivilComments - Denoised inference time (s)": { - "description": "9 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "CivilComments - # eval": { - "description": "min=74, mean=371.556, max=683, sum=20064 (54)", - "tab": "General information", - "score": 371.55555555555554 - }, - "CivilComments - # train": { - "description": "min=5, mean=5, max=5, sum=270 (54)", - "tab": "General information", - "score": 5.0 - }, - "CivilComments - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (54)", - "tab": "General information", - "score": 0.0 - }, - "CivilComments - # prompt tokens": { - "description": "min=356.537, mean=722.635, max=1267.519, sum=39022.317 (54)", - "tab": "General information", - "score": 722.6354931173206 - }, - "CivilComments - # output tokens": { - "description": "min=1, mean=1.0, max=1.007, sum=54.007 (54)", - "tab": "General information", - "score": 1.0001279344975371 - }, - "CivilComments - # trials": { - "description": "min=3, mean=3, max=3, sum=162 (54)", - "tab": "General information", - "score": 3.0 - }, - "CivilComments - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (54)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "RAFT", - 
"source_data": { - "dataset_name": "RAFT", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on RAFT", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.759, - "details": { - "description": "min=0.075, mean=0.759, max=0.95, sum=25.05 (33)", - "tab": "Accuracy", - "RAFT - ECE (10-bin)": { - "description": "min=0.037, mean=0.203, max=0.736, sum=6.696 (33)", - "tab": "Calibration", - "score": 0.2029109351449743 - }, - "RAFT - EM (Robustness)": { - "description": "min=0.05, mean=0.714, max=0.95, sum=23.55 (33)", - "tab": "Robustness", - "score": 0.7136363636363635 - }, - "RAFT - EM (Fairness)": { - "description": "min=0.05, mean=0.705, max=0.95, sum=23.275 (33)", - "tab": "Fairness", - "score": 0.7053030303030302 - }, - "RAFT - Denoised inference time (s)": { - "description": "11 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "RAFT - # eval": { - "description": "min=40, mean=40, max=40, sum=1320 (33)", - "tab": "General information", - "score": 40.0 - }, - "RAFT - # train": { - "description": "min=2.025, mean=4.752, max=5, sum=156.8 (33)", - "tab": "General information", - "score": 4.751515151515152 - }, - "RAFT - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (33)", - "tab": "General information", - "score": 0.0 - }, - "RAFT - # prompt tokens": { - "description": "min=257.35, mean=1033.465, max=3591.4, sum=34104.35 (33)", - "tab": "General information", - "score": 1033.4651515151515 - }, - "RAFT - # output tokens": { - "description": "min=1, mean=3.137, max=6.7, sum=103.525 (33)", - "tab": "General information", - "score": 3.1371212121212113 - }, - "RAFT - # trials": { - "description": "min=3, mean=3, max=3, sum=99 (33)", - "tab": "General information", - "score": 3.0 - }, - "RAFT - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (33)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_classic/stanford/Alpaca-7B/7bd2b266-5a65-4c63-bf18-5e4114564bfc.json b/data/helm_classic/stanford/Alpaca-7B/7bd2b266-5a65-4c63-bf18-5e4114564bfc.json deleted file mode 100644 index cf2a4b297..000000000 --- a/data/helm_classic/stanford/Alpaca-7B/7bd2b266-5a65-4c63-bf18-5e4114564bfc.json +++ /dev/null @@ -1,1613 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_classic/stanford_Alpaca-7B/1770834891.1472661", - "retrieved_timestamp": "1770834891.1472661", - "source_metadata": { - "source_name": "helm_classic", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Alpaca 7B", - "id": "stanford/Alpaca-7B", - "developer": "stanford", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_classic", - 
"source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperform on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.381, - "details": { - "tab": "Accuracy", - "Mean win rate - Calibration": { - "description": null, - "tab": "Calibration", - "score": 0.3335337650323774 - }, - "Mean win rate - Robustness": { - "description": null, - "tab": "Robustness", - "score": 0.37923076923076926 - }, - "Mean win rate - Fairness": { - "description": null, - "tab": "Fairness", - "score": 0.3719114219114219 - }, - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": null - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - }, - "Mean win rate - Bias": { - "description": null, - "tab": "Bias", - "score": 0.4865162612605669 - }, - "Mean win rate - Toxicity": { - "description": null, - "tab": "Toxicity", - "score": 0.6546037296037296 - }, - "Mean win rate - Summarization metrics": { - "description": null, - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.385, - "details": { - "description": "min=0.263, mean=0.385, max=0.6, sum=1.923 (5)", - "tab": "Accuracy", - "MMLU - ECE (10-bin)": { - "description": "min=0.151, mean=0.234, max=0.32, sum=1.171 (5)", - "tab": "Calibration", - "score": 0.23428857555005617 - }, - "MMLU - EM (Robustness)": { - "description": "min=0.18, mean=0.324, max=0.52, sum=1.621 (5)", - "tab": "Robustness", - "score": 0.32410526315789473 - }, - "MMLU - EM (Fairness)": { - "description": "min=0.219, mean=0.346, max=0.53, sum=1.729 (5)", - "tab": "Fairness", - "score": 0.34585964912280703 - }, - "MMLU - Denoised inference time (s)": { - "description": "5 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=397.65, mean=522.547, max=684.675, sum=2612.735 (5)", - "tab": "General information", - "score": 522.5470877192982 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "MMLU - # trials": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "BoolQ", - "source_data": { - "dataset_name": "BoolQ", - 
"source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on BoolQ", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.778, - "details": { - "description": "min=0.778, mean=0.778, max=0.778, sum=0.778 (1)", - "tab": "Accuracy", - "BoolQ - ECE (10-bin)": { - "description": "min=0.343, mean=0.343, max=0.343, sum=0.343 (1)", - "tab": "Calibration", - "score": 0.3432802705941571 - }, - "BoolQ - EM (Robustness)": { - "description": "min=0.643, mean=0.643, max=0.643, sum=0.643 (1)", - "tab": "Robustness", - "score": 0.643 - }, - "BoolQ - EM (Fairness)": { - "description": "min=0.729, mean=0.729, max=0.729, sum=0.729 (1)", - "tab": "Fairness", - "score": 0.729 - }, - "BoolQ - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "BoolQ - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "BoolQ - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "BoolQ - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "BoolQ - # prompt tokens": { - "description": "min=1439.447, mean=1439.447, max=1439.447, sum=1439.447 (1)", - "tab": "General information", - "score": 1439.447 - }, - "BoolQ - # output tokens": { - "description": "min=4.883, mean=4.883, max=4.883, sum=4.883 (1)", - "tab": "General information", - "score": 4.883 - }, - "BoolQ - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "BoolQ - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (gender)": { - "description": "min=0.5, mean=0.5, max=0.5, sum=0.5 (1)", - "tab": "Bias", - "score": 0.5 - }, - "BoolQ - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.396, - "details": { - "description": "min=0.396, mean=0.396, max=0.396, sum=0.396 (1)", - "tab": "Accuracy", - "NarrativeQA - ECE (10-bin)": { - "description": "min=0.046, mean=0.046, max=0.046, sum=0.046 (1)", - "tab": "Calibration", - "score": 0.045878175333070315 - }, - "NarrativeQA - F1 (Robustness)": { - "description": "min=0.246, mean=0.246, max=0.246, sum=0.246 (1)", - "tab": "Robustness", - "score": 0.24590950452109447 - }, - "NarrativeQA - F1 (Fairness)": { - "description": "min=0.299, mean=0.299, max=0.299, sum=0.299 (1)", - "tab": "Fairness", - "score": 
0.2987402817318288 - }, - "NarrativeQA - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=1.437, mean=1.437, max=1.437, sum=1.437 (1)", - "tab": "General information", - "score": 1.4366197183098592 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=1541.115, mean=1541.115, max=1541.115, sum=1541.115 (1)", - "tab": "General information", - "score": 1541.1154929577465 - }, - "NarrativeQA - # output tokens": { - "description": "min=26.006, mean=26.006, max=26.006, sum=26.006 (1)", - "tab": "General information", - "score": 26.005633802816902 - }, - "NarrativeQA - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NarrativeQA - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=0.667 (1)", - "tab": "Bias", - "score": 0.6666666666666667 - }, - "NarrativeQA - Stereotypes (gender)": { - "description": "min=0.41, mean=0.41, max=0.41, sum=0.41 (1)", - "tab": "Bias", - "score": 0.41025641025641024 - }, - "NarrativeQA - Representation (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=0.667 (1)", - "tab": "Bias", - "score": 0.6666666666666667 - }, - "NarrativeQA - Representation (gender)": { - "description": "min=0.196, mean=0.196, max=0.196, sum=0.196 (1)", - "tab": "Bias", - "score": 0.19627507163323785 - }, - "NarrativeQA - Toxic fraction": { - "description": "min=0.006, mean=0.006, max=0.006, sum=0.006 (1)", - "tab": "Toxicity", - "score": 0.005633802816901409 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (open-book)", - "source_data": { - "dataset_name": "NaturalQuestions (open-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (open-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.592, - "details": { - "description": "min=0.592, mean=0.592, max=0.592, sum=0.592 (1)", - "tab": "Accuracy", - "NaturalQuestions (closed-book) - ECE (10-bin)": { - "description": "min=0.134, mean=0.134, max=0.134, sum=0.134 (1)", - "tab": "Calibration", - "score": 0.13434354583448904 - }, - "NaturalQuestions (open-book) - ECE (10-bin)": { - "description": "min=0.238, mean=0.238, max=0.238, sum=0.238 (1)", - "tab": "Calibration", - "score": 0.23769723451909555 - }, - "NaturalQuestions (closed-book) - F1 (Robustness)": { - "description": "min=0.203, mean=0.203, max=0.203, sum=0.203 (1)", - "tab": "Robustness", - "score": 0.20255716308011695 - }, - "NaturalQuestions (open-book) - F1 (Robustness)": { - "description": "min=0.491, mean=0.491, max=0.491, sum=0.491 (1)", - "tab": "Robustness", - "score": 0.4912677371744195 - }, - "NaturalQuestions (closed-book) - F1 (Fairness)": { - "description": "min=0.21, mean=0.21, max=0.21, sum=0.21 (1)", - "tab": "Fairness", - "score": 0.20966482260352876 - }, - "NaturalQuestions (open-book) - F1 (Fairness)": 
{ - "description": "min=0.53, mean=0.53, max=0.53, sum=0.53 (1)", - "tab": "Fairness", - "score": 0.5302078541276196 - }, - "NaturalQuestions (closed-book) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NaturalQuestions (open-book) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=137.383, mean=137.383, max=137.383, sum=137.383 (1)", - "tab": "General information", - "score": 137.383 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=84.53, mean=84.53, max=84.53, sum=84.53 (1)", - "tab": "General information", - "score": 84.53 - }, - "NaturalQuestions (closed-book) - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=3.722, mean=3.722, max=3.722, sum=3.722 (1)", - "tab": "General information", - "score": 3.722 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.049, mean=0.049, max=0.049, sum=0.049 (1)", - "tab": "General information", - "score": 0.049 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1407.178, mean=1407.178, max=1407.178, sum=1407.178 (1)", - "tab": "General information", - "score": 1407.178 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=122.525, mean=122.525, max=122.525, sum=122.525 (1)", - "tab": "General information", - "score": 122.525 - }, - "NaturalQuestions (open-book) - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NaturalQuestions (closed-book) - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=0.667 (1)", - "tab": "Bias", - "score": 0.6666666666666667 - }, - "NaturalQuestions (closed-book) - Stereotypes (gender)": { - "description": "min=0.456, mean=0.456, max=0.456, sum=0.456 (1)", - "tab": "Bias", - "score": 0.45588235294117646 - }, - "NaturalQuestions (closed-book) - Representation (race)": { - "description": "min=0.412, mean=0.412, max=0.412, sum=0.412 (1)", - "tab": "Bias", - "score": 0.4117647058823529 - }, - "NaturalQuestions (closed-book) - Representation (gender)": { - "description": "min=0.054, mean=0.054, max=0.054, sum=0.054 (1)", - "tab": "Bias", - "score": 0.053571428571428575 - }, - "NaturalQuestions (open-book) - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=0.667 (1)", - "tab": "Bias", - "score": 0.6666666666666667 - }, - "NaturalQuestions (open-book) - Stereotypes (gender)": { - "description": "min=0.419, mean=0.419, max=0.419, sum=0.419 (1)", - "tab": "Bias", - "score": 
0.4185185185185185 - }, - "NaturalQuestions (open-book) - Representation (race)": { - "description": "min=0.454, mean=0.454, max=0.454, sum=0.454 (1)", - "tab": "Bias", - "score": 0.4540682414698163 - }, - "NaturalQuestions (open-book) - Representation (gender)": { - "description": "min=0.315, mean=0.315, max=0.315, sum=0.315 (1)", - "tab": "Bias", - "score": 0.31481481481481477 - }, - "NaturalQuestions (closed-book) - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "Toxicity", - "score": 0.0 - }, - "NaturalQuestions (open-book) - Toxic fraction": { - "description": "min=0.001, mean=0.001, max=0.001, sum=0.001 (1)", - "tab": "Toxicity", - "score": 0.001 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "QuAC", - "source_data": { - "dataset_name": "QuAC", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on QuAC", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.27, - "details": { - "description": "min=0.27, mean=0.27, max=0.27, sum=0.27 (1)", - "tab": "Accuracy", - "QuAC - ECE (10-bin)": { - "description": "min=0.04, mean=0.04, max=0.04, sum=0.04 (1)", - "tab": "Calibration", - "score": 0.04026034301598206 - }, - "QuAC - F1 (Robustness)": { - "description": "min=0.16, mean=0.16, max=0.16, sum=0.16 (1)", - "tab": "Robustness", - "score": 0.1604861950978603 - }, - "QuAC - F1 (Fairness)": { - "description": "min=0.204, mean=0.204, max=0.204, sum=0.204 (1)", - "tab": "Fairness", - "score": 0.20395081036123316 - }, - "QuAC - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "QuAC - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "QuAC - # train": { - "description": "min=0.507, mean=0.507, max=0.507, sum=0.507 (1)", - "tab": "General information", - "score": 0.507 - }, - "QuAC - truncated": { - "description": "min=0.06, mean=0.06, max=0.06, sum=0.06 (1)", - "tab": "General information", - "score": 0.06 - }, - "QuAC - # prompt tokens": { - "description": "min=1498.657, mean=1498.657, max=1498.657, sum=1498.657 (1)", - "tab": "General information", - "score": 1498.657 - }, - "QuAC - # output tokens": { - "description": "min=77.323, mean=77.323, max=77.323, sum=77.323 (1)", - "tab": "General information", - "score": 77.323 - }, - "QuAC - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "QuAC - Stereotypes (race)": { - "description": "min=0.636, mean=0.636, max=0.636, sum=0.636 (1)", - "tab": "Bias", - "score": 0.6363636363636365 - }, - "QuAC - Stereotypes (gender)": { - "description": "min=0.435, mean=0.435, max=0.435, sum=0.435 (1)", - "tab": "Bias", - "score": 0.4349771051252814 - }, - "QuAC - Representation (race)": { - "description": "min=0.236, mean=0.236, max=0.236, sum=0.236 (1)", - "tab": "Bias", - "score": 0.23589743589743586 - }, - "QuAC - Representation (gender)": { - "description": "min=0.281, mean=0.281, max=0.281, sum=0.281 (1)", - "tab": "Bias", - "score": 0.2813953488372093 - }, - "QuAC - Toxic fraction": { - "description": "min=0.002, mean=0.002, max=0.002, sum=0.002 (1)", - "tab": "Toxicity", - "score": 0.002 - } - } - }, 
- "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "HellaSwag", - "source_data": { - "dataset_name": "HellaSwag", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on HellaSwag", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "HellaSwag - ECE (10-bin)": { - "description": "No matching runs", - "tab": "Calibration", - "score": null - }, - "HellaSwag - EM (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "HellaSwag - EM (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "HellaSwag - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "HellaSwag - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "OpenbookQA - ECE (10-bin)": { - "description": "No matching runs", - "tab": "Calibration", - "score": null - }, - "OpenbookQA - EM (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "OpenbookQA - EM (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "OpenbookQA - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "OpenbookQA - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # trials": { - "description": "No matching runs", - "tab": "General information", - 
"score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "TruthfulQA", - "source_data": { - "dataset_name": "TruthfulQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on TruthfulQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.243, - "details": { - "description": "min=0.243, mean=0.243, max=0.243, sum=0.243 (1)", - "tab": "Accuracy", - "TruthfulQA - ECE (10-bin)": { - "description": "min=0.375, mean=0.375, max=0.375, sum=0.375 (1)", - "tab": "Calibration", - "score": 0.3750196178145884 - }, - "TruthfulQA - EM (Robustness)": { - "description": "min=0.199, mean=0.199, max=0.199, sum=0.199 (1)", - "tab": "Robustness", - "score": 0.19877675840978593 - }, - "TruthfulQA - EM (Fairness)": { - "description": "min=0.202, mean=0.202, max=0.202, sum=0.202 (1)", - "tab": "Fairness", - "score": 0.2018348623853211 - }, - "TruthfulQA - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "TruthfulQA - # eval": { - "description": "min=654, mean=654, max=654, sum=654 (1)", - "tab": "General information", - "score": 654.0 - }, - "TruthfulQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "TruthfulQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "TruthfulQA - # prompt tokens": { - "description": "min=524.602, mean=524.602, max=524.602, sum=524.602 (1)", - "tab": "General information", - "score": 524.6024464831804 - }, - "TruthfulQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "TruthfulQA - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MS MARCO (TREC)", - "source_data": { - "dataset_name": "MS MARCO (TREC)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "NDCG@10 on MS MARCO (TREC)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "MS MARCO (regular) - RR@10 (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "MS MARCO (TREC) - NDCG@10 (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "MS MARCO (regular) - RR@10 (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "MS MARCO (TREC) - NDCG@10 (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "MS MARCO (regular) - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "MS MARCO (TREC) - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "MS MARCO (regular) - 
# eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - }, - "MS MARCO (TREC) - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CNN/DailyMail", - "source_data": { - "dataset_name": "CNN/DailyMail", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on CNN/DailyMail", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "CNN/DailyMail - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "CNN/DailyMail - # eval": { - "description": "No matching runs", - "tab": 
"General information", - "score": null - }, - "CNN/DailyMail - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - }, - "CNN/DailyMail - SummaC": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - QAFactEval": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - BERTScore (F1)": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - Coverage": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - Density": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - Compression": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-faithfulness": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-relevance": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-coherence": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "XSUM", - "source_data": { - "dataset_name": "XSUM", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on XSUM", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "XSUM - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "XSUM - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # prompt tokens": { - "description": "No matching runs", - "tab": "General 
information", - "score": null - }, - "XSUM - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - }, - "XSUM - SummaC": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - QAFactEval": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - BERTScore (F1)": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - Coverage": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - Density": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - Compression": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-faithfulness": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-relevance": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-coherence": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "IMDB", - "source_data": { - "dataset_name": "IMDB", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on IMDB", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.738, - "details": { - "description": "min=0.738, mean=0.738, max=0.738, sum=0.738 (1)", - "tab": "Accuracy", - "IMDB - ECE (10-bin)": { - "description": "min=0.281, mean=0.281, max=0.281, sum=0.281 (1)", - "tab": "Calibration", - "score": 0.28073357253102127 - }, - "IMDB - EM (Robustness)": { - "description": "min=0.561, mean=0.561, max=0.561, sum=0.561 (1)", - "tab": "Robustness", - "score": 0.561 - }, - "IMDB - EM (Fairness)": { - "description": "min=0.699, mean=0.699, max=0.699, sum=0.699 (1)", - "tab": "Fairness", - "score": 0.699 - }, - "IMDB - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "IMDB - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "IMDB - # train": { - "description": "min=2.781, mean=2.781, max=2.781, sum=2.781 (1)", - "tab": "General information", - "score": 2.781 - }, - "IMDB - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IMDB - # prompt tokens": { - "description": 
"min=1751.213, mean=1751.213, max=1751.213, sum=1751.213 (1)", - "tab": "General information", - "score": 1751.213 - }, - "IMDB - # output tokens": { - "description": "min=4.966, mean=4.966, max=4.966, sum=4.966 (1)", - "tab": "General information", - "score": 4.966 - }, - "IMDB - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "IMDB - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CivilComments", - "source_data": { - "dataset_name": "CivilComments", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on CivilComments", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.566, - "details": { - "description": "min=0.158, mean=0.566, max=0.939, sum=10.184 (18)", - "tab": "Accuracy", - "CivilComments - ECE (10-bin)": { - "description": "min=0.162, mean=0.352, max=0.606, sum=6.328 (18)", - "tab": "Calibration", - "score": 0.3515610942498128 - }, - "CivilComments - EM (Robustness)": { - "description": "min=0.133, mean=0.482, max=0.844, sum=8.674 (18)", - "tab": "Robustness", - "score": 0.4818807145268457 - }, - "CivilComments - EM (Fairness)": { - "description": "min=0.122, mean=0.483, max=0.818, sum=8.691 (18)", - "tab": "Fairness", - "score": 0.4828512879651531 - }, - "CivilComments - Denoised inference time (s)": { - "description": "9 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "CivilComments - # eval": { - "description": "min=74, mean=371.556, max=683, sum=6688 (18)", - "tab": "General information", - "score": 371.55555555555554 - }, - "CivilComments - # train": { - "description": "min=5, mean=5, max=5, sum=90 (18)", - "tab": "General information", - "score": 5.0 - }, - "CivilComments - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (18)", - "tab": "General information", - "score": 0.0 - }, - "CivilComments - # prompt tokens": { - "description": "min=404.732, mean=855.241, max=1417.567, sum=15394.339 (18)", - "tab": "General information", - "score": 855.2410378605821 - }, - "CivilComments - # output tokens": { - "description": "min=2.746, mean=4.216, max=4.89, sum=75.887 (18)", - "tab": "General information", - "score": 4.2159316386124255 - }, - "CivilComments - # trials": { - "description": "min=1, mean=1, max=1, sum=18 (18)", - "tab": "General information", - "score": 1.0 - }, - "CivilComments - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (race)": { - 
"description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Toxic fraction": { - "description": "9 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "RAFT", - "source_data": { - "dataset_name": "RAFT", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on RAFT", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.486, - "details": { - "description": "min=0, mean=0.486, max=0.9, sum=5.35 (11)", - "tab": "Accuracy", - "RAFT - ECE (10-bin)": { - "description": "min=0.004, mean=0.33, max=0.711, sum=3.626 (11)", - "tab": "Calibration", - "score": 0.3296795633615674 - }, - "RAFT - EM (Robustness)": { - "description": "min=0, mean=0.42, max=0.875, sum=4.625 (11)", - "tab": "Robustness", - "score": 0.42045454545454536 - }, - "RAFT - EM (Fairness)": { - "description": "min=0, mean=0.459, max=0.9, sum=5.05 (11)", - "tab": "Fairness", - "score": 0.45909090909090916 - }, - "RAFT - Denoised inference time (s)": { - "description": "11 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "RAFT - # eval": { - "description": "min=40, mean=40, max=40, sum=440 (11)", - "tab": "General information", - "score": 40.0 - }, - "RAFT - # train": { - "description": "min=0.45, mean=4.552, max=5, sum=50.075 (11)", - "tab": "General information", - "score": 4.552272727272727 - }, - "RAFT - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (11)", - "tab": "General information", - "score": 0.0 - }, - "RAFT - # prompt tokens": { - "description": "min=303.675, mean=954.111, max=1882.1, sum=10495.225 (11)", - "tab": "General information", - "score": 954.1113636363635 - }, - "RAFT - # output tokens": { - "description": "min=3.7, mean=19.468, max=30, sum=214.15 (11)", - "tab": "General information", - "score": 19.468181818181815 - }, - "RAFT - # trials": { - "description": "min=1, mean=1, max=1, sum=11 (11)", - "tab": "General information", - "score": 1.0 - }, - "RAFT - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (gender)": { - "description": "min=0.5, mean=0.5, max=0.5, sum=0.5 (1)", - "tab": "Bias", - "score": 0.5 - }, - "RAFT - Toxic fraction": { - "description": "11 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_classic/tiiuae/Falcon-40B/49a1423e-d5f4-4665-b81e-d491f492a316.json b/data/helm_classic/tiiuae/Falcon-40B/49a1423e-d5f4-4665-b81e-d491f492a316.json deleted file mode 100644 index 97f13c6d9..000000000 --- a/data/helm_classic/tiiuae/Falcon-40B/49a1423e-d5f4-4665-b81e-d491f492a316.json +++ /dev/null @@ -1,1613 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_classic/tiiuae_Falcon-40B/1770834891.1472661", - "retrieved_timestamp": "1770834891.1472661", - 
"source_metadata": { - "source_name": "helm_classic", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Falcon 40B", - "id": "tiiuae/Falcon-40B", - "developer": "tiiuae", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_classic", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperform on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.729, - "details": { - "tab": "Accuracy", - "Mean win rate - Calibration": { - "description": null, - "tab": "Calibration", - "score": null - }, - "Mean win rate - Robustness": { - "description": null, - "tab": "Robustness", - "score": 0.7051048951048952 - }, - "Mean win rate - Fairness": { - "description": null, - "tab": "Fairness", - "score": 0.6857342657342658 - }, - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": null - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - }, - "Mean win rate - Bias": { - "description": null, - "tab": "Bias", - "score": 0.48586479674272687 - }, - "Mean win rate - Toxicity": { - "description": null, - "tab": "Toxicity", - "score": 0.4706876456876457 - }, - "Mean win rate - Summarization metrics": { - "description": null, - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.509, - "details": { - "description": "min=0.32, mean=0.509, max=0.79, sum=2.545 (5)", - "tab": "Accuracy", - "MMLU - ECE (10-bin)": { - "description": "5 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "MMLU - EM (Robustness)": { - "description": "min=0.26, mean=0.457, max=0.76, sum=2.283 (5)", - "tab": "Robustness", - "score": 0.4566315789473684 - }, - "MMLU - EM (Fairness)": { - "description": "min=0.272, mean=0.48, max=0.78, sum=2.402 (5)", - "tab": "Fairness", - "score": 0.4803859649122807 - }, - "MMLU - Denoised inference time (s)": { - "description": "5 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=389.6, mean=500.12, max=664.281, sum=2500.601 (5)", - "tab": "General information", - "score": 500.12014035087725 - }, - "MMLU - # output tokens": { - 
"description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "MMLU - # trials": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "BoolQ", - "source_data": { - "dataset_name": "BoolQ", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on BoolQ", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.819, - "details": { - "description": "min=0.819, mean=0.819, max=0.819, sum=0.819 (1)", - "tab": "Accuracy", - "BoolQ - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "BoolQ - EM (Robustness)": { - "description": "min=0.763, mean=0.763, max=0.763, sum=0.763 (1)", - "tab": "Robustness", - "score": 0.763 - }, - "BoolQ - EM (Fairness)": { - "description": "min=0.783, mean=0.783, max=0.783, sum=0.783 (1)", - "tab": "Fairness", - "score": 0.783 - }, - "BoolQ - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "BoolQ - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "BoolQ - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "BoolQ - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "BoolQ - # prompt tokens": { - "description": "min=1284.629, mean=1284.629, max=1284.629, sum=1284.629 (1)", - "tab": "General information", - "score": 1284.629 - }, - "BoolQ - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "BoolQ - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "BoolQ - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.673, - "details": { - "description": "min=0.673, mean=0.673, max=0.673, sum=0.673 (1)", - "tab": "Accuracy", - "NarrativeQA - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "NarrativeQA - 
F1 (Robustness)": { - "description": "min=0.557, mean=0.557, max=0.557, sum=0.557 (1)", - "tab": "Robustness", - "score": 0.5574684493620005 - }, - "NarrativeQA - F1 (Fairness)": { - "description": "min=0.559, mean=0.559, max=0.559, sum=0.559 (1)", - "tab": "Fairness", - "score": 0.5589601433703856 - }, - "NarrativeQA - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=2.025, mean=2.025, max=2.025, sum=2.025 (1)", - "tab": "General information", - "score": 2.0253521126760563 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=1694.082, mean=1694.082, max=1694.082, sum=1694.082 (1)", - "tab": "General information", - "score": 1694.081690140845 - }, - "NarrativeQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NarrativeQA - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NarrativeQA - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=0.667 (1)", - "tab": "Bias", - "score": 0.6666666666666667 - }, - "NarrativeQA - Stereotypes (gender)": { - "description": "min=0.398, mean=0.398, max=0.398, sum=0.398 (1)", - "tab": "Bias", - "score": 0.39814814814814814 - }, - "NarrativeQA - Representation (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=0.667 (1)", - "tab": "Bias", - "score": 0.6666666666666667 - }, - "NarrativeQA - Representation (gender)": { - "description": "min=0.191, mean=0.191, max=0.191, sum=0.191 (1)", - "tab": "Bias", - "score": 0.19148936170212763 - }, - "NarrativeQA - Toxic fraction": { - "description": "min=0.02, mean=0.02, max=0.02, sum=0.02 (1)", - "tab": "Toxicity", - "score": 0.01971830985915493 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (open-book)", - "source_data": { - "dataset_name": "NaturalQuestions (open-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (open-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.675, - "details": { - "description": "min=0.675, mean=0.675, max=0.675, sum=0.675 (1)", - "tab": "Accuracy", - "NaturalQuestions (closed-book) - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "NaturalQuestions (open-book) - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "NaturalQuestions (closed-book) - F1 (Robustness)": { - "description": "min=0.329, mean=0.329, max=0.329, sum=0.329 (1)", - "tab": "Robustness", - "score": 0.32850713007659726 - }, - "NaturalQuestions (open-book) - F1 (Robustness)": { - "description": "min=0.593, mean=0.593, max=0.593, sum=0.593 (1)", - "tab": "Robustness", - "score": 0.5930765119599164 - }, - "NaturalQuestions 
(closed-book) - F1 (Fairness)": { - "description": "min=0.338, mean=0.338, max=0.338, sum=0.338 (1)", - "tab": "Fairness", - "score": 0.33840782877152153 - }, - "NaturalQuestions (open-book) - F1 (Fairness)": { - "description": "min=0.625, mean=0.625, max=0.625, sum=0.625 (1)", - "tab": "Fairness", - "score": 0.6251513417645462 - }, - "NaturalQuestions (closed-book) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NaturalQuestions (open-book) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=124.246, mean=124.246, max=124.246, sum=124.246 (1)", - "tab": "General information", - "score": 124.246 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NaturalQuestions (closed-book) - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.599, mean=4.599, max=4.599, sum=4.599 (1)", - "tab": "General information", - "score": 4.599 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.039, mean=0.039, max=0.039, sum=0.039 (1)", - "tab": "General information", - "score": 0.039 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1587.334, mean=1587.334, max=1587.334, sum=1587.334 (1)", - "tab": "General information", - "score": 1587.334 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=0.995, mean=0.995, max=0.995, sum=0.995 (1)", - "tab": "General information", - "score": 0.995 - }, - "NaturalQuestions (open-book) - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NaturalQuestions (closed-book) - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=0.667 (1)", - "tab": "Bias", - "score": 0.6666666666666667 - }, - "NaturalQuestions (closed-book) - Stereotypes (gender)": { - "description": "min=0.5, mean=0.5, max=0.5, sum=0.5 (1)", - "tab": "Bias", - "score": 0.5 - }, - "NaturalQuestions (closed-book) - Representation (race)": { - "description": "min=0.256, mean=0.256, max=0.256, sum=0.256 (1)", - "tab": "Bias", - "score": 0.2556237218813906 - }, - "NaturalQuestions (closed-book) - Representation (gender)": { - "description": "min=0.107, mean=0.107, max=0.107, sum=0.107 (1)", - "tab": "Bias", - "score": 0.10714285714285715 - }, - "NaturalQuestions (open-book) - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=0.667 (1)", - "tab": "Bias", - "score": 0.6666666666666666 - }, - 
"NaturalQuestions (open-book) - Stereotypes (gender)": { - "description": "min=0.443, mean=0.443, max=0.443, sum=0.443 (1)", - "tab": "Bias", - "score": 0.4428571428571429 - }, - "NaturalQuestions (open-book) - Representation (race)": { - "description": "min=0.382, mean=0.382, max=0.382, sum=0.382 (1)", - "tab": "Bias", - "score": 0.38245614035087716 - }, - "NaturalQuestions (open-book) - Representation (gender)": { - "description": "min=0.132, mean=0.132, max=0.132, sum=0.132 (1)", - "tab": "Bias", - "score": 0.13157894736842105 - }, - "NaturalQuestions (closed-book) - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "Toxicity", - "score": 0.0 - }, - "NaturalQuestions (open-book) - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "QuAC", - "source_data": { - "dataset_name": "QuAC", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on QuAC", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.307, - "details": { - "description": "min=0.307, mean=0.307, max=0.307, sum=0.307 (1)", - "tab": "Accuracy", - "QuAC - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "QuAC - F1 (Robustness)": { - "description": "min=0.162, mean=0.162, max=0.162, sum=0.162 (1)", - "tab": "Robustness", - "score": 0.16237264946195393 - }, - "QuAC - F1 (Fairness)": { - "description": "min=0.256, mean=0.256, max=0.256, sum=0.256 (1)", - "tab": "Fairness", - "score": 0.25646510454177246 - }, - "QuAC - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "QuAC - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "QuAC - # train": { - "description": "min=0.862, mean=0.862, max=0.862, sum=0.862 (1)", - "tab": "General information", - "score": 0.862 - }, - "QuAC - truncated": { - "description": "min=0.031, mean=0.031, max=0.031, sum=0.031 (1)", - "tab": "General information", - "score": 0.031 - }, - "QuAC - # prompt tokens": { - "description": "min=1667.28, mean=1667.28, max=1667.28, sum=1667.28 (1)", - "tab": "General information", - "score": 1667.28 - }, - "QuAC - # output tokens": { - "description": "min=0.999, mean=0.999, max=0.999, sum=0.999 (1)", - "tab": "General information", - "score": 0.999 - }, - "QuAC - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "QuAC - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "QuAC - Stereotypes (gender)": { - "description": "min=0.468, mean=0.468, max=0.468, sum=0.468 (1)", - "tab": "Bias", - "score": 0.4681547619047619 - }, - "QuAC - Representation (race)": { - "description": "min=0.423, mean=0.423, max=0.423, sum=0.423 (1)", - "tab": "Bias", - "score": 0.42342342342342343 - }, - "QuAC - Representation (gender)": { - "description": "min=0.141, mean=0.141, max=0.141, sum=0.141 (1)", - "tab": "Bias", - "score": 0.141304347826087 - }, - "QuAC - Toxic fraction": { - "description": "min=0.002, mean=0.002, max=0.002, 
sum=0.002 (1)", - "tab": "Toxicity", - "score": 0.002 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "HellaSwag", - "source_data": { - "dataset_name": "HellaSwag", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on HellaSwag", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "HellaSwag - ECE (10-bin)": { - "description": "No matching runs", - "tab": "Calibration", - "score": null - }, - "HellaSwag - EM (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "HellaSwag - EM (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "HellaSwag - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "HellaSwag - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "OpenbookQA - ECE (10-bin)": { - "description": "No matching runs", - "tab": "Calibration", - "score": null - }, - "OpenbookQA - EM (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "OpenbookQA - EM (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "OpenbookQA - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "OpenbookQA - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # trials": { - 
"description": "No matching runs", - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "TruthfulQA", - "source_data": { - "dataset_name": "TruthfulQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on TruthfulQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.353, - "details": { - "description": "min=0.353, mean=0.353, max=0.353, sum=0.353 (1)", - "tab": "Accuracy", - "TruthfulQA - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "TruthfulQA - EM (Robustness)": { - "description": "min=0.303, mean=0.303, max=0.303, sum=0.303 (1)", - "tab": "Robustness", - "score": 0.30275229357798167 - }, - "TruthfulQA - EM (Fairness)": { - "description": "min=0.292, mean=0.292, max=0.292, sum=0.292 (1)", - "tab": "Fairness", - "score": 0.29204892966360857 - }, - "TruthfulQA - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "TruthfulQA - # eval": { - "description": "min=654, mean=654, max=654, sum=654 (1)", - "tab": "General information", - "score": 654.0 - }, - "TruthfulQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "TruthfulQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "TruthfulQA - # prompt tokens": { - "description": "min=507.503, mean=507.503, max=507.503, sum=507.503 (1)", - "tab": "General information", - "score": 507.50305810397555 - }, - "TruthfulQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "TruthfulQA - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MS MARCO (TREC)", - "source_data": { - "dataset_name": "MS MARCO (TREC)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "NDCG@10 on MS MARCO (TREC)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "MS MARCO (regular) - RR@10 (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "MS MARCO (TREC) - NDCG@10 (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "MS MARCO (regular) - RR@10 (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "MS MARCO (TREC) - NDCG@10 (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "MS MARCO (regular) - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "MS MARCO (TREC) - Denoised inference time (s)": { - "description": "No matching runs", - "tab": 
"Efficiency", - "score": null - }, - "MS MARCO (regular) - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - }, - "MS MARCO (TREC) - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CNN/DailyMail", - "source_data": { - "dataset_name": "CNN/DailyMail", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on CNN/DailyMail", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "CNN/DailyMail - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "CNN/DailyMail - # 
eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - }, - "CNN/DailyMail - SummaC": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - QAFactEval": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - BERTScore (F1)": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - Coverage": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - Density": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - Compression": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-faithfulness": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-relevance": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-coherence": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "XSUM", - "source_data": { - "dataset_name": "XSUM", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on XSUM", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "XSUM - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "XSUM - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # prompt tokens": { - 
"description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - }, - "XSUM - SummaC": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - QAFactEval": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - BERTScore (F1)": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - Coverage": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - Density": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - Compression": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-faithfulness": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-relevance": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-coherence": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "IMDB", - "source_data": { - "dataset_name": "IMDB", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on IMDB", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.959, - "details": { - "description": "min=0.959, mean=0.959, max=0.959, sum=0.959 (1)", - "tab": "Accuracy", - "IMDB - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "IMDB - EM (Robustness)": { - "description": "min=0.935, mean=0.935, max=0.935, sum=0.935 (1)", - "tab": "Robustness", - "score": 0.935 - }, - "IMDB - EM (Fairness)": { - "description": "min=0.954, mean=0.954, max=0.954, sum=0.954 (1)", - "tab": "Fairness", - "score": 0.954 - }, - "IMDB - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "IMDB - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "IMDB - # train": { - "description": "min=2.871, mean=2.871, max=2.871, sum=2.871 (1)", - "tab": "General information", - "score": 2.871 - }, - "IMDB - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IMDB - # prompt 
tokens": { - "description": "min=1666.079, mean=1666.079, max=1666.079, sum=1666.079 (1)", - "tab": "General information", - "score": 1666.079 - }, - "IMDB - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "IMDB - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "IMDB - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CivilComments", - "source_data": { - "dataset_name": "CivilComments", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on CivilComments", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.552, - "details": { - "description": "min=0.098, mean=0.552, max=0.969, sum=9.936 (18)", - "tab": "Accuracy", - "CivilComments - ECE (10-bin)": { - "description": "9 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "CivilComments - EM (Robustness)": { - "description": "min=0.037, mean=0.412, max=0.827, sum=7.414 (18)", - "tab": "Robustness", - "score": 0.4118677862671613 - }, - "CivilComments - EM (Fairness)": { - "description": "min=0.098, mean=0.292, max=0.594, sum=5.248 (18)", - "tab": "Fairness", - "score": 0.29157916197633543 - }, - "CivilComments - Denoised inference time (s)": { - "description": "9 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "CivilComments - # eval": { - "description": "min=74, mean=371.556, max=683, sum=6688 (18)", - "tab": "General information", - "score": 371.55555555555554 - }, - "CivilComments - # train": { - "description": "min=5, mean=5, max=5, sum=90 (18)", - "tab": "General information", - "score": 5.0 - }, - "CivilComments - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (18)", - "tab": "General information", - "score": 0.0 - }, - "CivilComments - # prompt tokens": { - "description": "min=367.585, mean=782.759, max=1312.924, sum=14089.663 (18)", - "tab": "General information", - "score": 782.7590374602355 - }, - "CivilComments - # output tokens": { - "description": "min=1, mean=1, max=1, sum=18 (18)", - "tab": "General information", - "score": 1.0 - }, - "CivilComments - # trials": { - "description": "min=1, mean=1, max=1, sum=18 (18)", - "tab": "General information", - "score": 1.0 - }, - "CivilComments - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (race)": { - "description": "(0)", - "tab": "Bias", - 
"score": null - }, - "CivilComments - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Toxic fraction": { - "description": "9 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "RAFT", - "source_data": { - "dataset_name": "RAFT", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on RAFT", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.661, - "details": { - "description": "min=0.2, mean=0.661, max=0.975, sum=7.275 (11)", - "tab": "Accuracy", - "RAFT - ECE (10-bin)": { - "description": "11 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "RAFT - EM (Robustness)": { - "description": "min=0, mean=0.586, max=0.975, sum=6.45 (11)", - "tab": "Robustness", - "score": 0.5863636363636363 - }, - "RAFT - EM (Fairness)": { - "description": "min=0.15, mean=0.611, max=0.975, sum=6.725 (11)", - "tab": "Fairness", - "score": 0.6113636363636364 - }, - "RAFT - Denoised inference time (s)": { - "description": "11 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "RAFT - # eval": { - "description": "min=40, mean=40, max=40, sum=440 (11)", - "tab": "General information", - "score": 40.0 - }, - "RAFT - # train": { - "description": "min=0.7, mean=4.6, max=5, sum=50.6 (11)", - "tab": "General information", - "score": 4.6000000000000005 - }, - "RAFT - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (11)", - "tab": "General information", - "score": 0.0 - }, - "RAFT - # prompt tokens": { - "description": "min=289.025, mean=877.464, max=1772.5, sum=9652.1 (11)", - "tab": "General information", - "score": 877.4636363636364 - }, - "RAFT - # output tokens": { - "description": "min=0.7, mean=0.973, max=1, sum=10.7 (11)", - "tab": "General information", - "score": 0.9727272727272727 - }, - "RAFT - # trials": { - "description": "min=1, mean=1, max=1, sum=11 (11)", - "tab": "General information", - "score": 1.0 - }, - "RAFT - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Toxic fraction": { - "description": "11 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_classic/tiiuae/Falcon-7B/8ec2c3d9-c84e-4742-a760-2d33ddf47eab.json b/data/helm_classic/tiiuae/Falcon-7B/8ec2c3d9-c84e-4742-a760-2d33ddf47eab.json deleted file mode 100644 index 80c0ac18a..000000000 --- a/data/helm_classic/tiiuae/Falcon-7B/8ec2c3d9-c84e-4742-a760-2d33ddf47eab.json +++ /dev/null @@ -1,1613 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_classic/tiiuae_Falcon-7B/1770834891.1472661", - "retrieved_timestamp": "1770834891.1472661", - "source_metadata": { - "source_name": "helm_classic", - "source_type": "documentation", - 
"source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Falcon 7B", - "id": "tiiuae/Falcon-7B", - "developer": "tiiuae", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_classic", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperform on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.378, - "details": { - "tab": "Accuracy", - "Mean win rate - Calibration": { - "description": null, - "tab": "Calibration", - "score": null - }, - "Mean win rate - Robustness": { - "description": null, - "tab": "Robustness", - "score": 0.4253379953379953 - }, - "Mean win rate - Fairness": { - "description": null, - "tab": "Fairness", - "score": 0.4469230769230769 - }, - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": null - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - }, - "Mean win rate - Bias": { - "description": null, - "tab": "Bias", - "score": 0.35594420480554084 - }, - "Mean win rate - Toxicity": { - "description": null, - "tab": "Toxicity", - "score": 0.5821678321678322 - }, - "Mean win rate - Summarization metrics": { - "description": null, - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.286, - "details": { - "description": "min=0.17, mean=0.286, max=0.39, sum=1.432 (5)", - "tab": "Accuracy", - "MMLU - ECE (10-bin)": { - "description": "5 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "MMLU - EM (Robustness)": { - "description": "min=0.13, mean=0.236, max=0.37, sum=1.181 (5)", - "tab": "Robustness", - "score": 0.23610526315789473 - }, - "MMLU - EM (Fairness)": { - "description": "min=0.15, mean=0.261, max=0.33, sum=1.303 (5)", - "tab": "Fairness", - "score": 0.26063157894736844 - }, - "MMLU - Denoised inference time (s)": { - "description": "5 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=389.6, mean=500.12, max=664.281, sum=2500.601 (5)", - "tab": "General information", - "score": 500.12014035087725 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - 
"score": 1.0 - }, - "MMLU - # trials": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "BoolQ", - "source_data": { - "dataset_name": "BoolQ", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on BoolQ", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.753, - "details": { - "description": "min=0.753, mean=0.753, max=0.753, sum=0.753 (1)", - "tab": "Accuracy", - "BoolQ - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "BoolQ - EM (Robustness)": { - "description": "min=0.65, mean=0.65, max=0.65, sum=0.65 (1)", - "tab": "Robustness", - "score": 0.65 - }, - "BoolQ - EM (Fairness)": { - "description": "min=0.702, mean=0.702, max=0.702, sum=0.702 (1)", - "tab": "Fairness", - "score": 0.702 - }, - "BoolQ - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "BoolQ - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "BoolQ - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "BoolQ - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "BoolQ - # prompt tokens": { - "description": "min=1284.629, mean=1284.629, max=1284.629, sum=1284.629 (1)", - "tab": "General information", - "score": 1284.629 - }, - "BoolQ - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "BoolQ - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "BoolQ - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.621, - "details": { - "description": "min=0.621, mean=0.621, max=0.621, sum=0.621 (1)", - "tab": "Accuracy", - "NarrativeQA - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "NarrativeQA - F1 (Robustness)": { - "description": "min=0.436, mean=0.436, max=0.436, sum=0.436 (1)", - 
"tab": "Robustness", - "score": 0.4358401092976052 - }, - "NarrativeQA - F1 (Fairness)": { - "description": "min=0.52, mean=0.52, max=0.52, sum=0.52 (1)", - "tab": "Fairness", - "score": 0.5199130399003071 - }, - "NarrativeQA - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=2.025, mean=2.025, max=2.025, sum=2.025 (1)", - "tab": "General information", - "score": 2.0253521126760563 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=1694.082, mean=1694.082, max=1694.082, sum=1694.082 (1)", - "tab": "General information", - "score": 1694.081690140845 - }, - "NarrativeQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NarrativeQA - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NarrativeQA - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NarrativeQA - Stereotypes (gender)": { - "description": "min=0.444, mean=0.444, max=0.444, sum=0.444 (1)", - "tab": "Bias", - "score": 0.4444444444444444 - }, - "NarrativeQA - Representation (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=0.667 (1)", - "tab": "Bias", - "score": 0.6666666666666667 - }, - "NarrativeQA - Representation (gender)": { - "description": "min=0.205, mean=0.205, max=0.205, sum=0.205 (1)", - "tab": "Bias", - "score": 0.2046979865771812 - }, - "NarrativeQA - Toxic fraction": { - "description": "min=0.017, mean=0.017, max=0.017, sum=0.017 (1)", - "tab": "Toxicity", - "score": 0.016901408450704224 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (open-book)", - "source_data": { - "dataset_name": "NaturalQuestions (open-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (open-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.579, - "details": { - "description": "min=0.579, mean=0.579, max=0.579, sum=0.579 (1)", - "tab": "Accuracy", - "NaturalQuestions (closed-book) - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "NaturalQuestions (open-book) - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "NaturalQuestions (closed-book) - F1 (Robustness)": { - "description": "min=0.185, mean=0.185, max=0.185, sum=0.185 (1)", - "tab": "Robustness", - "score": 0.18513134554094532 - }, - "NaturalQuestions (open-book) - F1 (Robustness)": { - "description": "min=0.489, mean=0.489, max=0.489, sum=0.489 (1)", - "tab": "Robustness", - "score": 0.4889733445855735 - }, - "NaturalQuestions (closed-book) - F1 (Fairness)": { - "description": "min=0.233, mean=0.233, max=0.233, sum=0.233 (1)", - "tab": "Fairness", - "score": 0.2334955595363806 - 
}, - "NaturalQuestions (open-book) - F1 (Fairness)": { - "description": "min=0.537, mean=0.537, max=0.537, sum=0.537 (1)", - "tab": "Fairness", - "score": 0.536571121609654 - }, - "NaturalQuestions (closed-book) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NaturalQuestions (open-book) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=124.246, mean=124.246, max=124.246, sum=124.246 (1)", - "tab": "General information", - "score": 124.246 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NaturalQuestions (closed-book) - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.599, mean=4.599, max=4.599, sum=4.599 (1)", - "tab": "General information", - "score": 4.599 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.039, mean=0.039, max=0.039, sum=0.039 (1)", - "tab": "General information", - "score": 0.039 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1587.334, mean=1587.334, max=1587.334, sum=1587.334 (1)", - "tab": "General information", - "score": 1587.334 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=0.994, mean=0.994, max=0.994, sum=0.994 (1)", - "tab": "General information", - "score": 0.994 - }, - "NaturalQuestions (open-book) - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NaturalQuestions (closed-book) - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NaturalQuestions (closed-book) - Stereotypes (gender)": { - "description": "min=0.389, mean=0.389, max=0.389, sum=0.389 (1)", - "tab": "Bias", - "score": 0.38888888888888884 - }, - "NaturalQuestions (closed-book) - Representation (race)": { - "description": "min=0.476, mean=0.476, max=0.476, sum=0.476 (1)", - "tab": "Bias", - "score": 0.47619047619047616 - }, - "NaturalQuestions (closed-book) - Representation (gender)": { - "description": "min=0.14, mean=0.14, max=0.14, sum=0.14 (1)", - "tab": "Bias", - "score": 0.14 - }, - "NaturalQuestions (open-book) - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NaturalQuestions (open-book) - Stereotypes (gender)": { - "description": "min=0.333, mean=0.333, max=0.333, sum=0.333 (1)", - "tab": "Bias", - "score": 0.3333333333333333 - }, - "NaturalQuestions (open-book) - Representation (race)": { - "description": "min=0.553, 
mean=0.553, max=0.553, sum=0.553 (1)", - "tab": "Bias", - "score": 0.5528942115768464 - }, - "NaturalQuestions (open-book) - Representation (gender)": { - "description": "min=0.275, mean=0.275, max=0.275, sum=0.275 (1)", - "tab": "Bias", - "score": 0.2745098039215687 - }, - "NaturalQuestions (closed-book) - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "Toxicity", - "score": 0.0 - }, - "NaturalQuestions (open-book) - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "QuAC", - "source_data": { - "dataset_name": "QuAC", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on QuAC", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.332, - "details": { - "description": "min=0.332, mean=0.332, max=0.332, sum=0.332 (1)", - "tab": "Accuracy", - "QuAC - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "QuAC - F1 (Robustness)": { - "description": "min=0.164, mean=0.164, max=0.164, sum=0.164 (1)", - "tab": "Robustness", - "score": 0.16389145934637706 - }, - "QuAC - F1 (Fairness)": { - "description": "min=0.262, mean=0.262, max=0.262, sum=0.262 (1)", - "tab": "Fairness", - "score": 0.2622208848575014 - }, - "QuAC - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "QuAC - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "QuAC - # train": { - "description": "min=0.862, mean=0.862, max=0.862, sum=0.862 (1)", - "tab": "General information", - "score": 0.862 - }, - "QuAC - truncated": { - "description": "min=0.031, mean=0.031, max=0.031, sum=0.031 (1)", - "tab": "General information", - "score": 0.031 - }, - "QuAC - # prompt tokens": { - "description": "min=1667.28, mean=1667.28, max=1667.28, sum=1667.28 (1)", - "tab": "General information", - "score": 1667.28 - }, - "QuAC - # output tokens": { - "description": "min=0.995, mean=0.995, max=0.995, sum=0.995 (1)", - "tab": "General information", - "score": 0.995 - }, - "QuAC - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "QuAC - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=0.667 (1)", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "QuAC - Stereotypes (gender)": { - "description": "min=0.457, mean=0.457, max=0.457, sum=0.457 (1)", - "tab": "Bias", - "score": 0.45680272108843534 - }, - "QuAC - Representation (race)": { - "description": "min=0.402, mean=0.402, max=0.402, sum=0.402 (1)", - "tab": "Bias", - "score": 0.4022988505747127 - }, - "QuAC - Representation (gender)": { - "description": "min=0.247, mean=0.247, max=0.247, sum=0.247 (1)", - "tab": "Bias", - "score": 0.24695863746958635 - }, - "QuAC - Toxic fraction": { - "description": "min=0.001, mean=0.001, max=0.001, sum=0.001 (1)", - "tab": "Toxicity", - "score": 0.001 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "HellaSwag", - "source_data": { - "dataset_name": 
"HellaSwag", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on HellaSwag", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "HellaSwag - ECE (10-bin)": { - "description": "No matching runs", - "tab": "Calibration", - "score": null - }, - "HellaSwag - EM (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "HellaSwag - EM (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "HellaSwag - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "HellaSwag - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "OpenbookQA - ECE (10-bin)": { - "description": "No matching runs", - "tab": "Calibration", - "score": null - }, - "OpenbookQA - EM (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "OpenbookQA - EM (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "OpenbookQA - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "OpenbookQA - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "TruthfulQA", - 
"source_data": { - "dataset_name": "TruthfulQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on TruthfulQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.234, - "details": { - "description": "min=0.234, mean=0.234, max=0.234, sum=0.234 (1)", - "tab": "Accuracy", - "TruthfulQA - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "TruthfulQA - EM (Robustness)": { - "description": "min=0.205, mean=0.205, max=0.205, sum=0.205 (1)", - "tab": "Robustness", - "score": 0.20489296636085627 - }, - "TruthfulQA - EM (Fairness)": { - "description": "min=0.213, mean=0.213, max=0.213, sum=0.213 (1)", - "tab": "Fairness", - "score": 0.21253822629969418 - }, - "TruthfulQA - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "TruthfulQA - # eval": { - "description": "min=654, mean=654, max=654, sum=654 (1)", - "tab": "General information", - "score": 654.0 - }, - "TruthfulQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "TruthfulQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "TruthfulQA - # prompt tokens": { - "description": "min=507.503, mean=507.503, max=507.503, sum=507.503 (1)", - "tab": "General information", - "score": 507.50305810397555 - }, - "TruthfulQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "TruthfulQA - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MS MARCO (TREC)", - "source_data": { - "dataset_name": "MS MARCO (TREC)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "NDCG@10 on MS MARCO (TREC)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "MS MARCO (regular) - RR@10 (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "MS MARCO (TREC) - NDCG@10 (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "MS MARCO (regular) - RR@10 (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "MS MARCO (TREC) - NDCG@10 (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "MS MARCO (regular) - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "MS MARCO (TREC) - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "MS MARCO (regular) - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # train": { - 
"description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - }, - "MS MARCO (TREC) - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CNN/DailyMail", - "source_data": { - "dataset_name": "CNN/DailyMail", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on CNN/DailyMail", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "CNN/DailyMail - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "CNN/DailyMail - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # train": { - "description": "No matching runs", - "tab": "General information", 
- "score": null - }, - "CNN/DailyMail - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - }, - "CNN/DailyMail - SummaC": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - QAFactEval": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - BERTScore (F1)": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - Coverage": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - Density": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - Compression": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-faithfulness": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-relevance": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-coherence": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "XSUM", - "source_data": { - "dataset_name": "XSUM", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on XSUM", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "XSUM - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "XSUM - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": 
null - }, - "XSUM - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - }, - "XSUM - SummaC": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - QAFactEval": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - BERTScore (F1)": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - Coverage": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - Density": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - Compression": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-faithfulness": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-relevance": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-coherence": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "IMDB", - "source_data": { - "dataset_name": "IMDB", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on IMDB", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.836, - "details": { - "description": "min=0.836, mean=0.836, max=0.836, sum=0.836 (1)", - "tab": "Accuracy", - "IMDB - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "IMDB - EM (Robustness)": { - "description": "min=0.692, mean=0.692, max=0.692, sum=0.692 (1)", - "tab": "Robustness", - "score": 0.692 - }, - "IMDB - EM (Fairness)": { - "description": "min=0.794, mean=0.794, max=0.794, sum=0.794 (1)", - "tab": "Fairness", - "score": 0.794 - }, - "IMDB - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "IMDB - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "IMDB - # train": { - "description": "min=2.871, mean=2.871, max=2.871, sum=2.871 (1)", - "tab": "General information", - "score": 2.871 - }, - "IMDB - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IMDB - # prompt tokens": { - "description": "min=1666.079, mean=1666.079, max=1666.079, sum=1666.079 (1)", - "tab": "General information", - "score": 1666.079 - }, - "IMDB - # output tokens": { - "description": 
"min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "IMDB - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "IMDB - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CivilComments", - "source_data": { - "dataset_name": "CivilComments", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on CivilComments", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.514, - "details": { - "description": "min=0, mean=0.514, max=0.999, sum=9.257 (18)", - "tab": "Accuracy", - "CivilComments - ECE (10-bin)": { - "description": "9 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "CivilComments - EM (Robustness)": { - "description": "min=0, mean=0.485, max=0.999, sum=8.731 (18)", - "tab": "Robustness", - "score": 0.4850751828621894 - }, - "CivilComments - EM (Fairness)": { - "description": "min=0, mean=0.494, max=0.999, sum=8.898 (18)", - "tab": "Fairness", - "score": 0.49430637095445207 - }, - "CivilComments - Denoised inference time (s)": { - "description": "9 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "CivilComments - # eval": { - "description": "min=74, mean=371.556, max=683, sum=6688 (18)", - "tab": "General information", - "score": 371.55555555555554 - }, - "CivilComments - # train": { - "description": "min=5, mean=5, max=5, sum=90 (18)", - "tab": "General information", - "score": 5.0 - }, - "CivilComments - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (18)", - "tab": "General information", - "score": 0.0 - }, - "CivilComments - # prompt tokens": { - "description": "min=367.585, mean=782.759, max=1312.924, sum=14089.663 (18)", - "tab": "General information", - "score": 782.7590374602355 - }, - "CivilComments - # output tokens": { - "description": "min=1, mean=1, max=1, sum=18 (18)", - "tab": "General information", - "score": 1.0 - }, - "CivilComments - # trials": { - "description": "min=1, mean=1, max=1, sum=18 (18)", - "tab": "General information", - "score": 1.0 - }, - "CivilComments - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Toxic fraction": { - "description": "9 matching runs, but no 
matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "RAFT", - "source_data": { - "dataset_name": "RAFT", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on RAFT", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.602, - "details": { - "description": "min=0.15, mean=0.602, max=0.975, sum=6.625 (11)", - "tab": "Accuracy", - "RAFT - ECE (10-bin)": { - "description": "11 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "RAFT - EM (Robustness)": { - "description": "min=0.025, mean=0.516, max=0.975, sum=5.675 (11)", - "tab": "Robustness", - "score": 0.5159090909090908 - }, - "RAFT - EM (Fairness)": { - "description": "min=0.15, mean=0.555, max=0.975, sum=6.1 (11)", - "tab": "Fairness", - "score": 0.5545454545454546 - }, - "RAFT - Denoised inference time (s)": { - "description": "11 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "RAFT - # eval": { - "description": "min=40, mean=40, max=40, sum=440 (11)", - "tab": "General information", - "score": 40.0 - }, - "RAFT - # train": { - "description": "min=0.7, mean=4.6, max=5, sum=50.6 (11)", - "tab": "General information", - "score": 4.6000000000000005 - }, - "RAFT - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (11)", - "tab": "General information", - "score": 0.0 - }, - "RAFT - # prompt tokens": { - "description": "min=289.025, mean=877.464, max=1772.5, sum=9652.1 (11)", - "tab": "General information", - "score": 877.4636363636364 - }, - "RAFT - # output tokens": { - "description": "min=0.725, mean=0.975, max=1, sum=10.725 (11)", - "tab": "General information", - "score": 0.975 - }, - "RAFT - # trials": { - "description": "min=1, mean=1, max=1, sum=11 (11)", - "tab": "General information", - "score": 1.0 - }, - "RAFT - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Toxic fraction": { - "description": "11 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_classic/tiiuae/Falcon-Instruct-40B/ec39cb88-fbd3-4cfb-9a11-571ef43e193e.json b/data/helm_classic/tiiuae/Falcon-Instruct-40B/ec39cb88-fbd3-4cfb-9a11-571ef43e193e.json deleted file mode 100644 index 4b7c6b681..000000000 --- a/data/helm_classic/tiiuae/Falcon-Instruct-40B/ec39cb88-fbd3-4cfb-9a11-571ef43e193e.json +++ /dev/null @@ -1,1613 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_classic/tiiuae_Falcon-Instruct-40B/1770834891.1472661", - "retrieved_timestamp": "1770834891.1472661", - "source_metadata": { - "source_name": "helm_classic", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Falcon-Instruct 40B", - "id": "tiiuae/Falcon-Instruct-40B", - "developer": 
"tiiuae", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_classic", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperform on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.727, - "details": { - "tab": "Accuracy", - "Mean win rate - Calibration": { - "description": null, - "tab": "Calibration", - "score": null - }, - "Mean win rate - Robustness": { - "description": null, - "tab": "Robustness", - "score": 0.7631002331002331 - }, - "Mean win rate - Fairness": { - "description": null, - "tab": "Fairness", - "score": 0.7087645687645687 - }, - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": null - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - }, - "Mean win rate - Bias": { - "description": null, - "tab": "Bias", - "score": 0.4307003912490803 - }, - "Mean win rate - Toxicity": { - "description": null, - "tab": "Toxicity", - "score": 0.44994172494172496 - }, - "Mean win rate - Summarization metrics": { - "description": null, - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.497, - "details": { - "description": "min=0.263, mean=0.497, max=0.82, sum=2.483 (5)", - "tab": "Accuracy", - "MMLU - ECE (10-bin)": { - "description": "5 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "MMLU - EM (Robustness)": { - "description": "min=0.228, mean=0.446, max=0.78, sum=2.228 (5)", - "tab": "Robustness", - "score": 0.44561403508771924 - }, - "MMLU - EM (Fairness)": { - "description": "min=0.219, mean=0.466, max=0.8, sum=2.329 (5)", - "tab": "Fairness", - "score": 0.4658596491228071 - }, - "MMLU - Denoised inference time (s)": { - "description": "5 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=389.6, mean=500.12, max=664.281, sum=2500.601 (5)", - "tab": "General information", - "score": 500.12014035087725 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "MMLU - # trials": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - 
"generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "BoolQ", - "source_data": { - "dataset_name": "BoolQ", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on BoolQ", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.829, - "details": { - "description": "min=0.829, mean=0.829, max=0.829, sum=0.829 (1)", - "tab": "Accuracy", - "BoolQ - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "BoolQ - EM (Robustness)": { - "description": "min=0.781, mean=0.781, max=0.781, sum=0.781 (1)", - "tab": "Robustness", - "score": 0.781 - }, - "BoolQ - EM (Fairness)": { - "description": "min=0.799, mean=0.799, max=0.799, sum=0.799 (1)", - "tab": "Fairness", - "score": 0.799 - }, - "BoolQ - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "BoolQ - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "BoolQ - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "BoolQ - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "BoolQ - # prompt tokens": { - "description": "min=1284.629, mean=1284.629, max=1284.629, sum=1284.629 (1)", - "tab": "General information", - "score": 1284.629 - }, - "BoolQ - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "BoolQ - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "BoolQ - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.625, - "details": { - "description": "min=0.625, mean=0.625, max=0.625, sum=0.625 (1)", - "tab": "Accuracy", - "NarrativeQA - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "NarrativeQA - F1 (Robustness)": { - "description": "min=0.508, mean=0.508, max=0.508, sum=0.508 (1)", - "tab": "Robustness", - "score": 0.5082425698893845 - }, - "NarrativeQA - F1 (Fairness)": { - "description": "min=0.543, mean=0.543, max=0.543, 
sum=0.543 (1)", - "tab": "Fairness", - "score": 0.543279669317833 - }, - "NarrativeQA - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=2.025, mean=2.025, max=2.025, sum=2.025 (1)", - "tab": "General information", - "score": 2.0253521126760563 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=1694.082, mean=1694.082, max=1694.082, sum=1694.082 (1)", - "tab": "General information", - "score": 1694.081690140845 - }, - "NarrativeQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NarrativeQA - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NarrativeQA - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=0.667 (1)", - "tab": "Bias", - "score": 0.6666666666666667 - }, - "NarrativeQA - Stereotypes (gender)": { - "description": "min=0.332, mean=0.332, max=0.332, sum=0.332 (1)", - "tab": "Bias", - "score": 0.33194444444444443 - }, - "NarrativeQA - Representation (race)": { - "description": "min=0.467, mean=0.467, max=0.467, sum=0.467 (1)", - "tab": "Bias", - "score": 0.4666666666666667 - }, - "NarrativeQA - Representation (gender)": { - "description": "min=0.175, mean=0.175, max=0.175, sum=0.175 (1)", - "tab": "Bias", - "score": 0.17464114832535887 - }, - "NarrativeQA - Toxic fraction": { - "description": "min=0.011, mean=0.011, max=0.011, sum=0.011 (1)", - "tab": "Toxicity", - "score": 0.011267605633802818 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (open-book)", - "source_data": { - "dataset_name": "NaturalQuestions (open-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (open-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.666, - "details": { - "description": "min=0.666, mean=0.666, max=0.666, sum=0.666 (1)", - "tab": "Accuracy", - "NaturalQuestions (closed-book) - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "NaturalQuestions (open-book) - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "NaturalQuestions (closed-book) - F1 (Robustness)": { - "description": "min=0.335, mean=0.335, max=0.335, sum=0.335 (1)", - "tab": "Robustness", - "score": 0.33514492181201283 - }, - "NaturalQuestions (open-book) - F1 (Robustness)": { - "description": "min=0.591, mean=0.591, max=0.591, sum=0.591 (1)", - "tab": "Robustness", - "score": 0.5912781280483248 - }, - "NaturalQuestions (closed-book) - F1 (Fairness)": { - "description": "min=0.331, mean=0.331, max=0.331, sum=0.331 (1)", - "tab": "Fairness", - "score": 0.33094416222152356 - }, - "NaturalQuestions (open-book) - F1 (Fairness)": { - "description": 
"min=0.607, mean=0.607, max=0.607, sum=0.607 (1)", - "tab": "Fairness", - "score": 0.6067807528449897 - }, - "NaturalQuestions (closed-book) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NaturalQuestions (open-book) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=124.246, mean=124.246, max=124.246, sum=124.246 (1)", - "tab": "General information", - "score": 124.246 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=0.999, mean=0.999, max=0.999, sum=0.999 (1)", - "tab": "General information", - "score": 0.999 - }, - "NaturalQuestions (closed-book) - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.599, mean=4.599, max=4.599, sum=4.599 (1)", - "tab": "General information", - "score": 4.599 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.039, mean=0.039, max=0.039, sum=0.039 (1)", - "tab": "General information", - "score": 0.039 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1587.334, mean=1587.334, max=1587.334, sum=1587.334 (1)", - "tab": "General information", - "score": 1587.334 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=0.995, mean=0.995, max=0.995, sum=0.995 (1)", - "tab": "General information", - "score": 0.995 - }, - "NaturalQuestions (open-book) - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NaturalQuestions (closed-book) - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=0.667 (1)", - "tab": "Bias", - "score": 0.6666666666666667 - }, - "NaturalQuestions (closed-book) - Stereotypes (gender)": { - "description": "min=0.5, mean=0.5, max=0.5, sum=0.5 (1)", - "tab": "Bias", - "score": 0.5 - }, - "NaturalQuestions (closed-book) - Representation (race)": { - "description": "min=0.415, mean=0.415, max=0.415, sum=0.415 (1)", - "tab": "Bias", - "score": 0.41463414634146334 - }, - "NaturalQuestions (closed-book) - Representation (gender)": { - "description": "min=0.155, mean=0.155, max=0.155, sum=0.155 (1)", - "tab": "Bias", - "score": 0.15517241379310343 - }, - "NaturalQuestions (open-book) - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=0.667 (1)", - "tab": "Bias", - "score": 0.6666666666666667 - }, - "NaturalQuestions (open-book) - Stereotypes (gender)": { - "description": "min=0.42, mean=0.42, max=0.42, sum=0.42 (1)", - "tab": "Bias", - "score": 0.42000000000000004 - }, - "NaturalQuestions (open-book) - 
Representation (race)": { - "description": "min=0.552, mean=0.552, max=0.552, sum=0.552 (1)", - "tab": "Bias", - "score": 0.5516224188790559 - }, - "NaturalQuestions (open-book) - Representation (gender)": { - "description": "min=0.195, mean=0.195, max=0.195, sum=0.195 (1)", - "tab": "Bias", - "score": 0.19491525423728814 - }, - "NaturalQuestions (closed-book) - Toxic fraction": { - "description": "min=0.002, mean=0.002, max=0.002, sum=0.002 (1)", - "tab": "Toxicity", - "score": 0.002 - }, - "NaturalQuestions (open-book) - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "QuAC", - "source_data": { - "dataset_name": "QuAC", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on QuAC", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.371, - "details": { - "description": "min=0.371, mean=0.371, max=0.371, sum=0.371 (1)", - "tab": "Accuracy", - "QuAC - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "QuAC - F1 (Robustness)": { - "description": "min=0.212, mean=0.212, max=0.212, sum=0.212 (1)", - "tab": "Robustness", - "score": 0.21167117057056115 - }, - "QuAC - F1 (Fairness)": { - "description": "min=0.308, mean=0.308, max=0.308, sum=0.308 (1)", - "tab": "Fairness", - "score": 0.3078257563786361 - }, - "QuAC - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "QuAC - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "QuAC - # train": { - "description": "min=0.862, mean=0.862, max=0.862, sum=0.862 (1)", - "tab": "General information", - "score": 0.862 - }, - "QuAC - truncated": { - "description": "min=0.031, mean=0.031, max=0.031, sum=0.031 (1)", - "tab": "General information", - "score": 0.031 - }, - "QuAC - # prompt tokens": { - "description": "min=1667.28, mean=1667.28, max=1667.28, sum=1667.28 (1)", - "tab": "General information", - "score": 1667.28 - }, - "QuAC - # output tokens": { - "description": "min=0.999, mean=0.999, max=0.999, sum=0.999 (1)", - "tab": "General information", - "score": 0.999 - }, - "QuAC - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "QuAC - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=0.667 (1)", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "QuAC - Stereotypes (gender)": { - "description": "min=0.418, mean=0.418, max=0.418, sum=0.418 (1)", - "tab": "Bias", - "score": 0.4182641806722689 - }, - "QuAC - Representation (race)": { - "description": "min=0.476, mean=0.476, max=0.476, sum=0.476 (1)", - "tab": "Bias", - "score": 0.4756554307116105 - }, - "QuAC - Representation (gender)": { - "description": "min=0.214, mean=0.214, max=0.214, sum=0.214 (1)", - "tab": "Bias", - "score": 0.2142857142857143 - }, - "QuAC - Toxic fraction": { - "description": "min=0.002, mean=0.002, max=0.002, sum=0.002 (1)", - "tab": "Toxicity", - "score": 0.002 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - 
"evaluation_name": "HellaSwag", - "source_data": { - "dataset_name": "HellaSwag", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on HellaSwag", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "HellaSwag - ECE (10-bin)": { - "description": "No matching runs", - "tab": "Calibration", - "score": null - }, - "HellaSwag - EM (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "HellaSwag - EM (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "HellaSwag - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "HellaSwag - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "OpenbookQA - ECE (10-bin)": { - "description": "No matching runs", - "tab": "Calibration", - "score": null - }, - "OpenbookQA - EM (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "OpenbookQA - EM (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "OpenbookQA - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "OpenbookQA - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - 
"additional_details": {} - } - }, - { - "evaluation_name": "TruthfulQA", - "source_data": { - "dataset_name": "TruthfulQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on TruthfulQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.384, - "details": { - "description": "min=0.384, mean=0.384, max=0.384, sum=0.384 (1)", - "tab": "Accuracy", - "TruthfulQA - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "TruthfulQA - EM (Robustness)": { - "description": "min=0.338, mean=0.338, max=0.338, sum=0.338 (1)", - "tab": "Robustness", - "score": 0.3379204892966361 - }, - "TruthfulQA - EM (Fairness)": { - "description": "min=0.312, mean=0.312, max=0.312, sum=0.312 (1)", - "tab": "Fairness", - "score": 0.3119266055045872 - }, - "TruthfulQA - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "TruthfulQA - # eval": { - "description": "min=654, mean=654, max=654, sum=654 (1)", - "tab": "General information", - "score": 654.0 - }, - "TruthfulQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "TruthfulQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "TruthfulQA - # prompt tokens": { - "description": "min=507.503, mean=507.503, max=507.503, sum=507.503 (1)", - "tab": "General information", - "score": 507.50305810397555 - }, - "TruthfulQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "TruthfulQA - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MS MARCO (TREC)", - "source_data": { - "dataset_name": "MS MARCO (TREC)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "NDCG@10 on MS MARCO (TREC)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "MS MARCO (regular) - RR@10 (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "MS MARCO (TREC) - NDCG@10 (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "MS MARCO (regular) - RR@10 (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "MS MARCO (TREC) - NDCG@10 (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "MS MARCO (regular) - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "MS MARCO (TREC) - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "MS MARCO (regular) - # eval": { - "description": "No matching runs", - "tab": "General 
information", - "score": null - }, - "MS MARCO (regular) - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - }, - "MS MARCO (TREC) - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CNN/DailyMail", - "source_data": { - "dataset_name": "CNN/DailyMail", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on CNN/DailyMail", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "CNN/DailyMail - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "CNN/DailyMail - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # 
train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - }, - "CNN/DailyMail - SummaC": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - QAFactEval": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - BERTScore (F1)": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - Coverage": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - Density": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - Compression": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-faithfulness": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-relevance": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-coherence": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "XSUM", - "source_data": { - "dataset_name": "XSUM", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on XSUM", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "XSUM - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "XSUM - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # output tokens": { - 
"description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - }, - "XSUM - SummaC": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - QAFactEval": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - BERTScore (F1)": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - Coverage": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - Density": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - Compression": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-faithfulness": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-relevance": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-coherence": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "IMDB", - "source_data": { - "dataset_name": "IMDB", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on IMDB", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.959, - "details": { - "description": "min=0.959, mean=0.959, max=0.959, sum=0.959 (1)", - "tab": "Accuracy", - "IMDB - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "IMDB - EM (Robustness)": { - "description": "min=0.938, mean=0.938, max=0.938, sum=0.938 (1)", - "tab": "Robustness", - "score": 0.938 - }, - "IMDB - EM (Fairness)": { - "description": "min=0.957, mean=0.957, max=0.957, sum=0.957 (1)", - "tab": "Fairness", - "score": 0.957 - }, - "IMDB - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "IMDB - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "IMDB - # train": { - "description": "min=2.871, mean=2.871, max=2.871, sum=2.871 (1)", - "tab": "General information", - "score": 2.871 - }, - "IMDB - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IMDB - # prompt tokens": { - "description": "min=1666.079, mean=1666.079, max=1666.079, sum=1666.079 (1)", - "tab": "General 
information", - "score": 1666.079 - }, - "IMDB - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "IMDB - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "IMDB - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CivilComments", - "source_data": { - "dataset_name": "CivilComments", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on CivilComments", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.603, - "details": { - "description": "min=0.203, mean=0.603, max=0.918, sum=10.849 (18)", - "tab": "Accuracy", - "CivilComments - ECE (10-bin)": { - "description": "9 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "CivilComments - EM (Robustness)": { - "description": "min=0.135, mean=0.523, max=0.864, sum=9.414 (18)", - "tab": "Robustness", - "score": 0.5230033316869794 - }, - "CivilComments - EM (Fairness)": { - "description": "min=0.16, mean=0.462, max=0.762, sum=8.312 (18)", - "tab": "Fairness", - "score": 0.4617550507789773 - }, - "CivilComments - Denoised inference time (s)": { - "description": "9 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "CivilComments - # eval": { - "description": "min=74, mean=371.556, max=683, sum=6688 (18)", - "tab": "General information", - "score": 371.55555555555554 - }, - "CivilComments - # train": { - "description": "min=5, mean=5, max=5, sum=90 (18)", - "tab": "General information", - "score": 5.0 - }, - "CivilComments - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (18)", - "tab": "General information", - "score": 0.0 - }, - "CivilComments - # prompt tokens": { - "description": "min=367.585, mean=782.759, max=1312.924, sum=14089.663 (18)", - "tab": "General information", - "score": 782.7590374602355 - }, - "CivilComments - # output tokens": { - "description": "min=1, mean=1, max=1, sum=18 (18)", - "tab": "General information", - "score": 1.0 - }, - "CivilComments - # trials": { - "description": "min=1, mean=1, max=1, sum=18 (18)", - "tab": "General information", - "score": 1.0 - }, - "CivilComments - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - 
"score": null - }, - "CivilComments - Toxic fraction": { - "description": "9 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "RAFT", - "source_data": { - "dataset_name": "RAFT", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on RAFT", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.586, - "details": { - "description": "min=0.175, mean=0.586, max=0.925, sum=6.45 (11)", - "tab": "Accuracy", - "RAFT - ECE (10-bin)": { - "description": "11 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "RAFT - EM (Robustness)": { - "description": "min=0.025, mean=0.523, max=0.875, sum=5.75 (11)", - "tab": "Robustness", - "score": 0.5227272727272726 - }, - "RAFT - EM (Fairness)": { - "description": "min=0.175, mean=0.561, max=0.875, sum=6.175 (11)", - "tab": "Fairness", - "score": 0.5613636363636363 - }, - "RAFT - Denoised inference time (s)": { - "description": "11 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "RAFT - # eval": { - "description": "min=40, mean=40, max=40, sum=440 (11)", - "tab": "General information", - "score": 40.0 - }, - "RAFT - # train": { - "description": "min=0.7, mean=4.6, max=5, sum=50.6 (11)", - "tab": "General information", - "score": 4.6000000000000005 - }, - "RAFT - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (11)", - "tab": "General information", - "score": 0.0 - }, - "RAFT - # prompt tokens": { - "description": "min=289.025, mean=877.464, max=1772.5, sum=9652.1 (11)", - "tab": "General information", - "score": 877.4636363636364 - }, - "RAFT - # output tokens": { - "description": "min=0.825, mean=0.984, max=1, sum=10.825 (11)", - "tab": "General information", - "score": 0.984090909090909 - }, - "RAFT - # trials": { - "description": "min=1, mean=1, max=1, sum=11 (11)", - "tab": "General information", - "score": 1.0 - }, - "RAFT - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Toxic fraction": { - "description": "11 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_classic/tiiuae/Falcon-Instruct-7B/a2b4ed40-b04f-481f-986b-25a2c26bbb79.json b/data/helm_classic/tiiuae/Falcon-Instruct-7B/a2b4ed40-b04f-481f-986b-25a2c26bbb79.json deleted file mode 100644 index cd7efa818..000000000 --- a/data/helm_classic/tiiuae/Falcon-Instruct-7B/a2b4ed40-b04f-481f-986b-25a2c26bbb79.json +++ /dev/null @@ -1,1613 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_classic/tiiuae_Falcon-Instruct-7B/1770834891.1472661", - "retrieved_timestamp": "1770834891.1472661", - "source_metadata": { - "source_name": "helm_classic", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - 
}, - "model_info": { - "name": "Falcon-Instruct 7B", - "id": "tiiuae/Falcon-Instruct-7B", - "developer": "tiiuae", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_classic", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperform on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.244, - "details": { - "tab": "Accuracy", - "Mean win rate - Calibration": { - "description": null, - "tab": "Calibration", - "score": null - }, - "Mean win rate - Robustness": { - "description": null, - "tab": "Robustness", - "score": 0.3032867132867133 - }, - "Mean win rate - Fairness": { - "description": null, - "tab": "Fairness", - "score": 0.2968298368298368 - }, - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": null - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - }, - "Mean win rate - Bias": { - "description": null, - "tab": "Bias", - "score": 0.514714004225644 - }, - "Mean win rate - Toxicity": { - "description": null, - "tab": "Toxicity", - "score": 0.29545454545454547 - }, - "Mean win rate - Summarization metrics": { - "description": null, - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.275, - "details": { - "description": "min=0.21, mean=0.275, max=0.34, sum=1.374 (5)", - "tab": "Accuracy", - "MMLU - ECE (10-bin)": { - "description": "5 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "MMLU - EM (Robustness)": { - "description": "min=0.2, mean=0.25, max=0.32, sum=1.248 (5)", - "tab": "Robustness", - "score": 0.24961403508771932 - }, - "MMLU - EM (Fairness)": { - "description": "min=0.2, mean=0.261, max=0.32, sum=1.307 (5)", - "tab": "Fairness", - "score": 0.2613684210526316 - }, - "MMLU - Denoised inference time (s)": { - "description": "5 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=389.6, mean=500.12, max=664.281, sum=2500.601 (5)", - "tab": "General information", - "score": 500.12014035087725 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "MMLU - # trials": { - "description": "min=1, mean=1, 
max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "BoolQ", - "source_data": { - "dataset_name": "BoolQ", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on BoolQ", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.72, - "details": { - "description": "min=0.72, mean=0.72, max=0.72, sum=0.72 (1)", - "tab": "Accuracy", - "BoolQ - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "BoolQ - EM (Robustness)": { - "description": "min=0.593, mean=0.593, max=0.593, sum=0.593 (1)", - "tab": "Robustness", - "score": 0.593 - }, - "BoolQ - EM (Fairness)": { - "description": "min=0.637, mean=0.637, max=0.637, sum=0.637 (1)", - "tab": "Fairness", - "score": 0.637 - }, - "BoolQ - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "BoolQ - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "BoolQ - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "BoolQ - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "BoolQ - # prompt tokens": { - "description": "min=1284.629, mean=1284.629, max=1284.629, sum=1284.629 (1)", - "tab": "General information", - "score": 1284.629 - }, - "BoolQ - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "BoolQ - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "BoolQ - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.476, - "details": { - "description": "min=0.476, mean=0.476, max=0.476, sum=0.476 (1)", - "tab": "Accuracy", - "NarrativeQA - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "NarrativeQA - F1 (Robustness)": { - "description": "min=0.258, mean=0.258, max=0.258, sum=0.258 (1)", - "tab": "Robustness", - "score": 0.2582769089885097 - }, - "NarrativeQA - 
F1 (Fairness)": { - "description": "min=0.354, mean=0.354, max=0.354, sum=0.354 (1)", - "tab": "Fairness", - "score": 0.3536054591455644 - }, - "NarrativeQA - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=2.025, mean=2.025, max=2.025, sum=2.025 (1)", - "tab": "General information", - "score": 2.0253521126760563 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=1694.082, mean=1694.082, max=1694.082, sum=1694.082 (1)", - "tab": "General information", - "score": 1694.081690140845 - }, - "NarrativeQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NarrativeQA - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NarrativeQA - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NarrativeQA - Stereotypes (gender)": { - "description": "min=0.444, mean=0.444, max=0.444, sum=0.444 (1)", - "tab": "Bias", - "score": 0.4444444444444444 - }, - "NarrativeQA - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NarrativeQA - Representation (gender)": { - "description": "min=0.187, mean=0.187, max=0.187, sum=0.187 (1)", - "tab": "Bias", - "score": 0.1870229007633588 - }, - "NarrativeQA - Toxic fraction": { - "description": "min=0.017, mean=0.017, max=0.017, sum=0.017 (1)", - "tab": "Toxicity", - "score": 0.016901408450704224 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (open-book)", - "source_data": { - "dataset_name": "NaturalQuestions (open-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (open-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.449, - "details": { - "description": "min=0.449, mean=0.449, max=0.449, sum=0.449 (1)", - "tab": "Accuracy", - "NaturalQuestions (closed-book) - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "NaturalQuestions (open-book) - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "NaturalQuestions (closed-book) - F1 (Robustness)": { - "description": "min=0.132, mean=0.132, max=0.132, sum=0.132 (1)", - "tab": "Robustness", - "score": 0.1322266230747346 - }, - "NaturalQuestions (open-book) - F1 (Robustness)": { - "description": "min=0.327, mean=0.327, max=0.327, sum=0.327 (1)", - "tab": "Robustness", - "score": 0.32667933185026377 - }, - "NaturalQuestions (closed-book) - F1 (Fairness)": { - "description": "min=0.148, mean=0.148, max=0.148, sum=0.148 (1)", - "tab": "Fairness", - "score": 0.14824932914209746 - }, - "NaturalQuestions (open-book) - F1 (Fairness)": { - "description": "min=0.383, mean=0.383, max=0.383, sum=0.383 (1)", - 
"tab": "Fairness", - "score": 0.38333017617065734 - }, - "NaturalQuestions (closed-book) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NaturalQuestions (open-book) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=124.246, mean=124.246, max=124.246, sum=124.246 (1)", - "tab": "General information", - "score": 124.246 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=0.999, mean=0.999, max=0.999, sum=0.999 (1)", - "tab": "General information", - "score": 0.999 - }, - "NaturalQuestions (closed-book) - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.599, mean=4.599, max=4.599, sum=4.599 (1)", - "tab": "General information", - "score": 4.599 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.039, mean=0.039, max=0.039, sum=0.039 (1)", - "tab": "General information", - "score": 0.039 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1587.334, mean=1587.334, max=1587.334, sum=1587.334 (1)", - "tab": "General information", - "score": 1587.334 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=0.984, mean=0.984, max=0.984, sum=0.984 (1)", - "tab": "General information", - "score": 0.984 - }, - "NaturalQuestions (open-book) - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NaturalQuestions (closed-book) - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=0.667 (1)", - "tab": "Bias", - "score": 0.6666666666666667 - }, - "NaturalQuestions (closed-book) - Stereotypes (gender)": { - "description": "min=0.5, mean=0.5, max=0.5, sum=0.5 (1)", - "tab": "Bias", - "score": 0.5 - }, - "NaturalQuestions (closed-book) - Representation (race)": { - "description": "min=0.272, mean=0.272, max=0.272, sum=0.272 (1)", - "tab": "Bias", - "score": 0.2716049382716049 - }, - "NaturalQuestions (closed-book) - Representation (gender)": { - "description": "min=0.071, mean=0.071, max=0.071, sum=0.071 (1)", - "tab": "Bias", - "score": 0.07142857142857142 - }, - "NaturalQuestions (open-book) - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NaturalQuestions (open-book) - Stereotypes (gender)": { - "description": "min=0.5, mean=0.5, max=0.5, sum=0.5 (1)", - "tab": "Bias", - "score": 0.5 - }, - "NaturalQuestions (open-book) - Representation (race)": { - "description": "min=0.426, mean=0.426, max=0.426, sum=0.426 (1)", - "tab": "Bias", - "score": 
0.4257907542579076 - }, - "NaturalQuestions (open-book) - Representation (gender)": { - "description": "min=0.068, mean=0.068, max=0.068, sum=0.068 (1)", - "tab": "Bias", - "score": 0.0684931506849315 - }, - "NaturalQuestions (closed-book) - Toxic fraction": { - "description": "min=0.001, mean=0.001, max=0.001, sum=0.001 (1)", - "tab": "Toxicity", - "score": 0.001 - }, - "NaturalQuestions (open-book) - Toxic fraction": { - "description": "min=0.001, mean=0.001, max=0.001, sum=0.001 (1)", - "tab": "Toxicity", - "score": 0.001 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "QuAC", - "source_data": { - "dataset_name": "QuAC", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on QuAC", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.311, - "details": { - "description": "min=0.311, mean=0.311, max=0.311, sum=0.311 (1)", - "tab": "Accuracy", - "QuAC - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "QuAC - F1 (Robustness)": { - "description": "min=0.179, mean=0.179, max=0.179, sum=0.179 (1)", - "tab": "Robustness", - "score": 0.1789889679486199 - }, - "QuAC - F1 (Fairness)": { - "description": "min=0.219, mean=0.219, max=0.219, sum=0.219 (1)", - "tab": "Fairness", - "score": 0.21915649953692506 - }, - "QuAC - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "QuAC - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "QuAC - # train": { - "description": "min=0.862, mean=0.862, max=0.862, sum=0.862 (1)", - "tab": "General information", - "score": 0.862 - }, - "QuAC - truncated": { - "description": "min=0.031, mean=0.031, max=0.031, sum=0.031 (1)", - "tab": "General information", - "score": 0.031 - }, - "QuAC - # prompt tokens": { - "description": "min=1667.28, mean=1667.28, max=1667.28, sum=1667.28 (1)", - "tab": "General information", - "score": 1667.28 - }, - "QuAC - # output tokens": { - "description": "min=0.997, mean=0.997, max=0.997, sum=0.997 (1)", - "tab": "General information", - "score": 0.997 - }, - "QuAC - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "QuAC - Stereotypes (race)": { - "description": "min=0.625, mean=0.625, max=0.625, sum=0.625 (1)", - "tab": "Bias", - "score": 0.625 - }, - "QuAC - Stereotypes (gender)": { - "description": "min=0.456, mean=0.456, max=0.456, sum=0.456 (1)", - "tab": "Bias", - "score": 0.4561372269705603 - }, - "QuAC - Representation (race)": { - "description": "min=0.262, mean=0.262, max=0.262, sum=0.262 (1)", - "tab": "Bias", - "score": 0.26241134751773054 - }, - "QuAC - Representation (gender)": { - "description": "min=0.251, mean=0.251, max=0.251, sum=0.251 (1)", - "tab": "Bias", - "score": 0.25052854122621565 - }, - "QuAC - Toxic fraction": { - "description": "min=0.002, mean=0.002, max=0.002, sum=0.002 (1)", - "tab": "Toxicity", - "score": 0.002 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "HellaSwag", - "source_data": { - "dataset_name": "HellaSwag", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on HellaSwag", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "HellaSwag - ECE (10-bin)": { - "description": "No matching runs", - "tab": "Calibration", - "score": null - }, - "HellaSwag - EM (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "HellaSwag - EM (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "HellaSwag - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "HellaSwag - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "OpenbookQA - ECE (10-bin)": { - "description": "No matching runs", - "tab": "Calibration", - "score": null - }, - "OpenbookQA - EM (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "OpenbookQA - EM (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "OpenbookQA - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "OpenbookQA - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "TruthfulQA", - "source_data": { - "dataset_name": "TruthfulQA", - 
"source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on TruthfulQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.213, - "details": { - "description": "min=0.213, mean=0.213, max=0.213, sum=0.213 (1)", - "tab": "Accuracy", - "TruthfulQA - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "TruthfulQA - EM (Robustness)": { - "description": "min=0.17, mean=0.17, max=0.17, sum=0.17 (1)", - "tab": "Robustness", - "score": 0.16972477064220184 - }, - "TruthfulQA - EM (Fairness)": { - "description": "min=0.183, mean=0.183, max=0.183, sum=0.183 (1)", - "tab": "Fairness", - "score": 0.1834862385321101 - }, - "TruthfulQA - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "TruthfulQA - # eval": { - "description": "min=654, mean=654, max=654, sum=654 (1)", - "tab": "General information", - "score": 654.0 - }, - "TruthfulQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "TruthfulQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "TruthfulQA - # prompt tokens": { - "description": "min=507.503, mean=507.503, max=507.503, sum=507.503 (1)", - "tab": "General information", - "score": 507.50305810397555 - }, - "TruthfulQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "TruthfulQA - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MS MARCO (TREC)", - "source_data": { - "dataset_name": "MS MARCO (TREC)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "NDCG@10 on MS MARCO (TREC)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "MS MARCO (regular) - RR@10 (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "MS MARCO (TREC) - NDCG@10 (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "MS MARCO (regular) - RR@10 (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "MS MARCO (TREC) - NDCG@10 (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "MS MARCO (regular) - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "MS MARCO (TREC) - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "MS MARCO (regular) - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # train": { - "description": "No matching runs", - "tab": "General 
information", - "score": null - }, - "MS MARCO (regular) - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - }, - "MS MARCO (TREC) - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CNN/DailyMail", - "source_data": { - "dataset_name": "CNN/DailyMail", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on CNN/DailyMail", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "CNN/DailyMail - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "CNN/DailyMail - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - truncated": 
{ - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - }, - "CNN/DailyMail - SummaC": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - QAFactEval": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - BERTScore (F1)": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - Coverage": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - Density": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - Compression": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-faithfulness": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-relevance": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-coherence": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "XSUM", - "source_data": { - "dataset_name": "XSUM", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on XSUM", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "XSUM - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "XSUM - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # trials": { - "description": "No 
matching runs", - "tab": "General information", - "score": null - }, - "XSUM - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - }, - "XSUM - SummaC": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - QAFactEval": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - BERTScore (F1)": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - Coverage": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - Density": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - Compression": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-faithfulness": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-relevance": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-coherence": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "IMDB", - "source_data": { - "dataset_name": "IMDB", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on IMDB", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.852, - "details": { - "description": "min=0.852, mean=0.852, max=0.852, sum=0.852 (1)", - "tab": "Accuracy", - "IMDB - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "IMDB - EM (Robustness)": { - "description": "min=0.759, mean=0.759, max=0.759, sum=0.759 (1)", - "tab": "Robustness", - "score": 0.759 - }, - "IMDB - EM (Fairness)": { - "description": "min=0.811, mean=0.811, max=0.811, sum=0.811 (1)", - "tab": "Fairness", - "score": 0.811 - }, - "IMDB - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "IMDB - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "IMDB - # train": { - "description": "min=2.871, mean=2.871, max=2.871, sum=2.871 (1)", - "tab": "General information", - "score": 2.871 - }, - "IMDB - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IMDB - # prompt tokens": { - "description": "min=1666.079, mean=1666.079, max=1666.079, sum=1666.079 (1)", - "tab": "General information", - "score": 1666.079 - }, - "IMDB - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General 
information", - "score": 1.0 - }, - "IMDB - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "IMDB - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CivilComments", - "source_data": { - "dataset_name": "CivilComments", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on CivilComments", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.511, - "details": { - "description": "min=0, mean=0.511, max=1, sum=9.199 (18)", - "tab": "Accuracy", - "CivilComments - ECE (10-bin)": { - "description": "9 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "CivilComments - EM (Robustness)": { - "description": "min=0, mean=0.487, max=0.999, sum=8.769 (18)", - "tab": "Robustness", - "score": 0.4871679045873981 - }, - "CivilComments - EM (Fairness)": { - "description": "min=0, mean=0.502, max=1, sum=9.031 (18)", - "tab": "Fairness", - "score": 0.5017354752179064 - }, - "CivilComments - Denoised inference time (s)": { - "description": "9 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "CivilComments - # eval": { - "description": "min=74, mean=371.556, max=683, sum=6688 (18)", - "tab": "General information", - "score": 371.55555555555554 - }, - "CivilComments - # train": { - "description": "min=5, mean=5, max=5, sum=90 (18)", - "tab": "General information", - "score": 5.0 - }, - "CivilComments - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (18)", - "tab": "General information", - "score": 0.0 - }, - "CivilComments - # prompt tokens": { - "description": "min=367.585, mean=782.759, max=1312.924, sum=14089.663 (18)", - "tab": "General information", - "score": 782.7590374602355 - }, - "CivilComments - # output tokens": { - "description": "min=1, mean=1, max=1, sum=18 (18)", - "tab": "General information", - "score": 1.0 - }, - "CivilComments - # trials": { - "description": "min=1, mean=1, max=1, sum=18 (18)", - "tab": "General information", - "score": 1.0 - }, - "CivilComments - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Toxic fraction": { - "description": "9 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - 
}, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "RAFT", - "source_data": { - "dataset_name": "RAFT", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on RAFT", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.523, - "details": { - "description": "min=0.15, mean=0.523, max=0.975, sum=5.75 (11)", - "tab": "Accuracy", - "RAFT - ECE (10-bin)": { - "description": "11 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "RAFT - EM (Robustness)": { - "description": "min=0.15, mean=0.445, max=0.975, sum=4.9 (11)", - "tab": "Robustness", - "score": 0.4454545454545454 - }, - "RAFT - EM (Fairness)": { - "description": "min=0.15, mean=0.5, max=0.975, sum=5.5 (11)", - "tab": "Fairness", - "score": 0.5 - }, - "RAFT - Denoised inference time (s)": { - "description": "11 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "RAFT - # eval": { - "description": "min=40, mean=40, max=40, sum=440 (11)", - "tab": "General information", - "score": 40.0 - }, - "RAFT - # train": { - "description": "min=0.7, mean=4.6, max=5, sum=50.6 (11)", - "tab": "General information", - "score": 4.6000000000000005 - }, - "RAFT - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (11)", - "tab": "General information", - "score": 0.0 - }, - "RAFT - # prompt tokens": { - "description": "min=289.025, mean=877.464, max=1772.5, sum=9652.1 (11)", - "tab": "General information", - "score": 877.4636363636364 - }, - "RAFT - # output tokens": { - "description": "min=0.95, mean=0.995, max=1, sum=10.95 (11)", - "tab": "General information", - "score": 0.9954545454545454 - }, - "RAFT - # trials": { - "description": "min=1, mean=1, max=1, sum=11 (11)", - "tab": "General information", - "score": 1.0 - }, - "RAFT - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Toxic fraction": { - "description": "11 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_classic/together/RedPajama-INCITE-Base-7B/e88f9163-5334-43ed-9b56-154bf543f898.json b/data/helm_classic/together/RedPajama-INCITE-Base-7B/e88f9163-5334-43ed-9b56-154bf543f898.json deleted file mode 100644 index f25c83f2e..000000000 --- a/data/helm_classic/together/RedPajama-INCITE-Base-7B/e88f9163-5334-43ed-9b56-154bf543f898.json +++ /dev/null @@ -1,1613 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_classic/together_RedPajama-INCITE-Base-7B/1770834891.1472661", - "retrieved_timestamp": "1770834891.1472661", - "source_metadata": { - "source_name": "helm_classic", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "RedPajama-INCITE-Base 7B", - "id": "together/RedPajama-INCITE-Base-7B", - "developer": "together", - "inference_platform": 
"unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_classic", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperform on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.378, - "details": { - "tab": "Accuracy", - "Mean win rate - Calibration": { - "description": null, - "tab": "Calibration", - "score": 0.40883441258094355 - }, - "Mean win rate - Robustness": { - "description": null, - "tab": "Robustness", - "score": 0.3311188811188811 - }, - "Mean win rate - Fairness": { - "description": null, - "tab": "Fairness", - "score": 0.3233799533799534 - }, - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": null - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - }, - "Mean win rate - Bias": { - "description": null, - "tab": "Bias", - "score": 0.41358382155085455 - }, - "Mean win rate - Toxicity": { - "description": null, - "tab": "Toxicity", - "score": 0.1998834498834499 - }, - "Mean win rate - Summarization metrics": { - "description": null, - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.302, - "details": { - "description": "min=0.228, mean=0.302, max=0.38, sum=1.508 (5)", - "tab": "Accuracy", - "MMLU - ECE (10-bin)": { - "description": "min=0.08, mean=0.098, max=0.13, sum=0.49 (5)", - "tab": "Calibration", - "score": 0.09791468112621773 - }, - "MMLU - EM (Robustness)": { - "description": "min=0.2, mean=0.25, max=0.33, sum=1.251 (5)", - "tab": "Robustness", - "score": 0.2501052631578947 - }, - "MMLU - EM (Fairness)": { - "description": "min=0.219, mean=0.276, max=0.34, sum=1.379 (5)", - "tab": "Fairness", - "score": 0.275859649122807 - }, - "MMLU - Denoised inference time (s)": { - "description": "5 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=358.76, mean=467.936, max=612.798, sum=2339.678 (5)", - "tab": "General information", - "score": 467.935649122807 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "MMLU - # trials": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": 
{ - "additional_details": {} - } - }, - { - "evaluation_name": "BoolQ", - "source_data": { - "dataset_name": "BoolQ", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on BoolQ", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.713, - "details": { - "description": "min=0.713, mean=0.713, max=0.713, sum=0.713 (1)", - "tab": "Accuracy", - "BoolQ - ECE (10-bin)": { - "description": "min=0.127, mean=0.127, max=0.127, sum=0.127 (1)", - "tab": "Calibration", - "score": 0.1268200294718189 - }, - "BoolQ - EM (Robustness)": { - "description": "min=0.569, mean=0.569, max=0.569, sum=0.569 (1)", - "tab": "Robustness", - "score": 0.569 - }, - "BoolQ - EM (Fairness)": { - "description": "min=0.65, mean=0.65, max=0.65, sum=0.65 (1)", - "tab": "Fairness", - "score": 0.65 - }, - "BoolQ - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "BoolQ - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "BoolQ - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "BoolQ - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "BoolQ - # prompt tokens": { - "description": "min=1251.897, mean=1251.897, max=1251.897, sum=1251.897 (1)", - "tab": "General information", - "score": 1251.897 - }, - "BoolQ - # output tokens": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "BoolQ - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "BoolQ - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.617, - "details": { - "description": "min=0.617, mean=0.617, max=0.617, sum=0.617 (1)", - "tab": "Accuracy", - "NarrativeQA - ECE (10-bin)": { - "description": "min=0.276, mean=0.276, max=0.276, sum=0.276 (1)", - "tab": "Calibration", - "score": 0.27605359630786236 - }, - "NarrativeQA - F1 (Robustness)": { - "description": "min=0.424, mean=0.424, max=0.424, sum=0.424 (1)", - "tab": "Robustness", - "score": 0.4240469400392869 - }, - "NarrativeQA - F1 (Fairness)": { - "description": "min=0.524, mean=0.524, 
max=0.524, sum=0.524 (1)", - "tab": "Fairness", - "score": 0.5239003837979788 - }, - "NarrativeQA - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=1.969, mean=1.969, max=1.969, sum=1.969 (1)", - "tab": "General information", - "score": 1.9690140845070423 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=1691.082, mean=1691.082, max=1691.082, sum=1691.082 (1)", - "tab": "General information", - "score": 1691.081690140845 - }, - "NarrativeQA - # output tokens": { - "description": "min=100, mean=100, max=100, sum=100 (1)", - "tab": "General information", - "score": 100.0 - }, - "NarrativeQA - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NarrativeQA - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NarrativeQA - Stereotypes (gender)": { - "description": "min=0.438, mean=0.438, max=0.438, sum=0.438 (1)", - "tab": "Bias", - "score": 0.4375 - }, - "NarrativeQA - Representation (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=0.667 (1)", - "tab": "Bias", - "score": 0.6666666666666667 - }, - "NarrativeQA - Representation (gender)": { - "description": "min=0.171, mean=0.171, max=0.171, sum=0.171 (1)", - "tab": "Bias", - "score": 0.17123287671232879 - }, - "NarrativeQA - Toxic fraction": { - "description": "min=0.014, mean=0.014, max=0.014, sum=0.014 (1)", - "tab": "Toxicity", - "score": 0.014084507042253521 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (open-book)", - "source_data": { - "dataset_name": "NaturalQuestions (open-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (open-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.586, - "details": { - "description": "min=0.586, mean=0.586, max=0.586, sum=0.586 (1)", - "tab": "Accuracy", - "NaturalQuestions (closed-book) - ECE (10-bin)": { - "description": "min=0.127, mean=0.127, max=0.127, sum=0.127 (1)", - "tab": "Calibration", - "score": 0.12699960693149975 - }, - "NaturalQuestions (open-book) - ECE (10-bin)": { - "description": "min=0.396, mean=0.396, max=0.396, sum=0.396 (1)", - "tab": "Calibration", - "score": 0.39598996118757757 - }, - "NaturalQuestions (closed-book) - F1 (Robustness)": { - "description": "min=0.167, mean=0.167, max=0.167, sum=0.167 (1)", - "tab": "Robustness", - "score": 0.1665503977180178 - }, - "NaturalQuestions (open-book) - F1 (Robustness)": { - "description": "min=0.472, mean=0.472, max=0.472, sum=0.472 (1)", - "tab": "Robustness", - "score": 0.47226706838923 - }, - "NaturalQuestions (closed-book) - F1 (Fairness)": { - "description": "min=0.193, mean=0.193, max=0.193, sum=0.193 (1)", - "tab": "Fairness", - "score": 0.19300226376410895 - }, - "NaturalQuestions (open-book) - F1 (Fairness)": { - "description": "min=0.514, 
mean=0.514, max=0.514, sum=0.514 (1)", - "tab": "Fairness", - "score": 0.5136843159783826 - }, - "NaturalQuestions (closed-book) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NaturalQuestions (open-book) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=117.299, mean=117.299, max=117.299, sum=117.299 (1)", - "tab": "General information", - "score": 117.299 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=300, mean=300, max=300, sum=300 (1)", - "tab": "General information", - "score": 300.0 - }, - "NaturalQuestions (closed-book) - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.704, mean=4.704, max=4.704, sum=4.704 (1)", - "tab": "General information", - "score": 4.704 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.037, mean=0.037, max=0.037, sum=0.037 (1)", - "tab": "General information", - "score": 0.037 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1495.552, mean=1495.552, max=1495.552, sum=1495.552 (1)", - "tab": "General information", - "score": 1495.552 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=300, mean=300, max=300, sum=300 (1)", - "tab": "General information", - "score": 300.0 - }, - "NaturalQuestions (open-book) - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NaturalQuestions (closed-book) - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NaturalQuestions (closed-book) - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NaturalQuestions (closed-book) - Representation (race)": { - "description": "min=0.498, mean=0.498, max=0.498, sum=0.498 (1)", - "tab": "Bias", - "score": 0.49783549783549785 - }, - "NaturalQuestions (closed-book) - Representation (gender)": { - "description": "min=0.289, mean=0.289, max=0.289, sum=0.289 (1)", - "tab": "Bias", - "score": 0.2894736842105263 - }, - "NaturalQuestions (open-book) - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=0.667 (1)", - "tab": "Bias", - "score": 0.6666666666666667 - }, - "NaturalQuestions (open-book) - Stereotypes (gender)": { - "description": "min=0.408, mean=0.408, max=0.408, sum=0.408 (1)", - "tab": "Bias", - "score": 0.4081597222222222 - }, - "NaturalQuestions (open-book) - Representation (race)": { - "description": "min=0.412, mean=0.412, max=0.412, sum=0.412 (1)", - "tab": "Bias", - 
"score": 0.4124293785310734 - }, - "NaturalQuestions (open-book) - Representation (gender)": { - "description": "min=0.256, mean=0.256, max=0.256, sum=0.256 (1)", - "tab": "Bias", - "score": 0.25630252100840334 - }, - "NaturalQuestions (closed-book) - Toxic fraction": { - "description": "min=0.002, mean=0.002, max=0.002, sum=0.002 (1)", - "tab": "Toxicity", - "score": 0.002 - }, - "NaturalQuestions (open-book) - Toxic fraction": { - "description": "min=0.001, mean=0.001, max=0.001, sum=0.001 (1)", - "tab": "Toxicity", - "score": 0.001 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "QuAC", - "source_data": { - "dataset_name": "QuAC", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on QuAC", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.336, - "details": { - "description": "min=0.336, mean=0.336, max=0.336, sum=0.336 (1)", - "tab": "Accuracy", - "QuAC - ECE (10-bin)": { - "description": "min=0.131, mean=0.131, max=0.131, sum=0.131 (1)", - "tab": "Calibration", - "score": 0.13131742636553145 - }, - "QuAC - F1 (Robustness)": { - "description": "min=0.186, mean=0.186, max=0.186, sum=0.186 (1)", - "tab": "Robustness", - "score": 0.18577129287689287 - }, - "QuAC - F1 (Fairness)": { - "description": "min=0.238, mean=0.238, max=0.238, sum=0.238 (1)", - "tab": "Fairness", - "score": 0.23848247289290064 - }, - "QuAC - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "QuAC - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "QuAC - # train": { - "description": "min=0.883, mean=0.883, max=0.883, sum=0.883 (1)", - "tab": "General information", - "score": 0.883 - }, - "QuAC - truncated": { - "description": "min=0.021, mean=0.021, max=0.021, sum=0.021 (1)", - "tab": "General information", - "score": 0.021 - }, - "QuAC - # prompt tokens": { - "description": "min=1655.708, mean=1655.708, max=1655.708, sum=1655.708 (1)", - "tab": "General information", - "score": 1655.708 - }, - "QuAC - # output tokens": { - "description": "min=100, mean=100, max=100, sum=100 (1)", - "tab": "General information", - "score": 100.0 - }, - "QuAC - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "QuAC - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=0.667 (1)", - "tab": "Bias", - "score": 0.6666666666666669 - }, - "QuAC - Stereotypes (gender)": { - "description": "min=0.382, mean=0.382, max=0.382, sum=0.382 (1)", - "tab": "Bias", - "score": 0.38163008049881736 - }, - "QuAC - Representation (race)": { - "description": "min=0.38, mean=0.38, max=0.38, sum=0.38 (1)", - "tab": "Bias", - "score": 0.3802816901408451 - }, - "QuAC - Representation (gender)": { - "description": "min=0.249, mean=0.249, max=0.249, sum=0.249 (1)", - "tab": "Bias", - "score": 0.24864864864864863 - }, - "QuAC - Toxic fraction": { - "description": "min=0.005, mean=0.005, max=0.005, sum=0.005 (1)", - "tab": "Toxicity", - "score": 0.005 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "HellaSwag", - "source_data": { - "dataset_name": "HellaSwag", 
- "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on HellaSwag", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "HellaSwag - ECE (10-bin)": { - "description": "No matching runs", - "tab": "Calibration", - "score": null - }, - "HellaSwag - EM (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "HellaSwag - EM (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "HellaSwag - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "HellaSwag - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "OpenbookQA - ECE (10-bin)": { - "description": "No matching runs", - "tab": "Calibration", - "score": null - }, - "OpenbookQA - EM (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "OpenbookQA - EM (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "OpenbookQA - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "OpenbookQA - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "TruthfulQA", - "source_data": { - 
"dataset_name": "TruthfulQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on TruthfulQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.205, - "details": { - "description": "min=0.205, mean=0.205, max=0.205, sum=0.205 (1)", - "tab": "Accuracy", - "TruthfulQA - ECE (10-bin)": { - "description": "min=0.063, mean=0.063, max=0.063, sum=0.063 (1)", - "tab": "Calibration", - "score": 0.06284277332135296 - }, - "TruthfulQA - EM (Robustness)": { - "description": "min=0.173, mean=0.173, max=0.173, sum=0.173 (1)", - "tab": "Robustness", - "score": 0.172782874617737 - }, - "TruthfulQA - EM (Fairness)": { - "description": "min=0.17, mean=0.17, max=0.17, sum=0.17 (1)", - "tab": "Fairness", - "score": 0.16972477064220184 - }, - "TruthfulQA - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "TruthfulQA - # eval": { - "description": "min=654, mean=654, max=654, sum=654 (1)", - "tab": "General information", - "score": 654.0 - }, - "TruthfulQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "TruthfulQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "TruthfulQA - # prompt tokens": { - "description": "min=505.352, mean=505.352, max=505.352, sum=505.352 (1)", - "tab": "General information", - "score": 505.35168195718654 - }, - "TruthfulQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "TruthfulQA - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MS MARCO (TREC)", - "source_data": { - "dataset_name": "MS MARCO (TREC)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "NDCG@10 on MS MARCO (TREC)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "MS MARCO (regular) - RR@10 (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "MS MARCO (TREC) - NDCG@10 (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "MS MARCO (regular) - RR@10 (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "MS MARCO (TREC) - NDCG@10 (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "MS MARCO (regular) - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "MS MARCO (TREC) - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "MS MARCO (regular) - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # train": { - 
"description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - }, - "MS MARCO (TREC) - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CNN/DailyMail", - "source_data": { - "dataset_name": "CNN/DailyMail", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on CNN/DailyMail", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "CNN/DailyMail - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "CNN/DailyMail - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # train": { - "description": "No matching runs", - "tab": "General information", 
- "score": null - }, - "CNN/DailyMail - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - }, - "CNN/DailyMail - SummaC": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - QAFactEval": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - BERTScore (F1)": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - Coverage": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - Density": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - Compression": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-faithfulness": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-relevance": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-coherence": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "XSUM", - "source_data": { - "dataset_name": "XSUM", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on XSUM", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "XSUM - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "XSUM - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": 
null - }, - "XSUM - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - }, - "XSUM - SummaC": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - QAFactEval": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - BERTScore (F1)": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - Coverage": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - Density": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - Compression": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-faithfulness": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-relevance": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-coherence": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "IMDB", - "source_data": { - "dataset_name": "IMDB", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on IMDB", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.752, - "details": { - "description": "min=0.752, mean=0.752, max=0.752, sum=0.752 (1)", - "tab": "Accuracy", - "IMDB - ECE (10-bin)": { - "description": "min=0.206, mean=0.206, max=0.206, sum=0.206 (1)", - "tab": "Calibration", - "score": 0.20649886073889429 - }, - "IMDB - EM (Robustness)": { - "description": "min=0.56, mean=0.56, max=0.56, sum=0.56 (1)", - "tab": "Robustness", - "score": 0.56 - }, - "IMDB - EM (Fairness)": { - "description": "min=0.694, mean=0.694, max=0.694, sum=0.694 (1)", - "tab": "Fairness", - "score": 0.694 - }, - "IMDB - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "IMDB - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "IMDB - # train": { - "description": "min=2.911, mean=2.911, max=2.911, sum=2.911 (1)", - "tab": "General information", - "score": 2.911 - }, - "IMDB - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IMDB - # prompt tokens": { - "description": "min=1619.568, mean=1619.568, max=1619.568, sum=1619.568 (1)", - "tab": "General information", - "score": 1619.568 - }, - "IMDB - # output tokens": { - 
"description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "IMDB - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "IMDB - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CivilComments", - "source_data": { - "dataset_name": "CivilComments", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on CivilComments", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.547, - "details": { - "description": "min=0.064, mean=0.547, max=0.954, sum=9.838 (18)", - "tab": "Accuracy", - "CivilComments - ECE (10-bin)": { - "description": "min=0.109, mean=0.305, max=0.471, sum=5.486 (18)", - "tab": "Calibration", - "score": 0.3047575712176879 - }, - "CivilComments - EM (Robustness)": { - "description": "min=0.054, mean=0.401, max=0.835, sum=7.221 (18)", - "tab": "Robustness", - "score": 0.4011569280490217 - }, - "CivilComments - EM (Fairness)": { - "description": "min=0.06, mean=0.431, max=0.811, sum=7.756 (18)", - "tab": "Fairness", - "score": 0.43087088541137863 - }, - "CivilComments - Denoised inference time (s)": { - "description": "9 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "CivilComments - # eval": { - "description": "min=74, mean=371.556, max=683, sum=6688 (18)", - "tab": "General information", - "score": 371.55555555555554 - }, - "CivilComments - # train": { - "description": "min=5, mean=5, max=5, sum=90 (18)", - "tab": "General information", - "score": 5.0 - }, - "CivilComments - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (18)", - "tab": "General information", - "score": 0.0 - }, - "CivilComments - # prompt tokens": { - "description": "min=360.976, mean=771.654, max=1282.4, sum=13889.772 (18)", - "tab": "General information", - "score": 771.6539847352628 - }, - "CivilComments - # output tokens": { - "description": "min=5, mean=5, max=5, sum=90 (18)", - "tab": "General information", - "score": 5.0 - }, - "CivilComments - # trials": { - "description": "min=1, mean=1, max=1, sum=18 (18)", - "tab": "General information", - "score": 1.0 - }, - "CivilComments - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Toxic fraction": { - 
"description": "9 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "RAFT", - "source_data": { - "dataset_name": "RAFT", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on RAFT", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.648, - "details": { - "description": "min=0.3, mean=0.648, max=0.925, sum=7.125 (11)", - "tab": "Accuracy", - "RAFT - ECE (10-bin)": { - "description": "min=0.3, mean=0.648, max=0.925, sum=7.123 (11)", - "tab": "Calibration", - "score": 0.6475429539256364 - }, - "RAFT - EM (Robustness)": { - "description": "min=0, mean=0.489, max=0.925, sum=5.375 (11)", - "tab": "Robustness", - "score": 0.48863636363636365 - }, - "RAFT - EM (Fairness)": { - "description": "min=0.275, mean=0.595, max=0.925, sum=6.55 (11)", - "tab": "Fairness", - "score": 0.5954545454545455 - }, - "RAFT - Denoised inference time (s)": { - "description": "11 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "RAFT - # eval": { - "description": "min=40, mean=40, max=40, sum=440 (11)", - "tab": "General information", - "score": 40.0 - }, - "RAFT - # train": { - "description": "min=0.7, mean=4.605, max=5, sum=50.65 (11)", - "tab": "General information", - "score": 4.6045454545454545 - }, - "RAFT - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (11)", - "tab": "General information", - "score": 0.0 - }, - "RAFT - # prompt tokens": { - "description": "min=280.35, mean=869.691, max=1756.575, sum=9566.6 (11)", - "tab": "General information", - "score": 869.6909090909089 - }, - "RAFT - # output tokens": { - "description": "min=30, mean=30, max=30, sum=330 (11)", - "tab": "General information", - "score": 30.0 - }, - "RAFT - # trials": { - "description": "min=1, mean=1, max=1, sum=11 (11)", - "tab": "General information", - "score": 1.0 - }, - "RAFT - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Toxic fraction": { - "description": "11 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_classic/together/RedPajama-INCITE-Base-v1-3B/6d436bd5-9d49-4895-8c07-7814b2eef12c.json b/data/helm_classic/together/RedPajama-INCITE-Base-v1-3B/6d436bd5-9d49-4895-8c07-7814b2eef12c.json deleted file mode 100644 index d4d85552c..000000000 --- a/data/helm_classic/together/RedPajama-INCITE-Base-v1-3B/6d436bd5-9d49-4895-8c07-7814b2eef12c.json +++ /dev/null @@ -1,1613 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_classic/together_RedPajama-INCITE-Base-v1-3B/1770834891.1472661", - "retrieved_timestamp": "1770834891.1472661", - "source_metadata": { - "source_name": "helm_classic", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - 
"model_info": { - "name": "RedPajama-INCITE-Base-v1 3B", - "id": "together/RedPajama-INCITE-Base-v1-3B", - "developer": "together", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_classic", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperform on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.311, - "details": { - "tab": "Accuracy", - "Mean win rate - Calibration": { - "description": null, - "tab": "Calibration", - "score": 0.4387141535615171 - }, - "Mean win rate - Robustness": { - "description": null, - "tab": "Robustness", - "score": 0.293006993006993 - }, - "Mean win rate - Fairness": { - "description": null, - "tab": "Fairness", - "score": 0.26995337995338 - }, - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": null - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - }, - "Mean win rate - Bias": { - "description": null, - "tab": "Bias", - "score": 0.4599624127215427 - }, - "Mean win rate - Toxicity": { - "description": null, - "tab": "Toxicity", - "score": 0.7068181818181818 - }, - "Mean win rate - Summarization metrics": { - "description": null, - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.263, - "details": { - "description": "min=0.24, mean=0.263, max=0.3, sum=1.314 (5)", - "tab": "Accuracy", - "MMLU - ECE (10-bin)": { - "description": "min=0.082, mean=0.115, max=0.149, sum=0.575 (5)", - "tab": "Calibration", - "score": 0.11506526711032969 - }, - "MMLU - EM (Robustness)": { - "description": "min=0.184, mean=0.217, max=0.29, sum=1.084 (5)", - "tab": "Robustness", - "score": 0.2168421052631579 - }, - "MMLU - EM (Fairness)": { - "description": "min=0.2, mean=0.232, max=0.29, sum=1.161 (5)", - "tab": "Fairness", - "score": 0.23210526315789473 - }, - "MMLU - Denoised inference time (s)": { - "description": "5 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=358.76, mean=467.936, max=612.798, sum=2339.678 (5)", - "tab": "General information", - "score": 467.935649122807 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - 
"MMLU - # trials": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "BoolQ", - "source_data": { - "dataset_name": "BoolQ", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on BoolQ", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.685, - "details": { - "description": "min=0.685, mean=0.685, max=0.685, sum=0.685 (1)", - "tab": "Accuracy", - "BoolQ - ECE (10-bin)": { - "description": "min=0.187, mean=0.187, max=0.187, sum=0.187 (1)", - "tab": "Calibration", - "score": 0.1865846445420437 - }, - "BoolQ - EM (Robustness)": { - "description": "min=0.585, mean=0.585, max=0.585, sum=0.585 (1)", - "tab": "Robustness", - "score": 0.585 - }, - "BoolQ - EM (Fairness)": { - "description": "min=0.624, mean=0.624, max=0.624, sum=0.624 (1)", - "tab": "Fairness", - "score": 0.624 - }, - "BoolQ - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "BoolQ - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "BoolQ - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "BoolQ - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "BoolQ - # prompt tokens": { - "description": "min=1251.897, mean=1251.897, max=1251.897, sum=1251.897 (1)", - "tab": "General information", - "score": 1251.897 - }, - "BoolQ - # output tokens": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "BoolQ - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "BoolQ - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.555, - "details": { - "description": "min=0.555, mean=0.555, max=0.555, sum=0.555 (1)", - "tab": "Accuracy", - "NarrativeQA - ECE (10-bin)": { - "description": "min=0.234, mean=0.234, max=0.234, sum=0.234 (1)", - "tab": "Calibration", - "score": 0.2338003327407993 - }, - "NarrativeQA - F1 (Robustness)": { - "description": "min=0.346, mean=0.346, 
max=0.346, sum=0.346 (1)", - "tab": "Robustness", - "score": 0.3460535146763825 - }, - "NarrativeQA - F1 (Fairness)": { - "description": "min=0.42, mean=0.42, max=0.42, sum=0.42 (1)", - "tab": "Fairness", - "score": 0.42019517663794076 - }, - "NarrativeQA - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=1.969, mean=1.969, max=1.969, sum=1.969 (1)", - "tab": "General information", - "score": 1.9690140845070423 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=1691.082, mean=1691.082, max=1691.082, sum=1691.082 (1)", - "tab": "General information", - "score": 1691.081690140845 - }, - "NarrativeQA - # output tokens": { - "description": "min=100, mean=100, max=100, sum=100 (1)", - "tab": "General information", - "score": 100.0 - }, - "NarrativeQA - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NarrativeQA - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NarrativeQA - Stereotypes (gender)": { - "description": "min=0.463, mean=0.463, max=0.463, sum=0.463 (1)", - "tab": "Bias", - "score": 0.4629629629629629 - }, - "NarrativeQA - Representation (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=0.667 (1)", - "tab": "Bias", - "score": 0.6666666666666667 - }, - "NarrativeQA - Representation (gender)": { - "description": "min=0.167, mean=0.167, max=0.167, sum=0.167 (1)", - "tab": "Bias", - "score": 0.16666666666666666 - }, - "NarrativeQA - Toxic fraction": { - "description": "min=0.008, mean=0.008, max=0.008, sum=0.008 (1)", - "tab": "Toxicity", - "score": 0.008450704225352112 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (open-book)", - "source_data": { - "dataset_name": "NaturalQuestions (open-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (open-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.52, - "details": { - "description": "min=0.52, mean=0.52, max=0.52, sum=0.52 (1)", - "tab": "Accuracy", - "NaturalQuestions (closed-book) - ECE (10-bin)": { - "description": "min=0.116, mean=0.116, max=0.116, sum=0.116 (1)", - "tab": "Calibration", - "score": 0.1159999973291356 - }, - "NaturalQuestions (open-book) - ECE (10-bin)": { - "description": "min=0.345, mean=0.345, max=0.345, sum=0.345 (1)", - "tab": "Calibration", - "score": 0.34498406074093657 - }, - "NaturalQuestions (closed-book) - F1 (Robustness)": { - "description": "min=0.134, mean=0.134, max=0.134, sum=0.134 (1)", - "tab": "Robustness", - "score": 0.1341635313992508 - }, - "NaturalQuestions (open-book) - F1 (Robustness)": { - "description": "min=0.396, mean=0.396, max=0.396, sum=0.396 (1)", - "tab": "Robustness", - "score": 0.3964044537010397 - }, - "NaturalQuestions (closed-book) - F1 (Fairness)": { - "description": "min=0.145, mean=0.145, 
max=0.145, sum=0.145 (1)", - "tab": "Fairness", - "score": 0.14546689822682907 - }, - "NaturalQuestions (open-book) - F1 (Fairness)": { - "description": "min=0.452, mean=0.452, max=0.452, sum=0.452 (1)", - "tab": "Fairness", - "score": 0.4521647378074364 - }, - "NaturalQuestions (closed-book) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NaturalQuestions (open-book) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=117.299, mean=117.299, max=117.299, sum=117.299 (1)", - "tab": "General information", - "score": 117.299 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=300, mean=300, max=300, sum=300 (1)", - "tab": "General information", - "score": 300.0 - }, - "NaturalQuestions (closed-book) - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.704, mean=4.704, max=4.704, sum=4.704 (1)", - "tab": "General information", - "score": 4.704 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.037, mean=0.037, max=0.037, sum=0.037 (1)", - "tab": "General information", - "score": 0.037 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1495.552, mean=1495.552, max=1495.552, sum=1495.552 (1)", - "tab": "General information", - "score": 1495.552 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=299.738, mean=299.738, max=299.738, sum=299.738 (1)", - "tab": "General information", - "score": 299.738 - }, - "NaturalQuestions (open-book) - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NaturalQuestions (closed-book) - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NaturalQuestions (closed-book) - Stereotypes (gender)": { - "description": "min=0.5, mean=0.5, max=0.5, sum=0.5 (1)", - "tab": "Bias", - "score": 0.5 - }, - "NaturalQuestions (closed-book) - Representation (race)": { - "description": "min=0.46, mean=0.46, max=0.46, sum=0.46 (1)", - "tab": "Bias", - "score": 0.4597701149425287 - }, - "NaturalQuestions (closed-book) - Representation (gender)": { - "description": "min=0.182, mean=0.182, max=0.182, sum=0.182 (1)", - "tab": "Bias", - "score": 0.18181818181818182 - }, - "NaturalQuestions (open-book) - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=0.667 (1)", - "tab": "Bias", - "score": 0.6666666666666667 - }, - "NaturalQuestions (open-book) - Stereotypes (gender)": { - "description": "min=0.464, mean=0.464, max=0.464, 
sum=0.464 (1)", - "tab": "Bias", - "score": 0.4642857142857143 - }, - "NaturalQuestions (open-book) - Representation (race)": { - "description": "min=0.453, mean=0.453, max=0.453, sum=0.453 (1)", - "tab": "Bias", - "score": 0.45299145299145294 - }, - "NaturalQuestions (open-book) - Representation (gender)": { - "description": "min=0.242, mean=0.242, max=0.242, sum=0.242 (1)", - "tab": "Bias", - "score": 0.24223602484472045 - }, - "NaturalQuestions (closed-book) - Toxic fraction": { - "description": "min=0.001, mean=0.001, max=0.001, sum=0.001 (1)", - "tab": "Toxicity", - "score": 0.001 - }, - "NaturalQuestions (open-book) - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "QuAC", - "source_data": { - "dataset_name": "QuAC", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on QuAC", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.309, - "details": { - "description": "min=0.309, mean=0.309, max=0.309, sum=0.309 (1)", - "tab": "Accuracy", - "QuAC - ECE (10-bin)": { - "description": "min=0.078, mean=0.078, max=0.078, sum=0.078 (1)", - "tab": "Calibration", - "score": 0.07775925403447285 - }, - "QuAC - F1 (Robustness)": { - "description": "min=0.177, mean=0.177, max=0.177, sum=0.177 (1)", - "tab": "Robustness", - "score": 0.17735561911839576 - }, - "QuAC - F1 (Fairness)": { - "description": "min=0.238, mean=0.238, max=0.238, sum=0.238 (1)", - "tab": "Fairness", - "score": 0.23753496056157644 - }, - "QuAC - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "QuAC - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "QuAC - # train": { - "description": "min=0.883, mean=0.883, max=0.883, sum=0.883 (1)", - "tab": "General information", - "score": 0.883 - }, - "QuAC - truncated": { - "description": "min=0.021, mean=0.021, max=0.021, sum=0.021 (1)", - "tab": "General information", - "score": 0.021 - }, - "QuAC - # prompt tokens": { - "description": "min=1655.708, mean=1655.708, max=1655.708, sum=1655.708 (1)", - "tab": "General information", - "score": 1655.708 - }, - "QuAC - # output tokens": { - "description": "min=100, mean=100, max=100, sum=100 (1)", - "tab": "General information", - "score": 100.0 - }, - "QuAC - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "QuAC - Stereotypes (race)": { - "description": "min=0.575, mean=0.575, max=0.575, sum=0.575 (1)", - "tab": "Bias", - "score": 0.575 - }, - "QuAC - Stereotypes (gender)": { - "description": "min=0.389, mean=0.389, max=0.389, sum=0.389 (1)", - "tab": "Bias", - "score": 0.38936550778656037 - }, - "QuAC - Representation (race)": { - "description": "min=0.3, mean=0.3, max=0.3, sum=0.3 (1)", - "tab": "Bias", - "score": 0.3003300330033003 - }, - "QuAC - Representation (gender)": { - "description": "min=0.269, mean=0.269, max=0.269, sum=0.269 (1)", - "tab": "Bias", - "score": 0.268640350877193 - }, - "QuAC - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "Toxicity", - "score": 0.0 - 
} - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "HellaSwag", - "source_data": { - "dataset_name": "HellaSwag", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on HellaSwag", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "HellaSwag - ECE (10-bin)": { - "description": "No matching runs", - "tab": "Calibration", - "score": null - }, - "HellaSwag - EM (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "HellaSwag - EM (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "HellaSwag - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "HellaSwag - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "OpenbookQA - ECE (10-bin)": { - "description": "No matching runs", - "tab": "Calibration", - "score": null - }, - "OpenbookQA - EM (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "OpenbookQA - EM (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "OpenbookQA - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "OpenbookQA - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # trials": { - "description": "No matching runs", - "tab": "General 
information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "TruthfulQA", - "source_data": { - "dataset_name": "TruthfulQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on TruthfulQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.277, - "details": { - "description": "min=0.277, mean=0.277, max=0.277, sum=0.277 (1)", - "tab": "Accuracy", - "TruthfulQA - ECE (10-bin)": { - "description": "min=0.048, mean=0.048, max=0.048, sum=0.048 (1)", - "tab": "Calibration", - "score": 0.04833037892853392 - }, - "TruthfulQA - EM (Robustness)": { - "description": "min=0.226, mean=0.226, max=0.226, sum=0.226 (1)", - "tab": "Robustness", - "score": 0.22629969418960244 - }, - "TruthfulQA - EM (Fairness)": { - "description": "min=0.248, mean=0.248, max=0.248, sum=0.248 (1)", - "tab": "Fairness", - "score": 0.24770642201834864 - }, - "TruthfulQA - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "TruthfulQA - # eval": { - "description": "min=654, mean=654, max=654, sum=654 (1)", - "tab": "General information", - "score": 654.0 - }, - "TruthfulQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "TruthfulQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "TruthfulQA - # prompt tokens": { - "description": "min=505.352, mean=505.352, max=505.352, sum=505.352 (1)", - "tab": "General information", - "score": 505.35168195718654 - }, - "TruthfulQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "TruthfulQA - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MS MARCO (TREC)", - "source_data": { - "dataset_name": "MS MARCO (TREC)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "NDCG@10 on MS MARCO (TREC)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "MS MARCO (regular) - RR@10 (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "MS MARCO (TREC) - NDCG@10 (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "MS MARCO (regular) - RR@10 (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "MS MARCO (TREC) - NDCG@10 (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "MS MARCO (regular) - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "MS MARCO (TREC) - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "MS 
MARCO (regular) - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - }, - "MS MARCO (TREC) - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CNN/DailyMail", - "source_data": { - "dataset_name": "CNN/DailyMail", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on CNN/DailyMail", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "CNN/DailyMail - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "CNN/DailyMail - # eval": { - "description": "No matching 
runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - }, - "CNN/DailyMail - SummaC": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - QAFactEval": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - BERTScore (F1)": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - Coverage": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - Density": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - Compression": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-faithfulness": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-relevance": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-coherence": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "XSUM", - "source_data": { - "dataset_name": "XSUM", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on XSUM", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "XSUM - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "XSUM - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # prompt tokens": { - "description": "No matching runs", - 
"tab": "General information", - "score": null - }, - "XSUM - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - }, - "XSUM - SummaC": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - QAFactEval": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - BERTScore (F1)": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - Coverage": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - Density": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - Compression": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-faithfulness": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-relevance": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-coherence": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "IMDB", - "source_data": { - "dataset_name": "IMDB", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on IMDB", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.907, - "details": { - "description": "min=0.907, mean=0.907, max=0.907, sum=0.907 (1)", - "tab": "Accuracy", - "IMDB - ECE (10-bin)": { - "description": "min=0.248, mean=0.248, max=0.248, sum=0.248 (1)", - "tab": "Calibration", - "score": 0.24822902119068743 - }, - "IMDB - EM (Robustness)": { - "description": "min=0.843, mean=0.843, max=0.843, sum=0.843 (1)", - "tab": "Robustness", - "score": 0.843 - }, - "IMDB - EM (Fairness)": { - "description": "min=0.89, mean=0.89, max=0.89, sum=0.89 (1)", - "tab": "Fairness", - "score": 0.89 - }, - "IMDB - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "IMDB - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "IMDB - # train": { - "description": "min=2.911, mean=2.911, max=2.911, sum=2.911 (1)", - "tab": "General information", - "score": 2.911 - }, - "IMDB - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IMDB - # prompt tokens": { - 
"description": "min=1619.568, mean=1619.568, max=1619.568, sum=1619.568 (1)", - "tab": "General information", - "score": 1619.568 - }, - "IMDB - # output tokens": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "IMDB - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "IMDB - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CivilComments", - "source_data": { - "dataset_name": "CivilComments", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on CivilComments", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.549, - "details": { - "description": "min=0.013, mean=0.549, max=0.996, sum=9.877 (18)", - "tab": "Accuracy", - "CivilComments - ECE (10-bin)": { - "description": "min=0.105, mean=0.303, max=0.532, sum=5.455 (18)", - "tab": "Calibration", - "score": 0.3030711579633833 - }, - "CivilComments - EM (Robustness)": { - "description": "min=0, mean=0.336, max=0.996, sum=6.045 (18)", - "tab": "Robustness", - "score": 0.3358431190860201 - }, - "CivilComments - EM (Fairness)": { - "description": "min=0, mean=0.393, max=0.996, sum=7.082 (18)", - "tab": "Fairness", - "score": 0.39345093425226885 - }, - "CivilComments - Denoised inference time (s)": { - "description": "9 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "CivilComments - # eval": { - "description": "min=74, mean=371.556, max=683, sum=6688 (18)", - "tab": "General information", - "score": 371.55555555555554 - }, - "CivilComments - # train": { - "description": "min=5, mean=5, max=5, sum=90 (18)", - "tab": "General information", - "score": 5.0 - }, - "CivilComments - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (18)", - "tab": "General information", - "score": 0.0 - }, - "CivilComments - # prompt tokens": { - "description": "min=360.976, mean=771.654, max=1282.4, sum=13889.772 (18)", - "tab": "General information", - "score": 771.6539847352628 - }, - "CivilComments - # output tokens": { - "description": "min=5, mean=5, max=5, sum=90 (18)", - "tab": "General information", - "score": 5.0 - }, - "CivilComments - # trials": { - "description": "min=1, mean=1, max=1, sum=18 (18)", - "tab": "General information", - "score": 1.0 - }, - "CivilComments - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (race)": { - "description": "(0)", - "tab": "Bias", - 
"score": null - }, - "CivilComments - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Toxic fraction": { - "description": "9 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "RAFT", - "source_data": { - "dataset_name": "RAFT", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on RAFT", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.502, - "details": { - "description": "min=0.225, mean=0.502, max=0.975, sum=5.525 (11)", - "tab": "Accuracy", - "RAFT - ECE (10-bin)": { - "description": "min=0.225, mean=0.502, max=0.975, sum=5.524 (11)", - "tab": "Calibration", - "score": 0.5021656428017803 - }, - "RAFT - EM (Robustness)": { - "description": "min=0, mean=0.427, max=0.975, sum=4.7 (11)", - "tab": "Robustness", - "score": 0.4272727272727273 - }, - "RAFT - EM (Fairness)": { - "description": "min=0.175, mean=0.475, max=0.975, sum=5.225 (11)", - "tab": "Fairness", - "score": 0.47500000000000003 - }, - "RAFT - Denoised inference time (s)": { - "description": "11 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "RAFT - # eval": { - "description": "min=40, mean=40, max=40, sum=440 (11)", - "tab": "General information", - "score": 40.0 - }, - "RAFT - # train": { - "description": "min=0.7, mean=4.605, max=5, sum=50.65 (11)", - "tab": "General information", - "score": 4.6045454545454545 - }, - "RAFT - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (11)", - "tab": "General information", - "score": 0.0 - }, - "RAFT - # prompt tokens": { - "description": "min=280.35, mean=869.691, max=1756.575, sum=9566.6 (11)", - "tab": "General information", - "score": 869.6909090909089 - }, - "RAFT - # output tokens": { - "description": "min=30, mean=30, max=30, sum=330 (11)", - "tab": "General information", - "score": 30.0 - }, - "RAFT - # trials": { - "description": "min=1, mean=1, max=1, sum=11 (11)", - "tab": "General information", - "score": 1.0 - }, - "RAFT - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Toxic fraction": { - "description": "11 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_classic/together/RedPajama-INCITE-Instruct-7B/681d0d6d-de06-4b8e-a7e2-964d98e2806e.json b/data/helm_classic/together/RedPajama-INCITE-Instruct-7B/681d0d6d-de06-4b8e-a7e2-964d98e2806e.json deleted file mode 100644 index 9d60f7506..000000000 --- a/data/helm_classic/together/RedPajama-INCITE-Instruct-7B/681d0d6d-de06-4b8e-a7e2-964d98e2806e.json +++ /dev/null @@ -1,1613 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_classic/together_RedPajama-INCITE-Instruct-7B/1770834891.1472661", - "retrieved_timestamp": "1770834891.1472661", - 
"source_metadata": { - "source_name": "helm_classic", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "RedPajama-INCITE-Instruct 7B", - "id": "together/RedPajama-INCITE-Instruct-7B", - "developer": "together", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_classic", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperform on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.524, - "details": { - "tab": "Accuracy", - "Mean win rate - Calibration": { - "description": null, - "tab": "Calibration", - "score": 0.38751156336725257 - }, - "Mean win rate - Robustness": { - "description": null, - "tab": "Robustness", - "score": 0.4953146853146853 - }, - "Mean win rate - Fairness": { - "description": null, - "tab": "Fairness", - "score": 0.46615384615384614 - }, - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": null - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - }, - "Mean win rate - Bias": { - "description": null, - "tab": "Bias", - "score": 0.33794748465968927 - }, - "Mean win rate - Toxicity": { - "description": null, - "tab": "Toxicity", - "score": 0.29364801864801865 - }, - "Mean win rate - Summarization metrics": { - "description": null, - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.363, - "details": { - "description": "min=0.246, mean=0.363, max=0.52, sum=1.816 (5)", - "tab": "Accuracy", - "MMLU - ECE (10-bin)": { - "description": "min=0.092, mean=0.143, max=0.182, sum=0.715 (5)", - "tab": "Calibration", - "score": 0.14292977551638825 - }, - "MMLU - EM (Robustness)": { - "description": "min=0.175, mean=0.291, max=0.46, sum=1.455 (5)", - "tab": "Robustness", - "score": 0.2910877192982456 - }, - "MMLU - EM (Fairness)": { - "description": "min=0.167, mean=0.305, max=0.48, sum=1.527 (5)", - "tab": "Fairness", - "score": 0.30533333333333335 - }, - "MMLU - Denoised inference time (s)": { - "description": "5 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=358.76, mean=467.936, max=612.798, sum=2339.678 (5)", - "tab": 
"General information", - "score": 467.935649122807 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "MMLU - # trials": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "BoolQ", - "source_data": { - "dataset_name": "BoolQ", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on BoolQ", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.705, - "details": { - "description": "min=0.705, mean=0.705, max=0.705, sum=0.705 (1)", - "tab": "Accuracy", - "BoolQ - ECE (10-bin)": { - "description": "min=0.035, mean=0.035, max=0.035, sum=0.035 (1)", - "tab": "Calibration", - "score": 0.034644312737608846 - }, - "BoolQ - EM (Robustness)": { - "description": "min=0.599, mean=0.599, max=0.599, sum=0.599 (1)", - "tab": "Robustness", - "score": 0.599 - }, - "BoolQ - EM (Fairness)": { - "description": "min=0.616, mean=0.616, max=0.616, sum=0.616 (1)", - "tab": "Fairness", - "score": 0.616 - }, - "BoolQ - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "BoolQ - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "BoolQ - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "BoolQ - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "BoolQ - # prompt tokens": { - "description": "min=1251.897, mean=1251.897, max=1251.897, sum=1251.897 (1)", - "tab": "General information", - "score": 1251.897 - }, - "BoolQ - # output tokens": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "BoolQ - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "BoolQ - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.638, - "details": { - "description": "min=0.638, mean=0.638, max=0.638, sum=0.638 (1)", - "tab": "Accuracy", - "NarrativeQA - ECE (10-bin)": { - 
"description": "min=0.247, mean=0.247, max=0.247, sum=0.247 (1)", - "tab": "Calibration", - "score": 0.24703559378209236 - }, - "NarrativeQA - F1 (Robustness)": { - "description": "min=0.482, mean=0.482, max=0.482, sum=0.482 (1)", - "tab": "Robustness", - "score": 0.4816661888359549 - }, - "NarrativeQA - F1 (Fairness)": { - "description": "min=0.506, mean=0.506, max=0.506, sum=0.506 (1)", - "tab": "Fairness", - "score": 0.5062845788047843 - }, - "NarrativeQA - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=1.969, mean=1.969, max=1.969, sum=1.969 (1)", - "tab": "General information", - "score": 1.9690140845070423 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=1691.082, mean=1691.082, max=1691.082, sum=1691.082 (1)", - "tab": "General information", - "score": 1691.081690140845 - }, - "NarrativeQA - # output tokens": { - "description": "min=100, mean=100, max=100, sum=100 (1)", - "tab": "General information", - "score": 100.0 - }, - "NarrativeQA - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NarrativeQA - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NarrativeQA - Stereotypes (gender)": { - "description": "min=0.5, mean=0.5, max=0.5, sum=0.5 (1)", - "tab": "Bias", - "score": 0.5 - }, - "NarrativeQA - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NarrativeQA - Representation (gender)": { - "description": "min=0.193, mean=0.193, max=0.193, sum=0.193 (1)", - "tab": "Bias", - "score": 0.19318181818181815 - }, - "NarrativeQA - Toxic fraction": { - "description": "min=0.025, mean=0.025, max=0.025, sum=0.025 (1)", - "tab": "Toxicity", - "score": 0.02535211267605634 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (open-book)", - "source_data": { - "dataset_name": "NaturalQuestions (open-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (open-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.659, - "details": { - "description": "min=0.659, mean=0.659, max=0.659, sum=0.659 (1)", - "tab": "Accuracy", - "NaturalQuestions (closed-book) - ECE (10-bin)": { - "description": "min=0.142, mean=0.142, max=0.142, sum=0.142 (1)", - "tab": "Calibration", - "score": 0.14200000000000002 - }, - "NaturalQuestions (open-book) - ECE (10-bin)": { - "description": "min=0.466, mean=0.466, max=0.466, sum=0.466 (1)", - "tab": "Calibration", - "score": 0.4659999973351183 - }, - "NaturalQuestions (closed-book) - F1 (Robustness)": { - "description": "min=0.137, mean=0.137, max=0.137, sum=0.137 (1)", - "tab": "Robustness", - "score": 0.13717330495393032 - }, - "NaturalQuestions (open-book) - F1 (Robustness)": { - "description": "min=0.547, mean=0.547, max=0.547, sum=0.547 (1)", - "tab": "Robustness", 
- "score": 0.5468327185577326 - }, - "NaturalQuestions (closed-book) - F1 (Fairness)": { - "description": "min=0.164, mean=0.164, max=0.164, sum=0.164 (1)", - "tab": "Fairness", - "score": 0.16419040044922398 - }, - "NaturalQuestions (open-book) - F1 (Fairness)": { - "description": "min=0.592, mean=0.592, max=0.592, sum=0.592 (1)", - "tab": "Fairness", - "score": 0.5920301139461878 - }, - "NaturalQuestions (closed-book) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NaturalQuestions (open-book) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=117.299, mean=117.299, max=117.299, sum=117.299 (1)", - "tab": "General information", - "score": 117.299 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=300, mean=300, max=300, sum=300 (1)", - "tab": "General information", - "score": 300.0 - }, - "NaturalQuestions (closed-book) - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.704, mean=4.704, max=4.704, sum=4.704 (1)", - "tab": "General information", - "score": 4.704 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.037, mean=0.037, max=0.037, sum=0.037 (1)", - "tab": "General information", - "score": 0.037 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1495.552, mean=1495.552, max=1495.552, sum=1495.552 (1)", - "tab": "General information", - "score": 1495.552 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=300, mean=300, max=300, sum=300 (1)", - "tab": "General information", - "score": 300.0 - }, - "NaturalQuestions (open-book) - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NaturalQuestions (closed-book) - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=0.667 (1)", - "tab": "Bias", - "score": 0.6666666666666667 - }, - "NaturalQuestions (closed-book) - Stereotypes (gender)": { - "description": "min=0.5, mean=0.5, max=0.5, sum=0.5 (1)", - "tab": "Bias", - "score": 0.5 - }, - "NaturalQuestions (closed-book) - Representation (race)": { - "description": "min=0.406, mean=0.406, max=0.406, sum=0.406 (1)", - "tab": "Bias", - "score": 0.4061624649859944 - }, - "NaturalQuestions (closed-book) - Representation (gender)": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "Bias", - "score": 0.0 - }, - "NaturalQuestions (open-book) - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=0.667 (1)", - "tab": "Bias", - "score": 
0.6666666666666667 - }, - "NaturalQuestions (open-book) - Stereotypes (gender)": { - "description": "min=0.5, mean=0.5, max=0.5, sum=0.5 (1)", - "tab": "Bias", - "score": 0.5 - }, - "NaturalQuestions (open-book) - Representation (race)": { - "description": "min=0.524, mean=0.524, max=0.524, sum=0.524 (1)", - "tab": "Bias", - "score": 0.5238095238095237 - }, - "NaturalQuestions (open-book) - Representation (gender)": { - "description": "min=0.281, mean=0.281, max=0.281, sum=0.281 (1)", - "tab": "Bias", - "score": 0.28125 - }, - "NaturalQuestions (closed-book) - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "Toxicity", - "score": 0.0 - }, - "NaturalQuestions (open-book) - Toxic fraction": { - "description": "min=0.001, mean=0.001, max=0.001, sum=0.001 (1)", - "tab": "Toxicity", - "score": 0.001 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "QuAC", - "source_data": { - "dataset_name": "QuAC", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on QuAC", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.26, - "details": { - "description": "min=0.26, mean=0.26, max=0.26, sum=0.26 (1)", - "tab": "Accuracy", - "QuAC - ECE (10-bin)": { - "description": "min=0.074, mean=0.074, max=0.074, sum=0.074 (1)", - "tab": "Calibration", - "score": 0.07389119661461117 - }, - "QuAC - F1 (Robustness)": { - "description": "min=0.164, mean=0.164, max=0.164, sum=0.164 (1)", - "tab": "Robustness", - "score": 0.16438450644529176 - }, - "QuAC - F1 (Fairness)": { - "description": "min=0.181, mean=0.181, max=0.181, sum=0.181 (1)", - "tab": "Fairness", - "score": 0.18079535886869938 - }, - "QuAC - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "QuAC - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "QuAC - # train": { - "description": "min=0.883, mean=0.883, max=0.883, sum=0.883 (1)", - "tab": "General information", - "score": 0.883 - }, - "QuAC - truncated": { - "description": "min=0.021, mean=0.021, max=0.021, sum=0.021 (1)", - "tab": "General information", - "score": 0.021 - }, - "QuAC - # prompt tokens": { - "description": "min=1655.708, mean=1655.708, max=1655.708, sum=1655.708 (1)", - "tab": "General information", - "score": 1655.708 - }, - "QuAC - # output tokens": { - "description": "min=100, mean=100, max=100, sum=100 (1)", - "tab": "General information", - "score": 100.0 - }, - "QuAC - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "QuAC - Stereotypes (race)": { - "description": "min=0.63, mean=0.63, max=0.63, sum=0.63 (1)", - "tab": "Bias", - "score": 0.6296296296296297 - }, - "QuAC - Stereotypes (gender)": { - "description": "min=0.445, mean=0.445, max=0.445, sum=0.445 (1)", - "tab": "Bias", - "score": 0.4446840232318048 - }, - "QuAC - Representation (race)": { - "description": "min=0.333, mean=0.333, max=0.333, sum=0.333 (1)", - "tab": "Bias", - "score": 0.33333333333333337 - }, - "QuAC - Representation (gender)": { - "description": "min=0.242, mean=0.242, max=0.242, sum=0.242 (1)", - "tab": "Bias", - "score": 0.24226804123711343 - }, - 
"QuAC - Toxic fraction": { - "description": "min=0.003, mean=0.003, max=0.003, sum=0.003 (1)", - "tab": "Toxicity", - "score": 0.003 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "HellaSwag", - "source_data": { - "dataset_name": "HellaSwag", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on HellaSwag", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "HellaSwag - ECE (10-bin)": { - "description": "No matching runs", - "tab": "Calibration", - "score": null - }, - "HellaSwag - EM (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "HellaSwag - EM (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "HellaSwag - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "HellaSwag - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "OpenbookQA - ECE (10-bin)": { - "description": "No matching runs", - "tab": "Calibration", - "score": null - }, - "OpenbookQA - EM (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "OpenbookQA - EM (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "OpenbookQA - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "OpenbookQA - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # output tokens": { - "description": "No matching runs", - 
"tab": "General information", - "score": null - }, - "OpenbookQA - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "TruthfulQA", - "source_data": { - "dataset_name": "TruthfulQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on TruthfulQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.243, - "details": { - "description": "min=0.243, mean=0.243, max=0.243, sum=0.243 (1)", - "tab": "Accuracy", - "TruthfulQA - ECE (10-bin)": { - "description": "min=0.232, mean=0.232, max=0.232, sum=0.232 (1)", - "tab": "Calibration", - "score": 0.23215642305686054 - }, - "TruthfulQA - EM (Robustness)": { - "description": "min=0.197, mean=0.197, max=0.197, sum=0.197 (1)", - "tab": "Robustness", - "score": 0.19724770642201836 - }, - "TruthfulQA - EM (Fairness)": { - "description": "min=0.183, mean=0.183, max=0.183, sum=0.183 (1)", - "tab": "Fairness", - "score": 0.1834862385321101 - }, - "TruthfulQA - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "TruthfulQA - # eval": { - "description": "min=654, mean=654, max=654, sum=654 (1)", - "tab": "General information", - "score": 654.0 - }, - "TruthfulQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "TruthfulQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "TruthfulQA - # prompt tokens": { - "description": "min=505.352, mean=505.352, max=505.352, sum=505.352 (1)", - "tab": "General information", - "score": 505.35168195718654 - }, - "TruthfulQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "TruthfulQA - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MS MARCO (TREC)", - "source_data": { - "dataset_name": "MS MARCO (TREC)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "NDCG@10 on MS MARCO (TREC)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "MS MARCO (regular) - RR@10 (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "MS MARCO (TREC) - NDCG@10 (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "MS MARCO (regular) - RR@10 (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "MS MARCO (TREC) - NDCG@10 (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "MS MARCO (regular) - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "MS 
MARCO (TREC) - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "MS MARCO (regular) - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - }, - "MS MARCO (TREC) - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CNN/DailyMail", - "source_data": { - "dataset_name": "CNN/DailyMail", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on CNN/DailyMail", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "CNN/DailyMail - Denoised inference time (s)": { - 
"description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "CNN/DailyMail - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - }, - "CNN/DailyMail - SummaC": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - QAFactEval": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - BERTScore (F1)": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - Coverage": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - Density": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - Compression": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-faithfulness": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-relevance": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-coherence": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "XSUM", - "source_data": { - "dataset_name": "XSUM", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on XSUM", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "XSUM - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "XSUM - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - truncated": { - "description": "No 
matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - }, - "XSUM - SummaC": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - QAFactEval": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - BERTScore (F1)": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - Coverage": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - Density": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - Compression": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-faithfulness": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-relevance": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-coherence": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "IMDB", - "source_data": { - "dataset_name": "IMDB", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on IMDB", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.927, - "details": { - "description": "min=0.927, mean=0.927, max=0.927, sum=0.927 (1)", - "tab": "Accuracy", - "IMDB - ECE (10-bin)": { - "description": "min=0.159, mean=0.159, max=0.159, sum=0.159 (1)", - "tab": "Calibration", - "score": 0.15862422483580252 - }, - "IMDB - EM (Robustness)": { - "description": "min=0.82, mean=0.82, max=0.82, sum=0.82 (1)", - "tab": "Robustness", - "score": 0.82 - }, - "IMDB - EM (Fairness)": { - "description": "min=0.907, mean=0.907, max=0.907, sum=0.907 (1)", - "tab": "Fairness", - "score": 0.907 - }, - "IMDB - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "IMDB - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "IMDB - # train": { - "description": "min=2.911, mean=2.911, max=2.911, sum=2.911 (1)", - "tab": "General information", - "score": 2.911 - }, - "IMDB - truncated": { - 
"description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IMDB - # prompt tokens": { - "description": "min=1619.568, mean=1619.568, max=1619.568, sum=1619.568 (1)", - "tab": "General information", - "score": 1619.568 - }, - "IMDB - # output tokens": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "IMDB - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "IMDB - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CivilComments", - "source_data": { - "dataset_name": "CivilComments", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on CivilComments", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.664, - "details": { - "description": "min=0.487, mean=0.664, max=0.77, sum=11.961 (18)", - "tab": "Accuracy", - "CivilComments - ECE (10-bin)": { - "description": "min=0.035, mean=0.102, max=0.234, sum=1.831 (18)", - "tab": "Calibration", - "score": 0.10174488153691034 - }, - "CivilComments - EM (Robustness)": { - "description": "min=0.277, mean=0.527, max=0.77, sum=9.491 (18)", - "tab": "Robustness", - "score": 0.5272697486345442 - }, - "CivilComments - EM (Fairness)": { - "description": "min=0.25, mean=0.54, max=0.743, sum=9.724 (18)", - "tab": "Fairness", - "score": 0.5401968527212513 - }, - "CivilComments - Denoised inference time (s)": { - "description": "9 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "CivilComments - # eval": { - "description": "min=74, mean=371.556, max=683, sum=6688 (18)", - "tab": "General information", - "score": 371.55555555555554 - }, - "CivilComments - # train": { - "description": "min=5, mean=5, max=5, sum=90 (18)", - "tab": "General information", - "score": 5.0 - }, - "CivilComments - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (18)", - "tab": "General information", - "score": 0.0 - }, - "CivilComments - # prompt tokens": { - "description": "min=360.976, mean=771.654, max=1282.4, sum=13889.772 (18)", - "tab": "General information", - "score": 771.6539847352628 - }, - "CivilComments - # output tokens": { - "description": "min=5, mean=5, max=5, sum=90 (18)", - "tab": "General information", - "score": 5.0 - }, - "CivilComments - # trials": { - "description": "min=1, mean=1, max=1, sum=18 (18)", - "tab": "General information", - "score": 1.0 - }, - "CivilComments - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Stereotypes (gender)": { - "description": 
"(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Toxic fraction": { - "description": "9 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "RAFT", - "source_data": { - "dataset_name": "RAFT", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on RAFT", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.695, - "details": { - "description": "min=0.175, mean=0.695, max=0.925, sum=7.65 (11)", - "tab": "Accuracy", - "RAFT - ECE (10-bin)": { - "description": "min=0.175, mean=0.695, max=0.925, sum=7.647 (11)", - "tab": "Calibration", - "score": 0.69518288885631 - }, - "RAFT - EM (Robustness)": { - "description": "min=0.175, mean=0.605, max=0.9, sum=6.65 (11)", - "tab": "Robustness", - "score": 0.6045454545454546 - }, - "RAFT - EM (Fairness)": { - "description": "min=0.175, mean=0.67, max=0.875, sum=7.375 (11)", - "tab": "Fairness", - "score": 0.6704545454545454 - }, - "RAFT - Denoised inference time (s)": { - "description": "11 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "RAFT - # eval": { - "description": "min=40, mean=40, max=40, sum=440 (11)", - "tab": "General information", - "score": 40.0 - }, - "RAFT - # train": { - "description": "min=0.7, mean=4.605, max=5, sum=50.65 (11)", - "tab": "General information", - "score": 4.6045454545454545 - }, - "RAFT - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (11)", - "tab": "General information", - "score": 0.0 - }, - "RAFT - # prompt tokens": { - "description": "min=280.35, mean=869.691, max=1756.575, sum=9566.6 (11)", - "tab": "General information", - "score": 869.6909090909089 - }, - "RAFT - # output tokens": { - "description": "min=30, mean=30, max=30, sum=330 (11)", - "tab": "General information", - "score": 30.0 - }, - "RAFT - # trials": { - "description": "min=1, mean=1, max=1, sum=11 (11)", - "tab": "General information", - "score": 1.0 - }, - "RAFT - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Toxic fraction": { - "description": "11 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_classic/together/RedPajama-INCITE-Instruct-v1-3B/e79e0c17-2e9b-4b99-85e4-7f15e1a337f7.json b/data/helm_classic/together/RedPajama-INCITE-Instruct-v1-3B/e79e0c17-2e9b-4b99-85e4-7f15e1a337f7.json deleted file mode 100644 index 57ffafd39..000000000 --- a/data/helm_classic/together/RedPajama-INCITE-Instruct-v1-3B/e79e0c17-2e9b-4b99-85e4-7f15e1a337f7.json +++ /dev/null @@ -1,1613 +0,0 @@ -{ - "schema_version": "0.2.0", - 
"evaluation_id": "helm_classic/together_RedPajama-INCITE-Instruct-v1-3B/1770834891.1472661", - "retrieved_timestamp": "1770834891.1472661", - "source_metadata": { - "source_name": "helm_classic", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "RedPajama-INCITE-Instruct-v1 3B", - "id": "together/RedPajama-INCITE-Instruct-v1-3B", - "developer": "together", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_classic", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperform on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.366, - "details": { - "tab": "Accuracy", - "Mean win rate - Calibration": { - "description": null, - "tab": "Calibration", - "score": 0.37183163737280295 - }, - "Mean win rate - Robustness": { - "description": null, - "tab": "Robustness", - "score": 0.3874825174825175 - }, - "Mean win rate - Fairness": { - "description": null, - "tab": "Fairness", - "score": 0.3690909090909091 - }, - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": null - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - }, - "Mean win rate - Bias": { - "description": null, - "tab": "Bias", - "score": 0.18974591969523494 - }, - "Mean win rate - Toxicity": { - "description": null, - "tab": "Toxicity", - "score": 0.6051282051282051 - }, - "Mean win rate - Summarization metrics": { - "description": null, - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.257, - "details": { - "description": "min=0.22, mean=0.257, max=0.29, sum=1.287 (5)", - "tab": "Accuracy", - "MMLU - ECE (10-bin)": { - "description": "min=0.09, mean=0.124, max=0.157, sum=0.619 (5)", - "tab": "Calibration", - "score": 0.1238999810101579 - }, - "MMLU - EM (Robustness)": { - "description": "min=0.18, mean=0.218, max=0.23, sum=1.089 (5)", - "tab": "Robustness", - "score": 0.21785964912280703 - }, - "MMLU - EM (Fairness)": { - "description": "min=0.18, mean=0.222, max=0.27, sum=1.111 (5)", - "tab": "Fairness", - "score": 0.22210526315789475 - }, - "MMLU - Denoised inference time (s)": { - "description": "5 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General 
information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=358.76, mean=467.936, max=612.798, sum=2339.678 (5)", - "tab": "General information", - "score": 467.935649122807 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "MMLU - # trials": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "BoolQ", - "source_data": { - "dataset_name": "BoolQ", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on BoolQ", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.677, - "details": { - "description": "min=0.677, mean=0.677, max=0.677, sum=0.677 (1)", - "tab": "Accuracy", - "BoolQ - ECE (10-bin)": { - "description": "min=0.141, mean=0.141, max=0.141, sum=0.141 (1)", - "tab": "Calibration", - "score": 0.14082220350962116 - }, - "BoolQ - EM (Robustness)": { - "description": "min=0.629, mean=0.629, max=0.629, sum=0.629 (1)", - "tab": "Robustness", - "score": 0.629 - }, - "BoolQ - EM (Fairness)": { - "description": "min=0.648, mean=0.648, max=0.648, sum=0.648 (1)", - "tab": "Fairness", - "score": 0.648 - }, - "BoolQ - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "BoolQ - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "BoolQ - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "BoolQ - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "BoolQ - # prompt tokens": { - "description": "min=1251.897, mean=1251.897, max=1251.897, sum=1251.897 (1)", - "tab": "General information", - "score": 1251.897 - }, - "BoolQ - # output tokens": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "BoolQ - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "BoolQ - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.638, - 
"details": { - "description": "min=0.638, mean=0.638, max=0.638, sum=0.638 (1)", - "tab": "Accuracy", - "NarrativeQA - ECE (10-bin)": { - "description": "min=0.254, mean=0.254, max=0.254, sum=0.254 (1)", - "tab": "Calibration", - "score": 0.25351615672342864 - }, - "NarrativeQA - F1 (Robustness)": { - "description": "min=0.403, mean=0.403, max=0.403, sum=0.403 (1)", - "tab": "Robustness", - "score": 0.4034697604028265 - }, - "NarrativeQA - F1 (Fairness)": { - "description": "min=0.506, mean=0.506, max=0.506, sum=0.506 (1)", - "tab": "Fairness", - "score": 0.5060331991298288 - }, - "NarrativeQA - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=1.969, mean=1.969, max=1.969, sum=1.969 (1)", - "tab": "General information", - "score": 1.9690140845070423 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=1691.082, mean=1691.082, max=1691.082, sum=1691.082 (1)", - "tab": "General information", - "score": 1691.081690140845 - }, - "NarrativeQA - # output tokens": { - "description": "min=100, mean=100, max=100, sum=100 (1)", - "tab": "General information", - "score": 100.0 - }, - "NarrativeQA - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NarrativeQA - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NarrativeQA - Stereotypes (gender)": { - "description": "min=0.5, mean=0.5, max=0.5, sum=0.5 (1)", - "tab": "Bias", - "score": 0.5 - }, - "NarrativeQA - Representation (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=0.667 (1)", - "tab": "Bias", - "score": 0.6666666666666667 - }, - "NarrativeQA - Representation (gender)": { - "description": "min=0.184, mean=0.184, max=0.184, sum=0.184 (1)", - "tab": "Bias", - "score": 0.18354430379746836 - }, - "NarrativeQA - Toxic fraction": { - "description": "min=0.014, mean=0.014, max=0.014, sum=0.014 (1)", - "tab": "Toxicity", - "score": 0.014084507042253521 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (open-book)", - "source_data": { - "dataset_name": "NaturalQuestions (open-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (open-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.637, - "details": { - "description": "min=0.637, mean=0.637, max=0.637, sum=0.637 (1)", - "tab": "Accuracy", - "NaturalQuestions (closed-book) - ECE (10-bin)": { - "description": "min=0.12, mean=0.12, max=0.12, sum=0.12 (1)", - "tab": "Calibration", - "score": 0.12000000000000001 - }, - "NaturalQuestions (open-book) - ECE (10-bin)": { - "description": "min=0.454, mean=0.454, max=0.454, sum=0.454 (1)", - "tab": "Calibration", - "score": 0.4539999913132661 - }, - "NaturalQuestions (closed-book) - F1 (Robustness)": { - "description": "min=0.132, mean=0.132, max=0.132, sum=0.132 (1)", - "tab": 
"Robustness", - "score": 0.13162030419976034 - }, - "NaturalQuestions (open-book) - F1 (Robustness)": { - "description": "min=0.536, mean=0.536, max=0.536, sum=0.536 (1)", - "tab": "Robustness", - "score": 0.5356772534642628 - }, - "NaturalQuestions (closed-book) - F1 (Fairness)": { - "description": "min=0.143, mean=0.143, max=0.143, sum=0.143 (1)", - "tab": "Fairness", - "score": 0.1431948167839223 - }, - "NaturalQuestions (open-book) - F1 (Fairness)": { - "description": "min=0.571, mean=0.571, max=0.571, sum=0.571 (1)", - "tab": "Fairness", - "score": 0.57068667733919 - }, - "NaturalQuestions (closed-book) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NaturalQuestions (open-book) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=117.299, mean=117.299, max=117.299, sum=117.299 (1)", - "tab": "General information", - "score": 117.299 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=300, mean=300, max=300, sum=300 (1)", - "tab": "General information", - "score": 300.0 - }, - "NaturalQuestions (closed-book) - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.704, mean=4.704, max=4.704, sum=4.704 (1)", - "tab": "General information", - "score": 4.704 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.037, mean=0.037, max=0.037, sum=0.037 (1)", - "tab": "General information", - "score": 0.037 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1495.552, mean=1495.552, max=1495.552, sum=1495.552 (1)", - "tab": "General information", - "score": 1495.552 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=300, mean=300, max=300, sum=300 (1)", - "tab": "General information", - "score": 300.0 - }, - "NaturalQuestions (open-book) - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NaturalQuestions (closed-book) - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NaturalQuestions (closed-book) - Stereotypes (gender)": { - "description": "min=0.5, mean=0.5, max=0.5, sum=0.5 (1)", - "tab": "Bias", - "score": 0.5 - }, - "NaturalQuestions (closed-book) - Representation (race)": { - "description": "min=0.467, mean=0.467, max=0.467, sum=0.467 (1)", - "tab": "Bias", - "score": 0.4666666666666666 - }, - "NaturalQuestions (closed-book) - Representation (gender)": { - "description": "min=0.278, mean=0.278, max=0.278, sum=0.278 (1)", - "tab": "Bias", - "score": 
0.2777777777777778 - }, - "NaturalQuestions (open-book) - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=0.667 (1)", - "tab": "Bias", - "score": 0.6666666666666667 - }, - "NaturalQuestions (open-book) - Stereotypes (gender)": { - "description": "min=0.5, mean=0.5, max=0.5, sum=0.5 (1)", - "tab": "Bias", - "score": 0.5 - }, - "NaturalQuestions (open-book) - Representation (race)": { - "description": "min=0.566, mean=0.566, max=0.566, sum=0.566 (1)", - "tab": "Bias", - "score": 0.5660749506903353 - }, - "NaturalQuestions (open-book) - Representation (gender)": { - "description": "min=0.324, mean=0.324, max=0.324, sum=0.324 (1)", - "tab": "Bias", - "score": 0.32352941176470584 - }, - "NaturalQuestions (closed-book) - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "Toxicity", - "score": 0.0 - }, - "NaturalQuestions (open-book) - Toxic fraction": { - "description": "min=0.001, mean=0.001, max=0.001, sum=0.001 (1)", - "tab": "Toxicity", - "score": 0.001 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "QuAC", - "source_data": { - "dataset_name": "QuAC", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on QuAC", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.259, - "details": { - "description": "min=0.259, mean=0.259, max=0.259, sum=0.259 (1)", - "tab": "Accuracy", - "QuAC - ECE (10-bin)": { - "description": "min=0.1, mean=0.1, max=0.1, sum=0.1 (1)", - "tab": "Calibration", - "score": 0.09989902749544036 - }, - "QuAC - F1 (Robustness)": { - "description": "min=0.137, mean=0.137, max=0.137, sum=0.137 (1)", - "tab": "Robustness", - "score": 0.1368222933188553 - }, - "QuAC - F1 (Fairness)": { - "description": "min=0.183, mean=0.183, max=0.183, sum=0.183 (1)", - "tab": "Fairness", - "score": 0.18270531445590665 - }, - "QuAC - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "QuAC - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "QuAC - # train": { - "description": "min=0.883, mean=0.883, max=0.883, sum=0.883 (1)", - "tab": "General information", - "score": 0.883 - }, - "QuAC - truncated": { - "description": "min=0.021, mean=0.021, max=0.021, sum=0.021 (1)", - "tab": "General information", - "score": 0.021 - }, - "QuAC - # prompt tokens": { - "description": "min=1655.708, mean=1655.708, max=1655.708, sum=1655.708 (1)", - "tab": "General information", - "score": 1655.708 - }, - "QuAC - # output tokens": { - "description": "min=100, mean=100, max=100, sum=100 (1)", - "tab": "General information", - "score": 100.0 - }, - "QuAC - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "QuAC - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=0.667 (1)", - "tab": "Bias", - "score": 0.6666666666666667 - }, - "QuAC - Stereotypes (gender)": { - "description": "min=0.439, mean=0.439, max=0.439, sum=0.439 (1)", - "tab": "Bias", - "score": 0.4393162393162393 - }, - "QuAC - Representation (race)": { - "description": "min=0.34, mean=0.34, max=0.34, sum=0.34 (1)", - "tab": "Bias", - "score": 
0.33993399339933994 - }, - "QuAC - Representation (gender)": { - "description": "min=0.285, mean=0.285, max=0.285, sum=0.285 (1)", - "tab": "Bias", - "score": 0.28532608695652173 - }, - "QuAC - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "HellaSwag", - "source_data": { - "dataset_name": "HellaSwag", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on HellaSwag", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "HellaSwag - ECE (10-bin)": { - "description": "No matching runs", - "tab": "Calibration", - "score": null - }, - "HellaSwag - EM (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "HellaSwag - EM (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "HellaSwag - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "HellaSwag - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "OpenbookQA - ECE (10-bin)": { - "description": "No matching runs", - "tab": "Calibration", - "score": null - }, - "OpenbookQA - EM (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "OpenbookQA - EM (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "OpenbookQA - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "OpenbookQA - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # prompt tokens": { - 
"description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "TruthfulQA", - "source_data": { - "dataset_name": "TruthfulQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on TruthfulQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.208, - "details": { - "description": "min=0.208, mean=0.208, max=0.208, sum=0.208 (1)", - "tab": "Accuracy", - "TruthfulQA - ECE (10-bin)": { - "description": "min=0.097, mean=0.097, max=0.097, sum=0.097 (1)", - "tab": "Calibration", - "score": 0.09733177984986514 - }, - "TruthfulQA - EM (Robustness)": { - "description": "min=0.173, mean=0.173, max=0.173, sum=0.173 (1)", - "tab": "Robustness", - "score": 0.172782874617737 - }, - "TruthfulQA - EM (Fairness)": { - "description": "min=0.179, mean=0.179, max=0.179, sum=0.179 (1)", - "tab": "Fairness", - "score": 0.17889908256880735 - }, - "TruthfulQA - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "TruthfulQA - # eval": { - "description": "min=654, mean=654, max=654, sum=654 (1)", - "tab": "General information", - "score": 654.0 - }, - "TruthfulQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "TruthfulQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "TruthfulQA - # prompt tokens": { - "description": "min=505.352, mean=505.352, max=505.352, sum=505.352 (1)", - "tab": "General information", - "score": 505.35168195718654 - }, - "TruthfulQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "TruthfulQA - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MS MARCO (TREC)", - "source_data": { - "dataset_name": "MS MARCO (TREC)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "NDCG@10 on MS MARCO (TREC)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "MS MARCO (regular) - RR@10 (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "MS MARCO (TREC) - NDCG@10 (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "MS MARCO (regular) - RR@10 (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "MS MARCO (TREC) - NDCG@10 (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - 
"score": null - }, - "MS MARCO (regular) - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "MS MARCO (TREC) - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "MS MARCO (regular) - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - }, - "MS MARCO (TREC) - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CNN/DailyMail", - "source_data": { - "dataset_name": "CNN/DailyMail", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on CNN/DailyMail", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "CNN/DailyMail - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "CNN/DailyMail - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "CNN/DailyMail - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "CNN/DailyMail - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - }, - "CNN/DailyMail - SummaC": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - QAFactEval": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - BERTScore (F1)": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - Coverage": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - Density": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - Compression": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-faithfulness": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-relevance": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-coherence": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "XSUM", - "source_data": { - "dataset_name": "XSUM", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on XSUM", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "XSUM - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "XSUM - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - 
}, - "XSUM - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "XSUM - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "XSUM - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - }, - "XSUM - SummaC": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - QAFactEval": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - BERTScore (F1)": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - Coverage": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - Density": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - Compression": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-faithfulness": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-relevance": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-coherence": { - "description": "No matching runs", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "IMDB", - "source_data": { - "dataset_name": "IMDB", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on IMDB", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.894, - "details": { - "description": "min=0.894, mean=0.894, max=0.894, sum=0.894 (1)", - "tab": "Accuracy", - "IMDB - ECE (10-bin)": { - "description": "min=0.04, mean=0.04, max=0.04, sum=0.04 (1)", - "tab": "Calibration", - "score": 0.04045821313550608 - }, - "IMDB - EM (Robustness)": { - "description": "min=0.852, mean=0.852, max=0.852, sum=0.852 (1)", - "tab": "Robustness", - "score": 0.852 - }, - "IMDB - EM (Fairness)": { - "description": "min=0.876, mean=0.876, max=0.876, sum=0.876 (1)", - "tab": "Fairness", - "score": 0.876 - }, - "IMDB - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "IMDB - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "IMDB - # 
train": { - "description": "min=2.911, mean=2.911, max=2.911, sum=2.911 (1)", - "tab": "General information", - "score": 2.911 - }, - "IMDB - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "IMDB - # prompt tokens": { - "description": "min=1619.568, mean=1619.568, max=1619.568, sum=1619.568 (1)", - "tab": "General information", - "score": 1619.568 - }, - "IMDB - # output tokens": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "IMDB - # trials": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "IMDB - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CivilComments", - "source_data": { - "dataset_name": "CivilComments", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on CivilComments", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.549, - "details": { - "description": "min=0.028, mean=0.549, max=0.997, sum=9.891 (18)", - "tab": "Accuracy", - "CivilComments - ECE (10-bin)": { - "description": "min=0.09, mean=0.383, max=0.8, sum=6.9 (18)", - "tab": "Calibration", - "score": 0.3833406193329736 - }, - "CivilComments - EM (Robustness)": { - "description": "min=0, mean=0.506, max=0.993, sum=9.105 (18)", - "tab": "Robustness", - "score": 0.5058374710841333 - }, - "CivilComments - EM (Fairness)": { - "description": "min=0, mean=0.499, max=0.985, sum=8.983 (18)", - "tab": "Fairness", - "score": 0.4990473523687277 - }, - "CivilComments - Denoised inference time (s)": { - "description": "9 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "CivilComments - # eval": { - "description": "min=74, mean=371.556, max=683, sum=6688 (18)", - "tab": "General information", - "score": 371.55555555555554 - }, - "CivilComments - # train": { - "description": "min=5, mean=5, max=5, sum=90 (18)", - "tab": "General information", - "score": 5.0 - }, - "CivilComments - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (18)", - "tab": "General information", - "score": 0.0 - }, - "CivilComments - # prompt tokens": { - "description": "min=360.976, mean=771.654, max=1282.4, sum=13889.772 (18)", - "tab": "General information", - "score": 771.6539847352628 - }, - "CivilComments - # output tokens": { - "description": "min=5, mean=5, max=5, sum=90 (18)", - "tab": "General information", - "score": 5.0 - }, - "CivilComments - # trials": { - "description": "min=1, mean=1, max=1, sum=18 (18)", - "tab": "General information", - "score": 1.0 - }, - "CivilComments - 
Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Toxic fraction": { - "description": "9 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "RAFT", - "source_data": { - "dataset_name": "RAFT", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on RAFT", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.661, - "details": { - "description": "min=0.2, mean=0.661, max=0.975, sum=7.275 (11)", - "tab": "Accuracy", - "RAFT - ECE (10-bin)": { - "description": "min=0.2, mean=0.661, max=0.975, sum=7.274 (11)", - "tab": "Calibration", - "score": 0.6612967467806994 - }, - "RAFT - EM (Robustness)": { - "description": "min=0.075, mean=0.548, max=0.95, sum=6.025 (11)", - "tab": "Robustness", - "score": 0.5477272727272727 - }, - "RAFT - EM (Fairness)": { - "description": "min=0.175, mean=0.632, max=0.975, sum=6.95 (11)", - "tab": "Fairness", - "score": 0.631818181818182 - }, - "RAFT - Denoised inference time (s)": { - "description": "11 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "RAFT - # eval": { - "description": "min=40, mean=40, max=40, sum=440 (11)", - "tab": "General information", - "score": 40.0 - }, - "RAFT - # train": { - "description": "min=0.7, mean=4.605, max=5, sum=50.65 (11)", - "tab": "General information", - "score": 4.6045454545454545 - }, - "RAFT - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (11)", - "tab": "General information", - "score": 0.0 - }, - "RAFT - # prompt tokens": { - "description": "min=280.35, mean=869.691, max=1756.575, sum=9566.6 (11)", - "tab": "General information", - "score": 869.6909090909089 - }, - "RAFT - # output tokens": { - "description": "min=30, mean=30, max=30, sum=330 (11)", - "tab": "General information", - "score": 30.0 - }, - "RAFT - # trials": { - "description": "min=1, mean=1, max=1, sum=11 (11)", - "tab": "General information", - "score": 1.0 - }, - "RAFT - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Toxic fraction": { - "description": "11 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_classic/writer/InstructPalmyra-30B/cb80bd5f-204a-4dd8-96ec-40c7df93975f.json b/data/helm_classic/writer/InstructPalmyra-30B/cb80bd5f-204a-4dd8-96ec-40c7df93975f.json deleted file mode 100644 index fe1ab40e2..000000000 --- 
a/data/helm_classic/writer/InstructPalmyra-30B/cb80bd5f-204a-4dd8-96ec-40c7df93975f.json +++ /dev/null @@ -1,1613 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_classic/writer_InstructPalmyra-30B/1770834891.1472661", - "retrieved_timestamp": "1770834891.1472661", - "source_metadata": { - "source_name": "helm_classic", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "InstructPalmyra 30B", - "id": "writer/InstructPalmyra-30B", - "developer": "writer", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_classic", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperform on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.568, - "details": { - "tab": "Accuracy", - "Mean win rate - Calibration": { - "description": null, - "tab": "Calibration", - "score": null - }, - "Mean win rate - Robustness": { - "description": null, - "tab": "Robustness", - "score": 0.5224242424242425 - }, - "Mean win rate - Fairness": { - "description": null, - "tab": "Fairness", - "score": 0.5379254079254079 - }, - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": null - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - }, - "Mean win rate - Bias": { - "description": null, - "tab": "Bias", - "score": 0.47136458620459815 - }, - "Mean win rate - Toxicity": { - "description": null, - "tab": "Toxicity", - "score": 0.5811383061383062 - }, - "Mean win rate - Summarization metrics": { - "description": null, - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.403, - "details": { - "description": "min=0.23, mean=0.403, max=0.7, sum=6.041 (15)", - "tab": "Accuracy", - "MMLU - ECE (10-bin)": { - "description": "5 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "MMLU - EM (Robustness)": { - "description": "min=0.14, mean=0.348, max=0.65, sum=5.223 (15)", - "tab": "Robustness", - "score": 0.34819883040935673 - }, - "MMLU - EM (Fairness)": { - "description": "min=0.19, mean=0.371, max=0.66, sum=5.572 (15)", - "tab": "Fairness", - "score": 0.3714502923976608 - }, - "MMLU - Denoised inference time (s)": { - "description": "5 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=1542 (15)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=75 (15)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { 
- "description": "min=0, mean=0, max=0, sum=0 (15)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=371.38, mean=472.274, max=624.07, sum=7084.111 (15)", - "tab": "General information", - "score": 472.2740350877193 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=15 (15)", - "tab": "General information", - "score": 1.0 - }, - "MMLU - # trials": { - "description": "min=3, mean=3, max=3, sum=45 (15)", - "tab": "General information", - "score": 3.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "BoolQ", - "source_data": { - "dataset_name": "BoolQ", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on BoolQ", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.751, - "details": { - "description": "min=0.698, mean=0.751, max=0.798, sum=2.254 (3)", - "tab": "Accuracy", - "BoolQ - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "BoolQ - EM (Robustness)": { - "description": "min=0.564, mean=0.656, max=0.719, sum=1.967 (3)", - "tab": "Robustness", - "score": 0.6556666666666667 - }, - "BoolQ - EM (Fairness)": { - "description": "min=0.636, mean=0.7, max=0.762, sum=2.099 (3)", - "tab": "Fairness", - "score": 0.6996666666666668 - }, - "BoolQ - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "BoolQ - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "BoolQ - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "BoolQ - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "BoolQ - # prompt tokens": { - "description": "min=660.073, mean=908.406, max=1242.073, sum=2725.219 (3)", - "tab": "General information", - "score": 908.4063333333334 - }, - "BoolQ - # output tokens": { - "description": "min=1, mean=1, max=1, sum=3 (3)", - "tab": "General information", - "score": 1.0 - }, - "BoolQ - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "BoolQ - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", 
- "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.496, - "details": { - "description": "min=0.253, mean=0.496, max=0.636, sum=1.489 (3)", - "tab": "Accuracy", - "NarrativeQA - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "NarrativeQA - F1 (Robustness)": { - "description": "min=0.091, mean=0.317, max=0.444, sum=0.952 (3)", - "tab": "Robustness", - "score": 0.3173185298582432 - }, - "NarrativeQA - F1 (Fairness)": { - "description": "min=0.18, mean=0.405, max=0.538, sum=1.214 (3)", - "tab": "Fairness", - "score": 0.40467419690737483 - }, - "NarrativeQA - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=1065 (3)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=1.051, mean=1.646, max=2.085, sum=4.938 (3)", - "tab": "General information", - "score": 1.6460093896713615 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=1600.366, mean=1651.848, max=1705.003, sum=4955.544 (3)", - "tab": "General information", - "score": 1651.8478873239437 - }, - "NarrativeQA - # output tokens": { - "description": "min=1.93, mean=5.347, max=7.079, sum=16.042 (3)", - "tab": "General information", - "score": 5.347417840375587 - }, - "NarrativeQA - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NarrativeQA - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NarrativeQA - Stereotypes (gender)": { - "description": "min=0.381, mean=0.445, max=0.5, sum=1.335 (3)", - "tab": "Bias", - "score": 0.44516594516594515 - }, - "NarrativeQA - Representation (race)": { - "description": "min=0.333, mean=0.444, max=0.667, sum=1.333 (3)", - "tab": "Bias", - "score": 0.4444444444444445 - }, - "NarrativeQA - Representation (gender)": { - "description": "min=0.164, mean=0.196, max=0.241, sum=0.588 (3)", - "tab": "Bias", - "score": 0.1960646593836042 - }, - "NarrativeQA - Toxic fraction": { - "description": "min=0.003, mean=0.012, max=0.017, sum=0.037 (3)", - "tab": "Toxicity", - "score": 0.01220657276995305 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (open-book)", - "source_data": { - "dataset_name": "NaturalQuestions (open-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (open-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.682, - "details": { - "description": "min=0.678, mean=0.682, max=0.688, sum=2.046 (3)", - "tab": "Accuracy", - "NaturalQuestions (closed-book) - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "NaturalQuestions (open-book) - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "NaturalQuestions (closed-book) - F1 (Robustness)": { - 
"description": "min=0.257, mean=0.267, max=0.272, sum=0.8 (3)", - "tab": "Robustness", - "score": 0.2667976861519438 - }, - "NaturalQuestions (open-book) - F1 (Robustness)": { - "description": "min=0.52, mean=0.567, max=0.61, sum=1.701 (3)", - "tab": "Robustness", - "score": 0.5669828313348768 - }, - "NaturalQuestions (closed-book) - F1 (Fairness)": { - "description": "min=0.272, mean=0.276, max=0.282, sum=0.829 (3)", - "tab": "Fairness", - "score": 0.276181640672073 - }, - "NaturalQuestions (open-book) - F1 (Fairness)": { - "description": "min=0.621, mean=0.63, max=0.639, sum=1.891 (3)", - "tab": "Fairness", - "score": 0.6303513019528806 - }, - "NaturalQuestions (closed-book) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NaturalQuestions (open-book) - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=110.254, mean=112.254, max=116.254, sum=336.762 (3)", - "tab": "General information", - "score": 112.254 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=3.935, mean=4.247, max=4.675, sum=12.74 (3)", - "tab": "General information", - "score": 4.246666666666667 - }, - "NaturalQuestions (closed-book) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.647, mean=4.691, max=4.723, sum=14.072 (3)", - "tab": "General information", - "score": 4.690666666666666 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.036, mean=0.036, max=0.036, sum=0.108 (3)", - "tab": "General information", - "score": 0.036 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1231.056, mean=1419.328, max=1523.222, sum=4257.983 (3)", - "tab": "General information", - "score": 1419.3276666666668 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=6.778, mean=7.657, max=8.266, sum=22.97 (3)", - "tab": "General information", - "score": 7.656666666666666 - }, - "NaturalQuestions (open-book) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NaturalQuestions (closed-book) - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=0.667 (1)", - "tab": "Bias", - "score": 0.6666666666666667 - }, - "NaturalQuestions (closed-book) - Stereotypes (gender)": { - "description": "min=0.5, mean=0.5, max=0.5, sum=1.5 (3)", - "tab": "Bias", - "score": 0.5 - }, - "NaturalQuestions (closed-book) - Representation (race)": { - "description": "min=0.498, mean=0.525, max=0.55, sum=1.576 (3)", - "tab": "Bias", - "score": 
0.5252747252747252 - }, - "NaturalQuestions (closed-book) - Representation (gender)": { - "description": "min=0.088, mean=0.134, max=0.206, sum=0.401 (3)", - "tab": "Bias", - "score": 0.13375350140056022 - }, - "NaturalQuestions (open-book) - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NaturalQuestions (open-book) - Stereotypes (gender)": { - "description": "min=0.3, mean=0.392, max=0.443, sum=1.176 (3)", - "tab": "Bias", - "score": 0.39206349206349206 - }, - "NaturalQuestions (open-book) - Representation (race)": { - "description": "min=0.45, mean=0.49, max=0.533, sum=1.47 (3)", - "tab": "Bias", - "score": 0.4899991188650981 - }, - "NaturalQuestions (open-book) - Representation (gender)": { - "description": "min=0.327, mean=0.384, max=0.422, sum=1.152 (3)", - "tab": "Bias", - "score": 0.3838592033738646 - }, - "NaturalQuestions (closed-book) - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "Toxicity", - "score": 0.0 - }, - "NaturalQuestions (open-book) - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "QuAC", - "source_data": { - "dataset_name": "QuAC", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on QuAC", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.433, - "details": { - "description": "min=0.423, mean=0.433, max=0.447, sum=1.3 (3)", - "tab": "Accuracy", - "QuAC - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "QuAC - F1 (Robustness)": { - "description": "min=0.23, mean=0.248, max=0.258, sum=0.743 (3)", - "tab": "Robustness", - "score": 0.24761534139298128 - }, - "QuAC - F1 (Fairness)": { - "description": "min=0.328, mean=0.337, max=0.353, sum=1.011 (3)", - "tab": "Fairness", - "score": 0.3370729442565461 - }, - "QuAC - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "QuAC - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "QuAC - # train": { - "description": "min=0.845, mean=0.944, max=1.084, sum=2.831 (3)", - "tab": "General information", - "score": 0.9436666666666667 - }, - "QuAC - truncated": { - "description": "min=0.016, mean=0.016, max=0.016, sum=0.048 (3)", - "tab": "General information", - "score": 0.016 - }, - "QuAC - # prompt tokens": { - "description": "min=1624.371, mean=1644.436, max=1670.589, sum=4933.308 (3)", - "tab": "General information", - "score": 1644.436 - }, - "QuAC - # output tokens": { - "description": "min=18.652, mean=22.969, max=26.445, sum=68.907 (3)", - "tab": "General information", - "score": 22.969000000000005 - }, - "QuAC - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "QuAC - Stereotypes (race)": { - "description": "min=0.571, mean=0.582, max=0.59, sum=1.745 (3)", - "tab": "Bias", - "score": 0.5815018315018315 - }, - "QuAC - Stereotypes (gender)": { - "description": "min=0.413, mean=0.431, max=0.463, sum=1.292 (3)", - "tab": "Bias", - "score": 
0.43052581120508293 - }, - "QuAC - Representation (race)": { - "description": "min=0.317, mean=0.337, max=0.368, sum=1.012 (3)", - "tab": "Bias", - "score": 0.33749135321526574 - }, - "QuAC - Representation (gender)": { - "description": "min=0.224, mean=0.236, max=0.243, sum=0.707 (3)", - "tab": "Bias", - "score": 0.2355073330063574 - }, - "QuAC - Toxic fraction": { - "description": "min=0, mean=0.001, max=0.002, sum=0.002 (3)", - "tab": "Toxicity", - "score": 0.0006666666666666666 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "HellaSwag", - "source_data": { - "dataset_name": "HellaSwag", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on HellaSwag", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "HellaSwag - ECE (10-bin)": { - "description": "No matching runs", - "tab": "Calibration", - "score": null - }, - "HellaSwag - EM (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "HellaSwag - EM (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "HellaSwag - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "HellaSwag - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "OpenbookQA - ECE (10-bin)": { - "description": "No matching runs", - "tab": "Calibration", - "score": null - }, - "OpenbookQA - EM (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "OpenbookQA - EM (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "OpenbookQA - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "OpenbookQA - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # train": { - "description": "No matching runs", - "tab": "General 
information", - "score": null - }, - "OpenbookQA - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "TruthfulQA", - "source_data": { - "dataset_name": "TruthfulQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on TruthfulQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.185, - "details": { - "description": "min=0.18, mean=0.185, max=0.19, sum=0.555 (3)", - "tab": "Accuracy", - "TruthfulQA - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "TruthfulQA - EM (Robustness)": { - "description": "min=0.148, mean=0.151, max=0.154, sum=0.454 (3)", - "tab": "Robustness", - "score": 0.1513761467889908 - }, - "TruthfulQA - EM (Fairness)": { - "description": "min=0.147, mean=0.152, max=0.157, sum=0.456 (3)", - "tab": "Fairness", - "score": 0.15188583078491336 - }, - "TruthfulQA - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "TruthfulQA - # eval": { - "description": "min=654, mean=654, max=654, sum=1962 (3)", - "tab": "General information", - "score": 654.0 - }, - "TruthfulQA - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "TruthfulQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "TruthfulQA - # prompt tokens": { - "description": "min=501.121, mean=511.121, max=529.121, sum=1533.362 (3)", - "tab": "General information", - "score": 511.12079510703364 - }, - "TruthfulQA - # output tokens": { - "description": "min=0.998, mean=0.999, max=1, sum=2.997 (3)", - "tab": "General information", - "score": 0.998980632008155 - }, - "TruthfulQA - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MS MARCO (TREC)", - "source_data": { - "dataset_name": "MS MARCO (TREC)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "NDCG@10 on MS MARCO (TREC)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "MS MARCO (regular) - RR@10 (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "MS MARCO (TREC) - NDCG@10 (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "MS MARCO (regular) - 
RR@10 (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "MS MARCO (TREC) - NDCG@10 (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "MS MARCO (regular) - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "MS MARCO (TREC) - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "MS MARCO (regular) - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - }, - "MS MARCO (TREC) - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CNN/DailyMail", - "source_data": { - "dataset_name": "CNN/DailyMail", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on CNN/DailyMail", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.152, - "details": { - "description": "min=0.142, mean=0.152, max=0.165, sum=0.455 (3)", - "tab": "Accuracy", - "CNN/DailyMail - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "CNN/DailyMail - # eval": { - "description": "min=466, mean=466, max=466, sum=1398 (3)", - "tab": "General information", - "score": 466.0 - }, - "CNN/DailyMail - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "CNN/DailyMail - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "CNN/DailyMail - # prompt tokens": { - "description": "min=1531.586, mean=1549.919, max=1567.586, sum=4649.758 (3)", - "tab": "General information", - "score": 1549.9191702432045 - }, - "CNN/DailyMail - # output tokens": { - "description": "min=74.511, mean=83.965, max=95.704, sum=251.895 (3)", - "tab": "General information", - "score": 83.96494992846924 - }, - "CNN/DailyMail - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "CNN/DailyMail - Stereotypes (race)": { - "description": "min=0.619, mean=0.638, max=0.651, sum=1.914 (3)", - "tab": "Bias", - "score": 0.638095238095238 - }, - "CNN/DailyMail - Stereotypes (gender)": { - "description": "min=0.344, mean=0.371, max=0.398, sum=1.112 (3)", - "tab": "Bias", - "score": 0.3705770935558364 - }, - "CNN/DailyMail - Representation (race)": { - "description": "min=0.199, mean=0.258, max=0.288, sum=0.773 (3)", - "tab": "Bias", - "score": 0.2575629817009127 - }, - "CNN/DailyMail - Representation (gender)": { - "description": "min=0.108, mean=0.117, max=0.129, sum=0.351 (3)", - "tab": "Bias", - "score": 0.11691353772442492 - }, - "CNN/DailyMail - Toxic fraction": { - "description": "min=0, mean=0.002, max=0.006, sum=0.006 (3)", - "tab": "Toxicity", - "score": 0.002145922746781116 - }, - "CNN/DailyMail - SummaC": { - "description": "1 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - QAFactEval": { - "description": "1 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - BERTScore (F1)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - Coverage": { - "description": "min=0.961, mean=0.972, max=0.979, sum=2.915 (3)", - "tab": "Summarization metrics", - "score": 0.9716859203819838 - }, - "CNN/DailyMail - Density": { - "description": "min=22.383, mean=28.97, max=38.633, sum=86.91 (3)", - "tab": "Summarization metrics", - "score": 28.97014469233496 - }, - "CNN/DailyMail - Compression": { - "description": "min=6.723, mean=7.901, max=9.103, sum=23.703 (3)", - "tab": "Summarization metrics", - "score": 7.901010404629208 - }, - "CNN/DailyMail - HumanEval-faithfulness": { - "description": "1 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-relevance": { - "description": "1 matching runs, but no matching 
metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-coherence": { - "description": "1 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "XSUM", - "source_data": { - "dataset_name": "XSUM", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on XSUM", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.104, - "details": { - "description": "min=0.1, mean=0.104, max=0.106, sum=0.312 (3)", - "tab": "Accuracy", - "XSUM - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "XSUM - # eval": { - "description": "min=518, mean=518, max=518, sum=1554 (3)", - "tab": "General information", - "score": 518.0 - }, - "XSUM - # train": { - "description": "min=4.998, mean=4.999, max=5, sum=14.996 (3)", - "tab": "General information", - "score": 4.998712998712999 - }, - "XSUM - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "XSUM - # prompt tokens": { - "description": "min=1456.402, mean=1510.418, max=1538.921, sum=4531.255 (3)", - "tab": "General information", - "score": 1510.4182754182755 - }, - "XSUM - # output tokens": { - "description": "min=26.207, mean=26.632, max=27.241, sum=79.896 (3)", - "tab": "General information", - "score": 26.631917631917634 - }, - "XSUM - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "XSUM - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=2 (3)", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "XSUM - Stereotypes (gender)": { - "description": "min=0.436, mean=0.459, max=0.489, sum=1.376 (3)", - "tab": "Bias", - "score": 0.45852730200556285 - }, - "XSUM - Representation (race)": { - "description": "min=0.532, mean=0.59, max=0.667, sum=1.771 (3)", - "tab": "Bias", - "score": 0.5901750807411186 - }, - "XSUM - Representation (gender)": { - "description": "min=0.17, mean=0.187, max=0.207, sum=0.562 (3)", - "tab": "Bias", - "score": 0.18720575071822934 - }, - "XSUM - Toxic fraction": { - "description": "min=0, mean=0.001, max=0.002, sum=0.002 (3)", - "tab": "Toxicity", - "score": 0.0006435006435006435 - }, - "XSUM - SummaC": { - "description": "1 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - QAFactEval": { - "description": "1 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - BERTScore (F1)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - Coverage": { - "description": "min=0.836, mean=0.844, max=0.853, sum=2.531 (3)", - "tab": "Summarization metrics", - "score": 0.8437121246995759 - }, - "XSUM - Density": { - "description": "min=3.292, mean=3.441, max=3.518, sum=10.323 (3)", - "tab": "Summarization metrics", - "score": 3.4410181202034944 - }, - "XSUM - Compression": { - "description": "min=15.467, mean=15.707, max=15.837, sum=47.122 (3)", - "tab": "Summarization metrics", - "score": 15.707173220790708 - }, 
- "XSUM - HumanEval-faithfulness": { - "description": "1 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-relevance": { - "description": "1 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-coherence": { - "description": "1 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "IMDB", - "source_data": { - "dataset_name": "IMDB", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on IMDB", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.94, - "details": { - "description": "min=0.936, mean=0.94, max=0.946, sum=2.821 (3)", - "tab": "Accuracy", - "IMDB - ECE (10-bin)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "IMDB - EM (Robustness)": { - "description": "min=0.898, mean=0.906, max=0.916, sum=2.718 (3)", - "tab": "Robustness", - "score": 0.906 - }, - "IMDB - EM (Fairness)": { - "description": "min=0.925, mean=0.931, max=0.94, sum=2.793 (3)", - "tab": "Fairness", - "score": 0.931 - }, - "IMDB - Denoised inference time (s)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "IMDB - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "IMDB - # train": { - "description": "min=2.916, mean=4.242, max=4.986, sum=12.726 (3)", - "tab": "General information", - "score": 4.242 - }, - "IMDB - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "IMDB - # prompt tokens": { - "description": "min=1277.729, mean=1553.363, max=1768.607, sum=4660.089 (3)", - "tab": "General information", - "score": 1553.363 - }, - "IMDB - # output tokens": { - "description": "min=0.995, mean=0.997, max=0.999, sum=2.992 (3)", - "tab": "General information", - "score": 0.9973333333333333 - }, - "IMDB - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "IMDB - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CivilComments", - "source_data": { - "dataset_name": "CivilComments", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on 
CivilComments", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.555, - "details": { - "description": "min=0, mean=0.555, max=0.877, sum=29.976 (54)", - "tab": "Accuracy", - "CivilComments - ECE (10-bin)": { - "description": "9 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "CivilComments - EM (Robustness)": { - "description": "min=0, mean=0.443, max=0.774, sum=23.937 (54)", - "tab": "Robustness", - "score": 0.4432801514699601 - }, - "CivilComments - EM (Fairness)": { - "description": "min=0, mean=0.449, max=0.871, sum=24.239 (54)", - "tab": "Fairness", - "score": 0.44887663628250224 - }, - "CivilComments - Denoised inference time (s)": { - "description": "9 matching runs, but no matching metrics", - "tab": "Efficiency", - "score": null - }, - "CivilComments - # eval": { - "description": "min=74, mean=371.556, max=683, sum=20064 (54)", - "tab": "General information", - "score": 371.55555555555554 - }, - "CivilComments - # train": { - "description": "min=5, mean=5, max=5, sum=270 (54)", - "tab": "General information", - "score": 5.0 - }, - "CivilComments - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (54)", - "tab": "General information", - "score": 0.0 - }, - "CivilComments - # prompt tokens": { - "description": "min=356.537, mean=722.635, max=1267.519, sum=39022.317 (54)", - "tab": "General information", - "score": 722.6354931173206 - }, - "CivilComments - # output tokens": { - "description": "min=0, mean=0.905, max=1, sum=48.891 (54)", - "tab": "General information", - "score": 0.9053814074087929 - }, - "CivilComments - # trials": { - "description": "min=3, mean=3, max=3, sum=162 (54)", - "tab": "General information", - "score": 3.0 - }, - "CivilComments - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Toxic fraction": { - "description": "9 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "RAFT", - "source_data": { - "dataset_name": "RAFT", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on RAFT", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.652, - "details": { - "description": "min=0.275, mean=0.652, max=0.95, sum=21.5 (33)", - "tab": "Accuracy", - "RAFT - ECE (10-bin)": { - "description": "11 matching runs, but no matching metrics", - "tab": "Calibration", - "score": null - }, - "RAFT - EM (Robustness)": { - "description": "min=0.05, mean=0.518, max=0.95, sum=17.1 (33)", - "tab": "Robustness", - "score": 0.5181818181818182 - }, - "RAFT - EM (Fairness)": { - "description": "min=0.25, mean=0.618, max=0.925, sum=20.4 (33)", - "tab": "Fairness", - "score": 0.6181818181818182 - }, - "RAFT - Denoised inference time (s)": { - "description": "11 matching runs, but no matching metrics", - "tab": "Efficiency", 
- "score": null - }, - "RAFT - # eval": { - "description": "min=40, mean=40, max=40, sum=1320 (33)", - "tab": "General information", - "score": 40.0 - }, - "RAFT - # train": { - "description": "min=0, mean=4.556, max=5, sum=150.35 (33)", - "tab": "General information", - "score": 4.556060606060607 - }, - "RAFT - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (33)", - "tab": "General information", - "score": 0.0 - }, - "RAFT - # prompt tokens": { - "description": "min=257.35, mean=812.938, max=1773.675, sum=26826.95 (33)", - "tab": "General information", - "score": 812.937878787879 - }, - "RAFT - # output tokens": { - "description": "min=1, mean=2.967, max=6.15, sum=97.925 (33)", - "tab": "General information", - "score": 2.9674242424242423 - }, - "RAFT - # trials": { - "description": "min=3, mean=3, max=3, sum=99 (33)", - "tab": "General information", - "score": 3.0 - }, - "RAFT - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Toxic fraction": { - "description": "11 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_classic/yandex/YaLM-100B/f84f84a8-7191-42ac-8951-5d7141a0f700.json b/data/helm_classic/yandex/YaLM-100B/f84f84a8-7191-42ac-8951-5d7141a0f700.json deleted file mode 100644 index 61a019ad2..000000000 --- a/data/helm_classic/yandex/YaLM-100B/f84f84a8-7191-42ac-8951-5d7141a0f700.json +++ /dev/null @@ -1,1613 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_classic/yandex_YaLM-100B/1770834891.1472661", - "retrieved_timestamp": "1770834891.1472661", - "source_metadata": { - "source_name": "helm_classic", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "YaLM 100B", - "id": "yandex/YaLM-100B", - "developer": "yandex", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_classic", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperform on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.075, - "details": { - "tab": "Accuracy", - "Mean win rate - Calibration": { - "description": null, - "tab": "Calibration", - "score": 0.40175763182238666 - }, - "Mean win rate - Robustness": { - "description": null, - "tab": "Robustness", - "score": 0.20536130536130537 - }, - "Mean win rate - Fairness": { - "description": null, - "tab": "Fairness", - "score": 0.16727272727272727 - }, - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.2658333333333333 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - }, - "Mean win rate - Bias": { - "description": null, - "tab": "Bias", - "score": 
0.37929404953000706 - }, - "Mean win rate - Toxicity": { - "description": null, - "tab": "Toxicity", - "score": 0.24189051689051688 - }, - "Mean win rate - Summarization metrics": { - "description": null, - "tab": "Summarization metrics", - "score": 0.04536340852130326 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.243, - "details": { - "description": "min=0.2, mean=0.243, max=0.28, sum=3.651 (15)", - "tab": "Accuracy", - "MMLU - ECE (10-bin)": { - "description": "min=0.619, mean=0.708, max=0.769, sum=10.615 (15)", - "tab": "Calibration", - "score": 0.7076962372990694 - }, - "MMLU - EM (Robustness)": { - "description": "min=0.2, mean=0.243, max=0.28, sum=3.651 (15)", - "tab": "Robustness", - "score": 0.2433684210526316 - }, - "MMLU - EM (Fairness)": { - "description": "min=0.2, mean=0.243, max=0.28, sum=3.651 (15)", - "tab": "Fairness", - "score": 0.2433684210526316 - }, - "MMLU - Denoised inference time (s)": { - "description": "min=0.09, mean=0.143, max=0.217, sum=2.144 (15)", - "tab": "Efficiency", - "score": 0.14296402070471761 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=1542 (15)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=75 (15)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (15)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=354.96, mean=453.383, max=580.833, sum=6800.74 (15)", - "tab": "General information", - "score": 453.38266666666664 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=15 (15)", - "tab": "General information", - "score": 1.0 - }, - "MMLU - # trials": { - "description": "min=3, mean=3, max=3, sum=45 (15)", - "tab": "General information", - "score": 3.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "BoolQ", - "source_data": { - "dataset_name": "BoolQ", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on BoolQ", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.634, - "details": { - "description": "min=0.631, mean=0.634, max=0.64, sum=1.902 (3)", - "tab": "Accuracy", - "BoolQ - ECE (10-bin)": { - "description": "min=0.114, mean=0.147, max=0.167, sum=0.442 (3)", - "tab": "Calibration", - "score": 0.14717484078898194 - }, - "BoolQ - EM (Robustness)": { - "description": "min=0.437, mean=0.566, max=0.631, sum=1.698 (3)", - "tab": "Robustness", - "score": 0.566 - }, - "BoolQ - EM (Fairness)": { - "description": "min=0.486, mean=0.583, max=0.631, sum=1.748 (3)", - "tab": "Fairness", - "score": 0.5826666666666667 - }, - "BoolQ - Denoised inference time (s)": { - "description": "min=0.546, mean=0.828, max=1.136, sum=2.485 (3)", - "tab": "Efficiency", - 
"score": 0.8282727491158176 - }, - "BoolQ - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "BoolQ - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "BoolQ - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "BoolQ - # prompt tokens": { - "description": "min=649.339, mean=899.006, max=1233.339, sum=2697.017 (3)", - "tab": "General information", - "score": 899.0056666666666 - }, - "BoolQ - # output tokens": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "BoolQ - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "BoolQ - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.252, - "details": { - "description": "min=0.213, mean=0.252, max=0.297, sum=0.756 (3)", - "tab": "Accuracy", - "NarrativeQA - ECE (10-bin)": { - "description": "min=0.029, mean=0.06, max=0.101, sum=0.179 (3)", - "tab": "Calibration", - "score": 0.05960283323299867 - }, - "NarrativeQA - F1 (Robustness)": { - "description": "min=0.078, mean=0.088, max=0.096, sum=0.264 (3)", - "tab": "Robustness", - "score": 0.08788676556219112 - }, - "NarrativeQA - F1 (Fairness)": { - "description": "min=0.131, mean=0.146, max=0.169, sum=0.437 (3)", - "tab": "Fairness", - "score": 0.14573784149261218 - }, - "NarrativeQA - Denoised inference time (s)": { - "description": "min=2.158, mean=2.314, max=2.397, sum=6.943 (3)", - "tab": "Efficiency", - "score": 2.314193915889056 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=1065 (3)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=1.028, mean=1.604, max=2.008, sum=4.811 (3)", - "tab": "General information", - "score": 1.603755868544601 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=1603.569, mean=1644.878, max=1690.352, sum=4934.634 (3)", - "tab": "General information", - "score": 1644.8779342723003 - }, - "NarrativeQA - # output tokens": { - "description": "min=94.115, mean=96.018, max=98.566, sum=288.054 (3)", - "tab": "General information", - "score": 96.01784037558686 - }, - "NarrativeQA - # trials": { - "description": "min=3, mean=3, max=3, sum=9 
(3)", - "tab": "General information", - "score": 3.0 - }, - "NarrativeQA - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=2 (3)", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "NarrativeQA - Stereotypes (gender)": { - "description": "min=0.434, mean=0.449, max=0.478, sum=1.347 (3)", - "tab": "Bias", - "score": 0.449065994913171 - }, - "NarrativeQA - Representation (race)": { - "description": "min=0.429, mean=0.568, max=0.667, sum=1.703 (3)", - "tab": "Bias", - "score": 0.5676937441643325 - }, - "NarrativeQA - Representation (gender)": { - "description": "min=0.127, mean=0.177, max=0.216, sum=0.53 (3)", - "tab": "Bias", - "score": 0.17681914997964296 - }, - "NarrativeQA - Toxic fraction": { - "description": "min=0.014, mean=0.017, max=0.02, sum=0.051 (3)", - "tab": "Toxicity", - "score": 0.016901408450704227 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (open-book)", - "source_data": { - "dataset_name": "NaturalQuestions (open-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (open-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.227, - "details": { - "description": "min=0.197, mean=0.227, max=0.258, sum=0.68 (3)", - "tab": "Accuracy", - "NaturalQuestions (closed-book) - ECE (10-bin)": { - "description": "min=0.019, mean=0.02, max=0.02, sum=0.059 (3)", - "tab": "Calibration", - "score": 0.019790335675494927 - }, - "NaturalQuestions (open-book) - ECE (10-bin)": { - "description": "min=0.069, mean=0.086, max=0.12, sum=0.259 (3)", - "tab": "Calibration", - "score": 0.08637064333353452 - }, - "NaturalQuestions (closed-book) - F1 (Robustness)": { - "description": "min=0.045, mean=0.047, max=0.05, sum=0.14 (3)", - "tab": "Robustness", - "score": 0.04678550801735826 - }, - "NaturalQuestions (open-book) - F1 (Robustness)": { - "description": "min=0.111, mean=0.125, max=0.146, sum=0.375 (3)", - "tab": "Robustness", - "score": 0.12496123369617401 - }, - "NaturalQuestions (closed-book) - F1 (Fairness)": { - "description": "min=0.051, mean=0.052, max=0.053, sum=0.155 (3)", - "tab": "Fairness", - "score": 0.0516362934670568 - }, - "NaturalQuestions (open-book) - F1 (Fairness)": { - "description": "min=0.15, mean=0.177, max=0.207, sum=0.53 (3)", - "tab": "Fairness", - "score": 0.1768275232054711 - }, - "NaturalQuestions (closed-book) - Denoised inference time (s)": { - "description": "min=2.669, mean=2.722, max=2.827, sum=8.167 (3)", - "tab": "Efficiency", - "score": 2.7221932611479644 - }, - "NaturalQuestions (open-book) - Denoised inference time (s)": { - "description": "min=4.373, mean=4.463, max=4.531, sum=13.389 (3)", - "tab": "Efficiency", - "score": 4.463013303365339 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": 
"min=108.201, mean=111.534, max=117.201, sum=334.603 (3)", - "tab": "General information", - "score": 111.53433333333332 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=298.545, mean=299.515, max=300, sum=898.545 (3)", - "tab": "General information", - "score": 299.51500000000004 - }, - "NaturalQuestions (closed-book) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.669, mean=4.702, max=4.738, sum=14.107 (3)", - "tab": "General information", - "score": 4.702333333333333 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.038, mean=0.038, max=0.038, sum=0.114 (3)", - "tab": "General information", - "score": 0.038 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1218.159, mean=1409.24, max=1510.891, sum=4227.721 (3)", - "tab": "General information", - "score": 1409.2403333333332 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=289.149, mean=291.572, max=293.886, sum=874.715 (3)", - "tab": "General information", - "score": 291.57166666666666 - }, - "NaturalQuestions (open-book) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NaturalQuestions (closed-book) - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=2 (3)", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "NaturalQuestions (closed-book) - Stereotypes (gender)": { - "description": "min=0.433, mean=0.478, max=0.5, sum=1.433 (3)", - "tab": "Bias", - "score": 0.4776758409785933 - }, - "NaturalQuestions (closed-book) - Representation (race)": { - "description": "min=0.324, mean=0.327, max=0.33, sum=0.982 (3)", - "tab": "Bias", - "score": 0.3274145329078469 - }, - "NaturalQuestions (closed-book) - Representation (gender)": { - "description": "min=0.014, mean=0.168, max=0.277, sum=0.504 (3)", - "tab": "Bias", - "score": 0.16816448651008897 - }, - "NaturalQuestions (open-book) - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=2 (3)", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "NaturalQuestions (open-book) - Stereotypes (gender)": { - "description": "min=0.5, mean=0.5, max=0.5, sum=0.5 (1)", - "tab": "Bias", - "score": 0.5 - }, - "NaturalQuestions (open-book) - Representation (race)": { - "description": "min=0.204, mean=0.385, max=0.523, sum=1.154 (3)", - "tab": "Bias", - "score": 0.38473904949347787 - }, - "NaturalQuestions (open-book) - Representation (gender)": { - "description": "min=0.102, mean=0.175, max=0.25, sum=0.526 (3)", - "tab": "Bias", - "score": 0.17544176986611967 - }, - "NaturalQuestions (closed-book) - Toxic fraction": { - "description": "min=0.007, mean=0.008, max=0.009, sum=0.024 (3)", - "tab": "Toxicity", - "score": 0.008 - }, - "NaturalQuestions (open-book) - Toxic fraction": { - "description": "min=0.003, mean=0.003, max=0.003, sum=0.009 (3)", - "tab": "Toxicity", - "score": 0.0030000000000000005 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "QuAC", - "source_data": { - "dataset_name": "QuAC", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on QuAC", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.162, - "details": { - "description": "min=0.156, mean=0.162, max=0.172, sum=0.485 (3)", - "tab": "Accuracy", - "QuAC - ECE (10-bin)": { - "description": "min=0.012, mean=0.029, max=0.039, sum=0.087 (3)", - "tab": "Calibration", - "score": 0.028959032200530792 - }, - "QuAC - F1 (Robustness)": { - "description": "min=0.077, mean=0.08, max=0.082, sum=0.239 (3)", - "tab": "Robustness", - "score": 0.0795025876916194 - }, - "QuAC - F1 (Fairness)": { - "description": "min=0.092, mean=0.1, max=0.108, sum=0.301 (3)", - "tab": "Fairness", - "score": 0.10047785618783804 - }, - "QuAC - Denoised inference time (s)": { - "description": "min=2.259, mean=2.278, max=2.297, sum=6.834 (3)", - "tab": "Efficiency", - "score": 2.278147567048529 - }, - "QuAC - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "QuAC - # train": { - "description": "min=0.841, mean=0.951, max=1.111, sum=2.853 (3)", - "tab": "General information", - "score": 0.951 - }, - "QuAC - truncated": { - "description": "min=0.016, mean=0.016, max=0.016, sum=0.048 (3)", - "tab": "General information", - "score": 0.016 - }, - "QuAC - # prompt tokens": { - "description": "min=1630.348, mean=1646.729, max=1667.958, sum=4940.188 (3)", - "tab": "General information", - "score": 1646.7293333333334 - }, - "QuAC - # output tokens": { - "description": "min=99.146, mean=99.146, max=99.146, sum=297.438 (3)", - "tab": "General information", - "score": 99.146 - }, - "QuAC - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "QuAC - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=2 (3)", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "QuAC - Stereotypes (gender)": { - "description": "min=0.44, mean=0.454, max=0.465, sum=1.363 (3)", - "tab": "Bias", - "score": 0.4543925551127126 - }, - "QuAC - Representation (race)": { - "description": "min=0.312, mean=0.465, max=0.582, sum=1.396 (3)", - "tab": "Bias", - "score": 0.4653480174056855 - }, - "QuAC - Representation (gender)": { - "description": "min=0.335, mean=0.343, max=0.358, sum=1.029 (3)", - "tab": "Bias", - "score": 0.3431307584494557 - }, - "QuAC - Toxic fraction": { - "description": "min=0, mean=0.001, max=0.003, sum=0.003 (3)", - "tab": "Toxicity", - "score": 0.001 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "HellaSwag", - "source_data": { - "dataset_name": "HellaSwag", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on HellaSwag", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "HellaSwag - ECE (10-bin)": { - "description": "No matching runs", - "tab": "Calibration", - "score": null - }, - "HellaSwag - EM (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "HellaSwag - EM 
(Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "HellaSwag - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "HellaSwag - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "OpenbookQA - ECE (10-bin)": { - "description": "No matching runs", - "tab": "Calibration", - "score": null - }, - "OpenbookQA - EM (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "OpenbookQA - EM (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "OpenbookQA - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "OpenbookQA - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "TruthfulQA", - "source_data": { - "dataset_name": "TruthfulQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on TruthfulQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.202, - "details": { - "description": "min=0.197, mean=0.202, max=0.203, sum=0.807 (4)", - "tab": "Accuracy", - "TruthfulQA - ECE (10-bin)": { - "description": "min=0.621, mean=0.679, max=0.751, sum=2.716 (4)", - "tab": "Calibration", - "score": 0.6789622806094777 - }, - "TruthfulQA - EM 
(Robustness)": { - "description": "min=0.197, mean=0.202, max=0.203, sum=0.807 (4)", - "tab": "Robustness", - "score": 0.2018348623853211 - }, - "TruthfulQA - EM (Fairness)": { - "description": "min=0.197, mean=0.202, max=0.203, sum=0.807 (4)", - "tab": "Fairness", - "score": 0.2018348623853211 - }, - "TruthfulQA - Denoised inference time (s)": { - "description": "min=0.058, mean=0.092, max=0.136, sum=0.37 (4)", - "tab": "Efficiency", - "score": 0.09243018414244196 - }, - "TruthfulQA - # eval": { - "description": "min=654, mean=654, max=654, sum=2616 (4)", - "tab": "General information", - "score": 654.0 - }, - "TruthfulQA - # train": { - "description": "min=0, mean=3.75, max=5, sum=15 (4)", - "tab": "General information", - "score": 3.75 - }, - "TruthfulQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (4)", - "tab": "General information", - "score": 0.0 - }, - "TruthfulQA - # prompt tokens": { - "description": "min=85.664, mean=405.414, max=531.664, sum=1621.654 (4)", - "tab": "General information", - "score": 405.41360856269114 - }, - "TruthfulQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=4 (4)", - "tab": "General information", - "score": 1.0 - }, - "TruthfulQA - # trials": { - "description": "min=1, mean=2.5, max=3, sum=10 (4)", - "tab": "General information", - "score": 2.5 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MS MARCO (TREC)", - "source_data": { - "dataset_name": "MS MARCO (TREC)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "NDCG@10 on MS MARCO (TREC)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "MS MARCO (regular) - RR@10 (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "MS MARCO (TREC) - NDCG@10 (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "MS MARCO (regular) - RR@10 (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "MS MARCO (TREC) - NDCG@10 (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "MS MARCO (regular) - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "MS MARCO (TREC) - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "MS MARCO (regular) - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # eval": { - 
"description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - }, - "MS MARCO (TREC) - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CNN/DailyMail", - "source_data": { - "dataset_name": "CNN/DailyMail", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on CNN/DailyMail", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.017, - "details": { - "description": "min=0.009, mean=0.017, max=0.022, sum=0.103 (6)", - "tab": "Accuracy", - "CNN/DailyMail - Denoised inference time (s)": { - "description": "min=2.334, mean=2.346, max=2.352, sum=14.074 (6)", - "tab": "Efficiency", - "score": 2.3457143735281405 - }, - "CNN/DailyMail - # eval": { - "description": "min=466, mean=466, max=466, sum=2796 (6)", - "tab": "General information", - "score": 466.0 - }, - "CNN/DailyMail - # train": { - "description": "min=5, mean=5, max=5, sum=30 (6)", - "tab": "General information", - "score": 5.0 - }, - "CNN/DailyMail - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (6)", - "tab": "General information", - "score": 0.0 - }, - "CNN/DailyMail - # prompt tokens": { - "description": "min=1536.099, mean=1544.765, max=1562.099, sum=9268.592 (6)", - "tab": "General information", - "score": 1544.7653791130188 - }, - "CNN/DailyMail - # output tokens": { - "description": "min=90.71, mean=102.407, max=108.32, sum=614.442 (6)", - "tab": "General information", - "score": 102.40701001430614 - }, - 
"CNN/DailyMail - # trials": { - "description": "min=3, mean=3, max=3, sum=18 (6)", - "tab": "General information", - "score": 3.0 - }, - "CNN/DailyMail - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=4 (6)", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "CNN/DailyMail - Stereotypes (gender)": { - "description": "min=0.406, mean=0.42, max=0.438, sum=2.518 (6)", - "tab": "Bias", - "score": 0.4196869049681346 - }, - "CNN/DailyMail - Representation (race)": { - "description": "min=0.429, mean=0.588, max=0.667, sum=3.525 (6)", - "tab": "Bias", - "score": 0.5875706214689266 - }, - "CNN/DailyMail - Representation (gender)": { - "description": "min=0.171, mean=0.206, max=0.237, sum=1.238 (6)", - "tab": "Bias", - "score": 0.20635612913269732 - }, - "CNN/DailyMail - Toxic fraction": { - "description": "min=0, mean=0.001, max=0.002, sum=0.004 (6)", - "tab": "Toxicity", - "score": 0.000715307582260372 - }, - "CNN/DailyMail - SummaC": { - "description": "min=-0.35, mean=-0.322, max=-0.296, sum=-0.965 (3)", - "tab": "Summarization metrics", - "score": -0.3217409663792838 - }, - "CNN/DailyMail - QAFactEval": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - BERTScore (F1)": { - "description": "min=-0.154, mean=-0.145, max=-0.127, sum=-0.435 (3)", - "tab": "Summarization metrics", - "score": -0.14496527560996572 - }, - "CNN/DailyMail - Coverage": { - "description": "min=0.406, mean=0.541, max=0.615, sum=3.249 (6)", - "tab": "Summarization metrics", - "score": 0.5414806522156069 - }, - "CNN/DailyMail - Density": { - "description": "min=0.681, mean=1.09, max=1.303, sum=6.541 (6)", - "tab": "Summarization metrics", - "score": 1.0902141864760964 - }, - "CNN/DailyMail - Compression": { - "description": "min=6.289, mean=6.936, max=8.148, sum=41.615 (6)", - "tab": "Summarization metrics", - "score": 6.935882429972025 - }, - "CNN/DailyMail - HumanEval-faithfulness": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-relevance": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - HumanEval-coherence": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "XSUM", - "source_data": { - "dataset_name": "XSUM", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on XSUM", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.021, - "details": { - "description": "min=0.019, mean=0.021, max=0.022, sum=0.124 (6)", - "tab": "Accuracy", - "XSUM - Denoised inference time (s)": { - "description": "min=1.653, mean=1.671, max=1.681, sum=10.028 (6)", - "tab": "Efficiency", - "score": 1.6713877910966286 - }, - "XSUM - # eval": { - "description": "min=518, mean=518, max=518, sum=3108 (6)", - "tab": "General information", - "score": 518.0 - }, - "XSUM - # train": { - "description": "min=5, mean=5, max=5, sum=30 (6)", - "tab": "General information", - "score": 5.0 - }, - "XSUM - truncated": { - "description": "min=0, mean=0, max=0, 
sum=0 (6)", - "tab": "General information", - "score": 0.0 - }, - "XSUM - # prompt tokens": { - "description": "min=1452.164, mean=1507.497, max=1536.164, sum=9044.985 (6)", - "tab": "General information", - "score": 1507.497425997426 - }, - "XSUM - # output tokens": { - "description": "min=46.541, mean=49.401, max=51.544, sum=296.405 (6)", - "tab": "General information", - "score": 49.4009009009009 - }, - "XSUM - # trials": { - "description": "min=3, mean=3, max=3, sum=18 (6)", - "tab": "General information", - "score": 3.0 - }, - "XSUM - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=4 (6)", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "XSUM - Stereotypes (gender)": { - "description": "min=0.434, mean=0.442, max=0.456, sum=2.652 (6)", - "tab": "Bias", - "score": 0.4419820754826329 - }, - "XSUM - Representation (race)": { - "description": "min=0.333, mean=0.501, max=0.595, sum=3.009 (6)", - "tab": "Bias", - "score": 0.5014430014430014 - }, - "XSUM - Representation (gender)": { - "description": "min=0.209, mean=0.248, max=0.286, sum=1.485 (6)", - "tab": "Bias", - "score": 0.24754799603959324 - }, - "XSUM - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (6)", - "tab": "Toxicity", - "score": 0.0 - }, - "XSUM - SummaC": { - "description": "min=-0.352, mean=-0.347, max=-0.344, sum=-1.04 (3)", - "tab": "Summarization metrics", - "score": -0.3466731809697447 - }, - "XSUM - QAFactEval": { - "description": "min=0.856, mean=1.176, max=1.555, sum=7.058 (6)", - "tab": "Summarization metrics", - "score": 1.1763058409064706 - }, - "XSUM - BERTScore (F1)": { - "description": "min=0.007, mean=0.031, max=0.057, sum=0.093 (3)", - "tab": "Summarization metrics", - "score": 0.031129963643441894 - }, - "XSUM - Coverage": { - "description": "min=0.557, mean=0.567, max=0.574, sum=3.405 (6)", - "tab": "Summarization metrics", - "score": 0.5674251187038739 - }, - "XSUM - Density": { - "description": "min=1.005, mean=1.041, max=1.081, sum=6.248 (6)", - "tab": "Summarization metrics", - "score": 1.0413571284332044 - }, - "XSUM - Compression": { - "description": "min=9.397, mean=9.951, max=10.96, sum=59.706 (6)", - "tab": "Summarization metrics", - "score": 9.951019350255967 - }, - "XSUM - HumanEval-faithfulness": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-relevance": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - HumanEval-coherence": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "IMDB", - "source_data": { - "dataset_name": "IMDB", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on IMDB", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.836, - "details": { - "description": "min=0.776, mean=0.836, max=0.876, sum=2.509 (3)", - "tab": "Accuracy", - "IMDB - ECE (10-bin)": { - "description": "min=0.369, mean=0.418, max=0.496, sum=1.255 (3)", - "tab": "Calibration", - "score": 0.41834259640752514 - }, - "IMDB - EM (Robustness)": { - "description": "min=0.578, mean=0.719, max=0.79, 
sum=2.158 (3)", - "tab": "Robustness", - "score": 0.7193333333333333 - }, - "IMDB - EM (Fairness)": { - "description": "min=0.709, mean=0.8, max=0.853, sum=2.4 (3)", - "tab": "Fairness", - "score": 0.7999999999999999 - }, - "IMDB - Denoised inference time (s)": { - "description": "min=1.076, mean=1.137, max=1.23, sum=3.41 (3)", - "tab": "Efficiency", - "score": 1.1365543731623833 - }, - "IMDB - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "IMDB - # train": { - "description": "min=4.845, mean=4.929, max=4.982, sum=14.788 (3)", - "tab": "General information", - "score": 4.929333333333333 - }, - "IMDB - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "IMDB - # prompt tokens": { - "description": "min=1161.789, mean=1402.276, max=1747.837, sum=4206.828 (3)", - "tab": "General information", - "score": 1402.2759999999998 - }, - "IMDB - # output tokens": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "IMDB - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "IMDB - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CivilComments", - "source_data": { - "dataset_name": "CivilComments", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on CivilComments", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.49, - "details": { - "description": "min=0, mean=0.49, max=1, sum=26.448 (54)", - "tab": "Accuracy", - "CivilComments - ECE (10-bin)": { - "description": "min=0.108, mean=0.437, max=0.784, sum=23.581 (54)", - "tab": "Calibration", - "score": 0.43669079652569004 - }, - "CivilComments - EM (Robustness)": { - "description": "min=0, mean=0.463, max=1, sum=25.008 (54)", - "tab": "Robustness", - "score": 0.4631081891632545 - }, - "CivilComments - EM (Fairness)": { - "description": "min=0, mean=0.456, max=0.998, sum=24.603 (54)", - "tab": "Fairness", - "score": 0.4556089334763174 - }, - "CivilComments - Denoised inference time (s)": { - "description": "min=0.291, mean=0.41, max=0.737, sum=22.139 (54)", - "tab": "Efficiency", - "score": 0.4099806397254133 - }, - "CivilComments - # eval": { - "description": "min=74, mean=371.556, max=683, sum=20064 (54)", - "tab": "General information", - "score": 371.55555555555554 - }, - "CivilComments - # train": { - "description": "min=5, mean=5, max=5, sum=270 (54)", - "tab": "General information", - "score": 5.0 - }, - "CivilComments - truncated": { - 
"description": "min=0, mean=0, max=0, sum=0 (54)", - "tab": "General information", - "score": 0.0 - }, - "CivilComments - # prompt tokens": { - "description": "min=365, mean=729.671, max=1285.924, sum=39402.252 (54)", - "tab": "General information", - "score": 729.6713289334527 - }, - "CivilComments - # output tokens": { - "description": "min=5, mean=5, max=5, sum=270 (54)", - "tab": "General information", - "score": 5.0 - }, - "CivilComments - # trials": { - "description": "min=3, mean=3, max=3, sum=162 (54)", - "tab": "General information", - "score": 3.0 - }, - "CivilComments - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (54)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "RAFT", - "source_data": { - "dataset_name": "RAFT", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on RAFT", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.395, - "details": { - "description": "min=0, mean=0.395, max=0.975, sum=13.05 (33)", - "tab": "Accuracy", - "RAFT - ECE (10-bin)": { - "description": "min=0.011, mean=0.278, max=0.881, sum=9.176 (33)", - "tab": "Calibration", - "score": 0.2780574023642052 - }, - "RAFT - EM (Robustness)": { - "description": "min=0, mean=0.211, max=0.65, sum=6.975 (33)", - "tab": "Robustness", - "score": 0.21136363636363636 - }, - "RAFT - EM (Fairness)": { - "description": "min=0, mean=0.342, max=0.975, sum=11.3 (33)", - "tab": "Fairness", - "score": 0.3424242424242424 - }, - "RAFT - Denoised inference time (s)": { - "description": "min=0.132, mean=0.89, max=1.838, sum=29.385 (33)", - "tab": "Efficiency", - "score": 0.8904544346562409 - }, - "RAFT - # eval": { - "description": "min=40, mean=40, max=40, sum=1320 (33)", - "tab": "General information", - "score": 40.0 - }, - "RAFT - # train": { - "description": "min=0, mean=4.562, max=5, sum=150.55 (33)", - "tab": "General information", - "score": 4.5621212121212125 - }, - "RAFT - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (33)", - "tab": "General information", - "score": 0.0 - }, - "RAFT - # prompt tokens": { - "description": "min=255.875, mean=784.961, max=1758.075, sum=25903.725 (33)", - "tab": "General information", - "score": 784.9613636363637 - }, - "RAFT - # output tokens": { - "description": "min=5, mean=13.615, max=30, sum=449.3 (33)", - "tab": "General information", - "score": 13.615151515151515 - }, - "RAFT - # trials": { - "description": "min=3, mean=3, max=3, sum=99 (33)", - "tab": "General information", - "score": 3.0 - }, - "RAFT - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation 
(gender)": { - "description": "min=0.5, mean=0.5, max=0.5, sum=0.5 (1)", - "tab": "Bias", - "score": 0.5 - }, - "RAFT - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (33)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_classic/zhipu-ai/GLM-130B/9ba74767-b675-460a-bb68-e82adb6acd2f.json b/data/helm_classic/zhipu-ai/GLM-130B/9ba74767-b675-460a-bb68-e82adb6acd2f.json deleted file mode 100644 index 04bdfa490..000000000 --- a/data/helm_classic/zhipu-ai/GLM-130B/9ba74767-b675-460a-bb68-e82adb6acd2f.json +++ /dev/null @@ -1,1613 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_classic/zhipu-ai_GLM-130B/1770834891.1472661", - "retrieved_timestamp": "1770834891.1472661", - "source_metadata": { - "source_name": "helm_classic", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "GLM 130B", - "id": "zhipu-ai/GLM-130B", - "developer": "zhipu-ai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_classic", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperform on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.512, - "details": { - "tab": "Accuracy", - "Mean win rate - Calibration": { - "description": null, - "tab": "Calibration", - "score": 0.6523126734505088 - }, - "Mean win rate - Robustness": { - "description": null, - "tab": "Robustness", - "score": 0.6465501165501165 - }, - "Mean win rate - Fairness": { - "description": null, - "tab": "Fairness", - "score": 0.5133566433566433 - }, - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.1511111111111111 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - }, - "Mean win rate - Bias": { - "description": null, - "tab": "Bias", - "score": 0.45074793034678545 - }, - "Mean win rate - Toxicity": { - "description": null, - "tab": "Toxicity", - "score": 0.3347137430470764 - }, - "Mean win rate - Summarization metrics": { - "description": null, - "tab": "Summarization metrics", - "score": 0.4714285714285714 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.344, - "details": { - "description": "min=0.23, mean=0.344, max=0.47, sum=5.16 (15)", - "tab": "Accuracy", - "MMLU - ECE (10-bin)": { - "description": "min=0.075, mean=0.128, max=0.196, sum=1.914 (15)", - "tab": "Calibration", - "score": 0.12760096192658882 - }, - "MMLU - EM (Robustness)": { - "description": "min=0.17, mean=0.32, max=0.44, sum=4.806 (15)", - "tab": "Robustness", - "score": 0.3203859649122807 - 
}, - "MMLU - EM (Fairness)": { - "description": "min=0.22, mean=0.315, max=0.43, sum=4.723 (15)", - "tab": "Fairness", - "score": 0.3148771929824561 - }, - "MMLU - Denoised inference time (s)": { - "description": "min=0.194, mean=0.335, max=0.546, sum=5.029 (15)", - "tab": "Efficiency", - "score": 0.33523606010994367 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=1542 (15)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=75 (15)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (15)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=354.52, mean=460.637, max=611.877, sum=6909.562 (15)", - "tab": "General information", - "score": 460.63743859649117 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=15 (15)", - "tab": "General information", - "score": 1.0 - }, - "MMLU - # trials": { - "description": "min=3, mean=3, max=3, sum=45 (15)", - "tab": "General information", - "score": 3.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "BoolQ", - "source_data": { - "dataset_name": "BoolQ", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on BoolQ", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.784, - "details": { - "description": "min=0.729, mean=0.784, max=0.819, sum=2.351 (3)", - "tab": "Accuracy", - "BoolQ - ECE (10-bin)": { - "description": "min=0.111, mean=0.171, max=0.205, sum=0.513 (3)", - "tab": "Calibration", - "score": 0.1710477879835662 - }, - "BoolQ - EM (Robustness)": { - "description": "min=0.68, mean=0.728, max=0.758, sum=2.183 (3)", - "tab": "Robustness", - "score": 0.7276666666666668 - }, - "BoolQ - EM (Fairness)": { - "description": "min=0.625, mean=0.69, max=0.722, sum=2.069 (3)", - "tab": "Fairness", - "score": 0.6896666666666667 - }, - "BoolQ - Denoised inference time (s)": { - "description": "min=0.942, mean=1.191, max=1.332, sum=3.574 (3)", - "tab": "Efficiency", - "score": 1.1913305165274586 - }, - "BoolQ - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "BoolQ - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "BoolQ - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "BoolQ - # prompt tokens": { - "description": "min=679.091, mean=931.424, max=1276.091, sum=2794.273 (3)", - "tab": "General information", - "score": 931.4243333333333 - }, - "BoolQ - # output tokens": { - "description": "min=2, mean=2, max=2, sum=6 (3)", - "tab": "General information", - "score": 2.0 - }, - "BoolQ - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "BoolQ - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - 
Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "BoolQ - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.706, - "details": { - "description": "min=0.655, mean=0.706, max=0.736, sum=2.118 (3)", - "tab": "Accuracy", - "NarrativeQA - ECE (10-bin)": { - "description": "min=0.027, mean=0.037, max=0.058, sum=0.112 (3)", - "tab": "Calibration", - "score": 0.03732324115716399 - }, - "NarrativeQA - F1 (Robustness)": { - "description": "min=0.531, mean=0.629, max=0.682, sum=1.888 (3)", - "tab": "Robustness", - "score": 0.6293880948208791 - }, - "NarrativeQA - F1 (Fairness)": { - "description": "min=0.55, mean=0.615, max=0.656, sum=1.846 (3)", - "tab": "Fairness", - "score": 0.6154230898629193 - }, - "NarrativeQA - Denoised inference time (s)": { - "description": "min=1.78, mean=2.315, max=3.197, sum=6.946 (3)", - "tab": "Efficiency", - "score": 2.3151894005635367 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=1065 (3)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=1.101, mean=1.675, max=2.11, sum=5.025 (3)", - "tab": "General information", - "score": 1.6751173708920186 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=1597.372, mean=1658.811, max=1711.876, sum=4976.434 (3)", - "tab": "General information", - "score": 1658.8112676056337 - }, - "NarrativeQA - # output tokens": { - "description": "min=6.008, mean=9.939, max=17.439, sum=29.817 (3)", - "tab": "General information", - "score": 9.938967136150234 - }, - "NarrativeQA - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NarrativeQA - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NarrativeQA - Stereotypes (gender)": { - "description": "min=0.365, mean=0.372, max=0.375, sum=1.115 (3)", - "tab": "Bias", - "score": 0.3717948717948718 - }, - "NarrativeQA - Representation (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=0.667 (1)", - "tab": "Bias", - "score": 0.6666666666666667 - }, - "NarrativeQA - Representation (gender)": { - "description": "min=0.168, mean=0.19, max=0.215, sum=0.569 (3)", - "tab": "Bias", - "score": 0.1896318370894642 - }, - "NarrativeQA - Toxic fraction": { - "description": "min=0.011, mean=0.012, max=0.014, sum=0.037 (3)", - "tab": "Toxicity", - "score": 0.012206572769953052 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (open-book)", - "source_data": { - "dataset_name": "NaturalQuestions (open-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - 
"metric_config": { - "evaluation_description": "F1 on NaturalQuestions (open-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.642, - "details": { - "description": "min=0.639, mean=0.642, max=0.649, sum=1.927 (3)", - "tab": "Accuracy", - "NaturalQuestions (closed-book) - ECE (10-bin)": { - "description": "min=0.02, mean=0.022, max=0.023, sum=0.065 (3)", - "tab": "Calibration", - "score": 0.021760896948719733 - }, - "NaturalQuestions (open-book) - ECE (10-bin)": { - "description": "min=0.071, mean=0.076, max=0.082, sum=0.228 (3)", - "tab": "Calibration", - "score": 0.07592608066404687 - }, - "NaturalQuestions (closed-book) - F1 (Robustness)": { - "description": "min=0.11, mean=0.117, max=0.122, sum=0.35 (3)", - "tab": "Robustness", - "score": 0.11665134142344884 - }, - "NaturalQuestions (open-book) - F1 (Robustness)": { - "description": "min=0.592, mean=0.6, max=0.608, sum=1.8 (3)", - "tab": "Robustness", - "score": 0.5998399895408899 - }, - "NaturalQuestions (closed-book) - F1 (Fairness)": { - "description": "min=0.112, mean=0.12, max=0.124, sum=0.361 (3)", - "tab": "Fairness", - "score": 0.12026039507733897 - }, - "NaturalQuestions (open-book) - F1 (Fairness)": { - "description": "min=0.592, mean=0.597, max=0.603, sum=1.79 (3)", - "tab": "Fairness", - "score": 0.5967933879081116 - }, - "NaturalQuestions (closed-book) - Denoised inference time (s)": { - "description": "min=0.822, mean=0.953, max=1.045, sum=2.859 (3)", - "tab": "Efficiency", - "score": 0.9528701016867446 - }, - "NaturalQuestions (open-book) - Denoised inference time (s)": { - "description": "min=2.251, mean=2.369, max=2.58, sum=7.108 (3)", - "tab": "Efficiency", - "score": 2.3693331199589207 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=15 (3)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=121.658, mean=122.991, max=125.658, sum=368.974 (3)", - "tab": "General information", - "score": 122.99133333333333 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=6.22, mean=6.707, max=7.262, sum=20.12 (3)", - "tab": "General information", - "score": 6.706666666666667 - }, - "NaturalQuestions (closed-book) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.505, mean=4.631, max=4.705, sum=13.892 (3)", - "tab": "General information", - "score": 4.630666666666667 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.042, mean=0.047, max=0.056, sum=0.14 (3)", - "tab": "General information", - "score": 0.04666666666666667 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1340.319, mean=1502.677, max=1625.084, sum=4508.03 (3)", - "tab": "General information", - "score": 1502.676666666667 - }, - "NaturalQuestions 
(open-book) - # output tokens": { - "description": "min=19.342, mean=21.064, max=23.914, sum=63.193 (3)", - "tab": "General information", - "score": 21.064333333333334 - }, - "NaturalQuestions (open-book) - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "NaturalQuestions (closed-book) - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "NaturalQuestions (closed-book) - Stereotypes (gender)": { - "description": "min=0.5, mean=0.5, max=0.5, sum=0.5 (1)", - "tab": "Bias", - "score": 0.5 - }, - "NaturalQuestions (closed-book) - Representation (race)": { - "description": "min=0.121, mean=0.269, max=0.393, sum=0.807 (3)", - "tab": "Bias", - "score": 0.2689924681892553 - }, - "NaturalQuestions (closed-book) - Representation (gender)": { - "description": "min=0.038, mean=0.059, max=0.083, sum=0.177 (3)", - "tab": "Bias", - "score": 0.05911680911680913 - }, - "NaturalQuestions (open-book) - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=2 (3)", - "tab": "Bias", - "score": 0.6666666666666666 - }, - "NaturalQuestions (open-book) - Stereotypes (gender)": { - "description": "min=0.5, mean=0.5, max=0.5, sum=1.5 (3)", - "tab": "Bias", - "score": 0.5 - }, - "NaturalQuestions (open-book) - Representation (race)": { - "description": "min=0.571, mean=0.585, max=0.598, sum=1.754 (3)", - "tab": "Bias", - "score": 0.584615044473471 - }, - "NaturalQuestions (open-book) - Representation (gender)": { - "description": "min=0.068, mean=0.073, max=0.079, sum=0.22 (3)", - "tab": "Bias", - "score": 0.07328275644065117 - }, - "NaturalQuestions (closed-book) - Toxic fraction": { - "description": "min=0, mean=0.001, max=0.002, sum=0.003 (3)", - "tab": "Toxicity", - "score": 0.001 - }, - "NaturalQuestions (open-book) - Toxic fraction": { - "description": "min=0.001, mean=0.002, max=0.002, sum=0.005 (3)", - "tab": "Toxicity", - "score": 0.0016666666666666668 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "QuAC", - "source_data": { - "dataset_name": "QuAC", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on QuAC", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.272, - "details": { - "description": "min=0.23, mean=0.272, max=0.297, sum=0.815 (3)", - "tab": "Accuracy", - "QuAC - ECE (10-bin)": { - "description": "min=0.012, mean=0.027, max=0.043, sum=0.082 (3)", - "tab": "Calibration", - "score": 0.02731272826999052 - }, - "QuAC - F1 (Robustness)": { - "description": "min=0.178, mean=0.193, max=0.202, sum=0.579 (3)", - "tab": "Robustness", - "score": 0.19293634470384977 - }, - "QuAC - F1 (Fairness)": { - "description": "min=0.173, mean=0.205, max=0.225, sum=0.616 (3)", - "tab": "Fairness", - "score": 0.20535008777735797 - }, - "QuAC - Denoised inference time (s)": { - "description": "min=4.186, mean=4.219, max=4.235, sum=12.656 (3)", - "tab": "Efficiency", - "score": 4.218568385192325 - }, - "QuAC - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "QuAC - # train": { - "description": "min=0.823, mean=0.874, max=0.929, sum=2.622 (3)", - "tab": "General information", - "score": 0.874 - }, - "QuAC - truncated": { - 
"description": "min=0.094, mean=0.134, max=0.177, sum=0.401 (3)", - "tab": "General information", - "score": 0.13366666666666668 - }, - "QuAC - # prompt tokens": { - "description": "min=1621.422, mean=1651.972, max=1668.212, sum=4955.915 (3)", - "tab": "General information", - "score": 1651.9716666666666 - }, - "QuAC - # output tokens": { - "description": "min=65.116, mean=73.565, max=88.524, sum=220.696 (3)", - "tab": "General information", - "score": 73.56533333333333 - }, - "QuAC - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", - "score": 3.0 - }, - "QuAC - Stereotypes (race)": { - "description": "min=0.604, mean=0.62, max=0.642, sum=1.86 (3)", - "tab": "Bias", - "score": 0.6201234839116704 - }, - "QuAC - Stereotypes (gender)": { - "description": "min=0.411, mean=0.431, max=0.451, sum=1.294 (3)", - "tab": "Bias", - "score": 0.43137624831417937 - }, - "QuAC - Representation (race)": { - "description": "min=0.372, mean=0.408, max=0.45, sum=1.224 (3)", - "tab": "Bias", - "score": 0.40815960651383004 - }, - "QuAC - Representation (gender)": { - "description": "min=0.259, mean=0.268, max=0.282, sum=0.803 (3)", - "tab": "Bias", - "score": 0.2675064821442643 - }, - "QuAC - Toxic fraction": { - "description": "min=0, mean=0.001, max=0.001, sum=0.002 (3)", - "tab": "Toxicity", - "score": 0.0006666666666666666 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "HellaSwag", - "source_data": { - "dataset_name": "HellaSwag", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on HellaSwag", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "HellaSwag - ECE (10-bin)": { - "description": "No matching runs", - "tab": "Calibration", - "score": null - }, - "HellaSwag - EM (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "HellaSwag - EM (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "HellaSwag - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "HellaSwag - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "HellaSwag - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "OpenbookQA - ECE (10-bin)": { - "description": "No matching runs", - "tab": "Calibration", - "score": null - }, - "OpenbookQA - EM (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "OpenbookQA - EM (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "OpenbookQA - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "OpenbookQA - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "OpenbookQA - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "TruthfulQA", - "source_data": { - "dataset_name": "TruthfulQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on TruthfulQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.218, - "details": { - "description": "min=0.185, mean=0.218, max=0.232, sum=0.873 (4)", - "tab": "Accuracy", - "TruthfulQA - ECE (10-bin)": { - "description": "min=0.04, mean=0.088, max=0.12, sum=0.351 (4)", - "tab": "Calibration", - "score": 0.08770199071414088 - }, - "TruthfulQA - EM (Robustness)": { - "description": "min=0.147, mean=0.196, max=0.229, sum=0.784 (4)", - "tab": "Robustness", - "score": 0.19610091743119268 - }, - "TruthfulQA - EM (Fairness)": { - "description": "min=0.148, mean=0.192, max=0.229, sum=0.766 (4)", - "tab": "Fairness", - "score": 0.1915137614678899 - }, - "TruthfulQA - Denoised inference time (s)": { - "description": "min=0.069, mean=0.158, max=0.193, sum=0.633 (4)", - "tab": "Efficiency", - "score": 0.15830796687302695 - }, - "TruthfulQA - # eval": { - "description": "min=654, mean=654, max=654, sum=2616 (4)", - "tab": "General information", - "score": 654.0 - }, - "TruthfulQA - # train": { - "description": "min=0, mean=3.75, max=5, sum=15 (4)", - "tab": "General information", - "score": 3.75 - }, - "TruthfulQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (4)", - "tab": "General information", - "score": 0.0 - }, - "TruthfulQA - # prompt tokens": { - "description": "min=80.786, mean=389.036, max=521.786, sum=1556.144 (4)", - "tab": "General information", - "score": 389.0359327217125 - }, - "TruthfulQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=4 (4)", - "tab": "General information", - "score": 1.0 - }, - "TruthfulQA - # trials": { - "description": "min=1, mean=2.5, max=3, sum=10 (4)", - "tab": "General information", - "score": 
2.5 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "MS MARCO (TREC)", - "source_data": { - "dataset_name": "MS MARCO (TREC)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "NDCG@10 on MS MARCO (TREC)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "MS MARCO (regular) - RR@10 (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "MS MARCO (TREC) - NDCG@10 (Robustness)": { - "description": "No matching runs", - "tab": "Robustness", - "score": null - }, - "MS MARCO (regular) - RR@10 (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "MS MARCO (TREC) - NDCG@10 (Fairness)": { - "description": "No matching runs", - "tab": "Fairness", - "score": null - }, - "MS MARCO (regular) - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "MS MARCO (TREC) - Denoised inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "MS MARCO (regular) - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (TREC) - # trials": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "MS MARCO (regular) - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Stereotypes (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS 
MARCO (TREC) - Stereotypes (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (race)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (TREC) - Representation (gender)": { - "description": "No matching runs", - "tab": "Bias", - "score": null - }, - "MS MARCO (regular) - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - }, - "MS MARCO (TREC) - Toxic fraction": { - "description": "No matching runs", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CNN/DailyMail", - "source_data": { - "dataset_name": "CNN/DailyMail", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on CNN/DailyMail", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.154, - "details": { - "description": "min=0.144, mean=0.154, max=0.166, sum=0.926 (6)", - "tab": "Accuracy", - "CNN/DailyMail - Denoised inference time (s)": { - "description": "min=3.427, mean=3.514, max=3.581, sum=21.082 (6)", - "tab": "Efficiency", - "score": 3.5136688752771708 - }, - "CNN/DailyMail - # eval": { - "description": "min=466, mean=466, max=466, sum=2796 (6)", - "tab": "General information", - "score": 466.0 - }, - "CNN/DailyMail - # train": { - "description": "min=5, mean=5, max=5, sum=30 (6)", - "tab": "General information", - "score": 5.0 - }, - "CNN/DailyMail - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (6)", - "tab": "General information", - "score": 0.0 - }, - "CNN/DailyMail - # prompt tokens": { - "description": "min=1644.124, mean=1657.124, max=1680.124, sum=9942.747 (6)", - "tab": "General information", - "score": 1657.1244635193134 - }, - "CNN/DailyMail - # output tokens": { - "description": "min=74.479, mean=82.997, max=91.644, sum=497.983 (6)", - "tab": "General information", - "score": 82.99713876967097 - }, - "CNN/DailyMail - # trials": { - "description": "min=3, mean=3, max=3, sum=18 (6)", - "tab": "General information", - "score": 3.0 - }, - "CNN/DailyMail - Stereotypes (race)": { - "description": "min=0.601, mean=0.611, max=0.623, sum=3.663 (6)", - "tab": "Bias", - "score": 0.61056496482126 - }, - "CNN/DailyMail - Stereotypes (gender)": { - "description": "min=0.377, mean=0.394, max=0.409, sum=2.367 (6)", - "tab": "Bias", - "score": 0.3944955327838351 - }, - "CNN/DailyMail - Representation (race)": { - "description": "min=0.276, mean=0.29, max=0.305, sum=1.741 (6)", - "tab": "Bias", - "score": 0.2901527051306585 - }, - "CNN/DailyMail - Representation (gender)": { - "description": "min=0.134, mean=0.139, max=0.147, sum=0.831 (6)", - "tab": "Bias", - "score": 0.13850777854837878 - }, - "CNN/DailyMail - Toxic fraction": { - "description": "min=0, mean=0.001, max=0.004, sum=0.009 (6)", - "tab": "Toxicity", - "score": 0.001430615164520744 - }, - "CNN/DailyMail - SummaC": { - "description": "min=0.537, mean=0.566, max=0.591, sum=1.699 (3)", - "tab": "Summarization metrics", - "score": 0.5663194802454004 - }, - "CNN/DailyMail - QAFactEval": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "CNN/DailyMail - BERTScore (F1)": { - "description": "min=0.266, mean=0.288, 
max=0.312, sum=0.863 (3)", - "tab": "Summarization metrics", - "score": 0.287517514648812 - }, - "CNN/DailyMail - Coverage": { - "description": "min=0.96, mean=0.972, max=0.987, sum=5.835 (6)", - "tab": "Summarization metrics", - "score": 0.9724896258431271 - }, - "CNN/DailyMail - Density": { - "description": "min=24.014, mean=30.259, max=37.594, sum=181.554 (6)", - "tab": "Summarization metrics", - "score": 30.259024131398863 - }, - "CNN/DailyMail - Compression": { - "description": "min=7.643, mean=8.687, max=9.754, sum=52.123 (6)", - "tab": "Summarization metrics", - "score": 8.68711944818053 - }, - "CNN/DailyMail - HumanEval-faithfulness": { - "description": "min=0.889, mean=0.963, max=1, sum=5.778 (6)", - "tab": "Summarization metrics", - "score": 0.9629629629629629 - }, - "CNN/DailyMail - HumanEval-relevance": { - "description": "min=3.889, mean=4.167, max=4.5, sum=25 (6)", - "tab": "Summarization metrics", - "score": 4.166666666666667 - }, - "CNN/DailyMail - HumanEval-coherence": { - "description": "min=3.111, mean=3.463, max=3.833, sum=20.778 (6)", - "tab": "Summarization metrics", - "score": 3.4629629629629632 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "XSUM", - "source_data": { - "dataset_name": "XSUM", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "ROUGE-2 on XSUM", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.132, - "details": { - "description": "min=0.131, mean=0.132, max=0.134, sum=0.794 (6)", - "tab": "Accuracy", - "XSUM - Denoised inference time (s)": { - "description": "min=2.516, mean=2.537, max=2.549, sum=15.224 (6)", - "tab": "Efficiency", - "score": 2.537310096660418 - }, - "XSUM - # eval": { - "description": "min=518, mean=518, max=518, sum=3108 (6)", - "tab": "General information", - "score": 518.0 - }, - "XSUM - # train": { - "description": "min=4.994, mean=4.996, max=4.998, sum=29.977 (6)", - "tab": "General information", - "score": 4.9961389961389955 - }, - "XSUM - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (6)", - "tab": "General information", - "score": 0.0 - }, - "XSUM - # prompt tokens": { - "description": "min=1516.483, mean=1567.312, max=1610.471, sum=9403.873 (6)", - "tab": "General information", - "score": 1567.3120978120978 - }, - "XSUM - # output tokens": { - "description": "min=25.458, mean=25.737, max=26.021, sum=154.421 (6)", - "tab": "General information", - "score": 25.73680823680824 - }, - "XSUM - # trials": { - "description": "min=3, mean=3, max=3, sum=18 (6)", - "tab": "General information", - "score": 3.0 - }, - "XSUM - Stereotypes (race)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=4.0 (6)", - "tab": "Bias", - "score": 0.6666666666666669 - }, - "XSUM - Stereotypes (gender)": { - "description": "min=0.399, mean=0.447, max=0.477, sum=2.684 (6)", - "tab": "Bias", - "score": 0.4473352072310406 - }, - "XSUM - Representation (race)": { - "description": "min=0.519, mean=0.545, max=0.579, sum=3.269 (6)", - "tab": "Bias", - "score": 0.5447683118463776 - }, - "XSUM - Representation (gender)": { - "description": "min=0.202, mean=0.207, max=0.211, sum=1.243 (6)", - "tab": "Bias", - "score": 0.2071945417372382 - }, - "XSUM - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (6)", - "tab": "Toxicity", - "score": 
0.0 - }, - "XSUM - SummaC": { - "description": "min=-0.225, mean=-0.206, max=-0.183, sum=-0.617 (3)", - "tab": "Summarization metrics", - "score": -0.20556503322082545 - }, - "XSUM - QAFactEval": { - "description": "2 matching runs, but no matching metrics", - "tab": "Summarization metrics", - "score": null - }, - "XSUM - BERTScore (F1)": { - "description": "min=0.427, mean=0.427, max=0.428, sum=1.282 (3)", - "tab": "Summarization metrics", - "score": 0.42745522151316395 - }, - "XSUM - Coverage": { - "description": "min=0.813, mean=0.817, max=0.82, sum=4.905 (6)", - "tab": "Summarization metrics", - "score": 0.8174518357071618 - }, - "XSUM - Density": { - "description": "min=3.819, mean=4.041, max=4.367, sum=24.243 (6)", - "tab": "Summarization metrics", - "score": 4.040514978645572 - }, - "XSUM - Compression": { - "description": "min=16.122, mean=16.25, max=16.375, sum=97.5 (6)", - "tab": "Summarization metrics", - "score": 16.25000448561988 - }, - "XSUM - HumanEval-faithfulness": { - "description": "min=0.583, mean=0.763, max=0.905, sum=4.576 (6)", - "tab": "Summarization metrics", - "score": 0.7626984126984127 - }, - "XSUM - HumanEval-relevance": { - "description": "min=3.333, mean=3.843, max=4.1, sum=23.057 (6)", - "tab": "Summarization metrics", - "score": 3.842857142857143 - }, - "XSUM - HumanEval-coherence": { - "description": "min=3.417, mean=4.25, max=4.667, sum=25.5 (6)", - "tab": "Summarization metrics", - "score": 4.249999999999999 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "IMDB", - "source_data": { - "dataset_name": "IMDB", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on IMDB", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.955, - "details": { - "description": "min=0.946, mean=0.955, max=0.961, sum=2.864 (3)", - "tab": "Accuracy", - "IMDB - ECE (10-bin)": { - "description": "min=0.117, mean=0.18, max=0.225, sum=0.541 (3)", - "tab": "Calibration", - "score": 0.18041748611363093 - }, - "IMDB - EM (Robustness)": { - "description": "min=0.921, mean=0.938, max=0.955, sum=2.814 (3)", - "tab": "Robustness", - "score": 0.9380000000000001 - }, - "IMDB - EM (Fairness)": { - "description": "min=0.92, mean=0.933, max=0.951, sum=2.799 (3)", - "tab": "Fairness", - "score": 0.9329999999999999 - }, - "IMDB - Denoised inference time (s)": { - "description": "min=1.446, mean=1.497, max=1.55, sum=4.491 (3)", - "tab": "Efficiency", - "score": 1.4970239554705547 - }, - "IMDB - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=3000 (3)", - "tab": "General information", - "score": 1000.0 - }, - "IMDB - # train": { - "description": "min=4.832, mean=4.923, max=4.979, sum=14.77 (3)", - "tab": "General information", - "score": 4.923333333333333 - }, - "IMDB - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (3)", - "tab": "General information", - "score": 0.0 - }, - "IMDB - # prompt tokens": { - "description": "min=1182.719, mean=1412.285, max=1755.875, sum=4236.855 (3)", - "tab": "General information", - "score": 1412.2849999999999 - }, - "IMDB - # output tokens": { - "description": "min=2, mean=2, max=2, sum=6 (3)", - "tab": "General information", - "score": 2.0 - }, - "IMDB - # trials": { - "description": "min=3, mean=3, max=3, sum=9 (3)", - "tab": "General information", 
- "score": 3.0 - }, - "IMDB - Stereotypes (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Stereotypes (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (race)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Representation (gender)": { - "description": "1 matching runs, but no matching metrics", - "tab": "Bias", - "score": null - }, - "IMDB - Toxic fraction": { - "description": "1 matching runs, but no matching metrics", - "tab": "Toxicity", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "CivilComments", - "source_data": { - "dataset_name": "CivilComments", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on CivilComments", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5, - "details": { - "description": "min=0, mean=0.5, max=1, sum=27.019 (54)", - "tab": "Accuracy", - "CivilComments - ECE (10-bin)": { - "description": "min=0.22, mean=0.486, max=0.749, sum=26.268 (54)", - "tab": "Calibration", - "score": 0.4864398714978027 - }, - "CivilComments - EM (Robustness)": { - "description": "min=0, mean=0.5, max=1, sum=27.004 (54)", - "tab": "Robustness", - "score": 0.5000703286326241 - }, - "CivilComments - EM (Fairness)": { - "description": "min=0, mean=0.5, max=1, sum=26.982 (54)", - "tab": "Fairness", - "score": 0.4996593325872097 - }, - "CivilComments - Denoised inference time (s)": { - "description": "min=0.442, mean=0.695, max=1.665, sum=37.54 (54)", - "tab": "Efficiency", - "score": 0.695191819583079 - }, - "CivilComments - # eval": { - "description": "min=74, mean=371.556, max=683, sum=20064 (54)", - "tab": "General information", - "score": 371.55555555555554 - }, - "CivilComments - # train": { - "description": "min=5, mean=5, max=5, sum=270 (54)", - "tab": "General information", - "score": 5.0 - }, - "CivilComments - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (54)", - "tab": "General information", - "score": 0.0 - }, - "CivilComments - # prompt tokens": { - "description": "min=342, mean=694.39, max=1246.337, sum=37497.067 (54)", - "tab": "General information", - "score": 694.3901297399493 - }, - "CivilComments - # output tokens": { - "description": "min=2, mean=2, max=2, sum=108 (54)", - "tab": "General information", - "score": 2.0 - }, - "CivilComments - # trials": { - "description": "min=3, mean=3, max=3, sum=162 (54)", - "tab": "General information", - "score": 3.0 - }, - "CivilComments - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "CivilComments - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (54)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "RAFT", - "source_data": { - 
"dataset_name": "RAFT", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/benchmark_output/releases/v0.4.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on RAFT", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.598, - "details": { - "description": "min=0, mean=0.598, max=0.975, sum=19.725 (33)", - "tab": "Accuracy", - "RAFT - ECE (10-bin)": { - "description": "min=0.045, mean=0.226, max=0.392, sum=7.451 (33)", - "tab": "Calibration", - "score": 0.225785860693393 - }, - "RAFT - EM (Robustness)": { - "description": "min=0, mean=0.577, max=0.975, sum=19.05 (33)", - "tab": "Robustness", - "score": 0.5772727272727272 - }, - "RAFT - EM (Fairness)": { - "description": "min=0, mean=0.575, max=0.975, sum=18.975 (33)", - "tab": "Fairness", - "score": 0.575 - }, - "RAFT - Denoised inference time (s)": { - "description": "min=0.333, mean=1.471, max=2.214, sum=48.528 (33)", - "tab": "Efficiency", - "score": 1.4705579548050658 - }, - "RAFT - # eval": { - "description": "min=40, mean=40, max=40, sum=1320 (33)", - "tab": "General information", - "score": 40.0 - }, - "RAFT - # train": { - "description": "min=0, mean=4.563, max=5, sum=150.575 (33)", - "tab": "General information", - "score": 4.5628787878787875 - }, - "RAFT - truncated": { - "description": "min=0, mean=0.07, max=1, sum=2.3 (33)", - "tab": "General information", - "score": 0.06969696969696969 - }, - "RAFT - # prompt tokens": { - "description": "min=244.45, mean=803.318, max=1757.15, sum=26509.5 (33)", - "tab": "General information", - "score": 803.3181818181819 - }, - "RAFT - # output tokens": { - "description": "min=2.6, mean=4.886, max=11.6, sum=161.25 (33)", - "tab": "General information", - "score": 4.886363636363637 - }, - "RAFT - # trials": { - "description": "min=3, mean=3, max=3, sum=99 (33)", - "tab": "General information", - "score": 3.0 - }, - "RAFT - Stereotypes (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Stereotypes (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (race)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Representation (gender)": { - "description": "(0)", - "tab": "Bias", - "score": null - }, - "RAFT - Toxic fraction": { - "description": "min=0, mean=0, max=0, sum=0 (33)", - "tab": "Toxicity", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_instruct/anthropic/claude-v1.3/e5e98ffa-3c2d-42d4-86a9-0cb46a71c684.json b/data/helm_instruct/anthropic/claude-v1.3/e5e98ffa-3c2d-42d4-86a9-0cb46a71c684.json deleted file mode 100644 index 31ab229b7..000000000 --- a/data/helm_instruct/anthropic/claude-v1.3/e5e98ffa-3c2d-42d4-86a9-0cb46a71c684.json +++ /dev/null @@ -1,267 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_instruct/anthropic_claude-v1.3/1770834858.3559701", - "retrieved_timestamp": "1770834858.3559701", - "source_metadata": { - "source_name": "helm_instruct", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Anthropic Claude v1.3", - "id": "anthropic/claude-v1.3", - "developer": "anthropic", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - 
"dataset_name": "helm_instruct", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperform on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.611, - "details": { - "tab": "Instruction Following" - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "Anthropic RLHF dataset", - "source_data": { - "dataset_name": "Anthropic RLHF dataset", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json" - ] - }, - "metric_config": { - "evaluation_description": "Harmlessness on Anthropic RLHF dataset", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 5.0 - }, - "score_details": { - "score": 4.965, - "details": { - "description": "min=4.925, mean=4.965, max=5, sum=39.72 (8)", - "tab": "Instruction Following" - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "hh", - "hh", - "hh", - "hh", - "red_team", - "red_team", - "red_team", - "red_team" - ], - "evaluator": [ - "claude", - "gpt4", - "mturk", - "scale", - "claude", - "gpt4", - "mturk", - "scale" - ] - } - } - }, - { - "evaluation_name": "Best ChatGPT Prompts", - "source_data": { - "dataset_name": "Best ChatGPT Prompts", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json" - ] - }, - "metric_config": { - "evaluation_description": "Harmlessness on Best ChatGPT Prompts", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 5.0 - }, - "score_details": { - "score": 4.995, - "details": { - "description": "min=4.985, mean=4.995, max=5, sum=19.98 (4)", - "tab": "Instruction Following" - } - }, - "generation_config": { - "additional_details": { - "path": "src_helm_benchmark_scenarios_best_chatgpt_prompts.yaml", - "tags": "", - "evaluator": [ - "claude", - "gpt4", - "mturk", - "scale" - ] - } - } - }, - { - "evaluation_name": "Koala test dataset", - "source_data": { - "dataset_name": "Koala test dataset", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json" - ] - }, - "metric_config": { - "evaluation_description": "Harmlessness on Koala test dataset", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 5.0 - }, - "score_details": { - "score": 4.981, - "details": { - "description": "min=4.965, mean=4.981, max=5, sum=19.925 (4)", - "tab": "Instruction Following" - } - }, - "generation_config": { - "additional_details": { - "evaluator": [ - "claude", - "gpt4", - "mturk", - "scale" - ] - } - } - }, - { - "evaluation_name": "Open Assistant", - "source_data": { - "dataset_name": "Open Assistant", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json" - ] - }, - "metric_config": { - "evaluation_description": "Harmlessness on Open Assistant", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 5.0 - }, - 
"score_details": { - "score": 4.975, - "details": { - "description": "min=4.935, mean=4.975, max=5, sum=19.9 (4)", - "tab": "Instruction Following" - } - }, - "generation_config": { - "additional_details": { - "language": "en", - "evaluator": [ - "claude", - "gpt4", - "mturk", - "scale" - ] - } - } - }, - { - "evaluation_name": "Self Instruct", - "source_data": { - "dataset_name": "Self Instruct", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json" - ] - }, - "metric_config": { - "evaluation_description": "Harmlessness on Self Instruct", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 5.0 - }, - "score_details": { - "score": 4.992, - "details": { - "description": "min=4.98, mean=4.992, max=5, sum=19.97 (4)", - "tab": "Instruction Following" - } - }, - "generation_config": { - "additional_details": { - "evaluator": [ - "claude", - "gpt4", - "mturk", - "scale" - ] - } - } - }, - { - "evaluation_name": "Vicuna", - "source_data": { - "dataset_name": "Vicuna", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json" - ] - }, - "metric_config": { - "evaluation_description": "Harmlessness on Vicuna", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 5.0 - }, - "score_details": { - "score": 4.989, - "details": { - "description": "min=4.956, mean=4.989, max=5, sum=19.956 (4)", - "tab": "Instruction Following" - } - }, - "generation_config": { - "additional_details": { - "category": "all", - "evaluator": [ - "claude", - "gpt4", - "mturk", - "scale" - ] - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_instruct/cohere/command-xlarge-beta/60724488-914d-4efe-98d6-f3ff26fe8fbc.json b/data/helm_instruct/cohere/command-xlarge-beta/60724488-914d-4efe-98d6-f3ff26fe8fbc.json deleted file mode 100644 index 2fd221159..000000000 --- a/data/helm_instruct/cohere/command-xlarge-beta/60724488-914d-4efe-98d6-f3ff26fe8fbc.json +++ /dev/null @@ -1,267 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_instruct/cohere_command-xlarge-beta/1770834858.3559701", - "retrieved_timestamp": "1770834858.3559701", - "source_metadata": { - "source_name": "helm_instruct", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Cohere Command beta 52.4B", - "id": "cohere/command-xlarge-beta", - "developer": "cohere", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_instruct", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperform on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.089, - "details": { - "tab": "Instruction Following" - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "Anthropic RLHF dataset", - "source_data": { - "dataset_name": "Anthropic RLHF dataset", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json" - ] - }, - "metric_config": { - "evaluation_description": "Harmlessness on Anthropic RLHF dataset", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 5.0 - }, - "score_details": { - "score": 4.214, - "details": { - "description": "min=3.38, mean=4.214, max=4.92, sum=33.715 (8)", - "tab": "Instruction Following" - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "hh", - "hh", - "hh", - "hh", - "red_team", - "red_team", - "red_team", - "red_team" - ], - "evaluator": [ - "claude", - "gpt4", - "mturk", - "scale", - "claude", - "gpt4", - "mturk", - "scale" - ] - } - } - }, - { - "evaluation_name": "Best ChatGPT Prompts", - "source_data": { - "dataset_name": "Best ChatGPT Prompts", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json" - ] - }, - "metric_config": { - "evaluation_description": "Harmlessness on Best ChatGPT Prompts", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 5.0 - }, - "score_details": { - "score": 4.988, - "details": { - "description": "min=4.98, mean=4.988, max=5, sum=19.95 (4)", - "tab": "Instruction Following" - } - }, - "generation_config": { - "additional_details": { - "path": "src_helm_benchmark_scenarios_best_chatgpt_prompts.yaml", - "tags": "", - "evaluator": [ - "claude", - "gpt4", - "mturk", - "scale" - ] - } - } - }, - { - "evaluation_name": "Koala test dataset", - "source_data": { - "dataset_name": "Koala test dataset", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json" - ] - }, - "metric_config": { - "evaluation_description": "Harmlessness on Koala test dataset", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 5.0 - }, - "score_details": { - "score": 4.969, - "details": { - "description": "min=4.936, mean=4.969, max=5, sum=19.874 (4)", - "tab": "Instruction Following" - } - }, - "generation_config": { - "additional_details": { - "evaluator": [ - "claude", - "gpt4", - "mturk", - "scale" - ] - } - } - }, - { - "evaluation_name": "Open Assistant", - "source_data": { - "dataset_name": "Open Assistant", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json" - ] - }, - "metric_config": { - "evaluation_description": "Harmlessness on Open Assistant", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 5.0 - }, - "score_details": { - "score": 4.967, - "details": { - "description": "min=4.955, mean=4.967, max=5, sum=19.87 (4)", - "tab": "Instruction Following" - } - }, - "generation_config": { - "additional_details": { - "language": "en", - "evaluator": [ - "claude", - "gpt4", - "mturk", - "scale" - ] - } - } - }, - { - "evaluation_name": "Self Instruct", - "source_data": { - "dataset_name": "Self Instruct", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json" - ] - }, - "metric_config": { - "evaluation_description": "Harmlessness on Self Instruct", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 
0.0, - "max_score": 5.0 - }, - "score_details": { - "score": 4.971, - "details": { - "description": "min=4.955, mean=4.971, max=5, sum=19.885 (4)", - "tab": "Instruction Following" - } - }, - "generation_config": { - "additional_details": { - "evaluator": [ - "claude", - "gpt4", - "mturk", - "scale" - ] - } - } - }, - { - "evaluation_name": "Vicuna", - "source_data": { - "dataset_name": "Vicuna", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json" - ] - }, - "metric_config": { - "evaluation_description": "Harmlessness on Vicuna", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 5.0 - }, - "score_details": { - "score": 4.995, - "details": { - "description": "min=4.981, mean=4.995, max=5, sum=19.981 (4)", - "tab": "Instruction Following" - } - }, - "generation_config": { - "additional_details": { - "category": "all", - "evaluator": [ - "claude", - "gpt4", - "mturk", - "scale" - ] - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_instruct/openai/gpt-3.5-turbo-0613/2aaae404-b510-41e0-9a4a-b2d053731454.json b/data/helm_instruct/openai/gpt-3.5-turbo-0613/2aaae404-b510-41e0-9a4a-b2d053731454.json deleted file mode 100644 index 23dfc4397..000000000 --- a/data/helm_instruct/openai/gpt-3.5-turbo-0613/2aaae404-b510-41e0-9a4a-b2d053731454.json +++ /dev/null @@ -1,267 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_instruct/openai_gpt-3.5-turbo-0613/1770834858.3559701", - "retrieved_timestamp": "1770834858.3559701", - "source_metadata": { - "source_name": "helm_instruct", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "GPT-3.5 Turbo 0613", - "id": "openai/gpt-3.5-turbo-0613", - "developer": "openai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_instruct", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperform on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.689, - "details": { - "tab": "Instruction Following" - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "Anthropic RLHF dataset", - "source_data": { - "dataset_name": "Anthropic RLHF dataset", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json" - ] - }, - "metric_config": { - "evaluation_description": "Harmlessness on Anthropic RLHF dataset", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 5.0 - }, - "score_details": { - "score": 4.964, - "details": { - "description": "min=4.915, mean=4.964, max=5, sum=39.715 (8)", - "tab": "Instruction Following" - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "hh", - "hh", - "hh", - "hh", - "red_team", - "red_team", - "red_team", - "red_team" - ], - "evaluator": [ - "claude", - "gpt4", - "mturk", - "scale", - "claude", - "gpt4", - "mturk", - "scale" - ] - } - } - }, - { - 
"evaluation_name": "Best ChatGPT Prompts", - "source_data": { - "dataset_name": "Best ChatGPT Prompts", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json" - ] - }, - "metric_config": { - "evaluation_description": "Harmlessness on Best ChatGPT Prompts", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 5.0 - }, - "score_details": { - "score": 4.986, - "details": { - "description": "min=4.95, mean=4.986, max=5, sum=19.945 (4)", - "tab": "Instruction Following" - } - }, - "generation_config": { - "additional_details": { - "path": "src_helm_benchmark_scenarios_best_chatgpt_prompts.yaml", - "tags": "", - "evaluator": [ - "claude", - "gpt4", - "mturk", - "scale" - ] - } - } - }, - { - "evaluation_name": "Koala test dataset", - "source_data": { - "dataset_name": "Koala test dataset", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json" - ] - }, - "metric_config": { - "evaluation_description": "Harmlessness on Koala test dataset", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 5.0 - }, - "score_details": { - "score": 4.987, - "details": { - "description": "min=4.969, mean=4.987, max=5, sum=19.95 (4)", - "tab": "Instruction Following" - } - }, - "generation_config": { - "additional_details": { - "evaluator": [ - "claude", - "gpt4", - "mturk", - "scale" - ] - } - } - }, - { - "evaluation_name": "Open Assistant", - "source_data": { - "dataset_name": "Open Assistant", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json" - ] - }, - "metric_config": { - "evaluation_description": "Harmlessness on Open Assistant", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 5.0 - }, - "score_details": { - "score": 4.987, - "details": { - "description": "min=4.96, mean=4.987, max=5, sum=19.95 (4)", - "tab": "Instruction Following" - } - }, - "generation_config": { - "additional_details": { - "language": "en", - "evaluator": [ - "claude", - "gpt4", - "mturk", - "scale" - ] - } - } - }, - { - "evaluation_name": "Self Instruct", - "source_data": { - "dataset_name": "Self Instruct", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json" - ] - }, - "metric_config": { - "evaluation_description": "Harmlessness on Self Instruct", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 5.0 - }, - "score_details": { - "score": 4.99, - "details": { - "description": "min=4.97, mean=4.99, max=5, sum=19.96 (4)", - "tab": "Instruction Following" - } - }, - "generation_config": { - "additional_details": { - "evaluator": [ - "claude", - "gpt4", - "mturk", - "scale" - ] - } - } - }, - { - "evaluation_name": "Vicuna", - "source_data": { - "dataset_name": "Vicuna", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json" - ] - }, - "metric_config": { - "evaluation_description": "Harmlessness on Vicuna", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 5.0 - }, - "score_details": { - "score": 
4.992, - "details": { - "description": "min=4.975, mean=4.992, max=5, sum=19.969 (4)", - "tab": "Instruction Following" - } - }, - "generation_config": { - "additional_details": { - "category": "all", - "evaluator": [ - "claude", - "gpt4", - "mturk", - "scale" - ] - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_instruct/openai/gpt-4-0314/053badb4-b50a-434a-909c-c4d939c00b4e.json b/data/helm_instruct/openai/gpt-4-0314/053badb4-b50a-434a-909c-c4d939c00b4e.json deleted file mode 100644 index 9ad1bca2e..000000000 --- a/data/helm_instruct/openai/gpt-4-0314/053badb4-b50a-434a-909c-c4d939c00b4e.json +++ /dev/null @@ -1,267 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_instruct/openai_gpt-4-0314/1770834858.3559701", - "retrieved_timestamp": "1770834858.3559701", - "source_metadata": { - "source_name": "helm_instruct", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "GPT-4 0314", - "id": "openai/gpt-4-0314", - "developer": "openai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_instruct", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperform on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.611, - "details": { - "tab": "Instruction Following" - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "Anthropic RLHF dataset", - "source_data": { - "dataset_name": "Anthropic RLHF dataset", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json" - ] - }, - "metric_config": { - "evaluation_description": "Harmlessness on Anthropic RLHF dataset", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 5.0 - }, - "score_details": { - "score": 4.934, - "details": { - "description": "min=4.83, mean=4.934, max=5, sum=39.47 (8)", - "tab": "Instruction Following" - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "hh", - "hh", - "hh", - "hh", - "red_team", - "red_team", - "red_team", - "red_team" - ], - "evaluator": [ - "claude", - "gpt4", - "mturk", - "scale", - "claude", - "gpt4", - "mturk", - "scale" - ] - } - } - }, - { - "evaluation_name": "Best ChatGPT Prompts", - "source_data": { - "dataset_name": "Best ChatGPT Prompts", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json" - ] - }, - "metric_config": { - "evaluation_description": "Harmlessness on Best ChatGPT Prompts", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 5.0 - }, - "score_details": { - "score": 4.973, - "details": { - "description": "min=4.915, mean=4.973, max=5, sum=19.894 (4)", - "tab": "Instruction Following" - } - }, - "generation_config": { - "additional_details": { - "path": "src_helm_benchmark_scenarios_best_chatgpt_prompts.yaml", - "tags": "", - "evaluator": [ - "claude", - "gpt4", - "mturk", - "scale" - ] - } - } - }, - { - 
"evaluation_name": "Koala test dataset", - "source_data": { - "dataset_name": "Koala test dataset", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json" - ] - }, - "metric_config": { - "evaluation_description": "Harmlessness on Koala test dataset", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 5.0 - }, - "score_details": { - "score": 4.966, - "details": { - "description": "min=4.913, mean=4.966, max=5, sum=19.863 (4)", - "tab": "Instruction Following" - } - }, - "generation_config": { - "additional_details": { - "evaluator": [ - "claude", - "gpt4", - "mturk", - "scale" - ] - } - } - }, - { - "evaluation_name": "Open Assistant", - "source_data": { - "dataset_name": "Open Assistant", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json" - ] - }, - "metric_config": { - "evaluation_description": "Harmlessness on Open Assistant", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 5.0 - }, - "score_details": { - "score": 4.986, - "details": { - "description": "min=4.97, mean=4.986, max=5, sum=19.945 (4)", - "tab": "Instruction Following" - } - }, - "generation_config": { - "additional_details": { - "language": "en", - "evaluator": [ - "claude", - "gpt4", - "mturk", - "scale" - ] - } - } - }, - { - "evaluation_name": "Self Instruct", - "source_data": { - "dataset_name": "Self Instruct", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json" - ] - }, - "metric_config": { - "evaluation_description": "Harmlessness on Self Instruct", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 5.0 - }, - "score_details": { - "score": 4.976, - "details": { - "description": "min=4.945, mean=4.976, max=5, sum=19.905 (4)", - "tab": "Instruction Following" - } - }, - "generation_config": { - "additional_details": { - "evaluator": [ - "claude", - "gpt4", - "mturk", - "scale" - ] - } - } - }, - { - "evaluation_name": "Vicuna", - "source_data": { - "dataset_name": "Vicuna", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/instruct/benchmark_output/releases/v1.0.0/groups/instruction_following.json" - ] - }, - "metric_config": { - "evaluation_description": "Harmlessness on Vicuna", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 5.0 - }, - "score_details": { - "score": 4.995, - "details": { - "description": "min=4.981, mean=4.995, max=5, sum=19.981 (4)", - "tab": "Instruction Following" - } - }, - "generation_config": { - "additional_details": { - "category": "all", - "evaluator": [ - "claude", - "gpt4", - "mturk", - "scale" - ] - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_lite/01-ai/yi-34b/7b4a4c6d-e302-4010-a099-5b01c874ffe8.json b/data/helm_lite/01-ai/yi-34b/7b4a4c6d-e302-4010-a099-5b01c874ffe8.json deleted file mode 100644 index 946b7db3e..000000000 --- a/data/helm_lite/01-ai/yi-34b/7b4a4c6d-e302-4010-a099-5b01c874ffe8.json +++ /dev/null @@ -1,641 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_lite/01-ai_yi-34b/1770834614.1822479", - "retrieved_timestamp": "1770834614.1822479", - "source_metadata": { - "source_name": "helm_lite", - 
"source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Yi 34B", - "id": "01-ai/yi-34b", - "developer": "01-ai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_lite", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.57, - "details": { - "tab": "Accuracy", - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.2681148564294632 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.782, - "details": { - "description": "min=0.782, mean=0.782, max=0.782, sum=0.782 (1)", - "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": { - "description": "min=2.368, mean=2.368, max=2.368, sum=2.368 (1)", - "tab": "Efficiency", - "score": 2.368284817816506 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=4.868, mean=4.868, max=4.868, sum=4.868 (1)", - "tab": "General information", - "score": 4.867605633802817 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=3611.445, mean=3611.445, max=3611.445, sum=3611.445 (1)", - "tab": "General information", - "score": 3611.445070422535 - }, - "NarrativeQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (closed-book)", - "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.443, - "details": { - "description": "min=0.443, mean=0.443, max=0.443, sum=0.443 (1)", - "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time (s)": { - "description": "min=1.816, mean=1.816, max=1.816, sum=1.816 (1)", - "tab": "Efficiency", - "score": 1.8157690076828004 - }, - "NaturalQuestions (closed-book) - 
Observed inference time (s)": { - "description": "min=1.458, mean=1.458, max=1.458, sum=1.458 (1)", - "tab": "Efficiency", - "score": 1.4578230485916137 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.838, mean=4.838, max=4.838, sum=4.838 (1)", - "tab": "General information", - "score": 4.838 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.026, mean=0.026, max=0.026, sum=0.026 (1)", - "tab": "General information", - "score": 0.026 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=2171.698, mean=2171.698, max=2171.698, sum=2171.698 (1)", - "tab": "General information", - "score": 2171.698 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=0.995, mean=0.995, max=0.995, sum=0.995 (1)", - "tab": "General information", - "score": 0.995 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=131.695, mean=131.695, max=131.695, sum=131.695 (1)", - "tab": "General information", - "score": 131.695 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "mode": "closedbook" - } - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.92, - "details": { - "description": "min=0.92, mean=0.92, max=0.92, sum=0.92 (1)", - "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": { - "description": "min=0.823, mean=0.823, max=0.823, sum=0.823 (1)", - "tab": "Efficiency", - "score": 0.8229070715904235 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=260.002, mean=260.002, max=260.002, sum=260.002 (1)", - "tab": "General information", - "score": 260.002 - }, - "OpenbookQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": 
"MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.65, - "details": { - "description": "min=0.4, mean=0.65, max=0.91, sum=3.248 (5)", - "tab": "Accuracy", - "MMLU - Observed inference time (s)": { - "description": "min=0.511, mean=0.697, max=0.925, sum=3.486 (5)", - "tab": "Efficiency", - "score": 0.6972272023485417 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=383.67, mean=502.654, max=667.789, sum=2513.269 (5)", - "tab": "General information", - "score": 502.65389473684206 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MATH", - "source_data": { - "dataset_name": "MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.375, - "details": { - "description": "min=0.167, mean=0.375, max=0.563, sum=2.623 (7)", - "tab": "Accuracy", - "MATH - Observed inference time (s)": { - "description": "min=2.651, mean=3.809, max=4.649, sum=26.664 (7)", - "tab": "Efficiency", - "score": 3.809198633421 - }, - "MATH - # eval": { - "description": "min=30, mean=62.429, max=135, sum=437 (7)", - "tab": "General information", - "score": 62.42857142857143 - }, - "MATH - # train": { - "description": "min=8, mean=8, max=8, sum=56 (7)", - "tab": "General information", - "score": 8.0 - }, - "MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "MATH - # prompt tokens": { - "description": "min=976.696, mean=1468.935, max=2582.038, sum=10282.547 (7)", - "tab": "General information", - "score": 1468.9352369693863 - }, - "MATH - # output tokens": { - "description": "min=1, mean=1, max=1, sum=7 (7)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" - } - } - }, - { - "evaluation_name": "GSM8K", - "source_data": { - "dataset_name": "GSM8K", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on GSM8K", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.648, - "details": { - "description": "min=0.648, mean=0.648, max=0.648, sum=0.648 (1)", - "tab": "Accuracy", - "GSM8K - Observed inference time (s)": { - "description": "min=4.887, mean=4.887, max=4.887, sum=4.887 (1)", - "tab": "Efficiency", - "score": 4.886563032150269 - }, - "GSM8K - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "GSM8K - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "GSM8K - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GSM8K - # prompt tokens": { - "description": "min=1170.814, mean=1170.814, max=1170.814, sum=1170.814 (1)", - "tab": "General information", - "score": 1170.814 - }, - "GSM8K - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "LegalBench", - "source_data": { - "dataset_name": "LegalBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on LegalBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.618, - "details": { - "description": "min=0.311, mean=0.618, max=0.8, sum=3.089 (5)", - "tab": "Accuracy", - "LegalBench - Observed inference time (s)": { - "description": "min=0.465, mean=0.8, max=1.207, sum=4.002 (5)", - "tab": "Efficiency", - "score": 0.8004560962069804 - }, - "LegalBench - # eval": { - "description": "min=95, mean=409.4, max=1000, sum=2047 (5)", - "tab": "General information", - "score": 409.4 - }, - "LegalBench - # train": { - "description": "min=2, mean=4.2, max=5, sum=21 (5)", - "tab": "General information", - "score": 4.2 - }, - "LegalBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "LegalBench - # prompt tokens": { - "description": "min=211.779, mean=951.524, max=3359.547, sum=4757.621 (5)", - "tab": "General information", - "score": 951.5242922438443 - }, - "LegalBench - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] - } - } - }, - { - "evaluation_name": "MedQA", - "source_data": { - "dataset_name": "MedQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MedQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.656, - "details": { - "description": "min=0.656, 
mean=0.656, max=0.656, sum=0.656 (1)", - "tab": "Accuracy", - "MedQA - Observed inference time (s)": { - "description": "min=1.064, mean=1.064, max=1.064, sum=1.064 (1)", - "tab": "Efficiency", - "score": 1.064007310696672 - }, - "MedQA - # eval": { - "description": "min=503, mean=503, max=503, sum=503 (1)", - "tab": "General information", - "score": 503.0 - }, - "MedQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "MedQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MedQA - # prompt tokens": { - "description": "min=1122.392, mean=1122.392, max=1122.392, sum=1122.392 (1)", - "tab": "General information", - "score": 1122.3916500994035 - }, - "MedQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WMT 2014", - "source_data": { - "dataset_name": "WMT 2014", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.172, - "details": { - "description": "min=0.1, mean=0.172, max=0.218, sum=0.858 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time (s)": { - "description": "min=1.071, mean=1.404, max=2.506, sum=7.021 (5)", - "tab": "Efficiency", - "score": 1.4042062711970469 - }, - "WMT 2014 - # eval": { - "description": "min=503, mean=568.8, max=832, sum=2844 (5)", - "tab": "General information", - "score": 568.8 - }, - "WMT 2014 - # train": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "WMT 2014 - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "WMT 2014 - # prompt tokens": { - "description": "min=139.298, mean=187.092, max=317.56, sum=935.461 (5)", - "tab": "General information", - "score": 187.09213851506345 - }, - "WMT 2014 - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_lite/01-ai/yi-6b/db0c0e0c-fcc3-400a-88b4-230ba2929e0f.json b/data/helm_lite/01-ai/yi-6b/db0c0e0c-fcc3-400a-88b4-230ba2929e0f.json deleted file mode 100644 index 28ba5fb69..000000000 --- a/data/helm_lite/01-ai/yi-6b/db0c0e0c-fcc3-400a-88b4-230ba2929e0f.json +++ /dev/null @@ -1,641 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_lite/01-ai_yi-6b/1770834614.1822479", - "retrieved_timestamp": "1770834614.1822479", - "source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Yi 6B", - "id": "01-ai/yi-6b", - "developer": "01-ai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_lite", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.253, - "details": { - "tab": "Accuracy", - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.6630461922596754 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.702, - "details": { - "description": "min=0.702, mean=0.702, max=0.702, sum=0.702 (1)", - "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": { - "description": "min=1.404, mean=1.404, max=1.404, sum=1.404 (1)", - "tab": "Efficiency", - "score": 1.4038719868995775 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=4.868, mean=4.868, max=4.868, sum=4.868 (1)", - "tab": "General information", - "score": 4.867605633802817 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=3611.445, mean=3611.445, max=3611.445, sum=3611.445 (1)", - "tab": "General information", - "score": 3611.445070422535 - }, - "NarrativeQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (closed-book)", - "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.31, - "details": { - "description": "min=0.31, mean=0.31, max=0.31, sum=0.31 (1)", - "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time (s)": { - "description": "min=0.911, mean=0.911, max=0.911, sum=0.911 (1)", - "tab": "Efficiency", - "score": 0.9108293209075927 - }, - "NaturalQuestions (closed-book) - Observed inference time (s)": { - "description": "min=0.413, mean=0.413, max=0.413, sum=0.413 (1)", - "tab": "Efficiency", - "score": 0.4127621691226959 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.838, 
mean=4.838, max=4.838, sum=4.838 (1)", - "tab": "General information", - "score": 4.838 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.026, mean=0.026, max=0.026, sum=0.026 (1)", - "tab": "General information", - "score": 0.026 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=2171.698, mean=2171.698, max=2171.698, sum=2171.698 (1)", - "tab": "General information", - "score": 2171.698 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=0.995, mean=0.995, max=0.995, sum=0.995 (1)", - "tab": "General information", - "score": 0.995 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=131.695, mean=131.695, max=131.695, sum=131.695 (1)", - "tab": "General information", - "score": 131.695 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "mode": "closedbook" - } - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8, - "details": { - "description": "min=0.8, mean=0.8, max=0.8, sum=0.8 (1)", - "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": { - "description": "min=0.354, mean=0.354, max=0.354, sum=0.354 (1)", - "tab": "Efficiency", - "score": 0.3535394024848938 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=260.002, mean=260.002, max=260.002, sum=260.002 (1)", - "tab": "General information", - "score": 260.002 - }, - "OpenbookQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.53, - "details": { - "description": "min=0.3, mean=0.53, max=0.87, sum=2.651 (5)", - "tab": "Accuracy", - "MMLU - Observed inference time (s)": { - "description": "min=0.323, mean=0.339, max=0.368, sum=1.696 (5)", - "tab": "Efficiency", - "score": 0.3391338364283244 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=383.67, mean=502.654, max=667.789, sum=2513.269 (5)", - "tab": "General information", - "score": 502.65389473684206 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MATH", - "source_data": { - "dataset_name": "MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.126, - "details": { - "description": "min=0.058, mean=0.126, max=0.2, sum=0.881 (7)", - "tab": "Accuracy", - "MATH - Observed inference time (s)": { - "description": "min=1.167, mean=1.837, max=2.263, sum=12.86 (7)", - "tab": "Efficiency", - "score": 1.8371926514375443 - }, - "MATH - # eval": { - "description": "min=30, mean=62.429, max=135, sum=437 (7)", - "tab": "General information", - "score": 62.42857142857143 - }, - "MATH - # train": { - "description": "min=8, mean=8, max=8, sum=56 (7)", - "tab": "General information", - "score": 8.0 - }, - "MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "MATH - # prompt tokens": { - "description": "min=976.696, mean=1468.935, max=2582.038, sum=10282.547 (7)", - "tab": "General information", - "score": 1468.9352369693863 - }, - "MATH - # output tokens": { - "description": "min=1, mean=1, max=1, sum=7 (7)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" - } - } - }, - { - "evaluation_name": "GSM8K", - "source_data": { - "dataset_name": "GSM8K", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on GSM8K", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.375, - "details": { - "description": "min=0.375, mean=0.375, max=0.375, sum=0.375 (1)", - "tab": "Accuracy", - "GSM8K - Observed inference 
time (s)": { - "description": "min=1.878, mean=1.878, max=1.878, sum=1.878 (1)", - "tab": "Efficiency", - "score": 1.8781680135726928 - }, - "GSM8K - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "GSM8K - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "GSM8K - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GSM8K - # prompt tokens": { - "description": "min=1170.814, mean=1170.814, max=1170.814, sum=1170.814 (1)", - "tab": "General information", - "score": 1170.814 - }, - "GSM8K - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "LegalBench", - "source_data": { - "dataset_name": "LegalBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on LegalBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.519, - "details": { - "description": "min=0.284, mean=0.519, max=0.779, sum=2.594 (5)", - "tab": "Accuracy", - "LegalBench - Observed inference time (s)": { - "description": "min=0.379, mean=0.553, max=1.149, sum=2.764 (5)", - "tab": "Efficiency", - "score": 0.5528668178286933 - }, - "LegalBench - # eval": { - "description": "min=95, mean=409.4, max=1000, sum=2047 (5)", - "tab": "General information", - "score": 409.4 - }, - "LegalBench - # train": { - "description": "min=2, mean=4.2, max=5, sum=21 (5)", - "tab": "General information", - "score": 4.2 - }, - "LegalBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "LegalBench - # prompt tokens": { - "description": "min=211.779, mean=951.524, max=3359.547, sum=4757.621 (5)", - "tab": "General information", - "score": 951.5242922438443 - }, - "LegalBench - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] - } - } - }, - { - "evaluation_name": "MedQA", - "source_data": { - "dataset_name": "MedQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MedQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.497, - "details": { - "description": "min=0.497, mean=0.497, max=0.497, sum=0.497 (1)", - "tab": "Accuracy", - "MedQA - Observed inference time (s)": { - "description": "min=0.405, mean=0.405, max=0.405, sum=0.405 (1)", - "tab": "Efficiency", - "score": 0.4053303655051806 - }, - "MedQA - # eval": { - "description": "min=503, mean=503, max=503, sum=503 (1)", - "tab": "General information", - "score": 503.0 - }, - "MedQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General 
information", - "score": 5.0 - }, - "MedQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MedQA - # prompt tokens": { - "description": "min=1122.392, mean=1122.392, max=1122.392, sum=1122.392 (1)", - "tab": "General information", - "score": 1122.3916500994035 - }, - "MedQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WMT 2014", - "source_data": { - "dataset_name": "WMT 2014", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.117, - "details": { - "description": "min=0.055, mean=0.117, max=0.182, sum=0.584 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time (s)": { - "description": "min=0.602, mean=0.626, max=0.666, sum=3.129 (5)", - "tab": "Efficiency", - "score": 0.6257070175426044 - }, - "WMT 2014 - # eval": { - "description": "min=503, mean=568.8, max=832, sum=2844 (5)", - "tab": "General information", - "score": 568.8 - }, - "WMT 2014 - # train": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "WMT 2014 - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "WMT 2014 - # prompt tokens": { - "description": "min=139.298, mean=187.092, max=317.56, sum=935.461 (5)", - "tab": "General information", - "score": 187.09213851506345 - }, - "WMT 2014 - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_lite/01-ai/yi-large-preview/f6808908-79d9-4de5-8434-94e4bdb854f2.json b/data/helm_lite/01-ai/yi-large-preview/f6808908-79d9-4de5-8434-94e4bdb854f2.json deleted file mode 100644 index 9fe678bb4..000000000 --- a/data/helm_lite/01-ai/yi-large-preview/f6808908-79d9-4de5-8434-94e4bdb854f2.json +++ /dev/null @@ -1,643 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_lite/01-ai_yi-large-preview/1770834614.1822479", - "retrieved_timestamp": "1770834614.1822479", - "source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Yi Large Preview", - "id": "01-ai/yi-large-preview", - "developer": "01-ai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_lite", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.471, - "details": { - 
"tab": "Accuracy", - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.17893882646691636 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.373, - "details": { - "description": "min=0.373, mean=0.373, max=0.373, sum=0.373 (1)", - "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": { - "description": "min=2.672, mean=2.672, max=2.672, sum=2.672 (1)", - "tab": "Efficiency", - "score": 2.6724000897206053 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=3724.042, mean=3724.042, max=3724.042, sum=3724.042 (1)", - "tab": "General information", - "score": 3724.042253521127 - }, - "NarrativeQA - # output tokens": { - "description": "min=21.513, mean=21.513, max=21.513, sum=21.513 (1)", - "tab": "General information", - "score": 21.512676056338027 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (closed-book)", - "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.428, - "details": { - "description": "min=0.428, mean=0.428, max=0.428, sum=0.428 (1)", - "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time (s)": { - "description": "min=2.506, mean=2.506, max=2.506, sum=2.506 (1)", - "tab": "Efficiency", - "score": 2.506305232524872 - }, - "NaturalQuestions (closed-book) - Observed inference time (s)": { - "description": "min=1.036, mean=1.036, max=1.036, sum=1.036 (1)", - "tab": "Efficiency", - "score": 1.0360134015083313 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.983, mean=4.983, max=4.983, sum=4.983 (1)", - "tab": "General information", - "score": 4.983 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.003, mean=0.003, max=0.003, sum=0.003 (1)", - "tab": "General information", - "score": 0.003 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=2368.513, mean=2368.513, max=2368.513, sum=2368.513 (1)", - 
"tab": "General information", - "score": 2368.513 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=23.703, mean=23.703, max=23.703, sum=23.703 (1)", - "tab": "General information", - "score": 23.703 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=160.695, mean=160.695, max=160.695, sum=160.695 (1)", - "tab": "General information", - "score": 160.695 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=4.629, mean=4.629, max=4.629, sum=4.629 (1)", - "tab": "General information", - "score": 4.629 - } - } - }, - "generation_config": { - "additional_details": { - "mode": "closedbook" - } - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.946, - "details": { - "description": "min=0.946, mean=0.946, max=0.946, sum=0.946 (1)", - "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": { - "description": "min=0.777, mean=0.777, max=0.777, sum=0.777 (1)", - "tab": "Efficiency", - "score": 0.77673295545578 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=273.002, mean=273.002, max=273.002, sum=273.002 (1)", - "tab": "General information", - "score": 273.002 - }, - "OpenbookQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.712, - "details": { - "description": "min=0.52, mean=0.712, max=0.86, sum=3.558 (5)", - "tab": "Accuracy", - "MMLU - Observed inference time (s)": { - "description": "min=0.679, mean=0.713, max=0.752, sum=3.567 (5)", - "tab": "Efficiency", - "score": 0.7133434140138459 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", 
- "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=396.67, mean=515.654, max=680.789, sum=2578.269 (5)", - "tab": "General information", - "score": 515.6538947368421 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MATH", - "source_data": { - "dataset_name": "MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.712, - "details": { - "description": "min=0.553, mean=0.712, max=0.874, sum=4.982 (7)", - "tab": "Accuracy", - "MATH - Observed inference time (s)": { - "description": "min=8.67, mean=11.511, max=13.559, sum=80.577 (7)", - "tab": "Efficiency", - "score": 11.510960669458308 - }, - "MATH - # eval": { - "description": "min=30, mean=62.429, max=135, sum=437 (7)", - "tab": "General information", - "score": 62.42857142857143 - }, - "MATH - # train": { - "description": "min=8, mean=8, max=8, sum=56 (7)", - "tab": "General information", - "score": 8.0 - }, - "MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "MATH - # prompt tokens": { - "description": "min=976.696, mean=1468.935, max=2582.038, sum=10282.547 (7)", - "tab": "General information", - "score": 1468.9352369693863 - }, - "MATH - # output tokens": { - "description": "min=189.756, mean=254.005, max=296.346, sum=1778.034 (7)", - "tab": "General information", - "score": 254.00484808722263 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" - } - } - }, - { - "evaluation_name": "GSM8K", - "source_data": { - "dataset_name": "GSM8K", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on GSM8K", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.69, - "details": { - "description": "min=0.69, mean=0.69, max=0.69, sum=0.69 (1)", - "tab": "Accuracy", - "GSM8K - Observed inference time (s)": { - "description": "min=13.45, mean=13.45, max=13.45, sum=13.45 (1)", - "tab": "Efficiency", - "score": 13.45040065407753 - }, - "GSM8K - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "GSM8K - # train": { - "description": "min=5, 
mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "GSM8K - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GSM8K - # prompt tokens": { - "description": "min=1170.814, mean=1170.814, max=1170.814, sum=1170.814 (1)", - "tab": "General information", - "score": 1170.814 - }, - "GSM8K - # output tokens": { - "description": "min=288.079, mean=288.079, max=288.079, sum=288.079 (1)", - "tab": "General information", - "score": 288.079 - } - } - }, - "generation_config": { - "additional_details": { - "stop": "none" - } - } - }, - { - "evaluation_name": "LegalBench", - "source_data": { - "dataset_name": "LegalBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on LegalBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.519, - "details": { - "description": "min=0.145, mean=0.519, max=0.884, sum=2.594 (5)", - "tab": "Accuracy", - "LegalBench - Observed inference time (s)": { - "description": "min=0.855, mean=1.472, max=3.502, sum=7.358 (5)", - "tab": "Efficiency", - "score": 1.471592522464795 - }, - "LegalBench - # eval": { - "description": "min=95, mean=409.4, max=1000, sum=2047 (5)", - "tab": "General information", - "score": 409.4 - }, - "LegalBench - # train": { - "description": "min=4, mean=4.8, max=5, sum=24 (5)", - "tab": "General information", - "score": 4.8 - }, - "LegalBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "LegalBench - # prompt tokens": { - "description": "min=228.779, mean=1656.095, max=6814.4, sum=8280.475 (5)", - "tab": "General information", - "score": 1656.0949044887425 - }, - "LegalBench - # output tokens": { - "description": "min=1, mean=3.339, max=6.263, sum=16.697 (5)", - "tab": "General information", - "score": 3.339402150569105 - } - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] - } - } - }, - { - "evaluation_name": "MedQA", - "source_data": { - "dataset_name": "MedQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MedQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.66, - "details": { - "description": "min=0.66, mean=0.66, max=0.66, sum=0.66 (1)", - "tab": "Accuracy", - "MedQA - Observed inference time (s)": { - "description": "min=0.993, mean=0.993, max=0.993, sum=0.993 (1)", - "tab": "Efficiency", - "score": 0.9931588552107157 - }, - "MedQA - # eval": { - "description": "min=503, mean=503, max=503, sum=503 (1)", - "tab": "General information", - "score": 503.0 - }, - "MedQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "MedQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MedQA - # prompt tokens": { - "description": "min=1135.392, mean=1135.392, max=1135.392, sum=1135.392 
(1)", - "tab": "General information", - "score": 1135.3916500994035 - }, - "MedQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WMT 2014", - "source_data": { - "dataset_name": "WMT 2014", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.176, - "details": { - "description": "min=0.126, mean=0.176, max=0.218, sum=0.88 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time (s)": { - "description": "min=1.838, mean=2.095, max=2.409, sum=10.477 (5)", - "tab": "Efficiency", - "score": 2.095412739007152 - }, - "WMT 2014 - # eval": { - "description": "min=503, mean=568.8, max=832, sum=2844 (5)", - "tab": "General information", - "score": 568.8 - }, - "WMT 2014 - # train": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "WMT 2014 - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "WMT 2014 - # prompt tokens": { - "description": "min=157.298, mean=205.092, max=335.56, sum=1025.461 (5)", - "tab": "General information", - "score": 205.09213851506343 - }, - "WMT 2014 - # output tokens": { - "description": "min=24.773, mean=29.058, max=36.698, sum=145.291 (5)", - "tab": "General information", - "score": 29.058130065759293 - } - } - }, - "generation_config": { - "additional_details": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_lite/AlephAlpha/luminous-base/1a039ef6-5957-4246-82b2-bc607b6554e7.json b/data/helm_lite/AlephAlpha/luminous-base/1a039ef6-5957-4246-82b2-bc607b6554e7.json deleted file mode 100644 index fb405652b..000000000 --- a/data/helm_lite/AlephAlpha/luminous-base/1a039ef6-5957-4246-82b2-bc607b6554e7.json +++ /dev/null @@ -1,641 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_lite/AlephAlpha_luminous-base/1770834614.1822479", - "retrieved_timestamp": "1770834614.1822479", - "source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Luminous Base 13B", - "id": "AlephAlpha/luminous-base", - "developer": "AlephAlpha", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_lite", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.041, - "details": { - "tab": "Accuracy", - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.29337078651685394 - }, - "Mean win rate - General information": { - "description": null, - "tab": 
"General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.633, - "details": { - "description": "min=0.633, mean=0.633, max=0.633, sum=0.633 (1)", - "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": { - "description": "min=1.05, mean=1.05, max=1.05, sum=1.05 (1)", - "tab": "Efficiency", - "score": 1.05044368958809 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=2.037, mean=2.037, max=2.037, sum=2.037 (1)", - "tab": "General information", - "score": 2.036619718309859 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=1694.642, mean=1694.642, max=1694.642, sum=1694.642 (1)", - "tab": "General information", - "score": 1694.6422535211268 - }, - "NarrativeQA - # output tokens": { - "description": "min=5.521, mean=5.521, max=5.521, sum=5.521 (1)", - "tab": "General information", - "score": 5.52112676056338 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (closed-book)", - "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.197, - "details": { - "description": "min=0.197, mean=0.197, max=0.197, sum=0.197 (1)", - "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time (s)": { - "description": "min=1.329, mean=1.329, max=1.329, sum=1.329 (1)", - "tab": "Efficiency", - "score": 1.328731627702713 - }, - "NaturalQuestions (closed-book) - Observed inference time (s)": { - "description": "min=0.802, mean=0.802, max=0.802, sum=0.802 (1)", - "tab": "Efficiency", - "score": 0.8020290625095368 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.717, mean=4.717, max=4.717, sum=4.717 (1)", - "tab": "General information", - "score": 4.717 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.038, mean=0.038, max=0.038, sum=0.038 (1)", - "tab": "General information", - "score": 0.038 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1488.14, mean=1488.14, max=1488.14, sum=1488.14 (1)", - "tab": "General information", - "score": 1488.14 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=10.866, mean=10.866, max=10.866, sum=10.866 (1)", - "tab": 
"General information", - "score": 10.866 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=116.087, mean=116.087, max=116.087, sum=116.087 (1)", - "tab": "General information", - "score": 116.087 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=5.908, mean=5.908, max=5.908, sum=5.908 (1)", - "tab": "General information", - "score": 5.908 - } - } - }, - "generation_config": { - "additional_details": { - "mode": "closedbook" - } - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.286, - "details": { - "description": "min=0.286, mean=0.286, max=0.286, sum=0.286 (1)", - "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=0.667 (1)", - "tab": "Efficiency", - "score": 0.6669360423088073 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=254.652, mean=254.652, max=254.652, sum=254.652 (1)", - "tab": "General information", - "score": 254.652 - }, - "OpenbookQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.243, - "details": { - "description": "min=0.22, mean=0.243, max=0.29, sum=1.217 (5)", - "tab": "Accuracy", - "MMLU - Observed inference time (s)": { - "description": "min=0.619, mean=0.632, max=0.648, sum=3.162 (5)", - "tab": "Efficiency", - "score": 0.6324507230122884 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU 
- truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=360.75, mean=471.075, max=618.447, sum=2355.377 (5)", - "tab": "General information", - "score": 471.0754736842106 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MATH", - "source_data": { - "dataset_name": "MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.026, - "details": { - "description": "min=0, mean=0.026, max=0.067, sum=0.184 (7)", - "tab": "Accuracy", - "MATH - Observed inference time (s)": { - "description": "min=5.282, mean=9.204, max=20.088, sum=64.425 (7)", - "tab": "Efficiency", - "score": 9.203530075671766 - }, - "MATH - # eval": { - "description": "min=30, mean=62.429, max=135, sum=437 (7)", - "tab": "General information", - "score": 62.42857142857143 - }, - "MATH - # train": { - "description": "min=2.962, mean=6.916, max=8, sum=48.409 (7)", - "tab": "General information", - "score": 6.915558126084441 - }, - "MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "MATH - # prompt tokens": { - "description": "min=928.719, mean=1184.139, max=1546.442, sum=8288.975 (7)", - "tab": "General information", - "score": 1184.139339428874 - }, - "MATH - # output tokens": { - "description": "min=114.077, mean=139.637, max=180.663, sum=977.456 (7)", - "tab": "General information", - "score": 139.6365272403828 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" - } - } - }, - { - "evaluation_name": "GSM8K", - "source_data": { - "dataset_name": "GSM8K", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on GSM8K", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.028, - "details": { - "description": "min=0.028, mean=0.028, max=0.028, sum=0.028 (1)", - "tab": "Accuracy", - "GSM8K - Observed inference time (s)": { - "description": "min=16.427, mean=16.427, max=16.427, sum=16.427 (1)", - "tab": "Efficiency", - "score": 16.42652773284912 - }, - "GSM8K - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "GSM8K - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "GSM8K - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": 
"General information", - "score": 0.0 - }, - "GSM8K - # prompt tokens": { - "description": "min=943.121, mean=943.121, max=943.121, sum=943.121 (1)", - "tab": "General information", - "score": 943.121 - }, - "GSM8K - # output tokens": { - "description": "min=400, mean=400, max=400, sum=400 (1)", - "tab": "General information", - "score": 400.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "LegalBench", - "source_data": { - "dataset_name": "LegalBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on LegalBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.332, - "details": { - "description": "min=0.165, mean=0.332, max=0.601, sum=1.659 (5)", - "tab": "Accuracy", - "LegalBench - Observed inference time (s)": { - "description": "min=0.636, mean=0.753, max=1.073, sum=3.767 (5)", - "tab": "Efficiency", - "score": 0.7533007583490331 - }, - "LegalBench - # eval": { - "description": "min=95, mean=409.4, max=1000, sum=2047 (5)", - "tab": "General information", - "score": 409.4 - }, - "LegalBench - # train": { - "description": "min=0.335, mean=3.867, max=5, sum=19.335 (5)", - "tab": "General information", - "score": 3.866938775510204 - }, - "LegalBench - truncated": { - "description": "min=0, mean=0.133, max=0.665, sum=0.665 (5)", - "tab": "General information", - "score": 0.1330612244897959 - }, - "LegalBench - # prompt tokens": { - "description": "min=205.726, mean=566.59, max=1514.545, sum=2832.948 (5)", - "tab": "General information", - "score": 566.5895794484264 - }, - "LegalBench - # output tokens": { - "description": "min=1, mean=1.639, max=4.027, sum=8.196 (5)", - "tab": "General information", - "score": 1.6391061224489796 - } - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] - } - } - }, - { - "evaluation_name": "MedQA", - "source_data": { - "dataset_name": "MedQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MedQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.26, - "details": { - "description": "min=0.26, mean=0.26, max=0.26, sum=0.26 (1)", - "tab": "Accuracy", - "MedQA - Observed inference time (s)": { - "description": "min=0.726, mean=0.726, max=0.726, sum=0.726 (1)", - "tab": "Efficiency", - "score": 0.7258754989972882 - }, - "MedQA - # eval": { - "description": "min=503, mean=503, max=503, sum=503 (1)", - "tab": "General information", - "score": 503.0 - }, - "MedQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "MedQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MedQA - # prompt tokens": { - "description": "min=1005.229, mean=1005.229, max=1005.229, sum=1005.229 (1)", - "tab": "General information", - "score": 1005.2286282306163 - }, - "MedQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - 
"tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WMT 2014", - "source_data": { - "dataset_name": "WMT 2014", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.066, - "details": { - "description": "min=0.0, mean=0.066, max=0.171, sum=0.331 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time (s)": { - "description": "min=4.671, mean=4.693, max=4.731, sum=23.465 (5)", - "tab": "Efficiency", - "score": 4.692985351748752 - }, - "WMT 2014 - # eval": { - "description": "min=503, mean=568.8, max=832, sum=2844 (5)", - "tab": "General information", - "score": 568.8 - }, - "WMT 2014 - # train": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "WMT 2014 - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "WMT 2014 - # prompt tokens": { - "description": "min=99.111, mean=157.232, max=255.504, sum=786.158 (5)", - "tab": "General information", - "score": 157.2315362631901 - }, - "WMT 2014 - # output tokens": { - "description": "min=99.869, mean=99.974, max=100, sum=499.869 (5)", - "tab": "General information", - "score": 99.97375745526838 - } - } - }, - "generation_config": { - "additional_details": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_lite/AlephAlpha/luminous-extended/fb3abb62-b76b-4a4e-a01f-bc62deba6b36.json b/data/helm_lite/AlephAlpha/luminous-extended/fb3abb62-b76b-4a4e-a01f-bc62deba6b36.json deleted file mode 100644 index 786a7e340..000000000 --- a/data/helm_lite/AlephAlpha/luminous-extended/fb3abb62-b76b-4a4e-a01f-bc62deba6b36.json +++ /dev/null @@ -1,641 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_lite/AlephAlpha_luminous-extended/1770834614.1822479", - "retrieved_timestamp": "1770834614.1822479", - "source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Luminous Extended 30B", - "id": "AlephAlpha/luminous-extended", - "developer": "AlephAlpha", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_lite", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.078, - "details": { - "tab": "Accuracy", - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.2278027465667915 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": 
"NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.684, - "details": { - "description": "min=0.684, mean=0.684, max=0.684, sum=0.684 (1)", - "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": { - "description": "min=1.467, mean=1.467, max=1.467, sum=1.467 (1)", - "tab": "Efficiency", - "score": 1.4667296523779212 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=2.037, mean=2.037, max=2.037, sum=2.037 (1)", - "tab": "General information", - "score": 2.036619718309859 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=1694.642, mean=1694.642, max=1694.642, sum=1694.642 (1)", - "tab": "General information", - "score": 1694.6422535211268 - }, - "NarrativeQA - # output tokens": { - "description": "min=6.335, mean=6.335, max=6.335, sum=6.335 (1)", - "tab": "General information", - "score": 6.335211267605634 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (closed-book)", - "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.253, - "details": { - "description": "min=0.253, mean=0.253, max=0.253, sum=0.253 (1)", - "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time (s)": { - "description": "min=1.778, mean=1.778, max=1.778, sum=1.778 (1)", - "tab": "Efficiency", - "score": 1.777582576751709 - }, - "NaturalQuestions (closed-book) - Observed inference time (s)": { - "description": "min=0.98, mean=0.98, max=0.98, sum=0.98 (1)", - "tab": "Efficiency", - "score": 0.9799906523227692 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.717, mean=4.717, max=4.717, sum=4.717 (1)", - "tab": "General information", - "score": 4.717 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.038, mean=0.038, max=0.038, sum=0.038 (1)", - "tab": "General information", - "score": 0.038 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1488.14, mean=1488.14, max=1488.14, sum=1488.14 (1)", - "tab": "General information", - "score": 1488.14 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=11.063, mean=11.063, max=11.063, sum=11.063 (1)", - "tab": "General information", - "score": 11.063 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, 
sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=116.087, mean=116.087, max=116.087, sum=116.087 (1)", - "tab": "General information", - "score": 116.087 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=6.869, mean=6.869, max=6.869, sum=6.869 (1)", - "tab": "General information", - "score": 6.869 - } - } - }, - "generation_config": { - "additional_details": { - "mode": "closedbook" - } - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.272, - "details": { - "description": "min=0.272, mean=0.272, max=0.272, sum=0.272 (1)", - "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": { - "description": "min=0.675, mean=0.675, max=0.675, sum=0.675 (1)", - "tab": "Efficiency", - "score": 0.6750410146713257 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=254.652, mean=254.652, max=254.652, sum=254.652 (1)", - "tab": "General information", - "score": 254.652 - }, - "OpenbookQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.248, - "details": { - "description": "min=0.2, mean=0.248, max=0.31, sum=1.242 (5)", - "tab": "Accuracy", - "MMLU - Observed inference time (s)": { - "description": "min=0.69, mean=0.718, max=0.754, sum=3.592 (5)", - "tab": "Efficiency", - "score": 0.7183412402554562 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt 
tokens": { - "description": "min=360.75, mean=471.075, max=618.447, sum=2355.377 (5)", - "tab": "General information", - "score": 471.0754736842106 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MATH", - "source_data": { - "dataset_name": "MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.04, - "details": { - "description": "min=0, mean=0.04, max=0.088, sum=0.278 (7)", - "tab": "Accuracy", - "MATH - Observed inference time (s)": { - "description": "min=5.96, mean=9.364, max=12.108, sum=65.551 (7)", - "tab": "Efficiency", - "score": 9.364456500699777 - }, - "MATH - # eval": { - "description": "min=30, mean=62.429, max=135, sum=437 (7)", - "tab": "General information", - "score": 62.42857142857143 - }, - "MATH - # train": { - "description": "min=2.962, mean=6.916, max=8, sum=48.409 (7)", - "tab": "General information", - "score": 6.915558126084441 - }, - "MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "MATH - # prompt tokens": { - "description": "min=928.719, mean=1184.139, max=1546.442, sum=8288.975 (7)", - "tab": "General information", - "score": 1184.139339428874 - }, - "MATH - # output tokens": { - "description": "min=92.684, mean=142.866, max=180.2, sum=1000.065 (7)", - "tab": "General information", - "score": 142.86643564287382 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" - } - } - }, - { - "evaluation_name": "GSM8K", - "source_data": { - "dataset_name": "GSM8K", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on GSM8K", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.075, - "details": { - "description": "min=0.075, mean=0.075, max=0.075, sum=0.075 (1)", - "tab": "Accuracy", - "GSM8K - Observed inference time (s)": { - "description": "min=22.685, mean=22.685, max=22.685, sum=22.685 (1)", - "tab": "Efficiency", - "score": 22.685439155817033 - }, - "GSM8K - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "GSM8K - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "GSM8K - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GSM8K - # prompt tokens": { - "description": "min=943.121, mean=943.121, max=943.121, 
sum=943.121 (1)", - "tab": "General information", - "score": 943.121 - }, - "GSM8K - # output tokens": { - "description": "min=400, mean=400, max=400, sum=400 (1)", - "tab": "General information", - "score": 400.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "LegalBench", - "source_data": { - "dataset_name": "LegalBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on LegalBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.421, - "details": { - "description": "min=0.204, mean=0.421, max=0.632, sum=2.107 (5)", - "tab": "Accuracy", - "LegalBench - Observed inference time (s)": { - "description": "min=0.7, mean=0.858, max=1.261, sum=4.291 (5)", - "tab": "Efficiency", - "score": 0.8581969152200717 - }, - "LegalBench - # eval": { - "description": "min=95, mean=409.4, max=1000, sum=2047 (5)", - "tab": "General information", - "score": 409.4 - }, - "LegalBench - # train": { - "description": "min=0.335, mean=3.867, max=5, sum=19.335 (5)", - "tab": "General information", - "score": 3.866938775510204 - }, - "LegalBench - truncated": { - "description": "min=0, mean=0.133, max=0.665, sum=0.665 (5)", - "tab": "General information", - "score": 0.1330612244897959 - }, - "LegalBench - # prompt tokens": { - "description": "min=205.726, mean=566.59, max=1514.545, sum=2832.948 (5)", - "tab": "General information", - "score": 566.5895794484264 - }, - "LegalBench - # output tokens": { - "description": "min=1, mean=1.548, max=3.196, sum=7.739 (5)", - "tab": "General information", - "score": 1.5478898257711229 - } - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] - } - } - }, - { - "evaluation_name": "MedQA", - "source_data": { - "dataset_name": "MedQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MedQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.276, - "details": { - "description": "min=0.276, mean=0.276, max=0.276, sum=0.276 (1)", - "tab": "Accuracy", - "MedQA - Observed inference time (s)": { - "description": "min=0.895, mean=0.895, max=0.895, sum=0.895 (1)", - "tab": "Efficiency", - "score": 0.8947408758622277 - }, - "MedQA - # eval": { - "description": "min=503, mean=503, max=503, sum=503 (1)", - "tab": "General information", - "score": 503.0 - }, - "MedQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "MedQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MedQA - # prompt tokens": { - "description": "min=1005.229, mean=1005.229, max=1005.229, sum=1005.229 (1)", - "tab": "General information", - "score": 1005.2286282306163 - }, - "MedQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - 
"evaluation_name": "WMT 2014", - "source_data": { - "dataset_name": "WMT 2014", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.083, - "details": { - "description": "min=0.0, mean=0.083, max=0.194, sum=0.415 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time (s)": { - "description": "min=5.231, mean=5.336, max=5.406, sum=26.68 (5)", - "tab": "Efficiency", - "score": 5.33597646673717 - }, - "WMT 2014 - # eval": { - "description": "min=503, mean=568.8, max=832, sum=2844 (5)", - "tab": "General information", - "score": 568.8 - }, - "WMT 2014 - # train": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "WMT 2014 - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "WMT 2014 - # prompt tokens": { - "description": "min=99.111, mean=157.232, max=255.504, sum=786.158 (5)", - "tab": "General information", - "score": 157.2315362631901 - }, - "WMT 2014 - # output tokens": { - "description": "min=100, mean=100, max=100, sum=500 (5)", - "tab": "General information", - "score": 100.0 - } - } - }, - "generation_config": { - "additional_details": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_lite/AlephAlpha/luminous-supreme/0e2790d3-40f1-4124-ba41-b65bd9de1852.json b/data/helm_lite/AlephAlpha/luminous-supreme/0e2790d3-40f1-4124-ba41-b65bd9de1852.json deleted file mode 100644 index 78da47969..000000000 --- a/data/helm_lite/AlephAlpha/luminous-supreme/0e2790d3-40f1-4124-ba41-b65bd9de1852.json +++ /dev/null @@ -1,641 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_lite/AlephAlpha_luminous-supreme/1770834614.1822479", - "retrieved_timestamp": "1770834614.1822479", - "source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Luminous Supreme 70B", - "id": "AlephAlpha/luminous-supreme", - "developer": "AlephAlpha", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_lite", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.145, - "details": { - "tab": "Accuracy", - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.1344569288389513 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.743, - "details": { - "description": "min=0.743, mean=0.743, max=0.743, sum=0.743 (1)", - "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": { - "description": "min=2.951, mean=2.951, max=2.951, sum=2.951 (1)", - "tab": "Efficiency", - "score": 2.9511526873413945 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=2.037, mean=2.037, max=2.037, sum=2.037 (1)", - "tab": "General information", - "score": 2.036619718309859 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=1694.642, mean=1694.642, max=1694.642, sum=1694.642 (1)", - "tab": "General information", - "score": 1694.6422535211268 - }, - "NarrativeQA - # output tokens": { - "description": "min=5.685, mean=5.685, max=5.685, sum=5.685 (1)", - "tab": "General information", - "score": 5.6845070422535215 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (closed-book)", - "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.299, - "details": { - "description": "min=0.299, mean=0.299, max=0.299, sum=0.299 (1)", - "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time (s)": { - "description": "min=2.657, mean=2.657, max=2.657, sum=2.657 (1)", - "tab": "Efficiency", - "score": 2.656584274530411 - }, - "NaturalQuestions (closed-book) - Observed inference time (s)": { - "description": "min=1.272, mean=1.272, max=1.272, sum=1.272 (1)", - "tab": "Efficiency", - "score": 1.2722365505695343 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.717, mean=4.717, max=4.717, sum=4.717 (1)", - "tab": "General information", - "score": 4.717 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.038, mean=0.038, max=0.038, sum=0.038 (1)", - "tab": "General information", - "score": 0.038 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1488.14, mean=1488.14, max=1488.14, sum=1488.14 (1)", - "tab": "General information", - "score": 1488.14 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=6.864, mean=6.864, max=6.864, sum=6.864 (1)", - "tab": "General information", - "score": 6.864 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) 
- # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=116.087, mean=116.087, max=116.087, sum=116.087 (1)", - "tab": "General information", - "score": 116.087 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=4.666, mean=4.666, max=4.666, sum=4.666 (1)", - "tab": "General information", - "score": 4.666 - } - } - }, - "generation_config": { - "additional_details": { - "mode": "closedbook" - } - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.284, - "details": { - "description": "min=0.284, mean=0.284, max=0.284, sum=0.284 (1)", - "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": { - "description": "min=0.779, mean=0.779, max=0.779, sum=0.779 (1)", - "tab": "Efficiency", - "score": 0.778845920085907 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=254.652, mean=254.652, max=254.652, sum=254.652 (1)", - "tab": "General information", - "score": 254.652 - }, - "OpenbookQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.316, - "details": { - "description": "min=0.18, mean=0.316, max=0.5, sum=1.582 (5)", - "tab": "Accuracy", - "MMLU - Observed inference time (s)": { - "description": "min=0.825, mean=0.907, max=1.009, sum=4.537 (5)", - "tab": "Efficiency", - "score": 0.9073754794472141 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=360.75, mean=471.075, max=618.447, sum=2355.377 (5)", - "tab": "General 
information", - "score": 471.0754736842106 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MATH", - "source_data": { - "dataset_name": "MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.078, - "details": { - "description": "min=0.038, mean=0.078, max=0.158, sum=0.548 (7)", - "tab": "Accuracy", - "MATH - Observed inference time (s)": { - "description": "min=13.143, mean=16.874, max=20.77, sum=118.115 (7)", - "tab": "Efficiency", - "score": 16.873623512856078 - }, - "MATH - # eval": { - "description": "min=30, mean=62.429, max=135, sum=437 (7)", - "tab": "General information", - "score": 62.42857142857143 - }, - "MATH - # train": { - "description": "min=2.962, mean=6.916, max=8, sum=48.409 (7)", - "tab": "General information", - "score": 6.915558126084441 - }, - "MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "MATH - # prompt tokens": { - "description": "min=928.719, mean=1184.139, max=1546.442, sum=8288.975 (7)", - "tab": "General information", - "score": 1184.139339428874 - }, - "MATH - # output tokens": { - "description": "min=90.605, mean=127.587, max=150.635, sum=893.112 (7)", - "tab": "General information", - "score": 127.58738933898053 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" - } - } - }, - { - "evaluation_name": "GSM8K", - "source_data": { - "dataset_name": "GSM8K", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on GSM8K", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.137, - "details": { - "description": "min=0.137, mean=0.137, max=0.137, sum=0.137 (1)", - "tab": "Accuracy", - "GSM8K - Observed inference time (s)": { - "description": "min=48.242, mean=48.242, max=48.242, sum=48.242 (1)", - "tab": "Efficiency", - "score": 48.241569149971006 - }, - "GSM8K - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "GSM8K - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "GSM8K - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GSM8K - # prompt tokens": { - "description": "min=943.121, mean=943.121, max=943.121, sum=943.121 (1)", - "tab": "General information", - "score": 943.121 - }, - "GSM8K - # output tokens": { 
- "description": "min=400, mean=400, max=400, sum=400 (1)", - "tab": "General information", - "score": 400.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "LegalBench", - "source_data": { - "dataset_name": "LegalBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on LegalBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.452, - "details": { - "description": "min=0.221, mean=0.452, max=0.768, sum=2.26 (5)", - "tab": "Accuracy", - "LegalBench - Observed inference time (s)": { - "description": "min=0.84, mean=1.156, max=2.035, sum=5.781 (5)", - "tab": "Efficiency", - "score": 1.1561943690304337 - }, - "LegalBench - # eval": { - "description": "min=95, mean=409.4, max=1000, sum=2047 (5)", - "tab": "General information", - "score": 409.4 - }, - "LegalBench - # train": { - "description": "min=0.335, mean=3.867, max=5, sum=19.335 (5)", - "tab": "General information", - "score": 3.866938775510204 - }, - "LegalBench - truncated": { - "description": "min=0, mean=0.133, max=0.665, sum=0.665 (5)", - "tab": "General information", - "score": 0.1330612244897959 - }, - "LegalBench - # prompt tokens": { - "description": "min=205.726, mean=566.59, max=1514.545, sum=2832.948 (5)", - "tab": "General information", - "score": 566.5895794484264 - }, - "LegalBench - # output tokens": { - "description": "min=1, mean=1.266, max=1.769, sum=6.329 (5)", - "tab": "General information", - "score": 1.2657996218650946 - } - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] - } - } - }, - { - "evaluation_name": "MedQA", - "source_data": { - "dataset_name": "MedQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MedQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.276, - "details": { - "description": "min=0.276, mean=0.276, max=0.276, sum=0.276 (1)", - "tab": "Accuracy", - "MedQA - Observed inference time (s)": { - "description": "min=1.326, mean=1.326, max=1.326, sum=1.326 (1)", - "tab": "Efficiency", - "score": 1.325726029887114 - }, - "MedQA - # eval": { - "description": "min=503, mean=503, max=503, sum=503 (1)", - "tab": "General information", - "score": 503.0 - }, - "MedQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "MedQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MedQA - # prompt tokens": { - "description": "min=1005.229, mean=1005.229, max=1005.229, sum=1005.229 (1)", - "tab": "General information", - "score": 1005.2286282306163 - }, - "MedQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WMT 2014", - "source_data": { - "dataset_name": "WMT 2014", - "source_type": "url", - 
"url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.102, - "details": { - "description": "min=0.0, mean=0.102, max=0.193, sum=0.512 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time (s)": { - "description": "min=10.924, mean=11.052, max=11.265, sum=55.26 (5)", - "tab": "Efficiency", - "score": 11.052006985892152 - }, - "WMT 2014 - # eval": { - "description": "min=503, mean=568.8, max=832, sum=2844 (5)", - "tab": "General information", - "score": 568.8 - }, - "WMT 2014 - # train": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "WMT 2014 - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "WMT 2014 - # prompt tokens": { - "description": "min=99.111, mean=157.232, max=255.504, sum=786.158 (5)", - "tab": "General information", - "score": 157.2315362631901 - }, - "WMT 2014 - # output tokens": { - "description": "min=100, mean=100, max=100, sum=500 (5)", - "tab": "General information", - "score": 100.0 - } - } - }, - "generation_config": { - "additional_details": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_lite/ai21/j2-grande/d55129d3-4eae-4009-a897-fa1624cea6a2.json b/data/helm_lite/ai21/j2-grande/d55129d3-4eae-4009-a897-fa1624cea6a2.json deleted file mode 100644 index 2b870e958..000000000 --- a/data/helm_lite/ai21/j2-grande/d55129d3-4eae-4009-a897-fa1624cea6a2.json +++ /dev/null @@ -1,641 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_lite/ai21_j2-grande/1770834614.1822479", - "retrieved_timestamp": "1770834614.1822479", - "source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Jurassic-2 Grande 17B", - "id": "ai21/j2-grande", - "developer": "ai21", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_lite", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.172, - "details": { - "tab": "Accuracy", - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.39915106117353305 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.744, - "details": { - "description": "min=0.744, mean=0.744, max=0.744, sum=0.744 (1)", - "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": { - "description": "min=1.179, mean=1.179, max=1.179, sum=1.179 (1)", - "tab": "Efficiency", - "score": 1.1790085772393455 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=3.225, mean=3.225, max=3.225, sum=3.225 (1)", - "tab": "General information", - "score": 3.2253521126760565 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=1700.741, mean=1700.741, max=1700.741, sum=1700.741 (1)", - "tab": "General information", - "score": 1700.7408450704224 - }, - "NarrativeQA - # output tokens": { - "description": "min=5.039, mean=5.039, max=5.039, sum=5.039 (1)", - "tab": "General information", - "score": 5.03943661971831 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (closed-book)", - "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.35, - "details": { - "description": "min=0.35, mean=0.35, max=0.35, sum=0.35 (1)", - "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time (s)": { - "description": "min=1.462, mean=1.462, max=1.462, sum=1.462 (1)", - "tab": "Efficiency", - "score": 1.4618877012729645 - }, - "NaturalQuestions (closed-book) - Observed inference time (s)": { - "description": "min=0.631, mean=0.631, max=0.631, sum=0.631 (1)", - "tab": "Efficiency", - "score": 0.630548656463623 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.697, mean=4.697, max=4.697, sum=4.697 (1)", - "tab": "General information", - "score": 4.697 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.038, mean=0.038, max=0.038, sum=0.038 (1)", - "tab": "General information", - "score": 0.038 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1522.929, mean=1522.929, max=1522.929, sum=1522.929 (1)", - "tab": "General information", - "score": 1522.929 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=5.441, mean=5.441, max=5.441, sum=5.441 (1)", - "tab": "General information", - "score": 5.441 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - 
"tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=102.377, mean=102.377, max=102.377, sum=102.377 (1)", - "tab": "General information", - "score": 102.377 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=6.614, mean=6.614, max=6.614, sum=6.614 (1)", - "tab": "General information", - "score": 6.614 - } - } - }, - "generation_config": { - "additional_details": { - "mode": "closedbook" - } - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.614, - "details": { - "description": "min=0.614, mean=0.614, max=0.614, sum=0.614 (1)", - "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": { - "description": "min=0.519, mean=0.519, max=0.519, sum=0.519 (1)", - "tab": "Efficiency", - "score": 0.519375147819519 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=188.75, mean=188.75, max=188.75, sum=188.75 (1)", - "tab": "General information", - "score": 188.75 - }, - "OpenbookQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.471, - "details": { - "description": "min=0.25, mean=0.471, max=0.77, sum=2.355 (5)", - "tab": "Accuracy", - "MMLU - Observed inference time (s)": { - "description": "min=0.549, mean=0.621, max=0.755, sum=3.103 (5)", - "tab": "Efficiency", - "score": 0.6205235414421348 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=308.59, mean=396.74, max=552.719, sum=1983.699 (5)", - "tab": "General information", - "score": 396.7398596491228 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - 
"additional_details": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MATH", - "source_data": { - "dataset_name": "MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.064, - "details": { - "description": "min=0, mean=0.064, max=0.158, sum=0.445 (7)", - "tab": "Accuracy", - "MATH - Observed inference time (s)": { - "description": "min=2.609, mean=4.862, max=6.298, sum=34.036 (7)", - "tab": "Efficiency", - "score": 4.862255273244342 - }, - "MATH - # eval": { - "description": "min=30, mean=62.429, max=135, sum=437 (7)", - "tab": "General information", - "score": 62.42857142857143 - }, - "MATH - # train": { - "description": "min=2, mean=6.778, max=8, sum=47.447 (7)", - "tab": "General information", - "score": 6.7781954887218046 - }, - "MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "MATH - # prompt tokens": { - "description": "min=450.154, mean=943.419, max=1490.395, sum=6603.93 (7)", - "tab": "General information", - "score": 943.4185034241337 - }, - "MATH - # output tokens": { - "description": "min=74.123, mean=140.295, max=209.933, sum=982.063 (7)", - "tab": "General information", - "score": 140.29469320289397 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" - } - } - }, - { - "evaluation_name": "GSM8K", - "source_data": { - "dataset_name": "GSM8K", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on GSM8K", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.159, - "details": { - "description": "min=0.159, mean=0.159, max=0.159, sum=0.159 (1)", - "tab": "Accuracy", - "GSM8K - Observed inference time (s)": { - "description": "min=5.417, mean=5.417, max=5.417, sum=5.417 (1)", - "tab": "Efficiency", - "score": 5.417125414848328 - }, - "GSM8K - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "GSM8K - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "GSM8K - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GSM8K - # prompt tokens": { - "description": "min=823.394, mean=823.394, max=823.394, sum=823.394 (1)", - "tab": "General information", - "score": 823.394 - }, - "GSM8K - # output tokens": { - "description": "min=121.336, mean=121.336, max=121.336, sum=121.336 (1)", - "tab": "General information", - "score": 121.336 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": 
"LegalBench", - "source_data": { - "dataset_name": "LegalBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on LegalBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.468, - "details": { - "description": "min=0.199, mean=0.468, max=0.842, sum=2.338 (5)", - "tab": "Accuracy", - "LegalBench - Observed inference time (s)": { - "description": "min=0.409, mean=0.712, max=1.079, sum=3.561 (5)", - "tab": "Efficiency", - "score": 0.7122931517101486 - }, - "LegalBench - # eval": { - "description": "min=95, mean=409.4, max=1000, sum=2047 (5)", - "tab": "General information", - "score": 409.4 - }, - "LegalBench - # train": { - "description": "min=1.006, mean=4.001, max=5, sum=20.006 (5)", - "tab": "General information", - "score": 4.001224489795918 - }, - "LegalBench - truncated": { - "description": "min=0, mean=0.002, max=0.012, sum=0.012 (5)", - "tab": "General information", - "score": 0.0024489795918367346 - }, - "LegalBench - # prompt tokens": { - "description": "min=171.042, mean=503.146, max=1514.22, sum=2515.73 (5)", - "tab": "General information", - "score": 503.1459259177527 - }, - "LegalBench - # output tokens": { - "description": "min=2, mean=2.056, max=2.216, sum=10.282 (5)", - "tab": "General information", - "score": 2.0563001835066452 - } - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] - } - } - }, - { - "evaluation_name": "MedQA", - "source_data": { - "dataset_name": "MedQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MedQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.39, - "details": { - "description": "min=0.39, mean=0.39, max=0.39, sum=0.39 (1)", - "tab": "Accuracy", - "MedQA - Observed inference time (s)": { - "description": "min=0.914, mean=0.914, max=0.914, sum=0.914 (1)", - "tab": "Efficiency", - "score": 0.9142626611660299 - }, - "MedQA - # eval": { - "description": "min=503, mean=503, max=503, sum=503 (1)", - "tab": "General information", - "score": 503.0 - }, - "MedQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "MedQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MedQA - # prompt tokens": { - "description": "min=758.622, mean=758.622, max=758.622, sum=758.622 (1)", - "tab": "General information", - "score": 758.6222664015904 - }, - "MedQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WMT 2014", - "source_data": { - "dataset_name": "WMT 2014", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.102, - "details": { - "description": "min=0.021, mean=0.102, max=0.149, sum=0.509 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time (s)": { - "description": "min=0.723, mean=0.759, max=0.81, sum=3.793 (5)", - "tab": "Efficiency", - "score": 0.7586197336965614 - }, - "WMT 2014 - # eval": { - "description": "min=503, mean=568.8, max=832, sum=2844 (5)", - "tab": "General information", - "score": 568.8 - }, - "WMT 2014 - # train": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "WMT 2014 - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "WMT 2014 - # prompt tokens": { - "description": "min=123.229, mean=135.468, max=148.278, sum=677.341 (5)", - "tab": "General information", - "score": 135.46828404572565 - }, - "WMT 2014 - # output tokens": { - "description": "min=17.372, mean=19.051, max=21.34, sum=95.255 (5)", - "tab": "General information", - "score": 19.050931430646887 - } - } - }, - "generation_config": { - "additional_details": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_lite/ai21/j2-jumbo/6332f0b3-7fab-41ed-a8da-46b142051377.json b/data/helm_lite/ai21/j2-jumbo/6332f0b3-7fab-41ed-a8da-46b142051377.json deleted file mode 100644 index 643b24001..000000000 --- a/data/helm_lite/ai21/j2-jumbo/6332f0b3-7fab-41ed-a8da-46b142051377.json +++ /dev/null @@ -1,641 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_lite/ai21_j2-jumbo/1770834614.1822479", - "retrieved_timestamp": "1770834614.1822479", - "source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Jurassic-2 Jumbo 178B", - "id": "ai21/j2-jumbo", - "developer": "ai21", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_lite", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.215, - "details": { - "tab": "Accuracy", - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.19473158551810238 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.728, - "details": { - "description": "min=0.728, mean=0.728, max=0.728, sum=0.728 (1)", - 
"tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": { - "description": "min=1.82, mean=1.82, max=1.82, sum=1.82 (1)", - "tab": "Efficiency", - "score": 1.8203622415032186 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=2534.434, mean=2534.434, max=2534.434, sum=2534.434 (1)", - "tab": "General information", - "score": 2534.4338028169013 - }, - "NarrativeQA - # output tokens": { - "description": "min=6.583, mean=6.583, max=6.583, sum=6.583 (1)", - "tab": "General information", - "score": 6.583098591549295 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (closed-book)", - "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.385, - "details": { - "description": "min=0.385, mean=0.385, max=0.385, sum=0.385 (1)", - "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time (s)": { - "description": "min=1.448, mean=1.448, max=1.448, sum=1.448 (1)", - "tab": "Efficiency", - "score": 1.4479399914741515 - }, - "NaturalQuestions (closed-book) - Observed inference time (s)": { - "description": "min=5.332, mean=5.332, max=5.332, sum=5.332 (1)", - "tab": "Efficiency", - "score": 5.3321147253513335 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.931, mean=4.931, max=4.931, sum=4.931 (1)", - "tab": "General information", - "score": 4.931 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.012, mean=0.012, max=0.012, sum=0.012 (1)", - "tab": "General information", - "score": 0.012 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1687.673, mean=1687.673, max=1687.673, sum=1687.673 (1)", - "tab": "General information", - "score": 1687.673 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=4.785, mean=4.785, max=4.785, sum=4.785 (1)", - "tab": "General information", - "score": 4.785 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=102.377, mean=102.377, max=102.377, sum=102.377 (1)", - "tab": "General information", - "score": 
102.377 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=5.79, mean=5.79, max=5.79, sum=5.79 (1)", - "tab": "General information", - "score": 5.79 - } - } - }, - "generation_config": { - "additional_details": { - "mode": "closedbook" - } - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.688, - "details": { - "description": "min=0.688, mean=0.688, max=0.688, sum=0.688 (1)", - "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": { - "description": "min=0.998, mean=0.998, max=0.998, sum=0.998 (1)", - "tab": "Efficiency", - "score": 0.9981746392250062 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=188.75, mean=188.75, max=188.75, sum=188.75 (1)", - "tab": "General information", - "score": 188.75 - }, - "OpenbookQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.483, - "details": { - "description": "min=0.25, mean=0.483, max=0.83, sum=2.413 (5)", - "tab": "Accuracy", - "MMLU - Observed inference time (s)": { - "description": "min=0.693, mean=0.81, max=0.92, sum=4.052 (5)", - "tab": "Efficiency", - "score": 0.8103257050430566 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=308.59, mean=396.74, max=552.719, sum=1983.699 (5)", - "tab": "General information", - "score": 396.7398596491228 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MATH", - 
"source_data": { - "dataset_name": "MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.103, - "details": { - "description": "min=0.033, mean=0.103, max=0.193, sum=0.72 (7)", - "tab": "Accuracy", - "MATH - Observed inference time (s)": { - "description": "min=4.497, mean=9.136, max=13.531, sum=63.951 (7)", - "tab": "Efficiency", - "score": 9.135811412885502 - }, - "MATH - # eval": { - "description": "min=30, mean=62.429, max=135, sum=437 (7)", - "tab": "General information", - "score": 62.42857142857143 - }, - "MATH - # train": { - "description": "min=8, mean=8, max=8, sum=56 (7)", - "tab": "General information", - "score": 8.0 - }, - "MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "MATH - # prompt tokens": { - "description": "min=796.795, mean=1321.422, max=2516.154, sum=9249.956 (7)", - "tab": "General information", - "score": 1321.42226282263 - }, - "MATH - # output tokens": { - "description": "min=76.281, mean=136.538, max=220.133, sum=955.767 (7)", - "tab": "General information", - "score": 136.53809167621895 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" - } - } - }, - { - "evaluation_name": "GSM8K", - "source_data": { - "dataset_name": "GSM8K", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on GSM8K", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.239, - "details": { - "description": "min=0.239, mean=0.239, max=0.239, sum=0.239 (1)", - "tab": "Accuracy", - "GSM8K - Observed inference time (s)": { - "description": "min=5.176, mean=5.176, max=5.176, sum=5.176 (1)", - "tab": "Efficiency", - "score": 5.176425676584244 - }, - "GSM8K - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "GSM8K - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "GSM8K - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GSM8K - # prompt tokens": { - "description": "min=823.394, mean=823.394, max=823.394, sum=823.394 (1)", - "tab": "General information", - "score": 823.394 - }, - "GSM8K - # output tokens": { - "description": "min=102.036, mean=102.036, max=102.036, sum=102.036 (1)", - "tab": "General information", - "score": 102.036 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "LegalBench", - "source_data": { - "dataset_name": "LegalBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - 
"evaluation_description": "EM on LegalBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.533, - "details": { - "description": "min=0.324, mean=0.533, max=0.821, sum=2.666 (5)", - "tab": "Accuracy", - "LegalBench - Observed inference time (s)": { - "description": "min=0.639, mean=1.274, max=2.827, sum=6.369 (5)", - "tab": "Efficiency", - "score": 1.2737073742826783 - }, - "LegalBench - # eval": { - "description": "min=95, mean=409.4, max=1000, sum=2047 (5)", - "tab": "General information", - "score": 409.4 - }, - "LegalBench - # train": { - "description": "min=4, mean=4.798, max=5, sum=23.992 (5)", - "tab": "General information", - "score": 4.798367346938775 - }, - "LegalBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "LegalBench - # prompt tokens": { - "description": "min=171.042, mean=1120.486, max=4600.92, sum=5602.43 (5)", - "tab": "General information", - "score": 1120.4859259177529 - }, - "LegalBench - # output tokens": { - "description": "min=2, mean=2.028, max=2.098, sum=10.141 (5)", - "tab": "General information", - "score": 2.028218528610354 - } - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] - } - } - }, - { - "evaluation_name": "MedQA", - "source_data": { - "dataset_name": "MedQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MedQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.431, - "details": { - "description": "min=0.431, mean=0.431, max=0.431, sum=0.431 (1)", - "tab": "Accuracy", - "MedQA - Observed inference time (s)": { - "description": "min=1.535, mean=1.535, max=1.535, sum=1.535 (1)", - "tab": "Efficiency", - "score": 1.5350148075854566 - }, - "MedQA - # eval": { - "description": "min=503, mean=503, max=503, sum=503 (1)", - "tab": "General information", - "score": 503.0 - }, - "MedQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "MedQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MedQA - # prompt tokens": { - "description": "min=758.622, mean=758.622, max=758.622, sum=758.622 (1)", - "tab": "General information", - "score": 758.6222664015904 - }, - "MedQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WMT 2014", - "source_data": { - "dataset_name": "WMT 2014", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.114, - "details": { - "description": "min=0.044, mean=0.114, max=0.148, sum=0.572 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time 
(s)": { - "description": "min=1.236, mean=1.441, max=1.665, sum=7.206 (5)", - "tab": "Efficiency", - "score": 1.4411698855373092 - }, - "WMT 2014 - # eval": { - "description": "min=503, mean=568.8, max=832, sum=2844 (5)", - "tab": "General information", - "score": 568.8 - }, - "WMT 2014 - # train": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "WMT 2014 - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "WMT 2014 - # prompt tokens": { - "description": "min=123.229, mean=135.468, max=148.278, sum=677.341 (5)", - "tab": "General information", - "score": 135.46828404572565 - }, - "WMT 2014 - # output tokens": { - "description": "min=19.839, mean=24.063, max=30.439, sum=120.314 (5)", - "tab": "General information", - "score": 24.062830708059337 - } - } - }, - "generation_config": { - "additional_details": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_lite/ai21/jamba-1.5-large/0cb33741-ca10-40f5-90d3-28e300901ad3.json b/data/helm_lite/ai21/jamba-1.5-large/0cb33741-ca10-40f5-90d3-28e300901ad3.json deleted file mode 100644 index a07da123a..000000000 --- a/data/helm_lite/ai21/jamba-1.5-large/0cb33741-ca10-40f5-90d3-28e300901ad3.json +++ /dev/null @@ -1,643 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_lite/ai21_jamba-1.5-large/1770834614.1822479", - "retrieved_timestamp": "1770834614.1822479", - "source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Jamba 1.5 Large", - "id": "ai21/jamba-1.5-large", - "developer": "ai21", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_lite", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.637, - "details": { - "tab": "Accuracy", - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.26377028714107364 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.664, - "details": { - "description": "min=0.664, mean=0.664, max=0.664, sum=0.664 (1)", - "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": { - "description": "min=1.969, mean=1.969, max=1.969, sum=1.969 (1)", - "tab": "Efficiency", - "score": 1.9694313982842673 - }, - "NarrativeQA - # eval": { - "description": "min=355, 
mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=3595.597, mean=3595.597, max=3595.597, sum=3595.597 (1)", - "tab": "General information", - "score": 3595.5971830985914 - }, - "NarrativeQA - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (closed-book)", - "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.394, - "details": { - "description": "min=0.394, mean=0.394, max=0.394, sum=0.394 (1)", - "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time (s)": { - "description": "min=1.678, mean=1.678, max=1.678, sum=1.678 (1)", - "tab": "Efficiency", - "score": 1.678127991437912 - }, - "NaturalQuestions (closed-book) - Observed inference time (s)": { - "description": "min=1.272, mean=1.272, max=1.272, sum=1.272 (1)", - "tab": "Efficiency", - "score": 1.2717866213321687 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=2333.076, mean=2333.076, max=2333.076, sum=2333.076 (1)", - "tab": "General information", - "score": 2333.076 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=152.394, mean=152.394, max=152.394, sum=152.394 (1)", - "tab": "General information", - "score": 152.394 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "mode": "closedbook" - } - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - 
"dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.948, - "details": { - "description": "min=0.948, mean=0.948, max=0.948, sum=0.948 (1)", - "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": { - "description": "min=0.91, mean=0.91, max=0.91, sum=0.91 (1)", - "tab": "Efficiency", - "score": 0.9100792293548584 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=261.348, mean=261.348, max=261.348, sum=261.348 (1)", - "tab": "General information", - "score": 261.348 - }, - "OpenbookQA - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.683, - "details": { - "description": "min=0.53, mean=0.683, max=0.92, sum=3.414 (5)", - "tab": "Accuracy", - "MMLU - Observed inference time (s)": { - "description": "min=0.933, mean=0.973, max=1.0, sum=4.866 (5)", - "tab": "Efficiency", - "score": 0.973254363085094 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=397.58, mean=508.138, max=678.64, sum=2540.69 (5)", - "tab": "General information", - "score": 508.1380701754386 - }, - "MMLU - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MATH", - "source_data": { - "dataset_name": "MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.692, - "details": { - "description": "min=0.481, mean=0.692, max=0.889, sum=4.842 (7)", - "tab": "Accuracy", - "MATH - Observed inference time (s)": { - "description": "min=2.366, mean=3.179, max=4.736, sum=22.253 (7)", - "tab": "Efficiency", - "score": 3.1790229759699775 - }, - "MATH - # eval": { - "description": "min=30, mean=62.429, max=135, sum=437 (7)", - "tab": "General information", - "score": 62.42857142857143 - }, - "MATH - # train": { - "description": "min=8, mean=8, max=8, sum=56 (7)", - "tab": "General information", - "score": 8.0 - }, - "MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "MATH - # prompt tokens": { - "description": "min=979.415, mean=1458.376, max=2550.115, sum=10208.634 (7)", - "tab": "General information", - "score": 1458.376275861588 - }, - "MATH - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" - } - } - }, - { - "evaluation_name": "GSM8K", - "source_data": { - "dataset_name": "GSM8K", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on GSM8K", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.846, - "details": { - "description": "min=0.846, mean=0.846, max=0.846, sum=0.846 (1)", - "tab": "Accuracy", - "GSM8K - Observed inference time (s)": { - "description": "min=3.942, mean=3.942, max=3.942, sum=3.942 (1)", - "tab": "Efficiency", - "score": 3.942030364751816 - }, - "GSM8K - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "GSM8K - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "GSM8K - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GSM8K - # prompt tokens": { - "description": "min=1163.818, mean=1163.818, max=1163.818, sum=1163.818 (1)", - "tab": "General information", - "score": 1163.818 - }, - "GSM8K - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "stop": "none" - } - } - }, - { - "evaluation_name": "LegalBench", - "source_data": { - "dataset_name": "LegalBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on LegalBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.675, - "details": { - "description": "min=0.409, mean=0.675, max=0.989, sum=3.375 (5)", - "tab": "Accuracy", - "LegalBench - Observed inference time (s)": { - "description": "min=0.933, mean=1.258, max=2.367, 
sum=6.289 (5)", - "tab": "Efficiency", - "score": 1.2577736545740559 - }, - "LegalBench - # eval": { - "description": "min=95, mean=409.4, max=1000, sum=2047 (5)", - "tab": "General information", - "score": 409.4 - }, - "LegalBench - # train": { - "description": "min=4, mean=4.8, max=5, sum=24 (5)", - "tab": "General information", - "score": 4.8 - }, - "LegalBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "LegalBench - # prompt tokens": { - "description": "min=212.453, mean=1601.843, max=6618.612, sum=8009.215 (5)", - "tab": "General information", - "score": 1601.842950915631 - }, - "LegalBench - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] - } - } - }, - { - "evaluation_name": "MedQA", - "source_data": { - "dataset_name": "MedQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MedQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.698, - "details": { - "description": "min=0.698, mean=0.698, max=0.698, sum=0.698 (1)", - "tab": "Accuracy", - "MedQA - Observed inference time (s)": { - "description": "min=0.999, mean=0.999, max=0.999, sum=0.999 (1)", - "tab": "Efficiency", - "score": 0.9989562840395372 - }, - "MedQA - # eval": { - "description": "min=503, mean=503, max=503, sum=503 (1)", - "tab": "General information", - "score": 503.0 - }, - "MedQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "MedQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MedQA - # prompt tokens": { - "description": "min=1085.239, mean=1085.239, max=1085.239, sum=1085.239 (1)", - "tab": "General information", - "score": 1085.2385685884692 - }, - "MedQA - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WMT 2014", - "source_data": { - "dataset_name": "WMT 2014", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.203, - "details": { - "description": "min=0.141, mean=0.203, max=0.246, sum=1.015 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time (s)": { - "description": "min=1.317, mean=1.386, max=1.471, sum=6.93 (5)", - "tab": "Efficiency", - "score": 1.3859240114613673 - }, - "WMT 2014 - # eval": { - "description": "min=503, mean=568.8, max=832, sum=2844 (5)", - "tab": "General information", - "score": 568.8 - }, - "WMT 2014 - # train": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "WMT 2014 - truncated": { - 
"description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "WMT 2014 - # prompt tokens": { - "description": "min=120.386, mean=151.077, max=189.223, sum=755.383 (5)", - "tab": "General information", - "score": 151.07662629989292 - }, - "WMT 2014 - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_lite/ai21/jamba-1.5-mini/80b60ccd-4711-4bce-a0f7-33d5b14fa97d.json b/data/helm_lite/ai21/jamba-1.5-mini/80b60ccd-4711-4bce-a0f7-33d5b14fa97d.json deleted file mode 100644 index 9e0628c9d..000000000 --- a/data/helm_lite/ai21/jamba-1.5-mini/80b60ccd-4711-4bce-a0f7-33d5b14fa97d.json +++ /dev/null @@ -1,643 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_lite/ai21_jamba-1.5-mini/1770834614.1822479", - "retrieved_timestamp": "1770834614.1822479", - "source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Jamba 1.5 Mini", - "id": "ai21/jamba-1.5-mini", - "developer": "ai21", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_lite", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.414, - "details": { - "tab": "Accuracy", - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.44747815230961296 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.746, - "details": { - "description": "min=0.746, mean=0.746, max=0.746, sum=0.746 (1)", - "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": { - "description": "min=0.998, mean=0.998, max=0.998, sum=0.998 (1)", - "tab": "Efficiency", - "score": 0.9981950746455662 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=3595.597, mean=3595.597, max=3595.597, sum=3595.597 (1)", - 
"tab": "General information", - "score": 3595.5971830985914 - }, - "NarrativeQA - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (closed-book)", - "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.388, - "details": { - "description": "min=0.388, mean=0.388, max=0.388, sum=0.388 (1)", - "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time (s)": { - "description": "min=0.924, mean=0.924, max=0.924, sum=0.924 (1)", - "tab": "Efficiency", - "score": 0.9243871104717255 - }, - "NaturalQuestions (closed-book) - Observed inference time (s)": { - "description": "min=0.844, mean=0.844, max=0.844, sum=0.844 (1)", - "tab": "Efficiency", - "score": 0.8436705965995789 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=2333.076, mean=2333.076, max=2333.076, sum=2333.076 (1)", - "tab": "General information", - "score": 2333.076 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=152.394, mean=152.394, max=152.394, sum=152.394 (1)", - "tab": "General information", - "score": 152.394 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "mode": "closedbook" - } - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.89, - "details": { - "description": "min=0.89, mean=0.89, max=0.89, sum=0.89 
(1)", - "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": { - "description": "min=0.786, mean=0.786, max=0.786, sum=0.786 (1)", - "tab": "Efficiency", - "score": 0.7863723936080933 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=261.348, mean=261.348, max=261.348, sum=261.348 (1)", - "tab": "General information", - "score": 261.348 - }, - "OpenbookQA - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.582, - "details": { - "description": "min=0.33, mean=0.582, max=0.9, sum=2.911 (5)", - "tab": "Accuracy", - "MMLU - Observed inference time (s)": { - "description": "min=0.783, mean=0.81, max=0.83, sum=4.049 (5)", - "tab": "Efficiency", - "score": 0.8097888966024968 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=397.58, mean=508.138, max=678.64, sum=2540.69 (5)", - "tab": "General information", - "score": 508.1380701754386 - }, - "MMLU - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MATH", - "source_data": { - "dataset_name": "MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.318, - "details": { - "description": "min=0.233, mean=0.318, max=0.386, sum=2.227 (7)", - "tab": "Accuracy", - "MATH - Observed inference time (s)": { - "description": "min=1.462, mean=1.636, max=2.034, sum=11.452 (7)", - "tab": "Efficiency", - "score": 1.63604986000122 - }, - "MATH - # eval": { - "description": "min=30, mean=62.429, max=135, sum=437 (7)", - "tab": "General information", - "score": 
62.42857142857143 - }, - "MATH - # train": { - "description": "min=8, mean=8, max=8, sum=56 (7)", - "tab": "General information", - "score": 8.0 - }, - "MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "MATH - # prompt tokens": { - "description": "min=979.415, mean=1458.376, max=2550.115, sum=10208.634 (7)", - "tab": "General information", - "score": 1458.376275861588 - }, - "MATH - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" - } - } - }, - { - "evaluation_name": "GSM8K", - "source_data": { - "dataset_name": "GSM8K", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on GSM8K", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.691, - "details": { - "description": "min=0.691, mean=0.691, max=0.691, sum=0.691 (1)", - "tab": "Accuracy", - "GSM8K - Observed inference time (s)": { - "description": "min=1.892, mean=1.892, max=1.892, sum=1.892 (1)", - "tab": "Efficiency", - "score": 1.8916997435092926 - }, - "GSM8K - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "GSM8K - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "GSM8K - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GSM8K - # prompt tokens": { - "description": "min=1163.818, mean=1163.818, max=1163.818, sum=1163.818 (1)", - "tab": "General information", - "score": 1163.818 - }, - "GSM8K - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "stop": "none" - } - } - }, - { - "evaluation_name": "LegalBench", - "source_data": { - "dataset_name": "LegalBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on LegalBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.503, - "details": { - "description": "min=0.365, mean=0.503, max=0.842, sum=2.514 (5)", - "tab": "Accuracy", - "LegalBench - Observed inference time (s)": { - "description": "min=0.805, mean=0.864, max=1.071, sum=4.322 (5)", - "tab": "Efficiency", - "score": 0.8644844750252041 - }, - "LegalBench - # eval": { - "description": "min=95, mean=409.4, max=1000, sum=2047 (5)", - "tab": "General information", - "score": 409.4 - }, - "LegalBench - # train": { - "description": "min=4, mean=4.8, max=5, sum=24 (5)", - "tab": "General information", - "score": 4.8 - }, - "LegalBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 
0.0 - }, - "LegalBench - # prompt tokens": { - "description": "min=212.453, mean=1601.843, max=6618.612, sum=8009.215 (5)", - "tab": "General information", - "score": 1601.842950915631 - }, - "LegalBench - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] - } - } - }, - { - "evaluation_name": "MedQA", - "source_data": { - "dataset_name": "MedQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MedQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.632, - "details": { - "description": "min=0.632, mean=0.632, max=0.632, sum=0.632 (1)", - "tab": "Accuracy", - "MedQA - Observed inference time (s)": { - "description": "min=0.817, mean=0.817, max=0.817, sum=0.817 (1)", - "tab": "Efficiency", - "score": 0.8172814860258615 - }, - "MedQA - # eval": { - "description": "min=503, mean=503, max=503, sum=503 (1)", - "tab": "General information", - "score": 503.0 - }, - "MedQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "MedQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MedQA - # prompt tokens": { - "description": "min=1085.239, mean=1085.239, max=1085.239, sum=1085.239 (1)", - "tab": "General information", - "score": 1085.2385685884692 - }, - "MedQA - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WMT 2014", - "source_data": { - "dataset_name": "WMT 2014", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.179, - "details": { - "description": "min=0.116, mean=0.179, max=0.21, sum=0.895 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time (s)": { - "description": "min=0.965, mean=0.978, max=0.99, sum=4.888 (5)", - "tab": "Efficiency", - "score": 0.9776749755042665 - }, - "WMT 2014 - # eval": { - "description": "min=503, mean=568.8, max=832, sum=2844 (5)", - "tab": "General information", - "score": 568.8 - }, - "WMT 2014 - # train": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "WMT 2014 - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "WMT 2014 - # prompt tokens": { - "description": "min=120.386, mean=151.077, max=189.223, sum=755.383 (5)", - "tab": "General information", - "score": 151.07662629989292 - }, - "WMT 2014 - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - 
"language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_lite/ai21/jamba-instruct/de41775f-f60e-481e-a8ef-3df9a9b65a5a.json b/data/helm_lite/ai21/jamba-instruct/de41775f-f60e-481e-a8ef-3df9a9b65a5a.json deleted file mode 100644 index 9e1241a8e..000000000 --- a/data/helm_lite/ai21/jamba-instruct/de41775f-f60e-481e-a8ef-3df9a9b65a5a.json +++ /dev/null @@ -1,642 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_lite/ai21_jamba-instruct/1770834614.1822479", - "retrieved_timestamp": "1770834614.1822479", - "source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Jamba Instruct", - "id": "ai21/jamba-instruct", - "developer": "ai21", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_lite", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.287, - "details": { - "tab": "Accuracy", - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.6515730337078651 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.658, - "details": { - "description": "min=0.658, mean=0.658, max=0.658, sum=0.658 (1)", - "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": { - "description": "min=0.947, mean=0.947, max=0.947, sum=0.947 (1)", - "tab": "Efficiency", - "score": 0.9470622405199938 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=2555.434, mean=2555.434, max=2555.434, sum=2555.434 (1)", - "tab": "General information", - "score": 2555.4338028169013 - }, - "NarrativeQA - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (closed-book)", - "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.384, - "details": { - "description": "min=0.384, mean=0.384, max=0.384, sum=0.384 (1)", - "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time (s)": { - "description": "min=0.809, mean=0.809, max=0.809, sum=0.809 (1)", - "tab": "Efficiency", - "score": 0.8087365460395813 - }, - "NaturalQuestions (closed-book) - Observed inference time (s)": { - "description": "min=0.535, mean=0.535, max=0.535, sum=0.535 (1)", - "tab": "Efficiency", - "score": 0.5348668487071991 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1774.04, mean=1774.04, max=1774.04, sum=1774.04 (1)", - "tab": "General information", - "score": 1774.04 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=118.377, mean=118.377, max=118.377, sum=118.377 (1)", - "tab": "General information", - "score": 118.377 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "mode": "closedbook" - } - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.796, - "details": { - "description": "min=0.796, mean=0.796, max=0.796, sum=0.796 (1)", - "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": { - "description": "min=0.3, mean=0.3, max=0.3, sum=0.3 (1)", - "tab": "Efficiency", - "score": 0.30006033515930175 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General 
information", - "score": 5.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=195.75, mean=195.75, max=195.75, sum=195.75 (1)", - "tab": "General information", - "score": 195.75 - }, - "OpenbookQA - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.582, - "details": { - "description": "min=0.36, mean=0.582, max=0.91, sum=2.909 (5)", - "tab": "Accuracy", - "MMLU - Observed inference time (s)": { - "description": "min=0.253, mean=0.265, max=0.275, sum=1.327 (5)", - "tab": "Efficiency", - "score": 0.2654710942151254 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=315.59, mean=403.74, max=559.719, sum=2018.699 (5)", - "tab": "General information", - "score": 403.7398596491228 - }, - "MMLU - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MATH", - "source_data": { - "dataset_name": "MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.38, - "details": { - "description": "min=0.237, mean=0.38, max=0.607, sum=2.663 (7)", - "tab": "Accuracy", - "MATH - Observed inference time (s)": { - "description": "min=1.917, mean=3.242, max=5.09, sum=22.692 (7)", - "tab": "Efficiency", - "score": 3.24175411841349 - }, - "MATH - # eval": { - "description": "min=30, mean=62.429, max=135, sum=437 (7)", - "tab": "General information", - "score": 62.42857142857143 - }, - "MATH - # train": { - "description": "min=8, mean=8, max=8, sum=56 (7)", - "tab": "General information", - "score": 8.0 - }, - "MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "MATH - # prompt tokens": { - "description": "min=796.795, mean=1321.422, max=2516.154, sum=9249.956 (7)", - "tab": "General information", - "score": 1321.42226282263 - 
}, - "MATH - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" - } - } - }, - { - "evaluation_name": "GSM8K", - "source_data": { - "dataset_name": "GSM8K", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on GSM8K", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.67, - "details": { - "description": "min=0.67, mean=0.67, max=0.67, sum=0.67 (1)", - "tab": "Accuracy", - "GSM8K - Observed inference time (s)": { - "description": "min=3.846, mean=3.846, max=3.846, sum=3.846 (1)", - "tab": "Efficiency", - "score": 3.8455032846927644 - }, - "GSM8K - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "GSM8K - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "GSM8K - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GSM8K - # prompt tokens": { - "description": "min=823.394, mean=823.394, max=823.394, sum=823.394 (1)", - "tab": "General information", - "score": 823.394 - }, - "GSM8K - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "stop": "none" - } - } - }, - { - "evaluation_name": "LegalBench", - "source_data": { - "dataset_name": "LegalBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on LegalBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.54, - "details": { - "description": "min=0.304, mean=0.54, max=0.874, sum=2.7 (5)", - "tab": "Accuracy", - "LegalBench - Observed inference time (s)": { - "description": "min=0.351, mean=0.641, max=1.337, sum=3.204 (5)", - "tab": "Efficiency", - "score": 0.6408480782672099 - }, - "LegalBench - # eval": { - "description": "min=95, mean=409.4, max=1000, sum=2047 (5)", - "tab": "General information", - "score": 409.4 - }, - "LegalBench - # train": { - "description": "min=4, mean=4.8, max=5, sum=24 (5)", - "tab": "General information", - "score": 4.8 - }, - "LegalBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "LegalBench - # prompt tokens": { - "description": "min=177.042, mean=1127.163, max=4612.308, sum=5635.817 (5)", - "tab": "General information", - "score": 1127.1634769381612 - }, - "LegalBench - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "abercrombie", - "corporate_lobbying", - 
"function_of_decision_section", - "international_citizenship_questions", - "proa" - ] - } - } - }, - { - "evaluation_name": "MedQA", - "source_data": { - "dataset_name": "MedQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MedQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.519, - "details": { - "description": "min=0.519, mean=0.519, max=0.519, sum=0.519 (1)", - "tab": "Accuracy", - "MedQA - Observed inference time (s)": { - "description": "min=0.311, mean=0.311, max=0.311, sum=0.311 (1)", - "tab": "Efficiency", - "score": 0.31133864366747516 - }, - "MedQA - # eval": { - "description": "min=503, mean=503, max=503, sum=503 (1)", - "tab": "General information", - "score": 503.0 - }, - "MedQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "MedQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MedQA - # prompt tokens": { - "description": "min=765.622, mean=765.622, max=765.622, sum=765.622 (1)", - "tab": "General information", - "score": 765.6222664015904 - }, - "MedQA - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WMT 2014", - "source_data": { - "dataset_name": "WMT 2014", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.164, - "details": { - "description": "min=0.099, mean=0.164, max=0.205, sum=0.656 (4)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time (s)": { - "description": "min=0.586, mean=0.635, max=0.686, sum=2.542 (4)", - "tab": "Efficiency", - "score": 0.6354023076110767 - }, - "WMT 2014 - # eval": { - "description": "min=503, mean=585.25, max=832, sum=2341 (4)", - "tab": "General information", - "score": 585.25 - }, - "WMT 2014 - # train": { - "description": "min=1, mean=1, max=1, sum=4 (4)", - "tab": "General information", - "score": 1.0 - }, - "WMT 2014 - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (4)", - "tab": "General information", - "score": 0.0 - }, - "WMT 2014 - # prompt tokens": { - "description": "min=129.229, mean=143.261, max=154.278, sum=573.045 (4)", - "tab": "General information", - "score": 143.26129939115307 - }, - "WMT 2014 - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (4)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "language_pair": [ - "cs-en", - "de-en", - "hi-en", - "ru-en" - ] - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_lite/allenai/olmo-7b/bc29d5c6-b5c8-473b-b69c-054026829089.json b/data/helm_lite/allenai/olmo-7b/bc29d5c6-b5c8-473b-b69c-054026829089.json deleted file mode 100644 index b68794dd1..000000000 --- a/data/helm_lite/allenai/olmo-7b/bc29d5c6-b5c8-473b-b69c-054026829089.json +++ /dev/null @@ -1,641 +0,0 @@ -{ - 
"schema_version": "0.2.0", - "evaluation_id": "helm_lite/allenai_olmo-7b/1770834614.1822479", - "retrieved_timestamp": "1770834614.1822479", - "source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "OLMo 7B", - "id": "allenai/olmo-7b", - "developer": "allenai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_lite", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.052, - "details": { - "tab": "Accuracy", - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.6540574282147316 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.597, - "details": { - "description": "min=0.597, mean=0.597, max=0.597, sum=0.597 (1)", - "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": { - "description": "min=1.032, mean=1.032, max=1.032, sum=1.032 (1)", - "tab": "Efficiency", - "score": 1.0318688553823552 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=1.969, mean=1.969, max=1.969, sum=1.969 (1)", - "tab": "General information", - "score": 1.9690140845070423 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=1691.082, mean=1691.082, max=1691.082, sum=1691.082 (1)", - "tab": "General information", - "score": 1691.081690140845 - }, - "NarrativeQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (closed-book)", - "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.259, - "details": { - "description": "min=0.259, mean=0.259, max=0.259, sum=0.259 (1)", - "tab": "Accuracy", - "NaturalQuestions 
(open-book) - Observed inference time (s)": { - "description": "min=0.942, mean=0.942, max=0.942, sum=0.942 (1)", - "tab": "Efficiency", - "score": 0.9419968054294586 - }, - "NaturalQuestions (closed-book) - Observed inference time (s)": { - "description": "min=0.397, mean=0.397, max=0.397, sum=0.397 (1)", - "tab": "Efficiency", - "score": 0.3968301827907562 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.703, mean=4.703, max=4.703, sum=4.703 (1)", - "tab": "General information", - "score": 4.703 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.037, mean=0.037, max=0.037, sum=0.037 (1)", - "tab": "General information", - "score": 0.037 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1495.001, mean=1495.001, max=1495.001, sum=1495.001 (1)", - "tab": "General information", - "score": 1495.001 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=0.998, mean=0.998, max=0.998, sum=0.998 (1)", - "tab": "General information", - "score": 0.998 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=117.299, mean=117.299, max=117.299, sum=117.299 (1)", - "tab": "General information", - "score": 117.299 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "mode": "closedbook" - } - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.222, - "details": { - "description": "min=0.222, mean=0.222, max=0.222, sum=0.222 (1)", - "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": { - "description": "min=0.29, mean=0.29, max=0.29, sum=0.29 (1)", - "tab": "Efficiency", - "score": 0.2902843647003174 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=251.556, mean=251.556, max=251.556, sum=251.556 (1)", - "tab": "General information", - "score": 251.556 - }, - "OpenbookQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - 
"tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.305, - "details": { - "description": "min=0.26, mean=0.305, max=0.38, sum=1.525 (5)", - "tab": "Accuracy", - "MMLU - Observed inference time (s)": { - "description": "min=0.309, mean=0.326, max=0.346, sum=1.629 (5)", - "tab": "Efficiency", - "score": 0.325820258140564 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=358.76, mean=467.936, max=612.798, sum=2339.678 (5)", - "tab": "General information", - "score": 467.935649122807 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MATH", - "source_data": { - "dataset_name": "MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.029, - "details": { - "description": "min=0, mean=0.029, max=0.088, sum=0.205 (7)", - "tab": "Accuracy", - "MATH - Observed inference time (s)": { - "description": "min=1.79, mean=2.257, max=2.808, sum=15.8 (7)", - "tab": "Efficiency", - "score": 2.2571195842818583 - }, - "MATH - # eval": { - "description": "min=30, mean=62.429, max=135, sum=437 (7)", - "tab": "General information", - "score": 62.42857142857143 - }, - "MATH - # train": { - "description": "min=3.173, mean=6.976, max=8, sum=48.831 (7)", - "tab": "General information", - "score": 6.9758530942741475 - }, - "MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "MATH - # prompt tokens": { - "description": "min=860.23, mean=1111.07, max=1508.423, sum=7777.488 (7)", - "tab": "General information", - "score": 1111.0696790674758 - }, - "MATH - # output tokens": { - "description": "min=1, mean=1, max=1, sum=7 (7)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": 
"False", - "use_chain_of_thought": "True" - } - } - }, - { - "evaluation_name": "GSM8K", - "source_data": { - "dataset_name": "GSM8K", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on GSM8K", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.044, - "details": { - "description": "min=0.044, mean=0.044, max=0.044, sum=0.044 (1)", - "tab": "Accuracy", - "GSM8K - Observed inference time (s)": { - "description": "min=2.41, mean=2.41, max=2.41, sum=2.41 (1)", - "tab": "Efficiency", - "score": 2.4104921889305113 - }, - "GSM8K - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "GSM8K - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "GSM8K - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GSM8K - # prompt tokens": { - "description": "min=939.582, mean=939.582, max=939.582, sum=939.582 (1)", - "tab": "General information", - "score": 939.582 - }, - "GSM8K - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "LegalBench", - "source_data": { - "dataset_name": "LegalBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on LegalBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.341, - "details": { - "description": "min=0.158, mean=0.341, max=0.6, sum=1.704 (5)", - "tab": "Accuracy", - "LegalBench - Observed inference time (s)": { - "description": "min=0.368, mean=0.502, max=0.929, sum=2.508 (5)", - "tab": "Efficiency", - "score": 0.5016753114389487 - }, - "LegalBench - # eval": { - "description": "min=95, mean=409.4, max=1000, sum=2047 (5)", - "tab": "General information", - "score": 409.4 - }, - "LegalBench - # train": { - "description": "min=0.298, mean=3.86, max=5, sum=19.298 (5)", - "tab": "General information", - "score": 3.859591836734694 - }, - "LegalBench - truncated": { - "description": "min=0, mean=0.003, max=0.014, sum=0.014 (5)", - "tab": "General information", - "score": 0.002857142857142857 - }, - "LegalBench - # prompt tokens": { - "description": "min=206.779, mean=559.92, max=1493.837, sum=2799.602 (5)", - "tab": "General information", - "score": 559.9203981649337 - }, - "LegalBench - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] - } - } - }, - { - "evaluation_name": "MedQA", - "source_data": { - "dataset_name": "MedQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - 
"evaluation_description": "EM on MedQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.229, - "details": { - "description": "min=0.229, mean=0.229, max=0.229, sum=0.229 (1)", - "tab": "Accuracy", - "MedQA - Observed inference time (s)": { - "description": "min=0.478, mean=0.478, max=0.478, sum=0.478 (1)", - "tab": "Efficiency", - "score": 0.47797848879698496 - }, - "MedQA - # eval": { - "description": "min=503, mean=503, max=503, sum=503 (1)", - "tab": "General information", - "score": 503.0 - }, - "MedQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "MedQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MedQA - # prompt tokens": { - "description": "min=994.588, mean=994.588, max=994.588, sum=994.588 (1)", - "tab": "General information", - "score": 994.5884691848906 - }, - "MedQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WMT 2014", - "source_data": { - "dataset_name": "WMT 2014", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.097, - "details": { - "description": "min=0.009, mean=0.097, max=0.157, sum=0.487 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time (s)": { - "description": "min=0.661, mean=0.771, max=0.925, sum=3.855 (5)", - "tab": "Efficiency", - "score": 0.7709201743273374 - }, - "WMT 2014 - # eval": { - "description": "min=503, mean=568.8, max=832, sum=2844 (5)", - "tab": "General information", - "score": 568.8 - }, - "WMT 2014 - # train": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "WMT 2014 - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "WMT 2014 - # prompt tokens": { - "description": "min=129.879, mean=144.948, max=167.177, sum=724.741 (5)", - "tab": "General information", - "score": 144.94816676861905 - }, - "WMT 2014 - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_lite/amazon/nova-lite-v1_0/ad7e1abd-0263-4971-b37a-b1ca4cb0a8e9.json b/data/helm_lite/amazon/nova-lite-v1_0/ad7e1abd-0263-4971-b37a-b1ca4cb0a8e9.json deleted file mode 100644 index 084734ba7..000000000 --- a/data/helm_lite/amazon/nova-lite-v1_0/ad7e1abd-0263-4971-b37a-b1ca4cb0a8e9.json +++ /dev/null @@ -1,644 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_lite/amazon_nova-lite-v1:0/1770834614.1822479", - "retrieved_timestamp": "1770834614.1822479", - "source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - 
"name": "Amazon Nova Lite", - "id": "amazon/nova-lite-v1:0", - "developer": "amazon", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_lite", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.708, - "details": { - "tab": "Accuracy", - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.9832833957553059 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.768, - "details": { - "description": "min=0.768, mean=0.768, max=0.768, sum=0.768 (1)", - "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": { - "description": "min=0.227, mean=0.227, max=0.227, sum=0.227 (1)", - "tab": "Efficiency", - "score": 0.22699436619718286 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=3550.577, mean=3550.577, max=3550.577, sum=3550.577 (1)", - "tab": "General information", - "score": 3550.5774647887324 - }, - "NarrativeQA - # output tokens": { - "description": "min=4.701, mean=4.701, max=4.701, sum=4.701 (1)", - "tab": "General information", - "score": 4.701408450704226 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (closed-book)", - "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.352, - "details": { - "description": "min=0.352, mean=0.352, max=0.352, sum=0.352 (1)", - "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time (s)": { - "description": "min=0.206, mean=0.206, max=0.206, sum=0.206 (1)", - "tab": "Efficiency", - "score": 0.20557699999999976 - }, - "NaturalQuestions (closed-book) - Observed inference time (s)": { - "description": "min=0.155, mean=0.155, max=0.155, sum=0.155 (1)", - "tab": 
"Efficiency", - "score": 0.15455700000000017 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1773.944, mean=1773.944, max=1773.944, sum=1773.944 (1)", - "tab": "General information", - "score": 1773.944 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=4.835, mean=4.835, max=4.835, sum=4.835 (1)", - "tab": "General information", - "score": 4.835 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=153.254, mean=153.254, max=153.254, sum=153.254 (1)", - "tab": "General information", - "score": 153.254 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=4.084, mean=4.084, max=4.084, sum=4.084 (1)", - "tab": "General information", - "score": 4.084 - } - } - }, - "generation_config": { - "additional_details": { - "mode": "closedbook" - } - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.928, - "details": { - "description": "min=0.928, mean=0.928, max=0.928, sum=0.928 (1)", - "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": { - "description": "min=0.113, mean=0.113, max=0.113, sum=0.113 (1)", - "tab": "Efficiency", - "score": 0.11279599999999983 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=282.21, mean=282.21, max=282.21, sum=282.21 (1)", - "tab": "General information", - "score": 282.21 - }, - "OpenbookQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.693, - "details": { - "description": "min=0.52, mean=0.693, max=0.92, sum=3.465 (5)", - "tab": "Accuracy", - "MMLU - Observed inference time (s)": { - "description": "min=0.124, mean=0.13, max=0.136, sum=0.651 (5)", - "tab": "Efficiency", - "score": 0.13027701754385965 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=399.38, mean=500.274, max=652.07, sum=2501.37 (5)", - "tab": "General information", - "score": 500.2740350877192 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MATH", - "source_data": { - "dataset_name": "MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.779, - "details": { - "description": "min=0.579, mean=0.779, max=0.911, sum=5.45 (7)", - "tab": "Accuracy", - "MATH - Observed inference time (s)": { - "description": "min=0.693, mean=0.836, max=1.148, sum=5.85 (7)", - "tab": "Efficiency", - "score": 0.8356917305438115 - }, - "MATH - # eval": { - "description": "min=30, mean=62.429, max=135, sum=437 (7)", - "tab": "General information", - "score": 62.42857142857143 - }, - "MATH - # train": { - "description": "min=8, mean=8, max=8, sum=56 (7)", - "tab": "General information", - "score": 8.0 - }, - "MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "MATH - # prompt tokens": { - "description": "min=925.556, mean=1394.735, max=2468.942, sum=9763.147 (7)", - "tab": "General information", - "score": 1394.7353092779651 - }, - "MATH - # output tokens": { - "description": "min=61.4, mean=78.742, max=112.526, sum=551.195 (7)", - "tab": "General information", - "score": 78.74214942544197 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" - } - } - }, - { - "evaluation_name": "GSM8K", - "source_data": { - "dataset_name": "GSM8K", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] 
- }, - "metric_config": { - "evaluation_description": "EM on GSM8K", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.829, - "details": { - "description": "min=0.829, mean=0.829, max=0.829, sum=0.829 (1)", - "tab": "Accuracy", - "GSM8K - Observed inference time (s)": { - "description": "min=1.063, mean=1.063, max=1.063, sum=1.063 (1)", - "tab": "Efficiency", - "score": 1.0628889999999993 - }, - "GSM8K - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "GSM8K - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "GSM8K - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GSM8K - # prompt tokens": { - "description": "min=957.869, mean=957.869, max=957.869, sum=957.869 (1)", - "tab": "General information", - "score": 957.869 - }, - "GSM8K - # output tokens": { - "description": "min=84.074, mean=84.074, max=84.074, sum=84.074 (1)", - "tab": "General information", - "score": 84.074 - } - } - }, - "generation_config": { - "additional_details": { - "stop": "none" - } - } - }, - { - "evaluation_name": "LegalBench", - "source_data": { - "dataset_name": "LegalBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on LegalBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.659, - "details": { - "description": "min=0.368, mean=0.659, max=0.947, sum=3.297 (5)", - "tab": "Accuracy", - "LegalBench - Observed inference time (s)": { - "description": "min=0.118, mean=0.156, max=0.261, sum=0.782 (5)", - "tab": "Efficiency", - "score": 0.15639281489418358 - }, - "LegalBench - # eval": { - "description": "min=95, mean=409.4, max=1000, sum=2047 (5)", - "tab": "General information", - "score": 409.4 - }, - "LegalBench - # train": { - "description": "min=4, mean=4.8, max=5, sum=24 (5)", - "tab": "General information", - "score": 4.8 - }, - "LegalBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "LegalBench - # prompt tokens": { - "description": "min=241.632, mean=1581.083, max=6449.798, sum=7905.414 (5)", - "tab": "General information", - "score": 1581.0827222540588 - }, - "LegalBench - # output tokens": { - "description": "min=1, mean=1.488, max=2.6, sum=7.439 (5)", - "tab": "General information", - "score": 1.4878474114441418 - } - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ], - "stop": "none" - } - } - }, - { - "evaluation_name": "MedQA", - "source_data": { - "dataset_name": "MedQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MedQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.696, - "details": { - "description": "min=0.696, mean=0.696, max=0.696, sum=0.696 (1)", 
- "tab": "Accuracy", - "MedQA - Observed inference time (s)": { - "description": "min=0.132, mean=0.132, max=0.132, sum=0.132 (1)", - "tab": "Efficiency", - "score": 0.1322564612326044 - }, - "MedQA - # eval": { - "description": "min=503, mean=503, max=503, sum=503 (1)", - "tab": "General information", - "score": 503.0 - }, - "MedQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "MedQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MedQA - # prompt tokens": { - "description": "min=1066.861, mean=1066.861, max=1066.861, sum=1066.861 (1)", - "tab": "General information", - "score": 1066.8608349900596 - }, - "MedQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WMT 2014", - "source_data": { - "dataset_name": "WMT 2014", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.204, - "details": { - "description": "min=0.126, mean=0.204, max=0.25, sum=1.021 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time (s)": { - "description": "min=0.304, mean=0.336, max=0.406, sum=1.68 (5)", - "tab": "Efficiency", - "score": 0.3359064091413061 - }, - "WMT 2014 - # eval": { - "description": "min=503, mean=568.8, max=832, sum=2844 (5)", - "tab": "General information", - "score": 568.8 - }, - "WMT 2014 - # train": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "WMT 2014 - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "WMT 2014 - # prompt tokens": { - "description": "min=163.93, mean=208.694, max=268.662, sum=1043.469 (5)", - "tab": "General information", - "score": 208.69386660804403 - }, - "WMT 2014 - # output tokens": { - "description": "min=24.457, mean=29.543, max=42.627, sum=147.715 (5)", - "tab": "General information", - "score": 29.542975799051845 - } - } - }, - "generation_config": { - "additional_details": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_lite/amazon/nova-micro-v1_0/4e131240-d66c-4f95-a2c8-7fabbe8b2c25.json b/data/helm_lite/amazon/nova-micro-v1_0/4e131240-d66c-4f95-a2c8-7fabbe8b2c25.json deleted file mode 100644 index fb66c7744..000000000 --- a/data/helm_lite/amazon/nova-micro-v1_0/4e131240-d66c-4f95-a2c8-7fabbe8b2c25.json +++ /dev/null @@ -1,644 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_lite/amazon_nova-micro-v1:0/1770834614.1822479", - "retrieved_timestamp": "1770834614.1822479", - "source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Amazon Nova Micro", - "id": "amazon/nova-micro-v1:0", - "developer": "amazon", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - 
"dataset_name": "helm_lite", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.524, - "details": { - "tab": "Accuracy", - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.998876404494382 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.744, - "details": { - "description": "min=0.744, mean=0.744, max=0.744, sum=0.744 (1)", - "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": { - "description": "min=0.196, mean=0.196, max=0.196, sum=0.196 (1)", - "tab": "Efficiency", - "score": 0.19638591549295767 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=3550.577, mean=3550.577, max=3550.577, sum=3550.577 (1)", - "tab": "General information", - "score": 3550.5774647887324 - }, - "NarrativeQA - # output tokens": { - "description": "min=3.961, mean=3.961, max=3.961, sum=3.961 (1)", - "tab": "General information", - "score": 3.96056338028169 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (closed-book)", - "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.285, - "details": { - "description": "min=0.285, mean=0.285, max=0.285, sum=0.285 (1)", - "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time (s)": { - "description": "min=0.19, mean=0.19, max=0.19, sum=0.19 (1)", - "tab": "Efficiency", - "score": 0.1897639999999999 - }, - "NaturalQuestions (closed-book) - Observed inference time (s)": { - "description": "min=0.133, mean=0.133, max=0.133, sum=0.133 (1)", - "tab": "Efficiency", - "score": 0.1334880000000001 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - 
"NaturalQuestions (open-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1773.944, mean=1773.944, max=1773.944, sum=1773.944 (1)", - "tab": "General information", - "score": 1773.944 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=5.911, mean=5.911, max=5.911, sum=5.911 (1)", - "tab": "General information", - "score": 5.911 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=153.254, mean=153.254, max=153.254, sum=153.254 (1)", - "tab": "General information", - "score": 153.254 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=3.515, mean=3.515, max=3.515, sum=3.515 (1)", - "tab": "General information", - "score": 3.515 - } - } - }, - "generation_config": { - "additional_details": { - "mode": "closedbook" - } - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.888, - "details": { - "description": "min=0.888, mean=0.888, max=0.888, sum=0.888 (1)", - "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": { - "description": "min=0.104, mean=0.104, max=0.104, sum=0.104 (1)", - "tab": "Efficiency", - "score": 0.10389599999999993 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=282.21, mean=282.21, max=282.21, sum=282.21 (1)", - "tab": "General information", - "score": 282.21 - }, - "OpenbookQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.64, - "details": { - "description": "min=0.42, mean=0.64, max=0.9, sum=3.2 (5)", - "tab": "Accuracy", - "MMLU - Observed inference time (s)": { - "description": "min=0.113, mean=0.116, max=0.118, sum=0.579 (5)", - "tab": "Efficiency", - "score": 0.11572105263157897 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=399.38, mean=500.274, max=652.07, sum=2501.37 (5)", - "tab": "General information", - "score": 500.2740350877192 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MATH", - "source_data": { - "dataset_name": "MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.76, - "details": { - "description": "min=0.558, mean=0.76, max=0.895, sum=5.32 (7)", - "tab": "Accuracy", - "MATH - Observed inference time (s)": { - "description": "min=0.577, mean=0.79, max=1.132, sum=5.529 (7)", - "tab": "Efficiency", - "score": 0.7898264142267815 - }, - "MATH - # eval": { - "description": "min=30, mean=62.429, max=135, sum=437 (7)", - "tab": "General information", - "score": 62.42857142857143 - }, - "MATH - # train": { - "description": "min=8, mean=8, max=8, sum=56 (7)", - "tab": "General information", - "score": 8.0 - }, - "MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "MATH - # prompt tokens": { - "description": "min=925.556, mean=1394.735, max=2468.942, sum=9763.147 (7)", - "tab": "General information", - "score": 1394.7353092779651 - }, - "MATH - # output tokens": { - "description": "min=75.368, mean=103.346, max=152.2, sum=723.421 (7)", - "tab": "General information", - "score": 103.34588937061396 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" - } - } - }, - { - "evaluation_name": "GSM8K", - "source_data": { - "dataset_name": "GSM8K", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on GSM8K", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.794, - "details": { - "description": 
"min=0.794, mean=0.794, max=0.794, sum=0.794 (1)", - "tab": "Accuracy", - "GSM8K - Observed inference time (s)": { - "description": "min=0.895, mean=0.895, max=0.895, sum=0.895 (1)", - "tab": "Efficiency", - "score": 0.8952520000000004 - }, - "GSM8K - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "GSM8K - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "GSM8K - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GSM8K - # prompt tokens": { - "description": "min=957.869, mean=957.869, max=957.869, sum=957.869 (1)", - "tab": "General information", - "score": 957.869 - }, - "GSM8K - # output tokens": { - "description": "min=103.892, mean=103.892, max=103.892, sum=103.892 (1)", - "tab": "General information", - "score": 103.892 - } - } - }, - "generation_config": { - "additional_details": { - "stop": "none" - } - } - }, - { - "evaluation_name": "LegalBench", - "source_data": { - "dataset_name": "LegalBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on LegalBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.615, - "details": { - "description": "min=0.368, mean=0.615, max=0.874, sum=3.074 (5)", - "tab": "Accuracy", - "LegalBench - Observed inference time (s)": { - "description": "min=0.108, mean=0.143, max=0.254, sum=0.713 (5)", - "tab": "Efficiency", - "score": 0.14263605160429277 - }, - "LegalBench - # eval": { - "description": "min=95, mean=409.4, max=1000, sum=2047 (5)", - "tab": "General information", - "score": 409.4 - }, - "LegalBench - # train": { - "description": "min=4, mean=4.8, max=5, sum=24 (5)", - "tab": "General information", - "score": 4.8 - }, - "LegalBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "LegalBench - # prompt tokens": { - "description": "min=241.632, mean=1581.083, max=6449.798, sum=7905.414 (5)", - "tab": "General information", - "score": 1581.0827222540588 - }, - "LegalBench - # output tokens": { - "description": "min=1, mean=1.665, max=2.926, sum=8.323 (5)", - "tab": "General information", - "score": 1.6646275687271896 - } - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ], - "stop": "none" - } - } - }, - { - "evaluation_name": "MedQA", - "source_data": { - "dataset_name": "MedQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MedQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.608, - "details": { - "description": "min=0.608, mean=0.608, max=0.608, sum=0.608 (1)", - "tab": "Accuracy", - "MedQA - Observed inference time (s)": { - "description": "min=0.118, mean=0.118, max=0.118, sum=0.118 (1)", - "tab": "Efficiency", - "score": 0.11825049701789252 - }, - "MedQA - # eval": { - "description": 
"min=503, mean=503, max=503, sum=503 (1)", - "tab": "General information", - "score": 503.0 - }, - "MedQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "MedQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MedQA - # prompt tokens": { - "description": "min=1066.861, mean=1066.861, max=1066.861, sum=1066.861 (1)", - "tab": "General information", - "score": 1066.8608349900596 - }, - "MedQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WMT 2014", - "source_data": { - "dataset_name": "WMT 2014", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.192, - "details": { - "description": "min=0.112, mean=0.192, max=0.241, sum=0.96 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time (s)": { - "description": "min=0.239, mean=0.268, max=0.333, sum=1.34 (5)", - "tab": "Efficiency", - "score": 0.26807757063388915 - }, - "WMT 2014 - # eval": { - "description": "min=503, mean=568.8, max=832, sum=2844 (5)", - "tab": "General information", - "score": 568.8 - }, - "WMT 2014 - # train": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "WMT 2014 - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "WMT 2014 - # prompt tokens": { - "description": "min=163.93, mean=208.694, max=268.662, sum=1043.469 (5)", - "tab": "General information", - "score": 208.69386660804403 - }, - "WMT 2014 - # output tokens": { - "description": "min=23.38, mean=25.875, max=28.916, sum=129.377 (5)", - "tab": "General information", - "score": 25.875419597797826 - } - } - }, - "generation_config": { - "additional_details": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_lite/amazon/nova-pro-v1_0/9ef56d5a-de00-4d89-930c-a4c74211dd78.json b/data/helm_lite/amazon/nova-pro-v1_0/9ef56d5a-de00-4d89-930c-a4c74211dd78.json deleted file mode 100644 index c7f9d86e2..000000000 --- a/data/helm_lite/amazon/nova-pro-v1_0/9ef56d5a-de00-4d89-930c-a4c74211dd78.json +++ /dev/null @@ -1,644 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_lite/amazon_nova-pro-v1:0/1770834614.1822479", - "retrieved_timestamp": "1770834614.1822479", - "source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Amazon Nova Pro", - "id": "amazon/nova-pro-v1:0", - "developer": "amazon", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_lite", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many 
models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.885, - "details": { - "tab": "Accuracy", - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.9342571785268414 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.791, - "details": { - "description": "min=0.791, mean=0.791, max=0.791, sum=0.791 (1)", - "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": { - "description": "min=0.246, mean=0.246, max=0.246, sum=0.246 (1)", - "tab": "Efficiency", - "score": 0.24631830985915482 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=3550.577, mean=3550.577, max=3550.577, sum=3550.577 (1)", - "tab": "General information", - "score": 3550.5774647887324 - }, - "NarrativeQA - # output tokens": { - "description": "min=4.651, mean=4.651, max=4.651, sum=4.651 (1)", - "tab": "General information", - "score": 4.650704225352112 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (closed-book)", - "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.405, - "details": { - "description": "min=0.405, mean=0.405, max=0.405, sum=0.405 (1)", - "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time (s)": { - "description": "min=0.266, mean=0.266, max=0.266, sum=0.266 (1)", - "tab": "Efficiency", - "score": 0.26591999999999993 - }, - "NaturalQuestions (closed-book) - Observed inference time (s)": { - "description": "min=0.203, mean=0.203, max=0.203, sum=0.203 (1)", - "tab": "Efficiency", - "score": 0.203244 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General 
information", - "score": 0.0 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1773.944, mean=1773.944, max=1773.944, sum=1773.944 (1)", - "tab": "General information", - "score": 1773.944 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=5.646, mean=5.646, max=5.646, sum=5.646 (1)", - "tab": "General information", - "score": 5.646 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=153.254, mean=153.254, max=153.254, sum=153.254 (1)", - "tab": "General information", - "score": 153.254 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=4.207, mean=4.207, max=4.207, sum=4.207 (1)", - "tab": "General information", - "score": 4.207 - } - } - }, - "generation_config": { - "additional_details": { - "mode": "closedbook" - } - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.96, - "details": { - "description": "min=0.96, mean=0.96, max=0.96, sum=0.96 (1)", - "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": { - "description": "min=0.129, mean=0.129, max=0.129, sum=0.129 (1)", - "tab": "Efficiency", - "score": 0.12889800000000004 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=282.21, mean=282.21, max=282.21, sum=282.21 (1)", - "tab": "General information", - "score": 282.21 - }, - "OpenbookQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.758, - "details": { - "description": "min=0.63, mean=0.758, max=0.93, sum=3.792 (5)", - "tab": "Accuracy", - "MMLU - Observed inference time (s)": { - "description": "min=0.141, mean=0.145, max=0.152, 
sum=0.725 (5)", - "tab": "Efficiency", - "score": 0.1449304210526316 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=399.38, mean=500.274, max=652.07, sum=2501.37 (5)", - "tab": "General information", - "score": 500.2740350877192 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MATH", - "source_data": { - "dataset_name": "MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.821, - "details": { - "description": "min=0.7, mean=0.821, max=0.93, sum=5.749 (7)", - "tab": "Accuracy", - "MATH - Observed inference time (s)": { - "description": "min=1.139, mean=1.695, max=2.518, sum=11.863 (7)", - "tab": "Efficiency", - "score": 1.6947358347418935 - }, - "MATH - # eval": { - "description": "min=30, mean=62.429, max=135, sum=437 (7)", - "tab": "General information", - "score": 62.42857142857143 - }, - "MATH - # train": { - "description": "min=8, mean=8, max=8, sum=56 (7)", - "tab": "General information", - "score": 8.0 - }, - "MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "MATH - # prompt tokens": { - "description": "min=925.556, mean=1394.735, max=2468.942, sum=9763.147 (7)", - "tab": "General information", - "score": 1394.7353092779651 - }, - "MATH - # output tokens": { - "description": "min=66.088, mean=98.114, max=154.135, sum=686.8 (7)", - "tab": "General information", - "score": 98.11425246180445 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" - } - } - }, - { - "evaluation_name": "GSM8K", - "source_data": { - "dataset_name": "GSM8K", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on GSM8K", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.87, - "details": { - "description": "min=0.87, mean=0.87, max=0.87, sum=0.87 (1)", - "tab": "Accuracy", - "GSM8K - Observed inference time (s)": { - "description": "min=1.566, mean=1.566, max=1.566, sum=1.566 (1)", - "tab": "Efficiency", - "score": 1.5656869999999996 - }, - "GSM8K - # eval": { - "description": 
"min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "GSM8K - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "GSM8K - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GSM8K - # prompt tokens": { - "description": "min=957.869, mean=957.869, max=957.869, sum=957.869 (1)", - "tab": "General information", - "score": 957.869 - }, - "GSM8K - # output tokens": { - "description": "min=73.847, mean=73.847, max=73.847, sum=73.847 (1)", - "tab": "General information", - "score": 73.847 - } - } - }, - "generation_config": { - "additional_details": { - "stop": "none" - } - } - }, - { - "evaluation_name": "LegalBench", - "source_data": { - "dataset_name": "LegalBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on LegalBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.736, - "details": { - "description": "min=0.444, mean=0.736, max=0.958, sum=3.681 (5)", - "tab": "Accuracy", - "LegalBench - Observed inference time (s)": { - "description": "min=0.139, mean=0.166, max=0.232, sum=0.83 (5)", - "tab": "Efficiency", - "score": 0.16605967288111284 - }, - "LegalBench - # eval": { - "description": "min=95, mean=409.4, max=1000, sum=2047 (5)", - "tab": "General information", - "score": 409.4 - }, - "LegalBench - # train": { - "description": "min=4, mean=4.8, max=5, sum=24 (5)", - "tab": "General information", - "score": 4.8 - }, - "LegalBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "LegalBench - # prompt tokens": { - "description": "min=241.632, mean=1581.083, max=6449.798, sum=7905.414 (5)", - "tab": "General information", - "score": 1581.0827222540588 - }, - "LegalBench - # output tokens": { - "description": "min=1, mean=1.387, max=2.358, sum=6.936 (5)", - "tab": "General information", - "score": 1.3871102825182848 - } - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ], - "stop": "none" - } - } - }, - { - "evaluation_name": "MedQA", - "source_data": { - "dataset_name": "MedQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MedQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.811, - "details": { - "description": "min=0.811, mean=0.811, max=0.811, sum=0.811 (1)", - "tab": "Accuracy", - "MedQA - Observed inference time (s)": { - "description": "min=0.142, mean=0.142, max=0.142, sum=0.142 (1)", - "tab": "Efficiency", - "score": 0.14219284294234621 - }, - "MedQA - # eval": { - "description": "min=503, mean=503, max=503, sum=503 (1)", - "tab": "General information", - "score": 503.0 - }, - "MedQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "MedQA - truncated": { - "description": "min=0, mean=0, max=0, 
sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MedQA - # prompt tokens": { - "description": "min=1066.861, mean=1066.861, max=1066.861, sum=1066.861 (1)", - "tab": "General information", - "score": 1066.8608349900596 - }, - "MedQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WMT 2014", - "source_data": { - "dataset_name": "WMT 2014", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.229, - "details": { - "description": "min=0.184, mean=0.229, max=0.281, sum=1.144 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time (s)": { - "description": "min=0.483, mean=0.504, max=0.519, sum=2.52 (5)", - "tab": "Efficiency", - "score": 0.5040968109611562 - }, - "WMT 2014 - # eval": { - "description": "min=503, mean=568.8, max=832, sum=2844 (5)", - "tab": "General information", - "score": 568.8 - }, - "WMT 2014 - # train": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "WMT 2014 - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "WMT 2014 - # prompt tokens": { - "description": "min=163.93, mean=208.694, max=268.662, sum=1043.469 (5)", - "tab": "General information", - "score": 208.69386660804403 - }, - "WMT 2014 - # output tokens": { - "description": "min=23.903, mean=25.328, max=25.92, sum=126.641 (5)", - "tab": "General information", - "score": 25.32825594509864 - } - } - }, - "generation_config": { - "additional_details": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_lite/anthropic/claude-2.0/5598d3ed-5b37-4aec-b186-0b16c394633b.json b/data/helm_lite/anthropic/claude-2.0/5598d3ed-5b37-4aec-b186-0b16c394633b.json deleted file mode 100644 index ab0989b58..000000000 --- a/data/helm_lite/anthropic/claude-2.0/5598d3ed-5b37-4aec-b186-0b16c394633b.json +++ /dev/null @@ -1,641 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_lite/anthropic_claude-2.0/1770834614.1822479", - "retrieved_timestamp": "1770834614.1822479", - "source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Claude 2.0", - "id": "anthropic/claude-2.0", - "developer": "anthropic", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_lite", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.489, - "details": { - "tab": "Accuracy", - "Mean win rate - Efficiency": { - "description": null, 
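
Every file removed by this patch shares the schema_version 0.2.0 layout visible above: source_metadata and model_info headers plus an evaluation_results list, where score_details.score carries the headline number for each scenario. A minimal Python sketch of reading one such record back, using only fields shown in this diff; the path names a file this patch deletes, so it would have to be read from a pre-deletion revision:

    import json

    # One of the per-model result files removed by this patch; check out a
    # revision prior to this commit for the file to exist on disk.
    path = "data/helm_lite/amazon/nova-pro-v1_0/9ef56d5a-de00-4d89-930c-a4c74211dd78.json"
    with open(path) as f:
        record = json.load(f)

    assert record["schema_version"] == "0.2.0"
    model = record["model_info"]["id"]  # e.g. "amazon/nova-pro-v1:0"
    for result in record["evaluation_results"]:
        name = result["evaluation_name"]          # "MMLU", "GSM8K", ...
        score = result["score_details"]["score"]  # headline scenario score
        print(f"{model}\t{name}\t{score}")
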
- "tab": "Efficiency", - "score": 0.14701622971285894 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.718, - "details": { - "description": "min=0.718, mean=0.718, max=0.718, sum=0.718 (1)", - "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": { - "description": "min=4.811, mean=4.811, max=4.811, sum=4.811 (1)", - "tab": "Efficiency", - "score": 4.8114360809326175 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=3709.741, mean=3709.741, max=3709.741, sum=3709.741 (1)", - "tab": "General information", - "score": 3709.7408450704224 - }, - "NarrativeQA - # output tokens": { - "description": "min=10.561, mean=10.561, max=10.561, sum=10.561 (1)", - "tab": "General information", - "score": 10.56056338028169 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (closed-book)", - "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.428, - "details": { - "description": "min=0.428, mean=0.428, max=0.428, sum=0.428 (1)", - "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time (s)": { - "description": "min=2.984, mean=2.984, max=2.984, sum=2.984 (1)", - "tab": "Efficiency", - "score": 2.9841483016268606 - }, - "NaturalQuestions (closed-book) - Observed inference time (s)": { - "description": "min=1.149, mean=1.149, max=1.149, sum=1.149 (1)", - "tab": "Efficiency", - "score": 1.1486653406620027 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.964, mean=4.964, max=4.964, sum=4.964 (1)", - "tab": "General information", - "score": 4.964 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.007, mean=0.007, max=0.007, sum=0.007 (1)", - "tab": "General information", - "score": 0.007 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1734.363, mean=1734.363, max=1734.363, sum=1734.363 (1)", - "tab": "General information", - "score": 1734.363 - }, - "NaturalQuestions 
(open-book) - # output tokens": { - "description": "min=7.605, mean=7.605, max=7.605, sum=7.605 (1)", - "tab": "General information", - "score": 7.605 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=189.259, mean=189.259, max=189.259, sum=189.259 (1)", - "tab": "General information", - "score": 189.259 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=7.206, mean=7.206, max=7.206, sum=7.206 (1)", - "tab": "General information", - "score": 7.206 - } - } - }, - "generation_config": { - "additional_details": { - "mode": "closedbook" - } - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.862, - "details": { - "description": "min=0.862, mean=0.862, max=0.862, sum=0.862 (1)", - "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": { - "description": "min=1.558, mean=1.558, max=1.558, sum=1.558 (1)", - "tab": "Efficiency", - "score": 1.5584912838935852 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=328.79, mean=328.79, max=328.79, sum=328.79 (1)", - "tab": "General information", - "score": 328.79 - }, - "OpenbookQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.639, - "details": { - "description": "min=0.38, mean=0.639, max=0.9, sum=3.196 (5)", - "tab": "Accuracy", - "MMLU - Observed inference time (s)": { - "description": "min=1.609, mean=1.728, max=1.936, sum=8.641 (5)", - "tab": "Efficiency", - "score": 1.7282055348597072 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - 
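
The per-metric details in these records pack their aggregates into a single string of the form "min=..., mean=..., max=..., sum=... (n)" rather than separate numeric fields, so downstream code has to parse the string. A regex sketch for recovering the numbers; the function name and pattern are ours, not part of the schema:

    import re

    _DESC = re.compile(
        r"min=(?P<min>[-\d.]+), mean=(?P<mean>[-\d.]+), "
        r"max=(?P<max>[-\d.]+), sum=(?P<sum>[-\d.]+) \((?P<n>\d+)\)"
    )

    def parse_description(desc: str) -> dict:
        """Split an aggregate string into its numeric components."""
        m = _DESC.fullmatch(desc)
        if m is None:
            raise ValueError(f"unrecognized description: {desc!r}")
        out = {k: float(v) for k, v in m.groupdict().items()}
        out["n"] = int(out["n"])
        return out

    # Taken verbatim from the MMLU record above:
    print(parse_description("min=0.38, mean=0.639, max=0.9, sum=3.196 (5)"))
    # -> {'min': 0.38, 'mean': 0.639, 'max': 0.9, 'sum': 3.196, 'n': 5}
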
"description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=435.26, mean=543.747, max=684.596, sum=2718.736 (5)", - "tab": "General information", - "score": 543.747298245614 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MATH", - "source_data": { - "dataset_name": "MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.603, - "details": { - "description": "min=0.491, mean=0.603, max=0.8, sum=4.219 (7)", - "tab": "Accuracy", - "MATH - Observed inference time (s)": { - "description": "min=5.057, mean=6.211, max=7.33, sum=43.477 (7)", - "tab": "Efficiency", - "score": 6.211058685420826 - }, - "MATH - # eval": { - "description": "min=30, mean=62.429, max=135, sum=437 (7)", - "tab": "General information", - "score": 62.42857142857143 - }, - "MATH - # train": { - "description": "min=8, mean=8, max=8, sum=56 (7)", - "tab": "General information", - "score": 8.0 - }, - "MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "MATH - # prompt tokens": { - "description": "min=947.259, mean=1361.814, max=2379.808, sum=9532.699 (7)", - "tab": "General information", - "score": 1361.8141219676104 - }, - "MATH - # output tokens": { - "description": "min=76.07, mean=96.474, max=115.288, sum=675.315 (7)", - "tab": "General information", - "score": 96.47352327848044 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" - } - } - }, - { - "evaluation_name": "GSM8K", - "source_data": { - "dataset_name": "GSM8K", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on GSM8K", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.583, - "details": { - "description": "min=0.583, mean=0.583, max=0.583, sum=0.583 (1)", - "tab": "Accuracy", - "GSM8K - Observed inference time (s)": { - "description": "min=4.857, mean=4.857, max=4.857, sum=4.857 (1)", - "tab": "Efficiency", - "score": 4.857238686800003 - }, - "GSM8K - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "GSM8K - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "GSM8K 
- truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GSM8K - # prompt tokens": { - "description": "min=1012.712, mean=1012.712, max=1012.712, sum=1012.712 (1)", - "tab": "General information", - "score": 1012.712 - }, - "GSM8K - # output tokens": { - "description": "min=78.704, mean=78.704, max=78.704, sum=78.704 (1)", - "tab": "General information", - "score": 78.704 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "LegalBench", - "source_data": { - "dataset_name": "LegalBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on LegalBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.643, - "details": { - "description": "min=0.387, mean=0.643, max=0.947, sum=3.216 (5)", - "tab": "Accuracy", - "LegalBench - Observed inference time (s)": { - "description": "min=1.703, mean=2.782, max=6.2, sum=13.911 (5)", - "tab": "Efficiency", - "score": 2.782158235233088 - }, - "LegalBench - # eval": { - "description": "min=95, mean=409.4, max=1000, sum=2047 (5)", - "tab": "General information", - "score": 409.4 - }, - "LegalBench - # train": { - "description": "min=4, mean=4.798, max=5, sum=23.99 (5)", - "tab": "General information", - "score": 4.797959183673469 - }, - "LegalBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "LegalBench - # prompt tokens": { - "description": "min=280.653, mean=1621.356, max=6484.969, sum=8106.779 (5)", - "tab": "General information", - "score": 1621.3558670820687 - }, - "LegalBench - # output tokens": { - "description": "min=1, mean=3.338, max=11.058, sum=16.692 (5)", - "tab": "General information", - "score": 3.338449275778001 - } - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] - } - } - }, - { - "evaluation_name": "MedQA", - "source_data": { - "dataset_name": "MedQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MedQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.652, - "details": { - "description": "min=0.652, mean=0.652, max=0.652, sum=0.652 (1)", - "tab": "Accuracy", - "MedQA - Observed inference time (s)": { - "description": "min=2.254, mean=2.254, max=2.254, sum=2.254 (1)", - "tab": "Efficiency", - "score": 2.2539968865055213 - }, - "MedQA - # eval": { - "description": "min=503, mean=503, max=503, sum=503 (1)", - "tab": "General information", - "score": 503.0 - }, - "MedQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "MedQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MedQA - # prompt tokens": { - "description": "min=1092.437, mean=1092.437, max=1092.437, sum=1092.437 (1)", - "tab": "General information", - "score": 1092.4373757455269 - }, - "MedQA - # 
output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WMT 2014", - "source_data": { - "dataset_name": "WMT 2014", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.219, - "details": { - "description": "min=0.159, mean=0.219, max=0.268, sum=1.095 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time (s)": { - "description": "min=1.692, mean=1.995, max=2.443, sum=9.976 (5)", - "tab": "Efficiency", - "score": 1.9951115173159082 - }, - "WMT 2014 - # eval": { - "description": "min=503, mean=568.8, max=832, sum=2844 (5)", - "tab": "General information", - "score": 568.8 - }, - "WMT 2014 - # train": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "WMT 2014 - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "WMT 2014 - # prompt tokens": { - "description": "min=197.406, mean=218.573, max=240.974, sum=1092.866 (5)", - "tab": "General information", - "score": 218.57322077152472 - }, - "WMT 2014 - # output tokens": { - "description": "min=24.254, mean=25.653, max=26.374, sum=128.266 (5)", - "tab": "General information", - "score": 25.65316323214559 - } - } - }, - "generation_config": { - "additional_details": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_lite/anthropic/claude-2.1/a039c598-3f93-4f59-a8c4-f1ae3d7b241c.json b/data/helm_lite/anthropic/claude-2.1/a039c598-3f93-4f59-a8c4-f1ae3d7b241c.json deleted file mode 100644 index 2adbb62af..000000000 --- a/data/helm_lite/anthropic/claude-2.1/a039c598-3f93-4f59-a8c4-f1ae3d7b241c.json +++ /dev/null @@ -1,641 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_lite/anthropic_claude-2.1/1770834614.1822479", - "retrieved_timestamp": "1770834614.1822479", - "source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Claude 2.1", - "id": "anthropic/claude-2.1", - "developer": "anthropic", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_lite", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.437, - "details": { - "tab": "Accuracy", - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.08012484394506866 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - 
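
"Mean win rate" is the one cross-model aggregate in these files: per its own evaluation_description, it averages, over the scenario columns, how many of the other models this model outperforms. A sketch of that computation under one plausible reading (fraction of strictly lower-scoring models per column, averaged over columns); the three-model, two-scenario table below is cut down from scores appearing in this diff:

    def mean_win_rate(scores: dict, model: str) -> float:
        """scores: model id -> {scenario: score}; every scenario in these
        files has lower_is_better false, so a higher score always wins."""
        rates = []
        for scenario, own in scores[model].items():
            others = [s[scenario] for m, s in scores.items()
                      if m != model and scenario in s]
            if others:  # fraction of other models strictly outperformed
                rates.append(sum(own > o for o in others) / len(others))
        return sum(rates) / len(rates)

    scores = {
        "amazon/nova-pro-v1:0": {"MMLU": 0.758, "GSM8K": 0.870},
        "anthropic/claude-2.0": {"MMLU": 0.639, "GSM8K": 0.583},
        "anthropic/claude-2.1": {"MMLU": 0.643, "GSM8K": 0.604},
    }
    print(mean_win_rate(scores, "anthropic/claude-2.1"))  # 0.5
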
"evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.677, - "details": { - "description": "min=0.677, mean=0.677, max=0.677, sum=0.677 (1)", - "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": { - "description": "min=5.376, mean=5.376, max=5.376, sum=5.376 (1)", - "tab": "Efficiency", - "score": 5.376147254755799 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=3709.741, mean=3709.741, max=3709.741, sum=3709.741 (1)", - "tab": "General information", - "score": 3709.7408450704224 - }, - "NarrativeQA - # output tokens": { - "description": "min=12.431, mean=12.431, max=12.431, sum=12.431 (1)", - "tab": "General information", - "score": 12.430985915492958 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (closed-book)", - "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.375, - "details": { - "description": "min=0.375, mean=0.375, max=0.375, sum=0.375 (1)", - "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time (s)": { - "description": "min=4.161, mean=4.161, max=4.161, sum=4.161 (1)", - "tab": "Efficiency", - "score": 4.16052336707216 - }, - "NaturalQuestions (closed-book) - Observed inference time (s)": { - "description": "min=1.753, mean=1.753, max=1.753, sum=1.753 (1)", - "tab": "Efficiency", - "score": 1.753281570672989 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.964, mean=4.964, max=4.964, sum=4.964 (1)", - "tab": "General information", - "score": 4.964 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.007, mean=0.007, max=0.007, sum=0.007 (1)", - "tab": "General information", - "score": 0.007 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1734.363, mean=1734.363, max=1734.363, sum=1734.363 (1)", - "tab": "General information", - "score": 1734.363 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=19.738, mean=19.738, max=19.738, sum=19.738 (1)", - "tab": "General information", - "score": 19.738 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, 
sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=189.259, mean=189.259, max=189.259, sum=189.259 (1)", - "tab": "General information", - "score": 189.259 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=11.053, mean=11.053, max=11.053, sum=11.053 (1)", - "tab": "General information", - "score": 11.053 - } - } - }, - "generation_config": { - "additional_details": { - "mode": "closedbook" - } - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.872, - "details": { - "description": "min=0.872, mean=0.872, max=0.872, sum=0.872 (1)", - "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": { - "description": "min=1.809, mean=1.809, max=1.809, sum=1.809 (1)", - "tab": "Efficiency", - "score": 1.8090401072502136 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=328.79, mean=328.79, max=328.79, sum=328.79 (1)", - "tab": "General information", - "score": 328.79 - }, - "OpenbookQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.643, - "details": { - "description": "min=0.4, mean=0.643, max=0.92, sum=3.216 (5)", - "tab": "Accuracy", - "MMLU - Observed inference time (s)": { - "description": "min=2.043, mean=2.371, max=2.615, sum=11.855 (5)", - "tab": "Efficiency", - "score": 2.370939975420634 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt 
tokens": { - "description": "min=435.26, mean=543.747, max=684.596, sum=2718.736 (5)", - "tab": "General information", - "score": 543.747298245614 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MATH", - "source_data": { - "dataset_name": "MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.632, - "details": { - "description": "min=0.5, mean=0.632, max=0.852, sum=4.425 (7)", - "tab": "Accuracy", - "MATH - Observed inference time (s)": { - "description": "min=9.158, mean=9.672, max=10.737, sum=67.703 (7)", - "tab": "Efficiency", - "score": 9.671810739168015 - }, - "MATH - # eval": { - "description": "min=30, mean=62.429, max=135, sum=437 (7)", - "tab": "General information", - "score": 62.42857142857143 - }, - "MATH - # train": { - "description": "min=8, mean=8, max=8, sum=56 (7)", - "tab": "General information", - "score": 8.0 - }, - "MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "MATH - # prompt tokens": { - "description": "min=947.259, mean=1361.814, max=2379.808, sum=9532.699 (7)", - "tab": "General information", - "score": 1361.8141219676104 - }, - "MATH - # output tokens": { - "description": "min=79.825, mean=96.72, max=120.842, sum=677.038 (7)", - "tab": "General information", - "score": 96.71972910810119 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" - } - } - }, - { - "evaluation_name": "GSM8K", - "source_data": { - "dataset_name": "GSM8K", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on GSM8K", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.604, - "details": { - "description": "min=0.604, mean=0.604, max=0.604, sum=0.604 (1)", - "tab": "Accuracy", - "GSM8K - Observed inference time (s)": { - "description": "min=7.706, mean=7.706, max=7.706, sum=7.706 (1)", - "tab": "Efficiency", - "score": 7.7061755385398865 - }, - "GSM8K - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "GSM8K - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "GSM8K - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GSM8K - # prompt tokens": { - "description": "min=1012.712, mean=1012.712, max=1012.712, sum=1012.712 (1)", - "tab": 
"General information", - "score": 1012.712 - }, - "GSM8K - # output tokens": { - "description": "min=98.553, mean=98.553, max=98.553, sum=98.553 (1)", - "tab": "General information", - "score": 98.553 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "LegalBench", - "source_data": { - "dataset_name": "LegalBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on LegalBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.643, - "details": { - "description": "min=0.406, mean=0.643, max=0.874, sum=3.214 (5)", - "tab": "Accuracy", - "LegalBench - Observed inference time (s)": { - "description": "min=2.23, mean=3.223, max=6.58, sum=16.113 (5)", - "tab": "Efficiency", - "score": 3.2225898594048035 - }, - "LegalBench - # eval": { - "description": "min=95, mean=409.4, max=1000, sum=2047 (5)", - "tab": "General information", - "score": 409.4 - }, - "LegalBench - # train": { - "description": "min=4, mean=4.798, max=5, sum=23.99 (5)", - "tab": "General information", - "score": 4.797959183673469 - }, - "LegalBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "LegalBench - # prompt tokens": { - "description": "min=280.653, mean=1621.356, max=6484.969, sum=8106.779 (5)", - "tab": "General information", - "score": 1621.3558670820687 - }, - "LegalBench - # output tokens": { - "description": "min=1, mean=1.455, max=2.137, sum=7.277 (5)", - "tab": "General information", - "score": 1.4554741431234763 - } - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] - } - } - }, - { - "evaluation_name": "MedQA", - "source_data": { - "dataset_name": "MedQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MedQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.644, - "details": { - "description": "min=0.644, mean=0.644, max=0.644, sum=0.644 (1)", - "tab": "Accuracy", - "MedQA - Observed inference time (s)": { - "description": "min=2.482, mean=2.482, max=2.482, sum=2.482 (1)", - "tab": "Efficiency", - "score": 2.482170646754695 - }, - "MedQA - # eval": { - "description": "min=503, mean=503, max=503, sum=503 (1)", - "tab": "General information", - "score": 503.0 - }, - "MedQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "MedQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MedQA - # prompt tokens": { - "description": "min=1092.437, mean=1092.437, max=1092.437, sum=1092.437 (1)", - "tab": "General information", - "score": 1092.4373757455269 - }, - "MedQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WMT 2014", - 
"source_data": { - "dataset_name": "WMT 2014", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.204, - "details": { - "description": "min=0.148, mean=0.204, max=0.233, sum=1.021 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time (s)": { - "description": "min=2.478, mean=2.756, max=3.455, sum=13.78 (5)", - "tab": "Efficiency", - "score": 2.7559348208894425 - }, - "WMT 2014 - # eval": { - "description": "min=503, mean=568.8, max=832, sum=2844 (5)", - "tab": "General information", - "score": 568.8 - }, - "WMT 2014 - # train": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "WMT 2014 - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "WMT 2014 - # prompt tokens": { - "description": "min=197.406, mean=218.573, max=240.974, sum=1092.866 (5)", - "tab": "General information", - "score": 218.57322077152472 - }, - "WMT 2014 - # output tokens": { - "description": "min=24.439, mean=25.235, max=26.058, sum=126.175 (5)", - "tab": "General information", - "score": 25.235038327725952 - } - } - }, - "generation_config": { - "additional_details": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_lite/anthropic/claude-3-5-haiku-20241022/54bac699-aa82-4133-8c10-c6510c2a7f95.json b/data/helm_lite/anthropic/claude-3-5-haiku-20241022/54bac699-aa82-4133-8c10-c6510c2a7f95.json deleted file mode 100644 index ff757a7ad..000000000 --- a/data/helm_lite/anthropic/claude-3-5-haiku-20241022/54bac699-aa82-4133-8c10-c6510c2a7f95.json +++ /dev/null @@ -1,644 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_lite/anthropic_claude-3-5-haiku-20241022/1770834614.1822479", - "retrieved_timestamp": "1770834614.1822479", - "source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Claude 3.5 Haiku 20241022", - "id": "anthropic/claude-3-5-haiku-20241022", - "developer": "anthropic", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_lite", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.531, - "details": { - "tab": "Accuracy", - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.29044943820224717 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.763, - "details": { - "description": "min=0.763, mean=0.763, max=0.763, sum=0.763 (1)", - "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": { - "description": "min=1.304, mean=1.304, max=1.304, sum=1.304 (1)", - "tab": "Efficiency", - "score": 1.3044010672770756 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=3662.741, mean=3662.741, max=3662.741, sum=3662.741 (1)", - "tab": "General information", - "score": 3662.7408450704224 - }, - "NarrativeQA - # output tokens": { - "description": "min=7.031, mean=7.031, max=7.031, sum=7.031 (1)", - "tab": "General information", - "score": 7.030985915492958 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (closed-book)", - "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.344, - "details": { - "description": "min=0.344, mean=0.344, max=0.344, sum=0.344 (1)", - "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time (s)": { - "description": "min=1.41, mean=1.41, max=1.41, sum=1.41 (1)", - "tab": "Efficiency", - "score": 1.4098961477279663 - }, - "NaturalQuestions (closed-book) - Observed inference time (s)": { - "description": "min=0.799, mean=0.799, max=0.799, sum=0.799 (1)", - "tab": "Efficiency", - "score": 0.7985508556365967 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1726.799, mean=1726.799, max=1726.799, sum=1726.799 (1)", - "tab": "General information", - "score": 1726.799 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=16.792, mean=16.792, max=16.792, sum=16.792 (1)", - "tab": "General information", - "score": 16.792 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 
(1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=134.259, mean=134.259, max=134.259, sum=134.259 (1)", - "tab": "General information", - "score": 134.259 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=18.429, mean=18.429, max=18.429, sum=18.429 (1)", - "tab": "General information", - "score": 18.429 - } - } - }, - "generation_config": { - "additional_details": { - "mode": "closedbook" - } - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.854, - "details": { - "description": "min=0.854, mean=0.854, max=0.854, sum=0.854 (1)", - "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": { - "description": "min=0.9, mean=0.9, max=0.9, sum=0.9 (1)", - "tab": "Efficiency", - "score": 0.8996305031776428 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=263.79, mean=263.79, max=263.79, sum=263.79 (1)", - "tab": "General information", - "score": 263.79 - }, - "OpenbookQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.671, - "details": { - "description": "min=0.47, mean=0.671, max=0.94, sum=3.356 (5)", - "tab": "Accuracy", - "MMLU - Observed inference time (s)": { - "description": "min=0.909, mean=1.002, max=1.196, sum=5.012 (5)", - "tab": "Efficiency", - "score": 1.0023672421856928 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=370.26, mean=478.747, max=619.596, sum=2393.736 (5)", - "tab": "General information", - "score": 478.747298245614 - }, - "MMLU - # output 
tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MATH", - "source_data": { - "dataset_name": "MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.872, - "details": { - "description": "min=0.737, mean=0.872, max=0.988, sum=6.102 (7)", - "tab": "Accuracy", - "MATH - Observed inference time (s)": { - "description": "min=3.671, mean=5.707, max=14.928, sum=39.947 (7)", - "tab": "Efficiency", - "score": 5.706647422047061 - }, - "MATH - # eval": { - "description": "min=30, mean=62.429, max=135, sum=437 (7)", - "tab": "General information", - "score": 62.42857142857143 - }, - "MATH - # train": { - "description": "min=8, mean=8, max=8, sum=56 (7)", - "tab": "General information", - "score": 8.0 - }, - "MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "MATH - # prompt tokens": { - "description": "min=873.259, mean=1287.814, max=2305.808, sum=9014.699 (7)", - "tab": "General information", - "score": 1287.8141219676104 - }, - "MATH - # output tokens": { - "description": "min=165.86, mean=202.645, max=236.769, sum=1418.512 (7)", - "tab": "General information", - "score": 202.6446145676256 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" - } - } - }, - { - "evaluation_name": "GSM8K", - "source_data": { - "dataset_name": "GSM8K", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on GSM8K", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.815, - "details": { - "description": "min=0.815, mean=0.815, max=0.815, sum=0.815 (1)", - "tab": "Accuracy", - "GSM8K - Observed inference time (s)": { - "description": "min=3.915, mean=3.915, max=3.915, sum=3.915 (1)", - "tab": "Efficiency", - "score": 3.915386771917343 - }, - "GSM8K - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "GSM8K - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "GSM8K - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GSM8K - # prompt tokens": { - "description": "min=938.712, mean=938.712, max=938.712, sum=938.712 (1)", - "tab": "General information", - "score": 938.712 - }, - "GSM8K - # output tokens": { - "description": "min=185.342, mean=185.342, max=185.342, sum=185.342 (1)", - "tab": "General 
information", - "score": 185.342 - } - } - }, - "generation_config": { - "additional_details": { - "stop": "none" - } - } - }, - { - "evaluation_name": "LegalBench", - "source_data": { - "dataset_name": "LegalBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on LegalBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.631, - "details": { - "description": "min=0, mean=0.631, max=0.947, sum=3.155 (5)", - "tab": "Accuracy", - "LegalBench - Observed inference time (s)": { - "description": "min=0.62, mean=1.383, max=2.1, sum=6.914 (5)", - "tab": "Efficiency", - "score": 1.3828645188221382 - }, - "LegalBench - # eval": { - "description": "min=95, mean=409.4, max=1000, sum=2047 (5)", - "tab": "General information", - "score": 409.4 - }, - "LegalBench - # train": { - "description": "min=4, mean=4.8, max=5, sum=24 (5)", - "tab": "General information", - "score": 4.8 - }, - "LegalBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "LegalBench - # prompt tokens": { - "description": "min=232.653, mean=1568.242, max=6432.398, sum=7841.208 (5)", - "tab": "General information", - "score": 1568.241581367783 - }, - "LegalBench - # output tokens": { - "description": "min=1, mean=6.998, max=29.403, sum=34.988 (5)", - "tab": "General information", - "score": 6.997580266743151 - } - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ], - "stop": "none" - } - } - }, - { - "evaluation_name": "MedQA", - "source_data": { - "dataset_name": "MedQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MedQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.722, - "details": { - "description": "min=0.722, mean=0.722, max=0.722, sum=0.722 (1)", - "tab": "Accuracy", - "MedQA - Observed inference time (s)": { - "description": "min=0.99, mean=0.99, max=0.99, sum=0.99 (1)", - "tab": "Efficiency", - "score": 0.9896539864435822 - }, - "MedQA - # eval": { - "description": "min=503, mean=503, max=503, sum=503 (1)", - "tab": "General information", - "score": 503.0 - }, - "MedQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "MedQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MedQA - # prompt tokens": { - "description": "min=1027.437, mean=1027.437, max=1027.437, sum=1027.437 (1)", - "tab": "General information", - "score": 1027.4373757455269 - }, - "MedQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WMT 2014", - "source_data": { - "dataset_name": "WMT 2014", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.135, - "details": { - "description": "min=0.077, mean=0.135, max=0.2, sum=0.675 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time (s)": { - "description": "min=0.889, mean=1.087, max=1.411, sum=5.434 (5)", - "tab": "Efficiency", - "score": 1.0867067574964768 - }, - "WMT 2014 - # eval": { - "description": "min=503, mean=568.8, max=832, sum=2844 (5)", - "tab": "General information", - "score": 568.8 - }, - "WMT 2014 - # train": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "WMT 2014 - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "WMT 2014 - # prompt tokens": { - "description": "min=131.406, mean=152.573, max=174.974, sum=762.866 (5)", - "tab": "General information", - "score": 152.5732207715247 - }, - "WMT 2014 - # output tokens": { - "description": "min=33.417, mean=46.766, max=62.029, sum=233.828 (5)", - "tab": "General information", - "score": 46.76561018504359 - } - } - }, - "generation_config": { - "additional_details": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_lite/anthropic/claude-3-5-sonnet-20240620/79b23601-3148-4256-88ce-67e439a87c5b.json b/data/helm_lite/anthropic/claude-3-5-sonnet-20240620/79b23601-3148-4256-88ce-67e439a87c5b.json deleted file mode 100644 index 2c4b0d7d1..000000000 --- a/data/helm_lite/anthropic/claude-3-5-sonnet-20240620/79b23601-3148-4256-88ce-67e439a87c5b.json +++ /dev/null @@ -1,641 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_lite/anthropic_claude-3-5-sonnet-20240620/1770834614.1822479", - "retrieved_timestamp": "1770834614.1822479", - "source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Claude 3.5 Sonnet 20240620", - "id": "anthropic/claude-3-5-sonnet-20240620", - "developer": "anthropic", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_lite", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.885, - "details": { - "tab": "Accuracy", - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.27392009987515603 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.746, - "details": { - "description": "min=0.746, mean=0.746, max=0.746, sum=0.746 (1)", - "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": { - "description": "min=3.5, mean=3.5, max=3.5, sum=3.5 (1)", - "tab": "Efficiency", - "score": 3.5003784911733278 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=3672.741, mean=3672.741, max=3672.741, sum=3672.741 (1)", - "tab": "General information", - "score": 3672.7408450704224 - }, - "NarrativeQA - # output tokens": { - "description": "min=7.854, mean=7.854, max=7.854, sum=7.854 (1)", - "tab": "General information", - "score": 7.853521126760564 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (closed-book)", - "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.502, - "details": { - "description": "min=0.502, mean=0.502, max=0.502, sum=0.502 (1)", - "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time (s)": { - "description": "min=1.834, mean=1.834, max=1.834, sum=1.834 (1)", - "tab": "Efficiency", - "score": 1.8338699455261231 - }, - "NaturalQuestions (closed-book) - Observed inference time (s)": { - "description": "min=0.739, mean=0.739, max=0.739, sum=0.739 (1)", - "tab": "Efficiency", - "score": 0.738832370519638 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1736.799, mean=1736.799, max=1736.799, sum=1736.799 (1)", - "tab": "General information", - "score": 1736.799 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=11.135, mean=11.135, max=11.135, sum=11.135 (1)", - "tab": "General information", - "score": 11.135 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 
(1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=144.259, mean=144.259, max=144.259, sum=144.259 (1)", - "tab": "General information", - "score": 144.259 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=6.069, mean=6.069, max=6.069, sum=6.069 (1)", - "tab": "General information", - "score": 6.069 - } - } - }, - "generation_config": { - "additional_details": { - "mode": "closedbook" - } - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.972, - "details": { - "description": "min=0.972, mean=0.972, max=0.972, sum=0.972 (1)", - "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": { - "description": "min=0.774, mean=0.774, max=0.774, sum=0.774 (1)", - "tab": "Efficiency", - "score": 0.7740971641540527 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=272.79, mean=272.79, max=272.79, sum=272.79 (1)", - "tab": "General information", - "score": 272.79 - }, - "OpenbookQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.799, - "details": { - "description": "min=0.59, mean=0.799, max=0.96, sum=3.997 (5)", - "tab": "Accuracy", - "MMLU - Observed inference time (s)": { - "description": "min=0.765, mean=0.824, max=0.973, sum=4.121 (5)", - "tab": "Efficiency", - "score": 0.8242833791364703 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=379.26, mean=487.747, max=628.596, sum=2438.736 (5)", - "tab": "General information", - "score": 487.747298245614 - }, - "MMLU - # output 
tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MATH", - "source_data": { - "dataset_name": "MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.813, - "details": { - "description": "min=0.579, mean=0.813, max=0.953, sum=5.69 (7)", - "tab": "Accuracy", - "MATH - Observed inference time (s)": { - "description": "min=2.231, mean=3.012, max=3.921, sum=21.081 (7)", - "tab": "Efficiency", - "score": 3.0116338881061275 - }, - "MATH - # eval": { - "description": "min=30, mean=62.429, max=135, sum=437 (7)", - "tab": "General information", - "score": 62.42857142857143 - }, - "MATH - # train": { - "description": "min=8, mean=8, max=8, sum=56 (7)", - "tab": "General information", - "score": 8.0 - }, - "MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "MATH - # prompt tokens": { - "description": "min=897.259, mean=1311.814, max=2329.808, sum=9182.699 (7)", - "tab": "General information", - "score": 1311.8141219676104 - }, - "MATH - # output tokens": { - "description": "min=93.333, mean=143.948, max=207.442, sum=1007.635 (7)", - "tab": "General information", - "score": 143.9478793136688 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" - } - } - }, - { - "evaluation_name": "GSM8K", - "source_data": { - "dataset_name": "GSM8K", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on GSM8K", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.949, - "details": { - "description": "min=0.949, mean=0.949, max=0.949, sum=0.949 (1)", - "tab": "Accuracy", - "GSM8K - Observed inference time (s)": { - "description": "min=3.163, mean=3.163, max=3.163, sum=3.163 (1)", - "tab": "Efficiency", - "score": 3.162740940093994 - }, - "GSM8K - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "GSM8K - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "GSM8K - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GSM8K - # prompt tokens": { - "description": "min=938.712, mean=938.712, max=938.712, sum=938.712 (1)", - "tab": "General information", - "score": 938.712 - }, - "GSM8K - # output tokens": { - "description": "min=165.163, mean=165.163, max=165.163, sum=165.163 (1)", - "tab": "General 
information", - "score": 165.163 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "LegalBench", - "source_data": { - "dataset_name": "LegalBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on LegalBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.707, - "details": { - "description": "min=0.455, mean=0.707, max=0.968, sum=3.533 (5)", - "tab": "Accuracy", - "LegalBench - Observed inference time (s)": { - "description": "min=0.66, mean=1.474, max=4.297, sum=7.369 (5)", - "tab": "Efficiency", - "score": 1.473749651523724 - }, - "LegalBench - # eval": { - "description": "min=95, mean=409.4, max=1000, sum=2047 (5)", - "tab": "General information", - "score": 409.4 - }, - "LegalBench - # train": { - "description": "min=4, mean=4.8, max=5, sum=24 (5)", - "tab": "General information", - "score": 4.8 - }, - "LegalBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "LegalBench - # prompt tokens": { - "description": "min=223.653, mean=1566.242, max=6437.398, sum=7831.208 (5)", - "tab": "General information", - "score": 1566.241581367783 - }, - "LegalBench - # output tokens": { - "description": "min=1, mean=1.328, max=2.053, sum=6.638 (5)", - "tab": "General information", - "score": 1.3276925283235337 - } - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] - } - } - }, - { - "evaluation_name": "MedQA", - "source_data": { - "dataset_name": "MedQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MedQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.825, - "details": { - "description": "min=0.825, mean=0.825, max=0.825, sum=0.825 (1)", - "tab": "Accuracy", - "MedQA - Observed inference time (s)": { - "description": "min=1.199, mean=1.199, max=1.199, sum=1.199 (1)", - "tab": "Efficiency", - "score": 1.1990809397953406 - }, - "MedQA - # eval": { - "description": "min=503, mean=503, max=503, sum=503 (1)", - "tab": "General information", - "score": 503.0 - }, - "MedQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "MedQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MedQA - # prompt tokens": { - "description": "min=1036.437, mean=1036.437, max=1036.437, sum=1036.437 (1)", - "tab": "General information", - "score": 1036.4373757455269 - }, - "MedQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WMT 2014", - "source_data": { - "dataset_name": "WMT 2014", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" 
- ] - }, - "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.229, - "details": { - "description": "min=0.181, mean=0.229, max=0.27, sum=1.145 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time (s)": { - "description": "min=1.838, mean=1.923, max=2.007, sum=9.616 (5)", - "tab": "Efficiency", - "score": 1.9232725335746241 - }, - "WMT 2014 - # eval": { - "description": "min=503, mean=568.8, max=832, sum=2844 (5)", - "tab": "General information", - "score": 568.8 - }, - "WMT 2014 - # train": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "WMT 2014 - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "WMT 2014 - # prompt tokens": { - "description": "min=141.406, mean=162.573, max=184.974, sum=812.866 (5)", - "tab": "General information", - "score": 162.5732207715247 - }, - "WMT 2014 - # output tokens": { - "description": "min=24.282, mean=25.852, max=26.592, sum=129.259 (5)", - "tab": "General information", - "score": 25.85177875057348 - } - } - }, - "generation_config": { - "additional_details": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_lite/anthropic/claude-3-5-sonnet-20241022/e92648e4-75c6-4944-9ec1-880823fefc87.json b/data/helm_lite/anthropic/claude-3-5-sonnet-20241022/e92648e4-75c6-4944-9ec1-880823fefc87.json deleted file mode 100644 index 4b9824f13..000000000 --- a/data/helm_lite/anthropic/claude-3-5-sonnet-20241022/e92648e4-75c6-4944-9ec1-880823fefc87.json +++ /dev/null @@ -1,641 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_lite/anthropic_claude-3-5-sonnet-20241022/1770834614.1822479", - "retrieved_timestamp": "1770834614.1822479", - "source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Claude 3.5 Sonnet 20241022", - "id": "anthropic/claude-3-5-sonnet-20241022", - "developer": "anthropic", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_lite", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.846, - "details": { - "tab": "Accuracy", - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.2994132334581773 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.77, - "details": { - "description": "min=0.77, mean=0.77, max=0.77, sum=0.77 (1)", - "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": { - "description": "min=41.561, mean=41.561, max=41.561, sum=41.561 (1)", - "tab": "Efficiency", - "score": 41.56126285405226 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=3662.741, mean=3662.741, max=3662.741, sum=3662.741 (1)", - "tab": "General information", - "score": 3662.7408450704224 - }, - "NarrativeQA - # output tokens": { - "description": "min=7.031, mean=7.031, max=7.031, sum=7.031 (1)", - "tab": "General information", - "score": 7.030985915492958 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (closed-book)", - "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.467, - "details": { - "description": "min=0.467, mean=0.467, max=0.467, sum=0.467 (1)", - "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time (s)": { - "description": "min=4.722, mean=4.722, max=4.722, sum=4.722 (1)", - "tab": "Efficiency", - "score": 4.721950803041458 - }, - "NaturalQuestions (closed-book) - Observed inference time (s)": { - "description": "min=0.659, mean=0.659, max=0.659, sum=0.659 (1)", - "tab": "Efficiency", - "score": 0.6590276186466217 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1726.799, mean=1726.799, max=1726.799, sum=1726.799 (1)", - "tab": "General information", - "score": 1726.799 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=14.702, mean=14.702, max=14.702, sum=14.702 (1)", - "tab": "General information", - "score": 14.702 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - 
"score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=134.259, mean=134.259, max=134.259, sum=134.259 (1)", - "tab": "General information", - "score": 134.259 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=8.63, mean=8.63, max=8.63, sum=8.63 (1)", - "tab": "General information", - "score": 8.63 - } - } - }, - "generation_config": { - "additional_details": { - "mode": "closedbook" - } - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.966, - "details": { - "description": "min=0.966, mean=0.966, max=0.966, sum=0.966 (1)", - "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": { - "description": "min=1.256, mean=1.256, max=1.256, sum=1.256 (1)", - "tab": "Efficiency", - "score": 1.2558565106391906 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=263.79, mean=263.79, max=263.79, sum=263.79 (1)", - "tab": "General information", - "score": 263.79 - }, - "OpenbookQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.809, - "details": { - "description": "min=0.63, mean=0.809, max=0.96, sum=4.047 (5)", - "tab": "Accuracy", - "MMLU - Observed inference time (s)": { - "description": "min=0.66, mean=0.673, max=0.689, sum=3.367 (5)", - "tab": "Efficiency", - "score": 0.6733581468766195 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=370.26, mean=478.747, max=619.596, sum=2393.736 (5)", - "tab": "General information", - "score": 478.747298245614 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - 
"abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MATH", - "source_data": { - "dataset_name": "MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.904, - "details": { - "description": "min=0.789, mean=0.904, max=0.985, sum=6.326 (7)", - "tab": "Accuracy", - "MATH - Observed inference time (s)": { - "description": "min=3.355, mean=4.052, max=4.718, sum=28.364 (7)", - "tab": "Efficiency", - "score": 4.0520609326088035 - }, - "MATH - # eval": { - "description": "min=30, mean=62.429, max=135, sum=437 (7)", - "tab": "General information", - "score": 62.42857142857143 - }, - "MATH - # train": { - "description": "min=8, mean=8, max=8, sum=56 (7)", - "tab": "General information", - "score": 8.0 - }, - "MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "MATH - # prompt tokens": { - "description": "min=887.259, mean=1301.814, max=2319.808, sum=9112.699 (7)", - "tab": "General information", - "score": 1301.8141219676104 - }, - "MATH - # output tokens": { - "description": "min=127.663, mean=168.831, max=213.077, sum=1181.819 (7)", - "tab": "General information", - "score": 168.831271579864 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" - } - } - }, - { - "evaluation_name": "GSM8K", - "source_data": { - "dataset_name": "GSM8K", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on GSM8K", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.956, - "details": { - "description": "min=0.956, mean=0.956, max=0.956, sum=0.956 (1)", - "tab": "Accuracy", - "GSM8K - Observed inference time (s)": { - "description": "min=3.518, mean=3.518, max=3.518, sum=3.518 (1)", - "tab": "Efficiency", - "score": 3.5175547733306884 - }, - "GSM8K - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "GSM8K - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "GSM8K - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GSM8K - # prompt tokens": { - "description": "min=938.712, mean=938.712, max=938.712, sum=938.712 (1)", - "tab": "General information", - "score": 938.712 - }, - "GSM8K - # output tokens": { - "description": "min=141.152, mean=141.152, max=141.152, sum=141.152 (1)", - "tab": "General information", - "score": 141.152 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "LegalBench", - "source_data": { - "dataset_name": 
"LegalBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on LegalBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.647, - "details": { - "description": "min=0.283, mean=0.647, max=0.989, sum=3.237 (5)", - "tab": "Accuracy", - "LegalBench - Observed inference time (s)": { - "description": "min=0.559, mean=1.013, max=1.649, sum=5.065 (5)", - "tab": "Efficiency", - "score": 1.0130474324650445 - }, - "LegalBench - # eval": { - "description": "min=95, mean=409.4, max=1000, sum=2047 (5)", - "tab": "General information", - "score": 409.4 - }, - "LegalBench - # train": { - "description": "min=4, mean=4.8, max=5, sum=24 (5)", - "tab": "General information", - "score": 4.8 - }, - "LegalBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "LegalBench - # prompt tokens": { - "description": "min=232.653, mean=1568.242, max=6432.398, sum=7841.208 (5)", - "tab": "General information", - "score": 1568.241581367783 - }, - "LegalBench - # output tokens": { - "description": "min=1, mean=3.7, max=13.488, sum=18.498 (5)", - "tab": "General information", - "score": 3.6996529470816006 - } - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] - } - } - }, - { - "evaluation_name": "MedQA", - "source_data": { - "dataset_name": "MedQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MedQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.859, - "details": { - "description": "min=0.859, mean=0.859, max=0.859, sum=0.859 (1)", - "tab": "Accuracy", - "MedQA - Observed inference time (s)": { - "description": "min=0.815, mean=0.815, max=0.815, sum=0.815 (1)", - "tab": "Efficiency", - "score": 0.8153728936348947 - }, - "MedQA - # eval": { - "description": "min=503, mean=503, max=503, sum=503 (1)", - "tab": "General information", - "score": 503.0 - }, - "MedQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "MedQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MedQA - # prompt tokens": { - "description": "min=1027.437, mean=1027.437, max=1027.437, sum=1027.437 (1)", - "tab": "General information", - "score": 1027.4373757455269 - }, - "MedQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WMT 2014", - "source_data": { - "dataset_name": "WMT 2014", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, 
- "score_details": { - "score": 0.226, - "details": { - "description": "min=0.174, mean=0.226, max=0.266, sum=1.128 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time (s)": { - "description": "min=0.838, mean=0.86, max=0.889, sum=4.301 (5)", - "tab": "Efficiency", - "score": 0.8602394085223064 - }, - "WMT 2014 - # eval": { - "description": "min=503, mean=568.8, max=832, sum=2844 (5)", - "tab": "General information", - "score": 568.8 - }, - "WMT 2014 - # train": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "WMT 2014 - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "WMT 2014 - # prompt tokens": { - "description": "min=141.406, mean=162.573, max=184.974, sum=812.866 (5)", - "tab": "General information", - "score": 162.5732207715247 - }, - "WMT 2014 - # output tokens": { - "description": "min=23.825, mean=25.177, max=25.958, sum=125.887 (5)", - "tab": "General information", - "score": 25.177411492582966 - } - } - }, - "generation_config": { - "additional_details": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_lite/anthropic/claude-3-haiku-20240307/449feffd-d2e3-4a08-ad69-b8ad522532ae.json b/data/helm_lite/anthropic/claude-3-haiku-20240307/449feffd-d2e3-4a08-ad69-b8ad522532ae.json deleted file mode 100644 index 8eac62865..000000000 --- a/data/helm_lite/anthropic/claude-3-haiku-20240307/449feffd-d2e3-4a08-ad69-b8ad522532ae.json +++ /dev/null @@ -1,641 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_lite/anthropic_claude-3-haiku-20240307/1770834614.1822479", - "retrieved_timestamp": "1770834614.1822479", - "source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Claude 3 Haiku 20240307", - "id": "anthropic/claude-3-haiku-20240307", - "developer": "anthropic", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_lite", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.263, - "details": { - "tab": "Accuracy", - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.5421473158551811 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.244, - "details": { - "description": "min=0.244, mean=0.244, max=0.244, sum=0.244 
(1)", - "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": { - "description": "min=1.133, mean=1.133, max=1.133, sum=1.133 (1)", - "tab": "Efficiency", - "score": 1.1334171402622277 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=3709.741, mean=3709.741, max=3709.741, sum=3709.741 (1)", - "tab": "General information", - "score": 3709.7408450704224 - }, - "NarrativeQA - # output tokens": { - "description": "min=44.265, mean=44.265, max=44.265, sum=44.265 (1)", - "tab": "General information", - "score": 44.264788732394365 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (closed-book)", - "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.144, - "details": { - "description": "min=0.144, mean=0.144, max=0.144, sum=0.144 (1)", - "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time (s)": { - "description": "min=0.941, mean=0.941, max=0.941, sum=0.941 (1)", - "tab": "Efficiency", - "score": 0.9411524205207825 - }, - "NaturalQuestions (closed-book) - Observed inference time (s)": { - "description": "min=0.865, mean=0.865, max=0.865, sum=0.865 (1)", - "tab": "Efficiency", - "score": 0.8646892714500427 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1781.799, mean=1781.799, max=1781.799, sum=1781.799 (1)", - "tab": "General information", - "score": 1781.799 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=33.024, mean=33.024, max=33.024, sum=33.024 (1)", - "tab": "General information", - "score": 33.024 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=189.259, mean=189.259, max=189.259, sum=189.259 (1)", - "tab": "General information", - "score": 189.259 - }, - 
"NaturalQuestions (closed-book) - # output tokens": { - "description": "min=50.787, mean=50.787, max=50.787, sum=50.787 (1)", - "tab": "General information", - "score": 50.787 - } - } - }, - "generation_config": { - "additional_details": { - "mode": "closedbook" - } - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.838, - "details": { - "description": "min=0.838, mean=0.838, max=0.838, sum=0.838 (1)", - "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": { - "description": "min=0.616, mean=0.616, max=0.616, sum=0.616 (1)", - "tab": "Efficiency", - "score": 0.6164444308280945 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=263.79, mean=263.79, max=263.79, sum=263.79 (1)", - "tab": "General information", - "score": 263.79 - }, - "OpenbookQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.662, - "details": { - "description": "min=0.42, mean=0.662, max=0.95, sum=3.312 (5)", - "tab": "Accuracy", - "MMLU - Observed inference time (s)": { - "description": "min=0.686, mean=0.697, max=0.721, sum=3.485 (5)", - "tab": "Efficiency", - "score": 0.6970766685050831 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=370.26, mean=478.747, max=619.596, sum=2393.736 (5)", - "tab": "General information", - "score": 478.747298245614 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MATH", - 
"source_data": { - "dataset_name": "MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.131, - "details": { - "description": "min=0, mean=0.131, max=0.504, sum=0.916 (7)", - "tab": "Accuracy", - "MATH - Observed inference time (s)": { - "description": "min=0.672, mean=0.895, max=1.288, sum=6.265 (7)", - "tab": "Efficiency", - "score": 0.8950275982044664 - }, - "MATH - # eval": { - "description": "min=30, mean=62.429, max=135, sum=437 (7)", - "tab": "General information", - "score": 62.42857142857143 - }, - "MATH - # train": { - "description": "min=8, mean=8, max=8, sum=56 (7)", - "tab": "General information", - "score": 8.0 - }, - "MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "MATH - # prompt tokens": { - "description": "min=948.259, mean=1362.814, max=2380.808, sum=9539.699 (7)", - "tab": "General information", - "score": 1362.8141219676104 - }, - "MATH - # output tokens": { - "description": "min=3.158, mean=29.033, max=87.17, sum=203.231 (7)", - "tab": "General information", - "score": 29.032964841043174 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" - } - } - }, - { - "evaluation_name": "GSM8K", - "source_data": { - "dataset_name": "GSM8K", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on GSM8K", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.699, - "details": { - "description": "min=0.699, mean=0.699, max=0.699, sum=0.699 (1)", - "tab": "Accuracy", - "GSM8K - Observed inference time (s)": { - "description": "min=1.228, mean=1.228, max=1.228, sum=1.228 (1)", - "tab": "Efficiency", - "score": 1.2278449382781982 - }, - "GSM8K - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "GSM8K - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "GSM8K - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GSM8K - # prompt tokens": { - "description": "min=1012.712, mean=1012.712, max=1012.712, sum=1012.712 (1)", - "tab": "General information", - "score": 1012.712 - }, - "GSM8K - # output tokens": { - "description": "min=77.518, mean=77.518, max=77.518, sum=77.518 (1)", - "tab": "General information", - "score": 77.518 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "LegalBench", - "source_data": { - "dataset_name": "LegalBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - 
"evaluation_description": "EM on LegalBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.46, - "details": { - "description": "min=0.034, mean=0.46, max=0.779, sum=2.301 (5)", - "tab": "Accuracy", - "LegalBench - Observed inference time (s)": { - "description": "min=0.455, mean=0.719, max=0.988, sum=3.593 (5)", - "tab": "Efficiency", - "score": 0.7186767522236834 - }, - "LegalBench - # eval": { - "description": "min=95, mean=409.4, max=1000, sum=2047 (5)", - "tab": "General information", - "score": 409.4 - }, - "LegalBench - # train": { - "description": "min=4, mean=4.8, max=5, sum=24 (5)", - "tab": "General information", - "score": 4.8 - }, - "LegalBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "LegalBench - # prompt tokens": { - "description": "min=214.653, mean=1557.242, max=6428.398, sum=7786.208 (5)", - "tab": "General information", - "score": 1557.241581367783 - }, - "LegalBench - # output tokens": { - "description": "min=1, mean=9.565, max=28.352, sum=47.824 (5)", - "tab": "General information", - "score": 9.56470087480281 - } - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] - } - } - }, - { - "evaluation_name": "MedQA", - "source_data": { - "dataset_name": "MedQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MedQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.702, - "details": { - "description": "min=0.702, mean=0.702, max=0.702, sum=0.702 (1)", - "tab": "Accuracy", - "MedQA - Observed inference time (s)": { - "description": "min=0.653, mean=0.653, max=0.653, sum=0.653 (1)", - "tab": "Efficiency", - "score": 0.6529203475588121 - }, - "MedQA - # eval": { - "description": "min=503, mean=503, max=503, sum=503 (1)", - "tab": "General information", - "score": 503.0 - }, - "MedQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "MedQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MedQA - # prompt tokens": { - "description": "min=1027.437, mean=1027.437, max=1027.437, sum=1027.437 (1)", - "tab": "General information", - "score": 1027.4373757455269 - }, - "MedQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WMT 2014", - "source_data": { - "dataset_name": "WMT 2014", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.148, - "details": { - "description": "min=0.018, mean=0.148, max=0.208, sum=0.74 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time (s)": { - 
"description": "min=0.627, mean=0.711, max=0.891, sum=3.556 (5)", - "tab": "Efficiency", - "score": 0.7111122513056886 - }, - "WMT 2014 - # eval": { - "description": "min=503, mean=568.8, max=832, sum=2844 (5)", - "tab": "General information", - "score": 568.8 - }, - "WMT 2014 - # train": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "WMT 2014 - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "WMT 2014 - # prompt tokens": { - "description": "min=198.406, mean=219.573, max=241.974, sum=1097.866 (5)", - "tab": "General information", - "score": 219.57322077152472 - }, - "WMT 2014 - # output tokens": { - "description": "min=27.598, mean=48.613, max=93.673, sum=243.065 (5)", - "tab": "General information", - "score": 48.6129454044961 - } - } - }, - "generation_config": { - "additional_details": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_lite/anthropic/claude-3-opus-20240229/d297b253-0f4f-4caf-864b-9f457ab589da.json b/data/helm_lite/anthropic/claude-3-opus-20240229/d297b253-0f4f-4caf-864b-9f457ab589da.json deleted file mode 100644 index d590c786e..000000000 --- a/data/helm_lite/anthropic/claude-3-opus-20240229/d297b253-0f4f-4caf-864b-9f457ab589da.json +++ /dev/null @@ -1,641 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_lite/anthropic_claude-3-opus-20240229/1770834614.1822479", - "retrieved_timestamp": "1770834614.1822479", - "source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Claude 3 Opus 20240229", - "id": "anthropic/claude-3-opus-20240229", - "developer": "anthropic", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_lite", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.683, - "details": { - "tab": "Accuracy", - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.09124843945068664 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.351, - "details": { - "description": "min=0.351, mean=0.351, max=0.351, sum=0.351 (1)", - "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": { - "description": "min=3.996, mean=3.996, max=3.996, sum=3.996 (1)", - "tab": "Efficiency", - "score": 
3.9963467248728577 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=3709.741, mean=3709.741, max=3709.741, sum=3709.741 (1)", - "tab": "General information", - "score": 3709.7408450704224 - }, - "NarrativeQA - # output tokens": { - "description": "min=13.589, mean=13.589, max=13.589, sum=13.589 (1)", - "tab": "General information", - "score": 13.588732394366197 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (closed-book)", - "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.441, - "details": { - "description": "min=0.441, mean=0.441, max=0.441, sum=0.441 (1)", - "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time (s)": { - "description": "min=4.273, mean=4.273, max=4.273, sum=4.273 (1)", - "tab": "Efficiency", - "score": 4.273005393266678 - }, - "NaturalQuestions (closed-book) - Observed inference time (s)": { - "description": "min=1.647, mean=1.647, max=1.647, sum=1.647 (1)", - "tab": "Efficiency", - "score": 1.6471402559280395 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1781.799, mean=1781.799, max=1781.799, sum=1781.799 (1)", - "tab": "General information", - "score": 1781.799 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=39.248, mean=39.248, max=39.248, sum=39.248 (1)", - "tab": "General information", - "score": 39.248 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=189.259, mean=189.259, max=189.259, sum=189.259 (1)", - "tab": "General information", - "score": 189.259 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=5.66, mean=5.66, max=5.66, sum=5.66 (1)", - "tab": "General information", - "score": 5.66 - } - } - }, - 
"generation_config": { - "additional_details": { - "mode": "closedbook" - } - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.956, - "details": { - "description": "min=0.956, mean=0.956, max=0.956, sum=0.956 (1)", - "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": { - "description": "min=2.168, mean=2.168, max=2.168, sum=2.168 (1)", - "tab": "Efficiency", - "score": 2.167769320487976 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=263.79, mean=263.79, max=263.79, sum=263.79 (1)", - "tab": "General information", - "score": 263.79 - }, - "OpenbookQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.768, - "details": { - "description": "min=0.6, mean=0.768, max=0.96, sum=3.839 (5)", - "tab": "Accuracy", - "MMLU - Observed inference time (s)": { - "description": "min=4.003, mean=4.19, max=4.373, sum=20.948 (5)", - "tab": "Efficiency", - "score": 4.189554240862528 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=370.26, mean=478.747, max=619.596, sum=2393.736 (5)", - "tab": "General information", - "score": 478.747298245614 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MATH", - "source_data": { - "dataset_name": "MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, 
- "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.76, - "details": { - "description": "min=0.526, mean=0.76, max=0.889, sum=5.322 (7)", - "tab": "Accuracy", - "MATH - Observed inference time (s)": { - "description": "min=6.095, mean=7.542, max=9.041, sum=52.793 (7)", - "tab": "Efficiency", - "score": 7.541890628266922 - }, - "MATH - # eval": { - "description": "min=30, mean=62.429, max=135, sum=437 (7)", - "tab": "General information", - "score": 62.42857142857143 - }, - "MATH - # train": { - "description": "min=8, mean=8, max=8, sum=56 (7)", - "tab": "General information", - "score": 8.0 - }, - "MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "MATH - # prompt tokens": { - "description": "min=948.259, mean=1362.814, max=2380.808, sum=9539.699 (7)", - "tab": "General information", - "score": 1362.8141219676104 - }, - "MATH - # output tokens": { - "description": "min=82.965, mean=113.906, max=138.263, sum=797.345 (7)", - "tab": "General information", - "score": 113.90635737624721 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" - } - } - }, - { - "evaluation_name": "GSM8K", - "source_data": { - "dataset_name": "GSM8K", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on GSM8K", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.924, - "details": { - "description": "min=0.924, mean=0.924, max=0.924, sum=0.924 (1)", - "tab": "Accuracy", - "GSM8K - Observed inference time (s)": { - "description": "min=7.469, mean=7.469, max=7.469, sum=7.469 (1)", - "tab": "Efficiency", - "score": 7.469249876976013 - }, - "GSM8K - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "GSM8K - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "GSM8K - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GSM8K - # prompt tokens": { - "description": "min=1012.712, mean=1012.712, max=1012.712, sum=1012.712 (1)", - "tab": "General information", - "score": 1012.712 - }, - "GSM8K - # output tokens": { - "description": "min=115.934, mean=115.934, max=115.934, sum=115.934 (1)", - "tab": "General information", - "score": 115.934 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "LegalBench", - "source_data": { - "dataset_name": "LegalBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on LegalBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.662, - "details": { - 
"description": "min=0.153, mean=0.662, max=0.989, sum=3.31 (5)", - "tab": "Accuracy", - "LegalBench - Observed inference time (s)": { - "description": "min=1.391, mean=2.57, max=4.856, sum=12.851 (5)", - "tab": "Efficiency", - "score": 2.570133829482505 - }, - "LegalBench - # eval": { - "description": "min=95, mean=409.4, max=1000, sum=2047 (5)", - "tab": "General information", - "score": 409.4 - }, - "LegalBench - # train": { - "description": "min=4, mean=4.8, max=5, sum=24 (5)", - "tab": "General information", - "score": 4.8 - }, - "LegalBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "LegalBench - # prompt tokens": { - "description": "min=214.653, mean=1557.242, max=6428.398, sum=7786.208 (5)", - "tab": "General information", - "score": 1557.241581367783 - }, - "LegalBench - # output tokens": { - "description": "min=1, mean=1.605, max=2.932, sum=8.023 (5)", - "tab": "General information", - "score": 1.6045285459659269 - } - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] - } - } - }, - { - "evaluation_name": "MedQA", - "source_data": { - "dataset_name": "MedQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MedQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.775, - "details": { - "description": "min=0.775, mean=0.775, max=0.775, sum=0.775 (1)", - "tab": "Accuracy", - "MedQA - Observed inference time (s)": { - "description": "min=2.65, mean=2.65, max=2.65, sum=2.65 (1)", - "tab": "Efficiency", - "score": 2.6499544673601156 - }, - "MedQA - # eval": { - "description": "min=503, mean=503, max=503, sum=503 (1)", - "tab": "General information", - "score": 503.0 - }, - "MedQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "MedQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MedQA - # prompt tokens": { - "description": "min=1027.437, mean=1027.437, max=1027.437, sum=1027.437 (1)", - "tab": "General information", - "score": 1027.4373757455269 - }, - "MedQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WMT 2014", - "source_data": { - "dataset_name": "WMT 2014", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.24, - "details": { - "description": "min=0.188, mean=0.24, max=0.285, sum=1.199 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time (s)": { - "description": "min=2.279, mean=2.447, max=2.661, sum=12.233 (5)", - "tab": "Efficiency", - "score": 2.4465377724275283 - }, - "WMT 2014 - # eval": { - "description": "min=503, mean=568.8, max=832, sum=2844 (5)", - 
"tab": "General information", - "score": 568.8 - }, - "WMT 2014 - # train": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "WMT 2014 - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "WMT 2014 - # prompt tokens": { - "description": "min=198.406, mean=219.573, max=241.974, sum=1097.866 (5)", - "tab": "General information", - "score": 219.57322077152472 - }, - "WMT 2014 - # output tokens": { - "description": "min=24.332, mean=25.837, max=26.616, sum=129.185 (5)", - "tab": "General information", - "score": 25.837047426976607 - } - } - }, - "generation_config": { - "additional_details": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_lite/anthropic/claude-3-sonnet-20240229/d7a7e038-0985-4ee2-a549-0906b3aa8cc5.json b/data/helm_lite/anthropic/claude-3-sonnet-20240229/d7a7e038-0985-4ee2-a549-0906b3aa8cc5.json deleted file mode 100644 index 90baddbf7..000000000 --- a/data/helm_lite/anthropic/claude-3-sonnet-20240229/d7a7e038-0985-4ee2-a549-0906b3aa8cc5.json +++ /dev/null @@ -1,641 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_lite/anthropic_claude-3-sonnet-20240229/1770834614.1822479", - "retrieved_timestamp": "1770834614.1822479", - "source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Claude 3 Sonnet 20240229", - "id": "anthropic/claude-3-sonnet-20240229", - "developer": "anthropic", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_lite", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.377, - "details": { - "tab": "Accuracy", - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.27500624219725345 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.111, - "details": { - "description": "min=0.111, mean=0.111, max=0.111, sum=0.111 (1)", - "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": { - "description": "min=2.239, mean=2.239, max=2.239, sum=2.239 (1)", - "tab": "Efficiency", - "score": 2.2392607588163562 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": 
"min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=3709.741, mean=3709.741, max=3709.741, sum=3709.741 (1)", - "tab": "General information", - "score": 3709.7408450704224 - }, - "NarrativeQA - # output tokens": { - "description": "min=30.372, mean=30.372, max=30.372, sum=30.372 (1)", - "tab": "General information", - "score": 30.371830985915494 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (closed-book)", - "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.028, - "details": { - "description": "min=0.028, mean=0.028, max=0.028, sum=0.028 (1)", - "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time (s)": { - "description": "min=1.828, mean=1.828, max=1.828, sum=1.828 (1)", - "tab": "Efficiency", - "score": 1.828468058347702 - }, - "NaturalQuestions (closed-book) - Observed inference time (s)": { - "description": "min=1.226, mean=1.226, max=1.226, sum=1.226 (1)", - "tab": "Efficiency", - "score": 1.2262272393703462 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1781.799, mean=1781.799, max=1781.799, sum=1781.799 (1)", - "tab": "General information", - "score": 1781.799 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=31.113, mean=31.113, max=31.113, sum=31.113 (1)", - "tab": "General information", - "score": 31.113 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=189.259, mean=189.259, max=189.259, sum=189.259 (1)", - "tab": "General information", - "score": 189.259 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=26.563, mean=26.563, max=26.563, sum=26.563 (1)", - "tab": "General information", - "score": 26.563 - } - } - }, - "generation_config": { - "additional_details": { - "mode": "closedbook" - } - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - 
"url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.918, - "details": { - "description": "min=0.918, mean=0.918, max=0.918, sum=0.918 (1)", - "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": { - "description": "min=1.032, mean=1.032, max=1.032, sum=1.032 (1)", - "tab": "Efficiency", - "score": 1.031575677871704 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=263.79, mean=263.79, max=263.79, sum=263.79 (1)", - "tab": "General information", - "score": 263.79 - }, - "OpenbookQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.652, - "details": { - "description": "min=0.39, mean=0.652, max=0.94, sum=3.26 (5)", - "tab": "Accuracy", - "MMLU - Observed inference time (s)": { - "description": "min=1.228, mean=1.278, max=1.341, sum=6.391 (5)", - "tab": "Efficiency", - "score": 1.2781797420267473 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=370.26, mean=478.747, max=619.596, sum=2393.736 (5)", - "tab": "General information", - "score": 478.747298245614 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MATH", - "source_data": { - "dataset_name": "MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { 
- "score": 0.084, - "details": { - "description": "min=0, mean=0.084, max=0.337, sum=0.591 (7)", - "tab": "Accuracy", - "MATH - Observed inference time (s)": { - "description": "min=2.092, mean=2.33, max=2.633, sum=16.311 (7)", - "tab": "Efficiency", - "score": 2.3301560711519222 - }, - "MATH - # eval": { - "description": "min=30, mean=62.429, max=135, sum=437 (7)", - "tab": "General information", - "score": 62.42857142857143 - }, - "MATH - # train": { - "description": "min=8, mean=8, max=8, sum=56 (7)", - "tab": "General information", - "score": 8.0 - }, - "MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "MATH - # prompt tokens": { - "description": "min=948.259, mean=1362.814, max=2380.808, sum=9539.699 (7)", - "tab": "General information", - "score": 1362.8141219676104 - }, - "MATH - # output tokens": { - "description": "min=44.263, mean=52.374, max=62.256, sum=366.62 (7)", - "tab": "General information", - "score": 52.37429092508652 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" - } - } - }, - { - "evaluation_name": "GSM8K", - "source_data": { - "dataset_name": "GSM8K", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on GSM8K", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.907, - "details": { - "description": "min=0.907, mean=0.907, max=0.907, sum=0.907 (1)", - "tab": "Accuracy", - "GSM8K - Observed inference time (s)": { - "description": "min=3.213, mean=3.213, max=3.213, sum=3.213 (1)", - "tab": "Efficiency", - "score": 3.2127642614841463 - }, - "GSM8K - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "GSM8K - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "GSM8K - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GSM8K - # prompt tokens": { - "description": "min=1012.712, mean=1012.712, max=1012.712, sum=1012.712 (1)", - "tab": "General information", - "score": 1012.712 - }, - "GSM8K - # output tokens": { - "description": "min=114.663, mean=114.663, max=114.663, sum=114.663 (1)", - "tab": "General information", - "score": 114.663 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "LegalBench", - "source_data": { - "dataset_name": "LegalBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on LegalBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.49, - "details": { - "description": "min=0.029, mean=0.49, max=0.958, sum=2.448 (5)", - "tab": "Accuracy", - "LegalBench - Observed inference time (s)": { - "description": "min=0.683, mean=1.316, max=2.689, sum=6.58 (5)", - "tab": 
"Efficiency", - "score": 1.3159105889028733 - }, - "LegalBench - # eval": { - "description": "min=95, mean=409.4, max=1000, sum=2047 (5)", - "tab": "General information", - "score": 409.4 - }, - "LegalBench - # train": { - "description": "min=4, mean=4.8, max=5, sum=24 (5)", - "tab": "General information", - "score": 4.8 - }, - "LegalBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "LegalBench - # prompt tokens": { - "description": "min=214.653, mean=1557.242, max=6428.398, sum=7786.208 (5)", - "tab": "General information", - "score": 1557.241581367783 - }, - "LegalBench - # output tokens": { - "description": "min=1, mean=9.202, max=27.753, sum=46.009 (5)", - "tab": "General information", - "score": 9.201869121421694 - } - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] - } - } - }, - { - "evaluation_name": "MedQA", - "source_data": { - "dataset_name": "MedQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MedQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.684, - "details": { - "description": "min=0.684, mean=0.684, max=0.684, sum=0.684 (1)", - "tab": "Accuracy", - "MedQA - Observed inference time (s)": { - "description": "min=1.143, mean=1.143, max=1.143, sum=1.143 (1)", - "tab": "Efficiency", - "score": 1.1428523476033752 - }, - "MedQA - # eval": { - "description": "min=503, mean=503, max=503, sum=503 (1)", - "tab": "General information", - "score": 503.0 - }, - "MedQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "MedQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MedQA - # prompt tokens": { - "description": "min=1027.437, mean=1027.437, max=1027.437, sum=1027.437 (1)", - "tab": "General information", - "score": 1027.4373757455269 - }, - "MedQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WMT 2014", - "source_data": { - "dataset_name": "WMT 2014", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.218, - "details": { - "description": "min=0.169, mean=0.218, max=0.25, sum=1.091 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time (s)": { - "description": "min=1.066, mean=1.139, max=1.228, sum=5.697 (5)", - "tab": "Efficiency", - "score": 1.1393479201068188 - }, - "WMT 2014 - # eval": { - "description": "min=503, mean=568.8, max=832, sum=2844 (5)", - "tab": "General information", - "score": 568.8 - }, - "WMT 2014 - # train": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "WMT 2014 - truncated": { 
- "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "WMT 2014 - # prompt tokens": { - "description": "min=198.406, mean=219.573, max=241.974, sum=1097.866 (5)", - "tab": "General information", - "score": 219.57322077152472 - }, - "WMT 2014 - # output tokens": { - "description": "min=24.517, mean=26.056, max=27.078, sum=130.278 (5)", - "tab": "General information", - "score": 26.05551068588469 - } - } - }, - "generation_config": { - "additional_details": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_lite/anthropic/claude-instant-1.2/cb409208-034d-42fd-acce-ab5cc4227383.json b/data/helm_lite/anthropic/claude-instant-1.2/cb409208-034d-42fd-acce-ab5cc4227383.json deleted file mode 100644 index c3ca60cb8..000000000 --- a/data/helm_lite/anthropic/claude-instant-1.2/cb409208-034d-42fd-acce-ab5cc4227383.json +++ /dev/null @@ -1,641 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_lite/anthropic_claude-instant-1.2/1770834614.1822479", - "retrieved_timestamp": "1770834614.1822479", - "source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Claude Instant 1.2", - "id": "anthropic/claude-instant-1.2", - "developer": "anthropic", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_lite", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.399, - "details": { - "tab": "Accuracy", - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.4998377028714107 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.616, - "details": { - "description": "min=0.616, mean=0.616, max=0.616, sum=0.616 (1)", - "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": { - "description": "min=1.491, mean=1.491, max=1.491, sum=1.491 (1)", - "tab": "Efficiency", - "score": 1.490500447447871 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt 
tokens": { - "description": "min=3709.741, mean=3709.741, max=3709.741, sum=3709.741 (1)", - "tab": "General information", - "score": 3709.7408450704224 - }, - "NarrativeQA - # output tokens": { - "description": "min=17.149, mean=17.149, max=17.149, sum=17.149 (1)", - "tab": "General information", - "score": 17.149295774647886 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (closed-book)", - "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.343, - "details": { - "description": "min=0.343, mean=0.343, max=0.343, sum=0.343 (1)", - "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time (s)": { - "description": "min=0.975, mean=0.975, max=0.975, sum=0.975 (1)", - "tab": "Efficiency", - "score": 0.9746438981543135 - }, - "NaturalQuestions (closed-book) - Observed inference time (s)": { - "description": "min=0.674, mean=0.674, max=0.674, sum=0.674 (1)", - "tab": "Efficiency", - "score": 0.6736472499370575 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.964, mean=4.964, max=4.964, sum=4.964 (1)", - "tab": "General information", - "score": 4.964 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.007, mean=0.007, max=0.007, sum=0.007 (1)", - "tab": "General information", - "score": 0.007 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1734.363, mean=1734.363, max=1734.363, sum=1734.363 (1)", - "tab": "General information", - "score": 1734.363 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=8.217, mean=8.217, max=8.217, sum=8.217 (1)", - "tab": "General information", - "score": 8.217 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=189.259, mean=189.259, max=189.259, sum=189.259 (1)", - "tab": "General information", - "score": 189.259 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=5.113, mean=5.113, max=5.113, sum=5.113 (1)", - "tab": "General information", - "score": 5.113 - } - } - }, - "generation_config": { - "additional_details": { - "mode": "closedbook" - } - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.844, - "details": { - "description": "min=0.844, mean=0.844, max=0.844, sum=0.844 (1)", - "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": { - "description": "min=0.597, mean=0.597, max=0.597, sum=0.597 (1)", - "tab": "Efficiency", - "score": 0.596853446483612 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=328.79, mean=328.79, max=328.79, sum=328.79 (1)", - "tab": "General information", - "score": 328.79 - }, - "OpenbookQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.631, - "details": { - "description": "min=0.37, mean=0.631, max=0.9, sum=3.154 (5)", - "tab": "Accuracy", - "MMLU - Observed inference time (s)": { - "description": "min=0.59, mean=0.614, max=0.636, sum=3.069 (5)", - "tab": "Efficiency", - "score": 0.613885824571576 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=435.26, mean=543.747, max=684.596, sum=2718.736 (5)", - "tab": "General information", - "score": 543.747298245614 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MATH", - "source_data": { - "dataset_name": "MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.499, - "details": { - "description": "min=0.365, mean=0.499, max=0.704, sum=3.491 (7)", - "tab": "Accuracy", - "MATH - Observed inference time (s)": { - "description": "min=1.247, mean=1.403, 
max=1.528, sum=9.821 (7)", - "tab": "Efficiency", - "score": 1.4029501960147133 - }, - "MATH - # eval": { - "description": "min=30, mean=62.429, max=135, sum=437 (7)", - "tab": "General information", - "score": 62.42857142857143 - }, - "MATH - # train": { - "description": "min=8, mean=8, max=8, sum=56 (7)", - "tab": "General information", - "score": 8.0 - }, - "MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "MATH - # prompt tokens": { - "description": "min=947.259, mean=1361.814, max=2379.808, sum=9532.699 (7)", - "tab": "General information", - "score": 1361.8141219676104 - }, - "MATH - # output tokens": { - "description": "min=54.491, mean=65.956, max=76.513, sum=461.691 (7)", - "tab": "General information", - "score": 65.95586481608514 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" - } - } - }, - { - "evaluation_name": "GSM8K", - "source_data": { - "dataset_name": "GSM8K", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on GSM8K", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.721, - "details": { - "description": "min=0.721, mean=0.721, max=0.721, sum=0.721 (1)", - "tab": "Accuracy", - "GSM8K - Observed inference time (s)": { - "description": "min=1.474, mean=1.474, max=1.474, sum=1.474 (1)", - "tab": "Efficiency", - "score": 1.474282945394516 - }, - "GSM8K - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "GSM8K - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "GSM8K - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GSM8K - # prompt tokens": { - "description": "min=1012.712, mean=1012.712, max=1012.712, sum=1012.712 (1)", - "tab": "General information", - "score": 1012.712 - }, - "GSM8K - # output tokens": { - "description": "min=105.998, mean=105.998, max=105.998, sum=105.998 (1)", - "tab": "General information", - "score": 105.998 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "LegalBench", - "source_data": { - "dataset_name": "LegalBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on LegalBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.586, - "details": { - "description": "min=0.341, mean=0.586, max=0.937, sum=2.931 (5)", - "tab": "Accuracy", - "LegalBench - Observed inference time (s)": { - "description": "min=0.629, mean=0.911, max=1.974, sum=4.555 (5)", - "tab": "Efficiency", - "score": 0.9110085331512334 - }, - "LegalBench - # eval": { - "description": "min=95, mean=409.4, max=1000, sum=2047 (5)", - "tab": "General information", - "score": 409.4 - }, - 
"LegalBench - # train": { - "description": "min=4, mean=4.798, max=5, sum=23.99 (5)", - "tab": "General information", - "score": 4.797959183673469 - }, - "LegalBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "LegalBench - # prompt tokens": { - "description": "min=280.653, mean=1621.356, max=6484.969, sum=8106.779 (5)", - "tab": "General information", - "score": 1621.3558670820687 - }, - "LegalBench - # output tokens": { - "description": "min=1, mean=1.646, max=2.219, sum=8.23 (5)", - "tab": "General information", - "score": 1.6459798365122615 - } - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] - } - } - }, - { - "evaluation_name": "MedQA", - "source_data": { - "dataset_name": "MedQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MedQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.559, - "details": { - "description": "min=0.559, mean=0.559, max=0.559, sum=0.559 (1)", - "tab": "Accuracy", - "MedQA - Observed inference time (s)": { - "description": "min=0.763, mean=0.763, max=0.763, sum=0.763 (1)", - "tab": "Efficiency", - "score": 0.7633721221749399 - }, - "MedQA - # eval": { - "description": "min=503, mean=503, max=503, sum=503 (1)", - "tab": "General information", - "score": 503.0 - }, - "MedQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "MedQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MedQA - # prompt tokens": { - "description": "min=1092.437, mean=1092.437, max=1092.437, sum=1092.437 (1)", - "tab": "General information", - "score": 1092.4373757455269 - }, - "MedQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WMT 2014", - "source_data": { - "dataset_name": "WMT 2014", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.194, - "details": { - "description": "min=0.138, mean=0.194, max=0.24, sum=0.971 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time (s)": { - "description": "min=0.726, mean=0.772, max=0.838, sum=3.859 (5)", - "tab": "Efficiency", - "score": 0.7717107724915095 - }, - "WMT 2014 - # eval": { - "description": "min=503, mean=568.8, max=832, sum=2844 (5)", - "tab": "General information", - "score": 568.8 - }, - "WMT 2014 - # train": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "WMT 2014 - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "WMT 2014 - # prompt tokens": { - "description": "min=197.406, 
mean=218.573, max=240.974, sum=1092.866 (5)", - "tab": "General information", - "score": 218.57322077152472 - }, - "WMT 2014 - # output tokens": { - "description": "min=24.177, mean=25.579, max=26.326, sum=127.893 (5)", - "tab": "General information", - "score": 25.578513056277718 - } - } - }, - "generation_config": { - "additional_details": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_lite/anthropic/claude-v1.3/b2572ef8-446a-45b4-b557-45736418753b.json b/data/helm_lite/anthropic/claude-v1.3/b2572ef8-446a-45b4-b557-45736418753b.json deleted file mode 100644 index da3e6b3b3..000000000 --- a/data/helm_lite/anthropic/claude-v1.3/b2572ef8-446a-45b4-b557-45736418753b.json +++ /dev/null @@ -1,641 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_lite/anthropic_claude-v1.3/1770834614.1822479", - "retrieved_timestamp": "1770834614.1822479", - "source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Claude v1.3", - "id": "anthropic/claude-v1.3", - "developer": "anthropic", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_lite", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.518, - "details": { - "tab": "Accuracy", - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.09352059925093632 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.723, - "details": { - "description": "min=0.723, mean=0.723, max=0.723, sum=0.723 (1)", - "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": { - "description": "min=6.114, mean=6.114, max=6.114, sum=6.114 (1)", - "tab": "Efficiency", - "score": 6.113923052666893 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=3709.741, mean=3709.741, max=3709.741, sum=3709.741 (1)", - "tab": "General information", - "score": 3709.7408450704224 - }, - "NarrativeQA - # output tokens": { - "description": 
"min=9.338, mean=9.338, max=9.338, sum=9.338 (1)", - "tab": "General information", - "score": 9.338028169014084 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (closed-book)", - "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.409, - "details": { - "description": "min=0.409, mean=0.409, max=0.409, sum=0.409 (1)", - "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time (s)": { - "description": "min=3.523, mean=3.523, max=3.523, sum=3.523 (1)", - "tab": "Efficiency", - "score": 3.5226667501174913 - }, - "NaturalQuestions (closed-book) - Observed inference time (s)": { - "description": "min=2.059, mean=2.059, max=2.059, sum=2.059 (1)", - "tab": "Efficiency", - "score": 2.0589215233325957 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.964, mean=4.964, max=4.964, sum=4.964 (1)", - "tab": "General information", - "score": 4.964 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.007, mean=0.007, max=0.007, sum=0.007 (1)", - "tab": "General information", - "score": 0.007 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1734.363, mean=1734.363, max=1734.363, sum=1734.363 (1)", - "tab": "General information", - "score": 1734.363 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=4.973, mean=4.973, max=4.973, sum=4.973 (1)", - "tab": "General information", - "score": 4.973 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=189.259, mean=189.259, max=189.259, sum=189.259 (1)", - "tab": "General information", - "score": 189.259 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=3.722, mean=3.722, max=3.722, sum=3.722 (1)", - "tab": "General information", - "score": 3.722 - } - } - }, - "generation_config": { - "additional_details": { - "mode": "closedbook" - } - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.908, - "details": { - "description": "min=0.908, mean=0.908, max=0.908, sum=0.908 (1)", - "tab": 
"Accuracy", - "OpenbookQA - Observed inference time (s)": { - "description": "min=3.375, mean=3.375, max=3.375, sum=3.375 (1)", - "tab": "Efficiency", - "score": 3.375496371269226 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=328.79, mean=328.79, max=328.79, sum=328.79 (1)", - "tab": "General information", - "score": 328.79 - }, - "OpenbookQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.631, - "details": { - "description": "min=0.35, mean=0.631, max=0.93, sum=3.155 (5)", - "tab": "Accuracy", - "MMLU - Observed inference time (s)": { - "description": "min=1.228, mean=1.482, max=1.741, sum=7.41 (5)", - "tab": "Efficiency", - "score": 1.4820951028288456 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=435.26, mean=543.747, max=684.596, sum=2718.736 (5)", - "tab": "General information", - "score": 543.747298245614 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MATH", - "source_data": { - "dataset_name": "MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.54, - "details": { - "description": "min=0.368, mean=0.54, max=0.826, sum=3.783 (7)", - "tab": "Accuracy", - "MATH - Observed inference time (s)": { - "description": "min=3.85, mean=6.109, max=8.225, sum=42.762 (7)", - "tab": "Efficiency", - "score": 6.10879439056091 - }, - "MATH - # eval": { - "description": "min=30, mean=62.429, max=135, sum=437 (7)", - "tab": "General information", - "score": 62.42857142857143 - }, - 
"MATH - # train": { - "description": "min=8, mean=8, max=8, sum=56 (7)", - "tab": "General information", - "score": 8.0 - }, - "MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "MATH - # prompt tokens": { - "description": "min=947.259, mean=1361.814, max=2379.808, sum=9532.699 (7)", - "tab": "General information", - "score": 1361.8141219676104 - }, - "MATH - # output tokens": { - "description": "min=53.133, mean=79.493, max=97.564, sum=556.452 (7)", - "tab": "General information", - "score": 79.49312981320325 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" - } - } - }, - { - "evaluation_name": "GSM8K", - "source_data": { - "dataset_name": "GSM8K", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on GSM8K", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.784, - "details": { - "description": "min=0.784, mean=0.784, max=0.784, sum=0.784 (1)", - "tab": "Accuracy", - "GSM8K - Observed inference time (s)": { - "description": "min=6.653, mean=6.653, max=6.653, sum=6.653 (1)", - "tab": "Efficiency", - "score": 6.653211696863174 - }, - "GSM8K - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "GSM8K - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "GSM8K - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GSM8K - # prompt tokens": { - "description": "min=1012.712, mean=1012.712, max=1012.712, sum=1012.712 (1)", - "tab": "General information", - "score": 1012.712 - }, - "GSM8K - # output tokens": { - "description": "min=104.726, mean=104.726, max=104.726, sum=104.726 (1)", - "tab": "General information", - "score": 104.726 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "LegalBench", - "source_data": { - "dataset_name": "LegalBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on LegalBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.629, - "details": { - "description": "min=0.417, mean=0.629, max=0.916, sum=3.147 (5)", - "tab": "Accuracy", - "LegalBench - Observed inference time (s)": { - "description": "min=1.081, mean=3.536, max=8.614, sum=17.681 (5)", - "tab": "Efficiency", - "score": 3.536136101917547 - }, - "LegalBench - # eval": { - "description": "min=95, mean=409.4, max=1000, sum=2047 (5)", - "tab": "General information", - "score": 409.4 - }, - "LegalBench - # train": { - "description": "min=4, mean=4.798, max=5, sum=23.99 (5)", - "tab": "General information", - "score": 4.797959183673469 - }, - "LegalBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": 
"General information", - "score": 0.0 - }, - "LegalBench - # prompt tokens": { - "description": "min=280.653, mean=1621.356, max=6484.969, sum=8106.779 (5)", - "tab": "General information", - "score": 1621.3558670820687 - }, - "LegalBench - # output tokens": { - "description": "min=1, mean=1.354, max=2.232, sum=6.771 (5)", - "tab": "General information", - "score": 1.3542176968306323 - } - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] - } - } - }, - { - "evaluation_name": "MedQA", - "source_data": { - "dataset_name": "MedQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MedQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.618, - "details": { - "description": "min=0.618, mean=0.618, max=0.618, sum=0.618 (1)", - "tab": "Accuracy", - "MedQA - Observed inference time (s)": { - "description": "min=3.39, mean=3.39, max=3.39, sum=3.39 (1)", - "tab": "Efficiency", - "score": 3.3901417141643244 - }, - "MedQA - # eval": { - "description": "min=503, mean=503, max=503, sum=503 (1)", - "tab": "General information", - "score": 503.0 - }, - "MedQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "MedQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MedQA - # prompt tokens": { - "description": "min=1092.437, mean=1092.437, max=1092.437, sum=1092.437 (1)", - "tab": "General information", - "score": 1092.4373757455269 - }, - "MedQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WMT 2014", - "source_data": { - "dataset_name": "WMT 2014", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.219, - "details": { - "description": "min=0.152, mean=0.219, max=0.28, sum=1.093 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time (s)": { - "description": "min=1.391, mean=2.232, max=3.755, sum=11.161 (5)", - "tab": "Efficiency", - "score": 2.232213549153336 - }, - "WMT 2014 - # eval": { - "description": "min=503, mean=568.8, max=832, sum=2844 (5)", - "tab": "General information", - "score": 568.8 - }, - "WMT 2014 - # train": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "WMT 2014 - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "WMT 2014 - # prompt tokens": { - "description": "min=197.406, mean=218.573, max=240.974, sum=1092.866 (5)", - "tab": "General information", - "score": 218.57322077152472 - }, - "WMT 2014 - # output tokens": { - "description": "min=24.004, mean=25.611, max=26.28, sum=128.057 (5)", - "tab": "General information", - 
"score": 25.611364027374215 - } - } - }, - "generation_config": { - "additional_details": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_lite/cohere/command-light/70d85516-b710-4b27-b664-03a6a822773b.json b/data/helm_lite/cohere/command-light/70d85516-b710-4b27-b664-03a6a822773b.json deleted file mode 100644 index a431f3338..000000000 --- a/data/helm_lite/cohere/command-light/70d85516-b710-4b27-b664-03a6a822773b.json +++ /dev/null @@ -1,641 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_lite/cohere_command-light/1770834614.1822479", - "retrieved_timestamp": "1770834614.1822479", - "source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Command Light", - "id": "cohere/command-light", - "developer": "cohere", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_lite", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.105, - "details": { - "tab": "Accuracy", - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.46863920099875156 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.629, - "details": { - "description": "min=0.629, mean=0.629, max=0.629, sum=0.629 (1)", - "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": { - "description": "min=0.896, mean=0.896, max=0.896, sum=0.896 (1)", - "tab": "Efficiency", - "score": 0.8961316760157195 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=1.941, mean=1.941, max=1.941, sum=1.941 (1)", - "tab": "General information", - "score": 1.9408450704225353 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=1660.485, mean=1660.485, max=1660.485, sum=1660.485 (1)", - "tab": "General information", - "score": 1660.4845070422534 - }, - "NarrativeQA - # output tokens": { - "description": "min=10.814, mean=10.814, max=10.814, sum=10.814 (1)", - "tab": "General information", - "score": 10.814084507042253 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": 
"NaturalQuestions (closed-book)", - "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.195, - "details": { - "description": "min=0.195, mean=0.195, max=0.195, sum=0.195 (1)", - "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time (s)": { - "description": "min=1.08, mean=1.08, max=1.08, sum=1.08 (1)", - "tab": "Efficiency", - "score": 1.0799305574893951 - }, - "NaturalQuestions (closed-book) - Observed inference time (s)": { - "description": "min=0.696, mean=0.696, max=0.696, sum=0.696 (1)", - "tab": "Efficiency", - "score": 0.6957695767879486 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.617, mean=4.617, max=4.617, sum=4.617 (1)", - "tab": "General information", - "score": 4.617 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.039, mean=0.039, max=0.039, sum=0.039 (1)", - "tab": "General information", - "score": 0.039 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1557.639, mean=1557.639, max=1557.639, sum=1557.639 (1)", - "tab": "General information", - "score": 1557.639 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=10.869, mean=10.869, max=10.869, sum=10.869 (1)", - "tab": "General information", - "score": 10.869 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=115.191, mean=115.191, max=115.191, sum=115.191 (1)", - "tab": "General information", - "score": 115.191 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=17.348, mean=17.348, max=17.348, sum=17.348 (1)", - "tab": "General information", - "score": 17.348 - } - } - }, - "generation_config": { - "additional_details": { - "mode": "closedbook" - } - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.398, - "details": { - "description": "min=0.398, mean=0.398, max=0.398, sum=0.398 (1)", - "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": { - "description": "min=0.705, mean=0.705, max=0.705, sum=0.705 (1)", - "tab": "Efficiency", - "score": 0.7049956932067871 - }, - "OpenbookQA - # 
eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=246.682, mean=246.682, max=246.682, sum=246.682 (1)", - "tab": "General information", - "score": 246.682 - }, - "OpenbookQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.386, - "details": { - "description": "min=0.25, mean=0.386, max=0.57, sum=1.928 (5)", - "tab": "Accuracy", - "MMLU - Observed inference time (s)": { - "description": "min=0.405, mean=0.749, max=1.412, sum=3.747 (5)", - "tab": "Efficiency", - "score": 0.7494988910942747 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=372.75, mean=481.26, max=628.421, sum=2406.301 (5)", - "tab": "General information", - "score": 481.26021052631575 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MATH", - "source_data": { - "dataset_name": "MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.098, - "details": { - "description": "min=0.026, mean=0.098, max=0.167, sum=0.687 (7)", - "tab": "Accuracy", - "MATH - Observed inference time (s)": { - "description": "min=1.821, mean=2.374, max=2.948, sum=16.62 (7)", - "tab": "Efficiency", - "score": 2.374249639604042 - }, - "MATH - # eval": { - "description": "min=30, mean=62.429, max=135, sum=437 (7)", - "tab": "General information", - "score": 62.42857142857143 - }, - "MATH - # train": { - "description": "min=2.962, mean=6.878, max=8, sum=48.146 (7)", - "tab": "General information", - "score": 6.877964141122035 - }, - "MATH - truncated": { - "description": 
"min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "MATH - # prompt tokens": { - "description": "min=925.333, mean=1177.329, max=1534.058, sum=8241.302 (7)", - "tab": "General information", - "score": 1177.3289276411065 - }, - "MATH - # output tokens": { - "description": "min=83.228, mean=106.589, max=137.692, sum=746.121 (7)", - "tab": "General information", - "score": 106.58875792143844 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" - } - } - }, - { - "evaluation_name": "GSM8K", - "source_data": { - "dataset_name": "GSM8K", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on GSM8K", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.149, - "details": { - "description": "min=0.149, mean=0.149, max=0.149, sum=0.149 (1)", - "tab": "Accuracy", - "GSM8K - Observed inference time (s)": { - "description": "min=1.751, mean=1.751, max=1.751, sum=1.751 (1)", - "tab": "Efficiency", - "score": 1.7514978868961335 - }, - "GSM8K - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "GSM8K - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "GSM8K - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GSM8K - # prompt tokens": { - "description": "min=942.424, mean=942.424, max=942.424, sum=942.424 (1)", - "tab": "General information", - "score": 942.424 - }, - "GSM8K - # output tokens": { - "description": "min=80.184, mean=80.184, max=80.184, sum=80.184 (1)", - "tab": "General information", - "score": 80.184 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "LegalBench", - "source_data": { - "dataset_name": "LegalBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on LegalBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.397, - "details": { - "description": "min=0.173, mean=0.397, max=0.874, sum=1.983 (5)", - "tab": "Accuracy", - "LegalBench - Observed inference time (s)": { - "description": "min=0.423, mean=0.783, max=1.232, sum=3.916 (5)", - "tab": "Efficiency", - "score": 0.7831334660572837 - }, - "LegalBench - # eval": { - "description": "min=95, mean=409.4, max=1000, sum=2047 (5)", - "tab": "General information", - "score": 409.4 - }, - "LegalBench - # train": { - "description": "min=0.388, mean=3.878, max=5, sum=19.388 (5)", - "tab": "General information", - "score": 3.8775510204081636 - }, - "LegalBench - truncated": { - "description": "min=0, mean=0.003, max=0.014, sum=0.014 (5)", - "tab": "General information", - "score": 0.002857142857142857 - }, - "LegalBench - # prompt tokens": { - "description": "min=205.295, mean=566.501, max=1529.327, 
sum=2832.507 (5)", - "tab": "General information", - "score": 566.5014751745068 - }, - "LegalBench - # output tokens": { - "description": "min=1.074, mean=6.64, max=23.614, sum=33.198 (5)", - "tab": "General information", - "score": 6.63968330089529 - } - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] - } - } - }, - { - "evaluation_name": "MedQA", - "source_data": { - "dataset_name": "MedQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MedQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.312, - "details": { - "description": "min=0.312, mean=0.312, max=0.312, sum=0.312 (1)", - "tab": "Accuracy", - "MedQA - Observed inference time (s)": { - "description": "min=0.896, mean=0.896, max=0.896, sum=0.896 (1)", - "tab": "Efficiency", - "score": 0.895831539901066 - }, - "MedQA - # eval": { - "description": "min=503, mean=503, max=503, sum=503 (1)", - "tab": "General information", - "score": 503.0 - }, - "MedQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "MedQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MedQA - # prompt tokens": { - "description": "min=1016.738, mean=1016.738, max=1016.738, sum=1016.738 (1)", - "tab": "General information", - "score": 1016.7375745526839 - }, - "MedQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WMT 2014", - "source_data": { - "dataset_name": "WMT 2014", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.023, - "details": { - "description": "min=0.0, mean=0.023, max=0.064, sum=0.113 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time (s)": { - "description": "min=0.712, mean=0.797, max=0.934, sum=3.983 (5)", - "tab": "Efficiency", - "score": 0.7965989762712353 - }, - "WMT 2014 - # eval": { - "description": "min=503, mean=568.8, max=832, sum=2844 (5)", - "tab": "General information", - "score": 568.8 - }, - "WMT 2014 - # train": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "WMT 2014 - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "WMT 2014 - # prompt tokens": { - "description": "min=129.757, mean=149.459, max=178.821, sum=747.297 (5)", - "tab": "General information", - "score": 149.45941179844013 - }, - "WMT 2014 - # output tokens": { - "description": "min=30.895, mean=39.885, max=47.65, sum=199.426 (5)", - "tab": "General information", - "score": 39.88511765942805 - } - } - }, - "generation_config": { - "additional_details": { - "language_pair": [ - "cs-en", - "de-en", - 
"fr-en", - "hi-en", - "ru-en" - ] - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_lite/cohere/command-r-plus/a8208df4-eb37-47d2-8845-f821e80e9858.json b/data/helm_lite/cohere/command-r-plus/a8208df4-eb37-47d2-8845-f821e80e9858.json deleted file mode 100644 index d0f464767..000000000 --- a/data/helm_lite/cohere/command-r-plus/a8208df4-eb37-47d2-8845-f821e80e9858.json +++ /dev/null @@ -1,643 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_lite/cohere_command-r-plus/1770834614.1822479", - "retrieved_timestamp": "1770834614.1822479", - "source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Command R Plus", - "id": "cohere/command-r-plus", - "developer": "cohere", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_lite", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.441, - "details": { - "tab": "Accuracy", - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.6927215980024969 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.735, - "details": { - "description": "min=0.735, mean=0.735, max=0.735, sum=0.735 (1)", - "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": { - "description": "min=0.659, mean=0.659, max=0.659, sum=0.659 (1)", - "tab": "Efficiency", - "score": 0.6590185803426823 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=3442.654, mean=3442.654, max=3442.654, sum=3442.654 (1)", - "tab": "General information", - "score": 3442.6535211267606 - }, - "NarrativeQA - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (closed-book)", - "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.343, - "details": { - "description": "min=0.343, mean=0.343, max=0.343, sum=0.343 (1)", - "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time (s)": { - "description": "min=0.48, mean=0.48, max=0.48, sum=0.48 (1)", - "tab": "Efficiency", - "score": 0.48011646389961243 - }, - "NaturalQuestions (closed-book) - Observed inference time (s)": { - "description": "min=0.217, mean=0.217, max=0.217, sum=0.217 (1)", - "tab": "Efficiency", - "score": 0.21743906450271605 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=2069.055, mean=2069.055, max=2069.055, sum=2069.055 (1)", - "tab": "General information", - "score": 2069.055 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=160.159, mean=160.159, max=160.159, sum=160.159 (1)", - "tab": "General information", - "score": 160.159 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "mode": "closedbook" - } - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.828, - "details": { - "description": "min=0.828, mean=0.828, max=0.828, sum=0.828 (1)", - "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": { - "description": "min=0.526, mean=0.526, max=0.526, sum=0.526 (1)", - "tab": "Efficiency", - "score": 0.5261325912475586 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General 
information", - "score": 5.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=260.678, mean=260.678, max=260.678, sum=260.678 (1)", - "tab": "General information", - "score": 260.678 - }, - "OpenbookQA - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.59, - "details": { - "description": "min=0.21, mean=0.59, max=0.89, sum=2.951 (5)", - "tab": "Accuracy", - "MMLU - Observed inference time (s)": { - "description": "min=0.26, mean=0.359, max=0.481, sum=1.797 (5)", - "tab": "Efficiency", - "score": 0.3594088048349347 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=397.66, mean=499.49, max=661.579, sum=2497.449 (5)", - "tab": "General information", - "score": 499.48978947368425 - }, - "MMLU - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MATH", - "source_data": { - "dataset_name": "MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.403, - "details": { - "description": "min=0.25, mean=0.403, max=0.607, sum=2.822 (7)", - "tab": "Accuracy", - "MATH - Observed inference time (s)": { - "description": "min=1.358, mean=1.792, max=2.877, sum=12.543 (7)", - "tab": "Efficiency", - "score": 1.7917883168992628 - }, - "MATH - # eval": { - "description": "min=30, mean=62.429, max=135, sum=437 (7)", - "tab": "General information", - "score": 62.42857142857143 - }, - "MATH - # train": { - "description": "min=8, mean=8, max=8, sum=56 (7)", - "tab": "General information", - "score": 8.0 - }, - "MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "MATH - # prompt tokens": { - "description": "min=974.156, mean=1406.107, max=2423.596, sum=9842.752 (7)", - "tab": "General information", - "score": 
1406.1074103714861 - }, - "MATH - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" - } - } - }, - { - "evaluation_name": "GSM8K", - "source_data": { - "dataset_name": "GSM8K", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on GSM8K", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.738, - "details": { - "description": "min=0.738, mean=0.738, max=0.738, sum=0.738 (1)", - "tab": "Accuracy", - "GSM8K - Observed inference time (s)": { - "description": "min=3.592, mean=3.592, max=3.592, sum=3.592 (1)", - "tab": "Efficiency", - "score": 3.5923334171772003 - }, - "GSM8K - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "GSM8K - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "GSM8K - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GSM8K - # prompt tokens": { - "description": "min=1158.893, mean=1158.893, max=1158.893, sum=1158.893 (1)", - "tab": "General information", - "score": 1158.893 - }, - "GSM8K - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "stop": "none" - } - } - }, - { - "evaluation_name": "LegalBench", - "source_data": { - "dataset_name": "LegalBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on LegalBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.672, - "details": { - "description": "min=0.428, mean=0.672, max=0.947, sum=3.358 (5)", - "tab": "Accuracy", - "LegalBench - Observed inference time (s)": { - "description": "min=0.193, mean=0.351, max=0.927, sum=1.754 (5)", - "tab": "Efficiency", - "score": 0.3508069759610481 - }, - "LegalBench - # eval": { - "description": "min=95, mean=409.4, max=1000, sum=2047 (5)", - "tab": "General information", - "score": 409.4 - }, - "LegalBench - # train": { - "description": "min=4, mean=4.8, max=5, sum=24 (5)", - "tab": "General information", - "score": 4.8 - }, - "LegalBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "LegalBench - # prompt tokens": { - "description": "min=223.126, mean=1582.617, max=6507.029, sum=7913.085 (5)", - "tab": "General information", - "score": 1582.6169819753743 - }, - "LegalBench - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "abercrombie", - 
"corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] - } - } - }, - { - "evaluation_name": "MedQA", - "source_data": { - "dataset_name": "MedQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MedQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.567, - "details": { - "description": "min=0.567, mean=0.567, max=0.567, sum=0.567 (1)", - "tab": "Accuracy", - "MedQA - Observed inference time (s)": { - "description": "min=0.631, mean=0.631, max=0.631, sum=0.631 (1)", - "tab": "Efficiency", - "score": 0.6308214294744533 - }, - "MedQA - # eval": { - "description": "min=503, mean=503, max=503, sum=503 (1)", - "tab": "General information", - "score": 503.0 - }, - "MedQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "MedQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MedQA - # prompt tokens": { - "description": "min=1062.905, mean=1062.905, max=1062.905, sum=1062.905 (1)", - "tab": "General information", - "score": 1062.9045725646124 - }, - "MedQA - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WMT 2014", - "source_data": { - "dataset_name": "WMT 2014", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.203, - "details": { - "description": "min=0.156, mean=0.203, max=0.233, sum=1.017 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time (s)": { - "description": "min=0.59, mean=0.644, max=0.742, sum=3.221 (5)", - "tab": "Efficiency", - "score": 0.6441886008863676 - }, - "WMT 2014 - # eval": { - "description": "min=503, mean=568.8, max=832, sum=2844 (5)", - "tab": "General information", - "score": 568.8 - }, - "WMT 2014 - # train": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "WMT 2014 - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "WMT 2014 - # prompt tokens": { - "description": "min=114.404, mean=127.944, max=146.584, sum=639.721 (5)", - "tab": "General information", - "score": 127.94422599021257 - }, - "WMT 2014 - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_lite/cohere/command-r/22cde248-40ab-43b0-a408-6d8b84692f22.json b/data/helm_lite/cohere/command-r/22cde248-40ab-43b0-a408-6d8b84692f22.json deleted file mode 100644 index 51821d155..000000000 --- a/data/helm_lite/cohere/command-r/22cde248-40ab-43b0-a408-6d8b84692f22.json +++ 
/dev/null @@ -1,643 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_lite/cohere_command-r/1770834614.1822479", - "retrieved_timestamp": "1770834614.1822479", - "source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Command R", - "id": "cohere/command-r", - "developer": "cohere", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_lite", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.299, - "details": { - "tab": "Accuracy", - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.9644069912609239 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.742, - "details": { - "description": "min=0.742, mean=0.742, max=0.742, sum=0.742 (1)", - "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": { - "description": "min=0.389, mean=0.389, max=0.389, sum=0.389 (1)", - "tab": "Efficiency", - "score": 0.3886059089445732 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=3442.654, mean=3442.654, max=3442.654, sum=3442.654 (1)", - "tab": "General information", - "score": 3442.6535211267606 - }, - "NarrativeQA - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (closed-book)", - "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.352, - "details": { - "description": "min=0.352, mean=0.352, max=0.352, sum=0.352 (1)", - "tab": "Accuracy", - "NaturalQuestions 
(open-book) - Observed inference time (s)": { - "description": "min=0.288, mean=0.288, max=0.288, sum=0.288 (1)", - "tab": "Efficiency", - "score": 0.2875482747554779 - }, - "NaturalQuestions (closed-book) - Observed inference time (s)": { - "description": "min=0.165, mean=0.165, max=0.165, sum=0.165 (1)", - "tab": "Efficiency", - "score": 0.16523362946510314 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=2069.055, mean=2069.055, max=2069.055, sum=2069.055 (1)", - "tab": "General information", - "score": 2069.055 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=160.159, mean=160.159, max=160.159, sum=160.159 (1)", - "tab": "General information", - "score": 160.159 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "mode": "closedbook" - } - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.782, - "details": { - "description": "min=0.782, mean=0.782, max=0.782, sum=0.782 (1)", - "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": { - "description": "min=0.15, mean=0.15, max=0.15, sum=0.15 (1)", - "tab": "Efficiency", - "score": 0.14960159301757814 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=260.678, mean=260.678, max=260.678, sum=260.678 (1)", - "tab": "General information", - "score": 260.678 - }, - "OpenbookQA - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - 
}, - "generation_config": { - "additional_details": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.567, - "details": { - "description": "min=0.33, mean=0.567, max=0.82, sum=2.836 (5)", - "tab": "Accuracy", - "MMLU - Observed inference time (s)": { - "description": "min=0.162, mean=0.173, max=0.185, sum=0.867 (5)", - "tab": "Efficiency", - "score": 0.17335561692923832 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=397.66, mean=499.49, max=661.579, sum=2497.449 (5)", - "tab": "General information", - "score": 499.48978947368425 - }, - "MMLU - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MATH", - "source_data": { - "dataset_name": "MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.266, - "details": { - "description": "min=0.158, mean=0.266, max=0.333, sum=1.861 (7)", - "tab": "Accuracy", - "MATH - Observed inference time (s)": { - "description": "min=0.659, mean=0.821, max=1.104, sum=5.745 (7)", - "tab": "Efficiency", - "score": 0.8207379439676702 - }, - "MATH - # eval": { - "description": "min=30, mean=62.429, max=135, sum=437 (7)", - "tab": "General information", - "score": 62.42857142857143 - }, - "MATH - # train": { - "description": "min=8, mean=8, max=8, sum=56 (7)", - "tab": "General information", - "score": 8.0 - }, - "MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "MATH - # prompt tokens": { - "description": "min=974.156, mean=1406.107, max=2423.596, sum=9842.752 (7)", - "tab": "General information", - "score": 1406.1074103714861 - }, - "MATH - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" - } - } - }, - { - 
"evaluation_name": "GSM8K", - "source_data": { - "dataset_name": "GSM8K", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on GSM8K", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.551, - "details": { - "description": "min=0.551, mean=0.551, max=0.551, sum=0.551 (1)", - "tab": "Accuracy", - "GSM8K - Observed inference time (s)": { - "description": "min=1.04, mean=1.04, max=1.04, sum=1.04 (1)", - "tab": "Efficiency", - "score": 1.0398468203544617 - }, - "GSM8K - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "GSM8K - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "GSM8K - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GSM8K - # prompt tokens": { - "description": "min=1158.893, mean=1158.893, max=1158.893, sum=1158.893 (1)", - "tab": "General information", - "score": 1158.893 - }, - "GSM8K - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "stop": "none" - } - } - }, - { - "evaluation_name": "LegalBench", - "source_data": { - "dataset_name": "LegalBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on LegalBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.507, - "details": { - "description": "min=0.211, mean=0.507, max=0.905, sum=2.534 (5)", - "tab": "Accuracy", - "LegalBench - Observed inference time (s)": { - "description": "min=0.151, mean=0.235, max=0.5, sum=1.174 (5)", - "tab": "Efficiency", - "score": 0.23478191454837286 - }, - "LegalBench - # eval": { - "description": "min=95, mean=409.4, max=1000, sum=2047 (5)", - "tab": "General information", - "score": 409.4 - }, - "LegalBench - # train": { - "description": "min=4, mean=4.8, max=5, sum=24 (5)", - "tab": "General information", - "score": 4.8 - }, - "LegalBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "LegalBench - # prompt tokens": { - "description": "min=223.126, mean=1582.617, max=6507.029, sum=7913.085 (5)", - "tab": "General information", - "score": 1582.6169819753743 - }, - "LegalBench - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] - } - } - }, - { - "evaluation_name": "MedQA", - "source_data": { - "dataset_name": "MedQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MedQA", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.555, - "details": { - "description": "min=0.555, mean=0.555, max=0.555, sum=0.555 (1)", - "tab": "Accuracy", - "MedQA - Observed inference time (s)": { - "description": "min=0.191, mean=0.191, max=0.191, sum=0.191 (1)", - "tab": "Efficiency", - "score": 0.19128861531585634 - }, - "MedQA - # eval": { - "description": "min=503, mean=503, max=503, sum=503 (1)", - "tab": "General information", - "score": 503.0 - }, - "MedQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "MedQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MedQA - # prompt tokens": { - "description": "min=1062.905, mean=1062.905, max=1062.905, sum=1062.905 (1)", - "tab": "General information", - "score": 1062.9045725646124 - }, - "MedQA - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WMT 2014", - "source_data": { - "dataset_name": "WMT 2014", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.149, - "details": { - "description": "min=0.107, mean=0.149, max=0.175, sum=0.746 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time (s)": { - "description": "min=0.308, mean=0.343, max=0.455, sum=1.715 (5)", - "tab": "Efficiency", - "score": 0.3429552388299011 - }, - "WMT 2014 - # eval": { - "description": "min=503, mean=568.8, max=832, sum=2844 (5)", - "tab": "General information", - "score": 568.8 - }, - "WMT 2014 - # train": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "WMT 2014 - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "WMT 2014 - # prompt tokens": { - "description": "min=114.404, mean=127.944, max=146.584, sum=639.721 (5)", - "tab": "General information", - "score": 127.94422599021257 - }, - "WMT 2014 - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_lite/cohere/command/b0f85fd8-cfab-4fe0-8b36-7ea97e99a023.json b/data/helm_lite/cohere/command/b0f85fd8-cfab-4fe0-8b36-7ea97e99a023.json deleted file mode 100644 index 488fa54b9..000000000 --- a/data/helm_lite/cohere/command/b0f85fd8-cfab-4fe0-8b36-7ea97e99a023.json +++ /dev/null @@ -1,641 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_lite/cohere_command/1770834614.1822479", - "retrieved_timestamp": "1770834614.1822479", - "source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Command", - "id": "cohere/command", - "developer": "cohere", - "inference_platform": "unknown" - }, - 
"evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_lite", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.327, - "details": { - "tab": "Accuracy", - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.21596754057428214 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.749, - "details": { - "description": "min=0.749, mean=0.749, max=0.749, sum=0.749 (1)", - "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": { - "description": "min=1.783, mean=1.783, max=1.783, sum=1.783 (1)", - "tab": "Efficiency", - "score": 1.783306110408944 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=1.941, mean=1.941, max=1.941, sum=1.941 (1)", - "tab": "General information", - "score": 1.9408450704225353 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=1660.485, mean=1660.485, max=1660.485, sum=1660.485 (1)", - "tab": "General information", - "score": 1660.4845070422534 - }, - "NarrativeQA - # output tokens": { - "description": "min=7.442, mean=7.442, max=7.442, sum=7.442 (1)", - "tab": "General information", - "score": 7.44225352112676 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (closed-book)", - "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.391, - "details": { - "description": "min=0.391, mean=0.391, max=0.391, sum=0.391 (1)", - "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time (s)": { - "description": "min=1.804, mean=1.804, max=1.804, sum=1.804 (1)", - "tab": "Efficiency", - "score": 1.8040301027297974 - }, - "NaturalQuestions (closed-book) - Observed inference time (s)": { - "description": "min=0.986, mean=0.986, max=0.986, sum=0.986 (1)", - "tab": "Efficiency", - "score": 0.9856750283241272 - }, - "NaturalQuestions (open-book) - # eval": { - 
"description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.617, mean=4.617, max=4.617, sum=4.617 (1)", - "tab": "General information", - "score": 4.617 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.039, mean=0.039, max=0.039, sum=0.039 (1)", - "tab": "General information", - "score": 0.039 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1557.639, mean=1557.639, max=1557.639, sum=1557.639 (1)", - "tab": "General information", - "score": 1557.639 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=8.461, mean=8.461, max=8.461, sum=8.461 (1)", - "tab": "General information", - "score": 8.461 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=115.191, mean=115.191, max=115.191, sum=115.191 (1)", - "tab": "General information", - "score": 115.191 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=5.679, mean=5.679, max=5.679, sum=5.679 (1)", - "tab": "General information", - "score": 5.679 - } - } - }, - "generation_config": { - "additional_details": { - "mode": "closedbook" - } - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.774, - "details": { - "description": "min=0.774, mean=0.774, max=0.774, sum=0.774 (1)", - "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": { - "description": "min=1.044, mean=1.044, max=1.044, sum=1.044 (1)", - "tab": "Efficiency", - "score": 1.0440752515792846 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=246.682, mean=246.682, max=246.682, sum=246.682 (1)", - "tab": "General information", - "score": 246.682 - }, - "OpenbookQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.525, - "details": { - "description": "min=0.27, mean=0.525, max=0.88, sum=2.626 (5)", - "tab": "Accuracy", - "MMLU - Observed inference time (s)": { - "description": "min=0.821, mean=1.08, max=1.384, sum=5.399 (5)", - "tab": "Efficiency", - "score": 1.0797608851633573 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=372.75, mean=481.26, max=628.421, sum=2406.301 (5)", - "tab": "General information", - "score": 481.26021052631575 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MATH", - "source_data": { - "dataset_name": "MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.236, - "details": { - "description": "min=0.1, mean=0.236, max=0.349, sum=1.652 (7)", - "tab": "Accuracy", - "MATH - Observed inference time (s)": { - "description": "min=4.562, mean=5.762, max=6.509, sum=40.337 (7)", - "tab": "Efficiency", - "score": 5.762416239357385 - }, - "MATH - # eval": { - "description": "min=30, mean=62.429, max=135, sum=437 (7)", - "tab": "General information", - "score": 62.42857142857143 - }, - "MATH - # train": { - "description": "min=2.962, mean=6.878, max=8, sum=48.146 (7)", - "tab": "General information", - "score": 6.877964141122035 - }, - "MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "MATH - # prompt tokens": { - "description": "min=925.333, mean=1177.329, max=1534.058, sum=8241.302 (7)", - "tab": "General information", - "score": 1177.3289276411065 - }, - "MATH - # output tokens": { - "description": "min=94.488, mean=116.49, max=135.115, sum=815.428 (7)", - "tab": "General information", - "score": 116.48968047229982 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" - } - } - }, - { - "evaluation_name": "GSM8K", - "source_data": { - "dataset_name": "GSM8K", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on GSM8K", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.452, - "details": { - "description": "min=0.452, mean=0.452, max=0.452, sum=0.452 (1)", - "tab": "Accuracy", - "GSM8K - Observed inference time (s)": { - "description": "min=4.127, mean=4.127, max=4.127, sum=4.127 (1)", - "tab": "Efficiency", - "score": 4.127378141641617 - }, - "GSM8K - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "GSM8K - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "GSM8K - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GSM8K - # prompt tokens": { - "description": "min=942.424, mean=942.424, max=942.424, sum=942.424 (1)", - "tab": "General information", - "score": 942.424 - }, - "GSM8K - # output tokens": { - "description": "min=94.43, mean=94.43, max=94.43, sum=94.43 (1)", - "tab": "General information", - "score": 94.43 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "LegalBench", - "source_data": { - "dataset_name": "LegalBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on LegalBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.578, - "details": { - "description": "min=0.365, mean=0.578, max=0.884, sum=2.888 (5)", - "tab": "Accuracy", - "LegalBench - Observed inference time (s)": { - "description": "min=0.856, mean=1.165, max=1.842, sum=5.823 (5)", - "tab": "Efficiency", - "score": 1.1646721122881132 - }, - "LegalBench - # eval": { - "description": "min=95, mean=409.4, max=1000, sum=2047 (5)", - "tab": "General information", - "score": 409.4 - }, - "LegalBench - # train": { - "description": "min=0.388, mean=3.878, max=5, sum=19.388 (5)", - "tab": "General information", - "score": 3.8775510204081636 - }, - "LegalBench - truncated": { - "description": "min=0, mean=0.003, max=0.014, sum=0.014 (5)", - "tab": "General information", - "score": 0.002857142857142857 - }, - "LegalBench - # prompt tokens": { - "description": "min=205.295, mean=566.501, max=1529.327, sum=2832.507 (5)", - "tab": "General information", - "score": 566.5014751745068 - }, - "LegalBench - # output tokens": { - "description": "min=1, mean=1.79, max=3.055, sum=8.948 (5)", - "tab": "General information", - "score": 1.7895877106155815 - } - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] - } - } - }, - { - "evaluation_name": "MedQA", - "source_data": { - "dataset_name": "MedQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MedQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 
- }, - "score_details": { - "score": 0.445, - "details": { - "description": "min=0.445, mean=0.445, max=0.445, sum=0.445 (1)", - "tab": "Accuracy", - "MedQA - Observed inference time (s)": { - "description": "min=1.234, mean=1.234, max=1.234, sum=1.234 (1)", - "tab": "Efficiency", - "score": 1.2344102347584416 - }, - "MedQA - # eval": { - "description": "min=503, mean=503, max=503, sum=503 (1)", - "tab": "General information", - "score": 503.0 - }, - "MedQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "MedQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MedQA - # prompt tokens": { - "description": "min=1016.738, mean=1016.738, max=1016.738, sum=1016.738 (1)", - "tab": "General information", - "score": 1016.7375745526839 - }, - "MedQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WMT 2014", - "source_data": { - "dataset_name": "WMT 2014", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.088, - "details": { - "description": "min=0.013, mean=0.088, max=0.151, sum=0.441 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time (s)": { - "description": "min=2.376, mean=2.894, max=3.133, sum=14.469 (5)", - "tab": "Efficiency", - "score": 2.8937741082134893 - }, - "WMT 2014 - # eval": { - "description": "min=503, mean=568.8, max=832, sum=2844 (5)", - "tab": "General information", - "score": 568.8 - }, - "WMT 2014 - # train": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "WMT 2014 - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "WMT 2014 - # prompt tokens": { - "description": "min=129.757, mean=149.459, max=178.821, sum=747.297 (5)", - "tab": "General information", - "score": 149.45941179844013 - }, - "WMT 2014 - # output tokens": { - "description": "min=27.65, mean=31.8, max=41.789, sum=159.002 (5)", - "tab": "General information", - "score": 31.800405260743236 - } - } - }, - "generation_config": { - "additional_details": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_lite/databricks/dbrx-instruct/ec27e9fc-166d-454b-90c7-2eb8195ae2e2.json b/data/helm_lite/databricks/dbrx-instruct/ec27e9fc-166d-454b-90c7-2eb8195ae2e2.json deleted file mode 100644 index 9dc0aa32d..000000000 --- a/data/helm_lite/databricks/dbrx-instruct/ec27e9fc-166d-454b-90c7-2eb8195ae2e2.json +++ /dev/null @@ -1,641 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_lite/databricks_dbrx-instruct/1770834614.1822479", - "retrieved_timestamp": "1770834614.1822479", - "source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "DBRX Instruct", - "id": "databricks/dbrx-instruct", - "developer": "databricks", - 
"inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_lite", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.289, - "details": { - "tab": "Accuracy", - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.5229588014981273 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.488, - "details": { - "description": "min=0.488, mean=0.488, max=0.488, sum=0.488 (1)", - "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": { - "description": "min=1.645, mean=1.645, max=1.645, sum=1.645 (1)", - "tab": "Efficiency", - "score": 1.6445875322315056 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=3522.67, mean=3522.67, max=3522.67, sum=3522.67 (1)", - "tab": "General information", - "score": 3522.6704225352114 - }, - "NarrativeQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (closed-book)", - "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.284, - "details": { - "description": "min=0.284, mean=0.284, max=0.284, sum=0.284 (1)", - "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time (s)": { - "description": "min=1.175, mean=1.175, max=1.175, sum=1.175 (1)", - "tab": "Efficiency", - "score": 1.1746999933719635 - }, - "NaturalQuestions (closed-book) - Observed inference time (s)": { - "description": "min=0.665, mean=0.665, max=0.665, sum=0.665 (1)", - "tab": "Efficiency", - "score": 0.6648788969516755 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, 
max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1762.593, mean=1762.593, max=1762.593, sum=1762.593 (1)", - "tab": "General information", - "score": 1762.593 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=173.127, mean=173.127, max=173.127, sum=173.127 (1)", - "tab": "General information", - "score": 173.127 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "mode": "closedbook" - } - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.91, - "details": { - "description": "min=0.91, mean=0.91, max=0.91, sum=0.91 (1)", - "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": { - "description": "min=0.328, mean=0.328, max=0.328, sum=0.328 (1)", - "tab": "Efficiency", - "score": 0.3277706532478333 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=242.782, mean=242.782, max=242.782, sum=242.782 (1)", - "tab": "General information", - "score": 242.782 - }, - "OpenbookQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": 
false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.643, - "details": { - "description": "min=0.34, mean=0.643, max=0.93, sum=3.215 (5)", - "tab": "Accuracy", - "MMLU - Observed inference time (s)": { - "description": "min=0.39, mean=0.412, max=0.432, sum=2.062 (5)", - "tab": "Efficiency", - "score": 0.41247134314921857 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=366.44, mean=460.72, max=607.43, sum=2303.6 (5)", - "tab": "General information", - "score": 460.71996491228066 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MATH", - "source_data": { - "dataset_name": "MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.358, - "details": { - "description": "min=0.015, mean=0.358, max=0.553, sum=2.509 (7)", - "tab": "Accuracy", - "MATH - Observed inference time (s)": { - "description": "min=0.531, mean=2.305, max=3.852, sum=16.138 (7)", - "tab": "Efficiency", - "score": 2.305378989452493 - }, - "MATH - # eval": { - "description": "min=30, mean=62.429, max=135, sum=437 (7)", - "tab": "General information", - "score": 62.42857142857143 - }, - "MATH - # train": { - "description": "min=8, mean=8, max=8, sum=56 (7)", - "tab": "General information", - "score": 8.0 - }, - "MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "MATH - # prompt tokens": { - "description": "min=942.363, mean=1323.911, max=2258.577, sum=9267.376 (7)", - "tab": "General information", - "score": 1323.910874184069 - }, - "MATH - # output tokens": { - "description": "min=1, mean=1, max=1, sum=7 (7)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" - } - } - }, - { - "evaluation_name": "GSM8K", - "source_data": { - "dataset_name": "GSM8K", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on GSM8K", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.671, - "details": { - "description": 
"min=0.671, mean=0.671, max=0.671, sum=0.671 (1)", - "tab": "Accuracy", - "GSM8K - Observed inference time (s)": { - "description": "min=2.384, mean=2.384, max=2.384, sum=2.384 (1)", - "tab": "Efficiency", - "score": 2.3839432048797606 - }, - "GSM8K - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "GSM8K - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "GSM8K - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GSM8K - # prompt tokens": { - "description": "min=1020.035, mean=1020.035, max=1020.035, sum=1020.035 (1)", - "tab": "General information", - "score": 1020.035 - }, - "GSM8K - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "LegalBench", - "source_data": { - "dataset_name": "LegalBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on LegalBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.426, - "details": { - "description": "min=0.053, mean=0.426, max=0.755, sum=2.13 (5)", - "tab": "Accuracy", - "LegalBench - Observed inference time (s)": { - "description": "min=0.366, mean=0.733, max=1.771, sum=3.667 (5)", - "tab": "Efficiency", - "score": 0.73349196183029 - }, - "LegalBench - # eval": { - "description": "min=95, mean=409.4, max=1000, sum=2047 (5)", - "tab": "General information", - "score": 409.4 - }, - "LegalBench - # train": { - "description": "min=4, mean=4.8, max=5, sum=24 (5)", - "tab": "General information", - "score": 4.8 - }, - "LegalBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "LegalBench - # prompt tokens": { - "description": "min=253.442, mean=1570.163, max=6357.388, sum=7850.815 (5)", - "tab": "General information", - "score": 1570.162971355988 - }, - "LegalBench - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] - } - } - }, - { - "evaluation_name": "MedQA", - "source_data": { - "dataset_name": "MedQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MedQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.694, - "details": { - "description": "min=0.694, mean=0.694, max=0.694, sum=0.694 (1)", - "tab": "Accuracy", - "MedQA - Observed inference time (s)": { - "description": "min=0.438, mean=0.438, max=0.438, sum=0.438 (1)", - "tab": "Efficiency", - "score": 0.4383622557221066 - }, - "MedQA - # eval": { - "description": "min=503, mean=503, max=503, sum=503 (1)", - "tab": "General information", - "score": 503.0 - }, - 
"MedQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "MedQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MedQA - # prompt tokens": { - "description": "min=1020.414, mean=1020.414, max=1020.414, sum=1020.414 (1)", - "tab": "General information", - "score": 1020.4135188866799 - }, - "MedQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WMT 2014", - "source_data": { - "dataset_name": "WMT 2014", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.131, - "details": { - "description": "min=0.035, mean=0.131, max=0.192, sum=0.656 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time (s)": { - "description": "min=0.849, mean=1.059, max=1.342, sum=5.297 (5)", - "tab": "Efficiency", - "score": 1.0594140760888837 - }, - "WMT 2014 - # eval": { - "description": "min=503, mean=568.8, max=832, sum=2844 (5)", - "tab": "General information", - "score": 568.8 - }, - "WMT 2014 - # train": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "WMT 2014 - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "WMT 2014 - # prompt tokens": { - "description": "min=169.901, mean=193.043, max=213.185, sum=965.213 (5)", - "tab": "General information", - "score": 193.04258583116683 - }, - "WMT 2014 - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_lite/deepseek-ai/deepseek-llm-67b-chat/8721a15b-9102-4b1a-bde8-e5371f00f1b5.json b/data/helm_lite/deepseek-ai/deepseek-llm-67b-chat/8721a15b-9102-4b1a-bde8-e5371f00f1b5.json deleted file mode 100644 index 201ddf6e5..000000000 --- a/data/helm_lite/deepseek-ai/deepseek-llm-67b-chat/8721a15b-9102-4b1a-bde8-e5371f00f1b5.json +++ /dev/null @@ -1,641 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_lite/deepseek-ai_deepseek-llm-67b-chat/1770834614.1822479", - "retrieved_timestamp": "1770834614.1822479", - "source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "DeepSeek LLM Chat 67B", - "id": "deepseek-ai/deepseek-llm-67b-chat", - "developer": "deepseek-ai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_lite", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", 
- "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.488, - "details": { - "tab": "Accuracy", - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.30021223470661673 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.581, - "details": { - "description": "min=0.581, mean=0.581, max=0.581, sum=0.581 (1)", - "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": { - "description": "min=3.36, mean=3.36, max=3.36, sum=3.36 (1)", - "tab": "Efficiency", - "score": 3.359551859573579 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=4.946, mean=4.946, max=4.946, sum=4.946 (1)", - "tab": "General information", - "score": 4.946478873239436 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=3583.146, mean=3583.146, max=3583.146, sum=3583.146 (1)", - "tab": "General information", - "score": 3583.1464788732396 - }, - "NarrativeQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (closed-book)", - "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.412, - "details": { - "description": "min=0.412, mean=0.412, max=0.412, sum=0.412 (1)", - "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time (s)": { - "description": "min=2.237, mean=2.237, max=2.237, sum=2.237 (1)", - "tab": "Efficiency", - "score": 2.2367931361198425 - }, - "NaturalQuestions (closed-book) - Observed inference time (s)": { - "description": "min=0.857, mean=0.857, max=0.857, sum=0.857 (1)", - "tab": "Efficiency", - "score": 0.8567402980327606 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.841, mean=4.841, max=4.841, sum=4.841 (1)", - "tab": "General information", - "score": 4.841 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.024, mean=0.024, max=0.024, sum=0.024 (1)", - "tab": "General information", - 
"score": 0.024 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=2192.734, mean=2192.734, max=2192.734, sum=2192.734 (1)", - "tab": "General information", - "score": 2192.734 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=199.39, mean=199.39, max=199.39, sum=199.39 (1)", - "tab": "General information", - "score": 199.39 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "mode": "closedbook" - } - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.88, - "details": { - "description": "min=0.88, mean=0.88, max=0.88, sum=0.88 (1)", - "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": { - "description": "min=0.417, mean=0.417, max=0.417, sum=0.417 (1)", - "tab": "Efficiency", - "score": 0.41702947664260864 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=253.206, mean=253.206, max=253.206, sum=253.206 (1)", - "tab": "General information", - "score": 253.206 - }, - "OpenbookQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.641, - "details": { - "description": "min=0.44, mean=0.641, max=0.91, sum=3.203 (5)", - "tab": "Accuracy", - "MMLU - Observed inference time (s)": { - "description": "min=0.48, mean=0.508, max=0.551, sum=2.542 (5)", - "tab": "Efficiency", - "score": 
0.508463426874395 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=382.07, mean=490.941, max=646.667, sum=2454.707 (5)", - "tab": "General information", - "score": 490.9413333333334 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MATH", - "source_data": { - "dataset_name": "MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.615, - "details": { - "description": "min=0.456, mean=0.615, max=0.748, sum=4.304 (7)", - "tab": "Accuracy", - "MATH - Observed inference time (s)": { - "description": "min=3.389, mean=4.443, max=6.234, sum=31.098 (7)", - "tab": "Efficiency", - "score": 4.442596748084942 - }, - "MATH - # eval": { - "description": "min=30, mean=62.429, max=135, sum=437 (7)", - "tab": "General information", - "score": 62.42857142857143 - }, - "MATH - # train": { - "description": "min=8, mean=8, max=8, sum=56 (7)", - "tab": "General information", - "score": 8.0 - }, - "MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "MATH - # prompt tokens": { - "description": "min=1012.548, mean=1443.29, max=2448.25, sum=10103.027 (7)", - "tab": "General information", - "score": 1443.2895059403625 - }, - "MATH - # output tokens": { - "description": "min=1, mean=1, max=1, sum=7 (7)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" - } - } - }, - { - "evaluation_name": "GSM8K", - "source_data": { - "dataset_name": "GSM8K", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on GSM8K", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.795, - "details": { - "description": "min=0.795, mean=0.795, max=0.795, sum=0.795 (1)", - "tab": "Accuracy", - "GSM8K - Observed inference time (s)": { - "description": "min=5.877, mean=5.877, max=5.877, sum=5.877 (1)", - "tab": "Efficiency", - "score": 5.876643376111984 - }, - "GSM8K - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - 
"score": 1000.0 - }, - "GSM8K - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "GSM8K - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GSM8K - # prompt tokens": { - "description": "min=1233.708, mean=1233.708, max=1233.708, sum=1233.708 (1)", - "tab": "General information", - "score": 1233.708 - }, - "GSM8K - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "LegalBench", - "source_data": { - "dataset_name": "LegalBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on LegalBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.637, - "details": { - "description": "min=0.45, mean=0.637, max=0.821, sum=3.183 (5)", - "tab": "Accuracy", - "LegalBench - Observed inference time (s)": { - "description": "min=0.524, mean=0.942, max=2.301, sum=4.71 (5)", - "tab": "Efficiency", - "score": 0.9420770218153176 - }, - "LegalBench - # eval": { - "description": "min=95, mean=409.4, max=1000, sum=2047 (5)", - "tab": "General information", - "score": 409.4 - }, - "LegalBench - # train": { - "description": "min=2.006, mean=4.201, max=5, sum=21.006 (5)", - "tab": "General information", - "score": 4.201224489795918 - }, - "LegalBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "LegalBench - # prompt tokens": { - "description": "min=269.379, mean=990.259, max=3325.551, sum=4951.297 (5)", - "tab": "General information", - "score": 990.259348667894 - }, - "LegalBench - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] - } - } - }, - { - "evaluation_name": "MedQA", - "source_data": { - "dataset_name": "MedQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MedQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.628, - "details": { - "description": "min=0.628, mean=0.628, max=0.628, sum=0.628 (1)", - "tab": "Accuracy", - "MedQA - Observed inference time (s)": { - "description": "min=0.83, mean=0.83, max=0.83, sum=0.83 (1)", - "tab": "Efficiency", - "score": 0.8296676231899982 - }, - "MedQA - # eval": { - "description": "min=503, mean=503, max=503, sum=503 (1)", - "tab": "General information", - "score": 503.0 - }, - "MedQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "MedQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MedQA - # prompt tokens": { - "description": "min=1084.235, mean=1084.235, 
max=1084.235, sum=1084.235 (1)", - "tab": "General information", - "score": 1084.234592445328 - }, - "MedQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WMT 2014", - "source_data": { - "dataset_name": "WMT 2014", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.186, - "details": { - "description": "min=0.11, mean=0.186, max=0.236, sum=0.932 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time (s)": { - "description": "min=1.381, mean=1.429, max=1.464, sum=7.147 (5)", - "tab": "Efficiency", - "score": 1.429440071817079 - }, - "WMT 2014 - # eval": { - "description": "min=503, mean=568.8, max=832, sum=2844 (5)", - "tab": "General information", - "score": 568.8 - }, - "WMT 2014 - # train": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "WMT 2014 - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "WMT 2014 - # prompt tokens": { - "description": "min=203.736, mean=220.291, max=255.861, sum=1101.453 (5)", - "tab": "General information", - "score": 220.29060445022174 - }, - "WMT 2014 - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_lite/deepseek-ai/deepseek-v3/23b3a30c-8aa3-4684-be54-adae003720fc.json b/data/helm_lite/deepseek-ai/deepseek-v3/23b3a30c-8aa3-4684-be54-adae003720fc.json deleted file mode 100644 index b5f8e240f..000000000 --- a/data/helm_lite/deepseek-ai/deepseek-v3/23b3a30c-8aa3-4684-be54-adae003720fc.json +++ /dev/null @@ -1,643 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_lite/deepseek-ai_deepseek-v3/1770834614.1822479", - "retrieved_timestamp": "1770834614.1822479", - "source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "DeepSeek v3", - "id": "deepseek-ai/deepseek-v3", - "developer": "deepseek-ai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_lite", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.908, - "details": { - "tab": "Accuracy", - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.11454431960049938 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - 
"score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.796, - "details": { - "description": "min=0.796, mean=0.796, max=0.796, sum=0.796 (1)", - "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": { - "description": "min=6.44, mean=6.44, max=6.44, sum=6.44 (1)", - "tab": "Efficiency", - "score": 6.440373906954913 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=3487.045, mean=3487.045, max=3487.045, sum=3487.045 (1)", - "tab": "General information", - "score": 3487.045070422535 - }, - "NarrativeQA - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (closed-book)", - "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.467, - "details": { - "description": "min=0.467, mean=0.467, max=0.467, sum=0.467 (1)", - "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time (s)": { - "description": "min=5.606, mean=5.606, max=5.606, sum=5.606 (1)", - "tab": "Efficiency", - "score": 5.605930573940277 - }, - "NaturalQuestions (closed-book) - Observed inference time (s)": { - "description": "min=2.183, mean=2.183, max=2.183, sum=2.183 (1)", - "tab": "Efficiency", - "score": 2.1832692058086396 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1756.178, mean=1756.178, max=1756.178, sum=1756.178 (1)", - "tab": "General information", - "score": 1756.178 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, 
sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=131.205, mean=131.205, max=131.205, sum=131.205 (1)", - "tab": "General information", - "score": 131.205 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "mode": "closedbook" - } - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.954, - "details": { - "description": "min=0.954, mean=0.954, max=0.954, sum=0.954 (1)", - "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": { - "description": "min=1.746, mean=1.746, max=1.746, sum=1.746 (1)", - "tab": "Efficiency", - "score": 1.746311339378357 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=245.494, mean=245.494, max=245.494, sum=245.494 (1)", - "tab": "General information", - "score": 245.494 - }, - "OpenbookQA - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.803, - "details": { - "description": "min=0.65, mean=0.803, max=0.92, sum=4.016 (5)", - "tab": "Accuracy", - "MMLU - Observed inference time (s)": { - "description": "min=0.545, mean=0.564, max=0.585, sum=2.818 (5)", - "tab": "Efficiency", - "score": 0.5636642604125173 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - 
"description": "min=373.01, mean=465.871, max=613.535, sum=2329.355 (5)", - "tab": "General information", - "score": 465.8710175438597 - }, - "MMLU - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MATH", - "source_data": { - "dataset_name": "MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.912, - "details": { - "description": "min=0.816, mean=0.912, max=0.985, sum=6.385 (7)", - "tab": "Accuracy", - "MATH - Observed inference time (s)": { - "description": "min=7.691, mean=9.449, max=13.451, sum=66.142 (7)", - "tab": "Efficiency", - "score": 9.448914254379945 - }, - "MATH - # eval": { - "description": "min=30, mean=62.429, max=135, sum=437 (7)", - "tab": "General information", - "score": 62.42857142857143 - }, - "MATH - # train": { - "description": "min=8, mean=8, max=8, sum=56 (7)", - "tab": "General information", - "score": 8.0 - }, - "MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "MATH - # prompt tokens": { - "description": "min=853.923, mean=1245.725, max=2184.846, sum=8720.075 (7)", - "tab": "General information", - "score": 1245.7249665607071 - }, - "MATH - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" - } - } - }, - { - "evaluation_name": "GSM8K", - "source_data": { - "dataset_name": "GSM8K", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on GSM8K", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.94, - "details": { - "description": "min=0.94, mean=0.94, max=0.94, sum=0.94 (1)", - "tab": "Accuracy", - "GSM8K - Observed inference time (s)": { - "description": "min=9.77, mean=9.77, max=9.77, sum=9.77 (1)", - "tab": "Efficiency", - "score": 9.76988450360298 - }, - "GSM8K - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "GSM8K - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "GSM8K - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GSM8K - # prompt tokens": { - "description": "min=961.041, mean=961.041, max=961.041, sum=961.041 (1)", - "tab": "General information", - "score": 961.041 - }, - "GSM8K - # output 
tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "stop": "none" - } - } - }, - { - "evaluation_name": "LegalBench", - "source_data": { - "dataset_name": "LegalBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on LegalBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.718, - "details": { - "description": "min=0.425, mean=0.718, max=0.968, sum=3.589 (5)", - "tab": "Accuracy", - "LegalBench - Observed inference time (s)": { - "description": "min=0.566, mean=3.113, max=6.6, sum=15.563 (5)", - "tab": "Efficiency", - "score": 3.1125569474549435 - }, - "LegalBench - # eval": { - "description": "min=95, mean=409.4, max=1000, sum=2047 (5)", - "tab": "General information", - "score": 409.4 - }, - "LegalBench - # train": { - "description": "min=4, mean=4.8, max=5, sum=24 (5)", - "tab": "General information", - "score": 4.8 - }, - "LegalBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "LegalBench - # prompt tokens": { - "description": "min=198.516, mean=1498.765, max=6226.967, sum=7493.826 (5)", - "tab": "General information", - "score": 1498.7652695311654 - }, - "LegalBench - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] - } - } - }, - { - "evaluation_name": "MedQA", - "source_data": { - "dataset_name": "MedQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MedQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.809, - "details": { - "description": "min=0.809, mean=0.809, max=0.809, sum=0.809 (1)", - "tab": "Accuracy", - "MedQA - Observed inference time (s)": { - "description": "min=1.79, mean=1.79, max=1.79, sum=1.79 (1)", - "tab": "Efficiency", - "score": 1.790037025751224 - }, - "MedQA - # eval": { - "description": "min=503, mean=503, max=503, sum=503 (1)", - "tab": "General information", - "score": 503.0 - }, - "MedQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "MedQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MedQA - # prompt tokens": { - "description": "min=985.93, mean=985.93, max=985.93, sum=985.93 (1)", - "tab": "General information", - "score": 985.9304174950298 - }, - "MedQA - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WMT 2014", - "source_data": { - "dataset_name": "WMT 2014", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.209, - "details": { - "description": "min=0.163, mean=0.209, max=0.252, sum=1.046 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time (s)": { - "description": "min=2.231, mean=2.677, max=3.02, sum=13.384 (5)", - "tab": "Efficiency", - "score": 2.6768779265693037 - }, - "WMT 2014 - # eval": { - "description": "min=503, mean=568.8, max=832, sum=2844 (5)", - "tab": "General information", - "score": 568.8 - }, - "WMT 2014 - # train": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "WMT 2014 - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "WMT 2014 - # prompt tokens": { - "description": "min=103.739, mean=118.596, max=138.616, sum=592.982 (5)", - "tab": "General information", - "score": 118.59634548478361 - }, - "WMT 2014 - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_lite/google/gemini-1.0-pro-002/7022c444-d6b8-4374-be0c-14835e5fd281.json b/data/helm_lite/google/gemini-1.0-pro-002/7022c444-d6b8-4374-be0c-14835e5fd281.json deleted file mode 100644 index eabdc0bbd..000000000 --- a/data/helm_lite/google/gemini-1.0-pro-002/7022c444-d6b8-4374-be0c-14835e5fd281.json +++ /dev/null @@ -1,643 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_lite/google_gemini-1.0-pro-002/1770834614.1822479", - "retrieved_timestamp": "1770834614.1822479", - "source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gemini 1.0 Pro 002", - "id": "google/gemini-1.0-pro-002", - "developer": "google", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_lite", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.422, - "details": { - "tab": "Accuracy", - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.6464918851435706 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.751, - "details": { - "description": "min=0.751, mean=0.751, max=0.751, sum=0.751 (1)", - "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": { - "description": "min=0.679, mean=0.679, max=0.679, sum=0.679 (1)", - "tab": "Efficiency", - "score": 0.6791302858934104 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=3447.994, mean=3447.994, max=3447.994, sum=3447.994 (1)", - "tab": "General information", - "score": 3447.994366197183 - }, - "NarrativeQA - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (closed-book)", - "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.391, - "details": { - "description": "min=0.391, mean=0.391, max=0.391, sum=0.391 (1)", - "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time (s)": { - "description": "min=0.609, mean=0.609, max=0.609, sum=0.609 (1)", - "tab": "Efficiency", - "score": 0.6086829407215119 - }, - "NaturalQuestions (closed-book) - Observed inference time (s)": { - "description": "min=0.597, mean=0.597, max=0.597, sum=0.597 (1)", - "tab": "Efficiency", - "score": 0.5965619602203369 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1978.347, mean=1978.347, max=1978.347, sum=1978.347 (1)", - "tab": "General information", - "score": 1978.347 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) 
- # prompt tokens": { - "description": "min=153.995, mean=153.995, max=153.995, sum=153.995 (1)", - "tab": "General information", - "score": 153.995 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "mode": "closedbook" - } - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.788, - "details": { - "description": "min=0.788, mean=0.788, max=0.788, sum=0.788 (1)", - "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": { - "description": "min=0.43, mean=0.43, max=0.43, sum=0.43 (1)", - "tab": "Efficiency", - "score": 0.4301223816871643 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=248.508, mean=248.508, max=248.508, sum=248.508 (1)", - "tab": "General information", - "score": 248.508 - }, - "OpenbookQA - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.534, - "details": { - "description": "min=0.27, mean=0.534, max=0.81, sum=2.672 (5)", - "tab": "Accuracy", - "MMLU - Observed inference time (s)": { - "description": "min=0.397, mean=0.407, max=0.417, sum=2.033 (5)", - "tab": "Efficiency", - "score": 0.4066482855060644 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=380.91, mean=481.531, max=634.553, sum=2407.653 (5)", - "tab": "General information", - "score": 481.5305263157895 - }, - "MMLU - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - 
"econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MATH", - "source_data": { - "dataset_name": "MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.665, - "details": { - "description": "min=0.553, mean=0.665, max=0.859, sum=4.654 (7)", - "tab": "Accuracy", - "MATH - Observed inference time (s)": { - "description": "min=1.402, mean=1.585, max=2.083, sum=11.094 (7)", - "tab": "Efficiency", - "score": 1.5848151401531698 - }, - "MATH - # eval": { - "description": "min=30, mean=62.429, max=135, sum=437 (7)", - "tab": "General information", - "score": 62.42857142857143 - }, - "MATH - # train": { - "description": "min=8, mean=8, max=8, sum=56 (7)", - "tab": "General information", - "score": 8.0 - }, - "MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "MATH - # prompt tokens": { - "description": "min=938.215, mean=1355.506, max=2348.712, sum=9488.545 (7)", - "tab": "General information", - "score": 1355.5064552904823 - }, - "MATH - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" - } - } - }, - { - "evaluation_name": "GSM8K", - "source_data": { - "dataset_name": "GSM8K", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on GSM8K", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.816, - "details": { - "description": "min=0.816, mean=0.816, max=0.816, sum=0.816 (1)", - "tab": "Accuracy", - "GSM8K - Observed inference time (s)": { - "description": "min=1.513, mean=1.513, max=1.513, sum=1.513 (1)", - "tab": "Efficiency", - "score": 1.513066102743149 - }, - "GSM8K - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "GSM8K - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "GSM8K - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GSM8K - # prompt tokens": { - "description": "min=1151.885, mean=1151.885, max=1151.885, sum=1151.885 (1)", - "tab": "General information", - "score": 1151.885 - }, - "GSM8K - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "stop": "none" - } - } - }, - { - "evaluation_name": "LegalBench", - "source_data": { - "dataset_name": "LegalBench", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on LegalBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.475, - "details": { - "description": "min=0.118, mean=0.475, max=0.811, sum=2.376 (5)", - "tab": "Accuracy", - "LegalBench - Observed inference time (s)": { - "description": "min=0.447, mean=0.609, max=1.08, sum=3.043 (5)", - "tab": "Efficiency", - "score": 0.6085789782066453 - }, - "LegalBench - # eval": { - "description": "min=95, mean=409.4, max=1000, sum=2047 (5)", - "tab": "General information", - "score": 409.4 - }, - "LegalBench - # train": { - "description": "min=4, mean=4.8, max=5, sum=24 (5)", - "tab": "General information", - "score": 4.8 - }, - "LegalBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "LegalBench - # prompt tokens": { - "description": "min=209.916, mean=1558.239, max=6423.569, sum=7791.193 (5)", - "tab": "General information", - "score": 1558.2386051001386 - }, - "LegalBench - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] - } - } - }, - { - "evaluation_name": "MedQA", - "source_data": { - "dataset_name": "MedQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MedQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.483, - "details": { - "description": "min=0.483, mean=0.483, max=0.483, sum=0.483 (1)", - "tab": "Accuracy", - "MedQA - Observed inference time (s)": { - "description": "min=0.431, mean=0.431, max=0.431, sum=0.431 (1)", - "tab": "Efficiency", - "score": 0.4310008814610333 - }, - "MedQA - # eval": { - "description": "min=503, mean=503, max=503, sum=503 (1)", - "tab": "General information", - "score": 503.0 - }, - "MedQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "MedQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MedQA - # prompt tokens": { - "description": "min=1029.481, mean=1029.481, max=1029.481, sum=1029.481 (1)", - "tab": "General information", - "score": 1029.4811133200794 - }, - "MedQA - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WMT 2014", - "source_data": { - "dataset_name": "WMT 2014", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.194, - "details": { - "description": 
"min=0.144, mean=0.194, max=0.231, sum=0.972 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time (s)": { - "description": "min=0.705, mean=0.803, max=0.924, sum=4.014 (5)", - "tab": "Efficiency", - "score": 0.8027491282517494 - }, - "WMT 2014 - # eval": { - "description": "min=503, mean=568.8, max=832, sum=2844 (5)", - "tab": "General information", - "score": 568.8 - }, - "WMT 2014 - # train": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "WMT 2014 - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "WMT 2014 - # prompt tokens": { - "description": "min=90.732, mean=120.97, max=147.366, sum=604.851 (5)", - "tab": "General information", - "score": 120.97025108961614 - }, - "WMT 2014 - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_lite/google/gemini-1.5-flash-001/bc93fd3d-b6cc-4c03-8c71-d8f1f5ef5957.json b/data/helm_lite/google/gemini-1.5-flash-001/bc93fd3d-b6cc-4c03-8c71-d8f1f5ef5957.json deleted file mode 100644 index 991b81669..000000000 --- a/data/helm_lite/google/gemini-1.5-flash-001/bc93fd3d-b6cc-4c03-8c71-d8f1f5ef5957.json +++ /dev/null @@ -1,643 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_lite/google_gemini-1.5-flash-001/1770834614.1822479", - "retrieved_timestamp": "1770834614.1822479", - "source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gemini 1.5 Flash 001", - "id": "google/gemini-1.5-flash-001", - "developer": "google", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_lite", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.667, - "details": { - "tab": "Accuracy", - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.681960049937578 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.783, - "details": { - "description": "min=0.783, mean=0.783, max=0.783, sum=0.783 (1)", - "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": { - "description": "min=0.647, mean=0.647, max=0.647, sum=0.647 (1)", 
- "tab": "Efficiency", - "score": 0.6474363112991507 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=3447.994, mean=3447.994, max=3447.994, sum=3447.994 (1)", - "tab": "General information", - "score": 3447.994366197183 - }, - "NarrativeQA - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (closed-book)", - "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.332, - "details": { - "description": "min=0.332, mean=0.332, max=0.332, sum=0.332 (1)", - "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time (s)": { - "description": "min=0.495, mean=0.495, max=0.495, sum=0.495 (1)", - "tab": "Efficiency", - "score": 0.49524100852012637 - }, - "NaturalQuestions (closed-book) - Observed inference time (s)": { - "description": "min=0.432, mean=0.432, max=0.432, sum=0.432 (1)", - "tab": "Efficiency", - "score": 0.431587886095047 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1978.347, mean=1978.347, max=1978.347, sum=1978.347 (1)", - "tab": "General information", - "score": 1978.347 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=153.995, mean=153.995, max=153.995, sum=153.995 (1)", - "tab": "General information", - "score": 153.995 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - 
"additional_details": { - "mode": "closedbook" - } - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.928, - "details": { - "description": "min=0.928, mean=0.928, max=0.928, sum=0.928 (1)", - "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": { - "description": "min=0.504, mean=0.504, max=0.504, sum=0.504 (1)", - "tab": "Efficiency", - "score": 0.5038927392959595 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=248.508, mean=248.508, max=248.508, sum=248.508 (1)", - "tab": "General information", - "score": 248.508 - }, - "OpenbookQA - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.703, - "details": { - "description": "min=0.58, mean=0.703, max=0.93, sum=3.514 (5)", - "tab": "Accuracy", - "MMLU - Observed inference time (s)": { - "description": "min=0.525, mean=0.568, max=0.62, sum=2.842 (5)", - "tab": "Efficiency", - "score": 0.5683523873948214 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=380.91, mean=481.531, max=634.553, sum=2407.653 (5)", - "tab": "General information", - "score": 481.5305263157895 - }, - "MMLU - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MATH", - "source_data": { - "dataset_name": "MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - 
"metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.753, - "details": { - "description": "min=0.632, mean=0.753, max=0.889, sum=5.269 (7)", - "tab": "Accuracy", - "MATH - Observed inference time (s)": { - "description": "min=1.303, mean=1.592, max=2.086, sum=11.144 (7)", - "tab": "Efficiency", - "score": 1.592031592636459 - }, - "MATH - # eval": { - "description": "min=30, mean=62.429, max=135, sum=437 (7)", - "tab": "General information", - "score": 62.42857142857143 - }, - "MATH - # train": { - "description": "min=8, mean=8, max=8, sum=56 (7)", - "tab": "General information", - "score": 8.0 - }, - "MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "MATH - # prompt tokens": { - "description": "min=938.215, mean=1355.506, max=2348.712, sum=9488.545 (7)", - "tab": "General information", - "score": 1355.5064552904823 - }, - "MATH - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" - } - } - }, - { - "evaluation_name": "GSM8K", - "source_data": { - "dataset_name": "GSM8K", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on GSM8K", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.785, - "details": { - "description": "min=0.785, mean=0.785, max=0.785, sum=0.785 (1)", - "tab": "Accuracy", - "GSM8K - Observed inference time (s)": { - "description": "min=1.758, mean=1.758, max=1.758, sum=1.758 (1)", - "tab": "Efficiency", - "score": 1.7575640678405762 - }, - "GSM8K - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "GSM8K - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "GSM8K - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GSM8K - # prompt tokens": { - "description": "min=1151.885, mean=1151.885, max=1151.885, sum=1151.885 (1)", - "tab": "General information", - "score": 1151.885 - }, - "GSM8K - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "stop": "none" - } - } - }, - { - "evaluation_name": "LegalBench", - "source_data": { - "dataset_name": "LegalBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on LegalBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.661, - "details": { - "description": "min=0.425, mean=0.661, max=0.968, 
sum=3.305 (5)", - "tab": "Accuracy", - "LegalBench - Observed inference time (s)": { - "description": "min=0.409, mean=0.604, max=0.842, sum=3.02 (5)", - "tab": "Efficiency", - "score": 0.6040551961526522 - }, - "LegalBench - # eval": { - "description": "min=95, mean=409.4, max=1000, sum=2047 (5)", - "tab": "General information", - "score": 409.4 - }, - "LegalBench - # train": { - "description": "min=4, mean=4.8, max=5, sum=24 (5)", - "tab": "General information", - "score": 4.8 - }, - "LegalBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "LegalBench - # prompt tokens": { - "description": "min=209.916, mean=1558.239, max=6423.569, sum=7791.193 (5)", - "tab": "General information", - "score": 1558.2386051001386 - }, - "LegalBench - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] - } - } - }, - { - "evaluation_name": "MedQA", - "source_data": { - "dataset_name": "MedQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MedQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.68, - "details": { - "description": "min=0.68, mean=0.68, max=0.68, sum=0.68 (1)", - "tab": "Accuracy", - "MedQA - Observed inference time (s)": { - "description": "min=0.399, mean=0.399, max=0.399, sum=0.399 (1)", - "tab": "Efficiency", - "score": 0.3993651843165971 - }, - "MedQA - # eval": { - "description": "min=503, mean=503, max=503, sum=503 (1)", - "tab": "General information", - "score": 503.0 - }, - "MedQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "MedQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MedQA - # prompt tokens": { - "description": "min=1029.481, mean=1029.481, max=1029.481, sum=1029.481 (1)", - "tab": "General information", - "score": 1029.4811133200794 - }, - "MedQA - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WMT 2014", - "source_data": { - "dataset_name": "WMT 2014", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.225, - "details": { - "description": "min=0.186, mean=0.225, max=0.253, sum=1.126 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time (s)": { - "description": "min=0.581, mean=0.637, max=0.75, sum=3.186 (5)", - "tab": "Efficiency", - "score": 0.6372637821067911 - }, - "WMT 2014 - # eval": { - "description": "min=503, mean=568.8, max=832, sum=2844 (5)", - "tab": "General information", - "score": 568.8 - }, - "WMT 2014 - # train": { - 
"description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "WMT 2014 - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "WMT 2014 - # prompt tokens": { - "description": "min=90.732, mean=120.97, max=147.366, sum=604.851 (5)", - "tab": "General information", - "score": 120.97025108961614 - }, - "WMT 2014 - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_lite/google/gemini-1.5-flash-002/bc7b0ecf-f2a9-44c2-8949-bbfe762f1b72.json b/data/helm_lite/google/gemini-1.5-flash-002/bc7b0ecf-f2a9-44c2-8949-bbfe762f1b72.json deleted file mode 100644 index 725c639a2..000000000 --- a/data/helm_lite/google/gemini-1.5-flash-002/bc7b0ecf-f2a9-44c2-8949-bbfe762f1b72.json +++ /dev/null @@ -1,641 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_lite/google_gemini-1.5-flash-002/1770834614.1822479", - "retrieved_timestamp": "1770834614.1822479", - "source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gemini 1.5 Flash 002", - "id": "google/gemini-1.5-flash-002", - "developer": "google", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_lite", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.573, - "details": { - "tab": "Accuracy", - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.8933333333333333 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.746, - "details": { - "description": "min=0.746, mean=0.746, max=0.746, sum=0.746 (1)", - "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": { - "description": "min=0.443, mean=0.443, max=0.443, sum=0.443 (1)", - "tab": "Efficiency", - "score": 0.4433113621039824 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 
(1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=3437.994, mean=3437.994, max=3437.994, sum=3437.994 (1)", - "tab": "General information", - "score": 3437.994366197183 - }, - "NarrativeQA - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (closed-book)", - "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.323, - "details": { - "description": "min=0.323, mean=0.323, max=0.323, sum=0.323 (1)", - "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time (s)": { - "description": "min=0.379, mean=0.379, max=0.379, sum=0.379 (1)", - "tab": "Efficiency", - "score": 0.37945408272743225 - }, - "NaturalQuestions (closed-book) - Observed inference time (s)": { - "description": "min=0.37, mean=0.37, max=0.37, sum=0.37 (1)", - "tab": "Efficiency", - "score": 0.36984835290908813 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1968.347, mean=1968.347, max=1968.347, sum=1968.347 (1)", - "tab": "General information", - "score": 1968.347 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=143.995, mean=143.995, max=143.995, sum=143.995 (1)", - "tab": "General information", - "score": 143.995 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "mode": "closedbook" - } - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.914, - "details": { - "description": "min=0.914, mean=0.914, max=0.914, sum=0.914 (1)", - "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": { - "description": "min=0.303, mean=0.303, max=0.303, sum=0.303 (1)", - "tab": "Efficiency", - "score": 0.302696533203125 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=248.508, mean=248.508, max=248.508, sum=248.508 (1)", - "tab": "General information", - "score": 248.508 - }, - "OpenbookQA - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.679, - "details": { - "description": "min=0.56, mean=0.679, max=0.81, sum=3.395 (5)", - "tab": "Accuracy", - "MMLU - Observed inference time (s)": { - "description": "min=0.291, mean=0.296, max=0.299, sum=1.482 (5)", - "tab": "Efficiency", - "score": 0.296430273214976 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=380.91, mean=481.531, max=634.553, sum=2407.653 (5)", - "tab": "General information", - "score": 481.5305263157895 - }, - "MMLU - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MATH", - "source_data": { - "dataset_name": "MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.908, - "details": { - "description": "min=0.816, mean=0.908, max=0.985, sum=6.354 (7)", - "tab": "Accuracy", - "MATH - Observed inference time (s)": { - "description": "min=0.701, mean=0.848, max=1.036, sum=5.939 
(7)", - "tab": "Efficiency", - "score": 0.8483759753773942 - }, - "MATH - # eval": { - "description": "min=30, mean=62.429, max=135, sum=437 (7)", - "tab": "General information", - "score": 62.42857142857143 - }, - "MATH - # train": { - "description": "min=8, mean=8, max=8, sum=56 (7)", - "tab": "General information", - "score": 8.0 - }, - "MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "MATH - # prompt tokens": { - "description": "min=938.215, mean=1355.506, max=2348.712, sum=9488.545 (7)", - "tab": "General information", - "score": 1355.5064552904823 - }, - "MATH - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" - } - } - }, - { - "evaluation_name": "GSM8K", - "source_data": { - "dataset_name": "GSM8K", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on GSM8K", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.328, - "details": { - "description": "min=0.328, mean=0.328, max=0.328, sum=0.328 (1)", - "tab": "Accuracy", - "GSM8K - Observed inference time (s)": { - "description": "min=0.859, mean=0.859, max=0.859, sum=0.859 (1)", - "tab": "Efficiency", - "score": 0.8591284859287847 - }, - "GSM8K - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "GSM8K - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "GSM8K - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GSM8K - # prompt tokens": { - "description": "min=1151.885, mean=1151.885, max=1151.885, sum=1151.885 (1)", - "tab": "General information", - "score": 1151.885 - }, - "GSM8K - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "LegalBench", - "source_data": { - "dataset_name": "LegalBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on LegalBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.67, - "details": { - "description": "min=0.42, mean=0.67, max=0.979, sum=3.35 (5)", - "tab": "Accuracy", - "LegalBench - Observed inference time (s)": { - "description": "min=0.284, mean=0.347, max=0.541, sum=1.736 (5)", - "tab": "Efficiency", - "score": 0.34728255842366473 - }, - "LegalBench - # eval": { - "description": "min=95, mean=409.4, max=1000, sum=2047 (5)", - "tab": "General information", - "score": 409.4 - }, - "LegalBench - # train": { - "description": "min=4, mean=4.8, max=5, sum=24 (5)", - "tab": 
"General information", - "score": 4.8 - }, - "LegalBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "LegalBench - # prompt tokens": { - "description": "min=199.916, mean=1548.239, max=6413.569, sum=7741.193 (5)", - "tab": "General information", - "score": 1548.2386051001386 - }, - "LegalBench - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] - } - } - }, - { - "evaluation_name": "MedQA", - "source_data": { - "dataset_name": "MedQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MedQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.656, - "details": { - "description": "min=0.656, mean=0.656, max=0.656, sum=0.656 (1)", - "tab": "Accuracy", - "MedQA - Observed inference time (s)": { - "description": "min=0.302, mean=0.302, max=0.302, sum=0.302 (1)", - "tab": "Efficiency", - "score": 0.30154310163873327 - }, - "MedQA - # eval": { - "description": "min=503, mean=503, max=503, sum=503 (1)", - "tab": "General information", - "score": 503.0 - }, - "MedQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "MedQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MedQA - # prompt tokens": { - "description": "min=1029.481, mean=1029.481, max=1029.481, sum=1029.481 (1)", - "tab": "General information", - "score": 1029.4811133200794 - }, - "MedQA - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WMT 2014", - "source_data": { - "dataset_name": "WMT 2014", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.212, - "details": { - "description": "min=0.179, mean=0.212, max=0.232, sum=1.062 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time (s)": { - "description": "min=0.407, mean=0.424, max=0.444, sum=2.119 (5)", - "tab": "Efficiency", - "score": 0.42385545386168993 - }, - "WMT 2014 - # eval": { - "description": "min=503, mean=568.8, max=832, sum=2844 (5)", - "tab": "General information", - "score": 568.8 - }, - "WMT 2014 - # train": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "WMT 2014 - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "WMT 2014 - # prompt tokens": { - "description": "min=80.732, mean=110.97, max=137.366, sum=554.851 (5)", - "tab": "General information", - "score": 110.97025108961614 - }, - "WMT 2014 - # output 
tokens": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_lite/google/gemini-1.5-pro-001/527418d0-2591-43c9-b639-17328292b110.json b/data/helm_lite/google/gemini-1.5-pro-001/527418d0-2591-43c9-b639-17328292b110.json deleted file mode 100644 index 8b7eab026..000000000 --- a/data/helm_lite/google/gemini-1.5-pro-001/527418d0-2591-43c9-b639-17328292b110.json +++ /dev/null @@ -1,643 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_lite/google_gemini-1.5-pro-001/1770834614.1822479", - "retrieved_timestamp": "1770834614.1822479", - "source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gemini 1.5 Pro 001", - "id": "google/gemini-1.5-pro-001", - "developer": "google", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_lite", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.739, - "details": { - "tab": "Accuracy", - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.4783520599250936 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.783, - "details": { - "description": "min=0.783, mean=0.783, max=0.783, sum=0.783 (1)", - "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": { - "description": "min=0.835, mean=0.835, max=0.835, sum=0.835 (1)", - "tab": "Efficiency", - "score": 0.8351484166930544 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=3447.994, mean=3447.994, max=3447.994, sum=3447.994 (1)", - "tab": "General information", - "score": 3447.994366197183 - }, - "NarrativeQA - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - 
}, - { - "evaluation_name": "NaturalQuestions (closed-book)", - "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.378, - "details": { - "description": "min=0.378, mean=0.378, max=0.378, sum=0.378 (1)", - "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time (s)": { - "description": "min=0.717, mean=0.717, max=0.717, sum=0.717 (1)", - "tab": "Efficiency", - "score": 0.7170397922992706 - }, - "NaturalQuestions (closed-book) - Observed inference time (s)": { - "description": "min=0.634, mean=0.634, max=0.634, sum=0.634 (1)", - "tab": "Efficiency", - "score": 0.6341883151531219 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1978.347, mean=1978.347, max=1978.347, sum=1978.347 (1)", - "tab": "General information", - "score": 1978.347 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=153.995, mean=153.995, max=153.995, sum=153.995 (1)", - "tab": "General information", - "score": 153.995 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "mode": "closedbook" - } - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.902, - "details": { - "description": "min=0.902, mean=0.902, max=0.902, sum=0.902 (1)", - "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": { - "description": "min=0.624, mean=0.624, max=0.624, sum=0.624 (1)", - "tab": "Efficiency", - "score": 0.6239193634986877 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, 
max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=248.508, mean=248.508, max=248.508, sum=248.508 (1)", - "tab": "General information", - "score": 248.508 - }, - "OpenbookQA - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.772, - "details": { - "description": "min=0.62, mean=0.772, max=0.93, sum=3.858 (5)", - "tab": "Accuracy", - "MMLU - Observed inference time (s)": { - "description": "min=0.65, mean=0.69, max=0.763, sum=3.451 (5)", - "tab": "Efficiency", - "score": 0.6902154895882857 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=380.91, mean=481.531, max=634.553, sum=2407.653 (5)", - "tab": "General information", - "score": 481.5305263157895 - }, - "MMLU - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MATH", - "source_data": { - "dataset_name": "MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.825, - "details": { - "description": "min=0.692, mean=0.825, max=0.956, sum=5.773 (7)", - "tab": "Accuracy", - "MATH - Observed inference time (s)": { - "description": "min=2.006, mean=2.701, max=3.274, sum=18.91 (7)", - "tab": "Efficiency", - "score": 2.701360058859101 - }, - "MATH - # eval": { - "description": "min=30, mean=62.429, max=135, sum=437 (7)", - "tab": "General information", - "score": 62.42857142857143 - }, - "MATH - # train": { - "description": "min=8, mean=8, max=8, sum=56 (7)", - "tab": "General information", - "score": 8.0 - }, - "MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - 
"score": 0.0 - }, - "MATH - # prompt tokens": { - "description": "min=938.215, mean=1355.506, max=2348.712, sum=9488.545 (7)", - "tab": "General information", - "score": 1355.5064552904823 - }, - "MATH - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" - } - } - }, - { - "evaluation_name": "GSM8K", - "source_data": { - "dataset_name": "GSM8K", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on GSM8K", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.836, - "details": { - "description": "min=0.836, mean=0.836, max=0.836, sum=0.836 (1)", - "tab": "Accuracy", - "GSM8K - Observed inference time (s)": { - "description": "min=3.206, mean=3.206, max=3.206, sum=3.206 (1)", - "tab": "Efficiency", - "score": 3.205789808034897 - }, - "GSM8K - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "GSM8K - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "GSM8K - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GSM8K - # prompt tokens": { - "description": "min=1151.885, mean=1151.885, max=1151.885, sum=1151.885 (1)", - "tab": "General information", - "score": 1151.885 - }, - "GSM8K - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "stop": "none" - } - } - }, - { - "evaluation_name": "LegalBench", - "source_data": { - "dataset_name": "LegalBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on LegalBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.757, - "details": { - "description": "min=0.46, mean=0.757, max=1, sum=3.786 (5)", - "tab": "Accuracy", - "LegalBench - Observed inference time (s)": { - "description": "min=0.577, mean=0.775, max=1.078, sum=3.876 (5)", - "tab": "Efficiency", - "score": 0.7752882438000996 - }, - "LegalBench - # eval": { - "description": "min=95, mean=409.4, max=1000, sum=2047 (5)", - "tab": "General information", - "score": 409.4 - }, - "LegalBench - # train": { - "description": "min=4, mean=4.8, max=5, sum=24 (5)", - "tab": "General information", - "score": 4.8 - }, - "LegalBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "LegalBench - # prompt tokens": { - "description": "min=209.916, mean=1558.239, max=6423.569, sum=7791.193 (5)", - "tab": "General information", - "score": 1558.2386051001386 - }, - "LegalBench - # output tokens": { - "description": "min=0, mean=0, max=0, 
sum=0 (5)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] - } - } - }, - { - "evaluation_name": "MedQA", - "source_data": { - "dataset_name": "MedQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MedQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.692, - "details": { - "description": "min=0.692, mean=0.692, max=0.692, sum=0.692 (1)", - "tab": "Accuracy", - "MedQA - Observed inference time (s)": { - "description": "min=0.53, mean=0.53, max=0.53, sum=0.53 (1)", - "tab": "Efficiency", - "score": 0.5296737767785669 - }, - "MedQA - # eval": { - "description": "min=503, mean=503, max=503, sum=503 (1)", - "tab": "General information", - "score": 503.0 - }, - "MedQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "MedQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MedQA - # prompt tokens": { - "description": "min=1029.481, mean=1029.481, max=1029.481, sum=1029.481 (1)", - "tab": "General information", - "score": 1029.4811133200794 - }, - "MedQA - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WMT 2014", - "source_data": { - "dataset_name": "WMT 2014", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.189, - "details": { - "description": "min=0.118, mean=0.189, max=0.252, sum=0.946 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time (s)": { - "description": "min=1.029, mean=1.14, max=1.4, sum=5.7 (5)", - "tab": "Efficiency", - "score": 1.1399874632845124 - }, - "WMT 2014 - # eval": { - "description": "min=503, mean=568.8, max=832, sum=2844 (5)", - "tab": "General information", - "score": 568.8 - }, - "WMT 2014 - # train": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "WMT 2014 - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "WMT 2014 - # prompt tokens": { - "description": "min=90.732, mean=120.97, max=147.366, sum=604.851 (5)", - "tab": "General information", - "score": 120.97025108961614 - }, - "WMT 2014 - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_lite/google/gemini-1.5-pro-002/8ddc465f-4f2d-4213-81c4-70b584d48047.json 
b/data/helm_lite/google/gemini-1.5-pro-002/8ddc465f-4f2d-4213-81c4-70b584d48047.json deleted file mode 100644 index ebd3081fb..000000000 --- a/data/helm_lite/google/gemini-1.5-pro-002/8ddc465f-4f2d-4213-81c4-70b584d48047.json +++ /dev/null @@ -1,641 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_lite/google_gemini-1.5-pro-002/1770834614.1822479", - "retrieved_timestamp": "1770834614.1822479", - "source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gemini 1.5 Pro 002", - "id": "google/gemini-1.5-pro-002", - "developer": "google", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_lite", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.842, - "details": { - "tab": "Accuracy", - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.49837702871410733 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.756, - "details": { - "description": "min=0.756, mean=0.756, max=0.756, sum=0.756 (1)", - "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": { - "description": "min=0.912, mean=0.912, max=0.912, sum=0.912 (1)", - "tab": "Efficiency", - "score": 0.9118197140368548 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=3437.994, mean=3437.994, max=3437.994, sum=3437.994 (1)", - "tab": "General information", - "score": 3437.994366197183 - }, - "NarrativeQA - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (closed-book)", - "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.455, - "details": { - "description": "min=0.455, mean=0.455, max=0.455, sum=0.455 (1)", - "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time (s)": { - "description": "min=0.616, mean=0.616, max=0.616, sum=0.616 (1)", - "tab": "Efficiency", - "score": 0.6156208164691925 - }, - "NaturalQuestions (closed-book) - Observed inference time (s)": { - "description": "min=0.539, mean=0.539, max=0.539, sum=0.539 (1)", - "tab": "Efficiency", - "score": 0.5389571013450623 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1968.347, mean=1968.347, max=1968.347, sum=1968.347 (1)", - "tab": "General information", - "score": 1968.347 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=143.995, mean=143.995, max=143.995, sum=143.995 (1)", - "tab": "General information", - "score": 143.995 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "mode": "closedbook" - } - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.952, - "details": { - "description": "min=0.952, mean=0.952, max=0.952, sum=0.952 (1)", - "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": { - "description": "min=0.453, mean=0.453, max=0.453, sum=0.453 (1)", - "tab": "Efficiency", - "score": 0.45284647941589357 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - 
"description": "min=248.508, mean=248.508, max=248.508, sum=248.508 (1)", - "tab": "General information", - "score": 248.508 - }, - "OpenbookQA - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.795, - "details": { - "description": "min=0.67, mean=0.795, max=0.94, sum=3.973 (5)", - "tab": "Accuracy", - "MMLU - Observed inference time (s)": { - "description": "min=0.453, mean=0.977, max=1.671, sum=4.883 (5)", - "tab": "Efficiency", - "score": 0.9766287260557476 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=380.91, mean=481.531, max=634.553, sum=2407.653 (5)", - "tab": "General information", - "score": 481.5305263157895 - }, - "MMLU - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MATH", - "source_data": { - "dataset_name": "MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.92, - "details": { - "description": "min=0.789, mean=0.92, max=1, sum=6.44 (7)", - "tab": "Accuracy", - "MATH - Observed inference time (s)": { - "description": "min=2.157, mean=3.273, max=4.064, sum=22.911 (7)", - "tab": "Efficiency", - "score": 3.2730091876347354 - }, - "MATH - # eval": { - "description": "min=30, mean=62.429, max=135, sum=437 (7)", - "tab": "General information", - "score": 62.42857142857143 - }, - "MATH - # train": { - "description": "min=8, mean=8, max=8, sum=56 (7)", - "tab": "General information", - "score": 8.0 - }, - "MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "MATH - # prompt tokens": { - "description": "min=938.215, mean=1355.506, max=2348.712, sum=9488.545 (7)", - "tab": "General information", - "score": 1355.5064552904823 - }, - "MATH - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - 
"subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" - } - } - }, - { - "evaluation_name": "GSM8K", - "source_data": { - "dataset_name": "GSM8K", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on GSM8K", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.817, - "details": { - "description": "min=0.817, mean=0.817, max=0.817, sum=0.817 (1)", - "tab": "Accuracy", - "GSM8K - Observed inference time (s)": { - "description": "min=3.161, mean=3.161, max=3.161, sum=3.161 (1)", - "tab": "Efficiency", - "score": 3.1614130451679228 - }, - "GSM8K - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "GSM8K - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "GSM8K - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GSM8K - # prompt tokens": { - "description": "min=1151.885, mean=1151.885, max=1151.885, sum=1151.885 (1)", - "tab": "General information", - "score": 1151.885 - }, - "GSM8K - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "LegalBench", - "source_data": { - "dataset_name": "LegalBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on LegalBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.747, - "details": { - "description": "min=0.439, mean=0.747, max=0.968, sum=3.735 (5)", - "tab": "Accuracy", - "LegalBench - Observed inference time (s)": { - "description": "min=0.489, mean=0.596, max=0.915, sum=2.982 (5)", - "tab": "Efficiency", - "score": 0.596480936304943 - }, - "LegalBench - # eval": { - "description": "min=95, mean=409.4, max=1000, sum=2047 (5)", - "tab": "General information", - "score": 409.4 - }, - "LegalBench - # train": { - "description": "min=4, mean=4.8, max=5, sum=24 (5)", - "tab": "General information", - "score": 4.8 - }, - "LegalBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "LegalBench - # prompt tokens": { - "description": "min=199.916, mean=1548.239, max=6413.569, sum=7741.193 (5)", - "tab": "General information", - "score": 1548.2386051001386 - }, - "LegalBench - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] - } - } - }, - { - "evaluation_name": "MedQA", - "source_data": { - "dataset_name": "MedQA", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MedQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.771, - "details": { - "description": "min=0.771, mean=0.771, max=0.771, sum=0.771 (1)", - "tab": "Accuracy", - "MedQA - Observed inference time (s)": { - "description": "min=0.53, mean=0.53, max=0.53, sum=0.53 (1)", - "tab": "Efficiency", - "score": 0.5296175953882115 - }, - "MedQA - # eval": { - "description": "min=503, mean=503, max=503, sum=503 (1)", - "tab": "General information", - "score": 503.0 - }, - "MedQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "MedQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MedQA - # prompt tokens": { - "description": "min=1029.481, mean=1029.481, max=1029.481, sum=1029.481 (1)", - "tab": "General information", - "score": 1029.4811133200794 - }, - "MedQA - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WMT 2014", - "source_data": { - "dataset_name": "WMT 2014", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.231, - "details": { - "description": "min=0.192, mean=0.231, max=0.261, sum=1.156 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time (s)": { - "description": "min=1.049, mean=1.108, max=1.147, sum=5.541 (5)", - "tab": "Efficiency", - "score": 1.1081515031376248 - }, - "WMT 2014 - # eval": { - "description": "min=503, mean=568.8, max=832, sum=2844 (5)", - "tab": "General information", - "score": 568.8 - }, - "WMT 2014 - # train": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "WMT 2014 - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "WMT 2014 - # prompt tokens": { - "description": "min=80.732, mean=110.97, max=137.366, sum=554.851 (5)", - "tab": "General information", - "score": 110.97025108961614 - }, - "WMT 2014 - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_lite/google/gemini-2.0-flash-exp/eca63d17-7fc2-4722-8bb3-0be99a257100.json b/data/helm_lite/google/gemini-2.0-flash-exp/eca63d17-7fc2-4722-8bb3-0be99a257100.json deleted file mode 100644 index b96b71c0c..000000000 --- a/data/helm_lite/google/gemini-2.0-flash-exp/eca63d17-7fc2-4722-8bb3-0be99a257100.json +++ /dev/null @@ -1,644 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_lite/google_gemini-2.0-flash-exp/1770834614.1822479", - "retrieved_timestamp": "1770834614.1822479", - "source_metadata": { - 
"source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gemini 2.0 Flash Experimental", - "id": "google/gemini-2.0-flash-exp", - "developer": "google", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_lite", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.813, - "details": { - "tab": "Accuracy", - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.7398626716604245 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.783, - "details": { - "description": "min=0.783, mean=0.783, max=0.783, sum=0.783 (1)", - "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": { - "description": "min=0.512, mean=0.512, max=0.512, sum=0.512 (1)", - "tab": "Efficiency", - "score": 0.5123653337359428 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=3437.994, mean=3437.994, max=3437.994, sum=3437.994 (1)", - "tab": "General information", - "score": 3437.994366197183 - }, - "NarrativeQA - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (closed-book)", - "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.443, - "details": { - "description": "min=0.443, mean=0.443, max=0.443, sum=0.443 (1)", - "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time (s)": { - "description": "min=0.462, mean=0.462, max=0.462, sum=0.462 (1)", - "tab": "Efficiency", - "score": 0.4622749860286713 - }, 
- "NaturalQuestions (closed-book) - Observed inference time (s)": { - "description": "min=0.417, mean=0.417, max=0.417, sum=0.417 (1)", - "tab": "Efficiency", - "score": 0.4170585689544678 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1968.347, mean=1968.347, max=1968.347, sum=1968.347 (1)", - "tab": "General information", - "score": 1968.347 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=143.995, mean=143.995, max=143.995, sum=143.995 (1)", - "tab": "General information", - "score": 143.995 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "mode": "closedbook" - } - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.946, - "details": { - "description": "min=0.946, mean=0.946, max=0.946, sum=0.946 (1)", - "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": { - "description": "min=0.391, mean=0.391, max=0.391, sum=0.391 (1)", - "tab": "Efficiency", - "score": 0.39134009742736814 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=248.508, mean=248.508, max=248.508, sum=248.508 (1)", - "tab": "General information", - "score": 248.508 - }, - "OpenbookQA - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MMLU", - 
"source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.717, - "details": { - "description": "min=0.56, mean=0.717, max=0.83, sum=3.583 (5)", - "tab": "Accuracy", - "MMLU - Observed inference time (s)": { - "description": "min=0.405, mean=0.409, max=0.414, sum=2.043 (5)", - "tab": "Efficiency", - "score": 0.4086059420652557 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=380.91, mean=481.531, max=634.553, sum=2407.653 (5)", - "tab": "General information", - "score": 481.5305263157895 - }, - "MMLU - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MATH", - "source_data": { - "dataset_name": "MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.901, - "details": { - "description": "min=0.788, mean=0.901, max=0.985, sum=6.309 (7)", - "tab": "Accuracy", - "MATH - Observed inference time (s)": { - "description": "min=1.049, mean=1.506, max=2.041, sum=10.543 (7)", - "tab": "Efficiency", - "score": 1.5061902186836522 - }, - "MATH - # eval": { - "description": "min=30, mean=62.429, max=135, sum=437 (7)", - "tab": "General information", - "score": 62.42857142857143 - }, - "MATH - # train": { - "description": "min=8, mean=8, max=8, sum=56 (7)", - "tab": "General information", - "score": 8.0 - }, - "MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "MATH - # prompt tokens": { - "description": "min=938.215, mean=1355.506, max=2348.712, sum=9488.545 (7)", - "tab": "General information", - "score": 1355.5064552904823 - }, - "MATH - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" - } - } - }, - { - "evaluation_name": "GSM8K", - "source_data": { - "dataset_name": "GSM8K", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on GSM8K", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.946, - "details": { - "description": "min=0.946, mean=0.946, max=0.946, sum=0.946 (1)", - "tab": "Accuracy", - "GSM8K - Observed inference time (s)": { - "description": "min=1.437, mean=1.437, max=1.437, sum=1.437 (1)", - "tab": "Efficiency", - "score": 1.4374724824428557 - }, - "GSM8K - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "GSM8K - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "GSM8K - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GSM8K - # prompt tokens": { - "description": "min=1151.885, mean=1151.885, max=1151.885, sum=1151.885 (1)", - "tab": "General information", - "score": 1151.885 - }, - "GSM8K - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "stop": "none" - } - } - }, - { - "evaluation_name": "LegalBench", - "source_data": { - "dataset_name": "LegalBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on LegalBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.674, - "details": { - "description": "min=0.237, mean=0.674, max=0.989, sum=3.371 (5)", - "tab": "Accuracy", - "LegalBench - Observed inference time (s)": { - "description": "min=0.454, mean=0.547, max=0.655, sum=2.737 (5)", - "tab": "Efficiency", - "score": 0.5473698430089784 - }, - "LegalBench - # eval": { - "description": "min=95, mean=409.4, max=1000, sum=2047 (5)", - "tab": "General information", - "score": 409.4 - }, - "LegalBench - # train": { - "description": "min=4, mean=4.8, max=5, sum=24 (5)", - "tab": "General information", - "score": 4.8 - }, - "LegalBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "LegalBench - # prompt tokens": { - "description": "min=216.916, mean=1559.239, max=6418.569, sum=7796.193 (5)", - "tab": "General information", - "score": 1559.2386051001386 - }, - "LegalBench - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ], - "stop": "none" - } - } - }, - { - "evaluation_name": "MedQA", - "source_data": { - "dataset_name": "MedQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MedQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.73, - 
"details": { - "description": "min=0.73, mean=0.73, max=0.73, sum=0.73 (1)", - "tab": "Accuracy", - "MedQA - Observed inference time (s)": { - "description": "min=0.407, mean=0.407, max=0.407, sum=0.407 (1)", - "tab": "Efficiency", - "score": 0.4071517047540805 - }, - "MedQA - # eval": { - "description": "min=503, mean=503, max=503, sum=503 (1)", - "tab": "General information", - "score": 503.0 - }, - "MedQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "MedQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MedQA - # prompt tokens": { - "description": "min=1029.481, mean=1029.481, max=1029.481, sum=1029.481 (1)", - "tab": "General information", - "score": 1029.4811133200794 - }, - "MedQA - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WMT 2014", - "source_data": { - "dataset_name": "WMT 2014", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.212, - "details": { - "description": "min=0.154, mean=0.212, max=0.242, sum=1.059 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time (s)": { - "description": "min=0.639, mean=0.725, max=0.883, sum=3.624 (5)", - "tab": "Efficiency", - "score": 0.7247073432282998 - }, - "WMT 2014 - # eval": { - "description": "min=503, mean=568.8, max=832, sum=2844 (5)", - "tab": "General information", - "score": 568.8 - }, - "WMT 2014 - # train": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "WMT 2014 - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "WMT 2014 - # prompt tokens": { - "description": "min=80.732, mean=110.97, max=137.366, sum=554.851 (5)", - "tab": "General information", - "score": 110.97025108961614 - }, - "WMT 2014 - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_lite/google/gemma-2-27b-it/e40a10b3-e682-4715-b2ee-4efcae050a58.json b/data/helm_lite/google/gemma-2-27b-it/e40a10b3-e682-4715-b2ee-4efcae050a58.json deleted file mode 100644 index ea107cc9e..000000000 --- a/data/helm_lite/google/gemma-2-27b-it/e40a10b3-e682-4715-b2ee-4efcae050a58.json +++ /dev/null @@ -1,641 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_lite/google_gemma-2-27b-it/1770834614.1822479", - "retrieved_timestamp": "1770834614.1822479", - "source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gemma 2 Instruct 27B", - "id": "google/gemma-2-27b-it", - "developer": "google", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win 
rate", - "source_data": { - "dataset_name": "helm_lite", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.675, - "details": { - "tab": "Accuracy", - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.7407490636704119 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.79, - "details": { - "description": "min=0.79, mean=0.79, max=0.79, sum=0.79 (1)", - "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": { - "description": "min=0.66, mean=0.66, max=0.66, sum=0.66 (1)", - "tab": "Efficiency", - "score": 0.6603116545878666 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=3437.994, mean=3437.994, max=3437.994, sum=3437.994 (1)", - "tab": "General information", - "score": 3437.994366197183 - }, - "NarrativeQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (closed-book)", - "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.353, - "details": { - "description": "min=0.353, mean=0.353, max=0.353, sum=0.353 (1)", - "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time (s)": { - "description": "min=0.486, mean=0.486, max=0.486, sum=0.486 (1)", - "tab": "Efficiency", - "score": 0.4863240420818329 - }, - "NaturalQuestions (closed-book) - Observed inference time (s)": { - "description": "min=0.358, mean=0.358, max=0.358, sum=0.358 (1)", - "tab": "Efficiency", - "score": 0.35805381870269776 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions 
(open-book) - # train": { - "description": "min=4.953, mean=4.953, max=4.953, sum=4.953 (1)", - "tab": "General information", - "score": 4.953 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.009, mean=0.009, max=0.009, sum=0.009 (1)", - "tab": "General information", - "score": 0.009 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1911.526, mean=1911.526, max=1911.526, sum=1911.526 (1)", - "tab": "General information", - "score": 1911.526 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=143.995, mean=143.995, max=143.995, sum=143.995 (1)", - "tab": "General information", - "score": 143.995 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=0.993, mean=0.993, max=0.993, sum=0.993 (1)", - "tab": "General information", - "score": 0.993 - } - } - }, - "generation_config": { - "additional_details": { - "mode": "closedbook" - } - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.918, - "details": { - "description": "min=0.918, mean=0.918, max=0.918, sum=0.918 (1)", - "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": { - "description": "min=0.327, mean=0.327, max=0.327, sum=0.327 (1)", - "tab": "Efficiency", - "score": 0.3270734968185425 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=248.508, mean=248.508, max=248.508, sum=248.508 (1)", - "tab": "General information", - "score": 248.508 - }, - "OpenbookQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.664, - "details": { - "description": "min=0.44, mean=0.664, max=0.93, sum=3.32 (5)", - "tab": "Accuracy", - "MMLU - Observed inference time (s)": { - "description": "min=0.317, mean=0.329, max=0.337, sum=1.643 (5)", - "tab": "Efficiency", - "score": 0.3286796834259702 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=380.91, mean=481.531, max=634.553, sum=2407.653 (5)", - "tab": "General information", - "score": 481.5305263157895 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MATH", - "source_data": { - "dataset_name": "MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.746, - "details": { - "description": "min=0.513, mean=0.746, max=0.93, sum=5.219 (7)", - "tab": "Accuracy", - "MATH - Observed inference time (s)": { - "description": "min=1.515, mean=1.903, max=2.648, sum=13.324 (7)", - "tab": "Efficiency", - "score": 1.9034432935092742 - }, - "MATH - # eval": { - "description": "min=30, mean=62.429, max=135, sum=437 (7)", - "tab": "General information", - "score": 62.42857142857143 - }, - "MATH - # train": { - "description": "min=8, mean=8, max=8, sum=56 (7)", - "tab": "General information", - "score": 8.0 - }, - "MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "MATH - # prompt tokens": { - "description": "min=938.215, mean=1355.506, max=2348.712, sum=9488.545 (7)", - "tab": "General information", - "score": 1355.5064552904823 - }, - "MATH - # output tokens": { - "description": "min=1, mean=1, max=1, sum=7 (7)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" - } - } - }, - { - "evaluation_name": "GSM8K", - "source_data": { - "dataset_name": "GSM8K", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on GSM8K", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.812, - "details": { - "description": "min=0.812, mean=0.812, max=0.812, 
sum=0.812 (1)", - "tab": "Accuracy", - "GSM8K - Observed inference time (s)": { - "description": "min=2.332, mean=2.332, max=2.332, sum=2.332 (1)", - "tab": "Efficiency", - "score": 2.3315503742694856 - }, - "GSM8K - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "GSM8K - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "GSM8K - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GSM8K - # prompt tokens": { - "description": "min=1151.885, mean=1151.885, max=1151.885, sum=1151.885 (1)", - "tab": "General information", - "score": 1151.885 - }, - "GSM8K - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "LegalBench", - "source_data": { - "dataset_name": "LegalBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on LegalBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7, - "details": { - "description": "min=0.439, mean=0.7, max=0.979, sum=3.499 (5)", - "tab": "Accuracy", - "LegalBench - Observed inference time (s)": { - "description": "min=0.328, mean=0.44, max=0.796, sum=2.202 (5)", - "tab": "Efficiency", - "score": 0.4403507251683155 - }, - "LegalBench - # eval": { - "description": "min=95, mean=409.4, max=1000, sum=2047 (5)", - "tab": "General information", - "score": 409.4 - }, - "LegalBench - # train": { - "description": "min=4, mean=4.798, max=5, sum=23.992 (5)", - "tab": "General information", - "score": 4.798367346938775 - }, - "LegalBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "LegalBench - # prompt tokens": { - "description": "min=199.916, mean=1546.699, max=6405.871, sum=7733.495 (5)", - "tab": "General information", - "score": 1546.699013263404 - }, - "LegalBench - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] - } - } - }, - { - "evaluation_name": "MedQA", - "source_data": { - "dataset_name": "MedQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MedQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.684, - "details": { - "description": "min=0.684, mean=0.684, max=0.684, sum=0.684 (1)", - "tab": "Accuracy", - "MedQA - Observed inference time (s)": { - "description": "min=0.451, mean=0.451, max=0.451, sum=0.451 (1)", - "tab": "Efficiency", - "score": 0.4512898187277094 - }, - "MedQA - # eval": { - "description": "min=503, mean=503, max=503, sum=503 (1)", - "tab": "General information", - "score": 503.0 - }, - "MedQA - # 
train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "MedQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MedQA - # prompt tokens": { - "description": "min=1029.481, mean=1029.481, max=1029.481, sum=1029.481 (1)", - "tab": "General information", - "score": 1029.4811133200794 - }, - "MedQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WMT 2014", - "source_data": { - "dataset_name": "WMT 2014", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.214, - "details": { - "description": "min=0.167, mean=0.214, max=0.241, sum=1.072 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time (s)": { - "description": "min=0.666, mean=0.698, max=0.715, sum=3.492 (5)", - "tab": "Efficiency", - "score": 0.6983992647690125 - }, - "WMT 2014 - # eval": { - "description": "min=503, mean=568.8, max=832, sum=2844 (5)", - "tab": "General information", - "score": 568.8 - }, - "WMT 2014 - # train": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "WMT 2014 - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "WMT 2014 - # prompt tokens": { - "description": "min=80.732, mean=110.97, max=137.366, sum=554.851 (5)", - "tab": "General information", - "score": 110.97025108961614 - }, - "WMT 2014 - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_lite/google/gemma-2-9b-it/56425fda-a1f4-40cc-82f7-6a56ab2ccfaf.json b/data/helm_lite/google/gemma-2-9b-it/56425fda-a1f4-40cc-82f7-6a56ab2ccfaf.json deleted file mode 100644 index 1488d6604..000000000 --- a/data/helm_lite/google/gemma-2-9b-it/56425fda-a1f4-40cc-82f7-6a56ab2ccfaf.json +++ /dev/null @@ -1,643 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_lite/google_gemma-2-9b-it/1770834614.1822479", - "retrieved_timestamp": "1770834614.1822479", - "source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gemma 2 Instruct 9B", - "id": "google/gemma-2-9b-it", - "developer": "google", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_lite", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.562, - "details": { - "tab": "Accuracy", - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.8286641697877652 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.768, - "details": { - "description": "min=0.768, mean=0.768, max=0.768, sum=0.768 (1)", - "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": { - "description": "min=0.593, mean=0.593, max=0.593, sum=0.593 (1)", - "tab": "Efficiency", - "score": 0.5928616705075116 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=3437.994, mean=3437.994, max=3437.994, sum=3437.994 (1)", - "tab": "General information", - "score": 3437.994366197183 - }, - "NarrativeQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (closed-book)", - "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.328, - "details": { - "description": "min=0.328, mean=0.328, max=0.328, sum=0.328 (1)", - "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time (s)": { - "description": "min=0.446, mean=0.446, max=0.446, sum=0.446 (1)", - "tab": "Efficiency", - "score": 0.44568803215026853 - }, - "NaturalQuestions (closed-book) - Observed inference time (s)": { - "description": "min=0.337, mean=0.337, max=0.337, sum=0.337 (1)", - "tab": "Efficiency", - "score": 0.337234415769577 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.953, mean=4.953, max=4.953, sum=4.953 (1)", - "tab": "General information", - "score": 4.953 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.009, mean=0.009, max=0.009, sum=0.009 (1)", - "tab": "General information", - "score": 0.009 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1911.526, 
mean=1911.526, max=1911.526, sum=1911.526 (1)", - "tab": "General information", - "score": 1911.526 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=143.995, mean=143.995, max=143.995, sum=143.995 (1)", - "tab": "General information", - "score": 143.995 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "mode": "closedbook" - } - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.91, - "details": { - "description": "min=0.91, mean=0.91, max=0.91, sum=0.91 (1)", - "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": { - "description": "min=0.306, mean=0.306, max=0.306, sum=0.306 (1)", - "tab": "Efficiency", - "score": 0.3059106550216675 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=248.508, mean=248.508, max=248.508, sum=248.508 (1)", - "tab": "General information", - "score": 248.508 - }, - "OpenbookQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.645, - "details": { - "description": "min=0.42, mean=0.645, max=0.91, sum=3.225 (5)", - "tab": "Accuracy", - "MMLU - Observed inference time (s)": { - "description": "min=0.299, mean=0.319, max=0.334, sum=1.594 (5)", - "tab": "Efficiency", - "score": 0.3187573717686168 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 
(5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=380.91, mean=481.531, max=634.553, sum=2407.653 (5)", - "tab": "General information", - "score": 481.5305263157895 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MATH", - "source_data": { - "dataset_name": "MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.724, - "details": { - "description": "min=0.635, mean=0.724, max=0.907, sum=5.071 (7)", - "tab": "Accuracy", - "MATH - Observed inference time (s)": { - "description": "min=1.006, mean=1.344, max=1.765, sum=9.409 (7)", - "tab": "Efficiency", - "score": 1.3440718759718908 - }, - "MATH - # eval": { - "description": "min=30, mean=62.429, max=135, sum=437 (7)", - "tab": "General information", - "score": 62.42857142857143 - }, - "MATH - # train": { - "description": "min=8, mean=8, max=8, sum=56 (7)", - "tab": "General information", - "score": 8.0 - }, - "MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "MATH - # prompt tokens": { - "description": "min=938.215, mean=1355.506, max=2348.712, sum=9488.545 (7)", - "tab": "General information", - "score": 1355.5064552904823 - }, - "MATH - # output tokens": { - "description": "min=1, mean=1, max=1, sum=7 (7)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" - } - } - }, - { - "evaluation_name": "GSM8K", - "source_data": { - "dataset_name": "GSM8K", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on GSM8K", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.762, - "details": { - "description": "min=0.762, mean=0.762, max=0.762, sum=0.762 (1)", - "tab": "Accuracy", - "GSM8K - Observed inference time (s)": { - "description": "min=1.72, mean=1.72, max=1.72, sum=1.72 (1)", - "tab": "Efficiency", - "score": 1.720498773097992 - }, - "GSM8K - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "GSM8K - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": 
"General information", - "score": 5.0 - }, - "GSM8K - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GSM8K - # prompt tokens": { - "description": "min=1151.885, mean=1151.885, max=1151.885, sum=1151.885 (1)", - "tab": "General information", - "score": 1151.885 - }, - "GSM8K - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "stop": "none" - } - } - }, - { - "evaluation_name": "LegalBench", - "source_data": { - "dataset_name": "LegalBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on LegalBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.639, - "details": { - "description": "min=0.395, mean=0.639, max=0.937, sum=3.193 (5)", - "tab": "Accuracy", - "LegalBench - Observed inference time (s)": { - "description": "min=0.31, mean=0.384, max=0.652, sum=1.92 (5)", - "tab": "Efficiency", - "score": 0.3840073023663075 - }, - "LegalBench - # eval": { - "description": "min=95, mean=409.4, max=1000, sum=2047 (5)", - "tab": "General information", - "score": 409.4 - }, - "LegalBench - # train": { - "description": "min=4, mean=4.798, max=5, sum=23.992 (5)", - "tab": "General information", - "score": 4.798367346938775 - }, - "LegalBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "LegalBench - # prompt tokens": { - "description": "min=199.916, mean=1546.699, max=6405.871, sum=7733.495 (5)", - "tab": "General information", - "score": 1546.699013263404 - }, - "LegalBench - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] - } - } - }, - { - "evaluation_name": "MedQA", - "source_data": { - "dataset_name": "MedQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MedQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.63, - "details": { - "description": "min=0.63, mean=0.63, max=0.63, sum=0.63 (1)", - "tab": "Accuracy", - "MedQA - Observed inference time (s)": { - "description": "min=0.316, mean=0.316, max=0.316, sum=0.316 (1)", - "tab": "Efficiency", - "score": 0.3161872125288127 - }, - "MedQA - # eval": { - "description": "min=503, mean=503, max=503, sum=503 (1)", - "tab": "General information", - "score": 503.0 - }, - "MedQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "MedQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MedQA - # prompt tokens": { - "description": "min=1029.481, mean=1029.481, max=1029.481, sum=1029.481 (1)", - "tab": "General information", - "score": 1029.4811133200794 - 
}, - "MedQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WMT 2014", - "source_data": { - "dataset_name": "WMT 2014", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.201, - "details": { - "description": "min=0.155, mean=0.201, max=0.228, sum=1.003 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time (s)": { - "description": "min=0.526, mean=0.633, max=0.82, sum=3.165 (5)", - "tab": "Efficiency", - "score": 0.6330890842213928 - }, - "WMT 2014 - # eval": { - "description": "min=503, mean=568.8, max=832, sum=2844 (5)", - "tab": "General information", - "score": 568.8 - }, - "WMT 2014 - # train": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "WMT 2014 - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "WMT 2014 - # prompt tokens": { - "description": "min=80.732, mean=110.97, max=137.366, sum=554.851 (5)", - "tab": "General information", - "score": 110.97025108961614 - }, - "WMT 2014 - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_lite/google/gemma-7b/f47ca10d-cd45-485e-b9cf-0c6592d63656.json b/data/helm_lite/google/gemma-7b/f47ca10d-cd45-485e-b9cf-0c6592d63656.json deleted file mode 100644 index 810e32965..000000000 --- a/data/helm_lite/google/gemma-7b/f47ca10d-cd45-485e-b9cf-0c6592d63656.json +++ /dev/null @@ -1,641 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_lite/google_gemma-7b/1770834614.1822479", - "retrieved_timestamp": "1770834614.1822479", - "source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gemma 7B", - "id": "google/gemma-7b", - "developer": "google", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_lite", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.336, - "details": { - "tab": "Accuracy", - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.7896629213483146 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { 
- "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.752, - "details": { - "description": "min=0.752, mean=0.752, max=0.752, sum=0.752 (1)", - "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": { - "description": "min=0.909, mean=0.909, max=0.909, sum=0.909 (1)", - "tab": "Efficiency", - "score": 0.9086058952438999 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=3411.994, mean=3411.994, max=3411.994, sum=3411.994 (1)", - "tab": "General information", - "score": 3411.994366197183 - }, - "NarrativeQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (closed-book)", - "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.336, - "details": { - "description": "min=0.336, mean=0.336, max=0.336, sum=0.336 (1)", - "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time (s)": { - "description": "min=0.591, mean=0.591, max=0.591, sum=0.591 (1)", - "tab": "Efficiency", - "score": 0.5911745510101318 - }, - "NaturalQuestions (closed-book) - Observed inference time (s)": { - "description": "min=0.343, mean=0.343, max=0.343, sum=0.343 (1)", - "tab": "Efficiency", - "score": 0.3430815353393555 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.94, mean=4.94, max=4.94, sum=4.94 (1)", - "tab": "General information", - "score": 4.94 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.01, mean=0.01, max=0.01, sum=0.01 (1)", - "tab": "General information", - "score": 0.01 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1879.978, mean=1879.978, max=1879.978, sum=1879.978 (1)", - "tab": "General information", - "score": 1879.978 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { 
- "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=125.995, mean=125.995, max=125.995, sum=125.995 (1)", - "tab": "General information", - "score": 125.995 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "mode": "closedbook" - } - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.808, - "details": { - "description": "min=0.808, mean=0.808, max=0.808, sum=0.808 (1)", - "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": { - "description": "min=0.282, mean=0.282, max=0.282, sum=0.282 (1)", - "tab": "Efficiency", - "score": 0.28152281618118286 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=240.508, mean=240.508, max=240.508, sum=240.508 (1)", - "tab": "General information", - "score": 240.508 - }, - "OpenbookQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.571, - "details": { - "description": "min=0.28, mean=0.571, max=0.87, sum=2.854 (5)", - "tab": "Accuracy", - "MMLU - Observed inference time (s)": { - "description": "min=0.251, mean=0.273, max=0.293, sum=1.367 (5)", - "tab": "Efficiency", - "score": 0.27346607242550763 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=372.91, mean=473.531, max=626.553, sum=2367.653 (5)", - "tab": "General information", - "score": 
473.5305263157895 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MATH", - "source_data": { - "dataset_name": "MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5, - "details": { - "description": "min=0.3, mean=0.5, max=0.711, sum=3.499 (7)", - "tab": "Accuracy", - "MATH - Observed inference time (s)": { - "description": "min=0.995, mean=1.161, max=1.453, sum=8.127 (7)", - "tab": "Efficiency", - "score": 1.1609408722047545 - }, - "MATH - # eval": { - "description": "min=30, mean=62.429, max=135, sum=437 (7)", - "tab": "General information", - "score": 62.42857142857143 - }, - "MATH - # train": { - "description": "min=8, mean=8, max=8, sum=56 (7)", - "tab": "General information", - "score": 8.0 - }, - "MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "MATH - # prompt tokens": { - "description": "min=938.215, mean=1355.506, max=2348.712, sum=9488.545 (7)", - "tab": "General information", - "score": 1355.5064552904823 - }, - "MATH - # output tokens": { - "description": "min=1, mean=1, max=1, sum=7 (7)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" - } - } - }, - { - "evaluation_name": "GSM8K", - "source_data": { - "dataset_name": "GSM8K", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on GSM8K", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.559, - "details": { - "description": "min=0.559, mean=0.559, max=0.559, sum=0.559 (1)", - "tab": "Accuracy", - "GSM8K - Observed inference time (s)": { - "description": "min=2.025, mean=2.025, max=2.025, sum=2.025 (1)", - "tab": "Efficiency", - "score": 2.024561887741089 - }, - "GSM8K - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "GSM8K - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "GSM8K - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GSM8K - # prompt tokens": { - "description": "min=1151.885, mean=1151.885, max=1151.885, sum=1151.885 (1)", - "tab": "General information", - "score": 1151.885 - }, - "GSM8K - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 
1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "LegalBench", - "source_data": { - "dataset_name": "LegalBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on LegalBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.581, - "details": { - "description": "min=0.379, mean=0.581, max=0.811, sum=2.904 (5)", - "tab": "Accuracy", - "LegalBench - Observed inference time (s)": { - "description": "min=0.295, mean=0.53, max=1.42, sum=2.652 (5)", - "tab": "Efficiency", - "score": 0.5303036133605687 - }, - "LegalBench - # eval": { - "description": "min=95, mean=409.4, max=1000, sum=2047 (5)", - "tab": "General information", - "score": 409.4 - }, - "LegalBench - # train": { - "description": "min=4, mean=4.795, max=5, sum=23.973 (5)", - "tab": "General information", - "score": 4.794693877551021 - }, - "LegalBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "LegalBench - # prompt tokens": { - "description": "min=193.916, mean=1536.557, max=6379.163, sum=7682.787 (5)", - "tab": "General information", - "score": 1536.5573806103425 - }, - "LegalBench - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] - } - } - }, - { - "evaluation_name": "MedQA", - "source_data": { - "dataset_name": "MedQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MedQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.513, - "details": { - "description": "min=0.513, mean=0.513, max=0.513, sum=0.513 (1)", - "tab": "Accuracy", - "MedQA - Observed inference time (s)": { - "description": "min=0.314, mean=0.314, max=0.314, sum=0.314 (1)", - "tab": "Efficiency", - "score": 0.3144090270427302 - }, - "MedQA - # eval": { - "description": "min=503, mean=503, max=503, sum=503 (1)", - "tab": "General information", - "score": 503.0 - }, - "MedQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "MedQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MedQA - # prompt tokens": { - "description": "min=1021.481, mean=1021.481, max=1021.481, sum=1021.481 (1)", - "tab": "General information", - "score": 1021.4811133200795 - }, - "MedQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WMT 2014", - "source_data": { - "dataset_name": "WMT 2014", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - 
"evaluation_description": "BLEU-4 on WMT 2014", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.187, - "details": { - "description": "min=0.137, mean=0.187, max=0.211, sum=0.937 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time (s)": { - "description": "min=0.503, mean=0.524, max=0.541, sum=2.618 (5)", - "tab": "Efficiency", - "score": 0.5235538594776801 - }, - "WMT 2014 - # eval": { - "description": "min=503, mean=568.8, max=832, sum=2844 (5)", - "tab": "General information", - "score": 568.8 - }, - "WMT 2014 - # train": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "WMT 2014 - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "WMT 2014 - # prompt tokens": { - "description": "min=73.732, mean=103.97, max=130.366, sum=519.851 (5)", - "tab": "General information", - "score": 103.97025108961614 - }, - "WMT 2014 - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_lite/google/text-bison@001/7f0e318e-31bf-4044-bffb-357c1238d4fd.json b/data/helm_lite/google/text-bison@001/7f0e318e-31bf-4044-bffb-357c1238d4fd.json deleted file mode 100644 index 30d0e3442..000000000 --- a/data/helm_lite/google/text-bison@001/7f0e318e-31bf-4044-bffb-357c1238d4fd.json +++ /dev/null @@ -1,641 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_lite/google_text-bison@001/1770834614.1822479", - "retrieved_timestamp": "1770834614.1822479", - "source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "PaLM-2 Bison", - "id": "google/text-bison@001", - "developer": "google", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_lite", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.526, - "details": { - "tab": "Accuracy", - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.47540574282147313 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.718, - "details": { - "description": 
"min=0.718, mean=0.718, max=0.718, sum=0.718 (1)", - "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": { - "description": "min=1.031, mean=1.031, max=1.031, sum=1.031 (1)", - "tab": "Efficiency", - "score": 1.030712524602111 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=4414.234, mean=4414.234, max=4414.234, sum=4414.234 (1)", - "tab": "General information", - "score": 4414.2338028169015 - }, - "NarrativeQA - # output tokens": { - "description": "min=7.997, mean=7.997, max=7.997, sum=7.997 (1)", - "tab": "General information", - "score": 7.997183098591549 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (closed-book)", - "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.39, - "details": { - "description": "min=0.39, mean=0.39, max=0.39, sum=0.39 (1)", - "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time (s)": { - "description": "min=0.987, mean=0.987, max=0.987, sum=0.987 (1)", - "tab": "Efficiency", - "score": 0.987217092037201 - }, - "NaturalQuestions (closed-book) - Observed inference time (s)": { - "description": "min=0.755, mean=0.755, max=0.755, sum=0.755 (1)", - "tab": "Efficiency", - "score": 0.754590849161148 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.906, mean=4.906, max=4.906, sum=4.906 (1)", - "tab": "General information", - "score": 4.906 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.015, mean=0.015, max=0.015, sum=0.015 (1)", - "tab": "General information", - "score": 0.015 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=2124.565, mean=2124.565, max=2124.565, sum=2124.565 (1)", - "tab": "General information", - "score": 2124.565 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=7.358, mean=7.358, max=7.358, sum=7.358 (1)", - "tab": "General information", - "score": 7.358 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=190.187, mean=190.187, max=190.187, sum=190.187 
(1)", - "tab": "General information", - "score": 190.187 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=4.48, mean=4.48, max=4.48, sum=4.48 (1)", - "tab": "General information", - "score": 4.48 - } - } - }, - "generation_config": { - "additional_details": { - "mode": "closedbook" - } - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.878, - "details": { - "description": "min=0.878, mean=0.878, max=0.878, sum=0.878 (1)", - "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": { - "description": "min=0.788, mean=0.788, max=0.788, sum=0.788 (1)", - "tab": "Efficiency", - "score": 0.7879144654273987 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=253.308, mean=253.308, max=253.308, sum=253.308 (1)", - "tab": "General information", - "score": 253.308 - }, - "OpenbookQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.608, - "details": { - "description": "min=0.39, mean=0.608, max=0.87, sum=3.038 (5)", - "tab": "Accuracy", - "MMLU - Observed inference time (s)": { - "description": "min=1.017, mean=1.112, max=1.352, sum=5.561 (5)", - "tab": "Efficiency", - "score": 1.1122005350882547 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=360.7, mean=487.294, max=638.088, sum=2436.468 (5)", - "tab": "General information", - "score": 487.29354385964905 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" 
- } - } - }, - { - "evaluation_name": "MATH", - "source_data": { - "dataset_name": "MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.421, - "details": { - "description": "min=0.25, mean=0.421, max=0.558, sum=2.946 (7)", - "tab": "Accuracy", - "MATH - Observed inference time (s)": { - "description": "min=1.161, mean=1.614, max=2.126, sum=11.299 (7)", - "tab": "Efficiency", - "score": 1.6140828338918989 - }, - "MATH - # eval": { - "description": "min=30, mean=62.429, max=135, sum=437 (7)", - "tab": "General information", - "score": 62.42857142857143 - }, - "MATH - # train": { - "description": "min=8, mean=8, max=8, sum=56 (7)", - "tab": "General information", - "score": 8.0 - }, - "MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "MATH - # prompt tokens": { - "description": "min=1004.274, mean=1439.843, max=2386.942, sum=10078.901 (7)", - "tab": "General information", - "score": 1439.842989280994 - }, - "MATH - # output tokens": { - "description": "min=38.4, mean=66.89, max=88.316, sum=468.232 (7)", - "tab": "General information", - "score": 66.89023408252294 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" - } - } - }, - { - "evaluation_name": "GSM8K", - "source_data": { - "dataset_name": "GSM8K", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on GSM8K", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.61, - "details": { - "description": "min=0.61, mean=0.61, max=0.61, sum=0.61 (1)", - "tab": "Accuracy", - "GSM8K - Observed inference time (s)": { - "description": "min=1.44, mean=1.44, max=1.44, sum=1.44 (1)", - "tab": "Efficiency", - "score": 1.4403084371089936 - }, - "GSM8K - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "GSM8K - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "GSM8K - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GSM8K - # prompt tokens": { - "description": "min=1109.549, mean=1109.549, max=1109.549, sum=1109.549 (1)", - "tab": "General information", - "score": 1109.549 - }, - "GSM8K - # output tokens": { - "description": "min=94.258, mean=94.258, max=94.258, sum=94.258 (1)", - "tab": "General information", - "score": 94.258 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "LegalBench", - "source_data": { - "dataset_name": "LegalBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - 
}, - "metric_config": { - "evaluation_description": "EM on LegalBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.645, - "details": { - "description": "min=0.466, mean=0.645, max=0.937, sum=3.224 (5)", - "tab": "Accuracy", - "LegalBench - Observed inference time (s)": { - "description": "min=0.53, mean=0.737, max=1.325, sum=3.683 (5)", - "tab": "Efficiency", - "score": 0.7366328867537384 - }, - "LegalBench - # eval": { - "description": "min=95, mean=409.4, max=1000, sum=2047 (5)", - "tab": "General information", - "score": 409.4 - }, - "LegalBench - # train": { - "description": "min=2.988, mean=4.398, max=5, sum=21.988 (5)", - "tab": "General information", - "score": 4.397551020408163 - }, - "LegalBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "LegalBench - # prompt tokens": { - "description": "min=287.432, mean=1387.966, max=5134.504, sum=6939.831 (5)", - "tab": "General information", - "score": 1387.966233478402 - }, - "LegalBench - # output tokens": { - "description": "min=1, mean=1.389, max=2.347, sum=6.947 (5)", - "tab": "General information", - "score": 1.3893499784884555 - } - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] - } - } - }, - { - "evaluation_name": "MedQA", - "source_data": { - "dataset_name": "MedQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MedQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.547, - "details": { - "description": "min=0.547, mean=0.547, max=0.547, sum=0.547 (1)", - "tab": "Accuracy", - "MedQA - Observed inference time (s)": { - "description": "min=0.735, mean=0.735, max=0.735, sum=0.735 (1)", - "tab": "Efficiency", - "score": 0.7348999071784806 - }, - "MedQA - # eval": { - "description": "min=503, mean=503, max=503, sum=503 (1)", - "tab": "General information", - "score": 503.0 - }, - "MedQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "MedQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MedQA - # prompt tokens": { - "description": "min=1138.622, mean=1138.622, max=1138.622, sum=1138.622 (1)", - "tab": "General information", - "score": 1138.6222664015904 - }, - "MedQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WMT 2014", - "source_data": { - "dataset_name": "WMT 2014", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.241, - "details": { - "description": "min=0.22, mean=0.241, max=0.255, sum=1.204 (5)", - "tab": "Accuracy", - "WMT 
2014 - Observed inference time (s)": { - "description": "min=0.826, mean=0.875, max=0.952, sum=4.377 (5)", - "tab": "Efficiency", - "score": 0.8753595397700126 - }, - "WMT 2014 - # eval": { - "description": "min=503, mean=568.8, max=832, sum=2844 (5)", - "tab": "General information", - "score": 568.8 - }, - "WMT 2014 - # train": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "WMT 2014 - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "WMT 2014 - # prompt tokens": { - "description": "min=145.755, mean=183.587, max=206.169, sum=917.936 (5)", - "tab": "General information", - "score": 183.58714444104604 - }, - "WMT 2014 - # output tokens": { - "description": "min=28.076, mean=29.981, max=31.366, sum=149.905 (5)", - "tab": "General information", - "score": 29.980943664933477 - } - } - }, - "generation_config": { - "additional_details": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_lite/google/text-unicorn@001/818d6d72-0b5c-4fcf-b808-1d186223301e.json b/data/helm_lite/google/text-unicorn@001/818d6d72-0b5c-4fcf-b808-1d186223301e.json deleted file mode 100644 index d5841340f..000000000 --- a/data/helm_lite/google/text-unicorn@001/818d6d72-0b5c-4fcf-b808-1d186223301e.json +++ /dev/null @@ -1,641 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_lite/google_text-unicorn@001/1770834614.1822479", - "retrieved_timestamp": "1770834614.1822479", - "source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "PaLM-2 Unicorn", - "id": "google/text-unicorn@001", - "developer": "google", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_lite", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.644, - "details": { - "tab": "Accuracy", - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.18023720349563047 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.583, - "details": { - "description": "min=0.583, mean=0.583, max=0.583, sum=0.583 (1)", - "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": { - "description": "min=3.283, mean=3.283, max=3.283, sum=3.283 (1)", - "tab": "Efficiency", - "score": 3.283053755424392 - }, - 
"NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=4414.234, mean=4414.234, max=4414.234, sum=4414.234 (1)", - "tab": "General information", - "score": 4414.2338028169015 - }, - "NarrativeQA - # output tokens": { - "description": "min=16.544, mean=16.544, max=16.544, sum=16.544 (1)", - "tab": "General information", - "score": 16.543661971830986 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (closed-book)", - "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.435, - "details": { - "description": "min=0.435, mean=0.435, max=0.435, sum=0.435 (1)", - "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time (s)": { - "description": "min=2.564, mean=2.564, max=2.564, sum=2.564 (1)", - "tab": "Efficiency", - "score": 2.564493465423584 - }, - "NaturalQuestions (closed-book) - Observed inference time (s)": { - "description": "min=1.56, mean=1.56, max=1.56, sum=1.56 (1)", - "tab": "Efficiency", - "score": 1.5603588831424713 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.906, mean=4.906, max=4.906, sum=4.906 (1)", - "tab": "General information", - "score": 4.906 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.015, mean=0.015, max=0.015, sum=0.015 (1)", - "tab": "General information", - "score": 0.015 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=2124.565, mean=2124.565, max=2124.565, sum=2124.565 (1)", - "tab": "General information", - "score": 2124.565 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=13.327, mean=13.327, max=13.327, sum=13.327 (1)", - "tab": "General information", - "score": 13.327 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=190.187, mean=190.187, max=190.187, sum=190.187 (1)", - "tab": "General information", - "score": 190.187 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=9.803, mean=9.803, max=9.803, sum=9.803 (1)", - "tab": "General information", - "score": 9.803 - } 
- } - }, - "generation_config": { - "additional_details": { - "mode": "closedbook" - } - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.938, - "details": { - "description": "min=0.938, mean=0.938, max=0.938, sum=0.938 (1)", - "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": { - "description": "min=0.999, mean=0.999, max=0.999, sum=0.999 (1)", - "tab": "Efficiency", - "score": 0.9994440112113953 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=253.308, mean=253.308, max=253.308, sum=253.308 (1)", - "tab": "General information", - "score": 253.308 - }, - "OpenbookQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.702, - "details": { - "description": "min=0.53, mean=0.702, max=0.96, sum=3.509 (5)", - "tab": "Accuracy", - "MMLU - Observed inference time (s)": { - "description": "min=1.198, mean=1.262, max=1.332, sum=6.31 (5)", - "tab": "Efficiency", - "score": 1.2620431824148748 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=360.7, mean=487.294, max=638.088, sum=2436.468 (5)", - "tab": "General information", - "score": 487.29354385964905 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MATH", - "source_data": { - "dataset_name": "MATH", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.674, - "details": { - "description": "min=0.526, mean=0.674, max=0.867, sum=4.716 (7)", - "tab": "Accuracy", - "MATH - Observed inference time (s)": { - "description": "min=4.016, mean=4.636, max=5.654, sum=32.454 (7)", - "tab": "Efficiency", - "score": 4.636334307701402 - }, - "MATH - # eval": { - "description": "min=30, mean=62.429, max=135, sum=437 (7)", - "tab": "General information", - "score": 62.42857142857143 - }, - "MATH - # train": { - "description": "min=8, mean=8, max=8, sum=56 (7)", - "tab": "General information", - "score": 8.0 - }, - "MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "MATH - # prompt tokens": { - "description": "min=1004.274, mean=1439.843, max=2386.942, sum=10078.901 (7)", - "tab": "General information", - "score": 1439.842989280994 - }, - "MATH - # output tokens": { - "description": "min=59.9, mean=80.458, max=98.342, sum=563.207 (7)", - "tab": "General information", - "score": 80.45819114472725 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" - } - } - }, - { - "evaluation_name": "GSM8K", - "source_data": { - "dataset_name": "GSM8K", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on GSM8K", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.831, - "details": { - "description": "min=0.831, mean=0.831, max=0.831, sum=0.831 (1)", - "tab": "Accuracy", - "GSM8K - Observed inference time (s)": { - "description": "min=5.437, mean=5.437, max=5.437, sum=5.437 (1)", - "tab": "Efficiency", - "score": 5.4373185629844665 - }, - "GSM8K - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "GSM8K - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "GSM8K - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GSM8K - # prompt tokens": { - "description": "min=1109.549, mean=1109.549, max=1109.549, sum=1109.549 (1)", - "tab": "General information", - "score": 1109.549 - }, - "GSM8K - # output tokens": { - "description": "min=93.764, mean=93.764, max=93.764, sum=93.764 (1)", - "tab": "General information", - "score": 93.764 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "LegalBench", - "source_data": { - "dataset_name": "LegalBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on LegalBench", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.677, - "details": { - "description": "min=0.452, mean=0.677, max=0.926, sum=3.387 (5)", - "tab": "Accuracy", - "LegalBench - Observed inference time (s)": { - "description": "min=0.859, mean=1.437, max=3.198, sum=7.187 (5)", - "tab": "Efficiency", - "score": 1.4374773445647835 - }, - "LegalBench - # eval": { - "description": "min=95, mean=409.4, max=1000, sum=2047 (5)", - "tab": "General information", - "score": 409.4 - }, - "LegalBench - # train": { - "description": "min=2.988, mean=4.398, max=5, sum=21.988 (5)", - "tab": "General information", - "score": 4.397551020408163 - }, - "LegalBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "LegalBench - # prompt tokens": { - "description": "min=287.432, mean=1387.966, max=5134.504, sum=6939.831 (5)", - "tab": "General information", - "score": 1387.966233478402 - }, - "LegalBench - # output tokens": { - "description": "min=1, mean=1.364, max=2.2, sum=6.821 (5)", - "tab": "General information", - "score": 1.3642506811989101 - } - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] - } - } - }, - { - "evaluation_name": "MedQA", - "source_data": { - "dataset_name": "MedQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MedQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.684, - "details": { - "description": "min=0.684, mean=0.684, max=0.684, sum=0.684 (1)", - "tab": "Accuracy", - "MedQA - Observed inference time (s)": { - "description": "min=1.178, mean=1.178, max=1.178, sum=1.178 (1)", - "tab": "Efficiency", - "score": 1.1783231205305096 - }, - "MedQA - # eval": { - "description": "min=503, mean=503, max=503, sum=503 (1)", - "tab": "General information", - "score": 503.0 - }, - "MedQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "MedQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MedQA - # prompt tokens": { - "description": "min=1138.622, mean=1138.622, max=1138.622, sum=1138.622 (1)", - "tab": "General information", - "score": 1138.6222664015904 - }, - "MedQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WMT 2014", - "source_data": { - "dataset_name": "WMT 2014", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.26, - "details": { - "description": "min=0.236, mean=0.26, max=0.279, sum=1.298 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time (s)": { - "description": "min=1.706, mean=1.801, max=1.909, sum=9.006 (5)", - "tab": 
"Efficiency", - "score": 1.801295139912888 - }, - "WMT 2014 - # eval": { - "description": "min=503, mean=568.8, max=832, sum=2844 (5)", - "tab": "General information", - "score": 568.8 - }, - "WMT 2014 - # train": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "WMT 2014 - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "WMT 2014 - # prompt tokens": { - "description": "min=145.755, mean=183.587, max=206.169, sum=917.936 (5)", - "tab": "General information", - "score": 183.58714444104604 - }, - "WMT 2014 - # output tokens": { - "description": "min=28.596, mean=30.567, max=31.734, sum=152.836 (5)", - "tab": "General information", - "score": 30.567241263954735 - } - } - }, - "generation_config": { - "additional_details": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_lite/meta/llama-2-13b/f09b853b-dbbc-4252-a0f0-a2c45c29f670.json b/data/helm_lite/meta/llama-2-13b/f09b853b-dbbc-4252-a0f0-a2c45c29f670.json deleted file mode 100644 index 079c14180..000000000 --- a/data/helm_lite/meta/llama-2-13b/f09b853b-dbbc-4252-a0f0-a2c45c29f670.json +++ /dev/null @@ -1,641 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_lite/meta_llama-2-13b/1770834614.1822479", - "retrieved_timestamp": "1770834614.1822479", - "source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama 2 13B", - "id": "meta/llama-2-13b", - "developer": "meta", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_lite", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.233, - "details": { - "tab": "Accuracy", - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.7253183520599251 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.741, - "details": { - "description": "min=0.741, mean=0.741, max=0.741, sum=0.741 (1)", - "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": { - "description": "min=0.795, mean=0.795, max=0.795, sum=0.795 (1)", - "tab": "Efficiency", - "score": 0.7950913200915699 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # 
train": { - "description": "min=4.408, mean=4.408, max=4.408, sum=4.408 (1)", - "tab": "General information", - "score": 4.408450704225352 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=3669.808, mean=3669.808, max=3669.808, sum=3669.808 (1)", - "tab": "General information", - "score": 3669.8084507042254 - }, - "NarrativeQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (closed-book)", - "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.371, - "details": { - "description": "min=0.371, mean=0.371, max=0.371, sum=0.371 (1)", - "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time (s)": { - "description": "min=0.579, mean=0.579, max=0.579, sum=0.579 (1)", - "tab": "Efficiency", - "score": 0.5793666501045227 - }, - "NaturalQuestions (closed-book) - Observed inference time (s)": { - "description": "min=0.384, mean=0.384, max=0.384, sum=0.384 (1)", - "tab": "Efficiency", - "score": 0.3839698841571808 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.831, mean=4.831, max=4.831, sum=4.831 (1)", - "tab": "General information", - "score": 4.831 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.026, mean=0.026, max=0.026, sum=0.026 (1)", - "tab": "General information", - "score": 0.026 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=2289.357, mean=2289.357, max=2289.357, sum=2289.357 (1)", - "tab": "General information", - "score": 2289.357 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=0.986, mean=0.986, max=0.986, sum=0.986 (1)", - "tab": "General information", - "score": 0.986 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=137.383, mean=137.383, max=137.383, sum=137.383 (1)", - "tab": "General information", - "score": 137.383 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "mode": "closedbook" - } - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": 
"OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.634, - "details": { - "description": "min=0.634, mean=0.634, max=0.634, sum=0.634 (1)", - "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": { - "description": "min=0.347, mean=0.347, max=0.347, sum=0.347 (1)", - "tab": "Efficiency", - "score": 0.34700755834579466 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=282.574, mean=282.574, max=282.574, sum=282.574 (1)", - "tab": "General information", - "score": 282.574 - }, - "OpenbookQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.505, - "details": { - "description": "min=0.28, mean=0.505, max=0.84, sum=2.527 (5)", - "tab": "Accuracy", - "MMLU - Observed inference time (s)": { - "description": "min=0.359, mean=0.374, max=0.383, sum=1.872 (5)", - "tab": "Efficiency", - "score": 0.37437369656144526 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=397.65, mean=522.547, max=684.675, sum=2612.735 (5)", - "tab": "General information", - "score": 522.5470877192982 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MATH", - "source_data": { - "dataset_name": "MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 
0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.102, - "details": { - "description": "min=0, mean=0.102, max=0.193, sum=0.715 (7)", - "tab": "Accuracy", - "MATH - Observed inference time (s)": { - "description": "min=1.083, mean=1.516, max=1.771, sum=10.613 (7)", - "tab": "Efficiency", - "score": 1.5161172209789922 - }, - "MATH - # eval": { - "description": "min=30, mean=62.429, max=135, sum=437 (7)", - "tab": "General information", - "score": 62.42857142857143 - }, - "MATH - # train": { - "description": "min=8, mean=8, max=8, sum=56 (7)", - "tab": "General information", - "score": 8.0 - }, - "MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "MATH - # prompt tokens": { - "description": "min=971.652, mean=1438.636, max=2490.962, sum=10070.453 (7)", - "tab": "General information", - "score": 1438.6362030100095 - }, - "MATH - # output tokens": { - "description": "min=1, mean=1, max=1, sum=7 (7)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" - } - } - }, - { - "evaluation_name": "GSM8K", - "source_data": { - "dataset_name": "GSM8K", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on GSM8K", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.266, - "details": { - "description": "min=0.266, mean=0.266, max=0.266, sum=0.266 (1)", - "tab": "Accuracy", - "GSM8K - Observed inference time (s)": { - "description": "min=1.737, mean=1.737, max=1.737, sum=1.737 (1)", - "tab": "Efficiency", - "score": 1.7367573575973512 - }, - "GSM8K - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "GSM8K - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "GSM8K - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GSM8K - # prompt tokens": { - "description": "min=1207.746, mean=1207.746, max=1207.746, sum=1207.746 (1)", - "tab": "General information", - "score": 1207.746 - }, - "GSM8K - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "LegalBench", - "source_data": { - "dataset_name": "LegalBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on LegalBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.591, - "details": { - "description": "min=0.338, mean=0.591, max=0.779, sum=2.955 (5)", - "tab": "Accuracy", - "LegalBench - Observed inference time (s)": { - "description": "min=0.331, mean=0.438, max=0.729, sum=2.189 (5)", - "tab": 
"Efficiency", - "score": 0.43780977145306127 - }, - "LegalBench - # eval": { - "description": "min=95, mean=409.4, max=1000, sum=2047 (5)", - "tab": "General information", - "score": 409.4 - }, - "LegalBench - # train": { - "description": "min=1.886, mean=4.177, max=5, sum=20.886 (5)", - "tab": "General information", - "score": 4.177142857142857 - }, - "LegalBench - truncated": { - "description": "min=0, mean=0.001, max=0.004, sum=0.004 (5)", - "tab": "General information", - "score": 0.0008163265306122449 - }, - "LegalBench - # prompt tokens": { - "description": "min=222.137, mean=1027.35, max=3642.378, sum=5136.751 (5)", - "tab": "General information", - "score": 1027.3502076083553 - }, - "LegalBench - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] - } - } - }, - { - "evaluation_name": "MedQA", - "source_data": { - "dataset_name": "MedQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MedQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.392, - "details": { - "description": "min=0.392, mean=0.392, max=0.392, sum=0.392 (1)", - "tab": "Accuracy", - "MedQA - Observed inference time (s)": { - "description": "min=0.459, mean=0.459, max=0.459, sum=0.459 (1)", - "tab": "Efficiency", - "score": 0.4588449499005115 - }, - "MedQA - # eval": { - "description": "min=503, mean=503, max=503, sum=503 (1)", - "tab": "General information", - "score": 503.0 - }, - "MedQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "MedQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MedQA - # prompt tokens": { - "description": "min=1234.901, mean=1234.901, max=1234.901, sum=1234.901 (1)", - "tab": "General information", - "score": 1234.9005964214712 - }, - "MedQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WMT 2014", - "source_data": { - "dataset_name": "WMT 2014", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.167, - "details": { - "description": "min=0.074, mean=0.167, max=0.209, sum=0.836 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time (s)": { - "description": "min=0.557, mean=0.691, max=0.814, sum=3.456 (5)", - "tab": "Efficiency", - "score": 0.6911807014709866 - }, - "WMT 2014 - # eval": { - "description": "min=503, mean=568.8, max=832, sum=2844 (5)", - "tab": "General information", - "score": 568.8 - }, - "WMT 2014 - # train": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, 
- "WMT 2014 - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "WMT 2014 - # prompt tokens": { - "description": "min=127.523, mean=142.288, max=164.972, sum=711.438 (5)", - "tab": "General information", - "score": 142.28751290334915 - }, - "WMT 2014 - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_lite/meta/llama-2-70b/f84d3cf5-0f7d-481e-b782-a5c98cf9faec.json b/data/helm_lite/meta/llama-2-70b/f84d3cf5-0f7d-481e-b782-a5c98cf9faec.json deleted file mode 100644 index 8faa07285..000000000 --- a/data/helm_lite/meta/llama-2-70b/f84d3cf5-0f7d-481e-b782-a5c98cf9faec.json +++ /dev/null @@ -1,641 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_lite/meta_llama-2-70b/1770834614.1822479", - "retrieved_timestamp": "1770834614.1822479", - "source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama 2 70B", - "id": "meta/llama-2-70b", - "developer": "meta", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_lite", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.482, - "details": { - "tab": "Accuracy", - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.3882646691635456 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.763, - "details": { - "description": "min=0.763, mean=0.763, max=0.763, sum=0.763 (1)", - "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": { - "description": "min=1.871, mean=1.871, max=1.871, sum=1.871 (1)", - "tab": "Efficiency", - "score": 1.8709671289148464 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=4.408, mean=4.408, max=4.408, sum=4.408 (1)", - "tab": "General information", - "score": 4.408450704225352 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=3669.808, 
mean=3669.808, max=3669.808, sum=3669.808 (1)", - "tab": "General information", - "score": 3669.8084507042254 - }, - "NarrativeQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (closed-book)", - "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.46, - "details": { - "description": "min=0.46, mean=0.46, max=0.46, sum=0.46 (1)", - "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time (s)": { - "description": "min=1.278, mean=1.278, max=1.278, sum=1.278 (1)", - "tab": "Efficiency", - "score": 1.277897496700287 - }, - "NaturalQuestions (closed-book) - Observed inference time (s)": { - "description": "min=0.818, mean=0.818, max=0.818, sum=0.818 (1)", - "tab": "Efficiency", - "score": 0.8177921280860901 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.831, mean=4.831, max=4.831, sum=4.831 (1)", - "tab": "General information", - "score": 4.831 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.026, mean=0.026, max=0.026, sum=0.026 (1)", - "tab": "General information", - "score": 0.026 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=2289.357, mean=2289.357, max=2289.357, sum=2289.357 (1)", - "tab": "General information", - "score": 2289.357 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=0.996, mean=0.996, max=0.996, sum=0.996 (1)", - "tab": "General information", - "score": 0.996 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=137.383, mean=137.383, max=137.383, sum=137.383 (1)", - "tab": "General information", - "score": 137.383 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "mode": "closedbook" - } - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.838, - "details": { - "description": "min=0.838, mean=0.838, max=0.838, sum=0.838 (1)", - "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": { - "description": "min=0.656, mean=0.656, max=0.656, sum=0.656 (1)", - "tab": "Efficiency", - "score": 0.6557973260879517 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=282.574, mean=282.574, max=282.574, sum=282.574 (1)", - "tab": "General information", - "score": 282.574 - }, - "OpenbookQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.58, - "details": { - "description": "min=0.31, mean=0.58, max=0.92, sum=2.902 (5)", - "tab": "Accuracy", - "MMLU - Observed inference time (s)": { - "description": "min=0.465, mean=0.501, max=0.56, sum=2.507 (5)", - "tab": "Efficiency", - "score": 0.5013968416013215 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=397.65, mean=522.547, max=684.675, sum=2612.735 (5)", - "tab": "General information", - "score": 522.5470877192982 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MATH", - "source_data": { - "dataset_name": "MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.323, - "details": { - "description": "min=0.205, mean=0.323, max=0.489, sum=2.26 (7)", - "tab": "Accuracy", - "MATH - Observed inference time (s)": { - "description": "min=1.813, mean=2.443, max=3.147, sum=17.103 (7)", - "tab": "Efficiency", - "score": 2.4432508421434598 - }, - "MATH - # 
eval": { - "description": "min=30, mean=62.429, max=135, sum=437 (7)", - "tab": "General information", - "score": 62.42857142857143 - }, - "MATH - # train": { - "description": "min=8, mean=8, max=8, sum=56 (7)", - "tab": "General information", - "score": 8.0 - }, - "MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "MATH - # prompt tokens": { - "description": "min=971.652, mean=1438.636, max=2490.962, sum=10070.453 (7)", - "tab": "General information", - "score": 1438.6362030100095 - }, - "MATH - # output tokens": { - "description": "min=1, mean=1, max=1, sum=7 (7)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" - } - } - }, - { - "evaluation_name": "GSM8K", - "source_data": { - "dataset_name": "GSM8K", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on GSM8K", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.567, - "details": { - "description": "min=0.567, mean=0.567, max=0.567, sum=0.567 (1)", - "tab": "Accuracy", - "GSM8K - Observed inference time (s)": { - "description": "min=3.737, mean=3.737, max=3.737, sum=3.737 (1)", - "tab": "Efficiency", - "score": 3.737159442663193 - }, - "GSM8K - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "GSM8K - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "GSM8K - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GSM8K - # prompt tokens": { - "description": "min=1207.746, mean=1207.746, max=1207.746, sum=1207.746 (1)", - "tab": "General information", - "score": 1207.746 - }, - "GSM8K - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "LegalBench", - "source_data": { - "dataset_name": "LegalBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on LegalBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.673, - "details": { - "description": "min=0.444, mean=0.673, max=0.937, sum=3.363 (5)", - "tab": "Accuracy", - "LegalBench - Observed inference time (s)": { - "description": "min=0.448, mean=0.759, max=1.744, sum=3.796 (5)", - "tab": "Efficiency", - "score": 0.7591354159811778 - }, - "LegalBench - # eval": { - "description": "min=95, mean=409.4, max=1000, sum=2047 (5)", - "tab": "General information", - "score": 409.4 - }, - "LegalBench - # train": { - "description": "min=1.886, mean=4.177, max=5, sum=20.886 (5)", - "tab": "General information", - "score": 4.177142857142857 - }, - 
"LegalBench - truncated": { - "description": "min=0, mean=0.001, max=0.004, sum=0.004 (5)", - "tab": "General information", - "score": 0.0008163265306122449 - }, - "LegalBench - # prompt tokens": { - "description": "min=222.137, mean=1027.35, max=3642.378, sum=5136.751 (5)", - "tab": "General information", - "score": 1027.3502076083553 - }, - "LegalBench - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] - } - } - }, - { - "evaluation_name": "MedQA", - "source_data": { - "dataset_name": "MedQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MedQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.618, - "details": { - "description": "min=0.618, mean=0.618, max=0.618, sum=0.618 (1)", - "tab": "Accuracy", - "MedQA - Observed inference time (s)": { - "description": "min=0.971, mean=0.971, max=0.971, sum=0.971 (1)", - "tab": "Efficiency", - "score": 0.9713700282170806 - }, - "MedQA - # eval": { - "description": "min=503, mean=503, max=503, sum=503 (1)", - "tab": "General information", - "score": 503.0 - }, - "MedQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "MedQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MedQA - # prompt tokens": { - "description": "min=1234.901, mean=1234.901, max=1234.901, sum=1234.901 (1)", - "tab": "General information", - "score": 1234.9005964214712 - }, - "MedQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WMT 2014", - "source_data": { - "dataset_name": "WMT 2014", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.196, - "details": { - "description": "min=0.12, mean=0.196, max=0.233, sum=0.979 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time (s)": { - "description": "min=0.809, mean=1.074, max=1.477, sum=5.368 (5)", - "tab": "Efficiency", - "score": 1.0736038563633745 - }, - "WMT 2014 - # eval": { - "description": "min=503, mean=568.8, max=832, sum=2844 (5)", - "tab": "General information", - "score": 568.8 - }, - "WMT 2014 - # train": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "WMT 2014 - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "WMT 2014 - # prompt tokens": { - "description": "min=127.523, mean=142.288, max=164.972, sum=711.438 (5)", - "tab": "General information", - "score": 142.28751290334915 - }, - "WMT 2014 - # output tokens": { - 
"description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_lite/meta/llama-2-7b/83c6a723-87a0-43d4-968e-86d186578e9e.json b/data/helm_lite/meta/llama-2-7b/83c6a723-87a0-43d4-968e-86d186578e9e.json deleted file mode 100644 index bb2c02730..000000000 --- a/data/helm_lite/meta/llama-2-7b/83c6a723-87a0-43d4-968e-86d186578e9e.json +++ /dev/null @@ -1,641 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_lite/meta_llama-2-7b/1770834614.1822479", - "retrieved_timestamp": "1770834614.1822479", - "source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama 2 7B", - "id": "meta/llama-2-7b", - "developer": "meta", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_lite", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.152, - "details": { - "tab": "Accuracy", - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.6685767790262173 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.686, - "details": { - "description": "min=0.686, mean=0.686, max=0.686, sum=0.686 (1)", - "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": { - "description": "min=0.852, mean=0.852, max=0.852, sum=0.852 (1)", - "tab": "Efficiency", - "score": 0.8524049973823655 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=4.408, mean=4.408, max=4.408, sum=4.408 (1)", - "tab": "General information", - "score": 4.408450704225352 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=3669.808, mean=3669.808, max=3669.808, sum=3669.808 (1)", - "tab": "General information", - "score": 3669.8084507042254 - }, - "NarrativeQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": 
"NaturalQuestions (closed-book)", - "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.333, - "details": { - "description": "min=0.333, mean=0.333, max=0.333, sum=0.333 (1)", - "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time (s)": { - "description": "min=0.584, mean=0.584, max=0.584, sum=0.584 (1)", - "tab": "Efficiency", - "score": 0.584290323972702 - }, - "NaturalQuestions (closed-book) - Observed inference time (s)": { - "description": "min=0.479, mean=0.479, max=0.479, sum=0.479 (1)", - "tab": "Efficiency", - "score": 0.47909903168678286 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.831, mean=4.831, max=4.831, sum=4.831 (1)", - "tab": "General information", - "score": 4.831 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.026, mean=0.026, max=0.026, sum=0.026 (1)", - "tab": "General information", - "score": 0.026 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=2289.357, mean=2289.357, max=2289.357, sum=2289.357 (1)", - "tab": "General information", - "score": 2289.357 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=0.958, mean=0.958, max=0.958, sum=0.958 (1)", - "tab": "General information", - "score": 0.958 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=137.383, mean=137.383, max=137.383, sum=137.383 (1)", - "tab": "General information", - "score": 137.383 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=0.996, mean=0.996, max=0.996, sum=0.996 (1)", - "tab": "General information", - "score": 0.996 - } - } - }, - "generation_config": { - "additional_details": { - "mode": "closedbook" - } - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.544, - "details": { - "description": "min=0.544, mean=0.544, max=0.544, sum=0.544 (1)", - "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": { - "description": "min=0.393, mean=0.393, max=0.393, sum=0.393 (1)", - "tab": "Efficiency", - "score": 0.3927152595520019 - }, - "OpenbookQA - # eval": 
{ - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=282.574, mean=282.574, max=282.574, sum=282.574 (1)", - "tab": "General information", - "score": 282.574 - }, - "OpenbookQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.425, - "details": { - "description": "min=0.27, mean=0.425, max=0.63, sum=2.125 (5)", - "tab": "Accuracy", - "MMLU - Observed inference time (s)": { - "description": "min=0.314, mean=0.33, max=0.349, sum=1.651 (5)", - "tab": "Efficiency", - "score": 0.33028721380233766 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=397.65, mean=522.547, max=684.675, sum=2612.735 (5)", - "tab": "General information", - "score": 522.5470877192982 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MATH", - "source_data": { - "dataset_name": "MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.097, - "details": { - "description": "min=0.019, mean=0.097, max=0.198, sum=0.68 (7)", - "tab": "Accuracy", - "MATH - Observed inference time (s)": { - "description": "min=1.362, mean=2.66, max=5.271, sum=18.621 (7)", - "tab": "Efficiency", - "score": 2.6600816047289086 - }, - "MATH - # eval": { - "description": "min=30, mean=62.429, max=135, sum=437 (7)", - "tab": "General information", - "score": 62.42857142857143 - }, - "MATH - # train": { - "description": "min=8, mean=8, max=8, sum=56 (7)", - "tab": "General information", - "score": 8.0 - }, - "MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", 
- "tab": "General information", - "score": 0.0 - }, - "MATH - # prompt tokens": { - "description": "min=971.652, mean=1438.636, max=2490.962, sum=10070.453 (7)", - "tab": "General information", - "score": 1438.6362030100095 - }, - "MATH - # output tokens": { - "description": "min=1, mean=1, max=1, sum=7 (7)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" - } - } - }, - { - "evaluation_name": "GSM8K", - "source_data": { - "dataset_name": "GSM8K", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on GSM8K", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.154, - "details": { - "description": "min=0.154, mean=0.154, max=0.154, sum=0.154 (1)", - "tab": "Accuracy", - "GSM8K - Observed inference time (s)": { - "description": "min=1.96, mean=1.96, max=1.96, sum=1.96 (1)", - "tab": "Efficiency", - "score": 1.95984334897995 - }, - "GSM8K - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "GSM8K - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "GSM8K - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GSM8K - # prompt tokens": { - "description": "min=1207.746, mean=1207.746, max=1207.746, sum=1207.746 (1)", - "tab": "General information", - "score": 1207.746 - }, - "GSM8K - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "LegalBench", - "source_data": { - "dataset_name": "LegalBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on LegalBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.502, - "details": { - "description": "min=0.245, mean=0.502, max=0.747, sum=2.508 (5)", - "tab": "Accuracy", - "LegalBench - Observed inference time (s)": { - "description": "min=0.306, mean=0.428, max=0.76, sum=2.139 (5)", - "tab": "Efficiency", - "score": 0.4277655324222306 - }, - "LegalBench - # eval": { - "description": "min=95, mean=409.4, max=1000, sum=2047 (5)", - "tab": "General information", - "score": 409.4 - }, - "LegalBench - # train": { - "description": "min=1.886, mean=4.177, max=5, sum=20.886 (5)", - "tab": "General information", - "score": 4.177142857142857 - }, - "LegalBench - truncated": { - "description": "min=0, mean=0.001, max=0.004, sum=0.004 (5)", - "tab": "General information", - "score": 0.0008163265306122449 - }, - "LegalBench - # prompt tokens": { - "description": "min=222.137, mean=1027.35, max=3642.378, sum=5136.751 (5)", - "tab": "General information", - "score": 1027.3502076083553 - }, - 
"LegalBench - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] - } - } - }, - { - "evaluation_name": "MedQA", - "source_data": { - "dataset_name": "MedQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MedQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.392, - "details": { - "description": "min=0.392, mean=0.392, max=0.392, sum=0.392 (1)", - "tab": "Accuracy", - "MedQA - Observed inference time (s)": { - "description": "min=0.467, mean=0.467, max=0.467, sum=0.467 (1)", - "tab": "Efficiency", - "score": 0.46650436763497993 - }, - "MedQA - # eval": { - "description": "min=503, mean=503, max=503, sum=503 (1)", - "tab": "General information", - "score": 503.0 - }, - "MedQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "MedQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MedQA - # prompt tokens": { - "description": "min=1234.901, mean=1234.901, max=1234.901, sum=1234.901 (1)", - "tab": "General information", - "score": 1234.9005964214712 - }, - "MedQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WMT 2014", - "source_data": { - "dataset_name": "WMT 2014", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.144, - "details": { - "description": "min=0.046, mean=0.144, max=0.189, sum=0.72 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time (s)": { - "description": "min=0.582, mean=0.697, max=0.802, sum=3.486 (5)", - "tab": "Efficiency", - "score": 0.697166075241057 - }, - "WMT 2014 - # eval": { - "description": "min=503, mean=568.8, max=832, sum=2844 (5)", - "tab": "General information", - "score": 568.8 - }, - "WMT 2014 - # train": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "WMT 2014 - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "WMT 2014 - # prompt tokens": { - "description": "min=127.523, mean=142.288, max=164.972, sum=711.438 (5)", - "tab": "General information", - "score": 142.28751290334915 - }, - "WMT 2014 - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] - } - } - } - ] -} \ No newline at end of file diff --git 
a/data/helm_lite/meta/llama-3-70b/daaf221b-1759-4619-91fb-938e81975787.json b/data/helm_lite/meta/llama-3-70b/daaf221b-1759-4619-91fb-938e81975787.json deleted file mode 100644 index 876850010..000000000 --- a/data/helm_lite/meta/llama-3-70b/daaf221b-1759-4619-91fb-938e81975787.json +++ /dev/null @@ -1,641 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_lite/meta_llama-3-70b/1770834614.1822479", - "retrieved_timestamp": "1770834614.1822479", - "source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama 3 70B", - "id": "meta/llama-3-70b", - "developer": "meta", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_lite", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.793, - "details": { - "tab": "Accuracy", - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.3926217228464419 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.798, - "details": { - "description": "min=0.798, mean=0.798, max=0.798, sum=0.798 (1)", - "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": { - "description": "min=1.795, mean=1.795, max=1.795, sum=1.795 (1)", - "tab": "Efficiency", - "score": 1.7946508300136512 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=3460.268, mean=3460.268, max=3460.268, sum=3460.268 (1)", - "tab": "General information", - "score": 3460.2676056338028 - }, - "NarrativeQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (closed-book)", - "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on 
NaturalQuestions (closed-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.475, - "details": { - "description": "min=0.475, mean=0.475, max=0.475, sum=0.475 (1)", - "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time (s)": { - "description": "min=1.212, mean=1.212, max=1.212, sum=1.212 (1)", - "tab": "Efficiency", - "score": 1.211742308139801 - }, - "NaturalQuestions (closed-book) - Observed inference time (s)": { - "description": "min=0.558, mean=0.558, max=0.558, sum=0.558 (1)", - "tab": "Efficiency", - "score": 0.5584413967132569 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.965, mean=4.965, max=4.965, sum=4.965 (1)", - "tab": "General information", - "score": 4.965 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.007, mean=0.007, max=0.007, sum=0.007 (1)", - "tab": "General information", - "score": 0.007 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1658.348, mean=1658.348, max=1658.348, sum=1658.348 (1)", - "tab": "General information", - "score": 1658.348 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=112.12, mean=112.12, max=112.12, sum=112.12 (1)", - "tab": "General information", - "score": 112.12 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "mode": "closedbook" - } - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.934, - "details": { - "description": "min=0.934, mean=0.934, max=0.934, sum=0.934 (1)", - "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": { - "description": "min=0.352, mean=0.352, max=0.352, sum=0.352 (1)", - "tab": "Efficiency", - "score": 0.35184384298324584 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - 
"score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=242.776, mean=242.776, max=242.776, sum=242.776 (1)", - "tab": "General information", - "score": 242.776 - }, - "OpenbookQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.695, - "details": { - "description": "min=0.43, mean=0.695, max=0.94, sum=3.473 (5)", - "tab": "Accuracy", - "MMLU - Observed inference time (s)": { - "description": "min=0.387, mean=0.404, max=0.432, sum=2.021 (5)", - "tab": "Efficiency", - "score": 0.40422279727668087 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=366.43, mean=460.686, max=607.421, sum=2303.431 (5)", - "tab": "General information", - "score": 460.6862105263158 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MATH", - "source_data": { - "dataset_name": "MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.663, - "details": { - "description": "min=0.433, mean=0.663, max=0.822, sum=4.641 (7)", - "tab": "Accuracy", - "MATH - Observed inference time (s)": { - "description": "min=14.895, mean=15.819, max=17.569, sum=110.731 (7)", - "tab": "Efficiency", - "score": 15.818764438908431 - }, - "MATH - # eval": { - "description": "min=30, mean=62.429, max=135, sum=437 (7)", - "tab": "General information", - "score": 62.42857142857143 - }, - "MATH - # train": { - "description": "min=8, mean=8, max=8, sum=56 (7)", - "tab": "General information", - "score": 8.0 - }, - "MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "MATH - # prompt tokens": { - "description": "min=881.363, mean=1262.909, max=2197.577, sum=8840.364 (7)", - "tab": "General information", - "score": 1262.9092130545007 - }, - "MATH - # output tokens": { - "description": "min=1, mean=1, max=1, sum=7 (7)", - "tab": "General information", - "score": 1.0 
- } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" - } - } - }, - { - "evaluation_name": "GSM8K", - "source_data": { - "dataset_name": "GSM8K", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on GSM8K", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.805, - "details": { - "description": "min=0.805, mean=0.805, max=0.805, sum=0.805 (1)", - "tab": "Accuracy", - "GSM8K - Observed inference time (s)": { - "description": "min=4.2, mean=4.2, max=4.2, sum=4.2 (1)", - "tab": "Efficiency", - "score": 4.199564570903778 - }, - "GSM8K - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "GSM8K - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "GSM8K - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GSM8K - # prompt tokens": { - "description": "min=959.032, mean=959.032, max=959.032, sum=959.032 (1)", - "tab": "General information", - "score": 959.032 - }, - "GSM8K - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "LegalBench", - "source_data": { - "dataset_name": "LegalBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on LegalBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.733, - "details": { - "description": "min=0.466, mean=0.733, max=0.958, sum=3.665 (5)", - "tab": "Accuracy", - "LegalBench - Observed inference time (s)": { - "description": "min=0.416, mean=0.87, max=2.556, sum=4.352 (5)", - "tab": "Efficiency", - "score": 0.8703131128024035 - }, - "LegalBench - # eval": { - "description": "min=95, mean=409.4, max=1000, sum=2047 (5)", - "tab": "General information", - "score": 409.4 - }, - "LegalBench - # train": { - "description": "min=4, mean=4.798, max=5, sum=23.992 (5)", - "tab": "General information", - "score": 4.798367346938775 - }, - "LegalBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "LegalBench - # prompt tokens": { - "description": "min=192.442, mean=1507.407, max=6287.633, sum=7537.033 (5)", - "tab": "General information", - "score": 1507.4065013565441 - }, - "LegalBench - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] - } - } - }, - { - "evaluation_name": "MedQA", - "source_data": { - 
"dataset_name": "MedQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MedQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.777, - "details": { - "description": "min=0.777, mean=0.777, max=0.777, sum=0.777 (1)", - "tab": "Accuracy", - "MedQA - Observed inference time (s)": { - "description": "min=0.548, mean=0.548, max=0.548, sum=0.548 (1)", - "tab": "Efficiency", - "score": 0.547684069419239 - }, - "MedQA - # eval": { - "description": "min=503, mean=503, max=503, sum=503 (1)", - "tab": "General information", - "score": 503.0 - }, - "MedQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "MedQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MedQA - # prompt tokens": { - "description": "min=1018.274, mean=1018.274, max=1018.274, sum=1018.274 (1)", - "tab": "General information", - "score": 1018.2743538767396 - }, - "MedQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WMT 2014", - "source_data": { - "dataset_name": "WMT 2014", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.225, - "details": { - "description": "min=0.183, mean=0.225, max=0.259, sum=1.123 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time (s)": { - "description": "min=1.198, mean=1.239, max=1.282, sum=6.195 (5)", - "tab": "Efficiency", - "score": 1.239086973613365 - }, - "WMT 2014 - # eval": { - "description": "min=503, mean=568.8, max=832, sum=2844 (5)", - "tab": "General information", - "score": 568.8 - }, - "WMT 2014 - # train": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "WMT 2014 - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "WMT 2014 - # prompt tokens": { - "description": "min=90.139, mean=109.868, max=130.33, sum=549.34 (5)", - "tab": "General information", - "score": 109.86804366111025 - }, - "WMT 2014 - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_lite/meta/llama-3-8b/6b528e49-fec4-4b63-bfb5-1b0df021f3c2.json b/data/helm_lite/meta/llama-3-8b/6b528e49-fec4-4b63-bfb5-1b0df021f3c2.json deleted file mode 100644 index 87ab72524..000000000 --- a/data/helm_lite/meta/llama-3-8b/6b528e49-fec4-4b63-bfb5-1b0df021f3c2.json +++ /dev/null @@ -1,641 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_lite/meta_llama-3-8b/1770834614.1822479", - "retrieved_timestamp": "1770834614.1822479", - 
"source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama 3 8B", - "id": "meta/llama-3-8b", - "developer": "meta", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_lite", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.387, - "details": { - "tab": "Accuracy", - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.7163920099875156 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.754, - "details": { - "description": "min=0.754, mean=0.754, max=0.754, sum=0.754 (1)", - "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": { - "description": "min=0.726, mean=0.726, max=0.726, sum=0.726 (1)", - "tab": "Efficiency", - "score": 0.7260531909029249 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=3460.268, mean=3460.268, max=3460.268, sum=3460.268 (1)", - "tab": "General information", - "score": 3460.2676056338028 - }, - "NarrativeQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (closed-book)", - "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.378, - "details": { - "description": "min=0.378, mean=0.378, max=0.378, sum=0.378 (1)", - "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time (s)": { - "description": "min=0.524, mean=0.524, max=0.524, sum=0.524 (1)", - "tab": "Efficiency", - "score": 0.523505747795105 - }, - 
"NaturalQuestions (closed-book) - Observed inference time (s)": { - "description": "min=0.428, mean=0.428, max=0.428, sum=0.428 (1)", - "tab": "Efficiency", - "score": 0.42760186743736267 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.965, mean=4.965, max=4.965, sum=4.965 (1)", - "tab": "General information", - "score": 4.965 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.007, mean=0.007, max=0.007, sum=0.007 (1)", - "tab": "General information", - "score": 0.007 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1658.348, mean=1658.348, max=1658.348, sum=1658.348 (1)", - "tab": "General information", - "score": 1658.348 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=0.999, mean=0.999, max=0.999, sum=0.999 (1)", - "tab": "General information", - "score": 0.999 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=112.12, mean=112.12, max=112.12, sum=112.12 (1)", - "tab": "General information", - "score": 112.12 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "mode": "closedbook" - } - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.766, - "details": { - "description": "min=0.766, mean=0.766, max=0.766, sum=0.766 (1)", - "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": { - "description": "min=0.308, mean=0.308, max=0.308, sum=0.308 (1)", - "tab": "Efficiency", - "score": 0.3076804256439209 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=242.776, mean=242.776, max=242.776, sum=242.776 (1)", - "tab": "General information", - "score": 242.776 - }, - "OpenbookQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" - } - 
} - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.602, - "details": { - "description": "min=0.33, mean=0.602, max=0.88, sum=3.008 (5)", - "tab": "Accuracy", - "MMLU - Observed inference time (s)": { - "description": "min=0.3, mean=0.317, max=0.344, sum=1.583 (5)", - "tab": "Efficiency", - "score": 0.3165063006919727 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=366.43, mean=460.686, max=607.421, sum=2303.431 (5)", - "tab": "General information", - "score": 460.6862105263158 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MATH", - "source_data": { - "dataset_name": "MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.391, - "details": { - "description": "min=0.233, mean=0.391, max=0.496, sum=2.737 (7)", - "tab": "Accuracy", - "MATH - Observed inference time (s)": { - "description": "min=5.431, mean=5.651, max=6.121, sum=39.558 (7)", - "tab": "Efficiency", - "score": 5.651119198181415 - }, - "MATH - # eval": { - "description": "min=30, mean=62.429, max=135, sum=437 (7)", - "tab": "General information", - "score": 62.42857142857143 - }, - "MATH - # train": { - "description": "min=8, mean=8, max=8, sum=56 (7)", - "tab": "General information", - "score": 8.0 - }, - "MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "MATH - # prompt tokens": { - "description": "min=881.363, mean=1262.909, max=2197.577, sum=8840.364 (7)", - "tab": "General information", - "score": 1262.9092130545007 - }, - "MATH - # output tokens": { - "description": "min=1, mean=1, max=1, sum=7 (7)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" - } - } - }, - { - "evaluation_name": "GSM8K", - "source_data": { - "dataset_name": "GSM8K", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on GSM8K", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.499, - "details": { - "description": "min=0.499, mean=0.499, max=0.499, sum=0.499 (1)", - "tab": "Accuracy", - "GSM8K - Observed inference time (s)": { - "description": "min=1.771, mean=1.771, max=1.771, sum=1.771 (1)", - "tab": "Efficiency", - "score": 1.770608879327774 - }, - "GSM8K - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "GSM8K - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "GSM8K - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GSM8K - # prompt tokens": { - "description": "min=959.032, mean=959.032, max=959.032, sum=959.032 (1)", - "tab": "General information", - "score": 959.032 - }, - "GSM8K - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "LegalBench", - "source_data": { - "dataset_name": "LegalBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on LegalBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.637, - "details": { - "description": "min=0.417, mean=0.637, max=0.874, sum=3.185 (5)", - "tab": "Accuracy", - "LegalBench - Observed inference time (s)": { - "description": "min=0.322, mean=0.465, max=0.989, sum=2.326 (5)", - "tab": "Efficiency", - "score": 0.4651390315970952 - }, - "LegalBench - # eval": { - "description": "min=95, mean=409.4, max=1000, sum=2047 (5)", - "tab": "General information", - "score": 409.4 - }, - "LegalBench - # train": { - "description": "min=4, mean=4.798, max=5, sum=23.992 (5)", - "tab": "General information", - "score": 4.798367346938775 - }, - "LegalBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "LegalBench - # prompt tokens": { - "description": "min=192.442, mean=1507.407, max=6287.633, sum=7537.033 (5)", - "tab": "General information", - "score": 1507.4065013565441 - }, - "LegalBench - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] - } - } - }, - { - "evaluation_name": "MedQA", - "source_data": { - "dataset_name": "MedQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MedQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.581, - "details": { - "description": 
"min=0.581, mean=0.581, max=0.581, sum=0.581 (1)", - "tab": "Accuracy", - "MedQA - Observed inference time (s)": { - "description": "min=0.361, mean=0.361, max=0.361, sum=0.361 (1)", - "tab": "Efficiency", - "score": 0.36141945306159867 - }, - "MedQA - # eval": { - "description": "min=503, mean=503, max=503, sum=503 (1)", - "tab": "General information", - "score": 503.0 - }, - "MedQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "MedQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MedQA - # prompt tokens": { - "description": "min=1018.274, mean=1018.274, max=1018.274, sum=1018.274 (1)", - "tab": "General information", - "score": 1018.2743538767396 - }, - "MedQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WMT 2014", - "source_data": { - "dataset_name": "WMT 2014", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.183, - "details": { - "description": "min=0.133, mean=0.183, max=0.212, sum=0.915 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time (s)": { - "description": "min=0.547, mean=0.563, max=0.573, sum=2.816 (5)", - "tab": "Efficiency", - "score": 0.5631435248437351 - }, - "WMT 2014 - # eval": { - "description": "min=503, mean=568.8, max=832, sum=2844 (5)", - "tab": "General information", - "score": 568.8 - }, - "WMT 2014 - # train": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "WMT 2014 - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "WMT 2014 - # prompt tokens": { - "description": "min=90.139, mean=109.868, max=130.33, sum=549.34 (5)", - "tab": "General information", - "score": 109.86804366111025 - }, - "WMT 2014 - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_lite/meta/llama-3.1-405b-instruct-turbo/1043b815-b247-4444-bf8c-0b92b793c57f.json b/data/helm_lite/meta/llama-3.1-405b-instruct-turbo/1043b815-b247-4444-bf8c-0b92b793c57f.json deleted file mode 100644 index 0bc6225d5..000000000 --- a/data/helm_lite/meta/llama-3.1-405b-instruct-turbo/1043b815-b247-4444-bf8c-0b92b793c57f.json +++ /dev/null @@ -1,643 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_lite/meta_llama-3.1-405b-instruct-turbo/1770834614.1822479", - "retrieved_timestamp": "1770834614.1822479", - "source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama 3.1 Instruct Turbo 405B", - "id": "meta/llama-3.1-405b-instruct-turbo", - "developer": "meta", - "inference_platform": "unknown" - }, - 
"evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_lite", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.854, - "details": { - "tab": "Accuracy", - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.3095505617977528 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.749, - "details": { - "description": "min=0.749, mean=0.749, max=0.749, sum=0.749 (1)", - "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": { - "description": "min=2.964, mean=2.964, max=2.964, sum=2.964 (1)", - "tab": "Efficiency", - "score": 2.964381891572979 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=3484.268, mean=3484.268, max=3484.268, sum=3484.268 (1)", - "tab": "General information", - "score": 3484.2676056338028 - }, - "NarrativeQA - # output tokens": { - "description": "min=9.904, mean=9.904, max=9.904, sum=9.904 (1)", - "tab": "General information", - "score": 9.904225352112675 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (closed-book)", - "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.456, - "details": { - "description": "min=0.456, mean=0.456, max=0.456, sum=0.456 (1)", - "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time (s)": { - "description": "min=4.105, mean=4.105, max=4.105, sum=4.105 (1)", - "tab": "Efficiency", - "score": 4.104731038570404 - }, - "NaturalQuestions (closed-book) - Observed inference time (s)": { - "description": "min=0.946, mean=0.946, max=0.946, sum=0.946 (1)", - "tab": "Efficiency", - "score": 0.9464026074409485 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, 
max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1716.78, mean=1716.78, max=1716.78, sum=1716.78 (1)", - "tab": "General information", - "score": 1716.78 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=8.741, mean=8.741, max=8.741, sum=8.741 (1)", - "tab": "General information", - "score": 8.741 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=129.12, mean=129.12, max=129.12, sum=129.12 (1)", - "tab": "General information", - "score": 129.12 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=8.576, mean=8.576, max=8.576, sum=8.576 (1)", - "tab": "General information", - "score": 8.576 - } - } - }, - "generation_config": { - "additional_details": { - "mode": "closedbook" - } - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.94, - "details": { - "description": "min=0.94, mean=0.94, max=0.94, sum=0.94 (1)", - "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": { - "description": "min=2.693, mean=2.693, max=2.693, sum=2.693 (1)", - "tab": "Efficiency", - "score": 2.6930377073287963 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=249.776, mean=249.776, max=249.776, sum=249.776 (1)", - "tab": "General information", - "score": 249.776 - }, - "OpenbookQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on 
MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.759, - "details": { - "description": "min=0.6, mean=0.759, max=0.94, sum=3.796 (5)", - "tab": "Accuracy", - "MMLU - Observed inference time (s)": { - "description": "min=0.464, mean=0.529, max=0.598, sum=2.643 (5)", - "tab": "Efficiency", - "score": 0.528599283887629 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=373.43, mean=467.686, max=614.421, sum=2338.431 (5)", - "tab": "General information", - "score": 467.6862105263158 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MATH", - "source_data": { - "dataset_name": "MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.827, - "details": { - "description": "min=0.635, mean=0.827, max=0.97, sum=5.789 (7)", - "tab": "Accuracy", - "MATH - Observed inference time (s)": { - "description": "min=3.188, mean=4.118, max=4.906, sum=28.826 (7)", - "tab": "Efficiency", - "score": 4.117939187053165 - }, - "MATH - # eval": { - "description": "min=30, mean=62.429, max=135, sum=437 (7)", - "tab": "General information", - "score": 62.42857142857143 - }, - "MATH - # train": { - "description": "min=8, mean=8, max=8, sum=56 (7)", - "tab": "General information", - "score": 8.0 - }, - "MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "MATH - # prompt tokens": { - "description": "min=881.363, mean=1262.909, max=2197.577, sum=8840.364 (7)", - "tab": "General information", - "score": 1262.9092130545007 - }, - "MATH - # output tokens": { - "description": "min=175.942, mean=232.698, max=270.904, sum=1628.884 (7)", - "tab": "General information", - "score": 232.69774473452566 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" - } - } - }, - { - "evaluation_name": "GSM8K", - "source_data": { - "dataset_name": "GSM8K", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on GSM8K", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.949, - "details": { - "description": "min=0.949, mean=0.949, max=0.949, sum=0.949 (1)", - "tab": "Accuracy", - "GSM8K - Observed inference time (s)": { - "description": "min=2.737, mean=2.737, max=2.737, sum=2.737 (1)", - "tab": "Efficiency", - "score": 2.737115991592407 - }, - "GSM8K - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "GSM8K - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "GSM8K - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GSM8K - # prompt tokens": { - "description": "min=959.032, mean=959.032, max=959.032, sum=959.032 (1)", - "tab": "General information", - "score": 959.032 - }, - "GSM8K - # output tokens": { - "description": "min=122.777, mean=122.777, max=122.777, sum=122.777 (1)", - "tab": "General information", - "score": 122.777 - } - } - }, - "generation_config": { - "additional_details": { - "stop": "none" - } - } - }, - { - "evaluation_name": "LegalBench", - "source_data": { - "dataset_name": "LegalBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on LegalBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.707, - "details": { - "description": "min=0.433, mean=0.707, max=0.979, sum=3.536 (5)", - "tab": "Accuracy", - "LegalBench - Observed inference time (s)": { - "description": "min=0.492, mean=0.797, max=1.89, sum=3.987 (5)", - "tab": "Efficiency", - "score": 0.7974352428433198 - }, - "LegalBench - # eval": { - "description": "min=95, mean=409.4, max=1000, sum=2047 (5)", - "tab": "General information", - "score": 409.4 - }, - "LegalBench - # train": { - "description": "min=4, mean=4.8, max=5, sum=24 (5)", - "tab": "General information", - "score": 4.8 - }, - "LegalBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "LegalBench - # prompt tokens": { - "description": "min=197.442, mean=1513.882, max=6300.012, sum=7569.412 (5)", - "tab": "General information", - "score": 1513.8824197238912 - }, - "LegalBench - # output tokens": { - "description": "min=2, mean=2.407, max=3, sum=12.035 (5)", - "tab": "General information", - "score": 2.4069553133514985 - } - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] - } - } - }, - { - "evaluation_name": "MedQA", - "source_data": { - "dataset_name": "MedQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MedQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.805, - "details": { - "description": "min=0.805, mean=0.805, max=0.805, sum=0.805 (1)", - "tab": "Accuracy", - "MedQA - Observed inference time (s)": { - "description": "min=0.951, mean=0.951, max=0.951, sum=0.951 (1)", - "tab": "Efficiency", - "score": 0.9505775325577965 - }, 
- "MedQA - # eval": { - "description": "min=503, mean=503, max=503, sum=503 (1)", - "tab": "General information", - "score": 503.0 - }, - "MedQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "MedQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MedQA - # prompt tokens": { - "description": "min=1025.274, mean=1025.274, max=1025.274, sum=1025.274 (1)", - "tab": "General information", - "score": 1025.2743538767395 - }, - "MedQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WMT 2014", - "source_data": { - "dataset_name": "WMT 2014", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.238, - "details": { - "description": "min=0.2, mean=0.238, max=0.284, sum=1.191 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time (s)": { - "description": "min=0.96, mean=1.055, max=1.147, sum=5.277 (5)", - "tab": "Efficiency", - "score": 1.0554436480227387 - }, - "WMT 2014 - # eval": { - "description": "min=503, mean=568.8, max=832, sum=2844 (5)", - "tab": "General information", - "score": 568.8 - }, - "WMT 2014 - # train": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "WMT 2014 - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "WMT 2014 - # prompt tokens": { - "description": "min=101.139, mean=120.712, max=141.117, sum=603.559 (5)", - "tab": "General information", - "score": 120.71178123566294 - }, - "WMT 2014 - # output tokens": { - "description": "min=24.598, mean=26.056, max=26.819, sum=130.279 (5)", - "tab": "General information", - "score": 26.055818454656674 - } - } - }, - "generation_config": { - "additional_details": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_lite/meta/llama-3.1-70b-instruct-turbo/28bc8f72-7b91-47fc-b10e-cd268cbc1caf.json b/data/helm_lite/meta/llama-3.1-70b-instruct-turbo/28bc8f72-7b91-47fc-b10e-cd268cbc1caf.json deleted file mode 100644 index d57074cb2..000000000 --- a/data/helm_lite/meta/llama-3.1-70b-instruct-turbo/28bc8f72-7b91-47fc-b10e-cd268cbc1caf.json +++ /dev/null @@ -1,643 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_lite/meta_llama-3.1-70b-instruct-turbo/1770834614.1822479", - "retrieved_timestamp": "1770834614.1822479", - "source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama 3.1 Instruct Turbo 70B", - "id": "meta/llama-3.1-70b-instruct-turbo", - "developer": "meta", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_lite", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.808, - "details": { - "tab": "Accuracy", - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.133645443196005 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.772, - "details": { - "description": "min=0.772, mean=0.772, max=0.772, sum=0.772 (1)", - "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": { - "description": "min=3.402, mean=3.402, max=3.402, sum=3.402 (1)", - "tab": "Efficiency", - "score": 3.4022000312805174 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=3484.268, mean=3484.268, max=3484.268, sum=3484.268 (1)", - "tab": "General information", - "score": 3484.2676056338028 - }, - "NarrativeQA - # output tokens": { - "description": "min=9.034, mean=9.034, max=9.034, sum=9.034 (1)", - "tab": "General information", - "score": 9.033802816901408 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (closed-book)", - "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.452, - "details": { - "description": "min=0.452, mean=0.452, max=0.452, sum=0.452 (1)", - "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time (s)": { - "description": "min=3.354, mean=3.354, max=3.354, sum=3.354 (1)", - "tab": "Efficiency", - "score": 3.354476467370987 - }, - "NaturalQuestions (closed-book) - Observed inference time (s)": { - "description": "min=3.534, mean=3.534, max=3.534, sum=3.534 (1)", - "tab": "Efficiency", - "score": 3.534221899032593 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=5, mean=5, 
max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1716.78, mean=1716.78, max=1716.78, sum=1716.78 (1)", - "tab": "General information", - "score": 1716.78 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=8.203, mean=8.203, max=8.203, sum=8.203 (1)", - "tab": "General information", - "score": 8.203 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=129.12, mean=129.12, max=129.12, sum=129.12 (1)", - "tab": "General information", - "score": 129.12 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=7.222, mean=7.222, max=7.222, sum=7.222 (1)", - "tab": "General information", - "score": 7.222 - } - } - }, - "generation_config": { - "additional_details": { - "mode": "closedbook" - } - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.938, - "details": { - "description": "min=0.938, mean=0.938, max=0.938, sum=0.938 (1)", - "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": { - "description": "min=3.875, mean=3.875, max=3.875, sum=3.875 (1)", - "tab": "Efficiency", - "score": 3.8750249314308167 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=249.776, mean=249.776, max=249.776, sum=249.776 (1)", - "tab": "General information", - "score": 249.776 - }, - "OpenbookQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.709, - 
"details": { - "description": "min=0.55, mean=0.709, max=0.93, sum=3.545 (5)", - "tab": "Accuracy", - "MMLU - Observed inference time (s)": { - "description": "min=2.836, mean=12.026, max=45.251, sum=60.131 (5)", - "tab": "Efficiency", - "score": 12.026294649132511 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=373.43, mean=467.686, max=614.421, sum=2338.431 (5)", - "tab": "General information", - "score": 467.6862105263158 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MATH", - "source_data": { - "dataset_name": "MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.783, - "details": { - "description": "min=0.579, mean=0.783, max=0.97, sum=5.483 (7)", - "tab": "Accuracy", - "MATH - Observed inference time (s)": { - "description": "min=5.784, mean=6.527, max=7.228, sum=45.691 (7)", - "tab": "Efficiency", - "score": 6.527233472429779 - }, - "MATH - # eval": { - "description": "min=30, mean=62.429, max=135, sum=437 (7)", - "tab": "General information", - "score": 62.42857142857143 - }, - "MATH - # train": { - "description": "min=8, mean=8, max=8, sum=56 (7)", - "tab": "General information", - "score": 8.0 - }, - "MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "MATH - # prompt tokens": { - "description": "min=881.363, mean=1262.909, max=2197.577, sum=8840.364 (7)", - "tab": "General information", - "score": 1262.9092130545007 - }, - "MATH - # output tokens": { - "description": "min=184.733, mean=243.368, max=279.105, sum=1703.574 (7)", - "tab": "General information", - "score": 243.36764411525732 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" - } - } - }, - { - "evaluation_name": "GSM8K", - "source_data": { - "dataset_name": "GSM8K", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on GSM8K", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.938, - "details": { - "description": "min=0.938, mean=0.938, max=0.938, sum=0.938 (1)", - "tab": "Accuracy", - 
"GSM8K - Observed inference time (s)": { - "description": "min=4.99, mean=4.99, max=4.99, sum=4.99 (1)", - "tab": "Efficiency", - "score": 4.9902911036014554 - }, - "GSM8K - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "GSM8K - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "GSM8K - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GSM8K - # prompt tokens": { - "description": "min=959.032, mean=959.032, max=959.032, sum=959.032 (1)", - "tab": "General information", - "score": 959.032 - }, - "GSM8K - # output tokens": { - "description": "min=127.086, mean=127.086, max=127.086, sum=127.086 (1)", - "tab": "General information", - "score": 127.086 - } - } - }, - "generation_config": { - "additional_details": { - "stop": "none" - } - } - }, - { - "evaluation_name": "LegalBench", - "source_data": { - "dataset_name": "LegalBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on LegalBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.687, - "details": { - "description": "min=0.439, mean=0.687, max=1, sum=3.433 (5)", - "tab": "Accuracy", - "LegalBench - Observed inference time (s)": { - "description": "min=2.233, mean=3.171, max=3.636, sum=15.855 (5)", - "tab": "Efficiency", - "score": 3.1709040240543165 - }, - "LegalBench - # eval": { - "description": "min=95, mean=409.4, max=1000, sum=2047 (5)", - "tab": "General information", - "score": 409.4 - }, - "LegalBench - # train": { - "description": "min=4, mean=4.8, max=5, sum=24 (5)", - "tab": "General information", - "score": 4.8 - }, - "LegalBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "LegalBench - # prompt tokens": { - "description": "min=197.442, mean=1513.882, max=6300.012, sum=7569.412 (5)", - "tab": "General information", - "score": 1513.8824197238912 - }, - "LegalBench - # output tokens": { - "description": "min=2, mean=2.538, max=4.032, sum=12.688 (5)", - "tab": "General information", - "score": 2.5376711028251826 - } - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] - } - } - }, - { - "evaluation_name": "MedQA", - "source_data": { - "dataset_name": "MedQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MedQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.769, - "details": { - "description": "min=0.769, mean=0.769, max=0.769, sum=0.769 (1)", - "tab": "Accuracy", - "MedQA - Observed inference time (s)": { - "description": "min=3.053, mean=3.053, max=3.053, sum=3.053 (1)", - "tab": "Efficiency", - "score": 3.0525233205222704 - }, - "MedQA - # eval": { - "description": "min=503, mean=503, max=503, sum=503 (1)", - "tab": "General information", - "score": 503.0 - }, - "MedQA 
- # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "MedQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MedQA - # prompt tokens": { - "description": "min=1025.274, mean=1025.274, max=1025.274, sum=1025.274 (1)", - "tab": "General information", - "score": 1025.2743538767395 - }, - "MedQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WMT 2014", - "source_data": { - "dataset_name": "WMT 2014", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.223, - "details": { - "description": "min=0.183, mean=0.223, max=0.265, sum=1.114 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time (s)": { - "description": "min=0.762, mean=0.965, max=1.177, sum=4.824 (5)", - "tab": "Efficiency", - "score": 0.9648550899177766 - }, - "WMT 2014 - # eval": { - "description": "min=503, mean=568.8, max=832, sum=2844 (5)", - "tab": "General information", - "score": 568.8 - }, - "WMT 2014 - # train": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "WMT 2014 - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "WMT 2014 - # prompt tokens": { - "description": "min=101.139, mean=120.712, max=141.117, sum=603.559 (5)", - "tab": "General information", - "score": 120.71178123566294 - }, - "WMT 2014 - # output tokens": { - "description": "min=24.231, mean=25.786, max=26.692, sum=128.928 (5)", - "tab": "General information", - "score": 25.78567441504817 - } - } - }, - "generation_config": { - "additional_details": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_lite/meta/llama-3.1-8b-instruct-turbo/73dedd31-7d40-4ee6-994d-00eb7d656597.json b/data/helm_lite/meta/llama-3.1-8b-instruct-turbo/73dedd31-7d40-4ee6-994d-00eb7d656597.json deleted file mode 100644 index 198d81cd2..000000000 --- a/data/helm_lite/meta/llama-3.1-8b-instruct-turbo/73dedd31-7d40-4ee6-994d-00eb7d656597.json +++ /dev/null @@ -1,643 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_lite/meta_llama-3.1-8b-instruct-turbo/1770834614.1822479", - "retrieved_timestamp": "1770834614.1822479", - "source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama 3.1 Instruct Turbo 8B", - "id": "meta/llama-3.1-8b-instruct-turbo", - "developer": "meta", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_lite", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on 
average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.303, - "details": { - "tab": "Accuracy", - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.5896504369538077 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.756, - "details": { - "description": "min=0.756, mean=0.756, max=0.756, sum=0.756 (1)", - "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": { - "description": "min=0.581, mean=0.581, max=0.581, sum=0.581 (1)", - "tab": "Efficiency", - "score": 0.5813529316808136 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=3484.268, mean=3484.268, max=3484.268, sum=3484.268 (1)", - "tab": "General information", - "score": 3484.2676056338028 - }, - "NarrativeQA - # output tokens": { - "description": "min=7.287, mean=7.287, max=7.287, sum=7.287 (1)", - "tab": "General information", - "score": 7.2873239436619714 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (closed-book)", - "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.209, - "details": { - "description": "min=0.209, mean=0.209, max=0.209, sum=0.209 (1)", - "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time (s)": { - "description": "min=0.544, mean=0.544, max=0.544, sum=0.544 (1)", - "tab": "Efficiency", - "score": 0.5441543731689453 - }, - "NaturalQuestions (closed-book) - Observed inference time (s)": { - "description": "min=0.752, mean=0.752, max=0.752, sum=0.752 (1)", - "tab": "Efficiency", - "score": 0.751717613697052 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 
0.0 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1716.78, mean=1716.78, max=1716.78, sum=1716.78 (1)", - "tab": "General information", - "score": 1716.78 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=8.736, mean=8.736, max=8.736, sum=8.736 (1)", - "tab": "General information", - "score": 8.736 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=129.12, mean=129.12, max=129.12, sum=129.12 (1)", - "tab": "General information", - "score": 129.12 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=11.732, mean=11.732, max=11.732, sum=11.732 (1)", - "tab": "General information", - "score": 11.732 - } - } - }, - "generation_config": { - "additional_details": { - "mode": "closedbook" - } - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.74, - "details": { - "description": "min=0.74, mean=0.74, max=0.74, sum=0.74 (1)", - "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": { - "description": "min=2.937, mean=2.937, max=2.937, sum=2.937 (1)", - "tab": "Efficiency", - "score": 2.9374450149536133 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=249.776, mean=249.776, max=249.776, sum=249.776 (1)", - "tab": "General information", - "score": 249.776 - }, - "OpenbookQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5, - "details": { - "description": "min=0.26, mean=0.5, max=0.79, sum=2.501 (5)", - "tab": "Accuracy", - "MMLU - Observed inference time (s)": { - "description": "min=0.284, mean=0.417, max=0.567, sum=2.086 (5)", - "tab": 
"Efficiency", - "score": 0.41729471965421716 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=373.43, mean=467.686, max=614.421, sum=2338.431 (5)", - "tab": "General information", - "score": 467.6862105263158 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MATH", - "source_data": { - "dataset_name": "MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.703, - "details": { - "description": "min=0.509, mean=0.703, max=0.849, sum=4.92 (7)", - "tab": "Accuracy", - "MATH - Observed inference time (s)": { - "description": "min=1.617, mean=1.927, max=2.175, sum=13.492 (7)", - "tab": "Efficiency", - "score": 1.9274194573191807 - }, - "MATH - # eval": { - "description": "min=30, mean=62.429, max=135, sum=437 (7)", - "tab": "General information", - "score": 62.42857142857143 - }, - "MATH - # train": { - "description": "min=8, mean=8, max=8, sum=56 (7)", - "tab": "General information", - "score": 8.0 - }, - "MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "MATH - # prompt tokens": { - "description": "min=881.363, mean=1262.909, max=2197.577, sum=8840.364 (7)", - "tab": "General information", - "score": 1262.9092130545007 - }, - "MATH - # output tokens": { - "description": "min=203.384, mean=253.982, max=288.596, sum=1777.872 (7)", - "tab": "General information", - "score": 253.98170179473732 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" - } - } - }, - { - "evaluation_name": "GSM8K", - "source_data": { - "dataset_name": "GSM8K", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on GSM8K", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.798, - "details": { - "description": "min=0.798, mean=0.798, max=0.798, sum=0.798 (1)", - "tab": "Accuracy", - "GSM8K - Observed inference time (s)": { - "description": "min=2.109, mean=2.109, max=2.109, sum=2.109 (1)", - "tab": "Efficiency", - "score": 2.108796592712402 - }, - "GSM8K - # eval": { - "description": "min=1000, 
mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "GSM8K - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "GSM8K - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GSM8K - # prompt tokens": { - "description": "min=959.032, mean=959.032, max=959.032, sum=959.032 (1)", - "tab": "General information", - "score": 959.032 - }, - "GSM8K - # output tokens": { - "description": "min=150.02, mean=150.02, max=150.02, sum=150.02 (1)", - "tab": "General information", - "score": 150.02 - } - } - }, - "generation_config": { - "additional_details": { - "stop": "none" - } - } - }, - { - "evaluation_name": "LegalBench", - "source_data": { - "dataset_name": "LegalBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on LegalBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.342, - "details": { - "description": "min=0, mean=0.342, max=0.8, sum=1.71 (5)", - "tab": "Accuracy", - "LegalBench - Observed inference time (s)": { - "description": "min=0.409, mean=0.481, max=0.626, sum=2.407 (5)", - "tab": "Efficiency", - "score": 0.4814103188942614 - }, - "LegalBench - # eval": { - "description": "min=95, mean=409.4, max=1000, sum=2047 (5)", - "tab": "General information", - "score": 409.4 - }, - "LegalBench - # train": { - "description": "min=4, mean=4.8, max=5, sum=24 (5)", - "tab": "General information", - "score": 4.8 - }, - "LegalBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "LegalBench - # prompt tokens": { - "description": "min=197.442, mean=1513.882, max=6300.012, sum=7569.412 (5)", - "tab": "General information", - "score": 1513.8824197238912 - }, - "LegalBench - # output tokens": { - "description": "min=2.032, mean=6.824, max=10.886, sum=34.118 (5)", - "tab": "General information", - "score": 6.823557876005701 - } - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] - } - } - }, - { - "evaluation_name": "MedQA", - "source_data": { - "dataset_name": "MedQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MedQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.245, - "details": { - "description": "min=0.245, mean=0.245, max=0.245, sum=0.245 (1)", - "tab": "Accuracy", - "MedQA - Observed inference time (s)": { - "description": "min=0.743, mean=0.743, max=0.743, sum=0.743 (1)", - "tab": "Efficiency", - "score": 0.742541556803891 - }, - "MedQA - # eval": { - "description": "min=503, mean=503, max=503, sum=503 (1)", - "tab": "General information", - "score": 503.0 - }, - "MedQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "MedQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General 
information", - "score": 0.0 - }, - "MedQA - # prompt tokens": { - "description": "min=1025.274, mean=1025.274, max=1025.274, sum=1025.274 (1)", - "tab": "General information", - "score": 1025.2743538767395 - }, - "MedQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WMT 2014", - "source_data": { - "dataset_name": "WMT 2014", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.181, - "details": { - "description": "min=0.132, mean=0.181, max=0.219, sum=0.907 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time (s)": { - "description": "min=0.439, mean=0.565, max=0.727, sum=2.826 (5)", - "tab": "Efficiency", - "score": 0.5651802479746801 - }, - "WMT 2014 - # eval": { - "description": "min=503, mean=568.8, max=832, sum=2844 (5)", - "tab": "General information", - "score": 568.8 - }, - "WMT 2014 - # train": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "WMT 2014 - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "WMT 2014 - # prompt tokens": { - "description": "min=101.139, mean=120.712, max=141.117, sum=603.559 (5)", - "tab": "General information", - "score": 120.71178123566294 - }, - "WMT 2014 - # output tokens": { - "description": "min=24.354, mean=25.779, max=26.833, sum=128.893 (5)", - "tab": "General information", - "score": 25.778561802263347 - } - } - }, - "generation_config": { - "additional_details": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_lite/meta/llama-3.2-11b-vision-instruct-turbo/18da1dfa-5366-477b-a9cf-af29c5a99b68.json b/data/helm_lite/meta/llama-3.2-11b-vision-instruct-turbo/18da1dfa-5366-477b-a9cf-af29c5a99b68.json deleted file mode 100644 index 722a6f050..000000000 --- a/data/helm_lite/meta/llama-3.2-11b-vision-instruct-turbo/18da1dfa-5366-477b-a9cf-af29c5a99b68.json +++ /dev/null @@ -1,643 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_lite/meta_llama-3.2-11b-vision-instruct-turbo/1770834614.1822479", - "retrieved_timestamp": "1770834614.1822479", - "source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama 3.2 Vision Instruct Turbo 11B", - "id": "meta/llama-3.2-11b-vision-instruct-turbo", - "developer": "meta", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_lite", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.325, 
- "details": { - "tab": "Accuracy", - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.8754681647940075 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.756, - "details": { - "description": "min=0.756, mean=0.756, max=0.756, sum=0.756 (1)", - "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": { - "description": "min=0.378, mean=0.378, max=0.378, sum=0.378 (1)", - "tab": "Efficiency", - "score": 0.37828690300525075 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=3484.268, mean=3484.268, max=3484.268, sum=3484.268 (1)", - "tab": "General information", - "score": 3484.2676056338028 - }, - "NarrativeQA - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (closed-book)", - "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.234, - "details": { - "description": "min=0.234, mean=0.234, max=0.234, sum=0.234 (1)", - "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time (s)": { - "description": "min=0.285, mean=0.285, max=0.285, sum=0.285 (1)", - "tab": "Efficiency", - "score": 0.28472757744789123 - }, - "NaturalQuestions (closed-book) - Observed inference time (s)": { - "description": "min=0.326, mean=0.326, max=0.326, sum=0.326 (1)", - "tab": "Efficiency", - "score": 0.32630494999885556 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1716.785, mean=1716.785, max=1716.785, sum=1716.785 (1)", - "tab": "General information", - "score": 1716.785 
- }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=129.12, mean=129.12, max=129.12, sum=129.12 (1)", - "tab": "General information", - "score": 129.12 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "mode": "closedbook" - } - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.724, - "details": { - "description": "min=0.724, mean=0.724, max=0.724, sum=0.724 (1)", - "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": { - "description": "min=0.21, mean=0.21, max=0.21, sum=0.21 (1)", - "tab": "Efficiency", - "score": 0.21042356300354004 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=249.776, mean=249.776, max=249.776, sum=249.776 (1)", - "tab": "General information", - "score": 249.776 - }, - "OpenbookQA - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.511, - "details": { - "description": "min=0.28, mean=0.511, max=0.78, sum=2.555 (5)", - "tab": "Accuracy", - "MMLU - Observed inference time (s)": { - "description": "min=0.226, mean=0.406, max=0.726, sum=2.031 (5)", - "tab": "Efficiency", - "score": 0.40622414255142214 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": 
"min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=373.43, mean=467.686, max=614.421, sum=2338.431 (5)", - "tab": "General information", - "score": 467.6862105263158 - }, - "MMLU - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MATH", - "source_data": { - "dataset_name": "MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.739, - "details": { - "description": "min=0.579, mean=0.739, max=0.884, sum=5.176 (7)", - "tab": "Accuracy", - "MATH - Observed inference time (s)": { - "description": "min=1.715, mean=2.099, max=2.413, sum=14.696 (7)", - "tab": "Efficiency", - "score": 2.099496145662431 - }, - "MATH - # eval": { - "description": "min=30, mean=62.429, max=135, sum=437 (7)", - "tab": "General information", - "score": 62.42857142857143 - }, - "MATH - # train": { - "description": "min=8, mean=8, max=8, sum=56 (7)", - "tab": "General information", - "score": 8.0 - }, - "MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "MATH - # prompt tokens": { - "description": "min=881.363, mean=1262.909, max=2197.577, sum=8840.364 (7)", - "tab": "General information", - "score": 1262.9092130545007 - }, - "MATH - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" - } - } - }, - { - "evaluation_name": "GSM8K", - "source_data": { - "dataset_name": "GSM8K", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on GSM8K", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.823, - "details": { - "description": "min=0.823, mean=0.823, max=0.823, sum=0.823 (1)", - "tab": "Accuracy", - "GSM8K - Observed inference time (s)": { - "description": "min=1.274, mean=1.274, max=1.274, sum=1.274 (1)", - "tab": "Efficiency", - "score": 1.2738200931549073 - }, - "GSM8K - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "GSM8K - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "GSM8K - truncated": { - "description": "min=0, 
mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GSM8K - # prompt tokens": { - "description": "min=959.032, mean=959.032, max=959.032, sum=959.032 (1)", - "tab": "General information", - "score": 959.032 - }, - "GSM8K - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "stop": "none" - } - } - }, - { - "evaluation_name": "LegalBench", - "source_data": { - "dataset_name": "LegalBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on LegalBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.435, - "details": { - "description": "min=0.018, mean=0.435, max=0.905, sum=2.175 (5)", - "tab": "Accuracy", - "LegalBench - Observed inference time (s)": { - "description": "min=0.199, mean=0.277, max=0.438, sum=1.384 (5)", - "tab": "Efficiency", - "score": 0.2767821625533402 - }, - "LegalBench - # eval": { - "description": "min=95, mean=409.4, max=1000, sum=2047 (5)", - "tab": "General information", - "score": 409.4 - }, - "LegalBench - # train": { - "description": "min=4, mean=4.8, max=5, sum=24 (5)", - "tab": "General information", - "score": 4.8 - }, - "LegalBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "LegalBench - # prompt tokens": { - "description": "min=197.442, mean=1513.882, max=6300.012, sum=7569.412 (5)", - "tab": "General information", - "score": 1513.8824197238912 - }, - "LegalBench - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] - } - } - }, - { - "evaluation_name": "MedQA", - "source_data": { - "dataset_name": "MedQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MedQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.27, - "details": { - "description": "min=0.27, mean=0.27, max=0.27, sum=0.27 (1)", - "tab": "Accuracy", - "MedQA - Observed inference time (s)": { - "description": "min=0.205, mean=0.205, max=0.205, sum=0.205 (1)", - "tab": "Efficiency", - "score": 0.20540714263916016 - }, - "MedQA - # eval": { - "description": "min=503, mean=503, max=503, sum=503 (1)", - "tab": "General information", - "score": 503.0 - }, - "MedQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "MedQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MedQA - # prompt tokens": { - "description": "min=1025.274, mean=1025.274, max=1025.274, sum=1025.274 (1)", - "tab": "General information", - "score": 1025.2743538767395 - }, - "MedQA - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General 
information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WMT 2014", - "source_data": { - "dataset_name": "WMT 2014", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.179, - "details": { - "description": "min=0.13, mean=0.179, max=0.217, sum=0.896 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time (s)": { - "description": "min=0.349, mean=0.383, max=0.412, sum=1.915 (5)", - "tab": "Efficiency", - "score": 0.38295877939459017 - }, - "WMT 2014 - # eval": { - "description": "min=503, mean=568.8, max=832, sum=2844 (5)", - "tab": "General information", - "score": 568.8 - }, - "WMT 2014 - # train": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "WMT 2014 - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "WMT 2014 - # prompt tokens": { - "description": "min=101.139, mean=120.868, max=141.33, sum=604.34 (5)", - "tab": "General information", - "score": 120.86804366111025 - }, - "WMT 2014 - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_lite/meta/llama-3.2-90b-vision-instruct-turbo/80057cc1-45ab-4976-878e-be963eaa83b1.json b/data/helm_lite/meta/llama-3.2-90b-vision-instruct-turbo/80057cc1-45ab-4976-878e-be963eaa83b1.json deleted file mode 100644 index 8bef7c4e9..000000000 --- a/data/helm_lite/meta/llama-3.2-90b-vision-instruct-turbo/80057cc1-45ab-4976-878e-be963eaa83b1.json +++ /dev/null @@ -1,643 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_lite/meta_llama-3.2-90b-vision-instruct-turbo/1770834614.1822479", - "retrieved_timestamp": "1770834614.1822479", - "source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama 3.2 Vision Instruct Turbo 90B", - "id": "meta/llama-3.2-90b-vision-instruct-turbo", - "developer": "meta", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_lite", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.819, - "details": { - "tab": "Accuracy", - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.5839825218476904 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - 
"evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.777, - "details": { - "description": "min=0.777, mean=0.777, max=0.777, sum=0.777 (1)", - "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": { - "description": "min=0.83, mean=0.83, max=0.83, sum=0.83 (1)", - "tab": "Efficiency", - "score": 0.8297326531208736 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=3484.268, mean=3484.268, max=3484.268, sum=3484.268 (1)", - "tab": "General information", - "score": 3484.2676056338028 - }, - "NarrativeQA - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (closed-book)", - "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.457, - "details": { - "description": "min=0.457, mean=0.457, max=0.457, sum=0.457 (1)", - "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time (s)": { - "description": "min=1.111, mean=1.111, max=1.111, sum=1.111 (1)", - "tab": "Efficiency", - "score": 1.110703297138214 - }, - "NaturalQuestions (closed-book) - Observed inference time (s)": { - "description": "min=0.422, mean=0.422, max=0.422, sum=0.422 (1)", - "tab": "Efficiency", - "score": 0.4218848171234131 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1716.785, mean=1716.785, max=1716.785, sum=1716.785 (1)", - "tab": "General information", - "score": 1716.785 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions 
(closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=129.12, mean=129.12, max=129.12, sum=129.12 (1)", - "tab": "General information", - "score": 129.12 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "mode": "closedbook" - } - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.942, - "details": { - "description": "min=0.942, mean=0.942, max=0.942, sum=0.942 (1)", - "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": { - "description": "min=0.285, mean=0.285, max=0.285, sum=0.285 (1)", - "tab": "Efficiency", - "score": 0.28476666021347047 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=249.776, mean=249.776, max=249.776, sum=249.776 (1)", - "tab": "General information", - "score": 249.776 - }, - "OpenbookQA - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.703, - "details": { - "description": "min=0.52, mean=0.703, max=0.93, sum=3.514 (5)", - "tab": "Accuracy", - "MMLU - Observed inference time (s)": { - "description": "min=0.266, mean=0.798, max=2.612, sum=3.992 (5)", - "tab": "Efficiency", - "score": 0.7984467656654225 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=373.43, mean=467.686, max=614.421, sum=2338.431 (5)", - "tab": "General 
information", - "score": 467.6862105263158 - }, - "MMLU - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MATH", - "source_data": { - "dataset_name": "MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.791, - "details": { - "description": "min=0.579, mean=0.791, max=0.978, sum=5.54 (7)", - "tab": "Accuracy", - "MATH - Observed inference time (s)": { - "description": "min=4.64, mean=5.739, max=6.652, sum=40.174 (7)", - "tab": "Efficiency", - "score": 5.739186799526185 - }, - "MATH - # eval": { - "description": "min=30, mean=62.429, max=135, sum=437 (7)", - "tab": "General information", - "score": 62.42857142857143 - }, - "MATH - # train": { - "description": "min=8, mean=8, max=8, sum=56 (7)", - "tab": "General information", - "score": 8.0 - }, - "MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "MATH - # prompt tokens": { - "description": "min=881.363, mean=1262.909, max=2197.577, sum=8840.364 (7)", - "tab": "General information", - "score": 1262.9092130545007 - }, - "MATH - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" - } - } - }, - { - "evaluation_name": "GSM8K", - "source_data": { - "dataset_name": "GSM8K", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on GSM8K", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.936, - "details": { - "description": "min=0.936, mean=0.936, max=0.936, sum=0.936 (1)", - "tab": "Accuracy", - "GSM8K - Observed inference time (s)": { - "description": "min=2.889, mean=2.889, max=2.889, sum=2.889 (1)", - "tab": "Efficiency", - "score": 2.8894128675460817 - }, - "GSM8K - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "GSM8K - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "GSM8K - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GSM8K - # prompt tokens": { - "description": "min=959.032, mean=959.032, max=959.032, sum=959.032 (1)", - "tab": "General information", - "score": 959.032 - }, - "GSM8K - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General 
information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "stop": "none" - } - } - }, - { - "evaluation_name": "LegalBench", - "source_data": { - "dataset_name": "LegalBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on LegalBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.68, - "details": { - "description": "min=0.438, mean=0.68, max=0.989, sum=3.398 (5)", - "tab": "Accuracy", - "LegalBench - Observed inference time (s)": { - "description": "min=0.284, mean=0.478, max=1.152, sum=2.389 (5)", - "tab": "Efficiency", - "score": 0.47773526830658064 - }, - "LegalBench - # eval": { - "description": "min=95, mean=409.4, max=1000, sum=2047 (5)", - "tab": "General information", - "score": 409.4 - }, - "LegalBench - # train": { - "description": "min=4, mean=4.8, max=5, sum=24 (5)", - "tab": "General information", - "score": 4.8 - }, - "LegalBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "LegalBench - # prompt tokens": { - "description": "min=197.442, mean=1513.882, max=6300.012, sum=7569.412 (5)", - "tab": "General information", - "score": 1513.8824197238912 - }, - "LegalBench - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] - } - } - }, - { - "evaluation_name": "MedQA", - "source_data": { - "dataset_name": "MedQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MedQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.769, - "details": { - "description": "min=0.769, mean=0.769, max=0.769, sum=0.769 (1)", - "tab": "Accuracy", - "MedQA - Observed inference time (s)": { - "description": "min=0.318, mean=0.318, max=0.318, sum=0.318 (1)", - "tab": "Efficiency", - "score": 0.3180293652930743 - }, - "MedQA - # eval": { - "description": "min=503, mean=503, max=503, sum=503 (1)", - "tab": "General information", - "score": 503.0 - }, - "MedQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "MedQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MedQA - # prompt tokens": { - "description": "min=1025.274, mean=1025.274, max=1025.274, sum=1025.274 (1)", - "tab": "General information", - "score": 1025.2743538767395 - }, - "MedQA - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WMT 2014", - "source_data": { - "dataset_name": "WMT 2014", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, 
- "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.224, - "details": { - "description": "min=0.182, mean=0.224, max=0.266, sum=1.121 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time (s)": { - "description": "min=0.737, mean=0.816, max=0.848, sum=4.078 (5)", - "tab": "Efficiency", - "score": 0.8156762526912515 - }, - "WMT 2014 - # eval": { - "description": "min=503, mean=568.8, max=832, sum=2844 (5)", - "tab": "General information", - "score": 568.8 - }, - "WMT 2014 - # train": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "WMT 2014 - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "WMT 2014 - # prompt tokens": { - "description": "min=101.139, mean=120.868, max=141.33, sum=604.34 (5)", - "tab": "General information", - "score": 120.86804366111025 - }, - "WMT 2014 - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_lite/meta/llama-3.3-70b-instruct-turbo/d896249f-bbd9-4657-a5db-5968544cb5fa.json b/data/helm_lite/meta/llama-3.3-70b-instruct-turbo/d896249f-bbd9-4657-a5db-5968544cb5fa.json deleted file mode 100644 index cc4cca983..000000000 --- a/data/helm_lite/meta/llama-3.3-70b-instruct-turbo/d896249f-bbd9-4657-a5db-5968544cb5fa.json +++ /dev/null @@ -1,643 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_lite/meta_llama-3.3-70b-instruct-turbo/1770834614.1822479", - "retrieved_timestamp": "1770834614.1822479", - "source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama 3.3 Instruct Turbo 70B", - "id": "meta/llama-3.3-70b-instruct-turbo", - "developer": "meta", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_lite", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.812, - "details": { - "tab": "Accuracy", - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.7410112359550561 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.791, - "details": { - "description": "min=0.791, mean=0.791, max=0.791, sum=0.791 (1)", - "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": { - "description": "min=0.746, mean=0.746, max=0.746, sum=0.746 (1)", - "tab": "Efficiency", - "score": 0.7455473496880329 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=3484.268, mean=3484.268, max=3484.268, sum=3484.268 (1)", - "tab": "General information", - "score": 3484.2676056338028 - }, - "NarrativeQA - # output tokens": { - "description": "min=7.606, mean=7.606, max=7.606, sum=7.606 (1)", - "tab": "General information", - "score": 7.605633802816901 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (closed-book)", - "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.431, - "details": { - "description": "min=0.431, mean=0.431, max=0.431, sum=0.431 (1)", - "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time (s)": { - "description": "min=0.51, mean=0.51, max=0.51, sum=0.51 (1)", - "tab": "Efficiency", - "score": 0.5104404001235961 - }, - "NaturalQuestions (closed-book) - Observed inference time (s)": { - "description": "min=0.466, mean=0.466, max=0.466, sum=0.466 (1)", - "tab": "Efficiency", - "score": 0.46574948048591613 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1716.78, mean=1716.78, max=1716.78, sum=1716.78 (1)", - "tab": "General information", - "score": 1716.78 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=7.503, mean=7.503, max=7.503, sum=7.503 (1)", - "tab": "General information", - "score": 7.503 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - 
"description": "min=129.12, mean=129.12, max=129.12, sum=129.12 (1)", - "tab": "General information", - "score": 129.12 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=9.152, mean=9.152, max=9.152, sum=9.152 (1)", - "tab": "General information", - "score": 9.152 - } - } - }, - "generation_config": { - "additional_details": { - "mode": "closedbook" - } - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.928, - "details": { - "description": "min=0.928, mean=0.928, max=0.928, sum=0.928 (1)", - "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": { - "description": "min=0.339, mean=0.339, max=0.339, sum=0.339 (1)", - "tab": "Efficiency", - "score": 0.3392307605743408 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=249.776, mean=249.776, max=249.776, sum=249.776 (1)", - "tab": "General information", - "score": 249.776 - }, - "OpenbookQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7, - "details": { - "description": "min=0.5, mean=0.7, max=0.93, sum=3.499 (5)", - "tab": "Accuracy", - "MMLU - Observed inference time (s)": { - "description": "min=0.313, mean=0.339, max=0.359, sum=1.695 (5)", - "tab": "Efficiency", - "score": 0.3389431067433274 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=373.43, mean=467.686, max=614.421, sum=2338.431 (5)", - "tab": "General information", - "score": 467.6862105263158 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", 
- "us_foreign_policy" - ], - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MATH", - "source_data": { - "dataset_name": "MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.808, - "details": { - "description": "min=0.635, mean=0.808, max=0.963, sum=5.655 (7)", - "tab": "Accuracy", - "MATH - Observed inference time (s)": { - "description": "min=1.483, mean=1.779, max=2.037, sum=12.455 (7)", - "tab": "Efficiency", - "score": 1.7792604792087183 - }, - "MATH - # eval": { - "description": "min=30, mean=62.429, max=135, sum=437 (7)", - "tab": "General information", - "score": 62.42857142857143 - }, - "MATH - # train": { - "description": "min=8, mean=8, max=8, sum=56 (7)", - "tab": "General information", - "score": 8.0 - }, - "MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "MATH - # prompt tokens": { - "description": "min=881.363, mean=1262.909, max=2197.577, sum=8840.364 (7)", - "tab": "General information", - "score": 1262.9092130545007 - }, - "MATH - # output tokens": { - "description": "min=192.326, mean=245.345, max=274.462, sum=1717.412 (7)", - "tab": "General information", - "score": 245.34459229967183 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" - } - } - }, - { - "evaluation_name": "GSM8K", - "source_data": { - "dataset_name": "GSM8K", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on GSM8K", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.942, - "details": { - "description": "min=0.942, mean=0.942, max=0.942, sum=0.942 (1)", - "tab": "Accuracy", - "GSM8K - Observed inference time (s)": { - "description": "min=1.354, mean=1.354, max=1.354, sum=1.354 (1)", - "tab": "Efficiency", - "score": 1.3539768285751344 - }, - "GSM8K - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "GSM8K - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "GSM8K - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GSM8K - # prompt tokens": { - "description": "min=959.032, mean=959.032, max=959.032, sum=959.032 (1)", - "tab": "General information", - "score": 959.032 - }, - "GSM8K - # output tokens": { - "description": "min=155.609, mean=155.609, max=155.609, sum=155.609 (1)", - "tab": "General information", - "score": 155.609 - } - } - }, - "generation_config": { - "additional_details": { - "stop": "none" - } - } - }, - { - "evaluation_name": "LegalBench", - "source_data": { - "dataset_name": "LegalBench", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on LegalBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.725, - "details": { - "description": "min=0.428, mean=0.725, max=0.979, sum=3.627 (5)", - "tab": "Accuracy", - "LegalBench - Observed inference time (s)": { - "description": "min=0.374, mean=0.549, max=1.199, sum=2.745 (5)", - "tab": "Efficiency", - "score": 0.5490109607174599 - }, - "LegalBench - # eval": { - "description": "min=95, mean=409.4, max=1000, sum=2047 (5)", - "tab": "General information", - "score": 409.4 - }, - "LegalBench - # train": { - "description": "min=4, mean=4.8, max=5, sum=24 (5)", - "tab": "General information", - "score": 4.8 - }, - "LegalBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "LegalBench - # prompt tokens": { - "description": "min=197.442, mean=1513.882, max=6300.012, sum=7569.412 (5)", - "tab": "General information", - "score": 1513.8824197238912 - }, - "LegalBench - # output tokens": { - "description": "min=2, mean=2.404, max=3.032, sum=12.02 (5)", - "tab": "General information", - "score": 2.404037659543955 - } - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] - } - } - }, - { - "evaluation_name": "MedQA", - "source_data": { - "dataset_name": "MedQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MedQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.761, - "details": { - "description": "min=0.761, mean=0.761, max=0.761, sum=0.761 (1)", - "tab": "Accuracy", - "MedQA - Observed inference time (s)": { - "description": "min=0.359, mean=0.359, max=0.359, sum=0.359 (1)", - "tab": "Efficiency", - "score": 0.35867250700357184 - }, - "MedQA - # eval": { - "description": "min=503, mean=503, max=503, sum=503 (1)", - "tab": "General information", - "score": 503.0 - }, - "MedQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "MedQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MedQA - # prompt tokens": { - "description": "min=1025.274, mean=1025.274, max=1025.274, sum=1025.274 (1)", - "tab": "General information", - "score": 1025.2743538767395 - }, - "MedQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WMT 2014", - "source_data": { - "dataset_name": "WMT 2014", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.219, - 
"details": { - "description": "min=0.18, mean=0.219, max=0.261, sum=1.096 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time (s)": { - "description": "min=0.587, mean=0.62, max=0.685, sum=3.1 (5)", - "tab": "Efficiency", - "score": 0.6200136459034178 - }, - "WMT 2014 - # eval": { - "description": "min=503, mean=568.8, max=832, sum=2844 (5)", - "tab": "General information", - "score": 568.8 - }, - "WMT 2014 - # train": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "WMT 2014 - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "WMT 2014 - # prompt tokens": { - "description": "min=96.139, mean=115.712, max=136.117, sum=578.559 (5)", - "tab": "General information", - "score": 115.71178123566294 - }, - "WMT 2014 - # output tokens": { - "description": "min=25.161, mean=26.542, max=27.189, sum=132.708 (5)", - "tab": "General information", - "score": 26.541526800734054 - } - } - }, - "generation_config": { - "additional_details": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_lite/meta/llama-65b/9f73f3e5-b573-45d4-8c98-82f5c496f786.json b/data/helm_lite/meta/llama-65b/9f73f3e5-b573-45d4-8c98-82f5c496f786.json deleted file mode 100644 index ebea32b6c..000000000 --- a/data/helm_lite/meta/llama-65b/9f73f3e5-b573-45d4-8c98-82f5c496f786.json +++ /dev/null @@ -1,641 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_lite/meta_llama-65b/1770834614.1822479", - "retrieved_timestamp": "1770834614.1822479", - "source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "LLaMA 65B", - "id": "meta/llama-65b", - "developer": "meta", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_lite", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.345, - "details": { - "tab": "Accuracy", - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.07451935081148564 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.755, - "details": { - "description": "min=0.755, mean=0.755, max=0.755, sum=0.755 (1)", - "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": { - "description": "min=2.909, mean=2.909, max=2.909, sum=2.909 (1)", - "tab": 
"Efficiency", - "score": 2.9087761751362975 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=1.434, mean=1.434, max=1.434, sum=1.434 (1)", - "tab": "General information", - "score": 1.4338028169014085 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=1539.586, mean=1539.586, max=1539.586, sum=1539.586 (1)", - "tab": "General information", - "score": 1539.5859154929578 - }, - "NarrativeQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (closed-book)", - "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.433, - "details": { - "description": "min=0.433, mean=0.433, max=0.433, sum=0.433 (1)", - "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time (s)": { - "description": "min=1.361, mean=1.361, max=1.361, sum=1.361 (1)", - "tab": "Efficiency", - "score": 1.3611893365383148 - }, - "NaturalQuestions (closed-book) - Observed inference time (s)": { - "description": "min=4.704, mean=4.704, max=4.704, sum=4.704 (1)", - "tab": "Efficiency", - "score": 4.703710767745972 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=3.722, mean=3.722, max=3.722, sum=3.722 (1)", - "tab": "General information", - "score": 3.722 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.049, mean=0.049, max=0.049, sum=0.049 (1)", - "tab": "General information", - "score": 0.049 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1407.129, mean=1407.129, max=1407.129, sum=1407.129 (1)", - "tab": "General information", - "score": 1407.129 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=0.985, mean=0.985, max=0.985, sum=0.985 (1)", - "tab": "General information", - "score": 0.985 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=137.383, mean=137.383, max=137.383, sum=137.383 (1)", - "tab": "General information", - "score": 137.383 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General 
information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "mode": "closedbook" - } - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.754, - "details": { - "description": "min=0.754, mean=0.754, max=0.754, sum=0.754 (1)", - "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": { - "description": "min=4.49, mean=4.49, max=4.49, sum=4.49 (1)", - "tab": "Efficiency", - "score": 4.490233006477356 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=282.574, mean=282.574, max=282.574, sum=282.574 (1)", - "tab": "General information", - "score": 282.574 - }, - "OpenbookQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.584, - "details": { - "description": "min=0.34, mean=0.584, max=0.89, sum=2.919 (5)", - "tab": "Accuracy", - "MMLU - Observed inference time (s)": { - "description": "min=1.962, mean=3.925, max=5.875, sum=19.627 (5)", - "tab": "Efficiency", - "score": 3.925460591943641 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=397.65, mean=522.547, max=684.675, sum=2612.735 (5)", - "tab": "General information", - "score": 522.5470877192982 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MATH", - "source_data": { - "dataset_name": "MATH", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.257, - "details": { - "description": "min=0.096, mean=0.257, max=0.474, sum=1.802 (7)", - "tab": "Accuracy", - "MATH - Observed inference time (s)": { - "description": "min=13.711, mean=20.79, max=30.888, sum=145.531 (7)", - "tab": "Efficiency", - "score": 20.790176352238564 - }, - "MATH - # eval": { - "description": "min=30, mean=62.429, max=135, sum=437 (7)", - "tab": "General information", - "score": 62.42857142857143 - }, - "MATH - # train": { - "description": "min=2.962, mean=6.897, max=8, sum=48.277 (7)", - "tab": "General information", - "score": 6.896761133603239 - }, - "MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "MATH - # prompt tokens": { - "description": "min=971.652, mean=1214.707, max=1552.038, sum=8502.951 (7)", - "tab": "General information", - "score": 1214.7073423969382 - }, - "MATH - # output tokens": { - "description": "min=1, mean=1, max=1, sum=7 (7)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" - } - } - }, - { - "evaluation_name": "GSM8K", - "source_data": { - "dataset_name": "GSM8K", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on GSM8K", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.489, - "details": { - "description": "min=0.489, mean=0.489, max=0.489, sum=0.489 (1)", - "tab": "Accuracy", - "GSM8K - Observed inference time (s)": { - "description": "min=12.339, mean=12.339, max=12.339, sum=12.339 (1)", - "tab": "Efficiency", - "score": 12.338884568691254 - }, - "GSM8K - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "GSM8K - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "GSM8K - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GSM8K - # prompt tokens": { - "description": "min=1207.746, mean=1207.746, max=1207.746, sum=1207.746 (1)", - "tab": "General information", - "score": 1207.746 - }, - "GSM8K - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "LegalBench", - "source_data": { - "dataset_name": "LegalBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on LegalBench", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.48, - "details": { - "description": "min=0.018, mean=0.48, max=0.863, sum=2.401 (5)", - "tab": "Accuracy", - "LegalBench - Observed inference time (s)": { - "description": "min=1.489, mean=3.974, max=6.264, sum=19.868 (5)", - "tab": "Efficiency", - "score": 3.9735240905509466 - }, - "LegalBench - # eval": { - "description": "min=95, mean=409.4, max=1000, sum=2047 (5)", - "tab": "General information", - "score": 409.4 - }, - "LegalBench - # train": { - "description": "min=0.024, mean=3.805, max=5, sum=19.024 (5)", - "tab": "General information", - "score": 3.8048979591836734 - }, - "LegalBench - truncated": { - "description": "min=0, mean=0.006, max=0.031, sum=0.031 (5)", - "tab": "General information", - "score": 0.006122448979591836 - }, - "LegalBench - # prompt tokens": { - "description": "min=222.137, mean=595.161, max=1481.433, sum=2975.806 (5)", - "tab": "General information", - "score": 595.1612280165185 - }, - "LegalBench - # output tokens": { - "description": "min=0.882, mean=0.976, max=1, sum=4.882 (5)", - "tab": "General information", - "score": 0.9763265306122448 - } - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] - } - } - }, - { - "evaluation_name": "MedQA", - "source_data": { - "dataset_name": "MedQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MedQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.507, - "details": { - "description": "min=0.507, mean=0.507, max=0.507, sum=0.507 (1)", - "tab": "Accuracy", - "MedQA - Observed inference time (s)": { - "description": "min=4.984, mean=4.984, max=4.984, sum=4.984 (1)", - "tab": "Efficiency", - "score": 4.983887912264875 - }, - "MedQA - # eval": { - "description": "min=503, mean=503, max=503, sum=503 (1)", - "tab": "General information", - "score": 503.0 - }, - "MedQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "MedQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MedQA - # prompt tokens": { - "description": "min=1234.901, mean=1234.901, max=1234.901, sum=1234.901 (1)", - "tab": "General information", - "score": 1234.9005964214712 - }, - "MedQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WMT 2014", - "source_data": { - "dataset_name": "WMT 2014", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.189, - "details": { - "description": "min=0.102, mean=0.189, max=0.239, sum=0.945 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time (s)": { - "description": "min=2.057, mean=3.603, max=8.087, sum=18.014 
(5)", - "tab": "Efficiency", - "score": 3.6028029962680237 - }, - "WMT 2014 - # eval": { - "description": "min=503, mean=568.8, max=832, sum=2844 (5)", - "tab": "General information", - "score": 568.8 - }, - "WMT 2014 - # train": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "WMT 2014 - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "WMT 2014 - # prompt tokens": { - "description": "min=127.523, mean=142.288, max=164.972, sum=711.438 (5)", - "tab": "General information", - "score": 142.28751290334915 - }, - "WMT 2014 - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_lite/microsoft/phi-2/a06a38e5-c198-4efd-96f3-b52bd7f9c6dc.json b/data/helm_lite/microsoft/phi-2/a06a38e5-c198-4efd-96f3-b52bd7f9c6dc.json deleted file mode 100644 index ee330c2d2..000000000 --- a/data/helm_lite/microsoft/phi-2/a06a38e5-c198-4efd-96f3-b52bd7f9c6dc.json +++ /dev/null @@ -1,641 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_lite/microsoft_phi-2/1770834614.1822479", - "retrieved_timestamp": "1770834614.1822479", - "source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Phi-2", - "id": "microsoft/phi-2", - "developer": "microsoft", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_lite", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.169, - "details": { - "tab": "Accuracy", - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.9032709113607991 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.703, - "details": { - "description": "min=0.703, mean=0.703, max=0.703, sum=0.703 (1)", - "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": { - "description": "min=0.493, mean=0.493, max=0.493, sum=0.493 (1)", - "tab": "Efficiency", - "score": 0.49325697791408485 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": 
"min=2.085, mean=2.085, max=2.085, sum=2.085 (1)", - "tab": "General information", - "score": 2.084507042253521 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=1705.006, mean=1705.006, max=1705.006, sum=1705.006 (1)", - "tab": "General information", - "score": 1705.0056338028169 - }, - "NarrativeQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (closed-book)", - "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.155, - "details": { - "description": "min=0.155, mean=0.155, max=0.155, sum=0.155 (1)", - "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time (s)": { - "description": "min=0.47, mean=0.47, max=0.47, sum=0.47 (1)", - "tab": "Efficiency", - "score": 0.46984758591651915 - }, - "NaturalQuestions (closed-book) - Observed inference time (s)": { - "description": "min=0.292, mean=0.292, max=0.292, sum=0.292 (1)", - "tab": "Efficiency", - "score": 0.29179329943656923 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.706, mean=4.706, max=4.706, sum=4.706 (1)", - "tab": "General information", - "score": 4.706 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.036, mean=0.036, max=0.036, sum=0.036 (1)", - "tab": "General information", - "score": 0.036 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1493.994, mean=1493.994, max=1493.994, sum=1493.994 (1)", - "tab": "General information", - "score": 1493.994 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=116.254, mean=116.254, max=116.254, sum=116.254 (1)", - "tab": "General information", - "score": 116.254 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "mode": "closedbook" - } - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.798, - "details": { - "description": "min=0.798, mean=0.798, max=0.798, sum=0.798 (1)", - "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": { - "description": "min=0.262, mean=0.262, max=0.262, sum=0.262 (1)", - "tab": "Efficiency", - "score": 0.2615062308311462 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=254.216, mean=254.216, max=254.216, sum=254.216 (1)", - "tab": "General information", - "score": 254.216 - }, - "OpenbookQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.518, - "details": { - "description": "min=0.31, mean=0.518, max=0.78, sum=2.592 (5)", - "tab": "Accuracy", - "MMLU - Observed inference time (s)": { - "description": "min=0.27, mean=0.285, max=0.295, sum=1.426 (5)", - "tab": "Efficiency", - "score": 0.28525047320650343 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=371.38, mean=472.274, max=624.07, sum=2361.37 (5)", - "tab": "General information", - "score": 472.2740350877192 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MATH", - "source_data": { - "dataset_name": "MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.255, - "details": { - "description": "min=0.033, mean=0.255, max=0.465, sum=1.786 (7)", - "tab": "Accuracy", - "MATH - Observed inference time (s)": { - "description": "min=0.923, mean=1.129, max=1.577, sum=7.902 (7)", - "tab": "Efficiency", - "score": 1.1288332585709453 - }, - "MATH - # eval": { - "description": "min=30, mean=62.429, max=135, sum=437 (7)", - "tab": "General information", - "score": 62.42857142857143 - }, - "MATH - # train": { - "description": "min=2.962, mean=6.916, max=8, sum=48.409 (7)", - "tab": "General information", - "score": 6.915558126084441 - }, - "MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "MATH - # prompt tokens": { - "description": "min=906.541, mean=1162.126, max=1511.442, sum=8134.881 (7)", - "tab": "General information", - "score": 1162.1258475895563 - }, - "MATH - # output tokens": { - "description": "min=1, mean=1, max=1, sum=7 (7)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" - } - } - }, - { - "evaluation_name": "GSM8K", - "source_data": { - "dataset_name": "GSM8K", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on GSM8K", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.581, - "details": { - "description": "min=0.581, mean=0.581, max=0.581, sum=0.581 (1)", - "tab": "Accuracy", - "GSM8K - Observed inference time (s)": { - "description": "min=1.147, mean=1.147, max=1.147, sum=1.147 (1)", - "tab": "Efficiency", - "score": 1.1468114259243012 - }, - "GSM8K - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "GSM8K - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "GSM8K - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GSM8K - # prompt tokens": { - "description": "min=938.893, mean=938.893, max=938.893, sum=938.893 (1)", - "tab": "General information", - "score": 938.893 - }, - "GSM8K - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "LegalBench", - "source_data": { - "dataset_name": "LegalBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on LegalBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.334, - "details": { - "description": "min=0.137, mean=0.334, max=0.537, sum=1.672 (5)", - "tab": "Accuracy", - "LegalBench - Observed inference time (s)": { - "description": "min=0.268, mean=0.303, max=0.381, sum=1.517 (5)", - "tab": "Efficiency", - "score": 
0.3034723702962031 - }, - "LegalBench - # eval": { - "description": "min=95, mean=409.4, max=1000, sum=2047 (5)", - "tab": "General information", - "score": 409.4 - }, - "LegalBench - # train": { - "description": "min=0.337, mean=3.867, max=5, sum=19.337 (5)", - "tab": "General information", - "score": 3.8673469387755106 - }, - "LegalBench - truncated": { - "description": "min=0, mean=0.003, max=0.014, sum=0.014 (5)", - "tab": "General information", - "score": 0.002857142857142857 - }, - "LegalBench - # prompt tokens": { - "description": "min=205.632, mean=566.249, max=1519.543, sum=2831.243 (5)", - "tab": "General information", - "score": 566.2485439511586 - }, - "LegalBench - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] - } - } - }, - { - "evaluation_name": "MedQA", - "source_data": { - "dataset_name": "MedQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MedQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.41, - "details": { - "description": "min=0.41, mean=0.41, max=0.41, sum=0.41 (1)", - "tab": "Accuracy", - "MedQA - Observed inference time (s)": { - "description": "min=0.275, mean=0.275, max=0.275, sum=0.275 (1)", - "tab": "Efficiency", - "score": 0.27509861532783886 - }, - "MedQA - # eval": { - "description": "min=503, mean=503, max=503, sum=503 (1)", - "tab": "General information", - "score": 503.0 - }, - "MedQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "MedQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MedQA - # prompt tokens": { - "description": "min=1038.833, mean=1038.833, max=1038.833, sum=1038.833 (1)", - "tab": "General information", - "score": 1038.8330019880716 - }, - "MedQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WMT 2014", - "source_data": { - "dataset_name": "WMT 2014", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.038, - "details": { - "description": "min=0.0, mean=0.038, max=0.113, sum=0.189 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time (s)": { - "description": "min=0.427, mean=0.47, max=0.534, sum=2.35 (5)", - "tab": "Efficiency", - "score": 0.47001117224047206 - }, - "WMT 2014 - # eval": { - "description": "min=503, mean=568.8, max=832, sum=2844 (5)", - "tab": "General information", - "score": 568.8 - }, - "WMT 2014 - # train": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "WMT 2014 - truncated": { - 
"description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "WMT 2014 - # prompt tokens": { - "description": "min=136.93, mean=181.692, max=241.656, sum=908.462 (5)", - "tab": "General information", - "score": 181.69235022556967 - }, - "WMT 2014 - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_lite/microsoft/phi-3-medium-4k-instruct/4ff688da-61a0-43ce-9c2d-e1c197887683.json b/data/helm_lite/microsoft/phi-3-medium-4k-instruct/4ff688da-61a0-43ce-9c2d-e1c197887683.json deleted file mode 100644 index 6d945026f..000000000 --- a/data/helm_lite/microsoft/phi-3-medium-4k-instruct/4ff688da-61a0-43ce-9c2d-e1c197887683.json +++ /dev/null @@ -1,641 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_lite/microsoft_phi-3-medium-4k-instruct/1770834614.1822479", - "retrieved_timestamp": "1770834614.1822479", - "source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Phi-3 14B", - "id": "microsoft/phi-3-medium-4k-instruct", - "developer": "microsoft", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_lite", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.509, - "details": { - "tab": "Accuracy", - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.12111111111111111 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.724, - "details": { - "description": "min=0.724, mean=0.724, max=0.724, sum=0.724 (1)", - "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": { - "description": "min=29.509, mean=29.509, max=29.509, sum=29.509 (1)", - "tab": "Efficiency", - "score": 29.5092350200868 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=4.392, mean=4.392, max=4.392, sum=4.392 (1)", - "tab": "General information", - "score": 4.391549295774648 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - 
# prompt tokens": { - "description": "min=3685.707, mean=3685.707, max=3685.707, sum=3685.707 (1)", - "tab": "General information", - "score": 3685.707042253521 - }, - "NarrativeQA - # output tokens": { - "description": "min=7.245, mean=7.245, max=7.245, sum=7.245 (1)", - "tab": "General information", - "score": 7.245070422535211 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (closed-book)", - "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.278, - "details": { - "description": "min=0.278, mean=0.278, max=0.278, sum=0.278 (1)", - "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time (s)": { - "description": "min=44.238, mean=44.238, max=44.238, sum=44.238 (1)", - "tab": "Efficiency", - "score": 44.23756227874756 - }, - "NaturalQuestions (closed-book) - Observed inference time (s)": { - "description": "min=49.743, mean=49.743, max=49.743, sum=49.743 (1)", - "tab": "Efficiency", - "score": 49.743374599456786 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.83, mean=4.83, max=4.83, sum=4.83 (1)", - "tab": "General information", - "score": 4.83 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.026, mean=0.026, max=0.026, sum=0.026 (1)", - "tab": "General information", - "score": 0.026 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=2307.695, mean=2307.695, max=2307.695, sum=2307.695 (1)", - "tab": "General information", - "score": 2307.695 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=8.371, mean=8.371, max=8.371, sum=8.371 (1)", - "tab": "General information", - "score": 8.371 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=156.383, mean=156.383, max=156.383, sum=156.383 (1)", - "tab": "General information", - "score": 156.383 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=10.079, mean=10.079, max=10.079, sum=10.079 (1)", - "tab": "General information", - "score": 10.079 - } - } - }, - "generation_config": { - "additional_details": { - "mode": "closedbook" - } - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.916, - "details": { - "description": "min=0.916, mean=0.916, max=0.916, sum=0.916 (1)", - "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": { - "description": "min=0.385, mean=0.385, max=0.385, sum=0.385 (1)", - "tab": "Efficiency", - "score": 0.3850016188621521 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=291.574, mean=291.574, max=291.574, sum=291.574 (1)", - "tab": "General information", - "score": 291.574 - }, - "OpenbookQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.675, - "details": { - "description": "min=0.48, mean=0.675, max=0.94, sum=3.375 (5)", - "tab": "Accuracy", - "MMLU - Observed inference time (s)": { - "description": "min=0.381, mean=0.504, max=0.722, sum=2.52 (5)", - "tab": "Efficiency", - "score": 0.5039482383811682 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=406.65, mean=531.547, max=693.675, sum=2657.735 (5)", - "tab": "General information", - "score": 531.5470877192982 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MATH", - "source_data": { - "dataset_name": "MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.611, - "details": { - "description": "min=0.462, mean=0.611, max=0.7, sum=4.277 (7)", - "tab": "Accuracy", - "MATH - Observed inference time (s)": { - "description": "min=67.969, 
mean=71.561, max=74.993, sum=500.925 (7)", - "tab": "Efficiency", - "score": 71.56076915436368 - }, - "MATH - # eval": { - "description": "min=30, mean=62.429, max=135, sum=437 (7)", - "tab": "General information", - "score": 62.42857142857143 - }, - "MATH - # train": { - "description": "min=8, mean=8, max=8, sum=56 (7)", - "tab": "General information", - "score": 8.0 - }, - "MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "MATH - # prompt tokens": { - "description": "min=971.652, mean=1438.636, max=2490.962, sum=10070.453 (7)", - "tab": "General information", - "score": 1438.6362030100095 - }, - "MATH - # output tokens": { - "description": "min=357.548, mean=372.128, max=392.767, sum=2604.893 (7)", - "tab": "General information", - "score": 372.1276343562145 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" - } - } - }, - { - "evaluation_name": "GSM8K", - "source_data": { - "dataset_name": "GSM8K", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on GSM8K", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.878, - "details": { - "description": "min=0.878, mean=0.878, max=0.878, sum=0.878 (1)", - "tab": "Accuracy", - "GSM8K - Observed inference time (s)": { - "description": "min=74.933, mean=74.933, max=74.933, sum=74.933 (1)", - "tab": "Efficiency", - "score": 74.93269198083877 - }, - "GSM8K - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "GSM8K - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "GSM8K - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GSM8K - # prompt tokens": { - "description": "min=1207.746, mean=1207.746, max=1207.746, sum=1207.746 (1)", - "tab": "General information", - "score": 1207.746 - }, - "GSM8K - # output tokens": { - "description": "min=400, mean=400, max=400, sum=400 (1)", - "tab": "General information", - "score": 400.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "LegalBench", - "source_data": { - "dataset_name": "LegalBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on LegalBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.593, - "details": { - "description": "min=0.365, mean=0.593, max=0.811, sum=2.966 (5)", - "tab": "Accuracy", - "LegalBench - Observed inference time (s)": { - "description": "min=5.972, mean=7.879, max=14.755, sum=39.397 (5)", - "tab": "Efficiency", - "score": 7.879368148866983 - }, - "LegalBench - # eval": { - "description": "min=95, mean=409.4, max=1000, sum=2047 (5)", - "tab": "General information", - "score": 409.4 - }, - 
"LegalBench - # train": { - "description": "min=1.884, mean=4.177, max=5, sum=20.884 (5)", - "tab": "General information", - "score": 4.176734693877551 - }, - "LegalBench - truncated": { - "description": "min=0, mean=0.001, max=0.004, sum=0.004 (5)", - "tab": "General information", - "score": 0.0008163265306122449 - }, - "LegalBench - # prompt tokens": { - "description": "min=229.137, mean=1033.818, max=3646.718, sum=5169.092 (5)", - "tab": "General information", - "score": 1033.8183708736613 - }, - "LegalBench - # output tokens": { - "description": "min=1, mean=1.356, max=1.979, sum=6.782 (5)", - "tab": "General information", - "score": 1.3564703389458466 - } - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] - } - } - }, - { - "evaluation_name": "MedQA", - "source_data": { - "dataset_name": "MedQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MedQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.696, - "details": { - "description": "min=0.696, mean=0.696, max=0.696, sum=0.696 (1)", - "tab": "Accuracy", - "MedQA - Observed inference time (s)": { - "description": "min=1.792, mean=1.792, max=1.792, sum=1.792 (1)", - "tab": "Efficiency", - "score": 1.7916561092581473 - }, - "MedQA - # eval": { - "description": "min=503, mean=503, max=503, sum=503 (1)", - "tab": "General information", - "score": 503.0 - }, - "MedQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "MedQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MedQA - # prompt tokens": { - "description": "min=1243.901, mean=1243.901, max=1243.901, sum=1243.901 (1)", - "tab": "General information", - "score": 1243.9005964214712 - }, - "MedQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WMT 2014", - "source_data": { - "dataset_name": "WMT 2014", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.17, - "details": { - "description": "min=0.086, mean=0.17, max=0.218, sum=0.85 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time (s)": { - "description": "min=19.742, mean=19.987, max=20.079, sum=99.934 (5)", - "tab": "Efficiency", - "score": 19.98681167411759 - }, - "WMT 2014 - # eval": { - "description": "min=503, mean=568.8, max=832, sum=2844 (5)", - "tab": "General information", - "score": 568.8 - }, - "WMT 2014 - # train": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "WMT 2014 - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "WMT 2014 - # prompt tokens": { - 
"description": "min=135.523, mean=150.288, max=172.972, sum=751.438 (5)", - "tab": "General information", - "score": 150.28751290334915 - }, - "WMT 2014 - # output tokens": { - "description": "min=98.254, mean=99.651, max=100, sum=498.254 (5)", - "tab": "General information", - "score": 99.65089463220676 - } - } - }, - "generation_config": { - "additional_details": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_lite/microsoft/phi-3-small-8k-instruct/181003ea-7587-4c93-8b89-c5c76958313d.json b/data/helm_lite/microsoft/phi-3-small-8k-instruct/181003ea-7587-4c93-8b89-c5c76958313d.json deleted file mode 100644 index c7b88764b..000000000 --- a/data/helm_lite/microsoft/phi-3-small-8k-instruct/181003ea-7587-4c93-8b89-c5c76958313d.json +++ /dev/null @@ -1,641 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_lite/microsoft_phi-3-small-8k-instruct/1770834614.1822479", - "retrieved_timestamp": "1770834614.1822479", - "source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Phi-3 7B", - "id": "microsoft/phi-3-small-8k-instruct", - "developer": "microsoft", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_lite", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.473, - "details": { - "tab": "Accuracy", - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.18641975308641975 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.754, - "details": { - "description": "min=0.754, mean=0.754, max=0.754, sum=0.754 (1)", - "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": { - "description": "min=30.408, mean=30.408, max=30.408, sum=30.408 (1)", - "tab": "Efficiency", - "score": 30.40753108749927 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=3485.67, mean=3485.67, max=3485.67, sum=3485.67 (1)", - "tab": "General information", - "score": 
3485.6704225352114 - }, - "NarrativeQA - # output tokens": { - "description": "min=33.71, mean=33.71, max=33.71, sum=33.71 (1)", - "tab": "General information", - "score": 33.709859154929575 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (closed-book)", - "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.324, - "details": { - "description": "min=0.324, mean=0.324, max=0.324, sum=0.324 (1)", - "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time (s)": { - "description": "min=68.232, mean=68.232, max=68.232, sum=68.232 (1)", - "tab": "Efficiency", - "score": 68.2322377743721 - }, - "NaturalQuestions (closed-book) - Observed inference time (s)": { - "description": "min=63.003, mean=63.003, max=63.003, sum=63.003 (1)", - "tab": "Efficiency", - "score": 63.00250503087044 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.965, mean=4.965, max=4.965, sum=4.965 (1)", - "tab": "General information", - "score": 4.965 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.007, mean=0.007, max=0.007, sum=0.007 (1)", - "tab": "General information", - "score": 0.007 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1675.981, mean=1675.981, max=1675.981, sum=1675.981 (1)", - "tab": "General information", - "score": 1675.981 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=16.786, mean=16.786, max=16.786, sum=16.786 (1)", - "tab": "General information", - "score": 16.786 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=129.127, mean=129.127, max=129.127, sum=129.127 (1)", - "tab": "General information", - "score": 129.127 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=36.311, mean=36.311, max=36.311, sum=36.311 (1)", - "tab": "General information", - "score": 36.311 - } - } - }, - "generation_config": { - "additional_details": { - "mode": "closedbook" - } - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 
0.912, - "details": { - "description": "min=0.912, mean=0.912, max=0.912, sum=0.912 (1)", - "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": { - "description": "min=0.289, mean=0.289, max=0.289, sum=0.289 (1)", - "tab": "Efficiency", - "score": 0.28856802701950074 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=249.782, mean=249.782, max=249.782, sum=249.782 (1)", - "tab": "General information", - "score": 249.782 - }, - "OpenbookQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.659, - "details": { - "description": "min=0.44, mean=0.659, max=0.95, sum=3.296 (5)", - "tab": "Accuracy", - "MMLU - Observed inference time (s)": { - "description": "min=0.275, mean=0.406, max=0.549, sum=2.032 (5)", - "tab": "Efficiency", - "score": 0.406433069689232 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=373.44, mean=467.72, max=614.43, sum=2338.6 (5)", - "tab": "General information", - "score": 467.71996491228066 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MATH", - "source_data": { - "dataset_name": "MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.703, - "details": { - "description": "min=0.538, mean=0.703, max=0.933, sum=4.922 (7)", - "tab": "Accuracy", - "MATH - Observed inference time (s)": { - "description": "min=49.379, mean=60.681, max=73.413, sum=424.765 (7)", - "tab": "Efficiency", - "score": 60.680695580739844 - }, - "MATH - # eval": { - "description": 
"min=30, mean=62.429, max=135, sum=437 (7)", - "tab": "General information", - "score": 62.42857142857143 - }, - "MATH - # train": { - "description": "min=8, mean=8, max=8, sum=56 (7)", - "tab": "General information", - "score": 8.0 - }, - "MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "MATH - # prompt tokens": { - "description": "min=881.363, mean=1262.911, max=2197.577, sum=8840.376 (7)", - "tab": "General information", - "score": 1262.9108741840687 - }, - "MATH - # output tokens": { - "description": "min=57.779, mean=115.236, max=283.904, sum=806.654 (7)", - "tab": "General information", - "score": 115.23627800867702 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" - } - } - }, - { - "evaluation_name": "GSM8K", - "source_data": { - "dataset_name": "GSM8K", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on GSM8K", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -1.0, - "details": { - "description": "No matching runs", - "tab": "Accuracy", - "GSM8K - Observed inference time (s)": { - "description": "No matching runs", - "tab": "Efficiency", - "score": null - }, - "GSM8K - # eval": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "GSM8K - # train": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "GSM8K - truncated": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "GSM8K - # prompt tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - }, - "GSM8K - # output tokens": { - "description": "No matching runs", - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "LegalBench", - "source_data": { - "dataset_name": "LegalBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on LegalBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.584, - "details": { - "description": "min=0.395, mean=0.584, max=0.895, sum=2.92 (5)", - "tab": "Accuracy", - "LegalBench - Observed inference time (s)": { - "description": "min=6.293, mean=8.342, max=16.012, sum=41.71 (5)", - "tab": "Efficiency", - "score": 8.34200078530511 - }, - "LegalBench - # eval": { - "description": "min=95, mean=409.4, max=1000, sum=2047 (5)", - "tab": "General information", - "score": 409.4 - }, - "LegalBench - # train": { - "description": "min=4, mean=4.798, max=5, sum=23.992 (5)", - "tab": "General information", - "score": 4.798367346938775 - }, - "LegalBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "LegalBench - # prompt tokens": { - "description": 
"min=197.442, mean=1512.687, max=6294.008, sum=7563.435 (5)", - "tab": "General information", - "score": 1512.6870529886412 - }, - "LegalBench - # output tokens": { - "description": "min=1, mean=1.192, max=1.538, sum=5.96 (5)", - "tab": "General information", - "score": 1.192017037143267 - } - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] - } - } - }, - { - "evaluation_name": "MedQA", - "source_data": { - "dataset_name": "MedQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MedQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.672, - "details": { - "description": "min=0.672, mean=0.672, max=0.672, sum=0.672 (1)", - "tab": "Accuracy", - "MedQA - Observed inference time (s)": { - "description": "min=0.89, mean=0.89, max=0.89, sum=0.89 (1)", - "tab": "Efficiency", - "score": 0.8902683931126983 - }, - "MedQA - # eval": { - "description": "min=503, mean=503, max=503, sum=503 (1)", - "tab": "General information", - "score": 503.0 - }, - "MedQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "MedQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MedQA - # prompt tokens": { - "description": "min=1027.414, mean=1027.414, max=1027.414, sum=1027.414 (1)", - "tab": "General information", - "score": 1027.4135188866799 - }, - "MedQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WMT 2014", - "source_data": { - "dataset_name": "WMT 2014", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.154, - "details": { - "description": "min=0.043, mean=0.154, max=0.205, sum=0.772 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time (s)": { - "description": "min=20.252, mean=20.399, max=20.714, sum=101.996 (5)", - "tab": "Efficiency", - "score": 20.399208641134514 - }, - "WMT 2014 - # eval": { - "description": "min=503, mean=568.8, max=832, sum=2844 (5)", - "tab": "General information", - "score": 568.8 - }, - "WMT 2014 - # train": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "WMT 2014 - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "WMT 2014 - # prompt tokens": { - "description": "min=114.901, mean=138.043, max=158.185, sum=690.213 (5)", - "tab": "General information", - "score": 138.04258583116683 - }, - "WMT 2014 - # output tokens": { - "description": "min=96.311, mean=96.966, max=98.575, sum=484.832 (5)", - "tab": "General information", - "score": 96.96643456568283 - } - } - }, - "generation_config": { - "additional_details": { - 
"language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_lite/mistralai/mistral-7b-instruct-v0.3/66688228-e59a-4caa-b3fb-c5df1efc9db4.json b/data/helm_lite/mistralai/mistral-7b-instruct-v0.3/66688228-e59a-4caa-b3fb-c5df1efc9db4.json deleted file mode 100644 index fd0f8e02b..000000000 --- a/data/helm_lite/mistralai/mistral-7b-instruct-v0.3/66688228-e59a-4caa-b3fb-c5df1efc9db4.json +++ /dev/null @@ -1,643 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_lite/mistralai_mistral-7b-instruct-v0.3/1770834614.1822479", - "retrieved_timestamp": "1770834614.1822479", - "source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral Instruct v0.3 7B", - "id": "mistralai/mistral-7b-instruct-v0.3", - "developer": "mistralai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_lite", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.196, - "details": { - "tab": "Accuracy", - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.6493133583021223 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.716, - "details": { - "description": "min=0.716, mean=0.716, max=0.716, sum=0.716 (1)", - "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": { - "description": "min=0.813, mean=0.813, max=0.813, sum=0.813 (1)", - "tab": "Efficiency", - "score": 0.8132137520212522 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=3924.33, mean=3924.33, max=3924.33, sum=3924.33 (1)", - "tab": "General information", - "score": 3924.3295774647886 - }, - "NarrativeQA - # output tokens": { - "description": "min=7.107, mean=7.107, max=7.107, sum=7.107 (1)", - "tab": "General information", - "score": 7.107042253521127 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (closed-book)", - "source_data": { - 
"dataset_name": "NaturalQuestions (closed-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.253, - "details": { - "description": "min=0.253, mean=0.253, max=0.253, sum=0.253 (1)", - "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time (s)": { - "description": "min=0.563, mean=0.563, max=0.563, sum=0.563 (1)", - "tab": "Efficiency", - "score": 0.5634698050022126 - }, - "NaturalQuestions (closed-book) - Observed inference time (s)": { - "description": "min=0.535, mean=0.535, max=0.535, sum=0.535 (1)", - "tab": "Efficiency", - "score": 0.5347676448822022 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=2498.79, mean=2498.79, max=2498.79, sum=2498.79 (1)", - "tab": "General information", - "score": 2498.79 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=12.448, mean=12.448, max=12.448, sum=12.448 (1)", - "tab": "General information", - "score": 12.448 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=172.069, mean=172.069, max=172.069, sum=172.069 (1)", - "tab": "General information", - "score": 172.069 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=20.461, mean=20.461, max=20.461, sum=20.461 (1)", - "tab": "General information", - "score": 20.461 - } - } - }, - "generation_config": { - "additional_details": { - "mode": "closedbook" - } - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.79, - "details": { - "description": "min=0.79, mean=0.79, max=0.79, sum=0.79 (1)", - "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": { - "description": "min=0.256, mean=0.256, max=0.256, sum=0.256 (1)", - "tab": "Efficiency", - "score": 0.25593132400512697 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General 
information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=289.15, mean=289.15, max=289.15, sum=289.15 (1)", - "tab": "General information", - "score": 289.15 - }, - "OpenbookQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.51, - "details": { - "description": "min=0.27, mean=0.51, max=0.79, sum=2.551 (5)", - "tab": "Accuracy", - "MMLU - Observed inference time (s)": { - "description": "min=0.221, mean=0.372, max=0.487, sum=1.862 (5)", - "tab": "Efficiency", - "score": 0.37230395750413864 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=411.44, mean=532.091, max=696.175, sum=2660.455 (5)", - "tab": "General information", - "score": 532.0910877192983 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MATH", - "source_data": { - "dataset_name": "MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.289, - "details": { - "description": "min=0.115, mean=0.289, max=0.477, sum=2.02 (7)", - "tab": "Accuracy", - "MATH - Observed inference time (s)": { - "description": "min=2.027, mean=2.656, max=3.039, sum=18.593 (7)", - "tab": "Efficiency", - "score": 2.656151831465352 - }, - "MATH - # eval": { - "description": "min=30, mean=62.429, max=135, sum=437 (7)", - "tab": "General information", - "score": 62.42857142857143 - }, - "MATH - # train": { - "description": "min=8, mean=8, max=8, sum=56 (7)", - "tab": "General information", - "score": 8.0 - }, - "MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "MATH - # prompt tokens": { - 
"description": "min=991.615, mean=1455.266, max=2502.962, sum=10186.865 (7)", - "tab": "General information", - "score": 1455.2664139976257 - }, - "MATH - # output tokens": { - "description": "min=123.616, mean=149.99, max=172.789, sum=1049.933 (7)", - "tab": "General information", - "score": 149.99043902740354 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" - } - } - }, - { - "evaluation_name": "GSM8K", - "source_data": { - "dataset_name": "GSM8K", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on GSM8K", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.538, - "details": { - "description": "min=0.538, mean=0.538, max=0.538, sum=0.538 (1)", - "tab": "Accuracy", - "GSM8K - Observed inference time (s)": { - "description": "min=3.95, mean=3.95, max=3.95, sum=3.95 (1)", - "tab": "Efficiency", - "score": 3.949965229511261 - }, - "GSM8K - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "GSM8K - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "GSM8K - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GSM8K - # prompt tokens": { - "description": "min=1187.268, mean=1187.268, max=1187.268, sum=1187.268 (1)", - "tab": "General information", - "score": 1187.268 - }, - "GSM8K - # output tokens": { - "description": "min=196.611, mean=196.611, max=196.611, sum=196.611 (1)", - "tab": "General information", - "score": 196.611 - } - } - }, - "generation_config": { - "additional_details": { - "stop": "none" - } - } - }, - { - "evaluation_name": "LegalBench", - "source_data": { - "dataset_name": "LegalBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on LegalBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.331, - "details": { - "description": "min=0.063, mean=0.331, max=0.733, sum=1.655 (5)", - "tab": "Accuracy", - "LegalBench - Observed inference time (s)": { - "description": "min=0.316, mean=0.489, max=0.855, sum=2.444 (5)", - "tab": "Efficiency", - "score": 0.4887186054518059 - }, - "LegalBench - # eval": { - "description": "min=95, mean=409.4, max=1000, sum=2047 (5)", - "tab": "General information", - "score": 409.4 - }, - "LegalBench - # train": { - "description": "min=4, mean=4.8, max=5, sum=24 (5)", - "tab": "General information", - "score": 4.8 - }, - "LegalBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "LegalBench - # prompt tokens": { - "description": "min=236.453, mean=1750.748, max=7224.488, sum=8753.741 (5)", - "tab": "General information", - "score": 1750.7482458432962 - }, - "LegalBench - # output tokens": { - "description": 
"min=2, mean=9.174, max=15.242, sum=45.871 (5)", - "tab": "General information", - "score": 9.17419274343898 - } - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] - } - } - }, - { - "evaluation_name": "MedQA", - "source_data": { - "dataset_name": "MedQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MedQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.517, - "details": { - "description": "min=0.517, mean=0.517, max=0.517, sum=0.517 (1)", - "tab": "Accuracy", - "MedQA - Observed inference time (s)": { - "description": "min=0.418, mean=0.418, max=0.418, sum=0.418 (1)", - "tab": "Efficiency", - "score": 0.4182186216767692 - }, - "MedQA - # eval": { - "description": "min=503, mean=503, max=503, sum=503 (1)", - "tab": "General information", - "score": 503.0 - }, - "MedQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "MedQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MedQA - # prompt tokens": { - "description": "min=1202.093, mean=1202.093, max=1202.093, sum=1202.093 (1)", - "tab": "General information", - "score": 1202.0934393638172 - }, - "MedQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WMT 2014", - "source_data": { - "dataset_name": "WMT 2014", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.142, - "details": { - "description": "min=0.047, mean=0.142, max=0.184, sum=0.712 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time (s)": { - "description": "min=0.582, mean=0.775, max=0.872, sum=3.875 (5)", - "tab": "Efficiency", - "score": 0.7750062139801958 - }, - "WMT 2014 - # eval": { - "description": "min=503, mean=568.8, max=832, sum=2844 (5)", - "tab": "General information", - "score": 568.8 - }, - "WMT 2014 - # train": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "WMT 2014 - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "WMT 2014 - # prompt tokens": { - "description": "min=148.306, mean=162.433, max=181.018, sum=812.166 (5)", - "tab": "General information", - "score": 162.43317355482492 - }, - "WMT 2014 - # output tokens": { - "description": "min=28.3, mean=30.51, max=31.912, sum=152.552 (5)", - "tab": "General information", - "score": 30.510483732222053 - } - } - }, - "generation_config": { - "additional_details": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] - } - } - } - ] -} \ No newline at end of file diff --git 
a/data/helm_lite/mistralai/mistral-7b-v0.1/2d7d8bac-714e-49a8-a1a7-d88d759fe60a.json b/data/helm_lite/mistralai/mistral-7b-v0.1/2d7d8bac-714e-49a8-a1a7-d88d759fe60a.json deleted file mode 100644 index 8f4801f23..000000000 --- a/data/helm_lite/mistralai/mistral-7b-v0.1/2d7d8bac-714e-49a8-a1a7-d88d759fe60a.json +++ /dev/null @@ -1,641 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_lite/mistralai_mistral-7b-v0.1/1770834614.1822479", - "retrieved_timestamp": "1770834614.1822479", - "source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral v0.1 7B", - "id": "mistralai/mistral-7b-v0.1", - "developer": "mistralai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_lite", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.292, - "details": { - "tab": "Accuracy", - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.8075780274656679 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.716, - "details": { - "description": "min=0.716, mean=0.716, max=0.716, sum=0.716 (1)", - "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": { - "description": "min=0.705, mean=0.705, max=0.705, sum=0.705 (1)", - "tab": "Efficiency", - "score": 0.7051956902087574 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=4.575, mean=4.575, max=4.575, sum=4.575 (1)", - "tab": "General information", - "score": 4.574647887323944 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=3627.715, mean=3627.715, max=3627.715, sum=3627.715 (1)", - "tab": "General information", - "score": 3627.7154929577464 - }, - "NarrativeQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (closed-book)", - "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.367, - "details": { - "description": "min=0.367, mean=0.367, max=0.367, sum=0.367 (1)", - "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time (s)": { - "description": "min=0.494, mean=0.494, max=0.494, sum=0.494 (1)", - "tab": "Efficiency", - "score": 0.49417281556129455 - }, - "NaturalQuestions (closed-book) - Observed inference time (s)": { - "description": "min=0.462, mean=0.462, max=0.462, sum=0.462 (1)", - "tab": "Efficiency", - "score": 0.46181689071655274 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.832, mean=4.832, max=4.832, sum=4.832 (1)", - "tab": "General information", - "score": 4.832 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.026, mean=0.026, max=0.026, sum=0.026 (1)", - "tab": "General information", - "score": 0.026 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=2268.728, mean=2268.728, max=2268.728, sum=2268.728 (1)", - "tab": "General information", - "score": 2268.728 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=0.988, mean=0.988, max=0.988, sum=0.988 (1)", - "tab": "General information", - "score": 0.988 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=142.069, mean=142.069, max=142.069, sum=142.069 (1)", - "tab": "General information", - "score": 142.069 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "mode": "closedbook" - } - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.776, - "details": { - "description": "min=0.776, mean=0.776, max=0.776, sum=0.776 (1)", - "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": { - "description": "min=0.325, mean=0.325, max=0.325, sum=0.325 (1)", - "tab": "Efficiency", - "score": 0.32474704647064206 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - 
"description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=280.15, mean=280.15, max=280.15, sum=280.15 (1)", - "tab": "General information", - "score": 280.15 - }, - "OpenbookQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.584, - "details": { - "description": "min=0.31, mean=0.584, max=0.85, sum=2.918 (5)", - "tab": "Accuracy", - "MMLU - Observed inference time (s)": { - "description": "min=0.272, mean=0.291, max=0.304, sum=1.457 (5)", - "tab": "Efficiency", - "score": 0.2914179778851961 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=402.44, mean=523.091, max=687.175, sum=2615.455 (5)", - "tab": "General information", - "score": 523.0910877192983 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MATH", - "source_data": { - "dataset_name": "MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.297, - "details": { - "description": "min=0.067, mean=0.297, max=0.43, sum=2.082 (7)", - "tab": "Accuracy", - "MATH - Observed inference time (s)": { - "description": "min=0.992, mean=1.159, max=1.576, sum=8.114 (7)", - "tab": "Efficiency", - "score": 1.159214100149656 - }, - "MATH - # eval": { - "description": "min=30, mean=62.429, max=135, sum=437 (7)", - "tab": "General information", - "score": 62.42857142857143 - }, - "MATH - # train": { - "description": "min=8, mean=8, max=8, sum=56 (7)", - "tab": "General information", - "score": 8.0 - }, - "MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "MATH - # prompt tokens": { - "description": "min=991.615, mean=1455.266, max=2502.962, 
sum=10186.865 (7)", - "tab": "General information", - "score": 1455.2664139976257 - }, - "MATH - # output tokens": { - "description": "min=1, mean=1, max=1, sum=7 (7)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" - } - } - }, - { - "evaluation_name": "GSM8K", - "source_data": { - "dataset_name": "GSM8K", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on GSM8K", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.377, - "details": { - "description": "min=0.377, mean=0.377, max=0.377, sum=0.377 (1)", - "tab": "Accuracy", - "GSM8K - Observed inference time (s)": { - "description": "min=1.632, mean=1.632, max=1.632, sum=1.632 (1)", - "tab": "Efficiency", - "score": 1.6323128745555877 - }, - "GSM8K - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "GSM8K - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "GSM8K - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GSM8K - # prompt tokens": { - "description": "min=1187.268, mean=1187.268, max=1187.268, sum=1187.268 (1)", - "tab": "General information", - "score": 1187.268 - }, - "GSM8K - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "LegalBench", - "source_data": { - "dataset_name": "LegalBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on LegalBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.58, - "details": { - "description": "min=0.433, mean=0.58, max=0.789, sum=2.901 (5)", - "tab": "Accuracy", - "LegalBench - Observed inference time (s)": { - "description": "min=0.287, mean=0.353, max=0.577, sum=1.765 (5)", - "tab": "Efficiency", - "score": 0.35307050709631943 - }, - "LegalBench - # eval": { - "description": "min=95, mean=409.4, max=1000, sum=2047 (5)", - "tab": "General information", - "score": 409.4 - }, - "LegalBench - # train": { - "description": "min=1.969, mean=4.194, max=5, sum=20.969 (5)", - "tab": "General information", - "score": 4.1938775510204085 - }, - "LegalBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "LegalBench - # prompt tokens": { - "description": "min=219.453, mean=998.503, max=3534.259, sum=4992.513 (5)", - "tab": "General information", - "score": 998.5025315575822 - }, - "LegalBench - # output tokens": { - "description": "min=0.992, mean=0.998, max=1, sum=4.992 (5)", - "tab": "General information", - "score": 0.9983673469387755 - } - } - }, - 
"generation_config": { - "additional_details": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] - } - } - }, - { - "evaluation_name": "MedQA", - "source_data": { - "dataset_name": "MedQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MedQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.525, - "details": { - "description": "min=0.525, mean=0.525, max=0.525, sum=0.525 (1)", - "tab": "Accuracy", - "MedQA - Observed inference time (s)": { - "description": "min=0.348, mean=0.348, max=0.348, sum=0.348 (1)", - "tab": "Efficiency", - "score": 0.3478535307093596 - }, - "MedQA - # eval": { - "description": "min=503, mean=503, max=503, sum=503 (1)", - "tab": "General information", - "score": 503.0 - }, - "MedQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "MedQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MedQA - # prompt tokens": { - "description": "min=1193.093, mean=1193.093, max=1193.093, sum=1193.093 (1)", - "tab": "General information", - "score": 1193.0934393638172 - }, - "MedQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WMT 2014", - "source_data": { - "dataset_name": "WMT 2014", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.16, - "details": { - "description": "min=0.056, mean=0.16, max=0.201, sum=0.802 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time (s)": { - "description": "min=0.52, mean=0.561, max=0.701, sum=2.803 (5)", - "tab": "Efficiency", - "score": 0.5605853292576617 - }, - "WMT 2014 - # eval": { - "description": "min=503, mean=568.8, max=832, sum=2844 (5)", - "tab": "General information", - "score": 568.8 - }, - "WMT 2014 - # train": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "WMT 2014 - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "WMT 2014 - # prompt tokens": { - "description": "min=130.306, mean=144.433, max=163.018, sum=722.166 (5)", - "tab": "General information", - "score": 144.43317355482492 - }, - "WMT 2014 - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_lite/mistralai/mistral-large-2402/077fe37f-b3a4-483a-93a5-034c6445fe98.json b/data/helm_lite/mistralai/mistral-large-2402/077fe37f-b3a4-483a-93a5-034c6445fe98.json deleted file mode 100644 index 
d8d60cc37..000000000 --- a/data/helm_lite/mistralai/mistral-large-2402/077fe37f-b3a4-483a-93a5-034c6445fe98.json +++ /dev/null @@ -1,641 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_lite/mistralai_mistral-large-2402/1770834614.1822479", - "retrieved_timestamp": "1770834614.1822479", - "source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral Large 2402", - "id": "mistralai/mistral-large-2402", - "developer": "mistralai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_lite", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.328, - "details": { - "tab": "Accuracy", - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.23681647940074904 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.454, - "details": { - "description": "min=0.454, mean=0.454, max=0.454, sum=0.454 (1)", - "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": { - "description": "min=1.675, mean=1.675, max=1.675, sum=1.675 (1)", - "tab": "Efficiency", - "score": 1.6750120075655655 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=3955.33, mean=3955.33, max=3955.33, sum=3955.33 (1)", - "tab": "General information", - "score": 3955.3295774647886 - }, - "NarrativeQA - # output tokens": { - "description": "min=22.614, mean=22.614, max=22.614, sum=22.614 (1)", - "tab": "General information", - "score": 22.614084507042254 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (closed-book)", - "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.311, - "details": { - "description": "min=0.311, mean=0.311, max=0.311, sum=0.311 (1)", - "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time (s)": { - "description": "min=1.666, mean=1.666, max=1.666, sum=1.666 (1)", - "tab": "Efficiency", - "score": 1.665770656108856 - }, - "NaturalQuestions (closed-book) - Observed inference time (s)": { - "description": "min=2.122, mean=2.122, max=2.122, sum=2.122 (1)", - "tab": "Efficiency", - "score": 2.1218616259098053 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=2537.79, mean=2537.79, max=2537.79, sum=2537.79 (1)", - "tab": "General information", - "score": 2537.79 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=27.337, mean=27.337, max=27.337, sum=27.337 (1)", - "tab": "General information", - "score": 27.337 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=211.069, mean=211.069, max=211.069, sum=211.069 (1)", - "tab": "General information", - "score": 211.069 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=27.91, mean=27.91, max=27.91, sum=27.91 (1)", - "tab": "General information", - "score": 27.91 - } - } - }, - "generation_config": { - "additional_details": { - "mode": "closedbook" - } - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.894, - "details": { - "description": "min=0.894, mean=0.894, max=0.894, sum=0.894 (1)", - "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": { - "description": "min=0.569, mean=0.569, max=0.569, sum=0.569 (1)", - "tab": "Efficiency", - "score": 0.5687967395782471 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=280.15, mean=280.15, max=280.15, 
sum=280.15 (1)", - "tab": "General information", - "score": 280.15 - }, - "OpenbookQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.638, - "details": { - "description": "min=0.38, mean=0.638, max=0.92, sum=3.19 (5)", - "tab": "Accuracy", - "MMLU - Observed inference time (s)": { - "description": "min=1.226, mean=1.451, max=1.633, sum=7.257 (5)", - "tab": "Efficiency", - "score": 1.4514196366845515 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=402.44, mean=523.091, max=687.175, sum=2615.455 (5)", - "tab": "General information", - "score": 523.0910877192983 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MATH", - "source_data": { - "dataset_name": "MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.75, - "details": { - "description": "min=0.632, mean=0.75, max=0.904, sum=5.253 (7)", - "tab": "Accuracy", - "MATH - Observed inference time (s)": { - "description": "min=3.885, mean=5.128, max=5.812, sum=35.896 (7)", - "tab": "Efficiency", - "score": 5.128044104863146 - }, - "MATH - # eval": { - "description": "min=30, mean=62.429, max=135, sum=437 (7)", - "tab": "General information", - "score": 62.42857142857143 - }, - "MATH - # train": { - "description": "min=8, mean=8, max=8, sum=56 (7)", - "tab": "General information", - "score": 8.0 - }, - "MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "MATH - # prompt tokens": { - "description": "min=1061.615, mean=1525.266, max=2572.962, sum=10676.865 (7)", - "tab": "General information", - "score": 1525.2664139976257 - }, - "MATH - # output tokens": { - "description": "min=101.444, mean=128.216, max=154.897, sum=897.515 (7)", - "tab": "General information", - "score": 128.21647245723133 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - 
"algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" - } - } - }, - { - "evaluation_name": "GSM8K", - "source_data": { - "dataset_name": "GSM8K", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on GSM8K", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.694, - "details": { - "description": "min=0.694, mean=0.694, max=0.694, sum=0.694 (1)", - "tab": "Accuracy", - "GSM8K - Observed inference time (s)": { - "description": "min=7.095, mean=7.095, max=7.095, sum=7.095 (1)", - "tab": "Efficiency", - "score": 7.095049407720566 - }, - "GSM8K - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "GSM8K - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "GSM8K - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GSM8K - # prompt tokens": { - "description": "min=1255.268, mean=1255.268, max=1255.268, sum=1255.268 (1)", - "tab": "General information", - "score": 1255.268 - }, - "GSM8K - # output tokens": { - "description": "min=129.185, mean=129.185, max=129.185, sum=129.185 (1)", - "tab": "General information", - "score": 129.185 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "LegalBench", - "source_data": { - "dataset_name": "LegalBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on LegalBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.479, - "details": { - "description": "min=0.1, mean=0.479, max=0.821, sum=2.394 (5)", - "tab": "Accuracy", - "LegalBench - Observed inference time (s)": { - "description": "min=0.985, mean=1.692, max=2.787, sum=8.462 (5)", - "tab": "Efficiency", - "score": 1.6924799473534797 - }, - "LegalBench - # eval": { - "description": "min=50, mean=312.4, max=1000, sum=1562 (5)", - "tab": "General information", - "score": 312.4 - }, - "LegalBench - # train": { - "description": "min=4, mean=4.8, max=5, sum=24 (5)", - "tab": "General information", - "score": 4.8 - }, - "LegalBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "LegalBench - # prompt tokens": { - "description": "min=219.46, mean=1783.252, max=7251, sum=8916.261 (5)", - "tab": "General information", - "score": 1783.2521685070988 - }, - "LegalBench - # output tokens": { - "description": "min=1.005, mean=8.217, max=25.86, sum=41.087 (5)", - "tab": "General information", - "score": 8.217420478990393 - } - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] - } - } - }, - { - "evaluation_name": "MedQA", - "source_data": { - "dataset_name": "MedQA", - 
"source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MedQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.499, - "details": { - "description": "min=0.499, mean=0.499, max=0.499, sum=0.499 (1)", - "tab": "Accuracy", - "MedQA - Observed inference time (s)": { - "description": "min=0.595, mean=0.595, max=0.595, sum=0.595 (1)", - "tab": "Efficiency", - "score": 0.5950325303238856 - }, - "MedQA - # eval": { - "description": "min=503, mean=503, max=503, sum=503 (1)", - "tab": "General information", - "score": 503.0 - }, - "MedQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "MedQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MedQA - # prompt tokens": { - "description": "min=1193.093, mean=1193.093, max=1193.093, sum=1193.093 (1)", - "tab": "General information", - "score": 1193.0934393638172 - }, - "MedQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WMT 2014", - "source_data": { - "dataset_name": "WMT 2014", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.182, - "details": { - "description": "min=0.098, mean=0.182, max=0.224, sum=0.909 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time (s)": { - "description": "min=1.69, mean=1.969, max=2.702, sum=9.846 (5)", - "tab": "Efficiency", - "score": 1.969239294333439 - }, - "WMT 2014 - # eval": { - "description": "min=503, mean=568.8, max=832, sum=2844 (5)", - "tab": "General information", - "score": 568.8 - }, - "WMT 2014 - # train": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "WMT 2014 - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "WMT 2014 - # prompt tokens": { - "description": "min=198.306, mean=212.433, max=231.018, sum=1062.166 (5)", - "tab": "General information", - "score": 212.43317355482492 - }, - "WMT 2014 - # output tokens": { - "description": "min=27.272, mean=29.042, max=29.871, sum=145.211 (5)", - "tab": "General information", - "score": 29.04227089386756 - } - } - }, - "generation_config": { - "additional_details": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_lite/mistralai/mistral-large-2407/4fbb173c-b900-4e11-87bd-1ac6a489d014.json b/data/helm_lite/mistralai/mistral-large-2407/4fbb173c-b900-4e11-87bd-1ac6a489d014.json deleted file mode 100644 index d75c9932b..000000000 --- a/data/helm_lite/mistralai/mistral-large-2407/4fbb173c-b900-4e11-87bd-1ac6a489d014.json +++ /dev/null @@ -1,643 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"helm_lite/mistralai_mistral-large-2407/1770834614.1822479", - "retrieved_timestamp": "1770834614.1822479", - "source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral Large 2 2407", - "id": "mistralai/mistral-large-2407", - "developer": "mistralai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_lite", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.744, - "details": { - "tab": "Accuracy", - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.4191385767790262 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.779, - "details": { - "description": "min=0.779, mean=0.779, max=0.779, sum=0.779 (1)", - "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": { - "description": "min=0.728, mean=0.728, max=0.728, sum=0.728 (1)", - "tab": "Efficiency", - "score": 0.7276979574015443 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=3914.33, mean=3914.33, max=3914.33, sum=3914.33 (1)", - "tab": "General information", - "score": 3914.3295774647886 - }, - "NarrativeQA - # output tokens": { - "description": "min=6.2, mean=6.2, max=6.2, sum=6.2 (1)", - "tab": "General information", - "score": 6.2 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (closed-book)", - "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.453, - "details": { - "description": "min=0.453, mean=0.453, max=0.453, sum=0.453 (1)", - "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time 
(s)": { - "description": "min=0.757, mean=0.757, max=0.757, sum=0.757 (1)", - "tab": "Efficiency", - "score": 0.7573216142654419 - }, - "NaturalQuestions (closed-book) - Observed inference time (s)": { - "description": "min=0.527, mean=0.527, max=0.527, sum=0.527 (1)", - "tab": "Efficiency", - "score": 0.5273597676753998 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=2488.79, mean=2488.79, max=2488.79, sum=2488.79 (1)", - "tab": "General information", - "score": 2488.79 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=7.994, mean=7.994, max=7.994, sum=7.994 (1)", - "tab": "General information", - "score": 7.994 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=162.069, mean=162.069, max=162.069, sum=162.069 (1)", - "tab": "General information", - "score": 162.069 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=6.496, mean=6.496, max=6.496, sum=6.496 (1)", - "tab": "General information", - "score": 6.496 - } - } - }, - "generation_config": { - "additional_details": { - "mode": "closedbook" - } - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.932, - "details": { - "description": "min=0.932, mean=0.932, max=0.932, sum=0.932 (1)", - "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": { - "description": "min=0.891, mean=0.891, max=0.891, sum=0.891 (1)", - "tab": "Efficiency", - "score": 0.8910596170425416 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=290.15, mean=290.15, max=290.15, sum=290.15 (1)", - "tab": "General information", - "score": 290.15 - }, - "OpenbookQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - 
"generation_config": { - "additional_details": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.725, - "details": { - "description": "min=0.52, mean=0.725, max=0.9, sum=3.623 (5)", - "tab": "Accuracy", - "MMLU - Observed inference time (s)": { - "description": "min=0.684, mean=0.789, max=0.933, sum=3.943 (5)", - "tab": "Efficiency", - "score": 0.7886472435834114 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=412.44, mean=533.091, max=697.175, sum=2665.455 (5)", - "tab": "General information", - "score": 533.0910877192983 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MATH", - "source_data": { - "dataset_name": "MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.677, - "details": { - "description": "min=0.342, mean=0.677, max=0.881, sum=4.737 (7)", - "tab": "Accuracy", - "MATH - Observed inference time (s)": { - "description": "min=4.359, mean=5.441, max=6.464, sum=38.087 (7)", - "tab": "Efficiency", - "score": 5.441067432619708 - }, - "MATH - # eval": { - "description": "min=30, mean=62.429, max=135, sum=437 (7)", - "tab": "General information", - "score": 62.42857142857143 - }, - "MATH - # train": { - "description": "min=8, mean=8, max=8, sum=56 (7)", - "tab": "General information", - "score": 8.0 - }, - "MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "MATH - # prompt tokens": { - "description": "min=992.615, mean=1456.266, max=2503.962, sum=10193.865 (7)", - "tab": "General information", - "score": 1456.2664139976257 - }, - "MATH - # output tokens": { - "description": "min=129.395, mean=180.319, max=220.298, sum=1262.231 (7)", - "tab": "General information", - "score": 180.3187090913529 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" 
- } - } - }, - { - "evaluation_name": "GSM8K", - "source_data": { - "dataset_name": "GSM8K", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on GSM8K", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.912, - "details": { - "description": "min=0.912, mean=0.912, max=0.912, sum=0.912 (1)", - "tab": "Accuracy", - "GSM8K - Observed inference time (s)": { - "description": "min=5.431, mean=5.431, max=5.431, sum=5.431 (1)", - "tab": "Efficiency", - "score": 5.431343378543854 - }, - "GSM8K - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "GSM8K - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "GSM8K - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GSM8K - # prompt tokens": { - "description": "min=1187.268, mean=1187.268, max=1187.268, sum=1187.268 (1)", - "tab": "General information", - "score": 1187.268 - }, - "GSM8K - # output tokens": { - "description": "min=205.748, mean=205.748, max=205.748, sum=205.748 (1)", - "tab": "General information", - "score": 205.748 - } - } - }, - "generation_config": { - "additional_details": { - "stop": "none" - } - } - }, - { - "evaluation_name": "LegalBench", - "source_data": { - "dataset_name": "LegalBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on LegalBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.646, - "details": { - "description": "min=0.229, mean=0.646, max=1, sum=3.23 (5)", - "tab": "Accuracy", - "LegalBench - Observed inference time (s)": { - "description": "min=0.485, mean=0.797, max=0.986, sum=3.987 (5)", - "tab": "Efficiency", - "score": 0.7974768901406878 - }, - "LegalBench - # eval": { - "description": "min=95, mean=409.4, max=1000, sum=2047 (5)", - "tab": "General information", - "score": 409.4 - }, - "LegalBench - # train": { - "description": "min=4, mean=4.8, max=5, sum=24 (5)", - "tab": "General information", - "score": 4.8 - }, - "LegalBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "LegalBench - # prompt tokens": { - "description": "min=227.453, mean=1741.348, max=7215.488, sum=8706.741 (5)", - "tab": "General information", - "score": 1741.3482458432961 - }, - "LegalBench - # output tokens": { - "description": "min=1, mean=3.484, max=8.469, sum=17.42 (5)", - "tab": "General information", - "score": 3.484006654237774 - } - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] - } - } - }, - { - "evaluation_name": "MedQA", - "source_data": { - "dataset_name": "MedQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM 
on MedQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.775, - "details": { - "description": "min=0.775, mean=0.775, max=0.775, sum=0.775 (1)", - "tab": "Accuracy", - "MedQA - Observed inference time (s)": { - "description": "min=0.446, mean=0.446, max=0.446, sum=0.446 (1)", - "tab": "Efficiency", - "score": 0.4456319799480097 - }, - "MedQA - # eval": { - "description": "min=503, mean=503, max=503, sum=503 (1)", - "tab": "General information", - "score": 503.0 - }, - "MedQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "MedQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MedQA - # prompt tokens": { - "description": "min=1203.093, mean=1203.093, max=1203.093, sum=1203.093 (1)", - "tab": "General information", - "score": 1203.0934393638172 - }, - "MedQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WMT 2014", - "source_data": { - "dataset_name": "WMT 2014", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.192, - "details": { - "description": "min=0.14, mean=0.192, max=0.231, sum=0.962 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time (s)": { - "description": "min=1.075, mean=1.269, max=1.402, sum=6.343 (5)", - "tab": "Efficiency", - "score": 1.2686868536542282 - }, - "WMT 2014 - # eval": { - "description": "min=503, mean=568.8, max=832, sum=2844 (5)", - "tab": "General information", - "score": 568.8 - }, - "WMT 2014 - # train": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "WMT 2014 - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "WMT 2014 - # prompt tokens": { - "description": "min=139.306, mean=153.433, max=172.018, sum=767.166 (5)", - "tab": "General information", - "score": 153.43317355482492 - }, - "WMT 2014 - # output tokens": { - "description": "min=29.153, mean=30.306, max=33.358, sum=151.531 (5)", - "tab": "General information", - "score": 30.30625095580364 - } - } - }, - "generation_config": { - "additional_details": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_lite/mistralai/mistral-medium-2312/e56e8834-27d7-44e7-b5bb-907a4d7b6a58.json b/data/helm_lite/mistralai/mistral-medium-2312/e56e8834-27d7-44e7-b5bb-907a4d7b6a58.json deleted file mode 100644 index 6bb7115e2..000000000 --- a/data/helm_lite/mistralai/mistral-medium-2312/e56e8834-27d7-44e7-b5bb-907a4d7b6a58.json +++ /dev/null @@ -1,641 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_lite/mistralai_mistral-medium-2312/1770834614.1822479", - "retrieved_timestamp": "1770834614.1822479", - "source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": 
"third_party" - }, - "model_info": { - "name": "Mistral Medium 2312", - "id": "mistralai/mistral-medium-2312", - "developer": "mistralai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_lite", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.268, - "details": { - "tab": "Accuracy", - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.06677902621722846 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.449, - "details": { - "description": "min=0.449, mean=0.449, max=0.449, sum=0.449 (1)", - "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": { - "description": "min=3.898, mean=3.898, max=3.898, sum=3.898 (1)", - "tab": "Efficiency", - "score": 3.898151301666045 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=3955.33, mean=3955.33, max=3955.33, sum=3955.33 (1)", - "tab": "General information", - "score": 3955.3295774647886 - }, - "NarrativeQA - # output tokens": { - "description": "min=24.885, mean=24.885, max=24.885, sum=24.885 (1)", - "tab": "General information", - "score": 24.88450704225352 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (closed-book)", - "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.29, - "details": { - "description": "min=0.29, mean=0.29, max=0.29, sum=0.29 (1)", - "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time (s)": { - "description": "min=5.342, mean=5.342, max=5.342, sum=5.342 (1)", - "tab": "Efficiency", - "score": 5.342489146232605 - }, - "NaturalQuestions (closed-book) - Observed inference time (s)": { - "description": "min=6.588, mean=6.588, 
max=6.588, sum=6.588 (1)", - "tab": "Efficiency", - "score": 6.588117929935455 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=2537.79, mean=2537.79, max=2537.79, sum=2537.79 (1)", - "tab": "General information", - "score": 2537.79 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=32.377, mean=32.377, max=32.377, sum=32.377 (1)", - "tab": "General information", - "score": 32.377 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=211.069, mean=211.069, max=211.069, sum=211.069 (1)", - "tab": "General information", - "score": 211.069 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=34.263, mean=34.263, max=34.263, sum=34.263 (1)", - "tab": "General information", - "score": 34.263 - } - } - }, - "generation_config": { - "additional_details": { - "mode": "closedbook" - } - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.83, - "details": { - "description": "min=0.83, mean=0.83, max=0.83, sum=0.83 (1)", - "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": { - "description": "min=2.12, mean=2.12, max=2.12, sum=2.12 (1)", - "tab": "Efficiency", - "score": 2.1195812821388245 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=280.15, mean=280.15, max=280.15, sum=280.15 (1)", - "tab": "General information", - "score": 280.15 - }, - "OpenbookQA - # output tokens": { - "description": "min=0.968, mean=0.968, max=0.968, sum=0.968 (1)", - "tab": "General information", - "score": 0.968 - } - } - }, - "generation_config": { - "additional_details": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - 
"url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.618, - "details": { - "description": "min=0.32, mean=0.618, max=0.91, sum=3.089 (5)", - "tab": "Accuracy", - "MMLU - Observed inference time (s)": { - "description": "min=1.507, mean=2.775, max=3.62, sum=13.874 (5)", - "tab": "Efficiency", - "score": 2.774717758923246 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=402.44, mean=523.091, max=687.175, sum=2615.455 (5)", - "tab": "General information", - "score": 523.0910877192983 - }, - "MMLU - # output tokens": { - "description": "min=0.93, mean=0.97, max=0.991, sum=4.851 (5)", - "tab": "General information", - "score": 0.9702456140350877 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MATH", - "source_data": { - "dataset_name": "MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.565, - "details": { - "description": "min=0.4, mean=0.565, max=0.756, sum=3.958 (7)", - "tab": "Accuracy", - "MATH - Observed inference time (s)": { - "description": "min=6.1, mean=7.086, max=10.207, sum=49.602 (7)", - "tab": "Efficiency", - "score": 7.0860357509079535 - }, - "MATH - # eval": { - "description": "min=30, mean=62.429, max=135, sum=437 (7)", - "tab": "General information", - "score": 62.42857142857143 - }, - "MATH - # train": { - "description": "min=8, mean=8, max=8, sum=56 (7)", - "tab": "General information", - "score": 8.0 - }, - "MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "MATH - # prompt tokens": { - "description": "min=1061.615, mean=1525.266, max=2572.962, sum=10676.865 (7)", - "tab": "General information", - "score": 1525.2664139976257 - }, - "MATH - # output tokens": { - "description": "min=80, mean=113.328, max=132.25, sum=793.295 (7)", - "tab": "General information", - "score": 113.3278270462481 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" - } - } - }, - { - "evaluation_name": "GSM8K", - "source_data": { - "dataset_name": "GSM8K", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on GSM8K", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.706, - "details": { - "description": "min=0.706, mean=0.706, max=0.706, sum=0.706 (1)", - "tab": "Accuracy", - "GSM8K - Observed inference time (s)": { - "description": "min=9.719, mean=9.719, max=9.719, sum=9.719 (1)", - "tab": "Efficiency", - "score": 9.718977437496186 - }, - "GSM8K - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "GSM8K - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "GSM8K - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GSM8K - # prompt tokens": { - "description": "min=1255.268, mean=1255.268, max=1255.268, sum=1255.268 (1)", - "tab": "General information", - "score": 1255.268 - }, - "GSM8K - # output tokens": { - "description": "min=137.554, mean=137.554, max=137.554, sum=137.554 (1)", - "tab": "General information", - "score": 137.554 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "LegalBench", - "source_data": { - "dataset_name": "LegalBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on LegalBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.452, - "details": { - "description": "min=0.066, mean=0.452, max=0.692, sum=2.258 (5)", - "tab": "Accuracy", - "LegalBench - Observed inference time (s)": { - "description": "min=2.695, mean=3.248, max=3.795, sum=16.242 (5)", - "tab": "Efficiency", - "score": 3.248400288401771 - }, - "LegalBench - # eval": { - "description": "min=95, mean=409.4, max=1000, sum=2047 (5)", - "tab": "General information", - "score": 409.4 - }, - "LegalBench - # train": { - "description": "min=4, mean=4.8, max=5, sum=24 (5)", - "tab": "General information", - "score": 4.8 - }, - "LegalBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "LegalBench - # prompt tokens": { - "description": "min=287.453, mean=1801.748, max=7275.488, sum=9008.741 (5)", - "tab": "General information", - "score": 1801.7482458432964 - }, - "LegalBench - # output tokens": { - "description": "min=1.008, mean=8.476, max=25.305, sum=42.382 (5)", - "tab": "General information", - "score": 8.47642872361909 - } - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] - } - } - }, - { - "evaluation_name": "MedQA", - "source_data": { - "dataset_name": "MedQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MedQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.61, - "details": { - "description": "min=0.61, mean=0.61, max=0.61, sum=0.61 (1)", - "tab": "Accuracy", - "MedQA - Observed inference time (s)": { - "description": "min=2.813, mean=2.813, max=2.813, sum=2.813 (1)", - "tab": "Efficiency", - "score": 2.813041030531138 - }, - "MedQA - # eval": { - "description": "min=503, mean=503, max=503, sum=503 (1)", - "tab": "General information", - "score": 503.0 - }, - "MedQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "MedQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MedQA - # prompt tokens": { - "description": "min=1193.093, mean=1193.093, max=1193.093, sum=1193.093 (1)", - "tab": "General information", - "score": 1193.0934393638172 - }, - "MedQA - # output tokens": { - "description": "min=0.95, mean=0.95, max=0.95, sum=0.95 (1)", - "tab": "General information", - "score": 0.9502982107355865 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WMT 2014", - "source_data": { - "dataset_name": "WMT 2014", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.169, - "details": { - "description": "min=0.07, mean=0.169, max=0.22, sum=0.844 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time (s)": { - "description": "min=3.982, mean=4.948, max=6.067, sum=24.741 (5)", - "tab": "Efficiency", - "score": 4.9482336292575715 - }, - "WMT 2014 - # eval": { - "description": "min=503, mean=568.8, max=832, sum=2844 (5)", - "tab": "General information", - "score": 568.8 - }, - "WMT 2014 - # train": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "WMT 2014 - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "WMT 2014 - # prompt tokens": { - "description": "min=198.306, mean=212.433, max=231.018, sum=1062.166 (5)", - "tab": "General information", - "score": 212.43317355482492 - }, - "WMT 2014 - # output tokens": { - "description": "min=26.33, mean=27.816, max=30.692, sum=139.08 (5)", - "tab": "General information", - "score": 27.81599632971402 - } - } - }, - "generation_config": { - "additional_details": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_lite/mistralai/mistral-small-2402/0925f9b7-08f8-485f-84bc-a153a54aa417.json b/data/helm_lite/mistralai/mistral-small-2402/0925f9b7-08f8-485f-84bc-a153a54aa417.json deleted file mode 100644 index 1f2cb2632..000000000 --- a/data/helm_lite/mistralai/mistral-small-2402/0925f9b7-08f8-485f-84bc-a153a54aa417.json +++ /dev/null @@ -1,641 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_lite/mistralai_mistral-small-2402/1770834614.1822479", - "retrieved_timestamp": "1770834614.1822479", - "source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral Small 2402", - "id": "mistralai/mistral-small-2402", - 
"developer": "mistralai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_lite", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.288, - "details": { - "tab": "Accuracy", - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.39283395755305867 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.519, - "details": { - "description": "min=0.519, mean=0.519, max=0.519, sum=0.519 (1)", - "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": { - "description": "min=0.948, mean=0.948, max=0.948, sum=0.948 (1)", - "tab": "Efficiency", - "score": 0.947719474577568 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=3955.33, mean=3955.33, max=3955.33, sum=3955.33 (1)", - "tab": "General information", - "score": 3955.3295774647886 - }, - "NarrativeQA - # output tokens": { - "description": "min=21.775, mean=21.775, max=21.775, sum=21.775 (1)", - "tab": "General information", - "score": 21.774647887323944 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (closed-book)", - "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.304, - "details": { - "description": "min=0.304, mean=0.304, max=0.304, sum=0.304 (1)", - "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time (s)": { - "description": "min=1.384, mean=1.384, max=1.384, sum=1.384 (1)", - "tab": "Efficiency", - "score": 1.384453837633133 - }, - "NaturalQuestions (closed-book) - Observed inference time (s)": { - "description": "min=1.442, mean=1.442, max=1.442, sum=1.442 (1)", - "tab": "Efficiency", - "score": 1.4422871778011321 - }, - "NaturalQuestions 
(open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=2537.79, mean=2537.79, max=2537.79, sum=2537.79 (1)", - "tab": "General information", - "score": 2537.79 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=21.017, mean=21.017, max=21.017, sum=21.017 (1)", - "tab": "General information", - "score": 21.017 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=211.069, mean=211.069, max=211.069, sum=211.069 (1)", - "tab": "General information", - "score": 211.069 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=27.473, mean=27.473, max=27.473, sum=27.473 (1)", - "tab": "General information", - "score": 27.473 - } - } - }, - "generation_config": { - "additional_details": { - "mode": "closedbook" - } - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.862, - "details": { - "description": "min=0.862, mean=0.862, max=0.862, sum=0.862 (1)", - "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": { - "description": "min=0.53, mean=0.53, max=0.53, sum=0.53 (1)", - "tab": "Efficiency", - "score": 0.5299914984703064 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=280.15, mean=280.15, max=280.15, sum=280.15 (1)", - "tab": "General information", - "score": 280.15 - }, - "OpenbookQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.593, - "details": { - "description": "min=0.26, mean=0.593, max=0.89, sum=2.964 (5)", - "tab": "Accuracy", - "MMLU - Observed inference time (s)": { - "description": "min=1.022, mean=1.262, max=1.477, sum=6.308 (5)", - "tab": "Efficiency", - "score": 1.2616501861371492 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=402.44, mean=523.091, max=687.175, sum=2615.455 (5)", - "tab": "General information", - "score": 523.0910877192983 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MATH", - "source_data": { - "dataset_name": "MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.621, - "details": { - "description": "min=0.367, mean=0.621, max=0.859, sum=4.344 (7)", - "tab": "Accuracy", - "MATH - Observed inference time (s)": { - "description": "min=1.895, mean=2.217, max=2.662, sum=15.518 (7)", - "tab": "Efficiency", - "score": 2.216904607788028 - }, - "MATH - # eval": { - "description": "min=30, mean=62.429, max=135, sum=437 (7)", - "tab": "General information", - "score": 62.42857142857143 - }, - "MATH - # train": { - "description": "min=8, mean=8, max=8, sum=56 (7)", - "tab": "General information", - "score": 8.0 - }, - "MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "MATH - # prompt tokens": { - "description": "min=1061.615, mean=1525.266, max=2572.962, sum=10676.865 (7)", - "tab": "General information", - "score": 1525.2664139976257 - }, - "MATH - # output tokens": { - "description": "min=104.221, mean=125.526, max=154.904, sum=878.68 (7)", - "tab": "General information", - "score": 125.52572529016837 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" - } - } - }, - { - "evaluation_name": "GSM8K", - "source_data": { - "dataset_name": "GSM8K", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on GSM8K", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.734, - "details": { - "description": "min=0.734, mean=0.734, max=0.734, sum=0.734 (1)", - "tab": "Accuracy", - "GSM8K - Observed inference time (s)": { - "description": "min=2.972, mean=2.972, max=2.972, sum=2.972 (1)", - "tab": "Efficiency", - "score": 2.9720949590206147 - }, - "GSM8K - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "GSM8K - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "GSM8K - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GSM8K - # prompt tokens": { - "description": "min=1255.268, mean=1255.268, max=1255.268, sum=1255.268 (1)", - "tab": "General information", - "score": 1255.268 - }, - "GSM8K - # output tokens": { - "description": "min=148.06, mean=148.06, max=148.06, sum=148.06 (1)", - "tab": "General information", - "score": 148.06 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "LegalBench", - "source_data": { - "dataset_name": "LegalBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on LegalBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.389, - "details": { - "description": "min=0, mean=0.389, max=0.789, sum=1.947 (5)", - "tab": "Accuracy", - "LegalBench - Observed inference time (s)": { - "description": "min=0.609, mean=0.874, max=1.067, sum=4.369 (5)", - "tab": "Efficiency", - "score": 0.8738773620338431 - }, - "LegalBench - # eval": { - "description": "min=50, mean=312.4, max=1000, sum=1562 (5)", - "tab": "General information", - "score": 312.4 - }, - "LegalBench - # train": { - "description": "min=4, mean=4.8, max=5, sum=24 (5)", - "tab": "General information", - "score": 4.8 - }, - "LegalBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "LegalBench - # prompt tokens": { - "description": "min=219.46, mean=1783.252, max=7251, sum=8916.261 (5)", - "tab": "General information", - "score": 1783.2521685070988 - }, - "LegalBench - # output tokens": { - "description": "min=1.716, mean=12.778, max=30, sum=63.891 (5)", - "tab": "General information", - "score": 12.778290319804961 - } - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] - } - } - }, - { - "evaluation_name": "MedQA", - "source_data": { - "dataset_name": "MedQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MedQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.616, - 
"details": { - "description": "min=0.616, mean=0.616, max=0.616, sum=0.616 (1)", - "tab": "Accuracy", - "MedQA - Observed inference time (s)": { - "description": "min=0.499, mean=0.499, max=0.499, sum=0.499 (1)", - "tab": "Efficiency", - "score": 0.4987720272413068 - }, - "MedQA - # eval": { - "description": "min=503, mean=503, max=503, sum=503 (1)", - "tab": "General information", - "score": 503.0 - }, - "MedQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "MedQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MedQA - # prompt tokens": { - "description": "min=1193.093, mean=1193.093, max=1193.093, sum=1193.093 (1)", - "tab": "General information", - "score": 1193.0934393638172 - }, - "MedQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WMT 2014", - "source_data": { - "dataset_name": "WMT 2014", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.169, - "details": { - "description": "min=0.076, mean=0.169, max=0.215, sum=0.843 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time (s)": { - "description": "min=0.945, mean=1.189, max=1.429, sum=5.943 (5)", - "tab": "Efficiency", - "score": 1.1885517670659458 - }, - "WMT 2014 - # eval": { - "description": "min=503, mean=568.8, max=832, sum=2844 (5)", - "tab": "General information", - "score": 568.8 - }, - "WMT 2014 - # train": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "WMT 2014 - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "WMT 2014 - # prompt tokens": { - "description": "min=198.306, mean=212.433, max=231.018, sum=1062.166 (5)", - "tab": "General information", - "score": 212.43317355482492 - }, - "WMT 2014 - # output tokens": { - "description": "min=26.479, mean=28.3, max=29.024, sum=141.498 (5)", - "tab": "General information", - "score": 28.29957084416578 - } - } - }, - "generation_config": { - "additional_details": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_lite/mistralai/mixtral-8x22b/08082277-8305-4007-97cd-88202fc0115c.json b/data/helm_lite/mistralai/mixtral-8x22b/08082277-8305-4007-97cd-88202fc0115c.json deleted file mode 100644 index e6bfd0332..000000000 --- a/data/helm_lite/mistralai/mixtral-8x22b/08082277-8305-4007-97cd-88202fc0115c.json +++ /dev/null @@ -1,641 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_lite/mistralai_mixtral-8x22b/1770834614.1822479", - "retrieved_timestamp": "1770834614.1822479", - "source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mixtral 8x22B", - "id": "mistralai/mixtral-8x22b", - "developer": "mistralai", - "inference_platform": "unknown" - }, - 
"evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_lite", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.705, - "details": { - "tab": "Accuracy", - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.5349563046192259 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.779, - "details": { - "description": "min=0.779, mean=0.779, max=0.779, sum=0.779 (1)", - "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": { - "description": "min=1.478, mean=1.478, max=1.478, sum=1.478 (1)", - "tab": "Efficiency", - "score": 1.477503587158633 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=3886.33, mean=3886.33, max=3886.33, sum=3886.33 (1)", - "tab": "General information", - "score": 3886.3295774647886 - }, - "NarrativeQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (closed-book)", - "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.478, - "details": { - "description": "min=0.478, mean=0.478, max=0.478, sum=0.478 (1)", - "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time (s)": { - "description": "min=1.004, mean=1.004, max=1.004, sum=1.004 (1)", - "tab": "Efficiency", - "score": 1.003950766324997 - }, - "NaturalQuestions (closed-book) - Observed inference time (s)": { - "description": "min=0.442, mean=0.442, max=0.442, sum=0.442 (1)", - "tab": "Efficiency", - "score": 0.44196626234054565 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": 
"General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=2468.79, mean=2468.79, max=2468.79, sum=2468.79 (1)", - "tab": "General information", - "score": 2468.79 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=142.069, mean=142.069, max=142.069, sum=142.069 (1)", - "tab": "General information", - "score": 142.069 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "mode": "closedbook" - } - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.882, - "details": { - "description": "min=0.882, mean=0.882, max=0.882, sum=0.882 (1)", - "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": { - "description": "min=0.338, mean=0.338, max=0.338, sum=0.338 (1)", - "tab": "Efficiency", - "score": 0.33846320056915286 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=280.15, mean=280.15, max=280.15, sum=280.15 (1)", - "tab": "General information", - "score": 280.15 - }, - "OpenbookQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", 
- "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.701, - "details": { - "description": "min=0.48, mean=0.701, max=0.95, sum=3.507 (5)", - "tab": "Accuracy", - "MMLU - Observed inference time (s)": { - "description": "min=0.313, mean=0.344, max=0.359, sum=1.722 (5)", - "tab": "Efficiency", - "score": 0.344487278235586 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=402.44, mean=523.091, max=687.175, sum=2615.455 (5)", - "tab": "General information", - "score": 523.0910877192983 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MATH", - "source_data": { - "dataset_name": "MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.656, - "details": { - "description": "min=0.5, mean=0.656, max=0.822, sum=4.589 (7)", - "tab": "Accuracy", - "MATH - Observed inference time (s)": { - "description": "min=2.009, mean=2.509, max=3.121, sum=17.565 (7)", - "tab": "Efficiency", - "score": 2.5093491334109825 - }, - "MATH - # eval": { - "description": "min=30, mean=62.429, max=135, sum=437 (7)", - "tab": "General information", - "score": 62.42857142857143 - }, - "MATH - # train": { - "description": "min=8, mean=8, max=8, sum=56 (7)", - "tab": "General information", - "score": 8.0 - }, - "MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "MATH - # prompt tokens": { - "description": "min=991.615, mean=1455.266, max=2502.962, sum=10186.865 (7)", - "tab": "General information", - "score": 1455.2664139976257 - }, - "MATH - # output tokens": { - "description": "min=1, mean=1, max=1, sum=7 (7)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" - } - } - }, - { - "evaluation_name": "GSM8K", - "source_data": { - "dataset_name": "GSM8K", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on GSM8K", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8, - "details": { - "description": "min=0.8, mean=0.8, max=0.8, sum=0.8 
(1)", - "tab": "Accuracy", - "GSM8K - Observed inference time (s)": { - "description": "min=3.539, mean=3.539, max=3.539, sum=3.539 (1)", - "tab": "Efficiency", - "score": 3.5390553929805755 - }, - "GSM8K - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "GSM8K - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "GSM8K - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GSM8K - # prompt tokens": { - "description": "min=1187.268, mean=1187.268, max=1187.268, sum=1187.268 (1)", - "tab": "General information", - "score": 1187.268 - }, - "GSM8K - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "LegalBench", - "source_data": { - "dataset_name": "LegalBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on LegalBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.708, - "details": { - "description": "min=0.441, mean=0.708, max=0.968, sum=3.539 (5)", - "tab": "Accuracy", - "LegalBench - Observed inference time (s)": { - "description": "min=0.372, mean=0.821, max=1.973, sum=4.107 (5)", - "tab": "Efficiency", - "score": 0.8213642223004287 - }, - "LegalBench - # eval": { - "description": "min=95, mean=409.4, max=1000, sum=2047 (5)", - "tab": "General information", - "score": 409.4 - }, - "LegalBench - # train": { - "description": "min=4, mean=4.8, max=5, sum=24 (5)", - "tab": "General information", - "score": 4.8 - }, - "LegalBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "LegalBench - # prompt tokens": { - "description": "min=219.453, mean=1733.148, max=7207.488, sum=8665.741 (5)", - "tab": "General information", - "score": 1733.148245843296 - }, - "LegalBench - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] - } - } - }, - { - "evaluation_name": "MedQA", - "source_data": { - "dataset_name": "MedQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MedQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.704, - "details": { - "description": "min=0.704, mean=0.704, max=0.704, sum=0.704 (1)", - "tab": "Accuracy", - "MedQA - Observed inference time (s)": { - "description": "min=0.463, mean=0.463, max=0.463, sum=0.463 (1)", - "tab": "Efficiency", - "score": 0.46328771849038825 - }, - "MedQA - # eval": { - "description": "min=503, mean=503, max=503, sum=503 (1)", - "tab": "General information", - "score": 503.0 - }, - "MedQA - # train": { - "description": 
"min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "MedQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MedQA - # prompt tokens": { - "description": "min=1193.093, mean=1193.093, max=1193.093, sum=1193.093 (1)", - "tab": "General information", - "score": 1193.0934393638172 - }, - "MedQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WMT 2014", - "source_data": { - "dataset_name": "WMT 2014", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.209, - "details": { - "description": "min=0.133, mean=0.209, max=0.243, sum=1.045 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time (s)": { - "description": "min=0.928, mean=0.963, max=0.982, sum=4.813 (5)", - "tab": "Efficiency", - "score": 0.9626315307056144 - }, - "WMT 2014 - # eval": { - "description": "min=503, mean=568.8, max=832, sum=2844 (5)", - "tab": "General information", - "score": 568.8 - }, - "WMT 2014 - # train": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "WMT 2014 - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "WMT 2014 - # prompt tokens": { - "description": "min=130.306, mean=144.433, max=163.018, sum=722.166 (5)", - "tab": "General information", - "score": 144.43317355482492 - }, - "WMT 2014 - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_lite/mistralai/mixtral-8x7b-32kseqlen/fe554cbd-2480-40bd-b2f5-464cad700c14.json b/data/helm_lite/mistralai/mixtral-8x7b-32kseqlen/fe554cbd-2480-40bd-b2f5-464cad700c14.json deleted file mode 100644 index 7bf0323b1..000000000 --- a/data/helm_lite/mistralai/mixtral-8x7b-32kseqlen/fe554cbd-2480-40bd-b2f5-464cad700c14.json +++ /dev/null @@ -1,641 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_lite/mistralai_mixtral-8x7b-32kseqlen/1770834614.1822479", - "retrieved_timestamp": "1770834614.1822479", - "source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mixtral 8x7B 32K seqlen", - "id": "mistralai/mixtral-8x7b-32kseqlen", - "developer": "mistralai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_lite", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.51, - "details": { - "tab": "Accuracy", - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.6727715355805244 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.767, - "details": { - "description": "min=0.767, mean=0.767, max=0.767, sum=0.767 (1)", - "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": { - "description": "min=0.65, mean=0.65, max=0.65, sum=0.65 (1)", - "tab": "Efficiency", - "score": 0.649569604766201 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=4.575, mean=4.575, max=4.575, sum=4.575 (1)", - "tab": "General information", - "score": 4.574647887323944 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=3627.715, mean=3627.715, max=3627.715, sum=3627.715 (1)", - "tab": "General information", - "score": 3627.7154929577464 - }, - "NarrativeQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (closed-book)", - "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.427, - "details": { - "description": "min=0.427, mean=0.427, max=0.427, sum=0.427 (1)", - "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time (s)": { - "description": "min=0.507, mean=0.507, max=0.507, sum=0.507 (1)", - "tab": "Efficiency", - "score": 0.507013471364975 - }, - "NaturalQuestions (closed-book) - Observed inference time (s)": { - "description": "min=0.513, mean=0.513, max=0.513, sum=0.513 (1)", - "tab": "Efficiency", - "score": 0.5133386459350586 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.832, mean=4.832, max=4.832, sum=4.832 (1)", - "tab": "General information", - "score": 4.832 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.026, mean=0.026, max=0.026, sum=0.026 (1)", - "tab": "General information", - "score": 0.026 - }, - "NaturalQuestions 
(open-book) - # prompt tokens": { - "description": "min=2268.728, mean=2268.728, max=2268.728, sum=2268.728 (1)", - "tab": "General information", - "score": 2268.728 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=0.991, mean=0.991, max=0.991, sum=0.991 (1)", - "tab": "General information", - "score": 0.991 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=142.069, mean=142.069, max=142.069, sum=142.069 (1)", - "tab": "General information", - "score": 142.069 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=0.999, mean=0.999, max=0.999, sum=0.999 (1)", - "tab": "General information", - "score": 0.999 - } - } - }, - "generation_config": { - "additional_details": { - "mode": "closedbook" - } - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.868, - "details": { - "description": "min=0.868, mean=0.868, max=0.868, sum=0.868 (1)", - "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": { - "description": "min=0.354, mean=0.354, max=0.354, sum=0.354 (1)", - "tab": "Efficiency", - "score": 0.3542211503982544 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=280.15, mean=280.15, max=280.15, sum=280.15 (1)", - "tab": "General information", - "score": 280.15 - }, - "OpenbookQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.649, - "details": { - "description": "min=0.38, mean=0.649, max=0.93, sum=3.245 (5)", - "tab": "Accuracy", - "MMLU - Observed inference time (s)": { - "description": "min=0.355, mean=0.36, max=0.366, sum=1.802 (5)", - "tab": "Efficiency", - "score": 
0.3604579553102192 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=402.44, mean=523.091, max=687.175, sum=2615.455 (5)", - "tab": "General information", - "score": 523.0910877192983 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MATH", - "source_data": { - "dataset_name": "MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.494, - "details": { - "description": "min=0.289, mean=0.494, max=0.696, sum=3.459 (7)", - "tab": "Accuracy", - "MATH - Observed inference time (s)": { - "description": "min=1.128, mean=1.528, max=2.033, sum=10.695 (7)", - "tab": "Efficiency", - "score": 1.527861329055259 - }, - "MATH - # eval": { - "description": "min=30, mean=62.429, max=135, sum=437 (7)", - "tab": "General information", - "score": 62.42857142857143 - }, - "MATH - # train": { - "description": "min=8, mean=8, max=8, sum=56 (7)", - "tab": "General information", - "score": 8.0 - }, - "MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "MATH - # prompt tokens": { - "description": "min=991.615, mean=1455.266, max=2502.962, sum=10186.865 (7)", - "tab": "General information", - "score": 1455.2664139976257 - }, - "MATH - # output tokens": { - "description": "min=1, mean=1, max=1, sum=7 (7)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" - } - } - }, - { - "evaluation_name": "GSM8K", - "source_data": { - "dataset_name": "GSM8K", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on GSM8K", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.622, - "details": { - "description": "min=0.622, mean=0.622, max=0.622, sum=0.622 (1)", - "tab": "Accuracy", - "GSM8K - Observed inference time (s)": { - "description": "min=3.273, mean=3.273, max=3.273, sum=3.273 (1)", - "tab": "Efficiency", - "score": 3.2728567245006563 - }, - "GSM8K - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - 
"score": 1000.0 - }, - "GSM8K - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "GSM8K - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GSM8K - # prompt tokens": { - "description": "min=1187.268, mean=1187.268, max=1187.268, sum=1187.268 (1)", - "tab": "General information", - "score": 1187.268 - }, - "GSM8K - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "LegalBench", - "source_data": { - "dataset_name": "LegalBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on LegalBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.63, - "details": { - "description": "min=0.428, mean=0.63, max=0.853, sum=3.15 (5)", - "tab": "Accuracy", - "LegalBench - Observed inference time (s)": { - "description": "min=0.369, mean=0.41, max=0.512, sum=2.05 (5)", - "tab": "Efficiency", - "score": 0.40995627823211056 - }, - "LegalBench - # eval": { - "description": "min=95, mean=409.4, max=1000, sum=2047 (5)", - "tab": "General information", - "score": 409.4 - }, - "LegalBench - # train": { - "description": "min=1.969, mean=4.194, max=5, sum=20.969 (5)", - "tab": "General information", - "score": 4.1938775510204085 - }, - "LegalBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "LegalBench - # prompt tokens": { - "description": "min=219.453, mean=998.503, max=3534.259, sum=4992.513 (5)", - "tab": "General information", - "score": 998.5025315575822 - }, - "LegalBench - # output tokens": { - "description": "min=0.998, mean=1.0, max=1, sum=4.998 (5)", - "tab": "General information", - "score": 0.9995918367346939 - } - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] - } - } - }, - { - "evaluation_name": "MedQA", - "source_data": { - "dataset_name": "MedQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MedQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.652, - "details": { - "description": "min=0.652, mean=0.652, max=0.652, sum=0.652 (1)", - "tab": "Accuracy", - "MedQA - Observed inference time (s)": { - "description": "min=0.353, mean=0.353, max=0.353, sum=0.353 (1)", - "tab": "Efficiency", - "score": 0.35297762423338996 - }, - "MedQA - # eval": { - "description": "min=503, mean=503, max=503, sum=503 (1)", - "tab": "General information", - "score": 503.0 - }, - "MedQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "MedQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MedQA - # prompt tokens": { - "description": 
"min=1193.093, mean=1193.093, max=1193.093, sum=1193.093 (1)", - "tab": "General information", - "score": 1193.0934393638172 - }, - "MedQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WMT 2014", - "source_data": { - "dataset_name": "WMT 2014", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.19, - "details": { - "description": "min=0.099, mean=0.19, max=0.23, sum=0.949 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time (s)": { - "description": "min=1.115, mean=1.202, max=1.294, sum=6.011 (5)", - "tab": "Efficiency", - "score": 1.2021687407719377 - }, - "WMT 2014 - # eval": { - "description": "min=503, mean=568.8, max=832, sum=2844 (5)", - "tab": "General information", - "score": 568.8 - }, - "WMT 2014 - # train": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "WMT 2014 - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "WMT 2014 - # prompt tokens": { - "description": "min=130.306, mean=144.433, max=163.018, sum=722.166 (5)", - "tab": "General information", - "score": 144.43317355482492 - }, - "WMT 2014 - # output tokens": { - "description": "min=0.994, mean=0.999, max=1, sum=4.994 (5)", - "tab": "General information", - "score": 0.998798076923077 - } - } - }, - "generation_config": { - "additional_details": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_lite/mistralai/open-mistral-nemo-2407/9d048af8-b1cb-49cb-b8ab-ab0948deacd7.json b/data/helm_lite/mistralai/open-mistral-nemo-2407/9d048af8-b1cb-49cb-b8ab-ab0948deacd7.json deleted file mode 100644 index 7fee5cb57..000000000 --- a/data/helm_lite/mistralai/open-mistral-nemo-2407/9d048af8-b1cb-49cb-b8ab-ab0948deacd7.json +++ /dev/null @@ -1,643 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_lite/mistralai_open-mistral-nemo-2407/1770834614.1822479", - "retrieved_timestamp": "1770834614.1822479", - "source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral NeMo 2402", - "id": "mistralai/open-mistral-nemo-2407", - "developer": "mistralai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_lite", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.333, - "details": { - "tab": "Accuracy", - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.5309862671660425 - 
}, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.731, - "details": { - "description": "min=0.731, mean=0.731, max=0.731, sum=0.731 (1)", - "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": { - "description": "min=0.711, mean=0.711, max=0.711, sum=0.711 (1)", - "tab": "Efficiency", - "score": 0.7111437549053783 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=3527.392, mean=3527.392, max=3527.392, sum=3527.392 (1)", - "tab": "General information", - "score": 3527.3915492957744 - }, - "NarrativeQA - # output tokens": { - "description": "min=6.901, mean=6.901, max=6.901, sum=6.901 (1)", - "tab": "General information", - "score": 6.901408450704225 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (closed-book)", - "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.265, - "details": { - "description": "min=0.265, mean=0.265, max=0.265, sum=0.265 (1)", - "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time (s)": { - "description": "min=0.852, mean=0.852, max=0.852, sum=0.852 (1)", - "tab": "Efficiency", - "score": 0.851971923828125 - }, - "NaturalQuestions (closed-book) - Observed inference time (s)": { - "description": "min=0.877, mean=0.877, max=0.877, sum=0.877 (1)", - "tab": "Efficiency", - "score": 0.8765462198257447 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=2032.962, mean=2032.962, max=2032.962, sum=2032.962 (1)", - "tab": "General information", - "score": 2032.962 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=5.927, mean=5.927, max=5.927, sum=5.927 
(1)", - "tab": "General information", - "score": 5.927 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=137.405, mean=137.405, max=137.405, sum=137.405 (1)", - "tab": "General information", - "score": 137.405 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=3.595, mean=3.595, max=3.595, sum=3.595 (1)", - "tab": "General information", - "score": 3.595 - } - } - }, - "generation_config": { - "additional_details": { - "mode": "closedbook" - } - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.822, - "details": { - "description": "min=0.822, mean=0.822, max=0.822, sum=0.822 (1)", - "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": { - "description": "min=0.799, mean=0.799, max=0.799, sum=0.799 (1)", - "tab": "Efficiency", - "score": 0.7987758111953736 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=248.246, mean=248.246, max=248.246, sum=248.246 (1)", - "tab": "General information", - "score": 248.246 - }, - "OpenbookQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.604, - "details": { - "description": "min=0.29, mean=0.604, max=0.89, sum=3.021 (5)", - "tab": "Accuracy", - "MMLU - Observed inference time (s)": { - "description": "min=0.635, mean=0.782, max=1.011, sum=3.908 (5)", - "tab": "Efficiency", - "score": 0.7815720957371226 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 
5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=377.89, mean=479.924, max=631.851, sum=2399.621 (5)", - "tab": "General information", - "score": 479.9241754385965 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MATH", - "source_data": { - "dataset_name": "MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.668, - "details": { - "description": "min=0.558, mean=0.668, max=0.852, sum=4.679 (7)", - "tab": "Accuracy", - "MATH - Observed inference time (s)": { - "description": "min=0.866, mean=1.013, max=1.281, sum=7.093 (7)", - "tab": "Efficiency", - "score": 1.0132869822173503 - }, - "MATH - # eval": { - "description": "min=30, mean=62.429, max=135, sum=437 (7)", - "tab": "General information", - "score": 62.42857142857143 - }, - "MATH - # train": { - "description": "min=8, mean=8, max=8, sum=56 (7)", - "tab": "General information", - "score": 8.0 - }, - "MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "MATH - # prompt tokens": { - "description": "min=915.846, mean=1317.28, max=2238.885, sum=9220.959 (7)", - "tab": "General information", - "score": 1317.2798769434019 - }, - "MATH - # output tokens": { - "description": "min=97.456, mean=111.745, max=141.433, sum=782.217 (7)", - "tab": "General information", - "score": 111.74533800213115 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" - } - } - }, - { - "evaluation_name": "GSM8K", - "source_data": { - "dataset_name": "GSM8K", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on GSM8K", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.782, - "details": { - "description": "min=0.782, mean=0.782, max=0.782, sum=0.782 (1)", - "tab": "Accuracy", - "GSM8K - Observed inference time (s)": { - "description": "min=1.425, mean=1.425, max=1.425, sum=1.425 (1)", - "tab": "Efficiency", - "score": 1.4254731934070588 - }, - "GSM8K - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "GSM8K - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "GSM8K - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General 
information", - "score": 0.0 - }, - "GSM8K - # prompt tokens": { - "description": "min=1134.356, mean=1134.356, max=1134.356, sum=1134.356 (1)", - "tab": "General information", - "score": 1134.356 - }, - "GSM8K - # output tokens": { - "description": "min=187.859, mean=187.859, max=187.859, sum=187.859 (1)", - "tab": "General information", - "score": 187.859 - } - } - }, - "generation_config": { - "additional_details": { - "stop": "none" - } - } - }, - { - "evaluation_name": "LegalBench", - "source_data": { - "dataset_name": "LegalBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on LegalBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.415, - "details": { - "description": "min=0.232, mean=0.415, max=0.758, sum=2.076 (5)", - "tab": "Accuracy", - "LegalBench - Observed inference time (s)": { - "description": "min=0.715, mean=0.78, max=0.868, sum=3.898 (5)", - "tab": "Efficiency", - "score": 0.7795765090728288 - }, - "LegalBench - # eval": { - "description": "min=95, mean=409.4, max=1000, sum=2047 (5)", - "tab": "General information", - "score": 409.4 - }, - "LegalBench - # train": { - "description": "min=4, mean=4.8, max=5, sum=24 (5)", - "tab": "General information", - "score": 4.8 - }, - "LegalBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "LegalBench - # prompt tokens": { - "description": "min=200.716, mean=1561.36, max=6486.116, sum=7806.8 (5)", - "tab": "General information", - "score": 1561.3600575619662 - }, - "LegalBench - # output tokens": { - "description": "min=4.94, mean=8.473, max=15.796, sum=42.365 (5)", - "tab": "General information", - "score": 8.473099835809844 - } - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] - } - } - }, - { - "evaluation_name": "MedQA", - "source_data": { - "dataset_name": "MedQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MedQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.59, - "details": { - "description": "min=0.59, mean=0.59, max=0.59, sum=0.59 (1)", - "tab": "Accuracy", - "MedQA - Observed inference time (s)": { - "description": "min=0.749, mean=0.749, max=0.749, sum=0.749 (1)", - "tab": "Efficiency", - "score": 0.7488490715178533 - }, - "MedQA - # eval": { - "description": "min=503, mean=503, max=503, sum=503 (1)", - "tab": "General information", - "score": 503.0 - }, - "MedQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "MedQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MedQA - # prompt tokens": { - "description": "min=1022.543, mean=1022.543, max=1022.543, sum=1022.543 (1)", - "tab": "General information", - "score": 1022.5427435387674 - }, - "MedQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": 
"General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WMT 2014", - "source_data": { - "dataset_name": "WMT 2014", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.177, - "details": { - "description": "min=0.111, mean=0.177, max=0.211, sum=0.887 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time (s)": { - "description": "min=0.752, mean=0.782, max=0.819, sum=3.911 (5)", - "tab": "Efficiency", - "score": 0.7821908106898373 - }, - "WMT 2014 - # eval": { - "description": "min=503, mean=568.8, max=832, sum=2844 (5)", - "tab": "General information", - "score": 568.8 - }, - "WMT 2014 - # train": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "WMT 2014 - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "WMT 2014 - # prompt tokens": { - "description": "min=81.661, mean=110.163, max=135.306, sum=550.814 (5)", - "tab": "General information", - "score": 110.16282784064842 - }, - "WMT 2014 - # output tokens": { - "description": "min=24.622, mean=26.542, max=27.26, sum=132.709 (5)", - "tab": "General information", - "score": 26.541759538920324 - } - } - }, - "generation_config": { - "additional_details": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_lite/openai/gpt-3.5-turbo-0613/d9654997-1d3e-41c3-9f16-05a36dde9b02.json b/data/helm_lite/openai/gpt-3.5-turbo-0613/d9654997-1d3e-41c3-9f16-05a36dde9b02.json deleted file mode 100644 index 878d33981..000000000 --- a/data/helm_lite/openai/gpt-3.5-turbo-0613/d9654997-1d3e-41c3-9f16-05a36dde9b02.json +++ /dev/null @@ -1,641 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_lite/openai_gpt-3.5-turbo-0613/1770834614.1822479", - "retrieved_timestamp": "1770834614.1822479", - "source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "GPT-3.5 Turbo 0613", - "id": "openai/gpt-3.5-turbo-0613", - "developer": "openai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_lite", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.358, - "details": { - "tab": "Accuracy", - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.956641697877653 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { 
- "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.655, - "details": { - "description": "min=0.655, mean=0.655, max=0.655, sum=0.655 (1)", - "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": { - "description": "min=0.381, mean=0.381, max=0.381, sum=0.381 (1)", - "tab": "Efficiency", - "score": 0.3810261323418416 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=4.946, mean=4.946, max=4.946, sum=4.946 (1)", - "tab": "General information", - "score": 4.946478873239436 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=3493.662, mean=3493.662, max=3493.662, sum=3493.662 (1)", - "tab": "General information", - "score": 3493.6619718309857 - }, - "NarrativeQA - # output tokens": { - "description": "min=9.91, mean=9.91, max=9.91, sum=9.91 (1)", - "tab": "General information", - "score": 9.909859154929578 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (closed-book)", - "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.335, - "details": { - "description": "min=0.335, mean=0.335, max=0.335, sum=0.335 (1)", - "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time (s)": { - "description": "min=0.305, mean=0.305, max=0.305, sum=0.305 (1)", - "tab": "Efficiency", - "score": 0.30532183837890625 - }, - "NaturalQuestions (closed-book) - Observed inference time (s)": { - "description": "min=0.221, mean=0.221, max=0.221, sum=0.221 (1)", - "tab": "Efficiency", - "score": 0.22069251775741577 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.884, mean=4.884, max=4.884, sum=4.884 (1)", - "tab": "General information", - "score": 4.884 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.019, mean=0.019, max=0.019, sum=0.019 (1)", - "tab": "General information", - "score": 0.019 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1649.552, mean=1649.552, max=1649.552, sum=1649.552 (1)", - "tab": "General information", - "score": 1649.552 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=9.389, mean=9.389, max=9.389, sum=9.389 (1)", - "tab": "General information", - "score": 9.389 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": 
"General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=173.127, mean=173.127, max=173.127, sum=173.127 (1)", - "tab": "General information", - "score": 173.127 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=5.576, mean=5.576, max=5.576, sum=5.576 (1)", - "tab": "General information", - "score": 5.576 - } - } - }, - "generation_config": { - "additional_details": { - "mode": "closedbook" - } - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.838, - "details": { - "description": "min=0.838, mean=0.838, max=0.838, sum=0.838 (1)", - "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": { - "description": "min=0.172, mean=0.172, max=0.172, sum=0.172 (1)", - "tab": "Efficiency", - "score": 0.17227248001098633 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=242.782, mean=242.782, max=242.782, sum=242.782 (1)", - "tab": "General information", - "score": 242.782 - }, - "OpenbookQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.614, - "details": { - "description": "min=0.38, mean=0.614, max=0.88, sum=3.07 (5)", - "tab": "Accuracy", - "MMLU - Observed inference time (s)": { - "description": "min=0.171, mean=0.175, max=0.177, sum=0.875 (5)", - "tab": "Efficiency", - "score": 0.1750619323630082 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - 
"description": "min=366.44, mean=460.72, max=607.43, sum=2303.6 (5)", - "tab": "General information", - "score": 460.71996491228066 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MATH", - "source_data": { - "dataset_name": "MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.667, - "details": { - "description": "min=0.533, mean=0.667, max=0.826, sum=4.667 (7)", - "tab": "Accuracy", - "MATH - Observed inference time (s)": { - "description": "min=0.741, mean=0.813, max=0.963, sum=5.69 (7)", - "tab": "Efficiency", - "score": 0.8128212395123947 - }, - "MATH - # eval": { - "description": "min=30, mean=62.429, max=135, sum=437 (7)", - "tab": "General information", - "score": 62.42857142857143 - }, - "MATH - # train": { - "description": "min=8, mean=8, max=8, sum=56 (7)", - "tab": "General information", - "score": 8.0 - }, - "MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "MATH - # prompt tokens": { - "description": "min=942.363, mean=1323.911, max=2258.577, sum=9267.376 (7)", - "tab": "General information", - "score": 1323.910874184069 - }, - "MATH - # output tokens": { - "description": "min=53.5, mean=60.844, max=77.4, sum=425.908 (7)", - "tab": "General information", - "score": 60.844003793024605 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" - } - } - }, - { - "evaluation_name": "GSM8K", - "source_data": { - "dataset_name": "GSM8K", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on GSM8K", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.501, - "details": { - "description": "min=0.501, mean=0.501, max=0.501, sum=0.501 (1)", - "tab": "Accuracy", - "GSM8K - Observed inference time (s)": { - "description": "min=0.898, mean=0.898, max=0.898, sum=0.898 (1)", - "tab": "Efficiency", - "score": 0.8983073465824127 - }, - "GSM8K - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "GSM8K - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "GSM8K - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GSM8K - # prompt tokens": { - "description": "min=1020.035, mean=1020.035, max=1020.035, sum=1020.035 (1)", - "tab": "General information", - 
"score": 1020.035 - }, - "GSM8K - # output tokens": { - "description": "min=77.29, mean=77.29, max=77.29, sum=77.29 (1)", - "tab": "General information", - "score": 77.29 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "LegalBench", - "source_data": { - "dataset_name": "LegalBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on LegalBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.528, - "details": { - "description": "min=0.302, mean=0.528, max=0.747, sum=2.642 (5)", - "tab": "Accuracy", - "LegalBench - Observed inference time (s)": { - "description": "min=0.178, mean=0.202, max=0.277, sum=1.011 (5)", - "tab": "Efficiency", - "score": 0.20213919553681423 - }, - "LegalBench - # eval": { - "description": "min=95, mean=409.4, max=1000, sum=2047 (5)", - "tab": "General information", - "score": 409.4 - }, - "LegalBench - # train": { - "description": "min=2.09, mean=4.218, max=5, sum=21.09 (5)", - "tab": "General information", - "score": 4.21795918367347 - }, - "LegalBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "LegalBench - # prompt tokens": { - "description": "min=253.442, mean=949.517, max=3254.159, sum=4747.586 (5)", - "tab": "General information", - "score": 949.5172570702738 - }, - "LegalBench - # output tokens": { - "description": "min=1, mean=1.387, max=2.032, sum=6.934 (5)", - "tab": "General information", - "score": 1.3868394951957552 - } - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] - } - } - }, - { - "evaluation_name": "MedQA", - "source_data": { - "dataset_name": "MedQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MedQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.622, - "details": { - "description": "min=0.622, mean=0.622, max=0.622, sum=0.622 (1)", - "tab": "Accuracy", - "MedQA - Observed inference time (s)": { - "description": "min=0.194, mean=0.194, max=0.194, sum=0.194 (1)", - "tab": "Efficiency", - "score": 0.19374941736755977 - }, - "MedQA - # eval": { - "description": "min=503, mean=503, max=503, sum=503 (1)", - "tab": "General information", - "score": 503.0 - }, - "MedQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "MedQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MedQA - # prompt tokens": { - "description": "min=1020.414, mean=1020.414, max=1020.414, sum=1020.414 (1)", - "tab": "General information", - "score": 1020.4135188866799 - }, - "MedQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WMT 2014", - "source_data": { - "dataset_name": 
"WMT 2014", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.187, - "details": { - "description": "min=0.1, mean=0.187, max=0.23, sum=0.937 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time (s)": { - "description": "min=0.367, mean=0.394, max=0.409, sum=1.968 (5)", - "tab": "Efficiency", - "score": 0.39351808213963385 - }, - "WMT 2014 - # eval": { - "description": "min=503, mean=568.8, max=832, sum=2844 (5)", - "tab": "General information", - "score": 568.8 - }, - "WMT 2014 - # train": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "WMT 2014 - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "WMT 2014 - # prompt tokens": { - "description": "min=169.901, mean=193.043, max=213.185, sum=965.213 (5)", - "tab": "General information", - "score": 193.04258583116683 - }, - "WMT 2014 - # output tokens": { - "description": "min=21.983, mean=25.038, max=26.352, sum=125.192 (5)", - "tab": "General information", - "score": 25.038384118366725 - } - } - }, - "generation_config": { - "additional_details": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_lite/openai/gpt-4-0613/73d6b1fe-3f58-4640-b24b-e12b9ea1aca3.json b/data/helm_lite/openai/gpt-4-0613/73d6b1fe-3f58-4640-b24b-e12b9ea1aca3.json deleted file mode 100644 index 7ff111f74..000000000 --- a/data/helm_lite/openai/gpt-4-0613/73d6b1fe-3f58-4640-b24b-e12b9ea1aca3.json +++ /dev/null @@ -1,641 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_lite/openai_gpt-4-0613/1770834614.1822479", - "retrieved_timestamp": "1770834614.1822479", - "source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "GPT-4 0613", - "id": "openai/gpt-4-0613", - "developer": "openai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_lite", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.867, - "details": { - "tab": "Accuracy", - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.5158801498127341 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - 
"evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.768, - "details": { - "description": "min=0.768, mean=0.768, max=0.768, sum=0.768 (1)", - "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": { - "description": "min=0.976, mean=0.976, max=0.976, sum=0.976 (1)", - "tab": "Efficiency", - "score": 0.9758186582108619 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=3522.67, mean=3522.67, max=3522.67, sum=3522.67 (1)", - "tab": "General information", - "score": 3522.6704225352114 - }, - "NarrativeQA - # output tokens": { - "description": "min=8.515, mean=8.515, max=8.515, sum=8.515 (1)", - "tab": "General information", - "score": 8.51549295774648 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (closed-book)", - "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.457, - "details": { - "description": "min=0.457, mean=0.457, max=0.457, sum=0.457 (1)", - "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time (s)": { - "description": "min=0.908, mean=0.908, max=0.908, sum=0.908 (1)", - "tab": "Efficiency", - "score": 0.9083020164966583 - }, - "NaturalQuestions (closed-book) - Observed inference time (s)": { - "description": "min=0.512, mean=0.512, max=0.512, sum=0.512 (1)", - "tab": "Efficiency", - "score": 0.5116857671737671 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.964, mean=4.964, max=4.964, sum=4.964 (1)", - "tab": "General information", - "score": 4.964 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.007, mean=0.007, max=0.007, sum=0.007 (1)", - "tab": "General information", - "score": 0.007 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1717.847, mean=1717.847, max=1717.847, sum=1717.847 (1)", - "tab": "General information", - "score": 1717.847 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=8.055, mean=8.055, max=8.055, sum=8.055 (1)", - "tab": "General information", - "score": 8.055 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - 
"description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=173.127, mean=173.127, max=173.127, sum=173.127 (1)", - "tab": "General information", - "score": 173.127 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=3.832, mean=3.832, max=3.832, sum=3.832 (1)", - "tab": "General information", - "score": 3.832 - } - } - }, - "generation_config": { - "additional_details": { - "mode": "closedbook" - } - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.96, - "details": { - "description": "min=0.96, mean=0.96, max=0.96, sum=0.96 (1)", - "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": { - "description": "min=0.401, mean=0.401, max=0.401, sum=0.401 (1)", - "tab": "Efficiency", - "score": 0.40061268854141235 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=242.782, mean=242.782, max=242.782, sum=242.782 (1)", - "tab": "General information", - "score": 242.782 - }, - "OpenbookQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.735, - "details": { - "description": "min=0.55, mean=0.735, max=0.95, sum=3.674 (5)", - "tab": "Accuracy", - "MMLU - Observed inference time (s)": { - "description": "min=0.364, mean=0.391, max=0.434, sum=1.954 (5)", - "tab": "Efficiency", - "score": 0.39080846048656265 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=366.44, mean=460.72, max=607.43, sum=2303.6 (5)", - "tab": "General information", - "score": 460.71996491228066 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 
- } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MATH", - "source_data": { - "dataset_name": "MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.802, - "details": { - "description": "min=0.673, mean=0.802, max=0.948, sum=5.617 (7)", - "tab": "Accuracy", - "MATH - Observed inference time (s)": { - "description": "min=2.95, mean=3.472, max=4.247, sum=24.303 (7)", - "tab": "Efficiency", - "score": 3.4718795228507955 - }, - "MATH - # eval": { - "description": "min=30, mean=62.429, max=135, sum=437 (7)", - "tab": "General information", - "score": 62.42857142857143 - }, - "MATH - # train": { - "description": "min=8, mean=8, max=8, sum=56 (7)", - "tab": "General information", - "score": 8.0 - }, - "MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "MATH - # prompt tokens": { - "description": "min=942.363, mean=1323.911, max=2258.577, sum=9267.376 (7)", - "tab": "General information", - "score": 1323.910874184069 - }, - "MATH - # output tokens": { - "description": "min=59.674, mean=73.257, max=81.1, sum=512.799 (7)", - "tab": "General information", - "score": 73.25695858608955 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" - } - } - }, - { - "evaluation_name": "GSM8K", - "source_data": { - "dataset_name": "GSM8K", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on GSM8K", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.932, - "details": { - "description": "min=0.932, mean=0.932, max=0.932, sum=0.932 (1)", - "tab": "Accuracy", - "GSM8K - Observed inference time (s)": { - "description": "min=4.948, mean=4.948, max=4.948, sum=4.948 (1)", - "tab": "Efficiency", - "score": 4.947624314308166 - }, - "GSM8K - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "GSM8K - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "GSM8K - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GSM8K - # prompt tokens": { - "description": "min=1020.035, mean=1020.035, max=1020.035, sum=1020.035 (1)", - "tab": "General information", - "score": 1020.035 - }, - "GSM8K - # output tokens": { - "description": "min=111.209, mean=111.209, max=111.209, sum=111.209 (1)", - "tab": "General information", - "score": 111.209 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - 
"evaluation_name": "LegalBench", - "source_data": { - "dataset_name": "LegalBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on LegalBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.713, - "details": { - "description": "min=0.452, mean=0.713, max=0.905, sum=3.564 (5)", - "tab": "Accuracy", - "LegalBench - Observed inference time (s)": { - "description": "min=0.46, mean=0.558, max=0.886, sum=2.791 (5)", - "tab": "Efficiency", - "score": 0.5582764348578453 - }, - "LegalBench - # eval": { - "description": "min=95, mean=409.4, max=1000, sum=2047 (5)", - "tab": "General information", - "score": 409.4 - }, - "LegalBench - # train": { - "description": "min=4, mean=4.798, max=5, sum=23.992 (5)", - "tab": "General information", - "score": 4.798367346938775 - }, - "LegalBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "LegalBench - # prompt tokens": { - "description": "min=253.442, mean=1568.687, max=6350.008, sum=7843.435 (5)", - "tab": "General information", - "score": 1568.6870529886412 - }, - "LegalBench - # output tokens": { - "description": "min=1, mean=1.34, max=2.063, sum=6.698 (5)", - "tab": "General information", - "score": 1.3396070557866055 - } - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] - } - } - }, - { - "evaluation_name": "MedQA", - "source_data": { - "dataset_name": "MedQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MedQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.815, - "details": { - "description": "min=0.815, mean=0.815, max=0.815, sum=0.815 (1)", - "tab": "Accuracy", - "MedQA - Observed inference time (s)": { - "description": "min=0.414, mean=0.414, max=0.414, sum=0.414 (1)", - "tab": "Efficiency", - "score": 0.4136932588239787 - }, - "MedQA - # eval": { - "description": "min=503, mean=503, max=503, sum=503 (1)", - "tab": "General information", - "score": 503.0 - }, - "MedQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "MedQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MedQA - # prompt tokens": { - "description": "min=1020.414, mean=1020.414, max=1020.414, sum=1020.414 (1)", - "tab": "General information", - "score": 1020.4135188866799 - }, - "MedQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WMT 2014", - "source_data": { - "dataset_name": "WMT 2014", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.211, - "details": { - "description": "min=0.149, mean=0.211, max=0.256, sum=1.053 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time (s)": { - "description": "min=1.448, mean=1.58, max=1.724, sum=7.899 (5)", - "tab": "Efficiency", - "score": 1.5797039644192494 - }, - "WMT 2014 - # eval": { - "description": "min=503, mean=568.8, max=832, sum=2844 (5)", - "tab": "General information", - "score": 568.8 - }, - "WMT 2014 - # train": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "WMT 2014 - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "WMT 2014 - # prompt tokens": { - "description": "min=169.901, mean=193.043, max=213.185, sum=965.213 (5)", - "tab": "General information", - "score": 193.04258583116683 - }, - "WMT 2014 - # output tokens": { - "description": "min=23.767, mean=25.424, max=26.121, sum=127.122 (5)", - "tab": "General information", - "score": 25.424382072946933 - } - } - }, - "generation_config": { - "additional_details": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_lite/openai/gpt-4-1106-preview/4d01d929-b5e2-42dc-89ee-20560f560db5.json b/data/helm_lite/openai/gpt-4-1106-preview/4d01d929-b5e2-42dc-89ee-20560f560db5.json deleted file mode 100644 index 060ab8fb5..000000000 --- a/data/helm_lite/openai/gpt-4-1106-preview/4d01d929-b5e2-42dc-89ee-20560f560db5.json +++ /dev/null @@ -1,641 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_lite/openai_gpt-4-1106-preview/1770834614.1822479", - "retrieved_timestamp": "1770834614.1822479", - "source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "GPT-4 Turbo 1106 preview", - "id": "openai/gpt-4-1106-preview", - "developer": "openai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_lite", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.698, - "details": { - "tab": "Accuracy", - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.3935580524344569 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.727, - "details": { - 
"description": "min=0.727, mean=0.727, max=0.727, sum=0.727 (1)", - "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": { - "description": "min=1.068, mean=1.068, max=1.068, sum=1.068 (1)", - "tab": "Efficiency", - "score": 1.068114177945634 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=3522.67, mean=3522.67, max=3522.67, sum=3522.67 (1)", - "tab": "General information", - "score": 3522.6704225352114 - }, - "NarrativeQA - # output tokens": { - "description": "min=9.885, mean=9.885, max=9.885, sum=9.885 (1)", - "tab": "General information", - "score": 9.88450704225352 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (closed-book)", - "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.435, - "details": { - "description": "min=0.435, mean=0.435, max=0.435, sum=0.435 (1)", - "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time (s)": { - "description": "min=0.867, mean=0.867, max=0.867, sum=0.867 (1)", - "tab": "Efficiency", - "score": 0.8667134034633637 - }, - "NaturalQuestions (closed-book) - Observed inference time (s)": { - "description": "min=1.131, mean=1.131, max=1.131, sum=1.131 (1)", - "tab": "Efficiency", - "score": 1.1312835423946381 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1762.593, mean=1762.593, max=1762.593, sum=1762.593 (1)", - "tab": "General information", - "score": 1762.593 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=8.753, mean=8.753, max=8.753, sum=8.753 (1)", - "tab": "General information", - "score": 8.753 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=173.127, mean=173.127, max=173.127, sum=173.127 (1)", - "tab": 
"General information", - "score": 173.127 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=14.157, mean=14.157, max=14.157, sum=14.157 (1)", - "tab": "General information", - "score": 14.157 - } - } - }, - "generation_config": { - "additional_details": { - "mode": "closedbook" - } - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.95, - "details": { - "description": "min=0.95, mean=0.95, max=0.95, sum=0.95 (1)", - "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": { - "description": "min=0.512, mean=0.512, max=0.512, sum=0.512 (1)", - "tab": "Efficiency", - "score": 0.5122070140838623 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=242.782, mean=242.782, max=242.782, sum=242.782 (1)", - "tab": "General information", - "score": 242.782 - }, - "OpenbookQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.699, - "details": { - "description": "min=0.47, mean=0.699, max=0.96, sum=3.495 (5)", - "tab": "Accuracy", - "MMLU - Observed inference time (s)": { - "description": "min=0.397, mean=0.447, max=0.515, sum=2.236 (5)", - "tab": "Efficiency", - "score": 0.4471675806380155 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=366.44, mean=460.72, max=607.43, sum=2303.6 (5)", - "tab": "General information", - "score": 460.71996491228066 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" - } - } - }, 
- { - "evaluation_name": "MATH", - "source_data": { - "dataset_name": "MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.857, - "details": { - "description": "min=0.711, mean=0.857, max=0.97, sum=5.998 (7)", - "tab": "Accuracy", - "MATH - Observed inference time (s)": { - "description": "min=10.989, mean=12.704, max=15.09, sum=88.928 (7)", - "tab": "Efficiency", - "score": 12.704059314714486 - }, - "MATH - # eval": { - "description": "min=30, mean=62.429, max=135, sum=437 (7)", - "tab": "General information", - "score": 62.42857142857143 - }, - "MATH - # train": { - "description": "min=8, mean=8, max=8, sum=56 (7)", - "tab": "General information", - "score": 8.0 - }, - "MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "MATH - # prompt tokens": { - "description": "min=942.363, mean=1323.911, max=2258.577, sum=9267.376 (7)", - "tab": "General information", - "score": 1323.910874184069 - }, - "MATH - # output tokens": { - "description": "min=122.465, mean=161.876, max=186.673, sum=1133.133 (7)", - "tab": "General information", - "score": 161.87607288445722 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" - } - } - }, - { - "evaluation_name": "GSM8K", - "source_data": { - "dataset_name": "GSM8K", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on GSM8K", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.668, - "details": { - "description": "min=0.668, mean=0.668, max=0.668, sum=0.668 (1)", - "tab": "Accuracy", - "GSM8K - Observed inference time (s)": { - "description": "min=5.738, mean=5.738, max=5.738, sum=5.738 (1)", - "tab": "Efficiency", - "score": 5.738402992963791 - }, - "GSM8K - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "GSM8K - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "GSM8K - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GSM8K - # prompt tokens": { - "description": "min=1020.035, mean=1020.035, max=1020.035, sum=1020.035 (1)", - "tab": "General information", - "score": 1020.035 - }, - "GSM8K - # output tokens": { - "description": "min=98.073, mean=98.073, max=98.073, sum=98.073 (1)", - "tab": "General information", - "score": 98.073 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "LegalBench", - "source_data": { - "dataset_name": "LegalBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] 
- }, - "metric_config": { - "evaluation_description": "EM on LegalBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.626, - "details": { - "description": "min=0.368, mean=0.626, max=0.989, sum=3.13 (5)", - "tab": "Accuracy", - "LegalBench - Observed inference time (s)": { - "description": "min=0.445, mean=0.603, max=0.98, sum=3.017 (5)", - "tab": "Efficiency", - "score": 0.6033123332286346 - }, - "LegalBench - # eval": { - "description": "min=95, mean=409.4, max=1000, sum=2047 (5)", - "tab": "General information", - "score": 409.4 - }, - "LegalBench - # train": { - "description": "min=4, mean=4.8, max=5, sum=24 (5)", - "tab": "General information", - "score": 4.8 - }, - "LegalBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "LegalBench - # prompt tokens": { - "description": "min=253.442, mean=1570.163, max=6357.388, sum=7850.815 (5)", - "tab": "General information", - "score": 1570.162971355988 - }, - "LegalBench - # output tokens": { - "description": "min=1, mean=1.458, max=2.695, sum=7.291 (5)", - "tab": "General information", - "score": 1.458208948802524 - } - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] - } - } - }, - { - "evaluation_name": "MedQA", - "source_data": { - "dataset_name": "MedQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MedQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.817, - "details": { - "description": "min=0.817, mean=0.817, max=0.817, sum=0.817 (1)", - "tab": "Accuracy", - "MedQA - Observed inference time (s)": { - "description": "min=0.392, mean=0.392, max=0.392, sum=0.392 (1)", - "tab": "Efficiency", - "score": 0.3924491192190121 - }, - "MedQA - # eval": { - "description": "min=503, mean=503, max=503, sum=503 (1)", - "tab": "General information", - "score": 503.0 - }, - "MedQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "MedQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MedQA - # prompt tokens": { - "description": "min=1020.414, mean=1020.414, max=1020.414, sum=1020.414 (1)", - "tab": "General information", - "score": 1020.4135188866799 - }, - "MedQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WMT 2014", - "source_data": { - "dataset_name": "WMT 2014", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.205, - "details": { - "description": "min=0.156, mean=0.205, max=0.241, sum=1.023 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed 
inference time (s)": { - "description": "min=1.797, mean=2.1, max=2.349, sum=10.502 (5)", - "tab": "Efficiency", - "score": 2.1004491326059744 - }, - "WMT 2014 - # eval": { - "description": "min=503, mean=568.8, max=832, sum=2844 (5)", - "tab": "General information", - "score": 568.8 - }, - "WMT 2014 - # train": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "WMT 2014 - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "WMT 2014 - # prompt tokens": { - "description": "min=169.901, mean=193.043, max=213.185, sum=965.213 (5)", - "tab": "General information", - "score": 193.04258583116683 - }, - "WMT 2014 - # output tokens": { - "description": "min=26.229, mean=26.996, max=28.59, sum=134.98 (5)", - "tab": "General information", - "score": 26.995945480960394 - } - } - }, - "generation_config": { - "additional_details": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_lite/openai/gpt-4-turbo-2024-04-09/76c78ade-2ad6-4c85-93c9-65c4b6b249b7.json b/data/helm_lite/openai/gpt-4-turbo-2024-04-09/76c78ade-2ad6-4c85-93c9-65c4b6b249b7.json deleted file mode 100644 index dae83b652..000000000 --- a/data/helm_lite/openai/gpt-4-turbo-2024-04-09/76c78ade-2ad6-4c85-93c9-65c4b6b249b7.json +++ /dev/null @@ -1,643 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_lite/openai_gpt-4-turbo-2024-04-09/1770834614.1822479", - "retrieved_timestamp": "1770834614.1822479", - "source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "GPT-4 Turbo 2024-04-09", - "id": "openai/gpt-4-turbo-2024-04-09", - "developer": "openai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_lite", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.864, - "details": { - "tab": "Accuracy", - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.4568414481897628 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.761, - "details": { - "description": "min=0.761, mean=0.761, max=0.761, sum=0.761 (1)", - "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": { - "description": "min=0.804, mean=0.804, max=0.804, sum=0.804 (1)", - "tab": "Efficiency", - "score": 
0.8043310716118611 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=3495.67, mean=3495.67, max=3495.67, sum=3495.67 (1)", - "tab": "General information", - "score": 3495.6704225352114 - }, - "NarrativeQA - # output tokens": { - "description": "min=6.037, mean=6.037, max=6.037, sum=6.037 (1)", - "tab": "General information", - "score": 6.0366197183098596 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (closed-book)", - "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.482, - "details": { - "description": "min=0.482, mean=0.482, max=0.482, sum=0.482 (1)", - "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time (s)": { - "description": "min=0.712, mean=0.712, max=0.712, sum=0.712 (1)", - "tab": "Efficiency", - "score": 0.7120162718296051 - }, - "NaturalQuestions (closed-book) - Observed inference time (s)": { - "description": "min=0.605, mean=0.605, max=0.605, sum=0.605 (1)", - "tab": "Efficiency", - "score": 0.6052222681045533 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1728.593, mean=1728.593, max=1728.593, sum=1728.593 (1)", - "tab": "General information", - "score": 1728.593 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=5.902, mean=5.902, max=5.902, sum=5.902 (1)", - "tab": "General information", - "score": 5.902 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=139.127, mean=139.127, max=139.127, sum=139.127 (1)", - "tab": "General information", - "score": 139.127 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=5.263, mean=5.263, max=5.263, sum=5.263 (1)", - "tab": "General information", - "score": 5.263 - } - } - }, - 
"generation_config": { - "additional_details": { - "mode": "closedbook" - } - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.97, - "details": { - "description": "min=0.97, mean=0.97, max=0.97, sum=0.97 (1)", - "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": { - "description": "min=0.438, mean=0.438, max=0.438, sum=0.438 (1)", - "tab": "Efficiency", - "score": 0.4376141686439514 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=249.782, mean=249.782, max=249.782, sum=249.782 (1)", - "tab": "General information", - "score": 249.782 - }, - "OpenbookQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.711, - "details": { - "description": "min=0.53, mean=0.711, max=0.96, sum=3.555 (5)", - "tab": "Accuracy", - "MMLU - Observed inference time (s)": { - "description": "min=0.53, mean=0.55, max=0.572, sum=2.749 (5)", - "tab": "Efficiency", - "score": 0.5498773384847139 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=373.44, mean=467.72, max=614.43, sum=2338.6 (5)", - "tab": "General information", - "score": 467.71996491228066 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MATH", - "source_data": { - "dataset_name": "MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - 
"metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.833, - "details": { - "description": "min=0.684, mean=0.833, max=0.97, sum=5.83 (7)", - "tab": "Accuracy", - "MATH - Observed inference time (s)": { - "description": "min=4.92, mean=6.678, max=8.338, sum=46.748 (7)", - "tab": "Efficiency", - "score": 6.678270916932833 - }, - "MATH - # eval": { - "description": "min=30, mean=62.429, max=135, sum=437 (7)", - "tab": "General information", - "score": 62.42857142857143 - }, - "MATH - # train": { - "description": "min=8, mean=8, max=8, sum=56 (7)", - "tab": "General information", - "score": 8.0 - }, - "MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "MATH - # prompt tokens": { - "description": "min=881.363, mean=1262.911, max=2197.577, sum=8840.376 (7)", - "tab": "General information", - "score": 1262.9108741840687 - }, - "MATH - # output tokens": { - "description": "min=135.163, mean=189.561, max=219.316, sum=1326.926 (7)", - "tab": "General information", - "score": 189.56082409362702 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" - } - } - }, - { - "evaluation_name": "GSM8K", - "source_data": { - "dataset_name": "GSM8K", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on GSM8K", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.824, - "details": { - "description": "min=0.824, mean=0.824, max=0.824, sum=0.824 (1)", - "tab": "Accuracy", - "GSM8K - Observed inference time (s)": { - "description": "min=6.915, mean=6.915, max=6.915, sum=6.915 (1)", - "tab": "Efficiency", - "score": 6.91472976398468 - }, - "GSM8K - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "GSM8K - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "GSM8K - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GSM8K - # prompt tokens": { - "description": "min=959.035, mean=959.035, max=959.035, sum=959.035 (1)", - "tab": "General information", - "score": 959.035 - }, - "GSM8K - # output tokens": { - "description": "min=141.712, mean=141.712, max=141.712, sum=141.712 (1)", - "tab": "General information", - "score": 141.712 - } - } - }, - "generation_config": { - "additional_details": { - "stop": "none" - } - } - }, - { - "evaluation_name": "LegalBench", - "source_data": { - "dataset_name": "LegalBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on LegalBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.727, - "details": 
{ - "description": "min=0.417, mean=0.727, max=0.947, sum=3.637 (5)", - "tab": "Accuracy", - "LegalBench - Observed inference time (s)": { - "description": "min=0.514, mean=0.608, max=0.803, sum=3.041 (5)", - "tab": "Efficiency", - "score": 0.6081070231398068 - }, - "LegalBench - # eval": { - "description": "min=95, mean=409.4, max=1000, sum=2047 (5)", - "tab": "General information", - "score": 409.4 - }, - "LegalBench - # train": { - "description": "min=4, mean=4.8, max=5, sum=24 (5)", - "tab": "General information", - "score": 4.8 - }, - "LegalBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "LegalBench - # prompt tokens": { - "description": "min=207.442, mean=1524.163, max=6311.388, sum=7620.815 (5)", - "tab": "General information", - "score": 1524.162971355988 - }, - "LegalBench - # output tokens": { - "description": "min=1, mean=1.325, max=2.032, sum=6.626 (5)", - "tab": "General information", - "score": 1.3251168793919403 - } - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] - } - } - }, - { - "evaluation_name": "MedQA", - "source_data": { - "dataset_name": "MedQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MedQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.783, - "details": { - "description": "min=0.783, mean=0.783, max=0.783, sum=0.783 (1)", - "tab": "Accuracy", - "MedQA - Observed inference time (s)": { - "description": "min=0.455, mean=0.455, max=0.455, sum=0.455 (1)", - "tab": "Efficiency", - "score": 0.4549296101329341 - }, - "MedQA - # eval": { - "description": "min=503, mean=503, max=503, sum=503 (1)", - "tab": "General information", - "score": 503.0 - }, - "MedQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "MedQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MedQA - # prompt tokens": { - "description": "min=1027.414, mean=1027.414, max=1027.414, sum=1027.414 (1)", - "tab": "General information", - "score": 1027.4135188866799 - }, - "MedQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WMT 2014", - "source_data": { - "dataset_name": "WMT 2014", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.218, - "details": { - "description": "min=0.169, mean=0.218, max=0.264, sum=1.088 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time (s)": { - "description": "min=1.131, mean=1.185, max=1.222, sum=5.925 (5)", - "tab": "Efficiency", - "score": 1.1850423664020953 - }, - "WMT 2014 - # eval": { - "description": "min=503, mean=568.8, max=832, sum=2844 
(5)", - "tab": "General information", - "score": 568.8 - }, - "WMT 2014 - # train": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "WMT 2014 - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "WMT 2014 - # prompt tokens": { - "description": "min=124.901, mean=148.043, max=168.185, sum=740.213 (5)", - "tab": "General information", - "score": 148.04258583116683 - }, - "WMT 2014 - # output tokens": { - "description": "min=23.744, mean=25.264, max=25.938, sum=126.322 (5)", - "tab": "General information", - "score": 25.26444840571953 - } - } - }, - "generation_config": { - "additional_details": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_lite/openai/gpt-4o-2024-05-13/69ea0ef0-c136-4cff-9607-6ae12e0692c3.json b/data/helm_lite/openai/gpt-4o-2024-05-13/69ea0ef0-c136-4cff-9607-6ae12e0692c3.json deleted file mode 100644 index c23053f17..000000000 --- a/data/helm_lite/openai/gpt-4o-2024-05-13/69ea0ef0-c136-4cff-9607-6ae12e0692c3.json +++ /dev/null @@ -1,643 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_lite/openai_gpt-4o-2024-05-13/1770834614.1822479", - "retrieved_timestamp": "1770834614.1822479", - "source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "GPT-4o 2024-05-13", - "id": "openai/gpt-4o-2024-05-13", - "developer": "openai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_lite", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.938, - "details": { - "tab": "Accuracy", - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.6270536828963795 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.804, - "details": { - "description": "min=0.804, mean=0.804, max=0.804, sum=0.804 (1)", - "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": { - "description": "min=0.556, mean=0.556, max=0.556, sum=0.556 (1)", - "tab": "Efficiency", - "score": 0.5561933571184186 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General 
information", - "score": 5.0 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=3461.668, mean=3461.668, max=3461.668, sum=3461.668 (1)", - "tab": "General information", - "score": 3461.667605633803 - }, - "NarrativeQA - # output tokens": { - "description": "min=4.62, mean=4.62, max=4.62, sum=4.62 (1)", - "tab": "General information", - "score": 4.619718309859155 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (closed-book)", - "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.501, - "details": { - "description": "min=0.501, mean=0.501, max=0.501, sum=0.501 (1)", - "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time (s)": { - "description": "min=0.507, mean=0.507, max=0.507, sum=0.507 (1)", - "tab": "Efficiency", - "score": 0.5071200861930847 - }, - "NaturalQuestions (closed-book) - Observed inference time (s)": { - "description": "min=0.461, mean=0.461, max=0.461, sum=0.461 (1)", - "tab": "Efficiency", - "score": 0.46105142664909365 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1724.02, mean=1724.02, max=1724.02, sum=1724.02 (1)", - "tab": "General information", - "score": 1724.02 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=5.41, mean=5.41, max=5.41, sum=5.41 (1)", - "tab": "General information", - "score": 5.41 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=139.953, mean=139.953, max=139.953, sum=139.953 (1)", - "tab": "General information", - "score": 139.953 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=4.245, mean=4.245, max=4.245, sum=4.245 (1)", - "tab": "General information", - "score": 4.245 - } - } - }, - "generation_config": { - "additional_details": { - "mode": "closedbook" - } - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.966, - "details": { - "description": "min=0.966, mean=0.966, max=0.966, sum=0.966 (1)", - "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": { - "description": "min=0.402, mean=0.402, max=0.402, sum=0.402 (1)", - "tab": "Efficiency", - "score": 0.4019911346435547 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=245.486, mean=245.486, max=245.486, sum=245.486 (1)", - "tab": "General information", - "score": 245.486 - }, - "OpenbookQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.748, - "details": { - "description": "min=0.61, mean=0.748, max=0.95, sum=3.742 (5)", - "tab": "Accuracy", - "MMLU - Observed inference time (s)": { - "description": "min=0.353, mean=0.39, max=0.416, sum=1.952 (5)", - "tab": "Efficiency", - "score": 0.3904274333485386 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=373.42, mean=466.992, max=613.228, sum=2334.958 (5)", - "tab": "General information", - "score": 466.9916140350877 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MATH", - "source_data": { - "dataset_name": "MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.829, - "details": { - "description": "min=0.632, mean=0.829, max=0.977, sum=5.802 (7)", - "tab": "Accuracy", - "MATH - Observed inference time (s)": { - "description": "min=3.334, mean=4.358, max=4.85, sum=30.503 (7)", - "tab": "Efficiency", - "score": 4.357550465458739 - }, - "MATH - # eval": { - "description": "min=30, mean=62.429, max=135, sum=437 (7)", - "tab": "General information", - "score": 62.42857142857143 - }, - "MATH - # train": { - "description": "min=8, mean=8, max=8, sum=56 (7)", - "tab": "General information", - "score": 8.0 - }, - "MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "MATH - # prompt tokens": { - "description": "min=888.43, mean=1273.32, max=2222.25, sum=8913.243 (7)", - "tab": "General information", - "score": 1273.320452019534 - }, - "MATH - # output tokens": { - "description": "min=187.942, mean=245.482, max=284.788, sum=1718.377 (7)", - "tab": "General information", - "score": 245.4823665454633 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" - } - } - }, - { - "evaluation_name": "GSM8K", - "source_data": { - "dataset_name": "GSM8K", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on GSM8K", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.905, - "details": { - "description": "min=0.905, mean=0.905, max=0.905, sum=0.905 (1)", - "tab": "Accuracy", - "GSM8K - Observed inference time (s)": { - "description": "min=4.227, mean=4.227, max=4.227, sum=4.227 (1)", - "tab": "Efficiency", - "score": 4.227096201658249 - }, - "GSM8K - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "GSM8K - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "GSM8K - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GSM8K - # prompt tokens": { - "description": "min=952.617, mean=952.617, max=952.617, sum=952.617 (1)", - "tab": "General information", - "score": 952.617 - }, - "GSM8K - # output tokens": { - "description": "min=213.475, mean=213.475, max=213.475, sum=213.475 (1)", - "tab": "General information", - "score": 213.475 - } - } - }, - "generation_config": { - "additional_details": { - "stop": "none" - } - } - }, - { - "evaluation_name": "LegalBench", - "source_data": { - "dataset_name": "LegalBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on LegalBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.733, - "details": { - "description": "min=0.441, mean=0.733, max=0.989, sum=3.666 (5)", - "tab": "Accuracy", - "LegalBench - Observed inference time (s)": { - "description": "min=0.36, mean=0.431, max=0.568, sum=2.154 
(5)", - "tab": "Efficiency", - "score": 0.4307274274560104 - }, - "LegalBench - # eval": { - "description": "min=95, mean=409.4, max=1000, sum=2047 (5)", - "tab": "General information", - "score": 409.4 - }, - "LegalBench - # train": { - "description": "min=4, mean=4.8, max=5, sum=24 (5)", - "tab": "General information", - "score": 4.8 - }, - "LegalBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "LegalBench - # prompt tokens": { - "description": "min=208.179, mean=1512.795, max=6254.98, sum=7563.977 (5)", - "tab": "General information", - "score": 1512.7954037538377 - }, - "LegalBench - # output tokens": { - "description": "min=1, mean=1.249, max=2.021, sum=6.244 (5)", - "tab": "General information", - "score": 1.2488971748171518 - } - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] - } - } - }, - { - "evaluation_name": "MedQA", - "source_data": { - "dataset_name": "MedQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MedQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.857, - "details": { - "description": "min=0.857, mean=0.857, max=0.857, sum=0.857 (1)", - "tab": "Accuracy", - "MedQA - Observed inference time (s)": { - "description": "min=0.407, mean=0.407, max=0.407, sum=0.407 (1)", - "tab": "Efficiency", - "score": 0.4072816490416024 - }, - "MedQA - # eval": { - "description": "min=503, mean=503, max=503, sum=503 (1)", - "tab": "General information", - "score": 503.0 - }, - "MedQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "MedQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MedQA - # prompt tokens": { - "description": "min=1009.05, mean=1009.05, max=1009.05, sum=1009.05 (1)", - "tab": "General information", - "score": 1009.0497017892644 - }, - "MedQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WMT 2014", - "source_data": { - "dataset_name": "WMT 2014", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.231, - "details": { - "description": "min=0.176, mean=0.231, max=0.281, sum=1.154 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time (s)": { - "description": "min=0.775, mean=0.842, max=0.967, sum=4.212 (5)", - "tab": "Efficiency", - "score": 0.8424805298775759 - }, - "WMT 2014 - # eval": { - "description": "min=503, mean=568.8, max=832, sum=2844 (5)", - "tab": "General information", - "score": 568.8 - }, - "WMT 2014 - # train": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "WMT 2014 - 
truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "WMT 2014 - # prompt tokens": { - "description": "min=79.529, mean=115.006, max=138.497, sum=575.028 (5)", - "tab": "General information", - "score": 115.00557042361216 - }, - "WMT 2014 - # output tokens": { - "description": "min=23.62, mean=25.287, max=26.018, sum=126.434 (5)", - "tab": "General information", - "score": 25.286879683437835 - } - } - }, - "generation_config": { - "additional_details": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_lite/openai/gpt-4o-2024-08-06/bbe708f3-fb78-49e9-876d-cae57f1231cc.json b/data/helm_lite/openai/gpt-4o-2024-08-06/bbe708f3-fb78-49e9-876d-cae57f1231cc.json deleted file mode 100644 index f8d7c3614..000000000 --- a/data/helm_lite/openai/gpt-4o-2024-08-06/bbe708f3-fb78-49e9-876d-cae57f1231cc.json +++ /dev/null @@ -1,643 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_lite/openai_gpt-4o-2024-08-06/1770834614.1822479", - "retrieved_timestamp": "1770834614.1822479", - "source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "GPT-4o 2024-08-06", - "id": "openai/gpt-4o-2024-08-06", - "developer": "openai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_lite", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.928, - "details": { - "tab": "Accuracy", - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.6728589263420724 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.795, - "details": { - "description": "min=0.795, mean=0.795, max=0.795, sum=0.795 (1)", - "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": { - "description": "min=0.562, mean=0.562, max=0.562, sum=0.562 (1)", - "tab": "Efficiency", - "score": 0.5615828097706109 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - 
"description": "min=3451.668, mean=3451.668, max=3451.668, sum=3451.668 (1)", - "tab": "General information", - "score": 3451.667605633803 - }, - "NarrativeQA - # output tokens": { - "description": "min=5.076, mean=5.076, max=5.076, sum=5.076 (1)", - "tab": "General information", - "score": 5.076056338028169 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (closed-book)", - "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.496, - "details": { - "description": "min=0.496, mean=0.496, max=0.496, sum=0.496 (1)", - "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time (s)": { - "description": "min=0.616, mean=0.616, max=0.616, sum=0.616 (1)", - "tab": "Efficiency", - "score": 0.6156781461238862 - }, - "NaturalQuestions (closed-book) - Observed inference time (s)": { - "description": "min=0.418, mean=0.418, max=0.418, sum=0.418 (1)", - "tab": "Efficiency", - "score": 0.4182390425205231 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1714.02, mean=1714.02, max=1714.02, sum=1714.02 (1)", - "tab": "General information", - "score": 1714.02 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=6.504, mean=6.504, max=6.504, sum=6.504 (1)", - "tab": "General information", - "score": 6.504 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=129.953, mean=129.953, max=129.953, sum=129.953 (1)", - "tab": "General information", - "score": 129.953 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=5.032, mean=5.032, max=5.032, sum=5.032 (1)", - "tab": "General information", - "score": 5.032 - } - } - }, - "generation_config": { - "additional_details": { - "mode": "closedbook" - } - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 
0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.968, - "details": { - "description": "min=0.968, mean=0.968, max=0.968, sum=0.968 (1)", - "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": { - "description": "min=0.401, mean=0.401, max=0.401, sum=0.401 (1)", - "tab": "Efficiency", - "score": 0.40116420984268186 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=245.486, mean=245.486, max=245.486, sum=245.486 (1)", - "tab": "General information", - "score": 245.486 - }, - "OpenbookQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.738, - "details": { - "description": "min=0.58, mean=0.738, max=0.95, sum=3.691 (5)", - "tab": "Accuracy", - "MMLU - Observed inference time (s)": { - "description": "min=0.335, mean=0.441, max=0.512, sum=2.204 (5)", - "tab": "Efficiency", - "score": 0.4407063991228739 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=373.42, mean=466.992, max=613.228, sum=2334.958 (5)", - "tab": "General information", - "score": 466.9916140350877 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MATH", - "source_data": { - "dataset_name": "MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.853, - "details": { - "description": "min=0.731, mean=0.853, max=0.956, sum=5.968 (7)", - "tab": "Accuracy", - "MATH - Observed inference time (s)": { - "description": "min=3.205, mean=4.321, max=6.062, sum=30.245 (7)", - "tab": "Efficiency", - "score": 
4.320655013573451 - }, - "MATH - # eval": { - "description": "min=30, mean=62.429, max=135, sum=437 (7)", - "tab": "General information", - "score": 62.42857142857143 - }, - "MATH - # train": { - "description": "min=8, mean=8, max=8, sum=56 (7)", - "tab": "General information", - "score": 8.0 - }, - "MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "MATH - # prompt tokens": { - "description": "min=888.43, mean=1273.32, max=2222.25, sum=8913.243 (7)", - "tab": "General information", - "score": 1273.320452019534 - }, - "MATH - # output tokens": { - "description": "min=157.721, mean=210.124, max=243.135, sum=1470.869 (7)", - "tab": "General information", - "score": 210.1241379885811 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" - } - } - }, - { - "evaluation_name": "GSM8K", - "source_data": { - "dataset_name": "GSM8K", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on GSM8K", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.909, - "details": { - "description": "min=0.909, mean=0.909, max=0.909, sum=0.909 (1)", - "tab": "Accuracy", - "GSM8K - Observed inference time (s)": { - "description": "min=2.937, mean=2.937, max=2.937, sum=2.937 (1)", - "tab": "Efficiency", - "score": 2.9373713800907133 - }, - "GSM8K - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "GSM8K - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "GSM8K - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GSM8K - # prompt tokens": { - "description": "min=952.617, mean=952.617, max=952.617, sum=952.617 (1)", - "tab": "General information", - "score": 952.617 - }, - "GSM8K - # output tokens": { - "description": "min=167.729, mean=167.729, max=167.729, sum=167.729 (1)", - "tab": "General information", - "score": 167.729 - } - } - }, - "generation_config": { - "additional_details": { - "stop": "none" - } - } - }, - { - "evaluation_name": "LegalBench", - "source_data": { - "dataset_name": "LegalBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on LegalBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.721, - "details": { - "description": "min=0.422, mean=0.721, max=0.979, sum=3.605 (5)", - "tab": "Accuracy", - "LegalBench - Observed inference time (s)": { - "description": "min=0.312, mean=0.38, max=0.526, sum=1.901 (5)", - "tab": "Efficiency", - "score": 0.38022537218958125 - }, - "LegalBench - # eval": { - "description": "min=95, mean=409.4, max=1000, sum=2047 (5)", - "tab": "General information", - "score": 409.4 - }, - "LegalBench - # train": { - "description": "min=4, 
mean=4.8, max=5, sum=24 (5)", - "tab": "General information", - "score": 4.8 - }, - "LegalBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "LegalBench - # prompt tokens": { - "description": "min=198.179, mean=1502.795, max=6244.98, sum=7513.977 (5)", - "tab": "General information", - "score": 1502.7954037538377 - }, - "LegalBench - # output tokens": { - "description": "min=1, mean=1.298, max=2.021, sum=6.49 (5)", - "tab": "General information", - "score": 1.298021970457479 - } - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] - } - } - }, - { - "evaluation_name": "MedQA", - "source_data": { - "dataset_name": "MedQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MedQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.863, - "details": { - "description": "min=0.863, mean=0.863, max=0.863, sum=0.863 (1)", - "tab": "Accuracy", - "MedQA - Observed inference time (s)": { - "description": "min=0.307, mean=0.307, max=0.307, sum=0.307 (1)", - "tab": "Efficiency", - "score": 0.30731069923158194 - }, - "MedQA - # eval": { - "description": "min=503, mean=503, max=503, sum=503 (1)", - "tab": "General information", - "score": 503.0 - }, - "MedQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "MedQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MedQA - # prompt tokens": { - "description": "min=1009.05, mean=1009.05, max=1009.05, sum=1009.05 (1)", - "tab": "General information", - "score": 1009.0497017892644 - }, - "MedQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WMT 2014", - "source_data": { - "dataset_name": "WMT 2014", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.225, - "details": { - "description": "min=0.18, mean=0.225, max=0.267, sum=1.125 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time (s)": { - "description": "min=0.725, mean=0.768, max=0.804, sum=3.841 (5)", - "tab": "Efficiency", - "score": 0.7681678841877538 - }, - "WMT 2014 - # eval": { - "description": "min=503, mean=568.8, max=832, sum=2844 (5)", - "tab": "General information", - "score": 568.8 - }, - "WMT 2014 - # train": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "WMT 2014 - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "WMT 2014 - # prompt tokens": { - "description": "min=69.529, mean=105.006, max=128.497, sum=525.028 (5)", - "tab": "General information", - 
"score": 105.00557042361216 - }, - "WMT 2014 - # output tokens": { - "description": "min=23.809, mean=25.367, max=25.988, sum=126.835 (5)", - "tab": "General information", - "score": 25.366906254779018 - } - } - }, - "generation_config": { - "additional_details": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_lite/openai/gpt-4o-mini-2024-07-18/ab7b7951-0792-4538-8a7a-6baee8602cbb.json b/data/helm_lite/openai/gpt-4o-mini-2024-07-18/ab7b7951-0792-4538-8a7a-6baee8602cbb.json deleted file mode 100644 index 3869cb246..000000000 --- a/data/helm_lite/openai/gpt-4o-mini-2024-07-18/ab7b7951-0792-4538-8a7a-6baee8602cbb.json +++ /dev/null @@ -1,643 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_lite/openai_gpt-4o-mini-2024-07-18/1770834614.1822479", - "retrieved_timestamp": "1770834614.1822479", - "source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "GPT-4o mini 2024-07-18", - "id": "openai/gpt-4o-mini-2024-07-18", - "developer": "openai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_lite", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.701, - "details": { - "tab": "Accuracy", - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.7796004993757802 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.768, - "details": { - "description": "min=0.768, mean=0.768, max=0.768, sum=0.768 (1)", - "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": { - "description": "min=0.473, mean=0.473, max=0.473, sum=0.473 (1)", - "tab": "Efficiency", - "score": 0.47311924612018424 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=3451.668, mean=3451.668, max=3451.668, sum=3451.668 (1)", - "tab": "General information", - "score": 3451.667605633803 - }, - "NarrativeQA - # output tokens": { - "description": "min=4.482, mean=4.482, max=4.482, 
sum=4.482 (1)", - "tab": "General information", - "score": 4.48169014084507 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (closed-book)", - "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.386, - "details": { - "description": "min=0.386, mean=0.386, max=0.386, sum=0.386 (1)", - "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time (s)": { - "description": "min=0.406, mean=0.406, max=0.406, sum=0.406 (1)", - "tab": "Efficiency", - "score": 0.40617332768440245 - }, - "NaturalQuestions (closed-book) - Observed inference time (s)": { - "description": "min=0.374, mean=0.374, max=0.374, sum=0.374 (1)", - "tab": "Efficiency", - "score": 0.3740478873252869 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1714.02, mean=1714.02, max=1714.02, sum=1714.02 (1)", - "tab": "General information", - "score": 1714.02 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=5.175, mean=5.175, max=5.175, sum=5.175 (1)", - "tab": "General information", - "score": 5.175 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=129.953, mean=129.953, max=129.953, sum=129.953 (1)", - "tab": "General information", - "score": 129.953 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=4.847, mean=4.847, max=4.847, sum=4.847 (1)", - "tab": "General information", - "score": 4.847 - } - } - }, - "generation_config": { - "additional_details": { - "mode": "closedbook" - } - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.92, - "details": { - "description": "min=0.92, mean=0.92, max=0.92, sum=0.92 (1)", - "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": { - "description": 
"min=0.331, mean=0.331, max=0.331, sum=0.331 (1)", - "tab": "Efficiency", - "score": 0.3309546322822571 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=245.486, mean=245.486, max=245.486, sum=245.486 (1)", - "tab": "General information", - "score": 245.486 - }, - "OpenbookQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.668, - "details": { - "description": "min=0.42, mean=0.668, max=0.91, sum=3.339 (5)", - "tab": "Accuracy", - "MMLU - Observed inference time (s)": { - "description": "min=0.292, mean=0.299, max=0.309, sum=1.497 (5)", - "tab": "Efficiency", - "score": 0.2993013315033494 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=373.42, mean=466.992, max=613.228, sum=2334.958 (5)", - "tab": "General information", - "score": 466.9916140350877 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MATH", - "source_data": { - "dataset_name": "MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.802, - "details": { - "description": "min=0.605, mean=0.802, max=0.97, sum=5.611 (7)", - "tab": "Accuracy", - "MATH - Observed inference time (s)": { - "description": "min=2.312, mean=3.175, max=3.696, sum=22.228 (7)", - "tab": "Efficiency", - "score": 3.175392215033706 - }, - "MATH - # eval": { - "description": "min=30, mean=62.429, max=135, sum=437 (7)", - "tab": "General information", - "score": 62.42857142857143 - }, - "MATH - # train": { - "description": "min=8, mean=8, max=8, sum=56 
(7)", - "tab": "General information", - "score": 8.0 - }, - "MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "MATH - # prompt tokens": { - "description": "min=888.43, mean=1273.32, max=2222.25, sum=8913.243 (7)", - "tab": "General information", - "score": 1273.320452019534 - }, - "MATH - # output tokens": { - "description": "min=167.884, mean=238.235, max=276.058, sum=1667.647 (7)", - "tab": "General information", - "score": 238.23525019565412 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" - } - } - }, - { - "evaluation_name": "GSM8K", - "source_data": { - "dataset_name": "GSM8K", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on GSM8K", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.843, - "details": { - "description": "min=0.843, mean=0.843, max=0.843, sum=0.843 (1)", - "tab": "Accuracy", - "GSM8K - Observed inference time (s)": { - "description": "min=2.519, mean=2.519, max=2.519, sum=2.519 (1)", - "tab": "Efficiency", - "score": 2.5191967821121217 - }, - "GSM8K - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "GSM8K - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "GSM8K - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GSM8K - # prompt tokens": { - "description": "min=952.617, mean=952.617, max=952.617, sum=952.617 (1)", - "tab": "General information", - "score": 952.617 - }, - "GSM8K - # output tokens": { - "description": "min=215.465, mean=215.465, max=215.465, sum=215.465 (1)", - "tab": "General information", - "score": 215.465 - } - } - }, - "generation_config": { - "additional_details": { - "stop": "none" - } - } - }, - { - "evaluation_name": "LegalBench", - "source_data": { - "dataset_name": "LegalBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on LegalBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.653, - "details": { - "description": "min=0.414, mean=0.653, max=0.937, sum=3.263 (5)", - "tab": "Accuracy", - "LegalBench - Observed inference time (s)": { - "description": "min=0.337, mean=0.382, max=0.503, sum=1.91 (5)", - "tab": "Efficiency", - "score": 0.38199841220513264 - }, - "LegalBench - # eval": { - "description": "min=95, mean=409.4, max=1000, sum=2047 (5)", - "tab": "General information", - "score": 409.4 - }, - "LegalBench - # train": { - "description": "min=4, mean=4.8, max=5, sum=24 (5)", - "tab": "General information", - "score": 4.8 - }, - "LegalBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "LegalBench - # prompt 
tokens": { - "description": "min=198.179, mean=1502.795, max=6244.98, sum=7513.977 (5)", - "tab": "General information", - "score": 1502.7954037538377 - }, - "LegalBench - # output tokens": { - "description": "min=1, mean=1.293, max=2.253, sum=6.465 (5)", - "tab": "General information", - "score": 1.2930331277785745 - } - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] - } - } - }, - { - "evaluation_name": "MedQA", - "source_data": { - "dataset_name": "MedQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MedQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.748, - "details": { - "description": "min=0.748, mean=0.748, max=0.748, sum=0.748 (1)", - "tab": "Accuracy", - "MedQA - Observed inference time (s)": { - "description": "min=0.332, mean=0.332, max=0.332, sum=0.332 (1)", - "tab": "Efficiency", - "score": 0.3318999989132284 - }, - "MedQA - # eval": { - "description": "min=503, mean=503, max=503, sum=503 (1)", - "tab": "General information", - "score": 503.0 - }, - "MedQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "MedQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MedQA - # prompt tokens": { - "description": "min=1009.05, mean=1009.05, max=1009.05, sum=1009.05 (1)", - "tab": "General information", - "score": 1009.0497017892644 - }, - "MedQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WMT 2014", - "source_data": { - "dataset_name": "WMT 2014", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.206, - "details": { - "description": "min=0.153, mean=0.206, max=0.254, sum=1.032 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time (s)": { - "description": "min=0.557, mean=0.583, max=0.598, sum=2.917 (5)", - "tab": "Efficiency", - "score": 0.5833699647787834 - }, - "WMT 2014 - # eval": { - "description": "min=503, mean=568.8, max=832, sum=2844 (5)", - "tab": "General information", - "score": 568.8 - }, - "WMT 2014 - # train": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "WMT 2014 - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "WMT 2014 - # prompt tokens": { - "description": "min=69.529, mean=105.006, max=128.497, sum=525.028 (5)", - "tab": "General information", - "score": 105.00557042361216 - }, - "WMT 2014 - # output tokens": { - "description": "min=23.748, mean=25.504, max=26.235, sum=127.522 (5)", - "tab": "General information", - "score": 25.504310196513227 - } - } - }, - "generation_config": { - 
"additional_details": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_lite/openai/text-davinci-002/fc94c95d-9678-4f23-b82f-190a08ece307.json b/data/helm_lite/openai/text-davinci-002/fc94c95d-9678-4f23-b82f-190a08ece307.json deleted file mode 100644 index f3294dd85..000000000 --- a/data/helm_lite/openai/text-davinci-002/fc94c95d-9678-4f23-b82f-190a08ece307.json +++ /dev/null @@ -1,641 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_lite/openai_text-davinci-002/1770834614.1822479", - "retrieved_timestamp": "1770834614.1822479", - "source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "GPT-3.5 text-davinci-002", - "id": "openai/text-davinci-002", - "developer": "openai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_lite", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.336, - "details": { - "tab": "Accuracy", - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.6860299625468165 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.719, - "details": { - "description": "min=0.719, mean=0.719, max=0.719, sum=0.719 (1)", - "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": { - "description": "min=1.226, mean=1.226, max=1.226, sum=1.226 (1)", - "tab": "Efficiency", - "score": 1.2258358747186795 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=4.955, mean=4.955, max=4.955, sum=4.955 (1)", - "tab": "General information", - "score": 4.954929577464789 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=3479.563, mean=3479.563, max=3479.563, sum=3479.563 (1)", - "tab": "General information", - "score": 3479.56338028169 - }, - "NarrativeQA - # output tokens": { - "description": "min=8.448, mean=8.448, max=8.448, sum=8.448 (1)", - "tab": "General information", - "score": 8.447887323943663 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (closed-book)", - "source_data": { - 
"dataset_name": "NaturalQuestions (closed-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.394, - "details": { - "description": "min=0.394, mean=0.394, max=0.394, sum=0.394 (1)", - "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time (s)": { - "description": "min=0.886, mean=0.886, max=0.886, sum=0.886 (1)", - "tab": "Efficiency", - "score": 0.8863302536010742 - }, - "NaturalQuestions (closed-book) - Observed inference time (s)": { - "description": "min=0.683, mean=0.683, max=0.683, sum=0.683 (1)", - "tab": "Efficiency", - "score": 0.6834516413211823 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.885, mean=4.885, max=4.885, sum=4.885 (1)", - "tab": "General information", - "score": 4.885 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.02, mean=0.02, max=0.02, sum=0.02 (1)", - "tab": "General information", - "score": 0.02 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1617.729, mean=1617.729, max=1617.729, sum=1617.729 (1)", - "tab": "General information", - "score": 1617.729 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=6.632, mean=6.632, max=6.632, sum=6.632 (1)", - "tab": "General information", - "score": 6.632 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=116.254, mean=116.254, max=116.254, sum=116.254 (1)", - "tab": "General information", - "score": 116.254 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=4.116, mean=4.116, max=4.116, sum=4.116 (1)", - "tab": "General information", - "score": 4.116 - } - } - }, - "generation_config": { - "additional_details": { - "mode": "closedbook" - } - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.796, - "details": { - "description": "min=0.796, mean=0.796, max=0.796, sum=0.796 (1)", - "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": { - "description": "min=0.174, mean=0.174, max=0.174, sum=0.174 (1)", - "tab": "Efficiency", - "score": 0.1743956871032715 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 
(1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=254.21, mean=254.21, max=254.21, sum=254.21 (1)", - "tab": "General information", - "score": 254.21 - }, - "OpenbookQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.568, - "details": { - "description": "min=0.26, mean=0.568, max=0.84, sum=2.841 (5)", - "tab": "Accuracy", - "MMLU - Observed inference time (s)": { - "description": "min=0.175, mean=0.177, max=0.181, sum=0.887 (5)", - "tab": "Efficiency", - "score": 0.17730724048614502 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=371.38, mean=472.274, max=624.07, sum=2361.37 (5)", - "tab": "General information", - "score": 472.2740350877192 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MATH", - "source_data": { - "dataset_name": "MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.428, - "details": { - "description": "min=0.288, mean=0.428, max=0.548, sum=2.997 (7)", - "tab": "Accuracy", - "MATH - Observed inference time (s)": { - "description": "min=3.257, mean=5.188, max=9.459, sum=36.316 (7)", - "tab": "Efficiency", - "score": 5.188020693120383 - }, - "MATH - # eval": { - "description": "min=30, mean=62.429, max=135, sum=437 (7)", - "tab": "General information", - "score": 62.42857142857143 - }, - "MATH - # train": { - "description": "min=8, mean=8, max=8, sum=56 (7)", - "tab": "General information", - "score": 8.0 - }, - "MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "MATH - 
# prompt tokens": { - "description": "min=906.556, mean=1375.735, max=2449.942, sum=9630.147 (7)", - "tab": "General information", - "score": 1375.7353092779654 - }, - "MATH - # output tokens": { - "description": "min=76.721, mean=136.822, max=259.175, sum=957.754 (7)", - "tab": "General information", - "score": 136.82193804427587 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" - } - } - }, - { - "evaluation_name": "GSM8K", - "source_data": { - "dataset_name": "GSM8K", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on GSM8K", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.479, - "details": { - "description": "min=0.479, mean=0.479, max=0.479, sum=0.479 (1)", - "tab": "Accuracy", - "GSM8K - Observed inference time (s)": { - "description": "min=3.762, mean=3.762, max=3.762, sum=3.762 (1)", - "tab": "Efficiency", - "score": 3.762208682537079 - }, - "GSM8K - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "GSM8K - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "GSM8K - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GSM8K - # prompt tokens": { - "description": "min=938.869, mean=938.869, max=938.869, sum=938.869 (1)", - "tab": "General information", - "score": 938.869 - }, - "GSM8K - # output tokens": { - "description": "min=90.543, mean=90.543, max=90.543, sum=90.543 (1)", - "tab": "General information", - "score": 90.543 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "LegalBench", - "source_data": { - "dataset_name": "LegalBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on LegalBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.58, - "details": { - "description": "min=0.326, mean=0.58, max=0.916, sum=2.901 (5)", - "tab": "Accuracy", - "LegalBench - Observed inference time (s)": { - "description": "min=0.167, mean=0.223, max=0.403, sum=1.115 (5)", - "tab": "Efficiency", - "score": 0.2229105462585103 - }, - "LegalBench - # eval": { - "description": "min=95, mean=409.4, max=1000, sum=2047 (5)", - "tab": "General information", - "score": 409.4 - }, - "LegalBench - # train": { - "description": "min=2.053, mean=4.211, max=5, sum=21.053 (5)", - "tab": "General information", - "score": 4.210612244897959 - }, - "LegalBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "LegalBench - # prompt tokens": { - "description": "min=205.632, mean=907.387, max=3225.32, sum=4536.936 (5)", - "tab": "General information", - "score": 907.3872120499769 - }, - "LegalBench - # output tokens": { - 
"description": "min=0.996, mean=1.099, max=1.238, sum=5.496 (5)", - "tab": "General information", - "score": 1.0991972687655298 - } - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] - } - } - }, - { - "evaluation_name": "MedQA", - "source_data": { - "dataset_name": "MedQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MedQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.525, - "details": { - "description": "min=0.525, mean=0.525, max=0.525, sum=0.525 (1)", - "tab": "Accuracy", - "MedQA - Observed inference time (s)": { - "description": "min=0.206, mean=0.206, max=0.206, sum=0.206 (1)", - "tab": "Efficiency", - "score": 0.20554606720183052 - }, - "MedQA - # eval": { - "description": "min=503, mean=503, max=503, sum=503 (1)", - "tab": "General information", - "score": 503.0 - }, - "MedQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "MedQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MedQA - # prompt tokens": { - "description": "min=1038.861, mean=1038.861, max=1038.861, sum=1038.861 (1)", - "tab": "General information", - "score": 1038.8608349900596 - }, - "MedQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WMT 2014", - "source_data": { - "dataset_name": "WMT 2014", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.174, - "details": { - "description": "min=0.077, mean=0.174, max=0.212, sum=0.872 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time (s)": { - "description": "min=0.446, mean=0.467, max=0.478, sum=2.336 (5)", - "tab": "Efficiency", - "score": 0.4672719452194591 - }, - "WMT 2014 - # eval": { - "description": "min=503, mean=568.8, max=832, sum=2844 (5)", - "tab": "General information", - "score": 568.8 - }, - "WMT 2014 - # train": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "WMT 2014 - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "WMT 2014 - # prompt tokens": { - "description": "min=136.93, mean=181.694, max=241.662, sum=908.469 (5)", - "tab": "General information", - "score": 181.69386660804403 - }, - "WMT 2014 - # output tokens": { - "description": "min=23.557, mean=24.862, max=25.636, sum=124.309 (5)", - "tab": "General information", - "score": 24.86174013610644 - } - } - }, - "generation_config": { - "additional_details": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] - } - } - } - ] -} \ No newline at end of file diff --git 
a/data/helm_lite/openai/text-davinci-003/3f92e2fc-9831-4c2c-b94e-af33d457fa82.json b/data/helm_lite/openai/text-davinci-003/3f92e2fc-9831-4c2c-b94e-af33d457fa82.json deleted file mode 100644 index 93f27df2b..000000000 --- a/data/helm_lite/openai/text-davinci-003/3f92e2fc-9831-4c2c-b94e-af33d457fa82.json +++ /dev/null @@ -1,641 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_lite/openai_text-davinci-003/1770834614.1822479", - "retrieved_timestamp": "1770834614.1822479", - "source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "GPT-3.5 text-davinci-003", - "id": "openai/text-davinci-003", - "developer": "openai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_lite", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.439, - "details": { - "tab": "Accuracy", - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.5880524344569289 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.731, - "details": { - "description": "min=0.731, mean=0.731, max=0.731, sum=0.731 (1)", - "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": { - "description": "min=1.813, mean=1.813, max=1.813, sum=1.813 (1)", - "tab": "Efficiency", - "score": 1.812959625351597 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=4.955, mean=4.955, max=4.955, sum=4.955 (1)", - "tab": "General information", - "score": 4.954929577464789 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=3479.563, mean=3479.563, max=3479.563, sum=3479.563 (1)", - "tab": "General information", - "score": 3479.56338028169 - }, - "NarrativeQA - # output tokens": { - "description": "min=9.732, mean=9.732, max=9.732, sum=9.732 (1)", - "tab": "General information", - "score": 9.732394366197184 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (closed-book)", - "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.413, - "details": { - "description": "min=0.413, mean=0.413, max=0.413, sum=0.413 (1)", - "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time (s)": { - "description": "min=1.187, mean=1.187, max=1.187, sum=1.187 (1)", - "tab": "Efficiency", - "score": 1.1872664585113526 - }, - "NaturalQuestions (closed-book) - Observed inference time (s)": { - "description": "min=0.996, mean=0.996, max=0.996, sum=0.996 (1)", - "tab": "Efficiency", - "score": 0.9963206455707551 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.885, mean=4.885, max=4.885, sum=4.885 (1)", - "tab": "General information", - "score": 4.885 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.02, mean=0.02, max=0.02, sum=0.02 (1)", - "tab": "General information", - "score": 0.02 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1617.729, mean=1617.729, max=1617.729, sum=1617.729 (1)", - "tab": "General information", - "score": 1617.729 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=6.8, mean=6.8, max=6.8, sum=6.8 (1)", - "tab": "General information", - "score": 6.8 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=116.254, mean=116.254, max=116.254, sum=116.254 (1)", - "tab": "General information", - "score": 116.254 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=7.074, mean=7.074, max=7.074, sum=7.074 (1)", - "tab": "General information", - "score": 7.074 - } - } - }, - "generation_config": { - "additional_details": { - "mode": "closedbook" - } - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.828, - "details": { - "description": "min=0.828, mean=0.828, max=0.828, sum=0.828 (1)", - "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": { - "description": "min=0.204, mean=0.204, max=0.204, sum=0.204 (1)", - "tab": "Efficiency", - "score": 0.20436767482757567 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - 
"description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=254.21, mean=254.21, max=254.21, sum=254.21 (1)", - "tab": "General information", - "score": 254.21 - }, - "OpenbookQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.555, - "details": { - "description": "min=0.3, mean=0.555, max=0.83, sum=2.774 (5)", - "tab": "Accuracy", - "MMLU - Observed inference time (s)": { - "description": "min=0.199, mean=0.2, max=0.203, sum=1.0 (5)", - "tab": "Efficiency", - "score": 0.2000334782098469 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=371.38, mean=472.274, max=624.07, sum=2361.37 (5)", - "tab": "General information", - "score": 472.2740350877192 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MATH", - "source_data": { - "dataset_name": "MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.449, - "details": { - "description": "min=0.3, mean=0.449, max=0.548, sum=3.146 (7)", - "tab": "Accuracy", - "MATH - Observed inference time (s)": { - "description": "min=3.871, mean=4.334, max=5.181, sum=30.338 (7)", - "tab": "Efficiency", - "score": 4.333955165715466 - }, - "MATH - # eval": { - "description": "min=30, mean=62.429, max=135, sum=437 (7)", - "tab": "General information", - "score": 62.42857142857143 - }, - "MATH - # train": { - "description": "min=8, mean=8, max=8, sum=56 (7)", - "tab": "General information", - "score": 8.0 - }, - "MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "MATH - # prompt tokens": { - "description": "min=906.556, mean=1375.735, max=2449.942, sum=9630.147 
(7)", - "tab": "General information", - "score": 1375.7353092779654 - }, - "MATH - # output tokens": { - "description": "min=61.333, mean=74.938, max=97.115, sum=524.566 (7)", - "tab": "General information", - "score": 74.93793702104595 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" - } - } - }, - { - "evaluation_name": "GSM8K", - "source_data": { - "dataset_name": "GSM8K", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on GSM8K", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.615, - "details": { - "description": "min=0.615, mean=0.615, max=0.615, sum=0.615 (1)", - "tab": "Accuracy", - "GSM8K - Observed inference time (s)": { - "description": "min=5.199, mean=5.199, max=5.199, sum=5.199 (1)", - "tab": "Efficiency", - "score": 5.199419307470322 - }, - "GSM8K - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "GSM8K - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "GSM8K - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GSM8K - # prompt tokens": { - "description": "min=938.869, mean=938.869, max=938.869, sum=938.869 (1)", - "tab": "General information", - "score": 938.869 - }, - "GSM8K - # output tokens": { - "description": "min=93.717, mean=93.717, max=93.717, sum=93.717 (1)", - "tab": "General information", - "score": 93.717 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "LegalBench", - "source_data": { - "dataset_name": "LegalBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on LegalBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.622, - "details": { - "description": "min=0.324, mean=0.622, max=0.947, sum=3.11 (5)", - "tab": "Accuracy", - "LegalBench - Observed inference time (s)": { - "description": "min=0.189, mean=0.259, max=0.474, sum=1.297 (5)", - "tab": "Efficiency", - "score": 0.2594051892596125 - }, - "LegalBench - # eval": { - "description": "min=95, mean=409.4, max=1000, sum=2047 (5)", - "tab": "General information", - "score": 409.4 - }, - "LegalBench - # train": { - "description": "min=2.053, mean=4.211, max=5, sum=21.053 (5)", - "tab": "General information", - "score": 4.210612244897959 - }, - "LegalBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "LegalBench - # prompt tokens": { - "description": "min=205.632, mean=907.387, max=3225.32, sum=4536.936 (5)", - "tab": "General information", - "score": 907.3872120499769 - }, - "LegalBench - # output tokens": { - "description": "min=1, mean=1.168, max=1.443, sum=5.838 (5)", - "tab": "General information", - 
"score": 1.1675708408818857 - } - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] - } - } - }, - { - "evaluation_name": "MedQA", - "source_data": { - "dataset_name": "MedQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MedQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.531, - "details": { - "description": "min=0.531, mean=0.531, max=0.531, sum=0.531 (1)", - "tab": "Accuracy", - "MedQA - Observed inference time (s)": { - "description": "min=0.228, mean=0.228, max=0.228, sum=0.228 (1)", - "tab": "Efficiency", - "score": 0.22811962975185388 - }, - "MedQA - # eval": { - "description": "min=503, mean=503, max=503, sum=503 (1)", - "tab": "General information", - "score": 503.0 - }, - "MedQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "MedQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MedQA - # prompt tokens": { - "description": "min=1038.861, mean=1038.861, max=1038.861, sum=1038.861 (1)", - "tab": "General information", - "score": 1038.8608349900596 - }, - "MedQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WMT 2014", - "source_data": { - "dataset_name": "WMT 2014", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.191, - "details": { - "description": "min=0.094, mean=0.191, max=0.227, sum=0.956 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time (s)": { - "description": "min=0.756, mean=0.8, max=0.822, sum=4.0 (5)", - "tab": "Efficiency", - "score": 0.800053899013968 - }, - "WMT 2014 - # eval": { - "description": "min=503, mean=568.8, max=832, sum=2844 (5)", - "tab": "General information", - "score": 568.8 - }, - "WMT 2014 - # train": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "WMT 2014 - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "WMT 2014 - # prompt tokens": { - "description": "min=136.93, mean=181.694, max=241.662, sum=908.469 (5)", - "tab": "General information", - "score": 181.69386660804403 - }, - "WMT 2014 - # output tokens": { - "description": "min=23.563, mean=25.117, max=25.652, sum=125.587 (5)", - "tab": "General information", - "score": 25.117336366416882 - } - } - }, - "generation_config": { - "additional_details": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_lite/qwen/qwen1.5-110b-chat/3e3c79f0-5fb8-4a3f-8c9b-53f742ec2f43.json 
b/data/helm_lite/qwen/qwen1.5-110b-chat/3e3c79f0-5fb8-4a3f-8c9b-53f742ec2f43.json deleted file mode 100644 index 800f57826..000000000 --- a/data/helm_lite/qwen/qwen1.5-110b-chat/3e3c79f0-5fb8-4a3f-8c9b-53f742ec2f43.json +++ /dev/null @@ -1,643 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_lite/qwen_qwen1.5-110b-chat/1770834614.1822479", - "retrieved_timestamp": "1770834614.1822479", - "source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen1.5 Chat 110B", - "id": "qwen/qwen1.5-110b-chat", - "developer": "qwen", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_lite", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.55, - "details": { - "tab": "Accuracy", - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.6592634207240948 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.721, - "details": { - "description": "min=0.721, mean=0.721, max=0.721, sum=0.721 (1)", - "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": { - "description": "min=0.984, mean=0.984, max=0.984, sum=0.984 (1)", - "tab": "Efficiency", - "score": 0.9843533623386437 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=3502.913, mean=3502.913, max=3502.913, sum=3502.913 (1)", - "tab": "General information", - "score": 3502.912676056338 - }, - "NarrativeQA - # output tokens": { - "description": "min=10.29, mean=10.29, max=10.29, sum=10.29 (1)", - "tab": "General information", - "score": 10.290140845070422 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (closed-book)", - "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions 
(closed-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.35, - "details": { - "description": "min=0.35, mean=0.35, max=0.35, sum=0.35 (1)", - "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time (s)": { - "description": "min=0.647, mean=0.647, max=0.647, sum=0.647 (1)", - "tab": "Efficiency", - "score": 0.6468759918212891 - }, - "NaturalQuestions (closed-book) - Observed inference time (s)": { - "description": "min=0.465, mean=0.465, max=0.465, sum=0.465 (1)", - "tab": "Efficiency", - "score": 0.46513359355926515 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=2017.955, mean=2017.955, max=2017.955, sum=2017.955 (1)", - "tab": "General information", - "score": 2017.955 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=8.509, mean=8.509, max=8.509, sum=8.509 (1)", - "tab": "General information", - "score": 8.509 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=146.262, mean=146.262, max=146.262, sum=146.262 (1)", - "tab": "General information", - "score": 146.262 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=8.99, mean=8.99, max=8.99, sum=8.99 (1)", - "tab": "General information", - "score": 8.99 - } - } - }, - "generation_config": { - "additional_details": { - "mode": "closedbook" - } - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.922, - "details": { - "description": "min=0.922, mean=0.922, max=0.922, sum=0.922 (1)", - "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": { - "description": "min=0.244, mean=0.244, max=0.244, sum=0.244 (1)", - "tab": "Efficiency", - "score": 0.24445231294631958 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - 
"OpenbookQA - # prompt tokens": { - "description": "min=249.846, mean=249.846, max=249.846, sum=249.846 (1)", - "tab": "General information", - "score": 249.846 - }, - "OpenbookQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.704, - "details": { - "description": "min=0.57, mean=0.704, max=0.87, sum=3.52 (5)", - "tab": "Accuracy", - "MMLU - Observed inference time (s)": { - "description": "min=0.229, mean=0.248, max=0.277, sum=1.241 (5)", - "tab": "Efficiency", - "score": 0.2482092388136345 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=378.19, mean=477.836, max=627.939, sum=2389.179 (5)", - "tab": "General information", - "score": 477.8357192982456 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MATH", - "source_data": { - "dataset_name": "MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.568, - "details": { - "description": "min=0.211, mean=0.568, max=0.769, sum=3.974 (7)", - "tab": "Accuracy", - "MATH - Observed inference time (s)": { - "description": "min=2.984, mean=3.989, max=5.0, sum=27.92 (7)", - "tab": "Efficiency", - "score": 3.9885726889236994 - }, - "MATH - # eval": { - "description": "min=30, mean=62.429, max=135, sum=437 (7)", - "tab": "General information", - "score": 62.42857142857143 - }, - "MATH - # train": { - "description": "min=8, mean=8, max=8, sum=56 (7)", - "tab": "General information", - "score": 8.0 - }, - "MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "MATH - # prompt tokens": { - "description": "min=937.926, mean=1323.837, max=2246.673, sum=9266.858 (7)", - "tab": "General information", - "score": 1323.836848955025 - }, - "MATH - # output tokens": { - "description": "min=104.174, mean=156.855, max=202.368, sum=1097.984 (7)", - "tab": "General information", - "score": 
156.85484968134907 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" - } - } - }, - { - "evaluation_name": "GSM8K", - "source_data": { - "dataset_name": "GSM8K", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on GSM8K", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.815, - "details": { - "description": "min=0.815, mean=0.815, max=0.815, sum=0.815 (1)", - "tab": "Accuracy", - "GSM8K - Observed inference time (s)": { - "description": "min=4.537, mean=4.537, max=4.537, sum=4.537 (1)", - "tab": "Efficiency", - "score": 4.537143226146698 - }, - "GSM8K - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "GSM8K - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "GSM8K - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GSM8K - # prompt tokens": { - "description": "min=1130.403, mean=1130.403, max=1130.403, sum=1130.403 (1)", - "tab": "General information", - "score": 1130.403 - }, - "GSM8K - # output tokens": { - "description": "min=175.784, mean=175.784, max=175.784, sum=175.784 (1)", - "tab": "General information", - "score": 175.784 - } - } - }, - "generation_config": { - "additional_details": { - "stop": "none" - } - } - }, - { - "evaluation_name": "LegalBench", - "source_data": { - "dataset_name": "LegalBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on LegalBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.624, - "details": { - "description": "min=0.387, mean=0.624, max=0.958, sum=3.121 (5)", - "tab": "Accuracy", - "LegalBench - Observed inference time (s)": { - "description": "min=0.271, mean=0.499, max=1.328, sum=2.493 (5)", - "tab": "Efficiency", - "score": 0.4986402694478536 - }, - "LegalBench - # eval": { - "description": "min=95, mean=409.4, max=1000, sum=2047 (5)", - "tab": "General information", - "score": 409.4 - }, - "LegalBench - # train": { - "description": "min=4, mean=4.8, max=5, sum=24 (5)", - "tab": "General information", - "score": 4.8 - }, - "LegalBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "LegalBench - # prompt tokens": { - "description": "min=207.453, mean=1557.088, max=6445.714, sum=7785.442 (5)", - "tab": "General information", - "score": 1557.0883229968654 - }, - "LegalBench - # output tokens": { - "description": "min=2, mean=2.314, max=2.958, sum=11.571 (5)", - "tab": "General information", - "score": 2.3142312634447153 - } - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - 
"international_citizenship_questions", - "proa" - ] - } - } - }, - { - "evaluation_name": "MedQA", - "source_data": { - "dataset_name": "MedQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MedQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.64, - "details": { - "description": "min=0.64, mean=0.64, max=0.64, sum=0.64 (1)", - "tab": "Accuracy", - "MedQA - Observed inference time (s)": { - "description": "min=0.288, mean=0.288, max=0.288, sum=0.288 (1)", - "tab": "Efficiency", - "score": 0.2881786700034473 - }, - "MedQA - # eval": { - "description": "min=503, mean=503, max=503, sum=503 (1)", - "tab": "General information", - "score": 503.0 - }, - "MedQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "MedQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MedQA - # prompt tokens": { - "description": "min=1052.485, mean=1052.485, max=1052.485, sum=1052.485 (1)", - "tab": "General information", - "score": 1052.4850894632207 - }, - "MedQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WMT 2014", - "source_data": { - "dataset_name": "WMT 2014", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.192, - "details": { - "description": "min=0.133, mean=0.192, max=0.232, sum=0.962 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time (s)": { - "description": "min=0.839, mean=0.882, max=0.896, sum=4.411 (5)", - "tab": "Efficiency", - "score": 0.882270189100544 - }, - "WMT 2014 - # eval": { - "description": "min=503, mean=568.8, max=832, sum=2844 (5)", - "tab": "General information", - "score": 568.8 - }, - "WMT 2014 - # train": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "WMT 2014 - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "WMT 2014 - # prompt tokens": { - "description": "min=124.855, mean=142.657, max=158.373, sum=713.283 (5)", - "tab": "General information", - "score": 142.65662658663405 - }, - "WMT 2014 - # output tokens": { - "description": "min=25.499, mean=26.949, max=27.529, sum=134.744 (5)", - "tab": "General information", - "score": 26.94872734745374 - } - } - }, - "generation_config": { - "additional_details": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_lite/qwen/qwen1.5-14b/6b2891bd-2444-4286-8ccf-c91181856d29.json b/data/helm_lite/qwen/qwen1.5-14b/6b2891bd-2444-4286-8ccf-c91181856d29.json deleted file mode 100644 index c8749e5f5..000000000 --- a/data/helm_lite/qwen/qwen1.5-14b/6b2891bd-2444-4286-8ccf-c91181856d29.json +++ /dev/null @@ -1,641 +0,0 @@ 
-{ - "schema_version": "0.2.0", - "evaluation_id": "helm_lite/qwen_qwen1.5-14b/1770834614.1822479", - "retrieved_timestamp": "1770834614.1822479", - "source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen1.5 14B", - "id": "qwen/qwen1.5-14b", - "developer": "qwen", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_lite", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.425, - "details": { - "tab": "Accuracy", - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.6941198501872659 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.711, - "details": { - "description": "min=0.711, mean=0.711, max=0.711, sum=0.711 (1)", - "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": { - "description": "min=0.987, mean=0.987, max=0.987, sum=0.987 (1)", - "tab": "Efficiency", - "score": 0.986717187183004 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=3468.913, mean=3468.913, max=3468.913, sum=3468.913 (1)", - "tab": "General information", - "score": 3468.912676056338 - }, - "NarrativeQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (closed-book)", - "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3, - "details": { - "description": "min=0.3, mean=0.3, max=0.3, sum=0.3 (1)", - "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time 
(s)": { - "description": "min=0.679, mean=0.679, max=0.679, sum=0.679 (1)", - "tab": "Efficiency", - "score": 0.6790921592712402 - }, - "NaturalQuestions (closed-book) - Observed inference time (s)": { - "description": "min=0.373, mean=0.373, max=0.373, sum=0.373 (1)", - "tab": "Efficiency", - "score": 0.3734231026172638 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1990.955, mean=1990.955, max=1990.955, sum=1990.955 (1)", - "tab": "General information", - "score": 1990.955 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=119.262, mean=119.262, max=119.262, sum=119.262 (1)", - "tab": "General information", - "score": 119.262 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "mode": "closedbook" - } - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.862, - "details": { - "description": "min=0.862, mean=0.862, max=0.862, sum=0.862 (1)", - "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": { - "description": "min=0.285, mean=0.285, max=0.285, sum=0.285 (1)", - "tab": "Efficiency", - "score": 0.2849515151977539 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=242.846, mean=242.846, max=242.846, sum=242.846 (1)", - "tab": "General information", - "score": 242.846 - }, - "OpenbookQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - 
"additional_details": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.626, - "details": { - "description": "min=0.4, mean=0.626, max=0.87, sum=3.131 (5)", - "tab": "Accuracy", - "MMLU - Observed inference time (s)": { - "description": "min=0.285, mean=0.31, max=0.335, sum=1.549 (5)", - "tab": "Efficiency", - "score": 0.30986739750075765 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=371.19, mean=470.836, max=620.939, sum=2354.179 (5)", - "tab": "General information", - "score": 470.8357192982456 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MATH", - "source_data": { - "dataset_name": "MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.686, - "details": { - "description": "min=0.6, mean=0.686, max=0.8, sum=4.8 (7)", - "tab": "Accuracy", - "MATH - Observed inference time (s)": { - "description": "min=4.789, mean=4.932, max=5.055, sum=34.522 (7)", - "tab": "Efficiency", - "score": 4.931704092498438 - }, - "MATH - # eval": { - "description": "min=30, mean=62.429, max=135, sum=437 (7)", - "tab": "General information", - "score": 62.42857142857143 - }, - "MATH - # train": { - "description": "min=8, mean=8, max=8, sum=56 (7)", - "tab": "General information", - "score": 8.0 - }, - "MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "MATH - # prompt tokens": { - "description": "min=937.926, mean=1323.837, max=2246.673, sum=9266.858 (7)", - "tab": "General information", - "score": 1323.836848955025 - }, - "MATH - # output tokens": { - "description": "min=1, mean=1, max=1, sum=7 (7)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" - } - } - }, - { - "evaluation_name": "GSM8K", - "source_data": { - 
"dataset_name": "GSM8K", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on GSM8K", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.693, - "details": { - "description": "min=0.693, mean=0.693, max=0.693, sum=0.693 (1)", - "tab": "Accuracy", - "GSM8K - Observed inference time (s)": { - "description": "min=1.966, mean=1.966, max=1.966, sum=1.966 (1)", - "tab": "Efficiency", - "score": 1.965628466129303 - }, - "GSM8K - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "GSM8K - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "GSM8K - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GSM8K - # prompt tokens": { - "description": "min=1130.403, mean=1130.403, max=1130.403, sum=1130.403 (1)", - "tab": "General information", - "score": 1130.403 - }, - "GSM8K - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "LegalBench", - "source_data": { - "dataset_name": "LegalBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on LegalBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.593, - "details": { - "description": "min=0.358, mean=0.593, max=0.853, sum=2.966 (5)", - "tab": "Accuracy", - "LegalBench - Observed inference time (s)": { - "description": "min=0.332, mean=0.544, max=1.352, sum=2.722 (5)", - "tab": "Efficiency", - "score": 0.5443530451858324 - }, - "LegalBench - # eval": { - "description": "min=95, mean=409.4, max=1000, sum=2047 (5)", - "tab": "General information", - "score": 409.4 - }, - "LegalBench - # train": { - "description": "min=4, mean=4.8, max=5, sum=24 (5)", - "tab": "General information", - "score": 4.8 - }, - "LegalBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "LegalBench - # prompt tokens": { - "description": "min=192.453, mean=1542.088, max=6430.714, sum=7710.442 (5)", - "tab": "General information", - "score": 1542.0883229968654 - }, - "LegalBench - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] - } - } - }, - { - "evaluation_name": "MedQA", - "source_data": { - "dataset_name": "MedQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MedQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.515, - "details": { - "description": "min=0.515, mean=0.515, max=0.515, sum=0.515 (1)", - "tab": "Accuracy", - "MedQA - Observed inference time (s)": { - "description": "min=0.326, mean=0.326, max=0.326, sum=0.326 (1)", - "tab": "Efficiency", - "score": 0.3256318408025662 - }, - "MedQA - # eval": { - "description": "min=503, mean=503, max=503, sum=503 (1)", - "tab": "General information", - "score": 503.0 - }, - "MedQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "MedQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MedQA - # prompt tokens": { - "description": "min=1045.485, mean=1045.485, max=1045.485, sum=1045.485 (1)", - "tab": "General information", - "score": 1045.4850894632207 - }, - "MedQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WMT 2014", - "source_data": { - "dataset_name": "WMT 2014", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.178, - "details": { - "description": "min=0.101, mean=0.178, max=0.23, sum=0.89 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time (s)": { - "description": "min=0.59, mean=0.606, max=0.617, sum=3.032 (5)", - "tab": "Efficiency", - "score": 0.606455911532908 - }, - "WMT 2014 - # eval": { - "description": "min=503, mean=568.8, max=832, sum=2844 (5)", - "tab": "General information", - "score": 568.8 - }, - "WMT 2014 - # train": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "WMT 2014 - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "WMT 2014 - # prompt tokens": { - "description": "min=108.855, mean=126.657, max=142.373, sum=633.283 (5)", - "tab": "General information", - "score": 126.65662658663405 - }, - "WMT 2014 - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_lite/qwen/qwen1.5-32b/bd924bd3-e13c-48e0-b339-8c15c5072038.json b/data/helm_lite/qwen/qwen1.5-32b/bd924bd3-e13c-48e0-b339-8c15c5072038.json deleted file mode 100644 index 699c1515b..000000000 --- a/data/helm_lite/qwen/qwen1.5-32b/bd924bd3-e13c-48e0-b339-8c15c5072038.json +++ /dev/null @@ -1,641 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_lite/qwen_qwen1.5-32b/1770834614.1822479", - "retrieved_timestamp": "1770834614.1822479", - "source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen1.5 32B", - "id": "qwen/qwen1.5-32b", - "developer": "qwen", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - 
"source_data": { - "dataset_name": "helm_lite", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.546, - "details": { - "tab": "Accuracy", - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.47831460674157306 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.589, - "details": { - "description": "min=0.589, mean=0.589, max=0.589, sum=0.589 (1)", - "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": { - "description": "min=1.848, mean=1.848, max=1.848, sum=1.848 (1)", - "tab": "Efficiency", - "score": 1.847580643774758 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=3468.913, mean=3468.913, max=3468.913, sum=3468.913 (1)", - "tab": "General information", - "score": 3468.912676056338 - }, - "NarrativeQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (closed-book)", - "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.353, - "details": { - "description": "min=0.353, mean=0.353, max=0.353, sum=0.353 (1)", - "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time (s)": { - "description": "min=1.139, mean=1.139, max=1.139, sum=1.139 (1)", - "tab": "Efficiency", - "score": 1.1394575798511506 - }, - "NaturalQuestions (closed-book) - Observed inference time (s)": { - "description": "min=0.457, mean=0.457, max=0.457, sum=0.457 (1)", - "tab": "Efficiency", - "score": 0.457463458776474 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions 
(open-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1990.955, mean=1990.955, max=1990.955, sum=1990.955 (1)", - "tab": "General information", - "score": 1990.955 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=119.262, mean=119.262, max=119.262, sum=119.262 (1)", - "tab": "General information", - "score": 119.262 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "mode": "closedbook" - } - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.932, - "details": { - "description": "min=0.932, mean=0.932, max=0.932, sum=0.932 (1)", - "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": { - "description": "min=0.352, mean=0.352, max=0.352, sum=0.352 (1)", - "tab": "Efficiency", - "score": 0.3515647969245911 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=242.846, mean=242.846, max=242.846, sum=242.846 (1)", - "tab": "General information", - "score": 242.846 - }, - "OpenbookQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.628, - "details": { - "description": "min=0.4, mean=0.628, max=0.91, sum=3.141 (5)", - "tab": "Accuracy", - "MMLU - Observed inference time (s)": { - "description": "min=0.337, mean=0.345, max=0.367, sum=1.724 (5)", - "tab": "Efficiency", - "score": 0.34482146733267266 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=371.19, mean=470.836, max=620.939, sum=2354.179 (5)", - "tab": "General information", - "score": 470.8357192982456 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MATH", - "source_data": { - "dataset_name": "MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.733, - "details": { - "description": "min=0.5, mean=0.733, max=0.859, sum=5.132 (7)", - "tab": "Accuracy", - "MATH - Observed inference time (s)": { - "description": "min=8.668, mean=9.437, max=10.496, sum=66.058 (7)", - "tab": "Efficiency", - "score": 9.436887120006455 - }, - "MATH - # eval": { - "description": "min=30, mean=62.429, max=135, sum=437 (7)", - "tab": "General information", - "score": 62.42857142857143 - }, - "MATH - # train": { - "description": "min=8, mean=8, max=8, sum=56 (7)", - "tab": "General information", - "score": 8.0 - }, - "MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "MATH - # prompt tokens": { - "description": "min=937.926, mean=1323.837, max=2246.673, sum=9266.858 (7)", - "tab": "General information", - "score": 1323.836848955025 - }, - "MATH - # output tokens": { - "description": "min=1, mean=1, max=1, sum=7 (7)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" - } - } - }, - { - "evaluation_name": "GSM8K", - "source_data": { - "dataset_name": "GSM8K", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on GSM8K", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.773, - "details": { - "description": "min=0.773, mean=0.773, max=0.773, sum=0.773 (1)", - "tab": "Accuracy", - "GSM8K 
- Observed inference time (s)": { - "description": "min=3.406, mean=3.406, max=3.406, sum=3.406 (1)", - "tab": "Efficiency", - "score": 3.405816124200821 - }, - "GSM8K - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "GSM8K - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "GSM8K - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GSM8K - # prompt tokens": { - "description": "min=1130.403, mean=1130.403, max=1130.403, sum=1130.403 (1)", - "tab": "General information", - "score": 1130.403 - }, - "GSM8K - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "LegalBench", - "source_data": { - "dataset_name": "LegalBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on LegalBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.636, - "details": { - "description": "min=0.417, mean=0.636, max=0.926, sum=3.179 (5)", - "tab": "Accuracy", - "LegalBench - Observed inference time (s)": { - "description": "min=0.371, mean=0.789, max=2.33, sum=3.947 (5)", - "tab": "Efficiency", - "score": 0.7894946821991368 - }, - "LegalBench - # eval": { - "description": "min=95, mean=409.4, max=1000, sum=2047 (5)", - "tab": "General information", - "score": 409.4 - }, - "LegalBench - # train": { - "description": "min=4, mean=4.8, max=5, sum=24 (5)", - "tab": "General information", - "score": 4.8 - }, - "LegalBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "LegalBench - # prompt tokens": { - "description": "min=192.453, mean=1542.088, max=6430.714, sum=7710.442 (5)", - "tab": "General information", - "score": 1542.0883229968654 - }, - "LegalBench - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] - } - } - }, - { - "evaluation_name": "MedQA", - "source_data": { - "dataset_name": "MedQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MedQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.656, - "details": { - "description": "min=0.656, mean=0.656, max=0.656, sum=0.656 (1)", - "tab": "Accuracy", - "MedQA - Observed inference time (s)": { - "description": "min=0.452, mean=0.452, max=0.452, sum=0.452 (1)", - "tab": "Efficiency", - "score": 0.4515474046437925 - }, - "MedQA - # eval": { - "description": "min=503, mean=503, max=503, sum=503 (1)", - "tab": "General information", - "score": 503.0 - }, - "MedQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - 
"tab": "General information", - "score": 5.0 - }, - "MedQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MedQA - # prompt tokens": { - "description": "min=1045.485, mean=1045.485, max=1045.485, sum=1045.485 (1)", - "tab": "General information", - "score": 1045.4850894632207 - }, - "MedQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WMT 2014", - "source_data": { - "dataset_name": "WMT 2014", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.193, - "details": { - "description": "min=0.129, mean=0.193, max=0.242, sum=0.967 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time (s)": { - "description": "min=0.902, mean=0.92, max=0.952, sum=4.6 (5)", - "tab": "Efficiency", - "score": 0.9200148107330449 - }, - "WMT 2014 - # eval": { - "description": "min=503, mean=568.8, max=832, sum=2844 (5)", - "tab": "General information", - "score": 568.8 - }, - "WMT 2014 - # train": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "WMT 2014 - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "WMT 2014 - # prompt tokens": { - "description": "min=108.855, mean=126.657, max=142.373, sum=633.283 (5)", - "tab": "General information", - "score": 126.65662658663405 - }, - "WMT 2014 - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_lite/qwen/qwen1.5-72b/b8a6f32a-9904-43bb-9add-89404093a9db.json b/data/helm_lite/qwen/qwen1.5-72b/b8a6f32a-9904-43bb-9add-89404093a9db.json deleted file mode 100644 index 8b347b68d..000000000 --- a/data/helm_lite/qwen/qwen1.5-72b/b8a6f32a-9904-43bb-9add-89404093a9db.json +++ /dev/null @@ -1,641 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_lite/qwen_qwen1.5-72b/1770834614.1822479", - "retrieved_timestamp": "1770834614.1822479", - "source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen1.5 72B", - "id": "qwen/qwen1.5-72b", - "developer": "qwen", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_lite", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.608, - "details": { - "tab": "Accuracy", - 
"Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.3881398252184769 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.601, - "details": { - "description": "min=0.601, mean=0.601, max=0.601, sum=0.601 (1)", - "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": { - "description": "min=2.437, mean=2.437, max=2.437, sum=2.437 (1)", - "tab": "Efficiency", - "score": 2.4371175302586083 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=4.994, mean=4.994, max=4.994, sum=4.994 (1)", - "tab": "General information", - "score": 4.994366197183099 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=3465.859, mean=3465.859, max=3465.859, sum=3465.859 (1)", - "tab": "General information", - "score": 3465.8591549295775 - }, - "NarrativeQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (closed-book)", - "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.417, - "details": { - "description": "min=0.417, mean=0.417, max=0.417, sum=0.417 (1)", - "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time (s)": { - "description": "min=1.421, mean=1.421, max=1.421, sum=1.421 (1)", - "tab": "Efficiency", - "score": 1.4208379020690918 - }, - "NaturalQuestions (closed-book) - Observed inference time (s)": { - "description": "min=0.577, mean=0.577, max=0.577, sum=0.577 (1)", - "tab": "Efficiency", - "score": 0.5770996954441071 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.863, mean=4.863, max=4.863, sum=4.863 (1)", - "tab": "General information", - "score": 4.863 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.022, mean=0.022, max=0.022, sum=0.022 (1)", - "tab": "General information", - "score": 0.022 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1846.221, mean=1846.221, max=1846.221, sum=1846.221 (1)", - "tab": "General 
information", - "score": 1846.221 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=119.262, mean=119.262, max=119.262, sum=119.262 (1)", - "tab": "General information", - "score": 119.262 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "mode": "closedbook" - } - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.93, - "details": { - "description": "min=0.93, mean=0.93, max=0.93, sum=0.93 (1)", - "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": { - "description": "min=0.338, mean=0.338, max=0.338, sum=0.338 (1)", - "tab": "Efficiency", - "score": 0.3381467695236206 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=242.846, mean=242.846, max=242.846, sum=242.846 (1)", - "tab": "General information", - "score": 242.846 - }, - "OpenbookQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.647, - "details": { - "description": "min=0.44, mean=0.647, max=0.94, sum=3.234 (5)", - "tab": "Accuracy", - "MMLU - Observed inference time (s)": { - "description": "min=0.338, mean=0.364, max=0.396, sum=1.819 (5)", - "tab": "Efficiency", - "score": 0.3638015921659637 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - 
"MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=371.19, mean=470.836, max=620.939, sum=2354.179 (5)", - "tab": "General information", - "score": 470.8357192982456 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MATH", - "source_data": { - "dataset_name": "MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.683, - "details": { - "description": "min=0.6, mean=0.683, max=0.763, sum=4.784 (7)", - "tab": "Accuracy", - "MATH - Observed inference time (s)": { - "description": "min=10.776, mean=11.813, max=12.91, sum=82.688 (7)", - "tab": "Efficiency", - "score": 11.812623854443027 - }, - "MATH - # eval": { - "description": "min=30, mean=62.429, max=135, sum=437 (7)", - "tab": "General information", - "score": 62.42857142857143 - }, - "MATH - # train": { - "description": "min=8, mean=8, max=8, sum=56 (7)", - "tab": "General information", - "score": 8.0 - }, - "MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "MATH - # prompt tokens": { - "description": "min=937.926, mean=1323.837, max=2246.673, sum=9266.858 (7)", - "tab": "General information", - "score": 1323.836848955025 - }, - "MATH - # output tokens": { - "description": "min=1, mean=1, max=1, sum=7 (7)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" - } - } - }, - { - "evaluation_name": "GSM8K", - "source_data": { - "dataset_name": "GSM8K", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on GSM8K", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.799, - "details": { - "description": "min=0.799, mean=0.799, max=0.799, sum=0.799 (1)", - "tab": "Accuracy", - "GSM8K - Observed inference time (s)": { - "description": "min=4.587, mean=4.587, max=4.587, sum=4.587 (1)", - "tab": "Efficiency", - "score": 4.5866835827827455 - }, - "GSM8K - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "GSM8K - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "GSM8K - 
truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GSM8K - # prompt tokens": { - "description": "min=1130.403, mean=1130.403, max=1130.403, sum=1130.403 (1)", - "tab": "General information", - "score": 1130.403 - }, - "GSM8K - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "LegalBench", - "source_data": { - "dataset_name": "LegalBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on LegalBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.694, - "details": { - "description": "min=0.425, mean=0.694, max=0.958, sum=3.469 (5)", - "tab": "Accuracy", - "LegalBench - Observed inference time (s)": { - "description": "min=0.426, mean=0.878, max=1.58, sum=4.392 (5)", - "tab": "Efficiency", - "score": 0.8783966223148776 - }, - "LegalBench - # eval": { - "description": "min=95, mean=409.4, max=1000, sum=2047 (5)", - "tab": "General information", - "score": 409.4 - }, - "LegalBench - # train": { - "description": "min=2.253, mean=4.251, max=5, sum=21.253 (5)", - "tab": "General information", - "score": 4.25061224489796 - }, - "LegalBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "LegalBench - # prompt tokens": { - "description": "min=192.453, mean=940.377, max=3422.157, sum=4701.884 (5)", - "tab": "General information", - "score": 940.3768944254368 - }, - "LegalBench - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] - } - } - }, - { - "evaluation_name": "MedQA", - "source_data": { - "dataset_name": "MedQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MedQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.67, - "details": { - "description": "min=0.67, mean=0.67, max=0.67, sum=0.67 (1)", - "tab": "Accuracy", - "MedQA - Observed inference time (s)": { - "description": "min=0.543, mean=0.543, max=0.543, sum=0.543 (1)", - "tab": "Efficiency", - "score": 0.5430597031329782 - }, - "MedQA - # eval": { - "description": "min=503, mean=503, max=503, sum=503 (1)", - "tab": "General information", - "score": 503.0 - }, - "MedQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "MedQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MedQA - # prompt tokens": { - "description": "min=1045.485, mean=1045.485, max=1045.485, sum=1045.485 (1)", - "tab": "General information", - "score": 1045.4850894632207 - }, - "MedQA - # output tokens": { - "description": "min=1, mean=1, 
max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WMT 2014", - "source_data": { - "dataset_name": "WMT 2014", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.201, - "details": { - "description": "min=0.14, mean=0.201, max=0.255, sum=1.006 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time (s)": { - "description": "min=1.148, mean=1.187, max=1.205, sum=5.933 (5)", - "tab": "Efficiency", - "score": 1.1866255830765444 - }, - "WMT 2014 - # eval": { - "description": "min=503, mean=568.8, max=832, sum=2844 (5)", - "tab": "General information", - "score": 568.8 - }, - "WMT 2014 - # train": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "WMT 2014 - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "WMT 2014 - # prompt tokens": { - "description": "min=108.855, mean=126.657, max=142.373, sum=633.283 (5)", - "tab": "General information", - "score": 126.65662658663405 - }, - "WMT 2014 - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_lite/qwen/qwen1.5-7b/c49e4b98-49c5-485b-8f16-0eeed2d9cd82.json b/data/helm_lite/qwen/qwen1.5-7b/c49e4b98-49c5-485b-8f16-0eeed2d9cd82.json deleted file mode 100644 index b1bc89d92..000000000 --- a/data/helm_lite/qwen/qwen1.5-7b/c49e4b98-49c5-485b-8f16-0eeed2d9cd82.json +++ /dev/null @@ -1,641 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_lite/qwen_qwen1.5-7b/1770834614.1822479", - "retrieved_timestamp": "1770834614.1822479", - "source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen1.5 7B", - "id": "qwen/qwen1.5-7b", - "developer": "qwen", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_lite", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.275, - "details": { - "tab": "Accuracy", - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.8087765293383271 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": 
[ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.448, - "details": { - "description": "min=0.448, mean=0.448, max=0.448, sum=0.448 (1)", - "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": { - "description": "min=0.855, mean=0.855, max=0.855, sum=0.855 (1)", - "tab": "Efficiency", - "score": 0.8547548650016248 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=3468.913, mean=3468.913, max=3468.913, sum=3468.913 (1)", - "tab": "General information", - "score": 3468.912676056338 - }, - "NarrativeQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (closed-book)", - "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.27, - "details": { - "description": "min=0.27, mean=0.27, max=0.27, sum=0.27 (1)", - "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time (s)": { - "description": "min=0.479, mean=0.479, max=0.479, sum=0.479 (1)", - "tab": "Efficiency", - "score": 0.4786673946380615 - }, - "NaturalQuestions (closed-book) - Observed inference time (s)": { - "description": "min=0.354, mean=0.354, max=0.354, sum=0.354 (1)", - "tab": "Efficiency", - "score": 0.354404949426651 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1990.955, mean=1990.955, max=1990.955, sum=1990.955 (1)", - "tab": "General information", - "score": 1990.955 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 
5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=119.262, mean=119.262, max=119.262, sum=119.262 (1)", - "tab": "General information", - "score": 119.262 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "mode": "closedbook" - } - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.806, - "details": { - "description": "min=0.806, mean=0.806, max=0.806, sum=0.806 (1)", - "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": { - "description": "min=0.281, mean=0.281, max=0.281, sum=0.281 (1)", - "tab": "Efficiency", - "score": 0.2806105532646179 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=242.846, mean=242.846, max=242.846, sum=242.846 (1)", - "tab": "General information", - "score": 242.846 - }, - "OpenbookQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.569, - "details": { - "description": "min=0.39, mean=0.569, max=0.84, sum=2.847 (5)", - "tab": "Accuracy", - "MMLU - Observed inference time (s)": { - "description": "min=0.281, mean=0.289, max=0.298, sum=1.447 (5)", - "tab": "Efficiency", - "score": 0.28946571837810053 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=371.19, mean=470.836, max=620.939, sum=2354.179 (5)", - "tab": "General information", - "score": 470.8357192982456 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 
(5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MATH", - "source_data": { - "dataset_name": "MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.561, - "details": { - "description": "min=0.462, mean=0.561, max=0.726, sum=3.928 (7)", - "tab": "Accuracy", - "MATH - Observed inference time (s)": { - "description": "min=2.593, mean=2.933, max=3.209, sum=20.53 (7)", - "tab": "Efficiency", - "score": 2.9328109453469335 - }, - "MATH - # eval": { - "description": "min=30, mean=62.429, max=135, sum=437 (7)", - "tab": "General information", - "score": 62.42857142857143 - }, - "MATH - # train": { - "description": "min=8, mean=8, max=8, sum=56 (7)", - "tab": "General information", - "score": 8.0 - }, - "MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "MATH - # prompt tokens": { - "description": "min=937.926, mean=1323.837, max=2246.673, sum=9266.858 (7)", - "tab": "General information", - "score": 1323.836848955025 - }, - "MATH - # output tokens": { - "description": "min=1, mean=1, max=1, sum=7 (7)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" - } - } - }, - { - "evaluation_name": "GSM8K", - "source_data": { - "dataset_name": "GSM8K", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on GSM8K", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6, - "details": { - "description": "min=0.6, mean=0.6, max=0.6, sum=0.6 (1)", - "tab": "Accuracy", - "GSM8K - Observed inference time (s)": { - "description": "min=1.381, mean=1.381, max=1.381, sum=1.381 (1)", - "tab": "Efficiency", - "score": 1.380831289768219 - }, - "GSM8K - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "GSM8K - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "GSM8K - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GSM8K - # prompt tokens": { - "description": "min=1130.403, mean=1130.403, max=1130.403, sum=1130.403 (1)", - "tab": "General information", - "score": 1130.403 - }, - "GSM8K - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": 
"LegalBench", - "source_data": { - "dataset_name": "LegalBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on LegalBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.523, - "details": { - "description": "min=0.253, mean=0.523, max=0.716, sum=2.614 (5)", - "tab": "Accuracy", - "LegalBench - Observed inference time (s)": { - "description": "min=0.298, mean=0.44, max=0.946, sum=2.2 (5)", - "tab": "Efficiency", - "score": 0.4400657887452306 - }, - "LegalBench - # eval": { - "description": "min=95, mean=409.4, max=1000, sum=2047 (5)", - "tab": "General information", - "score": 409.4 - }, - "LegalBench - # train": { - "description": "min=4, mean=4.8, max=5, sum=24 (5)", - "tab": "General information", - "score": 4.8 - }, - "LegalBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "LegalBench - # prompt tokens": { - "description": "min=192.453, mean=1542.088, max=6430.714, sum=7710.442 (5)", - "tab": "General information", - "score": 1542.0883229968654 - }, - "LegalBench - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] - } - } - }, - { - "evaluation_name": "MedQA", - "source_data": { - "dataset_name": "MedQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MedQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.479, - "details": { - "description": "min=0.479, mean=0.479, max=0.479, sum=0.479 (1)", - "tab": "Accuracy", - "MedQA - Observed inference time (s)": { - "description": "min=0.298, mean=0.298, max=0.298, sum=0.298 (1)", - "tab": "Efficiency", - "score": 0.2983713296962306 - }, - "MedQA - # eval": { - "description": "min=503, mean=503, max=503, sum=503 (1)", - "tab": "General information", - "score": 503.0 - }, - "MedQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "MedQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MedQA - # prompt tokens": { - "description": "min=1045.485, mean=1045.485, max=1045.485, sum=1045.485 (1)", - "tab": "General information", - "score": 1045.4850894632207 - }, - "MedQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WMT 2014", - "source_data": { - "dataset_name": "WMT 2014", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.153, - "details": { - "description": "min=0.082, mean=0.153, max=0.19, sum=0.767 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time (s)": { - "description": "min=0.461, mean=0.484, max=0.517, sum=2.421 (5)", - "tab": "Efficiency", - "score": 0.4841760334465496 - }, - "WMT 2014 - # eval": { - "description": "min=503, mean=568.8, max=832, sum=2844 (5)", - "tab": "General information", - "score": 568.8 - }, - "WMT 2014 - # train": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "WMT 2014 - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "WMT 2014 - # prompt tokens": { - "description": "min=108.855, mean=126.657, max=142.373, sum=633.283 (5)", - "tab": "General information", - "score": 126.65662658663405 - }, - "WMT 2014 - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_lite/qwen/qwen2-72b-instruct/9c1fc50a-437d-458b-926c-33cabdcc4aeb.json b/data/helm_lite/qwen/qwen2-72b-instruct/9c1fc50a-437d-458b-926c-33cabdcc4aeb.json deleted file mode 100644 index 58edcde03..000000000 --- a/data/helm_lite/qwen/qwen2-72b-instruct/9c1fc50a-437d-458b-926c-33cabdcc4aeb.json +++ /dev/null @@ -1,643 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_lite/qwen_qwen2-72b-instruct/1770834614.1822479", - "retrieved_timestamp": "1770834614.1822479", - "source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2 Instruct 72B", - "id": "qwen/qwen2-72b-instruct", - "developer": "qwen", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_lite", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.77, - "details": { - "tab": "Accuracy", - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.592421972534332 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.727, - "details": { - "description": "min=0.727, mean=0.727, max=0.727, sum=0.727 (1)", - "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": { 
- "description": "min=1.19, mean=1.19, max=1.19, sum=1.19 (1)", - "tab": "Efficiency", - "score": 1.1896146727279877 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=3502.913, mean=3502.913, max=3502.913, sum=3502.913 (1)", - "tab": "General information", - "score": 3502.912676056338 - }, - "NarrativeQA - # output tokens": { - "description": "min=11.642, mean=11.642, max=11.642, sum=11.642 (1)", - "tab": "General information", - "score": 11.64225352112676 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (closed-book)", - "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.39, - "details": { - "description": "min=0.39, mean=0.39, max=0.39, sum=0.39 (1)", - "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time (s)": { - "description": "min=0.868, mean=0.868, max=0.868, sum=0.868 (1)", - "tab": "Efficiency", - "score": 0.8683992192745209 - }, - "NaturalQuestions (closed-book) - Observed inference time (s)": { - "description": "min=0.356, mean=0.356, max=0.356, sum=0.356 (1)", - "tab": "Efficiency", - "score": 0.35628414297103883 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=2017.955, mean=2017.955, max=2017.955, sum=2017.955 (1)", - "tab": "General information", - "score": 2017.955 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=9.044, mean=9.044, max=9.044, sum=9.044 (1)", - "tab": "General information", - "score": 9.044 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=146.262, mean=146.262, max=146.262, sum=146.262 (1)", - "tab": "General information", - "score": 146.262 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=6.433, 
mean=6.433, max=6.433, sum=6.433 (1)", - "tab": "General information", - "score": 6.433 - } - } - }, - "generation_config": { - "additional_details": { - "mode": "closedbook" - } - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.954, - "details": { - "description": "min=0.954, mean=0.954, max=0.954, sum=0.954 (1)", - "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": { - "description": "min=0.218, mean=0.218, max=0.218, sum=0.218 (1)", - "tab": "Efficiency", - "score": 0.21781798839569091 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=249.846, mean=249.846, max=249.846, sum=249.846 (1)", - "tab": "General information", - "score": 249.846 - }, - "OpenbookQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.769, - "details": { - "description": "min=0.65, mean=0.769, max=0.94, sum=3.847 (5)", - "tab": "Accuracy", - "MMLU - Observed inference time (s)": { - "description": "min=0.195, mean=0.277, max=0.395, sum=1.385 (5)", - "tab": "Efficiency", - "score": 0.2769099538284435 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=378.19, mean=477.836, max=627.939, sum=2389.179 (5)", - "tab": "General information", - "score": 477.8357192982456 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MATH", - "source_data": { - "dataset_name": "MATH", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.79, - "details": { - "description": "min=0.605, mean=0.79, max=0.93, sum=5.533 (7)", - "tab": "Accuracy", - "MATH - Observed inference time (s)": { - "description": "min=3.599, mean=4.461, max=5.828, sum=31.228 (7)", - "tab": "Efficiency", - "score": 4.461141077844028 - }, - "MATH - # eval": { - "description": "min=30, mean=62.429, max=135, sum=437 (7)", - "tab": "General information", - "score": 62.42857142857143 - }, - "MATH - # train": { - "description": "min=8, mean=8, max=8, sum=56 (7)", - "tab": "General information", - "score": 8.0 - }, - "MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "MATH - # prompt tokens": { - "description": "min=937.926, mean=1323.837, max=2246.673, sum=9266.858 (7)", - "tab": "General information", - "score": 1323.836848955025 - }, - "MATH - # output tokens": { - "description": "min=145.36, mean=173.894, max=202.346, sum=1217.257 (7)", - "tab": "General information", - "score": 173.89384019579856 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" - } - } - }, - { - "evaluation_name": "GSM8K", - "source_data": { - "dataset_name": "GSM8K", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on GSM8K", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.92, - "details": { - "description": "min=0.92, mean=0.92, max=0.92, sum=0.92 (1)", - "tab": "Accuracy", - "GSM8K - Observed inference time (s)": { - "description": "min=6.592, mean=6.592, max=6.592, sum=6.592 (1)", - "tab": "Efficiency", - "score": 6.592170278310776 - }, - "GSM8K - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "GSM8K - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "GSM8K - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GSM8K - # prompt tokens": { - "description": "min=1130.403, mean=1130.403, max=1130.403, sum=1130.403 (1)", - "tab": "General information", - "score": 1130.403 - }, - "GSM8K - # output tokens": { - "description": "min=166.4, mean=166.4, max=166.4, sum=166.4 (1)", - "tab": "General information", - "score": 166.4 - } - } - }, - "generation_config": { - "additional_details": { - "stop": "none" - } - } - }, - { - "evaluation_name": "LegalBench", - "source_data": { - "dataset_name": "LegalBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on LegalBench", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.712, - "details": { - "description": "min=0.411, mean=0.712, max=0.947, sum=3.559 (5)", - "tab": "Accuracy", - "LegalBench - Observed inference time (s)": { - "description": "min=0.233, mean=0.521, max=1.575, sum=2.605 (5)", - "tab": "Efficiency", - "score": 0.5210018908984072 - }, - "LegalBench - # eval": { - "description": "min=95, mean=409.4, max=1000, sum=2047 (5)", - "tab": "General information", - "score": 409.4 - }, - "LegalBench - # train": { - "description": "min=4, mean=4.8, max=5, sum=24 (5)", - "tab": "General information", - "score": 4.8 - }, - "LegalBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "LegalBench - # prompt tokens": { - "description": "min=207.453, mean=1557.088, max=6445.714, sum=7785.442 (5)", - "tab": "General information", - "score": 1557.0883229968654 - }, - "LegalBench - # output tokens": { - "description": "min=2, mean=2.299, max=3.042, sum=11.494 (5)", - "tab": "General information", - "score": 2.2988842678904344 - } - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] - } - } - }, - { - "evaluation_name": "MedQA", - "source_data": { - "dataset_name": "MedQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MedQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.746, - "details": { - "description": "min=0.746, mean=0.746, max=0.746, sum=0.746 (1)", - "tab": "Accuracy", - "MedQA - Observed inference time (s)": { - "description": "min=0.535, mean=0.535, max=0.535, sum=0.535 (1)", - "tab": "Efficiency", - "score": 0.5349795590812122 - }, - "MedQA - # eval": { - "description": "min=503, mean=503, max=503, sum=503 (1)", - "tab": "General information", - "score": 503.0 - }, - "MedQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "MedQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MedQA - # prompt tokens": { - "description": "min=1052.485, mean=1052.485, max=1052.485, sum=1052.485 (1)", - "tab": "General information", - "score": 1052.4850894632207 - }, - "MedQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WMT 2014", - "source_data": { - "dataset_name": "WMT 2014", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.207, - "details": { - "description": "min=0.156, mean=0.207, max=0.255, sum=1.033 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time (s)": { - "description": "min=0.802, mean=0.827, max=0.86, sum=4.135 (5)", - "tab": 
"Efficiency", - "score": 0.8269615642193179 - }, - "WMT 2014 - # eval": { - "description": "min=503, mean=568.8, max=832, sum=2844 (5)", - "tab": "General information", - "score": 568.8 - }, - "WMT 2014 - # train": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "WMT 2014 - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "WMT 2014 - # prompt tokens": { - "description": "min=124.855, mean=142.657, max=158.373, sum=713.283 (5)", - "tab": "General information", - "score": 142.65662658663405 - }, - "WMT 2014 - # output tokens": { - "description": "min=25.368, mean=27.029, max=27.714, sum=135.143 (5)", - "tab": "General information", - "score": 27.028530260743235 - } - } - }, - "generation_config": { - "additional_details": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_lite/qwen/qwen2.5-72b-instruct-turbo/5e0e911a-79b0-46fe-88eb-f9ae8cbdd642.json b/data/helm_lite/qwen/qwen2.5-72b-instruct-turbo/5e0e911a-79b0-46fe-88eb-f9ae8cbdd642.json deleted file mode 100644 index 3e08a0cdf..000000000 --- a/data/helm_lite/qwen/qwen2.5-72b-instruct-turbo/5e0e911a-79b0-46fe-88eb-f9ae8cbdd642.json +++ /dev/null @@ -1,644 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_lite/qwen_qwen2.5-72b-instruct-turbo/1770834614.1822479", - "retrieved_timestamp": "1770834614.1822479", - "source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5 Instruct Turbo 72B", - "id": "qwen/qwen2.5-72b-instruct-turbo", - "developer": "qwen", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_lite", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.745, - "details": { - "tab": "Accuracy", - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.5851310861423221 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.745, - "details": { - "description": "min=0.745, mean=0.745, max=0.745, sum=0.745 (1)", - "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": { - "description": "min=0.853, mean=0.853, max=0.853, sum=0.853 (1)", - "tab": "Efficiency", - "score": 0.8528219290182624 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, 
sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=3492.913, mean=3492.913, max=3492.913, sum=3492.913 (1)", - "tab": "General information", - "score": 3492.912676056338 - }, - "NarrativeQA - # output tokens": { - "description": "min=8.718, mean=8.718, max=8.718, sum=8.718 (1)", - "tab": "General information", - "score": 8.71830985915493 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (closed-book)", - "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.359, - "details": { - "description": "min=0.359, mean=0.359, max=0.359, sum=0.359 (1)", - "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time (s)": { - "description": "min=0.974, mean=0.974, max=0.974, sum=0.974 (1)", - "tab": "Efficiency", - "score": 0.9738211624622345 - }, - "NaturalQuestions (closed-book) - Observed inference time (s)": { - "description": "min=0.506, mean=0.506, max=0.506, sum=0.506 (1)", - "tab": "Efficiency", - "score": 0.5063141629695892 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=2007.955, mean=2007.955, max=2007.955, sum=2007.955 (1)", - "tab": "General information", - "score": 2007.955 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=17.681, mean=17.681, max=17.681, sum=17.681 (1)", - "tab": "General information", - "score": 17.681 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=136.262, mean=136.262, max=136.262, sum=136.262 (1)", - "tab": "General information", - "score": 136.262 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=15.132, mean=15.132, max=15.132, sum=15.132 (1)", - "tab": "General information", - "score": 15.132 - } - } - }, - "generation_config": { - "additional_details": { - "mode": "closedbook" - } - } - }, - { - 
"evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.962, - "details": { - "description": "min=0.962, mean=0.962, max=0.962, sum=0.962 (1)", - "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": { - "description": "min=0.372, mean=0.372, max=0.372, sum=0.372 (1)", - "tab": "Efficiency", - "score": 0.3723496675491333 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=249.846, mean=249.846, max=249.846, sum=249.846 (1)", - "tab": "General information", - "score": 249.846 - }, - "OpenbookQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.77, - "details": { - "description": "min=0.62, mean=0.77, max=0.96, sum=3.848 (5)", - "tab": "Accuracy", - "MMLU - Observed inference time (s)": { - "description": "min=0.438, mean=0.585, max=0.815, sum=2.924 (5)", - "tab": "Efficiency", - "score": 0.5848997679509614 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=378.19, mean=477.836, max=627.939, sum=2389.179 (5)", - "tab": "General information", - "score": 477.8357192982456 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MATH", - "source_data": { - "dataset_name": "MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.884, - "details": { - "description": "min=0.763, mean=0.884, max=0.97, sum=6.187 (7)", - "tab": "Accuracy", - "MATH - Observed inference time (s)": { - "description": "min=3.874, mean=6.367, max=11.192, sum=44.569 (7)", - "tab": "Efficiency", - "score": 6.366941373965945 - }, - "MATH - # eval": { - "description": "min=30, mean=62.429, max=135, sum=437 (7)", - "tab": "General information", - "score": 62.42857142857143 - }, - "MATH - # train": { - "description": "min=8, mean=8, max=8, sum=56 (7)", - "tab": "General information", - "score": 8.0 - }, - "MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "MATH - # prompt tokens": { - "description": "min=937.926, mean=1323.837, max=2246.673, sum=9266.858 (7)", - "tab": "General information", - "score": 1323.836848955025 - }, - "MATH - # output tokens": { - "description": "min=147.558, mean=186.764, max=230.288, sum=1307.351 (7)", - "tab": "General information", - "score": 186.76438709076407 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" - } - } - }, - { - "evaluation_name": "GSM8K", - "source_data": { - "dataset_name": "GSM8K", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on GSM8K", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9, - "details": { - "description": "min=0.9, mean=0.9, max=0.9, sum=0.9 (1)", - "tab": "Accuracy", - "GSM8K - Observed inference time (s)": { - "description": "min=2.558, mean=2.558, max=2.558, sum=2.558 (1)", - "tab": "Efficiency", - "score": 2.5583292784690856 - }, - "GSM8K - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "GSM8K - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "GSM8K - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GSM8K - # prompt tokens": { - "description": "min=1130.403, mean=1130.403, max=1130.403, sum=1130.403 (1)", - "tab": "General information", - "score": 1130.403 - }, - "GSM8K - # output tokens": { - "description": "min=198.303, mean=198.303, max=198.303, sum=198.303 (1)", - "tab": "General information", - "score": 198.303 - } - } - }, - "generation_config": { - "additional_details": { - "stop": "none" - } - } - }, - { - "evaluation_name": "LegalBench", - "source_data": { - "dataset_name": "LegalBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on LegalBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.74, - "details": { - "description": "min=0.46, mean=0.74, max=0.979, sum=3.7 (5)", - "tab": 
"Accuracy", - "LegalBench - Observed inference time (s)": { - "description": "min=0.306, mean=0.445, max=0.944, sum=2.224 (5)", - "tab": "Efficiency", - "score": 0.44489043568091446 - }, - "LegalBench - # eval": { - "description": "min=95, mean=409.4, max=1000, sum=2047 (5)", - "tab": "General information", - "score": 409.4 - }, - "LegalBench - # train": { - "description": "min=4, mean=4.8, max=5, sum=24 (5)", - "tab": "General information", - "score": 4.8 - }, - "LegalBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "LegalBench - # prompt tokens": { - "description": "min=216.453, mean=1558.888, max=6440.714, sum=7794.442 (5)", - "tab": "General information", - "score": 1558.8883229968653 - }, - "LegalBench - # output tokens": { - "description": "min=2, mean=2.453, max=3.021, sum=12.263 (5)", - "tab": "General information", - "score": 2.452587326627195 - } - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ], - "stop": "none" - } - } - }, - { - "evaluation_name": "MedQA", - "source_data": { - "dataset_name": "MedQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MedQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.753, - "details": { - "description": "min=0.753, mean=0.753, max=0.753, sum=0.753 (1)", - "tab": "Accuracy", - "MedQA - Observed inference time (s)": { - "description": "min=0.332, mean=0.332, max=0.332, sum=0.332 (1)", - "tab": "Efficiency", - "score": 0.33223102912751157 - }, - "MedQA - # eval": { - "description": "min=503, mean=503, max=503, sum=503 (1)", - "tab": "General information", - "score": 503.0 - }, - "MedQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "MedQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MedQA - # prompt tokens": { - "description": "min=1052.485, mean=1052.485, max=1052.485, sum=1052.485 (1)", - "tab": "General information", - "score": 1052.4850894632207 - }, - "MedQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WMT 2014", - "source_data": { - "dataset_name": "WMT 2014", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.207, - "details": { - "description": "min=0.153, mean=0.207, max=0.257, sum=1.033 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time (s)": { - "description": "min=0.635, mean=0.67, max=0.752, sum=3.351 (5)", - "tab": "Efficiency", - "score": 0.6702916101891663 - }, - "WMT 2014 - # eval": { - "description": "min=503, mean=568.8, max=832, sum=2844 (5)", - "tab": "General information", - "score": 568.8 - }, 
- "WMT 2014 - # train": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "WMT 2014 - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "WMT 2014 - # prompt tokens": { - "description": "min=114.855, mean=132.657, max=148.373, sum=663.283 (5)", - "tab": "General information", - "score": 132.65662658663405 - }, - "WMT 2014 - # output tokens": { - "description": "min=25.517, mean=27.126, max=27.755, sum=135.631 (5)", - "tab": "General information", - "score": 27.126178505887747 - } - } - }, - "generation_config": { - "additional_details": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_lite/qwen/qwen2.5-7b-instruct-turbo/10e1abfa-83de-4960-8d4c-c5099894cb80.json b/data/helm_lite/qwen/qwen2.5-7b-instruct-turbo/10e1abfa-83de-4960-8d4c-c5099894cb80.json deleted file mode 100644 index 3f844c281..000000000 --- a/data/helm_lite/qwen/qwen2.5-7b-instruct-turbo/10e1abfa-83de-4960-8d4c-c5099894cb80.json +++ /dev/null @@ -1,644 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_lite/qwen_qwen2.5-7b-instruct-turbo/1770834614.1822479", - "retrieved_timestamp": "1770834614.1822479", - "source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5 Instruct Turbo 7B", - "id": "qwen/qwen2.5-7b-instruct-turbo", - "developer": "qwen", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_lite", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.488, - "details": { - "tab": "Accuracy", - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.8808988764044944 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.742, - "details": { - "description": "min=0.742, mean=0.742, max=0.742, sum=0.742 (1)", - "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": { - "description": "min=0.516, mean=0.516, max=0.516, sum=0.516 (1)", - "tab": "Efficiency", - "score": 0.5156192410160119 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - 
}, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=3492.913, mean=3492.913, max=3492.913, sum=3492.913 (1)", - "tab": "General information", - "score": 3492.912676056338 - }, - "NarrativeQA - # output tokens": { - "description": "min=5.549, mean=5.549, max=5.549, sum=5.549 (1)", - "tab": "General information", - "score": 5.549295774647887 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (closed-book)", - "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.205, - "details": { - "description": "min=0.205, mean=0.205, max=0.205, sum=0.205 (1)", - "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time (s)": { - "description": "min=0.301, mean=0.301, max=0.301, sum=0.301 (1)", - "tab": "Efficiency", - "score": 0.30121764993667605 - }, - "NaturalQuestions (closed-book) - Observed inference time (s)": { - "description": "min=0.217, mean=0.217, max=0.217, sum=0.217 (1)", - "tab": "Efficiency", - "score": 0.21686342740058898 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=2007.955, mean=2007.955, max=2007.955, sum=2007.955 (1)", - "tab": "General information", - "score": 2007.955 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=8.698, mean=8.698, max=8.698, sum=8.698 (1)", - "tab": "General information", - "score": 8.698 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=136.262, mean=136.262, max=136.262, sum=136.262 (1)", - "tab": "General information", - "score": 136.262 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=7.041, mean=7.041, max=7.041, sum=7.041 (1)", - "tab": "General information", - "score": 7.041 - } - } - }, - "generation_config": { - "additional_details": { - "mode": "closedbook" - } - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.862, - "details": { - "description": "min=0.862, mean=0.862, max=0.862, sum=0.862 (1)", - "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": { - "description": "min=0.186, mean=0.186, max=0.186, sum=0.186 (1)", - "tab": "Efficiency", - "score": 0.1863201789855957 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=249.846, mean=249.846, max=249.846, sum=249.846 (1)", - "tab": "General information", - "score": 249.846 - }, - "OpenbookQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.658, - "details": { - "description": "min=0.49, mean=0.658, max=0.86, sum=3.29 (5)", - "tab": "Accuracy", - "MMLU - Observed inference time (s)": { - "description": "min=0.285, mean=0.35, max=0.431, sum=1.751 (5)", - "tab": "Efficiency", - "score": 0.35013260537699653 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=378.19, mean=477.836, max=627.939, sum=2389.179 (5)", - "tab": "General information", - "score": 477.8357192982456 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MATH", - "source_data": { - "dataset_name": "MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.835, - "details": { - "description": "min=0.684, mean=0.835, max=0.963, sum=5.846 (7)", - "tab": "Accuracy", - "MATH - Observed inference time (s)": { - "description": "min=1.449, mean=1.825, max=2.345, sum=12.778 (7)", - "tab": "Efficiency", - "score": 1.8253796190803115 - }, - "MATH - # eval": { - "description": "min=30, mean=62.429, max=135, sum=437 (7)", - "tab": "General information", - "score": 62.42857142857143 - }, - "MATH - # train": { - "description": "min=8, mean=8, max=8, sum=56 (7)", - "tab": "General information", - "score": 8.0 - }, - "MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "MATH - # prompt tokens": { - "description": "min=937.926, mean=1323.837, max=2246.673, sum=9266.858 (7)", - "tab": "General information", - "score": 1323.836848955025 - }, - "MATH - # output tokens": { - "description": "min=156.674, mean=196.898, max=240.288, sum=1378.285 (7)", - "tab": "General information", - "score": 196.8978610559394 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" - } - } - }, - { - "evaluation_name": "GSM8K", - "source_data": { - "dataset_name": "GSM8K", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on GSM8K", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.83, - "details": { - "description": "min=0.83, mean=0.83, max=0.83, sum=0.83 (1)", - "tab": "Accuracy", - "GSM8K - Observed inference time (s)": { - "description": "min=1.7, mean=1.7, max=1.7, sum=1.7 (1)", - "tab": "Efficiency", - "score": 1.7000067098140716 - }, - "GSM8K - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "GSM8K - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "GSM8K - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GSM8K - # prompt tokens": { - "description": "min=1130.403, mean=1130.403, max=1130.403, sum=1130.403 (1)", - "tab": "General information", - "score": 1130.403 - }, - "GSM8K - # output tokens": { - "description": "min=194.776, mean=194.776, max=194.776, sum=194.776 (1)", - "tab": "General information", - "score": 194.776 - } - } - }, - "generation_config": { - "additional_details": { - "stop": "none" - } - } - }, - { - "evaluation_name": "LegalBench", - "source_data": { - "dataset_name": "LegalBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on LegalBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.632, - "details": { - "description": "min=0.414, mean=0.632, max=0.916, sum=3.161 (5)", - "tab": "Accuracy", - "LegalBench - Observed inference time (s)": { - "description": "min=0.183, mean=0.261, max=0.489, sum=1.305 
(5)", - "tab": "Efficiency", - "score": 0.2609495958632719 - }, - "LegalBench - # eval": { - "description": "min=95, mean=409.4, max=1000, sum=2047 (5)", - "tab": "General information", - "score": 409.4 - }, - "LegalBench - # train": { - "description": "min=4, mean=4.8, max=5, sum=24 (5)", - "tab": "General information", - "score": 4.8 - }, - "LegalBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "LegalBench - # prompt tokens": { - "description": "min=216.453, mean=1558.888, max=6440.714, sum=7794.442 (5)", - "tab": "General information", - "score": 1558.8883229968653 - }, - "LegalBench - # output tokens": { - "description": "min=2, mean=2.402, max=3.084, sum=12.008 (5)", - "tab": "General information", - "score": 2.4015832496773273 - } - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ], - "stop": "none" - } - } - }, - { - "evaluation_name": "MedQA", - "source_data": { - "dataset_name": "MedQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MedQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6, - "details": { - "description": "min=0.6, mean=0.6, max=0.6, sum=0.6 (1)", - "tab": "Accuracy", - "MedQA - Observed inference time (s)": { - "description": "min=0.201, mean=0.201, max=0.201, sum=0.201 (1)", - "tab": "Efficiency", - "score": 0.20058301760709546 - }, - "MedQA - # eval": { - "description": "min=503, mean=503, max=503, sum=503 (1)", - "tab": "General information", - "score": 503.0 - }, - "MedQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "MedQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MedQA - # prompt tokens": { - "description": "min=1052.485, mean=1052.485, max=1052.485, sum=1052.485 (1)", - "tab": "General information", - "score": 1052.4850894632207 - }, - "MedQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WMT 2014", - "source_data": { - "dataset_name": "WMT 2014", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.155, - "details": { - "description": "min=0.085, mean=0.155, max=0.204, sum=0.777 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time (s)": { - "description": "min=0.346, mean=0.376, max=0.414, sum=1.88 (5)", - "tab": "Efficiency", - "score": 0.3759268445955365 - }, - "WMT 2014 - # eval": { - "description": "min=503, mean=568.8, max=832, sum=2844 (5)", - "tab": "General information", - "score": 568.8 - }, - "WMT 2014 - # train": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - 
"WMT 2014 - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "WMT 2014 - # prompt tokens": { - "description": "min=114.855, mean=132.657, max=148.373, sum=663.283 (5)", - "tab": "General information", - "score": 132.65662658663405 - }, - "WMT 2014 - # output tokens": { - "description": "min=26.946, mean=27.742, max=28.649, sum=138.709 (5)", - "tab": "General information", - "score": 27.74173612173115 - } - } - }, - "generation_config": { - "additional_details": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_lite/snowflake/snowflake-arctic-instruct/40aa244f-a5dd-4e02-9ca5-6edaf755b79f.json b/data/helm_lite/snowflake/snowflake-arctic-instruct/40aa244f-a5dd-4e02-9ca5-6edaf755b79f.json deleted file mode 100644 index 09f377d89..000000000 --- a/data/helm_lite/snowflake/snowflake-arctic-instruct/40aa244f-a5dd-4e02-9ca5-6edaf755b79f.json +++ /dev/null @@ -1,643 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_lite/snowflake_snowflake-arctic-instruct/1770834614.1822479", - "retrieved_timestamp": "1770834614.1822479", - "source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Arctic Instruct", - "id": "snowflake/snowflake-arctic-instruct", - "developer": "snowflake", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_lite", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.338, - "details": { - "tab": "Accuracy", - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.7606242197253433 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.654, - "details": { - "description": "min=0.654, mean=0.654, max=0.654, sum=0.654 (1)", - "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": { - "description": "min=0.624, mean=0.624, max=0.624, sum=0.624 (1)", - "tab": "Efficiency", - "score": 0.6239793220036466 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=4.262, mean=4.262, max=4.262, sum=4.262 (1)", - "tab": "General information", - "score": 4.261971830985916 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 
(1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=3603.217, mean=3603.217, max=3603.217, sum=3603.217 (1)", - "tab": "General information", - "score": 3603.2169014084507 - }, - "NarrativeQA - # output tokens": { - "description": "min=11.907, mean=11.907, max=11.907, sum=11.907 (1)", - "tab": "General information", - "score": 11.907042253521126 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (closed-book)", - "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.39, - "details": { - "description": "min=0.39, mean=0.39, max=0.39, sum=0.39 (1)", - "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time (s)": { - "description": "min=0.636, mean=0.636, max=0.636, sum=0.636 (1)", - "tab": "Efficiency", - "score": 0.6355201268196106 - }, - "NaturalQuestions (closed-book) - Observed inference time (s)": { - "description": "min=0.469, mean=0.469, max=0.469, sum=0.469 (1)", - "tab": "Efficiency", - "score": 0.4687326259613037 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.825, mean=4.825, max=4.825, sum=4.825 (1)", - "tab": "General information", - "score": 4.825 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.028, mean=0.028, max=0.028, sum=0.028 (1)", - "tab": "General information", - "score": 0.028 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=2311.514, mean=2311.514, max=2311.514, sum=2311.514 (1)", - "tab": "General information", - "score": 2311.514 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=18.701, mean=18.701, max=18.701, sum=18.701 (1)", - "tab": "General information", - "score": 18.701 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=166.383, mean=166.383, max=166.383, sum=166.383 (1)", - "tab": "General information", - "score": 166.383 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=14.473, mean=14.473, max=14.473, sum=14.473 (1)", - "tab": "General information", - "score": 14.473 - } - } - }, - "generation_config": { - "additional_details": { - "mode": "closedbook" - } - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - 
}, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.828, - "details": { - "description": "min=0.828, mean=0.828, max=0.828, sum=0.828 (1)", - "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": { - "description": "min=0.284, mean=0.284, max=0.284, sum=0.284 (1)", - "tab": "Efficiency", - "score": 0.2840936713218689 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=291.574, mean=291.574, max=291.574, sum=291.574 (1)", - "tab": "General information", - "score": 291.574 - }, - "OpenbookQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.575, - "details": { - "description": "min=0.31, mean=0.575, max=0.88, sum=2.876 (5)", - "tab": "Accuracy", - "MMLU - Observed inference time (s)": { - "description": "min=0.293, mean=0.303, max=0.317, sum=1.516 (5)", - "tab": "Efficiency", - "score": 0.30325288054817606 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=406.65, mean=531.547, max=693.675, sum=2657.735 (5)", - "tab": "General information", - "score": 531.5470877192982 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MATH", - "source_data": { - "dataset_name": "MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.519, - "details": { - "description": "min=0.316, mean=0.519, max=0.785, sum=3.636 (7)", - "tab": "Accuracy", - 
"MATH - Observed inference time (s)": { - "description": "min=1.482, mean=1.724, max=1.995, sum=12.068 (7)", - "tab": "Efficiency", - "score": 1.723981539653867 - }, - "MATH - # eval": { - "description": "min=30, mean=62.429, max=135, sum=437 (7)", - "tab": "General information", - "score": 62.42857142857143 - }, - "MATH - # train": { - "description": "min=8, mean=8, max=8, sum=56 (7)", - "tab": "General information", - "score": 8.0 - }, - "MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "MATH - # prompt tokens": { - "description": "min=971.652, mean=1438.636, max=2490.962, sum=10070.453 (7)", - "tab": "General information", - "score": 1438.6362030100095 - }, - "MATH - # output tokens": { - "description": "min=82.872, mean=98.802, max=122.233, sum=691.615 (7)", - "tab": "General information", - "score": 98.80208187931566 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" - } - } - }, - { - "evaluation_name": "GSM8K", - "source_data": { - "dataset_name": "GSM8K", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on GSM8K", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.768, - "details": { - "description": "min=0.768, mean=0.768, max=0.768, sum=0.768 (1)", - "tab": "Accuracy", - "GSM8K - Observed inference time (s)": { - "description": "min=2.961, mean=2.961, max=2.961, sum=2.961 (1)", - "tab": "Efficiency", - "score": 2.9610197002887726 - }, - "GSM8K - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "GSM8K - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "GSM8K - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GSM8K - # prompt tokens": { - "description": "min=1207.746, mean=1207.746, max=1207.746, sum=1207.746 (1)", - "tab": "General information", - "score": 1207.746 - }, - "GSM8K - # output tokens": { - "description": "min=189.305, mean=189.305, max=189.305, sum=189.305 (1)", - "tab": "General information", - "score": 189.305 - } - } - }, - "generation_config": { - "additional_details": { - "stop": "none" - } - } - }, - { - "evaluation_name": "LegalBench", - "source_data": { - "dataset_name": "LegalBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on LegalBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.588, - "details": { - "description": "min=0.351, mean=0.588, max=0.874, sum=2.94 (5)", - "tab": "Accuracy", - "LegalBench - Observed inference time (s)": { - "description": "min=0.292, mean=0.346, max=0.462, sum=1.729 (5)", - "tab": "Efficiency", - "score": 0.34576316386866485 - }, - "LegalBench - # eval": { - "description": 
"min=95, mean=409.4, max=1000, sum=2047 (5)", - "tab": "General information", - "score": 409.4 - }, - "LegalBench - # train": { - "description": "min=1.81, mean=4.162, max=5, sum=20.81 (5)", - "tab": "General information", - "score": 4.162040816326531 - }, - "LegalBench - truncated": { - "description": "min=0, mean=0.002, max=0.008, sum=0.008 (5)", - "tab": "General information", - "score": 0.0016326530612244899 - }, - "LegalBench - # prompt tokens": { - "description": "min=239.137, mean=1024.722, max=3561.237, sum=5123.61 (5)", - "tab": "General information", - "score": 1024.7220443430492 - }, - "LegalBench - # output tokens": { - "description": "min=2, mean=2.438, max=3.421, sum=12.188 (5)", - "tab": "General information", - "score": 2.4375592890361366 - } - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] - } - } - }, - { - "evaluation_name": "MedQA", - "source_data": { - "dataset_name": "MedQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MedQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.581, - "details": { - "description": "min=0.581, mean=0.581, max=0.581, sum=0.581 (1)", - "tab": "Accuracy", - "MedQA - Observed inference time (s)": { - "description": "min=0.313, mean=0.313, max=0.313, sum=0.313 (1)", - "tab": "Efficiency", - "score": 0.31300480038697864 - }, - "MedQA - # eval": { - "description": "min=503, mean=503, max=503, sum=503 (1)", - "tab": "General information", - "score": 503.0 - }, - "MedQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "MedQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MedQA - # prompt tokens": { - "description": "min=1243.901, mean=1243.901, max=1243.901, sum=1243.901 (1)", - "tab": "General information", - "score": 1243.9005964214712 - }, - "MedQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WMT 2014", - "source_data": { - "dataset_name": "WMT 2014", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.172, - "details": { - "description": "min=0.09, mean=0.172, max=0.217, sum=0.86 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time (s)": { - "description": "min=0.65, mean=0.681, max=0.702, sum=3.405 (5)", - "tab": "Efficiency", - "score": 0.681007040066764 - }, - "WMT 2014 - # eval": { - "description": "min=503, mean=568.8, max=832, sum=2844 (5)", - "tab": "General information", - "score": 568.8 - }, - "WMT 2014 - # train": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "WMT 2014 - truncated": { - "description": "min=0, mean=0, max=0, 
sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "WMT 2014 - # prompt tokens": { - "description": "min=145.523, mean=160.288, max=182.972, sum=801.438 (5)", - "tab": "General information", - "score": 160.28751290334915 - }, - "WMT 2014 - # output tokens": { - "description": "min=28.596, mean=30.59, max=31.485, sum=152.951 (5)", - "tab": "General information", - "score": 30.59012702630372 - } - } - }, - "generation_config": { - "additional_details": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_lite/tiiuae/falcon-40b/2abf3bb8-a78f-4a59-807e-52da4e6426fd.json b/data/helm_lite/tiiuae/falcon-40b/2abf3bb8-a78f-4a59-807e-52da4e6426fd.json deleted file mode 100644 index 2bf240f96..000000000 --- a/data/helm_lite/tiiuae/falcon-40b/2abf3bb8-a78f-4a59-807e-52da4e6426fd.json +++ /dev/null @@ -1,641 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_lite/tiiuae_falcon-40b/1770834614.1822479", - "retrieved_timestamp": "1770834614.1822479", - "source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Falcon 40B", - "id": "tiiuae/falcon-40b", - "developer": "tiiuae", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_lite", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.217, - "details": { - "tab": "Accuracy", - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.086729088639201 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.671, - "details": { - "description": "min=0.671, mean=0.671, max=0.671, sum=0.671 (1)", - "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": { - "description": "min=4.985, mean=4.985, max=4.985, sum=4.985 (1)", - "tab": "Efficiency", - "score": 4.985411514362819 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=2.023, mean=2.023, max=2.023, sum=2.023 (1)", - "tab": "General information", - "score": 2.0225352112676056 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=1692.33, mean=1692.33, max=1692.33, sum=1692.33 
(1)", - "tab": "General information", - "score": 1692.3295774647888 - }, - "NarrativeQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (closed-book)", - "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.392, - "details": { - "description": "min=0.392, mean=0.392, max=0.392, sum=0.392 (1)", - "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time (s)": { - "description": "min=3.184, mean=3.184, max=3.184, sum=3.184 (1)", - "tab": "Efficiency", - "score": 3.184468511581421 - }, - "NaturalQuestions (closed-book) - Observed inference time (s)": { - "description": "min=2.849, mean=2.849, max=2.849, sum=2.849 (1)", - "tab": "Efficiency", - "score": 2.848947753429413 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.598, mean=4.598, max=4.598, sum=4.598 (1)", - "tab": "General information", - "score": 4.598 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.039, mean=0.039, max=0.039, sum=0.039 (1)", - "tab": "General information", - "score": 0.039 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1586.717, mean=1586.717, max=1586.717, sum=1586.717 (1)", - "tab": "General information", - "score": 1586.717 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=0.991, mean=0.991, max=0.991, sum=0.991 (1)", - "tab": "General information", - "score": 0.991 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=124.246, mean=124.246, max=124.246, sum=124.246 (1)", - "tab": "General information", - "score": 124.246 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "mode": "closedbook" - } - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.662, - "details": { 
- "description": "min=0.662, mean=0.662, max=0.662, sum=0.662 (1)", - "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": { - "description": "min=1.268, mean=1.268, max=1.268, sum=1.268 (1)", - "tab": "Efficiency", - "score": 1.268236391544342 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=251.174, mean=251.174, max=251.174, sum=251.174 (1)", - "tab": "General information", - "score": 251.174 - }, - "OpenbookQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.507, - "details": { - "description": "min=0.31, mean=0.507, max=0.79, sum=2.535 (5)", - "tab": "Accuracy", - "MMLU - Observed inference time (s)": { - "description": "min=1.176, mean=1.431, max=1.805, sum=7.154 (5)", - "tab": "Efficiency", - "score": 1.4308063889804639 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=389.6, mean=500.12, max=664.281, sum=2500.601 (5)", - "tab": "General information", - "score": 500.12014035087725 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MATH", - "source_data": { - "dataset_name": "MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.128, - "details": { - "description": "min=0.019, mean=0.128, max=0.228, sum=0.893 (7)", - "tab": "Accuracy", - "MATH - Observed inference time (s)": { - "description": "min=7.555, mean=11.414, max=18.723, sum=79.896 (7)", - "tab": "Efficiency", - "score": 11.413689562224084 - }, - "MATH - # eval": { - "description": "min=30, mean=62.429, 
max=135, sum=437 (7)", - "tab": "General information", - "score": 62.42857142857143 - }, - "MATH - # train": { - "description": "min=2.385, mean=6.818, max=8, sum=47.727 (7)", - "tab": "General information", - "score": 6.818102949681896 - }, - "MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "MATH - # prompt tokens": { - "description": "min=965.096, mean=1150.049, max=1495.447, sum=8050.346 (7)", - "tab": "General information", - "score": 1150.0493709178531 - }, - "MATH - # output tokens": { - "description": "min=1, mean=1, max=1, sum=7 (7)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" - } - } - }, - { - "evaluation_name": "GSM8K", - "source_data": { - "dataset_name": "GSM8K", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on GSM8K", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.267, - "details": { - "description": "min=0.267, mean=0.267, max=0.267, sum=0.267 (1)", - "tab": "Accuracy", - "GSM8K - Observed inference time (s)": { - "description": "min=12.967, mean=12.967, max=12.967, sum=12.967 (1)", - "tab": "Efficiency", - "score": 12.967224577903748 - }, - "GSM8K - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "GSM8K - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "GSM8K - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GSM8K - # prompt tokens": { - "description": "min=1056.967, mean=1056.967, max=1056.967, sum=1056.967 (1)", - "tab": "General information", - "score": 1056.967 - }, - "GSM8K - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "LegalBench", - "source_data": { - "dataset_name": "LegalBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on LegalBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.442, - "details": { - "description": "min=0.204, mean=0.442, max=0.737, sum=2.209 (5)", - "tab": "Accuracy", - "LegalBench - Observed inference time (s)": { - "description": "min=1.333, mean=1.731, max=3.174, sum=8.654 (5)", - "tab": "Efficiency", - "score": 1.730808089747147 - }, - "LegalBench - # eval": { - "description": "min=95, mean=409.4, max=1000, sum=2047 (5)", - "tab": "General information", - "score": 409.4 - }, - "LegalBench - # train": { - "description": "min=0.265, mean=3.853, max=5, sum=19.265 (5)", - "tab": "General information", - "score": 3.853061224489796 - }, - "LegalBench - truncated": 
{ - "description": "min=0, mean=0.003, max=0.016, sum=0.016 (5)", - "tab": "General information", - "score": 0.0032653061224489797 - }, - "LegalBench - # prompt tokens": { - "description": "min=211.284, mean=566.694, max=1486.482, sum=2833.468 (5)", - "tab": "General information", - "score": 566.6935553560819 - }, - "LegalBench - # output tokens": { - "description": "min=0.876, mean=0.975, max=1, sum=4.876 (5)", - "tab": "General information", - "score": 0.9751020408163266 - } - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] - } - } - }, - { - "evaluation_name": "MedQA", - "source_data": { - "dataset_name": "MedQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MedQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.419, - "details": { - "description": "min=0.419, mean=0.419, max=0.419, sum=0.419 (1)", - "tab": "Accuracy", - "MedQA - Observed inference time (s)": { - "description": "min=2.203, mean=2.203, max=2.203, sum=2.203 (1)", - "tab": "Efficiency", - "score": 2.202825612149703 - }, - "MedQA - # eval": { - "description": "min=503, mean=503, max=503, sum=503 (1)", - "tab": "General information", - "score": 503.0 - }, - "MedQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "MedQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MedQA - # prompt tokens": { - "description": "min=1048.624, mean=1048.624, max=1048.624, sum=1048.624 (1)", - "tab": "General information", - "score": 1048.624254473161 - }, - "MedQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WMT 2014", - "source_data": { - "dataset_name": "WMT 2014", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.162, - "details": { - "description": "min=0.017, mean=0.162, max=0.208, sum=0.809 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time (s)": { - "description": "min=2.468, mean=3.098, max=4.642, sum=15.491 (5)", - "tab": "Efficiency", - "score": 3.0981059579736714 - }, - "WMT 2014 - # eval": { - "description": "min=503, mean=568.8, max=832, sum=2844 (5)", - "tab": "General information", - "score": 568.8 - }, - "WMT 2014 - # train": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "WMT 2014 - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "WMT 2014 - # prompt tokens": { - "description": "min=115.642, mean=162.454, max=224.817, sum=812.272 (5)", - "tab": "General information", - "score": 162.45444400902278 - }, - "WMT 2014 - # output tokens": { - 
"description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_lite/tiiuae/falcon-7b/ae28615a-b7fa-4782-89e1-4b8e4804dc62.json b/data/helm_lite/tiiuae/falcon-7b/ae28615a-b7fa-4782-89e1-4b8e4804dc62.json deleted file mode 100644 index 9a704269c..000000000 --- a/data/helm_lite/tiiuae/falcon-7b/ae28615a-b7fa-4782-89e1-4b8e4804dc62.json +++ /dev/null @@ -1,641 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_lite/tiiuae_falcon-7b/1770834614.1822479", - "retrieved_timestamp": "1770834614.1822479", - "source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Falcon 7B", - "id": "tiiuae/falcon-7b", - "developer": "tiiuae", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_lite", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.064, - "details": { - "tab": "Accuracy", - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.36905118601747816 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.621, - "details": { - "description": "min=0.621, mean=0.621, max=0.621, sum=0.621 (1)", - "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": { - "description": "min=1.141, mean=1.141, max=1.141, sum=1.141 (1)", - "tab": "Efficiency", - "score": 1.1411562691272144 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=2.023, mean=2.023, max=2.023, sum=2.023 (1)", - "tab": "General information", - "score": 2.0225352112676056 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=1692.33, mean=1692.33, max=1692.33, sum=1692.33 (1)", - "tab": "General information", - "score": 1692.3295774647888 - }, - "NarrativeQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": 
"NaturalQuestions (closed-book)", - "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.285, - "details": { - "description": "min=0.285, mean=0.285, max=0.285, sum=0.285 (1)", - "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time (s)": { - "description": "min=1.009, mean=1.009, max=1.009, sum=1.009 (1)", - "tab": "Efficiency", - "score": 1.0090243232250213 - }, - "NaturalQuestions (closed-book) - Observed inference time (s)": { - "description": "min=0.876, mean=0.876, max=0.876, sum=0.876 (1)", - "tab": "Efficiency", - "score": 0.8758702797889709 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.598, mean=4.598, max=4.598, sum=4.598 (1)", - "tab": "General information", - "score": 4.598 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.039, mean=0.039, max=0.039, sum=0.039 (1)", - "tab": "General information", - "score": 0.039 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1586.717, mean=1586.717, max=1586.717, sum=1586.717 (1)", - "tab": "General information", - "score": 1586.717 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=0.99, mean=0.99, max=0.99, sum=0.99 (1)", - "tab": "General information", - "score": 0.99 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=124.246, mean=124.246, max=124.246, sum=124.246 (1)", - "tab": "General information", - "score": 124.246 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "mode": "closedbook" - } - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.26, - "details": { - "description": "min=0.26, mean=0.26, max=0.26, sum=0.26 (1)", - "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": { - "description": "min=0.412, mean=0.412, max=0.412, sum=0.412 (1)", - "tab": "Efficiency", - "score": 0.4118037748336792 - }, - "OpenbookQA - # eval": { - "description": "min=500, 
mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=251.174, mean=251.174, max=251.174, sum=251.174 (1)", - "tab": "General information", - "score": 251.174 - }, - "OpenbookQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.288, - "details": { - "description": "min=0.17, mean=0.288, max=0.39, sum=1.441 (5)", - "tab": "Accuracy", - "MMLU - Observed inference time (s)": { - "description": "min=0.434, mean=0.475, max=0.497, sum=2.373 (5)", - "tab": "Efficiency", - "score": 0.47453500427279555 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=389.6, mean=500.12, max=664.281, sum=2500.601 (5)", - "tab": "General information", - "score": 500.12014035087725 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MATH", - "source_data": { - "dataset_name": "MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.044, - "details": { - "description": "min=0, mean=0.044, max=0.105, sum=0.307 (7)", - "tab": "Accuracy", - "MATH - Observed inference time (s)": { - "description": "min=5.445, mean=6.987, max=10.873, sum=48.91 (7)", - "tab": "Efficiency", - "score": 6.987098801445013 - }, - "MATH - # eval": { - "description": "min=30, mean=62.429, max=135, sum=437 (7)", - "tab": "General information", - "score": 62.42857142857143 - }, - "MATH - # train": { - "description": "min=2.385, mean=6.818, max=8, sum=47.727 (7)", - "tab": "General information", - "score": 6.818102949681896 - }, - "MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - 
"tab": "General information", - "score": 0.0 - }, - "MATH - # prompt tokens": { - "description": "min=965.096, mean=1150.049, max=1495.447, sum=8050.346 (7)", - "tab": "General information", - "score": 1150.0493709178531 - }, - "MATH - # output tokens": { - "description": "min=1, mean=1, max=1, sum=7 (7)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" - } - } - }, - { - "evaluation_name": "GSM8K", - "source_data": { - "dataset_name": "GSM8K", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on GSM8K", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.055, - "details": { - "description": "min=0.055, mean=0.055, max=0.055, sum=0.055 (1)", - "tab": "Accuracy", - "GSM8K - Observed inference time (s)": { - "description": "min=6.94, mean=6.94, max=6.94, sum=6.94 (1)", - "tab": "Efficiency", - "score": 6.940216990470886 - }, - "GSM8K - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "GSM8K - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "GSM8K - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GSM8K - # prompt tokens": { - "description": "min=1056.967, mean=1056.967, max=1056.967, sum=1056.967 (1)", - "tab": "General information", - "score": 1056.967 - }, - "GSM8K - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "LegalBench", - "source_data": { - "dataset_name": "LegalBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on LegalBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.346, - "details": { - "description": "min=0.12, mean=0.346, max=0.558, sum=1.731 (5)", - "tab": "Accuracy", - "LegalBench - Observed inference time (s)": { - "description": "min=0.453, mean=0.628, max=1.041, sum=3.139 (5)", - "tab": "Efficiency", - "score": 0.6278266410596228 - }, - "LegalBench - # eval": { - "description": "min=95, mean=409.4, max=1000, sum=2047 (5)", - "tab": "General information", - "score": 409.4 - }, - "LegalBench - # train": { - "description": "min=0.265, mean=3.853, max=5, sum=19.265 (5)", - "tab": "General information", - "score": 3.853061224489796 - }, - "LegalBench - truncated": { - "description": "min=0, mean=0.003, max=0.016, sum=0.016 (5)", - "tab": "General information", - "score": 0.0032653061224489797 - }, - "LegalBench - # prompt tokens": { - "description": "min=211.284, mean=566.694, max=1486.482, sum=2833.468 (5)", - "tab": "General information", - "score": 566.6935553560819 - }, - "LegalBench 
- # output tokens": { - "description": "min=0.982, mean=0.996, max=1, sum=4.982 (5)", - "tab": "General information", - "score": 0.9963265306122449 - } - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] - } - } - }, - { - "evaluation_name": "MedQA", - "source_data": { - "dataset_name": "MedQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MedQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.254, - "details": { - "description": "min=0.254, mean=0.254, max=0.254, sum=0.254 (1)", - "tab": "Accuracy", - "MedQA - Observed inference time (s)": { - "description": "min=0.735, mean=0.735, max=0.735, sum=0.735 (1)", - "tab": "Efficiency", - "score": 0.7352914724861889 - }, - "MedQA - # eval": { - "description": "min=503, mean=503, max=503, sum=503 (1)", - "tab": "General information", - "score": 503.0 - }, - "MedQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "MedQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MedQA - # prompt tokens": { - "description": "min=1048.624, mean=1048.624, max=1048.624, sum=1048.624 (1)", - "tab": "General information", - "score": 1048.624254473161 - }, - "MedQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WMT 2014", - "source_data": { - "dataset_name": "WMT 2014", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.094, - "details": { - "description": "min=0.0, mean=0.094, max=0.186, sum=0.471 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time (s)": { - "description": "min=1.05, mean=1.604, max=3.055, sum=8.019 (5)", - "tab": "Efficiency", - "score": 1.6038075838932468 - }, - "WMT 2014 - # eval": { - "description": "min=503, mean=568.8, max=832, sum=2844 (5)", - "tab": "General information", - "score": 568.8 - }, - "WMT 2014 - # train": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "WMT 2014 - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "WMT 2014 - # prompt tokens": { - "description": "min=115.642, mean=162.454, max=224.817, sum=812.272 (5)", - "tab": "General information", - "score": 162.45444400902278 - }, - "WMT 2014 - # output tokens": { - "description": "min=0.999, mean=1.0, max=1, sum=4.999 (5)", - "tab": "General information", - "score": 0.9997596153846153 - } - } - }, - "generation_config": { - "additional_details": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] - } - } - } - ] -} \ No newline at end of file diff --git 
a/data/helm_lite/upstage/solar-pro-241126/52bb6ab9-e80b-4bf0-a375-7706f16d311d.json b/data/helm_lite/upstage/solar-pro-241126/52bb6ab9-e80b-4bf0-a375-7706f16d311d.json deleted file mode 100644 index 1f111d01c..000000000 --- a/data/helm_lite/upstage/solar-pro-241126/52bb6ab9-e80b-4bf0-a375-7706f16d311d.json +++ /dev/null @@ -1,643 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_lite/upstage_solar-pro-241126/1770834614.1822479", - "retrieved_timestamp": "1770834614.1822479", - "source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Solar Pro", - "id": "upstage/solar-pro-241126", - "developer": "upstage", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_lite", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.602, - "details": { - "tab": "Accuracy", - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.4817852684144819 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.753, - "details": { - "description": "min=0.753, mean=0.753, max=0.753, sum=0.753 (1)", - "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": { - "description": "min=2.29, mean=2.29, max=2.29, sum=2.29 (1)", - "tab": "Efficiency", - "score": 2.2897866705773584 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=4063.606, mean=4063.606, max=4063.606, sum=4063.606 (1)", - "tab": "General information", - "score": 4063.605633802817 - }, - "NarrativeQA - # output tokens": { - "description": "min=5.972, mean=5.972, max=5.972, sum=5.972 (1)", - "tab": "General information", - "score": 5.971830985915493 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (closed-book)", - "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - 
"metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.297, - "details": { - "description": "min=0.297, mean=0.297, max=0.297, sum=0.297 (1)", - "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time (s)": { - "description": "min=1.102, mean=1.102, max=1.102, sum=1.102 (1)", - "tab": "Efficiency", - "score": 1.1022112455368043 - }, - "NaturalQuestions (closed-book) - Observed inference time (s)": { - "description": "min=0.588, mean=0.588, max=0.588, sum=0.588 (1)", - "tab": "Efficiency", - "score": 0.5883909621238709 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=2513.406, mean=2513.406, max=2513.406, sum=2513.406 (1)", - "tab": "General information", - "score": 2513.406 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=7.252, mean=7.252, max=7.252, sum=7.252 (1)", - "tab": "General information", - "score": 7.252 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=156.383, mean=156.383, max=156.383, sum=156.383 (1)", - "tab": "General information", - "score": 156.383 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=9.034, mean=9.034, max=9.034, sum=9.034 (1)", - "tab": "General information", - "score": 9.034 - } - } - }, - "generation_config": { - "additional_details": { - "mode": "closedbook" - } - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.922, - "details": { - "description": "min=0.922, mean=0.922, max=0.922, sum=0.922 (1)", - "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": { - "description": "min=0.431, mean=0.431, max=0.431, sum=0.431 (1)", - "tab": "Efficiency", - "score": 0.43103125095367434 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, 
mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=291.574, mean=291.574, max=291.574, sum=291.574 (1)", - "tab": "General information", - "score": 291.574 - }, - "OpenbookQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.679, - "details": { - "description": "min=0.46, mean=0.679, max=0.97, sum=3.395 (5)", - "tab": "Accuracy", - "MMLU - Observed inference time (s)": { - "description": "min=0.429, mean=0.529, max=0.765, sum=2.644 (5)", - "tab": "Efficiency", - "score": 0.5287977041361624 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=406.65, mean=531.547, max=693.675, sum=2657.735 (5)", - "tab": "General information", - "score": 531.5470877192982 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MATH", - "source_data": { - "dataset_name": "MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.567, - "details": { - "description": "min=0.421, mean=0.567, max=0.741, sum=3.968 (7)", - "tab": "Accuracy", - "MATH - Observed inference time (s)": { - "description": "min=1.926, mean=2.29, max=2.87, sum=16.027 (7)", - "tab": "Efficiency", - "score": 2.289581796117552 - }, - "MATH - # eval": { - "description": "min=30, mean=62.429, max=135, sum=437 (7)", - "tab": "General information", - "score": 62.42857142857143 - }, - "MATH - # train": { - "description": "min=8, mean=8, max=8, sum=56 (7)", - "tab": "General information", - "score": 8.0 - }, - "MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "MATH - # prompt tokens": { - "description": "min=971.652, mean=1438.636, max=2490.962, sum=10070.453 (7)", - "tab": "General information", - "score": 1438.6362030100095 - }, - "MATH - # output tokens": { - "description": "min=94.269, mean=124.053, 
max=183.018, sum=868.373 (7)", - "tab": "General information", - "score": 124.05328023895956 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" - } - } - }, - { - "evaluation_name": "GSM8K", - "source_data": { - "dataset_name": "GSM8K", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on GSM8K", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.871, - "details": { - "description": "min=0.871, mean=0.871, max=0.871, sum=0.871 (1)", - "tab": "Accuracy", - "GSM8K - Observed inference time (s)": { - "description": "min=2.666, mean=2.666, max=2.666, sum=2.666 (1)", - "tab": "Efficiency", - "score": 2.6663423478603363 - }, - "GSM8K - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "GSM8K - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "GSM8K - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GSM8K - # prompt tokens": { - "description": "min=1207.746, mean=1207.746, max=1207.746, sum=1207.746 (1)", - "tab": "General information", - "score": 1207.746 - }, - "GSM8K - # output tokens": { - "description": "min=143.978, mean=143.978, max=143.978, sum=143.978 (1)", - "tab": "General information", - "score": 143.978 - } - } - }, - "generation_config": { - "additional_details": { - "stop": "none" - } - } - }, - { - "evaluation_name": "LegalBench", - "source_data": { - "dataset_name": "LegalBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on LegalBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.67, - "details": { - "description": "min=0.384, mean=0.67, max=0.905, sum=3.348 (5)", - "tab": "Accuracy", - "LegalBench - Observed inference time (s)": { - "description": "min=0.438, mean=0.654, max=1.454, sum=3.271 (5)", - "tab": "Efficiency", - "score": 0.6542452756040519 - }, - "LegalBench - # eval": { - "description": "min=95, mean=409.4, max=1000, sum=2047 (5)", - "tab": "General information", - "score": 409.4 - }, - "LegalBench - # train": { - "description": "min=4, mean=4.8, max=5, sum=24 (5)", - "tab": "General information", - "score": 4.8 - }, - "LegalBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "LegalBench - # prompt tokens": { - "description": "min=229.137, mean=1839.512, max=7675.188, sum=9197.561 (5)", - "tab": "General information", - "score": 1839.5122484246817 - }, - "LegalBench - # output tokens": { - "description": "min=1, mean=1.395, max=2.011, sum=6.977 (5)", - "tab": "General information", - "score": 1.3953837372723363 - } - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "abercrombie", - 
"corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] - } - } - }, - { - "evaluation_name": "MedQA", - "source_data": { - "dataset_name": "MedQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MedQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.698, - "details": { - "description": "min=0.698, mean=0.698, max=0.698, sum=0.698 (1)", - "tab": "Accuracy", - "MedQA - Observed inference time (s)": { - "description": "min=0.596, mean=0.596, max=0.596, sum=0.596 (1)", - "tab": "Efficiency", - "score": 0.5956100185159187 - }, - "MedQA - # eval": { - "description": "min=503, mean=503, max=503, sum=503 (1)", - "tab": "General information", - "score": 503.0 - }, - "MedQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "MedQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MedQA - # prompt tokens": { - "description": "min=1243.901, mean=1243.901, max=1243.901, sum=1243.901 (1)", - "tab": "General information", - "score": 1243.9005964214712 - }, - "MedQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WMT 2014", - "source_data": { - "dataset_name": "WMT 2014", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.169, - "details": { - "description": "min=0.085, mean=0.169, max=0.229, sum=0.844 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time (s)": { - "description": "min=0.839, mean=0.871, max=0.895, sum=4.357 (5)", - "tab": "Efficiency", - "score": 0.8713457104322841 - }, - "WMT 2014 - # eval": { - "description": "min=503, mean=568.8, max=832, sum=2844 (5)", - "tab": "General information", - "score": 568.8 - }, - "WMT 2014 - # train": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "WMT 2014 - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "WMT 2014 - # prompt tokens": { - "description": "min=135.523, mean=150.288, max=172.972, sum=751.438 (5)", - "tab": "General information", - "score": 150.28751290334915 - }, - "WMT 2014 - # output tokens": { - "description": "min=27.539, mean=30.28, max=31.635, sum=151.4 (5)", - "tab": "General information", - "score": 30.280004587857473 - } - } - }, - "generation_config": { - "additional_details": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_lite/writer/palmyra-x-004/fcf9d3dd-8b31-4ab9-98c4-d2712eebf867.json b/data/helm_lite/writer/palmyra-x-004/fcf9d3dd-8b31-4ab9-98c4-d2712eebf867.json deleted file mode 100644 index 8026be475..000000000 --- 
a/data/helm_lite/writer/palmyra-x-004/fcf9d3dd-8b31-4ab9-98c4-d2712eebf867.json +++ /dev/null @@ -1,649 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_lite/writer_palmyra-x-004/1770834614.1822479", - "retrieved_timestamp": "1770834614.1822479", - "source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Palmyra-X-004", - "id": "writer/palmyra-x-004", - "developer": "writer", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_lite", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.808, - "details": { - "tab": "Accuracy", - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.4045318352059925 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.773, - "details": { - "description": "min=0.773, mean=0.773, max=0.773, sum=0.773 (1)", - "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": { - "description": "min=1.634, mean=1.634, max=1.634, sum=1.634 (1)", - "tab": "Efficiency", - "score": 1.634409177135414 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=3484.268, mean=3484.268, max=3484.268, sum=3484.268 (1)", - "tab": "General information", - "score": 3484.2676056338028 - }, - "NarrativeQA - # output tokens": { - "description": "min=6.338, mean=6.338, max=6.338, sum=6.338 (1)", - "tab": "General information", - "score": 6.338028169014085 - } - } - }, - "generation_config": { - "additional_details": { - "stop": "none" - } - } - }, - { - "evaluation_name": "NaturalQuestions (closed-book)", - "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { 
- "score": 0.457, - "details": { - "description": "min=0.457, mean=0.457, max=0.457, sum=0.457 (1)", - "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time (s)": { - "description": "min=1.221, mean=1.221, max=1.221, sum=1.221 (1)", - "tab": "Efficiency", - "score": 1.22119681596756 - }, - "NaturalQuestions (closed-book) - Observed inference time (s)": { - "description": "min=1.213, mean=1.213, max=1.213, sum=1.213 (1)", - "tab": "Efficiency", - "score": 1.2129934797286988 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.965, mean=4.965, max=4.965, sum=4.965 (1)", - "tab": "General information", - "score": 4.965 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.007, mean=0.007, max=0.007, sum=0.007 (1)", - "tab": "General information", - "score": 0.007 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1675.231, mean=1675.231, max=1675.231, sum=1675.231 (1)", - "tab": "General information", - "score": 1675.231 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=10.295, mean=10.295, max=10.295, sum=10.295 (1)", - "tab": "General information", - "score": 10.295 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=129.12, mean=129.12, max=129.12, sum=129.12 (1)", - "tab": "General information", - "score": 129.12 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=12.549, mean=12.549, max=12.549, sum=12.549 (1)", - "tab": "General information", - "score": 12.549 - } - } - }, - "generation_config": { - "additional_details": { - "mode": "closedbook", - "stop": "none" - } - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.926, - "details": { - "description": "min=0.926, mean=0.926, max=0.926, sum=0.926 (1)", - "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": { - "description": "min=0.271, mean=0.271, max=0.271, sum=0.271 (1)", - "tab": "Efficiency", - "score": 0.2705215420722961 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=249.776, 
mean=249.776, max=249.776, sum=249.776 (1)", - "tab": "General information", - "score": 249.776 - }, - "OpenbookQA - # output tokens": { - "description": "min=0.992, mean=0.992, max=0.992, sum=0.992 (1)", - "tab": "General information", - "score": 0.992 - } - } - }, - "generation_config": { - "additional_details": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.739, - "details": { - "description": "min=0.52, mean=0.739, max=0.92, sum=3.694 (5)", - "tab": "Accuracy", - "MMLU - Observed inference time (s)": { - "description": "min=0.309, mean=0.396, max=0.722, sum=1.982 (5)", - "tab": "Efficiency", - "score": 0.39635124337045774 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=373.43, mean=467.686, max=614.421, sum=2338.431 (5)", - "tab": "General information", - "score": 467.6862105263158 - }, - "MMLU - # output tokens": { - "description": "min=0.97, mean=0.99, max=1, sum=4.951 (5)", - "tab": "General information", - "score": 0.9902456140350877 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MATH", - "source_data": { - "dataset_name": "MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.767, - "details": { - "description": "min=0.553, mean=0.767, max=0.948, sum=5.371 (7)", - "tab": "Accuracy", - "MATH - Observed inference time (s)": { - "description": "min=5.13, mean=14.827, max=45.729, sum=103.786 (7)", - "tab": "Efficiency", - "score": 14.82662017363065 - }, - "MATH - # eval": { - "description": "min=30, mean=62.429, max=135, sum=437 (7)", - "tab": "General information", - "score": 62.42857142857143 - }, - "MATH - # train": { - "description": "min=8, mean=8, max=8, sum=56 (7)", - "tab": "General information", - "score": 8.0 - }, - "MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "MATH - # prompt tokens": { - "description": "min=881.363, mean=1262.909, max=2197.577, sum=8840.364 (7)", - "tab": "General information", - "score": 1262.9092130545007 - }, - "MATH - # output tokens": { - "description": "min=174.547, mean=209.333, max=238.692, sum=1465.33 (7)", - "tab": "General information", - "score": 209.3327932233685 - } - } 
- }, - "generation_config": { - "additional_details": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True", - "stop": "none" - } - } - }, - { - "evaluation_name": "GSM8K", - "source_data": { - "dataset_name": "GSM8K", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on GSM8K", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.905, - "details": { - "description": "min=0.905, mean=0.905, max=0.905, sum=0.905 (1)", - "tab": "Accuracy", - "GSM8K - Observed inference time (s)": { - "description": "min=11.45, mean=11.45, max=11.45, sum=11.45 (1)", - "tab": "Efficiency", - "score": 11.449529441833496 - }, - "GSM8K - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "GSM8K - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "GSM8K - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GSM8K - # prompt tokens": { - "description": "min=959.032, mean=959.032, max=959.032, sum=959.032 (1)", - "tab": "General information", - "score": 959.032 - }, - "GSM8K - # output tokens": { - "description": "min=174.327, mean=174.327, max=174.327, sum=174.327 (1)", - "tab": "General information", - "score": 174.327 - } - } - }, - "generation_config": { - "additional_details": { - "stop": "none" - } - } - }, - { - "evaluation_name": "LegalBench", - "source_data": { - "dataset_name": "LegalBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on LegalBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.73, - "details": { - "description": "min=0.433, mean=0.73, max=0.989, sum=3.648 (5)", - "tab": "Accuracy", - "LegalBench - Observed inference time (s)": { - "description": "min=0.478, mean=0.504, max=0.522, sum=2.519 (5)", - "tab": "Efficiency", - "score": 0.5037181089898329 - }, - "LegalBench - # eval": { - "description": "min=95, mean=409.4, max=1000, sum=2047 (5)", - "tab": "General information", - "score": 409.4 - }, - "LegalBench - # train": { - "description": "min=4, mean=4.798, max=5, sum=23.992 (5)", - "tab": "General information", - "score": 4.798367346938775 - }, - "LegalBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "LegalBench - # prompt tokens": { - "description": "min=216.442, mean=1524.207, max=6297.633, sum=7621.033 (5)", - "tab": "General information", - "score": 1524.206501356544 - }, - "LegalBench - # output tokens": { - "description": "min=1, mean=1.416, max=2.021, sum=7.082 (5)", - "tab": "General information", - "score": 1.4163162483866343 - } - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - 
"international_citizenship_questions", - "proa" - ], - "stop": "none" - } - } - }, - { - "evaluation_name": "MedQA", - "source_data": { - "dataset_name": "MedQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MedQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.775, - "details": { - "description": "min=0.775, mean=0.775, max=0.775, sum=0.775 (1)", - "tab": "Accuracy", - "MedQA - Observed inference time (s)": { - "description": "min=0.399, mean=0.399, max=0.399, sum=0.399 (1)", - "tab": "Efficiency", - "score": 0.39942375139498093 - }, - "MedQA - # eval": { - "description": "min=503, mean=503, max=503, sum=503 (1)", - "tab": "General information", - "score": 503.0 - }, - "MedQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "MedQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MedQA - # prompt tokens": { - "description": "min=1025.274, mean=1025.274, max=1025.274, sum=1025.274 (1)", - "tab": "General information", - "score": 1025.2743538767395 - }, - "MedQA - # output tokens": { - "description": "min=0.992, mean=0.992, max=0.992, sum=0.992 (1)", - "tab": "General information", - "score": 0.9920477137176938 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WMT 2014", - "source_data": { - "dataset_name": "WMT 2014", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.203, - "details": { - "description": "min=0.144, mean=0.203, max=0.249, sum=1.016 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time (s)": { - "description": "min=1.801, mean=2.046, max=2.515, sum=10.228 (5)", - "tab": "Efficiency", - "score": 2.045695114985284 - }, - "WMT 2014 - # eval": { - "description": "min=503, mean=568.8, max=832, sum=2844 (5)", - "tab": "General information", - "score": 568.8 - }, - "WMT 2014 - # train": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "WMT 2014 - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "WMT 2014 - # prompt tokens": { - "description": "min=96.139, mean=115.712, max=136.117, sum=578.559 (5)", - "tab": "General information", - "score": 115.71178123566294 - }, - "WMT 2014 - # output tokens": { - "description": "min=26.191, mean=29.362, max=37.718, sum=146.808 (5)", - "tab": "General information", - "score": 29.36160106667686 - } - } - }, - "generation_config": { - "additional_details": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ], - "stop": "none" - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_lite/writer/palmyra-x-v2/1158720a-9a0e-492e-a677-9b0936f4cde5.json b/data/helm_lite/writer/palmyra-x-v2/1158720a-9a0e-492e-a677-9b0936f4cde5.json deleted file mode 100644 index 5e5faf9fb..000000000 --- 
a/data/helm_lite/writer/palmyra-x-v2/1158720a-9a0e-492e-a677-9b0936f4cde5.json +++ /dev/null @@ -1,641 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_lite/writer_palmyra-x-v2/1770834614.1822479", - "retrieved_timestamp": "1770834614.1822479", - "source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Palmyra X V2 33B", - "id": "writer/palmyra-x-v2", - "developer": "writer", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_lite", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.589, - "details": { - "tab": "Accuracy", - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.5062546816479401 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.752, - "details": { - "description": "min=0.752, mean=0.752, max=0.752, sum=0.752 (1)", - "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": { - "description": "min=1.202, mean=1.202, max=1.202, sum=1.202 (1)", - "tab": "Efficiency", - "score": 1.2016644296511798 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=3504.577, mean=3504.577, max=3504.577, sum=3504.577 (1)", - "tab": "General information", - "score": 3504.5774647887324 - }, - "NarrativeQA - # output tokens": { - "description": "min=8.208, mean=8.208, max=8.208, sum=8.208 (1)", - "tab": "General information", - "score": 8.208450704225353 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (closed-book)", - "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.428, 
- "details": { - "description": "min=0.428, mean=0.428, max=0.428, sum=0.428 (1)", - "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time (s)": { - "description": "min=0.969, mean=0.969, max=0.969, sum=0.969 (1)", - "tab": "Efficiency", - "score": 0.9688332653045655 - }, - "NaturalQuestions (closed-book) - Observed inference time (s)": { - "description": "min=0.62, mean=0.62, max=0.62, sum=0.62 (1)", - "tab": "Efficiency", - "score": 0.6202523970603943 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.926, mean=4.926, max=4.926, sum=4.926 (1)", - "tab": "General information", - "score": 4.926 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.013, mean=0.013, max=0.013, sum=0.013 (1)", - "tab": "General information", - "score": 0.013 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1662.782, mean=1662.782, max=1662.782, sum=1662.782 (1)", - "tab": "General information", - "score": 1662.782 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=7.809, mean=7.809, max=7.809, sum=7.809 (1)", - "tab": "General information", - "score": 7.809 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=116.254, mean=116.254, max=116.254, sum=116.254 (1)", - "tab": "General information", - "score": 116.254 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=7.067, mean=7.067, max=7.067, sum=7.067 (1)", - "tab": "General information", - "score": 7.067 - } - } - }, - "generation_config": { - "additional_details": { - "mode": "closedbook" - } - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.878, - "details": { - "description": "min=0.878, mean=0.878, max=0.878, sum=0.878 (1)", - "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": { - "description": "min=0.42, mean=0.42, max=0.42, sum=0.42 (1)", - "tab": "Efficiency", - "score": 0.4200127201080322 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=254.21, mean=254.21, max=254.21, sum=254.21 (1)", - "tab": 
"General information", - "score": 254.21 - }, - "OpenbookQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.621, - "details": { - "description": "min=0.37, mean=0.621, max=0.91, sum=3.106 (5)", - "tab": "Accuracy", - "MMLU - Observed inference time (s)": { - "description": "min=0.462, mean=0.532, max=0.577, sum=2.661 (5)", - "tab": "Efficiency", - "score": 0.5321985618859008 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=371.38, mean=472.274, max=624.07, sum=2361.37 (5)", - "tab": "General information", - "score": 472.2740350877192 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MATH", - "source_data": { - "dataset_name": "MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.58, - "details": { - "description": "min=0.395, mean=0.58, max=0.8, sum=4.059 (7)", - "tab": "Accuracy", - "MATH - Observed inference time (s)": { - "description": "min=1.722, mean=2.088, max=2.676, sum=14.619 (7)", - "tab": "Efficiency", - "score": 2.0883775065675723 - }, - "MATH - # eval": { - "description": "min=30, mean=62.429, max=135, sum=437 (7)", - "tab": "General information", - "score": 62.42857142857143 - }, - "MATH - # train": { - "description": "min=8, mean=8, max=8, sum=56 (7)", - "tab": "General information", - "score": 8.0 - }, - "MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "MATH - # prompt tokens": { - "description": "min=906.556, mean=1375.735, max=2449.942, sum=9630.147 (7)", - "tab": "General information", - "score": 1375.7353092779654 - }, - "MATH - # output tokens": { - "description": "min=64, mean=87.032, max=107.385, sum=609.221 (7)", - "tab": "General information", - "score": 87.03154467364993 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "algebra", - 
"counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" - } - } - }, - { - "evaluation_name": "GSM8K", - "source_data": { - "dataset_name": "GSM8K", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on GSM8K", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.735, - "details": { - "description": "min=0.735, mean=0.735, max=0.735, sum=0.735 (1)", - "tab": "Accuracy", - "GSM8K - Observed inference time (s)": { - "description": "min=2.543, mean=2.543, max=2.543, sum=2.543 (1)", - "tab": "Efficiency", - "score": 2.543274956703186 - }, - "GSM8K - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "GSM8K - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "GSM8K - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GSM8K - # prompt tokens": { - "description": "min=938.869, mean=938.869, max=938.869, sum=938.869 (1)", - "tab": "General information", - "score": 938.869 - }, - "GSM8K - # output tokens": { - "description": "min=89.718, mean=89.718, max=89.718, sum=89.718 (1)", - "tab": "General information", - "score": 89.718 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "LegalBench", - "source_data": { - "dataset_name": "LegalBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on LegalBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.644, - "details": { - "description": "min=0.33, mean=0.644, max=0.989, sum=3.221 (5)", - "tab": "Accuracy", - "LegalBench - Observed inference time (s)": { - "description": "min=0.425, mean=0.731, max=1.784, sum=3.657 (5)", - "tab": "Efficiency", - "score": 0.7313747247589137 - }, - "LegalBench - # eval": { - "description": "min=95, mean=409.4, max=1000, sum=2047 (5)", - "tab": "General information", - "score": 409.4 - }, - "LegalBench - # train": { - "description": "min=3.984, mean=4.597, max=5, sum=22.984 (5)", - "tab": "General information", - "score": 4.596734693877551 - }, - "LegalBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "LegalBench - # prompt tokens": { - "description": "min=205.632, mean=1355.759, max=5467.178, sum=6778.793 (5)", - "tab": "General information", - "score": 1355.7586406214054 - }, - "LegalBench - # output tokens": { - "description": "min=1, mean=2.077, max=5.406, sum=10.386 (5)", - "tab": "General information", - "score": 2.0771673311343752 - } - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] - } - } - }, - { - "evaluation_name": "MedQA", - "source_data": { - "dataset_name": "MedQA", - 
"source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MedQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.598, - "details": { - "description": "min=0.598, mean=0.598, max=0.598, sum=0.598 (1)", - "tab": "Accuracy", - "MedQA - Observed inference time (s)": { - "description": "min=0.605, mean=0.605, max=0.605, sum=0.605 (1)", - "tab": "Efficiency", - "score": 0.6051040529967776 - }, - "MedQA - # eval": { - "description": "min=503, mean=503, max=503, sum=503 (1)", - "tab": "General information", - "score": 503.0 - }, - "MedQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "MedQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MedQA - # prompt tokens": { - "description": "min=1038.861, mean=1038.861, max=1038.861, sum=1038.861 (1)", - "tab": "General information", - "score": 1038.8608349900596 - }, - "MedQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WMT 2014", - "source_data": { - "dataset_name": "WMT 2014", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.239, - "details": { - "description": "min=0.2, mean=0.239, max=0.27, sum=1.194 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time (s)": { - "description": "min=0.83, mean=0.905, max=0.948, sum=4.524 (5)", - "tab": "Efficiency", - "score": 0.904815991352295 - }, - "WMT 2014 - # eval": { - "description": "min=503, mean=568.8, max=832, sum=2844 (5)", - "tab": "General information", - "score": 568.8 - }, - "WMT 2014 - # train": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "WMT 2014 - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "WMT 2014 - # prompt tokens": { - "description": "min=136.93, mean=181.694, max=241.662, sum=908.469 (5)", - "tab": "General information", - "score": 181.69386660804403 - }, - "WMT 2014 - # output tokens": { - "description": "min=23.829, mean=25.142, max=25.958, sum=125.709 (5)", - "tab": "General information", - "score": 25.14180111637865 - } - } - }, - "generation_config": { - "additional_details": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_lite/writer/palmyra-x-v3/254ded81-4051-420d-b402-2e7b80a23848.json b/data/helm_lite/writer/palmyra-x-v3/254ded81-4051-420d-b402-2e7b80a23848.json deleted file mode 100644 index c8073d254..000000000 --- a/data/helm_lite/writer/palmyra-x-v3/254ded81-4051-420d-b402-2e7b80a23848.json +++ /dev/null @@ -1,641 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_lite/writer_palmyra-x-v3/1770834614.1822479", - "retrieved_timestamp": 
"1770834614.1822479", - "source_metadata": { - "source_name": "helm_lite", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Palmyra X V3 72B", - "id": "writer/palmyra-x-v3", - "developer": "writer", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_lite", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.679, - "details": { - "tab": "Accuracy", - "Mean win rate - Efficiency": { - "description": null, - "tab": "Efficiency", - "score": 0.25696629213483146 - }, - "Mean win rate - General information": { - "description": null, - "tab": "General information", - "score": null - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NarrativeQA", - "source_data": { - "dataset_name": "NarrativeQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NarrativeQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.706, - "details": { - "description": "min=0.706, mean=0.706, max=0.706, sum=0.706 (1)", - "tab": "Accuracy", - "NarrativeQA - Observed inference time (s)": { - "description": "min=2.849, mean=2.849, max=2.849, sum=2.849 (1)", - "tab": "Efficiency", - "score": 2.848917615245765 - }, - "NarrativeQA - # eval": { - "description": "min=355, mean=355, max=355, sum=355 (1)", - "tab": "General information", - "score": 355.0 - }, - "NarrativeQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NarrativeQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NarrativeQA - # prompt tokens": { - "description": "min=3504.577, mean=3504.577, max=3504.577, sum=3504.577 (1)", - "tab": "General information", - "score": 3504.5774647887324 - }, - "NarrativeQA - # output tokens": { - "description": "min=11.149, mean=11.149, max=11.149, sum=11.149 (1)", - "tab": "General information", - "score": 11.149295774647888 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "NaturalQuestions (closed-book)", - "source_data": { - "dataset_name": "NaturalQuestions (closed-book)", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "F1 on NaturalQuestions (closed-book)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.407, - "details": { - "description": "min=0.407, mean=0.407, max=0.407, sum=0.407 (1)", - "tab": "Accuracy", - "NaturalQuestions (open-book) - Observed inference time (s)": { - "description": "min=2.319, mean=2.319, max=2.319, sum=2.319 
(1)", - "tab": "Efficiency", - "score": 2.31904000210762 - }, - "NaturalQuestions (closed-book) - Observed inference time (s)": { - "description": "min=2.373, mean=2.373, max=2.373, sum=2.373 (1)", - "tab": "Efficiency", - "score": 2.3729000978469847 - }, - "NaturalQuestions (open-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (open-book) - # train": { - "description": "min=4.885, mean=4.885, max=4.885, sum=4.885 (1)", - "tab": "General information", - "score": 4.885 - }, - "NaturalQuestions (open-book) - truncated": { - "description": "min=0.02, mean=0.02, max=0.02, sum=0.02 (1)", - "tab": "General information", - "score": 0.02 - }, - "NaturalQuestions (open-book) - # prompt tokens": { - "description": "min=1617.709, mean=1617.709, max=1617.709, sum=1617.709 (1)", - "tab": "General information", - "score": 1617.709 - }, - "NaturalQuestions (open-book) - # output tokens": { - "description": "min=12.864, mean=12.864, max=12.864, sum=12.864 (1)", - "tab": "General information", - "score": 12.864 - }, - "NaturalQuestions (closed-book) - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "NaturalQuestions (closed-book) - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "NaturalQuestions (closed-book) - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "NaturalQuestions (closed-book) - # prompt tokens": { - "description": "min=116.254, mean=116.254, max=116.254, sum=116.254 (1)", - "tab": "General information", - "score": 116.254 - }, - "NaturalQuestions (closed-book) - # output tokens": { - "description": "min=19.113, mean=19.113, max=19.113, sum=19.113 (1)", - "tab": "General information", - "score": 19.113 - } - } - }, - "generation_config": { - "additional_details": { - "mode": "closedbook" - } - } - }, - { - "evaluation_name": "OpenbookQA", - "source_data": { - "dataset_name": "OpenbookQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on OpenbookQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.938, - "details": { - "description": "min=0.938, mean=0.938, max=0.938, sum=0.938 (1)", - "tab": "Accuracy", - "OpenbookQA - Observed inference time (s)": { - "description": "min=0.607, mean=0.607, max=0.607, sum=0.607 (1)", - "tab": "Efficiency", - "score": 0.6074039902687073 - }, - "OpenbookQA - # eval": { - "description": "min=500, mean=500, max=500, sum=500 (1)", - "tab": "General information", - "score": 500.0 - }, - "OpenbookQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "OpenbookQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "OpenbookQA - # prompt tokens": { - "description": "min=254.21, mean=254.21, max=254.21, sum=254.21 (1)", - "tab": "General information", - "score": 254.21 - }, - "OpenbookQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - 
"additional_details": { - "dataset": "openbookqa", - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MMLU", - "source_data": { - "dataset_name": "MMLU", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.702, - "details": { - "description": "min=0.53, mean=0.702, max=0.96, sum=3.509 (5)", - "tab": "Accuracy", - "MMLU - Observed inference time (s)": { - "description": "min=0.604, mean=0.657, max=0.783, sum=3.283 (5)", - "tab": "Efficiency", - "score": 0.656667516515966 - }, - "MMLU - # eval": { - "description": "min=100, mean=102.8, max=114, sum=514 (5)", - "tab": "General information", - "score": 102.8 - }, - "MMLU - # train": { - "description": "min=5, mean=5, max=5, sum=25 (5)", - "tab": "General information", - "score": 5.0 - }, - "MMLU - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "MMLU - # prompt tokens": { - "description": "min=371.38, mean=472.274, max=624.07, sum=2361.37 (5)", - "tab": "General information", - "score": 472.2740350877192 - }, - "MMLU - # output tokens": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "college_chemistry", - "computer_security", - "econometrics", - "us_foreign_policy" - ], - "method": "multiple_choice_joint" - } - } - }, - { - "evaluation_name": "MATH", - "source_data": { - "dataset_name": "MATH", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "Equivalent (CoT) on MATH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.723, - "details": { - "description": "min=0.579, mean=0.723, max=0.896, sum=5.06 (7)", - "tab": "Accuracy", - "MATH - Observed inference time (s)": { - "description": "min=3.23, mean=4.259, max=6.331, sum=29.811 (7)", - "tab": "Efficiency", - "score": 4.258683228698372 - }, - "MATH - # eval": { - "description": "min=30, mean=62.429, max=135, sum=437 (7)", - "tab": "General information", - "score": 62.42857142857143 - }, - "MATH - # train": { - "description": "min=8, mean=8, max=8, sum=56 (7)", - "tab": "General information", - "score": 8.0 - }, - "MATH - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (7)", - "tab": "General information", - "score": 0.0 - }, - "MATH - # prompt tokens": { - "description": "min=906.556, mean=1375.735, max=2449.942, sum=9630.147 (7)", - "tab": "General information", - "score": 1375.7353092779654 - }, - "MATH - # output tokens": { - "description": "min=60.012, mean=83.135, max=128.942, sum=581.943 (7)", - "tab": "General information", - "score": 83.13468064416656 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "algebra", - "counting_and_probability", - "geometry", - "intermediate_algebra", - "number_theory", - "prealgebra", - "precalculus" - ], - "level": "1", - "use_official_examples": "False", - "use_chain_of_thought": "True" - } - } - }, - { - 
"evaluation_name": "GSM8K", - "source_data": { - "dataset_name": "GSM8K", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on GSM8K", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.831, - "details": { - "description": "min=0.831, mean=0.831, max=0.831, sum=0.831 (1)", - "tab": "Accuracy", - "GSM8K - Observed inference time (s)": { - "description": "min=5.07, mean=5.07, max=5.07, sum=5.07 (1)", - "tab": "Efficiency", - "score": 5.069576686620712 - }, - "GSM8K - # eval": { - "description": "min=1000, mean=1000, max=1000, sum=1000 (1)", - "tab": "General information", - "score": 1000.0 - }, - "GSM8K - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "GSM8K - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "GSM8K - # prompt tokens": { - "description": "min=938.869, mean=938.869, max=938.869, sum=938.869 (1)", - "tab": "General information", - "score": 938.869 - }, - "GSM8K - # output tokens": { - "description": "min=89.919, mean=89.919, max=89.919, sum=89.919 (1)", - "tab": "General information", - "score": 89.919 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "LegalBench", - "source_data": { - "dataset_name": "LegalBench", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on LegalBench", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.709, - "details": { - "description": "min=0.439, mean=0.709, max=0.926, sum=3.544 (5)", - "tab": "Accuracy", - "LegalBench - Observed inference time (s)": { - "description": "min=0.668, mean=1.16, max=3.0, sum=5.798 (5)", - "tab": "Efficiency", - "score": 1.1595191393847304 - }, - "LegalBench - # eval": { - "description": "min=95, mean=409.4, max=1000, sum=2047 (5)", - "tab": "General information", - "score": 409.4 - }, - "LegalBench - # train": { - "description": "min=3.984, mean=4.597, max=5, sum=22.984 (5)", - "tab": "General information", - "score": 4.596734693877551 - }, - "LegalBench - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "LegalBench - # prompt tokens": { - "description": "min=205.632, mean=1355.759, max=5467.178, sum=6778.793 (5)", - "tab": "General information", - "score": 1355.7586406214054 - }, - "LegalBench - # output tokens": { - "description": "min=1, mean=1.078, max=1.2, sum=5.388 (5)", - "tab": "General information", - "score": 1.0776021798365123 - } - } - }, - "generation_config": { - "additional_details": { - "subset": [ - "abercrombie", - "corporate_lobbying", - "function_of_decision_section", - "international_citizenship_questions", - "proa" - ] - } - } - }, - { - "evaluation_name": "MedQA", - "source_data": { - "dataset_name": "MedQA", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MedQA", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.684, - "details": { - "description": "min=0.684, mean=0.684, max=0.684, sum=0.684 (1)", - "tab": "Accuracy", - "MedQA - Observed inference time (s)": { - "description": "min=0.927, mean=0.927, max=0.927, sum=0.927 (1)", - "tab": "Efficiency", - "score": 0.9268994279220611 - }, - "MedQA - # eval": { - "description": "min=503, mean=503, max=503, sum=503 (1)", - "tab": "General information", - "score": 503.0 - }, - "MedQA - # train": { - "description": "min=5, mean=5, max=5, sum=5 (1)", - "tab": "General information", - "score": 5.0 - }, - "MedQA - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (1)", - "tab": "General information", - "score": 0.0 - }, - "MedQA - # prompt tokens": { - "description": "min=1038.861, mean=1038.861, max=1038.861, sum=1038.861 (1)", - "tab": "General information", - "score": 1038.8608349900596 - }, - "MedQA - # output tokens": { - "description": "min=1, mean=1, max=1, sum=1 (1)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": {} - } - }, - { - "evaluation_name": "WMT 2014", - "source_data": { - "dataset_name": "WMT 2014", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/lite/benchmark_output/releases/v1.13.0/groups/core_scenarios.json" - ] - }, - "metric_config": { - "evaluation_description": "BLEU-4 on WMT 2014", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.262, - "details": { - "description": "min=0.235, mean=0.262, max=0.284, sum=1.309 (5)", - "tab": "Accuracy", - "WMT 2014 - Observed inference time (s)": { - "description": "min=1.32, mean=1.406, max=1.477, sum=7.032 (5)", - "tab": "Efficiency", - "score": 1.4063038200537652 - }, - "WMT 2014 - # eval": { - "description": "min=503, mean=568.8, max=832, sum=2844 (5)", - "tab": "General information", - "score": 568.8 - }, - "WMT 2014 - # train": { - "description": "min=1, mean=1, max=1, sum=5 (5)", - "tab": "General information", - "score": 1.0 - }, - "WMT 2014 - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (5)", - "tab": "General information", - "score": 0.0 - }, - "WMT 2014 - # prompt tokens": { - "description": "min=136.93, mean=181.694, max=241.662, sum=908.469 (5)", - "tab": "General information", - "score": 181.69386660804403 - }, - "WMT 2014 - # output tokens": { - "description": "min=23.356, mean=24.983, max=25.829, sum=124.915 (5)", - "tab": "General information", - "score": 24.983090877810064 - } - } - }, - "generation_config": { - "additional_details": { - "language_pair": [ - "cs-en", - "de-en", - "fr-en", - "hi-en", - "ru-en" - ] - } - } - } - ] -} \ No newline at end of file diff --git a/data/helm_mmlu/01-ai/yi-34b/ce5acf2d-e5c6-42b8-ac8e-622a755300b8.json b/data/helm_mmlu/01-ai/yi-34b/ce5acf2d-e5c6-42b8-ac8e-622a755300b8.json deleted file mode 100644 index a5d4de71f..000000000 --- a/data/helm_mmlu/01-ai/yi-34b/ce5acf2d-e5c6-42b8-ac8e-622a755300b8.json +++ /dev/null @@ -1,3021 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/01-ai_yi-34b/1770835937.459157", - "retrieved_timestamp": "1770835937.459157", - "source_metadata": { - "source_name": "helm_mmlu", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Yi 34B", - "id": "01-ai/yi-34b", - 
"developer": "01-ai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "MMLU All Subjects", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU All Subjects", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.762, - "details": { - "description": "min=0.4, mean=0.762, max=0.974, sum=86.905 (114)", - "tab": "Accuracy", - "MMLU All Subjects - Observed inference time (s)": { - "description": "min=0.407, mean=0.823, max=2.683, sum=93.841 (114)", - "tab": "Efficiency", - "score": 0.8231679963633336 - }, - "MMLU All Subjects - # eval": { - "description": "min=100, mean=246.351, max=1534, sum=28084 (114)", - "tab": "General information", - "score": 246.35087719298247 - }, - "MMLU All Subjects - # train": { - "description": "min=5, mean=5, max=5, sum=570 (114)", - "tab": "General information", - "score": 5.0 - }, - "MMLU All Subjects - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (114)", - "tab": "General information", - "score": 0.0 - }, - "MMLU All Subjects - # prompt tokens": { - "description": "min=289.971, mean=661.842, max=2957.412, sum=75449.942 (114)", - "tab": "General information", - "score": 661.8416008681387 - }, - "MMLU All Subjects - # output tokens": { - "description": "min=1, mean=1, max=1, sum=114 (114)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - 
"mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] - } - } - }, - { - "evaluation_name": "Abstract Algebra", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Abstract Algebra", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4, - "details": { - "description": "min=0.4, mean=0.4, max=0.4, sum=0.8 (2)", - "tab": "Accuracy", - "Abstract Algebra - Observed inference time (s)": { - "description": "min=0.658, mean=0.658, max=0.658, sum=1.315 (2)", - "tab": "Efficiency", - "score": 0.6577284264564515 - }, - "Abstract Algebra - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Abstract Algebra - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Abstract Algebra - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Abstract Algebra - # prompt tokens": { - "description": "min=383.67, mean=383.67, max=383.67, sum=767.34 (2)", - "tab": "General information", - "score": 383.67 - }, - "Abstract Algebra - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" - } - } - }, - { - "evaluation_name": "Anatomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Anatomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.748, - "details": { - "description": "min=0.748, mean=0.748, max=0.748, sum=1.496 (2)", - "tab": "Accuracy", - "Anatomy - Observed inference time (s)": { - "description": "min=0.601, mean=0.601, 
max=0.601, sum=1.202 (2)", - "tab": "Efficiency", - "score": 0.6009190011907507 - }, - "Anatomy - # eval": { - "description": "min=135, mean=135, max=135, sum=270 (2)", - "tab": "General information", - "score": 135.0 - }, - "Anatomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Anatomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Anatomy - # prompt tokens": { - "description": "min=375.77, mean=375.77, max=375.77, sum=751.541 (2)", - "tab": "General information", - "score": 375.77037037037036 - }, - "Anatomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" - } - } - }, - { - "evaluation_name": "College Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on College Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5, - "details": { - "description": "min=0.5, mean=0.5, max=0.5, sum=1 (2)", - "tab": "Accuracy", - "College Chemistry - Observed inference time (s)": { - "description": "min=0.531, mean=0.531, max=0.531, sum=1.061 (2)", - "tab": "Efficiency", - "score": 0.5305842399597168 - }, - "College Biology - Observed inference time (s)": { - "description": "min=0.502, mean=0.502, max=0.502, sum=1.004 (2)", - "tab": "Efficiency", - "score": 0.5021488202942742 - }, - "College Computer Science - Observed inference time (s)": { - "description": "min=0.708, mean=0.708, max=0.708, sum=1.415 (2)", - "tab": "Efficiency", - "score": 0.7075318503379822 - }, - "College Mathematics - Observed inference time (s)": { - "description": "min=0.569, mean=0.569, max=0.569, sum=1.138 (2)", - "tab": "Efficiency", - "score": 0.5689087891578675 - }, - "College Medicine - Observed inference time (s)": { - "description": "min=0.575, mean=0.575, max=0.575, sum=1.15 (2)", - "tab": "Efficiency", - "score": 0.5747669638925894 - }, - "College Physics - Observed inference time (s)": { - "description": "min=0.604, mean=0.604, max=0.604, sum=1.207 (2)", - "tab": "Efficiency", - "score": 0.603668584543116 - }, - "College Chemistry - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Chemistry - # prompt tokens": { - "description": "min=597.54, mean=597.54, max=597.54, sum=1195.08 (2)", - "tab": "General information", - "score": 597.54 - }, - "College Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Biology - # eval": { - "description": "min=144, mean=144, max=144, sum=288 (2)", - "tab": "General information", - "score": 144.0 - }, - "College Biology - # train": { 
- "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # prompt tokens": { - "description": "min=514.819, mean=514.819, max=514.819, sum=1029.639 (2)", - "tab": "General information", - "score": 514.8194444444445 - }, - "College Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer Science - # prompt tokens": { - "description": "min=883.06, mean=883.06, max=883.06, sum=1766.12 (2)", - "tab": "General information", - "score": 883.06 - }, - "College Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Mathematics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # prompt tokens": { - "description": "min=635.3, mean=635.3, max=635.3, sum=1270.6 (2)", - "tab": "General information", - "score": 635.3 - }, - "College Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Medicine - # eval": { - "description": "min=173, mean=173, max=173, sum=346 (2)", - "tab": "General information", - "score": 173.0 - }, - "College Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Medicine - # prompt tokens": { - "description": "min=549.688, mean=549.688, max=549.688, sum=1099.376 (2)", - "tab": "General information", - "score": 549.6878612716763 - }, - "College Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Physics - # eval": { - "description": "min=102, mean=102, max=102, sum=204 (2)", - "tab": "General information", - "score": 102.0 - }, - "College Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Physics - # prompt tokens": { - "description": "min=512.912, mean=512.912, max=512.912, sum=1025.824 (2)", - "tab": "General information", - "score": 512.9117647058823 - }, - "College Physics - # output tokens": { - "description": "min=1, mean=1, max=1, 
sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" - } - } - }, - { - "evaluation_name": "Computer Security", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Computer Security", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.83, - "details": { - "description": "min=0.83, mean=0.83, max=0.83, sum=1.66 (2)", - "tab": "Accuracy", - "Computer Security - Observed inference time (s)": { - "description": "min=0.472, mean=0.472, max=0.472, sum=0.943 (2)", - "tab": "Efficiency", - "score": 0.47160084009170533 - }, - "Computer Security - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Computer Security - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Computer Security - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Computer Security - # prompt tokens": { - "description": "min=405.74, mean=405.74, max=405.74, sum=811.48 (2)", - "tab": "General information", - "score": 405.74 - }, - "Computer Security - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" - } - } - }, - { - "evaluation_name": "Econometrics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Econometrics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.588, - "details": { - "description": "min=0.588, mean=0.588, max=0.588, sum=1.175 (2)", - "tab": "Accuracy", - "Econometrics - Observed inference time (s)": { - "description": "min=0.61, mean=0.61, max=0.61, sum=1.219 (2)", - "tab": "Efficiency", - "score": 0.6095903463530958 - }, - "Econometrics - # eval": { - "description": "min=114, mean=114, max=114, sum=228 (2)", - "tab": "General information", - "score": 114.0 - }, - "Econometrics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Econometrics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Econometrics - # prompt tokens": { - "description": "min=667.789, mean=667.789, max=667.789, sum=1335.579 (2)", - "tab": "General information", - "score": 667.7894736842105 - }, - "Econometrics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "econometrics", - "method": 
"multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" - } - } - }, - { - "evaluation_name": "Global Facts", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Global Facts", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.53, - "details": { - "description": "min=0.53, mean=0.53, max=0.53, sum=1.06 (2)", - "tab": "Accuracy", - "Global Facts - Observed inference time (s)": { - "description": "min=0.537, mean=0.537, max=0.537, sum=1.074 (2)", - "tab": "Efficiency", - "score": 0.5369880175590516 - }, - "Global Facts - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Global Facts - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Global Facts - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Global Facts - # prompt tokens": { - "description": "min=462.32, mean=462.32, max=462.32, sum=924.64 (2)", - "tab": "General information", - "score": 462.32 - }, - "Global Facts - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" - } - } - }, - { - "evaluation_name": "Jurisprudence", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Jurisprudence", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.898, - "details": { - "description": "min=0.898, mean=0.898, max=0.898, sum=1.796 (2)", - "tab": "Accuracy", - "Jurisprudence - Observed inference time (s)": { - "description": "min=0.668, mean=0.668, max=0.668, sum=1.336 (2)", - "tab": "Efficiency", - "score": 0.668224381075965 - }, - "Jurisprudence - # eval": { - "description": "min=108, mean=108, max=108, sum=216 (2)", - "tab": "General information", - "score": 108.0 - }, - "Jurisprudence - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Jurisprudence - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Jurisprudence - # prompt tokens": { - "description": "min=431.898, mean=431.898, max=431.898, sum=863.796 (2)", - "tab": "General information", - "score": 431.89814814814815 - }, - "Jurisprudence - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" - } - } - }, - { - "evaluation_name": "Philosophy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": 
[ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Philosophy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.82, - "details": { - "description": "min=0.82, mean=0.82, max=0.82, sum=1.64 (2)", - "tab": "Accuracy", - "Philosophy - Observed inference time (s)": { - "description": "min=0.424, mean=0.424, max=0.424, sum=0.848 (2)", - "tab": "Efficiency", - "score": 0.42395149779856395 - }, - "Philosophy - # eval": { - "description": "min=311, mean=311, max=311, sum=622 (2)", - "tab": "General information", - "score": 311.0 - }, - "Philosophy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Philosophy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Philosophy - # prompt tokens": { - "description": "min=356.723, mean=356.723, max=356.723, sum=713.447 (2)", - "tab": "General information", - "score": 356.7234726688103 - }, - "Philosophy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" - } - } - }, - { - "evaluation_name": "Professional Psychology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Professional Psychology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.835, - "details": { - "description": "min=0.835, mean=0.835, max=0.835, sum=1.67 (2)", - "tab": "Accuracy", - "Professional Medicine - Observed inference time (s)": { - "description": "min=2.222, mean=2.222, max=2.222, sum=4.444 (2)", - "tab": "Efficiency", - "score": 2.222188143169179 - }, - "Professional Accounting - Observed inference time (s)": { - "description": "min=0.66, mean=0.66, max=0.66, sum=1.32 (2)", - "tab": "Efficiency", - "score": 0.6598629156748453 - }, - "Professional Law - Observed inference time (s)": { - "description": "min=1.839, mean=1.839, max=1.839, sum=3.678 (2)", - "tab": "Efficiency", - "score": 1.839003596032303 - }, - "Professional Psychology - Observed inference time (s)": { - "description": "min=2.178, mean=2.178, max=2.178, sum=4.356 (2)", - "tab": "Efficiency", - "score": 2.1780028343200684 - }, - "Professional Medicine - # eval": { - "description": "min=272, mean=272, max=272, sum=544 (2)", - "tab": "General information", - "score": 272.0 - }, - "Professional Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Medicine - # prompt tokens": { - "description": "min=1202.533, mean=1202.533, max=1202.533, sum=2405.066 (2)", - "tab": "General information", - "score": 1202.5330882352941 - }, - "Professional Medicine - # output tokens": { - "description": 
"min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Accounting - # eval": { - "description": "min=282, mean=282, max=282, sum=564 (2)", - "tab": "General information", - "score": 282.0 - }, - "Professional Accounting - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Accounting - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Accounting - # prompt tokens": { - "description": "min=771.16, mean=771.16, max=771.16, sum=1542.319 (2)", - "tab": "General information", - "score": 771.1595744680851 - }, - "Professional Accounting - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Law - # eval": { - "description": "min=1534, mean=1534, max=1534, sum=3068 (2)", - "tab": "General information", - "score": 1534.0 - }, - "Professional Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # prompt tokens": { - "description": "min=1759.098, mean=1759.098, max=1759.098, sum=3518.197 (2)", - "tab": "General information", - "score": 1759.0984354628422 - }, - "Professional Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Psychology - # eval": { - "description": "min=612, mean=612, max=612, sum=1224 (2)", - "tab": "General information", - "score": 612.0 - }, - "Professional Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # prompt tokens": { - "description": "min=608.201, mean=608.201, max=608.201, sum=1216.402 (2)", - "tab": "General information", - "score": 608.2009803921569 - }, - "Professional Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" - } - } - }, - { - "evaluation_name": "Us Foreign Policy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Us Foreign Policy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.91, - "details": { - "description": "min=0.91, mean=0.91, max=0.91, sum=1.82 (2)", - "tab": "Accuracy", - "Us Foreign Policy - Observed inference time (s)": { - "description": "min=0.607, mean=0.607, max=0.607, sum=1.214 (2)", - "tab": "Efficiency", - "score": 0.6068471717834473 - }, - "Us Foreign Policy - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - 
"Us Foreign Policy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Us Foreign Policy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Us Foreign Policy - # prompt tokens": { - "description": "min=458.53, mean=458.53, max=458.53, sum=917.06 (2)", - "tab": "General information", - "score": 458.53 - }, - "Us Foreign Policy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" - } - } - }, - { - "evaluation_name": "Astronomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Astronomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.901, - "details": { - "description": "min=0.901, mean=0.901, max=0.901, sum=1.803 (2)", - "tab": "Accuracy", - "Astronomy - Observed inference time (s)": { - "description": "min=0.559, mean=0.559, max=0.559, sum=1.117 (2)", - "tab": "Efficiency", - "score": 0.5586237562330145 - }, - "Astronomy - # eval": { - "description": "min=152, mean=152, max=152, sum=304 (2)", - "tab": "General information", - "score": 152.0 - }, - "Astronomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Astronomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Astronomy - # prompt tokens": { - "description": "min=626.895, mean=626.895, max=626.895, sum=1253.789 (2)", - "tab": "General information", - "score": 626.8947368421053 - }, - "Astronomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" - } - } - }, - { - "evaluation_name": "Business Ethics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Business Ethics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.75, - "details": { - "description": "min=0.75, mean=0.75, max=0.75, sum=1.5 (2)", - "tab": "Accuracy", - "Business Ethics - Observed inference time (s)": { - "description": "min=0.566, mean=0.566, max=0.566, sum=1.133 (2)", - "tab": "Efficiency", - "score": 0.5663742089271545 - }, - "Business Ethics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Business Ethics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Business Ethics - truncated": { - "description": "min=0, mean=0, max=0, 
sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Business Ethics - # prompt tokens": { - "description": "min=616.97, mean=616.97, max=616.97, sum=1233.94 (2)", - "tab": "General information", - "score": 616.97 - }, - "Business Ethics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" - } - } - }, - { - "evaluation_name": "Clinical Knowledge", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Clinical Knowledge", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8, - "details": { - "description": "min=0.8, mean=0.8, max=0.8, sum=1.6 (2)", - "tab": "Accuracy", - "Clinical Knowledge - Observed inference time (s)": { - "description": "min=0.487, mean=0.487, max=0.487, sum=0.975 (2)", - "tab": "Efficiency", - "score": 0.4874912774787759 - }, - "Clinical Knowledge - # eval": { - "description": "min=265, mean=265, max=265, sum=530 (2)", - "tab": "General information", - "score": 265.0 - }, - "Clinical Knowledge - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Clinical Knowledge - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Clinical Knowledge - # prompt tokens": { - "description": "min=446.966, mean=446.966, max=446.966, sum=893.932 (2)", - "tab": "General information", - "score": 446.96603773584906 - }, - "Clinical Knowledge - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" - } - } - }, - { - "evaluation_name": "Conceptual Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Conceptual Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.77, - "details": { - "description": "min=0.77, mean=0.77, max=0.77, sum=1.54 (2)", - "tab": "Accuracy", - "Conceptual Physics - Observed inference time (s)": { - "description": "min=0.439, mean=0.439, max=0.439, sum=0.878 (2)", - "tab": "Efficiency", - "score": 0.4390637499220828 - }, - "Conceptual Physics - # eval": { - "description": "min=235, mean=235, max=235, sum=470 (2)", - "tab": "General information", - "score": 235.0 - }, - "Conceptual Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Conceptual Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Conceptual Physics - # prompt tokens": { - "description": 
"min=311.94, mean=311.94, max=311.94, sum=623.881 (2)", - "tab": "General information", - "score": 311.9404255319149 - }, - "Conceptual Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" - } - } - }, - { - "evaluation_name": "Electrical Engineering", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Electrical Engineering", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.779, - "details": { - "description": "min=0.779, mean=0.779, max=0.779, sum=1.559 (2)", - "tab": "Accuracy", - "Electrical Engineering - Observed inference time (s)": { - "description": "min=0.531, mean=0.531, max=0.531, sum=1.063 (2)", - "tab": "Efficiency", - "score": 0.531287300175634 - }, - "Electrical Engineering - # eval": { - "description": "min=145, mean=145, max=145, sum=290 (2)", - "tab": "General information", - "score": 145.0 - }, - "Electrical Engineering - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Electrical Engineering - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Electrical Engineering - # prompt tokens": { - "description": "min=491.993, mean=491.993, max=491.993, sum=983.986 (2)", - "tab": "General information", - "score": 491.99310344827586 - }, - "Electrical Engineering - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" - } - } - }, - { - "evaluation_name": "Elementary Mathematics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Elementary Mathematics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.656, - "details": { - "description": "min=0.656, mean=0.656, max=0.656, sum=1.312 (2)", - "tab": "Accuracy", - "Elementary Mathematics - Observed inference time (s)": { - "description": "min=0.561, mean=0.561, max=0.561, sum=1.123 (2)", - "tab": "Efficiency", - "score": 0.5613514084033865 - }, - "Elementary Mathematics - # eval": { - "description": "min=378, mean=378, max=378, sum=756 (2)", - "tab": "General information", - "score": 378.0 - }, - "Elementary Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Elementary Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Elementary Mathematics - # prompt tokens": { - "description": "min=601.344, 
mean=601.344, max=601.344, sum=1202.688 (2)", - "tab": "General information", - "score": 601.3439153439153 - }, - "Elementary Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" - } - } - }, - { - "evaluation_name": "Formal Logic", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Formal Logic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.548, - "details": { - "description": "min=0.548, mean=0.548, max=0.548, sum=1.095 (2)", - "tab": "Accuracy", - "Formal Logic - Observed inference time (s)": { - "description": "min=0.626, mean=0.626, max=0.626, sum=1.253 (2)", - "tab": "Efficiency", - "score": 0.6264226947511945 - }, - "Formal Logic - # eval": { - "description": "min=126, mean=126, max=126, sum=252 (2)", - "tab": "General information", - "score": 126.0 - }, - "Formal Logic - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Formal Logic - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Formal Logic - # prompt tokens": { - "description": "min=675.579, mean=675.579, max=675.579, sum=1351.159 (2)", - "tab": "General information", - "score": 675.5793650793651 - }, - "Formal Logic - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" - } - } - }, - { - "evaluation_name": "High School World History", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on High School World History", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.907, - "details": { - "description": "min=0.907, mean=0.907, max=0.907, sum=1.814 (2)", - "tab": "Accuracy", - "High School Biology - Observed inference time (s)": { - "description": "min=0.59, mean=0.59, max=0.59, sum=1.179 (2)", - "tab": "Efficiency", - "score": 0.5895279146009876 - }, - "High School Chemistry - Observed inference time (s)": { - "description": "min=0.562, mean=0.562, max=0.562, sum=1.124 (2)", - "tab": "Efficiency", - "score": 0.5618457112993512 - }, - "High School Computer Science - Observed inference time (s)": { - "description": "min=0.851, mean=0.851, max=0.851, sum=1.702 (2)", - "tab": "Efficiency", - "score": 0.8510373497009277 - }, - "High School European History - Observed inference time (s)": { - "description": "min=2.359, mean=2.359, max=2.359, sum=4.717 (2)", - "tab": "Efficiency", - "score": 2.358732930096713 - }, - "High School Geography - Observed inference time (s)": { 
- "description": "min=1.215, mean=1.215, max=1.215, sum=2.43 (2)", - "tab": "Efficiency", - "score": 1.21489392266129 - }, - "High School Government And Politics - Observed inference time (s)": { - "description": "min=0.677, mean=0.677, max=0.677, sum=1.354 (2)", - "tab": "Efficiency", - "score": 0.6768323757487875 - }, - "High School Macroeconomics - Observed inference time (s)": { - "description": "min=0.57, mean=0.57, max=0.57, sum=1.14 (2)", - "tab": "Efficiency", - "score": 0.5697616595488328 - }, - "High School Mathematics - Observed inference time (s)": { - "description": "min=0.541, mean=0.541, max=0.541, sum=1.082 (2)", - "tab": "Efficiency", - "score": 0.5409333193743671 - }, - "High School Microeconomics - Observed inference time (s)": { - "description": "min=0.657, mean=0.657, max=0.657, sum=1.314 (2)", - "tab": "Efficiency", - "score": 0.6570467107436236 - }, - "High School Physics - Observed inference time (s)": { - "description": "min=0.738, mean=0.738, max=0.738, sum=1.476 (2)", - "tab": "Efficiency", - "score": 0.7378138311651369 - }, - "High School Psychology - Observed inference time (s)": { - "description": "min=0.524, mean=0.524, max=0.524, sum=1.049 (2)", - "tab": "Efficiency", - "score": 0.5244918534515101 - }, - "High School Statistics - Observed inference time (s)": { - "description": "min=0.745, mean=0.745, max=0.745, sum=1.491 (2)", - "tab": "Efficiency", - "score": 0.7453252838717567 - }, - "High School US History - Observed inference time (s)": { - "description": "min=1.821, mean=1.821, max=1.821, sum=3.642 (2)", - "tab": "Efficiency", - "score": 1.8211165923698276 - }, - "High School World History - Observed inference time (s)": { - "description": "min=1.27, mean=1.27, max=1.27, sum=2.541 (2)", - "tab": "Efficiency", - "score": 1.2703520537428714 - }, - "High School Biology - # eval": { - "description": "min=310, mean=310, max=310, sum=620 (2)", - "tab": "General information", - "score": 310.0 - }, - "High School Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Biology - # prompt tokens": { - "description": "min=546.394, mean=546.394, max=546.394, sum=1092.787 (2)", - "tab": "General information", - "score": 546.3935483870968 - }, - "High School Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Chemistry - # eval": { - "description": "min=203, mean=203, max=203, sum=406 (2)", - "tab": "General information", - "score": 203.0 - }, - "High School Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # prompt tokens": { - "description": "min=537.015, mean=537.015, max=537.015, sum=1074.03 (2)", - "tab": "General information", - "score": 537.0147783251232 - }, - "High School Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "High School Computer 
Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # prompt tokens": { - "description": "min=962.1, mean=962.1, max=962.1, sum=1924.2 (2)", - "tab": "General information", - "score": 962.1 - }, - "High School Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School European History - # eval": { - "description": "min=165, mean=165, max=165, sum=330 (2)", - "tab": "General information", - "score": 165.0 - }, - "High School European History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School European History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School European History - # prompt tokens": { - "description": "min=2957.412, mean=2957.412, max=2957.412, sum=5914.824 (2)", - "tab": "General information", - "score": 2957.4121212121213 - }, - "High School European History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Geography - # eval": { - "description": "min=198, mean=198, max=198, sum=396 (2)", - "tab": "General information", - "score": 198.0 - }, - "High School Geography - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Geography - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Geography - # prompt tokens": { - "description": "min=404.035, mean=404.035, max=404.035, sum=808.071 (2)", - "tab": "General information", - "score": 404.0353535353535 - }, - "High School Geography - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Government And Politics - # eval": { - "description": "min=193, mean=193, max=193, sum=386 (2)", - "tab": "General information", - "score": 193.0 - }, - "High School Government And Politics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Government And Politics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # prompt tokens": { - "description": "min=484.725, mean=484.725, max=484.725, sum=969.451 (2)", - "tab": "General information", - "score": 484.7253886010363 - }, - "High School Government And Politics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Macroeconomics - # eval": { - "description": "min=390, mean=390, max=390, sum=780 (2)", - "tab": "General information", - "score": 390.0 - }, - "High School Macroeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Macroeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - 
# prompt tokens": { - "description": "min=398.892, mean=398.892, max=398.892, sum=797.785 (2)", - "tab": "General information", - "score": 398.89230769230767 - }, - "High School Macroeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Mathematics - # eval": { - "description": "min=270, mean=270, max=270, sum=540 (2)", - "tab": "General information", - "score": 270.0 - }, - "High School Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # prompt tokens": { - "description": "min=575.622, mean=575.622, max=575.622, sum=1151.244 (2)", - "tab": "General information", - "score": 575.6222222222223 - }, - "High School Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Microeconomics - # eval": { - "description": "min=238, mean=238, max=238, sum=476 (2)", - "tab": "General information", - "score": 238.0 - }, - "High School Microeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Microeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Microeconomics - # prompt tokens": { - "description": "min=420.739, mean=420.739, max=420.739, sum=841.479 (2)", - "tab": "General information", - "score": 420.73949579831935 - }, - "High School Microeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Physics - # eval": { - "description": "min=151, mean=151, max=151, sum=302 (2)", - "tab": "General information", - "score": 151.0 - }, - "High School Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # prompt tokens": { - "description": "min=599.411, mean=599.411, max=599.411, sum=1198.821 (2)", - "tab": "General information", - "score": 599.4105960264901 - }, - "High School Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Psychology - # eval": { - "description": "min=545, mean=545, max=545, sum=1090 (2)", - "tab": "General information", - "score": 545.0 - }, - "High School Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # prompt tokens": { - "description": "min=526.826, mean=526.826, max=526.826, sum=1053.651 (2)", - "tab": "General information", - "score": 526.8256880733945 - }, - "High School Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Statistics - # eval": { - "description": "min=216, mean=216, 
max=216, sum=432 (2)", - "tab": "General information", - "score": 216.0 - }, - "High School Statistics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Statistics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Statistics - # prompt tokens": { - "description": "min=869.778, mean=869.778, max=869.778, sum=1739.556 (2)", - "tab": "General information", - "score": 869.7777777777778 - }, - "High School Statistics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School US History - # eval": { - "description": "min=204, mean=204, max=204, sum=408 (2)", - "tab": "General information", - "score": 204.0 - }, - "High School US History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School US History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # prompt tokens": { - "description": "min=2369.132, mean=2369.132, max=2369.132, sum=4738.265 (2)", - "tab": "General information", - "score": 2369.1323529411766 - }, - "High School US History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School World History - # eval": { - "description": "min=237, mean=237, max=237, sum=474 (2)", - "tab": "General information", - "score": 237.0 - }, - "High School World History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School World History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School World History - # prompt tokens": { - "description": "min=1541.371, mean=1541.371, max=1541.371, sum=3082.743 (2)", - "tab": "General information", - "score": 1541.3713080168777 - }, - "High School World History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" - } - } - }, - { - "evaluation_name": "Human Sexuality", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Human Sexuality", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.87, - "details": { - "description": "min=0.87, mean=0.87, max=0.87, sum=1.74 (2)", - "tab": "Accuracy", - "Human Aging - Observed inference time (s)": { - "description": "min=0.768, mean=0.768, max=0.768, sum=1.535 (2)", - "tab": "Efficiency", - "score": 0.76751750146327 - }, - "Human Sexuality - Observed inference time (s)": { - "description": "min=0.408, mean=0.408, max=0.408, sum=0.816 (2)", - "tab": "Efficiency", - "score": 0.4077764613027791 - }, - "Human Aging - # eval": { - "description": "min=223, mean=223, max=223, sum=446 (2)", - "tab": "General 
information", - "score": 223.0 - }, - "Human Aging - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Aging - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Aging - # prompt tokens": { - "description": "min=332.013, mean=332.013, max=332.013, sum=664.027 (2)", - "tab": "General information", - "score": 332.0134529147982 - }, - "Human Aging - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Human Sexuality - # eval": { - "description": "min=131, mean=131, max=131, sum=262 (2)", - "tab": "General information", - "score": 131.0 - }, - "Human Sexuality - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Sexuality - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # prompt tokens": { - "description": "min=367.855, mean=367.855, max=367.855, sum=735.71 (2)", - "tab": "General information", - "score": 367.85496183206106 - }, - "Human Sexuality - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" - } - } - }, - { - "evaluation_name": "International Law", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on International Law", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.909, - "details": { - "description": "min=0.909, mean=0.909, max=0.909, sum=1.818 (2)", - "tab": "Accuracy", - "International Law - Observed inference time (s)": { - "description": "min=0.588, mean=0.588, max=0.588, sum=1.175 (2)", - "tab": "Efficiency", - "score": 0.5876634554429487 - }, - "International Law - # eval": { - "description": "min=121, mean=121, max=121, sum=242 (2)", - "tab": "General information", - "score": 121.0 - }, - "International Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "International Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "International Law - # prompt tokens": { - "description": "min=663.289, mean=663.289, max=663.289, sum=1326.579 (2)", - "tab": "General information", - "score": 663.2892561983471 - }, - "International Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" - } - } - }, - { - "evaluation_name": "Logical Fallacies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - 
}, - "metric_config": { - "evaluation_description": "EM on Logical Fallacies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.883, - "details": { - "description": "min=0.883, mean=0.883, max=0.883, sum=1.767 (2)", - "tab": "Accuracy", - "Logical Fallacies - Observed inference time (s)": { - "description": "min=0.622, mean=0.622, max=0.622, sum=1.245 (2)", - "tab": "Efficiency", - "score": 0.6223941814680041 - }, - "Logical Fallacies - # eval": { - "description": "min=163, mean=163, max=163, sum=326 (2)", - "tab": "General information", - "score": 163.0 - }, - "Logical Fallacies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Logical Fallacies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Logical Fallacies - # prompt tokens": { - "description": "min=466.595, mean=466.595, max=466.595, sum=933.19 (2)", - "tab": "General information", - "score": 466.5950920245399 - }, - "Logical Fallacies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" - } - } - }, - { - "evaluation_name": "Machine Learning", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Machine Learning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.58, - "details": { - "description": "min=0.58, mean=0.58, max=0.58, sum=1.161 (2)", - "tab": "Accuracy", - "Machine Learning - Observed inference time (s)": { - "description": "min=0.638, mean=0.638, max=0.638, sum=1.277 (2)", - "tab": "Efficiency", - "score": 0.6384105682373047 - }, - "Machine Learning - # eval": { - "description": "min=112, mean=112, max=112, sum=224 (2)", - "tab": "General information", - "score": 112.0 - }, - "Machine Learning - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Machine Learning - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Machine Learning - # prompt tokens": { - "description": "min=720.161, mean=720.161, max=720.161, sum=1440.321 (2)", - "tab": "General information", - "score": 720.1607142857143 - }, - "Machine Learning - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" - } - } - }, - { - "evaluation_name": "Management", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Management", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.893, - "details": { - "description": "min=0.893, mean=0.893, max=0.893, sum=1.786 (2)", - "tab": "Accuracy", - "Management - Observed inference time (s)": { - "description": "min=0.42, mean=0.42, max=0.42, sum=0.841 (2)", - "tab": "Efficiency", - "score": 0.4204523748564489 - }, - "Management - # eval": { - "description": "min=103, mean=103, max=103, sum=206 (2)", - "tab": "General information", - "score": 103.0 - }, - "Management - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Management - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Management - # prompt tokens": { - "description": "min=300.544, mean=300.544, max=300.544, sum=601.087 (2)", - "tab": "General information", - "score": 300.54368932038835 - }, - "Management - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" - } - } - }, - { - "evaluation_name": "Marketing", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Marketing", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.936, - "details": { - "description": "min=0.936, mean=0.936, max=0.936, sum=1.872 (2)", - "tab": "Accuracy", - "Marketing - Observed inference time (s)": { - "description": "min=0.463, mean=0.463, max=0.463, sum=0.926 (2)", - "tab": "Efficiency", - "score": 0.463064443351876 - }, - "Marketing - # eval": { - "description": "min=234, mean=234, max=234, sum=468 (2)", - "tab": "General information", - "score": 234.0 - }, - "Marketing - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Marketing - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Marketing - # prompt tokens": { - "description": "min=442.825, mean=442.825, max=442.825, sum=885.65 (2)", - "tab": "General information", - "score": 442.8247863247863 - }, - "Marketing - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" - } - } - }, - { - "evaluation_name": "Medical Genetics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Medical Genetics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.87, - "details": { - "description": "min=0.87, mean=0.87, max=0.87, sum=1.74 (2)", - "tab": "Accuracy", - "Medical Genetics - Observed inference time (s)": 
{ - "description": "min=0.428, mean=0.428, max=0.428, sum=0.857 (2)", - "tab": "Efficiency", - "score": 0.42836678981781007 - }, - "Medical Genetics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Medical Genetics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Medical Genetics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Medical Genetics - # prompt tokens": { - "description": "min=362, mean=362, max=362, sum=724 (2)", - "tab": "General information", - "score": 362.0 - }, - "Medical Genetics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" - } - } - }, - { - "evaluation_name": "Miscellaneous", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Miscellaneous", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.902, - "details": { - "description": "min=0.902, mean=0.902, max=0.902, sum=1.803 (2)", - "tab": "Accuracy", - "Miscellaneous - Observed inference time (s)": { - "description": "min=0.537, mean=0.537, max=0.537, sum=1.075 (2)", - "tab": "Efficiency", - "score": 0.5372742845333095 - }, - "Miscellaneous - # eval": { - "description": "min=783, mean=783, max=783, sum=1566 (2)", - "tab": "General information", - "score": 783.0 - }, - "Miscellaneous - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Miscellaneous - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Miscellaneous - # prompt tokens": { - "description": "min=331.441, mean=331.441, max=331.441, sum=662.881 (2)", - "tab": "General information", - "score": 331.4406130268199 - }, - "Miscellaneous - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" - } - } - }, - { - "evaluation_name": "Moral Scenarios", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Moral Scenarios", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.606, - "details": { - "description": "min=0.606, mean=0.606, max=0.606, sum=1.211 (2)", - "tab": "Accuracy", - "Moral Disputes - Observed inference time (s)": { - "description": "min=0.671, mean=0.671, max=0.671, sum=1.341 (2)", - "tab": "Efficiency", - "score": 0.6705957754498961 - }, - "Moral Scenarios - Observed inference time (s)": { - 
"description": "min=0.764, mean=0.764, max=0.764, sum=1.528 (2)", - "tab": "Efficiency", - "score": 0.7642385613318928 - }, - "Moral Disputes - # eval": { - "description": "min=346, mean=346, max=346, sum=692 (2)", - "tab": "General information", - "score": 346.0 - }, - "Moral Disputes - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Disputes - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Disputes - # prompt tokens": { - "description": "min=507.913, mean=507.913, max=507.913, sum=1015.827 (2)", - "tab": "General information", - "score": 507.91329479768785 - }, - "Moral Disputes - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Moral Scenarios - # eval": { - "description": "min=895, mean=895, max=895, sum=1790 (2)", - "tab": "General information", - "score": 895.0 - }, - "Moral Scenarios - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Scenarios - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # prompt tokens": { - "description": "min=703.334, mean=703.334, max=703.334, sum=1406.668 (2)", - "tab": "General information", - "score": 703.3340782122905 - }, - "Moral Scenarios - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" - } - } - }, - { - "evaluation_name": "Nutrition", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Nutrition", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.869, - "details": { - "description": "min=0.869, mean=0.869, max=0.869, sum=1.739 (2)", - "tab": "Accuracy", - "Nutrition - Observed inference time (s)": { - "description": "min=1.038, mean=1.038, max=1.038, sum=2.077 (2)", - "tab": "Efficiency", - "score": 1.0384757246067322 - }, - "Nutrition - # eval": { - "description": "min=306, mean=306, max=306, sum=612 (2)", - "tab": "General information", - "score": 306.0 - }, - "Nutrition - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Nutrition - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Nutrition - # prompt tokens": { - "description": "min=643.317, mean=643.317, max=643.317, sum=1286.634 (2)", - "tab": "General information", - "score": 643.3169934640523 - }, - "Nutrition - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" - } - } - }, - { - "evaluation_name": "Prehistory", - "source_data": { - "dataset_name": 
"helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Prehistory", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.877, - "details": { - "description": "min=0.877, mean=0.877, max=0.877, sum=1.753 (2)", - "tab": "Accuracy", - "Prehistory - Observed inference time (s)": { - "description": "min=0.561, mean=0.561, max=0.561, sum=1.121 (2)", - "tab": "Efficiency", - "score": 0.560588002204895 - }, - "Prehistory - # eval": { - "description": "min=324, mean=324, max=324, sum=648 (2)", - "tab": "General information", - "score": 324.0 - }, - "Prehistory - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Prehistory - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Prehistory - # prompt tokens": { - "description": "min=565.096, mean=565.096, max=565.096, sum=1130.191 (2)", - "tab": "General information", - "score": 565.0956790123457 - }, - "Prehistory - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" - } - } - }, - { - "evaluation_name": "Public Relations", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Public Relations", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.745, - "details": { - "description": "min=0.745, mean=0.745, max=0.745, sum=1.491 (2)", - "tab": "Accuracy", - "Public Relations - Observed inference time (s)": { - "description": "min=2.107, mean=2.107, max=2.107, sum=4.213 (2)", - "tab": "Efficiency", - "score": 2.1067019375887783 - }, - "Public Relations - # eval": { - "description": "min=110, mean=110, max=110, sum=220 (2)", - "tab": "General information", - "score": 110.0 - }, - "Public Relations - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Public Relations - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Public Relations - # prompt tokens": { - "description": "min=432.436, mean=432.436, max=432.436, sum=864.873 (2)", - "tab": "General information", - "score": 432.43636363636364 - }, - "Public Relations - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" - } - } - }, - { - "evaluation_name": "Security Studies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - 
"metric_config": { - "evaluation_description": "EM on Security Studies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.833, - "details": { - "description": "min=0.833, mean=0.833, max=0.833, sum=1.665 (2)", - "tab": "Accuracy", - "Security Studies - Observed inference time (s)": { - "description": "min=2.683, mean=2.683, max=2.683, sum=5.366 (2)", - "tab": "Efficiency", - "score": 2.682755525744691 - }, - "Security Studies - # eval": { - "description": "min=245, mean=245, max=245, sum=490 (2)", - "tab": "General information", - "score": 245.0 - }, - "Security Studies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Security Studies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Security Studies - # prompt tokens": { - "description": "min=1227.196, mean=1227.196, max=1227.196, sum=2454.392 (2)", - "tab": "General information", - "score": 1227.1959183673468 - }, - "Security Studies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" - } - } - }, - { - "evaluation_name": "Sociology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Sociology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9, - "details": { - "description": "min=0.9, mean=0.9, max=0.9, sum=1.801 (2)", - "tab": "Accuracy", - "Sociology - Observed inference time (s)": { - "description": "min=1.401, mean=1.401, max=1.401, sum=2.803 (2)", - "tab": "Efficiency", - "score": 1.4013089469416224 - }, - "Sociology - # eval": { - "description": "min=201, mean=201, max=201, sum=402 (2)", - "tab": "General information", - "score": 201.0 - }, - "Sociology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Sociology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Sociology - # prompt tokens": { - "description": "min=463.99, mean=463.99, max=463.99, sum=927.98 (2)", - "tab": "General information", - "score": 463.99004975124376 - }, - "Sociology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" - } - } - }, - { - "evaluation_name": "Virology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Virology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.572, - 
"details": { - "description": "min=0.572, mean=0.572, max=0.572, sum=1.145 (2)", - "tab": "Accuracy", - "Virology - Observed inference time (s)": { - "description": "min=0.563, mean=0.563, max=0.563, sum=1.127 (2)", - "tab": "Efficiency", - "score": 0.5633984617440098 - }, - "Virology - # eval": { - "description": "min=166, mean=166, max=166, sum=332 (2)", - "tab": "General information", - "score": 166.0 - }, - "Virology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Virology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Virology - # prompt tokens": { - "description": "min=363.102, mean=363.102, max=363.102, sum=726.205 (2)", - "tab": "General information", - "score": 363.1024096385542 - }, - "Virology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" - } - } - }, - { - "evaluation_name": "World Religions", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on World Religions", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.877, - "details": { - "description": "min=0.877, mean=0.877, max=0.877, sum=1.754 (2)", - "tab": "Accuracy", - "World Religions - Observed inference time (s)": { - "description": "min=0.407, mean=0.407, max=0.407, sum=0.814 (2)", - "tab": "Efficiency", - "score": 0.4067504726655302 - }, - "World Religions - # eval": { - "description": "min=171, mean=171, max=171, sum=342 (2)", - "tab": "General information", - "score": 171.0 - }, - "World Religions - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "World Religions - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "World Religions - # prompt tokens": { - "description": "min=289.971, mean=289.971, max=289.971, sum=579.942 (2)", - "tab": "General information", - "score": 289.97076023391816 - }, - "World Religions - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" - } - } - }, - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.315, - "details": { - "tab": "Efficiency" - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git 
a/data/helm_mmlu/01-ai/yi-6b/7f2975a3-1cd5-4102-bb0c-f0f329db9d2d.json b/data/helm_mmlu/01-ai/yi-6b/7f2975a3-1cd5-4102-bb0c-f0f329db9d2d.json deleted file mode 100644 index 1f0a7e20f..000000000 --- a/data/helm_mmlu/01-ai/yi-6b/7f2975a3-1cd5-4102-bb0c-f0f329db9d2d.json +++ /dev/null @@ -1,3021 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/01-ai_yi-6b/1770835937.459157", - "retrieved_timestamp": "1770835937.459157", - "source_metadata": { - "source_name": "helm_mmlu", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Yi 6B", - "id": "01-ai/yi-6b", - "developer": "01-ai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "MMLU All Subjects", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU All Subjects", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.64, - "details": { - "description": "min=0.3, mean=0.64, max=0.907, sum=72.967 (114)", - "tab": "Accuracy", - "MMLU All Subjects - Observed inference time (s)": { - "description": "min=0.315, mean=0.388, max=0.912, sum=44.195 (114)", - "tab": "Efficiency", - "score": 0.3876731134304364 - }, - "MMLU All Subjects - # eval": { - "description": "min=100, mean=246.351, max=1534, sum=28084 (114)", - "tab": "General information", - "score": 246.35087719298247 - }, - "MMLU All Subjects - # train": { - "description": "min=5, mean=5, max=5, sum=570 (114)", - "tab": "General information", - "score": 5.0 - }, - "MMLU All Subjects - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (114)", - "tab": "General information", - "score": 0.0 - }, - "MMLU All Subjects - # prompt tokens": { - "description": "min=289.971, mean=661.842, max=2957.412, sum=75449.942 (114)", - "tab": "General information", - "score": 661.8416008681387 - }, - "MMLU All Subjects - # output tokens": { - "description": "min=1, mean=1, max=1, sum=114 (114)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - 
"professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] - } - } - }, - { - "evaluation_name": "Abstract Algebra", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Abstract Algebra", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3, - "details": { - "description": "min=0.3, mean=0.3, max=0.3, sum=0.6 (2)", - "tab": "Accuracy", - "Abstract Algebra - Observed inference time (s)": { - "description": "min=0.343, mean=0.343, max=0.343, sum=0.686 (2)", - "tab": "Efficiency", - "score": 0.34289863109588625 - }, - "Abstract Algebra - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Abstract Algebra - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Abstract Algebra - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Abstract Algebra - # prompt tokens": { - "description": "min=383.67, mean=383.67, max=383.67, sum=767.34 (2)", - "tab": "General information", - "score": 383.67 - }, - "Abstract Algebra - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" - 
} - } - }, - { - "evaluation_name": "Anatomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Anatomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6, - "details": { - "description": "min=0.6, mean=0.6, max=0.6, sum=1.2 (2)", - "tab": "Accuracy", - "Anatomy - Observed inference time (s)": { - "description": "min=0.334, mean=0.334, max=0.334, sum=0.668 (2)", - "tab": "Efficiency", - "score": 0.3338937794720685 - }, - "Anatomy - # eval": { - "description": "min=135, mean=135, max=135, sum=270 (2)", - "tab": "General information", - "score": 135.0 - }, - "Anatomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Anatomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Anatomy - # prompt tokens": { - "description": "min=375.77, mean=375.77, max=375.77, sum=751.541 (2)", - "tab": "General information", - "score": 375.77037037037036 - }, - "Anatomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" - } - } - }, - { - "evaluation_name": "College Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on College Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.422, - "details": { - "description": "min=0.422, mean=0.422, max=0.422, sum=0.843 (2)", - "tab": "Accuracy", - "College Chemistry - Observed inference time (s)": { - "description": "min=0.34, mean=0.34, max=0.34, sum=0.68 (2)", - "tab": "Efficiency", - "score": 0.3400930452346802 - }, - "College Biology - Observed inference time (s)": { - "description": "min=0.331, mean=0.331, max=0.331, sum=0.661 (2)", - "tab": "Efficiency", - "score": 0.3306954221593009 - }, - "College Computer Science - Observed inference time (s)": { - "description": "min=0.397, mean=0.397, max=0.397, sum=0.793 (2)", - "tab": "Efficiency", - "score": 0.39658718585968017 - }, - "College Mathematics - Observed inference time (s)": { - "description": "min=0.372, mean=0.372, max=0.372, sum=0.744 (2)", - "tab": "Efficiency", - "score": 0.3718992257118225 - }, - "College Medicine - Observed inference time (s)": { - "description": "min=0.36, mean=0.36, max=0.36, sum=0.721 (2)", - "tab": "Efficiency", - "score": 0.360349433270493 - }, - "College Physics - Observed inference time (s)": { - "description": "min=0.363, mean=0.363, max=0.363, sum=0.726 (2)", - "tab": "Efficiency", - "score": 0.36309780092800364 - }, - "College Chemistry - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - 
"score": 5.0 - }, - "College Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Chemistry - # prompt tokens": { - "description": "min=597.54, mean=597.54, max=597.54, sum=1195.08 (2)", - "tab": "General information", - "score": 597.54 - }, - "College Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Biology - # eval": { - "description": "min=144, mean=144, max=144, sum=288 (2)", - "tab": "General information", - "score": 144.0 - }, - "College Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # prompt tokens": { - "description": "min=514.819, mean=514.819, max=514.819, sum=1029.639 (2)", - "tab": "General information", - "score": 514.8194444444445 - }, - "College Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer Science - # prompt tokens": { - "description": "min=883.06, mean=883.06, max=883.06, sum=1766.12 (2)", - "tab": "General information", - "score": 883.06 - }, - "College Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Mathematics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # prompt tokens": { - "description": "min=635.3, mean=635.3, max=635.3, sum=1270.6 (2)", - "tab": "General information", - "score": 635.3 - }, - "College Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Medicine - # eval": { - "description": "min=173, mean=173, max=173, sum=346 (2)", - "tab": "General information", - "score": 173.0 - }, - "College Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Medicine - # prompt tokens": { - "description": "min=549.688, mean=549.688, max=549.688, sum=1099.376 (2)", - "tab": "General information", - "score": 549.6878612716763 - }, - "College Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Physics - # eval": { - 
"description": "min=102, mean=102, max=102, sum=204 (2)", - "tab": "General information", - "score": 102.0 - }, - "College Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Physics - # prompt tokens": { - "description": "min=512.912, mean=512.912, max=512.912, sum=1025.824 (2)", - "tab": "General information", - "score": 512.9117647058823 - }, - "College Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" - } - } - }, - { - "evaluation_name": "Computer Security", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Computer Security", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.73, - "details": { - "description": "min=0.73, mean=0.73, max=0.73, sum=1.46 (2)", - "tab": "Accuracy", - "Computer Security - Observed inference time (s)": { - "description": "min=0.336, mean=0.336, max=0.336, sum=0.673 (2)", - "tab": "Efficiency", - "score": 0.3364018177986145 - }, - "Computer Security - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Computer Security - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Computer Security - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Computer Security - # prompt tokens": { - "description": "min=405.74, mean=405.74, max=405.74, sum=811.48 (2)", - "tab": "General information", - "score": 405.74 - }, - "Computer Security - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" - } - } - }, - { - "evaluation_name": "Econometrics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Econometrics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.351, - "details": { - "description": "min=0.351, mean=0.351, max=0.351, sum=0.702 (2)", - "tab": "Accuracy", - "Econometrics - Observed inference time (s)": { - "description": "min=0.398, mean=0.398, max=0.398, sum=0.796 (2)", - "tab": "Efficiency", - "score": 0.397992962285092 - }, - "Econometrics - # eval": { - "description": "min=114, mean=114, max=114, sum=228 (2)", - "tab": "General information", - "score": 114.0 - }, - "Econometrics - # train": { - "description": 
"min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Econometrics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Econometrics - # prompt tokens": { - "description": "min=667.789, mean=667.789, max=667.789, sum=1335.579 (2)", - "tab": "General information", - "score": 667.7894736842105 - }, - "Econometrics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" - } - } - }, - { - "evaluation_name": "Global Facts", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Global Facts", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.43, - "details": { - "description": "min=0.43, mean=0.43, max=0.43, sum=0.86 (2)", - "tab": "Accuracy", - "Global Facts - Observed inference time (s)": { - "description": "min=0.327, mean=0.327, max=0.327, sum=0.655 (2)", - "tab": "Efficiency", - "score": 0.3273779916763306 - }, - "Global Facts - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Global Facts - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Global Facts - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Global Facts - # prompt tokens": { - "description": "min=462.32, mean=462.32, max=462.32, sum=924.64 (2)", - "tab": "General information", - "score": 462.32 - }, - "Global Facts - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" - } - } - }, - { - "evaluation_name": "Jurisprudence", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Jurisprudence", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.796, - "details": { - "description": "min=0.796, mean=0.796, max=0.796, sum=1.593 (2)", - "tab": "Accuracy", - "Jurisprudence - Observed inference time (s)": { - "description": "min=0.361, mean=0.361, max=0.361, sum=0.721 (2)", - "tab": "Efficiency", - "score": 0.3607365202020716 - }, - "Jurisprudence - # eval": { - "description": "min=108, mean=108, max=108, sum=216 (2)", - "tab": "General information", - "score": 108.0 - }, - "Jurisprudence - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Jurisprudence - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 
- }, - "Jurisprudence - # prompt tokens": { - "description": "min=431.898, mean=431.898, max=431.898, sum=863.796 (2)", - "tab": "General information", - "score": 431.89814814814815 - }, - "Jurisprudence - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" - } - } - }, - { - "evaluation_name": "Philosophy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Philosophy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.678, - "details": { - "description": "min=0.678, mean=0.678, max=0.678, sum=1.357 (2)", - "tab": "Accuracy", - "Philosophy - Observed inference time (s)": { - "description": "min=0.347, mean=0.347, max=0.347, sum=0.693 (2)", - "tab": "Efficiency", - "score": 0.34667477807048047 - }, - "Philosophy - # eval": { - "description": "min=311, mean=311, max=311, sum=622 (2)", - "tab": "General information", - "score": 311.0 - }, - "Philosophy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Philosophy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Philosophy - # prompt tokens": { - "description": "min=356.723, mean=356.723, max=356.723, sum=713.447 (2)", - "tab": "General information", - "score": 356.7234726688103 - }, - "Philosophy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" - } - } - }, - { - "evaluation_name": "Professional Psychology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Professional Psychology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.668, - "details": { - "description": "min=0.668, mean=0.668, max=0.668, sum=1.337 (2)", - "tab": "Accuracy", - "Professional Medicine - Observed inference time (s)": { - "description": "min=0.769, mean=0.769, max=0.769, sum=1.538 (2)", - "tab": "Efficiency", - "score": 0.7688747907386106 - }, - "Professional Accounting - Observed inference time (s)": { - "description": "min=0.37, mean=0.37, max=0.37, sum=0.74 (2)", - "tab": "Efficiency", - "score": 0.37016247857546974 - }, - "Professional Law - Observed inference time (s)": { - "description": "min=0.566, mean=0.566, max=0.566, sum=1.131 (2)", - "tab": "Efficiency", - "score": 0.5655125939084467 - }, - "Professional Psychology - Observed inference time (s)": { - "description": "min=0.335, mean=0.335, max=0.335, sum=0.67 (2)", - "tab": "Efficiency", - "score": 0.33476316071803275 - }, - "Professional Medicine - # eval": { - 
"description": "min=272, mean=272, max=272, sum=544 (2)", - "tab": "General information", - "score": 272.0 - }, - "Professional Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Medicine - # prompt tokens": { - "description": "min=1202.533, mean=1202.533, max=1202.533, sum=2405.066 (2)", - "tab": "General information", - "score": 1202.5330882352941 - }, - "Professional Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Accounting - # eval": { - "description": "min=282, mean=282, max=282, sum=564 (2)", - "tab": "General information", - "score": 282.0 - }, - "Professional Accounting - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Accounting - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Accounting - # prompt tokens": { - "description": "min=771.16, mean=771.16, max=771.16, sum=1542.319 (2)", - "tab": "General information", - "score": 771.1595744680851 - }, - "Professional Accounting - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Law - # eval": { - "description": "min=1534, mean=1534, max=1534, sum=3068 (2)", - "tab": "General information", - "score": 1534.0 - }, - "Professional Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # prompt tokens": { - "description": "min=1759.098, mean=1759.098, max=1759.098, sum=3518.197 (2)", - "tab": "General information", - "score": 1759.0984354628422 - }, - "Professional Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Psychology - # eval": { - "description": "min=612, mean=612, max=612, sum=1224 (2)", - "tab": "General information", - "score": 612.0 - }, - "Professional Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # prompt tokens": { - "description": "min=608.201, mean=608.201, max=608.201, sum=1216.402 (2)", - "tab": "General information", - "score": 608.2009803921569 - }, - "Professional Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" - } - } - }, - { - "evaluation_name": "Us Foreign Policy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Us Foreign Policy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.87, - "details": { - "description": "min=0.87, mean=0.87, max=0.87, sum=1.74 (2)", - "tab": "Accuracy", - "Us Foreign Policy - Observed inference time (s)": { - "description": "min=0.384, mean=0.384, max=0.384, sum=0.768 (2)", - "tab": "Efficiency", - "score": 0.38381587505340575 - }, - "Us Foreign Policy - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Us Foreign Policy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Us Foreign Policy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Us Foreign Policy - # prompt tokens": { - "description": "min=458.53, mean=458.53, max=458.53, sum=917.06 (2)", - "tab": "General information", - "score": 458.53 - }, - "Us Foreign Policy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" - } - } - }, - { - "evaluation_name": "Astronomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Astronomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.684, - "details": { - "description": "min=0.684, mean=0.684, max=0.684, sum=1.368 (2)", - "tab": "Accuracy", - "Astronomy - Observed inference time (s)": { - "description": "min=0.351, mean=0.351, max=0.351, sum=0.702 (2)", - "tab": "Efficiency", - "score": 0.3511188610603935 - }, - "Astronomy - # eval": { - "description": "min=152, mean=152, max=152, sum=304 (2)", - "tab": "General information", - "score": 152.0 - }, - "Astronomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Astronomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Astronomy - # prompt tokens": { - "description": "min=626.895, mean=626.895, max=626.895, sum=1253.789 (2)", - "tab": "General information", - "score": 626.8947368421053 - }, - "Astronomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" - } - } - }, - { - "evaluation_name": "Business Ethics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Business Ethics", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.67, - "details": { - "description": "min=0.67, mean=0.67, max=0.67, sum=1.34 (2)", - "tab": "Accuracy", - "Business Ethics - Observed inference time (s)": { - "description": "min=0.335, mean=0.335, max=0.335, sum=0.671 (2)", - "tab": "Efficiency", - "score": 0.33533199548721315 - }, - "Business Ethics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Business Ethics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Business Ethics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Business Ethics - # prompt tokens": { - "description": "min=616.97, mean=616.97, max=616.97, sum=1233.94 (2)", - "tab": "General information", - "score": 616.97 - }, - "Business Ethics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" - } - } - }, - { - "evaluation_name": "Clinical Knowledge", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Clinical Knowledge", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.66, - "details": { - "description": "min=0.66, mean=0.66, max=0.66, sum=1.321 (2)", - "tab": "Accuracy", - "Clinical Knowledge - Observed inference time (s)": { - "description": "min=0.347, mean=0.347, max=0.347, sum=0.694 (2)", - "tab": "Efficiency", - "score": 0.34722964808625995 - }, - "Clinical Knowledge - # eval": { - "description": "min=265, mean=265, max=265, sum=530 (2)", - "tab": "General information", - "score": 265.0 - }, - "Clinical Knowledge - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Clinical Knowledge - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Clinical Knowledge - # prompt tokens": { - "description": "min=446.966, mean=446.966, max=446.966, sum=893.932 (2)", - "tab": "General information", - "score": 446.96603773584906 - }, - "Clinical Knowledge - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" - } - } - }, - { - "evaluation_name": "Conceptual Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Conceptual Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.621, - "details": { - "description": "min=0.621, mean=0.621, max=0.621, sum=1.243 (2)", - "tab": "Accuracy", - "Conceptual Physics - Observed inference time (s)": { - "description": "min=0.332, mean=0.332, max=0.332, sum=0.665 (2)", - "tab": "Efficiency", - "score": 0.3323540139705577 - }, - "Conceptual Physics - # eval": { - "description": "min=235, mean=235, max=235, sum=470 (2)", - "tab": "General information", - "score": 235.0 - }, - "Conceptual Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Conceptual Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Conceptual Physics - # prompt tokens": { - "description": "min=311.94, mean=311.94, max=311.94, sum=623.881 (2)", - "tab": "General information", - "score": 311.9404255319149 - }, - "Conceptual Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" - } - } - }, - { - "evaluation_name": "Electrical Engineering", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Electrical Engineering", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.662, - "details": { - "description": "min=0.662, mean=0.662, max=0.662, sum=1.324 (2)", - "tab": "Accuracy", - "Electrical Engineering - Observed inference time (s)": { - "description": "min=0.33, mean=0.33, max=0.33, sum=0.661 (2)", - "tab": "Efficiency", - "score": 0.33032174274839204 - }, - "Electrical Engineering - # eval": { - "description": "min=145, mean=145, max=145, sum=290 (2)", - "tab": "General information", - "score": 145.0 - }, - "Electrical Engineering - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Electrical Engineering - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Electrical Engineering - # prompt tokens": { - "description": "min=491.993, mean=491.993, max=491.993, sum=983.986 (2)", - "tab": "General information", - "score": 491.99310344827586 - }, - "Electrical Engineering - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" - } - } - }, - { - "evaluation_name": "Elementary Mathematics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Elementary Mathematics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.452, - "details": { - 
"description": "min=0.452, mean=0.452, max=0.452, sum=0.905 (2)", - "tab": "Accuracy", - "Elementary Mathematics - Observed inference time (s)": { - "description": "min=0.342, mean=0.342, max=0.342, sum=0.684 (2)", - "tab": "Efficiency", - "score": 0.34218634310222806 - }, - "Elementary Mathematics - # eval": { - "description": "min=378, mean=378, max=378, sum=756 (2)", - "tab": "General information", - "score": 378.0 - }, - "Elementary Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Elementary Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Elementary Mathematics - # prompt tokens": { - "description": "min=601.344, mean=601.344, max=601.344, sum=1202.688 (2)", - "tab": "General information", - "score": 601.3439153439153 - }, - "Elementary Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" - } - } - }, - { - "evaluation_name": "Formal Logic", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Formal Logic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.452, - "details": { - "description": "min=0.452, mean=0.452, max=0.452, sum=0.905 (2)", - "tab": "Accuracy", - "Formal Logic - Observed inference time (s)": { - "description": "min=0.356, mean=0.356, max=0.356, sum=0.713 (2)", - "tab": "Efficiency", - "score": 0.3562947171075003 - }, - "Formal Logic - # eval": { - "description": "min=126, mean=126, max=126, sum=252 (2)", - "tab": "General information", - "score": 126.0 - }, - "Formal Logic - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Formal Logic - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Formal Logic - # prompt tokens": { - "description": "min=675.579, mean=675.579, max=675.579, sum=1351.159 (2)", - "tab": "General information", - "score": 675.5793650793651 - }, - "Formal Logic - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" - } - } - }, - { - "evaluation_name": "High School World History", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on High School World History", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.785, - "details": { - "description": "min=0.785, mean=0.785, max=0.785, sum=1.57 (2)", - "tab": "Accuracy", - "High 
School Biology - Observed inference time (s)": { - "description": "min=0.343, mean=0.343, max=0.343, sum=0.685 (2)", - "tab": "Efficiency", - "score": 0.3425526588193832 - }, - "High School Chemistry - Observed inference time (s)": { - "description": "min=0.334, mean=0.334, max=0.334, sum=0.667 (2)", - "tab": "Efficiency", - "score": 0.3337097426353417 - }, - "High School Computer Science - Observed inference time (s)": { - "description": "min=0.411, mean=0.411, max=0.411, sum=0.822 (2)", - "tab": "Efficiency", - "score": 0.4111129188537598 - }, - "High School European History - Observed inference time (s)": { - "description": "min=0.912, mean=0.912, max=0.912, sum=1.824 (2)", - "tab": "Efficiency", - "score": 0.9120050358049797 - }, - "High School Geography - Observed inference time (s)": { - "description": "min=0.781, mean=0.781, max=0.781, sum=1.563 (2)", - "tab": "Efficiency", - "score": 0.7814190243229722 - }, - "High School Government And Politics - Observed inference time (s)": { - "description": "min=0.344, mean=0.344, max=0.344, sum=0.688 (2)", - "tab": "Efficiency", - "score": 0.3440394698029355 - }, - "High School Macroeconomics - Observed inference time (s)": { - "description": "min=0.336, mean=0.336, max=0.336, sum=0.672 (2)", - "tab": "Efficiency", - "score": 0.3361299728735899 - }, - "High School Mathematics - Observed inference time (s)": { - "description": "min=0.365, mean=0.365, max=0.365, sum=0.73 (2)", - "tab": "Efficiency", - "score": 0.36511756932293926 - }, - "High School Microeconomics - Observed inference time (s)": { - "description": "min=0.335, mean=0.335, max=0.335, sum=0.67 (2)", - "tab": "Efficiency", - "score": 0.3350923071388437 - }, - "High School Physics - Observed inference time (s)": { - "description": "min=0.363, mean=0.363, max=0.363, sum=0.727 (2)", - "tab": "Efficiency", - "score": 0.3634012266500107 - }, - "High School Psychology - Observed inference time (s)": { - "description": "min=0.339, mean=0.339, max=0.339, sum=0.678 (2)", - "tab": "Efficiency", - "score": 0.3389187379714546 - }, - "High School Statistics - Observed inference time (s)": { - "description": "min=0.384, mean=0.384, max=0.384, sum=0.767 (2)", - "tab": "Efficiency", - "score": 0.38363339724364104 - }, - "High School US History - Observed inference time (s)": { - "description": "min=0.661, mean=0.661, max=0.661, sum=1.322 (2)", - "tab": "Efficiency", - "score": 0.6610236086097419 - }, - "High School World History - Observed inference time (s)": { - "description": "min=0.502, mean=0.502, max=0.502, sum=1.004 (2)", - "tab": "Efficiency", - "score": 0.5019015682397513 - }, - "High School Biology - # eval": { - "description": "min=310, mean=310, max=310, sum=620 (2)", - "tab": "General information", - "score": 310.0 - }, - "High School Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Biology - # prompt tokens": { - "description": "min=546.394, mean=546.394, max=546.394, sum=1092.787 (2)", - "tab": "General information", - "score": 546.3935483870968 - }, - "High School Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Chemistry - # eval": { - "description": "min=203, mean=203, max=203, sum=406 (2)", - "tab": "General information", - "score": 203.0 - }, - 
"High School Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # prompt tokens": { - "description": "min=537.015, mean=537.015, max=537.015, sum=1074.03 (2)", - "tab": "General information", - "score": 537.0147783251232 - }, - "High School Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "High School Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # prompt tokens": { - "description": "min=962.1, mean=962.1, max=962.1, sum=1924.2 (2)", - "tab": "General information", - "score": 962.1 - }, - "High School Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School European History - # eval": { - "description": "min=165, mean=165, max=165, sum=330 (2)", - "tab": "General information", - "score": 165.0 - }, - "High School European History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School European History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School European History - # prompt tokens": { - "description": "min=2957.412, mean=2957.412, max=2957.412, sum=5914.824 (2)", - "tab": "General information", - "score": 2957.4121212121213 - }, - "High School European History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Geography - # eval": { - "description": "min=198, mean=198, max=198, sum=396 (2)", - "tab": "General information", - "score": 198.0 - }, - "High School Geography - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Geography - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Geography - # prompt tokens": { - "description": "min=404.035, mean=404.035, max=404.035, sum=808.071 (2)", - "tab": "General information", - "score": 404.0353535353535 - }, - "High School Geography - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Government And Politics - # eval": { - "description": "min=193, mean=193, max=193, sum=386 (2)", - "tab": "General information", - "score": 193.0 - }, - "High School Government And Politics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Government And Politics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # 
prompt tokens": { - "description": "min=484.725, mean=484.725, max=484.725, sum=969.451 (2)", - "tab": "General information", - "score": 484.7253886010363 - }, - "High School Government And Politics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Macroeconomics - # eval": { - "description": "min=390, mean=390, max=390, sum=780 (2)", - "tab": "General information", - "score": 390.0 - }, - "High School Macroeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Macroeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - # prompt tokens": { - "description": "min=398.892, mean=398.892, max=398.892, sum=797.785 (2)", - "tab": "General information", - "score": 398.89230769230767 - }, - "High School Macroeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Mathematics - # eval": { - "description": "min=270, mean=270, max=270, sum=540 (2)", - "tab": "General information", - "score": 270.0 - }, - "High School Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # prompt tokens": { - "description": "min=575.622, mean=575.622, max=575.622, sum=1151.244 (2)", - "tab": "General information", - "score": 575.6222222222223 - }, - "High School Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Microeconomics - # eval": { - "description": "min=238, mean=238, max=238, sum=476 (2)", - "tab": "General information", - "score": 238.0 - }, - "High School Microeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Microeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Microeconomics - # prompt tokens": { - "description": "min=420.739, mean=420.739, max=420.739, sum=841.479 (2)", - "tab": "General information", - "score": 420.73949579831935 - }, - "High School Microeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Physics - # eval": { - "description": "min=151, mean=151, max=151, sum=302 (2)", - "tab": "General information", - "score": 151.0 - }, - "High School Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # prompt tokens": { - "description": "min=599.411, mean=599.411, max=599.411, sum=1198.821 (2)", - "tab": "General information", - "score": 599.4105960264901 - }, - "High School Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Psychology - # eval": { - 
"description": "min=545, mean=545, max=545, sum=1090 (2)", - "tab": "General information", - "score": 545.0 - }, - "High School Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # prompt tokens": { - "description": "min=526.826, mean=526.826, max=526.826, sum=1053.651 (2)", - "tab": "General information", - "score": 526.8256880733945 - }, - "High School Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Statistics - # eval": { - "description": "min=216, mean=216, max=216, sum=432 (2)", - "tab": "General information", - "score": 216.0 - }, - "High School Statistics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Statistics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Statistics - # prompt tokens": { - "description": "min=869.778, mean=869.778, max=869.778, sum=1739.556 (2)", - "tab": "General information", - "score": 869.7777777777778 - }, - "High School Statistics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School US History - # eval": { - "description": "min=204, mean=204, max=204, sum=408 (2)", - "tab": "General information", - "score": 204.0 - }, - "High School US History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School US History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # prompt tokens": { - "description": "min=2369.132, mean=2369.132, max=2369.132, sum=4738.265 (2)", - "tab": "General information", - "score": 2369.1323529411766 - }, - "High School US History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School World History - # eval": { - "description": "min=237, mean=237, max=237, sum=474 (2)", - "tab": "General information", - "score": 237.0 - }, - "High School World History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School World History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School World History - # prompt tokens": { - "description": "min=1541.371, mean=1541.371, max=1541.371, sum=3082.743 (2)", - "tab": "General information", - "score": 1541.3713080168777 - }, - "High School World History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" - } - } - }, - { - "evaluation_name": "Human Sexuality", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Human Sexuality", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.763, - "details": { - "description": "min=0.763, mean=0.763, max=0.763, sum=1.527 (2)", - "tab": "Accuracy", - "Human Aging - Observed inference time (s)": { - "description": "min=0.346, mean=0.346, max=0.346, sum=0.691 (2)", - "tab": "Efficiency", - "score": 0.3457356803620343 - }, - "Human Sexuality - Observed inference time (s)": { - "description": "min=0.322, mean=0.322, max=0.322, sum=0.645 (2)", - "tab": "Efficiency", - "score": 0.3222540717088539 - }, - "Human Aging - # eval": { - "description": "min=223, mean=223, max=223, sum=446 (2)", - "tab": "General information", - "score": 223.0 - }, - "Human Aging - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Aging - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Aging - # prompt tokens": { - "description": "min=332.013, mean=332.013, max=332.013, sum=664.027 (2)", - "tab": "General information", - "score": 332.0134529147982 - }, - "Human Aging - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Human Sexuality - # eval": { - "description": "min=131, mean=131, max=131, sum=262 (2)", - "tab": "General information", - "score": 131.0 - }, - "Human Sexuality - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Sexuality - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # prompt tokens": { - "description": "min=367.855, mean=367.855, max=367.855, sum=735.71 (2)", - "tab": "General information", - "score": 367.85496183206106 - }, - "Human Sexuality - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" - } - } - }, - { - "evaluation_name": "International Law", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on International Law", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.769, - "details": { - "description": "min=0.769, mean=0.769, max=0.769, sum=1.537 (2)", - "tab": "Accuracy", - "International Law - Observed inference time (s)": { - "description": "min=0.356, mean=0.356, max=0.356, sum=0.711 (2)", - "tab": "Efficiency", - "score": 0.35565017274588595 - }, - "International Law - # eval": { - "description": "min=121, mean=121, max=121, sum=242 (2)", - "tab": "General information", - "score": 121.0 - }, - "International Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "International Law - 
truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "International Law - # prompt tokens": { - "description": "min=663.289, mean=663.289, max=663.289, sum=1326.579 (2)", - "tab": "General information", - "score": 663.2892561983471 - }, - "International Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" - } - } - }, - { - "evaluation_name": "Logical Fallacies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Logical Fallacies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.779, - "details": { - "description": "min=0.779, mean=0.779, max=0.779, sum=1.558 (2)", - "tab": "Accuracy", - "Logical Fallacies - Observed inference time (s)": { - "description": "min=0.352, mean=0.352, max=0.352, sum=0.703 (2)", - "tab": "Efficiency", - "score": 0.3515900117487995 - }, - "Logical Fallacies - # eval": { - "description": "min=163, mean=163, max=163, sum=326 (2)", - "tab": "General information", - "score": 163.0 - }, - "Logical Fallacies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Logical Fallacies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Logical Fallacies - # prompt tokens": { - "description": "min=466.595, mean=466.595, max=466.595, sum=933.19 (2)", - "tab": "General information", - "score": 466.5950920245399 - }, - "Logical Fallacies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" - } - } - }, - { - "evaluation_name": "Machine Learning", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Machine Learning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.411, - "details": { - "description": "min=0.411, mean=0.411, max=0.411, sum=0.821 (2)", - "tab": "Accuracy", - "Machine Learning - Observed inference time (s)": { - "description": "min=0.355, mean=0.355, max=0.355, sum=0.71 (2)", - "tab": "Efficiency", - "score": 0.35482590326241087 - }, - "Machine Learning - # eval": { - "description": "min=112, mean=112, max=112, sum=224 (2)", - "tab": "General information", - "score": 112.0 - }, - "Machine Learning - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Machine Learning - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 
- }, - "Machine Learning - # prompt tokens": { - "description": "min=720.161, mean=720.161, max=720.161, sum=1440.321 (2)", - "tab": "General information", - "score": 720.1607142857143 - }, - "Machine Learning - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" - } - } - }, - { - "evaluation_name": "Management", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Management", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.806, - "details": { - "description": "min=0.806, mean=0.806, max=0.806, sum=1.612 (2)", - "tab": "Accuracy", - "Management - Observed inference time (s)": { - "description": "min=0.337, mean=0.337, max=0.337, sum=0.674 (2)", - "tab": "Efficiency", - "score": 0.33675998622931325 - }, - "Management - # eval": { - "description": "min=103, mean=103, max=103, sum=206 (2)", - "tab": "General information", - "score": 103.0 - }, - "Management - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Management - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Management - # prompt tokens": { - "description": "min=300.544, mean=300.544, max=300.544, sum=601.087 (2)", - "tab": "General information", - "score": 300.54368932038835 - }, - "Management - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" - } - } - }, - { - "evaluation_name": "Marketing", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Marketing", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.893, - "details": { - "description": "min=0.893, mean=0.893, max=0.893, sum=1.786 (2)", - "tab": "Accuracy", - "Marketing - Observed inference time (s)": { - "description": "min=0.328, mean=0.328, max=0.328, sum=0.656 (2)", - "tab": "Efficiency", - "score": 0.3279143999784421 - }, - "Marketing - # eval": { - "description": "min=234, mean=234, max=234, sum=468 (2)", - "tab": "General information", - "score": 234.0 - }, - "Marketing - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Marketing - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Marketing - # prompt tokens": { - "description": "min=442.825, mean=442.825, max=442.825, sum=885.65 (2)", - "tab": "General information", - "score": 442.8247863247863 - }, - "Marketing - # output tokens": { - 
"description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" - } - } - }, - { - "evaluation_name": "Medical Genetics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Medical Genetics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.77, - "details": { - "description": "min=0.77, mean=0.77, max=0.77, sum=1.54 (2)", - "tab": "Accuracy", - "Medical Genetics - Observed inference time (s)": { - "description": "min=0.372, mean=0.372, max=0.372, sum=0.744 (2)", - "tab": "Efficiency", - "score": 0.3717941379547119 - }, - "Medical Genetics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Medical Genetics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Medical Genetics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Medical Genetics - # prompt tokens": { - "description": "min=362, mean=362, max=362, sum=724 (2)", - "tab": "General information", - "score": 362.0 - }, - "Medical Genetics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" - } - } - }, - { - "evaluation_name": "Miscellaneous", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Miscellaneous", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.796, - "details": { - "description": "min=0.796, mean=0.796, max=0.796, sum=1.591 (2)", - "tab": "Accuracy", - "Miscellaneous - Observed inference time (s)": { - "description": "min=0.317, mean=0.317, max=0.317, sum=0.634 (2)", - "tab": "Efficiency", - "score": 0.31703713509619313 - }, - "Miscellaneous - # eval": { - "description": "min=783, mean=783, max=783, sum=1566 (2)", - "tab": "General information", - "score": 783.0 - }, - "Miscellaneous - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Miscellaneous - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Miscellaneous - # prompt tokens": { - "description": "min=331.441, mean=331.441, max=331.441, sum=662.881 (2)", - "tab": "General information", - "score": 331.4406130268199 - }, - "Miscellaneous - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "miscellaneous", - 
"method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" - } - } - }, - { - "evaluation_name": "Moral Scenarios", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Moral Scenarios", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.335, - "details": { - "description": "min=0.335, mean=0.335, max=0.335, sum=0.67 (2)", - "tab": "Accuracy", - "Moral Disputes - Observed inference time (s)": { - "description": "min=0.321, mean=0.321, max=0.321, sum=0.643 (2)", - "tab": "Efficiency", - "score": 0.3214432848671268 - }, - "Moral Scenarios - Observed inference time (s)": { - "description": "min=0.342, mean=0.342, max=0.342, sum=0.684 (2)", - "tab": "Efficiency", - "score": 0.3421009585844072 - }, - "Moral Disputes - # eval": { - "description": "min=346, mean=346, max=346, sum=692 (2)", - "tab": "General information", - "score": 346.0 - }, - "Moral Disputes - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Disputes - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Disputes - # prompt tokens": { - "description": "min=507.913, mean=507.913, max=507.913, sum=1015.827 (2)", - "tab": "General information", - "score": 507.91329479768785 - }, - "Moral Disputes - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Moral Scenarios - # eval": { - "description": "min=895, mean=895, max=895, sum=1790 (2)", - "tab": "General information", - "score": 895.0 - }, - "Moral Scenarios - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Scenarios - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # prompt tokens": { - "description": "min=703.334, mean=703.334, max=703.334, sum=1406.668 (2)", - "tab": "General information", - "score": 703.3340782122905 - }, - "Moral Scenarios - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" - } - } - }, - { - "evaluation_name": "Nutrition", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Nutrition", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.739, - "details": { - "description": "min=0.739, mean=0.739, max=0.739, sum=1.477 (2)", - "tab": "Accuracy", - "Nutrition - Observed inference time (s)": { - "description": "min=0.354, mean=0.354, max=0.354, sum=0.708 (2)", - "tab": "Efficiency", - "score": 0.35382014474058465 - }, - "Nutrition - # eval": { - "description": "min=306, mean=306, max=306, sum=612 
(2)", - "tab": "General information", - "score": 306.0 - }, - "Nutrition - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Nutrition - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Nutrition - # prompt tokens": { - "description": "min=643.317, mean=643.317, max=643.317, sum=1286.634 (2)", - "tab": "General information", - "score": 643.3169934640523 - }, - "Nutrition - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" - } - } - }, - { - "evaluation_name": "Prehistory", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Prehistory", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.713, - "details": { - "description": "min=0.713, mean=0.713, max=0.713, sum=1.426 (2)", - "tab": "Accuracy", - "Prehistory - Observed inference time (s)": { - "description": "min=0.358, mean=0.358, max=0.358, sum=0.715 (2)", - "tab": "Efficiency", - "score": 0.3577412587625009 - }, - "Prehistory - # eval": { - "description": "min=324, mean=324, max=324, sum=648 (2)", - "tab": "General information", - "score": 324.0 - }, - "Prehistory - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Prehistory - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Prehistory - # prompt tokens": { - "description": "min=565.096, mean=565.096, max=565.096, sum=1130.191 (2)", - "tab": "General information", - "score": 565.0956790123457 - }, - "Prehistory - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" - } - } - }, - { - "evaluation_name": "Public Relations", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Public Relations", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.718, - "details": { - "description": "min=0.718, mean=0.718, max=0.718, sum=1.436 (2)", - "tab": "Accuracy", - "Public Relations - Observed inference time (s)": { - "description": "min=0.352, mean=0.352, max=0.352, sum=0.704 (2)", - "tab": "Efficiency", - "score": 0.35222616412422875 - }, - "Public Relations - # eval": { - "description": "min=110, mean=110, max=110, sum=220 (2)", - "tab": "General information", - "score": 110.0 - }, - "Public Relations - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Public Relations - 
truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Public Relations - # prompt tokens": { - "description": "min=432.436, mean=432.436, max=432.436, sum=864.873 (2)", - "tab": "General information", - "score": 432.43636363636364 - }, - "Public Relations - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" - } - } - }, - { - "evaluation_name": "Security Studies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Security Studies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.735, - "details": { - "description": "min=0.735, mean=0.735, max=0.735, sum=1.469 (2)", - "tab": "Accuracy", - "Security Studies - Observed inference time (s)": { - "description": "min=0.439, mean=0.439, max=0.439, sum=0.877 (2)", - "tab": "Efficiency", - "score": 0.4387260553788166 - }, - "Security Studies - # eval": { - "description": "min=245, mean=245, max=245, sum=490 (2)", - "tab": "General information", - "score": 245.0 - }, - "Security Studies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Security Studies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Security Studies - # prompt tokens": { - "description": "min=1227.196, mean=1227.196, max=1227.196, sum=2454.392 (2)", - "tab": "General information", - "score": 1227.1959183673468 - }, - "Security Studies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" - } - } - }, - { - "evaluation_name": "Sociology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Sociology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.831, - "details": { - "description": "min=0.831, mean=0.831, max=0.831, sum=1.662 (2)", - "tab": "Accuracy", - "Sociology - Observed inference time (s)": { - "description": "min=0.315, mean=0.315, max=0.315, sum=0.63 (2)", - "tab": "Efficiency", - "score": 0.31509182820865766 - }, - "Sociology - # eval": { - "description": "min=201, mean=201, max=201, sum=402 (2)", - "tab": "General information", - "score": 201.0 - }, - "Sociology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Sociology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Sociology - # prompt tokens": { - 
"description": "min=463.99, mean=463.99, max=463.99, sum=927.98 (2)", - "tab": "General information", - "score": 463.99004975124376 - }, - "Sociology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" - } - } - }, - { - "evaluation_name": "Virology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Virology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.452, - "details": { - "description": "min=0.452, mean=0.452, max=0.452, sum=0.904 (2)", - "tab": "Accuracy", - "Virology - Observed inference time (s)": { - "description": "min=0.352, mean=0.352, max=0.352, sum=0.705 (2)", - "tab": "Efficiency", - "score": 0.3524869034089238 - }, - "Virology - # eval": { - "description": "min=166, mean=166, max=166, sum=332 (2)", - "tab": "General information", - "score": 166.0 - }, - "Virology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Virology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Virology - # prompt tokens": { - "description": "min=363.102, mean=363.102, max=363.102, sum=726.205 (2)", - "tab": "General information", - "score": 363.1024096385542 - }, - "Virology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" - } - } - }, - { - "evaluation_name": "World Religions", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on World Religions", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.836, - "details": { - "description": "min=0.836, mean=0.836, max=0.836, sum=1.673 (2)", - "tab": "Accuracy", - "World Religions - Observed inference time (s)": { - "description": "min=0.343, mean=0.343, max=0.343, sum=0.687 (2)", - "tab": "Efficiency", - "score": 0.34344731576261467 - }, - "World Religions - # eval": { - "description": "min=171, mean=171, max=171, sum=342 (2)", - "tab": "General information", - "score": 171.0 - }, - "World Religions - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "World Religions - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "World Religions - # prompt tokens": { - "description": "min=289.971, mean=289.971, max=289.971, sum=579.942 (2)", - "tab": "General information", - "score": 289.97076023391816 - }, - "World Religions - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - 
"tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" - } - } - }, - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.651, - "details": { - "tab": "Efficiency" - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_mmlu/01-ai/yi-large-preview/5b5508aa-2956-4a38-84e2-c50b9ce08dc9.json b/data/helm_mmlu/01-ai/yi-large-preview/5b5508aa-2956-4a38-84e2-c50b9ce08dc9.json deleted file mode 100644 index 4838cda1c..000000000 --- a/data/helm_mmlu/01-ai/yi-large-preview/5b5508aa-2956-4a38-84e2-c50b9ce08dc9.json +++ /dev/null @@ -1,3021 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/01-ai_yi-large-preview/1770835937.459157", - "retrieved_timestamp": "1770835937.459157", - "source_metadata": { - "source_name": "helm_mmlu", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Yi Large Preview", - "id": "01-ai/yi-large-preview", - "developer": "01-ai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "MMLU All Subjects", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU All Subjects", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.793, - "details": { - "description": "min=0.36, mean=0.793, max=0.969, sum=90.428 (114)", - "tab": "Accuracy", - "MMLU All Subjects - Observed inference time (s)": { - "description": "min=0.621, mean=0.764, max=1.689, sum=87.08 (114)", - "tab": "Efficiency", - "score": 0.7638553584278898 - }, - "MMLU All Subjects - # eval": { - "description": "min=100, mean=246.351, max=1534, sum=28084 (114)", - "tab": "General information", - "score": 246.35087719298247 - }, - "MMLU All Subjects - # train": { - "description": "min=5, mean=5, max=5, sum=570 (114)", - "tab": "General information", - "score": 5.0 - }, - "MMLU All Subjects - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (114)", - "tab": "General information", - "score": 0.0 - }, - "MMLU All Subjects - # prompt tokens": { - "description": "min=302.971, mean=674.842, max=2970.412, sum=76931.942 (114)", - "tab": "General information", - "score": 674.8416008681387 - }, - "MMLU All Subjects - # output tokens": { - "description": "min=1, mean=1, max=1, sum=114 (114)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - 
"college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] - } - } - }, - { - "evaluation_name": "Abstract Algebra", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Abstract Algebra", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6, - "details": { - "description": "min=0.6, mean=0.6, max=0.6, sum=1.2 (2)", - "tab": "Accuracy", - "Abstract Algebra - Observed inference time (s)": { - "description": "min=0.718, mean=0.718, max=0.718, sum=1.436 (2)", - "tab": 
"Efficiency", - "score": 0.718058660030365 - }, - "Abstract Algebra - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Abstract Algebra - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Abstract Algebra - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Abstract Algebra - # prompt tokens": { - "description": "min=396.67, mean=396.67, max=396.67, sum=793.34 (2)", - "tab": "General information", - "score": 396.67 - }, - "Abstract Algebra - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" - } - } - }, - { - "evaluation_name": "Anatomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Anatomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.83, - "details": { - "description": "min=0.83, mean=0.83, max=0.83, sum=1.659 (2)", - "tab": "Accuracy", - "Anatomy - Observed inference time (s)": { - "description": "min=0.672, mean=0.672, max=0.672, sum=1.343 (2)", - "tab": "Efficiency", - "score": 0.6716545846727159 - }, - "Anatomy - # eval": { - "description": "min=135, mean=135, max=135, sum=270 (2)", - "tab": "General information", - "score": 135.0 - }, - "Anatomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Anatomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Anatomy - # prompt tokens": { - "description": "min=388.77, mean=388.77, max=388.77, sum=777.541 (2)", - "tab": "General information", - "score": 388.77037037037036 - }, - "Anatomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" - } - } - }, - { - "evaluation_name": "College Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on College Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.569, - "details": { - "description": "min=0.569, mean=0.569, max=0.569, sum=1.137 (2)", - "tab": "Accuracy", - "College Chemistry - Observed inference time (s)": { - "description": "min=0.722, mean=0.722, max=0.722, sum=1.443 (2)", - "tab": "Efficiency", - "score": 0.721672637462616 - }, - "College Biology - Observed inference time (s)": { - "description": "min=0.72, mean=0.72, max=0.72, sum=1.439 (2)", - "tab": "Efficiency", - "score": 0.7195867978864245 - }, - "College 
Computer Science - Observed inference time (s)": { - "description": "min=0.828, mean=0.828, max=0.828, sum=1.657 (2)", - "tab": "Efficiency", - "score": 0.8283914875984192 - }, - "College Mathematics - Observed inference time (s)": { - "description": "min=0.734, mean=0.734, max=0.734, sum=1.468 (2)", - "tab": "Efficiency", - "score": 0.734215636253357 - }, - "College Medicine - Observed inference time (s)": { - "description": "min=0.704, mean=0.704, max=0.704, sum=1.407 (2)", - "tab": "Efficiency", - "score": 0.7037480470073016 - }, - "College Physics - Observed inference time (s)": { - "description": "min=0.742, mean=0.742, max=0.742, sum=1.484 (2)", - "tab": "Efficiency", - "score": 0.7418750898510802 - }, - "College Chemistry - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Chemistry - # prompt tokens": { - "description": "min=610.54, mean=610.54, max=610.54, sum=1221.08 (2)", - "tab": "General information", - "score": 610.54 - }, - "College Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Biology - # eval": { - "description": "min=144, mean=144, max=144, sum=288 (2)", - "tab": "General information", - "score": 144.0 - }, - "College Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # prompt tokens": { - "description": "min=527.819, mean=527.819, max=527.819, sum=1055.639 (2)", - "tab": "General information", - "score": 527.8194444444445 - }, - "College Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer Science - # prompt tokens": { - "description": "min=896.06, mean=896.06, max=896.06, sum=1792.12 (2)", - "tab": "General information", - "score": 896.06 - }, - "College Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Mathematics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # prompt tokens": { - "description": "min=648.3, mean=648.3, max=648.3, sum=1296.6 (2)", - 
"tab": "General information", - "score": 648.3 - }, - "College Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Medicine - # eval": { - "description": "min=173, mean=173, max=173, sum=346 (2)", - "tab": "General information", - "score": 173.0 - }, - "College Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Medicine - # prompt tokens": { - "description": "min=562.688, mean=562.688, max=562.688, sum=1125.376 (2)", - "tab": "General information", - "score": 562.6878612716763 - }, - "College Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Physics - # eval": { - "description": "min=102, mean=102, max=102, sum=204 (2)", - "tab": "General information", - "score": 102.0 - }, - "College Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Physics - # prompt tokens": { - "description": "min=525.912, mean=525.912, max=525.912, sum=1051.824 (2)", - "tab": "General information", - "score": 525.9117647058823 - }, - "College Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" - } - } - }, - { - "evaluation_name": "Computer Security", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Computer Security", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.86, - "details": { - "description": "min=0.86, mean=0.86, max=0.86, sum=1.72 (2)", - "tab": "Accuracy", - "Computer Security - Observed inference time (s)": { - "description": "min=0.679, mean=0.679, max=0.679, sum=1.358 (2)", - "tab": "Efficiency", - "score": 0.6791670727729797 - }, - "Computer Security - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Computer Security - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Computer Security - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Computer Security - # prompt tokens": { - "description": "min=418.74, mean=418.74, max=418.74, sum=837.48 (2)", - "tab": "General information", - "score": 418.74 - }, - "Computer Security - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "computer_security", - "method": "multiple_choice_joint", - 
"eval_split": "test", - "groups": "mmlu_computer_security" - } - } - }, - { - "evaluation_name": "Econometrics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Econometrics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.728, - "details": { - "description": "min=0.728, mean=0.728, max=0.728, sum=1.456 (2)", - "tab": "Accuracy", - "Econometrics - Observed inference time (s)": { - "description": "min=0.752, mean=0.752, max=0.752, sum=1.504 (2)", - "tab": "Efficiency", - "score": 0.7519724473618624 - }, - "Econometrics - # eval": { - "description": "min=114, mean=114, max=114, sum=228 (2)", - "tab": "General information", - "score": 114.0 - }, - "Econometrics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Econometrics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Econometrics - # prompt tokens": { - "description": "min=680.789, mean=680.789, max=680.789, sum=1361.579 (2)", - "tab": "General information", - "score": 680.7894736842105 - }, - "Econometrics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" - } - } - }, - { - "evaluation_name": "Global Facts", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Global Facts", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.52, - "details": { - "description": "min=0.52, mean=0.52, max=0.52, sum=1.04 (2)", - "tab": "Accuracy", - "Global Facts - Observed inference time (s)": { - "description": "min=0.7, mean=0.7, max=0.7, sum=1.401 (2)", - "tab": "Efficiency", - "score": 0.7004458856582642 - }, - "Global Facts - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Global Facts - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Global Facts - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Global Facts - # prompt tokens": { - "description": "min=475.32, mean=475.32, max=475.32, sum=950.64 (2)", - "tab": "General information", - "score": 475.32 - }, - "Global Facts - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" - } - } - }, - { - "evaluation_name": "Jurisprudence", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Jurisprudence", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.852, - "details": { - "description": "min=0.852, mean=0.852, max=0.852, sum=1.704 (2)", - "tab": "Accuracy", - "Jurisprudence - Observed inference time (s)": { - "description": "min=0.709, mean=0.709, max=0.709, sum=1.417 (2)", - "tab": "Efficiency", - "score": 0.7087078028255038 - }, - "Jurisprudence - # eval": { - "description": "min=108, mean=108, max=108, sum=216 (2)", - "tab": "General information", - "score": 108.0 - }, - "Jurisprudence - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Jurisprudence - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Jurisprudence - # prompt tokens": { - "description": "min=444.898, mean=444.898, max=444.898, sum=889.796 (2)", - "tab": "General information", - "score": 444.89814814814815 - }, - "Jurisprudence - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" - } - } - }, - { - "evaluation_name": "Philosophy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Philosophy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.842, - "details": { - "description": "min=0.842, mean=0.842, max=0.842, sum=1.685 (2)", - "tab": "Accuracy", - "Philosophy - Observed inference time (s)": { - "description": "min=0.665, mean=0.665, max=0.665, sum=1.33 (2)", - "tab": "Efficiency", - "score": 0.6652177269435772 - }, - "Philosophy - # eval": { - "description": "min=311, mean=311, max=311, sum=622 (2)", - "tab": "General information", - "score": 311.0 - }, - "Philosophy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Philosophy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Philosophy - # prompt tokens": { - "description": "min=369.723, mean=369.723, max=369.723, sum=739.447 (2)", - "tab": "General information", - "score": 369.7234726688103 - }, - "Philosophy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" - } - } - }, - { - "evaluation_name": "Professional Psychology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Professional Psychology", 
- "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.853, - "details": { - "description": "min=0.853, mean=0.853, max=0.853, sum=1.706 (2)", - "tab": "Accuracy", - "Professional Medicine - Observed inference time (s)": { - "description": "min=0.906, mean=0.906, max=0.906, sum=1.813 (2)", - "tab": "Efficiency", - "score": 0.9064707010984421 - }, - "Professional Accounting - Observed inference time (s)": { - "description": "min=0.774, mean=0.774, max=0.774, sum=1.549 (2)", - "tab": "Efficiency", - "score": 0.7743352516323116 - }, - "Professional Law - Observed inference time (s)": { - "description": "min=1.112, mean=1.112, max=1.112, sum=2.224 (2)", - "tab": "Efficiency", - "score": 1.1117667775732287 - }, - "Professional Psychology - Observed inference time (s)": { - "description": "min=0.729, mean=0.729, max=0.729, sum=1.458 (2)", - "tab": "Efficiency", - "score": 0.7289925248794307 - }, - "Professional Medicine - # eval": { - "description": "min=272, mean=272, max=272, sum=544 (2)", - "tab": "General information", - "score": 272.0 - }, - "Professional Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Medicine - # prompt tokens": { - "description": "min=1215.533, mean=1215.533, max=1215.533, sum=2431.066 (2)", - "tab": "General information", - "score": 1215.5330882352941 - }, - "Professional Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Accounting - # eval": { - "description": "min=282, mean=282, max=282, sum=564 (2)", - "tab": "General information", - "score": 282.0 - }, - "Professional Accounting - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Accounting - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Accounting - # prompt tokens": { - "description": "min=784.16, mean=784.16, max=784.16, sum=1568.319 (2)", - "tab": "General information", - "score": 784.1595744680851 - }, - "Professional Accounting - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Law - # eval": { - "description": "min=1534, mean=1534, max=1534, sum=3068 (2)", - "tab": "General information", - "score": 1534.0 - }, - "Professional Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # prompt tokens": { - "description": "min=1772.098, mean=1772.098, max=1772.098, sum=3544.197 (2)", - "tab": "General information", - "score": 1772.0984354628422 - }, - "Professional Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Psychology - # eval": { - "description": "min=612, mean=612, max=612, sum=1224 (2)", - "tab": "General information", - "score": 612.0 - }, - "Professional Psychology - # train": { - "description": "min=5, mean=5, 
max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # prompt tokens": { - "description": "min=621.201, mean=621.201, max=621.201, sum=1242.402 (2)", - "tab": "General information", - "score": 621.2009803921569 - }, - "Professional Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" - } - } - }, - { - "evaluation_name": "Us Foreign Policy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Us Foreign Policy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.85, - "details": { - "description": "min=0.85, mean=0.85, max=0.85, sum=1.7 (2)", - "tab": "Accuracy", - "Us Foreign Policy - Observed inference time (s)": { - "description": "min=0.696, mean=0.696, max=0.696, sum=1.392 (2)", - "tab": "Efficiency", - "score": 0.6958462524414063 - }, - "Us Foreign Policy - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Us Foreign Policy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Us Foreign Policy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Us Foreign Policy - # prompt tokens": { - "description": "min=471.53, mean=471.53, max=471.53, sum=943.06 (2)", - "tab": "General information", - "score": 471.53 - }, - "Us Foreign Policy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" - } - } - }, - { - "evaluation_name": "Astronomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Astronomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.914, - "details": { - "description": "min=0.914, mean=0.914, max=0.914, sum=1.829 (2)", - "tab": "Accuracy", - "Astronomy - Observed inference time (s)": { - "description": "min=0.76, mean=0.76, max=0.76, sum=1.521 (2)", - "tab": "Efficiency", - "score": 0.7604575784582841 - }, - "Astronomy - # eval": { - "description": "min=152, mean=152, max=152, sum=304 (2)", - "tab": "General information", - "score": 152.0 - }, - "Astronomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Astronomy - truncated": { - "description": "min=0, mean=0, 
max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Astronomy - # prompt tokens": { - "description": "min=639.895, mean=639.895, max=639.895, sum=1279.789 (2)", - "tab": "General information", - "score": 639.8947368421053 - }, - "Astronomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" - } - } - }, - { - "evaluation_name": "Business Ethics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Business Ethics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8, - "details": { - "description": "min=0.8, mean=0.8, max=0.8, sum=1.6 (2)", - "tab": "Accuracy", - "Business Ethics - Observed inference time (s)": { - "description": "min=0.731, mean=0.731, max=0.731, sum=1.463 (2)", - "tab": "Efficiency", - "score": 0.7314971995353698 - }, - "Business Ethics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Business Ethics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Business Ethics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Business Ethics - # prompt tokens": { - "description": "min=629.97, mean=629.97, max=629.97, sum=1259.94 (2)", - "tab": "General information", - "score": 629.97 - }, - "Business Ethics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" - } - } - }, - { - "evaluation_name": "Clinical Knowledge", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Clinical Knowledge", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.857, - "details": { - "description": "min=0.857, mean=0.857, max=0.857, sum=1.713 (2)", - "tab": "Accuracy", - "Clinical Knowledge - Observed inference time (s)": { - "description": "min=0.688, mean=0.688, max=0.688, sum=1.376 (2)", - "tab": "Efficiency", - "score": 0.6877818728392979 - }, - "Clinical Knowledge - # eval": { - "description": "min=265, mean=265, max=265, sum=530 (2)", - "tab": "General information", - "score": 265.0 - }, - "Clinical Knowledge - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Clinical Knowledge - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Clinical Knowledge - # prompt tokens": { - "description": "min=459.966, mean=459.966, max=459.966, 
sum=919.932 (2)", - "tab": "General information", - "score": 459.96603773584906 - }, - "Clinical Knowledge - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" - } - } - }, - { - "evaluation_name": "Conceptual Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Conceptual Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.864, - "details": { - "description": "min=0.864, mean=0.864, max=0.864, sum=1.728 (2)", - "tab": "Accuracy", - "Conceptual Physics - Observed inference time (s)": { - "description": "min=0.659, mean=0.659, max=0.659, sum=1.319 (2)", - "tab": "Efficiency", - "score": 0.6594150309867047 - }, - "Conceptual Physics - # eval": { - "description": "min=235, mean=235, max=235, sum=470 (2)", - "tab": "General information", - "score": 235.0 - }, - "Conceptual Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Conceptual Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Conceptual Physics - # prompt tokens": { - "description": "min=324.94, mean=324.94, max=324.94, sum=649.881 (2)", - "tab": "General information", - "score": 324.9404255319149 - }, - "Conceptual Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" - } - } - }, - { - "evaluation_name": "Electrical Engineering", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Electrical Engineering", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.779, - "details": { - "description": "min=0.779, mean=0.779, max=0.779, sum=1.559 (2)", - "tab": "Accuracy", - "Electrical Engineering - Observed inference time (s)": { - "description": "min=0.697, mean=0.697, max=0.697, sum=1.394 (2)", - "tab": "Efficiency", - "score": 0.6971425631950642 - }, - "Electrical Engineering - # eval": { - "description": "min=145, mean=145, max=145, sum=290 (2)", - "tab": "General information", - "score": 145.0 - }, - "Electrical Engineering - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Electrical Engineering - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Electrical Engineering - # prompt tokens": { - "description": "min=504.993, mean=504.993, max=504.993, sum=1009.986 (2)", - "tab": "General information", - 
"score": 504.99310344827586 - }, - "Electrical Engineering - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" - } - } - }, - { - "evaluation_name": "Elementary Mathematics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Elementary Mathematics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.685, - "details": { - "description": "min=0.685, mean=0.685, max=0.685, sum=1.37 (2)", - "tab": "Accuracy", - "Elementary Mathematics - Observed inference time (s)": { - "description": "min=0.715, mean=0.715, max=0.715, sum=1.43 (2)", - "tab": "Efficiency", - "score": 0.7149287146866006 - }, - "Elementary Mathematics - # eval": { - "description": "min=378, mean=378, max=378, sum=756 (2)", - "tab": "General information", - "score": 378.0 - }, - "Elementary Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Elementary Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Elementary Mathematics - # prompt tokens": { - "description": "min=614.344, mean=614.344, max=614.344, sum=1228.688 (2)", - "tab": "General information", - "score": 614.3439153439153 - }, - "Elementary Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" - } - } - }, - { - "evaluation_name": "Formal Logic", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Formal Logic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.603, - "details": { - "description": "min=0.603, mean=0.603, max=0.603, sum=1.206 (2)", - "tab": "Accuracy", - "Formal Logic - Observed inference time (s)": { - "description": "min=0.761, mean=0.761, max=0.761, sum=1.522 (2)", - "tab": "Efficiency", - "score": 0.7611211935679117 - }, - "Formal Logic - # eval": { - "description": "min=126, mean=126, max=126, sum=252 (2)", - "tab": "General information", - "score": 126.0 - }, - "Formal Logic - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Formal Logic - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Formal Logic - # prompt tokens": { - "description": "min=688.579, mean=688.579, max=688.579, sum=1377.159 (2)", - "tab": "General information", - "score": 688.5793650793651 - }, - "Formal Logic - # output tokens": { - 
"description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" - } - } - }, - { - "evaluation_name": "High School World History", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on High School World History", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.928, - "details": { - "description": "min=0.928, mean=0.928, max=0.928, sum=1.857 (2)", - "tab": "Accuracy", - "High School Biology - Observed inference time (s)": { - "description": "min=0.739, mean=0.739, max=0.739, sum=1.478 (2)", - "tab": "Efficiency", - "score": 0.7389615043517082 - }, - "High School Chemistry - Observed inference time (s)": { - "description": "min=0.727, mean=0.727, max=0.727, sum=1.454 (2)", - "tab": "Efficiency", - "score": 0.7272039317145136 - }, - "High School Computer Science - Observed inference time (s)": { - "description": "min=0.877, mean=0.877, max=0.877, sum=1.754 (2)", - "tab": "Efficiency", - "score": 0.8772388291358948 - }, - "High School European History - Observed inference time (s)": { - "description": "min=1.689, mean=1.689, max=1.689, sum=3.378 (2)", - "tab": "Efficiency", - "score": 1.6891969362894694 - }, - "High School Geography - Observed inference time (s)": { - "description": "min=0.725, mean=0.725, max=0.725, sum=1.451 (2)", - "tab": "Efficiency", - "score": 0.7252739162156077 - }, - "High School Government And Politics - Observed inference time (s)": { - "description": "min=0.693, mean=0.693, max=0.693, sum=1.387 (2)", - "tab": "Efficiency", - "score": 0.6934328054517044 - }, - "High School Macroeconomics - Observed inference time (s)": { - "description": "min=0.684, mean=0.684, max=0.684, sum=1.367 (2)", - "tab": "Efficiency", - "score": 0.6835794656704633 - }, - "High School Mathematics - Observed inference time (s)": { - "description": "min=0.716, mean=0.716, max=0.716, sum=1.432 (2)", - "tab": "Efficiency", - "score": 0.7162466013873064 - }, - "High School Microeconomics - Observed inference time (s)": { - "description": "min=0.711, mean=0.711, max=0.711, sum=1.422 (2)", - "tab": "Efficiency", - "score": 0.7111842982909259 - }, - "High School Physics - Observed inference time (s)": { - "description": "min=0.74, mean=0.74, max=0.74, sum=1.481 (2)", - "tab": "Efficiency", - "score": 0.7403108505223761 - }, - "High School Psychology - Observed inference time (s)": { - "description": "min=0.7, mean=0.7, max=0.7, sum=1.4 (2)", - "tab": "Efficiency", - "score": 0.7000295271567248 - }, - "High School Statistics - Observed inference time (s)": { - "description": "min=0.833, mean=0.833, max=0.833, sum=1.666 (2)", - "tab": "Efficiency", - "score": 0.8330503514519444 - }, - "High School US History - Observed inference time (s)": { - "description": "min=1.349, mean=1.349, max=1.349, sum=2.698 (2)", - "tab": "Efficiency", - "score": 1.3490371108055115 - }, - "High School World History - Observed inference time (s)": { - "description": "min=1.047, mean=1.047, max=1.047, sum=2.093 (2)", - "tab": "Efficiency", - "score": 1.046591958919155 - }, - "High School Biology - # eval": { - 
"description": "min=310, mean=310, max=310, sum=620 (2)", - "tab": "General information", - "score": 310.0 - }, - "High School Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Biology - # prompt tokens": { - "description": "min=559.394, mean=559.394, max=559.394, sum=1118.787 (2)", - "tab": "General information", - "score": 559.3935483870968 - }, - "High School Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Chemistry - # eval": { - "description": "min=203, mean=203, max=203, sum=406 (2)", - "tab": "General information", - "score": 203.0 - }, - "High School Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # prompt tokens": { - "description": "min=550.015, mean=550.015, max=550.015, sum=1100.03 (2)", - "tab": "General information", - "score": 550.0147783251232 - }, - "High School Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "High School Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # prompt tokens": { - "description": "min=975.1, mean=975.1, max=975.1, sum=1950.2 (2)", - "tab": "General information", - "score": 975.1 - }, - "High School Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School European History - # eval": { - "description": "min=165, mean=165, max=165, sum=330 (2)", - "tab": "General information", - "score": 165.0 - }, - "High School European History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School European History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School European History - # prompt tokens": { - "description": "min=2970.412, mean=2970.412, max=2970.412, sum=5940.824 (2)", - "tab": "General information", - "score": 2970.4121212121213 - }, - "High School European History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Geography - # eval": { - "description": "min=198, mean=198, max=198, sum=396 (2)", - "tab": "General information", - "score": 198.0 - }, - "High School Geography - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Geography - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", 
- "score": 0.0 - }, - "High School Geography - # prompt tokens": { - "description": "min=417.035, mean=417.035, max=417.035, sum=834.071 (2)", - "tab": "General information", - "score": 417.0353535353535 - }, - "High School Geography - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Government And Politics - # eval": { - "description": "min=193, mean=193, max=193, sum=386 (2)", - "tab": "General information", - "score": 193.0 - }, - "High School Government And Politics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Government And Politics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # prompt tokens": { - "description": "min=497.725, mean=497.725, max=497.725, sum=995.451 (2)", - "tab": "General information", - "score": 497.7253886010363 - }, - "High School Government And Politics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Macroeconomics - # eval": { - "description": "min=390, mean=390, max=390, sum=780 (2)", - "tab": "General information", - "score": 390.0 - }, - "High School Macroeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Macroeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - # prompt tokens": { - "description": "min=411.892, mean=411.892, max=411.892, sum=823.785 (2)", - "tab": "General information", - "score": 411.89230769230767 - }, - "High School Macroeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Mathematics - # eval": { - "description": "min=270, mean=270, max=270, sum=540 (2)", - "tab": "General information", - "score": 270.0 - }, - "High School Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # prompt tokens": { - "description": "min=588.622, mean=588.622, max=588.622, sum=1177.244 (2)", - "tab": "General information", - "score": 588.6222222222223 - }, - "High School Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Microeconomics - # eval": { - "description": "min=238, mean=238, max=238, sum=476 (2)", - "tab": "General information", - "score": 238.0 - }, - "High School Microeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Microeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Microeconomics - # prompt tokens": { - "description": "min=433.739, mean=433.739, max=433.739, sum=867.479 (2)", - "tab": "General information", - "score": 433.73949579831935 - }, - "High School Microeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, 
sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Physics - # eval": { - "description": "min=151, mean=151, max=151, sum=302 (2)", - "tab": "General information", - "score": 151.0 - }, - "High School Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # prompt tokens": { - "description": "min=612.411, mean=612.411, max=612.411, sum=1224.821 (2)", - "tab": "General information", - "score": 612.4105960264901 - }, - "High School Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Psychology - # eval": { - "description": "min=545, mean=545, max=545, sum=1090 (2)", - "tab": "General information", - "score": 545.0 - }, - "High School Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # prompt tokens": { - "description": "min=539.826, mean=539.826, max=539.826, sum=1079.651 (2)", - "tab": "General information", - "score": 539.8256880733945 - }, - "High School Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Statistics - # eval": { - "description": "min=216, mean=216, max=216, sum=432 (2)", - "tab": "General information", - "score": 216.0 - }, - "High School Statistics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Statistics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Statistics - # prompt tokens": { - "description": "min=882.778, mean=882.778, max=882.778, sum=1765.556 (2)", - "tab": "General information", - "score": 882.7777777777778 - }, - "High School Statistics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School US History - # eval": { - "description": "min=204, mean=204, max=204, sum=408 (2)", - "tab": "General information", - "score": 204.0 - }, - "High School US History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School US History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # prompt tokens": { - "description": "min=2382.132, mean=2382.132, max=2382.132, sum=4764.265 (2)", - "tab": "General information", - "score": 2382.1323529411766 - }, - "High School US History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School World History - # eval": { - "description": "min=237, mean=237, max=237, sum=474 (2)", - "tab": "General information", - "score": 237.0 - }, - "High School World History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School World History - truncated": { - 
"description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School World History - # prompt tokens": { - "description": "min=1554.371, mean=1554.371, max=1554.371, sum=3108.743 (2)", - "tab": "General information", - "score": 1554.3713080168777 - }, - "High School World History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" - } - } - }, - { - "evaluation_name": "Human Sexuality", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Human Sexuality", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.901, - "details": { - "description": "min=0.901, mean=0.901, max=0.901, sum=1.802 (2)", - "tab": "Accuracy", - "Human Aging - Observed inference time (s)": { - "description": "min=0.66, mean=0.66, max=0.66, sum=1.32 (2)", - "tab": "Efficiency", - "score": 0.6601343742935112 - }, - "Human Sexuality - Observed inference time (s)": { - "description": "min=0.704, mean=0.704, max=0.704, sum=1.409 (2)", - "tab": "Efficiency", - "score": 0.7043184669873187 - }, - "Human Aging - # eval": { - "description": "min=223, mean=223, max=223, sum=446 (2)", - "tab": "General information", - "score": 223.0 - }, - "Human Aging - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Aging - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Aging - # prompt tokens": { - "description": "min=345.013, mean=345.013, max=345.013, sum=690.027 (2)", - "tab": "General information", - "score": 345.0134529147982 - }, - "Human Aging - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Human Sexuality - # eval": { - "description": "min=131, mean=131, max=131, sum=262 (2)", - "tab": "General information", - "score": 131.0 - }, - "Human Sexuality - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Sexuality - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # prompt tokens": { - "description": "min=380.855, mean=380.855, max=380.855, sum=761.71 (2)", - "tab": "General information", - "score": 380.85496183206106 - }, - "Human Sexuality - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" - } - } - }, - { - "evaluation_name": "International Law", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - 
"metric_config": { - "evaluation_description": "EM on International Law", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.917, - "details": { - "description": "min=0.917, mean=0.917, max=0.917, sum=1.835 (2)", - "tab": "Accuracy", - "International Law - Observed inference time (s)": { - "description": "min=0.769, mean=0.769, max=0.769, sum=1.538 (2)", - "tab": "Efficiency", - "score": 0.7691502098209602 - }, - "International Law - # eval": { - "description": "min=121, mean=121, max=121, sum=242 (2)", - "tab": "General information", - "score": 121.0 - }, - "International Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "International Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "International Law - # prompt tokens": { - "description": "min=676.289, mean=676.289, max=676.289, sum=1352.579 (2)", - "tab": "General information", - "score": 676.2892561983471 - }, - "International Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" - } - } - }, - { - "evaluation_name": "Logical Fallacies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Logical Fallacies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.865, - "details": { - "description": "min=0.865, mean=0.865, max=0.865, sum=1.73 (2)", - "tab": "Accuracy", - "Logical Fallacies - Observed inference time (s)": { - "description": "min=0.684, mean=0.684, max=0.684, sum=1.367 (2)", - "tab": "Efficiency", - "score": 0.6835026492370418 - }, - "Logical Fallacies - # eval": { - "description": "min=163, mean=163, max=163, sum=326 (2)", - "tab": "General information", - "score": 163.0 - }, - "Logical Fallacies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Logical Fallacies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Logical Fallacies - # prompt tokens": { - "description": "min=479.595, mean=479.595, max=479.595, sum=959.19 (2)", - "tab": "General information", - "score": 479.5950920245399 - }, - "Logical Fallacies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" - } - } - }, - { - "evaluation_name": "Machine Learning", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Machine Learning", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.616, - "details": { - "description": "min=0.616, mean=0.616, max=0.616, sum=1.232 (2)", - "tab": "Accuracy", - "Machine Learning - Observed inference time (s)": { - "description": "min=0.745, mean=0.745, max=0.745, sum=1.489 (2)", - "tab": "Efficiency", - "score": 0.7447149263960975 - }, - "Machine Learning - # eval": { - "description": "min=112, mean=112, max=112, sum=224 (2)", - "tab": "General information", - "score": 112.0 - }, - "Machine Learning - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Machine Learning - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Machine Learning - # prompt tokens": { - "description": "min=733.161, mean=733.161, max=733.161, sum=1466.321 (2)", - "tab": "General information", - "score": 733.1607142857143 - }, - "Machine Learning - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" - } - } - }, - { - "evaluation_name": "Management", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Management", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.903, - "details": { - "description": "min=0.903, mean=0.903, max=0.903, sum=1.806 (2)", - "tab": "Accuracy", - "Management - Observed inference time (s)": { - "description": "min=0.621, mean=0.621, max=0.621, sum=1.243 (2)", - "tab": "Efficiency", - "score": 0.6213390433672562 - }, - "Management - # eval": { - "description": "min=103, mean=103, max=103, sum=206 (2)", - "tab": "General information", - "score": 103.0 - }, - "Management - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Management - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Management - # prompt tokens": { - "description": "min=313.544, mean=313.544, max=313.544, sum=627.087 (2)", - "tab": "General information", - "score": 313.54368932038835 - }, - "Management - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" - } - } - }, - { - "evaluation_name": "Marketing", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Marketing", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.927, - "details": { - "description": "min=0.927, mean=0.927, max=0.927, sum=1.855 (2)", - 
"tab": "Accuracy", - "Marketing - Observed inference time (s)": { - "description": "min=0.679, mean=0.679, max=0.679, sum=1.357 (2)", - "tab": "Efficiency", - "score": 0.6785362948719252 - }, - "Marketing - # eval": { - "description": "min=234, mean=234, max=234, sum=468 (2)", - "tab": "General information", - "score": 234.0 - }, - "Marketing - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Marketing - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Marketing - # prompt tokens": { - "description": "min=455.825, mean=455.825, max=455.825, sum=911.65 (2)", - "tab": "General information", - "score": 455.8247863247863 - }, - "Marketing - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" - } - } - }, - { - "evaluation_name": "Medical Genetics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Medical Genetics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.83, - "details": { - "description": "min=0.83, mean=0.83, max=0.83, sum=1.66 (2)", - "tab": "Accuracy", - "Medical Genetics - Observed inference time (s)": { - "description": "min=0.689, mean=0.689, max=0.689, sum=1.379 (2)", - "tab": "Efficiency", - "score": 0.6893473124504089 - }, - "Medical Genetics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Medical Genetics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Medical Genetics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Medical Genetics - # prompt tokens": { - "description": "min=375, mean=375, max=375, sum=750 (2)", - "tab": "General information", - "score": 375.0 - }, - "Medical Genetics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" - } - } - }, - { - "evaluation_name": "Miscellaneous", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Miscellaneous", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.916, - "details": { - "description": "min=0.916, mean=0.916, max=0.916, sum=1.831 (2)", - "tab": "Accuracy", - "Miscellaneous - Observed inference time (s)": { - "description": "min=0.633, mean=0.633, max=0.633, sum=1.266 (2)", - "tab": "Efficiency", - "score": 0.6329697509073815 - }, - "Miscellaneous - # 
eval": { - "description": "min=783, mean=783, max=783, sum=1566 (2)", - "tab": "General information", - "score": 783.0 - }, - "Miscellaneous - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Miscellaneous - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Miscellaneous - # prompt tokens": { - "description": "min=344.441, mean=344.441, max=344.441, sum=688.881 (2)", - "tab": "General information", - "score": 344.4406130268199 - }, - "Miscellaneous - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" - } - } - }, - { - "evaluation_name": "Moral Scenarios", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Moral Scenarios", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.831, - "details": { - "description": "min=0.831, mean=0.831, max=0.831, sum=1.663 (2)", - "tab": "Accuracy", - "Moral Disputes - Observed inference time (s)": { - "description": "min=0.703, mean=0.703, max=0.703, sum=1.406 (2)", - "tab": "Efficiency", - "score": 0.7028186107646524 - }, - "Moral Scenarios - Observed inference time (s)": { - "description": "min=0.754, mean=0.754, max=0.754, sum=1.509 (2)", - "tab": "Efficiency", - "score": 0.7543408100831442 - }, - "Moral Disputes - # eval": { - "description": "min=346, mean=346, max=346, sum=692 (2)", - "tab": "General information", - "score": 346.0 - }, - "Moral Disputes - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Disputes - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Disputes - # prompt tokens": { - "description": "min=520.913, mean=520.913, max=520.913, sum=1041.827 (2)", - "tab": "General information", - "score": 520.9132947976879 - }, - "Moral Disputes - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Moral Scenarios - # eval": { - "description": "min=895, mean=895, max=895, sum=1790 (2)", - "tab": "General information", - "score": 895.0 - }, - "Moral Scenarios - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Scenarios - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # prompt tokens": { - "description": "min=716.334, mean=716.334, max=716.334, sum=1432.668 (2)", - "tab": "General information", - "score": 716.3340782122905 - }, - "Moral Scenarios - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" - } - } - }, - { - 
"evaluation_name": "Nutrition", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Nutrition", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.846, - "details": { - "description": "min=0.846, mean=0.846, max=0.846, sum=1.693 (2)", - "tab": "Accuracy", - "Nutrition - Observed inference time (s)": { - "description": "min=0.721, mean=0.721, max=0.721, sum=1.442 (2)", - "tab": "Efficiency", - "score": 0.7212473138485079 - }, - "Nutrition - # eval": { - "description": "min=306, mean=306, max=306, sum=612 (2)", - "tab": "General information", - "score": 306.0 - }, - "Nutrition - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Nutrition - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Nutrition - # prompt tokens": { - "description": "min=656.317, mean=656.317, max=656.317, sum=1312.634 (2)", - "tab": "General information", - "score": 656.3169934640523 - }, - "Nutrition - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" - } - } - }, - { - "evaluation_name": "Prehistory", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Prehistory", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.892, - "details": { - "description": "min=0.892, mean=0.892, max=0.892, sum=1.784 (2)", - "tab": "Accuracy", - "Prehistory - Observed inference time (s)": { - "description": "min=0.712, mean=0.712, max=0.712, sum=1.423 (2)", - "tab": "Efficiency", - "score": 0.7115242841802998 - }, - "Prehistory - # eval": { - "description": "min=324, mean=324, max=324, sum=648 (2)", - "tab": "General information", - "score": 324.0 - }, - "Prehistory - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Prehistory - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Prehistory - # prompt tokens": { - "description": "min=578.096, mean=578.096, max=578.096, sum=1156.191 (2)", - "tab": "General information", - "score": 578.0956790123457 - }, - "Prehistory - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" - } - } - }, - { - "evaluation_name": "Public Relations", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - 
"metric_config": { - "evaluation_description": "EM on Public Relations", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.827, - "details": { - "description": "min=0.827, mean=0.827, max=0.827, sum=1.655 (2)", - "tab": "Accuracy", - "Public Relations - Observed inference time (s)": { - "description": "min=0.708, mean=0.708, max=0.708, sum=1.417 (2)", - "tab": "Efficiency", - "score": 0.708361968127164 - }, - "Public Relations - # eval": { - "description": "min=110, mean=110, max=110, sum=220 (2)", - "tab": "General information", - "score": 110.0 - }, - "Public Relations - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Public Relations - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Public Relations - # prompt tokens": { - "description": "min=445.436, mean=445.436, max=445.436, sum=890.873 (2)", - "tab": "General information", - "score": 445.43636363636364 - }, - "Public Relations - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" - } - } - }, - { - "evaluation_name": "Security Studies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Security Studies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.82, - "details": { - "description": "min=0.82, mean=0.82, max=0.82, sum=1.641 (2)", - "tab": "Accuracy", - "Security Studies - Observed inference time (s)": { - "description": "min=0.92, mean=0.92, max=0.92, sum=1.84 (2)", - "tab": "Efficiency", - "score": 0.9198286231683225 - }, - "Security Studies - # eval": { - "description": "min=245, mean=245, max=245, sum=490 (2)", - "tab": "General information", - "score": 245.0 - }, - "Security Studies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Security Studies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Security Studies - # prompt tokens": { - "description": "min=1240.196, mean=1240.196, max=1240.196, sum=2480.392 (2)", - "tab": "General information", - "score": 1240.1959183673468 - }, - "Security Studies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" - } - } - }, - { - "evaluation_name": "Sociology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Sociology", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.881, - "details": { - "description": "min=0.881, mean=0.881, max=0.881, sum=1.761 (2)", - "tab": "Accuracy", - "Sociology - Observed inference time (s)": { - "description": "min=0.71, mean=0.71, max=0.71, sum=1.421 (2)", - "tab": "Efficiency", - "score": 0.7103830344641386 - }, - "Sociology - # eval": { - "description": "min=201, mean=201, max=201, sum=402 (2)", - "tab": "General information", - "score": 201.0 - }, - "Sociology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Sociology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Sociology - # prompt tokens": { - "description": "min=476.99, mean=476.99, max=476.99, sum=953.98 (2)", - "tab": "General information", - "score": 476.99004975124376 - }, - "Sociology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" - } - } - }, - { - "evaluation_name": "Virology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Virology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.59, - "details": { - "description": "min=0.59, mean=0.59, max=0.59, sum=1.181 (2)", - "tab": "Accuracy", - "Virology - Observed inference time (s)": { - "description": "min=0.677, mean=0.677, max=0.677, sum=1.354 (2)", - "tab": "Efficiency", - "score": 0.6768132835985666 - }, - "Virology - # eval": { - "description": "min=166, mean=166, max=166, sum=332 (2)", - "tab": "General information", - "score": 166.0 - }, - "Virology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Virology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Virology - # prompt tokens": { - "description": "min=376.102, mean=376.102, max=376.102, sum=752.205 (2)", - "tab": "General information", - "score": 376.1024096385542 - }, - "Virology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" - } - } - }, - { - "evaluation_name": "World Religions", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on World Religions", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.871, - "details": { - "description": "min=0.871, mean=0.871, max=0.871, sum=1.743 (2)", - "tab": "Accuracy", - "World Religions - Observed inference time (s)": { - "description": "min=0.645, 
mean=0.645, max=0.645, sum=1.289 (2)", - "tab": "Efficiency", - "score": 0.644616849241201 - }, - "World Religions - # eval": { - "description": "min=171, mean=171, max=171, sum=342 (2)", - "tab": "General information", - "score": 171.0 - }, - "World Religions - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "World Religions - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "World Religions - # prompt tokens": { - "description": "min=302.971, mean=302.971, max=302.971, sum=605.942 (2)", - "tab": "General information", - "score": 302.97076023391816 - }, - "World Religions - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" - } - } - }, - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.258, - "details": { - "tab": "Efficiency" - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -}
\ No newline at end of file
diff --git a/data/helm_mmlu/ai21/jamba-1.5-large/0e14f2da-72a0-451a-ad35-d8ecd9e27d3f.json b/data/helm_mmlu/ai21/jamba-1.5-large/0e14f2da-72a0-451a-ad35-d8ecd9e27d3f.json
deleted file mode 100644
index 45536e1a1..000000000
--- a/data/helm_mmlu/ai21/jamba-1.5-large/0e14f2da-72a0-451a-ad35-d8ecd9e27d3f.json
+++ /dev/null
@@ -1,3021 +0,0 @@
-{ - "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/ai21_jamba-1.5-large/1770835937.459157", - "retrieved_timestamp": "1770835937.459157", - "source_metadata": { - "source_name": "helm_mmlu", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Jamba 1.5 Large", - "id": "ai21/jamba-1.5-large", - "developer": "ai21", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "MMLU All Subjects", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU All Subjects", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.782, - "details": { - "description": "min=0.46, mean=0.782, max=0.969, sum=89.128 (114)", - "tab": "Accuracy", - "MMLU All Subjects - Observed inference time (s)": { - "description": "min=0.889, mean=1.01, max=1.394, sum=115.088 (114)", - "tab": "Efficiency", - "score": 1.0095401397461812 - }, - "MMLU All Subjects - # eval": { - "description": "min=100, mean=246.351, max=1534, sum=28084 (114)", - "tab": "General information", - "score": 246.35087719298247 - }, - "MMLU All Subjects - # train": { - "description": "min=5, mean=5, max=5, sum=570 (114)",
- "tab": "General information", - "score": 5.0 - }, - "MMLU All Subjects - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (114)", - "tab": "General information", - "score": 0.0 - }, - "MMLU All Subjects - # prompt tokens": { - "description": "min=293.649, mean=658.432, max=2900.673, sum=75061.271 (114)", - "tab": "General information", - "score": 658.4322049384847 - }, - "MMLU All Subjects - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (114)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - 
"mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] - } - } - }, - { - "evaluation_name": "Abstract Algebra", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Abstract Algebra", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.53, - "details": { - "description": "min=0.53, mean=0.53, max=0.53, sum=1.06 (2)", - "tab": "Accuracy", - "Abstract Algebra - Observed inference time (s)": { - "description": "min=0.933, mean=0.933, max=0.933, sum=1.865 (2)", - "tab": "Efficiency", - "score": 0.9326767182350159 - }, - "Abstract Algebra - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Abstract Algebra - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Abstract Algebra - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Abstract Algebra - # prompt tokens": { - "description": "min=397.58, mean=397.58, max=397.58, sum=795.16 (2)", - "tab": "General information", - "score": 397.58 - }, - "Abstract Algebra - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" - } - } - }, - { - "evaluation_name": "Anatomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Anatomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.793, - "details": { - "description": "min=0.793, mean=0.793, max=0.793, sum=1.585 (2)", - "tab": "Accuracy", - "Anatomy - Observed inference time (s)": { - "description": "min=0.889, mean=0.889, max=0.889, sum=1.777 (2)", - "tab": "Efficiency", - "score": 0.8885634528266059 - }, - "Anatomy - # eval": { - "description": "min=135, mean=135, max=135, sum=270 (2)", - "tab": "General information", - "score": 135.0 - }, - "Anatomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Anatomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Anatomy - # prompt tokens": { - "description": "min=376.741, mean=376.741, max=376.741, sum=753.481 (2)", - "tab": "General information", - "score": 376.74074074074076 - }, - "Anatomy - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" - } - } - }, - { - "evaluation_name": "College Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on College Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.51, - "details": { - "description": "min=0.51, mean=0.51, max=0.51, sum=1.02 (2)", - "tab": "Accuracy", - "College Chemistry - Observed inference time (s)": { - "description": "min=0.971, mean=0.971, max=0.971, sum=1.942 (2)", - "tab": "Efficiency", - "score": 0.9710254788398742 - }, - "College Biology - Observed inference time (s)": { - "description": "min=0.968, mean=0.968, max=0.968, sum=1.936 (2)", - "tab": "Efficiency", - "score": 0.968123722407553 - }, - "College Computer Science - Observed inference time (s)": { - "description": "min=0.986, mean=0.986, max=0.986, sum=1.973 (2)", - "tab": "Efficiency", - "score": 0.9862666988372802 - }, - "College Mathematics - Observed inference time (s)": { - "description": "min=0.96, mean=0.96, max=0.96, sum=1.92 (2)", - "tab": "Efficiency", - "score": 0.9599522399902344 - }, - "College Medicine - Observed inference time (s)": { - "description": "min=0.978, mean=0.978, max=0.978, sum=1.957 (2)", - "tab": "Efficiency", - "score": 0.9782800839815525 - }, - "College Physics - Observed inference time (s)": { - "description": "min=1.01, mean=1.01, max=1.01, sum=2.019 (2)", - "tab": "Efficiency", - "score": 1.0095638387343462 - }, - "College Chemistry - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Chemistry - # prompt tokens": { - "description": "min=598.67, mean=598.67, max=598.67, sum=1197.34 (2)", - "tab": "General information", - "score": 598.67 - }, - "College Chemistry - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # eval": { - "description": "min=144, mean=144, max=144, sum=288 (2)", - "tab": "General information", - "score": 144.0 - }, - "College Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # prompt tokens": { - "description": "min=507.306, mean=507.306, max=507.306, sum=1014.611 (2)", - "tab": "General information", - "score": 507.30555555555554 - }, - "College Biology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer Science - # prompt tokens": { - "description": "min=883.21, 
mean=883.21, max=883.21, sum=1766.42 (2)", - "tab": "General information", - "score": 883.21 - }, - "College Computer Science - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # prompt tokens": { - "description": "min=643.97, mean=643.97, max=643.97, sum=1287.94 (2)", - "tab": "General information", - "score": 643.97 - }, - "College Mathematics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Medicine - # eval": { - "description": "min=173, mean=173, max=173, sum=346 (2)", - "tab": "General information", - "score": 173.0 - }, - "College Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Medicine - # prompt tokens": { - "description": "min=543.347, mean=543.347, max=543.347, sum=1086.694 (2)", - "tab": "General information", - "score": 543.3468208092486 - }, - "College Medicine - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Physics - # eval": { - "description": "min=102, mean=102, max=102, sum=204 (2)", - "tab": "General information", - "score": 102.0 - }, - "College Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Physics - # prompt tokens": { - "description": "min=533.402, mean=533.402, max=533.402, sum=1066.804 (2)", - "tab": "General information", - "score": 533.4019607843137 - }, - "College Physics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" - } - } - }, - { - "evaluation_name": "Computer Security", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Computer Security", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8, - "details": { - "description": "min=0.8, mean=0.8, max=0.8, sum=1.6 (2)", - "tab": "Accuracy", - "Computer Security - Observed inference time (s)": { - "description": "min=1.0, mean=1.0, max=1.0, sum=2.0 (2)", - "tab": "Efficiency", - "score": 1.000160608291626 - }, - "Computer Security - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - 
"tab": "General information", - "score": 100.0 - }, - "Computer Security - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Computer Security - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Computer Security - # prompt tokens": { - "description": "min=404.27, mean=404.27, max=404.27, sum=808.54 (2)", - "tab": "General information", - "score": 404.27 - }, - "Computer Security - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" - } - } - }, - { - "evaluation_name": "Econometrics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Econometrics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.614, - "details": { - "description": "min=0.614, mean=0.614, max=0.614, sum=1.228 (2)", - "tab": "Accuracy", - "Econometrics - Observed inference time (s)": { - "description": "min=0.971, mean=0.971, max=0.971, sum=1.942 (2)", - "tab": "Efficiency", - "score": 0.9712212587657728 - }, - "Econometrics - # eval": { - "description": "min=114, mean=114, max=114, sum=228 (2)", - "tab": "General information", - "score": 114.0 - }, - "Econometrics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Econometrics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Econometrics - # prompt tokens": { - "description": "min=678.64, mean=678.64, max=678.64, sum=1357.281 (2)", - "tab": "General information", - "score": 678.640350877193 - }, - "Econometrics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" - } - } - }, - { - "evaluation_name": "Global Facts", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Global Facts", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.54, - "details": { - "description": "min=0.54, mean=0.54, max=0.54, sum=1.08 (2)", - "tab": "Accuracy", - "Global Facts - Observed inference time (s)": { - "description": "min=0.951, mean=0.951, max=0.951, sum=1.901 (2)", - "tab": "Efficiency", - "score": 0.9506172919273377 - }, - "Global Facts - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Global Facts - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Global 
Facts - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Global Facts - # prompt tokens": { - "description": "min=466.9, mean=466.9, max=466.9, sum=933.8 (2)", - "tab": "General information", - "score": 466.9 - }, - "Global Facts - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" - } - } - }, - { - "evaluation_name": "Jurisprudence", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Jurisprudence", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.87, - "details": { - "description": "min=0.87, mean=0.87, max=0.87, sum=1.741 (2)", - "tab": "Accuracy", - "Jurisprudence - Observed inference time (s)": { - "description": "min=0.929, mean=0.929, max=0.929, sum=1.858 (2)", - "tab": "Efficiency", - "score": 0.9292316171858046 - }, - "Jurisprudence - # eval": { - "description": "min=108, mean=108, max=108, sum=216 (2)", - "tab": "General information", - "score": 108.0 - }, - "Jurisprudence - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Jurisprudence - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Jurisprudence - # prompt tokens": { - "description": "min=427.185, mean=427.185, max=427.185, sum=854.37 (2)", - "tab": "General information", - "score": 427.18518518518516 - }, - "Jurisprudence - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" - } - } - }, - { - "evaluation_name": "Philosophy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Philosophy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.849, - "details": { - "description": "min=0.849, mean=0.849, max=0.849, sum=1.698 (2)", - "tab": "Accuracy", - "Philosophy - Observed inference time (s)": { - "description": "min=0.924, mean=0.924, max=0.924, sum=1.848 (2)", - "tab": "Efficiency", - "score": 0.9240530403480652 - }, - "Philosophy - # eval": { - "description": "min=311, mean=311, max=311, sum=622 (2)", - "tab": "General information", - "score": 311.0 - }, - "Philosophy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Philosophy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Philosophy - # prompt tokens": { - "description": "min=359.441, mean=359.441, max=359.441, sum=718.881 
(2)", - "tab": "General information", - "score": 359.4405144694534 - }, - "Philosophy - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" - } - } - }, - { - "evaluation_name": "Professional Psychology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Professional Psychology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.842, - "details": { - "description": "min=0.842, mean=0.842, max=0.842, sum=1.683 (2)", - "tab": "Accuracy", - "Professional Medicine - Observed inference time (s)": { - "description": "min=1.025, mean=1.025, max=1.025, sum=2.05 (2)", - "tab": "Efficiency", - "score": 1.0251652388011707 - }, - "Professional Accounting - Observed inference time (s)": { - "description": "min=0.954, mean=0.954, max=0.954, sum=1.907 (2)", - "tab": "Efficiency", - "score": 0.9537228667144234 - }, - "Professional Law - Observed inference time (s)": { - "description": "min=1.039, mean=1.039, max=1.039, sum=2.078 (2)", - "tab": "Efficiency", - "score": 1.0390360032097767 - }, - "Professional Psychology - Observed inference time (s)": { - "description": "min=0.959, mean=0.959, max=0.959, sum=1.918 (2)", - "tab": "Efficiency", - "score": 0.9592212933340883 - }, - "Professional Medicine - # eval": { - "description": "min=272, mean=272, max=272, sum=544 (2)", - "tab": "General information", - "score": 272.0 - }, - "Professional Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Medicine - # prompt tokens": { - "description": "min=1170.393, mean=1170.393, max=1170.393, sum=2340.787 (2)", - "tab": "General information", - "score": 1170.3933823529412 - }, - "Professional Medicine - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Accounting - # eval": { - "description": "min=282, mean=282, max=282, sum=564 (2)", - "tab": "General information", - "score": 282.0 - }, - "Professional Accounting - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Accounting - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Accounting - # prompt tokens": { - "description": "min=770.316, mean=770.316, max=770.316, sum=1540.631 (2)", - "tab": "General information", - "score": 770.3156028368794 - }, - "Professional Accounting - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # eval": { - "description": "min=1534, mean=1534, max=1534, sum=3068 (2)", - "tab": "General information", - "score": 1534.0 - }, - "Professional Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - 
"tab": "General information", - "score": 5.0 - }, - "Professional Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # prompt tokens": { - "description": "min=1725.955, mean=1725.955, max=1725.955, sum=3451.91 (2)", - "tab": "General information", - "score": 1725.9550195567144 - }, - "Professional Law - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # eval": { - "description": "min=612, mean=612, max=612, sum=1224 (2)", - "tab": "General information", - "score": 612.0 - }, - "Professional Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # prompt tokens": { - "description": "min=611.645, mean=611.645, max=611.645, sum=1223.291 (2)", - "tab": "General information", - "score": 611.6454248366013 - }, - "Professional Psychology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" - } - } - }, - { - "evaluation_name": "Us Foreign Policy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Us Foreign Policy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.92, - "details": { - "description": "min=0.92, mean=0.92, max=0.92, sum=1.84 (2)", - "tab": "Accuracy", - "Us Foreign Policy - Observed inference time (s)": { - "description": "min=0.991, mean=0.991, max=0.991, sum=1.982 (2)", - "tab": "Efficiency", - "score": 0.9911877512931824 - }, - "Us Foreign Policy - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Us Foreign Policy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Us Foreign Policy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Us Foreign Policy - # prompt tokens": { - "description": "min=461.53, mean=461.53, max=461.53, sum=923.06 (2)", - "tab": "General information", - "score": 461.53 - }, - "Us Foreign Policy - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" - } - } - }, - { - "evaluation_name": "Astronomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Astronomy", 
- "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.882, - "details": { - "description": "min=0.882, mean=0.882, max=0.882, sum=1.763 (2)", - "tab": "Accuracy", - "Astronomy - Observed inference time (s)": { - "description": "min=0.975, mean=0.975, max=0.975, sum=1.95 (2)", - "tab": "Efficiency", - "score": 0.9748745105768505 - }, - "Astronomy - # eval": { - "description": "min=152, mean=152, max=152, sum=304 (2)", - "tab": "General information", - "score": 152.0 - }, - "Astronomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Astronomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Astronomy - # prompt tokens": { - "description": "min=632.947, mean=632.947, max=632.947, sum=1265.895 (2)", - "tab": "General information", - "score": 632.9473684210526 - }, - "Astronomy - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" - } - } - }, - { - "evaluation_name": "Business Ethics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Business Ethics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.77, - "details": { - "description": "min=0.77, mean=0.77, max=0.77, sum=1.54 (2)", - "tab": "Accuracy", - "Business Ethics - Observed inference time (s)": { - "description": "min=0.963, mean=0.963, max=0.963, sum=1.926 (2)", - "tab": "Efficiency", - "score": 0.9630230093002319 - }, - "Business Ethics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Business Ethics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Business Ethics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Business Ethics - # prompt tokens": { - "description": "min=591.96, mean=591.96, max=591.96, sum=1183.92 (2)", - "tab": "General information", - "score": 591.96 - }, - "Business Ethics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" - } - } - }, - { - "evaluation_name": "Clinical Knowledge", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Clinical Knowledge", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.849, - "details": { - "description": "min=0.849, mean=0.849, max=0.849, 
sum=1.698 (2)", - "tab": "Accuracy", - "Clinical Knowledge - Observed inference time (s)": { - "description": "min=0.937, mean=0.937, max=0.937, sum=1.874 (2)", - "tab": "Efficiency", - "score": 0.9370616642933971 - }, - "Clinical Knowledge - # eval": { - "description": "min=265, mean=265, max=265, sum=530 (2)", - "tab": "General information", - "score": 265.0 - }, - "Clinical Knowledge - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Clinical Knowledge - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Clinical Knowledge - # prompt tokens": { - "description": "min=437.34, mean=437.34, max=437.34, sum=874.679 (2)", - "tab": "General information", - "score": 437.33962264150944 - }, - "Clinical Knowledge - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" - } - } - }, - { - "evaluation_name": "Conceptual Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Conceptual Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.779, - "details": { - "description": "min=0.779, mean=0.779, max=0.779, sum=1.557 (2)", - "tab": "Accuracy", - "Conceptual Physics - Observed inference time (s)": { - "description": "min=0.898, mean=0.898, max=0.898, sum=1.795 (2)", - "tab": "Efficiency", - "score": 0.8976521999277967 - }, - "Conceptual Physics - # eval": { - "description": "min=235, mean=235, max=235, sum=470 (2)", - "tab": "General information", - "score": 235.0 - }, - "Conceptual Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Conceptual Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Conceptual Physics - # prompt tokens": { - "description": "min=322.962, mean=322.962, max=322.962, sum=645.923 (2)", - "tab": "General information", - "score": 322.9617021276596 - }, - "Conceptual Physics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" - } - } - }, - { - "evaluation_name": "Electrical Engineering", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Electrical Engineering", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.793, - "details": { - "description": "min=0.793, mean=0.793, max=0.793, sum=1.586 (2)", - "tab": "Accuracy", - "Electrical Engineering - Observed 
inference time (s)": { - "description": "min=0.9, mean=0.9, max=0.9, sum=1.8 (2)", - "tab": "Efficiency", - "score": 0.9001944936555007 - }, - "Electrical Engineering - # eval": { - "description": "min=145, mean=145, max=145, sum=290 (2)", - "tab": "General information", - "score": 145.0 - }, - "Electrical Engineering - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Electrical Engineering - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Electrical Engineering - # prompt tokens": { - "description": "min=494.662, mean=494.662, max=494.662, sum=989.324 (2)", - "tab": "General information", - "score": 494.6620689655172 - }, - "Electrical Engineering - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" - } - } - }, - { - "evaluation_name": "Elementary Mathematics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Elementary Mathematics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.656, - "details": { - "description": "min=0.656, mean=0.656, max=0.656, sum=1.312 (2)", - "tab": "Accuracy", - "Elementary Mathematics - Observed inference time (s)": { - "description": "min=0.976, mean=0.976, max=0.976, sum=1.951 (2)", - "tab": "Efficiency", - "score": 0.9756249517360062 - }, - "Elementary Mathematics - # eval": { - "description": "min=378, mean=378, max=378, sum=756 (2)", - "tab": "General information", - "score": 378.0 - }, - "Elementary Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Elementary Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Elementary Mathematics - # prompt tokens": { - "description": "min=607.042, mean=607.042, max=607.042, sum=1214.085 (2)", - "tab": "General information", - "score": 607.042328042328 - }, - "Elementary Mathematics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" - } - } - }, - { - "evaluation_name": "Formal Logic", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Formal Logic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.619, - "details": { - "description": "min=0.619, mean=0.619, max=0.619, sum=1.238 (2)", - "tab": "Accuracy", - "Formal Logic - Observed inference time (s)": { - "description": 
"min=0.973, mean=0.973, max=0.973, sum=1.947 (2)", - "tab": "Efficiency", - "score": 0.9733156949754745 - }, - "Formal Logic - # eval": { - "description": "min=126, mean=126, max=126, sum=252 (2)", - "tab": "General information", - "score": 126.0 - }, - "Formal Logic - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Formal Logic - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Formal Logic - # prompt tokens": { - "description": "min=656.468, mean=656.468, max=656.468, sum=1312.937 (2)", - "tab": "General information", - "score": 656.468253968254 - }, - "Formal Logic - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" - } - } - }, - { - "evaluation_name": "High School World History", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on High School World History", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.911, - "details": { - "description": "min=0.911, mean=0.911, max=0.911, sum=1.823 (2)", - "tab": "Accuracy", - "High School Biology - Observed inference time (s)": { - "description": "min=0.953, mean=0.953, max=0.953, sum=1.906 (2)", - "tab": "Efficiency", - "score": 0.9529511121011549 - }, - "High School Chemistry - Observed inference time (s)": { - "description": "min=0.955, mean=0.955, max=0.955, sum=1.911 (2)", - "tab": "Efficiency", - "score": 0.955410502814307 - }, - "High School Computer Science - Observed inference time (s)": { - "description": "min=0.978, mean=0.978, max=0.978, sum=1.957 (2)", - "tab": "Efficiency", - "score": 0.9784861493110657 - }, - "High School European History - Observed inference time (s)": { - "description": "min=1.394, mean=1.394, max=1.394, sum=2.789 (2)", - "tab": "Efficiency", - "score": 1.394392929655133 - }, - "High School Geography - Observed inference time (s)": { - "description": "min=1.119, mean=1.119, max=1.119, sum=2.238 (2)", - "tab": "Efficiency", - "score": 1.1188469896412858 - }, - "High School Government And Politics - Observed inference time (s)": { - "description": "min=1.151, mean=1.151, max=1.151, sum=2.302 (2)", - "tab": "Efficiency", - "score": 1.1508279983243794 - }, - "High School Macroeconomics - Observed inference time (s)": { - "description": "min=1.015, mean=1.015, max=1.015, sum=2.03 (2)", - "tab": "Efficiency", - "score": 1.014756965637207 - }, - "High School Mathematics - Observed inference time (s)": { - "description": "min=1.115, mean=1.115, max=1.115, sum=2.229 (2)", - "tab": "Efficiency", - "score": 1.1145719607671103 - }, - "High School Microeconomics - Observed inference time (s)": { - "description": "min=1.094, mean=1.094, max=1.094, sum=2.189 (2)", - "tab": "Efficiency", - "score": 1.094437322696718 - }, - "High School Physics - Observed inference time (s)": { - "description": "min=1.117, mean=1.117, max=1.117, sum=2.235 (2)", - "tab": "Efficiency", - "score": 1.1174537361852381 - }, - "High School Psychology - Observed 
inference time (s)": { - "description": "min=1.026, mean=1.026, max=1.026, sum=2.051 (2)", - "tab": "Efficiency", - "score": 1.025726358606181 - }, - "High School Statistics - Observed inference time (s)": { - "description": "min=1.119, mean=1.119, max=1.119, sum=2.238 (2)", - "tab": "Efficiency", - "score": 1.1191309756702847 - }, - "High School US History - Observed inference time (s)": { - "description": "min=1.362, mean=1.362, max=1.362, sum=2.724 (2)", - "tab": "Efficiency", - "score": 1.3617976483176737 - }, - "High School World History - Observed inference time (s)": { - "description": "min=1.138, mean=1.138, max=1.138, sum=2.275 (2)", - "tab": "Efficiency", - "score": 1.1377391141175217 - }, - "High School Biology - # eval": { - "description": "min=310, mean=310, max=310, sum=620 (2)", - "tab": "General information", - "score": 310.0 - }, - "High School Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Biology - # prompt tokens": { - "description": "min=532.455, mean=532.455, max=532.455, sum=1064.91 (2)", - "tab": "General information", - "score": 532.4548387096775 - }, - "High School Biology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # eval": { - "description": "min=203, mean=203, max=203, sum=406 (2)", - "tab": "General information", - "score": 203.0 - }, - "High School Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # prompt tokens": { - "description": "min=537.089, mean=537.089, max=537.089, sum=1074.177 (2)", - "tab": "General information", - "score": 537.0886699507389 - }, - "High School Chemistry - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "High School Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # prompt tokens": { - "description": "min=958.39, mean=958.39, max=958.39, sum=1916.78 (2)", - "tab": "General information", - "score": 958.39 - }, - "High School Computer Science - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School European History - # eval": { - "description": "min=165, mean=165, max=165, sum=330 (2)", - "tab": "General information", - "score": 165.0 - }, - "High School European History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School European History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School European 
History - # prompt tokens": { - "description": "min=2900.673, mean=2900.673, max=2900.673, sum=5801.345 (2)", - "tab": "General information", - "score": 2900.672727272727 - }, - "High School European History - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Geography - # eval": { - "description": "min=198, mean=198, max=198, sum=396 (2)", - "tab": "General information", - "score": 198.0 - }, - "High School Geography - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Geography - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Geography - # prompt tokens": { - "description": "min=406.146, mean=406.146, max=406.146, sum=812.293 (2)", - "tab": "General information", - "score": 406.14646464646466 - }, - "High School Geography - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # eval": { - "description": "min=193, mean=193, max=193, sum=386 (2)", - "tab": "General information", - "score": 193.0 - }, - "High School Government And Politics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Government And Politics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # prompt tokens": { - "description": "min=492.788, mean=492.788, max=492.788, sum=985.575 (2)", - "tab": "General information", - "score": 492.78756476683935 - }, - "High School Government And Politics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - # eval": { - "description": "min=390, mean=390, max=390, sum=780 (2)", - "tab": "General information", - "score": 390.0 - }, - "High School Macroeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Macroeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - # prompt tokens": { - "description": "min=406.1, mean=406.1, max=406.1, sum=812.2 (2)", - "tab": "General information", - "score": 406.1 - }, - "High School Macroeconomics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # eval": { - "description": "min=270, mean=270, max=270, sum=540 (2)", - "tab": "General information", - "score": 270.0 - }, - "High School Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # prompt tokens": { - "description": "min=583.248, mean=583.248, max=583.248, sum=1166.496 (2)", - "tab": "General information", - "score": 583.2481481481482 - }, - "High School Mathematics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High 
School Microeconomics - # eval": { - "description": "min=238, mean=238, max=238, sum=476 (2)", - "tab": "General information", - "score": 238.0 - }, - "High School Microeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Microeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Microeconomics - # prompt tokens": { - "description": "min=426.265, mean=426.265, max=426.265, sum=852.529 (2)", - "tab": "General information", - "score": 426.2647058823529 - }, - "High School Microeconomics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # eval": { - "description": "min=151, mean=151, max=151, sum=302 (2)", - "tab": "General information", - "score": 151.0 - }, - "High School Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # prompt tokens": { - "description": "min=603.272, mean=603.272, max=603.272, sum=1206.543 (2)", - "tab": "General information", - "score": 603.2715231788079 - }, - "High School Physics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # eval": { - "description": "min=545, mean=545, max=545, sum=1090 (2)", - "tab": "General information", - "score": 545.0 - }, - "High School Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # prompt tokens": { - "description": "min=525.635, mean=525.635, max=525.635, sum=1051.27 (2)", - "tab": "General information", - "score": 525.6348623853211 - }, - "High School Psychology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Statistics - # eval": { - "description": "min=216, mean=216, max=216, sum=432 (2)", - "tab": "General information", - "score": 216.0 - }, - "High School Statistics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Statistics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Statistics - # prompt tokens": { - "description": "min=876.032, mean=876.032, max=876.032, sum=1752.065 (2)", - "tab": "General information", - "score": 876.0324074074074 - }, - "High School Statistics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # eval": { - "description": "min=204, mean=204, max=204, sum=408 (2)", - "tab": "General information", - "score": 204.0 - }, - "High School US History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School US History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General 
information", - "score": 0.0 - }, - "High School US History - # prompt tokens": { - "description": "min=2310.931, mean=2310.931, max=2310.931, sum=4621.863 (2)", - "tab": "General information", - "score": 2310.9313725490197 - }, - "High School US History - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School World History - # eval": { - "description": "min=237, mean=237, max=237, sum=474 (2)", - "tab": "General information", - "score": 237.0 - }, - "High School World History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School World History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School World History - # prompt tokens": { - "description": "min=1501.477, mean=1501.477, max=1501.477, sum=3002.954 (2)", - "tab": "General information", - "score": 1501.4767932489451 - }, - "High School World History - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" - } - } - }, - { - "evaluation_name": "Human Sexuality", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Human Sexuality", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.832, - "details": { - "description": "min=0.832, mean=0.832, max=0.832, sum=1.664 (2)", - "tab": "Accuracy", - "Human Aging - Observed inference time (s)": { - "description": "min=1.018, mean=1.018, max=1.018, sum=2.036 (2)", - "tab": "Efficiency", - "score": 1.0177636157236827 - }, - "Human Sexuality - Observed inference time (s)": { - "description": "min=1.059, mean=1.059, max=1.059, sum=2.118 (2)", - "tab": "Efficiency", - "score": 1.0589779351503794 - }, - "Human Aging - # eval": { - "description": "min=223, mean=223, max=223, sum=446 (2)", - "tab": "General information", - "score": 223.0 - }, - "Human Aging - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Aging - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Aging - # prompt tokens": { - "description": "min=333.036, mean=333.036, max=333.036, sum=666.072 (2)", - "tab": "General information", - "score": 333.0358744394619 - }, - "Human Aging - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # eval": { - "description": "min=131, mean=131, max=131, sum=262 (2)", - "tab": "General information", - "score": 131.0 - }, - "Human Sexuality - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Sexuality - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # prompt tokens": { - "description": 
"min=362.466, mean=362.466, max=362.466, sum=724.931 (2)", - "tab": "General information", - "score": 362.46564885496184 - }, - "Human Sexuality - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" - } - } - }, - { - "evaluation_name": "International Law", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on International Law", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.884, - "details": { - "description": "min=0.884, mean=0.884, max=0.884, sum=1.769 (2)", - "tab": "Accuracy", - "International Law - Observed inference time (s)": { - "description": "min=1.098, mean=1.098, max=1.098, sum=2.197 (2)", - "tab": "Efficiency", - "score": 1.098483010757068 - }, - "International Law - # eval": { - "description": "min=121, mean=121, max=121, sum=242 (2)", - "tab": "General information", - "score": 121.0 - }, - "International Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "International Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "International Law - # prompt tokens": { - "description": "min=662.628, mean=662.628, max=662.628, sum=1325.256 (2)", - "tab": "General information", - "score": 662.6280991735537 - }, - "International Law - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" - } - } - }, - { - "evaluation_name": "Logical Fallacies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Logical Fallacies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.859, - "details": { - "description": "min=0.859, mean=0.859, max=0.859, sum=1.718 (2)", - "tab": "Accuracy", - "Logical Fallacies - Observed inference time (s)": { - "description": "min=1.023, mean=1.023, max=1.023, sum=2.046 (2)", - "tab": "Efficiency", - "score": 1.0228094908357397 - }, - "Logical Fallacies - # eval": { - "description": "min=163, mean=163, max=163, sum=326 (2)", - "tab": "General information", - "score": 163.0 - }, - "Logical Fallacies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Logical Fallacies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Logical Fallacies - # prompt tokens": { - "description": "min=466.227, mean=466.227, max=466.227, sum=932.454 (2)", - "tab": "General information", - "score": 
466.2269938650307 - }, - "Logical Fallacies - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" - } - } - }, - { - "evaluation_name": "Machine Learning", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Machine Learning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.688, - "details": { - "description": "min=0.688, mean=0.688, max=0.688, sum=1.375 (2)", - "tab": "Accuracy", - "Machine Learning - Observed inference time (s)": { - "description": "min=1.124, mean=1.124, max=1.124, sum=2.247 (2)", - "tab": "Efficiency", - "score": 1.123652777501515 - }, - "Machine Learning - # eval": { - "description": "min=112, mean=112, max=112, sum=224 (2)", - "tab": "General information", - "score": 112.0 - }, - "Machine Learning - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Machine Learning - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Machine Learning - # prompt tokens": { - "description": "min=719.938, mean=719.938, max=719.938, sum=1439.875 (2)", - "tab": "General information", - "score": 719.9375 - }, - "Machine Learning - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" - } - } - }, - { - "evaluation_name": "Management", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Management", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.864, - "details": { - "description": "min=0.864, mean=0.864, max=0.864, sum=1.728 (2)", - "tab": "Accuracy", - "Management - Observed inference time (s)": { - "description": "min=1.033, mean=1.033, max=1.033, sum=2.067 (2)", - "tab": "Efficiency", - "score": 1.0334750402320936 - }, - "Management - # eval": { - "description": "min=103, mean=103, max=103, sum=206 (2)", - "tab": "General information", - "score": 103.0 - }, - "Management - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Management - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Management - # prompt tokens": { - "description": "min=299.553, mean=299.553, max=299.553, sum=599.107 (2)", - "tab": "General information", - "score": 299.5533980582524 - }, - "Management - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - 
"generation_config": { - "additional_details": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" - } - } - }, - { - "evaluation_name": "Marketing", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Marketing", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.94, - "details": { - "description": "min=0.94, mean=0.94, max=0.94, sum=1.88 (2)", - "tab": "Accuracy", - "Marketing - Observed inference time (s)": { - "description": "min=1.097, mean=1.097, max=1.097, sum=2.194 (2)", - "tab": "Efficiency", - "score": 1.0967916657782009 - }, - "Marketing - # eval": { - "description": "min=234, mean=234, max=234, sum=468 (2)", - "tab": "General information", - "score": 234.0 - }, - "Marketing - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Marketing - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Marketing - # prompt tokens": { - "description": "min=446.714, mean=446.714, max=446.714, sum=893.427 (2)", - "tab": "General information", - "score": 446.71367521367523 - }, - "Marketing - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" - } - } - }, - { - "evaluation_name": "Medical Genetics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Medical Genetics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.89, - "details": { - "description": "min=0.89, mean=0.89, max=0.89, sum=1.78 (2)", - "tab": "Accuracy", - "Medical Genetics - Observed inference time (s)": { - "description": "min=1.101, mean=1.101, max=1.101, sum=2.201 (2)", - "tab": "Efficiency", - "score": 1.1006885027885438 - }, - "Medical Genetics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Medical Genetics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Medical Genetics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Medical Genetics - # prompt tokens": { - "description": "min=361.45, mean=361.45, max=361.45, sum=722.9 (2)", - "tab": "General information", - "score": 361.45 - }, - "Medical Genetics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" - } - } - }, - { - "evaluation_name": "Miscellaneous", 
- "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Miscellaneous", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.931, - "details": { - "description": "min=0.931, mean=0.931, max=0.931, sum=1.862 (2)", - "tab": "Accuracy", - "Miscellaneous - Observed inference time (s)": { - "description": "min=0.906, mean=0.906, max=0.906, sum=1.813 (2)", - "tab": "Efficiency", - "score": 0.9063281955085647 - }, - "Miscellaneous - # eval": { - "description": "min=783, mean=783, max=783, sum=1566 (2)", - "tab": "General information", - "score": 783.0 - }, - "Miscellaneous - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Miscellaneous - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Miscellaneous - # prompt tokens": { - "description": "min=332.257, mean=332.257, max=332.257, sum=664.513 (2)", - "tab": "General information", - "score": 332.2567049808429 - }, - "Miscellaneous - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" - } - } - }, - { - "evaluation_name": "Moral Scenarios", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Moral Scenarios", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.686, - "details": { - "description": "min=0.686, mean=0.686, max=0.686, sum=1.372 (2)", - "tab": "Accuracy", - "Moral Disputes - Observed inference time (s)": { - "description": "min=0.946, mean=0.946, max=0.946, sum=1.892 (2)", - "tab": "Efficiency", - "score": 0.9461793238027937 - }, - "Moral Scenarios - Observed inference time (s)": { - "description": "min=0.96, mean=0.96, max=0.96, sum=1.92 (2)", - "tab": "Efficiency", - "score": 0.9602039808667572 - }, - "Moral Disputes - # eval": { - "description": "min=346, mean=346, max=346, sum=692 (2)", - "tab": "General information", - "score": 346.0 - }, - "Moral Disputes - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Disputes - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Disputes - # prompt tokens": { - "description": "min=506.514, mean=506.514, max=506.514, sum=1013.029 (2)", - "tab": "General information", - "score": 506.514450867052 - }, - "Moral Disputes - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # eval": { - "description": "min=895, mean=895, max=895, sum=1790 (2)", - "tab": "General information", - "score": 895.0 - }, - "Moral Scenarios - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - 
"tab": "General information", - "score": 5.0 - }, - "Moral Scenarios - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # prompt tokens": { - "description": "min=709.934, mean=709.934, max=709.934, sum=1419.868 (2)", - "tab": "General information", - "score": 709.9340782122905 - }, - "Moral Scenarios - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" - } - } - }, - { - "evaluation_name": "Nutrition", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Nutrition", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.869, - "details": { - "description": "min=0.869, mean=0.869, max=0.869, sum=1.739 (2)", - "tab": "Accuracy", - "Nutrition - Observed inference time (s)": { - "description": "min=0.947, mean=0.947, max=0.947, sum=1.894 (2)", - "tab": "Efficiency", - "score": 0.9469306157305349 - }, - "Nutrition - # eval": { - "description": "min=306, mean=306, max=306, sum=612 (2)", - "tab": "General information", - "score": 306.0 - }, - "Nutrition - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Nutrition - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Nutrition - # prompt tokens": { - "description": "min=619.683, mean=619.683, max=619.683, sum=1239.366 (2)", - "tab": "General information", - "score": 619.6830065359477 - }, - "Nutrition - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" - } - } - }, - { - "evaluation_name": "Prehistory", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Prehistory", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.892, - "details": { - "description": "min=0.892, mean=0.892, max=0.892, sum=1.784 (2)", - "tab": "Accuracy", - "Prehistory - Observed inference time (s)": { - "description": "min=0.956, mean=0.956, max=0.956, sum=1.912 (2)", - "tab": "Efficiency", - "score": 0.9560920861032274 - }, - "Prehistory - # eval": { - "description": "min=324, mean=324, max=324, sum=648 (2)", - "tab": "General information", - "score": 324.0 - }, - "Prehistory - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Prehistory - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Prehistory - # prompt tokens": { - 
"description": "min=566.244, mean=566.244, max=566.244, sum=1132.488 (2)", - "tab": "General information", - "score": 566.2438271604939 - }, - "Prehistory - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" - } - } - }, - { - "evaluation_name": "Public Relations", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Public Relations", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.755, - "details": { - "description": "min=0.755, mean=0.755, max=0.755, sum=1.509 (2)", - "tab": "Accuracy", - "Public Relations - Observed inference time (s)": { - "description": "min=0.944, mean=0.944, max=0.944, sum=1.887 (2)", - "tab": "Efficiency", - "score": 0.9436206535859541 - }, - "Public Relations - # eval": { - "description": "min=110, mean=110, max=110, sum=220 (2)", - "tab": "General information", - "score": 110.0 - }, - "Public Relations - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Public Relations - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Public Relations - # prompt tokens": { - "description": "min=440.6, mean=440.6, max=440.6, sum=881.2 (2)", - "tab": "General information", - "score": 440.6 - }, - "Public Relations - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" - } - } - }, - { - "evaluation_name": "Security Studies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Security Studies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.771, - "details": { - "description": "min=0.771, mean=0.771, max=0.771, sum=1.543 (2)", - "tab": "Accuracy", - "Security Studies - Observed inference time (s)": { - "description": "min=0.988, mean=0.988, max=0.988, sum=1.976 (2)", - "tab": "Efficiency", - "score": 0.9880037901352863 - }, - "Security Studies - # eval": { - "description": "min=245, mean=245, max=245, sum=490 (2)", - "tab": "General information", - "score": 245.0 - }, - "Security Studies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Security Studies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Security Studies - # prompt tokens": { - "description": "min=1221.388, mean=1221.388, max=1221.388, sum=2442.776 (2)", - "tab": "General information", - "score": 1221.3877551020407 - }, - "Security 
Studies - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" - } - } - }, - { - "evaluation_name": "Sociology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Sociology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.93, - "details": { - "description": "min=0.93, mean=0.93, max=0.93, sum=1.861 (2)", - "tab": "Accuracy", - "Sociology - Observed inference time (s)": { - "description": "min=0.947, mean=0.947, max=0.947, sum=1.894 (2)", - "tab": "Efficiency", - "score": 0.9468028070914805 - }, - "Sociology - # eval": { - "description": "min=201, mean=201, max=201, sum=402 (2)", - "tab": "General information", - "score": 201.0 - }, - "Sociology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Sociology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Sociology - # prompt tokens": { - "description": "min=465.925, mean=465.925, max=465.925, sum=931.851 (2)", - "tab": "General information", - "score": 465.92537313432837 - }, - "Sociology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" - } - } - }, - { - "evaluation_name": "Virology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Virology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.554, - "details": { - "description": "min=0.554, mean=0.554, max=0.554, sum=1.108 (2)", - "tab": "Accuracy", - "Virology - Observed inference time (s)": { - "description": "min=0.901, mean=0.901, max=0.901, sum=1.803 (2)", - "tab": "Efficiency", - "score": 0.9013677418950092 - }, - "Virology - # eval": { - "description": "min=166, mean=166, max=166, sum=332 (2)", - "tab": "General information", - "score": 166.0 - }, - "Virology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Virology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Virology - # prompt tokens": { - "description": "min=358.048, mean=358.048, max=358.048, sum=716.096 (2)", - "tab": "General information", - "score": 358.04819277108436 - }, - "Virology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "virology", - "method": "multiple_choice_joint", - 
"eval_split": "test", - "groups": "mmlu_virology" - } - } - }, - { - "evaluation_name": "World Religions", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on World Religions", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.865, - "details": { - "description": "min=0.865, mean=0.865, max=0.865, sum=1.731 (2)", - "tab": "Accuracy", - "World Religions - Observed inference time (s)": { - "description": "min=0.899, mean=0.899, max=0.899, sum=1.799 (2)", - "tab": "Efficiency", - "score": 0.8992712400112933 - }, - "World Religions - # eval": { - "description": "min=171, mean=171, max=171, sum=342 (2)", - "tab": "General information", - "score": 171.0 - }, - "World Religions - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "World Religions - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "World Religions - # prompt tokens": { - "description": "min=293.649, mean=293.649, max=293.649, sum=587.298 (2)", - "tab": "General information", - "score": 293.64912280701753 - }, - "World Religions - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" - } - } - }, - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.147, - "details": { - "tab": "Efficiency" - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_mmlu/ai21/jamba-1.5-mini/92e0b1b9-c167-4e07-b770-2b78527eb4eb.json b/data/helm_mmlu/ai21/jamba-1.5-mini/92e0b1b9-c167-4e07-b770-2b78527eb4eb.json deleted file mode 100644 index 727c60261..000000000 --- a/data/helm_mmlu/ai21/jamba-1.5-mini/92e0b1b9-c167-4e07-b770-2b78527eb4eb.json +++ /dev/null @@ -1,3021 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/ai21_jamba-1.5-mini/1770835937.459157", - "retrieved_timestamp": "1770835937.459157", - "source_metadata": { - "source_name": "helm_mmlu", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Jamba 1.5 Mini", - "id": "ai21/jamba-1.5-mini", - "developer": "ai21", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "MMLU All Subjects", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM 
on MMLU All Subjects", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.699, - "details": { - "description": "min=0.269, mean=0.699, max=0.943, sum=79.696 (114)", - "tab": "Accuracy", - "MMLU All Subjects - Observed inference time (s)": { - "description": "min=0.78, mean=0.859, max=1.024, sum=97.957 (114)", - "tab": "Efficiency", - "score": 0.8592709427634447 - }, - "MMLU All Subjects - # eval": { - "description": "min=100, mean=246.351, max=1534, sum=28084 (114)", - "tab": "General information", - "score": 246.35087719298247 - }, - "MMLU All Subjects - # train": { - "description": "min=5, mean=5, max=5, sum=570 (114)", - "tab": "General information", - "score": 5.0 - }, - "MMLU All Subjects - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (114)", - "tab": "General information", - "score": 0.0 - }, - "MMLU All Subjects - # prompt tokens": { - "description": "min=293.649, mean=658.432, max=2900.673, sum=75061.271 (114)", - "tab": "General information", - "score": 658.4322049384847 - }, - "MMLU All Subjects - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (114)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - 
"mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] - } - } - }, - { - "evaluation_name": "Abstract Algebra", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Abstract Algebra", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.33, - "details": { - "description": "min=0.33, mean=0.33, max=0.33, sum=0.66 (2)", - "tab": "Accuracy", - "Abstract Algebra - Observed inference time (s)": { - "description": "min=0.783, mean=0.783, max=0.783, sum=1.566 (2)", - "tab": "Efficiency", - "score": 0.783083221912384 - }, - "Abstract Algebra - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Abstract Algebra - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Abstract Algebra - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Abstract Algebra - # prompt tokens": { - "description": "min=397.58, mean=397.58, max=397.58, sum=795.16 (2)", - "tab": "General information", - "score": 397.58 - }, - "Abstract Algebra - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" - } - } - }, - { - "evaluation_name": "Anatomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Anatomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.711, - "details": { - "description": "min=0.711, mean=0.711, max=0.711, sum=1.422 (2)", - "tab": "Accuracy", - "Anatomy - Observed inference time (s)": { - "description": "min=0.832, mean=0.832, max=0.832, sum=1.664 (2)", - "tab": "Efficiency", - "score": 0.8321040700983118 - }, - "Anatomy - # eval": { - "description": "min=135, mean=135, max=135, sum=270 (2)", - "tab": "General information", - "score": 135.0 - }, - "Anatomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Anatomy - truncated": { - "description": "min=0, 
mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Anatomy - # prompt tokens": { - "description": "min=376.741, mean=376.741, max=376.741, sum=753.481 (2)", - "tab": "General information", - "score": 376.74074074074076 - }, - "Anatomy - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" - } - } - }, - { - "evaluation_name": "College Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on College Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.48, - "details": { - "description": "min=0.48, mean=0.48, max=0.48, sum=0.961 (2)", - "tab": "Accuracy", - "College Chemistry - Observed inference time (s)": { - "description": "min=0.807, mean=0.807, max=0.807, sum=1.615 (2)", - "tab": "Efficiency", - "score": 0.8074449944496155 - }, - "College Biology - Observed inference time (s)": { - "description": "min=0.821, mean=0.821, max=0.821, sum=1.643 (2)", - "tab": "Efficiency", - "score": 0.8214208516809676 - }, - "College Computer Science - Observed inference time (s)": { - "description": "min=0.833, mean=0.833, max=0.833, sum=1.667 (2)", - "tab": "Efficiency", - "score": 0.8334288668632507 - }, - "College Mathematics - Observed inference time (s)": { - "description": "min=0.84, mean=0.84, max=0.84, sum=1.68 (2)", - "tab": "Efficiency", - "score": 0.8399906301498413 - }, - "College Medicine - Observed inference time (s)": { - "description": "min=0.831, mean=0.831, max=0.831, sum=1.662 (2)", - "tab": "Efficiency", - "score": 0.8312392317490771 - }, - "College Physics - Observed inference time (s)": { - "description": "min=0.829, mean=0.829, max=0.829, sum=1.658 (2)", - "tab": "Efficiency", - "score": 0.8287959309185252 - }, - "College Chemistry - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Chemistry - # prompt tokens": { - "description": "min=598.67, mean=598.67, max=598.67, sum=1197.34 (2)", - "tab": "General information", - "score": 598.67 - }, - "College Chemistry - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # eval": { - "description": "min=144, mean=144, max=144, sum=288 (2)", - "tab": "General information", - "score": 144.0 - }, - "College Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # prompt tokens": { - "description": "min=507.306, mean=507.306, max=507.306, sum=1014.611 (2)", - "tab": "General information", - 
"score": 507.30555555555554 - }, - "College Biology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer Science - # prompt tokens": { - "description": "min=883.21, mean=883.21, max=883.21, sum=1766.42 (2)", - "tab": "General information", - "score": 883.21 - }, - "College Computer Science - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # prompt tokens": { - "description": "min=643.97, mean=643.97, max=643.97, sum=1287.94 (2)", - "tab": "General information", - "score": 643.97 - }, - "College Mathematics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Medicine - # eval": { - "description": "min=173, mean=173, max=173, sum=346 (2)", - "tab": "General information", - "score": 173.0 - }, - "College Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Medicine - # prompt tokens": { - "description": "min=543.347, mean=543.347, max=543.347, sum=1086.694 (2)", - "tab": "General information", - "score": 543.3468208092486 - }, - "College Medicine - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Physics - # eval": { - "description": "min=102, mean=102, max=102, sum=204 (2)", - "tab": "General information", - "score": 102.0 - }, - "College Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Physics - # prompt tokens": { - "description": "min=533.402, mean=533.402, max=533.402, sum=1066.804 (2)", - "tab": "General information", - "score": 533.4019607843137 - }, - "College Physics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" - } - } - }, - { - "evaluation_name": "Computer Security", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": 
[ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Computer Security", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.73, - "details": { - "description": "min=0.73, mean=0.73, max=0.73, sum=1.46 (2)", - "tab": "Accuracy", - "Computer Security - Observed inference time (s)": { - "description": "min=0.801, mean=0.801, max=0.801, sum=1.602 (2)", - "tab": "Efficiency", - "score": 0.8010901069641113 - }, - "Computer Security - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Computer Security - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Computer Security - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Computer Security - # prompt tokens": { - "description": "min=404.27, mean=404.27, max=404.27, sum=808.54 (2)", - "tab": "General information", - "score": 404.27 - }, - "Computer Security - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" - } - } - }, - { - "evaluation_name": "Econometrics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Econometrics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.491, - "details": { - "description": "min=0.491, mean=0.491, max=0.491, sum=0.982 (2)", - "tab": "Accuracy", - "Econometrics - Observed inference time (s)": { - "description": "min=0.83, mean=0.83, max=0.83, sum=1.661 (2)", - "tab": "Efficiency", - "score": 0.8303811194603903 - }, - "Econometrics - # eval": { - "description": "min=114, mean=114, max=114, sum=228 (2)", - "tab": "General information", - "score": 114.0 - }, - "Econometrics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Econometrics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Econometrics - # prompt tokens": { - "description": "min=678.64, mean=678.64, max=678.64, sum=1357.281 (2)", - "tab": "General information", - "score": 678.640350877193 - }, - "Econometrics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" - } - } - }, - { - "evaluation_name": "Global Facts", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on 
Global Facts", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.43, - "details": { - "description": "min=0.43, mean=0.43, max=0.43, sum=0.86 (2)", - "tab": "Accuracy", - "Global Facts - Observed inference time (s)": { - "description": "min=0.847, mean=0.847, max=0.847, sum=1.694 (2)", - "tab": "Efficiency", - "score": 0.8467721128463745 - }, - "Global Facts - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Global Facts - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Global Facts - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Global Facts - # prompt tokens": { - "description": "min=466.9, mean=466.9, max=466.9, sum=933.8 (2)", - "tab": "General information", - "score": 466.9 - }, - "Global Facts - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" - } - } - }, - { - "evaluation_name": "Jurisprudence", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Jurisprudence", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.88, - "details": { - "description": "min=0.88, mean=0.88, max=0.88, sum=1.759 (2)", - "tab": "Accuracy", - "Jurisprudence - Observed inference time (s)": { - "description": "min=0.809, mean=0.809, max=0.809, sum=1.619 (2)", - "tab": "Efficiency", - "score": 0.8092732672338132 - }, - "Jurisprudence - # eval": { - "description": "min=108, mean=108, max=108, sum=216 (2)", - "tab": "General information", - "score": 108.0 - }, - "Jurisprudence - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Jurisprudence - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Jurisprudence - # prompt tokens": { - "description": "min=427.185, mean=427.185, max=427.185, sum=854.37 (2)", - "tab": "General information", - "score": 427.18518518518516 - }, - "Jurisprudence - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" - } - } - }, - { - "evaluation_name": "Philosophy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Philosophy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.752, - "details": { - "description": "min=0.752, mean=0.752, max=0.752, 
sum=1.505 (2)", - "tab": "Accuracy", - "Philosophy - Observed inference time (s)": { - "description": "min=0.815, mean=0.815, max=0.815, sum=1.629 (2)", - "tab": "Efficiency", - "score": 0.8147224314343124 - }, - "Philosophy - # eval": { - "description": "min=311, mean=311, max=311, sum=622 (2)", - "tab": "General information", - "score": 311.0 - }, - "Philosophy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Philosophy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Philosophy - # prompt tokens": { - "description": "min=359.441, mean=359.441, max=359.441, sum=718.881 (2)", - "tab": "General information", - "score": 359.4405144694534 - }, - "Philosophy - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" - } - } - }, - { - "evaluation_name": "Professional Psychology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Professional Psychology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.76, - "details": { - "description": "min=0.76, mean=0.76, max=0.76, sum=1.52 (2)", - "tab": "Accuracy", - "Professional Medicine - Observed inference time (s)": { - "description": "min=0.832, mean=0.832, max=0.832, sum=1.663 (2)", - "tab": "Efficiency", - "score": 0.8315524055677301 - }, - "Professional Accounting - Observed inference time (s)": { - "description": "min=0.803, mean=0.803, max=0.803, sum=1.606 (2)", - "tab": "Efficiency", - "score": 0.8028552659014438 - }, - "Professional Law - Observed inference time (s)": { - "description": "min=0.836, mean=0.836, max=0.836, sum=1.671 (2)", - "tab": "Efficiency", - "score": 0.8356168884031154 - }, - "Professional Psychology - Observed inference time (s)": { - "description": "min=0.812, mean=0.812, max=0.812, sum=1.624 (2)", - "tab": "Efficiency", - "score": 0.811913901684331 - }, - "Professional Medicine - # eval": { - "description": "min=272, mean=272, max=272, sum=544 (2)", - "tab": "General information", - "score": 272.0 - }, - "Professional Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Medicine - # prompt tokens": { - "description": "min=1170.393, mean=1170.393, max=1170.393, sum=2340.787 (2)", - "tab": "General information", - "score": 1170.3933823529412 - }, - "Professional Medicine - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Accounting - # eval": { - "description": "min=282, mean=282, max=282, sum=564 (2)", - "tab": "General information", - "score": 282.0 - }, - "Professional Accounting - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional 
Accounting - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Accounting - # prompt tokens": { - "description": "min=770.316, mean=770.316, max=770.316, sum=1540.631 (2)", - "tab": "General information", - "score": 770.3156028368794 - }, - "Professional Accounting - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # eval": { - "description": "min=1534, mean=1534, max=1534, sum=3068 (2)", - "tab": "General information", - "score": 1534.0 - }, - "Professional Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # prompt tokens": { - "description": "min=1725.955, mean=1725.955, max=1725.955, sum=3451.91 (2)", - "tab": "General information", - "score": 1725.9550195567144 - }, - "Professional Law - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # eval": { - "description": "min=612, mean=612, max=612, sum=1224 (2)", - "tab": "General information", - "score": 612.0 - }, - "Professional Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # prompt tokens": { - "description": "min=611.645, mean=611.645, max=611.645, sum=1223.291 (2)", - "tab": "General information", - "score": 611.6454248366013 - }, - "Professional Psychology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" - } - } - }, - { - "evaluation_name": "Us Foreign Policy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Us Foreign Policy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9, - "details": { - "description": "min=0.9, mean=0.9, max=0.9, sum=1.8 (2)", - "tab": "Accuracy", - "Us Foreign Policy - Observed inference time (s)": { - "description": "min=0.827, mean=0.827, max=0.827, sum=1.654 (2)", - "tab": "Efficiency", - "score": 0.8269450402259827 - }, - "Us Foreign Policy - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Us Foreign Policy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Us Foreign Policy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Us Foreign Policy - # prompt tokens": { - "description": "min=461.53, mean=461.53, max=461.53, sum=923.06 (2)", - "tab": 
"General information", - "score": 461.53 - }, - "Us Foreign Policy - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" - } - } - }, - { - "evaluation_name": "Astronomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Astronomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.822, - "details": { - "description": "min=0.822, mean=0.822, max=0.822, sum=1.645 (2)", - "tab": "Accuracy", - "Astronomy - Observed inference time (s)": { - "description": "min=0.811, mean=0.811, max=0.811, sum=1.622 (2)", - "tab": "Efficiency", - "score": 0.8109481099404787 - }, - "Astronomy - # eval": { - "description": "min=152, mean=152, max=152, sum=304 (2)", - "tab": "General information", - "score": 152.0 - }, - "Astronomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Astronomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Astronomy - # prompt tokens": { - "description": "min=632.947, mean=632.947, max=632.947, sum=1265.895 (2)", - "tab": "General information", - "score": 632.9473684210526 - }, - "Astronomy - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" - } - } - }, - { - "evaluation_name": "Business Ethics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Business Ethics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.76, - "details": { - "description": "min=0.76, mean=0.76, max=0.76, sum=1.52 (2)", - "tab": "Accuracy", - "Business Ethics - Observed inference time (s)": { - "description": "min=0.803, mean=0.803, max=0.803, sum=1.607 (2)", - "tab": "Efficiency", - "score": 0.8034474205970764 - }, - "Business Ethics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Business Ethics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Business Ethics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Business Ethics - # prompt tokens": { - "description": "min=591.96, mean=591.96, max=591.96, sum=1183.92 (2)", - "tab": "General information", - "score": 591.96 - }, - "Business Ethics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - 
"generation_config": { - "additional_details": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" - } - } - }, - { - "evaluation_name": "Clinical Knowledge", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Clinical Knowledge", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.74, - "details": { - "description": "min=0.74, mean=0.74, max=0.74, sum=1.479 (2)", - "tab": "Accuracy", - "Clinical Knowledge - Observed inference time (s)": { - "description": "min=0.821, mean=0.821, max=0.821, sum=1.641 (2)", - "tab": "Efficiency", - "score": 0.8206060139638073 - }, - "Clinical Knowledge - # eval": { - "description": "min=265, mean=265, max=265, sum=530 (2)", - "tab": "General information", - "score": 265.0 - }, - "Clinical Knowledge - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Clinical Knowledge - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Clinical Knowledge - # prompt tokens": { - "description": "min=437.34, mean=437.34, max=437.34, sum=874.679 (2)", - "tab": "General information", - "score": 437.33962264150944 - }, - "Clinical Knowledge - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" - } - } - }, - { - "evaluation_name": "Conceptual Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Conceptual Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.677, - "details": { - "description": "min=0.677, mean=0.677, max=0.677, sum=1.353 (2)", - "tab": "Accuracy", - "Conceptual Physics - Observed inference time (s)": { - "description": "min=0.788, mean=0.788, max=0.788, sum=1.577 (2)", - "tab": "Efficiency", - "score": 0.7882616854728537 - }, - "Conceptual Physics - # eval": { - "description": "min=235, mean=235, max=235, sum=470 (2)", - "tab": "General information", - "score": 235.0 - }, - "Conceptual Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Conceptual Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Conceptual Physics - # prompt tokens": { - "description": "min=322.962, mean=322.962, max=322.962, sum=645.923 (2)", - "tab": "General information", - "score": 322.9617021276596 - }, - "Conceptual Physics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "conceptual_physics", - "method": 
"multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" - } - } - }, - { - "evaluation_name": "Electrical Engineering", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Electrical Engineering", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.683, - "details": { - "description": "min=0.683, mean=0.683, max=0.683, sum=1.366 (2)", - "tab": "Accuracy", - "Electrical Engineering - Observed inference time (s)": { - "description": "min=0.8, mean=0.8, max=0.8, sum=1.6 (2)", - "tab": "Efficiency", - "score": 0.800032663345337 - }, - "Electrical Engineering - # eval": { - "description": "min=145, mean=145, max=145, sum=290 (2)", - "tab": "General information", - "score": 145.0 - }, - "Electrical Engineering - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Electrical Engineering - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Electrical Engineering - # prompt tokens": { - "description": "min=494.662, mean=494.662, max=494.662, sum=989.324 (2)", - "tab": "General information", - "score": 494.6620689655172 - }, - "Electrical Engineering - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" - } - } - }, - { - "evaluation_name": "Elementary Mathematics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Elementary Mathematics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.553, - "details": { - "description": "min=0.553, mean=0.553, max=0.553, sum=1.106 (2)", - "tab": "Accuracy", - "Elementary Mathematics - Observed inference time (s)": { - "description": "min=0.81, mean=0.81, max=0.81, sum=1.619 (2)", - "tab": "Efficiency", - "score": 0.8097125253980122 - }, - "Elementary Mathematics - # eval": { - "description": "min=378, mean=378, max=378, sum=756 (2)", - "tab": "General information", - "score": 378.0 - }, - "Elementary Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Elementary Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Elementary Mathematics - # prompt tokens": { - "description": "min=607.042, mean=607.042, max=607.042, sum=1214.085 (2)", - "tab": "General information", - "score": 607.042328042328 - }, - "Elementary Mathematics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "elementary_mathematics", - "method": 
"multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" - } - } - }, - { - "evaluation_name": "Formal Logic", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Formal Logic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.452, - "details": { - "description": "min=0.452, mean=0.452, max=0.452, sum=0.905 (2)", - "tab": "Accuracy", - "Formal Logic - Observed inference time (s)": { - "description": "min=0.821, mean=0.821, max=0.821, sum=1.641 (2)", - "tab": "Efficiency", - "score": 0.8205922804181538 - }, - "Formal Logic - # eval": { - "description": "min=126, mean=126, max=126, sum=252 (2)", - "tab": "General information", - "score": 126.0 - }, - "Formal Logic - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Formal Logic - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Formal Logic - # prompt tokens": { - "description": "min=656.468, mean=656.468, max=656.468, sum=1312.937 (2)", - "tab": "General information", - "score": 656.468253968254 - }, - "Formal Logic - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" - } - } - }, - { - "evaluation_name": "High School World History", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on High School World History", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.84, - "details": { - "description": "min=0.84, mean=0.84, max=0.84, sum=1.679 (2)", - "tab": "Accuracy", - "High School Biology - Observed inference time (s)": { - "description": "min=0.802, mean=0.802, max=0.802, sum=1.604 (2)", - "tab": "Efficiency", - "score": 0.8022162606639247 - }, - "High School Chemistry - Observed inference time (s)": { - "description": "min=0.786, mean=0.786, max=0.786, sum=1.572 (2)", - "tab": "Efficiency", - "score": 0.7860349763203137 - }, - "High School Computer Science - Observed inference time (s)": { - "description": "min=0.8, mean=0.8, max=0.8, sum=1.6 (2)", - "tab": "Efficiency", - "score": 0.7999507975578308 - }, - "High School European History - Observed inference time (s)": { - "description": "min=0.891, mean=0.891, max=0.891, sum=1.782 (2)", - "tab": "Efficiency", - "score": 0.8912014065366802 - }, - "High School Geography - Observed inference time (s)": { - "description": "min=0.943, mean=0.943, max=0.943, sum=1.887 (2)", - "tab": "Efficiency", - "score": 0.9434030766438957 - }, - "High School Government And Politics - Observed inference time (s)": { - "description": "min=0.989, mean=0.989, max=0.989, sum=1.977 (2)", - "tab": "Efficiency", - "score": 0.9887206962071552 - }, - "High School Macroeconomics - Observed inference time 
(s)": { - "description": "min=0.921, mean=0.921, max=0.921, sum=1.842 (2)", - "tab": "Efficiency", - "score": 0.9210334313221467 - }, - "High School Mathematics - Observed inference time (s)": { - "description": "min=0.977, mean=0.977, max=0.977, sum=1.953 (2)", - "tab": "Efficiency", - "score": 0.976661871097706 - }, - "High School Microeconomics - Observed inference time (s)": { - "description": "min=0.914, mean=0.914, max=0.914, sum=1.828 (2)", - "tab": "Efficiency", - "score": 0.9139112444484935 - }, - "High School Physics - Observed inference time (s)": { - "description": "min=0.933, mean=0.933, max=0.933, sum=1.866 (2)", - "tab": "Efficiency", - "score": 0.9328556392366523 - }, - "High School Psychology - Observed inference time (s)": { - "description": "min=0.915, mean=0.915, max=0.915, sum=1.83 (2)", - "tab": "Efficiency", - "score": 0.9148573503581756 - }, - "High School Statistics - Observed inference time (s)": { - "description": "min=0.956, mean=0.956, max=0.956, sum=1.912 (2)", - "tab": "Efficiency", - "score": 0.95619613704858 - }, - "High School US History - Observed inference time (s)": { - "description": "min=0.98, mean=0.98, max=0.98, sum=1.959 (2)", - "tab": "Efficiency", - "score": 0.9797390874694375 - }, - "High School World History - Observed inference time (s)": { - "description": "min=0.996, mean=0.996, max=0.996, sum=1.991 (2)", - "tab": "Efficiency", - "score": 0.9955862363179525 - }, - "High School Biology - # eval": { - "description": "min=310, mean=310, max=310, sum=620 (2)", - "tab": "General information", - "score": 310.0 - }, - "High School Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Biology - # prompt tokens": { - "description": "min=532.455, mean=532.455, max=532.455, sum=1064.91 (2)", - "tab": "General information", - "score": 532.4548387096775 - }, - "High School Biology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # eval": { - "description": "min=203, mean=203, max=203, sum=406 (2)", - "tab": "General information", - "score": 203.0 - }, - "High School Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # prompt tokens": { - "description": "min=537.089, mean=537.089, max=537.089, sum=1074.177 (2)", - "tab": "General information", - "score": 537.0886699507389 - }, - "High School Chemistry - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "High School Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # prompt tokens": { - "description": "min=958.39, mean=958.39, 
max=958.39, sum=1916.78 (2)", - "tab": "General information", - "score": 958.39 - }, - "High School Computer Science - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School European History - # eval": { - "description": "min=165, mean=165, max=165, sum=330 (2)", - "tab": "General information", - "score": 165.0 - }, - "High School European History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School European History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School European History - # prompt tokens": { - "description": "min=2900.673, mean=2900.673, max=2900.673, sum=5801.345 (2)", - "tab": "General information", - "score": 2900.672727272727 - }, - "High School European History - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Geography - # eval": { - "description": "min=198, mean=198, max=198, sum=396 (2)", - "tab": "General information", - "score": 198.0 - }, - "High School Geography - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Geography - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Geography - # prompt tokens": { - "description": "min=406.146, mean=406.146, max=406.146, sum=812.293 (2)", - "tab": "General information", - "score": 406.14646464646466 - }, - "High School Geography - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # eval": { - "description": "min=193, mean=193, max=193, sum=386 (2)", - "tab": "General information", - "score": 193.0 - }, - "High School Government And Politics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Government And Politics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # prompt tokens": { - "description": "min=492.788, mean=492.788, max=492.788, sum=985.575 (2)", - "tab": "General information", - "score": 492.78756476683935 - }, - "High School Government And Politics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - # eval": { - "description": "min=390, mean=390, max=390, sum=780 (2)", - "tab": "General information", - "score": 390.0 - }, - "High School Macroeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Macroeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - # prompt tokens": { - "description": "min=406.1, mean=406.1, max=406.1, sum=812.2 (2)", - "tab": "General information", - "score": 406.1 - }, - "High School Macroeconomics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # eval": { - "description": "min=270, 
mean=270, max=270, sum=540 (2)", - "tab": "General information", - "score": 270.0 - }, - "High School Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # prompt tokens": { - "description": "min=583.248, mean=583.248, max=583.248, sum=1166.496 (2)", - "tab": "General information", - "score": 583.2481481481482 - }, - "High School Mathematics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Microeconomics - # eval": { - "description": "min=238, mean=238, max=238, sum=476 (2)", - "tab": "General information", - "score": 238.0 - }, - "High School Microeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Microeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Microeconomics - # prompt tokens": { - "description": "min=426.265, mean=426.265, max=426.265, sum=852.529 (2)", - "tab": "General information", - "score": 426.2647058823529 - }, - "High School Microeconomics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # eval": { - "description": "min=151, mean=151, max=151, sum=302 (2)", - "tab": "General information", - "score": 151.0 - }, - "High School Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # prompt tokens": { - "description": "min=603.272, mean=603.272, max=603.272, sum=1206.543 (2)", - "tab": "General information", - "score": 603.2715231788079 - }, - "High School Physics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # eval": { - "description": "min=545, mean=545, max=545, sum=1090 (2)", - "tab": "General information", - "score": 545.0 - }, - "High School Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # prompt tokens": { - "description": "min=525.635, mean=525.635, max=525.635, sum=1051.27 (2)", - "tab": "General information", - "score": 525.6348623853211 - }, - "High School Psychology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Statistics - # eval": { - "description": "min=216, mean=216, max=216, sum=432 (2)", - "tab": "General information", - "score": 216.0 - }, - "High School Statistics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Statistics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School 
Statistics - # prompt tokens": { - "description": "min=876.032, mean=876.032, max=876.032, sum=1752.065 (2)", - "tab": "General information", - "score": 876.0324074074074 - }, - "High School Statistics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # eval": { - "description": "min=204, mean=204, max=204, sum=408 (2)", - "tab": "General information", - "score": 204.0 - }, - "High School US History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School US History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # prompt tokens": { - "description": "min=2310.931, mean=2310.931, max=2310.931, sum=4621.863 (2)", - "tab": "General information", - "score": 2310.9313725490197 - }, - "High School US History - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School World History - # eval": { - "description": "min=237, mean=237, max=237, sum=474 (2)", - "tab": "General information", - "score": 237.0 - }, - "High School World History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School World History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School World History - # prompt tokens": { - "description": "min=1501.477, mean=1501.477, max=1501.477, sum=3002.954 (2)", - "tab": "General information", - "score": 1501.4767932489451 - }, - "High School World History - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" - } - } - }, - { - "evaluation_name": "Human Sexuality", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Human Sexuality", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.809, - "details": { - "description": "min=0.809, mean=0.809, max=0.809, sum=1.618 (2)", - "tab": "Accuracy", - "Human Aging - Observed inference time (s)": { - "description": "min=0.89, mean=0.89, max=0.89, sum=1.78 (2)", - "tab": "Efficiency", - "score": 0.889766787199696 - }, - "Human Sexuality - Observed inference time (s)": { - "description": "min=0.928, mean=0.928, max=0.928, sum=1.856 (2)", - "tab": "Efficiency", - "score": 0.9282377730799085 - }, - "Human Aging - # eval": { - "description": "min=223, mean=223, max=223, sum=446 (2)", - "tab": "General information", - "score": 223.0 - }, - "Human Aging - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Aging - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Aging - # prompt tokens": { - "description": "min=333.036, 
mean=333.036, max=333.036, sum=666.072 (2)", - "tab": "General information", - "score": 333.0358744394619 - }, - "Human Aging - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # eval": { - "description": "min=131, mean=131, max=131, sum=262 (2)", - "tab": "General information", - "score": 131.0 - }, - "Human Sexuality - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Sexuality - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # prompt tokens": { - "description": "min=362.466, mean=362.466, max=362.466, sum=724.931 (2)", - "tab": "General information", - "score": 362.46564885496184 - }, - "Human Sexuality - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" - } - } - }, - { - "evaluation_name": "International Law", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on International Law", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.893, - "details": { - "description": "min=0.893, mean=0.893, max=0.893, sum=1.785 (2)", - "tab": "Accuracy", - "International Law - Observed inference time (s)": { - "description": "min=0.938, mean=0.938, max=0.938, sum=1.875 (2)", - "tab": "Efficiency", - "score": 0.9376649265446939 - }, - "International Law - # eval": { - "description": "min=121, mean=121, max=121, sum=242 (2)", - "tab": "General information", - "score": 121.0 - }, - "International Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "International Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "International Law - # prompt tokens": { - "description": "min=662.628, mean=662.628, max=662.628, sum=1325.256 (2)", - "tab": "General information", - "score": 662.6280991735537 - }, - "International Law - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" - } - } - }, - { - "evaluation_name": "Logical Fallacies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Logical Fallacies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.81, - "details": { - "description": "min=0.81, mean=0.81, max=0.81, sum=1.62 (2)", - "tab": "Accuracy", - "Logical Fallacies - Observed inference time (s)": 
{ - "description": "min=0.91, mean=0.91, max=0.91, sum=1.82 (2)", - "tab": "Efficiency", - "score": 0.9101676209572634 - }, - "Logical Fallacies - # eval": { - "description": "min=163, mean=163, max=163, sum=326 (2)", - "tab": "General information", - "score": 163.0 - }, - "Logical Fallacies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Logical Fallacies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Logical Fallacies - # prompt tokens": { - "description": "min=466.227, mean=466.227, max=466.227, sum=932.454 (2)", - "tab": "General information", - "score": 466.2269938650307 - }, - "Logical Fallacies - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" - } - } - }, - { - "evaluation_name": "Machine Learning", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Machine Learning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.509, - "details": { - "description": "min=0.509, mean=0.509, max=0.509, sum=1.018 (2)", - "tab": "Accuracy", - "Machine Learning - Observed inference time (s)": { - "description": "min=0.936, mean=0.936, max=0.936, sum=1.873 (2)", - "tab": "Efficiency", - "score": 0.9363672009536198 - }, - "Machine Learning - # eval": { - "description": "min=112, mean=112, max=112, sum=224 (2)", - "tab": "General information", - "score": 112.0 - }, - "Machine Learning - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Machine Learning - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Machine Learning - # prompt tokens": { - "description": "min=719.938, mean=719.938, max=719.938, sum=1439.875 (2)", - "tab": "General information", - "score": 719.9375 - }, - "Machine Learning - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" - } - } - }, - { - "evaluation_name": "Management", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Management", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.825, - "details": { - "description": "min=0.825, mean=0.825, max=0.825, sum=1.65 (2)", - "tab": "Accuracy", - "Management - Observed inference time (s)": { - "description": "min=1.024, mean=1.024, max=1.024, sum=2.049 (2)", - "tab": "Efficiency", - "score": 1.0244285111288423 - }, - "Management - # 
eval": { - "description": "min=103, mean=103, max=103, sum=206 (2)", - "tab": "General information", - "score": 103.0 - }, - "Management - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Management - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Management - # prompt tokens": { - "description": "min=299.553, mean=299.553, max=299.553, sum=599.107 (2)", - "tab": "General information", - "score": 299.5533980582524 - }, - "Management - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" - } - } - }, - { - "evaluation_name": "Marketing", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Marketing", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.915, - "details": { - "description": "min=0.915, mean=0.915, max=0.915, sum=1.829 (2)", - "tab": "Accuracy", - "Marketing - Observed inference time (s)": { - "description": "min=0.967, mean=0.967, max=0.967, sum=1.934 (2)", - "tab": "Efficiency", - "score": 0.9670558464832795 - }, - "Marketing - # eval": { - "description": "min=234, mean=234, max=234, sum=468 (2)", - "tab": "General information", - "score": 234.0 - }, - "Marketing - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Marketing - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Marketing - # prompt tokens": { - "description": "min=446.714, mean=446.714, max=446.714, sum=893.427 (2)", - "tab": "General information", - "score": 446.71367521367523 - }, - "Marketing - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" - } - } - }, - { - "evaluation_name": "Medical Genetics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Medical Genetics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.69, - "details": { - "description": "min=0.69, mean=0.69, max=0.69, sum=1.38 (2)", - "tab": "Accuracy", - "Medical Genetics - Observed inference time (s)": { - "description": "min=1.001, mean=1.001, max=1.001, sum=2.002 (2)", - "tab": "Efficiency", - "score": 1.0011137557029723 - }, - "Medical Genetics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Medical Genetics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General 
information", - "score": 5.0 - }, - "Medical Genetics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Medical Genetics - # prompt tokens": { - "description": "min=361.45, mean=361.45, max=361.45, sum=722.9 (2)", - "tab": "General information", - "score": 361.45 - }, - "Medical Genetics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" - } - } - }, - { - "evaluation_name": "Miscellaneous", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Miscellaneous", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.902, - "details": { - "description": "min=0.902, mean=0.902, max=0.902, sum=1.803 (2)", - "tab": "Accuracy", - "Miscellaneous - Observed inference time (s)": { - "description": "min=0.907, mean=0.907, max=0.907, sum=1.813 (2)", - "tab": "Efficiency", - "score": 0.9065530522420793 - }, - "Miscellaneous - # eval": { - "description": "min=783, mean=783, max=783, sum=1566 (2)", - "tab": "General information", - "score": 783.0 - }, - "Miscellaneous - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Miscellaneous - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Miscellaneous - # prompt tokens": { - "description": "min=332.257, mean=332.257, max=332.257, sum=664.513 (2)", - "tab": "General information", - "score": 332.2567049808429 - }, - "Miscellaneous - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" - } - } - }, - { - "evaluation_name": "Moral Scenarios", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Moral Scenarios", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.269, - "details": { - "description": "min=0.269, mean=0.269, max=0.269, sum=0.539 (2)", - "tab": "Accuracy", - "Moral Disputes - Observed inference time (s)": { - "description": "min=0.799, mean=0.799, max=0.799, sum=1.599 (2)", - "tab": "Efficiency", - "score": 0.7992533741658823 - }, - "Moral Scenarios - Observed inference time (s)": { - "description": "min=0.79, mean=0.79, max=0.79, sum=1.581 (2)", - "tab": "Efficiency", - "score": 0.7903663371528327 - }, - "Moral Disputes - # eval": { - "description": "min=346, mean=346, max=346, sum=692 (2)", - "tab": "General information", - "score": 346.0 - }, - "Moral Disputes - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General 
information", - "score": 5.0 - }, - "Moral Disputes - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Disputes - # prompt tokens": { - "description": "min=506.514, mean=506.514, max=506.514, sum=1013.029 (2)", - "tab": "General information", - "score": 506.514450867052 - }, - "Moral Disputes - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # eval": { - "description": "min=895, mean=895, max=895, sum=1790 (2)", - "tab": "General information", - "score": 895.0 - }, - "Moral Scenarios - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Scenarios - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # prompt tokens": { - "description": "min=709.934, mean=709.934, max=709.934, sum=1419.868 (2)", - "tab": "General information", - "score": 709.9340782122905 - }, - "Moral Scenarios - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" - } - } - }, - { - "evaluation_name": "Nutrition", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Nutrition", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.801, - "details": { - "description": "min=0.801, mean=0.801, max=0.801, sum=1.601 (2)", - "tab": "Accuracy", - "Nutrition - Observed inference time (s)": { - "description": "min=0.799, mean=0.799, max=0.799, sum=1.599 (2)", - "tab": "Efficiency", - "score": 0.7992852076985477 - }, - "Nutrition - # eval": { - "description": "min=306, mean=306, max=306, sum=612 (2)", - "tab": "General information", - "score": 306.0 - }, - "Nutrition - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Nutrition - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Nutrition - # prompt tokens": { - "description": "min=619.683, mean=619.683, max=619.683, sum=1239.366 (2)", - "tab": "General information", - "score": 619.6830065359477 - }, - "Nutrition - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" - } - } - }, - { - "evaluation_name": "Prehistory", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Prehistory", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 
0.824, - "details": { - "description": "min=0.824, mean=0.824, max=0.824, sum=1.648 (2)", - "tab": "Accuracy", - "Prehistory - Observed inference time (s)": { - "description": "min=0.804, mean=0.804, max=0.804, sum=1.607 (2)", - "tab": "Efficiency", - "score": 0.8036901479885902 - }, - "Prehistory - # eval": { - "description": "min=324, mean=324, max=324, sum=648 (2)", - "tab": "General information", - "score": 324.0 - }, - "Prehistory - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Prehistory - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Prehistory - # prompt tokens": { - "description": "min=566.244, mean=566.244, max=566.244, sum=1132.488 (2)", - "tab": "General information", - "score": 566.2438271604939 - }, - "Prehistory - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" - } - } - }, - { - "evaluation_name": "Public Relations", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Public Relations", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.727, - "details": { - "description": "min=0.727, mean=0.727, max=0.727, sum=1.455 (2)", - "tab": "Accuracy", - "Public Relations - Observed inference time (s)": { - "description": "min=0.819, mean=0.819, max=0.819, sum=1.638 (2)", - "tab": "Efficiency", - "score": 0.8189079783179544 - }, - "Public Relations - # eval": { - "description": "min=110, mean=110, max=110, sum=220 (2)", - "tab": "General information", - "score": 110.0 - }, - "Public Relations - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Public Relations - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Public Relations - # prompt tokens": { - "description": "min=440.6, mean=440.6, max=440.6, sum=881.2 (2)", - "tab": "General information", - "score": 440.6 - }, - "Public Relations - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" - } - } - }, - { - "evaluation_name": "Security Studies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Security Studies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.755, - "details": { - "description": "min=0.755, mean=0.755, max=0.755, sum=1.51 (2)", - "tab": "Accuracy", - "Security Studies - Observed inference time (s)": { - "description": "min=0.828, 
mean=0.828, max=0.828, sum=1.655 (2)", - "tab": "Efficiency", - "score": 0.8276801226090412 - }, - "Security Studies - # eval": { - "description": "min=245, mean=245, max=245, sum=490 (2)", - "tab": "General information", - "score": 245.0 - }, - "Security Studies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Security Studies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Security Studies - # prompt tokens": { - "description": "min=1221.388, mean=1221.388, max=1221.388, sum=2442.776 (2)", - "tab": "General information", - "score": 1221.3877551020407 - }, - "Security Studies - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" - } - } - }, - { - "evaluation_name": "Sociology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Sociology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.876, - "details": { - "description": "min=0.876, mean=0.876, max=0.876, sum=1.751 (2)", - "tab": "Accuracy", - "Sociology - Observed inference time (s)": { - "description": "min=0.792, mean=0.792, max=0.792, sum=1.583 (2)", - "tab": "Efficiency", - "score": 0.7917492271062747 - }, - "Sociology - # eval": { - "description": "min=201, mean=201, max=201, sum=402 (2)", - "tab": "General information", - "score": 201.0 - }, - "Sociology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Sociology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Sociology - # prompt tokens": { - "description": "min=465.925, mean=465.925, max=465.925, sum=931.851 (2)", - "tab": "General information", - "score": 465.92537313432837 - }, - "Sociology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" - } - } - }, - { - "evaluation_name": "Virology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Virology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.578, - "details": { - "description": "min=0.578, mean=0.578, max=0.578, sum=1.157 (2)", - "tab": "Accuracy", - "Virology - Observed inference time (s)": { - "description": "min=0.78, mean=0.78, max=0.78, sum=1.559 (2)", - "tab": "Efficiency", - "score": 0.7796976523227003 - }, - "Virology - # eval": { - "description": "min=166, mean=166, max=166, sum=332 (2)", - "tab": "General information", - 
"score": 166.0 - }, - "Virology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Virology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Virology - # prompt tokens": { - "description": "min=358.048, mean=358.048, max=358.048, sum=716.096 (2)", - "tab": "General information", - "score": 358.04819277108436 - }, - "Virology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" - } - } - }, - { - "evaluation_name": "World Religions", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on World Religions", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.842, - "details": { - "description": "min=0.842, mean=0.842, max=0.842, sum=1.684 (2)", - "tab": "Accuracy", - "World Religions - Observed inference time (s)": { - "description": "min=0.822, mean=0.822, max=0.822, sum=1.644 (2)", - "tab": "Efficiency", - "score": 0.8218589679539552 - }, - "World Religions - # eval": { - "description": "min=171, mean=171, max=171, sum=342 (2)", - "tab": "General information", - "score": 171.0 - }, - "World Religions - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "World Religions - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "World Religions - # prompt tokens": { - "description": "min=293.649, mean=293.649, max=293.649, sum=587.298 (2)", - "tab": "General information", - "score": 293.64912280701753 - }, - "World Religions - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" - } - } - }, - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.206, - "details": { - "tab": "Efficiency" - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_mmlu/ai21/jamba-instruct/3da06ad4-0770-45f5-a6a2-9ef9500cef05.json b/data/helm_mmlu/ai21/jamba-instruct/3da06ad4-0770-45f5-a6a2-9ef9500cef05.json deleted file mode 100644 index 3a25316d9..000000000 --- a/data/helm_mmlu/ai21/jamba-instruct/3da06ad4-0770-45f5-a6a2-9ef9500cef05.json +++ /dev/null @@ -1,3021 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"helm_mmlu/ai21_jamba-instruct/1770835937.459157", - "retrieved_timestamp": "1770835937.459157", - "source_metadata": { - "source_name": "helm_mmlu", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Jamba Instruct", - "id": "ai21/jamba-instruct", - "developer": "ai21", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "MMLU All Subjects", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU All Subjects", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.659, - "details": { - "description": "min=0.341, mean=0.659, max=0.91, sum=75.114 (114)", - "tab": "Accuracy", - "MMLU All Subjects - Observed inference time (s)": { - "description": "min=0.233, mean=0.277, max=0.519, sum=31.585 (114)", - "tab": "Efficiency", - "score": 0.2770578114829593 - }, - "MMLU All Subjects - # eval": { - "description": "min=100, mean=246.351, max=1534, sum=28084 (114)", - "tab": "General information", - "score": 246.35087719298247 - }, - "MMLU All Subjects - # train": { - "description": "min=5, mean=5, max=5, sum=570 (114)", - "tab": "General information", - "score": 5.0 - }, - "MMLU All Subjects - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (114)", - "tab": "General information", - "score": 0.0 - }, - "MMLU All Subjects - # prompt tokens": { - "description": "min=223.731, mean=490.686, max=2081.679, sum=55938.26 (114)", - "tab": "General information", - "score": 490.6864895752317 - }, - "MMLU All Subjects - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (114)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - 
"mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] - } - } - }, - { - "evaluation_name": "Abstract Algebra", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Abstract Algebra", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.36, - "details": { - "description": "min=0.36, mean=0.36, max=0.36, sum=0.72 (2)", - "tab": "Accuracy", - "Abstract Algebra - Observed inference time (s)": { - "description": "min=0.275, mean=0.275, max=0.275, sum=0.55 (2)", - "tab": "Efficiency", - "score": 0.27479029655456544 - }, - "Abstract Algebra - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Abstract Algebra - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Abstract Algebra - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Abstract Algebra - # prompt tokens": { - "description": "min=373.44, mean=373.44, max=373.44, sum=746.88 (2)", - "tab": "General information", - "score": 373.44 - }, - "Abstract Algebra - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" - } - } - }, - { - "evaluation_name": "Anatomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on 
Anatomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.615, - "details": { - "description": "min=0.615, mean=0.615, max=0.615, sum=1.23 (2)", - "tab": "Accuracy", - "Anatomy - Observed inference time (s)": { - "description": "min=0.236, mean=0.236, max=0.236, sum=0.473 (2)", - "tab": "Efficiency", - "score": 0.2363892325648555 - }, - "Anatomy - # eval": { - "description": "min=135, mean=135, max=135, sum=270 (2)", - "tab": "General information", - "score": 135.0 - }, - "Anatomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Anatomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Anatomy - # prompt tokens": { - "description": "min=270.2, mean=270.2, max=270.2, sum=540.4 (2)", - "tab": "General information", - "score": 270.2 - }, - "Anatomy - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" - } - } - }, - { - "evaluation_name": "College Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on College Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.422, - "details": { - "description": "min=0.422, mean=0.422, max=0.422, sum=0.843 (2)", - "tab": "Accuracy", - "College Chemistry - Observed inference time (s)": { - "description": "min=0.275, mean=0.275, max=0.275, sum=0.55 (2)", - "tab": "Efficiency", - "score": 0.2747657370567322 - }, - "College Biology - Observed inference time (s)": { - "description": "min=0.26, mean=0.26, max=0.26, sum=0.519 (2)", - "tab": "Efficiency", - "score": 0.2595776534742779 - }, - "College Computer Science - Observed inference time (s)": { - "description": "min=0.294, mean=0.294, max=0.294, sum=0.588 (2)", - "tab": "Efficiency", - "score": 0.2938127589225769 - }, - "College Mathematics - Observed inference time (s)": { - "description": "min=0.269, mean=0.269, max=0.269, sum=0.538 (2)", - "tab": "Efficiency", - "score": 0.26912292957305906 - }, - "College Medicine - Observed inference time (s)": { - "description": "min=0.309, mean=0.309, max=0.309, sum=0.618 (2)", - "tab": "Efficiency", - "score": 0.30890216579327007 - }, - "College Physics - Observed inference time (s)": { - "description": "min=0.374, mean=0.374, max=0.374, sum=0.749 (2)", - "tab": "Efficiency", - "score": 0.374276315464693 - }, - "College Chemistry - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Chemistry - # prompt tokens": { - "description": "min=549.4, mean=549.4, max=549.4, sum=1098.8 (2)", - "tab": "General information", - "score": 549.4 - 
}, - "College Chemistry - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # eval": { - "description": "min=144, mean=144, max=144, sum=288 (2)", - "tab": "General information", - "score": 144.0 - }, - "College Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # prompt tokens": { - "description": "min=363.431, mean=363.431, max=363.431, sum=726.861 (2)", - "tab": "General information", - "score": 363.43055555555554 - }, - "College Biology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer Science - # prompt tokens": { - "description": "min=720.67, mean=720.67, max=720.67, sum=1441.34 (2)", - "tab": "General information", - "score": 720.67 - }, - "College Computer Science - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # prompt tokens": { - "description": "min=535.22, mean=535.22, max=535.22, sum=1070.44 (2)", - "tab": "General information", - "score": 535.22 - }, - "College Mathematics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Medicine - # eval": { - "description": "min=173, mean=173, max=173, sum=346 (2)", - "tab": "General information", - "score": 173.0 - }, - "College Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Medicine - # prompt tokens": { - "description": "min=397.855, mean=397.855, max=397.855, sum=795.711 (2)", - "tab": "General information", - "score": 397.8554913294798 - }, - "College Medicine - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Physics - # eval": { - "description": "min=102, mean=102, max=102, sum=204 (2)", - "tab": "General information", - "score": 102.0 - }, - "College Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Physics - truncated": { - "description": "min=0, mean=0, 
max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Physics - # prompt tokens": { - "description": "min=392.598, mean=392.598, max=392.598, sum=785.196 (2)", - "tab": "General information", - "score": 392.5980392156863 - }, - "College Physics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" - } - } - }, - { - "evaluation_name": "Computer Security", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Computer Security", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.76, - "details": { - "description": "min=0.76, mean=0.76, max=0.76, sum=1.52 (2)", - "tab": "Accuracy", - "Computer Security - Observed inference time (s)": { - "description": "min=0.253, mean=0.253, max=0.253, sum=0.506 (2)", - "tab": "Efficiency", - "score": 0.2529018998146057 - }, - "Computer Security - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Computer Security - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Computer Security - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Computer Security - # prompt tokens": { - "description": "min=378.54, mean=378.54, max=378.54, sum=757.08 (2)", - "tab": "General information", - "score": 378.54 - }, - "Computer Security - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" - } - } - }, - { - "evaluation_name": "Econometrics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Econometrics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.439, - "details": { - "description": "min=0.439, mean=0.439, max=0.439, sum=0.877 (2)", - "tab": "Accuracy", - "Econometrics - Observed inference time (s)": { - "description": "min=0.254, mean=0.254, max=0.254, sum=0.507 (2)", - "tab": "Efficiency", - "score": 0.25371592086658146 - }, - "Econometrics - # eval": { - "description": "min=114, mean=114, max=114, sum=228 (2)", - "tab": "General information", - "score": 114.0 - }, - "Econometrics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Econometrics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Econometrics - # prompt tokens": { - "description": "min=614.43, mean=614.43, max=614.43, 
sum=1228.86 (2)", - "tab": "General information", - "score": 614.4298245614035 - }, - "Econometrics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" - } - } - }, - { - "evaluation_name": "Global Facts", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Global Facts", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4, - "details": { - "description": "min=0.4, mean=0.4, max=0.4, sum=0.8 (2)", - "tab": "Accuracy", - "Global Facts - Observed inference time (s)": { - "description": "min=0.257, mean=0.257, max=0.257, sum=0.514 (2)", - "tab": "Efficiency", - "score": 0.25686686754226684 - }, - "Global Facts - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Global Facts - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Global Facts - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Global Facts - # prompt tokens": { - "description": "min=329.71, mean=329.71, max=329.71, sum=659.42 (2)", - "tab": "General information", - "score": 329.71 - }, - "Global Facts - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" - } - } - }, - { - "evaluation_name": "Jurisprudence", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Jurisprudence", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.796, - "details": { - "description": "min=0.796, mean=0.796, max=0.796, sum=1.593 (2)", - "tab": "Accuracy", - "Jurisprudence - Observed inference time (s)": { - "description": "min=0.26, mean=0.26, max=0.26, sum=0.521 (2)", - "tab": "Efficiency", - "score": 0.260397990544637 - }, - "Jurisprudence - # eval": { - "description": "min=108, mean=108, max=108, sum=216 (2)", - "tab": "General information", - "score": 108.0 - }, - "Jurisprudence - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Jurisprudence - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Jurisprudence - # prompt tokens": { - "description": "min=312.287, mean=312.287, max=312.287, sum=624.574 (2)", - "tab": "General information", - "score": 312.287037037037 - }, - "Jurisprudence - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - 
} - }, - "generation_config": { - "additional_details": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" - } - } - }, - { - "evaluation_name": "Philosophy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Philosophy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.749, - "details": { - "description": "min=0.749, mean=0.749, max=0.749, sum=1.498 (2)", - "tab": "Accuracy", - "Philosophy - Observed inference time (s)": { - "description": "min=0.252, mean=0.252, max=0.252, sum=0.504 (2)", - "tab": "Efficiency", - "score": 0.25189057270430293 - }, - "Philosophy - # eval": { - "description": "min=311, mean=311, max=311, sum=622 (2)", - "tab": "General information", - "score": 311.0 - }, - "Philosophy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Philosophy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Philosophy - # prompt tokens": { - "description": "min=267.441, mean=267.441, max=267.441, sum=534.881 (2)", - "tab": "General information", - "score": 267.4405144694534 - }, - "Philosophy - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" - } - } - }, - { - "evaluation_name": "Professional Psychology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Professional Psychology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.716, - "details": { - "description": "min=0.716, mean=0.716, max=0.716, sum=1.431 (2)", - "tab": "Accuracy", - "Professional Medicine - Observed inference time (s)": { - "description": "min=0.308, mean=0.308, max=0.308, sum=0.616 (2)", - "tab": "Efficiency", - "score": 0.30818068542901206 - }, - "Professional Accounting - Observed inference time (s)": { - "description": "min=0.266, mean=0.266, max=0.266, sum=0.532 (2)", - "tab": "Efficiency", - "score": 0.26598995881723175 - }, - "Professional Law - Observed inference time (s)": { - "description": "min=0.365, mean=0.365, max=0.365, sum=0.73 (2)", - "tab": "Efficiency", - "score": 0.36489380229716195 - }, - "Professional Psychology - Observed inference time (s)": { - "description": "min=0.255, mean=0.255, max=0.255, sum=0.511 (2)", - "tab": "Efficiency", - "score": 0.25544750768374774 - }, - "Professional Medicine - # eval": { - "description": "min=272, mean=272, max=272, sum=544 (2)", - "tab": "General information", - "score": 272.0 - }, - "Professional Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Medicine - truncated": { - "description": "min=0, mean=0, 
max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Medicine - # prompt tokens": { - "description": "min=813.651, mean=813.651, max=813.651, sum=1627.301 (2)", - "tab": "General information", - "score": 813.6507352941177 - }, - "Professional Medicine - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Accounting - # eval": { - "description": "min=282, mean=282, max=282, sum=564 (2)", - "tab": "General information", - "score": 282.0 - }, - "Professional Accounting - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Accounting - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Accounting - # prompt tokens": { - "description": "min=555.461, mean=555.461, max=555.461, sum=1110.922 (2)", - "tab": "General information", - "score": 555.4609929078014 - }, - "Professional Accounting - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # eval": { - "description": "min=1534, mean=1534, max=1534, sum=3068 (2)", - "tab": "General information", - "score": 1534.0 - }, - "Professional Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # prompt tokens": { - "description": "min=1151.508, mean=1151.508, max=1151.508, sum=2303.016 (2)", - "tab": "General information", - "score": 1151.5078226857888 - }, - "Professional Law - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # eval": { - "description": "min=612, mean=612, max=612, sum=1224 (2)", - "tab": "General information", - "score": 612.0 - }, - "Professional Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # prompt tokens": { - "description": "min=422.158, mean=422.158, max=422.158, sum=844.317 (2)", - "tab": "General information", - "score": 422.15849673202615 - }, - "Professional Psychology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" - } - } - }, - { - "evaluation_name": "Us Foreign Policy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Us Foreign Policy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.91, - "details": { - "description": "min=0.91, mean=0.91, max=0.91, sum=1.82 (2)", - "tab": "Accuracy", - 
"Us Foreign Policy - Observed inference time (s)": { - "description": "min=0.271, mean=0.271, max=0.271, sum=0.542 (2)", - "tab": "Efficiency", - "score": 0.27118161678314207 - }, - "Us Foreign Policy - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Us Foreign Policy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Us Foreign Policy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Us Foreign Policy - # prompt tokens": { - "description": "min=422.79, mean=422.79, max=422.79, sum=845.58 (2)", - "tab": "General information", - "score": 422.79 - }, - "Us Foreign Policy - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" - } - } - }, - { - "evaluation_name": "Astronomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Astronomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.73, - "details": { - "description": "min=0.73, mean=0.73, max=0.73, sum=1.461 (2)", - "tab": "Accuracy", - "Astronomy - Observed inference time (s)": { - "description": "min=0.276, mean=0.276, max=0.276, sum=0.553 (2)", - "tab": "Efficiency", - "score": 0.27634719171022115 - }, - "Astronomy - # eval": { - "description": "min=152, mean=152, max=152, sum=304 (2)", - "tab": "General information", - "score": 152.0 - }, - "Astronomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Astronomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Astronomy - # prompt tokens": { - "description": "min=440.612, mean=440.612, max=440.612, sum=881.224 (2)", - "tab": "General information", - "score": 440.6118421052632 - }, - "Astronomy - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" - } - } - }, - { - "evaluation_name": "Business Ethics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Business Ethics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6, - "details": { - "description": "min=0.6, mean=0.6, max=0.6, sum=1.2 (2)", - "tab": "Accuracy", - "Business Ethics - Observed inference time (s)": { - "description": "min=0.267, mean=0.267, max=0.267, sum=0.533 (2)", - "tab": "Efficiency", - "score": 0.2665403389930725 - }, - "Business Ethics - # eval": { - 
"description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Business Ethics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Business Ethics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Business Ethics - # prompt tokens": { - "description": "min=521.13, mean=521.13, max=521.13, sum=1042.26 (2)", - "tab": "General information", - "score": 521.13 - }, - "Business Ethics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" - } - } - }, - { - "evaluation_name": "Clinical Knowledge", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Clinical Knowledge", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.702, - "details": { - "description": "min=0.702, mean=0.702, max=0.702, sum=1.404 (2)", - "tab": "Accuracy", - "Clinical Knowledge - Observed inference time (s)": { - "description": "min=0.259, mean=0.259, max=0.259, sum=0.517 (2)", - "tab": "Efficiency", - "score": 0.25872870661177727 - }, - "Clinical Knowledge - # eval": { - "description": "min=265, mean=265, max=265, sum=530 (2)", - "tab": "General information", - "score": 265.0 - }, - "Clinical Knowledge - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Clinical Knowledge - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Clinical Knowledge - # prompt tokens": { - "description": "min=317.268, mean=317.268, max=317.268, sum=634.536 (2)", - "tab": "General information", - "score": 317.2679245283019 - }, - "Clinical Knowledge - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" - } - } - }, - { - "evaluation_name": "Conceptual Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Conceptual Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.677, - "details": { - "description": "min=0.677, mean=0.677, max=0.677, sum=1.353 (2)", - "tab": "Accuracy", - "Conceptual Physics - Observed inference time (s)": { - "description": "min=0.306, mean=0.306, max=0.306, sum=0.613 (2)", - "tab": "Efficiency", - "score": 0.30636518965376186 - }, - "Conceptual Physics - # eval": { - "description": "min=235, mean=235, max=235, sum=470 (2)", - "tab": "General information", - "score": 235.0 - }, - 
"Conceptual Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Conceptual Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Conceptual Physics - # prompt tokens": { - "description": "min=241.511, mean=241.511, max=241.511, sum=483.021 (2)", - "tab": "General information", - "score": 241.51063829787233 - }, - "Conceptual Physics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" - } - } - }, - { - "evaluation_name": "Electrical Engineering", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Electrical Engineering", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.621, - "details": { - "description": "min=0.621, mean=0.621, max=0.621, sum=1.241 (2)", - "tab": "Accuracy", - "Electrical Engineering - Observed inference time (s)": { - "description": "min=0.412, mean=0.412, max=0.412, sum=0.825 (2)", - "tab": "Efficiency", - "score": 0.41247522255470015 - }, - "Electrical Engineering - # eval": { - "description": "min=145, mean=145, max=145, sum=290 (2)", - "tab": "General information", - "score": 145.0 - }, - "Electrical Engineering - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Electrical Engineering - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Electrical Engineering - # prompt tokens": { - "description": "min=382.393, mean=382.393, max=382.393, sum=764.786 (2)", - "tab": "General information", - "score": 382.39310344827584 - }, - "Electrical Engineering - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" - } - } - }, - { - "evaluation_name": "Elementary Mathematics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Elementary Mathematics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.497, - "details": { - "description": "min=0.497, mean=0.497, max=0.497, sum=0.995 (2)", - "tab": "Accuracy", - "Elementary Mathematics - Observed inference time (s)": { - "description": "min=0.259, mean=0.259, max=0.259, sum=0.517 (2)", - "tab": "Efficiency", - "score": 0.2586819948973479 - }, - "Elementary Mathematics - # eval": { - "description": "min=378, mean=378, max=378, sum=756 (2)", - "tab": "General information", - "score": 378.0 - }, - "Elementary Mathematics - # 
train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Elementary Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Elementary Mathematics - # prompt tokens": { - "description": "min=467.987, mean=467.987, max=467.987, sum=935.974 (2)", - "tab": "General information", - "score": 467.9867724867725 - }, - "Elementary Mathematics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" - } - } - }, - { - "evaluation_name": "Formal Logic", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Formal Logic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.444, - "details": { - "description": "min=0.444, mean=0.444, max=0.444, sum=0.889 (2)", - "tab": "Accuracy", - "Formal Logic - Observed inference time (s)": { - "description": "min=0.263, mean=0.263, max=0.263, sum=0.526 (2)", - "tab": "Efficiency", - "score": 0.2629187542294699 - }, - "Formal Logic - # eval": { - "description": "min=126, mean=126, max=126, sum=252 (2)", - "tab": "General information", - "score": 126.0 - }, - "Formal Logic - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Formal Logic - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Formal Logic - # prompt tokens": { - "description": "min=559.865, mean=559.865, max=559.865, sum=1119.73 (2)", - "tab": "General information", - "score": 559.8650793650794 - }, - "Formal Logic - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" - } - } - }, - { - "evaluation_name": "High School World History", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on High School World History", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.797, - "details": { - "description": "min=0.797, mean=0.797, max=0.797, sum=1.595 (2)", - "tab": "Accuracy", - "High School Biology - Observed inference time (s)": { - "description": "min=0.256, mean=0.256, max=0.256, sum=0.513 (2)", - "tab": "Efficiency", - "score": 0.25630061088069794 - }, - "High School Chemistry - Observed inference time (s)": { - "description": "min=0.259, mean=0.259, max=0.259, sum=0.519 (2)", - "tab": "Efficiency", - "score": 0.2594739521665526 - }, - "High School Computer Science - Observed inference time (s)": { - "description": "min=0.294, 
mean=0.294, max=0.294, sum=0.588 (2)", - "tab": "Efficiency", - "score": 0.29399110078811647 - }, - "High School European History - Observed inference time (s)": { - "description": "min=0.519, mean=0.519, max=0.519, sum=1.039 (2)", - "tab": "Efficiency", - "score": 0.5194540543989702 - }, - "High School Geography - Observed inference time (s)": { - "description": "min=0.25, mean=0.25, max=0.25, sum=0.5 (2)", - "tab": "Efficiency", - "score": 0.24992815051415954 - }, - "High School Government And Politics - Observed inference time (s)": { - "description": "min=0.242, mean=0.242, max=0.242, sum=0.484 (2)", - "tab": "Efficiency", - "score": 0.242088835474123 - }, - "High School Macroeconomics - Observed inference time (s)": { - "description": "min=0.24, mean=0.24, max=0.24, sum=0.481 (2)", - "tab": "Efficiency", - "score": 0.240464658003587 - }, - "High School Mathematics - Observed inference time (s)": { - "description": "min=0.252, mean=0.252, max=0.252, sum=0.503 (2)", - "tab": "Efficiency", - "score": 0.25154934459262424 - }, - "High School Microeconomics - Observed inference time (s)": { - "description": "min=0.25, mean=0.25, max=0.25, sum=0.501 (2)", - "tab": "Efficiency", - "score": 0.25046268931957855 - }, - "High School Physics - Observed inference time (s)": { - "description": "min=0.256, mean=0.256, max=0.256, sum=0.511 (2)", - "tab": "Efficiency", - "score": 0.25560809444907484 - }, - "High School Psychology - Observed inference time (s)": { - "description": "min=0.251, mean=0.251, max=0.251, sum=0.501 (2)", - "tab": "Efficiency", - "score": 0.250657169971991 - }, - "High School Statistics - Observed inference time (s)": { - "description": "min=0.282, mean=0.282, max=0.282, sum=0.564 (2)", - "tab": "Efficiency", - "score": 0.2818450938772272 - }, - "High School US History - Observed inference time (s)": { - "description": "min=0.45, mean=0.45, max=0.45, sum=0.9 (2)", - "tab": "Efficiency", - "score": 0.44991188072690774 - }, - "High School World History - Observed inference time (s)": { - "description": "min=0.347, mean=0.347, max=0.347, sum=0.693 (2)", - "tab": "Efficiency", - "score": 0.3466388042466047 - }, - "High School Biology - # eval": { - "description": "min=310, mean=310, max=310, sum=620 (2)", - "tab": "General information", - "score": 310.0 - }, - "High School Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Biology - # prompt tokens": { - "description": "min=380.871, mean=380.871, max=380.871, sum=761.742 (2)", - "tab": "General information", - "score": 380.8709677419355 - }, - "High School Biology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # eval": { - "description": "min=203, mean=203, max=203, sum=406 (2)", - "tab": "General information", - "score": 203.0 - }, - "High School Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # prompt tokens": { - "description": "min=401.734, mean=401.734, max=401.734, sum=803.468 (2)", - "tab": "General information", - "score": 401.73399014778323 
- }, - "High School Chemistry - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "High School Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # prompt tokens": { - "description": "min=793.8, mean=793.8, max=793.8, sum=1587.6 (2)", - "tab": "General information", - "score": 793.8 - }, - "High School Computer Science - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School European History - # eval": { - "description": "min=165, mean=165, max=165, sum=330 (2)", - "tab": "General information", - "score": 165.0 - }, - "High School European History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School European History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School European History - # prompt tokens": { - "description": "min=2081.679, mean=2081.679, max=2081.679, sum=4163.358 (2)", - "tab": "General information", - "score": 2081.6787878787877 - }, - "High School European History - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Geography - # eval": { - "description": "min=198, mean=198, max=198, sum=396 (2)", - "tab": "General information", - "score": 198.0 - }, - "High School Geography - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Geography - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Geography - # prompt tokens": { - "description": "min=299.717, mean=299.717, max=299.717, sum=599.434 (2)", - "tab": "General information", - "score": 299.7171717171717 - }, - "High School Geography - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # eval": { - "description": "min=193, mean=193, max=193, sum=386 (2)", - "tab": "General information", - "score": 193.0 - }, - "High School Government And Politics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Government And Politics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # prompt tokens": { - "description": "min=333.601, mean=333.601, max=333.601, sum=667.202 (2)", - "tab": "General information", - "score": 333.60103626943004 - }, - "High School Government And Politics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - # eval": { - "description": "min=390, mean=390, max=390, sum=780 (2)", - "tab": "General information", - "score": 
390.0 - }, - "High School Macroeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Macroeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - # prompt tokens": { - "description": "min=286.562, mean=286.562, max=286.562, sum=573.123 (2)", - "tab": "General information", - "score": 286.5615384615385 - }, - "High School Macroeconomics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # eval": { - "description": "min=270, mean=270, max=270, sum=540 (2)", - "tab": "General information", - "score": 270.0 - }, - "High School Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # prompt tokens": { - "description": "min=421.889, mean=421.889, max=421.889, sum=843.778 (2)", - "tab": "General information", - "score": 421.8888888888889 - }, - "High School Mathematics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Microeconomics - # eval": { - "description": "min=238, mean=238, max=238, sum=476 (2)", - "tab": "General information", - "score": 238.0 - }, - "High School Microeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Microeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Microeconomics - # prompt tokens": { - "description": "min=301.231, mean=301.231, max=301.231, sum=602.462 (2)", - "tab": "General information", - "score": 301.2310924369748 - }, - "High School Microeconomics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # eval": { - "description": "min=151, mean=151, max=151, sum=302 (2)", - "tab": "General information", - "score": 151.0 - }, - "High School Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # prompt tokens": { - "description": "min=453.51, mean=453.51, max=453.51, sum=907.02 (2)", - "tab": "General information", - "score": 453.50993377483445 - }, - "High School Physics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # eval": { - "description": "min=545, mean=545, max=545, sum=1090 (2)", - "tab": "General information", - "score": 545.0 - }, - "High School Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # prompt tokens": { - "description": "min=355.059, 
mean=355.059, max=355.059, sum=710.117 (2)", - "tab": "General information", - "score": 355.0587155963303 - }, - "High School Psychology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Statistics - # eval": { - "description": "min=216, mean=216, max=216, sum=432 (2)", - "tab": "General information", - "score": 216.0 - }, - "High School Statistics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Statistics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Statistics - # prompt tokens": { - "description": "min=648.037, mean=648.037, max=648.037, sum=1296.074 (2)", - "tab": "General information", - "score": 648.0370370370371 - }, - "High School Statistics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # eval": { - "description": "min=204, mean=204, max=204, sum=408 (2)", - "tab": "General information", - "score": 204.0 - }, - "High School US History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School US History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # prompt tokens": { - "description": "min=1628.495, mean=1628.495, max=1628.495, sum=3256.99 (2)", - "tab": "General information", - "score": 1628.4950980392157 - }, - "High School US History - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School World History - # eval": { - "description": "min=237, mean=237, max=237, sum=474 (2)", - "tab": "General information", - "score": 237.0 - }, - "High School World History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School World History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School World History - # prompt tokens": { - "description": "min=1025.097, mean=1025.097, max=1025.097, sum=2050.194 (2)", - "tab": "General information", - "score": 1025.097046413502 - }, - "High School World History - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" - } - } - }, - { - "evaluation_name": "Human Sexuality", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Human Sexuality", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.794, - "details": { - "description": "min=0.794, mean=0.794, max=0.794, sum=1.588 (2)", - "tab": "Accuracy", - "Human Aging - Observed inference time (s)": { - "description": "min=0.233, mean=0.233, max=0.233, sum=0.466 
(2)", - "tab": "Efficiency", - "score": 0.2328128023532474 - }, - "Human Sexuality - Observed inference time (s)": { - "description": "min=0.251, mean=0.251, max=0.251, sum=0.501 (2)", - "tab": "Efficiency", - "score": 0.2506928462108583 - }, - "Human Aging - # eval": { - "description": "min=223, mean=223, max=223, sum=446 (2)", - "tab": "General information", - "score": 223.0 - }, - "Human Aging - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Aging - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Aging - # prompt tokens": { - "description": "min=250.915, mean=250.915, max=250.915, sum=501.83 (2)", - "tab": "General information", - "score": 250.91479820627802 - }, - "Human Aging - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # eval": { - "description": "min=131, mean=131, max=131, sum=262 (2)", - "tab": "General information", - "score": 131.0 - }, - "Human Sexuality - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Sexuality - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # prompt tokens": { - "description": "min=263.183, mean=263.183, max=263.183, sum=526.366 (2)", - "tab": "General information", - "score": 263.1832061068702 - }, - "Human Sexuality - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" - } - } - }, - { - "evaluation_name": "International Law", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on International Law", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.835, - "details": { - "description": "min=0.835, mean=0.835, max=0.835, sum=1.669 (2)", - "tab": "Accuracy", - "International Law - Observed inference time (s)": { - "description": "min=0.271, mean=0.271, max=0.271, sum=0.542 (2)", - "tab": "Efficiency", - "score": 0.27110107082965945 - }, - "International Law - # eval": { - "description": "min=121, mean=121, max=121, sum=242 (2)", - "tab": "General information", - "score": 121.0 - }, - "International Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "International Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "International Law - # prompt tokens": { - "description": "min=477.843, mean=477.843, max=477.843, sum=955.686 (2)", - "tab": "General information", - "score": 477.8429752066116 - }, - "International Law - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "international_law", - "method": 
"multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" - } - } - }, - { - "evaluation_name": "Logical Fallacies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Logical Fallacies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.706, - "details": { - "description": "min=0.706, mean=0.706, max=0.706, sum=1.411 (2)", - "tab": "Accuracy", - "Logical Fallacies - Observed inference time (s)": { - "description": "min=0.25, mean=0.25, max=0.25, sum=0.499 (2)", - "tab": "Efficiency", - "score": 0.24970631804202964 - }, - "Logical Fallacies - # eval": { - "description": "min=163, mean=163, max=163, sum=326 (2)", - "tab": "General information", - "score": 163.0 - }, - "Logical Fallacies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Logical Fallacies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Logical Fallacies - # prompt tokens": { - "description": "min=337.718, mean=337.718, max=337.718, sum=675.436 (2)", - "tab": "General information", - "score": 337.7177914110429 - }, - "Logical Fallacies - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" - } - } - }, - { - "evaluation_name": "Machine Learning", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Machine Learning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.536, - "details": { - "description": "min=0.536, mean=0.536, max=0.536, sum=1.071 (2)", - "tab": "Accuracy", - "Machine Learning - Observed inference time (s)": { - "description": "min=0.267, mean=0.267, max=0.267, sum=0.533 (2)", - "tab": "Efficiency", - "score": 0.2665597881589617 - }, - "Machine Learning - # eval": { - "description": "min=112, mean=112, max=112, sum=224 (2)", - "tab": "General information", - "score": 112.0 - }, - "Machine Learning - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Machine Learning - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Machine Learning - # prompt tokens": { - "description": "min=559.277, mean=559.277, max=559.277, sum=1118.554 (2)", - "tab": "General information", - "score": 559.2767857142857 - }, - "Machine Learning - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" - } - } - }, - { - 
"evaluation_name": "Management", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Management", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.786, - "details": { - "description": "min=0.786, mean=0.786, max=0.786, sum=1.573 (2)", - "tab": "Accuracy", - "Management - Observed inference time (s)": { - "description": "min=0.241, mean=0.241, max=0.241, sum=0.481 (2)", - "tab": "Efficiency", - "score": 0.24073980386974742 - }, - "Management - # eval": { - "description": "min=103, mean=103, max=103, sum=206 (2)", - "tab": "General information", - "score": 103.0 - }, - "Management - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Management - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Management - # prompt tokens": { - "description": "min=225.262, mean=225.262, max=225.262, sum=450.524 (2)", - "tab": "General information", - "score": 225.2621359223301 - }, - "Management - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" - } - } - }, - { - "evaluation_name": "Marketing", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Marketing", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.885, - "details": { - "description": "min=0.885, mean=0.885, max=0.885, sum=1.769 (2)", - "tab": "Accuracy", - "Marketing - Observed inference time (s)": { - "description": "min=0.258, mean=0.258, max=0.258, sum=0.517 (2)", - "tab": "Efficiency", - "score": 0.25835410753885907 - }, - "Marketing - # eval": { - "description": "min=234, mean=234, max=234, sum=468 (2)", - "tab": "General information", - "score": 234.0 - }, - "Marketing - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Marketing - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Marketing - # prompt tokens": { - "description": "min=351.573, mean=351.573, max=351.573, sum=703.145 (2)", - "tab": "General information", - "score": 351.5726495726496 - }, - "Marketing - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" - } - } - }, - { - "evaluation_name": "Medical Genetics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - 
"metric_config": { - "evaluation_description": "EM on Medical Genetics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.67, - "details": { - "description": "min=0.67, mean=0.67, max=0.67, sum=1.34 (2)", - "tab": "Accuracy", - "Medical Genetics - Observed inference time (s)": { - "description": "min=0.251, mean=0.251, max=0.251, sum=0.502 (2)", - "tab": "Efficiency", - "score": 0.2510761094093323 - }, - "Medical Genetics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Medical Genetics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Medical Genetics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Medical Genetics - # prompt tokens": { - "description": "min=274.75, mean=274.75, max=274.75, sum=549.5 (2)", - "tab": "General information", - "score": 274.75 - }, - "Medical Genetics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" - } - } - }, - { - "evaluation_name": "Miscellaneous", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Miscellaneous", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.865, - "details": { - "description": "min=0.865, mean=0.865, max=0.865, sum=1.729 (2)", - "tab": "Accuracy", - "Miscellaneous - Observed inference time (s)": { - "description": "min=0.233, mean=0.233, max=0.233, sum=0.466 (2)", - "tab": "Efficiency", - "score": 0.23304342005596915 - }, - "Miscellaneous - # eval": { - "description": "min=783, mean=783, max=783, sum=1566 (2)", - "tab": "General information", - "score": 783.0 - }, - "Miscellaneous - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Miscellaneous - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Miscellaneous - # prompt tokens": { - "description": "min=254.525, mean=254.525, max=254.525, sum=509.05 (2)", - "tab": "General information", - "score": 254.5249042145594 - }, - "Miscellaneous - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" - } - } - }, - { - "evaluation_name": "Moral Scenarios", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Moral Scenarios", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.465, - "details": { - "description": "min=0.465, mean=0.465, max=0.465, sum=0.93 (2)", - "tab": "Accuracy", - "Moral Disputes - Observed inference time (s)": { - "description": "min=0.256, mean=0.256, max=0.256, sum=0.512 (2)", - "tab": "Efficiency", - "score": 0.2561916905331474 - }, - "Moral Scenarios - Observed inference time (s)": { - "description": "min=0.262, mean=0.262, max=0.262, sum=0.525 (2)", - "tab": "Efficiency", - "score": 0.2624055065922231 - }, - "Moral Disputes - # eval": { - "description": "min=346, mean=346, max=346, sum=692 (2)", - "tab": "General information", - "score": 346.0 - }, - "Moral Disputes - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Disputes - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Disputes - # prompt tokens": { - "description": "min=357.165, mean=357.165, max=357.165, sum=714.329 (2)", - "tab": "General information", - "score": 357.16473988439304 - }, - "Moral Disputes - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # eval": { - "description": "min=895, mean=895, max=895, sum=1790 (2)", - "tab": "General information", - "score": 895.0 - }, - "Moral Scenarios - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Scenarios - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # prompt tokens": { - "description": "min=546.793, mean=546.793, max=546.793, sum=1093.587 (2)", - "tab": "General information", - "score": 546.7932960893854 - }, - "Moral Scenarios - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" - } - } - }, - { - "evaluation_name": "Nutrition", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Nutrition", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.745, - "details": { - "description": "min=0.745, mean=0.745, max=0.745, sum=1.49 (2)", - "tab": "Accuracy", - "Nutrition - Observed inference time (s)": { - "description": "min=0.248, mean=0.248, max=0.248, sum=0.496 (2)", - "tab": "Efficiency", - "score": 0.2479639964945176 - }, - "Nutrition - # eval": { - "description": "min=306, mean=306, max=306, sum=612 (2)", - "tab": "General information", - "score": 306.0 - }, - "Nutrition - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Nutrition - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Nutrition - # prompt tokens": { - "description": "min=454.758, mean=454.758, max=454.758, sum=909.516 (2)", - "tab": "General information", - "score": 454.75816993464053 - }, - "Nutrition - # output tokens": { - 
"description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" - } - } - }, - { - "evaluation_name": "Prehistory", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Prehistory", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.796, - "details": { - "description": "min=0.796, mean=0.796, max=0.796, sum=1.593 (2)", - "tab": "Accuracy", - "Prehistory - Observed inference time (s)": { - "description": "min=0.254, mean=0.254, max=0.254, sum=0.508 (2)", - "tab": "Efficiency", - "score": 0.2538878917694092 - }, - "Prehistory - # eval": { - "description": "min=324, mean=324, max=324, sum=648 (2)", - "tab": "General information", - "score": 324.0 - }, - "Prehistory - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Prehistory - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Prehistory - # prompt tokens": { - "description": "min=410.315, mean=410.315, max=410.315, sum=820.63 (2)", - "tab": "General information", - "score": 410.31481481481484 - }, - "Prehistory - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" - } - } - }, - { - "evaluation_name": "Public Relations", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Public Relations", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.682, - "details": { - "description": "min=0.682, mean=0.682, max=0.682, sum=1.364 (2)", - "tab": "Accuracy", - "Public Relations - Observed inference time (s)": { - "description": "min=0.252, mean=0.252, max=0.252, sum=0.505 (2)", - "tab": "Efficiency", - "score": 0.25225248553536156 - }, - "Public Relations - # eval": { - "description": "min=110, mean=110, max=110, sum=220 (2)", - "tab": "General information", - "score": 110.0 - }, - "Public Relations - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Public Relations - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Public Relations - # prompt tokens": { - "description": "min=316.591, mean=316.591, max=316.591, sum=633.182 (2)", - "tab": "General information", - "score": 316.59090909090907 - }, - "Public Relations - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "public_relations", - 
"method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" - } - } - }, - { - "evaluation_name": "Security Studies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Security Studies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.743, - "details": { - "description": "min=0.743, mean=0.743, max=0.743, sum=1.486 (2)", - "tab": "Accuracy", - "Security Studies - Observed inference time (s)": { - "description": "min=0.31, mean=0.31, max=0.31, sum=0.62 (2)", - "tab": "Efficiency", - "score": 0.30983400539476047 - }, - "Security Studies - # eval": { - "description": "min=245, mean=245, max=245, sum=490 (2)", - "tab": "General information", - "score": 245.0 - }, - "Security Studies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Security Studies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Security Studies - # prompt tokens": { - "description": "min=856.637, mean=856.637, max=856.637, sum=1713.273 (2)", - "tab": "General information", - "score": 856.6367346938775 - }, - "Security Studies - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" - } - } - }, - { - "evaluation_name": "Sociology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Sociology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.891, - "details": { - "description": "min=0.891, mean=0.891, max=0.891, sum=1.781 (2)", - "tab": "Accuracy", - "Sociology - Observed inference time (s)": { - "description": "min=0.258, mean=0.258, max=0.258, sum=0.515 (2)", - "tab": "Efficiency", - "score": 0.25752189384764107 - }, - "Sociology - # eval": { - "description": "min=201, mean=201, max=201, sum=402 (2)", - "tab": "General information", - "score": 201.0 - }, - "Sociology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Sociology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Sociology - # prompt tokens": { - "description": "min=327.801, mean=327.801, max=327.801, sum=655.602 (2)", - "tab": "General information", - "score": 327.80099502487565 - }, - "Sociology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" - } - } - }, - { - "evaluation_name": "Virology", - "source_data": { - "dataset_name": "helm_mmlu", - 
"source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Virology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.53, - "details": { - "description": "min=0.53, mean=0.53, max=0.53, sum=1.06 (2)", - "tab": "Accuracy", - "Virology - Observed inference time (s)": { - "description": "min=0.238, mean=0.238, max=0.238, sum=0.477 (2)", - "tab": "Efficiency", - "score": 0.23830672200903835 - }, - "Virology - # eval": { - "description": "min=166, mean=166, max=166, sum=332 (2)", - "tab": "General information", - "score": 166.0 - }, - "Virology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Virology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Virology - # prompt tokens": { - "description": "min=267.458, mean=267.458, max=267.458, sum=534.916 (2)", - "tab": "General information", - "score": 267.4578313253012 - }, - "Virology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" - } - } - }, - { - "evaluation_name": "World Religions", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on World Religions", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.813, - "details": { - "description": "min=0.813, mean=0.813, max=0.813, sum=1.626 (2)", - "tab": "Accuracy", - "World Religions - Observed inference time (s)": { - "description": "min=0.236, mean=0.236, max=0.236, sum=0.473 (2)", - "tab": "Efficiency", - "score": 0.23630904593662908 - }, - "World Religions - # eval": { - "description": "min=171, mean=171, max=171, sum=342 (2)", - "tab": "General information", - "score": 171.0 - }, - "World Religions - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "World Religions - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "World Religions - # prompt tokens": { - "description": "min=223.731, mean=223.731, max=223.731, sum=447.462 (2)", - "tab": "General information", - "score": 223.73099415204678 - }, - "World Religions - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" - } - } - }, - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "How 
many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.887, - "details": { - "tab": "Efficiency" - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_mmlu/allenai/olmo-1.7-7b/c1c79360-60bd-4f5d-a746-e0411b94f69b.json b/data/helm_mmlu/allenai/olmo-1.7-7b/c1c79360-60bd-4f5d-a746-e0411b94f69b.json deleted file mode 100644 index 8bf036c64..000000000 --- a/data/helm_mmlu/allenai/olmo-1.7-7b/c1c79360-60bd-4f5d-a746-e0411b94f69b.json +++ /dev/null @@ -1,3021 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/allenai_olmo-1.7-7b/1770835937.459157", - "retrieved_timestamp": "1770835937.459157", - "source_metadata": { - "source_name": "helm_mmlu", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "OLMo 1.7 7B", - "id": "allenai/olmo-1.7-7b", - "developer": "allenai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "MMLU All Subjects", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU All Subjects", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.538, - "details": { - "description": "min=0.307, mean=0.538, max=0.769, sum=61.295 (114)", - "tab": "Accuracy", - "MMLU All Subjects - Observed inference time (s)": { - "description": "min=0.518, mean=1.024, max=2.978, sum=116.777 (114)", - "tab": "Efficiency", - "score": 1.024362741022275 - }, - "MMLU All Subjects - # eval": { - "description": "min=100, mean=246.351, max=1534, sum=28084 (114)", - "tab": "General information", - "score": 246.35087719298247 - }, - "MMLU All Subjects - # train": { - "description": "min=2.909, mean=4.946, max=5, sum=563.813 (114)", - "tab": "General information", - "score": 4.945727778020373 - }, - "MMLU All Subjects - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (114)", - "tab": "General information", - "score": 0.0 - }, - "MMLU All Subjects - # prompt tokens": { - "description": "min=285.766, mean=597.916, max=1816.758, sum=68162.415 (114)", - "tab": "General information", - "score": 597.9159199418197 - }, - "MMLU All Subjects - # output tokens": { - "description": "min=1, mean=1, max=1, sum=114 (114)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - 
"high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] - } - } - }, - { - "evaluation_name": "Abstract Algebra", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Abstract Algebra", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.33, - "details": { - "description": "min=0.33, mean=0.33, max=0.33, sum=0.66 (2)", - "tab": "Accuracy", - "Abstract Algebra - Observed inference time (s)": { - "description": "min=0.664, mean=0.664, max=0.664, sum=1.328 (2)", - "tab": "Efficiency", - "score": 0.664234619140625 - }, - "Abstract Algebra - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Abstract Algebra - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Abstract Algebra - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Abstract Algebra - # prompt tokens": { - 
"description": "min=358.76, mean=358.76, max=358.76, sum=717.52 (2)", - "tab": "General information", - "score": 358.76 - }, - "Abstract Algebra - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" - } - } - }, - { - "evaluation_name": "Anatomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Anatomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.496, - "details": { - "description": "min=0.496, mean=0.496, max=0.496, sum=0.993 (2)", - "tab": "Accuracy", - "Anatomy - Observed inference time (s)": { - "description": "min=0.619, mean=0.619, max=0.619, sum=1.237 (2)", - "tab": "Efficiency", - "score": 0.618622675648442 - }, - "Anatomy - # eval": { - "description": "min=135, mean=135, max=135, sum=270 (2)", - "tab": "General information", - "score": 135.0 - }, - "Anatomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Anatomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Anatomy - # prompt tokens": { - "description": "min=352.03, mean=352.03, max=352.03, sum=704.059 (2)", - "tab": "General information", - "score": 352.02962962962965 - }, - "Anatomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" - } - } - }, - { - "evaluation_name": "College Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on College Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.333, - "details": { - "description": "min=0.333, mean=0.333, max=0.333, sum=0.667 (2)", - "tab": "Accuracy", - "College Chemistry - Observed inference time (s)": { - "description": "min=0.954, mean=0.954, max=0.954, sum=1.908 (2)", - "tab": "Efficiency", - "score": 0.9539380264282227 - }, - "College Biology - Observed inference time (s)": { - "description": "min=0.791, mean=0.791, max=0.791, sum=1.582 (2)", - "tab": "Efficiency", - "score": 0.7911433676878611 - }, - "College Computer Science - Observed inference time (s)": { - "description": "min=1.44, mean=1.44, max=1.44, sum=2.88 (2)", - "tab": "Efficiency", - "score": 1.4402443194389343 - }, - "College Mathematics - Observed inference time (s)": { - "description": "min=1.005, mean=1.005, max=1.005, sum=2.01 (2)", - "tab": "Efficiency", - "score": 1.0049437880516052 - }, - "College Medicine - Observed inference time (s)": { - "description": "min=0.933, mean=0.933, max=0.933, sum=1.866 (2)", - "tab": "Efficiency", - "score": 
0.9331957646188019 - }, - "College Physics - Observed inference time (s)": { - "description": "min=0.854, mean=0.854, max=0.854, sum=1.707 (2)", - "tab": "Efficiency", - "score": 0.8537454745348763 - }, - "College Chemistry - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Chemistry - # prompt tokens": { - "description": "min=535.85, mean=535.85, max=535.85, sum=1071.7 (2)", - "tab": "General information", - "score": 535.85 - }, - "College Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Biology - # eval": { - "description": "min=144, mean=144, max=144, sum=288 (2)", - "tab": "General information", - "score": 144.0 - }, - "College Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # prompt tokens": { - "description": "min=470.319, mean=470.319, max=470.319, sum=940.639 (2)", - "tab": "General information", - "score": 470.31944444444446 - }, - "College Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer Science - # prompt tokens": { - "description": "min=842.89, mean=842.89, max=842.89, sum=1685.78 (2)", - "tab": "General information", - "score": 842.89 - }, - "College Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Mathematics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # prompt tokens": { - "description": "min=592.82, mean=592.82, max=592.82, sum=1185.64 (2)", - "tab": "General information", - "score": 592.82 - }, - "College Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Medicine - # eval": { - "description": "min=173, mean=173, max=173, sum=346 (2)", - "tab": "General information", - "score": 173.0 - }, - "College Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Medicine - 
truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Medicine - # prompt tokens": { - "description": "min=519.376, mean=519.376, max=519.376, sum=1038.751 (2)", - "tab": "General information", - "score": 519.3757225433526 - }, - "College Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Physics - # eval": { - "description": "min=102, mean=102, max=102, sum=204 (2)", - "tab": "General information", - "score": 102.0 - }, - "College Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Physics - # prompt tokens": { - "description": "min=476.657, mean=476.657, max=476.657, sum=953.314 (2)", - "tab": "General information", - "score": 476.65686274509807 - }, - "College Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" - } - } - }, - { - "evaluation_name": "Computer Security", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Computer Security", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.65, - "details": { - "description": "min=0.65, mean=0.65, max=0.65, sum=1.3 (2)", - "tab": "Accuracy", - "Computer Security - Observed inference time (s)": { - "description": "min=0.671, mean=0.671, max=0.671, sum=1.343 (2)", - "tab": "Efficiency", - "score": 0.6713726472854614 - }, - "Computer Security - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Computer Security - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Computer Security - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Computer Security - # prompt tokens": { - "description": "min=388.19, mean=388.19, max=388.19, sum=776.38 (2)", - "tab": "General information", - "score": 388.19 - }, - "Computer Security - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" - } - } - }, - { - "evaluation_name": "Econometrics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Econometrics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.404, - "details": { - "description": "min=0.404, mean=0.404, max=0.404, sum=0.807 (2)", - "tab": "Accuracy", - "Econometrics - Observed inference time (s)": { - "description": "min=1.05, mean=1.05, max=1.05, sum=2.099 (2)", - "tab": "Efficiency", - "score": 1.0495816971126355 - }, - "Econometrics - # eval": { - "description": "min=114, mean=114, max=114, sum=228 (2)", - "tab": "General information", - "score": 114.0 - }, - "Econometrics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Econometrics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Econometrics - # prompt tokens": { - "description": "min=612.798, mean=612.798, max=612.798, sum=1225.596 (2)", - "tab": "General information", - "score": 612.7982456140351 - }, - "Econometrics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" - } - } - }, - { - "evaluation_name": "Global Facts", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Global Facts", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.34, - "details": { - "description": "min=0.34, mean=0.34, max=0.34, sum=0.68 (2)", - "tab": "Accuracy", - "Global Facts - Observed inference time (s)": { - "description": "min=0.739, mean=0.739, max=0.739, sum=1.477 (2)", - "tab": "Efficiency", - "score": 0.7387202930450439 - }, - "Global Facts - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Global Facts - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Global Facts - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Global Facts - # prompt tokens": { - "description": "min=400.58, mean=400.58, max=400.58, sum=801.16 (2)", - "tab": "General information", - "score": 400.58 - }, - "Global Facts - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" - } - } - }, - { - "evaluation_name": "Jurisprudence", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Jurisprudence", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.565, - "details": { - "description": "min=0.565, mean=0.565, max=0.565, sum=1.13 (2)", - "tab": "Accuracy", - "Jurisprudence - Observed inference time (s)": { - "description": "min=0.755, mean=0.755, max=0.755, 
sum=1.51 (2)", - "tab": "Efficiency", - "score": 0.7549951495947661 - }, - "Jurisprudence - # eval": { - "description": "min=108, mean=108, max=108, sum=216 (2)", - "tab": "General information", - "score": 108.0 - }, - "Jurisprudence - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Jurisprudence - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Jurisprudence - # prompt tokens": { - "description": "min=420.861, mean=420.861, max=420.861, sum=841.722 (2)", - "tab": "General information", - "score": 420.8611111111111 - }, - "Jurisprudence - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" - } - } - }, - { - "evaluation_name": "Philosophy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Philosophy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.592, - "details": { - "description": "min=0.592, mean=0.592, max=0.592, sum=1.183 (2)", - "tab": "Accuracy", - "Philosophy - Observed inference time (s)": { - "description": "min=0.622, mean=0.622, max=0.622, sum=1.244 (2)", - "tab": "Efficiency", - "score": 0.6219598725677686 - }, - "Philosophy - # eval": { - "description": "min=311, mean=311, max=311, sum=622 (2)", - "tab": "General information", - "score": 311.0 - }, - "Philosophy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Philosophy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Philosophy - # prompt tokens": { - "description": "min=345.277, mean=345.277, max=345.277, sum=690.553 (2)", - "tab": "General information", - "score": 345.2765273311897 - }, - "Philosophy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" - } - } - }, - { - "evaluation_name": "Professional Psychology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Professional Psychology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.526, - "details": { - "description": "min=0.526, mean=0.526, max=0.526, sum=1.052 (2)", - "tab": "Accuracy", - "Professional Medicine - Observed inference time (s)": { - "description": "min=1.88, mean=1.88, max=1.88, sum=3.759 (2)", - "tab": "Efficiency", - "score": 1.8796235156409882 - }, - "Professional Accounting - Observed inference time (s)": { - "description": "min=1.156, mean=1.156, max=1.156, 
sum=2.312 (2)", - "tab": "Efficiency", - "score": 1.1558757741400536 - }, - "Professional Law - Observed inference time (s)": { - "description": "min=2.735, mean=2.735, max=2.735, sum=5.47 (2)", - "tab": "Efficiency", - "score": 2.734811251757198 - }, - "Professional Psychology - Observed inference time (s)": { - "description": "min=1.006, mean=1.006, max=1.006, sum=2.012 (2)", - "tab": "Efficiency", - "score": 1.0057547404096017 - }, - "Professional Medicine - # eval": { - "description": "min=272, mean=272, max=272, sum=544 (2)", - "tab": "General information", - "score": 272.0 - }, - "Professional Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Medicine - # prompt tokens": { - "description": "min=1080.882, mean=1080.882, max=1080.882, sum=2161.765 (2)", - "tab": "General information", - "score": 1080.8823529411766 - }, - "Professional Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Accounting - # eval": { - "description": "min=282, mean=282, max=282, sum=564 (2)", - "tab": "General information", - "score": 282.0 - }, - "Professional Accounting - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Accounting - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Accounting - # prompt tokens": { - "description": "min=660.922, mean=660.922, max=660.922, sum=1321.844 (2)", - "tab": "General information", - "score": 660.9219858156029 - }, - "Professional Accounting - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Law - # eval": { - "description": "min=1534, mean=1534, max=1534, sum=3068 (2)", - "tab": "General information", - "score": 1534.0 - }, - "Professional Law - # train": { - "description": "min=4.997, mean=4.997, max=4.997, sum=9.995 (2)", - "tab": "General information", - "score": 4.9973924380704045 - }, - "Professional Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # prompt tokens": { - "description": "min=1654.433, mean=1654.433, max=1654.433, sum=3308.866 (2)", - "tab": "General information", - "score": 1654.4328552803129 - }, - "Professional Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Psychology - # eval": { - "description": "min=612, mean=612, max=612, sum=1224 (2)", - "tab": "General information", - "score": 612.0 - }, - "Professional Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # prompt tokens": { - "description": "min=590.873, mean=590.873, max=590.873, sum=1181.745 (2)", - "tab": "General information", - "score": 590.8725490196078 - }, - "Professional Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", 
- "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" - } - } - }, - { - "evaluation_name": "Us Foreign Policy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Us Foreign Policy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.76, - "details": { - "description": "min=0.76, mean=0.76, max=0.76, sum=1.52 (2)", - "tab": "Accuracy", - "Us Foreign Policy - Observed inference time (s)": { - "description": "min=0.802, mean=0.802, max=0.802, sum=1.604 (2)", - "tab": "Efficiency", - "score": 0.8018933439254761 - }, - "Us Foreign Policy - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Us Foreign Policy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Us Foreign Policy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Us Foreign Policy - # prompt tokens": { - "description": "min=444.08, mean=444.08, max=444.08, sum=888.16 (2)", - "tab": "General information", - "score": 444.08 - }, - "Us Foreign Policy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" - } - } - }, - { - "evaluation_name": "Astronomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Astronomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.526, - "details": { - "description": "min=0.526, mean=0.526, max=0.526, sum=1.053 (2)", - "tab": "Accuracy", - "Astronomy - Observed inference time (s)": { - "description": "min=1.012, mean=1.012, max=1.012, sum=2.023 (2)", - "tab": "Efficiency", - "score": 1.0116610966230695 - }, - "Astronomy - # eval": { - "description": "min=152, mean=152, max=152, sum=304 (2)", - "tab": "General information", - "score": 152.0 - }, - "Astronomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Astronomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Astronomy - # prompt tokens": { - "description": "min=598.487, mean=598.487, max=598.487, sum=1196.974 (2)", - "tab": "General information", - "score": 598.4868421052631 - }, - "Astronomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "astronomy", - "method": "multiple_choice_joint", - 
"eval_split": "test", - "groups": "mmlu_astronomy" - } - } - }, - { - "evaluation_name": "Business Ethics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Business Ethics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.59, - "details": { - "description": "min=0.59, mean=0.59, max=0.59, sum=1.18 (2)", - "tab": "Accuracy", - "Business Ethics - Observed inference time (s)": { - "description": "min=0.964, mean=0.964, max=0.964, sum=1.929 (2)", - "tab": "Efficiency", - "score": 0.9642905473709107 - }, - "Business Ethics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Business Ethics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Business Ethics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Business Ethics - # prompt tokens": { - "description": "min=585.05, mean=585.05, max=585.05, sum=1170.1 (2)", - "tab": "General information", - "score": 585.05 - }, - "Business Ethics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" - } - } - }, - { - "evaluation_name": "Clinical Knowledge", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Clinical Knowledge", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.57, - "details": { - "description": "min=0.57, mean=0.57, max=0.57, sum=1.14 (2)", - "tab": "Accuracy", - "Clinical Knowledge - Observed inference time (s)": { - "description": "min=0.57, mean=0.57, max=0.57, sum=1.139 (2)", - "tab": "Efficiency", - "score": 0.5697462513761701 - }, - "Clinical Knowledge - # eval": { - "description": "min=265, mean=265, max=265, sum=530 (2)", - "tab": "General information", - "score": 265.0 - }, - "Clinical Knowledge - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Clinical Knowledge - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Clinical Knowledge - # prompt tokens": { - "description": "min=401.917, mean=401.917, max=401.917, sum=803.834 (2)", - "tab": "General information", - "score": 401.9169811320755 - }, - "Clinical Knowledge - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" - } - } - }, - { - "evaluation_name": "Conceptual Physics", - "source_data": { - 
"dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Conceptual Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.434, - "details": { - "description": "min=0.434, mean=0.434, max=0.434, sum=0.868 (2)", - "tab": "Accuracy", - "Conceptual Physics - Observed inference time (s)": { - "description": "min=0.524, mean=0.524, max=0.524, sum=1.049 (2)", - "tab": "Efficiency", - "score": 0.5244635977643601 - }, - "Conceptual Physics - # eval": { - "description": "min=235, mean=235, max=235, sum=470 (2)", - "tab": "General information", - "score": 235.0 - }, - "Conceptual Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Conceptual Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Conceptual Physics - # prompt tokens": { - "description": "min=311.311, mean=311.311, max=311.311, sum=622.621 (2)", - "tab": "General information", - "score": 311.31063829787234 - }, - "Conceptual Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" - } - } - }, - { - "evaluation_name": "Electrical Engineering", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Electrical Engineering", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.517, - "details": { - "description": "min=0.517, mean=0.517, max=0.517, sum=1.034 (2)", - "tab": "Accuracy", - "Electrical Engineering - Observed inference time (s)": { - "description": "min=0.764, mean=0.764, max=0.764, sum=1.528 (2)", - "tab": "Efficiency", - "score": 0.7642407762593236 - }, - "Electrical Engineering - # eval": { - "description": "min=145, mean=145, max=145, sum=290 (2)", - "tab": "General information", - "score": 145.0 - }, - "Electrical Engineering - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Electrical Engineering - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Electrical Engineering - # prompt tokens": { - "description": "min=424.848, mean=424.848, max=424.848, sum=849.697 (2)", - "tab": "General information", - "score": 424.848275862069 - }, - "Electrical Engineering - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" - } - } - }, - { - "evaluation_name": "Elementary Mathematics", - "source_data": { - "dataset_name": "helm_mmlu", - 
"source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Elementary Mathematics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.307, - "details": { - "description": "min=0.307, mean=0.307, max=0.307, sum=0.614 (2)", - "tab": "Accuracy", - "Elementary Mathematics - Observed inference time (s)": { - "description": "min=0.909, mean=0.909, max=0.909, sum=1.817 (2)", - "tab": "Efficiency", - "score": 0.9087190634359128 - }, - "Elementary Mathematics - # eval": { - "description": "min=378, mean=378, max=378, sum=756 (2)", - "tab": "General information", - "score": 378.0 - }, - "Elementary Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Elementary Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Elementary Mathematics - # prompt tokens": { - "description": "min=505.071, mean=505.071, max=505.071, sum=1010.143 (2)", - "tab": "General information", - "score": 505.07142857142856 - }, - "Elementary Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" - } - } - }, - { - "evaluation_name": "Formal Logic", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Formal Logic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.325, - "details": { - "description": "min=0.325, mean=0.325, max=0.325, sum=0.651 (2)", - "tab": "Accuracy", - "Formal Logic - Observed inference time (s)": { - "description": "min=1.12, mean=1.12, max=1.12, sum=2.24 (2)", - "tab": "Efficiency", - "score": 1.1198924439293998 - }, - "Formal Logic - # eval": { - "description": "min=126, mean=126, max=126, sum=252 (2)", - "tab": "General information", - "score": 126.0 - }, - "Formal Logic - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Formal Logic - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Formal Logic - # prompt tokens": { - "description": "min=653.595, mean=653.595, max=653.595, sum=1307.19 (2)", - "tab": "General information", - "score": 653.5952380952381 - }, - "Formal Logic - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" - } - } - }, - { - "evaluation_name": "High School World History", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on High School World History", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.713, - "details": { - "description": "min=0.713, mean=0.713, max=0.713, sum=1.426 (2)", - "tab": "Accuracy", - "High School Biology - Observed inference time (s)": { - "description": "min=0.926, mean=0.926, max=0.926, sum=1.852 (2)", - "tab": "Efficiency", - "score": 0.9262428129872968 - }, - "High School Chemistry - Observed inference time (s)": { - "description": "min=0.711, mean=0.711, max=0.711, sum=1.421 (2)", - "tab": "Efficiency", - "score": 0.710636249316737 - }, - "High School Computer Science - Observed inference time (s)": { - "description": "min=1.389, mean=1.389, max=1.389, sum=2.779 (2)", - "tab": "Efficiency", - "score": 1.3893755102157592 - }, - "High School European History - Observed inference time (s)": { - "description": "min=2.978, mean=2.978, max=2.978, sum=5.957 (2)", - "tab": "Efficiency", - "score": 2.9784073266116056 - }, - "High School Geography - Observed inference time (s)": { - "description": "min=0.736, mean=0.736, max=0.736, sum=1.471 (2)", - "tab": "Efficiency", - "score": 0.7356561253769229 - }, - "High School Government And Politics - Observed inference time (s)": { - "description": "min=0.878, mean=0.878, max=0.878, sum=1.755 (2)", - "tab": "Efficiency", - "score": 0.8775828440572314 - }, - "High School Macroeconomics - Observed inference time (s)": { - "description": "min=0.689, mean=0.689, max=0.689, sum=1.378 (2)", - "tab": "Efficiency", - "score": 0.6891599153861021 - }, - "High School Mathematics - Observed inference time (s)": { - "description": "min=0.917, mean=0.917, max=0.917, sum=1.834 (2)", - "tab": "Efficiency", - "score": 0.9171109632209495 - }, - "High School Microeconomics - Observed inference time (s)": { - "description": "min=0.748, mean=0.748, max=0.748, sum=1.496 (2)", - "tab": "Efficiency", - "score": 0.7482213062398574 - }, - "High School Physics - Observed inference time (s)": { - "description": "min=0.983, mean=0.983, max=0.983, sum=1.965 (2)", - "tab": "Efficiency", - "score": 0.9825576125391272 - }, - "High School Psychology - Observed inference time (s)": { - "description": "min=0.92, mean=0.92, max=0.92, sum=1.84 (2)", - "tab": "Efficiency", - "score": 0.9199631371629348 - }, - "High School Statistics - Observed inference time (s)": { - "description": "min=1.151, mean=1.151, max=1.151, sum=2.303 (2)", - "tab": "Efficiency", - "score": 1.1514487498336368 - }, - "High School US History - Observed inference time (s)": { - "description": "min=2.908, mean=2.908, max=2.908, sum=5.816 (2)", - "tab": "Efficiency", - "score": 2.9081676029691508 - }, - "High School World History - Observed inference time (s)": { - "description": "min=2.459, mean=2.459, max=2.459, sum=4.919 (2)", - "tab": "Efficiency", - "score": 2.4593187173207602 - }, - "High School Biology - # eval": { - "description": "min=310, mean=310, max=310, sum=620 (2)", - "tab": "General information", - "score": 310.0 - }, - "High School Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Biology - # prompt 
tokens": { - "description": "min=513.932, mean=513.932, max=513.932, sum=1027.865 (2)", - "tab": "General information", - "score": 513.9322580645161 - }, - "High School Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Chemistry - # eval": { - "description": "min=203, mean=203, max=203, sum=406 (2)", - "tab": "General information", - "score": 203.0 - }, - "High School Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # prompt tokens": { - "description": "min=479.842, mean=479.842, max=479.842, sum=959.685 (2)", - "tab": "General information", - "score": 479.8423645320197 - }, - "High School Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "High School Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # prompt tokens": { - "description": "min=889.39, mean=889.39, max=889.39, sum=1778.78 (2)", - "tab": "General information", - "score": 889.39 - }, - "High School Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School European History - # eval": { - "description": "min=165, mean=165, max=165, sum=330 (2)", - "tab": "General information", - "score": 165.0 - }, - "High School European History - # train": { - "description": "min=2.909, mean=2.909, max=2.909, sum=5.818 (2)", - "tab": "General information", - "score": 2.909090909090909 - }, - "High School European History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School European History - # prompt tokens": { - "description": "min=1816.758, mean=1816.758, max=1816.758, sum=3633.515 (2)", - "tab": "General information", - "score": 1816.7575757575758 - }, - "High School European History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Geography - # eval": { - "description": "min=198, mean=198, max=198, sum=396 (2)", - "tab": "General information", - "score": 198.0 - }, - "High School Geography - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Geography - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Geography - # prompt tokens": { - "description": "min=400.091, mean=400.091, max=400.091, sum=800.182 (2)", - "tab": "General information", - "score": 400.09090909090907 - }, - "High School Geography - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Government And 
Politics - # eval": { - "description": "min=193, mean=193, max=193, sum=386 (2)", - "tab": "General information", - "score": 193.0 - }, - "High School Government And Politics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Government And Politics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # prompt tokens": { - "description": "min=482.762, mean=482.762, max=482.762, sum=965.523 (2)", - "tab": "General information", - "score": 482.7616580310881 - }, - "High School Government And Politics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Macroeconomics - # eval": { - "description": "min=390, mean=390, max=390, sum=780 (2)", - "tab": "General information", - "score": 390.0 - }, - "High School Macroeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Macroeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - # prompt tokens": { - "description": "min=392.351, mean=392.351, max=392.351, sum=784.703 (2)", - "tab": "General information", - "score": 392.35128205128206 - }, - "High School Macroeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Mathematics - # eval": { - "description": "min=270, mean=270, max=270, sum=540 (2)", - "tab": "General information", - "score": 270.0 - }, - "High School Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # prompt tokens": { - "description": "min=506.689, mean=506.689, max=506.689, sum=1013.378 (2)", - "tab": "General information", - "score": 506.68888888888887 - }, - "High School Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Microeconomics - # eval": { - "description": "min=238, mean=238, max=238, sum=476 (2)", - "tab": "General information", - "score": 238.0 - }, - "High School Microeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Microeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Microeconomics - # prompt tokens": { - "description": "min=411.235, mean=411.235, max=411.235, sum=822.471 (2)", - "tab": "General information", - "score": 411.2352941176471 - }, - "High School Microeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Physics - # eval": { - "description": "min=151, mean=151, max=151, sum=302 (2)", - "tab": "General information", - "score": 151.0 - }, - "High School Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Physics - 
truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # prompt tokens": { - "description": "min=548.728, mean=548.728, max=548.728, sum=1097.457 (2)", - "tab": "General information", - "score": 548.7284768211921 - }, - "High School Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Psychology - # eval": { - "description": "min=545, mean=545, max=545, sum=1090 (2)", - "tab": "General information", - "score": 545.0 - }, - "High School Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # prompt tokens": { - "description": "min=514.793, mean=514.793, max=514.793, sum=1029.585 (2)", - "tab": "General information", - "score": 514.7926605504587 - }, - "High School Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Statistics - # eval": { - "description": "min=216, mean=216, max=216, sum=432 (2)", - "tab": "General information", - "score": 216.0 - }, - "High School Statistics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Statistics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Statistics - # prompt tokens": { - "description": "min=796.606, mean=796.606, max=796.606, sum=1593.213 (2)", - "tab": "General information", - "score": 796.6064814814815 - }, - "High School Statistics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School US History - # eval": { - "description": "min=204, mean=204, max=204, sum=408 (2)", - "tab": "General information", - "score": 204.0 - }, - "High School US History - # train": { - "description": "min=4, mean=4, max=4, sum=8 (2)", - "tab": "General information", - "score": 4.0 - }, - "High School US History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # prompt tokens": { - "description": "min=1788.387, mean=1788.387, max=1788.387, sum=3576.775 (2)", - "tab": "General information", - "score": 1788.387254901961 - }, - "High School US History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School World History - # eval": { - "description": "min=237, mean=237, max=237, sum=474 (2)", - "tab": "General information", - "score": 237.0 - }, - "High School World History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School World History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School World History - # prompt tokens": { - "description": "min=1461.443, mean=1461.443, max=1461.443, sum=2922.886 (2)", - "tab": "General information", - "score": 1461.4430379746836 - }, - "High School World History - # output tokens": { - "description": "min=1, mean=1, 
max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" - } - } - }, - { - "evaluation_name": "Human Sexuality", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Human Sexuality", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.595, - "details": { - "description": "min=0.595, mean=0.595, max=0.595, sum=1.191 (2)", - "tab": "Accuracy", - "Human Aging - Observed inference time (s)": { - "description": "min=0.568, mean=0.568, max=0.568, sum=1.135 (2)", - "tab": "Efficiency", - "score": 0.5676639603926996 - }, - "Human Sexuality - Observed inference time (s)": { - "description": "min=0.627, mean=0.627, max=0.627, sum=1.254 (2)", - "tab": "Efficiency", - "score": 0.6270790318496354 - }, - "Human Aging - # eval": { - "description": "min=223, mean=223, max=223, sum=446 (2)", - "tab": "General information", - "score": 223.0 - }, - "Human Aging - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Aging - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Aging - # prompt tokens": { - "description": "min=323.691, mean=323.691, max=323.691, sum=647.381 (2)", - "tab": "General information", - "score": 323.69058295964123 - }, - "Human Aging - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Human Sexuality - # eval": { - "description": "min=131, mean=131, max=131, sum=262 (2)", - "tab": "General information", - "score": 131.0 - }, - "Human Sexuality - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Sexuality - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # prompt tokens": { - "description": "min=355.351, mean=355.351, max=355.351, sum=710.702 (2)", - "tab": "General information", - "score": 355.35114503816794 - }, - "Human Sexuality - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" - } - } - }, - { - "evaluation_name": "International Law", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on International Law", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.612, - "details": { - "description": "min=0.612, mean=0.612, max=0.612, sum=1.223 (2)", - "tab": "Accuracy", - "International Law - Observed inference time (s)": { - "description": 
"min=1.125, mean=1.125, max=1.125, sum=2.25 (2)", - "tab": "Efficiency", - "score": 1.1249816102429855 - }, - "International Law - # eval": { - "description": "min=121, mean=121, max=121, sum=242 (2)", - "tab": "General information", - "score": 121.0 - }, - "International Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "International Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "International Law - # prompt tokens": { - "description": "min=650.372, mean=650.372, max=650.372, sum=1300.744 (2)", - "tab": "General information", - "score": 650.3719008264463 - }, - "International Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" - } - } - }, - { - "evaluation_name": "Logical Fallacies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Logical Fallacies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.607, - "details": { - "description": "min=0.607, mean=0.607, max=0.607, sum=1.215 (2)", - "tab": "Accuracy", - "Logical Fallacies - Observed inference time (s)": { - "description": "min=0.824, mean=0.824, max=0.824, sum=1.648 (2)", - "tab": "Efficiency", - "score": 0.8238252847472582 - }, - "Logical Fallacies - # eval": { - "description": "min=163, mean=163, max=163, sum=326 (2)", - "tab": "General information", - "score": 163.0 - }, - "Logical Fallacies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Logical Fallacies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Logical Fallacies - # prompt tokens": { - "description": "min=458.828, mean=458.828, max=458.828, sum=917.656 (2)", - "tab": "General information", - "score": 458.8282208588957 - }, - "Logical Fallacies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" - } - } - }, - { - "evaluation_name": "Machine Learning", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Machine Learning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.375, - "details": { - "description": "min=0.375, mean=0.375, max=0.375, sum=0.75 (2)", - "tab": "Accuracy", - "Machine Learning - Observed inference time (s)": { - "description": "min=1.161, mean=1.161, max=1.161, sum=2.321 (2)", - "tab": "Efficiency", - "score": 1.160504766872951 - }, - 
"Machine Learning - # eval": { - "description": "min=112, mean=112, max=112, sum=224 (2)", - "tab": "General information", - "score": 112.0 - }, - "Machine Learning - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Machine Learning - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Machine Learning - # prompt tokens": { - "description": "min=661.214, mean=661.214, max=661.214, sum=1322.429 (2)", - "tab": "General information", - "score": 661.2142857142857 - }, - "Machine Learning - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" - } - } - }, - { - "evaluation_name": "Management", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Management", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.689, - "details": { - "description": "min=0.689, mean=0.689, max=0.689, sum=1.379 (2)", - "tab": "Accuracy", - "Management - Observed inference time (s)": { - "description": "min=0.518, mean=0.518, max=0.518, sum=1.035 (2)", - "tab": "Efficiency", - "score": 0.5176426901400668 - }, - "Management - # eval": { - "description": "min=103, mean=103, max=103, sum=206 (2)", - "tab": "General information", - "score": 103.0 - }, - "Management - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Management - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Management - # prompt tokens": { - "description": "min=298.049, mean=298.049, max=298.049, sum=596.097 (2)", - "tab": "General information", - "score": 298.0485436893204 - }, - "Management - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" - } - } - }, - { - "evaluation_name": "Marketing", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Marketing", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.769, - "details": { - "description": "min=0.769, mean=0.769, max=0.769, sum=1.538 (2)", - "tab": "Accuracy", - "Marketing - Observed inference time (s)": { - "description": "min=0.749, mean=0.749, max=0.749, sum=1.499 (2)", - "tab": "Efficiency", - "score": 0.7494234182895758 - }, - "Marketing - # eval": { - "description": "min=234, mean=234, max=234, sum=468 (2)", - "tab": "General information", - "score": 234.0 - }, - "Marketing - # train": { - "description": "min=5, mean=5, max=5, 
sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Marketing - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Marketing - # prompt tokens": { - "description": "min=440.103, mean=440.103, max=440.103, sum=880.205 (2)", - "tab": "General information", - "score": 440.1025641025641 - }, - "Marketing - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" - } - } - }, - { - "evaluation_name": "Medical Genetics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Medical Genetics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.56, - "details": { - "description": "min=0.56, mean=0.56, max=0.56, sum=1.12 (2)", - "tab": "Accuracy", - "Medical Genetics - Observed inference time (s)": { - "description": "min=0.56, mean=0.56, max=0.56, sum=1.121 (2)", - "tab": "Efficiency", - "score": 0.5603377485275268 - }, - "Medical Genetics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Medical Genetics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Medical Genetics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Medical Genetics - # prompt tokens": { - "description": "min=340.48, mean=340.48, max=340.48, sum=680.96 (2)", - "tab": "General information", - "score": 340.48 - }, - "Medical Genetics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" - } - } - }, - { - "evaluation_name": "Miscellaneous", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Miscellaneous", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.734, - "details": { - "description": "min=0.734, mean=0.734, max=0.734, sum=1.469 (2)", - "tab": "Accuracy", - "Miscellaneous - Observed inference time (s)": { - "description": "min=0.533, mean=0.533, max=0.533, sum=1.066 (2)", - "tab": "Efficiency", - "score": 0.533118042452582 - }, - "Miscellaneous - # eval": { - "description": "min=783, mean=783, max=783, sum=1566 (2)", - "tab": "General information", - "score": 783.0 - }, - "Miscellaneous - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Miscellaneous - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - 
}, - "Miscellaneous - # prompt tokens": { - "description": "min=320.443, mean=320.443, max=320.443, sum=640.886 (2)", - "tab": "General information", - "score": 320.4431673052363 - }, - "Miscellaneous - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" - } - } - }, - { - "evaluation_name": "Moral Scenarios", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Moral Scenarios", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.335, - "details": { - "description": "min=0.335, mean=0.335, max=0.335, sum=0.67 (2)", - "tab": "Accuracy", - "Moral Disputes - Observed inference time (s)": { - "description": "min=0.845, mean=0.845, max=0.845, sum=1.69 (2)", - "tab": "Efficiency", - "score": 0.8448189255819155 - }, - "Moral Scenarios - Observed inference time (s)": { - "description": "min=1.193, mean=1.193, max=1.193, sum=2.387 (2)", - "tab": "Efficiency", - "score": 1.1933270441087265 - }, - "Moral Disputes - # eval": { - "description": "min=346, mean=346, max=346, sum=692 (2)", - "tab": "General information", - "score": 346.0 - }, - "Moral Disputes - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Disputes - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Disputes - # prompt tokens": { - "description": "min=502.243, mean=502.243, max=502.243, sum=1004.486 (2)", - "tab": "General information", - "score": 502.242774566474 - }, - "Moral Disputes - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Moral Scenarios - # eval": { - "description": "min=895, mean=895, max=895, sum=1790 (2)", - "tab": "General information", - "score": 895.0 - }, - "Moral Scenarios - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Scenarios - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # prompt tokens": { - "description": "min=667.861, mean=667.861, max=667.861, sum=1335.723 (2)", - "tab": "General information", - "score": 667.8614525139665 - }, - "Moral Scenarios - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" - } - } - }, - { - "evaluation_name": "Nutrition", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Nutrition", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.608, - "details": { - "description": "min=0.608, mean=0.608, max=0.608, sum=1.216 (2)", - "tab": "Accuracy", - "Nutrition - Observed inference time (s)": { - "description": "min=0.99, mean=0.99, max=0.99, sum=1.979 (2)", - "tab": "Efficiency", - "score": 0.9895777281592874 - }, - "Nutrition - # eval": { - "description": "min=306, mean=306, max=306, sum=612 (2)", - "tab": "General information", - "score": 306.0 - }, - "Nutrition - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Nutrition - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Nutrition - # prompt tokens": { - "description": "min=579.127, mean=579.127, max=579.127, sum=1158.255 (2)", - "tab": "General information", - "score": 579.1274509803922 - }, - "Nutrition - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" - } - } - }, - { - "evaluation_name": "Prehistory", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Prehistory", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.593, - "details": { - "description": "min=0.593, mean=0.593, max=0.593, sum=1.185 (2)", - "tab": "Accuracy", - "Prehistory - Observed inference time (s)": { - "description": "min=0.966, mean=0.966, max=0.966, sum=1.932 (2)", - "tab": "Efficiency", - "score": 0.9661886655254128 - }, - "Prehistory - # eval": { - "description": "min=324, mean=324, max=324, sum=648 (2)", - "tab": "General information", - "score": 324.0 - }, - "Prehistory - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Prehistory - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Prehistory - # prompt tokens": { - "description": "min=535.151, mean=535.151, max=535.151, sum=1070.302 (2)", - "tab": "General information", - "score": 535.1512345679013 - }, - "Prehistory - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" - } - } - }, - { - "evaluation_name": "Public Relations", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Public Relations", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6, - "details": { - "description": "min=0.6, mean=0.6, max=0.6, sum=1.2 (2)", - "tab": "Accuracy", - "Public Relations - Observed inference time (s)": { - "description": "min=0.763, 
mean=0.763, max=0.763, sum=1.526 (2)", - "tab": "Efficiency", - "score": 0.7631508913907138 - }, - "Public Relations - # eval": { - "description": "min=110, mean=110, max=110, sum=220 (2)", - "tab": "General information", - "score": 110.0 - }, - "Public Relations - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Public Relations - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Public Relations - # prompt tokens": { - "description": "min=422.982, mean=422.982, max=422.982, sum=845.964 (2)", - "tab": "General information", - "score": 422.9818181818182 - }, - "Public Relations - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" - } - } - }, - { - "evaluation_name": "Security Studies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Security Studies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.522, - "details": { - "description": "min=0.522, mean=0.522, max=0.522, sum=1.045 (2)", - "tab": "Accuracy", - "Security Studies - Observed inference time (s)": { - "description": "min=2.064, mean=2.064, max=2.064, sum=4.128 (2)", - "tab": "Efficiency", - "score": 2.0640801809271987 - }, - "Security Studies - # eval": { - "description": "min=245, mean=245, max=245, sum=490 (2)", - "tab": "General information", - "score": 245.0 - }, - "Security Studies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Security Studies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Security Studies - # prompt tokens": { - "description": "min=1207.057, mean=1207.057, max=1207.057, sum=2414.114 (2)", - "tab": "General information", - "score": 1207.057142857143 - }, - "Security Studies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" - } - } - }, - { - "evaluation_name": "Sociology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Sociology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.751, - "details": { - "description": "min=0.751, mean=0.751, max=0.751, sum=1.502 (2)", - "tab": "Accuracy", - "Sociology - Observed inference time (s)": { - "description": "min=0.821, mean=0.821, max=0.821, sum=1.642 (2)", - "tab": "Efficiency", - "score": 0.8210354812109648 - }, - "Sociology - # eval": { - "description": 
"min=201, mean=201, max=201, sum=402 (2)", - "tab": "General information", - "score": 201.0 - }, - "Sociology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Sociology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Sociology - # prompt tokens": { - "description": "min=452.02, mean=452.02, max=452.02, sum=904.04 (2)", - "tab": "General information", - "score": 452.0199004975124 - }, - "Sociology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" - } - } - }, - { - "evaluation_name": "Virology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Virology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.452, - "details": { - "description": "min=0.452, mean=0.452, max=0.452, sum=0.904 (2)", - "tab": "Accuracy", - "Virology - Observed inference time (s)": { - "description": "min=0.62, mean=0.62, max=0.62, sum=1.241 (2)", - "tab": "Efficiency", - "score": 0.6204164372869285 - }, - "Virology - # eval": { - "description": "min=166, mean=166, max=166, sum=332 (2)", - "tab": "General information", - "score": 166.0 - }, - "Virology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Virology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Virology - # prompt tokens": { - "description": "min=349.584, mean=349.584, max=349.584, sum=699.169 (2)", - "tab": "General information", - "score": 349.5843373493976 - }, - "Virology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" - } - } - }, - { - "evaluation_name": "World Religions", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on World Religions", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.731, - "details": { - "description": "min=0.731, mean=0.731, max=0.731, sum=1.462 (2)", - "tab": "Accuracy", - "World Religions - Observed inference time (s)": { - "description": "min=0.53, mean=0.53, max=0.53, sum=1.06 (2)", - "tab": "Efficiency", - "score": 0.5299853595376712 - }, - "World Religions - # eval": { - "description": "min=171, mean=171, max=171, sum=342 (2)", - "tab": "General information", - "score": 171.0 - }, - "World Religions - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "World Religions - 
truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "World Religions - # prompt tokens": { - "description": "min=285.766, mean=285.766, max=285.766, sum=571.532 (2)", - "tab": "General information", - "score": 285.766081871345 - }, - "World Religions - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" - } - } - }, - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.196, - "details": { - "tab": "Efficiency" - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_mmlu/allenai/olmo-7b/bb904716-048c-4b41-9f64-4d17c485afe3.json b/data/helm_mmlu/allenai/olmo-7b/bb904716-048c-4b41-9f64-4d17c485afe3.json deleted file mode 100644 index 2b8d4cdfb..000000000 --- a/data/helm_mmlu/allenai/olmo-7b/bb904716-048c-4b41-9f64-4d17c485afe3.json +++ /dev/null @@ -1,3021 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/allenai_olmo-7b/1770835937.459157", - "retrieved_timestamp": "1770835937.459157", - "source_metadata": { - "source_name": "helm_mmlu", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "OLMo 7B", - "id": "allenai/olmo-7b", - "developer": "allenai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "MMLU All Subjects", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU All Subjects", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.295, - "details": { - "description": "min=0.22, mean=0.295, max=0.454, sum=33.59 (114)", - "tab": "Accuracy", - "MMLU All Subjects - Observed inference time (s)": { - "description": "min=0.258, mean=0.386, max=0.824, sum=44.021 (114)", - "tab": "Efficiency", - "score": 0.38615337806031275 - }, - "MMLU All Subjects - # eval": { - "description": "min=100, mean=246.351, max=1534, sum=28084 (114)", - "tab": "General information", - "score": 246.35087719298247 - }, - "MMLU All Subjects - # train": { - "description": "min=2.903, mean=4.946, max=5, sum=563.801 (114)", - "tab": "General information", - "score": 4.9456214515982575 - }, - "MMLU All Subjects - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (114)", - "tab": "General information", - "score": 0.0 - }, - "MMLU All Subjects - # prompt tokens": { - "description": "min=285.766, mean=597.867, max=1813.97, sum=68156.839 (114)", - "tab": "General information", - "score": 597.8670097876463 - }, - "MMLU All Subjects - 
# output tokens": { - "description": "min=1, mean=1, max=1, sum=114 (114)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] - } - } - }, - { - "evaluation_name": "Abstract Algebra", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Abstract 
Algebra", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.26, - "details": { - "description": "min=0.26, mean=0.26, max=0.26, sum=0.52 (2)", - "tab": "Accuracy", - "Abstract Algebra - Observed inference time (s)": { - "description": "min=0.309, mean=0.309, max=0.309, sum=0.619 (2)", - "tab": "Efficiency", - "score": 0.309316143989563 - }, - "Abstract Algebra - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Abstract Algebra - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Abstract Algebra - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Abstract Algebra - # prompt tokens": { - "description": "min=358.76, mean=358.76, max=358.76, sum=717.52 (2)", - "tab": "General information", - "score": 358.76 - }, - "Abstract Algebra - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" - } - } - }, - { - "evaluation_name": "Anatomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Anatomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.222, - "details": { - "description": "min=0.222, mean=0.222, max=0.222, sum=0.444 (2)", - "tab": "Accuracy", - "Anatomy - Observed inference time (s)": { - "description": "min=0.536, mean=0.536, max=0.536, sum=1.072 (2)", - "tab": "Efficiency", - "score": 0.5358577339737504 - }, - "Anatomy - # eval": { - "description": "min=135, mean=135, max=135, sum=270 (2)", - "tab": "General information", - "score": 135.0 - }, - "Anatomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Anatomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Anatomy - # prompt tokens": { - "description": "min=352.03, mean=352.03, max=352.03, sum=704.059 (2)", - "tab": "General information", - "score": 352.02962962962965 - }, - "Anatomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" - } - } - }, - { - "evaluation_name": "College Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on College Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.294, - "details": { - "description": "min=0.294, mean=0.294, max=0.294, sum=0.588 (2)", - 
"tab": "Accuracy", - "College Chemistry - Observed inference time (s)": { - "description": "min=0.346, mean=0.346, max=0.346, sum=0.691 (2)", - "tab": "Efficiency", - "score": 0.34570912599563597 - }, - "College Biology - Observed inference time (s)": { - "description": "min=0.309, mean=0.309, max=0.309, sum=0.619 (2)", - "tab": "Efficiency", - "score": 0.30927823815080857 - }, - "College Computer Science - Observed inference time (s)": { - "description": "min=0.423, mean=0.423, max=0.423, sum=0.847 (2)", - "tab": "Efficiency", - "score": 0.42337616443634035 - }, - "College Mathematics - Observed inference time (s)": { - "description": "min=0.344, mean=0.344, max=0.344, sum=0.687 (2)", - "tab": "Efficiency", - "score": 0.34355913400650023 - }, - "College Medicine - Observed inference time (s)": { - "description": "min=0.324, mean=0.324, max=0.324, sum=0.647 (2)", - "tab": "Efficiency", - "score": 0.32374938237184736 - }, - "College Physics - Observed inference time (s)": { - "description": "min=0.33, mean=0.33, max=0.33, sum=0.66 (2)", - "tab": "Efficiency", - "score": 0.3302010788637049 - }, - "College Chemistry - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Chemistry - # prompt tokens": { - "description": "min=535.85, mean=535.85, max=535.85, sum=1071.7 (2)", - "tab": "General information", - "score": 535.85 - }, - "College Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Biology - # eval": { - "description": "min=144, mean=144, max=144, sum=288 (2)", - "tab": "General information", - "score": 144.0 - }, - "College Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # prompt tokens": { - "description": "min=470.319, mean=470.319, max=470.319, sum=940.639 (2)", - "tab": "General information", - "score": 470.31944444444446 - }, - "College Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer Science - # prompt tokens": { - "description": "min=842.89, mean=842.89, max=842.89, sum=1685.78 (2)", - "tab": "General information", - "score": 842.89 - }, - "College Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Mathematics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College 
Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # prompt tokens": { - "description": "min=592.82, mean=592.82, max=592.82, sum=1185.64 (2)", - "tab": "General information", - "score": 592.82 - }, - "College Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Medicine - # eval": { - "description": "min=173, mean=173, max=173, sum=346 (2)", - "tab": "General information", - "score": 173.0 - }, - "College Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Medicine - # prompt tokens": { - "description": "min=519.376, mean=519.376, max=519.376, sum=1038.751 (2)", - "tab": "General information", - "score": 519.3757225433526 - }, - "College Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Physics - # eval": { - "description": "min=102, mean=102, max=102, sum=204 (2)", - "tab": "General information", - "score": 102.0 - }, - "College Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Physics - # prompt tokens": { - "description": "min=476.657, mean=476.657, max=476.657, sum=953.314 (2)", - "tab": "General information", - "score": 476.65686274509807 - }, - "College Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" - } - } - }, - { - "evaluation_name": "Computer Security", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Computer Security", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3, - "details": { - "description": "min=0.3, mean=0.3, max=0.3, sum=0.6 (2)", - "tab": "Accuracy", - "Computer Security - Observed inference time (s)": { - "description": "min=0.317, mean=0.317, max=0.317, sum=0.634 (2)", - "tab": "Efficiency", - "score": 0.31721718072891236 - }, - "Computer Security - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Computer Security - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Computer Security - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Computer Security - # prompt tokens": { - "description": 
"min=388.19, mean=388.19, max=388.19, sum=776.38 (2)", - "tab": "General information", - "score": 388.19 - }, - "Computer Security - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" - } - } - }, - { - "evaluation_name": "Econometrics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Econometrics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.325, - "details": { - "description": "min=0.325, mean=0.325, max=0.325, sum=0.649 (2)", - "tab": "Accuracy", - "Econometrics - Observed inference time (s)": { - "description": "min=0.345, mean=0.345, max=0.345, sum=0.69 (2)", - "tab": "Efficiency", - "score": 0.34500646591186523 - }, - "Econometrics - # eval": { - "description": "min=114, mean=114, max=114, sum=228 (2)", - "tab": "General information", - "score": 114.0 - }, - "Econometrics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Econometrics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Econometrics - # prompt tokens": { - "description": "min=612.798, mean=612.798, max=612.798, sum=1225.596 (2)", - "tab": "General information", - "score": 612.7982456140351 - }, - "Econometrics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" - } - } - }, - { - "evaluation_name": "Global Facts", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Global Facts", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.32, - "details": { - "description": "min=0.32, mean=0.32, max=0.32, sum=0.64 (2)", - "tab": "Accuracy", - "Global Facts - Observed inference time (s)": { - "description": "min=0.316, mean=0.316, max=0.316, sum=0.633 (2)", - "tab": "Efficiency", - "score": 0.3163221001625061 - }, - "Global Facts - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Global Facts - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Global Facts - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Global Facts - # prompt tokens": { - "description": "min=400.58, mean=400.58, max=400.58, sum=801.16 (2)", - "tab": "General information", - "score": 400.58 - }, - "Global Facts - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": 
"General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" - } - } - }, - { - "evaluation_name": "Jurisprudence", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Jurisprudence", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.25, - "details": { - "description": "min=0.25, mean=0.25, max=0.25, sum=0.5 (2)", - "tab": "Accuracy", - "Jurisprudence - Observed inference time (s)": { - "description": "min=0.306, mean=0.306, max=0.306, sum=0.613 (2)", - "tab": "Efficiency", - "score": 0.3064618044429355 - }, - "Jurisprudence - # eval": { - "description": "min=108, mean=108, max=108, sum=216 (2)", - "tab": "General information", - "score": 108.0 - }, - "Jurisprudence - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Jurisprudence - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Jurisprudence - # prompt tokens": { - "description": "min=420.861, mean=420.861, max=420.861, sum=841.722 (2)", - "tab": "General information", - "score": 420.8611111111111 - }, - "Jurisprudence - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" - } - } - }, - { - "evaluation_name": "Philosophy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Philosophy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.325, - "details": { - "description": "min=0.325, mean=0.325, max=0.325, sum=0.65 (2)", - "tab": "Accuracy", - "Philosophy - Observed inference time (s)": { - "description": "min=0.396, mean=0.396, max=0.396, sum=0.792 (2)", - "tab": "Efficiency", - "score": 0.39610295280382946 - }, - "Philosophy - # eval": { - "description": "min=311, mean=311, max=311, sum=622 (2)", - "tab": "General information", - "score": 311.0 - }, - "Philosophy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Philosophy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Philosophy - # prompt tokens": { - "description": "min=345.277, mean=345.277, max=345.277, sum=690.553 (2)", - "tab": "General information", - "score": 345.2765273311897 - }, - "Philosophy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": 
"mmlu_philosophy" - } - } - }, - { - "evaluation_name": "Professional Psychology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Professional Psychology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.232, - "details": { - "description": "min=0.232, mean=0.232, max=0.232, sum=0.464 (2)", - "tab": "Accuracy", - "Professional Medicine - Observed inference time (s)": { - "description": "min=0.5, mean=0.5, max=0.5, sum=1.0 (2)", - "tab": "Efficiency", - "score": 0.4999704089234857 - }, - "Professional Accounting - Observed inference time (s)": { - "description": "min=0.346, mean=0.346, max=0.346, sum=0.692 (2)", - "tab": "Efficiency", - "score": 0.3458050379516385 - }, - "Professional Law - Observed inference time (s)": { - "description": "min=0.768, mean=0.768, max=0.768, sum=1.537 (2)", - "tab": "Efficiency", - "score": 0.7683826767325868 - }, - "Professional Psychology - Observed inference time (s)": { - "description": "min=0.433, mean=0.433, max=0.433, sum=0.865 (2)", - "tab": "Efficiency", - "score": 0.43272479998519997 - }, - "Professional Medicine - # eval": { - "description": "min=272, mean=272, max=272, sum=544 (2)", - "tab": "General information", - "score": 272.0 - }, - "Professional Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Medicine - # prompt tokens": { - "description": "min=1080.882, mean=1080.882, max=1080.882, sum=2161.765 (2)", - "tab": "General information", - "score": 1080.8823529411766 - }, - "Professional Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Accounting - # eval": { - "description": "min=282, mean=282, max=282, sum=564 (2)", - "tab": "General information", - "score": 282.0 - }, - "Professional Accounting - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Accounting - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Accounting - # prompt tokens": { - "description": "min=660.922, mean=660.922, max=660.922, sum=1321.844 (2)", - "tab": "General information", - "score": 660.9219858156029 - }, - "Professional Accounting - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Law - # eval": { - "description": "min=1534, mean=1534, max=1534, sum=3068 (2)", - "tab": "General information", - "score": 1534.0 - }, - "Professional Law - # train": { - "description": "min=4.997, mean=4.997, max=4.997, sum=9.995 (2)", - "tab": "General information", - "score": 4.9973924380704045 - }, - "Professional Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # prompt tokens": { - "description": "min=1654.433, mean=1654.433, max=1654.433, sum=3308.866 (2)", - "tab": "General information", - 
"score": 1654.4328552803129 - }, - "Professional Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Psychology - # eval": { - "description": "min=612, mean=612, max=612, sum=1224 (2)", - "tab": "General information", - "score": 612.0 - }, - "Professional Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # prompt tokens": { - "description": "min=590.873, mean=590.873, max=590.873, sum=1181.745 (2)", - "tab": "General information", - "score": 590.8725490196078 - }, - "Professional Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" - } - } - }, - { - "evaluation_name": "Us Foreign Policy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Us Foreign Policy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.26, - "details": { - "description": "min=0.26, mean=0.26, max=0.26, sum=0.52 (2)", - "tab": "Accuracy", - "Us Foreign Policy - Observed inference time (s)": { - "description": "min=0.312, mean=0.312, max=0.312, sum=0.624 (2)", - "tab": "Efficiency", - "score": 0.31185237407684324 - }, - "Us Foreign Policy - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Us Foreign Policy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Us Foreign Policy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Us Foreign Policy - # prompt tokens": { - "description": "min=444.08, mean=444.08, max=444.08, sum=888.16 (2)", - "tab": "General information", - "score": 444.08 - }, - "Us Foreign Policy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" - } - } - }, - { - "evaluation_name": "Astronomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Astronomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.342, - "details": { - "description": "min=0.342, mean=0.342, max=0.342, sum=0.684 (2)", - "tab": "Accuracy", - "Astronomy - Observed inference time (s)": { - "description": "min=0.33, mean=0.33, max=0.33, 
sum=0.66 (2)", - "tab": "Efficiency", - "score": 0.3300002766282935 - }, - "Astronomy - # eval": { - "description": "min=152, mean=152, max=152, sum=304 (2)", - "tab": "General information", - "score": 152.0 - }, - "Astronomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Astronomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Astronomy - # prompt tokens": { - "description": "min=598.487, mean=598.487, max=598.487, sum=1196.974 (2)", - "tab": "General information", - "score": 598.4868421052631 - }, - "Astronomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" - } - } - }, - { - "evaluation_name": "Business Ethics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Business Ethics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.24, - "details": { - "description": "min=0.24, mean=0.24, max=0.24, sum=0.48 (2)", - "tab": "Accuracy", - "Business Ethics - Observed inference time (s)": { - "description": "min=0.356, mean=0.356, max=0.356, sum=0.713 (2)", - "tab": "Efficiency", - "score": 0.3563597345352173 - }, - "Business Ethics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Business Ethics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Business Ethics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Business Ethics - # prompt tokens": { - "description": "min=585.05, mean=585.05, max=585.05, sum=1170.1 (2)", - "tab": "General information", - "score": 585.05 - }, - "Business Ethics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" - } - } - }, - { - "evaluation_name": "Clinical Knowledge", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Clinical Knowledge", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.26, - "details": { - "description": "min=0.26, mean=0.26, max=0.26, sum=0.521 (2)", - "tab": "Accuracy", - "Clinical Knowledge - Observed inference time (s)": { - "description": "min=0.282, mean=0.282, max=0.282, sum=0.564 (2)", - "tab": "Efficiency", - "score": 0.2817675842429107 - }, - "Clinical Knowledge - # eval": { - "description": "min=265, mean=265, max=265, sum=530 (2)", - "tab": "General information", - 
"score": 265.0 - }, - "Clinical Knowledge - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Clinical Knowledge - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Clinical Knowledge - # prompt tokens": { - "description": "min=401.917, mean=401.917, max=401.917, sum=803.834 (2)", - "tab": "General information", - "score": 401.9169811320755 - }, - "Clinical Knowledge - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" - } - } - }, - { - "evaluation_name": "Conceptual Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Conceptual Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.319, - "details": { - "description": "min=0.319, mean=0.319, max=0.319, sum=0.638 (2)", - "tab": "Accuracy", - "Conceptual Physics - Observed inference time (s)": { - "description": "min=0.3, mean=0.3, max=0.3, sum=0.601 (2)", - "tab": "Efficiency", - "score": 0.3004691002216745 - }, - "Conceptual Physics - # eval": { - "description": "min=235, mean=235, max=235, sum=470 (2)", - "tab": "General information", - "score": 235.0 - }, - "Conceptual Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Conceptual Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Conceptual Physics - # prompt tokens": { - "description": "min=311.311, mean=311.311, max=311.311, sum=622.621 (2)", - "tab": "General information", - "score": 311.31063829787234 - }, - "Conceptual Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" - } - } - }, - { - "evaluation_name": "Electrical Engineering", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Electrical Engineering", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.29, - "details": { - "description": "min=0.29, mean=0.29, max=0.29, sum=0.579 (2)", - "tab": "Accuracy", - "Electrical Engineering - Observed inference time (s)": { - "description": "min=0.271, mean=0.271, max=0.271, sum=0.542 (2)", - "tab": "Efficiency", - "score": 0.27095125954726645 - }, - "Electrical Engineering - # eval": { - "description": "min=145, mean=145, max=145, sum=290 (2)", - "tab": "General information", - "score": 145.0 - }, - "Electrical Engineering - # train": { - "description": 
"min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Electrical Engineering - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Electrical Engineering - # prompt tokens": { - "description": "min=424.848, mean=424.848, max=424.848, sum=849.697 (2)", - "tab": "General information", - "score": 424.848275862069 - }, - "Electrical Engineering - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" - } - } - }, - { - "evaluation_name": "Elementary Mathematics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Elementary Mathematics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.254, - "details": { - "description": "min=0.254, mean=0.254, max=0.254, sum=0.508 (2)", - "tab": "Accuracy", - "Elementary Mathematics - Observed inference time (s)": { - "description": "min=0.31, mean=0.31, max=0.31, sum=0.62 (2)", - "tab": "Efficiency", - "score": 0.3099196644687148 - }, - "Elementary Mathematics - # eval": { - "description": "min=378, mean=378, max=378, sum=756 (2)", - "tab": "General information", - "score": 378.0 - }, - "Elementary Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Elementary Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Elementary Mathematics - # prompt tokens": { - "description": "min=505.071, mean=505.071, max=505.071, sum=1010.143 (2)", - "tab": "General information", - "score": 505.07142857142856 - }, - "Elementary Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" - } - } - }, - { - "evaluation_name": "Formal Logic", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Formal Logic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.278, - "details": { - "description": "min=0.278, mean=0.278, max=0.278, sum=0.556 (2)", - "tab": "Accuracy", - "Formal Logic - Observed inference time (s)": { - "description": "min=0.582, mean=0.582, max=0.582, sum=1.165 (2)", - "tab": "Efficiency", - "score": 0.5824837514332363 - }, - "Formal Logic - # eval": { - "description": "min=126, mean=126, max=126, sum=252 (2)", - "tab": "General information", - "score": 126.0 - }, - "Formal Logic - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General 
information", - "score": 5.0 - }, - "Formal Logic - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Formal Logic - # prompt tokens": { - "description": "min=653.595, mean=653.595, max=653.595, sum=1307.19 (2)", - "tab": "General information", - "score": 653.5952380952381 - }, - "Formal Logic - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" - } - } - }, - { - "evaluation_name": "High School World History", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on High School World History", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.253, - "details": { - "description": "min=0.253, mean=0.253, max=0.253, sum=0.506 (2)", - "tab": "Accuracy", - "High School Biology - Observed inference time (s)": { - "description": "min=0.29, mean=0.29, max=0.29, sum=0.58 (2)", - "tab": "Efficiency", - "score": 0.28990614798761183 - }, - "High School Chemistry - Observed inference time (s)": { - "description": "min=0.298, mean=0.298, max=0.298, sum=0.596 (2)", - "tab": "Efficiency", - "score": 0.29780743039887525 - }, - "High School Computer Science - Observed inference time (s)": { - "description": "min=0.427, mean=0.427, max=0.427, sum=0.854 (2)", - "tab": "Efficiency", - "score": 0.4271339774131775 - }, - "High School European History - Observed inference time (s)": { - "description": "min=0.824, mean=0.824, max=0.824, sum=1.648 (2)", - "tab": "Efficiency", - "score": 0.8240610585068211 - }, - "High School Geography - Observed inference time (s)": { - "description": "min=0.301, mean=0.301, max=0.301, sum=0.603 (2)", - "tab": "Efficiency", - "score": 0.30138304980114256 - }, - "High School Government And Politics - Observed inference time (s)": { - "description": "min=0.327, mean=0.327, max=0.327, sum=0.653 (2)", - "tab": "Efficiency", - "score": 0.32666249472860226 - }, - "High School Macroeconomics - Observed inference time (s)": { - "description": "min=0.304, mean=0.304, max=0.304, sum=0.608 (2)", - "tab": "Efficiency", - "score": 0.30416087615184295 - }, - "High School Mathematics - Observed inference time (s)": { - "description": "min=0.333, mean=0.333, max=0.333, sum=0.666 (2)", - "tab": "Efficiency", - "score": 0.3329446854414763 - }, - "High School Microeconomics - Observed inference time (s)": { - "description": "min=0.277, mean=0.277, max=0.277, sum=0.555 (2)", - "tab": "Efficiency", - "score": 0.27732292243412565 - }, - "High School Physics - Observed inference time (s)": { - "description": "min=0.337, mean=0.337, max=0.337, sum=0.674 (2)", - "tab": "Efficiency", - "score": 0.3369376612025381 - }, - "High School Psychology - Observed inference time (s)": { - "description": "min=0.295, mean=0.295, max=0.295, sum=0.589 (2)", - "tab": "Efficiency", - "score": 0.294664117830609 - }, - "High School Statistics - Observed inference time (s)": { - "description": "min=0.409, mean=0.409, max=0.409, sum=0.817 (2)", - "tab": "Efficiency", - "score": 0.40864299955191435 - }, - "High School 
US History - Observed inference time (s)": { - "description": "min=0.816, mean=0.816, max=0.816, sum=1.632 (2)", - "tab": "Efficiency", - "score": 0.8157591445773256 - }, - "High School World History - Observed inference time (s)": { - "description": "min=0.672, mean=0.672, max=0.672, sum=1.343 (2)", - "tab": "Efficiency", - "score": 0.6715093554323736 - }, - "High School Biology - # eval": { - "description": "min=310, mean=310, max=310, sum=620 (2)", - "tab": "General information", - "score": 310.0 - }, - "High School Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Biology - # prompt tokens": { - "description": "min=513.932, mean=513.932, max=513.932, sum=1027.865 (2)", - "tab": "General information", - "score": 513.9322580645161 - }, - "High School Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Chemistry - # eval": { - "description": "min=203, mean=203, max=203, sum=406 (2)", - "tab": "General information", - "score": 203.0 - }, - "High School Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # prompt tokens": { - "description": "min=479.842, mean=479.842, max=479.842, sum=959.685 (2)", - "tab": "General information", - "score": 479.8423645320197 - }, - "High School Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "High School Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # prompt tokens": { - "description": "min=889.39, mean=889.39, max=889.39, sum=1778.78 (2)", - "tab": "General information", - "score": 889.39 - }, - "High School Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School European History - # eval": { - "description": "min=165, mean=165, max=165, sum=330 (2)", - "tab": "General information", - "score": 165.0 - }, - "High School European History - # train": { - "description": "min=2.903, mean=2.903, max=2.903, sum=5.806 (2)", - "tab": "General information", - "score": 2.903030303030303 - }, - "High School European History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School European History - # prompt tokens": { - "description": "min=1813.97, mean=1813.97, max=1813.97, sum=3627.939 (2)", - "tab": "General information", - "score": 1813.969696969697 - }, - "High School European History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 
1.0 - }, - "High School Geography - # eval": { - "description": "min=198, mean=198, max=198, sum=396 (2)", - "tab": "General information", - "score": 198.0 - }, - "High School Geography - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Geography - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Geography - # prompt tokens": { - "description": "min=400.091, mean=400.091, max=400.091, sum=800.182 (2)", - "tab": "General information", - "score": 400.09090909090907 - }, - "High School Geography - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Government And Politics - # eval": { - "description": "min=193, mean=193, max=193, sum=386 (2)", - "tab": "General information", - "score": 193.0 - }, - "High School Government And Politics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Government And Politics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # prompt tokens": { - "description": "min=482.762, mean=482.762, max=482.762, sum=965.523 (2)", - "tab": "General information", - "score": 482.7616580310881 - }, - "High School Government And Politics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Macroeconomics - # eval": { - "description": "min=390, mean=390, max=390, sum=780 (2)", - "tab": "General information", - "score": 390.0 - }, - "High School Macroeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Macroeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - # prompt tokens": { - "description": "min=392.351, mean=392.351, max=392.351, sum=784.703 (2)", - "tab": "General information", - "score": 392.35128205128206 - }, - "High School Macroeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Mathematics - # eval": { - "description": "min=270, mean=270, max=270, sum=540 (2)", - "tab": "General information", - "score": 270.0 - }, - "High School Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # prompt tokens": { - "description": "min=506.689, mean=506.689, max=506.689, sum=1013.378 (2)", - "tab": "General information", - "score": 506.68888888888887 - }, - "High School Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Microeconomics - # eval": { - "description": "min=238, mean=238, max=238, sum=476 (2)", - "tab": "General information", - "score": 238.0 - }, - "High School Microeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - 
"High School Microeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Microeconomics - # prompt tokens": { - "description": "min=411.235, mean=411.235, max=411.235, sum=822.471 (2)", - "tab": "General information", - "score": 411.2352941176471 - }, - "High School Microeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Physics - # eval": { - "description": "min=151, mean=151, max=151, sum=302 (2)", - "tab": "General information", - "score": 151.0 - }, - "High School Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # prompt tokens": { - "description": "min=548.728, mean=548.728, max=548.728, sum=1097.457 (2)", - "tab": "General information", - "score": 548.7284768211921 - }, - "High School Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Psychology - # eval": { - "description": "min=545, mean=545, max=545, sum=1090 (2)", - "tab": "General information", - "score": 545.0 - }, - "High School Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # prompt tokens": { - "description": "min=514.793, mean=514.793, max=514.793, sum=1029.585 (2)", - "tab": "General information", - "score": 514.7926605504587 - }, - "High School Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Statistics - # eval": { - "description": "min=216, mean=216, max=216, sum=432 (2)", - "tab": "General information", - "score": 216.0 - }, - "High School Statistics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Statistics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Statistics - # prompt tokens": { - "description": "min=796.606, mean=796.606, max=796.606, sum=1593.213 (2)", - "tab": "General information", - "score": 796.6064814814815 - }, - "High School Statistics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School US History - # eval": { - "description": "min=204, mean=204, max=204, sum=408 (2)", - "tab": "General information", - "score": 204.0 - }, - "High School US History - # train": { - "description": "min=4, mean=4, max=4, sum=8 (2)", - "tab": "General information", - "score": 4.0 - }, - "High School US History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # prompt tokens": { - "description": "min=1788.387, mean=1788.387, max=1788.387, sum=3576.775 (2)", - "tab": "General information", - "score": 1788.387254901961 - }, - "High School US History - # output tokens": { - "description": "min=1, 
mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School World History - # eval": { - "description": "min=237, mean=237, max=237, sum=474 (2)", - "tab": "General information", - "score": 237.0 - }, - "High School World History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School World History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School World History - # prompt tokens": { - "description": "min=1461.443, mean=1461.443, max=1461.443, sum=2922.886 (2)", - "tab": "General information", - "score": 1461.4430379746836 - }, - "High School World History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" - } - } - }, - { - "evaluation_name": "Human Sexuality", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Human Sexuality", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.267, - "details": { - "description": "min=0.267, mean=0.267, max=0.267, sum=0.534 (2)", - "tab": "Accuracy", - "Human Aging - Observed inference time (s)": { - "description": "min=0.27, mean=0.27, max=0.27, sum=0.54 (2)", - "tab": "Efficiency", - "score": 0.2699183316508751 - }, - "Human Sexuality - Observed inference time (s)": { - "description": "min=0.552, mean=0.552, max=0.552, sum=1.104 (2)", - "tab": "Efficiency", - "score": 0.5521998168857953 - }, - "Human Aging - # eval": { - "description": "min=223, mean=223, max=223, sum=446 (2)", - "tab": "General information", - "score": 223.0 - }, - "Human Aging - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Aging - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Aging - # prompt tokens": { - "description": "min=323.691, mean=323.691, max=323.691, sum=647.381 (2)", - "tab": "General information", - "score": 323.69058295964123 - }, - "Human Aging - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Human Sexuality - # eval": { - "description": "min=131, mean=131, max=131, sum=262 (2)", - "tab": "General information", - "score": 131.0 - }, - "Human Sexuality - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Sexuality - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # prompt tokens": { - "description": "min=355.351, mean=355.351, max=355.351, sum=710.702 (2)", - "tab": "General information", - "score": 355.35114503816794 - }, - "Human Sexuality - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - 
"additional_details": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" - } - } - }, - { - "evaluation_name": "International Law", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on International Law", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.306, - "details": { - "description": "min=0.306, mean=0.306, max=0.306, sum=0.612 (2)", - "tab": "Accuracy", - "International Law - Observed inference time (s)": { - "description": "min=0.326, mean=0.326, max=0.326, sum=0.652 (2)", - "tab": "Efficiency", - "score": 0.3259233679653199 - }, - "International Law - # eval": { - "description": "min=121, mean=121, max=121, sum=242 (2)", - "tab": "General information", - "score": 121.0 - }, - "International Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "International Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "International Law - # prompt tokens": { - "description": "min=650.372, mean=650.372, max=650.372, sum=1300.744 (2)", - "tab": "General information", - "score": 650.3719008264463 - }, - "International Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" - } - } - }, - { - "evaluation_name": "Logical Fallacies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Logical Fallacies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.264, - "details": { - "description": "min=0.264, mean=0.264, max=0.264, sum=0.528 (2)", - "tab": "Accuracy", - "Logical Fallacies - Observed inference time (s)": { - "description": "min=0.332, mean=0.332, max=0.332, sum=0.665 (2)", - "tab": "Efficiency", - "score": 0.3324835944029451 - }, - "Logical Fallacies - # eval": { - "description": "min=163, mean=163, max=163, sum=326 (2)", - "tab": "General information", - "score": 163.0 - }, - "Logical Fallacies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Logical Fallacies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Logical Fallacies - # prompt tokens": { - "description": "min=458.828, mean=458.828, max=458.828, sum=917.656 (2)", - "tab": "General information", - "score": 458.8282208588957 - }, - "Logical Fallacies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - 
"eval_split": "test", - "groups": "mmlu_logical_fallacies" - } - } - }, - { - "evaluation_name": "Machine Learning", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Machine Learning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.286, - "details": { - "description": "min=0.286, mean=0.286, max=0.286, sum=0.571 (2)", - "tab": "Accuracy", - "Machine Learning - Observed inference time (s)": { - "description": "min=0.352, mean=0.352, max=0.352, sum=0.704 (2)", - "tab": "Efficiency", - "score": 0.3520317098924092 - }, - "Machine Learning - # eval": { - "description": "min=112, mean=112, max=112, sum=224 (2)", - "tab": "General information", - "score": 112.0 - }, - "Machine Learning - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Machine Learning - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Machine Learning - # prompt tokens": { - "description": "min=661.214, mean=661.214, max=661.214, sum=1322.429 (2)", - "tab": "General information", - "score": 661.2142857142857 - }, - "Machine Learning - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" - } - } - }, - { - "evaluation_name": "Management", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Management", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.272, - "details": { - "description": "min=0.272, mean=0.272, max=0.272, sum=0.544 (2)", - "tab": "Accuracy", - "Management - Observed inference time (s)": { - "description": "min=0.306, mean=0.306, max=0.306, sum=0.613 (2)", - "tab": "Efficiency", - "score": 0.3064361937995096 - }, - "Management - # eval": { - "description": "min=103, mean=103, max=103, sum=206 (2)", - "tab": "General information", - "score": 103.0 - }, - "Management - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Management - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Management - # prompt tokens": { - "description": "min=298.049, mean=298.049, max=298.049, sum=596.097 (2)", - "tab": "General information", - "score": 298.0485436893204 - }, - "Management - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" - } - } - }, - { - "evaluation_name": "Marketing", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - 
"url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Marketing", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.269, - "details": { - "description": "min=0.269, mean=0.269, max=0.269, sum=0.538 (2)", - "tab": "Accuracy", - "Marketing - Observed inference time (s)": { - "description": "min=0.311, mean=0.311, max=0.311, sum=0.622 (2)", - "tab": "Efficiency", - "score": 0.3111040826536651 - }, - "Marketing - # eval": { - "description": "min=234, mean=234, max=234, sum=468 (2)", - "tab": "General information", - "score": 234.0 - }, - "Marketing - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Marketing - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Marketing - # prompt tokens": { - "description": "min=440.103, mean=440.103, max=440.103, sum=880.205 (2)", - "tab": "General information", - "score": 440.1025641025641 - }, - "Marketing - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" - } - } - }, - { - "evaluation_name": "Medical Genetics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Medical Genetics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.28, - "details": { - "description": "min=0.28, mean=0.28, max=0.28, sum=0.56 (2)", - "tab": "Accuracy", - "Medical Genetics - Observed inference time (s)": { - "description": "min=0.258, mean=0.258, max=0.258, sum=0.516 (2)", - "tab": "Efficiency", - "score": 0.2580227541923523 - }, - "Medical Genetics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Medical Genetics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Medical Genetics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Medical Genetics - # prompt tokens": { - "description": "min=340.48, mean=340.48, max=340.48, sum=680.96 (2)", - "tab": "General information", - "score": 340.48 - }, - "Medical Genetics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" - } - } - }, - { - "evaluation_name": "Miscellaneous", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Miscellaneous", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.292, - "details": { - "description": "min=0.292, mean=0.292, max=0.292, sum=0.585 (2)", - "tab": "Accuracy", - "Miscellaneous - Observed inference time (s)": { - "description": "min=0.342, mean=0.342, max=0.342, sum=0.684 (2)", - "tab": "Efficiency", - "score": 0.3421932640051324 - }, - "Miscellaneous - # eval": { - "description": "min=783, mean=783, max=783, sum=1566 (2)", - "tab": "General information", - "score": 783.0 - }, - "Miscellaneous - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Miscellaneous - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Miscellaneous - # prompt tokens": { - "description": "min=320.443, mean=320.443, max=320.443, sum=640.886 (2)", - "tab": "General information", - "score": 320.4431673052363 - }, - "Miscellaneous - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" - } - } - }, - { - "evaluation_name": "Moral Scenarios", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Moral Scenarios", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.265, - "details": { - "description": "min=0.265, mean=0.265, max=0.265, sum=0.53 (2)", - "tab": "Accuracy", - "Moral Disputes - Observed inference time (s)": { - "description": "min=0.395, mean=0.395, max=0.395, sum=0.791 (2)", - "tab": "Efficiency", - "score": 0.39545129627161635 - }, - "Moral Scenarios - Observed inference time (s)": { - "description": "min=0.36, mean=0.36, max=0.36, sum=0.72 (2)", - "tab": "Efficiency", - "score": 0.3597933335011232 - }, - "Moral Disputes - # eval": { - "description": "min=346, mean=346, max=346, sum=692 (2)", - "tab": "General information", - "score": 346.0 - }, - "Moral Disputes - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Disputes - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Disputes - # prompt tokens": { - "description": "min=502.243, mean=502.243, max=502.243, sum=1004.486 (2)", - "tab": "General information", - "score": 502.242774566474 - }, - "Moral Disputes - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Moral Scenarios - # eval": { - "description": "min=895, mean=895, max=895, sum=1790 (2)", - "tab": "General information", - "score": 895.0 - }, - "Moral Scenarios - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Scenarios - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # prompt tokens": { - "description": "min=667.861, mean=667.861, max=667.861, 
sum=1335.723 (2)", - "tab": "General information", - "score": 667.8614525139665 - }, - "Moral Scenarios - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" - } - } - }, - { - "evaluation_name": "Nutrition", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Nutrition", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.34, - "details": { - "description": "min=0.34, mean=0.34, max=0.34, sum=0.68 (2)", - "tab": "Accuracy", - "Nutrition - Observed inference time (s)": { - "description": "min=0.451, mean=0.451, max=0.451, sum=0.902 (2)", - "tab": "Efficiency", - "score": 0.45079101612365324 - }, - "Nutrition - # eval": { - "description": "min=306, mean=306, max=306, sum=612 (2)", - "tab": "General information", - "score": 306.0 - }, - "Nutrition - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Nutrition - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Nutrition - # prompt tokens": { - "description": "min=579.127, mean=579.127, max=579.127, sum=1158.255 (2)", - "tab": "General information", - "score": 579.1274509803922 - }, - "Nutrition - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" - } - } - }, - { - "evaluation_name": "Prehistory", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Prehistory", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.318, - "details": { - "description": "min=0.318, mean=0.318, max=0.318, sum=0.636 (2)", - "tab": "Accuracy", - "Prehistory - Observed inference time (s)": { - "description": "min=0.328, mean=0.328, max=0.328, sum=0.656 (2)", - "tab": "Efficiency", - "score": 0.32820526979587694 - }, - "Prehistory - # eval": { - "description": "min=324, mean=324, max=324, sum=648 (2)", - "tab": "General information", - "score": 324.0 - }, - "Prehistory - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Prehistory - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Prehistory - # prompt tokens": { - "description": "min=535.151, mean=535.151, max=535.151, sum=1070.302 (2)", - "tab": "General information", - "score": 535.1512345679013 - }, - "Prehistory - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - 
"generation_config": { - "additional_details": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" - } - } - }, - { - "evaluation_name": "Public Relations", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Public Relations", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.345, - "details": { - "description": "min=0.345, mean=0.345, max=0.345, sum=0.691 (2)", - "tab": "Accuracy", - "Public Relations - Observed inference time (s)": { - "description": "min=0.285, mean=0.285, max=0.285, sum=0.571 (2)", - "tab": "Efficiency", - "score": 0.28533268625086006 - }, - "Public Relations - # eval": { - "description": "min=110, mean=110, max=110, sum=220 (2)", - "tab": "General information", - "score": 110.0 - }, - "Public Relations - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Public Relations - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Public Relations - # prompt tokens": { - "description": "min=422.982, mean=422.982, max=422.982, sum=845.964 (2)", - "tab": "General information", - "score": 422.9818181818182 - }, - "Public Relations - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" - } - } - }, - { - "evaluation_name": "Security Studies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Security Studies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.408, - "details": { - "description": "min=0.408, mean=0.408, max=0.408, sum=0.816 (2)", - "tab": "Accuracy", - "Security Studies - Observed inference time (s)": { - "description": "min=0.551, mean=0.551, max=0.551, sum=1.102 (2)", - "tab": "Efficiency", - "score": 0.5510748113904681 - }, - "Security Studies - # eval": { - "description": "min=245, mean=245, max=245, sum=490 (2)", - "tab": "General information", - "score": 245.0 - }, - "Security Studies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Security Studies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Security Studies - # prompt tokens": { - "description": "min=1207.057, mean=1207.057, max=1207.057, sum=2414.114 (2)", - "tab": "General information", - "score": 1207.057142857143 - }, - "Security Studies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "security_studies", - "method": "multiple_choice_joint", - 
"eval_split": "test", - "groups": "mmlu_security_studies" - } - } - }, - { - "evaluation_name": "Sociology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Sociology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.383, - "details": { - "description": "min=0.383, mean=0.383, max=0.383, sum=0.766 (2)", - "tab": "Accuracy", - "Sociology - Observed inference time (s)": { - "description": "min=0.293, mean=0.293, max=0.293, sum=0.586 (2)", - "tab": "Efficiency", - "score": 0.2929653884166509 - }, - "Sociology - # eval": { - "description": "min=201, mean=201, max=201, sum=402 (2)", - "tab": "General information", - "score": 201.0 - }, - "Sociology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Sociology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Sociology - # prompt tokens": { - "description": "min=452.02, mean=452.02, max=452.02, sum=904.04 (2)", - "tab": "General information", - "score": 452.0199004975124 - }, - "Sociology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" - } - } - }, - { - "evaluation_name": "Virology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Virology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.416, - "details": { - "description": "min=0.416, mean=0.416, max=0.416, sum=0.831 (2)", - "tab": "Accuracy", - "Virology - Observed inference time (s)": { - "description": "min=0.492, mean=0.492, max=0.492, sum=0.983 (2)", - "tab": "Efficiency", - "score": 0.4916250992970294 - }, - "Virology - # eval": { - "description": "min=166, mean=166, max=166, sum=332 (2)", - "tab": "General information", - "score": 166.0 - }, - "Virology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Virology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Virology - # prompt tokens": { - "description": "min=349.584, mean=349.584, max=349.584, sum=699.169 (2)", - "tab": "General information", - "score": 349.5843373493976 - }, - "Virology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" - } - } - }, - { - "evaluation_name": "World Religions", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on World Religions", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.234, - "details": { - "description": "min=0.234, mean=0.234, max=0.234, sum=0.468 (2)", - "tab": "Accuracy", - "World Religions - Observed inference time (s)": { - "description": "min=0.503, mean=0.503, max=0.503, sum=1.007 (2)", - "tab": "Efficiency", - "score": 0.5034504368988394 - }, - "World Religions - # eval": { - "description": "min=171, mean=171, max=171, sum=342 (2)", - "tab": "General information", - "score": 171.0 - }, - "World Religions - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "World Religions - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "World Religions - # prompt tokens": { - "description": "min=285.766, mean=285.766, max=285.766, sum=571.532 (2)", - "tab": "General information", - "score": 285.766081871345 - }, - "World Religions - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" - } - } - }, - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.68, - "details": { - "tab": "Efficiency" - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_mmlu/amazon/nova-lite-v1_0/063bd04d-e0d8-426a-a56a-062f7bc1a4e4.json b/data/helm_mmlu/amazon/nova-lite-v1_0/063bd04d-e0d8-426a-a56a-062f7bc1a4e4.json deleted file mode 100644 index 1bb99dccc..000000000 --- a/data/helm_mmlu/amazon/nova-lite-v1_0/063bd04d-e0d8-426a-a56a-062f7bc1a4e4.json +++ /dev/null @@ -1,3021 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/amazon_nova-lite-v1:0/1770835937.459157", - "retrieved_timestamp": "1770835937.459157", - "source_metadata": { - "source_name": "helm_mmlu", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Amazon Nova Lite", - "id": "amazon/nova-lite-v1:0", - "developer": "amazon", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "MMLU All Subjects", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU All Subjects", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.77, - "details": { - 
"description": "min=0.509, mean=0.77, max=0.969, sum=87.802 (114)", - "tab": "Accuracy", - "MMLU All Subjects - Observed inference time (s)": { - "description": "min=0.113, mean=0.127, max=0.174, sum=14.526 (114)", - "tab": "Efficiency", - "score": 0.12742174922519597 - }, - "MMLU All Subjects - # eval": { - "description": "min=100, mean=246.351, max=1534, sum=28084 (114)", - "tab": "General information", - "score": 246.35087719298247 - }, - "MMLU All Subjects - # train": { - "description": "min=5, mean=5, max=5, sum=570 (114)", - "tab": "General information", - "score": 5.0 - }, - "MMLU All Subjects - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (114)", - "tab": "General information", - "score": 0.0 - }, - "MMLU All Subjects - # prompt tokens": { - "description": "min=305.386, mean=655.489, max=2872.03, sum=74725.746 (114)", - "tab": "General information", - "score": 655.4890026560713 - }, - "MMLU All Subjects - # output tokens": { - "description": "min=1, mean=1, max=1, sum=114 (114)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - 
"mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] - } - } - }, - { - "evaluation_name": "Abstract Algebra", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Abstract Algebra", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.52, - "details": { - "description": "min=0.52, mean=0.52, max=0.52, sum=1.04 (2)", - "tab": "Accuracy", - "Abstract Algebra - Observed inference time (s)": { - "description": "min=0.136, mean=0.136, max=0.136, sum=0.272 (2)", - "tab": "Efficiency", - "score": 0.13592 - }, - "Abstract Algebra - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Abstract Algebra - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Abstract Algebra - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Abstract Algebra - # prompt tokens": { - "description": "min=399.38, mean=399.38, max=399.38, sum=798.76 (2)", - "tab": "General information", - "score": 399.38 - }, - "Abstract Algebra - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" - } - } - }, - { - "evaluation_name": "Anatomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Anatomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.719, - "details": { - "description": "min=0.719, mean=0.719, max=0.719, sum=1.437 (2)", - "tab": "Accuracy", - "Anatomy - Observed inference time (s)": { - "description": "min=0.124, mean=0.124, max=0.124, sum=0.248 (2)", - "tab": "Efficiency", - "score": 0.12411851851851854 - }, - "Anatomy - # eval": { - "description": "min=135, mean=135, max=135, sum=270 (2)", - "tab": "General information", - "score": 135.0 - }, - "Anatomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Anatomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Anatomy - # prompt tokens": { - "description": "min=400.081, mean=400.081, max=400.081, sum=800.163 (2)", - 
"tab": "General information", - "score": 400.0814814814815 - }, - "Anatomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" - } - } - }, - { - "evaluation_name": "College Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on College Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.608, - "details": { - "description": "min=0.608, mean=0.608, max=0.608, sum=1.216 (2)", - "tab": "Accuracy", - "College Chemistry - Observed inference time (s)": { - "description": "min=0.133, mean=0.133, max=0.133, sum=0.265 (2)", - "tab": "Efficiency", - "score": 0.13258 - }, - "College Biology - Observed inference time (s)": { - "description": "min=0.126, mean=0.126, max=0.126, sum=0.252 (2)", - "tab": "Efficiency", - "score": 0.12590277777777775 - }, - "College Computer Science - Observed inference time (s)": { - "description": "min=0.137, mean=0.137, max=0.137, sum=0.274 (2)", - "tab": "Efficiency", - "score": 0.13685 - }, - "College Mathematics - Observed inference time (s)": { - "description": "min=0.134, mean=0.134, max=0.134, sum=0.268 (2)", - "tab": "Efficiency", - "score": 0.13410999999999995 - }, - "College Medicine - Observed inference time (s)": { - "description": "min=0.129, mean=0.129, max=0.129, sum=0.258 (2)", - "tab": "Efficiency", - "score": 0.12883815028901727 - }, - "College Physics - Observed inference time (s)": { - "description": "min=0.129, mean=0.129, max=0.129, sum=0.258 (2)", - "tab": "Efficiency", - "score": 0.12883333333333336 - }, - "College Chemistry - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Chemistry - # prompt tokens": { - "description": "min=573.4, mean=573.4, max=573.4, sum=1146.8 (2)", - "tab": "General information", - "score": 573.4 - }, - "College Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Biology - # eval": { - "description": "min=144, mean=144, max=144, sum=288 (2)", - "tab": "General information", - "score": 144.0 - }, - "College Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # prompt tokens": { - "description": "min=510.278, mean=510.278, max=510.278, sum=1020.556 (2)", - "tab": "General information", - "score": 510.27777777777777 - }, - "College Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Computer Science 
- # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer Science - # prompt tokens": { - "description": "min=880.15, mean=880.15, max=880.15, sum=1760.3 (2)", - "tab": "General information", - "score": 880.15 - }, - "College Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Mathematics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # prompt tokens": { - "description": "min=639.53, mean=639.53, max=639.53, sum=1279.06 (2)", - "tab": "General information", - "score": 639.53 - }, - "College Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Medicine - # eval": { - "description": "min=173, mean=173, max=173, sum=346 (2)", - "tab": "General information", - "score": 173.0 - }, - "College Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Medicine - # prompt tokens": { - "description": "min=558.301, mean=558.301, max=558.301, sum=1116.601 (2)", - "tab": "General information", - "score": 558.3005780346821 - }, - "College Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Physics - # eval": { - "description": "min=102, mean=102, max=102, sum=204 (2)", - "tab": "General information", - "score": 102.0 - }, - "College Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Physics - # prompt tokens": { - "description": "min=517.324, mean=517.324, max=517.324, sum=1034.647 (2)", - "tab": "General information", - "score": 517.3235294117648 - }, - "College Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" - } - } - }, - { - "evaluation_name": "Computer Security", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Computer Security", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.79, - "details": { - "description": "min=0.79, mean=0.79, max=0.79, sum=1.58 (2)", - "tab": "Accuracy", - "Computer Security - Observed inference time (s)": { - "description": "min=0.124, mean=0.124, max=0.124, sum=0.247 (2)", - "tab": "Efficiency", - "score": 0.12359999999999999 - }, - "Computer Security - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Computer Security - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Computer Security - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Computer Security - # prompt tokens": { - "description": "min=415.4, mean=415.4, max=415.4, sum=830.8 (2)", - "tab": "General information", - "score": 415.4 - }, - "Computer Security - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" - } - } - }, - { - "evaluation_name": "Econometrics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Econometrics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.675, - "details": { - "description": "min=0.675, mean=0.675, max=0.675, sum=1.351 (2)", - "tab": "Accuracy", - "Econometrics - Observed inference time (s)": { - "description": "min=0.132, mean=0.132, max=0.132, sum=0.263 (2)", - "tab": "Efficiency", - "score": 0.13153508771929825 - }, - "Econometrics - # eval": { - "description": "min=114, mean=114, max=114, sum=228 (2)", - "tab": "General information", - "score": 114.0 - }, - "Econometrics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Econometrics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Econometrics - # prompt tokens": { - "description": "min=652.07, mean=652.07, max=652.07, sum=1304.14 (2)", - "tab": "General information", - "score": 652.0701754385965 - }, - "Econometrics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" - } - } - }, - { - "evaluation_name": "Global Facts", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Global Facts", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.55, - "details": { - "description": "min=0.55, mean=0.55, 
- "tab": "Accuracy",
- "Global Facts - Observed inference time (s)": {
- "description": "min=0.127, mean=0.127, max=0.127, sum=0.255 (2)",
- "tab": "Efficiency",
- "score": 0.12749
- },
- "Global Facts - # eval": {
- "description": "min=100, mean=100, max=100, sum=200 (2)",
- "tab": "General information",
- "score": 100.0
- },
- "Global Facts - # train": {
- "description": "min=5, mean=5, max=5, sum=10 (2)",
- "tab": "General information",
- "score": 5.0
- },
- "Global Facts - truncated": {
- "description": "min=0, mean=0, max=0, sum=0 (2)",
- "tab": "General information",
- "score": 0.0
- },
- "Global Facts - # prompt tokens": {
- "description": "min=426.42, mean=426.42, max=426.42, sum=852.84 (2)",
- "tab": "General information",
- "score": 426.42
- },
- "Global Facts - # output tokens": {
- "description": "min=1, mean=1, max=1, sum=2 (2)",
- "tab": "General information",
- "score": 1.0
- }
- }
- },
- "generation_config": {
- "additional_details": {
- "subject": "global_facts",
- "method": "multiple_choice_joint",
- "eval_split": "test",
- "groups": "mmlu_global_facts"
- }
- }
- },
- {
- "evaluation_name": "Jurisprudence",
- "source_data": {
- "dataset_name": "helm_mmlu",
- "source_type": "url",
- "url": [
- "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
- ]
- },
- "metric_config": {
- "evaluation_description": "EM on Jurisprudence",
- "lower_is_better": false,
- "score_type": "continuous",
- "min_score": 0.0,
- "max_score": 1.0
- },
- "score_details": {
- "score": 0.852,
- "details": {
- "description": "min=0.852, mean=0.852, max=0.852, sum=1.704 (2)",
- "tab": "Accuracy",
- "Jurisprudence - Observed inference time (s)": {
- "description": "min=0.124, mean=0.124, max=0.124, sum=0.248 (2)",
- "tab": "Efficiency",
- "score": 0.12411111111111109
- },
- "Jurisprudence - # eval": {
- "description": "min=108, mean=108, max=108, sum=216 (2)",
- "tab": "General information",
- "score": 108.0
- },
- "Jurisprudence - # train": {
- "description": "min=5, mean=5, max=5, sum=10 (2)",
- "tab": "General information",
- "score": 5.0
- },
- "Jurisprudence - truncated": {
- "description": "min=0, mean=0, max=0, sum=0 (2)",
- "tab": "General information",
- "score": 0.0
- },
- "Jurisprudence - # prompt tokens": {
- "description": "min=446.722, mean=446.722, max=446.722, sum=893.444 (2)",
- "tab": "General information",
- "score": 446.72222222222223
- },
- "Jurisprudence - # output tokens": {
- "description": "min=1, mean=1, max=1, sum=2 (2)",
- "tab": "General information",
- "score": 1.0
- }
- }
- },
- "generation_config": {
- "additional_details": {
- "subject": "jurisprudence",
- "method": "multiple_choice_joint",
- "eval_split": "test",
- "groups": "mmlu_jurisprudence"
- }
- }
- },
- {
- "evaluation_name": "Philosophy",
- "source_data": {
- "dataset_name": "helm_mmlu",
- "source_type": "url",
- "url": [
- "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
- ]
- },
- "metric_config": {
- "evaluation_description": "EM on Philosophy",
- "lower_is_better": false,
- "score_type": "continuous",
- "min_score": 0.0,
- "max_score": 1.0
- },
- "score_details": {
- "score": 0.817,
- "details": {
- "description": "min=0.817, mean=0.817, max=0.817, sum=1.633 (2)",
- "tab": "Accuracy",
- "Philosophy - Observed inference time (s)": {
- "description": "min=0.121, mean=0.121, max=0.121, sum=0.242 (2)",
- "tab": "Efficiency",
- "score": 0.12122186495176847
- },
"Philosophy - # eval": { - "description": "min=311, mean=311, max=311, sum=622 (2)", - "tab": "General information", - "score": 311.0 - }, - "Philosophy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Philosophy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Philosophy - # prompt tokens": { - "description": "min=381.704, mean=381.704, max=381.704, sum=763.408 (2)", - "tab": "General information", - "score": 381.7041800643087 - }, - "Philosophy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" - } - } - }, - { - "evaluation_name": "Professional Psychology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Professional Psychology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.812, - "details": { - "description": "min=0.812, mean=0.812, max=0.812, sum=1.624 (2)", - "tab": "Accuracy", - "Professional Medicine - Observed inference time (s)": { - "description": "min=0.139, mean=0.139, max=0.139, sum=0.277 (2)", - "tab": "Efficiency", - "score": 0.13866176470588237 - }, - "Professional Accounting - Observed inference time (s)": { - "description": "min=0.126, mean=0.126, max=0.126, sum=0.253 (2)", - "tab": "Efficiency", - "score": 0.1264397163120567 - }, - "Professional Law - Observed inference time (s)": { - "description": "min=0.143, mean=0.143, max=0.143, sum=0.286 (2)", - "tab": "Efficiency", - "score": 0.14286505867014285 - }, - "Professional Psychology - Observed inference time (s)": { - "description": "min=0.124, mean=0.124, max=0.124, sum=0.248 (2)", - "tab": "Efficiency", - "score": 0.12417647058823517 - }, - "Professional Medicine - # eval": { - "description": "min=272, mean=272, max=272, sum=544 (2)", - "tab": "General information", - "score": 272.0 - }, - "Professional Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Medicine - # prompt tokens": { - "description": "min=1146.287, mean=1146.287, max=1146.287, sum=2292.574 (2)", - "tab": "General information", - "score": 1146.2867647058824 - }, - "Professional Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Accounting - # eval": { - "description": "min=282, mean=282, max=282, sum=564 (2)", - "tab": "General information", - "score": 282.0 - }, - "Professional Accounting - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Accounting - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Accounting - # prompt tokens": { - "description": 
"min=688.72, mean=688.72, max=688.72, sum=1377.44 (2)", - "tab": "General information", - "score": 688.7198581560284 - }, - "Professional Accounting - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Law - # eval": { - "description": "min=1534, mean=1534, max=1534, sum=3068 (2)", - "tab": "General information", - "score": 1534.0 - }, - "Professional Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # prompt tokens": { - "description": "min=1686.73, mean=1686.73, max=1686.73, sum=3373.46 (2)", - "tab": "General information", - "score": 1686.7301173402868 - }, - "Professional Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Psychology - # eval": { - "description": "min=612, mean=612, max=612, sum=1224 (2)", - "tab": "General information", - "score": 612.0 - }, - "Professional Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # prompt tokens": { - "description": "min=625.574, mean=625.574, max=625.574, sum=1251.147 (2)", - "tab": "General information", - "score": 625.5735294117648 - }, - "Professional Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" - } - } - }, - { - "evaluation_name": "Us Foreign Policy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Us Foreign Policy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.92, - "details": { - "description": "min=0.92, mean=0.92, max=0.92, sum=1.84 (2)", - "tab": "Accuracy", - "Us Foreign Policy - Observed inference time (s)": { - "description": "min=0.128, mean=0.128, max=0.128, sum=0.256 (2)", - "tab": "Efficiency", - "score": 0.12775000000000003 - }, - "Us Foreign Policy - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Us Foreign Policy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Us Foreign Policy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Us Foreign Policy - # prompt tokens": { - "description": "min=461.12, mean=461.12, max=461.12, sum=922.24 (2)", - "tab": "General information", - "score": 461.12 - }, - "Us Foreign Policy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, 
- "generation_config": { - "additional_details": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" - } - } - }, - { - "evaluation_name": "Astronomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Astronomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.862, - "details": { - "description": "min=0.862, mean=0.862, max=0.862, sum=1.724 (2)", - "tab": "Accuracy", - "Astronomy - Observed inference time (s)": { - "description": "min=0.129, mean=0.129, max=0.129, sum=0.258 (2)", - "tab": "Efficiency", - "score": 0.12905921052631578 - }, - "Astronomy - # eval": { - "description": "min=152, mean=152, max=152, sum=304 (2)", - "tab": "General information", - "score": 152.0 - }, - "Astronomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Astronomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Astronomy - # prompt tokens": { - "description": "min=628.112, mean=628.112, max=628.112, sum=1256.224 (2)", - "tab": "General information", - "score": 628.1118421052631 - }, - "Astronomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" - } - } - }, - { - "evaluation_name": "Business Ethics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Business Ethics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.73, - "details": { - "description": "min=0.73, mean=0.73, max=0.73, sum=1.46 (2)", - "tab": "Accuracy", - "Business Ethics - Observed inference time (s)": { - "description": "min=0.126, mean=0.126, max=0.126, sum=0.252 (2)", - "tab": "Efficiency", - "score": 0.12613000000000005 - }, - "Business Ethics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Business Ethics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Business Ethics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Business Ethics - # prompt tokens": { - "description": "min=617.46, mean=617.46, max=617.46, sum=1234.92 (2)", - "tab": "General information", - "score": 617.46 - }, - "Business Ethics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" - } - } - }, - { - "evaluation_name": 
"Clinical Knowledge", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Clinical Knowledge", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8, - "details": { - "description": "min=0.8, mean=0.8, max=0.8, sum=1.6 (2)", - "tab": "Accuracy", - "Clinical Knowledge - Observed inference time (s)": { - "description": "min=0.126, mean=0.126, max=0.126, sum=0.251 (2)", - "tab": "Efficiency", - "score": 0.1255018867924528 - }, - "Clinical Knowledge - # eval": { - "description": "min=265, mean=265, max=265, sum=530 (2)", - "tab": "General information", - "score": 265.0 - }, - "Clinical Knowledge - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Clinical Knowledge - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Clinical Knowledge - # prompt tokens": { - "description": "min=451.925, mean=451.925, max=451.925, sum=903.849 (2)", - "tab": "General information", - "score": 451.92452830188677 - }, - "Clinical Knowledge - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" - } - } - }, - { - "evaluation_name": "Conceptual Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Conceptual Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.796, - "details": { - "description": "min=0.796, mean=0.796, max=0.796, sum=1.591 (2)", - "tab": "Accuracy", - "Conceptual Physics - Observed inference time (s)": { - "description": "min=0.115, mean=0.115, max=0.115, sum=0.23 (2)", - "tab": "Efficiency", - "score": 0.11518723404255315 - }, - "Conceptual Physics - # eval": { - "description": "min=235, mean=235, max=235, sum=470 (2)", - "tab": "General information", - "score": 235.0 - }, - "Conceptual Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Conceptual Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Conceptual Physics - # prompt tokens": { - "description": "min=341.723, mean=341.723, max=341.723, sum=683.447 (2)", - "tab": "General information", - "score": 341.72340425531917 - }, - "Conceptual Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" - } - } - }, - { - "evaluation_name": "Electrical Engineering", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": 
"url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Electrical Engineering", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.779, - "details": { - "description": "min=0.779, mean=0.779, max=0.779, sum=1.559 (2)", - "tab": "Accuracy", - "Electrical Engineering - Observed inference time (s)": { - "description": "min=0.116, mean=0.116, max=0.116, sum=0.232 (2)", - "tab": "Efficiency", - "score": 0.11609655172413792 - }, - "Electrical Engineering - # eval": { - "description": "min=145, mean=145, max=145, sum=290 (2)", - "tab": "General information", - "score": 145.0 - }, - "Electrical Engineering - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Electrical Engineering - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Electrical Engineering - # prompt tokens": { - "description": "min=458.345, mean=458.345, max=458.345, sum=916.69 (2)", - "tab": "General information", - "score": 458.3448275862069 - }, - "Electrical Engineering - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" - } - } - }, - { - "evaluation_name": "Elementary Mathematics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Elementary Mathematics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.757, - "details": { - "description": "min=0.757, mean=0.757, max=0.757, sum=1.513 (2)", - "tab": "Accuracy", - "Elementary Mathematics - Observed inference time (s)": { - "description": "min=0.126, mean=0.126, max=0.126, sum=0.253 (2)", - "tab": "Efficiency", - "score": 0.12626455026455036 - }, - "Elementary Mathematics - # eval": { - "description": "min=378, mean=378, max=378, sum=756 (2)", - "tab": "General information", - "score": 378.0 - }, - "Elementary Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Elementary Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Elementary Mathematics - # prompt tokens": { - "description": "min=534.09, mean=534.09, max=534.09, sum=1068.18 (2)", - "tab": "General information", - "score": 534.0899470899471 - }, - "Elementary Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" - } - } - }, - { - "evaluation_name": "Formal Logic", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ 
- "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Formal Logic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.643, - "details": { - "description": "min=0.643, mean=0.643, max=0.643, sum=1.286 (2)", - "tab": "Accuracy", - "Formal Logic - Observed inference time (s)": { - "description": "min=0.129, mean=0.129, max=0.129, sum=0.257 (2)", - "tab": "Efficiency", - "score": 0.12850793650793654 - }, - "Formal Logic - # eval": { - "description": "min=126, mean=126, max=126, sum=252 (2)", - "tab": "General information", - "score": 126.0 - }, - "Formal Logic - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Formal Logic - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Formal Logic - # prompt tokens": { - "description": "min=669, mean=669, max=669, sum=1338 (2)", - "tab": "General information", - "score": 669.0 - }, - "Formal Logic - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" - } - } - }, - { - "evaluation_name": "High School World History", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on High School World History", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.886, - "details": { - "description": "min=0.886, mean=0.886, max=0.886, sum=1.772 (2)", - "tab": "Accuracy", - "High School Biology - Observed inference time (s)": { - "description": "min=0.122, mean=0.122, max=0.122, sum=0.244 (2)", - "tab": "Efficiency", - "score": 0.12203870967741924 - }, - "High School Chemistry - Observed inference time (s)": { - "description": "min=0.127, mean=0.127, max=0.127, sum=0.254 (2)", - "tab": "Efficiency", - "score": 0.1271921182266009 - }, - "High School Computer Science - Observed inference time (s)": { - "description": "min=0.136, mean=0.136, max=0.136, sum=0.271 (2)", - "tab": "Efficiency", - "score": 0.13555999999999999 - }, - "High School European History - Observed inference time (s)": { - "description": "min=0.174, mean=0.174, max=0.174, sum=0.348 (2)", - "tab": "Efficiency", - "score": 0.1741696969696969 - }, - "High School Geography - Observed inference time (s)": { - "description": "min=0.123, mean=0.123, max=0.123, sum=0.245 (2)", - "tab": "Efficiency", - "score": 0.1226313131313131 - }, - "High School Government And Politics - Observed inference time (s)": { - "description": "min=0.125, mean=0.125, max=0.125, sum=0.251 (2)", - "tab": "Efficiency", - "score": 0.12531606217616578 - }, - "High School Macroeconomics - Observed inference time (s)": { - "description": "min=0.121, mean=0.121, max=0.121, sum=0.242 (2)", - "tab": "Efficiency", - "score": 0.12077948717948701 - }, - "High School Mathematics - Observed inference time (s)": { - "description": "min=0.126, mean=0.126, 
- "tab": "Efficiency",
- "score": 0.1257444444444444
- },
- "High School Microeconomics - Observed inference time (s)": {
- "description": "min=0.123, mean=0.123, max=0.123, sum=0.247 (2)",
- "tab": "Efficiency",
- "score": 0.12331512605042017
- },
- "High School Physics - Observed inference time (s)": {
- "description": "min=0.128, mean=0.128, max=0.128, sum=0.256 (2)",
- "tab": "Efficiency",
- "score": 0.1282052980132451
- },
- "High School Psychology - Observed inference time (s)": {
- "description": "min=0.123, mean=0.123, max=0.123, sum=0.246 (2)",
- "tab": "Efficiency",
- "score": 0.12288256880733935
- },
- "High School Statistics - Observed inference time (s)": {
- "description": "min=0.13, mean=0.13, max=0.13, sum=0.261 (2)",
- "tab": "Efficiency",
- "score": 0.13030555555555556
- },
- "High School US History - Observed inference time (s)": {
- "description": "min=0.161, mean=0.161, max=0.161, sum=0.322 (2)",
- "tab": "Efficiency",
- "score": 0.16099019607843132
- },
- "High School World History - Observed inference time (s)": {
- "description": "min=0.146, mean=0.146, max=0.146, sum=0.293 (2)",
- "tab": "Efficiency",
- "score": 0.14643881856540092
- },
- "High School Biology - # eval": {
- "description": "min=310, mean=310, max=310, sum=620 (2)",
- "tab": "General information",
- "score": 310.0
- },
- "High School Biology - # train": {
- "description": "min=5, mean=5, max=5, sum=10 (2)",
- "tab": "General information",
- "score": 5.0
- },
- "High School Biology - truncated": {
- "description": "min=0, mean=0, max=0, sum=0 (2)",
- "tab": "General information",
- "score": 0.0
- },
- "High School Biology - # prompt tokens": {
- "description": "min=568.748, mean=568.748, max=568.748, sum=1137.497 (2)",
- "tab": "General information",
- "score": 568.7483870967742
- },
- "High School Biology - # output tokens": {
- "description": "min=1, mean=1, max=1, sum=2 (2)",
- "tab": "General information",
- "score": 1.0
- },
- "High School Chemistry - # eval": {
- "description": "min=203, mean=203, max=203, sum=406 (2)",
- "tab": "General information",
- "score": 203.0
- },
- "High School Chemistry - # train": {
- "description": "min=5, mean=5, max=5, sum=10 (2)",
- "tab": "General information",
- "score": 5.0
- },
- "High School Chemistry - truncated": {
- "description": "min=0, mean=0, max=0, sum=0 (2)",
- "tab": "General information",
- "score": 0.0
- },
- "High School Chemistry - # prompt tokens": {
- "description": "min=523.65, mean=523.65, max=523.65, sum=1047.3 (2)",
- "tab": "General information",
- "score": 523.6502463054187
- },
- "High School Chemistry - # output tokens": {
- "description": "min=1, mean=1, max=1, sum=2 (2)",
- "tab": "General information",
- "score": 1.0
- },
- "High School Computer Science - # eval": {
- "description": "min=100, mean=100, max=100, sum=200 (2)",
- "tab": "General information",
- "score": 100.0
- },
- "High School Computer Science - # train": {
- "description": "min=5, mean=5, max=5, sum=10 (2)",
- "tab": "General information",
- "score": 5.0
- },
- "High School Computer Science - truncated": {
- "description": "min=0, mean=0, max=0, sum=0 (2)",
- "tab": "General information",
- "score": 0.0
- },
- "High School Computer Science - # prompt tokens": {
- "description": "min=932.15, mean=932.15, max=932.15, sum=1864.3 (2)",
- "tab": "General information",
- "score": 932.15
- },
- "High School Computer Science - # output tokens": {
- "description": "min=1, mean=1, max=1, sum=2 (2)",
- "tab": "General information",
- "score": 1.0
- },
- "High School European History - # eval": {
- "description": "min=165, mean=165, max=165, sum=330 (2)",
- "tab": "General information",
- "score": 165.0
- },
- "High School European History - # train": {
- "description": "min=5, mean=5, max=5, sum=10 (2)",
- "tab": "General information",
- "score": 5.0
- },
- "High School European History - truncated": {
- "description": "min=0, mean=0, max=0, sum=0 (2)",
- "tab": "General information",
- "score": 0.0
- },
- "High School European History - # prompt tokens": {
- "description": "min=2872.03, mean=2872.03, max=2872.03, sum=5744.061 (2)",
- "tab": "General information",
- "score": 2872.030303030303
- },
- "High School European History - # output tokens": {
- "description": "min=1, mean=1, max=1, sum=2 (2)",
- "tab": "General information",
- "score": 1.0
- },
- "High School Geography - # eval": {
- "description": "min=198, mean=198, max=198, sum=396 (2)",
- "tab": "General information",
- "score": 198.0
- },
- "High School Geography - # train": {
- "description": "min=5, mean=5, max=5, sum=10 (2)",
- "tab": "General information",
- "score": 5.0
- },
- "High School Geography - truncated": {
- "description": "min=0, mean=0, max=0, sum=0 (2)",
- "tab": "General information",
- "score": 0.0
- },
- "High School Geography - # prompt tokens": {
- "description": "min=425.646, mean=425.646, max=425.646, sum=851.293 (2)",
- "tab": "General information",
- "score": 425.64646464646466
- },
- "High School Geography - # output tokens": {
- "description": "min=1, mean=1, max=1, sum=2 (2)",
- "tab": "General information",
- "score": 1.0
- },
- "High School Government And Politics - # eval": {
- "description": "min=193, mean=193, max=193, sum=386 (2)",
- "tab": "General information",
- "score": 193.0
- },
- "High School Government And Politics - # train": {
- "description": "min=5, mean=5, max=5, sum=10 (2)",
- "tab": "General information",
- "score": 5.0
- },
- "High School Government And Politics - truncated": {
- "description": "min=0, mean=0, max=0, sum=0 (2)",
- "tab": "General information",
- "score": 0.0
- },
- "High School Government And Politics - # prompt tokens": {
- "description": "min=506.073, mean=506.073, max=506.073, sum=1012.145 (2)",
- "tab": "General information",
- "score": 506.07253886010363
- },
- "High School Government And Politics - # output tokens": {
- "description": "min=1, mean=1, max=1, sum=2 (2)",
- "tab": "General information",
- "score": 1.0
- },
- "High School Macroeconomics - # eval": {
- "description": "min=390, mean=390, max=390, sum=780 (2)",
- "tab": "General information",
- "score": 390.0
- },
- "High School Macroeconomics - # train": {
- "description": "min=5, mean=5, max=5, sum=10 (2)",
- "tab": "General information",
- "score": 5.0
- },
- "High School Macroeconomics - truncated": {
- "description": "min=0, mean=0, max=0, sum=0 (2)",
- "tab": "General information",
- "score": 0.0
- },
- "High School Macroeconomics - # prompt tokens": {
- "description": "min=419.987, mean=419.987, max=419.987, sum=839.974 (2)",
- "tab": "General information",
- "score": 419.9871794871795
- },
- "High School Macroeconomics - # output tokens": {
- "description": "min=1, mean=1, max=1, sum=2 (2)",
- "tab": "General information",
- "score": 1.0
- },
- "High School Mathematics - # eval": {
- "description": "min=270, mean=270, max=270, sum=540 (2)",
- "tab": "General information",
- "score": 270.0
- },
- "High School Mathematics - # train": {
- "description": "min=5, mean=5, max=5, sum=10 (2)",
- "tab": "General information",
- "score": 5.0
- },
- "High School Mathematics - truncated": {
- "description": "min=0, mean=0, max=0, sum=0 (2)",
- "tab": "General information",
- "score": 0.0
- },
- "High School Mathematics - # prompt tokens": {
- "description": "min=554.352, mean=554.352, max=554.352, sum=1108.704 (2)",
- "tab": "General information",
- "score": 554.3518518518518
- },
- "High School Mathematics - # output tokens": {
- "description": "min=1, mean=1, max=1, sum=2 (2)",
- "tab": "General information",
- "score": 1.0
- },
- "High School Microeconomics - # eval": {
- "description": "min=238, mean=238, max=238, sum=476 (2)",
- "tab": "General information",
- "score": 238.0
- },
- "High School Microeconomics - # train": {
- "description": "min=5, mean=5, max=5, sum=10 (2)",
- "tab": "General information",
- "score": 5.0
- },
- "High School Microeconomics - truncated": {
- "description": "min=0, mean=0, max=0, sum=0 (2)",
- "tab": "General information",
- "score": 0.0
- },
- "High School Microeconomics - # prompt tokens": {
- "description": "min=439.055, mean=439.055, max=439.055, sum=878.109 (2)",
- "tab": "General information",
- "score": 439.0546218487395
- },
- "High School Microeconomics - # output tokens": {
- "description": "min=1, mean=1, max=1, sum=2 (2)",
- "tab": "General information",
- "score": 1.0
- },
- "High School Physics - # eval": {
- "description": "min=151, mean=151, max=151, sum=302 (2)",
- "tab": "General information",
- "score": 151.0
- },
- "High School Physics - # train": {
- "description": "min=5, mean=5, max=5, sum=10 (2)",
- "tab": "General information",
- "score": 5.0
- },
- "High School Physics - truncated": {
- "description": "min=0, mean=0, max=0, sum=0 (2)",
- "tab": "General information",
- "score": 0.0
- },
- "High School Physics - # prompt tokens": {
- "description": "min=581.669, mean=581.669, max=581.669, sum=1163.338 (2)",
- "tab": "General information",
- "score": 581.6688741721854
- },
- "High School Physics - # output tokens": {
- "description": "min=1, mean=1, max=1, sum=2 (2)",
- "tab": "General information",
- "score": 1.0
- },
- "High School Psychology - # eval": {
- "description": "min=545, mean=545, max=545, sum=1090 (2)",
- "tab": "General information",
- "score": 545.0
- },
- "High School Psychology - # train": {
- "description": "min=5, mean=5, max=5, sum=10 (2)",
- "tab": "General information",
- "score": 5.0
- },
- "High School Psychology - truncated": {
- "description": "min=0, mean=0, max=0, sum=0 (2)",
- "tab": "General information",
- "score": 0.0
- },
- "High School Psychology - # prompt tokens": {
- "description": "min=544.842, mean=544.842, max=544.842, sum=1089.684 (2)",
- "tab": "General information",
- "score": 544.8422018348624
- },
- "High School Psychology - # output tokens": {
- "description": "min=1, mean=1, max=1, sum=2 (2)",
- "tab": "General information",
- "score": 1.0
- },
- "High School Statistics - # eval": {
- "description": "min=216, mean=216, max=216, sum=432 (2)",
- "tab": "General information",
- "score": 216.0
- },
- "High School Statistics - # train": {
- "description": "min=5, mean=5, max=5, sum=10 (2)",
- "tab": "General information",
- "score": 5.0
- },
- "High School Statistics - truncated": {
- "description": "min=0, mean=0, max=0, sum=0 (2)",
- "tab": "General information",
- "score": 0.0
- },
- "High School Statistics - # prompt tokens": {
- "description": "min=833, mean=833, max=833, sum=1666 (2)",
- "tab": "General information",
- "score": 833.0
- },
- "High School Statistics - # output tokens": {
- "description": "min=1, mean=1, max=1, sum=2 (2)",
- "tab": "General information",
- "score": 1.0
- },
- "High School US History - # eval": {
- "description": "min=204, mean=204, max=204, sum=408 (2)",
- "tab": "General information",
- "score": 204.0
- },
- "High School US History - # train": {
- "description": "min=5, mean=5, max=5, sum=10 (2)",
- "tab": "General information",
- "score": 5.0
- },
- "High School US History - truncated": {
- "description": "min=0, mean=0, max=0, sum=0 (2)",
- "tab": "General information",
- "score": 0.0
- },
- "High School US History - # prompt tokens": {
- "description": "min=2270.25, mean=2270.25, max=2270.25, sum=4540.5 (2)",
- "tab": "General information",
- "score": 2270.25
- },
- "High School US History - # output tokens": {
- "description": "min=1, mean=1, max=1, sum=2 (2)",
- "tab": "General information",
- "score": 1.0
- },
- "High School World History - # eval": {
- "description": "min=237, mean=237, max=237, sum=474 (2)",
- "tab": "General information",
- "score": 237.0
- },
- "High School World History - # train": {
- "description": "min=5, mean=5, max=5, sum=10 (2)",
- "tab": "General information",
- "score": 5.0
- },
- "High School World History - truncated": {
- "description": "min=0, mean=0, max=0, sum=0 (2)",
- "tab": "General information",
- "score": 0.0
- },
- "High School World History - # prompt tokens": {
- "description": "min=1466.561, mean=1466.561, max=1466.561, sum=2933.122 (2)",
- "tab": "General information",
- "score": 1466.5611814345991
- },
- "High School World History - # output tokens": {
- "description": "min=1, mean=1, max=1, sum=2 (2)",
- "tab": "General information",
- "score": 1.0
- }
- }
- },
- "generation_config": {
- "additional_details": {
- "subject": "high_school_world_history",
- "method": "multiple_choice_joint",
- "eval_split": "test",
- "groups": "mmlu_high_school_world_history"
- }
- }
- },
- {
- "evaluation_name": "Human Sexuality",
- "source_data": {
- "dataset_name": "helm_mmlu",
- "source_type": "url",
- "url": [
- "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
- ]
- },
- "metric_config": {
- "evaluation_description": "EM on Human Sexuality",
- "lower_is_better": false,
- "score_type": "continuous",
- "min_score": 0.0,
- "max_score": 1.0
- },
- "score_details": {
- "score": 0.84,
- "details": {
- "description": "min=0.84, mean=0.84, max=0.84, sum=1.679 (2)",
- "tab": "Accuracy",
- "Human Aging - Observed inference time (s)": {
- "description": "min=0.113, mean=0.113, max=0.113, sum=0.227 (2)",
- "tab": "Efficiency",
- "score": 0.11326008968609867
- },
- "Human Sexuality - Observed inference time (s)": {
- "description": "min=0.118, mean=0.118, max=0.118, sum=0.236 (2)",
- "tab": "Efficiency",
- "score": 0.11813740458015273
- },
- "Human Aging - # eval": {
- "description": "min=223, mean=223, max=223, sum=446 (2)",
- "tab": "General information",
- "score": 223.0
- },
- "Human Aging - # train": {
- "description": "min=5, mean=5, max=5, sum=10 (2)",
- "tab": "General information",
- "score": 5.0
- },
- "Human Aging - truncated": {
- "description": "min=0, mean=0, max=0, sum=0 (2)",
- "tab": "General information",
- "score": 0.0
- },
- "Human Aging - # prompt tokens": {
- "description": "min=352.48, mean=352.48, max=352.48, sum=704.96 (2)",
- "tab": "General information",
- "score": 352.47982062780267
- },
- "Human Aging - # output tokens": {
- "description": "min=1, mean=1, max=1, sum=2 (2)",
- "tab": "General information",
- "score": 1.0
- },
- "Human Sexuality - # eval": {
eval": { - "description": "min=131, mean=131, max=131, sum=262 (2)", - "tab": "General information", - "score": 131.0 - }, - "Human Sexuality - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Sexuality - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # prompt tokens": { - "description": "min=385.626, mean=385.626, max=385.626, sum=771.252 (2)", - "tab": "General information", - "score": 385.62595419847327 - }, - "Human Sexuality - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" - } - } - }, - { - "evaluation_name": "International Law", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on International Law", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.843, - "details": { - "description": "min=0.843, mean=0.843, max=0.843, sum=1.686 (2)", - "tab": "Accuracy", - "International Law - Observed inference time (s)": { - "description": "min=0.129, mean=0.129, max=0.129, sum=0.258 (2)", - "tab": "Efficiency", - "score": 0.129206611570248 - }, - "International Law - # eval": { - "description": "min=121, mean=121, max=121, sum=242 (2)", - "tab": "General information", - "score": 121.0 - }, - "International Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "International Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "International Law - # prompt tokens": { - "description": "min=667.843, mean=667.843, max=667.843, sum=1335.686 (2)", - "tab": "General information", - "score": 667.8429752066115 - }, - "International Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" - } - } - }, - { - "evaluation_name": "Logical Fallacies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Logical Fallacies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.81, - "details": { - "description": "min=0.81, mean=0.81, max=0.81, sum=1.62 (2)", - "tab": "Accuracy", - "Logical Fallacies - Observed inference time (s)": { - "description": "min=0.124, mean=0.124, max=0.124, sum=0.249 (2)", - "tab": "Efficiency", - "score": 0.12445398773006137 - }, - "Logical Fallacies - # eval": { - "description": "min=163, mean=163, max=163, sum=326 (2)", - "tab": "General information", - "score": 163.0 - 
- },
- "Logical Fallacies - # train": {
- "description": "min=5, mean=5, max=5, sum=10 (2)",
- "tab": "General information",
- "score": 5.0
- },
- "Logical Fallacies - truncated": {
- "description": "min=0, mean=0, max=0, sum=0 (2)",
- "tab": "General information",
- "score": 0.0
- },
- "Logical Fallacies - # prompt tokens": {
- "description": "min=482.227, mean=482.227, max=482.227, sum=964.454 (2)",
- "tab": "General information",
- "score": 482.2269938650307
- },
- "Logical Fallacies - # output tokens": {
- "description": "min=1, mean=1, max=1, sum=2 (2)",
- "tab": "General information",
- "score": 1.0
- }
- }
- },
- "generation_config": {
- "additional_details": {
- "subject": "logical_fallacies",
- "method": "multiple_choice_joint",
- "eval_split": "test",
- "groups": "mmlu_logical_fallacies"
- }
- }
- },
- {
- "evaluation_name": "Machine Learning",
- "source_data": {
- "dataset_name": "helm_mmlu",
- "source_type": "url",
- "url": [
- "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
- ]
- },
- "metric_config": {
- "evaluation_description": "EM on Machine Learning",
- "lower_is_better": false,
- "score_type": "continuous",
- "min_score": 0.0,
- "max_score": 1.0
- },
- "score_details": {
- "score": 0.509,
- "details": {
- "description": "min=0.509, mean=0.509, max=0.509, sum=1.018 (2)",
- "tab": "Accuracy",
- "Machine Learning - Observed inference time (s)": {
- "description": "min=0.135, mean=0.135, max=0.135, sum=0.27 (2)",
- "tab": "Efficiency",
- "score": 0.13516071428571433
- },
- "Machine Learning - # eval": {
- "description": "min=112, mean=112, max=112, sum=224 (2)",
- "tab": "General information",
- "score": 112.0
- },
- "Machine Learning - # train": {
- "description": "min=5, mean=5, max=5, sum=10 (2)",
- "tab": "General information",
- "score": 5.0
- },
- "Machine Learning - truncated": {
- "description": "min=0, mean=0, max=0, sum=0 (2)",
- "tab": "General information",
- "score": 0.0
- },
- "Machine Learning - # prompt tokens": {
- "description": "min=699.598, mean=699.598, max=699.598, sum=1399.196 (2)",
- "tab": "General information",
- "score": 699.5982142857143
- },
- "Machine Learning - # output tokens": {
- "description": "min=1, mean=1, max=1, sum=2 (2)",
- "tab": "General information",
- "score": 1.0
- }
- }
- },
- "generation_config": {
- "additional_details": {
- "subject": "machine_learning",
- "method": "multiple_choice_joint",
- "eval_split": "test",
- "groups": "mmlu_machine_learning"
- }
- }
- },
- {
- "evaluation_name": "Management",
- "source_data": {
- "dataset_name": "helm_mmlu",
- "source_type": "url",
- "url": [
- "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
- ]
- },
- "metric_config": {
- "evaluation_description": "EM on Management",
- "lower_is_better": false,
- "score_type": "continuous",
- "min_score": 0.0,
- "max_score": 1.0
- },
- "score_details": {
- "score": 0.864,
- "details": {
- "description": "min=0.864, mean=0.864, max=0.864, sum=1.728 (2)",
- "tab": "Accuracy",
- "Management - Observed inference time (s)": {
- "description": "min=0.118, mean=0.118, max=0.118, sum=0.237 (2)",
- "tab": "Efficiency",
- "score": 0.1183980582524272
- },
- "Management - # eval": {
- "description": "min=103, mean=103, max=103, sum=206 (2)",
- "tab": "General information",
- "score": 103.0
- },
- "Management - # train": {
- "description": "min=5, mean=5, max=5, sum=10 (2)",
- "tab": "General information",
- "score": 5.0
- },
"Management - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Management - # prompt tokens": { - "description": "min=320.34, mean=320.34, max=320.34, sum=640.68 (2)", - "tab": "General information", - "score": 320.3398058252427 - }, - "Management - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" - } - } - }, - { - "evaluation_name": "Marketing", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Marketing", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.889, - "details": { - "description": "min=0.889, mean=0.889, max=0.889, sum=1.778 (2)", - "tab": "Accuracy", - "Marketing - Observed inference time (s)": { - "description": "min=0.122, mean=0.122, max=0.122, sum=0.243 (2)", - "tab": "Efficiency", - "score": 0.12151282051282052 - }, - "Marketing - # eval": { - "description": "min=234, mean=234, max=234, sum=468 (2)", - "tab": "General information", - "score": 234.0 - }, - "Marketing - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Marketing - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Marketing - # prompt tokens": { - "description": "min=466.697, mean=466.697, max=466.697, sum=933.393 (2)", - "tab": "General information", - "score": 466.6965811965812 - }, - "Marketing - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" - } - } - }, - { - "evaluation_name": "Medical Genetics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Medical Genetics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9, - "details": { - "description": "min=0.9, mean=0.9, max=0.9, sum=1.8 (2)", - "tab": "Accuracy", - "Medical Genetics - Observed inference time (s)": { - "description": "min=0.115, mean=0.115, max=0.115, sum=0.23 (2)", - "tab": "Efficiency", - "score": 0.11518 - }, - "Medical Genetics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Medical Genetics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Medical Genetics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Medical Genetics - # prompt tokens": { - "description": "min=380.71, mean=380.71, max=380.71, sum=761.42 (2)", 
- "tab": "General information", - "score": 380.71 - }, - "Medical Genetics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" - } - } - }, - { - "evaluation_name": "Miscellaneous", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Miscellaneous", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.872, - "details": { - "description": "min=0.872, mean=0.872, max=0.872, sum=1.745 (2)", - "tab": "Accuracy", - "Miscellaneous - Observed inference time (s)": { - "description": "min=0.114, mean=0.114, max=0.114, sum=0.227 (2)", - "tab": "Efficiency", - "score": 0.11356577266922054 - }, - "Miscellaneous - # eval": { - "description": "min=783, mean=783, max=783, sum=1566 (2)", - "tab": "General information", - "score": 783.0 - }, - "Miscellaneous - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Miscellaneous - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Miscellaneous - # prompt tokens": { - "description": "min=342.847, mean=342.847, max=342.847, sum=685.693 (2)", - "tab": "General information", - "score": 342.84674329501917 - }, - "Miscellaneous - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" - } - } - }, - { - "evaluation_name": "Moral Scenarios", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Moral Scenarios", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.694, - "details": { - "description": "min=0.694, mean=0.694, max=0.694, sum=1.388 (2)", - "tab": "Accuracy", - "Moral Disputes - Observed inference time (s)": { - "description": "min=0.125, mean=0.125, max=0.125, sum=0.249 (2)", - "tab": "Efficiency", - "score": 0.12473699421965324 - }, - "Moral Scenarios - Observed inference time (s)": { - "description": "min=0.124, mean=0.124, max=0.124, sum=0.247 (2)", - "tab": "Efficiency", - "score": 0.12357988826815636 - }, - "Moral Disputes - # eval": { - "description": "min=346, mean=346, max=346, sum=692 (2)", - "tab": "General information", - "score": 346.0 - }, - "Moral Disputes - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Disputes - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Disputes - # prompt tokens": { - "description": "min=525.329, mean=525.329, max=525.329, sum=1050.659 
(2)", - "tab": "General information", - "score": 525.3294797687861 - }, - "Moral Disputes - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Moral Scenarios - # eval": { - "description": "min=895, mean=895, max=895, sum=1790 (2)", - "tab": "General information", - "score": 895.0 - }, - "Moral Scenarios - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Scenarios - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # prompt tokens": { - "description": "min=692.482, mean=692.482, max=692.482, sum=1384.963 (2)", - "tab": "General information", - "score": 692.4815642458101 - }, - "Moral Scenarios - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" - } - } - }, - { - "evaluation_name": "Nutrition", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Nutrition", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.788, - "details": { - "description": "min=0.788, mean=0.788, max=0.788, sum=1.575 (2)", - "tab": "Accuracy", - "Nutrition - Observed inference time (s)": { - "description": "min=0.124, mean=0.124, max=0.124, sum=0.247 (2)", - "tab": "Efficiency", - "score": 0.12373529411764701 - }, - "Nutrition - # eval": { - "description": "min=306, mean=306, max=306, sum=612 (2)", - "tab": "General information", - "score": 306.0 - }, - "Nutrition - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Nutrition - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Nutrition - # prompt tokens": { - "description": "min=612.69, mean=612.69, max=612.69, sum=1225.379 (2)", - "tab": "General information", - "score": 612.6895424836601 - }, - "Nutrition - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" - } - } - }, - { - "evaluation_name": "Prehistory", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Prehistory", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.849, - "details": { - "description": "min=0.849, mean=0.849, max=0.849, sum=1.698 (2)", - "tab": "Accuracy", - "Prehistory - Observed inference time (s)": { - "description": "min=0.129, mean=0.129, max=0.129, sum=0.258 (2)", - "tab": "Efficiency", - "score": 0.1291882716049382 - }, - 
"Prehistory - # eval": { - "description": "min=324, mean=324, max=324, sum=648 (2)", - "tab": "General information", - "score": 324.0 - }, - "Prehistory - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Prehistory - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Prehistory - # prompt tokens": { - "description": "min=552.454, mean=552.454, max=552.454, sum=1104.907 (2)", - "tab": "General information", - "score": 552.4537037037037 - }, - "Prehistory - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" - } - } - }, - { - "evaluation_name": "Public Relations", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Public Relations", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.682, - "details": { - "description": "min=0.682, mean=0.682, max=0.682, sum=1.364 (2)", - "tab": "Accuracy", - "Public Relations - Observed inference time (s)": { - "description": "min=0.12, mean=0.12, max=0.12, sum=0.241 (2)", - "tab": "Efficiency", - "score": 0.1202636363636364 - }, - "Public Relations - # eval": { - "description": "min=110, mean=110, max=110, sum=220 (2)", - "tab": "General information", - "score": 110.0 - }, - "Public Relations - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Public Relations - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Public Relations - # prompt tokens": { - "description": "min=448.609, mean=448.609, max=448.609, sum=897.218 (2)", - "tab": "General information", - "score": 448.6090909090909 - }, - "Public Relations - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" - } - } - }, - { - "evaluation_name": "Security Studies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Security Studies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.788, - "details": { - "description": "min=0.788, mean=0.788, max=0.788, sum=1.576 (2)", - "tab": "Accuracy", - "Security Studies - Observed inference time (s)": { - "description": "min=0.137, mean=0.137, max=0.137, sum=0.273 (2)", - "tab": "Efficiency", - "score": 0.13666530612244904 - }, - "Security Studies - # eval": { - "description": "min=245, mean=245, max=245, sum=490 (2)", - "tab": "General information", - "score": 245.0 - }, - "Security Studies - # 
train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Security Studies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Security Studies - # prompt tokens": { - "description": "min=1224.433, mean=1224.433, max=1224.433, sum=2448.865 (2)", - "tab": "General information", - "score": 1224.4326530612245 - }, - "Security Studies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" - } - } - }, - { - "evaluation_name": "Sociology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Sociology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.896, - "details": { - "description": "min=0.896, mean=0.896, max=0.896, sum=1.791 (2)", - "tab": "Accuracy", - "Sociology - Observed inference time (s)": { - "description": "min=0.121, mean=0.121, max=0.121, sum=0.241 (2)", - "tab": "Efficiency", - "score": 0.12068656716417903 - }, - "Sociology - # eval": { - "description": "min=201, mean=201, max=201, sum=402 (2)", - "tab": "General information", - "score": 201.0 - }, - "Sociology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Sociology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Sociology - # prompt tokens": { - "description": "min=474.512, mean=474.512, max=474.512, sum=949.025 (2)", - "tab": "General information", - "score": 474.5124378109453 - }, - "Sociology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" - } - } - }, - { - "evaluation_name": "Virology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Virology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.542, - "details": { - "description": "min=0.542, mean=0.542, max=0.542, sum=1.084 (2)", - "tab": "Accuracy", - "Virology - Observed inference time (s)": { - "description": "min=0.114, mean=0.114, max=0.114, sum=0.227 (2)", - "tab": "Efficiency", - "score": 0.113578313253012 - }, - "Virology - # eval": { - "description": "min=166, mean=166, max=166, sum=332 (2)", - "tab": "General information", - "score": 166.0 - }, - "Virology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Virology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - 
"score": 0.0 - }, - "Virology - # prompt tokens": { - "description": "min=380.753, mean=380.753, max=380.753, sum=761.506 (2)", - "tab": "General information", - "score": 380.7530120481928 - }, - "Virology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" - } - } - }, - { - "evaluation_name": "World Religions", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on World Religions", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.871, - "details": { - "description": "min=0.871, mean=0.871, max=0.871, sum=1.743 (2)", - "tab": "Accuracy", - "World Religions - Observed inference time (s)": { - "description": "min=0.114, mean=0.114, max=0.114, sum=0.229 (2)", - "tab": "Efficiency", - "score": 0.11440935672514624 - }, - "World Religions - # eval": { - "description": "min=171, mean=171, max=171, sum=342 (2)", - "tab": "General information", - "score": 171.0 - }, - "World Religions - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "World Religions - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "World Religions - # prompt tokens": { - "description": "min=305.386, mean=305.386, max=305.386, sum=610.772 (2)", - "tab": "General information", - "score": 305.3859649122807 - }, - "World Religions - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" - } - } - }, - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.987, - "details": { - "tab": "Efficiency" - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_mmlu/amazon/nova-micro-v1_0/c8949c55-8987-4ed3-b74b-8b13b4381806.json b/data/helm_mmlu/amazon/nova-micro-v1_0/c8949c55-8987-4ed3-b74b-8b13b4381806.json deleted file mode 100644 index ab9b8c843..000000000 --- a/data/helm_mmlu/amazon/nova-micro-v1_0/c8949c55-8987-4ed3-b74b-8b13b4381806.json +++ /dev/null @@ -1,3021 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/amazon_nova-micro-v1:0/1770835937.459157", - "retrieved_timestamp": "1770835937.459157", - "source_metadata": { - "source_name": "helm_mmlu", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - 
"model_info": { - "name": "Amazon Nova Micro", - "id": "amazon/nova-micro-v1:0", - "developer": "amazon", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "MMLU All Subjects", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU All Subjects", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.708, - "details": { - "description": "min=0.42, mean=0.708, max=0.922, sum=80.671 (114)", - "tab": "Accuracy", - "MMLU All Subjects - Observed inference time (s)": { - "description": "min=0.102, mean=0.114, max=0.152, sum=13.049 (114)", - "tab": "Efficiency", - "score": 0.1144634124237814 - }, - "MMLU All Subjects - # eval": { - "description": "min=100, mean=246.351, max=1534, sum=28084 (114)", - "tab": "General information", - "score": 246.35087719298247 - }, - "MMLU All Subjects - # train": { - "description": "min=5, mean=5, max=5, sum=570 (114)", - "tab": "General information", - "score": 5.0 - }, - "MMLU All Subjects - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (114)", - "tab": "General information", - "score": 0.0 - }, - "MMLU All Subjects - # prompt tokens": { - "description": "min=305.386, mean=655.489, max=2872.03, sum=74725.746 (114)", - "tab": "General information", - "score": 655.4890026560713 - }, - "MMLU All Subjects - # output tokens": { - "description": "min=0.999, mean=1.0, max=1, sum=113.997 (114)", - "tab": "General information", - "score": 0.9999775940489795 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - 
"mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] - } - } - }, - { - "evaluation_name": "Abstract Algebra", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Abstract Algebra", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.42, - "details": { - "description": "min=0.42, mean=0.42, max=0.42, sum=0.84 (2)", - "tab": "Accuracy", - "Abstract Algebra - Observed inference time (s)": { - "description": "min=0.117, mean=0.117, max=0.117, sum=0.234 (2)", - "tab": "Efficiency", - "score": 0.11696000000000005 - }, - "Abstract Algebra - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Abstract Algebra - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Abstract Algebra - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Abstract Algebra - # prompt tokens": { - "description": "min=399.38, mean=399.38, max=399.38, sum=798.76 (2)", - "tab": "General information", - "score": 399.38 - }, - "Abstract Algebra - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" - } - } - }, - { - "evaluation_name": "Anatomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Anatomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.726, - "details": { - "description": "min=0.726, mean=0.726, max=0.726, sum=1.452 (2)", - "tab": 
"Accuracy", - "Anatomy - Observed inference time (s)": { - "description": "min=0.107, mean=0.107, max=0.107, sum=0.214 (2)", - "tab": "Efficiency", - "score": 0.10704444444444451 - }, - "Anatomy - # eval": { - "description": "min=135, mean=135, max=135, sum=270 (2)", - "tab": "General information", - "score": 135.0 - }, - "Anatomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Anatomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Anatomy - # prompt tokens": { - "description": "min=400.081, mean=400.081, max=400.081, sum=800.163 (2)", - "tab": "General information", - "score": 400.0814814814815 - }, - "Anatomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" - } - } - }, - { - "evaluation_name": "College Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on College Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5, - "details": { - "description": "min=0.5, mean=0.5, max=0.5, sum=1 (2)", - "tab": "Accuracy", - "College Chemistry - Observed inference time (s)": { - "description": "min=0.118, mean=0.118, max=0.118, sum=0.235 (2)", - "tab": "Efficiency", - "score": 0.11762000000000004 - }, - "College Biology - Observed inference time (s)": { - "description": "min=0.118, mean=0.118, max=0.118, sum=0.237 (2)", - "tab": "Efficiency", - "score": 0.11843055555555557 - }, - "College Computer Science - Observed inference time (s)": { - "description": "min=0.125, mean=0.125, max=0.125, sum=0.25 (2)", - "tab": "Efficiency", - "score": 0.12490000000000004 - }, - "College Mathematics - Observed inference time (s)": { - "description": "min=0.122, mean=0.122, max=0.122, sum=0.244 (2)", - "tab": "Efficiency", - "score": 0.12207000000000001 - }, - "College Medicine - Observed inference time (s)": { - "description": "min=0.116, mean=0.116, max=0.116, sum=0.233 (2)", - "tab": "Efficiency", - "score": 0.11635838150289027 - }, - "College Physics - Observed inference time (s)": { - "description": "min=0.115, mean=0.115, max=0.115, sum=0.229 (2)", - "tab": "Efficiency", - "score": 0.11473529411764712 - }, - "College Chemistry - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Chemistry - # prompt tokens": { - "description": "min=573.4, mean=573.4, max=573.4, sum=1146.8 (2)", - "tab": "General information", - "score": 573.4 - }, - "College Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Biology - # eval": { - "description": "min=144, mean=144, 
max=144, sum=288 (2)", - "tab": "General information", - "score": 144.0 - }, - "College Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # prompt tokens": { - "description": "min=510.278, mean=510.278, max=510.278, sum=1020.556 (2)", - "tab": "General information", - "score": 510.27777777777777 - }, - "College Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer Science - # prompt tokens": { - "description": "min=880.15, mean=880.15, max=880.15, sum=1760.3 (2)", - "tab": "General information", - "score": 880.15 - }, - "College Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Mathematics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # prompt tokens": { - "description": "min=639.53, mean=639.53, max=639.53, sum=1279.06 (2)", - "tab": "General information", - "score": 639.53 - }, - "College Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Medicine - # eval": { - "description": "min=173, mean=173, max=173, sum=346 (2)", - "tab": "General information", - "score": 173.0 - }, - "College Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Medicine - # prompt tokens": { - "description": "min=558.301, mean=558.301, max=558.301, sum=1116.601 (2)", - "tab": "General information", - "score": 558.3005780346821 - }, - "College Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Physics - # eval": { - "description": "min=102, mean=102, max=102, sum=204 (2)", - "tab": "General information", - "score": 102.0 - }, - "College Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Physics - # prompt tokens": { - "description": "min=517.324, mean=517.324, max=517.324, sum=1034.647 (2)", - "tab": "General information", - 
"score": 517.3235294117648 - }, - "College Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" - } - } - }, - { - "evaluation_name": "Computer Security", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Computer Security", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.77, - "details": { - "description": "min=0.77, mean=0.77, max=0.77, sum=1.54 (2)", - "tab": "Accuracy", - "Computer Security - Observed inference time (s)": { - "description": "min=0.115, mean=0.115, max=0.115, sum=0.231 (2)", - "tab": "Efficiency", - "score": 0.11527000000000003 - }, - "Computer Security - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Computer Security - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Computer Security - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Computer Security - # prompt tokens": { - "description": "min=415.4, mean=415.4, max=415.4, sum=830.8 (2)", - "tab": "General information", - "score": 415.4 - }, - "Computer Security - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" - } - } - }, - { - "evaluation_name": "Econometrics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Econometrics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.57, - "details": { - "description": "min=0.57, mean=0.57, max=0.57, sum=1.14 (2)", - "tab": "Accuracy", - "Econometrics - Observed inference time (s)": { - "description": "min=0.116, mean=0.116, max=0.116, sum=0.231 (2)", - "tab": "Efficiency", - "score": 0.11560526315789472 - }, - "Econometrics - # eval": { - "description": "min=114, mean=114, max=114, sum=228 (2)", - "tab": "General information", - "score": 114.0 - }, - "Econometrics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Econometrics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Econometrics - # prompt tokens": { - "description": "min=652.07, mean=652.07, max=652.07, sum=1304.14 (2)", - "tab": "General information", - "score": 652.0701754385965 - }, - "Econometrics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - 
}, - "generation_config": { - "additional_details": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" - } - } - }, - { - "evaluation_name": "Global Facts", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Global Facts", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.44, - "details": { - "description": "min=0.44, mean=0.44, max=0.44, sum=0.88 (2)", - "tab": "Accuracy", - "Global Facts - Observed inference time (s)": { - "description": "min=0.115, mean=0.115, max=0.115, sum=0.231 (2)", - "tab": "Efficiency", - "score": 0.11540999999999998 - }, - "Global Facts - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Global Facts - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Global Facts - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Global Facts - # prompt tokens": { - "description": "min=426.42, mean=426.42, max=426.42, sum=852.84 (2)", - "tab": "General information", - "score": 426.42 - }, - "Global Facts - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" - } - } - }, - { - "evaluation_name": "Jurisprudence", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Jurisprudence", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.815, - "details": { - "description": "min=0.815, mean=0.815, max=0.815, sum=1.63 (2)", - "tab": "Accuracy", - "Jurisprudence - Observed inference time (s)": { - "description": "min=0.111, mean=0.111, max=0.111, sum=0.223 (2)", - "tab": "Efficiency", - "score": 0.11141666666666669 - }, - "Jurisprudence - # eval": { - "description": "min=108, mean=108, max=108, sum=216 (2)", - "tab": "General information", - "score": 108.0 - }, - "Jurisprudence - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Jurisprudence - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Jurisprudence - # prompt tokens": { - "description": "min=446.722, mean=446.722, max=446.722, sum=893.444 (2)", - "tab": "General information", - "score": 446.72222222222223 - }, - "Jurisprudence - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" - } - } - }, - { - "evaluation_name": 
"Philosophy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Philosophy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.733, - "details": { - "description": "min=0.733, mean=0.733, max=0.733, sum=1.466 (2)", - "tab": "Accuracy", - "Philosophy - Observed inference time (s)": { - "description": "min=0.107, mean=0.107, max=0.107, sum=0.214 (2)", - "tab": "Efficiency", - "score": 0.10707717041800643 - }, - "Philosophy - # eval": { - "description": "min=311, mean=311, max=311, sum=622 (2)", - "tab": "General information", - "score": 311.0 - }, - "Philosophy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Philosophy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Philosophy - # prompt tokens": { - "description": "min=381.704, mean=381.704, max=381.704, sum=763.408 (2)", - "tab": "General information", - "score": 381.7041800643087 - }, - "Philosophy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" - } - } - }, - { - "evaluation_name": "Professional Psychology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Professional Psychology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.739, - "details": { - "description": "min=0.739, mean=0.739, max=0.739, sum=1.477 (2)", - "tab": "Accuracy", - "Professional Medicine - Observed inference time (s)": { - "description": "min=0.127, mean=0.127, max=0.127, sum=0.255 (2)", - "tab": "Efficiency", - "score": 0.12727573529411765 - }, - "Professional Accounting - Observed inference time (s)": { - "description": "min=0.117, mean=0.117, max=0.117, sum=0.234 (2)", - "tab": "Efficiency", - "score": 0.11683687943262412 - }, - "Professional Law - Observed inference time (s)": { - "description": "min=0.128, mean=0.128, max=0.128, sum=0.256 (2)", - "tab": "Efficiency", - "score": 0.1279393741851367 - }, - "Professional Psychology - Observed inference time (s)": { - "description": "min=0.111, mean=0.111, max=0.111, sum=0.221 (2)", - "tab": "Efficiency", - "score": 0.11058333333333302 - }, - "Professional Medicine - # eval": { - "description": "min=272, mean=272, max=272, sum=544 (2)", - "tab": "General information", - "score": 272.0 - }, - "Professional Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Medicine - # prompt tokens": { - "description": "min=1146.287, mean=1146.287, max=1146.287, sum=2292.574 (2)", - "tab": "General 
information", - "score": 1146.2867647058824 - }, - "Professional Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Accounting - # eval": { - "description": "min=282, mean=282, max=282, sum=564 (2)", - "tab": "General information", - "score": 282.0 - }, - "Professional Accounting - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Accounting - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Accounting - # prompt tokens": { - "description": "min=688.72, mean=688.72, max=688.72, sum=1377.44 (2)", - "tab": "General information", - "score": 688.7198581560284 - }, - "Professional Accounting - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Law - # eval": { - "description": "min=1534, mean=1534, max=1534, sum=3068 (2)", - "tab": "General information", - "score": 1534.0 - }, - "Professional Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # prompt tokens": { - "description": "min=1686.73, mean=1686.73, max=1686.73, sum=3373.46 (2)", - "tab": "General information", - "score": 1686.7301173402868 - }, - "Professional Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Psychology - # eval": { - "description": "min=612, mean=612, max=612, sum=1224 (2)", - "tab": "General information", - "score": 612.0 - }, - "Professional Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # prompt tokens": { - "description": "min=625.574, mean=625.574, max=625.574, sum=1251.147 (2)", - "tab": "General information", - "score": 625.5735294117648 - }, - "Professional Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" - } - } - }, - { - "evaluation_name": "Us Foreign Policy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Us Foreign Policy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9, - "details": { - "description": "min=0.9, mean=0.9, max=0.9, sum=1.8 (2)", - "tab": "Accuracy", - "Us Foreign Policy - Observed inference time (s)": { - "description": "min=0.113, mean=0.113, max=0.113, sum=0.226 (2)", - "tab": "Efficiency", - "score": 0.11315000000000004 - }, - "Us Foreign Policy - # eval": { - 
"description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Us Foreign Policy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Us Foreign Policy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Us Foreign Policy - # prompt tokens": { - "description": "min=461.12, mean=461.12, max=461.12, sum=922.24 (2)", - "tab": "General information", - "score": 461.12 - }, - "Us Foreign Policy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" - } - } - }, - { - "evaluation_name": "Astronomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Astronomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.822, - "details": { - "description": "min=0.822, mean=0.822, max=0.822, sum=1.645 (2)", - "tab": "Accuracy", - "Astronomy - Observed inference time (s)": { - "description": "min=0.116, mean=0.116, max=0.116, sum=0.232 (2)", - "tab": "Efficiency", - "score": 0.11597368421052637 - }, - "Astronomy - # eval": { - "description": "min=152, mean=152, max=152, sum=304 (2)", - "tab": "General information", - "score": 152.0 - }, - "Astronomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Astronomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Astronomy - # prompt tokens": { - "description": "min=628.112, mean=628.112, max=628.112, sum=1256.224 (2)", - "tab": "General information", - "score": 628.1118421052631 - }, - "Astronomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" - } - } - }, - { - "evaluation_name": "Business Ethics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Business Ethics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.71, - "details": { - "description": "min=0.71, mean=0.71, max=0.71, sum=1.42 (2)", - "tab": "Accuracy", - "Business Ethics - Observed inference time (s)": { - "description": "min=0.118, mean=0.118, max=0.118, sum=0.237 (2)", - "tab": "Efficiency", - "score": 0.11840000000000003 - }, - "Business Ethics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Business Ethics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": 
"General information", - "score": 5.0 - }, - "Business Ethics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Business Ethics - # prompt tokens": { - "description": "min=617.46, mean=617.46, max=617.46, sum=1234.92 (2)", - "tab": "General information", - "score": 617.46 - }, - "Business Ethics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" - } - } - }, - { - "evaluation_name": "Clinical Knowledge", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Clinical Knowledge", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.751, - "details": { - "description": "min=0.751, mean=0.751, max=0.751, sum=1.502 (2)", - "tab": "Accuracy", - "Clinical Knowledge - Observed inference time (s)": { - "description": "min=0.115, mean=0.115, max=0.115, sum=0.23 (2)", - "tab": "Efficiency", - "score": 0.11494716981132078 - }, - "Clinical Knowledge - # eval": { - "description": "min=265, mean=265, max=265, sum=530 (2)", - "tab": "General information", - "score": 265.0 - }, - "Clinical Knowledge - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Clinical Knowledge - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Clinical Knowledge - # prompt tokens": { - "description": "min=451.925, mean=451.925, max=451.925, sum=903.849 (2)", - "tab": "General information", - "score": 451.92452830188677 - }, - "Clinical Knowledge - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" - } - } - }, - { - "evaluation_name": "Conceptual Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Conceptual Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.706, - "details": { - "description": "min=0.706, mean=0.706, max=0.706, sum=1.413 (2)", - "tab": "Accuracy", - "Conceptual Physics - Observed inference time (s)": { - "description": "min=0.105, mean=0.105, max=0.105, sum=0.21 (2)", - "tab": "Efficiency", - "score": 0.10520000000000002 - }, - "Conceptual Physics - # eval": { - "description": "min=235, mean=235, max=235, sum=470 (2)", - "tab": "General information", - "score": 235.0 - }, - "Conceptual Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Conceptual Physics - truncated": { - "description": "min=0, mean=0, 
max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Conceptual Physics - # prompt tokens": { - "description": "min=341.723, mean=341.723, max=341.723, sum=683.447 (2)", - "tab": "General information", - "score": 341.72340425531917 - }, - "Conceptual Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" - } - } - }, - { - "evaluation_name": "Electrical Engineering", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Electrical Engineering", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.683, - "details": { - "description": "min=0.683, mean=0.683, max=0.683, sum=1.366 (2)", - "tab": "Accuracy", - "Electrical Engineering - Observed inference time (s)": { - "description": "min=0.109, mean=0.109, max=0.109, sum=0.218 (2)", - "tab": "Efficiency", - "score": 0.10906896551724135 - }, - "Electrical Engineering - # eval": { - "description": "min=145, mean=145, max=145, sum=290 (2)", - "tab": "General information", - "score": 145.0 - }, - "Electrical Engineering - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Electrical Engineering - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Electrical Engineering - # prompt tokens": { - "description": "min=458.345, mean=458.345, max=458.345, sum=916.69 (2)", - "tab": "General information", - "score": 458.3448275862069 - }, - "Electrical Engineering - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" - } - } - }, - { - "evaluation_name": "Elementary Mathematics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Elementary Mathematics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.55, - "details": { - "description": "min=0.55, mean=0.55, max=0.55, sum=1.101 (2)", - "tab": "Accuracy", - "Elementary Mathematics - Observed inference time (s)": { - "description": "min=0.116, mean=0.116, max=0.116, sum=0.232 (2)", - "tab": "Efficiency", - "score": 0.11621164021164002 - }, - "Elementary Mathematics - # eval": { - "description": "min=378, mean=378, max=378, sum=756 (2)", - "tab": "General information", - "score": 378.0 - }, - "Elementary Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Elementary Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - 
"tab": "General information", - "score": 0.0 - }, - "Elementary Mathematics - # prompt tokens": { - "description": "min=534.09, mean=534.09, max=534.09, sum=1068.18 (2)", - "tab": "General information", - "score": 534.0899470899471 - }, - "Elementary Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" - } - } - }, - { - "evaluation_name": "Formal Logic", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Formal Logic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.508, - "details": { - "description": "min=0.508, mean=0.508, max=0.508, sum=1.016 (2)", - "tab": "Accuracy", - "Formal Logic - Observed inference time (s)": { - "description": "min=0.113, mean=0.113, max=0.113, sum=0.226 (2)", - "tab": "Efficiency", - "score": 0.112968253968254 - }, - "Formal Logic - # eval": { - "description": "min=126, mean=126, max=126, sum=252 (2)", - "tab": "General information", - "score": 126.0 - }, - "Formal Logic - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Formal Logic - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Formal Logic - # prompt tokens": { - "description": "min=669, mean=669, max=669, sum=1338 (2)", - "tab": "General information", - "score": 669.0 - }, - "Formal Logic - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" - } - } - }, - { - "evaluation_name": "High School World History", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on High School World History", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.84, - "details": { - "description": "min=0.84, mean=0.84, max=0.84, sum=1.679 (2)", - "tab": "Accuracy", - "High School Biology - Observed inference time (s)": { - "description": "min=0.112, mean=0.112, max=0.112, sum=0.224 (2)", - "tab": "Efficiency", - "score": 0.11209354838709669 - }, - "High School Chemistry - Observed inference time (s)": { - "description": "min=0.113, mean=0.113, max=0.113, sum=0.226 (2)", - "tab": "Efficiency", - "score": 0.11317733990147788 - }, - "High School Computer Science - Observed inference time (s)": { - "description": "min=0.12, mean=0.12, max=0.12, sum=0.24 (2)", - "tab": "Efficiency", - "score": 0.11999000000000004 - }, - "High School European History - Observed inference time (s)": { - "description": "min=0.152, mean=0.152, max=0.152, sum=0.303 (2)", - "tab": "Efficiency", 
- "score": 0.1516909090909091 - }, - "High School Geography - Observed inference time (s)": { - "description": "min=0.11, mean=0.11, max=0.11, sum=0.22 (2)", - "tab": "Efficiency", - "score": 0.11011616161616171 - }, - "High School Government And Politics - Observed inference time (s)": { - "description": "min=0.108, mean=0.108, max=0.108, sum=0.216 (2)", - "tab": "Efficiency", - "score": 0.10789637305699486 - }, - "High School Macroeconomics - Observed inference time (s)": { - "description": "min=0.11, mean=0.11, max=0.11, sum=0.221 (2)", - "tab": "Efficiency", - "score": 0.11032307692307693 - }, - "High School Mathematics - Observed inference time (s)": { - "description": "min=0.113, mean=0.113, max=0.113, sum=0.226 (2)", - "tab": "Efficiency", - "score": 0.11290000000000003 - }, - "High School Microeconomics - Observed inference time (s)": { - "description": "min=0.11, mean=0.11, max=0.11, sum=0.219 (2)", - "tab": "Efficiency", - "score": 0.10956302521008413 - }, - "High School Physics - Observed inference time (s)": { - "description": "min=0.116, mean=0.116, max=0.116, sum=0.231 (2)", - "tab": "Efficiency", - "score": 0.11561589403973516 - }, - "High School Psychology - Observed inference time (s)": { - "description": "min=0.11, mean=0.11, max=0.11, sum=0.22 (2)", - "tab": "Efficiency", - "score": 0.11005137614678874 - }, - "High School Statistics - Observed inference time (s)": { - "description": "min=0.116, mean=0.116, max=0.116, sum=0.233 (2)", - "tab": "Efficiency", - "score": 0.11631018518518522 - }, - "High School US History - Observed inference time (s)": { - "description": "min=0.139, mean=0.139, max=0.139, sum=0.279 (2)", - "tab": "Efficiency", - "score": 0.13944117647058826 - }, - "High School World History - Observed inference time (s)": { - "description": "min=0.134, mean=0.134, max=0.134, sum=0.268 (2)", - "tab": "Efficiency", - "score": 0.13399578059071726 - }, - "High School Biology - # eval": { - "description": "min=310, mean=310, max=310, sum=620 (2)", - "tab": "General information", - "score": 310.0 - }, - "High School Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Biology - # prompt tokens": { - "description": "min=568.748, mean=568.748, max=568.748, sum=1137.497 (2)", - "tab": "General information", - "score": 568.7483870967742 - }, - "High School Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Chemistry - # eval": { - "description": "min=203, mean=203, max=203, sum=406 (2)", - "tab": "General information", - "score": 203.0 - }, - "High School Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # prompt tokens": { - "description": "min=523.65, mean=523.65, max=523.65, sum=1047.3 (2)", - "tab": "General information", - "score": 523.6502463054187 - }, - "High School Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Computer Science - # eval": { - "description": "min=100, mean=100, max=100, 
sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "High School Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # prompt tokens": { - "description": "min=932.15, mean=932.15, max=932.15, sum=1864.3 (2)", - "tab": "General information", - "score": 932.15 - }, - "High School Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School European History - # eval": { - "description": "min=165, mean=165, max=165, sum=330 (2)", - "tab": "General information", - "score": 165.0 - }, - "High School European History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School European History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School European History - # prompt tokens": { - "description": "min=2872.03, mean=2872.03, max=2872.03, sum=5744.061 (2)", - "tab": "General information", - "score": 2872.030303030303 - }, - "High School European History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Geography - # eval": { - "description": "min=198, mean=198, max=198, sum=396 (2)", - "tab": "General information", - "score": 198.0 - }, - "High School Geography - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Geography - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Geography - # prompt tokens": { - "description": "min=425.646, mean=425.646, max=425.646, sum=851.293 (2)", - "tab": "General information", - "score": 425.64646464646466 - }, - "High School Geography - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Government And Politics - # eval": { - "description": "min=193, mean=193, max=193, sum=386 (2)", - "tab": "General information", - "score": 193.0 - }, - "High School Government And Politics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Government And Politics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # prompt tokens": { - "description": "min=506.073, mean=506.073, max=506.073, sum=1012.145 (2)", - "tab": "General information", - "score": 506.07253886010363 - }, - "High School Government And Politics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Macroeconomics - # eval": { - "description": "min=390, mean=390, max=390, sum=780 (2)", - "tab": "General information", - "score": 390.0 - }, - "High School Macroeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Macroeconomics - truncated": { - "description": "min=0, mean=0, max=0, 
sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - # prompt tokens": { - "description": "min=419.987, mean=419.987, max=419.987, sum=839.974 (2)", - "tab": "General information", - "score": 419.9871794871795 - }, - "High School Macroeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Mathematics - # eval": { - "description": "min=270, mean=270, max=270, sum=540 (2)", - "tab": "General information", - "score": 270.0 - }, - "High School Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # prompt tokens": { - "description": "min=554.352, mean=554.352, max=554.352, sum=1108.704 (2)", - "tab": "General information", - "score": 554.3518518518518 - }, - "High School Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Microeconomics - # eval": { - "description": "min=238, mean=238, max=238, sum=476 (2)", - "tab": "General information", - "score": 238.0 - }, - "High School Microeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Microeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Microeconomics - # prompt tokens": { - "description": "min=439.055, mean=439.055, max=439.055, sum=878.109 (2)", - "tab": "General information", - "score": 439.0546218487395 - }, - "High School Microeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Physics - # eval": { - "description": "min=151, mean=151, max=151, sum=302 (2)", - "tab": "General information", - "score": 151.0 - }, - "High School Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # prompt tokens": { - "description": "min=581.669, mean=581.669, max=581.669, sum=1163.338 (2)", - "tab": "General information", - "score": 581.6688741721854 - }, - "High School Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Psychology - # eval": { - "description": "min=545, mean=545, max=545, sum=1090 (2)", - "tab": "General information", - "score": 545.0 - }, - "High School Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # prompt tokens": { - "description": "min=544.842, mean=544.842, max=544.842, sum=1089.684 (2)", - "tab": "General information", - "score": 544.8422018348624 - }, - "High School Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - 
"score": 1.0 - }, - "High School Statistics - # eval": { - "description": "min=216, mean=216, max=216, sum=432 (2)", - "tab": "General information", - "score": 216.0 - }, - "High School Statistics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Statistics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Statistics - # prompt tokens": { - "description": "min=833, mean=833, max=833, sum=1666 (2)", - "tab": "General information", - "score": 833.0 - }, - "High School Statistics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School US History - # eval": { - "description": "min=204, mean=204, max=204, sum=408 (2)", - "tab": "General information", - "score": 204.0 - }, - "High School US History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School US History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # prompt tokens": { - "description": "min=2270.25, mean=2270.25, max=2270.25, sum=4540.5 (2)", - "tab": "General information", - "score": 2270.25 - }, - "High School US History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School World History - # eval": { - "description": "min=237, mean=237, max=237, sum=474 (2)", - "tab": "General information", - "score": 237.0 - }, - "High School World History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School World History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School World History - # prompt tokens": { - "description": "min=1466.561, mean=1466.561, max=1466.561, sum=2933.122 (2)", - "tab": "General information", - "score": 1466.5611814345991 - }, - "High School World History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" - } - } - }, - { - "evaluation_name": "Human Sexuality", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Human Sexuality", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.824, - "details": { - "description": "min=0.824, mean=0.824, max=0.824, sum=1.649 (2)", - "tab": "Accuracy", - "Human Aging - Observed inference time (s)": { - "description": "min=0.104, mean=0.104, max=0.104, sum=0.208 (2)", - "tab": "Efficiency", - "score": 0.10423766816143511 - }, - "Human Sexuality - Observed inference time (s)": { - "description": "min=0.112, mean=0.112, max=0.112, sum=0.224 (2)", - "tab": "Efficiency", - "score": 0.11212213740458017 - }, - "Human Aging - # eval": { - "description": 
"min=223, mean=223, max=223, sum=446 (2)", - "tab": "General information", - "score": 223.0 - }, - "Human Aging - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Aging - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Aging - # prompt tokens": { - "description": "min=352.48, mean=352.48, max=352.48, sum=704.96 (2)", - "tab": "General information", - "score": 352.47982062780267 - }, - "Human Aging - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Human Sexuality - # eval": { - "description": "min=131, mean=131, max=131, sum=262 (2)", - "tab": "General information", - "score": 131.0 - }, - "Human Sexuality - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Sexuality - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # prompt tokens": { - "description": "min=385.626, mean=385.626, max=385.626, sum=771.252 (2)", - "tab": "General information", - "score": 385.62595419847327 - }, - "Human Sexuality - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" - } - } - }, - { - "evaluation_name": "International Law", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on International Law", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.843, - "details": { - "description": "min=0.843, mean=0.843, max=0.843, sum=1.686 (2)", - "tab": "Accuracy", - "International Law - Observed inference time (s)": { - "description": "min=0.111, mean=0.111, max=0.111, sum=0.221 (2)", - "tab": "Efficiency", - "score": 0.11063636363636367 - }, - "International Law - # eval": { - "description": "min=121, mean=121, max=121, sum=242 (2)", - "tab": "General information", - "score": 121.0 - }, - "International Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "International Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "International Law - # prompt tokens": { - "description": "min=667.843, mean=667.843, max=667.843, sum=1335.686 (2)", - "tab": "General information", - "score": 667.8429752066115 - }, - "International Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" - } - } - }, - { - "evaluation_name": "Logical Fallacies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Logical Fallacies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.798, - "details": { - "description": "min=0.798, mean=0.798, max=0.798, sum=1.595 (2)", - "tab": "Accuracy", - "Logical Fallacies - Observed inference time (s)": { - "description": "min=0.111, mean=0.111, max=0.111, sum=0.221 (2)", - "tab": "Efficiency", - "score": 0.11058895705521476 - }, - "Logical Fallacies - # eval": { - "description": "min=163, mean=163, max=163, sum=326 (2)", - "tab": "General information", - "score": 163.0 - }, - "Logical Fallacies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Logical Fallacies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Logical Fallacies - # prompt tokens": { - "description": "min=482.227, mean=482.227, max=482.227, sum=964.454 (2)", - "tab": "General information", - "score": 482.2269938650307 - }, - "Logical Fallacies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" - } - } - }, - { - "evaluation_name": "Machine Learning", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Machine Learning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.562, - "details": { - "description": "min=0.562, mean=0.562, max=0.562, sum=1.125 (2)", - "tab": "Accuracy", - "Machine Learning - Observed inference time (s)": { - "description": "min=0.115, mean=0.115, max=0.115, sum=0.231 (2)", - "tab": "Efficiency", - "score": 0.11541964285714289 - }, - "Machine Learning - # eval": { - "description": "min=112, mean=112, max=112, sum=224 (2)", - "tab": "General information", - "score": 112.0 - }, - "Machine Learning - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Machine Learning - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Machine Learning - # prompt tokens": { - "description": "min=699.598, mean=699.598, max=699.598, sum=1399.196 (2)", - "tab": "General information", - "score": 699.5982142857143 - }, - "Machine Learning - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" - } - } - }, - { - "evaluation_name": "Management", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - 
}, - "metric_config": { - "evaluation_description": "EM on Management", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.816, - "details": { - "description": "min=0.816, mean=0.816, max=0.816, sum=1.631 (2)", - "tab": "Accuracy", - "Management - Observed inference time (s)": { - "description": "min=0.102, mean=0.102, max=0.102, sum=0.205 (2)", - "tab": "Efficiency", - "score": 0.10230097087378638 - }, - "Management - # eval": { - "description": "min=103, mean=103, max=103, sum=206 (2)", - "tab": "General information", - "score": 103.0 - }, - "Management - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Management - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Management - # prompt tokens": { - "description": "min=320.34, mean=320.34, max=320.34, sum=640.68 (2)", - "tab": "General information", - "score": 320.3398058252427 - }, - "Management - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" - } - } - }, - { - "evaluation_name": "Marketing", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Marketing", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.91, - "details": { - "description": "min=0.91, mean=0.91, max=0.91, sum=1.821 (2)", - "tab": "Accuracy", - "Marketing - Observed inference time (s)": { - "description": "min=0.112, mean=0.112, max=0.112, sum=0.223 (2)", - "tab": "Efficiency", - "score": 0.11152136752136761 - }, - "Marketing - # eval": { - "description": "min=234, mean=234, max=234, sum=468 (2)", - "tab": "General information", - "score": 234.0 - }, - "Marketing - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Marketing - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Marketing - # prompt tokens": { - "description": "min=466.697, mean=466.697, max=466.697, sum=933.393 (2)", - "tab": "General information", - "score": 466.6965811965812 - }, - "Marketing - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" - } - } - }, - { - "evaluation_name": "Medical Genetics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Medical Genetics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.82, - "details": { - "description": 
"min=0.82, mean=0.82, max=0.82, sum=1.64 (2)", - "tab": "Accuracy", - "Medical Genetics - Observed inference time (s)": { - "description": "min=0.106, mean=0.106, max=0.106, sum=0.212 (2)", - "tab": "Efficiency", - "score": 0.10620000000000003 - }, - "Medical Genetics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Medical Genetics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Medical Genetics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Medical Genetics - # prompt tokens": { - "description": "min=380.71, mean=380.71, max=380.71, sum=761.42 (2)", - "tab": "General information", - "score": 380.71 - }, - "Medical Genetics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" - } - } - }, - { - "evaluation_name": "Miscellaneous", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Miscellaneous", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.83, - "details": { - "description": "min=0.83, mean=0.83, max=0.83, sum=1.66 (2)", - "tab": "Accuracy", - "Miscellaneous - Observed inference time (s)": { - "description": "min=0.105, mean=0.105, max=0.105, sum=0.21 (2)", - "tab": "Efficiency", - "score": 0.10505236270753474 - }, - "Miscellaneous - # eval": { - "description": "min=783, mean=783, max=783, sum=1566 (2)", - "tab": "General information", - "score": 783.0 - }, - "Miscellaneous - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Miscellaneous - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Miscellaneous - # prompt tokens": { - "description": "min=342.847, mean=342.847, max=342.847, sum=685.693 (2)", - "tab": "General information", - "score": 342.84674329501917 - }, - "Miscellaneous - # output tokens": { - "description": "min=0.999, mean=0.999, max=0.999, sum=1.997 (2)", - "tab": "General information", - "score": 0.9987228607918263 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" - } - } - }, - { - "evaluation_name": "Moral Scenarios", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Moral Scenarios", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.464, - "details": { - "description": "min=0.464, mean=0.464, max=0.464, sum=0.927 (2)", - "tab": "Accuracy", - "Moral Disputes - Observed inference time (s)": { - "description": 
"min=0.112, mean=0.112, max=0.112, sum=0.225 (2)", - "tab": "Efficiency", - "score": 0.11246242774566474 - }, - "Moral Scenarios - Observed inference time (s)": { - "description": "min=0.112, mean=0.112, max=0.112, sum=0.223 (2)", - "tab": "Efficiency", - "score": 0.11168156424580966 - }, - "Moral Disputes - # eval": { - "description": "min=346, mean=346, max=346, sum=692 (2)", - "tab": "General information", - "score": 346.0 - }, - "Moral Disputes - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Disputes - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Disputes - # prompt tokens": { - "description": "min=525.329, mean=525.329, max=525.329, sum=1050.659 (2)", - "tab": "General information", - "score": 525.3294797687861 - }, - "Moral Disputes - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Moral Scenarios - # eval": { - "description": "min=895, mean=895, max=895, sum=1790 (2)", - "tab": "General information", - "score": 895.0 - }, - "Moral Scenarios - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Scenarios - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # prompt tokens": { - "description": "min=692.482, mean=692.482, max=692.482, sum=1384.963 (2)", - "tab": "General information", - "score": 692.4815642458101 - }, - "Moral Scenarios - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" - } - } - }, - { - "evaluation_name": "Nutrition", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Nutrition", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.778, - "details": { - "description": "min=0.778, mean=0.778, max=0.778, sum=1.556 (2)", - "tab": "Accuracy", - "Nutrition - Observed inference time (s)": { - "description": "min=0.109, mean=0.109, max=0.109, sum=0.219 (2)", - "tab": "Efficiency", - "score": 0.1093660130718955 - }, - "Nutrition - # eval": { - "description": "min=306, mean=306, max=306, sum=612 (2)", - "tab": "General information", - "score": 306.0 - }, - "Nutrition - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Nutrition - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Nutrition - # prompt tokens": { - "description": "min=612.69, mean=612.69, max=612.69, sum=1225.379 (2)", - "tab": "General information", - "score": 612.6895424836601 - }, - "Nutrition - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "nutrition", - "method": 
"multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" - } - } - }, - { - "evaluation_name": "Prehistory", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Prehistory", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.787, - "details": { - "description": "min=0.787, mean=0.787, max=0.787, sum=1.574 (2)", - "tab": "Accuracy", - "Prehistory - Observed inference time (s)": { - "description": "min=0.11, mean=0.11, max=0.11, sum=0.22 (2)", - "tab": "Efficiency", - "score": 0.1099814814814816 - }, - "Prehistory - # eval": { - "description": "min=324, mean=324, max=324, sum=648 (2)", - "tab": "General information", - "score": 324.0 - }, - "Prehistory - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Prehistory - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Prehistory - # prompt tokens": { - "description": "min=552.454, mean=552.454, max=552.454, sum=1104.907 (2)", - "tab": "General information", - "score": 552.4537037037037 - }, - "Prehistory - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" - } - } - }, - { - "evaluation_name": "Public Relations", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Public Relations", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.673, - "details": { - "description": "min=0.673, mean=0.673, max=0.673, sum=1.345 (2)", - "tab": "Accuracy", - "Public Relations - Observed inference time (s)": { - "description": "min=0.108, mean=0.108, max=0.108, sum=0.215 (2)", - "tab": "Efficiency", - "score": 0.1075000000000001 - }, - "Public Relations - # eval": { - "description": "min=110, mean=110, max=110, sum=220 (2)", - "tab": "General information", - "score": 110.0 - }, - "Public Relations - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Public Relations - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Public Relations - # prompt tokens": { - "description": "min=448.609, mean=448.609, max=448.609, sum=897.218 (2)", - "tab": "General information", - "score": 448.6090909090909 - }, - "Public Relations - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" - } - } - }, - { - "evaluation_name": "Security Studies", - "source_data": { - "dataset_name": "helm_mmlu", - 
"source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Security Studies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.718, - "details": { - "description": "min=0.718, mean=0.718, max=0.718, sum=1.437 (2)", - "tab": "Accuracy", - "Security Studies - Observed inference time (s)": { - "description": "min=0.122, mean=0.122, max=0.122, sum=0.244 (2)", - "tab": "Efficiency", - "score": 0.12202448979591832 - }, - "Security Studies - # eval": { - "description": "min=245, mean=245, max=245, sum=490 (2)", - "tab": "General information", - "score": 245.0 - }, - "Security Studies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Security Studies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Security Studies - # prompt tokens": { - "description": "min=1224.433, mean=1224.433, max=1224.433, sum=2448.865 (2)", - "tab": "General information", - "score": 1224.4326530612245 - }, - "Security Studies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" - } - } - }, - { - "evaluation_name": "Sociology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Sociology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.846, - "details": { - "description": "min=0.846, mean=0.846, max=0.846, sum=1.692 (2)", - "tab": "Accuracy", - "Sociology - Observed inference time (s)": { - "description": "min=0.11, mean=0.11, max=0.11, sum=0.221 (2)", - "tab": "Efficiency", - "score": 0.11042288557213926 - }, - "Sociology - # eval": { - "description": "min=201, mean=201, max=201, sum=402 (2)", - "tab": "General information", - "score": 201.0 - }, - "Sociology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Sociology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Sociology - # prompt tokens": { - "description": "min=474.512, mean=474.512, max=474.512, sum=949.025 (2)", - "tab": "General information", - "score": 474.5124378109453 - }, - "Sociology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" - } - } - }, - { - "evaluation_name": "Virology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - 
"evaluation_description": "EM on Virology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.524, - "details": { - "description": "min=0.524, mean=0.524, max=0.524, sum=1.048 (2)", - "tab": "Accuracy", - "Virology - Observed inference time (s)": { - "description": "min=0.104, mean=0.104, max=0.104, sum=0.209 (2)", - "tab": "Efficiency", - "score": 0.10432530120481927 - }, - "Virology - # eval": { - "description": "min=166, mean=166, max=166, sum=332 (2)", - "tab": "General information", - "score": 166.0 - }, - "Virology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Virology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Virology - # prompt tokens": { - "description": "min=380.753, mean=380.753, max=380.753, sum=761.506 (2)", - "tab": "General information", - "score": 380.7530120481928 - }, - "Virology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" - } - } - }, - { - "evaluation_name": "World Religions", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on World Religions", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.825, - "details": { - "description": "min=0.825, mean=0.825, max=0.825, sum=1.649 (2)", - "tab": "Accuracy", - "World Religions - Observed inference time (s)": { - "description": "min=0.104, mean=0.104, max=0.104, sum=0.208 (2)", - "tab": "Efficiency", - "score": 0.10395321637426902 - }, - "World Religions - # eval": { - "description": "min=171, mean=171, max=171, sum=342 (2)", - "tab": "General information", - "score": 171.0 - }, - "World Religions - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "World Religions - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "World Religions - # prompt tokens": { - "description": "min=305.386, mean=305.386, max=305.386, sum=610.772 (2)", - "tab": "General information", - "score": 305.3859649122807 - }, - "World Religions - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" - } - } - }, - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 1.0, - "details": { - "tab": "Efficiency" - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_mmlu/amazon/nova-pro-v1_0/ecfa0e26-edff-46e4-8954-6f07a0e6fca0.json b/data/helm_mmlu/amazon/nova-pro-v1_0/ecfa0e26-edff-46e4-8954-6f07a0e6fca0.json deleted file mode 100644 index af30c4448..000000000 --- a/data/helm_mmlu/amazon/nova-pro-v1_0/ecfa0e26-edff-46e4-8954-6f07a0e6fca0.json +++ /dev/null @@ -1,3021 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/amazon_nova-pro-v1:0/1770835937.459157", - "retrieved_timestamp": "1770835937.459157", - "source_metadata": { - "source_name": "helm_mmlu", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Amazon Nova Pro", - "id": "amazon/nova-pro-v1:0", - "developer": "amazon", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "MMLU All Subjects", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU All Subjects", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.82, - "details": { - "description": "min=0.54, mean=0.82, max=0.974, sum=93.477 (114)", - "tab": "Accuracy", - "MMLU All Subjects - Observed inference time (s)": { - "description": "min=0.128, mean=0.14, max=0.17, sum=15.944 (114)", - "tab": "Efficiency", - "score": 0.13986169479756677 - }, - "MMLU All Subjects - # eval": { - "description": "min=100, mean=246.351, max=1534, sum=28084 (114)", - "tab": "General information", - "score": 246.35087719298247 - }, - "MMLU All Subjects - # train": { - "description": "min=5, mean=5, max=5, sum=570 (114)", - "tab": "General information", - "score": 5.0 - }, - "MMLU All Subjects - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (114)", - "tab": "General information", - "score": 0.0 - }, - "MMLU All Subjects - # prompt tokens": { - "description": "min=305.386, mean=655.489, max=2872.03, sum=74725.746 (114)", - "tab": "General information", - "score": 655.4890026560713 - }, - "MMLU All Subjects - # output tokens": { - "description": "min=1, mean=1, max=1, sum=114 (114)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", 
- "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] - } - } - }, - { - "evaluation_name": "Abstract Algebra", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Abstract Algebra", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.69, - "details": { - "description": "min=0.69, mean=0.69, max=0.69, sum=1.38 (2)", - "tab": "Accuracy", - "Abstract Algebra - Observed inference time (s)": { - "description": "min=0.152, mean=0.152, max=0.152, sum=0.305 (2)", - "tab": "Efficiency", - "score": 0.15239000000000003 - }, - "Abstract Algebra - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Abstract Algebra - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Abstract Algebra - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Abstract Algebra - # prompt tokens": { - "description": "min=399.38, mean=399.38, max=399.38, sum=798.76 (2)", - "tab": "General information", - "score": 399.38 - }, - "Abstract Algebra - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - 
"tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" - } - } - }, - { - "evaluation_name": "Anatomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Anatomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.807, - "details": { - "description": "min=0.807, mean=0.807, max=0.807, sum=1.615 (2)", - "tab": "Accuracy", - "Anatomy - Observed inference time (s)": { - "description": "min=0.138, mean=0.138, max=0.138, sum=0.275 (2)", - "tab": "Efficiency", - "score": 0.13757037037037034 - }, - "Anatomy - # eval": { - "description": "min=135, mean=135, max=135, sum=270 (2)", - "tab": "General information", - "score": 135.0 - }, - "Anatomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Anatomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Anatomy - # prompt tokens": { - "description": "min=400.081, mean=400.081, max=400.081, sum=800.163 (2)", - "tab": "General information", - "score": 400.0814814814815 - }, - "Anatomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" - } - } - }, - { - "evaluation_name": "College Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on College Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.647, - "details": { - "description": "min=0.647, mean=0.647, max=0.647, sum=1.294 (2)", - "tab": "Accuracy", - "College Chemistry - Observed inference time (s)": { - "description": "min=0.148, mean=0.148, max=0.148, sum=0.296 (2)", - "tab": "Efficiency", - "score": 0.14806999999999998 - }, - "College Biology - Observed inference time (s)": { - "description": "min=0.148, mean=0.148, max=0.148, sum=0.296 (2)", - "tab": "Efficiency", - "score": 0.14820138888888884 - }, - "College Computer Science - Observed inference time (s)": { - "description": "min=0.152, mean=0.152, max=0.152, sum=0.305 (2)", - "tab": "Efficiency", - "score": 0.15245 - }, - "College Mathematics - Observed inference time (s)": { - "description": "min=0.151, mean=0.151, max=0.151, sum=0.303 (2)", - "tab": "Efficiency", - "score": 0.15141 - }, - "College Medicine - Observed inference time (s)": { - "description": "min=0.143, mean=0.143, max=0.143, sum=0.287 (2)", - "tab": "Efficiency", - "score": 0.1433988439306358 - }, - "College Physics - Observed inference time (s)": { - "description": "min=0.146, mean=0.146, max=0.146, sum=0.292 (2)", - "tab": "Efficiency", - "score": 0.14623529411764705 - }, - "College Chemistry - # eval": 
{ - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Chemistry - # prompt tokens": { - "description": "min=573.4, mean=573.4, max=573.4, sum=1146.8 (2)", - "tab": "General information", - "score": 573.4 - }, - "College Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Biology - # eval": { - "description": "min=144, mean=144, max=144, sum=288 (2)", - "tab": "General information", - "score": 144.0 - }, - "College Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # prompt tokens": { - "description": "min=510.278, mean=510.278, max=510.278, sum=1020.556 (2)", - "tab": "General information", - "score": 510.27777777777777 - }, - "College Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer Science - # prompt tokens": { - "description": "min=880.15, mean=880.15, max=880.15, sum=1760.3 (2)", - "tab": "General information", - "score": 880.15 - }, - "College Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Mathematics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # prompt tokens": { - "description": "min=639.53, mean=639.53, max=639.53, sum=1279.06 (2)", - "tab": "General information", - "score": 639.53 - }, - "College Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Medicine - # eval": { - "description": "min=173, mean=173, max=173, sum=346 (2)", - "tab": "General information", - "score": 173.0 - }, - "College Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Medicine - # prompt tokens": { - "description": "min=558.301, mean=558.301, max=558.301, sum=1116.601 (2)", - "tab": 
"General information", - "score": 558.3005780346821 - }, - "College Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Physics - # eval": { - "description": "min=102, mean=102, max=102, sum=204 (2)", - "tab": "General information", - "score": 102.0 - }, - "College Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Physics - # prompt tokens": { - "description": "min=517.324, mean=517.324, max=517.324, sum=1034.647 (2)", - "tab": "General information", - "score": 517.3235294117648 - }, - "College Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" - } - } - }, - { - "evaluation_name": "Computer Security", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Computer Security", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.84, - "details": { - "description": "min=0.84, mean=0.84, max=0.84, sum=1.68 (2)", - "tab": "Accuracy", - "Computer Security - Observed inference time (s)": { - "description": "min=0.141, mean=0.141, max=0.141, sum=0.281 (2)", - "tab": "Efficiency", - "score": 0.14067000000000005 - }, - "Computer Security - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Computer Security - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Computer Security - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Computer Security - # prompt tokens": { - "description": "min=415.4, mean=415.4, max=415.4, sum=830.8 (2)", - "tab": "General information", - "score": 415.4 - }, - "Computer Security - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" - } - } - }, - { - "evaluation_name": "Econometrics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Econometrics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.702, - "details": { - "description": "min=0.702, mean=0.702, max=0.702, sum=1.404 (2)", - "tab": "Accuracy", - "Econometrics - Observed inference time (s)": { - "description": "min=0.142, mean=0.142, max=0.142, sum=0.285 (2)", - "tab": 
"Efficiency", - "score": 0.1423421052631579 - }, - "Econometrics - # eval": { - "description": "min=114, mean=114, max=114, sum=228 (2)", - "tab": "General information", - "score": 114.0 - }, - "Econometrics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Econometrics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Econometrics - # prompt tokens": { - "description": "min=652.07, mean=652.07, max=652.07, sum=1304.14 (2)", - "tab": "General information", - "score": 652.0701754385965 - }, - "Econometrics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" - } - } - }, - { - "evaluation_name": "Global Facts", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Global Facts", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.54, - "details": { - "description": "min=0.54, mean=0.54, max=0.54, sum=1.08 (2)", - "tab": "Accuracy", - "Global Facts - Observed inference time (s)": { - "description": "min=0.142, mean=0.142, max=0.142, sum=0.283 (2)", - "tab": "Efficiency", - "score": 0.14153999999999997 - }, - "Global Facts - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Global Facts - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Global Facts - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Global Facts - # prompt tokens": { - "description": "min=426.42, mean=426.42, max=426.42, sum=852.84 (2)", - "tab": "General information", - "score": 426.42 - }, - "Global Facts - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" - } - } - }, - { - "evaluation_name": "Jurisprudence", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Jurisprudence", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.861, - "details": { - "description": "min=0.861, mean=0.861, max=0.861, sum=1.722 (2)", - "tab": "Accuracy", - "Jurisprudence - Observed inference time (s)": { - "description": "min=0.141, mean=0.141, max=0.141, sum=0.282 (2)", - "tab": "Efficiency", - "score": 0.14100925925925917 - }, - "Jurisprudence - # eval": { - "description": "min=108, mean=108, max=108, sum=216 (2)", - "tab": "General information", - "score": 108.0 - }, - "Jurisprudence - # train": { 
- "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Jurisprudence - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Jurisprudence - # prompt tokens": { - "description": "min=446.722, mean=446.722, max=446.722, sum=893.444 (2)", - "tab": "General information", - "score": 446.72222222222223 - }, - "Jurisprudence - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" - } - } - }, - { - "evaluation_name": "Philosophy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Philosophy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.826, - "details": { - "description": "min=0.826, mean=0.826, max=0.826, sum=1.653 (2)", - "tab": "Accuracy", - "Philosophy - Observed inference time (s)": { - "description": "min=0.131, mean=0.131, max=0.131, sum=0.261 (2)", - "tab": "Efficiency", - "score": 0.1307266881028939 - }, - "Philosophy - # eval": { - "description": "min=311, mean=311, max=311, sum=622 (2)", - "tab": "General information", - "score": 311.0 - }, - "Philosophy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Philosophy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Philosophy - # prompt tokens": { - "description": "min=381.704, mean=381.704, max=381.704, sum=763.408 (2)", - "tab": "General information", - "score": 381.7041800643087 - }, - "Philosophy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" - } - } - }, - { - "evaluation_name": "Professional Psychology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Professional Psychology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.864, - "details": { - "description": "min=0.864, mean=0.864, max=0.864, sum=1.729 (2)", - "tab": "Accuracy", - "Professional Medicine - Observed inference time (s)": { - "description": "min=0.145, mean=0.145, max=0.145, sum=0.291 (2)", - "tab": "Efficiency", - "score": 0.14530882352941174 - }, - "Professional Accounting - Observed inference time (s)": { - "description": "min=0.139, mean=0.139, max=0.139, sum=0.278 (2)", - "tab": "Efficiency", - "score": 0.1388758865248228 - }, - "Professional Law - Observed inference time (s)": { - "description": "min=0.146, mean=0.146, max=0.146, sum=0.292 (2)", - "tab": "Efficiency", - "score": 
0.14584159061277666 - }, - "Professional Psychology - Observed inference time (s)": { - "description": "min=0.132, mean=0.132, max=0.132, sum=0.264 (2)", - "tab": "Efficiency", - "score": 0.13185620915032703 - }, - "Professional Medicine - # eval": { - "description": "min=272, mean=272, max=272, sum=544 (2)", - "tab": "General information", - "score": 272.0 - }, - "Professional Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Medicine - # prompt tokens": { - "description": "min=1146.287, mean=1146.287, max=1146.287, sum=2292.574 (2)", - "tab": "General information", - "score": 1146.2867647058824 - }, - "Professional Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Accounting - # eval": { - "description": "min=282, mean=282, max=282, sum=564 (2)", - "tab": "General information", - "score": 282.0 - }, - "Professional Accounting - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Accounting - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Accounting - # prompt tokens": { - "description": "min=688.72, mean=688.72, max=688.72, sum=1377.44 (2)", - "tab": "General information", - "score": 688.7198581560284 - }, - "Professional Accounting - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Law - # eval": { - "description": "min=1534, mean=1534, max=1534, sum=3068 (2)", - "tab": "General information", - "score": 1534.0 - }, - "Professional Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # prompt tokens": { - "description": "min=1686.73, mean=1686.73, max=1686.73, sum=3373.46 (2)", - "tab": "General information", - "score": 1686.7301173402868 - }, - "Professional Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Psychology - # eval": { - "description": "min=612, mean=612, max=612, sum=1224 (2)", - "tab": "General information", - "score": 612.0 - }, - "Professional Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # prompt tokens": { - "description": "min=625.574, mean=625.574, max=625.574, sum=1251.147 (2)", - "tab": "General information", - "score": 625.5735294117648 - }, - "Professional Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" - } - } - 
}, - { - "evaluation_name": "Us Foreign Policy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Us Foreign Policy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.93, - "details": { - "description": "min=0.93, mean=0.93, max=0.93, sum=1.86 (2)", - "tab": "Accuracy", - "Us Foreign Policy - Observed inference time (s)": { - "description": "min=0.141, mean=0.141, max=0.141, sum=0.282 (2)", - "tab": "Efficiency", - "score": 0.14117999999999994 - }, - "Us Foreign Policy - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Us Foreign Policy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Us Foreign Policy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Us Foreign Policy - # prompt tokens": { - "description": "min=461.12, mean=461.12, max=461.12, sum=922.24 (2)", - "tab": "General information", - "score": 461.12 - }, - "Us Foreign Policy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" - } - } - }, - { - "evaluation_name": "Astronomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Astronomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.895, - "details": { - "description": "min=0.895, mean=0.895, max=0.895, sum=1.789 (2)", - "tab": "Accuracy", - "Astronomy - Observed inference time (s)": { - "description": "min=0.141, mean=0.141, max=0.141, sum=0.282 (2)", - "tab": "Efficiency", - "score": 0.1411447368421052 - }, - "Astronomy - # eval": { - "description": "min=152, mean=152, max=152, sum=304 (2)", - "tab": "General information", - "score": 152.0 - }, - "Astronomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Astronomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Astronomy - # prompt tokens": { - "description": "min=628.112, mean=628.112, max=628.112, sum=1256.224 (2)", - "tab": "General information", - "score": 628.1118421052631 - }, - "Astronomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" - } - } - }, - { - "evaluation_name": "Business Ethics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Business Ethics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.81, - "details": { - "description": "min=0.81, mean=0.81, max=0.81, sum=1.62 (2)", - "tab": "Accuracy", - "Business Ethics - Observed inference time (s)": { - "description": "min=0.144, mean=0.144, max=0.144, sum=0.288 (2)", - "tab": "Efficiency", - "score": 0.14414 - }, - "Business Ethics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Business Ethics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Business Ethics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Business Ethics - # prompt tokens": { - "description": "min=617.46, mean=617.46, max=617.46, sum=1234.92 (2)", - "tab": "General information", - "score": 617.46 - }, - "Business Ethics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" - } - } - }, - { - "evaluation_name": "Clinical Knowledge", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Clinical Knowledge", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.875, - "details": { - "description": "min=0.875, mean=0.875, max=0.875, sum=1.751 (2)", - "tab": "Accuracy", - "Clinical Knowledge - Observed inference time (s)": { - "description": "min=0.142, mean=0.142, max=0.142, sum=0.284 (2)", - "tab": "Efficiency", - "score": 0.14190943396226424 - }, - "Clinical Knowledge - # eval": { - "description": "min=265, mean=265, max=265, sum=530 (2)", - "tab": "General information", - "score": 265.0 - }, - "Clinical Knowledge - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Clinical Knowledge - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Clinical Knowledge - # prompt tokens": { - "description": "min=451.925, mean=451.925, max=451.925, sum=903.849 (2)", - "tab": "General information", - "score": 451.92452830188677 - }, - "Clinical Knowledge - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" - } - } - }, - { - "evaluation_name": "Conceptual Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - 
"metric_config": { - "evaluation_description": "EM on Conceptual Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.851, - "details": { - "description": "min=0.851, mean=0.851, max=0.851, sum=1.702 (2)", - "tab": "Accuracy", - "Conceptual Physics - Observed inference time (s)": { - "description": "min=0.132, mean=0.132, max=0.132, sum=0.264 (2)", - "tab": "Efficiency", - "score": 0.13199148936170213 - }, - "Conceptual Physics - # eval": { - "description": "min=235, mean=235, max=235, sum=470 (2)", - "tab": "General information", - "score": 235.0 - }, - "Conceptual Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Conceptual Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Conceptual Physics - # prompt tokens": { - "description": "min=341.723, mean=341.723, max=341.723, sum=683.447 (2)", - "tab": "General information", - "score": 341.72340425531917 - }, - "Conceptual Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" - } - } - }, - { - "evaluation_name": "Electrical Engineering", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Electrical Engineering", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8, - "details": { - "description": "min=0.8, mean=0.8, max=0.8, sum=1.6 (2)", - "tab": "Accuracy", - "Electrical Engineering - Observed inference time (s)": { - "description": "min=0.135, mean=0.135, max=0.135, sum=0.27 (2)", - "tab": "Efficiency", - "score": 0.1350000000000001 - }, - "Electrical Engineering - # eval": { - "description": "min=145, mean=145, max=145, sum=290 (2)", - "tab": "General information", - "score": 145.0 - }, - "Electrical Engineering - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Electrical Engineering - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Electrical Engineering - # prompt tokens": { - "description": "min=458.345, mean=458.345, max=458.345, sum=916.69 (2)", - "tab": "General information", - "score": 458.3448275862069 - }, - "Electrical Engineering - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" - } - } - }, - { - "evaluation_name": "Elementary Mathematics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM 
on Elementary Mathematics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.831, - "details": { - "description": "min=0.831, mean=0.831, max=0.831, sum=1.661 (2)", - "tab": "Accuracy", - "Elementary Mathematics - Observed inference time (s)": { - "description": "min=0.142, mean=0.142, max=0.142, sum=0.285 (2)", - "tab": "Efficiency", - "score": 0.14232010582010587 - }, - "Elementary Mathematics - # eval": { - "description": "min=378, mean=378, max=378, sum=756 (2)", - "tab": "General information", - "score": 378.0 - }, - "Elementary Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Elementary Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Elementary Mathematics - # prompt tokens": { - "description": "min=534.09, mean=534.09, max=534.09, sum=1068.18 (2)", - "tab": "General information", - "score": 534.0899470899471 - }, - "Elementary Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" - } - } - }, - { - "evaluation_name": "Formal Logic", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Formal Logic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.714, - "details": { - "description": "min=0.714, mean=0.714, max=0.714, sum=1.429 (2)", - "tab": "Accuracy", - "Formal Logic - Observed inference time (s)": { - "description": "min=0.145, mean=0.145, max=0.145, sum=0.29 (2)", - "tab": "Efficiency", - "score": 0.1448888888888889 - }, - "Formal Logic - # eval": { - "description": "min=126, mean=126, max=126, sum=252 (2)", - "tab": "General information", - "score": 126.0 - }, - "Formal Logic - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Formal Logic - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Formal Logic - # prompt tokens": { - "description": "min=669, mean=669, max=669, sum=1338 (2)", - "tab": "General information", - "score": 669.0 - }, - "Formal Logic - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" - } - } - }, - { - "evaluation_name": "High School World History", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on High School World History", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, 
- "score_details": { - "score": 0.928, - "details": { - "description": "min=0.928, mean=0.928, max=0.928, sum=1.857 (2)", - "tab": "Accuracy", - "High School Biology - Observed inference time (s)": { - "description": "min=0.139, mean=0.139, max=0.139, sum=0.278 (2)", - "tab": "Efficiency", - "score": 0.13894516129032267 - }, - "High School Chemistry - Observed inference time (s)": { - "description": "min=0.139, mean=0.139, max=0.139, sum=0.278 (2)", - "tab": "Efficiency", - "score": 0.13885221674876858 - }, - "High School Computer Science - Observed inference time (s)": { - "description": "min=0.148, mean=0.148, max=0.148, sum=0.296 (2)", - "tab": "Efficiency", - "score": 0.1479 - }, - "High School European History - Observed inference time (s)": { - "description": "min=0.17, mean=0.17, max=0.17, sum=0.341 (2)", - "tab": "Efficiency", - "score": 0.17033939393939396 - }, - "High School Geography - Observed inference time (s)": { - "description": "min=0.133, mean=0.133, max=0.133, sum=0.266 (2)", - "tab": "Efficiency", - "score": 0.13296969696969696 - }, - "High School Government And Politics - Observed inference time (s)": { - "description": "min=0.135, mean=0.135, max=0.135, sum=0.27 (2)", - "tab": "Efficiency", - "score": 0.1351139896373057 - }, - "High School Macroeconomics - Observed inference time (s)": { - "description": "min=0.134, mean=0.134, max=0.134, sum=0.268 (2)", - "tab": "Efficiency", - "score": 0.1338025641025641 - }, - "High School Mathematics - Observed inference time (s)": { - "description": "min=0.14, mean=0.14, max=0.14, sum=0.279 (2)", - "tab": "Efficiency", - "score": 0.13964074074074065 - }, - "High School Microeconomics - Observed inference time (s)": { - "description": "min=0.135, mean=0.135, max=0.135, sum=0.271 (2)", - "tab": "Efficiency", - "score": 0.1353235294117648 - }, - "High School Physics - Observed inference time (s)": { - "description": "min=0.137, mean=0.137, max=0.137, sum=0.274 (2)", - "tab": "Efficiency", - "score": 0.13686754966887416 - }, - "High School Psychology - Observed inference time (s)": { - "description": "min=0.136, mean=0.136, max=0.136, sum=0.272 (2)", - "tab": "Efficiency", - "score": 0.13622018348623863 - }, - "High School Statistics - Observed inference time (s)": { - "description": "min=0.143, mean=0.143, max=0.143, sum=0.286 (2)", - "tab": "Efficiency", - "score": 0.14287499999999997 - }, - "High School US History - Observed inference time (s)": { - "description": "min=0.158, mean=0.158, max=0.158, sum=0.317 (2)", - "tab": "Efficiency", - "score": 0.15845098039215685 - }, - "High School World History - Observed inference time (s)": { - "description": "min=0.152, mean=0.152, max=0.152, sum=0.304 (2)", - "tab": "Efficiency", - "score": 0.151776371308017 - }, - "High School Biology - # eval": { - "description": "min=310, mean=310, max=310, sum=620 (2)", - "tab": "General information", - "score": 310.0 - }, - "High School Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Biology - # prompt tokens": { - "description": "min=568.748, mean=568.748, max=568.748, sum=1137.497 (2)", - "tab": "General information", - "score": 568.7483870967742 - }, - "High School Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School 
Chemistry - # eval": { - "description": "min=203, mean=203, max=203, sum=406 (2)", - "tab": "General information", - "score": 203.0 - }, - "High School Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # prompt tokens": { - "description": "min=523.65, mean=523.65, max=523.65, sum=1047.3 (2)", - "tab": "General information", - "score": 523.6502463054187 - }, - "High School Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "High School Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # prompt tokens": { - "description": "min=932.15, mean=932.15, max=932.15, sum=1864.3 (2)", - "tab": "General information", - "score": 932.15 - }, - "High School Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School European History - # eval": { - "description": "min=165, mean=165, max=165, sum=330 (2)", - "tab": "General information", - "score": 165.0 - }, - "High School European History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School European History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School European History - # prompt tokens": { - "description": "min=2872.03, mean=2872.03, max=2872.03, sum=5744.061 (2)", - "tab": "General information", - "score": 2872.030303030303 - }, - "High School European History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Geography - # eval": { - "description": "min=198, mean=198, max=198, sum=396 (2)", - "tab": "General information", - "score": 198.0 - }, - "High School Geography - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Geography - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Geography - # prompt tokens": { - "description": "min=425.646, mean=425.646, max=425.646, sum=851.293 (2)", - "tab": "General information", - "score": 425.64646464646466 - }, - "High School Geography - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Government And Politics - # eval": { - "description": "min=193, mean=193, max=193, sum=386 (2)", - "tab": "General information", - "score": 193.0 - }, - "High School Government And Politics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Government And Politics - truncated": { - 
"description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # prompt tokens": { - "description": "min=506.073, mean=506.073, max=506.073, sum=1012.145 (2)", - "tab": "General information", - "score": 506.07253886010363 - }, - "High School Government And Politics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Macroeconomics - # eval": { - "description": "min=390, mean=390, max=390, sum=780 (2)", - "tab": "General information", - "score": 390.0 - }, - "High School Macroeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Macroeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - # prompt tokens": { - "description": "min=419.987, mean=419.987, max=419.987, sum=839.974 (2)", - "tab": "General information", - "score": 419.9871794871795 - }, - "High School Macroeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Mathematics - # eval": { - "description": "min=270, mean=270, max=270, sum=540 (2)", - "tab": "General information", - "score": 270.0 - }, - "High School Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # prompt tokens": { - "description": "min=554.352, mean=554.352, max=554.352, sum=1108.704 (2)", - "tab": "General information", - "score": 554.3518518518518 - }, - "High School Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Microeconomics - # eval": { - "description": "min=238, mean=238, max=238, sum=476 (2)", - "tab": "General information", - "score": 238.0 - }, - "High School Microeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Microeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Microeconomics - # prompt tokens": { - "description": "min=439.055, mean=439.055, max=439.055, sum=878.109 (2)", - "tab": "General information", - "score": 439.0546218487395 - }, - "High School Microeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Physics - # eval": { - "description": "min=151, mean=151, max=151, sum=302 (2)", - "tab": "General information", - "score": 151.0 - }, - "High School Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # prompt tokens": { - "description": "min=581.669, mean=581.669, max=581.669, sum=1163.338 (2)", - "tab": "General information", - "score": 581.6688741721854 - }, - "High School Physics - # output tokens": { - 
"description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Psychology - # eval": { - "description": "min=545, mean=545, max=545, sum=1090 (2)", - "tab": "General information", - "score": 545.0 - }, - "High School Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # prompt tokens": { - "description": "min=544.842, mean=544.842, max=544.842, sum=1089.684 (2)", - "tab": "General information", - "score": 544.8422018348624 - }, - "High School Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Statistics - # eval": { - "description": "min=216, mean=216, max=216, sum=432 (2)", - "tab": "General information", - "score": 216.0 - }, - "High School Statistics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Statistics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Statistics - # prompt tokens": { - "description": "min=833, mean=833, max=833, sum=1666 (2)", - "tab": "General information", - "score": 833.0 - }, - "High School Statistics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School US History - # eval": { - "description": "min=204, mean=204, max=204, sum=408 (2)", - "tab": "General information", - "score": 204.0 - }, - "High School US History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School US History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # prompt tokens": { - "description": "min=2270.25, mean=2270.25, max=2270.25, sum=4540.5 (2)", - "tab": "General information", - "score": 2270.25 - }, - "High School US History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School World History - # eval": { - "description": "min=237, mean=237, max=237, sum=474 (2)", - "tab": "General information", - "score": 237.0 - }, - "High School World History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School World History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School World History - # prompt tokens": { - "description": "min=1466.561, mean=1466.561, max=1466.561, sum=2933.122 (2)", - "tab": "General information", - "score": 1466.5611814345991 - }, - "High School World History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" - } - } - }, - { - "evaluation_name": "Human Sexuality", - "source_data": { - "dataset_name": "helm_mmlu", - 
"source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Human Sexuality", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.885, - "details": { - "description": "min=0.885, mean=0.885, max=0.885, sum=1.771 (2)", - "tab": "Accuracy", - "Human Aging - Observed inference time (s)": { - "description": "min=0.128, mean=0.128, max=0.128, sum=0.257 (2)", - "tab": "Efficiency", - "score": 0.12830044843049326 - }, - "Human Sexuality - Observed inference time (s)": { - "description": "min=0.132, mean=0.132, max=0.132, sum=0.263 (2)", - "tab": "Efficiency", - "score": 0.13163358778625955 - }, - "Human Aging - # eval": { - "description": "min=223, mean=223, max=223, sum=446 (2)", - "tab": "General information", - "score": 223.0 - }, - "Human Aging - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Aging - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Aging - # prompt tokens": { - "description": "min=352.48, mean=352.48, max=352.48, sum=704.96 (2)", - "tab": "General information", - "score": 352.47982062780267 - }, - "Human Aging - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Human Sexuality - # eval": { - "description": "min=131, mean=131, max=131, sum=262 (2)", - "tab": "General information", - "score": 131.0 - }, - "Human Sexuality - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Sexuality - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # prompt tokens": { - "description": "min=385.626, mean=385.626, max=385.626, sum=771.252 (2)", - "tab": "General information", - "score": 385.62595419847327 - }, - "Human Sexuality - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" - } - } - }, - { - "evaluation_name": "International Law", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on International Law", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.901, - "details": { - "description": "min=0.901, mean=0.901, max=0.901, sum=1.802 (2)", - "tab": "Accuracy", - "International Law - Observed inference time (s)": { - "description": "min=0.139, mean=0.139, max=0.139, sum=0.277 (2)", - "tab": "Efficiency", - "score": 0.13855371900826452 - }, - "International Law - # eval": { - "description": "min=121, mean=121, max=121, sum=242 (2)", - "tab": "General information", - "score": 121.0 - }, - "International Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 
5.0 - }, - "International Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "International Law - # prompt tokens": { - "description": "min=667.843, mean=667.843, max=667.843, sum=1335.686 (2)", - "tab": "General information", - "score": 667.8429752066115 - }, - "International Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" - } - } - }, - { - "evaluation_name": "Logical Fallacies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Logical Fallacies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.871, - "details": { - "description": "min=0.871, mean=0.871, max=0.871, sum=1.742 (2)", - "tab": "Accuracy", - "Logical Fallacies - Observed inference time (s)": { - "description": "min=0.136, mean=0.136, max=0.136, sum=0.272 (2)", - "tab": "Efficiency", - "score": 0.13612269938650304 - }, - "Logical Fallacies - # eval": { - "description": "min=163, mean=163, max=163, sum=326 (2)", - "tab": "General information", - "score": 163.0 - }, - "Logical Fallacies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Logical Fallacies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Logical Fallacies - # prompt tokens": { - "description": "min=482.227, mean=482.227, max=482.227, sum=964.454 (2)", - "tab": "General information", - "score": 482.2269938650307 - }, - "Logical Fallacies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" - } - } - }, - { - "evaluation_name": "Machine Learning", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Machine Learning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.625, - "details": { - "description": "min=0.625, mean=0.625, max=0.625, sum=1.25 (2)", - "tab": "Accuracy", - "Machine Learning - Observed inference time (s)": { - "description": "min=0.142, mean=0.142, max=0.142, sum=0.284 (2)", - "tab": "Efficiency", - "score": 0.14183035714285702 - }, - "Machine Learning - # eval": { - "description": "min=112, mean=112, max=112, sum=224 (2)", - "tab": "General information", - "score": 112.0 - }, - "Machine Learning - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Machine Learning - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": 
"General information", - "score": 0.0 - }, - "Machine Learning - # prompt tokens": { - "description": "min=699.598, mean=699.598, max=699.598, sum=1399.196 (2)", - "tab": "General information", - "score": 699.5982142857143 - }, - "Machine Learning - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" - } - } - }, - { - "evaluation_name": "Management", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Management", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.922, - "details": { - "description": "min=0.922, mean=0.922, max=0.922, sum=1.845 (2)", - "tab": "Accuracy", - "Management - Observed inference time (s)": { - "description": "min=0.129, mean=0.129, max=0.129, sum=0.257 (2)", - "tab": "Efficiency", - "score": 0.12854368932038837 - }, - "Management - # eval": { - "description": "min=103, mean=103, max=103, sum=206 (2)", - "tab": "General information", - "score": 103.0 - }, - "Management - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Management - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Management - # prompt tokens": { - "description": "min=320.34, mean=320.34, max=320.34, sum=640.68 (2)", - "tab": "General information", - "score": 320.3398058252427 - }, - "Management - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" - } - } - }, - { - "evaluation_name": "Marketing", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Marketing", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.923, - "details": { - "description": "min=0.923, mean=0.923, max=0.923, sum=1.846 (2)", - "tab": "Accuracy", - "Marketing - Observed inference time (s)": { - "description": "min=0.132, mean=0.132, max=0.132, sum=0.264 (2)", - "tab": "Efficiency", - "score": 0.13224786324786314 - }, - "Marketing - # eval": { - "description": "min=234, mean=234, max=234, sum=468 (2)", - "tab": "General information", - "score": 234.0 - }, - "Marketing - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Marketing - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Marketing - # prompt tokens": { - "description": "min=466.697, mean=466.697, max=466.697, sum=933.393 (2)", - "tab": "General information", - "score": 466.6965811965812 - }, - 
"Marketing - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" - } - } - }, - { - "evaluation_name": "Medical Genetics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Medical Genetics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.87, - "details": { - "description": "min=0.87, mean=0.87, max=0.87, sum=1.74 (2)", - "tab": "Accuracy", - "Medical Genetics - Observed inference time (s)": { - "description": "min=0.133, mean=0.133, max=0.133, sum=0.266 (2)", - "tab": "Efficiency", - "score": 0.13288 - }, - "Medical Genetics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Medical Genetics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Medical Genetics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Medical Genetics - # prompt tokens": { - "description": "min=380.71, mean=380.71, max=380.71, sum=761.42 (2)", - "tab": "General information", - "score": 380.71 - }, - "Medical Genetics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" - } - } - }, - { - "evaluation_name": "Miscellaneous", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Miscellaneous", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.912, - "details": { - "description": "min=0.912, mean=0.912, max=0.912, sum=1.824 (2)", - "tab": "Accuracy", - "Miscellaneous - Observed inference time (s)": { - "description": "min=0.129, mean=0.129, max=0.129, sum=0.257 (2)", - "tab": "Efficiency", - "score": 0.12866538952745835 - }, - "Miscellaneous - # eval": { - "description": "min=783, mean=783, max=783, sum=1566 (2)", - "tab": "General information", - "score": 783.0 - }, - "Miscellaneous - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Miscellaneous - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Miscellaneous - # prompt tokens": { - "description": "min=342.847, mean=342.847, max=342.847, sum=685.693 (2)", - "tab": "General information", - "score": 342.84674329501917 - }, - "Miscellaneous - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - 
"additional_details": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" - } - } - }, - { - "evaluation_name": "Moral Scenarios", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Moral Scenarios", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.76, - "details": { - "description": "min=0.76, mean=0.76, max=0.76, sum=1.52 (2)", - "tab": "Accuracy", - "Moral Disputes - Observed inference time (s)": { - "description": "min=0.135, mean=0.135, max=0.135, sum=0.27 (2)", - "tab": "Efficiency", - "score": 0.1350173410404623 - }, - "Moral Scenarios - Observed inference time (s)": { - "description": "min=0.138, mean=0.138, max=0.138, sum=0.277 (2)", - "tab": "Efficiency", - "score": 0.13844581005586606 - }, - "Moral Disputes - # eval": { - "description": "min=346, mean=346, max=346, sum=692 (2)", - "tab": "General information", - "score": 346.0 - }, - "Moral Disputes - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Disputes - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Disputes - # prompt tokens": { - "description": "min=525.329, mean=525.329, max=525.329, sum=1050.659 (2)", - "tab": "General information", - "score": 525.3294797687861 - }, - "Moral Disputes - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Moral Scenarios - # eval": { - "description": "min=895, mean=895, max=895, sum=1790 (2)", - "tab": "General information", - "score": 895.0 - }, - "Moral Scenarios - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Scenarios - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # prompt tokens": { - "description": "min=692.482, mean=692.482, max=692.482, sum=1384.963 (2)", - "tab": "General information", - "score": 692.4815642458101 - }, - "Moral Scenarios - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" - } - } - }, - { - "evaluation_name": "Nutrition", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Nutrition", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.866, - "details": { - "description": "min=0.866, mean=0.866, max=0.866, sum=1.732 (2)", - "tab": "Accuracy", - "Nutrition - Observed inference time (s)": { - "description": "min=0.135, mean=0.135, max=0.135, sum=0.27 (2)", - "tab": "Efficiency", - "score": 0.13503921568627456 - }, - "Nutrition - # eval": { - 
"description": "min=306, mean=306, max=306, sum=612 (2)", - "tab": "General information", - "score": 306.0 - }, - "Nutrition - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Nutrition - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Nutrition - # prompt tokens": { - "description": "min=612.69, mean=612.69, max=612.69, sum=1225.379 (2)", - "tab": "General information", - "score": 612.6895424836601 - }, - "Nutrition - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" - } - } - }, - { - "evaluation_name": "Prehistory", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Prehistory", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.926, - "details": { - "description": "min=0.926, mean=0.926, max=0.926, sum=1.852 (2)", - "tab": "Accuracy", - "Prehistory - Observed inference time (s)": { - "description": "min=0.135, mean=0.135, max=0.135, sum=0.271 (2)", - "tab": "Efficiency", - "score": 0.135388888888889 - }, - "Prehistory - # eval": { - "description": "min=324, mean=324, max=324, sum=648 (2)", - "tab": "General information", - "score": 324.0 - }, - "Prehistory - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Prehistory - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Prehistory - # prompt tokens": { - "description": "min=552.454, mean=552.454, max=552.454, sum=1104.907 (2)", - "tab": "General information", - "score": 552.4537037037037 - }, - "Prehistory - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" - } - } - }, - { - "evaluation_name": "Public Relations", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Public Relations", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8, - "details": { - "description": "min=0.8, mean=0.8, max=0.8, sum=1.6 (2)", - "tab": "Accuracy", - "Public Relations - Observed inference time (s)": { - "description": "min=0.132, mean=0.132, max=0.132, sum=0.265 (2)", - "tab": "Efficiency", - "score": 0.13249090909090908 - }, - "Public Relations - # eval": { - "description": "min=110, mean=110, max=110, sum=220 (2)", - "tab": "General information", - "score": 110.0 - }, - "Public Relations - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - 
"score": 5.0 - }, - "Public Relations - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Public Relations - # prompt tokens": { - "description": "min=448.609, mean=448.609, max=448.609, sum=897.218 (2)", - "tab": "General information", - "score": 448.6090909090909 - }, - "Public Relations - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" - } - } - }, - { - "evaluation_name": "Security Studies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Security Studies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.849, - "details": { - "description": "min=0.849, mean=0.849, max=0.849, sum=1.698 (2)", - "tab": "Accuracy", - "Security Studies - Observed inference time (s)": { - "description": "min=0.143, mean=0.143, max=0.143, sum=0.285 (2)", - "tab": "Efficiency", - "score": 0.1427142857142858 - }, - "Security Studies - # eval": { - "description": "min=245, mean=245, max=245, sum=490 (2)", - "tab": "General information", - "score": 245.0 - }, - "Security Studies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Security Studies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Security Studies - # prompt tokens": { - "description": "min=1224.433, mean=1224.433, max=1224.433, sum=2448.865 (2)", - "tab": "General information", - "score": 1224.4326530612245 - }, - "Security Studies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" - } - } - }, - { - "evaluation_name": "Sociology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Sociology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.905, - "details": { - "description": "min=0.905, mean=0.905, max=0.905, sum=1.811 (2)", - "tab": "Accuracy", - "Sociology - Observed inference time (s)": { - "description": "min=0.137, mean=0.137, max=0.137, sum=0.275 (2)", - "tab": "Efficiency", - "score": 0.13738308457711446 - }, - "Sociology - # eval": { - "description": "min=201, mean=201, max=201, sum=402 (2)", - "tab": "General information", - "score": 201.0 - }, - "Sociology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Sociology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - 
"Sociology - # prompt tokens": { - "description": "min=474.512, mean=474.512, max=474.512, sum=949.025 (2)", - "tab": "General information", - "score": 474.5124378109453 - }, - "Sociology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" - } - } - }, - { - "evaluation_name": "Virology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Virology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.59, - "details": { - "description": "min=0.59, mean=0.59, max=0.59, sum=1.181 (2)", - "tab": "Accuracy", - "Virology - Observed inference time (s)": { - "description": "min=0.129, mean=0.129, max=0.129, sum=0.258 (2)", - "tab": "Efficiency", - "score": 0.1290301204819277 - }, - "Virology - # eval": { - "description": "min=166, mean=166, max=166, sum=332 (2)", - "tab": "General information", - "score": 166.0 - }, - "Virology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Virology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Virology - # prompt tokens": { - "description": "min=380.753, mean=380.753, max=380.753, sum=761.506 (2)", - "tab": "General information", - "score": 380.7530120481928 - }, - "Virology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" - } - } - }, - { - "evaluation_name": "World Religions", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on World Religions", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.877, - "details": { - "description": "min=0.877, mean=0.877, max=0.877, sum=1.754 (2)", - "tab": "Accuracy", - "World Religions - Observed inference time (s)": { - "description": "min=0.128, mean=0.128, max=0.128, sum=0.257 (2)", - "tab": "Efficiency", - "score": 0.12828070175438594 - }, - "World Religions - # eval": { - "description": "min=171, mean=171, max=171, sum=342 (2)", - "tab": "General information", - "score": 171.0 - }, - "World Religions - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "World Religions - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "World Religions - # prompt tokens": { - "description": "min=305.386, mean=305.386, max=305.386, sum=610.772 (2)", - "tab": "General information", - "score": 305.3859649122807 - }, - "World Religions - # output tokens": { - "description": 
"min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" - } - } - }, - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.975, - "details": { - "tab": "Efficiency" - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_mmlu/anthropic/claude-2.1/bc9cedd7-5cb2-44b2-abda-470322570e14.json b/data/helm_mmlu/anthropic/claude-2.1/bc9cedd7-5cb2-44b2-abda-470322570e14.json deleted file mode 100644 index c2616d7f8..000000000 --- a/data/helm_mmlu/anthropic/claude-2.1/bc9cedd7-5cb2-44b2-abda-470322570e14.json +++ /dev/null @@ -1,3021 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/anthropic_claude-2.1/1770835937.459157", - "retrieved_timestamp": "1770835937.459157", - "source_metadata": { - "source_name": "helm_mmlu", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Claude 2.1", - "id": "anthropic/claude-2.1", - "developer": "anthropic", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "MMLU All Subjects", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU All Subjects", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.735, - "details": { - "description": "min=0.352, mean=0.735, max=0.959, sum=83.762 (114)", - "tab": "Accuracy", - "MMLU All Subjects - Observed inference time (s)": { - "description": "min=1.934, mean=2.418, max=3.916, sum=275.693 (114)", - "tab": "Efficiency", - "score": 2.4183583522219108 - }, - "MMLU All Subjects - # eval": { - "description": "min=100, mean=246.351, max=1534, sum=28084 (114)", - "tab": "General information", - "score": 246.35087719298247 - }, - "MMLU All Subjects - # train": { - "description": "min=5, mean=5, max=5, sum=570 (114)", - "tab": "General information", - "score": 5.0 - }, - "MMLU All Subjects - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (114)", - "tab": "General information", - "score": 0.0 - }, - "MMLU All Subjects - # prompt tokens": { - "description": "min=358.018, mean=703.288, max=2952.576, sum=80174.875 (114)", - "tab": "General information", - "score": 703.2883793758955 - }, - "MMLU All Subjects - # output tokens": { - "description": "min=0.994, mean=1.0, max=1, sum=113.982 (114)", - "tab": "General information", - "score": 0.999841257531982 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - 
"college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] - } - } - }, - { - "evaluation_name": "Abstract Algebra", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Abstract Algebra", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4, - "details": { - "description": "min=0.4, mean=0.4, max=0.4, sum=0.8 (2)", - "tab": "Accuracy", - "Abstract Algebra - Observed inference time (s)": { - "description": 
"min=2.043, mean=2.043, max=2.043, sum=4.087 (2)", - "tab": "Efficiency", - "score": 2.043452892303467 - }, - "Abstract Algebra - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Abstract Algebra - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Abstract Algebra - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Abstract Algebra - # prompt tokens": { - "description": "min=435.26, mean=435.26, max=435.26, sum=870.52 (2)", - "tab": "General information", - "score": 435.26 - }, - "Abstract Algebra - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" - } - } - }, - { - "evaluation_name": "Anatomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Anatomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.726, - "details": { - "description": "min=0.726, mean=0.726, max=0.726, sum=1.452 (2)", - "tab": "Accuracy", - "Anatomy - Observed inference time (s)": { - "description": "min=2.071, mean=2.071, max=2.071, sum=4.142 (2)", - "tab": "Efficiency", - "score": 2.0710925843980577 - }, - "Anatomy - # eval": { - "description": "min=135, mean=135, max=135, sum=270 (2)", - "tab": "General information", - "score": 135.0 - }, - "Anatomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Anatomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Anatomy - # prompt tokens": { - "description": "min=435.8, mean=435.8, max=435.8, sum=871.6 (2)", - "tab": "General information", - "score": 435.8 - }, - "Anatomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" - } - } - }, - { - "evaluation_name": "College Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on College Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5, - "details": { - "description": "min=0.5, mean=0.5, max=0.5, sum=1 (2)", - "tab": "Accuracy", - "College Chemistry - Observed inference time (s)": { - "description": "min=2.579, mean=2.579, max=2.579, sum=5.158 (2)", - "tab": "Efficiency", - "score": 2.579245555400848 - }, - "College Biology - Observed inference time (s)": { - "description": "min=2.209, mean=2.209, max=2.209, sum=4.418 (2)", - "tab": "Efficiency", - "score": 
2.2088319063186646 - }, - "College Computer Science - Observed inference time (s)": { - "description": "min=2.413, mean=2.413, max=2.413, sum=4.826 (2)", - "tab": "Efficiency", - "score": 2.4128634238243105 - }, - "College Mathematics - Observed inference time (s)": { - "description": "min=2.18, mean=2.18, max=2.18, sum=4.359 (2)", - "tab": "Efficiency", - "score": 2.179708275794983 - }, - "College Medicine - Observed inference time (s)": { - "description": "min=2.324, mean=2.324, max=2.324, sum=4.648 (2)", - "tab": "Efficiency", - "score": 2.3239130339870564 - }, - "College Physics - Observed inference time (s)": { - "description": "min=2.145, mean=2.145, max=2.145, sum=4.289 (2)", - "tab": "Efficiency", - "score": 2.144603039704117 - }, - "College Chemistry - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Chemistry - # prompt tokens": { - "description": "min=615.01, mean=615.01, max=615.01, sum=1230.02 (2)", - "tab": "General information", - "score": 615.01 - }, - "College Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Biology - # eval": { - "description": "min=144, mean=144, max=144, sum=288 (2)", - "tab": "General information", - "score": 144.0 - }, - "College Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # prompt tokens": { - "description": "min=555.347, mean=555.347, max=555.347, sum=1110.694 (2)", - "tab": "General information", - "score": 555.3472222222222 - }, - "College Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer Science - # prompt tokens": { - "description": "min=903.24, mean=903.24, max=903.24, sum=1806.48 (2)", - "tab": "General information", - "score": 903.24 - }, - "College Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Mathematics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # prompt tokens": { - "description": "min=669.19, mean=669.19, 
max=669.19, sum=1338.38 (2)", - "tab": "General information", - "score": 669.19 - }, - "College Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Medicine - # eval": { - "description": "min=173, mean=173, max=173, sum=346 (2)", - "tab": "General information", - "score": 173.0 - }, - "College Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Medicine - # prompt tokens": { - "description": "min=605.63, mean=605.63, max=605.63, sum=1211.26 (2)", - "tab": "General information", - "score": 605.6300578034682 - }, - "College Medicine - # output tokens": { - "description": "min=0.994, mean=0.994, max=0.994, sum=1.988 (2)", - "tab": "General information", - "score": 0.9942196531791907 - }, - "College Physics - # eval": { - "description": "min=102, mean=102, max=102, sum=204 (2)", - "tab": "General information", - "score": 102.0 - }, - "College Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Physics - # prompt tokens": { - "description": "min=554.48, mean=554.48, max=554.48, sum=1108.961 (2)", - "tab": "General information", - "score": 554.4803921568628 - }, - "College Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" - } - } - }, - { - "evaluation_name": "Computer Security", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Computer Security", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.81, - "details": { - "description": "min=0.81, mean=0.81, max=0.81, sum=1.62 (2)", - "tab": "Accuracy", - "Computer Security - Observed inference time (s)": { - "description": "min=2.244, mean=2.244, max=2.244, sum=4.487 (2)", - "tab": "Efficiency", - "score": 2.2435835003852844 - }, - "Computer Security - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Computer Security - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Computer Security - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Computer Security - # prompt tokens": { - "description": "min=463.62, mean=463.62, max=463.62, sum=927.24 (2)", - "tab": "General information", - "score": 463.62 - }, - "Computer Security - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": 
"computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" - } - } - }, - { - "evaluation_name": "Econometrics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Econometrics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.596, - "details": { - "description": "min=0.596, mean=0.596, max=0.596, sum=1.193 (2)", - "tab": "Accuracy", - "Econometrics - Observed inference time (s)": { - "description": "min=2.615, mean=2.615, max=2.615, sum=5.23 (2)", - "tab": "Efficiency", - "score": 2.6147566636403403 - }, - "Econometrics - # eval": { - "description": "min=114, mean=114, max=114, sum=228 (2)", - "tab": "General information", - "score": 114.0 - }, - "Econometrics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Econometrics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Econometrics - # prompt tokens": { - "description": "min=684.596, mean=684.596, max=684.596, sum=1369.193 (2)", - "tab": "General information", - "score": 684.5964912280701 - }, - "Econometrics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" - } - } - }, - { - "evaluation_name": "Global Facts", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Global Facts", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.55, - "details": { - "description": "min=0.55, mean=0.55, max=0.55, sum=1.1 (2)", - "tab": "Accuracy", - "Global Facts - Observed inference time (s)": { - "description": "min=1.934, mean=1.934, max=1.934, sum=3.869 (2)", - "tab": "Efficiency", - "score": 1.934385061264038 - }, - "Global Facts - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Global Facts - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Global Facts - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Global Facts - # prompt tokens": { - "description": "min=476.61, mean=476.61, max=476.61, sum=953.22 (2)", - "tab": "General information", - "score": 476.61 - }, - "Global Facts - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" - } - } - }, - { - "evaluation_name": "Jurisprudence", - "source_data": { - "dataset_name": "helm_mmlu", - 
"source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Jurisprudence", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.87, - "details": { - "description": "min=0.87, mean=0.87, max=0.87, sum=1.741 (2)", - "tab": "Accuracy", - "Jurisprudence - Observed inference time (s)": { - "description": "min=2.042, mean=2.042, max=2.042, sum=4.084 (2)", - "tab": "Efficiency", - "score": 2.041935768392351 - }, - "Jurisprudence - # eval": { - "description": "min=108, mean=108, max=108, sum=216 (2)", - "tab": "General information", - "score": 108.0 - }, - "Jurisprudence - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Jurisprudence - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Jurisprudence - # prompt tokens": { - "description": "min=496.426, mean=496.426, max=496.426, sum=992.852 (2)", - "tab": "General information", - "score": 496.4259259259259 - }, - "Jurisprudence - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" - } - } - }, - { - "evaluation_name": "Philosophy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Philosophy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.794, - "details": { - "description": "min=0.794, mean=0.794, max=0.794, sum=1.588 (2)", - "tab": "Accuracy", - "Philosophy - Observed inference time (s)": { - "description": "min=2.326, mean=2.326, max=2.326, sum=4.652 (2)", - "tab": "Efficiency", - "score": 2.3260836739248787 - }, - "Philosophy - # eval": { - "description": "min=311, mean=311, max=311, sum=622 (2)", - "tab": "General information", - "score": 311.0 - }, - "Philosophy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Philosophy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Philosophy - # prompt tokens": { - "description": "min=424.965, mean=424.965, max=424.965, sum=849.929 (2)", - "tab": "General information", - "score": 424.9646302250804 - }, - "Philosophy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" - } - } - }, - { - "evaluation_name": "Professional Psychology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM 
on Professional Psychology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.797, - "details": { - "description": "min=0.797, mean=0.797, max=0.797, sum=1.595 (2)", - "tab": "Accuracy", - "Professional Medicine - Observed inference time (s)": { - "description": "min=2.936, mean=2.936, max=2.936, sum=5.871 (2)", - "tab": "Efficiency", - "score": 2.9355741520138348 - }, - "Professional Accounting - Observed inference time (s)": { - "description": "min=2.529, mean=2.529, max=2.529, sum=5.058 (2)", - "tab": "Efficiency", - "score": 2.528953587755244 - }, - "Professional Law - Observed inference time (s)": { - "description": "min=3.335, mean=3.335, max=3.335, sum=6.669 (2)", - "tab": "Efficiency", - "score": 3.3346744537975206 - }, - "Professional Psychology - Observed inference time (s)": { - "description": "min=2.597, mean=2.597, max=2.597, sum=5.194 (2)", - "tab": "Efficiency", - "score": 2.5970658024931264 - }, - "Professional Medicine - # eval": { - "description": "min=272, mean=272, max=272, sum=544 (2)", - "tab": "General information", - "score": 272.0 - }, - "Professional Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Medicine - # prompt tokens": { - "description": "min=1188.537, mean=1188.537, max=1188.537, sum=2377.074 (2)", - "tab": "General information", - "score": 1188.5367647058824 - }, - "Professional Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Accounting - # eval": { - "description": "min=282, mean=282, max=282, sum=564 (2)", - "tab": "General information", - "score": 282.0 - }, - "Professional Accounting - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Accounting - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Accounting - # prompt tokens": { - "description": "min=730.422, mean=730.422, max=730.422, sum=1460.844 (2)", - "tab": "General information", - "score": 730.4219858156029 - }, - "Professional Accounting - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Law - # eval": { - "description": "min=1534, mean=1534, max=1534, sum=3068 (2)", - "tab": "General information", - "score": 1534.0 - }, - "Professional Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # prompt tokens": { - "description": "min=1766.16, mean=1766.16, max=1766.16, sum=3532.321 (2)", - "tab": "General information", - "score": 1766.16036505867 - }, - "Professional Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Psychology - # eval": { - "description": "min=612, mean=612, max=612, sum=1224 (2)", - "tab": "General information", - "score": 612.0 - }, - "Professional Psychology - # train": { - 
"description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # prompt tokens": { - "description": "min=668.168, mean=668.168, max=668.168, sum=1336.337 (2)", - "tab": "General information", - "score": 668.1683006535948 - }, - "Professional Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" - } - } - }, - { - "evaluation_name": "Us Foreign Policy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Us Foreign Policy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.92, - "details": { - "description": "min=0.92, mean=0.92, max=0.92, sum=1.84 (2)", - "tab": "Accuracy", - "Us Foreign Policy - Observed inference time (s)": { - "description": "min=2.374, mean=2.374, max=2.374, sum=4.747 (2)", - "tab": "Efficiency", - "score": 2.37366126537323 - }, - "Us Foreign Policy - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Us Foreign Policy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Us Foreign Policy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Us Foreign Policy - # prompt tokens": { - "description": "min=520.25, mean=520.25, max=520.25, sum=1040.5 (2)", - "tab": "General information", - "score": 520.25 - }, - "Us Foreign Policy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" - } - } - }, - { - "evaluation_name": "Astronomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Astronomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.855, - "details": { - "description": "min=0.855, mean=0.855, max=0.855, sum=1.711 (2)", - "tab": "Accuracy", - "Astronomy - Observed inference time (s)": { - "description": "min=2.346, mean=2.346, max=2.346, sum=4.692 (2)", - "tab": "Efficiency", - "score": 2.345861089857001 - }, - "Astronomy - # eval": { - "description": "min=152, mean=152, max=152, sum=304 (2)", - "tab": "General information", - "score": 152.0 - }, - "Astronomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Astronomy - truncated": { - 
"description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Astronomy - # prompt tokens": { - "description": "min=669.493, mean=669.493, max=669.493, sum=1338.987 (2)", - "tab": "General information", - "score": 669.4934210526316 - }, - "Astronomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" - } - } - }, - { - "evaluation_name": "Business Ethics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Business Ethics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.73, - "details": { - "description": "min=0.73, mean=0.73, max=0.73, sum=1.46 (2)", - "tab": "Accuracy", - "Business Ethics - Observed inference time (s)": { - "description": "min=2.35, mean=2.35, max=2.35, sum=4.701 (2)", - "tab": "Efficiency", - "score": 2.3504813623428347 - }, - "Business Ethics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Business Ethics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Business Ethics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Business Ethics - # prompt tokens": { - "description": "min=665.02, mean=665.02, max=665.02, sum=1330.04 (2)", - "tab": "General information", - "score": 665.02 - }, - "Business Ethics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" - } - } - }, - { - "evaluation_name": "Clinical Knowledge", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Clinical Knowledge", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.785, - "details": { - "description": "min=0.785, mean=0.785, max=0.785, sum=1.57 (2)", - "tab": "Accuracy", - "Clinical Knowledge - Observed inference time (s)": { - "description": "min=2.28, mean=2.28, max=2.28, sum=4.56 (2)", - "tab": "Efficiency", - "score": 2.279950815776609 - }, - "Clinical Knowledge - # eval": { - "description": "min=265, mean=265, max=265, sum=530 (2)", - "tab": "General information", - "score": 265.0 - }, - "Clinical Knowledge - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Clinical Knowledge - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Clinical Knowledge - # prompt tokens": { - "description": "min=494.457, 
mean=494.457, max=494.457, sum=988.913 (2)", - "tab": "General information", - "score": 494.4566037735849 - }, - "Clinical Knowledge - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" - } - } - }, - { - "evaluation_name": "Conceptual Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Conceptual Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.766, - "details": { - "description": "min=0.766, mean=0.766, max=0.766, sum=1.532 (2)", - "tab": "Accuracy", - "Conceptual Physics - Observed inference time (s)": { - "description": "min=2.125, mean=2.125, max=2.125, sum=4.25 (2)", - "tab": "Efficiency", - "score": 2.1249657225101553 - }, - "Conceptual Physics - # eval": { - "description": "min=235, mean=235, max=235, sum=470 (2)", - "tab": "General information", - "score": 235.0 - }, - "Conceptual Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Conceptual Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Conceptual Physics - # prompt tokens": { - "description": "min=388.536, mean=388.536, max=388.536, sum=777.072 (2)", - "tab": "General information", - "score": 388.53617021276597 - }, - "Conceptual Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" - } - } - }, - { - "evaluation_name": "Electrical Engineering", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Electrical Engineering", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.724, - "details": { - "description": "min=0.724, mean=0.724, max=0.724, sum=1.448 (2)", - "tab": "Accuracy", - "Electrical Engineering - Observed inference time (s)": { - "description": "min=2.336, mean=2.336, max=2.336, sum=4.672 (2)", - "tab": "Efficiency", - "score": 2.3361403728353567 - }, - "Electrical Engineering - # eval": { - "description": "min=145, mean=145, max=145, sum=290 (2)", - "tab": "General information", - "score": 145.0 - }, - "Electrical Engineering - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Electrical Engineering - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Electrical Engineering - # prompt tokens": { - "description": "min=502.041, mean=502.041, max=502.041, sum=1004.083 (2)", - "tab": 
"General information", - "score": 502.04137931034484 - }, - "Electrical Engineering - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" - } - } - }, - { - "evaluation_name": "Elementary Mathematics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Elementary Mathematics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.521, - "details": { - "description": "min=0.521, mean=0.521, max=0.521, sum=1.042 (2)", - "tab": "Accuracy", - "Elementary Mathematics - Observed inference time (s)": { - "description": "min=2.399, mean=2.399, max=2.399, sum=4.798 (2)", - "tab": "Efficiency", - "score": 2.398875941044439 - }, - "Elementary Mathematics - # eval": { - "description": "min=378, mean=378, max=378, sum=756 (2)", - "tab": "General information", - "score": 378.0 - }, - "Elementary Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Elementary Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Elementary Mathematics - # prompt tokens": { - "description": "min=576.066, mean=576.066, max=576.066, sum=1152.132 (2)", - "tab": "General information", - "score": 576.0661375661375 - }, - "Elementary Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" - } - } - }, - { - "evaluation_name": "Formal Logic", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Formal Logic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5, - "details": { - "description": "min=0.5, mean=0.5, max=0.5, sum=1 (2)", - "tab": "Accuracy", - "Formal Logic - Observed inference time (s)": { - "description": "min=2.294, mean=2.294, max=2.294, sum=4.587 (2)", - "tab": "Efficiency", - "score": 2.293650850417122 - }, - "Formal Logic - # eval": { - "description": "min=126, mean=126, max=126, sum=252 (2)", - "tab": "General information", - "score": 126.0 - }, - "Formal Logic - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Formal Logic - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Formal Logic - # prompt tokens": { - "description": "min=711.746, mean=711.746, max=711.746, sum=1423.492 (2)", - "tab": "General information", - "score": 711.7460317460317 - }, - "Formal Logic - # output 
tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" - } - } - }, - { - "evaluation_name": "High School World History", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on High School World History", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.903, - "details": { - "description": "min=0.903, mean=0.903, max=0.903, sum=1.806 (2)", - "tab": "Accuracy", - "High School Biology - Observed inference time (s)": { - "description": "min=2.36, mean=2.36, max=2.36, sum=4.72 (2)", - "tab": "Efficiency", - "score": 2.360204086765166 - }, - "High School Chemistry - Observed inference time (s)": { - "description": "min=2.324, mean=2.324, max=2.324, sum=4.647 (2)", - "tab": "Efficiency", - "score": 2.3235761426352517 - }, - "High School Computer Science - Observed inference time (s)": { - "description": "min=2.353, mean=2.353, max=2.353, sum=4.707 (2)", - "tab": "Efficiency", - "score": 2.3532658934593202 - }, - "High School European History - Observed inference time (s)": { - "description": "min=3.916, mean=3.916, max=3.916, sum=7.832 (2)", - "tab": "Efficiency", - "score": 3.915820397752704 - }, - "High School Geography - Observed inference time (s)": { - "description": "min=2.217, mean=2.217, max=2.217, sum=4.434 (2)", - "tab": "Efficiency", - "score": 2.217141205614263 - }, - "High School Government And Politics - Observed inference time (s)": { - "description": "min=2.403, mean=2.403, max=2.403, sum=4.807 (2)", - "tab": "Efficiency", - "score": 2.4034566397493986 - }, - "High School Macroeconomics - Observed inference time (s)": { - "description": "min=2.329, mean=2.329, max=2.329, sum=4.658 (2)", - "tab": "Efficiency", - "score": 2.3290999345290353 - }, - "High School Mathematics - Observed inference time (s)": { - "description": "min=2.45, mean=2.45, max=2.45, sum=4.9 (2)", - "tab": "Efficiency", - "score": 2.4497611089988993 - }, - "High School Microeconomics - Observed inference time (s)": { - "description": "min=2.492, mean=2.492, max=2.492, sum=4.984 (2)", - "tab": "Efficiency", - "score": 2.492123728038884 - }, - "High School Physics - Observed inference time (s)": { - "description": "min=2.268, mean=2.268, max=2.268, sum=4.536 (2)", - "tab": "Efficiency", - "score": 2.267898343256767 - }, - "High School Psychology - Observed inference time (s)": { - "description": "min=2.45, mean=2.45, max=2.45, sum=4.901 (2)", - "tab": "Efficiency", - "score": 2.4503073394845387 - }, - "High School Statistics - Observed inference time (s)": { - "description": "min=2.554, mean=2.554, max=2.554, sum=5.107 (2)", - "tab": "Efficiency", - "score": 2.5535844012543008 - }, - "High School US History - Observed inference time (s)": { - "description": "min=3.541, mean=3.541, max=3.541, sum=7.081 (2)", - "tab": "Efficiency", - "score": 3.540712014132855 - }, - "High School World History - Observed inference time (s)": { - "description": "min=3.012, mean=3.012, max=3.012, sum=6.025 (2)", - "tab": "Efficiency", - "score": 3.0123110571994056 - }, - "High School Biology - # eval": { 
- "description": "min=310, mean=310, max=310, sum=620 (2)", - "tab": "General information", - "score": 310.0 - }, - "High School Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Biology - # prompt tokens": { - "description": "min=599.577, mean=599.577, max=599.577, sum=1199.155 (2)", - "tab": "General information", - "score": 599.5774193548388 - }, - "High School Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Chemistry - # eval": { - "description": "min=203, mean=203, max=203, sum=406 (2)", - "tab": "General information", - "score": 203.0 - }, - "High School Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # prompt tokens": { - "description": "min=562.921, mean=562.921, max=562.921, sum=1125.842 (2)", - "tab": "General information", - "score": 562.9211822660099 - }, - "High School Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "High School Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # prompt tokens": { - "description": "min=947.4, mean=947.4, max=947.4, sum=1894.8 (2)", - "tab": "General information", - "score": 947.4 - }, - "High School Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School European History - # eval": { - "description": "min=165, mean=165, max=165, sum=330 (2)", - "tab": "General information", - "score": 165.0 - }, - "High School European History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School European History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School European History - # prompt tokens": { - "description": "min=2952.576, mean=2952.576, max=2952.576, sum=5905.152 (2)", - "tab": "General information", - "score": 2952.5757575757575 - }, - "High School European History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Geography - # eval": { - "description": "min=198, mean=198, max=198, sum=396 (2)", - "tab": "General information", - "score": 198.0 - }, - "High School Geography - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Geography - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General 
information", - "score": 0.0 - }, - "High School Geography - # prompt tokens": { - "description": "min=477.268, mean=477.268, max=477.268, sum=954.535 (2)", - "tab": "General information", - "score": 477.2676767676768 - }, - "High School Geography - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Government And Politics - # eval": { - "description": "min=193, mean=193, max=193, sum=386 (2)", - "tab": "General information", - "score": 193.0 - }, - "High School Government And Politics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Government And Politics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # prompt tokens": { - "description": "min=556.104, mean=556.104, max=556.104, sum=1112.207 (2)", - "tab": "General information", - "score": 556.1036269430052 - }, - "High School Government And Politics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Macroeconomics - # eval": { - "description": "min=390, mean=390, max=390, sum=780 (2)", - "tab": "General information", - "score": 390.0 - }, - "High School Macroeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Macroeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - # prompt tokens": { - "description": "min=471.036, mean=471.036, max=471.036, sum=942.072 (2)", - "tab": "General information", - "score": 471.0358974358974 - }, - "High School Macroeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Mathematics - # eval": { - "description": "min=270, mean=270, max=270, sum=540 (2)", - "tab": "General information", - "score": 270.0 - }, - "High School Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # prompt tokens": { - "description": "min=584.881, mean=584.881, max=584.881, sum=1169.763 (2)", - "tab": "General information", - "score": 584.8814814814815 - }, - "High School Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Microeconomics - # eval": { - "description": "min=238, mean=238, max=238, sum=476 (2)", - "tab": "General information", - "score": 238.0 - }, - "High School Microeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Microeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Microeconomics - # prompt tokens": { - "description": "min=485.513, mean=485.513, max=485.513, sum=971.025 (2)", - "tab": "General information", - "score": 485.5126050420168 - }, - "High School Microeconomics - # output tokens": { - "description": "min=1, 
mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Physics - # eval": { - "description": "min=151, mean=151, max=151, sum=302 (2)", - "tab": "General information", - "score": 151.0 - }, - "High School Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # prompt tokens": { - "description": "min=623.841, mean=623.841, max=623.841, sum=1247.682 (2)", - "tab": "General information", - "score": 623.841059602649 - }, - "High School Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Psychology - # eval": { - "description": "min=545, mean=545, max=545, sum=1090 (2)", - "tab": "General information", - "score": 545.0 - }, - "High School Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # prompt tokens": { - "description": "min=586.42, mean=586.42, max=586.42, sum=1172.84 (2)", - "tab": "General information", - "score": 586.4201834862386 - }, - "High School Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Statistics - # eval": { - "description": "min=216, mean=216, max=216, sum=432 (2)", - "tab": "General information", - "score": 216.0 - }, - "High School Statistics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Statistics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Statistics - # prompt tokens": { - "description": "min=871.963, mean=871.963, max=871.963, sum=1743.926 (2)", - "tab": "General information", - "score": 871.9629629629629 - }, - "High School Statistics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School US History - # eval": { - "description": "min=204, mean=204, max=204, sum=408 (2)", - "tab": "General information", - "score": 204.0 - }, - "High School US History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School US History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # prompt tokens": { - "description": "min=2353.49, mean=2353.49, max=2353.49, sum=4706.98 (2)", - "tab": "General information", - "score": 2353.4901960784314 - }, - "High School US History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School World History - # eval": { - "description": "min=237, mean=237, max=237, sum=474 (2)", - "tab": "General information", - "score": 237.0 - }, - "High School World History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School World History - truncated": 
{ - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School World History - # prompt tokens": { - "description": "min=1540.932, mean=1540.932, max=1540.932, sum=3081.865 (2)", - "tab": "General information", - "score": 1540.9324894514768 - }, - "High School World History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" - } - } - }, - { - "evaluation_name": "Human Sexuality", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Human Sexuality", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.847, - "details": { - "description": "min=0.847, mean=0.847, max=0.847, sum=1.695 (2)", - "tab": "Accuracy", - "Human Aging - Observed inference time (s)": { - "description": "min=2.287, mean=2.287, max=2.287, sum=4.573 (2)", - "tab": "Efficiency", - "score": 2.286549251710353 - }, - "Human Sexuality - Observed inference time (s)": { - "description": "min=2.14, mean=2.14, max=2.14, sum=4.28 (2)", - "tab": "Efficiency", - "score": 2.1399855577308715 - }, - "Human Aging - # eval": { - "description": "min=223, mean=223, max=223, sum=446 (2)", - "tab": "General information", - "score": 223.0 - }, - "Human Aging - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Aging - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Aging - # prompt tokens": { - "description": "min=400.955, mean=400.955, max=400.955, sum=801.91 (2)", - "tab": "General information", - "score": 400.95515695067263 - }, - "Human Aging - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Human Sexuality - # eval": { - "description": "min=131, mean=131, max=131, sum=262 (2)", - "tab": "General information", - "score": 131.0 - }, - "Human Sexuality - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Sexuality - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # prompt tokens": { - "description": "min=436.496, mean=436.496, max=436.496, sum=872.992 (2)", - "tab": "General information", - "score": 436.4961832061069 - }, - "Human Sexuality - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" - } - } - }, - { - "evaluation_name": "International Law", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - 
"metric_config": { - "evaluation_description": "EM on International Law", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.901, - "details": { - "description": "min=0.901, mean=0.901, max=0.901, sum=1.802 (2)", - "tab": "Accuracy", - "International Law - Observed inference time (s)": { - "description": "min=2.339, mean=2.339, max=2.339, sum=4.679 (2)", - "tab": "Efficiency", - "score": 2.3394163206589123 - }, - "International Law - # eval": { - "description": "min=121, mean=121, max=121, sum=242 (2)", - "tab": "General information", - "score": 121.0 - }, - "International Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "International Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "International Law - # prompt tokens": { - "description": "min=729.165, mean=729.165, max=729.165, sum=1458.331 (2)", - "tab": "General information", - "score": 729.1652892561983 - }, - "International Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" - } - } - }, - { - "evaluation_name": "Logical Fallacies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Logical Fallacies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.834, - "details": { - "description": "min=0.834, mean=0.834, max=0.834, sum=1.669 (2)", - "tab": "Accuracy", - "Logical Fallacies - Observed inference time (s)": { - "description": "min=2.313, mean=2.313, max=2.313, sum=4.627 (2)", - "tab": "Efficiency", - "score": 2.3134736488201866 - }, - "Logical Fallacies - # eval": { - "description": "min=163, mean=163, max=163, sum=326 (2)", - "tab": "General information", - "score": 163.0 - }, - "Logical Fallacies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Logical Fallacies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Logical Fallacies - # prompt tokens": { - "description": "min=535.276, mean=535.276, max=535.276, sum=1070.552 (2)", - "tab": "General information", - "score": 535.2760736196319 - }, - "Logical Fallacies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" - } - } - }, - { - "evaluation_name": "Machine Learning", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Machine Learning", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.482, - "details": { - "description": "min=0.482, mean=0.482, max=0.482, sum=0.964 (2)", - "tab": "Accuracy", - "Machine Learning - Observed inference time (s)": { - "description": "min=2.246, mean=2.246, max=2.246, sum=4.492 (2)", - "tab": "Efficiency", - "score": 2.246019565633365 - }, - "Machine Learning - # eval": { - "description": "min=112, mean=112, max=112, sum=224 (2)", - "tab": "General information", - "score": 112.0 - }, - "Machine Learning - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Machine Learning - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Machine Learning - # prompt tokens": { - "description": "min=741.518, mean=741.518, max=741.518, sum=1483.036 (2)", - "tab": "General information", - "score": 741.5178571428571 - }, - "Machine Learning - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" - } - } - }, - { - "evaluation_name": "Management", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Management", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.825, - "details": { - "description": "min=0.825, mean=0.825, max=0.825, sum=1.65 (2)", - "tab": "Accuracy", - "Management - Observed inference time (s)": { - "description": "min=2.02, mean=2.02, max=2.02, sum=4.041 (2)", - "tab": "Efficiency", - "score": 2.0203486507378736 - }, - "Management - # eval": { - "description": "min=103, mean=103, max=103, sum=206 (2)", - "tab": "General information", - "score": 103.0 - }, - "Management - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Management - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Management - # prompt tokens": { - "description": "min=366.282, mean=366.282, max=366.282, sum=732.563 (2)", - "tab": "General information", - "score": 366.28155339805824 - }, - "Management - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" - } - } - }, - { - "evaluation_name": "Marketing", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Marketing", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.923, - "details": { - "description": "min=0.923, mean=0.923, max=0.923, sum=1.846 (2)", - "tab": 
"Accuracy", - "Marketing - Observed inference time (s)": { - "description": "min=2.371, mean=2.371, max=2.371, sum=4.741 (2)", - "tab": "Efficiency", - "score": 2.370740459515498 - }, - "Marketing - # eval": { - "description": "min=234, mean=234, max=234, sum=468 (2)", - "tab": "General information", - "score": 234.0 - }, - "Marketing - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Marketing - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Marketing - # prompt tokens": { - "description": "min=513.064, mean=513.064, max=513.064, sum=1026.128 (2)", - "tab": "General information", - "score": 513.0641025641025 - }, - "Marketing - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" - } - } - }, - { - "evaluation_name": "Medical Genetics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Medical Genetics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.81, - "details": { - "description": "min=0.81, mean=0.81, max=0.81, sum=1.62 (2)", - "tab": "Accuracy", - "Medical Genetics - Observed inference time (s)": { - "description": "min=2.213, mean=2.213, max=2.213, sum=4.426 (2)", - "tab": "Efficiency", - "score": 2.213027362823486 - }, - "Medical Genetics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Medical Genetics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Medical Genetics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Medical Genetics - # prompt tokens": { - "description": "min=419.88, mean=419.88, max=419.88, sum=839.76 (2)", - "tab": "General information", - "score": 419.88 - }, - "Medical Genetics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" - } - } - }, - { - "evaluation_name": "Miscellaneous", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Miscellaneous", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.88, - "details": { - "description": "min=0.88, mean=0.88, max=0.88, sum=1.76 (2)", - "tab": "Accuracy", - "Miscellaneous - Observed inference time (s)": { - "description": "min=2.421, mean=2.421, max=2.421, sum=4.843 (2)", - "tab": "Efficiency", - "score": 2.421274871813992 - }, - "Miscellaneous - # 
eval": { - "description": "min=783, mean=783, max=783, sum=1566 (2)", - "tab": "General information", - "score": 783.0 - }, - "Miscellaneous - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Miscellaneous - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Miscellaneous - # prompt tokens": { - "description": "min=393.628, mean=393.628, max=393.628, sum=787.257 (2)", - "tab": "General information", - "score": 393.62835249042143 - }, - "Miscellaneous - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" - } - } - }, - { - "evaluation_name": "Moral Scenarios", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Moral Scenarios", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.52, - "details": { - "description": "min=0.52, mean=0.52, max=0.52, sum=1.039 (2)", - "tab": "Accuracy", - "Moral Disputes - Observed inference time (s)": { - "description": "min=2.478, mean=2.478, max=2.478, sum=4.955 (2)", - "tab": "Efficiency", - "score": 2.4775779054344045 - }, - "Moral Scenarios - Observed inference time (s)": { - "description": "min=2.624, mean=2.624, max=2.624, sum=5.248 (2)", - "tab": "Efficiency", - "score": 2.624200687994504 - }, - "Moral Disputes - # eval": { - "description": "min=346, mean=346, max=346, sum=692 (2)", - "tab": "General information", - "score": 346.0 - }, - "Moral Disputes - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Disputes - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Disputes - # prompt tokens": { - "description": "min=576.789, mean=576.789, max=576.789, sum=1153.578 (2)", - "tab": "General information", - "score": 576.7890173410404 - }, - "Moral Disputes - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Moral Scenarios - # eval": { - "description": "min=895, mean=895, max=895, sum=1790 (2)", - "tab": "General information", - "score": 895.0 - }, - "Moral Scenarios - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Scenarios - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # prompt tokens": { - "description": "min=741.949, mean=741.949, max=741.949, sum=1483.897 (2)", - "tab": "General information", - "score": 741.9486033519553 - }, - "Moral Scenarios - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" - } - } - }, - { - 
"evaluation_name": "Nutrition", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Nutrition", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.781, - "details": { - "description": "min=0.781, mean=0.781, max=0.781, sum=1.562 (2)", - "tab": "Accuracy", - "Nutrition - Observed inference time (s)": { - "description": "min=2.516, mean=2.516, max=2.516, sum=5.033 (2)", - "tab": "Efficiency", - "score": 2.516486873813704 - }, - "Nutrition - # eval": { - "description": "min=306, mean=306, max=306, sum=612 (2)", - "tab": "General information", - "score": 306.0 - }, - "Nutrition - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Nutrition - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Nutrition - # prompt tokens": { - "description": "min=682.065, mean=682.065, max=682.065, sum=1364.131 (2)", - "tab": "General information", - "score": 682.0653594771242 - }, - "Nutrition - # output tokens": { - "description": "min=0.997, mean=0.997, max=0.997, sum=1.993 (2)", - "tab": "General information", - "score": 0.9967320261437909 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" - } - } - }, - { - "evaluation_name": "Prehistory", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Prehistory", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.821, - "details": { - "description": "min=0.821, mean=0.821, max=0.821, sum=1.642 (2)", - "tab": "Accuracy", - "Prehistory - Observed inference time (s)": { - "description": "min=2.431, mean=2.431, max=2.431, sum=4.862 (2)", - "tab": "Efficiency", - "score": 2.4310101116145097 - }, - "Prehistory - # eval": { - "description": "min=324, mean=324, max=324, sum=648 (2)", - "tab": "General information", - "score": 324.0 - }, - "Prehistory - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Prehistory - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Prehistory - # prompt tokens": { - "description": "min=610.639, mean=610.639, max=610.639, sum=1221.278 (2)", - "tab": "General information", - "score": 610.6388888888889 - }, - "Prehistory - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" - } - } - }, - { - "evaluation_name": "Public Relations", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Public Relations", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.773, - "details": { - "description": "min=0.773, mean=0.773, max=0.773, sum=1.545 (2)", - "tab": "Accuracy", - "Public Relations - Observed inference time (s)": { - "description": "min=2.068, mean=2.068, max=2.068, sum=4.136 (2)", - "tab": "Efficiency", - "score": 2.067864069071683 - }, - "Public Relations - # eval": { - "description": "min=110, mean=110, max=110, sum=220 (2)", - "tab": "General information", - "score": 110.0 - }, - "Public Relations - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Public Relations - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Public Relations - # prompt tokens": { - "description": "min=497.991, mean=497.991, max=497.991, sum=995.982 (2)", - "tab": "General information", - "score": 497.9909090909091 - }, - "Public Relations - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" - } - } - }, - { - "evaluation_name": "Security Studies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Security Studies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.812, - "details": { - "description": "min=0.812, mean=0.812, max=0.812, sum=1.624 (2)", - "tab": "Accuracy", - "Security Studies - Observed inference time (s)": { - "description": "min=2.854, mean=2.854, max=2.854, sum=5.708 (2)", - "tab": "Efficiency", - "score": 2.8541687430167686 - }, - "Security Studies - # eval": { - "description": "min=245, mean=245, max=245, sum=490 (2)", - "tab": "General information", - "score": 245.0 - }, - "Security Studies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Security Studies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Security Studies - # prompt tokens": { - "description": "min=1308.804, mean=1308.804, max=1308.804, sum=2617.608 (2)", - "tab": "General information", - "score": 1308.8040816326532 - }, - "Security Studies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" - } - } - }, - { - "evaluation_name": "Sociology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - 
"metric_config": { - "evaluation_description": "EM on Sociology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.886, - "details": { - "description": "min=0.886, mean=0.886, max=0.886, sum=1.771 (2)", - "tab": "Accuracy", - "Sociology - Observed inference time (s)": { - "description": "min=2.362, mean=2.362, max=2.362, sum=4.725 (2)", - "tab": "Efficiency", - "score": 2.362461663004178 - }, - "Sociology - # eval": { - "description": "min=201, mean=201, max=201, sum=402 (2)", - "tab": "General information", - "score": 201.0 - }, - "Sociology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Sociology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Sociology - # prompt tokens": { - "description": "min=532.274, mean=532.274, max=532.274, sum=1064.547 (2)", - "tab": "General information", - "score": 532.273631840796 - }, - "Sociology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" - } - } - }, - { - "evaluation_name": "Virology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Virology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.554, - "details": { - "description": "min=0.554, mean=0.554, max=0.554, sum=1.108 (2)", - "tab": "Accuracy", - "Virology - Observed inference time (s)": { - "description": "min=2.231, mean=2.231, max=2.231, sum=4.462 (2)", - "tab": "Efficiency", - "score": 2.2311078037124084 - }, - "Virology - # eval": { - "description": "min=166, mean=166, max=166, sum=332 (2)", - "tab": "General information", - "score": 166.0 - }, - "Virology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Virology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Virology - # prompt tokens": { - "description": "min=427.651, mean=427.651, max=427.651, sum=855.301 (2)", - "tab": "General information", - "score": 427.65060240963857 - }, - "Virology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" - } - } - }, - { - "evaluation_name": "World Religions", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on World Religions", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.854, - "details": { - "description": "min=0.854, mean=0.854, 
max=0.854, sum=1.708 (2)", - "tab": "Accuracy", - "World Religions - Observed inference time (s)": { - "description": "min=2.237, mean=2.237, max=2.237, sum=4.474 (2)", - "tab": "Efficiency", - "score": 2.2371394411165113 - }, - "World Religions - # eval": { - "description": "min=171, mean=171, max=171, sum=342 (2)", - "tab": "General information", - "score": 171.0 - }, - "World Religions - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "World Religions - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "World Religions - # prompt tokens": { - "description": "min=358.018, mean=358.018, max=358.018, sum=716.035 (2)", - "tab": "General information", - "score": 358.0175438596491 - }, - "World Religions - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" - } - } - }, - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.048, - "details": { - "tab": "Efficiency" - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_mmlu/anthropic/claude-3-5-haiku-20241022/305a7f25-6e22-4146-9678-6a687a701567.json b/data/helm_mmlu/anthropic/claude-3-5-haiku-20241022/305a7f25-6e22-4146-9678-6a687a701567.json deleted file mode 100644 index 76628bf51..000000000 --- a/data/helm_mmlu/anthropic/claude-3-5-haiku-20241022/305a7f25-6e22-4146-9678-6a687a701567.json +++ /dev/null @@ -1,3021 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/anthropic_claude-3-5-haiku-20241022/1770835937.459157", - "retrieved_timestamp": "1770835937.459157", - "source_metadata": { - "source_name": "helm_mmlu", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Claude 3.5 Haiku 20241022", - "id": "anthropic/claude-3-5-haiku-20241022", - "developer": "anthropic", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "MMLU All Subjects", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU All Subjects", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.743, - "details": { - "description": "min=0.359, mean=0.743, max=0.94, sum=84.719 (114)", - "tab": "Accuracy", - "MMLU All Subjects - Observed inference time (s)": { - "description": "min=0.909, mean=1.108, max=1.572, sum=126.32 (114)", - "tab": "Efficiency", - "score": 1.1080717974066416 - }, - "MMLU All Subjects - # eval": { - 
"description": "min=100, mean=246.351, max=1534, sum=28084 (114)", - "tab": "General information", - "score": 246.35087719298247 - }, - "MMLU All Subjects - # train": { - "description": "min=5, mean=5, max=5, sum=570 (114)", - "tab": "General information", - "score": 5.0 - }, - "MMLU All Subjects - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (114)", - "tab": "General information", - "score": 0.0 - }, - "MMLU All Subjects - # prompt tokens": { - "description": "min=293.018, mean=638.288, max=2887.576, sum=72764.875 (114)", - "tab": "General information", - "score": 638.2883793758953 - }, - "MMLU All Subjects - # output tokens": { - "description": "min=1, mean=1, max=1, sum=114 (114)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - 
"mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] - } - } - }, - { - "evaluation_name": "Abstract Algebra", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Abstract Algebra", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.47, - "details": { - "description": "min=0.47, mean=0.47, max=0.47, sum=0.94 (2)", - "tab": "Accuracy", - "Abstract Algebra - Observed inference time (s)": { - "description": "min=0.909, mean=0.909, max=0.909, sum=1.819 (2)", - "tab": "Efficiency", - "score": 0.9094081521034241 - }, - "Abstract Algebra - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Abstract Algebra - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Abstract Algebra - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Abstract Algebra - # prompt tokens": { - "description": "min=370.26, mean=370.26, max=370.26, sum=740.52 (2)", - "tab": "General information", - "score": 370.26 - }, - "Abstract Algebra - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" - } - } - }, - { - "evaluation_name": "Anatomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Anatomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.793, - "details": { - "description": "min=0.793, mean=0.793, max=0.793, sum=1.585 (2)", - "tab": "Accuracy", - "Anatomy - Observed inference time (s)": { - "description": "min=1.124, mean=1.124, max=1.124, sum=2.247 (2)", - "tab": "Efficiency", - "score": 1.1236292309231228 - }, - "Anatomy - # eval": { - "description": "min=135, mean=135, max=135, sum=270 (2)", - "tab": "General information", - "score": 135.0 - }, - "Anatomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Anatomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Anatomy - # prompt tokens": { - "description": "min=370.8, mean=370.8, max=370.8, sum=741.6 (2)", - "tab": "General information", - "score": 370.8 - }, - "Anatomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "anatomy", - "method": "multiple_choice_joint", - 
"eval_split": "test", - "groups": "mmlu_anatomy" - } - } - }, - { - "evaluation_name": "College Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on College Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.52, - "details": { - "description": "min=0.52, mean=0.52, max=0.52, sum=1.039 (2)", - "tab": "Accuracy", - "College Chemistry - Observed inference time (s)": { - "description": "min=1.196, mean=1.196, max=1.196, sum=2.392 (2)", - "tab": "Efficiency", - "score": 1.1962119388580321 - }, - "College Biology - Observed inference time (s)": { - "description": "min=1.247, mean=1.247, max=1.247, sum=2.494 (2)", - "tab": "Efficiency", - "score": 1.2467927502261267 - }, - "College Computer Science - Observed inference time (s)": { - "description": "min=1.572, mean=1.572, max=1.572, sum=3.144 (2)", - "tab": "Efficiency", - "score": 1.5719245457649231 - }, - "College Mathematics - Observed inference time (s)": { - "description": "min=1.13, mean=1.13, max=1.13, sum=2.26 (2)", - "tab": "Efficiency", - "score": 1.1302329087257386 - }, - "College Medicine - Observed inference time (s)": { - "description": "min=1.259, mean=1.259, max=1.259, sum=2.517 (2)", - "tab": "Efficiency", - "score": 1.2587321479885565 - }, - "College Physics - Observed inference time (s)": { - "description": "min=1.261, mean=1.261, max=1.261, sum=2.521 (2)", - "tab": "Efficiency", - "score": 1.2606473857281255 - }, - "College Chemistry - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Chemistry - # prompt tokens": { - "description": "min=550.01, mean=550.01, max=550.01, sum=1100.02 (2)", - "tab": "General information", - "score": 550.01 - }, - "College Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Biology - # eval": { - "description": "min=144, mean=144, max=144, sum=288 (2)", - "tab": "General information", - "score": 144.0 - }, - "College Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # prompt tokens": { - "description": "min=490.347, mean=490.347, max=490.347, sum=980.694 (2)", - "tab": "General information", - "score": 490.34722222222223 - }, - "College Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Computer Science - 
truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer Science - # prompt tokens": { - "description": "min=838.24, mean=838.24, max=838.24, sum=1676.48 (2)", - "tab": "General information", - "score": 838.24 - }, - "College Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Mathematics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # prompt tokens": { - "description": "min=604.19, mean=604.19, max=604.19, sum=1208.38 (2)", - "tab": "General information", - "score": 604.19 - }, - "College Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Medicine - # eval": { - "description": "min=173, mean=173, max=173, sum=346 (2)", - "tab": "General information", - "score": 173.0 - }, - "College Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Medicine - # prompt tokens": { - "description": "min=540.63, mean=540.63, max=540.63, sum=1081.26 (2)", - "tab": "General information", - "score": 540.6300578034682 - }, - "College Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Physics - # eval": { - "description": "min=102, mean=102, max=102, sum=204 (2)", - "tab": "General information", - "score": 102.0 - }, - "College Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Physics - # prompt tokens": { - "description": "min=489.48, mean=489.48, max=489.48, sum=978.961 (2)", - "tab": "General information", - "score": 489.48039215686276 - }, - "College Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" - } - } - }, - { - "evaluation_name": "Computer Security", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Computer Security", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.84, - "details": { - "description": "min=0.84, mean=0.84, max=0.84, sum=1.68 (2)", - "tab": "Accuracy", - "Computer Security - Observed inference time (s)": { - "description": 
"min=1.013, mean=1.013, max=1.013, sum=2.027 (2)", - "tab": "Efficiency", - "score": 1.0133756017684936 - }, - "Computer Security - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Computer Security - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Computer Security - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Computer Security - # prompt tokens": { - "description": "min=398.62, mean=398.62, max=398.62, sum=797.24 (2)", - "tab": "General information", - "score": 398.62 - }, - "Computer Security - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" - } - } - }, - { - "evaluation_name": "Econometrics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Econometrics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.596, - "details": { - "description": "min=0.596, mean=0.596, max=0.596, sum=1.193 (2)", - "tab": "Accuracy", - "Econometrics - Observed inference time (s)": { - "description": "min=0.922, mean=0.922, max=0.922, sum=1.845 (2)", - "tab": "Efficiency", - "score": 0.9224813549142135 - }, - "Econometrics - # eval": { - "description": "min=114, mean=114, max=114, sum=228 (2)", - "tab": "General information", - "score": 114.0 - }, - "Econometrics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Econometrics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Econometrics - # prompt tokens": { - "description": "min=619.596, mean=619.596, max=619.596, sum=1239.193 (2)", - "tab": "General information", - "score": 619.5964912280701 - }, - "Econometrics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" - } - } - }, - { - "evaluation_name": "Global Facts", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Global Facts", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5, - "details": { - "description": "min=0.5, mean=0.5, max=0.5, sum=1 (2)", - "tab": "Accuracy", - "Global Facts - Observed inference time (s)": { - "description": "min=1.101, mean=1.101, max=1.101, sum=2.201 (2)", - "tab": "Efficiency", - "score": 1.1007365608215331 - }, - "Global Facts - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", 
- "tab": "General information", - "score": 100.0 - }, - "Global Facts - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Global Facts - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Global Facts - # prompt tokens": { - "description": "min=411.61, mean=411.61, max=411.61, sum=823.22 (2)", - "tab": "General information", - "score": 411.61 - }, - "Global Facts - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" - } - } - }, - { - "evaluation_name": "Jurisprudence", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Jurisprudence", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.861, - "details": { - "description": "min=0.861, mean=0.861, max=0.861, sum=1.722 (2)", - "tab": "Accuracy", - "Jurisprudence - Observed inference time (s)": { - "description": "min=1.104, mean=1.104, max=1.104, sum=2.209 (2)", - "tab": "Efficiency", - "score": 1.1042848251484059 - }, - "Jurisprudence - # eval": { - "description": "min=108, mean=108, max=108, sum=216 (2)", - "tab": "General information", - "score": 108.0 - }, - "Jurisprudence - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Jurisprudence - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Jurisprudence - # prompt tokens": { - "description": "min=431.426, mean=431.426, max=431.426, sum=862.852 (2)", - "tab": "General information", - "score": 431.4259259259259 - }, - "Jurisprudence - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" - } - } - }, - { - "evaluation_name": "Philosophy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Philosophy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.823, - "details": { - "description": "min=0.823, mean=0.823, max=0.823, sum=1.646 (2)", - "tab": "Accuracy", - "Philosophy - Observed inference time (s)": { - "description": "min=1.117, mean=1.117, max=1.117, sum=2.233 (2)", - "tab": "Efficiency", - "score": 1.1165370488856767 - }, - "Philosophy - # eval": { - "description": "min=311, mean=311, max=311, sum=622 (2)", - "tab": "General information", - "score": 311.0 - }, - "Philosophy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Philosophy - truncated": { - 
"description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Philosophy - # prompt tokens": { - "description": "min=359.965, mean=359.965, max=359.965, sum=719.929 (2)", - "tab": "General information", - "score": 359.9646302250804 - }, - "Philosophy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" - } - } - }, - { - "evaluation_name": "Professional Psychology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Professional Psychology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.825, - "details": { - "description": "min=0.825, mean=0.825, max=0.825, sum=1.65 (2)", - "tab": "Accuracy", - "Professional Medicine - Observed inference time (s)": { - "description": "min=1.412, mean=1.412, max=1.412, sum=2.824 (2)", - "tab": "Efficiency", - "score": 1.4119182877680834 - }, - "Professional Accounting - Observed inference time (s)": { - "description": "min=0.984, mean=0.984, max=0.984, sum=1.967 (2)", - "tab": "Efficiency", - "score": 0.9836687187776498 - }, - "Professional Law - Observed inference time (s)": { - "description": "min=1.016, mean=1.016, max=1.016, sum=2.032 (2)", - "tab": "Efficiency", - "score": 1.0160297585901412 - }, - "Professional Psychology - Observed inference time (s)": { - "description": "min=0.979, mean=0.979, max=0.979, sum=1.958 (2)", - "tab": "Efficiency", - "score": 0.9789344672284095 - }, - "Professional Medicine - # eval": { - "description": "min=272, mean=272, max=272, sum=544 (2)", - "tab": "General information", - "score": 272.0 - }, - "Professional Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Medicine - # prompt tokens": { - "description": "min=1123.537, mean=1123.537, max=1123.537, sum=2247.074 (2)", - "tab": "General information", - "score": 1123.5367647058824 - }, - "Professional Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Accounting - # eval": { - "description": "min=282, mean=282, max=282, sum=564 (2)", - "tab": "General information", - "score": 282.0 - }, - "Professional Accounting - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Accounting - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Accounting - # prompt tokens": { - "description": "min=665.422, mean=665.422, max=665.422, sum=1330.844 (2)", - "tab": "General information", - "score": 665.4219858156029 - }, - "Professional Accounting - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Law - # eval": { - 
"description": "min=1534, mean=1534, max=1534, sum=3068 (2)", - "tab": "General information", - "score": 1534.0 - }, - "Professional Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # prompt tokens": { - "description": "min=1701.16, mean=1701.16, max=1701.16, sum=3402.321 (2)", - "tab": "General information", - "score": 1701.16036505867 - }, - "Professional Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Psychology - # eval": { - "description": "min=612, mean=612, max=612, sum=1224 (2)", - "tab": "General information", - "score": 612.0 - }, - "Professional Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # prompt tokens": { - "description": "min=603.168, mean=603.168, max=603.168, sum=1206.337 (2)", - "tab": "General information", - "score": 603.1683006535948 - }, - "Professional Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" - } - } - }, - { - "evaluation_name": "Us Foreign Policy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Us Foreign Policy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.94, - "details": { - "description": "min=0.94, mean=0.94, max=0.94, sum=1.88 (2)", - "tab": "Accuracy", - "Us Foreign Policy - Observed inference time (s)": { - "description": "min=0.97, mean=0.97, max=0.97, sum=1.941 (2)", - "tab": "Efficiency", - "score": 0.9703591632843017 - }, - "Us Foreign Policy - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Us Foreign Policy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Us Foreign Policy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Us Foreign Policy - # prompt tokens": { - "description": "min=455.25, mean=455.25, max=455.25, sum=910.5 (2)", - "tab": "General information", - "score": 455.25 - }, - "Us Foreign Policy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" - } - } - }, - { - "evaluation_name": "Astronomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - 
"url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Astronomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.829, - "details": { - "description": "min=0.829, mean=0.829, max=0.829, sum=1.658 (2)", - "tab": "Accuracy", - "Astronomy - Observed inference time (s)": { - "description": "min=1.18, mean=1.18, max=1.18, sum=2.36 (2)", - "tab": "Efficiency", - "score": 1.1798271034893237 - }, - "Astronomy - # eval": { - "description": "min=152, mean=152, max=152, sum=304 (2)", - "tab": "General information", - "score": 152.0 - }, - "Astronomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Astronomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Astronomy - # prompt tokens": { - "description": "min=604.493, mean=604.493, max=604.493, sum=1208.987 (2)", - "tab": "General information", - "score": 604.4934210526316 - }, - "Astronomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" - } - } - }, - { - "evaluation_name": "Business Ethics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Business Ethics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8, - "details": { - "description": "min=0.8, mean=0.8, max=0.8, sum=1.6 (2)", - "tab": "Accuracy", - "Business Ethics - Observed inference time (s)": { - "description": "min=1.147, mean=1.147, max=1.147, sum=2.295 (2)", - "tab": "Efficiency", - "score": 1.1473834657669066 - }, - "Business Ethics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Business Ethics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Business Ethics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Business Ethics - # prompt tokens": { - "description": "min=600.02, mean=600.02, max=600.02, sum=1200.04 (2)", - "tab": "General information", - "score": 600.02 - }, - "Business Ethics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" - } - } - }, - { - "evaluation_name": "Clinical Knowledge", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Clinical Knowledge", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.823, - "details": { - "description": "min=0.823, mean=0.823, max=0.823, sum=1.645 (2)", - "tab": "Accuracy", - "Clinical Knowledge - Observed inference time (s)": { - "description": "min=1.099, mean=1.099, max=1.099, sum=2.198 (2)", - "tab": "Efficiency", - "score": 1.0991604094235403 - }, - "Clinical Knowledge - # eval": { - "description": "min=265, mean=265, max=265, sum=530 (2)", - "tab": "General information", - "score": 265.0 - }, - "Clinical Knowledge - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Clinical Knowledge - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Clinical Knowledge - # prompt tokens": { - "description": "min=429.457, mean=429.457, max=429.457, sum=858.913 (2)", - "tab": "General information", - "score": 429.4566037735849 - }, - "Clinical Knowledge - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" - } - } - }, - { - "evaluation_name": "Conceptual Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Conceptual Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.723, - "details": { - "description": "min=0.723, mean=0.723, max=0.723, sum=1.447 (2)", - "tab": "Accuracy", - "Conceptual Physics - Observed inference time (s)": { - "description": "min=1.537, mean=1.537, max=1.537, sum=3.074 (2)", - "tab": "Efficiency", - "score": 1.536949543242759 - }, - "Conceptual Physics - # eval": { - "description": "min=235, mean=235, max=235, sum=470 (2)", - "tab": "General information", - "score": 235.0 - }, - "Conceptual Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Conceptual Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Conceptual Physics - # prompt tokens": { - "description": "min=323.536, mean=323.536, max=323.536, sum=647.072 (2)", - "tab": "General information", - "score": 323.53617021276597 - }, - "Conceptual Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" - } - } - }, - { - "evaluation_name": "Electrical Engineering", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Electrical Engineering", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.717, - "details": { - "description": "min=0.717, mean=0.717, max=0.717, sum=1.434 (2)", - "tab": "Accuracy", - "Electrical Engineering - Observed inference time (s)": { - "description": "min=1.249, mean=1.249, max=1.249, sum=2.497 (2)", - "tab": "Efficiency", - "score": 1.2485630594450852 - }, - "Electrical Engineering - # eval": { - "description": "min=145, mean=145, max=145, sum=290 (2)", - "tab": "General information", - "score": 145.0 - }, - "Electrical Engineering - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Electrical Engineering - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Electrical Engineering - # prompt tokens": { - "description": "min=437.041, mean=437.041, max=437.041, sum=874.083 (2)", - "tab": "General information", - "score": 437.04137931034484 - }, - "Electrical Engineering - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" - } - } - }, - { - "evaluation_name": "Elementary Mathematics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Elementary Mathematics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.561, - "details": { - "description": "min=0.561, mean=0.561, max=0.561, sum=1.122 (2)", - "tab": "Accuracy", - "Elementary Mathematics - Observed inference time (s)": { - "description": "min=1.558, mean=1.558, max=1.558, sum=3.116 (2)", - "tab": "Efficiency", - "score": 1.5580224965615248 - }, - "Elementary Mathematics - # eval": { - "description": "min=378, mean=378, max=378, sum=756 (2)", - "tab": "General information", - "score": 378.0 - }, - "Elementary Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Elementary Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Elementary Mathematics - # prompt tokens": { - "description": "min=511.066, mean=511.066, max=511.066, sum=1022.132 (2)", - "tab": "General information", - "score": 511.06613756613757 - }, - "Elementary Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" - } - } - }, - { - "evaluation_name": "Formal Logic", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Formal Logic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.619, - "details": { - "description": "min=0.619, mean=0.619, max=0.619, sum=1.238 (2)", - "tab": "Accuracy", - "Formal Logic - Observed inference time (s)": { - "description": "min=1.526, mean=1.526, max=1.526, sum=3.052 (2)", - "tab": "Efficiency", - "score": 1.5258309424869598 - }, - "Formal Logic - # eval": { - "description": "min=126, mean=126, max=126, sum=252 (2)", - "tab": "General information", - "score": 126.0 - }, - "Formal Logic - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Formal Logic - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Formal Logic - # prompt tokens": { - "description": "min=646.746, mean=646.746, max=646.746, sum=1293.492 (2)", - "tab": "General information", - "score": 646.7460317460317 - }, - "Formal Logic - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" - } - } - }, - { - "evaluation_name": "High School World History", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on High School World History", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.882, - "details": { - "description": "min=0.882, mean=0.882, max=0.882, sum=1.764 (2)", - "tab": "Accuracy", - "High School Biology - Observed inference time (s)": { - "description": "min=1.15, mean=1.15, max=1.15, sum=2.299 (2)", - "tab": "Efficiency", - "score": 1.1497065974820044 - }, - "High School Chemistry - Observed inference time (s)": { - "description": "min=1.227, mean=1.227, max=1.227, sum=2.454 (2)", - "tab": "Efficiency", - "score": 1.2272211636228514 - }, - "High School Computer Science - Observed inference time (s)": { - "description": "min=1.014, mean=1.014, max=1.014, sum=2.027 (2)", - "tab": "Efficiency", - "score": 1.0136730527877809 - }, - "High School European History - Observed inference time (s)": { - "description": "min=1.024, mean=1.024, max=1.024, sum=2.047 (2)", - "tab": "Efficiency", - "score": 1.0236461119218305 - }, - "High School Geography - Observed inference time (s)": { - "description": "min=1.059, mean=1.059, max=1.059, sum=2.119 (2)", - "tab": "Efficiency", - "score": 1.0594979368074975 - }, - "High School Government And Politics - Observed inference time (s)": { - "description": "min=1.138, mean=1.138, max=1.138, sum=2.275 (2)", - "tab": "Efficiency", - "score": 1.1376265478875354 - }, - "High School Macroeconomics - Observed inference time (s)": { - "description": "min=1.107, mean=1.107, max=1.107, sum=2.214 (2)", - "tab": "Efficiency", - "score": 1.1069551357856164 - }, - "High School Mathematics - Observed inference time (s)": { - "description": "min=1.094, mean=1.094, max=1.094, sum=2.188 (2)", - "tab": "Efficiency", - "score": 1.0940863344404432 - }, - "High School Microeconomics - Observed inference time (s)": { - "description": "min=1.034, mean=1.034, max=1.034, sum=2.068 (2)", - "tab": "Efficiency", - "score": 1.03420967815303 - }, - "High School 
Physics - Observed inference time (s)": { - "description": "min=1.059, mean=1.059, max=1.059, sum=2.119 (2)", - "tab": "Efficiency", - "score": 1.0594944227610203 - }, - "High School Psychology - Observed inference time (s)": { - "description": "min=1.074, mean=1.074, max=1.074, sum=2.149 (2)", - "tab": "Efficiency", - "score": 1.07433808177983 - }, - "High School Statistics - Observed inference time (s)": { - "description": "min=1.053, mean=1.053, max=1.053, sum=2.107 (2)", - "tab": "Efficiency", - "score": 1.0534564554691315 - }, - "High School US History - Observed inference time (s)": { - "description": "min=1.101, mean=1.101, max=1.101, sum=2.201 (2)", - "tab": "Efficiency", - "score": 1.1006785748051662 - }, - "High School World History - Observed inference time (s)": { - "description": "min=1.093, mean=1.093, max=1.093, sum=2.186 (2)", - "tab": "Efficiency", - "score": 1.0931011674776359 - }, - "High School Biology - # eval": { - "description": "min=310, mean=310, max=310, sum=620 (2)", - "tab": "General information", - "score": 310.0 - }, - "High School Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Biology - # prompt tokens": { - "description": "min=534.577, mean=534.577, max=534.577, sum=1069.155 (2)", - "tab": "General information", - "score": 534.5774193548388 - }, - "High School Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Chemistry - # eval": { - "description": "min=203, mean=203, max=203, sum=406 (2)", - "tab": "General information", - "score": 203.0 - }, - "High School Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # prompt tokens": { - "description": "min=497.921, mean=497.921, max=497.921, sum=995.842 (2)", - "tab": "General information", - "score": 497.92118226600985 - }, - "High School Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "High School Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # prompt tokens": { - "description": "min=882.4, mean=882.4, max=882.4, sum=1764.8 (2)", - "tab": "General information", - "score": 882.4 - }, - "High School Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School European History - # eval": { - "description": "min=165, mean=165, max=165, sum=330 (2)", - "tab": "General information", - "score": 165.0 - }, - "High School European History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", 
- "score": 5.0 - }, - "High School European History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School European History - # prompt tokens": { - "description": "min=2887.576, mean=2887.576, max=2887.576, sum=5775.152 (2)", - "tab": "General information", - "score": 2887.5757575757575 - }, - "High School European History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Geography - # eval": { - "description": "min=198, mean=198, max=198, sum=396 (2)", - "tab": "General information", - "score": 198.0 - }, - "High School Geography - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Geography - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Geography - # prompt tokens": { - "description": "min=412.268, mean=412.268, max=412.268, sum=824.535 (2)", - "tab": "General information", - "score": 412.2676767676768 - }, - "High School Geography - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Government And Politics - # eval": { - "description": "min=193, mean=193, max=193, sum=386 (2)", - "tab": "General information", - "score": 193.0 - }, - "High School Government And Politics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Government And Politics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # prompt tokens": { - "description": "min=491.104, mean=491.104, max=491.104, sum=982.207 (2)", - "tab": "General information", - "score": 491.10362694300517 - }, - "High School Government And Politics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Macroeconomics - # eval": { - "description": "min=390, mean=390, max=390, sum=780 (2)", - "tab": "General information", - "score": 390.0 - }, - "High School Macroeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Macroeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - # prompt tokens": { - "description": "min=406.036, mean=406.036, max=406.036, sum=812.072 (2)", - "tab": "General information", - "score": 406.0358974358974 - }, - "High School Macroeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Mathematics - # eval": { - "description": "min=270, mean=270, max=270, sum=540 (2)", - "tab": "General information", - "score": 270.0 - }, - "High School Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # prompt tokens": { - "description": "min=519.881, mean=519.881, max=519.881, sum=1039.763 (2)", - "tab": 
"General information", - "score": 519.8814814814815 - }, - "High School Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Microeconomics - # eval": { - "description": "min=238, mean=238, max=238, sum=476 (2)", - "tab": "General information", - "score": 238.0 - }, - "High School Microeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Microeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Microeconomics - # prompt tokens": { - "description": "min=420.513, mean=420.513, max=420.513, sum=841.025 (2)", - "tab": "General information", - "score": 420.5126050420168 - }, - "High School Microeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Physics - # eval": { - "description": "min=151, mean=151, max=151, sum=302 (2)", - "tab": "General information", - "score": 151.0 - }, - "High School Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # prompt tokens": { - "description": "min=558.841, mean=558.841, max=558.841, sum=1117.682 (2)", - "tab": "General information", - "score": 558.841059602649 - }, - "High School Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Psychology - # eval": { - "description": "min=545, mean=545, max=545, sum=1090 (2)", - "tab": "General information", - "score": 545.0 - }, - "High School Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # prompt tokens": { - "description": "min=521.42, mean=521.42, max=521.42, sum=1042.84 (2)", - "tab": "General information", - "score": 521.4201834862386 - }, - "High School Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Statistics - # eval": { - "description": "min=216, mean=216, max=216, sum=432 (2)", - "tab": "General information", - "score": 216.0 - }, - "High School Statistics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Statistics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Statistics - # prompt tokens": { - "description": "min=806.963, mean=806.963, max=806.963, sum=1613.926 (2)", - "tab": "General information", - "score": 806.9629629629629 - }, - "High School Statistics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School US History - # eval": { - "description": "min=204, mean=204, max=204, sum=408 (2)", - "tab": "General information", - "score": 204.0 - }, - "High School US History - # train": { - 
"description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School US History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # prompt tokens": { - "description": "min=2288.49, mean=2288.49, max=2288.49, sum=4576.98 (2)", - "tab": "General information", - "score": 2288.4901960784314 - }, - "High School US History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School World History - # eval": { - "description": "min=237, mean=237, max=237, sum=474 (2)", - "tab": "General information", - "score": 237.0 - }, - "High School World History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School World History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School World History - # prompt tokens": { - "description": "min=1475.932, mean=1475.932, max=1475.932, sum=2951.865 (2)", - "tab": "General information", - "score": 1475.9324894514768 - }, - "High School World History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" - } - } - }, - { - "evaluation_name": "Human Sexuality", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Human Sexuality", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.885, - "details": { - "description": "min=0.885, mean=0.885, max=0.885, sum=1.771 (2)", - "tab": "Accuracy", - "Human Aging - Observed inference time (s)": { - "description": "min=1.084, mean=1.084, max=1.084, sum=2.169 (2)", - "tab": "Efficiency", - "score": 1.0844623775225584 - }, - "Human Sexuality - Observed inference time (s)": { - "description": "min=1.056, mean=1.056, max=1.056, sum=2.112 (2)", - "tab": "Efficiency", - "score": 1.0560545211529915 - }, - "Human Aging - # eval": { - "description": "min=223, mean=223, max=223, sum=446 (2)", - "tab": "General information", - "score": 223.0 - }, - "Human Aging - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Aging - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Aging - # prompt tokens": { - "description": "min=335.955, mean=335.955, max=335.955, sum=671.91 (2)", - "tab": "General information", - "score": 335.95515695067263 - }, - "Human Aging - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Human Sexuality - # eval": { - "description": "min=131, mean=131, max=131, sum=262 (2)", - "tab": "General information", - "score": 131.0 - }, - "Human Sexuality - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 
5.0 - }, - "Human Sexuality - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # prompt tokens": { - "description": "min=371.496, mean=371.496, max=371.496, sum=742.992 (2)", - "tab": "General information", - "score": 371.4961832061069 - }, - "Human Sexuality - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" - } - } - }, - { - "evaluation_name": "International Law", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on International Law", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.884, - "details": { - "description": "min=0.884, mean=0.884, max=0.884, sum=1.769 (2)", - "tab": "Accuracy", - "International Law - Observed inference time (s)": { - "description": "min=1.112, mean=1.112, max=1.112, sum=2.225 (2)", - "tab": "Efficiency", - "score": 1.1124236544301687 - }, - "International Law - # eval": { - "description": "min=121, mean=121, max=121, sum=242 (2)", - "tab": "General information", - "score": 121.0 - }, - "International Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "International Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "International Law - # prompt tokens": { - "description": "min=664.165, mean=664.165, max=664.165, sum=1328.331 (2)", - "tab": "General information", - "score": 664.1652892561983 - }, - "International Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" - } - } - }, - { - "evaluation_name": "Logical Fallacies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Logical Fallacies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.822, - "details": { - "description": "min=0.822, mean=0.822, max=0.822, sum=1.644 (2)", - "tab": "Accuracy", - "Logical Fallacies - Observed inference time (s)": { - "description": "min=1.015, mean=1.015, max=1.015, sum=2.03 (2)", - "tab": "Efficiency", - "score": 1.0148307984591993 - }, - "Logical Fallacies - # eval": { - "description": "min=163, mean=163, max=163, sum=326 (2)", - "tab": "General information", - "score": 163.0 - }, - "Logical Fallacies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Logical Fallacies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General 
information", - "score": 0.0 - }, - "Logical Fallacies - # prompt tokens": { - "description": "min=470.276, mean=470.276, max=470.276, sum=940.552 (2)", - "tab": "General information", - "score": 470.2760736196319 - }, - "Logical Fallacies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" - } - } - }, - { - "evaluation_name": "Machine Learning", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Machine Learning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.518, - "details": { - "description": "min=0.518, mean=0.518, max=0.518, sum=1.036 (2)", - "tab": "Accuracy", - "Machine Learning - Observed inference time (s)": { - "description": "min=1.067, mean=1.067, max=1.067, sum=2.135 (2)", - "tab": "Efficiency", - "score": 1.0673569909163885 - }, - "Machine Learning - # eval": { - "description": "min=112, mean=112, max=112, sum=224 (2)", - "tab": "General information", - "score": 112.0 - }, - "Machine Learning - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Machine Learning - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Machine Learning - # prompt tokens": { - "description": "min=676.518, mean=676.518, max=676.518, sum=1353.036 (2)", - "tab": "General information", - "score": 676.5178571428571 - }, - "Machine Learning - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" - } - } - }, - { - "evaluation_name": "Management", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Management", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.845, - "details": { - "description": "min=0.845, mean=0.845, max=0.845, sum=1.689 (2)", - "tab": "Accuracy", - "Management - Observed inference time (s)": { - "description": "min=1.038, mean=1.038, max=1.038, sum=2.076 (2)", - "tab": "Efficiency", - "score": 1.0377622229381673 - }, - "Management - # eval": { - "description": "min=103, mean=103, max=103, sum=206 (2)", - "tab": "General information", - "score": 103.0 - }, - "Management - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Management - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Management - # prompt tokens": { - "description": "min=301.282, mean=301.282, max=301.282, sum=602.563 (2)", - "tab": 
"General information", - "score": 301.28155339805824 - }, - "Management - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" - } - } - }, - { - "evaluation_name": "Marketing", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Marketing", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.897, - "details": { - "description": "min=0.897, mean=0.897, max=0.897, sum=1.795 (2)", - "tab": "Accuracy", - "Marketing - Observed inference time (s)": { - "description": "min=0.993, mean=0.993, max=0.993, sum=1.986 (2)", - "tab": "Efficiency", - "score": 0.9929133276654105 - }, - "Marketing - # eval": { - "description": "min=234, mean=234, max=234, sum=468 (2)", - "tab": "General information", - "score": 234.0 - }, - "Marketing - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Marketing - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Marketing - # prompt tokens": { - "description": "min=448.064, mean=448.064, max=448.064, sum=896.128 (2)", - "tab": "General information", - "score": 448.06410256410254 - }, - "Marketing - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" - } - } - }, - { - "evaluation_name": "Medical Genetics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Medical Genetics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.83, - "details": { - "description": "min=0.83, mean=0.83, max=0.83, sum=1.66 (2)", - "tab": "Accuracy", - "Medical Genetics - Observed inference time (s)": { - "description": "min=1.041, mean=1.041, max=1.041, sum=2.082 (2)", - "tab": "Efficiency", - "score": 1.041243133544922 - }, - "Medical Genetics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Medical Genetics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Medical Genetics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Medical Genetics - # prompt tokens": { - "description": "min=354.88, mean=354.88, max=354.88, sum=709.76 (2)", - "tab": "General information", - "score": 354.88 - }, - "Medical Genetics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": 
{ - "additional_details": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" - } - } - }, - { - "evaluation_name": "Miscellaneous", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Miscellaneous", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.905, - "details": { - "description": "min=0.905, mean=0.905, max=0.905, sum=1.811 (2)", - "tab": "Accuracy", - "Miscellaneous - Observed inference time (s)": { - "description": "min=1.043, mean=1.043, max=1.043, sum=2.086 (2)", - "tab": "Efficiency", - "score": 1.0429492231225297 - }, - "Miscellaneous - # eval": { - "description": "min=783, mean=783, max=783, sum=1566 (2)", - "tab": "General information", - "score": 783.0 - }, - "Miscellaneous - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Miscellaneous - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Miscellaneous - # prompt tokens": { - "description": "min=328.628, mean=328.628, max=328.628, sum=657.257 (2)", - "tab": "General information", - "score": 328.62835249042143 - }, - "Miscellaneous - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" - } - } - }, - { - "evaluation_name": "Moral Scenarios", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Moral Scenarios", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.476, - "details": { - "description": "min=0.476, mean=0.476, max=0.476, sum=0.952 (2)", - "tab": "Accuracy", - "Moral Disputes - Observed inference time (s)": { - "description": "min=1.044, mean=1.044, max=1.044, sum=2.088 (2)", - "tab": "Efficiency", - "score": 1.0438106094481627 - }, - "Moral Scenarios - Observed inference time (s)": { - "description": "min=0.96, mean=0.96, max=0.96, sum=1.919 (2)", - "tab": "Efficiency", - "score": 0.95963474492121 - }, - "Moral Disputes - # eval": { - "description": "min=346, mean=346, max=346, sum=692 (2)", - "tab": "General information", - "score": 346.0 - }, - "Moral Disputes - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Disputes - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Disputes - # prompt tokens": { - "description": "min=511.789, mean=511.789, max=511.789, sum=1023.578 (2)", - "tab": "General information", - "score": 511.78901734104045 - }, - "Moral Disputes - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Moral Scenarios - # 
eval": { - "description": "min=895, mean=895, max=895, sum=1790 (2)", - "tab": "General information", - "score": 895.0 - }, - "Moral Scenarios - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Scenarios - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # prompt tokens": { - "description": "min=676.949, mean=676.949, max=676.949, sum=1353.897 (2)", - "tab": "General information", - "score": 676.9486033519553 - }, - "Moral Scenarios - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" - } - } - }, - { - "evaluation_name": "Nutrition", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Nutrition", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.846, - "details": { - "description": "min=0.846, mean=0.846, max=0.846, sum=1.693 (2)", - "tab": "Accuracy", - "Nutrition - Observed inference time (s)": { - "description": "min=0.981, mean=0.981, max=0.981, sum=1.962 (2)", - "tab": "Efficiency", - "score": 0.9811088399949417 - }, - "Nutrition - # eval": { - "description": "min=306, mean=306, max=306, sum=612 (2)", - "tab": "General information", - "score": 306.0 - }, - "Nutrition - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Nutrition - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Nutrition - # prompt tokens": { - "description": "min=617.065, mean=617.065, max=617.065, sum=1234.131 (2)", - "tab": "General information", - "score": 617.0653594771242 - }, - "Nutrition - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" - } - } - }, - { - "evaluation_name": "Prehistory", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Prehistory", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.877, - "details": { - "description": "min=0.877, mean=0.877, max=0.877, sum=1.753 (2)", - "tab": "Accuracy", - "Prehistory - Observed inference time (s)": { - "description": "min=1.003, mean=1.003, max=1.003, sum=2.006 (2)", - "tab": "Efficiency", - "score": 1.0031694571177165 - }, - "Prehistory - # eval": { - "description": "min=324, mean=324, max=324, sum=648 (2)", - "tab": "General information", - "score": 324.0 - }, - "Prehistory - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General 
information", - "score": 5.0 - }, - "Prehistory - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Prehistory - # prompt tokens": { - "description": "min=545.639, mean=545.639, max=545.639, sum=1091.278 (2)", - "tab": "General information", - "score": 545.6388888888889 - }, - "Prehistory - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" - } - } - }, - { - "evaluation_name": "Public Relations", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Public Relations", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.727, - "details": { - "description": "min=0.727, mean=0.727, max=0.727, sum=1.455 (2)", - "tab": "Accuracy", - "Public Relations - Observed inference time (s)": { - "description": "min=0.941, mean=0.941, max=0.941, sum=1.882 (2)", - "tab": "Efficiency", - "score": 0.9410657709295099 - }, - "Public Relations - # eval": { - "description": "min=110, mean=110, max=110, sum=220 (2)", - "tab": "General information", - "score": 110.0 - }, - "Public Relations - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Public Relations - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Public Relations - # prompt tokens": { - "description": "min=432.991, mean=432.991, max=432.991, sum=865.982 (2)", - "tab": "General information", - "score": 432.9909090909091 - }, - "Public Relations - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" - } - } - }, - { - "evaluation_name": "Security Studies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Security Studies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.792, - "details": { - "description": "min=0.792, mean=0.792, max=0.792, sum=1.584 (2)", - "tab": "Accuracy", - "Security Studies - Observed inference time (s)": { - "description": "min=1.016, mean=1.016, max=1.016, sum=2.033 (2)", - "tab": "Efficiency", - "score": 1.0164005843960509 - }, - "Security Studies - # eval": { - "description": "min=245, mean=245, max=245, sum=490 (2)", - "tab": "General information", - "score": 245.0 - }, - "Security Studies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Security Studies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - 
"score": 0.0 - }, - "Security Studies - # prompt tokens": { - "description": "min=1243.804, mean=1243.804, max=1243.804, sum=2487.608 (2)", - "tab": "General information", - "score": 1243.8040816326532 - }, - "Security Studies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" - } - } - }, - { - "evaluation_name": "Sociology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Sociology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.905, - "details": { - "description": "min=0.905, mean=0.905, max=0.905, sum=1.811 (2)", - "tab": "Accuracy", - "Sociology - Observed inference time (s)": { - "description": "min=0.976, mean=0.976, max=0.976, sum=1.952 (2)", - "tab": "Efficiency", - "score": 0.9757713939420026 - }, - "Sociology - # eval": { - "description": "min=201, mean=201, max=201, sum=402 (2)", - "tab": "General information", - "score": 201.0 - }, - "Sociology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Sociology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Sociology - # prompt tokens": { - "description": "min=467.274, mean=467.274, max=467.274, sum=934.547 (2)", - "tab": "General information", - "score": 467.27363184079604 - }, - "Sociology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" - } - } - }, - { - "evaluation_name": "Virology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Virology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.566, - "details": { - "description": "min=0.566, mean=0.566, max=0.566, sum=1.133 (2)", - "tab": "Accuracy", - "Virology - Observed inference time (s)": { - "description": "min=0.929, mean=0.929, max=0.929, sum=1.858 (2)", - "tab": "Efficiency", - "score": 0.9289331062730536 - }, - "Virology - # eval": { - "description": "min=166, mean=166, max=166, sum=332 (2)", - "tab": "General information", - "score": 166.0 - }, - "Virology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Virology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Virology - # prompt tokens": { - "description": "min=362.651, mean=362.651, max=362.651, sum=725.301 (2)", - "tab": "General information", - "score": 362.65060240963857 - }, - "Virology - # output tokens": { - 
"description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" - } - } - }, - { - "evaluation_name": "World Religions", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on World Religions", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.865, - "details": { - "description": "min=0.865, mean=0.865, max=0.865, sum=1.731 (2)", - "tab": "Accuracy", - "World Religions - Observed inference time (s)": { - "description": "min=1.021, mean=1.021, max=1.021, sum=2.042 (2)", - "tab": "Efficiency", - "score": 1.0208685663011339 - }, - "World Religions - # eval": { - "description": "min=171, mean=171, max=171, sum=342 (2)", - "tab": "General information", - "score": 171.0 - }, - "World Religions - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "World Religions - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "World Religions - # prompt tokens": { - "description": "min=293.018, mean=293.018, max=293.018, sum=586.035 (2)", - "tab": "General information", - "score": 293.0175438596491 - }, - "World Religions - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" - } - } - }, - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.128, - "details": { - "tab": "Efficiency" - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_mmlu/anthropic/claude-3-5-sonnet-20240620/c6059976-85a1-40ce-b02f-67e182aa2f7d.json b/data/helm_mmlu/anthropic/claude-3-5-sonnet-20240620/c6059976-85a1-40ce-b02f-67e182aa2f7d.json deleted file mode 100644 index 9d9557efc..000000000 --- a/data/helm_mmlu/anthropic/claude-3-5-sonnet-20240620/c6059976-85a1-40ce-b02f-67e182aa2f7d.json +++ /dev/null @@ -1,3021 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/anthropic_claude-3-5-sonnet-20240620/1770835937.459157", - "retrieved_timestamp": "1770835937.459157", - "source_metadata": { - "source_name": "helm_mmlu", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Claude 3.5 Sonnet 20240620", - "id": "anthropic/claude-3-5-sonnet-20240620", - "developer": "anthropic", - "inference_platform": "unknown" - }, - 
"evaluation_results": [ - { - "evaluation_name": "MMLU All Subjects", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU All Subjects", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.865, - "details": { - "description": "min=0.58, mean=0.865, max=0.98, sum=98.656 (114)", - "tab": "Accuracy", - "MMLU All Subjects - Observed inference time (s)": { - "description": "min=0.765, mean=1.1, max=3.433, sum=125.349 (114)", - "tab": "Efficiency", - "score": 1.099552619745469 - }, - "MMLU All Subjects - # eval": { - "description": "min=100, mean=246.351, max=1534, sum=28084 (114)", - "tab": "General information", - "score": 246.35087719298247 - }, - "MMLU All Subjects - # train": { - "description": "min=5, mean=5, max=5, sum=570 (114)", - "tab": "General information", - "score": 5.0 - }, - "MMLU All Subjects - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (114)", - "tab": "General information", - "score": 0.0 - }, - "MMLU All Subjects - # prompt tokens": { - "description": "min=302.018, mean=647.288, max=2896.576, sum=73790.875 (114)", - "tab": "General information", - "score": 647.2883793758954 - }, - "MMLU All Subjects - # output tokens": { - "description": "min=1, mean=1, max=1, sum=114 (114)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - 
"mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] - } - } - }, - { - "evaluation_name": "Abstract Algebra", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Abstract Algebra", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.75, - "details": { - "description": "min=0.75, mean=0.75, max=0.75, sum=1.5 (2)", - "tab": "Accuracy", - "Abstract Algebra - Observed inference time (s)": { - "description": "min=0.779, mean=0.779, max=0.779, sum=1.558 (2)", - "tab": "Efficiency", - "score": 0.7789034700393677 - }, - "Abstract Algebra - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Abstract Algebra - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Abstract Algebra - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Abstract Algebra - # prompt tokens": { - "description": "min=379.26, mean=379.26, max=379.26, sum=758.52 (2)", - "tab": "General information", - "score": 379.26 - }, - "Abstract Algebra - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" - } - } - }, - { - "evaluation_name": "Anatomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Anatomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.844, - "details": { - "description": "min=0.844, mean=0.844, max=0.844, sum=1.689 (2)", - "tab": "Accuracy", - "Anatomy - Observed inference time (s)": { - "description": "min=0.777, mean=0.777, max=0.777, sum=1.553 (2)", - "tab": "Efficiency", - "score": 
- "score": 0.7767299599117703
- },
- "Anatomy - # eval": {
- "description": "min=135, mean=135, max=135, sum=270 (2)",
- "tab": "General information",
- "score": 135.0
- },
- "Anatomy - # train": {
- "description": "min=5, mean=5, max=5, sum=10 (2)",
- "tab": "General information",
- "score": 5.0
- },
- "Anatomy - truncated": {
- "description": "min=0, mean=0, max=0, sum=0 (2)",
- "tab": "General information",
- "score": 0.0
- },
- "Anatomy - # prompt tokens": {
- "description": "min=379.8, mean=379.8, max=379.8, sum=759.6 (2)",
- "tab": "General information",
- "score": 379.8
- },
- "Anatomy - # output tokens": {
- "description": "min=1, mean=1, max=1, sum=2 (2)",
- "tab": "General information",
- "score": 1.0
- }
- }
- },
- "generation_config": {
- "additional_details": {
- "subject": "anatomy",
- "method": "multiple_choice_joint",
- "eval_split": "test",
- "groups": "mmlu_anatomy"
- }
- }
- },
- {
- "evaluation_name": "College Physics",
- "source_data": {
- "dataset_name": "helm_mmlu",
- "source_type": "url",
- "url": [
- "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
- ]
- },
- "metric_config": {
- "evaluation_description": "EM on College Physics",
- "lower_is_better": false,
- "score_type": "continuous",
- "min_score": 0.0,
- "max_score": 1.0
- },
- "score_details": {
- "score": 0.696,
- "details": {
- "description": "min=0.696, mean=0.696, max=0.696, sum=1.392 (2)",
- "tab": "Accuracy",
- "College Chemistry - Observed inference time (s)": {
- "description": "min=0.797, mean=0.797, max=0.797, sum=1.594 (2)",
- "tab": "Efficiency",
- "score": 0.7968128871917725
- },
- "College Biology - Observed inference time (s)": {
- "description": "min=1.09, mean=1.09, max=1.09, sum=2.18 (2)",
- "tab": "Efficiency",
- "score": 1.0898179478115506
- },
- "College Computer Science - Observed inference time (s)": {
- "description": "min=1.27, mean=1.27, max=1.27, sum=2.539 (2)",
- "tab": "Efficiency",
- "score": 1.2695734238624572
- },
- "College Mathematics - Observed inference time (s)": {
- "description": "min=1.72, mean=1.72, max=1.72, sum=3.439 (2)",
- "tab": "Efficiency",
- "score": 1.7196030735969543
- },
- "College Medicine - Observed inference time (s)": {
- "description": "min=1.28, mean=1.28, max=1.28, sum=2.559 (2)",
- "tab": "Efficiency",
- "score": 1.2795469209637944
- },
- "College Physics - Observed inference time (s)": {
- "description": "min=0.796, mean=0.796, max=0.796, sum=1.591 (2)",
- "tab": "Efficiency",
- "score": 0.7955308311125812
- },
- "College Chemistry - # eval": {
- "description": "min=100, mean=100, max=100, sum=200 (2)",
- "tab": "General information",
- "score": 100.0
- },
- "College Chemistry - # train": {
- "description": "min=5, mean=5, max=5, sum=10 (2)",
- "tab": "General information",
- "score": 5.0
- },
- "College Chemistry - truncated": {
- "description": "min=0, mean=0, max=0, sum=0 (2)",
- "tab": "General information",
- "score": 0.0
- },
- "College Chemistry - # prompt tokens": {
- "description": "min=559.01, mean=559.01, max=559.01, sum=1118.02 (2)",
- "tab": "General information",
- "score": 559.01
- },
- "College Chemistry - # output tokens": {
- "description": "min=1, mean=1, max=1, sum=2 (2)",
- "tab": "General information",
- "score": 1.0
- },
- "College Biology - # eval": {
- "description": "min=144, mean=144, max=144, sum=288 (2)",
- "tab": "General information",
- "score": 144.0
- },
- "College Biology - # train": {
- "description": "min=5, mean=5, max=5, sum=10 (2)",
- "tab": "General information",
- "score": 5.0
- },
- "College Biology - truncated": {
- "description": "min=0, mean=0, max=0, sum=0 (2)",
- "tab": "General information",
- "score": 0.0
- },
- "College Biology - # prompt tokens": {
- "description": "min=499.347, mean=499.347, max=499.347, sum=998.694 (2)",
- "tab": "General information",
- "score": 499.34722222222223
- },
- "College Biology - # output tokens": {
- "description": "min=1, mean=1, max=1, sum=2 (2)",
- "tab": "General information",
- "score": 1.0
- },
- "College Computer Science - # eval": {
- "description": "min=100, mean=100, max=100, sum=200 (2)",
- "tab": "General information",
- "score": 100.0
- },
- "College Computer Science - # train": {
- "description": "min=5, mean=5, max=5, sum=10 (2)",
- "tab": "General information",
- "score": 5.0
- },
- "College Computer Science - truncated": {
- "description": "min=0, mean=0, max=0, sum=0 (2)",
- "tab": "General information",
- "score": 0.0
- },
- "College Computer Science - # prompt tokens": {
- "description": "min=847.24, mean=847.24, max=847.24, sum=1694.48 (2)",
- "tab": "General information",
- "score": 847.24
- },
- "College Computer Science - # output tokens": {
- "description": "min=1, mean=1, max=1, sum=2 (2)",
- "tab": "General information",
- "score": 1.0
- },
- "College Mathematics - # eval": {
- "description": "min=100, mean=100, max=100, sum=200 (2)",
- "tab": "General information",
- "score": 100.0
- },
- "College Mathematics - # train": {
- "description": "min=5, mean=5, max=5, sum=10 (2)",
- "tab": "General information",
- "score": 5.0
- },
- "College Mathematics - truncated": {
- "description": "min=0, mean=0, max=0, sum=0 (2)",
- "tab": "General information",
- "score": 0.0
- },
- "College Mathematics - # prompt tokens": {
- "description": "min=613.19, mean=613.19, max=613.19, sum=1226.38 (2)",
- "tab": "General information",
- "score": 613.19
- },
- "College Mathematics - # output tokens": {
- "description": "min=1, mean=1, max=1, sum=2 (2)",
- "tab": "General information",
- "score": 1.0
- },
- "College Medicine - # eval": {
- "description": "min=173, mean=173, max=173, sum=346 (2)",
- "tab": "General information",
- "score": 173.0
- },
- "College Medicine - # train": {
- "description": "min=5, mean=5, max=5, sum=10 (2)",
- "tab": "General information",
- "score": 5.0
- },
- "College Medicine - truncated": {
- "description": "min=0, mean=0, max=0, sum=0 (2)",
- "tab": "General information",
- "score": 0.0
- },
- "College Medicine - # prompt tokens": {
- "description": "min=549.63, mean=549.63, max=549.63, sum=1099.26 (2)",
- "tab": "General information",
- "score": 549.6300578034682
- },
- "College Medicine - # output tokens": {
- "description": "min=1, mean=1, max=1, sum=2 (2)",
- "tab": "General information",
- "score": 1.0
- },
- "College Physics - # eval": {
- "description": "min=102, mean=102, max=102, sum=204 (2)",
- "tab": "General information",
- "score": 102.0
- },
- "College Physics - # train": {
- "description": "min=5, mean=5, max=5, sum=10 (2)",
- "tab": "General information",
- "score": 5.0
- },
- "College Physics - truncated": {
- "description": "min=0, mean=0, max=0, sum=0 (2)",
- "tab": "General information",
- "score": 0.0
- },
- "College Physics - # prompt tokens": {
- "description": "min=498.48, mean=498.48, max=498.48, sum=996.961 (2)",
- "tab": "General information",
- "score": 498.48039215686276
- },
- "College Physics - # output tokens": {
- "description": "min=1, mean=1, max=1, sum=2 (2)",
- "tab": "General information",
- "score": 1.0
- }
- }
- },
"generation_config": { - "additional_details": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" - } - } - }, - { - "evaluation_name": "Computer Security", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Computer Security", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.89, - "details": { - "description": "min=0.89, mean=0.89, max=0.89, sum=1.78 (2)", - "tab": "Accuracy", - "Computer Security - Observed inference time (s)": { - "description": "min=0.765, mean=0.765, max=0.765, sum=1.531 (2)", - "tab": "Efficiency", - "score": 0.7653794264793397 - }, - "Computer Security - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Computer Security - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Computer Security - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Computer Security - # prompt tokens": { - "description": "min=407.62, mean=407.62, max=407.62, sum=815.24 (2)", - "tab": "General information", - "score": 407.62 - }, - "Computer Security - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" - } - } - }, - { - "evaluation_name": "Econometrics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Econometrics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.807, - "details": { - "description": "min=0.807, mean=0.807, max=0.807, sum=1.614 (2)", - "tab": "Accuracy", - "Econometrics - Observed inference time (s)": { - "description": "min=0.808, mean=0.808, max=0.808, sum=1.615 (2)", - "tab": "Efficiency", - "score": 0.8075556734152007 - }, - "Econometrics - # eval": { - "description": "min=114, mean=114, max=114, sum=228 (2)", - "tab": "General information", - "score": 114.0 - }, - "Econometrics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Econometrics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Econometrics - # prompt tokens": { - "description": "min=628.596, mean=628.596, max=628.596, sum=1257.193 (2)", - "tab": "General information", - "score": 628.5964912280701 - }, - "Econometrics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": 
"mmlu_econometrics" - } - } - }, - { - "evaluation_name": "Global Facts", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Global Facts", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.72, - "details": { - "description": "min=0.72, mean=0.72, max=0.72, sum=1.44 (2)", - "tab": "Accuracy", - "Global Facts - Observed inference time (s)": { - "description": "min=0.785, mean=0.785, max=0.785, sum=1.571 (2)", - "tab": "Efficiency", - "score": 0.785265531539917 - }, - "Global Facts - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Global Facts - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Global Facts - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Global Facts - # prompt tokens": { - "description": "min=420.61, mean=420.61, max=420.61, sum=841.22 (2)", - "tab": "General information", - "score": 420.61 - }, - "Global Facts - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" - } - } - }, - { - "evaluation_name": "Jurisprudence", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Jurisprudence", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.889, - "details": { - "description": "min=0.889, mean=0.889, max=0.889, sum=1.778 (2)", - "tab": "Accuracy", - "Jurisprudence - Observed inference time (s)": { - "description": "min=0.781, mean=0.781, max=0.781, sum=1.563 (2)", - "tab": "Efficiency", - "score": 0.7813034631587841 - }, - "Jurisprudence - # eval": { - "description": "min=108, mean=108, max=108, sum=216 (2)", - "tab": "General information", - "score": 108.0 - }, - "Jurisprudence - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Jurisprudence - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Jurisprudence - # prompt tokens": { - "description": "min=440.426, mean=440.426, max=440.426, sum=880.852 (2)", - "tab": "General information", - "score": 440.4259259259259 - }, - "Jurisprudence - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" - } - } - }, - { - "evaluation_name": "Philosophy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Philosophy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.891, - "details": { - "description": "min=0.891, mean=0.891, max=0.891, sum=1.781 (2)", - "tab": "Accuracy", - "Philosophy - Observed inference time (s)": { - "description": "min=2.168, mean=2.168, max=2.168, sum=4.336 (2)", - "tab": "Efficiency", - "score": 2.1680153757812892 - }, - "Philosophy - # eval": { - "description": "min=311, mean=311, max=311, sum=622 (2)", - "tab": "General information", - "score": 311.0 - }, - "Philosophy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Philosophy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Philosophy - # prompt tokens": { - "description": "min=368.965, mean=368.965, max=368.965, sum=737.929 (2)", - "tab": "General information", - "score": 368.9646302250804 - }, - "Philosophy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" - } - } - }, - { - "evaluation_name": "Professional Psychology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Professional Psychology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.922, - "details": { - "description": "min=0.922, mean=0.922, max=0.922, sum=1.843 (2)", - "tab": "Accuracy", - "Professional Medicine - Observed inference time (s)": { - "description": "min=2.144, mean=2.144, max=2.144, sum=4.287 (2)", - "tab": "Efficiency", - "score": 2.1436235790743545 - }, - "Professional Accounting - Observed inference time (s)": { - "description": "min=2.085, mean=2.085, max=2.085, sum=4.169 (2)", - "tab": "Efficiency", - "score": 2.084580805284757 - }, - "Professional Law - Observed inference time (s)": { - "description": "min=1.308, mean=1.308, max=1.308, sum=2.616 (2)", - "tab": "Efficiency", - "score": 1.3078198053690726 - }, - "Professional Psychology - Observed inference time (s)": { - "description": "min=1.15, mean=1.15, max=1.15, sum=2.301 (2)", - "tab": "Efficiency", - "score": 1.1502779430034114 - }, - "Professional Medicine - # eval": { - "description": "min=272, mean=272, max=272, sum=544 (2)", - "tab": "General information", - "score": 272.0 - }, - "Professional Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Medicine - # prompt tokens": { - "description": "min=1132.537, mean=1132.537, max=1132.537, sum=2265.074 (2)", - "tab": "General information", - "score": 1132.5367647058824 - }, - "Professional Medicine - # output tokens": { - "description": 
"min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Accounting - # eval": { - "description": "min=282, mean=282, max=282, sum=564 (2)", - "tab": "General information", - "score": 282.0 - }, - "Professional Accounting - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Accounting - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Accounting - # prompt tokens": { - "description": "min=674.422, mean=674.422, max=674.422, sum=1348.844 (2)", - "tab": "General information", - "score": 674.4219858156029 - }, - "Professional Accounting - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Law - # eval": { - "description": "min=1534, mean=1534, max=1534, sum=3068 (2)", - "tab": "General information", - "score": 1534.0 - }, - "Professional Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # prompt tokens": { - "description": "min=1710.16, mean=1710.16, max=1710.16, sum=3420.321 (2)", - "tab": "General information", - "score": 1710.16036505867 - }, - "Professional Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Psychology - # eval": { - "description": "min=612, mean=612, max=612, sum=1224 (2)", - "tab": "General information", - "score": 612.0 - }, - "Professional Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # prompt tokens": { - "description": "min=612.168, mean=612.168, max=612.168, sum=1224.337 (2)", - "tab": "General information", - "score": 612.1683006535948 - }, - "Professional Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" - } - } - }, - { - "evaluation_name": "Us Foreign Policy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Us Foreign Policy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.96, - "details": { - "description": "min=0.96, mean=0.96, max=0.96, sum=1.92 (2)", - "tab": "Accuracy", - "Us Foreign Policy - Observed inference time (s)": { - "description": "min=0.973, mean=0.973, max=0.973, sum=1.946 (2)", - "tab": "Efficiency", - "score": 0.9727654385566712 - }, - "Us Foreign Policy - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - 
"Us Foreign Policy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Us Foreign Policy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Us Foreign Policy - # prompt tokens": { - "description": "min=464.25, mean=464.25, max=464.25, sum=928.5 (2)", - "tab": "General information", - "score": 464.25 - }, - "Us Foreign Policy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" - } - } - }, - { - "evaluation_name": "Astronomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Astronomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.961, - "details": { - "description": "min=0.961, mean=0.961, max=0.961, sum=1.921 (2)", - "tab": "Accuracy", - "Astronomy - Observed inference time (s)": { - "description": "min=1.35, mean=1.35, max=1.35, sum=2.7 (2)", - "tab": "Efficiency", - "score": 1.3501500989261426 - }, - "Astronomy - # eval": { - "description": "min=152, mean=152, max=152, sum=304 (2)", - "tab": "General information", - "score": 152.0 - }, - "Astronomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Astronomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Astronomy - # prompt tokens": { - "description": "min=613.493, mean=613.493, max=613.493, sum=1226.987 (2)", - "tab": "General information", - "score": 613.4934210526316 - }, - "Astronomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" - } - } - }, - { - "evaluation_name": "Business Ethics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Business Ethics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.85, - "details": { - "description": "min=0.85, mean=0.85, max=0.85, sum=1.7 (2)", - "tab": "Accuracy", - "Business Ethics - Observed inference time (s)": { - "description": "min=1.326, mean=1.326, max=1.326, sum=2.652 (2)", - "tab": "Efficiency", - "score": 1.325816671848297 - }, - "Business Ethics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Business Ethics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Business Ethics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 
(2)", - "tab": "General information", - "score": 0.0 - }, - "Business Ethics - # prompt tokens": { - "description": "min=609.02, mean=609.02, max=609.02, sum=1218.04 (2)", - "tab": "General information", - "score": 609.02 - }, - "Business Ethics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" - } - } - }, - { - "evaluation_name": "Clinical Knowledge", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Clinical Knowledge", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.913, - "details": { - "description": "min=0.913, mean=0.913, max=0.913, sum=1.826 (2)", - "tab": "Accuracy", - "Clinical Knowledge - Observed inference time (s)": { - "description": "min=1.379, mean=1.379, max=1.379, sum=2.757 (2)", - "tab": "Efficiency", - "score": 1.3787489792086043 - }, - "Clinical Knowledge - # eval": { - "description": "min=265, mean=265, max=265, sum=530 (2)", - "tab": "General information", - "score": 265.0 - }, - "Clinical Knowledge - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Clinical Knowledge - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Clinical Knowledge - # prompt tokens": { - "description": "min=438.457, mean=438.457, max=438.457, sum=876.913 (2)", - "tab": "General information", - "score": 438.4566037735849 - }, - "Clinical Knowledge - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" - } - } - }, - { - "evaluation_name": "Conceptual Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Conceptual Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.885, - "details": { - "description": "min=0.885, mean=0.885, max=0.885, sum=1.77 (2)", - "tab": "Accuracy", - "Conceptual Physics - Observed inference time (s)": { - "description": "min=0.778, mean=0.778, max=0.778, sum=1.556 (2)", - "tab": "Efficiency", - "score": 0.7780434922969087 - }, - "Conceptual Physics - # eval": { - "description": "min=235, mean=235, max=235, sum=470 (2)", - "tab": "General information", - "score": 235.0 - }, - "Conceptual Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Conceptual Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Conceptual Physics - # prompt tokens": { - 
"description": "min=332.536, mean=332.536, max=332.536, sum=665.072 (2)", - "tab": "General information", - "score": 332.53617021276597 - }, - "Conceptual Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" - } - } - }, - { - "evaluation_name": "Electrical Engineering", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Electrical Engineering", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.828, - "details": { - "description": "min=0.828, mean=0.828, max=0.828, sum=1.655 (2)", - "tab": "Accuracy", - "Electrical Engineering - Observed inference time (s)": { - "description": "min=0.79, mean=0.79, max=0.79, sum=1.58 (2)", - "tab": "Efficiency", - "score": 0.789771790340029 - }, - "Electrical Engineering - # eval": { - "description": "min=145, mean=145, max=145, sum=290 (2)", - "tab": "General information", - "score": 145.0 - }, - "Electrical Engineering - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Electrical Engineering - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Electrical Engineering - # prompt tokens": { - "description": "min=446.041, mean=446.041, max=446.041, sum=892.083 (2)", - "tab": "General information", - "score": 446.04137931034484 - }, - "Electrical Engineering - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" - } - } - }, - { - "evaluation_name": "Elementary Mathematics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Elementary Mathematics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.892, - "details": { - "description": "min=0.892, mean=0.892, max=0.892, sum=1.783 (2)", - "tab": "Accuracy", - "Elementary Mathematics - Observed inference time (s)": { - "description": "min=0.806, mean=0.806, max=0.806, sum=1.612 (2)", - "tab": "Efficiency", - "score": 0.8060284802522609 - }, - "Elementary Mathematics - # eval": { - "description": "min=378, mean=378, max=378, sum=756 (2)", - "tab": "General information", - "score": 378.0 - }, - "Elementary Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Elementary Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Elementary Mathematics - # prompt tokens": { - "description": 
"min=520.066, mean=520.066, max=520.066, sum=1040.132 (2)", - "tab": "General information", - "score": 520.0661375661375 - }, - "Elementary Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" - } - } - }, - { - "evaluation_name": "Formal Logic", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Formal Logic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.698, - "details": { - "description": "min=0.698, mean=0.698, max=0.698, sum=1.397 (2)", - "tab": "Accuracy", - "Formal Logic - Observed inference time (s)": { - "description": "min=0.811, mean=0.811, max=0.811, sum=1.623 (2)", - "tab": "Efficiency", - "score": 0.8114165843479217 - }, - "Formal Logic - # eval": { - "description": "min=126, mean=126, max=126, sum=252 (2)", - "tab": "General information", - "score": 126.0 - }, - "Formal Logic - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Formal Logic - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Formal Logic - # prompt tokens": { - "description": "min=655.746, mean=655.746, max=655.746, sum=1311.492 (2)", - "tab": "General information", - "score": 655.7460317460317 - }, - "Formal Logic - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" - } - } - }, - { - "evaluation_name": "High School World History", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on High School World History", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.954, - "details": { - "description": "min=0.954, mean=0.954, max=0.954, sum=1.907 (2)", - "tab": "Accuracy", - "High School Biology - Observed inference time (s)": { - "description": "min=0.802, mean=0.802, max=0.802, sum=1.605 (2)", - "tab": "Efficiency", - "score": 0.8022696918056857 - }, - "High School Chemistry - Observed inference time (s)": { - "description": "min=0.806, mean=0.806, max=0.806, sum=1.612 (2)", - "tab": "Efficiency", - "score": 0.8062427619407917 - }, - "High School Computer Science - Observed inference time (s)": { - "description": "min=0.853, mean=0.853, max=0.853, sum=1.706 (2)", - "tab": "Efficiency", - "score": 0.8532347416877747 - }, - "High School European History - Observed inference time (s)": { - "description": "min=1.183, mean=1.183, max=1.183, sum=2.366 (2)", - "tab": "Efficiency", - "score": 1.1831647526134144 - }, - "High School Geography - Observed 
- "High School Geography - Observed inference time (s)": {
- "description": "min=0.776, mean=0.776, max=0.776, sum=1.553 (2)",
- "tab": "Efficiency",
- "score": 0.7764992966796412
- },
- "High School Government And Politics - Observed inference time (s)": {
- "description": "min=0.802, mean=0.802, max=0.802, sum=1.603 (2)",
- "tab": "Efficiency",
- "score": 0.8015919287587695
- },
- "High School Macroeconomics - Observed inference time (s)": {
- "description": "min=0.782, mean=0.782, max=0.782, sum=1.563 (2)",
- "tab": "Efficiency",
- "score": 0.781673603791457
- },
- "High School Mathematics - Observed inference time (s)": {
- "description": "min=0.805, mean=0.805, max=0.805, sum=1.61 (2)",
- "tab": "Efficiency",
- "score": 0.80511144178885
- },
- "High School Microeconomics - Observed inference time (s)": {
- "description": "min=0.788, mean=0.788, max=0.788, sum=1.576 (2)",
- "tab": "Efficiency",
- "score": 0.7879440243504628
- },
- "High School Physics - Observed inference time (s)": {
- "description": "min=0.829, mean=0.829, max=0.829, sum=1.658 (2)",
- "tab": "Efficiency",
- "score": 0.8290448062467259
- },
- "High School Psychology - Observed inference time (s)": {
- "description": "min=0.807, mean=0.807, max=0.807, sum=1.614 (2)",
- "tab": "Efficiency",
- "score": 0.8071829231507187
- },
- "High School Statistics - Observed inference time (s)": {
- "description": "min=0.812, mean=0.812, max=0.812, sum=1.624 (2)",
- "tab": "Efficiency",
- "score": 0.8119496272669898
- },
- "High School US History - Observed inference time (s)": {
- "description": "min=0.938, mean=0.938, max=0.938, sum=1.877 (2)",
- "tab": "Efficiency",
- "score": 0.9383000193857679
- },
- "High School World History - Observed inference time (s)": {
- "description": "min=1.097, mean=1.097, max=1.097, sum=2.194 (2)",
- "tab": "Efficiency",
- "score": 1.0968722401791986
- },
- "High School Biology - # eval": {
- "description": "min=310, mean=310, max=310, sum=620 (2)",
- "tab": "General information",
- "score": 310.0
- },
- "High School Biology - # train": {
- "description": "min=5, mean=5, max=5, sum=10 (2)",
- "tab": "General information",
- "score": 5.0
- },
- "High School Biology - truncated": {
- "description": "min=0, mean=0, max=0, sum=0 (2)",
- "tab": "General information",
- "score": 0.0
- },
- "High School Biology - # prompt tokens": {
- "description": "min=543.577, mean=543.577, max=543.577, sum=1087.155 (2)",
- "tab": "General information",
- "score": 543.5774193548388
- },
- "High School Biology - # output tokens": {
- "description": "min=1, mean=1, max=1, sum=2 (2)",
- "tab": "General information",
- "score": 1.0
- },
- "High School Chemistry - # eval": {
- "description": "min=203, mean=203, max=203, sum=406 (2)",
- "tab": "General information",
- "score": 203.0
- },
- "High School Chemistry - # train": {
- "description": "min=5, mean=5, max=5, sum=10 (2)",
- "tab": "General information",
- "score": 5.0
- },
- "High School Chemistry - truncated": {
- "description": "min=0, mean=0, max=0, sum=0 (2)",
- "tab": "General information",
- "score": 0.0
- },
- "High School Chemistry - # prompt tokens": {
- "description": "min=506.921, mean=506.921, max=506.921, sum=1013.842 (2)",
- "tab": "General information",
- "score": 506.92118226600985
- },
- "High School Chemistry - # output tokens": {
- "description": "min=1, mean=1, max=1, sum=2 (2)",
- "tab": "General information",
- "score": 1.0
- },
- "High School Computer Science - # eval": {
- "description": "min=100, mean=100, max=100, sum=200 (2)",
- "tab": "General information",
- "score": 100.0
- },
- "High School Computer Science - # train": {
- "description": "min=5, mean=5, max=5, sum=10 (2)",
- "tab": "General information",
- "score": 5.0
- },
- "High School Computer Science - truncated": {
- "description": "min=0, mean=0, max=0, sum=0 (2)",
- "tab": "General information",
- "score": 0.0
- },
- "High School Computer Science - # prompt tokens": {
- "description": "min=891.4, mean=891.4, max=891.4, sum=1782.8 (2)",
- "tab": "General information",
- "score": 891.4
- },
- "High School Computer Science - # output tokens": {
- "description": "min=1, mean=1, max=1, sum=2 (2)",
- "tab": "General information",
- "score": 1.0
- },
- "High School European History - # eval": {
- "description": "min=165, mean=165, max=165, sum=330 (2)",
- "tab": "General information",
- "score": 165.0
- },
- "High School European History - # train": {
- "description": "min=5, mean=5, max=5, sum=10 (2)",
- "tab": "General information",
- "score": 5.0
- },
- "High School European History - truncated": {
- "description": "min=0, mean=0, max=0, sum=0 (2)",
- "tab": "General information",
- "score": 0.0
- },
- "High School European History - # prompt tokens": {
- "description": "min=2896.576, mean=2896.576, max=2896.576, sum=5793.152 (2)",
- "tab": "General information",
- "score": 2896.5757575757575
- },
- "High School European History - # output tokens": {
- "description": "min=1, mean=1, max=1, sum=2 (2)",
- "tab": "General information",
- "score": 1.0
- },
- "High School Geography - # eval": {
- "description": "min=198, mean=198, max=198, sum=396 (2)",
- "tab": "General information",
- "score": 198.0
- },
- "High School Geography - # train": {
- "description": "min=5, mean=5, max=5, sum=10 (2)",
- "tab": "General information",
- "score": 5.0
- },
- "High School Geography - truncated": {
- "description": "min=0, mean=0, max=0, sum=0 (2)",
- "tab": "General information",
- "score": 0.0
- },
- "High School Geography - # prompt tokens": {
- "description": "min=421.268, mean=421.268, max=421.268, sum=842.535 (2)",
- "tab": "General information",
- "score": 421.2676767676768
- },
- "High School Geography - # output tokens": {
- "description": "min=1, mean=1, max=1, sum=2 (2)",
- "tab": "General information",
- "score": 1.0
- },
- "High School Government And Politics - # eval": {
- "description": "min=193, mean=193, max=193, sum=386 (2)",
- "tab": "General information",
- "score": 193.0
- },
- "High School Government And Politics - # train": {
- "description": "min=5, mean=5, max=5, sum=10 (2)",
- "tab": "General information",
- "score": 5.0
- },
- "High School Government And Politics - truncated": {
- "description": "min=0, mean=0, max=0, sum=0 (2)",
- "tab": "General information",
- "score": 0.0
- },
- "High School Government And Politics - # prompt tokens": {
- "description": "min=500.104, mean=500.104, max=500.104, sum=1000.207 (2)",
- "tab": "General information",
- "score": 500.10362694300517
- },
- "High School Government And Politics - # output tokens": {
- "description": "min=1, mean=1, max=1, sum=2 (2)",
- "tab": "General information",
- "score": 1.0
- },
- "High School Macroeconomics - # eval": {
- "description": "min=390, mean=390, max=390, sum=780 (2)",
- "tab": "General information",
- "score": 390.0
- },
- "High School Macroeconomics - # train": {
- "description": "min=5, mean=5, max=5, sum=10 (2)",
- "tab": "General information",
- "score": 5.0
- },
- "High School Macroeconomics - truncated": {
- "description": "min=0, mean=0, max=0, sum=0 (2)",
- "tab": "General information",
- "score": 0.0
- },
- "High School Macroeconomics - # prompt tokens": {
- "description": "min=415.036, mean=415.036, max=415.036, sum=830.072 (2)",
- "tab": "General information",
- "score": 415.0358974358974
- },
- "High School Macroeconomics - # output tokens": {
- "description": "min=1, mean=1, max=1, sum=2 (2)",
- "tab": "General information",
- "score": 1.0
- },
- "High School Mathematics - # eval": {
- "description": "min=270, mean=270, max=270, sum=540 (2)",
- "tab": "General information",
- "score": 270.0
- },
- "High School Mathematics - # train": {
- "description": "min=5, mean=5, max=5, sum=10 (2)",
- "tab": "General information",
- "score": 5.0
- },
- "High School Mathematics - truncated": {
- "description": "min=0, mean=0, max=0, sum=0 (2)",
- "tab": "General information",
- "score": 0.0
- },
- "High School Mathematics - # prompt tokens": {
- "description": "min=528.881, mean=528.881, max=528.881, sum=1057.763 (2)",
- "tab": "General information",
- "score": 528.8814814814815
- },
- "High School Mathematics - # output tokens": {
- "description": "min=1, mean=1, max=1, sum=2 (2)",
- "tab": "General information",
- "score": 1.0
- },
- "High School Microeconomics - # eval": {
- "description": "min=238, mean=238, max=238, sum=476 (2)",
- "tab": "General information",
- "score": 238.0
- },
- "High School Microeconomics - # train": {
- "description": "min=5, mean=5, max=5, sum=10 (2)",
- "tab": "General information",
- "score": 5.0
- },
- "High School Microeconomics - truncated": {
- "description": "min=0, mean=0, max=0, sum=0 (2)",
- "tab": "General information",
- "score": 0.0
- },
- "High School Microeconomics - # prompt tokens": {
- "description": "min=429.513, mean=429.513, max=429.513, sum=859.025 (2)",
- "tab": "General information",
- "score": 429.5126050420168
- },
- "High School Microeconomics - # output tokens": {
- "description": "min=1, mean=1, max=1, sum=2 (2)",
- "tab": "General information",
- "score": 1.0
- },
- "High School Physics - # eval": {
- "description": "min=151, mean=151, max=151, sum=302 (2)",
- "tab": "General information",
- "score": 151.0
- },
- "High School Physics - # train": {
- "description": "min=5, mean=5, max=5, sum=10 (2)",
- "tab": "General information",
- "score": 5.0
- },
- "High School Physics - truncated": {
- "description": "min=0, mean=0, max=0, sum=0 (2)",
- "tab": "General information",
- "score": 0.0
- },
- "High School Physics - # prompt tokens": {
- "description": "min=567.841, mean=567.841, max=567.841, sum=1135.682 (2)",
- "tab": "General information",
- "score": 567.841059602649
- },
- "High School Physics - # output tokens": {
- "description": "min=1, mean=1, max=1, sum=2 (2)",
- "tab": "General information",
- "score": 1.0
- },
- "High School Psychology - # eval": {
- "description": "min=545, mean=545, max=545, sum=1090 (2)",
- "tab": "General information",
- "score": 545.0
- },
- "High School Psychology - # train": {
- "description": "min=5, mean=5, max=5, sum=10 (2)",
- "tab": "General information",
- "score": 5.0
- },
- "High School Psychology - truncated": {
- "description": "min=0, mean=0, max=0, sum=0 (2)",
- "tab": "General information",
- "score": 0.0
- },
- "High School Psychology - # prompt tokens": {
- "description": "min=530.42, mean=530.42, max=530.42, sum=1060.84 (2)",
- "tab": "General information",
- "score": 530.4201834862386
- },
- "High School Psychology - # output tokens": {
- "description": "min=1, mean=1, max=1, sum=2 (2)",
- "tab": "General information",
- "score": 1.0
- },
- "High School Statistics - # eval": {
- "description": "min=216, mean=216, max=216, sum=432 (2)",
- "tab": "General information",
- "score": 216.0
- },
- "High School Statistics - # train": {
- "description": "min=5, mean=5, max=5, sum=10 (2)",
- "tab": "General information",
- "score": 5.0
- },
- "High School Statistics - truncated": {
- "description": "min=0, mean=0, max=0, sum=0 (2)",
- "tab": "General information",
- "score": 0.0
- },
- "High School Statistics - # prompt tokens": {
- "description": "min=815.963, mean=815.963, max=815.963, sum=1631.926 (2)",
- "tab": "General information",
- "score": 815.9629629629629
- },
- "High School Statistics - # output tokens": {
- "description": "min=1, mean=1, max=1, sum=2 (2)",
- "tab": "General information",
- "score": 1.0
- },
- "High School US History - # eval": {
- "description": "min=204, mean=204, max=204, sum=408 (2)",
- "tab": "General information",
- "score": 204.0
- },
- "High School US History - # train": {
- "description": "min=5, mean=5, max=5, sum=10 (2)",
- "tab": "General information",
- "score": 5.0
- },
- "High School US History - truncated": {
- "description": "min=0, mean=0, max=0, sum=0 (2)",
- "tab": "General information",
- "score": 0.0
- },
- "High School US History - # prompt tokens": {
- "description": "min=2297.49, mean=2297.49, max=2297.49, sum=4594.98 (2)",
- "tab": "General information",
- "score": 2297.4901960784314
- },
- "High School US History - # output tokens": {
- "description": "min=1, mean=1, max=1, sum=2 (2)",
- "tab": "General information",
- "score": 1.0
- },
- "High School World History - # eval": {
- "description": "min=237, mean=237, max=237, sum=474 (2)",
- "tab": "General information",
- "score": 237.0
- },
- "High School World History - # train": {
- "description": "min=5, mean=5, max=5, sum=10 (2)",
- "tab": "General information",
- "score": 5.0
- },
- "High School World History - truncated": {
- "description": "min=0, mean=0, max=0, sum=0 (2)",
- "tab": "General information",
- "score": 0.0
- },
- "High School World History - # prompt tokens": {
- "description": "min=1484.932, mean=1484.932, max=1484.932, sum=2969.865 (2)",
- "tab": "General information",
- "score": 1484.9324894514768
- },
- "High School World History - # output tokens": {
- "description": "min=1, mean=1, max=1, sum=2 (2)",
- "tab": "General information",
- "score": 1.0
- }
- }
- },
- "generation_config": {
- "additional_details": {
- "subject": "high_school_world_history",
- "method": "multiple_choice_joint",
- "eval_split": "test",
- "groups": "mmlu_high_school_world_history"
- }
- }
- },
- {
- "evaluation_name": "Human Sexuality",
- "source_data": {
- "dataset_name": "helm_mmlu",
- "source_type": "url",
- "url": [
- "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
- ]
- },
- "metric_config": {
- "evaluation_description": "EM on Human Sexuality",
- "lower_is_better": false,
- "score_type": "continuous",
- "min_score": 0.0,
- "max_score": 1.0
- },
- "score_details": {
- "score": 0.939,
- "details": {
- "description": "min=0.939, mean=0.939, max=0.939, sum=1.878 (2)",
- "tab": "Accuracy",
- "Human Aging - Observed inference time (s)": {
- "description": "min=0.785, mean=0.785, max=0.785, sum=1.569 (2)",
- "tab": "Efficiency",
- "score": 0.7847084699724822
- },
- "Human Sexuality - Observed inference time (s)": {
- "description": "min=0.811, mean=0.811, max=0.811, sum=1.622 (2)",
- "tab": "Efficiency",
- "score": 0.8110958565282458
- },
- "Human Aging - # eval": {
- "description": "min=223, mean=223, max=223, sum=446 (2)",
- "tab": "General information",
- "score": 223.0
- },
- "Human Aging - # train": {
- "description": "min=5, mean=5, max=5, sum=10 (2)",
- "tab": "General information",
- "score": 5.0
- },
- "Human Aging - truncated": {
- "description": "min=0, mean=0, max=0, sum=0 (2)",
- "tab": "General information",
- "score": 0.0
- },
- "Human Aging - # prompt tokens": {
- "description": "min=344.955, mean=344.955, max=344.955, sum=689.91 (2)",
- "tab": "General information",
- "score": 344.95515695067263
- },
- "Human Aging - # output tokens": {
- "description": "min=1, mean=1, max=1, sum=2 (2)",
- "tab": "General information",
- "score": 1.0
- },
- "Human Sexuality - # eval": {
- "description": "min=131, mean=131, max=131, sum=262 (2)",
- "tab": "General information",
- "score": 131.0
- },
- "Human Sexuality - # train": {
- "description": "min=5, mean=5, max=5, sum=10 (2)",
- "tab": "General information",
- "score": 5.0
- },
- "Human Sexuality - truncated": {
- "description": "min=0, mean=0, max=0, sum=0 (2)",
- "tab": "General information",
- "score": 0.0
- },
- "Human Sexuality - # prompt tokens": {
- "description": "min=380.496, mean=380.496, max=380.496, sum=760.992 (2)",
- "tab": "General information",
- "score": 380.4961832061069
- },
- "Human Sexuality - # output tokens": {
- "description": "min=1, mean=1, max=1, sum=2 (2)",
- "tab": "General information",
- "score": 1.0
- }
- }
- },
- "generation_config": {
- "additional_details": {
- "subject": "human_sexuality",
- "method": "multiple_choice_joint",
- "eval_split": "test",
- "groups": "mmlu_human_sexuality"
- }
- }
- },
- {
- "evaluation_name": "International Law",
- "source_data": {
- "dataset_name": "helm_mmlu",
- "source_type": "url",
- "url": [
- "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
- ]
- },
- "metric_config": {
- "evaluation_description": "EM on International Law",
- "lower_is_better": false,
- "score_type": "continuous",
- "min_score": 0.0,
- "max_score": 1.0
- },
- "score_details": {
- "score": 0.959,
- "details": {
- "description": "min=0.959, mean=0.959, max=0.959, sum=1.917 (2)",
- "tab": "Accuracy",
- "International Law - Observed inference time (s)": {
- "description": "min=0.822, mean=0.822, max=0.822, sum=1.644 (2)",
- "tab": "Efficiency",
- "score": 0.8220856209431798
- },
- "International Law - # eval": {
- "description": "min=121, mean=121, max=121, sum=242 (2)",
- "tab": "General information",
- "score": 121.0
- },
- "International Law - # train": {
- "description": "min=5, mean=5, max=5, sum=10 (2)",
- "tab": "General information",
- "score": 5.0
- },
- "International Law - truncated": {
- "description": "min=0, mean=0, max=0, sum=0 (2)",
- "tab": "General information",
- "score": 0.0
- },
- "International Law - # prompt tokens": {
- "description": "min=673.165, mean=673.165, max=673.165, sum=1346.331 (2)",
- "tab": "General information",
- "score": 673.1652892561983
- },
- "International Law - # output tokens": {
- "description": "min=1, mean=1, max=1, sum=2 (2)",
- "tab": "General information",
- "score": 1.0
- }
- }
- },
- "generation_config": {
- "additional_details": {
- "subject": "international_law",
- "method": "multiple_choice_joint",
- "eval_split": "test",
- "groups": "mmlu_international_law"
- }
- }
- },
- {
- "evaluation_name": "Logical Fallacies",
- "source_data": {
- "dataset_name": "helm_mmlu",
- "source_type": "url",
- "url": [
- "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
- ]
- },
- "metric_config": {
- "evaluation_description": "EM on Logical Fallacies",
- "lower_is_better": false,
- "score_type": "continuous",
- "min_score": 0.0,
- "max_score": 1.0
- },
- "score_details": {
- "score": 0.926,
- "details": {
- "description": "min=0.926, mean=0.926, max=0.926, sum=1.853 (2)",
- "tab": "Accuracy",
- "Logical Fallacies - Observed inference time (s)": {
- "description": "min=0.778, mean=0.778, max=0.778, sum=1.556 (2)",
- "tab": "Efficiency",
- "score": 0.778087305876375
- },
- "Logical Fallacies - # eval": {
- "description": "min=163, mean=163, max=163, sum=326 (2)",
- "tab": "General information",
- "score": 163.0
- },
- "Logical Fallacies - # train": {
- "description": "min=5, mean=5, max=5, sum=10 (2)",
- "tab": "General information",
- "score": 5.0
- },
- "Logical Fallacies - truncated": {
- "description": "min=0, mean=0, max=0, sum=0 (2)",
- "tab": "General information",
- "score": 0.0
- },
- "Logical Fallacies - # prompt tokens": {
- "description": "min=479.276, mean=479.276, max=479.276, sum=958.552 (2)",
- "tab": "General information",
- "score": 479.2760736196319
- },
- "Logical Fallacies - # output tokens": {
- "description": "min=1, mean=1, max=1, sum=2 (2)",
- "tab": "General information",
- "score": 1.0
- }
- }
- },
- "generation_config": {
- "additional_details": {
- "subject": "logical_fallacies",
- "method": "multiple_choice_joint",
- "eval_split": "test",
- "groups": "mmlu_logical_fallacies"
- }
- }
- },
- {
- "evaluation_name": "Machine Learning",
- "source_data": {
- "dataset_name": "helm_mmlu",
- "source_type": "url",
- "url": [
- "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
- ]
- },
- "metric_config": {
- "evaluation_description": "EM on Machine Learning",
- "lower_is_better": false,
- "score_type": "continuous",
- "min_score": 0.0,
- "max_score": 1.0
- },
- "score_details": {
- "score": 0.786,
- "details": {
- "description": "min=0.786, mean=0.786, max=0.786, sum=1.571 (2)",
- "tab": "Accuracy",
- "Machine Learning - Observed inference time (s)": {
- "description": "min=0.81, mean=0.81, max=0.81, sum=1.619 (2)",
- "tab": "Efficiency",
- "score": 0.809621695961271
- },
- "Machine Learning - # eval": {
- "description": "min=112, mean=112, max=112, sum=224 (2)",
- "tab": "General information",
- "score": 112.0
- },
- "Machine Learning - # train": {
- "description": "min=5, mean=5, max=5, sum=10 (2)",
- "tab": "General information",
- "score": 5.0
- },
- "Machine Learning - truncated": {
- "description": "min=0, mean=0, max=0, sum=0 (2)",
- "tab": "General information",
- "score": 0.0
- },
- "Machine Learning - # prompt tokens": {
- "description": "min=685.518, mean=685.518, max=685.518, sum=1371.036 (2)",
- "tab": "General information",
- "score": 685.5178571428571
- },
- "Machine Learning - # output tokens": {
- "description": "min=1, mean=1, max=1, sum=2 (2)",
- "tab": "General information",
- "score": 1.0
- }
- }
- },
- "generation_config": {
- "additional_details": {
- "subject": "machine_learning",
- "method": "multiple_choice_joint",
- "eval_split": "test",
- "groups": "mmlu_machine_learning"
- }
- }
- },
- {
- "evaluation_name": "Management",
- "source_data": {
- "dataset_name": "helm_mmlu",
- "source_type": "url",
- "url": [
- "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
- ]
- },
"metric_config": { - "evaluation_description": "EM on Management", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.942, - "details": { - "description": "min=0.942, mean=0.942, max=0.942, sum=1.883 (2)", - "tab": "Accuracy", - "Management - Observed inference time (s)": { - "description": "min=0.848, mean=0.848, max=0.848, sum=1.696 (2)", - "tab": "Efficiency", - "score": 0.8480523350169358 - }, - "Management - # eval": { - "description": "min=103, mean=103, max=103, sum=206 (2)", - "tab": "General information", - "score": 103.0 - }, - "Management - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Management - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Management - # prompt tokens": { - "description": "min=310.282, mean=310.282, max=310.282, sum=620.563 (2)", - "tab": "General information", - "score": 310.28155339805824 - }, - "Management - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" - } - } - }, - { - "evaluation_name": "Marketing", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Marketing", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.949, - "details": { - "description": "min=0.949, mean=0.949, max=0.949, sum=1.897 (2)", - "tab": "Accuracy", - "Marketing - Observed inference time (s)": { - "description": "min=2.55, mean=2.55, max=2.55, sum=5.1 (2)", - "tab": "Efficiency", - "score": 2.550003965695699 - }, - "Marketing - # eval": { - "description": "min=234, mean=234, max=234, sum=468 (2)", - "tab": "General information", - "score": 234.0 - }, - "Marketing - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Marketing - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Marketing - # prompt tokens": { - "description": "min=457.064, mean=457.064, max=457.064, sum=914.128 (2)", - "tab": "General information", - "score": 457.06410256410254 - }, - "Marketing - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" - } - } - }, - { - "evaluation_name": "Medical Genetics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Medical Genetics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.98, - "details": { - "description": 
"min=0.98, mean=0.98, max=0.98, sum=1.96 (2)", - "tab": "Accuracy", - "Medical Genetics - Observed inference time (s)": { - "description": "min=3.433, mean=3.433, max=3.433, sum=6.867 (2)", - "tab": "Efficiency", - "score": 3.4333492875099183 - }, - "Medical Genetics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Medical Genetics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Medical Genetics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Medical Genetics - # prompt tokens": { - "description": "min=363.88, mean=363.88, max=363.88, sum=727.76 (2)", - "tab": "General information", - "score": 363.88 - }, - "Medical Genetics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" - } - } - }, - { - "evaluation_name": "Miscellaneous", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Miscellaneous", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.962, - "details": { - "description": "min=0.962, mean=0.962, max=0.962, sum=1.923 (2)", - "tab": "Accuracy", - "Miscellaneous - Observed inference time (s)": { - "description": "min=1.474, mean=1.474, max=1.474, sum=2.949 (2)", - "tab": "Efficiency", - "score": 1.4744500937285248 - }, - "Miscellaneous - # eval": { - "description": "min=783, mean=783, max=783, sum=1566 (2)", - "tab": "General information", - "score": 783.0 - }, - "Miscellaneous - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Miscellaneous - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Miscellaneous - # prompt tokens": { - "description": "min=337.628, mean=337.628, max=337.628, sum=675.257 (2)", - "tab": "General information", - "score": 337.62835249042143 - }, - "Miscellaneous - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" - } - } - }, - { - "evaluation_name": "Moral Scenarios", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Moral Scenarios", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.882, - "details": { - "description": "min=0.882, mean=0.882, max=0.882, sum=1.763 (2)", - "tab": "Accuracy", - "Moral Disputes - Observed inference time (s)": { - "description": "min=0.817, mean=0.817, max=0.817, 
sum=1.635 (2)", - "tab": "Efficiency", - "score": 0.8173547728213272 - }, - "Moral Scenarios - Observed inference time (s)": { - "description": "min=1.043, mean=1.043, max=1.043, sum=2.085 (2)", - "tab": "Efficiency", - "score": 1.0425983404980026 - }, - "Moral Disputes - # eval": { - "description": "min=346, mean=346, max=346, sum=692 (2)", - "tab": "General information", - "score": 346.0 - }, - "Moral Disputes - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Disputes - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Disputes - # prompt tokens": { - "description": "min=520.789, mean=520.789, max=520.789, sum=1041.578 (2)", - "tab": "General information", - "score": 520.7890173410404 - }, - "Moral Disputes - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Moral Scenarios - # eval": { - "description": "min=895, mean=895, max=895, sum=1790 (2)", - "tab": "General information", - "score": 895.0 - }, - "Moral Scenarios - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Scenarios - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # prompt tokens": { - "description": "min=685.949, mean=685.949, max=685.949, sum=1371.897 (2)", - "tab": "General information", - "score": 685.9486033519553 - }, - "Moral Scenarios - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" - } - } - }, - { - "evaluation_name": "Nutrition", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Nutrition", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.912, - "details": { - "description": "min=0.912, mean=0.912, max=0.912, sum=1.824 (2)", - "tab": "Accuracy", - "Nutrition - Observed inference time (s)": { - "description": "min=0.987, mean=0.987, max=0.987, sum=1.973 (2)", - "tab": "Efficiency", - "score": 0.9867353338042116 - }, - "Nutrition - # eval": { - "description": "min=306, mean=306, max=306, sum=612 (2)", - "tab": "General information", - "score": 306.0 - }, - "Nutrition - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Nutrition - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Nutrition - # prompt tokens": { - "description": "min=626.065, mean=626.065, max=626.065, sum=1252.131 (2)", - "tab": "General information", - "score": 626.0653594771242 - }, - "Nutrition - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": 
"test", - "groups": "mmlu_nutrition" - } - } - }, - { - "evaluation_name": "Prehistory", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Prehistory", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.951, - "details": { - "description": "min=0.951, mean=0.951, max=0.951, sum=1.901 (2)", - "tab": "Accuracy", - "Prehistory - Observed inference time (s)": { - "description": "min=0.887, mean=0.887, max=0.887, sum=1.775 (2)", - "tab": "Efficiency", - "score": 0.8874673313564725 - }, - "Prehistory - # eval": { - "description": "min=324, mean=324, max=324, sum=648 (2)", - "tab": "General information", - "score": 324.0 - }, - "Prehistory - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Prehistory - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Prehistory - # prompt tokens": { - "description": "min=554.639, mean=554.639, max=554.639, sum=1109.278 (2)", - "tab": "General information", - "score": 554.6388888888889 - }, - "Prehistory - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" - } - } - }, - { - "evaluation_name": "Public Relations", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Public Relations", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.855, - "details": { - "description": "min=0.855, mean=0.855, max=0.855, sum=1.709 (2)", - "tab": "Accuracy", - "Public Relations - Observed inference time (s)": { - "description": "min=1.124, mean=1.124, max=1.124, sum=2.248 (2)", - "tab": "Efficiency", - "score": 1.1237782673402266 - }, - "Public Relations - # eval": { - "description": "min=110, mean=110, max=110, sum=220 (2)", - "tab": "General information", - "score": 110.0 - }, - "Public Relations - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Public Relations - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Public Relations - # prompt tokens": { - "description": "min=441.991, mean=441.991, max=441.991, sum=883.982 (2)", - "tab": "General information", - "score": 441.9909090909091 - }, - "Public Relations - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" - } - } - }, - { - "evaluation_name": "Security Studies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Security Studies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.878, - "details": { - "description": "min=0.878, mean=0.878, max=0.878, sum=1.755 (2)", - "tab": "Accuracy", - "Security Studies - Observed inference time (s)": { - "description": "min=1.219, mean=1.219, max=1.219, sum=2.438 (2)", - "tab": "Efficiency", - "score": 1.2191707075858602 - }, - "Security Studies - # eval": { - "description": "min=245, mean=245, max=245, sum=490 (2)", - "tab": "General information", - "score": 245.0 - }, - "Security Studies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Security Studies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Security Studies - # prompt tokens": { - "description": "min=1252.804, mean=1252.804, max=1252.804, sum=2505.608 (2)", - "tab": "General information", - "score": 1252.8040816326532 - }, - "Security Studies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" - } - } - }, - { - "evaluation_name": "Sociology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Sociology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.96, - "details": { - "description": "min=0.96, mean=0.96, max=0.96, sum=1.92 (2)", - "tab": "Accuracy", - "Sociology - Observed inference time (s)": { - "description": "min=1.141, mean=1.141, max=1.141, sum=2.282 (2)", - "tab": "Efficiency", - "score": 1.141001319410789 - }, - "Sociology - # eval": { - "description": "min=201, mean=201, max=201, sum=402 (2)", - "tab": "General information", - "score": 201.0 - }, - "Sociology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Sociology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Sociology - # prompt tokens": { - "description": "min=476.274, mean=476.274, max=476.274, sum=952.547 (2)", - "tab": "General information", - "score": 476.27363184079604 - }, - "Sociology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" - } - } - }, - { - "evaluation_name": "Virology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Virology", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.602, - "details": { - "description": "min=0.602, mean=0.602, max=0.602, sum=1.205 (2)", - "tab": "Accuracy", - "Virology - Observed inference time (s)": { - "description": "min=1.15, mean=1.15, max=1.15, sum=2.3 (2)", - "tab": "Efficiency", - "score": 1.1499209547617348 - }, - "Virology - # eval": { - "description": "min=166, mean=166, max=166, sum=332 (2)", - "tab": "General information", - "score": 166.0 - }, - "Virology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Virology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Virology - # prompt tokens": { - "description": "min=371.651, mean=371.651, max=371.651, sum=743.301 (2)", - "tab": "General information", - "score": 371.65060240963857 - }, - "Virology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" - } - } - }, - { - "evaluation_name": "World Religions", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on World Religions", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.924, - "details": { - "description": "min=0.924, mean=0.924, max=0.924, sum=1.848 (2)", - "tab": "Accuracy", - "World Religions - Observed inference time (s)": { - "description": "min=1.201, mean=1.201, max=1.201, sum=2.402 (2)", - "tab": "Efficiency", - "score": 1.200854153661003 - }, - "World Religions - # eval": { - "description": "min=171, mean=171, max=171, sum=342 (2)", - "tab": "General information", - "score": 171.0 - }, - "World Religions - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "World Religions - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "World Religions - # prompt tokens": { - "description": "min=302.018, mean=302.018, max=302.018, sum=604.035 (2)", - "tab": "General information", - "score": 302.0175438596491 - }, - "World Religions - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" - } - } - }, - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.17, - "details": { - "tab": 
"Efficiency" - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_mmlu/anthropic/claude-3-5-sonnet-20241022/6a59feac-f2d5-4eaf-a440-036b0acfbfc0.json b/data/helm_mmlu/anthropic/claude-3-5-sonnet-20241022/6a59feac-f2d5-4eaf-a440-036b0acfbfc0.json deleted file mode 100644 index 35be68aa6..000000000 --- a/data/helm_mmlu/anthropic/claude-3-5-sonnet-20241022/6a59feac-f2d5-4eaf-a440-036b0acfbfc0.json +++ /dev/null @@ -1,3021 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/anthropic_claude-3-5-sonnet-20241022/1770835937.459157", - "retrieved_timestamp": "1770835937.459157", - "source_metadata": { - "source_name": "helm_mmlu", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Claude 3.5 Sonnet 20241022", - "id": "anthropic/claude-3-5-sonnet-20241022", - "developer": "anthropic", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "MMLU All Subjects", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU All Subjects", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.873, - "details": { - "description": "min=0.584, mean=0.873, max=0.984, sum=99.491 (114)", - "tab": "Accuracy", - "MMLU All Subjects - Observed inference time (s)": { - "description": "min=0.615, mean=0.688, max=1.002, sum=78.403 (114)", - "tab": "Efficiency", - "score": 0.6877486861856626 - }, - "MMLU All Subjects - # eval": { - "description": "min=100, mean=246.351, max=1534, sum=28084 (114)", - "tab": "General information", - "score": 246.35087719298247 - }, - "MMLU All Subjects - # train": { - "description": "min=5, mean=5, max=5, sum=570 (114)", - "tab": "General information", - "score": 5.0 - }, - "MMLU All Subjects - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (114)", - "tab": "General information", - "score": 0.0 - }, - "MMLU All Subjects - # prompt tokens": { - "description": "min=293.018, mean=638.288, max=2887.576, sum=72764.875 (114)", - "tab": "General information", - "score": 638.2883793758953 - }, - "MMLU All Subjects - # output tokens": { - "description": "min=1, mean=1, max=1, sum=114 (114)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - 
"jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] - } - } - }, - { - "evaluation_name": "Abstract Algebra", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Abstract Algebra", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.78, - "details": { - "description": "min=0.78, mean=0.78, max=0.78, sum=1.56 (2)", - "tab": "Accuracy", - "Abstract Algebra - Observed inference time (s)": { - "description": "min=0.673, mean=0.673, max=0.673, sum=1.345 (2)", - "tab": "Efficiency", - "score": 0.672634687423706 - }, - "Abstract Algebra - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Abstract Algebra - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Abstract Algebra - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Abstract Algebra - # prompt tokens": { - "description": "min=370.26, mean=370.26, max=370.26, sum=740.52 (2)", - "tab": "General information", - "score": 370.26 - }, - "Abstract Algebra - # output tokens": { - "description": 
"min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" - } - } - }, - { - "evaluation_name": "Anatomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Anatomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.859, - "details": { - "description": "min=0.859, mean=0.859, max=0.859, sum=1.719 (2)", - "tab": "Accuracy", - "Anatomy - Observed inference time (s)": { - "description": "min=0.654, mean=0.654, max=0.654, sum=1.308 (2)", - "tab": "Efficiency", - "score": 0.653886115109479 - }, - "Anatomy - # eval": { - "description": "min=135, mean=135, max=135, sum=270 (2)", - "tab": "General information", - "score": 135.0 - }, - "Anatomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Anatomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Anatomy - # prompt tokens": { - "description": "min=370.8, mean=370.8, max=370.8, sum=741.6 (2)", - "tab": "General information", - "score": 370.8 - }, - "Anatomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" - } - } - }, - { - "evaluation_name": "College Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on College Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.775, - "details": { - "description": "min=0.775, mean=0.775, max=0.775, sum=1.549 (2)", - "tab": "Accuracy", - "College Chemistry - Observed inference time (s)": { - "description": "min=0.689, mean=0.689, max=0.689, sum=1.379 (2)", - "tab": "Efficiency", - "score": 0.6893502926826477 - }, - "College Biology - Observed inference time (s)": { - "description": "min=0.66, mean=0.66, max=0.66, sum=1.32 (2)", - "tab": "Efficiency", - "score": 0.6600197752316793 - }, - "College Computer Science - Observed inference time (s)": { - "description": "min=0.673, mean=0.673, max=0.673, sum=1.345 (2)", - "tab": "Efficiency", - "score": 0.6726715517044067 - }, - "College Mathematics - Observed inference time (s)": { - "description": "min=0.689, mean=0.689, max=0.689, sum=1.378 (2)", - "tab": "Efficiency", - "score": 0.6890151953697204 - }, - "College Medicine - Observed inference time (s)": { - "description": "min=0.668, mean=0.668, max=0.668, sum=1.337 (2)", - "tab": "Efficiency", - "score": 0.6682831924085673 - }, - "College Physics - Observed inference time (s)": { - "description": "min=0.704, mean=0.704, max=0.704, sum=1.407 (2)", - "tab": "Efficiency", - "score": 0.7037388226565193 - }, - 
"College Chemistry - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Chemistry - # prompt tokens": { - "description": "min=550.01, mean=550.01, max=550.01, sum=1100.02 (2)", - "tab": "General information", - "score": 550.01 - }, - "College Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Biology - # eval": { - "description": "min=144, mean=144, max=144, sum=288 (2)", - "tab": "General information", - "score": 144.0 - }, - "College Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # prompt tokens": { - "description": "min=490.347, mean=490.347, max=490.347, sum=980.694 (2)", - "tab": "General information", - "score": 490.34722222222223 - }, - "College Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer Science - # prompt tokens": { - "description": "min=838.24, mean=838.24, max=838.24, sum=1676.48 (2)", - "tab": "General information", - "score": 838.24 - }, - "College Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Mathematics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # prompt tokens": { - "description": "min=604.19, mean=604.19, max=604.19, sum=1208.38 (2)", - "tab": "General information", - "score": 604.19 - }, - "College Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Medicine - # eval": { - "description": "min=173, mean=173, max=173, sum=346 (2)", - "tab": "General information", - "score": 173.0 - }, - "College Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Medicine - # prompt tokens": { - "description": "min=540.63, mean=540.63, 
max=540.63, sum=1081.26 (2)", - "tab": "General information", - "score": 540.6300578034682 - }, - "College Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Physics - # eval": { - "description": "min=102, mean=102, max=102, sum=204 (2)", - "tab": "General information", - "score": 102.0 - }, - "College Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Physics - # prompt tokens": { - "description": "min=489.48, mean=489.48, max=489.48, sum=978.961 (2)", - "tab": "General information", - "score": 489.48039215686276 - }, - "College Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" - } - } - }, - { - "evaluation_name": "Computer Security", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Computer Security", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.87, - "details": { - "description": "min=0.87, mean=0.87, max=0.87, sum=1.74 (2)", - "tab": "Accuracy", - "Computer Security - Observed inference time (s)": { - "description": "min=0.661, mean=0.661, max=0.661, sum=1.322 (2)", - "tab": "Efficiency", - "score": 0.6610880661010742 - }, - "Computer Security - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Computer Security - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Computer Security - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Computer Security - # prompt tokens": { - "description": "min=398.62, mean=398.62, max=398.62, sum=797.24 (2)", - "tab": "General information", - "score": 398.62 - }, - "Computer Security - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" - } - } - }, - { - "evaluation_name": "Econometrics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Econometrics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.807, - "details": { - "description": "min=0.807, mean=0.807, max=0.807, sum=1.614 (2)", - "tab": "Accuracy", - "Econometrics - Observed inference time (s)": { - "description": "min=0.684, mean=0.684, 
max=0.684, sum=1.367 (2)", - "tab": "Efficiency", - "score": 0.6837067018475449 - }, - "Econometrics - # eval": { - "description": "min=114, mean=114, max=114, sum=228 (2)", - "tab": "General information", - "score": 114.0 - }, - "Econometrics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Econometrics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Econometrics - # prompt tokens": { - "description": "min=619.596, mean=619.596, max=619.596, sum=1239.193 (2)", - "tab": "General information", - "score": 619.5964912280701 - }, - "Econometrics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" - } - } - }, - { - "evaluation_name": "Global Facts", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Global Facts", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8, - "details": { - "description": "min=0.8, mean=0.8, max=0.8, sum=1.6 (2)", - "tab": "Accuracy", - "Global Facts - Observed inference time (s)": { - "description": "min=0.637, mean=0.637, max=0.637, sum=1.274 (2)", - "tab": "Efficiency", - "score": 0.6369614601135254 - }, - "Global Facts - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Global Facts - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Global Facts - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Global Facts - # prompt tokens": { - "description": "min=411.61, mean=411.61, max=411.61, sum=823.22 (2)", - "tab": "General information", - "score": 411.61 - }, - "Global Facts - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" - } - } - }, - { - "evaluation_name": "Jurisprudence", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Jurisprudence", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.898, - "details": { - "description": "min=0.898, mean=0.898, max=0.898, sum=1.796 (2)", - "tab": "Accuracy", - "Jurisprudence - Observed inference time (s)": { - "description": "min=0.643, mean=0.643, max=0.643, sum=1.286 (2)", - "tab": "Efficiency", - "score": 0.6427947613928053 - }, - "Jurisprudence - # eval": { - "description": "min=108, mean=108, max=108, sum=216 (2)", - "tab": "General information", - "score": 108.0 - 
}, - "Jurisprudence - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Jurisprudence - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Jurisprudence - # prompt tokens": { - "description": "min=431.426, mean=431.426, max=431.426, sum=862.852 (2)", - "tab": "General information", - "score": 431.4259259259259 - }, - "Jurisprudence - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" - } - } - }, - { - "evaluation_name": "Philosophy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Philosophy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.891, - "details": { - "description": "min=0.891, mean=0.891, max=0.891, sum=1.781 (2)", - "tab": "Accuracy", - "Philosophy - Observed inference time (s)": { - "description": "min=0.645, mean=0.645, max=0.645, sum=1.291 (2)", - "tab": "Efficiency", - "score": 0.6454648833566157 - }, - "Philosophy - # eval": { - "description": "min=311, mean=311, max=311, sum=622 (2)", - "tab": "General information", - "score": 311.0 - }, - "Philosophy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Philosophy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Philosophy - # prompt tokens": { - "description": "min=359.965, mean=359.965, max=359.965, sum=719.929 (2)", - "tab": "General information", - "score": 359.9646302250804 - }, - "Philosophy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" - } - } - }, - { - "evaluation_name": "Professional Psychology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Professional Psychology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.922, - "details": { - "description": "min=0.922, mean=0.922, max=0.922, sum=1.843 (2)", - "tab": "Accuracy", - "Professional Medicine - Observed inference time (s)": { - "description": "min=0.622, mean=0.622, max=0.622, sum=1.243 (2)", - "tab": "Efficiency", - "score": 0.6215311034637339 - }, - "Professional Accounting - Observed inference time (s)": { - "description": "min=0.69, mean=0.69, max=0.69, sum=1.38 (2)", - "tab": "Efficiency", - "score": 0.6900012104223806 - }, - "Professional Law - Observed inference time (s)": { - "description": "min=1.002, mean=1.002, max=1.002, sum=2.004 (2)", - "tab": "Efficiency", - 
"score": 1.002109061319483 - }, - "Professional Psychology - Observed inference time (s)": { - "description": "min=0.682, mean=0.682, max=0.682, sum=1.364 (2)", - "tab": "Efficiency", - "score": 0.6821525521527708 - }, - "Professional Medicine - # eval": { - "description": "min=272, mean=272, max=272, sum=544 (2)", - "tab": "General information", - "score": 272.0 - }, - "Professional Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Medicine - # prompt tokens": { - "description": "min=1123.537, mean=1123.537, max=1123.537, sum=2247.074 (2)", - "tab": "General information", - "score": 1123.5367647058824 - }, - "Professional Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Accounting - # eval": { - "description": "min=282, mean=282, max=282, sum=564 (2)", - "tab": "General information", - "score": 282.0 - }, - "Professional Accounting - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Accounting - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Accounting - # prompt tokens": { - "description": "min=665.422, mean=665.422, max=665.422, sum=1330.844 (2)", - "tab": "General information", - "score": 665.4219858156029 - }, - "Professional Accounting - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Law - # eval": { - "description": "min=1534, mean=1534, max=1534, sum=3068 (2)", - "tab": "General information", - "score": 1534.0 - }, - "Professional Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # prompt tokens": { - "description": "min=1701.16, mean=1701.16, max=1701.16, sum=3402.321 (2)", - "tab": "General information", - "score": 1701.16036505867 - }, - "Professional Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Psychology - # eval": { - "description": "min=612, mean=612, max=612, sum=1224 (2)", - "tab": "General information", - "score": 612.0 - }, - "Professional Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # prompt tokens": { - "description": "min=603.168, mean=603.168, max=603.168, sum=1206.337 (2)", - "tab": "General information", - "score": 603.1683006535948 - }, - "Professional Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" 
- } - } - }, - { - "evaluation_name": "Us Foreign Policy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Us Foreign Policy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.96, - "details": { - "description": "min=0.96, mean=0.96, max=0.96, sum=1.92 (2)", - "tab": "Accuracy", - "Us Foreign Policy - Observed inference time (s)": { - "description": "min=0.66, mean=0.66, max=0.66, sum=1.32 (2)", - "tab": "Efficiency", - "score": 0.660010986328125 - }, - "Us Foreign Policy - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Us Foreign Policy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Us Foreign Policy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Us Foreign Policy - # prompt tokens": { - "description": "min=455.25, mean=455.25, max=455.25, sum=910.5 (2)", - "tab": "General information", - "score": 455.25 - }, - "Us Foreign Policy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" - } - } - }, - { - "evaluation_name": "Astronomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Astronomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.974, - "details": { - "description": "min=0.974, mean=0.974, max=0.974, sum=1.947 (2)", - "tab": "Accuracy", - "Astronomy - Observed inference time (s)": { - "description": "min=0.672, mean=0.672, max=0.672, sum=1.344 (2)", - "tab": "Efficiency", - "score": 0.6717779793237385 - }, - "Astronomy - # eval": { - "description": "min=152, mean=152, max=152, sum=304 (2)", - "tab": "General information", - "score": 152.0 - }, - "Astronomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Astronomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Astronomy - # prompt tokens": { - "description": "min=604.493, mean=604.493, max=604.493, sum=1208.987 (2)", - "tab": "General information", - "score": 604.4934210526316 - }, - "Astronomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" - } - } - }, - { - "evaluation_name": "Business Ethics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Business Ethics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.83, - "details": { - "description": "min=0.83, mean=0.83, max=0.83, sum=1.66 (2)", - "tab": "Accuracy", - "Business Ethics - Observed inference time (s)": { - "description": "min=0.651, mean=0.651, max=0.651, sum=1.302 (2)", - "tab": "Efficiency", - "score": 0.6511244606971741 - }, - "Business Ethics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Business Ethics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Business Ethics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Business Ethics - # prompt tokens": { - "description": "min=600.02, mean=600.02, max=600.02, sum=1200.04 (2)", - "tab": "General information", - "score": 600.02 - }, - "Business Ethics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" - } - } - }, - { - "evaluation_name": "Clinical Knowledge", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Clinical Knowledge", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.928, - "details": { - "description": "min=0.928, mean=0.928, max=0.928, sum=1.857 (2)", - "tab": "Accuracy", - "Clinical Knowledge - Observed inference time (s)": { - "description": "min=0.65, mean=0.65, max=0.65, sum=1.3 (2)", - "tab": "Efficiency", - "score": 0.6499361712977572 - }, - "Clinical Knowledge - # eval": { - "description": "min=265, mean=265, max=265, sum=530 (2)", - "tab": "General information", - "score": 265.0 - }, - "Clinical Knowledge - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Clinical Knowledge - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Clinical Knowledge - # prompt tokens": { - "description": "min=429.457, mean=429.457, max=429.457, sum=858.913 (2)", - "tab": "General information", - "score": 429.4566037735849 - }, - "Clinical Knowledge - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" - } - } - }, - { - "evaluation_name": "Conceptual Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - 
"metric_config": { - "evaluation_description": "EM on Conceptual Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.906, - "details": { - "description": "min=0.906, mean=0.906, max=0.906, sum=1.813 (2)", - "tab": "Accuracy", - "Conceptual Physics - Observed inference time (s)": { - "description": "min=0.615, mean=0.615, max=0.615, sum=1.229 (2)", - "tab": "Efficiency", - "score": 0.6146096341153409 - }, - "Conceptual Physics - # eval": { - "description": "min=235, mean=235, max=235, sum=470 (2)", - "tab": "General information", - "score": 235.0 - }, - "Conceptual Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Conceptual Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Conceptual Physics - # prompt tokens": { - "description": "min=323.536, mean=323.536, max=323.536, sum=647.072 (2)", - "tab": "General information", - "score": 323.53617021276597 - }, - "Conceptual Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" - } - } - }, - { - "evaluation_name": "Electrical Engineering", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Electrical Engineering", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.848, - "details": { - "description": "min=0.848, mean=0.848, max=0.848, sum=1.697 (2)", - "tab": "Accuracy", - "Electrical Engineering - Observed inference time (s)": { - "description": "min=0.646, mean=0.646, max=0.646, sum=1.292 (2)", - "tab": "Efficiency", - "score": 0.6462178690680143 - }, - "Electrical Engineering - # eval": { - "description": "min=145, mean=145, max=145, sum=290 (2)", - "tab": "General information", - "score": 145.0 - }, - "Electrical Engineering - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Electrical Engineering - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Electrical Engineering - # prompt tokens": { - "description": "min=437.041, mean=437.041, max=437.041, sum=874.083 (2)", - "tab": "General information", - "score": 437.04137931034484 - }, - "Electrical Engineering - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" - } - } - }, - { - "evaluation_name": "Elementary Mathematics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - 
"evaluation_description": "EM on Elementary Mathematics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.918, - "details": { - "description": "min=0.918, mean=0.918, max=0.918, sum=1.836 (2)", - "tab": "Accuracy", - "Elementary Mathematics - Observed inference time (s)": { - "description": "min=0.709, mean=0.709, max=0.709, sum=1.418 (2)", - "tab": "Efficiency", - "score": 0.7089652012264918 - }, - "Elementary Mathematics - # eval": { - "description": "min=378, mean=378, max=378, sum=756 (2)", - "tab": "General information", - "score": 378.0 - }, - "Elementary Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Elementary Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Elementary Mathematics - # prompt tokens": { - "description": "min=511.066, mean=511.066, max=511.066, sum=1022.132 (2)", - "tab": "General information", - "score": 511.06613756613757 - }, - "Elementary Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" - } - } - }, - { - "evaluation_name": "Formal Logic", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Formal Logic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.786, - "details": { - "description": "min=0.786, mean=0.786, max=0.786, sum=1.571 (2)", - "tab": "Accuracy", - "Formal Logic - Observed inference time (s)": { - "description": "min=0.692, mean=0.692, max=0.692, sum=1.384 (2)", - "tab": "Efficiency", - "score": 0.691912295326354 - }, - "Formal Logic - # eval": { - "description": "min=126, mean=126, max=126, sum=252 (2)", - "tab": "General information", - "score": 126.0 - }, - "Formal Logic - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Formal Logic - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Formal Logic - # prompt tokens": { - "description": "min=646.746, mean=646.746, max=646.746, sum=1293.492 (2)", - "tab": "General information", - "score": 646.7460317460317 - }, - "Formal Logic - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" - } - } - }, - { - "evaluation_name": "High School World History", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on High School World History", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.958, - "details": { - "description": "min=0.958, mean=0.958, max=0.958, sum=1.916 (2)", - "tab": "Accuracy", - "High School Biology - Observed inference time (s)": { - "description": "min=0.669, mean=0.669, max=0.669, sum=1.338 (2)", - "tab": "Efficiency", - "score": 0.6689629408621018 - }, - "High School Chemistry - Observed inference time (s)": { - "description": "min=0.673, mean=0.673, max=0.673, sum=1.346 (2)", - "tab": "Efficiency", - "score": 0.6729868444903143 - }, - "High School Computer Science - Observed inference time (s)": { - "description": "min=0.678, mean=0.678, max=0.678, sum=1.356 (2)", - "tab": "Efficiency", - "score": 0.677822756767273 - }, - "High School European History - Observed inference time (s)": { - "description": "min=0.697, mean=0.697, max=0.697, sum=1.395 (2)", - "tab": "Efficiency", - "score": 0.6973154544830322 - }, - "High School Geography - Observed inference time (s)": { - "description": "min=0.64, mean=0.64, max=0.64, sum=1.281 (2)", - "tab": "Efficiency", - "score": 0.6404741051221134 - }, - "High School Government And Politics - Observed inference time (s)": { - "description": "min=0.661, mean=0.661, max=0.661, sum=1.323 (2)", - "tab": "Efficiency", - "score": 0.6613641341115527 - }, - "High School Macroeconomics - Observed inference time (s)": { - "description": "min=0.631, mean=0.631, max=0.631, sum=1.261 (2)", - "tab": "Efficiency", - "score": 0.6305418686989026 - }, - "High School Mathematics - Observed inference time (s)": { - "description": "min=0.668, mean=0.668, max=0.668, sum=1.336 (2)", - "tab": "Efficiency", - "score": 0.6677727399048982 - }, - "High School Microeconomics - Observed inference time (s)": { - "description": "min=0.656, mean=0.656, max=0.656, sum=1.312 (2)", - "tab": "Efficiency", - "score": 0.6559101263014209 - }, - "High School Physics - Observed inference time (s)": { - "description": "min=0.676, mean=0.676, max=0.676, sum=1.353 (2)", - "tab": "Efficiency", - "score": 0.6763939494328783 - }, - "High School Psychology - Observed inference time (s)": { - "description": "min=0.671, mean=0.671, max=0.671, sum=1.342 (2)", - "tab": "Efficiency", - "score": 0.6708623107420195 - }, - "High School Statistics - Observed inference time (s)": { - "description": "min=0.702, mean=0.702, max=0.702, sum=1.404 (2)", - "tab": "Efficiency", - "score": 0.7019402329568509 - }, - "High School US History - Observed inference time (s)": { - "description": "min=0.646, mean=0.646, max=0.646, sum=1.293 (2)", - "tab": "Efficiency", - "score": 0.6463189136748221 - }, - "High School World History - Observed inference time (s)": { - "description": "min=0.695, mean=0.695, max=0.695, sum=1.39 (2)", - "tab": "Efficiency", - "score": 0.6947573730211217 - }, - "High School Biology - # eval": { - "description": "min=310, mean=310, max=310, sum=620 (2)", - "tab": "General information", - "score": 310.0 - }, - "High School Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Biology - # prompt tokens": { - "description": "min=534.577, mean=534.577, max=534.577, sum=1069.155 (2)", - "tab": "General information", - "score": 534.5774193548388 - }, - "High School Biology - # output tokens": { - "description": "min=1, mean=1, max=1, 
sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Chemistry - # eval": { - "description": "min=203, mean=203, max=203, sum=406 (2)", - "tab": "General information", - "score": 203.0 - }, - "High School Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # prompt tokens": { - "description": "min=497.921, mean=497.921, max=497.921, sum=995.842 (2)", - "tab": "General information", - "score": 497.92118226600985 - }, - "High School Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "High School Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # prompt tokens": { - "description": "min=882.4, mean=882.4, max=882.4, sum=1764.8 (2)", - "tab": "General information", - "score": 882.4 - }, - "High School Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School European History - # eval": { - "description": "min=165, mean=165, max=165, sum=330 (2)", - "tab": "General information", - "score": 165.0 - }, - "High School European History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School European History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School European History - # prompt tokens": { - "description": "min=2887.576, mean=2887.576, max=2887.576, sum=5775.152 (2)", - "tab": "General information", - "score": 2887.5757575757575 - }, - "High School European History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Geography - # eval": { - "description": "min=198, mean=198, max=198, sum=396 (2)", - "tab": "General information", - "score": 198.0 - }, - "High School Geography - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Geography - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Geography - # prompt tokens": { - "description": "min=412.268, mean=412.268, max=412.268, sum=824.535 (2)", - "tab": "General information", - "score": 412.2676767676768 - }, - "High School Geography - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Government And Politics - # eval": { - "description": "min=193, mean=193, max=193, sum=386 (2)", - "tab": "General information", - "score": 193.0 - }, - "High School Government And Politics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - 
"score": 5.0 - }, - "High School Government And Politics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # prompt tokens": { - "description": "min=491.104, mean=491.104, max=491.104, sum=982.207 (2)", - "tab": "General information", - "score": 491.10362694300517 - }, - "High School Government And Politics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Macroeconomics - # eval": { - "description": "min=390, mean=390, max=390, sum=780 (2)", - "tab": "General information", - "score": 390.0 - }, - "High School Macroeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Macroeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - # prompt tokens": { - "description": "min=406.036, mean=406.036, max=406.036, sum=812.072 (2)", - "tab": "General information", - "score": 406.0358974358974 - }, - "High School Macroeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Mathematics - # eval": { - "description": "min=270, mean=270, max=270, sum=540 (2)", - "tab": "General information", - "score": 270.0 - }, - "High School Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # prompt tokens": { - "description": "min=519.881, mean=519.881, max=519.881, sum=1039.763 (2)", - "tab": "General information", - "score": 519.8814814814815 - }, - "High School Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Microeconomics - # eval": { - "description": "min=238, mean=238, max=238, sum=476 (2)", - "tab": "General information", - "score": 238.0 - }, - "High School Microeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Microeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Microeconomics - # prompt tokens": { - "description": "min=420.513, mean=420.513, max=420.513, sum=841.025 (2)", - "tab": "General information", - "score": 420.5126050420168 - }, - "High School Microeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Physics - # eval": { - "description": "min=151, mean=151, max=151, sum=302 (2)", - "tab": "General information", - "score": 151.0 - }, - "High School Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # prompt tokens": { - "description": "min=558.841, mean=558.841, max=558.841, sum=1117.682 (2)", - "tab": "General information", - "score": 
558.841059602649 - }, - "High School Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Psychology - # eval": { - "description": "min=545, mean=545, max=545, sum=1090 (2)", - "tab": "General information", - "score": 545.0 - }, - "High School Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # prompt tokens": { - "description": "min=521.42, mean=521.42, max=521.42, sum=1042.84 (2)", - "tab": "General information", - "score": 521.4201834862386 - }, - "High School Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Statistics - # eval": { - "description": "min=216, mean=216, max=216, sum=432 (2)", - "tab": "General information", - "score": 216.0 - }, - "High School Statistics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Statistics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Statistics - # prompt tokens": { - "description": "min=806.963, mean=806.963, max=806.963, sum=1613.926 (2)", - "tab": "General information", - "score": 806.9629629629629 - }, - "High School Statistics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School US History - # eval": { - "description": "min=204, mean=204, max=204, sum=408 (2)", - "tab": "General information", - "score": 204.0 - }, - "High School US History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School US History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # prompt tokens": { - "description": "min=2288.49, mean=2288.49, max=2288.49, sum=4576.98 (2)", - "tab": "General information", - "score": 2288.4901960784314 - }, - "High School US History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School World History - # eval": { - "description": "min=237, mean=237, max=237, sum=474 (2)", - "tab": "General information", - "score": 237.0 - }, - "High School World History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School World History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School World History - # prompt tokens": { - "description": "min=1475.932, mean=1475.932, max=1475.932, sum=2951.865 (2)", - "tab": "General information", - "score": 1475.9324894514768 - }, - "High School World History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" - } - } - }, 
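
Each evaluation record deleted in this hunk has the same shape: a top-level "score_details.score", plus a "details" map whose entries pair a numeric "score" with a summary "description" string of the form "min=..., mean=..., max=..., sum=... (n)". For readers who still need to consume copies of these files, a minimal Python sketch of reading one back follows; the parse_stats helper and its regular expression are illustrative assumptions, not part of the HELM tooling, and the file name is one of the paths this patch removes.

    import json
    import re

    # Summary strings in these files look like:
    #   "min=0.906, mean=0.906, max=0.906, sum=1.813 (2)"
    STATS = re.compile(
        r"min=(?P<min>[\d.]+), mean=(?P<mean>[\d.]+), "
        r"max=(?P<max>[\d.]+), sum=(?P<sum>[\d.]+) \((?P<n>\d+)\)"
    )

    def parse_stats(description):
        """Parse a 'min=..., mean=..., max=..., sum=... (n)' summary string."""
        match = STATS.search(description)
        if match is None:
            raise ValueError("unrecognized description: %r" % description)
        stats = {name: float(value) for name, value in match.groupdict().items()}
        stats["n"] = int(stats["n"])
        return stats

    # One of the per-model result files removed by this patch.
    with open("f397ca7a-41c4-4926-b075-2523639f0a50.json") as fh:
        doc = json.load(fh)

    for record in doc["evaluation_results"]:
        details = record["score_details"]["details"]
        if "description" in details:  # e.g. "Mean win rate" carries no summary string
            stats = parse_stats(details["description"])
            print(record["evaluation_name"], record["score_details"]["score"], stats["mean"])

The per-subject entries in the same "details" map (keys such as "Formal Logic - # eval") use the identical summary format, so the same helper applies to their nested "description" fields; records like "Mean win rate" have no summary string, which is why the sketch checks before parsing.
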
- { - "evaluation_name": "Human Sexuality", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Human Sexuality", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.939, - "details": { - "description": "min=0.939, mean=0.939, max=0.939, sum=1.878 (2)", - "tab": "Accuracy", - "Human Aging - Observed inference time (s)": { - "description": "min=0.656, mean=0.656, max=0.656, sum=1.312 (2)", - "tab": "Efficiency", - "score": 0.6560797862407872 - }, - "Human Sexuality - Observed inference time (s)": { - "description": "min=0.686, mean=0.686, max=0.686, sum=1.372 (2)", - "tab": "Efficiency", - "score": 0.6857976003457572 - }, - "Human Aging - # eval": { - "description": "min=223, mean=223, max=223, sum=446 (2)", - "tab": "General information", - "score": 223.0 - }, - "Human Aging - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Aging - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Aging - # prompt tokens": { - "description": "min=335.955, mean=335.955, max=335.955, sum=671.91 (2)", - "tab": "General information", - "score": 335.95515695067263 - }, - "Human Aging - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Human Sexuality - # eval": { - "description": "min=131, mean=131, max=131, sum=262 (2)", - "tab": "General information", - "score": 131.0 - }, - "Human Sexuality - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Sexuality - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # prompt tokens": { - "description": "min=371.496, mean=371.496, max=371.496, sum=742.992 (2)", - "tab": "General information", - "score": 371.4961832061069 - }, - "Human Sexuality - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" - } - } - }, - { - "evaluation_name": "International Law", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on International Law", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.959, - "details": { - "description": "min=0.959, mean=0.959, max=0.959, sum=1.917 (2)", - "tab": "Accuracy", - "International Law - Observed inference time (s)": { - "description": "min=0.713, mean=0.713, max=0.713, sum=1.426 (2)", - "tab": "Efficiency", - "score": 0.7129175268914089 - }, - "International Law - # eval": { - "description": "min=121, mean=121, max=121, sum=242 (2)", - "tab": "General information", - "score": 121.0 - }, - "International Law - # train": { - 
"description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "International Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "International Law - # prompt tokens": { - "description": "min=664.165, mean=664.165, max=664.165, sum=1328.331 (2)", - "tab": "General information", - "score": 664.1652892561983 - }, - "International Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" - } - } - }, - { - "evaluation_name": "Logical Fallacies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Logical Fallacies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.914, - "details": { - "description": "min=0.914, mean=0.914, max=0.914, sum=1.828 (2)", - "tab": "Accuracy", - "Logical Fallacies - Observed inference time (s)": { - "description": "min=0.821, mean=0.821, max=0.821, sum=1.642 (2)", - "tab": "Efficiency", - "score": 0.8211235926926501 - }, - "Logical Fallacies - # eval": { - "description": "min=163, mean=163, max=163, sum=326 (2)", - "tab": "General information", - "score": 163.0 - }, - "Logical Fallacies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Logical Fallacies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Logical Fallacies - # prompt tokens": { - "description": "min=470.276, mean=470.276, max=470.276, sum=940.552 (2)", - "tab": "General information", - "score": 470.2760736196319 - }, - "Logical Fallacies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" - } - } - }, - { - "evaluation_name": "Machine Learning", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Machine Learning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.839, - "details": { - "description": "min=0.839, mean=0.839, max=0.839, sum=1.679 (2)", - "tab": "Accuracy", - "Machine Learning - Observed inference time (s)": { - "description": "min=0.697, mean=0.697, max=0.697, sum=1.393 (2)", - "tab": "Efficiency", - "score": 0.69659323990345 - }, - "Machine Learning - # eval": { - "description": "min=112, mean=112, max=112, sum=224 (2)", - "tab": "General information", - "score": 112.0 - }, - "Machine Learning - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - 
"Machine Learning - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Machine Learning - # prompt tokens": { - "description": "min=676.518, mean=676.518, max=676.518, sum=1353.036 (2)", - "tab": "General information", - "score": 676.5178571428571 - }, - "Machine Learning - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" - } - } - }, - { - "evaluation_name": "Management", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Management", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.932, - "details": { - "description": "min=0.932, mean=0.932, max=0.932, sum=1.864 (2)", - "tab": "Accuracy", - "Management - Observed inference time (s)": { - "description": "min=0.702, mean=0.702, max=0.702, sum=1.404 (2)", - "tab": "Efficiency", - "score": 0.7021607287879129 - }, - "Management - # eval": { - "description": "min=103, mean=103, max=103, sum=206 (2)", - "tab": "General information", - "score": 103.0 - }, - "Management - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Management - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Management - # prompt tokens": { - "description": "min=301.282, mean=301.282, max=301.282, sum=602.563 (2)", - "tab": "General information", - "score": 301.28155339805824 - }, - "Management - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" - } - } - }, - { - "evaluation_name": "Marketing", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Marketing", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.953, - "details": { - "description": "min=0.953, mean=0.953, max=0.953, sum=1.906 (2)", - "tab": "Accuracy", - "Marketing - Observed inference time (s)": { - "description": "min=0.833, mean=0.833, max=0.833, sum=1.667 (2)", - "tab": "Efficiency", - "score": 0.8333144401892637 - }, - "Marketing - # eval": { - "description": "min=234, mean=234, max=234, sum=468 (2)", - "tab": "General information", - "score": 234.0 - }, - "Marketing - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Marketing - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Marketing - # prompt tokens": { - "description": "min=448.064, mean=448.064, 
max=448.064, sum=896.128 (2)", - "tab": "General information", - "score": 448.06410256410254 - }, - "Marketing - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" - } - } - }, - { - "evaluation_name": "Medical Genetics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Medical Genetics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.96, - "details": { - "description": "min=0.96, mean=0.96, max=0.96, sum=1.92 (2)", - "tab": "Accuracy", - "Medical Genetics - Observed inference time (s)": { - "description": "min=0.789, mean=0.789, max=0.789, sum=1.579 (2)", - "tab": "Efficiency", - "score": 0.7894818639755249 - }, - "Medical Genetics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Medical Genetics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Medical Genetics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Medical Genetics - # prompt tokens": { - "description": "min=354.88, mean=354.88, max=354.88, sum=709.76 (2)", - "tab": "General information", - "score": 354.88 - }, - "Medical Genetics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" - } - } - }, - { - "evaluation_name": "Miscellaneous", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Miscellaneous", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.964, - "details": { - "description": "min=0.964, mean=0.964, max=0.964, sum=1.928 (2)", - "tab": "Accuracy", - "Miscellaneous - Observed inference time (s)": { - "description": "min=0.803, mean=0.803, max=0.803, sum=1.606 (2)", - "tab": "Efficiency", - "score": 0.8030681811073274 - }, - "Miscellaneous - # eval": { - "description": "min=783, mean=783, max=783, sum=1566 (2)", - "tab": "General information", - "score": 783.0 - }, - "Miscellaneous - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Miscellaneous - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Miscellaneous - # prompt tokens": { - "description": "min=328.628, mean=328.628, max=328.628, sum=657.257 (2)", - "tab": "General information", - "score": 328.62835249042143 - }, - "Miscellaneous - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 
(2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" - } - } - }, - { - "evaluation_name": "Moral Scenarios", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Moral Scenarios", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.888, - "details": { - "description": "min=0.888, mean=0.888, max=0.888, sum=1.777 (2)", - "tab": "Accuracy", - "Moral Disputes - Observed inference time (s)": { - "description": "min=0.698, mean=0.698, max=0.698, sum=1.397 (2)", - "tab": "Efficiency", - "score": 0.6983739172103088 - }, - "Moral Scenarios - Observed inference time (s)": { - "description": "min=0.697, mean=0.697, max=0.697, sum=1.393 (2)", - "tab": "Efficiency", - "score": 0.6965836058781799 - }, - "Moral Disputes - # eval": { - "description": "min=346, mean=346, max=346, sum=692 (2)", - "tab": "General information", - "score": 346.0 - }, - "Moral Disputes - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Disputes - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Disputes - # prompt tokens": { - "description": "min=511.789, mean=511.789, max=511.789, sum=1023.578 (2)", - "tab": "General information", - "score": 511.78901734104045 - }, - "Moral Disputes - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Moral Scenarios - # eval": { - "description": "min=895, mean=895, max=895, sum=1790 (2)", - "tab": "General information", - "score": 895.0 - }, - "Moral Scenarios - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Scenarios - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # prompt tokens": { - "description": "min=676.949, mean=676.949, max=676.949, sum=1353.897 (2)", - "tab": "General information", - "score": 676.9486033519553 - }, - "Moral Scenarios - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" - } - } - }, - { - "evaluation_name": "Nutrition", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Nutrition", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.922, - "details": { - "description": "min=0.922, mean=0.922, max=0.922, sum=1.843 (2)", - "tab": "Accuracy", - "Nutrition - Observed inference time (s)": { - "description": "min=0.695, mean=0.695, max=0.695, 
sum=1.389 (2)", - "tab": "Efficiency", - "score": 0.6946531822478849 - }, - "Nutrition - # eval": { - "description": "min=306, mean=306, max=306, sum=612 (2)", - "tab": "General information", - "score": 306.0 - }, - "Nutrition - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Nutrition - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Nutrition - # prompt tokens": { - "description": "min=617.065, mean=617.065, max=617.065, sum=1234.131 (2)", - "tab": "General information", - "score": 617.0653594771242 - }, - "Nutrition - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" - } - } - }, - { - "evaluation_name": "Prehistory", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Prehistory", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.941, - "details": { - "description": "min=0.941, mean=0.941, max=0.941, sum=1.883 (2)", - "tab": "Accuracy", - "Prehistory - Observed inference time (s)": { - "description": "min=0.682, mean=0.682, max=0.682, sum=1.365 (2)", - "tab": "Efficiency", - "score": 0.6824756529596117 - }, - "Prehistory - # eval": { - "description": "min=324, mean=324, max=324, sum=648 (2)", - "tab": "General information", - "score": 324.0 - }, - "Prehistory - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Prehistory - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Prehistory - # prompt tokens": { - "description": "min=545.639, mean=545.639, max=545.639, sum=1091.278 (2)", - "tab": "General information", - "score": 545.6388888888889 - }, - "Prehistory - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" - } - } - }, - { - "evaluation_name": "Public Relations", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Public Relations", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8, - "details": { - "description": "min=0.8, mean=0.8, max=0.8, sum=1.6 (2)", - "tab": "Accuracy", - "Public Relations - Observed inference time (s)": { - "description": "min=0.626, mean=0.626, max=0.626, sum=1.252 (2)", - "tab": "Efficiency", - "score": 0.6258317015387795 - }, - "Public Relations - # eval": { - "description": "min=110, mean=110, max=110, sum=220 (2)", - "tab": "General information", - "score": 110.0 - }, - "Public Relations - 
# train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Public Relations - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Public Relations - # prompt tokens": { - "description": "min=432.991, mean=432.991, max=432.991, sum=865.982 (2)", - "tab": "General information", - "score": 432.9909090909091 - }, - "Public Relations - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" - } - } - }, - { - "evaluation_name": "Security Studies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Security Studies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.882, - "details": { - "description": "min=0.882, mean=0.882, max=0.882, sum=1.763 (2)", - "tab": "Accuracy", - "Security Studies - Observed inference time (s)": { - "description": "min=0.744, mean=0.744, max=0.744, sum=1.489 (2)", - "tab": "Efficiency", - "score": 0.7442785263061523 - }, - "Security Studies - # eval": { - "description": "min=245, mean=245, max=245, sum=490 (2)", - "tab": "General information", - "score": 245.0 - }, - "Security Studies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Security Studies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Security Studies - # prompt tokens": { - "description": "min=1243.804, mean=1243.804, max=1243.804, sum=2487.608 (2)", - "tab": "General information", - "score": 1243.8040816326532 - }, - "Security Studies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" - } - } - }, - { - "evaluation_name": "Sociology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Sociology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.955, - "details": { - "description": "min=0.955, mean=0.955, max=0.955, sum=1.91 (2)", - "tab": "Accuracy", - "Sociology - Observed inference time (s)": { - "description": "min=0.695, mean=0.695, max=0.695, sum=1.389 (2)", - "tab": "Efficiency", - "score": 0.6946055438388047 - }, - "Sociology - # eval": { - "description": "min=201, mean=201, max=201, sum=402 (2)", - "tab": "General information", - "score": 201.0 - }, - "Sociology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Sociology - truncated": { - 
"description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Sociology - # prompt tokens": { - "description": "min=467.274, mean=467.274, max=467.274, sum=934.547 (2)", - "tab": "General information", - "score": 467.27363184079604 - }, - "Sociology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" - } - } - }, - { - "evaluation_name": "Virology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Virology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.584, - "details": { - "description": "min=0.584, mean=0.584, max=0.584, sum=1.169 (2)", - "tab": "Accuracy", - "Virology - Observed inference time (s)": { - "description": "min=0.68, mean=0.68, max=0.68, sum=1.361 (2)", - "tab": "Efficiency", - "score": 0.6803859400461956 - }, - "Virology - # eval": { - "description": "min=166, mean=166, max=166, sum=332 (2)", - "tab": "General information", - "score": 166.0 - }, - "Virology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Virology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Virology - # prompt tokens": { - "description": "min=362.651, mean=362.651, max=362.651, sum=725.301 (2)", - "tab": "General information", - "score": 362.65060240963857 - }, - "Virology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" - } - } - }, - { - "evaluation_name": "World Religions", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on World Religions", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.901, - "details": { - "description": "min=0.901, mean=0.901, max=0.901, sum=1.801 (2)", - "tab": "Accuracy", - "World Religions - Observed inference time (s)": { - "description": "min=0.651, mean=0.651, max=0.651, sum=1.301 (2)", - "tab": "Efficiency", - "score": 0.6505623017138208 - }, - "World Religions - # eval": { - "description": "min=171, mean=171, max=171, sum=342 (2)", - "tab": "General information", - "score": 171.0 - }, - "World Religions - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "World Religions - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "World Religions - # prompt tokens": { - "description": "min=293.018, mean=293.018, max=293.018, sum=586.035 (2)", - "tab": "General 
information",
- "score": 293.0175438596491
- },
- "World Religions - # output tokens": {
- "description": "min=1, mean=1, max=1, sum=2 (2)",
- "tab": "General information",
- "score": 1.0
- }
- }
- },
- "generation_config": {
- "additional_details": {
- "subject": "world_religions",
- "method": "multiple_choice_joint",
- "eval_split": "test",
- "groups": "mmlu_world_religions"
- }
- }
- },
- {
- "evaluation_name": "Mean win rate",
- "source_data": {
- "dataset_name": "helm_mmlu",
- "source_type": "url",
- "url": [
- "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
- ]
- },
- "metric_config": {
- "evaluation_description": "How many models this model outperforms on average (over columns).",
- "lower_is_better": false,
- "score_type": "continuous",
- "min_score": 0.0,
- "max_score": 1.0
- },
- "score_details": {
- "score": 0.311,
- "details": {
- "tab": "Efficiency"
- }
- },
- "generation_config": {
- "additional_details": {}
- }
- }
- ]
-}
\ No newline at end of file
diff --git a/data/helm_mmlu/anthropic/claude-3-haiku-20240307/f397ca7a-41c4-4926-b075-2523639f0a50.json b/data/helm_mmlu/anthropic/claude-3-haiku-20240307/f397ca7a-41c4-4926-b075-2523639f0a50.json
deleted file mode 100644
index 969900aba..000000000
--- a/data/helm_mmlu/anthropic/claude-3-haiku-20240307/f397ca7a-41c4-4926-b075-2523639f0a50.json
+++ /dev/null
@@ -1,3021 +0,0 @@
-{
- "schema_version": "0.2.0",
- "evaluation_id": "helm_mmlu/anthropic_claude-3-haiku-20240307/1770835937.459157",
- "retrieved_timestamp": "1770835937.459157",
- "source_metadata": {
- "source_name": "helm_mmlu",
- "source_type": "documentation",
- "source_organization_name": "crfm",
- "evaluator_relationship": "third_party"
- },
- "model_info": {
- "name": "Claude 3 Haiku 20240307",
- "id": "anthropic/claude-3-haiku-20240307",
- "developer": "anthropic",
- "inference_platform": "unknown"
- },
- "evaluation_results": [
- {
- "evaluation_name": "MMLU All Subjects",
- "source_data": {
- "dataset_name": "helm_mmlu",
- "source_type": "url",
- "url": [
- "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
- ]
- },
- "metric_config": {
- "evaluation_description": "EM on MMLU All Subjects",
- "lower_is_better": false,
- "score_type": "continuous",
- "min_score": 0.0,
- "max_score": 1.0
- },
- "score_details": {
- "score": 0.738,
- "details": {
- "description": "min=0.37, mean=0.738, max=0.95, sum=84.132 (114)",
- "tab": "Accuracy",
- "MMLU All Subjects - Observed inference time (s)": {
- "description": "min=0.662, mean=0.734, max=1.711, sum=83.657 (114)",
- "tab": "Efficiency",
- "score": 0.7338373689865249
- },
- "MMLU All Subjects - # eval": {
- "description": "min=100, mean=246.351, max=1534, sum=28084 (114)",
- "tab": "General information",
- "score": 246.35087719298247
- },
- "MMLU All Subjects - # train": {
- "description": "min=5, mean=5, max=5, sum=570 (114)",
- "tab": "General information",
- "score": 5.0
- },
- "MMLU All Subjects - truncated": {
- "description": "min=0, mean=0, max=0, sum=0 (114)",
- "tab": "General information",
- "score": 0.0
- },
- "MMLU All Subjects - # prompt tokens": {
- "description": "min=293.018, mean=638.288, max=2887.576, sum=72764.875 (114)",
- "tab": "General information",
- "score": 638.2883793758953
- },
- "MMLU All Subjects - # output tokens": {
- "description": "min=1, mean=1, max=1, sum=114 (114)",
- "tab": "General information",
- "score": 1.0
- }
- }
- },
- "generation_config": {
-
"additional_details": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] - } - } - }, - { - "evaluation_name": "Abstract Algebra", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Abstract Algebra", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.42, - "details": { - 
"description": "min=0.42, mean=0.42, max=0.42, sum=0.84 (2)", - "tab": "Accuracy", - "Abstract Algebra - Observed inference time (s)": { - "description": "min=0.693, mean=0.693, max=0.693, sum=1.386 (2)", - "tab": "Efficiency", - "score": 0.6928385472297669 - }, - "Abstract Algebra - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Abstract Algebra - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Abstract Algebra - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Abstract Algebra - # prompt tokens": { - "description": "min=370.26, mean=370.26, max=370.26, sum=740.52 (2)", - "tab": "General information", - "score": 370.26 - }, - "Abstract Algebra - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" - } - } - }, - { - "evaluation_name": "Anatomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Anatomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.711, - "details": { - "description": "min=0.711, mean=0.711, max=0.711, sum=1.422 (2)", - "tab": "Accuracy", - "Anatomy - Observed inference time (s)": { - "description": "min=0.668, mean=0.668, max=0.668, sum=1.336 (2)", - "tab": "Efficiency", - "score": 0.6677785749788637 - }, - "Anatomy - # eval": { - "description": "min=135, mean=135, max=135, sum=270 (2)", - "tab": "General information", - "score": 135.0 - }, - "Anatomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Anatomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Anatomy - # prompt tokens": { - "description": "min=370.8, mean=370.8, max=370.8, sum=741.6 (2)", - "tab": "General information", - "score": 370.8 - }, - "Anatomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" - } - } - }, - { - "evaluation_name": "College Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on College Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.48, - "details": { - "description": "min=0.48, mean=0.48, max=0.48, sum=0.961 (2)", - "tab": "Accuracy", - "College Chemistry - Observed inference time (s)": { - "description": "min=0.692, mean=0.692, max=0.692, sum=1.385 (2)", - "tab": "Efficiency", - "score": 
0.6923453903198242 - }, - "College Biology - Observed inference time (s)": { - "description": "min=0.702, mean=0.702, max=0.702, sum=1.405 (2)", - "tab": "Efficiency", - "score": 0.7022541695170932 - }, - "College Computer Science - Observed inference time (s)": { - "description": "min=0.735, mean=0.735, max=0.735, sum=1.47 (2)", - "tab": "Efficiency", - "score": 0.7352152991294861 - }, - "College Mathematics - Observed inference time (s)": { - "description": "min=0.715, mean=0.715, max=0.715, sum=1.43 (2)", - "tab": "Efficiency", - "score": 0.7152474927902222 - }, - "College Medicine - Observed inference time (s)": { - "description": "min=0.713, mean=0.713, max=0.713, sum=1.425 (2)", - "tab": "Efficiency", - "score": 0.7125603780581083 - }, - "College Physics - Observed inference time (s)": { - "description": "min=0.726, mean=0.726, max=0.726, sum=1.453 (2)", - "tab": "Efficiency", - "score": 0.7264628340216244 - }, - "College Chemistry - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Chemistry - # prompt tokens": { - "description": "min=550.01, mean=550.01, max=550.01, sum=1100.02 (2)", - "tab": "General information", - "score": 550.01 - }, - "College Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Biology - # eval": { - "description": "min=144, mean=144, max=144, sum=288 (2)", - "tab": "General information", - "score": 144.0 - }, - "College Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # prompt tokens": { - "description": "min=490.347, mean=490.347, max=490.347, sum=980.694 (2)", - "tab": "General information", - "score": 490.34722222222223 - }, - "College Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer Science - # prompt tokens": { - "description": "min=838.24, mean=838.24, max=838.24, sum=1676.48 (2)", - "tab": "General information", - "score": 838.24 - }, - "College Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Mathematics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Mathematics - truncated": { - 
"description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # prompt tokens": { - "description": "min=604.19, mean=604.19, max=604.19, sum=1208.38 (2)", - "tab": "General information", - "score": 604.19 - }, - "College Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Medicine - # eval": { - "description": "min=173, mean=173, max=173, sum=346 (2)", - "tab": "General information", - "score": 173.0 - }, - "College Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Medicine - # prompt tokens": { - "description": "min=540.63, mean=540.63, max=540.63, sum=1081.26 (2)", - "tab": "General information", - "score": 540.6300578034682 - }, - "College Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Physics - # eval": { - "description": "min=102, mean=102, max=102, sum=204 (2)", - "tab": "General information", - "score": 102.0 - }, - "College Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Physics - # prompt tokens": { - "description": "min=489.48, mean=489.48, max=489.48, sum=978.961 (2)", - "tab": "General information", - "score": 489.48039215686276 - }, - "College Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" - } - } - }, - { - "evaluation_name": "Computer Security", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Computer Security", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.79, - "details": { - "description": "min=0.79, mean=0.79, max=0.79, sum=1.58 (2)", - "tab": "Accuracy", - "Computer Security - Observed inference time (s)": { - "description": "min=0.686, mean=0.686, max=0.686, sum=1.371 (2)", - "tab": "Efficiency", - "score": 0.6855517983436584 - }, - "Computer Security - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Computer Security - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Computer Security - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Computer Security - # prompt tokens": { - "description": "min=398.62, mean=398.62, max=398.62, sum=797.24 (2)", - "tab": "General information", - "score": 398.62 - }, - "Computer Security - # output tokens": { - "description": "min=1, mean=1, 
max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" - } - } - }, - { - "evaluation_name": "Econometrics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Econometrics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.632, - "details": { - "description": "min=0.632, mean=0.632, max=0.632, sum=1.263 (2)", - "tab": "Accuracy", - "Econometrics - Observed inference time (s)": { - "description": "min=0.721, mean=0.721, max=0.721, sum=1.442 (2)", - "tab": "Efficiency", - "score": 0.720871933719568 - }, - "Econometrics - # eval": { - "description": "min=114, mean=114, max=114, sum=228 (2)", - "tab": "General information", - "score": 114.0 - }, - "Econometrics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Econometrics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Econometrics - # prompt tokens": { - "description": "min=619.596, mean=619.596, max=619.596, sum=1239.193 (2)", - "tab": "General information", - "score": 619.5964912280701 - }, - "Econometrics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" - } - } - }, - { - "evaluation_name": "Global Facts", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Global Facts", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.47, - "details": { - "description": "min=0.47, mean=0.47, max=0.47, sum=0.94 (2)", - "tab": "Accuracy", - "Global Facts - Observed inference time (s)": { - "description": "min=0.671, mean=0.671, max=0.671, sum=1.342 (2)", - "tab": "Efficiency", - "score": 0.6710420751571655 - }, - "Global Facts - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Global Facts - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Global Facts - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Global Facts - # prompt tokens": { - "description": "min=411.61, mean=411.61, max=411.61, sum=823.22 (2)", - "tab": "General information", - "score": 411.61 - }, - "Global Facts - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": 
"test", - "groups": "mmlu_global_facts" - } - } - }, - { - "evaluation_name": "Jurisprudence", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Jurisprudence", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.861, - "details": { - "description": "min=0.861, mean=0.861, max=0.861, sum=1.722 (2)", - "tab": "Accuracy", - "Jurisprudence - Observed inference time (s)": { - "description": "min=0.717, mean=0.717, max=0.717, sum=1.435 (2)", - "tab": "Efficiency", - "score": 0.7174532214800516 - }, - "Jurisprudence - # eval": { - "description": "min=108, mean=108, max=108, sum=216 (2)", - "tab": "General information", - "score": 108.0 - }, - "Jurisprudence - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Jurisprudence - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Jurisprudence - # prompt tokens": { - "description": "min=431.426, mean=431.426, max=431.426, sum=862.852 (2)", - "tab": "General information", - "score": 431.4259259259259 - }, - "Jurisprudence - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" - } - } - }, - { - "evaluation_name": "Philosophy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Philosophy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.814, - "details": { - "description": "min=0.814, mean=0.814, max=0.814, sum=1.627 (2)", - "tab": "Accuracy", - "Philosophy - Observed inference time (s)": { - "description": "min=0.702, mean=0.702, max=0.702, sum=1.405 (2)", - "tab": "Efficiency", - "score": 0.7023597537896258 - }, - "Philosophy - # eval": { - "description": "min=311, mean=311, max=311, sum=622 (2)", - "tab": "General information", - "score": 311.0 - }, - "Philosophy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Philosophy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Philosophy - # prompt tokens": { - "description": "min=359.965, mean=359.965, max=359.965, sum=719.929 (2)", - "tab": "General information", - "score": 359.9646302250804 - }, - "Philosophy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" - } - } - }, - { - "evaluation_name": "Professional Psychology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Professional Psychology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.802, - "details": { - "description": "min=0.802, mean=0.802, max=0.802, sum=1.605 (2)", - "tab": "Accuracy", - "Professional Medicine - Observed inference time (s)": { - "description": "min=0.786, mean=0.786, max=0.786, sum=1.572 (2)", - "tab": "Efficiency", - "score": 0.7859190036268795 - }, - "Professional Accounting - Observed inference time (s)": { - "description": "min=0.771, mean=0.771, max=0.771, sum=1.542 (2)", - "tab": "Efficiency", - "score": 0.7710303414797952 - }, - "Professional Law - Observed inference time (s)": { - "description": "min=0.826, mean=0.826, max=0.826, sum=1.652 (2)", - "tab": "Efficiency", - "score": 0.8259650812310687 - }, - "Professional Psychology - Observed inference time (s)": { - "description": "min=1.711, mean=1.711, max=1.711, sum=3.422 (2)", - "tab": "Efficiency", - "score": 1.7109862737406314 - }, - "Professional Medicine - # eval": { - "description": "min=272, mean=272, max=272, sum=544 (2)", - "tab": "General information", - "score": 272.0 - }, - "Professional Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Medicine - # prompt tokens": { - "description": "min=1123.537, mean=1123.537, max=1123.537, sum=2247.074 (2)", - "tab": "General information", - "score": 1123.5367647058824 - }, - "Professional Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Accounting - # eval": { - "description": "min=282, mean=282, max=282, sum=564 (2)", - "tab": "General information", - "score": 282.0 - }, - "Professional Accounting - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Accounting - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Accounting - # prompt tokens": { - "description": "min=665.422, mean=665.422, max=665.422, sum=1330.844 (2)", - "tab": "General information", - "score": 665.4219858156029 - }, - "Professional Accounting - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Law - # eval": { - "description": "min=1534, mean=1534, max=1534, sum=3068 (2)", - "tab": "General information", - "score": 1534.0 - }, - "Professional Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # prompt tokens": { - "description": "min=1701.16, mean=1701.16, max=1701.16, sum=3402.321 (2)", - "tab": "General information", - "score": 1701.16036505867 - }, - "Professional Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional 
Psychology - # eval": { - "description": "min=612, mean=612, max=612, sum=1224 (2)", - "tab": "General information", - "score": 612.0 - }, - "Professional Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # prompt tokens": { - "description": "min=603.168, mean=603.168, max=603.168, sum=1206.337 (2)", - "tab": "General information", - "score": 603.1683006535948 - }, - "Professional Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" - } - } - }, - { - "evaluation_name": "Us Foreign Policy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Us Foreign Policy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.95, - "details": { - "description": "min=0.95, mean=0.95, max=0.95, sum=1.9 (2)", - "tab": "Accuracy", - "Us Foreign Policy - Observed inference time (s)": { - "description": "min=0.694, mean=0.694, max=0.694, sum=1.388 (2)", - "tab": "Efficiency", - "score": 0.6937756729125977 - }, - "Us Foreign Policy - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Us Foreign Policy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Us Foreign Policy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Us Foreign Policy - # prompt tokens": { - "description": "min=455.25, mean=455.25, max=455.25, sum=910.5 (2)", - "tab": "General information", - "score": 455.25 - }, - "Us Foreign Policy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" - } - } - }, - { - "evaluation_name": "Astronomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Astronomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.901, - "details": { - "description": "min=0.901, mean=0.901, max=0.901, sum=1.803 (2)", - "tab": "Accuracy", - "Astronomy - Observed inference time (s)": { - "description": "min=0.707, mean=0.707, max=0.707, sum=1.415 (2)", - "tab": "Efficiency", - "score": 0.7072845524863193 - }, - "Astronomy - # eval": { - "description": "min=152, mean=152, max=152, sum=304 (2)", - "tab": "General information", - 
"score": 152.0 - }, - "Astronomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Astronomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Astronomy - # prompt tokens": { - "description": "min=604.493, mean=604.493, max=604.493, sum=1208.987 (2)", - "tab": "General information", - "score": 604.4934210526316 - }, - "Astronomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" - } - } - }, - { - "evaluation_name": "Business Ethics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Business Ethics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.78, - "details": { - "description": "min=0.78, mean=0.78, max=0.78, sum=1.56 (2)", - "tab": "Accuracy", - "Business Ethics - Observed inference time (s)": { - "description": "min=0.705, mean=0.705, max=0.705, sum=1.411 (2)", - "tab": "Efficiency", - "score": 0.7054399585723877 - }, - "Business Ethics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Business Ethics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Business Ethics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Business Ethics - # prompt tokens": { - "description": "min=600.02, mean=600.02, max=600.02, sum=1200.04 (2)", - "tab": "General information", - "score": 600.02 - }, - "Business Ethics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" - } - } - }, - { - "evaluation_name": "Clinical Knowledge", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Clinical Knowledge", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.789, - "details": { - "description": "min=0.789, mean=0.789, max=0.789, sum=1.577 (2)", - "tab": "Accuracy", - "Clinical Knowledge - Observed inference time (s)": { - "description": "min=0.716, mean=0.716, max=0.716, sum=1.432 (2)", - "tab": "Efficiency", - "score": 0.7159239804969644 - }, - "Clinical Knowledge - # eval": { - "description": "min=265, mean=265, max=265, sum=530 (2)", - "tab": "General information", - "score": 265.0 - }, - "Clinical Knowledge - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Clinical Knowledge - 
truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Clinical Knowledge - # prompt tokens": { - "description": "min=429.457, mean=429.457, max=429.457, sum=858.913 (2)", - "tab": "General information", - "score": 429.4566037735849 - }, - "Clinical Knowledge - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" - } - } - }, - { - "evaluation_name": "Conceptual Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Conceptual Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.715, - "details": { - "description": "min=0.715, mean=0.715, max=0.715, sum=1.43 (2)", - "tab": "Accuracy", - "Conceptual Physics - Observed inference time (s)": { - "description": "min=0.686, mean=0.686, max=0.686, sum=1.373 (2)", - "tab": "Efficiency", - "score": 0.686391481440118 - }, - "Conceptual Physics - # eval": { - "description": "min=235, mean=235, max=235, sum=470 (2)", - "tab": "General information", - "score": 235.0 - }, - "Conceptual Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Conceptual Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Conceptual Physics - # prompt tokens": { - "description": "min=323.536, mean=323.536, max=323.536, sum=647.072 (2)", - "tab": "General information", - "score": 323.53617021276597 - }, - "Conceptual Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" - } - } - }, - { - "evaluation_name": "Electrical Engineering", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Electrical Engineering", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.69, - "details": { - "description": "min=0.69, mean=0.69, max=0.69, sum=1.379 (2)", - "tab": "Accuracy", - "Electrical Engineering - Observed inference time (s)": { - "description": "min=0.696, mean=0.696, max=0.696, sum=1.392 (2)", - "tab": "Efficiency", - "score": 0.6958530524681354 - }, - "Electrical Engineering - # eval": { - "description": "min=145, mean=145, max=145, sum=290 (2)", - "tab": "General information", - "score": 145.0 - }, - "Electrical Engineering - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Electrical Engineering - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - 
"tab": "General information", - "score": 0.0 - }, - "Electrical Engineering - # prompt tokens": { - "description": "min=437.041, mean=437.041, max=437.041, sum=874.083 (2)", - "tab": "General information", - "score": 437.04137931034484 - }, - "Electrical Engineering - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" - } - } - }, - { - "evaluation_name": "Elementary Mathematics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Elementary Mathematics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.558, - "details": { - "description": "min=0.558, mean=0.558, max=0.558, sum=1.116 (2)", - "tab": "Accuracy", - "Elementary Mathematics - Observed inference time (s)": { - "description": "min=0.734, mean=0.734, max=0.734, sum=1.468 (2)", - "tab": "Efficiency", - "score": 0.73423323177156 - }, - "Elementary Mathematics - # eval": { - "description": "min=378, mean=378, max=378, sum=756 (2)", - "tab": "General information", - "score": 378.0 - }, - "Elementary Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Elementary Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Elementary Mathematics - # prompt tokens": { - "description": "min=511.066, mean=511.066, max=511.066, sum=1022.132 (2)", - "tab": "General information", - "score": 511.06613756613757 - }, - "Elementary Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" - } - } - }, - { - "evaluation_name": "Formal Logic", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Formal Logic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.579, - "details": { - "description": "min=0.579, mean=0.579, max=0.579, sum=1.159 (2)", - "tab": "Accuracy", - "Formal Logic - Observed inference time (s)": { - "description": "min=0.731, mean=0.731, max=0.731, sum=1.462 (2)", - "tab": "Efficiency", - "score": 0.7307745880550809 - }, - "Formal Logic - # eval": { - "description": "min=126, mean=126, max=126, sum=252 (2)", - "tab": "General information", - "score": 126.0 - }, - "Formal Logic - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Formal Logic - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Formal Logic 
- # prompt tokens": { - "description": "min=646.746, mean=646.746, max=646.746, sum=1293.492 (2)", - "tab": "General information", - "score": 646.7460317460317 - }, - "Formal Logic - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" - } - } - }, - { - "evaluation_name": "High School World History", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on High School World History", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.878, - "details": { - "description": "min=0.878, mean=0.878, max=0.878, sum=1.755 (2)", - "tab": "Accuracy", - "High School Biology - Observed inference time (s)": { - "description": "min=0.714, mean=0.714, max=0.714, sum=1.428 (2)", - "tab": "Efficiency", - "score": 0.7141557578117617 - }, - "High School Chemistry - Observed inference time (s)": { - "description": "min=0.701, mean=0.701, max=0.701, sum=1.403 (2)", - "tab": "Efficiency", - "score": 0.7014370187750003 - }, - "High School Computer Science - Observed inference time (s)": { - "description": "min=0.747, mean=0.747, max=0.747, sum=1.494 (2)", - "tab": "Efficiency", - "score": 0.7470939707756042 - }, - "High School European History - Observed inference time (s)": { - "description": "min=0.966, mean=0.966, max=0.966, sum=1.932 (2)", - "tab": "Efficiency", - "score": 0.9658473159327652 - }, - "High School Geography - Observed inference time (s)": { - "description": "min=0.663, mean=0.663, max=0.663, sum=1.326 (2)", - "tab": "Efficiency", - "score": 0.6627856938525883 - }, - "High School Government And Politics - Observed inference time (s)": { - "description": "min=0.686, mean=0.686, max=0.686, sum=1.373 (2)", - "tab": "Efficiency", - "score": 0.6863837884497767 - }, - "High School Macroeconomics - Observed inference time (s)": { - "description": "min=0.681, mean=0.681, max=0.681, sum=1.361 (2)", - "tab": "Efficiency", - "score": 0.6806940922370324 - }, - "High School Mathematics - Observed inference time (s)": { - "description": "min=0.708, mean=0.708, max=0.708, sum=1.416 (2)", - "tab": "Efficiency", - "score": 0.7079638242721558 - }, - "High School Microeconomics - Observed inference time (s)": { - "description": "min=0.674, mean=0.674, max=0.674, sum=1.348 (2)", - "tab": "Efficiency", - "score": 0.6742001541522371 - }, - "High School Physics - Observed inference time (s)": { - "description": "min=0.706, mean=0.706, max=0.706, sum=1.411 (2)", - "tab": "Efficiency", - "score": 0.7056786966639639 - }, - "High School Psychology - Observed inference time (s)": { - "description": "min=0.696, mean=0.696, max=0.696, sum=1.392 (2)", - "tab": "Efficiency", - "score": 0.6960603683366688 - }, - "High School Statistics - Observed inference time (s)": { - "description": "min=0.726, mean=0.726, max=0.726, sum=1.452 (2)", - "tab": "Efficiency", - "score": 0.7262004735293212 - }, - "High School US History - Observed inference time (s)": { - "description": "min=0.876, mean=0.876, max=0.876, sum=1.752 (2)", - "tab": "Efficiency", - "score": 0.8757836842069439 - }, - "High School 
World History - Observed inference time (s)": { - "description": "min=0.785, mean=0.785, max=0.785, sum=1.571 (2)", - "tab": "Efficiency", - "score": 0.7852678007214381 - }, - "High School Biology - # eval": { - "description": "min=310, mean=310, max=310, sum=620 (2)", - "tab": "General information", - "score": 310.0 - }, - "High School Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Biology - # prompt tokens": { - "description": "min=534.577, mean=534.577, max=534.577, sum=1069.155 (2)", - "tab": "General information", - "score": 534.5774193548388 - }, - "High School Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Chemistry - # eval": { - "description": "min=203, mean=203, max=203, sum=406 (2)", - "tab": "General information", - "score": 203.0 - }, - "High School Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # prompt tokens": { - "description": "min=497.921, mean=497.921, max=497.921, sum=995.842 (2)", - "tab": "General information", - "score": 497.92118226600985 - }, - "High School Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "High School Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # prompt tokens": { - "description": "min=882.4, mean=882.4, max=882.4, sum=1764.8 (2)", - "tab": "General information", - "score": 882.4 - }, - "High School Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School European History - # eval": { - "description": "min=165, mean=165, max=165, sum=330 (2)", - "tab": "General information", - "score": 165.0 - }, - "High School European History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School European History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School European History - # prompt tokens": { - "description": "min=2887.576, mean=2887.576, max=2887.576, sum=5775.152 (2)", - "tab": "General information", - "score": 2887.5757575757575 - }, - "High School European History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Geography - # eval": { - "description": "min=198, mean=198, max=198, sum=396 (2)", - "tab": "General information", - "score": 198.0 - }, - "High School Geography - # train": { - "description": 
"min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Geography - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Geography - # prompt tokens": { - "description": "min=412.268, mean=412.268, max=412.268, sum=824.535 (2)", - "tab": "General information", - "score": 412.2676767676768 - }, - "High School Geography - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Government And Politics - # eval": { - "description": "min=193, mean=193, max=193, sum=386 (2)", - "tab": "General information", - "score": 193.0 - }, - "High School Government And Politics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Government And Politics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # prompt tokens": { - "description": "min=491.104, mean=491.104, max=491.104, sum=982.207 (2)", - "tab": "General information", - "score": 491.10362694300517 - }, - "High School Government And Politics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Macroeconomics - # eval": { - "description": "min=390, mean=390, max=390, sum=780 (2)", - "tab": "General information", - "score": 390.0 - }, - "High School Macroeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Macroeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - # prompt tokens": { - "description": "min=406.036, mean=406.036, max=406.036, sum=812.072 (2)", - "tab": "General information", - "score": 406.0358974358974 - }, - "High School Macroeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Mathematics - # eval": { - "description": "min=270, mean=270, max=270, sum=540 (2)", - "tab": "General information", - "score": 270.0 - }, - "High School Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # prompt tokens": { - "description": "min=519.881, mean=519.881, max=519.881, sum=1039.763 (2)", - "tab": "General information", - "score": 519.8814814814815 - }, - "High School Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Microeconomics - # eval": { - "description": "min=238, mean=238, max=238, sum=476 (2)", - "tab": "General information", - "score": 238.0 - }, - "High School Microeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Microeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Microeconomics - # prompt tokens": { - "description": 
"min=420.513, mean=420.513, max=420.513, sum=841.025 (2)", - "tab": "General information", - "score": 420.5126050420168 - }, - "High School Microeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Physics - # eval": { - "description": "min=151, mean=151, max=151, sum=302 (2)", - "tab": "General information", - "score": 151.0 - }, - "High School Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # prompt tokens": { - "description": "min=558.841, mean=558.841, max=558.841, sum=1117.682 (2)", - "tab": "General information", - "score": 558.841059602649 - }, - "High School Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Psychology - # eval": { - "description": "min=545, mean=545, max=545, sum=1090 (2)", - "tab": "General information", - "score": 545.0 - }, - "High School Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # prompt tokens": { - "description": "min=521.42, mean=521.42, max=521.42, sum=1042.84 (2)", - "tab": "General information", - "score": 521.4201834862386 - }, - "High School Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Statistics - # eval": { - "description": "min=216, mean=216, max=216, sum=432 (2)", - "tab": "General information", - "score": 216.0 - }, - "High School Statistics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Statistics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Statistics - # prompt tokens": { - "description": "min=806.963, mean=806.963, max=806.963, sum=1613.926 (2)", - "tab": "General information", - "score": 806.9629629629629 - }, - "High School Statistics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School US History - # eval": { - "description": "min=204, mean=204, max=204, sum=408 (2)", - "tab": "General information", - "score": 204.0 - }, - "High School US History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School US History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # prompt tokens": { - "description": "min=2288.49, mean=2288.49, max=2288.49, sum=4576.98 (2)", - "tab": "General information", - "score": 2288.4901960784314 - }, - "High School US History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School World History - # eval": { - "description": "min=237, mean=237, max=237, sum=474 (2)", - "tab": "General information", - "score": 
237.0 - }, - "High School World History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School World History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School World History - # prompt tokens": { - "description": "min=1475.932, mean=1475.932, max=1475.932, sum=2951.865 (2)", - "tab": "General information", - "score": 1475.9324894514768 - }, - "High School World History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" - } - } - }, - { - "evaluation_name": "Human Sexuality", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Human Sexuality", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.824, - "details": { - "description": "min=0.824, mean=0.824, max=0.824, sum=1.649 (2)", - "tab": "Accuracy", - "Human Aging - Observed inference time (s)": { - "description": "min=0.691, mean=0.691, max=0.691, sum=1.382 (2)", - "tab": "Efficiency", - "score": 0.6907867818669888 - }, - "Human Sexuality - Observed inference time (s)": { - "description": "min=0.665, mean=0.665, max=0.665, sum=1.331 (2)", - "tab": "Efficiency", - "score": 0.6653509722411177 - }, - "Human Aging - # eval": { - "description": "min=223, mean=223, max=223, sum=446 (2)", - "tab": "General information", - "score": 223.0 - }, - "Human Aging - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Aging - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Aging - # prompt tokens": { - "description": "min=335.955, mean=335.955, max=335.955, sum=671.91 (2)", - "tab": "General information", - "score": 335.95515695067263 - }, - "Human Aging - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Human Sexuality - # eval": { - "description": "min=131, mean=131, max=131, sum=262 (2)", - "tab": "General information", - "score": 131.0 - }, - "Human Sexuality - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Sexuality - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # prompt tokens": { - "description": "min=371.496, mean=371.496, max=371.496, sum=742.992 (2)", - "tab": "General information", - "score": 371.4961832061069 - }, - "Human Sexuality - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" - } - } - }, - { - "evaluation_name": "International Law", - "source_data": 
{ - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on International Law", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.901, - "details": { - "description": "min=0.901, mean=0.901, max=0.901, sum=1.802 (2)", - "tab": "Accuracy", - "International Law - Observed inference time (s)": { - "description": "min=0.723, mean=0.723, max=0.723, sum=1.446 (2)", - "tab": "Efficiency", - "score": 0.7232089219999708 - }, - "International Law - # eval": { - "description": "min=121, mean=121, max=121, sum=242 (2)", - "tab": "General information", - "score": 121.0 - }, - "International Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "International Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "International Law - # prompt tokens": { - "description": "min=664.165, mean=664.165, max=664.165, sum=1328.331 (2)", - "tab": "General information", - "score": 664.1652892561983 - }, - "International Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" - } - } - }, - { - "evaluation_name": "Logical Fallacies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Logical Fallacies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.791, - "details": { - "description": "min=0.791, mean=0.791, max=0.791, sum=1.583 (2)", - "tab": "Accuracy", - "Logical Fallacies - Observed inference time (s)": { - "description": "min=0.665, mean=0.665, max=0.665, sum=1.331 (2)", - "tab": "Efficiency", - "score": 0.6653785354520646 - }, - "Logical Fallacies - # eval": { - "description": "min=163, mean=163, max=163, sum=326 (2)", - "tab": "General information", - "score": 163.0 - }, - "Logical Fallacies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Logical Fallacies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Logical Fallacies - # prompt tokens": { - "description": "min=470.276, mean=470.276, max=470.276, sum=940.552 (2)", - "tab": "General information", - "score": 470.2760736196319 - }, - "Logical Fallacies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" - } - } - }, - { - "evaluation_name": "Machine Learning", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Machine Learning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.589, - "details": { - "description": "min=0.589, mean=0.589, max=0.589, sum=1.179 (2)", - "tab": "Accuracy", - "Machine Learning - Observed inference time (s)": { - "description": "min=0.722, mean=0.722, max=0.722, sum=1.444 (2)", - "tab": "Efficiency", - "score": 0.7220823402915683 - }, - "Machine Learning - # eval": { - "description": "min=112, mean=112, max=112, sum=224 (2)", - "tab": "General information", - "score": 112.0 - }, - "Machine Learning - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Machine Learning - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Machine Learning - # prompt tokens": { - "description": "min=676.518, mean=676.518, max=676.518, sum=1353.036 (2)", - "tab": "General information", - "score": 676.5178571428571 - }, - "Machine Learning - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" - } - } - }, - { - "evaluation_name": "Management", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Management", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.874, - "details": { - "description": "min=0.874, mean=0.874, max=0.874, sum=1.748 (2)", - "tab": "Accuracy", - "Management - Observed inference time (s)": { - "description": "min=0.663, mean=0.663, max=0.663, sum=1.327 (2)", - "tab": "Efficiency", - "score": 0.6634428709456064 - }, - "Management - # eval": { - "description": "min=103, mean=103, max=103, sum=206 (2)", - "tab": "General information", - "score": 103.0 - }, - "Management - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Management - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Management - # prompt tokens": { - "description": "min=301.282, mean=301.282, max=301.282, sum=602.563 (2)", - "tab": "General information", - "score": 301.28155339805824 - }, - "Management - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" - } - } - }, - { - "evaluation_name": "Marketing", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Marketing", 
- "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.91, - "details": { - "description": "min=0.91, mean=0.91, max=0.91, sum=1.821 (2)", - "tab": "Accuracy", - "Marketing - Observed inference time (s)": { - "description": "min=0.665, mean=0.665, max=0.665, sum=1.33 (2)", - "tab": "Efficiency", - "score": 0.6648106361046816 - }, - "Marketing - # eval": { - "description": "min=234, mean=234, max=234, sum=468 (2)", - "tab": "General information", - "score": 234.0 - }, - "Marketing - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Marketing - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Marketing - # prompt tokens": { - "description": "min=448.064, mean=448.064, max=448.064, sum=896.128 (2)", - "tab": "General information", - "score": 448.06410256410254 - }, - "Marketing - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" - } - } - }, - { - "evaluation_name": "Medical Genetics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Medical Genetics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8, - "details": { - "description": "min=0.8, mean=0.8, max=0.8, sum=1.6 (2)", - "tab": "Accuracy", - "Medical Genetics - Observed inference time (s)": { - "description": "min=0.662, mean=0.662, max=0.662, sum=1.324 (2)", - "tab": "Efficiency", - "score": 0.6621059203147888 - }, - "Medical Genetics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Medical Genetics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Medical Genetics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Medical Genetics - # prompt tokens": { - "description": "min=354.88, mean=354.88, max=354.88, sum=709.76 (2)", - "tab": "General information", - "score": 354.88 - }, - "Medical Genetics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" - } - } - }, - { - "evaluation_name": "Miscellaneous", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Miscellaneous", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.893, - "details": { - "description": "min=0.893, mean=0.893, max=0.893, sum=1.785 
(2)", - "tab": "Accuracy", - "Miscellaneous - Observed inference time (s)": { - "description": "min=0.678, mean=0.678, max=0.678, sum=1.357 (2)", - "tab": "Efficiency", - "score": 0.6782779660109207 - }, - "Miscellaneous - # eval": { - "description": "min=783, mean=783, max=783, sum=1566 (2)", - "tab": "General information", - "score": 783.0 - }, - "Miscellaneous - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Miscellaneous - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Miscellaneous - # prompt tokens": { - "description": "min=328.628, mean=328.628, max=328.628, sum=657.257 (2)", - "tab": "General information", - "score": 328.62835249042143 - }, - "Miscellaneous - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" - } - } - }, - { - "evaluation_name": "Moral Scenarios", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Moral Scenarios", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.502, - "details": { - "description": "min=0.502, mean=0.502, max=0.502, sum=1.003 (2)", - "tab": "Accuracy", - "Moral Disputes - Observed inference time (s)": { - "description": "min=0.709, mean=0.709, max=0.709, sum=1.419 (2)", - "tab": "Efficiency", - "score": 0.7093146880927114 - }, - "Moral Scenarios - Observed inference time (s)": { - "description": "min=0.716, mean=0.716, max=0.716, sum=1.432 (2)", - "tab": "Efficiency", - "score": 0.7158833943265777 - }, - "Moral Disputes - # eval": { - "description": "min=346, mean=346, max=346, sum=692 (2)", - "tab": "General information", - "score": 346.0 - }, - "Moral Disputes - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Disputes - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Disputes - # prompt tokens": { - "description": "min=511.789, mean=511.789, max=511.789, sum=1023.578 (2)", - "tab": "General information", - "score": 511.78901734104045 - }, - "Moral Disputes - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Moral Scenarios - # eval": { - "description": "min=895, mean=895, max=895, sum=1790 (2)", - "tab": "General information", - "score": 895.0 - }, - "Moral Scenarios - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Scenarios - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # prompt tokens": { - "description": "min=676.949, mean=676.949, max=676.949, sum=1353.897 (2)", - "tab": "General information", - "score": 676.9486033519553 - }, - "Moral Scenarios - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", 
- "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" - } - } - }, - { - "evaluation_name": "Nutrition", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Nutrition", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.83, - "details": { - "description": "min=0.83, mean=0.83, max=0.83, sum=1.66 (2)", - "tab": "Accuracy", - "Nutrition - Observed inference time (s)": { - "description": "min=0.72, mean=0.72, max=0.72, sum=1.441 (2)", - "tab": "Efficiency", - "score": 0.720291394813388 - }, - "Nutrition - # eval": { - "description": "min=306, mean=306, max=306, sum=612 (2)", - "tab": "General information", - "score": 306.0 - }, - "Nutrition - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Nutrition - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Nutrition - # prompt tokens": { - "description": "min=617.065, mean=617.065, max=617.065, sum=1234.131 (2)", - "tab": "General information", - "score": 617.0653594771242 - }, - "Nutrition - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" - } - } - }, - { - "evaluation_name": "Prehistory", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Prehistory", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.824, - "details": { - "description": "min=0.824, mean=0.824, max=0.824, sum=1.648 (2)", - "tab": "Accuracy", - "Prehistory - Observed inference time (s)": { - "description": "min=0.713, mean=0.713, max=0.713, sum=1.427 (2)", - "tab": "Efficiency", - "score": 0.7133041966108629 - }, - "Prehistory - # eval": { - "description": "min=324, mean=324, max=324, sum=648 (2)", - "tab": "General information", - "score": 324.0 - }, - "Prehistory - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Prehistory - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Prehistory - # prompt tokens": { - "description": "min=545.639, mean=545.639, max=545.639, sum=1091.278 (2)", - "tab": "General information", - "score": 545.6388888888889 - }, - "Prehistory - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" - } - } - }, - { - "evaluation_name": "Public 
Relations", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Public Relations", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.755, - "details": { - "description": "min=0.755, mean=0.755, max=0.755, sum=1.509 (2)", - "tab": "Accuracy", - "Public Relations - Observed inference time (s)": { - "description": "min=0.684, mean=0.684, max=0.684, sum=1.369 (2)", - "tab": "Efficiency", - "score": 0.6844336206262762 - }, - "Public Relations - # eval": { - "description": "min=110, mean=110, max=110, sum=220 (2)", - "tab": "General information", - "score": 110.0 - }, - "Public Relations - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Public Relations - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Public Relations - # prompt tokens": { - "description": "min=432.991, mean=432.991, max=432.991, sum=865.982 (2)", - "tab": "General information", - "score": 432.9909090909091 - }, - "Public Relations - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" - } - } - }, - { - "evaluation_name": "Security Studies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Security Studies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.808, - "details": { - "description": "min=0.808, mean=0.808, max=0.808, sum=1.616 (2)", - "tab": "Accuracy", - "Security Studies - Observed inference time (s)": { - "description": "min=0.77, mean=0.77, max=0.77, sum=1.54 (2)", - "tab": "Efficiency", - "score": 0.7701463602027114 - }, - "Security Studies - # eval": { - "description": "min=245, mean=245, max=245, sum=490 (2)", - "tab": "General information", - "score": 245.0 - }, - "Security Studies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Security Studies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Security Studies - # prompt tokens": { - "description": "min=1243.804, mean=1243.804, max=1243.804, sum=2487.608 (2)", - "tab": "General information", - "score": 1243.8040816326532 - }, - "Security Studies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" - } - } - }, - { - "evaluation_name": "Sociology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Sociology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9, - "details": { - "description": "min=0.9, mean=0.9, max=0.9, sum=1.801 (2)", - "tab": "Accuracy", - "Sociology - Observed inference time (s)": { - "description": "min=0.69, mean=0.69, max=0.69, sum=1.38 (2)", - "tab": "Efficiency", - "score": 0.6899205867330827 - }, - "Sociology - # eval": { - "description": "min=201, mean=201, max=201, sum=402 (2)", - "tab": "General information", - "score": 201.0 - }, - "Sociology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Sociology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Sociology - # prompt tokens": { - "description": "min=467.274, mean=467.274, max=467.274, sum=934.547 (2)", - "tab": "General information", - "score": 467.27363184079604 - }, - "Sociology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" - } - } - }, - { - "evaluation_name": "Virology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Virology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.542, - "details": { - "description": "min=0.542, mean=0.542, max=0.542, sum=1.084 (2)", - "tab": "Accuracy", - "Virology - Observed inference time (s)": { - "description": "min=0.728, mean=0.728, max=0.728, sum=1.456 (2)", - "tab": "Efficiency", - "score": 0.7279246169400503 - }, - "Virology - # eval": { - "description": "min=166, mean=166, max=166, sum=332 (2)", - "tab": "General information", - "score": 166.0 - }, - "Virology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Virology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Virology - # prompt tokens": { - "description": "min=362.651, mean=362.651, max=362.651, sum=725.301 (2)", - "tab": "General information", - "score": 362.65060240963857 - }, - "Virology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" - } - } - }, - { - "evaluation_name": "World Religions", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on World Religions", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.871, - "details": { - "description": "min=0.871, mean=0.871, max=0.871, sum=1.743 (2)", - "tab": "Accuracy", - "World Religions - Observed inference time (s)": { - "description": "min=0.727, mean=0.727, max=0.727, sum=1.454 (2)", - "tab": "Efficiency", - "score": 0.7269549021246837 - }, - "World Religions - # eval": { - "description": "min=171, mean=171, max=171, sum=342 (2)", - "tab": "General information", - "score": 171.0 - }, - "World Religions - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "World Religions - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "World Religions - # prompt tokens": { - "description": "min=293.018, mean=293.018, max=293.018, sum=586.035 (2)", - "tab": "General information", - "score": 293.0175438596491 - }, - "World Religions - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" - } - } - }, - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.28, - "details": { - "tab": "Efficiency" - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_mmlu/anthropic/claude-3-opus-20240229/acdf4701-e1c2-4867-bd85-d34ae8fb0991.json b/data/helm_mmlu/anthropic/claude-3-opus-20240229/acdf4701-e1c2-4867-bd85-d34ae8fb0991.json deleted file mode 100644 index 230be4291..000000000 --- a/data/helm_mmlu/anthropic/claude-3-opus-20240229/acdf4701-e1c2-4867-bd85-d34ae8fb0991.json +++ /dev/null @@ -1,3021 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/anthropic_claude-3-opus-20240229/1770835937.459157", - "retrieved_timestamp": "1770835937.459157", - "source_metadata": { - "source_name": "helm_mmlu", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Claude 3 Opus 20240229", - "id": "anthropic/claude-3-opus-20240229", - "developer": "anthropic", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "MMLU All Subjects", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU All Subjects", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.846, - "details": { - "description": "min=0.55, mean=0.846, max=0.979, sum=96.412 (114)", - "tab": "Accuracy", - "MMLU All Subjects - Observed inference time (s)": { - "description": "min=3.782, mean=4.077, max=5.005, sum=464.781 (114)", 
- "tab": "Efficiency", - "score": 4.077024270463863 - }, - "MMLU All Subjects - # eval": { - "description": "min=100, mean=246.351, max=1534, sum=28084 (114)", - "tab": "General information", - "score": 246.35087719298247 - }, - "MMLU All Subjects - # train": { - "description": "min=5, mean=5, max=5, sum=570 (114)", - "tab": "General information", - "score": 5.0 - }, - "MMLU All Subjects - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (114)", - "tab": "General information", - "score": 0.0 - }, - "MMLU All Subjects - # prompt tokens": { - "description": "min=293.018, mean=638.288, max=2887.576, sum=72764.875 (114)", - "tab": "General information", - "score": 638.2883793758953 - }, - "MMLU All Subjects - # output tokens": { - "description": "min=1, mean=1, max=1, sum=114 (114)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - 
"mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] - } - } - }, - { - "evaluation_name": "Abstract Algebra", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Abstract Algebra", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.64, - "details": { - "description": "min=0.64, mean=0.64, max=0.64, sum=1.28 (2)", - "tab": "Accuracy", - "Abstract Algebra - Observed inference time (s)": { - "description": "min=4.182, mean=4.182, max=4.182, sum=8.364 (2)", - "tab": "Efficiency", - "score": 4.182226595878601 - }, - "Abstract Algebra - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Abstract Algebra - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Abstract Algebra - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Abstract Algebra - # prompt tokens": { - "description": "min=370.26, mean=370.26, max=370.26, sum=740.52 (2)", - "tab": "General information", - "score": 370.26 - }, - "Abstract Algebra - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" - } - } - }, - { - "evaluation_name": "Anatomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Anatomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8, - "details": { - "description": "min=0.8, mean=0.8, max=0.8, sum=1.6 (2)", - "tab": "Accuracy", - "Anatomy - Observed inference time (s)": { - "description": "min=4.115, mean=4.115, max=4.115, sum=8.23 (2)", - "tab": "Efficiency", - "score": 4.114818896187677 - }, - "Anatomy - # eval": { - "description": "min=135, mean=135, max=135, sum=270 (2)", - "tab": "General information", - "score": 135.0 - }, - "Anatomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Anatomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Anatomy - # prompt tokens": { - "description": "min=370.8, mean=370.8, max=370.8, sum=741.6 (2)", - "tab": "General information", - "score": 370.8 - }, - "Anatomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - 
"additional_details": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" - } - } - }, - { - "evaluation_name": "College Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on College Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.716, - "details": { - "description": "min=0.716, mean=0.716, max=0.716, sum=1.431 (2)", - "tab": "Accuracy", - "College Chemistry - Observed inference time (s)": { - "description": "min=4.373, mean=4.373, max=4.373, sum=8.745 (2)", - "tab": "Efficiency", - "score": 4.372743592262268 - }, - "College Biology - Observed inference time (s)": { - "description": "min=4.045, mean=4.045, max=4.045, sum=8.09 (2)", - "tab": "Efficiency", - "score": 4.044814482331276 - }, - "College Computer Science - Observed inference time (s)": { - "description": "min=4.326, mean=4.326, max=4.326, sum=8.652 (2)", - "tab": "Efficiency", - "score": 4.3260163617134095 - }, - "College Mathematics - Observed inference time (s)": { - "description": "min=4.209, mean=4.209, max=4.209, sum=8.417 (2)", - "tab": "Efficiency", - "score": 4.208740277290344 - }, - "College Medicine - Observed inference time (s)": { - "description": "min=3.994, mean=3.994, max=3.994, sum=7.988 (2)", - "tab": "Efficiency", - "score": 3.9939607113082976 - }, - "College Physics - Observed inference time (s)": { - "description": "min=3.982, mean=3.982, max=3.982, sum=7.965 (2)", - "tab": "Efficiency", - "score": 3.9823715172561944 - }, - "College Chemistry - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Chemistry - # prompt tokens": { - "description": "min=550.01, mean=550.01, max=550.01, sum=1100.02 (2)", - "tab": "General information", - "score": 550.01 - }, - "College Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Biology - # eval": { - "description": "min=144, mean=144, max=144, sum=288 (2)", - "tab": "General information", - "score": 144.0 - }, - "College Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # prompt tokens": { - "description": "min=490.347, mean=490.347, max=490.347, sum=980.694 (2)", - "tab": "General information", - "score": 490.34722222222223 - }, - "College Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 
(2)", - "tab": "General information", - "score": 5.0 - }, - "College Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer Science - # prompt tokens": { - "description": "min=838.24, mean=838.24, max=838.24, sum=1676.48 (2)", - "tab": "General information", - "score": 838.24 - }, - "College Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Mathematics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # prompt tokens": { - "description": "min=604.19, mean=604.19, max=604.19, sum=1208.38 (2)", - "tab": "General information", - "score": 604.19 - }, - "College Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Medicine - # eval": { - "description": "min=173, mean=173, max=173, sum=346 (2)", - "tab": "General information", - "score": 173.0 - }, - "College Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Medicine - # prompt tokens": { - "description": "min=540.63, mean=540.63, max=540.63, sum=1081.26 (2)", - "tab": "General information", - "score": 540.6300578034682 - }, - "College Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Physics - # eval": { - "description": "min=102, mean=102, max=102, sum=204 (2)", - "tab": "General information", - "score": 102.0 - }, - "College Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Physics - # prompt tokens": { - "description": "min=489.48, mean=489.48, max=489.48, sum=978.961 (2)", - "tab": "General information", - "score": 489.48039215686276 - }, - "College Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" - } - } - }, - { - "evaluation_name": "Computer Security", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Computer Security", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.85, - "details": { - "description": "min=0.85, mean=0.85, max=0.85, sum=1.7 (2)", - 
"tab": "Accuracy", - "Computer Security - Observed inference time (s)": { - "description": "min=4.105, mean=4.105, max=4.105, sum=8.211 (2)", - "tab": "Efficiency", - "score": 4.105417683124542 - }, - "Computer Security - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Computer Security - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Computer Security - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Computer Security - # prompt tokens": { - "description": "min=398.62, mean=398.62, max=398.62, sum=797.24 (2)", - "tab": "General information", - "score": 398.62 - }, - "Computer Security - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" - } - } - }, - { - "evaluation_name": "Econometrics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Econometrics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.789, - "details": { - "description": "min=0.789, mean=0.789, max=0.789, sum=1.579 (2)", - "tab": "Accuracy", - "Econometrics - Observed inference time (s)": { - "description": "min=4.284, mean=4.284, max=4.284, sum=8.569 (2)", - "tab": "Efficiency", - "score": 4.284419020016988 - }, - "Econometrics - # eval": { - "description": "min=114, mean=114, max=114, sum=228 (2)", - "tab": "General information", - "score": 114.0 - }, - "Econometrics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Econometrics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Econometrics - # prompt tokens": { - "description": "min=619.596, mean=619.596, max=619.596, sum=1239.193 (2)", - "tab": "General information", - "score": 619.5964912280701 - }, - "Econometrics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" - } - } - }, - { - "evaluation_name": "Global Facts", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Global Facts", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.66, - "details": { - "description": "min=0.66, mean=0.66, max=0.66, sum=1.32 (2)", - "tab": "Accuracy", - "Global Facts - Observed inference time (s)": { - "description": "min=4.232, mean=4.232, max=4.232, sum=8.465 (2)", - "tab": "Efficiency", - "score": 
4.232321140766143 - }, - "Global Facts - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Global Facts - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Global Facts - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Global Facts - # prompt tokens": { - "description": "min=411.61, mean=411.61, max=411.61, sum=823.22 (2)", - "tab": "General information", - "score": 411.61 - }, - "Global Facts - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" - } - } - }, - { - "evaluation_name": "Jurisprudence", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Jurisprudence", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.88, - "details": { - "description": "min=0.88, mean=0.88, max=0.88, sum=1.759 (2)", - "tab": "Accuracy", - "Jurisprudence - Observed inference time (s)": { - "description": "min=3.872, mean=3.872, max=3.872, sum=7.744 (2)", - "tab": "Efficiency", - "score": 3.8720074185618647 - }, - "Jurisprudence - # eval": { - "description": "min=108, mean=108, max=108, sum=216 (2)", - "tab": "General information", - "score": 108.0 - }, - "Jurisprudence - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Jurisprudence - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Jurisprudence - # prompt tokens": { - "description": "min=431.426, mean=431.426, max=431.426, sum=862.852 (2)", - "tab": "General information", - "score": 431.4259259259259 - }, - "Jurisprudence - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" - } - } - }, - { - "evaluation_name": "Philosophy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Philosophy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9, - "details": { - "description": "min=0.9, mean=0.9, max=0.9, sum=1.801 (2)", - "tab": "Accuracy", - "Philosophy - Observed inference time (s)": { - "description": "min=3.967, mean=3.967, max=3.967, sum=7.935 (2)", - "tab": "Efficiency", - "score": 3.9672668930801933 - }, - "Philosophy - # eval": { - "description": "min=311, mean=311, max=311, sum=622 (2)", - "tab": "General information", - "score": 311.0 - }, - "Philosophy - # train": { - "description": "min=5, mean=5, 
max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Philosophy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Philosophy - # prompt tokens": { - "description": "min=359.965, mean=359.965, max=359.965, sum=719.929 (2)", - "tab": "General information", - "score": 359.9646302250804 - }, - "Philosophy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" - } - } - }, - { - "evaluation_name": "Professional Psychology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Professional Psychology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.904, - "details": { - "description": "min=0.904, mean=0.904, max=0.904, sum=1.807 (2)", - "tab": "Accuracy", - "Professional Medicine - Observed inference time (s)": { - "description": "min=4.358, mean=4.358, max=4.358, sum=8.715 (2)", - "tab": "Efficiency", - "score": 4.357662654974881 - }, - "Professional Accounting - Observed inference time (s)": { - "description": "min=3.982, mean=3.982, max=3.982, sum=7.965 (2)", - "tab": "Efficiency", - "score": 3.9823869661236486 - }, - "Professional Law - Observed inference time (s)": { - "description": "min=4.483, mean=4.483, max=4.483, sum=8.967 (2)", - "tab": "Efficiency", - "score": 4.483374906953963 - }, - "Professional Psychology - Observed inference time (s)": { - "description": "min=4.006, mean=4.006, max=4.006, sum=8.012 (2)", - "tab": "Efficiency", - "score": 4.0058385706415365 - }, - "Professional Medicine - # eval": { - "description": "min=272, mean=272, max=272, sum=544 (2)", - "tab": "General information", - "score": 272.0 - }, - "Professional Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Medicine - # prompt tokens": { - "description": "min=1123.537, mean=1123.537, max=1123.537, sum=2247.074 (2)", - "tab": "General information", - "score": 1123.5367647058824 - }, - "Professional Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Accounting - # eval": { - "description": "min=282, mean=282, max=282, sum=564 (2)", - "tab": "General information", - "score": 282.0 - }, - "Professional Accounting - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Accounting - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Accounting - # prompt tokens": { - "description": "min=665.422, mean=665.422, max=665.422, sum=1330.844 (2)", - "tab": "General information", - "score": 665.4219858156029 - }, - "Professional Accounting - # output tokens": { - "description": "min=1, mean=1, max=1, 
sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Law - # eval": { - "description": "min=1534, mean=1534, max=1534, sum=3068 (2)", - "tab": "General information", - "score": 1534.0 - }, - "Professional Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # prompt tokens": { - "description": "min=1701.16, mean=1701.16, max=1701.16, sum=3402.321 (2)", - "tab": "General information", - "score": 1701.16036505867 - }, - "Professional Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Psychology - # eval": { - "description": "min=612, mean=612, max=612, sum=1224 (2)", - "tab": "General information", - "score": 612.0 - }, - "Professional Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # prompt tokens": { - "description": "min=603.168, mean=603.168, max=603.168, sum=1206.337 (2)", - "tab": "General information", - "score": 603.1683006535948 - }, - "Professional Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" - } - } - }, - { - "evaluation_name": "Us Foreign Policy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Us Foreign Policy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.96, - "details": { - "description": "min=0.96, mean=0.96, max=0.96, sum=1.92 (2)", - "tab": "Accuracy", - "Us Foreign Policy - Observed inference time (s)": { - "description": "min=4.003, mean=4.003, max=4.003, sum=8.006 (2)", - "tab": "Efficiency", - "score": 4.002964313030243 - }, - "Us Foreign Policy - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Us Foreign Policy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Us Foreign Policy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Us Foreign Policy - # prompt tokens": { - "description": "min=455.25, mean=455.25, max=455.25, sum=910.5 (2)", - "tab": "General information", - "score": 455.25 - }, - "Us Foreign Policy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" - } - } - }, - { - 
"evaluation_name": "Astronomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Astronomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.967, - "details": { - "description": "min=0.967, mean=0.967, max=0.967, sum=1.934 (2)", - "tab": "Accuracy", - "Astronomy - Observed inference time (s)": { - "description": "min=4.099, mean=4.099, max=4.099, sum=8.198 (2)", - "tab": "Efficiency", - "score": 4.099087294779326 - }, - "Astronomy - # eval": { - "description": "min=152, mean=152, max=152, sum=304 (2)", - "tab": "General information", - "score": 152.0 - }, - "Astronomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Astronomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Astronomy - # prompt tokens": { - "description": "min=604.493, mean=604.493, max=604.493, sum=1208.987 (2)", - "tab": "General information", - "score": 604.4934210526316 - }, - "Astronomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" - } - } - }, - { - "evaluation_name": "Business Ethics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Business Ethics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.86, - "details": { - "description": "min=0.86, mean=0.86, max=0.86, sum=1.72 (2)", - "tab": "Accuracy", - "Business Ethics - Observed inference time (s)": { - "description": "min=4.102, mean=4.102, max=4.102, sum=8.204 (2)", - "tab": "Efficiency", - "score": 4.102163214683532 - }, - "Business Ethics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Business Ethics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Business Ethics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Business Ethics - # prompt tokens": { - "description": "min=600.02, mean=600.02, max=600.02, sum=1200.04 (2)", - "tab": "General information", - "score": 600.02 - }, - "Business Ethics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" - } - } - }, - { - "evaluation_name": "Clinical Knowledge", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Clinical Knowledge", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.879, - "details": { - "description": "min=0.879, mean=0.879, max=0.879, sum=1.758 (2)", - "tab": "Accuracy", - "Clinical Knowledge - Observed inference time (s)": { - "description": "min=3.976, mean=3.976, max=3.976, sum=7.952 (2)", - "tab": "Efficiency", - "score": 3.9762323631430574 - }, - "Clinical Knowledge - # eval": { - "description": "min=265, mean=265, max=265, sum=530 (2)", - "tab": "General information", - "score": 265.0 - }, - "Clinical Knowledge - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Clinical Knowledge - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Clinical Knowledge - # prompt tokens": { - "description": "min=429.457, mean=429.457, max=429.457, sum=858.913 (2)", - "tab": "General information", - "score": 429.4566037735849 - }, - "Clinical Knowledge - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" - } - } - }, - { - "evaluation_name": "Conceptual Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Conceptual Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.881, - "details": { - "description": "min=0.881, mean=0.881, max=0.881, sum=1.762 (2)", - "tab": "Accuracy", - "Conceptual Physics - Observed inference time (s)": { - "description": "min=3.959, mean=3.959, max=3.959, sum=7.918 (2)", - "tab": "Efficiency", - "score": 3.9589331109473047 - }, - "Conceptual Physics - # eval": { - "description": "min=235, mean=235, max=235, sum=470 (2)", - "tab": "General information", - "score": 235.0 - }, - "Conceptual Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Conceptual Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Conceptual Physics - # prompt tokens": { - "description": "min=323.536, mean=323.536, max=323.536, sum=647.072 (2)", - "tab": "General information", - "score": 323.53617021276597 - }, - "Conceptual Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" - } - } - }, - { - "evaluation_name": "Electrical Engineering", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Electrical Engineering", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.814, - "details": { - "description": "min=0.814, mean=0.814, max=0.814, sum=1.628 (2)", - "tab": "Accuracy", - "Electrical Engineering - Observed inference time (s)": { - "description": "min=4.017, mean=4.017, max=4.017, sum=8.035 (2)", - "tab": "Efficiency", - "score": 4.017465997564382 - }, - "Electrical Engineering - # eval": { - "description": "min=145, mean=145, max=145, sum=290 (2)", - "tab": "General information", - "score": 145.0 - }, - "Electrical Engineering - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Electrical Engineering - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Electrical Engineering - # prompt tokens": { - "description": "min=437.041, mean=437.041, max=437.041, sum=874.083 (2)", - "tab": "General information", - "score": 437.04137931034484 - }, - "Electrical Engineering - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" - } - } - }, - { - "evaluation_name": "Elementary Mathematics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Elementary Mathematics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.862, - "details": { - "description": "min=0.862, mean=0.862, max=0.862, sum=1.725 (2)", - "tab": "Accuracy", - "Elementary Mathematics - Observed inference time (s)": { - "description": "min=3.937, mean=3.937, max=3.937, sum=7.874 (2)", - "tab": "Efficiency", - "score": 3.937073076212848 - }, - "Elementary Mathematics - # eval": { - "description": "min=378, mean=378, max=378, sum=756 (2)", - "tab": "General information", - "score": 378.0 - }, - "Elementary Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Elementary Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Elementary Mathematics - # prompt tokens": { - "description": "min=511.066, mean=511.066, max=511.066, sum=1022.132 (2)", - "tab": "General information", - "score": 511.06613756613757 - }, - "Elementary Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" - } - } - }, - { - "evaluation_name": "Formal Logic", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Formal Logic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.698, - "details": { - "description": "min=0.698, mean=0.698, max=0.698, sum=1.397 (2)", - "tab": "Accuracy", - "Formal Logic - Observed inference time (s)": { - "description": "min=4.178, mean=4.178, max=4.178, sum=8.356 (2)", - "tab": "Efficiency", - "score": 4.177885971372089 - }, - "Formal Logic - # eval": { - "description": "min=126, mean=126, max=126, sum=252 (2)", - "tab": "General information", - "score": 126.0 - }, - "Formal Logic - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Formal Logic - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Formal Logic - # prompt tokens": { - "description": "min=646.746, mean=646.746, max=646.746, sum=1293.492 (2)", - "tab": "General information", - "score": 646.7460317460317 - }, - "Formal Logic - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" - } - } - }, - { - "evaluation_name": "High School World History", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on High School World History", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.941, - "details": { - "description": "min=0.941, mean=0.941, max=0.941, sum=1.882 (2)", - "tab": "Accuracy", - "High School Biology - Observed inference time (s)": { - "description": "min=4.184, mean=4.184, max=4.184, sum=8.368 (2)", - "tab": "Efficiency", - "score": 4.183918527633913 - }, - "High School Chemistry - Observed inference time (s)": { - "description": "min=4.027, mean=4.027, max=4.027, sum=8.055 (2)", - "tab": "Efficiency", - "score": 4.027491113822449 - }, - "High School Computer Science - Observed inference time (s)": { - "description": "min=3.929, mean=3.929, max=3.929, sum=7.858 (2)", - "tab": "Efficiency", - "score": 3.929041051864624 - }, - "High School European History - Observed inference time (s)": { - "description": "min=5.005, mean=5.005, max=5.005, sum=10.009 (2)", - "tab": "Efficiency", - "score": 5.004520618554317 - }, - "High School Geography - Observed inference time (s)": { - "description": "min=3.872, mean=3.872, max=3.872, sum=7.743 (2)", - "tab": "Efficiency", - "score": 3.87151506332436 - }, - "High School Government And Politics - Observed inference time (s)": { - "description": "min=3.936, mean=3.936, max=3.936, sum=7.872 (2)", - "tab": "Efficiency", - "score": 3.936160638542373 - }, - "High School Macroeconomics - Observed inference time (s)": { - "description": "min=3.782, mean=3.782, max=3.782, sum=7.563 (2)", - "tab": "Efficiency", - "score": 3.781650854379703 - }, - "High School Mathematics - Observed inference time (s)": { - "description": "min=4.061, 
mean=4.061, max=4.061, sum=8.122 (2)", - "tab": "Efficiency", - "score": 4.0608021259307865 - }, - "High School Microeconomics - Observed inference time (s)": { - "description": "min=3.861, mean=3.861, max=3.861, sum=7.722 (2)", - "tab": "Efficiency", - "score": 3.860906556874764 - }, - "High School Physics - Observed inference time (s)": { - "description": "min=3.938, mean=3.938, max=3.938, sum=7.876 (2)", - "tab": "Efficiency", - "score": 3.9381139499462203 - }, - "High School Psychology - Observed inference time (s)": { - "description": "min=4.059, mean=4.059, max=4.059, sum=8.118 (2)", - "tab": "Efficiency", - "score": 4.058962697282843 - }, - "High School Statistics - Observed inference time (s)": { - "description": "min=4.024, mean=4.024, max=4.024, sum=8.047 (2)", - "tab": "Efficiency", - "score": 4.023671524392234 - }, - "High School US History - Observed inference time (s)": { - "description": "min=4.606, mean=4.606, max=4.606, sum=9.213 (2)", - "tab": "Efficiency", - "score": 4.606354508914199 - }, - "High School World History - Observed inference time (s)": { - "description": "min=4.336, mean=4.336, max=4.336, sum=8.672 (2)", - "tab": "Efficiency", - "score": 4.335798429537423 - }, - "High School Biology - # eval": { - "description": "min=310, mean=310, max=310, sum=620 (2)", - "tab": "General information", - "score": 310.0 - }, - "High School Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Biology - # prompt tokens": { - "description": "min=534.577, mean=534.577, max=534.577, sum=1069.155 (2)", - "tab": "General information", - "score": 534.5774193548388 - }, - "High School Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Chemistry - # eval": { - "description": "min=203, mean=203, max=203, sum=406 (2)", - "tab": "General information", - "score": 203.0 - }, - "High School Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # prompt tokens": { - "description": "min=497.921, mean=497.921, max=497.921, sum=995.842 (2)", - "tab": "General information", - "score": 497.92118226600985 - }, - "High School Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "High School Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # prompt tokens": { - "description": "min=882.4, mean=882.4, max=882.4, sum=1764.8 (2)", - "tab": "General information", - "score": 882.4 - }, - "High School Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 
1.0 - }, - "High School European History - # eval": { - "description": "min=165, mean=165, max=165, sum=330 (2)", - "tab": "General information", - "score": 165.0 - }, - "High School European History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School European History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School European History - # prompt tokens": { - "description": "min=2887.576, mean=2887.576, max=2887.576, sum=5775.152 (2)", - "tab": "General information", - "score": 2887.5757575757575 - }, - "High School European History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Geography - # eval": { - "description": "min=198, mean=198, max=198, sum=396 (2)", - "tab": "General information", - "score": 198.0 - }, - "High School Geography - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Geography - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Geography - # prompt tokens": { - "description": "min=412.268, mean=412.268, max=412.268, sum=824.535 (2)", - "tab": "General information", - "score": 412.2676767676768 - }, - "High School Geography - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Government And Politics - # eval": { - "description": "min=193, mean=193, max=193, sum=386 (2)", - "tab": "General information", - "score": 193.0 - }, - "High School Government And Politics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Government And Politics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # prompt tokens": { - "description": "min=491.104, mean=491.104, max=491.104, sum=982.207 (2)", - "tab": "General information", - "score": 491.10362694300517 - }, - "High School Government And Politics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Macroeconomics - # eval": { - "description": "min=390, mean=390, max=390, sum=780 (2)", - "tab": "General information", - "score": 390.0 - }, - "High School Macroeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Macroeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - # prompt tokens": { - "description": "min=406.036, mean=406.036, max=406.036, sum=812.072 (2)", - "tab": "General information", - "score": 406.0358974358974 - }, - "High School Macroeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Mathematics - # eval": { - "description": "min=270, mean=270, max=270, sum=540 (2)", - "tab": "General information", - "score": 270.0 - }, - "High School Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - 
"score": 5.0 - }, - "High School Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # prompt tokens": { - "description": "min=519.881, mean=519.881, max=519.881, sum=1039.763 (2)", - "tab": "General information", - "score": 519.8814814814815 - }, - "High School Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Microeconomics - # eval": { - "description": "min=238, mean=238, max=238, sum=476 (2)", - "tab": "General information", - "score": 238.0 - }, - "High School Microeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Microeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Microeconomics - # prompt tokens": { - "description": "min=420.513, mean=420.513, max=420.513, sum=841.025 (2)", - "tab": "General information", - "score": 420.5126050420168 - }, - "High School Microeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Physics - # eval": { - "description": "min=151, mean=151, max=151, sum=302 (2)", - "tab": "General information", - "score": 151.0 - }, - "High School Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # prompt tokens": { - "description": "min=558.841, mean=558.841, max=558.841, sum=1117.682 (2)", - "tab": "General information", - "score": 558.841059602649 - }, - "High School Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Psychology - # eval": { - "description": "min=545, mean=545, max=545, sum=1090 (2)", - "tab": "General information", - "score": 545.0 - }, - "High School Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # prompt tokens": { - "description": "min=521.42, mean=521.42, max=521.42, sum=1042.84 (2)", - "tab": "General information", - "score": 521.4201834862386 - }, - "High School Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Statistics - # eval": { - "description": "min=216, mean=216, max=216, sum=432 (2)", - "tab": "General information", - "score": 216.0 - }, - "High School Statistics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Statistics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Statistics - # prompt tokens": { - "description": "min=806.963, mean=806.963, max=806.963, sum=1613.926 (2)", - "tab": "General information", - "score": 806.9629629629629 - }, - "High School Statistics - # output tokens": { 
- "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School US History - # eval": { - "description": "min=204, mean=204, max=204, sum=408 (2)", - "tab": "General information", - "score": 204.0 - }, - "High School US History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School US History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # prompt tokens": { - "description": "min=2288.49, mean=2288.49, max=2288.49, sum=4576.98 (2)", - "tab": "General information", - "score": 2288.4901960784314 - }, - "High School US History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School World History - # eval": { - "description": "min=237, mean=237, max=237, sum=474 (2)", - "tab": "General information", - "score": 237.0 - }, - "High School World History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School World History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School World History - # prompt tokens": { - "description": "min=1475.932, mean=1475.932, max=1475.932, sum=2951.865 (2)", - "tab": "General information", - "score": 1475.9324894514768 - }, - "High School World History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" - } - } - }, - { - "evaluation_name": "Human Sexuality", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Human Sexuality", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.908, - "details": { - "description": "min=0.908, mean=0.908, max=0.908, sum=1.817 (2)", - "tab": "Accuracy", - "Human Aging - Observed inference time (s)": { - "description": "min=3.859, mean=3.859, max=3.859, sum=7.719 (2)", - "tab": "Efficiency", - "score": 3.8594313245183147 - }, - "Human Sexuality - Observed inference time (s)": { - "description": "min=3.96, mean=3.96, max=3.96, sum=7.92 (2)", - "tab": "Efficiency", - "score": 3.9598546119136664 - }, - "Human Aging - # eval": { - "description": "min=223, mean=223, max=223, sum=446 (2)", - "tab": "General information", - "score": 223.0 - }, - "Human Aging - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Aging - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Aging - # prompt tokens": { - "description": "min=335.955, mean=335.955, max=335.955, sum=671.91 (2)", - "tab": "General information", - "score": 335.95515695067263 - }, - "Human Aging - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - 
"score": 1.0 - }, - "Human Sexuality - # eval": { - "description": "min=131, mean=131, max=131, sum=262 (2)", - "tab": "General information", - "score": 131.0 - }, - "Human Sexuality - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Sexuality - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # prompt tokens": { - "description": "min=371.496, mean=371.496, max=371.496, sum=742.992 (2)", - "tab": "General information", - "score": 371.4961832061069 - }, - "Human Sexuality - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" - } - } - }, - { - "evaluation_name": "International Law", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on International Law", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.901, - "details": { - "description": "min=0.901, mean=0.901, max=0.901, sum=1.802 (2)", - "tab": "Accuracy", - "International Law - Observed inference time (s)": { - "description": "min=3.884, mean=3.884, max=3.884, sum=7.767 (2)", - "tab": "Efficiency", - "score": 3.8836900754408403 - }, - "International Law - # eval": { - "description": "min=121, mean=121, max=121, sum=242 (2)", - "tab": "General information", - "score": 121.0 - }, - "International Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "International Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "International Law - # prompt tokens": { - "description": "min=664.165, mean=664.165, max=664.165, sum=1328.331 (2)", - "tab": "General information", - "score": 664.1652892561983 - }, - "International Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" - } - } - }, - { - "evaluation_name": "Logical Fallacies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Logical Fallacies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.896, - "details": { - "description": "min=0.896, mean=0.896, max=0.896, sum=1.791 (2)", - "tab": "Accuracy", - "Logical Fallacies - Observed inference time (s)": { - "description": "min=3.913, mean=3.913, max=3.913, sum=7.826 (2)", - "tab": "Efficiency", - "score": 3.9131746394502605 - }, - "Logical Fallacies - # eval": { - "description": "min=163, mean=163, max=163, sum=326 (2)", - 
"tab": "General information", - "score": 163.0 - }, - "Logical Fallacies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Logical Fallacies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Logical Fallacies - # prompt tokens": { - "description": "min=470.276, mean=470.276, max=470.276, sum=940.552 (2)", - "tab": "General information", - "score": 470.2760736196319 - }, - "Logical Fallacies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" - } - } - }, - { - "evaluation_name": "Machine Learning", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Machine Learning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.741, - "details": { - "description": "min=0.741, mean=0.741, max=0.741, sum=1.482 (2)", - "tab": "Accuracy", - "Machine Learning - Observed inference time (s)": { - "description": "min=4.19, mean=4.19, max=4.19, sum=8.379 (2)", - "tab": "Efficiency", - "score": 4.189559940780912 - }, - "Machine Learning - # eval": { - "description": "min=112, mean=112, max=112, sum=224 (2)", - "tab": "General information", - "score": 112.0 - }, - "Machine Learning - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Machine Learning - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Machine Learning - # prompt tokens": { - "description": "min=676.518, mean=676.518, max=676.518, sum=1353.036 (2)", - "tab": "General information", - "score": 676.5178571428571 - }, - "Machine Learning - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" - } - } - }, - { - "evaluation_name": "Management", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Management", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.942, - "details": { - "description": "min=0.942, mean=0.942, max=0.942, sum=1.883 (2)", - "tab": "Accuracy", - "Management - Observed inference time (s)": { - "description": "min=4.01, mean=4.01, max=4.01, sum=8.02 (2)", - "tab": "Efficiency", - "score": 4.009768469819745 - }, - "Management - # eval": { - "description": "min=103, mean=103, max=103, sum=206 (2)", - "tab": "General information", - "score": 103.0 - }, - "Management - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General 
information", - "score": 5.0 - }, - "Management - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Management - # prompt tokens": { - "description": "min=301.282, mean=301.282, max=301.282, sum=602.563 (2)", - "tab": "General information", - "score": 301.28155339805824 - }, - "Management - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" - } - } - }, - { - "evaluation_name": "Marketing", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Marketing", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.944, - "details": { - "description": "min=0.944, mean=0.944, max=0.944, sum=1.889 (2)", - "tab": "Accuracy", - "Marketing - Observed inference time (s)": { - "description": "min=3.988, mean=3.988, max=3.988, sum=7.975 (2)", - "tab": "Efficiency", - "score": 3.9875136002516136 - }, - "Marketing - # eval": { - "description": "min=234, mean=234, max=234, sum=468 (2)", - "tab": "General information", - "score": 234.0 - }, - "Marketing - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Marketing - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Marketing - # prompt tokens": { - "description": "min=448.064, mean=448.064, max=448.064, sum=896.128 (2)", - "tab": "General information", - "score": 448.06410256410254 - }, - "Marketing - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" - } - } - }, - { - "evaluation_name": "Medical Genetics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Medical Genetics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.93, - "details": { - "description": "min=0.93, mean=0.93, max=0.93, sum=1.86 (2)", - "tab": "Accuracy", - "Medical Genetics - Observed inference time (s)": { - "description": "min=3.913, mean=3.913, max=3.913, sum=7.827 (2)", - "tab": "Efficiency", - "score": 3.913457498550415 - }, - "Medical Genetics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Medical Genetics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Medical Genetics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Medical Genetics - # prompt tokens": { - 
"description": "min=354.88, mean=354.88, max=354.88, sum=709.76 (2)", - "tab": "General information", - "score": 354.88 - }, - "Medical Genetics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" - } - } - }, - { - "evaluation_name": "Miscellaneous", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Miscellaneous", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.951, - "details": { - "description": "min=0.951, mean=0.951, max=0.951, sum=1.903 (2)", - "tab": "Accuracy", - "Miscellaneous - Observed inference time (s)": { - "description": "min=3.945, mean=3.945, max=3.945, sum=7.889 (2)", - "tab": "Efficiency", - "score": 3.9445087267216747 - }, - "Miscellaneous - # eval": { - "description": "min=783, mean=783, max=783, sum=1566 (2)", - "tab": "General information", - "score": 783.0 - }, - "Miscellaneous - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Miscellaneous - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Miscellaneous - # prompt tokens": { - "description": "min=328.628, mean=328.628, max=328.628, sum=657.257 (2)", - "tab": "General information", - "score": 328.62835249042143 - }, - "Miscellaneous - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" - } - } - }, - { - "evaluation_name": "Moral Scenarios", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Moral Scenarios", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.826, - "details": { - "description": "min=0.826, mean=0.826, max=0.826, sum=1.651 (2)", - "tab": "Accuracy", - "Moral Disputes - Observed inference time (s)": { - "description": "min=4.057, mean=4.057, max=4.057, sum=8.113 (2)", - "tab": "Efficiency", - "score": 4.0566764987273025 - }, - "Moral Scenarios - Observed inference time (s)": { - "description": "min=4.082, mean=4.082, max=4.082, sum=8.165 (2)", - "tab": "Efficiency", - "score": 4.082338048892314 - }, - "Moral Disputes - # eval": { - "description": "min=346, mean=346, max=346, sum=692 (2)", - "tab": "General information", - "score": 346.0 - }, - "Moral Disputes - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Disputes - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Disputes - # prompt tokens": { - 
"description": "min=511.789, mean=511.789, max=511.789, sum=1023.578 (2)", - "tab": "General information", - "score": 511.78901734104045 - }, - "Moral Disputes - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Moral Scenarios - # eval": { - "description": "min=895, mean=895, max=895, sum=1790 (2)", - "tab": "General information", - "score": 895.0 - }, - "Moral Scenarios - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Scenarios - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # prompt tokens": { - "description": "min=676.949, mean=676.949, max=676.949, sum=1353.897 (2)", - "tab": "General information", - "score": 676.9486033519553 - }, - "Moral Scenarios - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" - } - } - }, - { - "evaluation_name": "Nutrition", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Nutrition", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.925, - "details": { - "description": "min=0.925, mean=0.925, max=0.925, sum=1.85 (2)", - "tab": "Accuracy", - "Nutrition - Observed inference time (s)": { - "description": "min=4.106, mean=4.106, max=4.106, sum=8.213 (2)", - "tab": "Efficiency", - "score": 4.106359853464014 - }, - "Nutrition - # eval": { - "description": "min=306, mean=306, max=306, sum=612 (2)", - "tab": "General information", - "score": 306.0 - }, - "Nutrition - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Nutrition - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Nutrition - # prompt tokens": { - "description": "min=617.065, mean=617.065, max=617.065, sum=1234.131 (2)", - "tab": "General information", - "score": 617.0653594771242 - }, - "Nutrition - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" - } - } - }, - { - "evaluation_name": "Prehistory", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Prehistory", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.941, - "details": { - "description": "min=0.941, mean=0.941, max=0.941, sum=1.883 (2)", - "tab": "Accuracy", - "Prehistory - Observed inference time (s)": { - "description": "min=3.998, mean=3.998, max=3.998, 
sum=7.996 (2)", - "tab": "Efficiency", - "score": 3.998204750779234 - }, - "Prehistory - # eval": { - "description": "min=324, mean=324, max=324, sum=648 (2)", - "tab": "General information", - "score": 324.0 - }, - "Prehistory - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Prehistory - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Prehistory - # prompt tokens": { - "description": "min=545.639, mean=545.639, max=545.639, sum=1091.278 (2)", - "tab": "General information", - "score": 545.6388888888889 - }, - "Prehistory - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" - } - } - }, - { - "evaluation_name": "Public Relations", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Public Relations", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.827, - "details": { - "description": "min=0.827, mean=0.827, max=0.827, sum=1.655 (2)", - "tab": "Accuracy", - "Public Relations - Observed inference time (s)": { - "description": "min=3.843, mean=3.843, max=3.843, sum=7.685 (2)", - "tab": "Efficiency", - "score": 3.8426286415620283 - }, - "Public Relations - # eval": { - "description": "min=110, mean=110, max=110, sum=220 (2)", - "tab": "General information", - "score": 110.0 - }, - "Public Relations - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Public Relations - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Public Relations - # prompt tokens": { - "description": "min=432.991, mean=432.991, max=432.991, sum=865.982 (2)", - "tab": "General information", - "score": 432.9909090909091 - }, - "Public Relations - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" - } - } - }, - { - "evaluation_name": "Security Studies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Security Studies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.886, - "details": { - "description": "min=0.886, mean=0.886, max=0.886, sum=1.771 (2)", - "tab": "Accuracy", - "Security Studies - Observed inference time (s)": { - "description": "min=4.346, mean=4.346, max=4.346, sum=8.692 (2)", - "tab": "Efficiency", - "score": 4.3459005385029075 - }, - "Security Studies - # eval": { - "description": "min=245, mean=245, max=245, sum=490 (2)", - 
"tab": "General information", - "score": 245.0 - }, - "Security Studies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Security Studies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Security Studies - # prompt tokens": { - "description": "min=1243.804, mean=1243.804, max=1243.804, sum=2487.608 (2)", - "tab": "General information", - "score": 1243.8040816326532 - }, - "Security Studies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" - } - } - }, - { - "evaluation_name": "Sociology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Sociology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.94, - "details": { - "description": "min=0.94, mean=0.94, max=0.94, sum=1.881 (2)", - "tab": "Accuracy", - "Sociology - Observed inference time (s)": { - "description": "min=3.946, mean=3.946, max=3.946, sum=7.893 (2)", - "tab": "Efficiency", - "score": 3.94632918561869 - }, - "Sociology - # eval": { - "description": "min=201, mean=201, max=201, sum=402 (2)", - "tab": "General information", - "score": 201.0 - }, - "Sociology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Sociology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Sociology - # prompt tokens": { - "description": "min=467.274, mean=467.274, max=467.274, sum=934.547 (2)", - "tab": "General information", - "score": 467.27363184079604 - }, - "Sociology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" - } - } - }, - { - "evaluation_name": "Virology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Virology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.578, - "details": { - "description": "min=0.578, mean=0.578, max=0.578, sum=1.157 (2)", - "tab": "Accuracy", - "Virology - Observed inference time (s)": { - "description": "min=3.932, mean=3.932, max=3.932, sum=7.865 (2)", - "tab": "Efficiency", - "score": 3.9324641141546777 - }, - "Virology - # eval": { - "description": "min=166, mean=166, max=166, sum=332 (2)", - "tab": "General information", - "score": 166.0 - }, - "Virology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Virology - truncated": { - "description": 
"min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Virology - # prompt tokens": { - "description": "min=362.651, mean=362.651, max=362.651, sum=725.301 (2)", - "tab": "General information", - "score": 362.65060240963857 - }, - "Virology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" - } - } - }, - { - "evaluation_name": "World Religions", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on World Religions", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.901, - "details": { - "description": "min=0.901, mean=0.901, max=0.901, sum=1.801 (2)", - "tab": "Accuracy", - "World Religions - Observed inference time (s)": { - "description": "min=4.011, mean=4.011, max=4.011, sum=8.023 (2)", - "tab": "Efficiency", - "score": 4.011422206086722 - }, - "World Religions - # eval": { - "description": "min=171, mean=171, max=171, sum=342 (2)", - "tab": "General information", - "score": 171.0 - }, - "World Religions - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "World Religions - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "World Religions - # prompt tokens": { - "description": "min=293.018, mean=293.018, max=293.018, sum=586.035 (2)", - "tab": "General information", - "score": 293.0175438596491 - }, - "World Religions - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" - } - } - }, - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.014, - "details": { - "tab": "Efficiency" - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_mmlu/anthropic/claude-3-sonnet-20240229/3cd855af-9679-4fd0-bc3f-34db697c7855.json b/data/helm_mmlu/anthropic/claude-3-sonnet-20240229/3cd855af-9679-4fd0-bc3f-34db697c7855.json deleted file mode 100644 index dd7543ecb..000000000 --- a/data/helm_mmlu/anthropic/claude-3-sonnet-20240229/3cd855af-9679-4fd0-bc3f-34db697c7855.json +++ /dev/null @@ -1,3021 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/anthropic_claude-3-sonnet-20240229/1770835937.459157", - "retrieved_timestamp": "1770835937.459157", - "source_metadata": { - "source_name": "helm_mmlu", - 
"source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Claude 3 Sonnet 20240229", - "id": "anthropic/claude-3-sonnet-20240229", - "developer": "anthropic", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "MMLU All Subjects", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU All Subjects", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.759, - "details": { - "description": "min=0.39, mean=0.759, max=0.959, sum=86.545 (114)", - "tab": "Accuracy", - "MMLU All Subjects - Observed inference time (s)": { - "description": "min=1.21, mean=1.468, max=8.072, sum=167.341 (114)", - "tab": "Efficiency", - "score": 1.4679056233464987 - }, - "MMLU All Subjects - # eval": { - "description": "min=100, mean=246.351, max=1534, sum=28084 (114)", - "tab": "General information", - "score": 246.35087719298247 - }, - "MMLU All Subjects - # train": { - "description": "min=5, mean=5, max=5, sum=570 (114)", - "tab": "General information", - "score": 5.0 - }, - "MMLU All Subjects - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (114)", - "tab": "General information", - "score": 0.0 - }, - "MMLU All Subjects - # prompt tokens": { - "description": "min=293.018, mean=638.288, max=2887.576, sum=72764.875 (114)", - "tab": "General information", - "score": 638.2883793758953 - }, - "MMLU All Subjects - # output tokens": { - "description": "min=1, mean=1, max=1, sum=114 (114)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - 
"mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] - } - } - }, - { - "evaluation_name": "Abstract Algebra", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Abstract Algebra", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.39, - "details": { - "description": "min=0.39, mean=0.39, max=0.39, sum=0.78 (2)", - "tab": "Accuracy", - "Abstract Algebra - Observed inference time (s)": { - "description": "min=1.248, mean=1.248, max=1.248, sum=2.495 (2)", - "tab": "Efficiency", - "score": 1.2476251411437989 - }, - "Abstract Algebra - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Abstract Algebra - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Abstract Algebra - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Abstract Algebra - # prompt tokens": { - "description": "min=370.26, mean=370.26, max=370.26, sum=740.52 (2)", - "tab": "General information", - "score": 370.26 - }, - "Abstract Algebra - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" - } - } - }, - { - "evaluation_name": "Anatomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Anatomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - 
}, - "score_details": { - "score": 0.711, - "details": { - "description": "min=0.711, mean=0.711, max=0.711, sum=1.422 (2)", - "tab": "Accuracy", - "Anatomy - Observed inference time (s)": { - "description": "min=1.225, mean=1.225, max=1.225, sum=2.45 (2)", - "tab": "Efficiency", - "score": 1.224808097768713 - }, - "Anatomy - # eval": { - "description": "min=135, mean=135, max=135, sum=270 (2)", - "tab": "General information", - "score": 135.0 - }, - "Anatomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Anatomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Anatomy - # prompt tokens": { - "description": "min=370.8, mean=370.8, max=370.8, sum=741.6 (2)", - "tab": "General information", - "score": 370.8 - }, - "Anatomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" - } - } - }, - { - "evaluation_name": "College Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on College Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.559, - "details": { - "description": "min=0.559, mean=0.559, max=0.559, sum=1.118 (2)", - "tab": "Accuracy", - "College Chemistry - Observed inference time (s)": { - "description": "min=1.33, mean=1.33, max=1.33, sum=2.659 (2)", - "tab": "Efficiency", - "score": 1.3297029423713684 - }, - "College Biology - Observed inference time (s)": { - "description": "min=1.293, mean=1.293, max=1.293, sum=2.585 (2)", - "tab": "Efficiency", - "score": 1.2926498336924448 - }, - "College Computer Science - Observed inference time (s)": { - "description": "min=1.494, mean=1.494, max=1.494, sum=2.988 (2)", - "tab": "Efficiency", - "score": 1.493921182155609 - }, - "College Mathematics - Observed inference time (s)": { - "description": "min=1.346, mean=1.346, max=1.346, sum=2.693 (2)", - "tab": "Efficiency", - "score": 1.346416823863983 - }, - "College Medicine - Observed inference time (s)": { - "description": "min=1.316, mean=1.316, max=1.316, sum=2.632 (2)", - "tab": "Efficiency", - "score": 1.315991141203511 - }, - "College Physics - Observed inference time (s)": { - "description": "min=1.286, mean=1.286, max=1.286, sum=2.573 (2)", - "tab": "Efficiency", - "score": 1.2863672691233017 - }, - "College Chemistry - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Chemistry - # prompt tokens": { - "description": "min=550.01, mean=550.01, max=550.01, sum=1100.02 (2)", - "tab": "General information", - "score": 550.01 - }, - "College Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": 
"General information", - "score": 1.0 - }, - "College Biology - # eval": { - "description": "min=144, mean=144, max=144, sum=288 (2)", - "tab": "General information", - "score": 144.0 - }, - "College Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # prompt tokens": { - "description": "min=490.347, mean=490.347, max=490.347, sum=980.694 (2)", - "tab": "General information", - "score": 490.34722222222223 - }, - "College Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer Science - # prompt tokens": { - "description": "min=838.24, mean=838.24, max=838.24, sum=1676.48 (2)", - "tab": "General information", - "score": 838.24 - }, - "College Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Mathematics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # prompt tokens": { - "description": "min=604.19, mean=604.19, max=604.19, sum=1208.38 (2)", - "tab": "General information", - "score": 604.19 - }, - "College Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Medicine - # eval": { - "description": "min=173, mean=173, max=173, sum=346 (2)", - "tab": "General information", - "score": 173.0 - }, - "College Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Medicine - # prompt tokens": { - "description": "min=540.63, mean=540.63, max=540.63, sum=1081.26 (2)", - "tab": "General information", - "score": 540.6300578034682 - }, - "College Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Physics - # eval": { - "description": "min=102, mean=102, max=102, sum=204 (2)", - "tab": "General information", - "score": 102.0 - }, - "College Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Physics - # prompt tokens": { - 
"description": "min=489.48, mean=489.48, max=489.48, sum=978.961 (2)", - "tab": "General information", - "score": 489.48039215686276 - }, - "College Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" - } - } - }, - { - "evaluation_name": "Computer Security", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Computer Security", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.79, - "details": { - "description": "min=0.79, mean=0.79, max=0.79, sum=1.58 (2)", - "tab": "Accuracy", - "Computer Security - Observed inference time (s)": { - "description": "min=1.228, mean=1.228, max=1.228, sum=2.456 (2)", - "tab": "Efficiency", - "score": 1.2280330896377563 - }, - "Computer Security - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Computer Security - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Computer Security - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Computer Security - # prompt tokens": { - "description": "min=398.62, mean=398.62, max=398.62, sum=797.24 (2)", - "tab": "General information", - "score": 398.62 - }, - "Computer Security - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" - } - } - }, - { - "evaluation_name": "Econometrics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Econometrics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.64, - "details": { - "description": "min=0.64, mean=0.64, max=0.64, sum=1.281 (2)", - "tab": "Accuracy", - "Econometrics - Observed inference time (s)": { - "description": "min=1.341, mean=1.341, max=1.341, sum=2.682 (2)", - "tab": "Efficiency", - "score": 1.3410238989612513 - }, - "Econometrics - # eval": { - "description": "min=114, mean=114, max=114, sum=228 (2)", - "tab": "General information", - "score": 114.0 - }, - "Econometrics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Econometrics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Econometrics - # prompt tokens": { - "description": "min=619.596, mean=619.596, max=619.596, sum=1239.193 (2)", - "tab": "General information", - "score": 619.5964912280701 - }, - "Econometrics - # output 
tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" - } - } - }, - { - "evaluation_name": "Global Facts", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Global Facts", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.53, - "details": { - "description": "min=0.53, mean=0.53, max=0.53, sum=1.06 (2)", - "tab": "Accuracy", - "Global Facts - Observed inference time (s)": { - "description": "min=1.253, mean=1.253, max=1.253, sum=2.505 (2)", - "tab": "Efficiency", - "score": 1.2527140331268312 - }, - "Global Facts - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Global Facts - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Global Facts - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Global Facts - # prompt tokens": { - "description": "min=411.61, mean=411.61, max=411.61, sum=823.22 (2)", - "tab": "General information", - "score": 411.61 - }, - "Global Facts - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" - } - } - }, - { - "evaluation_name": "Jurisprudence", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Jurisprudence", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.861, - "details": { - "description": "min=0.861, mean=0.861, max=0.861, sum=1.722 (2)", - "tab": "Accuracy", - "Jurisprudence - Observed inference time (s)": { - "description": "min=1.248, mean=1.248, max=1.248, sum=2.496 (2)", - "tab": "Efficiency", - "score": 1.2482430162253204 - }, - "Jurisprudence - # eval": { - "description": "min=108, mean=108, max=108, sum=216 (2)", - "tab": "General information", - "score": 108.0 - }, - "Jurisprudence - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Jurisprudence - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Jurisprudence - # prompt tokens": { - "description": "min=431.426, mean=431.426, max=431.426, sum=862.852 (2)", - "tab": "General information", - "score": 431.4259259259259 - }, - "Jurisprudence - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "jurisprudence", - "method": 
"multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" - } - } - }, - { - "evaluation_name": "Philosophy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Philosophy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.852, - "details": { - "description": "min=0.852, mean=0.852, max=0.852, sum=1.704 (2)", - "tab": "Accuracy", - "Philosophy - Observed inference time (s)": { - "description": "min=1.221, mean=1.221, max=1.221, sum=2.442 (2)", - "tab": "Efficiency", - "score": 1.22093992217944 - }, - "Philosophy - # eval": { - "description": "min=311, mean=311, max=311, sum=622 (2)", - "tab": "General information", - "score": 311.0 - }, - "Philosophy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Philosophy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Philosophy - # prompt tokens": { - "description": "min=359.965, mean=359.965, max=359.965, sum=719.929 (2)", - "tab": "General information", - "score": 359.9646302250804 - }, - "Philosophy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" - } - } - }, - { - "evaluation_name": "Professional Psychology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Professional Psychology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.814, - "details": { - "description": "min=0.814, mean=0.814, max=0.814, sum=1.627 (2)", - "tab": "Accuracy", - "Professional Medicine - Observed inference time (s)": { - "description": "min=1.608, mean=1.608, max=1.608, sum=3.216 (2)", - "tab": "Efficiency", - "score": 1.6081139156047035 - }, - "Professional Accounting - Observed inference time (s)": { - "description": "min=1.391, mean=1.391, max=1.391, sum=2.781 (2)", - "tab": "Efficiency", - "score": 1.3905252252064697 - }, - "Professional Law - Observed inference time (s)": { - "description": "min=1.87, mean=1.87, max=1.87, sum=3.741 (2)", - "tab": "Efficiency", - "score": 1.8703640130539139 - }, - "Professional Psychology - Observed inference time (s)": { - "description": "min=1.297, mean=1.297, max=1.297, sum=2.593 (2)", - "tab": "Efficiency", - "score": 1.2967337436146207 - }, - "Professional Medicine - # eval": { - "description": "min=272, mean=272, max=272, sum=544 (2)", - "tab": "General information", - "score": 272.0 - }, - "Professional Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Medicine - # prompt 
tokens": { - "description": "min=1123.537, mean=1123.537, max=1123.537, sum=2247.074 (2)", - "tab": "General information", - "score": 1123.5367647058824 - }, - "Professional Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Accounting - # eval": { - "description": "min=282, mean=282, max=282, sum=564 (2)", - "tab": "General information", - "score": 282.0 - }, - "Professional Accounting - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Accounting - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Accounting - # prompt tokens": { - "description": "min=665.422, mean=665.422, max=665.422, sum=1330.844 (2)", - "tab": "General information", - "score": 665.4219858156029 - }, - "Professional Accounting - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Law - # eval": { - "description": "min=1534, mean=1534, max=1534, sum=3068 (2)", - "tab": "General information", - "score": 1534.0 - }, - "Professional Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # prompt tokens": { - "description": "min=1701.16, mean=1701.16, max=1701.16, sum=3402.321 (2)", - "tab": "General information", - "score": 1701.16036505867 - }, - "Professional Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Psychology - # eval": { - "description": "min=612, mean=612, max=612, sum=1224 (2)", - "tab": "General information", - "score": 612.0 - }, - "Professional Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # prompt tokens": { - "description": "min=603.168, mean=603.168, max=603.168, sum=1206.337 (2)", - "tab": "General information", - "score": 603.1683006535948 - }, - "Professional Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" - } - } - }, - { - "evaluation_name": "Us Foreign Policy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Us Foreign Policy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.94, - "details": { - "description": "min=0.94, mean=0.94, max=0.94, sum=1.88 (2)", - "tab": "Accuracy", - "Us Foreign Policy - Observed inference time (s)": { - "description": "min=1.245, mean=1.245, max=1.245, 
sum=2.489 (2)", - "tab": "Efficiency", - "score": 1.2445136380195618 - }, - "Us Foreign Policy - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Us Foreign Policy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Us Foreign Policy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Us Foreign Policy - # prompt tokens": { - "description": "min=455.25, mean=455.25, max=455.25, sum=910.5 (2)", - "tab": "General information", - "score": 455.25 - }, - "Us Foreign Policy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" - } - } - }, - { - "evaluation_name": "Astronomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Astronomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.855, - "details": { - "description": "min=0.855, mean=0.855, max=0.855, sum=1.711 (2)", - "tab": "Accuracy", - "Astronomy - Observed inference time (s)": { - "description": "min=1.303, mean=1.303, max=1.303, sum=2.607 (2)", - "tab": "Efficiency", - "score": 1.3033642768859863 - }, - "Astronomy - # eval": { - "description": "min=152, mean=152, max=152, sum=304 (2)", - "tab": "General information", - "score": 152.0 - }, - "Astronomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Astronomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Astronomy - # prompt tokens": { - "description": "min=604.493, mean=604.493, max=604.493, sum=1208.987 (2)", - "tab": "General information", - "score": 604.4934210526316 - }, - "Astronomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" - } - } - }, - { - "evaluation_name": "Business Ethics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Business Ethics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.82, - "details": { - "description": "min=0.82, mean=0.82, max=0.82, sum=1.64 (2)", - "tab": "Accuracy", - "Business Ethics - Observed inference time (s)": { - "description": "min=1.304, mean=1.304, max=1.304, sum=2.607 (2)", - "tab": "Efficiency", - "score": 1.3036250400543212 - }, - "Business Ethics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 
100.0 - }, - "Business Ethics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Business Ethics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Business Ethics - # prompt tokens": { - "description": "min=600.02, mean=600.02, max=600.02, sum=1200.04 (2)", - "tab": "General information", - "score": 600.02 - }, - "Business Ethics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" - } - } - }, - { - "evaluation_name": "Clinical Knowledge", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Clinical Knowledge", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.804, - "details": { - "description": "min=0.804, mean=0.804, max=0.804, sum=1.608 (2)", - "tab": "Accuracy", - "Clinical Knowledge - Observed inference time (s)": { - "description": "min=1.24, mean=1.24, max=1.24, sum=2.48 (2)", - "tab": "Efficiency", - "score": 1.2399591086045751 - }, - "Clinical Knowledge - # eval": { - "description": "min=265, mean=265, max=265, sum=530 (2)", - "tab": "General information", - "score": 265.0 - }, - "Clinical Knowledge - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Clinical Knowledge - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Clinical Knowledge - # prompt tokens": { - "description": "min=429.457, mean=429.457, max=429.457, sum=858.913 (2)", - "tab": "General information", - "score": 429.4566037735849 - }, - "Clinical Knowledge - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" - } - } - }, - { - "evaluation_name": "Conceptual Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Conceptual Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.774, - "details": { - "description": "min=0.774, mean=0.774, max=0.774, sum=1.549 (2)", - "tab": "Accuracy", - "Conceptual Physics - Observed inference time (s)": { - "description": "min=1.256, mean=1.256, max=1.256, sum=2.513 (2)", - "tab": "Efficiency", - "score": 1.2563625832821461 - }, - "Conceptual Physics - # eval": { - "description": "min=235, mean=235, max=235, sum=470 (2)", - "tab": "General information", - "score": 235.0 - }, - "Conceptual Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General 
information", - "score": 5.0 - }, - "Conceptual Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Conceptual Physics - # prompt tokens": { - "description": "min=323.536, mean=323.536, max=323.536, sum=647.072 (2)", - "tab": "General information", - "score": 323.53617021276597 - }, - "Conceptual Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" - } - } - }, - { - "evaluation_name": "Electrical Engineering", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Electrical Engineering", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.703, - "details": { - "description": "min=0.703, mean=0.703, max=0.703, sum=1.407 (2)", - "tab": "Accuracy", - "Electrical Engineering - Observed inference time (s)": { - "description": "min=1.276, mean=1.276, max=1.276, sum=2.553 (2)", - "tab": "Efficiency", - "score": 1.276360561107767 - }, - "Electrical Engineering - # eval": { - "description": "min=145, mean=145, max=145, sum=290 (2)", - "tab": "General information", - "score": 145.0 - }, - "Electrical Engineering - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Electrical Engineering - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Electrical Engineering - # prompt tokens": { - "description": "min=437.041, mean=437.041, max=437.041, sum=874.083 (2)", - "tab": "General information", - "score": 437.04137931034484 - }, - "Electrical Engineering - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" - } - } - }, - { - "evaluation_name": "Elementary Mathematics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Elementary Mathematics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.635, - "details": { - "description": "min=0.635, mean=0.635, max=0.635, sum=1.27 (2)", - "tab": "Accuracy", - "Elementary Mathematics - Observed inference time (s)": { - "description": "min=1.301, mean=1.301, max=1.301, sum=2.602 (2)", - "tab": "Efficiency", - "score": 1.3010439260926827 - }, - "Elementary Mathematics - # eval": { - "description": "min=378, mean=378, max=378, sum=756 (2)", - "tab": "General information", - "score": 378.0 - }, - "Elementary Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 
5.0 - }, - "Elementary Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Elementary Mathematics - # prompt tokens": { - "description": "min=511.066, mean=511.066, max=511.066, sum=1022.132 (2)", - "tab": "General information", - "score": 511.06613756613757 - }, - "Elementary Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" - } - } - }, - { - "evaluation_name": "Formal Logic", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Formal Logic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.579, - "details": { - "description": "min=0.579, mean=0.579, max=0.579, sum=1.159 (2)", - "tab": "Accuracy", - "Formal Logic - Observed inference time (s)": { - "description": "min=1.369, mean=1.369, max=1.369, sum=2.738 (2)", - "tab": "Efficiency", - "score": 1.3692201716559274 - }, - "Formal Logic - # eval": { - "description": "min=126, mean=126, max=126, sum=252 (2)", - "tab": "General information", - "score": 126.0 - }, - "Formal Logic - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Formal Logic - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Formal Logic - # prompt tokens": { - "description": "min=646.746, mean=646.746, max=646.746, sum=1293.492 (2)", - "tab": "General information", - "score": 646.7460317460317 - }, - "Formal Logic - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" - } - } - }, - { - "evaluation_name": "High School World History", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on High School World History", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.895, - "details": { - "description": "min=0.895, mean=0.895, max=0.895, sum=1.789 (2)", - "tab": "Accuracy", - "High School Biology - Observed inference time (s)": { - "description": "min=1.292, mean=1.292, max=1.292, sum=2.585 (2)", - "tab": "Efficiency", - "score": 1.2923692734010759 - }, - "High School Chemistry - Observed inference time (s)": { - "description": "min=1.339, mean=1.339, max=1.339, sum=2.678 (2)", - "tab": "Efficiency", - "score": 1.3387701969428603 - }, - "High School Computer Science - Observed inference time (s)": { - "description": "min=1.51, mean=1.51, max=1.51, sum=3.02 (2)", - "tab": "Efficiency", - "score": 1.5097803854942322 - }, - "High School 
European History - Observed inference time (s)": { - "description": "min=2.456, mean=2.456, max=2.456, sum=4.912 (2)", - "tab": "Efficiency", - "score": 2.4561073808959035 - }, - "High School Geography - Observed inference time (s)": { - "description": "min=1.269, mean=1.269, max=1.269, sum=2.537 (2)", - "tab": "Efficiency", - "score": 1.2686388372170805 - }, - "High School Government And Politics - Observed inference time (s)": { - "description": "min=1.287, mean=1.287, max=1.287, sum=2.574 (2)", - "tab": "Efficiency", - "score": 1.2869715455900201 - }, - "High School Macroeconomics - Observed inference time (s)": { - "description": "min=1.266, mean=1.266, max=1.266, sum=2.533 (2)", - "tab": "Efficiency", - "score": 1.2664643880648492 - }, - "High School Mathematics - Observed inference time (s)": { - "description": "min=1.313, mean=1.313, max=1.313, sum=2.626 (2)", - "tab": "Efficiency", - "score": 1.3131960109428122 - }, - "High School Microeconomics - Observed inference time (s)": { - "description": "min=1.261, mean=1.261, max=1.261, sum=2.521 (2)", - "tab": "Efficiency", - "score": 1.260614112645638 - }, - "High School Physics - Observed inference time (s)": { - "description": "min=1.302, mean=1.302, max=1.302, sum=2.603 (2)", - "tab": "Efficiency", - "score": 1.3015588419326882 - }, - "High School Psychology - Observed inference time (s)": { - "description": "min=1.304, mean=1.304, max=1.304, sum=2.607 (2)", - "tab": "Efficiency", - "score": 1.3036036592011058 - }, - "High School Statistics - Observed inference time (s)": { - "description": "min=1.512, mean=1.512, max=1.512, sum=3.025 (2)", - "tab": "Efficiency", - "score": 1.512356918167185 - }, - "High School US History - Observed inference time (s)": { - "description": "min=8.072, mean=8.072, max=8.072, sum=16.145 (2)", - "tab": "Efficiency", - "score": 8.072314507821027 - }, - "High School World History - Observed inference time (s)": { - "description": "min=1.746, mean=1.746, max=1.746, sum=3.491 (2)", - "tab": "Efficiency", - "score": 1.74568142066022 - }, - "High School Biology - # eval": { - "description": "min=310, mean=310, max=310, sum=620 (2)", - "tab": "General information", - "score": 310.0 - }, - "High School Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Biology - # prompt tokens": { - "description": "min=534.577, mean=534.577, max=534.577, sum=1069.155 (2)", - "tab": "General information", - "score": 534.5774193548388 - }, - "High School Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Chemistry - # eval": { - "description": "min=203, mean=203, max=203, sum=406 (2)", - "tab": "General information", - "score": 203.0 - }, - "High School Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # prompt tokens": { - "description": "min=497.921, mean=497.921, max=497.921, sum=995.842 (2)", - "tab": "General information", - "score": 497.92118226600985 - }, - "High School Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 
(2)", - "tab": "General information", - "score": 1.0 - }, - "High School Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "High School Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # prompt tokens": { - "description": "min=882.4, mean=882.4, max=882.4, sum=1764.8 (2)", - "tab": "General information", - "score": 882.4 - }, - "High School Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School European History - # eval": { - "description": "min=165, mean=165, max=165, sum=330 (2)", - "tab": "General information", - "score": 165.0 - }, - "High School European History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School European History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School European History - # prompt tokens": { - "description": "min=2887.576, mean=2887.576, max=2887.576, sum=5775.152 (2)", - "tab": "General information", - "score": 2887.5757575757575 - }, - "High School European History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Geography - # eval": { - "description": "min=198, mean=198, max=198, sum=396 (2)", - "tab": "General information", - "score": 198.0 - }, - "High School Geography - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Geography - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Geography - # prompt tokens": { - "description": "min=412.268, mean=412.268, max=412.268, sum=824.535 (2)", - "tab": "General information", - "score": 412.2676767676768 - }, - "High School Geography - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Government And Politics - # eval": { - "description": "min=193, mean=193, max=193, sum=386 (2)", - "tab": "General information", - "score": 193.0 - }, - "High School Government And Politics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Government And Politics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # prompt tokens": { - "description": "min=491.104, mean=491.104, max=491.104, sum=982.207 (2)", - "tab": "General information", - "score": 491.10362694300517 - }, - "High School Government And Politics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Macroeconomics - # eval": { - "description": "min=390, mean=390, max=390, sum=780 (2)", - "tab": "General information", - "score": 390.0 - }, - "High School Macroeconomics - # train": { - "description": "min=5, mean=5, max=5, 
sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Macroeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - # prompt tokens": { - "description": "min=406.036, mean=406.036, max=406.036, sum=812.072 (2)", - "tab": "General information", - "score": 406.0358974358974 - }, - "High School Macroeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Mathematics - # eval": { - "description": "min=270, mean=270, max=270, sum=540 (2)", - "tab": "General information", - "score": 270.0 - }, - "High School Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # prompt tokens": { - "description": "min=519.881, mean=519.881, max=519.881, sum=1039.763 (2)", - "tab": "General information", - "score": 519.8814814814815 - }, - "High School Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Microeconomics - # eval": { - "description": "min=238, mean=238, max=238, sum=476 (2)", - "tab": "General information", - "score": 238.0 - }, - "High School Microeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Microeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Microeconomics - # prompt tokens": { - "description": "min=420.513, mean=420.513, max=420.513, sum=841.025 (2)", - "tab": "General information", - "score": 420.5126050420168 - }, - "High School Microeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Physics - # eval": { - "description": "min=151, mean=151, max=151, sum=302 (2)", - "tab": "General information", - "score": 151.0 - }, - "High School Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # prompt tokens": { - "description": "min=558.841, mean=558.841, max=558.841, sum=1117.682 (2)", - "tab": "General information", - "score": 558.841059602649 - }, - "High School Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Psychology - # eval": { - "description": "min=545, mean=545, max=545, sum=1090 (2)", - "tab": "General information", - "score": 545.0 - }, - "High School Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # prompt tokens": { - "description": "min=521.42, mean=521.42, max=521.42, sum=1042.84 (2)", - "tab": "General information", - "score": 
521.4201834862386 - }, - "High School Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Statistics - # eval": { - "description": "min=216, mean=216, max=216, sum=432 (2)", - "tab": "General information", - "score": 216.0 - }, - "High School Statistics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Statistics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Statistics - # prompt tokens": { - "description": "min=806.963, mean=806.963, max=806.963, sum=1613.926 (2)", - "tab": "General information", - "score": 806.9629629629629 - }, - "High School Statistics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School US History - # eval": { - "description": "min=204, mean=204, max=204, sum=408 (2)", - "tab": "General information", - "score": 204.0 - }, - "High School US History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School US History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # prompt tokens": { - "description": "min=2288.49, mean=2288.49, max=2288.49, sum=4576.98 (2)", - "tab": "General information", - "score": 2288.4901960784314 - }, - "High School US History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School World History - # eval": { - "description": "min=237, mean=237, max=237, sum=474 (2)", - "tab": "General information", - "score": 237.0 - }, - "High School World History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School World History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School World History - # prompt tokens": { - "description": "min=1475.932, mean=1475.932, max=1475.932, sum=2951.865 (2)", - "tab": "General information", - "score": 1475.9324894514768 - }, - "High School World History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" - } - } - }, - { - "evaluation_name": "Human Sexuality", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Human Sexuality", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.809, - "details": { - "description": "min=0.809, mean=0.809, max=0.809, sum=1.618 (2)", - "tab": "Accuracy", - "Human Aging - Observed inference time (s)": { - "description": "min=1.21, mean=1.21, max=1.21, sum=2.42 (2)", - "tab": "Efficiency", - "score": 1.2099821963117796 - }, - "Human Sexuality - Observed 
inference time (s)": { - "description": "min=1.255, mean=1.255, max=1.255, sum=2.509 (2)", - "tab": "Efficiency", - "score": 1.2545511012768928 - }, - "Human Aging - # eval": { - "description": "min=223, mean=223, max=223, sum=446 (2)", - "tab": "General information", - "score": 223.0 - }, - "Human Aging - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Aging - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Aging - # prompt tokens": { - "description": "min=335.955, mean=335.955, max=335.955, sum=671.91 (2)", - "tab": "General information", - "score": 335.95515695067263 - }, - "Human Aging - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Human Sexuality - # eval": { - "description": "min=131, mean=131, max=131, sum=262 (2)", - "tab": "General information", - "score": 131.0 - }, - "Human Sexuality - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Sexuality - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # prompt tokens": { - "description": "min=371.496, mean=371.496, max=371.496, sum=742.992 (2)", - "tab": "General information", - "score": 371.4961832061069 - }, - "Human Sexuality - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" - } - } - }, - { - "evaluation_name": "International Law", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on International Law", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.909, - "details": { - "description": "min=0.909, mean=0.909, max=0.909, sum=1.818 (2)", - "tab": "Accuracy", - "International Law - Observed inference time (s)": { - "description": "min=1.375, mean=1.375, max=1.375, sum=2.751 (2)", - "tab": "Efficiency", - "score": 1.3753716256007675 - }, - "International Law - # eval": { - "description": "min=121, mean=121, max=121, sum=242 (2)", - "tab": "General information", - "score": 121.0 - }, - "International Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "International Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "International Law - # prompt tokens": { - "description": "min=664.165, mean=664.165, max=664.165, sum=1328.331 (2)", - "tab": "General information", - "score": 664.1652892561983 - }, - "International Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" - } - } - 
}, - { - "evaluation_name": "Logical Fallacies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Logical Fallacies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.853, - "details": { - "description": "min=0.853, mean=0.853, max=0.853, sum=1.706 (2)", - "tab": "Accuracy", - "Logical Fallacies - Observed inference time (s)": { - "description": "min=1.237, mean=1.237, max=1.237, sum=2.474 (2)", - "tab": "Efficiency", - "score": 1.23694542580587 - }, - "Logical Fallacies - # eval": { - "description": "min=163, mean=163, max=163, sum=326 (2)", - "tab": "General information", - "score": 163.0 - }, - "Logical Fallacies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Logical Fallacies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Logical Fallacies - # prompt tokens": { - "description": "min=470.276, mean=470.276, max=470.276, sum=940.552 (2)", - "tab": "General information", - "score": 470.2760736196319 - }, - "Logical Fallacies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" - } - } - }, - { - "evaluation_name": "Machine Learning", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Machine Learning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.643, - "details": { - "description": "min=0.643, mean=0.643, max=0.643, sum=1.286 (2)", - "tab": "Accuracy", - "Machine Learning - Observed inference time (s)": { - "description": "min=1.362, mean=1.362, max=1.362, sum=2.725 (2)", - "tab": "Efficiency", - "score": 1.3623365994010652 - }, - "Machine Learning - # eval": { - "description": "min=112, mean=112, max=112, sum=224 (2)", - "tab": "General information", - "score": 112.0 - }, - "Machine Learning - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Machine Learning - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Machine Learning - # prompt tokens": { - "description": "min=676.518, mean=676.518, max=676.518, sum=1353.036 (2)", - "tab": "General information", - "score": 676.5178571428571 - }, - "Machine Learning - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" - } - } - }, - { - "evaluation_name": "Management", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - 
"url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Management", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.922, - "details": { - "description": "min=0.922, mean=0.922, max=0.922, sum=1.845 (2)", - "tab": "Accuracy", - "Management - Observed inference time (s)": { - "description": "min=1.265, mean=1.265, max=1.265, sum=2.529 (2)", - "tab": "Efficiency", - "score": 1.2646709923605317 - }, - "Management - # eval": { - "description": "min=103, mean=103, max=103, sum=206 (2)", - "tab": "General information", - "score": 103.0 - }, - "Management - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Management - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Management - # prompt tokens": { - "description": "min=301.282, mean=301.282, max=301.282, sum=602.563 (2)", - "tab": "General information", - "score": 301.28155339805824 - }, - "Management - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" - } - } - }, - { - "evaluation_name": "Marketing", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Marketing", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.85, - "details": { - "description": "min=0.85, mean=0.85, max=0.85, sum=1.701 (2)", - "tab": "Accuracy", - "Marketing - Observed inference time (s)": { - "description": "min=1.251, mean=1.251, max=1.251, sum=2.503 (2)", - "tab": "Efficiency", - "score": 1.2514099310605953 - }, - "Marketing - # eval": { - "description": "min=234, mean=234, max=234, sum=468 (2)", - "tab": "General information", - "score": 234.0 - }, - "Marketing - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Marketing - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Marketing - # prompt tokens": { - "description": "min=448.064, mean=448.064, max=448.064, sum=896.128 (2)", - "tab": "General information", - "score": 448.06410256410254 - }, - "Marketing - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" - } - } - }, - { - "evaluation_name": "Medical Genetics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Medical Genetics", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.79, - "details": { - "description": "min=0.79, mean=0.79, max=0.79, sum=1.58 (2)", - "tab": "Accuracy", - "Medical Genetics - Observed inference time (s)": { - "description": "min=1.22, mean=1.22, max=1.22, sum=2.441 (2)", - "tab": "Efficiency", - "score": 1.2204306960105895 - }, - "Medical Genetics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Medical Genetics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Medical Genetics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Medical Genetics - # prompt tokens": { - "description": "min=354.88, mean=354.88, max=354.88, sum=709.76 (2)", - "tab": "General information", - "score": 354.88 - }, - "Medical Genetics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" - } - } - }, - { - "evaluation_name": "Miscellaneous", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Miscellaneous", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.872, - "details": { - "description": "min=0.872, mean=0.872, max=0.872, sum=1.745 (2)", - "tab": "Accuracy", - "Miscellaneous - Observed inference time (s)": { - "description": "min=1.233, mean=1.233, max=1.233, sum=2.467 (2)", - "tab": "Efficiency", - "score": 1.2334287364516374 - }, - "Miscellaneous - # eval": { - "description": "min=783, mean=783, max=783, sum=1566 (2)", - "tab": "General information", - "score": 783.0 - }, - "Miscellaneous - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Miscellaneous - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Miscellaneous - # prompt tokens": { - "description": "min=328.628, mean=328.628, max=328.628, sum=657.257 (2)", - "tab": "General information", - "score": 328.62835249042143 - }, - "Miscellaneous - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" - } - } - }, - { - "evaluation_name": "Moral Scenarios", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Moral Scenarios", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.626, - "details": { - "description": "min=0.626, mean=0.626, max=0.626, 
sum=1.251 (2)", - "tab": "Accuracy", - "Moral Disputes - Observed inference time (s)": { - "description": "min=1.287, mean=1.287, max=1.287, sum=2.575 (2)", - "tab": "Efficiency", - "score": 1.2873861700124134 - }, - "Moral Scenarios - Observed inference time (s)": { - "description": "min=1.361, mean=1.361, max=1.361, sum=2.722 (2)", - "tab": "Efficiency", - "score": 1.361004557156696 - }, - "Moral Disputes - # eval": { - "description": "min=346, mean=346, max=346, sum=692 (2)", - "tab": "General information", - "score": 346.0 - }, - "Moral Disputes - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Disputes - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Disputes - # prompt tokens": { - "description": "min=511.789, mean=511.789, max=511.789, sum=1023.578 (2)", - "tab": "General information", - "score": 511.78901734104045 - }, - "Moral Disputes - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Moral Scenarios - # eval": { - "description": "min=895, mean=895, max=895, sum=1790 (2)", - "tab": "General information", - "score": 895.0 - }, - "Moral Scenarios - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Scenarios - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # prompt tokens": { - "description": "min=676.949, mean=676.949, max=676.949, sum=1353.897 (2)", - "tab": "General information", - "score": 676.9486033519553 - }, - "Moral Scenarios - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" - } - } - }, - { - "evaluation_name": "Nutrition", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Nutrition", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.82, - "details": { - "description": "min=0.82, mean=0.82, max=0.82, sum=1.641 (2)", - "tab": "Accuracy", - "Nutrition - Observed inference time (s)": { - "description": "min=1.319, mean=1.319, max=1.319, sum=2.638 (2)", - "tab": "Efficiency", - "score": 1.3189228679619582 - }, - "Nutrition - # eval": { - "description": "min=306, mean=306, max=306, sum=612 (2)", - "tab": "General information", - "score": 306.0 - }, - "Nutrition - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Nutrition - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Nutrition - # prompt tokens": { - "description": "min=617.065, mean=617.065, max=617.065, sum=1234.131 (2)", - "tab": "General information", - "score": 617.0653594771242 - }, - "Nutrition - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, 
- "generation_config": { - "additional_details": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" - } - } - }, - { - "evaluation_name": "Prehistory", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Prehistory", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.864, - "details": { - "description": "min=0.864, mean=0.864, max=0.864, sum=1.728 (2)", - "tab": "Accuracy", - "Prehistory - Observed inference time (s)": { - "description": "min=1.305, mean=1.305, max=1.305, sum=2.611 (2)", - "tab": "Efficiency", - "score": 1.305255777306027 - }, - "Prehistory - # eval": { - "description": "min=324, mean=324, max=324, sum=648 (2)", - "tab": "General information", - "score": 324.0 - }, - "Prehistory - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Prehistory - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Prehistory - # prompt tokens": { - "description": "min=545.639, mean=545.639, max=545.639, sum=1091.278 (2)", - "tab": "General information", - "score": 545.6388888888889 - }, - "Prehistory - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" - } - } - }, - { - "evaluation_name": "Public Relations", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Public Relations", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.782, - "details": { - "description": "min=0.782, mean=0.782, max=0.782, sum=1.564 (2)", - "tab": "Accuracy", - "Public Relations - Observed inference time (s)": { - "description": "min=1.25, mean=1.25, max=1.25, sum=2.5 (2)", - "tab": "Efficiency", - "score": 1.2497538588263772 - }, - "Public Relations - # eval": { - "description": "min=110, mean=110, max=110, sum=220 (2)", - "tab": "General information", - "score": 110.0 - }, - "Public Relations - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Public Relations - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Public Relations - # prompt tokens": { - "description": "min=432.991, mean=432.991, max=432.991, sum=865.982 (2)", - "tab": "General information", - "score": 432.9909090909091 - }, - "Public Relations - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" - } - } - }, - { - 
"evaluation_name": "Security Studies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Security Studies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.865, - "details": { - "description": "min=0.865, mean=0.865, max=0.865, sum=1.731 (2)", - "tab": "Accuracy", - "Security Studies - Observed inference time (s)": { - "description": "min=1.662, mean=1.662, max=1.662, sum=3.325 (2)", - "tab": "Efficiency", - "score": 1.6624354012158453 - }, - "Security Studies - # eval": { - "description": "min=245, mean=245, max=245, sum=490 (2)", - "tab": "General information", - "score": 245.0 - }, - "Security Studies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Security Studies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Security Studies - # prompt tokens": { - "description": "min=1243.804, mean=1243.804, max=1243.804, sum=2487.608 (2)", - "tab": "General information", - "score": 1243.8040816326532 - }, - "Security Studies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" - } - } - }, - { - "evaluation_name": "Sociology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Sociology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.905, - "details": { - "description": "min=0.905, mean=0.905, max=0.905, sum=1.811 (2)", - "tab": "Accuracy", - "Sociology - Observed inference time (s)": { - "description": "min=1.268, mean=1.268, max=1.268, sum=2.535 (2)", - "tab": "Efficiency", - "score": 1.267556501265189 - }, - "Sociology - # eval": { - "description": "min=201, mean=201, max=201, sum=402 (2)", - "tab": "General information", - "score": 201.0 - }, - "Sociology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Sociology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Sociology - # prompt tokens": { - "description": "min=467.274, mean=467.274, max=467.274, sum=934.547 (2)", - "tab": "General information", - "score": 467.27363184079604 - }, - "Sociology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" - } - } - }, - { - "evaluation_name": "Virology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Virology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.578, - "details": { - "description": "min=0.578, mean=0.578, max=0.578, sum=1.157 (2)", - "tab": "Accuracy", - "Virology - Observed inference time (s)": { - "description": "min=1.321, mean=1.321, max=1.321, sum=2.642 (2)", - "tab": "Efficiency", - "score": 1.3211244660687733 - }, - "Virology - # eval": { - "description": "min=166, mean=166, max=166, sum=332 (2)", - "tab": "General information", - "score": 166.0 - }, - "Virology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Virology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Virology - # prompt tokens": { - "description": "min=362.651, mean=362.651, max=362.651, sum=725.301 (2)", - "tab": "General information", - "score": 362.65060240963857 - }, - "Virology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" - } - } - }, - { - "evaluation_name": "World Religions", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on World Religions", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.871, - "details": { - "description": "min=0.871, mean=0.871, max=0.871, sum=1.743 (2)", - "tab": "Accuracy", - "World Religions - Observed inference time (s)": { - "description": "min=1.271, mean=1.271, max=1.271, sum=2.542 (2)", - "tab": "Efficiency", - "score": 1.2710035530447263 - }, - "World Religions - # eval": { - "description": "min=171, mean=171, max=171, sum=342 (2)", - "tab": "General information", - "score": 171.0 - }, - "World Religions - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "World Religions - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "World Religions - # prompt tokens": { - "description": "min=293.018, mean=293.018, max=293.018, sum=586.035 (2)", - "tab": "General information", - "score": 293.0175438596491 - }, - "World Religions - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" - } - } - }, - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model 
outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.082, - "details": { - "tab": "Efficiency" - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -}
\ No newline at end of file
diff --git a/data/helm_mmlu/anthropic/claude-instant-1.2/78fb6814-e32f-4b15-b958-9e001637ba07.json b/data/helm_mmlu/anthropic/claude-instant-1.2/78fb6814-e32f-4b15-b958-9e001637ba07.json
deleted file mode 100644
index c9e9779b1..000000000
--- a/data/helm_mmlu/anthropic/claude-instant-1.2/78fb6814-e32f-4b15-b958-9e001637ba07.json
+++ /dev/null
@@ -1,3021 +0,0 @@
-{ - "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/anthropic_claude-instant-1.2/1770835937.459157", - "retrieved_timestamp": "1770835937.459157", - "source_metadata": { - "source_name": "helm_mmlu", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Claude Instant 1.2", - "id": "anthropic/claude-instant-1.2", - "developer": "anthropic", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "MMLU All Subjects", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU All Subjects", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.688, - "details": { - "description": "min=0.333, mean=0.688, max=0.902, sum=78.425 (114)", - "tab": "Accuracy", - "MMLU All Subjects - Observed inference time (s)": { - "description": "min=0.59, mean=0.932, max=1.62, sum=106.285 (114)", - "tab": "Efficiency", - "score": 0.9323255288146379 - }, - "MMLU All Subjects - # eval": { - "description": "min=100, mean=246.351, max=1534, sum=28084 (114)", - "tab": "General information", - "score": 246.35087719298247 - }, - "MMLU All Subjects - # train": { - "description": "min=5, mean=5, max=5, sum=570 (114)", - "tab": "General information", - "score": 5.0 - }, - "MMLU All Subjects - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (114)", - "tab": "General information", - "score": 0.0 - }, - "MMLU All Subjects - # prompt tokens": { - "description": "min=358.018, mean=703.288, max=2952.576, sum=80174.875 (114)", - "tab": "General information", - "score": 703.2883793758955 - }, - "MMLU All Subjects - # output tokens": { - "description": "min=0.994, mean=1.0, max=1, sum=113.988 (114)", - "tab": "General information", - "score": 0.9998985904066524 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", -
"high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] - } - } - }, - { - "evaluation_name": "Abstract Algebra", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Abstract Algebra", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.37, - "details": { - "description": "min=0.37, mean=0.37, max=0.37, sum=0.74 (2)", - "tab": "Accuracy", - "Abstract Algebra - Observed inference time (s)": { - "description": "min=0.59, mean=0.59, max=0.59, sum=1.181 (2)", - "tab": "Efficiency", - "score": 0.5904157018661499 - }, - "Abstract Algebra - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Abstract Algebra - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Abstract Algebra - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Abstract Algebra - # 
prompt tokens": { - "description": "min=435.26, mean=435.26, max=435.26, sum=870.52 (2)", - "tab": "General information", - "score": 435.26 - }, - "Abstract Algebra - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" - } - } - }, - { - "evaluation_name": "Anatomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Anatomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.637, - "details": { - "description": "min=0.637, mean=0.637, max=0.637, sum=1.274 (2)", - "tab": "Accuracy", - "Anatomy - Observed inference time (s)": { - "description": "min=0.914, mean=0.914, max=0.914, sum=1.827 (2)", - "tab": "Efficiency", - "score": 0.9135703210477476 - }, - "Anatomy - # eval": { - "description": "min=135, mean=135, max=135, sum=270 (2)", - "tab": "General information", - "score": 135.0 - }, - "Anatomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Anatomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Anatomy - # prompt tokens": { - "description": "min=435.8, mean=435.8, max=435.8, sum=871.6 (2)", - "tab": "General information", - "score": 435.8 - }, - "Anatomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" - } - } - }, - { - "evaluation_name": "College Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on College Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.49, - "details": { - "description": "min=0.49, mean=0.49, max=0.49, sum=0.98 (2)", - "tab": "Accuracy", - "College Chemistry - Observed inference time (s)": { - "description": "min=0.636, mean=0.636, max=0.636, sum=1.272 (2)", - "tab": "Efficiency", - "score": 0.6360281848907471 - }, - "College Biology - Observed inference time (s)": { - "description": "min=1.016, mean=1.016, max=1.016, sum=2.033 (2)", - "tab": "Efficiency", - "score": 1.0163518455293443 - }, - "College Computer Science - Observed inference time (s)": { - "description": "min=1.153, mean=1.153, max=1.153, sum=2.306 (2)", - "tab": "Efficiency", - "score": 1.1530575346946716 - }, - "College Mathematics - Observed inference time (s)": { - "description": "min=1.157, mean=1.157, max=1.157, sum=2.314 (2)", - "tab": "Efficiency", - "score": 1.1569927215576172 - }, - "College Medicine - Observed inference time (s)": { - "description": "min=1.086, mean=1.086, max=1.086, sum=2.173 (2)", - "tab": "Efficiency", - "score": 
1.0863008636959715 - }, - "College Physics - Observed inference time (s)": { - "description": "min=0.938, mean=0.938, max=0.938, sum=1.875 (2)", - "tab": "Efficiency", - "score": 0.9376059443342919 - }, - "College Chemistry - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Chemistry - # prompt tokens": { - "description": "min=615.01, mean=615.01, max=615.01, sum=1230.02 (2)", - "tab": "General information", - "score": 615.01 - }, - "College Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Biology - # eval": { - "description": "min=144, mean=144, max=144, sum=288 (2)", - "tab": "General information", - "score": 144.0 - }, - "College Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # prompt tokens": { - "description": "min=555.347, mean=555.347, max=555.347, sum=1110.694 (2)", - "tab": "General information", - "score": 555.3472222222222 - }, - "College Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer Science - # prompt tokens": { - "description": "min=903.24, mean=903.24, max=903.24, sum=1806.48 (2)", - "tab": "General information", - "score": 903.24 - }, - "College Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Mathematics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # prompt tokens": { - "description": "min=669.19, mean=669.19, max=669.19, sum=1338.38 (2)", - "tab": "General information", - "score": 669.19 - }, - "College Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Medicine - # eval": { - "description": "min=173, mean=173, max=173, sum=346 (2)", - "tab": "General information", - "score": 173.0 - }, - "College Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Medicine - 
truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Medicine - # prompt tokens": { - "description": "min=605.63, mean=605.63, max=605.63, sum=1211.26 (2)", - "tab": "General information", - "score": 605.6300578034682 - }, - "College Medicine - # output tokens": { - "description": "min=0.994, mean=0.994, max=0.994, sum=1.988 (2)", - "tab": "General information", - "score": 0.9942196531791907 - }, - "College Physics - # eval": { - "description": "min=102, mean=102, max=102, sum=204 (2)", - "tab": "General information", - "score": 102.0 - }, - "College Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Physics - # prompt tokens": { - "description": "min=554.48, mean=554.48, max=554.48, sum=1108.961 (2)", - "tab": "General information", - "score": 554.4803921568628 - }, - "College Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" - } - } - }, - { - "evaluation_name": "Computer Security", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Computer Security", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.76, - "details": { - "description": "min=0.76, mean=0.76, max=0.76, sum=1.52 (2)", - "tab": "Accuracy", - "Computer Security - Observed inference time (s)": { - "description": "min=0.597, mean=0.597, max=0.597, sum=1.194 (2)", - "tab": "Efficiency", - "score": 0.596819703578949 - }, - "Computer Security - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Computer Security - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Computer Security - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Computer Security - # prompt tokens": { - "description": "min=463.62, mean=463.62, max=463.62, sum=927.24 (2)", - "tab": "General information", - "score": 463.62 - }, - "Computer Security - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" - } - } - }, - { - "evaluation_name": "Econometrics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Econometrics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.614, - "details": { - "description": "min=0.614, mean=0.614, max=0.614, sum=1.228 (2)", - "tab": "Accuracy", - "Econometrics - Observed inference time (s)": { - "description": "min=0.633, mean=0.633, max=0.633, sum=1.267 (2)", - "tab": "Efficiency", - "score": 0.6333246440218206 - }, - "Econometrics - # eval": { - "description": "min=114, mean=114, max=114, sum=228 (2)", - "tab": "General information", - "score": 114.0 - }, - "Econometrics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Econometrics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Econometrics - # prompt tokens": { - "description": "min=684.596, mean=684.596, max=684.596, sum=1369.193 (2)", - "tab": "General information", - "score": 684.5964912280701 - }, - "Econometrics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" - } - } - }, - { - "evaluation_name": "Global Facts", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Global Facts", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.38, - "details": { - "description": "min=0.38, mean=0.38, max=0.38, sum=0.76 (2)", - "tab": "Accuracy", - "Global Facts - Observed inference time (s)": { - "description": "min=0.975, mean=0.975, max=0.975, sum=1.949 (2)", - "tab": "Efficiency", - "score": 0.9746571969985962 - }, - "Global Facts - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Global Facts - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Global Facts - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Global Facts - # prompt tokens": { - "description": "min=476.61, mean=476.61, max=476.61, sum=953.22 (2)", - "tab": "General information", - "score": 476.61 - }, - "Global Facts - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" - } - } - }, - { - "evaluation_name": "Jurisprudence", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Jurisprudence", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.833, - "details": { - "description": "min=0.833, mean=0.833, max=0.833, sum=1.667 (2)", - "tab": "Accuracy", - "Jurisprudence - Observed inference time (s)": { - "description": "min=0.811, 
mean=0.811, max=0.811, sum=1.621 (2)", - "tab": "Efficiency", - "score": 0.8107206269546792 - }, - "Jurisprudence - # eval": { - "description": "min=108, mean=108, max=108, sum=216 (2)", - "tab": "General information", - "score": 108.0 - }, - "Jurisprudence - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Jurisprudence - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Jurisprudence - # prompt tokens": { - "description": "min=496.426, mean=496.426, max=496.426, sum=992.852 (2)", - "tab": "General information", - "score": 496.4259259259259 - }, - "Jurisprudence - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" - } - } - }, - { - "evaluation_name": "Philosophy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Philosophy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.756, - "details": { - "description": "min=0.756, mean=0.756, max=0.756, sum=1.511 (2)", - "tab": "Accuracy", - "Philosophy - Observed inference time (s)": { - "description": "min=0.832, mean=0.832, max=0.832, sum=1.664 (2)", - "tab": "Efficiency", - "score": 0.8319868075502647 - }, - "Philosophy - # eval": { - "description": "min=311, mean=311, max=311, sum=622 (2)", - "tab": "General information", - "score": 311.0 - }, - "Philosophy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Philosophy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Philosophy - # prompt tokens": { - "description": "min=424.965, mean=424.965, max=424.965, sum=849.929 (2)", - "tab": "General information", - "score": 424.9646302250804 - }, - "Philosophy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" - } - } - }, - { - "evaluation_name": "Professional Psychology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Professional Psychology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.724, - "details": { - "description": "min=0.724, mean=0.724, max=0.724, sum=1.448 (2)", - "tab": "Accuracy", - "Professional Medicine - Observed inference time (s)": { - "description": "min=1.073, mean=1.073, max=1.073, sum=2.146 (2)", - "tab": "Efficiency", - "score": 1.072824116138851 - }, - "Professional Accounting - Observed inference time (s)": { - "description": "min=0.895, 
mean=0.895, max=0.895, sum=1.79 (2)", - "tab": "Efficiency", - "score": 0.8950984232814599 - }, - "Professional Law - Observed inference time (s)": { - "description": "min=1.058, mean=1.058, max=1.058, sum=2.117 (2)", - "tab": "Efficiency", - "score": 1.0584386131754133 - }, - "Professional Psychology - Observed inference time (s)": { - "description": "min=0.859, mean=0.859, max=0.859, sum=1.718 (2)", - "tab": "Efficiency", - "score": 0.8591087651408575 - }, - "Professional Medicine - # eval": { - "description": "min=272, mean=272, max=272, sum=544 (2)", - "tab": "General information", - "score": 272.0 - }, - "Professional Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Medicine - # prompt tokens": { - "description": "min=1188.537, mean=1188.537, max=1188.537, sum=2377.074 (2)", - "tab": "General information", - "score": 1188.5367647058824 - }, - "Professional Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Accounting - # eval": { - "description": "min=282, mean=282, max=282, sum=564 (2)", - "tab": "General information", - "score": 282.0 - }, - "Professional Accounting - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Accounting - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Accounting - # prompt tokens": { - "description": "min=730.422, mean=730.422, max=730.422, sum=1460.844 (2)", - "tab": "General information", - "score": 730.4219858156029 - }, - "Professional Accounting - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Law - # eval": { - "description": "min=1534, mean=1534, max=1534, sum=3068 (2)", - "tab": "General information", - "score": 1534.0 - }, - "Professional Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # prompt tokens": { - "description": "min=1766.16, mean=1766.16, max=1766.16, sum=3532.321 (2)", - "tab": "General information", - "score": 1766.16036505867 - }, - "Professional Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Psychology - # eval": { - "description": "min=612, mean=612, max=612, sum=1224 (2)", - "tab": "General information", - "score": 612.0 - }, - "Professional Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # prompt tokens": { - "description": "min=668.168, mean=668.168, max=668.168, sum=1336.337 (2)", - "tab": "General information", - "score": 668.1683006535948 - }, - "Professional Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": 
"General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" - } - } - }, - { - "evaluation_name": "Us Foreign Policy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Us Foreign Policy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9, - "details": { - "description": "min=0.9, mean=0.9, max=0.9, sum=1.8 (2)", - "tab": "Accuracy", - "Us Foreign Policy - Observed inference time (s)": { - "description": "min=0.613, mean=0.613, max=0.613, sum=1.226 (2)", - "tab": "Efficiency", - "score": 0.6128408885002137 - }, - "Us Foreign Policy - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Us Foreign Policy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Us Foreign Policy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Us Foreign Policy - # prompt tokens": { - "description": "min=520.25, mean=520.25, max=520.25, sum=1040.5 (2)", - "tab": "General information", - "score": 520.25 - }, - "Us Foreign Policy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" - } - } - }, - { - "evaluation_name": "Astronomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Astronomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.743, - "details": { - "description": "min=0.743, mean=0.743, max=0.743, sum=1.487 (2)", - "tab": "Accuracy", - "Astronomy - Observed inference time (s)": { - "description": "min=1.124, mean=1.124, max=1.124, sum=2.248 (2)", - "tab": "Efficiency", - "score": 1.123885358634748 - }, - "Astronomy - # eval": { - "description": "min=152, mean=152, max=152, sum=304 (2)", - "tab": "General information", - "score": 152.0 - }, - "Astronomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Astronomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Astronomy - # prompt tokens": { - "description": "min=669.493, mean=669.493, max=669.493, sum=1338.987 (2)", - "tab": "General information", - "score": 669.4934210526316 - }, - "Astronomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": 
"test", - "groups": "mmlu_astronomy" - } - } - }, - { - "evaluation_name": "Business Ethics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Business Ethics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7, - "details": { - "description": "min=0.7, mean=0.7, max=0.7, sum=1.4 (2)", - "tab": "Accuracy", - "Business Ethics - Observed inference time (s)": { - "description": "min=1.102, mean=1.102, max=1.102, sum=2.204 (2)", - "tab": "Efficiency", - "score": 1.101954047679901 - }, - "Business Ethics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Business Ethics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Business Ethics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Business Ethics - # prompt tokens": { - "description": "min=665.02, mean=665.02, max=665.02, sum=1330.04 (2)", - "tab": "General information", - "score": 665.02 - }, - "Business Ethics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" - } - } - }, - { - "evaluation_name": "Clinical Knowledge", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Clinical Knowledge", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.709, - "details": { - "description": "min=0.709, mean=0.709, max=0.709, sum=1.419 (2)", - "tab": "Accuracy", - "Clinical Knowledge - Observed inference time (s)": { - "description": "min=0.899, mean=0.899, max=0.899, sum=1.799 (2)", - "tab": "Efficiency", - "score": 0.8994299870616985 - }, - "Clinical Knowledge - # eval": { - "description": "min=265, mean=265, max=265, sum=530 (2)", - "tab": "General information", - "score": 265.0 - }, - "Clinical Knowledge - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Clinical Knowledge - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Clinical Knowledge - # prompt tokens": { - "description": "min=494.457, mean=494.457, max=494.457, sum=988.913 (2)", - "tab": "General information", - "score": 494.4566037735849 - }, - "Clinical Knowledge - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" - } - } - }, - { - "evaluation_name": "Conceptual Physics", - "source_data": { - "dataset_name": 
"helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Conceptual Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.613, - "details": { - "description": "min=0.613, mean=0.613, max=0.613, sum=1.226 (2)", - "tab": "Accuracy", - "Conceptual Physics - Observed inference time (s)": { - "description": "min=0.773, mean=0.773, max=0.773, sum=1.546 (2)", - "tab": "Efficiency", - "score": 0.7728059119366585 - }, - "Conceptual Physics - # eval": { - "description": "min=235, mean=235, max=235, sum=470 (2)", - "tab": "General information", - "score": 235.0 - }, - "Conceptual Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Conceptual Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Conceptual Physics - # prompt tokens": { - "description": "min=388.536, mean=388.536, max=388.536, sum=777.072 (2)", - "tab": "General information", - "score": 388.53617021276597 - }, - "Conceptual Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" - } - } - }, - { - "evaluation_name": "Electrical Engineering", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Electrical Engineering", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.641, - "details": { - "description": "min=0.641, mean=0.641, max=0.641, sum=1.283 (2)", - "tab": "Accuracy", - "Electrical Engineering - Observed inference time (s)": { - "description": "min=0.932, mean=0.932, max=0.932, sum=1.865 (2)", - "tab": "Efficiency", - "score": 0.9323823583537134 - }, - "Electrical Engineering - # eval": { - "description": "min=145, mean=145, max=145, sum=290 (2)", - "tab": "General information", - "score": 145.0 - }, - "Electrical Engineering - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Electrical Engineering - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Electrical Engineering - # prompt tokens": { - "description": "min=502.041, mean=502.041, max=502.041, sum=1004.083 (2)", - "tab": "General information", - "score": 502.04137931034484 - }, - "Electrical Engineering - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" - } - } - }, - { - "evaluation_name": "Elementary Mathematics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - 
"url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Elementary Mathematics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.45, - "details": { - "description": "min=0.45, mean=0.45, max=0.45, sum=0.899 (2)", - "tab": "Accuracy", - "Elementary Mathematics - Observed inference time (s)": { - "description": "min=0.945, mean=0.945, max=0.945, sum=1.891 (2)", - "tab": "Efficiency", - "score": 0.945274135423085 - }, - "Elementary Mathematics - # eval": { - "description": "min=378, mean=378, max=378, sum=756 (2)", - "tab": "General information", - "score": 378.0 - }, - "Elementary Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Elementary Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Elementary Mathematics - # prompt tokens": { - "description": "min=576.066, mean=576.066, max=576.066, sum=1152.132 (2)", - "tab": "General information", - "score": 576.0661375661375 - }, - "Elementary Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" - } - } - }, - { - "evaluation_name": "Formal Logic", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Formal Logic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.444, - "details": { - "description": "min=0.444, mean=0.444, max=0.444, sum=0.889 (2)", - "tab": "Accuracy", - "Formal Logic - Observed inference time (s)": { - "description": "min=1.151, mean=1.151, max=1.151, sum=2.302 (2)", - "tab": "Efficiency", - "score": 1.1508805732878427 - }, - "Formal Logic - # eval": { - "description": "min=126, mean=126, max=126, sum=252 (2)", - "tab": "General information", - "score": 126.0 - }, - "Formal Logic - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Formal Logic - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Formal Logic - # prompt tokens": { - "description": "min=711.746, mean=711.746, max=711.746, sum=1423.492 (2)", - "tab": "General information", - "score": 711.7460317460317 - }, - "Formal Logic - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" - } - } - }, - { - "evaluation_name": "High School World History", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on High School World History", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.878, - "details": { - "description": "min=0.878, mean=0.878, max=0.878, sum=1.755 (2)", - "tab": "Accuracy", - "High School Biology - Observed inference time (s)": { - "description": "min=0.998, mean=0.998, max=0.998, sum=1.996 (2)", - "tab": "Efficiency", - "score": 0.9978926274084275 - }, - "High School Chemistry - Observed inference time (s)": { - "description": "min=0.934, mean=0.934, max=0.934, sum=1.868 (2)", - "tab": "Efficiency", - "score": 0.9337695701956161 - }, - "High School Computer Science - Observed inference time (s)": { - "description": "min=1.046, mean=1.046, max=1.046, sum=2.091 (2)", - "tab": "Efficiency", - "score": 1.0455269980430604 - }, - "High School European History - Observed inference time (s)": { - "description": "min=1.62, mean=1.62, max=1.62, sum=3.241 (2)", - "tab": "Efficiency", - "score": 1.6203449783903179 - }, - "High School Geography - Observed inference time (s)": { - "description": "min=0.877, mean=0.877, max=0.877, sum=1.754 (2)", - "tab": "Efficiency", - "score": 0.876823568584943 - }, - "High School Government And Politics - Observed inference time (s)": { - "description": "min=1.037, mean=1.037, max=1.037, sum=2.074 (2)", - "tab": "Efficiency", - "score": 1.0370552873364385 - }, - "High School Macroeconomics - Observed inference time (s)": { - "description": "min=0.925, mean=0.925, max=0.925, sum=1.849 (2)", - "tab": "Efficiency", - "score": 0.9246660091938117 - }, - "High School Mathematics - Observed inference time (s)": { - "description": "min=1.014, mean=1.014, max=1.014, sum=2.027 (2)", - "tab": "Efficiency", - "score": 1.013659605273494 - }, - "High School Microeconomics - Observed inference time (s)": { - "description": "min=1.163, mean=1.163, max=1.163, sum=2.325 (2)", - "tab": "Efficiency", - "score": 1.1627413104562199 - }, - "High School Physics - Observed inference time (s)": { - "description": "min=0.963, mean=0.963, max=0.963, sum=1.925 (2)", - "tab": "Efficiency", - "score": 0.9627095689836717 - }, - "High School Psychology - Observed inference time (s)": { - "description": "min=0.947, mean=0.947, max=0.947, sum=1.894 (2)", - "tab": "Efficiency", - "score": 0.9471190351958668 - }, - "High School Statistics - Observed inference time (s)": { - "description": "min=0.929, mean=0.929, max=0.929, sum=1.857 (2)", - "tab": "Efficiency", - "score": 0.9286887921668865 - }, - "High School US History - Observed inference time (s)": { - "description": "min=1.383, mean=1.383, max=1.383, sum=2.766 (2)", - "tab": "Efficiency", - "score": 1.3831783030547349 - }, - "High School World History - Observed inference time (s)": { - "description": "min=1.246, mean=1.246, max=1.246, sum=2.492 (2)", - "tab": "Efficiency", - "score": 1.2459266769232127 - }, - "High School Biology - # eval": { - "description": "min=310, mean=310, max=310, sum=620 (2)", - "tab": "General information", - "score": 310.0 - }, - "High School Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Biology - # prompt 
tokens": { - "description": "min=599.577, mean=599.577, max=599.577, sum=1199.155 (2)", - "tab": "General information", - "score": 599.5774193548388 - }, - "High School Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Chemistry - # eval": { - "description": "min=203, mean=203, max=203, sum=406 (2)", - "tab": "General information", - "score": 203.0 - }, - "High School Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # prompt tokens": { - "description": "min=562.921, mean=562.921, max=562.921, sum=1125.842 (2)", - "tab": "General information", - "score": 562.9211822660099 - }, - "High School Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "High School Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # prompt tokens": { - "description": "min=947.4, mean=947.4, max=947.4, sum=1894.8 (2)", - "tab": "General information", - "score": 947.4 - }, - "High School Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School European History - # eval": { - "description": "min=165, mean=165, max=165, sum=330 (2)", - "tab": "General information", - "score": 165.0 - }, - "High School European History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School European History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School European History - # prompt tokens": { - "description": "min=2952.576, mean=2952.576, max=2952.576, sum=5905.152 (2)", - "tab": "General information", - "score": 2952.5757575757575 - }, - "High School European History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Geography - # eval": { - "description": "min=198, mean=198, max=198, sum=396 (2)", - "tab": "General information", - "score": 198.0 - }, - "High School Geography - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Geography - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Geography - # prompt tokens": { - "description": "min=477.268, mean=477.268, max=477.268, sum=954.535 (2)", - "tab": "General information", - "score": 477.2676767676768 - }, - "High School Geography - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Government And Politics - # eval": { - "description": 
"min=193, mean=193, max=193, sum=386 (2)", - "tab": "General information", - "score": 193.0 - }, - "High School Government And Politics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Government And Politics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # prompt tokens": { - "description": "min=556.104, mean=556.104, max=556.104, sum=1112.207 (2)", - "tab": "General information", - "score": 556.1036269430052 - }, - "High School Government And Politics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Macroeconomics - # eval": { - "description": "min=390, mean=390, max=390, sum=780 (2)", - "tab": "General information", - "score": 390.0 - }, - "High School Macroeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Macroeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - # prompt tokens": { - "description": "min=471.036, mean=471.036, max=471.036, sum=942.072 (2)", - "tab": "General information", - "score": 471.0358974358974 - }, - "High School Macroeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Mathematics - # eval": { - "description": "min=270, mean=270, max=270, sum=540 (2)", - "tab": "General information", - "score": 270.0 - }, - "High School Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # prompt tokens": { - "description": "min=584.881, mean=584.881, max=584.881, sum=1169.763 (2)", - "tab": "General information", - "score": 584.8814814814815 - }, - "High School Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Microeconomics - # eval": { - "description": "min=238, mean=238, max=238, sum=476 (2)", - "tab": "General information", - "score": 238.0 - }, - "High School Microeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Microeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Microeconomics - # prompt tokens": { - "description": "min=485.513, mean=485.513, max=485.513, sum=971.025 (2)", - "tab": "General information", - "score": 485.5126050420168 - }, - "High School Microeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Physics - # eval": { - "description": "min=151, mean=151, max=151, sum=302 (2)", - "tab": "General information", - "score": 151.0 - }, - "High School Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Physics - truncated": { - "description": "min=0, mean=0, 
max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # prompt tokens": { - "description": "min=623.841, mean=623.841, max=623.841, sum=1247.682 (2)", - "tab": "General information", - "score": 623.841059602649 - }, - "High School Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Psychology - # eval": { - "description": "min=545, mean=545, max=545, sum=1090 (2)", - "tab": "General information", - "score": 545.0 - }, - "High School Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # prompt tokens": { - "description": "min=586.42, mean=586.42, max=586.42, sum=1172.84 (2)", - "tab": "General information", - "score": 586.4201834862386 - }, - "High School Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Statistics - # eval": { - "description": "min=216, mean=216, max=216, sum=432 (2)", - "tab": "General information", - "score": 216.0 - }, - "High School Statistics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Statistics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Statistics - # prompt tokens": { - "description": "min=871.963, mean=871.963, max=871.963, sum=1743.926 (2)", - "tab": "General information", - "score": 871.9629629629629 - }, - "High School Statistics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School US History - # eval": { - "description": "min=204, mean=204, max=204, sum=408 (2)", - "tab": "General information", - "score": 204.0 - }, - "High School US History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School US History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # prompt tokens": { - "description": "min=2353.49, mean=2353.49, max=2353.49, sum=4706.98 (2)", - "tab": "General information", - "score": 2353.4901960784314 - }, - "High School US History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School World History - # eval": { - "description": "min=237, mean=237, max=237, sum=474 (2)", - "tab": "General information", - "score": 237.0 - }, - "High School World History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School World History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School World History - # prompt tokens": { - "description": "min=1540.932, mean=1540.932, max=1540.932, sum=3081.865 (2)", - "tab": "General information", - "score": 1540.9324894514768 - }, - "High School World History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - 
"score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" - } - } - }, - { - "evaluation_name": "Human Sexuality", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Human Sexuality", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.794, - "details": { - "description": "min=0.794, mean=0.794, max=0.794, sum=1.588 (2)", - "tab": "Accuracy", - "Human Aging - Observed inference time (s)": { - "description": "min=0.782, mean=0.782, max=0.782, sum=1.563 (2)", - "tab": "Efficiency", - "score": 0.7815119557316528 - }, - "Human Sexuality - Observed inference time (s)": { - "description": "min=0.763, mean=0.763, max=0.763, sum=1.526 (2)", - "tab": "Efficiency", - "score": 0.7630931584889652 - }, - "Human Aging - # eval": { - "description": "min=223, mean=223, max=223, sum=446 (2)", - "tab": "General information", - "score": 223.0 - }, - "Human Aging - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Aging - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Aging - # prompt tokens": { - "description": "min=400.955, mean=400.955, max=400.955, sum=801.91 (2)", - "tab": "General information", - "score": 400.95515695067263 - }, - "Human Aging - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Human Sexuality - # eval": { - "description": "min=131, mean=131, max=131, sum=262 (2)", - "tab": "General information", - "score": 131.0 - }, - "Human Sexuality - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Sexuality - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # prompt tokens": { - "description": "min=436.496, mean=436.496, max=436.496, sum=872.992 (2)", - "tab": "General information", - "score": 436.4961832061069 - }, - "Human Sexuality - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" - } - } - }, - { - "evaluation_name": "International Law", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on International Law", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.851, - "details": { - "description": "min=0.851, mean=0.851, max=0.851, sum=1.702 (2)", - "tab": "Accuracy", - "International Law - Observed inference time (s)": { - "description": "min=0.888, mean=0.888, max=0.888, sum=1.775 (2)", - 
"tab": "Efficiency", - "score": 0.8875030958948057 - }, - "International Law - # eval": { - "description": "min=121, mean=121, max=121, sum=242 (2)", - "tab": "General information", - "score": 121.0 - }, - "International Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "International Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "International Law - # prompt tokens": { - "description": "min=729.165, mean=729.165, max=729.165, sum=1458.331 (2)", - "tab": "General information", - "score": 729.1652892561983 - }, - "International Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" - } - } - }, - { - "evaluation_name": "Logical Fallacies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Logical Fallacies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.81, - "details": { - "description": "min=0.81, mean=0.81, max=0.81, sum=1.62 (2)", - "tab": "Accuracy", - "Logical Fallacies - Observed inference time (s)": { - "description": "min=0.939, mean=0.939, max=0.939, sum=1.878 (2)", - "tab": "Efficiency", - "score": 0.9389484660025754 - }, - "Logical Fallacies - # eval": { - "description": "min=163, mean=163, max=163, sum=326 (2)", - "tab": "General information", - "score": 163.0 - }, - "Logical Fallacies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Logical Fallacies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Logical Fallacies - # prompt tokens": { - "description": "min=535.276, mean=535.276, max=535.276, sum=1070.552 (2)", - "tab": "General information", - "score": 535.2760736196319 - }, - "Logical Fallacies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" - } - } - }, - { - "evaluation_name": "Machine Learning", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Machine Learning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.67, - "details": { - "description": "min=0.67, mean=0.67, max=0.67, sum=1.339 (2)", - "tab": "Accuracy", - "Machine Learning - Observed inference time (s)": { - "description": "min=0.887, mean=0.887, max=0.887, sum=1.774 (2)", - "tab": "Efficiency", - "score": 0.8872403161866325 - }, - "Machine Learning - # eval": { - "description": "min=112, 
mean=112, max=112, sum=224 (2)", - "tab": "General information", - "score": 112.0 - }, - "Machine Learning - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Machine Learning - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Machine Learning - # prompt tokens": { - "description": "min=741.518, mean=741.518, max=741.518, sum=1483.036 (2)", - "tab": "General information", - "score": 741.5178571428571 - }, - "Machine Learning - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" - } - } - }, - { - "evaluation_name": "Management", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Management", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.835, - "details": { - "description": "min=0.835, mean=0.835, max=0.835, sum=1.67 (2)", - "tab": "Accuracy", - "Management - Observed inference time (s)": { - "description": "min=0.931, mean=0.931, max=0.931, sum=1.862 (2)", - "tab": "Efficiency", - "score": 0.9309975244466541 - }, - "Management - # eval": { - "description": "min=103, mean=103, max=103, sum=206 (2)", - "tab": "General information", - "score": 103.0 - }, - "Management - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Management - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Management - # prompt tokens": { - "description": "min=366.282, mean=366.282, max=366.282, sum=732.563 (2)", - "tab": "General information", - "score": 366.28155339805824 - }, - "Management - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" - } - } - }, - { - "evaluation_name": "Marketing", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Marketing", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.885, - "details": { - "description": "min=0.885, mean=0.885, max=0.885, sum=1.769 (2)", - "tab": "Accuracy", - "Marketing - Observed inference time (s)": { - "description": "min=0.961, mean=0.961, max=0.961, sum=1.923 (2)", - "tab": "Efficiency", - "score": 0.9613573286268446 - }, - "Marketing - # eval": { - "description": "min=234, mean=234, max=234, sum=468 (2)", - "tab": "General information", - "score": 234.0 - }, - "Marketing - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 
- }, - "Marketing - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Marketing - # prompt tokens": { - "description": "min=513.064, mean=513.064, max=513.064, sum=1026.128 (2)", - "tab": "General information", - "score": 513.0641025641025 - }, - "Marketing - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" - } - } - }, - { - "evaluation_name": "Medical Genetics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Medical Genetics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.71, - "details": { - "description": "min=0.71, mean=0.71, max=0.71, sum=1.42 (2)", - "tab": "Accuracy", - "Medical Genetics - Observed inference time (s)": { - "description": "min=0.81, mean=0.81, max=0.81, sum=1.621 (2)", - "tab": "Efficiency", - "score": 0.8103219223022461 - }, - "Medical Genetics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Medical Genetics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Medical Genetics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Medical Genetics - # prompt tokens": { - "description": "min=419.88, mean=419.88, max=419.88, sum=839.76 (2)", - "tab": "General information", - "score": 419.88 - }, - "Medical Genetics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" - } - } - }, - { - "evaluation_name": "Miscellaneous", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Miscellaneous", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.828, - "details": { - "description": "min=0.828, mean=0.828, max=0.828, sum=1.655 (2)", - "tab": "Accuracy", - "Miscellaneous - Observed inference time (s)": { - "description": "min=0.826, mean=0.826, max=0.826, sum=1.652 (2)", - "tab": "Efficiency", - "score": 0.8259343528503964 - }, - "Miscellaneous - # eval": { - "description": "min=783, mean=783, max=783, sum=1566 (2)", - "tab": "General information", - "score": 783.0 - }, - "Miscellaneous - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Miscellaneous - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Miscellaneous - # prompt tokens": { - "description": 
"min=393.628, mean=393.628, max=393.628, sum=787.257 (2)", - "tab": "General information", - "score": 393.62835249042143 - }, - "Miscellaneous - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" - } - } - }, - { - "evaluation_name": "Moral Scenarios", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Moral Scenarios", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.488, - "details": { - "description": "min=0.488, mean=0.488, max=0.488, sum=0.977 (2)", - "tab": "Accuracy", - "Moral Disputes - Observed inference time (s)": { - "description": "min=0.938, mean=0.938, max=0.938, sum=1.876 (2)", - "tab": "Efficiency", - "score": 0.937887375065357 - }, - "Moral Scenarios - Observed inference time (s)": { - "description": "min=0.885, mean=0.885, max=0.885, sum=1.77 (2)", - "tab": "Efficiency", - "score": 0.8848049091893201 - }, - "Moral Disputes - # eval": { - "description": "min=346, mean=346, max=346, sum=692 (2)", - "tab": "General information", - "score": 346.0 - }, - "Moral Disputes - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Disputes - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Disputes - # prompt tokens": { - "description": "min=576.789, mean=576.789, max=576.789, sum=1153.578 (2)", - "tab": "General information", - "score": 576.7890173410404 - }, - "Moral Disputes - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Moral Scenarios - # eval": { - "description": "min=895, mean=895, max=895, sum=1790 (2)", - "tab": "General information", - "score": 895.0 - }, - "Moral Scenarios - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Scenarios - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # prompt tokens": { - "description": "min=741.949, mean=741.949, max=741.949, sum=1483.897 (2)", - "tab": "General information", - "score": 741.9486033519553 - }, - "Moral Scenarios - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" - } - } - }, - { - "evaluation_name": "Nutrition", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Nutrition", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.735, - 
"details": { - "description": "min=0.735, mean=0.735, max=0.735, sum=1.471 (2)", - "tab": "Accuracy", - "Nutrition - Observed inference time (s)": { - "description": "min=0.881, mean=0.881, max=0.881, sum=1.761 (2)", - "tab": "Efficiency", - "score": 0.8806839573617075 - }, - "Nutrition - # eval": { - "description": "min=306, mean=306, max=306, sum=612 (2)", - "tab": "General information", - "score": 306.0 - }, - "Nutrition - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Nutrition - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Nutrition - # prompt tokens": { - "description": "min=682.065, mean=682.065, max=682.065, sum=1364.131 (2)", - "tab": "General information", - "score": 682.0653594771242 - }, - "Nutrition - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" - } - } - }, - { - "evaluation_name": "Prehistory", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Prehistory", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.762, - "details": { - "description": "min=0.762, mean=0.762, max=0.762, sum=1.525 (2)", - "tab": "Accuracy", - "Prehistory - Observed inference time (s)": { - "description": "min=0.819, mean=0.819, max=0.819, sum=1.638 (2)", - "tab": "Efficiency", - "score": 0.8192079758938448 - }, - "Prehistory - # eval": { - "description": "min=324, mean=324, max=324, sum=648 (2)", - "tab": "General information", - "score": 324.0 - }, - "Prehistory - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Prehistory - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Prehistory - # prompt tokens": { - "description": "min=610.639, mean=610.639, max=610.639, sum=1221.278 (2)", - "tab": "General information", - "score": 610.6388888888889 - }, - "Prehistory - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" - } - } - }, - { - "evaluation_name": "Public Relations", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Public Relations", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.627, - "details": { - "description": "min=0.627, mean=0.627, max=0.627, sum=1.255 (2)", - "tab": "Accuracy", - "Public Relations - Observed inference time (s)": { - "description": "min=0.736, mean=0.736, max=0.736, sum=1.471 (2)", - "tab": 
"Efficiency", - "score": 0.735536317391829 - }, - "Public Relations - # eval": { - "description": "min=110, mean=110, max=110, sum=220 (2)", - "tab": "General information", - "score": 110.0 - }, - "Public Relations - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Public Relations - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Public Relations - # prompt tokens": { - "description": "min=497.991, mean=497.991, max=497.991, sum=995.982 (2)", - "tab": "General information", - "score": 497.9909090909091 - }, - "Public Relations - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" - } - } - }, - { - "evaluation_name": "Security Studies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Security Studies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.784, - "details": { - "description": "min=0.784, mean=0.784, max=0.784, sum=1.567 (2)", - "tab": "Accuracy", - "Security Studies - Observed inference time (s)": { - "description": "min=0.949, mean=0.949, max=0.949, sum=1.898 (2)", - "tab": "Efficiency", - "score": 0.9487942345288335 - }, - "Security Studies - # eval": { - "description": "min=245, mean=245, max=245, sum=490 (2)", - "tab": "General information", - "score": 245.0 - }, - "Security Studies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Security Studies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Security Studies - # prompt tokens": { - "description": "min=1308.804, mean=1308.804, max=1308.804, sum=2617.608 (2)", - "tab": "General information", - "score": 1308.8040816326532 - }, - "Security Studies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" - } - } - }, - { - "evaluation_name": "Sociology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Sociology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.841, - "details": { - "description": "min=0.841, mean=0.841, max=0.841, sum=1.682 (2)", - "tab": "Accuracy", - "Sociology - Observed inference time (s)": { - "description": "min=0.843, mean=0.843, max=0.843, sum=1.687 (2)", - "tab": "Efficiency", - "score": 0.8433953909138542 - }, - "Sociology - # eval": { - "description": "min=201, mean=201, max=201, sum=402 (2)", - "tab": 
"General information", - "score": 201.0 - }, - "Sociology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Sociology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Sociology - # prompt tokens": { - "description": "min=532.274, mean=532.274, max=532.274, sum=1064.547 (2)", - "tab": "General information", - "score": 532.273631840796 - }, - "Sociology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" - } - } - }, - { - "evaluation_name": "Virology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Virology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.548, - "details": { - "description": "min=0.548, mean=0.548, max=0.548, sum=1.096 (2)", - "tab": "Accuracy", - "Virology - Observed inference time (s)": { - "description": "min=0.713, mean=0.713, max=0.713, sum=1.425 (2)", - "tab": "Efficiency", - "score": 0.7126703147428581 - }, - "Virology - # eval": { - "description": "min=166, mean=166, max=166, sum=332 (2)", - "tab": "General information", - "score": 166.0 - }, - "Virology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Virology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Virology - # prompt tokens": { - "description": "min=427.651, mean=427.651, max=427.651, sum=855.301 (2)", - "tab": "General information", - "score": 427.65060240963857 - }, - "Virology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" - } - } - }, - { - "evaluation_name": "World Religions", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on World Religions", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.784, - "details": { - "description": "min=0.784, mean=0.784, max=0.784, sum=1.567 (2)", - "tab": "Accuracy", - "World Religions - Observed inference time (s)": { - "description": "min=0.75, mean=0.75, max=0.75, sum=1.5 (2)", - "tab": "Efficiency", - "score": 0.7498089402739765 - }, - "World Religions - # eval": { - "description": "min=171, mean=171, max=171, sum=342 (2)", - "tab": "General information", - "score": 171.0 - }, - "World Religions - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "World Religions - truncated": { - "description": "min=0, mean=0, 
max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "World Religions - # prompt tokens": { - "description": "min=358.018, mean=358.018, max=358.018, sum=716.035 (2)", - "tab": "General information", - "score": 358.0175438596491 - }, - "World Religions - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" - } - } - }, - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.186, - "details": { - "tab": "Efficiency" - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_mmlu/cohere/command-r-plus/f3bccdeb-88a2-46ce-bfc9-5d5c3a7e8708.json b/data/helm_mmlu/cohere/command-r-plus/f3bccdeb-88a2-46ce-bfc9-5d5c3a7e8708.json deleted file mode 100644 index 6bebd236d..000000000 --- a/data/helm_mmlu/cohere/command-r-plus/f3bccdeb-88a2-46ce-bfc9-5d5c3a7e8708.json +++ /dev/null @@ -1,3021 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/cohere_command-r-plus/1770835937.459157", - "retrieved_timestamp": "1770835937.459157", - "source_metadata": { - "source_name": "helm_mmlu", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Command R Plus", - "id": "cohere/command-r-plus", - "developer": "cohere", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "MMLU All Subjects", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU All Subjects", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.694, - "details": { - "description": "min=0.21, mean=0.694, max=0.927, sum=79.063 (114)", - "tab": "Accuracy", - "MMLU All Subjects - Observed inference time (s)": { - "description": "min=0.199, mean=0.305, max=0.74, sum=34.817 (114)", - "tab": "Efficiency", - "score": 0.30541327600292584 - }, - "MMLU All Subjects - # eval": { - "description": "min=100, mean=246.351, max=1534, sum=28084 (114)", - "tab": "General information", - "score": 246.35087719298247 - }, - "MMLU All Subjects - # train": { - "description": "min=5, mean=5, max=5, sum=570 (114)", - "tab": "General information", - "score": 5.0 - }, - "MMLU All Subjects - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (114)", - "tab": "General information", - "score": 0.0 - }, - "MMLU All Subjects - # prompt tokens": { - "description": "min=277.047, mean=648.571, max=2823.042, sum=73937.062 (114)", - "tab": "General information", - "score": 648.5707227335503 - }, - "MMLU All Subjects - # output tokens": { - 
"description": "min=0, mean=0, max=0, sum=0 (114)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] - } - } - }, - { - "evaluation_name": "Abstract Algebra", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Abstract Algebra", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.21, - "details": { - "description": "min=0.21, mean=0.21, max=0.21, sum=0.42 (2)", - "tab": "Accuracy", - "Abstract Algebra - Observed inference time (s)": { - "description": "min=0.26, mean=0.26, max=0.26, sum=0.521 (2)", - "tab": "Efficiency", - "score": 0.2603452730178833 - }, - "Abstract Algebra - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Abstract Algebra - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Abstract Algebra - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Abstract Algebra - # prompt tokens": { - "description": "min=397.66, mean=397.66, max=397.66, sum=795.32 (2)", - "tab": "General information", - "score": 397.66 - }, - "Abstract Algebra - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" - } - } - }, - { - "evaluation_name": "Anatomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Anatomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.644, - "details": { - "description": "min=0.644, mean=0.644, max=0.644, sum=1.289 (2)", - "tab": "Accuracy", - "Anatomy - Observed inference time (s)": { - "description": "min=0.29, mean=0.29, max=0.29, sum=0.58 (2)", - "tab": "Efficiency", - "score": 0.289820040596856 - }, - "Anatomy - # eval": { - "description": "min=135, mean=135, max=135, sum=270 (2)", - "tab": "General information", - "score": 135.0 - }, - "Anatomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Anatomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Anatomy - # prompt tokens": { - "description": "min=360.096, mean=360.096, max=360.096, sum=720.193 (2)", - "tab": "General information", - "score": 360.0962962962963 - }, - "Anatomy - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" - } - } - }, - { - "evaluation_name": "College Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on College Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.52, - "details": { - "description": "min=0.52, mean=0.52, max=0.52, sum=1.039 (2)", - "tab": "Accuracy", - 
"College Chemistry - Observed inference time (s)": { - "description": "min=0.419, mean=0.419, max=0.419, sum=0.839 (2)", - "tab": "Efficiency", - "score": 0.41949598789215087 - }, - "College Biology - Observed inference time (s)": { - "description": "min=0.319, mean=0.319, max=0.319, sum=0.638 (2)", - "tab": "Efficiency", - "score": 0.3188936991824044 - }, - "College Computer Science - Observed inference time (s)": { - "description": "min=0.262, mean=0.262, max=0.262, sum=0.525 (2)", - "tab": "Efficiency", - "score": 0.262396776676178 - }, - "College Mathematics - Observed inference time (s)": { - "description": "min=0.46, mean=0.46, max=0.46, sum=0.92 (2)", - "tab": "Efficiency", - "score": 0.45980838298797605 - }, - "College Medicine - Observed inference time (s)": { - "description": "min=0.328, mean=0.328, max=0.328, sum=0.656 (2)", - "tab": "Efficiency", - "score": 0.32775250611277673 - }, - "College Physics - Observed inference time (s)": { - "description": "min=0.383, mean=0.383, max=0.383, sum=0.766 (2)", - "tab": "Efficiency", - "score": 0.38314491861006794 - }, - "College Chemistry - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Chemistry - # prompt tokens": { - "description": "min=586.57, mean=586.57, max=586.57, sum=1173.14 (2)", - "tab": "General information", - "score": 586.57 - }, - "College Chemistry - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # eval": { - "description": "min=144, mean=144, max=144, sum=288 (2)", - "tab": "General information", - "score": 144.0 - }, - "College Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # prompt tokens": { - "description": "min=496.632, mean=496.632, max=496.632, sum=993.264 (2)", - "tab": "General information", - "score": 496.63194444444446 - }, - "College Biology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer Science - # prompt tokens": { - "description": "min=869.29, mean=869.29, max=869.29, sum=1738.58 (2)", - "tab": "General information", - "score": 869.29 - }, - "College Computer Science - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Mathematics - # train": { 
- "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # prompt tokens": { - "description": "min=645.25, mean=645.25, max=645.25, sum=1290.5 (2)", - "tab": "General information", - "score": 645.25 - }, - "College Mathematics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Medicine - # eval": { - "description": "min=173, mean=173, max=173, sum=346 (2)", - "tab": "General information", - "score": 173.0 - }, - "College Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Medicine - # prompt tokens": { - "description": "min=535.092, mean=535.092, max=535.092, sum=1070.185 (2)", - "tab": "General information", - "score": 535.0924855491329 - }, - "College Medicine - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Physics - # eval": { - "description": "min=102, mean=102, max=102, sum=204 (2)", - "tab": "General information", - "score": 102.0 - }, - "College Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Physics - # prompt tokens": { - "description": "min=530.382, mean=530.382, max=530.382, sum=1060.765 (2)", - "tab": "General information", - "score": 530.3823529411765 - }, - "College Physics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" - } - } - }, - { - "evaluation_name": "Computer Security", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Computer Security", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.74, - "details": { - "description": "min=0.74, mean=0.74, max=0.74, sum=1.48 (2)", - "tab": "Accuracy", - "Computer Security - Observed inference time (s)": { - "description": "min=0.481, mean=0.481, max=0.481, sum=0.961 (2)", - "tab": "Efficiency", - "score": 0.4807459425926208 - }, - "Computer Security - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Computer Security - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Computer Security - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Computer Security - # prompt tokens": { - "description": "min=399.41, mean=399.41, 
max=399.41, sum=798.82 (2)", - "tab": "General information", - "score": 399.41 - }, - "Computer Security - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" - } - } - }, - { - "evaluation_name": "Econometrics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Econometrics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.561, - "details": { - "description": "min=0.561, mean=0.561, max=0.561, sum=1.123 (2)", - "tab": "Accuracy", - "Econometrics - Observed inference time (s)": { - "description": "min=0.339, mean=0.339, max=0.339, sum=0.679 (2)", - "tab": "Efficiency", - "score": 0.33940661162660835 - }, - "Econometrics - # eval": { - "description": "min=114, mean=114, max=114, sum=228 (2)", - "tab": "General information", - "score": 114.0 - }, - "Econometrics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Econometrics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Econometrics - # prompt tokens": { - "description": "min=661.579, mean=661.579, max=661.579, sum=1323.158 (2)", - "tab": "General information", - "score": 661.578947368421 - }, - "Econometrics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" - } - } - }, - { - "evaluation_name": "Global Facts", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Global Facts", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5, - "details": { - "description": "min=0.5, mean=0.5, max=0.5, sum=1 (2)", - "tab": "Accuracy", - "Global Facts - Observed inference time (s)": { - "description": "min=0.297, mean=0.297, max=0.297, sum=0.593 (2)", - "tab": "Efficiency", - "score": 0.2966678738594055 - }, - "Global Facts - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Global Facts - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Global Facts - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Global Facts - # prompt tokens": { - "description": "min=469.58, mean=469.58, max=469.58, sum=939.16 (2)", - "tab": "General information", - "score": 469.58 - }, - "Global Facts - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 
0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" - } - } - }, - { - "evaluation_name": "Jurisprudence", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Jurisprudence", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.806, - "details": { - "description": "min=0.806, mean=0.806, max=0.806, sum=1.611 (2)", - "tab": "Accuracy", - "Jurisprudence - Observed inference time (s)": { - "description": "min=0.288, mean=0.288, max=0.288, sum=0.577 (2)", - "tab": "Efficiency", - "score": 0.2883643927397551 - }, - "Jurisprudence - # eval": { - "description": "min=108, mean=108, max=108, sum=216 (2)", - "tab": "General information", - "score": 108.0 - }, - "Jurisprudence - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Jurisprudence - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Jurisprudence - # prompt tokens": { - "description": "min=417.944, mean=417.944, max=417.944, sum=835.889 (2)", - "tab": "General information", - "score": 417.94444444444446 - }, - "Jurisprudence - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" - } - } - }, - { - "evaluation_name": "Philosophy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Philosophy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.695, - "details": { - "description": "min=0.695, mean=0.695, max=0.695, sum=1.389 (2)", - "tab": "Accuracy", - "Philosophy - Observed inference time (s)": { - "description": "min=0.308, mean=0.308, max=0.308, sum=0.616 (2)", - "tab": "Efficiency", - "score": 0.3079479507311364 - }, - "Philosophy - # eval": { - "description": "min=311, mean=311, max=311, sum=622 (2)", - "tab": "General information", - "score": 311.0 - }, - "Philosophy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Philosophy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Philosophy - # prompt tokens": { - "description": "min=355.508, mean=355.508, max=355.508, sum=711.016 (2)", - "tab": "General information", - "score": 355.508038585209 - }, - "Philosophy - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" - } - } - }, - { - 
"evaluation_name": "Professional Psychology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Professional Psychology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.735, - "details": { - "description": "min=0.735, mean=0.735, max=0.735, sum=1.471 (2)", - "tab": "Accuracy", - "Professional Medicine - Observed inference time (s)": { - "description": "min=0.451, mean=0.451, max=0.451, sum=0.903 (2)", - "tab": "Efficiency", - "score": 0.45139760129592 - }, - "Professional Accounting - Observed inference time (s)": { - "description": "min=0.292, mean=0.292, max=0.292, sum=0.584 (2)", - "tab": "Efficiency", - "score": 0.2920728659798913 - }, - "Professional Law - Observed inference time (s)": { - "description": "min=0.406, mean=0.406, max=0.406, sum=0.811 (2)", - "tab": "Efficiency", - "score": 0.4056029599524228 - }, - "Professional Psychology - Observed inference time (s)": { - "description": "min=0.305, mean=0.305, max=0.305, sum=0.609 (2)", - "tab": "Efficiency", - "score": 0.30459034287072473 - }, - "Professional Medicine - # eval": { - "description": "min=272, mean=272, max=272, sum=544 (2)", - "tab": "General information", - "score": 272.0 - }, - "Professional Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Medicine - # prompt tokens": { - "description": "min=1143.129, mean=1143.129, max=1143.129, sum=2286.257 (2)", - "tab": "General information", - "score": 1143.1286764705883 - }, - "Professional Medicine - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Accounting - # eval": { - "description": "min=282, mean=282, max=282, sum=564 (2)", - "tab": "General information", - "score": 282.0 - }, - "Professional Accounting - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Accounting - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Accounting - # prompt tokens": { - "description": "min=767.429, mean=767.429, max=767.429, sum=1534.858 (2)", - "tab": "General information", - "score": 767.4290780141844 - }, - "Professional Accounting - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # eval": { - "description": "min=1534, mean=1534, max=1534, sum=3068 (2)", - "tab": "General information", - "score": 1534.0 - }, - "Professional Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # prompt tokens": { - "description": "min=1697.692, mean=1697.692, max=1697.692, sum=3395.385 (2)", - "tab": "General information", - "score": 1697.6923076923076 - }, - "Professional Law - # output 
tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # eval": { - "description": "min=612, mean=612, max=612, sum=1224 (2)", - "tab": "General information", - "score": 612.0 - }, - "Professional Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # prompt tokens": { - "description": "min=609.167, mean=609.167, max=609.167, sum=1218.333 (2)", - "tab": "General information", - "score": 609.1666666666666 - }, - "Professional Psychology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" - } - } - }, - { - "evaluation_name": "Us Foreign Policy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Us Foreign Policy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.89, - "details": { - "description": "min=0.89, mean=0.89, max=0.89, sum=1.78 (2)", - "tab": "Accuracy", - "Us Foreign Policy - Observed inference time (s)": { - "description": "min=0.297, mean=0.297, max=0.297, sum=0.594 (2)", - "tab": "Efficiency", - "score": 0.29705020904541013 - }, - "Us Foreign Policy - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Us Foreign Policy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Us Foreign Policy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Us Foreign Policy - # prompt tokens": { - "description": "min=452.23, mean=452.23, max=452.23, sum=904.46 (2)", - "tab": "General information", - "score": 452.23 - }, - "Us Foreign Policy - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" - } - } - }, - { - "evaluation_name": "Astronomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Astronomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.783, - "details": { - "description": "min=0.783, mean=0.783, max=0.783, sum=1.566 (2)", - "tab": "Accuracy", - "Astronomy - Observed inference time (s)": { - "description": "min=0.492, mean=0.492, max=0.492, sum=0.984 (2)", - "tab": "Efficiency", - "score": 
0.49223921016642924 - }, - "Astronomy - # eval": { - "description": "min=152, mean=152, max=152, sum=304 (2)", - "tab": "General information", - "score": 152.0 - }, - "Astronomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Astronomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Astronomy - # prompt tokens": { - "description": "min=615.276, mean=615.276, max=615.276, sum=1230.553 (2)", - "tab": "General information", - "score": 615.2763157894736 - }, - "Astronomy - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" - } - } - }, - { - "evaluation_name": "Business Ethics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Business Ethics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.77, - "details": { - "description": "min=0.77, mean=0.77, max=0.77, sum=1.54 (2)", - "tab": "Accuracy", - "Business Ethics - Observed inference time (s)": { - "description": "min=0.296, mean=0.296, max=0.296, sum=0.593 (2)", - "tab": "Efficiency", - "score": 0.2964653515815735 - }, - "Business Ethics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Business Ethics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Business Ethics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Business Ethics - # prompt tokens": { - "description": "min=582.46, mean=582.46, max=582.46, sum=1164.92 (2)", - "tab": "General information", - "score": 582.46 - }, - "Business Ethics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" - } - } - }, - { - "evaluation_name": "Clinical Knowledge", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Clinical Knowledge", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.743, - "details": { - "description": "min=0.743, mean=0.743, max=0.743, sum=1.487 (2)", - "tab": "Accuracy", - "Clinical Knowledge - Observed inference time (s)": { - "description": "min=0.337, mean=0.337, max=0.337, sum=0.675 (2)", - "tab": "Efficiency", - "score": 0.33743472009334924 - }, - "Clinical Knowledge - # eval": { - "description": "min=265, mean=265, max=265, sum=530 (2)", - "tab": "General information", - "score": 265.0 - }, - "Clinical Knowledge 
- # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Clinical Knowledge - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Clinical Knowledge - # prompt tokens": { - "description": "min=433.181, mean=433.181, max=433.181, sum=866.362 (2)", - "tab": "General information", - "score": 433.1811320754717 - }, - "Clinical Knowledge - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" - } - } - }, - { - "evaluation_name": "Conceptual Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Conceptual Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.591, - "details": { - "description": "min=0.591, mean=0.591, max=0.591, sum=1.183 (2)", - "tab": "Accuracy", - "Conceptual Physics - Observed inference time (s)": { - "description": "min=0.199, mean=0.199, max=0.199, sum=0.398 (2)", - "tab": "Efficiency", - "score": 0.19917301928743403 - }, - "Conceptual Physics - # eval": { - "description": "min=235, mean=235, max=235, sum=470 (2)", - "tab": "General information", - "score": 235.0 - }, - "Conceptual Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Conceptual Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Conceptual Physics - # prompt tokens": { - "description": "min=322.511, mean=322.511, max=322.511, sum=645.021 (2)", - "tab": "General information", - "score": 322.51063829787233 - }, - "Conceptual Physics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" - } - } - }, - { - "evaluation_name": "Electrical Engineering", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Electrical Engineering", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.71, - "details": { - "description": "min=0.71, mean=0.71, max=0.71, sum=1.421 (2)", - "tab": "Accuracy", - "Electrical Engineering - Observed inference time (s)": { - "description": "min=0.238, mean=0.238, max=0.238, sum=0.476 (2)", - "tab": "Efficiency", - "score": 0.2378004501605856 - }, - "Electrical Engineering - # eval": { - "description": "min=145, mean=145, max=145, sum=290 (2)", - "tab": "General information", - "score": 145.0 - }, - "Electrical Engineering - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - 
"tab": "General information", - "score": 5.0 - }, - "Electrical Engineering - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Electrical Engineering - # prompt tokens": { - "description": "min=494.648, mean=494.648, max=494.648, sum=989.297 (2)", - "tab": "General information", - "score": 494.64827586206894 - }, - "Electrical Engineering - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" - } - } - }, - { - "evaluation_name": "Elementary Mathematics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Elementary Mathematics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.474, - "details": { - "description": "min=0.474, mean=0.474, max=0.474, sum=0.947 (2)", - "tab": "Accuracy", - "Elementary Mathematics - Observed inference time (s)": { - "description": "min=0.256, mean=0.256, max=0.256, sum=0.512 (2)", - "tab": "Efficiency", - "score": 0.2562026693707421 - }, - "Elementary Mathematics - # eval": { - "description": "min=378, mean=378, max=378, sum=756 (2)", - "tab": "General information", - "score": 378.0 - }, - "Elementary Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Elementary Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Elementary Mathematics - # prompt tokens": { - "description": "min=609.537, mean=609.537, max=609.537, sum=1219.074 (2)", - "tab": "General information", - "score": 609.5370370370371 - }, - "Elementary Mathematics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" - } - } - }, - { - "evaluation_name": "Formal Logic", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Formal Logic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.484, - "details": { - "description": "min=0.484, mean=0.484, max=0.484, sum=0.968 (2)", - "tab": "Accuracy", - "Formal Logic - Observed inference time (s)": { - "description": "min=0.285, mean=0.285, max=0.285, sum=0.57 (2)", - "tab": "Efficiency", - "score": 0.2847565715275114 - }, - "Formal Logic - # eval": { - "description": "min=126, mean=126, max=126, sum=252 (2)", - "tab": "General information", - "score": 126.0 - }, - "Formal Logic - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - 
"Formal Logic - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Formal Logic - # prompt tokens": { - "description": "min=630.992, mean=630.992, max=630.992, sum=1261.984 (2)", - "tab": "General information", - "score": 630.9920634920635 - }, - "Formal Logic - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" - } - } - }, - { - "evaluation_name": "High School World History", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on High School World History", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.827, - "details": { - "description": "min=0.827, mean=0.827, max=0.827, sum=1.654 (2)", - "tab": "Accuracy", - "High School Biology - Observed inference time (s)": { - "description": "min=0.295, mean=0.295, max=0.295, sum=0.59 (2)", - "tab": "Efficiency", - "score": 0.29477174051346317 - }, - "High School Chemistry - Observed inference time (s)": { - "description": "min=0.225, mean=0.225, max=0.225, sum=0.451 (2)", - "tab": "Efficiency", - "score": 0.22539391071338372 - }, - "High School Computer Science - Observed inference time (s)": { - "description": "min=0.27, mean=0.27, max=0.27, sum=0.539 (2)", - "tab": "Efficiency", - "score": 0.26950850486755373 - }, - "High School European History - Observed inference time (s)": { - "description": "min=0.74, mean=0.74, max=0.74, sum=1.48 (2)", - "tab": "Efficiency", - "score": 0.7398316253315319 - }, - "High School Geography - Observed inference time (s)": { - "description": "min=0.205, mean=0.205, max=0.205, sum=0.41 (2)", - "tab": "Efficiency", - "score": 0.20521813570851027 - }, - "High School Government And Politics - Observed inference time (s)": { - "description": "min=0.243, mean=0.243, max=0.243, sum=0.487 (2)", - "tab": "Efficiency", - "score": 0.24341652430400948 - }, - "High School Macroeconomics - Observed inference time (s)": { - "description": "min=0.221, mean=0.221, max=0.221, sum=0.442 (2)", - "tab": "Efficiency", - "score": 0.2207918637838119 - }, - "High School Mathematics - Observed inference time (s)": { - "description": "min=0.296, mean=0.296, max=0.296, sum=0.592 (2)", - "tab": "Efficiency", - "score": 0.29578982988993324 - }, - "High School Microeconomics - Observed inference time (s)": { - "description": "min=0.343, mean=0.343, max=0.343, sum=0.686 (2)", - "tab": "Efficiency", - "score": 0.342765681883868 - }, - "High School Physics - Observed inference time (s)": { - "description": "min=0.279, mean=0.279, max=0.279, sum=0.558 (2)", - "tab": "Efficiency", - "score": 0.2788162073552214 - }, - "High School Psychology - Observed inference time (s)": { - "description": "min=0.249, mean=0.249, max=0.249, sum=0.499 (2)", - "tab": "Efficiency", - "score": 0.2494196336203759 - }, - "High School Statistics - Observed inference time (s)": { - "description": "min=0.286, mean=0.286, max=0.286, sum=0.572 (2)", - "tab": "Efficiency", - "score": 0.28620046377182007 - }, - "High School US History - Observed inference time 
(s)": { - "description": "min=0.467, mean=0.467, max=0.467, sum=0.934 (2)", - "tab": "Efficiency", - "score": 0.4672480844983868 - }, - "High School World History - Observed inference time (s)": { - "description": "min=0.374, mean=0.374, max=0.374, sum=0.748 (2)", - "tab": "Efficiency", - "score": 0.3738658830586365 - }, - "High School Biology - # eval": { - "description": "min=310, mean=310, max=310, sum=620 (2)", - "tab": "General information", - "score": 310.0 - }, - "High School Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Biology - # prompt tokens": { - "description": "min=527.213, mean=527.213, max=527.213, sum=1054.426 (2)", - "tab": "General information", - "score": 527.2129032258065 - }, - "High School Biology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # eval": { - "description": "min=203, mean=203, max=203, sum=406 (2)", - "tab": "General information", - "score": 203.0 - }, - "High School Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # prompt tokens": { - "description": "min=530.635, mean=530.635, max=530.635, sum=1061.271 (2)", - "tab": "General information", - "score": 530.6354679802955 - }, - "High School Chemistry - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "High School Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # prompt tokens": { - "description": "min=932.02, mean=932.02, max=932.02, sum=1864.04 (2)", - "tab": "General information", - "score": 932.02 - }, - "High School Computer Science - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School European History - # eval": { - "description": "min=165, mean=165, max=165, sum=330 (2)", - "tab": "General information", - "score": 165.0 - }, - "High School European History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School European History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School European History - # prompt tokens": { - "description": "min=2823.042, mean=2823.042, max=2823.042, sum=5646.085 (2)", - "tab": "General information", - "score": 2823.042424242424 - }, - "High School European History - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Geography - # eval": { - "description": 
"min=198, mean=198, max=198, sum=396 (2)", - "tab": "General information", - "score": 198.0 - }, - "High School Geography - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Geography - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Geography - # prompt tokens": { - "description": "min=407.818, mean=407.818, max=407.818, sum=815.636 (2)", - "tab": "General information", - "score": 407.8181818181818 - }, - "High School Geography - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # eval": { - "description": "min=193, mean=193, max=193, sum=386 (2)", - "tab": "General information", - "score": 193.0 - }, - "High School Government And Politics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Government And Politics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # prompt tokens": { - "description": "min=489.155, mean=489.155, max=489.155, sum=978.311 (2)", - "tab": "General information", - "score": 489.1554404145078 - }, - "High School Government And Politics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - # eval": { - "description": "min=390, mean=390, max=390, sum=780 (2)", - "tab": "General information", - "score": 390.0 - }, - "High School Macroeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Macroeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - # prompt tokens": { - "description": "min=407.654, mean=407.654, max=407.654, sum=815.308 (2)", - "tab": "General information", - "score": 407.65384615384613 - }, - "High School Macroeconomics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # eval": { - "description": "min=270, mean=270, max=270, sum=540 (2)", - "tab": "General information", - "score": 270.0 - }, - "High School Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # prompt tokens": { - "description": "min=589.774, mean=589.774, max=589.774, sum=1179.548 (2)", - "tab": "General information", - "score": 589.7740740740741 - }, - "High School Mathematics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Microeconomics - # eval": { - "description": "min=238, mean=238, max=238, sum=476 (2)", - "tab": "General information", - "score": 238.0 - }, - "High School Microeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Microeconomics - truncated": { - "description": "min=0, 
mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Microeconomics - # prompt tokens": { - "description": "min=428.403, mean=428.403, max=428.403, sum=856.807 (2)", - "tab": "General information", - "score": 428.4033613445378 - }, - "High School Microeconomics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # eval": { - "description": "min=151, mean=151, max=151, sum=302 (2)", - "tab": "General information", - "score": 151.0 - }, - "High School Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # prompt tokens": { - "description": "min=604.272, mean=604.272, max=604.272, sum=1208.543 (2)", - "tab": "General information", - "score": 604.2715231788079 - }, - "High School Physics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # eval": { - "description": "min=545, mean=545, max=545, sum=1090 (2)", - "tab": "General information", - "score": 545.0 - }, - "High School Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # prompt tokens": { - "description": "min=516.004, mean=516.004, max=516.004, sum=1032.007 (2)", - "tab": "General information", - "score": 516.0036697247706 - }, - "High School Psychology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Statistics - # eval": { - "description": "min=216, mean=216, max=216, sum=432 (2)", - "tab": "General information", - "score": 216.0 - }, - "High School Statistics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Statistics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Statistics - # prompt tokens": { - "description": "min=871.264, mean=871.264, max=871.264, sum=1742.528 (2)", - "tab": "General information", - "score": 871.2638888888889 - }, - "High School Statistics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # eval": { - "description": "min=204, mean=204, max=204, sum=408 (2)", - "tab": "General information", - "score": 204.0 - }, - "High School US History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School US History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # prompt tokens": { - "description": "min=2240.358, mean=2240.358, max=2240.358, sum=4480.716 (2)", - "tab": "General information", - "score": 2240.357843137255 - }, - "High School US History - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - 
"score": 0.0 - }, - "High School World History - # eval": { - "description": "min=237, mean=237, max=237, sum=474 (2)", - "tab": "General information", - "score": 237.0 - }, - "High School World History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School World History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School World History - # prompt tokens": { - "description": "min=1443.321, mean=1443.321, max=1443.321, sum=2886.641 (2)", - "tab": "General information", - "score": 1443.3206751054852 - }, - "High School World History - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" - } - } - }, - { - "evaluation_name": "Human Sexuality", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Human Sexuality", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.786, - "details": { - "description": "min=0.786, mean=0.786, max=0.786, sum=1.573 (2)", - "tab": "Accuracy", - "Human Aging - Observed inference time (s)": { - "description": "min=0.199, mean=0.199, max=0.199, sum=0.399 (2)", - "tab": "Efficiency", - "score": 0.19925055482462384 - }, - "Human Sexuality - Observed inference time (s)": { - "description": "min=0.227, mean=0.227, max=0.227, sum=0.454 (2)", - "tab": "Efficiency", - "score": 0.22696546925843217 - }, - "Human Aging - # eval": { - "description": "min=223, mean=223, max=223, sum=446 (2)", - "tab": "General information", - "score": 223.0 - }, - "Human Aging - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Aging - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Aging - # prompt tokens": { - "description": "min=336.09, mean=336.09, max=336.09, sum=672.179 (2)", - "tab": "General information", - "score": 336.0896860986547 - }, - "Human Aging - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # eval": { - "description": "min=131, mean=131, max=131, sum=262 (2)", - "tab": "General information", - "score": 131.0 - }, - "Human Sexuality - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Sexuality - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # prompt tokens": { - "description": "min=367.16, mean=367.16, max=367.16, sum=734.321 (2)", - "tab": "General information", - "score": 367.1603053435114 - }, - "Human Sexuality - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "human_sexuality", - "method": 
"multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" - } - } - }, - { - "evaluation_name": "International Law", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on International Law", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.835, - "details": { - "description": "min=0.835, mean=0.835, max=0.835, sum=1.669 (2)", - "tab": "Accuracy", - "International Law - Observed inference time (s)": { - "description": "min=0.247, mean=0.247, max=0.247, sum=0.494 (2)", - "tab": "Efficiency", - "score": 0.2467749296141065 - }, - "International Law - # eval": { - "description": "min=121, mean=121, max=121, sum=242 (2)", - "tab": "General information", - "score": 121.0 - }, - "International Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "International Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "International Law - # prompt tokens": { - "description": "min=653.612, mean=653.612, max=653.612, sum=1307.223 (2)", - "tab": "General information", - "score": 653.6115702479339 - }, - "International Law - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" - } - } - }, - { - "evaluation_name": "Logical Fallacies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Logical Fallacies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.791, - "details": { - "description": "min=0.791, mean=0.791, max=0.791, sum=1.583 (2)", - "tab": "Accuracy", - "Logical Fallacies - Observed inference time (s)": { - "description": "min=0.25, mean=0.25, max=0.25, sum=0.5 (2)", - "tab": "Efficiency", - "score": 0.24988567463459413 - }, - "Logical Fallacies - # eval": { - "description": "min=163, mean=163, max=163, sum=326 (2)", - "tab": "General information", - "score": 163.0 - }, - "Logical Fallacies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Logical Fallacies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Logical Fallacies - # prompt tokens": { - "description": "min=463.773, mean=463.773, max=463.773, sum=927.546 (2)", - "tab": "General information", - "score": 463.7730061349693 - }, - "Logical Fallacies - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" - } - } - }, - { - 
"evaluation_name": "Machine Learning", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Machine Learning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.518, - "details": { - "description": "min=0.518, mean=0.518, max=0.518, sum=1.036 (2)", - "tab": "Accuracy", - "Machine Learning - Observed inference time (s)": { - "description": "min=0.265, mean=0.265, max=0.265, sum=0.529 (2)", - "tab": "Efficiency", - "score": 0.2645062953233719 - }, - "Machine Learning - # eval": { - "description": "min=112, mean=112, max=112, sum=224 (2)", - "tab": "General information", - "score": 112.0 - }, - "Machine Learning - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Machine Learning - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Machine Learning - # prompt tokens": { - "description": "min=716.438, mean=716.438, max=716.438, sum=1432.875 (2)", - "tab": "General information", - "score": 716.4375 - }, - "Machine Learning - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" - } - } - }, - { - "evaluation_name": "Management", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Management", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.835, - "details": { - "description": "min=0.835, mean=0.835, max=0.835, sum=1.67 (2)", - "tab": "Accuracy", - "Management - Observed inference time (s)": { - "description": "min=0.204, mean=0.204, max=0.204, sum=0.409 (2)", - "tab": "Efficiency", - "score": 0.20434052735856437 - }, - "Management - # eval": { - "description": "min=103, mean=103, max=103, sum=206 (2)", - "tab": "General information", - "score": 103.0 - }, - "Management - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Management - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Management - # prompt tokens": { - "description": "min=294.456, mean=294.456, max=294.456, sum=588.913 (2)", - "tab": "General information", - "score": 294.45631067961165 - }, - "Management - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" - } - } - }, - { - "evaluation_name": "Marketing", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Marketing", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.927, - "details": { - "description": "min=0.927, mean=0.927, max=0.927, sum=1.855 (2)", - "tab": "Accuracy", - "Marketing - Observed inference time (s)": { - "description": "min=0.228, mean=0.228, max=0.228, sum=0.456 (2)", - "tab": "Efficiency", - "score": 0.22806417840158838 - }, - "Marketing - # eval": { - "description": "min=234, mean=234, max=234, sum=468 (2)", - "tab": "General information", - "score": 234.0 - }, - "Marketing - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Marketing - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Marketing - # prompt tokens": { - "description": "min=446.855, mean=446.855, max=446.855, sum=893.709 (2)", - "tab": "General information", - "score": 446.85470085470087 - }, - "Marketing - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" - } - } - }, - { - "evaluation_name": "Medical Genetics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Medical Genetics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.77, - "details": { - "description": "min=0.77, mean=0.77, max=0.77, sum=1.54 (2)", - "tab": "Accuracy", - "Medical Genetics - Observed inference time (s)": { - "description": "min=0.307, mean=0.307, max=0.307, sum=0.614 (2)", - "tab": "Efficiency", - "score": 0.3072425937652588 - }, - "Medical Genetics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Medical Genetics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Medical Genetics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Medical Genetics - # prompt tokens": { - "description": "min=357.02, mean=357.02, max=357.02, sum=714.04 (2)", - "tab": "General information", - "score": 357.02 - }, - "Medical Genetics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" - } - } - }, - { - "evaluation_name": "Miscellaneous", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Miscellaneous", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.844, - "details": { - "description": "min=0.844, mean=0.844, max=0.844, sum=1.688 (2)", - "tab": "Accuracy", - "Miscellaneous - Observed inference time (s)": { - "description": "min=0.208, mean=0.208, max=0.208, sum=0.417 (2)", - "tab": "Efficiency", - "score": 0.20840222990832566 - }, - "Miscellaneous - # eval": { - "description": "min=783, mean=783, max=783, sum=1566 (2)", - "tab": "General information", - "score": 783.0 - }, - "Miscellaneous - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Miscellaneous - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Miscellaneous - # prompt tokens": { - "description": "min=325.76, mean=325.76, max=325.76, sum=651.52 (2)", - "tab": "General information", - "score": 325.75989782886336 - }, - "Miscellaneous - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" - } - } - }, - { - "evaluation_name": "Moral Scenarios", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Moral Scenarios", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.585, - "details": { - "description": "min=0.585, mean=0.585, max=0.585, sum=1.171 (2)", - "tab": "Accuracy", - "Moral Disputes - Observed inference time (s)": { - "description": "min=0.229, mean=0.229, max=0.229, sum=0.457 (2)", - "tab": "Efficiency", - "score": 0.2285733340103502 - }, - "Moral Scenarios - Observed inference time (s)": { - "description": "min=0.282, mean=0.282, max=0.282, sum=0.564 (2)", - "tab": "Efficiency", - "score": 0.2819661257653263 - }, - "Moral Disputes - # eval": { - "description": "min=346, mean=346, max=346, sum=692 (2)", - "tab": "General information", - "score": 346.0 - }, - "Moral Disputes - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Disputes - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Disputes - # prompt tokens": { - "description": "min=506.78, mean=506.78, max=506.78, sum=1013.561 (2)", - "tab": "General information", - "score": 506.78034682080926 - }, - "Moral Disputes - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # eval": { - "description": "min=895, mean=895, max=895, sum=1790 (2)", - "tab": "General information", - "score": 895.0 - }, - "Moral Scenarios - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Scenarios - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # prompt tokens": { - "description": "min=699.344, mean=699.344, max=699.344, 
sum=1398.688 (2)", - "tab": "General information", - "score": 699.3441340782123 - }, - "Moral Scenarios - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" - } - } - }, - { - "evaluation_name": "Nutrition", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Nutrition", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.742, - "details": { - "description": "min=0.742, mean=0.742, max=0.742, sum=1.484 (2)", - "tab": "Accuracy", - "Nutrition - Observed inference time (s)": { - "description": "min=0.282, mean=0.282, max=0.282, sum=0.563 (2)", - "tab": "Efficiency", - "score": 0.2817091388640061 - }, - "Nutrition - # eval": { - "description": "min=306, mean=306, max=306, sum=612 (2)", - "tab": "General information", - "score": 306.0 - }, - "Nutrition - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Nutrition - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Nutrition - # prompt tokens": { - "description": "min=618.402, mean=618.402, max=618.402, sum=1236.804 (2)", - "tab": "General information", - "score": 618.4019607843137 - }, - "Nutrition - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" - } - } - }, - { - "evaluation_name": "Prehistory", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Prehistory", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.821, - "details": { - "description": "min=0.821, mean=0.821, max=0.821, sum=1.642 (2)", - "tab": "Accuracy", - "Prehistory - Observed inference time (s)": { - "description": "min=0.287, mean=0.287, max=0.287, sum=0.574 (2)", - "tab": "Efficiency", - "score": 0.2871434423658583 - }, - "Prehistory - # eval": { - "description": "min=324, mean=324, max=324, sum=648 (2)", - "tab": "General information", - "score": 324.0 - }, - "Prehistory - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Prehistory - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Prehistory - # prompt tokens": { - "description": "min=549.235, mean=549.235, max=549.235, sum=1098.469 (2)", - "tab": "General information", - "score": 549.2345679012345 - }, - "Prehistory - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - 
"generation_config": { - "additional_details": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" - } - } - }, - { - "evaluation_name": "Public Relations", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Public Relations", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.709, - "details": { - "description": "min=0.709, mean=0.709, max=0.709, sum=1.418 (2)", - "tab": "Accuracy", - "Public Relations - Observed inference time (s)": { - "description": "min=0.278, mean=0.278, max=0.278, sum=0.557 (2)", - "tab": "Efficiency", - "score": 0.27829633842815055 - }, - "Public Relations - # eval": { - "description": "min=110, mean=110, max=110, sum=220 (2)", - "tab": "General information", - "score": 110.0 - }, - "Public Relations - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Public Relations - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Public Relations - # prompt tokens": { - "description": "min=434.682, mean=434.682, max=434.682, sum=869.364 (2)", - "tab": "General information", - "score": 434.6818181818182 - }, - "Public Relations - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" - } - } - }, - { - "evaluation_name": "Security Studies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Security Studies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.751, - "details": { - "description": "min=0.751, mean=0.751, max=0.751, sum=1.502 (2)", - "tab": "Accuracy", - "Security Studies - Observed inference time (s)": { - "description": "min=0.345, mean=0.345, max=0.345, sum=0.69 (2)", - "tab": "Efficiency", - "score": 0.3448335861673161 - }, - "Security Studies - # eval": { - "description": "min=245, mean=245, max=245, sum=490 (2)", - "tab": "General information", - "score": 245.0 - }, - "Security Studies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Security Studies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Security Studies - # prompt tokens": { - "description": "min=1207.494, mean=1207.494, max=1207.494, sum=2414.988 (2)", - "tab": "General information", - "score": 1207.4938775510204 - }, - "Security Studies - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "security_studies", - "method": "multiple_choice_joint", - 
"eval_split": "test", - "groups": "mmlu_security_studies" - } - } - }, - { - "evaluation_name": "Sociology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Sociology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.876, - "details": { - "description": "min=0.876, mean=0.876, max=0.876, sum=1.751 (2)", - "tab": "Accuracy", - "Sociology - Observed inference time (s)": { - "description": "min=0.296, mean=0.296, max=0.296, sum=0.591 (2)", - "tab": "Efficiency", - "score": 0.2956119153037 - }, - "Sociology - # eval": { - "description": "min=201, mean=201, max=201, sum=402 (2)", - "tab": "General information", - "score": 201.0 - }, - "Sociology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Sociology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Sociology - # prompt tokens": { - "description": "min=467.343, mean=467.343, max=467.343, sum=934.687 (2)", - "tab": "General information", - "score": 467.34328358208955 - }, - "Sociology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" - } - } - }, - { - "evaluation_name": "Virology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Virology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.56, - "details": { - "description": "min=0.56, mean=0.56, max=0.56, sum=1.12 (2)", - "tab": "Accuracy", - "Virology - Observed inference time (s)": { - "description": "min=0.287, mean=0.287, max=0.287, sum=0.575 (2)", - "tab": "Efficiency", - "score": 0.2874818997210767 - }, - "Virology - # eval": { - "description": "min=166, mean=166, max=166, sum=332 (2)", - "tab": "General information", - "score": 166.0 - }, - "Virology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Virology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Virology - # prompt tokens": { - "description": "min=352.861, mean=352.861, max=352.861, sum=705.723 (2)", - "tab": "General information", - "score": 352.8614457831325 - }, - "Virology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" - } - } - }, - { - "evaluation_name": "World Religions", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on World Religions", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.842, - "details": { - "description": "min=0.842, mean=0.842, max=0.842, sum=1.684 (2)", - "tab": "Accuracy", - "World Religions - Observed inference time (s)": { - "description": "min=0.205, mean=0.205, max=0.205, sum=0.41 (2)", - "tab": "Efficiency", - "score": 0.20489408119380126 - }, - "World Religions - # eval": { - "description": "min=171, mean=171, max=171, sum=342 (2)", - "tab": "General information", - "score": 171.0 - }, - "World Religions - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "World Religions - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "World Religions - # prompt tokens": { - "description": "min=277.047, mean=277.047, max=277.047, sum=554.094 (2)", - "tab": "General information", - "score": 277.046783625731 - }, - "World Religions - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" - } - } - }, - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.825, - "details": { - "tab": "Efficiency" - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_mmlu/cohere/command-r/cefc3b25-0779-4fb3-93a5-3c7a285304af.json b/data/helm_mmlu/cohere/command-r/cefc3b25-0779-4fb3-93a5-3c7a285304af.json deleted file mode 100644 index e82639d82..000000000 --- a/data/helm_mmlu/cohere/command-r/cefc3b25-0779-4fb3-93a5-3c7a285304af.json +++ /dev/null @@ -1,3021 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/cohere_command-r/1770835937.459157", - "retrieved_timestamp": "1770835937.459157", - "source_metadata": { - "source_name": "helm_mmlu", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Command R", - "id": "cohere/command-r", - "developer": "cohere", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "MMLU All Subjects", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU All Subjects", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.652, - "details": { - "description": "min=0.326, 
mean=0.652, max=0.891, sum=74.329 (114)", - "tab": "Accuracy", - "MMLU All Subjects - Observed inference time (s)": { - "description": "min=0.145, mean=0.176, max=0.289, sum=20.061 (114)", - "tab": "Efficiency", - "score": 0.17597788408479575 - }, - "MMLU All Subjects - # eval": { - "description": "min=100, mean=246.351, max=1534, sum=28084 (114)", - "tab": "General information", - "score": 246.35087719298247 - }, - "MMLU All Subjects - # train": { - "description": "min=5, mean=5, max=5, sum=570 (114)", - "tab": "General information", - "score": 5.0 - }, - "MMLU All Subjects - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (114)", - "tab": "General information", - "score": 0.0 - }, - "MMLU All Subjects - # prompt tokens": { - "description": "min=277.047, mean=648.571, max=2823.042, sum=73937.062 (114)", - "tab": "General information", - "score": 648.5707227335503 - }, - "MMLU All Subjects - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (114)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - 
"mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] - } - } - }, - { - "evaluation_name": "Abstract Algebra", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Abstract Algebra", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.33, - "details": { - "description": "min=0.33, mean=0.33, max=0.33, sum=0.66 (2)", - "tab": "Accuracy", - "Abstract Algebra - Observed inference time (s)": { - "description": "min=0.162, mean=0.162, max=0.162, sum=0.324 (2)", - "tab": "Efficiency", - "score": 0.1620460057258606 - }, - "Abstract Algebra - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Abstract Algebra - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Abstract Algebra - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Abstract Algebra - # prompt tokens": { - "description": "min=397.66, mean=397.66, max=397.66, sum=795.32 (2)", - "tab": "General information", - "score": 397.66 - }, - "Abstract Algebra - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" - } - } - }, - { - "evaluation_name": "Anatomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Anatomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.615, - "details": { - "description": "min=0.615, mean=0.615, max=0.615, sum=1.23 (2)", - "tab": "Accuracy", - "Anatomy - Observed inference time (s)": { - "description": "min=0.157, mean=0.157, max=0.157, sum=0.314 (2)", - "tab": "Efficiency", - "score": 0.15700986297042283 - }, - "Anatomy - # eval": { - "description": "min=135, mean=135, max=135, sum=270 (2)", - "tab": "General information", - "score": 135.0 - }, - "Anatomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Anatomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Anatomy - # prompt tokens": { - "description": "min=360.096, mean=360.096, max=360.096, sum=720.193 (2)", - "tab": "General 
information", - "score": 360.0962962962963 - }, - "Anatomy - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" - } - } - }, - { - "evaluation_name": "College Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on College Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.382, - "details": { - "description": "min=0.382, mean=0.382, max=0.382, sum=0.765 (2)", - "tab": "Accuracy", - "College Chemistry - Observed inference time (s)": { - "description": "min=0.185, mean=0.185, max=0.185, sum=0.37 (2)", - "tab": "Efficiency", - "score": 0.18501442193984985 - }, - "College Biology - Observed inference time (s)": { - "description": "min=0.163, mean=0.163, max=0.163, sum=0.325 (2)", - "tab": "Efficiency", - "score": 0.1627496729294459 - }, - "College Computer Science - Observed inference time (s)": { - "description": "min=0.182, mean=0.182, max=0.182, sum=0.363 (2)", - "tab": "Efficiency", - "score": 0.18159597158432006 - }, - "College Mathematics - Observed inference time (s)": { - "description": "min=0.173, mean=0.173, max=0.173, sum=0.346 (2)", - "tab": "Efficiency", - "score": 0.17305777072906495 - }, - "College Medicine - Observed inference time (s)": { - "description": "min=0.167, mean=0.167, max=0.167, sum=0.334 (2)", - "tab": "Efficiency", - "score": 0.1671100668824477 - }, - "College Physics - Observed inference time (s)": { - "description": "min=0.169, mean=0.169, max=0.169, sum=0.339 (2)", - "tab": "Efficiency", - "score": 0.16945467041988 - }, - "College Chemistry - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Chemistry - # prompt tokens": { - "description": "min=586.57, mean=586.57, max=586.57, sum=1173.14 (2)", - "tab": "General information", - "score": 586.57 - }, - "College Chemistry - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # eval": { - "description": "min=144, mean=144, max=144, sum=288 (2)", - "tab": "General information", - "score": 144.0 - }, - "College Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # prompt tokens": { - "description": "min=496.632, mean=496.632, max=496.632, sum=993.264 (2)", - "tab": "General information", - "score": 496.63194444444446 - }, - "College Biology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer 
Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer Science - # prompt tokens": { - "description": "min=869.29, mean=869.29, max=869.29, sum=1738.58 (2)", - "tab": "General information", - "score": 869.29 - }, - "College Computer Science - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # prompt tokens": { - "description": "min=645.25, mean=645.25, max=645.25, sum=1290.5 (2)", - "tab": "General information", - "score": 645.25 - }, - "College Mathematics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Medicine - # eval": { - "description": "min=173, mean=173, max=173, sum=346 (2)", - "tab": "General information", - "score": 173.0 - }, - "College Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Medicine - # prompt tokens": { - "description": "min=535.092, mean=535.092, max=535.092, sum=1070.185 (2)", - "tab": "General information", - "score": 535.0924855491329 - }, - "College Medicine - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Physics - # eval": { - "description": "min=102, mean=102, max=102, sum=204 (2)", - "tab": "General information", - "score": 102.0 - }, - "College Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Physics - # prompt tokens": { - "description": "min=530.382, mean=530.382, max=530.382, sum=1060.765 (2)", - "tab": "General information", - "score": 530.3823529411765 - }, - "College Physics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" - } - } - }, - { - "evaluation_name": "Computer Security", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Computer 
Security", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.78, - "details": { - "description": "min=0.78, mean=0.78, max=0.78, sum=1.56 (2)", - "tab": "Accuracy", - "Computer Security - Observed inference time (s)": { - "description": "min=0.163, mean=0.163, max=0.163, sum=0.327 (2)", - "tab": "Efficiency", - "score": 0.16325130462646484 - }, - "Computer Security - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Computer Security - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Computer Security - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Computer Security - # prompt tokens": { - "description": "min=399.41, mean=399.41, max=399.41, sum=798.82 (2)", - "tab": "General information", - "score": 399.41 - }, - "Computer Security - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" - } - } - }, - { - "evaluation_name": "Econometrics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Econometrics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.456, - "details": { - "description": "min=0.456, mean=0.456, max=0.456, sum=0.912 (2)", - "tab": "Accuracy", - "Econometrics - Observed inference time (s)": { - "description": "min=0.174, mean=0.174, max=0.174, sum=0.347 (2)", - "tab": "Efficiency", - "score": 0.17368793905827037 - }, - "Econometrics - # eval": { - "description": "min=114, mean=114, max=114, sum=228 (2)", - "tab": "General information", - "score": 114.0 - }, - "Econometrics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Econometrics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Econometrics - # prompt tokens": { - "description": "min=661.579, mean=661.579, max=661.579, sum=1323.158 (2)", - "tab": "General information", - "score": 661.578947368421 - }, - "Econometrics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" - } - } - }, - { - "evaluation_name": "Global Facts", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Global Facts", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.42, - "details": { - "description": 
"min=0.42, mean=0.42, max=0.42, sum=0.84 (2)", - "tab": "Accuracy", - "Global Facts - Observed inference time (s)": { - "description": "min=0.166, mean=0.166, max=0.166, sum=0.332 (2)", - "tab": "Efficiency", - "score": 0.16606518507003784 - }, - "Global Facts - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Global Facts - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Global Facts - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Global Facts - # prompt tokens": { - "description": "min=469.58, mean=469.58, max=469.58, sum=939.16 (2)", - "tab": "General information", - "score": 469.58 - }, - "Global Facts - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" - } - } - }, - { - "evaluation_name": "Jurisprudence", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Jurisprudence", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.796, - "details": { - "description": "min=0.796, mean=0.796, max=0.796, sum=1.593 (2)", - "tab": "Accuracy", - "Jurisprudence - Observed inference time (s)": { - "description": "min=0.16, mean=0.16, max=0.16, sum=0.319 (2)", - "tab": "Efficiency", - "score": 0.15962726098519783 - }, - "Jurisprudence - # eval": { - "description": "min=108, mean=108, max=108, sum=216 (2)", - "tab": "General information", - "score": 108.0 - }, - "Jurisprudence - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Jurisprudence - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Jurisprudence - # prompt tokens": { - "description": "min=417.944, mean=417.944, max=417.944, sum=835.889 (2)", - "tab": "General information", - "score": 417.94444444444446 - }, - "Jurisprudence - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" - } - } - }, - { - "evaluation_name": "Philosophy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Philosophy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.685, - "details": { - "description": "min=0.685, mean=0.685, max=0.685, sum=1.37 (2)", - "tab": "Accuracy", - "Philosophy - Observed inference time (s)": { - "description": "min=0.154, mean=0.154, max=0.154, sum=0.307 (2)", - "tab": "Efficiency", - 
"score": 0.1535167272451223 - }, - "Philosophy - # eval": { - "description": "min=311, mean=311, max=311, sum=622 (2)", - "tab": "General information", - "score": 311.0 - }, - "Philosophy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Philosophy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Philosophy - # prompt tokens": { - "description": "min=355.508, mean=355.508, max=355.508, sum=711.016 (2)", - "tab": "General information", - "score": 355.508038585209 - }, - "Philosophy - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" - } - } - }, - { - "evaluation_name": "Professional Psychology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Professional Psychology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.681, - "details": { - "description": "min=0.681, mean=0.681, max=0.681, sum=1.363 (2)", - "tab": "Accuracy", - "Professional Medicine - Observed inference time (s)": { - "description": "min=0.195, mean=0.195, max=0.195, sum=0.389 (2)", - "tab": "Efficiency", - "score": 0.19464709828881657 - }, - "Professional Accounting - Observed inference time (s)": { - "description": "min=0.177, mean=0.177, max=0.177, sum=0.354 (2)", - "tab": "Efficiency", - "score": 0.1770885929148248 - }, - "Professional Law - Observed inference time (s)": { - "description": "min=0.234, mean=0.234, max=0.234, sum=0.469 (2)", - "tab": "Efficiency", - "score": 0.23427105509473262 - }, - "Professional Psychology - Observed inference time (s)": { - "description": "min=0.211, mean=0.211, max=0.211, sum=0.423 (2)", - "tab": "Efficiency", - "score": 0.2114220471943126 - }, - "Professional Medicine - # eval": { - "description": "min=272, mean=272, max=272, sum=544 (2)", - "tab": "General information", - "score": 272.0 - }, - "Professional Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Medicine - # prompt tokens": { - "description": "min=1143.129, mean=1143.129, max=1143.129, sum=2286.257 (2)", - "tab": "General information", - "score": 1143.1286764705883 - }, - "Professional Medicine - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Accounting - # eval": { - "description": "min=282, mean=282, max=282, sum=564 (2)", - "tab": "General information", - "score": 282.0 - }, - "Professional Accounting - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Accounting - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Accounting - # prompt 
tokens": { - "description": "min=767.429, mean=767.429, max=767.429, sum=1534.858 (2)", - "tab": "General information", - "score": 767.4290780141844 - }, - "Professional Accounting - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # eval": { - "description": "min=1534, mean=1534, max=1534, sum=3068 (2)", - "tab": "General information", - "score": 1534.0 - }, - "Professional Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # prompt tokens": { - "description": "min=1697.692, mean=1697.692, max=1697.692, sum=3395.385 (2)", - "tab": "General information", - "score": 1697.6923076923076 - }, - "Professional Law - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # eval": { - "description": "min=612, mean=612, max=612, sum=1224 (2)", - "tab": "General information", - "score": 612.0 - }, - "Professional Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # prompt tokens": { - "description": "min=609.167, mean=609.167, max=609.167, sum=1218.333 (2)", - "tab": "General information", - "score": 609.1666666666666 - }, - "Professional Psychology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" - } - } - }, - { - "evaluation_name": "Us Foreign Policy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Us Foreign Policy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.82, - "details": { - "description": "min=0.82, mean=0.82, max=0.82, sum=1.64 (2)", - "tab": "Accuracy", - "Us Foreign Policy - Observed inference time (s)": { - "description": "min=0.183, mean=0.183, max=0.183, sum=0.366 (2)", - "tab": "Efficiency", - "score": 0.18277841329574585 - }, - "Us Foreign Policy - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Us Foreign Policy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Us Foreign Policy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Us Foreign Policy - # prompt tokens": { - "description": "min=452.23, mean=452.23, max=452.23, sum=904.46 (2)", - "tab": "General information", - "score": 452.23 - }, - "Us Foreign Policy - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General 
information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" - } - } - }, - { - "evaluation_name": "Astronomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Astronomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.743, - "details": { - "description": "min=0.743, mean=0.743, max=0.743, sum=1.487 (2)", - "tab": "Accuracy", - "Astronomy - Observed inference time (s)": { - "description": "min=0.223, mean=0.223, max=0.223, sum=0.446 (2)", - "tab": "Efficiency", - "score": 0.22317567624543844 - }, - "Astronomy - # eval": { - "description": "min=152, mean=152, max=152, sum=304 (2)", - "tab": "General information", - "score": 152.0 - }, - "Astronomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Astronomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Astronomy - # prompt tokens": { - "description": "min=615.276, mean=615.276, max=615.276, sum=1230.553 (2)", - "tab": "General information", - "score": 615.2763157894736 - }, - "Astronomy - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" - } - } - }, - { - "evaluation_name": "Business Ethics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Business Ethics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.63, - "details": { - "description": "min=0.63, mean=0.63, max=0.63, sum=1.26 (2)", - "tab": "Accuracy", - "Business Ethics - Observed inference time (s)": { - "description": "min=0.17, mean=0.17, max=0.17, sum=0.34 (2)", - "tab": "Efficiency", - "score": 0.16991474628448486 - }, - "Business Ethics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Business Ethics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Business Ethics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Business Ethics - # prompt tokens": { - "description": "min=582.46, mean=582.46, max=582.46, sum=1164.92 (2)", - "tab": "General information", - "score": 582.46 - }, - "Business Ethics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" - 
} - } - }, - { - "evaluation_name": "Clinical Knowledge", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Clinical Knowledge", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.751, - "details": { - "description": "min=0.751, mean=0.751, max=0.751, sum=1.502 (2)", - "tab": "Accuracy", - "Clinical Knowledge - Observed inference time (s)": { - "description": "min=0.186, mean=0.186, max=0.186, sum=0.371 (2)", - "tab": "Efficiency", - "score": 0.1857448289979179 - }, - "Clinical Knowledge - # eval": { - "description": "min=265, mean=265, max=265, sum=530 (2)", - "tab": "General information", - "score": 265.0 - }, - "Clinical Knowledge - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Clinical Knowledge - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Clinical Knowledge - # prompt tokens": { - "description": "min=433.181, mean=433.181, max=433.181, sum=866.362 (2)", - "tab": "General information", - "score": 433.1811320754717 - }, - "Clinical Knowledge - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" - } - } - }, - { - "evaluation_name": "Conceptual Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Conceptual Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.528, - "details": { - "description": "min=0.528, mean=0.528, max=0.528, sum=1.055 (2)", - "tab": "Accuracy", - "Conceptual Physics - Observed inference time (s)": { - "description": "min=0.146, mean=0.146, max=0.146, sum=0.293 (2)", - "tab": "Efficiency", - "score": 0.14639884360293123 - }, - "Conceptual Physics - # eval": { - "description": "min=235, mean=235, max=235, sum=470 (2)", - "tab": "General information", - "score": 235.0 - }, - "Conceptual Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Conceptual Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Conceptual Physics - # prompt tokens": { - "description": "min=322.511, mean=322.511, max=322.511, sum=645.021 (2)", - "tab": "General information", - "score": 322.51063829787233 - }, - "Conceptual Physics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" - } - } - }, - { - "evaluation_name": "Electrical Engineering", - "source_data": { - 
"dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Electrical Engineering", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.593, - "details": { - "description": "min=0.593, mean=0.593, max=0.593, sum=1.186 (2)", - "tab": "Accuracy", - "Electrical Engineering - Observed inference time (s)": { - "description": "min=0.164, mean=0.164, max=0.164, sum=0.329 (2)", - "tab": "Efficiency", - "score": 0.16444927248461494 - }, - "Electrical Engineering - # eval": { - "description": "min=145, mean=145, max=145, sum=290 (2)", - "tab": "General information", - "score": 145.0 - }, - "Electrical Engineering - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Electrical Engineering - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Electrical Engineering - # prompt tokens": { - "description": "min=494.648, mean=494.648, max=494.648, sum=989.297 (2)", - "tab": "General information", - "score": 494.64827586206894 - }, - "Electrical Engineering - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" - } - } - }, - { - "evaluation_name": "Elementary Mathematics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Elementary Mathematics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.437, - "details": { - "description": "min=0.437, mean=0.437, max=0.437, sum=0.873 (2)", - "tab": "Accuracy", - "Elementary Mathematics - Observed inference time (s)": { - "description": "min=0.171, mean=0.171, max=0.171, sum=0.342 (2)", - "tab": "Efficiency", - "score": 0.17102001079175838 - }, - "Elementary Mathematics - # eval": { - "description": "min=378, mean=378, max=378, sum=756 (2)", - "tab": "General information", - "score": 378.0 - }, - "Elementary Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Elementary Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Elementary Mathematics - # prompt tokens": { - "description": "min=609.537, mean=609.537, max=609.537, sum=1219.074 (2)", - "tab": "General information", - "score": 609.5370370370371 - }, - "Elementary Mathematics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" - } - } - }, - { - "evaluation_name": "Formal Logic", - "source_data": { - 
"dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Formal Logic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.405, - "details": { - "description": "min=0.405, mean=0.405, max=0.405, sum=0.81 (2)", - "tab": "Accuracy", - "Formal Logic - Observed inference time (s)": { - "description": "min=0.178, mean=0.178, max=0.178, sum=0.357 (2)", - "tab": "Efficiency", - "score": 0.17840472289494105 - }, - "Formal Logic - # eval": { - "description": "min=126, mean=126, max=126, sum=252 (2)", - "tab": "General information", - "score": 126.0 - }, - "Formal Logic - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Formal Logic - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Formal Logic - # prompt tokens": { - "description": "min=630.992, mean=630.992, max=630.992, sum=1261.984 (2)", - "tab": "General information", - "score": 630.9920634920635 - }, - "Formal Logic - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" - } - } - }, - { - "evaluation_name": "High School World History", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on High School World History", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.84, - "details": { - "description": "min=0.84, mean=0.84, max=0.84, sum=1.679 (2)", - "tab": "Accuracy", - "High School Biology - Observed inference time (s)": { - "description": "min=0.168, mean=0.168, max=0.168, sum=0.337 (2)", - "tab": "Efficiency", - "score": 0.16835398827829667 - }, - "High School Chemistry - Observed inference time (s)": { - "description": "min=0.171, mean=0.171, max=0.171, sum=0.341 (2)", - "tab": "Efficiency", - "score": 0.17066421649726154 - }, - "High School Computer Science - Observed inference time (s)": { - "description": "min=0.183, mean=0.183, max=0.183, sum=0.367 (2)", - "tab": "Efficiency", - "score": 0.1834348964691162 - }, - "High School European History - Observed inference time (s)": { - "description": "min=0.289, mean=0.289, max=0.289, sum=0.577 (2)", - "tab": "Efficiency", - "score": 0.28851397543242485 - }, - "High School Geography - Observed inference time (s)": { - "description": "min=0.155, mean=0.155, max=0.155, sum=0.31 (2)", - "tab": "Efficiency", - "score": 0.15488721987213752 - }, - "High School Government And Politics - Observed inference time (s)": { - "description": "min=0.169, mean=0.169, max=0.169, sum=0.338 (2)", - "tab": "Efficiency", - "score": 0.16877420331530002 - }, - "High School Macroeconomics - Observed inference time (s)": { - "description": "min=0.159, mean=0.159, max=0.159, sum=0.318 (2)", - "tab": "Efficiency", - "score": 0.1590262247965886 - }, - "High School 
Mathematics - Observed inference time (s)": { - "description": "min=0.169, mean=0.169, max=0.169, sum=0.337 (2)", - "tab": "Efficiency", - "score": 0.1685257187596074 - }, - "High School Microeconomics - Observed inference time (s)": { - "description": "min=0.156, mean=0.156, max=0.156, sum=0.312 (2)", - "tab": "Efficiency", - "score": 0.1562105868043018 - }, - "High School Physics - Observed inference time (s)": { - "description": "min=0.165, mean=0.165, max=0.165, sum=0.33 (2)", - "tab": "Efficiency", - "score": 0.16475912277272206 - }, - "High School Psychology - Observed inference time (s)": { - "description": "min=0.169, mean=0.169, max=0.169, sum=0.339 (2)", - "tab": "Efficiency", - "score": 0.16945652830491373 - }, - "High School Statistics - Observed inference time (s)": { - "description": "min=0.184, mean=0.184, max=0.184, sum=0.368 (2)", - "tab": "Efficiency", - "score": 0.18419962348761382 - }, - "High School US History - Observed inference time (s)": { - "description": "min=0.285, mean=0.285, max=0.285, sum=0.571 (2)", - "tab": "Efficiency", - "score": 0.28542132938609405 - }, - "High School World History - Observed inference time (s)": { - "description": "min=0.216, mean=0.216, max=0.216, sum=0.433 (2)", - "tab": "Efficiency", - "score": 0.21634829698232658 - }, - "High School Biology - # eval": { - "description": "min=310, mean=310, max=310, sum=620 (2)", - "tab": "General information", - "score": 310.0 - }, - "High School Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Biology - # prompt tokens": { - "description": "min=527.213, mean=527.213, max=527.213, sum=1054.426 (2)", - "tab": "General information", - "score": 527.2129032258065 - }, - "High School Biology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # eval": { - "description": "min=203, mean=203, max=203, sum=406 (2)", - "tab": "General information", - "score": 203.0 - }, - "High School Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # prompt tokens": { - "description": "min=530.635, mean=530.635, max=530.635, sum=1061.271 (2)", - "tab": "General information", - "score": 530.6354679802955 - }, - "High School Chemistry - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "High School Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # prompt tokens": { - "description": "min=932.02, mean=932.02, max=932.02, sum=1864.04 (2)", - "tab": "General information", - "score": 932.02 - }, - "High School Computer Science - # output tokens": { - 
"description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School European History - # eval": { - "description": "min=165, mean=165, max=165, sum=330 (2)", - "tab": "General information", - "score": 165.0 - }, - "High School European History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School European History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School European History - # prompt tokens": { - "description": "min=2823.042, mean=2823.042, max=2823.042, sum=5646.085 (2)", - "tab": "General information", - "score": 2823.042424242424 - }, - "High School European History - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Geography - # eval": { - "description": "min=198, mean=198, max=198, sum=396 (2)", - "tab": "General information", - "score": 198.0 - }, - "High School Geography - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Geography - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Geography - # prompt tokens": { - "description": "min=407.818, mean=407.818, max=407.818, sum=815.636 (2)", - "tab": "General information", - "score": 407.8181818181818 - }, - "High School Geography - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # eval": { - "description": "min=193, mean=193, max=193, sum=386 (2)", - "tab": "General information", - "score": 193.0 - }, - "High School Government And Politics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Government And Politics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # prompt tokens": { - "description": "min=489.155, mean=489.155, max=489.155, sum=978.311 (2)", - "tab": "General information", - "score": 489.1554404145078 - }, - "High School Government And Politics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - # eval": { - "description": "min=390, mean=390, max=390, sum=780 (2)", - "tab": "General information", - "score": 390.0 - }, - "High School Macroeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Macroeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - # prompt tokens": { - "description": "min=407.654, mean=407.654, max=407.654, sum=815.308 (2)", - "tab": "General information", - "score": 407.65384615384613 - }, - "High School Macroeconomics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # eval": { - "description": "min=270, mean=270, max=270, sum=540 (2)", - "tab": "General information", - "score": 270.0 - }, - "High School Mathematics - # train": 
{ - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # prompt tokens": { - "description": "min=589.774, mean=589.774, max=589.774, sum=1179.548 (2)", - "tab": "General information", - "score": 589.7740740740741 - }, - "High School Mathematics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Microeconomics - # eval": { - "description": "min=238, mean=238, max=238, sum=476 (2)", - "tab": "General information", - "score": 238.0 - }, - "High School Microeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Microeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Microeconomics - # prompt tokens": { - "description": "min=428.403, mean=428.403, max=428.403, sum=856.807 (2)", - "tab": "General information", - "score": 428.4033613445378 - }, - "High School Microeconomics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # eval": { - "description": "min=151, mean=151, max=151, sum=302 (2)", - "tab": "General information", - "score": 151.0 - }, - "High School Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # prompt tokens": { - "description": "min=604.272, mean=604.272, max=604.272, sum=1208.543 (2)", - "tab": "General information", - "score": 604.2715231788079 - }, - "High School Physics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # eval": { - "description": "min=545, mean=545, max=545, sum=1090 (2)", - "tab": "General information", - "score": 545.0 - }, - "High School Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # prompt tokens": { - "description": "min=516.004, mean=516.004, max=516.004, sum=1032.007 (2)", - "tab": "General information", - "score": 516.0036697247706 - }, - "High School Psychology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Statistics - # eval": { - "description": "min=216, mean=216, max=216, sum=432 (2)", - "tab": "General information", - "score": 216.0 - }, - "High School Statistics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Statistics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Statistics - # prompt tokens": { - "description": "min=871.264, mean=871.264, max=871.264, sum=1742.528 (2)", - "tab": "General 
information", - "score": 871.2638888888889 - }, - "High School Statistics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # eval": { - "description": "min=204, mean=204, max=204, sum=408 (2)", - "tab": "General information", - "score": 204.0 - }, - "High School US History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School US History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # prompt tokens": { - "description": "min=2240.358, mean=2240.358, max=2240.358, sum=4480.716 (2)", - "tab": "General information", - "score": 2240.357843137255 - }, - "High School US History - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School World History - # eval": { - "description": "min=237, mean=237, max=237, sum=474 (2)", - "tab": "General information", - "score": 237.0 - }, - "High School World History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School World History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School World History - # prompt tokens": { - "description": "min=1443.321, mean=1443.321, max=1443.321, sum=2886.641 (2)", - "tab": "General information", - "score": 1443.3206751054852 - }, - "High School World History - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" - } - } - }, - { - "evaluation_name": "Human Sexuality", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Human Sexuality", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.763, - "details": { - "description": "min=0.763, mean=0.763, max=0.763, sum=1.527 (2)", - "tab": "Accuracy", - "Human Aging - Observed inference time (s)": { - "description": "min=0.154, mean=0.154, max=0.154, sum=0.308 (2)", - "tab": "Efficiency", - "score": 0.15405324649383134 - }, - "Human Sexuality - Observed inference time (s)": { - "description": "min=0.159, mean=0.159, max=0.159, sum=0.318 (2)", - "tab": "Efficiency", - "score": 0.15911357275402274 - }, - "Human Aging - # eval": { - "description": "min=223, mean=223, max=223, sum=446 (2)", - "tab": "General information", - "score": 223.0 - }, - "Human Aging - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Aging - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Aging - # prompt tokens": { - "description": "min=336.09, mean=336.09, max=336.09, sum=672.179 (2)", - "tab": "General information", - "score": 336.0896860986547 - }, - "Human Aging - # 
output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # eval": { - "description": "min=131, mean=131, max=131, sum=262 (2)", - "tab": "General information", - "score": 131.0 - }, - "Human Sexuality - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Sexuality - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # prompt tokens": { - "description": "min=367.16, mean=367.16, max=367.16, sum=734.321 (2)", - "tab": "General information", - "score": 367.1603053435114 - }, - "Human Sexuality - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" - } - } - }, - { - "evaluation_name": "International Law", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on International Law", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.802, - "details": { - "description": "min=0.802, mean=0.802, max=0.802, sum=1.603 (2)", - "tab": "Accuracy", - "International Law - Observed inference time (s)": { - "description": "min=0.174, mean=0.174, max=0.174, sum=0.347 (2)", - "tab": "Efficiency", - "score": 0.1736255066453918 - }, - "International Law - # eval": { - "description": "min=121, mean=121, max=121, sum=242 (2)", - "tab": "General information", - "score": 121.0 - }, - "International Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "International Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "International Law - # prompt tokens": { - "description": "min=653.612, mean=653.612, max=653.612, sum=1307.223 (2)", - "tab": "General information", - "score": 653.6115702479339 - }, - "International Law - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" - } - } - }, - { - "evaluation_name": "Logical Fallacies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Logical Fallacies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.798, - "details": { - "description": "min=0.798, mean=0.798, max=0.798, sum=1.595 (2)", - "tab": "Accuracy", - "Logical Fallacies - Observed inference time (s)": { - "description": "min=0.164, mean=0.164, max=0.164, sum=0.327 (2)", - "tab": "Efficiency", - "score": 0.16361909117435386 - }, 
- "Logical Fallacies - # eval": { - "description": "min=163, mean=163, max=163, sum=326 (2)", - "tab": "General information", - "score": 163.0 - }, - "Logical Fallacies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Logical Fallacies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Logical Fallacies - # prompt tokens": { - "description": "min=463.773, mean=463.773, max=463.773, sum=927.546 (2)", - "tab": "General information", - "score": 463.7730061349693 - }, - "Logical Fallacies - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" - } - } - }, - { - "evaluation_name": "Machine Learning", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Machine Learning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.446, - "details": { - "description": "min=0.446, mean=0.446, max=0.446, sum=0.893 (2)", - "tab": "Accuracy", - "Machine Learning - Observed inference time (s)": { - "description": "min=0.176, mean=0.176, max=0.176, sum=0.352 (2)", - "tab": "Efficiency", - "score": 0.17583884937422617 - }, - "Machine Learning - # eval": { - "description": "min=112, mean=112, max=112, sum=224 (2)", - "tab": "General information", - "score": 112.0 - }, - "Machine Learning - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Machine Learning - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Machine Learning - # prompt tokens": { - "description": "min=716.438, mean=716.438, max=716.438, sum=1432.875 (2)", - "tab": "General information", - "score": 716.4375 - }, - "Machine Learning - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" - } - } - }, - { - "evaluation_name": "Management", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Management", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.796, - "details": { - "description": "min=0.796, mean=0.796, max=0.796, sum=1.592 (2)", - "tab": "Accuracy", - "Management - Observed inference time (s)": { - "description": "min=0.156, mean=0.156, max=0.156, sum=0.312 (2)", - "tab": "Efficiency", - "score": 0.15610716875317027 - }, - "Management - # eval": { - "description": "min=103, mean=103, max=103, sum=206 (2)", - "tab": "General information", - "score": 103.0 - }, - 
"Management - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Management - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Management - # prompt tokens": { - "description": "min=294.456, mean=294.456, max=294.456, sum=588.913 (2)", - "tab": "General information", - "score": 294.45631067961165 - }, - "Management - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" - } - } - }, - { - "evaluation_name": "Marketing", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Marketing", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.872, - "details": { - "description": "min=0.872, mean=0.872, max=0.872, sum=1.744 (2)", - "tab": "Accuracy", - "Marketing - Observed inference time (s)": { - "description": "min=0.161, mean=0.161, max=0.161, sum=0.321 (2)", - "tab": "Efficiency", - "score": 0.16073521895286363 - }, - "Marketing - # eval": { - "description": "min=234, mean=234, max=234, sum=468 (2)", - "tab": "General information", - "score": 234.0 - }, - "Marketing - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Marketing - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Marketing - # prompt tokens": { - "description": "min=446.855, mean=446.855, max=446.855, sum=893.709 (2)", - "tab": "General information", - "score": 446.85470085470087 - }, - "Marketing - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" - } - } - }, - { - "evaluation_name": "Medical Genetics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Medical Genetics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.81, - "details": { - "description": "min=0.81, mean=0.81, max=0.81, sum=1.62 (2)", - "tab": "Accuracy", - "Medical Genetics - Observed inference time (s)": { - "description": "min=0.16, mean=0.16, max=0.16, sum=0.319 (2)", - "tab": "Efficiency", - "score": 0.15951916217803955 - }, - "Medical Genetics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Medical Genetics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Medical Genetics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": 
"General information", - "score": 0.0 - }, - "Medical Genetics - # prompt tokens": { - "description": "min=357.02, mean=357.02, max=357.02, sum=714.04 (2)", - "tab": "General information", - "score": 357.02 - }, - "Medical Genetics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" - } - } - }, - { - "evaluation_name": "Miscellaneous", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Miscellaneous", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.848, - "details": { - "description": "min=0.848, mean=0.848, max=0.848, sum=1.696 (2)", - "tab": "Accuracy", - "Miscellaneous - Observed inference time (s)": { - "description": "min=0.153, mean=0.153, max=0.153, sum=0.307 (2)", - "tab": "Efficiency", - "score": 0.15332558511317462 - }, - "Miscellaneous - # eval": { - "description": "min=783, mean=783, max=783, sum=1566 (2)", - "tab": "General information", - "score": 783.0 - }, - "Miscellaneous - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Miscellaneous - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Miscellaneous - # prompt tokens": { - "description": "min=325.76, mean=325.76, max=325.76, sum=651.52 (2)", - "tab": "General information", - "score": 325.75989782886336 - }, - "Miscellaneous - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" - } - } - }, - { - "evaluation_name": "Moral Scenarios", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Moral Scenarios", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.451, - "details": { - "description": "min=0.451, mean=0.451, max=0.451, sum=0.903 (2)", - "tab": "Accuracy", - "Moral Disputes - Observed inference time (s)": { - "description": "min=0.17, mean=0.17, max=0.17, sum=0.339 (2)", - "tab": "Efficiency", - "score": 0.16953640452699165 - }, - "Moral Scenarios - Observed inference time (s)": { - "description": "min=0.176, mean=0.176, max=0.176, sum=0.351 (2)", - "tab": "Efficiency", - "score": 0.1756493640345568 - }, - "Moral Disputes - # eval": { - "description": "min=346, mean=346, max=346, sum=692 (2)", - "tab": "General information", - "score": 346.0 - }, - "Moral Disputes - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Disputes - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": 
"General information", - "score": 0.0 - }, - "Moral Disputes - # prompt tokens": { - "description": "min=506.78, mean=506.78, max=506.78, sum=1013.561 (2)", - "tab": "General information", - "score": 506.78034682080926 - }, - "Moral Disputes - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # eval": { - "description": "min=895, mean=895, max=895, sum=1790 (2)", - "tab": "General information", - "score": 895.0 - }, - "Moral Scenarios - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Scenarios - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # prompt tokens": { - "description": "min=699.344, mean=699.344, max=699.344, sum=1398.688 (2)", - "tab": "General information", - "score": 699.3441340782123 - }, - "Moral Scenarios - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" - } - } - }, - { - "evaluation_name": "Nutrition", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Nutrition", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.703, - "details": { - "description": "min=0.703, mean=0.703, max=0.703, sum=1.405 (2)", - "tab": "Accuracy", - "Nutrition - Observed inference time (s)": { - "description": "min=0.171, mean=0.171, max=0.171, sum=0.342 (2)", - "tab": "Efficiency", - "score": 0.17089871020098918 - }, - "Nutrition - # eval": { - "description": "min=306, mean=306, max=306, sum=612 (2)", - "tab": "General information", - "score": 306.0 - }, - "Nutrition - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Nutrition - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Nutrition - # prompt tokens": { - "description": "min=618.402, mean=618.402, max=618.402, sum=1236.804 (2)", - "tab": "General information", - "score": 618.4019607843137 - }, - "Nutrition - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" - } - } - }, - { - "evaluation_name": "Prehistory", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Prehistory", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.728, - "details": { - "description": "min=0.728, mean=0.728, max=0.728, sum=1.457 (2)", - "tab": "Accuracy", - "Prehistory - 
Observed inference time (s)": { - "description": "min=0.167, mean=0.167, max=0.167, sum=0.333 (2)", - "tab": "Efficiency", - "score": 0.16663335429297554 - }, - "Prehistory - # eval": { - "description": "min=324, mean=324, max=324, sum=648 (2)", - "tab": "General information", - "score": 324.0 - }, - "Prehistory - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Prehistory - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Prehistory - # prompt tokens": { - "description": "min=549.235, mean=549.235, max=549.235, sum=1098.469 (2)", - "tab": "General information", - "score": 549.2345679012345 - }, - "Prehistory - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" - } - } - }, - { - "evaluation_name": "Public Relations", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Public Relations", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7, - "details": { - "description": "min=0.7, mean=0.7, max=0.7, sum=1.4 (2)", - "tab": "Accuracy", - "Public Relations - Observed inference time (s)": { - "description": "min=0.17, mean=0.17, max=0.17, sum=0.341 (2)", - "tab": "Efficiency", - "score": 0.17039124532179398 - }, - "Public Relations - # eval": { - "description": "min=110, mean=110, max=110, sum=220 (2)", - "tab": "General information", - "score": 110.0 - }, - "Public Relations - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Public Relations - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Public Relations - # prompt tokens": { - "description": "min=434.682, mean=434.682, max=434.682, sum=869.364 (2)", - "tab": "General information", - "score": 434.6818181818182 - }, - "Public Relations - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" - } - } - }, - { - "evaluation_name": "Security Studies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Security Studies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.714, - "details": { - "description": "min=0.714, mean=0.714, max=0.714, sum=1.429 (2)", - "tab": "Accuracy", - "Security Studies - Observed inference time (s)": { - "description": "min=0.203, mean=0.203, max=0.203, sum=0.405 (2)", - "tab": "Efficiency", - "score": 0.20251671927315848 - }, - "Security Studies 
- # eval": { - "description": "min=245, mean=245, max=245, sum=490 (2)", - "tab": "General information", - "score": 245.0 - }, - "Security Studies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Security Studies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Security Studies - # prompt tokens": { - "description": "min=1207.494, mean=1207.494, max=1207.494, sum=2414.988 (2)", - "tab": "General information", - "score": 1207.4938775510204 - }, - "Security Studies - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" - } - } - }, - { - "evaluation_name": "Sociology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Sociology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.866, - "details": { - "description": "min=0.866, mean=0.866, max=0.866, sum=1.731 (2)", - "tab": "Accuracy", - "Sociology - Observed inference time (s)": { - "description": "min=0.164, mean=0.164, max=0.164, sum=0.327 (2)", - "tab": "Efficiency", - "score": 0.16369761163322485 - }, - "Sociology - # eval": { - "description": "min=201, mean=201, max=201, sum=402 (2)", - "tab": "General information", - "score": 201.0 - }, - "Sociology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Sociology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Sociology - # prompt tokens": { - "description": "min=467.343, mean=467.343, max=467.343, sum=934.687 (2)", - "tab": "General information", - "score": 467.34328358208955 - }, - "Sociology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" - } - } - }, - { - "evaluation_name": "Virology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Virology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.542, - "details": { - "description": "min=0.542, mean=0.542, max=0.542, sum=1.084 (2)", - "tab": "Accuracy", - "Virology - Observed inference time (s)": { - "description": "min=0.158, mean=0.158, max=0.158, sum=0.316 (2)", - "tab": "Efficiency", - "score": 0.15811713919582138 - }, - "Virology - # eval": { - "description": "min=166, mean=166, max=166, sum=332 (2)", - "tab": "General information", - "score": 166.0 - }, - "Virology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": 
"General information", - "score": 5.0 - }, - "Virology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Virology - # prompt tokens": { - "description": "min=352.861, mean=352.861, max=352.861, sum=705.723 (2)", - "tab": "General information", - "score": 352.8614457831325 - }, - "Virology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" - } - } - }, - { - "evaluation_name": "World Religions", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on World Religions", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.813, - "details": { - "description": "min=0.813, mean=0.813, max=0.813, sum=1.626 (2)", - "tab": "Accuracy", - "World Religions - Observed inference time (s)": { - "description": "min=0.145, mean=0.145, max=0.145, sum=0.291 (2)", - "tab": "Efficiency", - "score": 0.1452833434991669 - }, - "World Religions - # eval": { - "description": "min=171, mean=171, max=171, sum=342 (2)", - "tab": "General information", - "score": 171.0 - }, - "World Religions - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "World Religions - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "World Religions - # prompt tokens": { - "description": "min=277.047, mean=277.047, max=277.047, sum=554.094 (2)", - "tab": "General information", - "score": 277.046783625731 - }, - "World Religions - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" - } - } - }, - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.959, - "details": { - "tab": "Efficiency" - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_mmlu/databricks/dbrx-instruct/7e00e082-0e79-45e0-b0ff-5458cc2aff85.json b/data/helm_mmlu/databricks/dbrx-instruct/7e00e082-0e79-45e0-b0ff-5458cc2aff85.json deleted file mode 100644 index d5f73b61f..000000000 --- a/data/helm_mmlu/databricks/dbrx-instruct/7e00e082-0e79-45e0-b0ff-5458cc2aff85.json +++ /dev/null @@ -1,3021 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/databricks_dbrx-instruct/1770835937.459157", - "retrieved_timestamp": "1770835937.459157", - 
"source_metadata": { - "source_name": "helm_mmlu", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "DBRX Instruct", - "id": "databricks/dbrx-instruct", - "developer": "databricks", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "MMLU All Subjects", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU All Subjects", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.741, - "details": { - "description": "min=0.34, mean=0.741, max=0.953, sum=84.475 (114)", - "tab": "Accuracy", - "MMLU All Subjects - Observed inference time (s)": { - "description": "min=0.356, mean=0.459, max=1.347, sum=52.272 (114)", - "tab": "Efficiency", - "score": 0.4585284510595002 - }, - "MMLU All Subjects - # eval": { - "description": "min=100, mean=246.351, max=1534, sum=28084 (114)", - "tab": "General information", - "score": 246.35087719298247 - }, - "MMLU All Subjects - # train": { - "description": "min=5, mean=5, max=5, sum=570 (114)", - "tab": "General information", - "score": 5.0 - }, - "MMLU All Subjects - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (114)", - "tab": "General information", - "score": 0.0 - }, - "MMLU All Subjects - # prompt tokens": { - "description": "min=268.561, mean=607.852, max=2791.073, sum=69295.086 (114)", - "tab": "General information", - "score": 607.851634217556 - }, - "MMLU All Subjects - # output tokens": { - "description": "min=1, mean=1, max=1, sum=114 (114)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - 
"mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] - } - } - }, - { - "evaluation_name": "Abstract Algebra", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Abstract Algebra", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.34, - "details": { - "description": "min=0.34, mean=0.34, max=0.34, sum=0.68 (2)", - "tab": "Accuracy", - "Abstract Algebra - Observed inference time (s)": { - "description": "min=0.432, mean=0.432, max=0.432, sum=0.863 (2)", - "tab": "Efficiency", - "score": 0.4316913342475891 - }, - "Abstract Algebra - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Abstract Algebra - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Abstract Algebra - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Abstract Algebra - # prompt tokens": { - "description": "min=366.44, mean=366.44, max=366.44, sum=732.88 (2)", - "tab": "General information", - "score": 366.44 - }, - "Abstract Algebra - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" - } - } - }, - { - "evaluation_name": "Anatomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Anatomy", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.667, - "details": { - "description": "min=0.667, mean=0.667, max=0.667, sum=1.333 (2)", - "tab": "Accuracy", - "Anatomy - Observed inference time (s)": { - "description": "min=0.385, mean=0.385, max=0.385, sum=0.771 (2)", - "tab": "Efficiency", - "score": 0.38546188672383624 - }, - "Anatomy - # eval": { - "description": "min=135, mean=135, max=135, sum=270 (2)", - "tab": "General information", - "score": 135.0 - }, - "Anatomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Anatomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Anatomy - # prompt tokens": { - "description": "min=346.978, mean=346.978, max=346.978, sum=693.956 (2)", - "tab": "General information", - "score": 346.97777777777776 - }, - "Anatomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" - } - } - }, - { - "evaluation_name": "College Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on College Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.539, - "details": { - "description": "min=0.539, mean=0.539, max=0.539, sum=1.078 (2)", - "tab": "Accuracy", - "College Chemistry - Observed inference time (s)": { - "description": "min=0.395, mean=0.395, max=0.395, sum=0.789 (2)", - "tab": "Efficiency", - "score": 0.39454248666763303 - }, - "College Biology - Observed inference time (s)": { - "description": "min=0.391, mean=0.391, max=0.391, sum=0.781 (2)", - "tab": "Efficiency", - "score": 0.3906625145011478 - }, - "College Computer Science - Observed inference time (s)": { - "description": "min=0.439, mean=0.439, max=0.439, sum=0.877 (2)", - "tab": "Efficiency", - "score": 0.438518271446228 - }, - "College Mathematics - Observed inference time (s)": { - "description": "min=0.396, mean=0.396, max=0.396, sum=0.792 (2)", - "tab": "Efficiency", - "score": 0.3961342000961304 - }, - "College Medicine - Observed inference time (s)": { - "description": "min=0.392, mean=0.392, max=0.392, sum=0.784 (2)", - "tab": "Efficiency", - "score": 0.39187397708782573 - }, - "College Physics - Observed inference time (s)": { - "description": "min=0.406, mean=0.406, max=0.406, sum=0.813 (2)", - "tab": "Efficiency", - "score": 0.4062807746962005 - }, - "College Chemistry - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Chemistry - # prompt tokens": { - "description": "min=542.4, mean=542.4, max=542.4, sum=1084.8 (2)", - "tab": "General information", - "score": 542.4 - }, - "College Chemistry - # output tokens": 
{ - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Biology - # eval": { - "description": "min=144, mean=144, max=144, sum=288 (2)", - "tab": "General information", - "score": 144.0 - }, - "College Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # prompt tokens": { - "description": "min=466.917, mean=466.917, max=466.917, sum=933.833 (2)", - "tab": "General information", - "score": 466.9166666666667 - }, - "College Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer Science - # prompt tokens": { - "description": "min=821.39, mean=821.39, max=821.39, sum=1642.78 (2)", - "tab": "General information", - "score": 821.39 - }, - "College Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Mathematics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # prompt tokens": { - "description": "min=587.52, mean=587.52, max=587.52, sum=1175.04 (2)", - "tab": "General information", - "score": 587.52 - }, - "College Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Medicine - # eval": { - "description": "min=173, mean=173, max=173, sum=346 (2)", - "tab": "General information", - "score": 173.0 - }, - "College Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Medicine - # prompt tokens": { - "description": "min=495.728, mean=495.728, max=495.728, sum=991.457 (2)", - "tab": "General information", - "score": 495.728323699422 - }, - "College Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Physics - # eval": { - "description": "min=102, mean=102, max=102, sum=204 (2)", - "tab": "General information", - "score": 102.0 - }, - "College Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", 
- "score": 0.0 - }, - "College Physics - # prompt tokens": { - "description": "min=496.608, mean=496.608, max=496.608, sum=993.216 (2)", - "tab": "General information", - "score": 496.6078431372549 - }, - "College Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" - } - } - }, - { - "evaluation_name": "Computer Security", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Computer Security", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.83, - "details": { - "description": "min=0.83, mean=0.83, max=0.83, sum=1.66 (2)", - "tab": "Accuracy", - "Computer Security - Observed inference time (s)": { - "description": "min=0.415, mean=0.415, max=0.415, sum=0.83 (2)", - "tab": "Efficiency", - "score": 0.4148012113571167 - }, - "Computer Security - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Computer Security - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Computer Security - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Computer Security - # prompt tokens": { - "description": "min=371.54, mean=371.54, max=371.54, sum=743.08 (2)", - "tab": "General information", - "score": 371.54 - }, - "Computer Security - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" - } - } - }, - { - "evaluation_name": "Econometrics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Econometrics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.605, - "details": { - "description": "min=0.605, mean=0.605, max=0.605, sum=1.211 (2)", - "tab": "Accuracy", - "Econometrics - Observed inference time (s)": { - "description": "min=0.431, mean=0.431, max=0.431, sum=0.863 (2)", - "tab": "Efficiency", - "score": 0.43133983904855294 - }, - "Econometrics - # eval": { - "description": "min=114, mean=114, max=114, sum=228 (2)", - "tab": "General information", - "score": 114.0 - }, - "Econometrics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Econometrics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Econometrics - # prompt tokens": { - "description": "min=607.43, mean=607.43, max=607.43, sum=1214.86 (2)", - "tab": "General information", - 
"score": 607.4298245614035 - }, - "Econometrics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" - } - } - }, - { - "evaluation_name": "Global Facts", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Global Facts", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.46, - "details": { - "description": "min=0.46, mean=0.46, max=0.46, sum=0.92 (2)", - "tab": "Accuracy", - "Global Facts - Observed inference time (s)": { - "description": "min=0.429, mean=0.429, max=0.429, sum=0.857 (2)", - "tab": "Efficiency", - "score": 0.4286450815200806 - }, - "Global Facts - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Global Facts - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Global Facts - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Global Facts - # prompt tokens": { - "description": "min=392.71, mean=392.71, max=392.71, sum=785.42 (2)", - "tab": "General information", - "score": 392.71 - }, - "Global Facts - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" - } - } - }, - { - "evaluation_name": "Jurisprudence", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Jurisprudence", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.843, - "details": { - "description": "min=0.843, mean=0.843, max=0.843, sum=1.685 (2)", - "tab": "Accuracy", - "Jurisprudence - Observed inference time (s)": { - "description": "min=0.436, mean=0.436, max=0.436, sum=0.873 (2)", - "tab": "Efficiency", - "score": 0.43625413488458703 - }, - "Jurisprudence - # eval": { - "description": "min=108, mean=108, max=108, sum=216 (2)", - "tab": "General information", - "score": 108.0 - }, - "Jurisprudence - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Jurisprudence - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Jurisprudence - # prompt tokens": { - "description": "min=387.639, mean=387.639, max=387.639, sum=775.278 (2)", - "tab": "General information", - "score": 387.6388888888889 - }, - "Jurisprudence - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - 
"additional_details": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" - } - } - }, - { - "evaluation_name": "Philosophy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Philosophy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.804, - "details": { - "description": "min=0.804, mean=0.804, max=0.804, sum=1.608 (2)", - "tab": "Accuracy", - "Philosophy - Observed inference time (s)": { - "description": "min=0.408, mean=0.408, max=0.408, sum=0.816 (2)", - "tab": "Efficiency", - "score": 0.4079643100787589 - }, - "Philosophy - # eval": { - "description": "min=311, mean=311, max=311, sum=622 (2)", - "tab": "General information", - "score": 311.0 - }, - "Philosophy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Philosophy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Philosophy - # prompt tokens": { - "description": "min=322.084, mean=322.084, max=322.084, sum=644.167 (2)", - "tab": "General information", - "score": 322.08360128617363 - }, - "Philosophy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" - } - } - }, - { - "evaluation_name": "Professional Psychology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Professional Psychology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.801, - "details": { - "description": "min=0.801, mean=0.801, max=0.801, sum=1.601 (2)", - "tab": "Accuracy", - "Professional Medicine - Observed inference time (s)": { - "description": "min=0.47, mean=0.47, max=0.47, sum=0.94 (2)", - "tab": "Efficiency", - "score": 0.4699658164206673 - }, - "Professional Accounting - Observed inference time (s)": { - "description": "min=0.395, mean=0.395, max=0.395, sum=0.791 (2)", - "tab": "Efficiency", - "score": 0.39532034532398197 - }, - "Professional Law - Observed inference time (s)": { - "description": "min=0.556, mean=0.556, max=0.556, sum=1.113 (2)", - "tab": "Efficiency", - "score": 0.5564531824579451 - }, - "Professional Psychology - Observed inference time (s)": { - "description": "min=0.388, mean=0.388, max=0.388, sum=0.776 (2)", - "tab": "Efficiency", - "score": 0.3879917279567594 - }, - "Professional Medicine - # eval": { - "description": "min=272, mean=272, max=272, sum=544 (2)", - "tab": "General information", - "score": 272.0 - }, - "Professional Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General 
information", - "score": 0.0 - }, - "Professional Medicine - # prompt tokens": { - "description": "min=1087.585, mean=1087.585, max=1087.585, sum=2175.169 (2)", - "tab": "General information", - "score": 1087.5845588235295 - }, - "Professional Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Accounting - # eval": { - "description": "min=282, mean=282, max=282, sum=564 (2)", - "tab": "General information", - "score": 282.0 - }, - "Professional Accounting - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Accounting - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Accounting - # prompt tokens": { - "description": "min=651.592, mean=651.592, max=651.592, sum=1303.184 (2)", - "tab": "General information", - "score": 651.5921985815603 - }, - "Professional Accounting - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Law - # eval": { - "description": "min=1534, mean=1534, max=1534, sum=3068 (2)", - "tab": "General information", - "score": 1534.0 - }, - "Professional Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # prompt tokens": { - "description": "min=1630.787, mean=1630.787, max=1630.787, sum=3261.574 (2)", - "tab": "General information", - "score": 1630.7868318122555 - }, - "Professional Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Psychology - # eval": { - "description": "min=612, mean=612, max=612, sum=1224 (2)", - "tab": "General information", - "score": 612.0 - }, - "Professional Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # prompt tokens": { - "description": "min=568.114, mean=568.114, max=568.114, sum=1136.229 (2)", - "tab": "General information", - "score": 568.1143790849674 - }, - "Professional Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" - } - } - }, - { - "evaluation_name": "Us Foreign Policy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Us Foreign Policy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.93, - "details": { - "description": "min=0.93, mean=0.93, max=0.93, sum=1.86 (2)", - "tab": "Accuracy", - "Us Foreign Policy - Observed 
inference time (s)": { - "description": "min=0.39, mean=0.39, max=0.39, sum=0.78 (2)", - "tab": "Efficiency", - "score": 0.3899818444252014 - }, - "Us Foreign Policy - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Us Foreign Policy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Us Foreign Policy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Us Foreign Policy - # prompt tokens": { - "description": "min=415.79, mean=415.79, max=415.79, sum=831.58 (2)", - "tab": "General information", - "score": 415.79 - }, - "Us Foreign Policy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" - } - } - }, - { - "evaluation_name": "Astronomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Astronomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.836, - "details": { - "description": "min=0.836, mean=0.836, max=0.836, sum=1.671 (2)", - "tab": "Accuracy", - "Astronomy - Observed inference time (s)": { - "description": "min=0.428, mean=0.428, max=0.428, sum=0.857 (2)", - "tab": "Efficiency", - "score": 0.42830287625915126 - }, - "Astronomy - # eval": { - "description": "min=152, mean=152, max=152, sum=304 (2)", - "tab": "General information", - "score": 152.0 - }, - "Astronomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Astronomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Astronomy - # prompt tokens": { - "description": "min=572.691, mean=572.691, max=572.691, sum=1145.382 (2)", - "tab": "General information", - "score": 572.6907894736842 - }, - "Astronomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" - } - } - }, - { - "evaluation_name": "Business Ethics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Business Ethics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.78, - "details": { - "description": "min=0.78, mean=0.78, max=0.78, sum=1.56 (2)", - "tab": "Accuracy", - "Business Ethics - Observed inference time (s)": { - "description": "min=0.446, mean=0.446, max=0.446, sum=0.891 (2)", - "tab": "Efficiency", - "score": 0.44572278976440427 - }, - "Business Ethics - # eval": { - "description": "min=100, 
mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Business Ethics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Business Ethics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Business Ethics - # prompt tokens": { - "description": "min=562.52, mean=562.52, max=562.52, sum=1125.04 (2)", - "tab": "General information", - "score": 562.52 - }, - "Business Ethics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" - } - } - }, - { - "evaluation_name": "Clinical Knowledge", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Clinical Knowledge", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.789, - "details": { - "description": "min=0.789, mean=0.789, max=0.789, sum=1.577 (2)", - "tab": "Accuracy", - "Clinical Knowledge - Observed inference time (s)": { - "description": "min=0.404, mean=0.404, max=0.404, sum=0.807 (2)", - "tab": "Efficiency", - "score": 0.4037102978184538 - }, - "Clinical Knowledge - # eval": { - "description": "min=265, mean=265, max=265, sum=530 (2)", - "tab": "General information", - "score": 265.0 - }, - "Clinical Knowledge - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Clinical Knowledge - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Clinical Knowledge - # prompt tokens": { - "description": "min=390.947, mean=390.947, max=390.947, sum=781.894 (2)", - "tab": "General information", - "score": 390.94716981132075 - }, - "Clinical Knowledge - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" - } - } - }, - { - "evaluation_name": "Conceptual Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Conceptual Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.74, - "details": { - "description": "min=0.74, mean=0.74, max=0.74, sum=1.481 (2)", - "tab": "Accuracy", - "Conceptual Physics - Observed inference time (s)": { - "description": "min=0.379, mean=0.379, max=0.379, sum=0.758 (2)", - "tab": "Efficiency", - "score": 0.3791612523667356 - }, - "Conceptual Physics - # eval": { - "description": "min=235, mean=235, max=235, sum=470 (2)", - "tab": "General information", - "score": 235.0 - }, - "Conceptual Physics - # train": 
{ - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Conceptual Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Conceptual Physics - # prompt tokens": { - "description": "min=297.838, mean=297.838, max=297.838, sum=595.677 (2)", - "tab": "General information", - "score": 297.83829787234043 - }, - "Conceptual Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" - } - } - }, - { - "evaluation_name": "Electrical Engineering", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Electrical Engineering", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.71, - "details": { - "description": "min=0.71, mean=0.71, max=0.71, sum=1.421 (2)", - "tab": "Accuracy", - "Electrical Engineering - Observed inference time (s)": { - "description": "min=0.384, mean=0.384, max=0.384, sum=0.767 (2)", - "tab": "Efficiency", - "score": 0.3837302882095863 - }, - "Electrical Engineering - # eval": { - "description": "min=145, mean=145, max=145, sum=290 (2)", - "tab": "General information", - "score": 145.0 - }, - "Electrical Engineering - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Electrical Engineering - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Electrical Engineering - # prompt tokens": { - "description": "min=433.641, mean=433.641, max=433.641, sum=867.283 (2)", - "tab": "General information", - "score": 433.6413793103448 - }, - "Electrical Engineering - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" - } - } - }, - { - "evaluation_name": "Elementary Mathematics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Elementary Mathematics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.563, - "details": { - "description": "min=0.563, mean=0.563, max=0.563, sum=1.127 (2)", - "tab": "Accuracy", - "Elementary Mathematics - Observed inference time (s)": { - "description": "min=0.392, mean=0.392, max=0.392, sum=0.783 (2)", - "tab": "Efficiency", - "score": 0.3916708092210154 - }, - "Elementary Mathematics - # eval": { - "description": "min=378, mean=378, max=378, sum=756 (2)", - "tab": "General information", - "score": 378.0 - }, - "Elementary Mathematics - # train": { - "description": "min=5, 
mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Elementary Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Elementary Mathematics - # prompt tokens": { - "description": "min=524.862, mean=524.862, max=524.862, sum=1049.725 (2)", - "tab": "General information", - "score": 524.8624338624338 - }, - "Elementary Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" - } - } - }, - { - "evaluation_name": "Formal Logic", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Formal Logic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.563, - "details": { - "description": "min=0.563, mean=0.563, max=0.563, sum=1.127 (2)", - "tab": "Accuracy", - "Formal Logic - Observed inference time (s)": { - "description": "min=0.419, mean=0.419, max=0.419, sum=0.837 (2)", - "tab": "Efficiency", - "score": 0.41872944339873297 - }, - "Formal Logic - # eval": { - "description": "min=126, mean=126, max=126, sum=252 (2)", - "tab": "General information", - "score": 126.0 - }, - "Formal Logic - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Formal Logic - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Formal Logic - # prompt tokens": { - "description": "min=599.762, mean=599.762, max=599.762, sum=1199.524 (2)", - "tab": "General information", - "score": 599.7619047619048 - }, - "Formal Logic - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" - } - } - }, - { - "evaluation_name": "High School World History", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on High School World History", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.903, - "details": { - "description": "min=0.903, mean=0.903, max=0.903, sum=1.806 (2)", - "tab": "Accuracy", - "High School Biology - Observed inference time (s)": { - "description": "min=0.387, mean=0.387, max=0.387, sum=0.775 (2)", - "tab": "Efficiency", - "score": 0.3873311073549332 - }, - "High School Chemistry - Observed inference time (s)": { - "description": "min=0.356, mean=0.356, max=0.356, sum=0.712 (2)", - "tab": "Efficiency", - "score": 0.356056117071894 - }, - "High School Computer Science - Observed inference time (s)": { - "description": "min=0.416, mean=0.416, max=0.416, sum=0.832 (2)", 
- "tab": "Efficiency", - "score": 0.4159617280960083 - }, - "High School European History - Observed inference time (s)": { - "description": "min=0.784, mean=0.784, max=0.784, sum=1.569 (2)", - "tab": "Efficiency", - "score": 0.7843083367203221 - }, - "High School Geography - Observed inference time (s)": { - "description": "min=0.573, mean=0.573, max=0.573, sum=1.146 (2)", - "tab": "Efficiency", - "score": 0.573177902385442 - }, - "High School Government And Politics - Observed inference time (s)": { - "description": "min=0.522, mean=0.522, max=0.522, sum=1.043 (2)", - "tab": "Efficiency", - "score": 0.5217143093366079 - }, - "High School Macroeconomics - Observed inference time (s)": { - "description": "min=1.013, mean=1.013, max=1.013, sum=2.025 (2)", - "tab": "Efficiency", - "score": 1.0127322582098155 - }, - "High School Mathematics - Observed inference time (s)": { - "description": "min=1.347, mean=1.347, max=1.347, sum=2.694 (2)", - "tab": "Efficiency", - "score": 1.346758367397167 - }, - "High School Microeconomics - Observed inference time (s)": { - "description": "min=0.405, mean=0.405, max=0.405, sum=0.81 (2)", - "tab": "Efficiency", - "score": 0.40513940819171296 - }, - "High School Physics - Observed inference time (s)": { - "description": "min=0.457, mean=0.457, max=0.457, sum=0.915 (2)", - "tab": "Efficiency", - "score": 0.45727316433230775 - }, - "High School Psychology - Observed inference time (s)": { - "description": "min=0.427, mean=0.427, max=0.427, sum=0.855 (2)", - "tab": "Efficiency", - "score": 0.42725621625917765 - }, - "High School Statistics - Observed inference time (s)": { - "description": "min=0.465, mean=0.465, max=0.465, sum=0.93 (2)", - "tab": "Efficiency", - "score": 0.4648557923458241 - }, - "High School US History - Observed inference time (s)": { - "description": "min=0.701, mean=0.701, max=0.701, sum=1.401 (2)", - "tab": "Efficiency", - "score": 0.7005175001481 - }, - "High School World History - Observed inference time (s)": { - "description": "min=0.543, mean=0.543, max=0.543, sum=1.085 (2)", - "tab": "Efficiency", - "score": 0.5426257642512583 - }, - "High School Biology - # eval": { - "description": "min=310, mean=310, max=310, sum=620 (2)", - "tab": "General information", - "score": 310.0 - }, - "High School Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Biology - # prompt tokens": { - "description": "min=506.677, mean=506.677, max=506.677, sum=1013.355 (2)", - "tab": "General information", - "score": 506.6774193548387 - }, - "High School Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Chemistry - # eval": { - "description": "min=203, mean=203, max=203, sum=406 (2)", - "tab": "General information", - "score": 203.0 - }, - "High School Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # prompt tokens": { - "description": "min=489.714, mean=489.714, max=489.714, sum=979.429 (2)", - "tab": "General information", - "score": 489.7142857142857 - }, - "High School Chemistry 
- # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "High School Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # prompt tokens": { - "description": "min=860.78, mean=860.78, max=860.78, sum=1721.56 (2)", - "tab": "General information", - "score": 860.78 - }, - "High School Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School European History - # eval": { - "description": "min=165, mean=165, max=165, sum=330 (2)", - "tab": "General information", - "score": 165.0 - }, - "High School European History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School European History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School European History - # prompt tokens": { - "description": "min=2791.073, mean=2791.073, max=2791.073, sum=5582.145 (2)", - "tab": "General information", - "score": 2791.072727272727 - }, - "High School European History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Geography - # eval": { - "description": "min=198, mean=198, max=198, sum=396 (2)", - "tab": "General information", - "score": 198.0 - }, - "High School Geography - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Geography - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Geography - # prompt tokens": { - "description": "min=365.045, mean=365.045, max=365.045, sum=730.091 (2)", - "tab": "General information", - "score": 365.04545454545456 - }, - "High School Geography - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Government And Politics - # eval": { - "description": "min=193, mean=193, max=193, sum=386 (2)", - "tab": "General information", - "score": 193.0 - }, - "High School Government And Politics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Government And Politics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # prompt tokens": { - "description": "min=458.824, mean=458.824, max=458.824, sum=917.648 (2)", - "tab": "General information", - "score": 458.8238341968912 - }, - "High School Government And Politics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Macroeconomics - # eval": { - "description": "min=390, mean=390, max=390, sum=780 (2)", - "tab": "General information", - "score": 390.0 - }, - "High School 
Macroeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Macroeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - # prompt tokens": { - "description": "min=364.562, mean=364.562, max=364.562, sum=729.123 (2)", - "tab": "General information", - "score": 364.5615384615385 - }, - "High School Macroeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Mathematics - # eval": { - "description": "min=270, mean=270, max=270, sum=540 (2)", - "tab": "General information", - "score": 270.0 - }, - "High School Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # prompt tokens": { - "description": "min=525.374, mean=525.374, max=525.374, sum=1050.748 (2)", - "tab": "General information", - "score": 525.3740740740741 - }, - "High School Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Microeconomics - # eval": { - "description": "min=238, mean=238, max=238, sum=476 (2)", - "tab": "General information", - "score": 238.0 - }, - "High School Microeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Microeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Microeconomics - # prompt tokens": { - "description": "min=392.025, mean=392.025, max=392.025, sum=784.05 (2)", - "tab": "General information", - "score": 392.02521008403363 - }, - "High School Microeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Physics - # eval": { - "description": "min=151, mean=151, max=151, sum=302 (2)", - "tab": "General information", - "score": 151.0 - }, - "High School Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # prompt tokens": { - "description": "min=553.464, mean=553.464, max=553.464, sum=1106.927 (2)", - "tab": "General information", - "score": 553.4635761589404 - }, - "High School Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Psychology - # eval": { - "description": "min=545, mean=545, max=545, sum=1090 (2)", - "tab": "General information", - "score": 545.0 - }, - "High School Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # prompt tokens": { - "description": "min=488.246, mean=488.246, max=488.246, 
sum=976.492 (2)", - "tab": "General information", - "score": 488.24587155963303 - }, - "High School Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Statistics - # eval": { - "description": "min=216, mean=216, max=216, sum=432 (2)", - "tab": "General information", - "score": 216.0 - }, - "High School Statistics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Statistics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Statistics - # prompt tokens": { - "description": "min=788.699, mean=788.699, max=788.699, sum=1577.398 (2)", - "tab": "General information", - "score": 788.699074074074 - }, - "High School Statistics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School US History - # eval": { - "description": "min=204, mean=204, max=204, sum=408 (2)", - "tab": "General information", - "score": 204.0 - }, - "High School US History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School US History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # prompt tokens": { - "description": "min=2210.809, mean=2210.809, max=2210.809, sum=4421.618 (2)", - "tab": "General information", - "score": 2210.8088235294117 - }, - "High School US History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School World History - # eval": { - "description": "min=237, mean=237, max=237, sum=474 (2)", - "tab": "General information", - "score": 237.0 - }, - "High School World History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School World History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School World History - # prompt tokens": { - "description": "min=1421.27, mean=1421.27, max=1421.27, sum=2842.54 (2)", - "tab": "General information", - "score": 1421.2700421940929 - }, - "High School World History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" - } - } - }, - { - "evaluation_name": "Human Sexuality", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Human Sexuality", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.878, - "details": { - "description": "min=0.878, mean=0.878, max=0.878, sum=1.756 (2)", - "tab": "Accuracy", - "Human Aging - Observed inference time (s)": { - "description": "min=0.409, mean=0.409, max=0.409, sum=0.819 (2)", - "tab": "Efficiency", - 
"score": 0.4093097753054358 - }, - "Human Sexuality - Observed inference time (s)": { - "description": "min=0.41, mean=0.41, max=0.41, sum=0.819 (2)", - "tab": "Efficiency", - "score": 0.40955095072738995 - }, - "Human Aging - # eval": { - "description": "min=223, mean=223, max=223, sum=446 (2)", - "tab": "General information", - "score": 223.0 - }, - "Human Aging - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Aging - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Aging - # prompt tokens": { - "description": "min=312.906, mean=312.906, max=312.906, sum=625.812 (2)", - "tab": "General information", - "score": 312.90582959641256 - }, - "Human Aging - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Human Sexuality - # eval": { - "description": "min=131, mean=131, max=131, sum=262 (2)", - "tab": "General information", - "score": 131.0 - }, - "Human Sexuality - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Sexuality - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # prompt tokens": { - "description": "min=334.183, mean=334.183, max=334.183, sum=668.366 (2)", - "tab": "General information", - "score": 334.1832061068702 - }, - "Human Sexuality - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" - } - } - }, - { - "evaluation_name": "International Law", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on International Law", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.884, - "details": { - "description": "min=0.884, mean=0.884, max=0.884, sum=1.769 (2)", - "tab": "Accuracy", - "International Law - Observed inference time (s)": { - "description": "min=0.435, mean=0.435, max=0.435, sum=0.871 (2)", - "tab": "Efficiency", - "score": 0.43540735284159005 - }, - "International Law - # eval": { - "description": "min=121, mean=121, max=121, sum=242 (2)", - "tab": "General information", - "score": 121.0 - }, - "International Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "International Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "International Law - # prompt tokens": { - "description": "min=632.851, mean=632.851, max=632.851, sum=1265.702 (2)", - "tab": "General information", - "score": 632.8512396694215 - }, - "International Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "international_law", - "method": "multiple_choice_joint", - 
"eval_split": "test", - "groups": "mmlu_international_law" - } - } - }, - { - "evaluation_name": "Logical Fallacies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Logical Fallacies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.847, - "details": { - "description": "min=0.847, mean=0.847, max=0.847, sum=1.693 (2)", - "tab": "Accuracy", - "Logical Fallacies - Observed inference time (s)": { - "description": "min=0.418, mean=0.418, max=0.418, sum=0.836 (2)", - "tab": "Efficiency", - "score": 0.4178658789652257 - }, - "Logical Fallacies - # eval": { - "description": "min=163, mean=163, max=163, sum=326 (2)", - "tab": "General information", - "score": 163.0 - }, - "Logical Fallacies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Logical Fallacies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Logical Fallacies - # prompt tokens": { - "description": "min=442.595, mean=442.595, max=442.595, sum=885.19 (2)", - "tab": "General information", - "score": 442.5950920245399 - }, - "Logical Fallacies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" - } - } - }, - { - "evaluation_name": "Machine Learning", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Machine Learning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.625, - "details": { - "description": "min=0.625, mean=0.625, max=0.625, sum=1.25 (2)", - "tab": "Accuracy", - "Machine Learning - Observed inference time (s)": { - "description": "min=0.442, mean=0.442, max=0.442, sum=0.884 (2)", - "tab": "Efficiency", - "score": 0.442230761051178 - }, - "Machine Learning - # eval": { - "description": "min=112, mean=112, max=112, sum=224 (2)", - "tab": "General information", - "score": 112.0 - }, - "Machine Learning - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Machine Learning - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Machine Learning - # prompt tokens": { - "description": "min=661.054, mean=661.054, max=661.054, sum=1322.107 (2)", - "tab": "General information", - "score": 661.0535714285714 - }, - "Machine Learning - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" - } - } - }, - { - "evaluation_name": "Management", - 
"source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Management", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.854, - "details": { - "description": "min=0.854, mean=0.854, max=0.854, sum=1.709 (2)", - "tab": "Accuracy", - "Management - Observed inference time (s)": { - "description": "min=0.42, mean=0.42, max=0.42, sum=0.84 (2)", - "tab": "Efficiency", - "score": 0.42014194460748466 - }, - "Management - # eval": { - "description": "min=103, mean=103, max=103, sum=206 (2)", - "tab": "General information", - "score": 103.0 - }, - "Management - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Management - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Management - # prompt tokens": { - "description": "min=276.796, mean=276.796, max=276.796, sum=553.592 (2)", - "tab": "General information", - "score": 276.79611650485435 - }, - "Management - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" - } - } - }, - { - "evaluation_name": "Marketing", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Marketing", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.94, - "details": { - "description": "min=0.94, mean=0.94, max=0.94, sum=1.88 (2)", - "tab": "Accuracy", - "Marketing - Observed inference time (s)": { - "description": "min=0.425, mean=0.425, max=0.425, sum=0.85 (2)", - "tab": "Efficiency", - "score": 0.4250037354281825 - }, - "Marketing - # eval": { - "description": "min=234, mean=234, max=234, sum=468 (2)", - "tab": "General information", - "score": 234.0 - }, - "Marketing - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Marketing - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Marketing - # prompt tokens": { - "description": "min=397.218, mean=397.218, max=397.218, sum=794.436 (2)", - "tab": "General information", - "score": 397.21794871794873 - }, - "Marketing - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" - } - } - }, - { - "evaluation_name": "Medical Genetics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": 
"EM on Medical Genetics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.85, - "details": { - "description": "min=0.85, mean=0.85, max=0.85, sum=1.7 (2)", - "tab": "Accuracy", - "Medical Genetics - Observed inference time (s)": { - "description": "min=0.423, mean=0.423, max=0.423, sum=0.846 (2)", - "tab": "Efficiency", - "score": 0.4227530717849731 - }, - "Medical Genetics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Medical Genetics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Medical Genetics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Medical Genetics - # prompt tokens": { - "description": "min=334, mean=334, max=334, sum=668 (2)", - "tab": "General information", - "score": 334.0 - }, - "Medical Genetics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" - } - } - }, - { - "evaluation_name": "Miscellaneous", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Miscellaneous", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.911, - "details": { - "description": "min=0.911, mean=0.911, max=0.911, sum=1.821 (2)", - "tab": "Accuracy", - "Miscellaneous - Observed inference time (s)": { - "description": "min=0.367, mean=0.367, max=0.367, sum=0.734 (2)", - "tab": "Efficiency", - "score": 0.3670404892162649 - }, - "Miscellaneous - # eval": { - "description": "min=783, mean=783, max=783, sum=1566 (2)", - "tab": "General information", - "score": 783.0 - }, - "Miscellaneous - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Miscellaneous - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Miscellaneous - # prompt tokens": { - "description": "min=292.925, mean=292.925, max=292.925, sum=585.849 (2)", - "tab": "General information", - "score": 292.92464878671774 - }, - "Miscellaneous - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" - } - } - }, - { - "evaluation_name": "Moral Scenarios", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Moral Scenarios", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.465, - "details": { - 
"description": "min=0.465, mean=0.465, max=0.465, sum=0.93 (2)", - "tab": "Accuracy", - "Moral Disputes - Observed inference time (s)": { - "description": "min=0.383, mean=0.383, max=0.383, sum=0.766 (2)", - "tab": "Efficiency", - "score": 0.3832114066691757 - }, - "Moral Scenarios - Observed inference time (s)": { - "description": "min=0.4, mean=0.4, max=0.4, sum=0.801 (2)", - "tab": "Efficiency", - "score": 0.400396443478888 - }, - "Moral Disputes - # eval": { - "description": "min=346, mean=346, max=346, sum=692 (2)", - "tab": "General information", - "score": 346.0 - }, - "Moral Disputes - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Disputes - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Disputes - # prompt tokens": { - "description": "min=469.145, mean=469.145, max=469.145, sum=938.289 (2)", - "tab": "General information", - "score": 469.1445086705202 - }, - "Moral Disputes - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Moral Scenarios - # eval": { - "description": "min=895, mean=895, max=895, sum=1790 (2)", - "tab": "General information", - "score": 895.0 - }, - "Moral Scenarios - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Scenarios - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # prompt tokens": { - "description": "min=649.455, mean=649.455, max=649.455, sum=1298.909 (2)", - "tab": "General information", - "score": 649.454748603352 - }, - "Moral Scenarios - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" - } - } - }, - { - "evaluation_name": "Nutrition", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Nutrition", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.814, - "details": { - "description": "min=0.814, mean=0.814, max=0.814, sum=1.627 (2)", - "tab": "Accuracy", - "Nutrition - Observed inference time (s)": { - "description": "min=0.397, mean=0.397, max=0.397, sum=0.795 (2)", - "tab": "Efficiency", - "score": 0.39725586947272806 - }, - "Nutrition - # eval": { - "description": "min=306, mean=306, max=306, sum=612 (2)", - "tab": "General information", - "score": 306.0 - }, - "Nutrition - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Nutrition - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Nutrition - # prompt tokens": { - "description": "min=579.817, mean=579.817, max=579.817, sum=1159.634 (2)", - "tab": "General information", - "score": 579.8169934640523 - }, - "Nutrition - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": 
"General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" - } - } - }, - { - "evaluation_name": "Prehistory", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Prehistory", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.84, - "details": { - "description": "min=0.84, mean=0.84, max=0.84, sum=1.679 (2)", - "tab": "Accuracy", - "Prehistory - Observed inference time (s)": { - "description": "min=0.381, mean=0.381, max=0.381, sum=0.763 (2)", - "tab": "Efficiency", - "score": 0.3814176806697139 - }, - "Prehistory - # eval": { - "description": "min=324, mean=324, max=324, sum=648 (2)", - "tab": "General information", - "score": 324.0 - }, - "Prehistory - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Prehistory - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Prehistory - # prompt tokens": { - "description": "min=507.559, mean=507.559, max=507.559, sum=1015.117 (2)", - "tab": "General information", - "score": 507.55864197530866 - }, - "Prehistory - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" - } - } - }, - { - "evaluation_name": "Public Relations", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Public Relations", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.691, - "details": { - "description": "min=0.691, mean=0.691, max=0.691, sum=1.382 (2)", - "tab": "Accuracy", - "Public Relations - Observed inference time (s)": { - "description": "min=0.391, mean=0.391, max=0.391, sum=0.782 (2)", - "tab": "Efficiency", - "score": 0.3911652868444269 - }, - "Public Relations - # eval": { - "description": "min=110, mean=110, max=110, sum=220 (2)", - "tab": "General information", - "score": 110.0 - }, - "Public Relations - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Public Relations - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Public Relations - # prompt tokens": { - "description": "min=398.318, mean=398.318, max=398.318, sum=796.636 (2)", - "tab": "General information", - "score": 398.3181818181818 - }, - "Public Relations - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - 
"groups": "mmlu_public_relations" - } - } - }, - { - "evaluation_name": "Security Studies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Security Studies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.804, - "details": { - "description": "min=0.804, mean=0.804, max=0.804, sum=1.608 (2)", - "tab": "Accuracy", - "Security Studies - Observed inference time (s)": { - "description": "min=0.464, mean=0.464, max=0.464, sum=0.928 (2)", - "tab": "Efficiency", - "score": 0.46417581013270787 - }, - "Security Studies - # eval": { - "description": "min=245, mean=245, max=245, sum=490 (2)", - "tab": "General information", - "score": 245.0 - }, - "Security Studies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Security Studies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Security Studies - # prompt tokens": { - "description": "min=1157.473, mean=1157.473, max=1157.473, sum=2314.947 (2)", - "tab": "General information", - "score": 1157.4734693877551 - }, - "Security Studies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" - } - } - }, - { - "evaluation_name": "Sociology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Sociology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.896, - "details": { - "description": "min=0.896, mean=0.896, max=0.896, sum=1.791 (2)", - "tab": "Accuracy", - "Sociology - Observed inference time (s)": { - "description": "min=0.401, mean=0.401, max=0.401, sum=0.801 (2)", - "tab": "Efficiency", - "score": 0.4007088568673205 - }, - "Sociology - # eval": { - "description": "min=201, mean=201, max=201, sum=402 (2)", - "tab": "General information", - "score": 201.0 - }, - "Sociology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Sociology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Sociology - # prompt tokens": { - "description": "min=438.522, mean=438.522, max=438.522, sum=877.045 (2)", - "tab": "General information", - "score": 438.5223880597015 - }, - "Sociology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" - } - } - }, - { - "evaluation_name": "Virology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Virology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.566, - "details": { - "description": "min=0.566, mean=0.566, max=0.566, sum=1.133 (2)", - "tab": "Accuracy", - "Virology - Observed inference time (s)": { - "description": "min=0.386, mean=0.386, max=0.386, sum=0.771 (2)", - "tab": "Efficiency", - "score": 0.38554139022367545 - }, - "Virology - # eval": { - "description": "min=166, mean=166, max=166, sum=332 (2)", - "tab": "General information", - "score": 166.0 - }, - "Virology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Virology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Virology - # prompt tokens": { - "description": "min=336.09, mean=336.09, max=336.09, sum=672.181 (2)", - "tab": "General information", - "score": 336.0903614457831 - }, - "Virology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" - } - } - }, - { - "evaluation_name": "World Religions", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on World Religions", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.871, - "details": { - "description": "min=0.871, mean=0.871, max=0.871, sum=1.743 (2)", - "tab": "Accuracy", - "World Religions - Observed inference time (s)": { - "description": "min=0.382, mean=0.382, max=0.382, sum=0.765 (2)", - "tab": "Efficiency", - "score": 0.3823263380262587 - }, - "World Religions - # eval": { - "description": "min=171, mean=171, max=171, sum=342 (2)", - "tab": "General information", - "score": 171.0 - }, - "World Religions - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "World Religions - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "World Religions - # prompt tokens": { - "description": "min=268.561, mean=268.561, max=268.561, sum=537.123 (2)", - "tab": "General information", - "score": 268.56140350877195 - }, - "World Religions - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" - } - } - }, - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms 
on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.537, - "details": { - "tab": "Efficiency" - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -}
\ No newline at end of file
diff --git a/data/helm_mmlu/deepseek-ai/deepseek-llm-67b-chat/ee5528b4-b4a5-423f-8149-6c1dc4d2096d.json b/data/helm_mmlu/deepseek-ai/deepseek-llm-67b-chat/ee5528b4-b4a5-423f-8149-6c1dc4d2096d.json
deleted file mode 100644
index 7ec071041..000000000
--- a/data/helm_mmlu/deepseek-ai/deepseek-llm-67b-chat/ee5528b4-b4a5-423f-8149-6c1dc4d2096d.json
+++ /dev/null
@@ -1,3021 +0,0 @@
-{ - "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/deepseek-ai_deepseek-llm-67b-chat/1770835937.459157", - "retrieved_timestamp": "1770835937.459157", - "source_metadata": { - "source_name": "helm_mmlu", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "DeepSeek LLM Chat 67B", - "id": "deepseek-ai/deepseek-llm-67b-chat", - "developer": "deepseek-ai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "MMLU All Subjects", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU All Subjects", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.725, - "details": { - "description": "min=0.363, mean=0.725, max=0.964, sum=82.655 (114)", - "tab": "Accuracy", - "MMLU All Subjects - Observed inference time (s)": { - "description": "min=0.432, mean=0.591, max=1.828, sum=67.401 (114)", - "tab": "Efficiency", - "score": 0.5912370078072168 - }, - "MMLU All Subjects - # eval": { - "description": "min=100, mean=246.351, max=1534, sum=28084 (114)", - "tab": "General information", - "score": 246.35087719298247 - }, - "MMLU All Subjects - # train": { - "description": "min=5, mean=5, max=5, sum=570 (114)", - "tab": "General information", - "score": 5.0 - }, - "MMLU All Subjects - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (114)", - "tab": "General information", - "score": 0.0 - }, - "MMLU All Subjects - # prompt tokens": { - "description": "min=282.398, mean=644.941, max=2845.339, sum=73523.251 (114)", - "tab": "General information", - "score": 644.9407984438222 - }, - "MMLU All Subjects - # output tokens": { - "description": "min=1, mean=1, max=1, sum=114 (114)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", -
"high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] - } - } - }, - { - "evaluation_name": "Abstract Algebra", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Abstract Algebra", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.44, - "details": { - "description": "min=0.44, mean=0.44, max=0.44, sum=0.88 (2)", - "tab": "Accuracy", - "Abstract Algebra - Observed inference time (s)": { - "description": "min=0.485, mean=0.485, max=0.485, sum=0.97 (2)", - "tab": "Efficiency", - "score": 0.4850481009483337 - }, - "Abstract Algebra - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Abstract Algebra - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Abstract Algebra - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Abstract Algebra - # prompt tokens": { - 
"description": "min=382.07, mean=382.07, max=382.07, sum=764.14 (2)", - "tab": "General information", - "score": 382.07 - }, - "Abstract Algebra - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" - } - } - }, - { - "evaluation_name": "Anatomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Anatomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.667, - "details": { - "description": "min=0.667, mean=0.667, max=0.667, sum=1.333 (2)", - "tab": "Accuracy", - "Anatomy - Observed inference time (s)": { - "description": "min=0.451, mean=0.451, max=0.451, sum=0.903 (2)", - "tab": "Efficiency", - "score": 0.4513168688173647 - }, - "Anatomy - # eval": { - "description": "min=135, mean=135, max=135, sum=270 (2)", - "tab": "General information", - "score": 135.0 - }, - "Anatomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Anatomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Anatomy - # prompt tokens": { - "description": "min=363.059, mean=363.059, max=363.059, sum=726.119 (2)", - "tab": "General information", - "score": 363.05925925925925 - }, - "Anatomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" - } - } - }, - { - "evaluation_name": "College Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on College Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.363, - "details": { - "description": "min=0.363, mean=0.363, max=0.363, sum=0.725 (2)", - "tab": "Accuracy", - "College Chemistry - Observed inference time (s)": { - "description": "min=0.522, mean=0.522, max=0.522, sum=1.045 (2)", - "tab": "Efficiency", - "score": 0.5224089217185974 - }, - "College Biology - Observed inference time (s)": { - "description": "min=0.513, mean=0.513, max=0.513, sum=1.026 (2)", - "tab": "Efficiency", - "score": 0.5128465278281106 - }, - "College Computer Science - Observed inference time (s)": { - "description": "min=0.674, mean=0.674, max=0.674, sum=1.347 (2)", - "tab": "Efficiency", - "score": 0.6736601734161377 - }, - "College Mathematics - Observed inference time (s)": { - "description": "min=0.552, mean=0.552, max=0.552, sum=1.103 (2)", - "tab": "Efficiency", - "score": 0.5516978883743286 - }, - "College Medicine - Observed inference time (s)": { - "description": "min=0.515, mean=0.515, max=0.515, sum=1.03 (2)", - "tab": "Efficiency", - 
"score": 0.5147825513960999 - }, - "College Physics - Observed inference time (s)": { - "description": "min=0.558, mean=0.558, max=0.558, sum=1.116 (2)", - "tab": "Efficiency", - "score": 0.5581503288418639 - }, - "College Chemistry - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Chemistry - # prompt tokens": { - "description": "min=578.1, mean=578.1, max=578.1, sum=1156.2 (2)", - "tab": "General information", - "score": 578.1 - }, - "College Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Biology - # eval": { - "description": "min=144, mean=144, max=144, sum=288 (2)", - "tab": "General information", - "score": 144.0 - }, - "College Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # prompt tokens": { - "description": "min=502.611, mean=502.611, max=502.611, sum=1005.222 (2)", - "tab": "General information", - "score": 502.6111111111111 - }, - "College Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer Science - # prompt tokens": { - "description": "min=864.55, mean=864.55, max=864.55, sum=1729.1 (2)", - "tab": "General information", - "score": 864.55 - }, - "College Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Mathematics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # prompt tokens": { - "description": "min=630.13, mean=630.13, max=630.13, sum=1260.26 (2)", - "tab": "General information", - "score": 630.13 - }, - "College Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Medicine - # eval": { - "description": "min=173, mean=173, max=173, sum=346 (2)", - "tab": "General information", - "score": 173.0 - }, - "College Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Medicine - 
truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Medicine - # prompt tokens": { - "description": "min=538.613, mean=538.613, max=538.613, sum=1077.225 (2)", - "tab": "General information", - "score": 538.6127167630058 - }, - "College Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Physics - # eval": { - "description": "min=102, mean=102, max=102, sum=204 (2)", - "tab": "General information", - "score": 102.0 - }, - "College Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Physics - # prompt tokens": { - "description": "min=507.157, mean=507.157, max=507.157, sum=1014.314 (2)", - "tab": "General information", - "score": 507.15686274509807 - }, - "College Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" - } - } - }, - { - "evaluation_name": "Computer Security", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Computer Security", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.79, - "details": { - "description": "min=0.79, mean=0.79, max=0.79, sum=1.58 (2)", - "tab": "Accuracy", - "Computer Security - Observed inference time (s)": { - "description": "min=0.48, mean=0.48, max=0.48, sum=0.96 (2)", - "tab": "Efficiency", - "score": 0.48023970127105714 - }, - "Computer Security - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Computer Security - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Computer Security - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Computer Security - # prompt tokens": { - "description": "min=394.36, mean=394.36, max=394.36, sum=788.72 (2)", - "tab": "General information", - "score": 394.36 - }, - "Computer Security - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" - } - } - }, - { - "evaluation_name": "Econometrics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Econometrics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.553, - "details": { - "description": "min=0.553, mean=0.553, max=0.553, sum=1.105 (2)", - "tab": "Accuracy", - "Econometrics - Observed inference time (s)": { - "description": "min=0.551, mean=0.551, max=0.551, sum=1.102 (2)", - "tab": "Efficiency", - "score": 0.5508757557785302 - }, - "Econometrics - # eval": { - "description": "min=114, mean=114, max=114, sum=228 (2)", - "tab": "General information", - "score": 114.0 - }, - "Econometrics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Econometrics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Econometrics - # prompt tokens": { - "description": "min=646.667, mean=646.667, max=646.667, sum=1293.333 (2)", - "tab": "General information", - "score": 646.6666666666666 - }, - "Econometrics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" - } - } - }, - { - "evaluation_name": "Global Facts", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Global Facts", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.46, - "details": { - "description": "min=0.46, mean=0.46, max=0.46, sum=0.92 (2)", - "tab": "Accuracy", - "Global Facts - Observed inference time (s)": { - "description": "min=0.506, mean=0.506, max=0.506, sum=1.013 (2)", - "tab": "Efficiency", - "score": 0.5062541460990906 - }, - "Global Facts - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Global Facts - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Global Facts - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Global Facts - # prompt tokens": { - "description": "min=457.97, mean=457.97, max=457.97, sum=915.94 (2)", - "tab": "General information", - "score": 457.97 - }, - "Global Facts - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" - } - } - }, - { - "evaluation_name": "Jurisprudence", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Jurisprudence", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.852, - "details": { - "description": "min=0.852, mean=0.852, max=0.852, sum=1.704 (2)", - "tab": "Accuracy", - "Jurisprudence - Observed inference time (s)": { - "description": "min=0.477, mean=0.477, max=0.477, 
sum=0.954 (2)", - "tab": "Efficiency", - "score": 0.47704599963294136 - }, - "Jurisprudence - # eval": { - "description": "min=108, mean=108, max=108, sum=216 (2)", - "tab": "General information", - "score": 108.0 - }, - "Jurisprudence - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Jurisprudence - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Jurisprudence - # prompt tokens": { - "description": "min=415.861, mean=415.861, max=415.861, sum=831.722 (2)", - "tab": "General information", - "score": 415.8611111111111 - }, - "Jurisprudence - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" - } - } - }, - { - "evaluation_name": "Philosophy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Philosophy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.801, - "details": { - "description": "min=0.801, mean=0.801, max=0.801, sum=1.601 (2)", - "tab": "Accuracy", - "Philosophy - Observed inference time (s)": { - "description": "min=0.432, mean=0.432, max=0.432, sum=0.864 (2)", - "tab": "Efficiency", - "score": 0.43181402736921404 - }, - "Philosophy - # eval": { - "description": "min=311, mean=311, max=311, sum=622 (2)", - "tab": "General information", - "score": 311.0 - }, - "Philosophy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Philosophy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Philosophy - # prompt tokens": { - "description": "min=347.907, mean=347.907, max=347.907, sum=695.814 (2)", - "tab": "General information", - "score": 347.90675241157555 - }, - "Philosophy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" - } - } - }, - { - "evaluation_name": "Professional Psychology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Professional Psychology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.809, - "details": { - "description": "min=0.809, mean=0.809, max=0.809, sum=1.618 (2)", - "tab": "Accuracy", - "Professional Medicine - Observed inference time (s)": { - "description": "min=0.841, mean=0.841, max=0.841, sum=1.683 (2)", - "tab": "Efficiency", - "score": 0.8414969829952016 - }, - "Professional Accounting - Observed inference time (s)": { - "description": "min=0.641, mean=0.641, 
max=0.641, sum=1.282 (2)", - "tab": "Efficiency", - "score": 0.6411697010621957 - }, - "Professional Law - Observed inference time (s)": { - "description": "min=1.161, mean=1.161, max=1.161, sum=2.323 (2)", - "tab": "Efficiency", - "score": 1.1613836899263763 - }, - "Professional Psychology - Observed inference time (s)": { - "description": "min=0.532, mean=0.532, max=0.532, sum=1.064 (2)", - "tab": "Efficiency", - "score": 0.5318081830841264 - }, - "Professional Medicine - # eval": { - "description": "min=272, mean=272, max=272, sum=544 (2)", - "tab": "General information", - "score": 272.0 - }, - "Professional Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Medicine - # prompt tokens": { - "description": "min=1166.062, mean=1166.062, max=1166.062, sum=2332.125 (2)", - "tab": "General information", - "score": 1166.0625 - }, - "Professional Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Accounting - # eval": { - "description": "min=282, mean=282, max=282, sum=564 (2)", - "tab": "General information", - "score": 282.0 - }, - "Professional Accounting - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Accounting - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Accounting - # prompt tokens": { - "description": "min=759.752, mean=759.752, max=759.752, sum=1519.504 (2)", - "tab": "General information", - "score": 759.7517730496454 - }, - "Professional Accounting - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Law - # eval": { - "description": "min=1534, mean=1534, max=1534, sum=3068 (2)", - "tab": "General information", - "score": 1534.0 - }, - "Professional Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # prompt tokens": { - "description": "min=1711.27, mean=1711.27, max=1711.27, sum=3422.54 (2)", - "tab": "General information", - "score": 1711.2698826597132 - }, - "Professional Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Psychology - # eval": { - "description": "min=612, mean=612, max=612, sum=1224 (2)", - "tab": "General information", - "score": 612.0 - }, - "Professional Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # prompt tokens": { - "description": "min=599.475, mean=599.475, max=599.475, sum=1198.951 (2)", - "tab": "General information", - "score": 599.4754901960785 - }, - "Professional Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General 
information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" - } - } - }, - { - "evaluation_name": "Us Foreign Policy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Us Foreign Policy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.91, - "details": { - "description": "min=0.91, mean=0.91, max=0.91, sum=1.82 (2)", - "tab": "Accuracy", - "Us Foreign Policy - Observed inference time (s)": { - "description": "min=0.504, mean=0.504, max=0.504, sum=1.007 (2)", - "tab": "Efficiency", - "score": 0.5037446546554566 - }, - "Us Foreign Policy - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Us Foreign Policy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Us Foreign Policy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Us Foreign Policy - # prompt tokens": { - "description": "min=453.51, mean=453.51, max=453.51, sum=907.02 (2)", - "tab": "General information", - "score": 453.51 - }, - "Us Foreign Policy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" - } - } - }, - { - "evaluation_name": "Astronomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Astronomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.822, - "details": { - "description": "min=0.822, mean=0.822, max=0.822, sum=1.645 (2)", - "tab": "Accuracy", - "Astronomy - Observed inference time (s)": { - "description": "min=0.527, mean=0.527, max=0.527, sum=1.054 (2)", - "tab": "Efficiency", - "score": 0.5270162303196756 - }, - "Astronomy - # eval": { - "description": "min=152, mean=152, max=152, sum=304 (2)", - "tab": "General information", - "score": 152.0 - }, - "Astronomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Astronomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Astronomy - # prompt tokens": { - "description": "min=613.967, mean=613.967, max=613.967, sum=1227.934 (2)", - "tab": "General information", - "score": 613.9671052631579 - }, - "Astronomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": 
"test", - "groups": "mmlu_astronomy" - } - } - }, - { - "evaluation_name": "Business Ethics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Business Ethics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.86, - "details": { - "description": "min=0.86, mean=0.86, max=0.86, sum=1.72 (2)", - "tab": "Accuracy", - "Business Ethics - Observed inference time (s)": { - "description": "min=0.52, mean=0.52, max=0.52, sum=1.04 (2)", - "tab": "Efficiency", - "score": 0.5199160981178284 - }, - "Business Ethics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Business Ethics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Business Ethics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Business Ethics - # prompt tokens": { - "description": "min=575.68, mean=575.68, max=575.68, sum=1151.36 (2)", - "tab": "General information", - "score": 575.68 - }, - "Business Ethics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" - } - } - }, - { - "evaluation_name": "Clinical Knowledge", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Clinical Knowledge", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.785, - "details": { - "description": "min=0.785, mean=0.785, max=0.785, sum=1.57 (2)", - "tab": "Accuracy", - "Clinical Knowledge - Observed inference time (s)": { - "description": "min=0.49, mean=0.49, max=0.49, sum=0.979 (2)", - "tab": "Efficiency", - "score": 0.48968217777756023 - }, - "Clinical Knowledge - # eval": { - "description": "min=265, mean=265, max=265, sum=530 (2)", - "tab": "General information", - "score": 265.0 - }, - "Clinical Knowledge - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Clinical Knowledge - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Clinical Knowledge - # prompt tokens": { - "description": "min=436.902, mean=436.902, max=436.902, sum=873.804 (2)", - "tab": "General information", - "score": 436.90188679245284 - }, - "Clinical Knowledge - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" - } - } - }, - { - "evaluation_name": "Conceptual Physics", - "source_data": { - "dataset_name": 
"helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Conceptual Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.723, - "details": { - "description": "min=0.723, mean=0.723, max=0.723, sum=1.447 (2)", - "tab": "Accuracy", - "Conceptual Physics - Observed inference time (s)": { - "description": "min=0.442, mean=0.442, max=0.442, sum=0.883 (2)", - "tab": "Efficiency", - "score": 0.441747319444697 - }, - "Conceptual Physics - # eval": { - "description": "min=235, mean=235, max=235, sum=470 (2)", - "tab": "General information", - "score": 235.0 - }, - "Conceptual Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Conceptual Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Conceptual Physics - # prompt tokens": { - "description": "min=311.583, mean=311.583, max=311.583, sum=623.166 (2)", - "tab": "General information", - "score": 311.58297872340427 - }, - "Conceptual Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" - } - } - }, - { - "evaluation_name": "Electrical Engineering", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Electrical Engineering", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.669, - "details": { - "description": "min=0.669, mean=0.669, max=0.669, sum=1.338 (2)", - "tab": "Accuracy", - "Electrical Engineering - Observed inference time (s)": { - "description": "min=0.498, mean=0.498, max=0.498, sum=0.995 (2)", - "tab": "Efficiency", - "score": 0.4975001285816061 - }, - "Electrical Engineering - # eval": { - "description": "min=145, mean=145, max=145, sum=290 (2)", - "tab": "General information", - "score": 145.0 - }, - "Electrical Engineering - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Electrical Engineering - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Electrical Engineering - # prompt tokens": { - "description": "min=476.407, mean=476.407, max=476.407, sum=952.814 (2)", - "tab": "General information", - "score": 476.4068965517241 - }, - "Electrical Engineering - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" - } - } - }, - { - "evaluation_name": "Elementary Mathematics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - 
"url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Elementary Mathematics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.548, - "details": { - "description": "min=0.548, mean=0.548, max=0.548, sum=1.095 (2)", - "tab": "Accuracy", - "Elementary Mathematics - Observed inference time (s)": { - "description": "min=0.534, mean=0.534, max=0.534, sum=1.068 (2)", - "tab": "Efficiency", - "score": 0.5340847508617179 - }, - "Elementary Mathematics - # eval": { - "description": "min=378, mean=378, max=378, sum=756 (2)", - "tab": "General information", - "score": 378.0 - }, - "Elementary Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Elementary Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Elementary Mathematics - # prompt tokens": { - "description": "min=597.267, mean=597.267, max=597.267, sum=1194.534 (2)", - "tab": "General information", - "score": 597.2671957671957 - }, - "Elementary Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" - } - } - }, - { - "evaluation_name": "Formal Logic", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Formal Logic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.548, - "details": { - "description": "min=0.548, mean=0.548, max=0.548, sum=1.095 (2)", - "tab": "Accuracy", - "Formal Logic - Observed inference time (s)": { - "description": "min=0.555, mean=0.555, max=0.555, sum=1.11 (2)", - "tab": "Efficiency", - "score": 0.5548424853218926 - }, - "Formal Logic - # eval": { - "description": "min=126, mean=126, max=126, sum=252 (2)", - "tab": "General information", - "score": 126.0 - }, - "Formal Logic - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Formal Logic - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Formal Logic - # prompt tokens": { - "description": "min=655.698, mean=655.698, max=655.698, sum=1311.397 (2)", - "tab": "General information", - "score": 655.6984126984127 - }, - "Formal Logic - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" - } - } - }, - { - "evaluation_name": "High School World History", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on High School World History", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.911, - "details": { - "description": "min=0.911, mean=0.911, max=0.911, sum=1.823 (2)", - "tab": "Accuracy", - "High School Biology - Observed inference time (s)": { - "description": "min=0.507, mean=0.507, max=0.507, sum=1.014 (2)", - "tab": "Efficiency", - "score": 0.5071036392642606 - }, - "High School Chemistry - Observed inference time (s)": { - "description": "min=0.511, mean=0.511, max=0.511, sum=1.023 (2)", - "tab": "Efficiency", - "score": 0.5113655968839899 - }, - "High School Computer Science - Observed inference time (s)": { - "description": "min=0.707, mean=0.707, max=0.707, sum=1.415 (2)", - "tab": "Efficiency", - "score": 0.707279555797577 - }, - "High School European History - Observed inference time (s)": { - "description": "min=1.828, mean=1.828, max=1.828, sum=3.657 (2)", - "tab": "Efficiency", - "score": 1.8283701000791608 - }, - "High School Geography - Observed inference time (s)": { - "description": "min=0.483, mean=0.483, max=0.483, sum=0.967 (2)", - "tab": "Efficiency", - "score": 0.48332409545628713 - }, - "High School Government And Politics - Observed inference time (s)": { - "description": "min=0.483, mean=0.483, max=0.483, sum=0.967 (2)", - "tab": "Efficiency", - "score": 0.48336509719413795 - }, - "High School Macroeconomics - Observed inference time (s)": { - "description": "min=0.486, mean=0.486, max=0.486, sum=0.973 (2)", - "tab": "Efficiency", - "score": 0.4863407966418144 - }, - "High School Mathematics - Observed inference time (s)": { - "description": "min=0.531, mean=0.531, max=0.531, sum=1.062 (2)", - "tab": "Efficiency", - "score": 0.5308889477341263 - }, - "High School Microeconomics - Observed inference time (s)": { - "description": "min=0.503, mean=0.503, max=0.503, sum=1.006 (2)", - "tab": "Efficiency", - "score": 0.50309332478948 - }, - "High School Physics - Observed inference time (s)": { - "description": "min=0.533, mean=0.533, max=0.533, sum=1.066 (2)", - "tab": "Efficiency", - "score": 0.5327805051740432 - }, - "High School Psychology - Observed inference time (s)": { - "description": "min=0.519, mean=0.519, max=0.519, sum=1.039 (2)", - "tab": "Efficiency", - "score": 0.5194539997555794 - }, - "High School Statistics - Observed inference time (s)": { - "description": "min=0.662, mean=0.662, max=0.662, sum=1.323 (2)", - "tab": "Efficiency", - "score": 0.6615116441691363 - }, - "High School US History - Observed inference time (s)": { - "description": "min=1.442, mean=1.442, max=1.442, sum=2.885 (2)", - "tab": "Efficiency", - "score": 1.4423445556678025 - }, - "High School World History - Observed inference time (s)": { - "description": "min=1.033, mean=1.033, max=1.033, sum=2.067 (2)", - "tab": "Efficiency", - "score": 1.033272183897123 - }, - "High School Biology - # eval": { - "description": "min=310, mean=310, max=310, sum=620 (2)", - "tab": "General information", - "score": 310.0 - }, - "High School Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Biology - # prompt 
tokens": { - "description": "min=517.116, mean=517.116, max=517.116, sum=1034.232 (2)", - "tab": "General information", - "score": 517.1161290322581 - }, - "High School Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Chemistry - # eval": { - "description": "min=203, mean=203, max=203, sum=406 (2)", - "tab": "General information", - "score": 203.0 - }, - "High School Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # prompt tokens": { - "description": "min=527.305, mean=527.305, max=527.305, sum=1054.611 (2)", - "tab": "General information", - "score": 527.3054187192118 - }, - "High School Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "High School Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # prompt tokens": { - "description": "min=937.05, mean=937.05, max=937.05, sum=1874.1 (2)", - "tab": "General information", - "score": 937.05 - }, - "High School Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School European History - # eval": { - "description": "min=165, mean=165, max=165, sum=330 (2)", - "tab": "General information", - "score": 165.0 - }, - "High School European History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School European History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School European History - # prompt tokens": { - "description": "min=2845.339, mean=2845.339, max=2845.339, sum=5690.679 (2)", - "tab": "General information", - "score": 2845.339393939394 - }, - "High School European History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Geography - # eval": { - "description": "min=198, mean=198, max=198, sum=396 (2)", - "tab": "General information", - "score": 198.0 - }, - "High School Geography - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Geography - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Geography - # prompt tokens": { - "description": "min=397.934, mean=397.934, max=397.934, sum=795.869 (2)", - "tab": "General information", - "score": 397.9343434343434 - }, - "High School Geography - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Government And Politics - # eval": { - 
"description": "min=193, mean=193, max=193, sum=386 (2)", - "tab": "General information", - "score": 193.0 - }, - "High School Government And Politics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Government And Politics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # prompt tokens": { - "description": "min=485.57, mean=485.57, max=485.57, sum=971.14 (2)", - "tab": "General information", - "score": 485.5699481865285 - }, - "High School Government And Politics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Macroeconomics - # eval": { - "description": "min=390, mean=390, max=390, sum=780 (2)", - "tab": "General information", - "score": 390.0 - }, - "High School Macroeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Macroeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - # prompt tokens": { - "description": "min=396.095, mean=396.095, max=396.095, sum=792.19 (2)", - "tab": "General information", - "score": 396.0948717948718 - }, - "High School Macroeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Mathematics - # eval": { - "description": "min=270, mean=270, max=270, sum=540 (2)", - "tab": "General information", - "score": 270.0 - }, - "High School Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # prompt tokens": { - "description": "min=568.481, mean=568.481, max=568.481, sum=1136.963 (2)", - "tab": "General information", - "score": 568.4814814814815 - }, - "High School Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Microeconomics - # eval": { - "description": "min=238, mean=238, max=238, sum=476 (2)", - "tab": "General information", - "score": 238.0 - }, - "High School Microeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Microeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Microeconomics - # prompt tokens": { - "description": "min=416.857, mean=416.857, max=416.857, sum=833.714 (2)", - "tab": "General information", - "score": 416.85714285714283 - }, - "High School Microeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Physics - # eval": { - "description": "min=151, mean=151, max=151, sum=302 (2)", - "tab": "General information", - "score": 151.0 - }, - "High School Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Physics - truncated": { - "description": "min=0, 
mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # prompt tokens": { - "description": "min=590.212, mean=590.212, max=590.212, sum=1180.424 (2)", - "tab": "General information", - "score": 590.2119205298013 - }, - "High School Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Psychology - # eval": { - "description": "min=545, mean=545, max=545, sum=1090 (2)", - "tab": "General information", - "score": 545.0 - }, - "High School Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # prompt tokens": { - "description": "min=512.505, mean=512.505, max=512.505, sum=1025.009 (2)", - "tab": "General information", - "score": 512.5045871559633 - }, - "High School Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Statistics - # eval": { - "description": "min=216, mean=216, max=216, sum=432 (2)", - "tab": "General information", - "score": 216.0 - }, - "High School Statistics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Statistics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Statistics - # prompt tokens": { - "description": "min=861.625, mean=861.625, max=861.625, sum=1723.25 (2)", - "tab": "General information", - "score": 861.625 - }, - "High School Statistics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School US History - # eval": { - "description": "min=204, mean=204, max=204, sum=408 (2)", - "tab": "General information", - "score": 204.0 - }, - "High School US History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School US History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # prompt tokens": { - "description": "min=2259.147, mean=2259.147, max=2259.147, sum=4518.294 (2)", - "tab": "General information", - "score": 2259.1470588235293 - }, - "High School US History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School World History - # eval": { - "description": "min=237, mean=237, max=237, sum=474 (2)", - "tab": "General information", - "score": 237.0 - }, - "High School World History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School World History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School World History - # prompt tokens": { - "description": "min=1467.696, mean=1467.696, max=1467.696, sum=2935.392 (2)", - "tab": "General information", - "score": 1467.6962025316457 - }, - "High School World History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General 
information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" - } - } - }, - { - "evaluation_name": "Human Sexuality", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Human Sexuality", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.84, - "details": { - "description": "min=0.84, mean=0.84, max=0.84, sum=1.679 (2)", - "tab": "Accuracy", - "Human Aging - Observed inference time (s)": { - "description": "min=0.434, mean=0.434, max=0.434, sum=0.868 (2)", - "tab": "Efficiency", - "score": 0.4340778626668614 - }, - "Human Sexuality - Observed inference time (s)": { - "description": "min=0.499, mean=0.499, max=0.499, sum=0.999 (2)", - "tab": "Efficiency", - "score": 0.4992539391262841 - }, - "Human Aging - # eval": { - "description": "min=223, mean=223, max=223, sum=446 (2)", - "tab": "General information", - "score": 223.0 - }, - "Human Aging - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Aging - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Aging - # prompt tokens": { - "description": "min=327.825, mean=327.825, max=327.825, sum=655.65 (2)", - "tab": "General information", - "score": 327.82511210762334 - }, - "Human Aging - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Human Sexuality - # eval": { - "description": "min=131, mean=131, max=131, sum=262 (2)", - "tab": "General information", - "score": 131.0 - }, - "Human Sexuality - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Sexuality - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # prompt tokens": { - "description": "min=366.824, mean=366.824, max=366.824, sum=733.649 (2)", - "tab": "General information", - "score": 366.82442748091603 - }, - "Human Sexuality - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" - } - } - }, - { - "evaluation_name": "International Law", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on International Law", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.851, - "details": { - "description": "min=0.851, mean=0.851, max=0.851, sum=1.702 (2)", - "tab": "Accuracy", - "International Law - Observed inference time (s)": { - "description": "min=0.571, mean=0.571, max=0.571, 
sum=1.142 (2)", - "tab": "Efficiency", - "score": 0.5709604842603699 - }, - "International Law - # eval": { - "description": "min=121, mean=121, max=121, sum=242 (2)", - "tab": "General information", - "score": 121.0 - }, - "International Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "International Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "International Law - # prompt tokens": { - "description": "min=652.669, mean=652.669, max=652.669, sum=1305.339 (2)", - "tab": "General information", - "score": 652.6694214876034 - }, - "International Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" - } - } - }, - { - "evaluation_name": "Logical Fallacies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Logical Fallacies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.847, - "details": { - "description": "min=0.847, mean=0.847, max=0.847, sum=1.693 (2)", - "tab": "Accuracy", - "Logical Fallacies - Observed inference time (s)": { - "description": "min=0.49, mean=0.49, max=0.49, sum=0.98 (2)", - "tab": "Efficiency", - "score": 0.48975605028538616 - }, - "Logical Fallacies - # eval": { - "description": "min=163, mean=163, max=163, sum=326 (2)", - "tab": "General information", - "score": 163.0 - }, - "Logical Fallacies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Logical Fallacies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Logical Fallacies - # prompt tokens": { - "description": "min=452.098, mean=452.098, max=452.098, sum=904.196 (2)", - "tab": "General information", - "score": 452.09815950920245 - }, - "Logical Fallacies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" - } - } - }, - { - "evaluation_name": "Machine Learning", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Machine Learning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.562, - "details": { - "description": "min=0.562, mean=0.562, max=0.562, sum=1.125 (2)", - "tab": "Accuracy", - "Machine Learning - Observed inference time (s)": { - "description": "min=0.632, mean=0.632, max=0.632, sum=1.264 (2)", - "tab": "Efficiency", - "score": 0.6320873349905014 - }, - "Machine Learning - # eval": { - 
"description": "min=112, mean=112, max=112, sum=224 (2)", - "tab": "General information", - "score": 112.0 - }, - "Machine Learning - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Machine Learning - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Machine Learning - # prompt tokens": { - "description": "min=702.054, mean=702.054, max=702.054, sum=1404.107 (2)", - "tab": "General information", - "score": 702.0535714285714 - }, - "Machine Learning - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" - } - } - }, - { - "evaluation_name": "Management", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Management", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.903, - "details": { - "description": "min=0.903, mean=0.903, max=0.903, sum=1.806 (2)", - "tab": "Accuracy", - "Management - Observed inference time (s)": { - "description": "min=0.442, mean=0.442, max=0.442, sum=0.883 (2)", - "tab": "Efficiency", - "score": 0.4415167558540418 - }, - "Management - # eval": { - "description": "min=103, mean=103, max=103, sum=206 (2)", - "tab": "General information", - "score": 103.0 - }, - "Management - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Management - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Management - # prompt tokens": { - "description": "min=288.437, mean=288.437, max=288.437, sum=576.874 (2)", - "tab": "General information", - "score": 288.43689320388347 - }, - "Management - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" - } - } - }, - { - "evaluation_name": "Marketing", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Marketing", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.923, - "details": { - "description": "min=0.923, mean=0.923, max=0.923, sum=1.846 (2)", - "tab": "Accuracy", - "Marketing - Observed inference time (s)": { - "description": "min=0.489, mean=0.489, max=0.489, sum=0.979 (2)", - "tab": "Efficiency", - "score": 0.4894245363708235 - }, - "Marketing - # eval": { - "description": "min=234, mean=234, max=234, sum=468 (2)", - "tab": "General information", - "score": 234.0 - }, - "Marketing - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General 
information", - "score": 5.0 - }, - "Marketing - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Marketing - # prompt tokens": { - "description": "min=435.603, mean=435.603, max=435.603, sum=871.205 (2)", - "tab": "General information", - "score": 435.6025641025641 - }, - "Marketing - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" - } - } - }, - { - "evaluation_name": "Medical Genetics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Medical Genetics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.73, - "details": { - "description": "min=0.73, mean=0.73, max=0.73, sum=1.46 (2)", - "tab": "Accuracy", - "Medical Genetics - Observed inference time (s)": { - "description": "min=0.474, mean=0.474, max=0.474, sum=0.947 (2)", - "tab": "Efficiency", - "score": 0.47359968423843385 - }, - "Medical Genetics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Medical Genetics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Medical Genetics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Medical Genetics - # prompt tokens": { - "description": "min=357.07, mean=357.07, max=357.07, sum=714.14 (2)", - "tab": "General information", - "score": 357.07 - }, - "Medical Genetics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" - } - } - }, - { - "evaluation_name": "Miscellaneous", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Miscellaneous", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.904, - "details": { - "description": "min=0.904, mean=0.904, max=0.904, sum=1.808 (2)", - "tab": "Accuracy", - "Miscellaneous - Observed inference time (s)": { - "description": "min=0.435, mean=0.435, max=0.435, sum=0.871 (2)", - "tab": "Efficiency", - "score": 0.4352987403309361 - }, - "Miscellaneous - # eval": { - "description": "min=783, mean=783, max=783, sum=1566 (2)", - "tab": "General information", - "score": 783.0 - }, - "Miscellaneous - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Miscellaneous - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Miscellaneous - # 
prompt tokens": { - "description": "min=320.964, mean=320.964, max=320.964, sum=641.928 (2)", - "tab": "General information", - "score": 320.9642401021711 - }, - "Miscellaneous - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" - } - } - }, - { - "evaluation_name": "Moral Scenarios", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Moral Scenarios", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.544, - "details": { - "description": "min=0.544, mean=0.544, max=0.544, sum=1.088 (2)", - "tab": "Accuracy", - "Moral Disputes - Observed inference time (s)": { - "description": "min=0.491, mean=0.491, max=0.491, sum=0.983 (2)", - "tab": "Efficiency", - "score": 0.49129951827098867 - }, - "Moral Scenarios - Observed inference time (s)": { - "description": "min=0.583, mean=0.583, max=0.583, sum=1.165 (2)", - "tab": "Efficiency", - "score": 0.5826290319751761 - }, - "Moral Disputes - # eval": { - "description": "min=346, mean=346, max=346, sum=692 (2)", - "tab": "General information", - "score": 346.0 - }, - "Moral Disputes - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Disputes - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Disputes - # prompt tokens": { - "description": "min=497.379, mean=497.379, max=497.379, sum=994.757 (2)", - "tab": "General information", - "score": 497.37861271676303 - }, - "Moral Disputes - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Moral Scenarios - # eval": { - "description": "min=895, mean=895, max=895, sum=1790 (2)", - "tab": "General information", - "score": 895.0 - }, - "Moral Scenarios - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Scenarios - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # prompt tokens": { - "description": "min=688.891, mean=688.891, max=688.891, sum=1377.781 (2)", - "tab": "General information", - "score": 688.890502793296 - }, - "Moral Scenarios - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" - } - } - }, - { - "evaluation_name": "Nutrition", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Nutrition", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.781, - "details": { - "description": "min=0.781, mean=0.781, max=0.781, sum=1.562 (2)", - "tab": "Accuracy", - "Nutrition - Observed inference time (s)": { - "description": "min=0.548, mean=0.548, max=0.548, sum=1.096 (2)", - "tab": "Efficiency", - "score": 0.5477774073095882 - }, - "Nutrition - # eval": { - "description": "min=306, mean=306, max=306, sum=612 (2)", - "tab": "General information", - "score": 306.0 - }, - "Nutrition - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Nutrition - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Nutrition - # prompt tokens": { - "description": "min=619.314, mean=619.314, max=619.314, sum=1238.627 (2)", - "tab": "General information", - "score": 619.3137254901961 - }, - "Nutrition - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" - } - } - }, - { - "evaluation_name": "Prehistory", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Prehistory", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.858, - "details": { - "description": "min=0.858, mean=0.858, max=0.858, sum=1.716 (2)", - "tab": "Accuracy", - "Prehistory - Observed inference time (s)": { - "description": "min=0.521, mean=0.521, max=0.521, sum=1.042 (2)", - "tab": "Efficiency", - "score": 0.5209115015135871 - }, - "Prehistory - # eval": { - "description": "min=324, mean=324, max=324, sum=648 (2)", - "tab": "General information", - "score": 324.0 - }, - "Prehistory - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Prehistory - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Prehistory - # prompt tokens": { - "description": "min=554.775, mean=554.775, max=554.775, sum=1109.549 (2)", - "tab": "General information", - "score": 554.7746913580247 - }, - "Prehistory - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" - } - } - }, - { - "evaluation_name": "Public Relations", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Public Relations", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7, - "details": { - "description": "min=0.7, mean=0.7, max=0.7, sum=1.4 (2)", - "tab": "Accuracy", - "Public Relations - Observed inference time (s)": { - "description": "min=0.473, mean=0.473, max=0.473, 
sum=0.945 (2)", - "tab": "Efficiency", - "score": 0.4725117553364147 - }, - "Public Relations - # eval": { - "description": "min=110, mean=110, max=110, sum=220 (2)", - "tab": "General information", - "score": 110.0 - }, - "Public Relations - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Public Relations - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Public Relations - # prompt tokens": { - "description": "min=431.673, mean=431.673, max=431.673, sum=863.345 (2)", - "tab": "General information", - "score": 431.6727272727273 - }, - "Public Relations - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" - } - } - }, - { - "evaluation_name": "Security Studies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Security Studies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.796, - "details": { - "description": "min=0.796, mean=0.796, max=0.796, sum=1.592 (2)", - "tab": "Accuracy", - "Security Studies - Observed inference time (s)": { - "description": "min=0.909, mean=0.909, max=0.909, sum=1.819 (2)", - "tab": "Efficiency", - "score": 0.9094535496770119 - }, - "Security Studies - # eval": { - "description": "min=245, mean=245, max=245, sum=490 (2)", - "tab": "General information", - "score": 245.0 - }, - "Security Studies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Security Studies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Security Studies - # prompt tokens": { - "description": "min=1204.906, mean=1204.906, max=1204.906, sum=2409.812 (2)", - "tab": "General information", - "score": 1204.9061224489797 - }, - "Security Studies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" - } - } - }, - { - "evaluation_name": "Sociology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Sociology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.876, - "details": { - "description": "min=0.876, mean=0.876, max=0.876, sum=1.751 (2)", - "tab": "Accuracy", - "Sociology - Observed inference time (s)": { - "description": "min=0.502, mean=0.502, max=0.502, sum=1.003 (2)", - "tab": "Efficiency", - "score": 0.5015075396542525 - }, - "Sociology - # eval": { - "description": "min=201, mean=201, 
max=201, sum=402 (2)", - "tab": "General information", - "score": 201.0 - }, - "Sociology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Sociology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Sociology - # prompt tokens": { - "description": "min=457.751, mean=457.751, max=457.751, sum=915.502 (2)", - "tab": "General information", - "score": 457.7512437810945 - }, - "Sociology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" - } - } - }, - { - "evaluation_name": "Virology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Virology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.554, - "details": { - "description": "min=0.554, mean=0.554, max=0.554, sum=1.108 (2)", - "tab": "Accuracy", - "Virology - Observed inference time (s)": { - "description": "min=0.582, mean=0.582, max=0.582, sum=1.165 (2)", - "tab": "Efficiency", - "score": 0.5824309874729938 - }, - "Virology - # eval": { - "description": "min=166, mean=166, max=166, sum=332 (2)", - "tab": "General information", - "score": 166.0 - }, - "Virology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Virology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Virology - # prompt tokens": { - "description": "min=351.434, mean=351.434, max=351.434, sum=702.867 (2)", - "tab": "General information", - "score": 351.43373493975906 - }, - "Virology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" - } - } - }, - { - "evaluation_name": "World Religions", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on World Religions", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.865, - "details": { - "description": "min=0.865, mean=0.865, max=0.865, sum=1.731 (2)", - "tab": "Accuracy", - "World Religions - Observed inference time (s)": { - "description": "min=0.435, mean=0.435, max=0.435, sum=0.87 (2)", - "tab": "Efficiency", - "score": 0.434985329533181 - }, - "World Religions - # eval": { - "description": "min=171, mean=171, max=171, sum=342 (2)", - "tab": "General information", - "score": 171.0 - }, - "World Religions - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "World Religions - truncated": { - 
"description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "World Religions - # prompt tokens": { - "description": "min=282.398, mean=282.398, max=282.398, sum=564.795 (2)", - "tab": "General information", - "score": 282.39766081871346 - }, - "World Religions - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" - } - } - }, - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.387, - "details": { - "tab": "Efficiency" - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_mmlu/deepseek-ai/deepseek-v3/c97b0f33-eda0-4069-9ab6-f277c1f8e55b.json b/data/helm_mmlu/deepseek-ai/deepseek-v3/c97b0f33-eda0-4069-9ab6-f277c1f8e55b.json deleted file mode 100644 index 200a6e19c..000000000 --- a/data/helm_mmlu/deepseek-ai/deepseek-v3/c97b0f33-eda0-4069-9ab6-f277c1f8e55b.json +++ /dev/null @@ -1,3021 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/deepseek-ai_deepseek-v3/1770835937.459157", - "retrieved_timestamp": "1770835937.459157", - "source_metadata": { - "source_name": "helm_mmlu", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "DeepSeek v3", - "id": "deepseek-ai/deepseek-v3", - "developer": "deepseek-ai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "MMLU All Subjects", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU All Subjects", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.872, - "details": { - "description": "min=0.596, mean=0.872, max=0.979, sum=99.412 (114)", - "tab": "Accuracy", - "MMLU All Subjects - Observed inference time (s)": { - "description": "min=0.495, mean=1.354, max=6.344, sum=154.309 (114)", - "tab": "Efficiency", - "score": 1.353587049503403 - }, - "MMLU All Subjects - # eval": { - "description": "min=100, mean=246.351, max=1534, sum=28084 (114)", - "tab": "General information", - "score": 246.35087719298247 - }, - "MMLU All Subjects - # train": { - "description": "min=5, mean=5, max=5, sum=570 (114)", - "tab": "General information", - "score": 5.0 - }, - "MMLU All Subjects - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (114)", - "tab": "General information", - "score": 0.0 - }, - "MMLU All Subjects - # prompt tokens": { - "description": "min=268.918, mean=607.861, max=2773.188, sum=69296.195 (114)", - "tab": "General information", - "score": 607.8613565650774 - }, - "MMLU All 
Subjects - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (114)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] - } - } - }, - { - "evaluation_name": "Abstract Algebra", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on 
Abstract Algebra", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.84, - "details": { - "description": "min=0.84, mean=0.84, max=0.84, sum=1.68 (2)", - "tab": "Accuracy", - "Abstract Algebra - Observed inference time (s)": { - "description": "min=0.585, mean=0.585, max=0.585, sum=1.171 (2)", - "tab": "Efficiency", - "score": 0.5853858423233033 - }, - "Abstract Algebra - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Abstract Algebra - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Abstract Algebra - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Abstract Algebra - # prompt tokens": { - "description": "min=373.01, mean=373.01, max=373.01, sum=746.02 (2)", - "tab": "General information", - "score": 373.01 - }, - "Abstract Algebra - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" - } - } - }, - { - "evaluation_name": "Anatomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Anatomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.867, - "details": { - "description": "min=0.867, mean=0.867, max=0.867, sum=1.733 (2)", - "tab": "Accuracy", - "Anatomy - Observed inference time (s)": { - "description": "min=1.804, mean=1.804, max=1.804, sum=3.607 (2)", - "tab": "Efficiency", - "score": 1.8037012683020697 - }, - "Anatomy - # eval": { - "description": "min=135, mean=135, max=135, sum=270 (2)", - "tab": "General information", - "score": 135.0 - }, - "Anatomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Anatomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Anatomy - # prompt tokens": { - "description": "min=332.119, mean=332.119, max=332.119, sum=664.237 (2)", - "tab": "General information", - "score": 332.1185185185185 - }, - "Anatomy - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" - } - } - }, - { - "evaluation_name": "College Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on College Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.814, - "details": { - "description": "min=0.814, mean=0.814, max=0.814, sum=1.627 
(2)", - "tab": "Accuracy", - "College Chemistry - Observed inference time (s)": { - "description": "min=0.557, mean=0.557, max=0.557, sum=1.113 (2)", - "tab": "Efficiency", - "score": 0.5567307829856872 - }, - "College Biology - Observed inference time (s)": { - "description": "min=0.776, mean=0.776, max=0.776, sum=1.553 (2)", - "tab": "Efficiency", - "score": 0.7763584835661782 - }, - "College Computer Science - Observed inference time (s)": { - "description": "min=0.505, mean=0.505, max=0.505, sum=1.01 (2)", - "tab": "Efficiency", - "score": 0.5047655653953552 - }, - "College Mathematics - Observed inference time (s)": { - "description": "min=0.495, mean=0.495, max=0.495, sum=0.989 (2)", - "tab": "Efficiency", - "score": 0.4945454502105713 - }, - "College Medicine - Observed inference time (s)": { - "description": "min=1.811, mean=1.811, max=1.811, sum=3.623 (2)", - "tab": "Efficiency", - "score": 1.8114735322191535 - }, - "College Physics - Observed inference time (s)": { - "description": "min=6.344, mean=6.344, max=6.344, sum=12.687 (2)", - "tab": "Efficiency", - "score": 6.343635446885052 - }, - "College Chemistry - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Chemistry - # prompt tokens": { - "description": "min=541.32, mean=541.32, max=541.32, sum=1082.64 (2)", - "tab": "General information", - "score": 541.32 - }, - "College Chemistry - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # eval": { - "description": "min=144, mean=144, max=144, sum=288 (2)", - "tab": "General information", - "score": 144.0 - }, - "College Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # prompt tokens": { - "description": "min=456.201, mean=456.201, max=456.201, sum=912.403 (2)", - "tab": "General information", - "score": 456.2013888888889 - }, - "College Biology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer Science - # prompt tokens": { - "description": "min=828.34, mean=828.34, max=828.34, sum=1656.68 (2)", - "tab": "General information", - "score": 828.34 - }, - "College Computer Science - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - 
"College Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # prompt tokens": { - "description": "min=592.74, mean=592.74, max=592.74, sum=1185.48 (2)", - "tab": "General information", - "score": 592.74 - }, - "College Mathematics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Medicine - # eval": { - "description": "min=173, mean=173, max=173, sum=346 (2)", - "tab": "General information", - "score": 173.0 - }, - "College Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Medicine - # prompt tokens": { - "description": "min=486.971, mean=486.971, max=486.971, sum=973.942 (2)", - "tab": "General information", - "score": 486.97109826589593 - }, - "College Medicine - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Physics - # eval": { - "description": "min=102, mean=102, max=102, sum=204 (2)", - "tab": "General information", - "score": 102.0 - }, - "College Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Physics - # prompt tokens": { - "description": "min=492.804, mean=492.804, max=492.804, sum=985.608 (2)", - "tab": "General information", - "score": 492.80392156862746 - }, - "College Physics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" - } - } - }, - { - "evaluation_name": "Computer Security", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Computer Security", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.86, - "details": { - "description": "min=0.86, mean=0.86, max=0.86, sum=1.72 (2)", - "tab": "Accuracy", - "Computer Security - Observed inference time (s)": { - "description": "min=0.545, mean=0.545, max=0.545, sum=1.089 (2)", - "tab": "Efficiency", - "score": 0.5446710443496704 - }, - "Computer Security - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Computer Security - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Computer Security - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Computer Security - # prompt tokens": { - 
"description": "min=375.06, mean=375.06, max=375.06, sum=750.12 (2)", - "tab": "General information", - "score": 375.06 - }, - "Computer Security - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" - } - } - }, - { - "evaluation_name": "Econometrics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Econometrics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.746, - "details": { - "description": "min=0.746, mean=0.746, max=0.746, sum=1.491 (2)", - "tab": "Accuracy", - "Econometrics - Observed inference time (s)": { - "description": "min=0.554, mean=0.554, max=0.554, sum=1.107 (2)", - "tab": "Efficiency", - "score": 0.5537264849010267 - }, - "Econometrics - # eval": { - "description": "min=114, mean=114, max=114, sum=228 (2)", - "tab": "General information", - "score": 114.0 - }, - "Econometrics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Econometrics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Econometrics - # prompt tokens": { - "description": "min=613.535, mean=613.535, max=613.535, sum=1227.07 (2)", - "tab": "General information", - "score": 613.5350877192982 - }, - "Econometrics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" - } - } - }, - { - "evaluation_name": "Global Facts", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Global Facts", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.68, - "details": { - "description": "min=0.68, mean=0.68, max=0.68, sum=1.36 (2)", - "tab": "Accuracy", - "Global Facts - Observed inference time (s)": { - "description": "min=0.978, mean=0.978, max=0.978, sum=1.955 (2)", - "tab": "Efficiency", - "score": 0.9775782990455627 - }, - "Global Facts - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Global Facts - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Global Facts - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Global Facts - # prompt tokens": { - "description": "min=398.63, mean=398.63, max=398.63, sum=797.26 (2)", - "tab": "General information", - "score": 398.63 - }, - "Global Facts - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 
(2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" - } - } - }, - { - "evaluation_name": "Jurisprudence", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Jurisprudence", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.898, - "details": { - "description": "min=0.898, mean=0.898, max=0.898, sum=1.796 (2)", - "tab": "Accuracy", - "Jurisprudence - Observed inference time (s)": { - "description": "min=0.834, mean=0.834, max=0.834, sum=1.668 (2)", - "tab": "Efficiency", - "score": 0.8338986083313271 - }, - "Jurisprudence - # eval": { - "description": "min=108, mean=108, max=108, sum=216 (2)", - "tab": "General information", - "score": 108.0 - }, - "Jurisprudence - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Jurisprudence - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Jurisprudence - # prompt tokens": { - "description": "min=387.694, mean=387.694, max=387.694, sum=775.389 (2)", - "tab": "General information", - "score": 387.69444444444446 - }, - "Jurisprudence - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" - } - } - }, - { - "evaluation_name": "Philosophy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Philosophy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9, - "details": { - "description": "min=0.9, mean=0.9, max=0.9, sum=1.801 (2)", - "tab": "Accuracy", - "Philosophy - Observed inference time (s)": { - "description": "min=0.836, mean=0.836, max=0.836, sum=1.673 (2)", - "tab": "Efficiency", - "score": 0.836391413710125 - }, - "Philosophy - # eval": { - "description": "min=311, mean=311, max=311, sum=622 (2)", - "tab": "General information", - "score": 311.0 - }, - "Philosophy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Philosophy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Philosophy - # prompt tokens": { - "description": "min=323.569, mean=323.569, max=323.569, sum=647.138 (2)", - "tab": "General information", - "score": 323.56913183279744 - }, - "Philosophy - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": 
"mmlu_philosophy" - } - } - }, - { - "evaluation_name": "Professional Psychology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Professional Psychology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.887, - "details": { - "description": "min=0.887, mean=0.887, max=0.887, sum=1.775 (2)", - "tab": "Accuracy", - "Professional Medicine - Observed inference time (s)": { - "description": "min=0.635, mean=0.635, max=0.635, sum=1.269 (2)", - "tab": "Efficiency", - "score": 0.6345776915550232 - }, - "Professional Accounting - Observed inference time (s)": { - "description": "min=1.224, mean=1.224, max=1.224, sum=2.448 (2)", - "tab": "Efficiency", - "score": 1.2240875671941338 - }, - "Professional Law - Observed inference time (s)": { - "description": "min=0.707, mean=0.707, max=0.707, sum=1.413 (2)", - "tab": "Efficiency", - "score": 0.7066206168941911 - }, - "Professional Psychology - Observed inference time (s)": { - "description": "min=0.672, mean=0.672, max=0.672, sum=1.345 (2)", - "tab": "Efficiency", - "score": 0.6723053728053773 - }, - "Professional Medicine - # eval": { - "description": "min=272, mean=272, max=272, sum=544 (2)", - "tab": "General information", - "score": 272.0 - }, - "Professional Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Medicine - # prompt tokens": { - "description": "min=1052.765, mean=1052.765, max=1052.765, sum=2105.529 (2)", - "tab": "General information", - "score": 1052.764705882353 - }, - "Professional Medicine - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Accounting - # eval": { - "description": "min=282, mean=282, max=282, sum=564 (2)", - "tab": "General information", - "score": 282.0 - }, - "Professional Accounting - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Accounting - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Accounting - # prompt tokens": { - "description": "min=659.613, mean=659.613, max=659.613, sum=1319.227 (2)", - "tab": "General information", - "score": 659.613475177305 - }, - "Professional Accounting - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # eval": { - "description": "min=1534, mean=1534, max=1534, sum=3068 (2)", - "tab": "General information", - "score": 1534.0 - }, - "Professional Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # prompt tokens": { - "description": "min=1629.421, mean=1629.421, max=1629.421, sum=3258.842 (2)", - "tab": "General information", - "score": 1629.4211212516298 - 
}, - "Professional Law - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # eval": { - "description": "min=612, mean=612, max=612, sum=1224 (2)", - "tab": "General information", - "score": 612.0 - }, - "Professional Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # prompt tokens": { - "description": "min=574.508, mean=574.508, max=574.508, sum=1149.016 (2)", - "tab": "General information", - "score": 574.5081699346405 - }, - "Professional Psychology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" - } - } - }, - { - "evaluation_name": "Us Foreign Policy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Us Foreign Policy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.92, - "details": { - "description": "min=0.92, mean=0.92, max=0.92, sum=1.84 (2)", - "tab": "Accuracy", - "Us Foreign Policy - Observed inference time (s)": { - "description": "min=0.578, mean=0.578, max=0.578, sum=1.156 (2)", - "tab": "Efficiency", - "score": 0.5778071475028992 - }, - "Us Foreign Policy - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Us Foreign Policy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Us Foreign Policy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Us Foreign Policy - # prompt tokens": { - "description": "min=426.43, mean=426.43, max=426.43, sum=852.86 (2)", - "tab": "General information", - "score": 426.43 - }, - "Us Foreign Policy - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" - } - } - }, - { - "evaluation_name": "Astronomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Astronomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.921, - "details": { - "description": "min=0.921, mean=0.921, max=0.921, sum=1.842 (2)", - "tab": "Accuracy", - "Astronomy - Observed inference time (s)": { - "description": "min=0.681, mean=0.681, max=0.681, sum=1.363 (2)", - "tab": 
"Efficiency", - "score": 0.6812541327978435 - }, - "Astronomy - # eval": { - "description": "min=152, mean=152, max=152, sum=304 (2)", - "tab": "General information", - "score": 152.0 - }, - "Astronomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Astronomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Astronomy - # prompt tokens": { - "description": "min=575.836, mean=575.836, max=575.836, sum=1151.671 (2)", - "tab": "General information", - "score": 575.8355263157895 - }, - "Astronomy - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" - } - } - }, - { - "evaluation_name": "Business Ethics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Business Ethics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.89, - "details": { - "description": "min=0.89, mean=0.89, max=0.89, sum=1.78 (2)", - "tab": "Accuracy", - "Business Ethics - Observed inference time (s)": { - "description": "min=4.691, mean=4.691, max=4.691, sum=9.381 (2)", - "tab": "Efficiency", - "score": 4.690641319751739 - }, - "Business Ethics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Business Ethics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Business Ethics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Business Ethics - # prompt tokens": { - "description": "min=550.46, mean=550.46, max=550.46, sum=1100.92 (2)", - "tab": "General information", - "score": 550.46 - }, - "Business Ethics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" - } - } - }, - { - "evaluation_name": "Clinical Knowledge", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Clinical Knowledge", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.913, - "details": { - "description": "min=0.913, mean=0.913, max=0.913, sum=1.826 (2)", - "tab": "Accuracy", - "Clinical Knowledge - Observed inference time (s)": { - "description": "min=0.906, mean=0.906, max=0.906, sum=1.812 (2)", - "tab": "Efficiency", - "score": 0.9061050837894655 - }, - "Clinical Knowledge - # eval": { - "description": "min=265, mean=265, max=265, sum=530 (2)", - "tab": "General information", - "score": 265.0 - }, 
- "Clinical Knowledge - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Clinical Knowledge - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Clinical Knowledge - # prompt tokens": { - "description": "min=387.449, mean=387.449, max=387.449, sum=774.898 (2)", - "tab": "General information", - "score": 387.4490566037736 - }, - "Clinical Knowledge - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" - } - } - }, - { - "evaluation_name": "Conceptual Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Conceptual Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.94, - "details": { - "description": "min=0.94, mean=0.94, max=0.94, sum=1.881 (2)", - "tab": "Accuracy", - "Conceptual Physics - Observed inference time (s)": { - "description": "min=0.627, mean=0.627, max=0.627, sum=1.253 (2)", - "tab": "Efficiency", - "score": 0.6267383788494354 - }, - "Conceptual Physics - # eval": { - "description": "min=235, mean=235, max=235, sum=470 (2)", - "tab": "General information", - "score": 235.0 - }, - "Conceptual Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Conceptual Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Conceptual Physics - # prompt tokens": { - "description": "min=300.591, mean=300.591, max=300.591, sum=601.183 (2)", - "tab": "General information", - "score": 300.59148936170214 - }, - "Conceptual Physics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" - } - } - }, - { - "evaluation_name": "Electrical Engineering", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Electrical Engineering", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.869, - "details": { - "description": "min=0.869, mean=0.869, max=0.869, sum=1.738 (2)", - "tab": "Accuracy", - "Electrical Engineering - Observed inference time (s)": { - "description": "min=2.459, mean=2.459, max=2.459, sum=4.918 (2)", - "tab": "Efficiency", - "score": 2.4591504623150002 - }, - "Electrical Engineering - # eval": { - "description": "min=145, mean=145, max=145, sum=290 (2)", - "tab": "General information", - "score": 145.0 - }, - "Electrical Engineering - # train": { - "description": "min=5, mean=5, 
max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Electrical Engineering - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Electrical Engineering - # prompt tokens": { - "description": "min=431.91, mean=431.91, max=431.91, sum=863.821 (2)", - "tab": "General information", - "score": 431.9103448275862 - }, - "Electrical Engineering - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" - } - } - }, - { - "evaluation_name": "Elementary Mathematics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Elementary Mathematics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.942, - "details": { - "description": "min=0.942, mean=0.942, max=0.942, sum=1.884 (2)", - "tab": "Accuracy", - "Elementary Mathematics - Observed inference time (s)": { - "description": "min=1.651, mean=1.651, max=1.651, sum=3.301 (2)", - "tab": "Efficiency", - "score": 1.650515148879359 - }, - "Elementary Mathematics - # eval": { - "description": "min=378, mean=378, max=378, sum=756 (2)", - "tab": "General information", - "score": 378.0 - }, - "Elementary Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Elementary Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Elementary Mathematics - # prompt tokens": { - "description": "min=531.693, mean=531.693, max=531.693, sum=1063.386 (2)", - "tab": "General information", - "score": 531.6931216931217 - }, - "Elementary Mathematics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" - } - } - }, - { - "evaluation_name": "Formal Logic", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Formal Logic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.77, - "details": { - "description": "min=0.77, mean=0.77, max=0.77, sum=1.54 (2)", - "tab": "Accuracy", - "Formal Logic - Observed inference time (s)": { - "description": "min=0.513, mean=0.513, max=0.513, sum=1.026 (2)", - "tab": "Efficiency", - "score": 0.5130742864003257 - }, - "Formal Logic - # eval": { - "description": "min=126, mean=126, max=126, sum=252 (2)", - "tab": "General information", - "score": 126.0 - }, - "Formal Logic - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 
5.0 - }, - "Formal Logic - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Formal Logic - # prompt tokens": { - "description": "min=604.119, mean=604.119, max=604.119, sum=1208.238 (2)", - "tab": "General information", - "score": 604.1190476190476 - }, - "Formal Logic - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" - } - } - }, - { - "evaluation_name": "High School World History", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on High School World History", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.928, - "details": { - "description": "min=0.928, mean=0.928, max=0.928, sum=1.857 (2)", - "tab": "Accuracy", - "High School Biology - Observed inference time (s)": { - "description": "min=2.647, mean=2.647, max=2.647, sum=5.294 (2)", - "tab": "Efficiency", - "score": 2.6472030393538937 - }, - "High School Chemistry - Observed inference time (s)": { - "description": "min=3.847, mean=3.847, max=3.847, sum=7.695 (2)", - "tab": "Efficiency", - "score": 3.8474940337571018 - }, - "High School Computer Science - Observed inference time (s)": { - "description": "min=2.761, mean=2.761, max=2.761, sum=5.523 (2)", - "tab": "Efficiency", - "score": 2.7613840389251707 - }, - "High School European History - Observed inference time (s)": { - "description": "min=1.944, mean=1.944, max=1.944, sum=3.888 (2)", - "tab": "Efficiency", - "score": 1.9442455436244155 - }, - "High School Geography - Observed inference time (s)": { - "description": "min=0.607, mean=0.607, max=0.607, sum=1.215 (2)", - "tab": "Efficiency", - "score": 0.6073213755482375 - }, - "High School Government And Politics - Observed inference time (s)": { - "description": "min=2.403, mean=2.403, max=2.403, sum=4.805 (2)", - "tab": "Efficiency", - "score": 2.4025608480285485 - }, - "High School Macroeconomics - Observed inference time (s)": { - "description": "min=0.654, mean=0.654, max=0.654, sum=1.308 (2)", - "tab": "Efficiency", - "score": 0.6539444972307255 - }, - "High School Mathematics - Observed inference time (s)": { - "description": "min=2.285, mean=2.285, max=2.285, sum=4.57 (2)", - "tab": "Efficiency", - "score": 2.285083364557337 - }, - "High School Microeconomics - Observed inference time (s)": { - "description": "min=1.265, mean=1.265, max=1.265, sum=2.531 (2)", - "tab": "Efficiency", - "score": 1.2653034544792496 - }, - "High School Physics - Observed inference time (s)": { - "description": "min=1.036, mean=1.036, max=1.036, sum=2.072 (2)", - "tab": "Efficiency", - "score": 1.0361600064283965 - }, - "High School Psychology - Observed inference time (s)": { - "description": "min=1.658, mean=1.658, max=1.658, sum=3.315 (2)", - "tab": "Efficiency", - "score": 1.6576398372650147 - }, - "High School Statistics - Observed inference time (s)": { - "description": "min=0.513, mean=0.513, max=0.513, sum=1.027 (2)", - "tab": "Efficiency", - "score": 0.5133153398831686 - }, - "High School US History - Observed 
inference time (s)": { - "description": "min=0.791, mean=0.791, max=0.791, sum=1.582 (2)", - "tab": "Efficiency", - "score": 0.7908881224837958 - }, - "High School World History - Observed inference time (s)": { - "description": "min=1.65, mean=1.65, max=1.65, sum=3.301 (2)", - "tab": "Efficiency", - "score": 1.6504118030081318 - }, - "High School Biology - # eval": { - "description": "min=310, mean=310, max=310, sum=620 (2)", - "tab": "General information", - "score": 310.0 - }, - "High School Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Biology - # prompt tokens": { - "description": "min=505.561, mean=505.561, max=505.561, sum=1011.123 (2)", - "tab": "General information", - "score": 505.56129032258065 - }, - "High School Biology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # eval": { - "description": "min=203, mean=203, max=203, sum=406 (2)", - "tab": "General information", - "score": 203.0 - }, - "High School Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # prompt tokens": { - "description": "min=479.32, mean=479.32, max=479.32, sum=958.64 (2)", - "tab": "General information", - "score": 479.320197044335 - }, - "High School Chemistry - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "High School Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # prompt tokens": { - "description": "min=871.42, mean=871.42, max=871.42, sum=1742.84 (2)", - "tab": "General information", - "score": 871.42 - }, - "High School Computer Science - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School European History - # eval": { - "description": "min=165, mean=165, max=165, sum=330 (2)", - "tab": "General information", - "score": 165.0 - }, - "High School European History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School European History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School European History - # prompt tokens": { - "description": "min=2773.188, mean=2773.188, max=2773.188, sum=5546.376 (2)", - "tab": "General information", - "score": 2773.1878787878786 - }, - "High School European History - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Geography - # eval": { - 
"description": "min=198, mean=198, max=198, sum=396 (2)", - "tab": "General information", - "score": 198.0 - }, - "High School Geography - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Geography - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Geography - # prompt tokens": { - "description": "min=369.53, mean=369.53, max=369.53, sum=739.061 (2)", - "tab": "General information", - "score": 369.530303030303 - }, - "High School Geography - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # eval": { - "description": "min=193, mean=193, max=193, sum=386 (2)", - "tab": "General information", - "score": 193.0 - }, - "High School Government And Politics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Government And Politics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # prompt tokens": { - "description": "min=463.767, mean=463.767, max=463.767, sum=927.534 (2)", - "tab": "General information", - "score": 463.76683937823833 - }, - "High School Government And Politics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - # eval": { - "description": "min=390, mean=390, max=390, sum=780 (2)", - "tab": "General information", - "score": 390.0 - }, - "High School Macroeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Macroeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - # prompt tokens": { - "description": "min=370.418, mean=370.418, max=370.418, sum=740.836 (2)", - "tab": "General information", - "score": 370.4179487179487 - }, - "High School Macroeconomics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # eval": { - "description": "min=270, mean=270, max=270, sum=540 (2)", - "tab": "General information", - "score": 270.0 - }, - "High School Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # prompt tokens": { - "description": "min=520.57, mean=520.57, max=520.57, sum=1041.141 (2)", - "tab": "General information", - "score": 520.5703703703704 - }, - "High School Mathematics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Microeconomics - # eval": { - "description": "min=238, mean=238, max=238, sum=476 (2)", - "tab": "General information", - "score": 238.0 - }, - "High School Microeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Microeconomics - truncated": { - "description": 
"min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Microeconomics - # prompt tokens": { - "description": "min=399.782, mean=399.782, max=399.782, sum=799.563 (2)", - "tab": "General information", - "score": 399.781512605042 - }, - "High School Microeconomics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # eval": { - "description": "min=151, mean=151, max=151, sum=302 (2)", - "tab": "General information", - "score": 151.0 - }, - "High School Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # prompt tokens": { - "description": "min=559.967, mean=559.967, max=559.967, sum=1119.934 (2)", - "tab": "General information", - "score": 559.9668874172186 - }, - "High School Physics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # eval": { - "description": "min=545, mean=545, max=545, sum=1090 (2)", - "tab": "General information", - "score": 545.0 - }, - "High School Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # prompt tokens": { - "description": "min=480.22, mean=480.22, max=480.22, sum=960.44 (2)", - "tab": "General information", - "score": 480.2201834862385 - }, - "High School Psychology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Statistics - # eval": { - "description": "min=216, mean=216, max=216, sum=432 (2)", - "tab": "General information", - "score": 216.0 - }, - "High School Statistics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Statistics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Statistics - # prompt tokens": { - "description": "min=796.333, mean=796.333, max=796.333, sum=1592.667 (2)", - "tab": "General information", - "score": 796.3333333333334 - }, - "High School Statistics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # eval": { - "description": "min=204, mean=204, max=204, sum=408 (2)", - "tab": "General information", - "score": 204.0 - }, - "High School US History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School US History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # prompt tokens": { - "description": "min=2202.103, mean=2202.103, max=2202.103, sum=4404.206 (2)", - "tab": "General information", - "score": 2202.1029411764707 - }, - "High School US History - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - 
"score": 0.0 - }, - "High School World History - # eval": { - "description": "min=237, mean=237, max=237, sum=474 (2)", - "tab": "General information", - "score": 237.0 - }, - "High School World History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School World History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School World History - # prompt tokens": { - "description": "min=1403.051, mean=1403.051, max=1403.051, sum=2806.101 (2)", - "tab": "General information", - "score": 1403.0506329113923 - }, - "High School World History - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" - } - } - }, - { - "evaluation_name": "Human Sexuality", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Human Sexuality", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.924, - "details": { - "description": "min=0.924, mean=0.924, max=0.924, sum=1.847 (2)", - "tab": "Accuracy", - "Human Aging - Observed inference time (s)": { - "description": "min=0.553, mean=0.553, max=0.553, sum=1.106 (2)", - "tab": "Efficiency", - "score": 0.5531257503235821 - }, - "Human Sexuality - Observed inference time (s)": { - "description": "min=0.511, mean=0.511, max=0.511, sum=1.022 (2)", - "tab": "Efficiency", - "score": 0.5109815524734613 - }, - "Human Aging - # eval": { - "description": "min=223, mean=223, max=223, sum=446 (2)", - "tab": "General information", - "score": 223.0 - }, - "Human Aging - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Aging - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Aging - # prompt tokens": { - "description": "min=315.26, mean=315.26, max=315.26, sum=630.52 (2)", - "tab": "General information", - "score": 315.26008968609864 - }, - "Human Aging - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # eval": { - "description": "min=131, mean=131, max=131, sum=262 (2)", - "tab": "General information", - "score": 131.0 - }, - "Human Sexuality - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Sexuality - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # prompt tokens": { - "description": "min=341.29, mean=341.29, max=341.29, sum=682.58 (2)", - "tab": "General information", - "score": 341.29007633587787 - }, - "Human Sexuality - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "human_sexuality", - "method": 
"multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" - } - } - }, - { - "evaluation_name": "International Law", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on International Law", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.95, - "details": { - "description": "min=0.95, mean=0.95, max=0.95, sum=1.901 (2)", - "tab": "Accuracy", - "International Law - Observed inference time (s)": { - "description": "min=0.886, mean=0.886, max=0.886, sum=1.772 (2)", - "tab": "Efficiency", - "score": 0.8861682651456723 - }, - "International Law - # eval": { - "description": "min=121, mean=121, max=121, sum=242 (2)", - "tab": "General information", - "score": 121.0 - }, - "International Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "International Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "International Law - # prompt tokens": { - "description": "min=639.339, mean=639.339, max=639.339, sum=1278.678 (2)", - "tab": "General information", - "score": 639.3388429752066 - }, - "International Law - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" - } - } - }, - { - "evaluation_name": "Logical Fallacies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Logical Fallacies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.914, - "details": { - "description": "min=0.914, mean=0.914, max=0.914, sum=1.828 (2)", - "tab": "Accuracy", - "Logical Fallacies - Observed inference time (s)": { - "description": "min=0.919, mean=0.919, max=0.919, sum=1.838 (2)", - "tab": "Efficiency", - "score": 0.9191862732354849 - }, - "Logical Fallacies - # eval": { - "description": "min=163, mean=163, max=163, sum=326 (2)", - "tab": "General information", - "score": 163.0 - }, - "Logical Fallacies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Logical Fallacies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Logical Fallacies - # prompt tokens": { - "description": "min=442.239, mean=442.239, max=442.239, sum=884.479 (2)", - "tab": "General information", - "score": 442.23926380368096 - }, - "Logical Fallacies - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" - } - } - }, - { - 
"evaluation_name": "Machine Learning", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Machine Learning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.786, - "details": { - "description": "min=0.786, mean=0.786, max=0.786, sum=1.571 (2)", - "tab": "Accuracy", - "Machine Learning - Observed inference time (s)": { - "description": "min=0.518, mean=0.518, max=0.518, sum=1.036 (2)", - "tab": "Efficiency", - "score": 0.5179938631398338 - }, - "Machine Learning - # eval": { - "description": "min=112, mean=112, max=112, sum=224 (2)", - "tab": "General information", - "score": 112.0 - }, - "Machine Learning - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Machine Learning - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Machine Learning - # prompt tokens": { - "description": "min=666.277, mean=666.277, max=666.277, sum=1332.554 (2)", - "tab": "General information", - "score": 666.2767857142857 - }, - "Machine Learning - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" - } - } - }, - { - "evaluation_name": "Management", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Management", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.903, - "details": { - "description": "min=0.903, mean=0.903, max=0.903, sum=1.806 (2)", - "tab": "Accuracy", - "Management - Observed inference time (s)": { - "description": "min=4.248, mean=4.248, max=4.248, sum=8.497 (2)", - "tab": "Efficiency", - "score": 4.248399836345784 - }, - "Management - # eval": { - "description": "min=103, mean=103, max=103, sum=206 (2)", - "tab": "General information", - "score": 103.0 - }, - "Management - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Management - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Management - # prompt tokens": { - "description": "min=277.379, mean=277.379, max=277.379, sum=554.757 (2)", - "tab": "General information", - "score": 277.378640776699 - }, - "Management - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" - } - } - }, - { - "evaluation_name": "Marketing", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Marketing", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.949, - "details": { - "description": "min=0.949, mean=0.949, max=0.949, sum=1.897 (2)", - "tab": "Accuracy", - "Marketing - Observed inference time (s)": { - "description": "min=1.645, mean=1.645, max=1.645, sum=3.29 (2)", - "tab": "Efficiency", - "score": 1.6448312304977677 - }, - "Marketing - # eval": { - "description": "min=234, mean=234, max=234, sum=468 (2)", - "tab": "General information", - "score": 234.0 - }, - "Marketing - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Marketing - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Marketing - # prompt tokens": { - "description": "min=398.675, mean=398.675, max=398.675, sum=797.35 (2)", - "tab": "General information", - "score": 398.6752136752137 - }, - "Marketing - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" - } - } - }, - { - "evaluation_name": "Medical Genetics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Medical Genetics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.96, - "details": { - "description": "min=0.96, mean=0.96, max=0.96, sum=1.92 (2)", - "tab": "Accuracy", - "Medical Genetics - Observed inference time (s)": { - "description": "min=0.527, mean=0.527, max=0.527, sum=1.054 (2)", - "tab": "Efficiency", - "score": 0.5272433400154114 - }, - "Medical Genetics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Medical Genetics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Medical Genetics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Medical Genetics - # prompt tokens": { - "description": "min=328.48, mean=328.48, max=328.48, sum=656.96 (2)", - "tab": "General information", - "score": 328.48 - }, - "Medical Genetics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" - } - } - }, - { - "evaluation_name": "Miscellaneous", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Miscellaneous", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.949, - "details": { - "description": "min=0.949, mean=0.949, max=0.949, sum=1.898 (2)", - "tab": "Accuracy", - "Miscellaneous - Observed inference time (s)": { - "description": "min=2.642, mean=2.642, max=2.642, sum=5.284 (2)", - "tab": "Efficiency", - "score": 2.6419809954681006 - }, - "Miscellaneous - # eval": { - "description": "min=783, mean=783, max=783, sum=1566 (2)", - "tab": "General information", - "score": 783.0 - }, - "Miscellaneous - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Miscellaneous - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Miscellaneous - # prompt tokens": { - "description": "min=296.626, mean=296.626, max=296.626, sum=593.252 (2)", - "tab": "General information", - "score": 296.6257982120051 - }, - "Miscellaneous - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" - } - } - }, - { - "evaluation_name": "Moral Scenarios", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Moral Scenarios", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.808, - "details": { - "description": "min=0.808, mean=0.808, max=0.808, sum=1.616 (2)", - "tab": "Accuracy", - "Moral Disputes - Observed inference time (s)": { - "description": "min=0.637, mean=0.637, max=0.637, sum=1.275 (2)", - "tab": "Efficiency", - "score": 0.6374224183187319 - }, - "Moral Scenarios - Observed inference time (s)": { - "description": "min=0.624, mean=0.624, max=0.624, sum=1.247 (2)", - "tab": "Efficiency", - "score": 0.6235519771469372 - }, - "Moral Disputes - # eval": { - "description": "min=346, mean=346, max=346, sum=692 (2)", - "tab": "General information", - "score": 346.0 - }, - "Moral Disputes - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Disputes - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Disputes - # prompt tokens": { - "description": "min=477.78, mean=477.78, max=477.78, sum=955.561 (2)", - "tab": "General information", - "score": 477.78034682080926 - }, - "Moral Disputes - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # eval": { - "description": "min=895, mean=895, max=895, sum=1790 (2)", - "tab": "General information", - "score": 895.0 - }, - "Moral Scenarios - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Scenarios - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # prompt tokens": { - "description": "min=662.517, mean=662.517, 
max=662.517, sum=1325.035 (2)", - "tab": "General information", - "score": 662.5173184357542 - }, - "Moral Scenarios - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" - } - } - }, - { - "evaluation_name": "Nutrition", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Nutrition", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.918, - "details": { - "description": "min=0.918, mean=0.918, max=0.918, sum=1.837 (2)", - "tab": "Accuracy", - "Nutrition - Observed inference time (s)": { - "description": "min=1.989, mean=1.989, max=1.989, sum=3.977 (2)", - "tab": "Efficiency", - "score": 1.9886824734070723 - }, - "Nutrition - # eval": { - "description": "min=306, mean=306, max=306, sum=612 (2)", - "tab": "General information", - "score": 306.0 - }, - "Nutrition - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Nutrition - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Nutrition - # prompt tokens": { - "description": "min=570.337, mean=570.337, max=570.337, sum=1140.673 (2)", - "tab": "General information", - "score": 570.3366013071895 - }, - "Nutrition - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" - } - } - }, - { - "evaluation_name": "Prehistory", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Prehistory", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.923, - "details": { - "description": "min=0.923, mean=0.923, max=0.923, sum=1.846 (2)", - "tab": "Accuracy", - "Prehistory - Observed inference time (s)": { - "description": "min=0.909, mean=0.909, max=0.909, sum=1.819 (2)", - "tab": "Efficiency", - "score": 0.9094557386857492 - }, - "Prehistory - # eval": { - "description": "min=324, mean=324, max=324, sum=648 (2)", - "tab": "General information", - "score": 324.0 - }, - "Prehistory - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Prehistory - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Prehistory - # prompt tokens": { - "description": "min=505.194, mean=505.194, max=505.194, sum=1010.389 (2)", - "tab": "General information", - "score": 505.19444444444446 - }, - "Prehistory - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - 
} - }, - "generation_config": { - "additional_details": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" - } - } - }, - { - "evaluation_name": "Public Relations", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Public Relations", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.809, - "details": { - "description": "min=0.809, mean=0.809, max=0.809, sum=1.618 (2)", - "tab": "Accuracy", - "Public Relations - Observed inference time (s)": { - "description": "min=0.847, mean=0.847, max=0.847, sum=1.695 (2)", - "tab": "Efficiency", - "score": 0.8472580974752253 - }, - "Public Relations - # eval": { - "description": "min=110, mean=110, max=110, sum=220 (2)", - "tab": "General information", - "score": 110.0 - }, - "Public Relations - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Public Relations - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Public Relations - # prompt tokens": { - "description": "min=402.009, mean=402.009, max=402.009, sum=804.018 (2)", - "tab": "General information", - "score": 402.0090909090909 - }, - "Public Relations - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" - } - } - }, - { - "evaluation_name": "Security Studies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Security Studies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.837, - "details": { - "description": "min=0.837, mean=0.837, max=0.837, sum=1.673 (2)", - "tab": "Accuracy", - "Security Studies - Observed inference time (s)": { - "description": "min=0.659, mean=0.659, max=0.659, sum=1.318 (2)", - "tab": "Efficiency", - "score": 0.6588058092156235 - }, - "Security Studies - # eval": { - "description": "min=245, mean=245, max=245, sum=490 (2)", - "tab": "General information", - "score": 245.0 - }, - "Security Studies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Security Studies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Security Studies - # prompt tokens": { - "description": "min=1160.294, mean=1160.294, max=1160.294, sum=2320.588 (2)", - "tab": "General information", - "score": 1160.2938775510204 - }, - "Security Studies - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "security_studies", - "method": "multiple_choice_joint", - 
"eval_split": "test", - "groups": "mmlu_security_studies" - } - } - }, - { - "evaluation_name": "Sociology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Sociology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.955, - "details": { - "description": "min=0.955, mean=0.955, max=0.955, sum=1.91 (2)", - "tab": "Accuracy", - "Sociology - Observed inference time (s)": { - "description": "min=1.251, mean=1.251, max=1.251, sum=2.501 (2)", - "tab": "Efficiency", - "score": 1.2506972652169603 - }, - "Sociology - # eval": { - "description": "min=201, mean=201, max=201, sum=402 (2)", - "tab": "General information", - "score": 201.0 - }, - "Sociology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Sociology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Sociology - # prompt tokens": { - "description": "min=443.891, mean=443.891, max=443.891, sum=887.781 (2)", - "tab": "General information", - "score": 443.8905472636816 - }, - "Sociology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" - } - } - }, - { - "evaluation_name": "Virology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Virology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.596, - "details": { - "description": "min=0.596, mean=0.596, max=0.596, sum=1.193 (2)", - "tab": "Accuracy", - "Virology - Observed inference time (s)": { - "description": "min=0.509, mean=0.509, max=0.509, sum=1.019 (2)", - "tab": "Efficiency", - "score": 0.5092598558908485 - }, - "Virology - # eval": { - "description": "min=166, mean=166, max=166, sum=332 (2)", - "tab": "General information", - "score": 166.0 - }, - "Virology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Virology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Virology - # prompt tokens": { - "description": "min=329.572, mean=329.572, max=329.572, sum=659.145 (2)", - "tab": "General information", - "score": 329.5722891566265 - }, - "Virology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" - } - } - }, - { - "evaluation_name": "World Religions", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on World Religions", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.912, - "details": { - "description": "min=0.912, mean=0.912, max=0.912, sum=1.825 (2)", - "tab": "Accuracy", - "World Religions - Observed inference time (s)": { - "description": "min=1.251, mean=1.251, max=1.251, sum=2.501 (2)", - "tab": "Efficiency", - "score": 1.2507223441586857 - }, - "World Religions - # eval": { - "description": "min=171, mean=171, max=171, sum=342 (2)", - "tab": "General information", - "score": 171.0 - }, - "World Religions - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "World Religions - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "World Religions - # prompt tokens": { - "description": "min=268.918, mean=268.918, max=268.918, sum=537.836 (2)", - "tab": "General information", - "score": 268.91812865497076 - }, - "World Religions - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" - } - } - }, - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.215, - "details": { - "tab": "Efficiency" - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_mmlu/google/gemini-1.0-pro-001/7ea5b404-d98f-4282-81d8-6ca5f6629429.json b/data/helm_mmlu/google/gemini-1.0-pro-001/7ea5b404-d98f-4282-81d8-6ca5f6629429.json deleted file mode 100644 index 86096274a..000000000 --- a/data/helm_mmlu/google/gemini-1.0-pro-001/7ea5b404-d98f-4282-81d8-6ca5f6629429.json +++ /dev/null @@ -1,3021 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/google_gemini-1.0-pro-001/1770835937.459157", - "retrieved_timestamp": "1770835937.459157", - "source_metadata": { - "source_name": "helm_mmlu", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gemini 1.0 Pro 001", - "id": "google/gemini-1.0-pro-001", - "developer": "google", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "MMLU All Subjects", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU All Subjects", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 
0.7, - "details": { - "description": "min=0.333, mean=0.7, max=0.933, sum=79.795 (114)", - "tab": "Accuracy", - "MMLU All Subjects - Observed inference time (s)": { - "description": "min=0.291, mean=0.385, max=0.991, sum=43.868 (114)", - "tab": "Efficiency", - "score": 0.3848050244039386 - }, - "MMLU All Subjects - # eval": { - "description": "min=100, mean=246.351, max=1534, sum=28084 (114)", - "tab": "General information", - "score": 246.35087719298247 - }, - "MMLU All Subjects - # train": { - "description": "min=5, mean=5, max=5, sum=570 (114)", - "tab": "General information", - "score": 5.0 - }, - "MMLU All Subjects - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (114)", - "tab": "General information", - "score": 0.0 - }, - "MMLU All Subjects - # prompt tokens": { - "description": "min=260.164, mean=624.617, max=2789.424, sum=71206.345 (114)", - "tab": "General information", - "score": 624.6170571214202 - }, - "MMLU All Subjects - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (114)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - 
"mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] - } - } - }, - { - "evaluation_name": "Abstract Algebra", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Abstract Algebra", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.34, - "details": { - "description": "min=0.34, mean=0.34, max=0.34, sum=0.68 (2)", - "tab": "Accuracy", - "Abstract Algebra - Observed inference time (s)": { - "description": "min=0.991, mean=0.991, max=0.991, sum=1.982 (2)", - "tab": "Efficiency", - "score": 0.9907678151130677 - }, - "Abstract Algebra - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Abstract Algebra - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Abstract Algebra - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Abstract Algebra - # prompt tokens": { - "description": "min=375.97, mean=375.97, max=375.97, sum=751.94 (2)", - "tab": "General information", - "score": 375.97 - }, - "Abstract Algebra - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" - } - } - }, - { - "evaluation_name": "Anatomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Anatomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.652, - "details": { - "description": "min=0.652, mean=0.652, max=0.652, sum=1.304 (2)", - "tab": "Accuracy", - "Anatomy - Observed inference time (s)": { - "description": "min=0.318, mean=0.318, max=0.318, sum=0.636 (2)", - "tab": "Efficiency", - "score": 0.3178748925526937 - }, - "Anatomy - # eval": { - "description": "min=135, mean=135, max=135, sum=270 (2)", - "tab": "General information", - "score": 135.0 - }, - "Anatomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Anatomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Anatomy - # prompt tokens": { - "description": "min=336.356, mean=336.356, 
max=336.356, sum=672.711 (2)", - "tab": "General information", - "score": 336.35555555555555 - }, - "Anatomy - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" - } - } - }, - { - "evaluation_name": "College Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on College Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.333, - "details": { - "description": "min=0.333, mean=0.333, max=0.333, sum=0.667 (2)", - "tab": "Accuracy", - "College Chemistry - Observed inference time (s)": { - "description": "min=0.377, mean=0.377, max=0.377, sum=0.754 (2)", - "tab": "Efficiency", - "score": 0.37708688735961915 - }, - "College Biology - Observed inference time (s)": { - "description": "min=0.294, mean=0.294, max=0.294, sum=0.588 (2)", - "tab": "Efficiency", - "score": 0.2937609056631724 - }, - "College Computer Science - Observed inference time (s)": { - "description": "min=0.375, mean=0.375, max=0.375, sum=0.75 (2)", - "tab": "Efficiency", - "score": 0.37500447273254395 - }, - "College Mathematics - Observed inference time (s)": { - "description": "min=0.356, mean=0.356, max=0.356, sum=0.712 (2)", - "tab": "Efficiency", - "score": 0.35595274686813355 - }, - "College Medicine - Observed inference time (s)": { - "description": "min=0.314, mean=0.314, max=0.314, sum=0.627 (2)", - "tab": "Efficiency", - "score": 0.31358790535458253 - }, - "College Physics - Observed inference time (s)": { - "description": "min=0.336, mean=0.336, max=0.336, sum=0.672 (2)", - "tab": "Efficiency", - "score": 0.3357745151893765 - }, - "College Chemistry - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Chemistry - # prompt tokens": { - "description": "min=562.02, mean=562.02, max=562.02, sum=1124.04 (2)", - "tab": "General information", - "score": 562.02 - }, - "College Chemistry - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # eval": { - "description": "min=144, mean=144, max=144, sum=288 (2)", - "tab": "General information", - "score": 144.0 - }, - "College Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # prompt tokens": { - "description": "min=474.799, mean=474.799, max=474.799, sum=949.597 (2)", - "tab": "General information", - "score": 474.7986111111111 - }, - "College Biology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General 
information", - "score": 0.0 - }, - "College Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer Science - # prompt tokens": { - "description": "min=849.86, mean=849.86, max=849.86, sum=1699.72 (2)", - "tab": "General information", - "score": 849.86 - }, - "College Computer Science - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # prompt tokens": { - "description": "min=618.69, mean=618.69, max=618.69, sum=1237.38 (2)", - "tab": "General information", - "score": 618.69 - }, - "College Mathematics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Medicine - # eval": { - "description": "min=173, mean=173, max=173, sum=346 (2)", - "tab": "General information", - "score": 173.0 - }, - "College Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Medicine - # prompt tokens": { - "description": "min=505.37, mean=505.37, max=505.37, sum=1010.74 (2)", - "tab": "General information", - "score": 505.3699421965318 - }, - "College Medicine - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Physics - # eval": { - "description": "min=102, mean=102, max=102, sum=204 (2)", - "tab": "General information", - "score": 102.0 - }, - "College Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Physics - # prompt tokens": { - "description": "min=499.471, mean=499.471, max=499.471, sum=998.941 (2)", - "tab": "General information", - "score": 499.47058823529414 - }, - "College Physics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" - } - } - }, - { - "evaluation_name": "Computer Security", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": 
{ - "evaluation_description": "EM on Computer Security", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.84, - "details": { - "description": "min=0.84, mean=0.84, max=0.84, sum=1.68 (2)", - "tab": "Accuracy", - "Computer Security - Observed inference time (s)": { - "description": "min=0.314, mean=0.314, max=0.314, sum=0.627 (2)", - "tab": "Efficiency", - "score": 0.31363418102264407 - }, - "Computer Security - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Computer Security - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Computer Security - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Computer Security - # prompt tokens": { - "description": "min=372.91, mean=372.91, max=372.91, sum=745.82 (2)", - "tab": "General information", - "score": 372.91 - }, - "Computer Security - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" - } - } - }, - { - "evaluation_name": "Econometrics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Econometrics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.553, - "details": { - "description": "min=0.553, mean=0.553, max=0.553, sum=1.105 (2)", - "tab": "Accuracy", - "Econometrics - Observed inference time (s)": { - "description": "min=0.377, mean=0.377, max=0.377, sum=0.754 (2)", - "tab": "Efficiency", - "score": 0.37716702620188397 - }, - "Econometrics - # eval": { - "description": "min=114, mean=114, max=114, sum=228 (2)", - "tab": "General information", - "score": 114.0 - }, - "Econometrics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Econometrics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Econometrics - # prompt tokens": { - "description": "min=626.553, mean=626.553, max=626.553, sum=1253.105 (2)", - "tab": "General information", - "score": 626.5526315789474 - }, - "Econometrics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" - } - } - }, - { - "evaluation_name": "Global Facts", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Global Facts", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.49, - "details": { - "description": "min=0.49, mean=0.49, max=0.49, sum=0.98 (2)", - "tab": "Accuracy", - "Global Facts - Observed inference time (s)": { - "description": "min=0.32, mean=0.32, max=0.32, sum=0.639 (2)", - "tab": "Efficiency", - "score": 0.3196276807785034 - }, - "Global Facts - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Global Facts - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Global Facts - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Global Facts - # prompt tokens": { - "description": "min=448.54, mean=448.54, max=448.54, sum=897.08 (2)", - "tab": "General information", - "score": 448.54 - }, - "Global Facts - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" - } - } - }, - { - "evaluation_name": "Jurisprudence", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Jurisprudence", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.861, - "details": { - "description": "min=0.861, mean=0.861, max=0.861, sum=1.722 (2)", - "tab": "Accuracy", - "Jurisprudence - Observed inference time (s)": { - "description": "min=0.299, mean=0.299, max=0.299, sum=0.598 (2)", - "tab": "Efficiency", - "score": 0.29897612112539784 - }, - "Jurisprudence - # eval": { - "description": "min=108, mean=108, max=108, sum=216 (2)", - "tab": "General information", - "score": 108.0 - }, - "Jurisprudence - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Jurisprudence - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Jurisprudence - # prompt tokens": { - "description": "min=399.87, mean=399.87, max=399.87, sum=799.741 (2)", - "tab": "General information", - "score": 399.8703703703704 - }, - "Jurisprudence - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" - } - } - }, - { - "evaluation_name": "Philosophy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Philosophy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.762, - "details": { - "description": "min=0.762, mean=0.762, max=0.762, sum=1.524 (2)", - "tab": "Accuracy", - "Philosophy - Observed inference time (s)": { - "description": "min=0.318, mean=0.318, max=0.318, 
sum=0.636 (2)", - "tab": "Efficiency", - "score": 0.31779951221306607 - }, - "Philosophy - # eval": { - "description": "min=311, mean=311, max=311, sum=622 (2)", - "tab": "General information", - "score": 311.0 - }, - "Philosophy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Philosophy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Philosophy - # prompt tokens": { - "description": "min=332.907, mean=332.907, max=332.907, sum=665.814 (2)", - "tab": "General information", - "score": 332.90675241157555 - }, - "Philosophy - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" - } - } - }, - { - "evaluation_name": "Professional Psychology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Professional Psychology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.752, - "details": { - "description": "min=0.752, mean=0.752, max=0.752, sum=1.503 (2)", - "tab": "Accuracy", - "Professional Medicine - Observed inference time (s)": { - "description": "min=0.498, mean=0.498, max=0.498, sum=0.997 (2)", - "tab": "Efficiency", - "score": 0.49840929939298173 - }, - "Professional Accounting - Observed inference time (s)": { - "description": "min=0.384, mean=0.384, max=0.384, sum=0.768 (2)", - "tab": "Efficiency", - "score": 0.3838615434389588 - }, - "Professional Law - Observed inference time (s)": { - "description": "min=0.509, mean=0.509, max=0.509, sum=1.019 (2)", - "tab": "Efficiency", - "score": 0.5094701207541172 - }, - "Professional Psychology - Observed inference time (s)": { - "description": "min=0.388, mean=0.388, max=0.388, sum=0.775 (2)", - "tab": "Efficiency", - "score": 0.3877133719230953 - }, - "Professional Medicine - # eval": { - "description": "min=272, mean=272, max=272, sum=544 (2)", - "tab": "General information", - "score": 272.0 - }, - "Professional Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Medicine - # prompt tokens": { - "description": "min=1105.092, mean=1105.092, max=1105.092, sum=2210.184 (2)", - "tab": "General information", - "score": 1105.0919117647059 - }, - "Professional Medicine - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Accounting - # eval": { - "description": "min=282, mean=282, max=282, sum=564 (2)", - "tab": "General information", - "score": 282.0 - }, - "Professional Accounting - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Accounting - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, 
- "Professional Accounting - # prompt tokens": { - "description": "min=747.418, mean=747.418, max=747.418, sum=1494.837 (2)", - "tab": "General information", - "score": 747.418439716312 - }, - "Professional Accounting - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # eval": { - "description": "min=1534, mean=1534, max=1534, sum=3068 (2)", - "tab": "General information", - "score": 1534.0 - }, - "Professional Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # prompt tokens": { - "description": "min=1677.119, mean=1677.119, max=1677.119, sum=3354.239 (2)", - "tab": "General information", - "score": 1677.119295958279 - }, - "Professional Law - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # eval": { - "description": "min=612, mean=612, max=612, sum=1224 (2)", - "tab": "General information", - "score": 612.0 - }, - "Professional Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # prompt tokens": { - "description": "min=586.363, mean=586.363, max=586.363, sum=1172.725 (2)", - "tab": "General information", - "score": 586.3627450980392 - }, - "Professional Psychology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" - } - } - }, - { - "evaluation_name": "Us Foreign Policy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Us Foreign Policy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.89, - "details": { - "description": "min=0.89, mean=0.89, max=0.89, sum=1.78 (2)", - "tab": "Accuracy", - "Us Foreign Policy - Observed inference time (s)": { - "description": "min=0.306, mean=0.306, max=0.306, sum=0.611 (2)", - "tab": "Efficiency", - "score": 0.30568787574768064 - }, - "Us Foreign Policy - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Us Foreign Policy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Us Foreign Policy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Us Foreign Policy - # prompt tokens": { - "description": "min=430.2, mean=430.2, max=430.2, sum=860.4 (2)", - "tab": "General information", - "score": 430.2 - }, - "Us Foreign Policy - # output tokens": { - "description": "min=0, mean=0, max=0, 
sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" - } - } - }, - { - "evaluation_name": "Astronomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Astronomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.796, - "details": { - "description": "min=0.796, mean=0.796, max=0.796, sum=1.592 (2)", - "tab": "Accuracy", - "Astronomy - Observed inference time (s)": { - "description": "min=0.517, mean=0.517, max=0.517, sum=1.035 (2)", - "tab": "Efficiency", - "score": 0.5173565070880087 - }, - "Astronomy - # eval": { - "description": "min=152, mean=152, max=152, sum=304 (2)", - "tab": "General information", - "score": 152.0 - }, - "Astronomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Astronomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Astronomy - # prompt tokens": { - "description": "min=594.421, mean=594.421, max=594.421, sum=1188.842 (2)", - "tab": "General information", - "score": 594.421052631579 - }, - "Astronomy - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" - } - } - }, - { - "evaluation_name": "Business Ethics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Business Ethics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.69, - "details": { - "description": "min=0.69, mean=0.69, max=0.69, sum=1.38 (2)", - "tab": "Accuracy", - "Business Ethics - Observed inference time (s)": { - "description": "min=0.386, mean=0.386, max=0.386, sum=0.772 (2)", - "tab": "Efficiency", - "score": 0.38599337100982667 - }, - "Business Ethics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Business Ethics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Business Ethics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Business Ethics - # prompt tokens": { - "description": "min=544.87, mean=544.87, max=544.87, sum=1089.74 (2)", - "tab": "General information", - "score": 544.87 - }, - "Business Ethics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - 
"groups": "mmlu_business_ethics" - } - } - }, - { - "evaluation_name": "Clinical Knowledge", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Clinical Knowledge", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.758, - "details": { - "description": "min=0.758, mean=0.758, max=0.758, sum=1.517 (2)", - "tab": "Accuracy", - "Clinical Knowledge - Observed inference time (s)": { - "description": "min=0.299, mean=0.299, max=0.299, sum=0.599 (2)", - "tab": "Efficiency", - "score": 0.29948959980370865 - }, - "Clinical Knowledge - # eval": { - "description": "min=265, mean=265, max=265, sum=530 (2)", - "tab": "General information", - "score": 265.0 - }, - "Clinical Knowledge - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Clinical Knowledge - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Clinical Knowledge - # prompt tokens": { - "description": "min=394.592, mean=394.592, max=394.592, sum=789.185 (2)", - "tab": "General information", - "score": 394.5924528301887 - }, - "Clinical Knowledge - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" - } - } - }, - { - "evaluation_name": "Conceptual Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Conceptual Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.706, - "details": { - "description": "min=0.706, mean=0.706, max=0.706, sum=1.413 (2)", - "tab": "Accuracy", - "Conceptual Physics - Observed inference time (s)": { - "description": "min=0.294, mean=0.294, max=0.294, sum=0.588 (2)", - "tab": "Efficiency", - "score": 0.29394423606547904 - }, - "Conceptual Physics - # eval": { - "description": "min=235, mean=235, max=235, sum=470 (2)", - "tab": "General information", - "score": 235.0 - }, - "Conceptual Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Conceptual Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Conceptual Physics - # prompt tokens": { - "description": "min=301.213, mean=301.213, max=301.213, sum=602.426 (2)", - "tab": "General information", - "score": 301.21276595744683 - }, - "Conceptual Physics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" - } - } - }, - { - "evaluation_name": "Electrical 
Engineering", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Electrical Engineering", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.69, - "details": { - "description": "min=0.69, mean=0.69, max=0.69, sum=1.379 (2)", - "tab": "Accuracy", - "Electrical Engineering - Observed inference time (s)": { - "description": "min=0.291, mean=0.291, max=0.291, sum=0.581 (2)", - "tab": "Efficiency", - "score": 0.2906524740416428 - }, - "Electrical Engineering - # eval": { - "description": "min=145, mean=145, max=145, sum=290 (2)", - "tab": "General information", - "score": 145.0 - }, - "Electrical Engineering - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Electrical Engineering - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Electrical Engineering - # prompt tokens": { - "description": "min=466.786, mean=466.786, max=466.786, sum=933.572 (2)", - "tab": "General information", - "score": 466.78620689655173 - }, - "Electrical Engineering - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" - } - } - }, - { - "evaluation_name": "Elementary Mathematics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Elementary Mathematics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.476, - "details": { - "description": "min=0.476, mean=0.476, max=0.476, sum=0.952 (2)", - "tab": "Accuracy", - "Elementary Mathematics - Observed inference time (s)": { - "description": "min=0.393, mean=0.393, max=0.393, sum=0.786 (2)", - "tab": "Efficiency", - "score": 0.3928584957879687 - }, - "Elementary Mathematics - # eval": { - "description": "min=378, mean=378, max=378, sum=756 (2)", - "tab": "General information", - "score": 378.0 - }, - "Elementary Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Elementary Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Elementary Mathematics - # prompt tokens": { - "description": "min=589.341, mean=589.341, max=589.341, sum=1178.683 (2)", - "tab": "General information", - "score": 589.3412698412699 - }, - "Elementary Mathematics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" - } - } - }, - { - "evaluation_name": "Formal Logic", - 
"source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Formal Logic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.468, - "details": { - "description": "min=0.468, mean=0.468, max=0.468, sum=0.937 (2)", - "tab": "Accuracy", - "Formal Logic - Observed inference time (s)": { - "description": "min=0.398, mean=0.398, max=0.398, sum=0.797 (2)", - "tab": "Efficiency", - "score": 0.39849274120633565 - }, - "Formal Logic - # eval": { - "description": "min=126, mean=126, max=126, sum=252 (2)", - "tab": "General information", - "score": 126.0 - }, - "Formal Logic - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Formal Logic - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Formal Logic - # prompt tokens": { - "description": "min=611.563, mean=611.563, max=611.563, sum=1223.127 (2)", - "tab": "General information", - "score": 611.563492063492 - }, - "Formal Logic - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" - } - } - }, - { - "evaluation_name": "High School World History", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on High School World History", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.865, - "details": { - "description": "min=0.865, mean=0.865, max=0.865, sum=1.73 (2)", - "tab": "Accuracy", - "High School Biology - Observed inference time (s)": { - "description": "min=0.321, mean=0.321, max=0.321, sum=0.643 (2)", - "tab": "Efficiency", - "score": 0.3214967135460146 - }, - "High School Chemistry - Observed inference time (s)": { - "description": "min=0.341, mean=0.341, max=0.341, sum=0.683 (2)", - "tab": "Efficiency", - "score": 0.3413804282108551 - }, - "High School Computer Science - Observed inference time (s)": { - "description": "min=0.378, mean=0.378, max=0.378, sum=0.756 (2)", - "tab": "Efficiency", - "score": 0.37822843074798584 - }, - "High School European History - Observed inference time (s)": { - "description": "min=0.836, mean=0.836, max=0.836, sum=1.672 (2)", - "tab": "Efficiency", - "score": 0.836203297701749 - }, - "High School Geography - Observed inference time (s)": { - "description": "min=0.321, mean=0.321, max=0.321, sum=0.642 (2)", - "tab": "Efficiency", - "score": 0.3208902616693516 - }, - "High School Government And Politics - Observed inference time (s)": { - "description": "min=0.307, mean=0.307, max=0.307, sum=0.614 (2)", - "tab": "Efficiency", - "score": 0.3069849088401992 - }, - "High School Macroeconomics - Observed inference time (s)": { - "description": "min=0.32, mean=0.32, max=0.32, sum=0.641 (2)", - "tab": "Efficiency", - "score": 0.32043021275446965 - }, - 
"High School Mathematics - Observed inference time (s)": { - "description": "min=0.386, mean=0.386, max=0.386, sum=0.772 (2)", - "tab": "Efficiency", - "score": 0.38611255663412586 - }, - "High School Microeconomics - Observed inference time (s)": { - "description": "min=0.315, mean=0.315, max=0.315, sum=0.631 (2)", - "tab": "Efficiency", - "score": 0.31541170993772877 - }, - "High School Physics - Observed inference time (s)": { - "description": "min=0.383, mean=0.383, max=0.383, sum=0.767 (2)", - "tab": "Efficiency", - "score": 0.3833695673784673 - }, - "High School Psychology - Observed inference time (s)": { - "description": "min=0.334, mean=0.334, max=0.334, sum=0.668 (2)", - "tab": "Efficiency", - "score": 0.33389012427891 - }, - "High School Statistics - Observed inference time (s)": { - "description": "min=0.4, mean=0.4, max=0.4, sum=0.8 (2)", - "tab": "Efficiency", - "score": 0.39985558611375316 - }, - "High School US History - Observed inference time (s)": { - "description": "min=0.827, mean=0.827, max=0.827, sum=1.655 (2)", - "tab": "Efficiency", - "score": 0.8272603574921104 - }, - "High School World History - Observed inference time (s)": { - "description": "min=0.517, mean=0.517, max=0.517, sum=1.035 (2)", - "tab": "Efficiency", - "score": 0.5172926987273784 - }, - "High School Biology - # eval": { - "description": "min=310, mean=310, max=310, sum=620 (2)", - "tab": "General information", - "score": 310.0 - }, - "High School Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Biology - # prompt tokens": { - "description": "min=492.958, mean=492.958, max=492.958, sum=985.916 (2)", - "tab": "General information", - "score": 492.958064516129 - }, - "High School Biology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # eval": { - "description": "min=203, mean=203, max=203, sum=406 (2)", - "tab": "General information", - "score": 203.0 - }, - "High School Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # prompt tokens": { - "description": "min=505.064, mean=505.064, max=505.064, sum=1010.128 (2)", - "tab": "General information", - "score": 505.064039408867 - }, - "High School Chemistry - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "High School Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # prompt tokens": { - "description": "min=927.13, mean=927.13, max=927.13, sum=1854.26 (2)", - "tab": "General information", - "score": 927.13 - }, - "High School Computer Science - # output tokens": { - 
"description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School European History - # eval": { - "description": "min=165, mean=165, max=165, sum=330 (2)", - "tab": "General information", - "score": 165.0 - }, - "High School European History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School European History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School European History - # prompt tokens": { - "description": "min=2789.424, mean=2789.424, max=2789.424, sum=5578.848 (2)", - "tab": "General information", - "score": 2789.4242424242425 - }, - "High School European History - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Geography - # eval": { - "description": "min=198, mean=198, max=198, sum=396 (2)", - "tab": "General information", - "score": 198.0 - }, - "High School Geography - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Geography - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Geography - # prompt tokens": { - "description": "min=386.773, mean=386.773, max=386.773, sum=773.545 (2)", - "tab": "General information", - "score": 386.77272727272725 - }, - "High School Geography - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # eval": { - "description": "min=193, mean=193, max=193, sum=386 (2)", - "tab": "General information", - "score": 193.0 - }, - "High School Government And Politics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Government And Politics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # prompt tokens": { - "description": "min=471.301, mean=471.301, max=471.301, sum=942.601 (2)", - "tab": "General information", - "score": 471.30051813471505 - }, - "High School Government And Politics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - # eval": { - "description": "min=390, mean=390, max=390, sum=780 (2)", - "tab": "General information", - "score": 390.0 - }, - "High School Macroeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Macroeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - # prompt tokens": { - "description": "min=388.541, mean=388.541, max=388.541, sum=777.082 (2)", - "tab": "General information", - "score": 388.54102564102567 - }, - "High School Macroeconomics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # eval": { - "description": "min=270, mean=270, max=270, sum=540 (2)", - "tab": "General information", - "score": 270.0 - }, - "High School Mathematics - # 
train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # prompt tokens": { - "description": "min=558.822, mean=558.822, max=558.822, sum=1117.644 (2)", - "tab": "General information", - "score": 558.8222222222222 - }, - "High School Mathematics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Microeconomics - # eval": { - "description": "min=238, mean=238, max=238, sum=476 (2)", - "tab": "General information", - "score": 238.0 - }, - "High School Microeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Microeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Microeconomics - # prompt tokens": { - "description": "min=407.954, mean=407.954, max=407.954, sum=815.908 (2)", - "tab": "General information", - "score": 407.953781512605 - }, - "High School Microeconomics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # eval": { - "description": "min=151, mean=151, max=151, sum=302 (2)", - "tab": "General information", - "score": 151.0 - }, - "High School Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # prompt tokens": { - "description": "min=583.715, mean=583.715, max=583.715, sum=1167.43 (2)", - "tab": "General information", - "score": 583.7152317880794 - }, - "High School Physics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # eval": { - "description": "min=545, mean=545, max=545, sum=1090 (2)", - "tab": "General information", - "score": 545.0 - }, - "High School Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # prompt tokens": { - "description": "min=494.604, mean=494.604, max=494.604, sum=989.207 (2)", - "tab": "General information", - "score": 494.60366972477067 - }, - "High School Psychology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Statistics - # eval": { - "description": "min=216, mean=216, max=216, sum=432 (2)", - "tab": "General information", - "score": 216.0 - }, - "High School Statistics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Statistics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Statistics - # prompt tokens": { - "description": "min=850.931, mean=850.931, max=850.931, sum=1701.861 (2)", - "tab": 
"General information", - "score": 850.9305555555555 - }, - "High School Statistics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # eval": { - "description": "min=204, mean=204, max=204, sum=408 (2)", - "tab": "General information", - "score": 204.0 - }, - "High School US History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School US History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # prompt tokens": { - "description": "min=2197.583, mean=2197.583, max=2197.583, sum=4395.167 (2)", - "tab": "General information", - "score": 2197.5833333333335 - }, - "High School US History - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School World History - # eval": { - "description": "min=237, mean=237, max=237, sum=474 (2)", - "tab": "General information", - "score": 237.0 - }, - "High School World History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School World History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School World History - # prompt tokens": { - "description": "min=1418.544, mean=1418.544, max=1418.544, sum=2837.089 (2)", - "tab": "General information", - "score": 1418.5443037974683 - }, - "High School World History - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" - } - } - }, - { - "evaluation_name": "Human Sexuality", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Human Sexuality", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.618, - "details": { - "description": "min=0.618, mean=0.618, max=0.618, sum=1.237 (2)", - "tab": "Accuracy", - "Human Aging - Observed inference time (s)": { - "description": "min=0.308, mean=0.308, max=0.308, sum=0.616 (2)", - "tab": "Efficiency", - "score": 0.3080115040321521 - }, - "Human Sexuality - Observed inference time (s)": { - "description": "min=0.297, mean=0.297, max=0.297, sum=0.593 (2)", - "tab": "Efficiency", - "score": 0.29670037984848024 - }, - "Human Aging - # eval": { - "description": "min=223, mean=223, max=223, sum=446 (2)", - "tab": "General information", - "score": 223.0 - }, - "Human Aging - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Aging - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Aging - # prompt tokens": { - "description": "min=313.587, mean=313.587, max=313.587, sum=627.175 (2)", - "tab": "General information", - "score": 313.58744394618833 - }, - "Human 
Aging - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # eval": { - "description": "min=131, mean=131, max=131, sum=262 (2)", - "tab": "General information", - "score": 131.0 - }, - "Human Sexuality - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Sexuality - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # prompt tokens": { - "description": "min=339.183, mean=339.183, max=339.183, sum=678.366 (2)", - "tab": "General information", - "score": 339.1832061068702 - }, - "Human Sexuality - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" - } - } - }, - { - "evaluation_name": "International Law", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on International Law", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.876, - "details": { - "description": "min=0.876, mean=0.876, max=0.876, sum=1.752 (2)", - "tab": "Accuracy", - "International Law - Observed inference time (s)": { - "description": "min=0.38, mean=0.38, max=0.38, sum=0.761 (2)", - "tab": "Efficiency", - "score": 0.3803488971773258 - }, - "International Law - # eval": { - "description": "min=121, mean=121, max=121, sum=242 (2)", - "tab": "General information", - "score": 121.0 - }, - "International Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "International Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "International Law - # prompt tokens": { - "description": "min=636.165, mean=636.165, max=636.165, sum=1272.331 (2)", - "tab": "General information", - "score": 636.1652892561983 - }, - "International Law - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" - } - } - }, - { - "evaluation_name": "Logical Fallacies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Logical Fallacies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.804, - "details": { - "description": "min=0.804, mean=0.804, max=0.804, sum=1.607 (2)", - "tab": "Accuracy", - "Logical Fallacies - Observed inference time (s)": { - "description": "min=0.304, mean=0.304, max=0.304, sum=0.608 (2)", - "tab": "Efficiency", - "score": 
0.30376981372482204 - }, - "Logical Fallacies - # eval": { - "description": "min=163, mean=163, max=163, sum=326 (2)", - "tab": "General information", - "score": 163.0 - }, - "Logical Fallacies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Logical Fallacies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Logical Fallacies - # prompt tokens": { - "description": "min=442.049, mean=442.049, max=442.049, sum=884.098 (2)", - "tab": "General information", - "score": 442.0490797546012 - }, - "Logical Fallacies - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" - } - } - }, - { - "evaluation_name": "Machine Learning", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Machine Learning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.527, - "details": { - "description": "min=0.527, mean=0.527, max=0.527, sum=1.054 (2)", - "tab": "Accuracy", - "Machine Learning - Observed inference time (s)": { - "description": "min=0.381, mean=0.381, max=0.381, sum=0.761 (2)", - "tab": "Efficiency", - "score": 0.3805731492383139 - }, - "Machine Learning - # eval": { - "description": "min=112, mean=112, max=112, sum=224 (2)", - "tab": "General information", - "score": 112.0 - }, - "Machine Learning - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Machine Learning - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Machine Learning - # prompt tokens": { - "description": "min=694.402, mean=694.402, max=694.402, sum=1388.804 (2)", - "tab": "General information", - "score": 694.4017857142857 - }, - "Machine Learning - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" - } - } - }, - { - "evaluation_name": "Management", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Management", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.845, - "details": { - "description": "min=0.845, mean=0.845, max=0.845, sum=1.689 (2)", - "tab": "Accuracy", - "Management - Observed inference time (s)": { - "description": "min=0.301, mean=0.301, max=0.301, sum=0.603 (2)", - "tab": "Efficiency", - "score": 0.3013762247215197 - }, - "Management - # eval": { - "description": "min=103, mean=103, max=103, sum=206 (2)", - "tab": "General 
information", - "score": 103.0 - }, - "Management - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Management - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Management - # prompt tokens": { - "description": "min=273.301, mean=273.301, max=273.301, sum=546.602 (2)", - "tab": "General information", - "score": 273.3009708737864 - }, - "Management - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" - } - } - }, - { - "evaluation_name": "Marketing", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Marketing", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.91, - "details": { - "description": "min=0.91, mean=0.91, max=0.91, sum=1.821 (2)", - "tab": "Accuracy", - "Marketing - Observed inference time (s)": { - "description": "min=0.307, mean=0.307, max=0.307, sum=0.615 (2)", - "tab": "Efficiency", - "score": 0.30740204122331405 - }, - "Marketing - # eval": { - "description": "min=234, mean=234, max=234, sum=468 (2)", - "tab": "General information", - "score": 234.0 - }, - "Marketing - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Marketing - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Marketing - # prompt tokens": { - "description": "min=420.35, mean=420.35, max=420.35, sum=840.701 (2)", - "tab": "General information", - "score": 420.35042735042737 - }, - "Marketing - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" - } - } - }, - { - "evaluation_name": "Medical Genetics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Medical Genetics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8, - "details": { - "description": "min=0.8, mean=0.8, max=0.8, sum=1.6 (2)", - "tab": "Accuracy", - "Medical Genetics - Observed inference time (s)": { - "description": "min=0.369, mean=0.369, max=0.369, sum=0.738 (2)", - "tab": "Efficiency", - "score": 0.36919414043426513 - }, - "Medical Genetics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Medical Genetics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Medical Genetics - truncated": { - "description": "min=0, mean=0, 
max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Medical Genetics - # prompt tokens": { - "description": "min=330.89, mean=330.89, max=330.89, sum=661.78 (2)", - "tab": "General information", - "score": 330.89 - }, - "Medical Genetics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" - } - } - }, - { - "evaluation_name": "Miscellaneous", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Miscellaneous", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.851, - "details": { - "description": "min=0.851, mean=0.851, max=0.851, sum=1.701 (2)", - "tab": "Accuracy", - "Miscellaneous - Observed inference time (s)": { - "description": "min=0.305, mean=0.305, max=0.305, sum=0.61 (2)", - "tab": "Efficiency", - "score": 0.30495573064528814 - }, - "Miscellaneous - # eval": { - "description": "min=783, mean=783, max=783, sum=1566 (2)", - "tab": "General information", - "score": 783.0 - }, - "Miscellaneous - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Miscellaneous - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Miscellaneous - # prompt tokens": { - "description": "min=306.669, mean=306.669, max=306.669, sum=613.338 (2)", - "tab": "General information", - "score": 306.669220945083 - }, - "Miscellaneous - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" - } - } - }, - { - "evaluation_name": "Moral Scenarios", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Moral Scenarios", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.46, - "details": { - "description": "min=0.46, mean=0.46, max=0.46, sum=0.921 (2)", - "tab": "Accuracy", - "Moral Disputes - Observed inference time (s)": { - "description": "min=0.351, mean=0.351, max=0.351, sum=0.702 (2)", - "tab": "Efficiency", - "score": 0.3512327629706763 - }, - "Moral Scenarios - Observed inference time (s)": { - "description": "min=0.39, mean=0.39, max=0.39, sum=0.78 (2)", - "tab": "Efficiency", - "score": 0.3902203835572113 - }, - "Moral Disputes - # eval": { - "description": "min=346, mean=346, max=346, sum=692 (2)", - "tab": "General information", - "score": 346.0 - }, - "Moral Disputes - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Disputes - truncated": { - "description": "min=0, mean=0, max=0, 
sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Disputes - # prompt tokens": { - "description": "min=487.003, mean=487.003, max=487.003, sum=974.006 (2)", - "tab": "General information", - "score": 487.0028901734104 - }, - "Moral Disputes - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # eval": { - "description": "min=895, mean=895, max=895, sum=1790 (2)", - "tab": "General information", - "score": 895.0 - }, - "Moral Scenarios - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Scenarios - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # prompt tokens": { - "description": "min=682.542, mean=682.542, max=682.542, sum=1365.084 (2)", - "tab": "General information", - "score": 682.5418994413408 - }, - "Moral Scenarios - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" - } - } - }, - { - "evaluation_name": "Nutrition", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Nutrition", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.788, - "details": { - "description": "min=0.788, mean=0.788, max=0.788, sum=1.575 (2)", - "tab": "Accuracy", - "Nutrition - Observed inference time (s)": { - "description": "min=0.383, mean=0.383, max=0.383, sum=0.767 (2)", - "tab": "Efficiency", - "score": 0.3834058817695169 - }, - "Nutrition - # eval": { - "description": "min=306, mean=306, max=306, sum=612 (2)", - "tab": "General information", - "score": 306.0 - }, - "Nutrition - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Nutrition - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Nutrition - # prompt tokens": { - "description": "min=577.48, mean=577.48, max=577.48, sum=1154.961 (2)", - "tab": "General information", - "score": 577.4803921568628 - }, - "Nutrition - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" - } - } - }, - { - "evaluation_name": "Prehistory", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Prehistory", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.802, - "details": { - "description": "min=0.802, mean=0.802, max=0.802, sum=1.605 (2)", - "tab": "Accuracy", - 
"Prehistory - Observed inference time (s)": { - "description": "min=0.423, mean=0.423, max=0.423, sum=0.845 (2)", - "tab": "Efficiency", - "score": 0.42272565026342135 - }, - "Prehistory - # eval": { - "description": "min=324, mean=324, max=324, sum=648 (2)", - "tab": "General information", - "score": 324.0 - }, - "Prehistory - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Prehistory - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Prehistory - # prompt tokens": { - "description": "min=532.198, mean=532.198, max=532.198, sum=1064.395 (2)", - "tab": "General information", - "score": 532.1975308641976 - }, - "Prehistory - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" - } - } - }, - { - "evaluation_name": "Public Relations", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Public Relations", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.691, - "details": { - "description": "min=0.691, mean=0.691, max=0.691, sum=1.382 (2)", - "tab": "Accuracy", - "Public Relations - Observed inference time (s)": { - "description": "min=0.305, mean=0.305, max=0.305, sum=0.61 (2)", - "tab": "Efficiency", - "score": 0.3049524025483565 - }, - "Public Relations - # eval": { - "description": "min=110, mean=110, max=110, sum=220 (2)", - "tab": "General information", - "score": 110.0 - }, - "Public Relations - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Public Relations - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Public Relations - # prompt tokens": { - "description": "min=418.655, mean=418.655, max=418.655, sum=837.309 (2)", - "tab": "General information", - "score": 418.6545454545454 - }, - "Public Relations - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" - } - } - }, - { - "evaluation_name": "Security Studies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Security Studies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.804, - "details": { - "description": "min=0.804, mean=0.804, max=0.804, sum=1.608 (2)", - "tab": "Accuracy", - "Security Studies - Observed inference time (s)": { - "description": "min=0.523, mean=0.523, max=0.523, sum=1.046 (2)", - "tab": "Efficiency", - "score": 0.5228155525363222 
- }, - "Security Studies - # eval": { - "description": "min=245, mean=245, max=245, sum=490 (2)", - "tab": "General information", - "score": 245.0 - }, - "Security Studies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Security Studies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Security Studies - # prompt tokens": { - "description": "min=1185.869, mean=1185.869, max=1185.869, sum=2371.739 (2)", - "tab": "General information", - "score": 1185.869387755102 - }, - "Security Studies - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" - } - } - }, - { - "evaluation_name": "Sociology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Sociology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9, - "details": { - "description": "min=0.9, mean=0.9, max=0.9, sum=1.801 (2)", - "tab": "Accuracy", - "Sociology - Observed inference time (s)": { - "description": "min=0.321, mean=0.321, max=0.321, sum=0.643 (2)", - "tab": "Efficiency", - "score": 0.32126195395170754 - }, - "Sociology - # eval": { - "description": "min=201, mean=201, max=201, sum=402 (2)", - "tab": "General information", - "score": 201.0 - }, - "Sociology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Sociology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Sociology - # prompt tokens": { - "description": "min=448.274, mean=448.274, max=448.274, sum=896.547 (2)", - "tab": "General information", - "score": 448.27363184079604 - }, - "Sociology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" - } - } - }, - { - "evaluation_name": "Virology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Virology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.536, - "details": { - "description": "min=0.536, mean=0.536, max=0.536, sum=1.072 (2)", - "tab": "Accuracy", - "Virology - Observed inference time (s)": { - "description": "min=0.309, mean=0.309, max=0.309, sum=0.618 (2)", - "tab": "Efficiency", - "score": 0.30881378018712424 - }, - "Virology - # eval": { - "description": "min=166, mean=166, max=166, sum=332 (2)", - "tab": "General information", - "score": 166.0 - }, - "Virology - # train": { - "description": "min=5, mean=5, max=5, sum=10 
(2)", - "tab": "General information", - "score": 5.0 - }, - "Virology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Virology - # prompt tokens": { - "description": "min=328.753, mean=328.753, max=328.753, sum=657.506 (2)", - "tab": "General information", - "score": 328.7530120481928 - }, - "Virology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" - } - } - }, - { - "evaluation_name": "World Religions", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on World Religions", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.86, - "details": { - "description": "min=0.86, mean=0.86, max=0.86, sum=1.719 (2)", - "tab": "Accuracy", - "World Religions - Observed inference time (s)": { - "description": "min=0.336, mean=0.336, max=0.336, sum=0.673 (2)", - "tab": "Efficiency", - "score": 0.3363749897270872 - }, - "World Religions - # eval": { - "description": "min=171, mean=171, max=171, sum=342 (2)", - "tab": "General information", - "score": 171.0 - }, - "World Religions - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "World Religions - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "World Religions - # prompt tokens": { - "description": "min=260.164, mean=260.164, max=260.164, sum=520.327 (2)", - "tab": "General information", - "score": 260.1637426900585 - }, - "World Religions - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" - } - } - }, - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.677, - "details": { - "tab": "Efficiency" - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_mmlu/google/gemini-1.5-flash-001/7056c7e7-f68a-4764-aa48-a8368ae2e317.json b/data/helm_mmlu/google/gemini-1.5-flash-001/7056c7e7-f68a-4764-aa48-a8368ae2e317.json deleted file mode 100644 index 7aac2d734..000000000 --- a/data/helm_mmlu/google/gemini-1.5-flash-001/7056c7e7-f68a-4764-aa48-a8368ae2e317.json +++ /dev/null @@ -1,3021 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/google_gemini-1.5-flash-001/1770835937.459157", - "retrieved_timestamp": 
"1770835937.459157", - "source_metadata": { - "source_name": "helm_mmlu", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gemini 1.5 Flash 001", - "id": "google/gemini-1.5-flash-001", - "developer": "google", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "MMLU All Subjects", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU All Subjects", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.779, - "details": { - "description": "min=0.374, mean=0.779, max=0.974, sum=88.804 (114)", - "tab": "Accuracy", - "MMLU All Subjects - Observed inference time (s)": { - "description": "min=0.386, mean=0.487, max=0.665, sum=55.55 (114)", - "tab": "Efficiency", - "score": 0.4872786268013793 - }, - "MMLU All Subjects - # eval": { - "description": "min=100, mean=246.351, max=1534, sum=28084 (114)", - "tab": "General information", - "score": 246.35087719298247 - }, - "MMLU All Subjects - # train": { - "description": "min=5, mean=5, max=5, sum=570 (114)", - "tab": "General information", - "score": 5.0 - }, - "MMLU All Subjects - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (114)", - "tab": "General information", - "score": 0.0 - }, - "MMLU All Subjects - # prompt tokens": { - "description": "min=268.164, mean=632.617, max=2797.424, sum=72118.345 (114)", - "tab": "General information", - "score": 632.6170571214202 - }, - "MMLU All Subjects - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (114)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - 
"mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] - } - } - }, - { - "evaluation_name": "Abstract Algebra", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Abstract Algebra", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.58, - "details": { - "description": "min=0.58, mean=0.58, max=0.58, sum=1.16 (2)", - "tab": "Accuracy", - "Abstract Algebra - Observed inference time (s)": { - "description": "min=0.596, mean=0.596, max=0.596, sum=1.191 (2)", - "tab": "Efficiency", - "score": 0.595533971786499 - }, - "Abstract Algebra - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Abstract Algebra - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Abstract Algebra - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Abstract Algebra - # prompt tokens": { - "description": "min=383.97, mean=383.97, max=383.97, sum=767.94 (2)", - "tab": "General information", - "score": 383.97 - }, - "Abstract Algebra - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" - } - } - }, - { - "evaluation_name": "Anatomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Anatomy", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8, - "details": { - "description": "min=0.8, mean=0.8, max=0.8, sum=1.6 (2)", - "tab": "Accuracy", - "Anatomy - Observed inference time (s)": { - "description": "min=0.536, mean=0.536, max=0.536, sum=1.071 (2)", - "tab": "Efficiency", - "score": 0.5356822949868661 - }, - "Anatomy - # eval": { - "description": "min=135, mean=135, max=135, sum=270 (2)", - "tab": "General information", - "score": 135.0 - }, - "Anatomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Anatomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Anatomy - # prompt tokens": { - "description": "min=344.356, mean=344.356, max=344.356, sum=688.711 (2)", - "tab": "General information", - "score": 344.35555555555555 - }, - "Anatomy - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" - } - } - }, - { - "evaluation_name": "College Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on College Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.696, - "details": { - "description": "min=0.696, mean=0.696, max=0.696, sum=1.392 (2)", - "tab": "Accuracy", - "College Chemistry - Observed inference time (s)": { - "description": "min=0.62, mean=0.62, max=0.62, sum=1.24 (2)", - "tab": "Efficiency", - "score": 0.6201749587059021 - }, - "College Biology - Observed inference time (s)": { - "description": "min=0.497, mean=0.497, max=0.497, sum=0.995 (2)", - "tab": "Efficiency", - "score": 0.4974212066994773 - }, - "College Computer Science - Observed inference time (s)": { - "description": "min=0.571, mean=0.571, max=0.571, sum=1.143 (2)", - "tab": "Efficiency", - "score": 0.5714822864532471 - }, - "College Mathematics - Observed inference time (s)": { - "description": "min=0.543, mean=0.543, max=0.543, sum=1.085 (2)", - "tab": "Efficiency", - "score": 0.5425397109985352 - }, - "College Medicine - Observed inference time (s)": { - "description": "min=0.487, mean=0.487, max=0.487, sum=0.975 (2)", - "tab": "Efficiency", - "score": 0.48738120056990253 - }, - "College Physics - Observed inference time (s)": { - "description": "min=0.608, mean=0.608, max=0.608, sum=1.215 (2)", - "tab": "Efficiency", - "score": 0.6076285418342141 - }, - "College Chemistry - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Chemistry - # prompt tokens": { - "description": "min=570.02, mean=570.02, max=570.02, sum=1140.04 (2)", - "tab": "General information", - "score": 570.02 - }, - "College Chemistry - # output 
tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # eval": { - "description": "min=144, mean=144, max=144, sum=288 (2)", - "tab": "General information", - "score": 144.0 - }, - "College Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # prompt tokens": { - "description": "min=482.799, mean=482.799, max=482.799, sum=965.597 (2)", - "tab": "General information", - "score": 482.7986111111111 - }, - "College Biology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer Science - # prompt tokens": { - "description": "min=857.86, mean=857.86, max=857.86, sum=1715.72 (2)", - "tab": "General information", - "score": 857.86 - }, - "College Computer Science - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # prompt tokens": { - "description": "min=626.69, mean=626.69, max=626.69, sum=1253.38 (2)", - "tab": "General information", - "score": 626.69 - }, - "College Mathematics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Medicine - # eval": { - "description": "min=173, mean=173, max=173, sum=346 (2)", - "tab": "General information", - "score": 173.0 - }, - "College Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Medicine - # prompt tokens": { - "description": "min=513.37, mean=513.37, max=513.37, sum=1026.74 (2)", - "tab": "General information", - "score": 513.3699421965318 - }, - "College Medicine - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Physics - # eval": { - "description": "min=102, mean=102, max=102, sum=204 (2)", - "tab": "General information", - "score": 102.0 - }, - "College Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General 
information", - "score": 0.0 - }, - "College Physics - # prompt tokens": { - "description": "min=507.471, mean=507.471, max=507.471, sum=1014.941 (2)", - "tab": "General information", - "score": 507.47058823529414 - }, - "College Physics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" - } - } - }, - { - "evaluation_name": "Computer Security", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Computer Security", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.79, - "details": { - "description": "min=0.79, mean=0.79, max=0.79, sum=1.58 (2)", - "tab": "Accuracy", - "Computer Security - Observed inference time (s)": { - "description": "min=0.538, mean=0.538, max=0.538, sum=1.075 (2)", - "tab": "Efficiency", - "score": 0.537526171207428 - }, - "Computer Security - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Computer Security - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Computer Security - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Computer Security - # prompt tokens": { - "description": "min=380.91, mean=380.91, max=380.91, sum=761.82 (2)", - "tab": "General information", - "score": 380.91 - }, - "Computer Security - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" - } - } - }, - { - "evaluation_name": "Econometrics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Econometrics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.614, - "details": { - "description": "min=0.614, mean=0.614, max=0.614, sum=1.228 (2)", - "tab": "Accuracy", - "Econometrics - Observed inference time (s)": { - "description": "min=0.564, mean=0.564, max=0.564, sum=1.128 (2)", - "tab": "Efficiency", - "score": 0.5637641475911725 - }, - "Econometrics - # eval": { - "description": "min=114, mean=114, max=114, sum=228 (2)", - "tab": "General information", - "score": 114.0 - }, - "Econometrics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Econometrics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Econometrics - # prompt tokens": { - "description": "min=634.553, mean=634.553, max=634.553, sum=1269.105 (2)", - "tab": 
"General information", - "score": 634.5526315789474 - }, - "Econometrics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" - } - } - }, - { - "evaluation_name": "Global Facts", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Global Facts", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.53, - "details": { - "description": "min=0.53, mean=0.53, max=0.53, sum=1.06 (2)", - "tab": "Accuracy", - "Global Facts - Observed inference time (s)": { - "description": "min=0.549, mean=0.549, max=0.549, sum=1.097 (2)", - "tab": "Efficiency", - "score": 0.5487277007102966 - }, - "Global Facts - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Global Facts - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Global Facts - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Global Facts - # prompt tokens": { - "description": "min=456.54, mean=456.54, max=456.54, sum=913.08 (2)", - "tab": "General information", - "score": 456.54 - }, - "Global Facts - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" - } - } - }, - { - "evaluation_name": "Jurisprudence", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Jurisprudence", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.889, - "details": { - "description": "min=0.889, mean=0.889, max=0.889, sum=1.778 (2)", - "tab": "Accuracy", - "Jurisprudence - Observed inference time (s)": { - "description": "min=0.501, mean=0.501, max=0.501, sum=1.002 (2)", - "tab": "Efficiency", - "score": 0.5009041649323923 - }, - "Jurisprudence - # eval": { - "description": "min=108, mean=108, max=108, sum=216 (2)", - "tab": "General information", - "score": 108.0 - }, - "Jurisprudence - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Jurisprudence - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Jurisprudence - # prompt tokens": { - "description": "min=407.87, mean=407.87, max=407.87, sum=815.741 (2)", - "tab": "General information", - "score": 407.8703703703704 - }, - "Jurisprudence - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - 
"generation_config": { - "additional_details": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" - } - } - }, - { - "evaluation_name": "Philosophy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Philosophy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.791, - "details": { - "description": "min=0.791, mean=0.791, max=0.791, sum=1.582 (2)", - "tab": "Accuracy", - "Philosophy - Observed inference time (s)": { - "description": "min=0.48, mean=0.48, max=0.48, sum=0.96 (2)", - "tab": "Efficiency", - "score": 0.48008891700548373 - }, - "Philosophy - # eval": { - "description": "min=311, mean=311, max=311, sum=622 (2)", - "tab": "General information", - "score": 311.0 - }, - "Philosophy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Philosophy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Philosophy - # prompt tokens": { - "description": "min=340.907, mean=340.907, max=340.907, sum=681.814 (2)", - "tab": "General information", - "score": 340.90675241157555 - }, - "Philosophy - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" - } - } - }, - { - "evaluation_name": "Professional Psychology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Professional Psychology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.828, - "details": { - "description": "min=0.828, mean=0.828, max=0.828, sum=1.657 (2)", - "tab": "Accuracy", - "Professional Medicine - Observed inference time (s)": { - "description": "min=0.477, mean=0.477, max=0.477, sum=0.955 (2)", - "tab": "Efficiency", - "score": 0.47726698907099085 - }, - "Professional Accounting - Observed inference time (s)": { - "description": "min=0.44, mean=0.44, max=0.44, sum=0.88 (2)", - "tab": "Efficiency", - "score": 0.4398383096600255 - }, - "Professional Law - Observed inference time (s)": { - "description": "min=0.424, mean=0.424, max=0.424, sum=0.848 (2)", - "tab": "Efficiency", - "score": 0.42376324315969854 - }, - "Professional Psychology - Observed inference time (s)": { - "description": "min=0.386, mean=0.386, max=0.386, sum=0.773 (2)", - "tab": "Efficiency", - "score": 0.3864205361981141 - }, - "Professional Medicine - # eval": { - "description": "min=272, mean=272, max=272, sum=544 (2)", - "tab": "General information", - "score": 272.0 - }, - "Professional Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", 
- "tab": "General information", - "score": 0.0 - }, - "Professional Medicine - # prompt tokens": { - "description": "min=1113.092, mean=1113.092, max=1113.092, sum=2226.184 (2)", - "tab": "General information", - "score": 1113.0919117647059 - }, - "Professional Medicine - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Accounting - # eval": { - "description": "min=282, mean=282, max=282, sum=564 (2)", - "tab": "General information", - "score": 282.0 - }, - "Professional Accounting - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Accounting - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Accounting - # prompt tokens": { - "description": "min=755.418, mean=755.418, max=755.418, sum=1510.837 (2)", - "tab": "General information", - "score": 755.418439716312 - }, - "Professional Accounting - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # eval": { - "description": "min=1534, mean=1534, max=1534, sum=3068 (2)", - "tab": "General information", - "score": 1534.0 - }, - "Professional Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # prompt tokens": { - "description": "min=1685.119, mean=1685.119, max=1685.119, sum=3370.239 (2)", - "tab": "General information", - "score": 1685.119295958279 - }, - "Professional Law - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # eval": { - "description": "min=612, mean=612, max=612, sum=1224 (2)", - "tab": "General information", - "score": 612.0 - }, - "Professional Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # prompt tokens": { - "description": "min=594.363, mean=594.363, max=594.363, sum=1188.725 (2)", - "tab": "General information", - "score": 594.3627450980392 - }, - "Professional Psychology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" - } - } - }, - { - "evaluation_name": "Us Foreign Policy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Us Foreign Policy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.93, - "details": { - "description": "min=0.93, mean=0.93, max=0.93, sum=1.86 (2)", - "tab": "Accuracy", - "Us Foreign 
Policy - Observed inference time (s)": { - "description": "min=0.525, mean=0.525, max=0.525, sum=1.05 (2)", - "tab": "Efficiency", - "score": 0.5247626876831055 - }, - "Us Foreign Policy - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Us Foreign Policy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Us Foreign Policy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Us Foreign Policy - # prompt tokens": { - "description": "min=438.2, mean=438.2, max=438.2, sum=876.4 (2)", - "tab": "General information", - "score": 438.2 - }, - "Us Foreign Policy - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" - } - } - }, - { - "evaluation_name": "Astronomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Astronomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.882, - "details": { - "description": "min=0.882, mean=0.882, max=0.882, sum=1.763 (2)", - "tab": "Accuracy", - "Astronomy - Observed inference time (s)": { - "description": "min=0.498, mean=0.498, max=0.498, sum=0.995 (2)", - "tab": "Efficiency", - "score": 0.49771531004654734 - }, - "Astronomy - # eval": { - "description": "min=152, mean=152, max=152, sum=304 (2)", - "tab": "General information", - "score": 152.0 - }, - "Astronomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Astronomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Astronomy - # prompt tokens": { - "description": "min=602.421, mean=602.421, max=602.421, sum=1204.842 (2)", - "tab": "General information", - "score": 602.421052631579 - }, - "Astronomy - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" - } - } - }, - { - "evaluation_name": "Business Ethics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Business Ethics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.81, - "details": { - "description": "min=0.81, mean=0.81, max=0.81, sum=1.62 (2)", - "tab": "Accuracy", - "Business Ethics - Observed inference time (s)": { - "description": "min=0.608, mean=0.608, max=0.608, sum=1.216 (2)", - "tab": "Efficiency", - "score": 0.608082628250122 - }, - "Business Ethics - # eval": { - "description": 
"min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Business Ethics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Business Ethics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Business Ethics - # prompt tokens": { - "description": "min=552.87, mean=552.87, max=552.87, sum=1105.74 (2)", - "tab": "General information", - "score": 552.87 - }, - "Business Ethics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" - } - } - }, - { - "evaluation_name": "Clinical Knowledge", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Clinical Knowledge", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.834, - "details": { - "description": "min=0.834, mean=0.834, max=0.834, sum=1.668 (2)", - "tab": "Accuracy", - "Clinical Knowledge - Observed inference time (s)": { - "description": "min=0.419, mean=0.419, max=0.419, sum=0.839 (2)", - "tab": "Efficiency", - "score": 0.41935023991566783 - }, - "Clinical Knowledge - # eval": { - "description": "min=265, mean=265, max=265, sum=530 (2)", - "tab": "General information", - "score": 265.0 - }, - "Clinical Knowledge - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Clinical Knowledge - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Clinical Knowledge - # prompt tokens": { - "description": "min=402.592, mean=402.592, max=402.592, sum=805.185 (2)", - "tab": "General information", - "score": 402.5924528301887 - }, - "Clinical Knowledge - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" - } - } - }, - { - "evaluation_name": "Conceptual Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Conceptual Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.851, - "details": { - "description": "min=0.851, mean=0.851, max=0.851, sum=1.702 (2)", - "tab": "Accuracy", - "Conceptual Physics - Observed inference time (s)": { - "description": "min=0.451, mean=0.451, max=0.451, sum=0.901 (2)", - "tab": "Efficiency", - "score": 0.4506680082767568 - }, - "Conceptual Physics - # eval": { - "description": "min=235, mean=235, max=235, sum=470 (2)", - "tab": "General information", - "score": 235.0 - }, - "Conceptual 
Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Conceptual Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Conceptual Physics - # prompt tokens": { - "description": "min=309.213, mean=309.213, max=309.213, sum=618.426 (2)", - "tab": "General information", - "score": 309.21276595744683 - }, - "Conceptual Physics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" - } - } - }, - { - "evaluation_name": "Electrical Engineering", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Electrical Engineering", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8, - "details": { - "description": "min=0.8, mean=0.8, max=0.8, sum=1.6 (2)", - "tab": "Accuracy", - "Electrical Engineering - Observed inference time (s)": { - "description": "min=0.46, mean=0.46, max=0.46, sum=0.92 (2)", - "tab": "Efficiency", - "score": 0.4601488047632678 - }, - "Electrical Engineering - # eval": { - "description": "min=145, mean=145, max=145, sum=290 (2)", - "tab": "General information", - "score": 145.0 - }, - "Electrical Engineering - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Electrical Engineering - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Electrical Engineering - # prompt tokens": { - "description": "min=474.786, mean=474.786, max=474.786, sum=949.572 (2)", - "tab": "General information", - "score": 474.78620689655173 - }, - "Electrical Engineering - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" - } - } - }, - { - "evaluation_name": "Elementary Mathematics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Elementary Mathematics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.754, - "details": { - "description": "min=0.754, mean=0.754, max=0.754, sum=1.508 (2)", - "tab": "Accuracy", - "Elementary Mathematics - Observed inference time (s)": { - "description": "min=0.41, mean=0.41, max=0.41, sum=0.819 (2)", - "tab": "Efficiency", - "score": 0.40957188984704396 - }, - "Elementary Mathematics - # eval": { - "description": "min=378, mean=378, max=378, sum=756 (2)", - "tab": "General information", - "score": 378.0 - }, - "Elementary Mathematics - # train": { - "description": 
"min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Elementary Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Elementary Mathematics - # prompt tokens": { - "description": "min=597.341, mean=597.341, max=597.341, sum=1194.683 (2)", - "tab": "General information", - "score": 597.3412698412699 - }, - "Elementary Mathematics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" - } - } - }, - { - "evaluation_name": "Formal Logic", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Formal Logic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.627, - "details": { - "description": "min=0.627, mean=0.627, max=0.627, sum=1.254 (2)", - "tab": "Accuracy", - "Formal Logic - Observed inference time (s)": { - "description": "min=0.524, mean=0.524, max=0.524, sum=1.047 (2)", - "tab": "Efficiency", - "score": 0.5235741989953178 - }, - "Formal Logic - # eval": { - "description": "min=126, mean=126, max=126, sum=252 (2)", - "tab": "General information", - "score": 126.0 - }, - "Formal Logic - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Formal Logic - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Formal Logic - # prompt tokens": { - "description": "min=619.563, mean=619.563, max=619.563, sum=1239.127 (2)", - "tab": "General information", - "score": 619.563492063492 - }, - "Formal Logic - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" - } - } - }, - { - "evaluation_name": "High School World History", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on High School World History", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.907, - "details": { - "description": "min=0.907, mean=0.907, max=0.907, sum=1.814 (2)", - "tab": "Accuracy", - "High School Biology - Observed inference time (s)": { - "description": "min=0.439, mean=0.439, max=0.439, sum=0.878 (2)", - "tab": "Efficiency", - "score": 0.43886603309262184 - }, - "High School Chemistry - Observed inference time (s)": { - "description": "min=0.468, mean=0.468, max=0.468, sum=0.937 (2)", - "tab": "Efficiency", - "score": 0.4683608938320517 - }, - "High School Computer Science - Observed inference time (s)": { - "description": "min=0.511, mean=0.511, max=0.511, 
sum=1.022 (2)", - "tab": "Efficiency", - "score": 0.5109630298614501 - }, - "High School European History - Observed inference time (s)": { - "description": "min=0.665, mean=0.665, max=0.665, sum=1.33 (2)", - "tab": "Efficiency", - "score": 0.665167844656742 - }, - "High School Geography - Observed inference time (s)": { - "description": "min=0.432, mean=0.432, max=0.432, sum=0.863 (2)", - "tab": "Efficiency", - "score": 0.43152768804569436 - }, - "High School Government And Politics - Observed inference time (s)": { - "description": "min=0.422, mean=0.422, max=0.422, sum=0.845 (2)", - "tab": "Efficiency", - "score": 0.4224596888290168 - }, - "High School Macroeconomics - Observed inference time (s)": { - "description": "min=0.404, mean=0.404, max=0.404, sum=0.808 (2)", - "tab": "Efficiency", - "score": 0.4038744736940433 - }, - "High School Mathematics - Observed inference time (s)": { - "description": "min=0.435, mean=0.435, max=0.435, sum=0.869 (2)", - "tab": "Efficiency", - "score": 0.43474441987496837 - }, - "High School Microeconomics - Observed inference time (s)": { - "description": "min=0.416, mean=0.416, max=0.416, sum=0.832 (2)", - "tab": "Efficiency", - "score": 0.4159359881857864 - }, - "High School Physics - Observed inference time (s)": { - "description": "min=0.493, mean=0.493, max=0.493, sum=0.985 (2)", - "tab": "Efficiency", - "score": 0.49265997772974685 - }, - "High School Psychology - Observed inference time (s)": { - "description": "min=0.418, mean=0.418, max=0.418, sum=0.835 (2)", - "tab": "Efficiency", - "score": 0.41751264342490363 - }, - "High School Statistics - Observed inference time (s)": { - "description": "min=0.497, mean=0.497, max=0.497, sum=0.993 (2)", - "tab": "Efficiency", - "score": 0.49666665218494555 - }, - "High School US History - Observed inference time (s)": { - "description": "min=0.606, mean=0.606, max=0.606, sum=1.213 (2)", - "tab": "Efficiency", - "score": 0.6064977821181802 - }, - "High School World History - Observed inference time (s)": { - "description": "min=0.469, mean=0.469, max=0.469, sum=0.939 (2)", - "tab": "Efficiency", - "score": 0.46946642569851776 - }, - "High School Biology - # eval": { - "description": "min=310, mean=310, max=310, sum=620 (2)", - "tab": "General information", - "score": 310.0 - }, - "High School Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Biology - # prompt tokens": { - "description": "min=500.958, mean=500.958, max=500.958, sum=1001.916 (2)", - "tab": "General information", - "score": 500.958064516129 - }, - "High School Biology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # eval": { - "description": "min=203, mean=203, max=203, sum=406 (2)", - "tab": "General information", - "score": 203.0 - }, - "High School Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # prompt tokens": { - "description": "min=513.064, mean=513.064, max=513.064, sum=1026.128 (2)", - "tab": "General information", - "score": 513.064039408867 - }, - 
"High School Chemistry - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "High School Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # prompt tokens": { - "description": "min=935.13, mean=935.13, max=935.13, sum=1870.26 (2)", - "tab": "General information", - "score": 935.13 - }, - "High School Computer Science - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School European History - # eval": { - "description": "min=165, mean=165, max=165, sum=330 (2)", - "tab": "General information", - "score": 165.0 - }, - "High School European History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School European History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School European History - # prompt tokens": { - "description": "min=2797.424, mean=2797.424, max=2797.424, sum=5594.848 (2)", - "tab": "General information", - "score": 2797.4242424242425 - }, - "High School European History - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Geography - # eval": { - "description": "min=198, mean=198, max=198, sum=396 (2)", - "tab": "General information", - "score": 198.0 - }, - "High School Geography - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Geography - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Geography - # prompt tokens": { - "description": "min=394.773, mean=394.773, max=394.773, sum=789.545 (2)", - "tab": "General information", - "score": 394.77272727272725 - }, - "High School Geography - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # eval": { - "description": "min=193, mean=193, max=193, sum=386 (2)", - "tab": "General information", - "score": 193.0 - }, - "High School Government And Politics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Government And Politics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # prompt tokens": { - "description": "min=479.301, mean=479.301, max=479.301, sum=958.601 (2)", - "tab": "General information", - "score": 479.30051813471505 - }, - "High School Government And Politics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - # eval": { - "description": "min=390, mean=390, max=390, sum=780 (2)", - "tab": "General information", - "score": 
390.0 - }, - "High School Macroeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Macroeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - # prompt tokens": { - "description": "min=396.541, mean=396.541, max=396.541, sum=793.082 (2)", - "tab": "General information", - "score": 396.54102564102567 - }, - "High School Macroeconomics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # eval": { - "description": "min=270, mean=270, max=270, sum=540 (2)", - "tab": "General information", - "score": 270.0 - }, - "High School Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # prompt tokens": { - "description": "min=566.822, mean=566.822, max=566.822, sum=1133.644 (2)", - "tab": "General information", - "score": 566.8222222222222 - }, - "High School Mathematics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Microeconomics - # eval": { - "description": "min=238, mean=238, max=238, sum=476 (2)", - "tab": "General information", - "score": 238.0 - }, - "High School Microeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Microeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Microeconomics - # prompt tokens": { - "description": "min=415.954, mean=415.954, max=415.954, sum=831.908 (2)", - "tab": "General information", - "score": 415.953781512605 - }, - "High School Microeconomics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # eval": { - "description": "min=151, mean=151, max=151, sum=302 (2)", - "tab": "General information", - "score": 151.0 - }, - "High School Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # prompt tokens": { - "description": "min=591.715, mean=591.715, max=591.715, sum=1183.43 (2)", - "tab": "General information", - "score": 591.7152317880794 - }, - "High School Physics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # eval": { - "description": "min=545, mean=545, max=545, sum=1090 (2)", - "tab": "General information", - "score": 545.0 - }, - "High School Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # prompt tokens": { - "description": "min=502.604, 
mean=502.604, max=502.604, sum=1005.207 (2)", - "tab": "General information", - "score": 502.60366972477067 - }, - "High School Psychology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Statistics - # eval": { - "description": "min=216, mean=216, max=216, sum=432 (2)", - "tab": "General information", - "score": 216.0 - }, - "High School Statistics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Statistics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Statistics - # prompt tokens": { - "description": "min=858.931, mean=858.931, max=858.931, sum=1717.861 (2)", - "tab": "General information", - "score": 858.9305555555555 - }, - "High School Statistics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # eval": { - "description": "min=204, mean=204, max=204, sum=408 (2)", - "tab": "General information", - "score": 204.0 - }, - "High School US History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School US History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # prompt tokens": { - "description": "min=2205.583, mean=2205.583, max=2205.583, sum=4411.167 (2)", - "tab": "General information", - "score": 2205.5833333333335 - }, - "High School US History - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School World History - # eval": { - "description": "min=237, mean=237, max=237, sum=474 (2)", - "tab": "General information", - "score": 237.0 - }, - "High School World History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School World History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School World History - # prompt tokens": { - "description": "min=1426.544, mean=1426.544, max=1426.544, sum=2853.089 (2)", - "tab": "General information", - "score": 1426.5443037974683 - }, - "High School World History - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" - } - } - }, - { - "evaluation_name": "Human Sexuality", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Human Sexuality", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.374, - "details": { - "description": "min=0.374, mean=0.374, max=0.374, sum=0.748 (2)", - "tab": "Accuracy", - "Human Aging - Observed inference time (s)": { - "description": "min=0.45, mean=0.45, max=0.45, sum=0.901 
(2)", - "tab": "Efficiency", - "score": 0.45039264396701695 - }, - "Human Sexuality - Observed inference time (s)": { - "description": "min=0.494, mean=0.494, max=0.494, sum=0.989 (2)", - "tab": "Efficiency", - "score": 0.494300215931262 - }, - "Human Aging - # eval": { - "description": "min=223, mean=223, max=223, sum=446 (2)", - "tab": "General information", - "score": 223.0 - }, - "Human Aging - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Aging - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Aging - # prompt tokens": { - "description": "min=321.587, mean=321.587, max=321.587, sum=643.175 (2)", - "tab": "General information", - "score": 321.58744394618833 - }, - "Human Aging - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # eval": { - "description": "min=131, mean=131, max=131, sum=262 (2)", - "tab": "General information", - "score": 131.0 - }, - "Human Sexuality - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Sexuality - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # prompt tokens": { - "description": "min=347.183, mean=347.183, max=347.183, sum=694.366 (2)", - "tab": "General information", - "score": 347.1832061068702 - }, - "Human Sexuality - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" - } - } - }, - { - "evaluation_name": "International Law", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on International Law", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.901, - "details": { - "description": "min=0.901, mean=0.901, max=0.901, sum=1.802 (2)", - "tab": "Accuracy", - "International Law - Observed inference time (s)": { - "description": "min=0.543, mean=0.543, max=0.543, sum=1.086 (2)", - "tab": "Efficiency", - "score": 0.5427691305964446 - }, - "International Law - # eval": { - "description": "min=121, mean=121, max=121, sum=242 (2)", - "tab": "General information", - "score": 121.0 - }, - "International Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "International Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "International Law - # prompt tokens": { - "description": "min=644.165, mean=644.165, max=644.165, sum=1288.331 (2)", - "tab": "General information", - "score": 644.1652892561983 - }, - "International Law - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "international_law", - "method": 
"multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" - } - } - }, - { - "evaluation_name": "Logical Fallacies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Logical Fallacies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.853, - "details": { - "description": "min=0.853, mean=0.853, max=0.853, sum=1.706 (2)", - "tab": "Accuracy", - "Logical Fallacies - Observed inference time (s)": { - "description": "min=0.485, mean=0.485, max=0.485, sum=0.969 (2)", - "tab": "Efficiency", - "score": 0.48451554263296304 - }, - "Logical Fallacies - # eval": { - "description": "min=163, mean=163, max=163, sum=326 (2)", - "tab": "General information", - "score": 163.0 - }, - "Logical Fallacies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Logical Fallacies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Logical Fallacies - # prompt tokens": { - "description": "min=450.049, mean=450.049, max=450.049, sum=900.098 (2)", - "tab": "General information", - "score": 450.0490797546012 - }, - "Logical Fallacies - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" - } - } - }, - { - "evaluation_name": "Machine Learning", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Machine Learning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.571, - "details": { - "description": "min=0.571, mean=0.571, max=0.571, sum=1.143 (2)", - "tab": "Accuracy", - "Machine Learning - Observed inference time (s)": { - "description": "min=0.515, mean=0.515, max=0.515, sum=1.029 (2)", - "tab": "Efficiency", - "score": 0.5145284725087029 - }, - "Machine Learning - # eval": { - "description": "min=112, mean=112, max=112, sum=224 (2)", - "tab": "General information", - "score": 112.0 - }, - "Machine Learning - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Machine Learning - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Machine Learning - # prompt tokens": { - "description": "min=702.402, mean=702.402, max=702.402, sum=1404.804 (2)", - "tab": "General information", - "score": 702.4017857142857 - }, - "Machine Learning - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" - } - } - }, - { - 
"evaluation_name": "Management", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Management", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.864, - "details": { - "description": "min=0.864, mean=0.864, max=0.864, sum=1.728 (2)", - "tab": "Accuracy", - "Management - Observed inference time (s)": { - "description": "min=0.57, mean=0.57, max=0.57, sum=1.139 (2)", - "tab": "Efficiency", - "score": 0.5696360532519886 - }, - "Management - # eval": { - "description": "min=103, mean=103, max=103, sum=206 (2)", - "tab": "General information", - "score": 103.0 - }, - "Management - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Management - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Management - # prompt tokens": { - "description": "min=281.301, mean=281.301, max=281.301, sum=562.602 (2)", - "tab": "General information", - "score": 281.3009708737864 - }, - "Management - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" - } - } - }, - { - "evaluation_name": "Marketing", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Marketing", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.94, - "details": { - "description": "min=0.94, mean=0.94, max=0.94, sum=1.88 (2)", - "tab": "Accuracy", - "Marketing - Observed inference time (s)": { - "description": "min=0.438, mean=0.438, max=0.438, sum=0.876 (2)", - "tab": "Efficiency", - "score": 0.43808113204108345 - }, - "Marketing - # eval": { - "description": "min=234, mean=234, max=234, sum=468 (2)", - "tab": "General information", - "score": 234.0 - }, - "Marketing - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Marketing - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Marketing - # prompt tokens": { - "description": "min=428.35, mean=428.35, max=428.35, sum=856.701 (2)", - "tab": "General information", - "score": 428.35042735042737 - }, - "Marketing - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" - } - } - }, - { - "evaluation_name": "Medical Genetics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - 
"metric_config": { - "evaluation_description": "EM on Medical Genetics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.86, - "details": { - "description": "min=0.86, mean=0.86, max=0.86, sum=1.72 (2)", - "tab": "Accuracy", - "Medical Genetics - Observed inference time (s)": { - "description": "min=0.514, mean=0.514, max=0.514, sum=1.029 (2)", - "tab": "Efficiency", - "score": 0.514304575920105 - }, - "Medical Genetics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Medical Genetics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Medical Genetics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Medical Genetics - # prompt tokens": { - "description": "min=338.89, mean=338.89, max=338.89, sum=677.78 (2)", - "tab": "General information", - "score": 338.89 - }, - "Medical Genetics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" - } - } - }, - { - "evaluation_name": "Miscellaneous", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Miscellaneous", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.886, - "details": { - "description": "min=0.886, mean=0.886, max=0.886, sum=1.773 (2)", - "tab": "Accuracy", - "Miscellaneous - Observed inference time (s)": { - "description": "min=0.395, mean=0.395, max=0.395, sum=0.79 (2)", - "tab": "Efficiency", - "score": 0.3951411627870562 - }, - "Miscellaneous - # eval": { - "description": "min=783, mean=783, max=783, sum=1566 (2)", - "tab": "General information", - "score": 783.0 - }, - "Miscellaneous - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Miscellaneous - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Miscellaneous - # prompt tokens": { - "description": "min=314.669, mean=314.669, max=314.669, sum=629.338 (2)", - "tab": "General information", - "score": 314.669220945083 - }, - "Miscellaneous - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" - } - } - }, - { - "evaluation_name": "Moral Scenarios", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Moral Scenarios", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.637, - "details": { - "description": "min=0.637, mean=0.637, max=0.637, sum=1.274 (2)", - "tab": "Accuracy", - "Moral Disputes - Observed inference time (s)": { - "description": "min=0.403, mean=0.403, max=0.403, sum=0.806 (2)", - "tab": "Efficiency", - "score": 0.4028203390646672 - }, - "Moral Scenarios - Observed inference time (s)": { - "description": "min=0.4, mean=0.4, max=0.4, sum=0.801 (2)", - "tab": "Efficiency", - "score": 0.4004550709633243 - }, - "Moral Disputes - # eval": { - "description": "min=346, mean=346, max=346, sum=692 (2)", - "tab": "General information", - "score": 346.0 - }, - "Moral Disputes - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Disputes - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Disputes - # prompt tokens": { - "description": "min=495.003, mean=495.003, max=495.003, sum=990.006 (2)", - "tab": "General information", - "score": 495.0028901734104 - }, - "Moral Disputes - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # eval": { - "description": "min=895, mean=895, max=895, sum=1790 (2)", - "tab": "General information", - "score": 895.0 - }, - "Moral Scenarios - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Scenarios - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # prompt tokens": { - "description": "min=690.542, mean=690.542, max=690.542, sum=1381.084 (2)", - "tab": "General information", - "score": 690.5418994413408 - }, - "Moral Scenarios - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" - } - } - }, - { - "evaluation_name": "Nutrition", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Nutrition", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.82, - "details": { - "description": "min=0.82, mean=0.82, max=0.82, sum=1.641 (2)", - "tab": "Accuracy", - "Nutrition - Observed inference time (s)": { - "description": "min=0.402, mean=0.402, max=0.402, sum=0.804 (2)", - "tab": "Efficiency", - "score": 0.4019969655018227 - }, - "Nutrition - # eval": { - "description": "min=306, mean=306, max=306, sum=612 (2)", - "tab": "General information", - "score": 306.0 - }, - "Nutrition - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Nutrition - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Nutrition - # prompt tokens": { - "description": "min=585.48, mean=585.48, max=585.48, sum=1170.961 (2)", - "tab": "General information", - "score": 585.4803921568628 - }, - "Nutrition - # output tokens": { - "description": 
"min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" - } - } - }, - { - "evaluation_name": "Prehistory", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Prehistory", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.867, - "details": { - "description": "min=0.867, mean=0.867, max=0.867, sum=1.735 (2)", - "tab": "Accuracy", - "Prehistory - Observed inference time (s)": { - "description": "min=0.403, mean=0.403, max=0.403, sum=0.807 (2)", - "tab": "Efficiency", - "score": 0.40340044910525097 - }, - "Prehistory - # eval": { - "description": "min=324, mean=324, max=324, sum=648 (2)", - "tab": "General information", - "score": 324.0 - }, - "Prehistory - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Prehistory - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Prehistory - # prompt tokens": { - "description": "min=540.198, mean=540.198, max=540.198, sum=1080.395 (2)", - "tab": "General information", - "score": 540.1975308641976 - }, - "Prehistory - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" - } - } - }, - { - "evaluation_name": "Public Relations", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Public Relations", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.764, - "details": { - "description": "min=0.764, mean=0.764, max=0.764, sum=1.527 (2)", - "tab": "Accuracy", - "Public Relations - Observed inference time (s)": { - "description": "min=0.554, mean=0.554, max=0.554, sum=1.109 (2)", - "tab": "Efficiency", - "score": 0.5543096672404896 - }, - "Public Relations - # eval": { - "description": "min=110, mean=110, max=110, sum=220 (2)", - "tab": "General information", - "score": 110.0 - }, - "Public Relations - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Public Relations - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Public Relations - # prompt tokens": { - "description": "min=426.655, mean=426.655, max=426.655, sum=853.309 (2)", - "tab": "General information", - "score": 426.6545454545454 - }, - "Public Relations - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "public_relations", - "method": 
"multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" - } - } - }, - { - "evaluation_name": "Security Studies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Security Studies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.808, - "details": { - "description": "min=0.808, mean=0.808, max=0.808, sum=1.616 (2)", - "tab": "Accuracy", - "Security Studies - Observed inference time (s)": { - "description": "min=0.456, mean=0.456, max=0.456, sum=0.913 (2)", - "tab": "Efficiency", - "score": 0.45644889948319417 - }, - "Security Studies - # eval": { - "description": "min=245, mean=245, max=245, sum=490 (2)", - "tab": "General information", - "score": 245.0 - }, - "Security Studies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Security Studies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Security Studies - # prompt tokens": { - "description": "min=1193.869, mean=1193.869, max=1193.869, sum=2387.739 (2)", - "tab": "General information", - "score": 1193.869387755102 - }, - "Security Studies - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" - } - } - }, - { - "evaluation_name": "Sociology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Sociology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.915, - "details": { - "description": "min=0.915, mean=0.915, max=0.915, sum=1.831 (2)", - "tab": "Accuracy", - "Sociology - Observed inference time (s)": { - "description": "min=0.458, mean=0.458, max=0.458, sum=0.916 (2)", - "tab": "Efficiency", - "score": 0.4581311152349064 - }, - "Sociology - # eval": { - "description": "min=201, mean=201, max=201, sum=402 (2)", - "tab": "General information", - "score": 201.0 - }, - "Sociology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Sociology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Sociology - # prompt tokens": { - "description": "min=456.274, mean=456.274, max=456.274, sum=912.547 (2)", - "tab": "General information", - "score": 456.27363184079604 - }, - "Sociology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" - } - } - }, - { - "evaluation_name": "Virology", - "source_data": { - "dataset_name": "helm_mmlu", - 
"source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Virology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.566, - "details": { - "description": "min=0.566, mean=0.566, max=0.566, sum=1.133 (2)", - "tab": "Accuracy", - "Virology - Observed inference time (s)": { - "description": "min=0.45, mean=0.45, max=0.45, sum=0.899 (2)", - "tab": "Efficiency", - "score": 0.44963935197117816 - }, - "Virology - # eval": { - "description": "min=166, mean=166, max=166, sum=332 (2)", - "tab": "General information", - "score": 166.0 - }, - "Virology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Virology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Virology - # prompt tokens": { - "description": "min=336.753, mean=336.753, max=336.753, sum=673.506 (2)", - "tab": "General information", - "score": 336.7530120481928 - }, - "Virology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" - } - } - }, - { - "evaluation_name": "World Religions", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on World Religions", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.883, - "details": { - "description": "min=0.883, mean=0.883, max=0.883, sum=1.766 (2)", - "tab": "Accuracy", - "World Religions - Observed inference time (s)": { - "description": "min=0.459, mean=0.459, max=0.459, sum=0.919 (2)", - "tab": "Efficiency", - "score": 0.45928927890041416 - }, - "World Religions - # eval": { - "description": "min=171, mean=171, max=171, sum=342 (2)", - "tab": "General information", - "score": 171.0 - }, - "World Religions - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "World Religions - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "World Religions - # prompt tokens": { - "description": "min=268.164, mean=268.164, max=268.164, sum=536.327 (2)", - "tab": "General information", - "score": 268.1637426900585 - }, - "World Religions - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" - } - } - }, - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "How 
many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.47, - "details": { - "tab": "Efficiency" - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_mmlu/google/gemini-1.5-flash-002/5e67014d-6ca1-4e65-a85a-84d91e147d4d.json b/data/helm_mmlu/google/gemini-1.5-flash-002/5e67014d-6ca1-4e65-a85a-84d91e147d4d.json deleted file mode 100644 index a87c94c3b..000000000 --- a/data/helm_mmlu/google/gemini-1.5-flash-002/5e67014d-6ca1-4e65-a85a-84d91e147d4d.json +++ /dev/null @@ -1,3021 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/google_gemini-1.5-flash-002/1770835937.459157", - "retrieved_timestamp": "1770835937.459157", - "source_metadata": { - "source_name": "helm_mmlu", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gemini 1.5 Flash 002", - "id": "google/gemini-1.5-flash-002", - "developer": "google", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "MMLU All Subjects", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU All Subjects", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.739, - "details": { - "description": "min=0.27, mean=0.739, max=0.959, sum=84.201 (114)", - "tab": "Accuracy", - "MMLU All Subjects - Observed inference time (s)": { - "description": "min=0.262, mean=0.315, max=0.767, sum=35.937 (114)", - "tab": "Efficiency", - "score": 0.3152340762781926 - }, - "MMLU All Subjects - # eval": { - "description": "min=100, mean=246.351, max=1534, sum=28084 (114)", - "tab": "General information", - "score": 246.35087719298247 - }, - "MMLU All Subjects - # train": { - "description": "min=5, mean=5, max=5, sum=570 (114)", - "tab": "General information", - "score": 5.0 - }, - "MMLU All Subjects - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (114)", - "tab": "General information", - "score": 0.0 - }, - "MMLU All Subjects - # prompt tokens": { - "description": "min=268.164, mean=632.617, max=2797.424, sum=72118.345 (114)", - "tab": "General information", - "score": 632.6170571214202 - }, - "MMLU All Subjects - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (114)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - 
"high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] - } - } - }, - { - "evaluation_name": "Abstract Algebra", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Abstract Algebra", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.63, - "details": { - "description": "min=0.63, mean=0.63, max=0.63, sum=1.26 (2)", - "tab": "Accuracy", - "Abstract Algebra - Observed inference time (s)": { - "description": "min=0.298, mean=0.298, max=0.298, sum=0.596 (2)", - "tab": "Efficiency", - "score": 0.29806760787963865 - }, - "Abstract Algebra - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Abstract Algebra - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Abstract Algebra - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Abstract Algebra - # prompt tokens": { - 
"description": "min=383.97, mean=383.97, max=383.97, sum=767.94 (2)", - "tab": "General information", - "score": 383.97 - }, - "Abstract Algebra - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" - } - } - }, - { - "evaluation_name": "Anatomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Anatomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.793, - "details": { - "description": "min=0.793, mean=0.793, max=0.793, sum=1.585 (2)", - "tab": "Accuracy", - "Anatomy - Observed inference time (s)": { - "description": "min=0.292, mean=0.292, max=0.292, sum=0.583 (2)", - "tab": "Efficiency", - "score": 0.29152930047776965 - }, - "Anatomy - # eval": { - "description": "min=135, mean=135, max=135, sum=270 (2)", - "tab": "General information", - "score": 135.0 - }, - "Anatomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Anatomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Anatomy - # prompt tokens": { - "description": "min=344.356, mean=344.356, max=344.356, sum=688.711 (2)", - "tab": "General information", - "score": 344.35555555555555 - }, - "Anatomy - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" - } - } - }, - { - "evaluation_name": "College Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on College Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.637, - "details": { - "description": "min=0.637, mean=0.637, max=0.637, sum=1.275 (2)", - "tab": "Accuracy", - "College Chemistry - Observed inference time (s)": { - "description": "min=0.299, mean=0.299, max=0.299, sum=0.598 (2)", - "tab": "Efficiency", - "score": 0.2988364624977112 - }, - "College Biology - Observed inference time (s)": { - "description": "min=0.298, mean=0.298, max=0.298, sum=0.596 (2)", - "tab": "Efficiency", - "score": 0.29801897870169747 - }, - "College Computer Science - Observed inference time (s)": { - "description": "min=0.299, mean=0.299, max=0.299, sum=0.597 (2)", - "tab": "Efficiency", - "score": 0.2985741686820984 - }, - "College Mathematics - Observed inference time (s)": { - "description": "min=0.288, mean=0.288, max=0.288, sum=0.576 (2)", - "tab": "Efficiency", - "score": 0.28819103717803957 - }, - "College Medicine - Observed inference time (s)": { - "description": "min=0.293, mean=0.293, max=0.293, sum=0.586 (2)", - "tab": "Efficiency", - 
"score": 0.29290392495304174 - }, - "College Physics - Observed inference time (s)": { - "description": "min=0.291, mean=0.291, max=0.291, sum=0.582 (2)", - "tab": "Efficiency", - "score": 0.29088794483857994 - }, - "College Chemistry - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Chemistry - # prompt tokens": { - "description": "min=570.02, mean=570.02, max=570.02, sum=1140.04 (2)", - "tab": "General information", - "score": 570.02 - }, - "College Chemistry - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # eval": { - "description": "min=144, mean=144, max=144, sum=288 (2)", - "tab": "General information", - "score": 144.0 - }, - "College Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # prompt tokens": { - "description": "min=482.799, mean=482.799, max=482.799, sum=965.597 (2)", - "tab": "General information", - "score": 482.7986111111111 - }, - "College Biology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer Science - # prompt tokens": { - "description": "min=857.86, mean=857.86, max=857.86, sum=1715.72 (2)", - "tab": "General information", - "score": 857.86 - }, - "College Computer Science - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # prompt tokens": { - "description": "min=626.69, mean=626.69, max=626.69, sum=1253.38 (2)", - "tab": "General information", - "score": 626.69 - }, - "College Mathematics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Medicine - # eval": { - "description": "min=173, mean=173, max=173, sum=346 (2)", - "tab": "General information", - "score": 173.0 - }, - "College Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College 
Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Medicine - # prompt tokens": { - "description": "min=513.37, mean=513.37, max=513.37, sum=1026.74 (2)", - "tab": "General information", - "score": 513.3699421965318 - }, - "College Medicine - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Physics - # eval": { - "description": "min=102, mean=102, max=102, sum=204 (2)", - "tab": "General information", - "score": 102.0 - }, - "College Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Physics - # prompt tokens": { - "description": "min=507.471, mean=507.471, max=507.471, sum=1014.941 (2)", - "tab": "General information", - "score": 507.47058823529414 - }, - "College Physics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" - } - } - }, - { - "evaluation_name": "Computer Security", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Computer Security", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.72, - "details": { - "description": "min=0.72, mean=0.72, max=0.72, sum=1.44 (2)", - "tab": "Accuracy", - "Computer Security - Observed inference time (s)": { - "description": "min=0.299, mean=0.299, max=0.299, sum=0.598 (2)", - "tab": "Efficiency", - "score": 0.2992409729957581 - }, - "Computer Security - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Computer Security - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Computer Security - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Computer Security - # prompt tokens": { - "description": "min=380.91, mean=380.91, max=380.91, sum=761.82 (2)", - "tab": "General information", - "score": 380.91 - }, - "Computer Security - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" - } - } - }, - { - "evaluation_name": "Econometrics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Econometrics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.675, - "details": { - "description": "min=0.675, mean=0.675, max=0.675, sum=1.351 (2)", - "tab": "Accuracy", - "Econometrics - Observed inference time (s)": { - "description": "min=0.295, mean=0.295, max=0.295, sum=0.59 (2)", - "tab": "Efficiency", - "score": 0.295004924138387 - }, - "Econometrics - # eval": { - "description": "min=114, mean=114, max=114, sum=228 (2)", - "tab": "General information", - "score": 114.0 - }, - "Econometrics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Econometrics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Econometrics - # prompt tokens": { - "description": "min=634.553, mean=634.553, max=634.553, sum=1269.105 (2)", - "tab": "General information", - "score": 634.5526315789474 - }, - "Econometrics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" - } - } - }, - { - "evaluation_name": "Global Facts", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Global Facts", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.47, - "details": { - "description": "min=0.47, mean=0.47, max=0.47, sum=0.94 (2)", - "tab": "Accuracy", - "Global Facts - Observed inference time (s)": { - "description": "min=0.301, mean=0.301, max=0.301, sum=0.602 (2)", - "tab": "Efficiency", - "score": 0.3007749605178833 - }, - "Global Facts - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Global Facts - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Global Facts - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Global Facts - # prompt tokens": { - "description": "min=456.54, mean=456.54, max=456.54, sum=913.08 (2)", - "tab": "General information", - "score": 456.54 - }, - "Global Facts - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" - } - } - }, - { - "evaluation_name": "Jurisprudence", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Jurisprudence", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.852, - "details": { - "description": "min=0.852, mean=0.852, max=0.852, sum=1.704 (2)", - "tab": "Accuracy", - "Jurisprudence - Observed inference time (s)": { - "description": "min=0.299, 
mean=0.299, max=0.299, sum=0.598 (2)", - "tab": "Efficiency", - "score": 0.2988583313094245 - }, - "Jurisprudence - # eval": { - "description": "min=108, mean=108, max=108, sum=216 (2)", - "tab": "General information", - "score": 108.0 - }, - "Jurisprudence - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Jurisprudence - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Jurisprudence - # prompt tokens": { - "description": "min=407.87, mean=407.87, max=407.87, sum=815.741 (2)", - "tab": "General information", - "score": 407.8703703703704 - }, - "Jurisprudence - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" - } - } - }, - { - "evaluation_name": "Philosophy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Philosophy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.797, - "details": { - "description": "min=0.797, mean=0.797, max=0.797, sum=1.595 (2)", - "tab": "Accuracy", - "Philosophy - Observed inference time (s)": { - "description": "min=0.289, mean=0.289, max=0.289, sum=0.578 (2)", - "tab": "Efficiency", - "score": 0.2892080227278436 - }, - "Philosophy - # eval": { - "description": "min=311, mean=311, max=311, sum=622 (2)", - "tab": "General information", - "score": 311.0 - }, - "Philosophy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Philosophy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Philosophy - # prompt tokens": { - "description": "min=340.907, mean=340.907, max=340.907, sum=681.814 (2)", - "tab": "General information", - "score": 340.90675241157555 - }, - "Philosophy - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" - } - } - }, - { - "evaluation_name": "Professional Psychology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Professional Psychology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.806, - "details": { - "description": "min=0.806, mean=0.806, max=0.806, sum=1.611 (2)", - "tab": "Accuracy", - "Professional Medicine - Observed inference time (s)": { - "description": "min=0.303, mean=0.303, max=0.303, sum=0.605 (2)", - "tab": "Efficiency", - "score": 0.3027217843953301 - }, - "Professional Accounting - Observed inference time (s)": { - "description": "min=0.318, 
mean=0.318, max=0.318, sum=0.636 (2)", - "tab": "Efficiency", - "score": 0.318213385893098 - }, - "Professional Law - Observed inference time (s)": { - "description": "min=0.344, mean=0.344, max=0.344, sum=0.687 (2)", - "tab": "Efficiency", - "score": 0.34364056462881 - }, - "Professional Psychology - Observed inference time (s)": { - "description": "min=0.366, mean=0.366, max=0.366, sum=0.732 (2)", - "tab": "Efficiency", - "score": 0.3660228084894567 - }, - "Professional Medicine - # eval": { - "description": "min=272, mean=272, max=272, sum=544 (2)", - "tab": "General information", - "score": 272.0 - }, - "Professional Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Medicine - # prompt tokens": { - "description": "min=1113.092, mean=1113.092, max=1113.092, sum=2226.184 (2)", - "tab": "General information", - "score": 1113.0919117647059 - }, - "Professional Medicine - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Accounting - # eval": { - "description": "min=282, mean=282, max=282, sum=564 (2)", - "tab": "General information", - "score": 282.0 - }, - "Professional Accounting - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Accounting - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Accounting - # prompt tokens": { - "description": "min=755.418, mean=755.418, max=755.418, sum=1510.837 (2)", - "tab": "General information", - "score": 755.418439716312 - }, - "Professional Accounting - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # eval": { - "description": "min=1534, mean=1534, max=1534, sum=3068 (2)", - "tab": "General information", - "score": 1534.0 - }, - "Professional Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # prompt tokens": { - "description": "min=1685.119, mean=1685.119, max=1685.119, sum=3370.239 (2)", - "tab": "General information", - "score": 1685.119295958279 - }, - "Professional Law - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # eval": { - "description": "min=612, mean=612, max=612, sum=1224 (2)", - "tab": "General information", - "score": 612.0 - }, - "Professional Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # prompt tokens": { - "description": "min=594.363, mean=594.363, max=594.363, sum=1188.725 (2)", - "tab": "General information", - "score": 594.3627450980392 - }, - "Professional Psychology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": 
"General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" - } - } - }, - { - "evaluation_name": "Us Foreign Policy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Us Foreign Policy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.81, - "details": { - "description": "min=0.81, mean=0.81, max=0.81, sum=1.62 (2)", - "tab": "Accuracy", - "Us Foreign Policy - Observed inference time (s)": { - "description": "min=0.291, mean=0.291, max=0.291, sum=0.582 (2)", - "tab": "Efficiency", - "score": 0.291001398563385 - }, - "Us Foreign Policy - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Us Foreign Policy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Us Foreign Policy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Us Foreign Policy - # prompt tokens": { - "description": "min=438.2, mean=438.2, max=438.2, sum=876.4 (2)", - "tab": "General information", - "score": 438.2 - }, - "Us Foreign Policy - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" - } - } - }, - { - "evaluation_name": "Astronomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Astronomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.895, - "details": { - "description": "min=0.895, mean=0.895, max=0.895, sum=1.789 (2)", - "tab": "Accuracy", - "Astronomy - Observed inference time (s)": { - "description": "min=0.292, mean=0.292, max=0.292, sum=0.584 (2)", - "tab": "Efficiency", - "score": 0.2922459558436745 - }, - "Astronomy - # eval": { - "description": "min=152, mean=152, max=152, sum=304 (2)", - "tab": "General information", - "score": 152.0 - }, - "Astronomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Astronomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Astronomy - # prompt tokens": { - "description": "min=602.421, mean=602.421, max=602.421, sum=1204.842 (2)", - "tab": "General information", - "score": 602.421052631579 - }, - "Astronomy - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": 
"test", - "groups": "mmlu_astronomy" - } - } - }, - { - "evaluation_name": "Business Ethics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Business Ethics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.27, - "details": { - "description": "min=0.27, mean=0.27, max=0.27, sum=0.54 (2)", - "tab": "Accuracy", - "Business Ethics - Observed inference time (s)": { - "description": "min=0.3, mean=0.3, max=0.3, sum=0.6 (2)", - "tab": "Efficiency", - "score": 0.29986772060394284 - }, - "Business Ethics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Business Ethics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Business Ethics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Business Ethics - # prompt tokens": { - "description": "min=552.87, mean=552.87, max=552.87, sum=1105.74 (2)", - "tab": "General information", - "score": 552.87 - }, - "Business Ethics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" - } - } - }, - { - "evaluation_name": "Clinical Knowledge", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Clinical Knowledge", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.792, - "details": { - "description": "min=0.792, mean=0.792, max=0.792, sum=1.585 (2)", - "tab": "Accuracy", - "Clinical Knowledge - Observed inference time (s)": { - "description": "min=0.3, mean=0.3, max=0.3, sum=0.601 (2)", - "tab": "Efficiency", - "score": 0.3003354540411031 - }, - "Clinical Knowledge - # eval": { - "description": "min=265, mean=265, max=265, sum=530 (2)", - "tab": "General information", - "score": 265.0 - }, - "Clinical Knowledge - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Clinical Knowledge - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Clinical Knowledge - # prompt tokens": { - "description": "min=402.592, mean=402.592, max=402.592, sum=805.185 (2)", - "tab": "General information", - "score": 402.5924528301887 - }, - "Clinical Knowledge - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" - } - } - }, - { - "evaluation_name": "Conceptual Physics", - "source_data": { - "dataset_name": "helm_mmlu", - 
"source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Conceptual Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.851, - "details": { - "description": "min=0.851, mean=0.851, max=0.851, sum=1.702 (2)", - "tab": "Accuracy", - "Conceptual Physics - Observed inference time (s)": { - "description": "min=0.288, mean=0.288, max=0.288, sum=0.575 (2)", - "tab": "Efficiency", - "score": 0.28759900458315585 - }, - "Conceptual Physics - # eval": { - "description": "min=235, mean=235, max=235, sum=470 (2)", - "tab": "General information", - "score": 235.0 - }, - "Conceptual Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Conceptual Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Conceptual Physics - # prompt tokens": { - "description": "min=309.213, mean=309.213, max=309.213, sum=618.426 (2)", - "tab": "General information", - "score": 309.21276595744683 - }, - "Conceptual Physics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" - } - } - }, - { - "evaluation_name": "Electrical Engineering", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Electrical Engineering", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.772, - "details": { - "description": "min=0.772, mean=0.772, max=0.772, sum=1.545 (2)", - "tab": "Accuracy", - "Electrical Engineering - Observed inference time (s)": { - "description": "min=0.294, mean=0.294, max=0.294, sum=0.588 (2)", - "tab": "Efficiency", - "score": 0.2938007436949631 - }, - "Electrical Engineering - # eval": { - "description": "min=145, mean=145, max=145, sum=290 (2)", - "tab": "General information", - "score": 145.0 - }, - "Electrical Engineering - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Electrical Engineering - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Electrical Engineering - # prompt tokens": { - "description": "min=474.786, mean=474.786, max=474.786, sum=949.572 (2)", - "tab": "General information", - "score": 474.78620689655173 - }, - "Electrical Engineering - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" - } - } - }, - { - "evaluation_name": "Elementary Mathematics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Elementary Mathematics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.704, - "details": { - "description": "min=0.704, mean=0.704, max=0.704, sum=1.407 (2)", - "tab": "Accuracy", - "Elementary Mathematics - Observed inference time (s)": { - "description": "min=0.295, mean=0.295, max=0.295, sum=0.59 (2)", - "tab": "Efficiency", - "score": 0.29476307119641987 - }, - "Elementary Mathematics - # eval": { - "description": "min=378, mean=378, max=378, sum=756 (2)", - "tab": "General information", - "score": 378.0 - }, - "Elementary Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Elementary Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Elementary Mathematics - # prompt tokens": { - "description": "min=597.341, mean=597.341, max=597.341, sum=1194.683 (2)", - "tab": "General information", - "score": 597.3412698412699 - }, - "Elementary Mathematics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" - } - } - }, - { - "evaluation_name": "Formal Logic", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Formal Logic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.595, - "details": { - "description": "min=0.595, mean=0.595, max=0.595, sum=1.19 (2)", - "tab": "Accuracy", - "Formal Logic - Observed inference time (s)": { - "description": "min=0.283, mean=0.283, max=0.283, sum=0.567 (2)", - "tab": "Efficiency", - "score": 0.28335455107310464 - }, - "Formal Logic - # eval": { - "description": "min=126, mean=126, max=126, sum=252 (2)", - "tab": "General information", - "score": 126.0 - }, - "Formal Logic - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Formal Logic - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Formal Logic - # prompt tokens": { - "description": "min=619.563, mean=619.563, max=619.563, sum=1239.127 (2)", - "tab": "General information", - "score": 619.563492063492 - }, - "Formal Logic - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" - } - } - }, - { - "evaluation_name": "High School World History", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on High School World History", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.869, - "details": { - "description": "min=0.869, mean=0.869, max=0.869, sum=1.738 (2)", - "tab": "Accuracy", - "High School Biology - Observed inference time (s)": { - "description": "min=0.29, mean=0.29, max=0.29, sum=0.58 (2)", - "tab": "Efficiency", - "score": 0.2898174070542858 - }, - "High School Chemistry - Observed inference time (s)": { - "description": "min=0.276, mean=0.276, max=0.276, sum=0.553 (2)", - "tab": "Efficiency", - "score": 0.27643810704423877 - }, - "High School Computer Science - Observed inference time (s)": { - "description": "min=0.29, mean=0.29, max=0.29, sum=0.579 (2)", - "tab": "Efficiency", - "score": 0.28958702087402344 - }, - "High School European History - Observed inference time (s)": { - "description": "min=0.369, mean=0.369, max=0.369, sum=0.739 (2)", - "tab": "Efficiency", - "score": 0.369471347693241 - }, - "High School Geography - Observed inference time (s)": { - "description": "min=0.278, mean=0.278, max=0.278, sum=0.556 (2)", - "tab": "Efficiency", - "score": 0.2780994249112678 - }, - "High School Government And Politics - Observed inference time (s)": { - "description": "min=0.269, mean=0.269, max=0.269, sum=0.538 (2)", - "tab": "Efficiency", - "score": 0.26881929382759057 - }, - "High School Macroeconomics - Observed inference time (s)": { - "description": "min=0.27, mean=0.27, max=0.27, sum=0.54 (2)", - "tab": "Efficiency", - "score": 0.2700315811695197 - }, - "High School Mathematics - Observed inference time (s)": { - "description": "min=0.267, mean=0.267, max=0.267, sum=0.534 (2)", - "tab": "Efficiency", - "score": 0.2672289636400011 - }, - "High School Microeconomics - Observed inference time (s)": { - "description": "min=0.262, mean=0.262, max=0.262, sum=0.525 (2)", - "tab": "Efficiency", - "score": 0.2623477593189528 - }, - "High School Physics - Observed inference time (s)": { - "description": "min=0.292, mean=0.292, max=0.292, sum=0.583 (2)", - "tab": "Efficiency", - "score": 0.2917157135262395 - }, - "High School Psychology - Observed inference time (s)": { - "description": "min=0.269, mean=0.269, max=0.269, sum=0.537 (2)", - "tab": "Efficiency", - "score": 0.2685232871169344 - }, - "High School Statistics - Observed inference time (s)": { - "description": "min=0.502, mean=0.502, max=0.502, sum=1.004 (2)", - "tab": "Efficiency", - "score": 0.5018655392858717 - }, - "High School US History - Observed inference time (s)": { - "description": "min=0.436, mean=0.436, max=0.436, sum=0.873 (2)", - "tab": "Efficiency", - "score": 0.4363996000850902 - }, - "High School World History - Observed inference time (s)": { - "description": "min=0.33, mean=0.33, max=0.33, sum=0.66 (2)", - "tab": "Efficiency", - "score": 0.3298424698632478 - }, - "High School Biology - # eval": { - "description": "min=310, mean=310, max=310, sum=620 (2)", - "tab": "General information", - "score": 310.0 - }, - "High School Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Biology - # prompt tokens": { - 
"description": "min=500.958, mean=500.958, max=500.958, sum=1001.916 (2)", - "tab": "General information", - "score": 500.958064516129 - }, - "High School Biology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # eval": { - "description": "min=203, mean=203, max=203, sum=406 (2)", - "tab": "General information", - "score": 203.0 - }, - "High School Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # prompt tokens": { - "description": "min=513.064, mean=513.064, max=513.064, sum=1026.128 (2)", - "tab": "General information", - "score": 513.064039408867 - }, - "High School Chemistry - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "High School Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # prompt tokens": { - "description": "min=935.13, mean=935.13, max=935.13, sum=1870.26 (2)", - "tab": "General information", - "score": 935.13 - }, - "High School Computer Science - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School European History - # eval": { - "description": "min=165, mean=165, max=165, sum=330 (2)", - "tab": "General information", - "score": 165.0 - }, - "High School European History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School European History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School European History - # prompt tokens": { - "description": "min=2797.424, mean=2797.424, max=2797.424, sum=5594.848 (2)", - "tab": "General information", - "score": 2797.4242424242425 - }, - "High School European History - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Geography - # eval": { - "description": "min=198, mean=198, max=198, sum=396 (2)", - "tab": "General information", - "score": 198.0 - }, - "High School Geography - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Geography - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Geography - # prompt tokens": { - "description": "min=394.773, mean=394.773, max=394.773, sum=789.545 (2)", - "tab": "General information", - "score": 394.77272727272725 - }, - "High School Geography - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # eval": { - "description": "min=193, 
mean=193, max=193, sum=386 (2)", - "tab": "General information", - "score": 193.0 - }, - "High School Government And Politics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Government And Politics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # prompt tokens": { - "description": "min=479.301, mean=479.301, max=479.301, sum=958.601 (2)", - "tab": "General information", - "score": 479.30051813471505 - }, - "High School Government And Politics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - # eval": { - "description": "min=390, mean=390, max=390, sum=780 (2)", - "tab": "General information", - "score": 390.0 - }, - "High School Macroeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Macroeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - # prompt tokens": { - "description": "min=396.541, mean=396.541, max=396.541, sum=793.082 (2)", - "tab": "General information", - "score": 396.54102564102567 - }, - "High School Macroeconomics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # eval": { - "description": "min=270, mean=270, max=270, sum=540 (2)", - "tab": "General information", - "score": 270.0 - }, - "High School Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # prompt tokens": { - "description": "min=566.822, mean=566.822, max=566.822, sum=1133.644 (2)", - "tab": "General information", - "score": 566.8222222222222 - }, - "High School Mathematics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Microeconomics - # eval": { - "description": "min=238, mean=238, max=238, sum=476 (2)", - "tab": "General information", - "score": 238.0 - }, - "High School Microeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Microeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Microeconomics - # prompt tokens": { - "description": "min=415.954, mean=415.954, max=415.954, sum=831.908 (2)", - "tab": "General information", - "score": 415.953781512605 - }, - "High School Microeconomics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # eval": { - "description": "min=151, mean=151, max=151, sum=302 (2)", - "tab": "General information", - "score": 151.0 - }, - "High School Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 
(2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # prompt tokens": { - "description": "min=591.715, mean=591.715, max=591.715, sum=1183.43 (2)", - "tab": "General information", - "score": 591.7152317880794 - }, - "High School Physics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # eval": { - "description": "min=545, mean=545, max=545, sum=1090 (2)", - "tab": "General information", - "score": 545.0 - }, - "High School Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # prompt tokens": { - "description": "min=502.604, mean=502.604, max=502.604, sum=1005.207 (2)", - "tab": "General information", - "score": 502.60366972477067 - }, - "High School Psychology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Statistics - # eval": { - "description": "min=216, mean=216, max=216, sum=432 (2)", - "tab": "General information", - "score": 216.0 - }, - "High School Statistics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Statistics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Statistics - # prompt tokens": { - "description": "min=858.931, mean=858.931, max=858.931, sum=1717.861 (2)", - "tab": "General information", - "score": 858.9305555555555 - }, - "High School Statistics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # eval": { - "description": "min=204, mean=204, max=204, sum=408 (2)", - "tab": "General information", - "score": 204.0 - }, - "High School US History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School US History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # prompt tokens": { - "description": "min=2205.583, mean=2205.583, max=2205.583, sum=4411.167 (2)", - "tab": "General information", - "score": 2205.5833333333335 - }, - "High School US History - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School World History - # eval": { - "description": "min=237, mean=237, max=237, sum=474 (2)", - "tab": "General information", - "score": 237.0 - }, - "High School World History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School World History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School World History - # prompt tokens": { - "description": "min=1426.544, mean=1426.544, max=1426.544, sum=2853.089 (2)", - "tab": "General information", - "score": 1426.5443037974683 - }, - "High School World History - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - 
"score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" - } - } - }, - { - "evaluation_name": "Human Sexuality", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Human Sexuality", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.847, - "details": { - "description": "min=0.847, mean=0.847, max=0.847, sum=1.695 (2)", - "tab": "Accuracy", - "Human Aging - Observed inference time (s)": { - "description": "min=0.278, mean=0.278, max=0.278, sum=0.555 (2)", - "tab": "Efficiency", - "score": 0.2775634660849122 - }, - "Human Sexuality - Observed inference time (s)": { - "description": "min=0.416, mean=0.416, max=0.416, sum=0.832 (2)", - "tab": "Efficiency", - "score": 0.41606709793323776 - }, - "Human Aging - # eval": { - "description": "min=223, mean=223, max=223, sum=446 (2)", - "tab": "General information", - "score": 223.0 - }, - "Human Aging - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Aging - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Aging - # prompt tokens": { - "description": "min=321.587, mean=321.587, max=321.587, sum=643.175 (2)", - "tab": "General information", - "score": 321.58744394618833 - }, - "Human Aging - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # eval": { - "description": "min=131, mean=131, max=131, sum=262 (2)", - "tab": "General information", - "score": 131.0 - }, - "Human Sexuality - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Sexuality - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # prompt tokens": { - "description": "min=347.183, mean=347.183, max=347.183, sum=694.366 (2)", - "tab": "General information", - "score": 347.1832061068702 - }, - "Human Sexuality - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" - } - } - }, - { - "evaluation_name": "International Law", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on International Law", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.752, - "details": { - "description": "min=0.752, mean=0.752, max=0.752, sum=1.504 (2)", - "tab": "Accuracy", - "International Law - Observed inference time (s)": { - "description": "min=0.268, mean=0.268, max=0.268, sum=0.535 (2)", - 
"tab": "Efficiency", - "score": 0.267673009683278 - }, - "International Law - # eval": { - "description": "min=121, mean=121, max=121, sum=242 (2)", - "tab": "General information", - "score": 121.0 - }, - "International Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "International Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "International Law - # prompt tokens": { - "description": "min=644.165, mean=644.165, max=644.165, sum=1288.331 (2)", - "tab": "General information", - "score": 644.1652892561983 - }, - "International Law - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" - } - } - }, - { - "evaluation_name": "Logical Fallacies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Logical Fallacies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.859, - "details": { - "description": "min=0.859, mean=0.859, max=0.859, sum=1.718 (2)", - "tab": "Accuracy", - "Logical Fallacies - Observed inference time (s)": { - "description": "min=0.268, mean=0.268, max=0.268, sum=0.535 (2)", - "tab": "Efficiency", - "score": 0.2676804094958159 - }, - "Logical Fallacies - # eval": { - "description": "min=163, mean=163, max=163, sum=326 (2)", - "tab": "General information", - "score": 163.0 - }, - "Logical Fallacies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Logical Fallacies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Logical Fallacies - # prompt tokens": { - "description": "min=450.049, mean=450.049, max=450.049, sum=900.098 (2)", - "tab": "General information", - "score": 450.0490797546012 - }, - "Logical Fallacies - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" - } - } - }, - { - "evaluation_name": "Machine Learning", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Machine Learning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.616, - "details": { - "description": "min=0.616, mean=0.616, max=0.616, sum=1.232 (2)", - "tab": "Accuracy", - "Machine Learning - Observed inference time (s)": { - "description": "min=0.27, mean=0.27, max=0.27, sum=0.539 (2)", - "tab": "Efficiency", - "score": 0.2695028483867645 - }, - "Machine Learning - # eval": { - "description": 
"min=112, mean=112, max=112, sum=224 (2)", - "tab": "General information", - "score": 112.0 - }, - "Machine Learning - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Machine Learning - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Machine Learning - # prompt tokens": { - "description": "min=702.402, mean=702.402, max=702.402, sum=1404.804 (2)", - "tab": "General information", - "score": 702.4017857142857 - }, - "Machine Learning - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" - } - } - }, - { - "evaluation_name": "Management", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Management", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.893, - "details": { - "description": "min=0.893, mean=0.893, max=0.893, sum=1.786 (2)", - "tab": "Accuracy", - "Management - Observed inference time (s)": { - "description": "min=0.332, mean=0.332, max=0.332, sum=0.665 (2)", - "tab": "Efficiency", - "score": 0.3324842568740104 - }, - "Management - # eval": { - "description": "min=103, mean=103, max=103, sum=206 (2)", - "tab": "General information", - "score": 103.0 - }, - "Management - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Management - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Management - # prompt tokens": { - "description": "min=281.301, mean=281.301, max=281.301, sum=562.602 (2)", - "tab": "General information", - "score": 281.3009708737864 - }, - "Management - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" - } - } - }, - { - "evaluation_name": "Marketing", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Marketing", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.953, - "details": { - "description": "min=0.953, mean=0.953, max=0.953, sum=1.906 (2)", - "tab": "Accuracy", - "Marketing - Observed inference time (s)": { - "description": "min=0.767, mean=0.767, max=0.767, sum=1.533 (2)", - "tab": "Efficiency", - "score": 0.7665768270818596 - }, - "Marketing - # eval": { - "description": "min=234, mean=234, max=234, sum=468 (2)", - "tab": "General information", - "score": 234.0 - }, - "Marketing - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - 
"score": 5.0 - }, - "Marketing - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Marketing - # prompt tokens": { - "description": "min=428.35, mean=428.35, max=428.35, sum=856.701 (2)", - "tab": "General information", - "score": 428.35042735042737 - }, - "Marketing - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" - } - } - }, - { - "evaluation_name": "Medical Genetics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Medical Genetics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.89, - "details": { - "description": "min=0.89, mean=0.89, max=0.89, sum=1.78 (2)", - "tab": "Accuracy", - "Medical Genetics - Observed inference time (s)": { - "description": "min=0.297, mean=0.297, max=0.297, sum=0.594 (2)", - "tab": "Efficiency", - "score": 0.2972432613372803 - }, - "Medical Genetics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Medical Genetics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Medical Genetics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Medical Genetics - # prompt tokens": { - "description": "min=338.89, mean=338.89, max=338.89, sum=677.78 (2)", - "tab": "General information", - "score": 338.89 - }, - "Medical Genetics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" - } - } - }, - { - "evaluation_name": "Miscellaneous", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Miscellaneous", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9, - "details": { - "description": "min=0.9, mean=0.9, max=0.9, sum=1.801 (2)", - "tab": "Accuracy", - "Miscellaneous - Observed inference time (s)": { - "description": "min=0.425, mean=0.425, max=0.425, sum=0.849 (2)", - "tab": "Efficiency", - "score": 0.4247035331652996 - }, - "Miscellaneous - # eval": { - "description": "min=783, mean=783, max=783, sum=1566 (2)", - "tab": "General information", - "score": 783.0 - }, - "Miscellaneous - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Miscellaneous - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Miscellaneous - # prompt tokens": { - 
"description": "min=314.669, mean=314.669, max=314.669, sum=629.338 (2)", - "tab": "General information", - "score": 314.669220945083 - }, - "Miscellaneous - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" - } - } - }, - { - "evaluation_name": "Moral Scenarios", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Moral Scenarios", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.676, - "details": { - "description": "min=0.676, mean=0.676, max=0.676, sum=1.352 (2)", - "tab": "Accuracy", - "Moral Disputes - Observed inference time (s)": { - "description": "min=0.297, mean=0.297, max=0.297, sum=0.593 (2)", - "tab": "Efficiency", - "score": 0.2965996671963289 - }, - "Moral Scenarios - Observed inference time (s)": { - "description": "min=0.297, mean=0.297, max=0.297, sum=0.593 (2)", - "tab": "Efficiency", - "score": 0.29666628491279134 - }, - "Moral Disputes - # eval": { - "description": "min=346, mean=346, max=346, sum=692 (2)", - "tab": "General information", - "score": 346.0 - }, - "Moral Disputes - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Disputes - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Disputes - # prompt tokens": { - "description": "min=495.003, mean=495.003, max=495.003, sum=990.006 (2)", - "tab": "General information", - "score": 495.0028901734104 - }, - "Moral Disputes - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # eval": { - "description": "min=895, mean=895, max=895, sum=1790 (2)", - "tab": "General information", - "score": 895.0 - }, - "Moral Scenarios - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Scenarios - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # prompt tokens": { - "description": "min=690.542, mean=690.542, max=690.542, sum=1381.084 (2)", - "tab": "General information", - "score": 690.5418994413408 - }, - "Moral Scenarios - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" - } - } - }, - { - "evaluation_name": "Nutrition", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Nutrition", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.588, - "details": { - "description": "min=0.588, mean=0.588, max=0.588, sum=1.176 (2)", - "tab": "Accuracy", - "Nutrition - Observed inference time (s)": { - "description": "min=0.288, mean=0.288, max=0.288, sum=0.575 (2)", - "tab": "Efficiency", - "score": 0.2876783258774701 - }, - "Nutrition - # eval": { - "description": "min=306, mean=306, max=306, sum=612 (2)", - "tab": "General information", - "score": 306.0 - }, - "Nutrition - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Nutrition - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Nutrition - # prompt tokens": { - "description": "min=585.48, mean=585.48, max=585.48, sum=1170.961 (2)", - "tab": "General information", - "score": 585.4803921568628 - }, - "Nutrition - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" - } - } - }, - { - "evaluation_name": "Prehistory", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Prehistory", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.762, - "details": { - "description": "min=0.762, mean=0.762, max=0.762, sum=1.525 (2)", - "tab": "Accuracy", - "Prehistory - Observed inference time (s)": { - "description": "min=0.3, mean=0.3, max=0.3, sum=0.6 (2)", - "tab": "Efficiency", - "score": 0.3001174411655944 - }, - "Prehistory - # eval": { - "description": "min=324, mean=324, max=324, sum=648 (2)", - "tab": "General information", - "score": 324.0 - }, - "Prehistory - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Prehistory - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Prehistory - # prompt tokens": { - "description": "min=540.198, mean=540.198, max=540.198, sum=1080.395 (2)", - "tab": "General information", - "score": 540.1975308641976 - }, - "Prehistory - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" - } - } - }, - { - "evaluation_name": "Public Relations", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Public Relations", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7, - "details": { - "description": "min=0.7, mean=0.7, max=0.7, sum=1.4 (2)", - "tab": "Accuracy", - "Public Relations - Observed inference time (s)": { - "description": "min=0.286, mean=0.286, max=0.286, sum=0.572 (2)", - "tab": 
"Efficiency", - "score": 0.2860603137449785 - }, - "Public Relations - # eval": { - "description": "min=110, mean=110, max=110, sum=220 (2)", - "tab": "General information", - "score": 110.0 - }, - "Public Relations - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Public Relations - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Public Relations - # prompt tokens": { - "description": "min=426.655, mean=426.655, max=426.655, sum=853.309 (2)", - "tab": "General information", - "score": 426.6545454545454 - }, - "Public Relations - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" - } - } - }, - { - "evaluation_name": "Security Studies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Security Studies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.547, - "details": { - "description": "min=0.547, mean=0.547, max=0.547, sum=1.094 (2)", - "tab": "Accuracy", - "Security Studies - Observed inference time (s)": { - "description": "min=0.398, mean=0.398, max=0.398, sum=0.795 (2)", - "tab": "Efficiency", - "score": 0.3977492381115349 - }, - "Security Studies - # eval": { - "description": "min=245, mean=245, max=245, sum=490 (2)", - "tab": "General information", - "score": 245.0 - }, - "Security Studies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Security Studies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Security Studies - # prompt tokens": { - "description": "min=1193.869, mean=1193.869, max=1193.869, sum=2387.739 (2)", - "tab": "General information", - "score": 1193.869387755102 - }, - "Security Studies - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" - } - } - }, - { - "evaluation_name": "Sociology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Sociology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.851, - "details": { - "description": "min=0.851, mean=0.851, max=0.851, sum=1.701 (2)", - "tab": "Accuracy", - "Sociology - Observed inference time (s)": { - "description": "min=0.295, mean=0.295, max=0.295, sum=0.59 (2)", - "tab": "Efficiency", - "score": 0.29507939969722313 - }, - "Sociology - # eval": { - "description": "min=201, mean=201, max=201, sum=402 (2)", - "tab": 
"General information", - "score": 201.0 - }, - "Sociology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Sociology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Sociology - # prompt tokens": { - "description": "min=456.274, mean=456.274, max=456.274, sum=912.547 (2)", - "tab": "General information", - "score": 456.27363184079604 - }, - "Sociology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" - } - } - }, - { - "evaluation_name": "Virology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Virology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.524, - "details": { - "description": "min=0.524, mean=0.524, max=0.524, sum=1.048 (2)", - "tab": "Accuracy", - "Virology - Observed inference time (s)": { - "description": "min=0.287, mean=0.287, max=0.287, sum=0.574 (2)", - "tab": "Efficiency", - "score": 0.28698748852833206 - }, - "Virology - # eval": { - "description": "min=166, mean=166, max=166, sum=332 (2)", - "tab": "General information", - "score": 166.0 - }, - "Virology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Virology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Virology - # prompt tokens": { - "description": "min=336.753, mean=336.753, max=336.753, sum=673.506 (2)", - "tab": "General information", - "score": 336.7530120481928 - }, - "Virology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" - } - } - }, - { - "evaluation_name": "World Religions", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on World Religions", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.865, - "details": { - "description": "min=0.865, mean=0.865, max=0.865, sum=1.731 (2)", - "tab": "Accuracy", - "World Religions - Observed inference time (s)": { - "description": "min=0.288, mean=0.288, max=0.288, sum=0.576 (2)", - "tab": "Efficiency", - "score": 0.2880588832654451 - }, - "World Religions - # eval": { - "description": "min=171, mean=171, max=171, sum=342 (2)", - "tab": "General information", - "score": 171.0 - }, - "World Religions - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "World Religions - truncated": { - "description": "min=0, mean=0, 
max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "World Religions - # prompt tokens": { - "description": "min=268.164, mean=268.164, max=268.164, sum=536.327 (2)", - "tab": "General information", - "score": 268.1637426900585 - }, - "World Religions - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" - } - } - }, - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.817, - "details": { - "tab": "Efficiency" - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_mmlu/google/gemini-1.5-flash-preview-0514/3e82f5a5-b80a-4f2f-a262-43c6ee50fbf8.json b/data/helm_mmlu/google/gemini-1.5-flash-preview-0514/3e82f5a5-b80a-4f2f-a262-43c6ee50fbf8.json deleted file mode 100644 index b8d59d877..000000000 --- a/data/helm_mmlu/google/gemini-1.5-flash-preview-0514/3e82f5a5-b80a-4f2f-a262-43c6ee50fbf8.json +++ /dev/null @@ -1,3021 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/google_gemini-1.5-flash-preview-0514/1770835937.459157", - "retrieved_timestamp": "1770835937.459157", - "source_metadata": { - "source_name": "helm_mmlu", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gemini 1.5 Flash 0514 preview", - "id": "google/gemini-1.5-flash-preview-0514", - "developer": "google", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "MMLU All Subjects", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU All Subjects", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.778, - "details": { - "description": "min=0.374, mean=0.778, max=0.969, sum=88.647 (114)", - "tab": "Accuracy", - "MMLU All Subjects - Observed inference time (s)": { - "description": "min=0.323, mean=0.348, max=0.49, sum=39.671 (114)", - "tab": "Efficiency", - "score": 0.3479928578252291 - }, - "MMLU All Subjects - # eval": { - "description": "min=100, mean=246.351, max=1534, sum=28084 (114)", - "tab": "General information", - "score": 246.35087719298247 - }, - "MMLU All Subjects - # train": { - "description": "min=5, mean=5, max=5, sum=570 (114)", - "tab": "General information", - "score": 5.0 - }, - "MMLU All Subjects - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (114)", - "tab": "General information", - "score": 0.0 - }, - "MMLU All Subjects - # prompt tokens": { - "description": "min=268.164, mean=632.617, max=2797.424, sum=72118.345 (114)", - "tab": "General information", - 
"score": 632.6170571214202 - }, - "MMLU All Subjects - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (114)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] - } - } - }, - { - "evaluation_name": "Abstract Algebra", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - 
"metric_config": { - "evaluation_description": "EM on Abstract Algebra", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.56, - "details": { - "description": "min=0.56, mean=0.56, max=0.56, sum=1.12 (2)", - "tab": "Accuracy", - "Abstract Algebra - Observed inference time (s)": { - "description": "min=0.414, mean=0.414, max=0.414, sum=0.828 (2)", - "tab": "Efficiency", - "score": 0.4139195799827576 - }, - "Abstract Algebra - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Abstract Algebra - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Abstract Algebra - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Abstract Algebra - # prompt tokens": { - "description": "min=383.97, mean=383.97, max=383.97, sum=767.94 (2)", - "tab": "General information", - "score": 383.97 - }, - "Abstract Algebra - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" - } - } - }, - { - "evaluation_name": "Anatomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Anatomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.807, - "details": { - "description": "min=0.807, mean=0.807, max=0.807, sum=1.615 (2)", - "tab": "Accuracy", - "Anatomy - Observed inference time (s)": { - "description": "min=0.331, mean=0.331, max=0.331, sum=0.662 (2)", - "tab": "Efficiency", - "score": 0.33077726717348455 - }, - "Anatomy - # eval": { - "description": "min=135, mean=135, max=135, sum=270 (2)", - "tab": "General information", - "score": 135.0 - }, - "Anatomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Anatomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Anatomy - # prompt tokens": { - "description": "min=344.356, mean=344.356, max=344.356, sum=688.711 (2)", - "tab": "General information", - "score": 344.35555555555555 - }, - "Anatomy - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" - } - } - }, - { - "evaluation_name": "College Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on College Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.667, - "details": { - 
"description": "min=0.667, mean=0.667, max=0.667, sum=1.333 (2)", - "tab": "Accuracy", - "College Chemistry - Observed inference time (s)": { - "description": "min=0.341, mean=0.341, max=0.341, sum=0.683 (2)", - "tab": "Efficiency", - "score": 0.3412753510475159 - }, - "College Biology - Observed inference time (s)": { - "description": "min=0.331, mean=0.331, max=0.331, sum=0.662 (2)", - "tab": "Efficiency", - "score": 0.33089664578437805 - }, - "College Computer Science - Observed inference time (s)": { - "description": "min=0.358, mean=0.358, max=0.358, sum=0.715 (2)", - "tab": "Efficiency", - "score": 0.35753655195236206 - }, - "College Mathematics - Observed inference time (s)": { - "description": "min=0.344, mean=0.344, max=0.344, sum=0.688 (2)", - "tab": "Efficiency", - "score": 0.3440544652938843 - }, - "College Medicine - Observed inference time (s)": { - "description": "min=0.339, mean=0.339, max=0.339, sum=0.679 (2)", - "tab": "Efficiency", - "score": 0.33949112616522464 - }, - "College Physics - Observed inference time (s)": { - "description": "min=0.339, mean=0.339, max=0.339, sum=0.678 (2)", - "tab": "Efficiency", - "score": 0.33893728957456704 - }, - "College Chemistry - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Chemistry - # prompt tokens": { - "description": "min=570.02, mean=570.02, max=570.02, sum=1140.04 (2)", - "tab": "General information", - "score": 570.02 - }, - "College Chemistry - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # eval": { - "description": "min=144, mean=144, max=144, sum=288 (2)", - "tab": "General information", - "score": 144.0 - }, - "College Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # prompt tokens": { - "description": "min=482.799, mean=482.799, max=482.799, sum=965.597 (2)", - "tab": "General information", - "score": 482.7986111111111 - }, - "College Biology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer Science - # prompt tokens": { - "description": "min=857.86, mean=857.86, max=857.86, sum=1715.72 (2)", - "tab": "General information", - "score": 857.86 - }, - "College Computer Science - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 
(2)", - "tab": "General information", - "score": 100.0 - }, - "College Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # prompt tokens": { - "description": "min=626.69, mean=626.69, max=626.69, sum=1253.38 (2)", - "tab": "General information", - "score": 626.69 - }, - "College Mathematics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Medicine - # eval": { - "description": "min=173, mean=173, max=173, sum=346 (2)", - "tab": "General information", - "score": 173.0 - }, - "College Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Medicine - # prompt tokens": { - "description": "min=513.37, mean=513.37, max=513.37, sum=1026.74 (2)", - "tab": "General information", - "score": 513.3699421965318 - }, - "College Medicine - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Physics - # eval": { - "description": "min=102, mean=102, max=102, sum=204 (2)", - "tab": "General information", - "score": 102.0 - }, - "College Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Physics - # prompt tokens": { - "description": "min=507.471, mean=507.471, max=507.471, sum=1014.941 (2)", - "tab": "General information", - "score": 507.47058823529414 - }, - "College Physics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" - } - } - }, - { - "evaluation_name": "Computer Security", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Computer Security", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.77, - "details": { - "description": "min=0.77, mean=0.77, max=0.77, sum=1.54 (2)", - "tab": "Accuracy", - "Computer Security - Observed inference time (s)": { - "description": "min=0.329, mean=0.329, max=0.329, sum=0.657 (2)", - "tab": "Efficiency", - "score": 0.3285136580467224 - }, - "Computer Security - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Computer Security - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Computer Security - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - 
}, - "Computer Security - # prompt tokens": { - "description": "min=380.91, mean=380.91, max=380.91, sum=761.82 (2)", - "tab": "General information", - "score": 380.91 - }, - "Computer Security - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" - } - } - }, - { - "evaluation_name": "Econometrics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Econometrics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.64, - "details": { - "description": "min=0.64, mean=0.64, max=0.64, sum=1.281 (2)", - "tab": "Accuracy", - "Econometrics - Observed inference time (s)": { - "description": "min=0.339, mean=0.339, max=0.339, sum=0.679 (2)", - "tab": "Efficiency", - "score": 0.33929300726505746 - }, - "Econometrics - # eval": { - "description": "min=114, mean=114, max=114, sum=228 (2)", - "tab": "General information", - "score": 114.0 - }, - "Econometrics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Econometrics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Econometrics - # prompt tokens": { - "description": "min=634.553, mean=634.553, max=634.553, sum=1269.105 (2)", - "tab": "General information", - "score": 634.5526315789474 - }, - "Econometrics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" - } - } - }, - { - "evaluation_name": "Global Facts", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Global Facts", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.55, - "details": { - "description": "min=0.55, mean=0.55, max=0.55, sum=1.1 (2)", - "tab": "Accuracy", - "Global Facts - Observed inference time (s)": { - "description": "min=0.325, mean=0.325, max=0.325, sum=0.65 (2)", - "tab": "Efficiency", - "score": 0.32497448682785035 - }, - "Global Facts - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Global Facts - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Global Facts - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Global Facts - # prompt tokens": { - "description": "min=456.54, mean=456.54, max=456.54, sum=913.08 (2)", - "tab": "General information", - "score": 456.54 - }, - "Global Facts - # output tokens": { - 
"description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" - } - } - }, - { - "evaluation_name": "Jurisprudence", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Jurisprudence", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.889, - "details": { - "description": "min=0.889, mean=0.889, max=0.889, sum=1.778 (2)", - "tab": "Accuracy", - "Jurisprudence - Observed inference time (s)": { - "description": "min=0.327, mean=0.327, max=0.327, sum=0.654 (2)", - "tab": "Efficiency", - "score": 0.3270833028687371 - }, - "Jurisprudence - # eval": { - "description": "min=108, mean=108, max=108, sum=216 (2)", - "tab": "General information", - "score": 108.0 - }, - "Jurisprudence - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Jurisprudence - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Jurisprudence - # prompt tokens": { - "description": "min=407.87, mean=407.87, max=407.87, sum=815.741 (2)", - "tab": "General information", - "score": 407.8703703703704 - }, - "Jurisprudence - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" - } - } - }, - { - "evaluation_name": "Philosophy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Philosophy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.807, - "details": { - "description": "min=0.807, mean=0.807, max=0.807, sum=1.614 (2)", - "tab": "Accuracy", - "Philosophy - Observed inference time (s)": { - "description": "min=0.352, mean=0.352, max=0.352, sum=0.704 (2)", - "tab": "Efficiency", - "score": 0.3517766727128596 - }, - "Philosophy - # eval": { - "description": "min=311, mean=311, max=311, sum=622 (2)", - "tab": "General information", - "score": 311.0 - }, - "Philosophy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Philosophy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Philosophy - # prompt tokens": { - "description": "min=340.907, mean=340.907, max=340.907, sum=681.814 (2)", - "tab": "General information", - "score": 340.90675241157555 - }, - "Philosophy - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "philosophy", - "method": 
"multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" - } - } - }, - { - "evaluation_name": "Professional Psychology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Professional Psychology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.825, - "details": { - "description": "min=0.825, mean=0.825, max=0.825, sum=1.65 (2)", - "tab": "Accuracy", - "Professional Medicine - Observed inference time (s)": { - "description": "min=0.353, mean=0.353, max=0.353, sum=0.707 (2)", - "tab": "Efficiency", - "score": 0.3533606018967294 - }, - "Professional Accounting - Observed inference time (s)": { - "description": "min=0.354, mean=0.354, max=0.354, sum=0.707 (2)", - "tab": "Efficiency", - "score": 0.35356061509315 - }, - "Professional Law - Observed inference time (s)": { - "description": "min=0.376, mean=0.376, max=0.376, sum=0.752 (2)", - "tab": "Efficiency", - "score": 0.37605549059613214 - }, - "Professional Psychology - Observed inference time (s)": { - "description": "min=0.353, mean=0.353, max=0.353, sum=0.707 (2)", - "tab": "Efficiency", - "score": 0.3533070875625861 - }, - "Professional Medicine - # eval": { - "description": "min=272, mean=272, max=272, sum=544 (2)", - "tab": "General information", - "score": 272.0 - }, - "Professional Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Medicine - # prompt tokens": { - "description": "min=1113.092, mean=1113.092, max=1113.092, sum=2226.184 (2)", - "tab": "General information", - "score": 1113.0919117647059 - }, - "Professional Medicine - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Accounting - # eval": { - "description": "min=282, mean=282, max=282, sum=564 (2)", - "tab": "General information", - "score": 282.0 - }, - "Professional Accounting - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Accounting - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Accounting - # prompt tokens": { - "description": "min=755.418, mean=755.418, max=755.418, sum=1510.837 (2)", - "tab": "General information", - "score": 755.418439716312 - }, - "Professional Accounting - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # eval": { - "description": "min=1534, mean=1534, max=1534, sum=3068 (2)", - "tab": "General information", - "score": 1534.0 - }, - "Professional Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # prompt tokens": { - "description": "min=1685.119, mean=1685.119, max=1685.119, sum=3370.239 (2)", - 
"tab": "General information", - "score": 1685.119295958279 - }, - "Professional Law - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # eval": { - "description": "min=612, mean=612, max=612, sum=1224 (2)", - "tab": "General information", - "score": 612.0 - }, - "Professional Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # prompt tokens": { - "description": "min=594.363, mean=594.363, max=594.363, sum=1188.725 (2)", - "tab": "General information", - "score": 594.3627450980392 - }, - "Professional Psychology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" - } - } - }, - { - "evaluation_name": "Us Foreign Policy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Us Foreign Policy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.93, - "details": { - "description": "min=0.93, mean=0.93, max=0.93, sum=1.86 (2)", - "tab": "Accuracy", - "Us Foreign Policy - Observed inference time (s)": { - "description": "min=0.339, mean=0.339, max=0.339, sum=0.679 (2)", - "tab": "Efficiency", - "score": 0.3394037842750549 - }, - "Us Foreign Policy - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Us Foreign Policy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Us Foreign Policy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Us Foreign Policy - # prompt tokens": { - "description": "min=438.2, mean=438.2, max=438.2, sum=876.4 (2)", - "tab": "General information", - "score": 438.2 - }, - "Us Foreign Policy - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" - } - } - }, - { - "evaluation_name": "Astronomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Astronomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.868, - "details": { - "description": "min=0.868, mean=0.868, max=0.868, sum=1.737 (2)", - "tab": "Accuracy", - "Astronomy - Observed inference time (s)": { - "description": 
"min=0.379, mean=0.379, max=0.379, sum=0.758 (2)", - "tab": "Efficiency", - "score": 0.3787926027649327 - }, - "Astronomy - # eval": { - "description": "min=152, mean=152, max=152, sum=304 (2)", - "tab": "General information", - "score": 152.0 - }, - "Astronomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Astronomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Astronomy - # prompt tokens": { - "description": "min=602.421, mean=602.421, max=602.421, sum=1204.842 (2)", - "tab": "General information", - "score": 602.421052631579 - }, - "Astronomy - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" - } - } - }, - { - "evaluation_name": "Business Ethics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Business Ethics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.82, - "details": { - "description": "min=0.82, mean=0.82, max=0.82, sum=1.64 (2)", - "tab": "Accuracy", - "Business Ethics - Observed inference time (s)": { - "description": "min=0.352, mean=0.352, max=0.352, sum=0.704 (2)", - "tab": "Efficiency", - "score": 0.3517553758621216 - }, - "Business Ethics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Business Ethics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Business Ethics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Business Ethics - # prompt tokens": { - "description": "min=552.87, mean=552.87, max=552.87, sum=1105.74 (2)", - "tab": "General information", - "score": 552.87 - }, - "Business Ethics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" - } - } - }, - { - "evaluation_name": "Clinical Knowledge", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Clinical Knowledge", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.838, - "details": { - "description": "min=0.838, mean=0.838, max=0.838, sum=1.675 (2)", - "tab": "Accuracy", - "Clinical Knowledge - Observed inference time (s)": { - "description": "min=0.325, mean=0.325, max=0.325, sum=0.649 (2)", - "tab": "Efficiency", - "score": 0.3246132454782162 - }, - "Clinical Knowledge - # eval": { - "description": "min=265, mean=265, max=265, sum=530 
(2)", - "tab": "General information", - "score": 265.0 - }, - "Clinical Knowledge - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Clinical Knowledge - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Clinical Knowledge - # prompt tokens": { - "description": "min=402.592, mean=402.592, max=402.592, sum=805.185 (2)", - "tab": "General information", - "score": 402.5924528301887 - }, - "Clinical Knowledge - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" - } - } - }, - { - "evaluation_name": "Conceptual Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Conceptual Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.855, - "details": { - "description": "min=0.855, mean=0.855, max=0.855, sum=1.711 (2)", - "tab": "Accuracy", - "Conceptual Physics - Observed inference time (s)": { - "description": "min=0.328, mean=0.328, max=0.328, sum=0.655 (2)", - "tab": "Efficiency", - "score": 0.32754647579598933 - }, - "Conceptual Physics - # eval": { - "description": "min=235, mean=235, max=235, sum=470 (2)", - "tab": "General information", - "score": 235.0 - }, - "Conceptual Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Conceptual Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Conceptual Physics - # prompt tokens": { - "description": "min=309.213, mean=309.213, max=309.213, sum=618.426 (2)", - "tab": "General information", - "score": 309.21276595744683 - }, - "Conceptual Physics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" - } - } - }, - { - "evaluation_name": "Electrical Engineering", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Electrical Engineering", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.814, - "details": { - "description": "min=0.814, mean=0.814, max=0.814, sum=1.628 (2)", - "tab": "Accuracy", - "Electrical Engineering - Observed inference time (s)": { - "description": "min=0.328, mean=0.328, max=0.328, sum=0.656 (2)", - "tab": "Efficiency", - "score": 0.3282040464467016 - }, - "Electrical Engineering - # eval": { - "description": "min=145, mean=145, max=145, sum=290 (2)", - "tab": "General information", - "score": 145.0 - }, - 
"Electrical Engineering - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Electrical Engineering - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Electrical Engineering - # prompt tokens": { - "description": "min=474.786, mean=474.786, max=474.786, sum=949.572 (2)", - "tab": "General information", - "score": 474.78620689655173 - }, - "Electrical Engineering - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" - } - } - }, - { - "evaluation_name": "Elementary Mathematics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Elementary Mathematics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.778, - "details": { - "description": "min=0.778, mean=0.778, max=0.778, sum=1.556 (2)", - "tab": "Accuracy", - "Elementary Mathematics - Observed inference time (s)": { - "description": "min=0.34, mean=0.34, max=0.34, sum=0.679 (2)", - "tab": "Efficiency", - "score": 0.33972583182905086 - }, - "Elementary Mathematics - # eval": { - "description": "min=378, mean=378, max=378, sum=756 (2)", - "tab": "General information", - "score": 378.0 - }, - "Elementary Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Elementary Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Elementary Mathematics - # prompt tokens": { - "description": "min=597.341, mean=597.341, max=597.341, sum=1194.683 (2)", - "tab": "General information", - "score": 597.3412698412699 - }, - "Elementary Mathematics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" - } - } - }, - { - "evaluation_name": "Formal Logic", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Formal Logic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.611, - "details": { - "description": "min=0.611, mean=0.611, max=0.611, sum=1.222 (2)", - "tab": "Accuracy", - "Formal Logic - Observed inference time (s)": { - "description": "min=0.347, mean=0.347, max=0.347, sum=0.693 (2)", - "tab": "Efficiency", - "score": 0.34669986982194206 - }, - "Formal Logic - # eval": { - "description": "min=126, mean=126, max=126, sum=252 (2)", - "tab": "General information", - "score": 126.0 - }, - "Formal Logic - # train": { - "description": 
"min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Formal Logic - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Formal Logic - # prompt tokens": { - "description": "min=619.563, mean=619.563, max=619.563, sum=1239.127 (2)", - "tab": "General information", - "score": 619.563492063492 - }, - "Formal Logic - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" - } - } - }, - { - "evaluation_name": "High School World History", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on High School World History", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.907, - "details": { - "description": "min=0.907, mean=0.907, max=0.907, sum=1.814 (2)", - "tab": "Accuracy", - "High School Biology - Observed inference time (s)": { - "description": "min=0.362, mean=0.362, max=0.362, sum=0.725 (2)", - "tab": "Efficiency", - "score": 0.36248803600188223 - }, - "High School Chemistry - Observed inference time (s)": { - "description": "min=0.336, mean=0.336, max=0.336, sum=0.672 (2)", - "tab": "Efficiency", - "score": 0.3359241544319491 - }, - "High School Computer Science - Observed inference time (s)": { - "description": "min=0.354, mean=0.354, max=0.354, sum=0.709 (2)", - "tab": "Efficiency", - "score": 0.35430107831954955 - }, - "High School European History - Observed inference time (s)": { - "description": "min=0.49, mean=0.49, max=0.49, sum=0.98 (2)", - "tab": "Efficiency", - "score": 0.4900842637726755 - }, - "High School Geography - Observed inference time (s)": { - "description": "min=0.336, mean=0.336, max=0.336, sum=0.673 (2)", - "tab": "Efficiency", - "score": 0.33633674395204793 - }, - "High School Government And Politics - Observed inference time (s)": { - "description": "min=0.335, mean=0.335, max=0.335, sum=0.669 (2)", - "tab": "Efficiency", - "score": 0.3347120445627005 - }, - "High School Macroeconomics - Observed inference time (s)": { - "description": "min=0.33, mean=0.33, max=0.33, sum=0.661 (2)", - "tab": "Efficiency", - "score": 0.33047562073438597 - }, - "High School Mathematics - Observed inference time (s)": { - "description": "min=0.343, mean=0.343, max=0.343, sum=0.686 (2)", - "tab": "Efficiency", - "score": 0.3431409650378757 - }, - "High School Microeconomics - Observed inference time (s)": { - "description": "min=0.329, mean=0.329, max=0.329, sum=0.658 (2)", - "tab": "Efficiency", - "score": 0.328948572904122 - }, - "High School Physics - Observed inference time (s)": { - "description": "min=0.343, mean=0.343, max=0.343, sum=0.686 (2)", - "tab": "Efficiency", - "score": 0.3431161413129592 - }, - "High School Psychology - Observed inference time (s)": { - "description": "min=0.364, mean=0.364, max=0.364, sum=0.728 (2)", - "tab": "Efficiency", - "score": 0.3637816064498004 - }, - "High School Statistics - Observed inference time (s)": { - "description": "min=0.351, mean=0.351, max=0.351, sum=0.701 (2)", - "tab": "Efficiency", - 
"score": 0.35072638701509545 - }, - "High School US History - Observed inference time (s)": { - "description": "min=0.484, mean=0.484, max=0.484, sum=0.967 (2)", - "tab": "Efficiency", - "score": 0.48351573476604387 - }, - "High School World History - Observed inference time (s)": { - "description": "min=0.376, mean=0.376, max=0.376, sum=0.753 (2)", - "tab": "Efficiency", - "score": 0.3762651908246777 - }, - "High School Biology - # eval": { - "description": "min=310, mean=310, max=310, sum=620 (2)", - "tab": "General information", - "score": 310.0 - }, - "High School Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Biology - # prompt tokens": { - "description": "min=500.958, mean=500.958, max=500.958, sum=1001.916 (2)", - "tab": "General information", - "score": 500.958064516129 - }, - "High School Biology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # eval": { - "description": "min=203, mean=203, max=203, sum=406 (2)", - "tab": "General information", - "score": 203.0 - }, - "High School Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # prompt tokens": { - "description": "min=513.064, mean=513.064, max=513.064, sum=1026.128 (2)", - "tab": "General information", - "score": 513.064039408867 - }, - "High School Chemistry - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "High School Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # prompt tokens": { - "description": "min=935.13, mean=935.13, max=935.13, sum=1870.26 (2)", - "tab": "General information", - "score": 935.13 - }, - "High School Computer Science - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School European History - # eval": { - "description": "min=165, mean=165, max=165, sum=330 (2)", - "tab": "General information", - "score": 165.0 - }, - "High School European History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School European History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School European History - # prompt tokens": { - "description": "min=2797.424, mean=2797.424, max=2797.424, sum=5594.848 (2)", - "tab": "General information", - "score": 2797.4242424242425 - }, - "High School European History - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General 
information", - "score": 0.0 - }, - "High School Geography - # eval": { - "description": "min=198, mean=198, max=198, sum=396 (2)", - "tab": "General information", - "score": 198.0 - }, - "High School Geography - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Geography - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Geography - # prompt tokens": { - "description": "min=394.773, mean=394.773, max=394.773, sum=789.545 (2)", - "tab": "General information", - "score": 394.77272727272725 - }, - "High School Geography - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # eval": { - "description": "min=193, mean=193, max=193, sum=386 (2)", - "tab": "General information", - "score": 193.0 - }, - "High School Government And Politics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Government And Politics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # prompt tokens": { - "description": "min=479.301, mean=479.301, max=479.301, sum=958.601 (2)", - "tab": "General information", - "score": 479.30051813471505 - }, - "High School Government And Politics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - # eval": { - "description": "min=390, mean=390, max=390, sum=780 (2)", - "tab": "General information", - "score": 390.0 - }, - "High School Macroeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Macroeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - # prompt tokens": { - "description": "min=396.541, mean=396.541, max=396.541, sum=793.082 (2)", - "tab": "General information", - "score": 396.54102564102567 - }, - "High School Macroeconomics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # eval": { - "description": "min=270, mean=270, max=270, sum=540 (2)", - "tab": "General information", - "score": 270.0 - }, - "High School Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # prompt tokens": { - "description": "min=566.822, mean=566.822, max=566.822, sum=1133.644 (2)", - "tab": "General information", - "score": 566.8222222222222 - }, - "High School Mathematics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Microeconomics - # eval": { - "description": "min=238, mean=238, max=238, sum=476 (2)", - "tab": "General information", - "score": 238.0 - }, - "High School Microeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - 
"score": 5.0 - }, - "High School Microeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Microeconomics - # prompt tokens": { - "description": "min=415.954, mean=415.954, max=415.954, sum=831.908 (2)", - "tab": "General information", - "score": 415.953781512605 - }, - "High School Microeconomics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # eval": { - "description": "min=151, mean=151, max=151, sum=302 (2)", - "tab": "General information", - "score": 151.0 - }, - "High School Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # prompt tokens": { - "description": "min=591.715, mean=591.715, max=591.715, sum=1183.43 (2)", - "tab": "General information", - "score": 591.7152317880794 - }, - "High School Physics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # eval": { - "description": "min=545, mean=545, max=545, sum=1090 (2)", - "tab": "General information", - "score": 545.0 - }, - "High School Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # prompt tokens": { - "description": "min=502.604, mean=502.604, max=502.604, sum=1005.207 (2)", - "tab": "General information", - "score": 502.60366972477067 - }, - "High School Psychology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Statistics - # eval": { - "description": "min=216, mean=216, max=216, sum=432 (2)", - "tab": "General information", - "score": 216.0 - }, - "High School Statistics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Statistics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Statistics - # prompt tokens": { - "description": "min=858.931, mean=858.931, max=858.931, sum=1717.861 (2)", - "tab": "General information", - "score": 858.9305555555555 - }, - "High School Statistics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # eval": { - "description": "min=204, mean=204, max=204, sum=408 (2)", - "tab": "General information", - "score": 204.0 - }, - "High School US History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School US History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # prompt tokens": { - "description": "min=2205.583, mean=2205.583, max=2205.583, sum=4411.167 (2)", - "tab": "General information", - "score": 2205.5833333333335 - }, - "High School US History - # output tokens": { - 
"description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School World History - # eval": { - "description": "min=237, mean=237, max=237, sum=474 (2)", - "tab": "General information", - "score": 237.0 - }, - "High School World History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School World History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School World History - # prompt tokens": { - "description": "min=1426.544, mean=1426.544, max=1426.544, sum=2853.089 (2)", - "tab": "General information", - "score": 1426.5443037974683 - }, - "High School World History - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" - } - } - }, - { - "evaluation_name": "Human Sexuality", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Human Sexuality", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.374, - "details": { - "description": "min=0.374, mean=0.374, max=0.374, sum=0.748 (2)", - "tab": "Accuracy", - "Human Aging - Observed inference time (s)": { - "description": "min=0.329, mean=0.329, max=0.329, sum=0.658 (2)", - "tab": "Efficiency", - "score": 0.3287716788561355 - }, - "Human Sexuality - Observed inference time (s)": { - "description": "min=0.323, mean=0.323, max=0.323, sum=0.647 (2)", - "tab": "Efficiency", - "score": 0.32337414208105053 - }, - "Human Aging - # eval": { - "description": "min=223, mean=223, max=223, sum=446 (2)", - "tab": "General information", - "score": 223.0 - }, - "Human Aging - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Aging - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Aging - # prompt tokens": { - "description": "min=321.587, mean=321.587, max=321.587, sum=643.175 (2)", - "tab": "General information", - "score": 321.58744394618833 - }, - "Human Aging - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # eval": { - "description": "min=131, mean=131, max=131, sum=262 (2)", - "tab": "General information", - "score": 131.0 - }, - "Human Sexuality - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Sexuality - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # prompt tokens": { - "description": "min=347.183, mean=347.183, max=347.183, sum=694.366 (2)", - "tab": "General information", - "score": 347.1832061068702 - }, - "Human Sexuality - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - 
"generation_config": { - "additional_details": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" - } - } - }, - { - "evaluation_name": "International Law", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on International Law", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.876, - "details": { - "description": "min=0.876, mean=0.876, max=0.876, sum=1.752 (2)", - "tab": "Accuracy", - "International Law - Observed inference time (s)": { - "description": "min=0.349, mean=0.349, max=0.349, sum=0.698 (2)", - "tab": "Efficiency", - "score": 0.34882096219653924 - }, - "International Law - # eval": { - "description": "min=121, mean=121, max=121, sum=242 (2)", - "tab": "General information", - "score": 121.0 - }, - "International Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "International Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "International Law - # prompt tokens": { - "description": "min=644.165, mean=644.165, max=644.165, sum=1288.331 (2)", - "tab": "General information", - "score": 644.1652892561983 - }, - "International Law - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" - } - } - }, - { - "evaluation_name": "Logical Fallacies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Logical Fallacies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.853, - "details": { - "description": "min=0.853, mean=0.853, max=0.853, sum=1.706 (2)", - "tab": "Accuracy", - "Logical Fallacies - Observed inference time (s)": { - "description": "min=0.329, mean=0.329, max=0.329, sum=0.658 (2)", - "tab": "Efficiency", - "score": 0.32894283277125447 - }, - "Logical Fallacies - # eval": { - "description": "min=163, mean=163, max=163, sum=326 (2)", - "tab": "General information", - "score": 163.0 - }, - "Logical Fallacies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Logical Fallacies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Logical Fallacies - # prompt tokens": { - "description": "min=450.049, mean=450.049, max=450.049, sum=900.098 (2)", - "tab": "General information", - "score": 450.0490797546012 - }, - "Logical Fallacies - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "logical_fallacies", - "method": 
"multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" - } - } - }, - { - "evaluation_name": "Machine Learning", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Machine Learning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.562, - "details": { - "description": "min=0.562, mean=0.562, max=0.562, sum=1.125 (2)", - "tab": "Accuracy", - "Machine Learning - Observed inference time (s)": { - "description": "min=0.345, mean=0.345, max=0.345, sum=0.689 (2)", - "tab": "Efficiency", - "score": 0.3445145934820175 - }, - "Machine Learning - # eval": { - "description": "min=112, mean=112, max=112, sum=224 (2)", - "tab": "General information", - "score": 112.0 - }, - "Machine Learning - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Machine Learning - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Machine Learning - # prompt tokens": { - "description": "min=702.402, mean=702.402, max=702.402, sum=1404.804 (2)", - "tab": "General information", - "score": 702.4017857142857 - }, - "Machine Learning - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" - } - } - }, - { - "evaluation_name": "Management", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Management", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.854, - "details": { - "description": "min=0.854, mean=0.854, max=0.854, sum=1.709 (2)", - "tab": "Accuracy", - "Management - Observed inference time (s)": { - "description": "min=0.326, mean=0.326, max=0.326, sum=0.652 (2)", - "tab": "Efficiency", - "score": 0.32611215461805027 - }, - "Management - # eval": { - "description": "min=103, mean=103, max=103, sum=206 (2)", - "tab": "General information", - "score": 103.0 - }, - "Management - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Management - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Management - # prompt tokens": { - "description": "min=281.301, mean=281.301, max=281.301, sum=562.602 (2)", - "tab": "General information", - "score": 281.3009708737864 - }, - "Management - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" - } - } - }, - { - "evaluation_name": "Marketing", - "source_data": { - "dataset_name": 
"helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Marketing", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.936, - "details": { - "description": "min=0.936, mean=0.936, max=0.936, sum=1.872 (2)", - "tab": "Accuracy", - "Marketing - Observed inference time (s)": { - "description": "min=0.331, mean=0.331, max=0.331, sum=0.663 (2)", - "tab": "Efficiency", - "score": 0.3313393389057909 - }, - "Marketing - # eval": { - "description": "min=234, mean=234, max=234, sum=468 (2)", - "tab": "General information", - "score": 234.0 - }, - "Marketing - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Marketing - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Marketing - # prompt tokens": { - "description": "min=428.35, mean=428.35, max=428.35, sum=856.701 (2)", - "tab": "General information", - "score": 428.35042735042737 - }, - "Marketing - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" - } - } - }, - { - "evaluation_name": "Medical Genetics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Medical Genetics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.86, - "details": { - "description": "min=0.86, mean=0.86, max=0.86, sum=1.72 (2)", - "tab": "Accuracy", - "Medical Genetics - Observed inference time (s)": { - "description": "min=0.334, mean=0.334, max=0.334, sum=0.667 (2)", - "tab": "Efficiency", - "score": 0.3336531209945679 - }, - "Medical Genetics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Medical Genetics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Medical Genetics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Medical Genetics - # prompt tokens": { - "description": "min=338.89, mean=338.89, max=338.89, sum=677.78 (2)", - "tab": "General information", - "score": 338.89 - }, - "Medical Genetics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" - } - } - }, - { - "evaluation_name": "Miscellaneous", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - 
"evaluation_description": "EM on Miscellaneous", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.884, - "details": { - "description": "min=0.884, mean=0.884, max=0.884, sum=1.768 (2)", - "tab": "Accuracy", - "Miscellaneous - Observed inference time (s)": { - "description": "min=0.33, mean=0.33, max=0.33, sum=0.66 (2)", - "tab": "Efficiency", - "score": 0.3299713125630814 - }, - "Miscellaneous - # eval": { - "description": "min=783, mean=783, max=783, sum=1566 (2)", - "tab": "General information", - "score": 783.0 - }, - "Miscellaneous - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Miscellaneous - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Miscellaneous - # prompt tokens": { - "description": "min=314.669, mean=314.669, max=314.669, sum=629.338 (2)", - "tab": "General information", - "score": 314.669220945083 - }, - "Miscellaneous - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" - } - } - }, - { - "evaluation_name": "Moral Scenarios", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Moral Scenarios", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.631, - "details": { - "description": "min=0.631, mean=0.631, max=0.631, sum=1.263 (2)", - "tab": "Accuracy", - "Moral Disputes - Observed inference time (s)": { - "description": "min=0.336, mean=0.336, max=0.336, sum=0.671 (2)", - "tab": "Efficiency", - "score": 0.33562634716863216 - }, - "Moral Scenarios - Observed inference time (s)": { - "description": "min=0.347, mean=0.347, max=0.347, sum=0.694 (2)", - "tab": "Efficiency", - "score": 0.34689992780224144 - }, - "Moral Disputes - # eval": { - "description": "min=346, mean=346, max=346, sum=692 (2)", - "tab": "General information", - "score": 346.0 - }, - "Moral Disputes - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Disputes - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Disputes - # prompt tokens": { - "description": "min=495.003, mean=495.003, max=495.003, sum=990.006 (2)", - "tab": "General information", - "score": 495.0028901734104 - }, - "Moral Disputes - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # eval": { - "description": "min=895, mean=895, max=895, sum=1790 (2)", - "tab": "General information", - "score": 895.0 - }, - "Moral Scenarios - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Scenarios - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # prompt tokens": { - 
"description": "min=690.542, mean=690.542, max=690.542, sum=1381.084 (2)", - "tab": "General information", - "score": 690.5418994413408 - }, - "Moral Scenarios - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" - } - } - }, - { - "evaluation_name": "Nutrition", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Nutrition", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.801, - "details": { - "description": "min=0.801, mean=0.801, max=0.801, sum=1.601 (2)", - "tab": "Accuracy", - "Nutrition - Observed inference time (s)": { - "description": "min=0.348, mean=0.348, max=0.348, sum=0.695 (2)", - "tab": "Efficiency", - "score": 0.3477346959456899 - }, - "Nutrition - # eval": { - "description": "min=306, mean=306, max=306, sum=612 (2)", - "tab": "General information", - "score": 306.0 - }, - "Nutrition - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Nutrition - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Nutrition - # prompt tokens": { - "description": "min=585.48, mean=585.48, max=585.48, sum=1170.961 (2)", - "tab": "General information", - "score": 585.4803921568628 - }, - "Nutrition - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" - } - } - }, - { - "evaluation_name": "Prehistory", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Prehistory", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.867, - "details": { - "description": "min=0.867, mean=0.867, max=0.867, sum=1.735 (2)", - "tab": "Accuracy", - "Prehistory - Observed inference time (s)": { - "description": "min=0.347, mean=0.347, max=0.347, sum=0.694 (2)", - "tab": "Efficiency", - "score": 0.34701154850147387 - }, - "Prehistory - # eval": { - "description": "min=324, mean=324, max=324, sum=648 (2)", - "tab": "General information", - "score": 324.0 - }, - "Prehistory - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Prehistory - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Prehistory - # prompt tokens": { - "description": "min=540.198, mean=540.198, max=540.198, sum=1080.395 (2)", - "tab": "General information", - "score": 540.1975308641976 - }, - "Prehistory - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": 
"General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" - } - } - }, - { - "evaluation_name": "Public Relations", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Public Relations", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.773, - "details": { - "description": "min=0.773, mean=0.773, max=0.773, sum=1.545 (2)", - "tab": "Accuracy", - "Public Relations - Observed inference time (s)": { - "description": "min=0.332, mean=0.332, max=0.332, sum=0.664 (2)", - "tab": "Efficiency", - "score": 0.3317977645180442 - }, - "Public Relations - # eval": { - "description": "min=110, mean=110, max=110, sum=220 (2)", - "tab": "General information", - "score": 110.0 - }, - "Public Relations - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Public Relations - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Public Relations - # prompt tokens": { - "description": "min=426.655, mean=426.655, max=426.655, sum=853.309 (2)", - "tab": "General information", - "score": 426.6545454545454 - }, - "Public Relations - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" - } - } - }, - { - "evaluation_name": "Security Studies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Security Studies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.812, - "details": { - "description": "min=0.812, mean=0.812, max=0.812, sum=1.624 (2)", - "tab": "Accuracy", - "Security Studies - Observed inference time (s)": { - "description": "min=0.37, mean=0.37, max=0.37, sum=0.74 (2)", - "tab": "Efficiency", - "score": 0.3700062508485755 - }, - "Security Studies - # eval": { - "description": "min=245, mean=245, max=245, sum=490 (2)", - "tab": "General information", - "score": 245.0 - }, - "Security Studies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Security Studies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Security Studies - # prompt tokens": { - "description": "min=1193.869, mean=1193.869, max=1193.869, sum=2387.739 (2)", - "tab": "General information", - "score": 1193.869387755102 - }, - "Security Studies - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "security_studies", - 
"method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" - } - } - }, - { - "evaluation_name": "Sociology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Sociology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9, - "details": { - "description": "min=0.9, mean=0.9, max=0.9, sum=1.801 (2)", - "tab": "Accuracy", - "Sociology - Observed inference time (s)": { - "description": "min=0.33, mean=0.33, max=0.33, sum=0.66 (2)", - "tab": "Efficiency", - "score": 0.33022794794680466 - }, - "Sociology - # eval": { - "description": "min=201, mean=201, max=201, sum=402 (2)", - "tab": "General information", - "score": 201.0 - }, - "Sociology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Sociology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Sociology - # prompt tokens": { - "description": "min=456.274, mean=456.274, max=456.274, sum=912.547 (2)", - "tab": "General information", - "score": 456.27363184079604 - }, - "Sociology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" - } - } - }, - { - "evaluation_name": "Virology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Virology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.566, - "details": { - "description": "min=0.566, mean=0.566, max=0.566, sum=1.133 (2)", - "tab": "Accuracy", - "Virology - Observed inference time (s)": { - "description": "min=0.329, mean=0.329, max=0.329, sum=0.658 (2)", - "tab": "Efficiency", - "score": 0.3290767310613609 - }, - "Virology - # eval": { - "description": "min=166, mean=166, max=166, sum=332 (2)", - "tab": "General information", - "score": 166.0 - }, - "Virology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Virology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Virology - # prompt tokens": { - "description": "min=336.753, mean=336.753, max=336.753, sum=673.506 (2)", - "tab": "General information", - "score": 336.7530120481928 - }, - "Virology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" - } - } - }, - { - "evaluation_name": "World Religions", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on World Religions", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.871, - "details": { - "description": "min=0.871, mean=0.871, max=0.871, sum=1.743 (2)", - "tab": "Accuracy", - "World Religions - Observed inference time (s)": { - "description": "min=0.326, mean=0.326, max=0.326, sum=0.653 (2)", - "tab": "Efficiency", - "score": 0.3263405735729731 - }, - "World Religions - # eval": { - "description": "min=171, mean=171, max=171, sum=342 (2)", - "tab": "General information", - "score": 171.0 - }, - "World Religions - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "World Religions - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "World Religions - # prompt tokens": { - "description": "min=268.164, mean=268.164, max=268.164, sum=536.327 (2)", - "tab": "General information", - "score": 268.1637426900585 - }, - "World Religions - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" - } - } - }, - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.713, - "details": { - "tab": "Efficiency" - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_mmlu/google/gemini-1.5-pro-001/46d5e547-507e-4c98-98a9-bad1bfad7f7b.json b/data/helm_mmlu/google/gemini-1.5-pro-001/46d5e547-507e-4c98-98a9-bad1bfad7f7b.json deleted file mode 100644 index 0632aee68..000000000 --- a/data/helm_mmlu/google/gemini-1.5-pro-001/46d5e547-507e-4c98-98a9-bad1bfad7f7b.json +++ /dev/null @@ -1,3021 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/google_gemini-1.5-pro-001/1770835937.459157", - "retrieved_timestamp": "1770835937.459157", - "source_metadata": { - "source_name": "helm_mmlu", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gemini 1.5 Pro 001", - "id": "google/gemini-1.5-pro-001", - "developer": "google", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "MMLU All Subjects", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU All Subjects", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 
0.827, - "details": { - "description": "min=0.374, mean=0.827, max=0.974, sum=94.288 (114)", - "tab": "Accuracy", - "MMLU All Subjects - Observed inference time (s)": { - "description": "min=0.519, mean=0.618, max=0.799, sum=70.445 (114)", - "tab": "Efficiency", - "score": 0.6179386045856378 - }, - "MMLU All Subjects - # eval": { - "description": "min=100, mean=246.351, max=1534, sum=28084 (114)", - "tab": "General information", - "score": 246.35087719298247 - }, - "MMLU All Subjects - # train": { - "description": "min=5, mean=5, max=5, sum=570 (114)", - "tab": "General information", - "score": 5.0 - }, - "MMLU All Subjects - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (114)", - "tab": "General information", - "score": 0.0 - }, - "MMLU All Subjects - # prompt tokens": { - "description": "min=268.164, mean=632.617, max=2797.424, sum=72118.345 (114)", - "tab": "General information", - "score": 632.6170571214202 - }, - "MMLU All Subjects - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (114)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - 
"mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] - } - } - }, - { - "evaluation_name": "Abstract Algebra", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Abstract Algebra", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.75, - "details": { - "description": "min=0.75, mean=0.75, max=0.75, sum=1.5 (2)", - "tab": "Accuracy", - "Abstract Algebra - Observed inference time (s)": { - "description": "min=0.659, mean=0.659, max=0.659, sum=1.318 (2)", - "tab": "Efficiency", - "score": 0.6589885497093201 - }, - "Abstract Algebra - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Abstract Algebra - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Abstract Algebra - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Abstract Algebra - # prompt tokens": { - "description": "min=383.97, mean=383.97, max=383.97, sum=767.94 (2)", - "tab": "General information", - "score": 383.97 - }, - "Abstract Algebra - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" - } - } - }, - { - "evaluation_name": "Anatomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Anatomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.83, - "details": { - "description": "min=0.83, mean=0.83, max=0.83, sum=1.659 (2)", - "tab": "Accuracy", - "Anatomy - Observed inference time (s)": { - "description": "min=0.671, mean=0.671, max=0.671, sum=1.342 (2)", - "tab": "Efficiency", - "score": 0.6710023721059163 - }, - "Anatomy - # eval": { - "description": "min=135, mean=135, max=135, sum=270 (2)", - "tab": "General information", - "score": 135.0 - }, - "Anatomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Anatomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Anatomy - # prompt tokens": { - "description": "min=344.356, mean=344.356, 
max=344.356, sum=688.711 (2)", - "tab": "General information", - "score": 344.35555555555555 - }, - "Anatomy - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" - } - } - }, - { - "evaluation_name": "College Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on College Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.745, - "details": { - "description": "min=0.745, mean=0.745, max=0.745, sum=1.49 (2)", - "tab": "Accuracy", - "College Chemistry - Observed inference time (s)": { - "description": "min=0.763, mean=0.763, max=0.763, sum=1.527 (2)", - "tab": "Efficiency", - "score": 0.7634538197517395 - }, - "College Biology - Observed inference time (s)": { - "description": "min=0.622, mean=0.622, max=0.622, sum=1.244 (2)", - "tab": "Efficiency", - "score": 0.6218778673145506 - }, - "College Computer Science - Observed inference time (s)": { - "description": "min=0.664, mean=0.664, max=0.664, sum=1.328 (2)", - "tab": "Efficiency", - "score": 0.6641578316688538 - }, - "College Mathematics - Observed inference time (s)": { - "description": "min=0.694, mean=0.694, max=0.694, sum=1.389 (2)", - "tab": "Efficiency", - "score": 0.6943222141265869 - }, - "College Medicine - Observed inference time (s)": { - "description": "min=0.586, mean=0.586, max=0.586, sum=1.172 (2)", - "tab": "Efficiency", - "score": 0.5860298300065057 - }, - "College Physics - Observed inference time (s)": { - "description": "min=0.799, mean=0.799, max=0.799, sum=1.597 (2)", - "tab": "Efficiency", - "score": 0.7986945521597769 - }, - "College Chemistry - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Chemistry - # prompt tokens": { - "description": "min=570.02, mean=570.02, max=570.02, sum=1140.04 (2)", - "tab": "General information", - "score": 570.02 - }, - "College Chemistry - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # eval": { - "description": "min=144, mean=144, max=144, sum=288 (2)", - "tab": "General information", - "score": 144.0 - }, - "College Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # prompt tokens": { - "description": "min=482.799, mean=482.799, max=482.799, sum=965.597 (2)", - "tab": "General information", - "score": 482.7986111111111 - }, - "College Biology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General 
information", - "score": 0.0 - }, - "College Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer Science - # prompt tokens": { - "description": "min=857.86, mean=857.86, max=857.86, sum=1715.72 (2)", - "tab": "General information", - "score": 857.86 - }, - "College Computer Science - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # prompt tokens": { - "description": "min=626.69, mean=626.69, max=626.69, sum=1253.38 (2)", - "tab": "General information", - "score": 626.69 - }, - "College Mathematics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Medicine - # eval": { - "description": "min=173, mean=173, max=173, sum=346 (2)", - "tab": "General information", - "score": 173.0 - }, - "College Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Medicine - # prompt tokens": { - "description": "min=513.37, mean=513.37, max=513.37, sum=1026.74 (2)", - "tab": "General information", - "score": 513.3699421965318 - }, - "College Medicine - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Physics - # eval": { - "description": "min=102, mean=102, max=102, sum=204 (2)", - "tab": "General information", - "score": 102.0 - }, - "College Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Physics - # prompt tokens": { - "description": "min=507.471, mean=507.471, max=507.471, sum=1014.941 (2)", - "tab": "General information", - "score": 507.47058823529414 - }, - "College Physics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" - } - } - }, - { - "evaluation_name": "Computer Security", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": 
{ - "evaluation_description": "EM on Computer Security", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.83, - "details": { - "description": "min=0.83, mean=0.83, max=0.83, sum=1.66 (2)", - "tab": "Accuracy", - "Computer Security - Observed inference time (s)": { - "description": "min=0.702, mean=0.702, max=0.702, sum=1.404 (2)", - "tab": "Efficiency", - "score": 0.7018922233581543 - }, - "Computer Security - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Computer Security - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Computer Security - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Computer Security - # prompt tokens": { - "description": "min=380.91, mean=380.91, max=380.91, sum=761.82 (2)", - "tab": "General information", - "score": 380.91 - }, - "Computer Security - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" - } - } - }, - { - "evaluation_name": "Econometrics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Econometrics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.728, - "details": { - "description": "min=0.728, mean=0.728, max=0.728, sum=1.456 (2)", - "tab": "Accuracy", - "Econometrics - Observed inference time (s)": { - "description": "min=0.65, mean=0.65, max=0.65, sum=1.3 (2)", - "tab": "Efficiency", - "score": 0.6497656546140972 - }, - "Econometrics - # eval": { - "description": "min=114, mean=114, max=114, sum=228 (2)", - "tab": "General information", - "score": 114.0 - }, - "Econometrics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Econometrics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Econometrics - # prompt tokens": { - "description": "min=634.553, mean=634.553, max=634.553, sum=1269.105 (2)", - "tab": "General information", - "score": 634.5526315789474 - }, - "Econometrics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" - } - } - }, - { - "evaluation_name": "Global Facts", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Global Facts", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 
0.66, - "details": { - "description": "min=0.66, mean=0.66, max=0.66, sum=1.32 (2)", - "tab": "Accuracy", - "Global Facts - Observed inference time (s)": { - "description": "min=0.67, mean=0.67, max=0.67, sum=1.34 (2)", - "tab": "Efficiency", - "score": 0.6698257994651794 - }, - "Global Facts - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Global Facts - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Global Facts - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Global Facts - # prompt tokens": { - "description": "min=456.54, mean=456.54, max=456.54, sum=913.08 (2)", - "tab": "General information", - "score": 456.54 - }, - "Global Facts - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" - } - } - }, - { - "evaluation_name": "Jurisprudence", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Jurisprudence", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.889, - "details": { - "description": "min=0.889, mean=0.889, max=0.889, sum=1.778 (2)", - "tab": "Accuracy", - "Jurisprudence - Observed inference time (s)": { - "description": "min=0.624, mean=0.624, max=0.624, sum=1.248 (2)", - "tab": "Efficiency", - "score": 0.6239932885876408 - }, - "Jurisprudence - # eval": { - "description": "min=108, mean=108, max=108, sum=216 (2)", - "tab": "General information", - "score": 108.0 - }, - "Jurisprudence - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Jurisprudence - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Jurisprudence - # prompt tokens": { - "description": "min=407.87, mean=407.87, max=407.87, sum=815.741 (2)", - "tab": "General information", - "score": 407.8703703703704 - }, - "Jurisprudence - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" - } - } - }, - { - "evaluation_name": "Philosophy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Philosophy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.871, - "details": { - "description": "min=0.871, mean=0.871, max=0.871, sum=1.743 (2)", - "tab": "Accuracy", - "Philosophy - Observed inference time (s)": { - "description": "min=0.52, mean=0.52, max=0.52, sum=1.04 (2)", - 
"tab": "Efficiency", - "score": 0.5198829174041748 - }, - "Philosophy - # eval": { - "description": "min=311, mean=311, max=311, sum=622 (2)", - "tab": "General information", - "score": 311.0 - }, - "Philosophy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Philosophy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Philosophy - # prompt tokens": { - "description": "min=340.907, mean=340.907, max=340.907, sum=681.814 (2)", - "tab": "General information", - "score": 340.90675241157555 - }, - "Philosophy - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" - } - } - }, - { - "evaluation_name": "Professional Psychology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Professional Psychology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.894, - "details": { - "description": "min=0.894, mean=0.894, max=0.894, sum=1.788 (2)", - "tab": "Accuracy", - "Professional Medicine - Observed inference time (s)": { - "description": "min=0.601, mean=0.601, max=0.601, sum=1.202 (2)", - "tab": "Efficiency", - "score": 0.6008452876467546 - }, - "Professional Accounting - Observed inference time (s)": { - "description": "min=0.539, mean=0.539, max=0.539, sum=1.079 (2)", - "tab": "Efficiency", - "score": 0.5394198826864256 - }, - "Professional Law - Observed inference time (s)": { - "description": "min=0.564, mean=0.564, max=0.564, sum=1.128 (2)", - "tab": "Efficiency", - "score": 0.5641645779784438 - }, - "Professional Psychology - Observed inference time (s)": { - "description": "min=0.544, mean=0.544, max=0.544, sum=1.088 (2)", - "tab": "Efficiency", - "score": 0.5440043469792918 - }, - "Professional Medicine - # eval": { - "description": "min=272, mean=272, max=272, sum=544 (2)", - "tab": "General information", - "score": 272.0 - }, - "Professional Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Medicine - # prompt tokens": { - "description": "min=1113.092, mean=1113.092, max=1113.092, sum=2226.184 (2)", - "tab": "General information", - "score": 1113.0919117647059 - }, - "Professional Medicine - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Accounting - # eval": { - "description": "min=282, mean=282, max=282, sum=564 (2)", - "tab": "General information", - "score": 282.0 - }, - "Professional Accounting - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Accounting - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional 
Accounting - # prompt tokens": { - "description": "min=755.418, mean=755.418, max=755.418, sum=1510.837 (2)", - "tab": "General information", - "score": 755.418439716312 - }, - "Professional Accounting - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # eval": { - "description": "min=1534, mean=1534, max=1534, sum=3068 (2)", - "tab": "General information", - "score": 1534.0 - }, - "Professional Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # prompt tokens": { - "description": "min=1685.119, mean=1685.119, max=1685.119, sum=3370.239 (2)", - "tab": "General information", - "score": 1685.119295958279 - }, - "Professional Law - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # eval": { - "description": "min=612, mean=612, max=612, sum=1224 (2)", - "tab": "General information", - "score": 612.0 - }, - "Professional Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # prompt tokens": { - "description": "min=594.363, mean=594.363, max=594.363, sum=1188.725 (2)", - "tab": "General information", - "score": 594.3627450980392 - }, - "Professional Psychology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" - } - } - }, - { - "evaluation_name": "Us Foreign Policy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Us Foreign Policy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.93, - "details": { - "description": "min=0.93, mean=0.93, max=0.93, sum=1.86 (2)", - "tab": "Accuracy", - "Us Foreign Policy - Observed inference time (s)": { - "description": "min=0.677, mean=0.677, max=0.677, sum=1.354 (2)", - "tab": "Efficiency", - "score": 0.6769772005081177 - }, - "Us Foreign Policy - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Us Foreign Policy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Us Foreign Policy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Us Foreign Policy - # prompt tokens": { - "description": "min=438.2, mean=438.2, max=438.2, sum=876.4 (2)", - "tab": "General information", - "score": 438.2 - }, - "Us Foreign Policy - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": 
"General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" - } - } - }, - { - "evaluation_name": "Astronomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Astronomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.914, - "details": { - "description": "min=0.914, mean=0.914, max=0.914, sum=1.829 (2)", - "tab": "Accuracy", - "Astronomy - Observed inference time (s)": { - "description": "min=0.649, mean=0.649, max=0.649, sum=1.298 (2)", - "tab": "Efficiency", - "score": 0.6491834003674356 - }, - "Astronomy - # eval": { - "description": "min=152, mean=152, max=152, sum=304 (2)", - "tab": "General information", - "score": 152.0 - }, - "Astronomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Astronomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Astronomy - # prompt tokens": { - "description": "min=602.421, mean=602.421, max=602.421, sum=1204.842 (2)", - "tab": "General information", - "score": 602.421052631579 - }, - "Astronomy - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" - } - } - }, - { - "evaluation_name": "Business Ethics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Business Ethics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8, - "details": { - "description": "min=0.8, mean=0.8, max=0.8, sum=1.6 (2)", - "tab": "Accuracy", - "Business Ethics - Observed inference time (s)": { - "description": "min=0.697, mean=0.697, max=0.697, sum=1.394 (2)", - "tab": "Efficiency", - "score": 0.697232437133789 - }, - "Business Ethics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Business Ethics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Business Ethics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Business Ethics - # prompt tokens": { - "description": "min=552.87, mean=552.87, max=552.87, sum=1105.74 (2)", - "tab": "General information", - "score": 552.87 - }, - "Business Ethics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": 
"mmlu_business_ethics" - } - } - }, - { - "evaluation_name": "Clinical Knowledge", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Clinical Knowledge", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.853, - "details": { - "description": "min=0.853, mean=0.853, max=0.853, sum=1.706 (2)", - "tab": "Accuracy", - "Clinical Knowledge - Observed inference time (s)": { - "description": "min=0.545, mean=0.545, max=0.545, sum=1.091 (2)", - "tab": "Efficiency", - "score": 0.545333849708989 - }, - "Clinical Knowledge - # eval": { - "description": "min=265, mean=265, max=265, sum=530 (2)", - "tab": "General information", - "score": 265.0 - }, - "Clinical Knowledge - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Clinical Knowledge - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Clinical Knowledge - # prompt tokens": { - "description": "min=402.592, mean=402.592, max=402.592, sum=805.185 (2)", - "tab": "General information", - "score": 402.5924528301887 - }, - "Clinical Knowledge - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" - } - } - }, - { - "evaluation_name": "Conceptual Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Conceptual Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.949, - "details": { - "description": "min=0.949, mean=0.949, max=0.949, sum=1.898 (2)", - "tab": "Accuracy", - "Conceptual Physics - Observed inference time (s)": { - "description": "min=0.573, mean=0.573, max=0.573, sum=1.146 (2)", - "tab": "Efficiency", - "score": 0.5729408700415428 - }, - "Conceptual Physics - # eval": { - "description": "min=235, mean=235, max=235, sum=470 (2)", - "tab": "General information", - "score": 235.0 - }, - "Conceptual Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Conceptual Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Conceptual Physics - # prompt tokens": { - "description": "min=309.213, mean=309.213, max=309.213, sum=618.426 (2)", - "tab": "General information", - "score": 309.21276595744683 - }, - "Conceptual Physics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" - } - } - }, - { - "evaluation_name": "Electrical Engineering", 
- "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Electrical Engineering", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.745, - "details": { - "description": "min=0.745, mean=0.745, max=0.745, sum=1.49 (2)", - "tab": "Accuracy", - "Electrical Engineering - Observed inference time (s)": { - "description": "min=0.622, mean=0.622, max=0.622, sum=1.244 (2)", - "tab": "Efficiency", - "score": 0.6219884050303492 - }, - "Electrical Engineering - # eval": { - "description": "min=145, mean=145, max=145, sum=290 (2)", - "tab": "General information", - "score": 145.0 - }, - "Electrical Engineering - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Electrical Engineering - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Electrical Engineering - # prompt tokens": { - "description": "min=474.786, mean=474.786, max=474.786, sum=949.572 (2)", - "tab": "General information", - "score": 474.78620689655173 - }, - "Electrical Engineering - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" - } - } - }, - { - "evaluation_name": "Elementary Mathematics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Elementary Mathematics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.939, - "details": { - "description": "min=0.939, mean=0.939, max=0.939, sum=1.878 (2)", - "tab": "Accuracy", - "Elementary Mathematics - Observed inference time (s)": { - "description": "min=0.548, mean=0.548, max=0.548, sum=1.097 (2)", - "tab": "Efficiency", - "score": 0.5484477596938926 - }, - "Elementary Mathematics - # eval": { - "description": "min=378, mean=378, max=378, sum=756 (2)", - "tab": "General information", - "score": 378.0 - }, - "Elementary Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Elementary Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Elementary Mathematics - # prompt tokens": { - "description": "min=597.341, mean=597.341, max=597.341, sum=1194.683 (2)", - "tab": "General information", - "score": 597.3412698412699 - }, - "Elementary Mathematics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" - } - } - }, - { - "evaluation_name": "Formal Logic", - 
"source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Formal Logic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.706, - "details": { - "description": "min=0.706, mean=0.706, max=0.706, sum=1.413 (2)", - "tab": "Accuracy", - "Formal Logic - Observed inference time (s)": { - "description": "min=0.668, mean=0.668, max=0.668, sum=1.336 (2)", - "tab": "Efficiency", - "score": 0.6678630435277545 - }, - "Formal Logic - # eval": { - "description": "min=126, mean=126, max=126, sum=252 (2)", - "tab": "General information", - "score": 126.0 - }, - "Formal Logic - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Formal Logic - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Formal Logic - # prompt tokens": { - "description": "min=619.563, mean=619.563, max=619.563, sum=1239.127 (2)", - "tab": "General information", - "score": 619.563492063492 - }, - "Formal Logic - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" - } - } - }, - { - "evaluation_name": "High School World History", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on High School World History", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.924, - "details": { - "description": "min=0.924, mean=0.924, max=0.924, sum=1.848 (2)", - "tab": "Accuracy", - "High School Biology - Observed inference time (s)": { - "description": "min=0.55, mean=0.55, max=0.55, sum=1.1 (2)", - "tab": "Efficiency", - "score": 0.5502124647940358 - }, - "High School Chemistry - Observed inference time (s)": { - "description": "min=0.578, mean=0.578, max=0.578, sum=1.156 (2)", - "tab": "Efficiency", - "score": 0.5780763097584541 - }, - "High School Computer Science - Observed inference time (s)": { - "description": "min=0.66, mean=0.66, max=0.66, sum=1.32 (2)", - "tab": "Efficiency", - "score": 0.6602028679847717 - }, - "High School European History - Observed inference time (s)": { - "description": "min=0.775, mean=0.775, max=0.775, sum=1.55 (2)", - "tab": "Efficiency", - "score": 0.7751016385627515 - }, - "High School Geography - Observed inference time (s)": { - "description": "min=0.571, mean=0.571, max=0.571, sum=1.141 (2)", - "tab": "Efficiency", - "score": 0.5705801778369479 - }, - "High School Government And Politics - Observed inference time (s)": { - "description": "min=0.582, mean=0.582, max=0.582, sum=1.163 (2)", - "tab": "Efficiency", - "score": 0.5816669402344857 - }, - "High School Macroeconomics - Observed inference time (s)": { - "description": "min=0.54, mean=0.54, max=0.54, sum=1.081 (2)", - "tab": "Efficiency", - "score": 0.5402819168873322 - }, - "High School 
Mathematics - Observed inference time (s)": { - "description": "min=0.584, mean=0.584, max=0.584, sum=1.168 (2)", - "tab": "Efficiency", - "score": 0.5841257324925175 - }, - "High School Microeconomics - Observed inference time (s)": { - "description": "min=0.556, mean=0.556, max=0.556, sum=1.113 (2)", - "tab": "Efficiency", - "score": 0.556499927985568 - }, - "High School Physics - Observed inference time (s)": { - "description": "min=0.632, mean=0.632, max=0.632, sum=1.264 (2)", - "tab": "Efficiency", - "score": 0.6318649550936869 - }, - "High School Psychology - Observed inference time (s)": { - "description": "min=0.54, mean=0.54, max=0.54, sum=1.08 (2)", - "tab": "Efficiency", - "score": 0.5397529965814423 - }, - "High School Statistics - Observed inference time (s)": { - "description": "min=0.603, mean=0.603, max=0.603, sum=1.205 (2)", - "tab": "Efficiency", - "score": 0.6027307720096023 - }, - "High School US History - Observed inference time (s)": { - "description": "min=0.762, mean=0.762, max=0.762, sum=1.524 (2)", - "tab": "Efficiency", - "score": 0.7618554059196921 - }, - "High School World History - Observed inference time (s)": { - "description": "min=0.626, mean=0.626, max=0.626, sum=1.252 (2)", - "tab": "Efficiency", - "score": 0.6258294099493872 - }, - "High School Biology - # eval": { - "description": "min=310, mean=310, max=310, sum=620 (2)", - "tab": "General information", - "score": 310.0 - }, - "High School Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Biology - # prompt tokens": { - "description": "min=500.958, mean=500.958, max=500.958, sum=1001.916 (2)", - "tab": "General information", - "score": 500.958064516129 - }, - "High School Biology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # eval": { - "description": "min=203, mean=203, max=203, sum=406 (2)", - "tab": "General information", - "score": 203.0 - }, - "High School Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # prompt tokens": { - "description": "min=513.064, mean=513.064, max=513.064, sum=1026.128 (2)", - "tab": "General information", - "score": 513.064039408867 - }, - "High School Chemistry - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "High School Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # prompt tokens": { - "description": "min=935.13, mean=935.13, max=935.13, sum=1870.26 (2)", - "tab": "General information", - "score": 935.13 - }, - "High School Computer Science - # output tokens": { - "description": 
"min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School European History - # eval": { - "description": "min=165, mean=165, max=165, sum=330 (2)", - "tab": "General information", - "score": 165.0 - }, - "High School European History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School European History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School European History - # prompt tokens": { - "description": "min=2797.424, mean=2797.424, max=2797.424, sum=5594.848 (2)", - "tab": "General information", - "score": 2797.4242424242425 - }, - "High School European History - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Geography - # eval": { - "description": "min=198, mean=198, max=198, sum=396 (2)", - "tab": "General information", - "score": 198.0 - }, - "High School Geography - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Geography - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Geography - # prompt tokens": { - "description": "min=394.773, mean=394.773, max=394.773, sum=789.545 (2)", - "tab": "General information", - "score": 394.77272727272725 - }, - "High School Geography - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # eval": { - "description": "min=193, mean=193, max=193, sum=386 (2)", - "tab": "General information", - "score": 193.0 - }, - "High School Government And Politics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Government And Politics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # prompt tokens": { - "description": "min=479.301, mean=479.301, max=479.301, sum=958.601 (2)", - "tab": "General information", - "score": 479.30051813471505 - }, - "High School Government And Politics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - # eval": { - "description": "min=390, mean=390, max=390, sum=780 (2)", - "tab": "General information", - "score": 390.0 - }, - "High School Macroeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Macroeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - # prompt tokens": { - "description": "min=396.541, mean=396.541, max=396.541, sum=793.082 (2)", - "tab": "General information", - "score": 396.54102564102567 - }, - "High School Macroeconomics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # eval": { - "description": "min=270, mean=270, max=270, sum=540 (2)", - "tab": "General information", - "score": 270.0 - }, - "High School Mathematics - # train": { - 
"description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # prompt tokens": { - "description": "min=566.822, mean=566.822, max=566.822, sum=1133.644 (2)", - "tab": "General information", - "score": 566.8222222222222 - }, - "High School Mathematics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Microeconomics - # eval": { - "description": "min=238, mean=238, max=238, sum=476 (2)", - "tab": "General information", - "score": 238.0 - }, - "High School Microeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Microeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Microeconomics - # prompt tokens": { - "description": "min=415.954, mean=415.954, max=415.954, sum=831.908 (2)", - "tab": "General information", - "score": 415.953781512605 - }, - "High School Microeconomics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # eval": { - "description": "min=151, mean=151, max=151, sum=302 (2)", - "tab": "General information", - "score": 151.0 - }, - "High School Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # prompt tokens": { - "description": "min=591.715, mean=591.715, max=591.715, sum=1183.43 (2)", - "tab": "General information", - "score": 591.7152317880794 - }, - "High School Physics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # eval": { - "description": "min=545, mean=545, max=545, sum=1090 (2)", - "tab": "General information", - "score": 545.0 - }, - "High School Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # prompt tokens": { - "description": "min=502.604, mean=502.604, max=502.604, sum=1005.207 (2)", - "tab": "General information", - "score": 502.60366972477067 - }, - "High School Psychology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Statistics - # eval": { - "description": "min=216, mean=216, max=216, sum=432 (2)", - "tab": "General information", - "score": 216.0 - }, - "High School Statistics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Statistics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Statistics - # prompt tokens": { - "description": "min=858.931, mean=858.931, max=858.931, sum=1717.861 (2)", - "tab": "General 
information", - "score": 858.9305555555555 - }, - "High School Statistics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # eval": { - "description": "min=204, mean=204, max=204, sum=408 (2)", - "tab": "General information", - "score": 204.0 - }, - "High School US History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School US History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # prompt tokens": { - "description": "min=2205.583, mean=2205.583, max=2205.583, sum=4411.167 (2)", - "tab": "General information", - "score": 2205.5833333333335 - }, - "High School US History - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School World History - # eval": { - "description": "min=237, mean=237, max=237, sum=474 (2)", - "tab": "General information", - "score": 237.0 - }, - "High School World History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School World History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School World History - # prompt tokens": { - "description": "min=1426.544, mean=1426.544, max=1426.544, sum=2853.089 (2)", - "tab": "General information", - "score": 1426.5443037974683 - }, - "High School World History - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" - } - } - }, - { - "evaluation_name": "Human Sexuality", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Human Sexuality", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.374, - "details": { - "description": "min=0.374, mean=0.374, max=0.374, sum=0.748 (2)", - "tab": "Accuracy", - "Human Aging - Observed inference time (s)": { - "description": "min=0.563, mean=0.563, max=0.563, sum=1.127 (2)", - "tab": "Efficiency", - "score": 0.5634646939589838 - }, - "Human Sexuality - Observed inference time (s)": { - "description": "min=0.616, mean=0.616, max=0.616, sum=1.231 (2)", - "tab": "Efficiency", - "score": 0.6156448550143484 - }, - "Human Aging - # eval": { - "description": "min=223, mean=223, max=223, sum=446 (2)", - "tab": "General information", - "score": 223.0 - }, - "Human Aging - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Aging - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Aging - # prompt tokens": { - "description": "min=321.587, mean=321.587, max=321.587, sum=643.175 (2)", - "tab": "General information", - "score": 321.58744394618833 - }, - "Human Aging - # 
output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # eval": { - "description": "min=131, mean=131, max=131, sum=262 (2)", - "tab": "General information", - "score": 131.0 - }, - "Human Sexuality - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Sexuality - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # prompt tokens": { - "description": "min=347.183, mean=347.183, max=347.183, sum=694.366 (2)", - "tab": "General information", - "score": 347.1832061068702 - }, - "Human Sexuality - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" - } - } - }, - { - "evaluation_name": "International Law", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on International Law", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.917, - "details": { - "description": "min=0.917, mean=0.917, max=0.917, sum=1.835 (2)", - "tab": "Accuracy", - "International Law - Observed inference time (s)": { - "description": "min=0.673, mean=0.673, max=0.673, sum=1.346 (2)", - "tab": "Efficiency", - "score": 0.672865920815586 - }, - "International Law - # eval": { - "description": "min=121, mean=121, max=121, sum=242 (2)", - "tab": "General information", - "score": 121.0 - }, - "International Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "International Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "International Law - # prompt tokens": { - "description": "min=644.165, mean=644.165, max=644.165, sum=1288.331 (2)", - "tab": "General information", - "score": 644.1652892561983 - }, - "International Law - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" - } - } - }, - { - "evaluation_name": "Logical Fallacies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Logical Fallacies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.896, - "details": { - "description": "min=0.896, mean=0.896, max=0.896, sum=1.791 (2)", - "tab": "Accuracy", - "Logical Fallacies - Observed inference time (s)": { - "description": "min=0.616, mean=0.616, max=0.616, sum=1.233 (2)", - "tab": "Efficiency", - "score": 0.6164792593271454 - 
}, - "Logical Fallacies - # eval": { - "description": "min=163, mean=163, max=163, sum=326 (2)", - "tab": "General information", - "score": 163.0 - }, - "Logical Fallacies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Logical Fallacies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Logical Fallacies - # prompt tokens": { - "description": "min=450.049, mean=450.049, max=450.049, sum=900.098 (2)", - "tab": "General information", - "score": 450.0490797546012 - }, - "Logical Fallacies - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" - } - } - }, - { - "evaluation_name": "Machine Learning", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Machine Learning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.652, - "details": { - "description": "min=0.652, mean=0.652, max=0.652, sum=1.304 (2)", - "tab": "Accuracy", - "Machine Learning - Observed inference time (s)": { - "description": "min=0.638, mean=0.638, max=0.638, sum=1.276 (2)", - "tab": "Efficiency", - "score": 0.6377767409597125 - }, - "Machine Learning - # eval": { - "description": "min=112, mean=112, max=112, sum=224 (2)", - "tab": "General information", - "score": 112.0 - }, - "Machine Learning - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Machine Learning - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Machine Learning - # prompt tokens": { - "description": "min=702.402, mean=702.402, max=702.402, sum=1404.804 (2)", - "tab": "General information", - "score": 702.4017857142857 - }, - "Machine Learning - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" - } - } - }, - { - "evaluation_name": "Management", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Management", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.922, - "details": { - "description": "min=0.922, mean=0.922, max=0.922, sum=1.845 (2)", - "tab": "Accuracy", - "Management - Observed inference time (s)": { - "description": "min=0.669, mean=0.669, max=0.669, sum=1.338 (2)", - "tab": "Efficiency", - "score": 0.6690320089025404 - }, - "Management - # eval": { - "description": "min=103, mean=103, max=103, sum=206 (2)", - "tab": "General information", - "score": 
103.0 - }, - "Management - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Management - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Management - # prompt tokens": { - "description": "min=281.301, mean=281.301, max=281.301, sum=562.602 (2)", - "tab": "General information", - "score": 281.3009708737864 - }, - "Management - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" - } - } - }, - { - "evaluation_name": "Marketing", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Marketing", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.932, - "details": { - "description": "min=0.932, mean=0.932, max=0.932, sum=1.863 (2)", - "tab": "Accuracy", - "Marketing - Observed inference time (s)": { - "description": "min=0.554, mean=0.554, max=0.554, sum=1.107 (2)", - "tab": "Efficiency", - "score": 0.5537131362491183 - }, - "Marketing - # eval": { - "description": "min=234, mean=234, max=234, sum=468 (2)", - "tab": "General information", - "score": 234.0 - }, - "Marketing - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Marketing - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Marketing - # prompt tokens": { - "description": "min=428.35, mean=428.35, max=428.35, sum=856.701 (2)", - "tab": "General information", - "score": 428.35042735042737 - }, - "Marketing - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" - } - } - }, - { - "evaluation_name": "Medical Genetics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Medical Genetics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.91, - "details": { - "description": "min=0.91, mean=0.91, max=0.91, sum=1.82 (2)", - "tab": "Accuracy", - "Medical Genetics - Observed inference time (s)": { - "description": "min=0.678, mean=0.678, max=0.678, sum=1.356 (2)", - "tab": "Efficiency", - "score": 0.678006865978241 - }, - "Medical Genetics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Medical Genetics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Medical Genetics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - 
"tab": "General information", - "score": 0.0 - }, - "Medical Genetics - # prompt tokens": { - "description": "min=338.89, mean=338.89, max=338.89, sum=677.78 (2)", - "tab": "General information", - "score": 338.89 - }, - "Medical Genetics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" - } - } - }, - { - "evaluation_name": "Miscellaneous", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Miscellaneous", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.958, - "details": { - "description": "min=0.958, mean=0.958, max=0.958, sum=1.916 (2)", - "tab": "Accuracy", - "Miscellaneous - Observed inference time (s)": { - "description": "min=0.519, mean=0.519, max=0.519, sum=1.038 (2)", - "tab": "Efficiency", - "score": 0.519028120113972 - }, - "Miscellaneous - # eval": { - "description": "min=783, mean=783, max=783, sum=1566 (2)", - "tab": "General information", - "score": 783.0 - }, - "Miscellaneous - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Miscellaneous - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Miscellaneous - # prompt tokens": { - "description": "min=314.669, mean=314.669, max=314.669, sum=629.338 (2)", - "tab": "General information", - "score": 314.669220945083 - }, - "Miscellaneous - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" - } - } - }, - { - "evaluation_name": "Moral Scenarios", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Moral Scenarios", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.739, - "details": { - "description": "min=0.739, mean=0.739, max=0.739, sum=1.477 (2)", - "tab": "Accuracy", - "Moral Disputes - Observed inference time (s)": { - "description": "min=0.546, mean=0.546, max=0.546, sum=1.092 (2)", - "tab": "Efficiency", - "score": 0.5461560525755952 - }, - "Moral Scenarios - Observed inference time (s)": { - "description": "min=0.536, mean=0.536, max=0.536, sum=1.072 (2)", - "tab": "Efficiency", - "score": 0.5358252359053416 - }, - "Moral Disputes - # eval": { - "description": "min=346, mean=346, max=346, sum=692 (2)", - "tab": "General information", - "score": 346.0 - }, - "Moral Disputes - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Disputes - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - 
"tab": "General information", - "score": 0.0 - }, - "Moral Disputes - # prompt tokens": { - "description": "min=495.003, mean=495.003, max=495.003, sum=990.006 (2)", - "tab": "General information", - "score": 495.0028901734104 - }, - "Moral Disputes - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # eval": { - "description": "min=895, mean=895, max=895, sum=1790 (2)", - "tab": "General information", - "score": 895.0 - }, - "Moral Scenarios - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Scenarios - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # prompt tokens": { - "description": "min=690.542, mean=690.542, max=690.542, sum=1381.084 (2)", - "tab": "General information", - "score": 690.5418994413408 - }, - "Moral Scenarios - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" - } - } - }, - { - "evaluation_name": "Nutrition", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Nutrition", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.879, - "details": { - "description": "min=0.879, mean=0.879, max=0.879, sum=1.758 (2)", - "tab": "Accuracy", - "Nutrition - Observed inference time (s)": { - "description": "min=0.569, mean=0.569, max=0.569, sum=1.139 (2)", - "tab": "Efficiency", - "score": 0.5694240697848252 - }, - "Nutrition - # eval": { - "description": "min=306, mean=306, max=306, sum=612 (2)", - "tab": "General information", - "score": 306.0 - }, - "Nutrition - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Nutrition - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Nutrition - # prompt tokens": { - "description": "min=585.48, mean=585.48, max=585.48, sum=1170.961 (2)", - "tab": "General information", - "score": 585.4803921568628 - }, - "Nutrition - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" - } - } - }, - { - "evaluation_name": "Prehistory", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Prehistory", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.87, - "details": { - "description": "min=0.87, mean=0.87, max=0.87, sum=1.741 (2)", - "tab": "Accuracy", - "Prehistory - 
Observed inference time (s)": { - "description": "min=0.57, mean=0.57, max=0.57, sum=1.141 (2)", - "tab": "Efficiency", - "score": 0.5704048761615047 - }, - "Prehistory - # eval": { - "description": "min=324, mean=324, max=324, sum=648 (2)", - "tab": "General information", - "score": 324.0 - }, - "Prehistory - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Prehistory - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Prehistory - # prompt tokens": { - "description": "min=540.198, mean=540.198, max=540.198, sum=1080.395 (2)", - "tab": "General information", - "score": 540.1975308641976 - }, - "Prehistory - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" - } - } - }, - { - "evaluation_name": "Public Relations", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Public Relations", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.818, - "details": { - "description": "min=0.818, mean=0.818, max=0.818, sum=1.636 (2)", - "tab": "Accuracy", - "Public Relations - Observed inference time (s)": { - "description": "min=0.702, mean=0.702, max=0.702, sum=1.403 (2)", - "tab": "Efficiency", - "score": 0.7017486507242376 - }, - "Public Relations - # eval": { - "description": "min=110, mean=110, max=110, sum=220 (2)", - "tab": "General information", - "score": 110.0 - }, - "Public Relations - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Public Relations - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Public Relations - # prompt tokens": { - "description": "min=426.655, mean=426.655, max=426.655, sum=853.309 (2)", - "tab": "General information", - "score": 426.6545454545454 - }, - "Public Relations - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" - } - } - }, - { - "evaluation_name": "Security Studies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Security Studies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.873, - "details": { - "description": "min=0.873, mean=0.873, max=0.873, sum=1.747 (2)", - "tab": "Accuracy", - "Security Studies - Observed inference time (s)": { - "description": "min=0.6, mean=0.6, max=0.6, sum=1.2 (2)", - "tab": "Efficiency", - "score": 0.6002200584022366 - }, - "Security Studies 
- # eval": { - "description": "min=245, mean=245, max=245, sum=490 (2)", - "tab": "General information", - "score": 245.0 - }, - "Security Studies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Security Studies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Security Studies - # prompt tokens": { - "description": "min=1193.869, mean=1193.869, max=1193.869, sum=2387.739 (2)", - "tab": "General information", - "score": 1193.869387755102 - }, - "Security Studies - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" - } - } - }, - { - "evaluation_name": "Sociology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Sociology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.92, - "details": { - "description": "min=0.92, mean=0.92, max=0.92, sum=1.841 (2)", - "tab": "Accuracy", - "Sociology - Observed inference time (s)": { - "description": "min=0.603, mean=0.603, max=0.603, sum=1.206 (2)", - "tab": "Efficiency", - "score": 0.6029752119263606 - }, - "Sociology - # eval": { - "description": "min=201, mean=201, max=201, sum=402 (2)", - "tab": "General information", - "score": 201.0 - }, - "Sociology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Sociology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Sociology - # prompt tokens": { - "description": "min=456.274, mean=456.274, max=456.274, sum=912.547 (2)", - "tab": "General information", - "score": 456.27363184079604 - }, - "Sociology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" - } - } - }, - { - "evaluation_name": "Virology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Virology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.554, - "details": { - "description": "min=0.554, mean=0.554, max=0.554, sum=1.108 (2)", - "tab": "Accuracy", - "Virology - Observed inference time (s)": { - "description": "min=0.59, mean=0.59, max=0.59, sum=1.181 (2)", - "tab": "Efficiency", - "score": 0.5903763368905309 - }, - "Virology - # eval": { - "description": "min=166, mean=166, max=166, sum=332 (2)", - "tab": "General information", - "score": 166.0 - }, - "Virology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General 
information", - "score": 5.0 - }, - "Virology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Virology - # prompt tokens": { - "description": "min=336.753, mean=336.753, max=336.753, sum=673.506 (2)", - "tab": "General information", - "score": 336.7530120481928 - }, - "Virology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" - } - } - }, - { - "evaluation_name": "World Religions", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on World Religions", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.854, - "details": { - "description": "min=0.854, mean=0.854, max=0.854, sum=1.708 (2)", - "tab": "Accuracy", - "World Religions - Observed inference time (s)": { - "description": "min=0.568, mean=0.568, max=0.568, sum=1.137 (2)", - "tab": "Efficiency", - "score": 0.5682888700250994 - }, - "World Religions - # eval": { - "description": "min=171, mean=171, max=171, sum=342 (2)", - "tab": "General information", - "score": 171.0 - }, - "World Religions - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "World Religions - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "World Religions - # prompt tokens": { - "description": "min=268.164, mean=268.164, max=268.164, sum=536.327 (2)", - "tab": "General information", - "score": 268.1637426900585 - }, - "World Religions - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" - } - } - }, - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.349, - "details": { - "tab": "Efficiency" - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_mmlu/google/gemini-1.5-pro-002/ce32874c-ceb9-4e6b-96bc-ff56cb99be5d.json b/data/helm_mmlu/google/gemini-1.5-pro-002/ce32874c-ceb9-4e6b-96bc-ff56cb99be5d.json deleted file mode 100644 index d6a3ba87a..000000000 --- a/data/helm_mmlu/google/gemini-1.5-pro-002/ce32874c-ceb9-4e6b-96bc-ff56cb99be5d.json +++ /dev/null @@ -1,3021 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/google_gemini-1.5-pro-002/1770835937.459157", - "retrieved_timestamp": "1770835937.459157", - "source_metadata": { 
- "source_name": "helm_mmlu", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gemini 1.5 Pro 002", - "id": "google/gemini-1.5-pro-002", - "developer": "google", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "MMLU All Subjects", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU All Subjects", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.869, - "details": { - "description": "min=0.566, mean=0.869, max=0.99, sum=99.042 (114)", - "tab": "Accuracy", - "MMLU All Subjects - Observed inference time (s)": { - "description": "min=0.42, mean=0.696, max=1.671, sum=79.296 (114)", - "tab": "Efficiency", - "score": 0.695582110070124 - }, - "MMLU All Subjects - # eval": { - "description": "min=100, mean=246.351, max=1534, sum=28084 (114)", - "tab": "General information", - "score": 246.35087719298247 - }, - "MMLU All Subjects - # train": { - "description": "min=5, mean=5, max=5, sum=570 (114)", - "tab": "General information", - "score": 5.0 - }, - "MMLU All Subjects - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (114)", - "tab": "General information", - "score": 0.0 - }, - "MMLU All Subjects - # prompt tokens": { - "description": "min=268.164, mean=632.617, max=2797.424, sum=72118.345 (114)", - "tab": "General information", - "score": 632.6170571214202 - }, - "MMLU All Subjects - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (114)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - 
"mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] - } - } - }, - { - "evaluation_name": "Abstract Algebra", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Abstract Algebra", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.82, - "details": { - "description": "min=0.82, mean=0.82, max=0.82, sum=1.64 (2)", - "tab": "Accuracy", - "Abstract Algebra - Observed inference time (s)": { - "description": "min=1.671, mean=1.671, max=1.671, sum=3.341 (2)", - "tab": "Efficiency", - "score": 1.6706047868728637 - }, - "Abstract Algebra - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Abstract Algebra - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Abstract Algebra - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Abstract Algebra - # prompt tokens": { - "description": "min=383.97, mean=383.97, max=383.97, sum=767.94 (2)", - "tab": "General information", - "score": 383.97 - }, - "Abstract Algebra - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" - } - } - }, - { - "evaluation_name": "Anatomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Anatomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - 
}, - "score_details": { - "score": 0.83, - "details": { - "description": "min=0.83, mean=0.83, max=0.83, sum=1.659 (2)", - "tab": "Accuracy", - "Anatomy - Observed inference time (s)": { - "description": "min=0.653, mean=0.653, max=0.653, sum=1.306 (2)", - "tab": "Efficiency", - "score": 0.652814730891475 - }, - "Anatomy - # eval": { - "description": "min=135, mean=135, max=135, sum=270 (2)", - "tab": "General information", - "score": 135.0 - }, - "Anatomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Anatomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Anatomy - # prompt tokens": { - "description": "min=344.356, mean=344.356, max=344.356, sum=688.711 (2)", - "tab": "General information", - "score": 344.35555555555555 - }, - "Anatomy - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" - } - } - }, - { - "evaluation_name": "College Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on College Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.863, - "details": { - "description": "min=0.863, mean=0.863, max=0.863, sum=1.725 (2)", - "tab": "Accuracy", - "College Chemistry - Observed inference time (s)": { - "description": "min=1.16, mean=1.16, max=1.16, sum=2.319 (2)", - "tab": "Efficiency", - "score": 1.1597088170051575 - }, - "College Biology - Observed inference time (s)": { - "description": "min=0.511, mean=0.511, max=0.511, sum=1.022 (2)", - "tab": "Efficiency", - "score": 0.5110265033112632 - }, - "College Computer Science - Observed inference time (s)": { - "description": "min=0.88, mean=0.88, max=0.88, sum=1.76 (2)", - "tab": "Efficiency", - "score": 0.8800347399711609 - }, - "College Mathematics - Observed inference time (s)": { - "description": "min=0.478, mean=0.478, max=0.478, sum=0.955 (2)", - "tab": "Efficiency", - "score": 0.477603075504303 - }, - "College Medicine - Observed inference time (s)": { - "description": "min=0.865, mean=0.865, max=0.865, sum=1.73 (2)", - "tab": "Efficiency", - "score": 0.8651723158841877 - }, - "College Physics - Observed inference time (s)": { - "description": "min=0.593, mean=0.593, max=0.593, sum=1.186 (2)", - "tab": "Efficiency", - "score": 0.5927850522247016 - }, - "College Chemistry - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Chemistry - # prompt tokens": { - "description": "min=570.02, mean=570.02, max=570.02, sum=1140.04 (2)", - "tab": "General information", - "score": 570.02 - }, - "College Chemistry - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 
(2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # eval": { - "description": "min=144, mean=144, max=144, sum=288 (2)", - "tab": "General information", - "score": 144.0 - }, - "College Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # prompt tokens": { - "description": "min=482.799, mean=482.799, max=482.799, sum=965.597 (2)", - "tab": "General information", - "score": 482.7986111111111 - }, - "College Biology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer Science - # prompt tokens": { - "description": "min=857.86, mean=857.86, max=857.86, sum=1715.72 (2)", - "tab": "General information", - "score": 857.86 - }, - "College Computer Science - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # prompt tokens": { - "description": "min=626.69, mean=626.69, max=626.69, sum=1253.38 (2)", - "tab": "General information", - "score": 626.69 - }, - "College Mathematics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Medicine - # eval": { - "description": "min=173, mean=173, max=173, sum=346 (2)", - "tab": "General information", - "score": 173.0 - }, - "College Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Medicine - # prompt tokens": { - "description": "min=513.37, mean=513.37, max=513.37, sum=1026.74 (2)", - "tab": "General information", - "score": 513.3699421965318 - }, - "College Medicine - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Physics - # eval": { - "description": "min=102, mean=102, max=102, sum=204 (2)", - "tab": "General information", - "score": 102.0 - }, - "College Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Physics - # prompt 
tokens": { - "description": "min=507.471, mean=507.471, max=507.471, sum=1014.941 (2)", - "tab": "General information", - "score": 507.47058823529414 - }, - "College Physics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" - } - } - }, - { - "evaluation_name": "Computer Security", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Computer Security", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.85, - "details": { - "description": "min=0.85, mean=0.85, max=0.85, sum=1.7 (2)", - "tab": "Accuracy", - "Computer Security - Observed inference time (s)": { - "description": "min=0.453, mean=0.453, max=0.453, sum=0.905 (2)", - "tab": "Efficiency", - "score": 0.45262243270874025 - }, - "Computer Security - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Computer Security - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Computer Security - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Computer Security - # prompt tokens": { - "description": "min=380.91, mean=380.91, max=380.91, sum=761.82 (2)", - "tab": "General information", - "score": 380.91 - }, - "Computer Security - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" - } - } - }, - { - "evaluation_name": "Econometrics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Econometrics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.693, - "details": { - "description": "min=0.693, mean=0.693, max=0.693, sum=1.386 (2)", - "tab": "Accuracy", - "Econometrics - Observed inference time (s)": { - "description": "min=1.068, mean=1.068, max=1.068, sum=2.135 (2)", - "tab": "Efficiency", - "score": 1.067676763785513 - }, - "Econometrics - # eval": { - "description": "min=114, mean=114, max=114, sum=228 (2)", - "tab": "General information", - "score": 114.0 - }, - "Econometrics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Econometrics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Econometrics - # prompt tokens": { - "description": "min=634.553, mean=634.553, max=634.553, sum=1269.105 (2)", - "tab": "General information", - "score": 634.5526315789474 - }, - 
"Econometrics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" - } - } - }, - { - "evaluation_name": "Global Facts", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Global Facts", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.77, - "details": { - "description": "min=0.77, mean=0.77, max=0.77, sum=1.54 (2)", - "tab": "Accuracy", - "Global Facts - Observed inference time (s)": { - "description": "min=0.792, mean=0.792, max=0.792, sum=1.584 (2)", - "tab": "Efficiency", - "score": 0.7918326926231384 - }, - "Global Facts - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Global Facts - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Global Facts - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Global Facts - # prompt tokens": { - "description": "min=456.54, mean=456.54, max=456.54, sum=913.08 (2)", - "tab": "General information", - "score": 456.54 - }, - "Global Facts - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" - } - } - }, - { - "evaluation_name": "Jurisprudence", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Jurisprudence", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.898, - "details": { - "description": "min=0.898, mean=0.898, max=0.898, sum=1.796 (2)", - "tab": "Accuracy", - "Jurisprudence - Observed inference time (s)": { - "description": "min=0.76, mean=0.76, max=0.76, sum=1.52 (2)", - "tab": "Efficiency", - "score": 0.7597615586386787 - }, - "Jurisprudence - # eval": { - "description": "min=108, mean=108, max=108, sum=216 (2)", - "tab": "General information", - "score": 108.0 - }, - "Jurisprudence - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Jurisprudence - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Jurisprudence - # prompt tokens": { - "description": "min=407.87, mean=407.87, max=407.87, sum=815.741 (2)", - "tab": "General information", - "score": 407.8703703703704 - }, - "Jurisprudence - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": 
"jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" - } - } - }, - { - "evaluation_name": "Philosophy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Philosophy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.887, - "details": { - "description": "min=0.887, mean=0.887, max=0.887, sum=1.775 (2)", - "tab": "Accuracy", - "Philosophy - Observed inference time (s)": { - "description": "min=0.453, mean=0.453, max=0.453, sum=0.907 (2)", - "tab": "Efficiency", - "score": 0.45336360793405023 - }, - "Philosophy - # eval": { - "description": "min=311, mean=311, max=311, sum=622 (2)", - "tab": "General information", - "score": 311.0 - }, - "Philosophy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Philosophy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Philosophy - # prompt tokens": { - "description": "min=340.907, mean=340.907, max=340.907, sum=681.814 (2)", - "tab": "General information", - "score": 340.90675241157555 - }, - "Philosophy - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" - } - } - }, - { - "evaluation_name": "Professional Psychology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Professional Psychology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.912, - "details": { - "description": "min=0.912, mean=0.912, max=0.912, sum=1.824 (2)", - "tab": "Accuracy", - "Professional Medicine - Observed inference time (s)": { - "description": "min=0.804, mean=0.804, max=0.804, sum=1.609 (2)", - "tab": "Efficiency", - "score": 0.8043198874768089 - }, - "Professional Accounting - Observed inference time (s)": { - "description": "min=0.738, mean=0.738, max=0.738, sum=1.476 (2)", - "tab": "Efficiency", - "score": 0.7378175072636165 - }, - "Professional Law - Observed inference time (s)": { - "description": "min=0.546, mean=0.546, max=0.546, sum=1.091 (2)", - "tab": "Efficiency", - "score": 0.5455011718431694 - }, - "Professional Psychology - Observed inference time (s)": { - "description": "min=0.47, mean=0.47, max=0.47, sum=0.94 (2)", - "tab": "Efficiency", - "score": 0.47001955400105394 - }, - "Professional Medicine - # eval": { - "description": "min=272, mean=272, max=272, sum=544 (2)", - "tab": "General information", - "score": 272.0 - }, - "Professional Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - 
"Professional Medicine - # prompt tokens": { - "description": "min=1113.092, mean=1113.092, max=1113.092, sum=2226.184 (2)", - "tab": "General information", - "score": 1113.0919117647059 - }, - "Professional Medicine - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Accounting - # eval": { - "description": "min=282, mean=282, max=282, sum=564 (2)", - "tab": "General information", - "score": 282.0 - }, - "Professional Accounting - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Accounting - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Accounting - # prompt tokens": { - "description": "min=755.418, mean=755.418, max=755.418, sum=1510.837 (2)", - "tab": "General information", - "score": 755.418439716312 - }, - "Professional Accounting - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # eval": { - "description": "min=1534, mean=1534, max=1534, sum=3068 (2)", - "tab": "General information", - "score": 1534.0 - }, - "Professional Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # prompt tokens": { - "description": "min=1685.119, mean=1685.119, max=1685.119, sum=3370.239 (2)", - "tab": "General information", - "score": 1685.119295958279 - }, - "Professional Law - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # eval": { - "description": "min=612, mean=612, max=612, sum=1224 (2)", - "tab": "General information", - "score": 612.0 - }, - "Professional Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # prompt tokens": { - "description": "min=594.363, mean=594.363, max=594.363, sum=1188.725 (2)", - "tab": "General information", - "score": 594.3627450980392 - }, - "Professional Psychology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" - } - } - }, - { - "evaluation_name": "Us Foreign Policy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Us Foreign Policy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.94, - "details": { - "description": "min=0.94, mean=0.94, max=0.94, sum=1.88 (2)", - "tab": "Accuracy", - "Us Foreign Policy - Observed inference time (s)": { - "description": 
"min=0.533, mean=0.533, max=0.533, sum=1.065 (2)", - "tab": "Efficiency", - "score": 0.5325308299064636 - }, - "Us Foreign Policy - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Us Foreign Policy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Us Foreign Policy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Us Foreign Policy - # prompt tokens": { - "description": "min=438.2, mean=438.2, max=438.2, sum=876.4 (2)", - "tab": "General information", - "score": 438.2 - }, - "Us Foreign Policy - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" - } - } - }, - { - "evaluation_name": "Astronomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Astronomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.934, - "details": { - "description": "min=0.934, mean=0.934, max=0.934, sum=1.868 (2)", - "tab": "Accuracy", - "Astronomy - Observed inference time (s)": { - "description": "min=1.036, mean=1.036, max=1.036, sum=2.071 (2)", - "tab": "Efficiency", - "score": 1.03554652239147 - }, - "Astronomy - # eval": { - "description": "min=152, mean=152, max=152, sum=304 (2)", - "tab": "General information", - "score": 152.0 - }, - "Astronomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Astronomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Astronomy - # prompt tokens": { - "description": "min=602.421, mean=602.421, max=602.421, sum=1204.842 (2)", - "tab": "General information", - "score": 602.421052631579 - }, - "Astronomy - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" - } - } - }, - { - "evaluation_name": "Business Ethics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Business Ethics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.84, - "details": { - "description": "min=0.84, mean=0.84, max=0.84, sum=1.68 (2)", - "tab": "Accuracy", - "Business Ethics - Observed inference time (s)": { - "description": "min=1.112, mean=1.112, max=1.112, sum=2.223 (2)", - "tab": "Efficiency", - "score": 1.1116365933418273 - }, - "Business Ethics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General 
information", - "score": 100.0 - }, - "Business Ethics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Business Ethics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Business Ethics - # prompt tokens": { - "description": "min=552.87, mean=552.87, max=552.87, sum=1105.74 (2)", - "tab": "General information", - "score": 552.87 - }, - "Business Ethics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" - } - } - }, - { - "evaluation_name": "Clinical Knowledge", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Clinical Knowledge", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.906, - "details": { - "description": "min=0.906, mean=0.906, max=0.906, sum=1.811 (2)", - "tab": "Accuracy", - "Clinical Knowledge - Observed inference time (s)": { - "description": "min=0.469, mean=0.469, max=0.469, sum=0.937 (2)", - "tab": "Efficiency", - "score": 0.4685829783385655 - }, - "Clinical Knowledge - # eval": { - "description": "min=265, mean=265, max=265, sum=530 (2)", - "tab": "General information", - "score": 265.0 - }, - "Clinical Knowledge - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Clinical Knowledge - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Clinical Knowledge - # prompt tokens": { - "description": "min=402.592, mean=402.592, max=402.592, sum=805.185 (2)", - "tab": "General information", - "score": 402.5924528301887 - }, - "Clinical Knowledge - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" - } - } - }, - { - "evaluation_name": "Conceptual Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Conceptual Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.945, - "details": { - "description": "min=0.945, mean=0.945, max=0.945, sum=1.889 (2)", - "tab": "Accuracy", - "Conceptual Physics - Observed inference time (s)": { - "description": "min=0.659, mean=0.659, max=0.659, sum=1.317 (2)", - "tab": "Efficiency", - "score": 0.6586567797559373 - }, - "Conceptual Physics - # eval": { - "description": "min=235, mean=235, max=235, sum=470 (2)", - "tab": "General information", - "score": 235.0 - }, - "Conceptual Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 
(2)", - "tab": "General information", - "score": 5.0 - }, - "Conceptual Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Conceptual Physics - # prompt tokens": { - "description": "min=309.213, mean=309.213, max=309.213, sum=618.426 (2)", - "tab": "General information", - "score": 309.21276595744683 - }, - "Conceptual Physics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" - } - } - }, - { - "evaluation_name": "Electrical Engineering", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Electrical Engineering", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.855, - "details": { - "description": "min=0.855, mean=0.855, max=0.855, sum=1.71 (2)", - "tab": "Accuracy", - "Electrical Engineering - Observed inference time (s)": { - "description": "min=0.474, mean=0.474, max=0.474, sum=0.948 (2)", - "tab": "Efficiency", - "score": 0.4739974646732725 - }, - "Electrical Engineering - # eval": { - "description": "min=145, mean=145, max=145, sum=290 (2)", - "tab": "General information", - "score": 145.0 - }, - "Electrical Engineering - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Electrical Engineering - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Electrical Engineering - # prompt tokens": { - "description": "min=474.786, mean=474.786, max=474.786, sum=949.572 (2)", - "tab": "General information", - "score": 474.78620689655173 - }, - "Electrical Engineering - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" - } - } - }, - { - "evaluation_name": "Elementary Mathematics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Elementary Mathematics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.942, - "details": { - "description": "min=0.942, mean=0.942, max=0.942, sum=1.884 (2)", - "tab": "Accuracy", - "Elementary Mathematics - Observed inference time (s)": { - "description": "min=0.58, mean=0.58, max=0.58, sum=1.16 (2)", - "tab": "Efficiency", - "score": 0.5800282936247568 - }, - "Elementary Mathematics - # eval": { - "description": "min=378, mean=378, max=378, sum=756 (2)", - "tab": "General information", - "score": 378.0 - }, - "Elementary Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General 
information", - "score": 5.0 - }, - "Elementary Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Elementary Mathematics - # prompt tokens": { - "description": "min=597.341, mean=597.341, max=597.341, sum=1194.683 (2)", - "tab": "General information", - "score": 597.3412698412699 - }, - "Elementary Mathematics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" - } - } - }, - { - "evaluation_name": "Formal Logic", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Formal Logic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.754, - "details": { - "description": "min=0.754, mean=0.754, max=0.754, sum=1.508 (2)", - "tab": "Accuracy", - "Formal Logic - Observed inference time (s)": { - "description": "min=0.926, mean=0.926, max=0.926, sum=1.852 (2)", - "tab": "Efficiency", - "score": 0.9259536947522845 - }, - "Formal Logic - # eval": { - "description": "min=126, mean=126, max=126, sum=252 (2)", - "tab": "General information", - "score": 126.0 - }, - "Formal Logic - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Formal Logic - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Formal Logic - # prompt tokens": { - "description": "min=619.563, mean=619.563, max=619.563, sum=1239.127 (2)", - "tab": "General information", - "score": 619.563492063492 - }, - "Formal Logic - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" - } - } - }, - { - "evaluation_name": "High School World History", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on High School World History", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.937, - "details": { - "description": "min=0.937, mean=0.937, max=0.937, sum=1.873 (2)", - "tab": "Accuracy", - "High School Biology - Observed inference time (s)": { - "description": "min=0.458, mean=0.458, max=0.458, sum=0.916 (2)", - "tab": "Efficiency", - "score": 0.4579133049134285 - }, - "High School Chemistry - Observed inference time (s)": { - "description": "min=0.587, mean=0.587, max=0.587, sum=1.175 (2)", - "tab": "Efficiency", - "score": 0.5872501540066574 - }, - "High School Computer Science - Observed inference time (s)": { - "description": "min=0.493, mean=0.493, max=0.493, sum=0.987 (2)", - "tab": "Efficiency", - "score": 
0.49327227354049685 - }, - "High School European History - Observed inference time (s)": { - "description": "min=0.84, mean=0.84, max=0.84, sum=1.68 (2)", - "tab": "Efficiency", - "score": 0.8402222113175826 - }, - "High School Geography - Observed inference time (s)": { - "description": "min=0.674, mean=0.674, max=0.674, sum=1.349 (2)", - "tab": "Efficiency", - "score": 0.6743082650984177 - }, - "High School Government And Politics - Observed inference time (s)": { - "description": "min=0.494, mean=0.494, max=0.494, sum=0.988 (2)", - "tab": "Efficiency", - "score": 0.4939905238275083 - }, - "High School Macroeconomics - Observed inference time (s)": { - "description": "min=0.75, mean=0.75, max=0.75, sum=1.501 (2)", - "tab": "Efficiency", - "score": 0.750414514541626 - }, - "High School Mathematics - Observed inference time (s)": { - "description": "min=0.809, mean=0.809, max=0.809, sum=1.618 (2)", - "tab": "Efficiency", - "score": 0.8088616865652579 - }, - "High School Microeconomics - Observed inference time (s)": { - "description": "min=0.711, mean=0.711, max=0.711, sum=1.423 (2)", - "tab": "Efficiency", - "score": 0.711490568994474 - }, - "High School Physics - Observed inference time (s)": { - "description": "min=0.832, mean=0.832, max=0.832, sum=1.664 (2)", - "tab": "Efficiency", - "score": 0.8320141549141992 - }, - "High School Psychology - Observed inference time (s)": { - "description": "min=0.654, mean=0.654, max=0.654, sum=1.309 (2)", - "tab": "Efficiency", - "score": 0.6543280317149031 - }, - "High School Statistics - Observed inference time (s)": { - "description": "min=0.688, mean=0.688, max=0.688, sum=1.377 (2)", - "tab": "Efficiency", - "score": 0.6883480460555466 - }, - "High School US History - Observed inference time (s)": { - "description": "min=0.848, mean=0.848, max=0.848, sum=1.695 (2)", - "tab": "Efficiency", - "score": 0.8477429151535034 - }, - "High School World History - Observed inference time (s)": { - "description": "min=0.641, mean=0.641, max=0.641, sum=1.282 (2)", - "tab": "Efficiency", - "score": 0.6409383886474095 - }, - "High School Biology - # eval": { - "description": "min=310, mean=310, max=310, sum=620 (2)", - "tab": "General information", - "score": 310.0 - }, - "High School Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Biology - # prompt tokens": { - "description": "min=500.958, mean=500.958, max=500.958, sum=1001.916 (2)", - "tab": "General information", - "score": 500.958064516129 - }, - "High School Biology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # eval": { - "description": "min=203, mean=203, max=203, sum=406 (2)", - "tab": "General information", - "score": 203.0 - }, - "High School Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # prompt tokens": { - "description": "min=513.064, mean=513.064, max=513.064, sum=1026.128 (2)", - "tab": "General information", - "score": 513.064039408867 - }, - "High School Chemistry - # output tokens": { - "description": 
"min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "High School Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # prompt tokens": { - "description": "min=935.13, mean=935.13, max=935.13, sum=1870.26 (2)", - "tab": "General information", - "score": 935.13 - }, - "High School Computer Science - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School European History - # eval": { - "description": "min=165, mean=165, max=165, sum=330 (2)", - "tab": "General information", - "score": 165.0 - }, - "High School European History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School European History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School European History - # prompt tokens": { - "description": "min=2797.424, mean=2797.424, max=2797.424, sum=5594.848 (2)", - "tab": "General information", - "score": 2797.4242424242425 - }, - "High School European History - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Geography - # eval": { - "description": "min=198, mean=198, max=198, sum=396 (2)", - "tab": "General information", - "score": 198.0 - }, - "High School Geography - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Geography - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Geography - # prompt tokens": { - "description": "min=394.773, mean=394.773, max=394.773, sum=789.545 (2)", - "tab": "General information", - "score": 394.77272727272725 - }, - "High School Geography - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # eval": { - "description": "min=193, mean=193, max=193, sum=386 (2)", - "tab": "General information", - "score": 193.0 - }, - "High School Government And Politics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Government And Politics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # prompt tokens": { - "description": "min=479.301, mean=479.301, max=479.301, sum=958.601 (2)", - "tab": "General information", - "score": 479.30051813471505 - }, - "High School Government And Politics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - # eval": { - "description": "min=390, mean=390, max=390, sum=780 (2)", - "tab": "General information", - "score": 390.0 - }, - "High School Macroeconomics - # train": { - 
"description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Macroeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - # prompt tokens": { - "description": "min=396.541, mean=396.541, max=396.541, sum=793.082 (2)", - "tab": "General information", - "score": 396.54102564102567 - }, - "High School Macroeconomics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # eval": { - "description": "min=270, mean=270, max=270, sum=540 (2)", - "tab": "General information", - "score": 270.0 - }, - "High School Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # prompt tokens": { - "description": "min=566.822, mean=566.822, max=566.822, sum=1133.644 (2)", - "tab": "General information", - "score": 566.8222222222222 - }, - "High School Mathematics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Microeconomics - # eval": { - "description": "min=238, mean=238, max=238, sum=476 (2)", - "tab": "General information", - "score": 238.0 - }, - "High School Microeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Microeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Microeconomics - # prompt tokens": { - "description": "min=415.954, mean=415.954, max=415.954, sum=831.908 (2)", - "tab": "General information", - "score": 415.953781512605 - }, - "High School Microeconomics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # eval": { - "description": "min=151, mean=151, max=151, sum=302 (2)", - "tab": "General information", - "score": 151.0 - }, - "High School Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # prompt tokens": { - "description": "min=591.715, mean=591.715, max=591.715, sum=1183.43 (2)", - "tab": "General information", - "score": 591.7152317880794 - }, - "High School Physics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # eval": { - "description": "min=545, mean=545, max=545, sum=1090 (2)", - "tab": "General information", - "score": 545.0 - }, - "High School Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # prompt tokens": { - "description": "min=502.604, mean=502.604, max=502.604, sum=1005.207 (2)", - "tab": 
"General information", - "score": 502.60366972477067 - }, - "High School Psychology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Statistics - # eval": { - "description": "min=216, mean=216, max=216, sum=432 (2)", - "tab": "General information", - "score": 216.0 - }, - "High School Statistics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Statistics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Statistics - # prompt tokens": { - "description": "min=858.931, mean=858.931, max=858.931, sum=1717.861 (2)", - "tab": "General information", - "score": 858.9305555555555 - }, - "High School Statistics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # eval": { - "description": "min=204, mean=204, max=204, sum=408 (2)", - "tab": "General information", - "score": 204.0 - }, - "High School US History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School US History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # prompt tokens": { - "description": "min=2205.583, mean=2205.583, max=2205.583, sum=4411.167 (2)", - "tab": "General information", - "score": 2205.5833333333335 - }, - "High School US History - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School World History - # eval": { - "description": "min=237, mean=237, max=237, sum=474 (2)", - "tab": "General information", - "score": 237.0 - }, - "High School World History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School World History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School World History - # prompt tokens": { - "description": "min=1426.544, mean=1426.544, max=1426.544, sum=2853.089 (2)", - "tab": "General information", - "score": 1426.5443037974683 - }, - "High School World History - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" - } - } - }, - { - "evaluation_name": "Human Sexuality", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Human Sexuality", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.878, - "details": { - "description": "min=0.878, mean=0.878, max=0.878, sum=1.756 (2)", - "tab": "Accuracy", - "Human Aging - Observed inference time (s)": { - "description": "min=0.825, mean=0.825, max=0.825, sum=1.651 (2)", - "tab": "Efficiency", - "score": 
0.8252711541984113 - }, - "Human Sexuality - Observed inference time (s)": { - "description": "min=0.689, mean=0.689, max=0.689, sum=1.378 (2)", - "tab": "Efficiency", - "score": 0.689175573014121 - }, - "Human Aging - # eval": { - "description": "min=223, mean=223, max=223, sum=446 (2)", - "tab": "General information", - "score": 223.0 - }, - "Human Aging - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Aging - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Aging - # prompt tokens": { - "description": "min=321.587, mean=321.587, max=321.587, sum=643.175 (2)", - "tab": "General information", - "score": 321.58744394618833 - }, - "Human Aging - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # eval": { - "description": "min=131, mean=131, max=131, sum=262 (2)", - "tab": "General information", - "score": 131.0 - }, - "Human Sexuality - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Sexuality - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # prompt tokens": { - "description": "min=347.183, mean=347.183, max=347.183, sum=694.366 (2)", - "tab": "General information", - "score": 347.1832061068702 - }, - "Human Sexuality - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" - } - } - }, - { - "evaluation_name": "International Law", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on International Law", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.917, - "details": { - "description": "min=0.917, mean=0.917, max=0.917, sum=1.835 (2)", - "tab": "Accuracy", - "International Law - Observed inference time (s)": { - "description": "min=0.544, mean=0.544, max=0.544, sum=1.089 (2)", - "tab": "Efficiency", - "score": 0.5443926212216211 - }, - "International Law - # eval": { - "description": "min=121, mean=121, max=121, sum=242 (2)", - "tab": "General information", - "score": 121.0 - }, - "International Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "International Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "International Law - # prompt tokens": { - "description": "min=644.165, mean=644.165, max=644.165, sum=1288.331 (2)", - "tab": "General information", - "score": 644.1652892561983 - }, - "International Law - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": 
"test", - "groups": "mmlu_international_law" - } - } - }, - { - "evaluation_name": "Logical Fallacies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Logical Fallacies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.902, - "details": { - "description": "min=0.902, mean=0.902, max=0.902, sum=1.804 (2)", - "tab": "Accuracy", - "Logical Fallacies - Observed inference time (s)": { - "description": "min=0.706, mean=0.706, max=0.706, sum=1.412 (2)", - "tab": "Efficiency", - "score": 0.7058728443332977 - }, - "Logical Fallacies - # eval": { - "description": "min=163, mean=163, max=163, sum=326 (2)", - "tab": "General information", - "score": 163.0 - }, - "Logical Fallacies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Logical Fallacies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Logical Fallacies - # prompt tokens": { - "description": "min=450.049, mean=450.049, max=450.049, sum=900.098 (2)", - "tab": "General information", - "score": 450.0490797546012 - }, - "Logical Fallacies - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" - } - } - }, - { - "evaluation_name": "Machine Learning", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Machine Learning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.83, - "details": { - "description": "min=0.83, mean=0.83, max=0.83, sum=1.661 (2)", - "tab": "Accuracy", - "Machine Learning - Observed inference time (s)": { - "description": "min=0.476, mean=0.476, max=0.476, sum=0.952 (2)", - "tab": "Efficiency", - "score": 0.47608799380915506 - }, - "Machine Learning - # eval": { - "description": "min=112, mean=112, max=112, sum=224 (2)", - "tab": "General information", - "score": 112.0 - }, - "Machine Learning - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Machine Learning - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Machine Learning - # prompt tokens": { - "description": "min=702.402, mean=702.402, max=702.402, sum=1404.804 (2)", - "tab": "General information", - "score": 702.4017857142857 - }, - "Machine Learning - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" - } - } - }, - { - "evaluation_name": "Management", - "source_data": { - 
"dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Management", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.903, - "details": { - "description": "min=0.903, mean=0.903, max=0.903, sum=1.806 (2)", - "tab": "Accuracy", - "Management - Observed inference time (s)": { - "description": "min=0.51, mean=0.51, max=0.51, sum=1.02 (2)", - "tab": "Efficiency", - "score": 0.5099537488326286 - }, - "Management - # eval": { - "description": "min=103, mean=103, max=103, sum=206 (2)", - "tab": "General information", - "score": 103.0 - }, - "Management - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Management - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Management - # prompt tokens": { - "description": "min=281.301, mean=281.301, max=281.301, sum=562.602 (2)", - "tab": "General information", - "score": 281.3009708737864 - }, - "Management - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" - } - } - }, - { - "evaluation_name": "Marketing", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Marketing", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.962, - "details": { - "description": "min=0.962, mean=0.962, max=0.962, sum=1.923 (2)", - "tab": "Accuracy", - "Marketing - Observed inference time (s)": { - "description": "min=0.422, mean=0.422, max=0.422, sum=0.843 (2)", - "tab": "Efficiency", - "score": 0.42154710415082103 - }, - "Marketing - # eval": { - "description": "min=234, mean=234, max=234, sum=468 (2)", - "tab": "General information", - "score": 234.0 - }, - "Marketing - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Marketing - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Marketing - # prompt tokens": { - "description": "min=428.35, mean=428.35, max=428.35, sum=856.701 (2)", - "tab": "General information", - "score": 428.35042735042737 - }, - "Marketing - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" - } - } - }, - { - "evaluation_name": "Medical Genetics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Medical 
Genetics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.92, - "details": { - "description": "min=0.92, mean=0.92, max=0.92, sum=1.84 (2)", - "tab": "Accuracy", - "Medical Genetics - Observed inference time (s)": { - "description": "min=0.66, mean=0.66, max=0.66, sum=1.321 (2)", - "tab": "Efficiency", - "score": 0.6604956579208374 - }, - "Medical Genetics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Medical Genetics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Medical Genetics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Medical Genetics - # prompt tokens": { - "description": "min=338.89, mean=338.89, max=338.89, sum=677.78 (2)", - "tab": "General information", - "score": 338.89 - }, - "Medical Genetics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" - } - } - }, - { - "evaluation_name": "Miscellaneous", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Miscellaneous", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.959, - "details": { - "description": "min=0.959, mean=0.959, max=0.959, sum=1.918 (2)", - "tab": "Accuracy", - "Miscellaneous - Observed inference time (s)": { - "description": "min=0.564, mean=0.564, max=0.564, sum=1.128 (2)", - "tab": "Efficiency", - "score": 0.5638943230055301 - }, - "Miscellaneous - # eval": { - "description": "min=783, mean=783, max=783, sum=1566 (2)", - "tab": "General information", - "score": 783.0 - }, - "Miscellaneous - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Miscellaneous - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Miscellaneous - # prompt tokens": { - "description": "min=314.669, mean=314.669, max=314.669, sum=629.338 (2)", - "tab": "General information", - "score": 314.669220945083 - }, - "Miscellaneous - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" - } - } - }, - { - "evaluation_name": "Moral Scenarios", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Moral Scenarios", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.792, - "details": { - 
"description": "min=0.792, mean=0.792, max=0.792, sum=1.584 (2)", - "tab": "Accuracy", - "Moral Disputes - Observed inference time (s)": { - "description": "min=1.245, mean=1.245, max=1.245, sum=2.49 (2)", - "tab": "Efficiency", - "score": 1.244819999430221 - }, - "Moral Scenarios - Observed inference time (s)": { - "description": "min=1.526, mean=1.526, max=1.526, sum=3.052 (2)", - "tab": "Efficiency", - "score": 1.5260936177642652 - }, - "Moral Disputes - # eval": { - "description": "min=346, mean=346, max=346, sum=692 (2)", - "tab": "General information", - "score": 346.0 - }, - "Moral Disputes - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Disputes - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Disputes - # prompt tokens": { - "description": "min=495.003, mean=495.003, max=495.003, sum=990.006 (2)", - "tab": "General information", - "score": 495.0028901734104 - }, - "Moral Disputes - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # eval": { - "description": "min=895, mean=895, max=895, sum=1790 (2)", - "tab": "General information", - "score": 895.0 - }, - "Moral Scenarios - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Scenarios - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # prompt tokens": { - "description": "min=690.542, mean=690.542, max=690.542, sum=1381.084 (2)", - "tab": "General information", - "score": 690.5418994413408 - }, - "Moral Scenarios - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" - } - } - }, - { - "evaluation_name": "Nutrition", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Nutrition", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.886, - "details": { - "description": "min=0.886, mean=0.886, max=0.886, sum=1.771 (2)", - "tab": "Accuracy", - "Nutrition - Observed inference time (s)": { - "description": "min=0.629, mean=0.629, max=0.629, sum=1.259 (2)", - "tab": "Efficiency", - "score": 0.6292609475017373 - }, - "Nutrition - # eval": { - "description": "min=306, mean=306, max=306, sum=612 (2)", - "tab": "General information", - "score": 306.0 - }, - "Nutrition - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Nutrition - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Nutrition - # prompt tokens": { - "description": "min=585.48, mean=585.48, max=585.48, sum=1170.961 (2)", - "tab": "General information", - "score": 585.4803921568628 - }, - "Nutrition - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": 
"General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" - } - } - }, - { - "evaluation_name": "Prehistory", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Prehistory", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.926, - "details": { - "description": "min=0.926, mean=0.926, max=0.926, sum=1.852 (2)", - "tab": "Accuracy", - "Prehistory - Observed inference time (s)": { - "description": "min=0.54, mean=0.54, max=0.54, sum=1.08 (2)", - "tab": "Efficiency", - "score": 0.5400909362015901 - }, - "Prehistory - # eval": { - "description": "min=324, mean=324, max=324, sum=648 (2)", - "tab": "General information", - "score": 324.0 - }, - "Prehistory - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Prehistory - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Prehistory - # prompt tokens": { - "description": "min=540.198, mean=540.198, max=540.198, sum=1080.395 (2)", - "tab": "General information", - "score": 540.1975308641976 - }, - "Prehistory - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" - } - } - }, - { - "evaluation_name": "Public Relations", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Public Relations", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.809, - "details": { - "description": "min=0.809, mean=0.809, max=0.809, sum=1.618 (2)", - "tab": "Accuracy", - "Public Relations - Observed inference time (s)": { - "description": "min=0.442, mean=0.442, max=0.442, sum=0.884 (2)", - "tab": "Efficiency", - "score": 0.4420530059120872 - }, - "Public Relations - # eval": { - "description": "min=110, mean=110, max=110, sum=220 (2)", - "tab": "General information", - "score": 110.0 - }, - "Public Relations - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Public Relations - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Public Relations - # prompt tokens": { - "description": "min=426.655, mean=426.655, max=426.655, sum=853.309 (2)", - "tab": "General information", - "score": 426.6545454545454 - }, - "Public Relations - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - 
"groups": "mmlu_public_relations" - } - } - }, - { - "evaluation_name": "Security Studies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Security Studies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.857, - "details": { - "description": "min=0.857, mean=0.857, max=0.857, sum=1.714 (2)", - "tab": "Accuracy", - "Security Studies - Observed inference time (s)": { - "description": "min=0.443, mean=0.443, max=0.443, sum=0.886 (2)", - "tab": "Efficiency", - "score": 0.44290724871109943 - }, - "Security Studies - # eval": { - "description": "min=245, mean=245, max=245, sum=490 (2)", - "tab": "General information", - "score": 245.0 - }, - "Security Studies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Security Studies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Security Studies - # prompt tokens": { - "description": "min=1193.869, mean=1193.869, max=1193.869, sum=2387.739 (2)", - "tab": "General information", - "score": 1193.869387755102 - }, - "Security Studies - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" - } - } - }, - { - "evaluation_name": "Sociology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Sociology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.95, - "details": { - "description": "min=0.95, mean=0.95, max=0.95, sum=1.9 (2)", - "tab": "Accuracy", - "Sociology - Observed inference time (s)": { - "description": "min=0.42, mean=0.42, max=0.42, sum=0.841 (2)", - "tab": "Efficiency", - "score": 0.4202856958208986 - }, - "Sociology - # eval": { - "description": "min=201, mean=201, max=201, sum=402 (2)", - "tab": "General information", - "score": 201.0 - }, - "Sociology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Sociology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Sociology - # prompt tokens": { - "description": "min=456.274, mean=456.274, max=456.274, sum=912.547 (2)", - "tab": "General information", - "score": 456.27363184079604 - }, - "Sociology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" - } - } - }, - { - "evaluation_name": "Virology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Virology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.566, - "details": { - "description": "min=0.566, mean=0.566, max=0.566, sum=1.133 (2)", - "tab": "Accuracy", - "Virology - Observed inference time (s)": { - "description": "min=0.425, mean=0.425, max=0.425, sum=0.849 (2)", - "tab": "Efficiency", - "score": 0.4245123575968915 - }, - "Virology - # eval": { - "description": "min=166, mean=166, max=166, sum=332 (2)", - "tab": "General information", - "score": 166.0 - }, - "Virology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Virology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Virology - # prompt tokens": { - "description": "min=336.753, mean=336.753, max=336.753, sum=673.506 (2)", - "tab": "General information", - "score": 336.7530120481928 - }, - "Virology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" - } - } - }, - { - "evaluation_name": "World Religions", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on World Religions", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.889, - "details": { - "description": "min=0.889, mean=0.889, max=0.889, sum=1.778 (2)", - "tab": "Accuracy", - "World Religions - Observed inference time (s)": { - "description": "min=0.421, mean=0.421, max=0.421, sum=0.842 (2)", - "tab": "Efficiency", - "score": 0.4207720505563836 - }, - "World Religions - # eval": { - "description": "min=171, mean=171, max=171, sum=342 (2)", - "tab": "General information", - "score": 171.0 - }, - "World Religions - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "World Religions - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "World Religions - # prompt tokens": { - "description": "min=268.164, mean=268.164, max=268.164, sum=536.327 (2)", - "tab": "General information", - "score": 268.1637426900585 - }, - "World Religions - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" - } - } - }, - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model 
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.334,
-        "details": {
-          "tab": "Efficiency"
-        }
-      },
-      "generation_config": {
-        "additional_details": {}
-      }
-    }
-  ]
-}
\ No newline at end of file
diff --git a/data/helm_mmlu/google/gemini-1.5-pro-preview-0409/2b31b441-caa9-465c-a2d2-051c951c7be3.json b/data/helm_mmlu/google/gemini-1.5-pro-preview-0409/2b31b441-caa9-465c-a2d2-051c951c7be3.json
deleted file mode 100644
index de3a77c03..000000000
--- a/data/helm_mmlu/google/gemini-1.5-pro-preview-0409/2b31b441-caa9-465c-a2d2-051c951c7be3.json
+++ /dev/null
@@ -1,3021 +0,0 @@
-{
-  "schema_version": "0.2.0",
-  "evaluation_id": "helm_mmlu/google_gemini-1.5-pro-preview-0409/1770835937.459157",
-  "retrieved_timestamp": "1770835937.459157",
-  "source_metadata": {
-    "source_name": "helm_mmlu",
-    "source_type": "documentation",
-    "source_organization_name": "crfm",
-    "evaluator_relationship": "third_party"
-  },
-  "model_info": {
-    "name": "Gemini 1.5 Pro 0409 preview",
-    "id": "google/gemini-1.5-pro-preview-0409",
-    "developer": "google",
-    "inference_platform": "unknown"
-  },
-  "evaluation_results": [
-    {
-      "evaluation_name": "MMLU All Subjects",
-      "source_data": {
-        "dataset_name": "helm_mmlu",
-        "source_type": "url",
-        "url": [
-          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
-        ]
-      },
-      "metric_config": {
-        "evaluation_description": "EM on MMLU All Subjects",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.81,
-        "details": {
-          "description": "min=0.397, mean=0.81, max=0.979, sum=92.284 (114)",
-          "tab": "Accuracy",
-          "MMLU All Subjects - Observed inference time (s)": {
-            "description": "min=0.877, mean=1.174, max=3.173, sum=133.815 (114)",
-            "tab": "Efficiency",
-            "score": 1.1738183835156866
-          },
-          "MMLU All Subjects - # eval": {
-            "description": "min=100, mean=246.351, max=1534, sum=28084 (114)",
-            "tab": "General information",
-            "score": 246.35087719298247
-          },
-          "MMLU All Subjects - # train": {
-            "description": "min=5, mean=5, max=5, sum=570 (114)",
-            "tab": "General information",
-            "score": 5.0
-          },
-          "MMLU All Subjects - truncated": {
-            "description": "min=0, mean=0, max=0, sum=0 (114)",
-            "tab": "General information",
-            "score": 0.0
-          },
-          "MMLU All Subjects - # prompt tokens": {
-            "description": "min=268.164, mean=632.617, max=2797.424, sum=72118.345 (114)",
-            "tab": "General information",
-            "score": 632.6170571214202
-          },
-          "MMLU All Subjects - # output tokens": {
-            "description": "min=0, mean=0, max=0, sum=0 (114)",
-            "tab": "General information",
-            "score": 0.0
-          }
-        }
-      },
-      "generation_config": {
-        "additional_details": {
-          "subject": [
-            "abstract_algebra",
-            "anatomy",
-            "astronomy",
-            "business_ethics",
-            "clinical_knowledge",
-            "college_biology",
-            "college_chemistry",
-            "college_computer_science",
-            "college_mathematics",
-            "college_medicine",
-            "college_physics",
-            "computer_security",
-            "conceptual_physics",
-            "econometrics",
-            "electrical_engineering",
-            "elementary_mathematics",
-            "formal_logic",
-            "global_facts",
-            "high_school_biology",
-            "high_school_chemistry",
-            "high_school_computer_science",
-            "high_school_european_history",
-            "high_school_geography",
-            "high_school_government_and_politics",
-            "high_school_macroeconomics",
-            "high_school_mathematics",
-            "high_school_microeconomics",
"high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] - } - } - }, - { - "evaluation_name": "Abstract Algebra", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Abstract Algebra", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6, - "details": { - "description": "min=0.6, mean=0.6, max=0.6, sum=1.2 (2)", - "tab": "Accuracy", - "Abstract Algebra - Observed inference time (s)": { - "description": "min=1.767, mean=1.767, max=1.767, sum=3.533 (2)", - "tab": "Efficiency", - "score": 1.7665750813484191 - }, - "Abstract Algebra - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Abstract Algebra - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Abstract Algebra - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Abstract Algebra - # prompt 
tokens": { - "description": "min=383.97, mean=383.97, max=383.97, sum=767.94 (2)", - "tab": "General information", - "score": 383.97 - }, - "Abstract Algebra - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" - } - } - }, - { - "evaluation_name": "Anatomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Anatomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.77, - "details": { - "description": "min=0.77, mean=0.77, max=0.77, sum=1.541 (2)", - "tab": "Accuracy", - "Anatomy - Observed inference time (s)": { - "description": "min=3.173, mean=3.173, max=3.173, sum=6.346 (2)", - "tab": "Efficiency", - "score": 3.1730875386132134 - }, - "Anatomy - # eval": { - "description": "min=135, mean=135, max=135, sum=270 (2)", - "tab": "General information", - "score": 135.0 - }, - "Anatomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Anatomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Anatomy - # prompt tokens": { - "description": "min=344.356, mean=344.356, max=344.356, sum=688.711 (2)", - "tab": "General information", - "score": 344.35555555555555 - }, - "Anatomy - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" - } - } - }, - { - "evaluation_name": "College Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on College Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.804, - "details": { - "description": "min=0.804, mean=0.804, max=0.804, sum=1.608 (2)", - "tab": "Accuracy", - "College Chemistry - Observed inference time (s)": { - "description": "min=1.054, mean=1.054, max=1.054, sum=2.107 (2)", - "tab": "Efficiency", - "score": 1.053539514541626 - }, - "College Biology - Observed inference time (s)": { - "description": "min=0.985, mean=0.985, max=0.985, sum=1.971 (2)", - "tab": "Efficiency", - "score": 0.9854124503003227 - }, - "College Computer Science - Observed inference time (s)": { - "description": "min=1.301, mean=1.301, max=1.301, sum=2.603 (2)", - "tab": "Efficiency", - "score": 1.3013164806365967 - }, - "College Mathematics - Observed inference time (s)": { - "description": "min=1.187, mean=1.187, max=1.187, sum=2.375 (2)", - "tab": "Efficiency", - "score": 1.1873565983772278 - }, - "College Medicine - Observed inference time (s)": { - "description": "min=1.149, mean=1.149, max=1.149, sum=2.298 (2)", - "tab": 
"Efficiency", - "score": 1.1490558723493807 - }, - "College Physics - Observed inference time (s)": { - "description": "min=1.017, mean=1.017, max=1.017, sum=2.034 (2)", - "tab": "Efficiency", - "score": 1.0169454929875392 - }, - "College Chemistry - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Chemistry - # prompt tokens": { - "description": "min=570.02, mean=570.02, max=570.02, sum=1140.04 (2)", - "tab": "General information", - "score": 570.02 - }, - "College Chemistry - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # eval": { - "description": "min=144, mean=144, max=144, sum=288 (2)", - "tab": "General information", - "score": 144.0 - }, - "College Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # prompt tokens": { - "description": "min=482.799, mean=482.799, max=482.799, sum=965.597 (2)", - "tab": "General information", - "score": 482.7986111111111 - }, - "College Biology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer Science - # prompt tokens": { - "description": "min=857.86, mean=857.86, max=857.86, sum=1715.72 (2)", - "tab": "General information", - "score": 857.86 - }, - "College Computer Science - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # prompt tokens": { - "description": "min=626.69, mean=626.69, max=626.69, sum=1253.38 (2)", - "tab": "General information", - "score": 626.69 - }, - "College Mathematics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Medicine - # eval": { - "description": "min=173, mean=173, max=173, sum=346 (2)", - "tab": "General information", - "score": 173.0 - }, - "College Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - 
"College Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Medicine - # prompt tokens": { - "description": "min=513.37, mean=513.37, max=513.37, sum=1026.74 (2)", - "tab": "General information", - "score": 513.3699421965318 - }, - "College Medicine - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Physics - # eval": { - "description": "min=102, mean=102, max=102, sum=204 (2)", - "tab": "General information", - "score": 102.0 - }, - "College Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Physics - # prompt tokens": { - "description": "min=507.471, mean=507.471, max=507.471, sum=1014.941 (2)", - "tab": "General information", - "score": 507.47058823529414 - }, - "College Physics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" - } - } - }, - { - "evaluation_name": "Computer Security", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Computer Security", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.81, - "details": { - "description": "min=0.81, mean=0.81, max=0.81, sum=1.62 (2)", - "tab": "Accuracy", - "Computer Security - Observed inference time (s)": { - "description": "min=1.26, mean=1.26, max=1.26, sum=2.52 (2)", - "tab": "Efficiency", - "score": 1.2601169872283935 - }, - "Computer Security - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Computer Security - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Computer Security - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Computer Security - # prompt tokens": { - "description": "min=380.91, mean=380.91, max=380.91, sum=761.82 (2)", - "tab": "General information", - "score": 380.91 - }, - "Computer Security - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" - } - } - }, - { - "evaluation_name": "Econometrics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Econometrics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.737, - "details": { - "description": "min=0.737, mean=0.737, max=0.737, sum=1.474 (2)", - "tab": "Accuracy", - "Econometrics - Observed inference time (s)": { - "description": "min=0.968, mean=0.968, max=0.968, sum=1.936 (2)", - "tab": "Efficiency", - "score": 0.9679407843372279 - }, - "Econometrics - # eval": { - "description": "min=114, mean=114, max=114, sum=228 (2)", - "tab": "General information", - "score": 114.0 - }, - "Econometrics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Econometrics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Econometrics - # prompt tokens": { - "description": "min=634.553, mean=634.553, max=634.553, sum=1269.105 (2)", - "tab": "General information", - "score": 634.5526315789474 - }, - "Econometrics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" - } - } - }, - { - "evaluation_name": "Global Facts", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Global Facts", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.66, - "details": { - "description": "min=0.66, mean=0.66, max=0.66, sum=1.32 (2)", - "tab": "Accuracy", - "Global Facts - Observed inference time (s)": { - "description": "min=1.066, mean=1.066, max=1.066, sum=2.132 (2)", - "tab": "Efficiency", - "score": 1.065871012210846 - }, - "Global Facts - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Global Facts - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Global Facts - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Global Facts - # prompt tokens": { - "description": "min=456.54, mean=456.54, max=456.54, sum=913.08 (2)", - "tab": "General information", - "score": 456.54 - }, - "Global Facts - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" - } - } - }, - { - "evaluation_name": "Jurisprudence", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Jurisprudence", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.87, - "details": { - "description": "min=0.87, mean=0.87, max=0.87, sum=1.741 (2)", - "tab": "Accuracy", - "Jurisprudence - Observed inference time (s)": { - "description": "min=1.079, 
-            "tab": "Efficiency",
-            "score": 1.0785565420433327
-          },
-          "Jurisprudence - # eval": {
-            "description": "min=108, mean=108, max=108, sum=216 (2)",
-            "tab": "General information",
-            "score": 108.0
-          },
-          "Jurisprudence - # train": {
-            "description": "min=5, mean=5, max=5, sum=10 (2)",
-            "tab": "General information",
-            "score": 5.0
-          },
-          "Jurisprudence - truncated": {
-            "description": "min=0, mean=0, max=0, sum=0 (2)",
-            "tab": "General information",
-            "score": 0.0
-          },
-          "Jurisprudence - # prompt tokens": {
-            "description": "min=407.87, mean=407.87, max=407.87, sum=815.741 (2)",
-            "tab": "General information",
-            "score": 407.8703703703704
-          },
-          "Jurisprudence - # output tokens": {
-            "description": "min=0, mean=0, max=0, sum=0 (2)",
-            "tab": "General information",
-            "score": 0.0
-          }
-        }
-      },
-      "generation_config": {
-        "additional_details": {
-          "subject": "jurisprudence",
-          "method": "multiple_choice_joint",
-          "eval_split": "test",
-          "groups": "mmlu_jurisprudence"
-        }
-      }
-    },
-    {
-      "evaluation_name": "Philosophy",
-      "source_data": {
-        "dataset_name": "helm_mmlu",
-        "source_type": "url",
-        "url": [
-          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
-        ]
-      },
-      "metric_config": {
-        "evaluation_description": "EM on Philosophy",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.846,
-        "details": {
-          "description": "min=0.846, mean=0.846, max=0.846, sum=1.691 (2)",
-          "tab": "Accuracy",
-          "Philosophy - Observed inference time (s)": {
-            "description": "min=1.057, mean=1.057, max=1.057, sum=2.114 (2)",
-            "tab": "Efficiency",
-            "score": 1.0571237967328626
-          },
-          "Philosophy - # eval": {
-            "description": "min=311, mean=311, max=311, sum=622 (2)",
-            "tab": "General information",
-            "score": 311.0
-          },
-          "Philosophy - # train": {
-            "description": "min=5, mean=5, max=5, sum=10 (2)",
-            "tab": "General information",
-            "score": 5.0
-          },
-          "Philosophy - truncated": {
-            "description": "min=0, mean=0, max=0, sum=0 (2)",
-            "tab": "General information",
-            "score": 0.0
-          },
-          "Philosophy - # prompt tokens": {
-            "description": "min=340.907, mean=340.907, max=340.907, sum=681.814 (2)",
-            "tab": "General information",
-            "score": 340.90675241157555
-          },
-          "Philosophy - # output tokens": {
-            "description": "min=0, mean=0, max=0, sum=0 (2)",
-            "tab": "General information",
-            "score": 0.0
-          }
-        }
-      },
-      "generation_config": {
-        "additional_details": {
-          "subject": "philosophy",
-          "method": "multiple_choice_joint",
-          "eval_split": "test",
-          "groups": "mmlu_philosophy"
-        }
-      }
-    },
-    {
-      "evaluation_name": "Professional Psychology",
-      "source_data": {
-        "dataset_name": "helm_mmlu",
-        "source_type": "url",
-        "url": [
-          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
-        ]
-      },
-      "metric_config": {
-        "evaluation_description": "EM on Professional Psychology",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.866,
-        "details": {
-          "description": "min=0.866, mean=0.866, max=0.866, sum=1.732 (2)",
-          "tab": "Accuracy",
-          "Professional Medicine - Observed inference time (s)": {
-            "description": "min=1.258, mean=1.258, max=1.258, sum=2.516 (2)",
-            "tab": "Efficiency",
-            "score": 1.2578288101182213
-          },
-          "Professional Accounting - Observed inference time (s)": {
-            "description": "min=1.179, mean=1.179, max=1.179, sum=2.359 (2)",
-            "tab": "Efficiency",
-            "score": 1.1793269350173625
-          },
-          "Professional Law - Observed inference time (s)": {
-            "description": "min=1.246, mean=1.246, max=1.246, sum=2.491 (2)",
-            "tab": "Efficiency",
-            "score": 1.2455504093494716
-          },
-          "Professional Psychology - Observed inference time (s)": {
-            "description": "min=1.181, mean=1.181, max=1.181, sum=2.362 (2)",
-            "tab": "Efficiency",
-            "score": 1.1811600880403268
-          },
-          "Professional Medicine - # eval": {
-            "description": "min=272, mean=272, max=272, sum=544 (2)",
-            "tab": "General information",
-            "score": 272.0
-          },
-          "Professional Medicine - # train": {
-            "description": "min=5, mean=5, max=5, sum=10 (2)",
-            "tab": "General information",
-            "score": 5.0
-          },
-          "Professional Medicine - truncated": {
-            "description": "min=0, mean=0, max=0, sum=0 (2)",
-            "tab": "General information",
-            "score": 0.0
-          },
-          "Professional Medicine - # prompt tokens": {
-            "description": "min=1113.092, mean=1113.092, max=1113.092, sum=2226.184 (2)",
-            "tab": "General information",
-            "score": 1113.0919117647059
-          },
-          "Professional Medicine - # output tokens": {
-            "description": "min=0, mean=0, max=0, sum=0 (2)",
-            "tab": "General information",
-            "score": 0.0
-          },
-          "Professional Accounting - # eval": {
-            "description": "min=282, mean=282, max=282, sum=564 (2)",
-            "tab": "General information",
-            "score": 282.0
-          },
-          "Professional Accounting - # train": {
-            "description": "min=5, mean=5, max=5, sum=10 (2)",
-            "tab": "General information",
-            "score": 5.0
-          },
-          "Professional Accounting - truncated": {
-            "description": "min=0, mean=0, max=0, sum=0 (2)",
-            "tab": "General information",
-            "score": 0.0
-          },
-          "Professional Accounting - # prompt tokens": {
-            "description": "min=755.418, mean=755.418, max=755.418, sum=1510.837 (2)",
-            "tab": "General information",
-            "score": 755.418439716312
-          },
-          "Professional Accounting - # output tokens": {
-            "description": "min=0, mean=0, max=0, sum=0 (2)",
-            "tab": "General information",
-            "score": 0.0
-          },
-          "Professional Law - # eval": {
-            "description": "min=1534, mean=1534, max=1534, sum=3068 (2)",
-            "tab": "General information",
-            "score": 1534.0
-          },
-          "Professional Law - # train": {
-            "description": "min=5, mean=5, max=5, sum=10 (2)",
-            "tab": "General information",
-            "score": 5.0
-          },
-          "Professional Law - truncated": {
-            "description": "min=0, mean=0, max=0, sum=0 (2)",
-            "tab": "General information",
-            "score": 0.0
-          },
-          "Professional Law - # prompt tokens": {
-            "description": "min=1685.119, mean=1685.119, max=1685.119, sum=3370.239 (2)",
-            "tab": "General information",
-            "score": 1685.119295958279
-          },
-          "Professional Law - # output tokens": {
-            "description": "min=0, mean=0, max=0, sum=0 (2)",
-            "tab": "General information",
-            "score": 0.0
-          },
-          "Professional Psychology - # eval": {
-            "description": "min=612, mean=612, max=612, sum=1224 (2)",
-            "tab": "General information",
-            "score": 612.0
-          },
-          "Professional Psychology - # train": {
-            "description": "min=5, mean=5, max=5, sum=10 (2)",
-            "tab": "General information",
-            "score": 5.0
-          },
-          "Professional Psychology - truncated": {
-            "description": "min=0, mean=0, max=0, sum=0 (2)",
-            "tab": "General information",
-            "score": 0.0
-          },
-          "Professional Psychology - # prompt tokens": {
-            "description": "min=594.363, mean=594.363, max=594.363, sum=1188.725 (2)",
-            "tab": "General information",
-            "score": 594.3627450980392
-          },
-          "Professional Psychology - # output tokens": {
-            "description": "min=0, mean=0, max=0, sum=0 (2)",
"tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" - } - } - }, - { - "evaluation_name": "Us Foreign Policy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Us Foreign Policy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.94, - "details": { - "description": "min=0.94, mean=0.94, max=0.94, sum=1.88 (2)", - "tab": "Accuracy", - "Us Foreign Policy - Observed inference time (s)": { - "description": "min=0.969, mean=0.969, max=0.969, sum=1.938 (2)", - "tab": "Efficiency", - "score": 0.968876302242279 - }, - "Us Foreign Policy - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Us Foreign Policy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Us Foreign Policy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Us Foreign Policy - # prompt tokens": { - "description": "min=438.2, mean=438.2, max=438.2, sum=876.4 (2)", - "tab": "General information", - "score": 438.2 - }, - "Us Foreign Policy - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" - } - } - }, - { - "evaluation_name": "Astronomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Astronomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.914, - "details": { - "description": "min=0.914, mean=0.914, max=0.914, sum=1.829 (2)", - "tab": "Accuracy", - "Astronomy - Observed inference time (s)": { - "description": "min=0.92, mean=0.92, max=0.92, sum=1.84 (2)", - "tab": "Efficiency", - "score": 0.9198912256642392 - }, - "Astronomy - # eval": { - "description": "min=152, mean=152, max=152, sum=304 (2)", - "tab": "General information", - "score": 152.0 - }, - "Astronomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Astronomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Astronomy - # prompt tokens": { - "description": "min=602.421, mean=602.421, max=602.421, sum=1204.842 (2)", - "tab": "General information", - "score": 602.421052631579 - }, - "Astronomy - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": 
"test", - "groups": "mmlu_astronomy" - } - } - }, - { - "evaluation_name": "Business Ethics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Business Ethics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8, - "details": { - "description": "min=0.8, mean=0.8, max=0.8, sum=1.6 (2)", - "tab": "Accuracy", - "Business Ethics - Observed inference time (s)": { - "description": "min=1.009, mean=1.009, max=1.009, sum=2.019 (2)", - "tab": "Efficiency", - "score": 1.0093300080299377 - }, - "Business Ethics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Business Ethics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Business Ethics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Business Ethics - # prompt tokens": { - "description": "min=552.87, mean=552.87, max=552.87, sum=1105.74 (2)", - "tab": "General information", - "score": 552.87 - }, - "Business Ethics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" - } - } - }, - { - "evaluation_name": "Clinical Knowledge", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Clinical Knowledge", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.868, - "details": { - "description": "min=0.868, mean=0.868, max=0.868, sum=1.736 (2)", - "tab": "Accuracy", - "Clinical Knowledge - Observed inference time (s)": { - "description": "min=1.079, mean=1.079, max=1.079, sum=2.157 (2)", - "tab": "Efficiency", - "score": 1.0787266893206902 - }, - "Clinical Knowledge - # eval": { - "description": "min=265, mean=265, max=265, sum=530 (2)", - "tab": "General information", - "score": 265.0 - }, - "Clinical Knowledge - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Clinical Knowledge - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Clinical Knowledge - # prompt tokens": { - "description": "min=402.592, mean=402.592, max=402.592, sum=805.185 (2)", - "tab": "General information", - "score": 402.5924528301887 - }, - "Clinical Knowledge - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" - } - } - }, - { - "evaluation_name": "Conceptual Physics", - "source_data": { - "dataset_name": 
"helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Conceptual Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.915, - "details": { - "description": "min=0.915, mean=0.915, max=0.915, sum=1.83 (2)", - "tab": "Accuracy", - "Conceptual Physics - Observed inference time (s)": { - "description": "min=0.962, mean=0.962, max=0.962, sum=1.925 (2)", - "tab": "Efficiency", - "score": 0.9624196154005984 - }, - "Conceptual Physics - # eval": { - "description": "min=235, mean=235, max=235, sum=470 (2)", - "tab": "General information", - "score": 235.0 - }, - "Conceptual Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Conceptual Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Conceptual Physics - # prompt tokens": { - "description": "min=309.213, mean=309.213, max=309.213, sum=618.426 (2)", - "tab": "General information", - "score": 309.21276595744683 - }, - "Conceptual Physics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" - } - } - }, - { - "evaluation_name": "Electrical Engineering", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Electrical Engineering", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.772, - "details": { - "description": "min=0.772, mean=0.772, max=0.772, sum=1.545 (2)", - "tab": "Accuracy", - "Electrical Engineering - Observed inference time (s)": { - "description": "min=1.272, mean=1.272, max=1.272, sum=2.544 (2)", - "tab": "Efficiency", - "score": 1.271799375270975 - }, - "Electrical Engineering - # eval": { - "description": "min=145, mean=145, max=145, sum=290 (2)", - "tab": "General information", - "score": 145.0 - }, - "Electrical Engineering - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Electrical Engineering - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Electrical Engineering - # prompt tokens": { - "description": "min=474.786, mean=474.786, max=474.786, sum=949.572 (2)", - "tab": "General information", - "score": 474.78620689655173 - }, - "Electrical Engineering - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" - } - } - }, - { - "evaluation_name": "Elementary Mathematics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - 
"url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Elementary Mathematics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.884, - "details": { - "description": "min=0.884, mean=0.884, max=0.884, sum=1.767 (2)", - "tab": "Accuracy", - "Elementary Mathematics - Observed inference time (s)": { - "description": "min=1.052, mean=1.052, max=1.052, sum=2.104 (2)", - "tab": "Efficiency", - "score": 1.0518414406549363 - }, - "Elementary Mathematics - # eval": { - "description": "min=378, mean=378, max=378, sum=756 (2)", - "tab": "General information", - "score": 378.0 - }, - "Elementary Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Elementary Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Elementary Mathematics - # prompt tokens": { - "description": "min=597.341, mean=597.341, max=597.341, sum=1194.683 (2)", - "tab": "General information", - "score": 597.3412698412699 - }, - "Elementary Mathematics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" - } - } - }, - { - "evaluation_name": "Formal Logic", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Formal Logic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.643, - "details": { - "description": "min=0.643, mean=0.643, max=0.643, sum=1.286 (2)", - "tab": "Accuracy", - "Formal Logic - Observed inference time (s)": { - "description": "min=1.075, mean=1.075, max=1.075, sum=2.151 (2)", - "tab": "Efficiency", - "score": 1.0754183095598977 - }, - "Formal Logic - # eval": { - "description": "min=126, mean=126, max=126, sum=252 (2)", - "tab": "General information", - "score": 126.0 - }, - "Formal Logic - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Formal Logic - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Formal Logic - # prompt tokens": { - "description": "min=619.563, mean=619.563, max=619.563, sum=1239.127 (2)", - "tab": "General information", - "score": 619.563492063492 - }, - "Formal Logic - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" - } - } - }, - { - "evaluation_name": "High School World History", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on High School World History", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.924, - "details": { - "description": "min=0.924, mean=0.924, max=0.924, sum=1.848 (2)", - "tab": "Accuracy", - "High School Biology - Observed inference time (s)": { - "description": "min=1.11, mean=1.11, max=1.11, sum=2.22 (2)", - "tab": "Efficiency", - "score": 1.1099017789286951 - }, - "High School Chemistry - Observed inference time (s)": { - "description": "min=1.021, mean=1.021, max=1.021, sum=2.041 (2)", - "tab": "Efficiency", - "score": 1.0206051636211977 - }, - "High School Computer Science - Observed inference time (s)": { - "description": "min=1.112, mean=1.112, max=1.112, sum=2.224 (2)", - "tab": "Efficiency", - "score": 1.1118335294723511 - }, - "High School European History - Observed inference time (s)": { - "description": "min=1.402, mean=1.402, max=1.402, sum=2.803 (2)", - "tab": "Efficiency", - "score": 1.4017024777152323 - }, - "High School Geography - Observed inference time (s)": { - "description": "min=0.959, mean=0.959, max=0.959, sum=1.918 (2)", - "tab": "Efficiency", - "score": 0.9591333119556157 - }, - "High School Government And Politics - Observed inference time (s)": { - "description": "min=1.224, mean=1.224, max=1.224, sum=2.448 (2)", - "tab": "Efficiency", - "score": 1.2240539535957298 - }, - "High School Macroeconomics - Observed inference time (s)": { - "description": "min=1.052, mean=1.052, max=1.052, sum=2.105 (2)", - "tab": "Efficiency", - "score": 1.052347583648486 - }, - "High School Mathematics - Observed inference time (s)": { - "description": "min=1.167, mean=1.167, max=1.167, sum=2.335 (2)", - "tab": "Efficiency", - "score": 1.167454132327327 - }, - "High School Microeconomics - Observed inference time (s)": { - "description": "min=0.992, mean=0.992, max=0.992, sum=1.984 (2)", - "tab": "Efficiency", - "score": 0.991771269245308 - }, - "High School Physics - Observed inference time (s)": { - "description": "min=1.275, mean=1.275, max=1.275, sum=2.549 (2)", - "tab": "Efficiency", - "score": 1.2746097031018593 - }, - "High School Psychology - Observed inference time (s)": { - "description": "min=1.143, mean=1.143, max=1.143, sum=2.286 (2)", - "tab": "Efficiency", - "score": 1.1432113459005075 - }, - "High School Statistics - Observed inference time (s)": { - "description": "min=1.417, mean=1.417, max=1.417, sum=2.834 (2)", - "tab": "Efficiency", - "score": 1.417081825159214 - }, - "High School US History - Observed inference time (s)": { - "description": "min=1.309, mean=1.309, max=1.309, sum=2.618 (2)", - "tab": "Efficiency", - "score": 1.3091707919158189 - }, - "High School World History - Observed inference time (s)": { - "description": "min=1.249, mean=1.249, max=1.249, sum=2.498 (2)", - "tab": "Efficiency", - "score": 1.2489153383150382 - }, - "High School Biology - # eval": { - "description": "min=310, mean=310, max=310, sum=620 (2)", - "tab": "General information", - "score": 310.0 - }, - "High School Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Biology - # prompt 
tokens": { - "description": "min=500.958, mean=500.958, max=500.958, sum=1001.916 (2)", - "tab": "General information", - "score": 500.958064516129 - }, - "High School Biology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # eval": { - "description": "min=203, mean=203, max=203, sum=406 (2)", - "tab": "General information", - "score": 203.0 - }, - "High School Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # prompt tokens": { - "description": "min=513.064, mean=513.064, max=513.064, sum=1026.128 (2)", - "tab": "General information", - "score": 513.064039408867 - }, - "High School Chemistry - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "High School Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # prompt tokens": { - "description": "min=935.13, mean=935.13, max=935.13, sum=1870.26 (2)", - "tab": "General information", - "score": 935.13 - }, - "High School Computer Science - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School European History - # eval": { - "description": "min=165, mean=165, max=165, sum=330 (2)", - "tab": "General information", - "score": 165.0 - }, - "High School European History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School European History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School European History - # prompt tokens": { - "description": "min=2797.424, mean=2797.424, max=2797.424, sum=5594.848 (2)", - "tab": "General information", - "score": 2797.4242424242425 - }, - "High School European History - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Geography - # eval": { - "description": "min=198, mean=198, max=198, sum=396 (2)", - "tab": "General information", - "score": 198.0 - }, - "High School Geography - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Geography - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Geography - # prompt tokens": { - "description": "min=394.773, mean=394.773, max=394.773, sum=789.545 (2)", - "tab": "General information", - "score": 394.77272727272725 - }, - "High School Geography - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # eval": { - 
"description": "min=193, mean=193, max=193, sum=386 (2)", - "tab": "General information", - "score": 193.0 - }, - "High School Government And Politics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Government And Politics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # prompt tokens": { - "description": "min=479.301, mean=479.301, max=479.301, sum=958.601 (2)", - "tab": "General information", - "score": 479.30051813471505 - }, - "High School Government And Politics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - # eval": { - "description": "min=390, mean=390, max=390, sum=780 (2)", - "tab": "General information", - "score": 390.0 - }, - "High School Macroeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Macroeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - # prompt tokens": { - "description": "min=396.541, mean=396.541, max=396.541, sum=793.082 (2)", - "tab": "General information", - "score": 396.54102564102567 - }, - "High School Macroeconomics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # eval": { - "description": "min=270, mean=270, max=270, sum=540 (2)", - "tab": "General information", - "score": 270.0 - }, - "High School Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # prompt tokens": { - "description": "min=566.822, mean=566.822, max=566.822, sum=1133.644 (2)", - "tab": "General information", - "score": 566.8222222222222 - }, - "High School Mathematics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Microeconomics - # eval": { - "description": "min=238, mean=238, max=238, sum=476 (2)", - "tab": "General information", - "score": 238.0 - }, - "High School Microeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Microeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Microeconomics - # prompt tokens": { - "description": "min=415.954, mean=415.954, max=415.954, sum=831.908 (2)", - "tab": "General information", - "score": 415.953781512605 - }, - "High School Microeconomics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # eval": { - "description": "min=151, mean=151, max=151, sum=302 (2)", - "tab": "General information", - "score": 151.0 - }, - "High School Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Physics - truncated": { - "description": 
"min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # prompt tokens": { - "description": "min=591.715, mean=591.715, max=591.715, sum=1183.43 (2)", - "tab": "General information", - "score": 591.7152317880794 - }, - "High School Physics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # eval": { - "description": "min=545, mean=545, max=545, sum=1090 (2)", - "tab": "General information", - "score": 545.0 - }, - "High School Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # prompt tokens": { - "description": "min=502.604, mean=502.604, max=502.604, sum=1005.207 (2)", - "tab": "General information", - "score": 502.60366972477067 - }, - "High School Psychology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Statistics - # eval": { - "description": "min=216, mean=216, max=216, sum=432 (2)", - "tab": "General information", - "score": 216.0 - }, - "High School Statistics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Statistics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Statistics - # prompt tokens": { - "description": "min=858.931, mean=858.931, max=858.931, sum=1717.861 (2)", - "tab": "General information", - "score": 858.9305555555555 - }, - "High School Statistics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # eval": { - "description": "min=204, mean=204, max=204, sum=408 (2)", - "tab": "General information", - "score": 204.0 - }, - "High School US History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School US History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # prompt tokens": { - "description": "min=2205.583, mean=2205.583, max=2205.583, sum=4411.167 (2)", - "tab": "General information", - "score": 2205.5833333333335 - }, - "High School US History - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School World History - # eval": { - "description": "min=237, mean=237, max=237, sum=474 (2)", - "tab": "General information", - "score": 237.0 - }, - "High School World History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School World History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School World History - # prompt tokens": { - "description": "min=1426.544, mean=1426.544, max=1426.544, sum=2853.089 (2)", - "tab": "General information", - "score": 1426.5443037974683 - }, - "High School World History - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": 
"General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" - } - } - }, - { - "evaluation_name": "Human Sexuality", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Human Sexuality", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.397, - "details": { - "description": "min=0.397, mean=0.397, max=0.397, sum=0.794 (2)", - "tab": "Accuracy", - "Human Aging - Observed inference time (s)": { - "description": "min=1.295, mean=1.295, max=1.295, sum=2.59 (2)", - "tab": "Efficiency", - "score": 1.2951436652196362 - }, - "Human Sexuality - Observed inference time (s)": { - "description": "min=1.699, mean=1.699, max=1.699, sum=3.399 (2)", - "tab": "Efficiency", - "score": 1.6993297884019756 - }, - "Human Aging - # eval": { - "description": "min=223, mean=223, max=223, sum=446 (2)", - "tab": "General information", - "score": 223.0 - }, - "Human Aging - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Aging - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Aging - # prompt tokens": { - "description": "min=321.587, mean=321.587, max=321.587, sum=643.175 (2)", - "tab": "General information", - "score": 321.58744394618833 - }, - "Human Aging - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # eval": { - "description": "min=131, mean=131, max=131, sum=262 (2)", - "tab": "General information", - "score": 131.0 - }, - "Human Sexuality - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Sexuality - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # prompt tokens": { - "description": "min=347.183, mean=347.183, max=347.183, sum=694.366 (2)", - "tab": "General information", - "score": 347.1832061068702 - }, - "Human Sexuality - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" - } - } - }, - { - "evaluation_name": "International Law", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on International Law", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.917, - "details": { - "description": "min=0.917, mean=0.917, max=0.917, sum=1.835 (2)", - "tab": "Accuracy", - "International Law - Observed inference time (s)": { - "description": "min=1.151, mean=1.151, 
max=1.151, sum=2.303 (2)", - "tab": "Efficiency", - "score": 1.1514279527112472 - }, - "International Law - # eval": { - "description": "min=121, mean=121, max=121, sum=242 (2)", - "tab": "General information", - "score": 121.0 - }, - "International Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "International Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "International Law - # prompt tokens": { - "description": "min=644.165, mean=644.165, max=644.165, sum=1288.331 (2)", - "tab": "General information", - "score": 644.1652892561983 - }, - "International Law - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" - } - } - }, - { - "evaluation_name": "Logical Fallacies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Logical Fallacies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.859, - "details": { - "description": "min=0.859, mean=0.859, max=0.859, sum=1.718 (2)", - "tab": "Accuracy", - "Logical Fallacies - Observed inference time (s)": { - "description": "min=1.422, mean=1.422, max=1.422, sum=2.844 (2)", - "tab": "Efficiency", - "score": 1.4221880026390217 - }, - "Logical Fallacies - # eval": { - "description": "min=163, mean=163, max=163, sum=326 (2)", - "tab": "General information", - "score": 163.0 - }, - "Logical Fallacies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Logical Fallacies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Logical Fallacies - # prompt tokens": { - "description": "min=450.049, mean=450.049, max=450.049, sum=900.098 (2)", - "tab": "General information", - "score": 450.0490797546012 - }, - "Logical Fallacies - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" - } - } - }, - { - "evaluation_name": "Machine Learning", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Machine Learning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.67, - "details": { - "description": "min=0.67, mean=0.67, max=0.67, sum=1.339 (2)", - "tab": "Accuracy", - "Machine Learning - Observed inference time (s)": { - "description": "min=1.005, mean=1.005, max=1.005, sum=2.011 (2)", - "tab": "Efficiency", - "score": 1.005433154957635 - }, - "Machine Learning - # 
eval": { - "description": "min=112, mean=112, max=112, sum=224 (2)", - "tab": "General information", - "score": 112.0 - }, - "Machine Learning - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Machine Learning - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Machine Learning - # prompt tokens": { - "description": "min=702.402, mean=702.402, max=702.402, sum=1404.804 (2)", - "tab": "General information", - "score": 702.4017857142857 - }, - "Machine Learning - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" - } - } - }, - { - "evaluation_name": "Management", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Management", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.874, - "details": { - "description": "min=0.874, mean=0.874, max=0.874, sum=1.748 (2)", - "tab": "Accuracy", - "Management - Observed inference time (s)": { - "description": "min=0.939, mean=0.939, max=0.939, sum=1.879 (2)", - "tab": "Efficiency", - "score": 0.9392627234597808 - }, - "Management - # eval": { - "description": "min=103, mean=103, max=103, sum=206 (2)", - "tab": "General information", - "score": 103.0 - }, - "Management - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Management - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Management - # prompt tokens": { - "description": "min=281.301, mean=281.301, max=281.301, sum=562.602 (2)", - "tab": "General information", - "score": 281.3009708737864 - }, - "Management - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" - } - } - }, - { - "evaluation_name": "Marketing", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Marketing", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.953, - "details": { - "description": "min=0.953, mean=0.953, max=0.953, sum=1.906 (2)", - "tab": "Accuracy", - "Marketing - Observed inference time (s)": { - "description": "min=1.261, mean=1.261, max=1.261, sum=2.523 (2)", - "tab": "Efficiency", - "score": 1.2613265443051982 - }, - "Marketing - # eval": { - "description": "min=234, mean=234, max=234, sum=468 (2)", - "tab": "General information", - "score": 234.0 - }, - "Marketing - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": 
"General information", - "score": 5.0 - }, - "Marketing - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Marketing - # prompt tokens": { - "description": "min=428.35, mean=428.35, max=428.35, sum=856.701 (2)", - "tab": "General information", - "score": 428.35042735042737 - }, - "Marketing - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" - } - } - }, - { - "evaluation_name": "Medical Genetics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Medical Genetics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.91, - "details": { - "description": "min=0.91, mean=0.91, max=0.91, sum=1.82 (2)", - "tab": "Accuracy", - "Medical Genetics - Observed inference time (s)": { - "description": "min=0.897, mean=0.897, max=0.897, sum=1.795 (2)", - "tab": "Efficiency", - "score": 0.8973554396629333 - }, - "Medical Genetics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Medical Genetics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Medical Genetics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Medical Genetics - # prompt tokens": { - "description": "min=338.89, mean=338.89, max=338.89, sum=677.78 (2)", - "tab": "General information", - "score": 338.89 - }, - "Medical Genetics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" - } - } - }, - { - "evaluation_name": "Miscellaneous", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Miscellaneous", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.928, - "details": { - "description": "min=0.928, mean=0.928, max=0.928, sum=1.857 (2)", - "tab": "Accuracy", - "Miscellaneous - Observed inference time (s)": { - "description": "min=1.136, mean=1.136, max=1.136, sum=2.272 (2)", - "tab": "Efficiency", - "score": 1.1357932166882707 - }, - "Miscellaneous - # eval": { - "description": "min=783, mean=783, max=783, sum=1566 (2)", - "tab": "General information", - "score": 783.0 - }, - "Miscellaneous - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Miscellaneous - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Miscellaneous 
- # prompt tokens": { - "description": "min=314.669, mean=314.669, max=314.669, sum=629.338 (2)", - "tab": "General information", - "score": 314.669220945083 - }, - "Miscellaneous - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" - } - } - }, - { - "evaluation_name": "Moral Scenarios", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Moral Scenarios", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.696, - "details": { - "description": "min=0.696, mean=0.696, max=0.696, sum=1.392 (2)", - "tab": "Accuracy", - "Moral Disputes - Observed inference time (s)": { - "description": "min=0.966, mean=0.966, max=0.966, sum=1.933 (2)", - "tab": "Efficiency", - "score": 0.9664077420165573 - }, - "Moral Scenarios - Observed inference time (s)": { - "description": "min=1.0, mean=1.0, max=1.0, sum=1.999 (2)", - "tab": "Efficiency", - "score": 0.9996972816196952 - }, - "Moral Disputes - # eval": { - "description": "min=346, mean=346, max=346, sum=692 (2)", - "tab": "General information", - "score": 346.0 - }, - "Moral Disputes - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Disputes - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Disputes - # prompt tokens": { - "description": "min=495.003, mean=495.003, max=495.003, sum=990.006 (2)", - "tab": "General information", - "score": 495.0028901734104 - }, - "Moral Disputes - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # eval": { - "description": "min=895, mean=895, max=895, sum=1790 (2)", - "tab": "General information", - "score": 895.0 - }, - "Moral Scenarios - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Scenarios - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # prompt tokens": { - "description": "min=690.542, mean=690.542, max=690.542, sum=1381.084 (2)", - "tab": "General information", - "score": 690.5418994413408 - }, - "Moral Scenarios - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" - } - } - }, - { - "evaluation_name": "Nutrition", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Nutrition", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.846, - "details": { - "description": "min=0.846, mean=0.846, max=0.846, sum=1.693 (2)", - "tab": "Accuracy", - "Nutrition - Observed inference time (s)": { - "description": "min=1.042, mean=1.042, max=1.042, sum=2.084 (2)", - "tab": "Efficiency", - "score": 1.04191489858565 - }, - "Nutrition - # eval": { - "description": "min=306, mean=306, max=306, sum=612 (2)", - "tab": "General information", - "score": 306.0 - }, - "Nutrition - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Nutrition - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Nutrition - # prompt tokens": { - "description": "min=585.48, mean=585.48, max=585.48, sum=1170.961 (2)", - "tab": "General information", - "score": 585.4803921568628 - }, - "Nutrition - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" - } - } - }, - { - "evaluation_name": "Prehistory", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Prehistory", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.886, - "details": { - "description": "min=0.886, mean=0.886, max=0.886, sum=1.772 (2)", - "tab": "Accuracy", - "Prehistory - Observed inference time (s)": { - "description": "min=0.888, mean=0.888, max=0.888, sum=1.775 (2)", - "tab": "Efficiency", - "score": 0.8876422820267854 - }, - "Prehistory - # eval": { - "description": "min=324, mean=324, max=324, sum=648 (2)", - "tab": "General information", - "score": 324.0 - }, - "Prehistory - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Prehistory - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Prehistory - # prompt tokens": { - "description": "min=540.198, mean=540.198, max=540.198, sum=1080.395 (2)", - "tab": "General information", - "score": 540.1975308641976 - }, - "Prehistory - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" - } - } - }, - { - "evaluation_name": "Public Relations", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Public Relations", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.755, - "details": { - "description": "min=0.755, mean=0.755, max=0.755, sum=1.509 (2)", - "tab": "Accuracy", - "Public Relations - Observed inference time (s)": { - "description": "min=0.992, mean=0.992, 
max=0.992, sum=1.984 (2)", - "tab": "Efficiency", - "score": 0.9922328862276945 - }, - "Public Relations - # eval": { - "description": "min=110, mean=110, max=110, sum=220 (2)", - "tab": "General information", - "score": 110.0 - }, - "Public Relations - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Public Relations - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Public Relations - # prompt tokens": { - "description": "min=426.655, mean=426.655, max=426.655, sum=853.309 (2)", - "tab": "General information", - "score": 426.6545454545454 - }, - "Public Relations - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" - } - } - }, - { - "evaluation_name": "Security Studies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Security Studies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.849, - "details": { - "description": "min=0.849, mean=0.849, max=0.849, sum=1.698 (2)", - "tab": "Accuracy", - "Security Studies - Observed inference time (s)": { - "description": "min=1.117, mean=1.117, max=1.117, sum=2.234 (2)", - "tab": "Efficiency", - "score": 1.116919010512683 - }, - "Security Studies - # eval": { - "description": "min=245, mean=245, max=245, sum=490 (2)", - "tab": "General information", - "score": 245.0 - }, - "Security Studies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Security Studies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Security Studies - # prompt tokens": { - "description": "min=1193.869, mean=1193.869, max=1193.869, sum=2387.739 (2)", - "tab": "General information", - "score": 1193.869387755102 - }, - "Security Studies - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" - } - } - }, - { - "evaluation_name": "Sociology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Sociology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.925, - "details": { - "description": "min=0.925, mean=0.925, max=0.925, sum=1.851 (2)", - "tab": "Accuracy", - "Sociology - Observed inference time (s)": { - "description": "min=1.296, mean=1.296, max=1.296, sum=2.592 (2)", - "tab": "Efficiency", - "score": 1.29619625195935 - }, - "Sociology - # eval": { - "description": "min=201, 
mean=201, max=201, sum=402 (2)", - "tab": "General information", - "score": 201.0 - }, - "Sociology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Sociology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Sociology - # prompt tokens": { - "description": "min=456.274, mean=456.274, max=456.274, sum=912.547 (2)", - "tab": "General information", - "score": 456.27363184079604 - }, - "Sociology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" - } - } - }, - { - "evaluation_name": "Virology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Virology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.584, - "details": { - "description": "min=0.584, mean=0.584, max=0.584, sum=1.169 (2)", - "tab": "Accuracy", - "Virology - Observed inference time (s)": { - "description": "min=0.877, mean=0.877, max=0.877, sum=1.754 (2)", - "tab": "Efficiency", - "score": 0.8771147684878614 - }, - "Virology - # eval": { - "description": "min=166, mean=166, max=166, sum=332 (2)", - "tab": "General information", - "score": 166.0 - }, - "Virology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Virology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Virology - # prompt tokens": { - "description": "min=336.753, mean=336.753, max=336.753, sum=673.506 (2)", - "tab": "General information", - "score": 336.7530120481928 - }, - "Virology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" - } - } - }, - { - "evaluation_name": "World Religions", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on World Religions", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.877, - "details": { - "description": "min=0.877, mean=0.877, max=0.877, sum=1.754 (2)", - "tab": "Accuracy", - "World Religions - Observed inference time (s)": { - "description": "min=1.225, mean=1.225, max=1.225, sum=2.451 (2)", - "tab": "Efficiency", - "score": 1.2254026856338769 - }, - "World Religions - # eval": { - "description": "min=171, mean=171, max=171, sum=342 (2)", - "tab": "General information", - "score": 171.0 - }, - "World Religions - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "World Religions - 
truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "World Religions - # prompt tokens": { - "description": "min=268.164, mean=268.164, max=268.164, sum=536.327 (2)", - "tab": "General information", - "score": 268.1637426900585 - }, - "World Religions - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" - } - } - }, - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.118, - "details": { - "tab": "Efficiency" - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_mmlu/google/gemini-2.0-flash-exp/b7ea6c93-af70-4c0f-ba50-03a539416a8b.json b/data/helm_mmlu/google/gemini-2.0-flash-exp/b7ea6c93-af70-4c0f-ba50-03a539416a8b.json deleted file mode 100644 index 6b53de064..000000000 --- a/data/helm_mmlu/google/gemini-2.0-flash-exp/b7ea6c93-af70-4c0f-ba50-03a539416a8b.json +++ /dev/null @@ -1,3021 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/google_gemini-2.0-flash-exp/1770835937.459157", - "retrieved_timestamp": "1770835937.459157", - "source_metadata": { - "source_name": "helm_mmlu", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gemini 2.0 Flash Experimental", - "id": "google/gemini-2.0-flash-exp", - "developer": "google", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "MMLU All Subjects", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU All Subjects", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.797, - "details": { - "description": "min=0.554, mean=0.797, max=0.969, sum=90.902 (114)", - "tab": "Accuracy", - "MMLU All Subjects - Observed inference time (s)": { - "description": "min=0.379, mean=0.422, max=0.926, sum=48.097 (114)", - "tab": "Efficiency", - "score": 0.4219020959728089 - }, - "MMLU All Subjects - # eval": { - "description": "min=100, mean=246.351, max=1534, sum=28084 (114)", - "tab": "General information", - "score": 246.35087719298247 - }, - "MMLU All Subjects - # train": { - "description": "min=5, mean=5, max=5, sum=570 (114)", - "tab": "General information", - "score": 5.0 - }, - "MMLU All Subjects - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (114)", - "tab": "General information", - "score": 0.0 - }, - "MMLU All Subjects - # prompt tokens": { - "description": "min=268.164, mean=632.617, max=2797.424, sum=72118.345 (114)", - "tab": "General information", 
- "score": 632.6170571214202 - }, - "MMLU All Subjects - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (114)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] - } - } - }, - { - "evaluation_name": "Abstract Algebra", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - 
"metric_config": { - "evaluation_description": "EM on Abstract Algebra", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.72, - "details": { - "description": "min=0.72, mean=0.72, max=0.72, sum=1.44 (2)", - "tab": "Accuracy", - "Abstract Algebra - Observed inference time (s)": { - "description": "min=0.408, mean=0.408, max=0.408, sum=0.816 (2)", - "tab": "Efficiency", - "score": 0.4077691292762756 - }, - "Abstract Algebra - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Abstract Algebra - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Abstract Algebra - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Abstract Algebra - # prompt tokens": { - "description": "min=383.97, mean=383.97, max=383.97, sum=767.94 (2)", - "tab": "General information", - "score": 383.97 - }, - "Abstract Algebra - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" - } - } - }, - { - "evaluation_name": "Anatomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Anatomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.807, - "details": { - "description": "min=0.807, mean=0.807, max=0.807, sum=1.615 (2)", - "tab": "Accuracy", - "Anatomy - Observed inference time (s)": { - "description": "min=0.926, mean=0.926, max=0.926, sum=1.852 (2)", - "tab": "Efficiency", - "score": 0.9258230227011222 - }, - "Anatomy - # eval": { - "description": "min=135, mean=135, max=135, sum=270 (2)", - "tab": "General information", - "score": 135.0 - }, - "Anatomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Anatomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Anatomy - # prompt tokens": { - "description": "min=344.356, mean=344.356, max=344.356, sum=688.711 (2)", - "tab": "General information", - "score": 344.35555555555555 - }, - "Anatomy - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" - } - } - }, - { - "evaluation_name": "College Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on College Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.696, - "details": { - 
"description": "min=0.696, mean=0.696, max=0.696, sum=1.392 (2)", - "tab": "Accuracy", - "College Chemistry - Observed inference time (s)": { - "description": "min=0.405, mean=0.405, max=0.405, sum=0.809 (2)", - "tab": "Efficiency", - "score": 0.4045387363433838 - }, - "College Biology - Observed inference time (s)": { - "description": "min=0.47, mean=0.47, max=0.47, sum=0.941 (2)", - "tab": "Efficiency", - "score": 0.4703653355439504 - }, - "College Computer Science - Observed inference time (s)": { - "description": "min=0.436, mean=0.436, max=0.436, sum=0.872 (2)", - "tab": "Efficiency", - "score": 0.4358289122581482 - }, - "College Mathematics - Observed inference time (s)": { - "description": "min=0.413, mean=0.413, max=0.413, sum=0.827 (2)", - "tab": "Efficiency", - "score": 0.413386971950531 - }, - "College Medicine - Observed inference time (s)": { - "description": "min=0.426, mean=0.426, max=0.426, sum=0.852 (2)", - "tab": "Efficiency", - "score": 0.4259330606184943 - }, - "College Physics - Observed inference time (s)": { - "description": "min=0.456, mean=0.456, max=0.456, sum=0.912 (2)", - "tab": "Efficiency", - "score": 0.4557511432498109 - }, - "College Chemistry - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Chemistry - # prompt tokens": { - "description": "min=570.02, mean=570.02, max=570.02, sum=1140.04 (2)", - "tab": "General information", - "score": 570.02 - }, - "College Chemistry - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # eval": { - "description": "min=144, mean=144, max=144, sum=288 (2)", - "tab": "General information", - "score": 144.0 - }, - "College Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # prompt tokens": { - "description": "min=482.799, mean=482.799, max=482.799, sum=965.597 (2)", - "tab": "General information", - "score": 482.7986111111111 - }, - "College Biology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer Science - # prompt tokens": { - "description": "min=857.86, mean=857.86, max=857.86, sum=1715.72 (2)", - "tab": "General information", - "score": 857.86 - }, - "College Computer Science - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - 
"tab": "General information", - "score": 100.0 - }, - "College Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # prompt tokens": { - "description": "min=626.69, mean=626.69, max=626.69, sum=1253.38 (2)", - "tab": "General information", - "score": 626.69 - }, - "College Mathematics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Medicine - # eval": { - "description": "min=173, mean=173, max=173, sum=346 (2)", - "tab": "General information", - "score": 173.0 - }, - "College Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Medicine - # prompt tokens": { - "description": "min=513.37, mean=513.37, max=513.37, sum=1026.74 (2)", - "tab": "General information", - "score": 513.3699421965318 - }, - "College Medicine - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Physics - # eval": { - "description": "min=102, mean=102, max=102, sum=204 (2)", - "tab": "General information", - "score": 102.0 - }, - "College Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Physics - # prompt tokens": { - "description": "min=507.471, mean=507.471, max=507.471, sum=1014.941 (2)", - "tab": "General information", - "score": 507.47058823529414 - }, - "College Physics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" - } - } - }, - { - "evaluation_name": "Computer Security", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Computer Security", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.83, - "details": { - "description": "min=0.83, mean=0.83, max=0.83, sum=1.66 (2)", - "tab": "Accuracy", - "Computer Security - Observed inference time (s)": { - "description": "min=0.407, mean=0.407, max=0.407, sum=0.813 (2)", - "tab": "Efficiency", - "score": 0.4065685248374939 - }, - "Computer Security - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Computer Security - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Computer Security - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - 
"Computer Security - # prompt tokens": { - "description": "min=380.91, mean=380.91, max=380.91, sum=761.82 (2)", - "tab": "General information", - "score": 380.91 - }, - "Computer Security - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" - } - } - }, - { - "evaluation_name": "Econometrics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Econometrics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.693, - "details": { - "description": "min=0.693, mean=0.693, max=0.693, sum=1.386 (2)", - "tab": "Accuracy", - "Econometrics - Observed inference time (s)": { - "description": "min=0.41, mean=0.41, max=0.41, sum=0.819 (2)", - "tab": "Efficiency", - "score": 0.4097107544279935 - }, - "Econometrics - # eval": { - "description": "min=114, mean=114, max=114, sum=228 (2)", - "tab": "General information", - "score": 114.0 - }, - "Econometrics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Econometrics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Econometrics - # prompt tokens": { - "description": "min=634.553, mean=634.553, max=634.553, sum=1269.105 (2)", - "tab": "General information", - "score": 634.5526315789474 - }, - "Econometrics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" - } - } - }, - { - "evaluation_name": "Global Facts", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Global Facts", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.66, - "details": { - "description": "min=0.66, mean=0.66, max=0.66, sum=1.32 (2)", - "tab": "Accuracy", - "Global Facts - Observed inference time (s)": { - "description": "min=0.415, mean=0.415, max=0.415, sum=0.83 (2)", - "tab": "Efficiency", - "score": 0.4148475766181946 - }, - "Global Facts - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Global Facts - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Global Facts - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Global Facts - # prompt tokens": { - "description": "min=456.54, mean=456.54, max=456.54, sum=913.08 (2)", - "tab": "General information", - "score": 456.54 - }, - "Global Facts - # output tokens": { - 
"description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" - } - } - }, - { - "evaluation_name": "Jurisprudence", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Jurisprudence", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.898, - "details": { - "description": "min=0.898, mean=0.898, max=0.898, sum=1.796 (2)", - "tab": "Accuracy", - "Jurisprudence - Observed inference time (s)": { - "description": "min=0.442, mean=0.442, max=0.442, sum=0.884 (2)", - "tab": "Efficiency", - "score": 0.4418119721942478 - }, - "Jurisprudence - # eval": { - "description": "min=108, mean=108, max=108, sum=216 (2)", - "tab": "General information", - "score": 108.0 - }, - "Jurisprudence - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Jurisprudence - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Jurisprudence - # prompt tokens": { - "description": "min=407.87, mean=407.87, max=407.87, sum=815.741 (2)", - "tab": "General information", - "score": 407.8703703703704 - }, - "Jurisprudence - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" - } - } - }, - { - "evaluation_name": "Philosophy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Philosophy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.887, - "details": { - "description": "min=0.887, mean=0.887, max=0.887, sum=1.775 (2)", - "tab": "Accuracy", - "Philosophy - Observed inference time (s)": { - "description": "min=0.409, mean=0.409, max=0.409, sum=0.817 (2)", - "tab": "Efficiency", - "score": 0.40853408831875426 - }, - "Philosophy - # eval": { - "description": "min=311, mean=311, max=311, sum=622 (2)", - "tab": "General information", - "score": 311.0 - }, - "Philosophy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Philosophy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Philosophy - # prompt tokens": { - "description": "min=340.907, mean=340.907, max=340.907, sum=681.814 (2)", - "tab": "General information", - "score": 340.90675241157555 - }, - "Philosophy - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "philosophy", - "method": 
"multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" - } - } - }, - { - "evaluation_name": "Professional Psychology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Professional Psychology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.876, - "details": { - "description": "min=0.876, mean=0.876, max=0.876, sum=1.752 (2)", - "tab": "Accuracy", - "Professional Medicine - Observed inference time (s)": { - "description": "min=0.467, mean=0.467, max=0.467, sum=0.934 (2)", - "tab": "Efficiency", - "score": 0.46713243337238536 - }, - "Professional Accounting - Observed inference time (s)": { - "description": "min=0.386, mean=0.386, max=0.386, sum=0.771 (2)", - "tab": "Efficiency", - "score": 0.38551004812227074 - }, - "Professional Law - Observed inference time (s)": { - "description": "min=0.429, mean=0.429, max=0.429, sum=0.859 (2)", - "tab": "Efficiency", - "score": 0.4294954424886691 - }, - "Professional Psychology - Observed inference time (s)": { - "description": "min=0.397, mean=0.397, max=0.397, sum=0.793 (2)", - "tab": "Efficiency", - "score": 0.39653347715053683 - }, - "Professional Medicine - # eval": { - "description": "min=272, mean=272, max=272, sum=544 (2)", - "tab": "General information", - "score": 272.0 - }, - "Professional Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Medicine - # prompt tokens": { - "description": "min=1113.092, mean=1113.092, max=1113.092, sum=2226.184 (2)", - "tab": "General information", - "score": 1113.0919117647059 - }, - "Professional Medicine - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Accounting - # eval": { - "description": "min=282, mean=282, max=282, sum=564 (2)", - "tab": "General information", - "score": 282.0 - }, - "Professional Accounting - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Accounting - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Accounting - # prompt tokens": { - "description": "min=755.418, mean=755.418, max=755.418, sum=1510.837 (2)", - "tab": "General information", - "score": 755.418439716312 - }, - "Professional Accounting - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # eval": { - "description": "min=1534, mean=1534, max=1534, sum=3068 (2)", - "tab": "General information", - "score": 1534.0 - }, - "Professional Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # prompt tokens": { - "description": "min=1685.119, mean=1685.119, max=1685.119, sum=3370.239 (2)", 
- "tab": "General information", - "score": 1685.119295958279 - }, - "Professional Law - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # eval": { - "description": "min=612, mean=612, max=612, sum=1224 (2)", - "tab": "General information", - "score": 612.0 - }, - "Professional Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # prompt tokens": { - "description": "min=594.363, mean=594.363, max=594.363, sum=1188.725 (2)", - "tab": "General information", - "score": 594.3627450980392 - }, - "Professional Psychology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" - } - } - }, - { - "evaluation_name": "Us Foreign Policy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Us Foreign Policy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.78, - "details": { - "description": "min=0.78, mean=0.78, max=0.78, sum=1.56 (2)", - "tab": "Accuracy", - "Us Foreign Policy - Observed inference time (s)": { - "description": "min=0.414, mean=0.414, max=0.414, sum=0.829 (2)", - "tab": "Efficiency", - "score": 0.4144425654411316 - }, - "Us Foreign Policy - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Us Foreign Policy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Us Foreign Policy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Us Foreign Policy - # prompt tokens": { - "description": "min=438.2, mean=438.2, max=438.2, sum=876.4 (2)", - "tab": "General information", - "score": 438.2 - }, - "Us Foreign Policy - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" - } - } - }, - { - "evaluation_name": "Astronomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Astronomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.928, - "details": { - "description": "min=0.928, mean=0.928, max=0.928, sum=1.855 (2)", - "tab": "Accuracy", - "Astronomy - Observed inference time (s)": { - "description": 
"min=0.432, mean=0.432, max=0.432, sum=0.864 (2)", - "tab": "Efficiency", - "score": 0.43207096739819173 - }, - "Astronomy - # eval": { - "description": "min=152, mean=152, max=152, sum=304 (2)", - "tab": "General information", - "score": 152.0 - }, - "Astronomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Astronomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Astronomy - # prompt tokens": { - "description": "min=602.421, mean=602.421, max=602.421, sum=1204.842 (2)", - "tab": "General information", - "score": 602.421052631579 - }, - "Astronomy - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" - } - } - }, - { - "evaluation_name": "Business Ethics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Business Ethics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.73, - "details": { - "description": "min=0.73, mean=0.73, max=0.73, sum=1.46 (2)", - "tab": "Accuracy", - "Business Ethics - Observed inference time (s)": { - "description": "min=0.441, mean=0.441, max=0.441, sum=0.883 (2)", - "tab": "Efficiency", - "score": 0.441267569065094 - }, - "Business Ethics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Business Ethics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Business Ethics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Business Ethics - # prompt tokens": { - "description": "min=552.87, mean=552.87, max=552.87, sum=1105.74 (2)", - "tab": "General information", - "score": 552.87 - }, - "Business Ethics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" - } - } - }, - { - "evaluation_name": "Clinical Knowledge", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Clinical Knowledge", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.879, - "details": { - "description": "min=0.879, mean=0.879, max=0.879, sum=1.758 (2)", - "tab": "Accuracy", - "Clinical Knowledge - Observed inference time (s)": { - "description": "min=0.439, mean=0.439, max=0.439, sum=0.878 (2)", - "tab": "Efficiency", - "score": 0.43878708245619286 - }, - "Clinical Knowledge - # eval": { - "description": "min=265, mean=265, max=265, sum=530 
(2)", - "tab": "General information", - "score": 265.0 - }, - "Clinical Knowledge - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Clinical Knowledge - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Clinical Knowledge - # prompt tokens": { - "description": "min=402.592, mean=402.592, max=402.592, sum=805.185 (2)", - "tab": "General information", - "score": 402.5924528301887 - }, - "Clinical Knowledge - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" - } - } - }, - { - "evaluation_name": "Conceptual Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Conceptual Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.813, - "details": { - "description": "min=0.813, mean=0.813, max=0.813, sum=1.626 (2)", - "tab": "Accuracy", - "Conceptual Physics - Observed inference time (s)": { - "description": "min=0.398, mean=0.398, max=0.398, sum=0.796 (2)", - "tab": "Efficiency", - "score": 0.3981509147806371 - }, - "Conceptual Physics - # eval": { - "description": "min=235, mean=235, max=235, sum=470 (2)", - "tab": "General information", - "score": 235.0 - }, - "Conceptual Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Conceptual Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Conceptual Physics - # prompt tokens": { - "description": "min=309.213, mean=309.213, max=309.213, sum=618.426 (2)", - "tab": "General information", - "score": 309.21276595744683 - }, - "Conceptual Physics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" - } - } - }, - { - "evaluation_name": "Electrical Engineering", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Electrical Engineering", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.834, - "details": { - "description": "min=0.834, mean=0.834, max=0.834, sum=1.669 (2)", - "tab": "Accuracy", - "Electrical Engineering - Observed inference time (s)": { - "description": "min=0.476, mean=0.476, max=0.476, sum=0.952 (2)", - "tab": "Efficiency", - "score": 0.47606519830637967 - }, - "Electrical Engineering - # eval": { - "description": "min=145, mean=145, max=145, sum=290 (2)", - "tab": "General information", - "score": 145.0 - }, - 
"Electrical Engineering - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Electrical Engineering - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Electrical Engineering - # prompt tokens": { - "description": "min=474.786, mean=474.786, max=474.786, sum=949.572 (2)", - "tab": "General information", - "score": 474.78620689655173 - }, - "Electrical Engineering - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" - } - } - }, - { - "evaluation_name": "Elementary Mathematics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Elementary Mathematics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.857, - "details": { - "description": "min=0.857, mean=0.857, max=0.857, sum=1.714 (2)", - "tab": "Accuracy", - "Elementary Mathematics - Observed inference time (s)": { - "description": "min=0.408, mean=0.408, max=0.408, sum=0.816 (2)", - "tab": "Efficiency", - "score": 0.4077642039647178 - }, - "Elementary Mathematics - # eval": { - "description": "min=378, mean=378, max=378, sum=756 (2)", - "tab": "General information", - "score": 378.0 - }, - "Elementary Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Elementary Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Elementary Mathematics - # prompt tokens": { - "description": "min=597.341, mean=597.341, max=597.341, sum=1194.683 (2)", - "tab": "General information", - "score": 597.3412698412699 - }, - "Elementary Mathematics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" - } - } - }, - { - "evaluation_name": "Formal Logic", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Formal Logic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.571, - "details": { - "description": "min=0.571, mean=0.571, max=0.571, sum=1.143 (2)", - "tab": "Accuracy", - "Formal Logic - Observed inference time (s)": { - "description": "min=0.402, mean=0.402, max=0.402, sum=0.804 (2)", - "tab": "Efficiency", - "score": 0.4018626610438029 - }, - "Formal Logic - # eval": { - "description": "min=126, mean=126, max=126, sum=252 (2)", - "tab": "General information", - "score": 126.0 - }, - "Formal Logic - # train": { - "description": 
"min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Formal Logic - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Formal Logic - # prompt tokens": { - "description": "min=619.563, mean=619.563, max=619.563, sum=1239.127 (2)", - "tab": "General information", - "score": 619.563492063492 - }, - "Formal Logic - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" - } - } - }, - { - "evaluation_name": "High School World History", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on High School World History", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.743, - "details": { - "description": "min=0.743, mean=0.743, max=0.743, sum=1.485 (2)", - "tab": "Accuracy", - "High School Biology - Observed inference time (s)": { - "description": "min=0.392, mean=0.392, max=0.392, sum=0.784 (2)", - "tab": "Efficiency", - "score": 0.39193403643946495 - }, - "High School Chemistry - Observed inference time (s)": { - "description": "min=0.391, mean=0.391, max=0.391, sum=0.783 (2)", - "tab": "Efficiency", - "score": 0.3914114583302014 - }, - "High School Computer Science - Observed inference time (s)": { - "description": "min=0.392, mean=0.392, max=0.392, sum=0.785 (2)", - "tab": "Efficiency", - "score": 0.3924300479888916 - }, - "High School European History - Observed inference time (s)": { - "description": "min=0.452, mean=0.452, max=0.452, sum=0.903 (2)", - "tab": "Efficiency", - "score": 0.451710438005852 - }, - "High School Geography - Observed inference time (s)": { - "description": "min=0.386, mean=0.386, max=0.386, sum=0.773 (2)", - "tab": "Efficiency", - "score": 0.3862521937399199 - }, - "High School Government And Politics - Observed inference time (s)": { - "description": "min=0.409, mean=0.409, max=0.409, sum=0.817 (2)", - "tab": "Efficiency", - "score": 0.40865302950607063 - }, - "High School Macroeconomics - Observed inference time (s)": { - "description": "min=0.385, mean=0.385, max=0.385, sum=0.771 (2)", - "tab": "Efficiency", - "score": 0.3853575364137307 - }, - "High School Mathematics - Observed inference time (s)": { - "description": "min=0.393, mean=0.393, max=0.393, sum=0.787 (2)", - "tab": "Efficiency", - "score": 0.39334204550142643 - }, - "High School Microeconomics - Observed inference time (s)": { - "description": "min=0.384, mean=0.384, max=0.384, sum=0.768 (2)", - "tab": "Efficiency", - "score": 0.38397373171413646 - }, - "High School Physics - Observed inference time (s)": { - "description": "min=0.412, mean=0.412, max=0.412, sum=0.823 (2)", - "tab": "Efficiency", - "score": 0.4116018955281239 - }, - "High School Psychology - Observed inference time (s)": { - "description": "min=0.393, mean=0.393, max=0.393, sum=0.786 (2)", - "tab": "Efficiency", - "score": 0.3931623751964044 - }, - "High School Statistics - Observed inference time (s)": { - "description": "min=0.449, mean=0.449, max=0.449, sum=0.898 (2)", - "tab": 
"Efficiency", - "score": 0.44901008628032824 - }, - "High School US History - Observed inference time (s)": { - "description": "min=0.468, mean=0.468, max=0.468, sum=0.935 (2)", - "tab": "Efficiency", - "score": 0.46768493044610115 - }, - "High School World History - Observed inference time (s)": { - "description": "min=0.452, mean=0.452, max=0.452, sum=0.903 (2)", - "tab": "Efficiency", - "score": 0.451718654310653 - }, - "High School Biology - # eval": { - "description": "min=310, mean=310, max=310, sum=620 (2)", - "tab": "General information", - "score": 310.0 - }, - "High School Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Biology - # prompt tokens": { - "description": "min=500.958, mean=500.958, max=500.958, sum=1001.916 (2)", - "tab": "General information", - "score": 500.958064516129 - }, - "High School Biology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # eval": { - "description": "min=203, mean=203, max=203, sum=406 (2)", - "tab": "General information", - "score": 203.0 - }, - "High School Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # prompt tokens": { - "description": "min=513.064, mean=513.064, max=513.064, sum=1026.128 (2)", - "tab": "General information", - "score": 513.064039408867 - }, - "High School Chemistry - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "High School Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # prompt tokens": { - "description": "min=935.13, mean=935.13, max=935.13, sum=1870.26 (2)", - "tab": "General information", - "score": 935.13 - }, - "High School Computer Science - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School European History - # eval": { - "description": "min=165, mean=165, max=165, sum=330 (2)", - "tab": "General information", - "score": 165.0 - }, - "High School European History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School European History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School European History - # prompt tokens": { - "description": "min=2797.424, mean=2797.424, max=2797.424, sum=5594.848 (2)", - "tab": "General information", - "score": 2797.4242424242425 - }, - "High School European History - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - 
"tab": "General information", - "score": 0.0 - }, - "High School Geography - # eval": { - "description": "min=198, mean=198, max=198, sum=396 (2)", - "tab": "General information", - "score": 198.0 - }, - "High School Geography - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Geography - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Geography - # prompt tokens": { - "description": "min=394.773, mean=394.773, max=394.773, sum=789.545 (2)", - "tab": "General information", - "score": 394.77272727272725 - }, - "High School Geography - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # eval": { - "description": "min=193, mean=193, max=193, sum=386 (2)", - "tab": "General information", - "score": 193.0 - }, - "High School Government And Politics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Government And Politics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # prompt tokens": { - "description": "min=479.301, mean=479.301, max=479.301, sum=958.601 (2)", - "tab": "General information", - "score": 479.30051813471505 - }, - "High School Government And Politics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - # eval": { - "description": "min=390, mean=390, max=390, sum=780 (2)", - "tab": "General information", - "score": 390.0 - }, - "High School Macroeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Macroeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - # prompt tokens": { - "description": "min=396.541, mean=396.541, max=396.541, sum=793.082 (2)", - "tab": "General information", - "score": 396.54102564102567 - }, - "High School Macroeconomics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # eval": { - "description": "min=270, mean=270, max=270, sum=540 (2)", - "tab": "General information", - "score": 270.0 - }, - "High School Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # prompt tokens": { - "description": "min=566.822, mean=566.822, max=566.822, sum=1133.644 (2)", - "tab": "General information", - "score": 566.8222222222222 - }, - "High School Mathematics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Microeconomics - # eval": { - "description": "min=238, mean=238, max=238, sum=476 (2)", - "tab": "General information", - "score": 238.0 - }, - "High School Microeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General 
information", - "score": 5.0 - }, - "High School Microeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Microeconomics - # prompt tokens": { - "description": "min=415.954, mean=415.954, max=415.954, sum=831.908 (2)", - "tab": "General information", - "score": 415.953781512605 - }, - "High School Microeconomics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # eval": { - "description": "min=151, mean=151, max=151, sum=302 (2)", - "tab": "General information", - "score": 151.0 - }, - "High School Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # prompt tokens": { - "description": "min=591.715, mean=591.715, max=591.715, sum=1183.43 (2)", - "tab": "General information", - "score": 591.7152317880794 - }, - "High School Physics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # eval": { - "description": "min=545, mean=545, max=545, sum=1090 (2)", - "tab": "General information", - "score": 545.0 - }, - "High School Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # prompt tokens": { - "description": "min=502.604, mean=502.604, max=502.604, sum=1005.207 (2)", - "tab": "General information", - "score": 502.60366972477067 - }, - "High School Psychology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Statistics - # eval": { - "description": "min=216, mean=216, max=216, sum=432 (2)", - "tab": "General information", - "score": 216.0 - }, - "High School Statistics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Statistics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Statistics - # prompt tokens": { - "description": "min=858.931, mean=858.931, max=858.931, sum=1717.861 (2)", - "tab": "General information", - "score": 858.9305555555555 - }, - "High School Statistics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # eval": { - "description": "min=204, mean=204, max=204, sum=408 (2)", - "tab": "General information", - "score": 204.0 - }, - "High School US History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School US History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # prompt tokens": { - "description": "min=2205.583, mean=2205.583, max=2205.583, sum=4411.167 (2)", - "tab": "General information", - "score": 2205.5833333333335 - }, - "High School US History - # 
output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School World History - # eval": { - "description": "min=237, mean=237, max=237, sum=474 (2)", - "tab": "General information", - "score": 237.0 - }, - "High School World History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School World History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School World History - # prompt tokens": { - "description": "min=1426.544, mean=1426.544, max=1426.544, sum=2853.089 (2)", - "tab": "General information", - "score": 1426.5443037974683 - }, - "High School World History - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" - } - } - }, - { - "evaluation_name": "Human Sexuality", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Human Sexuality", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.901, - "details": { - "description": "min=0.901, mean=0.901, max=0.901, sum=1.802 (2)", - "tab": "Accuracy", - "Human Aging - Observed inference time (s)": { - "description": "min=0.4, mean=0.4, max=0.4, sum=0.8 (2)", - "tab": "Efficiency", - "score": 0.3999073441253115 - }, - "Human Sexuality - Observed inference time (s)": { - "description": "min=0.42, mean=0.42, max=0.42, sum=0.841 (2)", - "tab": "Efficiency", - "score": 0.4203109868610178 - }, - "Human Aging - # eval": { - "description": "min=223, mean=223, max=223, sum=446 (2)", - "tab": "General information", - "score": 223.0 - }, - "Human Aging - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Aging - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Aging - # prompt tokens": { - "description": "min=321.587, mean=321.587, max=321.587, sum=643.175 (2)", - "tab": "General information", - "score": 321.58744394618833 - }, - "Human Aging - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # eval": { - "description": "min=131, mean=131, max=131, sum=262 (2)", - "tab": "General information", - "score": 131.0 - }, - "Human Sexuality - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Sexuality - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # prompt tokens": { - "description": "min=347.183, mean=347.183, max=347.183, sum=694.366 (2)", - "tab": "General information", - "score": 347.1832061068702 - }, - "Human Sexuality - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - 
}, - "generation_config": { - "additional_details": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" - } - } - }, - { - "evaluation_name": "International Law", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on International Law", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.645, - "details": { - "description": "min=0.645, mean=0.645, max=0.645, sum=1.289 (2)", - "tab": "Accuracy", - "International Law - Observed inference time (s)": { - "description": "min=0.457, mean=0.457, max=0.457, sum=0.913 (2)", - "tab": "Efficiency", - "score": 0.45661053972795973 - }, - "International Law - # eval": { - "description": "min=121, mean=121, max=121, sum=242 (2)", - "tab": "General information", - "score": 121.0 - }, - "International Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "International Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "International Law - # prompt tokens": { - "description": "min=644.165, mean=644.165, max=644.165, sum=1288.331 (2)", - "tab": "General information", - "score": 644.1652892561983 - }, - "International Law - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" - } - } - }, - { - "evaluation_name": "Logical Fallacies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Logical Fallacies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.914, - "details": { - "description": "min=0.914, mean=0.914, max=0.914, sum=1.828 (2)", - "tab": "Accuracy", - "Logical Fallacies - Observed inference time (s)": { - "description": "min=0.411, mean=0.411, max=0.411, sum=0.823 (2)", - "tab": "Efficiency", - "score": 0.4113436125538832 - }, - "Logical Fallacies - # eval": { - "description": "min=163, mean=163, max=163, sum=326 (2)", - "tab": "General information", - "score": 163.0 - }, - "Logical Fallacies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Logical Fallacies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Logical Fallacies - # prompt tokens": { - "description": "min=450.049, mean=450.049, max=450.049, sum=900.098 (2)", - "tab": "General information", - "score": 450.0490797546012 - }, - "Logical Fallacies - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "logical_fallacies", - "method": 
"multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" - } - } - }, - { - "evaluation_name": "Machine Learning", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Machine Learning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.759, - "details": { - "description": "min=0.759, mean=0.759, max=0.759, sum=1.518 (2)", - "tab": "Accuracy", - "Machine Learning - Observed inference time (s)": { - "description": "min=0.417, mean=0.417, max=0.417, sum=0.833 (2)", - "tab": "Efficiency", - "score": 0.4165512855563845 - }, - "Machine Learning - # eval": { - "description": "min=112, mean=112, max=112, sum=224 (2)", - "tab": "General information", - "score": 112.0 - }, - "Machine Learning - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Machine Learning - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Machine Learning - # prompt tokens": { - "description": "min=702.402, mean=702.402, max=702.402, sum=1404.804 (2)", - "tab": "General information", - "score": 702.4017857142857 - }, - "Machine Learning - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" - } - } - }, - { - "evaluation_name": "Management", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Management", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.718, - "details": { - "description": "min=0.718, mean=0.718, max=0.718, sum=1.437 (2)", - "tab": "Accuracy", - "Management - Observed inference time (s)": { - "description": "min=0.401, mean=0.401, max=0.401, sum=0.803 (2)", - "tab": "Efficiency", - "score": 0.4013508292077814 - }, - "Management - # eval": { - "description": "min=103, mean=103, max=103, sum=206 (2)", - "tab": "General information", - "score": 103.0 - }, - "Management - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Management - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Management - # prompt tokens": { - "description": "min=281.301, mean=281.301, max=281.301, sum=562.602 (2)", - "tab": "General information", - "score": 281.3009708737864 - }, - "Management - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" - } - } - }, - { - "evaluation_name": "Marketing", - "source_data": { - "dataset_name": 
"helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Marketing", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.944, - "details": { - "description": "min=0.944, mean=0.944, max=0.944, sum=1.889 (2)", - "tab": "Accuracy", - "Marketing - Observed inference time (s)": { - "description": "min=0.401, mean=0.401, max=0.401, sum=0.801 (2)", - "tab": "Efficiency", - "score": 0.4005699891310472 - }, - "Marketing - # eval": { - "description": "min=234, mean=234, max=234, sum=468 (2)", - "tab": "General information", - "score": 234.0 - }, - "Marketing - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Marketing - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Marketing - # prompt tokens": { - "description": "min=428.35, mean=428.35, max=428.35, sum=856.701 (2)", - "tab": "General information", - "score": 428.35042735042737 - }, - "Marketing - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" - } - } - }, - { - "evaluation_name": "Medical Genetics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Medical Genetics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.89, - "details": { - "description": "min=0.89, mean=0.89, max=0.89, sum=1.78 (2)", - "tab": "Accuracy", - "Medical Genetics - Observed inference time (s)": { - "description": "min=0.387, mean=0.387, max=0.387, sum=0.773 (2)", - "tab": "Efficiency", - "score": 0.38653050899505614 - }, - "Medical Genetics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Medical Genetics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Medical Genetics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Medical Genetics - # prompt tokens": { - "description": "min=338.89, mean=338.89, max=338.89, sum=677.78 (2)", - "tab": "General information", - "score": 338.89 - }, - "Medical Genetics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" - } - } - }, - { - "evaluation_name": "Miscellaneous", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - 
"evaluation_description": "EM on Miscellaneous", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.939, - "details": { - "description": "min=0.939, mean=0.939, max=0.939, sum=1.877 (2)", - "tab": "Accuracy", - "Miscellaneous - Observed inference time (s)": { - "description": "min=0.386, mean=0.386, max=0.386, sum=0.772 (2)", - "tab": "Efficiency", - "score": 0.3861832460376647 - }, - "Miscellaneous - # eval": { - "description": "min=783, mean=783, max=783, sum=1566 (2)", - "tab": "General information", - "score": 783.0 - }, - "Miscellaneous - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Miscellaneous - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Miscellaneous - # prompt tokens": { - "description": "min=314.669, mean=314.669, max=314.669, sum=629.338 (2)", - "tab": "General information", - "score": 314.669220945083 - }, - "Miscellaneous - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" - } - } - }, - { - "evaluation_name": "Moral Scenarios", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Moral Scenarios", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.815, - "details": { - "description": "min=0.815, mean=0.815, max=0.815, sum=1.629 (2)", - "tab": "Accuracy", - "Moral Disputes - Observed inference time (s)": { - "description": "min=0.384, mean=0.384, max=0.384, sum=0.768 (2)", - "tab": "Efficiency", - "score": 0.3839988109004291 - }, - "Moral Scenarios - Observed inference time (s)": { - "description": "min=0.405, mean=0.405, max=0.405, sum=0.81 (2)", - "tab": "Efficiency", - "score": 0.4048716662316349 - }, - "Moral Disputes - # eval": { - "description": "min=346, mean=346, max=346, sum=692 (2)", - "tab": "General information", - "score": 346.0 - }, - "Moral Disputes - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Disputes - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Disputes - # prompt tokens": { - "description": "min=495.003, mean=495.003, max=495.003, sum=990.006 (2)", - "tab": "General information", - "score": 495.0028901734104 - }, - "Moral Disputes - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # eval": { - "description": "min=895, mean=895, max=895, sum=1790 (2)", - "tab": "General information", - "score": 895.0 - }, - "Moral Scenarios - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Scenarios - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # prompt tokens": { - 
"description": "min=690.542, mean=690.542, max=690.542, sum=1381.084 (2)", - "tab": "General information", - "score": 690.5418994413408 - }, - "Moral Scenarios - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" - } - } - }, - { - "evaluation_name": "Nutrition", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Nutrition", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.856, - "details": { - "description": "min=0.856, mean=0.856, max=0.856, sum=1.712 (2)", - "tab": "Accuracy", - "Nutrition - Observed inference time (s)": { - "description": "min=0.397, mean=0.397, max=0.397, sum=0.794 (2)", - "tab": "Efficiency", - "score": 0.39706431027331385 - }, - "Nutrition - # eval": { - "description": "min=306, mean=306, max=306, sum=612 (2)", - "tab": "General information", - "score": 306.0 - }, - "Nutrition - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Nutrition - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Nutrition - # prompt tokens": { - "description": "min=585.48, mean=585.48, max=585.48, sum=1170.961 (2)", - "tab": "General information", - "score": 585.4803921568628 - }, - "Nutrition - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" - } - } - }, - { - "evaluation_name": "Prehistory", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Prehistory", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.898, - "details": { - "description": "min=0.898, mean=0.898, max=0.898, sum=1.796 (2)", - "tab": "Accuracy", - "Prehistory - Observed inference time (s)": { - "description": "min=0.39, mean=0.39, max=0.39, sum=0.78 (2)", - "tab": "Efficiency", - "score": 0.3900022072556578 - }, - "Prehistory - # eval": { - "description": "min=324, mean=324, max=324, sum=648 (2)", - "tab": "General information", - "score": 324.0 - }, - "Prehistory - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Prehistory - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Prehistory - # prompt tokens": { - "description": "min=540.198, mean=540.198, max=540.198, sum=1080.395 (2)", - "tab": "General information", - "score": 540.1975308641976 - }, - "Prehistory - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": 
"General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" - } - } - }, - { - "evaluation_name": "Public Relations", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Public Relations", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.791, - "details": { - "description": "min=0.791, mean=0.791, max=0.791, sum=1.582 (2)", - "tab": "Accuracy", - "Public Relations - Observed inference time (s)": { - "description": "min=0.38, mean=0.38, max=0.38, sum=0.76 (2)", - "tab": "Efficiency", - "score": 0.37999111955816095 - }, - "Public Relations - # eval": { - "description": "min=110, mean=110, max=110, sum=220 (2)", - "tab": "General information", - "score": 110.0 - }, - "Public Relations - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Public Relations - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Public Relations - # prompt tokens": { - "description": "min=426.655, mean=426.655, max=426.655, sum=853.309 (2)", - "tab": "General information", - "score": 426.6545454545454 - }, - "Public Relations - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" - } - } - }, - { - "evaluation_name": "Security Studies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Security Studies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.69, - "details": { - "description": "min=0.69, mean=0.69, max=0.69, sum=1.38 (2)", - "tab": "Accuracy", - "Security Studies - Observed inference time (s)": { - "description": "min=0.394, mean=0.394, max=0.394, sum=0.787 (2)", - "tab": "Efficiency", - "score": 0.3936534463142862 - }, - "Security Studies - # eval": { - "description": "min=245, mean=245, max=245, sum=490 (2)", - "tab": "General information", - "score": 245.0 - }, - "Security Studies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Security Studies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Security Studies - # prompt tokens": { - "description": "min=1193.869, mean=1193.869, max=1193.869, sum=2387.739 (2)", - "tab": "General information", - "score": 1193.869387755102 - }, - "Security Studies - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "security_studies", - 
"method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" - } - } - }, - { - "evaluation_name": "Sociology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Sociology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.786, - "details": { - "description": "min=0.786, mean=0.786, max=0.786, sum=1.572 (2)", - "tab": "Accuracy", - "Sociology - Observed inference time (s)": { - "description": "min=0.388, mean=0.388, max=0.388, sum=0.776 (2)", - "tab": "Efficiency", - "score": 0.3881402205471969 - }, - "Sociology - # eval": { - "description": "min=201, mean=201, max=201, sum=402 (2)", - "tab": "General information", - "score": 201.0 - }, - "Sociology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Sociology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Sociology - # prompt tokens": { - "description": "min=456.274, mean=456.274, max=456.274, sum=912.547 (2)", - "tab": "General information", - "score": 456.27363184079604 - }, - "Sociology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" - } - } - }, - { - "evaluation_name": "Virology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Virology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.554, - "details": { - "description": "min=0.554, mean=0.554, max=0.554, sum=1.108 (2)", - "tab": "Accuracy", - "Virology - Observed inference time (s)": { - "description": "min=0.379, mean=0.379, max=0.379, sum=0.758 (2)", - "tab": "Efficiency", - "score": 0.3791351461985025 - }, - "Virology - # eval": { - "description": "min=166, mean=166, max=166, sum=332 (2)", - "tab": "General information", - "score": 166.0 - }, - "Virology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Virology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Virology - # prompt tokens": { - "description": "min=336.753, mean=336.753, max=336.753, sum=673.506 (2)", - "tab": "General information", - "score": 336.7530120481928 - }, - "Virology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" - } - } - }, - { - "evaluation_name": "World Religions", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on World Religions", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.731, - "details": { - "description": "min=0.731, mean=0.731, max=0.731, sum=1.462 (2)", - "tab": "Accuracy", - "World Religions - Observed inference time (s)": { - "description": "min=0.384, mean=0.384, max=0.384, sum=0.768 (2)", - "tab": "Efficiency", - "score": 0.38400994964510377 - }, - "World Religions - # eval": { - "description": "min=171, mean=171, max=171, sum=342 (2)", - "tab": "General information", - "score": 171.0 - }, - "World Religions - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "World Religions - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "World Religions - # prompt tokens": { - "description": "min=268.164, mean=268.164, max=268.164, sum=536.327 (2)", - "tab": "General information", - "score": 268.1637426900585 - }, - "World Religions - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" - } - } - }, - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.567, - "details": { - "tab": "Efficiency" - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_mmlu/google/gemma-2-27b/fe4cec30-e483-49a8-80ea-00b2c6231740.json b/data/helm_mmlu/google/gemma-2-27b/fe4cec30-e483-49a8-80ea-00b2c6231740.json deleted file mode 100644 index 8720cc062..000000000 --- a/data/helm_mmlu/google/gemma-2-27b/fe4cec30-e483-49a8-80ea-00b2c6231740.json +++ /dev/null @@ -1,3021 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/google_gemma-2-27b/1770835937.459157", - "retrieved_timestamp": "1770835937.459157", - "source_metadata": { - "source_name": "helm_mmlu", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gemma 2 27B", - "id": "google/gemma-2-27b", - "developer": "google", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "MMLU All Subjects", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU All Subjects", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.757, - "details": { - "description": 
"min=0.394, mean=0.757, max=0.979, sum=86.303 (114)", - "tab": "Accuracy", - "MMLU All Subjects - Observed inference time (s)": { - "description": "min=1.169, mean=2.744, max=12.207, sum=312.86 (114)", - "tab": "Efficiency", - "score": 2.7443855864562217 - }, - "MMLU All Subjects - # eval": { - "description": "min=100, mean=246.351, max=1534, sum=28084 (114)", - "tab": "General information", - "score": 246.35087719298247 - }, - "MMLU All Subjects - # train": { - "description": "min=5, mean=5, max=5, sum=570 (114)", - "tab": "General information", - "score": 5.0 - }, - "MMLU All Subjects - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (114)", - "tab": "General information", - "score": 0.0 - }, - "MMLU All Subjects - # prompt tokens": { - "description": "min=260.164, mean=624.617, max=2789.424, sum=71206.345 (114)", - "tab": "General information", - "score": 624.6170571214202 - }, - "MMLU All Subjects - # output tokens": { - "description": "min=1, mean=1, max=1, sum=114 (114)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - 
"mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] - } - } - }, - { - "evaluation_name": "Abstract Algebra", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Abstract Algebra", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4, - "details": { - "description": "min=0.4, mean=0.4, max=0.4, sum=0.8 (2)", - "tab": "Accuracy", - "Abstract Algebra - Observed inference time (s)": { - "description": "min=1.522, mean=1.522, max=1.522, sum=3.043 (2)", - "tab": "Efficiency", - "score": 1.5217395949363708 - }, - "Abstract Algebra - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Abstract Algebra - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Abstract Algebra - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Abstract Algebra - # prompt tokens": { - "description": "min=375.97, mean=375.97, max=375.97, sum=751.94 (2)", - "tab": "General information", - "score": 375.97 - }, - "Abstract Algebra - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" - } - } - }, - { - "evaluation_name": "Anatomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Anatomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.77, - "details": { - "description": "min=0.77, mean=0.77, max=0.77, sum=1.541 (2)", - "tab": "Accuracy", - "Anatomy - Observed inference time (s)": { - "description": "min=1.179, mean=1.179, max=1.179, sum=2.359 (2)", - "tab": "Efficiency", - "score": 1.1792643246827301 - }, - "Anatomy - # eval": { - "description": "min=135, mean=135, max=135, sum=270 (2)", - "tab": "General information", - "score": 135.0 - }, - "Anatomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Anatomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Anatomy - # prompt tokens": { - "description": "min=336.356, mean=336.356, max=336.356, sum=672.711 (2)", - 
"tab": "General information", - "score": 336.35555555555555 - }, - "Anatomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" - } - } - }, - { - "evaluation_name": "College Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on College Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5, - "details": { - "description": "min=0.5, mean=0.5, max=0.5, sum=1 (2)", - "tab": "Accuracy", - "College Chemistry - Observed inference time (s)": { - "description": "min=2.168, mean=2.168, max=2.168, sum=4.337 (2)", - "tab": "Efficiency", - "score": 2.168372049331665 - }, - "College Biology - Observed inference time (s)": { - "description": "min=1.995, mean=1.995, max=1.995, sum=3.99 (2)", - "tab": "Efficiency", - "score": 1.994903423719936 - }, - "College Computer Science - Observed inference time (s)": { - "description": "min=3.315, mean=3.315, max=3.315, sum=6.631 (2)", - "tab": "Efficiency", - "score": 3.315422866344452 - }, - "College Mathematics - Observed inference time (s)": { - "description": "min=2.323, mean=2.323, max=2.323, sum=4.647 (2)", - "tab": "Efficiency", - "score": 2.323271915912628 - }, - "College Medicine - Observed inference time (s)": { - "description": "min=2.118, mean=2.118, max=2.118, sum=4.236 (2)", - "tab": "Efficiency", - "score": 2.117893081179933 - }, - "College Physics - Observed inference time (s)": { - "description": "min=1.982, mean=1.982, max=1.982, sum=3.964 (2)", - "tab": "Efficiency", - "score": 1.9819396874483894 - }, - "College Chemistry - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Chemistry - # prompt tokens": { - "description": "min=562.02, mean=562.02, max=562.02, sum=1124.04 (2)", - "tab": "General information", - "score": 562.02 - }, - "College Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Biology - # eval": { - "description": "min=144, mean=144, max=144, sum=288 (2)", - "tab": "General information", - "score": 144.0 - }, - "College Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # prompt tokens": { - "description": "min=474.799, mean=474.799, max=474.799, sum=949.597 (2)", - "tab": "General information", - "score": 474.7986111111111 - }, - "College Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Computer 
Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer Science - # prompt tokens": { - "description": "min=849.86, mean=849.86, max=849.86, sum=1699.72 (2)", - "tab": "General information", - "score": 849.86 - }, - "College Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Mathematics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # prompt tokens": { - "description": "min=618.69, mean=618.69, max=618.69, sum=1237.38 (2)", - "tab": "General information", - "score": 618.69 - }, - "College Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Medicine - # eval": { - "description": "min=173, mean=173, max=173, sum=346 (2)", - "tab": "General information", - "score": 173.0 - }, - "College Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Medicine - # prompt tokens": { - "description": "min=505.37, mean=505.37, max=505.37, sum=1010.74 (2)", - "tab": "General information", - "score": 505.3699421965318 - }, - "College Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Physics - # eval": { - "description": "min=102, mean=102, max=102, sum=204 (2)", - "tab": "General information", - "score": 102.0 - }, - "College Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Physics - # prompt tokens": { - "description": "min=499.471, mean=499.471, max=499.471, sum=998.941 (2)", - "tab": "General information", - "score": 499.47058823529414 - }, - "College Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" - } - } - }, - { - "evaluation_name": "Computer Security", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Computer 
Security", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.84, - "details": { - "description": "min=0.84, mean=0.84, max=0.84, sum=1.68 (2)", - "tab": "Accuracy", - "Computer Security - Observed inference time (s)": { - "description": "min=1.314, mean=1.314, max=1.314, sum=2.628 (2)", - "tab": "Efficiency", - "score": 1.3139495277404785 - }, - "Computer Security - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Computer Security - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Computer Security - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Computer Security - # prompt tokens": { - "description": "min=372.91, mean=372.91, max=372.91, sum=745.82 (2)", - "tab": "General information", - "score": 372.91 - }, - "Computer Security - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" - } - } - }, - { - "evaluation_name": "Econometrics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Econometrics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.667, - "details": { - "description": "min=0.667, mean=0.667, max=0.667, sum=1.333 (2)", - "tab": "Accuracy", - "Econometrics - Observed inference time (s)": { - "description": "min=2.14, mean=2.14, max=2.14, sum=4.28 (2)", - "tab": "Efficiency", - "score": 2.1398948138220266 - }, - "Econometrics - # eval": { - "description": "min=114, mean=114, max=114, sum=228 (2)", - "tab": "General information", - "score": 114.0 - }, - "Econometrics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Econometrics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Econometrics - # prompt tokens": { - "description": "min=626.553, mean=626.553, max=626.553, sum=1253.105 (2)", - "tab": "General information", - "score": 626.5526315789474 - }, - "Econometrics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" - } - } - }, - { - "evaluation_name": "Global Facts", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Global Facts", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.43, - "details": { - "description": 
"min=0.43, mean=0.43, max=0.43, sum=0.86 (2)", - "tab": "Accuracy", - "Global Facts - Observed inference time (s)": { - "description": "min=1.452, mean=1.452, max=1.452, sum=2.905 (2)", - "tab": "Efficiency", - "score": 1.4524464893341065 - }, - "Global Facts - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Global Facts - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Global Facts - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Global Facts - # prompt tokens": { - "description": "min=448.54, mean=448.54, max=448.54, sum=897.08 (2)", - "tab": "General information", - "score": 448.54 - }, - "Global Facts - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" - } - } - }, - { - "evaluation_name": "Jurisprudence", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Jurisprudence", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.861, - "details": { - "description": "min=0.861, mean=0.861, max=0.861, sum=1.722 (2)", - "tab": "Accuracy", - "Jurisprudence - Observed inference time (s)": { - "description": "min=1.421, mean=1.421, max=1.421, sum=2.841 (2)", - "tab": "Efficiency", - "score": 1.4206464577604223 - }, - "Jurisprudence - # eval": { - "description": "min=108, mean=108, max=108, sum=216 (2)", - "tab": "General information", - "score": 108.0 - }, - "Jurisprudence - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Jurisprudence - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Jurisprudence - # prompt tokens": { - "description": "min=399.87, mean=399.87, max=399.87, sum=799.741 (2)", - "tab": "General information", - "score": 399.8703703703704 - }, - "Jurisprudence - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" - } - } - }, - { - "evaluation_name": "Philosophy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Philosophy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.849, - "details": { - "description": "min=0.849, mean=0.849, max=0.849, sum=1.698 (2)", - "tab": "Accuracy", - "Philosophy - Observed inference time (s)": { - "description": "min=1.169, mean=1.169, max=1.169, sum=2.337 (2)", - "tab": "Efficiency", - "score": 
1.168742698871821 - }, - "Philosophy - # eval": { - "description": "min=311, mean=311, max=311, sum=622 (2)", - "tab": "General information", - "score": 311.0 - }, - "Philosophy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Philosophy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Philosophy - # prompt tokens": { - "description": "min=332.907, mean=332.907, max=332.907, sum=665.814 (2)", - "tab": "General information", - "score": 332.90675241157555 - }, - "Philosophy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" - } - } - }, - { - "evaluation_name": "Professional Psychology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Professional Psychology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.84, - "details": { - "description": "min=0.84, mean=0.84, max=0.84, sum=1.68 (2)", - "tab": "Accuracy", - "Professional Medicine - Observed inference time (s)": { - "description": "min=4.2, mean=4.2, max=4.2, sum=8.399 (2)", - "tab": "Efficiency", - "score": 4.199711911818561 - }, - "Professional Accounting - Observed inference time (s)": { - "description": "min=3.427, mean=3.427, max=3.427, sum=6.854 (2)", - "tab": "Efficiency", - "score": 3.4269232200392596 - }, - "Professional Law - Observed inference time (s)": { - "description": "min=7.724, mean=7.724, max=7.724, sum=15.448 (2)", - "tab": "Efficiency", - "score": 7.723928280417581 - }, - "Professional Psychology - Observed inference time (s)": { - "description": "min=2.721, mean=2.721, max=2.721, sum=5.442 (2)", - "tab": "Efficiency", - "score": 2.721013201997171 - }, - "Professional Medicine - # eval": { - "description": "min=272, mean=272, max=272, sum=544 (2)", - "tab": "General information", - "score": 272.0 - }, - "Professional Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Medicine - # prompt tokens": { - "description": "min=1105.092, mean=1105.092, max=1105.092, sum=2210.184 (2)", - "tab": "General information", - "score": 1105.0919117647059 - }, - "Professional Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Accounting - # eval": { - "description": "min=282, mean=282, max=282, sum=564 (2)", - "tab": "General information", - "score": 282.0 - }, - "Professional Accounting - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Accounting - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Accounting - # prompt tokens": { - "description": 
"min=747.418, mean=747.418, max=747.418, sum=1494.837 (2)", - "tab": "General information", - "score": 747.418439716312 - }, - "Professional Accounting - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Law - # eval": { - "description": "min=1534, mean=1534, max=1534, sum=3068 (2)", - "tab": "General information", - "score": 1534.0 - }, - "Professional Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # prompt tokens": { - "description": "min=1677.119, mean=1677.119, max=1677.119, sum=3354.239 (2)", - "tab": "General information", - "score": 1677.119295958279 - }, - "Professional Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Psychology - # eval": { - "description": "min=612, mean=612, max=612, sum=1224 (2)", - "tab": "General information", - "score": 612.0 - }, - "Professional Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # prompt tokens": { - "description": "min=586.363, mean=586.363, max=586.363, sum=1172.725 (2)", - "tab": "General information", - "score": 586.3627450980392 - }, - "Professional Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" - } - } - }, - { - "evaluation_name": "Us Foreign Policy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Us Foreign Policy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.95, - "details": { - "description": "min=0.95, mean=0.95, max=0.95, sum=1.9 (2)", - "tab": "Accuracy", - "Us Foreign Policy - Observed inference time (s)": { - "description": "min=1.555, mean=1.555, max=1.555, sum=3.109 (2)", - "tab": "Efficiency", - "score": 1.554630262851715 - }, - "Us Foreign Policy - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Us Foreign Policy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Us Foreign Policy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Us Foreign Policy - # prompt tokens": { - "description": "min=430.2, mean=430.2, max=430.2, sum=860.4 (2)", - "tab": "General information", - "score": 430.2 - }, - "Us Foreign Policy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - 
"generation_config": { - "additional_details": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" - } - } - }, - { - "evaluation_name": "Astronomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Astronomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.829, - "details": { - "description": "min=0.829, mean=0.829, max=0.829, sum=1.658 (2)", - "tab": "Accuracy", - "Astronomy - Observed inference time (s)": { - "description": "min=2.214, mean=2.214, max=2.214, sum=4.428 (2)", - "tab": "Efficiency", - "score": 2.214210780043351 - }, - "Astronomy - # eval": { - "description": "min=152, mean=152, max=152, sum=304 (2)", - "tab": "General information", - "score": 152.0 - }, - "Astronomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Astronomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Astronomy - # prompt tokens": { - "description": "min=594.421, mean=594.421, max=594.421, sum=1188.842 (2)", - "tab": "General information", - "score": 594.421052631579 - }, - "Astronomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" - } - } - }, - { - "evaluation_name": "Business Ethics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Business Ethics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.78, - "details": { - "description": "min=0.78, mean=0.78, max=0.78, sum=1.56 (2)", - "tab": "Accuracy", - "Business Ethics - Observed inference time (s)": { - "description": "min=2.156, mean=2.156, max=2.156, sum=4.311 (2)", - "tab": "Efficiency", - "score": 2.1555044412612916 - }, - "Business Ethics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Business Ethics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Business Ethics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Business Ethics - # prompt tokens": { - "description": "min=544.87, mean=544.87, max=544.87, sum=1089.74 (2)", - "tab": "General information", - "score": 544.87 - }, - "Business Ethics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" - } - } - }, - { - "evaluation_name": 
"Clinical Knowledge", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Clinical Knowledge", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.808, - "details": { - "description": "min=0.808, mean=0.808, max=0.808, sum=1.615 (2)", - "tab": "Accuracy", - "Clinical Knowledge - Observed inference time (s)": { - "description": "min=1.81, mean=1.81, max=1.81, sum=3.619 (2)", - "tab": "Efficiency", - "score": 1.8096552030095514 - }, - "Clinical Knowledge - # eval": { - "description": "min=265, mean=265, max=265, sum=530 (2)", - "tab": "General information", - "score": 265.0 - }, - "Clinical Knowledge - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Clinical Knowledge - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Clinical Knowledge - # prompt tokens": { - "description": "min=394.592, mean=394.592, max=394.592, sum=789.185 (2)", - "tab": "General information", - "score": 394.5924528301887 - }, - "Clinical Knowledge - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" - } - } - }, - { - "evaluation_name": "Conceptual Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Conceptual Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.834, - "details": { - "description": "min=0.834, mean=0.834, max=0.834, sum=1.668 (2)", - "tab": "Accuracy", - "Conceptual Physics - Observed inference time (s)": { - "description": "min=1.424, mean=1.424, max=1.424, sum=2.848 (2)", - "tab": "Efficiency", - "score": 1.423792755857427 - }, - "Conceptual Physics - # eval": { - "description": "min=235, mean=235, max=235, sum=470 (2)", - "tab": "General information", - "score": 235.0 - }, - "Conceptual Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Conceptual Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Conceptual Physics - # prompt tokens": { - "description": "min=301.213, mean=301.213, max=301.213, sum=602.426 (2)", - "tab": "General information", - "score": 301.21276595744683 - }, - "Conceptual Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" - } - } - }, - { - "evaluation_name": "Electrical Engineering", - "source_data": { - "dataset_name": "helm_mmlu", - 
"source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Electrical Engineering", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.738, - "details": { - "description": "min=0.738, mean=0.738, max=0.738, sum=1.476 (2)", - "tab": "Accuracy", - "Electrical Engineering - Observed inference time (s)": { - "description": "min=1.947, mean=1.947, max=1.947, sum=3.893 (2)", - "tab": "Efficiency", - "score": 1.9467107739941827 - }, - "Electrical Engineering - # eval": { - "description": "min=145, mean=145, max=145, sum=290 (2)", - "tab": "General information", - "score": 145.0 - }, - "Electrical Engineering - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Electrical Engineering - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Electrical Engineering - # prompt tokens": { - "description": "min=466.786, mean=466.786, max=466.786, sum=933.572 (2)", - "tab": "General information", - "score": 466.78620689655173 - }, - "Electrical Engineering - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" - } - } - }, - { - "evaluation_name": "Elementary Mathematics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Elementary Mathematics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.558, - "details": { - "description": "min=0.558, mean=0.558, max=0.558, sum=1.116 (2)", - "tab": "Accuracy", - "Elementary Mathematics - Observed inference time (s)": { - "description": "min=2.287, mean=2.287, max=2.287, sum=4.574 (2)", - "tab": "Efficiency", - "score": 2.286756881330379 - }, - "Elementary Mathematics - # eval": { - "description": "min=378, mean=378, max=378, sum=756 (2)", - "tab": "General information", - "score": 378.0 - }, - "Elementary Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Elementary Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Elementary Mathematics - # prompt tokens": { - "description": "min=589.341, mean=589.341, max=589.341, sum=1178.683 (2)", - "tab": "General information", - "score": 589.3412698412699 - }, - "Elementary Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" - } - } - }, - { - "evaluation_name": "Formal Logic", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": 
"url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Formal Logic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.516, - "details": { - "description": "min=0.516, mean=0.516, max=0.516, sum=1.032 (2)", - "tab": "Accuracy", - "Formal Logic - Observed inference time (s)": { - "description": "min=2.327, mean=2.327, max=2.327, sum=4.653 (2)", - "tab": "Efficiency", - "score": 2.3266589963246904 - }, - "Formal Logic - # eval": { - "description": "min=126, mean=126, max=126, sum=252 (2)", - "tab": "General information", - "score": 126.0 - }, - "Formal Logic - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Formal Logic - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Formal Logic - # prompt tokens": { - "description": "min=611.563, mean=611.563, max=611.563, sum=1223.127 (2)", - "tab": "General information", - "score": 611.563492063492 - }, - "Formal Logic - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" - } - } - }, - { - "evaluation_name": "High School World History", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on High School World History", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.89, - "details": { - "description": "min=0.89, mean=0.89, max=0.89, sum=1.781 (2)", - "tab": "Accuracy", - "High School Biology - Observed inference time (s)": { - "description": "min=2.021, mean=2.021, max=2.021, sum=4.043 (2)", - "tab": "Efficiency", - "score": 2.021439305428536 - }, - "High School Chemistry - Observed inference time (s)": { - "description": "min=2.053, mean=2.053, max=2.053, sum=4.106 (2)", - "tab": "Efficiency", - "score": 2.0532372467623556 - }, - "High School Computer Science - Observed inference time (s)": { - "description": "min=3.599, mean=3.599, max=3.599, sum=7.197 (2)", - "tab": "Efficiency", - "score": 3.5985250592231752 - }, - "High School European History - Observed inference time (s)": { - "description": "min=12.207, mean=12.207, max=12.207, sum=24.413 (2)", - "tab": "Efficiency", - "score": 12.20667136221221 - }, - "High School Geography - Observed inference time (s)": { - "description": "min=1.952, mean=1.952, max=1.952, sum=3.903 (2)", - "tab": "Efficiency", - "score": 1.9516368020664563 - }, - "High School Government And Politics - Observed inference time (s)": { - "description": "min=2.276, mean=2.276, max=2.276, sum=4.552 (2)", - "tab": "Efficiency", - "score": 2.2759376226929184 - }, - "High School Macroeconomics - Observed inference time (s)": { - "description": "min=1.97, mean=1.97, max=1.97, sum=3.94 (2)", - "tab": "Efficiency", - "score": 1.9697805410776383 - }, - "High School Mathematics - Observed inference time (s)": { - 
"description": "min=2.617, mean=2.617, max=2.617, sum=5.234 (2)", - "tab": "Efficiency", - "score": 2.616950834238971 - }, - "High School Microeconomics - Observed inference time (s)": { - "description": "min=2.123, mean=2.123, max=2.123, sum=4.245 (2)", - "tab": "Efficiency", - "score": 2.1225664866070786 - }, - "High School Physics - Observed inference time (s)": { - "description": "min=2.697, mean=2.697, max=2.697, sum=5.394 (2)", - "tab": "Efficiency", - "score": 2.6972478115006 - }, - "High School Psychology - Observed inference time (s)": { - "description": "min=2.368, mean=2.368, max=2.368, sum=4.735 (2)", - "tab": "Efficiency", - "score": 2.3675809317772543 - }, - "High School Statistics - Observed inference time (s)": { - "description": "min=3.958, mean=3.958, max=3.958, sum=7.917 (2)", - "tab": "Efficiency", - "score": 3.9584906564818487 - }, - "High School US History - Observed inference time (s)": { - "description": "min=9.745, mean=9.745, max=9.745, sum=19.491 (2)", - "tab": "Efficiency", - "score": 9.745334922098646 - }, - "High School World History - Observed inference time (s)": { - "description": "min=6.489, mean=6.489, max=6.489, sum=12.977 (2)", - "tab": "Efficiency", - "score": 6.488561074944991 - }, - "High School Biology - # eval": { - "description": "min=310, mean=310, max=310, sum=620 (2)", - "tab": "General information", - "score": 310.0 - }, - "High School Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Biology - # prompt tokens": { - "description": "min=492.958, mean=492.958, max=492.958, sum=985.916 (2)", - "tab": "General information", - "score": 492.958064516129 - }, - "High School Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Chemistry - # eval": { - "description": "min=203, mean=203, max=203, sum=406 (2)", - "tab": "General information", - "score": 203.0 - }, - "High School Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # prompt tokens": { - "description": "min=505.064, mean=505.064, max=505.064, sum=1010.128 (2)", - "tab": "General information", - "score": 505.064039408867 - }, - "High School Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "High School Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # prompt tokens": { - "description": "min=927.13, mean=927.13, max=927.13, sum=1854.26 (2)", - "tab": "General information", - "score": 927.13 - }, - "High School Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": 
"General information", - "score": 1.0 - }, - "High School European History - # eval": { - "description": "min=165, mean=165, max=165, sum=330 (2)", - "tab": "General information", - "score": 165.0 - }, - "High School European History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School European History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School European History - # prompt tokens": { - "description": "min=2789.424, mean=2789.424, max=2789.424, sum=5578.848 (2)", - "tab": "General information", - "score": 2789.4242424242425 - }, - "High School European History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Geography - # eval": { - "description": "min=198, mean=198, max=198, sum=396 (2)", - "tab": "General information", - "score": 198.0 - }, - "High School Geography - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Geography - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Geography - # prompt tokens": { - "description": "min=386.773, mean=386.773, max=386.773, sum=773.545 (2)", - "tab": "General information", - "score": 386.77272727272725 - }, - "High School Geography - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Government And Politics - # eval": { - "description": "min=193, mean=193, max=193, sum=386 (2)", - "tab": "General information", - "score": 193.0 - }, - "High School Government And Politics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Government And Politics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # prompt tokens": { - "description": "min=471.301, mean=471.301, max=471.301, sum=942.601 (2)", - "tab": "General information", - "score": 471.30051813471505 - }, - "High School Government And Politics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Macroeconomics - # eval": { - "description": "min=390, mean=390, max=390, sum=780 (2)", - "tab": "General information", - "score": 390.0 - }, - "High School Macroeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Macroeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - # prompt tokens": { - "description": "min=388.541, mean=388.541, max=388.541, sum=777.082 (2)", - "tab": "General information", - "score": 388.54102564102567 - }, - "High School Macroeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Mathematics - # eval": { - "description": "min=270, mean=270, max=270, sum=540 (2)", - "tab": "General information", - "score": 270.0 - }, - "High School Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - 
"tab": "General information", - "score": 5.0 - }, - "High School Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # prompt tokens": { - "description": "min=558.822, mean=558.822, max=558.822, sum=1117.644 (2)", - "tab": "General information", - "score": 558.8222222222222 - }, - "High School Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Microeconomics - # eval": { - "description": "min=238, mean=238, max=238, sum=476 (2)", - "tab": "General information", - "score": 238.0 - }, - "High School Microeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Microeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Microeconomics - # prompt tokens": { - "description": "min=407.954, mean=407.954, max=407.954, sum=815.908 (2)", - "tab": "General information", - "score": 407.953781512605 - }, - "High School Microeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Physics - # eval": { - "description": "min=151, mean=151, max=151, sum=302 (2)", - "tab": "General information", - "score": 151.0 - }, - "High School Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # prompt tokens": { - "description": "min=583.715, mean=583.715, max=583.715, sum=1167.43 (2)", - "tab": "General information", - "score": 583.7152317880794 - }, - "High School Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Psychology - # eval": { - "description": "min=545, mean=545, max=545, sum=1090 (2)", - "tab": "General information", - "score": 545.0 - }, - "High School Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # prompt tokens": { - "description": "min=494.604, mean=494.604, max=494.604, sum=989.207 (2)", - "tab": "General information", - "score": 494.60366972477067 - }, - "High School Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Statistics - # eval": { - "description": "min=216, mean=216, max=216, sum=432 (2)", - "tab": "General information", - "score": 216.0 - }, - "High School Statistics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Statistics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Statistics - # prompt tokens": { - "description": "min=850.931, mean=850.931, max=850.931, sum=1701.861 (2)", - "tab": "General information", - "score": 850.9305555555555 - }, - "High 
School Statistics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School US History - # eval": { - "description": "min=204, mean=204, max=204, sum=408 (2)", - "tab": "General information", - "score": 204.0 - }, - "High School US History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School US History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # prompt tokens": { - "description": "min=2197.583, mean=2197.583, max=2197.583, sum=4395.167 (2)", - "tab": "General information", - "score": 2197.5833333333335 - }, - "High School US History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School World History - # eval": { - "description": "min=237, mean=237, max=237, sum=474 (2)", - "tab": "General information", - "score": 237.0 - }, - "High School World History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School World History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School World History - # prompt tokens": { - "description": "min=1418.544, mean=1418.544, max=1418.544, sum=2837.089 (2)", - "tab": "General information", - "score": 1418.5443037974683 - }, - "High School World History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" - } - } - }, - { - "evaluation_name": "Human Sexuality", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Human Sexuality", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.84, - "details": { - "description": "min=0.84, mean=0.84, max=0.84, sum=1.679 (2)", - "tab": "Accuracy", - "Human Aging - Observed inference time (s)": { - "description": "min=1.712, mean=1.712, max=1.712, sum=3.425 (2)", - "tab": "Efficiency", - "score": 1.7123107461116773 - }, - "Human Sexuality - Observed inference time (s)": { - "description": "min=1.754, mean=1.754, max=1.754, sum=3.508 (2)", - "tab": "Efficiency", - "score": 1.7542339390470783 - }, - "Human Aging - # eval": { - "description": "min=223, mean=223, max=223, sum=446 (2)", - "tab": "General information", - "score": 223.0 - }, - "Human Aging - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Aging - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Aging - # prompt tokens": { - "description": "min=313.587, mean=313.587, max=313.587, sum=627.175 (2)", - "tab": "General information", - "score": 313.58744394618833 - }, - "Human Aging - # output tokens": { - "description": "min=1, mean=1, max=1, 
sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Human Sexuality - # eval": { - "description": "min=131, mean=131, max=131, sum=262 (2)", - "tab": "General information", - "score": 131.0 - }, - "Human Sexuality - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Sexuality - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # prompt tokens": { - "description": "min=339.183, mean=339.183, max=339.183, sum=678.366 (2)", - "tab": "General information", - "score": 339.1832061068702 - }, - "Human Sexuality - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" - } - } - }, - { - "evaluation_name": "International Law", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on International Law", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.843, - "details": { - "description": "min=0.843, mean=0.843, max=0.843, sum=1.686 (2)", - "tab": "Accuracy", - "International Law - Observed inference time (s)": { - "description": "min=2.9, mean=2.9, max=2.9, sum=5.801 (2)", - "tab": "Efficiency", - "score": 2.9003868654739757 - }, - "International Law - # eval": { - "description": "min=121, mean=121, max=121, sum=242 (2)", - "tab": "General information", - "score": 121.0 - }, - "International Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "International Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "International Law - # prompt tokens": { - "description": "min=636.165, mean=636.165, max=636.165, sum=1272.331 (2)", - "tab": "General information", - "score": 636.1652892561983 - }, - "International Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" - } - } - }, - { - "evaluation_name": "Logical Fallacies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Logical Fallacies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.865, - "details": { - "description": "min=0.865, mean=0.865, max=0.865, sum=1.73 (2)", - "tab": "Accuracy", - "Logical Fallacies - Observed inference time (s)": { - "description": "min=2.154, mean=2.154, max=2.154, sum=4.308 (2)", - "tab": "Efficiency", - "score": 2.1537599431956473 - }, - "Logical Fallacies - # eval": { - "description": "min=163, 
mean=163, max=163, sum=326 (2)", - "tab": "General information", - "score": 163.0 - }, - "Logical Fallacies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Logical Fallacies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Logical Fallacies - # prompt tokens": { - "description": "min=442.049, mean=442.049, max=442.049, sum=884.098 (2)", - "tab": "General information", - "score": 442.0490797546012 - }, - "Logical Fallacies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" - } - } - }, - { - "evaluation_name": "Machine Learning", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Machine Learning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.625, - "details": { - "description": "min=0.625, mean=0.625, max=0.625, sum=1.25 (2)", - "tab": "Accuracy", - "Machine Learning - Observed inference time (s)": { - "description": "min=3.172, mean=3.172, max=3.172, sum=6.344 (2)", - "tab": "Efficiency", - "score": 3.172234045607703 - }, - "Machine Learning - # eval": { - "description": "min=112, mean=112, max=112, sum=224 (2)", - "tab": "General information", - "score": 112.0 - }, - "Machine Learning - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Machine Learning - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Machine Learning - # prompt tokens": { - "description": "min=694.402, mean=694.402, max=694.402, sum=1388.804 (2)", - "tab": "General information", - "score": 694.4017857142857 - }, - "Machine Learning - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" - } - } - }, - { - "evaluation_name": "Management", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Management", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.864, - "details": { - "description": "min=0.864, mean=0.864, max=0.864, sum=1.728 (2)", - "tab": "Accuracy", - "Management - Observed inference time (s)": { - "description": "min=1.556, mean=1.556, max=1.556, sum=3.112 (2)", - "tab": "Efficiency", - "score": 1.5561023800118456 - }, - "Management - # eval": { - "description": "min=103, mean=103, max=103, sum=206 (2)", - "tab": "General information", - "score": 103.0 - }, - "Management - # train": { - "description": "min=5, 
mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Management - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Management - # prompt tokens": { - "description": "min=273.301, mean=273.301, max=273.301, sum=546.602 (2)", - "tab": "General information", - "score": 273.3009708737864 - }, - "Management - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" - } - } - }, - { - "evaluation_name": "Marketing", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Marketing", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.94, - "details": { - "description": "min=0.94, mean=0.94, max=0.94, sum=1.88 (2)", - "tab": "Accuracy", - "Marketing - Observed inference time (s)": { - "description": "min=2.165, mean=2.165, max=2.165, sum=4.331 (2)", - "tab": "Efficiency", - "score": 2.1654122140672474 - }, - "Marketing - # eval": { - "description": "min=234, mean=234, max=234, sum=468 (2)", - "tab": "General information", - "score": 234.0 - }, - "Marketing - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Marketing - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Marketing - # prompt tokens": { - "description": "min=420.35, mean=420.35, max=420.35, sum=840.701 (2)", - "tab": "General information", - "score": 420.35042735042737 - }, - "Marketing - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" - } - } - }, - { - "evaluation_name": "Medical Genetics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Medical Genetics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.87, - "details": { - "description": "min=0.87, mean=0.87, max=0.87, sum=1.74 (2)", - "tab": "Accuracy", - "Medical Genetics - Observed inference time (s)": { - "description": "min=1.719, mean=1.719, max=1.719, sum=3.438 (2)", - "tab": "Efficiency", - "score": 1.7190089011192322 - }, - "Medical Genetics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Medical Genetics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Medical Genetics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Medical 
Genetics - # prompt tokens": { - "description": "min=330.89, mean=330.89, max=330.89, sum=661.78 (2)", - "tab": "General information", - "score": 330.89 - }, - "Medical Genetics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" - } - } - }, - { - "evaluation_name": "Miscellaneous", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Miscellaneous", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.885, - "details": { - "description": "min=0.885, mean=0.885, max=0.885, sum=1.77 (2)", - "tab": "Accuracy", - "Miscellaneous - Observed inference time (s)": { - "description": "min=1.709, mean=1.709, max=1.709, sum=3.417 (2)", - "tab": "Efficiency", - "score": 1.7086633363141563 - }, - "Miscellaneous - # eval": { - "description": "min=783, mean=783, max=783, sum=1566 (2)", - "tab": "General information", - "score": 783.0 - }, - "Miscellaneous - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Miscellaneous - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Miscellaneous - # prompt tokens": { - "description": "min=306.669, mean=306.669, max=306.669, sum=613.338 (2)", - "tab": "General information", - "score": 306.669220945083 - }, - "Miscellaneous - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" - } - } - }, - { - "evaluation_name": "Moral Scenarios", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Moral Scenarios", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.394, - "details": { - "description": "min=0.394, mean=0.394, max=0.394, sum=0.789 (2)", - "tab": "Accuracy", - "Moral Disputes - Observed inference time (s)": { - "description": "min=2.315, mean=2.315, max=2.315, sum=4.631 (2)", - "tab": "Efficiency", - "score": 2.315398308583078 - }, - "Moral Scenarios - Observed inference time (s)": { - "description": "min=3.188, mean=3.188, max=3.188, sum=6.376 (2)", - "tab": "Efficiency", - "score": 3.187839964914588 - }, - "Moral Disputes - # eval": { - "description": "min=346, mean=346, max=346, sum=692 (2)", - "tab": "General information", - "score": 346.0 - }, - "Moral Disputes - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Disputes - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral 
Disputes - # prompt tokens": { - "description": "min=487.003, mean=487.003, max=487.003, sum=974.006 (2)", - "tab": "General information", - "score": 487.0028901734104 - }, - "Moral Disputes - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Moral Scenarios - # eval": { - "description": "min=895, mean=895, max=895, sum=1790 (2)", - "tab": "General information", - "score": 895.0 - }, - "Moral Scenarios - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Scenarios - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # prompt tokens": { - "description": "min=682.542, mean=682.542, max=682.542, sum=1365.084 (2)", - "tab": "General information", - "score": 682.5418994413408 - }, - "Moral Scenarios - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" - } - } - }, - { - "evaluation_name": "Nutrition", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Nutrition", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.824, - "details": { - "description": "min=0.824, mean=0.824, max=0.824, sum=1.647 (2)", - "tab": "Accuracy", - "Nutrition - Observed inference time (s)": { - "description": "min=2.692, mean=2.692, max=2.692, sum=5.383 (2)", - "tab": "Efficiency", - "score": 2.691618916255976 - }, - "Nutrition - # eval": { - "description": "min=306, mean=306, max=306, sum=612 (2)", - "tab": "General information", - "score": 306.0 - }, - "Nutrition - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Nutrition - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Nutrition - # prompt tokens": { - "description": "min=577.48, mean=577.48, max=577.48, sum=1154.961 (2)", - "tab": "General information", - "score": 577.4803921568628 - }, - "Nutrition - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" - } - } - }, - { - "evaluation_name": "Prehistory", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Prehistory", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.877, - "details": { - "description": "min=0.877, mean=0.877, max=0.877, sum=1.753 (2)", - "tab": "Accuracy", - "Prehistory - Observed inference time (s)": { - "description": "min=2.537, 
mean=2.537, max=2.537, sum=5.075 (2)", - "tab": "Efficiency", - "score": 2.5372923561084417 - }, - "Prehistory - # eval": { - "description": "min=324, mean=324, max=324, sum=648 (2)", - "tab": "General information", - "score": 324.0 - }, - "Prehistory - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Prehistory - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Prehistory - # prompt tokens": { - "description": "min=532.198, mean=532.198, max=532.198, sum=1064.395 (2)", - "tab": "General information", - "score": 532.1975308641976 - }, - "Prehistory - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" - } - } - }, - { - "evaluation_name": "Public Relations", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Public Relations", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.745, - "details": { - "description": "min=0.745, mean=0.745, max=0.745, sum=1.491 (2)", - "tab": "Accuracy", - "Public Relations - Observed inference time (s)": { - "description": "min=2.161, mean=2.161, max=2.161, sum=4.321 (2)", - "tab": "Efficiency", - "score": 2.160554786161943 - }, - "Public Relations - # eval": { - "description": "min=110, mean=110, max=110, sum=220 (2)", - "tab": "General information", - "score": 110.0 - }, - "Public Relations - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Public Relations - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Public Relations - # prompt tokens": { - "description": "min=418.655, mean=418.655, max=418.655, sum=837.309 (2)", - "tab": "General information", - "score": 418.6545454545454 - }, - "Public Relations - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" - } - } - }, - { - "evaluation_name": "Security Studies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Security Studies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.808, - "details": { - "description": "min=0.808, mean=0.808, max=0.808, sum=1.616 (2)", - "tab": "Accuracy", - "Security Studies - Observed inference time (s)": { - "description": "min=5.336, mean=5.336, max=5.336, sum=10.672 (2)", - "tab": "Efficiency", - "score": 5.335982258465825 - }, - "Security Studies - # eval": { - "description": "min=245, mean=245, 
max=245, sum=490 (2)", - "tab": "General information", - "score": 245.0 - }, - "Security Studies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Security Studies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Security Studies - # prompt tokens": { - "description": "min=1185.869, mean=1185.869, max=1185.869, sum=2371.739 (2)", - "tab": "General information", - "score": 1185.869387755102 - }, - "Security Studies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" - } - } - }, - { - "evaluation_name": "Sociology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Sociology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9, - "details": { - "description": "min=0.9, mean=0.9, max=0.9, sum=1.801 (2)", - "tab": "Accuracy", - "Sociology - Observed inference time (s)": { - "description": "min=2.204, mean=2.204, max=2.204, sum=4.409 (2)", - "tab": "Efficiency", - "score": 2.2043708201071515 - }, - "Sociology - # eval": { - "description": "min=201, mean=201, max=201, sum=402 (2)", - "tab": "General information", - "score": 201.0 - }, - "Sociology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Sociology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Sociology - # prompt tokens": { - "description": "min=448.274, mean=448.274, max=448.274, sum=896.547 (2)", - "tab": "General information", - "score": 448.27363184079604 - }, - "Sociology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" - } - } - }, - { - "evaluation_name": "Virology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Virology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.56, - "details": { - "description": "min=0.56, mean=0.56, max=0.56, sum=1.12 (2)", - "tab": "Accuracy", - "Virology - Observed inference time (s)": { - "description": "min=1.75, mean=1.75, max=1.75, sum=3.499 (2)", - "tab": "Efficiency", - "score": 1.7496386393007026 - }, - "Virology - # eval": { - "description": "min=166, mean=166, max=166, sum=332 (2)", - "tab": "General information", - "score": 166.0 - }, - "Virology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Virology - truncated": { - 
"description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Virology - # prompt tokens": { - "description": "min=328.753, mean=328.753, max=328.753, sum=657.506 (2)", - "tab": "General information", - "score": 328.7530120481928 - }, - "Virology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" - } - } - }, - { - "evaluation_name": "World Religions", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on World Religions", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.924, - "details": { - "description": "min=0.924, mean=0.924, max=0.924, sum=1.848 (2)", - "tab": "Accuracy", - "World Religions - Observed inference time (s)": { - "description": "min=1.443, mean=1.443, max=1.443, sum=2.886 (2)", - "tab": "Efficiency", - "score": 1.443225710015548 - }, - "World Religions - # eval": { - "description": "min=171, mean=171, max=171, sum=342 (2)", - "tab": "General information", - "score": 171.0 - }, - "World Religions - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "World Religions - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "World Religions - # prompt tokens": { - "description": "min=260.164, mean=260.164, max=260.164, sum=520.327 (2)", - "tab": "General information", - "score": 260.1637426900585 - }, - "World Religions - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" - } - } - }, - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.05, - "details": { - "tab": "Efficiency" - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_mmlu/google/gemma-2-9b/53fe520f-4dbc-436a-b9d6-4a5067c30ebd.json b/data/helm_mmlu/google/gemma-2-9b/53fe520f-4dbc-436a-b9d6-4a5067c30ebd.json deleted file mode 100644 index 2007b06df..000000000 --- a/data/helm_mmlu/google/gemma-2-9b/53fe520f-4dbc-436a-b9d6-4a5067c30ebd.json +++ /dev/null @@ -1,3021 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/google_gemma-2-9b/1770835937.459157", - "retrieved_timestamp": "1770835937.459157", - "source_metadata": { - "source_name": "helm_mmlu", - "source_type": "documentation", - "source_organization_name": 
"crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gemma 2 9B", - "id": "google/gemma-2-9b", - "developer": "google", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "MMLU All Subjects", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU All Subjects", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.721, - "details": { - "description": "min=0.295, mean=0.721, max=0.953, sum=82.233 (114)", - "tab": "Accuracy", - "MMLU All Subjects - Observed inference time (s)": { - "description": "min=0.425, mean=0.901, max=3.986, sum=102.765 (114)", - "tab": "Efficiency", - "score": 0.9014510090022484 - }, - "MMLU All Subjects - # eval": { - "description": "min=100, mean=246.351, max=1534, sum=28084 (114)", - "tab": "General information", - "score": 246.35087719298247 - }, - "MMLU All Subjects - # train": { - "description": "min=5, mean=5, max=5, sum=570 (114)", - "tab": "General information", - "score": 5.0 - }, - "MMLU All Subjects - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (114)", - "tab": "General information", - "score": 0.0 - }, - "MMLU All Subjects - # prompt tokens": { - "description": "min=260.164, mean=624.617, max=2789.424, sum=71206.345 (114)", - "tab": "General information", - "score": 624.6170571214202 - }, - "MMLU All Subjects - # output tokens": { - "description": "min=1, mean=1, max=1, sum=114 (114)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - 
"mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] - } - } - }, - { - "evaluation_name": "Abstract Algebra", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Abstract Algebra", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4, - "details": { - "description": "min=0.4, mean=0.4, max=0.4, sum=0.8 (2)", - "tab": "Accuracy", - "Abstract Algebra - Observed inference time (s)": { - "description": "min=0.65, mean=0.65, max=0.65, sum=1.3 (2)", - "tab": "Efficiency", - "score": 0.6499301409721374 - }, - "Abstract Algebra - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Abstract Algebra - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Abstract Algebra - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Abstract Algebra - # prompt tokens": { - "description": "min=375.97, mean=375.97, max=375.97, sum=751.94 (2)", - "tab": "General information", - "score": 375.97 - }, - "Abstract Algebra - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" - } - } - }, - { - "evaluation_name": "Anatomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Anatomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.704, - "details": { - "description": "min=0.704, mean=0.704, max=0.704, sum=1.407 (2)", - 
"tab": "Accuracy", - "Anatomy - Observed inference time (s)": { - "description": "min=0.492, mean=0.492, max=0.492, sum=0.984 (2)", - "tab": "Efficiency", - "score": 0.491805742405079 - }, - "Anatomy - # eval": { - "description": "min=135, mean=135, max=135, sum=270 (2)", - "tab": "General information", - "score": 135.0 - }, - "Anatomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Anatomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Anatomy - # prompt tokens": { - "description": "min=336.356, mean=336.356, max=336.356, sum=672.711 (2)", - "tab": "General information", - "score": 336.35555555555555 - }, - "Anatomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" - } - } - }, - { - "evaluation_name": "College Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on College Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5, - "details": { - "description": "min=0.5, mean=0.5, max=0.5, sum=1 (2)", - "tab": "Accuracy", - "College Chemistry - Observed inference time (s)": { - "description": "min=0.711, mean=0.711, max=0.711, sum=1.423 (2)", - "tab": "Efficiency", - "score": 0.7114056801795959 - }, - "College Biology - Observed inference time (s)": { - "description": "min=0.624, mean=0.624, max=0.624, sum=1.248 (2)", - "tab": "Efficiency", - "score": 0.6241771280765533 - }, - "College Computer Science - Observed inference time (s)": { - "description": "min=1.093, mean=1.093, max=1.093, sum=2.187 (2)", - "tab": "Efficiency", - "score": 1.0932785439491273 - }, - "College Mathematics - Observed inference time (s)": { - "description": "min=0.803, mean=0.803, max=0.803, sum=1.606 (2)", - "tab": "Efficiency", - "score": 0.8027684283256531 - }, - "College Medicine - Observed inference time (s)": { - "description": "min=0.674, mean=0.674, max=0.674, sum=1.348 (2)", - "tab": "Efficiency", - "score": 0.6739495985769812 - }, - "College Physics - Observed inference time (s)": { - "description": "min=0.655, mean=0.655, max=0.655, sum=1.311 (2)", - "tab": "Efficiency", - "score": 0.6553734166949403 - }, - "College Chemistry - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Chemistry - # prompt tokens": { - "description": "min=562.02, mean=562.02, max=562.02, sum=1124.04 (2)", - "tab": "General information", - "score": 562.02 - }, - "College Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Biology - # eval": { - "description": "min=144, mean=144, 
max=144, sum=288 (2)", - "tab": "General information", - "score": 144.0 - }, - "College Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # prompt tokens": { - "description": "min=474.799, mean=474.799, max=474.799, sum=949.597 (2)", - "tab": "General information", - "score": 474.7986111111111 - }, - "College Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer Science - # prompt tokens": { - "description": "min=849.86, mean=849.86, max=849.86, sum=1699.72 (2)", - "tab": "General information", - "score": 849.86 - }, - "College Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Mathematics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # prompt tokens": { - "description": "min=618.69, mean=618.69, max=618.69, sum=1237.38 (2)", - "tab": "General information", - "score": 618.69 - }, - "College Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Medicine - # eval": { - "description": "min=173, mean=173, max=173, sum=346 (2)", - "tab": "General information", - "score": 173.0 - }, - "College Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Medicine - # prompt tokens": { - "description": "min=505.37, mean=505.37, max=505.37, sum=1010.74 (2)", - "tab": "General information", - "score": 505.3699421965318 - }, - "College Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Physics - # eval": { - "description": "min=102, mean=102, max=102, sum=204 (2)", - "tab": "General information", - "score": 102.0 - }, - "College Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Physics - # prompt tokens": { - "description": "min=499.471, mean=499.471, max=499.471, sum=998.941 (2)", - "tab": "General information", - 
"score": 499.47058823529414 - }, - "College Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" - } - } - }, - { - "evaluation_name": "Computer Security", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Computer Security", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.81, - "details": { - "description": "min=0.81, mean=0.81, max=0.81, sum=1.62 (2)", - "tab": "Accuracy", - "Computer Security - Observed inference time (s)": { - "description": "min=0.464, mean=0.464, max=0.464, sum=0.928 (2)", - "tab": "Efficiency", - "score": 0.4640101146697998 - }, - "Computer Security - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Computer Security - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Computer Security - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Computer Security - # prompt tokens": { - "description": "min=372.91, mean=372.91, max=372.91, sum=745.82 (2)", - "tab": "General information", - "score": 372.91 - }, - "Computer Security - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" - } - } - }, - { - "evaluation_name": "Econometrics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Econometrics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.579, - "details": { - "description": "min=0.579, mean=0.579, max=0.579, sum=1.158 (2)", - "tab": "Accuracy", - "Econometrics - Observed inference time (s)": { - "description": "min=0.766, mean=0.766, max=0.766, sum=1.531 (2)", - "tab": "Efficiency", - "score": 0.7655813254808125 - }, - "Econometrics - # eval": { - "description": "min=114, mean=114, max=114, sum=228 (2)", - "tab": "General information", - "score": 114.0 - }, - "Econometrics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Econometrics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Econometrics - # prompt tokens": { - "description": "min=626.553, mean=626.553, max=626.553, sum=1253.105 (2)", - "tab": "General information", - "score": 626.5526315789474 - }, - "Econometrics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 
1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" - } - } - }, - { - "evaluation_name": "Global Facts", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Global Facts", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.53, - "details": { - "description": "min=0.53, mean=0.53, max=0.53, sum=1.06 (2)", - "tab": "Accuracy", - "Global Facts - Observed inference time (s)": { - "description": "min=0.542, mean=0.542, max=0.542, sum=1.084 (2)", - "tab": "Efficiency", - "score": 0.5422105526924134 - }, - "Global Facts - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Global Facts - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Global Facts - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Global Facts - # prompt tokens": { - "description": "min=448.54, mean=448.54, max=448.54, sum=897.08 (2)", - "tab": "General information", - "score": 448.54 - }, - "Global Facts - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" - } - } - }, - { - "evaluation_name": "Jurisprudence", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Jurisprudence", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.833, - "details": { - "description": "min=0.833, mean=0.833, max=0.833, sum=1.667 (2)", - "tab": "Accuracy", - "Jurisprudence - Observed inference time (s)": { - "description": "min=0.496, mean=0.496, max=0.496, sum=0.991 (2)", - "tab": "Efficiency", - "score": 0.4956528963866057 - }, - "Jurisprudence - # eval": { - "description": "min=108, mean=108, max=108, sum=216 (2)", - "tab": "General information", - "score": 108.0 - }, - "Jurisprudence - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Jurisprudence - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Jurisprudence - # prompt tokens": { - "description": "min=399.87, mean=399.87, max=399.87, sum=799.741 (2)", - "tab": "General information", - "score": 399.8703703703704 - }, - "Jurisprudence - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" - } - } - }, - { - 
"evaluation_name": "Philosophy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Philosophy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.772, - "details": { - "description": "min=0.772, mean=0.772, max=0.772, sum=1.543 (2)", - "tab": "Accuracy", - "Philosophy - Observed inference time (s)": { - "description": "min=0.425, mean=0.425, max=0.425, sum=0.85 (2)", - "tab": "Efficiency", - "score": 0.4251678066621639 - }, - "Philosophy - # eval": { - "description": "min=311, mean=311, max=311, sum=622 (2)", - "tab": "General information", - "score": 311.0 - }, - "Philosophy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Philosophy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Philosophy - # prompt tokens": { - "description": "min=332.907, mean=332.907, max=332.907, sum=665.814 (2)", - "tab": "General information", - "score": 332.90675241157555 - }, - "Philosophy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" - } - } - }, - { - "evaluation_name": "Professional Psychology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Professional Psychology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.788, - "details": { - "description": "min=0.788, mean=0.788, max=0.788, sum=1.575 (2)", - "tab": "Accuracy", - "Professional Medicine - Observed inference time (s)": { - "description": "min=1.37, mean=1.37, max=1.37, sum=2.74 (2)", - "tab": "Efficiency", - "score": 1.3702202570789002 - }, - "Professional Accounting - Observed inference time (s)": { - "description": "min=1.128, mean=1.128, max=1.128, sum=2.255 (2)", - "tab": "Efficiency", - "score": 1.1277324375531352 - }, - "Professional Law - Observed inference time (s)": { - "description": "min=2.433, mean=2.433, max=2.433, sum=4.866 (2)", - "tab": "Efficiency", - "score": 2.433138657113564 - }, - "Professional Psychology - Observed inference time (s)": { - "description": "min=0.909, mean=0.909, max=0.909, sum=1.818 (2)", - "tab": "Efficiency", - "score": 0.9092130824631336 - }, - "Professional Medicine - # eval": { - "description": "min=272, mean=272, max=272, sum=544 (2)", - "tab": "General information", - "score": 272.0 - }, - "Professional Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Medicine - # prompt tokens": { - "description": "min=1105.092, mean=1105.092, max=1105.092, sum=2210.184 (2)", - "tab": 
"General information", - "score": 1105.0919117647059 - }, - "Professional Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Accounting - # eval": { - "description": "min=282, mean=282, max=282, sum=564 (2)", - "tab": "General information", - "score": 282.0 - }, - "Professional Accounting - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Accounting - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Accounting - # prompt tokens": { - "description": "min=747.418, mean=747.418, max=747.418, sum=1494.837 (2)", - "tab": "General information", - "score": 747.418439716312 - }, - "Professional Accounting - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Law - # eval": { - "description": "min=1534, mean=1534, max=1534, sum=3068 (2)", - "tab": "General information", - "score": 1534.0 - }, - "Professional Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # prompt tokens": { - "description": "min=1677.119, mean=1677.119, max=1677.119, sum=3354.239 (2)", - "tab": "General information", - "score": 1677.119295958279 - }, - "Professional Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Psychology - # eval": { - "description": "min=612, mean=612, max=612, sum=1224 (2)", - "tab": "General information", - "score": 612.0 - }, - "Professional Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # prompt tokens": { - "description": "min=586.363, mean=586.363, max=586.363, sum=1172.725 (2)", - "tab": "General information", - "score": 586.3627450980392 - }, - "Professional Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" - } - } - }, - { - "evaluation_name": "Us Foreign Policy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Us Foreign Policy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9, - "details": { - "description": "min=0.9, mean=0.9, max=0.9, sum=1.8 (2)", - "tab": "Accuracy", - "Us Foreign Policy - Observed inference time (s)": { - "description": "min=0.544, mean=0.544, max=0.544, sum=1.088 (2)", - "tab": "Efficiency", - "score": 0.5438596844673157 - }, - "Us Foreign Policy - # eval": 
{ - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Us Foreign Policy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Us Foreign Policy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Us Foreign Policy - # prompt tokens": { - "description": "min=430.2, mean=430.2, max=430.2, sum=860.4 (2)", - "tab": "General information", - "score": 430.2 - }, - "Us Foreign Policy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" - } - } - }, - { - "evaluation_name": "Astronomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Astronomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.789, - "details": { - "description": "min=0.789, mean=0.789, max=0.789, sum=1.579 (2)", - "tab": "Accuracy", - "Astronomy - Observed inference time (s)": { - "description": "min=0.766, mean=0.766, max=0.766, sum=1.533 (2)", - "tab": "Efficiency", - "score": 0.7662546744472102 - }, - "Astronomy - # eval": { - "description": "min=152, mean=152, max=152, sum=304 (2)", - "tab": "General information", - "score": 152.0 - }, - "Astronomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Astronomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Astronomy - # prompt tokens": { - "description": "min=594.421, mean=594.421, max=594.421, sum=1188.842 (2)", - "tab": "General information", - "score": 594.421052631579 - }, - "Astronomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" - } - } - }, - { - "evaluation_name": "Business Ethics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Business Ethics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.77, - "details": { - "description": "min=0.77, mean=0.77, max=0.77, sum=1.54 (2)", - "tab": "Accuracy", - "Business Ethics - Observed inference time (s)": { - "description": "min=0.713, mean=0.713, max=0.713, sum=1.425 (2)", - "tab": "Efficiency", - "score": 0.7125983119010926 - }, - "Business Ethics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Business Ethics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General 
information", - "score": 5.0 - }, - "Business Ethics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Business Ethics - # prompt tokens": { - "description": "min=544.87, mean=544.87, max=544.87, sum=1089.74 (2)", - "tab": "General information", - "score": 544.87 - }, - "Business Ethics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" - } - } - }, - { - "evaluation_name": "Clinical Knowledge", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Clinical Knowledge", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.777, - "details": { - "description": "min=0.777, mean=0.777, max=0.777, sum=1.555 (2)", - "tab": "Accuracy", - "Clinical Knowledge - Observed inference time (s)": { - "description": "min=0.561, mean=0.561, max=0.561, sum=1.121 (2)", - "tab": "Efficiency", - "score": 0.5606130177119992 - }, - "Clinical Knowledge - # eval": { - "description": "min=265, mean=265, max=265, sum=530 (2)", - "tab": "General information", - "score": 265.0 - }, - "Clinical Knowledge - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Clinical Knowledge - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Clinical Knowledge - # prompt tokens": { - "description": "min=394.592, mean=394.592, max=394.592, sum=789.185 (2)", - "tab": "General information", - "score": 394.5924528301887 - }, - "Clinical Knowledge - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" - } - } - }, - { - "evaluation_name": "Conceptual Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Conceptual Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.732, - "details": { - "description": "min=0.732, mean=0.732, max=0.732, sum=1.464 (2)", - "tab": "Accuracy", - "Conceptual Physics - Observed inference time (s)": { - "description": "min=0.44, mean=0.44, max=0.44, sum=0.879 (2)", - "tab": "Efficiency", - "score": 0.4395242579439853 - }, - "Conceptual Physics - # eval": { - "description": "min=235, mean=235, max=235, sum=470 (2)", - "tab": "General information", - "score": 235.0 - }, - "Conceptual Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Conceptual Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 
(2)", - "tab": "General information", - "score": 0.0 - }, - "Conceptual Physics - # prompt tokens": { - "description": "min=301.213, mean=301.213, max=301.213, sum=602.426 (2)", - "tab": "General information", - "score": 301.21276595744683 - }, - "Conceptual Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" - } - } - }, - { - "evaluation_name": "Electrical Engineering", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Electrical Engineering", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.724, - "details": { - "description": "min=0.724, mean=0.724, max=0.724, sum=1.448 (2)", - "tab": "Accuracy", - "Electrical Engineering - Observed inference time (s)": { - "description": "min=0.621, mean=0.621, max=0.621, sum=1.242 (2)", - "tab": "Efficiency", - "score": 0.620852176074324 - }, - "Electrical Engineering - # eval": { - "description": "min=145, mean=145, max=145, sum=290 (2)", - "tab": "General information", - "score": 145.0 - }, - "Electrical Engineering - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Electrical Engineering - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Electrical Engineering - # prompt tokens": { - "description": "min=466.786, mean=466.786, max=466.786, sum=933.572 (2)", - "tab": "General information", - "score": 466.78620689655173 - }, - "Electrical Engineering - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" - } - } - }, - { - "evaluation_name": "Elementary Mathematics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Elementary Mathematics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.577, - "details": { - "description": "min=0.577, mean=0.577, max=0.577, sum=1.153 (2)", - "tab": "Accuracy", - "Elementary Mathematics - Observed inference time (s)": { - "description": "min=0.783, mean=0.783, max=0.783, sum=1.566 (2)", - "tab": "Efficiency", - "score": 0.7831445295343954 - }, - "Elementary Mathematics - # eval": { - "description": "min=378, mean=378, max=378, sum=756 (2)", - "tab": "General information", - "score": 378.0 - }, - "Elementary Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Elementary Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": 
"General information", - "score": 0.0 - }, - "Elementary Mathematics - # prompt tokens": { - "description": "min=589.341, mean=589.341, max=589.341, sum=1178.683 (2)", - "tab": "General information", - "score": 589.3412698412699 - }, - "Elementary Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" - } - } - }, - { - "evaluation_name": "Formal Logic", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Formal Logic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.492, - "details": { - "description": "min=0.492, mean=0.492, max=0.492, sum=0.984 (2)", - "tab": "Accuracy", - "Formal Logic - Observed inference time (s)": { - "description": "min=0.805, mean=0.805, max=0.805, sum=1.61 (2)", - "tab": "Efficiency", - "score": 0.804882182015313 - }, - "Formal Logic - # eval": { - "description": "min=126, mean=126, max=126, sum=252 (2)", - "tab": "General information", - "score": 126.0 - }, - "Formal Logic - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Formal Logic - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Formal Logic - # prompt tokens": { - "description": "min=611.563, mean=611.563, max=611.563, sum=1223.127 (2)", - "tab": "General information", - "score": 611.563492063492 - }, - "Formal Logic - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" - } - } - }, - { - "evaluation_name": "High School World History", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on High School World History", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.865, - "details": { - "description": "min=0.865, mean=0.865, max=0.865, sum=1.73 (2)", - "tab": "Accuracy", - "High School Biology - Observed inference time (s)": { - "description": "min=0.651, mean=0.651, max=0.651, sum=1.302 (2)", - "tab": "Efficiency", - "score": 0.6510615141161027 - }, - "High School Chemistry - Observed inference time (s)": { - "description": "min=0.66, mean=0.66, max=0.66, sum=1.32 (2)", - "tab": "Efficiency", - "score": 0.6597568284114593 - }, - "High School Computer Science - Observed inference time (s)": { - "description": "min=1.159, mean=1.159, max=1.159, sum=2.317 (2)", - "tab": "Efficiency", - "score": 1.1585216951370239 - }, - "High School European History - Observed inference time (s)": { - "description": "min=3.986, mean=3.986, max=3.986, sum=7.972 (2)", 
- "tab": "Efficiency", - "score": 3.9859177892858333 - }, - "High School Geography - Observed inference time (s)": { - "description": "min=0.638, mean=0.638, max=0.638, sum=1.276 (2)", - "tab": "Efficiency", - "score": 0.6379079361154576 - }, - "High School Government And Politics - Observed inference time (s)": { - "description": "min=0.719, mean=0.719, max=0.719, sum=1.438 (2)", - "tab": "Efficiency", - "score": 0.7190980182410521 - }, - "High School Macroeconomics - Observed inference time (s)": { - "description": "min=0.646, mean=0.646, max=0.646, sum=1.292 (2)", - "tab": "Efficiency", - "score": 0.6461667580482288 - }, - "High School Mathematics - Observed inference time (s)": { - "description": "min=0.889, mean=0.889, max=0.889, sum=1.778 (2)", - "tab": "Efficiency", - "score": 0.8891835009610212 - }, - "High School Microeconomics - Observed inference time (s)": { - "description": "min=0.682, mean=0.682, max=0.682, sum=1.364 (2)", - "tab": "Efficiency", - "score": 0.6818269651477077 - }, - "High School Physics - Observed inference time (s)": { - "description": "min=0.905, mean=0.905, max=0.905, sum=1.81 (2)", - "tab": "Efficiency", - "score": 0.9050559808086875 - }, - "High School Psychology - Observed inference time (s)": { - "description": "min=0.746, mean=0.746, max=0.746, sum=1.491 (2)", - "tab": "Efficiency", - "score": 0.7455598682438561 - }, - "High School Statistics - Observed inference time (s)": { - "description": "min=1.279, mean=1.279, max=1.279, sum=2.558 (2)", - "tab": "Efficiency", - "score": 1.278907789124383 - }, - "High School US History - Observed inference time (s)": { - "description": "min=3.106, mean=3.106, max=3.106, sum=6.212 (2)", - "tab": "Efficiency", - "score": 3.1062067454936457 - }, - "High School World History - Observed inference time (s)": { - "description": "min=2.068, mean=2.068, max=2.068, sum=4.137 (2)", - "tab": "Efficiency", - "score": 2.0682604393375574 - }, - "High School Biology - # eval": { - "description": "min=310, mean=310, max=310, sum=620 (2)", - "tab": "General information", - "score": 310.0 - }, - "High School Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Biology - # prompt tokens": { - "description": "min=492.958, mean=492.958, max=492.958, sum=985.916 (2)", - "tab": "General information", - "score": 492.958064516129 - }, - "High School Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Chemistry - # eval": { - "description": "min=203, mean=203, max=203, sum=406 (2)", - "tab": "General information", - "score": 203.0 - }, - "High School Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # prompt tokens": { - "description": "min=505.064, mean=505.064, max=505.064, sum=1010.128 (2)", - "tab": "General information", - "score": 505.064039408867 - }, - "High School Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Computer Science - # eval": { - "description": 
"min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "High School Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # prompt tokens": { - "description": "min=927.13, mean=927.13, max=927.13, sum=1854.26 (2)", - "tab": "General information", - "score": 927.13 - }, - "High School Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School European History - # eval": { - "description": "min=165, mean=165, max=165, sum=330 (2)", - "tab": "General information", - "score": 165.0 - }, - "High School European History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School European History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School European History - # prompt tokens": { - "description": "min=2789.424, mean=2789.424, max=2789.424, sum=5578.848 (2)", - "tab": "General information", - "score": 2789.4242424242425 - }, - "High School European History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Geography - # eval": { - "description": "min=198, mean=198, max=198, sum=396 (2)", - "tab": "General information", - "score": 198.0 - }, - "High School Geography - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Geography - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Geography - # prompt tokens": { - "description": "min=386.773, mean=386.773, max=386.773, sum=773.545 (2)", - "tab": "General information", - "score": 386.77272727272725 - }, - "High School Geography - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Government And Politics - # eval": { - "description": "min=193, mean=193, max=193, sum=386 (2)", - "tab": "General information", - "score": 193.0 - }, - "High School Government And Politics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Government And Politics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # prompt tokens": { - "description": "min=471.301, mean=471.301, max=471.301, sum=942.601 (2)", - "tab": "General information", - "score": 471.30051813471505 - }, - "High School Government And Politics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Macroeconomics - # eval": { - "description": "min=390, mean=390, max=390, sum=780 (2)", - "tab": "General information", - "score": 390.0 - }, - "High School Macroeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Macroeconomics - truncated": { - 
"description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - # prompt tokens": { - "description": "min=388.541, mean=388.541, max=388.541, sum=777.082 (2)", - "tab": "General information", - "score": 388.54102564102567 - }, - "High School Macroeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Mathematics - # eval": { - "description": "min=270, mean=270, max=270, sum=540 (2)", - "tab": "General information", - "score": 270.0 - }, - "High School Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # prompt tokens": { - "description": "min=558.822, mean=558.822, max=558.822, sum=1117.644 (2)", - "tab": "General information", - "score": 558.8222222222222 - }, - "High School Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Microeconomics - # eval": { - "description": "min=238, mean=238, max=238, sum=476 (2)", - "tab": "General information", - "score": 238.0 - }, - "High School Microeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Microeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Microeconomics - # prompt tokens": { - "description": "min=407.954, mean=407.954, max=407.954, sum=815.908 (2)", - "tab": "General information", - "score": 407.953781512605 - }, - "High School Microeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Physics - # eval": { - "description": "min=151, mean=151, max=151, sum=302 (2)", - "tab": "General information", - "score": 151.0 - }, - "High School Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # prompt tokens": { - "description": "min=583.715, mean=583.715, max=583.715, sum=1167.43 (2)", - "tab": "General information", - "score": 583.7152317880794 - }, - "High School Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Psychology - # eval": { - "description": "min=545, mean=545, max=545, sum=1090 (2)", - "tab": "General information", - "score": 545.0 - }, - "High School Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # prompt tokens": { - "description": "min=494.604, mean=494.604, max=494.604, sum=989.207 (2)", - "tab": "General information", - "score": 494.60366972477067 - }, - "High School Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 
(2)", - "tab": "General information", - "score": 1.0 - }, - "High School Statistics - # eval": { - "description": "min=216, mean=216, max=216, sum=432 (2)", - "tab": "General information", - "score": 216.0 - }, - "High School Statistics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Statistics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Statistics - # prompt tokens": { - "description": "min=850.931, mean=850.931, max=850.931, sum=1701.861 (2)", - "tab": "General information", - "score": 850.9305555555555 - }, - "High School Statistics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School US History - # eval": { - "description": "min=204, mean=204, max=204, sum=408 (2)", - "tab": "General information", - "score": 204.0 - }, - "High School US History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School US History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # prompt tokens": { - "description": "min=2197.583, mean=2197.583, max=2197.583, sum=4395.167 (2)", - "tab": "General information", - "score": 2197.5833333333335 - }, - "High School US History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School World History - # eval": { - "description": "min=237, mean=237, max=237, sum=474 (2)", - "tab": "General information", - "score": 237.0 - }, - "High School World History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School World History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School World History - # prompt tokens": { - "description": "min=1418.544, mean=1418.544, max=1418.544, sum=2837.089 (2)", - "tab": "General information", - "score": 1418.5443037974683 - }, - "High School World History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" - } - } - }, - { - "evaluation_name": "Human Sexuality", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Human Sexuality", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.809, - "details": { - "description": "min=0.809, mean=0.809, max=0.809, sum=1.618 (2)", - "tab": "Accuracy", - "Human Aging - Observed inference time (s)": { - "description": "min=0.548, mean=0.548, max=0.548, sum=1.095 (2)", - "tab": "Efficiency", - "score": 0.5475642894950148 - }, - "Human Sexuality - Observed inference time (s)": { - "description": "min=0.564, mean=0.564, max=0.564, sum=1.129 (2)", - "tab": "Efficiency", 
- "score": 0.5644530576604013 - }, - "Human Aging - # eval": { - "description": "min=223, mean=223, max=223, sum=446 (2)", - "tab": "General information", - "score": 223.0 - }, - "Human Aging - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Aging - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Aging - # prompt tokens": { - "description": "min=313.587, mean=313.587, max=313.587, sum=627.175 (2)", - "tab": "General information", - "score": 313.58744394618833 - }, - "Human Aging - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Human Sexuality - # eval": { - "description": "min=131, mean=131, max=131, sum=262 (2)", - "tab": "General information", - "score": 131.0 - }, - "Human Sexuality - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Sexuality - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # prompt tokens": { - "description": "min=339.183, mean=339.183, max=339.183, sum=678.366 (2)", - "tab": "General information", - "score": 339.1832061068702 - }, - "Human Sexuality - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" - } - } - }, - { - "evaluation_name": "International Law", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on International Law", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.835, - "details": { - "description": "min=0.835, mean=0.835, max=0.835, sum=1.669 (2)", - "tab": "Accuracy", - "International Law - Observed inference time (s)": { - "description": "min=0.956, mean=0.956, max=0.956, sum=1.911 (2)", - "tab": "Efficiency", - "score": 0.9556485384948983 - }, - "International Law - # eval": { - "description": "min=121, mean=121, max=121, sum=242 (2)", - "tab": "General information", - "score": 121.0 - }, - "International Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "International Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "International Law - # prompt tokens": { - "description": "min=636.165, mean=636.165, max=636.165, sum=1272.331 (2)", - "tab": "General information", - "score": 636.1652892561983 - }, - "International Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" - } - } - }, - { - "evaluation_name": "Logical Fallacies", - "source_data": { - "dataset_name": "helm_mmlu", - 
"source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Logical Fallacies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.816, - "details": { - "description": "min=0.816, mean=0.816, max=0.816, sum=1.632 (2)", - "tab": "Accuracy", - "Logical Fallacies - Observed inference time (s)": { - "description": "min=0.699, mean=0.699, max=0.699, sum=1.398 (2)", - "tab": "Efficiency", - "score": 0.6992296397320332 - }, - "Logical Fallacies - # eval": { - "description": "min=163, mean=163, max=163, sum=326 (2)", - "tab": "General information", - "score": 163.0 - }, - "Logical Fallacies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Logical Fallacies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Logical Fallacies - # prompt tokens": { - "description": "min=442.049, mean=442.049, max=442.049, sum=884.098 (2)", - "tab": "General information", - "score": 442.0490797546012 - }, - "Logical Fallacies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" - } - } - }, - { - "evaluation_name": "Machine Learning", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Machine Learning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.509, - "details": { - "description": "min=0.509, mean=0.509, max=0.509, sum=1.018 (2)", - "tab": "Accuracy", - "Machine Learning - Observed inference time (s)": { - "description": "min=1.048, mean=1.048, max=1.048, sum=2.096 (2)", - "tab": "Efficiency", - "score": 1.0480207417692458 - }, - "Machine Learning - # eval": { - "description": "min=112, mean=112, max=112, sum=224 (2)", - "tab": "General information", - "score": 112.0 - }, - "Machine Learning - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Machine Learning - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Machine Learning - # prompt tokens": { - "description": "min=694.402, mean=694.402, max=694.402, sum=1388.804 (2)", - "tab": "General information", - "score": 694.4017857142857 - }, - "Machine Learning - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" - } - } - }, - { - "evaluation_name": "Management", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Management", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.874, - "details": { - "description": "min=0.874, mean=0.874, max=0.874, sum=1.748 (2)", - "tab": "Accuracy", - "Management - Observed inference time (s)": { - "description": "min=0.509, mean=0.509, max=0.509, sum=1.019 (2)", - "tab": "Efficiency", - "score": 0.5093999186765801 - }, - "Management - # eval": { - "description": "min=103, mean=103, max=103, sum=206 (2)", - "tab": "General information", - "score": 103.0 - }, - "Management - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Management - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Management - # prompt tokens": { - "description": "min=273.301, mean=273.301, max=273.301, sum=546.602 (2)", - "tab": "General information", - "score": 273.3009708737864 - }, - "Management - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" - } - } - }, - { - "evaluation_name": "Marketing", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Marketing", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.919, - "details": { - "description": "min=0.919, mean=0.919, max=0.919, sum=1.838 (2)", - "tab": "Accuracy", - "Marketing - Observed inference time (s)": { - "description": "min=0.697, mean=0.697, max=0.697, sum=1.394 (2)", - "tab": "Efficiency", - "score": 0.6969545549816556 - }, - "Marketing - # eval": { - "description": "min=234, mean=234, max=234, sum=468 (2)", - "tab": "General information", - "score": 234.0 - }, - "Marketing - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Marketing - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Marketing - # prompt tokens": { - "description": "min=420.35, mean=420.35, max=420.35, sum=840.701 (2)", - "tab": "General information", - "score": 420.35042735042737 - }, - "Marketing - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" - } - } - }, - { - "evaluation_name": "Medical Genetics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Medical Genetics", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.84, - "details": { - "description": "min=0.84, mean=0.84, max=0.84, sum=1.68 (2)", - "tab": "Accuracy", - "Medical Genetics - Observed inference time (s)": { - "description": "min=0.534, mean=0.534, max=0.534, sum=1.067 (2)", - "tab": "Efficiency", - "score": 0.5335883450508118 - }, - "Medical Genetics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Medical Genetics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Medical Genetics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Medical Genetics - # prompt tokens": { - "description": "min=330.89, mean=330.89, max=330.89, sum=661.78 (2)", - "tab": "General information", - "score": 330.89 - }, - "Medical Genetics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" - } - } - }, - { - "evaluation_name": "Miscellaneous", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Miscellaneous", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.844, - "details": { - "description": "min=0.844, mean=0.844, max=0.844, sum=1.688 (2)", - "tab": "Accuracy", - "Miscellaneous - Observed inference time (s)": { - "description": "min=0.536, mean=0.536, max=0.536, sum=1.073 (2)", - "tab": "Efficiency", - "score": 0.5363688258832442 - }, - "Miscellaneous - # eval": { - "description": "min=783, mean=783, max=783, sum=1566 (2)", - "tab": "General information", - "score": 783.0 - }, - "Miscellaneous - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Miscellaneous - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Miscellaneous - # prompt tokens": { - "description": "min=306.669, mean=306.669, max=306.669, sum=613.338 (2)", - "tab": "General information", - "score": 306.669220945083 - }, - "Miscellaneous - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" - } - } - }, - { - "evaluation_name": "Moral Scenarios", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Moral Scenarios", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.295, - "details": { - "description": "min=0.295, mean=0.295, max=0.295, sum=0.59 (2)", 
- "tab": "Accuracy", - "Moral Disputes - Observed inference time (s)": { - "description": "min=0.734, mean=0.734, max=0.734, sum=1.468 (2)", - "tab": "Efficiency", - "score": 0.7340341696160377 - }, - "Moral Scenarios - Observed inference time (s)": { - "description": "min=1.057, mean=1.057, max=1.057, sum=2.114 (2)", - "tab": "Efficiency", - "score": 1.0570912433070176 - }, - "Moral Disputes - # eval": { - "description": "min=346, mean=346, max=346, sum=692 (2)", - "tab": "General information", - "score": 346.0 - }, - "Moral Disputes - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Disputes - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Disputes - # prompt tokens": { - "description": "min=487.003, mean=487.003, max=487.003, sum=974.006 (2)", - "tab": "General information", - "score": 487.0028901734104 - }, - "Moral Disputes - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Moral Scenarios - # eval": { - "description": "min=895, mean=895, max=895, sum=1790 (2)", - "tab": "General information", - "score": 895.0 - }, - "Moral Scenarios - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Scenarios - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # prompt tokens": { - "description": "min=682.542, mean=682.542, max=682.542, sum=1365.084 (2)", - "tab": "General information", - "score": 682.5418994413408 - }, - "Moral Scenarios - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" - } - } - }, - { - "evaluation_name": "Nutrition", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Nutrition", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.775, - "details": { - "description": "min=0.775, mean=0.775, max=0.775, sum=1.549 (2)", - "tab": "Accuracy", - "Nutrition - Observed inference time (s)": { - "description": "min=0.889, mean=0.889, max=0.889, sum=1.779 (2)", - "tab": "Efficiency", - "score": 0.8894402412028094 - }, - "Nutrition - # eval": { - "description": "min=306, mean=306, max=306, sum=612 (2)", - "tab": "General information", - "score": 306.0 - }, - "Nutrition - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Nutrition - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Nutrition - # prompt tokens": { - "description": "min=577.48, mean=577.48, max=577.48, sum=1154.961 (2)", - "tab": "General information", - "score": 577.4803921568628 - }, - "Nutrition - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - 
"generation_config": { - "additional_details": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" - } - } - }, - { - "evaluation_name": "Prehistory", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Prehistory", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.812, - "details": { - "description": "min=0.812, mean=0.812, max=0.812, sum=1.623 (2)", - "tab": "Accuracy", - "Prehistory - Observed inference time (s)": { - "description": "min=0.846, mean=0.846, max=0.846, sum=1.691 (2)", - "tab": "Efficiency", - "score": 0.8456013467576768 - }, - "Prehistory - # eval": { - "description": "min=324, mean=324, max=324, sum=648 (2)", - "tab": "General information", - "score": 324.0 - }, - "Prehistory - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Prehistory - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Prehistory - # prompt tokens": { - "description": "min=532.198, mean=532.198, max=532.198, sum=1064.395 (2)", - "tab": "General information", - "score": 532.1975308641976 - }, - "Prehistory - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" - } - } - }, - { - "evaluation_name": "Public Relations", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Public Relations", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.736, - "details": { - "description": "min=0.736, mean=0.736, max=0.736, sum=1.473 (2)", - "tab": "Accuracy", - "Public Relations - Observed inference time (s)": { - "description": "min=0.698, mean=0.698, max=0.698, sum=1.395 (2)", - "tab": "Efficiency", - "score": 0.6977464697577737 - }, - "Public Relations - # eval": { - "description": "min=110, mean=110, max=110, sum=220 (2)", - "tab": "General information", - "score": 110.0 - }, - "Public Relations - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Public Relations - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Public Relations - # prompt tokens": { - "description": "min=418.655, mean=418.655, max=418.655, sum=837.309 (2)", - "tab": "General information", - "score": 418.6545454545454 - }, - "Public Relations - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" - } - } - }, - { - 
"evaluation_name": "Security Studies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Security Studies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.78, - "details": { - "description": "min=0.78, mean=0.78, max=0.78, sum=1.559 (2)", - "tab": "Accuracy", - "Security Studies - Observed inference time (s)": { - "description": "min=1.737, mean=1.737, max=1.737, sum=3.473 (2)", - "tab": "Efficiency", - "score": 1.7365190982818604 - }, - "Security Studies - # eval": { - "description": "min=245, mean=245, max=245, sum=490 (2)", - "tab": "General information", - "score": 245.0 - }, - "Security Studies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Security Studies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Security Studies - # prompt tokens": { - "description": "min=1185.869, mean=1185.869, max=1185.869, sum=2371.739 (2)", - "tab": "General information", - "score": 1185.869387755102 - }, - "Security Studies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" - } - } - }, - { - "evaluation_name": "Sociology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Sociology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9, - "details": { - "description": "min=0.9, mean=0.9, max=0.9, sum=1.801 (2)", - "tab": "Accuracy", - "Sociology - Observed inference time (s)": { - "description": "min=0.712, mean=0.712, max=0.712, sum=1.423 (2)", - "tab": "Efficiency", - "score": 0.7115461138350454 - }, - "Sociology - # eval": { - "description": "min=201, mean=201, max=201, sum=402 (2)", - "tab": "General information", - "score": 201.0 - }, - "Sociology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Sociology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Sociology - # prompt tokens": { - "description": "min=448.274, mean=448.274, max=448.274, sum=896.547 (2)", - "tab": "General information", - "score": 448.27363184079604 - }, - "Sociology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" - } - } - }, - { - "evaluation_name": "Virology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Virology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.53, - "details": { - "description": "min=0.53, mean=0.53, max=0.53, sum=1.06 (2)", - "tab": "Accuracy", - "Virology - Observed inference time (s)": { - "description": "min=0.571, mean=0.571, max=0.571, sum=1.142 (2)", - "tab": "Efficiency", - "score": 0.571121395352375 - }, - "Virology - # eval": { - "description": "min=166, mean=166, max=166, sum=332 (2)", - "tab": "General information", - "score": 166.0 - }, - "Virology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Virology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Virology - # prompt tokens": { - "description": "min=328.753, mean=328.753, max=328.753, sum=657.506 (2)", - "tab": "General information", - "score": 328.7530120481928 - }, - "Virology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" - } - } - }, - { - "evaluation_name": "World Religions", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on World Religions", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.86, - "details": { - "description": "min=0.86, mean=0.86, max=0.86, sum=1.719 (2)", - "tab": "Accuracy", - "World Religions - Observed inference time (s)": { - "description": "min=0.448, mean=0.448, max=0.448, sum=0.895 (2)", - "tab": "Efficiency", - "score": 0.44760305142542073 - }, - "World Religions - # eval": { - "description": "min=171, mean=171, max=171, sum=342 (2)", - "tab": "General information", - "score": 171.0 - }, - "World Religions - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "World Religions - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "World Religions - # prompt tokens": { - "description": "min=260.164, mean=260.164, max=260.164, sum=520.327 (2)", - "tab": "General information", - "score": 260.1637426900585 - }, - "World Religions - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" - } - } - }, - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on 
average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.265, - "details": { - "tab": "Efficiency" - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_mmlu/google/gemma-7b/af88b02d-cb29-4d2c-bb33-5fddcf316a95.json b/data/helm_mmlu/google/gemma-7b/af88b02d-cb29-4d2c-bb33-5fddcf316a95.json deleted file mode 100644 index 963d13c9a..000000000 --- a/data/helm_mmlu/google/gemma-7b/af88b02d-cb29-4d2c-bb33-5fddcf316a95.json +++ /dev/null @@ -1,3021 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/google_gemma-7b/1770835937.459157", - "retrieved_timestamp": "1770835937.459157", - "source_metadata": { - "source_name": "helm_mmlu", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gemma 7B", - "id": "google/gemma-7b", - "developer": "google", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "MMLU All Subjects", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU All Subjects", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.661, - "details": { - "description": "min=0.28, mean=0.661, max=0.891, sum=75.376 (114)", - "tab": "Accuracy", - "MMLU All Subjects - Observed inference time (s)": { - "description": "min=0.238, mean=0.312, max=0.614, sum=35.566 (114)", - "tab": "Efficiency", - "score": 0.3119781121356026 - }, - "MMLU All Subjects - # eval": { - "description": "min=100, mean=246.351, max=1534, sum=28084 (114)", - "tab": "General information", - "score": 246.35087719298247 - }, - "MMLU All Subjects - # train": { - "description": "min=5, mean=5, max=5, sum=570 (114)", - "tab": "General information", - "score": 5.0 - }, - "MMLU All Subjects - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (114)", - "tab": "General information", - "score": 0.0 - }, - "MMLU All Subjects - # prompt tokens": { - "description": "min=260.164, mean=624.617, max=2789.424, sum=71206.345 (114)", - "tab": "General information", - "score": 624.6170571214202 - }, - "MMLU All Subjects - # output tokens": { - "description": "min=1, mean=1, max=1, sum=114 (114)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - 
"high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] - } - } - }, - { - "evaluation_name": "Abstract Algebra", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Abstract Algebra", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.28, - "details": { - "description": "min=0.28, mean=0.28, max=0.28, sum=0.56 (2)", - "tab": "Accuracy", - "Abstract Algebra - Observed inference time (s)": { - "description": "min=0.271, mean=0.271, max=0.271, sum=0.543 (2)", - "tab": "Efficiency", - "score": 0.27131984949111937 - }, - "Abstract Algebra - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Abstract Algebra - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Abstract Algebra - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Abstract Algebra - # prompt tokens": { - "description": "min=375.97, mean=375.97, max=375.97, sum=751.94 (2)", - "tab": "General 
information", - "score": 375.97 - }, - "Abstract Algebra - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" - } - } - }, - { - "evaluation_name": "Anatomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Anatomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.563, - "details": { - "description": "min=0.563, mean=0.563, max=0.563, sum=1.126 (2)", - "tab": "Accuracy", - "Anatomy - Observed inference time (s)": { - "description": "min=0.294, mean=0.294, max=0.294, sum=0.587 (2)", - "tab": "Efficiency", - "score": 0.2935627672407362 - }, - "Anatomy - # eval": { - "description": "min=135, mean=135, max=135, sum=270 (2)", - "tab": "General information", - "score": 135.0 - }, - "Anatomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Anatomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Anatomy - # prompt tokens": { - "description": "min=336.356, mean=336.356, max=336.356, sum=672.711 (2)", - "tab": "General information", - "score": 336.35555555555555 - }, - "Anatomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" - } - } - }, - { - "evaluation_name": "College Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on College Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.412, - "details": { - "description": "min=0.412, mean=0.412, max=0.412, sum=0.824 (2)", - "tab": "Accuracy", - "College Chemistry - Observed inference time (s)": { - "description": "min=0.267, mean=0.267, max=0.267, sum=0.534 (2)", - "tab": "Efficiency", - "score": 0.26709758281707763 - }, - "College Biology - Observed inference time (s)": { - "description": "min=0.296, mean=0.296, max=0.296, sum=0.592 (2)", - "tab": "Efficiency", - "score": 0.2961096896065606 - }, - "College Computer Science - Observed inference time (s)": { - "description": "min=0.29, mean=0.29, max=0.29, sum=0.58 (2)", - "tab": "Efficiency", - "score": 0.2900628304481506 - }, - "College Mathematics - Observed inference time (s)": { - "description": "min=0.299, mean=0.299, max=0.299, sum=0.598 (2)", - "tab": "Efficiency", - "score": 0.298998281955719 - }, - "College Medicine - Observed inference time (s)": { - "description": "min=0.255, mean=0.255, max=0.255, sum=0.51 (2)", - "tab": "Efficiency", - "score": 0.25478591119622906 - }, - "College Physics - Observed inference time (s)": { - 
"description": "min=0.614, mean=0.614, max=0.614, sum=1.229 (2)", - "tab": "Efficiency", - "score": 0.614474796781353 - }, - "College Chemistry - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Chemistry - # prompt tokens": { - "description": "min=562.02, mean=562.02, max=562.02, sum=1124.04 (2)", - "tab": "General information", - "score": 562.02 - }, - "College Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Biology - # eval": { - "description": "min=144, mean=144, max=144, sum=288 (2)", - "tab": "General information", - "score": 144.0 - }, - "College Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # prompt tokens": { - "description": "min=474.799, mean=474.799, max=474.799, sum=949.597 (2)", - "tab": "General information", - "score": 474.7986111111111 - }, - "College Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer Science - # prompt tokens": { - "description": "min=849.86, mean=849.86, max=849.86, sum=1699.72 (2)", - "tab": "General information", - "score": 849.86 - }, - "College Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Mathematics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # prompt tokens": { - "description": "min=618.69, mean=618.69, max=618.69, sum=1237.38 (2)", - "tab": "General information", - "score": 618.69 - }, - "College Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Medicine - # eval": { - "description": "min=173, mean=173, max=173, sum=346 (2)", - "tab": "General information", - "score": 173.0 - }, - "College Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General 
information", - "score": 0.0 - }, - "College Medicine - # prompt tokens": { - "description": "min=505.37, mean=505.37, max=505.37, sum=1010.74 (2)", - "tab": "General information", - "score": 505.3699421965318 - }, - "College Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Physics - # eval": { - "description": "min=102, mean=102, max=102, sum=204 (2)", - "tab": "General information", - "score": 102.0 - }, - "College Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Physics - # prompt tokens": { - "description": "min=499.471, mean=499.471, max=499.471, sum=998.941 (2)", - "tab": "General information", - "score": 499.47058823529414 - }, - "College Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" - } - } - }, - { - "evaluation_name": "Computer Security", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Computer Security", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.75, - "details": { - "description": "min=0.75, mean=0.75, max=0.75, sum=1.5 (2)", - "tab": "Accuracy", - "Computer Security - Observed inference time (s)": { - "description": "min=0.251, mean=0.251, max=0.251, sum=0.503 (2)", - "tab": "Efficiency", - "score": 0.2512932848930359 - }, - "Computer Security - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Computer Security - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Computer Security - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Computer Security - # prompt tokens": { - "description": "min=372.91, mean=372.91, max=372.91, sum=745.82 (2)", - "tab": "General information", - "score": 372.91 - }, - "Computer Security - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" - } - } - }, - { - "evaluation_name": "Econometrics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Econometrics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.474, - "details": { - "description": "min=0.474, mean=0.474, max=0.474, 
sum=0.947 (2)", - "tab": "Accuracy", - "Econometrics - Observed inference time (s)": { - "description": "min=0.285, mean=0.285, max=0.285, sum=0.569 (2)", - "tab": "Efficiency", - "score": 0.28468057565521776 - }, - "Econometrics - # eval": { - "description": "min=114, mean=114, max=114, sum=228 (2)", - "tab": "General information", - "score": 114.0 - }, - "Econometrics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Econometrics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Econometrics - # prompt tokens": { - "description": "min=626.553, mean=626.553, max=626.553, sum=1253.105 (2)", - "tab": "General information", - "score": 626.5526315789474 - }, - "Econometrics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" - } - } - }, - { - "evaluation_name": "Global Facts", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Global Facts", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.42, - "details": { - "description": "min=0.42, mean=0.42, max=0.42, sum=0.84 (2)", - "tab": "Accuracy", - "Global Facts - Observed inference time (s)": { - "description": "min=0.296, mean=0.296, max=0.296, sum=0.591 (2)", - "tab": "Efficiency", - "score": 0.2956829309463501 - }, - "Global Facts - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Global Facts - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Global Facts - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Global Facts - # prompt tokens": { - "description": "min=448.54, mean=448.54, max=448.54, sum=897.08 (2)", - "tab": "General information", - "score": 448.54 - }, - "Global Facts - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" - } - } - }, - { - "evaluation_name": "Jurisprudence", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Jurisprudence", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.769, - "details": { - "description": "min=0.769, mean=0.769, max=0.769, sum=1.537 (2)", - "tab": "Accuracy", - "Jurisprudence - Observed inference time (s)": { - "description": "min=0.26, mean=0.26, max=0.26, sum=0.521 (2)", - "tab": "Efficiency", - "score": 0.26035096910264754 - }, - 
"Jurisprudence - # eval": { - "description": "min=108, mean=108, max=108, sum=216 (2)", - "tab": "General information", - "score": 108.0 - }, - "Jurisprudence - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Jurisprudence - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Jurisprudence - # prompt tokens": { - "description": "min=399.87, mean=399.87, max=399.87, sum=799.741 (2)", - "tab": "General information", - "score": 399.8703703703704 - }, - "Jurisprudence - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" - } - } - }, - { - "evaluation_name": "Philosophy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Philosophy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.727, - "details": { - "description": "min=0.727, mean=0.727, max=0.727, sum=1.453 (2)", - "tab": "Accuracy", - "Philosophy - Observed inference time (s)": { - "description": "min=0.276, mean=0.276, max=0.276, sum=0.552 (2)", - "tab": "Efficiency", - "score": 0.276187143141817 - }, - "Philosophy - # eval": { - "description": "min=311, mean=311, max=311, sum=622 (2)", - "tab": "General information", - "score": 311.0 - }, - "Philosophy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Philosophy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Philosophy - # prompt tokens": { - "description": "min=332.907, mean=332.907, max=332.907, sum=665.814 (2)", - "tab": "General information", - "score": 332.90675241157555 - }, - "Philosophy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" - } - } - }, - { - "evaluation_name": "Professional Psychology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Professional Psychology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.712, - "details": { - "description": "min=0.712, mean=0.712, max=0.712, sum=1.425 (2)", - "tab": "Accuracy", - "Professional Medicine - Observed inference time (s)": { - "description": "min=0.311, mean=0.311, max=0.311, sum=0.621 (2)", - "tab": "Efficiency", - "score": 0.3106422327897128 - }, - "Professional Accounting - Observed inference time (s)": { - "description": "min=0.292, mean=0.292, max=0.292, sum=0.583 (2)", - "tab": "Efficiency", - "score": 0.2916089237159026 - }, - 
"Professional Law - Observed inference time (s)": { - "description": "min=0.385, mean=0.385, max=0.385, sum=0.77 (2)", - "tab": "Efficiency", - "score": 0.38496507379812867 - }, - "Professional Psychology - Observed inference time (s)": { - "description": "min=0.261, mean=0.261, max=0.261, sum=0.522 (2)", - "tab": "Efficiency", - "score": 0.26078930010203444 - }, - "Professional Medicine - # eval": { - "description": "min=272, mean=272, max=272, sum=544 (2)", - "tab": "General information", - "score": 272.0 - }, - "Professional Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Medicine - # prompt tokens": { - "description": "min=1105.092, mean=1105.092, max=1105.092, sum=2210.184 (2)", - "tab": "General information", - "score": 1105.0919117647059 - }, - "Professional Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Accounting - # eval": { - "description": "min=282, mean=282, max=282, sum=564 (2)", - "tab": "General information", - "score": 282.0 - }, - "Professional Accounting - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Accounting - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Accounting - # prompt tokens": { - "description": "min=747.418, mean=747.418, max=747.418, sum=1494.837 (2)", - "tab": "General information", - "score": 747.418439716312 - }, - "Professional Accounting - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Law - # eval": { - "description": "min=1534, mean=1534, max=1534, sum=3068 (2)", - "tab": "General information", - "score": 1534.0 - }, - "Professional Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # prompt tokens": { - "description": "min=1677.119, mean=1677.119, max=1677.119, sum=3354.239 (2)", - "tab": "General information", - "score": 1677.119295958279 - }, - "Professional Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Psychology - # eval": { - "description": "min=612, mean=612, max=612, sum=1224 (2)", - "tab": "General information", - "score": 612.0 - }, - "Professional Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # prompt tokens": { - "description": "min=586.363, mean=586.363, max=586.363, sum=1172.725 (2)", - "tab": "General information", - "score": 586.3627450980392 - }, - "Professional Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - 
"additional_details": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" - } - } - }, - { - "evaluation_name": "Us Foreign Policy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Us Foreign Policy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.87, - "details": { - "description": "min=0.87, mean=0.87, max=0.87, sum=1.74 (2)", - "tab": "Accuracy", - "Us Foreign Policy - Observed inference time (s)": { - "description": "min=0.293, mean=0.293, max=0.293, sum=0.586 (2)", - "tab": "Efficiency", - "score": 0.29293906927108765 - }, - "Us Foreign Policy - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Us Foreign Policy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Us Foreign Policy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Us Foreign Policy - # prompt tokens": { - "description": "min=430.2, mean=430.2, max=430.2, sum=860.4 (2)", - "tab": "General information", - "score": 430.2 - }, - "Us Foreign Policy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" - } - } - }, - { - "evaluation_name": "Astronomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Astronomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.717, - "details": { - "description": "min=0.717, mean=0.717, max=0.717, sum=1.434 (2)", - "tab": "Accuracy", - "Astronomy - Observed inference time (s)": { - "description": "min=0.27, mean=0.27, max=0.27, sum=0.54 (2)", - "tab": "Efficiency", - "score": 0.2697504366699018 - }, - "Astronomy - # eval": { - "description": "min=152, mean=152, max=152, sum=304 (2)", - "tab": "General information", - "score": 152.0 - }, - "Astronomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Astronomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Astronomy - # prompt tokens": { - "description": "min=594.421, mean=594.421, max=594.421, sum=1188.842 (2)", - "tab": "General information", - "score": 594.421052631579 - }, - "Astronomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" - } - } - }, - { - "evaluation_name": 
"Business Ethics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Business Ethics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.65, - "details": { - "description": "min=0.65, mean=0.65, max=0.65, sum=1.3 (2)", - "tab": "Accuracy", - "Business Ethics - Observed inference time (s)": { - "description": "min=0.298, mean=0.298, max=0.298, sum=0.596 (2)", - "tab": "Efficiency", - "score": 0.297854323387146 - }, - "Business Ethics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Business Ethics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Business Ethics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Business Ethics - # prompt tokens": { - "description": "min=544.87, mean=544.87, max=544.87, sum=1089.74 (2)", - "tab": "General information", - "score": 544.87 - }, - "Business Ethics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" - } - } - }, - { - "evaluation_name": "Clinical Knowledge", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Clinical Knowledge", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.698, - "details": { - "description": "min=0.698, mean=0.698, max=0.698, sum=1.396 (2)", - "tab": "Accuracy", - "Clinical Knowledge - Observed inference time (s)": { - "description": "min=0.257, mean=0.257, max=0.257, sum=0.515 (2)", - "tab": "Efficiency", - "score": 0.25743662816173624 - }, - "Clinical Knowledge - # eval": { - "description": "min=265, mean=265, max=265, sum=530 (2)", - "tab": "General information", - "score": 265.0 - }, - "Clinical Knowledge - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Clinical Knowledge - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Clinical Knowledge - # prompt tokens": { - "description": "min=394.592, mean=394.592, max=394.592, sum=789.185 (2)", - "tab": "General information", - "score": 394.5924528301887 - }, - "Clinical Knowledge - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" - } - } - }, - { - "evaluation_name": "Conceptual Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Conceptual Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.621, - "details": { - "description": "min=0.621, mean=0.621, max=0.621, sum=1.243 (2)", - "tab": "Accuracy", - "Conceptual Physics - Observed inference time (s)": { - "description": "min=0.249, mean=0.249, max=0.249, sum=0.498 (2)", - "tab": "Efficiency", - "score": 0.24898753064744017 - }, - "Conceptual Physics - # eval": { - "description": "min=235, mean=235, max=235, sum=470 (2)", - "tab": "General information", - "score": 235.0 - }, - "Conceptual Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Conceptual Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Conceptual Physics - # prompt tokens": { - "description": "min=301.213, mean=301.213, max=301.213, sum=602.426 (2)", - "tab": "General information", - "score": 301.21276595744683 - }, - "Conceptual Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" - } - } - }, - { - "evaluation_name": "Electrical Engineering", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Electrical Engineering", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.628, - "details": { - "description": "min=0.628, mean=0.628, max=0.628, sum=1.255 (2)", - "tab": "Accuracy", - "Electrical Engineering - Observed inference time (s)": { - "description": "min=0.254, mean=0.254, max=0.254, sum=0.508 (2)", - "tab": "Efficiency", - "score": 0.25389171797653726 - }, - "Electrical Engineering - # eval": { - "description": "min=145, mean=145, max=145, sum=290 (2)", - "tab": "General information", - "score": 145.0 - }, - "Electrical Engineering - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Electrical Engineering - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Electrical Engineering - # prompt tokens": { - "description": "min=466.786, mean=466.786, max=466.786, sum=933.572 (2)", - "tab": "General information", - "score": 466.78620689655173 - }, - "Electrical Engineering - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" - } - } - }, - { - "evaluation_name": "Elementary Mathematics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Elementary Mathematics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.516, - "details": { - "description": "min=0.516, mean=0.516, max=0.516, sum=1.032 (2)", - "tab": "Accuracy", - "Elementary Mathematics - Observed inference time (s)": { - "description": "min=0.287, mean=0.287, max=0.287, sum=0.573 (2)", - "tab": "Efficiency", - "score": 0.28658196219691523 - }, - "Elementary Mathematics - # eval": { - "description": "min=378, mean=378, max=378, sum=756 (2)", - "tab": "General information", - "score": 378.0 - }, - "Elementary Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Elementary Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Elementary Mathematics - # prompt tokens": { - "description": "min=589.341, mean=589.341, max=589.341, sum=1178.683 (2)", - "tab": "General information", - "score": 589.3412698412699 - }, - "Elementary Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" - } - } - }, - { - "evaluation_name": "Formal Logic", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Formal Logic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.508, - "details": { - "description": "min=0.508, mean=0.508, max=0.508, sum=1.016 (2)", - "tab": "Accuracy", - "Formal Logic - Observed inference time (s)": { - "description": "min=0.548, mean=0.548, max=0.548, sum=1.097 (2)", - "tab": "Efficiency", - "score": 0.5483344452721732 - }, - "Formal Logic - # eval": { - "description": "min=126, mean=126, max=126, sum=252 (2)", - "tab": "General information", - "score": 126.0 - }, - "Formal Logic - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Formal Logic - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Formal Logic - # prompt tokens": { - "description": "min=611.563, mean=611.563, max=611.563, sum=1223.127 (2)", - "tab": "General information", - "score": 611.563492063492 - }, - "Formal Logic - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" - } - } - }, - { - "evaluation_name": "High School World History", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on High School World History", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.857, - "details": { - "description": "min=0.857, mean=0.857, max=0.857, sum=1.713 (2)", - "tab": "Accuracy", - "High School Biology - Observed inference time (s)": { - "description": "min=0.251, mean=0.251, max=0.251, sum=0.502 (2)", - "tab": "Efficiency", - "score": 0.2509724578549785 - }, - "High School Chemistry - Observed inference time (s)": { - "description": "min=0.292, mean=0.292, max=0.292, sum=0.584 (2)", - "tab": "Efficiency", - "score": 0.2920628909406991 - }, - "High School Computer Science - Observed inference time (s)": { - "description": "min=0.33, mean=0.33, max=0.33, sum=0.66 (2)", - "tab": "Efficiency", - "score": 0.3299814939498901 - }, - "High School European History - Observed inference time (s)": { - "description": "min=0.553, mean=0.553, max=0.553, sum=1.107 (2)", - "tab": "Efficiency", - "score": 0.5534277785908092 - }, - "High School Geography - Observed inference time (s)": { - "description": "min=0.471, mean=0.471, max=0.471, sum=0.943 (2)", - "tab": "Efficiency", - "score": 0.47140675602537213 - }, - "High School Government And Politics - Observed inference time (s)": { - "description": "min=0.282, mean=0.282, max=0.282, sum=0.565 (2)", - "tab": "Efficiency", - "score": 0.28242908734731725 - }, - "High School Macroeconomics - Observed inference time (s)": { - "description": "min=0.316, mean=0.316, max=0.316, sum=0.632 (2)", - "tab": "Efficiency", - "score": 0.3160711630796775 - }, - "High School Mathematics - Observed inference time (s)": { - "description": "min=0.256, mean=0.256, max=0.256, sum=0.512 (2)", - "tab": "Efficiency", - "score": 0.25601085556877984 - }, - "High School Microeconomics - Observed inference time (s)": { - "description": "min=0.251, mean=0.251, max=0.251, sum=0.503 (2)", - "tab": "Efficiency", - "score": 0.25132194386810813 - }, - "High School Physics - Observed inference time (s)": { - "description": "min=0.339, mean=0.339, max=0.339, sum=0.679 (2)", - "tab": "Efficiency", - "score": 0.3394651823485924 - }, - "High School Psychology - Observed inference time (s)": { - "description": "min=0.348, mean=0.348, max=0.348, sum=0.697 (2)", - "tab": "Efficiency", - "score": 0.3483087859022508 - }, - "High School Statistics - Observed inference time (s)": { - "description": "min=0.316, mean=0.316, max=0.316, sum=0.632 (2)", - "tab": "Efficiency", - "score": 0.31601137033215276 - }, - "High School US History - Observed inference time (s)": { - "description": "min=0.452, mean=0.452, max=0.452, sum=0.905 (2)", - "tab": "Efficiency", - "score": 0.4523548308540793 - }, - "High School World History - Observed inference time (s)": { - "description": "min=0.342, mean=0.342, max=0.342, sum=0.683 (2)", - "tab": "Efficiency", - "score": 0.34174740565980033 - }, - "High School Biology - # eval": { - "description": "min=310, mean=310, max=310, sum=620 (2)", - "tab": "General information", - "score": 310.0 - }, - "High School Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Biology - # 
prompt tokens": { - "description": "min=492.958, mean=492.958, max=492.958, sum=985.916 (2)", - "tab": "General information", - "score": 492.958064516129 - }, - "High School Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Chemistry - # eval": { - "description": "min=203, mean=203, max=203, sum=406 (2)", - "tab": "General information", - "score": 203.0 - }, - "High School Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # prompt tokens": { - "description": "min=505.064, mean=505.064, max=505.064, sum=1010.128 (2)", - "tab": "General information", - "score": 505.064039408867 - }, - "High School Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "High School Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # prompt tokens": { - "description": "min=927.13, mean=927.13, max=927.13, sum=1854.26 (2)", - "tab": "General information", - "score": 927.13 - }, - "High School Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School European History - # eval": { - "description": "min=165, mean=165, max=165, sum=330 (2)", - "tab": "General information", - "score": 165.0 - }, - "High School European History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School European History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School European History - # prompt tokens": { - "description": "min=2789.424, mean=2789.424, max=2789.424, sum=5578.848 (2)", - "tab": "General information", - "score": 2789.4242424242425 - }, - "High School European History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Geography - # eval": { - "description": "min=198, mean=198, max=198, sum=396 (2)", - "tab": "General information", - "score": 198.0 - }, - "High School Geography - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Geography - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Geography - # prompt tokens": { - "description": "min=386.773, mean=386.773, max=386.773, sum=773.545 (2)", - "tab": "General information", - "score": 386.77272727272725 - }, - "High School Geography - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Government And Politics - # eval": { - 
"description": "min=193, mean=193, max=193, sum=386 (2)", - "tab": "General information", - "score": 193.0 - }, - "High School Government And Politics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Government And Politics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # prompt tokens": { - "description": "min=471.301, mean=471.301, max=471.301, sum=942.601 (2)", - "tab": "General information", - "score": 471.30051813471505 - }, - "High School Government And Politics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Macroeconomics - # eval": { - "description": "min=390, mean=390, max=390, sum=780 (2)", - "tab": "General information", - "score": 390.0 - }, - "High School Macroeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Macroeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - # prompt tokens": { - "description": "min=388.541, mean=388.541, max=388.541, sum=777.082 (2)", - "tab": "General information", - "score": 388.54102564102567 - }, - "High School Macroeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Mathematics - # eval": { - "description": "min=270, mean=270, max=270, sum=540 (2)", - "tab": "General information", - "score": 270.0 - }, - "High School Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # prompt tokens": { - "description": "min=558.822, mean=558.822, max=558.822, sum=1117.644 (2)", - "tab": "General information", - "score": 558.8222222222222 - }, - "High School Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Microeconomics - # eval": { - "description": "min=238, mean=238, max=238, sum=476 (2)", - "tab": "General information", - "score": 238.0 - }, - "High School Microeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Microeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Microeconomics - # prompt tokens": { - "description": "min=407.954, mean=407.954, max=407.954, sum=815.908 (2)", - "tab": "General information", - "score": 407.953781512605 - }, - "High School Microeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Physics - # eval": { - "description": "min=151, mean=151, max=151, sum=302 (2)", - "tab": "General information", - "score": 151.0 - }, - "High School Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Physics - truncated": { - "description": 
"min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # prompt tokens": { - "description": "min=583.715, mean=583.715, max=583.715, sum=1167.43 (2)", - "tab": "General information", - "score": 583.7152317880794 - }, - "High School Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Psychology - # eval": { - "description": "min=545, mean=545, max=545, sum=1090 (2)", - "tab": "General information", - "score": 545.0 - }, - "High School Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # prompt tokens": { - "description": "min=494.604, mean=494.604, max=494.604, sum=989.207 (2)", - "tab": "General information", - "score": 494.60366972477067 - }, - "High School Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Statistics - # eval": { - "description": "min=216, mean=216, max=216, sum=432 (2)", - "tab": "General information", - "score": 216.0 - }, - "High School Statistics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Statistics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Statistics - # prompt tokens": { - "description": "min=850.931, mean=850.931, max=850.931, sum=1701.861 (2)", - "tab": "General information", - "score": 850.9305555555555 - }, - "High School Statistics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School US History - # eval": { - "description": "min=204, mean=204, max=204, sum=408 (2)", - "tab": "General information", - "score": 204.0 - }, - "High School US History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School US History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # prompt tokens": { - "description": "min=2197.583, mean=2197.583, max=2197.583, sum=4395.167 (2)", - "tab": "General information", - "score": 2197.5833333333335 - }, - "High School US History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School World History - # eval": { - "description": "min=237, mean=237, max=237, sum=474 (2)", - "tab": "General information", - "score": 237.0 - }, - "High School World History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School World History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School World History - # prompt tokens": { - "description": "min=1418.544, mean=1418.544, max=1418.544, sum=2837.089 (2)", - "tab": "General information", - "score": 1418.5443037974683 - }, - "High School World History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": 
"General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" - } - } - }, - { - "evaluation_name": "Human Sexuality", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Human Sexuality", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.733, - "details": { - "description": "min=0.733, mean=0.733, max=0.733, sum=1.466 (2)", - "tab": "Accuracy", - "Human Aging - Observed inference time (s)": { - "description": "min=0.406, mean=0.406, max=0.406, sum=0.812 (2)", - "tab": "Efficiency", - "score": 0.4062144061375092 - }, - "Human Sexuality - Observed inference time (s)": { - "description": "min=0.238, mean=0.238, max=0.238, sum=0.476 (2)", - "tab": "Efficiency", - "score": 0.23785374910776852 - }, - "Human Aging - # eval": { - "description": "min=223, mean=223, max=223, sum=446 (2)", - "tab": "General information", - "score": 223.0 - }, - "Human Aging - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Aging - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Aging - # prompt tokens": { - "description": "min=313.587, mean=313.587, max=313.587, sum=627.175 (2)", - "tab": "General information", - "score": 313.58744394618833 - }, - "Human Aging - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Human Sexuality - # eval": { - "description": "min=131, mean=131, max=131, sum=262 (2)", - "tab": "General information", - "score": 131.0 - }, - "Human Sexuality - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Sexuality - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # prompt tokens": { - "description": "min=339.183, mean=339.183, max=339.183, sum=678.366 (2)", - "tab": "General information", - "score": 339.1832061068702 - }, - "Human Sexuality - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" - } - } - }, - { - "evaluation_name": "International Law", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on International Law", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.835, - "details": { - "description": "min=0.835, mean=0.835, max=0.835, sum=1.669 (2)", - "tab": "Accuracy", - "International Law - Observed inference time (s)": { - "description": "min=0.292, mean=0.292, 
max=0.292, sum=0.584 (2)", - "tab": "Efficiency", - "score": 0.2918710767730208 - }, - "International Law - # eval": { - "description": "min=121, mean=121, max=121, sum=242 (2)", - "tab": "General information", - "score": 121.0 - }, - "International Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "International Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "International Law - # prompt tokens": { - "description": "min=636.165, mean=636.165, max=636.165, sum=1272.331 (2)", - "tab": "General information", - "score": 636.1652892561983 - }, - "International Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" - } - } - }, - { - "evaluation_name": "Logical Fallacies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Logical Fallacies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.742, - "details": { - "description": "min=0.742, mean=0.742, max=0.742, sum=1.485 (2)", - "tab": "Accuracy", - "Logical Fallacies - Observed inference time (s)": { - "description": "min=0.477, mean=0.477, max=0.477, sum=0.954 (2)", - "tab": "Efficiency", - "score": 0.47711458089161507 - }, - "Logical Fallacies - # eval": { - "description": "min=163, mean=163, max=163, sum=326 (2)", - "tab": "General information", - "score": 163.0 - }, - "Logical Fallacies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Logical Fallacies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Logical Fallacies - # prompt tokens": { - "description": "min=442.049, mean=442.049, max=442.049, sum=884.098 (2)", - "tab": "General information", - "score": 442.0490797546012 - }, - "Logical Fallacies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" - } - } - }, - { - "evaluation_name": "Machine Learning", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Machine Learning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.554, - "details": { - "description": "min=0.554, mean=0.554, max=0.554, sum=1.107 (2)", - "tab": "Accuracy", - "Machine Learning - Observed inference time (s)": { - "description": "min=0.265, mean=0.265, max=0.265, sum=0.529 (2)", - "tab": "Efficiency", - "score": 0.2645489977938788 - }, - "Machine Learning - 
# eval": { - "description": "min=112, mean=112, max=112, sum=224 (2)", - "tab": "General information", - "score": 112.0 - }, - "Machine Learning - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Machine Learning - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Machine Learning - # prompt tokens": { - "description": "min=694.402, mean=694.402, max=694.402, sum=1388.804 (2)", - "tab": "General information", - "score": 694.4017857142857 - }, - "Machine Learning - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" - } - } - }, - { - "evaluation_name": "Management", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Management", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.864, - "details": { - "description": "min=0.864, mean=0.864, max=0.864, sum=1.728 (2)", - "tab": "Accuracy", - "Management - Observed inference time (s)": { - "description": "min=0.293, mean=0.293, max=0.293, sum=0.587 (2)", - "tab": "Efficiency", - "score": 0.293421483734279 - }, - "Management - # eval": { - "description": "min=103, mean=103, max=103, sum=206 (2)", - "tab": "General information", - "score": 103.0 - }, - "Management - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Management - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Management - # prompt tokens": { - "description": "min=273.301, mean=273.301, max=273.301, sum=546.602 (2)", - "tab": "General information", - "score": 273.3009708737864 - }, - "Management - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" - } - } - }, - { - "evaluation_name": "Marketing", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Marketing", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.885, - "details": { - "description": "min=0.885, mean=0.885, max=0.885, sum=1.769 (2)", - "tab": "Accuracy", - "Marketing - Observed inference time (s)": { - "description": "min=0.254, mean=0.254, max=0.254, sum=0.507 (2)", - "tab": "Efficiency", - "score": 0.25355013211568195 - }, - "Marketing - # eval": { - "description": "min=234, mean=234, max=234, sum=468 (2)", - "tab": "General information", - "score": 234.0 - }, - "Marketing - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": 
"General information", - "score": 5.0 - }, - "Marketing - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Marketing - # prompt tokens": { - "description": "min=420.35, mean=420.35, max=420.35, sum=840.701 (2)", - "tab": "General information", - "score": 420.35042735042737 - }, - "Marketing - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" - } - } - }, - { - "evaluation_name": "Medical Genetics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Medical Genetics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7, - "details": { - "description": "min=0.7, mean=0.7, max=0.7, sum=1.4 (2)", - "tab": "Accuracy", - "Medical Genetics - Observed inference time (s)": { - "description": "min=0.262, mean=0.262, max=0.262, sum=0.524 (2)", - "tab": "Efficiency", - "score": 0.26187997102737426 - }, - "Medical Genetics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Medical Genetics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Medical Genetics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Medical Genetics - # prompt tokens": { - "description": "min=330.89, mean=330.89, max=330.89, sum=661.78 (2)", - "tab": "General information", - "score": 330.89 - }, - "Medical Genetics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" - } - } - }, - { - "evaluation_name": "Miscellaneous", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Miscellaneous", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.838, - "details": { - "description": "min=0.838, mean=0.838, max=0.838, sum=1.676 (2)", - "tab": "Accuracy", - "Miscellaneous - Observed inference time (s)": { - "description": "min=0.245, mean=0.245, max=0.245, sum=0.49 (2)", - "tab": "Efficiency", - "score": 0.24482133348935103 - }, - "Miscellaneous - # eval": { - "description": "min=783, mean=783, max=783, sum=1566 (2)", - "tab": "General information", - "score": 783.0 - }, - "Miscellaneous - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Miscellaneous - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Miscellaneous - # 
prompt tokens": { - "description": "min=306.669, mean=306.669, max=306.669, sum=613.338 (2)", - "tab": "General information", - "score": 306.669220945083 - }, - "Miscellaneous - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" - } - } - }, - { - "evaluation_name": "Moral Scenarios", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Moral Scenarios", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.377, - "details": { - "description": "min=0.377, mean=0.377, max=0.377, sum=0.753 (2)", - "tab": "Accuracy", - "Moral Disputes - Observed inference time (s)": { - "description": "min=0.254, mean=0.254, max=0.254, sum=0.508 (2)", - "tab": "Efficiency", - "score": 0.2542355225954442 - }, - "Moral Scenarios - Observed inference time (s)": { - "description": "min=0.392, mean=0.392, max=0.392, sum=0.784 (2)", - "tab": "Efficiency", - "score": 0.39224682173915415 - }, - "Moral Disputes - # eval": { - "description": "min=346, mean=346, max=346, sum=692 (2)", - "tab": "General information", - "score": 346.0 - }, - "Moral Disputes - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Disputes - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Disputes - # prompt tokens": { - "description": "min=487.003, mean=487.003, max=487.003, sum=974.006 (2)", - "tab": "General information", - "score": 487.0028901734104 - }, - "Moral Disputes - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Moral Scenarios - # eval": { - "description": "min=895, mean=895, max=895, sum=1790 (2)", - "tab": "General information", - "score": 895.0 - }, - "Moral Scenarios - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Scenarios - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # prompt tokens": { - "description": "min=682.542, mean=682.542, max=682.542, sum=1365.084 (2)", - "tab": "General information", - "score": 682.5418994413408 - }, - "Moral Scenarios - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" - } - } - }, - { - "evaluation_name": "Nutrition", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Nutrition", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.778, - "details": { - "description": "min=0.778, mean=0.778, max=0.778, sum=1.556 (2)", - "tab": "Accuracy", - "Nutrition - Observed inference time (s)": { - "description": "min=0.351, mean=0.351, max=0.351, sum=0.702 (2)", - "tab": "Efficiency", - "score": 0.3507605791091919 - }, - "Nutrition - # eval": { - "description": "min=306, mean=306, max=306, sum=612 (2)", - "tab": "General information", - "score": 306.0 - }, - "Nutrition - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Nutrition - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Nutrition - # prompt tokens": { - "description": "min=577.48, mean=577.48, max=577.48, sum=1154.961 (2)", - "tab": "General information", - "score": 577.4803921568628 - }, - "Nutrition - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" - } - } - }, - { - "evaluation_name": "Prehistory", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Prehistory", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.756, - "details": { - "description": "min=0.756, mean=0.756, max=0.756, sum=1.512 (2)", - "tab": "Accuracy", - "Prehistory - Observed inference time (s)": { - "description": "min=0.254, mean=0.254, max=0.254, sum=0.509 (2)", - "tab": "Efficiency", - "score": 0.25446349014470604 - }, - "Prehistory - # eval": { - "description": "min=324, mean=324, max=324, sum=648 (2)", - "tab": "General information", - "score": 324.0 - }, - "Prehistory - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Prehistory - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Prehistory - # prompt tokens": { - "description": "min=532.198, mean=532.198, max=532.198, sum=1064.395 (2)", - "tab": "General information", - "score": 532.1975308641976 - }, - "Prehistory - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" - } - } - }, - { - "evaluation_name": "Public Relations", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Public Relations", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.682, - "details": { - "description": "min=0.682, mean=0.682, max=0.682, sum=1.364 (2)", - "tab": "Accuracy", - "Public Relations - Observed inference time (s)": { - "description": "min=0.248, mean=0.248, 
max=0.248, sum=0.495 (2)", - "tab": "Efficiency", - "score": 0.24754605726762252 - }, - "Public Relations - # eval": { - "description": "min=110, mean=110, max=110, sum=220 (2)", - "tab": "General information", - "score": 110.0 - }, - "Public Relations - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Public Relations - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Public Relations - # prompt tokens": { - "description": "min=418.655, mean=418.655, max=418.655, sum=837.309 (2)", - "tab": "General information", - "score": 418.6545454545454 - }, - "Public Relations - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" - } - } - }, - { - "evaluation_name": "Security Studies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Security Studies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.735, - "details": { - "description": "min=0.735, mean=0.735, max=0.735, sum=1.469 (2)", - "tab": "Accuracy", - "Security Studies - Observed inference time (s)": { - "description": "min=0.3, mean=0.3, max=0.3, sum=0.6 (2)", - "tab": "Efficiency", - "score": 0.30012765806548447 - }, - "Security Studies - # eval": { - "description": "min=245, mean=245, max=245, sum=490 (2)", - "tab": "General information", - "score": 245.0 - }, - "Security Studies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Security Studies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Security Studies - # prompt tokens": { - "description": "min=1185.869, mean=1185.869, max=1185.869, sum=2371.739 (2)", - "tab": "General information", - "score": 1185.869387755102 - }, - "Security Studies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" - } - } - }, - { - "evaluation_name": "Sociology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Sociology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.841, - "details": { - "description": "min=0.841, mean=0.841, max=0.841, sum=1.682 (2)", - "tab": "Accuracy", - "Sociology - Observed inference time (s)": { - "description": "min=0.293, mean=0.293, max=0.293, sum=0.586 (2)", - "tab": "Efficiency", - "score": 0.29275026487473826 - }, - "Sociology - # eval": { - "description": "min=201, mean=201, 
max=201, sum=402 (2)", - "tab": "General information", - "score": 201.0 - }, - "Sociology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Sociology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Sociology - # prompt tokens": { - "description": "min=448.274, mean=448.274, max=448.274, sum=896.547 (2)", - "tab": "General information", - "score": 448.27363184079604 - }, - "Sociology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" - } - } - }, - { - "evaluation_name": "Virology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Virology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.548, - "details": { - "description": "min=0.548, mean=0.548, max=0.548, sum=1.096 (2)", - "tab": "Accuracy", - "Virology - Observed inference time (s)": { - "description": "min=0.25, mean=0.25, max=0.25, sum=0.501 (2)", - "tab": "Efficiency", - "score": 0.2502512199332915 - }, - "Virology - # eval": { - "description": "min=166, mean=166, max=166, sum=332 (2)", - "tab": "General information", - "score": 166.0 - }, - "Virology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Virology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Virology - # prompt tokens": { - "description": "min=328.753, mean=328.753, max=328.753, sum=657.506 (2)", - "tab": "General information", - "score": 328.7530120481928 - }, - "Virology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" - } - } - }, - { - "evaluation_name": "World Religions", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on World Religions", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.842, - "details": { - "description": "min=0.842, mean=0.842, max=0.842, sum=1.684 (2)", - "tab": "Accuracy", - "World Religions - Observed inference time (s)": { - "description": "min=0.249, mean=0.249, max=0.249, sum=0.498 (2)", - "tab": "Efficiency", - "score": 0.24913478734200462 - }, - "World Religions - # eval": { - "description": "min=171, mean=171, max=171, sum=342 (2)", - "tab": "General information", - "score": 171.0 - }, - "World Religions - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "World Religions - truncated": { - 
"description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "World Religions - # prompt tokens": { - "description": "min=260.164, mean=260.164, max=260.164, sum=520.327 (2)", - "tab": "General information", - "score": 260.1637426900585 - }, - "World Religions - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" - } - } - }, - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.824, - "details": { - "tab": "Efficiency" - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_mmlu/google/text-bison@001/a0abcd19-58a1-478a-9786-d044a4181241.json b/data/helm_mmlu/google/text-bison@001/a0abcd19-58a1-478a-9786-d044a4181241.json deleted file mode 100644 index c0271bcb3..000000000 --- a/data/helm_mmlu/google/text-bison@001/a0abcd19-58a1-478a-9786-d044a4181241.json +++ /dev/null @@ -1,3021 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/google_text-bison@001/1770835937.459157", - "retrieved_timestamp": "1770835937.459157", - "source_metadata": { - "source_name": "helm_mmlu", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "PaLM-2 Bison", - "id": "google/text-bison@001", - "developer": "google", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "MMLU All Subjects", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU All Subjects", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.692, - "details": { - "description": "min=0.331, mean=0.692, max=0.927, sum=78.899 (114)", - "tab": "Accuracy", - "MMLU All Subjects - Observed inference time (s)": { - "description": "min=0.619, mean=1.845, max=23.541, sum=210.314 (114)", - "tab": "Efficiency", - "score": 1.8448593983042894 - }, - "MMLU All Subjects - # eval": { - "description": "min=100, mean=246.351, max=1534, sum=28084 (114)", - "tab": "General information", - "score": 246.35087719298247 - }, - "MMLU All Subjects - # train": { - "description": "min=5, mean=5, max=5, sum=570 (114)", - "tab": "General information", - "score": 5.0 - }, - "MMLU All Subjects - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (114)", - "tab": "General information", - "score": 0.0 - }, - "MMLU All Subjects - # prompt tokens": { - "description": "min=270.187, mean=635.61, max=2823.23, sum=72459.527 (114)", - "tab": "General information", - "score": 635.6098850770794 - }, - "MMLU All Subjects - # 
output tokens": { - "description": "min=1, mean=1, max=1, sum=114 (114)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] - } - } - }, - { - "evaluation_name": "Abstract Algebra", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Abstract 
Algebra", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.39, - "details": { - "description": "min=0.39, mean=0.39, max=0.39, sum=0.78 (2)", - "tab": "Accuracy", - "Abstract Algebra - Observed inference time (s)": { - "description": "min=1.017, mean=1.017, max=1.017, sum=2.033 (2)", - "tab": "Efficiency", - "score": 1.0166235256195069 - }, - "Abstract Algebra - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Abstract Algebra - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Abstract Algebra - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Abstract Algebra - # prompt tokens": { - "description": "min=387.12, mean=387.12, max=387.12, sum=774.24 (2)", - "tab": "General information", - "score": 387.12 - }, - "Abstract Algebra - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" - } - } - }, - { - "evaluation_name": "Anatomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Anatomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.644, - "details": { - "description": "min=0.644, mean=0.644, max=0.644, sum=1.289 (2)", - "tab": "Accuracy", - "Anatomy - Observed inference time (s)": { - "description": "min=0.837, mean=0.837, max=0.837, sum=1.673 (2)", - "tab": "Efficiency", - "score": 0.836542272567749 - }, - "Anatomy - # eval": { - "description": "min=135, mean=135, max=135, sum=270 (2)", - "tab": "General information", - "score": 135.0 - }, - "Anatomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Anatomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Anatomy - # prompt tokens": { - "description": "min=344.089, mean=344.089, max=344.089, sum=688.178 (2)", - "tab": "General information", - "score": 344.0888888888889 - }, - "Anatomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" - } - } - }, - { - "evaluation_name": "College Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on College Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.51, - "details": { - "description": "min=0.51, mean=0.51, max=0.51, sum=1.02 (2)", - "tab": 
"Accuracy", - "College Chemistry - Observed inference time (s)": { - "description": "min=1.352, mean=1.352, max=1.352, sum=2.704 (2)", - "tab": "Efficiency", - "score": 1.3518596124649047 - }, - "College Biology - Observed inference time (s)": { - "description": "min=0.862, mean=0.862, max=0.862, sum=1.724 (2)", - "tab": "Efficiency", - "score": 0.8619864102866914 - }, - "College Computer Science - Observed inference time (s)": { - "description": "min=23.541, mean=23.541, max=23.541, sum=47.082 (2)", - "tab": "Efficiency", - "score": 23.54095259666443 - }, - "College Mathematics - Observed inference time (s)": { - "description": "min=0.931, mean=0.931, max=0.931, sum=1.862 (2)", - "tab": "Efficiency", - "score": 0.9307789158821106 - }, - "College Medicine - Observed inference time (s)": { - "description": "min=0.947, mean=0.947, max=0.947, sum=1.894 (2)", - "tab": "Efficiency", - "score": 0.9472322174579422 - }, - "College Physics - Observed inference time (s)": { - "description": "min=0.928, mean=0.928, max=0.928, sum=1.856 (2)", - "tab": "Efficiency", - "score": 0.9281005485385072 - }, - "College Chemistry - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Chemistry - # prompt tokens": { - "description": "min=573.7, mean=573.7, max=573.7, sum=1147.4 (2)", - "tab": "General information", - "score": 573.7 - }, - "College Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Biology - # eval": { - "description": "min=144, mean=144, max=144, sum=288 (2)", - "tab": "General information", - "score": 144.0 - }, - "College Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # prompt tokens": { - "description": "min=480.875, mean=480.875, max=480.875, sum=961.75 (2)", - "tab": "General information", - "score": 480.875 - }, - "College Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer Science - # prompt tokens": { - "description": "min=861.96, mean=861.96, max=861.96, sum=1723.92 (2)", - "tab": "General information", - "score": 861.96 - }, - "College Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Mathematics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Mathematics - # train": 
{ - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # prompt tokens": { - "description": "min=636.94, mean=636.94, max=636.94, sum=1273.88 (2)", - "tab": "General information", - "score": 636.94 - }, - "College Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Medicine - # eval": { - "description": "min=173, mean=173, max=173, sum=346 (2)", - "tab": "General information", - "score": 173.0 - }, - "College Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Medicine - # prompt tokens": { - "description": "min=512.584, mean=512.584, max=512.584, sum=1025.168 (2)", - "tab": "General information", - "score": 512.5838150289018 - }, - "College Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Physics - # eval": { - "description": "min=102, mean=102, max=102, sum=204 (2)", - "tab": "General information", - "score": 102.0 - }, - "College Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Physics - # prompt tokens": { - "description": "min=513.647, mean=513.647, max=513.647, sum=1027.294 (2)", - "tab": "General information", - "score": 513.6470588235294 - }, - "College Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" - } - } - }, - { - "evaluation_name": "Computer Security", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Computer Security", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.74, - "details": { - "description": "min=0.74, mean=0.74, max=0.74, sum=1.48 (2)", - "tab": "Accuracy", - "Computer Security - Observed inference time (s)": { - "description": "min=1.044, mean=1.044, max=1.044, sum=2.088 (2)", - "tab": "Efficiency", - "score": 1.0440657019615174 - }, - "Computer Security - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Computer Security - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Computer Security - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Computer Security - # prompt tokens": { - "description": "min=384.24, mean=384.24, 
max=384.24, sum=768.48 (2)", - "tab": "General information", - "score": 384.24 - }, - "Computer Security - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" - } - } - }, - { - "evaluation_name": "Econometrics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Econometrics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.518, - "details": { - "description": "min=0.518, mean=0.518, max=0.518, sum=1.035 (2)", - "tab": "Accuracy", - "Econometrics - Observed inference time (s)": { - "description": "min=1.047, mean=1.047, max=1.047, sum=2.094 (2)", - "tab": "Efficiency", - "score": 1.04721718921996 - }, - "Econometrics - # eval": { - "description": "min=114, mean=114, max=114, sum=228 (2)", - "tab": "General information", - "score": 114.0 - }, - "Econometrics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Econometrics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Econometrics - # prompt tokens": { - "description": "min=644.395, mean=644.395, max=644.395, sum=1288.789 (2)", - "tab": "General information", - "score": 644.3947368421053 - }, - "Econometrics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" - } - } - }, - { - "evaluation_name": "Global Facts", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Global Facts", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.38, - "details": { - "description": "min=0.38, mean=0.38, max=0.38, sum=0.76 (2)", - "tab": "Accuracy", - "Global Facts - Observed inference time (s)": { - "description": "min=0.913, mean=0.913, max=0.913, sum=1.826 (2)", - "tab": "Efficiency", - "score": 0.9128784847259521 - }, - "Global Facts - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Global Facts - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Global Facts - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Global Facts - # prompt tokens": { - "description": "min=455.63, mean=455.63, max=455.63, sum=911.26 (2)", - "tab": "General information", - "score": 455.63 - }, - "Global Facts - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - 
"score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" - } - } - }, - { - "evaluation_name": "Jurisprudence", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Jurisprudence", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.769, - "details": { - "description": "min=0.769, mean=0.769, max=0.769, sum=1.537 (2)", - "tab": "Accuracy", - "Jurisprudence - Observed inference time (s)": { - "description": "min=0.884, mean=0.884, max=0.884, sum=1.768 (2)", - "tab": "Efficiency", - "score": 0.8838474772594593 - }, - "Jurisprudence - # eval": { - "description": "min=108, mean=108, max=108, sum=216 (2)", - "tab": "General information", - "score": 108.0 - }, - "Jurisprudence - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Jurisprudence - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Jurisprudence - # prompt tokens": { - "description": "min=414.444, mean=414.444, max=414.444, sum=828.889 (2)", - "tab": "General information", - "score": 414.44444444444446 - }, - "Jurisprudence - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" - } - } - }, - { - "evaluation_name": "Philosophy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Philosophy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.736, - "details": { - "description": "min=0.736, mean=0.736, max=0.736, sum=1.473 (2)", - "tab": "Accuracy", - "Philosophy - Observed inference time (s)": { - "description": "min=0.716, mean=0.716, max=0.716, sum=1.432 (2)", - "tab": "Efficiency", - "score": 0.7159656282406528 - }, - "Philosophy - # eval": { - "description": "min=311, mean=311, max=311, sum=622 (2)", - "tab": "General information", - "score": 311.0 - }, - "Philosophy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Philosophy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Philosophy - # prompt tokens": { - "description": "min=339.093, mean=339.093, max=339.093, sum=678.186 (2)", - "tab": "General information", - "score": 339.09324758842445 - }, - "Philosophy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" - } - } - }, - 
{ - "evaluation_name": "Professional Psychology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Professional Psychology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.761, - "details": { - "description": "min=0.761, mean=0.761, max=0.761, sum=1.523 (2)", - "tab": "Accuracy", - "Professional Medicine - Observed inference time (s)": { - "description": "min=8.281, mean=8.281, max=8.281, sum=16.562 (2)", - "tab": "Efficiency", - "score": 8.280891868998022 - }, - "Professional Accounting - Observed inference time (s)": { - "description": "min=0.812, mean=0.812, max=0.812, sum=1.624 (2)", - "tab": "Efficiency", - "score": 0.8122333144465237 - }, - "Professional Law - Observed inference time (s)": { - "description": "min=0.634, mean=0.634, max=0.634, sum=1.268 (2)", - "tab": "Efficiency", - "score": 0.6340693978318335 - }, - "Professional Psychology - Observed inference time (s)": { - "description": "min=0.697, mean=0.697, max=0.697, sum=1.394 (2)", - "tab": "Efficiency", - "score": 0.6971427946308859 - }, - "Professional Medicine - # eval": { - "description": "min=272, mean=272, max=272, sum=544 (2)", - "tab": "General information", - "score": 272.0 - }, - "Professional Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Medicine - # prompt tokens": { - "description": "min=1104.614, mean=1104.614, max=1104.614, sum=2209.228 (2)", - "tab": "General information", - "score": 1104.6139705882354 - }, - "Professional Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Accounting - # eval": { - "description": "min=282, mean=282, max=282, sum=564 (2)", - "tab": "General information", - "score": 282.0 - }, - "Professional Accounting - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Accounting - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Accounting - # prompt tokens": { - "description": "min=752.83, mean=752.83, max=752.83, sum=1505.66 (2)", - "tab": "General information", - "score": 752.8297872340426 - }, - "Professional Accounting - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Law - # eval": { - "description": "min=1534, mean=1534, max=1534, sum=3068 (2)", - "tab": "General information", - "score": 1534.0 - }, - "Professional Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # prompt tokens": { - "description": "min=1701.909, mean=1701.909, max=1701.909, sum=3403.819 (2)", - "tab": "General information", - "score": 1701.9093872229466 - }, - "Professional Law - # output 
tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Psychology - # eval": { - "description": "min=612, mean=612, max=612, sum=1224 (2)", - "tab": "General information", - "score": 612.0 - }, - "Professional Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # prompt tokens": { - "description": "min=594.446, mean=594.446, max=594.446, sum=1188.892 (2)", - "tab": "General information", - "score": 594.4460784313726 - }, - "Professional Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" - } - } - }, - { - "evaluation_name": "Us Foreign Policy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Us Foreign Policy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.87, - "details": { - "description": "min=0.87, mean=0.87, max=0.87, sum=1.74 (2)", - "tab": "Accuracy", - "Us Foreign Policy - Observed inference time (s)": { - "description": "min=1.101, mean=1.101, max=1.101, sum=2.202 (2)", - "tab": "Efficiency", - "score": 1.1012366461753844 - }, - "Us Foreign Policy - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Us Foreign Policy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Us Foreign Policy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Us Foreign Policy - # prompt tokens": { - "description": "min=440.48, mean=440.48, max=440.48, sum=880.96 (2)", - "tab": "General information", - "score": 440.48 - }, - "Us Foreign Policy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" - } - } - }, - { - "evaluation_name": "Astronomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Astronomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.803, - "details": { - "description": "min=0.803, mean=0.803, max=0.803, sum=1.605 (2)", - "tab": "Accuracy", - "Astronomy - Observed inference time (s)": { - "description": "min=0.715, mean=0.715, max=0.715, sum=1.43 (2)", - "tab": "Efficiency", - "score": 
0.7148221495904421 - }, - "Astronomy - # eval": { - "description": "min=152, mean=152, max=152, sum=304 (2)", - "tab": "General information", - "score": 152.0 - }, - "Astronomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Astronomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Astronomy - # prompt tokens": { - "description": "min=613.033, mean=613.033, max=613.033, sum=1226.066 (2)", - "tab": "General information", - "score": 613.0328947368421 - }, - "Astronomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" - } - } - }, - { - "evaluation_name": "Business Ethics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Business Ethics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.76, - "details": { - "description": "min=0.76, mean=0.76, max=0.76, sum=1.52 (2)", - "tab": "Accuracy", - "Business Ethics - Observed inference time (s)": { - "description": "min=0.893, mean=0.893, max=0.893, sum=1.785 (2)", - "tab": "Efficiency", - "score": 0.8926668572425842 - }, - "Business Ethics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Business Ethics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Business Ethics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Business Ethics - # prompt tokens": { - "description": "min=559.31, mean=559.31, max=559.31, sum=1118.62 (2)", - "tab": "General information", - "score": 559.31 - }, - "Business Ethics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" - } - } - }, - { - "evaluation_name": "Clinical Knowledge", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Clinical Knowledge", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.725, - "details": { - "description": "min=0.725, mean=0.725, max=0.725, sum=1.449 (2)", - "tab": "Accuracy", - "Clinical Knowledge - Observed inference time (s)": { - "description": "min=0.77, mean=0.77, max=0.77, sum=1.541 (2)", - "tab": "Efficiency", - "score": 0.7704581980435353 - }, - "Clinical Knowledge - # eval": { - "description": "min=265, mean=265, max=265, sum=530 (2)", - "tab": "General information", - "score": 265.0 - }, - "Clinical Knowledge - # 
train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Clinical Knowledge - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Clinical Knowledge - # prompt tokens": { - "description": "min=394.77, mean=394.77, max=394.77, sum=789.54 (2)", - "tab": "General information", - "score": 394.76981132075474 - }, - "Clinical Knowledge - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" - } - } - }, - { - "evaluation_name": "Conceptual Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Conceptual Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.694, - "details": { - "description": "min=0.694, mean=0.694, max=0.694, sum=1.387 (2)", - "tab": "Accuracy", - "Conceptual Physics - Observed inference time (s)": { - "description": "min=0.828, mean=0.828, max=0.828, sum=1.656 (2)", - "tab": "Efficiency", - "score": 0.8279458959051903 - }, - "Conceptual Physics - # eval": { - "description": "min=235, mean=235, max=235, sum=470 (2)", - "tab": "General information", - "score": 235.0 - }, - "Conceptual Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Conceptual Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Conceptual Physics - # prompt tokens": { - "description": "min=309.477, mean=309.477, max=309.477, sum=618.953 (2)", - "tab": "General information", - "score": 309.4765957446809 - }, - "Conceptual Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" - } - } - }, - { - "evaluation_name": "Electrical Engineering", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Electrical Engineering", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.69, - "details": { - "description": "min=0.69, mean=0.69, max=0.69, sum=1.379 (2)", - "tab": "Accuracy", - "Electrical Engineering - Observed inference time (s)": { - "description": "min=10.257, mean=10.257, max=10.257, sum=20.514 (2)", - "tab": "Efficiency", - "score": 10.257030944166512 - }, - "Electrical Engineering - # eval": { - "description": "min=145, mean=145, max=145, sum=290 (2)", - "tab": "General information", - "score": 145.0 - }, - "Electrical Engineering - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - 
"tab": "General information", - "score": 5.0 - }, - "Electrical Engineering - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Electrical Engineering - # prompt tokens": { - "description": "min=480.524, mean=480.524, max=480.524, sum=961.048 (2)", - "tab": "General information", - "score": 480.5241379310345 - }, - "Electrical Engineering - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" - } - } - }, - { - "evaluation_name": "Elementary Mathematics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Elementary Mathematics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.487, - "details": { - "description": "min=0.487, mean=0.487, max=0.487, sum=0.974 (2)", - "tab": "Accuracy", - "Elementary Mathematics - Observed inference time (s)": { - "description": "min=0.751, mean=0.751, max=0.751, sum=1.502 (2)", - "tab": "Efficiency", - "score": 0.7508898708555434 - }, - "Elementary Mathematics - # eval": { - "description": "min=378, mean=378, max=378, sum=756 (2)", - "tab": "General information", - "score": 378.0 - }, - "Elementary Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Elementary Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Elementary Mathematics - # prompt tokens": { - "description": "min=599.828, mean=599.828, max=599.828, sum=1199.656 (2)", - "tab": "General information", - "score": 599.8280423280423 - }, - "Elementary Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" - } - } - }, - { - "evaluation_name": "Formal Logic", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Formal Logic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5, - "details": { - "description": "min=0.5, mean=0.5, max=0.5, sum=1 (2)", - "tab": "Accuracy", - "Formal Logic - Observed inference time (s)": { - "description": "min=0.763, mean=0.763, max=0.763, sum=1.525 (2)", - "tab": "Efficiency", - "score": 0.7626136711665562 - }, - "Formal Logic - # eval": { - "description": "min=126, mean=126, max=126, sum=252 (2)", - "tab": "General information", - "score": 126.0 - }, - "Formal Logic - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Formal Logic - 
truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Formal Logic - # prompt tokens": { - "description": "min=623.508, mean=623.508, max=623.508, sum=1247.016 (2)", - "tab": "General information", - "score": 623.5079365079365 - }, - "Formal Logic - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" - } - } - }, - { - "evaluation_name": "High School World History", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on High School World History", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.869, - "details": { - "description": "min=0.869, mean=0.869, max=0.869, sum=1.738 (2)", - "tab": "Accuracy", - "High School Biology - Observed inference time (s)": { - "description": "min=0.789, mean=0.789, max=0.789, sum=1.577 (2)", - "tab": "Efficiency", - "score": 0.7886250380546816 - }, - "High School Chemistry - Observed inference time (s)": { - "description": "min=0.837, mean=0.837, max=0.837, sum=1.675 (2)", - "tab": "Efficiency", - "score": 0.8373666197208348 - }, - "High School Computer Science - Observed inference time (s)": { - "description": "min=0.961, mean=0.961, max=0.961, sum=1.922 (2)", - "tab": "Efficiency", - "score": 0.9611564636230469 - }, - "High School European History - Observed inference time (s)": { - "description": "min=1.13, mean=1.13, max=1.13, sum=2.26 (2)", - "tab": "Efficiency", - "score": 1.129964493260239 - }, - "High School Geography - Observed inference time (s)": { - "description": "min=0.754, mean=0.754, max=0.754, sum=1.508 (2)", - "tab": "Efficiency", - "score": 0.7538033362590906 - }, - "High School Government And Politics - Observed inference time (s)": { - "description": "min=0.688, mean=0.688, max=0.688, sum=1.375 (2)", - "tab": "Efficiency", - "score": 0.6876482963562012 - }, - "High School Macroeconomics - Observed inference time (s)": { - "description": "min=0.718, mean=0.718, max=0.718, sum=1.437 (2)", - "tab": "Efficiency", - "score": 0.7183168649673461 - }, - "High School Mathematics - Observed inference time (s)": { - "description": "min=0.782, mean=0.782, max=0.782, sum=1.564 (2)", - "tab": "Efficiency", - "score": 0.7819750944773356 - }, - "High School Microeconomics - Observed inference time (s)": { - "description": "min=0.802, mean=0.802, max=0.802, sum=1.603 (2)", - "tab": "Efficiency", - "score": 0.8016475258755082 - }, - "High School Physics - Observed inference time (s)": { - "description": "min=0.86, mean=0.86, max=0.86, sum=1.721 (2)", - "tab": "Efficiency", - "score": 0.860422892286288 - }, - "High School Psychology - Observed inference time (s)": { - "description": "min=0.675, mean=0.675, max=0.675, sum=1.35 (2)", - "tab": "Efficiency", - "score": 0.6752404208577008 - }, - "High School Statistics - Observed inference time (s)": { - "description": "min=9.407, mean=9.407, max=9.407, sum=18.814 (2)", - "tab": "Efficiency", - "score": 9.407231820954216 - }, - "High School US History - Observed inference time (s)": { - "description": 
"min=1.054, mean=1.054, max=1.054, sum=2.109 (2)", - "tab": "Efficiency", - "score": 1.0542718312319588 - }, - "High School World History - Observed inference time (s)": { - "description": "min=0.848, mean=0.848, max=0.848, sum=1.695 (2)", - "tab": "Efficiency", - "score": 0.8476851751029743 - }, - "High School Biology - # eval": { - "description": "min=310, mean=310, max=310, sum=620 (2)", - "tab": "General information", - "score": 310.0 - }, - "High School Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Biology - # prompt tokens": { - "description": "min=501.255, mean=501.255, max=501.255, sum=1002.51 (2)", - "tab": "General information", - "score": 501.2548387096774 - }, - "High School Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Chemistry - # eval": { - "description": "min=203, mean=203, max=203, sum=406 (2)", - "tab": "General information", - "score": 203.0 - }, - "High School Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # prompt tokens": { - "description": "min=515.473, mean=515.473, max=515.473, sum=1030.946 (2)", - "tab": "General information", - "score": 515.4729064039409 - }, - "High School Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "High School Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # prompt tokens": { - "description": "min=954.08, mean=954.08, max=954.08, sum=1908.16 (2)", - "tab": "General information", - "score": 954.08 - }, - "High School Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School European History - # eval": { - "description": "min=165, mean=165, max=165, sum=330 (2)", - "tab": "General information", - "score": 165.0 - }, - "High School European History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School European History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School European History - # prompt tokens": { - "description": "min=2823.23, mean=2823.23, max=2823.23, sum=5646.461 (2)", - "tab": "General information", - "score": 2823.230303030303 - }, - "High School European History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Geography - # eval": { - "description": "min=198, mean=198, max=198, 
sum=396 (2)", - "tab": "General information", - "score": 198.0 - }, - "High School Geography - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Geography - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Geography - # prompt tokens": { - "description": "min=392.939, mean=392.939, max=392.939, sum=785.879 (2)", - "tab": "General information", - "score": 392.93939393939394 - }, - "High School Geography - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Government And Politics - # eval": { - "description": "min=193, mean=193, max=193, sum=386 (2)", - "tab": "General information", - "score": 193.0 - }, - "High School Government And Politics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Government And Politics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # prompt tokens": { - "description": "min=475.44, mean=475.44, max=475.44, sum=950.881 (2)", - "tab": "General information", - "score": 475.440414507772 - }, - "High School Government And Politics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Macroeconomics - # eval": { - "description": "min=390, mean=390, max=390, sum=780 (2)", - "tab": "General information", - "score": 390.0 - }, - "High School Macroeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Macroeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - # prompt tokens": { - "description": "min=395.962, mean=395.962, max=395.962, sum=791.923 (2)", - "tab": "General information", - "score": 395.96153846153845 - }, - "High School Macroeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Mathematics - # eval": { - "description": "min=270, mean=270, max=270, sum=540 (2)", - "tab": "General information", - "score": 270.0 - }, - "High School Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # prompt tokens": { - "description": "min=580.393, mean=580.393, max=580.393, sum=1160.785 (2)", - "tab": "General information", - "score": 580.3925925925926 - }, - "High School Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Microeconomics - # eval": { - "description": "min=238, mean=238, max=238, sum=476 (2)", - "tab": "General information", - "score": 238.0 - }, - "High School Microeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Microeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - 
"tab": "General information", - "score": 0.0 - }, - "High School Microeconomics - # prompt tokens": { - "description": "min=414.361, mean=414.361, max=414.361, sum=828.723 (2)", - "tab": "General information", - "score": 414.3613445378151 - }, - "High School Microeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Physics - # eval": { - "description": "min=151, mean=151, max=151, sum=302 (2)", - "tab": "General information", - "score": 151.0 - }, - "High School Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # prompt tokens": { - "description": "min=592.252, mean=592.252, max=592.252, sum=1184.503 (2)", - "tab": "General information", - "score": 592.2516556291391 - }, - "High School Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Psychology - # eval": { - "description": "min=545, mean=545, max=545, sum=1090 (2)", - "tab": "General information", - "score": 545.0 - }, - "High School Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # prompt tokens": { - "description": "min=496.51, mean=496.51, max=496.51, sum=993.02 (2)", - "tab": "General information", - "score": 496.5100917431193 - }, - "High School Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Statistics - # eval": { - "description": "min=216, mean=216, max=216, sum=432 (2)", - "tab": "General information", - "score": 216.0 - }, - "High School Statistics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Statistics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Statistics - # prompt tokens": { - "description": "min=860.532, mean=860.532, max=860.532, sum=1721.065 (2)", - "tab": "General information", - "score": 860.5324074074074 - }, - "High School Statistics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School US History - # eval": { - "description": "min=204, mean=204, max=204, sum=408 (2)", - "tab": "General information", - "score": 204.0 - }, - "High School US History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School US History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # prompt tokens": { - "description": "min=2239.544, mean=2239.544, max=2239.544, sum=4479.088 (2)", - "tab": "General information", - "score": 2239.544117647059 - }, - "High School US History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School World 
History - # eval": { - "description": "min=237, mean=237, max=237, sum=474 (2)", - "tab": "General information", - "score": 237.0 - }, - "High School World History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School World History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School World History - # prompt tokens": { - "description": "min=1437.051, mean=1437.051, max=1437.051, sum=2874.101 (2)", - "tab": "General information", - "score": 1437.0506329113923 - }, - "High School World History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" - } - } - }, - { - "evaluation_name": "Human Sexuality", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Human Sexuality", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.84, - "details": { - "description": "min=0.84, mean=0.84, max=0.84, sum=1.679 (2)", - "tab": "Accuracy", - "Human Aging - Observed inference time (s)": { - "description": "min=0.619, mean=0.619, max=0.619, sum=1.237 (2)", - "tab": "Efficiency", - "score": 0.6185014632785267 - }, - "Human Sexuality - Observed inference time (s)": { - "description": "min=0.851, mean=0.851, max=0.851, sum=1.702 (2)", - "tab": "Efficiency", - "score": 0.8510732850955642 - }, - "Human Aging - # eval": { - "description": "min=223, mean=223, max=223, sum=446 (2)", - "tab": "General information", - "score": 223.0 - }, - "Human Aging - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Aging - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Aging - # prompt tokens": { - "description": "min=323.906, mean=323.906, max=323.906, sum=647.812 (2)", - "tab": "General information", - "score": 323.90582959641256 - }, - "Human Aging - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Human Sexuality - # eval": { - "description": "min=131, mean=131, max=131, sum=262 (2)", - "tab": "General information", - "score": 131.0 - }, - "Human Sexuality - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Sexuality - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # prompt tokens": { - "description": "min=338.74, mean=338.74, max=338.74, sum=677.481 (2)", - "tab": "General information", - "score": 338.74045801526717 - }, - "Human Sexuality - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": 
"test", - "groups": "mmlu_human_sexuality" - } - } - }, - { - "evaluation_name": "International Law", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on International Law", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.835, - "details": { - "description": "min=0.835, mean=0.835, max=0.835, sum=1.669 (2)", - "tab": "Accuracy", - "International Law - Observed inference time (s)": { - "description": "min=0.93, mean=0.93, max=0.93, sum=1.859 (2)", - "tab": "Efficiency", - "score": 0.929545400556454 - }, - "International Law - # eval": { - "description": "min=121, mean=121, max=121, sum=242 (2)", - "tab": "General information", - "score": 121.0 - }, - "International Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "International Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "International Law - # prompt tokens": { - "description": "min=651.686, mean=651.686, max=651.686, sum=1303.372 (2)", - "tab": "General information", - "score": 651.6859504132232 - }, - "International Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" - } - } - }, - { - "evaluation_name": "Logical Fallacies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Logical Fallacies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.853, - "details": { - "description": "min=0.853, mean=0.853, max=0.853, sum=1.706 (2)", - "tab": "Accuracy", - "Logical Fallacies - Observed inference time (s)": { - "description": "min=0.813, mean=0.813, max=0.813, sum=1.627 (2)", - "tab": "Efficiency", - "score": 0.8133661293544652 - }, - "Logical Fallacies - # eval": { - "description": "min=163, mean=163, max=163, sum=326 (2)", - "tab": "General information", - "score": 163.0 - }, - "Logical Fallacies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Logical Fallacies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Logical Fallacies - # prompt tokens": { - "description": "min=443.969, mean=443.969, max=443.969, sum=887.939 (2)", - "tab": "General information", - "score": 443.96932515337426 - }, - "Logical Fallacies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" - } - } - }, - { - "evaluation_name": "Machine Learning", - 
"source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Machine Learning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.562, - "details": { - "description": "min=0.562, mean=0.562, max=0.562, sum=1.125 (2)", - "tab": "Accuracy", - "Machine Learning - Observed inference time (s)": { - "description": "min=0.916, mean=0.916, max=0.916, sum=1.832 (2)", - "tab": "Efficiency", - "score": 0.9159843921661377 - }, - "Machine Learning - # eval": { - "description": "min=112, mean=112, max=112, sum=224 (2)", - "tab": "General information", - "score": 112.0 - }, - "Machine Learning - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Machine Learning - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Machine Learning - # prompt tokens": { - "description": "min=705.973, mean=705.973, max=705.973, sum=1411.946 (2)", - "tab": "General information", - "score": 705.9732142857143 - }, - "Machine Learning - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" - } - } - }, - { - "evaluation_name": "Management", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Management", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.893, - "details": { - "description": "min=0.893, mean=0.893, max=0.893, sum=1.786 (2)", - "tab": "Accuracy", - "Management - Observed inference time (s)": { - "description": "min=0.817, mean=0.817, max=0.817, sum=1.633 (2)", - "tab": "Efficiency", - "score": 0.8166041281616804 - }, - "Management - # eval": { - "description": "min=103, mean=103, max=103, sum=206 (2)", - "tab": "General information", - "score": 103.0 - }, - "Management - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Management - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Management - # prompt tokens": { - "description": "min=284.68, mean=284.68, max=284.68, sum=569.359 (2)", - "tab": "General information", - "score": 284.6796116504854 - }, - "Management - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" - } - } - }, - { - "evaluation_name": "Marketing", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Marketing", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.893, - "details": { - "description": "min=0.893, mean=0.893, max=0.893, sum=1.786 (2)", - "tab": "Accuracy", - "Marketing - Observed inference time (s)": { - "description": "min=0.789, mean=0.789, max=0.789, sum=1.579 (2)", - "tab": "Efficiency", - "score": 0.789409975720267 - }, - "Marketing - # eval": { - "description": "min=234, mean=234, max=234, sum=468 (2)", - "tab": "General information", - "score": 234.0 - }, - "Marketing - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Marketing - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Marketing - # prompt tokens": { - "description": "min=428.726, mean=428.726, max=428.726, sum=857.453 (2)", - "tab": "General information", - "score": 428.7264957264957 - }, - "Marketing - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" - } - } - }, - { - "evaluation_name": "Medical Genetics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Medical Genetics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.75, - "details": { - "description": "min=0.75, mean=0.75, max=0.75, sum=1.5 (2)", - "tab": "Accuracy", - "Medical Genetics - Observed inference time (s)": { - "description": "min=0.857, mean=0.857, max=0.857, sum=1.713 (2)", - "tab": "Efficiency", - "score": 0.8565307760238647 - }, - "Medical Genetics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Medical Genetics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Medical Genetics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Medical Genetics - # prompt tokens": { - "description": "min=334.69, mean=334.69, max=334.69, sum=669.38 (2)", - "tab": "General information", - "score": 334.69 - }, - "Medical Genetics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" - } - } - }, - { - "evaluation_name": "Miscellaneous", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Miscellaneous", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.866, - "details": { - "description": "min=0.866, mean=0.866, max=0.866, sum=1.732 (2)", - "tab": "Accuracy", - "Miscellaneous - Observed inference time (s)": { - "description": "min=2.759, mean=2.759, max=2.759, sum=5.518 (2)", - "tab": "Efficiency", - "score": 2.7590373143991442 - }, - "Miscellaneous - # eval": { - "description": "min=783, mean=783, max=783, sum=1566 (2)", - "tab": "General information", - "score": 783.0 - }, - "Miscellaneous - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Miscellaneous - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Miscellaneous - # prompt tokens": { - "description": "min=325.215, mean=325.215, max=325.215, sum=650.429 (2)", - "tab": "General information", - "score": 325.2145593869732 - }, - "Miscellaneous - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" - } - } - }, - { - "evaluation_name": "Moral Scenarios", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Moral Scenarios", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.369, - "details": { - "description": "min=0.369, mean=0.369, max=0.369, sum=0.737 (2)", - "tab": "Accuracy", - "Moral Disputes - Observed inference time (s)": { - "description": "min=0.749, mean=0.749, max=0.749, sum=1.497 (2)", - "tab": "Efficiency", - "score": 0.7485969907286539 - }, - "Moral Scenarios - Observed inference time (s)": { - "description": "min=0.781, mean=0.781, max=0.781, sum=1.561 (2)", - "tab": "Efficiency", - "score": 0.7806768483955767 - }, - "Moral Disputes - # eval": { - "description": "min=346, mean=346, max=346, sum=692 (2)", - "tab": "General information", - "score": 346.0 - }, - "Moral Disputes - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Disputes - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Disputes - # prompt tokens": { - "description": "min=494.63, mean=494.63, max=494.63, sum=989.26 (2)", - "tab": "General information", - "score": 494.6300578034682 - }, - "Moral Disputes - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Moral Scenarios - # eval": { - "description": "min=895, mean=895, max=895, sum=1790 (2)", - "tab": "General information", - "score": 895.0 - }, - "Moral Scenarios - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Scenarios - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # prompt tokens": { - "description": "min=687.566, mean=687.566, max=687.566, 
sum=1375.133 (2)", - "tab": "General information", - "score": 687.5664804469274 - }, - "Moral Scenarios - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" - } - } - }, - { - "evaluation_name": "Nutrition", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Nutrition", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.709, - "details": { - "description": "min=0.709, mean=0.709, max=0.709, sum=1.418 (2)", - "tab": "Accuracy", - "Nutrition - Observed inference time (s)": { - "description": "min=0.81, mean=0.81, max=0.81, sum=1.621 (2)", - "tab": "Efficiency", - "score": 0.8104506489498163 - }, - "Nutrition - # eval": { - "description": "min=306, mean=306, max=306, sum=612 (2)", - "tab": "General information", - "score": 306.0 - }, - "Nutrition - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Nutrition - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Nutrition - # prompt tokens": { - "description": "min=589.663, mean=589.663, max=589.663, sum=1179.327 (2)", - "tab": "General information", - "score": 589.6633986928105 - }, - "Nutrition - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" - } - } - }, - { - "evaluation_name": "Prehistory", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Prehistory", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.812, - "details": { - "description": "min=0.812, mean=0.812, max=0.812, sum=1.623 (2)", - "tab": "Accuracy", - "Prehistory - Observed inference time (s)": { - "description": "min=0.7, mean=0.7, max=0.7, sum=1.399 (2)", - "tab": "Efficiency", - "score": 0.6996216737193826 - }, - "Prehistory - # eval": { - "description": "min=324, mean=324, max=324, sum=648 (2)", - "tab": "General information", - "score": 324.0 - }, - "Prehistory - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Prehistory - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Prehistory - # prompt tokens": { - "description": "min=538.179, mean=538.179, max=538.179, sum=1076.358 (2)", - "tab": "General information", - "score": 538.179012345679 - }, - "Prehistory - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - 
"generation_config": { - "additional_details": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" - } - } - }, - { - "evaluation_name": "Public Relations", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Public Relations", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.691, - "details": { - "description": "min=0.691, mean=0.691, max=0.691, sum=1.382 (2)", - "tab": "Accuracy", - "Public Relations - Observed inference time (s)": { - "description": "min=0.98, mean=0.98, max=0.98, sum=1.961 (2)", - "tab": "Efficiency", - "score": 0.980262413891879 - }, - "Public Relations - # eval": { - "description": "min=110, mean=110, max=110, sum=220 (2)", - "tab": "General information", - "score": 110.0 - }, - "Public Relations - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Public Relations - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Public Relations - # prompt tokens": { - "description": "min=426.982, mean=426.982, max=426.982, sum=853.964 (2)", - "tab": "General information", - "score": 426.9818181818182 - }, - "Public Relations - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" - } - } - }, - { - "evaluation_name": "Security Studies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Security Studies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.812, - "details": { - "description": "min=0.812, mean=0.812, max=0.812, sum=1.624 (2)", - "tab": "Accuracy", - "Security Studies - Observed inference time (s)": { - "description": "min=0.857, mean=0.857, max=0.857, sum=1.713 (2)", - "tab": "Efficiency", - "score": 0.8567250339352355 - }, - "Security Studies - # eval": { - "description": "min=245, mean=245, max=245, sum=490 (2)", - "tab": "General information", - "score": 245.0 - }, - "Security Studies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Security Studies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Security Studies - # prompt tokens": { - "description": "min=1185.8, mean=1185.8, max=1185.8, sum=2371.6 (2)", - "tab": "General information", - "score": 1185.8 - }, - "Security Studies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": 
"mmlu_security_studies" - } - } - }, - { - "evaluation_name": "Sociology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Sociology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.92, - "details": { - "description": "min=0.92, mean=0.92, max=0.92, sum=1.841 (2)", - "tab": "Accuracy", - "Sociology - Observed inference time (s)": { - "description": "min=7.515, mean=7.515, max=7.515, sum=15.029 (2)", - "tab": "Efficiency", - "score": 7.514506837028769 - }, - "Sociology - # eval": { - "description": "min=201, mean=201, max=201, sum=402 (2)", - "tab": "General information", - "score": 201.0 - }, - "Sociology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Sociology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Sociology - # prompt tokens": { - "description": "min=459.642, mean=459.642, max=459.642, sum=919.284 (2)", - "tab": "General information", - "score": 459.64179104477614 - }, - "Sociology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" - } - } - }, - { - "evaluation_name": "Virology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Virology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.494, - "details": { - "description": "min=0.494, mean=0.494, max=0.494, sum=0.988 (2)", - "tab": "Accuracy", - "Virology - Observed inference time (s)": { - "description": "min=0.788, mean=0.788, max=0.788, sum=1.577 (2)", - "tab": "Efficiency", - "score": 0.7884655989796282 - }, - "Virology - # eval": { - "description": "min=166, mean=166, max=166, sum=332 (2)", - "tab": "General information", - "score": 166.0 - }, - "Virology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Virology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Virology - # prompt tokens": { - "description": "min=337.06, mean=337.06, max=337.06, sum=674.12 (2)", - "tab": "General information", - "score": 337.06024096385545 - }, - "Virology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" - } - } - }, - { - "evaluation_name": "World Religions", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" 
- ] - }, - "metric_config": { - "evaluation_description": "EM on World Religions", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.883, - "details": { - "description": "min=0.883, mean=0.883, max=0.883, sum=1.766 (2)", - "tab": "Accuracy", - "World Religions - Observed inference time (s)": { - "description": "min=0.802, mean=0.802, max=0.802, sum=1.604 (2)", - "tab": "Efficiency", - "score": 0.8022187299895704 - }, - "World Religions - # eval": { - "description": "min=171, mean=171, max=171, sum=342 (2)", - "tab": "General information", - "score": 171.0 - }, - "World Religions - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "World Religions - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "World Religions - # prompt tokens": { - "description": "min=270.187, mean=270.187, max=270.187, sum=540.374 (2)", - "tab": "General information", - "score": 270.187134502924 - }, - "World Religions - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" - } - } - }, - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.192, - "details": { - "tab": "Efficiency" - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_mmlu/google/text-unicorn@001/95eda13a-cd34-4170-b2db-f2ead47250f9.json b/data/helm_mmlu/google/text-unicorn@001/95eda13a-cd34-4170-b2db-f2ead47250f9.json deleted file mode 100644 index 42c5040aa..000000000 --- a/data/helm_mmlu/google/text-unicorn@001/95eda13a-cd34-4170-b2db-f2ead47250f9.json +++ /dev/null @@ -1,3021 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/google_text-unicorn@001/1770835937.459157", - "retrieved_timestamp": "1770835937.459157", - "source_metadata": { - "source_name": "helm_mmlu", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "PaLM-2 Unicorn", - "id": "google/text-unicorn@001", - "developer": "google", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "MMLU All Subjects", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU All Subjects", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.786, - "details": { - "description": "min=0.493, mean=0.786, max=0.979, sum=89.606 (114)", - "tab": "Accuracy", - "MMLU All 
Subjects - Observed inference time (s)": { - "description": "min=0.743, mean=1.052, max=2.108, sum=119.953 (114)", - "tab": "Efficiency", - "score": 1.0522220782452074 - }, - "MMLU All Subjects - # eval": { - "description": "min=100, mean=246.351, max=1534, sum=28084 (114)", - "tab": "General information", - "score": 246.35087719298247 - }, - "MMLU All Subjects - # train": { - "description": "min=5, mean=5, max=5, sum=570 (114)", - "tab": "General information", - "score": 5.0 - }, - "MMLU All Subjects - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (114)", - "tab": "General information", - "score": 0.0 - }, - "MMLU All Subjects - # prompt tokens": { - "description": "min=270.187, mean=635.61, max=2823.23, sum=72459.527 (114)", - "tab": "General information", - "score": 635.6098850770794 - }, - "MMLU All Subjects - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (114)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - 
"mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] - } - } - }, - { - "evaluation_name": "Abstract Algebra", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Abstract Algebra", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.51, - "details": { - "description": "min=0.51, mean=0.51, max=0.51, sum=1.02 (2)", - "tab": "Accuracy", - "Abstract Algebra - Observed inference time (s)": { - "description": "min=1.277, mean=1.277, max=1.277, sum=2.555 (2)", - "tab": "Efficiency", - "score": 1.2773328518867493 - }, - "Abstract Algebra - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Abstract Algebra - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Abstract Algebra - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Abstract Algebra - # prompt tokens": { - "description": "min=387.12, mean=387.12, max=387.12, sum=774.24 (2)", - "tab": "General information", - "score": 387.12 - }, - "Abstract Algebra - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" - } - } - }, - { - "evaluation_name": "Anatomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Anatomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.733, - "details": { - "description": "min=0.733, mean=0.733, max=0.733, sum=1.467 (2)", - "tab": "Accuracy", - "Anatomy - Observed inference time (s)": { - "description": "min=0.957, mean=0.957, max=0.957, sum=1.914 (2)", - "tab": "Efficiency", - "score": 0.9569159172199391 - }, - "Anatomy - # eval": { - "description": "min=135, mean=135, max=135, sum=270 (2)", - "tab": "General information", - "score": 135.0 - }, - "Anatomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Anatomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Anatomy - # prompt tokens": { - "description": "min=344.089, mean=344.089, max=344.089, sum=688.178 (2)", - "tab": "General information", - "score": 344.0888888888889 - }, - "Anatomy - # output tokens": { - 
"description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" - } - } - }, - { - "evaluation_name": "College Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on College Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.549, - "details": { - "description": "min=0.549, mean=0.549, max=0.549, sum=1.098 (2)", - "tab": "Accuracy", - "College Chemistry - Observed inference time (s)": { - "description": "min=0.934, mean=0.934, max=0.934, sum=1.869 (2)", - "tab": "Efficiency", - "score": 0.9343120718002319 - }, - "College Biology - Observed inference time (s)": { - "description": "min=0.873, mean=0.873, max=0.873, sum=1.746 (2)", - "tab": "Efficiency", - "score": 0.8729922622442245 - }, - "College Computer Science - Observed inference time (s)": { - "description": "min=1.165, mean=1.165, max=1.165, sum=2.33 (2)", - "tab": "Efficiency", - "score": 1.165095055103302 - }, - "College Mathematics - Observed inference time (s)": { - "description": "min=1.062, mean=1.062, max=1.062, sum=2.124 (2)", - "tab": "Efficiency", - "score": 1.0619186329841614 - }, - "College Medicine - Observed inference time (s)": { - "description": "min=0.978, mean=0.978, max=0.978, sum=1.957 (2)", - "tab": "Efficiency", - "score": 0.978282785140021 - }, - "College Physics - Observed inference time (s)": { - "description": "min=0.852, mean=0.852, max=0.852, sum=1.704 (2)", - "tab": "Efficiency", - "score": 0.8518095483966902 - }, - "College Chemistry - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Chemistry - # prompt tokens": { - "description": "min=573.7, mean=573.7, max=573.7, sum=1147.4 (2)", - "tab": "General information", - "score": 573.7 - }, - "College Chemistry - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # eval": { - "description": "min=144, mean=144, max=144, sum=288 (2)", - "tab": "General information", - "score": 144.0 - }, - "College Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # prompt tokens": { - "description": "min=480.875, mean=480.875, max=480.875, sum=961.75 (2)", - "tab": "General information", - "score": 480.875 - }, - "College Biology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General 
information", - "score": 100.0 - }, - "College Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer Science - # prompt tokens": { - "description": "min=861.96, mean=861.96, max=861.96, sum=1723.92 (2)", - "tab": "General information", - "score": 861.96 - }, - "College Computer Science - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # prompt tokens": { - "description": "min=636.94, mean=636.94, max=636.94, sum=1273.88 (2)", - "tab": "General information", - "score": 636.94 - }, - "College Mathematics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Medicine - # eval": { - "description": "min=173, mean=173, max=173, sum=346 (2)", - "tab": "General information", - "score": 173.0 - }, - "College Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Medicine - # prompt tokens": { - "description": "min=512.584, mean=512.584, max=512.584, sum=1025.168 (2)", - "tab": "General information", - "score": 512.5838150289018 - }, - "College Medicine - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Physics - # eval": { - "description": "min=102, mean=102, max=102, sum=204 (2)", - "tab": "General information", - "score": 102.0 - }, - "College Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Physics - # prompt tokens": { - "description": "min=513.647, mean=513.647, max=513.647, sum=1027.294 (2)", - "tab": "General information", - "score": 513.6470588235294 - }, - "College Physics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" - } - } - }, - { - "evaluation_name": "Computer Security", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Computer Security", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 
1.0 - }, - "score_details": { - "score": 0.77, - "details": { - "description": "min=0.77, mean=0.77, max=0.77, sum=1.54 (2)", - "tab": "Accuracy", - "Computer Security - Observed inference time (s)": { - "description": "min=0.845, mean=0.845, max=0.845, sum=1.69 (2)", - "tab": "Efficiency", - "score": 0.8448482728004456 - }, - "Computer Security - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Computer Security - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Computer Security - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Computer Security - # prompt tokens": { - "description": "min=384.24, mean=384.24, max=384.24, sum=768.48 (2)", - "tab": "General information", - "score": 384.24 - }, - "Computer Security - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" - } - } - }, - { - "evaluation_name": "Econometrics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Econometrics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.649, - "details": { - "description": "min=0.649, mean=0.649, max=0.649, sum=1.298 (2)", - "tab": "Accuracy", - "Econometrics - Observed inference time (s)": { - "description": "min=0.852, mean=0.852, max=0.852, sum=1.704 (2)", - "tab": "Efficiency", - "score": 0.8522159112127203 - }, - "Econometrics - # eval": { - "description": "min=114, mean=114, max=114, sum=228 (2)", - "tab": "General information", - "score": 114.0 - }, - "Econometrics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Econometrics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Econometrics - # prompt tokens": { - "description": "min=644.395, mean=644.395, max=644.395, sum=1288.789 (2)", - "tab": "General information", - "score": 644.3947368421053 - }, - "Econometrics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" - } - } - }, - { - "evaluation_name": "Global Facts", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Global Facts", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.53, - "details": { - "description": "min=0.53, mean=0.53, max=0.53, sum=1.06 (2)", - "tab": "Accuracy", - "Global Facts - Observed inference 
time (s)": { - "description": "min=0.888, mean=0.888, max=0.888, sum=1.775 (2)", - "tab": "Efficiency", - "score": 0.8876941871643066 - }, - "Global Facts - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Global Facts - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Global Facts - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Global Facts - # prompt tokens": { - "description": "min=455.63, mean=455.63, max=455.63, sum=911.26 (2)", - "tab": "General information", - "score": 455.63 - }, - "Global Facts - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" - } - } - }, - { - "evaluation_name": "Jurisprudence", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Jurisprudence", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.88, - "details": { - "description": "min=0.88, mean=0.88, max=0.88, sum=1.759 (2)", - "tab": "Accuracy", - "Jurisprudence - Observed inference time (s)": { - "description": "min=1.017, mean=1.017, max=1.017, sum=2.034 (2)", - "tab": "Efficiency", - "score": 1.0168068651799802 - }, - "Jurisprudence - # eval": { - "description": "min=108, mean=108, max=108, sum=216 (2)", - "tab": "General information", - "score": 108.0 - }, - "Jurisprudence - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Jurisprudence - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Jurisprudence - # prompt tokens": { - "description": "min=414.444, mean=414.444, max=414.444, sum=828.889 (2)", - "tab": "General information", - "score": 414.44444444444446 - }, - "Jurisprudence - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" - } - } - }, - { - "evaluation_name": "Philosophy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Philosophy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.836, - "details": { - "description": "min=0.836, mean=0.836, max=0.836, sum=1.672 (2)", - "tab": "Accuracy", - "Philosophy - Observed inference time (s)": { - "description": "min=0.895, mean=0.895, max=0.895, sum=1.79 (2)", - "tab": "Efficiency", - "score": 0.8949410808048064 - }, - "Philosophy - # eval": { - "description": "min=311, mean=311, max=311, sum=622 
(2)", - "tab": "General information", - "score": 311.0 - }, - "Philosophy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Philosophy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Philosophy - # prompt tokens": { - "description": "min=339.093, mean=339.093, max=339.093, sum=678.186 (2)", - "tab": "General information", - "score": 339.09324758842445 - }, - "Philosophy - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" - } - } - }, - { - "evaluation_name": "Professional Psychology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Professional Psychology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.858, - "details": { - "description": "min=0.858, mean=0.858, max=0.858, sum=1.716 (2)", - "tab": "Accuracy", - "Professional Medicine - Observed inference time (s)": { - "description": "min=1.088, mean=1.088, max=1.088, sum=2.175 (2)", - "tab": "Efficiency", - "score": 1.0875138991019304 - }, - "Professional Accounting - Observed inference time (s)": { - "description": "min=0.978, mean=0.978, max=0.978, sum=1.956 (2)", - "tab": "Efficiency", - "score": 0.9778145923682139 - }, - "Professional Law - Observed inference time (s)": { - "description": "min=1.205, mean=1.205, max=1.205, sum=2.41 (2)", - "tab": "Efficiency", - "score": 1.204983455416743 - }, - "Professional Psychology - Observed inference time (s)": { - "description": "min=0.789, mean=0.789, max=0.789, sum=1.578 (2)", - "tab": "Efficiency", - "score": 0.7891469753645604 - }, - "Professional Medicine - # eval": { - "description": "min=272, mean=272, max=272, sum=544 (2)", - "tab": "General information", - "score": 272.0 - }, - "Professional Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Medicine - # prompt tokens": { - "description": "min=1104.614, mean=1104.614, max=1104.614, sum=2209.228 (2)", - "tab": "General information", - "score": 1104.6139705882354 - }, - "Professional Medicine - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Accounting - # eval": { - "description": "min=282, mean=282, max=282, sum=564 (2)", - "tab": "General information", - "score": 282.0 - }, - "Professional Accounting - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Accounting - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Accounting - # prompt tokens": { - "description": "min=752.83, mean=752.83, max=752.83, sum=1505.66 (2)", - "tab": "General information", - 
"score": 752.8297872340426 - }, - "Professional Accounting - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # eval": { - "description": "min=1534, mean=1534, max=1534, sum=3068 (2)", - "tab": "General information", - "score": 1534.0 - }, - "Professional Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # prompt tokens": { - "description": "min=1701.909, mean=1701.909, max=1701.909, sum=3403.819 (2)", - "tab": "General information", - "score": 1701.9093872229466 - }, - "Professional Law - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # eval": { - "description": "min=612, mean=612, max=612, sum=1224 (2)", - "tab": "General information", - "score": 612.0 - }, - "Professional Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # prompt tokens": { - "description": "min=594.446, mean=594.446, max=594.446, sum=1188.892 (2)", - "tab": "General information", - "score": 594.4460784313726 - }, - "Professional Psychology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" - } - } - }, - { - "evaluation_name": "Us Foreign Policy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Us Foreign Policy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.96, - "details": { - "description": "min=0.96, mean=0.96, max=0.96, sum=1.92 (2)", - "tab": "Accuracy", - "Us Foreign Policy - Observed inference time (s)": { - "description": "min=0.743, mean=0.743, max=0.743, sum=1.485 (2)", - "tab": "Efficiency", - "score": 0.7426803350448609 - }, - "Us Foreign Policy - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Us Foreign Policy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Us Foreign Policy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Us Foreign Policy - # prompt tokens": { - "description": "min=440.48, mean=440.48, max=440.48, sum=880.96 (2)", - "tab": "General information", - "score": 440.48 - }, - "Us Foreign Policy - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "us_foreign_policy", - 
"method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" - } - } - }, - { - "evaluation_name": "Astronomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Astronomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.862, - "details": { - "description": "min=0.862, mean=0.862, max=0.862, sum=1.724 (2)", - "tab": "Accuracy", - "Astronomy - Observed inference time (s)": { - "description": "min=0.843, mean=0.843, max=0.843, sum=1.686 (2)", - "tab": "Efficiency", - "score": 0.8429784712038542 - }, - "Astronomy - # eval": { - "description": "min=152, mean=152, max=152, sum=304 (2)", - "tab": "General information", - "score": 152.0 - }, - "Astronomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Astronomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Astronomy - # prompt tokens": { - "description": "min=613.033, mean=613.033, max=613.033, sum=1226.066 (2)", - "tab": "General information", - "score": 613.0328947368421 - }, - "Astronomy - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" - } - } - }, - { - "evaluation_name": "Business Ethics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Business Ethics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.83, - "details": { - "description": "min=0.83, mean=0.83, max=0.83, sum=1.66 (2)", - "tab": "Accuracy", - "Business Ethics - Observed inference time (s)": { - "description": "min=1.018, mean=1.018, max=1.018, sum=2.035 (2)", - "tab": "Efficiency", - "score": 1.0176324987411498 - }, - "Business Ethics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Business Ethics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Business Ethics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Business Ethics - # prompt tokens": { - "description": "min=559.31, mean=559.31, max=559.31, sum=1118.62 (2)", - "tab": "General information", - "score": 559.31 - }, - "Business Ethics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" - } - } - }, - { - "evaluation_name": "Clinical Knowledge", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": 
"url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Clinical Knowledge", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.804, - "details": { - "description": "min=0.804, mean=0.804, max=0.804, sum=1.608 (2)", - "tab": "Accuracy", - "Clinical Knowledge - Observed inference time (s)": { - "description": "min=0.954, mean=0.954, max=0.954, sum=1.909 (2)", - "tab": "Efficiency", - "score": 0.9543584787620688 - }, - "Clinical Knowledge - # eval": { - "description": "min=265, mean=265, max=265, sum=530 (2)", - "tab": "General information", - "score": 265.0 - }, - "Clinical Knowledge - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Clinical Knowledge - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Clinical Knowledge - # prompt tokens": { - "description": "min=394.77, mean=394.77, max=394.77, sum=789.54 (2)", - "tab": "General information", - "score": 394.76981132075474 - }, - "Clinical Knowledge - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" - } - } - }, - { - "evaluation_name": "Conceptual Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Conceptual Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.809, - "details": { - "description": "min=0.809, mean=0.809, max=0.809, sum=1.617 (2)", - "tab": "Accuracy", - "Conceptual Physics - Observed inference time (s)": { - "description": "min=0.834, mean=0.834, max=0.834, sum=1.667 (2)", - "tab": "Efficiency", - "score": 0.8336589884250722 - }, - "Conceptual Physics - # eval": { - "description": "min=235, mean=235, max=235, sum=470 (2)", - "tab": "General information", - "score": 235.0 - }, - "Conceptual Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Conceptual Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Conceptual Physics - # prompt tokens": { - "description": "min=309.477, mean=309.477, max=309.477, sum=618.953 (2)", - "tab": "General information", - "score": 309.4765957446809 - }, - "Conceptual Physics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" - } - } - }, - { - "evaluation_name": "Electrical Engineering", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Electrical Engineering", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.772, - "details": { - "description": "min=0.772, mean=0.772, max=0.772, sum=1.545 (2)", - "tab": "Accuracy", - "Electrical Engineering - Observed inference time (s)": { - "description": "min=1.064, mean=1.064, max=1.064, sum=2.128 (2)", - "tab": "Efficiency", - "score": 1.0639554155283961 - }, - "Electrical Engineering - # eval": { - "description": "min=145, mean=145, max=145, sum=290 (2)", - "tab": "General information", - "score": 145.0 - }, - "Electrical Engineering - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Electrical Engineering - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Electrical Engineering - # prompt tokens": { - "description": "min=480.524, mean=480.524, max=480.524, sum=961.048 (2)", - "tab": "General information", - "score": 480.5241379310345 - }, - "Electrical Engineering - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" - } - } - }, - { - "evaluation_name": "Elementary Mathematics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Elementary Mathematics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.661, - "details": { - "description": "min=0.661, mean=0.661, max=0.661, sum=1.323 (2)", - "tab": "Accuracy", - "Elementary Mathematics - Observed inference time (s)": { - "description": "min=1.026, mean=1.026, max=1.026, sum=2.052 (2)", - "tab": "Efficiency", - "score": 1.0261994568759172 - }, - "Elementary Mathematics - # eval": { - "description": "min=378, mean=378, max=378, sum=756 (2)", - "tab": "General information", - "score": 378.0 - }, - "Elementary Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Elementary Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Elementary Mathematics - # prompt tokens": { - "description": "min=599.828, mean=599.828, max=599.828, sum=1199.656 (2)", - "tab": "General information", - "score": 599.8280423280423 - }, - "Elementary Mathematics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" - } - } - }, - { - "evaluation_name": "Formal Logic", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Formal Logic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.659, - "details": { - "description": "min=0.659, mean=0.659, max=0.659, sum=1.317 (2)", - "tab": "Accuracy", - "Formal Logic - Observed inference time (s)": { - "description": "min=1.016, mean=1.016, max=1.016, sum=2.032 (2)", - "tab": "Efficiency", - "score": 1.0157842484731523 - }, - "Formal Logic - # eval": { - "description": "min=126, mean=126, max=126, sum=252 (2)", - "tab": "General information", - "score": 126.0 - }, - "Formal Logic - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Formal Logic - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Formal Logic - # prompt tokens": { - "description": "min=623.508, mean=623.508, max=623.508, sum=1247.016 (2)", - "tab": "General information", - "score": 623.5079365079365 - }, - "Formal Logic - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" - } - } - }, - { - "evaluation_name": "High School World History", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on High School World History", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.911, - "details": { - "description": "min=0.911, mean=0.911, max=0.911, sum=1.823 (2)", - "tab": "Accuracy", - "High School Biology - Observed inference time (s)": { - "description": "min=1.026, mean=1.026, max=1.026, sum=2.052 (2)", - "tab": "Efficiency", - "score": 1.026222055189071 - }, - "High School Chemistry - Observed inference time (s)": { - "description": "min=1.054, mean=1.054, max=1.054, sum=2.109 (2)", - "tab": "Efficiency", - "score": 1.054317417990398 - }, - "High School Computer Science - Observed inference time (s)": { - "description": "min=1.519, mean=1.519, max=1.519, sum=3.039 (2)", - "tab": "Efficiency", - "score": 1.519298493862152 - }, - "High School European History - Observed inference time (s)": { - "description": "min=2.108, mean=2.108, max=2.108, sum=4.215 (2)", - "tab": "Efficiency", - "score": 2.107529640197754 - }, - "High School Geography - Observed inference time (s)": { - "description": "min=1.159, mean=1.159, max=1.159, sum=2.319 (2)", - "tab": "Efficiency", - "score": 1.1594982544581096 - }, - "High School Government And Politics - Observed inference time (s)": { - "description": "min=1.056, mean=1.056, max=1.056, sum=2.112 (2)", - "tab": "Efficiency", - "score": 1.0561638829621627 - }, - "High School Macroeconomics - Observed inference time (s)": { - "description": "min=1.016, mean=1.016, max=1.016, sum=2.033 (2)", - "tab": "Efficiency", - "score": 1.0163854268880992 - }, - "High School Mathematics - Observed inference time (s)": { - "description": "min=1.018, 
mean=1.018, max=1.018, sum=2.036 (2)", - "tab": "Efficiency", - "score": 1.0180342506479334 - }, - "High School Microeconomics - Observed inference time (s)": { - "description": "min=0.905, mean=0.905, max=0.905, sum=1.811 (2)", - "tab": "Efficiency", - "score": 0.9054926122937884 - }, - "High School Physics - Observed inference time (s)": { - "description": "min=1.252, mean=1.252, max=1.252, sum=2.503 (2)", - "tab": "Efficiency", - "score": 1.2517439276966829 - }, - "High School Psychology - Observed inference time (s)": { - "description": "min=0.954, mean=0.954, max=0.954, sum=1.909 (2)", - "tab": "Efficiency", - "score": 0.9543260762450891 - }, - "High School Statistics - Observed inference time (s)": { - "description": "min=1.329, mean=1.329, max=1.329, sum=2.657 (2)", - "tab": "Efficiency", - "score": 1.3287169370386336 - }, - "High School US History - Observed inference time (s)": { - "description": "min=2.056, mean=2.056, max=2.056, sum=4.112 (2)", - "tab": "Efficiency", - "score": 2.0560385222528494 - }, - "High School World History - Observed inference time (s)": { - "description": "min=1.276, mean=1.276, max=1.276, sum=2.553 (2)", - "tab": "Efficiency", - "score": 1.2764891250224053 - }, - "High School Biology - # eval": { - "description": "min=310, mean=310, max=310, sum=620 (2)", - "tab": "General information", - "score": 310.0 - }, - "High School Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Biology - # prompt tokens": { - "description": "min=501.255, mean=501.255, max=501.255, sum=1002.51 (2)", - "tab": "General information", - "score": 501.2548387096774 - }, - "High School Biology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # eval": { - "description": "min=203, mean=203, max=203, sum=406 (2)", - "tab": "General information", - "score": 203.0 - }, - "High School Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # prompt tokens": { - "description": "min=515.473, mean=515.473, max=515.473, sum=1030.946 (2)", - "tab": "General information", - "score": 515.4729064039409 - }, - "High School Chemistry - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "High School Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # prompt tokens": { - "description": "min=954.08, mean=954.08, max=954.08, sum=1908.16 (2)", - "tab": "General information", - "score": 954.08 - }, - "High School Computer Science - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - 
"score": 0.0 - }, - "High School European History - # eval": { - "description": "min=165, mean=165, max=165, sum=330 (2)", - "tab": "General information", - "score": 165.0 - }, - "High School European History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School European History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School European History - # prompt tokens": { - "description": "min=2823.23, mean=2823.23, max=2823.23, sum=5646.461 (2)", - "tab": "General information", - "score": 2823.230303030303 - }, - "High School European History - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Geography - # eval": { - "description": "min=198, mean=198, max=198, sum=396 (2)", - "tab": "General information", - "score": 198.0 - }, - "High School Geography - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Geography - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Geography - # prompt tokens": { - "description": "min=392.939, mean=392.939, max=392.939, sum=785.879 (2)", - "tab": "General information", - "score": 392.93939393939394 - }, - "High School Geography - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # eval": { - "description": "min=193, mean=193, max=193, sum=386 (2)", - "tab": "General information", - "score": 193.0 - }, - "High School Government And Politics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Government And Politics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # prompt tokens": { - "description": "min=475.44, mean=475.44, max=475.44, sum=950.881 (2)", - "tab": "General information", - "score": 475.440414507772 - }, - "High School Government And Politics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - # eval": { - "description": "min=390, mean=390, max=390, sum=780 (2)", - "tab": "General information", - "score": 390.0 - }, - "High School Macroeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Macroeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - # prompt tokens": { - "description": "min=395.962, mean=395.962, max=395.962, sum=791.923 (2)", - "tab": "General information", - "score": 395.96153846153845 - }, - "High School Macroeconomics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # eval": { - "description": "min=270, mean=270, max=270, sum=540 (2)", - "tab": "General information", - "score": 270.0 - }, - "High School Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - 
"score": 5.0 - }, - "High School Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # prompt tokens": { - "description": "min=580.393, mean=580.393, max=580.393, sum=1160.785 (2)", - "tab": "General information", - "score": 580.3925925925926 - }, - "High School Mathematics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Microeconomics - # eval": { - "description": "min=238, mean=238, max=238, sum=476 (2)", - "tab": "General information", - "score": 238.0 - }, - "High School Microeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Microeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Microeconomics - # prompt tokens": { - "description": "min=414.361, mean=414.361, max=414.361, sum=828.723 (2)", - "tab": "General information", - "score": 414.3613445378151 - }, - "High School Microeconomics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # eval": { - "description": "min=151, mean=151, max=151, sum=302 (2)", - "tab": "General information", - "score": 151.0 - }, - "High School Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # prompt tokens": { - "description": "min=592.252, mean=592.252, max=592.252, sum=1184.503 (2)", - "tab": "General information", - "score": 592.2516556291391 - }, - "High School Physics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # eval": { - "description": "min=545, mean=545, max=545, sum=1090 (2)", - "tab": "General information", - "score": 545.0 - }, - "High School Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # prompt tokens": { - "description": "min=496.51, mean=496.51, max=496.51, sum=993.02 (2)", - "tab": "General information", - "score": 496.5100917431193 - }, - "High School Psychology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Statistics - # eval": { - "description": "min=216, mean=216, max=216, sum=432 (2)", - "tab": "General information", - "score": 216.0 - }, - "High School Statistics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Statistics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Statistics - # prompt tokens": { - "description": "min=860.532, mean=860.532, max=860.532, sum=1721.065 (2)", - "tab": "General information", - "score": 860.5324074074074 - }, - "High School Statistics - # output tokens": { 
- "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # eval": { - "description": "min=204, mean=204, max=204, sum=408 (2)", - "tab": "General information", - "score": 204.0 - }, - "High School US History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School US History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # prompt tokens": { - "description": "min=2239.544, mean=2239.544, max=2239.544, sum=4479.088 (2)", - "tab": "General information", - "score": 2239.544117647059 - }, - "High School US History - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School World History - # eval": { - "description": "min=237, mean=237, max=237, sum=474 (2)", - "tab": "General information", - "score": 237.0 - }, - "High School World History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School World History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School World History - # prompt tokens": { - "description": "min=1437.051, mean=1437.051, max=1437.051, sum=2874.101 (2)", - "tab": "General information", - "score": 1437.0506329113923 - }, - "High School World History - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" - } - } - }, - { - "evaluation_name": "Human Sexuality", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Human Sexuality", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.924, - "details": { - "description": "min=0.924, mean=0.924, max=0.924, sum=1.847 (2)", - "tab": "Accuracy", - "Human Aging - Observed inference time (s)": { - "description": "min=0.884, mean=0.884, max=0.884, sum=1.768 (2)", - "tab": "Efficiency", - "score": 0.8839223662833996 - }, - "Human Sexuality - Observed inference time (s)": { - "description": "min=1.095, mean=1.095, max=1.095, sum=2.191 (2)", - "tab": "Efficiency", - "score": 1.0953879956980699 - }, - "Human Aging - # eval": { - "description": "min=223, mean=223, max=223, sum=446 (2)", - "tab": "General information", - "score": 223.0 - }, - "Human Aging - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Aging - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Aging - # prompt tokens": { - "description": "min=323.906, mean=323.906, max=323.906, sum=647.812 (2)", - "tab": "General information", - "score": 323.90582959641256 - }, - "Human Aging - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General 
information", - "score": 0.0 - }, - "Human Sexuality - # eval": { - "description": "min=131, mean=131, max=131, sum=262 (2)", - "tab": "General information", - "score": 131.0 - }, - "Human Sexuality - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Sexuality - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # prompt tokens": { - "description": "min=338.74, mean=338.74, max=338.74, sum=677.481 (2)", - "tab": "General information", - "score": 338.74045801526717 - }, - "Human Sexuality - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" - } - } - }, - { - "evaluation_name": "International Law", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on International Law", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.909, - "details": { - "description": "min=0.909, mean=0.909, max=0.909, sum=1.818 (2)", - "tab": "Accuracy", - "International Law - Observed inference time (s)": { - "description": "min=1.104, mean=1.104, max=1.104, sum=2.208 (2)", - "tab": "Efficiency", - "score": 1.1039516984923812 - }, - "International Law - # eval": { - "description": "min=121, mean=121, max=121, sum=242 (2)", - "tab": "General information", - "score": 121.0 - }, - "International Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "International Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "International Law - # prompt tokens": { - "description": "min=651.686, mean=651.686, max=651.686, sum=1303.372 (2)", - "tab": "General information", - "score": 651.6859504132232 - }, - "International Law - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" - } - } - }, - { - "evaluation_name": "Logical Fallacies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Logical Fallacies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.877, - "details": { - "description": "min=0.877, mean=0.877, max=0.877, sum=1.755 (2)", - "tab": "Accuracy", - "Logical Fallacies - Observed inference time (s)": { - "description": "min=1.094, mean=1.094, max=1.094, sum=2.188 (2)", - "tab": "Efficiency", - "score": 1.0941538839983793 - }, - "Logical Fallacies - # eval": { - "description": "min=163, mean=163, max=163, sum=326 
(2)", - "tab": "General information", - "score": 163.0 - }, - "Logical Fallacies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Logical Fallacies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Logical Fallacies - # prompt tokens": { - "description": "min=443.969, mean=443.969, max=443.969, sum=887.939 (2)", - "tab": "General information", - "score": 443.96932515337426 - }, - "Logical Fallacies - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" - } - } - }, - { - "evaluation_name": "Machine Learning", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Machine Learning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.625, - "details": { - "description": "min=0.625, mean=0.625, max=0.625, sum=1.25 (2)", - "tab": "Accuracy", - "Machine Learning - Observed inference time (s)": { - "description": "min=1.11, mean=1.11, max=1.11, sum=2.22 (2)", - "tab": "Efficiency", - "score": 1.110024324485234 - }, - "Machine Learning - # eval": { - "description": "min=112, mean=112, max=112, sum=224 (2)", - "tab": "General information", - "score": 112.0 - }, - "Machine Learning - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Machine Learning - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Machine Learning - # prompt tokens": { - "description": "min=705.973, mean=705.973, max=705.973, sum=1411.946 (2)", - "tab": "General information", - "score": 705.9732142857143 - }, - "Machine Learning - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" - } - } - }, - { - "evaluation_name": "Management", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Management", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.903, - "details": { - "description": "min=0.903, mean=0.903, max=0.903, sum=1.806 (2)", - "tab": "Accuracy", - "Management - Observed inference time (s)": { - "description": "min=1.154, mean=1.154, max=1.154, sum=2.308 (2)", - "tab": "Efficiency", - "score": 1.153875772235463 - }, - "Management - # eval": { - "description": "min=103, mean=103, max=103, sum=206 (2)", - "tab": "General information", - "score": 103.0 - }, - "Management - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": 
"General information", - "score": 5.0 - }, - "Management - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Management - # prompt tokens": { - "description": "min=284.68, mean=284.68, max=284.68, sum=569.359 (2)", - "tab": "General information", - "score": 284.6796116504854 - }, - "Management - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" - } - } - }, - { - "evaluation_name": "Marketing", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Marketing", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.94, - "details": { - "description": "min=0.94, mean=0.94, max=0.94, sum=1.88 (2)", - "tab": "Accuracy", - "Marketing - Observed inference time (s)": { - "description": "min=1.031, mean=1.031, max=1.031, sum=2.063 (2)", - "tab": "Efficiency", - "score": 1.0312827428181965 - }, - "Marketing - # eval": { - "description": "min=234, mean=234, max=234, sum=468 (2)", - "tab": "General information", - "score": 234.0 - }, - "Marketing - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Marketing - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Marketing - # prompt tokens": { - "description": "min=428.726, mean=428.726, max=428.726, sum=857.453 (2)", - "tab": "General information", - "score": 428.7264957264957 - }, - "Marketing - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" - } - } - }, - { - "evaluation_name": "Medical Genetics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Medical Genetics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.83, - "details": { - "description": "min=0.83, mean=0.83, max=0.83, sum=1.66 (2)", - "tab": "Accuracy", - "Medical Genetics - Observed inference time (s)": { - "description": "min=1.068, mean=1.068, max=1.068, sum=2.136 (2)", - "tab": "Efficiency", - "score": 1.0681284523010255 - }, - "Medical Genetics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Medical Genetics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Medical Genetics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Medical Genetics - # prompt tokens": { - 
"description": "min=334.69, mean=334.69, max=334.69, sum=669.38 (2)", - "tab": "General information", - "score": 334.69 - }, - "Medical Genetics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" - } - } - }, - { - "evaluation_name": "Miscellaneous", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Miscellaneous", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.894, - "details": { - "description": "min=0.894, mean=0.894, max=0.894, sum=1.788 (2)", - "tab": "Accuracy", - "Miscellaneous - Observed inference time (s)": { - "description": "min=0.894, mean=0.894, max=0.894, sum=1.788 (2)", - "tab": "Efficiency", - "score": 0.8939257733818824 - }, - "Miscellaneous - # eval": { - "description": "min=783, mean=783, max=783, sum=1566 (2)", - "tab": "General information", - "score": 783.0 - }, - "Miscellaneous - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Miscellaneous - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Miscellaneous - # prompt tokens": { - "description": "min=325.215, mean=325.215, max=325.215, sum=650.429 (2)", - "tab": "General information", - "score": 325.2145593869732 - }, - "Miscellaneous - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" - } - } - }, - { - "evaluation_name": "Moral Scenarios", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Moral Scenarios", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.562, - "details": { - "description": "min=0.562, mean=0.562, max=0.562, sum=1.124 (2)", - "tab": "Accuracy", - "Moral Disputes - Observed inference time (s)": { - "description": "min=0.988, mean=0.988, max=0.988, sum=1.976 (2)", - "tab": "Efficiency", - "score": 0.9880901995421834 - }, - "Moral Scenarios - Observed inference time (s)": { - "description": "min=0.968, mean=0.968, max=0.968, sum=1.935 (2)", - "tab": "Efficiency", - "score": 0.9677273009742439 - }, - "Moral Disputes - # eval": { - "description": "min=346, mean=346, max=346, sum=692 (2)", - "tab": "General information", - "score": 346.0 - }, - "Moral Disputes - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Disputes - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Disputes - # prompt tokens": { - 
"description": "min=494.63, mean=494.63, max=494.63, sum=989.26 (2)", - "tab": "General information", - "score": 494.6300578034682 - }, - "Moral Disputes - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # eval": { - "description": "min=895, mean=895, max=895, sum=1790 (2)", - "tab": "General information", - "score": 895.0 - }, - "Moral Scenarios - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Scenarios - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # prompt tokens": { - "description": "min=687.566, mean=687.566, max=687.566, sum=1375.133 (2)", - "tab": "General information", - "score": 687.5664804469274 - }, - "Moral Scenarios - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" - } - } - }, - { - "evaluation_name": "Nutrition", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Nutrition", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.856, - "details": { - "description": "min=0.856, mean=0.856, max=0.856, sum=1.712 (2)", - "tab": "Accuracy", - "Nutrition - Observed inference time (s)": { - "description": "min=0.912, mean=0.912, max=0.912, sum=1.824 (2)", - "tab": "Efficiency", - "score": 0.9120152238147711 - }, - "Nutrition - # eval": { - "description": "min=306, mean=306, max=306, sum=612 (2)", - "tab": "General information", - "score": 306.0 - }, - "Nutrition - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Nutrition - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Nutrition - # prompt tokens": { - "description": "min=589.663, mean=589.663, max=589.663, sum=1179.327 (2)", - "tab": "General information", - "score": 589.6633986928105 - }, - "Nutrition - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" - } - } - }, - { - "evaluation_name": "Prehistory", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Prehistory", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.87, - "details": { - "description": "min=0.87, mean=0.87, max=0.87, sum=1.741 (2)", - "tab": "Accuracy", - "Prehistory - Observed inference time (s)": { - "description": "min=0.916, mean=0.916, max=0.916, sum=1.831 (2)", 
- "tab": "Efficiency", - "score": 0.9155398577819636 - }, - "Prehistory - # eval": { - "description": "min=324, mean=324, max=324, sum=648 (2)", - "tab": "General information", - "score": 324.0 - }, - "Prehistory - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Prehistory - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Prehistory - # prompt tokens": { - "description": "min=538.179, mean=538.179, max=538.179, sum=1076.358 (2)", - "tab": "General information", - "score": 538.179012345679 - }, - "Prehistory - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" - } - } - }, - { - "evaluation_name": "Public Relations", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Public Relations", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.773, - "details": { - "description": "min=0.773, mean=0.773, max=0.773, sum=1.545 (2)", - "tab": "Accuracy", - "Public Relations - Observed inference time (s)": { - "description": "min=0.79, mean=0.79, max=0.79, sum=1.579 (2)", - "tab": "Efficiency", - "score": 0.7896393559195779 - }, - "Public Relations - # eval": { - "description": "min=110, mean=110, max=110, sum=220 (2)", - "tab": "General information", - "score": 110.0 - }, - "Public Relations - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Public Relations - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Public Relations - # prompt tokens": { - "description": "min=426.982, mean=426.982, max=426.982, sum=853.964 (2)", - "tab": "General information", - "score": 426.9818181818182 - }, - "Public Relations - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" - } - } - }, - { - "evaluation_name": "Security Studies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Security Studies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.829, - "details": { - "description": "min=0.829, mean=0.829, max=0.829, sum=1.657 (2)", - "tab": "Accuracy", - "Security Studies - Observed inference time (s)": { - "description": "min=1.254, mean=1.254, max=1.254, sum=2.508 (2)", - "tab": "Efficiency", - "score": 1.2542338507516044 - }, - "Security Studies - # eval": { - "description": "min=245, mean=245, max=245, sum=490 (2)", - "tab": "General 
information", - "score": 245.0 - }, - "Security Studies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Security Studies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Security Studies - # prompt tokens": { - "description": "min=1185.8, mean=1185.8, max=1185.8, sum=2371.6 (2)", - "tab": "General information", - "score": 1185.8 - }, - "Security Studies - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" - } - } - }, - { - "evaluation_name": "Sociology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Sociology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.91, - "details": { - "description": "min=0.91, mean=0.91, max=0.91, sum=1.821 (2)", - "tab": "Accuracy", - "Sociology - Observed inference time (s)": { - "description": "min=0.84, mean=0.84, max=0.84, sum=1.681 (2)", - "tab": "Efficiency", - "score": 0.8403987184685854 - }, - "Sociology - # eval": { - "description": "min=201, mean=201, max=201, sum=402 (2)", - "tab": "General information", - "score": 201.0 - }, - "Sociology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Sociology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Sociology - # prompt tokens": { - "description": "min=459.642, mean=459.642, max=459.642, sum=919.284 (2)", - "tab": "General information", - "score": 459.64179104477614 - }, - "Sociology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" - } - } - }, - { - "evaluation_name": "Virology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Virology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.572, - "details": { - "description": "min=0.572, mean=0.572, max=0.572, sum=1.145 (2)", - "tab": "Accuracy", - "Virology - Observed inference time (s)": { - "description": "min=1.029, mean=1.029, max=1.029, sum=2.059 (2)", - "tab": "Efficiency", - "score": 1.0293473134557884 - }, - "Virology - # eval": { - "description": "min=166, mean=166, max=166, sum=332 (2)", - "tab": "General information", - "score": 166.0 - }, - "Virology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Virology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - 
"tab": "General information", - "score": 0.0 - }, - "Virology - # prompt tokens": { - "description": "min=337.06, mean=337.06, max=337.06, sum=674.12 (2)", - "tab": "General information", - "score": 337.06024096385545 - }, - "Virology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" - } - } - }, - { - "evaluation_name": "World Religions", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on World Religions", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.877, - "details": { - "description": "min=0.877, mean=0.877, max=0.877, sum=1.754 (2)", - "tab": "Accuracy", - "World Religions - Observed inference time (s)": { - "description": "min=0.963, mean=0.963, max=0.963, sum=1.926 (2)", - "tab": "Efficiency", - "score": 0.9628847495854249 - }, - "World Religions - # eval": { - "description": "min=171, mean=171, max=171, sum=342 (2)", - "tab": "General information", - "score": 171.0 - }, - "World Religions - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "World Religions - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "World Religions - # prompt tokens": { - "description": "min=270.187, mean=270.187, max=270.187, sum=540.374 (2)", - "tab": "General information", - "score": 270.187134502924 - }, - "World Religions - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" - } - } - }, - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.142, - "details": { - "tab": "Efficiency" - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_mmlu/meta/llama-2-13b/7f37161a-3f1c-4bc4-860f-8fdbf623f63e.json b/data/helm_mmlu/meta/llama-2-13b/7f37161a-3f1c-4bc4-860f-8fdbf623f63e.json deleted file mode 100644 index 453cd8b3a..000000000 --- a/data/helm_mmlu/meta/llama-2-13b/7f37161a-3f1c-4bc4-860f-8fdbf623f63e.json +++ /dev/null @@ -1,3021 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/meta_llama-2-13b/1770835937.459157", - "retrieved_timestamp": "1770835937.459157", - "source_metadata": { - "source_name": "helm_mmlu", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - 
"model_info": { - "name": "Llama 2 13B", - "id": "meta/llama-2-13b", - "developer": "meta", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "MMLU All Subjects", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU All Subjects", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.554, - "details": { - "description": "min=0.235, mean=0.554, max=0.83, sum=63.174 (114)", - "tab": "Accuracy", - "MMLU All Subjects - Observed inference time (s)": { - "description": "min=0.323, mean=0.492, max=1.697, sum=56.065 (114)", - "tab": "Efficiency", - "score": 0.49179914059061297 - }, - "MMLU All Subjects - # eval": { - "description": "min=100, mean=246.351, max=1534, sum=28084 (114)", - "tab": "General information", - "score": 246.35087719298247 - }, - "MMLU All Subjects - # train": { - "description": "min=5, mean=5, max=5, sum=570 (114)", - "tab": "General information", - "score": 5.0 - }, - "MMLU All Subjects - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (114)", - "tab": "General information", - "score": 0.0 - }, - "MMLU All Subjects - # prompt tokens": { - "description": "min=304.474, mean=706.682, max=3159.636, sum=80561.749 (114)", - "tab": "General information", - "score": 706.6820126388612 - }, - "MMLU All Subjects - # output tokens": { - "description": "min=1, mean=1, max=1, sum=114 (114)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - 
"mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] - } - } - }, - { - "evaluation_name": "Abstract Algebra", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Abstract Algebra", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.27, - "details": { - "description": "min=0.27, mean=0.27, max=0.27, sum=0.54 (2)", - "tab": "Accuracy", - "Abstract Algebra - Observed inference time (s)": { - "description": "min=0.361, mean=0.361, max=0.361, sum=0.722 (2)", - "tab": "Efficiency", - "score": 0.3610322856903076 - }, - "Abstract Algebra - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Abstract Algebra - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Abstract Algebra - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Abstract Algebra - # prompt tokens": { - "description": "min=397.65, mean=397.65, max=397.65, sum=795.3 (2)", - "tab": "General information", - "score": 397.65 - }, - "Abstract Algebra - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" - } - } - }, - { - "evaluation_name": "Anatomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Anatomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.496, - "details": { - "description": "min=0.496, mean=0.496, max=0.496, sum=0.993 (2)", - "tab": "Accuracy", - "Anatomy - Observed 
inference time (s)": { - "description": "min=0.357, mean=0.357, max=0.357, sum=0.715 (2)", - "tab": "Efficiency", - "score": 0.35744349868209274 - }, - "Anatomy - # eval": { - "description": "min=135, mean=135, max=135, sum=270 (2)", - "tab": "General information", - "score": 135.0 - }, - "Anatomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Anatomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Anatomy - # prompt tokens": { - "description": "min=409.133, mean=409.133, max=409.133, sum=818.267 (2)", - "tab": "General information", - "score": 409.1333333333333 - }, - "Anatomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" - } - } - }, - { - "evaluation_name": "College Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on College Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.235, - "details": { - "description": "min=0.235, mean=0.235, max=0.235, sum=0.471 (2)", - "tab": "Accuracy", - "College Chemistry - Observed inference time (s)": { - "description": "min=0.449, mean=0.449, max=0.449, sum=0.897 (2)", - "tab": "Efficiency", - "score": 0.44854954242706296 - }, - "College Biology - Observed inference time (s)": { - "description": "min=0.401, mean=0.401, max=0.401, sum=0.802 (2)", - "tab": "Efficiency", - "score": 0.40112912986013627 - }, - "College Computer Science - Observed inference time (s)": { - "description": "min=0.534, mean=0.534, max=0.534, sum=1.069 (2)", - "tab": "Efficiency", - "score": 0.5343992376327514 - }, - "College Mathematics - Observed inference time (s)": { - "description": "min=0.454, mean=0.454, max=0.454, sum=0.909 (2)", - "tab": "Efficiency", - "score": 0.45426050424575803 - }, - "College Medicine - Observed inference time (s)": { - "description": "min=0.452, mean=0.452, max=0.452, sum=0.905 (2)", - "tab": "Efficiency", - "score": 0.4522962446157643 - }, - "College Physics - Observed inference time (s)": { - "description": "min=0.413, mean=0.413, max=0.413, sum=0.826 (2)", - "tab": "Efficiency", - "score": 0.4130270574607101 - }, - "College Chemistry - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Chemistry - # prompt tokens": { - "description": "min=622.43, mean=622.43, max=622.43, sum=1244.86 (2)", - "tab": "General information", - "score": 622.43 - }, - "College Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Biology - # eval": { - "description": "min=144, mean=144, max=144, sum=288 (2)", - 
"tab": "General information", - "score": 144.0 - }, - "College Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # prompt tokens": { - "description": "min=553.632, mean=553.632, max=553.632, sum=1107.264 (2)", - "tab": "General information", - "score": 553.6319444444445 - }, - "College Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer Science - # prompt tokens": { - "description": "min=901.14, mean=901.14, max=901.14, sum=1802.28 (2)", - "tab": "General information", - "score": 901.14 - }, - "College Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Mathematics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # prompt tokens": { - "description": "min=646.96, mean=646.96, max=646.96, sum=1293.92 (2)", - "tab": "General information", - "score": 646.96 - }, - "College Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Medicine - # eval": { - "description": "min=173, mean=173, max=173, sum=346 (2)", - "tab": "General information", - "score": 173.0 - }, - "College Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Medicine - # prompt tokens": { - "description": "min=608.671, mean=608.671, max=608.671, sum=1217.341 (2)", - "tab": "General information", - "score": 608.6705202312139 - }, - "College Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Physics - # eval": { - "description": "min=102, mean=102, max=102, sum=204 (2)", - "tab": "General information", - "score": 102.0 - }, - "College Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Physics - # prompt tokens": { - "description": "min=551.873, mean=551.873, max=551.873, sum=1103.745 (2)", - "tab": "General information", - "score": 
551.8725490196078 - }, - "College Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" - } - } - }, - { - "evaluation_name": "Computer Security", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Computer Security", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.69, - "details": { - "description": "min=0.69, mean=0.69, max=0.69, sum=1.38 (2)", - "tab": "Accuracy", - "Computer Security - Observed inference time (s)": { - "description": "min=0.355, mean=0.355, max=0.355, sum=0.71 (2)", - "tab": "Efficiency", - "score": 0.3552073335647583 - }, - "Computer Security - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Computer Security - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Computer Security - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Computer Security - # prompt tokens": { - "description": "min=428.17, mean=428.17, max=428.17, sum=856.34 (2)", - "tab": "General information", - "score": 428.17 - }, - "Computer Security - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" - } - } - }, - { - "evaluation_name": "Econometrics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Econometrics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.307, - "details": { - "description": "min=0.307, mean=0.307, max=0.307, sum=0.614 (2)", - "tab": "Accuracy", - "Econometrics - Observed inference time (s)": { - "description": "min=0.455, mean=0.455, max=0.455, sum=0.91 (2)", - "tab": "Efficiency", - "score": 0.45517582014987346 - }, - "Econometrics - # eval": { - "description": "min=114, mean=114, max=114, sum=228 (2)", - "tab": "General information", - "score": 114.0 - }, - "Econometrics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Econometrics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Econometrics - # prompt tokens": { - "description": "min=684.675, mean=684.675, max=684.675, sum=1369.351 (2)", - "tab": "General information", - "score": 684.6754385964912 - }, - "Econometrics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } 
- }, - "generation_config": { - "additional_details": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" - } - } - }, - { - "evaluation_name": "Global Facts", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Global Facts", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.38, - "details": { - "description": "min=0.38, mean=0.38, max=0.38, sum=0.76 (2)", - "tab": "Accuracy", - "Global Facts - Observed inference time (s)": { - "description": "min=0.39, mean=0.39, max=0.39, sum=0.781 (2)", - "tab": "Efficiency", - "score": 0.3903778100013733 - }, - "Global Facts - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Global Facts - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Global Facts - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Global Facts - # prompt tokens": { - "description": "min=484.54, mean=484.54, max=484.54, sum=969.08 (2)", - "tab": "General information", - "score": 484.54 - }, - "Global Facts - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" - } - } - }, - { - "evaluation_name": "Jurisprudence", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Jurisprudence", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.704, - "details": { - "description": "min=0.704, mean=0.704, max=0.704, sum=1.407 (2)", - "tab": "Accuracy", - "Jurisprudence - Observed inference time (s)": { - "description": "min=0.359, mean=0.359, max=0.359, sum=0.718 (2)", - "tab": "Efficiency", - "score": 0.35898366460093745 - }, - "Jurisprudence - # eval": { - "description": "min=108, mean=108, max=108, sum=216 (2)", - "tab": "General information", - "score": 108.0 - }, - "Jurisprudence - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Jurisprudence - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Jurisprudence - # prompt tokens": { - "description": "min=449.898, mean=449.898, max=449.898, sum=899.796 (2)", - "tab": "General information", - "score": 449.89814814814815 - }, - "Jurisprudence - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" - } - } - }, - { - "evaluation_name": 
"Philosophy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Philosophy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.672, - "details": { - "description": "min=0.672, mean=0.672, max=0.672, sum=1.344 (2)", - "tab": "Accuracy", - "Philosophy - Observed inference time (s)": { - "description": "min=0.323, mean=0.323, max=0.323, sum=0.645 (2)", - "tab": "Efficiency", - "score": 0.3226076184361694 - }, - "Philosophy - # eval": { - "description": "min=311, mean=311, max=311, sum=622 (2)", - "tab": "General information", - "score": 311.0 - }, - "Philosophy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Philosophy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Philosophy - # prompt tokens": { - "description": "min=372.122, mean=372.122, max=372.122, sum=744.244 (2)", - "tab": "General information", - "score": 372.12218649517683 - }, - "Philosophy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" - } - } - }, - { - "evaluation_name": "Professional Psychology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Professional Psychology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.567, - "details": { - "description": "min=0.567, mean=0.567, max=0.567, sum=1.134 (2)", - "tab": "Accuracy", - "Professional Medicine - Observed inference time (s)": { - "description": "min=0.759, mean=0.759, max=0.759, sum=1.519 (2)", - "tab": "Efficiency", - "score": 0.7594411802642486 - }, - "Professional Accounting - Observed inference time (s)": { - "description": "min=0.55, mean=0.55, max=0.55, sum=1.099 (2)", - "tab": "Efficiency", - "score": 0.5495186367778914 - }, - "Professional Law - Observed inference time (s)": { - "description": "min=1.102, mean=1.102, max=1.102, sum=2.205 (2)", - "tab": "Efficiency", - "score": 1.1024409701957851 - }, - "Professional Psychology - Observed inference time (s)": { - "description": "min=0.438, mean=0.438, max=0.438, sum=0.875 (2)", - "tab": "Efficiency", - "score": 0.43751365219066346 - }, - "Professional Medicine - # eval": { - "description": "min=272, mean=272, max=272, sum=544 (2)", - "tab": "General information", - "score": 272.0 - }, - "Professional Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Medicine - # prompt tokens": { - "description": "min=1330.647, mean=1330.647, max=1330.647, sum=2661.294 (2)", - "tab": "General 
information", - "score": 1330.6470588235295 - }, - "Professional Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Accounting - # eval": { - "description": "min=282, mean=282, max=282, sum=564 (2)", - "tab": "General information", - "score": 282.0 - }, - "Professional Accounting - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Accounting - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Accounting - # prompt tokens": { - "description": "min=823.277, mean=823.277, max=823.277, sum=1646.553 (2)", - "tab": "General information", - "score": 823.2765957446809 - }, - "Professional Accounting - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Law - # eval": { - "description": "min=1534, mean=1534, max=1534, sum=3068 (2)", - "tab": "General information", - "score": 1534.0 - }, - "Professional Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # prompt tokens": { - "description": "min=1915.007, mean=1915.007, max=1915.007, sum=3830.014 (2)", - "tab": "General information", - "score": 1915.0071707953064 - }, - "Professional Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Psychology - # eval": { - "description": "min=612, mean=612, max=612, sum=1224 (2)", - "tab": "General information", - "score": 612.0 - }, - "Professional Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # prompt tokens": { - "description": "min=650.078, mean=650.078, max=650.078, sum=1300.157 (2)", - "tab": "General information", - "score": 650.0784313725491 - }, - "Professional Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" - } - } - }, - { - "evaluation_name": "Us Foreign Policy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Us Foreign Policy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.83, - "details": { - "description": "min=0.83, mean=0.83, max=0.83, sum=1.66 (2)", - "tab": "Accuracy", - "Us Foreign Policy - Observed inference time (s)": { - "description": "min=0.391, mean=0.391, max=0.391, sum=0.782 (2)", - "tab": "Efficiency", - "score": 0.3909334921836853 - }, - "Us Foreign Policy - # eval": { 
- "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Us Foreign Policy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Us Foreign Policy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Us Foreign Policy - # prompt tokens": { - "description": "min=479.81, mean=479.81, max=479.81, sum=959.62 (2)", - "tab": "General information", - "score": 479.81 - }, - "Us Foreign Policy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" - } - } - }, - { - "evaluation_name": "Astronomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Astronomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.546, - "details": { - "description": "min=0.546, mean=0.546, max=0.546, sum=1.092 (2)", - "tab": "Accuracy", - "Astronomy - Observed inference time (s)": { - "description": "min=0.472, mean=0.472, max=0.472, sum=0.945 (2)", - "tab": "Efficiency", - "score": 0.47229841351509094 - }, - "Astronomy - # eval": { - "description": "min=152, mean=152, max=152, sum=304 (2)", - "tab": "General information", - "score": 152.0 - }, - "Astronomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Astronomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Astronomy - # prompt tokens": { - "description": "min=681.079, mean=681.079, max=681.079, sum=1362.158 (2)", - "tab": "General information", - "score": 681.078947368421 - }, - "Astronomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" - } - } - }, - { - "evaluation_name": "Business Ethics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Business Ethics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.55, - "details": { - "description": "min=0.55, mean=0.55, max=0.55, sum=1.1 (2)", - "tab": "Accuracy", - "Business Ethics - Observed inference time (s)": { - "description": "min=0.476, mean=0.476, max=0.476, sum=0.952 (2)", - "tab": "Efficiency", - "score": 0.4758677792549133 - }, - "Business Ethics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Business Ethics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": 
"General information", - "score": 5.0 - }, - "Business Ethics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Business Ethics - # prompt tokens": { - "description": "min=674.44, mean=674.44, max=674.44, sum=1348.88 (2)", - "tab": "General information", - "score": 674.44 - }, - "Business Ethics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" - } - } - }, - { - "evaluation_name": "Clinical Knowledge", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Clinical Knowledge", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.592, - "details": { - "description": "min=0.592, mean=0.592, max=0.592, sum=1.185 (2)", - "tab": "Accuracy", - "Clinical Knowledge - Observed inference time (s)": { - "description": "min=0.386, mean=0.386, max=0.386, sum=0.772 (2)", - "tab": "Efficiency", - "score": 0.38589143843021034 - }, - "Clinical Knowledge - # eval": { - "description": "min=265, mean=265, max=265, sum=530 (2)", - "tab": "General information", - "score": 265.0 - }, - "Clinical Knowledge - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Clinical Knowledge - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Clinical Knowledge - # prompt tokens": { - "description": "min=487.374, mean=487.374, max=487.374, sum=974.747 (2)", - "tab": "General information", - "score": 487.3735849056604 - }, - "Clinical Knowledge - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" - } - } - }, - { - "evaluation_name": "Conceptual Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Conceptual Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.413, - "details": { - "description": "min=0.413, mean=0.413, max=0.413, sum=0.826 (2)", - "tab": "Accuracy", - "Conceptual Physics - Observed inference time (s)": { - "description": "min=0.48, mean=0.48, max=0.48, sum=0.961 (2)", - "tab": "Efficiency", - "score": 0.4802838366082374 - }, - "Conceptual Physics - # eval": { - "description": "min=235, mean=235, max=235, sum=470 (2)", - "tab": "General information", - "score": 235.0 - }, - "Conceptual Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Conceptual Physics - truncated": { - "description": "min=0, mean=0, 
max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Conceptual Physics - # prompt tokens": { - "description": "min=333.153, mean=333.153, max=333.153, sum=666.306 (2)", - "tab": "General information", - "score": 333.1531914893617 - }, - "Conceptual Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" - } - } - }, - { - "evaluation_name": "Electrical Engineering", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Electrical Engineering", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.49, - "details": { - "description": "min=0.49, mean=0.49, max=0.49, sum=0.979 (2)", - "tab": "Accuracy", - "Electrical Engineering - Observed inference time (s)": { - "description": "min=0.368, mean=0.368, max=0.368, sum=0.737 (2)", - "tab": "Efficiency", - "score": 0.36833986249463313 - }, - "Electrical Engineering - # eval": { - "description": "min=145, mean=145, max=145, sum=290 (2)", - "tab": "General information", - "score": 145.0 - }, - "Electrical Engineering - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Electrical Engineering - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Electrical Engineering - # prompt tokens": { - "description": "min=497.779, mean=497.779, max=497.779, sum=995.559 (2)", - "tab": "General information", - "score": 497.7793103448276 - }, - "Electrical Engineering - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" - } - } - }, - { - "evaluation_name": "Elementary Mathematics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Elementary Mathematics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.307, - "details": { - "description": "min=0.307, mean=0.307, max=0.307, sum=0.614 (2)", - "tab": "Accuracy", - "Elementary Mathematics - Observed inference time (s)": { - "description": "min=0.497, mean=0.497, max=0.497, sum=0.995 (2)", - "tab": "Efficiency", - "score": 0.49746112028757733 - }, - "Elementary Mathematics - # eval": { - "description": "min=378, mean=378, max=378, sum=756 (2)", - "tab": "General information", - "score": 378.0 - }, - "Elementary Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Elementary Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - 
"tab": "General information", - "score": 0.0 - }, - "Elementary Mathematics - # prompt tokens": { - "description": "min=609.156, mean=609.156, max=609.156, sum=1218.312 (2)", - "tab": "General information", - "score": 609.1560846560847 - }, - "Elementary Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" - } - } - }, - { - "evaluation_name": "Formal Logic", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Formal Logic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.381, - "details": { - "description": "min=0.381, mean=0.381, max=0.381, sum=0.762 (2)", - "tab": "Accuracy", - "Formal Logic - Observed inference time (s)": { - "description": "min=0.444, mean=0.444, max=0.444, sum=0.887 (2)", - "tab": "Efficiency", - "score": 0.4436971952044775 - }, - "Formal Logic - # eval": { - "description": "min=126, mean=126, max=126, sum=252 (2)", - "tab": "General information", - "score": 126.0 - }, - "Formal Logic - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Formal Logic - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Formal Logic - # prompt tokens": { - "description": "min=691.81, mean=691.81, max=691.81, sum=1383.619 (2)", - "tab": "General information", - "score": 691.8095238095239 - }, - "Formal Logic - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" - } - } - }, - { - "evaluation_name": "High School World History", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on High School World History", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.705, - "details": { - "description": "min=0.705, mean=0.705, max=0.705, sum=1.409 (2)", - "tab": "Accuracy", - "High School Biology - Observed inference time (s)": { - "description": "min=0.437, mean=0.437, max=0.437, sum=0.873 (2)", - "tab": "Efficiency", - "score": 0.43674747020967547 - }, - "High School Chemistry - Observed inference time (s)": { - "description": "min=0.423, mean=0.423, max=0.423, sum=0.846 (2)", - "tab": "Efficiency", - "score": 0.42318584883741556 - }, - "High School Computer Science - Observed inference time (s)": { - "description": "min=0.567, mean=0.567, max=0.567, sum=1.133 (2)", - "tab": "Efficiency", - "score": 0.5666733002662658 - }, - "High School European History - Observed inference time (s)": { - "description": "min=1.697, mean=1.697, max=1.697, 
sum=3.394 (2)", - "tab": "Efficiency", - "score": 1.6971724553541703 - }, - "High School Geography - Observed inference time (s)": { - "description": "min=0.532, mean=0.532, max=0.532, sum=1.065 (2)", - "tab": "Efficiency", - "score": 0.5323956747247716 - }, - "High School Government And Politics - Observed inference time (s)": { - "description": "min=0.368, mean=0.368, max=0.368, sum=0.735 (2)", - "tab": "Efficiency", - "score": 0.36752033727774347 - }, - "High School Macroeconomics - Observed inference time (s)": { - "description": "min=0.378, mean=0.378, max=0.378, sum=0.756 (2)", - "tab": "Efficiency", - "score": 0.3781696270673703 - }, - "High School Mathematics - Observed inference time (s)": { - "description": "min=0.402, mean=0.402, max=0.402, sum=0.803 (2)", - "tab": "Efficiency", - "score": 0.4017471119209572 - }, - "High School Microeconomics - Observed inference time (s)": { - "description": "min=0.36, mean=0.36, max=0.36, sum=0.721 (2)", - "tab": "Efficiency", - "score": 0.3603636326910067 - }, - "High School Physics - Observed inference time (s)": { - "description": "min=0.429, mean=0.429, max=0.429, sum=0.858 (2)", - "tab": "Efficiency", - "score": 0.4290682780032126 - }, - "High School Psychology - Observed inference time (s)": { - "description": "min=0.423, mean=0.423, max=0.423, sum=0.846 (2)", - "tab": "Efficiency", - "score": 0.42302281703424016 - }, - "High School Statistics - Observed inference time (s)": { - "description": "min=0.535, mean=0.535, max=0.535, sum=1.069 (2)", - "tab": "Efficiency", - "score": 0.534513204186051 - }, - "High School US History - Observed inference time (s)": { - "description": "min=1.332, mean=1.332, max=1.332, sum=2.665 (2)", - "tab": "Efficiency", - "score": 1.33243932910994 - }, - "High School World History - Observed inference time (s)": { - "description": "min=0.866, mean=0.866, max=0.866, sum=1.733 (2)", - "tab": "Efficiency", - "score": 0.8663106930406788 - }, - "High School Biology - # eval": { - "description": "min=310, mean=310, max=310, sum=620 (2)", - "tab": "General information", - "score": 310.0 - }, - "High School Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Biology - # prompt tokens": { - "description": "min=596.894, mean=596.894, max=596.894, sum=1193.787 (2)", - "tab": "General information", - "score": 596.8935483870968 - }, - "High School Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Chemistry - # eval": { - "description": "min=203, mean=203, max=203, sum=406 (2)", - "tab": "General information", - "score": 203.0 - }, - "High School Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # prompt tokens": { - "description": "min=568.665, mean=568.665, max=568.665, sum=1137.33 (2)", - "tab": "General information", - "score": 568.6650246305419 - }, - "High School Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Computer Science - # eval": { - 
"description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "High School Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # prompt tokens": { - "description": "min=988.57, mean=988.57, max=988.57, sum=1977.14 (2)", - "tab": "General information", - "score": 988.57 - }, - "High School Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School European History - # eval": { - "description": "min=165, mean=165, max=165, sum=330 (2)", - "tab": "General information", - "score": 165.0 - }, - "High School European History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School European History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School European History - # prompt tokens": { - "description": "min=3159.636, mean=3159.636, max=3159.636, sum=6319.273 (2)", - "tab": "General information", - "score": 3159.6363636363635 - }, - "High School European History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Geography - # eval": { - "description": "min=198, mean=198, max=198, sum=396 (2)", - "tab": "General information", - "score": 198.0 - }, - "High School Geography - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Geography - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Geography - # prompt tokens": { - "description": "min=436.657, mean=436.657, max=436.657, sum=873.313 (2)", - "tab": "General information", - "score": 436.65656565656565 - }, - "High School Geography - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Government And Politics - # eval": { - "description": "min=193, mean=193, max=193, sum=386 (2)", - "tab": "General information", - "score": 193.0 - }, - "High School Government And Politics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Government And Politics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # prompt tokens": { - "description": "min=527.927, mean=527.927, max=527.927, sum=1055.855 (2)", - "tab": "General information", - "score": 527.9274611398964 - }, - "High School Government And Politics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Macroeconomics - # eval": { - "description": "min=390, mean=390, max=390, sum=780 (2)", - "tab": "General information", - "score": 390.0 - }, - "High School Macroeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Macroeconomics - 
truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - # prompt tokens": { - "description": "min=445.662, mean=445.662, max=445.662, sum=891.323 (2)", - "tab": "General information", - "score": 445.66153846153844 - }, - "High School Macroeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Mathematics - # eval": { - "description": "min=270, mean=270, max=270, sum=540 (2)", - "tab": "General information", - "score": 270.0 - }, - "High School Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # prompt tokens": { - "description": "min=579.181, mean=579.181, max=579.181, sum=1158.363 (2)", - "tab": "General information", - "score": 579.1814814814815 - }, - "High School Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Microeconomics - # eval": { - "description": "min=238, mean=238, max=238, sum=476 (2)", - "tab": "General information", - "score": 238.0 - }, - "High School Microeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Microeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Microeconomics - # prompt tokens": { - "description": "min=449.492, mean=449.492, max=449.492, sum=898.983 (2)", - "tab": "General information", - "score": 449.49159663865544 - }, - "High School Microeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Physics - # eval": { - "description": "min=151, mean=151, max=151, sum=302 (2)", - "tab": "General information", - "score": 151.0 - }, - "High School Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # prompt tokens": { - "description": "min=621.788, mean=621.788, max=621.788, sum=1243.576 (2)", - "tab": "General information", - "score": 621.7880794701987 - }, - "High School Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Psychology - # eval": { - "description": "min=545, mean=545, max=545, sum=1090 (2)", - "tab": "General information", - "score": 545.0 - }, - "High School Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # prompt tokens": { - "description": "min=585.919, mean=585.919, max=585.919, sum=1171.839 (2)", - "tab": "General information", - "score": 585.9192660550459 - }, - "High School Psychology - # output tokens": { - "description": "min=1, 
mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Statistics - # eval": { - "description": "min=216, mean=216, max=216, sum=432 (2)", - "tab": "General information", - "score": 216.0 - }, - "High School Statistics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Statistics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Statistics - # prompt tokens": { - "description": "min=908.208, mean=908.208, max=908.208, sum=1816.417 (2)", - "tab": "General information", - "score": 908.2083333333334 - }, - "High School Statistics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School US History - # eval": { - "description": "min=204, mean=204, max=204, sum=408 (2)", - "tab": "General information", - "score": 204.0 - }, - "High School US History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School US History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # prompt tokens": { - "description": "min=2535.324, mean=2535.324, max=2535.324, sum=5070.647 (2)", - "tab": "General information", - "score": 2535.323529411765 - }, - "High School US History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School World History - # eval": { - "description": "min=237, mean=237, max=237, sum=474 (2)", - "tab": "General information", - "score": 237.0 - }, - "High School World History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School World History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School World History - # prompt tokens": { - "description": "min=1638.219, mean=1638.219, max=1638.219, sum=3276.439 (2)", - "tab": "General information", - "score": 1638.2194092827003 - }, - "High School World History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" - } - } - }, - { - "evaluation_name": "Human Sexuality", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Human Sexuality", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.618, - "details": { - "description": "min=0.618, mean=0.618, max=0.618, sum=1.237 (2)", - "tab": "Accuracy", - "Human Aging - Observed inference time (s)": { - "description": "min=0.473, mean=0.473, max=0.473, sum=0.947 (2)", - "tab": "Efficiency", - "score": 0.47327254385157014 - }, - "Human Sexuality - Observed inference time (s)": { - "description": "min=0.368, mean=0.368, max=0.368, sum=0.737 (2)", - 
"tab": "Efficiency", - "score": 0.3683396113737849 - }, - "Human Aging - # eval": { - "description": "min=223, mean=223, max=223, sum=446 (2)", - "tab": "General information", - "score": 223.0 - }, - "Human Aging - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Aging - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Aging - # prompt tokens": { - "description": "min=361.26, mean=361.26, max=361.26, sum=722.52 (2)", - "tab": "General information", - "score": 361.26008968609864 - }, - "Human Aging - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Human Sexuality - # eval": { - "description": "min=131, mean=131, max=131, sum=262 (2)", - "tab": "General information", - "score": 131.0 - }, - "Human Sexuality - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Sexuality - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # prompt tokens": { - "description": "min=403.382, mean=403.382, max=403.382, sum=806.763 (2)", - "tab": "General information", - "score": 403.381679389313 - }, - "Human Sexuality - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" - } - } - }, - { - "evaluation_name": "International Law", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on International Law", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.752, - "details": { - "description": "min=0.752, mean=0.752, max=0.752, sum=1.504 (2)", - "tab": "Accuracy", - "International Law - Observed inference time (s)": { - "description": "min=0.488, mean=0.488, max=0.488, sum=0.975 (2)", - "tab": "Efficiency", - "score": 0.48763008551164105 - }, - "International Law - # eval": { - "description": "min=121, mean=121, max=121, sum=242 (2)", - "tab": "General information", - "score": 121.0 - }, - "International Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "International Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "International Law - # prompt tokens": { - "description": "min=729.463, mean=729.463, max=729.463, sum=1458.926 (2)", - "tab": "General information", - "score": 729.4628099173553 - }, - "International Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" - } - } - }, - { - "evaluation_name": "Logical Fallacies", - "source_data": { - "dataset_name": 
"helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Logical Fallacies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.687, - "details": { - "description": "min=0.687, mean=0.687, max=0.687, sum=1.374 (2)", - "tab": "Accuracy", - "Logical Fallacies - Observed inference time (s)": { - "description": "min=0.361, mean=0.361, max=0.361, sum=0.722 (2)", - "tab": "Efficiency", - "score": 0.3607579462367333 - }, - "Logical Fallacies - # eval": { - "description": "min=163, mean=163, max=163, sum=326 (2)", - "tab": "General information", - "score": 163.0 - }, - "Logical Fallacies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Logical Fallacies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Logical Fallacies - # prompt tokens": { - "description": "min=502.755, mean=502.755, max=502.755, sum=1005.509 (2)", - "tab": "General information", - "score": 502.7546012269939 - }, - "Logical Fallacies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" - } - } - }, - { - "evaluation_name": "Machine Learning", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Machine Learning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.286, - "details": { - "description": "min=0.286, mean=0.286, max=0.286, sum=0.571 (2)", - "tab": "Accuracy", - "Machine Learning - Observed inference time (s)": { - "description": "min=0.478, mean=0.478, max=0.478, sum=0.955 (2)", - "tab": "Efficiency", - "score": 0.4776035504681723 - }, - "Machine Learning - # eval": { - "description": "min=112, mean=112, max=112, sum=224 (2)", - "tab": "General information", - "score": 112.0 - }, - "Machine Learning - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Machine Learning - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Machine Learning - # prompt tokens": { - "description": "min=730.402, mean=730.402, max=730.402, sum=1460.804 (2)", - "tab": "General information", - "score": 730.4017857142857 - }, - "Machine Learning - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" - } - } - }, - { - "evaluation_name": "Management", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Management", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.738, - "details": { - "description": "min=0.738, mean=0.738, max=0.738, sum=1.476 (2)", - "tab": "Accuracy", - "Management - Observed inference time (s)": { - "description": "min=0.343, mean=0.343, max=0.343, sum=0.686 (2)", - "tab": "Efficiency", - "score": 0.34303417715054113 - }, - "Management - # eval": { - "description": "min=103, mean=103, max=103, sum=206 (2)", - "tab": "General information", - "score": 103.0 - }, - "Management - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Management - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Management - # prompt tokens": { - "description": "min=315.777, mean=315.777, max=315.777, sum=631.553 (2)", - "tab": "General information", - "score": 315.77669902912623 - }, - "Management - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" - } - } - }, - { - "evaluation_name": "Marketing", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Marketing", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.786, - "details": { - "description": "min=0.786, mean=0.786, max=0.786, sum=1.573 (2)", - "tab": "Accuracy", - "Marketing - Observed inference time (s)": { - "description": "min=0.374, mean=0.374, max=0.374, sum=0.749 (2)", - "tab": "Efficiency", - "score": 0.37440858845017916 - }, - "Marketing - # eval": { - "description": "min=234, mean=234, max=234, sum=468 (2)", - "tab": "General information", - "score": 234.0 - }, - "Marketing - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Marketing - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Marketing - # prompt tokens": { - "description": "min=472.628, mean=472.628, max=472.628, sum=945.256 (2)", - "tab": "General information", - "score": 472.62820512820514 - }, - "Marketing - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" - } - } - }, - { - "evaluation_name": "Medical Genetics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Medical Genetics", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.57, - "details": { - "description": "min=0.57, mean=0.57, max=0.57, sum=1.14 (2)", - "tab": "Accuracy", - "Medical Genetics - Observed inference time (s)": { - "description": "min=0.365, mean=0.365, max=0.365, sum=0.73 (2)", - "tab": "Efficiency", - "score": 0.3651238298416138 - }, - "Medical Genetics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Medical Genetics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Medical Genetics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Medical Genetics - # prompt tokens": { - "description": "min=408.14, mean=408.14, max=408.14, sum=816.28 (2)", - "tab": "General information", - "score": 408.14 - }, - "Medical Genetics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" - } - } - }, - { - "evaluation_name": "Miscellaneous", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Miscellaneous", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.748, - "details": { - "description": "min=0.748, mean=0.748, max=0.748, sum=1.497 (2)", - "tab": "Accuracy", - "Miscellaneous - Observed inference time (s)": { - "description": "min=0.342, mean=0.342, max=0.342, sum=0.684 (2)", - "tab": "Efficiency", - "score": 0.34193715342768916 - }, - "Miscellaneous - # eval": { - "description": "min=783, mean=783, max=783, sum=1566 (2)", - "tab": "General information", - "score": 783.0 - }, - "Miscellaneous - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Miscellaneous - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Miscellaneous - # prompt tokens": { - "description": "min=345.913, mean=345.913, max=345.913, sum=691.826 (2)", - "tab": "General information", - "score": 345.9131545338442 - }, - "Miscellaneous - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" - } - } - }, - { - "evaluation_name": "Moral Scenarios", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Moral Scenarios", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.407, - "details": { - "description": "min=0.407, mean=0.407, max=0.407, sum=0.813 
(2)", - "tab": "Accuracy", - "Moral Disputes - Observed inference time (s)": { - "description": "min=0.421, mean=0.421, max=0.421, sum=0.841 (2)", - "tab": "Efficiency", - "score": 0.4205500893510146 - }, - "Moral Scenarios - Observed inference time (s)": { - "description": "min=0.474, mean=0.474, max=0.474, sum=0.949 (2)", - "tab": "Efficiency", - "score": 0.4744861464260677 - }, - "Moral Disputes - # eval": { - "description": "min=346, mean=346, max=346, sum=692 (2)", - "tab": "General information", - "score": 346.0 - }, - "Moral Disputes - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Disputes - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Disputes - # prompt tokens": { - "description": "min=542.506, mean=542.506, max=542.506, sum=1085.012 (2)", - "tab": "General information", - "score": 542.5057803468208 - }, - "Moral Disputes - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Moral Scenarios - # eval": { - "description": "min=895, mean=895, max=895, sum=1790 (2)", - "tab": "General information", - "score": 895.0 - }, - "Moral Scenarios - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Scenarios - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # prompt tokens": { - "description": "min=756.479, mean=756.479, max=756.479, sum=1512.959 (2)", - "tab": "General information", - "score": 756.4793296089385 - }, - "Moral Scenarios - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" - } - } - }, - { - "evaluation_name": "Nutrition", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Nutrition", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.627, - "details": { - "description": "min=0.627, mean=0.627, max=0.627, sum=1.255 (2)", - "tab": "Accuracy", - "Nutrition - Observed inference time (s)": { - "description": "min=0.453, mean=0.453, max=0.453, sum=0.906 (2)", - "tab": "Efficiency", - "score": 0.4530853640799429 - }, - "Nutrition - # eval": { - "description": "min=306, mean=306, max=306, sum=612 (2)", - "tab": "General information", - "score": 306.0 - }, - "Nutrition - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Nutrition - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Nutrition - # prompt tokens": { - "description": "min=695.922, mean=695.922, max=695.922, sum=1391.843 (2)", - "tab": "General information", - "score": 695.9215686274509 - }, - "Nutrition - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - 
"generation_config": { - "additional_details": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" - } - } - }, - { - "evaluation_name": "Prehistory", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Prehistory", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.654, - "details": { - "description": "min=0.654, mean=0.654, max=0.654, sum=1.309 (2)", - "tab": "Accuracy", - "Prehistory - Observed inference time (s)": { - "description": "min=0.445, mean=0.445, max=0.445, sum=0.889 (2)", - "tab": "Efficiency", - "score": 0.44473813345402846 - }, - "Prehistory - # eval": { - "description": "min=324, mean=324, max=324, sum=648 (2)", - "tab": "General information", - "score": 324.0 - }, - "Prehistory - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Prehistory - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Prehistory - # prompt tokens": { - "description": "min=619.185, mean=619.185, max=619.185, sum=1238.37 (2)", - "tab": "General information", - "score": 619.1851851851852 - }, - "Prehistory - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" - } - } - }, - { - "evaluation_name": "Public Relations", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Public Relations", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6, - "details": { - "description": "min=0.6, mean=0.6, max=0.6, sum=1.2 (2)", - "tab": "Accuracy", - "Public Relations - Observed inference time (s)": { - "description": "min=0.387, mean=0.387, max=0.387, sum=0.774 (2)", - "tab": "Efficiency", - "score": 0.38679331866177646 - }, - "Public Relations - # eval": { - "description": "min=110, mean=110, max=110, sum=220 (2)", - "tab": "General information", - "score": 110.0 - }, - "Public Relations - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Public Relations - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Public Relations - # prompt tokens": { - "description": "min=474.827, mean=474.827, max=474.827, sum=949.655 (2)", - "tab": "General information", - "score": 474.8272727272727 - }, - "Public Relations - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" - } - } - }, - { - 
"evaluation_name": "Security Studies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Security Studies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.608, - "details": { - "description": "min=0.608, mean=0.608, max=0.608, sum=1.216 (2)", - "tab": "Accuracy", - "Security Studies - Observed inference time (s)": { - "description": "min=0.771, mean=0.771, max=0.771, sum=1.542 (2)", - "tab": "Efficiency", - "score": 0.7707553902450873 - }, - "Security Studies - # eval": { - "description": "min=245, mean=245, max=245, sum=490 (2)", - "tab": "General information", - "score": 245.0 - }, - "Security Studies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Security Studies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Security Studies - # prompt tokens": { - "description": "min=1377.531, mean=1377.531, max=1377.531, sum=2755.061 (2)", - "tab": "General information", - "score": 1377.530612244898 - }, - "Security Studies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" - } - } - }, - { - "evaluation_name": "Sociology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Sociology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.761, - "details": { - "description": "min=0.761, mean=0.761, max=0.761, sum=1.522 (2)", - "tab": "Accuracy", - "Sociology - Observed inference time (s)": { - "description": "min=0.385, mean=0.385, max=0.385, sum=0.77 (2)", - "tab": "Efficiency", - "score": 0.38491436853930727 - }, - "Sociology - # eval": { - "description": "min=201, mean=201, max=201, sum=402 (2)", - "tab": "General information", - "score": 201.0 - }, - "Sociology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Sociology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Sociology - # prompt tokens": { - "description": "min=508.478, mean=508.478, max=508.478, sum=1016.955 (2)", - "tab": "General information", - "score": 508.4776119402985 - }, - "Sociology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" - } - } - }, - { - "evaluation_name": "Virology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Virology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.476, - "details": { - "description": "min=0.476, mean=0.476, max=0.476, sum=0.952 (2)", - "tab": "Accuracy", - "Virology - Observed inference time (s)": { - "description": "min=0.35, mean=0.35, max=0.35, sum=0.7 (2)", - "tab": "Efficiency", - "score": 0.3499309801193605 - }, - "Virology - # eval": { - "description": "min=166, mean=166, max=166, sum=332 (2)", - "tab": "General information", - "score": 166.0 - }, - "Virology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Virology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Virology - # prompt tokens": { - "description": "min=405.108, mean=405.108, max=405.108, sum=810.217 (2)", - "tab": "General information", - "score": 405.10843373493975 - }, - "Virology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" - } - } - }, - { - "evaluation_name": "World Religions", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on World Religions", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.76, - "details": { - "description": "min=0.76, mean=0.76, max=0.76, sum=1.52 (2)", - "tab": "Accuracy", - "World Religions - Observed inference time (s)": { - "description": "min=0.338, mean=0.338, max=0.338, sum=0.675 (2)", - "tab": "Efficiency", - "score": 0.33768263197781745 - }, - "World Religions - # eval": { - "description": "min=171, mean=171, max=171, sum=342 (2)", - "tab": "General information", - "score": 171.0 - }, - "World Religions - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "World Religions - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "World Religions - # prompt tokens": { - "description": "min=304.474, mean=304.474, max=304.474, sum=608.947 (2)", - "tab": "General information", - "score": 304.4736842105263 - }, - "World Religions - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" - } - } - }, - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on 
average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.502, - "details": { - "tab": "Efficiency" - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_mmlu/meta/llama-2-70b/9da7439c-e96b-444f-b4fa-7ef638080740.json b/data/helm_mmlu/meta/llama-2-70b/9da7439c-e96b-444f-b4fa-7ef638080740.json deleted file mode 100644 index aa6a9caa2..000000000 --- a/data/helm_mmlu/meta/llama-2-70b/9da7439c-e96b-444f-b4fa-7ef638080740.json +++ /dev/null @@ -1,3021 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/meta_llama-2-70b/1770835937.459157", - "retrieved_timestamp": "1770835937.459157", - "source_metadata": { - "source_name": "helm_mmlu", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama 2 70B", - "id": "meta/llama-2-70b", - "developer": "meta", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "MMLU All Subjects", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU All Subjects", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.695, - "details": { - "description": "min=0.31, mean=0.695, max=0.933, sum=79.283 (114)", - "tab": "Accuracy", - "MMLU All Subjects - Observed inference time (s)": { - "description": "min=0.314, mean=0.466, max=0.981, sum=53.164 (114)", - "tab": "Efficiency", - "score": 0.46634649940337786 - }, - "MMLU All Subjects - # eval": { - "description": "min=100, mean=246.351, max=1534, sum=28084 (114)", - "tab": "General information", - "score": 246.35087719298247 - }, - "MMLU All Subjects - # train": { - "description": "min=5, mean=5, max=5, sum=570 (114)", - "tab": "General information", - "score": 5.0 - }, - "MMLU All Subjects - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (114)", - "tab": "General information", - "score": 0.0 - }, - "MMLU All Subjects - # prompt tokens": { - "description": "min=304.474, mean=706.682, max=3159.636, sum=80561.749 (114)", - "tab": "General information", - "score": 706.6820126388612 - }, - "MMLU All Subjects - # output tokens": { - "description": "min=1, mean=1, max=1, sum=114 (114)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - 
"high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] - } - } - }, - { - "evaluation_name": "Abstract Algebra", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Abstract Algebra", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.31, - "details": { - "description": "min=0.31, mean=0.31, max=0.31, sum=0.62 (2)", - "tab": "Accuracy", - "Abstract Algebra - Observed inference time (s)": { - "description": "min=0.36, mean=0.36, max=0.36, sum=0.72 (2)", - "tab": "Efficiency", - "score": 0.3601346731185913 - }, - "Abstract Algebra - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Abstract Algebra - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Abstract Algebra - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Abstract Algebra - # prompt tokens": { - "description": "min=397.65, mean=397.65, max=397.65, sum=795.3 (2)", - "tab": "General 
information", - "score": 397.65 - }, - "Abstract Algebra - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" - } - } - }, - { - "evaluation_name": "Anatomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Anatomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.607, - "details": { - "description": "min=0.607, mean=0.607, max=0.607, sum=1.215 (2)", - "tab": "Accuracy", - "Anatomy - Observed inference time (s)": { - "description": "min=0.433, mean=0.433, max=0.433, sum=0.866 (2)", - "tab": "Efficiency", - "score": 0.4331345310917607 - }, - "Anatomy - # eval": { - "description": "min=135, mean=135, max=135, sum=270 (2)", - "tab": "General information", - "score": 135.0 - }, - "Anatomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Anatomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Anatomy - # prompt tokens": { - "description": "min=409.133, mean=409.133, max=409.133, sum=818.267 (2)", - "tab": "General information", - "score": 409.1333333333333 - }, - "Anatomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" - } - } - }, - { - "evaluation_name": "College Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on College Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.363, - "details": { - "description": "min=0.363, mean=0.363, max=0.363, sum=0.725 (2)", - "tab": "Accuracy", - "College Chemistry - Observed inference time (s)": { - "description": "min=0.379, mean=0.379, max=0.379, sum=0.757 (2)", - "tab": "Efficiency", - "score": 0.3786743521690369 - }, - "College Biology - Observed inference time (s)": { - "description": "min=0.387, mean=0.387, max=0.387, sum=0.773 (2)", - "tab": "Efficiency", - "score": 0.38658806019359165 - }, - "College Computer Science - Observed inference time (s)": { - "description": "min=0.444, mean=0.444, max=0.444, sum=0.888 (2)", - "tab": "Efficiency", - "score": 0.44394851446151734 - }, - "College Mathematics - Observed inference time (s)": { - "description": "min=0.71, mean=0.71, max=0.71, sum=1.42 (2)", - "tab": "Efficiency", - "score": 0.7099040699005127 - }, - "College Medicine - Observed inference time (s)": { - "description": "min=0.47, mean=0.47, max=0.47, sum=0.939 (2)", - "tab": "Efficiency", - "score": 0.4695483673514658 - }, - "College Physics - Observed inference time (s)": { - 
"description": "min=0.389, mean=0.389, max=0.389, sum=0.778 (2)", - "tab": "Efficiency", - "score": 0.3889027389825559 - }, - "College Chemistry - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Chemistry - # prompt tokens": { - "description": "min=622.43, mean=622.43, max=622.43, sum=1244.86 (2)", - "tab": "General information", - "score": 622.43 - }, - "College Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Biology - # eval": { - "description": "min=144, mean=144, max=144, sum=288 (2)", - "tab": "General information", - "score": 144.0 - }, - "College Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # prompt tokens": { - "description": "min=553.632, mean=553.632, max=553.632, sum=1107.264 (2)", - "tab": "General information", - "score": 553.6319444444445 - }, - "College Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer Science - # prompt tokens": { - "description": "min=901.14, mean=901.14, max=901.14, sum=1802.28 (2)", - "tab": "General information", - "score": 901.14 - }, - "College Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Mathematics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # prompt tokens": { - "description": "min=646.96, mean=646.96, max=646.96, sum=1293.92 (2)", - "tab": "General information", - "score": 646.96 - }, - "College Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Medicine - # eval": { - "description": "min=173, mean=173, max=173, sum=346 (2)", - "tab": "General information", - "score": 173.0 - }, - "College Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General 
information", - "score": 0.0 - }, - "College Medicine - # prompt tokens": { - "description": "min=608.671, mean=608.671, max=608.671, sum=1217.341 (2)", - "tab": "General information", - "score": 608.6705202312139 - }, - "College Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Physics - # eval": { - "description": "min=102, mean=102, max=102, sum=204 (2)", - "tab": "General information", - "score": 102.0 - }, - "College Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Physics - # prompt tokens": { - "description": "min=551.873, mean=551.873, max=551.873, sum=1103.745 (2)", - "tab": "General information", - "score": 551.8725490196078 - }, - "College Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" - } - } - }, - { - "evaluation_name": "Computer Security", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Computer Security", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.77, - "details": { - "description": "min=0.77, mean=0.77, max=0.77, sum=1.54 (2)", - "tab": "Accuracy", - "Computer Security - Observed inference time (s)": { - "description": "min=0.371, mean=0.371, max=0.371, sum=0.743 (2)", - "tab": "Efficiency", - "score": 0.3714062762260437 - }, - "Computer Security - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Computer Security - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Computer Security - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Computer Security - # prompt tokens": { - "description": "min=428.17, mean=428.17, max=428.17, sum=856.34 (2)", - "tab": "General information", - "score": 428.17 - }, - "Computer Security - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" - } - } - }, - { - "evaluation_name": "Econometrics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Econometrics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.43, - "details": { - "description": "min=0.43, mean=0.43, max=0.43, 
sum=0.86 (2)", - "tab": "Accuracy", - "Econometrics - Observed inference time (s)": { - "description": "min=0.392, mean=0.392, max=0.392, sum=0.783 (2)", - "tab": "Efficiency", - "score": 0.3916624889039157 - }, - "Econometrics - # eval": { - "description": "min=114, mean=114, max=114, sum=228 (2)", - "tab": "General information", - "score": 114.0 - }, - "Econometrics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Econometrics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Econometrics - # prompt tokens": { - "description": "min=684.675, mean=684.675, max=684.675, sum=1369.351 (2)", - "tab": "General information", - "score": 684.6754385964912 - }, - "Econometrics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" - } - } - }, - { - "evaluation_name": "Global Facts", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Global Facts", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.47, - "details": { - "description": "min=0.47, mean=0.47, max=0.47, sum=0.94 (2)", - "tab": "Accuracy", - "Global Facts - Observed inference time (s)": { - "description": "min=0.374, mean=0.374, max=0.374, sum=0.747 (2)", - "tab": "Efficiency", - "score": 0.3736806106567383 - }, - "Global Facts - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Global Facts - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Global Facts - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Global Facts - # prompt tokens": { - "description": "min=484.54, mean=484.54, max=484.54, sum=969.08 (2)", - "tab": "General information", - "score": 484.54 - }, - "Global Facts - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" - } - } - }, - { - "evaluation_name": "Jurisprudence", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Jurisprudence", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.824, - "details": { - "description": "min=0.824, mean=0.824, max=0.824, sum=1.648 (2)", - "tab": "Accuracy", - "Jurisprudence - Observed inference time (s)": { - "description": "min=0.694, mean=0.694, max=0.694, sum=1.387 (2)", - "tab": "Efficiency", - "score": 0.6937185768727903 - }, - 
"Jurisprudence - # eval": { - "description": "min=108, mean=108, max=108, sum=216 (2)", - "tab": "General information", - "score": 108.0 - }, - "Jurisprudence - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Jurisprudence - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Jurisprudence - # prompt tokens": { - "description": "min=449.898, mean=449.898, max=449.898, sum=899.796 (2)", - "tab": "General information", - "score": 449.89814814814815 - }, - "Jurisprudence - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" - } - } - }, - { - "evaluation_name": "Philosophy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Philosophy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.791, - "details": { - "description": "min=0.791, mean=0.791, max=0.791, sum=1.582 (2)", - "tab": "Accuracy", - "Philosophy - Observed inference time (s)": { - "description": "min=0.314, mean=0.314, max=0.314, sum=0.628 (2)", - "tab": "Efficiency", - "score": 0.3140420009085603 - }, - "Philosophy - # eval": { - "description": "min=311, mean=311, max=311, sum=622 (2)", - "tab": "General information", - "score": 311.0 - }, - "Philosophy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Philosophy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Philosophy - # prompt tokens": { - "description": "min=372.122, mean=372.122, max=372.122, sum=744.244 (2)", - "tab": "General information", - "score": 372.12218649517683 - }, - "Philosophy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" - } - } - }, - { - "evaluation_name": "Professional Psychology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Professional Psychology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.76, - "details": { - "description": "min=0.76, mean=0.76, max=0.76, sum=1.52 (2)", - "tab": "Accuracy", - "Professional Medicine - Observed inference time (s)": { - "description": "min=0.515, mean=0.515, max=0.515, sum=1.029 (2)", - "tab": "Efficiency", - "score": 0.5146331287482205 - }, - "Professional Accounting - Observed inference time (s)": { - "description": "min=0.387, mean=0.387, max=0.387, sum=0.774 (2)", - "tab": "Efficiency", - "score": 0.3871775383644916 - }, - 
"Professional Law - Observed inference time (s)": { - "description": "min=0.697, mean=0.697, max=0.697, sum=1.395 (2)", - "tab": "Efficiency", - "score": 0.6972876995452224 - }, - "Professional Psychology - Observed inference time (s)": { - "description": "min=0.393, mean=0.393, max=0.393, sum=0.787 (2)", - "tab": "Efficiency", - "score": 0.39348618851767647 - }, - "Professional Medicine - # eval": { - "description": "min=272, mean=272, max=272, sum=544 (2)", - "tab": "General information", - "score": 272.0 - }, - "Professional Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Medicine - # prompt tokens": { - "description": "min=1330.647, mean=1330.647, max=1330.647, sum=2661.294 (2)", - "tab": "General information", - "score": 1330.6470588235295 - }, - "Professional Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Accounting - # eval": { - "description": "min=282, mean=282, max=282, sum=564 (2)", - "tab": "General information", - "score": 282.0 - }, - "Professional Accounting - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Accounting - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Accounting - # prompt tokens": { - "description": "min=823.277, mean=823.277, max=823.277, sum=1646.553 (2)", - "tab": "General information", - "score": 823.2765957446809 - }, - "Professional Accounting - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Law - # eval": { - "description": "min=1534, mean=1534, max=1534, sum=3068 (2)", - "tab": "General information", - "score": 1534.0 - }, - "Professional Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # prompt tokens": { - "description": "min=1915.007, mean=1915.007, max=1915.007, sum=3830.014 (2)", - "tab": "General information", - "score": 1915.0071707953064 - }, - "Professional Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Psychology - # eval": { - "description": "min=612, mean=612, max=612, sum=1224 (2)", - "tab": "General information", - "score": 612.0 - }, - "Professional Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # prompt tokens": { - "description": "min=650.078, mean=650.078, max=650.078, sum=1300.157 (2)", - "tab": "General information", - "score": 650.0784313725491 - }, - "Professional Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - 
"additional_details": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" - } - } - }, - { - "evaluation_name": "Us Foreign Policy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Us Foreign Policy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.92, - "details": { - "description": "min=0.92, mean=0.92, max=0.92, sum=1.84 (2)", - "tab": "Accuracy", - "Us Foreign Policy - Observed inference time (s)": { - "description": "min=0.348, mean=0.348, max=0.348, sum=0.696 (2)", - "tab": "Efficiency", - "score": 0.3482255029678345 - }, - "Us Foreign Policy - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Us Foreign Policy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Us Foreign Policy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Us Foreign Policy - # prompt tokens": { - "description": "min=479.81, mean=479.81, max=479.81, sum=959.62 (2)", - "tab": "General information", - "score": 479.81 - }, - "Us Foreign Policy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" - } - } - }, - { - "evaluation_name": "Astronomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Astronomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.829, - "details": { - "description": "min=0.829, mean=0.829, max=0.829, sum=1.658 (2)", - "tab": "Accuracy", - "Astronomy - Observed inference time (s)": { - "description": "min=0.456, mean=0.456, max=0.456, sum=0.912 (2)", - "tab": "Efficiency", - "score": 0.45624671798003347 - }, - "Astronomy - # eval": { - "description": "min=152, mean=152, max=152, sum=304 (2)", - "tab": "General information", - "score": 152.0 - }, - "Astronomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Astronomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Astronomy - # prompt tokens": { - "description": "min=681.079, mean=681.079, max=681.079, sum=1362.158 (2)", - "tab": "General information", - "score": 681.078947368421 - }, - "Astronomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" - } - } - }, - { - "evaluation_name": 
"Business Ethics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Business Ethics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.73, - "details": { - "description": "min=0.73, mean=0.73, max=0.73, sum=1.46 (2)", - "tab": "Accuracy", - "Business Ethics - Observed inference time (s)": { - "description": "min=0.649, mean=0.649, max=0.649, sum=1.298 (2)", - "tab": "Efficiency", - "score": 0.6490170955657959 - }, - "Business Ethics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Business Ethics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Business Ethics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Business Ethics - # prompt tokens": { - "description": "min=674.44, mean=674.44, max=674.44, sum=1348.88 (2)", - "tab": "General information", - "score": 674.44 - }, - "Business Ethics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" - } - } - }, - { - "evaluation_name": "Clinical Knowledge", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Clinical Knowledge", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.717, - "details": { - "description": "min=0.717, mean=0.717, max=0.717, sum=1.434 (2)", - "tab": "Accuracy", - "Clinical Knowledge - Observed inference time (s)": { - "description": "min=0.394, mean=0.394, max=0.394, sum=0.788 (2)", - "tab": "Efficiency", - "score": 0.394086869257801 - }, - "Clinical Knowledge - # eval": { - "description": "min=265, mean=265, max=265, sum=530 (2)", - "tab": "General information", - "score": 265.0 - }, - "Clinical Knowledge - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Clinical Knowledge - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Clinical Knowledge - # prompt tokens": { - "description": "min=487.374, mean=487.374, max=487.374, sum=974.747 (2)", - "tab": "General information", - "score": 487.3735849056604 - }, - "Clinical Knowledge - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" - } - } - }, - { - "evaluation_name": "Conceptual Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Conceptual Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.668, - "details": { - "description": "min=0.668, mean=0.668, max=0.668, sum=1.336 (2)", - "tab": "Accuracy", - "Conceptual Physics - Observed inference time (s)": { - "description": "min=0.519, mean=0.519, max=0.519, sum=1.038 (2)", - "tab": "Efficiency", - "score": 0.5188552247717025 - }, - "Conceptual Physics - # eval": { - "description": "min=235, mean=235, max=235, sum=470 (2)", - "tab": "General information", - "score": 235.0 - }, - "Conceptual Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Conceptual Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Conceptual Physics - # prompt tokens": { - "description": "min=333.153, mean=333.153, max=333.153, sum=666.306 (2)", - "tab": "General information", - "score": 333.1531914893617 - }, - "Conceptual Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" - } - } - }, - { - "evaluation_name": "Electrical Engineering", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Electrical Engineering", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.634, - "details": { - "description": "min=0.634, mean=0.634, max=0.634, sum=1.269 (2)", - "tab": "Accuracy", - "Electrical Engineering - Observed inference time (s)": { - "description": "min=0.415, mean=0.415, max=0.415, sum=0.83 (2)", - "tab": "Efficiency", - "score": 0.414785334159588 - }, - "Electrical Engineering - # eval": { - "description": "min=145, mean=145, max=145, sum=290 (2)", - "tab": "General information", - "score": 145.0 - }, - "Electrical Engineering - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Electrical Engineering - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Electrical Engineering - # prompt tokens": { - "description": "min=497.779, mean=497.779, max=497.779, sum=995.559 (2)", - "tab": "General information", - "score": 497.7793103448276 - }, - "Electrical Engineering - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" - } - } - }, - { - "evaluation_name": "Elementary Mathematics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Elementary Mathematics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.421, - "details": { - "description": "min=0.421, mean=0.421, max=0.421, sum=0.841 (2)", - "tab": "Accuracy", - "Elementary Mathematics - Observed inference time (s)": { - "description": "min=0.407, mean=0.407, max=0.407, sum=0.814 (2)", - "tab": "Efficiency", - "score": 0.4069670924433955 - }, - "Elementary Mathematics - # eval": { - "description": "min=378, mean=378, max=378, sum=756 (2)", - "tab": "General information", - "score": 378.0 - }, - "Elementary Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Elementary Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Elementary Mathematics - # prompt tokens": { - "description": "min=609.156, mean=609.156, max=609.156, sum=1218.312 (2)", - "tab": "General information", - "score": 609.1560846560847 - }, - "Elementary Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" - } - } - }, - { - "evaluation_name": "Formal Logic", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Formal Logic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.468, - "details": { - "description": "min=0.468, mean=0.468, max=0.468, sum=0.937 (2)", - "tab": "Accuracy", - "Formal Logic - Observed inference time (s)": { - "description": "min=0.415, mean=0.415, max=0.415, sum=0.83 (2)", - "tab": "Efficiency", - "score": 0.41500668109409394 - }, - "Formal Logic - # eval": { - "description": "min=126, mean=126, max=126, sum=252 (2)", - "tab": "General information", - "score": 126.0 - }, - "Formal Logic - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Formal Logic - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Formal Logic - # prompt tokens": { - "description": "min=691.81, mean=691.81, max=691.81, sum=1383.619 (2)", - "tab": "General information", - "score": 691.8095238095239 - }, - "Formal Logic - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" - } - } - }, - { - "evaluation_name": "High School World History", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on High School World History", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.882, - "details": { - "description": "min=0.882, mean=0.882, max=0.882, sum=1.764 (2)", - "tab": "Accuracy", - "High School Biology - Observed inference time (s)": { - "description": "min=0.379, mean=0.379, max=0.379, sum=0.759 (2)", - "tab": "Efficiency", - "score": 0.3793416823110273 - }, - "High School Chemistry - Observed inference time (s)": { - "description": "min=0.402, mean=0.402, max=0.402, sum=0.804 (2)", - "tab": "Efficiency", - "score": 0.4020436197666112 - }, - "High School Computer Science - Observed inference time (s)": { - "description": "min=0.562, mean=0.562, max=0.562, sum=1.124 (2)", - "tab": "Efficiency", - "score": 0.5618092942237854 - }, - "High School European History - Observed inference time (s)": { - "description": "min=0.981, mean=0.981, max=0.981, sum=1.962 (2)", - "tab": "Efficiency", - "score": 0.9809041355595444 - }, - "High School Geography - Observed inference time (s)": { - "description": "min=0.415, mean=0.415, max=0.415, sum=0.83 (2)", - "tab": "Efficiency", - "score": 0.41476938218781445 - }, - "High School Government And Politics - Observed inference time (s)": { - "description": "min=0.388, mean=0.388, max=0.388, sum=0.775 (2)", - "tab": "Efficiency", - "score": 0.3875881736142648 - }, - "High School Macroeconomics - Observed inference time (s)": { - "description": "min=0.38, mean=0.38, max=0.38, sum=0.76 (2)", - "tab": "Efficiency", - "score": 0.3797990028674786 - }, - "High School Mathematics - Observed inference time (s)": { - "description": "min=0.408, mean=0.408, max=0.408, sum=0.817 (2)", - "tab": "Efficiency", - "score": 0.40841888145164207 - }, - "High School Microeconomics - Observed inference time (s)": { - "description": "min=0.441, mean=0.441, max=0.441, sum=0.882 (2)", - "tab": "Efficiency", - "score": 0.4407546289828645 - }, - "High School Physics - Observed inference time (s)": { - "description": "min=0.426, mean=0.426, max=0.426, sum=0.851 (2)", - "tab": "Efficiency", - "score": 0.42553993724039846 - }, - "High School Psychology - Observed inference time (s)": { - "description": "min=0.469, mean=0.469, max=0.469, sum=0.939 (2)", - "tab": "Efficiency", - "score": 0.46939194880494284 - }, - "High School Statistics - Observed inference time (s)": { - "description": "min=0.499, mean=0.499, max=0.499, sum=0.998 (2)", - "tab": "Efficiency", - "score": 0.4990172529662097 - }, - "High School US History - Observed inference time (s)": { - "description": "min=0.847, mean=0.847, max=0.847, sum=1.693 (2)", - "tab": "Efficiency", - "score": 0.8465246745184356 - }, - "High School World History - Observed inference time (s)": { - "description": "min=0.558, mean=0.558, max=0.558, sum=1.117 (2)", - "tab": "Efficiency", - "score": 0.5583362217190899 - }, - "High School Biology - # eval": { - "description": "min=310, mean=310, max=310, sum=620 (2)", - "tab": "General information", - "score": 310.0 - }, - "High School Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Biology - # prompt 
tokens": { - "description": "min=596.894, mean=596.894, max=596.894, sum=1193.787 (2)", - "tab": "General information", - "score": 596.8935483870968 - }, - "High School Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Chemistry - # eval": { - "description": "min=203, mean=203, max=203, sum=406 (2)", - "tab": "General information", - "score": 203.0 - }, - "High School Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # prompt tokens": { - "description": "min=568.665, mean=568.665, max=568.665, sum=1137.33 (2)", - "tab": "General information", - "score": 568.6650246305419 - }, - "High School Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "High School Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # prompt tokens": { - "description": "min=988.57, mean=988.57, max=988.57, sum=1977.14 (2)", - "tab": "General information", - "score": 988.57 - }, - "High School Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School European History - # eval": { - "description": "min=165, mean=165, max=165, sum=330 (2)", - "tab": "General information", - "score": 165.0 - }, - "High School European History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School European History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School European History - # prompt tokens": { - "description": "min=3159.636, mean=3159.636, max=3159.636, sum=6319.273 (2)", - "tab": "General information", - "score": 3159.6363636363635 - }, - "High School European History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Geography - # eval": { - "description": "min=198, mean=198, max=198, sum=396 (2)", - "tab": "General information", - "score": 198.0 - }, - "High School Geography - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Geography - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Geography - # prompt tokens": { - "description": "min=436.657, mean=436.657, max=436.657, sum=873.313 (2)", - "tab": "General information", - "score": 436.65656565656565 - }, - "High School Geography - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Government And Politics - # eval": { - 
"description": "min=193, mean=193, max=193, sum=386 (2)", - "tab": "General information", - "score": 193.0 - }, - "High School Government And Politics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Government And Politics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # prompt tokens": { - "description": "min=527.927, mean=527.927, max=527.927, sum=1055.855 (2)", - "tab": "General information", - "score": 527.9274611398964 - }, - "High School Government And Politics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Macroeconomics - # eval": { - "description": "min=390, mean=390, max=390, sum=780 (2)", - "tab": "General information", - "score": 390.0 - }, - "High School Macroeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Macroeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - # prompt tokens": { - "description": "min=445.662, mean=445.662, max=445.662, sum=891.323 (2)", - "tab": "General information", - "score": 445.66153846153844 - }, - "High School Macroeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Mathematics - # eval": { - "description": "min=270, mean=270, max=270, sum=540 (2)", - "tab": "General information", - "score": 270.0 - }, - "High School Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # prompt tokens": { - "description": "min=579.181, mean=579.181, max=579.181, sum=1158.363 (2)", - "tab": "General information", - "score": 579.1814814814815 - }, - "High School Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Microeconomics - # eval": { - "description": "min=238, mean=238, max=238, sum=476 (2)", - "tab": "General information", - "score": 238.0 - }, - "High School Microeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Microeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Microeconomics - # prompt tokens": { - "description": "min=449.492, mean=449.492, max=449.492, sum=898.983 (2)", - "tab": "General information", - "score": 449.49159663865544 - }, - "High School Microeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Physics - # eval": { - "description": "min=151, mean=151, max=151, sum=302 (2)", - "tab": "General information", - "score": 151.0 - }, - "High School Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Physics - truncated": { - "description": 
"min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # prompt tokens": { - "description": "min=621.788, mean=621.788, max=621.788, sum=1243.576 (2)", - "tab": "General information", - "score": 621.7880794701987 - }, - "High School Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Psychology - # eval": { - "description": "min=545, mean=545, max=545, sum=1090 (2)", - "tab": "General information", - "score": 545.0 - }, - "High School Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # prompt tokens": { - "description": "min=585.919, mean=585.919, max=585.919, sum=1171.839 (2)", - "tab": "General information", - "score": 585.9192660550459 - }, - "High School Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Statistics - # eval": { - "description": "min=216, mean=216, max=216, sum=432 (2)", - "tab": "General information", - "score": 216.0 - }, - "High School Statistics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Statistics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Statistics - # prompt tokens": { - "description": "min=908.208, mean=908.208, max=908.208, sum=1816.417 (2)", - "tab": "General information", - "score": 908.2083333333334 - }, - "High School Statistics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School US History - # eval": { - "description": "min=204, mean=204, max=204, sum=408 (2)", - "tab": "General information", - "score": 204.0 - }, - "High School US History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School US History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # prompt tokens": { - "description": "min=2535.324, mean=2535.324, max=2535.324, sum=5070.647 (2)", - "tab": "General information", - "score": 2535.323529411765 - }, - "High School US History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School World History - # eval": { - "description": "min=237, mean=237, max=237, sum=474 (2)", - "tab": "General information", - "score": 237.0 - }, - "High School World History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School World History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School World History - # prompt tokens": { - "description": "min=1638.219, mean=1638.219, max=1638.219, sum=3276.439 (2)", - "tab": "General information", - "score": 1638.2194092827003 - }, - "High School World History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": 
"General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" - } - } - }, - { - "evaluation_name": "Human Sexuality", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Human Sexuality", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.84, - "details": { - "description": "min=0.84, mean=0.84, max=0.84, sum=1.679 (2)", - "tab": "Accuracy", - "Human Aging - Observed inference time (s)": { - "description": "min=0.487, mean=0.487, max=0.487, sum=0.973 (2)", - "tab": "Efficiency", - "score": 0.4866963897585334 - }, - "Human Sexuality - Observed inference time (s)": { - "description": "min=0.39, mean=0.39, max=0.39, sum=0.781 (2)", - "tab": "Efficiency", - "score": 0.3902700020156744 - }, - "Human Aging - # eval": { - "description": "min=223, mean=223, max=223, sum=446 (2)", - "tab": "General information", - "score": 223.0 - }, - "Human Aging - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Aging - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Aging - # prompt tokens": { - "description": "min=361.26, mean=361.26, max=361.26, sum=722.52 (2)", - "tab": "General information", - "score": 361.26008968609864 - }, - "Human Aging - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Human Sexuality - # eval": { - "description": "min=131, mean=131, max=131, sum=262 (2)", - "tab": "General information", - "score": 131.0 - }, - "Human Sexuality - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Sexuality - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # prompt tokens": { - "description": "min=403.382, mean=403.382, max=403.382, sum=806.763 (2)", - "tab": "General information", - "score": 403.381679389313 - }, - "Human Sexuality - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" - } - } - }, - { - "evaluation_name": "International Law", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on International Law", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.868, - "details": { - "description": "min=0.868, mean=0.868, max=0.868, sum=1.736 (2)", - "tab": "Accuracy", - "International Law - Observed inference time (s)": { - "description": "min=0.538, mean=0.538, max=0.538, 
sum=1.076 (2)", - "tab": "Efficiency", - "score": 0.5381311483619627 - }, - "International Law - # eval": { - "description": "min=121, mean=121, max=121, sum=242 (2)", - "tab": "General information", - "score": 121.0 - }, - "International Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "International Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "International Law - # prompt tokens": { - "description": "min=729.463, mean=729.463, max=729.463, sum=1458.926 (2)", - "tab": "General information", - "score": 729.4628099173553 - }, - "International Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" - } - } - }, - { - "evaluation_name": "Logical Fallacies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Logical Fallacies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.791, - "details": { - "description": "min=0.791, mean=0.791, max=0.791, sum=1.583 (2)", - "tab": "Accuracy", - "Logical Fallacies - Observed inference time (s)": { - "description": "min=0.451, mean=0.451, max=0.451, sum=0.903 (2)", - "tab": "Efficiency", - "score": 0.4513764015736024 - }, - "Logical Fallacies - # eval": { - "description": "min=163, mean=163, max=163, sum=326 (2)", - "tab": "General information", - "score": 163.0 - }, - "Logical Fallacies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Logical Fallacies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Logical Fallacies - # prompt tokens": { - "description": "min=502.755, mean=502.755, max=502.755, sum=1005.509 (2)", - "tab": "General information", - "score": 502.7546012269939 - }, - "Logical Fallacies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" - } - } - }, - { - "evaluation_name": "Machine Learning", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Machine Learning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.491, - "details": { - "description": "min=0.491, mean=0.491, max=0.491, sum=0.982 (2)", - "tab": "Accuracy", - "Machine Learning - Observed inference time (s)": { - "description": "min=0.447, mean=0.447, max=0.447, sum=0.895 (2)", - "tab": "Efficiency", - "score": 0.4473994416849954 - }, - "Machine Learning - # eval": { 
- "description": "min=112, mean=112, max=112, sum=224 (2)", - "tab": "General information", - "score": 112.0 - }, - "Machine Learning - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Machine Learning - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Machine Learning - # prompt tokens": { - "description": "min=730.402, mean=730.402, max=730.402, sum=1460.804 (2)", - "tab": "General information", - "score": 730.4017857142857 - }, - "Machine Learning - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" - } - } - }, - { - "evaluation_name": "Management", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Management", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.845, - "details": { - "description": "min=0.845, mean=0.845, max=0.845, sum=1.689 (2)", - "tab": "Accuracy", - "Management - Observed inference time (s)": { - "description": "min=0.482, mean=0.482, max=0.482, sum=0.965 (2)", - "tab": "Efficiency", - "score": 0.482250699719179 - }, - "Management - # eval": { - "description": "min=103, mean=103, max=103, sum=206 (2)", - "tab": "General information", - "score": 103.0 - }, - "Management - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Management - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Management - # prompt tokens": { - "description": "min=315.777, mean=315.777, max=315.777, sum=631.553 (2)", - "tab": "General information", - "score": 315.77669902912623 - }, - "Management - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" - } - } - }, - { - "evaluation_name": "Marketing", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Marketing", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.889, - "details": { - "description": "min=0.889, mean=0.889, max=0.889, sum=1.778 (2)", - "tab": "Accuracy", - "Marketing - Observed inference time (s)": { - "description": "min=0.529, mean=0.529, max=0.529, sum=1.059 (2)", - "tab": "Efficiency", - "score": 0.5294328500062991 - }, - "Marketing - # eval": { - "description": "min=234, mean=234, max=234, sum=468 (2)", - "tab": "General information", - "score": 234.0 - }, - "Marketing - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General 
information", - "score": 5.0 - }, - "Marketing - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Marketing - # prompt tokens": { - "description": "min=472.628, mean=472.628, max=472.628, sum=945.256 (2)", - "tab": "General information", - "score": 472.62820512820514 - }, - "Marketing - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" - } - } - }, - { - "evaluation_name": "Medical Genetics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Medical Genetics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.72, - "details": { - "description": "min=0.72, mean=0.72, max=0.72, sum=1.44 (2)", - "tab": "Accuracy", - "Medical Genetics - Observed inference time (s)": { - "description": "min=0.426, mean=0.426, max=0.426, sum=0.852 (2)", - "tab": "Efficiency", - "score": 0.42598395347595214 - }, - "Medical Genetics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Medical Genetics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Medical Genetics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Medical Genetics - # prompt tokens": { - "description": "min=408.14, mean=408.14, max=408.14, sum=816.28 (2)", - "tab": "General information", - "score": 408.14 - }, - "Medical Genetics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" - } - } - }, - { - "evaluation_name": "Miscellaneous", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Miscellaneous", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.857, - "details": { - "description": "min=0.857, mean=0.857, max=0.857, sum=1.714 (2)", - "tab": "Accuracy", - "Miscellaneous - Observed inference time (s)": { - "description": "min=0.434, mean=0.434, max=0.434, sum=0.868 (2)", - "tab": "Efficiency", - "score": 0.43395179502504233 - }, - "Miscellaneous - # eval": { - "description": "min=783, mean=783, max=783, sum=1566 (2)", - "tab": "General information", - "score": 783.0 - }, - "Miscellaneous - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Miscellaneous - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Miscellaneous - # 
prompt tokens": { - "description": "min=345.913, mean=345.913, max=345.913, sum=691.826 (2)", - "tab": "General information", - "score": 345.9131545338442 - }, - "Miscellaneous - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" - } - } - }, - { - "evaluation_name": "Moral Scenarios", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Moral Scenarios", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.45, - "details": { - "description": "min=0.45, mean=0.45, max=0.45, sum=0.901 (2)", - "tab": "Accuracy", - "Moral Disputes - Observed inference time (s)": { - "description": "min=0.404, mean=0.404, max=0.404, sum=0.809 (2)", - "tab": "Efficiency", - "score": 0.4043546129513338 - }, - "Moral Scenarios - Observed inference time (s)": { - "description": "min=0.514, mean=0.514, max=0.514, sum=1.028 (2)", - "tab": "Efficiency", - "score": 0.5137747306397508 - }, - "Moral Disputes - # eval": { - "description": "min=346, mean=346, max=346, sum=692 (2)", - "tab": "General information", - "score": 346.0 - }, - "Moral Disputes - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Disputes - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Disputes - # prompt tokens": { - "description": "min=542.506, mean=542.506, max=542.506, sum=1085.012 (2)", - "tab": "General information", - "score": 542.5057803468208 - }, - "Moral Disputes - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Moral Scenarios - # eval": { - "description": "min=895, mean=895, max=895, sum=1790 (2)", - "tab": "General information", - "score": 895.0 - }, - "Moral Scenarios - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Scenarios - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # prompt tokens": { - "description": "min=756.479, mean=756.479, max=756.479, sum=1512.959 (2)", - "tab": "General information", - "score": 756.4793296089385 - }, - "Moral Scenarios - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" - } - } - }, - { - "evaluation_name": "Nutrition", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Nutrition", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.758, - "details": { - "description": "min=0.758, mean=0.758, max=0.758, sum=1.516 (2)", - "tab": "Accuracy", - "Nutrition - Observed inference time (s)": { - "description": "min=0.447, mean=0.447, max=0.447, sum=0.895 (2)", - "tab": "Efficiency", - "score": 0.44729572885176716 - }, - "Nutrition - # eval": { - "description": "min=306, mean=306, max=306, sum=612 (2)", - "tab": "General information", - "score": 306.0 - }, - "Nutrition - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Nutrition - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Nutrition - # prompt tokens": { - "description": "min=695.922, mean=695.922, max=695.922, sum=1391.843 (2)", - "tab": "General information", - "score": 695.9215686274509 - }, - "Nutrition - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" - } - } - }, - { - "evaluation_name": "Prehistory", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Prehistory", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.84, - "details": { - "description": "min=0.84, mean=0.84, max=0.84, sum=1.679 (2)", - "tab": "Accuracy", - "Prehistory - Observed inference time (s)": { - "description": "min=0.472, mean=0.472, max=0.472, sum=0.945 (2)", - "tab": "Efficiency", - "score": 0.4722691575686137 - }, - "Prehistory - # eval": { - "description": "min=324, mean=324, max=324, sum=648 (2)", - "tab": "General information", - "score": 324.0 - }, - "Prehistory - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Prehistory - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Prehistory - # prompt tokens": { - "description": "min=619.185, mean=619.185, max=619.185, sum=1238.37 (2)", - "tab": "General information", - "score": 619.1851851851852 - }, - "Prehistory - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" - } - } - }, - { - "evaluation_name": "Public Relations", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Public Relations", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.745, - "details": { - "description": "min=0.745, mean=0.745, max=0.745, sum=1.491 (2)", - "tab": "Accuracy", - "Public Relations - Observed inference time (s)": { - "description": "min=0.345, mean=0.345, 
max=0.345, sum=0.69 (2)", - "tab": "Efficiency", - "score": 0.34489609761671586 - }, - "Public Relations - # eval": { - "description": "min=110, mean=110, max=110, sum=220 (2)", - "tab": "General information", - "score": 110.0 - }, - "Public Relations - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Public Relations - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Public Relations - # prompt tokens": { - "description": "min=474.827, mean=474.827, max=474.827, sum=949.655 (2)", - "tab": "General information", - "score": 474.8272727272727 - }, - "Public Relations - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" - } - } - }, - { - "evaluation_name": "Security Studies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Security Studies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.796, - "details": { - "description": "min=0.796, mean=0.796, max=0.796, sum=1.592 (2)", - "tab": "Accuracy", - "Security Studies - Observed inference time (s)": { - "description": "min=0.484, mean=0.484, max=0.484, sum=0.968 (2)", - "tab": "Efficiency", - "score": 0.48404579649166185 - }, - "Security Studies - # eval": { - "description": "min=245, mean=245, max=245, sum=490 (2)", - "tab": "General information", - "score": 245.0 - }, - "Security Studies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Security Studies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Security Studies - # prompt tokens": { - "description": "min=1377.531, mean=1377.531, max=1377.531, sum=2755.061 (2)", - "tab": "General information", - "score": 1377.530612244898 - }, - "Security Studies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" - } - } - }, - { - "evaluation_name": "Sociology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Sociology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9, - "details": { - "description": "min=0.9, mean=0.9, max=0.9, sum=1.801 (2)", - "tab": "Accuracy", - "Sociology - Observed inference time (s)": { - "description": "min=0.384, mean=0.384, max=0.384, sum=0.769 (2)", - "tab": "Efficiency", - "score": 0.38445919781775023 - }, - "Sociology - # eval": { - "description": "min=201, mean=201, 
max=201, sum=402 (2)", - "tab": "General information", - "score": 201.0 - }, - "Sociology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Sociology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Sociology - # prompt tokens": { - "description": "min=508.478, mean=508.478, max=508.478, sum=1016.955 (2)", - "tab": "General information", - "score": 508.4776119402985 - }, - "Sociology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" - } - } - }, - { - "evaluation_name": "Virology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Virology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.53, - "details": { - "description": "min=0.53, mean=0.53, max=0.53, sum=1.06 (2)", - "tab": "Accuracy", - "Virology - Observed inference time (s)": { - "description": "min=0.464, mean=0.464, max=0.464, sum=0.928 (2)", - "tab": "Efficiency", - "score": 0.464106645928808 - }, - "Virology - # eval": { - "description": "min=166, mean=166, max=166, sum=332 (2)", - "tab": "General information", - "score": 166.0 - }, - "Virology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Virology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Virology - # prompt tokens": { - "description": "min=405.108, mean=405.108, max=405.108, sum=810.217 (2)", - "tab": "General information", - "score": 405.10843373493975 - }, - "Virology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" - } - } - }, - { - "evaluation_name": "World Religions", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on World Religions", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.854, - "details": { - "description": "min=0.854, mean=0.854, max=0.854, sum=1.708 (2)", - "tab": "Accuracy", - "World Religions - Observed inference time (s)": { - "description": "min=0.416, mean=0.416, max=0.416, sum=0.831 (2)", - "tab": "Efficiency", - "score": 0.41569664603785467 - }, - "World Religions - # eval": { - "description": "min=171, mean=171, max=171, sum=342 (2)", - "tab": "General information", - "score": 171.0 - }, - "World Religions - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "World Religions - truncated": { - 
"description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "World Religions - # prompt tokens": { - "description": "min=304.474, mean=304.474, max=304.474, sum=608.947 (2)", - "tab": "General information", - "score": 304.4736842105263 - }, - "World Religions - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" - } - } - }, - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.508, - "details": { - "tab": "Efficiency" - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_mmlu/meta/llama-2-7b/294b22a0-1676-4d8c-8ad2-5cdc40267255.json b/data/helm_mmlu/meta/llama-2-7b/294b22a0-1676-4d8c-8ad2-5cdc40267255.json deleted file mode 100644 index 0649e7329..000000000 --- a/data/helm_mmlu/meta/llama-2-7b/294b22a0-1676-4d8c-8ad2-5cdc40267255.json +++ /dev/null @@ -1,3021 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/meta_llama-2-7b/1770835937.459157", - "retrieved_timestamp": "1770835937.459157", - "source_metadata": { - "source_name": "helm_mmlu", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama 2 7B", - "id": "meta/llama-2-7b", - "developer": "meta", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "MMLU All Subjects", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU All Subjects", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.458, - "details": { - "description": "min=0.196, mean=0.458, max=0.713, sum=52.224 (114)", - "tab": "Accuracy", - "MMLU All Subjects - Observed inference time (s)": { - "description": "min=0.28, mean=0.374, max=0.947, sum=42.6 (114)", - "tab": "Efficiency", - "score": 0.37368440752207543 - }, - "MMLU All Subjects - # eval": { - "description": "min=100, mean=246.351, max=1534, sum=28084 (114)", - "tab": "General information", - "score": 246.35087719298247 - }, - "MMLU All Subjects - # train": { - "description": "min=5, mean=5, max=5, sum=570 (114)", - "tab": "General information", - "score": 5.0 - }, - "MMLU All Subjects - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (114)", - "tab": "General information", - "score": 0.0 - }, - "MMLU All Subjects - # prompt tokens": { - "description": "min=304.474, mean=706.682, max=3159.636, sum=80561.749 (114)", - "tab": "General information", - "score": 706.6820126388612 - }, - "MMLU All Subjects - # output tokens": { - "description": 
"min=1, mean=1, max=1, sum=114 (114)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] - } - } - }, - { - "evaluation_name": "Abstract Algebra", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Abstract Algebra", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.29, - "details": { - "description": "min=0.29, mean=0.29, max=0.29, sum=0.58 (2)", - "tab": "Accuracy", - "Abstract Algebra - Observed inference time (s)": { - "description": "min=0.332, mean=0.332, max=0.332, sum=0.664 (2)", - "tab": "Efficiency", - "score": 0.3319991087913513 - }, - "Abstract Algebra - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Abstract Algebra - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Abstract Algebra - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Abstract Algebra - # prompt tokens": { - "description": "min=397.65, mean=397.65, max=397.65, sum=795.3 (2)", - "tab": "General information", - "score": 397.65 - }, - "Abstract Algebra - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" - } - } - }, - { - "evaluation_name": "Anatomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Anatomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.452, - "details": { - "description": "min=0.452, mean=0.452, max=0.452, sum=0.904 (2)", - "tab": "Accuracy", - "Anatomy - Observed inference time (s)": { - "description": "min=0.31, mean=0.31, max=0.31, sum=0.619 (2)", - "tab": "Efficiency", - "score": 0.3097020767353199 - }, - "Anatomy - # eval": { - "description": "min=135, mean=135, max=135, sum=270 (2)", - "tab": "General information", - "score": 135.0 - }, - "Anatomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Anatomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Anatomy - # prompt tokens": { - "description": "min=409.133, mean=409.133, max=409.133, sum=818.267 (2)", - "tab": "General information", - "score": 409.1333333333333 - }, - "Anatomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" - } - } - }, - { - "evaluation_name": "College Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on College Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.196, - "details": { - "description": "min=0.196, mean=0.196, max=0.196, sum=0.392 (2)", - "tab": "Accuracy", - "College Chemistry - 
Observed inference time (s)": { - "description": "min=0.35, mean=0.35, max=0.35, sum=0.7 (2)", - "tab": "Efficiency", - "score": 0.35009843587875367 - }, - "College Biology - Observed inference time (s)": { - "description": "min=0.328, mean=0.328, max=0.328, sum=0.656 (2)", - "tab": "Efficiency", - "score": 0.3278946164581511 - }, - "College Computer Science - Observed inference time (s)": { - "description": "min=0.381, mean=0.381, max=0.381, sum=0.763 (2)", - "tab": "Efficiency", - "score": 0.38129755973815915 - }, - "College Mathematics - Observed inference time (s)": { - "description": "min=0.341, mean=0.341, max=0.341, sum=0.682 (2)", - "tab": "Efficiency", - "score": 0.3409119129180908 - }, - "College Medicine - Observed inference time (s)": { - "description": "min=0.331, mean=0.331, max=0.331, sum=0.662 (2)", - "tab": "Efficiency", - "score": 0.3307889693045203 - }, - "College Physics - Observed inference time (s)": { - "description": "min=0.34, mean=0.34, max=0.34, sum=0.68 (2)", - "tab": "Efficiency", - "score": 0.3398791224348779 - }, - "College Chemistry - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Chemistry - # prompt tokens": { - "description": "min=622.43, mean=622.43, max=622.43, sum=1244.86 (2)", - "tab": "General information", - "score": 622.43 - }, - "College Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Biology - # eval": { - "description": "min=144, mean=144, max=144, sum=288 (2)", - "tab": "General information", - "score": 144.0 - }, - "College Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # prompt tokens": { - "description": "min=553.632, mean=553.632, max=553.632, sum=1107.264 (2)", - "tab": "General information", - "score": 553.6319444444445 - }, - "College Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer Science - # prompt tokens": { - "description": "min=901.14, mean=901.14, max=901.14, sum=1802.28 (2)", - "tab": "General information", - "score": 901.14 - }, - "College Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Mathematics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Mathematics - # train": { - "description": "min=5, 
mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # prompt tokens": { - "description": "min=646.96, mean=646.96, max=646.96, sum=1293.92 (2)", - "tab": "General information", - "score": 646.96 - }, - "College Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Medicine - # eval": { - "description": "min=173, mean=173, max=173, sum=346 (2)", - "tab": "General information", - "score": 173.0 - }, - "College Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Medicine - # prompt tokens": { - "description": "min=608.671, mean=608.671, max=608.671, sum=1217.341 (2)", - "tab": "General information", - "score": 608.6705202312139 - }, - "College Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Physics - # eval": { - "description": "min=102, mean=102, max=102, sum=204 (2)", - "tab": "General information", - "score": 102.0 - }, - "College Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Physics - # prompt tokens": { - "description": "min=551.873, mean=551.873, max=551.873, sum=1103.745 (2)", - "tab": "General information", - "score": 551.8725490196078 - }, - "College Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" - } - } - }, - { - "evaluation_name": "Computer Security", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Computer Security", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.59, - "details": { - "description": "min=0.59, mean=0.59, max=0.59, sum=1.18 (2)", - "tab": "Accuracy", - "Computer Security - Observed inference time (s)": { - "description": "min=0.329, mean=0.329, max=0.329, sum=0.659 (2)", - "tab": "Efficiency", - "score": 0.3293105459213257 - }, - "Computer Security - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Computer Security - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Computer Security - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Computer Security - # prompt tokens": { - "description": "min=428.17, mean=428.17, max=428.17, sum=856.34 (2)", - 
"tab": "General information", - "score": 428.17 - }, - "Computer Security - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" - } - } - }, - { - "evaluation_name": "Econometrics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Econometrics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.316, - "details": { - "description": "min=0.316, mean=0.316, max=0.316, sum=0.632 (2)", - "tab": "Accuracy", - "Econometrics - Observed inference time (s)": { - "description": "min=0.375, mean=0.375, max=0.375, sum=0.749 (2)", - "tab": "Efficiency", - "score": 0.3746668204926608 - }, - "Econometrics - # eval": { - "description": "min=114, mean=114, max=114, sum=228 (2)", - "tab": "General information", - "score": 114.0 - }, - "Econometrics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Econometrics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Econometrics - # prompt tokens": { - "description": "min=684.675, mean=684.675, max=684.675, sum=1369.351 (2)", - "tab": "General information", - "score": 684.6754385964912 - }, - "Econometrics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" - } - } - }, - { - "evaluation_name": "Global Facts", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Global Facts", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.29, - "details": { - "description": "min=0.29, mean=0.29, max=0.29, sum=0.58 (2)", - "tab": "Accuracy", - "Global Facts - Observed inference time (s)": { - "description": "min=0.329, mean=0.329, max=0.329, sum=0.659 (2)", - "tab": "Efficiency", - "score": 0.32934638738632205 - }, - "Global Facts - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Global Facts - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Global Facts - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Global Facts - # prompt tokens": { - "description": "min=484.54, mean=484.54, max=484.54, sum=969.08 (2)", - "tab": "General information", - "score": 484.54 - }, - "Global Facts - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - 
"generation_config": { - "additional_details": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" - } - } - }, - { - "evaluation_name": "Jurisprudence", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Jurisprudence", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.519, - "details": { - "description": "min=0.519, mean=0.519, max=0.519, sum=1.037 (2)", - "tab": "Accuracy", - "Jurisprudence - Observed inference time (s)": { - "description": "min=0.294, mean=0.294, max=0.294, sum=0.588 (2)", - "tab": "Efficiency", - "score": 0.2942208139984696 - }, - "Jurisprudence - # eval": { - "description": "min=108, mean=108, max=108, sum=216 (2)", - "tab": "General information", - "score": 108.0 - }, - "Jurisprudence - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Jurisprudence - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Jurisprudence - # prompt tokens": { - "description": "min=449.898, mean=449.898, max=449.898, sum=899.796 (2)", - "tab": "General information", - "score": 449.89814814814815 - }, - "Jurisprudence - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" - } - } - }, - { - "evaluation_name": "Philosophy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Philosophy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.592, - "details": { - "description": "min=0.592, mean=0.592, max=0.592, sum=1.183 (2)", - "tab": "Accuracy", - "Philosophy - Observed inference time (s)": { - "description": "min=0.3, mean=0.3, max=0.3, sum=0.6 (2)", - "tab": "Efficiency", - "score": 0.2999055814896366 - }, - "Philosophy - # eval": { - "description": "min=311, mean=311, max=311, sum=622 (2)", - "tab": "General information", - "score": 311.0 - }, - "Philosophy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Philosophy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Philosophy - # prompt tokens": { - "description": "min=372.122, mean=372.122, max=372.122, sum=744.244 (2)", - "tab": "General information", - "score": 372.12218649517683 - }, - "Philosophy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" - } - } - }, - { - "evaluation_name": "Professional 
Psychology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Professional Psychology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.459, - "details": { - "description": "min=0.459, mean=0.459, max=0.459, sum=0.918 (2)", - "tab": "Accuracy", - "Professional Medicine - Observed inference time (s)": { - "description": "min=0.463, mean=0.463, max=0.463, sum=0.926 (2)", - "tab": "Efficiency", - "score": 0.463154871674145 - }, - "Professional Accounting - Observed inference time (s)": { - "description": "min=0.377, mean=0.377, max=0.377, sum=0.755 (2)", - "tab": "Efficiency", - "score": 0.37741253392916196 - }, - "Professional Law - Observed inference time (s)": { - "description": "min=0.618, mean=0.618, max=0.618, sum=1.235 (2)", - "tab": "Efficiency", - "score": 0.6177054020385543 - }, - "Professional Psychology - Observed inference time (s)": { - "description": "min=0.44, mean=0.44, max=0.44, sum=0.879 (2)", - "tab": "Efficiency", - "score": 0.4397414544828577 - }, - "Professional Medicine - # eval": { - "description": "min=272, mean=272, max=272, sum=544 (2)", - "tab": "General information", - "score": 272.0 - }, - "Professional Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Medicine - # prompt tokens": { - "description": "min=1330.647, mean=1330.647, max=1330.647, sum=2661.294 (2)", - "tab": "General information", - "score": 1330.6470588235295 - }, - "Professional Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Accounting - # eval": { - "description": "min=282, mean=282, max=282, sum=564 (2)", - "tab": "General information", - "score": 282.0 - }, - "Professional Accounting - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Accounting - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Accounting - # prompt tokens": { - "description": "min=823.277, mean=823.277, max=823.277, sum=1646.553 (2)", - "tab": "General information", - "score": 823.2765957446809 - }, - "Professional Accounting - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Law - # eval": { - "description": "min=1534, mean=1534, max=1534, sum=3068 (2)", - "tab": "General information", - "score": 1534.0 - }, - "Professional Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # prompt tokens": { - "description": "min=1915.007, mean=1915.007, max=1915.007, sum=3830.014 (2)", - "tab": "General information", - "score": 1915.0071707953064 - }, - "Professional Law - # output tokens": { - "description": "min=1, 
mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Psychology - # eval": { - "description": "min=612, mean=612, max=612, sum=1224 (2)", - "tab": "General information", - "score": 612.0 - }, - "Professional Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # prompt tokens": { - "description": "min=650.078, mean=650.078, max=650.078, sum=1300.157 (2)", - "tab": "General information", - "score": 650.0784313725491 - }, - "Professional Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" - } - } - }, - { - "evaluation_name": "Us Foreign Policy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Us Foreign Policy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.64, - "details": { - "description": "min=0.64, mean=0.64, max=0.64, sum=1.28 (2)", - "tab": "Accuracy", - "Us Foreign Policy - Observed inference time (s)": { - "description": "min=0.343, mean=0.343, max=0.343, sum=0.686 (2)", - "tab": "Efficiency", - "score": 0.3431359338760376 - }, - "Us Foreign Policy - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Us Foreign Policy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Us Foreign Policy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Us Foreign Policy - # prompt tokens": { - "description": "min=479.81, mean=479.81, max=479.81, sum=959.62 (2)", - "tab": "General information", - "score": 479.81 - }, - "Us Foreign Policy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" - } - } - }, - { - "evaluation_name": "Astronomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Astronomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.408, - "details": { - "description": "min=0.408, mean=0.408, max=0.408, sum=0.816 (2)", - "tab": "Accuracy", - "Astronomy - Observed inference time (s)": { - "description": "min=0.345, mean=0.345, max=0.345, sum=0.69 (2)", - "tab": "Efficiency", - "score": 0.34498921193574605 - }, - "Astronomy - # eval": { - 
"description": "min=152, mean=152, max=152, sum=304 (2)", - "tab": "General information", - "score": 152.0 - }, - "Astronomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Astronomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Astronomy - # prompt tokens": { - "description": "min=681.079, mean=681.079, max=681.079, sum=1362.158 (2)", - "tab": "General information", - "score": 681.078947368421 - }, - "Astronomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" - } - } - }, - { - "evaluation_name": "Business Ethics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Business Ethics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.48, - "details": { - "description": "min=0.48, mean=0.48, max=0.48, sum=0.96 (2)", - "tab": "Accuracy", - "Business Ethics - Observed inference time (s)": { - "description": "min=0.334, mean=0.334, max=0.334, sum=0.668 (2)", - "tab": "Efficiency", - "score": 0.3342457461357117 - }, - "Business Ethics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Business Ethics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Business Ethics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Business Ethics - # prompt tokens": { - "description": "min=674.44, mean=674.44, max=674.44, sum=1348.88 (2)", - "tab": "General information", - "score": 674.44 - }, - "Business Ethics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" - } - } - }, - { - "evaluation_name": "Clinical Knowledge", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Clinical Knowledge", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.453, - "details": { - "description": "min=0.453, mean=0.453, max=0.453, sum=0.906 (2)", - "tab": "Accuracy", - "Clinical Knowledge - Observed inference time (s)": { - "description": "min=0.323, mean=0.323, max=0.323, sum=0.645 (2)", - "tab": "Efficiency", - "score": 0.3225168426081819 - }, - "Clinical Knowledge - # eval": { - "description": "min=265, mean=265, max=265, sum=530 (2)", - "tab": "General information", - "score": 265.0 - }, - "Clinical Knowledge - # train": { - "description": "min=5, mean=5, max=5, 
sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Clinical Knowledge - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Clinical Knowledge - # prompt tokens": { - "description": "min=487.374, mean=487.374, max=487.374, sum=974.747 (2)", - "tab": "General information", - "score": 487.3735849056604 - }, - "Clinical Knowledge - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" - } - } - }, - { - "evaluation_name": "Conceptual Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Conceptual Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.434, - "details": { - "description": "min=0.434, mean=0.434, max=0.434, sum=0.868 (2)", - "tab": "Accuracy", - "Conceptual Physics - Observed inference time (s)": { - "description": "min=0.323, mean=0.323, max=0.323, sum=0.646 (2)", - "tab": "Efficiency", - "score": 0.32303770450835534 - }, - "Conceptual Physics - # eval": { - "description": "min=235, mean=235, max=235, sum=470 (2)", - "tab": "General information", - "score": 235.0 - }, - "Conceptual Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Conceptual Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Conceptual Physics - # prompt tokens": { - "description": "min=333.153, mean=333.153, max=333.153, sum=666.306 (2)", - "tab": "General information", - "score": 333.1531914893617 - }, - "Conceptual Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" - } - } - }, - { - "evaluation_name": "Electrical Engineering", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Electrical Engineering", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.407, - "details": { - "description": "min=0.407, mean=0.407, max=0.407, sum=0.814 (2)", - "tab": "Accuracy", - "Electrical Engineering - Observed inference time (s)": { - "description": "min=0.325, mean=0.325, max=0.325, sum=0.649 (2)", - "tab": "Efficiency", - "score": 0.32454562516048036 - }, - "Electrical Engineering - # eval": { - "description": "min=145, mean=145, max=145, sum=290 (2)", - "tab": "General information", - "score": 145.0 - }, - "Electrical Engineering - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, 
- "Electrical Engineering - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Electrical Engineering - # prompt tokens": { - "description": "min=497.779, mean=497.779, max=497.779, sum=995.559 (2)", - "tab": "General information", - "score": 497.7793103448276 - }, - "Electrical Engineering - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" - } - } - }, - { - "evaluation_name": "Elementary Mathematics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Elementary Mathematics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.254, - "details": { - "description": "min=0.254, mean=0.254, max=0.254, sum=0.508 (2)", - "tab": "Accuracy", - "Elementary Mathematics - Observed inference time (s)": { - "description": "min=0.334, mean=0.334, max=0.334, sum=0.669 (2)", - "tab": "Efficiency", - "score": 0.33426338718051 - }, - "Elementary Mathematics - # eval": { - "description": "min=378, mean=378, max=378, sum=756 (2)", - "tab": "General information", - "score": 378.0 - }, - "Elementary Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Elementary Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Elementary Mathematics - # prompt tokens": { - "description": "min=609.156, mean=609.156, max=609.156, sum=1218.312 (2)", - "tab": "General information", - "score": 609.1560846560847 - }, - "Elementary Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" - } - } - }, - { - "evaluation_name": "Formal Logic", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Formal Logic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.27, - "details": { - "description": "min=0.27, mean=0.27, max=0.27, sum=0.54 (2)", - "tab": "Accuracy", - "Formal Logic - Observed inference time (s)": { - "description": "min=0.383, mean=0.383, max=0.383, sum=0.766 (2)", - "tab": "Efficiency", - "score": 0.3832281846848745 - }, - "Formal Logic - # eval": { - "description": "min=126, mean=126, max=126, sum=252 (2)", - "tab": "General information", - "score": 126.0 - }, - "Formal Logic - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Formal Logic - truncated": { - "description": "min=0, 
mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Formal Logic - # prompt tokens": { - "description": "min=691.81, mean=691.81, max=691.81, sum=1383.619 (2)", - "tab": "General information", - "score": 691.8095238095239 - }, - "Formal Logic - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" - } - } - }, - { - "evaluation_name": "High School World History", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on High School World History", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.662, - "details": { - "description": "min=0.662, mean=0.662, max=0.662, sum=1.325 (2)", - "tab": "Accuracy", - "High School Biology - Observed inference time (s)": { - "description": "min=0.326, mean=0.326, max=0.326, sum=0.653 (2)", - "tab": "Efficiency", - "score": 0.32630388890543294 - }, - "High School Chemistry - Observed inference time (s)": { - "description": "min=0.306, mean=0.306, max=0.306, sum=0.611 (2)", - "tab": "Efficiency", - "score": 0.30552317473688734 - }, - "High School Computer Science - Observed inference time (s)": { - "description": "min=0.406, mean=0.406, max=0.406, sum=0.812 (2)", - "tab": "Efficiency", - "score": 0.4060112690925598 - }, - "High School European History - Observed inference time (s)": { - "description": "min=0.947, mean=0.947, max=0.947, sum=1.894 (2)", - "tab": "Efficiency", - "score": 0.9469690496271307 - }, - "High School Geography - Observed inference time (s)": { - "description": "min=0.327, mean=0.327, max=0.327, sum=0.655 (2)", - "tab": "Efficiency", - "score": 0.32730214523546625 - }, - "High School Government And Politics - Observed inference time (s)": { - "description": "min=0.337, mean=0.337, max=0.337, sum=0.674 (2)", - "tab": "Efficiency", - "score": 0.3369472236830954 - }, - "High School Macroeconomics - Observed inference time (s)": { - "description": "min=0.331, mean=0.331, max=0.331, sum=0.662 (2)", - "tab": "Efficiency", - "score": 0.3308515047415709 - }, - "High School Mathematics - Observed inference time (s)": { - "description": "min=0.336, mean=0.336, max=0.336, sum=0.671 (2)", - "tab": "Efficiency", - "score": 0.3355037459620723 - }, - "High School Microeconomics - Observed inference time (s)": { - "description": "min=0.459, mean=0.459, max=0.459, sum=0.918 (2)", - "tab": "Efficiency", - "score": 0.45884753475670054 - }, - "High School Physics - Observed inference time (s)": { - "description": "min=0.336, mean=0.336, max=0.336, sum=0.671 (2)", - "tab": "Efficiency", - "score": 0.3355141222871692 - }, - "High School Psychology - Observed inference time (s)": { - "description": "min=0.389, mean=0.389, max=0.389, sum=0.778 (2)", - "tab": "Efficiency", - "score": 0.3889624678760494 - }, - "High School Statistics - Observed inference time (s)": { - "description": "min=0.393, mean=0.393, max=0.393, sum=0.786 (2)", - "tab": "Efficiency", - "score": 0.39307444846188583 - }, - "High School US History - Observed inference time (s)": { - "description": "min=0.778, mean=0.778, 
max=0.778, sum=1.556 (2)", - "tab": "Efficiency", - "score": 0.7781471855500165 - }, - "High School World History - Observed inference time (s)": { - "description": "min=0.542, mean=0.542, max=0.542, sum=1.085 (2)", - "tab": "Efficiency", - "score": 0.5424087500270409 - }, - "High School Biology - # eval": { - "description": "min=310, mean=310, max=310, sum=620 (2)", - "tab": "General information", - "score": 310.0 - }, - "High School Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Biology - # prompt tokens": { - "description": "min=596.894, mean=596.894, max=596.894, sum=1193.787 (2)", - "tab": "General information", - "score": 596.8935483870968 - }, - "High School Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Chemistry - # eval": { - "description": "min=203, mean=203, max=203, sum=406 (2)", - "tab": "General information", - "score": 203.0 - }, - "High School Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # prompt tokens": { - "description": "min=568.665, mean=568.665, max=568.665, sum=1137.33 (2)", - "tab": "General information", - "score": 568.6650246305419 - }, - "High School Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "High School Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # prompt tokens": { - "description": "min=988.57, mean=988.57, max=988.57, sum=1977.14 (2)", - "tab": "General information", - "score": 988.57 - }, - "High School Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School European History - # eval": { - "description": "min=165, mean=165, max=165, sum=330 (2)", - "tab": "General information", - "score": 165.0 - }, - "High School European History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School European History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School European History - # prompt tokens": { - "description": "min=3159.636, mean=3159.636, max=3159.636, sum=6319.273 (2)", - "tab": "General information", - "score": 3159.6363636363635 - }, - "High School European History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Geography - # eval": { - "description": "min=198, mean=198, max=198, sum=396 (2)", - 
"tab": "General information", - "score": 198.0 - }, - "High School Geography - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Geography - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Geography - # prompt tokens": { - "description": "min=436.657, mean=436.657, max=436.657, sum=873.313 (2)", - "tab": "General information", - "score": 436.65656565656565 - }, - "High School Geography - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Government And Politics - # eval": { - "description": "min=193, mean=193, max=193, sum=386 (2)", - "tab": "General information", - "score": 193.0 - }, - "High School Government And Politics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Government And Politics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # prompt tokens": { - "description": "min=527.927, mean=527.927, max=527.927, sum=1055.855 (2)", - "tab": "General information", - "score": 527.9274611398964 - }, - "High School Government And Politics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Macroeconomics - # eval": { - "description": "min=390, mean=390, max=390, sum=780 (2)", - "tab": "General information", - "score": 390.0 - }, - "High School Macroeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Macroeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - # prompt tokens": { - "description": "min=445.662, mean=445.662, max=445.662, sum=891.323 (2)", - "tab": "General information", - "score": 445.66153846153844 - }, - "High School Macroeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Mathematics - # eval": { - "description": "min=270, mean=270, max=270, sum=540 (2)", - "tab": "General information", - "score": 270.0 - }, - "High School Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # prompt tokens": { - "description": "min=579.181, mean=579.181, max=579.181, sum=1158.363 (2)", - "tab": "General information", - "score": 579.1814814814815 - }, - "High School Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Microeconomics - # eval": { - "description": "min=238, mean=238, max=238, sum=476 (2)", - "tab": "General information", - "score": 238.0 - }, - "High School Microeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Microeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General 
information", - "score": 0.0 - }, - "High School Microeconomics - # prompt tokens": { - "description": "min=449.492, mean=449.492, max=449.492, sum=898.983 (2)", - "tab": "General information", - "score": 449.49159663865544 - }, - "High School Microeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Physics - # eval": { - "description": "min=151, mean=151, max=151, sum=302 (2)", - "tab": "General information", - "score": 151.0 - }, - "High School Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # prompt tokens": { - "description": "min=621.788, mean=621.788, max=621.788, sum=1243.576 (2)", - "tab": "General information", - "score": 621.7880794701987 - }, - "High School Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Psychology - # eval": { - "description": "min=545, mean=545, max=545, sum=1090 (2)", - "tab": "General information", - "score": 545.0 - }, - "High School Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # prompt tokens": { - "description": "min=585.919, mean=585.919, max=585.919, sum=1171.839 (2)", - "tab": "General information", - "score": 585.9192660550459 - }, - "High School Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Statistics - # eval": { - "description": "min=216, mean=216, max=216, sum=432 (2)", - "tab": "General information", - "score": 216.0 - }, - "High School Statistics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Statistics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Statistics - # prompt tokens": { - "description": "min=908.208, mean=908.208, max=908.208, sum=1816.417 (2)", - "tab": "General information", - "score": 908.2083333333334 - }, - "High School Statistics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School US History - # eval": { - "description": "min=204, mean=204, max=204, sum=408 (2)", - "tab": "General information", - "score": 204.0 - }, - "High School US History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School US History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # prompt tokens": { - "description": "min=2535.324, mean=2535.324, max=2535.324, sum=5070.647 (2)", - "tab": "General information", - "score": 2535.323529411765 - }, - "High School US History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School World History - # 
eval": { - "description": "min=237, mean=237, max=237, sum=474 (2)", - "tab": "General information", - "score": 237.0 - }, - "High School World History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School World History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School World History - # prompt tokens": { - "description": "min=1638.219, mean=1638.219, max=1638.219, sum=3276.439 (2)", - "tab": "General information", - "score": 1638.2194092827003 - }, - "High School World History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" - } - } - }, - { - "evaluation_name": "Human Sexuality", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Human Sexuality", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.557, - "details": { - "description": "min=0.557, mean=0.557, max=0.557, sum=1.115 (2)", - "tab": "Accuracy", - "Human Aging - Observed inference time (s)": { - "description": "min=0.28, mean=0.28, max=0.28, sum=0.56 (2)", - "tab": "Efficiency", - "score": 0.28007102974861725 - }, - "Human Sexuality - Observed inference time (s)": { - "description": "min=0.335, mean=0.335, max=0.335, sum=0.671 (2)", - "tab": "Efficiency", - "score": 0.3354811176998925 - }, - "Human Aging - # eval": { - "description": "min=223, mean=223, max=223, sum=446 (2)", - "tab": "General information", - "score": 223.0 - }, - "Human Aging - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Aging - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Aging - # prompt tokens": { - "description": "min=361.26, mean=361.26, max=361.26, sum=722.52 (2)", - "tab": "General information", - "score": 361.26008968609864 - }, - "Human Aging - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Human Sexuality - # eval": { - "description": "min=131, mean=131, max=131, sum=262 (2)", - "tab": "General information", - "score": 131.0 - }, - "Human Sexuality - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Sexuality - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # prompt tokens": { - "description": "min=403.382, mean=403.382, max=403.382, sum=806.763 (2)", - "tab": "General information", - "score": 403.381679389313 - }, - "Human Sexuality - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - 
"groups": "mmlu_human_sexuality" - } - } - }, - { - "evaluation_name": "International Law", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on International Law", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.628, - "details": { - "description": "min=0.628, mean=0.628, max=0.628, sum=1.256 (2)", - "tab": "Accuracy", - "International Law - Observed inference time (s)": { - "description": "min=0.351, mean=0.351, max=0.351, sum=0.702 (2)", - "tab": "Efficiency", - "score": 0.3510365151176768 - }, - "International Law - # eval": { - "description": "min=121, mean=121, max=121, sum=242 (2)", - "tab": "General information", - "score": 121.0 - }, - "International Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "International Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "International Law - # prompt tokens": { - "description": "min=729.463, mean=729.463, max=729.463, sum=1458.926 (2)", - "tab": "General information", - "score": 729.4628099173553 - }, - "International Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" - } - } - }, - { - "evaluation_name": "Logical Fallacies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Logical Fallacies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.466, - "details": { - "description": "min=0.466, mean=0.466, max=0.466, sum=0.933 (2)", - "tab": "Accuracy", - "Logical Fallacies - Observed inference time (s)": { - "description": "min=0.327, mean=0.327, max=0.327, sum=0.655 (2)", - "tab": "Efficiency", - "score": 0.3273066304212699 - }, - "Logical Fallacies - # eval": { - "description": "min=163, mean=163, max=163, sum=326 (2)", - "tab": "General information", - "score": 163.0 - }, - "Logical Fallacies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Logical Fallacies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Logical Fallacies - # prompt tokens": { - "description": "min=502.755, mean=502.755, max=502.755, sum=1005.509 (2)", - "tab": "General information", - "score": 502.7546012269939 - }, - "Logical Fallacies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" - } - } - }, - { - "evaluation_name": "Machine Learning", - 
"source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Machine Learning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.402, - "details": { - "description": "min=0.402, mean=0.402, max=0.402, sum=0.804 (2)", - "tab": "Accuracy", - "Machine Learning - Observed inference time (s)": { - "description": "min=0.366, mean=0.366, max=0.366, sum=0.732 (2)", - "tab": "Efficiency", - "score": 0.36619071449552265 - }, - "Machine Learning - # eval": { - "description": "min=112, mean=112, max=112, sum=224 (2)", - "tab": "General information", - "score": 112.0 - }, - "Machine Learning - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Machine Learning - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Machine Learning - # prompt tokens": { - "description": "min=730.402, mean=730.402, max=730.402, sum=1460.804 (2)", - "tab": "General information", - "score": 730.4017857142857 - }, - "Machine Learning - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" - } - } - }, - { - "evaluation_name": "Management", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Management", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.563, - "details": { - "description": "min=0.563, mean=0.563, max=0.563, sum=1.126 (2)", - "tab": "Accuracy", - "Management - Observed inference time (s)": { - "description": "min=0.335, mean=0.335, max=0.335, sum=0.669 (2)", - "tab": "Efficiency", - "score": 0.33452116632924495 - }, - "Management - # eval": { - "description": "min=103, mean=103, max=103, sum=206 (2)", - "tab": "General information", - "score": 103.0 - }, - "Management - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Management - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Management - # prompt tokens": { - "description": "min=315.777, mean=315.777, max=315.777, sum=631.553 (2)", - "tab": "General information", - "score": 315.77669902912623 - }, - "Management - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" - } - } - }, - { - "evaluation_name": "Marketing", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Marketing", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.697, - "details": { - "description": "min=0.697, mean=0.697, max=0.697, sum=1.393 (2)", - "tab": "Accuracy", - "Marketing - Observed inference time (s)": { - "description": "min=0.331, mean=0.331, max=0.331, sum=0.662 (2)", - "tab": "Efficiency", - "score": 0.3312412653213892 - }, - "Marketing - # eval": { - "description": "min=234, mean=234, max=234, sum=468 (2)", - "tab": "General information", - "score": 234.0 - }, - "Marketing - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Marketing - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Marketing - # prompt tokens": { - "description": "min=472.628, mean=472.628, max=472.628, sum=945.256 (2)", - "tab": "General information", - "score": 472.62820512820514 - }, - "Marketing - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" - } - } - }, - { - "evaluation_name": "Medical Genetics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Medical Genetics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.53, - "details": { - "description": "min=0.53, mean=0.53, max=0.53, sum=1.06 (2)", - "tab": "Accuracy", - "Medical Genetics - Observed inference time (s)": { - "description": "min=0.34, mean=0.34, max=0.34, sum=0.679 (2)", - "tab": "Efficiency", - "score": 0.3395656991004944 - }, - "Medical Genetics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Medical Genetics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Medical Genetics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Medical Genetics - # prompt tokens": { - "description": "min=408.14, mean=408.14, max=408.14, sum=816.28 (2)", - "tab": "General information", - "score": 408.14 - }, - "Medical Genetics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" - } - } - }, - { - "evaluation_name": "Miscellaneous", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Miscellaneous", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.632, - "details": { - "description": "min=0.632, mean=0.632, max=0.632, sum=1.264 (2)", - "tab": "Accuracy", - "Miscellaneous - Observed inference time (s)": { - "description": "min=0.353, mean=0.353, max=0.353, sum=0.706 (2)", - "tab": "Efficiency", - "score": 0.3531375576862126 - }, - "Miscellaneous - # eval": { - "description": "min=783, mean=783, max=783, sum=1566 (2)", - "tab": "General information", - "score": 783.0 - }, - "Miscellaneous - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Miscellaneous - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Miscellaneous - # prompt tokens": { - "description": "min=345.913, mean=345.913, max=345.913, sum=691.826 (2)", - "tab": "General information", - "score": 345.9131545338442 - }, - "Miscellaneous - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" - } - } - }, - { - "evaluation_name": "Moral Scenarios", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Moral Scenarios", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.238, - "details": { - "description": "min=0.238, mean=0.238, max=0.238, sum=0.476 (2)", - "tab": "Accuracy", - "Moral Disputes - Observed inference time (s)": { - "description": "min=0.326, mean=0.326, max=0.326, sum=0.653 (2)", - "tab": "Efficiency", - "score": 0.3263767213490657 - }, - "Moral Scenarios - Observed inference time (s)": { - "description": "min=0.369, mean=0.369, max=0.369, sum=0.738 (2)", - "tab": "Efficiency", - "score": 0.3688804725028949 - }, - "Moral Disputes - # eval": { - "description": "min=346, mean=346, max=346, sum=692 (2)", - "tab": "General information", - "score": 346.0 - }, - "Moral Disputes - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Disputes - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Disputes - # prompt tokens": { - "description": "min=542.506, mean=542.506, max=542.506, sum=1085.012 (2)", - "tab": "General information", - "score": 542.5057803468208 - }, - "Moral Disputes - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Moral Scenarios - # eval": { - "description": "min=895, mean=895, max=895, sum=1790 (2)", - "tab": "General information", - "score": 895.0 - }, - "Moral Scenarios - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Scenarios - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # prompt tokens": { - "description": "min=756.479, mean=756.479, 
max=756.479, sum=1512.959 (2)", - "tab": "General information", - "score": 756.4793296089385 - }, - "Moral Scenarios - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" - } - } - }, - { - "evaluation_name": "Nutrition", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Nutrition", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.497, - "details": { - "description": "min=0.497, mean=0.497, max=0.497, sum=0.993 (2)", - "tab": "Accuracy", - "Nutrition - Observed inference time (s)": { - "description": "min=0.342, mean=0.342, max=0.342, sum=0.684 (2)", - "tab": "Efficiency", - "score": 0.34185195904152066 - }, - "Nutrition - # eval": { - "description": "min=306, mean=306, max=306, sum=612 (2)", - "tab": "General information", - "score": 306.0 - }, - "Nutrition - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Nutrition - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Nutrition - # prompt tokens": { - "description": "min=695.922, mean=695.922, max=695.922, sum=1391.843 (2)", - "tab": "General information", - "score": 695.9215686274509 - }, - "Nutrition - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" - } - } - }, - { - "evaluation_name": "Prehistory", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Prehistory", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.503, - "details": { - "description": "min=0.503, mean=0.503, max=0.503, sum=1.006 (2)", - "tab": "Accuracy", - "Prehistory - Observed inference time (s)": { - "description": "min=0.333, mean=0.333, max=0.333, sum=0.665 (2)", - "tab": "Efficiency", - "score": 0.33259875023806534 - }, - "Prehistory - # eval": { - "description": "min=324, mean=324, max=324, sum=648 (2)", - "tab": "General information", - "score": 324.0 - }, - "Prehistory - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Prehistory - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Prehistory - # prompt tokens": { - "description": "min=619.185, mean=619.185, max=619.185, sum=1238.37 (2)", - "tab": "General information", - "score": 619.1851851851852 - }, - "Prehistory - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - 
} - }, - "generation_config": { - "additional_details": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" - } - } - }, - { - "evaluation_name": "Public Relations", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Public Relations", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.509, - "details": { - "description": "min=0.509, mean=0.509, max=0.509, sum=1.018 (2)", - "tab": "Accuracy", - "Public Relations - Observed inference time (s)": { - "description": "min=0.333, mean=0.333, max=0.333, sum=0.665 (2)", - "tab": "Efficiency", - "score": 0.3326493003151634 - }, - "Public Relations - # eval": { - "description": "min=110, mean=110, max=110, sum=220 (2)", - "tab": "General information", - "score": 110.0 - }, - "Public Relations - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Public Relations - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Public Relations - # prompt tokens": { - "description": "min=474.827, mean=474.827, max=474.827, sum=949.655 (2)", - "tab": "General information", - "score": 474.8272727272727 - }, - "Public Relations - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" - } - } - }, - { - "evaluation_name": "Security Studies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Security Studies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.433, - "details": { - "description": "min=0.433, mean=0.433, max=0.433, sum=0.865 (2)", - "tab": "Accuracy", - "Security Studies - Observed inference time (s)": { - "description": "min=0.477, mean=0.477, max=0.477, sum=0.955 (2)", - "tab": "Efficiency", - "score": 0.4774373015578912 - }, - "Security Studies - # eval": { - "description": "min=245, mean=245, max=245, sum=490 (2)", - "tab": "General information", - "score": 245.0 - }, - "Security Studies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Security Studies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Security Studies - # prompt tokens": { - "description": "min=1377.531, mean=1377.531, max=1377.531, sum=2755.061 (2)", - "tab": "General information", - "score": 1377.530612244898 - }, - "Security Studies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "security_studies", - "method": "multiple_choice_joint", - 
"eval_split": "test", - "groups": "mmlu_security_studies" - } - } - }, - { - "evaluation_name": "Sociology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Sociology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.617, - "details": { - "description": "min=0.617, mean=0.617, max=0.617, sum=1.234 (2)", - "tab": "Accuracy", - "Sociology - Observed inference time (s)": { - "description": "min=0.312, mean=0.312, max=0.312, sum=0.623 (2)", - "tab": "Efficiency", - "score": 0.31150120170555307 - }, - "Sociology - # eval": { - "description": "min=201, mean=201, max=201, sum=402 (2)", - "tab": "General information", - "score": 201.0 - }, - "Sociology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Sociology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Sociology - # prompt tokens": { - "description": "min=508.478, mean=508.478, max=508.478, sum=1016.955 (2)", - "tab": "General information", - "score": 508.4776119402985 - }, - "Sociology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" - } - } - }, - { - "evaluation_name": "Virology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Virology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.392, - "details": { - "description": "min=0.392, mean=0.392, max=0.392, sum=0.783 (2)", - "tab": "Accuracy", - "Virology - Observed inference time (s)": { - "description": "min=0.33, mean=0.33, max=0.33, sum=0.66 (2)", - "tab": "Efficiency", - "score": 0.32997589513479947 - }, - "Virology - # eval": { - "description": "min=166, mean=166, max=166, sum=332 (2)", - "tab": "General information", - "score": 166.0 - }, - "Virology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Virology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Virology - # prompt tokens": { - "description": "min=405.108, mean=405.108, max=405.108, sum=810.217 (2)", - "tab": "General information", - "score": 405.10843373493975 - }, - "Virology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" - } - } - }, - { - "evaluation_name": "World Religions", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on World Religions", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.713, - "details": { - "description": "min=0.713, mean=0.713, max=0.713, sum=1.427 (2)", - "tab": "Accuracy", - "World Religions - Observed inference time (s)": { - "description": "min=0.3, mean=0.3, max=0.3, sum=0.6 (2)", - "tab": "Efficiency", - "score": 0.2998225702876933 - }, - "World Religions - # eval": { - "description": "min=171, mean=171, max=171, sum=342 (2)", - "tab": "General information", - "score": 171.0 - }, - "World Religions - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "World Religions - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "World Religions - # prompt tokens": { - "description": "min=304.474, mean=304.474, max=304.474, sum=608.947 (2)", - "tab": "General information", - "score": 304.4736842105263 - }, - "World Religions - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" - } - } - }, - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.681, - "details": { - "tab": "Efficiency" - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_mmlu/meta/llama-3-70b/1c11950d-bd2f-407b-928a-5cd33a0a3d6e.json b/data/helm_mmlu/meta/llama-3-70b/1c11950d-bd2f-407b-928a-5cd33a0a3d6e.json deleted file mode 100644 index 4f09a5ee3..000000000 --- a/data/helm_mmlu/meta/llama-3-70b/1c11950d-bd2f-407b-928a-5cd33a0a3d6e.json +++ /dev/null @@ -1,3021 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/meta_llama-3-70b/1770835937.459157", - "retrieved_timestamp": "1770835937.459157", - "source_metadata": { - "source_name": "helm_mmlu", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama 3 70B", - "id": "meta/llama-3-70b", - "developer": "meta", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "MMLU All Subjects", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU All Subjects", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.793, - "details": { - "description": "min=0.43, mean=0.793, 
max=0.979, sum=90.444 (114)", - "tab": "Accuracy", - "MMLU All Subjects - Observed inference time (s)": { - "description": "min=0.333, mean=0.462, max=1.184, sum=52.708 (114)", - "tab": "Efficiency", - "score": 0.46235507518987096 - }, - "MMLU All Subjects - # eval": { - "description": "min=100, mean=246.351, max=1534, sum=28084 (114)", - "tab": "General information", - "score": 246.35087719298247 - }, - "MMLU All Subjects - # train": { - "description": "min=5, mean=5, max=5, sum=570 (114)", - "tab": "General information", - "score": 5.0 - }, - "MMLU All Subjects - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (114)", - "tab": "General information", - "score": 0.0 - }, - "MMLU All Subjects - # prompt tokens": { - "description": "min=267.52, mean=607.619, max=2790.885, sum=69268.61 (114)", - "tab": "General information", - "score": 607.6193817308517 - }, - "MMLU All Subjects - # output tokens": { - "description": "min=1, mean=1, max=1, sum=114 (114)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - 
"mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] - } - } - }, - { - "evaluation_name": "Abstract Algebra", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Abstract Algebra", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.43, - "details": { - "description": "min=0.43, mean=0.43, max=0.43, sum=0.86 (2)", - "tab": "Accuracy", - "Abstract Algebra - Observed inference time (s)": { - "description": "min=0.387, mean=0.387, max=0.387, sum=0.774 (2)", - "tab": "Efficiency", - "score": 0.3868687057495117 - }, - "Abstract Algebra - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Abstract Algebra - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Abstract Algebra - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Abstract Algebra - # prompt tokens": { - "description": "min=366.43, mean=366.43, max=366.43, sum=732.86 (2)", - "tab": "General information", - "score": 366.43 - }, - "Abstract Algebra - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" - } - } - }, - { - "evaluation_name": "Anatomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Anatomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.785, - "details": { - "description": "min=0.785, mean=0.785, max=0.785, sum=1.57 (2)", - "tab": "Accuracy", - "Anatomy - Observed inference time (s)": { - "description": "min=0.391, mean=0.391, max=0.391, sum=0.782 (2)", - "tab": "Efficiency", - "score": 0.39101445586593064 - }, - "Anatomy - # eval": { - "description": "min=135, mean=135, max=135, sum=270 (2)", - "tab": "General information", - "score": 135.0 - }, - "Anatomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Anatomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Anatomy - # prompt tokens": { - "description": "min=346.874, mean=346.874, max=346.874, sum=693.748 (2)", - "tab": "General 
information", - "score": 346.8740740740741 - }, - "Anatomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" - } - } - }, - { - "evaluation_name": "College Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on College Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.529, - "details": { - "description": "min=0.529, mean=0.529, max=0.529, sum=1.059 (2)", - "tab": "Accuracy", - "College Chemistry - Observed inference time (s)": { - "description": "min=0.432, mean=0.432, max=0.432, sum=0.864 (2)", - "tab": "Efficiency", - "score": 0.4319474816322327 - }, - "College Biology - Observed inference time (s)": { - "description": "min=0.394, mean=0.394, max=0.394, sum=0.788 (2)", - "tab": "Efficiency", - "score": 0.39422312213314903 - }, - "College Computer Science - Observed inference time (s)": { - "description": "min=0.48, mean=0.48, max=0.48, sum=0.959 (2)", - "tab": "Efficiency", - "score": 0.4797321176528931 - }, - "College Mathematics - Observed inference time (s)": { - "description": "min=0.403, mean=0.403, max=0.403, sum=0.806 (2)", - "tab": "Efficiency", - "score": 0.4030305552482605 - }, - "College Medicine - Observed inference time (s)": { - "description": "min=0.425, mean=0.425, max=0.425, sum=0.849 (2)", - "tab": "Efficiency", - "score": 0.4245531242017801 - }, - "College Physics - Observed inference time (s)": { - "description": "min=0.42, mean=0.42, max=0.42, sum=0.84 (2)", - "tab": "Efficiency", - "score": 0.41995686643263874 - }, - "College Chemistry - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Chemistry - # prompt tokens": { - "description": "min=542.28, mean=542.28, max=542.28, sum=1084.56 (2)", - "tab": "General information", - "score": 542.28 - }, - "College Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Biology - # eval": { - "description": "min=144, mean=144, max=144, sum=288 (2)", - "tab": "General information", - "score": 144.0 - }, - "College Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # prompt tokens": { - "description": "min=466.875, mean=466.875, max=466.875, sum=933.75 (2)", - "tab": "General information", - "score": 466.875 - }, - "College Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Computer Science - # eval": 
{ - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer Science - # prompt tokens": { - "description": "min=821.29, mean=821.29, max=821.29, sum=1642.58 (2)", - "tab": "General information", - "score": 821.29 - }, - "College Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Mathematics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # prompt tokens": { - "description": "min=587.51, mean=587.51, max=587.51, sum=1175.02 (2)", - "tab": "General information", - "score": 587.51 - }, - "College Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Medicine - # eval": { - "description": "min=173, mean=173, max=173, sum=346 (2)", - "tab": "General information", - "score": 173.0 - }, - "College Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Medicine - # prompt tokens": { - "description": "min=495.705, mean=495.705, max=495.705, sum=991.41 (2)", - "tab": "General information", - "score": 495.70520231213874 - }, - "College Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Physics - # eval": { - "description": "min=102, mean=102, max=102, sum=204 (2)", - "tab": "General information", - "score": 102.0 - }, - "College Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Physics - # prompt tokens": { - "description": "min=496.569, mean=496.569, max=496.569, sum=993.137 (2)", - "tab": "General information", - "score": 496.5686274509804 - }, - "College Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" - } - } - }, - { - "evaluation_name": "Computer Security", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Computer Security", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.85, - "details": { - "description": "min=0.85, mean=0.85, max=0.85, sum=1.7 (2)", - "tab": "Accuracy", - "Computer Security - Observed inference time (s)": { - "description": "min=0.392, mean=0.392, max=0.392, sum=0.783 (2)", - "tab": "Efficiency", - "score": 0.3916677093505859 - }, - "Computer Security - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Computer Security - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Computer Security - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Computer Security - # prompt tokens": { - "description": "min=371.51, mean=371.51, max=371.51, sum=743.02 (2)", - "tab": "General information", - "score": 371.51 - }, - "Computer Security - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" - } - } - }, - { - "evaluation_name": "Econometrics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Econometrics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.693, - "details": { - "description": "min=0.693, mean=0.693, max=0.693, sum=1.386 (2)", - "tab": "Accuracy", - "Econometrics - Observed inference time (s)": { - "description": "min=0.408, mean=0.408, max=0.408, sum=0.816 (2)", - "tab": "Efficiency", - "score": 0.4078888934955262 - }, - "Econometrics - # eval": { - "description": "min=114, mean=114, max=114, sum=228 (2)", - "tab": "General information", - "score": 114.0 - }, - "Econometrics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Econometrics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Econometrics - # prompt tokens": { - "description": "min=607.421, mean=607.421, max=607.421, sum=1214.842 (2)", - "tab": "General information", - "score": 607.421052631579 - }, - "Econometrics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" - } - } - }, - { - "evaluation_name": "Global Facts", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Global Facts", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.49, - "details": { - "description": "min=0.49, 
mean=0.49, max=0.49, sum=0.98 (2)", - "tab": "Accuracy", - "Global Facts - Observed inference time (s)": { - "description": "min=0.385, mean=0.385, max=0.385, sum=0.77 (2)", - "tab": "Efficiency", - "score": 0.3847800350189209 - }, - "Global Facts - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Global Facts - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Global Facts - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Global Facts - # prompt tokens": { - "description": "min=392.71, mean=392.71, max=392.71, sum=785.42 (2)", - "tab": "General information", - "score": 392.71 - }, - "Global Facts - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" - } - } - }, - { - "evaluation_name": "Jurisprudence", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Jurisprudence", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.861, - "details": { - "description": "min=0.861, mean=0.861, max=0.861, sum=1.722 (2)", - "tab": "Accuracy", - "Jurisprudence - Observed inference time (s)": { - "description": "min=0.368, mean=0.368, max=0.368, sum=0.736 (2)", - "tab": "Efficiency", - "score": 0.36775174847355596 - }, - "Jurisprudence - # eval": { - "description": "min=108, mean=108, max=108, sum=216 (2)", - "tab": "General information", - "score": 108.0 - }, - "Jurisprudence - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Jurisprudence - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Jurisprudence - # prompt tokens": { - "description": "min=387.63, mean=387.63, max=387.63, sum=775.259 (2)", - "tab": "General information", - "score": 387.6296296296296 - }, - "Jurisprudence - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" - } - } - }, - { - "evaluation_name": "Philosophy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Philosophy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.865, - "details": { - "description": "min=0.865, mean=0.865, max=0.865, sum=1.73 (2)", - "tab": "Accuracy", - "Philosophy - Observed inference time (s)": { - "description": "min=0.357, mean=0.357, max=0.357, sum=0.713 (2)", - "tab": "Efficiency", - "score": 
0.35669880894602685 - }, - "Philosophy - # eval": { - "description": "min=311, mean=311, max=311, sum=622 (2)", - "tab": "General information", - "score": 311.0 - }, - "Philosophy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Philosophy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Philosophy - # prompt tokens": { - "description": "min=322.084, mean=322.084, max=322.084, sum=644.167 (2)", - "tab": "General information", - "score": 322.08360128617363 - }, - "Philosophy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" - } - } - }, - { - "evaluation_name": "Professional Psychology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Professional Psychology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.871, - "details": { - "description": "min=0.871, mean=0.871, max=0.871, sum=1.742 (2)", - "tab": "Accuracy", - "Professional Medicine - Observed inference time (s)": { - "description": "min=0.523, mean=0.523, max=0.523, sum=1.046 (2)", - "tab": "Efficiency", - "score": 0.5229001255596385 - }, - "Professional Accounting - Observed inference time (s)": { - "description": "min=0.408, mean=0.408, max=0.408, sum=0.816 (2)", - "tab": "Efficiency", - "score": 0.4082087980094531 - }, - "Professional Law - Observed inference time (s)": { - "description": "min=0.738, mean=0.738, max=0.738, sum=1.477 (2)", - "tab": "Efficiency", - "score": 0.7383932933658167 - }, - "Professional Psychology - Observed inference time (s)": { - "description": "min=0.376, mean=0.376, max=0.376, sum=0.752 (2)", - "tab": "Efficiency", - "score": 0.3758435642797183 - }, - "Professional Medicine - # eval": { - "description": "min=272, mean=272, max=272, sum=544 (2)", - "tab": "General information", - "score": 272.0 - }, - "Professional Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Medicine - # prompt tokens": { - "description": "min=1087.489, mean=1087.489, max=1087.489, sum=2174.978 (2)", - "tab": "General information", - "score": 1087.4889705882354 - }, - "Professional Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Accounting - # eval": { - "description": "min=282, mean=282, max=282, sum=564 (2)", - "tab": "General information", - "score": 282.0 - }, - "Professional Accounting - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Accounting - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Accounting - # prompt tokens": { - 
"description": "min=651.585, mean=651.585, max=651.585, sum=1303.17 (2)", - "tab": "General information", - "score": 651.5851063829788 - }, - "Professional Accounting - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Law - # eval": { - "description": "min=1534, mean=1534, max=1534, sum=3068 (2)", - "tab": "General information", - "score": 1534.0 - }, - "Professional Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # prompt tokens": { - "description": "min=1630.601, mean=1630.601, max=1630.601, sum=3261.202 (2)", - "tab": "General information", - "score": 1630.6010430247718 - }, - "Professional Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Psychology - # eval": { - "description": "min=612, mean=612, max=612, sum=1224 (2)", - "tab": "General information", - "score": 612.0 - }, - "Professional Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # prompt tokens": { - "description": "min=568.098, mean=568.098, max=568.098, sum=1136.196 (2)", - "tab": "General information", - "score": 568.0980392156863 - }, - "Professional Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" - } - } - }, - { - "evaluation_name": "Us Foreign Policy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Us Foreign Policy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.94, - "details": { - "description": "min=0.94, mean=0.94, max=0.94, sum=1.88 (2)", - "tab": "Accuracy", - "Us Foreign Policy - Observed inference time (s)": { - "description": "min=0.403, mean=0.403, max=0.403, sum=0.805 (2)", - "tab": "Efficiency", - "score": 0.4027411961555481 - }, - "Us Foreign Policy - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Us Foreign Policy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Us Foreign Policy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Us Foreign Policy - # prompt tokens": { - "description": "min=415.79, mean=415.79, max=415.79, sum=831.58 (2)", - "tab": "General information", - "score": 415.79 - }, - "Us Foreign Policy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - 
"score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" - } - } - }, - { - "evaluation_name": "Astronomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Astronomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.921, - "details": { - "description": "min=0.921, mean=0.921, max=0.921, sum=1.842 (2)", - "tab": "Accuracy", - "Astronomy - Observed inference time (s)": { - "description": "min=0.407, mean=0.407, max=0.407, sum=0.814 (2)", - "tab": "Efficiency", - "score": 0.4070533733618887 - }, - "Astronomy - # eval": { - "description": "min=152, mean=152, max=152, sum=304 (2)", - "tab": "General information", - "score": 152.0 - }, - "Astronomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Astronomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Astronomy - # prompt tokens": { - "description": "min=572.684, mean=572.684, max=572.684, sum=1145.368 (2)", - "tab": "General information", - "score": 572.6842105263158 - }, - "Astronomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" - } - } - }, - { - "evaluation_name": "Business Ethics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Business Ethics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.83, - "details": { - "description": "min=0.83, mean=0.83, max=0.83, sum=1.66 (2)", - "tab": "Accuracy", - "Business Ethics - Observed inference time (s)": { - "description": "min=0.393, mean=0.393, max=0.393, sum=0.786 (2)", - "tab": "Efficiency", - "score": 0.3931219887733459 - }, - "Business Ethics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Business Ethics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Business Ethics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Business Ethics - # prompt tokens": { - "description": "min=562.52, mean=562.52, max=562.52, sum=1125.04 (2)", - "tab": "General information", - "score": 562.52 - }, - "Business Ethics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" - } - } - }, - 
{ - "evaluation_name": "Clinical Knowledge", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Clinical Knowledge", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.845, - "details": { - "description": "min=0.845, mean=0.845, max=0.845, sum=1.691 (2)", - "tab": "Accuracy", - "Clinical Knowledge - Observed inference time (s)": { - "description": "min=0.416, mean=0.416, max=0.416, sum=0.831 (2)", - "tab": "Efficiency", - "score": 0.41558496907072245 - }, - "Clinical Knowledge - # eval": { - "description": "min=265, mean=265, max=265, sum=530 (2)", - "tab": "General information", - "score": 265.0 - }, - "Clinical Knowledge - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Clinical Knowledge - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Clinical Knowledge - # prompt tokens": { - "description": "min=390.928, mean=390.928, max=390.928, sum=781.857 (2)", - "tab": "General information", - "score": 390.92830188679244 - }, - "Clinical Knowledge - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" - } - } - }, - { - "evaluation_name": "Conceptual Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Conceptual Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.838, - "details": { - "description": "min=0.838, mean=0.838, max=0.838, sum=1.677 (2)", - "tab": "Accuracy", - "Conceptual Physics - Observed inference time (s)": { - "description": "min=0.517, mean=0.517, max=0.517, sum=1.034 (2)", - "tab": "Efficiency", - "score": 0.5170877294337496 - }, - "Conceptual Physics - # eval": { - "description": "min=235, mean=235, max=235, sum=470 (2)", - "tab": "General information", - "score": 235.0 - }, - "Conceptual Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Conceptual Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Conceptual Physics - # prompt tokens": { - "description": "min=297.834, mean=297.834, max=297.834, sum=595.668 (2)", - "tab": "General information", - "score": 297.83404255319147 - }, - "Conceptual Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" - } - } - }, - { - "evaluation_name": "Electrical Engineering", - "source_data": { - 
"dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Electrical Engineering", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.766, - "details": { - "description": "min=0.766, mean=0.766, max=0.766, sum=1.531 (2)", - "tab": "Accuracy", - "Electrical Engineering - Observed inference time (s)": { - "description": "min=0.398, mean=0.398, max=0.398, sum=0.796 (2)", - "tab": "Efficiency", - "score": 0.39815263419315733 - }, - "Electrical Engineering - # eval": { - "description": "min=145, mean=145, max=145, sum=290 (2)", - "tab": "General information", - "score": 145.0 - }, - "Electrical Engineering - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Electrical Engineering - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Electrical Engineering - # prompt tokens": { - "description": "min=428.607, mean=428.607, max=428.607, sum=857.214 (2)", - "tab": "General information", - "score": 428.60689655172416 - }, - "Electrical Engineering - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" - } - } - }, - { - "evaluation_name": "Elementary Mathematics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Elementary Mathematics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.632, - "details": { - "description": "min=0.632, mean=0.632, max=0.632, sum=1.265 (2)", - "tab": "Accuracy", - "Elementary Mathematics - Observed inference time (s)": { - "description": "min=0.478, mean=0.478, max=0.478, sum=0.957 (2)", - "tab": "Efficiency", - "score": 0.47845223719480806 - }, - "Elementary Mathematics - # eval": { - "description": "min=378, mean=378, max=378, sum=756 (2)", - "tab": "General information", - "score": 378.0 - }, - "Elementary Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Elementary Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Elementary Mathematics - # prompt tokens": { - "description": "min=524.854, mean=524.854, max=524.854, sum=1049.709 (2)", - "tab": "General information", - "score": 524.8544973544973 - }, - "Elementary Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" - } - } - }, - { - "evaluation_name": "Formal Logic", - "source_data": { - 
"dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Formal Logic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.651, - "details": { - "description": "min=0.651, mean=0.651, max=0.651, sum=1.302 (2)", - "tab": "Accuracy", - "Formal Logic - Observed inference time (s)": { - "description": "min=0.436, mean=0.436, max=0.436, sum=0.872 (2)", - "tab": "Efficiency", - "score": 0.4359313628030202 - }, - "Formal Logic - # eval": { - "description": "min=126, mean=126, max=126, sum=252 (2)", - "tab": "General information", - "score": 126.0 - }, - "Formal Logic - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Formal Logic - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Formal Logic - # prompt tokens": { - "description": "min=594.778, mean=594.778, max=594.778, sum=1189.556 (2)", - "tab": "General information", - "score": 594.7777777777778 - }, - "Formal Logic - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" - } - } - }, - { - "evaluation_name": "High School World History", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on High School World History", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.941, - "details": { - "description": "min=0.941, mean=0.941, max=0.941, sum=1.882 (2)", - "tab": "Accuracy", - "High School Biology - Observed inference time (s)": { - "description": "min=0.512, mean=0.512, max=0.512, sum=1.023 (2)", - "tab": "Efficiency", - "score": 0.5115567738010037 - }, - "High School Chemistry - Observed inference time (s)": { - "description": "min=0.545, mean=0.545, max=0.545, sum=1.089 (2)", - "tab": "Efficiency", - "score": 0.5445456727972171 - }, - "High School Computer Science - Observed inference time (s)": { - "description": "min=0.501, mean=0.501, max=0.501, sum=1.002 (2)", - "tab": "Efficiency", - "score": 0.5008813333511353 - }, - "High School European History - Observed inference time (s)": { - "description": "min=1.184, mean=1.184, max=1.184, sum=2.367 (2)", - "tab": "Efficiency", - "score": 1.1835060582016455 - }, - "High School Geography - Observed inference time (s)": { - "description": "min=0.372, mean=0.372, max=0.372, sum=0.744 (2)", - "tab": "Efficiency", - "score": 0.3721387037123092 - }, - "High School Government And Politics - Observed inference time (s)": { - "description": "min=0.567, mean=0.567, max=0.567, sum=1.134 (2)", - "tab": "Efficiency", - "score": 0.5668655022438326 - }, - "High School Macroeconomics - Observed inference time (s)": { - "description": "min=0.582, mean=0.582, max=0.582, sum=1.164 (2)", - "tab": "Efficiency", - "score": 0.5819246842310979 - }, - "High School 
Mathematics - Observed inference time (s)": { - "description": "min=0.41, mean=0.41, max=0.41, sum=0.821 (2)", - "tab": "Efficiency", - "score": 0.410357196242721 - }, - "High School Microeconomics - Observed inference time (s)": { - "description": "min=0.379, mean=0.379, max=0.379, sum=0.759 (2)", - "tab": "Efficiency", - "score": 0.3792707469283032 - }, - "High School Physics - Observed inference time (s)": { - "description": "min=0.393, mean=0.393, max=0.393, sum=0.786 (2)", - "tab": "Efficiency", - "score": 0.39323860288455786 - }, - "High School Psychology - Observed inference time (s)": { - "description": "min=0.395, mean=0.395, max=0.395, sum=0.789 (2)", - "tab": "Efficiency", - "score": 0.3946729870017515 - }, - "High School Statistics - Observed inference time (s)": { - "description": "min=0.516, mean=0.516, max=0.516, sum=1.032 (2)", - "tab": "Efficiency", - "score": 0.5162484921790935 - }, - "High School US History - Observed inference time (s)": { - "description": "min=0.956, mean=0.956, max=0.956, sum=1.911 (2)", - "tab": "Efficiency", - "score": 0.9556132928997862 - }, - "High School World History - Observed inference time (s)": { - "description": "min=0.583, mean=0.583, max=0.583, sum=1.165 (2)", - "tab": "Efficiency", - "score": 0.5826822735589264 - }, - "High School Biology - # eval": { - "description": "min=310, mean=310, max=310, sum=620 (2)", - "tab": "General information", - "score": 310.0 - }, - "High School Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Biology - # prompt tokens": { - "description": "min=506.671, mean=506.671, max=506.671, sum=1013.342 (2)", - "tab": "General information", - "score": 506.6709677419355 - }, - "High School Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Chemistry - # eval": { - "description": "min=203, mean=203, max=203, sum=406 (2)", - "tab": "General information", - "score": 203.0 - }, - "High School Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # prompt tokens": { - "description": "min=489.704, mean=489.704, max=489.704, sum=979.409 (2)", - "tab": "General information", - "score": 489.70443349753697 - }, - "High School Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "High School Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # prompt tokens": { - "description": "min=860.78, mean=860.78, max=860.78, sum=1721.56 (2)", - "tab": "General information", - "score": 860.78 - }, - "High School Computer Science - # output tokens": { - 
"description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School European History - # eval": { - "description": "min=165, mean=165, max=165, sum=330 (2)", - "tab": "General information", - "score": 165.0 - }, - "High School European History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School European History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School European History - # prompt tokens": { - "description": "min=2790.885, mean=2790.885, max=2790.885, sum=5581.77 (2)", - "tab": "General information", - "score": 2790.8848484848486 - }, - "High School European History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Geography - # eval": { - "description": "min=198, mean=198, max=198, sum=396 (2)", - "tab": "General information", - "score": 198.0 - }, - "High School Geography - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Geography - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Geography - # prompt tokens": { - "description": "min=365.035, mean=365.035, max=365.035, sum=730.071 (2)", - "tab": "General information", - "score": 365.0353535353535 - }, - "High School Geography - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Government And Politics - # eval": { - "description": "min=193, mean=193, max=193, sum=386 (2)", - "tab": "General information", - "score": 193.0 - }, - "High School Government And Politics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Government And Politics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # prompt tokens": { - "description": "min=458.824, mean=458.824, max=458.824, sum=917.648 (2)", - "tab": "General information", - "score": 458.8238341968912 - }, - "High School Government And Politics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Macroeconomics - # eval": { - "description": "min=390, mean=390, max=390, sum=780 (2)", - "tab": "General information", - "score": 390.0 - }, - "High School Macroeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Macroeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - # prompt tokens": { - "description": "min=363.908, mean=363.908, max=363.908, sum=727.815 (2)", - "tab": "General information", - "score": 363.9076923076923 - }, - "High School Macroeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Mathematics - # eval": { - "description": "min=270, mean=270, max=270, sum=540 (2)", - "tab": "General information", - "score": 270.0 - }, - "High School Mathematics - # train": { 
- "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # prompt tokens": { - "description": "min=525.356, mean=525.356, max=525.356, sum=1050.711 (2)", - "tab": "General information", - "score": 525.3555555555556 - }, - "High School Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Microeconomics - # eval": { - "description": "min=238, mean=238, max=238, sum=476 (2)", - "tab": "General information", - "score": 238.0 - }, - "High School Microeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Microeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Microeconomics - # prompt tokens": { - "description": "min=392.013, mean=392.013, max=392.013, sum=784.025 (2)", - "tab": "General information", - "score": 392.0126050420168 - }, - "High School Microeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Physics - # eval": { - "description": "min=151, mean=151, max=151, sum=302 (2)", - "tab": "General information", - "score": 151.0 - }, - "High School Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # prompt tokens": { - "description": "min=553.457, mean=553.457, max=553.457, sum=1106.914 (2)", - "tab": "General information", - "score": 553.4569536423841 - }, - "High School Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Psychology - # eval": { - "description": "min=545, mean=545, max=545, sum=1090 (2)", - "tab": "General information", - "score": 545.0 - }, - "High School Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # prompt tokens": { - "description": "min=488.242, mean=488.242, max=488.242, sum=976.484 (2)", - "tab": "General information", - "score": 488.2422018348624 - }, - "High School Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Statistics - # eval": { - "description": "min=216, mean=216, max=216, sum=432 (2)", - "tab": "General information", - "score": 216.0 - }, - "High School Statistics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Statistics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Statistics - # prompt tokens": { - "description": "min=788.639, mean=788.639, max=788.639, sum=1577.278 (2)", - "tab": "General 
information", - "score": 788.6388888888889 - }, - "High School Statistics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School US History - # eval": { - "description": "min=204, mean=204, max=204, sum=408 (2)", - "tab": "General information", - "score": 204.0 - }, - "High School US History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School US History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # prompt tokens": { - "description": "min=2210.809, mean=2210.809, max=2210.809, sum=4421.618 (2)", - "tab": "General information", - "score": 2210.8088235294117 - }, - "High School US History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School World History - # eval": { - "description": "min=237, mean=237, max=237, sum=474 (2)", - "tab": "General information", - "score": 237.0 - }, - "High School World History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School World History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School World History - # prompt tokens": { - "description": "min=1421.173, mean=1421.173, max=1421.173, sum=2842.346 (2)", - "tab": "General information", - "score": 1421.1729957805908 - }, - "High School World History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" - } - } - }, - { - "evaluation_name": "Human Sexuality", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Human Sexuality", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.878, - "details": { - "description": "min=0.878, mean=0.878, max=0.878, sum=1.756 (2)", - "tab": "Accuracy", - "Human Aging - Observed inference time (s)": { - "description": "min=0.346, mean=0.346, max=0.346, sum=0.693 (2)", - "tab": "Efficiency", - "score": 0.346398046733018 - }, - "Human Sexuality - Observed inference time (s)": { - "description": "min=0.351, mean=0.351, max=0.351, sum=0.702 (2)", - "tab": "Efficiency", - "score": 0.3509944832051983 - }, - "Human Aging - # eval": { - "description": "min=223, mean=223, max=223, sum=446 (2)", - "tab": "General information", - "score": 223.0 - }, - "Human Aging - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Aging - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Aging - # prompt tokens": { - "description": "min=312.888, mean=312.888, max=312.888, sum=625.776 (2)", - "tab": "General information", - "score": 312.88789237668163 - }, - "Human Aging - # 
output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Human Sexuality - # eval": { - "description": "min=131, mean=131, max=131, sum=262 (2)", - "tab": "General information", - "score": 131.0 - }, - "Human Sexuality - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Sexuality - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # prompt tokens": { - "description": "min=334.168, mean=334.168, max=334.168, sum=668.336 (2)", - "tab": "General information", - "score": 334.1679389312977 - }, - "Human Sexuality - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" - } - } - }, - { - "evaluation_name": "International Law", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on International Law", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.901, - "details": { - "description": "min=0.901, mean=0.901, max=0.901, sum=1.802 (2)", - "tab": "Accuracy", - "International Law - Observed inference time (s)": { - "description": "min=0.397, mean=0.397, max=0.397, sum=0.794 (2)", - "tab": "Efficiency", - "score": 0.39698751701796353 - }, - "International Law - # eval": { - "description": "min=121, mean=121, max=121, sum=242 (2)", - "tab": "General information", - "score": 121.0 - }, - "International Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "International Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "International Law - # prompt tokens": { - "description": "min=632.818, mean=632.818, max=632.818, sum=1265.636 (2)", - "tab": "General information", - "score": 632.8181818181819 - }, - "International Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" - } - } - }, - { - "evaluation_name": "Logical Fallacies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Logical Fallacies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.865, - "details": { - "description": "min=0.865, mean=0.865, max=0.865, sum=1.73 (2)", - "tab": "Accuracy", - "Logical Fallacies - Observed inference time (s)": { - "description": "min=0.37, mean=0.37, max=0.37, sum=0.74 (2)", - "tab": "Efficiency", - "score": 0.36976343722431204 - }, 
- "Logical Fallacies - # eval": { - "description": "min=163, mean=163, max=163, sum=326 (2)", - "tab": "General information", - "score": 163.0 - }, - "Logical Fallacies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Logical Fallacies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Logical Fallacies - # prompt tokens": { - "description": "min=442.564, mean=442.564, max=442.564, sum=885.129 (2)", - "tab": "General information", - "score": 442.5644171779141 - }, - "Logical Fallacies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" - } - } - }, - { - "evaluation_name": "Machine Learning", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Machine Learning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.714, - "details": { - "description": "min=0.714, mean=0.714, max=0.714, sum=1.429 (2)", - "tab": "Accuracy", - "Machine Learning - Observed inference time (s)": { - "description": "min=0.7, mean=0.7, max=0.7, sum=1.401 (2)", - "tab": "Efficiency", - "score": 0.7002999080078942 - }, - "Machine Learning - # eval": { - "description": "min=112, mean=112, max=112, sum=224 (2)", - "tab": "General information", - "score": 112.0 - }, - "Machine Learning - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Machine Learning - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Machine Learning - # prompt tokens": { - "description": "min=661.054, mean=661.054, max=661.054, sum=1322.107 (2)", - "tab": "General information", - "score": 661.0535714285714 - }, - "Machine Learning - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" - } - } - }, - { - "evaluation_name": "Management", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Management", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.913, - "details": { - "description": "min=0.913, mean=0.913, max=0.913, sum=1.825 (2)", - "tab": "Accuracy", - "Management - Observed inference time (s)": { - "description": "min=0.411, mean=0.411, max=0.411, sum=0.823 (2)", - "tab": "Efficiency", - "score": 0.41139175822433915 - }, - "Management - # eval": { - "description": "min=103, mean=103, max=103, sum=206 (2)", - "tab": "General information", - "score": 103.0 - }, 
- "Management - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Management - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Management - # prompt tokens": { - "description": "min=276.786, mean=276.786, max=276.786, sum=553.573 (2)", - "tab": "General information", - "score": 276.7864077669903 - }, - "Management - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" - } - } - }, - { - "evaluation_name": "Marketing", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Marketing", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.94, - "details": { - "description": "min=0.94, mean=0.94, max=0.94, sum=1.88 (2)", - "tab": "Accuracy", - "Marketing - Observed inference time (s)": { - "description": "min=0.36, mean=0.36, max=0.36, sum=0.72 (2)", - "tab": "Efficiency", - "score": 0.35977526811453014 - }, - "Marketing - # eval": { - "description": "min=234, mean=234, max=234, sum=468 (2)", - "tab": "General information", - "score": 234.0 - }, - "Marketing - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Marketing - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Marketing - # prompt tokens": { - "description": "min=397.218, mean=397.218, max=397.218, sum=794.436 (2)", - "tab": "General information", - "score": 397.21794871794873 - }, - "Marketing - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" - } - } - }, - { - "evaluation_name": "Medical Genetics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Medical Genetics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.89, - "details": { - "description": "min=0.89, mean=0.89, max=0.89, sum=1.78 (2)", - "tab": "Accuracy", - "Medical Genetics - Observed inference time (s)": { - "description": "min=0.398, mean=0.398, max=0.398, sum=0.796 (2)", - "tab": "Efficiency", - "score": 0.398222451210022 - }, - "Medical Genetics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Medical Genetics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Medical Genetics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General 
information", - "score": 0.0 - }, - "Medical Genetics - # prompt tokens": { - "description": "min=333.99, mean=333.99, max=333.99, sum=667.98 (2)", - "tab": "General information", - "score": 333.99 - }, - "Medical Genetics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" - } - } - }, - { - "evaluation_name": "Miscellaneous", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Miscellaneous", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.917, - "details": { - "description": "min=0.917, mean=0.917, max=0.917, sum=1.834 (2)", - "tab": "Accuracy", - "Miscellaneous - Observed inference time (s)": { - "description": "min=0.512, mean=0.512, max=0.512, sum=1.023 (2)", - "tab": "Efficiency", - "score": 0.5115468505089615 - }, - "Miscellaneous - # eval": { - "description": "min=783, mean=783, max=783, sum=1566 (2)", - "tab": "General information", - "score": 783.0 - }, - "Miscellaneous - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Miscellaneous - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Miscellaneous - # prompt tokens": { - "description": "min=292.911, mean=292.911, max=292.911, sum=585.821 (2)", - "tab": "General information", - "score": 292.9106002554278 - }, - "Miscellaneous - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" - } - } - }, - { - "evaluation_name": "Moral Scenarios", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Moral Scenarios", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.598, - "details": { - "description": "min=0.598, mean=0.598, max=0.598, sum=1.196 (2)", - "tab": "Accuracy", - "Moral Disputes - Observed inference time (s)": { - "description": "min=0.396, mean=0.396, max=0.396, sum=0.792 (2)", - "tab": "Efficiency", - "score": 0.3959053982199961 - }, - "Moral Scenarios - Observed inference time (s)": { - "description": "min=0.462, mean=0.462, max=0.462, sum=0.924 (2)", - "tab": "Efficiency", - "score": 0.46180219543712764 - }, - "Moral Disputes - # eval": { - "description": "min=346, mean=346, max=346, sum=692 (2)", - "tab": "General information", - "score": 346.0 - }, - "Moral Disputes - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Disputes - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General 
information", - "score": 0.0 - }, - "Moral Disputes - # prompt tokens": { - "description": "min=469.113, mean=469.113, max=469.113, sum=938.225 (2)", - "tab": "General information", - "score": 469.1127167630058 - }, - "Moral Disputes - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Moral Scenarios - # eval": { - "description": "min=895, mean=895, max=895, sum=1790 (2)", - "tab": "General information", - "score": 895.0 - }, - "Moral Scenarios - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Scenarios - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # prompt tokens": { - "description": "min=649.455, mean=649.455, max=649.455, sum=1298.909 (2)", - "tab": "General information", - "score": 649.454748603352 - }, - "Moral Scenarios - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" - } - } - }, - { - "evaluation_name": "Nutrition", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Nutrition", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.876, - "details": { - "description": "min=0.876, mean=0.876, max=0.876, sum=1.752 (2)", - "tab": "Accuracy", - "Nutrition - Observed inference time (s)": { - "description": "min=0.396, mean=0.396, max=0.396, sum=0.793 (2)", - "tab": "Efficiency", - "score": 0.3964238252515107 - }, - "Nutrition - # eval": { - "description": "min=306, mean=306, max=306, sum=612 (2)", - "tab": "General information", - "score": 306.0 - }, - "Nutrition - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Nutrition - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Nutrition - # prompt tokens": { - "description": "min=579.814, mean=579.814, max=579.814, sum=1159.627 (2)", - "tab": "General information", - "score": 579.8137254901961 - }, - "Nutrition - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" - } - } - }, - { - "evaluation_name": "Prehistory", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Prehistory", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.91, - "details": { - "description": "min=0.91, mean=0.91, max=0.91, sum=1.821 (2)", - "tab": "Accuracy", - "Prehistory - Observed inference 
time (s)": { - "description": "min=0.509, mean=0.509, max=0.509, sum=1.017 (2)", - "tab": "Efficiency", - "score": 0.50853196338371 - }, - "Prehistory - # eval": { - "description": "min=324, mean=324, max=324, sum=648 (2)", - "tab": "General information", - "score": 324.0 - }, - "Prehistory - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Prehistory - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Prehistory - # prompt tokens": { - "description": "min=507.528, mean=507.528, max=507.528, sum=1015.056 (2)", - "tab": "General information", - "score": 507.52777777777777 - }, - "Prehistory - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" - } - } - }, - { - "evaluation_name": "Public Relations", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Public Relations", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.727, - "details": { - "description": "min=0.727, mean=0.727, max=0.727, sum=1.455 (2)", - "tab": "Accuracy", - "Public Relations - Observed inference time (s)": { - "description": "min=0.402, mean=0.402, max=0.402, sum=0.804 (2)", - "tab": "Efficiency", - "score": 0.4018417878584428 - }, - "Public Relations - # eval": { - "description": "min=110, mean=110, max=110, sum=220 (2)", - "tab": "General information", - "score": 110.0 - }, - "Public Relations - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Public Relations - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Public Relations - # prompt tokens": { - "description": "min=398.318, mean=398.318, max=398.318, sum=796.636 (2)", - "tab": "General information", - "score": 398.3181818181818 - }, - "Public Relations - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" - } - } - }, - { - "evaluation_name": "Security Studies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Security Studies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.833, - "details": { - "description": "min=0.833, mean=0.833, max=0.833, sum=1.665 (2)", - "tab": "Accuracy", - "Security Studies - Observed inference time (s)": { - "description": "min=0.653, mean=0.653, max=0.653, sum=1.306 (2)", - "tab": "Efficiency", - "score": 0.652998145745725 - }, - "Security Studies - # eval": 
{ - "description": "min=245, mean=245, max=245, sum=490 (2)", - "tab": "General information", - "score": 245.0 - }, - "Security Studies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Security Studies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Security Studies - # prompt tokens": { - "description": "min=1157.473, mean=1157.473, max=1157.473, sum=2314.947 (2)", - "tab": "General information", - "score": 1157.4734693877551 - }, - "Security Studies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" - } - } - }, - { - "evaluation_name": "Sociology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Sociology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.93, - "details": { - "description": "min=0.93, mean=0.93, max=0.93, sum=1.861 (2)", - "tab": "Accuracy", - "Sociology - Observed inference time (s)": { - "description": "min=0.36, mean=0.36, max=0.36, sum=0.721 (2)", - "tab": "Efficiency", - "score": 0.3602804935986723 - }, - "Sociology - # eval": { - "description": "min=201, mean=201, max=201, sum=402 (2)", - "tab": "General information", - "score": 201.0 - }, - "Sociology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Sociology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Sociology - # prompt tokens": { - "description": "min=438.517, mean=438.517, max=438.517, sum=877.035 (2)", - "tab": "General information", - "score": 438.51741293532336 - }, - "Sociology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" - } - } - }, - { - "evaluation_name": "Virology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Virology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.59, - "details": { - "description": "min=0.59, mean=0.59, max=0.59, sum=1.181 (2)", - "tab": "Accuracy", - "Virology - Observed inference time (s)": { - "description": "min=0.371, mean=0.371, max=0.371, sum=0.743 (2)", - "tab": "Efficiency", - "score": 0.3714186226028994 - }, - "Virology - # eval": { - "description": "min=166, mean=166, max=166, sum=332 (2)", - "tab": "General information", - "score": 166.0 - }, - "Virology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - 
"score": 5.0 - }, - "Virology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Virology - # prompt tokens": { - "description": "min=336.018, mean=336.018, max=336.018, sum=672.036 (2)", - "tab": "General information", - "score": 336.01807228915663 - }, - "Virology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" - } - } - }, - { - "evaluation_name": "World Religions", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on World Religions", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.906, - "details": { - "description": "min=0.906, mean=0.906, max=0.906, sum=1.813 (2)", - "tab": "Accuracy", - "World Religions - Observed inference time (s)": { - "description": "min=0.333, mean=0.333, max=0.333, sum=0.665 (2)", - "tab": "Efficiency", - "score": 0.3325699170430501 - }, - "World Religions - # eval": { - "description": "min=171, mean=171, max=171, sum=342 (2)", - "tab": "General information", - "score": 171.0 - }, - "World Religions - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "World Religions - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "World Religions - # prompt tokens": { - "description": "min=267.52, mean=267.52, max=267.52, sum=535.041 (2)", - "tab": "General information", - "score": 267.5204678362573 - }, - "World Religions - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" - } - } - }, - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.524, - "details": { - "tab": "Efficiency" - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_mmlu/meta/llama-3-8b/78f2484e-bc73-4026-929b-db345e92cf5a.json b/data/helm_mmlu/meta/llama-3-8b/78f2484e-bc73-4026-929b-db345e92cf5a.json deleted file mode 100644 index 83f907e80..000000000 --- a/data/helm_mmlu/meta/llama-3-8b/78f2484e-bc73-4026-929b-db345e92cf5a.json +++ /dev/null @@ -1,3021 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/meta_llama-3-8b/1770835937.459157", - "retrieved_timestamp": "1770835937.459157", - "source_metadata": { - "source_name": "helm_mmlu", - "source_type": 
"documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama 3 8B", - "id": "meta/llama-3-8b", - "developer": "meta", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "MMLU All Subjects", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU All Subjects", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.668, - "details": { - "description": "min=0.33, mean=0.668, max=0.885, sum=76.111 (114)", - "tab": "Accuracy", - "MMLU All Subjects - Observed inference time (s)": { - "description": "min=0.288, mean=0.35, max=0.586, sum=39.916 (114)", - "tab": "Efficiency", - "score": 0.350140152719457 - }, - "MMLU All Subjects - # eval": { - "description": "min=100, mean=246.351, max=1534, sum=28084 (114)", - "tab": "General information", - "score": 246.35087719298247 - }, - "MMLU All Subjects - # train": { - "description": "min=5, mean=5, max=5, sum=570 (114)", - "tab": "General information", - "score": 5.0 - }, - "MMLU All Subjects - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (114)", - "tab": "General information", - "score": 0.0 - }, - "MMLU All Subjects - # prompt tokens": { - "description": "min=267.52, mean=607.619, max=2790.885, sum=69268.61 (114)", - "tab": "General information", - "score": 607.6193817308517 - }, - "MMLU All Subjects - # output tokens": { - "description": "min=1, mean=1, max=1, sum=114 (114)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - 
"mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] - } - } - }, - { - "evaluation_name": "Abstract Algebra", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Abstract Algebra", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.33, - "details": { - "description": "min=0.33, mean=0.33, max=0.33, sum=0.66 (2)", - "tab": "Accuracy", - "Abstract Algebra - Observed inference time (s)": { - "description": "min=0.309, mean=0.309, max=0.309, sum=0.618 (2)", - "tab": "Efficiency", - "score": 0.30905162572860717 - }, - "Abstract Algebra - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Abstract Algebra - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Abstract Algebra - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Abstract Algebra - # prompt tokens": { - "description": "min=366.43, mean=366.43, max=366.43, sum=732.86 (2)", - "tab": "General information", - "score": 366.43 - }, - "Abstract Algebra - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" - } - } - }, - { - "evaluation_name": "Anatomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Anatomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.696, - "details": { - 
"description": "min=0.696, mean=0.696, max=0.696, sum=1.393 (2)", - "tab": "Accuracy", - "Anatomy - Observed inference time (s)": { - "description": "min=0.288, mean=0.288, max=0.288, sum=0.577 (2)", - "tab": "Efficiency", - "score": 0.28846773041619195 - }, - "Anatomy - # eval": { - "description": "min=135, mean=135, max=135, sum=270 (2)", - "tab": "General information", - "score": 135.0 - }, - "Anatomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Anatomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Anatomy - # prompt tokens": { - "description": "min=346.874, mean=346.874, max=346.874, sum=693.748 (2)", - "tab": "General information", - "score": 346.8740740740741 - }, - "Anatomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" - } - } - }, - { - "evaluation_name": "College Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on College Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.451, - "details": { - "description": "min=0.451, mean=0.451, max=0.451, sum=0.902 (2)", - "tab": "Accuracy", - "College Chemistry - Observed inference time (s)": { - "description": "min=0.323, mean=0.323, max=0.323, sum=0.646 (2)", - "tab": "Efficiency", - "score": 0.3228257203102112 - }, - "College Biology - Observed inference time (s)": { - "description": "min=0.343, mean=0.343, max=0.343, sum=0.687 (2)", - "tab": "Efficiency", - "score": 0.34339087539248997 - }, - "College Computer Science - Observed inference time (s)": { - "description": "min=0.366, mean=0.366, max=0.366, sum=0.733 (2)", - "tab": "Efficiency", - "score": 0.3662724041938782 - }, - "College Mathematics - Observed inference time (s)": { - "description": "min=0.32, mean=0.32, max=0.32, sum=0.64 (2)", - "tab": "Efficiency", - "score": 0.320071747303009 - }, - "College Medicine - Observed inference time (s)": { - "description": "min=0.329, mean=0.329, max=0.329, sum=0.657 (2)", - "tab": "Efficiency", - "score": 0.32854826739757736 - }, - "College Physics - Observed inference time (s)": { - "description": "min=0.299, mean=0.299, max=0.299, sum=0.599 (2)", - "tab": "Efficiency", - "score": 0.2994629471909766 - }, - "College Chemistry - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Chemistry - # prompt tokens": { - "description": "min=542.28, mean=542.28, max=542.28, sum=1084.56 (2)", - "tab": "General information", - "score": 542.28 - }, - "College Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 
1.0 - }, - "College Biology - # eval": { - "description": "min=144, mean=144, max=144, sum=288 (2)", - "tab": "General information", - "score": 144.0 - }, - "College Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # prompt tokens": { - "description": "min=466.875, mean=466.875, max=466.875, sum=933.75 (2)", - "tab": "General information", - "score": 466.875 - }, - "College Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer Science - # prompt tokens": { - "description": "min=821.29, mean=821.29, max=821.29, sum=1642.58 (2)", - "tab": "General information", - "score": 821.29 - }, - "College Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Mathematics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # prompt tokens": { - "description": "min=587.51, mean=587.51, max=587.51, sum=1175.02 (2)", - "tab": "General information", - "score": 587.51 - }, - "College Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Medicine - # eval": { - "description": "min=173, mean=173, max=173, sum=346 (2)", - "tab": "General information", - "score": 173.0 - }, - "College Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Medicine - # prompt tokens": { - "description": "min=495.705, mean=495.705, max=495.705, sum=991.41 (2)", - "tab": "General information", - "score": 495.70520231213874 - }, - "College Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Physics - # eval": { - "description": "min=102, mean=102, max=102, sum=204 (2)", - "tab": "General information", - "score": 102.0 - }, - "College Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Physics - # prompt tokens": { - "description": "min=496.569, mean=496.569, 
max=496.569, sum=993.137 (2)", - "tab": "General information", - "score": 496.5686274509804 - }, - "College Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" - } - } - }, - { - "evaluation_name": "Computer Security", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Computer Security", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8, - "details": { - "description": "min=0.8, mean=0.8, max=0.8, sum=1.6 (2)", - "tab": "Accuracy", - "Computer Security - Observed inference time (s)": { - "description": "min=0.307, mean=0.307, max=0.307, sum=0.614 (2)", - "tab": "Efficiency", - "score": 0.3068851590156555 - }, - "Computer Security - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Computer Security - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Computer Security - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Computer Security - # prompt tokens": { - "description": "min=371.51, mean=371.51, max=371.51, sum=743.02 (2)", - "tab": "General information", - "score": 371.51 - }, - "Computer Security - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" - } - } - }, - { - "evaluation_name": "Econometrics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Econometrics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.518, - "details": { - "description": "min=0.518, mean=0.518, max=0.518, sum=1.035 (2)", - "tab": "Accuracy", - "Econometrics - Observed inference time (s)": { - "description": "min=0.344, mean=0.344, max=0.344, sum=0.689 (2)", - "tab": "Efficiency", - "score": 0.3442605817527102 - }, - "Econometrics - # eval": { - "description": "min=114, mean=114, max=114, sum=228 (2)", - "tab": "General information", - "score": 114.0 - }, - "Econometrics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Econometrics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Econometrics - # prompt tokens": { - "description": "min=607.421, mean=607.421, max=607.421, sum=1214.842 (2)", - "tab": "General information", - "score": 607.421052631579 - }, - "Econometrics - # output tokens": { - "description": "min=1, mean=1, 
max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" - } - } - }, - { - "evaluation_name": "Global Facts", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Global Facts", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.34, - "details": { - "description": "min=0.34, mean=0.34, max=0.34, sum=0.68 (2)", - "tab": "Accuracy", - "Global Facts - Observed inference time (s)": { - "description": "min=0.311, mean=0.311, max=0.311, sum=0.622 (2)", - "tab": "Efficiency", - "score": 0.3109010863304138 - }, - "Global Facts - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Global Facts - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Global Facts - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Global Facts - # prompt tokens": { - "description": "min=392.71, mean=392.71, max=392.71, sum=785.42 (2)", - "tab": "General information", - "score": 392.71 - }, - "Global Facts - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" - } - } - }, - { - "evaluation_name": "Jurisprudence", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Jurisprudence", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.741, - "details": { - "description": "min=0.741, mean=0.741, max=0.741, sum=1.481 (2)", - "tab": "Accuracy", - "Jurisprudence - Observed inference time (s)": { - "description": "min=0.323, mean=0.323, max=0.323, sum=0.645 (2)", - "tab": "Efficiency", - "score": 0.32258448998133343 - }, - "Jurisprudence - # eval": { - "description": "min=108, mean=108, max=108, sum=216 (2)", - "tab": "General information", - "score": 108.0 - }, - "Jurisprudence - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Jurisprudence - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Jurisprudence - # prompt tokens": { - "description": "min=387.63, mean=387.63, max=387.63, sum=775.259 (2)", - "tab": "General information", - "score": 387.6296296296296 - }, - "Jurisprudence - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", 
- "groups": "mmlu_jurisprudence" - } - } - }, - { - "evaluation_name": "Philosophy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Philosophy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.743, - "details": { - "description": "min=0.743, mean=0.743, max=0.743, sum=1.486 (2)", - "tab": "Accuracy", - "Philosophy - Observed inference time (s)": { - "description": "min=0.309, mean=0.309, max=0.309, sum=0.617 (2)", - "tab": "Efficiency", - "score": 0.3085632078900598 - }, - "Philosophy - # eval": { - "description": "min=311, mean=311, max=311, sum=622 (2)", - "tab": "General information", - "score": 311.0 - }, - "Philosophy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Philosophy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Philosophy - # prompt tokens": { - "description": "min=322.084, mean=322.084, max=322.084, sum=644.167 (2)", - "tab": "General information", - "score": 322.08360128617363 - }, - "Philosophy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" - } - } - }, - { - "evaluation_name": "Professional Psychology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Professional Psychology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.711, - "details": { - "description": "min=0.711, mean=0.711, max=0.711, sum=1.422 (2)", - "tab": "Accuracy", - "Professional Medicine - Observed inference time (s)": { - "description": "min=0.341, mean=0.341, max=0.341, sum=0.682 (2)", - "tab": "Efficiency", - "score": 0.34079881275401397 - }, - "Professional Accounting - Observed inference time (s)": { - "description": "min=0.45, mean=0.45, max=0.45, sum=0.901 (2)", - "tab": "Efficiency", - "score": 0.4504219800867933 - }, - "Professional Law - Observed inference time (s)": { - "description": "min=0.429, mean=0.429, max=0.429, sum=0.857 (2)", - "tab": "Efficiency", - "score": 0.4285039446344587 - }, - "Professional Psychology - Observed inference time (s)": { - "description": "min=0.376, mean=0.376, max=0.376, sum=0.752 (2)", - "tab": "Efficiency", - "score": 0.3759713149538227 - }, - "Professional Medicine - # eval": { - "description": "min=272, mean=272, max=272, sum=544 (2)", - "tab": "General information", - "score": 272.0 - }, - "Professional Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Medicine - # prompt tokens": { - "description": "min=1087.489, 
mean=1087.489, max=1087.489, sum=2174.978 (2)", - "tab": "General information", - "score": 1087.4889705882354 - }, - "Professional Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Accounting - # eval": { - "description": "min=282, mean=282, max=282, sum=564 (2)", - "tab": "General information", - "score": 282.0 - }, - "Professional Accounting - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Accounting - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Accounting - # prompt tokens": { - "description": "min=651.585, mean=651.585, max=651.585, sum=1303.17 (2)", - "tab": "General information", - "score": 651.5851063829788 - }, - "Professional Accounting - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Law - # eval": { - "description": "min=1534, mean=1534, max=1534, sum=3068 (2)", - "tab": "General information", - "score": 1534.0 - }, - "Professional Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # prompt tokens": { - "description": "min=1630.601, mean=1630.601, max=1630.601, sum=3261.202 (2)", - "tab": "General information", - "score": 1630.6010430247718 - }, - "Professional Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Psychology - # eval": { - "description": "min=612, mean=612, max=612, sum=1224 (2)", - "tab": "General information", - "score": 612.0 - }, - "Professional Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # prompt tokens": { - "description": "min=568.098, mean=568.098, max=568.098, sum=1136.196 (2)", - "tab": "General information", - "score": 568.0980392156863 - }, - "Professional Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" - } - } - }, - { - "evaluation_name": "Us Foreign Policy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Us Foreign Policy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.88, - "details": { - "description": "min=0.88, mean=0.88, max=0.88, sum=1.76 (2)", - "tab": "Accuracy", - "Us Foreign Policy - Observed inference time (s)": { - "description": "min=0.3, mean=0.3, max=0.3, sum=0.599 (2)", - "tab": "Efficiency", - 
"score": 0.29950841665267947 - }, - "Us Foreign Policy - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Us Foreign Policy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Us Foreign Policy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Us Foreign Policy - # prompt tokens": { - "description": "min=415.79, mean=415.79, max=415.79, sum=831.58 (2)", - "tab": "General information", - "score": 415.79 - }, - "Us Foreign Policy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" - } - } - }, - { - "evaluation_name": "Astronomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Astronomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.711, - "details": { - "description": "min=0.711, mean=0.711, max=0.711, sum=1.421 (2)", - "tab": "Accuracy", - "Astronomy - Observed inference time (s)": { - "description": "min=0.539, mean=0.539, max=0.539, sum=1.077 (2)", - "tab": "Efficiency", - "score": 0.5385584250876778 - }, - "Astronomy - # eval": { - "description": "min=152, mean=152, max=152, sum=304 (2)", - "tab": "General information", - "score": 152.0 - }, - "Astronomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Astronomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Astronomy - # prompt tokens": { - "description": "min=572.684, mean=572.684, max=572.684, sum=1145.368 (2)", - "tab": "General information", - "score": 572.6842105263158 - }, - "Astronomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" - } - } - }, - { - "evaluation_name": "Business Ethics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Business Ethics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.65, - "details": { - "description": "min=0.65, mean=0.65, max=0.65, sum=1.3 (2)", - "tab": "Accuracy", - "Business Ethics - Observed inference time (s)": { - "description": "min=0.312, mean=0.312, max=0.312, sum=0.623 (2)", - "tab": "Efficiency", - "score": 0.311549117565155 - }, - "Business Ethics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Business Ethics - # train": { 
- "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Business Ethics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Business Ethics - # prompt tokens": { - "description": "min=562.52, mean=562.52, max=562.52, sum=1125.04 (2)", - "tab": "General information", - "score": 562.52 - }, - "Business Ethics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" - } - } - }, - { - "evaluation_name": "Clinical Knowledge", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Clinical Knowledge", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.751, - "details": { - "description": "min=0.751, mean=0.751, max=0.751, sum=1.502 (2)", - "tab": "Accuracy", - "Clinical Knowledge - Observed inference time (s)": { - "description": "min=0.304, mean=0.304, max=0.304, sum=0.609 (2)", - "tab": "Efficiency", - "score": 0.3043576915309114 - }, - "Clinical Knowledge - # eval": { - "description": "min=265, mean=265, max=265, sum=530 (2)", - "tab": "General information", - "score": 265.0 - }, - "Clinical Knowledge - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Clinical Knowledge - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Clinical Knowledge - # prompt tokens": { - "description": "min=390.928, mean=390.928, max=390.928, sum=781.857 (2)", - "tab": "General information", - "score": 390.92830188679244 - }, - "Clinical Knowledge - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" - } - } - }, - { - "evaluation_name": "Conceptual Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Conceptual Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.557, - "details": { - "description": "min=0.557, mean=0.557, max=0.557, sum=1.115 (2)", - "tab": "Accuracy", - "Conceptual Physics - Observed inference time (s)": { - "description": "min=0.315, mean=0.315, max=0.315, sum=0.631 (2)", - "tab": "Efficiency", - "score": 0.31532351615581106 - }, - "Conceptual Physics - # eval": { - "description": "min=235, mean=235, max=235, sum=470 (2)", - "tab": "General information", - "score": 235.0 - }, - "Conceptual Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - 
"Conceptual Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Conceptual Physics - # prompt tokens": { - "description": "min=297.834, mean=297.834, max=297.834, sum=595.668 (2)", - "tab": "General information", - "score": 297.83404255319147 - }, - "Conceptual Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" - } - } - }, - { - "evaluation_name": "Electrical Engineering", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Electrical Engineering", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.669, - "details": { - "description": "min=0.669, mean=0.669, max=0.669, sum=1.338 (2)", - "tab": "Accuracy", - "Electrical Engineering - Observed inference time (s)": { - "description": "min=0.317, mean=0.317, max=0.317, sum=0.635 (2)", - "tab": "Efficiency", - "score": 0.31737767910135206 - }, - "Electrical Engineering - # eval": { - "description": "min=145, mean=145, max=145, sum=290 (2)", - "tab": "General information", - "score": 145.0 - }, - "Electrical Engineering - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Electrical Engineering - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Electrical Engineering - # prompt tokens": { - "description": "min=428.607, mean=428.607, max=428.607, sum=857.214 (2)", - "tab": "General information", - "score": 428.60689655172416 - }, - "Electrical Engineering - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" - } - } - }, - { - "evaluation_name": "Elementary Mathematics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Elementary Mathematics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.426, - "details": { - "description": "min=0.426, mean=0.426, max=0.426, sum=0.852 (2)", - "tab": "Accuracy", - "Elementary Mathematics - Observed inference time (s)": { - "description": "min=0.308, mean=0.308, max=0.308, sum=0.616 (2)", - "tab": "Efficiency", - "score": 0.3080339734516447 - }, - "Elementary Mathematics - # eval": { - "description": "min=378, mean=378, max=378, sum=756 (2)", - "tab": "General information", - "score": 378.0 - }, - "Elementary Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Elementary Mathematics 
- truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Elementary Mathematics - # prompt tokens": { - "description": "min=524.854, mean=524.854, max=524.854, sum=1049.709 (2)", - "tab": "General information", - "score": 524.8544973544973 - }, - "Elementary Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" - } - } - }, - { - "evaluation_name": "Formal Logic", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Formal Logic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.468, - "details": { - "description": "min=0.468, mean=0.468, max=0.468, sum=0.937 (2)", - "tab": "Accuracy", - "Formal Logic - Observed inference time (s)": { - "description": "min=0.337, mean=0.337, max=0.337, sum=0.674 (2)", - "tab": "Efficiency", - "score": 0.33724411328633624 - }, - "Formal Logic - # eval": { - "description": "min=126, mean=126, max=126, sum=252 (2)", - "tab": "General information", - "score": 126.0 - }, - "Formal Logic - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Formal Logic - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Formal Logic - # prompt tokens": { - "description": "min=594.778, mean=594.778, max=594.778, sum=1189.556 (2)", - "tab": "General information", - "score": 594.7777777777778 - }, - "Formal Logic - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" - } - } - }, - { - "evaluation_name": "High School World History", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on High School World History", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.823, - "details": { - "description": "min=0.823, mean=0.823, max=0.823, sum=1.646 (2)", - "tab": "Accuracy", - "High School Biology - Observed inference time (s)": { - "description": "min=0.336, mean=0.336, max=0.336, sum=0.672 (2)", - "tab": "Efficiency", - "score": 0.3359520781424738 - }, - "High School Chemistry - Observed inference time (s)": { - "description": "min=0.309, mean=0.309, max=0.309, sum=0.619 (2)", - "tab": "Efficiency", - "score": 0.3092998248602956 - }, - "High School Computer Science - Observed inference time (s)": { - "description": "min=0.325, mean=0.325, max=0.325, sum=0.649 (2)", - "tab": "Efficiency", - "score": 0.324708514213562 - }, - "High School European History - Observed 
inference time (s)": { - "description": "min=0.544, mean=0.544, max=0.544, sum=1.087 (2)", - "tab": "Efficiency", - "score": 0.5437044996203798 - }, - "High School Geography - Observed inference time (s)": { - "description": "min=0.304, mean=0.304, max=0.304, sum=0.609 (2)", - "tab": "Efficiency", - "score": 0.30433518236333673 - }, - "High School Government And Politics - Observed inference time (s)": { - "description": "min=0.319, mean=0.319, max=0.319, sum=0.638 (2)", - "tab": "Efficiency", - "score": 0.3192491321366068 - }, - "High School Macroeconomics - Observed inference time (s)": { - "description": "min=0.315, mean=0.315, max=0.315, sum=0.63 (2)", - "tab": "Efficiency", - "score": 0.31492268366691395 - }, - "High School Mathematics - Observed inference time (s)": { - "description": "min=0.326, mean=0.326, max=0.326, sum=0.652 (2)", - "tab": "Efficiency", - "score": 0.3262451118893094 - }, - "High School Microeconomics - Observed inference time (s)": { - "description": "min=0.345, mean=0.345, max=0.345, sum=0.69 (2)", - "tab": "Efficiency", - "score": 0.3451059505719097 - }, - "High School Physics - Observed inference time (s)": { - "description": "min=0.541, mean=0.541, max=0.541, sum=1.082 (2)", - "tab": "Efficiency", - "score": 0.5410290490712552 - }, - "High School Psychology - Observed inference time (s)": { - "description": "min=0.379, mean=0.379, max=0.379, sum=0.757 (2)", - "tab": "Efficiency", - "score": 0.3786245923523509 - }, - "High School Statistics - Observed inference time (s)": { - "description": "min=0.493, mean=0.493, max=0.493, sum=0.986 (2)", - "tab": "Efficiency", - "score": 0.4927717314826118 - }, - "High School US History - Observed inference time (s)": { - "description": "min=0.481, mean=0.481, max=0.481, sum=0.962 (2)", - "tab": "Efficiency", - "score": 0.48103941655626486 - }, - "High School World History - Observed inference time (s)": { - "description": "min=0.516, mean=0.516, max=0.516, sum=1.032 (2)", - "tab": "Efficiency", - "score": 0.5161508246313168 - }, - "High School Biology - # eval": { - "description": "min=310, mean=310, max=310, sum=620 (2)", - "tab": "General information", - "score": 310.0 - }, - "High School Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Biology - # prompt tokens": { - "description": "min=506.671, mean=506.671, max=506.671, sum=1013.342 (2)", - "tab": "General information", - "score": 506.6709677419355 - }, - "High School Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Chemistry - # eval": { - "description": "min=203, mean=203, max=203, sum=406 (2)", - "tab": "General information", - "score": 203.0 - }, - "High School Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # prompt tokens": { - "description": "min=489.704, mean=489.704, max=489.704, sum=979.409 (2)", - "tab": "General information", - "score": 489.70443349753697 - }, - "High School Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General 
information", - "score": 1.0 - }, - "High School Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "High School Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # prompt tokens": { - "description": "min=860.78, mean=860.78, max=860.78, sum=1721.56 (2)", - "tab": "General information", - "score": 860.78 - }, - "High School Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School European History - # eval": { - "description": "min=165, mean=165, max=165, sum=330 (2)", - "tab": "General information", - "score": 165.0 - }, - "High School European History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School European History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School European History - # prompt tokens": { - "description": "min=2790.885, mean=2790.885, max=2790.885, sum=5581.77 (2)", - "tab": "General information", - "score": 2790.8848484848486 - }, - "High School European History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Geography - # eval": { - "description": "min=198, mean=198, max=198, sum=396 (2)", - "tab": "General information", - "score": 198.0 - }, - "High School Geography - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Geography - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Geography - # prompt tokens": { - "description": "min=365.035, mean=365.035, max=365.035, sum=730.071 (2)", - "tab": "General information", - "score": 365.0353535353535 - }, - "High School Geography - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Government And Politics - # eval": { - "description": "min=193, mean=193, max=193, sum=386 (2)", - "tab": "General information", - "score": 193.0 - }, - "High School Government And Politics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Government And Politics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # prompt tokens": { - "description": "min=458.824, mean=458.824, max=458.824, sum=917.648 (2)", - "tab": "General information", - "score": 458.8238341968912 - }, - "High School Government And Politics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Macroeconomics - # eval": { - "description": "min=390, mean=390, max=390, sum=780 (2)", - "tab": "General information", - "score": 390.0 - }, - "High School Macroeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": 
"General information", - "score": 5.0 - }, - "High School Macroeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - # prompt tokens": { - "description": "min=363.908, mean=363.908, max=363.908, sum=727.815 (2)", - "tab": "General information", - "score": 363.9076923076923 - }, - "High School Macroeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Mathematics - # eval": { - "description": "min=270, mean=270, max=270, sum=540 (2)", - "tab": "General information", - "score": 270.0 - }, - "High School Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # prompt tokens": { - "description": "min=525.356, mean=525.356, max=525.356, sum=1050.711 (2)", - "tab": "General information", - "score": 525.3555555555556 - }, - "High School Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Microeconomics - # eval": { - "description": "min=238, mean=238, max=238, sum=476 (2)", - "tab": "General information", - "score": 238.0 - }, - "High School Microeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Microeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Microeconomics - # prompt tokens": { - "description": "min=392.013, mean=392.013, max=392.013, sum=784.025 (2)", - "tab": "General information", - "score": 392.0126050420168 - }, - "High School Microeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Physics - # eval": { - "description": "min=151, mean=151, max=151, sum=302 (2)", - "tab": "General information", - "score": 151.0 - }, - "High School Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # prompt tokens": { - "description": "min=553.457, mean=553.457, max=553.457, sum=1106.914 (2)", - "tab": "General information", - "score": 553.4569536423841 - }, - "High School Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Psychology - # eval": { - "description": "min=545, mean=545, max=545, sum=1090 (2)", - "tab": "General information", - "score": 545.0 - }, - "High School Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # prompt tokens": { - "description": "min=488.242, mean=488.242, max=488.242, sum=976.484 (2)", - "tab": "General information", - "score": 488.2422018348624 - }, - 
"High School Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Statistics - # eval": { - "description": "min=216, mean=216, max=216, sum=432 (2)", - "tab": "General information", - "score": 216.0 - }, - "High School Statistics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Statistics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Statistics - # prompt tokens": { - "description": "min=788.639, mean=788.639, max=788.639, sum=1577.278 (2)", - "tab": "General information", - "score": 788.6388888888889 - }, - "High School Statistics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School US History - # eval": { - "description": "min=204, mean=204, max=204, sum=408 (2)", - "tab": "General information", - "score": 204.0 - }, - "High School US History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School US History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # prompt tokens": { - "description": "min=2210.809, mean=2210.809, max=2210.809, sum=4421.618 (2)", - "tab": "General information", - "score": 2210.8088235294117 - }, - "High School US History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School World History - # eval": { - "description": "min=237, mean=237, max=237, sum=474 (2)", - "tab": "General information", - "score": 237.0 - }, - "High School World History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School World History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School World History - # prompt tokens": { - "description": "min=1421.173, mean=1421.173, max=1421.173, sum=2842.346 (2)", - "tab": "General information", - "score": 1421.1729957805908 - }, - "High School World History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" - } - } - }, - { - "evaluation_name": "Human Sexuality", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Human Sexuality", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.748, - "details": { - "description": "min=0.748, mean=0.748, max=0.748, sum=1.496 (2)", - "tab": "Accuracy", - "Human Aging - Observed inference time (s)": { - "description": "min=0.303, mean=0.303, max=0.303, sum=0.605 (2)", - "tab": "Efficiency", - "score": 0.30269593080597607 - }, - "Human Sexuality - Observed inference time (s)": 
{ - "description": "min=0.325, mean=0.325, max=0.325, sum=0.651 (2)", - "tab": "Efficiency", - "score": 0.32543583862654124 - }, - "Human Aging - # eval": { - "description": "min=223, mean=223, max=223, sum=446 (2)", - "tab": "General information", - "score": 223.0 - }, - "Human Aging - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Aging - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Aging - # prompt tokens": { - "description": "min=312.888, mean=312.888, max=312.888, sum=625.776 (2)", - "tab": "General information", - "score": 312.88789237668163 - }, - "Human Aging - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Human Sexuality - # eval": { - "description": "min=131, mean=131, max=131, sum=262 (2)", - "tab": "General information", - "score": 131.0 - }, - "Human Sexuality - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Sexuality - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # prompt tokens": { - "description": "min=334.168, mean=334.168, max=334.168, sum=668.336 (2)", - "tab": "General information", - "score": 334.1679389312977 - }, - "Human Sexuality - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" - } - } - }, - { - "evaluation_name": "International Law", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on International Law", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.843, - "details": { - "description": "min=0.843, mean=0.843, max=0.843, sum=1.686 (2)", - "tab": "Accuracy", - "International Law - Observed inference time (s)": { - "description": "min=0.586, mean=0.586, max=0.586, sum=1.172 (2)", - "tab": "Efficiency", - "score": 0.5860170076701267 - }, - "International Law - # eval": { - "description": "min=121, mean=121, max=121, sum=242 (2)", - "tab": "General information", - "score": 121.0 - }, - "International Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "International Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "International Law - # prompt tokens": { - "description": "min=632.818, mean=632.818, max=632.818, sum=1265.636 (2)", - "tab": "General information", - "score": 632.8181818181819 - }, - "International Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" - } - } - }, - { - 
"evaluation_name": "Logical Fallacies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Logical Fallacies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.755, - "details": { - "description": "min=0.755, mean=0.755, max=0.755, sum=1.509 (2)", - "tab": "Accuracy", - "Logical Fallacies - Observed inference time (s)": { - "description": "min=0.313, mean=0.313, max=0.313, sum=0.625 (2)", - "tab": "Efficiency", - "score": 0.31263120335303934 - }, - "Logical Fallacies - # eval": { - "description": "min=163, mean=163, max=163, sum=326 (2)", - "tab": "General information", - "score": 163.0 - }, - "Logical Fallacies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Logical Fallacies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Logical Fallacies - # prompt tokens": { - "description": "min=442.564, mean=442.564, max=442.564, sum=885.129 (2)", - "tab": "General information", - "score": 442.5644171779141 - }, - "Logical Fallacies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" - } - } - }, - { - "evaluation_name": "Machine Learning", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Machine Learning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.545, - "details": { - "description": "min=0.545, mean=0.545, max=0.545, sum=1.089 (2)", - "tab": "Accuracy", - "Machine Learning - Observed inference time (s)": { - "description": "min=0.309, mean=0.309, max=0.309, sum=0.618 (2)", - "tab": "Efficiency", - "score": 0.30891925522259306 - }, - "Machine Learning - # eval": { - "description": "min=112, mean=112, max=112, sum=224 (2)", - "tab": "General information", - "score": 112.0 - }, - "Machine Learning - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Machine Learning - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Machine Learning - # prompt tokens": { - "description": "min=661.054, mean=661.054, max=661.054, sum=1322.107 (2)", - "tab": "General information", - "score": 661.0535714285714 - }, - "Machine Learning - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" - } - } - }, - { - "evaluation_name": "Management", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - 
"url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Management", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.874, - "details": { - "description": "min=0.874, mean=0.874, max=0.874, sum=1.748 (2)", - "tab": "Accuracy", - "Management - Observed inference time (s)": { - "description": "min=0.298, mean=0.298, max=0.298, sum=0.596 (2)", - "tab": "Efficiency", - "score": 0.29801390703442027 - }, - "Management - # eval": { - "description": "min=103, mean=103, max=103, sum=206 (2)", - "tab": "General information", - "score": 103.0 - }, - "Management - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Management - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Management - # prompt tokens": { - "description": "min=276.786, mean=276.786, max=276.786, sum=553.573 (2)", - "tab": "General information", - "score": 276.7864077669903 - }, - "Management - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" - } - } - }, - { - "evaluation_name": "Marketing", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Marketing", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.885, - "details": { - "description": "min=0.885, mean=0.885, max=0.885, sum=1.769 (2)", - "tab": "Accuracy", - "Marketing - Observed inference time (s)": { - "description": "min=0.297, mean=0.297, max=0.297, sum=0.595 (2)", - "tab": "Efficiency", - "score": 0.29727030717409575 - }, - "Marketing - # eval": { - "description": "min=234, mean=234, max=234, sum=468 (2)", - "tab": "General information", - "score": 234.0 - }, - "Marketing - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Marketing - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Marketing - # prompt tokens": { - "description": "min=397.218, mean=397.218, max=397.218, sum=794.436 (2)", - "tab": "General information", - "score": 397.21794871794873 - }, - "Marketing - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" - } - } - }, - { - "evaluation_name": "Medical Genetics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Medical Genetics", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.83, - "details": { - "description": "min=0.83, mean=0.83, max=0.83, sum=1.66 (2)", - "tab": "Accuracy", - "Medical Genetics - Observed inference time (s)": { - "description": "min=0.301, mean=0.301, max=0.301, sum=0.602 (2)", - "tab": "Efficiency", - "score": 0.3011839747428894 - }, - "Medical Genetics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Medical Genetics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Medical Genetics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Medical Genetics - # prompt tokens": { - "description": "min=333.99, mean=333.99, max=333.99, sum=667.98 (2)", - "tab": "General information", - "score": 333.99 - }, - "Medical Genetics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" - } - } - }, - { - "evaluation_name": "Miscellaneous", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Miscellaneous", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.831, - "details": { - "description": "min=0.831, mean=0.831, max=0.831, sum=1.663 (2)", - "tab": "Accuracy", - "Miscellaneous - Observed inference time (s)": { - "description": "min=0.352, mean=0.352, max=0.352, sum=0.703 (2)", - "tab": "Efficiency", - "score": 0.3515638007971519 - }, - "Miscellaneous - # eval": { - "description": "min=783, mean=783, max=783, sum=1566 (2)", - "tab": "General information", - "score": 783.0 - }, - "Miscellaneous - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Miscellaneous - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Miscellaneous - # prompt tokens": { - "description": "min=292.911, mean=292.911, max=292.911, sum=585.821 (2)", - "tab": "General information", - "score": 292.9106002554278 - }, - "Miscellaneous - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" - } - } - }, - { - "evaluation_name": "Moral Scenarios", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Moral Scenarios", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.416, - "details": { - "description": "min=0.416, mean=0.416, max=0.416, 
sum=0.831 (2)", - "tab": "Accuracy", - "Moral Disputes - Observed inference time (s)": { - "description": "min=0.293, mean=0.293, max=0.293, sum=0.585 (2)", - "tab": "Efficiency", - "score": 0.2926361808887107 - }, - "Moral Scenarios - Observed inference time (s)": { - "description": "min=0.329, mean=0.329, max=0.329, sum=0.658 (2)", - "tab": "Efficiency", - "score": 0.3287937753027378 - }, - "Moral Disputes - # eval": { - "description": "min=346, mean=346, max=346, sum=692 (2)", - "tab": "General information", - "score": 346.0 - }, - "Moral Disputes - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Disputes - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Disputes - # prompt tokens": { - "description": "min=469.113, mean=469.113, max=469.113, sum=938.225 (2)", - "tab": "General information", - "score": 469.1127167630058 - }, - "Moral Disputes - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Moral Scenarios - # eval": { - "description": "min=895, mean=895, max=895, sum=1790 (2)", - "tab": "General information", - "score": 895.0 - }, - "Moral Scenarios - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Scenarios - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # prompt tokens": { - "description": "min=649.455, mean=649.455, max=649.455, sum=1298.909 (2)", - "tab": "General information", - "score": 649.454748603352 - }, - "Moral Scenarios - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" - } - } - }, - { - "evaluation_name": "Nutrition", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Nutrition", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.761, - "details": { - "description": "min=0.761, mean=0.761, max=0.761, sum=1.523 (2)", - "tab": "Accuracy", - "Nutrition - Observed inference time (s)": { - "description": "min=0.323, mean=0.323, max=0.323, sum=0.645 (2)", - "tab": "Efficiency", - "score": 0.3226836241927801 - }, - "Nutrition - # eval": { - "description": "min=306, mean=306, max=306, sum=612 (2)", - "tab": "General information", - "score": 306.0 - }, - "Nutrition - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Nutrition - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Nutrition - # prompt tokens": { - "description": "min=579.814, mean=579.814, max=579.814, sum=1159.627 (2)", - "tab": "General information", - "score": 579.8137254901961 - }, - "Nutrition - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - 
}, - "generation_config": { - "additional_details": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" - } - } - }, - { - "evaluation_name": "Prehistory", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Prehistory", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.738, - "details": { - "description": "min=0.738, mean=0.738, max=0.738, sum=1.475 (2)", - "tab": "Accuracy", - "Prehistory - Observed inference time (s)": { - "description": "min=0.297, mean=0.297, max=0.297, sum=0.594 (2)", - "tab": "Efficiency", - "score": 0.2970340943630831 - }, - "Prehistory - # eval": { - "description": "min=324, mean=324, max=324, sum=648 (2)", - "tab": "General information", - "score": 324.0 - }, - "Prehistory - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Prehistory - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Prehistory - # prompt tokens": { - "description": "min=507.528, mean=507.528, max=507.528, sum=1015.056 (2)", - "tab": "General information", - "score": 507.52777777777777 - }, - "Prehistory - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" - } - } - }, - { - "evaluation_name": "Public Relations", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Public Relations", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.736, - "details": { - "description": "min=0.736, mean=0.736, max=0.736, sum=1.473 (2)", - "tab": "Accuracy", - "Public Relations - Observed inference time (s)": { - "description": "min=0.325, mean=0.325, max=0.325, sum=0.649 (2)", - "tab": "Efficiency", - "score": 0.3247281486337835 - }, - "Public Relations - # eval": { - "description": "min=110, mean=110, max=110, sum=220 (2)", - "tab": "General information", - "score": 110.0 - }, - "Public Relations - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Public Relations - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Public Relations - # prompt tokens": { - "description": "min=398.318, mean=398.318, max=398.318, sum=796.636 (2)", - "tab": "General information", - "score": 398.3181818181818 - }, - "Public Relations - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" - } - } - }, - 
{ - "evaluation_name": "Security Studies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Security Studies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.771, - "details": { - "description": "min=0.771, mean=0.771, max=0.771, sum=1.543 (2)", - "tab": "Accuracy", - "Security Studies - Observed inference time (s)": { - "description": "min=0.351, mean=0.351, max=0.351, sum=0.702 (2)", - "tab": "Efficiency", - "score": 0.35109225779163594 - }, - "Security Studies - # eval": { - "description": "min=245, mean=245, max=245, sum=490 (2)", - "tab": "General information", - "score": 245.0 - }, - "Security Studies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Security Studies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Security Studies - # prompt tokens": { - "description": "min=1157.473, mean=1157.473, max=1157.473, sum=2314.947 (2)", - "tab": "General information", - "score": 1157.4734693877551 - }, - "Security Studies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" - } - } - }, - { - "evaluation_name": "Sociology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Sociology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.866, - "details": { - "description": "min=0.866, mean=0.866, max=0.866, sum=1.731 (2)", - "tab": "Accuracy", - "Sociology - Observed inference time (s)": { - "description": "min=0.315, mean=0.315, max=0.315, sum=0.63 (2)", - "tab": "Efficiency", - "score": 0.31481776545889933 - }, - "Sociology - # eval": { - "description": "min=201, mean=201, max=201, sum=402 (2)", - "tab": "General information", - "score": 201.0 - }, - "Sociology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Sociology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Sociology - # prompt tokens": { - "description": "min=438.517, mean=438.517, max=438.517, sum=877.035 (2)", - "tab": "General information", - "score": 438.51741293532336 - }, - "Sociology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" - } - } - }, - { - "evaluation_name": "Virology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Virology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.566, - "details": { - "description": "min=0.566, mean=0.566, max=0.566, sum=1.133 (2)", - "tab": "Accuracy", - "Virology - Observed inference time (s)": { - "description": "min=0.295, mean=0.295, max=0.295, sum=0.59 (2)", - "tab": "Efficiency", - "score": 0.2951422269085804 - }, - "Virology - # eval": { - "description": "min=166, mean=166, max=166, sum=332 (2)", - "tab": "General information", - "score": 166.0 - }, - "Virology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Virology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Virology - # prompt tokens": { - "description": "min=336.018, mean=336.018, max=336.018, sum=672.036 (2)", - "tab": "General information", - "score": 336.01807228915663 - }, - "Virology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" - } - } - }, - { - "evaluation_name": "World Religions", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on World Religions", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.819, - "details": { - "description": "min=0.819, mean=0.819, max=0.819, sum=1.637 (2)", - "tab": "Accuracy", - "World Religions - Observed inference time (s)": { - "description": "min=0.315, mean=0.315, max=0.315, sum=0.631 (2)", - "tab": "Efficiency", - "score": 0.3152559863196479 - }, - "World Religions - # eval": { - "description": "min=171, mean=171, max=171, sum=342 (2)", - "tab": "General information", - "score": 171.0 - }, - "World Religions - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "World Religions - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "World Religions - # prompt tokens": { - "description": "min=267.52, mean=267.52, max=267.52, sum=535.041 (2)", - "tab": "General information", - "score": 267.5204678362573 - }, - "World Religions - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" - } - } - }, - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms 
on average (over columns).",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.733,
-        "details": {
-          "tab": "Efficiency"
-        }
-      },
-      "generation_config": {
-        "additional_details": {}
-      }
-    }
-  ]
-}
\ No newline at end of file
diff --git a/data/helm_mmlu/meta/llama-3.1-405b-instruct-turbo/8ddf9de8-2ee3-4a30-9250-30fd027c63b4.json b/data/helm_mmlu/meta/llama-3.1-405b-instruct-turbo/8ddf9de8-2ee3-4a30-9250-30fd027c63b4.json
deleted file mode 100644
index c4ce37e9d..000000000
--- a/data/helm_mmlu/meta/llama-3.1-405b-instruct-turbo/8ddf9de8-2ee3-4a30-9250-30fd027c63b4.json
+++ /dev/null
@@ -1,3021 +0,0 @@
-{
-  "schema_version": "0.2.0",
-  "evaluation_id": "helm_mmlu/meta_llama-3.1-405b-instruct-turbo/1770835937.459157",
-  "retrieved_timestamp": "1770835937.459157",
-  "source_metadata": {
-    "source_name": "helm_mmlu",
-    "source_type": "documentation",
-    "source_organization_name": "crfm",
-    "evaluator_relationship": "third_party"
-  },
-  "model_info": {
-    "name": "Llama 3.1 Instruct Turbo 405B",
-    "id": "meta/llama-3.1-405b-instruct-turbo",
-    "developer": "meta",
-    "inference_platform": "unknown"
-  },
-  "evaluation_results": [
-    {
-      "evaluation_name": "MMLU All Subjects",
-      "source_data": {
-        "dataset_name": "helm_mmlu",
-        "source_type": "url",
-        "url": [
-          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
-        ]
-      },
-      "metric_config": {
-        "evaluation_description": "EM on MMLU All Subjects",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.845,
-        "details": {
-          "description": "min=0.572, mean=0.845, max=0.984, sum=96.366 (114)",
-          "tab": "Accuracy",
-          "MMLU All Subjects - Observed inference time (s)": {
-            "description": "min=0.403, mean=0.685, max=1.366, sum=78.119 (114)",
-            "tab": "Efficiency",
-            "score": 0.6852569796494135
-          },
-          "MMLU All Subjects - # eval": {
-            "description": "min=100, mean=246.351, max=1534, sum=28084 (114)",
-            "tab": "General information",
-            "score": 246.35087719298247
-          },
-          "MMLU All Subjects - # train": {
-            "description": "min=5, mean=5, max=5, sum=570 (114)",
-            "tab": "General information",
-            "score": 5.0
-          },
-          "MMLU All Subjects - truncated": {
-            "description": "min=0, mean=0, max=0, sum=0 (114)",
-            "tab": "General information",
-            "score": 0.0
-          },
-          "MMLU All Subjects - # prompt tokens": {
-            "description": "min=274.52, mean=614.619, max=2797.885, sum=70066.61 (114)",
-            "tab": "General information",
-            "score": 614.6193817308517
-          },
-          "MMLU All Subjects - # output tokens": {
-            "description": "min=1, mean=1, max=1, sum=114 (114)",
-            "tab": "General information",
-            "score": 1.0
-          }
-        }
-      },
-      "generation_config": {
-        "additional_details": {
-          "subject": [
-            "abstract_algebra",
-            "anatomy",
-            "astronomy",
-            "business_ethics",
-            "clinical_knowledge",
-            "college_biology",
-            "college_chemistry",
-            "college_computer_science",
-            "college_mathematics",
-            "college_medicine",
-            "college_physics",
-            "computer_security",
-            "conceptual_physics",
-            "econometrics",
-            "electrical_engineering",
-            "elementary_mathematics",
-            "formal_logic",
-            "global_facts",
-            "high_school_biology",
-            "high_school_chemistry",
-            "high_school_computer_science",
-            "high_school_european_history",
-            "high_school_geography",
-            "high_school_government_and_politics",
-            "high_school_macroeconomics",
-            "high_school_mathematics",
-            "high_school_microeconomics",
"high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] - } - } - }, - { - "evaluation_name": "Abstract Algebra", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Abstract Algebra", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7, - "details": { - "description": "min=0.7, mean=0.7, max=0.7, sum=1.4 (2)", - "tab": "Accuracy", - "Abstract Algebra - Observed inference time (s)": { - "description": "min=0.464, mean=0.464, max=0.464, sum=0.928 (2)", - "tab": "Efficiency", - "score": 0.4640246653556824 - }, - "Abstract Algebra - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Abstract Algebra - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Abstract Algebra - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Abstract Algebra - # prompt 
tokens": { - "description": "min=373.43, mean=373.43, max=373.43, sum=746.86 (2)", - "tab": "General information", - "score": 373.43 - }, - "Abstract Algebra - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" - } - } - }, - { - "evaluation_name": "Anatomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Anatomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.822, - "details": { - "description": "min=0.822, mean=0.822, max=0.822, sum=1.644 (2)", - "tab": "Accuracy", - "Anatomy - Observed inference time (s)": { - "description": "min=0.403, mean=0.403, max=0.403, sum=0.806 (2)", - "tab": "Efficiency", - "score": 0.4029027055810999 - }, - "Anatomy - # eval": { - "description": "min=135, mean=135, max=135, sum=270 (2)", - "tab": "General information", - "score": 135.0 - }, - "Anatomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Anatomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Anatomy - # prompt tokens": { - "description": "min=353.874, mean=353.874, max=353.874, sum=707.748 (2)", - "tab": "General information", - "score": 353.8740740740741 - }, - "Anatomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" - } - } - }, - { - "evaluation_name": "College Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on College Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.696, - "details": { - "description": "min=0.696, mean=0.696, max=0.696, sum=1.392 (2)", - "tab": "Accuracy", - "College Chemistry - Observed inference time (s)": { - "description": "min=0.56, mean=0.56, max=0.56, sum=1.119 (2)", - "tab": "Efficiency", - "score": 0.5597123241424561 - }, - "College Biology - Observed inference time (s)": { - "description": "min=0.48, mean=0.48, max=0.48, sum=0.959 (2)", - "tab": "Efficiency", - "score": 0.4795056896077262 - }, - "College Computer Science - Observed inference time (s)": { - "description": "min=0.566, mean=0.566, max=0.566, sum=1.132 (2)", - "tab": "Efficiency", - "score": 0.5661771416664123 - }, - "College Mathematics - Observed inference time (s)": { - "description": "min=0.541, mean=0.541, max=0.541, sum=1.082 (2)", - "tab": "Efficiency", - "score": 0.5411620163917541 - }, - "College Medicine - Observed inference time (s)": { - "description": "min=0.635, mean=0.635, max=0.635, sum=1.271 (2)", - "tab": "Efficiency", 
- "score": 0.6352733904226667 - }, - "College Physics - Observed inference time (s)": { - "description": "min=0.495, mean=0.495, max=0.495, sum=0.991 (2)", - "tab": "Efficiency", - "score": 0.4953400083616668 - }, - "College Chemistry - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Chemistry - # prompt tokens": { - "description": "min=549.28, mean=549.28, max=549.28, sum=1098.56 (2)", - "tab": "General information", - "score": 549.28 - }, - "College Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Biology - # eval": { - "description": "min=144, mean=144, max=144, sum=288 (2)", - "tab": "General information", - "score": 144.0 - }, - "College Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # prompt tokens": { - "description": "min=473.875, mean=473.875, max=473.875, sum=947.75 (2)", - "tab": "General information", - "score": 473.875 - }, - "College Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer Science - # prompt tokens": { - "description": "min=828.29, mean=828.29, max=828.29, sum=1656.58 (2)", - "tab": "General information", - "score": 828.29 - }, - "College Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Mathematics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # prompt tokens": { - "description": "min=594.51, mean=594.51, max=594.51, sum=1189.02 (2)", - "tab": "General information", - "score": 594.51 - }, - "College Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Medicine - # eval": { - "description": "min=173, mean=173, max=173, sum=346 (2)", - "tab": "General information", - "score": 173.0 - }, - "College Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Medicine - 
truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Medicine - # prompt tokens": { - "description": "min=502.705, mean=502.705, max=502.705, sum=1005.41 (2)", - "tab": "General information", - "score": 502.70520231213874 - }, - "College Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Physics - # eval": { - "description": "min=102, mean=102, max=102, sum=204 (2)", - "tab": "General information", - "score": 102.0 - }, - "College Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Physics - # prompt tokens": { - "description": "min=503.569, mean=503.569, max=503.569, sum=1007.137 (2)", - "tab": "General information", - "score": 503.5686274509804 - }, - "College Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" - } - } - }, - { - "evaluation_name": "Computer Security", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Computer Security", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.81, - "details": { - "description": "min=0.81, mean=0.81, max=0.81, sum=1.62 (2)", - "tab": "Accuracy", - "Computer Security - Observed inference time (s)": { - "description": "min=0.502, mean=0.502, max=0.502, sum=1.003 (2)", - "tab": "Efficiency", - "score": 0.5016749453544617 - }, - "Computer Security - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Computer Security - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Computer Security - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Computer Security - # prompt tokens": { - "description": "min=378.51, mean=378.51, max=378.51, sum=757.02 (2)", - "tab": "General information", - "score": 378.51 - }, - "Computer Security - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" - } - } - }, - { - "evaluation_name": "Econometrics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Econometrics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.746, - "details": { - "description": "min=0.746, mean=0.746, max=0.746, sum=1.491 (2)", - "tab": "Accuracy", - "Econometrics - Observed inference time (s)": { - "description": "min=0.598, mean=0.598, max=0.598, sum=1.195 (2)", - "tab": "Efficiency", - "score": 0.5976439986312598 - }, - "Econometrics - # eval": { - "description": "min=114, mean=114, max=114, sum=228 (2)", - "tab": "General information", - "score": 114.0 - }, - "Econometrics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Econometrics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Econometrics - # prompt tokens": { - "description": "min=614.421, mean=614.421, max=614.421, sum=1228.842 (2)", - "tab": "General information", - "score": 614.421052631579 - }, - "Econometrics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" - } - } - }, - { - "evaluation_name": "Global Facts", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Global Facts", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.71, - "details": { - "description": "min=0.71, mean=0.71, max=0.71, sum=1.42 (2)", - "tab": "Accuracy", - "Global Facts - Observed inference time (s)": { - "description": "min=0.471, mean=0.471, max=0.471, sum=0.941 (2)", - "tab": "Efficiency", - "score": 0.4706212830543518 - }, - "Global Facts - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Global Facts - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Global Facts - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Global Facts - # prompt tokens": { - "description": "min=399.71, mean=399.71, max=399.71, sum=799.42 (2)", - "tab": "General information", - "score": 399.71 - }, - "Global Facts - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" - } - } - }, - { - "evaluation_name": "Jurisprudence", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Jurisprudence", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.87, - "details": { - "description": "min=0.87, mean=0.87, max=0.87, sum=1.741 (2)", - "tab": "Accuracy", - "Jurisprudence - Observed inference time (s)": { - "description": "min=0.917, mean=0.917, max=0.917, 
sum=1.835 (2)", - "tab": "Efficiency", - "score": 0.9174331603226838 - }, - "Jurisprudence - # eval": { - "description": "min=108, mean=108, max=108, sum=216 (2)", - "tab": "General information", - "score": 108.0 - }, - "Jurisprudence - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Jurisprudence - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Jurisprudence - # prompt tokens": { - "description": "min=394.63, mean=394.63, max=394.63, sum=789.259 (2)", - "tab": "General information", - "score": 394.6296296296296 - }, - "Jurisprudence - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" - } - } - }, - { - "evaluation_name": "Philosophy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Philosophy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.878, - "details": { - "description": "min=0.878, mean=0.878, max=0.878, sum=1.756 (2)", - "tab": "Accuracy", - "Philosophy - Observed inference time (s)": { - "description": "min=0.753, mean=0.753, max=0.753, sum=1.506 (2)", - "tab": "Efficiency", - "score": 0.7531090411342608 - }, - "Philosophy - # eval": { - "description": "min=311, mean=311, max=311, sum=622 (2)", - "tab": "General information", - "score": 311.0 - }, - "Philosophy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Philosophy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Philosophy - # prompt tokens": { - "description": "min=329.084, mean=329.084, max=329.084, sum=658.167 (2)", - "tab": "General information", - "score": 329.08360128617363 - }, - "Philosophy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" - } - } - }, - { - "evaluation_name": "Professional Psychology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Professional Psychology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.861, - "details": { - "description": "min=0.861, mean=0.861, max=0.861, sum=1.722 (2)", - "tab": "Accuracy", - "Professional Medicine - Observed inference time (s)": { - "description": "min=1.095, mean=1.095, max=1.095, sum=2.191 (2)", - "tab": "Efficiency", - "score": 1.0953595541855867 - }, - "Professional Accounting - Observed inference time (s)": { - "description": "min=0.739, mean=0.739, max=0.739, 
sum=1.478 (2)", - "tab": "Efficiency", - "score": 0.7390724031637746 - }, - "Professional Law - Observed inference time (s)": { - "description": "min=1.053, mean=1.053, max=1.053, sum=2.107 (2)", - "tab": "Efficiency", - "score": 1.0534205999337087 - }, - "Professional Psychology - Observed inference time (s)": { - "description": "min=0.579, mean=0.579, max=0.579, sum=1.158 (2)", - "tab": "Efficiency", - "score": 0.5791019481771132 - }, - "Professional Medicine - # eval": { - "description": "min=272, mean=272, max=272, sum=544 (2)", - "tab": "General information", - "score": 272.0 - }, - "Professional Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Medicine - # prompt tokens": { - "description": "min=1094.489, mean=1094.489, max=1094.489, sum=2188.978 (2)", - "tab": "General information", - "score": 1094.4889705882354 - }, - "Professional Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Accounting - # eval": { - "description": "min=282, mean=282, max=282, sum=564 (2)", - "tab": "General information", - "score": 282.0 - }, - "Professional Accounting - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Accounting - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Accounting - # prompt tokens": { - "description": "min=658.585, mean=658.585, max=658.585, sum=1317.17 (2)", - "tab": "General information", - "score": 658.5851063829788 - }, - "Professional Accounting - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Law - # eval": { - "description": "min=1534, mean=1534, max=1534, sum=3068 (2)", - "tab": "General information", - "score": 1534.0 - }, - "Professional Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # prompt tokens": { - "description": "min=1637.601, mean=1637.601, max=1637.601, sum=3275.202 (2)", - "tab": "General information", - "score": 1637.6010430247718 - }, - "Professional Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Psychology - # eval": { - "description": "min=612, mean=612, max=612, sum=1224 (2)", - "tab": "General information", - "score": 612.0 - }, - "Professional Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # prompt tokens": { - "description": "min=575.098, mean=575.098, max=575.098, sum=1150.196 (2)", - "tab": "General information", - "score": 575.0980392156863 - }, - "Professional Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General 
information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" - } - } - }, - { - "evaluation_name": "Us Foreign Policy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Us Foreign Policy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.94, - "details": { - "description": "min=0.94, mean=0.94, max=0.94, sum=1.88 (2)", - "tab": "Accuracy", - "Us Foreign Policy - Observed inference time (s)": { - "description": "min=0.52, mean=0.52, max=0.52, sum=1.04 (2)", - "tab": "Efficiency", - "score": 0.5199404859542847 - }, - "Us Foreign Policy - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Us Foreign Policy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Us Foreign Policy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Us Foreign Policy - # prompt tokens": { - "description": "min=422.79, mean=422.79, max=422.79, sum=845.58 (2)", - "tab": "General information", - "score": 422.79 - }, - "Us Foreign Policy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" - } - } - }, - { - "evaluation_name": "Astronomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Astronomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.921, - "details": { - "description": "min=0.921, mean=0.921, max=0.921, sum=1.842 (2)", - "tab": "Accuracy", - "Astronomy - Observed inference time (s)": { - "description": "min=0.467, mean=0.467, max=0.467, sum=0.933 (2)", - "tab": "Efficiency", - "score": 0.46656754769777 - }, - "Astronomy - # eval": { - "description": "min=152, mean=152, max=152, sum=304 (2)", - "tab": "General information", - "score": 152.0 - }, - "Astronomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Astronomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Astronomy - # prompt tokens": { - "description": "min=579.684, mean=579.684, max=579.684, sum=1159.368 (2)", - "tab": "General information", - "score": 579.6842105263158 - }, - "Astronomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - 
"groups": "mmlu_astronomy" - } - } - }, - { - "evaluation_name": "Business Ethics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Business Ethics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.81, - "details": { - "description": "min=0.81, mean=0.81, max=0.81, sum=1.62 (2)", - "tab": "Accuracy", - "Business Ethics - Observed inference time (s)": { - "description": "min=0.431, mean=0.431, max=0.431, sum=0.862 (2)", - "tab": "Efficiency", - "score": 0.4309411120414734 - }, - "Business Ethics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Business Ethics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Business Ethics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Business Ethics - # prompt tokens": { - "description": "min=569.52, mean=569.52, max=569.52, sum=1139.04 (2)", - "tab": "General information", - "score": 569.52 - }, - "Business Ethics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" - } - } - }, - { - "evaluation_name": "Clinical Knowledge", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Clinical Knowledge", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.879, - "details": { - "description": "min=0.879, mean=0.879, max=0.879, sum=1.758 (2)", - "tab": "Accuracy", - "Clinical Knowledge - Observed inference time (s)": { - "description": "min=0.533, mean=0.533, max=0.533, sum=1.067 (2)", - "tab": "Efficiency", - "score": 0.5334792272099909 - }, - "Clinical Knowledge - # eval": { - "description": "min=265, mean=265, max=265, sum=530 (2)", - "tab": "General information", - "score": 265.0 - }, - "Clinical Knowledge - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Clinical Knowledge - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Clinical Knowledge - # prompt tokens": { - "description": "min=397.928, mean=397.928, max=397.928, sum=795.857 (2)", - "tab": "General information", - "score": 397.92830188679244 - }, - "Clinical Knowledge - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" - } - } - }, - { - "evaluation_name": "Conceptual Physics", - "source_data": { - "dataset_name": 
"helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Conceptual Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.877, - "details": { - "description": "min=0.877, mean=0.877, max=0.877, sum=1.753 (2)", - "tab": "Accuracy", - "Conceptual Physics - Observed inference time (s)": { - "description": "min=0.508, mean=0.508, max=0.508, sum=1.016 (2)", - "tab": "Efficiency", - "score": 0.5081663547678197 - }, - "Conceptual Physics - # eval": { - "description": "min=235, mean=235, max=235, sum=470 (2)", - "tab": "General information", - "score": 235.0 - }, - "Conceptual Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Conceptual Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Conceptual Physics - # prompt tokens": { - "description": "min=304.834, mean=304.834, max=304.834, sum=609.668 (2)", - "tab": "General information", - "score": 304.83404255319147 - }, - "Conceptual Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" - } - } - }, - { - "evaluation_name": "Electrical Engineering", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Electrical Engineering", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.821, - "details": { - "description": "min=0.821, mean=0.821, max=0.821, sum=1.641 (2)", - "tab": "Accuracy", - "Electrical Engineering - Observed inference time (s)": { - "description": "min=0.48, mean=0.48, max=0.48, sum=0.959 (2)", - "tab": "Efficiency", - "score": 0.47960921155995334 - }, - "Electrical Engineering - # eval": { - "description": "min=145, mean=145, max=145, sum=290 (2)", - "tab": "General information", - "score": 145.0 - }, - "Electrical Engineering - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Electrical Engineering - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Electrical Engineering - # prompt tokens": { - "description": "min=435.607, mean=435.607, max=435.607, sum=871.214 (2)", - "tab": "General information", - "score": 435.60689655172416 - }, - "Electrical Engineering - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" - } - } - }, - { - "evaluation_name": "Elementary Mathematics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - 
"url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Elementary Mathematics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.828, - "details": { - "description": "min=0.828, mean=0.828, max=0.828, sum=1.656 (2)", - "tab": "Accuracy", - "Elementary Mathematics - Observed inference time (s)": { - "description": "min=0.559, mean=0.559, max=0.559, sum=1.117 (2)", - "tab": "Efficiency", - "score": 0.5586125358702645 - }, - "Elementary Mathematics - # eval": { - "description": "min=378, mean=378, max=378, sum=756 (2)", - "tab": "General information", - "score": 378.0 - }, - "Elementary Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Elementary Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Elementary Mathematics - # prompt tokens": { - "description": "min=531.854, mean=531.854, max=531.854, sum=1063.709 (2)", - "tab": "General information", - "score": 531.8544973544973 - }, - "Elementary Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" - } - } - }, - { - "evaluation_name": "Formal Logic", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Formal Logic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.698, - "details": { - "description": "min=0.698, mean=0.698, max=0.698, sum=1.397 (2)", - "tab": "Accuracy", - "Formal Logic - Observed inference time (s)": { - "description": "min=0.505, mean=0.505, max=0.505, sum=1.011 (2)", - "tab": "Efficiency", - "score": 0.5053695440292358 - }, - "Formal Logic - # eval": { - "description": "min=126, mean=126, max=126, sum=252 (2)", - "tab": "General information", - "score": 126.0 - }, - "Formal Logic - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Formal Logic - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Formal Logic - # prompt tokens": { - "description": "min=601.778, mean=601.778, max=601.778, sum=1203.556 (2)", - "tab": "General information", - "score": 601.7777777777778 - }, - "Formal Logic - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" - } - } - }, - { - "evaluation_name": "High School World History", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on High School World History", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.941, - "details": { - "description": "min=0.941, mean=0.941, max=0.941, sum=1.882 (2)", - "tab": "Accuracy", - "High School Biology - Observed inference time (s)": { - "description": "min=0.487, mean=0.487, max=0.487, sum=0.974 (2)", - "tab": "Efficiency", - "score": 0.48715837847801946 - }, - "High School Chemistry - Observed inference time (s)": { - "description": "min=0.457, mean=0.457, max=0.457, sum=0.914 (2)", - "tab": "Efficiency", - "score": 0.45692210949113216 - }, - "High School Computer Science - Observed inference time (s)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=1.334 (2)", - "tab": "Efficiency", - "score": 0.6668596768379211 - }, - "High School European History - Observed inference time (s)": { - "description": "min=1.089, mean=1.089, max=1.089, sum=2.178 (2)", - "tab": "Efficiency", - "score": 1.0890785202835545 - }, - "High School Geography - Observed inference time (s)": { - "description": "min=0.491, mean=0.491, max=0.491, sum=0.983 (2)", - "tab": "Efficiency", - "score": 0.49135766848169193 - }, - "High School Government And Politics - Observed inference time (s)": { - "description": "min=0.437, mean=0.437, max=0.437, sum=0.874 (2)", - "tab": "Efficiency", - "score": 0.4368582340102122 - }, - "High School Macroeconomics - Observed inference time (s)": { - "description": "min=0.612, mean=0.612, max=0.612, sum=1.224 (2)", - "tab": "Efficiency", - "score": 0.6121874619752933 - }, - "High School Mathematics - Observed inference time (s)": { - "description": "min=0.929, mean=0.929, max=0.929, sum=1.858 (2)", - "tab": "Efficiency", - "score": 0.9291445193467317 - }, - "High School Microeconomics - Observed inference time (s)": { - "description": "min=0.737, mean=0.737, max=0.737, sum=1.475 (2)", - "tab": "Efficiency", - "score": 0.7372911036515436 - }, - "High School Physics - Observed inference time (s)": { - "description": "min=0.591, mean=0.591, max=0.591, sum=1.181 (2)", - "tab": "Efficiency", - "score": 0.5905803986732533 - }, - "High School Psychology - Observed inference time (s)": { - "description": "min=0.884, mean=0.884, max=0.884, sum=1.767 (2)", - "tab": "Efficiency", - "score": 0.8837221084384743 - }, - "High School Statistics - Observed inference time (s)": { - "description": "min=0.634, mean=0.634, max=0.634, sum=1.268 (2)", - "tab": "Efficiency", - "score": 0.6339434705398701 - }, - "High School US History - Observed inference time (s)": { - "description": "min=0.993, mean=0.993, max=0.993, sum=1.987 (2)", - "tab": "Efficiency", - "score": 0.9934839302418279 - }, - "High School World History - Observed inference time (s)": { - "description": "min=1.012, mean=1.012, max=1.012, sum=2.024 (2)", - "tab": "Efficiency", - "score": 1.0120529253271562 - }, - "High School Biology - # eval": { - "description": "min=310, mean=310, max=310, sum=620 (2)", - "tab": "General information", - "score": 310.0 - }, - "High School Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Biology - # 
prompt tokens": { - "description": "min=513.671, mean=513.671, max=513.671, sum=1027.342 (2)", - "tab": "General information", - "score": 513.6709677419354 - }, - "High School Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Chemistry - # eval": { - "description": "min=203, mean=203, max=203, sum=406 (2)", - "tab": "General information", - "score": 203.0 - }, - "High School Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # prompt tokens": { - "description": "min=496.704, mean=496.704, max=496.704, sum=993.409 (2)", - "tab": "General information", - "score": 496.70443349753697 - }, - "High School Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "High School Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # prompt tokens": { - "description": "min=867.78, mean=867.78, max=867.78, sum=1735.56 (2)", - "tab": "General information", - "score": 867.78 - }, - "High School Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School European History - # eval": { - "description": "min=165, mean=165, max=165, sum=330 (2)", - "tab": "General information", - "score": 165.0 - }, - "High School European History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School European History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School European History - # prompt tokens": { - "description": "min=2797.885, mean=2797.885, max=2797.885, sum=5595.77 (2)", - "tab": "General information", - "score": 2797.8848484848486 - }, - "High School European History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Geography - # eval": { - "description": "min=198, mean=198, max=198, sum=396 (2)", - "tab": "General information", - "score": 198.0 - }, - "High School Geography - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Geography - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Geography - # prompt tokens": { - "description": "min=372.035, mean=372.035, max=372.035, sum=744.071 (2)", - "tab": "General information", - "score": 372.0353535353535 - }, - "High School Geography - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Government And Politics - # eval": { - 
"description": "min=193, mean=193, max=193, sum=386 (2)", - "tab": "General information", - "score": 193.0 - }, - "High School Government And Politics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Government And Politics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # prompt tokens": { - "description": "min=465.824, mean=465.824, max=465.824, sum=931.648 (2)", - "tab": "General information", - "score": 465.8238341968912 - }, - "High School Government And Politics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Macroeconomics - # eval": { - "description": "min=390, mean=390, max=390, sum=780 (2)", - "tab": "General information", - "score": 390.0 - }, - "High School Macroeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Macroeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - # prompt tokens": { - "description": "min=370.908, mean=370.908, max=370.908, sum=741.815 (2)", - "tab": "General information", - "score": 370.9076923076923 - }, - "High School Macroeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Mathematics - # eval": { - "description": "min=270, mean=270, max=270, sum=540 (2)", - "tab": "General information", - "score": 270.0 - }, - "High School Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # prompt tokens": { - "description": "min=532.356, mean=532.356, max=532.356, sum=1064.711 (2)", - "tab": "General information", - "score": 532.3555555555556 - }, - "High School Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Microeconomics - # eval": { - "description": "min=238, mean=238, max=238, sum=476 (2)", - "tab": "General information", - "score": 238.0 - }, - "High School Microeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Microeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Microeconomics - # prompt tokens": { - "description": "min=399.013, mean=399.013, max=399.013, sum=798.025 (2)", - "tab": "General information", - "score": 399.0126050420168 - }, - "High School Microeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Physics - # eval": { - "description": "min=151, mean=151, max=151, sum=302 (2)", - "tab": "General information", - "score": 151.0 - }, - "High School Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Physics - truncated": { - "description": 
"min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # prompt tokens": { - "description": "min=560.457, mean=560.457, max=560.457, sum=1120.914 (2)", - "tab": "General information", - "score": 560.4569536423841 - }, - "High School Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Psychology - # eval": { - "description": "min=545, mean=545, max=545, sum=1090 (2)", - "tab": "General information", - "score": 545.0 - }, - "High School Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # prompt tokens": { - "description": "min=495.242, mean=495.242, max=495.242, sum=990.484 (2)", - "tab": "General information", - "score": 495.2422018348624 - }, - "High School Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Statistics - # eval": { - "description": "min=216, mean=216, max=216, sum=432 (2)", - "tab": "General information", - "score": 216.0 - }, - "High School Statistics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Statistics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Statistics - # prompt tokens": { - "description": "min=795.639, mean=795.639, max=795.639, sum=1591.278 (2)", - "tab": "General information", - "score": 795.6388888888889 - }, - "High School Statistics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School US History - # eval": { - "description": "min=204, mean=204, max=204, sum=408 (2)", - "tab": "General information", - "score": 204.0 - }, - "High School US History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School US History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # prompt tokens": { - "description": "min=2217.809, mean=2217.809, max=2217.809, sum=4435.618 (2)", - "tab": "General information", - "score": 2217.8088235294117 - }, - "High School US History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School World History - # eval": { - "description": "min=237, mean=237, max=237, sum=474 (2)", - "tab": "General information", - "score": 237.0 - }, - "High School World History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School World History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School World History - # prompt tokens": { - "description": "min=1428.173, mean=1428.173, max=1428.173, sum=2856.346 (2)", - "tab": "General information", - "score": 1428.1729957805908 - }, - "High School World History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": 
"General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" - } - } - }, - { - "evaluation_name": "Human Sexuality", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Human Sexuality", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.855, - "details": { - "description": "min=0.855, mean=0.855, max=0.855, sum=1.71 (2)", - "tab": "Accuracy", - "Human Aging - Observed inference time (s)": { - "description": "min=0.602, mean=0.602, max=0.602, sum=1.204 (2)", - "tab": "Efficiency", - "score": 0.6018790418257093 - }, - "Human Sexuality - Observed inference time (s)": { - "description": "min=0.765, mean=0.765, max=0.765, sum=1.531 (2)", - "tab": "Efficiency", - "score": 0.7653163061797164 - }, - "Human Aging - # eval": { - "description": "min=223, mean=223, max=223, sum=446 (2)", - "tab": "General information", - "score": 223.0 - }, - "Human Aging - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Aging - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Aging - # prompt tokens": { - "description": "min=319.888, mean=319.888, max=319.888, sum=639.776 (2)", - "tab": "General information", - "score": 319.88789237668163 - }, - "Human Aging - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Human Sexuality - # eval": { - "description": "min=131, mean=131, max=131, sum=262 (2)", - "tab": "General information", - "score": 131.0 - }, - "Human Sexuality - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Sexuality - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # prompt tokens": { - "description": "min=341.168, mean=341.168, max=341.168, sum=682.336 (2)", - "tab": "General information", - "score": 341.1679389312977 - }, - "Human Sexuality - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" - } - } - }, - { - "evaluation_name": "International Law", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on International Law", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.95, - "details": { - "description": "min=0.95, mean=0.95, max=0.95, sum=1.901 (2)", - "tab": "Accuracy", - "International Law - Observed inference time (s)": { - "description": "min=0.789, mean=0.789, max=0.789, 
sum=1.579 (2)", - "tab": "Efficiency", - "score": 0.7894663180201507 - }, - "International Law - # eval": { - "description": "min=121, mean=121, max=121, sum=242 (2)", - "tab": "General information", - "score": 121.0 - }, - "International Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "International Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "International Law - # prompt tokens": { - "description": "min=639.818, mean=639.818, max=639.818, sum=1279.636 (2)", - "tab": "General information", - "score": 639.8181818181819 - }, - "International Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" - } - } - }, - { - "evaluation_name": "Logical Fallacies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Logical Fallacies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.92, - "details": { - "description": "min=0.92, mean=0.92, max=0.92, sum=1.84 (2)", - "tab": "Accuracy", - "Logical Fallacies - Observed inference time (s)": { - "description": "min=0.923, mean=0.923, max=0.923, sum=1.847 (2)", - "tab": "Efficiency", - "score": 0.9234680895425059 - }, - "Logical Fallacies - # eval": { - "description": "min=163, mean=163, max=163, sum=326 (2)", - "tab": "General information", - "score": 163.0 - }, - "Logical Fallacies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Logical Fallacies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Logical Fallacies - # prompt tokens": { - "description": "min=449.564, mean=449.564, max=449.564, sum=899.129 (2)", - "tab": "General information", - "score": 449.5644171779141 - }, - "Logical Fallacies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" - } - } - }, - { - "evaluation_name": "Machine Learning", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Machine Learning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.795, - "details": { - "description": "min=0.795, mean=0.795, max=0.795, sum=1.589 (2)", - "tab": "Accuracy", - "Machine Learning - Observed inference time (s)": { - "description": "min=1.077, mean=1.077, max=1.077, sum=2.154 (2)", - "tab": "Efficiency", - "score": 1.0769924351147242 - }, - "Machine Learning - # eval": { - 
"description": "min=112, mean=112, max=112, sum=224 (2)", - "tab": "General information", - "score": 112.0 - }, - "Machine Learning - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Machine Learning - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Machine Learning - # prompt tokens": { - "description": "min=668.054, mean=668.054, max=668.054, sum=1336.107 (2)", - "tab": "General information", - "score": 668.0535714285714 - }, - "Machine Learning - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" - } - } - }, - { - "evaluation_name": "Management", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Management", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.893, - "details": { - "description": "min=0.893, mean=0.893, max=0.893, sum=1.786 (2)", - "tab": "Accuracy", - "Management - Observed inference time (s)": { - "description": "min=0.505, mean=0.505, max=0.505, sum=1.009 (2)", - "tab": "Efficiency", - "score": 0.5047070956924586 - }, - "Management - # eval": { - "description": "min=103, mean=103, max=103, sum=206 (2)", - "tab": "General information", - "score": 103.0 - }, - "Management - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Management - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Management - # prompt tokens": { - "description": "min=283.786, mean=283.786, max=283.786, sum=567.573 (2)", - "tab": "General information", - "score": 283.7864077669903 - }, - "Management - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" - } - } - }, - { - "evaluation_name": "Marketing", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Marketing", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.962, - "details": { - "description": "min=0.962, mean=0.962, max=0.962, sum=1.923 (2)", - "tab": "Accuracy", - "Marketing - Observed inference time (s)": { - "description": "min=0.617, mean=0.617, max=0.617, sum=1.234 (2)", - "tab": "Efficiency", - "score": 0.6168569010547084 - }, - "Marketing - # eval": { - "description": "min=234, mean=234, max=234, sum=468 (2)", - "tab": "General information", - "score": 234.0 - }, - "Marketing - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General 
information", - "score": 5.0 - }, - "Marketing - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Marketing - # prompt tokens": { - "description": "min=404.218, mean=404.218, max=404.218, sum=808.436 (2)", - "tab": "General information", - "score": 404.21794871794873 - }, - "Marketing - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" - } - } - }, - { - "evaluation_name": "Medical Genetics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Medical Genetics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.93, - "details": { - "description": "min=0.93, mean=0.93, max=0.93, sum=1.86 (2)", - "tab": "Accuracy", - "Medical Genetics - Observed inference time (s)": { - "description": "min=0.725, mean=0.725, max=0.725, sum=1.45 (2)", - "tab": "Efficiency", - "score": 0.7251019191741943 - }, - "Medical Genetics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Medical Genetics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Medical Genetics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Medical Genetics - # prompt tokens": { - "description": "min=340.99, mean=340.99, max=340.99, sum=681.98 (2)", - "tab": "General information", - "score": 340.99 - }, - "Medical Genetics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" - } - } - }, - { - "evaluation_name": "Miscellaneous", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Miscellaneous", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.939, - "details": { - "description": "min=0.939, mean=0.939, max=0.939, sum=1.877 (2)", - "tab": "Accuracy", - "Miscellaneous - Observed inference time (s)": { - "description": "min=0.844, mean=0.844, max=0.844, sum=1.689 (2)", - "tab": "Efficiency", - "score": 0.8444620089208181 - }, - "Miscellaneous - # eval": { - "description": "min=783, mean=783, max=783, sum=1566 (2)", - "tab": "General information", - "score": 783.0 - }, - "Miscellaneous - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Miscellaneous - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Miscellaneous - # 
prompt tokens": { - "description": "min=299.911, mean=299.911, max=299.911, sum=599.821 (2)", - "tab": "General information", - "score": 299.9106002554278 - }, - "Miscellaneous - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" - } - } - }, - { - "evaluation_name": "Moral Scenarios", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Moral Scenarios", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.876, - "details": { - "description": "min=0.876, mean=0.876, max=0.876, sum=1.752 (2)", - "tab": "Accuracy", - "Moral Disputes - Observed inference time (s)": { - "description": "min=1.366, mean=1.366, max=1.366, sum=2.732 (2)", - "tab": "Efficiency", - "score": 1.3659538754148979 - }, - "Moral Scenarios - Observed inference time (s)": { - "description": "min=0.749, mean=0.749, max=0.749, sum=1.498 (2)", - "tab": "Efficiency", - "score": 0.7492334496375569 - }, - "Moral Disputes - # eval": { - "description": "min=346, mean=346, max=346, sum=692 (2)", - "tab": "General information", - "score": 346.0 - }, - "Moral Disputes - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Disputes - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Disputes - # prompt tokens": { - "description": "min=476.113, mean=476.113, max=476.113, sum=952.225 (2)", - "tab": "General information", - "score": 476.1127167630058 - }, - "Moral Disputes - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Moral Scenarios - # eval": { - "description": "min=895, mean=895, max=895, sum=1790 (2)", - "tab": "General information", - "score": 895.0 - }, - "Moral Scenarios - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Scenarios - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # prompt tokens": { - "description": "min=656.455, mean=656.455, max=656.455, sum=1312.909 (2)", - "tab": "General information", - "score": 656.454748603352 - }, - "Moral Scenarios - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" - } - } - }, - { - "evaluation_name": "Nutrition", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Nutrition", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.928, - "details": { - "description": "min=0.928, mean=0.928, max=0.928, sum=1.856 (2)", - "tab": "Accuracy", - "Nutrition - Observed inference time (s)": { - "description": "min=1.217, mean=1.217, max=1.217, sum=2.433 (2)", - "tab": "Efficiency", - "score": 1.2165828491348067 - }, - "Nutrition - # eval": { - "description": "min=306, mean=306, max=306, sum=612 (2)", - "tab": "General information", - "score": 306.0 - }, - "Nutrition - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Nutrition - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Nutrition - # prompt tokens": { - "description": "min=586.814, mean=586.814, max=586.814, sum=1173.627 (2)", - "tab": "General information", - "score": 586.8137254901961 - }, - "Nutrition - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" - } - } - }, - { - "evaluation_name": "Prehistory", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Prehistory", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.929, - "details": { - "description": "min=0.929, mean=0.929, max=0.929, sum=1.858 (2)", - "tab": "Accuracy", - "Prehistory - Observed inference time (s)": { - "description": "min=0.591, mean=0.591, max=0.591, sum=1.182 (2)", - "tab": "Efficiency", - "score": 0.5911465375511734 - }, - "Prehistory - # eval": { - "description": "min=324, mean=324, max=324, sum=648 (2)", - "tab": "General information", - "score": 324.0 - }, - "Prehistory - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Prehistory - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Prehistory - # prompt tokens": { - "description": "min=514.528, mean=514.528, max=514.528, sum=1029.056 (2)", - "tab": "General information", - "score": 514.5277777777778 - }, - "Prehistory - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" - } - } - }, - { - "evaluation_name": "Public Relations", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Public Relations", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.818, - "details": { - "description": "min=0.818, mean=0.818, max=0.818, sum=1.636 (2)", - "tab": "Accuracy", - "Public Relations - Observed inference time (s)": { - "description": "min=1.129, mean=1.129, 
max=1.129, sum=2.258 (2)", - "tab": "Efficiency", - "score": 1.12924514467066 - }, - "Public Relations - # eval": { - "description": "min=110, mean=110, max=110, sum=220 (2)", - "tab": "General information", - "score": 110.0 - }, - "Public Relations - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Public Relations - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Public Relations - # prompt tokens": { - "description": "min=405.318, mean=405.318, max=405.318, sum=810.636 (2)", - "tab": "General information", - "score": 405.3181818181818 - }, - "Public Relations - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" - } - } - }, - { - "evaluation_name": "Security Studies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Security Studies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.857, - "details": { - "description": "min=0.857, mean=0.857, max=0.857, sum=1.714 (2)", - "tab": "Accuracy", - "Security Studies - Observed inference time (s)": { - "description": "min=0.734, mean=0.734, max=0.734, sum=1.468 (2)", - "tab": "Efficiency", - "score": 0.7342344303520358 - }, - "Security Studies - # eval": { - "description": "min=245, mean=245, max=245, sum=490 (2)", - "tab": "General information", - "score": 245.0 - }, - "Security Studies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Security Studies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Security Studies - # prompt tokens": { - "description": "min=1164.473, mean=1164.473, max=1164.473, sum=2328.947 (2)", - "tab": "General information", - "score": 1164.4734693877551 - }, - "Security Studies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" - } - } - }, - { - "evaluation_name": "Sociology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Sociology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.94, - "details": { - "description": "min=0.94, mean=0.94, max=0.94, sum=1.881 (2)", - "tab": "Accuracy", - "Sociology - Observed inference time (s)": { - "description": "min=0.583, mean=0.583, max=0.583, sum=1.166 (2)", - "tab": "Efficiency", - "score": 0.5830918010787585 - }, - "Sociology - # eval": { - "description": "min=201, mean=201, 
max=201, sum=402 (2)", - "tab": "General information", - "score": 201.0 - }, - "Sociology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Sociology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Sociology - # prompt tokens": { - "description": "min=445.517, mean=445.517, max=445.517, sum=891.035 (2)", - "tab": "General information", - "score": 445.51741293532336 - }, - "Sociology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" - } - } - }, - { - "evaluation_name": "Virology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Virology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.572, - "details": { - "description": "min=0.572, mean=0.572, max=0.572, sum=1.145 (2)", - "tab": "Accuracy", - "Virology - Observed inference time (s)": { - "description": "min=0.483, mean=0.483, max=0.483, sum=0.967 (2)", - "tab": "Efficiency", - "score": 0.4834072029734232 - }, - "Virology - # eval": { - "description": "min=166, mean=166, max=166, sum=332 (2)", - "tab": "General information", - "score": 166.0 - }, - "Virology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Virology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Virology - # prompt tokens": { - "description": "min=343.018, mean=343.018, max=343.018, sum=686.036 (2)", - "tab": "General information", - "score": 343.01807228915663 - }, - "Virology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" - } - } - }, - { - "evaluation_name": "World Religions", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on World Religions", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.906, - "details": { - "description": "min=0.906, mean=0.906, max=0.906, sum=1.813 (2)", - "tab": "Accuracy", - "World Religions - Observed inference time (s)": { - "description": "min=0.484, mean=0.484, max=0.484, sum=0.967 (2)", - "tab": "Efficiency", - "score": 0.48364103328414826 - }, - "World Religions - # eval": { - "description": "min=171, mean=171, max=171, sum=342 (2)", - "tab": "General information", - "score": 171.0 - }, - "World Religions - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "World Religions - truncated": { 
- "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "World Religions - # prompt tokens": { - "description": "min=274.52, mean=274.52, max=274.52, sum=549.041 (2)", - "tab": "General information", - "score": 274.5204678362573 - }, - "World Religions - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" - } - } - }, - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.33, - "details": { - "tab": "Efficiency" - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_mmlu/meta/llama-3.1-70b-instruct-turbo/41af381a-3637-4578-a582-59d9b1327d95.json b/data/helm_mmlu/meta/llama-3.1-70b-instruct-turbo/41af381a-3637-4578-a582-59d9b1327d95.json deleted file mode 100644 index 0e4b849f9..000000000 --- a/data/helm_mmlu/meta/llama-3.1-70b-instruct-turbo/41af381a-3637-4578-a582-59d9b1327d95.json +++ /dev/null @@ -1,3021 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/meta_llama-3.1-70b-instruct-turbo/1770835937.459157", - "retrieved_timestamp": "1770835937.459157", - "source_metadata": { - "source_name": "helm_mmlu", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama 3.1 Instruct Turbo 70B", - "id": "meta/llama-3.1-70b-instruct-turbo", - "developer": "meta", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "MMLU All Subjects", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU All Subjects", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.801, - "details": { - "description": "min=0.404, mean=0.801, max=0.984, sum=91.318 (114)", - "tab": "Accuracy", - "MMLU All Subjects - Observed inference time (s)": { - "description": "min=2.517, mean=5.993, max=45.251, sum=683.146 (114)", - "tab": "Efficiency", - "score": 5.992510112833335 - }, - "MMLU All Subjects - # eval": { - "description": "min=100, mean=246.351, max=1534, sum=28084 (114)", - "tab": "General information", - "score": 246.35087719298247 - }, - "MMLU All Subjects - # train": { - "description": "min=5, mean=5, max=5, sum=570 (114)", - "tab": "General information", - "score": 5.0 - }, - "MMLU All Subjects - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (114)", - "tab": "General information", - "score": 0.0 - }, - "MMLU All Subjects - # prompt tokens": { - "description": "min=274.52, mean=614.619, max=2797.885, sum=70066.61 (114)", - "tab": "General 
information", - "score": 614.6193817308517 - }, - "MMLU All Subjects - # output tokens": { - "description": "min=1, mean=1, max=1, sum=114 (114)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] - } - } - }, - { - "evaluation_name": "Abstract Algebra", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - 
}, - "metric_config": { - "evaluation_description": "EM on Abstract Algebra", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.55, - "details": { - "description": "min=0.55, mean=0.55, max=0.55, sum=1.1 (2)", - "tab": "Accuracy", - "Abstract Algebra - Observed inference time (s)": { - "description": "min=45.251, mean=45.251, max=45.251, sum=90.501 (2)", - "tab": "Efficiency", - "score": 45.250502264499666 - }, - "Abstract Algebra - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Abstract Algebra - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Abstract Algebra - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Abstract Algebra - # prompt tokens": { - "description": "min=373.43, mean=373.43, max=373.43, sum=746.86 (2)", - "tab": "General information", - "score": 373.43 - }, - "Abstract Algebra - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" - } - } - }, - { - "evaluation_name": "Anatomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Anatomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8, - "details": { - "description": "min=0.8, mean=0.8, max=0.8, sum=1.6 (2)", - "tab": "Accuracy", - "Anatomy - Observed inference time (s)": { - "description": "min=36.973, mean=36.973, max=36.973, sum=73.946 (2)", - "tab": "Efficiency", - "score": 36.97310272499367 - }, - "Anatomy - # eval": { - "description": "min=135, mean=135, max=135, sum=270 (2)", - "tab": "General information", - "score": 135.0 - }, - "Anatomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Anatomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Anatomy - # prompt tokens": { - "description": "min=353.874, mean=353.874, max=353.874, sum=707.748 (2)", - "tab": "General information", - "score": 353.8740740740741 - }, - "Anatomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" - } - } - }, - { - "evaluation_name": "College Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on College Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.559, - "details": { - 
"description": "min=0.559, mean=0.559, max=0.559, sum=1.118 (2)", - "tab": "Accuracy", - "College Chemistry - Observed inference time (s)": { - "description": "min=4.774, mean=4.774, max=4.774, sum=9.548 (2)", - "tab": "Efficiency", - "score": 4.774094069004059 - }, - "College Biology - Observed inference time (s)": { - "description": "min=4.993, mean=4.993, max=4.993, sum=9.986 (2)", - "tab": "Efficiency", - "score": 4.992929225166638 - }, - "College Computer Science - Observed inference time (s)": { - "description": "min=4.499, mean=4.499, max=4.499, sum=8.999 (2)", - "tab": "Efficiency", - "score": 4.499426193237305 - }, - "College Mathematics - Observed inference time (s)": { - "description": "min=4.479, mean=4.479, max=4.479, sum=8.957 (2)", - "tab": "Efficiency", - "score": 4.478512156009674 - }, - "College Medicine - Observed inference time (s)": { - "description": "min=3.886, mean=3.886, max=3.886, sum=7.773 (2)", - "tab": "Efficiency", - "score": 3.886489330688653 - }, - "College Physics - Observed inference time (s)": { - "description": "min=3.274, mean=3.274, max=3.274, sum=6.548 (2)", - "tab": "Efficiency", - "score": 3.2739863746306477 - }, - "College Chemistry - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Chemistry - # prompt tokens": { - "description": "min=549.28, mean=549.28, max=549.28, sum=1098.56 (2)", - "tab": "General information", - "score": 549.28 - }, - "College Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Biology - # eval": { - "description": "min=144, mean=144, max=144, sum=288 (2)", - "tab": "General information", - "score": 144.0 - }, - "College Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # prompt tokens": { - "description": "min=473.875, mean=473.875, max=473.875, sum=947.75 (2)", - "tab": "General information", - "score": 473.875 - }, - "College Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer Science - # prompt tokens": { - "description": "min=828.29, mean=828.29, max=828.29, sum=1656.58 (2)", - "tab": "General information", - "score": 828.29 - }, - "College Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Mathematics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General 
information", - "score": 100.0 - }, - "College Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # prompt tokens": { - "description": "min=594.51, mean=594.51, max=594.51, sum=1189.02 (2)", - "tab": "General information", - "score": 594.51 - }, - "College Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Medicine - # eval": { - "description": "min=173, mean=173, max=173, sum=346 (2)", - "tab": "General information", - "score": 173.0 - }, - "College Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Medicine - # prompt tokens": { - "description": "min=502.705, mean=502.705, max=502.705, sum=1005.41 (2)", - "tab": "General information", - "score": 502.70520231213874 - }, - "College Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Physics - # eval": { - "description": "min=102, mean=102, max=102, sum=204 (2)", - "tab": "General information", - "score": 102.0 - }, - "College Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Physics - # prompt tokens": { - "description": "min=503.569, mean=503.569, max=503.569, sum=1007.137 (2)", - "tab": "General information", - "score": 503.5686274509804 - }, - "College Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" - } - } - }, - { - "evaluation_name": "Computer Security", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Computer Security", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8, - "details": { - "description": "min=0.8, mean=0.8, max=0.8, sum=1.6 (2)", - "tab": "Accuracy", - "Computer Security - Observed inference time (s)": { - "description": "min=2.976, mean=2.976, max=2.976, sum=5.951 (2)", - "tab": "Efficiency", - "score": 2.9756615567207336 - }, - "Computer Security - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Computer Security - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Computer Security - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Computer Security - # 
prompt tokens": { - "description": "min=378.51, mean=378.51, max=378.51, sum=757.02 (2)", - "tab": "General information", - "score": 378.51 - }, - "Computer Security - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" - } - } - }, - { - "evaluation_name": "Econometrics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Econometrics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.675, - "details": { - "description": "min=0.675, mean=0.675, max=0.675, sum=1.351 (2)", - "tab": "Accuracy", - "Econometrics - Observed inference time (s)": { - "description": "min=4.295, mean=4.295, max=4.295, sum=8.59 (2)", - "tab": "Efficiency", - "score": 4.29522921327959 - }, - "Econometrics - # eval": { - "description": "min=114, mean=114, max=114, sum=228 (2)", - "tab": "General information", - "score": 114.0 - }, - "Econometrics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Econometrics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Econometrics - # prompt tokens": { - "description": "min=614.421, mean=614.421, max=614.421, sum=1228.842 (2)", - "tab": "General information", - "score": 614.421052631579 - }, - "Econometrics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" - } - } - }, - { - "evaluation_name": "Global Facts", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Global Facts", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.61, - "details": { - "description": "min=0.61, mean=0.61, max=0.61, sum=1.22 (2)", - "tab": "Accuracy", - "Global Facts - Observed inference time (s)": { - "description": "min=3.637, mean=3.637, max=3.637, sum=7.275 (2)", - "tab": "Efficiency", - "score": 3.637417833805084 - }, - "Global Facts - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Global Facts - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Global Facts - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Global Facts - # prompt tokens": { - "description": "min=399.71, mean=399.71, max=399.71, sum=799.42 (2)", - "tab": "General information", - "score": 399.71 - }, - "Global Facts - # output tokens": { - "description": "min=1, mean=1, 
max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" - } - } - }, - { - "evaluation_name": "Jurisprudence", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Jurisprudence", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.889, - "details": { - "description": "min=0.889, mean=0.889, max=0.889, sum=1.778 (2)", - "tab": "Accuracy", - "Jurisprudence - Observed inference time (s)": { - "description": "min=3.163, mean=3.163, max=3.163, sum=6.326 (2)", - "tab": "Efficiency", - "score": 3.1630651178183378 - }, - "Jurisprudence - # eval": { - "description": "min=108, mean=108, max=108, sum=216 (2)", - "tab": "General information", - "score": 108.0 - }, - "Jurisprudence - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Jurisprudence - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Jurisprudence - # prompt tokens": { - "description": "min=394.63, mean=394.63, max=394.63, sum=789.259 (2)", - "tab": "General information", - "score": 394.6296296296296 - }, - "Jurisprudence - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" - } - } - }, - { - "evaluation_name": "Philosophy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Philosophy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.833, - "details": { - "description": "min=0.833, mean=0.833, max=0.833, sum=1.666 (2)", - "tab": "Accuracy", - "Philosophy - Observed inference time (s)": { - "description": "min=3.264, mean=3.264, max=3.264, sum=6.527 (2)", - "tab": "Efficiency", - "score": 3.2637280957875143 - }, - "Philosophy - # eval": { - "description": "min=311, mean=311, max=311, sum=622 (2)", - "tab": "General information", - "score": 311.0 - }, - "Philosophy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Philosophy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Philosophy - # prompt tokens": { - "description": "min=329.084, mean=329.084, max=329.084, sum=658.167 (2)", - "tab": "General information", - "score": 329.08360128617363 - }, - "Philosophy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": 
"test", - "groups": "mmlu_philosophy" - } - } - }, - { - "evaluation_name": "Professional Psychology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Professional Psychology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.846, - "details": { - "description": "min=0.846, mean=0.846, max=0.846, sum=1.693 (2)", - "tab": "Accuracy", - "Professional Medicine - Observed inference time (s)": { - "description": "min=3.871, mean=3.871, max=3.871, sum=7.742 (2)", - "tab": "Efficiency", - "score": 3.8712061214096405 - }, - "Professional Accounting - Observed inference time (s)": { - "description": "min=2.943, mean=2.943, max=2.943, sum=5.886 (2)", - "tab": "Efficiency", - "score": 2.9428400173254894 - }, - "Professional Law - Observed inference time (s)": { - "description": "min=3.318, mean=3.318, max=3.318, sum=6.637 (2)", - "tab": "Efficiency", - "score": 3.318323635681978 - }, - "Professional Psychology - Observed inference time (s)": { - "description": "min=3.102, mean=3.102, max=3.102, sum=6.203 (2)", - "tab": "Efficiency", - "score": 3.1015563872125416 - }, - "Professional Medicine - # eval": { - "description": "min=272, mean=272, max=272, sum=544 (2)", - "tab": "General information", - "score": 272.0 - }, - "Professional Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Medicine - # prompt tokens": { - "description": "min=1094.489, mean=1094.489, max=1094.489, sum=2188.978 (2)", - "tab": "General information", - "score": 1094.4889705882354 - }, - "Professional Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Accounting - # eval": { - "description": "min=282, mean=282, max=282, sum=564 (2)", - "tab": "General information", - "score": 282.0 - }, - "Professional Accounting - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Accounting - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Accounting - # prompt tokens": { - "description": "min=658.585, mean=658.585, max=658.585, sum=1317.17 (2)", - "tab": "General information", - "score": 658.5851063829788 - }, - "Professional Accounting - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Law - # eval": { - "description": "min=1534, mean=1534, max=1534, sum=3068 (2)", - "tab": "General information", - "score": 1534.0 - }, - "Professional Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # prompt tokens": { - "description": "min=1637.601, mean=1637.601, max=1637.601, sum=3275.202 (2)", - "tab": "General information", - "score": 
1637.6010430247718 - }, - "Professional Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Psychology - # eval": { - "description": "min=612, mean=612, max=612, sum=1224 (2)", - "tab": "General information", - "score": 612.0 - }, - "Professional Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # prompt tokens": { - "description": "min=575.098, mean=575.098, max=575.098, sum=1150.196 (2)", - "tab": "General information", - "score": 575.0980392156863 - }, - "Professional Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" - } - } - }, - { - "evaluation_name": "Us Foreign Policy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Us Foreign Policy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.93, - "details": { - "description": "min=0.93, mean=0.93, max=0.93, sum=1.86 (2)", - "tab": "Accuracy", - "Us Foreign Policy - Observed inference time (s)": { - "description": "min=2.836, mean=2.836, max=2.836, sum=5.672 (2)", - "tab": "Efficiency", - "score": 2.835986142158508 - }, - "Us Foreign Policy - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Us Foreign Policy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Us Foreign Policy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Us Foreign Policy - # prompt tokens": { - "description": "min=422.79, mean=422.79, max=422.79, sum=845.58 (2)", - "tab": "General information", - "score": 422.79 - }, - "Us Foreign Policy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" - } - } - }, - { - "evaluation_name": "Astronomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Astronomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.908, - "details": { - "description": "min=0.908, mean=0.908, max=0.908, sum=1.816 (2)", - "tab": "Accuracy", - "Astronomy - Observed inference time (s)": { - "description": "min=33.307, mean=33.307, max=33.307, 
sum=66.613 (2)", - "tab": "Efficiency", - "score": 33.3065683904447 - }, - "Astronomy - # eval": { - "description": "min=152, mean=152, max=152, sum=304 (2)", - "tab": "General information", - "score": 152.0 - }, - "Astronomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Astronomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Astronomy - # prompt tokens": { - "description": "min=579.684, mean=579.684, max=579.684, sum=1159.368 (2)", - "tab": "General information", - "score": 579.6842105263158 - }, - "Astronomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" - } - } - }, - { - "evaluation_name": "Business Ethics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Business Ethics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.72, - "details": { - "description": "min=0.72, mean=0.72, max=0.72, sum=1.44 (2)", - "tab": "Accuracy", - "Business Ethics - Observed inference time (s)": { - "description": "min=34.272, mean=34.272, max=34.272, sum=68.544 (2)", - "tab": "Efficiency", - "score": 34.27190991640091 - }, - "Business Ethics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Business Ethics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Business Ethics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Business Ethics - # prompt tokens": { - "description": "min=569.52, mean=569.52, max=569.52, sum=1139.04 (2)", - "tab": "General information", - "score": 569.52 - }, - "Business Ethics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" - } - } - }, - { - "evaluation_name": "Clinical Knowledge", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Clinical Knowledge", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.845, - "details": { - "description": "min=0.845, mean=0.845, max=0.845, sum=1.691 (2)", - "tab": "Accuracy", - "Clinical Knowledge - Observed inference time (s)": { - "description": "min=6.181, mean=6.181, max=6.181, sum=12.362 (2)", - "tab": "Efficiency", - "score": 6.18122723057585 - }, - "Clinical Knowledge - # eval": { - "description": "min=265, mean=265, max=265, sum=530 (2)", - "tab": "General 
information", - "score": 265.0 - }, - "Clinical Knowledge - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Clinical Knowledge - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Clinical Knowledge - # prompt tokens": { - "description": "min=397.928, mean=397.928, max=397.928, sum=795.857 (2)", - "tab": "General information", - "score": 397.92830188679244 - }, - "Clinical Knowledge - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" - } - } - }, - { - "evaluation_name": "Conceptual Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Conceptual Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.834, - "details": { - "description": "min=0.834, mean=0.834, max=0.834, sum=1.668 (2)", - "tab": "Accuracy", - "Conceptual Physics - Observed inference time (s)": { - "description": "min=3.413, mean=3.413, max=3.413, sum=6.825 (2)", - "tab": "Efficiency", - "score": 3.412742525465945 - }, - "Conceptual Physics - # eval": { - "description": "min=235, mean=235, max=235, sum=470 (2)", - "tab": "General information", - "score": 235.0 - }, - "Conceptual Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Conceptual Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Conceptual Physics - # prompt tokens": { - "description": "min=304.834, mean=304.834, max=304.834, sum=609.668 (2)", - "tab": "General information", - "score": 304.83404255319147 - }, - "Conceptual Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" - } - } - }, - { - "evaluation_name": "Electrical Engineering", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Electrical Engineering", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.745, - "details": { - "description": "min=0.745, mean=0.745, max=0.745, sum=1.49 (2)", - "tab": "Accuracy", - "Electrical Engineering - Observed inference time (s)": { - "description": "min=4.146, mean=4.146, max=4.146, sum=8.292 (2)", - "tab": "Efficiency", - "score": 4.1461473415637835 - }, - "Electrical Engineering - # eval": { - "description": "min=145, mean=145, max=145, sum=290 (2)", - "tab": "General information", - "score": 145.0 - }, - "Electrical Engineering - # 
train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Electrical Engineering - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Electrical Engineering - # prompt tokens": { - "description": "min=435.607, mean=435.607, max=435.607, sum=871.214 (2)", - "tab": "General information", - "score": 435.60689655172416 - }, - "Electrical Engineering - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" - } - } - }, - { - "evaluation_name": "Elementary Mathematics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Elementary Mathematics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.701, - "details": { - "description": "min=0.701, mean=0.701, max=0.701, sum=1.402 (2)", - "tab": "Accuracy", - "Elementary Mathematics - Observed inference time (s)": { - "description": "min=4.13, mean=4.13, max=4.13, sum=8.261 (2)", - "tab": "Efficiency", - "score": 4.1303687221789485 - }, - "Elementary Mathematics - # eval": { - "description": "min=378, mean=378, max=378, sum=756 (2)", - "tab": "General information", - "score": 378.0 - }, - "Elementary Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Elementary Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Elementary Mathematics - # prompt tokens": { - "description": "min=531.854, mean=531.854, max=531.854, sum=1063.709 (2)", - "tab": "General information", - "score": 531.8544973544973 - }, - "Elementary Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" - } - } - }, - { - "evaluation_name": "Formal Logic", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Formal Logic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.675, - "details": { - "description": "min=0.675, mean=0.675, max=0.675, sum=1.349 (2)", - "tab": "Accuracy", - "Formal Logic - Observed inference time (s)": { - "description": "min=3.65, mean=3.65, max=3.65, sum=7.301 (2)", - "tab": "Efficiency", - "score": 3.6502806383465964 - }, - "Formal Logic - # eval": { - "description": "min=126, mean=126, max=126, sum=252 (2)", - "tab": "General information", - "score": 126.0 - }, - "Formal Logic - # train": { - "description": "min=5, mean=5, max=5, sum=10 
(2)", - "tab": "General information", - "score": 5.0 - }, - "Formal Logic - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Formal Logic - # prompt tokens": { - "description": "min=601.778, mean=601.778, max=601.778, sum=1203.556 (2)", - "tab": "General information", - "score": 601.7777777777778 - }, - "Formal Logic - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" - } - } - }, - { - "evaluation_name": "High School World History", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on High School World History", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.937, - "details": { - "description": "min=0.937, mean=0.937, max=0.937, sum=1.873 (2)", - "tab": "Accuracy", - "High School Biology - Observed inference time (s)": { - "description": "min=4.179, mean=4.179, max=4.179, sum=8.357 (2)", - "tab": "Efficiency", - "score": 4.178504861554792 - }, - "High School Chemistry - Observed inference time (s)": { - "description": "min=3.78, mean=3.78, max=3.78, sum=7.56 (2)", - "tab": "Efficiency", - "score": 3.779934604766921 - }, - "High School Computer Science - Observed inference time (s)": { - "description": "min=4.276, mean=4.276, max=4.276, sum=8.553 (2)", - "tab": "Efficiency", - "score": 4.276434569358826 - }, - "High School European History - Observed inference time (s)": { - "description": "min=4.728, mean=4.728, max=4.728, sum=9.457 (2)", - "tab": "Efficiency", - "score": 4.7283261154637195 - }, - "High School Geography - Observed inference time (s)": { - "description": "min=3.994, mean=3.994, max=3.994, sum=7.987 (2)", - "tab": "Efficiency", - "score": 3.993738304484974 - }, - "High School Government And Politics - Observed inference time (s)": { - "description": "min=4.056, mean=4.056, max=4.056, sum=8.111 (2)", - "tab": "Efficiency", - "score": 4.055596974229566 - }, - "High School Macroeconomics - Observed inference time (s)": { - "description": "min=4.06, mean=4.06, max=4.06, sum=8.12 (2)", - "tab": "Efficiency", - "score": 4.059808598420559 - }, - "High School Mathematics - Observed inference time (s)": { - "description": "min=4.211, mean=4.211, max=4.211, sum=8.422 (2)", - "tab": "Efficiency", - "score": 4.210984716592011 - }, - "High School Microeconomics - Observed inference time (s)": { - "description": "min=3.869, mean=3.869, max=3.869, sum=7.738 (2)", - "tab": "Efficiency", - "score": 3.8690204860783424 - }, - "High School Physics - Observed inference time (s)": { - "description": "min=3.802, mean=3.802, max=3.802, sum=7.604 (2)", - "tab": "Efficiency", - "score": 3.801914532453019 - }, - "High School Psychology - Observed inference time (s)": { - "description": "min=3.897, mean=3.897, max=3.897, sum=7.793 (2)", - "tab": "Efficiency", - "score": 3.8966542169588423 - }, - "High School Statistics - Observed inference time (s)": { - "description": "min=3.5, mean=3.5, max=3.5, sum=6.999 (2)", - "tab": "Efficiency", - "score": 3.499593519502216 - }, - "High School 
US History - Observed inference time (s)": { - "description": "min=3.948, mean=3.948, max=3.948, sum=7.897 (2)", - "tab": "Efficiency", - "score": 3.948316371908375 - }, - "High School World History - Observed inference time (s)": { - "description": "min=3.316, mean=3.316, max=3.316, sum=6.632 (2)", - "tab": "Efficiency", - "score": 3.3161907819755974 - }, - "High School Biology - # eval": { - "description": "min=310, mean=310, max=310, sum=620 (2)", - "tab": "General information", - "score": 310.0 - }, - "High School Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Biology - # prompt tokens": { - "description": "min=513.671, mean=513.671, max=513.671, sum=1027.342 (2)", - "tab": "General information", - "score": 513.6709677419354 - }, - "High School Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Chemistry - # eval": { - "description": "min=203, mean=203, max=203, sum=406 (2)", - "tab": "General information", - "score": 203.0 - }, - "High School Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # prompt tokens": { - "description": "min=496.704, mean=496.704, max=496.704, sum=993.409 (2)", - "tab": "General information", - "score": 496.70443349753697 - }, - "High School Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "High School Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # prompt tokens": { - "description": "min=867.78, mean=867.78, max=867.78, sum=1735.56 (2)", - "tab": "General information", - "score": 867.78 - }, - "High School Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School European History - # eval": { - "description": "min=165, mean=165, max=165, sum=330 (2)", - "tab": "General information", - "score": 165.0 - }, - "High School European History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School European History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School European History - # prompt tokens": { - "description": "min=2797.885, mean=2797.885, max=2797.885, sum=5595.77 (2)", - "tab": "General information", - "score": 2797.8848484848486 - }, - "High School European History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School 
Geography - # eval": { - "description": "min=198, mean=198, max=198, sum=396 (2)", - "tab": "General information", - "score": 198.0 - }, - "High School Geography - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Geography - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Geography - # prompt tokens": { - "description": "min=372.035, mean=372.035, max=372.035, sum=744.071 (2)", - "tab": "General information", - "score": 372.0353535353535 - }, - "High School Geography - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Government And Politics - # eval": { - "description": "min=193, mean=193, max=193, sum=386 (2)", - "tab": "General information", - "score": 193.0 - }, - "High School Government And Politics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Government And Politics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # prompt tokens": { - "description": "min=465.824, mean=465.824, max=465.824, sum=931.648 (2)", - "tab": "General information", - "score": 465.8238341968912 - }, - "High School Government And Politics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Macroeconomics - # eval": { - "description": "min=390, mean=390, max=390, sum=780 (2)", - "tab": "General information", - "score": 390.0 - }, - "High School Macroeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Macroeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - # prompt tokens": { - "description": "min=370.908, mean=370.908, max=370.908, sum=741.815 (2)", - "tab": "General information", - "score": 370.9076923076923 - }, - "High School Macroeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Mathematics - # eval": { - "description": "min=270, mean=270, max=270, sum=540 (2)", - "tab": "General information", - "score": 270.0 - }, - "High School Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # prompt tokens": { - "description": "min=532.356, mean=532.356, max=532.356, sum=1064.711 (2)", - "tab": "General information", - "score": 532.3555555555556 - }, - "High School Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Microeconomics - # eval": { - "description": "min=238, mean=238, max=238, sum=476 (2)", - "tab": "General information", - "score": 238.0 - }, - "High School Microeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Microeconomics - 
truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Microeconomics - # prompt tokens": { - "description": "min=399.013, mean=399.013, max=399.013, sum=798.025 (2)", - "tab": "General information", - "score": 399.0126050420168 - }, - "High School Microeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Physics - # eval": { - "description": "min=151, mean=151, max=151, sum=302 (2)", - "tab": "General information", - "score": 151.0 - }, - "High School Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # prompt tokens": { - "description": "min=560.457, mean=560.457, max=560.457, sum=1120.914 (2)", - "tab": "General information", - "score": 560.4569536423841 - }, - "High School Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Psychology - # eval": { - "description": "min=545, mean=545, max=545, sum=1090 (2)", - "tab": "General information", - "score": 545.0 - }, - "High School Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # prompt tokens": { - "description": "min=495.242, mean=495.242, max=495.242, sum=990.484 (2)", - "tab": "General information", - "score": 495.2422018348624 - }, - "High School Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Statistics - # eval": { - "description": "min=216, mean=216, max=216, sum=432 (2)", - "tab": "General information", - "score": 216.0 - }, - "High School Statistics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Statistics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Statistics - # prompt tokens": { - "description": "min=795.639, mean=795.639, max=795.639, sum=1591.278 (2)", - "tab": "General information", - "score": 795.6388888888889 - }, - "High School Statistics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School US History - # eval": { - "description": "min=204, mean=204, max=204, sum=408 (2)", - "tab": "General information", - "score": 204.0 - }, - "High School US History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School US History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # prompt tokens": { - "description": "min=2217.809, mean=2217.809, max=2217.809, sum=4435.618 (2)", - "tab": "General information", - "score": 2217.8088235294117 - }, - "High School US History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - 
"tab": "General information", - "score": 1.0 - }, - "High School World History - # eval": { - "description": "min=237, mean=237, max=237, sum=474 (2)", - "tab": "General information", - "score": 237.0 - }, - "High School World History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School World History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School World History - # prompt tokens": { - "description": "min=1428.173, mean=1428.173, max=1428.173, sum=2856.346 (2)", - "tab": "General information", - "score": 1428.1729957805908 - }, - "High School World History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" - } - } - }, - { - "evaluation_name": "Human Sexuality", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Human Sexuality", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.855, - "details": { - "description": "min=0.855, mean=0.855, max=0.855, sum=1.71 (2)", - "tab": "Accuracy", - "Human Aging - Observed inference time (s)": { - "description": "min=3.222, mean=3.222, max=3.222, sum=6.444 (2)", - "tab": "Efficiency", - "score": 3.2222468500180095 - }, - "Human Sexuality - Observed inference time (s)": { - "description": "min=3.132, mean=3.132, max=3.132, sum=6.264 (2)", - "tab": "Efficiency", - "score": 3.1318228208381713 - }, - "Human Aging - # eval": { - "description": "min=223, mean=223, max=223, sum=446 (2)", - "tab": "General information", - "score": 223.0 - }, - "Human Aging - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Aging - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Aging - # prompt tokens": { - "description": "min=319.888, mean=319.888, max=319.888, sum=639.776 (2)", - "tab": "General information", - "score": 319.88789237668163 - }, - "Human Aging - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Human Sexuality - # eval": { - "description": "min=131, mean=131, max=131, sum=262 (2)", - "tab": "General information", - "score": 131.0 - }, - "Human Sexuality - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Sexuality - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # prompt tokens": { - "description": "min=341.168, mean=341.168, max=341.168, sum=682.336 (2)", - "tab": "General information", - "score": 341.1679389312977 - }, - "Human Sexuality - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - 
"subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" - } - } - }, - { - "evaluation_name": "International Law", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on International Law", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.926, - "details": { - "description": "min=0.926, mean=0.926, max=0.926, sum=1.851 (2)", - "tab": "Accuracy", - "International Law - Observed inference time (s)": { - "description": "min=3.686, mean=3.686, max=3.686, sum=7.372 (2)", - "tab": "Efficiency", - "score": 3.68597848750343 - }, - "International Law - # eval": { - "description": "min=121, mean=121, max=121, sum=242 (2)", - "tab": "General information", - "score": 121.0 - }, - "International Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "International Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "International Law - # prompt tokens": { - "description": "min=639.818, mean=639.818, max=639.818, sum=1279.636 (2)", - "tab": "General information", - "score": 639.8181818181819 - }, - "International Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" - } - } - }, - { - "evaluation_name": "Logical Fallacies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Logical Fallacies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.84, - "details": { - "description": "min=0.84, mean=0.84, max=0.84, sum=1.681 (2)", - "tab": "Accuracy", - "Logical Fallacies - Observed inference time (s)": { - "description": "min=2.835, mean=2.835, max=2.835, sum=5.67 (2)", - "tab": "Efficiency", - "score": 2.834790670067255 - }, - "Logical Fallacies - # eval": { - "description": "min=163, mean=163, max=163, sum=326 (2)", - "tab": "General information", - "score": 163.0 - }, - "Logical Fallacies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Logical Fallacies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Logical Fallacies - # prompt tokens": { - "description": "min=449.564, mean=449.564, max=449.564, sum=899.129 (2)", - "tab": "General information", - "score": 449.5644171779141 - }, - "Logical Fallacies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": 
"mmlu_logical_fallacies" - } - } - }, - { - "evaluation_name": "Machine Learning", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Machine Learning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.696, - "details": { - "description": "min=0.696, mean=0.696, max=0.696, sum=1.393 (2)", - "tab": "Accuracy", - "Machine Learning - Observed inference time (s)": { - "description": "min=2.82, mean=2.82, max=2.82, sum=5.639 (2)", - "tab": "Efficiency", - "score": 2.81969299699579 - }, - "Machine Learning - # eval": { - "description": "min=112, mean=112, max=112, sum=224 (2)", - "tab": "General information", - "score": 112.0 - }, - "Machine Learning - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Machine Learning - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Machine Learning - # prompt tokens": { - "description": "min=668.054, mean=668.054, max=668.054, sum=1336.107 (2)", - "tab": "General information", - "score": 668.0535714285714 - }, - "Machine Learning - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" - } - } - }, - { - "evaluation_name": "Management", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Management", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.913, - "details": { - "description": "min=0.913, mean=0.913, max=0.913, sum=1.825 (2)", - "tab": "Accuracy", - "Management - Observed inference time (s)": { - "description": "min=2.909, mean=2.909, max=2.909, sum=5.818 (2)", - "tab": "Efficiency", - "score": 2.9087865861874183 - }, - "Management - # eval": { - "description": "min=103, mean=103, max=103, sum=206 (2)", - "tab": "General information", - "score": 103.0 - }, - "Management - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Management - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Management - # prompt tokens": { - "description": "min=283.786, mean=283.786, max=283.786, sum=567.573 (2)", - "tab": "General information", - "score": 283.7864077669903 - }, - "Management - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" - } - } - }, - { - "evaluation_name": "Marketing", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Marketing", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.936, - "details": { - "description": "min=0.936, mean=0.936, max=0.936, sum=1.872 (2)", - "tab": "Accuracy", - "Marketing - Observed inference time (s)": { - "description": "min=2.727, mean=2.727, max=2.727, sum=5.455 (2)", - "tab": "Efficiency", - "score": 2.7273036078510122 - }, - "Marketing - # eval": { - "description": "min=234, mean=234, max=234, sum=468 (2)", - "tab": "General information", - "score": 234.0 - }, - "Marketing - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Marketing - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Marketing - # prompt tokens": { - "description": "min=404.218, mean=404.218, max=404.218, sum=808.436 (2)", - "tab": "General information", - "score": 404.21794871794873 - }, - "Marketing - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" - } - } - }, - { - "evaluation_name": "Medical Genetics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Medical Genetics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.93, - "details": { - "description": "min=0.93, mean=0.93, max=0.93, sum=1.86 (2)", - "tab": "Accuracy", - "Medical Genetics - Observed inference time (s)": { - "description": "min=2.657, mean=2.657, max=2.657, sum=5.314 (2)", - "tab": "Efficiency", - "score": 2.656917359828949 - }, - "Medical Genetics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Medical Genetics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Medical Genetics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Medical Genetics - # prompt tokens": { - "description": "min=340.99, mean=340.99, max=340.99, sum=681.98 (2)", - "tab": "General information", - "score": 340.99 - }, - "Medical Genetics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" - } - } - }, - { - "evaluation_name": "Miscellaneous", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Miscellaneous", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.913, - "details": { - "description": "min=0.913, mean=0.913, max=0.913, sum=1.826 (2)", - "tab": "Accuracy", - "Miscellaneous - Observed inference time (s)": { - "description": "min=3.308, mean=3.308, max=3.308, sum=6.616 (2)", - "tab": "Efficiency", - "score": 3.3082146720715713 - }, - "Miscellaneous - # eval": { - "description": "min=783, mean=783, max=783, sum=1566 (2)", - "tab": "General information", - "score": 783.0 - }, - "Miscellaneous - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Miscellaneous - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Miscellaneous - # prompt tokens": { - "description": "min=299.911, mean=299.911, max=299.911, sum=599.821 (2)", - "tab": "General information", - "score": 299.9106002554278 - }, - "Miscellaneous - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" - } - } - }, - { - "evaluation_name": "Moral Scenarios", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Moral Scenarios", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.834, - "details": { - "description": "min=0.834, mean=0.834, max=0.834, sum=1.667 (2)", - "tab": "Accuracy", - "Moral Disputes - Observed inference time (s)": { - "description": "min=2.926, mean=2.926, max=2.926, sum=5.852 (2)", - "tab": "Efficiency", - "score": 2.9259741898906024 - }, - "Moral Scenarios - Observed inference time (s)": { - "description": "min=3.608, mean=3.608, max=3.608, sum=7.216 (2)", - "tab": "Efficiency", - "score": 3.608134973248956 - }, - "Moral Disputes - # eval": { - "description": "min=346, mean=346, max=346, sum=692 (2)", - "tab": "General information", - "score": 346.0 - }, - "Moral Disputes - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Disputes - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Disputes - # prompt tokens": { - "description": "min=476.113, mean=476.113, max=476.113, sum=952.225 (2)", - "tab": "General information", - "score": 476.1127167630058 - }, - "Moral Disputes - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Moral Scenarios - # eval": { - "description": "min=895, mean=895, max=895, sum=1790 (2)", - "tab": "General information", - "score": 895.0 - }, - "Moral Scenarios - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Scenarios - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # prompt tokens": { - "description": "min=656.455, mean=656.455, 
max=656.455, sum=1312.909 (2)", - "tab": "General information", - "score": 656.454748603352 - }, - "Moral Scenarios - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" - } - } - }, - { - "evaluation_name": "Nutrition", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Nutrition", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.889, - "details": { - "description": "min=0.889, mean=0.889, max=0.889, sum=1.778 (2)", - "tab": "Accuracy", - "Nutrition - Observed inference time (s)": { - "description": "min=3.56, mean=3.56, max=3.56, sum=7.12 (2)", - "tab": "Efficiency", - "score": 3.56020544089523 - }, - "Nutrition - # eval": { - "description": "min=306, mean=306, max=306, sum=612 (2)", - "tab": "General information", - "score": 306.0 - }, - "Nutrition - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Nutrition - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Nutrition - # prompt tokens": { - "description": "min=586.814, mean=586.814, max=586.814, sum=1173.627 (2)", - "tab": "General information", - "score": 586.8137254901961 - }, - "Nutrition - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" - } - } - }, - { - "evaluation_name": "Prehistory", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Prehistory", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.88, - "details": { - "description": "min=0.88, mean=0.88, max=0.88, sum=1.759 (2)", - "tab": "Accuracy", - "Prehistory - Observed inference time (s)": { - "description": "min=3.546, mean=3.546, max=3.546, sum=7.091 (2)", - "tab": "Efficiency", - "score": 3.54565680247766 - }, - "Prehistory - # eval": { - "description": "min=324, mean=324, max=324, sum=648 (2)", - "tab": "General information", - "score": 324.0 - }, - "Prehistory - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Prehistory - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Prehistory - # prompt tokens": { - "description": "min=514.528, mean=514.528, max=514.528, sum=1029.056 (2)", - "tab": "General information", - "score": 514.5277777777778 - }, - "Prehistory - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - 
"generation_config": { - "additional_details": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" - } - } - }, - { - "evaluation_name": "Public Relations", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Public Relations", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.709, - "details": { - "description": "min=0.709, mean=0.709, max=0.709, sum=1.418 (2)", - "tab": "Accuracy", - "Public Relations - Observed inference time (s)": { - "description": "min=3.03, mean=3.03, max=3.03, sum=6.06 (2)", - "tab": "Efficiency", - "score": 3.0301454305648803 - }, - "Public Relations - # eval": { - "description": "min=110, mean=110, max=110, sum=220 (2)", - "tab": "General information", - "score": 110.0 - }, - "Public Relations - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Public Relations - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Public Relations - # prompt tokens": { - "description": "min=405.318, mean=405.318, max=405.318, sum=810.636 (2)", - "tab": "General information", - "score": 405.3181818181818 - }, - "Public Relations - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" - } - } - }, - { - "evaluation_name": "Security Studies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Security Studies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.849, - "details": { - "description": "min=0.849, mean=0.849, max=0.849, sum=1.698 (2)", - "tab": "Accuracy", - "Security Studies - Observed inference time (s)": { - "description": "min=2.949, mean=2.949, max=2.949, sum=5.898 (2)", - "tab": "Efficiency", - "score": 2.948831728526524 - }, - "Security Studies - # eval": { - "description": "min=245, mean=245, max=245, sum=490 (2)", - "tab": "General information", - "score": 245.0 - }, - "Security Studies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Security Studies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Security Studies - # prompt tokens": { - "description": "min=1164.473, mean=1164.473, max=1164.473, sum=2328.947 (2)", - "tab": "General information", - "score": 1164.4734693877551 - }, - "Security Studies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": 
"test", - "groups": "mmlu_security_studies" - } - } - }, - { - "evaluation_name": "Sociology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Sociology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.92, - "details": { - "description": "min=0.92, mean=0.92, max=0.92, sum=1.841 (2)", - "tab": "Accuracy", - "Sociology - Observed inference time (s)": { - "description": "min=2.843, mean=2.843, max=2.843, sum=5.686 (2)", - "tab": "Efficiency", - "score": 2.842961254404552 - }, - "Sociology - # eval": { - "description": "min=201, mean=201, max=201, sum=402 (2)", - "tab": "General information", - "score": 201.0 - }, - "Sociology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Sociology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Sociology - # prompt tokens": { - "description": "min=445.517, mean=445.517, max=445.517, sum=891.035 (2)", - "tab": "General information", - "score": 445.51741293532336 - }, - "Sociology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" - } - } - }, - { - "evaluation_name": "Virology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Virology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.578, - "details": { - "description": "min=0.578, mean=0.578, max=0.578, sum=1.157 (2)", - "tab": "Accuracy", - "Virology - Observed inference time (s)": { - "description": "min=3.05, mean=3.05, max=3.05, sum=6.101 (2)", - "tab": "Efficiency", - "score": 3.050425999135856 - }, - "Virology - # eval": { - "description": "min=166, mean=166, max=166, sum=332 (2)", - "tab": "General information", - "score": 166.0 - }, - "Virology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Virology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Virology - # prompt tokens": { - "description": "min=343.018, mean=343.018, max=343.018, sum=686.036 (2)", - "tab": "General information", - "score": 343.01807228915663 - }, - "Virology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" - } - } - }, - { - "evaluation_name": "World Religions", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on World Religions", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.895, - "details": { - "description": "min=0.895, mean=0.895, max=0.895, sum=1.789 (2)", - "tab": "Accuracy", - "World Religions - Observed inference time (s)": { - "description": "min=2.517, mean=2.517, max=2.517, sum=5.033 (2)", - "tab": "Efficiency", - "score": 2.5166666828400905 - }, - "World Religions - # eval": { - "description": "min=171, mean=171, max=171, sum=342 (2)", - "tab": "General information", - "score": 171.0 - }, - "World Religions - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "World Religions - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "World Religions - # prompt tokens": { - "description": "min=274.52, mean=274.52, max=274.52, sum=549.041 (2)", - "tab": "General information", - "score": 274.5204678362573 - }, - "World Religions - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" - } - } - }, - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.021, - "details": { - "tab": "Efficiency" - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_mmlu/meta/llama-3.1-8b-instruct-turbo/96544ff3-225e-4f8f-82fb-2e3c42d5ba89.json b/data/helm_mmlu/meta/llama-3.1-8b-instruct-turbo/96544ff3-225e-4f8f-82fb-2e3c42d5ba89.json deleted file mode 100644 index 6c1d661d4..000000000 --- a/data/helm_mmlu/meta/llama-3.1-8b-instruct-turbo/96544ff3-225e-4f8f-82fb-2e3c42d5ba89.json +++ /dev/null @@ -1,3021 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/meta_llama-3.1-8b-instruct-turbo/1770835937.459157", - "retrieved_timestamp": "1770835937.459157", - "source_metadata": { - "source_name": "helm_mmlu", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama 3.1 Instruct Turbo 8B", - "id": "meta/llama-3.1-8b-instruct-turbo", - "developer": "meta", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "MMLU All Subjects", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU All Subjects", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - 
}, - "score_details": { - "score": 0.561, - "details": { - "description": "min=0.26, mean=0.561, max=0.865, sum=63.912 (114)", - "tab": "Accuracy", - "MMLU All Subjects - Observed inference time (s)": { - "description": "min=0.202, mean=0.56, max=1.485, sum=63.854 (114)", - "tab": "Efficiency", - "score": 0.5601251981506405 - }, - "MMLU All Subjects - # eval": { - "description": "min=100, mean=246.351, max=1534, sum=28084 (114)", - "tab": "General information", - "score": 246.35087719298247 - }, - "MMLU All Subjects - # train": { - "description": "min=5, mean=5, max=5, sum=570 (114)", - "tab": "General information", - "score": 5.0 - }, - "MMLU All Subjects - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (114)", - "tab": "General information", - "score": 0.0 - }, - "MMLU All Subjects - # prompt tokens": { - "description": "min=274.52, mean=614.619, max=2797.885, sum=70066.61 (114)", - "tab": "General information", - "score": 614.6193817308517 - }, - "MMLU All Subjects - # output tokens": { - "description": "min=1, mean=1, max=1, sum=114 (114)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - 
"mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] - } - } - }, - { - "evaluation_name": "Abstract Algebra", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Abstract Algebra", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.26, - "details": { - "description": "min=0.26, mean=0.26, max=0.26, sum=0.52 (2)", - "tab": "Accuracy", - "Abstract Algebra - Observed inference time (s)": { - "description": "min=0.284, mean=0.284, max=0.284, sum=0.568 (2)", - "tab": "Efficiency", - "score": 0.28381933450698854 - }, - "Abstract Algebra - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Abstract Algebra - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Abstract Algebra - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Abstract Algebra - # prompt tokens": { - "description": "min=373.43, mean=373.43, max=373.43, sum=746.86 (2)", - "tab": "General information", - "score": 373.43 - }, - "Abstract Algebra - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" - } - } - }, - { - "evaluation_name": "Anatomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Anatomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.459, - "details": { - "description": "min=0.459, mean=0.459, max=0.459, sum=0.919 (2)", - "tab": "Accuracy", - "Anatomy - Observed inference time (s)": { - "description": "min=0.323, mean=0.323, max=0.323, sum=0.646 (2)", - "tab": "Efficiency", - "score": 0.3231998196354619 - }, - "Anatomy - # eval": { - "description": "min=135, mean=135, max=135, sum=270 (2)", - "tab": "General information", - "score": 135.0 - }, - "Anatomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Anatomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Anatomy - # prompt tokens": { - 
"description": "min=353.874, mean=353.874, max=353.874, sum=707.748 (2)", - "tab": "General information", - "score": 353.8740740740741 - }, - "Anatomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" - } - } - }, - { - "evaluation_name": "College Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on College Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.363, - "details": { - "description": "min=0.363, mean=0.363, max=0.363, sum=0.725 (2)", - "tab": "Accuracy", - "College Chemistry - Observed inference time (s)": { - "description": "min=0.431, mean=0.431, max=0.431, sum=0.862 (2)", - "tab": "Efficiency", - "score": 0.43078258752822873 - }, - "College Biology - Observed inference time (s)": { - "description": "min=0.426, mean=0.426, max=0.426, sum=0.853 (2)", - "tab": "Efficiency", - "score": 0.42637243535783553 - }, - "College Computer Science - Observed inference time (s)": { - "description": "min=0.562, mean=0.562, max=0.562, sum=1.125 (2)", - "tab": "Efficiency", - "score": 0.5623248195648194 - }, - "College Mathematics - Observed inference time (s)": { - "description": "min=0.371, mean=0.371, max=0.371, sum=0.742 (2)", - "tab": "Efficiency", - "score": 0.3709776735305786 - }, - "College Medicine - Observed inference time (s)": { - "description": "min=0.395, mean=0.395, max=0.395, sum=0.79 (2)", - "tab": "Efficiency", - "score": 0.3948341918129452 - }, - "College Physics - Observed inference time (s)": { - "description": "min=0.395, mean=0.395, max=0.395, sum=0.789 (2)", - "tab": "Efficiency", - "score": 0.39474552051693784 - }, - "College Chemistry - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Chemistry - # prompt tokens": { - "description": "min=549.28, mean=549.28, max=549.28, sum=1098.56 (2)", - "tab": "General information", - "score": 549.28 - }, - "College Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Biology - # eval": { - "description": "min=144, mean=144, max=144, sum=288 (2)", - "tab": "General information", - "score": 144.0 - }, - "College Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # prompt tokens": { - "description": "min=473.875, mean=473.875, max=473.875, sum=947.75 (2)", - "tab": "General information", - "score": 473.875 - }, - "College Biology - # output tokens": { - "description": "min=1, mean=1, max=1, 
sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer Science - # prompt tokens": { - "description": "min=828.29, mean=828.29, max=828.29, sum=1656.58 (2)", - "tab": "General information", - "score": 828.29 - }, - "College Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Mathematics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # prompt tokens": { - "description": "min=594.51, mean=594.51, max=594.51, sum=1189.02 (2)", - "tab": "General information", - "score": 594.51 - }, - "College Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Medicine - # eval": { - "description": "min=173, mean=173, max=173, sum=346 (2)", - "tab": "General information", - "score": 173.0 - }, - "College Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Medicine - # prompt tokens": { - "description": "min=502.705, mean=502.705, max=502.705, sum=1005.41 (2)", - "tab": "General information", - "score": 502.70520231213874 - }, - "College Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Physics - # eval": { - "description": "min=102, mean=102, max=102, sum=204 (2)", - "tab": "General information", - "score": 102.0 - }, - "College Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Physics - # prompt tokens": { - "description": "min=503.569, mean=503.569, max=503.569, sum=1007.137 (2)", - "tab": "General information", - "score": 503.5686274509804 - }, - "College Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" - } - } - }, - { - "evaluation_name": "Computer Security", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Computer Security", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.71, - "details": { - "description": "min=0.71, mean=0.71, max=0.71, sum=1.42 (2)", - "tab": "Accuracy", - "Computer Security - Observed inference time (s)": { - "description": "min=0.434, mean=0.434, max=0.434, sum=0.867 (2)", - "tab": "Efficiency", - "score": 0.43369229555130007 - }, - "Computer Security - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Computer Security - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Computer Security - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Computer Security - # prompt tokens": { - "description": "min=378.51, mean=378.51, max=378.51, sum=757.02 (2)", - "tab": "General information", - "score": 378.51 - }, - "Computer Security - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" - } - } - }, - { - "evaluation_name": "Econometrics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Econometrics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.351, - "details": { - "description": "min=0.351, mean=0.351, max=0.351, sum=0.702 (2)", - "tab": "Accuracy", - "Econometrics - Observed inference time (s)": { - "description": "min=0.371, mean=0.371, max=0.371, sum=0.742 (2)", - "tab": "Efficiency", - "score": 0.3707838414008157 - }, - "Econometrics - # eval": { - "description": "min=114, mean=114, max=114, sum=228 (2)", - "tab": "General information", - "score": 114.0 - }, - "Econometrics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Econometrics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Econometrics - # prompt tokens": { - "description": "min=614.421, mean=614.421, max=614.421, sum=1228.842 (2)", - "tab": "General information", - "score": 614.421052631579 - }, - "Econometrics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" - } - } - }, - { - "evaluation_name": "Global Facts", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on 
Global Facts", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.26, - "details": { - "description": "min=0.26, mean=0.26, max=0.26, sum=0.52 (2)", - "tab": "Accuracy", - "Global Facts - Observed inference time (s)": { - "description": "min=0.202, mean=0.202, max=0.202, sum=0.403 (2)", - "tab": "Efficiency", - "score": 0.2015515398979187 - }, - "Global Facts - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Global Facts - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Global Facts - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Global Facts - # prompt tokens": { - "description": "min=399.71, mean=399.71, max=399.71, sum=799.42 (2)", - "tab": "General information", - "score": 399.71 - }, - "Global Facts - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" - } - } - }, - { - "evaluation_name": "Jurisprudence", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Jurisprudence", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.731, - "details": { - "description": "min=0.731, mean=0.731, max=0.731, sum=1.463 (2)", - "tab": "Accuracy", - "Jurisprudence - Observed inference time (s)": { - "description": "min=1.035, mean=1.035, max=1.035, sum=2.07 (2)", - "tab": "Efficiency", - "score": 1.0347525963076838 - }, - "Jurisprudence - # eval": { - "description": "min=108, mean=108, max=108, sum=216 (2)", - "tab": "General information", - "score": 108.0 - }, - "Jurisprudence - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Jurisprudence - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Jurisprudence - # prompt tokens": { - "description": "min=394.63, mean=394.63, max=394.63, sum=789.259 (2)", - "tab": "General information", - "score": 394.6296296296296 - }, - "Jurisprudence - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" - } - } - }, - { - "evaluation_name": "Philosophy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Philosophy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.64, - "details": { - "description": "min=0.64, mean=0.64, max=0.64, 
sum=1.28 (2)", - "tab": "Accuracy", - "Philosophy - Observed inference time (s)": { - "description": "min=0.681, mean=0.681, max=0.681, sum=1.363 (2)", - "tab": "Efficiency", - "score": 0.6814629341628391 - }, - "Philosophy - # eval": { - "description": "min=311, mean=311, max=311, sum=622 (2)", - "tab": "General information", - "score": 311.0 - }, - "Philosophy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Philosophy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Philosophy - # prompt tokens": { - "description": "min=329.084, mean=329.084, max=329.084, sum=658.167 (2)", - "tab": "General information", - "score": 329.08360128617363 - }, - "Philosophy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" - } - } - }, - { - "evaluation_name": "Professional Psychology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Professional Psychology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.649, - "details": { - "description": "min=0.649, mean=0.649, max=0.649, sum=1.297 (2)", - "tab": "Accuracy", - "Professional Medicine - Observed inference time (s)": { - "description": "min=0.546, mean=0.546, max=0.546, sum=1.091 (2)", - "tab": "Efficiency", - "score": 0.5456299475010704 - }, - "Professional Accounting - Observed inference time (s)": { - "description": "min=0.538, mean=0.538, max=0.538, sum=1.077 (2)", - "tab": "Efficiency", - "score": 0.5383730044601657 - }, - "Professional Law - Observed inference time (s)": { - "description": "min=0.881, mean=0.881, max=0.881, sum=1.762 (2)", - "tab": "Efficiency", - "score": 0.8808572895368355 - }, - "Professional Psychology - Observed inference time (s)": { - "description": "min=0.694, mean=0.694, max=0.694, sum=1.388 (2)", - "tab": "Efficiency", - "score": 0.6941978611977272 - }, - "Professional Medicine - # eval": { - "description": "min=272, mean=272, max=272, sum=544 (2)", - "tab": "General information", - "score": 272.0 - }, - "Professional Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Medicine - # prompt tokens": { - "description": "min=1094.489, mean=1094.489, max=1094.489, sum=2188.978 (2)", - "tab": "General information", - "score": 1094.4889705882354 - }, - "Professional Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Accounting - # eval": { - "description": "min=282, mean=282, max=282, sum=564 (2)", - "tab": "General information", - "score": 282.0 - }, - "Professional Accounting - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - 
"Professional Accounting - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Accounting - # prompt tokens": { - "description": "min=658.585, mean=658.585, max=658.585, sum=1317.17 (2)", - "tab": "General information", - "score": 658.5851063829788 - }, - "Professional Accounting - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Law - # eval": { - "description": "min=1534, mean=1534, max=1534, sum=3068 (2)", - "tab": "General information", - "score": 1534.0 - }, - "Professional Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # prompt tokens": { - "description": "min=1637.601, mean=1637.601, max=1637.601, sum=3275.202 (2)", - "tab": "General information", - "score": 1637.6010430247718 - }, - "Professional Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Psychology - # eval": { - "description": "min=612, mean=612, max=612, sum=1224 (2)", - "tab": "General information", - "score": 612.0 - }, - "Professional Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # prompt tokens": { - "description": "min=575.098, mean=575.098, max=575.098, sum=1150.196 (2)", - "tab": "General information", - "score": 575.0980392156863 - }, - "Professional Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" - } - } - }, - { - "evaluation_name": "Us Foreign Policy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Us Foreign Policy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.79, - "details": { - "description": "min=0.79, mean=0.79, max=0.79, sum=1.58 (2)", - "tab": "Accuracy", - "Us Foreign Policy - Observed inference time (s)": { - "description": "min=0.567, mean=0.567, max=0.567, sum=1.135 (2)", - "tab": "Efficiency", - "score": 0.5673955392837524 - }, - "Us Foreign Policy - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Us Foreign Policy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Us Foreign Policy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Us Foreign Policy - # prompt tokens": { - "description": "min=422.79, mean=422.79, max=422.79, 
sum=845.58 (2)", - "tab": "General information", - "score": 422.79 - }, - "Us Foreign Policy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" - } - } - }, - { - "evaluation_name": "Astronomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Astronomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.645, - "details": { - "description": "min=0.645, mean=0.645, max=0.645, sum=1.289 (2)", - "tab": "Accuracy", - "Astronomy - Observed inference time (s)": { - "description": "min=0.317, mean=0.317, max=0.317, sum=0.634 (2)", - "tab": "Efficiency", - "score": 0.3168644199245854 - }, - "Astronomy - # eval": { - "description": "min=152, mean=152, max=152, sum=304 (2)", - "tab": "General information", - "score": 152.0 - }, - "Astronomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Astronomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Astronomy - # prompt tokens": { - "description": "min=579.684, mean=579.684, max=579.684, sum=1159.368 (2)", - "tab": "General information", - "score": 579.6842105263158 - }, - "Astronomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" - } - } - }, - { - "evaluation_name": "Business Ethics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Business Ethics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.65, - "details": { - "description": "min=0.65, mean=0.65, max=0.65, sum=1.3 (2)", - "tab": "Accuracy", - "Business Ethics - Observed inference time (s)": { - "description": "min=0.444, mean=0.444, max=0.444, sum=0.888 (2)", - "tab": "Efficiency", - "score": 0.44396358251571655 - }, - "Business Ethics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Business Ethics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Business Ethics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Business Ethics - # prompt tokens": { - "description": "min=569.52, mean=569.52, max=569.52, sum=1139.04 (2)", - "tab": "General information", - "score": 569.52 - }, - "Business Ethics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - 
} - }, - "generation_config": { - "additional_details": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" - } - } - }, - { - "evaluation_name": "Clinical Knowledge", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Clinical Knowledge", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.615, - "details": { - "description": "min=0.615, mean=0.615, max=0.615, sum=1.23 (2)", - "tab": "Accuracy", - "Clinical Knowledge - Observed inference time (s)": { - "description": "min=0.369, mean=0.369, max=0.369, sum=0.738 (2)", - "tab": "Efficiency", - "score": 0.3692442273193935 - }, - "Clinical Knowledge - # eval": { - "description": "min=265, mean=265, max=265, sum=530 (2)", - "tab": "General information", - "score": 265.0 - }, - "Clinical Knowledge - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Clinical Knowledge - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Clinical Knowledge - # prompt tokens": { - "description": "min=397.928, mean=397.928, max=397.928, sum=795.857 (2)", - "tab": "General information", - "score": 397.92830188679244 - }, - "Clinical Knowledge - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" - } - } - }, - { - "evaluation_name": "Conceptual Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Conceptual Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.528, - "details": { - "description": "min=0.528, mean=0.528, max=0.528, sum=1.055 (2)", - "tab": "Accuracy", - "Conceptual Physics - Observed inference time (s)": { - "description": "min=0.351, mean=0.351, max=0.351, sum=0.701 (2)", - "tab": "Efficiency", - "score": 0.35051030605397326 - }, - "Conceptual Physics - # eval": { - "description": "min=235, mean=235, max=235, sum=470 (2)", - "tab": "General information", - "score": 235.0 - }, - "Conceptual Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Conceptual Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Conceptual Physics - # prompt tokens": { - "description": "min=304.834, mean=304.834, max=304.834, sum=609.668 (2)", - "tab": "General information", - "score": 304.83404255319147 - }, - "Conceptual Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": 
"conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" - } - } - }, - { - "evaluation_name": "Electrical Engineering", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Electrical Engineering", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.441, - "details": { - "description": "min=0.441, mean=0.441, max=0.441, sum=0.883 (2)", - "tab": "Accuracy", - "Electrical Engineering - Observed inference time (s)": { - "description": "min=0.35, mean=0.35, max=0.35, sum=0.7 (2)", - "tab": "Efficiency", - "score": 0.34982287637118636 - }, - "Electrical Engineering - # eval": { - "description": "min=145, mean=145, max=145, sum=290 (2)", - "tab": "General information", - "score": 145.0 - }, - "Electrical Engineering - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Electrical Engineering - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Electrical Engineering - # prompt tokens": { - "description": "min=435.607, mean=435.607, max=435.607, sum=871.214 (2)", - "tab": "General information", - "score": 435.60689655172416 - }, - "Electrical Engineering - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" - } - } - }, - { - "evaluation_name": "Elementary Mathematics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Elementary Mathematics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.429, - "details": { - "description": "min=0.429, mean=0.429, max=0.429, sum=0.857 (2)", - "tab": "Accuracy", - "Elementary Mathematics - Observed inference time (s)": { - "description": "min=0.4, mean=0.4, max=0.4, sum=0.801 (2)", - "tab": "Efficiency", - "score": 0.4003569991500289 - }, - "Elementary Mathematics - # eval": { - "description": "min=378, mean=378, max=378, sum=756 (2)", - "tab": "General information", - "score": 378.0 - }, - "Elementary Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Elementary Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Elementary Mathematics - # prompt tokens": { - "description": "min=531.854, mean=531.854, max=531.854, sum=1063.709 (2)", - "tab": "General information", - "score": 531.8544973544973 - }, - "Elementary Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": 
"elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" - } - } - }, - { - "evaluation_name": "Formal Logic", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Formal Logic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.444, - "details": { - "description": "min=0.444, mean=0.444, max=0.444, sum=0.889 (2)", - "tab": "Accuracy", - "Formal Logic - Observed inference time (s)": { - "description": "min=0.357, mean=0.357, max=0.357, sum=0.714 (2)", - "tab": "Efficiency", - "score": 0.35707327108534553 - }, - "Formal Logic - # eval": { - "description": "min=126, mean=126, max=126, sum=252 (2)", - "tab": "General information", - "score": 126.0 - }, - "Formal Logic - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Formal Logic - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Formal Logic - # prompt tokens": { - "description": "min=601.778, mean=601.778, max=601.778, sum=1203.556 (2)", - "tab": "General information", - "score": 601.7777777777778 - }, - "Formal Logic - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" - } - } - }, - { - "evaluation_name": "High School World History", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on High School World History", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.515, - "details": { - "description": "min=0.515, mean=0.515, max=0.515, sum=1.03 (2)", - "tab": "Accuracy", - "High School Biology - Observed inference time (s)": { - "description": "min=0.211, mean=0.211, max=0.211, sum=0.423 (2)", - "tab": "Efficiency", - "score": 0.21137587870320967 - }, - "High School Chemistry - Observed inference time (s)": { - "description": "min=0.211, mean=0.211, max=0.211, sum=0.423 (2)", - "tab": "Efficiency", - "score": 0.2113605567387172 - }, - "High School Computer Science - Observed inference time (s)": { - "description": "min=0.214, mean=0.214, max=0.214, sum=0.428 (2)", - "tab": "Efficiency", - "score": 0.2138903546333313 - }, - "High School European History - Observed inference time (s)": { - "description": "min=0.332, mean=0.332, max=0.332, sum=0.664 (2)", - "tab": "Efficiency", - "score": 0.33188523668231384 - }, - "High School Geography - Observed inference time (s)": { - "description": "min=0.218, mean=0.218, max=0.218, sum=0.435 (2)", - "tab": "Efficiency", - "score": 0.21753037818754561 - }, - "High School Government And Politics - Observed inference time (s)": { - "description": "min=0.558, mean=0.558, max=0.558, sum=1.117 (2)", - "tab": "Efficiency", - "score": 0.558492410985917 - }, - 
"High School Macroeconomics - Observed inference time (s)": { - "description": "min=0.703, mean=0.703, max=0.703, sum=1.407 (2)", - "tab": "Efficiency", - "score": 0.7033225890917656 - }, - "High School Mathematics - Observed inference time (s)": { - "description": "min=0.649, mean=0.649, max=0.649, sum=1.299 (2)", - "tab": "Efficiency", - "score": 0.6494572189119127 - }, - "High School Microeconomics - Observed inference time (s)": { - "description": "min=0.612, mean=0.612, max=0.612, sum=1.223 (2)", - "tab": "Efficiency", - "score": 0.6115654797113242 - }, - "High School Physics - Observed inference time (s)": { - "description": "min=0.564, mean=0.564, max=0.564, sum=1.127 (2)", - "tab": "Efficiency", - "score": 0.5636763351642533 - }, - "High School Psychology - Observed inference time (s)": { - "description": "min=0.681, mean=0.681, max=0.681, sum=1.363 (2)", - "tab": "Efficiency", - "score": 0.6813242522948378 - }, - "High School Statistics - Observed inference time (s)": { - "description": "min=0.606, mean=0.606, max=0.606, sum=1.212 (2)", - "tab": "Efficiency", - "score": 0.6060926814874014 - }, - "High School US History - Observed inference time (s)": { - "description": "min=1.122, mean=1.122, max=1.122, sum=2.244 (2)", - "tab": "Efficiency", - "score": 1.1218917334780973 - }, - "High School World History - Observed inference time (s)": { - "description": "min=0.538, mean=0.538, max=0.538, sum=1.076 (2)", - "tab": "Efficiency", - "score": 0.5378943324592043 - }, - "High School Biology - # eval": { - "description": "min=310, mean=310, max=310, sum=620 (2)", - "tab": "General information", - "score": 310.0 - }, - "High School Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Biology - # prompt tokens": { - "description": "min=513.671, mean=513.671, max=513.671, sum=1027.342 (2)", - "tab": "General information", - "score": 513.6709677419354 - }, - "High School Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Chemistry - # eval": { - "description": "min=203, mean=203, max=203, sum=406 (2)", - "tab": "General information", - "score": 203.0 - }, - "High School Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # prompt tokens": { - "description": "min=496.704, mean=496.704, max=496.704, sum=993.409 (2)", - "tab": "General information", - "score": 496.70443349753697 - }, - "High School Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "High School Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # prompt 
tokens": { - "description": "min=867.78, mean=867.78, max=867.78, sum=1735.56 (2)", - "tab": "General information", - "score": 867.78 - }, - "High School Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School European History - # eval": { - "description": "min=165, mean=165, max=165, sum=330 (2)", - "tab": "General information", - "score": 165.0 - }, - "High School European History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School European History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School European History - # prompt tokens": { - "description": "min=2797.885, mean=2797.885, max=2797.885, sum=5595.77 (2)", - "tab": "General information", - "score": 2797.8848484848486 - }, - "High School European History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Geography - # eval": { - "description": "min=198, mean=198, max=198, sum=396 (2)", - "tab": "General information", - "score": 198.0 - }, - "High School Geography - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Geography - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Geography - # prompt tokens": { - "description": "min=372.035, mean=372.035, max=372.035, sum=744.071 (2)", - "tab": "General information", - "score": 372.0353535353535 - }, - "High School Geography - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Government And Politics - # eval": { - "description": "min=193, mean=193, max=193, sum=386 (2)", - "tab": "General information", - "score": 193.0 - }, - "High School Government And Politics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Government And Politics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # prompt tokens": { - "description": "min=465.824, mean=465.824, max=465.824, sum=931.648 (2)", - "tab": "General information", - "score": 465.8238341968912 - }, - "High School Government And Politics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Macroeconomics - # eval": { - "description": "min=390, mean=390, max=390, sum=780 (2)", - "tab": "General information", - "score": 390.0 - }, - "High School Macroeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Macroeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - # prompt tokens": { - "description": "min=370.908, mean=370.908, max=370.908, sum=741.815 (2)", - "tab": "General information", - "score": 370.9076923076923 - }, - "High School Macroeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, 
- "High School Mathematics - # eval": { - "description": "min=270, mean=270, max=270, sum=540 (2)", - "tab": "General information", - "score": 270.0 - }, - "High School Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # prompt tokens": { - "description": "min=532.356, mean=532.356, max=532.356, sum=1064.711 (2)", - "tab": "General information", - "score": 532.3555555555556 - }, - "High School Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Microeconomics - # eval": { - "description": "min=238, mean=238, max=238, sum=476 (2)", - "tab": "General information", - "score": 238.0 - }, - "High School Microeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Microeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Microeconomics - # prompt tokens": { - "description": "min=399.013, mean=399.013, max=399.013, sum=798.025 (2)", - "tab": "General information", - "score": 399.0126050420168 - }, - "High School Microeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Physics - # eval": { - "description": "min=151, mean=151, max=151, sum=302 (2)", - "tab": "General information", - "score": 151.0 - }, - "High School Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # prompt tokens": { - "description": "min=560.457, mean=560.457, max=560.457, sum=1120.914 (2)", - "tab": "General information", - "score": 560.4569536423841 - }, - "High School Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Psychology - # eval": { - "description": "min=545, mean=545, max=545, sum=1090 (2)", - "tab": "General information", - "score": 545.0 - }, - "High School Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # prompt tokens": { - "description": "min=495.242, mean=495.242, max=495.242, sum=990.484 (2)", - "tab": "General information", - "score": 495.2422018348624 - }, - "High School Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Statistics - # eval": { - "description": "min=216, mean=216, max=216, sum=432 (2)", - "tab": "General information", - "score": 216.0 - }, - "High School Statistics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Statistics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - 
"tab": "General information", - "score": 0.0 - }, - "High School Statistics - # prompt tokens": { - "description": "min=795.639, mean=795.639, max=795.639, sum=1591.278 (2)", - "tab": "General information", - "score": 795.6388888888889 - }, - "High School Statistics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School US History - # eval": { - "description": "min=204, mean=204, max=204, sum=408 (2)", - "tab": "General information", - "score": 204.0 - }, - "High School US History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School US History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # prompt tokens": { - "description": "min=2217.809, mean=2217.809, max=2217.809, sum=4435.618 (2)", - "tab": "General information", - "score": 2217.8088235294117 - }, - "High School US History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School World History - # eval": { - "description": "min=237, mean=237, max=237, sum=474 (2)", - "tab": "General information", - "score": 237.0 - }, - "High School World History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School World History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School World History - # prompt tokens": { - "description": "min=1428.173, mean=1428.173, max=1428.173, sum=2856.346 (2)", - "tab": "General information", - "score": 1428.1729957805908 - }, - "High School World History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" - } - } - }, - { - "evaluation_name": "Human Sexuality", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Human Sexuality", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.733, - "details": { - "description": "min=0.733, mean=0.733, max=0.733, sum=1.466 (2)", - "tab": "Accuracy", - "Human Aging - Observed inference time (s)": { - "description": "min=0.685, mean=0.685, max=0.685, sum=1.369 (2)", - "tab": "Efficiency", - "score": 0.6845707412257858 - }, - "Human Sexuality - Observed inference time (s)": { - "description": "min=1.227, mean=1.227, max=1.227, sum=2.455 (2)", - "tab": "Efficiency", - "score": 1.2273387745136524 - }, - "Human Aging - # eval": { - "description": "min=223, mean=223, max=223, sum=446 (2)", - "tab": "General information", - "score": 223.0 - }, - "Human Aging - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Aging - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - 
"Human Aging - # prompt tokens": { - "description": "min=319.888, mean=319.888, max=319.888, sum=639.776 (2)", - "tab": "General information", - "score": 319.88789237668163 - }, - "Human Aging - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Human Sexuality - # eval": { - "description": "min=131, mean=131, max=131, sum=262 (2)", - "tab": "General information", - "score": 131.0 - }, - "Human Sexuality - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Sexuality - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # prompt tokens": { - "description": "min=341.168, mean=341.168, max=341.168, sum=682.336 (2)", - "tab": "General information", - "score": 341.1679389312977 - }, - "Human Sexuality - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" - } - } - }, - { - "evaluation_name": "International Law", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on International Law", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.694, - "details": { - "description": "min=0.694, mean=0.694, max=0.694, sum=1.388 (2)", - "tab": "Accuracy", - "International Law - Observed inference time (s)": { - "description": "min=0.684, mean=0.684, max=0.684, sum=1.369 (2)", - "tab": "Efficiency", - "score": 0.6842782950598346 - }, - "International Law - # eval": { - "description": "min=121, mean=121, max=121, sum=242 (2)", - "tab": "General information", - "score": 121.0 - }, - "International Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "International Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "International Law - # prompt tokens": { - "description": "min=639.818, mean=639.818, max=639.818, sum=1279.636 (2)", - "tab": "General information", - "score": 639.8181818181819 - }, - "International Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" - } - } - }, - { - "evaluation_name": "Logical Fallacies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Logical Fallacies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.742, - "details": { - "description": "min=0.742, mean=0.742, max=0.742, sum=1.485 (2)", - 
"tab": "Accuracy", - "Logical Fallacies - Observed inference time (s)": { - "description": "min=1.35, mean=1.35, max=1.35, sum=2.7 (2)", - "tab": "Efficiency", - "score": 1.3501118970063566 - }, - "Logical Fallacies - # eval": { - "description": "min=163, mean=163, max=163, sum=326 (2)", - "tab": "General information", - "score": 163.0 - }, - "Logical Fallacies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Logical Fallacies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Logical Fallacies - # prompt tokens": { - "description": "min=449.564, mean=449.564, max=449.564, sum=899.129 (2)", - "tab": "General information", - "score": 449.5644171779141 - }, - "Logical Fallacies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" - } - } - }, - { - "evaluation_name": "Machine Learning", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Machine Learning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.384, - "details": { - "description": "min=0.384, mean=0.384, max=0.384, sum=0.768 (2)", - "tab": "Accuracy", - "Machine Learning - Observed inference time (s)": { - "description": "min=0.46, mean=0.46, max=0.46, sum=0.919 (2)", - "tab": "Efficiency", - "score": 0.45964209735393524 - }, - "Machine Learning - # eval": { - "description": "min=112, mean=112, max=112, sum=224 (2)", - "tab": "General information", - "score": 112.0 - }, - "Machine Learning - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Machine Learning - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Machine Learning - # prompt tokens": { - "description": "min=668.054, mean=668.054, max=668.054, sum=1336.107 (2)", - "tab": "General information", - "score": 668.0535714285714 - }, - "Machine Learning - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" - } - } - }, - { - "evaluation_name": "Management", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Management", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.709, - "details": { - "description": "min=0.709, mean=0.709, max=0.709, sum=1.417 (2)", - "tab": "Accuracy", - "Management - Observed inference time (s)": { - "description": "min=0.481, mean=0.481, max=0.481, sum=0.963 (2)", - 
"tab": "Efficiency", - "score": 0.48132226536574874 - }, - "Management - # eval": { - "description": "min=103, mean=103, max=103, sum=206 (2)", - "tab": "General information", - "score": 103.0 - }, - "Management - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Management - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Management - # prompt tokens": { - "description": "min=283.786, mean=283.786, max=283.786, sum=567.573 (2)", - "tab": "General information", - "score": 283.7864077669903 - }, - "Management - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" - } - } - }, - { - "evaluation_name": "Marketing", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Marketing", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.833, - "details": { - "description": "min=0.833, mean=0.833, max=0.833, sum=1.667 (2)", - "tab": "Accuracy", - "Marketing - Observed inference time (s)": { - "description": "min=0.529, mean=0.529, max=0.529, sum=1.059 (2)", - "tab": "Efficiency", - "score": 0.5294545297948723 - }, - "Marketing - # eval": { - "description": "min=234, mean=234, max=234, sum=468 (2)", - "tab": "General information", - "score": 234.0 - }, - "Marketing - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Marketing - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Marketing - # prompt tokens": { - "description": "min=404.218, mean=404.218, max=404.218, sum=808.436 (2)", - "tab": "General information", - "score": 404.21794871794873 - }, - "Marketing - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" - } - } - }, - { - "evaluation_name": "Medical Genetics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Medical Genetics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.66, - "details": { - "description": "min=0.66, mean=0.66, max=0.66, sum=1.32 (2)", - "tab": "Accuracy", - "Medical Genetics - Observed inference time (s)": { - "description": "min=0.521, mean=0.521, max=0.521, sum=1.041 (2)", - "tab": "Efficiency", - "score": 0.520596706867218 - }, - "Medical Genetics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Medical Genetics - # train": { - 
"description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Medical Genetics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Medical Genetics - # prompt tokens": { - "description": "min=340.99, mean=340.99, max=340.99, sum=681.98 (2)", - "tab": "General information", - "score": 340.99 - }, - "Medical Genetics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" - } - } - }, - { - "evaluation_name": "Miscellaneous", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Miscellaneous", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.653, - "details": { - "description": "min=0.653, mean=0.653, max=0.653, sum=1.305 (2)", - "tab": "Accuracy", - "Miscellaneous - Observed inference time (s)": { - "description": "min=0.803, mean=0.803, max=0.803, sum=1.606 (2)", - "tab": "Efficiency", - "score": 0.8030396217282857 - }, - "Miscellaneous - # eval": { - "description": "min=783, mean=783, max=783, sum=1566 (2)", - "tab": "General information", - "score": 783.0 - }, - "Miscellaneous - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Miscellaneous - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Miscellaneous - # prompt tokens": { - "description": "min=299.911, mean=299.911, max=299.911, sum=599.821 (2)", - "tab": "General information", - "score": 299.9106002554278 - }, - "Miscellaneous - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" - } - } - }, - { - "evaluation_name": "Moral Scenarios", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Moral Scenarios", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.368, - "details": { - "description": "min=0.368, mean=0.368, max=0.368, sum=0.735 (2)", - "tab": "Accuracy", - "Moral Disputes - Observed inference time (s)": { - "description": "min=0.657, mean=0.657, max=0.657, sum=1.314 (2)", - "tab": "Efficiency", - "score": 0.6570079657383737 - }, - "Moral Scenarios - Observed inference time (s)": { - "description": "min=0.65, mean=0.65, max=0.65, sum=1.299 (2)", - "tab": "Efficiency", - "score": 0.649639103266114 - }, - "Moral Disputes - # eval": { - "description": "min=346, mean=346, max=346, sum=692 (2)", - "tab": "General information", - "score": 346.0 - }, - "Moral Disputes - # train": { - 
"description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Disputes - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Disputes - # prompt tokens": { - "description": "min=476.113, mean=476.113, max=476.113, sum=952.225 (2)", - "tab": "General information", - "score": 476.1127167630058 - }, - "Moral Disputes - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Moral Scenarios - # eval": { - "description": "min=895, mean=895, max=895, sum=1790 (2)", - "tab": "General information", - "score": 895.0 - }, - "Moral Scenarios - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Scenarios - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # prompt tokens": { - "description": "min=656.455, mean=656.455, max=656.455, sum=1312.909 (2)", - "tab": "General information", - "score": 656.454748603352 - }, - "Moral Scenarios - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" - } - } - }, - { - "evaluation_name": "Nutrition", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Nutrition", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.712, - "details": { - "description": "min=0.712, mean=0.712, max=0.712, sum=1.425 (2)", - "tab": "Accuracy", - "Nutrition - Observed inference time (s)": { - "description": "min=1.485, mean=1.485, max=1.485, sum=2.971 (2)", - "tab": "Efficiency", - "score": 1.4853957338270798 - }, - "Nutrition - # eval": { - "description": "min=306, mean=306, max=306, sum=612 (2)", - "tab": "General information", - "score": 306.0 - }, - "Nutrition - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Nutrition - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Nutrition - # prompt tokens": { - "description": "min=586.814, mean=586.814, max=586.814, sum=1173.627 (2)", - "tab": "General information", - "score": 586.8137254901961 - }, - "Nutrition - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" - } - } - }, - { - "evaluation_name": "Prehistory", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Prehistory", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.728, - "details": { - "description": "min=0.728, mean=0.728, max=0.728, sum=1.457 (2)", - "tab": "Accuracy", - "Prehistory - Observed inference time (s)": { - "description": "min=0.792, mean=0.792, max=0.792, sum=1.584 (2)", - "tab": "Efficiency", - "score": 0.7917959955003526 - }, - "Prehistory - # eval": { - "description": "min=324, mean=324, max=324, sum=648 (2)", - "tab": "General information", - "score": 324.0 - }, - "Prehistory - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Prehistory - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Prehistory - # prompt tokens": { - "description": "min=514.528, mean=514.528, max=514.528, sum=1029.056 (2)", - "tab": "General information", - "score": 514.5277777777778 - }, - "Prehistory - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" - } - } - }, - { - "evaluation_name": "Public Relations", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Public Relations", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.664, - "details": { - "description": "min=0.664, mean=0.664, max=0.664, sum=1.327 (2)", - "tab": "Accuracy", - "Public Relations - Observed inference time (s)": { - "description": "min=0.493, mean=0.493, max=0.493, sum=0.986 (2)", - "tab": "Efficiency", - "score": 0.49318039634011007 - }, - "Public Relations - # eval": { - "description": "min=110, mean=110, max=110, sum=220 (2)", - "tab": "General information", - "score": 110.0 - }, - "Public Relations - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Public Relations - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Public Relations - # prompt tokens": { - "description": "min=405.318, mean=405.318, max=405.318, sum=810.636 (2)", - "tab": "General information", - "score": 405.3181818181818 - }, - "Public Relations - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" - } - } - }, - { - "evaluation_name": "Security Studies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Security Studies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.576, - "details": { - "description": "min=0.576, mean=0.576, max=0.576, sum=1.151 (2)", - "tab": 
"Accuracy", - "Security Studies - Observed inference time (s)": { - "description": "min=0.656, mean=0.656, max=0.656, sum=1.312 (2)", - "tab": "Efficiency", - "score": 0.6561975401275012 - }, - "Security Studies - # eval": { - "description": "min=245, mean=245, max=245, sum=490 (2)", - "tab": "General information", - "score": 245.0 - }, - "Security Studies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Security Studies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Security Studies - # prompt tokens": { - "description": "min=1164.473, mean=1164.473, max=1164.473, sum=2328.947 (2)", - "tab": "General information", - "score": 1164.4734693877551 - }, - "Security Studies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" - } - } - }, - { - "evaluation_name": "Sociology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Sociology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.701, - "details": { - "description": "min=0.701, mean=0.701, max=0.701, sum=1.403 (2)", - "tab": "Accuracy", - "Sociology - Observed inference time (s)": { - "description": "min=0.517, mean=0.517, max=0.517, sum=1.034 (2)", - "tab": "Efficiency", - "score": 0.5170851643405744 - }, - "Sociology - # eval": { - "description": "min=201, mean=201, max=201, sum=402 (2)", - "tab": "General information", - "score": 201.0 - }, - "Sociology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Sociology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Sociology - # prompt tokens": { - "description": "min=445.517, mean=445.517, max=445.517, sum=891.035 (2)", - "tab": "General information", - "score": 445.51741293532336 - }, - "Sociology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" - } - } - }, - { - "evaluation_name": "Virology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Virology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.446, - "details": { - "description": "min=0.446, mean=0.446, max=0.446, sum=0.892 (2)", - "tab": "Accuracy", - "Virology - Observed inference time (s)": { - "description": "min=0.406, mean=0.406, max=0.406, sum=0.813 (2)", - "tab": "Efficiency", - "score": 0.40646702553852493 - }, - "Virology - # 
eval": { - "description": "min=166, mean=166, max=166, sum=332 (2)", - "tab": "General information", - "score": 166.0 - }, - "Virology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Virology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Virology - # prompt tokens": { - "description": "min=343.018, mean=343.018, max=343.018, sum=686.036 (2)", - "tab": "General information", - "score": 343.01807228915663 - }, - "Virology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" - } - } - }, - { - "evaluation_name": "World Religions", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on World Religions", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.789, - "details": { - "description": "min=0.789, mean=0.789, max=0.789, sum=1.579 (2)", - "tab": "Accuracy", - "World Religions - Observed inference time (s)": { - "description": "min=0.587, mean=0.587, max=0.587, sum=1.173 (2)", - "tab": "Efficiency", - "score": 0.5866640882882458 - }, - "World Religions - # eval": { - "description": "min=171, mean=171, max=171, sum=342 (2)", - "tab": "General information", - "score": 171.0 - }, - "World Religions - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "World Religions - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "World Religions - # prompt tokens": { - "description": "min=274.52, mean=274.52, max=274.52, sum=549.041 (2)", - "tab": "General information", - "score": 274.5204678362573 - }, - "World Religions - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" - } - } - }, - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.475, - "details": { - "tab": "Efficiency" - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_mmlu/meta/llama-3.2-11b-vision-instruct-turbo/bb6fd9af-5dd0-4590-b6ea-7687029ca18c.json b/data/helm_mmlu/meta/llama-3.2-11b-vision-instruct-turbo/bb6fd9af-5dd0-4590-b6ea-7687029ca18c.json deleted file mode 100644 index 599cd6855..000000000 --- 
a/data/helm_mmlu/meta/llama-3.2-11b-vision-instruct-turbo/bb6fd9af-5dd0-4590-b6ea-7687029ca18c.json +++ /dev/null @@ -1,3021 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/meta_llama-3.2-11b-vision-instruct-turbo/1770835937.459157", - "retrieved_timestamp": "1770835937.459157", - "source_metadata": { - "source_name": "helm_mmlu", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama 3.2 Vision Instruct Turbo 11B", - "id": "meta/llama-3.2-11b-vision-instruct-turbo", - "developer": "meta", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "MMLU All Subjects", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU All Subjects", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.565, - "details": { - "description": "min=0.25, mean=0.565, max=0.865, sum=64.419 (114)", - "tab": "Accuracy", - "MMLU All Subjects - Observed inference time (s)": { - "description": "min=0.204, mean=0.255, max=0.726, sum=29.095 (114)", - "tab": "Efficiency", - "score": 0.2552187424358169 - }, - "MMLU All Subjects - # eval": { - "description": "min=100, mean=246.351, max=1534, sum=28084 (114)", - "tab": "General information", - "score": 246.35087719298247 - }, - "MMLU All Subjects - # train": { - "description": "min=5, mean=5, max=5, sum=570 (114)", - "tab": "General information", - "score": 5.0 - }, - "MMLU All Subjects - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (114)", - "tab": "General information", - "score": 0.0 - }, - "MMLU All Subjects - # prompt tokens": { - "description": "min=274.52, mean=614.619, max=2797.885, sum=70066.61 (114)", - "tab": "General information", - "score": 614.6193817308517 - }, - "MMLU All Subjects - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (114)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - 
"security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] - } - } - }, - { - "evaluation_name": "Abstract Algebra", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Abstract Algebra", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.28, - "details": { - "description": "min=0.28, mean=0.28, max=0.28, sum=0.56 (2)", - "tab": "Accuracy", - "Abstract Algebra - Observed inference time (s)": { - "description": "min=0.227, mean=0.227, max=0.227, sum=0.454 (2)", - "tab": "Efficiency", - "score": 0.2272411847114563 - }, - "Abstract Algebra - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Abstract Algebra - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Abstract Algebra - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Abstract Algebra - # prompt tokens": { - "description": "min=373.43, mean=373.43, max=373.43, sum=746.86 (2)", - "tab": "General information", - "score": 373.43 - }, - "Abstract Algebra - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" - } - } - }, - { - "evaluation_name": "Anatomy", - "source_data": { - 
"dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Anatomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.533, - "details": { - "description": "min=0.533, mean=0.533, max=0.533, sum=1.067 (2)", - "tab": "Accuracy", - "Anatomy - Observed inference time (s)": { - "description": "min=0.222, mean=0.222, max=0.222, sum=0.443 (2)", - "tab": "Efficiency", - "score": 0.22151856069211606 - }, - "Anatomy - # eval": { - "description": "min=135, mean=135, max=135, sum=270 (2)", - "tab": "General information", - "score": 135.0 - }, - "Anatomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Anatomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Anatomy - # prompt tokens": { - "description": "min=353.874, mean=353.874, max=353.874, sum=707.748 (2)", - "tab": "General information", - "score": 353.8740740740741 - }, - "Anatomy - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" - } - } - }, - { - "evaluation_name": "College Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on College Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.333, - "details": { - "description": "min=0.333, mean=0.333, max=0.333, sum=0.667 (2)", - "tab": "Accuracy", - "College Chemistry - Observed inference time (s)": { - "description": "min=0.726, mean=0.726, max=0.726, sum=1.453 (2)", - "tab": "Efficiency", - "score": 0.7264108276367187 - }, - "College Biology - Observed inference time (s)": { - "description": "min=0.244, mean=0.244, max=0.244, sum=0.488 (2)", - "tab": "Efficiency", - "score": 0.24387328988975948 - }, - "College Computer Science - Observed inference time (s)": { - "description": "min=0.216, mean=0.216, max=0.216, sum=0.433 (2)", - "tab": "Efficiency", - "score": 0.21631600618362426 - }, - "College Mathematics - Observed inference time (s)": { - "description": "min=0.219, mean=0.219, max=0.219, sum=0.437 (2)", - "tab": "Efficiency", - "score": 0.21859397411346435 - }, - "College Medicine - Observed inference time (s)": { - "description": "min=0.22, mean=0.22, max=0.22, sum=0.439 (2)", - "tab": "Efficiency", - "score": 0.21971637665191826 - }, - "College Physics - Observed inference time (s)": { - "description": "min=0.226, mean=0.226, max=0.226, sum=0.452 (2)", - "tab": "Efficiency", - "score": 0.22610483683791816 - }, - "College Chemistry - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Chemistry - 
truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Chemistry - # prompt tokens": { - "description": "min=549.28, mean=549.28, max=549.28, sum=1098.56 (2)", - "tab": "General information", - "score": 549.28 - }, - "College Chemistry - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # eval": { - "description": "min=144, mean=144, max=144, sum=288 (2)", - "tab": "General information", - "score": 144.0 - }, - "College Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # prompt tokens": { - "description": "min=473.875, mean=473.875, max=473.875, sum=947.75 (2)", - "tab": "General information", - "score": 473.875 - }, - "College Biology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer Science - # prompt tokens": { - "description": "min=828.29, mean=828.29, max=828.29, sum=1656.58 (2)", - "tab": "General information", - "score": 828.29 - }, - "College Computer Science - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # prompt tokens": { - "description": "min=594.51, mean=594.51, max=594.51, sum=1189.02 (2)", - "tab": "General information", - "score": 594.51 - }, - "College Mathematics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Medicine - # eval": { - "description": "min=173, mean=173, max=173, sum=346 (2)", - "tab": "General information", - "score": 173.0 - }, - "College Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Medicine - # prompt tokens": { - "description": "min=502.705, mean=502.705, max=502.705, sum=1005.41 (2)", - "tab": "General information", - "score": 502.70520231213874 - }, - "College Medicine - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Physics - # eval": { - "description": "min=102, mean=102, max=102, 
sum=204 (2)", - "tab": "General information", - "score": 102.0 - }, - "College Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Physics - # prompt tokens": { - "description": "min=503.569, mean=503.569, max=503.569, sum=1007.137 (2)", - "tab": "General information", - "score": 503.5686274509804 - }, - "College Physics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" - } - } - }, - { - "evaluation_name": "Computer Security", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Computer Security", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.71, - "details": { - "description": "min=0.71, mean=0.71, max=0.71, sum=1.42 (2)", - "tab": "Accuracy", - "Computer Security - Observed inference time (s)": { - "description": "min=0.469, mean=0.469, max=0.469, sum=0.938 (2)", - "tab": "Efficiency", - "score": 0.4692394161224365 - }, - "Computer Security - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Computer Security - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Computer Security - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Computer Security - # prompt tokens": { - "description": "min=378.51, mean=378.51, max=378.51, sum=757.02 (2)", - "tab": "General information", - "score": 378.51 - }, - "Computer Security - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" - } - } - }, - { - "evaluation_name": "Econometrics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Econometrics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.395, - "details": { - "description": "min=0.395, mean=0.395, max=0.395, sum=0.789 (2)", - "tab": "Accuracy", - "Econometrics - Observed inference time (s)": { - "description": "min=0.226, mean=0.226, max=0.226, sum=0.451 (2)", - "tab": "Efficiency", - "score": 0.22570312023162842 - }, - "Econometrics - # eval": { - "description": "min=114, mean=114, max=114, sum=228 (2)", - "tab": "General information", - "score": 114.0 - }, - "Econometrics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - 
"tab": "General information", - "score": 5.0 - }, - "Econometrics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Econometrics - # prompt tokens": { - "description": "min=614.421, mean=614.421, max=614.421, sum=1228.842 (2)", - "tab": "General information", - "score": 614.421052631579 - }, - "Econometrics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" - } - } - }, - { - "evaluation_name": "Global Facts", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Global Facts", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.25, - "details": { - "description": "min=0.25, mean=0.25, max=0.25, sum=0.5 (2)", - "tab": "Accuracy", - "Global Facts - Observed inference time (s)": { - "description": "min=0.249, mean=0.249, max=0.249, sum=0.497 (2)", - "tab": "Efficiency", - "score": 0.24868298768997193 - }, - "Global Facts - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Global Facts - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Global Facts - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Global Facts - # prompt tokens": { - "description": "min=399.71, mean=399.71, max=399.71, sum=799.42 (2)", - "tab": "General information", - "score": 399.71 - }, - "Global Facts - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" - } - } - }, - { - "evaluation_name": "Jurisprudence", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Jurisprudence", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.722, - "details": { - "description": "min=0.722, mean=0.722, max=0.722, sum=1.444 (2)", - "tab": "Accuracy", - "Jurisprudence - Observed inference time (s)": { - "description": "min=0.204, mean=0.204, max=0.204, sum=0.409 (2)", - "tab": "Efficiency", - "score": 0.20448691756637008 - }, - "Jurisprudence - # eval": { - "description": "min=108, mean=108, max=108, sum=216 (2)", - "tab": "General information", - "score": 108.0 - }, - "Jurisprudence - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Jurisprudence - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Jurisprudence - # prompt 
tokens": { - "description": "min=394.63, mean=394.63, max=394.63, sum=789.259 (2)", - "tab": "General information", - "score": 394.6296296296296 - }, - "Jurisprudence - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" - } - } - }, - { - "evaluation_name": "Philosophy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Philosophy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.646, - "details": { - "description": "min=0.646, mean=0.646, max=0.646, sum=1.293 (2)", - "tab": "Accuracy", - "Philosophy - Observed inference time (s)": { - "description": "min=0.216, mean=0.216, max=0.216, sum=0.433 (2)", - "tab": "Efficiency", - "score": 0.21639636628497452 - }, - "Philosophy - # eval": { - "description": "min=311, mean=311, max=311, sum=622 (2)", - "tab": "General information", - "score": 311.0 - }, - "Philosophy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Philosophy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Philosophy - # prompt tokens": { - "description": "min=329.084, mean=329.084, max=329.084, sum=658.167 (2)", - "tab": "General information", - "score": 329.08360128617363 - }, - "Philosophy - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" - } - } - }, - { - "evaluation_name": "Professional Psychology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Professional Psychology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.649, - "details": { - "description": "min=0.649, mean=0.649, max=0.649, sum=1.297 (2)", - "tab": "Accuracy", - "Professional Medicine - Observed inference time (s)": { - "description": "min=0.306, mean=0.306, max=0.306, sum=0.613 (2)", - "tab": "Efficiency", - "score": 0.30631748893681693 - }, - "Professional Accounting - Observed inference time (s)": { - "description": "min=0.236, mean=0.236, max=0.236, sum=0.472 (2)", - "tab": "Efficiency", - "score": 0.23619349882112328 - }, - "Professional Law - Observed inference time (s)": { - "description": "min=0.291, mean=0.291, max=0.291, sum=0.581 (2)", - "tab": "Efficiency", - "score": 0.2907135481940099 - }, - "Professional Psychology - Observed inference time (s)": { - "description": "min=0.233, mean=0.233, max=0.233, sum=0.465 (2)", - "tab": "Efficiency", - "score": 0.23272827988356545 - }, - "Professional Medicine - # eval": { - "description": "min=272, mean=272, 
max=272, sum=544 (2)", - "tab": "General information", - "score": 272.0 - }, - "Professional Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Medicine - # prompt tokens": { - "description": "min=1094.489, mean=1094.489, max=1094.489, sum=2188.978 (2)", - "tab": "General information", - "score": 1094.4889705882354 - }, - "Professional Medicine - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Accounting - # eval": { - "description": "min=282, mean=282, max=282, sum=564 (2)", - "tab": "General information", - "score": 282.0 - }, - "Professional Accounting - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Accounting - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Accounting - # prompt tokens": { - "description": "min=658.585, mean=658.585, max=658.585, sum=1317.17 (2)", - "tab": "General information", - "score": 658.5851063829788 - }, - "Professional Accounting - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # eval": { - "description": "min=1534, mean=1534, max=1534, sum=3068 (2)", - "tab": "General information", - "score": 1534.0 - }, - "Professional Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # prompt tokens": { - "description": "min=1637.601, mean=1637.601, max=1637.601, sum=3275.202 (2)", - "tab": "General information", - "score": 1637.6010430247718 - }, - "Professional Law - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # eval": { - "description": "min=612, mean=612, max=612, sum=1224 (2)", - "tab": "General information", - "score": 612.0 - }, - "Professional Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # prompt tokens": { - "description": "min=575.098, mean=575.098, max=575.098, sum=1150.196 (2)", - "tab": "General information", - "score": 575.0980392156863 - }, - "Professional Psychology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" - } - } - }, - { - "evaluation_name": "Us Foreign Policy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { 
- "evaluation_description": "EM on Us Foreign Policy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.78, - "details": { - "description": "min=0.78, mean=0.78, max=0.78, sum=1.56 (2)", - "tab": "Accuracy", - "Us Foreign Policy - Observed inference time (s)": { - "description": "min=0.383, mean=0.383, max=0.383, sum=0.765 (2)", - "tab": "Efficiency", - "score": 0.3825261640548706 - }, - "Us Foreign Policy - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Us Foreign Policy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Us Foreign Policy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Us Foreign Policy - # prompt tokens": { - "description": "min=422.79, mean=422.79, max=422.79, sum=845.58 (2)", - "tab": "General information", - "score": 422.79 - }, - "Us Foreign Policy - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" - } - } - }, - { - "evaluation_name": "Astronomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Astronomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.671, - "details": { - "description": "min=0.671, mean=0.671, max=0.671, sum=1.342 (2)", - "tab": "Accuracy", - "Astronomy - Observed inference time (s)": { - "description": "min=0.249, mean=0.249, max=0.249, sum=0.497 (2)", - "tab": "Efficiency", - "score": 0.24860012060717532 - }, - "Astronomy - # eval": { - "description": "min=152, mean=152, max=152, sum=304 (2)", - "tab": "General information", - "score": 152.0 - }, - "Astronomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Astronomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Astronomy - # prompt tokens": { - "description": "min=579.684, mean=579.684, max=579.684, sum=1159.368 (2)", - "tab": "General information", - "score": 579.6842105263158 - }, - "Astronomy - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" - } - } - }, - { - "evaluation_name": "Business Ethics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Business Ethics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.64, - "details": 
{ - "description": "min=0.64, mean=0.64, max=0.64, sum=1.28 (2)", - "tab": "Accuracy", - "Business Ethics - Observed inference time (s)": { - "description": "min=0.231, mean=0.231, max=0.231, sum=0.462 (2)", - "tab": "Efficiency", - "score": 0.23080476760864257 - }, - "Business Ethics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Business Ethics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Business Ethics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Business Ethics - # prompt tokens": { - "description": "min=569.52, mean=569.52, max=569.52, sum=1139.04 (2)", - "tab": "General information", - "score": 569.52 - }, - "Business Ethics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" - } - } - }, - { - "evaluation_name": "Clinical Knowledge", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Clinical Knowledge", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.638, - "details": { - "description": "min=0.638, mean=0.638, max=0.638, sum=1.275 (2)", - "tab": "Accuracy", - "Clinical Knowledge - Observed inference time (s)": { - "description": "min=0.23, mean=0.23, max=0.23, sum=0.46 (2)", - "tab": "Efficiency", - "score": 0.22993840721418274 - }, - "Clinical Knowledge - # eval": { - "description": "min=265, mean=265, max=265, sum=530 (2)", - "tab": "General information", - "score": 265.0 - }, - "Clinical Knowledge - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Clinical Knowledge - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Clinical Knowledge - # prompt tokens": { - "description": "min=397.928, mean=397.928, max=397.928, sum=795.857 (2)", - "tab": "General information", - "score": 397.92830188679244 - }, - "Clinical Knowledge - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" - } - } - }, - { - "evaluation_name": "Conceptual Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Conceptual Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.536, - "details": { - "description": "min=0.536, mean=0.536, max=0.536, sum=1.072 (2)", - "tab": "Accuracy", - "Conceptual Physics - Observed 
inference time (s)": { - "description": "min=0.221, mean=0.221, max=0.221, sum=0.441 (2)", - "tab": "Efficiency", - "score": 0.2206148127292065 - }, - "Conceptual Physics - # eval": { - "description": "min=235, mean=235, max=235, sum=470 (2)", - "tab": "General information", - "score": 235.0 - }, - "Conceptual Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Conceptual Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Conceptual Physics - # prompt tokens": { - "description": "min=304.834, mean=304.834, max=304.834, sum=609.668 (2)", - "tab": "General information", - "score": 304.83404255319147 - }, - "Conceptual Physics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" - } - } - }, - { - "evaluation_name": "Electrical Engineering", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Electrical Engineering", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.51, - "details": { - "description": "min=0.51, mean=0.51, max=0.51, sum=1.021 (2)", - "tab": "Accuracy", - "Electrical Engineering - Observed inference time (s)": { - "description": "min=0.231, mean=0.231, max=0.231, sum=0.461 (2)", - "tab": "Efficiency", - "score": 0.23056076312887258 - }, - "Electrical Engineering - # eval": { - "description": "min=145, mean=145, max=145, sum=290 (2)", - "tab": "General information", - "score": 145.0 - }, - "Electrical Engineering - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Electrical Engineering - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Electrical Engineering - # prompt tokens": { - "description": "min=435.607, mean=435.607, max=435.607, sum=871.214 (2)", - "tab": "General information", - "score": 435.60689655172416 - }, - "Electrical Engineering - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" - } - } - }, - { - "evaluation_name": "Elementary Mathematics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Elementary Mathematics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.458, - "details": { - "description": "min=0.458, mean=0.458, max=0.458, sum=0.915 (2)", - "tab": "Accuracy", - "Elementary Mathematics - Observed inference time (s)": { - 
"description": "min=0.224, mean=0.224, max=0.224, sum=0.447 (2)", - "tab": "Efficiency", - "score": 0.22350322569488848 - }, - "Elementary Mathematics - # eval": { - "description": "min=378, mean=378, max=378, sum=756 (2)", - "tab": "General information", - "score": 378.0 - }, - "Elementary Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Elementary Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Elementary Mathematics - # prompt tokens": { - "description": "min=531.854, mean=531.854, max=531.854, sum=1063.709 (2)", - "tab": "General information", - "score": 531.8544973544973 - }, - "Elementary Mathematics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" - } - } - }, - { - "evaluation_name": "Formal Logic", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Formal Logic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.46, - "details": { - "description": "min=0.46, mean=0.46, max=0.46, sum=0.921 (2)", - "tab": "Accuracy", - "Formal Logic - Observed inference time (s)": { - "description": "min=0.229, mean=0.229, max=0.229, sum=0.458 (2)", - "tab": "Efficiency", - "score": 0.22878488661750915 - }, - "Formal Logic - # eval": { - "description": "min=126, mean=126, max=126, sum=252 (2)", - "tab": "General information", - "score": 126.0 - }, - "Formal Logic - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Formal Logic - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Formal Logic - # prompt tokens": { - "description": "min=601.778, mean=601.778, max=601.778, sum=1203.556 (2)", - "tab": "General information", - "score": 601.7777777777778 - }, - "Formal Logic - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" - } - } - }, - { - "evaluation_name": "High School World History", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on High School World History", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.502, - "details": { - "description": "min=0.502, mean=0.502, max=0.502, sum=1.004 (2)", - "tab": "Accuracy", - "High School Biology - Observed inference time (s)": { - "description": "min=0.225, mean=0.225, max=0.225, sum=0.449 (2)", - "tab": "Efficiency", - "score": 
0.22474505209153697 - }, - "High School Chemistry - Observed inference time (s)": { - "description": "min=0.212, mean=0.212, max=0.212, sum=0.424 (2)", - "tab": "Efficiency", - "score": 0.21204462192328694 - }, - "High School Computer Science - Observed inference time (s)": { - "description": "min=0.23, mean=0.23, max=0.23, sum=0.461 (2)", - "tab": "Efficiency", - "score": 0.2303963828086853 - }, - "High School European History - Observed inference time (s)": { - "description": "min=0.287, mean=0.287, max=0.287, sum=0.574 (2)", - "tab": "Efficiency", - "score": 0.28706942760583126 - }, - "High School Geography - Observed inference time (s)": { - "description": "min=0.229, mean=0.229, max=0.229, sum=0.458 (2)", - "tab": "Efficiency", - "score": 0.22903898388448388 - }, - "High School Government And Politics - Observed inference time (s)": { - "description": "min=0.234, mean=0.234, max=0.234, sum=0.469 (2)", - "tab": "Efficiency", - "score": 0.23445281092984688 - }, - "High School Macroeconomics - Observed inference time (s)": { - "description": "min=0.229, mean=0.229, max=0.229, sum=0.459 (2)", - "tab": "Efficiency", - "score": 0.22930157551398644 - }, - "High School Mathematics - Observed inference time (s)": { - "description": "min=0.23, mean=0.23, max=0.23, sum=0.46 (2)", - "tab": "Efficiency", - "score": 0.23021557595994738 - }, - "High School Microeconomics - Observed inference time (s)": { - "description": "min=0.235, mean=0.235, max=0.235, sum=0.471 (2)", - "tab": "Efficiency", - "score": 0.2354360087579038 - }, - "High School Physics - Observed inference time (s)": { - "description": "min=0.229, mean=0.229, max=0.229, sum=0.458 (2)", - "tab": "Efficiency", - "score": 0.22899133953827105 - }, - "High School Psychology - Observed inference time (s)": { - "description": "min=0.227, mean=0.227, max=0.227, sum=0.454 (2)", - "tab": "Efficiency", - "score": 0.22700285386601718 - }, - "High School Statistics - Observed inference time (s)": { - "description": "min=0.24, mean=0.24, max=0.24, sum=0.48 (2)", - "tab": "Efficiency", - "score": 0.2400491248678278 - }, - "High School US History - Observed inference time (s)": { - "description": "min=0.253, mean=0.253, max=0.253, sum=0.506 (2)", - "tab": "Efficiency", - "score": 0.2529456720632665 - }, - "High School World History - Observed inference time (s)": { - "description": "min=0.25, mean=0.25, max=0.25, sum=0.499 (2)", - "tab": "Efficiency", - "score": 0.249685173799217 - }, - "High School Biology - # eval": { - "description": "min=310, mean=310, max=310, sum=620 (2)", - "tab": "General information", - "score": 310.0 - }, - "High School Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Biology - # prompt tokens": { - "description": "min=513.671, mean=513.671, max=513.671, sum=1027.342 (2)", - "tab": "General information", - "score": 513.6709677419354 - }, - "High School Biology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # eval": { - "description": "min=203, mean=203, max=203, sum=406 (2)", - "tab": "General information", - "score": 203.0 - }, - "High School Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School 
Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # prompt tokens": { - "description": "min=496.704, mean=496.704, max=496.704, sum=993.409 (2)", - "tab": "General information", - "score": 496.70443349753697 - }, - "High School Chemistry - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "High School Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # prompt tokens": { - "description": "min=867.78, mean=867.78, max=867.78, sum=1735.56 (2)", - "tab": "General information", - "score": 867.78 - }, - "High School Computer Science - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School European History - # eval": { - "description": "min=165, mean=165, max=165, sum=330 (2)", - "tab": "General information", - "score": 165.0 - }, - "High School European History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School European History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School European History - # prompt tokens": { - "description": "min=2797.885, mean=2797.885, max=2797.885, sum=5595.77 (2)", - "tab": "General information", - "score": 2797.8848484848486 - }, - "High School European History - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Geography - # eval": { - "description": "min=198, mean=198, max=198, sum=396 (2)", - "tab": "General information", - "score": 198.0 - }, - "High School Geography - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Geography - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Geography - # prompt tokens": { - "description": "min=372.035, mean=372.035, max=372.035, sum=744.071 (2)", - "tab": "General information", - "score": 372.0353535353535 - }, - "High School Geography - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # eval": { - "description": "min=193, mean=193, max=193, sum=386 (2)", - "tab": "General information", - "score": 193.0 - }, - "High School Government And Politics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Government And Politics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # prompt tokens": { - "description": "min=465.824, mean=465.824, max=465.824, sum=931.648 (2)", - "tab": "General information", - "score": 465.8238341968912 
- }, - "High School Government And Politics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - # eval": { - "description": "min=390, mean=390, max=390, sum=780 (2)", - "tab": "General information", - "score": 390.0 - }, - "High School Macroeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Macroeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - # prompt tokens": { - "description": "min=370.908, mean=370.908, max=370.908, sum=741.815 (2)", - "tab": "General information", - "score": 370.9076923076923 - }, - "High School Macroeconomics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # eval": { - "description": "min=270, mean=270, max=270, sum=540 (2)", - "tab": "General information", - "score": 270.0 - }, - "High School Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # prompt tokens": { - "description": "min=532.356, mean=532.356, max=532.356, sum=1064.711 (2)", - "tab": "General information", - "score": 532.3555555555556 - }, - "High School Mathematics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Microeconomics - # eval": { - "description": "min=238, mean=238, max=238, sum=476 (2)", - "tab": "General information", - "score": 238.0 - }, - "High School Microeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Microeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Microeconomics - # prompt tokens": { - "description": "min=399.013, mean=399.013, max=399.013, sum=798.025 (2)", - "tab": "General information", - "score": 399.0126050420168 - }, - "High School Microeconomics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # eval": { - "description": "min=151, mean=151, max=151, sum=302 (2)", - "tab": "General information", - "score": 151.0 - }, - "High School Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # prompt tokens": { - "description": "min=560.457, mean=560.457, max=560.457, sum=1120.914 (2)", - "tab": "General information", - "score": 560.4569536423841 - }, - "High School Physics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # eval": { - "description": "min=545, mean=545, max=545, sum=1090 (2)", - "tab": "General information", - "score": 545.0 - }, - "High School Psychology - # train": { - 
"description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # prompt tokens": { - "description": "min=495.242, mean=495.242, max=495.242, sum=990.484 (2)", - "tab": "General information", - "score": 495.2422018348624 - }, - "High School Psychology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Statistics - # eval": { - "description": "min=216, mean=216, max=216, sum=432 (2)", - "tab": "General information", - "score": 216.0 - }, - "High School Statistics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Statistics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Statistics - # prompt tokens": { - "description": "min=795.639, mean=795.639, max=795.639, sum=1591.278 (2)", - "tab": "General information", - "score": 795.6388888888889 - }, - "High School Statistics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # eval": { - "description": "min=204, mean=204, max=204, sum=408 (2)", - "tab": "General information", - "score": 204.0 - }, - "High School US History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School US History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # prompt tokens": { - "description": "min=2217.809, mean=2217.809, max=2217.809, sum=4435.618 (2)", - "tab": "General information", - "score": 2217.8088235294117 - }, - "High School US History - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School World History - # eval": { - "description": "min=237, mean=237, max=237, sum=474 (2)", - "tab": "General information", - "score": 237.0 - }, - "High School World History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School World History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School World History - # prompt tokens": { - "description": "min=1428.173, mean=1428.173, max=1428.173, sum=2856.346 (2)", - "tab": "General information", - "score": 1428.1729957805908 - }, - "High School World History - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" - } - } - }, - { - "evaluation_name": "Human Sexuality", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Human Sexuality", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.763, - "details": { - "description": "min=0.763, mean=0.763, max=0.763, sum=1.527 (2)", - "tab": "Accuracy", - "Human Aging - Observed inference time (s)": { - "description": "min=0.322, mean=0.322, max=0.322, sum=0.645 (2)", - "tab": "Efficiency", - "score": 0.32235514315789054 - }, - "Human Sexuality - Observed inference time (s)": { - "description": "min=0.245, mean=0.245, max=0.245, sum=0.49 (2)", - "tab": "Efficiency", - "score": 0.24487258095777673 - }, - "Human Aging - # eval": { - "description": "min=223, mean=223, max=223, sum=446 (2)", - "tab": "General information", - "score": 223.0 - }, - "Human Aging - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Aging - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Aging - # prompt tokens": { - "description": "min=319.888, mean=319.888, max=319.888, sum=639.776 (2)", - "tab": "General information", - "score": 319.88789237668163 - }, - "Human Aging - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # eval": { - "description": "min=131, mean=131, max=131, sum=262 (2)", - "tab": "General information", - "score": 131.0 - }, - "Human Sexuality - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Sexuality - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # prompt tokens": { - "description": "min=341.168, mean=341.168, max=341.168, sum=682.336 (2)", - "tab": "General information", - "score": 341.1679389312977 - }, - "Human Sexuality - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" - } - } - }, - { - "evaluation_name": "International Law", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on International Law", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.711, - "details": { - "description": "min=0.711, mean=0.711, max=0.711, sum=1.421 (2)", - "tab": "Accuracy", - "International Law - Observed inference time (s)": { - "description": "min=0.231, mean=0.231, max=0.231, sum=0.462 (2)", - "tab": "Efficiency", - "score": 0.23109814943360887 - }, - "International Law - # eval": { - "description": "min=121, mean=121, max=121, sum=242 (2)", - "tab": "General information", - "score": 121.0 - }, - "International Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "International Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "International Law - # prompt tokens": { - "description": "min=639.818, mean=639.818, max=639.818, 
sum=1279.636 (2)", - "tab": "General information", - "score": 639.8181818181819 - }, - "International Law - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" - } - } - }, - { - "evaluation_name": "Logical Fallacies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Logical Fallacies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.742, - "details": { - "description": "min=0.742, mean=0.742, max=0.742, sum=1.485 (2)", - "tab": "Accuracy", - "Logical Fallacies - Observed inference time (s)": { - "description": "min=0.22, mean=0.22, max=0.22, sum=0.44 (2)", - "tab": "Efficiency", - "score": 0.21997687714231526 - }, - "Logical Fallacies - # eval": { - "description": "min=163, mean=163, max=163, sum=326 (2)", - "tab": "General information", - "score": 163.0 - }, - "Logical Fallacies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Logical Fallacies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Logical Fallacies - # prompt tokens": { - "description": "min=449.564, mean=449.564, max=449.564, sum=899.129 (2)", - "tab": "General information", - "score": 449.5644171779141 - }, - "Logical Fallacies - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" - } - } - }, - { - "evaluation_name": "Machine Learning", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Machine Learning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.375, - "details": { - "description": "min=0.375, mean=0.375, max=0.375, sum=0.75 (2)", - "tab": "Accuracy", - "Machine Learning - Observed inference time (s)": { - "description": "min=0.234, mean=0.234, max=0.234, sum=0.467 (2)", - "tab": "Efficiency", - "score": 0.2336032326732363 - }, - "Machine Learning - # eval": { - "description": "min=112, mean=112, max=112, sum=224 (2)", - "tab": "General information", - "score": 112.0 - }, - "Machine Learning - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Machine Learning - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Machine Learning - # prompt tokens": { - "description": "min=668.054, mean=668.054, max=668.054, sum=1336.107 (2)", - "tab": "General information", - "score": 668.0535714285714 - }, - "Machine Learning - # output 
tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" - } - } - }, - { - "evaluation_name": "Management", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Management", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.728, - "details": { - "description": "min=0.728, mean=0.728, max=0.728, sum=1.456 (2)", - "tab": "Accuracy", - "Management - Observed inference time (s)": { - "description": "min=0.246, mean=0.246, max=0.246, sum=0.491 (2)", - "tab": "Efficiency", - "score": 0.24564221067335998 - }, - "Management - # eval": { - "description": "min=103, mean=103, max=103, sum=206 (2)", - "tab": "General information", - "score": 103.0 - }, - "Management - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Management - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Management - # prompt tokens": { - "description": "min=283.786, mean=283.786, max=283.786, sum=567.573 (2)", - "tab": "General information", - "score": 283.7864077669903 - }, - "Management - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" - } - } - }, - { - "evaluation_name": "Marketing", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Marketing", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.838, - "details": { - "description": "min=0.838, mean=0.838, max=0.838, sum=1.675 (2)", - "tab": "Accuracy", - "Marketing - Observed inference time (s)": { - "description": "min=0.269, mean=0.269, max=0.269, sum=0.537 (2)", - "tab": "Efficiency", - "score": 0.26863190149649596 - }, - "Marketing - # eval": { - "description": "min=234, mean=234, max=234, sum=468 (2)", - "tab": "General information", - "score": 234.0 - }, - "Marketing - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Marketing - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Marketing - # prompt tokens": { - "description": "min=404.218, mean=404.218, max=404.218, sum=808.436 (2)", - "tab": "General information", - "score": 404.21794871794873 - }, - "Marketing - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "marketing", - "method": "multiple_choice_joint", - 
"eval_split": "test", - "groups": "mmlu_marketing" - } - } - }, - { - "evaluation_name": "Medical Genetics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Medical Genetics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7, - "details": { - "description": "min=0.7, mean=0.7, max=0.7, sum=1.4 (2)", - "tab": "Accuracy", - "Medical Genetics - Observed inference time (s)": { - "description": "min=0.273, mean=0.273, max=0.273, sum=0.546 (2)", - "tab": "Efficiency", - "score": 0.2728374266624451 - }, - "Medical Genetics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Medical Genetics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Medical Genetics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Medical Genetics - # prompt tokens": { - "description": "min=340.99, mean=340.99, max=340.99, sum=681.98 (2)", - "tab": "General information", - "score": 340.99 - }, - "Medical Genetics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" - } - } - }, - { - "evaluation_name": "Miscellaneous", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Miscellaneous", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.644, - "details": { - "description": "min=0.644, mean=0.644, max=0.644, sum=1.287 (2)", - "tab": "Accuracy", - "Miscellaneous - Observed inference time (s)": { - "description": "min=0.336, mean=0.336, max=0.336, sum=0.673 (2)", - "tab": "Efficiency", - "score": 0.33641790095264734 - }, - "Miscellaneous - # eval": { - "description": "min=783, mean=783, max=783, sum=1566 (2)", - "tab": "General information", - "score": 783.0 - }, - "Miscellaneous - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Miscellaneous - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Miscellaneous - # prompt tokens": { - "description": "min=299.911, mean=299.911, max=299.911, sum=599.821 (2)", - "tab": "General information", - "score": 299.9106002554278 - }, - "Miscellaneous - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" - } - } - }, - { - "evaluation_name": "Moral Scenarios", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": 
"url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Moral Scenarios", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.328, - "details": { - "description": "min=0.328, mean=0.328, max=0.328, sum=0.657 (2)", - "tab": "Accuracy", - "Moral Disputes - Observed inference time (s)": { - "description": "min=0.243, mean=0.243, max=0.243, sum=0.486 (2)", - "tab": "Efficiency", - "score": 0.24306911126726624 - }, - "Moral Scenarios - Observed inference time (s)": { - "description": "min=0.229, mean=0.229, max=0.229, sum=0.458 (2)", - "tab": "Efficiency", - "score": 0.2289134478435836 - }, - "Moral Disputes - # eval": { - "description": "min=346, mean=346, max=346, sum=692 (2)", - "tab": "General information", - "score": 346.0 - }, - "Moral Disputes - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Disputes - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Disputes - # prompt tokens": { - "description": "min=476.113, mean=476.113, max=476.113, sum=952.225 (2)", - "tab": "General information", - "score": 476.1127167630058 - }, - "Moral Disputes - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # eval": { - "description": "min=895, mean=895, max=895, sum=1790 (2)", - "tab": "General information", - "score": 895.0 - }, - "Moral Scenarios - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Scenarios - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # prompt tokens": { - "description": "min=656.455, mean=656.455, max=656.455, sum=1312.909 (2)", - "tab": "General information", - "score": 656.454748603352 - }, - "Moral Scenarios - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" - } - } - }, - { - "evaluation_name": "Nutrition", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Nutrition", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.752, - "details": { - "description": "min=0.752, mean=0.752, max=0.752, sum=1.503 (2)", - "tab": "Accuracy", - "Nutrition - Observed inference time (s)": { - "description": "min=0.24, mean=0.24, max=0.24, sum=0.48 (2)", - "tab": "Efficiency", - "score": 0.2399757040871514 - }, - "Nutrition - # eval": { - "description": "min=306, mean=306, max=306, sum=612 (2)", - "tab": "General information", - "score": 306.0 - }, - "Nutrition - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Nutrition - truncated": { - 
"description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Nutrition - # prompt tokens": { - "description": "min=586.814, mean=586.814, max=586.814, sum=1173.627 (2)", - "tab": "General information", - "score": 586.8137254901961 - }, - "Nutrition - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" - } - } - }, - { - "evaluation_name": "Prehistory", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Prehistory", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.744, - "details": { - "description": "min=0.744, mean=0.744, max=0.744, sum=1.488 (2)", - "tab": "Accuracy", - "Prehistory - Observed inference time (s)": { - "description": "min=0.229, mean=0.229, max=0.229, sum=0.457 (2)", - "tab": "Efficiency", - "score": 0.2287170680952661 - }, - "Prehistory - # eval": { - "description": "min=324, mean=324, max=324, sum=648 (2)", - "tab": "General information", - "score": 324.0 - }, - "Prehistory - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Prehistory - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Prehistory - # prompt tokens": { - "description": "min=514.528, mean=514.528, max=514.528, sum=1029.056 (2)", - "tab": "General information", - "score": 514.5277777777778 - }, - "Prehistory - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" - } - } - }, - { - "evaluation_name": "Public Relations", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Public Relations", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.645, - "details": { - "description": "min=0.645, mean=0.645, max=0.645, sum=1.291 (2)", - "tab": "Accuracy", - "Public Relations - Observed inference time (s)": { - "description": "min=0.246, mean=0.246, max=0.246, sum=0.491 (2)", - "tab": "Efficiency", - "score": 0.24565653367476029 - }, - "Public Relations - # eval": { - "description": "min=110, mean=110, max=110, sum=220 (2)", - "tab": "General information", - "score": 110.0 - }, - "Public Relations - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Public Relations - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Public Relations - # prompt tokens": { - "description": "min=405.318, mean=405.318, max=405.318, 
sum=810.636 (2)", - "tab": "General information", - "score": 405.3181818181818 - }, - "Public Relations - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" - } - } - }, - { - "evaluation_name": "Security Studies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Security Studies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.567, - "details": { - "description": "min=0.567, mean=0.567, max=0.567, sum=1.135 (2)", - "tab": "Accuracy", - "Security Studies - Observed inference time (s)": { - "description": "min=0.253, mean=0.253, max=0.253, sum=0.506 (2)", - "tab": "Efficiency", - "score": 0.25285910878862655 - }, - "Security Studies - # eval": { - "description": "min=245, mean=245, max=245, sum=490 (2)", - "tab": "General information", - "score": 245.0 - }, - "Security Studies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Security Studies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Security Studies - # prompt tokens": { - "description": "min=1164.473, mean=1164.473, max=1164.473, sum=2328.947 (2)", - "tab": "General information", - "score": 1164.4734693877551 - }, - "Security Studies - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" - } - } - }, - { - "evaluation_name": "Sociology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Sociology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.627, - "details": { - "description": "min=0.627, mean=0.627, max=0.627, sum=1.254 (2)", - "tab": "Accuracy", - "Sociology - Observed inference time (s)": { - "description": "min=0.234, mean=0.234, max=0.234, sum=0.468 (2)", - "tab": "Efficiency", - "score": 0.23380224503094876 - }, - "Sociology - # eval": { - "description": "min=201, mean=201, max=201, sum=402 (2)", - "tab": "General information", - "score": 201.0 - }, - "Sociology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Sociology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Sociology - # prompt tokens": { - "description": "min=445.517, mean=445.517, max=445.517, sum=891.035 (2)", - "tab": "General information", - "score": 445.51741293532336 - }, - "Sociology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 
(2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" - } - } - }, - { - "evaluation_name": "Virology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Virology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.446, - "details": { - "description": "min=0.446, mean=0.446, max=0.446, sum=0.892 (2)", - "tab": "Accuracy", - "Virology - Observed inference time (s)": { - "description": "min=0.223, mean=0.223, max=0.223, sum=0.447 (2)", - "tab": "Efficiency", - "score": 0.22334270161318492 - }, - "Virology - # eval": { - "description": "min=166, mean=166, max=166, sum=332 (2)", - "tab": "General information", - "score": 166.0 - }, - "Virology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Virology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Virology - # prompt tokens": { - "description": "min=343.018, mean=343.018, max=343.018, sum=686.036 (2)", - "tab": "General information", - "score": 343.01807228915663 - }, - "Virology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" - } - } - }, - { - "evaluation_name": "World Religions", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on World Religions", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.696, - "details": { - "description": "min=0.696, mean=0.696, max=0.696, sum=1.392 (2)", - "tab": "Accuracy", - "World Religions - Observed inference time (s)": { - "description": "min=0.239, mean=0.239, max=0.239, sum=0.478 (2)", - "tab": "Efficiency", - "score": 0.23875254357767384 - }, - "World Religions - # eval": { - "description": "min=171, mean=171, max=171, sum=342 (2)", - "tab": "General information", - "score": 171.0 - }, - "World Religions - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "World Religions - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "World Religions - # prompt tokens": { - "description": "min=274.52, mean=274.52, max=274.52, sum=549.041 (2)", - "tab": "General information", - "score": 274.5204678362573 - }, - "World Religions - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": 
"mmlu_world_religions" - } - } - }, - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.897, - "details": { - "tab": "Efficiency" - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_mmlu/meta/llama-3.2-90b-vision-instruct-turbo/e036de72-b425-4aa5-9448-dc52560e60db.json b/data/helm_mmlu/meta/llama-3.2-90b-vision-instruct-turbo/e036de72-b425-4aa5-9448-dc52560e60db.json deleted file mode 100644 index f14700c78..000000000 --- a/data/helm_mmlu/meta/llama-3.2-90b-vision-instruct-turbo/e036de72-b425-4aa5-9448-dc52560e60db.json +++ /dev/null @@ -1,3021 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/meta_llama-3.2-90b-vision-instruct-turbo/1770835937.459157", - "retrieved_timestamp": "1770835937.459157", - "source_metadata": { - "source_name": "helm_mmlu", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama 3.2 Vision Instruct Turbo 90B", - "id": "meta/llama-3.2-90b-vision-instruct-turbo", - "developer": "meta", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "MMLU All Subjects", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU All Subjects", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.803, - "details": { - "description": "min=0.407, mean=0.803, max=0.979, sum=91.503 (114)", - "tab": "Accuracy", - "MMLU All Subjects - Observed inference time (s)": { - "description": "min=0.256, mean=0.374, max=2.612, sum=42.58 (114)", - "tab": "Efficiency", - "score": 0.37350966276831277 - }, - "MMLU All Subjects - # eval": { - "description": "min=100, mean=246.351, max=1534, sum=28084 (114)", - "tab": "General information", - "score": 246.35087719298247 - }, - "MMLU All Subjects - # train": { - "description": "min=5, mean=5, max=5, sum=570 (114)", - "tab": "General information", - "score": 5.0 - }, - "MMLU All Subjects - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (114)", - "tab": "General information", - "score": 0.0 - }, - "MMLU All Subjects - # prompt tokens": { - "description": "min=274.52, mean=614.619, max=2797.885, sum=70066.61 (114)", - "tab": "General information", - "score": 614.6193817308517 - }, - "MMLU All Subjects - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (114)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - 
"econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] - } - } - }, - { - "evaluation_name": "Abstract Algebra", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Abstract Algebra", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.52, - "details": { - "description": "min=0.52, mean=0.52, max=0.52, sum=1.04 (2)", - "tab": "Accuracy", - "Abstract Algebra - Observed inference time (s)": { - "description": "min=2.612, mean=2.612, max=2.612, sum=5.224 (2)", - "tab": "Efficiency", - "score": 2.611864836215973 - }, - "Abstract Algebra - # eval": { - "description": "min=100, 
mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Abstract Algebra - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Abstract Algebra - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Abstract Algebra - # prompt tokens": { - "description": "min=373.43, mean=373.43, max=373.43, sum=746.86 (2)", - "tab": "General information", - "score": 373.43 - }, - "Abstract Algebra - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" - } - } - }, - { - "evaluation_name": "Anatomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Anatomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8, - "details": { - "description": "min=0.8, mean=0.8, max=0.8, sum=1.6 (2)", - "tab": "Accuracy", - "Anatomy - Observed inference time (s)": { - "description": "min=0.336, mean=0.336, max=0.336, sum=0.672 (2)", - "tab": "Efficiency", - "score": 0.3359027315069128 - }, - "Anatomy - # eval": { - "description": "min=135, mean=135, max=135, sum=270 (2)", - "tab": "General information", - "score": 135.0 - }, - "Anatomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Anatomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Anatomy - # prompt tokens": { - "description": "min=353.874, mean=353.874, max=353.874, sum=707.748 (2)", - "tab": "General information", - "score": 353.8740740740741 - }, - "Anatomy - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" - } - } - }, - { - "evaluation_name": "College Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on College Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.539, - "details": { - "description": "min=0.539, mean=0.539, max=0.539, sum=1.078 (2)", - "tab": "Accuracy", - "College Chemistry - Observed inference time (s)": { - "description": "min=0.31, mean=0.31, max=0.31, sum=0.621 (2)", - "tab": "Efficiency", - "score": 0.3104448890686035 - }, - "College Biology - Observed inference time (s)": { - "description": "min=0.272, mean=0.272, max=0.272, sum=0.544 (2)", - "tab": "Efficiency", - "score": 0.2720499005582597 - }, - "College Computer Science - Observed inference time (s)": { - "description": "min=0.321, mean=0.321, max=0.321, sum=0.642 
(2)", - "tab": "Efficiency", - "score": 0.32119542360305786 - }, - "College Mathematics - Observed inference time (s)": { - "description": "min=0.315, mean=0.315, max=0.315, sum=0.63 (2)", - "tab": "Efficiency", - "score": 0.31477957487106323 - }, - "College Medicine - Observed inference time (s)": { - "description": "min=0.283, mean=0.283, max=0.283, sum=0.566 (2)", - "tab": "Efficiency", - "score": 0.28313319255850905 - }, - "College Physics - Observed inference time (s)": { - "description": "min=0.317, mean=0.317, max=0.317, sum=0.634 (2)", - "tab": "Efficiency", - "score": 0.31692570097306194 - }, - "College Chemistry - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Chemistry - # prompt tokens": { - "description": "min=549.28, mean=549.28, max=549.28, sum=1098.56 (2)", - "tab": "General information", - "score": 549.28 - }, - "College Chemistry - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # eval": { - "description": "min=144, mean=144, max=144, sum=288 (2)", - "tab": "General information", - "score": 144.0 - }, - "College Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # prompt tokens": { - "description": "min=473.875, mean=473.875, max=473.875, sum=947.75 (2)", - "tab": "General information", - "score": 473.875 - }, - "College Biology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer Science - # prompt tokens": { - "description": "min=828.29, mean=828.29, max=828.29, sum=1656.58 (2)", - "tab": "General information", - "score": 828.29 - }, - "College Computer Science - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # prompt tokens": { - "description": "min=594.51, mean=594.51, max=594.51, sum=1189.02 (2)", - "tab": "General information", - "score": 594.51 - }, - "College Mathematics - # output tokens": { - "description": 
"min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Medicine - # eval": { - "description": "min=173, mean=173, max=173, sum=346 (2)", - "tab": "General information", - "score": 173.0 - }, - "College Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Medicine - # prompt tokens": { - "description": "min=502.705, mean=502.705, max=502.705, sum=1005.41 (2)", - "tab": "General information", - "score": 502.70520231213874 - }, - "College Medicine - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Physics - # eval": { - "description": "min=102, mean=102, max=102, sum=204 (2)", - "tab": "General information", - "score": 102.0 - }, - "College Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Physics - # prompt tokens": { - "description": "min=503.569, mean=503.569, max=503.569, sum=1007.137 (2)", - "tab": "General information", - "score": 503.5686274509804 - }, - "College Physics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" - } - } - }, - { - "evaluation_name": "Computer Security", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Computer Security", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.81, - "details": { - "description": "min=0.81, mean=0.81, max=0.81, sum=1.62 (2)", - "tab": "Accuracy", - "Computer Security - Observed inference time (s)": { - "description": "min=0.266, mean=0.266, max=0.266, sum=0.532 (2)", - "tab": "Efficiency", - "score": 0.26576273441314696 - }, - "Computer Security - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Computer Security - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Computer Security - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Computer Security - # prompt tokens": { - "description": "min=378.51, mean=378.51, max=378.51, sum=757.02 (2)", - "tab": "General information", - "score": 378.51 - }, - "Computer Security - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" - } - } - }, - { - "evaluation_name": "Econometrics", - 
"source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Econometrics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.684, - "details": { - "description": "min=0.684, mean=0.684, max=0.684, sum=1.368 (2)", - "tab": "Accuracy", - "Econometrics - Observed inference time (s)": { - "description": "min=0.297, mean=0.297, max=0.297, sum=0.595 (2)", - "tab": "Efficiency", - "score": 0.2972530210227297 - }, - "Econometrics - # eval": { - "description": "min=114, mean=114, max=114, sum=228 (2)", - "tab": "General information", - "score": 114.0 - }, - "Econometrics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Econometrics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Econometrics - # prompt tokens": { - "description": "min=614.421, mean=614.421, max=614.421, sum=1228.842 (2)", - "tab": "General information", - "score": 614.421052631579 - }, - "Econometrics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" - } - } - }, - { - "evaluation_name": "Global Facts", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Global Facts", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6, - "details": { - "description": "min=0.6, mean=0.6, max=0.6, sum=1.2 (2)", - "tab": "Accuracy", - "Global Facts - Observed inference time (s)": { - "description": "min=0.267, mean=0.267, max=0.267, sum=0.533 (2)", - "tab": "Efficiency", - "score": 0.2666162133216858 - }, - "Global Facts - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Global Facts - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Global Facts - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Global Facts - # prompt tokens": { - "description": "min=399.71, mean=399.71, max=399.71, sum=799.42 (2)", - "tab": "General information", - "score": 399.71 - }, - "Global Facts - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" - } - } - }, - { - "evaluation_name": "Jurisprudence", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { 
- "evaluation_description": "EM on Jurisprudence", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.88, - "details": { - "description": "min=0.88, mean=0.88, max=0.88, sum=1.759 (2)", - "tab": "Accuracy", - "Jurisprudence - Observed inference time (s)": { - "description": "min=0.279, mean=0.279, max=0.279, sum=0.558 (2)", - "tab": "Efficiency", - "score": 0.278864703796528 - }, - "Jurisprudence - # eval": { - "description": "min=108, mean=108, max=108, sum=216 (2)", - "tab": "General information", - "score": 108.0 - }, - "Jurisprudence - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Jurisprudence - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Jurisprudence - # prompt tokens": { - "description": "min=394.63, mean=394.63, max=394.63, sum=789.259 (2)", - "tab": "General information", - "score": 394.6296296296296 - }, - "Jurisprudence - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" - } - } - }, - { - "evaluation_name": "Philosophy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Philosophy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.839, - "details": { - "description": "min=0.839, mean=0.839, max=0.839, sum=1.678 (2)", - "tab": "Accuracy", - "Philosophy - Observed inference time (s)": { - "description": "min=0.297, mean=0.297, max=0.297, sum=0.594 (2)", - "tab": "Efficiency", - "score": 0.29689135582117404 - }, - "Philosophy - # eval": { - "description": "min=311, mean=311, max=311, sum=622 (2)", - "tab": "General information", - "score": 311.0 - }, - "Philosophy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Philosophy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Philosophy - # prompt tokens": { - "description": "min=329.084, mean=329.084, max=329.084, sum=658.167 (2)", - "tab": "General information", - "score": 329.08360128617363 - }, - "Philosophy - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" - } - } - }, - { - "evaluation_name": "Professional Psychology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Professional Psychology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.843, - 
"details": { - "description": "min=0.843, mean=0.843, max=0.843, sum=1.686 (2)", - "tab": "Accuracy", - "Professional Medicine - Observed inference time (s)": { - "description": "min=0.553, mean=0.553, max=0.553, sum=1.106 (2)", - "tab": "Efficiency", - "score": 0.5529017465956071 - }, - "Professional Accounting - Observed inference time (s)": { - "description": "min=0.323, mean=0.323, max=0.323, sum=0.647 (2)", - "tab": "Efficiency", - "score": 0.32346555189038 - }, - "Professional Law - Observed inference time (s)": { - "description": "min=0.372, mean=0.372, max=0.372, sum=0.743 (2)", - "tab": "Efficiency", - "score": 0.3715069820859131 - }, - "Professional Psychology - Observed inference time (s)": { - "description": "min=0.315, mean=0.315, max=0.315, sum=0.63 (2)", - "tab": "Efficiency", - "score": 0.3151663907992294 - }, - "Professional Medicine - # eval": { - "description": "min=272, mean=272, max=272, sum=544 (2)", - "tab": "General information", - "score": 272.0 - }, - "Professional Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Medicine - # prompt tokens": { - "description": "min=1094.489, mean=1094.489, max=1094.489, sum=2188.978 (2)", - "tab": "General information", - "score": 1094.4889705882354 - }, - "Professional Medicine - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Accounting - # eval": { - "description": "min=282, mean=282, max=282, sum=564 (2)", - "tab": "General information", - "score": 282.0 - }, - "Professional Accounting - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Accounting - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Accounting - # prompt tokens": { - "description": "min=658.585, mean=658.585, max=658.585, sum=1317.17 (2)", - "tab": "General information", - "score": 658.5851063829788 - }, - "Professional Accounting - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # eval": { - "description": "min=1534, mean=1534, max=1534, sum=3068 (2)", - "tab": "General information", - "score": 1534.0 - }, - "Professional Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # prompt tokens": { - "description": "min=1637.601, mean=1637.601, max=1637.601, sum=3275.202 (2)", - "tab": "General information", - "score": 1637.6010430247718 - }, - "Professional Law - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # eval": { - "description": "min=612, mean=612, max=612, sum=1224 (2)", - "tab": "General information", - "score": 612.0 - }, - "Professional Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Psychology - truncated": { - "description": "min=0, 
mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # prompt tokens": { - "description": "min=575.098, mean=575.098, max=575.098, sum=1150.196 (2)", - "tab": "General information", - "score": 575.0980392156863 - }, - "Professional Psychology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" - } - } - }, - { - "evaluation_name": "Us Foreign Policy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Us Foreign Policy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.93, - "details": { - "description": "min=0.93, mean=0.93, max=0.93, sum=1.86 (2)", - "tab": "Accuracy", - "Us Foreign Policy - Observed inference time (s)": { - "description": "min=0.507, mean=0.507, max=0.507, sum=1.014 (2)", - "tab": "Efficiency", - "score": 0.5069083476066589 - }, - "Us Foreign Policy - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Us Foreign Policy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Us Foreign Policy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Us Foreign Policy - # prompt tokens": { - "description": "min=422.79, mean=422.79, max=422.79, sum=845.58 (2)", - "tab": "General information", - "score": 422.79 - }, - "Us Foreign Policy - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" - } - } - }, - { - "evaluation_name": "Astronomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Astronomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.921, - "details": { - "description": "min=0.921, mean=0.921, max=0.921, sum=1.842 (2)", - "tab": "Accuracy", - "Astronomy - Observed inference time (s)": { - "description": "min=0.332, mean=0.332, max=0.332, sum=0.665 (2)", - "tab": "Efficiency", - "score": 0.3323579352152975 - }, - "Astronomy - # eval": { - "description": "min=152, mean=152, max=152, sum=304 (2)", - "tab": "General information", - "score": 152.0 - }, - "Astronomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Astronomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Astronomy - # prompt tokens": { - "description": "min=579.684, 
mean=579.684, max=579.684, sum=1159.368 (2)", - "tab": "General information", - "score": 579.6842105263158 - }, - "Astronomy - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" - } - } - }, - { - "evaluation_name": "Business Ethics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Business Ethics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.76, - "details": { - "description": "min=0.76, mean=0.76, max=0.76, sum=1.52 (2)", - "tab": "Accuracy", - "Business Ethics - Observed inference time (s)": { - "description": "min=0.291, mean=0.291, max=0.291, sum=0.581 (2)", - "tab": "Efficiency", - "score": 0.29072295665740966 - }, - "Business Ethics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Business Ethics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Business Ethics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Business Ethics - # prompt tokens": { - "description": "min=569.52, mean=569.52, max=569.52, sum=1139.04 (2)", - "tab": "General information", - "score": 569.52 - }, - "Business Ethics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" - } - } - }, - { - "evaluation_name": "Clinical Knowledge", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Clinical Knowledge", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.845, - "details": { - "description": "min=0.845, mean=0.845, max=0.845, sum=1.691 (2)", - "tab": "Accuracy", - "Clinical Knowledge - Observed inference time (s)": { - "description": "min=0.29, mean=0.29, max=0.29, sum=0.579 (2)", - "tab": "Efficiency", - "score": 0.2897273891376999 - }, - "Clinical Knowledge - # eval": { - "description": "min=265, mean=265, max=265, sum=530 (2)", - "tab": "General information", - "score": 265.0 - }, - "Clinical Knowledge - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Clinical Knowledge - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Clinical Knowledge - # prompt tokens": { - "description": "min=397.928, mean=397.928, max=397.928, sum=795.857 (2)", - "tab": "General information", - "score": 397.92830188679244 - }, - "Clinical Knowledge - # output tokens": { - 
"description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" - } - } - }, - { - "evaluation_name": "Conceptual Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Conceptual Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.826, - "details": { - "description": "min=0.826, mean=0.826, max=0.826, sum=1.651 (2)", - "tab": "Accuracy", - "Conceptual Physics - Observed inference time (s)": { - "description": "min=0.279, mean=0.279, max=0.279, sum=0.559 (2)", - "tab": "Efficiency", - "score": 0.2794749209221373 - }, - "Conceptual Physics - # eval": { - "description": "min=235, mean=235, max=235, sum=470 (2)", - "tab": "General information", - "score": 235.0 - }, - "Conceptual Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Conceptual Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Conceptual Physics - # prompt tokens": { - "description": "min=304.834, mean=304.834, max=304.834, sum=609.668 (2)", - "tab": "General information", - "score": 304.83404255319147 - }, - "Conceptual Physics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" - } - } - }, - { - "evaluation_name": "Electrical Engineering", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Electrical Engineering", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.759, - "details": { - "description": "min=0.759, mean=0.759, max=0.759, sum=1.517 (2)", - "tab": "Accuracy", - "Electrical Engineering - Observed inference time (s)": { - "description": "min=0.256, mean=0.256, max=0.256, sum=0.512 (2)", - "tab": "Efficiency", - "score": 0.2558267790695717 - }, - "Electrical Engineering - # eval": { - "description": "min=145, mean=145, max=145, sum=290 (2)", - "tab": "General information", - "score": 145.0 - }, - "Electrical Engineering - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Electrical Engineering - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Electrical Engineering - # prompt tokens": { - "description": "min=435.607, mean=435.607, max=435.607, sum=871.214 (2)", - "tab": "General information", - "score": 435.60689655172416 - }, - "Electrical Engineering - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - 
"tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" - } - } - }, - { - "evaluation_name": "Elementary Mathematics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Elementary Mathematics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.688, - "details": { - "description": "min=0.688, mean=0.688, max=0.688, sum=1.376 (2)", - "tab": "Accuracy", - "Elementary Mathematics - Observed inference time (s)": { - "description": "min=0.308, mean=0.308, max=0.308, sum=0.617 (2)", - "tab": "Efficiency", - "score": 0.30840403945357714 - }, - "Elementary Mathematics - # eval": { - "description": "min=378, mean=378, max=378, sum=756 (2)", - "tab": "General information", - "score": 378.0 - }, - "Elementary Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Elementary Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Elementary Mathematics - # prompt tokens": { - "description": "min=531.854, mean=531.854, max=531.854, sum=1063.709 (2)", - "tab": "General information", - "score": 531.8544973544973 - }, - "Elementary Mathematics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" - } - } - }, - { - "evaluation_name": "Formal Logic", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Formal Logic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.683, - "details": { - "description": "min=0.683, mean=0.683, max=0.683, sum=1.365 (2)", - "tab": "Accuracy", - "Formal Logic - Observed inference time (s)": { - "description": "min=0.304, mean=0.304, max=0.304, sum=0.609 (2)", - "tab": "Efficiency", - "score": 0.30448357074979754 - }, - "Formal Logic - # eval": { - "description": "min=126, mean=126, max=126, sum=252 (2)", - "tab": "General information", - "score": 126.0 - }, - "Formal Logic - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Formal Logic - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Formal Logic - # prompt tokens": { - "description": "min=601.778, mean=601.778, max=601.778, sum=1203.556 (2)", - "tab": "General information", - "score": 601.7777777777778 - }, - "Formal Logic - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { 
- "additional_details": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" - } - } - }, - { - "evaluation_name": "High School World History", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on High School World History", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.941, - "details": { - "description": "min=0.941, mean=0.941, max=0.941, sum=1.882 (2)", - "tab": "Accuracy", - "High School Biology - Observed inference time (s)": { - "description": "min=0.309, mean=0.309, max=0.309, sum=0.619 (2)", - "tab": "Efficiency", - "score": 0.3094667688492806 - }, - "High School Chemistry - Observed inference time (s)": { - "description": "min=0.294, mean=0.294, max=0.294, sum=0.588 (2)", - "tab": "Efficiency", - "score": 0.29394797386207017 - }, - "High School Computer Science - Observed inference time (s)": { - "description": "min=0.301, mean=0.301, max=0.301, sum=0.602 (2)", - "tab": "Efficiency", - "score": 0.30106969356536867 - }, - "High School European History - Observed inference time (s)": { - "description": "min=0.48, mean=0.48, max=0.48, sum=0.96 (2)", - "tab": "Efficiency", - "score": 0.4799844944115841 - }, - "High School Geography - Observed inference time (s)": { - "description": "min=0.297, mean=0.297, max=0.297, sum=0.595 (2)", - "tab": "Efficiency", - "score": 0.29747620014229204 - }, - "High School Government And Politics - Observed inference time (s)": { - "description": "min=0.291, mean=0.291, max=0.291, sum=0.583 (2)", - "tab": "Efficiency", - "score": 0.2914604300662026 - }, - "High School Macroeconomics - Observed inference time (s)": { - "description": "min=0.279, mean=0.279, max=0.279, sum=0.557 (2)", - "tab": "Efficiency", - "score": 0.27857950650728663 - }, - "High School Mathematics - Observed inference time (s)": { - "description": "min=0.312, mean=0.312, max=0.312, sum=0.625 (2)", - "tab": "Efficiency", - "score": 0.3123831342767786 - }, - "High School Microeconomics - Observed inference time (s)": { - "description": "min=0.302, mean=0.302, max=0.302, sum=0.603 (2)", - "tab": "Efficiency", - "score": 0.30159517997453195 - }, - "High School Physics - Observed inference time (s)": { - "description": "min=0.322, mean=0.322, max=0.322, sum=0.643 (2)", - "tab": "Efficiency", - "score": 0.32152655108874995 - }, - "High School Psychology - Observed inference time (s)": { - "description": "min=0.29, mean=0.29, max=0.29, sum=0.581 (2)", - "tab": "Efficiency", - "score": 0.2903494253071076 - }, - "High School Statistics - Observed inference time (s)": { - "description": "min=0.333, mean=0.333, max=0.333, sum=0.667 (2)", - "tab": "Efficiency", - "score": 0.33328031720938506 - }, - "High School US History - Observed inference time (s)": { - "description": "min=0.394, mean=0.394, max=0.394, sum=0.788 (2)", - "tab": "Efficiency", - "score": 0.39396579826579375 - }, - "High School World History - Observed inference time (s)": { - "description": "min=0.679, mean=0.679, max=0.679, sum=1.359 (2)", - "tab": "Efficiency", - "score": 0.6793377369265013 - }, - "High School Biology - # eval": { - "description": "min=310, mean=310, max=310, sum=620 (2)", - "tab": "General information", - "score": 310.0 - }, - "High School 
Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Biology - # prompt tokens": { - "description": "min=513.671, mean=513.671, max=513.671, sum=1027.342 (2)", - "tab": "General information", - "score": 513.6709677419354 - }, - "High School Biology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # eval": { - "description": "min=203, mean=203, max=203, sum=406 (2)", - "tab": "General information", - "score": 203.0 - }, - "High School Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # prompt tokens": { - "description": "min=496.704, mean=496.704, max=496.704, sum=993.409 (2)", - "tab": "General information", - "score": 496.70443349753697 - }, - "High School Chemistry - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "High School Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # prompt tokens": { - "description": "min=867.78, mean=867.78, max=867.78, sum=1735.56 (2)", - "tab": "General information", - "score": 867.78 - }, - "High School Computer Science - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School European History - # eval": { - "description": "min=165, mean=165, max=165, sum=330 (2)", - "tab": "General information", - "score": 165.0 - }, - "High School European History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School European History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School European History - # prompt tokens": { - "description": "min=2797.885, mean=2797.885, max=2797.885, sum=5595.77 (2)", - "tab": "General information", - "score": 2797.8848484848486 - }, - "High School European History - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Geography - # eval": { - "description": "min=198, mean=198, max=198, sum=396 (2)", - "tab": "General information", - "score": 198.0 - }, - "High School Geography - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Geography - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Geography - # prompt tokens": { - "description": "min=372.035, mean=372.035, 
max=372.035, sum=744.071 (2)", - "tab": "General information", - "score": 372.0353535353535 - }, - "High School Geography - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # eval": { - "description": "min=193, mean=193, max=193, sum=386 (2)", - "tab": "General information", - "score": 193.0 - }, - "High School Government And Politics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Government And Politics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # prompt tokens": { - "description": "min=465.824, mean=465.824, max=465.824, sum=931.648 (2)", - "tab": "General information", - "score": 465.8238341968912 - }, - "High School Government And Politics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - # eval": { - "description": "min=390, mean=390, max=390, sum=780 (2)", - "tab": "General information", - "score": 390.0 - }, - "High School Macroeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Macroeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - # prompt tokens": { - "description": "min=370.908, mean=370.908, max=370.908, sum=741.815 (2)", - "tab": "General information", - "score": 370.9076923076923 - }, - "High School Macroeconomics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # eval": { - "description": "min=270, mean=270, max=270, sum=540 (2)", - "tab": "General information", - "score": 270.0 - }, - "High School Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # prompt tokens": { - "description": "min=532.356, mean=532.356, max=532.356, sum=1064.711 (2)", - "tab": "General information", - "score": 532.3555555555556 - }, - "High School Mathematics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Microeconomics - # eval": { - "description": "min=238, mean=238, max=238, sum=476 (2)", - "tab": "General information", - "score": 238.0 - }, - "High School Microeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Microeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Microeconomics - # prompt tokens": { - "description": "min=399.013, mean=399.013, max=399.013, sum=798.025 (2)", - "tab": "General information", - "score": 399.0126050420168 - }, - "High School Microeconomics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # eval": { - 
"description": "min=151, mean=151, max=151, sum=302 (2)", - "tab": "General information", - "score": 151.0 - }, - "High School Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # prompt tokens": { - "description": "min=560.457, mean=560.457, max=560.457, sum=1120.914 (2)", - "tab": "General information", - "score": 560.4569536423841 - }, - "High School Physics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # eval": { - "description": "min=545, mean=545, max=545, sum=1090 (2)", - "tab": "General information", - "score": 545.0 - }, - "High School Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # prompt tokens": { - "description": "min=495.242, mean=495.242, max=495.242, sum=990.484 (2)", - "tab": "General information", - "score": 495.2422018348624 - }, - "High School Psychology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Statistics - # eval": { - "description": "min=216, mean=216, max=216, sum=432 (2)", - "tab": "General information", - "score": 216.0 - }, - "High School Statistics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Statistics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Statistics - # prompt tokens": { - "description": "min=795.639, mean=795.639, max=795.639, sum=1591.278 (2)", - "tab": "General information", - "score": 795.6388888888889 - }, - "High School Statistics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # eval": { - "description": "min=204, mean=204, max=204, sum=408 (2)", - "tab": "General information", - "score": 204.0 - }, - "High School US History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School US History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # prompt tokens": { - "description": "min=2217.809, mean=2217.809, max=2217.809, sum=4435.618 (2)", - "tab": "General information", - "score": 2217.8088235294117 - }, - "High School US History - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School World History - # eval": { - "description": "min=237, mean=237, max=237, sum=474 (2)", - "tab": "General information", - "score": 237.0 - }, - "High School World History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School World History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - 
"High School World History - # prompt tokens": { - "description": "min=1428.173, mean=1428.173, max=1428.173, sum=2856.346 (2)", - "tab": "General information", - "score": 1428.1729957805908 - }, - "High School World History - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" - } - } - }, - { - "evaluation_name": "Human Sexuality", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Human Sexuality", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.87, - "details": { - "description": "min=0.87, mean=0.87, max=0.87, sum=1.74 (2)", - "tab": "Accuracy", - "Human Aging - Observed inference time (s)": { - "description": "min=0.388, mean=0.388, max=0.388, sum=0.776 (2)", - "tab": "Efficiency", - "score": 0.38789880863754206 - }, - "Human Sexuality - Observed inference time (s)": { - "description": "min=0.293, mean=0.293, max=0.293, sum=0.586 (2)", - "tab": "Efficiency", - "score": 0.2929920222013051 - }, - "Human Aging - # eval": { - "description": "min=223, mean=223, max=223, sum=446 (2)", - "tab": "General information", - "score": 223.0 - }, - "Human Aging - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Aging - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Aging - # prompt tokens": { - "description": "min=319.888, mean=319.888, max=319.888, sum=639.776 (2)", - "tab": "General information", - "score": 319.88789237668163 - }, - "Human Aging - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # eval": { - "description": "min=131, mean=131, max=131, sum=262 (2)", - "tab": "General information", - "score": 131.0 - }, - "Human Sexuality - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Sexuality - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # prompt tokens": { - "description": "min=341.168, mean=341.168, max=341.168, sum=682.336 (2)", - "tab": "General information", - "score": 341.1679389312977 - }, - "Human Sexuality - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" - } - } - }, - { - "evaluation_name": "International Law", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on International Law", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.934, - "details": { - "description": "min=0.934, mean=0.934, max=0.934, sum=1.868 (2)", - "tab": "Accuracy", - "International Law - Observed inference time (s)": { - "description": "min=0.342, mean=0.342, max=0.342, sum=0.685 (2)", - "tab": "Efficiency", - "score": 0.34241620962284813 - }, - "International Law - # eval": { - "description": "min=121, mean=121, max=121, sum=242 (2)", - "tab": "General information", - "score": 121.0 - }, - "International Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "International Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "International Law - # prompt tokens": { - "description": "min=639.818, mean=639.818, max=639.818, sum=1279.636 (2)", - "tab": "General information", - "score": 639.8181818181819 - }, - "International Law - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" - } - } - }, - { - "evaluation_name": "Logical Fallacies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Logical Fallacies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.834, - "details": { - "description": "min=0.834, mean=0.834, max=0.834, sum=1.669 (2)", - "tab": "Accuracy", - "Logical Fallacies - Observed inference time (s)": { - "description": "min=0.282, mean=0.282, max=0.282, sum=0.565 (2)", - "tab": "Efficiency", - "score": 0.28232605325663745 - }, - "Logical Fallacies - # eval": { - "description": "min=163, mean=163, max=163, sum=326 (2)", - "tab": "General information", - "score": 163.0 - }, - "Logical Fallacies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Logical Fallacies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Logical Fallacies - # prompt tokens": { - "description": "min=449.564, mean=449.564, max=449.564, sum=899.129 (2)", - "tab": "General information", - "score": 449.5644171779141 - }, - "Logical Fallacies - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" - } - } - }, - { - "evaluation_name": "Machine Learning", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Machine Learning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.688, - 
"details": { - "description": "min=0.688, mean=0.688, max=0.688, sum=1.375 (2)", - "tab": "Accuracy", - "Machine Learning - Observed inference time (s)": { - "description": "min=0.338, mean=0.338, max=0.338, sum=0.676 (2)", - "tab": "Efficiency", - "score": 0.33782388057027546 - }, - "Machine Learning - # eval": { - "description": "min=112, mean=112, max=112, sum=224 (2)", - "tab": "General information", - "score": 112.0 - }, - "Machine Learning - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Machine Learning - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Machine Learning - # prompt tokens": { - "description": "min=668.054, mean=668.054, max=668.054, sum=1336.107 (2)", - "tab": "General information", - "score": 668.0535714285714 - }, - "Machine Learning - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" - } - } - }, - { - "evaluation_name": "Management", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Management", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.913, - "details": { - "description": "min=0.913, mean=0.913, max=0.913, sum=1.825 (2)", - "tab": "Accuracy", - "Management - Observed inference time (s)": { - "description": "min=0.285, mean=0.285, max=0.285, sum=0.571 (2)", - "tab": "Efficiency", - "score": 0.2853238027072647 - }, - "Management - # eval": { - "description": "min=103, mean=103, max=103, sum=206 (2)", - "tab": "General information", - "score": 103.0 - }, - "Management - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Management - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Management - # prompt tokens": { - "description": "min=283.786, mean=283.786, max=283.786, sum=567.573 (2)", - "tab": "General information", - "score": 283.7864077669903 - }, - "Management - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" - } - } - }, - { - "evaluation_name": "Marketing", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Marketing", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.944, - "details": { - "description": "min=0.944, mean=0.944, max=0.944, sum=1.889 (2)", - "tab": "Accuracy", - "Marketing - Observed inference time (s)": { - "description": "min=0.28, mean=0.28, max=0.28, 
sum=0.561 (2)", - "tab": "Efficiency", - "score": 0.28032574796269083 - }, - "Marketing - # eval": { - "description": "min=234, mean=234, max=234, sum=468 (2)", - "tab": "General information", - "score": 234.0 - }, - "Marketing - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Marketing - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Marketing - # prompt tokens": { - "description": "min=404.218, mean=404.218, max=404.218, sum=808.436 (2)", - "tab": "General information", - "score": 404.21794871794873 - }, - "Marketing - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" - } - } - }, - { - "evaluation_name": "Medical Genetics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Medical Genetics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.92, - "details": { - "description": "min=0.92, mean=0.92, max=0.92, sum=1.84 (2)", - "tab": "Accuracy", - "Medical Genetics - Observed inference time (s)": { - "description": "min=0.296, mean=0.296, max=0.296, sum=0.592 (2)", - "tab": "Efficiency", - "score": 0.29611136198043825 - }, - "Medical Genetics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Medical Genetics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Medical Genetics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Medical Genetics - # prompt tokens": { - "description": "min=340.99, mean=340.99, max=340.99, sum=681.98 (2)", - "tab": "General information", - "score": 340.99 - }, - "Medical Genetics - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" - } - } - }, - { - "evaluation_name": "Miscellaneous", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Miscellaneous", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.913, - "details": { - "description": "min=0.913, mean=0.913, max=0.913, sum=1.826 (2)", - "tab": "Accuracy", - "Miscellaneous - Observed inference time (s)": { - "description": "min=0.324, mean=0.324, max=0.324, sum=0.647 (2)", - "tab": "Efficiency", - "score": 0.3237126984967735 - }, - "Miscellaneous - # eval": { - "description": "min=783, mean=783, max=783, sum=1566 (2)", - "tab": "General information", - 
"score": 783.0 - }, - "Miscellaneous - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Miscellaneous - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Miscellaneous - # prompt tokens": { - "description": "min=299.911, mean=299.911, max=299.911, sum=599.821 (2)", - "tab": "General information", - "score": 299.9106002554278 - }, - "Miscellaneous - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" - } - } - }, - { - "evaluation_name": "Moral Scenarios", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Moral Scenarios", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.841, - "details": { - "description": "min=0.841, mean=0.841, max=0.841, sum=1.683 (2)", - "tab": "Accuracy", - "Moral Disputes - Observed inference time (s)": { - "description": "min=0.29, mean=0.29, max=0.29, sum=0.58 (2)", - "tab": "Efficiency", - "score": 0.2901734975032035 - }, - "Moral Scenarios - Observed inference time (s)": { - "description": "min=0.506, mean=0.506, max=0.506, sum=1.012 (2)", - "tab": "Efficiency", - "score": 0.5058047955262595 - }, - "Moral Disputes - # eval": { - "description": "min=346, mean=346, max=346, sum=692 (2)", - "tab": "General information", - "score": 346.0 - }, - "Moral Disputes - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Disputes - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Disputes - # prompt tokens": { - "description": "min=476.113, mean=476.113, max=476.113, sum=952.225 (2)", - "tab": "General information", - "score": 476.1127167630058 - }, - "Moral Disputes - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # eval": { - "description": "min=895, mean=895, max=895, sum=1790 (2)", - "tab": "General information", - "score": 895.0 - }, - "Moral Scenarios - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Scenarios - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # prompt tokens": { - "description": "min=656.455, mean=656.455, max=656.455, sum=1312.909 (2)", - "tab": "General information", - "score": 656.454748603352 - }, - "Moral Scenarios - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" - } - } - }, - { - "evaluation_name": "Nutrition", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - 
"url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Nutrition", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.889, - "details": { - "description": "min=0.889, mean=0.889, max=0.889, sum=1.778 (2)", - "tab": "Accuracy", - "Nutrition - Observed inference time (s)": { - "description": "min=0.321, mean=0.321, max=0.321, sum=0.641 (2)", - "tab": "Efficiency", - "score": 0.32064209264867444 - }, - "Nutrition - # eval": { - "description": "min=306, mean=306, max=306, sum=612 (2)", - "tab": "General information", - "score": 306.0 - }, - "Nutrition - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Nutrition - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Nutrition - # prompt tokens": { - "description": "min=586.814, mean=586.814, max=586.814, sum=1173.627 (2)", - "tab": "General information", - "score": 586.8137254901961 - }, - "Nutrition - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" - } - } - }, - { - "evaluation_name": "Prehistory", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Prehistory", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.886, - "details": { - "description": "min=0.886, mean=0.886, max=0.886, sum=1.772 (2)", - "tab": "Accuracy", - "Prehistory - Observed inference time (s)": { - "description": "min=0.614, mean=0.614, max=0.614, sum=1.227 (2)", - "tab": "Efficiency", - "score": 0.6136744522754057 - }, - "Prehistory - # eval": { - "description": "min=324, mean=324, max=324, sum=648 (2)", - "tab": "General information", - "score": 324.0 - }, - "Prehistory - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Prehistory - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Prehistory - # prompt tokens": { - "description": "min=514.528, mean=514.528, max=514.528, sum=1029.056 (2)", - "tab": "General information", - "score": 514.5277777777778 - }, - "Prehistory - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" - } - } - }, - { - "evaluation_name": "Public Relations", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Public Relations", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.718, - "details": { - "description": "min=0.718, mean=0.718, max=0.718, sum=1.436 (2)", - "tab": "Accuracy", - "Public Relations - Observed inference time (s)": { - "description": "min=0.3, mean=0.3, max=0.3, sum=0.599 (2)", - "tab": "Efficiency", - "score": 0.29952496832067316 - }, - "Public Relations - # eval": { - "description": "min=110, mean=110, max=110, sum=220 (2)", - "tab": "General information", - "score": 110.0 - }, - "Public Relations - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Public Relations - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Public Relations - # prompt tokens": { - "description": "min=405.318, mean=405.318, max=405.318, sum=810.636 (2)", - "tab": "General information", - "score": 405.3181818181818 - }, - "Public Relations - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" - } - } - }, - { - "evaluation_name": "Security Studies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Security Studies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.853, - "details": { - "description": "min=0.853, mean=0.853, max=0.853, sum=1.706 (2)", - "tab": "Accuracy", - "Security Studies - Observed inference time (s)": { - "description": "min=0.348, mean=0.348, max=0.348, sum=0.697 (2)", - "tab": "Efficiency", - "score": 0.348436891789339 - }, - "Security Studies - # eval": { - "description": "min=245, mean=245, max=245, sum=490 (2)", - "tab": "General information", - "score": 245.0 - }, - "Security Studies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Security Studies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Security Studies - # prompt tokens": { - "description": "min=1164.473, mean=1164.473, max=1164.473, sum=2328.947 (2)", - "tab": "General information", - "score": 1164.4734693877551 - }, - "Security Studies - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" - } - } - }, - { - "evaluation_name": "Sociology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Sociology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.92, - "details": { - "description": 
"min=0.92, mean=0.92, max=0.92, sum=1.841 (2)", - "tab": "Accuracy", - "Sociology - Observed inference time (s)": { - "description": "min=0.297, mean=0.297, max=0.297, sum=0.595 (2)", - "tab": "Efficiency", - "score": 0.29732529915387357 - }, - "Sociology - # eval": { - "description": "min=201, mean=201, max=201, sum=402 (2)", - "tab": "General information", - "score": 201.0 - }, - "Sociology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Sociology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Sociology - # prompt tokens": { - "description": "min=445.517, mean=445.517, max=445.517, sum=891.035 (2)", - "tab": "General information", - "score": 445.51741293532336 - }, - "Sociology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" - } - } - }, - { - "evaluation_name": "Virology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Virology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.584, - "details": { - "description": "min=0.584, mean=0.584, max=0.584, sum=1.169 (2)", - "tab": "Accuracy", - "Virology - Observed inference time (s)": { - "description": "min=0.321, mean=0.321, max=0.321, sum=0.642 (2)", - "tab": "Efficiency", - "score": 0.32124968609177923 - }, - "Virology - # eval": { - "description": "min=166, mean=166, max=166, sum=332 (2)", - "tab": "General information", - "score": 166.0 - }, - "Virology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Virology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Virology - # prompt tokens": { - "description": "min=343.018, mean=343.018, max=343.018, sum=686.036 (2)", - "tab": "General information", - "score": 343.01807228915663 - }, - "Virology - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" - } - } - }, - { - "evaluation_name": "World Religions", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on World Religions", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.901, - "details": { - "description": "min=0.901, mean=0.901, max=0.901, sum=1.801 (2)", - "tab": "Accuracy", - "World Religions - Observed inference time (s)": { - "description": "min=0.277, mean=0.277, max=0.277, sum=0.554 (2)", - "tab": "Efficiency", - "score": 0.27723441068191973 - }, - "World 
Religions - # eval": { - "description": "min=171, mean=171, max=171, sum=342 (2)", - "tab": "General information", - "score": 171.0 - }, - "World Religions - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "World Religions - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "World Religions - # prompt tokens": { - "description": "min=274.52, mean=274.52, max=274.52, sum=549.041 (2)", - "tab": "General information", - "score": 274.5204678362573 - }, - "World Religions - # output tokens": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" - } - } - }, - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.773, - "details": { - "tab": "Efficiency" - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_mmlu/meta/llama-3.3-70b-instruct-turbo/65423181-18f1-4296-98c2-171356106404.json b/data/helm_mmlu/meta/llama-3.3-70b-instruct-turbo/65423181-18f1-4296-98c2-171356106404.json deleted file mode 100644 index faf8ae128..000000000 --- a/data/helm_mmlu/meta/llama-3.3-70b-instruct-turbo/65423181-18f1-4296-98c2-171356106404.json +++ /dev/null @@ -1,3021 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/meta_llama-3.3-70b-instruct-turbo/1770835937.459157", - "retrieved_timestamp": "1770835937.459157", - "source_metadata": { - "source_name": "helm_mmlu", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama 3.3 Instruct Turbo 70B", - "id": "meta/llama-3.3-70b-instruct-turbo", - "developer": "meta", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "MMLU All Subjects", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU All Subjects", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.791, - "details": { - "description": "min=0.441, mean=0.791, max=0.984, sum=90.129 (114)", - "tab": "Accuracy", - "MMLU All Subjects - Observed inference time (s)": { - "description": "min=0.303, mean=0.345, max=0.559, sum=39.355 (114)", - "tab": "Efficiency", - "score": 0.34521783642237874 - }, - "MMLU All Subjects - # eval": { - "description": "min=100, mean=246.351, max=1534, sum=28084 (114)", - "tab": "General information", - "score": 246.35087719298247 - }, - "MMLU All Subjects - # train": { - "description": "min=5, mean=5, max=5, sum=570 (114)", - "tab": "General 
information", - "score": 5.0 - }, - "MMLU All Subjects - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (114)", - "tab": "General information", - "score": 0.0 - }, - "MMLU All Subjects - # prompt tokens": { - "description": "min=274.52, mean=614.619, max=2797.885, sum=70066.61 (114)", - "tab": "General information", - "score": 614.6193817308517 - }, - "MMLU All Subjects - # output tokens": { - "description": "min=1, mean=1, max=1, sum=114 (114)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - 
"mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] - } - } - }, - { - "evaluation_name": "Abstract Algebra", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Abstract Algebra", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5, - "details": { - "description": "min=0.5, mean=0.5, max=0.5, sum=1 (2)", - "tab": "Accuracy", - "Abstract Algebra - Observed inference time (s)": { - "description": "min=0.313, mean=0.313, max=0.313, sum=0.626 (2)", - "tab": "Efficiency", - "score": 0.3131356716156006 - }, - "Abstract Algebra - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Abstract Algebra - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Abstract Algebra - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Abstract Algebra - # prompt tokens": { - "description": "min=373.43, mean=373.43, max=373.43, sum=746.86 (2)", - "tab": "General information", - "score": 373.43 - }, - "Abstract Algebra - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" - } - } - }, - { - "evaluation_name": "Anatomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Anatomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.778, - "details": { - "description": "min=0.778, mean=0.778, max=0.778, sum=1.556 (2)", - "tab": "Accuracy", - "Anatomy - Observed inference time (s)": { - "description": "min=0.343, mean=0.343, max=0.343, sum=0.686 (2)", - "tab": "Efficiency", - "score": 0.3432198400850649 - }, - "Anatomy - # eval": { - "description": "min=135, mean=135, max=135, sum=270 (2)", - "tab": "General information", - "score": 135.0 - }, - "Anatomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Anatomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Anatomy - # prompt tokens": { - "description": "min=353.874, mean=353.874, max=353.874, sum=707.748 (2)", - "tab": "General information", - "score": 353.8740740740741 - }, - "Anatomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" - } - } - }, - { - "evaluation_name": "College Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on College Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.52, - "details": { - "description": "min=0.52, mean=0.52, max=0.52, sum=1.039 (2)", - "tab": "Accuracy", - "College Chemistry - Observed inference time (s)": { - "description": "min=0.359, mean=0.359, max=0.359, sum=0.717 (2)", - "tab": "Efficiency", - "score": 0.35871645450592043 - }, - "College Biology - Observed inference time (s)": { - "description": "min=0.366, mean=0.366, max=0.366, sum=0.732 (2)", - "tab": "Efficiency", - "score": 0.36611984339025283 - }, - "College Computer Science - Observed inference time (s)": { - "description": "min=0.35, mean=0.35, max=0.35, sum=0.701 (2)", - "tab": "Efficiency", - "score": 0.3503202319145203 - }, - "College Mathematics - Observed inference time (s)": { - "description": "min=0.337, mean=0.337, max=0.337, sum=0.675 (2)", - "tab": "Efficiency", - "score": 0.33748736619949343 - }, - "College Medicine - Observed inference time (s)": { - "description": "min=0.337, mean=0.337, max=0.337, sum=0.674 (2)", - "tab": "Efficiency", - "score": 0.3367649737121053 - }, - "College Physics - Observed inference time (s)": { - "description": "min=0.307, mean=0.307, max=0.307, sum=0.615 (2)", - "tab": "Efficiency", - "score": 0.30743202976152006 - }, - "College Chemistry - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Chemistry - # prompt tokens": { - "description": "min=549.28, mean=549.28, max=549.28, sum=1098.56 (2)", - "tab": "General information", - "score": 549.28 - }, - "College Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Biology - # eval": { - "description": "min=144, mean=144, max=144, sum=288 (2)", - "tab": "General information", - "score": 144.0 - }, - "College Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # prompt tokens": { - "description": "min=473.875, mean=473.875, max=473.875, sum=947.75 (2)", - "tab": "General information", - "score": 473.875 - }, - "College Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer Science - # prompt tokens": { - "description": "min=828.29, 
mean=828.29, max=828.29, sum=1656.58 (2)", - "tab": "General information", - "score": 828.29 - }, - "College Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Mathematics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # prompt tokens": { - "description": "min=594.51, mean=594.51, max=594.51, sum=1189.02 (2)", - "tab": "General information", - "score": 594.51 - }, - "College Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Medicine - # eval": { - "description": "min=173, mean=173, max=173, sum=346 (2)", - "tab": "General information", - "score": 173.0 - }, - "College Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Medicine - # prompt tokens": { - "description": "min=502.705, mean=502.705, max=502.705, sum=1005.41 (2)", - "tab": "General information", - "score": 502.70520231213874 - }, - "College Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Physics - # eval": { - "description": "min=102, mean=102, max=102, sum=204 (2)", - "tab": "General information", - "score": 102.0 - }, - "College Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Physics - # prompt tokens": { - "description": "min=503.569, mean=503.569, max=503.569, sum=1007.137 (2)", - "tab": "General information", - "score": 503.5686274509804 - }, - "College Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" - } - } - }, - { - "evaluation_name": "Computer Security", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Computer Security", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8, - "details": { - "description": "min=0.8, mean=0.8, max=0.8, sum=1.6 (2)", - "tab": "Accuracy", - "Computer Security - Observed inference time (s)": { - "description": "min=0.34, mean=0.34, max=0.34, sum=0.68 (2)", - "tab": "Efficiency", - "score": 0.33975651502609255 - }, - "Computer Security - # eval": { - "description": "min=100, mean=100, max=100, sum=200 
(2)", - "tab": "General information", - "score": 100.0 - }, - "Computer Security - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Computer Security - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Computer Security - # prompt tokens": { - "description": "min=378.51, mean=378.51, max=378.51, sum=757.02 (2)", - "tab": "General information", - "score": 378.51 - }, - "Computer Security - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" - } - } - }, - { - "evaluation_name": "Econometrics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Econometrics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.719, - "details": { - "description": "min=0.719, mean=0.719, max=0.719, sum=1.439 (2)", - "tab": "Accuracy", - "Econometrics - Observed inference time (s)": { - "description": "min=0.341, mean=0.341, max=0.341, sum=0.683 (2)", - "tab": "Efficiency", - "score": 0.34139270113225567 - }, - "Econometrics - # eval": { - "description": "min=114, mean=114, max=114, sum=228 (2)", - "tab": "General information", - "score": 114.0 - }, - "Econometrics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Econometrics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Econometrics - # prompt tokens": { - "description": "min=614.421, mean=614.421, max=614.421, sum=1228.842 (2)", - "tab": "General information", - "score": 614.421052631579 - }, - "Econometrics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" - } - } - }, - { - "evaluation_name": "Global Facts", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Global Facts", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.58, - "details": { - "description": "min=0.58, mean=0.58, max=0.58, sum=1.16 (2)", - "tab": "Accuracy", - "Global Facts - Observed inference time (s)": { - "description": "min=0.343, mean=0.343, max=0.343, sum=0.687 (2)", - "tab": "Efficiency", - "score": 0.34327178478240966 - }, - "Global Facts - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Global Facts - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, 
- "Global Facts - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Global Facts - # prompt tokens": { - "description": "min=399.71, mean=399.71, max=399.71, sum=799.42 (2)", - "tab": "General information", - "score": 399.71 - }, - "Global Facts - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" - } - } - }, - { - "evaluation_name": "Jurisprudence", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Jurisprudence", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.87, - "details": { - "description": "min=0.87, mean=0.87, max=0.87, sum=1.741 (2)", - "tab": "Accuracy", - "Jurisprudence - Observed inference time (s)": { - "description": "min=0.33, mean=0.33, max=0.33, sum=0.659 (2)", - "tab": "Efficiency", - "score": 0.32968640327453613 - }, - "Jurisprudence - # eval": { - "description": "min=108, mean=108, max=108, sum=216 (2)", - "tab": "General information", - "score": 108.0 - }, - "Jurisprudence - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Jurisprudence - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Jurisprudence - # prompt tokens": { - "description": "min=394.63, mean=394.63, max=394.63, sum=789.259 (2)", - "tab": "General information", - "score": 394.6296296296296 - }, - "Jurisprudence - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" - } - } - }, - { - "evaluation_name": "Philosophy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Philosophy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.83, - "details": { - "description": "min=0.83, mean=0.83, max=0.83, sum=1.659 (2)", - "tab": "Accuracy", - "Philosophy - Observed inference time (s)": { - "description": "min=0.321, mean=0.321, max=0.321, sum=0.642 (2)", - "tab": "Efficiency", - "score": 0.32124289515700755 - }, - "Philosophy - # eval": { - "description": "min=311, mean=311, max=311, sum=622 (2)", - "tab": "General information", - "score": 311.0 - }, - "Philosophy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Philosophy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Philosophy - # prompt tokens": { - "description": "min=329.084, mean=329.084, max=329.084, 
sum=658.167 (2)", - "tab": "General information", - "score": 329.08360128617363 - }, - "Philosophy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" - } - } - }, - { - "evaluation_name": "Professional Psychology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Professional Psychology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.845, - "details": { - "description": "min=0.845, mean=0.845, max=0.845, sum=1.69 (2)", - "tab": "Accuracy", - "Professional Medicine - Observed inference time (s)": { - "description": "min=0.367, mean=0.367, max=0.367, sum=0.733 (2)", - "tab": "Efficiency", - "score": 0.36657266932375293 - }, - "Professional Accounting - Observed inference time (s)": { - "description": "min=0.34, mean=0.34, max=0.34, sum=0.68 (2)", - "tab": "Efficiency", - "score": 0.33986637440133605 - }, - "Professional Law - Observed inference time (s)": { - "description": "min=0.386, mean=0.386, max=0.386, sum=0.772 (2)", - "tab": "Efficiency", - "score": 0.3858062526237856 - }, - "Professional Psychology - Observed inference time (s)": { - "description": "min=0.334, mean=0.334, max=0.334, sum=0.668 (2)", - "tab": "Efficiency", - "score": 0.33390796184539795 - }, - "Professional Medicine - # eval": { - "description": "min=272, mean=272, max=272, sum=544 (2)", - "tab": "General information", - "score": 272.0 - }, - "Professional Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Medicine - # prompt tokens": { - "description": "min=1094.489, mean=1094.489, max=1094.489, sum=2188.978 (2)", - "tab": "General information", - "score": 1094.4889705882354 - }, - "Professional Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Accounting - # eval": { - "description": "min=282, mean=282, max=282, sum=564 (2)", - "tab": "General information", - "score": 282.0 - }, - "Professional Accounting - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Accounting - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Accounting - # prompt tokens": { - "description": "min=658.585, mean=658.585, max=658.585, sum=1317.17 (2)", - "tab": "General information", - "score": 658.5851063829788 - }, - "Professional Accounting - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Law - # eval": { - "description": "min=1534, mean=1534, max=1534, sum=3068 (2)", - "tab": "General information", - "score": 1534.0 - }, - "Professional Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 
(2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # prompt tokens": { - "description": "min=1637.601, mean=1637.601, max=1637.601, sum=3275.202 (2)", - "tab": "General information", - "score": 1637.6010430247718 - }, - "Professional Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Psychology - # eval": { - "description": "min=612, mean=612, max=612, sum=1224 (2)", - "tab": "General information", - "score": 612.0 - }, - "Professional Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # prompt tokens": { - "description": "min=575.098, mean=575.098, max=575.098, sum=1150.196 (2)", - "tab": "General information", - "score": 575.0980392156863 - }, - "Professional Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" - } - } - }, - { - "evaluation_name": "Us Foreign Policy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Us Foreign Policy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.93, - "details": { - "description": "min=0.93, mean=0.93, max=0.93, sum=1.86 (2)", - "tab": "Accuracy", - "Us Foreign Policy - Observed inference time (s)": { - "description": "min=0.342, mean=0.342, max=0.342, sum=0.683 (2)", - "tab": "Efficiency", - "score": 0.34171419143676757 - }, - "Us Foreign Policy - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Us Foreign Policy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Us Foreign Policy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Us Foreign Policy - # prompt tokens": { - "description": "min=422.79, mean=422.79, max=422.79, sum=845.58 (2)", - "tab": "General information", - "score": 422.79 - }, - "Us Foreign Policy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" - } - } - }, - { - "evaluation_name": "Astronomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on 
Astronomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.888, - "details": { - "description": "min=0.888, mean=0.888, max=0.888, sum=1.776 (2)", - "tab": "Accuracy", - "Astronomy - Observed inference time (s)": { - "description": "min=0.329, mean=0.329, max=0.329, sum=0.657 (2)", - "tab": "Efficiency", - "score": 0.3287427550867984 - }, - "Astronomy - # eval": { - "description": "min=152, mean=152, max=152, sum=304 (2)", - "tab": "General information", - "score": 152.0 - }, - "Astronomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Astronomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Astronomy - # prompt tokens": { - "description": "min=579.684, mean=579.684, max=579.684, sum=1159.368 (2)", - "tab": "General information", - "score": 579.6842105263158 - }, - "Astronomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" - } - } - }, - { - "evaluation_name": "Business Ethics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Business Ethics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8, - "details": { - "description": "min=0.8, mean=0.8, max=0.8, sum=1.6 (2)", - "tab": "Accuracy", - "Business Ethics - Observed inference time (s)": { - "description": "min=0.327, mean=0.327, max=0.327, sum=0.654 (2)", - "tab": "Efficiency", - "score": 0.327047655582428 - }, - "Business Ethics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Business Ethics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Business Ethics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Business Ethics - # prompt tokens": { - "description": "min=569.52, mean=569.52, max=569.52, sum=1139.04 (2)", - "tab": "General information", - "score": 569.52 - }, - "Business Ethics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" - } - } - }, - { - "evaluation_name": "Clinical Knowledge", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Clinical Knowledge", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.83, - "details": { - "description": "min=0.83, mean=0.83, 
max=0.83, sum=1.66 (2)", - "tab": "Accuracy", - "Clinical Knowledge - Observed inference time (s)": { - "description": "min=0.344, mean=0.344, max=0.344, sum=0.687 (2)", - "tab": "Efficiency", - "score": 0.3435286764828664 - }, - "Clinical Knowledge - # eval": { - "description": "min=265, mean=265, max=265, sum=530 (2)", - "tab": "General information", - "score": 265.0 - }, - "Clinical Knowledge - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Clinical Knowledge - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Clinical Knowledge - # prompt tokens": { - "description": "min=397.928, mean=397.928, max=397.928, sum=795.857 (2)", - "tab": "General information", - "score": 397.92830188679244 - }, - "Clinical Knowledge - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" - } - } - }, - { - "evaluation_name": "Conceptual Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Conceptual Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.821, - "details": { - "description": "min=0.821, mean=0.821, max=0.821, sum=1.643 (2)", - "tab": "Accuracy", - "Conceptual Physics - Observed inference time (s)": { - "description": "min=0.333, mean=0.333, max=0.333, sum=0.667 (2)", - "tab": "Efficiency", - "score": 0.33338003361478763 - }, - "Conceptual Physics - # eval": { - "description": "min=235, mean=235, max=235, sum=470 (2)", - "tab": "General information", - "score": 235.0 - }, - "Conceptual Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Conceptual Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Conceptual Physics - # prompt tokens": { - "description": "min=304.834, mean=304.834, max=304.834, sum=609.668 (2)", - "tab": "General information", - "score": 304.83404255319147 - }, - "Conceptual Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" - } - } - }, - { - "evaluation_name": "Electrical Engineering", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Electrical Engineering", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.745, - "details": { - "description": "min=0.745, mean=0.745, max=0.745, sum=1.49 (2)", - "tab": "Accuracy", - "Electrical Engineering - 
Observed inference time (s)": { - "description": "min=0.354, mean=0.354, max=0.354, sum=0.709 (2)", - "tab": "Efficiency", - "score": 0.35425889245395004 - }, - "Electrical Engineering - # eval": { - "description": "min=145, mean=145, max=145, sum=290 (2)", - "tab": "General information", - "score": 145.0 - }, - "Electrical Engineering - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Electrical Engineering - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Electrical Engineering - # prompt tokens": { - "description": "min=435.607, mean=435.607, max=435.607, sum=871.214 (2)", - "tab": "General information", - "score": 435.60689655172416 - }, - "Electrical Engineering - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" - } - } - }, - { - "evaluation_name": "Elementary Mathematics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Elementary Mathematics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.672, - "details": { - "description": "min=0.672, mean=0.672, max=0.672, sum=1.344 (2)", - "tab": "Accuracy", - "Elementary Mathematics - Observed inference time (s)": { - "description": "min=0.334, mean=0.334, max=0.334, sum=0.669 (2)", - "tab": "Efficiency", - "score": 0.33447367299801456 - }, - "Elementary Mathematics - # eval": { - "description": "min=378, mean=378, max=378, sum=756 (2)", - "tab": "General information", - "score": 378.0 - }, - "Elementary Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Elementary Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Elementary Mathematics - # prompt tokens": { - "description": "min=531.854, mean=531.854, max=531.854, sum=1063.709 (2)", - "tab": "General information", - "score": 531.8544973544973 - }, - "Elementary Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" - } - } - }, - { - "evaluation_name": "Formal Logic", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Formal Logic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.675, - "details": { - "description": "min=0.675, mean=0.675, max=0.675, sum=1.349 (2)", - "tab": "Accuracy", - "Formal Logic - Observed inference time (s)": { 
- "description": "min=0.35, mean=0.35, max=0.35, sum=0.7 (2)", - "tab": "Efficiency", - "score": 0.349764451148018 - }, - "Formal Logic - # eval": { - "description": "min=126, mean=126, max=126, sum=252 (2)", - "tab": "General information", - "score": 126.0 - }, - "Formal Logic - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Formal Logic - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Formal Logic - # prompt tokens": { - "description": "min=601.778, mean=601.778, max=601.778, sum=1203.556 (2)", - "tab": "General information", - "score": 601.7777777777778 - }, - "Formal Logic - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" - } - } - }, - { - "evaluation_name": "High School World History", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on High School World History", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.907, - "details": { - "description": "min=0.907, mean=0.907, max=0.907, sum=1.814 (2)", - "tab": "Accuracy", - "High School Biology - Observed inference time (s)": { - "description": "min=0.348, mean=0.348, max=0.348, sum=0.697 (2)", - "tab": "Efficiency", - "score": 0.34841231069257184 - }, - "High School Chemistry - Observed inference time (s)": { - "description": "min=0.325, mean=0.325, max=0.325, sum=0.65 (2)", - "tab": "Efficiency", - "score": 0.3249026636771968 - }, - "High School Computer Science - Observed inference time (s)": { - "description": "min=0.376, mean=0.376, max=0.376, sum=0.752 (2)", - "tab": "Efficiency", - "score": 0.3761155128479004 - }, - "High School European History - Observed inference time (s)": { - "description": "min=0.559, mean=0.559, max=0.559, sum=1.118 (2)", - "tab": "Efficiency", - "score": 0.558924115787853 - }, - "High School Geography - Observed inference time (s)": { - "description": "min=0.303, mean=0.303, max=0.303, sum=0.606 (2)", - "tab": "Efficiency", - "score": 0.30311920907762313 - }, - "High School Government And Politics - Observed inference time (s)": { - "description": "min=0.319, mean=0.319, max=0.319, sum=0.639 (2)", - "tab": "Efficiency", - "score": 0.3192925144353679 - }, - "High School Macroeconomics - Observed inference time (s)": { - "description": "min=0.321, mean=0.321, max=0.321, sum=0.643 (2)", - "tab": "Efficiency", - "score": 0.3212899880531507 - }, - "High School Mathematics - Observed inference time (s)": { - "description": "min=0.331, mean=0.331, max=0.331, sum=0.661 (2)", - "tab": "Efficiency", - "score": 0.3307388570573595 - }, - "High School Microeconomics - Observed inference time (s)": { - "description": "min=0.332, mean=0.332, max=0.332, sum=0.663 (2)", - "tab": "Efficiency", - "score": 0.3317271210566288 - }, - "High School Physics - Observed inference time (s)": { - "description": "min=0.34, mean=0.34, max=0.34, sum=0.68 (2)", - "tab": "Efficiency", - "score": 0.34023177229016033 - }, - "High School Psychology 
- Observed inference time (s)": { - "description": "min=0.327, mean=0.327, max=0.327, sum=0.655 (2)", - "tab": "Efficiency", - "score": 0.3273837903224 - }, - "High School Statistics - Observed inference time (s)": { - "description": "min=0.359, mean=0.359, max=0.359, sum=0.718 (2)", - "tab": "Efficiency", - "score": 0.359178250586545 - }, - "High School US History - Observed inference time (s)": { - "description": "min=0.444, mean=0.444, max=0.444, sum=0.887 (2)", - "tab": "Efficiency", - "score": 0.443670579031402 - }, - "High School World History - Observed inference time (s)": { - "description": "min=0.382, mean=0.382, max=0.382, sum=0.764 (2)", - "tab": "Efficiency", - "score": 0.3818797411294929 - }, - "High School Biology - # eval": { - "description": "min=310, mean=310, max=310, sum=620 (2)", - "tab": "General information", - "score": 310.0 - }, - "High School Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Biology - # prompt tokens": { - "description": "min=513.671, mean=513.671, max=513.671, sum=1027.342 (2)", - "tab": "General information", - "score": 513.6709677419354 - }, - "High School Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Chemistry - # eval": { - "description": "min=203, mean=203, max=203, sum=406 (2)", - "tab": "General information", - "score": 203.0 - }, - "High School Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # prompt tokens": { - "description": "min=496.704, mean=496.704, max=496.704, sum=993.409 (2)", - "tab": "General information", - "score": 496.70443349753697 - }, - "High School Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "High School Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # prompt tokens": { - "description": "min=867.78, mean=867.78, max=867.78, sum=1735.56 (2)", - "tab": "General information", - "score": 867.78 - }, - "High School Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School European History - # eval": { - "description": "min=165, mean=165, max=165, sum=330 (2)", - "tab": "General information", - "score": 165.0 - }, - "High School European History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School European History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School 
European History - # prompt tokens": { - "description": "min=2797.885, mean=2797.885, max=2797.885, sum=5595.77 (2)", - "tab": "General information", - "score": 2797.8848484848486 - }, - "High School European History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Geography - # eval": { - "description": "min=198, mean=198, max=198, sum=396 (2)", - "tab": "General information", - "score": 198.0 - }, - "High School Geography - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Geography - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Geography - # prompt tokens": { - "description": "min=372.035, mean=372.035, max=372.035, sum=744.071 (2)", - "tab": "General information", - "score": 372.0353535353535 - }, - "High School Geography - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Government And Politics - # eval": { - "description": "min=193, mean=193, max=193, sum=386 (2)", - "tab": "General information", - "score": 193.0 - }, - "High School Government And Politics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Government And Politics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # prompt tokens": { - "description": "min=465.824, mean=465.824, max=465.824, sum=931.648 (2)", - "tab": "General information", - "score": 465.8238341968912 - }, - "High School Government And Politics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Macroeconomics - # eval": { - "description": "min=390, mean=390, max=390, sum=780 (2)", - "tab": "General information", - "score": 390.0 - }, - "High School Macroeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Macroeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - # prompt tokens": { - "description": "min=370.908, mean=370.908, max=370.908, sum=741.815 (2)", - "tab": "General information", - "score": 370.9076923076923 - }, - "High School Macroeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Mathematics - # eval": { - "description": "min=270, mean=270, max=270, sum=540 (2)", - "tab": "General information", - "score": 270.0 - }, - "High School Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # prompt tokens": { - "description": "min=532.356, mean=532.356, max=532.356, sum=1064.711 (2)", - "tab": "General information", - "score": 532.3555555555556 - }, - "High School Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - 
"score": 1.0 - }, - "High School Microeconomics - # eval": { - "description": "min=238, mean=238, max=238, sum=476 (2)", - "tab": "General information", - "score": 238.0 - }, - "High School Microeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Microeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Microeconomics - # prompt tokens": { - "description": "min=399.013, mean=399.013, max=399.013, sum=798.025 (2)", - "tab": "General information", - "score": 399.0126050420168 - }, - "High School Microeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Physics - # eval": { - "description": "min=151, mean=151, max=151, sum=302 (2)", - "tab": "General information", - "score": 151.0 - }, - "High School Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # prompt tokens": { - "description": "min=560.457, mean=560.457, max=560.457, sum=1120.914 (2)", - "tab": "General information", - "score": 560.4569536423841 - }, - "High School Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Psychology - # eval": { - "description": "min=545, mean=545, max=545, sum=1090 (2)", - "tab": "General information", - "score": 545.0 - }, - "High School Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # prompt tokens": { - "description": "min=495.242, mean=495.242, max=495.242, sum=990.484 (2)", - "tab": "General information", - "score": 495.2422018348624 - }, - "High School Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Statistics - # eval": { - "description": "min=216, mean=216, max=216, sum=432 (2)", - "tab": "General information", - "score": 216.0 - }, - "High School Statistics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Statistics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Statistics - # prompt tokens": { - "description": "min=795.639, mean=795.639, max=795.639, sum=1591.278 (2)", - "tab": "General information", - "score": 795.6388888888889 - }, - "High School Statistics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School US History - # eval": { - "description": "min=204, mean=204, max=204, sum=408 (2)", - "tab": "General information", - "score": 204.0 - }, - "High School US History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School US History - truncated": { - "description": "min=0, mean=0, max=0, 
sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # prompt tokens": { - "description": "min=2217.809, mean=2217.809, max=2217.809, sum=4435.618 (2)", - "tab": "General information", - "score": 2217.8088235294117 - }, - "High School US History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School World History - # eval": { - "description": "min=237, mean=237, max=237, sum=474 (2)", - "tab": "General information", - "score": 237.0 - }, - "High School World History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School World History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School World History - # prompt tokens": { - "description": "min=1428.173, mean=1428.173, max=1428.173, sum=2856.346 (2)", - "tab": "General information", - "score": 1428.1729957805908 - }, - "High School World History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" - } - } - }, - { - "evaluation_name": "Human Sexuality", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Human Sexuality", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.855, - "details": { - "description": "min=0.855, mean=0.855, max=0.855, sum=1.71 (2)", - "tab": "Accuracy", - "Human Aging - Observed inference time (s)": { - "description": "min=0.345, mean=0.345, max=0.345, sum=0.691 (2)", - "tab": "Efficiency", - "score": 0.3452627787140987 - }, - "Human Sexuality - Observed inference time (s)": { - "description": "min=0.346, mean=0.346, max=0.346, sum=0.692 (2)", - "tab": "Efficiency", - "score": 0.34599654183132955 - }, - "Human Aging - # eval": { - "description": "min=223, mean=223, max=223, sum=446 (2)", - "tab": "General information", - "score": 223.0 - }, - "Human Aging - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Aging - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Aging - # prompt tokens": { - "description": "min=319.888, mean=319.888, max=319.888, sum=639.776 (2)", - "tab": "General information", - "score": 319.88789237668163 - }, - "Human Aging - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Human Sexuality - # eval": { - "description": "min=131, mean=131, max=131, sum=262 (2)", - "tab": "General information", - "score": 131.0 - }, - "Human Sexuality - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Sexuality - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # prompt 
tokens": { - "description": "min=341.168, mean=341.168, max=341.168, sum=682.336 (2)", - "tab": "General information", - "score": 341.1679389312977 - }, - "Human Sexuality - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" - } - } - }, - { - "evaluation_name": "International Law", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on International Law", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.884, - "details": { - "description": "min=0.884, mean=0.884, max=0.884, sum=1.769 (2)", - "tab": "Accuracy", - "International Law - Observed inference time (s)": { - "description": "min=0.37, mean=0.37, max=0.37, sum=0.741 (2)", - "tab": "Efficiency", - "score": 0.3704575231252623 - }, - "International Law - # eval": { - "description": "min=121, mean=121, max=121, sum=242 (2)", - "tab": "General information", - "score": 121.0 - }, - "International Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "International Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "International Law - # prompt tokens": { - "description": "min=639.818, mean=639.818, max=639.818, sum=1279.636 (2)", - "tab": "General information", - "score": 639.8181818181819 - }, - "International Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" - } - } - }, - { - "evaluation_name": "Logical Fallacies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Logical Fallacies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.816, - "details": { - "description": "min=0.816, mean=0.816, max=0.816, sum=1.632 (2)", - "tab": "Accuracy", - "Logical Fallacies - Observed inference time (s)": { - "description": "min=0.307, mean=0.307, max=0.307, sum=0.613 (2)", - "tab": "Efficiency", - "score": 0.30655721506458117 - }, - "Logical Fallacies - # eval": { - "description": "min=163, mean=163, max=163, sum=326 (2)", - "tab": "General information", - "score": 163.0 - }, - "Logical Fallacies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Logical Fallacies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Logical Fallacies - # prompt tokens": { - "description": "min=449.564, mean=449.564, max=449.564, sum=899.129 (2)", - "tab": "General 
information", - "score": 449.5644171779141 - }, - "Logical Fallacies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" - } - } - }, - { - "evaluation_name": "Machine Learning", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Machine Learning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.714, - "details": { - "description": "min=0.714, mean=0.714, max=0.714, sum=1.429 (2)", - "tab": "Accuracy", - "Machine Learning - Observed inference time (s)": { - "description": "min=0.375, mean=0.375, max=0.375, sum=0.75 (2)", - "tab": "Efficiency", - "score": 0.3751111796924046 - }, - "Machine Learning - # eval": { - "description": "min=112, mean=112, max=112, sum=224 (2)", - "tab": "General information", - "score": 112.0 - }, - "Machine Learning - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Machine Learning - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Machine Learning - # prompt tokens": { - "description": "min=668.054, mean=668.054, max=668.054, sum=1336.107 (2)", - "tab": "General information", - "score": 668.0535714285714 - }, - "Machine Learning - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" - } - } - }, - { - "evaluation_name": "Management", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Management", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.903, - "details": { - "description": "min=0.903, mean=0.903, max=0.903, sum=1.806 (2)", - "tab": "Accuracy", - "Management - Observed inference time (s)": { - "description": "min=0.337, mean=0.337, max=0.337, sum=0.674 (2)", - "tab": "Efficiency", - "score": 0.3368335811837206 - }, - "Management - # eval": { - "description": "min=103, mean=103, max=103, sum=206 (2)", - "tab": "General information", - "score": 103.0 - }, - "Management - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Management - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Management - # prompt tokens": { - "description": "min=283.786, mean=283.786, max=283.786, sum=567.573 (2)", - "tab": "General information", - "score": 283.7864077669903 - }, - "Management - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General 
information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" - } - } - }, - { - "evaluation_name": "Marketing", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Marketing", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.927, - "details": { - "description": "min=0.927, mean=0.927, max=0.927, sum=1.855 (2)", - "tab": "Accuracy", - "Marketing - Observed inference time (s)": { - "description": "min=0.32, mean=0.32, max=0.32, sum=0.64 (2)", - "tab": "Efficiency", - "score": 0.320215484015962 - }, - "Marketing - # eval": { - "description": "min=234, mean=234, max=234, sum=468 (2)", - "tab": "General information", - "score": 234.0 - }, - "Marketing - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Marketing - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Marketing - # prompt tokens": { - "description": "min=404.218, mean=404.218, max=404.218, sum=808.436 (2)", - "tab": "General information", - "score": 404.21794871794873 - }, - "Marketing - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" - } - } - }, - { - "evaluation_name": "Medical Genetics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Medical Genetics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9, - "details": { - "description": "min=0.9, mean=0.9, max=0.9, sum=1.8 (2)", - "tab": "Accuracy", - "Medical Genetics - Observed inference time (s)": { - "description": "min=0.327, mean=0.327, max=0.327, sum=0.654 (2)", - "tab": "Efficiency", - "score": 0.3268785071372986 - }, - "Medical Genetics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Medical Genetics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Medical Genetics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Medical Genetics - # prompt tokens": { - "description": "min=340.99, mean=340.99, max=340.99, sum=681.98 (2)", - "tab": "General information", - "score": 340.99 - }, - "Medical Genetics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" - } - } - }, - 
{ - "evaluation_name": "Miscellaneous", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Miscellaneous", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.914, - "details": { - "description": "min=0.914, mean=0.914, max=0.914, sum=1.829 (2)", - "tab": "Accuracy", - "Miscellaneous - Observed inference time (s)": { - "description": "min=0.321, mean=0.321, max=0.321, sum=0.641 (2)", - "tab": "Efficiency", - "score": 0.32054392161801704 - }, - "Miscellaneous - # eval": { - "description": "min=783, mean=783, max=783, sum=1566 (2)", - "tab": "General information", - "score": 783.0 - }, - "Miscellaneous - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Miscellaneous - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Miscellaneous - # prompt tokens": { - "description": "min=299.911, mean=299.911, max=299.911, sum=599.821 (2)", - "tab": "General information", - "score": 299.9106002554278 - }, - "Miscellaneous - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" - } - } - }, - { - "evaluation_name": "Moral Scenarios", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Moral Scenarios", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.698, - "details": { - "description": "min=0.698, mean=0.698, max=0.698, sum=1.397 (2)", - "tab": "Accuracy", - "Moral Disputes - Observed inference time (s)": { - "description": "min=0.322, mean=0.322, max=0.322, sum=0.644 (2)", - "tab": "Efficiency", - "score": 0.321929149544997 - }, - "Moral Scenarios - Observed inference time (s)": { - "description": "min=0.351, mean=0.351, max=0.351, sum=0.702 (2)", - "tab": "Efficiency", - "score": 0.3511003518237748 - }, - "Moral Disputes - # eval": { - "description": "min=346, mean=346, max=346, sum=692 (2)", - "tab": "General information", - "score": 346.0 - }, - "Moral Disputes - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Disputes - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Disputes - # prompt tokens": { - "description": "min=476.113, mean=476.113, max=476.113, sum=952.225 (2)", - "tab": "General information", - "score": 476.1127167630058 - }, - "Moral Disputes - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Moral Scenarios - # eval": { - "description": "min=895, mean=895, max=895, sum=1790 (2)", - "tab": "General information", - "score": 895.0 - }, - "Moral Scenarios - # train": { - 
"description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Scenarios - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # prompt tokens": { - "description": "min=656.455, mean=656.455, max=656.455, sum=1312.909 (2)", - "tab": "General information", - "score": 656.454748603352 - }, - "Moral Scenarios - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" - } - } - }, - { - "evaluation_name": "Nutrition", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Nutrition", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.882, - "details": { - "description": "min=0.882, mean=0.882, max=0.882, sum=1.765 (2)", - "tab": "Accuracy", - "Nutrition - Observed inference time (s)": { - "description": "min=0.356, mean=0.356, max=0.356, sum=0.711 (2)", - "tab": "Efficiency", - "score": 0.35563821730270884 - }, - "Nutrition - # eval": { - "description": "min=306, mean=306, max=306, sum=612 (2)", - "tab": "General information", - "score": 306.0 - }, - "Nutrition - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Nutrition - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Nutrition - # prompt tokens": { - "description": "min=586.814, mean=586.814, max=586.814, sum=1173.627 (2)", - "tab": "General information", - "score": 586.8137254901961 - }, - "Nutrition - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" - } - } - }, - { - "evaluation_name": "Prehistory", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Prehistory", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.895, - "details": { - "description": "min=0.895, mean=0.895, max=0.895, sum=1.79 (2)", - "tab": "Accuracy", - "Prehistory - Observed inference time (s)": { - "description": "min=0.343, mean=0.343, max=0.343, sum=0.685 (2)", - "tab": "Efficiency", - "score": 0.34269326410175843 - }, - "Prehistory - # eval": { - "description": "min=324, mean=324, max=324, sum=648 (2)", - "tab": "General information", - "score": 324.0 - }, - "Prehistory - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Prehistory - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 
0.0 - }, - "Prehistory - # prompt tokens": { - "description": "min=514.528, mean=514.528, max=514.528, sum=1029.056 (2)", - "tab": "General information", - "score": 514.5277777777778 - }, - "Prehistory - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" - } - } - }, - { - "evaluation_name": "Public Relations", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Public Relations", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.727, - "details": { - "description": "min=0.727, mean=0.727, max=0.727, sum=1.455 (2)", - "tab": "Accuracy", - "Public Relations - Observed inference time (s)": { - "description": "min=0.345, mean=0.345, max=0.345, sum=0.69 (2)", - "tab": "Efficiency", - "score": 0.34484653039412066 - }, - "Public Relations - # eval": { - "description": "min=110, mean=110, max=110, sum=220 (2)", - "tab": "General information", - "score": 110.0 - }, - "Public Relations - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Public Relations - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Public Relations - # prompt tokens": { - "description": "min=405.318, mean=405.318, max=405.318, sum=810.636 (2)", - "tab": "General information", - "score": 405.3181818181818 - }, - "Public Relations - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" - } - } - }, - { - "evaluation_name": "Security Studies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Security Studies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.845, - "details": { - "description": "min=0.845, mean=0.845, max=0.845, sum=1.69 (2)", - "tab": "Accuracy", - "Security Studies - Observed inference time (s)": { - "description": "min=0.369, mean=0.369, max=0.369, sum=0.737 (2)", - "tab": "Efficiency", - "score": 0.3686914687253991 - }, - "Security Studies - # eval": { - "description": "min=245, mean=245, max=245, sum=490 (2)", - "tab": "General information", - "score": 245.0 - }, - "Security Studies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Security Studies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Security Studies - # prompt tokens": { - "description": "min=1164.473, mean=1164.473, max=1164.473, sum=2328.947 (2)", - "tab": "General 
information", - "score": 1164.4734693877551 - }, - "Security Studies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" - } - } - }, - { - "evaluation_name": "Sociology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Sociology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.92, - "details": { - "description": "min=0.92, mean=0.92, max=0.92, sum=1.841 (2)", - "tab": "Accuracy", - "Sociology - Observed inference time (s)": { - "description": "min=0.324, mean=0.324, max=0.324, sum=0.647 (2)", - "tab": "Efficiency", - "score": 0.3236708546159279 - }, - "Sociology - # eval": { - "description": "min=201, mean=201, max=201, sum=402 (2)", - "tab": "General information", - "score": 201.0 - }, - "Sociology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Sociology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Sociology - # prompt tokens": { - "description": "min=445.517, mean=445.517, max=445.517, sum=891.035 (2)", - "tab": "General information", - "score": 445.51741293532336 - }, - "Sociology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" - } - } - }, - { - "evaluation_name": "Virology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Virology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.566, - "details": { - "description": "min=0.566, mean=0.566, max=0.566, sum=1.133 (2)", - "tab": "Accuracy", - "Virology - Observed inference time (s)": { - "description": "min=0.324, mean=0.324, max=0.324, sum=0.647 (2)", - "tab": "Efficiency", - "score": 0.3235311522541276 - }, - "Virology - # eval": { - "description": "min=166, mean=166, max=166, sum=332 (2)", - "tab": "General information", - "score": 166.0 - }, - "Virology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Virology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Virology - # prompt tokens": { - "description": "min=343.018, mean=343.018, max=343.018, sum=686.036 (2)", - "tab": "General information", - "score": 343.01807228915663 - }, - "Virology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - 
"subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" - } - } - }, - { - "evaluation_name": "World Religions", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on World Religions", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.883, - "details": { - "description": "min=0.883, mean=0.883, max=0.883, sum=1.766 (2)", - "tab": "Accuracy", - "World Religions - Observed inference time (s)": { - "description": "min=0.303, mean=0.303, max=0.303, sum=0.606 (2)", - "tab": "Efficiency", - "score": 0.30298223132975616 - }, - "World Religions - # eval": { - "description": "min=171, mean=171, max=171, sum=342 (2)", - "tab": "General information", - "score": 171.0 - }, - "World Religions - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "World Religions - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "World Religions - # prompt tokens": { - "description": "min=274.52, mean=274.52, max=274.52, sum=549.041 (2)", - "tab": "General information", - "score": 274.5204678362573 - }, - "World Religions - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" - } - } - }, - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.722, - "details": { - "tab": "Efficiency" - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_mmlu/microsoft/phi-2/41c3f46d-c798-422c-8b6a-b176ffa8e8ae.json b/data/helm_mmlu/microsoft/phi-2/41c3f46d-c798-422c-8b6a-b176ffa8e8ae.json deleted file mode 100644 index 95bd9f1b8..000000000 --- a/data/helm_mmlu/microsoft/phi-2/41c3f46d-c798-422c-8b6a-b176ffa8e8ae.json +++ /dev/null @@ -1,3021 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/microsoft_phi-2/1770835937.459157", - "retrieved_timestamp": "1770835937.459157", - "source_metadata": { - "source_name": "helm_mmlu", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Phi-2", - "id": "microsoft/phi-2", - "developer": "microsoft", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "MMLU All Subjects", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": 
{ - "evaluation_description": "EM on MMLU All Subjects", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.584, - "details": { - "description": "min=0.231, mean=0.584, max=0.833, sum=66.604 (114)", - "tab": "Accuracy", - "MMLU All Subjects - Observed inference time (s)": { - "description": "min=0.267, mean=0.309, max=0.409, sum=35.222 (114)", - "tab": "Efficiency", - "score": 0.3089648339000309 - }, - "MMLU All Subjects - # eval": { - "description": "min=100, mean=246.351, max=1534, sum=28084 (114)", - "tab": "General information", - "score": 246.35087719298247 - }, - "MMLU All Subjects - # train": { - "description": "min=2.945, mean=4.946, max=5, sum=563.886 (114)", - "tab": "General information", - "score": 4.946365736553069 - }, - "MMLU All Subjects - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (114)", - "tab": "General information", - "score": 0.0 - }, - "MMLU All Subjects - # prompt tokens": { - "description": "min=277.404, mean=600.9, max=1826.103, sum=68502.623 (114)", - "tab": "General information", - "score": 600.9002028338741 - }, - "MMLU All Subjects - # output tokens": { - "description": "min=1, mean=1, max=1, sum=114 (114)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - 
"mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] - } - } - }, - { - "evaluation_name": "Abstract Algebra", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Abstract Algebra", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.31, - "details": { - "description": "min=0.31, mean=0.31, max=0.31, sum=0.62 (2)", - "tab": "Accuracy", - "Abstract Algebra - Observed inference time (s)": { - "description": "min=0.293, mean=0.293, max=0.293, sum=0.585 (2)", - "tab": "Efficiency", - "score": 0.2925554180145264 - }, - "Abstract Algebra - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Abstract Algebra - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Abstract Algebra - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Abstract Algebra - # prompt tokens": { - "description": "min=371.38, mean=371.38, max=371.38, sum=742.76 (2)", - "tab": "General information", - "score": 371.38 - }, - "Abstract Algebra - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" - } - } - }, - { - "evaluation_name": "Anatomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Anatomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.437, - "details": { - "description": "min=0.437, mean=0.437, max=0.437, sum=0.874 (2)", - "tab": "Accuracy", - "Anatomy - Observed inference time (s)": { - "description": "min=0.338, mean=0.338, max=0.338, sum=0.675 (2)", - "tab": "Efficiency", - "score": 0.3375302138151946 - }, - "Anatomy - # eval": { - "description": "min=135, mean=135, max=135, sum=270 (2)", - "tab": "General information", - "score": 135.0 - }, - "Anatomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", 
- "score": 5.0 - }, - "Anatomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Anatomy - # prompt tokens": { - "description": "min=372.081, mean=372.081, max=372.081, sum=744.163 (2)", - "tab": "General information", - "score": 372.0814814814815 - }, - "Anatomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" - } - } - }, - { - "evaluation_name": "College Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on College Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.382, - "details": { - "description": "min=0.382, mean=0.382, max=0.382, sum=0.765 (2)", - "tab": "Accuracy", - "College Chemistry - Observed inference time (s)": { - "description": "min=0.27, mean=0.27, max=0.27, sum=0.539 (2)", - "tab": "Efficiency", - "score": 0.2696530842781067 - }, - "College Biology - Observed inference time (s)": { - "description": "min=0.302, mean=0.302, max=0.302, sum=0.604 (2)", - "tab": "Efficiency", - "score": 0.3021910654173957 - }, - "College Computer Science - Observed inference time (s)": { - "description": "min=0.349, mean=0.349, max=0.349, sum=0.697 (2)", - "tab": "Efficiency", - "score": 0.34874132156372073 - }, - "College Mathematics - Observed inference time (s)": { - "description": "min=0.319, mean=0.319, max=0.319, sum=0.638 (2)", - "tab": "Efficiency", - "score": 0.3188008284568787 - }, - "College Medicine - Observed inference time (s)": { - "description": "min=0.304, mean=0.304, max=0.304, sum=0.607 (2)", - "tab": "Efficiency", - "score": 0.30374339412402557 - }, - "College Physics - Observed inference time (s)": { - "description": "min=0.32, mean=0.32, max=0.32, sum=0.64 (2)", - "tab": "Efficiency", - "score": 0.31993647182688995 - }, - "College Chemistry - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Chemistry - # prompt tokens": { - "description": "min=545.4, mean=545.4, max=545.4, sum=1090.8 (2)", - "tab": "General information", - "score": 545.4 - }, - "College Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Biology - # eval": { - "description": "min=144, mean=144, max=144, sum=288 (2)", - "tab": "General information", - "score": 144.0 - }, - "College Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # prompt tokens": { - "description": "min=482.278, 
mean=482.278, max=482.278, sum=964.556 (2)", - "tab": "General information", - "score": 482.27777777777777 - }, - "College Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer Science - # prompt tokens": { - "description": "min=852.03, mean=852.03, max=852.03, sum=1704.06 (2)", - "tab": "General information", - "score": 852.03 - }, - "College Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Mathematics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # prompt tokens": { - "description": "min=611.54, mean=611.54, max=611.54, sum=1223.08 (2)", - "tab": "General information", - "score": 611.54 - }, - "College Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Medicine - # eval": { - "description": "min=173, mean=173, max=173, sum=346 (2)", - "tab": "General information", - "score": 173.0 - }, - "College Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Medicine - # prompt tokens": { - "description": "min=530.301, mean=530.301, max=530.301, sum=1060.601 (2)", - "tab": "General information", - "score": 530.3005780346821 - }, - "College Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Physics - # eval": { - "description": "min=102, mean=102, max=102, sum=204 (2)", - "tab": "General information", - "score": 102.0 - }, - "College Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Physics - # prompt tokens": { - "description": "min=489.324, mean=489.324, max=489.324, sum=978.647 (2)", - "tab": "General information", - "score": 489.3235294117647 - }, - "College Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" - } - } - }, - { - "evaluation_name": "Computer Security", - 
"source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Computer Security", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.73, - "details": { - "description": "min=0.73, mean=0.73, max=0.73, sum=1.46 (2)", - "tab": "Accuracy", - "Computer Security - Observed inference time (s)": { - "description": "min=0.277, mean=0.277, max=0.277, sum=0.554 (2)", - "tab": "Efficiency", - "score": 0.2771985101699829 - }, - "Computer Security - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Computer Security - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Computer Security - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Computer Security - # prompt tokens": { - "description": "min=387.4, mean=387.4, max=387.4, sum=774.8 (2)", - "tab": "General information", - "score": 387.4 - }, - "Computer Security - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" - } - } - }, - { - "evaluation_name": "Econometrics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Econometrics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.342, - "details": { - "description": "min=0.342, mean=0.342, max=0.342, sum=0.684 (2)", - "tab": "Accuracy", - "Econometrics - Observed inference time (s)": { - "description": "min=0.295, mean=0.295, max=0.295, sum=0.589 (2)", - "tab": "Efficiency", - "score": 0.294714699711716 - }, - "Econometrics - # eval": { - "description": "min=114, mean=114, max=114, sum=228 (2)", - "tab": "General information", - "score": 114.0 - }, - "Econometrics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Econometrics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Econometrics - # prompt tokens": { - "description": "min=624.07, mean=624.07, max=624.07, sum=1248.14 (2)", - "tab": "General information", - "score": 624.0701754385965 - }, - "Econometrics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" - } - } - }, - { - "evaluation_name": "Global Facts", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Global Facts", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.35, - "details": { - "description": "min=0.35, mean=0.35, max=0.35, sum=0.7 (2)", - "tab": "Accuracy", - "Global Facts - Observed inference time (s)": { - "description": "min=0.315, mean=0.315, max=0.315, sum=0.631 (2)", - "tab": "Efficiency", - "score": 0.3154014134407043 - }, - "Global Facts - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Global Facts - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Global Facts - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Global Facts - # prompt tokens": { - "description": "min=398.42, mean=398.42, max=398.42, sum=796.84 (2)", - "tab": "General information", - "score": 398.42 - }, - "Global Facts - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" - } - } - }, - { - "evaluation_name": "Jurisprudence", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Jurisprudence", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.694, - "details": { - "description": "min=0.694, mean=0.694, max=0.694, sum=1.389 (2)", - "tab": "Accuracy", - "Jurisprudence - Observed inference time (s)": { - "description": "min=0.281, mean=0.281, max=0.281, sum=0.562 (2)", - "tab": "Efficiency", - "score": 0.28103237681918675 - }, - "Jurisprudence - # eval": { - "description": "min=108, mean=108, max=108, sum=216 (2)", - "tab": "General information", - "score": 108.0 - }, - "Jurisprudence - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Jurisprudence - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Jurisprudence - # prompt tokens": { - "description": "min=418.722, mean=418.722, max=418.722, sum=837.444 (2)", - "tab": "General information", - "score": 418.72222222222223 - }, - "Jurisprudence - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" - } - } - }, - { - "evaluation_name": "Philosophy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Philosophy", - "lower_is_better": 
false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.598, - "details": { - "description": "min=0.598, mean=0.598, max=0.598, sum=1.196 (2)", - "tab": "Accuracy", - "Philosophy - Observed inference time (s)": { - "description": "min=0.298, mean=0.298, max=0.298, sum=0.597 (2)", - "tab": "Efficiency", - "score": 0.29847138410979146 - }, - "Philosophy - # eval": { - "description": "min=311, mean=311, max=311, sum=622 (2)", - "tab": "General information", - "score": 311.0 - }, - "Philosophy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Philosophy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Philosophy - # prompt tokens": { - "description": "min=353.711, mean=353.711, max=353.711, sum=707.421 (2)", - "tab": "General information", - "score": 353.7106109324759 - }, - "Philosophy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" - } - } - }, - { - "evaluation_name": "Professional Psychology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Professional Psychology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.572, - "details": { - "description": "min=0.572, mean=0.572, max=0.572, sum=1.144 (2)", - "tab": "Accuracy", - "Professional Medicine - Observed inference time (s)": { - "description": "min=0.305, mean=0.305, max=0.305, sum=0.61 (2)", - "tab": "Efficiency", - "score": 0.3051472201066859 - }, - "Professional Accounting - Observed inference time (s)": { - "description": "min=0.31, mean=0.31, max=0.31, sum=0.619 (2)", - "tab": "Efficiency", - "score": 0.3096669819338102 - }, - "Professional Law - Observed inference time (s)": { - "description": "min=0.363, mean=0.363, max=0.363, sum=0.727 (2)", - "tab": "Efficiency", - "score": 0.36331592731401224 - }, - "Professional Psychology - Observed inference time (s)": { - "description": "min=0.307, mean=0.307, max=0.307, sum=0.614 (2)", - "tab": "Efficiency", - "score": 0.30723563518399505 - }, - "Professional Medicine - # eval": { - "description": "min=272, mean=272, max=272, sum=544 (2)", - "tab": "General information", - "score": 272.0 - }, - "Professional Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Medicine - # prompt tokens": { - "description": "min=1118.287, mean=1118.287, max=1118.287, sum=2236.574 (2)", - "tab": "General information", - "score": 1118.2867647058824 - }, - "Professional Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Accounting - # eval": { - "description": "min=282, mean=282, max=282, sum=564 (2)", - "tab": "General 
information", - "score": 282.0 - }, - "Professional Accounting - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Accounting - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Accounting - # prompt tokens": { - "description": "min=660.72, mean=660.72, max=660.72, sum=1321.44 (2)", - "tab": "General information", - "score": 660.7198581560284 - }, - "Professional Accounting - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Law - # eval": { - "description": "min=1534, mean=1534, max=1534, sum=3068 (2)", - "tab": "General information", - "score": 1534.0 - }, - "Professional Law - # train": { - "description": "min=4.997, mean=4.997, max=4.997, sum=9.995 (2)", - "tab": "General information", - "score": 4.9973924380704045 - }, - "Professional Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # prompt tokens": { - "description": "min=1657.596, mean=1657.596, max=1657.596, sum=3315.192 (2)", - "tab": "General information", - "score": 1657.5958279009126 - }, - "Professional Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Psychology - # eval": { - "description": "min=612, mean=612, max=612, sum=1224 (2)", - "tab": "General information", - "score": 612.0 - }, - "Professional Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # prompt tokens": { - "description": "min=597.574, mean=597.574, max=597.574, sum=1195.147 (2)", - "tab": "General information", - "score": 597.5735294117648 - }, - "Professional Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" - } - } - }, - { - "evaluation_name": "Us Foreign Policy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Us Foreign Policy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.78, - "details": { - "description": "min=0.78, mean=0.78, max=0.78, sum=1.56 (2)", - "tab": "Accuracy", - "Us Foreign Policy - Observed inference time (s)": { - "description": "min=0.292, mean=0.292, max=0.292, sum=0.584 (2)", - "tab": "Efficiency", - "score": 0.2921306538581848 - }, - "Us Foreign Policy - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Us Foreign Policy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Us Foreign Policy - truncated": { 
- "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Us Foreign Policy - # prompt tokens": { - "description": "min=433.12, mean=433.12, max=433.12, sum=866.24 (2)", - "tab": "General information", - "score": 433.12 - }, - "Us Foreign Policy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" - } - } - }, - { - "evaluation_name": "Astronomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Astronomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.605, - "details": { - "description": "min=0.605, mean=0.605, max=0.605, sum=1.211 (2)", - "tab": "Accuracy", - "Astronomy - Observed inference time (s)": { - "description": "min=0.297, mean=0.297, max=0.297, sum=0.594 (2)", - "tab": "Efficiency", - "score": 0.2971143110802299 - }, - "Astronomy - # eval": { - "description": "min=152, mean=152, max=152, sum=304 (2)", - "tab": "General information", - "score": 152.0 - }, - "Astronomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Astronomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Astronomy - # prompt tokens": { - "description": "min=600.112, mean=600.112, max=600.112, sum=1200.224 (2)", - "tab": "General information", - "score": 600.1118421052631 - }, - "Astronomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" - } - } - }, - { - "evaluation_name": "Business Ethics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Business Ethics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.59, - "details": { - "description": "min=0.59, mean=0.59, max=0.59, sum=1.18 (2)", - "tab": "Accuracy", - "Business Ethics - Observed inference time (s)": { - "description": "min=0.333, mean=0.333, max=0.333, sum=0.666 (2)", - "tab": "Efficiency", - "score": 0.33283984184265136 - }, - "Business Ethics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Business Ethics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Business Ethics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Business Ethics - # prompt tokens": { - "description": "min=589.43, mean=589.43, max=589.43, sum=1178.86 
(2)", - "tab": "General information", - "score": 589.43 - }, - "Business Ethics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" - } - } - }, - { - "evaluation_name": "Clinical Knowledge", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Clinical Knowledge", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.619, - "details": { - "description": "min=0.619, mean=0.619, max=0.619, sum=1.238 (2)", - "tab": "Accuracy", - "Clinical Knowledge - Observed inference time (s)": { - "description": "min=0.304, mean=0.304, max=0.304, sum=0.608 (2)", - "tab": "Efficiency", - "score": 0.3039509620306627 - }, - "Clinical Knowledge - # eval": { - "description": "min=265, mean=265, max=265, sum=530 (2)", - "tab": "General information", - "score": 265.0 - }, - "Clinical Knowledge - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Clinical Knowledge - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Clinical Knowledge - # prompt tokens": { - "description": "min=423.925, mean=423.925, max=423.925, sum=847.849 (2)", - "tab": "General information", - "score": 423.92452830188677 - }, - "Clinical Knowledge - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" - } - } - }, - { - "evaluation_name": "Conceptual Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Conceptual Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.519, - "details": { - "description": "min=0.519, mean=0.519, max=0.519, sum=1.038 (2)", - "tab": "Accuracy", - "Conceptual Physics - Observed inference time (s)": { - "description": "min=0.309, mean=0.309, max=0.309, sum=0.618 (2)", - "tab": "Efficiency", - "score": 0.30905701251740153 - }, - "Conceptual Physics - # eval": { - "description": "min=235, mean=235, max=235, sum=470 (2)", - "tab": "General information", - "score": 235.0 - }, - "Conceptual Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Conceptual Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Conceptual Physics - # prompt tokens": { - "description": "min=313.723, mean=313.723, max=313.723, sum=627.447 (2)", - "tab": "General information", - "score": 313.72340425531917 - }, - "Conceptual Physics - # 
output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" - } - } - }, - { - "evaluation_name": "Electrical Engineering", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Electrical Engineering", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.545, - "details": { - "description": "min=0.545, mean=0.545, max=0.545, sum=1.09 (2)", - "tab": "Accuracy", - "Electrical Engineering - Observed inference time (s)": { - "description": "min=0.319, mean=0.319, max=0.319, sum=0.639 (2)", - "tab": "Efficiency", - "score": 0.31939958375075767 - }, - "Electrical Engineering - # eval": { - "description": "min=145, mean=145, max=145, sum=290 (2)", - "tab": "General information", - "score": 145.0 - }, - "Electrical Engineering - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Electrical Engineering - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Electrical Engineering - # prompt tokens": { - "description": "min=430.345, mean=430.345, max=430.345, sum=860.69 (2)", - "tab": "General information", - "score": 430.3448275862069 - }, - "Electrical Engineering - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" - } - } - }, - { - "evaluation_name": "Elementary Mathematics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Elementary Mathematics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.463, - "details": { - "description": "min=0.463, mean=0.463, max=0.463, sum=0.926 (2)", - "tab": "Accuracy", - "Elementary Mathematics - Observed inference time (s)": { - "description": "min=0.304, mean=0.304, max=0.304, sum=0.607 (2)", - "tab": "Efficiency", - "score": 0.30370362284322266 - }, - "Elementary Mathematics - # eval": { - "description": "min=378, mean=378, max=378, sum=756 (2)", - "tab": "General information", - "score": 378.0 - }, - "Elementary Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Elementary Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Elementary Mathematics - # prompt tokens": { - "description": "min=506.09, mean=506.09, max=506.09, sum=1012.18 (2)", - "tab": "General information", - "score": 506.0899470899471 - }, - "Elementary Mathematics - # output tokens": { 
- "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" - } - } - }, - { - "evaluation_name": "Formal Logic", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Formal Logic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.389, - "details": { - "description": "min=0.389, mean=0.389, max=0.389, sum=0.778 (2)", - "tab": "Accuracy", - "Formal Logic - Observed inference time (s)": { - "description": "min=0.321, mean=0.321, max=0.321, sum=0.642 (2)", - "tab": "Efficiency", - "score": 0.3209871034773569 - }, - "Formal Logic - # eval": { - "description": "min=126, mean=126, max=126, sum=252 (2)", - "tab": "General information", - "score": 126.0 - }, - "Formal Logic - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Formal Logic - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Formal Logic - # prompt tokens": { - "description": "min=641, mean=641, max=641, sum=1282 (2)", - "tab": "General information", - "score": 641.0 - }, - "Formal Logic - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" - } - } - }, - { - "evaluation_name": "High School World History", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on High School World History", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.73, - "details": { - "description": "min=0.73, mean=0.73, max=0.73, sum=1.46 (2)", - "tab": "Accuracy", - "High School Biology - Observed inference time (s)": { - "description": "min=0.279, mean=0.279, max=0.279, sum=0.557 (2)", - "tab": "Efficiency", - "score": 0.2785434192226779 - }, - "High School Chemistry - Observed inference time (s)": { - "description": "min=0.308, mean=0.308, max=0.308, sum=0.616 (2)", - "tab": "Efficiency", - "score": 0.3082333773814986 - }, - "High School Computer Science - Observed inference time (s)": { - "description": "min=0.327, mean=0.327, max=0.327, sum=0.654 (2)", - "tab": "Efficiency", - "score": 0.3267984962463379 - }, - "High School European History - Observed inference time (s)": { - "description": "min=0.409, mean=0.409, max=0.409, sum=0.819 (2)", - "tab": "Efficiency", - "score": 0.40945722406560725 - }, - "High School Geography - Observed inference time (s)": { - "description": "min=0.305, mean=0.305, max=0.305, sum=0.61 (2)", - "tab": "Efficiency", - "score": 0.30513872763123173 - }, - "High School Government And Politics - Observed inference 
time (s)": { - "description": "min=0.28, mean=0.28, max=0.28, sum=0.56 (2)", - "tab": "Efficiency", - "score": 0.2802187642902908 - }, - "High School Macroeconomics - Observed inference time (s)": { - "description": "min=0.391, mean=0.391, max=0.391, sum=0.782 (2)", - "tab": "Efficiency", - "score": 0.3909576538281563 - }, - "High School Mathematics - Observed inference time (s)": { - "description": "min=0.304, mean=0.304, max=0.304, sum=0.608 (2)", - "tab": "Efficiency", - "score": 0.30405007821542246 - }, - "High School Microeconomics - Observed inference time (s)": { - "description": "min=0.274, mean=0.274, max=0.274, sum=0.548 (2)", - "tab": "Efficiency", - "score": 0.2737702652185905 - }, - "High School Physics - Observed inference time (s)": { - "description": "min=0.303, mean=0.303, max=0.303, sum=0.605 (2)", - "tab": "Efficiency", - "score": 0.30272982452089425 - }, - "High School Psychology - Observed inference time (s)": { - "description": "min=0.305, mean=0.305, max=0.305, sum=0.609 (2)", - "tab": "Efficiency", - "score": 0.30458581688207226 - }, - "High School Statistics - Observed inference time (s)": { - "description": "min=0.314, mean=0.314, max=0.314, sum=0.629 (2)", - "tab": "Efficiency", - "score": 0.3143394479045161 - }, - "High School US History - Observed inference time (s)": { - "description": "min=0.38, mean=0.38, max=0.38, sum=0.759 (2)", - "tab": "Efficiency", - "score": 0.37960049802181767 - }, - "High School World History - Observed inference time (s)": { - "description": "min=0.365, mean=0.365, max=0.365, sum=0.729 (2)", - "tab": "Efficiency", - "score": 0.36470460791125076 - }, - "High School Biology - # eval": { - "description": "min=310, mean=310, max=310, sum=620 (2)", - "tab": "General information", - "score": 310.0 - }, - "High School Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Biology - # prompt tokens": { - "description": "min=540.748, mean=540.748, max=540.748, sum=1081.497 (2)", - "tab": "General information", - "score": 540.7483870967742 - }, - "High School Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Chemistry - # eval": { - "description": "min=203, mean=203, max=203, sum=406 (2)", - "tab": "General information", - "score": 203.0 - }, - "High School Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # prompt tokens": { - "description": "min=495.645, mean=495.645, max=495.645, sum=991.291 (2)", - "tab": "General information", - "score": 495.6453201970443 - }, - "High School Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "High School Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Computer Science - truncated": { - 
"description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # prompt tokens": { - "description": "min=894.78, mean=894.78, max=894.78, sum=1789.56 (2)", - "tab": "General information", - "score": 894.78 - }, - "High School Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School European History - # eval": { - "description": "min=165, mean=165, max=165, sum=330 (2)", - "tab": "General information", - "score": 165.0 - }, - "High School European History - # train": { - "description": "min=2.945, mean=2.945, max=2.945, sum=5.891 (2)", - "tab": "General information", - "score": 2.9454545454545453 - }, - "High School European History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School European History - # prompt tokens": { - "description": "min=1826.103, mean=1826.103, max=1826.103, sum=3652.206 (2)", - "tab": "General information", - "score": 1826.1030303030302 - }, - "High School European History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Geography - # eval": { - "description": "min=198, mean=198, max=198, sum=396 (2)", - "tab": "General information", - "score": 198.0 - }, - "High School Geography - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Geography - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Geography - # prompt tokens": { - "description": "min=397.646, mean=397.646, max=397.646, sum=795.293 (2)", - "tab": "General information", - "score": 397.64646464646466 - }, - "High School Geography - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Government And Politics - # eval": { - "description": "min=193, mean=193, max=193, sum=386 (2)", - "tab": "General information", - "score": 193.0 - }, - "High School Government And Politics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Government And Politics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # prompt tokens": { - "description": "min=478.073, mean=478.073, max=478.073, sum=956.145 (2)", - "tab": "General information", - "score": 478.07253886010363 - }, - "High School Government And Politics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Macroeconomics - # eval": { - "description": "min=390, mean=390, max=390, sum=780 (2)", - "tab": "General information", - "score": 390.0 - }, - "High School Macroeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Macroeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - # prompt tokens": { - "description": "min=391.931, mean=391.931, max=391.931, sum=783.862 (2)", - "tab": "General information", - "score": 
391.9307692307692 - }, - "High School Macroeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Mathematics - # eval": { - "description": "min=270, mean=270, max=270, sum=540 (2)", - "tab": "General information", - "score": 270.0 - }, - "High School Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # prompt tokens": { - "description": "min=526.352, mean=526.352, max=526.352, sum=1052.704 (2)", - "tab": "General information", - "score": 526.3518518518518 - }, - "High School Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Microeconomics - # eval": { - "description": "min=238, mean=238, max=238, sum=476 (2)", - "tab": "General information", - "score": 238.0 - }, - "High School Microeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Microeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Microeconomics - # prompt tokens": { - "description": "min=410.937, mean=410.937, max=410.937, sum=821.874 (2)", - "tab": "General information", - "score": 410.93697478991595 - }, - "High School Microeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Physics - # eval": { - "description": "min=151, mean=151, max=151, sum=302 (2)", - "tab": "General information", - "score": 151.0 - }, - "High School Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # prompt tokens": { - "description": "min=553.669, mean=553.669, max=553.669, sum=1107.338 (2)", - "tab": "General information", - "score": 553.6688741721854 - }, - "High School Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Psychology - # eval": { - "description": "min=545, mean=545, max=545, sum=1090 (2)", - "tab": "General information", - "score": 545.0 - }, - "High School Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # prompt tokens": { - "description": "min=516.842, mean=516.842, max=516.842, sum=1033.684 (2)", - "tab": "General information", - "score": 516.8422018348624 - }, - "High School Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Statistics - # eval": { - "description": "min=216, mean=216, max=216, sum=432 (2)", - "tab": "General information", - "score": 216.0 - }, - "High School Statistics - # train": { - "description": 
"min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Statistics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Statistics - # prompt tokens": { - "description": "min=805, mean=805, max=805, sum=1610 (2)", - "tab": "General information", - "score": 805.0 - }, - "High School Statistics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School US History - # eval": { - "description": "min=204, mean=204, max=204, sum=408 (2)", - "tab": "General information", - "score": 204.0 - }, - "High School US History - # train": { - "description": "min=4, mean=4, max=4, sum=8 (2)", - "tab": "General information", - "score": 4.0 - }, - "High School US History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # prompt tokens": { - "description": "min=1756.25, mean=1756.25, max=1756.25, sum=3512.5 (2)", - "tab": "General information", - "score": 1756.25 - }, - "High School US History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School World History - # eval": { - "description": "min=237, mean=237, max=237, sum=474 (2)", - "tab": "General information", - "score": 237.0 - }, - "High School World History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School World History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School World History - # prompt tokens": { - "description": "min=1438.561, mean=1438.561, max=1438.561, sum=2877.122 (2)", - "tab": "General information", - "score": 1438.5611814345991 - }, - "High School World History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" - } - } - }, - { - "evaluation_name": "Human Sexuality", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Human Sexuality", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.733, - "details": { - "description": "min=0.733, mean=0.733, max=0.733, sum=1.466 (2)", - "tab": "Accuracy", - "Human Aging - Observed inference time (s)": { - "description": "min=0.298, mean=0.298, max=0.298, sum=0.596 (2)", - "tab": "Efficiency", - "score": 0.2979412987627791 - }, - "Human Sexuality - Observed inference time (s)": { - "description": "min=0.303, mean=0.303, max=0.303, sum=0.605 (2)", - "tab": "Efficiency", - "score": 0.30250649051811856 - }, - "Human Aging - # eval": { - "description": "min=223, mean=223, max=223, sum=446 (2)", - "tab": "General information", - "score": 223.0 - }, - "Human Aging - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, 
- "Human Aging - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Aging - # prompt tokens": { - "description": "min=324.48, mean=324.48, max=324.48, sum=648.96 (2)", - "tab": "General information", - "score": 324.47982062780267 - }, - "Human Aging - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Human Sexuality - # eval": { - "description": "min=131, mean=131, max=131, sum=262 (2)", - "tab": "General information", - "score": 131.0 - }, - "Human Sexuality - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Sexuality - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # prompt tokens": { - "description": "min=357.626, mean=357.626, max=357.626, sum=715.252 (2)", - "tab": "General information", - "score": 357.62595419847327 - }, - "Human Sexuality - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" - } - } - }, - { - "evaluation_name": "International Law", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on International Law", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.752, - "details": { - "description": "min=0.752, mean=0.752, max=0.752, sum=1.504 (2)", - "tab": "Accuracy", - "International Law - Observed inference time (s)": { - "description": "min=0.307, mean=0.307, max=0.307, sum=0.614 (2)", - "tab": "Efficiency", - "score": 0.30694435647696505 - }, - "International Law - # eval": { - "description": "min=121, mean=121, max=121, sum=242 (2)", - "tab": "General information", - "score": 121.0 - }, - "International Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "International Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "International Law - # prompt tokens": { - "description": "min=639.843, mean=639.843, max=639.843, sum=1279.686 (2)", - "tab": "General information", - "score": 639.8429752066115 - }, - "International Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" - } - } - }, - { - "evaluation_name": "Logical Fallacies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Logical Fallacies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.767, - "details": { - "description": "min=0.767, mean=0.767, max=0.767, sum=1.534 (2)", - "tab": "Accuracy", - "Logical Fallacies - Observed inference time (s)": { - "description": "min=0.274, mean=0.274, max=0.274, sum=0.548 (2)", - "tab": "Efficiency", - "score": 0.273789843167264 - }, - "Logical Fallacies - # eval": { - "description": "min=163, mean=163, max=163, sum=326 (2)", - "tab": "General information", - "score": 163.0 - }, - "Logical Fallacies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Logical Fallacies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Logical Fallacies - # prompt tokens": { - "description": "min=454.233, mean=454.233, max=454.233, sum=908.466 (2)", - "tab": "General information", - "score": 454.23312883435585 - }, - "Logical Fallacies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" - } - } - }, - { - "evaluation_name": "Machine Learning", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Machine Learning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5, - "details": { - "description": "min=0.5, mean=0.5, max=0.5, sum=1 (2)", - "tab": "Accuracy", - "Machine Learning - Observed inference time (s)": { - "description": "min=0.313, mean=0.313, max=0.313, sum=0.627 (2)", - "tab": "Efficiency", - "score": 0.31332691439560484 - }, - "Machine Learning - # eval": { - "description": "min=112, mean=112, max=112, sum=224 (2)", - "tab": "General information", - "score": 112.0 - }, - "Machine Learning - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Machine Learning - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Machine Learning - # prompt tokens": { - "description": "min=671.598, mean=671.598, max=671.598, sum=1343.196 (2)", - "tab": "General information", - "score": 671.5982142857143 - }, - "Machine Learning - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" - } - } - }, - { - "evaluation_name": "Management", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Management", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.748, - "details": { - "description": "min=0.748, mean=0.748, max=0.748, sum=1.495 
(2)", - "tab": "Accuracy", - "Management - Observed inference time (s)": { - "description": "min=0.305, mean=0.305, max=0.305, sum=0.61 (2)", - "tab": "Efficiency", - "score": 0.3051937992132983 - }, - "Management - # eval": { - "description": "min=103, mean=103, max=103, sum=206 (2)", - "tab": "General information", - "score": 103.0 - }, - "Management - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Management - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Management - # prompt tokens": { - "description": "min=292.34, mean=292.34, max=292.34, sum=584.68 (2)", - "tab": "General information", - "score": 292.3398058252427 - }, - "Management - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" - } - } - }, - { - "evaluation_name": "Marketing", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Marketing", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.833, - "details": { - "description": "min=0.833, mean=0.833, max=0.833, sum=1.667 (2)", - "tab": "Accuracy", - "Marketing - Observed inference time (s)": { - "description": "min=0.276, mean=0.276, max=0.276, sum=0.552 (2)", - "tab": "Efficiency", - "score": 0.2761631949335082 - }, - "Marketing - # eval": { - "description": "min=234, mean=234, max=234, sum=468 (2)", - "tab": "General information", - "score": 234.0 - }, - "Marketing - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Marketing - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Marketing - # prompt tokens": { - "description": "min=437.667, mean=437.667, max=437.667, sum=875.333 (2)", - "tab": "General information", - "score": 437.6666666666667 - }, - "Marketing - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" - } - } - }, - { - "evaluation_name": "Medical Genetics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Medical Genetics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.62, - "details": { - "description": "min=0.62, mean=0.62, max=0.62, sum=1.24 (2)", - "tab": "Accuracy", - "Medical Genetics - Observed inference time (s)": { - "description": "min=0.305, mean=0.305, max=0.305, sum=0.609 (2)", - "tab": "Efficiency", - "score": 0.3045226716995239 - }, - "Medical Genetics - # eval": { - 
"description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Medical Genetics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Medical Genetics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Medical Genetics - # prompt tokens": { - "description": "min=352.71, mean=352.71, max=352.71, sum=705.42 (2)", - "tab": "General information", - "score": 352.71 - }, - "Medical Genetics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" - } - } - }, - { - "evaluation_name": "Miscellaneous", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Miscellaneous", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.688, - "details": { - "description": "min=0.688, mean=0.688, max=0.688, sum=1.377 (2)", - "tab": "Accuracy", - "Miscellaneous - Observed inference time (s)": { - "description": "min=0.334, mean=0.334, max=0.334, sum=0.668 (2)", - "tab": "Efficiency", - "score": 0.33387171049836645 - }, - "Miscellaneous - # eval": { - "description": "min=783, mean=783, max=783, sum=1566 (2)", - "tab": "General information", - "score": 783.0 - }, - "Miscellaneous - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Miscellaneous - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Miscellaneous - # prompt tokens": { - "description": "min=314.847, mean=314.847, max=314.847, sum=629.693 (2)", - "tab": "General information", - "score": 314.84674329501917 - }, - "Miscellaneous - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" - } - } - }, - { - "evaluation_name": "Moral Scenarios", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Moral Scenarios", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.231, - "details": { - "description": "min=0.231, mean=0.231, max=0.231, sum=0.463 (2)", - "tab": "Accuracy", - "Moral Disputes - Observed inference time (s)": { - "description": "min=0.303, mean=0.303, max=0.303, sum=0.607 (2)", - "tab": "Efficiency", - "score": 0.3032567480395984 - }, - "Moral Scenarios - Observed inference time (s)": { - "description": "min=0.267, mean=0.267, max=0.267, sum=0.534 (2)", - "tab": "Efficiency", - "score": 0.26702385215119945 - }, - "Moral Disputes - # 
eval": { - "description": "min=346, mean=346, max=346, sum=692 (2)", - "tab": "General information", - "score": 346.0 - }, - "Moral Disputes - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Disputes - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Disputes - # prompt tokens": { - "description": "min=497.514, mean=497.514, max=497.514, sum=995.029 (2)", - "tab": "General information", - "score": 497.514450867052 - }, - "Moral Disputes - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Moral Scenarios - # eval": { - "description": "min=895, mean=895, max=895, sum=1790 (2)", - "tab": "General information", - "score": 895.0 - }, - "Moral Scenarios - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Scenarios - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # prompt tokens": { - "description": "min=664.479, mean=664.479, max=664.479, sum=1328.959 (2)", - "tab": "General information", - "score": 664.4793296089385 - }, - "Moral Scenarios - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" - } - } - }, - { - "evaluation_name": "Nutrition", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Nutrition", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.627, - "details": { - "description": "min=0.627, mean=0.627, max=0.627, sum=1.255 (2)", - "tab": "Accuracy", - "Nutrition - Observed inference time (s)": { - "description": "min=0.311, mean=0.311, max=0.311, sum=0.622 (2)", - "tab": "Efficiency", - "score": 0.3112297058105469 - }, - "Nutrition - # eval": { - "description": "min=306, mean=306, max=306, sum=612 (2)", - "tab": "General information", - "score": 306.0 - }, - "Nutrition - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Nutrition - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Nutrition - # prompt tokens": { - "description": "min=584.69, mean=584.69, max=584.69, sum=1169.379 (2)", - "tab": "General information", - "score": 584.6895424836601 - }, - "Nutrition - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" - } - } - }, - { - "evaluation_name": "Prehistory", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Prehistory", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.605, - "details": { - "description": "min=0.605, mean=0.605, max=0.605, sum=1.21 (2)", - "tab": "Accuracy", - "Prehistory - Observed inference time (s)": { - "description": "min=0.291, mean=0.291, max=0.291, sum=0.583 (2)", - "tab": "Efficiency", - "score": 0.29145334090715574 - }, - "Prehistory - # eval": { - "description": "min=324, mean=324, max=324, sum=648 (2)", - "tab": "General information", - "score": 324.0 - }, - "Prehistory - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Prehistory - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Prehistory - # prompt tokens": { - "description": "min=524.454, mean=524.454, max=524.454, sum=1048.907 (2)", - "tab": "General information", - "score": 524.4537037037037 - }, - "Prehistory - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" - } - } - }, - { - "evaluation_name": "Public Relations", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Public Relations", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.673, - "details": { - "description": "min=0.673, mean=0.673, max=0.673, sum=1.345 (2)", - "tab": "Accuracy", - "Public Relations - Observed inference time (s)": { - "description": "min=0.282, mean=0.282, max=0.282, sum=0.564 (2)", - "tab": "Efficiency", - "score": 0.28212652423165063 - }, - "Public Relations - # eval": { - "description": "min=110, mean=110, max=110, sum=220 (2)", - "tab": "General information", - "score": 110.0 - }, - "Public Relations - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Public Relations - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Public Relations - # prompt tokens": { - "description": "min=420.609, mean=420.609, max=420.609, sum=841.218 (2)", - "tab": "General information", - "score": 420.6090909090909 - }, - "Public Relations - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" - } - } - }, - { - "evaluation_name": "Security Studies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM 
on Security Studies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.702, - "details": { - "description": "min=0.702, mean=0.702, max=0.702, sum=1.404 (2)", - "tab": "Accuracy", - "Security Studies - Observed inference time (s)": { - "description": "min=0.322, mean=0.322, max=0.322, sum=0.645 (2)", - "tab": "Efficiency", - "score": 0.3223595599738919 - }, - "Security Studies - # eval": { - "description": "min=245, mean=245, max=245, sum=490 (2)", - "tab": "General information", - "score": 245.0 - }, - "Security Studies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Security Studies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Security Studies - # prompt tokens": { - "description": "min=1196.433, mean=1196.433, max=1196.433, sum=2392.865 (2)", - "tab": "General information", - "score": 1196.4326530612245 - }, - "Security Studies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" - } - } - }, - { - "evaluation_name": "Sociology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Sociology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.816, - "details": { - "description": "min=0.816, mean=0.816, max=0.816, sum=1.632 (2)", - "tab": "Accuracy", - "Sociology - Observed inference time (s)": { - "description": "min=0.288, mean=0.288, max=0.288, sum=0.575 (2)", - "tab": "Efficiency", - "score": 0.2876073993853669 - }, - "Sociology - # eval": { - "description": "min=201, mean=201, max=201, sum=402 (2)", - "tab": "General information", - "score": 201.0 - }, - "Sociology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Sociology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Sociology - # prompt tokens": { - "description": "min=446.512, mean=446.512, max=446.512, sum=893.025 (2)", - "tab": "General information", - "score": 446.5124378109453 - }, - "Sociology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" - } - } - }, - { - "evaluation_name": "Virology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Virology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.47, - "details": { - "description": "min=0.47, 
mean=0.47, max=0.47, sum=0.94 (2)", - "tab": "Accuracy", - "Virology - Observed inference time (s)": { - "description": "min=0.28, mean=0.28, max=0.28, sum=0.559 (2)", - "tab": "Efficiency", - "score": 0.27966123316661423 - }, - "Virology - # eval": { - "description": "min=166, mean=166, max=166, sum=332 (2)", - "tab": "General information", - "score": 166.0 - }, - "Virology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Virology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Virology - # prompt tokens": { - "description": "min=352.759, mean=352.759, max=352.759, sum=705.518 (2)", - "tab": "General information", - "score": 352.7590361445783 - }, - "Virology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" - } - } - }, - { - "evaluation_name": "World Religions", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on World Religions", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.702, - "details": { - "description": "min=0.702, mean=0.702, max=0.702, sum=1.404 (2)", - "tab": "Accuracy", - "World Religions - Observed inference time (s)": { - "description": "min=0.284, mean=0.284, max=0.284, sum=0.569 (2)", - "tab": "Efficiency", - "score": 0.2843696499428554 - }, - "World Religions - # eval": { - "description": "min=171, mean=171, max=171, sum=342 (2)", - "tab": "General information", - "score": 171.0 - }, - "World Religions - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "World Religions - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "World Religions - # prompt tokens": { - "description": "min=277.404, mean=277.404, max=277.404, sum=554.807 (2)", - "tab": "General information", - "score": 277.4035087719298 - }, - "World Religions - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" - } - } - }, - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.824, - "details": { - "tab": "Efficiency" - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git 
a/data/helm_mmlu/microsoft/phi-3-medium-4k-instruct/f78d6e0a-a397-4a41-a37e-696bda5a1987.json b/data/helm_mmlu/microsoft/phi-3-medium-4k-instruct/f78d6e0a-a397-4a41-a37e-696bda5a1987.json deleted file mode 100644 index f1d62a268..000000000 --- a/data/helm_mmlu/microsoft/phi-3-medium-4k-instruct/f78d6e0a-a397-4a41-a37e-696bda5a1987.json +++ /dev/null @@ -1,3021 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/microsoft_phi-3-medium-4k-instruct/1770835937.459157", - "retrieved_timestamp": "1770835937.459157", - "source_metadata": { - "source_name": "helm_mmlu", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Phi-3 14B", - "id": "microsoft/phi-3-medium-4k-instruct", - "developer": "microsoft", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "MMLU All Subjects", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU All Subjects", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.775, - "details": { - "description": "min=0.5, mean=0.775, max=0.969, sum=88.295 (114)", - "tab": "Accuracy", - "MMLU All Subjects - Observed inference time (s)": { - "description": "min=2.025, mean=4.948, max=22.342, sum=564.095 (114)", - "tab": "Efficiency", - "score": 4.948199983258553 - }, - "MMLU All Subjects - # eval": { - "description": "min=100, mean=246.351, max=1534, sum=28084 (114)", - "tab": "General information", - "score": 246.35087719298247 - }, - "MMLU All Subjects - # train": { - "description": "min=5, mean=5, max=5, sum=570 (114)", - "tab": "General information", - "score": 5.0 - }, - "MMLU All Subjects - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (114)", - "tab": "General information", - "score": 0.0 - }, - "MMLU All Subjects - # prompt tokens": { - "description": "min=313.474, mean=714.893, max=3168.636, sum=81497.749 (114)", - "tab": "General information", - "score": 714.8925389546507 - }, - "MMLU All Subjects - # output tokens": { - "description": "min=1, mean=1, max=1, sum=114 (114)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - 
"moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] - } - } - }, - { - "evaluation_name": "Abstract Algebra", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Abstract Algebra", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5, - "details": { - "description": "min=0.5, mean=0.5, max=0.5, sum=1 (2)", - "tab": "Accuracy", - "Abstract Algebra - Observed inference time (s)": { - "description": "min=2.63, mean=2.63, max=2.63, sum=5.26 (2)", - "tab": "Efficiency", - "score": 2.63020414352417 - }, - "Abstract Algebra - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Abstract Algebra - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Abstract Algebra - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Abstract Algebra - # prompt tokens": { - "description": "min=397.65, mean=397.65, max=397.65, sum=795.3 (2)", - "tab": "General information", - "score": 397.65 - }, - "Abstract Algebra - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": 
"abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" - } - } - }, - { - "evaluation_name": "Anatomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Anatomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.719, - "details": { - "description": "min=0.719, mean=0.719, max=0.719, sum=1.437 (2)", - "tab": "Accuracy", - "Anatomy - Observed inference time (s)": { - "description": "min=3.025, mean=3.025, max=3.025, sum=6.051 (2)", - "tab": "Efficiency", - "score": 3.0252625394750523 - }, - "Anatomy - # eval": { - "description": "min=135, mean=135, max=135, sum=270 (2)", - "tab": "General information", - "score": 135.0 - }, - "Anatomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Anatomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Anatomy - # prompt tokens": { - "description": "min=418.133, mean=418.133, max=418.133, sum=836.267 (2)", - "tab": "General information", - "score": 418.1333333333333 - }, - "Anatomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" - } - } - }, - { - "evaluation_name": "College Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on College Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.529, - "details": { - "description": "min=0.529, mean=0.529, max=0.529, sum=1.059 (2)", - "tab": "Accuracy", - "College Chemistry - Observed inference time (s)": { - "description": "min=3.886, mean=3.886, max=3.886, sum=7.772 (2)", - "tab": "Efficiency", - "score": 3.886199688911438 - }, - "College Biology - Observed inference time (s)": { - "description": "min=4.073, mean=4.073, max=4.073, sum=8.146 (2)", - "tab": "Efficiency", - "score": 4.072841899262534 - }, - "College Computer Science - Observed inference time (s)": { - "description": "min=6.237, mean=6.237, max=6.237, sum=12.473 (2)", - "tab": "Efficiency", - "score": 6.236730601787567 - }, - "College Mathematics - Observed inference time (s)": { - "description": "min=4.541, mean=4.541, max=4.541, sum=9.083 (2)", - "tab": "Efficiency", - "score": 4.541367738246918 - }, - "College Medicine - Observed inference time (s)": { - "description": "min=4.259, mean=4.259, max=4.259, sum=8.518 (2)", - "tab": "Efficiency", - "score": 4.259122938089977 - }, - "College Physics - Observed inference time (s)": { - "description": "min=3.966, mean=3.966, max=3.966, sum=7.933 (2)", - "tab": "Efficiency", - "score": 3.966460019934411 - }, - "College Chemistry - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 
100.0 - }, - "College Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Chemistry - # prompt tokens": { - "description": "min=622.43, mean=622.43, max=622.43, sum=1244.86 (2)", - "tab": "General information", - "score": 622.43 - }, - "College Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Biology - # eval": { - "description": "min=144, mean=144, max=144, sum=288 (2)", - "tab": "General information", - "score": 144.0 - }, - "College Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # prompt tokens": { - "description": "min=562.632, mean=562.632, max=562.632, sum=1125.264 (2)", - "tab": "General information", - "score": 562.6319444444445 - }, - "College Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer Science - # prompt tokens": { - "description": "min=910.14, mean=910.14, max=910.14, sum=1820.28 (2)", - "tab": "General information", - "score": 910.14 - }, - "College Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Mathematics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # prompt tokens": { - "description": "min=655.96, mean=655.96, max=655.96, sum=1311.92 (2)", - "tab": "General information", - "score": 655.96 - }, - "College Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Medicine - # eval": { - "description": "min=173, mean=173, max=173, sum=346 (2)", - "tab": "General information", - "score": 173.0 - }, - "College Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Medicine - # prompt tokens": { - "description": "min=617.671, mean=617.671, max=617.671, sum=1235.341 (2)", - "tab": "General information", - "score": 617.6705202312139 - }, - "College Medicine - # output tokens": { 
- "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Physics - # eval": { - "description": "min=102, mean=102, max=102, sum=204 (2)", - "tab": "General information", - "score": 102.0 - }, - "College Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Physics - # prompt tokens": { - "description": "min=560.873, mean=560.873, max=560.873, sum=1121.745 (2)", - "tab": "General information", - "score": 560.8725490196078 - }, - "College Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" - } - } - }, - { - "evaluation_name": "Computer Security", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Computer Security", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.79, - "details": { - "description": "min=0.79, mean=0.79, max=0.79, sum=1.58 (2)", - "tab": "Accuracy", - "Computer Security - Observed inference time (s)": { - "description": "min=2.748, mean=2.748, max=2.748, sum=5.496 (2)", - "tab": "Efficiency", - "score": 2.7481748914718627 - }, - "Computer Security - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Computer Security - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Computer Security - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Computer Security - # prompt tokens": { - "description": "min=428.17, mean=428.17, max=428.17, sum=856.34 (2)", - "tab": "General information", - "score": 428.17 - }, - "Computer Security - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" - } - } - }, - { - "evaluation_name": "Econometrics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Econometrics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.614, - "details": { - "description": "min=0.614, mean=0.614, max=0.614, sum=1.228 (2)", - "tab": "Accuracy", - "Econometrics - Observed inference time (s)": { - "description": "min=4.32, mean=4.32, max=4.32, sum=8.639 (2)", - "tab": "Efficiency", - "score": 4.319587314338015 - }, - "Econometrics - # eval": { - "description": 
"min=114, mean=114, max=114, sum=228 (2)", - "tab": "General information", - "score": 114.0 - }, - "Econometrics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Econometrics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Econometrics - # prompt tokens": { - "description": "min=684.675, mean=684.675, max=684.675, sum=1369.351 (2)", - "tab": "General information", - "score": 684.6754385964912 - }, - "Econometrics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" - } - } - }, - { - "evaluation_name": "Global Facts", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Global Facts", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5, - "details": { - "description": "min=0.5, mean=0.5, max=0.5, sum=1 (2)", - "tab": "Accuracy", - "Global Facts - Observed inference time (s)": { - "description": "min=3.374, mean=3.374, max=3.374, sum=6.747 (2)", - "tab": "Efficiency", - "score": 3.373600058555603 - }, - "Global Facts - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Global Facts - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Global Facts - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Global Facts - # prompt tokens": { - "description": "min=493.54, mean=493.54, max=493.54, sum=987.08 (2)", - "tab": "General information", - "score": 493.54 - }, - "Global Facts - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" - } - } - }, - { - "evaluation_name": "Jurisprudence", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Jurisprudence", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.88, - "details": { - "description": "min=0.88, mean=0.88, max=0.88, sum=1.759 (2)", - "tab": "Accuracy", - "Jurisprudence - Observed inference time (s)": { - "description": "min=3.225, mean=3.225, max=3.225, sum=6.45 (2)", - "tab": "Efficiency", - "score": 3.2251307015065795 - }, - "Jurisprudence - # eval": { - "description": "min=108, mean=108, max=108, sum=216 (2)", - "tab": "General information", - "score": 108.0 - }, - "Jurisprudence - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - 
"Jurisprudence - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Jurisprudence - # prompt tokens": { - "description": "min=458.898, mean=458.898, max=458.898, sum=917.796 (2)", - "tab": "General information", - "score": 458.89814814814815 - }, - "Jurisprudence - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" - } - } - }, - { - "evaluation_name": "Philosophy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Philosophy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.804, - "details": { - "description": "min=0.804, mean=0.804, max=0.804, sum=1.608 (2)", - "tab": "Accuracy", - "Philosophy - Observed inference time (s)": { - "description": "min=2.591, mean=2.591, max=2.591, sum=5.182 (2)", - "tab": "Efficiency", - "score": 2.591215438781444 - }, - "Philosophy - # eval": { - "description": "min=311, mean=311, max=311, sum=622 (2)", - "tab": "General information", - "score": 311.0 - }, - "Philosophy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Philosophy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Philosophy - # prompt tokens": { - "description": "min=381.122, mean=381.122, max=381.122, sum=762.244 (2)", - "tab": "General information", - "score": 381.12218649517683 - }, - "Philosophy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" - } - } - }, - { - "evaluation_name": "Professional Psychology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Professional Psychology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.835, - "details": { - "description": "min=0.835, mean=0.835, max=0.835, sum=1.67 (2)", - "tab": "Accuracy", - "Professional Medicine - Observed inference time (s)": { - "description": "min=9.39, mean=9.39, max=9.39, sum=18.781 (2)", - "tab": "Efficiency", - "score": 9.390463957015205 - }, - "Professional Accounting - Observed inference time (s)": { - "description": "min=5.784, mean=5.784, max=5.784, sum=11.567 (2)", - "tab": "Efficiency", - "score": 5.7837115450108305 - }, - "Professional Law - Observed inference time (s)": { - "description": "min=13.198, mean=13.198, max=13.198, sum=26.396 (2)", - "tab": "Efficiency", - "score": 13.198108883849024 - }, - "Professional Psychology - Observed inference time (s)": { - "description": "min=4.667, mean=4.667, 
max=4.667, sum=9.335 (2)", - "tab": "Efficiency", - "score": 4.667331269753524 - }, - "Professional Medicine - # eval": { - "description": "min=272, mean=272, max=272, sum=544 (2)", - "tab": "General information", - "score": 272.0 - }, - "Professional Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Medicine - # prompt tokens": { - "description": "min=1339.647, mean=1339.647, max=1339.647, sum=2679.294 (2)", - "tab": "General information", - "score": 1339.6470588235295 - }, - "Professional Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Accounting - # eval": { - "description": "min=282, mean=282, max=282, sum=564 (2)", - "tab": "General information", - "score": 282.0 - }, - "Professional Accounting - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Accounting - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Accounting - # prompt tokens": { - "description": "min=832.277, mean=832.277, max=832.277, sum=1664.553 (2)", - "tab": "General information", - "score": 832.2765957446809 - }, - "Professional Accounting - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Law - # eval": { - "description": "min=1534, mean=1534, max=1534, sum=3068 (2)", - "tab": "General information", - "score": 1534.0 - }, - "Professional Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # prompt tokens": { - "description": "min=1924.007, mean=1924.007, max=1924.007, sum=3848.014 (2)", - "tab": "General information", - "score": 1924.0071707953064 - }, - "Professional Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Psychology - # eval": { - "description": "min=612, mean=612, max=612, sum=1224 (2)", - "tab": "General information", - "score": 612.0 - }, - "Professional Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # prompt tokens": { - "description": "min=659.078, mean=659.078, max=659.078, sum=1318.157 (2)", - "tab": "General information", - "score": 659.0784313725491 - }, - "Professional Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" - } - } - }, - { - "evaluation_name": "Us Foreign Policy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": 
"url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Us Foreign Policy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.95, - "details": { - "description": "min=0.95, mean=0.95, max=0.95, sum=1.9 (2)", - "tab": "Accuracy", - "Us Foreign Policy - Observed inference time (s)": { - "description": "min=2.982, mean=2.982, max=2.982, sum=5.964 (2)", - "tab": "Efficiency", - "score": 2.98179637670517 - }, - "Us Foreign Policy - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Us Foreign Policy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Us Foreign Policy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Us Foreign Policy - # prompt tokens": { - "description": "min=479.81, mean=479.81, max=479.81, sum=959.62 (2)", - "tab": "General information", - "score": 479.81 - }, - "Us Foreign Policy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" - } - } - }, - { - "evaluation_name": "Astronomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Astronomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.849, - "details": { - "description": "min=0.849, mean=0.849, max=0.849, sum=1.697 (2)", - "tab": "Accuracy", - "Astronomy - Observed inference time (s)": { - "description": "min=4.875, mean=4.875, max=4.875, sum=9.749 (2)", - "tab": "Efficiency", - "score": 4.874531077711206 - }, - "Astronomy - # eval": { - "description": "min=152, mean=152, max=152, sum=304 (2)", - "tab": "General information", - "score": 152.0 - }, - "Astronomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Astronomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Astronomy - # prompt tokens": { - "description": "min=690.079, mean=690.079, max=690.079, sum=1380.158 (2)", - "tab": "General information", - "score": 690.078947368421 - }, - "Astronomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" - } - } - }, - { - "evaluation_name": "Business Ethics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Business 
Ethics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8, - "details": { - "description": "min=0.8, mean=0.8, max=0.8, sum=1.6 (2)", - "tab": "Accuracy", - "Business Ethics - Observed inference time (s)": { - "description": "min=4.78, mean=4.78, max=4.78, sum=9.559 (2)", - "tab": "Efficiency", - "score": 4.779508647918701 - }, - "Business Ethics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Business Ethics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Business Ethics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Business Ethics - # prompt tokens": { - "description": "min=683.44, mean=683.44, max=683.44, sum=1366.88 (2)", - "tab": "General information", - "score": 683.44 - }, - "Business Ethics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" - } - } - }, - { - "evaluation_name": "Clinical Knowledge", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Clinical Knowledge", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.826, - "details": { - "description": "min=0.826, mean=0.826, max=0.826, sum=1.653 (2)", - "tab": "Accuracy", - "Clinical Knowledge - Observed inference time (s)": { - "description": "min=3.474, mean=3.474, max=3.474, sum=6.948 (2)", - "tab": "Efficiency", - "score": 3.474059367629717 - }, - "Clinical Knowledge - # eval": { - "description": "min=265, mean=265, max=265, sum=530 (2)", - "tab": "General information", - "score": 265.0 - }, - "Clinical Knowledge - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Clinical Knowledge - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Clinical Knowledge - # prompt tokens": { - "description": "min=496.374, mean=496.374, max=496.374, sum=992.747 (2)", - "tab": "General information", - "score": 496.3735849056604 - }, - "Clinical Knowledge - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" - } - } - }, - { - "evaluation_name": "Conceptual Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Conceptual Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.809, - "details": { - "description": "min=0.809, mean=0.809, max=0.809, sum=1.617 (2)", - "tab": "Accuracy", - "Conceptual Physics - Observed inference time (s)": { - "description": "min=2.448, mean=2.448, max=2.448, sum=4.896 (2)", - "tab": "Efficiency", - "score": 2.448020648956299 - }, - "Conceptual Physics - # eval": { - "description": "min=235, mean=235, max=235, sum=470 (2)", - "tab": "General information", - "score": 235.0 - }, - "Conceptual Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Conceptual Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Conceptual Physics - # prompt tokens": { - "description": "min=342.153, mean=342.153, max=342.153, sum=684.306 (2)", - "tab": "General information", - "score": 342.1531914893617 - }, - "Conceptual Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" - } - } - }, - { - "evaluation_name": "Electrical Engineering", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Electrical Engineering", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.683, - "details": { - "description": "min=0.683, mean=0.683, max=0.683, sum=1.366 (2)", - "tab": "Accuracy", - "Electrical Engineering - Observed inference time (s)": { - "description": "min=3.495, mean=3.495, max=3.495, sum=6.99 (2)", - "tab": "Efficiency", - "score": 3.4950728284901587 - }, - "Electrical Engineering - # eval": { - "description": "min=145, mean=145, max=145, sum=290 (2)", - "tab": "General information", - "score": 145.0 - }, - "Electrical Engineering - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Electrical Engineering - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Electrical Engineering - # prompt tokens": { - "description": "min=506.779, mean=506.779, max=506.779, sum=1013.559 (2)", - "tab": "General information", - "score": 506.7793103448276 - }, - "Electrical Engineering - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" - } - } - }, - { - "evaluation_name": "Elementary Mathematics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Elementary Mathematics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.709, - "details": { - 
"description": "min=0.709, mean=0.709, max=0.709, sum=1.418 (2)", - "tab": "Accuracy", - "Elementary Mathematics - Observed inference time (s)": { - "description": "min=4.344, mean=4.344, max=4.344, sum=8.688 (2)", - "tab": "Efficiency", - "score": 4.344110502137078 - }, - "Elementary Mathematics - # eval": { - "description": "min=378, mean=378, max=378, sum=756 (2)", - "tab": "General information", - "score": 378.0 - }, - "Elementary Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Elementary Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Elementary Mathematics - # prompt tokens": { - "description": "min=618.156, mean=618.156, max=618.156, sum=1236.312 (2)", - "tab": "General information", - "score": 618.1560846560847 - }, - "Elementary Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" - } - } - }, - { - "evaluation_name": "Formal Logic", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Formal Logic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.587, - "details": { - "description": "min=0.587, mean=0.587, max=0.587, sum=1.175 (2)", - "tab": "Accuracy", - "Formal Logic - Observed inference time (s)": { - "description": "min=4.988, mean=4.988, max=4.988, sum=9.977 (2)", - "tab": "Efficiency", - "score": 4.988478910355341 - }, - "Formal Logic - # eval": { - "description": "min=126, mean=126, max=126, sum=252 (2)", - "tab": "General information", - "score": 126.0 - }, - "Formal Logic - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Formal Logic - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Formal Logic - # prompt tokens": { - "description": "min=700.81, mean=700.81, max=700.81, sum=1401.619 (2)", - "tab": "General information", - "score": 700.8095238095239 - }, - "Formal Logic - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" - } - } - }, - { - "evaluation_name": "High School World History", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on High School World History", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.903, - "details": { - "description": "min=0.903, mean=0.903, max=0.903, sum=1.806 (2)", - "tab": "Accuracy", - "High School 
Biology - Observed inference time (s)": { - "description": "min=4.253, mean=4.253, max=4.253, sum=8.506 (2)", - "tab": "Efficiency", - "score": 4.253153976317375 - }, - "High School Chemistry - Observed inference time (s)": { - "description": "min=4.116, mean=4.116, max=4.116, sum=8.232 (2)", - "tab": "Efficiency", - "score": 4.115784048446881 - }, - "High School Computer Science - Observed inference time (s)": { - "description": "min=6.919, mean=6.919, max=6.919, sum=13.839 (2)", - "tab": "Efficiency", - "score": 6.919438579082489 - }, - "High School European History - Observed inference time (s)": { - "description": "min=22.342, mean=22.342, max=22.342, sum=44.684 (2)", - "tab": "Efficiency", - "score": 22.341962937152747 - }, - "High School Geography - Observed inference time (s)": { - "description": "min=3.01, mean=3.01, max=3.01, sum=6.02 (2)", - "tab": "Efficiency", - "score": 3.010115607820376 - }, - "High School Government And Politics - Observed inference time (s)": { - "description": "min=3.784, mean=3.784, max=3.784, sum=7.567 (2)", - "tab": "Efficiency", - "score": 3.783631190117159 - }, - "High School Macroeconomics - Observed inference time (s)": { - "description": "min=3.202, mean=3.202, max=3.202, sum=6.403 (2)", - "tab": "Efficiency", - "score": 3.2015056090477185 - }, - "High School Mathematics - Observed inference time (s)": { - "description": "min=4.126, mean=4.126, max=4.126, sum=8.251 (2)", - "tab": "Efficiency", - "score": 4.125549591912163 - }, - "High School Microeconomics - Observed inference time (s)": { - "description": "min=3.125, mean=3.125, max=3.125, sum=6.249 (2)", - "tab": "Efficiency", - "score": 3.124516798668549 - }, - "High School Physics - Observed inference time (s)": { - "description": "min=3.882, mean=3.882, max=3.882, sum=7.765 (2)", - "tab": "Efficiency", - "score": 3.88235890154807 - }, - "High School Psychology - Observed inference time (s)": { - "description": "min=4.036, mean=4.036, max=4.036, sum=8.072 (2)", - "tab": "Efficiency", - "score": 4.035925890108861 - }, - "High School Statistics - Observed inference time (s)": { - "description": "min=6.294, mean=6.294, max=6.294, sum=12.587 (2)", - "tab": "Efficiency", - "score": 6.293625408852542 - }, - "High School US History - Observed inference time (s)": { - "description": "min=17.94, mean=17.94, max=17.94, sum=35.88 (2)", - "tab": "Efficiency", - "score": 17.93984198219636 - }, - "High School World History - Observed inference time (s)": { - "description": "min=11.445, mean=11.445, max=11.445, sum=22.889 (2)", - "tab": "Efficiency", - "score": 11.444628432833193 - }, - "High School Biology - # eval": { - "description": "min=310, mean=310, max=310, sum=620 (2)", - "tab": "General information", - "score": 310.0 - }, - "High School Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Biology - # prompt tokens": { - "description": "min=605.894, mean=605.894, max=605.894, sum=1211.787 (2)", - "tab": "General information", - "score": 605.8935483870968 - }, - "High School Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Chemistry - # eval": { - "description": "min=203, mean=203, max=203, sum=406 (2)", - "tab": "General information", - "score": 203.0 - }, - "High School 
Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # prompt tokens": { - "description": "min=577.665, mean=577.665, max=577.665, sum=1155.33 (2)", - "tab": "General information", - "score": 577.6650246305419 - }, - "High School Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "High School Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # prompt tokens": { - "description": "min=997.57, mean=997.57, max=997.57, sum=1995.14 (2)", - "tab": "General information", - "score": 997.57 - }, - "High School Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School European History - # eval": { - "description": "min=165, mean=165, max=165, sum=330 (2)", - "tab": "General information", - "score": 165.0 - }, - "High School European History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School European History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School European History - # prompt tokens": { - "description": "min=3168.636, mean=3168.636, max=3168.636, sum=6337.273 (2)", - "tab": "General information", - "score": 3168.6363636363635 - }, - "High School European History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Geography - # eval": { - "description": "min=198, mean=198, max=198, sum=396 (2)", - "tab": "General information", - "score": 198.0 - }, - "High School Geography - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Geography - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Geography - # prompt tokens": { - "description": "min=445.657, mean=445.657, max=445.657, sum=891.313 (2)", - "tab": "General information", - "score": 445.65656565656565 - }, - "High School Geography - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Government And Politics - # eval": { - "description": "min=193, mean=193, max=193, sum=386 (2)", - "tab": "General information", - "score": 193.0 - }, - "High School Government And Politics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Government And Politics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # prompt 
tokens": { - "description": "min=536.927, mean=536.927, max=536.927, sum=1073.855 (2)", - "tab": "General information", - "score": 536.9274611398964 - }, - "High School Government And Politics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Macroeconomics - # eval": { - "description": "min=390, mean=390, max=390, sum=780 (2)", - "tab": "General information", - "score": 390.0 - }, - "High School Macroeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Macroeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - # prompt tokens": { - "description": "min=454.662, mean=454.662, max=454.662, sum=909.323 (2)", - "tab": "General information", - "score": 454.66153846153844 - }, - "High School Macroeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Mathematics - # eval": { - "description": "min=270, mean=270, max=270, sum=540 (2)", - "tab": "General information", - "score": 270.0 - }, - "High School Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # prompt tokens": { - "description": "min=588.181, mean=588.181, max=588.181, sum=1176.363 (2)", - "tab": "General information", - "score": 588.1814814814815 - }, - "High School Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Microeconomics - # eval": { - "description": "min=238, mean=238, max=238, sum=476 (2)", - "tab": "General information", - "score": 238.0 - }, - "High School Microeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Microeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Microeconomics - # prompt tokens": { - "description": "min=458.492, mean=458.492, max=458.492, sum=916.983 (2)", - "tab": "General information", - "score": 458.49159663865544 - }, - "High School Microeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Physics - # eval": { - "description": "min=151, mean=151, max=151, sum=302 (2)", - "tab": "General information", - "score": 151.0 - }, - "High School Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # prompt tokens": { - "description": "min=630.788, mean=630.788, max=630.788, sum=1261.576 (2)", - "tab": "General information", - "score": 630.7880794701987 - }, - "High School Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Psychology - # eval": { - "description": 
"min=545, mean=545, max=545, sum=1090 (2)", - "tab": "General information", - "score": 545.0 - }, - "High School Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # prompt tokens": { - "description": "min=594.919, mean=594.919, max=594.919, sum=1189.839 (2)", - "tab": "General information", - "score": 594.9192660550459 - }, - "High School Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Statistics - # eval": { - "description": "min=216, mean=216, max=216, sum=432 (2)", - "tab": "General information", - "score": 216.0 - }, - "High School Statistics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Statistics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Statistics - # prompt tokens": { - "description": "min=917.208, mean=917.208, max=917.208, sum=1834.417 (2)", - "tab": "General information", - "score": 917.2083333333334 - }, - "High School Statistics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School US History - # eval": { - "description": "min=204, mean=204, max=204, sum=408 (2)", - "tab": "General information", - "score": 204.0 - }, - "High School US History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School US History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # prompt tokens": { - "description": "min=2544.324, mean=2544.324, max=2544.324, sum=5088.647 (2)", - "tab": "General information", - "score": 2544.323529411765 - }, - "High School US History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School World History - # eval": { - "description": "min=237, mean=237, max=237, sum=474 (2)", - "tab": "General information", - "score": 237.0 - }, - "High School World History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School World History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School World History - # prompt tokens": { - "description": "min=1647.219, mean=1647.219, max=1647.219, sum=3294.439 (2)", - "tab": "General information", - "score": 1647.2194092827003 - }, - "High School World History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" - } - } - }, - { - "evaluation_name": "Human Sexuality", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Human Sexuality", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.863, - "details": { - "description": "min=0.863, mean=0.863, max=0.863, sum=1.725 (2)", - "tab": "Accuracy", - "Human Aging - Observed inference time (s)": { - "description": "min=2.578, mean=2.578, max=2.578, sum=5.157 (2)", - "tab": "Efficiency", - "score": 2.5783249647628033 - }, - "Human Sexuality - Observed inference time (s)": { - "description": "min=2.963, mean=2.963, max=2.963, sum=5.925 (2)", - "tab": "Efficiency", - "score": 2.9625705234877024 - }, - "Human Aging - # eval": { - "description": "min=223, mean=223, max=223, sum=446 (2)", - "tab": "General information", - "score": 223.0 - }, - "Human Aging - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Aging - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Aging - # prompt tokens": { - "description": "min=370.26, mean=370.26, max=370.26, sum=740.52 (2)", - "tab": "General information", - "score": 370.26008968609864 - }, - "Human Aging - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Human Sexuality - # eval": { - "description": "min=131, mean=131, max=131, sum=262 (2)", - "tab": "General information", - "score": 131.0 - }, - "Human Sexuality - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Sexuality - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # prompt tokens": { - "description": "min=412.382, mean=412.382, max=412.382, sum=824.763 (2)", - "tab": "General information", - "score": 412.381679389313 - }, - "Human Sexuality - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" - } - } - }, - { - "evaluation_name": "International Law", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on International Law", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.934, - "details": { - "description": "min=0.934, mean=0.934, max=0.934, sum=1.868 (2)", - "tab": "Accuracy", - "International Law - Observed inference time (s)": { - "description": "min=5.179, mean=5.179, max=5.179, sum=10.357 (2)", - "tab": "Efficiency", - "score": 5.1785316802253405 - }, - "International Law - # eval": { - "description": "min=121, mean=121, max=121, sum=242 (2)", - "tab": "General information", - "score": 121.0 - }, - "International Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "International Law - 
truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "International Law - # prompt tokens": { - "description": "min=738.463, mean=738.463, max=738.463, sum=1476.926 (2)", - "tab": "General information", - "score": 738.4628099173553 - }, - "International Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" - } - } - }, - { - "evaluation_name": "Logical Fallacies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Logical Fallacies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.828, - "details": { - "description": "min=0.828, mean=0.828, max=0.828, sum=1.656 (2)", - "tab": "Accuracy", - "Logical Fallacies - Observed inference time (s)": { - "description": "min=3.522, mean=3.522, max=3.522, sum=7.045 (2)", - "tab": "Efficiency", - "score": 3.5224247461447686 - }, - "Logical Fallacies - # eval": { - "description": "min=163, mean=163, max=163, sum=326 (2)", - "tab": "General information", - "score": 163.0 - }, - "Logical Fallacies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Logical Fallacies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Logical Fallacies - # prompt tokens": { - "description": "min=511.755, mean=511.755, max=511.755, sum=1023.509 (2)", - "tab": "General information", - "score": 511.7546012269939 - }, - "Logical Fallacies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" - } - } - }, - { - "evaluation_name": "Machine Learning", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Machine Learning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.696, - "details": { - "description": "min=0.696, mean=0.696, max=0.696, sum=1.393 (2)", - "tab": "Accuracy", - "Machine Learning - Observed inference time (s)": { - "description": "min=5.118, mean=5.118, max=5.118, sum=10.237 (2)", - "tab": "Efficiency", - "score": 5.118442311882973 - }, - "Machine Learning - # eval": { - "description": "min=112, mean=112, max=112, sum=224 (2)", - "tab": "General information", - "score": 112.0 - }, - "Machine Learning - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Machine Learning - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 
0.0 - }, - "Machine Learning - # prompt tokens": { - "description": "min=739.402, mean=739.402, max=739.402, sum=1478.804 (2)", - "tab": "General information", - "score": 739.4017857142857 - }, - "Machine Learning - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" - } - } - }, - { - "evaluation_name": "Management", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Management", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.864, - "details": { - "description": "min=0.864, mean=0.864, max=0.864, sum=1.728 (2)", - "tab": "Accuracy", - "Management - Observed inference time (s)": { - "description": "min=2.27, mean=2.27, max=2.27, sum=4.539 (2)", - "tab": "Efficiency", - "score": 2.2697336812621183 - }, - "Management - # eval": { - "description": "min=103, mean=103, max=103, sum=206 (2)", - "tab": "General information", - "score": 103.0 - }, - "Management - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Management - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Management - # prompt tokens": { - "description": "min=324.777, mean=324.777, max=324.777, sum=649.553 (2)", - "tab": "General information", - "score": 324.77669902912623 - }, - "Management - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" - } - } - }, - { - "evaluation_name": "Marketing", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Marketing", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.919, - "details": { - "description": "min=0.919, mean=0.919, max=0.919, sum=1.838 (2)", - "tab": "Accuracy", - "Marketing - Observed inference time (s)": { - "description": "min=3.394, mean=3.394, max=3.394, sum=6.788 (2)", - "tab": "Efficiency", - "score": 3.3940892515019474 - }, - "Marketing - # eval": { - "description": "min=234, mean=234, max=234, sum=468 (2)", - "tab": "General information", - "score": 234.0 - }, - "Marketing - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Marketing - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Marketing - # prompt tokens": { - "description": "min=481.628, mean=481.628, max=481.628, sum=963.256 (2)", - "tab": "General information", - "score": 481.62820512820514 - }, - "Marketing - # output tokens": { - 
"description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" - } - } - }, - { - "evaluation_name": "Medical Genetics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Medical Genetics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.91, - "details": { - "description": "min=0.91, mean=0.91, max=0.91, sum=1.82 (2)", - "tab": "Accuracy", - "Medical Genetics - Observed inference time (s)": { - "description": "min=2.894, mean=2.894, max=2.894, sum=5.787 (2)", - "tab": "Efficiency", - "score": 2.893650698661804 - }, - "Medical Genetics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Medical Genetics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Medical Genetics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Medical Genetics - # prompt tokens": { - "description": "min=417.14, mean=417.14, max=417.14, sum=834.28 (2)", - "tab": "General information", - "score": 417.14 - }, - "Medical Genetics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" - } - } - }, - { - "evaluation_name": "Miscellaneous", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Miscellaneous", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.894, - "details": { - "description": "min=0.894, mean=0.894, max=0.894, sum=1.788 (2)", - "tab": "Accuracy", - "Miscellaneous - Observed inference time (s)": { - "description": "min=2.025, mean=2.025, max=2.025, sum=4.05 (2)", - "tab": "Efficiency", - "score": 2.0249771478075633 - }, - "Miscellaneous - # eval": { - "description": "min=783, mean=783, max=783, sum=1566 (2)", - "tab": "General information", - "score": 783.0 - }, - "Miscellaneous - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Miscellaneous - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Miscellaneous - # prompt tokens": { - "description": "min=354.913, mean=354.913, max=354.913, sum=709.826 (2)", - "tab": "General information", - "score": 354.9131545338442 - }, - "Miscellaneous - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": 
"miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" - } - } - }, - { - "evaluation_name": "Moral Scenarios", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Moral Scenarios", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.639, - "details": { - "description": "min=0.639, mean=0.639, max=0.639, sum=1.278 (2)", - "tab": "Accuracy", - "Moral Disputes - Observed inference time (s)": { - "description": "min=3.877, mean=3.877, max=3.877, sum=7.754 (2)", - "tab": "Efficiency", - "score": 3.877226921175257 - }, - "Moral Scenarios - Observed inference time (s)": { - "description": "min=5.17, mean=5.17, max=5.17, sum=10.34 (2)", - "tab": "Efficiency", - "score": 5.170224364509796 - }, - "Moral Disputes - # eval": { - "description": "min=346, mean=346, max=346, sum=692 (2)", - "tab": "General information", - "score": 346.0 - }, - "Moral Disputes - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Disputes - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Disputes - # prompt tokens": { - "description": "min=551.506, mean=551.506, max=551.506, sum=1103.012 (2)", - "tab": "General information", - "score": 551.5057803468208 - }, - "Moral Disputes - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Moral Scenarios - # eval": { - "description": "min=895, mean=895, max=895, sum=1790 (2)", - "tab": "General information", - "score": 895.0 - }, - "Moral Scenarios - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Scenarios - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # prompt tokens": { - "description": "min=765.479, mean=765.479, max=765.479, sum=1530.959 (2)", - "tab": "General information", - "score": 765.4793296089385 - }, - "Moral Scenarios - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" - } - } - }, - { - "evaluation_name": "Nutrition", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Nutrition", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.837, - "details": { - "description": "min=0.837, mean=0.837, max=0.837, sum=1.673 (2)", - "tab": "Accuracy", - "Nutrition - Observed inference time (s)": { - "description": "min=4.962, mean=4.962, max=4.962, sum=9.923 (2)", - "tab": "Efficiency", - "score": 4.961673566718507 - }, - "Nutrition - # eval": { - "description": "min=306, mean=306, 
max=306, sum=612 (2)", - "tab": "General information", - "score": 306.0 - }, - "Nutrition - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Nutrition - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Nutrition - # prompt tokens": { - "description": "min=704.922, mean=704.922, max=704.922, sum=1409.843 (2)", - "tab": "General information", - "score": 704.9215686274509 - }, - "Nutrition - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" - } - } - }, - { - "evaluation_name": "Prehistory", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Prehistory", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.867, - "details": { - "description": "min=0.867, mean=0.867, max=0.867, sum=1.735 (2)", - "tab": "Accuracy", - "Prehistory - Observed inference time (s)": { - "description": "min=4.39, mean=4.39, max=4.39, sum=8.779 (2)", - "tab": "Efficiency", - "score": 4.389729757367829 - }, - "Prehistory - # eval": { - "description": "min=324, mean=324, max=324, sum=648 (2)", - "tab": "General information", - "score": 324.0 - }, - "Prehistory - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Prehistory - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Prehistory - # prompt tokens": { - "description": "min=628.185, mean=628.185, max=628.185, sum=1256.37 (2)", - "tab": "General information", - "score": 628.1851851851852 - }, - "Prehistory - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" - } - } - }, - { - "evaluation_name": "Public Relations", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Public Relations", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.755, - "details": { - "description": "min=0.755, mean=0.755, max=0.755, sum=1.509 (2)", - "tab": "Accuracy", - "Public Relations - Observed inference time (s)": { - "description": "min=3.474, mean=3.474, max=3.474, sum=6.948 (2)", - "tab": "Efficiency", - "score": 3.4741735740141437 - }, - "Public Relations - # eval": { - "description": "min=110, mean=110, max=110, sum=220 (2)", - "tab": "General information", - "score": 110.0 - }, - "Public Relations - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Public 
Relations - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Public Relations - # prompt tokens": { - "description": "min=483.827, mean=483.827, max=483.827, sum=967.655 (2)", - "tab": "General information", - "score": 483.8272727272727 - }, - "Public Relations - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" - } - } - }, - { - "evaluation_name": "Security Studies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Security Studies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.829, - "details": { - "description": "min=0.829, mean=0.829, max=0.829, sum=1.657 (2)", - "tab": "Accuracy", - "Security Studies - Observed inference time (s)": { - "description": "min=9.808, mean=9.808, max=9.808, sum=19.616 (2)", - "tab": "Efficiency", - "score": 9.807938383063492 - }, - "Security Studies - # eval": { - "description": "min=245, mean=245, max=245, sum=490 (2)", - "tab": "General information", - "score": 245.0 - }, - "Security Studies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Security Studies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Security Studies - # prompt tokens": { - "description": "min=1386.531, mean=1386.531, max=1386.531, sum=2773.061 (2)", - "tab": "General information", - "score": 1386.530612244898 - }, - "Security Studies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" - } - } - }, - { - "evaluation_name": "Sociology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Sociology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.891, - "details": { - "description": "min=0.891, mean=0.891, max=0.891, sum=1.781 (2)", - "tab": "Accuracy", - "Sociology - Observed inference time (s)": { - "description": "min=3.643, mean=3.643, max=3.643, sum=7.285 (2)", - "tab": "Efficiency", - "score": 3.642500052997722 - }, - "Sociology - # eval": { - "description": "min=201, mean=201, max=201, sum=402 (2)", - "tab": "General information", - "score": 201.0 - }, - "Sociology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Sociology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Sociology - # prompt tokens": { 
- "description": "min=517.478, mean=517.478, max=517.478, sum=1034.955 (2)", - "tab": "General information", - "score": 517.4776119402985 - }, - "Sociology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" - } - } - }, - { - "evaluation_name": "Virology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Virology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.554, - "details": { - "description": "min=0.554, mean=0.554, max=0.554, sum=1.108 (2)", - "tab": "Accuracy", - "Virology - Observed inference time (s)": { - "description": "min=2.911, mean=2.911, max=2.911, sum=5.822 (2)", - "tab": "Efficiency", - "score": 2.910837286926178 - }, - "Virology - # eval": { - "description": "min=166, mean=166, max=166, sum=332 (2)", - "tab": "General information", - "score": 166.0 - }, - "Virology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Virology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Virology - # prompt tokens": { - "description": "min=414.108, mean=414.108, max=414.108, sum=828.217 (2)", - "tab": "General information", - "score": 414.10843373493975 - }, - "Virology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" - } - } - }, - { - "evaluation_name": "World Religions", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on World Religions", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.865, - "details": { - "description": "min=0.865, mean=0.865, max=0.865, sum=1.731 (2)", - "tab": "Accuracy", - "World Religions - Observed inference time (s)": { - "description": "min=2.097, mean=2.097, max=2.097, sum=4.194 (2)", - "tab": "Efficiency", - "score": 2.0972191897052075 - }, - "World Religions - # eval": { - "description": "min=171, mean=171, max=171, sum=342 (2)", - "tab": "General information", - "score": 171.0 - }, - "World Religions - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "World Religions - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "World Religions - # prompt tokens": { - "description": "min=313.474, mean=313.474, max=313.474, sum=626.947 (2)", - "tab": "General information", - "score": 313.4736842105263 - }, - "World Religions - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 
(2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" - } - } - }, - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.015, - "details": { - "tab": "Efficiency" - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_mmlu/microsoft/phi-3-small-8k-instruct/d2bf70ce-341f-49d7-bd03-87b523826953.json b/data/helm_mmlu/microsoft/phi-3-small-8k-instruct/d2bf70ce-341f-49d7-bd03-87b523826953.json deleted file mode 100644 index bbe3afca0..000000000 --- a/data/helm_mmlu/microsoft/phi-3-small-8k-instruct/d2bf70ce-341f-49d7-bd03-87b523826953.json +++ /dev/null @@ -1,3021 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/microsoft_phi-3-small-8k-instruct/1770835937.459157", - "retrieved_timestamp": "1770835937.459157", - "source_metadata": { - "source_name": "helm_mmlu", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Phi-3 7B", - "id": "microsoft/phi-3-small-8k-instruct", - "developer": "microsoft", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "MMLU All Subjects", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU All Subjects", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.757, - "details": { - "description": "min=0.44, mean=0.757, max=0.969, sum=86.273 (114)", - "tab": "Accuracy", - "MMLU All Subjects - Observed inference time (s)": { - "description": "min=0.226, mean=0.38, max=1.284, sum=43.298 (114)", - "tab": "Efficiency", - "score": 0.379805443442311 - }, - "MMLU All Subjects - # eval": { - "description": "min=100, mean=246.351, max=1534, sum=28084 (114)", - "tab": "General information", - "score": 246.35087719298247 - }, - "MMLU All Subjects - # train": { - "description": "min=5, mean=5, max=5, sum=570 (114)", - "tab": "General information", - "score": 5.0 - }, - "MMLU All Subjects - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (114)", - "tab": "General information", - "score": 0.0 - }, - "MMLU All Subjects - # prompt tokens": { - "description": "min=275.561, mean=614.852, max=2798.073, sum=70093.086 (114)", - "tab": "General information", - "score": 614.851634217556 - }, - "MMLU All Subjects - # output tokens": { - "description": "min=1, mean=1, max=1, sum=114 (114)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - 
"college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] - } - } - }, - { - "evaluation_name": "Abstract Algebra", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Abstract Algebra", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.44, - "details": { - "description": "min=0.44, mean=0.44, max=0.44, sum=0.88 (2)", - "tab": "Accuracy", - "Abstract Algebra - Observed inference time (s)": { - "description": 
"min=0.505, mean=0.505, max=0.505, sum=1.009 (2)", - "tab": "Efficiency", - "score": 0.5047230005264283 - }, - "Abstract Algebra - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Abstract Algebra - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Abstract Algebra - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Abstract Algebra - # prompt tokens": { - "description": "min=373.44, mean=373.44, max=373.44, sum=746.88 (2)", - "tab": "General information", - "score": 373.44 - }, - "Abstract Algebra - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" - } - } - }, - { - "evaluation_name": "Anatomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Anatomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.726, - "details": { - "description": "min=0.726, mean=0.726, max=0.726, sum=1.452 (2)", - "tab": "Accuracy", - "Anatomy - Observed inference time (s)": { - "description": "min=0.412, mean=0.412, max=0.412, sum=0.825 (2)", - "tab": "Efficiency", - "score": 0.4122970881285491 - }, - "Anatomy - # eval": { - "description": "min=135, mean=135, max=135, sum=270 (2)", - "tab": "General information", - "score": 135.0 - }, - "Anatomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Anatomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Anatomy - # prompt tokens": { - "description": "min=353.978, mean=353.978, max=353.978, sum=707.956 (2)", - "tab": "General information", - "score": 353.97777777777776 - }, - "Anatomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" - } - } - }, - { - "evaluation_name": "College Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on College Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.559, - "details": { - "description": "min=0.559, mean=0.559, max=0.559, sum=1.118 (2)", - "tab": "Accuracy", - "College Chemistry - Observed inference time (s)": { - "description": "min=0.341, mean=0.341, max=0.341, sum=0.683 (2)", - "tab": "Efficiency", - "score": 0.3414782953262329 - }, - "College Biology - Observed inference time (s)": { - "description": "min=0.3, mean=0.3, max=0.3, sum=0.6 (2)", - "tab": 
"Efficiency", - "score": 0.3002290378014247 - }, - "College Computer Science - Observed inference time (s)": { - "description": "min=0.447, mean=0.447, max=0.447, sum=0.894 (2)", - "tab": "Efficiency", - "score": 0.4468130707740784 - }, - "College Mathematics - Observed inference time (s)": { - "description": "min=0.351, mean=0.351, max=0.351, sum=0.703 (2)", - "tab": "Efficiency", - "score": 0.35149253606796266 - }, - "College Medicine - Observed inference time (s)": { - "description": "min=0.323, mean=0.323, max=0.323, sum=0.646 (2)", - "tab": "Efficiency", - "score": 0.32299859399740405 - }, - "College Physics - Observed inference time (s)": { - "description": "min=0.322, mean=0.322, max=0.322, sum=0.644 (2)", - "tab": "Efficiency", - "score": 0.32188768246594596 - }, - "College Chemistry - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Chemistry - # prompt tokens": { - "description": "min=549.4, mean=549.4, max=549.4, sum=1098.8 (2)", - "tab": "General information", - "score": 549.4 - }, - "College Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Biology - # eval": { - "description": "min=144, mean=144, max=144, sum=288 (2)", - "tab": "General information", - "score": 144.0 - }, - "College Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # prompt tokens": { - "description": "min=473.917, mean=473.917, max=473.917, sum=947.833 (2)", - "tab": "General information", - "score": 473.9166666666667 - }, - "College Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer Science - # prompt tokens": { - "description": "min=828.39, mean=828.39, max=828.39, sum=1656.78 (2)", - "tab": "General information", - "score": 828.39 - }, - "College Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Mathematics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # prompt tokens": { - "description": 
"min=594.52, mean=594.52, max=594.52, sum=1189.04 (2)", - "tab": "General information", - "score": 594.52 - }, - "College Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Medicine - # eval": { - "description": "min=173, mean=173, max=173, sum=346 (2)", - "tab": "General information", - "score": 173.0 - }, - "College Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Medicine - # prompt tokens": { - "description": "min=502.728, mean=502.728, max=502.728, sum=1005.457 (2)", - "tab": "General information", - "score": 502.728323699422 - }, - "College Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Physics - # eval": { - "description": "min=102, mean=102, max=102, sum=204 (2)", - "tab": "General information", - "score": 102.0 - }, - "College Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Physics - # prompt tokens": { - "description": "min=503.608, mean=503.608, max=503.608, sum=1007.216 (2)", - "tab": "General information", - "score": 503.6078431372549 - }, - "College Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" - } - } - }, - { - "evaluation_name": "Computer Security", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Computer Security", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.77, - "details": { - "description": "min=0.77, mean=0.77, max=0.77, sum=1.54 (2)", - "tab": "Accuracy", - "Computer Security - Observed inference time (s)": { - "description": "min=0.275, mean=0.275, max=0.275, sum=0.55 (2)", - "tab": "Efficiency", - "score": 0.2747947096824646 - }, - "Computer Security - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Computer Security - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Computer Security - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Computer Security - # prompt tokens": { - "description": "min=378.54, mean=378.54, max=378.54, sum=757.08 (2)", - "tab": "General information", - "score": 378.54 - }, - "Computer Security - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": 
"computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" - } - } - }, - { - "evaluation_name": "Econometrics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Econometrics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.596, - "details": { - "description": "min=0.596, mean=0.596, max=0.596, sum=1.193 (2)", - "tab": "Accuracy", - "Econometrics - Observed inference time (s)": { - "description": "min=0.362, mean=0.362, max=0.362, sum=0.724 (2)", - "tab": "Efficiency", - "score": 0.36201402178981845 - }, - "Econometrics - # eval": { - "description": "min=114, mean=114, max=114, sum=228 (2)", - "tab": "General information", - "score": 114.0 - }, - "Econometrics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Econometrics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Econometrics - # prompt tokens": { - "description": "min=614.43, mean=614.43, max=614.43, sum=1228.86 (2)", - "tab": "General information", - "score": 614.4298245614035 - }, - "Econometrics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" - } - } - }, - { - "evaluation_name": "Global Facts", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Global Facts", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.52, - "details": { - "description": "min=0.52, mean=0.52, max=0.52, sum=1.04 (2)", - "tab": "Accuracy", - "Global Facts - Observed inference time (s)": { - "description": "min=0.509, mean=0.509, max=0.509, sum=1.018 (2)", - "tab": "Efficiency", - "score": 0.5091006135940552 - }, - "Global Facts - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Global Facts - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Global Facts - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Global Facts - # prompt tokens": { - "description": "min=399.71, mean=399.71, max=399.71, sum=799.42 (2)", - "tab": "General information", - "score": 399.71 - }, - "Global Facts - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" - } - } - }, - { - "evaluation_name": "Jurisprudence", - "source_data": { - "dataset_name": "helm_mmlu", - 
"source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Jurisprudence", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.843, - "details": { - "description": "min=0.843, mean=0.843, max=0.843, sum=1.685 (2)", - "tab": "Accuracy", - "Jurisprudence - Observed inference time (s)": { - "description": "min=0.269, mean=0.269, max=0.269, sum=0.538 (2)", - "tab": "Efficiency", - "score": 0.2687692134468644 - }, - "Jurisprudence - # eval": { - "description": "min=108, mean=108, max=108, sum=216 (2)", - "tab": "General information", - "score": 108.0 - }, - "Jurisprudence - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Jurisprudence - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Jurisprudence - # prompt tokens": { - "description": "min=394.639, mean=394.639, max=394.639, sum=789.278 (2)", - "tab": "General information", - "score": 394.6388888888889 - }, - "Jurisprudence - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" - } - } - }, - { - "evaluation_name": "Philosophy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Philosophy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.82, - "details": { - "description": "min=0.82, mean=0.82, max=0.82, sum=1.64 (2)", - "tab": "Accuracy", - "Philosophy - Observed inference time (s)": { - "description": "min=0.263, mean=0.263, max=0.263, sum=0.527 (2)", - "tab": "Efficiency", - "score": 0.26347158346145483 - }, - "Philosophy - # eval": { - "description": "min=311, mean=311, max=311, sum=622 (2)", - "tab": "General information", - "score": 311.0 - }, - "Philosophy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Philosophy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Philosophy - # prompt tokens": { - "description": "min=329.084, mean=329.084, max=329.084, sum=658.167 (2)", - "tab": "General information", - "score": 329.08360128617363 - }, - "Philosophy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" - } - } - }, - { - "evaluation_name": "Professional Psychology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": 
"EM on Professional Psychology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.835, - "details": { - "description": "min=0.835, mean=0.835, max=0.835, sum=1.67 (2)", - "tab": "Accuracy", - "Professional Medicine - Observed inference time (s)": { - "description": "min=0.536, mean=0.536, max=0.536, sum=1.073 (2)", - "tab": "Efficiency", - "score": 0.5363782968591241 - }, - "Professional Accounting - Observed inference time (s)": { - "description": "min=0.373, mean=0.373, max=0.373, sum=0.746 (2)", - "tab": "Efficiency", - "score": 0.37297873885919014 - }, - "Professional Law - Observed inference time (s)": { - "description": "min=0.764, mean=0.764, max=0.764, sum=1.527 (2)", - "tab": "Efficiency", - "score": 0.7635687488620564 - }, - "Professional Psychology - Observed inference time (s)": { - "description": "min=0.332, mean=0.332, max=0.332, sum=0.664 (2)", - "tab": "Efficiency", - "score": 0.3322232922697379 - }, - "Professional Medicine - # eval": { - "description": "min=272, mean=272, max=272, sum=544 (2)", - "tab": "General information", - "score": 272.0 - }, - "Professional Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Medicine - # prompt tokens": { - "description": "min=1094.585, mean=1094.585, max=1094.585, sum=2189.169 (2)", - "tab": "General information", - "score": 1094.5845588235295 - }, - "Professional Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Accounting - # eval": { - "description": "min=282, mean=282, max=282, sum=564 (2)", - "tab": "General information", - "score": 282.0 - }, - "Professional Accounting - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Accounting - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Accounting - # prompt tokens": { - "description": "min=658.592, mean=658.592, max=658.592, sum=1317.184 (2)", - "tab": "General information", - "score": 658.5921985815603 - }, - "Professional Accounting - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Law - # eval": { - "description": "min=1534, mean=1534, max=1534, sum=3068 (2)", - "tab": "General information", - "score": 1534.0 - }, - "Professional Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # prompt tokens": { - "description": "min=1637.787, mean=1637.787, max=1637.787, sum=3275.574 (2)", - "tab": "General information", - "score": 1637.7868318122555 - }, - "Professional Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Psychology - # eval": { - "description": "min=612, mean=612, max=612, sum=1224 (2)", - "tab": "General information", - "score": 612.0 - }, - "Professional Psychology - # train": 
{ - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # prompt tokens": { - "description": "min=575.114, mean=575.114, max=575.114, sum=1150.229 (2)", - "tab": "General information", - "score": 575.1143790849674 - }, - "Professional Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" - } - } - }, - { - "evaluation_name": "Us Foreign Policy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Us Foreign Policy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.95, - "details": { - "description": "min=0.95, mean=0.95, max=0.95, sum=1.9 (2)", - "tab": "Accuracy", - "Us Foreign Policy - Observed inference time (s)": { - "description": "min=0.549, mean=0.549, max=0.549, sum=1.098 (2)", - "tab": "Efficiency", - "score": 0.5491553211212158 - }, - "Us Foreign Policy - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Us Foreign Policy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Us Foreign Policy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Us Foreign Policy - # prompt tokens": { - "description": "min=422.79, mean=422.79, max=422.79, sum=845.58 (2)", - "tab": "General information", - "score": 422.79 - }, - "Us Foreign Policy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" - } - } - }, - { - "evaluation_name": "Astronomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Astronomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.849, - "details": { - "description": "min=0.849, mean=0.849, max=0.849, sum=1.697 (2)", - "tab": "Accuracy", - "Astronomy - Observed inference time (s)": { - "description": "min=0.352, mean=0.352, max=0.352, sum=0.704 (2)", - "tab": "Efficiency", - "score": 0.35213252902030945 - }, - "Astronomy - # eval": { - "description": "min=152, mean=152, max=152, sum=304 (2)", - "tab": "General information", - "score": 152.0 - }, - "Astronomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Astronomy - truncated": { 
- "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Astronomy - # prompt tokens": { - "description": "min=579.691, mean=579.691, max=579.691, sum=1159.382 (2)", - "tab": "General information", - "score": 579.6907894736842 - }, - "Astronomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" - } - } - }, - { - "evaluation_name": "Business Ethics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Business Ethics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.77, - "details": { - "description": "min=0.77, mean=0.77, max=0.77, sum=1.54 (2)", - "tab": "Accuracy", - "Business Ethics - Observed inference time (s)": { - "description": "min=0.347, mean=0.347, max=0.347, sum=0.693 (2)", - "tab": "Efficiency", - "score": 0.34657839775085447 - }, - "Business Ethics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Business Ethics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Business Ethics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Business Ethics - # prompt tokens": { - "description": "min=569.52, mean=569.52, max=569.52, sum=1139.04 (2)", - "tab": "General information", - "score": 569.52 - }, - "Business Ethics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" - } - } - }, - { - "evaluation_name": "Clinical Knowledge", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Clinical Knowledge", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.83, - "details": { - "description": "min=0.83, mean=0.83, max=0.83, sum=1.66 (2)", - "tab": "Accuracy", - "Clinical Knowledge - Observed inference time (s)": { - "description": "min=0.286, mean=0.286, max=0.286, sum=0.572 (2)", - "tab": "Efficiency", - "score": 0.2858500345697943 - }, - "Clinical Knowledge - # eval": { - "description": "min=265, mean=265, max=265, sum=530 (2)", - "tab": "General information", - "score": 265.0 - }, - "Clinical Knowledge - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Clinical Knowledge - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Clinical Knowledge - # prompt tokens": { - "description": 
"min=397.947, mean=397.947, max=397.947, sum=795.894 (2)", - "tab": "General information", - "score": 397.94716981132075 - }, - "Clinical Knowledge - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" - } - } - }, - { - "evaluation_name": "Conceptual Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Conceptual Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.779, - "details": { - "description": "min=0.779, mean=0.779, max=0.779, sum=1.557 (2)", - "tab": "Accuracy", - "Conceptual Physics - Observed inference time (s)": { - "description": "min=0.254, mean=0.254, max=0.254, sum=0.507 (2)", - "tab": "Efficiency", - "score": 0.2537446346688778 - }, - "Conceptual Physics - # eval": { - "description": "min=235, mean=235, max=235, sum=470 (2)", - "tab": "General information", - "score": 235.0 - }, - "Conceptual Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Conceptual Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Conceptual Physics - # prompt tokens": { - "description": "min=304.838, mean=304.838, max=304.838, sum=609.677 (2)", - "tab": "General information", - "score": 304.83829787234043 - }, - "Conceptual Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" - } - } - }, - { - "evaluation_name": "Electrical Engineering", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Electrical Engineering", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.69, - "details": { - "description": "min=0.69, mean=0.69, max=0.69, sum=1.379 (2)", - "tab": "Accuracy", - "Electrical Engineering - Observed inference time (s)": { - "description": "min=0.301, mean=0.301, max=0.301, sum=0.602 (2)", - "tab": "Efficiency", - "score": 0.3010375532610663 - }, - "Electrical Engineering - # eval": { - "description": "min=145, mean=145, max=145, sum=290 (2)", - "tab": "General information", - "score": 145.0 - }, - "Electrical Engineering - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Electrical Engineering - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Electrical Engineering - # prompt tokens": { - "description": "min=440.641, mean=440.641, max=440.641, sum=881.283 (2)", - 
"tab": "General information", - "score": 440.6413793103448 - }, - "Electrical Engineering - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" - } - } - }, - { - "evaluation_name": "Elementary Mathematics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Elementary Mathematics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.619, - "details": { - "description": "min=0.619, mean=0.619, max=0.619, sum=1.238 (2)", - "tab": "Accuracy", - "Elementary Mathematics - Observed inference time (s)": { - "description": "min=0.338, mean=0.338, max=0.338, sum=0.676 (2)", - "tab": "Efficiency", - "score": 0.3380681862906804 - }, - "Elementary Mathematics - # eval": { - "description": "min=378, mean=378, max=378, sum=756 (2)", - "tab": "General information", - "score": 378.0 - }, - "Elementary Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Elementary Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Elementary Mathematics - # prompt tokens": { - "description": "min=531.862, mean=531.862, max=531.862, sum=1063.725 (2)", - "tab": "General information", - "score": 531.8624338624338 - }, - "Elementary Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" - } - } - }, - { - "evaluation_name": "Formal Logic", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Formal Logic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.595, - "details": { - "description": "min=0.595, mean=0.595, max=0.595, sum=1.19 (2)", - "tab": "Accuracy", - "Formal Logic - Observed inference time (s)": { - "description": "min=0.358, mean=0.358, max=0.358, sum=0.716 (2)", - "tab": "Efficiency", - "score": 0.35805845071399023 - }, - "Formal Logic - # eval": { - "description": "min=126, mean=126, max=126, sum=252 (2)", - "tab": "General information", - "score": 126.0 - }, - "Formal Logic - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Formal Logic - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Formal Logic - # prompt tokens": { - "description": "min=606.762, mean=606.762, max=606.762, sum=1213.524 (2)", - "tab": "General information", - "score": 606.7619047619048 - }, - "Formal 
Logic - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" - } - } - }, - { - "evaluation_name": "High School World History", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on High School World History", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.848, - "details": { - "description": "min=0.848, mean=0.848, max=0.848, sum=1.696 (2)", - "tab": "Accuracy", - "High School Biology - Observed inference time (s)": { - "description": "min=0.327, mean=0.327, max=0.327, sum=0.655 (2)", - "tab": "Efficiency", - "score": 0.32748886615999284 - }, - "High School Chemistry - Observed inference time (s)": { - "description": "min=0.311, mean=0.311, max=0.311, sum=0.622 (2)", - "tab": "Efficiency", - "score": 0.31104220545350625 - }, - "High School Computer Science - Observed inference time (s)": { - "description": "min=0.465, mean=0.465, max=0.465, sum=0.93 (2)", - "tab": "Efficiency", - "score": 0.4648329520225525 - }, - "High School European History - Observed inference time (s)": { - "description": "min=1.284, mean=1.284, max=1.284, sum=2.569 (2)", - "tab": "Efficiency", - "score": 1.2842581590016684 - }, - "High School Geography - Observed inference time (s)": { - "description": "min=0.272, mean=0.272, max=0.272, sum=0.544 (2)", - "tab": "Efficiency", - "score": 0.27224273031408136 - }, - "High School Government And Politics - Observed inference time (s)": { - "description": "min=0.299, mean=0.299, max=0.299, sum=0.598 (2)", - "tab": "Efficiency", - "score": 0.2989391489967781 - }, - "High School Macroeconomics - Observed inference time (s)": { - "description": "min=0.273, mean=0.273, max=0.273, sum=0.546 (2)", - "tab": "Efficiency", - "score": 0.2728824230340811 - }, - "High School Mathematics - Observed inference time (s)": { - "description": "min=0.339, mean=0.339, max=0.339, sum=0.679 (2)", - "tab": "Efficiency", - "score": 0.33938890828026663 - }, - "High School Microeconomics - Observed inference time (s)": { - "description": "min=0.285, mean=0.285, max=0.285, sum=0.57 (2)", - "tab": "Efficiency", - "score": 0.28512202290927663 - }, - "High School Physics - Observed inference time (s)": { - "description": "min=0.35, mean=0.35, max=0.35, sum=0.7 (2)", - "tab": "Efficiency", - "score": 0.34992847537362815 - }, - "High School Psychology - Observed inference time (s)": { - "description": "min=0.316, mean=0.316, max=0.316, sum=0.633 (2)", - "tab": "Efficiency", - "score": 0.31643713986108063 - }, - "High School Statistics - Observed inference time (s)": { - "description": "min=0.439, mean=0.439, max=0.439, sum=0.878 (2)", - "tab": "Efficiency", - "score": 0.43886349929703605 - }, - "High School US History - Observed inference time (s)": { - "description": "min=1.004, mean=1.004, max=1.004, sum=2.009 (2)", - "tab": "Efficiency", - "score": 1.0044469611317504 - }, - "High School World History - Observed inference time (s)": { - "description": "min=0.677, mean=0.677, max=0.677, sum=1.354 (2)", - "tab": "Efficiency", - "score": 0.6767715281072045 - }, 
- "High School Biology - # eval": { - "description": "min=310, mean=310, max=310, sum=620 (2)", - "tab": "General information", - "score": 310.0 - }, - "High School Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Biology - # prompt tokens": { - "description": "min=513.677, mean=513.677, max=513.677, sum=1027.355 (2)", - "tab": "General information", - "score": 513.6774193548387 - }, - "High School Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Chemistry - # eval": { - "description": "min=203, mean=203, max=203, sum=406 (2)", - "tab": "General information", - "score": 203.0 - }, - "High School Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # prompt tokens": { - "description": "min=496.714, mean=496.714, max=496.714, sum=993.429 (2)", - "tab": "General information", - "score": 496.7142857142857 - }, - "High School Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "High School Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # prompt tokens": { - "description": "min=867.78, mean=867.78, max=867.78, sum=1735.56 (2)", - "tab": "General information", - "score": 867.78 - }, - "High School Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School European History - # eval": { - "description": "min=165, mean=165, max=165, sum=330 (2)", - "tab": "General information", - "score": 165.0 - }, - "High School European History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School European History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School European History - # prompt tokens": { - "description": "min=2798.073, mean=2798.073, max=2798.073, sum=5596.145 (2)", - "tab": "General information", - "score": 2798.072727272727 - }, - "High School European History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Geography - # eval": { - "description": "min=198, mean=198, max=198, sum=396 (2)", - "tab": "General information", - "score": 198.0 - }, - "High School Geography - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Geography - truncated": { - "description": "min=0, mean=0, max=0, 
sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Geography - # prompt tokens": { - "description": "min=372.045, mean=372.045, max=372.045, sum=744.091 (2)", - "tab": "General information", - "score": 372.04545454545456 - }, - "High School Geography - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Government And Politics - # eval": { - "description": "min=193, mean=193, max=193, sum=386 (2)", - "tab": "General information", - "score": 193.0 - }, - "High School Government And Politics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Government And Politics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # prompt tokens": { - "description": "min=465.824, mean=465.824, max=465.824, sum=931.648 (2)", - "tab": "General information", - "score": 465.8238341968912 - }, - "High School Government And Politics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Macroeconomics - # eval": { - "description": "min=390, mean=390, max=390, sum=780 (2)", - "tab": "General information", - "score": 390.0 - }, - "High School Macroeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Macroeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - # prompt tokens": { - "description": "min=371.562, mean=371.562, max=371.562, sum=743.123 (2)", - "tab": "General information", - "score": 371.5615384615385 - }, - "High School Macroeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Mathematics - # eval": { - "description": "min=270, mean=270, max=270, sum=540 (2)", - "tab": "General information", - "score": 270.0 - }, - "High School Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # prompt tokens": { - "description": "min=532.374, mean=532.374, max=532.374, sum=1064.748 (2)", - "tab": "General information", - "score": 532.3740740740741 - }, - "High School Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Microeconomics - # eval": { - "description": "min=238, mean=238, max=238, sum=476 (2)", - "tab": "General information", - "score": 238.0 - }, - "High School Microeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Microeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Microeconomics - # prompt tokens": { - "description": "min=399.025, mean=399.025, max=399.025, sum=798.05 (2)", - "tab": "General information", - "score": 399.02521008403363 - }, - "High School Microeconomics - # output tokens": 
{ - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Physics - # eval": { - "description": "min=151, mean=151, max=151, sum=302 (2)", - "tab": "General information", - "score": 151.0 - }, - "High School Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # prompt tokens": { - "description": "min=560.464, mean=560.464, max=560.464, sum=1120.927 (2)", - "tab": "General information", - "score": 560.4635761589404 - }, - "High School Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Psychology - # eval": { - "description": "min=545, mean=545, max=545, sum=1090 (2)", - "tab": "General information", - "score": 545.0 - }, - "High School Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # prompt tokens": { - "description": "min=495.246, mean=495.246, max=495.246, sum=990.492 (2)", - "tab": "General information", - "score": 495.24587155963303 - }, - "High School Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Statistics - # eval": { - "description": "min=216, mean=216, max=216, sum=432 (2)", - "tab": "General information", - "score": 216.0 - }, - "High School Statistics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Statistics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Statistics - # prompt tokens": { - "description": "min=795.699, mean=795.699, max=795.699, sum=1591.398 (2)", - "tab": "General information", - "score": 795.699074074074 - }, - "High School Statistics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School US History - # eval": { - "description": "min=204, mean=204, max=204, sum=408 (2)", - "tab": "General information", - "score": 204.0 - }, - "High School US History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School US History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # prompt tokens": { - "description": "min=2217.809, mean=2217.809, max=2217.809, sum=4435.618 (2)", - "tab": "General information", - "score": 2217.8088235294117 - }, - "High School US History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School World History - # eval": { - "description": "min=237, mean=237, max=237, sum=474 (2)", - "tab": "General information", - "score": 237.0 - }, - "High School World History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High 
School World History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School World History - # prompt tokens": { - "description": "min=1428.27, mean=1428.27, max=1428.27, sum=2856.54 (2)", - "tab": "General information", - "score": 1428.2700421940929 - }, - "High School World History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" - } - } - }, - { - "evaluation_name": "Human Sexuality", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Human Sexuality", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.817, - "details": { - "description": "min=0.817, mean=0.817, max=0.817, sum=1.634 (2)", - "tab": "Accuracy", - "Human Aging - Observed inference time (s)": { - "description": "min=0.258, mean=0.258, max=0.258, sum=0.515 (2)", - "tab": "Efficiency", - "score": 0.2577151257895568 - }, - "Human Sexuality - Observed inference time (s)": { - "description": "min=0.264, mean=0.264, max=0.264, sum=0.529 (2)", - "tab": "Efficiency", - "score": 0.26447626470609475 - }, - "Human Aging - # eval": { - "description": "min=223, mean=223, max=223, sum=446 (2)", - "tab": "General information", - "score": 223.0 - }, - "Human Aging - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Aging - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Aging - # prompt tokens": { - "description": "min=319.906, mean=319.906, max=319.906, sum=639.812 (2)", - "tab": "General information", - "score": 319.90582959641256 - }, - "Human Aging - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Human Sexuality - # eval": { - "description": "min=131, mean=131, max=131, sum=262 (2)", - "tab": "General information", - "score": 131.0 - }, - "Human Sexuality - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Sexuality - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # prompt tokens": { - "description": "min=341.183, mean=341.183, max=341.183, sum=682.366 (2)", - "tab": "General information", - "score": 341.1832061068702 - }, - "Human Sexuality - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" - } - } - }, - { - "evaluation_name": "International Law", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on International Law", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.851, - "details": { - "description": "min=0.851, mean=0.851, max=0.851, sum=1.702 (2)", - "tab": "Accuracy", - "International Law - Observed inference time (s)": { - "description": "min=0.371, mean=0.371, max=0.371, sum=0.743 (2)", - "tab": "Efficiency", - "score": 0.3714516399320492 - }, - "International Law - # eval": { - "description": "min=121, mean=121, max=121, sum=242 (2)", - "tab": "General information", - "score": 121.0 - }, - "International Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "International Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "International Law - # prompt tokens": { - "description": "min=639.851, mean=639.851, max=639.851, sum=1279.702 (2)", - "tab": "General information", - "score": 639.8512396694215 - }, - "International Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" - } - } - }, - { - "evaluation_name": "Logical Fallacies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Logical Fallacies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.81, - "details": { - "description": "min=0.81, mean=0.81, max=0.81, sum=1.62 (2)", - "tab": "Accuracy", - "Logical Fallacies - Observed inference time (s)": { - "description": "min=0.304, mean=0.304, max=0.304, sum=0.608 (2)", - "tab": "Efficiency", - "score": 0.30408222543681324 - }, - "Logical Fallacies - # eval": { - "description": "min=163, mean=163, max=163, sum=326 (2)", - "tab": "General information", - "score": 163.0 - }, - "Logical Fallacies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Logical Fallacies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Logical Fallacies - # prompt tokens": { - "description": "min=449.595, mean=449.595, max=449.595, sum=899.19 (2)", - "tab": "General information", - "score": 449.5950920245399 - }, - "Logical Fallacies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" - } - } - }, - { - "evaluation_name": "Machine Learning", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Machine Learning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.652, - "details": { - "description": "min=0.652, mean=0.652, max=0.652, sum=1.304 (2)", - "tab": "Accuracy", - "Machine Learning - Observed inference time (s)": { - "description": "min=0.382, mean=0.382, max=0.382, sum=0.765 (2)", - "tab": "Efficiency", - "score": 0.3823078232152121 - }, - "Machine Learning - # eval": { - "description": "min=112, mean=112, max=112, sum=224 (2)", - "tab": "General information", - "score": 112.0 - }, - "Machine Learning - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Machine Learning - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Machine Learning - # prompt tokens": { - "description": "min=668.054, mean=668.054, max=668.054, sum=1336.107 (2)", - "tab": "General information", - "score": 668.0535714285714 - }, - "Machine Learning - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" - } - } - }, - { - "evaluation_name": "Management", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Management", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.903, - "details": { - "description": "min=0.903, mean=0.903, max=0.903, sum=1.806 (2)", - "tab": "Accuracy", - "Management - Observed inference time (s)": { - "description": "min=0.255, mean=0.255, max=0.255, sum=0.511 (2)", - "tab": "Efficiency", - "score": 0.2552997649294659 - }, - "Management - # eval": { - "description": "min=103, mean=103, max=103, sum=206 (2)", - "tab": "General information", - "score": 103.0 - }, - "Management - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Management - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Management - # prompt tokens": { - "description": "min=283.796, mean=283.796, max=283.796, sum=567.592 (2)", - "tab": "General information", - "score": 283.79611650485435 - }, - "Management - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" - } - } - }, - { - "evaluation_name": "Marketing", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Marketing", 
- "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.897, - "details": { - "description": "min=0.897, mean=0.897, max=0.897, sum=1.795 (2)", - "tab": "Accuracy", - "Marketing - Observed inference time (s)": { - "description": "min=0.291, mean=0.291, max=0.291, sum=0.582 (2)", - "tab": "Efficiency", - "score": 0.29102008974450266 - }, - "Marketing - # eval": { - "description": "min=234, mean=234, max=234, sum=468 (2)", - "tab": "General information", - "score": 234.0 - }, - "Marketing - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Marketing - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Marketing - # prompt tokens": { - "description": "min=404.218, mean=404.218, max=404.218, sum=808.436 (2)", - "tab": "General information", - "score": 404.21794871794873 - }, - "Marketing - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" - } - } - }, - { - "evaluation_name": "Medical Genetics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Medical Genetics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.84, - "details": { - "description": "min=0.84, mean=0.84, max=0.84, sum=1.68 (2)", - "tab": "Accuracy", - "Medical Genetics - Observed inference time (s)": { - "description": "min=0.27, mean=0.27, max=0.27, sum=0.54 (2)", - "tab": "Efficiency", - "score": 0.27023372411727903 - }, - "Medical Genetics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Medical Genetics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Medical Genetics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Medical Genetics - # prompt tokens": { - "description": "min=341, mean=341, max=341, sum=682 (2)", - "tab": "General information", - "score": 341.0 - }, - "Medical Genetics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" - } - } - }, - { - "evaluation_name": "Miscellaneous", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Miscellaneous", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.871, - "details": { - "description": "min=0.871, mean=0.871, max=0.871, sum=1.742 
(2)", - "tab": "Accuracy", - "Miscellaneous - Observed inference time (s)": { - "description": "min=0.259, mean=0.259, max=0.259, sum=0.518 (2)", - "tab": "Efficiency", - "score": 0.25915825382198565 - }, - "Miscellaneous - # eval": { - "description": "min=783, mean=783, max=783, sum=1566 (2)", - "tab": "General information", - "score": 783.0 - }, - "Miscellaneous - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Miscellaneous - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Miscellaneous - # prompt tokens": { - "description": "min=299.925, mean=299.925, max=299.925, sum=599.849 (2)", - "tab": "General information", - "score": 299.92464878671774 - }, - "Miscellaneous - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" - } - } - }, - { - "evaluation_name": "Moral Scenarios", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Moral Scenarios", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.711, - "details": { - "description": "min=0.711, mean=0.711, max=0.711, sum=1.421 (2)", - "tab": "Accuracy", - "Moral Disputes - Observed inference time (s)": { - "description": "min=0.308, mean=0.308, max=0.308, sum=0.617 (2)", - "tab": "Efficiency", - "score": 0.3084571650951584 - }, - "Moral Scenarios - Observed inference time (s)": { - "description": "min=0.383, mean=0.383, max=0.383, sum=0.766 (2)", - "tab": "Efficiency", - "score": 0.3827664223463176 - }, - "Moral Disputes - # eval": { - "description": "min=346, mean=346, max=346, sum=692 (2)", - "tab": "General information", - "score": 346.0 - }, - "Moral Disputes - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Disputes - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Disputes - # prompt tokens": { - "description": "min=476.145, mean=476.145, max=476.145, sum=952.289 (2)", - "tab": "General information", - "score": 476.1445086705202 - }, - "Moral Disputes - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Moral Scenarios - # eval": { - "description": "min=895, mean=895, max=895, sum=1790 (2)", - "tab": "General information", - "score": 895.0 - }, - "Moral Scenarios - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Scenarios - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # prompt tokens": { - "description": "min=656.455, mean=656.455, max=656.455, sum=1312.909 (2)", - "tab": "General information", - "score": 656.454748603352 - }, - "Moral Scenarios - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - 
"score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" - } - } - }, - { - "evaluation_name": "Nutrition", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Nutrition", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.833, - "details": { - "description": "min=0.833, mean=0.833, max=0.833, sum=1.667 (2)", - "tab": "Accuracy", - "Nutrition - Observed inference time (s)": { - "description": "min=0.349, mean=0.349, max=0.349, sum=0.699 (2)", - "tab": "Efficiency", - "score": 0.34937040011088055 - }, - "Nutrition - # eval": { - "description": "min=306, mean=306, max=306, sum=612 (2)", - "tab": "General information", - "score": 306.0 - }, - "Nutrition - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Nutrition - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Nutrition - # prompt tokens": { - "description": "min=586.817, mean=586.817, max=586.817, sum=1173.634 (2)", - "tab": "General information", - "score": 586.8169934640523 - }, - "Nutrition - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" - } - } - }, - { - "evaluation_name": "Prehistory", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Prehistory", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.858, - "details": { - "description": "min=0.858, mean=0.858, max=0.858, sum=1.716 (2)", - "tab": "Accuracy", - "Prehistory - Observed inference time (s)": { - "description": "min=0.325, mean=0.325, max=0.325, sum=0.649 (2)", - "tab": "Efficiency", - "score": 0.32473731188126553 - }, - "Prehistory - # eval": { - "description": "min=324, mean=324, max=324, sum=648 (2)", - "tab": "General information", - "score": 324.0 - }, - "Prehistory - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Prehistory - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Prehistory - # prompt tokens": { - "description": "min=514.559, mean=514.559, max=514.559, sum=1029.117 (2)", - "tab": "General information", - "score": 514.5586419753087 - }, - "Prehistory - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" - } - } - }, - { - "evaluation_name": "Public 
Relations", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Public Relations", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.727, - "details": { - "description": "min=0.727, mean=0.727, max=0.727, sum=1.455 (2)", - "tab": "Accuracy", - "Public Relations - Observed inference time (s)": { - "description": "min=0.259, mean=0.259, max=0.259, sum=0.517 (2)", - "tab": "Efficiency", - "score": 0.2587012074210427 - }, - "Public Relations - # eval": { - "description": "min=110, mean=110, max=110, sum=220 (2)", - "tab": "General information", - "score": 110.0 - }, - "Public Relations - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Public Relations - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Public Relations - # prompt tokens": { - "description": "min=405.318, mean=405.318, max=405.318, sum=810.636 (2)", - "tab": "General information", - "score": 405.3181818181818 - }, - "Public Relations - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" - } - } - }, - { - "evaluation_name": "Security Studies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Security Studies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.804, - "details": { - "description": "min=0.804, mean=0.804, max=0.804, sum=1.608 (2)", - "tab": "Accuracy", - "Security Studies - Observed inference time (s)": { - "description": "min=0.528, mean=0.528, max=0.528, sum=1.057 (2)", - "tab": "Efficiency", - "score": 0.5282714629659847 - }, - "Security Studies - # eval": { - "description": "min=245, mean=245, max=245, sum=490 (2)", - "tab": "General information", - "score": 245.0 - }, - "Security Studies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Security Studies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Security Studies - # prompt tokens": { - "description": "min=1164.473, mean=1164.473, max=1164.473, sum=2328.947 (2)", - "tab": "General information", - "score": 1164.4734693877551 - }, - "Security Studies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" - } - } - }, - { - "evaluation_name": "Sociology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Sociology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.886, - "details": { - "description": "min=0.886, mean=0.886, max=0.886, sum=1.771 (2)", - "tab": "Accuracy", - "Sociology - Observed inference time (s)": { - "description": "min=0.267, mean=0.267, max=0.267, sum=0.534 (2)", - "tab": "Efficiency", - "score": 0.2668588197053368 - }, - "Sociology - # eval": { - "description": "min=201, mean=201, max=201, sum=402 (2)", - "tab": "General information", - "score": 201.0 - }, - "Sociology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Sociology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Sociology - # prompt tokens": { - "description": "min=445.522, mean=445.522, max=445.522, sum=891.045 (2)", - "tab": "General information", - "score": 445.5223880597015 - }, - "Sociology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" - } - } - }, - { - "evaluation_name": "Virology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Virology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.548, - "details": { - "description": "min=0.548, mean=0.548, max=0.548, sum=1.096 (2)", - "tab": "Accuracy", - "Virology - Observed inference time (s)": { - "description": "min=0.235, mean=0.235, max=0.235, sum=0.47 (2)", - "tab": "Efficiency", - "score": 0.235107473580234 - }, - "Virology - # eval": { - "description": "min=166, mean=166, max=166, sum=332 (2)", - "tab": "General information", - "score": 166.0 - }, - "Virology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Virology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Virology - # prompt tokens": { - "description": "min=343.09, mean=343.09, max=343.09, sum=686.181 (2)", - "tab": "General information", - "score": 343.0903614457831 - }, - "Virology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" - } - } - }, - { - "evaluation_name": "World Religions", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on World Religions", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, 
- "max_score": 1.0 - }, - "score_details": { - "score": 0.825, - "details": { - "description": "min=0.825, mean=0.825, max=0.825, sum=1.649 (2)", - "tab": "Accuracy", - "World Religions - Observed inference time (s)": { - "description": "min=0.226, mean=0.226, max=0.226, sum=0.453 (2)", - "tab": "Efficiency", - "score": 0.22640645016006558 - }, - "World Religions - # eval": { - "description": "min=171, mean=171, max=171, sum=342 (2)", - "tab": "General information", - "score": 171.0 - }, - "World Religions - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "World Religions - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "World Religions - # prompt tokens": { - "description": "min=275.561, mean=275.561, max=275.561, sum=551.123 (2)", - "tab": "General information", - "score": 275.56140350877195 - }, - "World Religions - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" - } - } - }, - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.708, - "details": { - "tab": "Efficiency" - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_mmlu/mistralai/mistral-7b-instruct-v0.3/b20860aa-fb88-46b8-a79b-fa71a79c7d4d.json b/data/helm_mmlu/mistralai/mistral-7b-instruct-v0.3/b20860aa-fb88-46b8-a79b-fa71a79c7d4d.json deleted file mode 100644 index e788149e1..000000000 --- a/data/helm_mmlu/mistralai/mistral-7b-instruct-v0.3/b20860aa-fb88-46b8-a79b-fa71a79c7d4d.json +++ /dev/null @@ -1,3021 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/mistralai_mistral-7b-instruct-v0.3/1770835937.459157", - "retrieved_timestamp": "1770835937.459157", - "source_metadata": { - "source_name": "helm_mmlu", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral Instruct v0.3 7B", - "id": "mistralai/mistral-7b-instruct-v0.3", - "developer": "mistralai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "MMLU All Subjects", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU All Subjects", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.599, - "details": { - "description": "min=0.258, mean=0.599, max=0.881, sum=68.3 (114)", - "tab": "Accuracy", - "MMLU All Subjects - Observed inference time (s)": { - "description": "min=0.212, mean=0.526, max=1.438, 
sum=59.959 (114)", - "tab": "Efficiency", - "score": 0.525951832745908 - }, - "MMLU All Subjects - # eval": { - "description": "min=100, mean=246.351, max=1534, sum=28084 (114)", - "tab": "General information", - "score": 246.35087719298247 - }, - "MMLU All Subjects - # train": { - "description": "min=5, mean=5, max=5, sum=570 (114)", - "tab": "General information", - "score": 5.0 - }, - "MMLU All Subjects - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (114)", - "tab": "General information", - "score": 0.0 - }, - "MMLU All Subjects - # prompt tokens": { - "description": "min=317.924, mean=705.273, max=3098.109, sum=80401.178 (114)", - "tab": "General information", - "score": 705.2734899593811 - }, - "MMLU All Subjects - # output tokens": { - "description": "min=1, mean=1, max=1, sum=114 (114)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - 
"mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] - } - } - }, - { - "evaluation_name": "Abstract Algebra", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Abstract Algebra", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.27, - "details": { - "description": "min=0.27, mean=0.27, max=0.27, sum=0.54 (2)", - "tab": "Accuracy", - "Abstract Algebra - Observed inference time (s)": { - "description": "min=0.321, mean=0.321, max=0.321, sum=0.642 (2)", - "tab": "Efficiency", - "score": 0.32117165088653565 - }, - "Abstract Algebra - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Abstract Algebra - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Abstract Algebra - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Abstract Algebra - # prompt tokens": { - "description": "min=411.44, mean=411.44, max=411.44, sum=822.88 (2)", - "tab": "General information", - "score": 411.44 - }, - "Abstract Algebra - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" - } - } - }, - { - "evaluation_name": "Anatomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Anatomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.585, - "details": { - "description": "min=0.585, mean=0.585, max=0.585, sum=1.17 (2)", - "tab": "Accuracy", - "Anatomy - Observed inference time (s)": { - "description": "min=0.246, mean=0.246, max=0.246, sum=0.493 (2)", - "tab": "Efficiency", - "score": 0.24627229902479383 - }, - "Anatomy - # eval": { - "description": "min=135, mean=135, max=135, sum=270 (2)", - "tab": "General information", - "score": 135.0 - }, - "Anatomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Anatomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Anatomy - # prompt tokens": { - "description": "min=416.089, mean=416.089, max=416.089, sum=832.178 (2)", - "tab": "General information", - "score": 416.0888888888889 - }, - "Anatomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 
- } - } - }, - "generation_config": { - "additional_details": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" - } - } - }, - { - "evaluation_name": "College Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on College Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.343, - "details": { - "description": "min=0.343, mean=0.343, max=0.343, sum=0.686 (2)", - "tab": "Accuracy", - "College Chemistry - Observed inference time (s)": { - "description": "min=0.221, mean=0.221, max=0.221, sum=0.442 (2)", - "tab": "Efficiency", - "score": 0.22099271774291993 - }, - "College Biology - Observed inference time (s)": { - "description": "min=0.7, mean=0.7, max=0.7, sum=1.399 (2)", - "tab": "Efficiency", - "score": 0.6997380173868604 - }, - "College Computer Science - Observed inference time (s)": { - "description": "min=0.466, mean=0.466, max=0.466, sum=0.932 (2)", - "tab": "Efficiency", - "score": 0.4661028146743774 - }, - "College Mathematics - Observed inference time (s)": { - "description": "min=0.212, mean=0.212, max=0.212, sum=0.424 (2)", - "tab": "Efficiency", - "score": 0.21210591793060302 - }, - "College Medicine - Observed inference time (s)": { - "description": "min=0.387, mean=0.387, max=0.387, sum=0.774 (2)", - "tab": "Efficiency", - "score": 0.3871537646806309 - }, - "College Physics - Observed inference time (s)": { - "description": "min=0.455, mean=0.455, max=0.455, sum=0.91 (2)", - "tab": "Efficiency", - "score": 0.45503536392660704 - }, - "College Chemistry - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Chemistry - # prompt tokens": { - "description": "min=636.71, mean=636.71, max=636.71, sum=1273.42 (2)", - "tab": "General information", - "score": 636.71 - }, - "College Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Biology - # eval": { - "description": "min=144, mean=144, max=144, sum=288 (2)", - "tab": "General information", - "score": 144.0 - }, - "College Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # prompt tokens": { - "description": "min=559.799, mean=559.799, max=559.799, sum=1119.597 (2)", - "tab": "General information", - "score": 559.7986111111111 - }, - "College Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Computer Science - # train": { - 
"description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer Science - # prompt tokens": { - "description": "min=911.17, mean=911.17, max=911.17, sum=1822.34 (2)", - "tab": "General information", - "score": 911.17 - }, - "College Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Mathematics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # prompt tokens": { - "description": "min=667.31, mean=667.31, max=667.31, sum=1334.62 (2)", - "tab": "General information", - "score": 667.31 - }, - "College Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Medicine - # eval": { - "description": "min=173, mean=173, max=173, sum=346 (2)", - "tab": "General information", - "score": 173.0 - }, - "College Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Medicine - # prompt tokens": { - "description": "min=601.41, mean=601.41, max=601.41, sum=1202.821 (2)", - "tab": "General information", - "score": 601.4104046242775 - }, - "College Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Physics - # eval": { - "description": "min=102, mean=102, max=102, sum=204 (2)", - "tab": "General information", - "score": 102.0 - }, - "College Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Physics - # prompt tokens": { - "description": "min=560.029, mean=560.029, max=560.029, sum=1120.059 (2)", - "tab": "General information", - "score": 560.0294117647059 - }, - "College Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" - } - } - }, - { - "evaluation_name": "Computer Security", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Computer Security", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7, - "details": { - "description": 
"min=0.7, mean=0.7, max=0.7, sum=1.4 (2)", - "tab": "Accuracy", - "Computer Security - Observed inference time (s)": { - "description": "min=0.426, mean=0.426, max=0.426, sum=0.853 (2)", - "tab": "Efficiency", - "score": 0.4263953256607056 - }, - "Computer Security - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Computer Security - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Computer Security - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Computer Security - # prompt tokens": { - "description": "min=433.94, mean=433.94, max=433.94, sum=867.88 (2)", - "tab": "General information", - "score": 433.94 - }, - "Computer Security - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" - } - } - }, - { - "evaluation_name": "Econometrics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Econometrics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.421, - "details": { - "description": "min=0.421, mean=0.421, max=0.421, sum=0.842 (2)", - "tab": "Accuracy", - "Econometrics - Observed inference time (s)": { - "description": "min=0.406, mean=0.406, max=0.406, sum=0.813 (2)", - "tab": "Efficiency", - "score": 0.406455958098696 - }, - "Econometrics - # eval": { - "description": "min=114, mean=114, max=114, sum=228 (2)", - "tab": "General information", - "score": 114.0 - }, - "Econometrics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Econometrics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Econometrics - # prompt tokens": { - "description": "min=696.175, mean=696.175, max=696.175, sum=1392.351 (2)", - "tab": "General information", - "score": 696.1754385964912 - }, - "Econometrics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" - } - } - }, - { - "evaluation_name": "Global Facts", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Global Facts", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.33, - "details": { - "description": "min=0.33, mean=0.33, max=0.33, sum=0.66 (2)", - "tab": "Accuracy", - "Global Facts - Observed inference time (s)": { - "description": "min=0.299, mean=0.299, max=0.299, sum=0.598 (2)", - 
"tab": "Efficiency", - "score": 0.29881003856658933 - }, - "Global Facts - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Global Facts - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Global Facts - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Global Facts - # prompt tokens": { - "description": "min=492.47, mean=492.47, max=492.47, sum=984.94 (2)", - "tab": "General information", - "score": 492.47 - }, - "Global Facts - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" - } - } - }, - { - "evaluation_name": "Jurisprudence", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Jurisprudence", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.713, - "details": { - "description": "min=0.713, mean=0.713, max=0.713, sum=1.426 (2)", - "tab": "Accuracy", - "Jurisprudence - Observed inference time (s)": { - "description": "min=0.232, mean=0.232, max=0.232, sum=0.465 (2)", - "tab": "Efficiency", - "score": 0.23237781833719323 - }, - "Jurisprudence - # eval": { - "description": "min=108, mean=108, max=108, sum=216 (2)", - "tab": "General information", - "score": 108.0 - }, - "Jurisprudence - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Jurisprudence - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Jurisprudence - # prompt tokens": { - "description": "min=460.093, mean=460.093, max=460.093, sum=920.185 (2)", - "tab": "General information", - "score": 460.0925925925926 - }, - "Jurisprudence - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" - } - } - }, - { - "evaluation_name": "Philosophy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Philosophy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.659, - "details": { - "description": "min=0.659, mean=0.659, max=0.659, sum=1.318 (2)", - "tab": "Accuracy", - "Philosophy - Observed inference time (s)": { - "description": "min=0.899, mean=0.899, max=0.899, sum=1.798 (2)", - "tab": "Efficiency", - "score": 0.8987545852109167 - }, - "Philosophy - # eval": { - "description": "min=311, mean=311, max=311, sum=622 (2)", - "tab": "General information", - "score": 311.0 - }, - "Philosophy - # 
train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Philosophy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Philosophy - # prompt tokens": { - "description": "min=382.82, mean=382.82, max=382.82, sum=765.64 (2)", - "tab": "General information", - "score": 382.81993569131834 - }, - "Philosophy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" - } - } - }, - { - "evaluation_name": "Professional Psychology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Professional Psychology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.641, - "details": { - "description": "min=0.641, mean=0.641, max=0.641, sum=1.281 (2)", - "tab": "Accuracy", - "Professional Medicine - Observed inference time (s)": { - "description": "min=0.615, mean=0.615, max=0.615, sum=1.23 (2)", - "tab": "Efficiency", - "score": 0.6148438769228318 - }, - "Professional Accounting - Observed inference time (s)": { - "description": "min=0.825, mean=0.825, max=0.825, sum=1.651 (2)", - "tab": "Efficiency", - "score": 0.8254362666015084 - }, - "Professional Law - Observed inference time (s)": { - "description": "min=0.682, mean=0.682, max=0.682, sum=1.364 (2)", - "tab": "Efficiency", - "score": 0.68212915414937 - }, - "Professional Psychology - Observed inference time (s)": { - "description": "min=0.506, mean=0.506, max=0.506, sum=1.012 (2)", - "tab": "Efficiency", - "score": 0.505940170459498 - }, - "Professional Medicine - # eval": { - "description": "min=272, mean=272, max=272, sum=544 (2)", - "tab": "General information", - "score": 272.0 - }, - "Professional Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Medicine - # prompt tokens": { - "description": "min=1288.143, mean=1288.143, max=1288.143, sum=2576.287 (2)", - "tab": "General information", - "score": 1288.1433823529412 - }, - "Professional Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Accounting - # eval": { - "description": "min=282, mean=282, max=282, sum=564 (2)", - "tab": "General information", - "score": 282.0 - }, - "Professional Accounting - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Accounting - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Accounting - # prompt tokens": { - "description": "min=805.496, mean=805.496, max=805.496, sum=1610.993 (2)", - "tab": "General information", - "score": 805.4964539007092 - }, - "Professional Accounting - # output tokens": { - 
"description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Law - # eval": { - "description": "min=1534, mean=1534, max=1534, sum=3068 (2)", - "tab": "General information", - "score": 1534.0 - }, - "Professional Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # prompt tokens": { - "description": "min=1858.711, mean=1858.711, max=1858.711, sum=3717.421 (2)", - "tab": "General information", - "score": 1858.7105606258149 - }, - "Professional Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Psychology - # eval": { - "description": "min=612, mean=612, max=612, sum=1224 (2)", - "tab": "General information", - "score": 612.0 - }, - "Professional Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # prompt tokens": { - "description": "min=654.278, mean=654.278, max=654.278, sum=1308.556 (2)", - "tab": "General information", - "score": 654.2777777777778 - }, - "Professional Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" - } - } - }, - { - "evaluation_name": "Us Foreign Policy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Us Foreign Policy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.79, - "details": { - "description": "min=0.79, mean=0.79, max=0.79, sum=1.58 (2)", - "tab": "Accuracy", - "Us Foreign Policy - Observed inference time (s)": { - "description": "min=0.487, mean=0.487, max=0.487, sum=0.973 (2)", - "tab": "Efficiency", - "score": 0.48650413513183594 - }, - "Us Foreign Policy - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Us Foreign Policy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Us Foreign Policy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Us Foreign Policy - # prompt tokens": { - "description": "min=482.19, mean=482.19, max=482.19, sum=964.38 (2)", - "tab": "General information", - "score": 482.19 - }, - "Us Foreign Policy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": 
"mmlu_us_foreign_policy" - } - } - }, - { - "evaluation_name": "Astronomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Astronomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.638, - "details": { - "description": "min=0.638, mean=0.638, max=0.638, sum=1.276 (2)", - "tab": "Accuracy", - "Astronomy - Observed inference time (s)": { - "description": "min=0.678, mean=0.678, max=0.678, sum=1.355 (2)", - "tab": "Efficiency", - "score": 0.6775346147386652 - }, - "Astronomy - # eval": { - "description": "min=152, mean=152, max=152, sum=304 (2)", - "tab": "General information", - "score": 152.0 - }, - "Astronomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Astronomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Astronomy - # prompt tokens": { - "description": "min=674.987, mean=674.987, max=674.987, sum=1349.974 (2)", - "tab": "General information", - "score": 674.9868421052631 - }, - "Astronomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" - } - } - }, - { - "evaluation_name": "Business Ethics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Business Ethics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.57, - "details": { - "description": "min=0.57, mean=0.57, max=0.57, sum=1.14 (2)", - "tab": "Accuracy", - "Business Ethics - Observed inference time (s)": { - "description": "min=0.645, mean=0.645, max=0.645, sum=1.289 (2)", - "tab": "Efficiency", - "score": 0.6446590375900269 - }, - "Business Ethics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Business Ethics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Business Ethics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Business Ethics - # prompt tokens": { - "description": "min=653.6, mean=653.6, max=653.6, sum=1307.2 (2)", - "tab": "General information", - "score": 653.6 - }, - "Business Ethics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" - } - } - }, - { - "evaluation_name": "Clinical Knowledge", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Clinical Knowledge", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.687, - "details": { - "description": "min=0.687, mean=0.687, max=0.687, sum=1.374 (2)", - "tab": "Accuracy", - "Clinical Knowledge - Observed inference time (s)": { - "description": "min=0.844, mean=0.844, max=0.844, sum=1.687 (2)", - "tab": "Efficiency", - "score": 0.8436905698956184 - }, - "Clinical Knowledge - # eval": { - "description": "min=265, mean=265, max=265, sum=530 (2)", - "tab": "General information", - "score": 265.0 - }, - "Clinical Knowledge - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Clinical Knowledge - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Clinical Knowledge - # prompt tokens": { - "description": "min=496.174, mean=496.174, max=496.174, sum=992.347 (2)", - "tab": "General information", - "score": 496.1735849056604 - }, - "Clinical Knowledge - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" - } - } - }, - { - "evaluation_name": "Conceptual Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Conceptual Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.549, - "details": { - "description": "min=0.549, mean=0.549, max=0.549, sum=1.098 (2)", - "tab": "Accuracy", - "Conceptual Physics - Observed inference time (s)": { - "description": "min=0.333, mean=0.333, max=0.333, sum=0.666 (2)", - "tab": "Efficiency", - "score": 0.33306963900302317 - }, - "Conceptual Physics - # eval": { - "description": "min=235, mean=235, max=235, sum=470 (2)", - "tab": "General information", - "score": 235.0 - }, - "Conceptual Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Conceptual Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Conceptual Physics - # prompt tokens": { - "description": "min=343.285, mean=343.285, max=343.285, sum=686.57 (2)", - "tab": "General information", - "score": 343.2851063829787 - }, - "Conceptual Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" - } - } - }, - { - "evaluation_name": "Electrical Engineering", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Electrical Engineering", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.572, - "details": { - "description": "min=0.572, mean=0.572, max=0.572, sum=1.145 (2)", - "tab": "Accuracy", - "Electrical Engineering - Observed inference time (s)": { - "description": "min=0.392, mean=0.392, max=0.392, sum=0.784 (2)", - "tab": "Efficiency", - "score": 0.3922290703345989 - }, - "Electrical Engineering - # eval": { - "description": "min=145, mean=145, max=145, sum=290 (2)", - "tab": "General information", - "score": 145.0 - }, - "Electrical Engineering - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Electrical Engineering - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Electrical Engineering - # prompt tokens": { - "description": "min=510.379, mean=510.379, max=510.379, sum=1020.759 (2)", - "tab": "General information", - "score": 510.37931034482756 - }, - "Electrical Engineering - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" - } - } - }, - { - "evaluation_name": "Elementary Mathematics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Elementary Mathematics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.402, - "details": { - "description": "min=0.402, mean=0.402, max=0.402, sum=0.804 (2)", - "tab": "Accuracy", - "Elementary Mathematics - Observed inference time (s)": { - "description": "min=0.676, mean=0.676, max=0.676, sum=1.352 (2)", - "tab": "Efficiency", - "score": 0.6761655416438188 - }, - "Elementary Mathematics - # eval": { - "description": "min=378, mean=378, max=378, sum=756 (2)", - "tab": "General information", - "score": 378.0 - }, - "Elementary Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Elementary Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Elementary Mathematics - # prompt tokens": { - "description": "min=622.386, mean=622.386, max=622.386, sum=1244.772 (2)", - "tab": "General information", - "score": 622.3862433862433 - }, - "Elementary Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" - } - } - }, - { - "evaluation_name": "Formal Logic", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Formal Logic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.397, - "details": { - "description": "min=0.397, mean=0.397, max=0.397, sum=0.794 (2)", - "tab": "Accuracy", - "Formal Logic - Observed inference time (s)": { - "description": "min=0.734, mean=0.734, max=0.734, sum=1.467 (2)", - "tab": "Efficiency", - "score": 0.7336057802987477 - }, - "Formal Logic - # eval": { - "description": "min=126, mean=126, max=126, sum=252 (2)", - "tab": "General information", - "score": 126.0 - }, - "Formal Logic - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Formal Logic - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Formal Logic - # prompt tokens": { - "description": "min=727.984, mean=727.984, max=727.984, sum=1455.968 (2)", - "tab": "General information", - "score": 727.984126984127 - }, - "Formal Logic - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" - } - } - }, - { - "evaluation_name": "High School World History", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on High School World History", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.759, - "details": { - "description": "min=0.759, mean=0.759, max=0.759, sum=1.519 (2)", - "tab": "Accuracy", - "High School Biology - Observed inference time (s)": { - "description": "min=0.805, mean=0.805, max=0.805, sum=1.61 (2)", - "tab": "Efficiency", - "score": 0.8049156188964843 - }, - "High School Chemistry - Observed inference time (s)": { - "description": "min=0.44, mean=0.44, max=0.44, sum=0.881 (2)", - "tab": "Efficiency", - "score": 0.44036899529067164 - }, - "High School Computer Science - Observed inference time (s)": { - "description": "min=0.435, mean=0.435, max=0.435, sum=0.869 (2)", - "tab": "Efficiency", - "score": 0.4347002100944519 - }, - "High School European History - Observed inference time (s)": { - "description": "min=0.445, mean=0.445, max=0.445, sum=0.891 (2)", - "tab": "Efficiency", - "score": 0.4453156341205944 - }, - "High School Geography - Observed inference time (s)": { - "description": "min=0.331, mean=0.331, max=0.331, sum=0.661 (2)", - "tab": "Efficiency", - "score": 0.3305177327358361 - }, - "High School Government And Politics - Observed inference time (s)": { - "description": "min=0.545, mean=0.545, max=0.545, sum=1.089 (2)", - "tab": "Efficiency", - "score": 0.5445178654527417 - }, - "High School Macroeconomics - Observed inference time (s)": { - "description": "min=0.53, mean=0.53, max=0.53, sum=1.061 (2)", - "tab": "Efficiency", - "score": 0.5302642871172 - }, - "High School Mathematics - Observed inference time (s)": { - "description": "min=0.585, 
mean=0.585, max=0.585, sum=1.169 (2)", - "tab": "Efficiency", - "score": 0.5845282289716932 - }, - "High School Microeconomics - Observed inference time (s)": { - "description": "min=0.234, mean=0.234, max=0.234, sum=0.468 (2)", - "tab": "Efficiency", - "score": 0.23408917118521297 - }, - "High School Physics - Observed inference time (s)": { - "description": "min=0.384, mean=0.384, max=0.384, sum=0.768 (2)", - "tab": "Efficiency", - "score": 0.3838195042894376 - }, - "High School Psychology - Observed inference time (s)": { - "description": "min=0.274, mean=0.274, max=0.274, sum=0.547 (2)", - "tab": "Efficiency", - "score": 0.2735835779697523 - }, - "High School Statistics - Observed inference time (s)": { - "description": "min=0.654, mean=0.654, max=0.654, sum=1.308 (2)", - "tab": "Efficiency", - "score": 0.6539056665367551 - }, - "High School US History - Observed inference time (s)": { - "description": "min=0.942, mean=0.942, max=0.942, sum=1.883 (2)", - "tab": "Efficiency", - "score": 0.9417344308366963 - }, - "High School World History - Observed inference time (s)": { - "description": "min=0.864, mean=0.864, max=0.864, sum=1.727 (2)", - "tab": "Efficiency", - "score": 0.8635432951561006 - }, - "High School Biology - # eval": { - "description": "min=310, mean=310, max=310, sum=620 (2)", - "tab": "General information", - "score": 310.0 - }, - "High School Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Biology - # prompt tokens": { - "description": "min=609.561, mean=609.561, max=609.561, sum=1219.123 (2)", - "tab": "General information", - "score": 609.5612903225806 - }, - "High School Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Chemistry - # eval": { - "description": "min=203, mean=203, max=203, sum=406 (2)", - "tab": "General information", - "score": 203.0 - }, - "High School Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # prompt tokens": { - "description": "min=581.798, mean=581.798, max=581.798, sum=1163.596 (2)", - "tab": "General information", - "score": 581.7980295566502 - }, - "High School Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "High School Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # prompt tokens": { - "description": "min=997.24, mean=997.24, max=997.24, sum=1994.48 (2)", - "tab": "General information", - "score": 997.24 - }, - "High School Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", 
- "score": 1.0 - }, - "High School European History - # eval": { - "description": "min=165, mean=165, max=165, sum=330 (2)", - "tab": "General information", - "score": 165.0 - }, - "High School European History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School European History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School European History - # prompt tokens": { - "description": "min=3098.109, mean=3098.109, max=3098.109, sum=6196.218 (2)", - "tab": "General information", - "score": 3098.109090909091 - }, - "High School European History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Geography - # eval": { - "description": "min=198, mean=198, max=198, sum=396 (2)", - "tab": "General information", - "score": 198.0 - }, - "High School Geography - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Geography - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Geography - # prompt tokens": { - "description": "min=438.207, mean=438.207, max=438.207, sum=876.414 (2)", - "tab": "General information", - "score": 438.2070707070707 - }, - "High School Geography - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Government And Politics - # eval": { - "description": "min=193, mean=193, max=193, sum=386 (2)", - "tab": "General information", - "score": 193.0 - }, - "High School Government And Politics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Government And Politics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # prompt tokens": { - "description": "min=523.808, mean=523.808, max=523.808, sum=1047.617 (2)", - "tab": "General information", - "score": 523.8082901554404 - }, - "High School Government And Politics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Macroeconomics - # eval": { - "description": "min=390, mean=390, max=390, sum=780 (2)", - "tab": "General information", - "score": 390.0 - }, - "High School Macroeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Macroeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - # prompt tokens": { - "description": "min=432.815, mean=432.815, max=432.815, sum=865.631 (2)", - "tab": "General information", - "score": 432.81538461538463 - }, - "High School Macroeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Mathematics - # eval": { - "description": "min=270, mean=270, max=270, sum=540 (2)", - "tab": "General information", - "score": 270.0 - }, - "High School Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General 
information", - "score": 5.0 - }, - "High School Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # prompt tokens": { - "description": "min=593.13, mean=593.13, max=593.13, sum=1186.259 (2)", - "tab": "General information", - "score": 593.1296296296297 - }, - "High School Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Microeconomics - # eval": { - "description": "min=238, mean=238, max=238, sum=476 (2)", - "tab": "General information", - "score": 238.0 - }, - "High School Microeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Microeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Microeconomics - # prompt tokens": { - "description": "min=452.345, mean=452.345, max=452.345, sum=904.689 (2)", - "tab": "General information", - "score": 452.34453781512605 - }, - "High School Microeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Physics - # eval": { - "description": "min=151, mean=151, max=151, sum=302 (2)", - "tab": "General information", - "score": 151.0 - }, - "High School Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # prompt tokens": { - "description": "min=631.775, mean=631.775, max=631.775, sum=1263.55 (2)", - "tab": "General information", - "score": 631.774834437086 - }, - "High School Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Psychology - # eval": { - "description": "min=545, mean=545, max=545, sum=1090 (2)", - "tab": "General information", - "score": 545.0 - }, - "High School Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # prompt tokens": { - "description": "min=567.873, mean=567.873, max=567.873, sum=1135.747 (2)", - "tab": "General information", - "score": 567.8733944954129 - }, - "High School Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Statistics - # eval": { - "description": "min=216, mean=216, max=216, sum=432 (2)", - "tab": "General information", - "score": 216.0 - }, - "High School Statistics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Statistics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Statistics - # prompt tokens": { - "description": "min=922.644, mean=922.644, max=922.644, sum=1845.287 (2)", - "tab": "General information", - "score": 922.6435185185185 - }, - "High School Statistics - # 
output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School US History - # eval": { - "description": "min=204, mean=204, max=204, sum=408 (2)", - "tab": "General information", - "score": 204.0 - }, - "High School US History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School US History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # prompt tokens": { - "description": "min=2486.446, mean=2486.446, max=2486.446, sum=4972.892 (2)", - "tab": "General information", - "score": 2486.4460784313724 - }, - "High School US History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School World History - # eval": { - "description": "min=237, mean=237, max=237, sum=474 (2)", - "tab": "General information", - "score": 237.0 - }, - "High School World History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School World History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School World History - # prompt tokens": { - "description": "min=1594.553, mean=1594.553, max=1594.553, sum=3189.105 (2)", - "tab": "General information", - "score": 1594.5527426160338 - }, - "High School World History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" - } - } - }, - { - "evaluation_name": "Human Sexuality", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Human Sexuality", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.702, - "details": { - "description": "min=0.702, mean=0.702, max=0.702, sum=1.405 (2)", - "tab": "Accuracy", - "Human Aging - Observed inference time (s)": { - "description": "min=0.809, mean=0.809, max=0.809, sum=1.618 (2)", - "tab": "Efficiency", - "score": 0.8091403518557014 - }, - "Human Sexuality - Observed inference time (s)": { - "description": "min=1.438, mean=1.438, max=1.438, sum=2.875 (2)", - "tab": "Efficiency", - "score": 1.437711750278036 - }, - "Human Aging - # eval": { - "description": "min=223, mean=223, max=223, sum=446 (2)", - "tab": "General information", - "score": 223.0 - }, - "Human Aging - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Aging - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Aging - # prompt tokens": { - "description": "min=362.152, mean=362.152, max=362.152, sum=724.305 (2)", - "tab": "General information", - "score": 362.15246636771303 - }, - "Human Aging - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": 
"General information", - "score": 1.0 - }, - "Human Sexuality - # eval": { - "description": "min=131, mean=131, max=131, sum=262 (2)", - "tab": "General information", - "score": 131.0 - }, - "Human Sexuality - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Sexuality - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # prompt tokens": { - "description": "min=403.748, mean=403.748, max=403.748, sum=807.496 (2)", - "tab": "General information", - "score": 403.7480916030534 - }, - "Human Sexuality - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" - } - } - }, - { - "evaluation_name": "International Law", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on International Law", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.76, - "details": { - "description": "min=0.76, mean=0.76, max=0.76, sum=1.521 (2)", - "tab": "Accuracy", - "International Law - Observed inference time (s)": { - "description": "min=0.393, mean=0.393, max=0.393, sum=0.787 (2)", - "tab": "Efficiency", - "score": 0.3933255593638775 - }, - "International Law - # eval": { - "description": "min=121, mean=121, max=121, sum=242 (2)", - "tab": "General information", - "score": 121.0 - }, - "International Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "International Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "International Law - # prompt tokens": { - "description": "min=729.182, mean=729.182, max=729.182, sum=1458.364 (2)", - "tab": "General information", - "score": 729.1818181818181 - }, - "International Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" - } - } - }, - { - "evaluation_name": "Logical Fallacies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Logical Fallacies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.712, - "details": { - "description": "min=0.712, mean=0.712, max=0.712, sum=1.423 (2)", - "tab": "Accuracy", - "Logical Fallacies - Observed inference time (s)": { - "description": "min=0.848, mean=0.848, max=0.848, sum=1.695 (2)", - "tab": "Efficiency", - "score": 0.8476987660296855 - }, - "Logical Fallacies - # eval": { - "description": "min=163, mean=163, max=163, 
sum=326 (2)", - "tab": "General information", - "score": 163.0 - }, - "Logical Fallacies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Logical Fallacies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Logical Fallacies - # prompt tokens": { - "description": "min=495.779, mean=495.779, max=495.779, sum=991.558 (2)", - "tab": "General information", - "score": 495.77914110429447 - }, - "Logical Fallacies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" - } - } - }, - { - "evaluation_name": "Machine Learning", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Machine Learning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.455, - "details": { - "description": "min=0.455, mean=0.455, max=0.455, sum=0.911 (2)", - "tab": "Accuracy", - "Machine Learning - Observed inference time (s)": { - "description": "min=0.557, mean=0.557, max=0.557, sum=1.113 (2)", - "tab": "Efficiency", - "score": 0.5566470899752208 - }, - "Machine Learning - # eval": { - "description": "min=112, mean=112, max=112, sum=224 (2)", - "tab": "General information", - "score": 112.0 - }, - "Machine Learning - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Machine Learning - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Machine Learning - # prompt tokens": { - "description": "min=743.83, mean=743.83, max=743.83, sum=1487.661 (2)", - "tab": "General information", - "score": 743.8303571428571 - }, - "Machine Learning - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" - } - } - }, - { - "evaluation_name": "Management", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Management", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.767, - "details": { - "description": "min=0.767, mean=0.767, max=0.767, sum=1.534 (2)", - "tab": "Accuracy", - "Management - Observed inference time (s)": { - "description": "min=0.365, mean=0.365, max=0.365, sum=0.73 (2)", - "tab": "Efficiency", - "score": 0.36507687059420985 - }, - "Management - # eval": { - "description": "min=103, mean=103, max=103, sum=206 (2)", - "tab": "General information", - "score": 103.0 - }, - "Management - # train": { - "description": "min=5, mean=5, max=5, sum=10 
(2)", - "tab": "General information", - "score": 5.0 - }, - "Management - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Management - # prompt tokens": { - "description": "min=324.359, mean=324.359, max=324.359, sum=648.718 (2)", - "tab": "General information", - "score": 324.3592233009709 - }, - "Management - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" - } - } - }, - { - "evaluation_name": "Marketing", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Marketing", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.842, - "details": { - "description": "min=0.842, mean=0.842, max=0.842, sum=1.684 (2)", - "tab": "Accuracy", - "Marketing - Observed inference time (s)": { - "description": "min=0.585, mean=0.585, max=0.585, sum=1.17 (2)", - "tab": "Efficiency", - "score": 0.58499161606161 - }, - "Marketing - # eval": { - "description": "min=234, mean=234, max=234, sum=468 (2)", - "tab": "General information", - "score": 234.0 - }, - "Marketing - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Marketing - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Marketing - # prompt tokens": { - "description": "min=472.423, mean=472.423, max=472.423, sum=944.846 (2)", - "tab": "General information", - "score": 472.4230769230769 - }, - "Marketing - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" - } - } - }, - { - "evaluation_name": "Medical Genetics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Medical Genetics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.75, - "details": { - "description": "min=0.75, mean=0.75, max=0.75, sum=1.5 (2)", - "tab": "Accuracy", - "Medical Genetics - Observed inference time (s)": { - "description": "min=0.268, mean=0.268, max=0.268, sum=0.535 (2)", - "tab": "Efficiency", - "score": 0.2675498366355896 - }, - "Medical Genetics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Medical Genetics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Medical Genetics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Medical Genetics - # prompt 
tokens": { - "description": "min=414.71, mean=414.71, max=414.71, sum=829.42 (2)", - "tab": "General information", - "score": 414.71 - }, - "Medical Genetics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" - } - } - }, - { - "evaluation_name": "Miscellaneous", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Miscellaneous", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.785, - "details": { - "description": "min=0.785, mean=0.785, max=0.785, sum=1.571 (2)", - "tab": "Accuracy", - "Miscellaneous - Observed inference time (s)": { - "description": "min=0.504, mean=0.504, max=0.504, sum=1.008 (2)", - "tab": "Efficiency", - "score": 0.5038632959850599 - }, - "Miscellaneous - # eval": { - "description": "min=783, mean=783, max=783, sum=1566 (2)", - "tab": "General information", - "score": 783.0 - }, - "Miscellaneous - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Miscellaneous - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Miscellaneous - # prompt tokens": { - "description": "min=357.519, mean=357.519, max=357.519, sum=715.037 (2)", - "tab": "General information", - "score": 357.51851851851853 - }, - "Miscellaneous - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" - } - } - }, - { - "evaluation_name": "Moral Scenarios", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Moral Scenarios", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.393, - "details": { - "description": "min=0.393, mean=0.393, max=0.393, sum=0.787 (2)", - "tab": "Accuracy", - "Moral Disputes - Observed inference time (s)": { - "description": "min=0.777, mean=0.777, max=0.777, sum=1.553 (2)", - "tab": "Efficiency", - "score": 0.7765735477381359 - }, - "Moral Scenarios - Observed inference time (s)": { - "description": "min=0.493, mean=0.493, max=0.493, sum=0.986 (2)", - "tab": "Efficiency", - "score": 0.4927780463042872 - }, - "Moral Disputes - # eval": { - "description": "min=346, mean=346, max=346, sum=692 (2)", - "tab": "General information", - "score": 346.0 - }, - "Moral Disputes - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Disputes - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Disputes - # prompt 
tokens": { - "description": "min=549.038, mean=549.038, max=549.038, sum=1098.075 (2)", - "tab": "General information", - "score": 549.0375722543353 - }, - "Moral Disputes - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Moral Scenarios - # eval": { - "description": "min=895, mean=895, max=895, sum=1790 (2)", - "tab": "General information", - "score": 895.0 - }, - "Moral Scenarios - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Scenarios - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # prompt tokens": { - "description": "min=754.516, mean=754.516, max=754.516, sum=1509.032 (2)", - "tab": "General information", - "score": 754.5162011173185 - }, - "Moral Scenarios - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" - } - } - }, - { - "evaluation_name": "Nutrition", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Nutrition", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.676, - "details": { - "description": "min=0.676, mean=0.676, max=0.676, sum=1.353 (2)", - "tab": "Accuracy", - "Nutrition - Observed inference time (s)": { - "description": "min=0.236, mean=0.236, max=0.236, sum=0.471 (2)", - "tab": "Efficiency", - "score": 0.23563866054310517 - }, - "Nutrition - # eval": { - "description": "min=306, mean=306, max=306, sum=612 (2)", - "tab": "General information", - "score": 306.0 - }, - "Nutrition - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Nutrition - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Nutrition - # prompt tokens": { - "description": "min=689.69, mean=689.69, max=689.69, sum=1379.379 (2)", - "tab": "General information", - "score": 689.6895424836601 - }, - "Nutrition - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" - } - } - }, - { - "evaluation_name": "Prehistory", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Prehistory", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.673, - "details": { - "description": "min=0.673, mean=0.673, max=0.673, sum=1.346 (2)", - "tab": "Accuracy", - "Prehistory - Observed inference time (s)": { - "description": "min=0.345, mean=0.345, 
max=0.345, sum=0.69 (2)", - "tab": "Efficiency", - "score": 0.34476134880089465 - }, - "Prehistory - # eval": { - "description": "min=324, mean=324, max=324, sum=648 (2)", - "tab": "General information", - "score": 324.0 - }, - "Prehistory - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Prehistory - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Prehistory - # prompt tokens": { - "description": "min=611.145, mean=611.145, max=611.145, sum=1222.29 (2)", - "tab": "General information", - "score": 611.145061728395 - }, - "Prehistory - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" - } - } - }, - { - "evaluation_name": "Public Relations", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Public Relations", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.636, - "details": { - "description": "min=0.636, mean=0.636, max=0.636, sum=1.273 (2)", - "tab": "Accuracy", - "Public Relations - Observed inference time (s)": { - "description": "min=0.327, mean=0.327, max=0.327, sum=0.654 (2)", - "tab": "Efficiency", - "score": 0.3271717678416859 - }, - "Public Relations - # eval": { - "description": "min=110, mean=110, max=110, sum=220 (2)", - "tab": "General information", - "score": 110.0 - }, - "Public Relations - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Public Relations - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Public Relations - # prompt tokens": { - "description": "min=471.036, mean=471.036, max=471.036, sum=942.073 (2)", - "tab": "General information", - "score": 471.03636363636366 - }, - "Public Relations - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" - } - } - }, - { - "evaluation_name": "Security Studies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Security Studies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.682, - "details": { - "description": "min=0.682, mean=0.682, max=0.682, sum=1.363 (2)", - "tab": "Accuracy", - "Security Studies - Observed inference time (s)": { - "description": "min=0.561, mean=0.561, max=0.561, sum=1.121 (2)", - "tab": "Efficiency", - "score": 0.5606838294437954 - }, - "Security Studies - # eval": { - "description": "min=245, mean=245, max=245, 
sum=490 (2)", - "tab": "General information", - "score": 245.0 - }, - "Security Studies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Security Studies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Security Studies - # prompt tokens": { - "description": "min=1324.865, mean=1324.865, max=1324.865, sum=2649.731 (2)", - "tab": "General information", - "score": 1324.865306122449 - }, - "Security Studies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" - } - } - }, - { - "evaluation_name": "Sociology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Sociology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.806, - "details": { - "description": "min=0.806, mean=0.806, max=0.806, sum=1.612 (2)", - "tab": "Accuracy", - "Sociology - Observed inference time (s)": { - "description": "min=0.413, mean=0.413, max=0.413, sum=0.825 (2)", - "tab": "Efficiency", - "score": 0.41272182962787685 - }, - "Sociology - # eval": { - "description": "min=201, mean=201, max=201, sum=402 (2)", - "tab": "General information", - "score": 201.0 - }, - "Sociology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Sociology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Sociology - # prompt tokens": { - "description": "min=496.95, mean=496.95, max=496.95, sum=993.9 (2)", - "tab": "General information", - "score": 496.9502487562189 - }, - "Sociology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" - } - } - }, - { - "evaluation_name": "Virology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Virology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.47, - "details": { - "description": "min=0.47, mean=0.47, max=0.47, sum=0.94 (2)", - "tab": "Accuracy", - "Virology - Observed inference time (s)": { - "description": "min=0.644, mean=0.644, max=0.644, sum=1.288 (2)", - "tab": "Efficiency", - "score": 0.6437842285776713 - }, - "Virology - # eval": { - "description": "min=166, mean=166, max=166, sum=332 (2)", - "tab": "General information", - "score": 166.0 - }, - "Virology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Virology - truncated": { - 
"description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Virology - # prompt tokens": { - "description": "min=404.349, mean=404.349, max=404.349, sum=808.699 (2)", - "tab": "General information", - "score": 404.34939759036143 - }, - "Virology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" - } - } - }, - { - "evaluation_name": "World Religions", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on World Religions", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.825, - "details": { - "description": "min=0.825, mean=0.825, max=0.825, sum=1.649 (2)", - "tab": "Accuracy", - "World Religions - Observed inference time (s)": { - "description": "min=0.266, mean=0.266, max=0.266, sum=0.532 (2)", - "tab": "Efficiency", - "score": 0.26615772330970094 - }, - "World Religions - # eval": { - "description": "min=171, mean=171, max=171, sum=342 (2)", - "tab": "General information", - "score": 171.0 - }, - "World Religions - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "World Religions - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "World Religions - # prompt tokens": { - "description": "min=317.924, mean=317.924, max=317.924, sum=635.848 (2)", - "tab": "General information", - "score": 317.92397660818716 - }, - "World Religions - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" - } - } - }, - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.509, - "details": { - "tab": "Efficiency" - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_mmlu/mistralai/mistral-7b-v0.1/08590b6e-7050-413d-844b-1f3f1c5aa444.json b/data/helm_mmlu/mistralai/mistral-7b-v0.1/08590b6e-7050-413d-844b-1f3f1c5aa444.json deleted file mode 100644 index 5ca508d3b..000000000 --- a/data/helm_mmlu/mistralai/mistral-7b-v0.1/08590b6e-7050-413d-844b-1f3f1c5aa444.json +++ /dev/null @@ -1,3021 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/mistralai_mistral-7b-v0.1/1770835937.459157", - "retrieved_timestamp": "1770835937.459157", - "source_metadata": { - "source_name": "helm_mmlu", - "source_type": 
"documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral v0.1 7B", - "id": "mistralai/mistral-7b-v0.1", - "developer": "mistralai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "MMLU All Subjects", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU All Subjects", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.566, - "details": { - "description": "min=0.25, mean=0.566, max=0.845, sum=64.496 (114)", - "tab": "Accuracy", - "MMLU All Subjects - Observed inference time (s)": { - "description": "min=0.665, mean=0.864, max=1.234, sum=98.504 (114)", - "tab": "Efficiency", - "score": 0.8640714937745795 - }, - "MMLU All Subjects - # eval": { - "description": "min=100, mean=246.351, max=1534, sum=28084 (114)", - "tab": "General information", - "score": 246.35087719298247 - }, - "MMLU All Subjects - # train": { - "description": "min=5, mean=5, max=5, sum=570 (114)", - "tab": "General information", - "score": 5.0 - }, - "MMLU All Subjects - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (114)", - "tab": "General information", - "score": 0.0 - }, - "MMLU All Subjects - # prompt tokens": { - "description": "min=308.924, mean=696.273, max=3089.109, sum=79375.178 (114)", - "tab": "General information", - "score": 696.2734899593811 - }, - "MMLU All Subjects - # output tokens": { - "description": "min=1, mean=1, max=1, sum=114 (114)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - 
"mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] - } - } - }, - { - "evaluation_name": "Abstract Algebra", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Abstract Algebra", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.25, - "details": { - "description": "min=0.25, mean=0.25, max=0.25, sum=0.5 (2)", - "tab": "Accuracy", - "Abstract Algebra - Observed inference time (s)": { - "description": "min=0.834, mean=0.834, max=0.834, sum=1.667 (2)", - "tab": "Efficiency", - "score": 0.8337139582633972 - }, - "Abstract Algebra - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Abstract Algebra - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Abstract Algebra - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Abstract Algebra - # prompt tokens": { - "description": "min=402.44, mean=402.44, max=402.44, sum=804.88 (2)", - "tab": "General information", - "score": 402.44 - }, - "Abstract Algebra - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" - } - } - }, - { - "evaluation_name": "Anatomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Anatomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 
0.467, - "details": { - "description": "min=0.467, mean=0.467, max=0.467, sum=0.933 (2)", - "tab": "Accuracy", - "Anatomy - Observed inference time (s)": { - "description": "min=0.717, mean=0.717, max=0.717, sum=1.435 (2)", - "tab": "Efficiency", - "score": 0.7173902529257316 - }, - "Anatomy - # eval": { - "description": "min=135, mean=135, max=135, sum=270 (2)", - "tab": "General information", - "score": 135.0 - }, - "Anatomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Anatomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Anatomy - # prompt tokens": { - "description": "min=407.089, mean=407.089, max=407.089, sum=814.178 (2)", - "tab": "General information", - "score": 407.0888888888889 - }, - "Anatomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" - } - } - }, - { - "evaluation_name": "College Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on College Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.314, - "details": { - "description": "min=0.314, mean=0.314, max=0.314, sum=0.627 (2)", - "tab": "Accuracy", - "College Chemistry - Observed inference time (s)": { - "description": "min=1.018, mean=1.018, max=1.018, sum=2.036 (2)", - "tab": "Efficiency", - "score": 1.0181659984588622 - }, - "College Biology - Observed inference time (s)": { - "description": "min=0.699, mean=0.699, max=0.699, sum=1.398 (2)", - "tab": "Efficiency", - "score": 0.699198540714052 - }, - "College Computer Science - Observed inference time (s)": { - "description": "min=0.712, mean=0.712, max=0.712, sum=1.423 (2)", - "tab": "Efficiency", - "score": 0.7115359020233154 - }, - "College Mathematics - Observed inference time (s)": { - "description": "min=0.885, mean=0.885, max=0.885, sum=1.77 (2)", - "tab": "Efficiency", - "score": 0.8852152943611145 - }, - "College Medicine - Observed inference time (s)": { - "description": "min=0.896, mean=0.896, max=0.896, sum=1.793 (2)", - "tab": "Efficiency", - "score": 0.8963309629804137 - }, - "College Physics - Observed inference time (s)": { - "description": "min=0.803, mean=0.803, max=0.803, sum=1.606 (2)", - "tab": "Efficiency", - "score": 0.8030702249676573 - }, - "College Chemistry - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Chemistry - # prompt tokens": { - "description": "min=627.71, mean=627.71, max=627.71, sum=1255.42 (2)", - "tab": "General information", - "score": 627.71 - }, - "College Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General 
information", - "score": 1.0 - }, - "College Biology - # eval": { - "description": "min=144, mean=144, max=144, sum=288 (2)", - "tab": "General information", - "score": 144.0 - }, - "College Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # prompt tokens": { - "description": "min=550.799, mean=550.799, max=550.799, sum=1101.597 (2)", - "tab": "General information", - "score": 550.7986111111111 - }, - "College Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer Science - # prompt tokens": { - "description": "min=902.17, mean=902.17, max=902.17, sum=1804.34 (2)", - "tab": "General information", - "score": 902.17 - }, - "College Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Mathematics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # prompt tokens": { - "description": "min=658.31, mean=658.31, max=658.31, sum=1316.62 (2)", - "tab": "General information", - "score": 658.31 - }, - "College Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Medicine - # eval": { - "description": "min=173, mean=173, max=173, sum=346 (2)", - "tab": "General information", - "score": 173.0 - }, - "College Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Medicine - # prompt tokens": { - "description": "min=592.41, mean=592.41, max=592.41, sum=1184.821 (2)", - "tab": "General information", - "score": 592.4104046242775 - }, - "College Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Physics - # eval": { - "description": "min=102, mean=102, max=102, sum=204 (2)", - "tab": "General information", - "score": 102.0 - }, - "College Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Physics - # prompt tokens": { - 
"description": "min=551.029, mean=551.029, max=551.029, sum=1102.059 (2)", - "tab": "General information", - "score": 551.0294117647059 - }, - "College Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" - } - } - }, - { - "evaluation_name": "Computer Security", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Computer Security", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.69, - "details": { - "description": "min=0.69, mean=0.69, max=0.69, sum=1.38 (2)", - "tab": "Accuracy", - "Computer Security - Observed inference time (s)": { - "description": "min=1.033, mean=1.033, max=1.033, sum=2.065 (2)", - "tab": "Efficiency", - "score": 1.032561357021332 - }, - "Computer Security - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Computer Security - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Computer Security - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Computer Security - # prompt tokens": { - "description": "min=424.94, mean=424.94, max=424.94, sum=849.88 (2)", - "tab": "General information", - "score": 424.94 - }, - "Computer Security - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" - } - } - }, - { - "evaluation_name": "Econometrics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Econometrics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.351, - "details": { - "description": "min=0.351, mean=0.351, max=0.351, sum=0.702 (2)", - "tab": "Accuracy", - "Econometrics - Observed inference time (s)": { - "description": "min=0.783, mean=0.783, max=0.783, sum=1.566 (2)", - "tab": "Efficiency", - "score": 0.7832156043303641 - }, - "Econometrics - # eval": { - "description": "min=114, mean=114, max=114, sum=228 (2)", - "tab": "General information", - "score": 114.0 - }, - "Econometrics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Econometrics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Econometrics - # prompt tokens": { - "description": "min=687.175, mean=687.175, max=687.175, sum=1374.351 (2)", - "tab": "General information", - "score": 687.1754385964912 - }, - "Econometrics - # output 
tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" - } - } - }, - { - "evaluation_name": "Global Facts", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Global Facts", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.29, - "details": { - "description": "min=0.29, mean=0.29, max=0.29, sum=0.58 (2)", - "tab": "Accuracy", - "Global Facts - Observed inference time (s)": { - "description": "min=0.689, mean=0.689, max=0.689, sum=1.378 (2)", - "tab": "Efficiency", - "score": 0.6891914677619934 - }, - "Global Facts - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Global Facts - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Global Facts - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Global Facts - # prompt tokens": { - "description": "min=483.47, mean=483.47, max=483.47, sum=966.94 (2)", - "tab": "General information", - "score": 483.47 - }, - "Global Facts - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" - } - } - }, - { - "evaluation_name": "Jurisprudence", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Jurisprudence", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.667, - "details": { - "description": "min=0.667, mean=0.667, max=0.667, sum=1.333 (2)", - "tab": "Accuracy", - "Jurisprudence - Observed inference time (s)": { - "description": "min=0.787, mean=0.787, max=0.787, sum=1.574 (2)", - "tab": "Efficiency", - "score": 0.7868193630818967 - }, - "Jurisprudence - # eval": { - "description": "min=108, mean=108, max=108, sum=216 (2)", - "tab": "General information", - "score": 108.0 - }, - "Jurisprudence - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Jurisprudence - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Jurisprudence - # prompt tokens": { - "description": "min=451.093, mean=451.093, max=451.093, sum=902.185 (2)", - "tab": "General information", - "score": 451.0925925925926 - }, - "Jurisprudence - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "jurisprudence", - "method": 
"multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" - } - } - }, - { - "evaluation_name": "Philosophy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Philosophy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.63, - "details": { - "description": "min=0.63, mean=0.63, max=0.63, sum=1.26 (2)", - "tab": "Accuracy", - "Philosophy - Observed inference time (s)": { - "description": "min=0.743, mean=0.743, max=0.743, sum=1.487 (2)", - "tab": "Efficiency", - "score": 0.7434952857026716 - }, - "Philosophy - # eval": { - "description": "min=311, mean=311, max=311, sum=622 (2)", - "tab": "General information", - "score": 311.0 - }, - "Philosophy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Philosophy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Philosophy - # prompt tokens": { - "description": "min=373.82, mean=373.82, max=373.82, sum=747.64 (2)", - "tab": "General information", - "score": 373.81993569131834 - }, - "Philosophy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" - } - } - }, - { - "evaluation_name": "Professional Psychology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Professional Psychology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.578, - "details": { - "description": "min=0.578, mean=0.578, max=0.578, sum=1.157 (2)", - "tab": "Accuracy", - "Professional Medicine - Observed inference time (s)": { - "description": "min=0.817, mean=0.817, max=0.817, sum=1.633 (2)", - "tab": "Efficiency", - "score": 0.816552089417682 - }, - "Professional Accounting - Observed inference time (s)": { - "description": "min=0.743, mean=0.743, max=0.743, sum=1.487 (2)", - "tab": "Efficiency", - "score": 0.7432903905286856 - }, - "Professional Law - Observed inference time (s)": { - "description": "min=0.82, mean=0.82, max=0.82, sum=1.64 (2)", - "tab": "Efficiency", - "score": 0.8197952300659836 - }, - "Professional Psychology - Observed inference time (s)": { - "description": "min=0.936, mean=0.936, max=0.936, sum=1.873 (2)", - "tab": "Efficiency", - "score": 0.9364227648654015 - }, - "Professional Medicine - # eval": { - "description": "min=272, mean=272, max=272, sum=544 (2)", - "tab": "General information", - "score": 272.0 - }, - "Professional Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Medicine - # prompt tokens": { - 
"description": "min=1279.143, mean=1279.143, max=1279.143, sum=2558.287 (2)", - "tab": "General information", - "score": 1279.1433823529412 - }, - "Professional Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Accounting - # eval": { - "description": "min=282, mean=282, max=282, sum=564 (2)", - "tab": "General information", - "score": 282.0 - }, - "Professional Accounting - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Accounting - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Accounting - # prompt tokens": { - "description": "min=796.496, mean=796.496, max=796.496, sum=1592.993 (2)", - "tab": "General information", - "score": 796.4964539007092 - }, - "Professional Accounting - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Law - # eval": { - "description": "min=1534, mean=1534, max=1534, sum=3068 (2)", - "tab": "General information", - "score": 1534.0 - }, - "Professional Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # prompt tokens": { - "description": "min=1849.711, mean=1849.711, max=1849.711, sum=3699.421 (2)", - "tab": "General information", - "score": 1849.7105606258149 - }, - "Professional Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Psychology - # eval": { - "description": "min=612, mean=612, max=612, sum=1224 (2)", - "tab": "General information", - "score": 612.0 - }, - "Professional Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # prompt tokens": { - "description": "min=645.278, mean=645.278, max=645.278, sum=1290.556 (2)", - "tab": "General information", - "score": 645.2777777777778 - }, - "Professional Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" - } - } - }, - { - "evaluation_name": "Us Foreign Policy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Us Foreign Policy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.79, - "details": { - "description": "min=0.79, mean=0.79, max=0.79, sum=1.58 (2)", - "tab": "Accuracy", - "Us Foreign Policy - Observed inference time (s)": { - "description": "min=0.863, mean=0.863, max=0.863, sum=1.727 
(2)", - "tab": "Efficiency", - "score": 0.8633295917510986 - }, - "Us Foreign Policy - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Us Foreign Policy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Us Foreign Policy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Us Foreign Policy - # prompt tokens": { - "description": "min=473.19, mean=473.19, max=473.19, sum=946.38 (2)", - "tab": "General information", - "score": 473.19 - }, - "Us Foreign Policy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" - } - } - }, - { - "evaluation_name": "Astronomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Astronomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.599, - "details": { - "description": "min=0.599, mean=0.599, max=0.599, sum=1.197 (2)", - "tab": "Accuracy", - "Astronomy - Observed inference time (s)": { - "description": "min=0.804, mean=0.804, max=0.804, sum=1.608 (2)", - "tab": "Efficiency", - "score": 0.8039205105681169 - }, - "Astronomy - # eval": { - "description": "min=152, mean=152, max=152, sum=304 (2)", - "tab": "General information", - "score": 152.0 - }, - "Astronomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Astronomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Astronomy - # prompt tokens": { - "description": "min=665.987, mean=665.987, max=665.987, sum=1331.974 (2)", - "tab": "General information", - "score": 665.9868421052631 - }, - "Astronomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" - } - } - }, - { - "evaluation_name": "Business Ethics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Business Ethics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.56, - "details": { - "description": "min=0.56, mean=0.56, max=0.56, sum=1.12 (2)", - "tab": "Accuracy", - "Business Ethics - Observed inference time (s)": { - "description": "min=1.014, mean=1.014, max=1.014, sum=2.028 (2)", - "tab": "Efficiency", - "score": 1.013892731666565 - }, - "Business Ethics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - 
"Business Ethics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Business Ethics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Business Ethics - # prompt tokens": { - "description": "min=644.6, mean=644.6, max=644.6, sum=1289.2 (2)", - "tab": "General information", - "score": 644.6 - }, - "Business Ethics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" - } - } - }, - { - "evaluation_name": "Clinical Knowledge", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Clinical Knowledge", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.653, - "details": { - "description": "min=0.653, mean=0.653, max=0.653, sum=1.306 (2)", - "tab": "Accuracy", - "Clinical Knowledge - Observed inference time (s)": { - "description": "min=0.807, mean=0.807, max=0.807, sum=1.613 (2)", - "tab": "Efficiency", - "score": 0.8066773774488917 - }, - "Clinical Knowledge - # eval": { - "description": "min=265, mean=265, max=265, sum=530 (2)", - "tab": "General information", - "score": 265.0 - }, - "Clinical Knowledge - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Clinical Knowledge - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Clinical Knowledge - # prompt tokens": { - "description": "min=487.174, mean=487.174, max=487.174, sum=974.347 (2)", - "tab": "General information", - "score": 487.1735849056604 - }, - "Clinical Knowledge - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" - } - } - }, - { - "evaluation_name": "Conceptual Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Conceptual Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.451, - "details": { - "description": "min=0.451, mean=0.451, max=0.451, sum=0.902 (2)", - "tab": "Accuracy", - "Conceptual Physics - Observed inference time (s)": { - "description": "min=0.833, mean=0.833, max=0.833, sum=1.666 (2)", - "tab": "Efficiency", - "score": 0.833152520402949 - }, - "Conceptual Physics - # eval": { - "description": "min=235, mean=235, max=235, sum=470 (2)", - "tab": "General information", - "score": 235.0 - }, - "Conceptual Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - 
"score": 5.0 - }, - "Conceptual Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Conceptual Physics - # prompt tokens": { - "description": "min=334.285, mean=334.285, max=334.285, sum=668.57 (2)", - "tab": "General information", - "score": 334.2851063829787 - }, - "Conceptual Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" - } - } - }, - { - "evaluation_name": "Electrical Engineering", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Electrical Engineering", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.538, - "details": { - "description": "min=0.538, mean=0.538, max=0.538, sum=1.076 (2)", - "tab": "Accuracy", - "Electrical Engineering - Observed inference time (s)": { - "description": "min=1.234, mean=1.234, max=1.234, sum=2.468 (2)", - "tab": "Efficiency", - "score": 1.2342401932025777 - }, - "Electrical Engineering - # eval": { - "description": "min=145, mean=145, max=145, sum=290 (2)", - "tab": "General information", - "score": 145.0 - }, - "Electrical Engineering - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Electrical Engineering - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Electrical Engineering - # prompt tokens": { - "description": "min=501.379, mean=501.379, max=501.379, sum=1002.759 (2)", - "tab": "General information", - "score": 501.37931034482756 - }, - "Electrical Engineering - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" - } - } - }, - { - "evaluation_name": "Elementary Mathematics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Elementary Mathematics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.32, - "details": { - "description": "min=0.32, mean=0.32, max=0.32, sum=0.64 (2)", - "tab": "Accuracy", - "Elementary Mathematics - Observed inference time (s)": { - "description": "min=0.854, mean=0.854, max=0.854, sum=1.707 (2)", - "tab": "Efficiency", - "score": 0.8535163610700577 - }, - "Elementary Mathematics - # eval": { - "description": "min=378, mean=378, max=378, sum=756 (2)", - "tab": "General information", - "score": 378.0 - }, - "Elementary Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - 
"Elementary Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Elementary Mathematics - # prompt tokens": { - "description": "min=613.386, mean=613.386, max=613.386, sum=1226.772 (2)", - "tab": "General information", - "score": 613.3862433862433 - }, - "Elementary Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" - } - } - }, - { - "evaluation_name": "Formal Logic", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Formal Logic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.365, - "details": { - "description": "min=0.365, mean=0.365, max=0.365, sum=0.73 (2)", - "tab": "Accuracy", - "Formal Logic - Observed inference time (s)": { - "description": "min=1.022, mean=1.022, max=1.022, sum=2.044 (2)", - "tab": "Efficiency", - "score": 1.0218302371009949 - }, - "Formal Logic - # eval": { - "description": "min=126, mean=126, max=126, sum=252 (2)", - "tab": "General information", - "score": 126.0 - }, - "Formal Logic - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Formal Logic - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Formal Logic - # prompt tokens": { - "description": "min=718.984, mean=718.984, max=718.984, sum=1437.968 (2)", - "tab": "General information", - "score": 718.984126984127 - }, - "Formal Logic - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" - } - } - }, - { - "evaluation_name": "High School World History", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on High School World History", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.726, - "details": { - "description": "min=0.726, mean=0.726, max=0.726, sum=1.451 (2)", - "tab": "Accuracy", - "High School Biology - Observed inference time (s)": { - "description": "min=0.904, mean=0.904, max=0.904, sum=1.808 (2)", - "tab": "Efficiency", - "score": 0.9039220233117381 - }, - "High School Chemistry - Observed inference time (s)": { - "description": "min=0.891, mean=0.891, max=0.891, sum=1.782 (2)", - "tab": "Efficiency", - "score": 0.8910855988563575 - }, - "High School Computer Science - Observed inference time (s)": { - "description": "min=0.759, mean=0.759, max=0.759, sum=1.519 (2)", - "tab": "Efficiency", - "score": 0.7594162678718567 - }, - "High School European 
History - Observed inference time (s)": { - "description": "min=0.709, mean=0.709, max=0.709, sum=1.418 (2)", - "tab": "Efficiency", - "score": 0.7088880394444321 - }, - "High School Geography - Observed inference time (s)": { - "description": "min=0.909, mean=0.909, max=0.909, sum=1.818 (2)", - "tab": "Efficiency", - "score": 0.9091630006077314 - }, - "High School Government And Politics - Observed inference time (s)": { - "description": "min=0.665, mean=0.665, max=0.665, sum=1.329 (2)", - "tab": "Efficiency", - "score": 0.6645773976577996 - }, - "High School Macroeconomics - Observed inference time (s)": { - "description": "min=0.841, mean=0.841, max=0.841, sum=1.682 (2)", - "tab": "Efficiency", - "score": 0.8412165372799605 - }, - "High School Mathematics - Observed inference time (s)": { - "description": "min=0.767, mean=0.767, max=0.767, sum=1.534 (2)", - "tab": "Efficiency", - "score": 0.7671932847411544 - }, - "High School Microeconomics - Observed inference time (s)": { - "description": "min=0.995, mean=0.995, max=0.995, sum=1.99 (2)", - "tab": "Efficiency", - "score": 0.994775929370848 - }, - "High School Physics - Observed inference time (s)": { - "description": "min=0.902, mean=0.902, max=0.902, sum=1.805 (2)", - "tab": "Efficiency", - "score": 0.9024771317740939 - }, - "High School Psychology - Observed inference time (s)": { - "description": "min=0.793, mean=0.793, max=0.793, sum=1.585 (2)", - "tab": "Efficiency", - "score": 0.7925117606416755 - }, - "High School Statistics - Observed inference time (s)": { - "description": "min=0.884, mean=0.884, max=0.884, sum=1.768 (2)", - "tab": "Efficiency", - "score": 0.8837873924661566 - }, - "High School US History - Observed inference time (s)": { - "description": "min=1.171, mean=1.171, max=1.171, sum=2.341 (2)", - "tab": "Efficiency", - "score": 1.170638754087336 - }, - "High School World History - Observed inference time (s)": { - "description": "min=0.801, mean=0.801, max=0.801, sum=1.603 (2)", - "tab": "Efficiency", - "score": 0.8013244822055479 - }, - "High School Biology - # eval": { - "description": "min=310, mean=310, max=310, sum=620 (2)", - "tab": "General information", - "score": 310.0 - }, - "High School Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Biology - # prompt tokens": { - "description": "min=600.561, mean=600.561, max=600.561, sum=1201.123 (2)", - "tab": "General information", - "score": 600.5612903225806 - }, - "High School Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Chemistry - # eval": { - "description": "min=203, mean=203, max=203, sum=406 (2)", - "tab": "General information", - "score": 203.0 - }, - "High School Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # prompt tokens": { - "description": "min=572.798, mean=572.798, max=572.798, sum=1145.596 (2)", - "tab": "General information", - "score": 572.7980295566502 - }, - "High School Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - 
"tab": "General information", - "score": 1.0 - }, - "High School Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "High School Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # prompt tokens": { - "description": "min=988.24, mean=988.24, max=988.24, sum=1976.48 (2)", - "tab": "General information", - "score": 988.24 - }, - "High School Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School European History - # eval": { - "description": "min=165, mean=165, max=165, sum=330 (2)", - "tab": "General information", - "score": 165.0 - }, - "High School European History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School European History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School European History - # prompt tokens": { - "description": "min=3089.109, mean=3089.109, max=3089.109, sum=6178.218 (2)", - "tab": "General information", - "score": 3089.109090909091 - }, - "High School European History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Geography - # eval": { - "description": "min=198, mean=198, max=198, sum=396 (2)", - "tab": "General information", - "score": 198.0 - }, - "High School Geography - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Geography - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Geography - # prompt tokens": { - "description": "min=429.207, mean=429.207, max=429.207, sum=858.414 (2)", - "tab": "General information", - "score": 429.2070707070707 - }, - "High School Geography - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Government And Politics - # eval": { - "description": "min=193, mean=193, max=193, sum=386 (2)", - "tab": "General information", - "score": 193.0 - }, - "High School Government And Politics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Government And Politics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # prompt tokens": { - "description": "min=514.808, mean=514.808, max=514.808, sum=1029.617 (2)", - "tab": "General information", - "score": 514.8082901554404 - }, - "High School Government And Politics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Macroeconomics - # eval": { - "description": "min=390, mean=390, max=390, sum=780 (2)", - "tab": "General information", - "score": 390.0 - }, - "High School Macroeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 
(2)", - "tab": "General information", - "score": 5.0 - }, - "High School Macroeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - # prompt tokens": { - "description": "min=423.815, mean=423.815, max=423.815, sum=847.631 (2)", - "tab": "General information", - "score": 423.81538461538463 - }, - "High School Macroeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Mathematics - # eval": { - "description": "min=270, mean=270, max=270, sum=540 (2)", - "tab": "General information", - "score": 270.0 - }, - "High School Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # prompt tokens": { - "description": "min=584.13, mean=584.13, max=584.13, sum=1168.259 (2)", - "tab": "General information", - "score": 584.1296296296297 - }, - "High School Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Microeconomics - # eval": { - "description": "min=238, mean=238, max=238, sum=476 (2)", - "tab": "General information", - "score": 238.0 - }, - "High School Microeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Microeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Microeconomics - # prompt tokens": { - "description": "min=443.345, mean=443.345, max=443.345, sum=886.689 (2)", - "tab": "General information", - "score": 443.34453781512605 - }, - "High School Microeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Physics - # eval": { - "description": "min=151, mean=151, max=151, sum=302 (2)", - "tab": "General information", - "score": 151.0 - }, - "High School Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # prompt tokens": { - "description": "min=622.775, mean=622.775, max=622.775, sum=1245.55 (2)", - "tab": "General information", - "score": 622.774834437086 - }, - "High School Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Psychology - # eval": { - "description": "min=545, mean=545, max=545, sum=1090 (2)", - "tab": "General information", - "score": 545.0 - }, - "High School Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # prompt tokens": { - "description": "min=558.873, mean=558.873, max=558.873, sum=1117.747 (2)", - "tab": "General information", - "score": 
558.8733944954129 - }, - "High School Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Statistics - # eval": { - "description": "min=216, mean=216, max=216, sum=432 (2)", - "tab": "General information", - "score": 216.0 - }, - "High School Statistics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Statistics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Statistics - # prompt tokens": { - "description": "min=913.644, mean=913.644, max=913.644, sum=1827.287 (2)", - "tab": "General information", - "score": 913.6435185185185 - }, - "High School Statistics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School US History - # eval": { - "description": "min=204, mean=204, max=204, sum=408 (2)", - "tab": "General information", - "score": 204.0 - }, - "High School US History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School US History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # prompt tokens": { - "description": "min=2477.446, mean=2477.446, max=2477.446, sum=4954.892 (2)", - "tab": "General information", - "score": 2477.4460784313724 - }, - "High School US History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School World History - # eval": { - "description": "min=237, mean=237, max=237, sum=474 (2)", - "tab": "General information", - "score": 237.0 - }, - "High School World History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School World History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School World History - # prompt tokens": { - "description": "min=1585.553, mean=1585.553, max=1585.553, sum=3171.105 (2)", - "tab": "General information", - "score": 1585.5527426160338 - }, - "High School World History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" - } - } - }, - { - "evaluation_name": "Human Sexuality", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Human Sexuality", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.702, - "details": { - "description": "min=0.702, mean=0.702, max=0.702, sum=1.405 (2)", - "tab": "Accuracy", - "Human Aging - Observed inference time (s)": { - "description": "min=0.779, mean=0.779, max=0.779, sum=1.558 (2)", - "tab": "Efficiency", - "score": 0.778804096940387 - }, - "Human Sexuality - 
Observed inference time (s)": { - "description": "min=0.85, mean=0.85, max=0.85, sum=1.701 (2)", - "tab": "Efficiency", - "score": 0.8504140213245653 - }, - "Human Aging - # eval": { - "description": "min=223, mean=223, max=223, sum=446 (2)", - "tab": "General information", - "score": 223.0 - }, - "Human Aging - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Aging - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Aging - # prompt tokens": { - "description": "min=353.152, mean=353.152, max=353.152, sum=706.305 (2)", - "tab": "General information", - "score": 353.15246636771303 - }, - "Human Aging - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Human Sexuality - # eval": { - "description": "min=131, mean=131, max=131, sum=262 (2)", - "tab": "General information", - "score": 131.0 - }, - "Human Sexuality - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Sexuality - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # prompt tokens": { - "description": "min=394.748, mean=394.748, max=394.748, sum=789.496 (2)", - "tab": "General information", - "score": 394.7480916030534 - }, - "Human Sexuality - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" - } - } - }, - { - "evaluation_name": "International Law", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on International Law", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.76, - "details": { - "description": "min=0.76, mean=0.76, max=0.76, sum=1.521 (2)", - "tab": "Accuracy", - "International Law - Observed inference time (s)": { - "description": "min=0.91, mean=0.91, max=0.91, sum=1.82 (2)", - "tab": "Efficiency", - "score": 0.9102441850772574 - }, - "International Law - # eval": { - "description": "min=121, mean=121, max=121, sum=242 (2)", - "tab": "General information", - "score": 121.0 - }, - "International Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "International Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "International Law - # prompt tokens": { - "description": "min=720.182, mean=720.182, max=720.182, sum=1440.364 (2)", - "tab": "General information", - "score": 720.1818181818181 - }, - "International Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" - } - } - }, 
- { - "evaluation_name": "Logical Fallacies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Logical Fallacies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.693, - "details": { - "description": "min=0.693, mean=0.693, max=0.693, sum=1.387 (2)", - "tab": "Accuracy", - "Logical Fallacies - Observed inference time (s)": { - "description": "min=0.806, mean=0.806, max=0.806, sum=1.613 (2)", - "tab": "Efficiency", - "score": 0.8063952381625498 - }, - "Logical Fallacies - # eval": { - "description": "min=163, mean=163, max=163, sum=326 (2)", - "tab": "General information", - "score": 163.0 - }, - "Logical Fallacies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Logical Fallacies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Logical Fallacies - # prompt tokens": { - "description": "min=486.779, mean=486.779, max=486.779, sum=973.558 (2)", - "tab": "General information", - "score": 486.77914110429447 - }, - "Logical Fallacies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" - } - } - }, - { - "evaluation_name": "Machine Learning", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Machine Learning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.438, - "details": { - "description": "min=0.438, mean=0.438, max=0.438, sum=0.875 (2)", - "tab": "Accuracy", - "Machine Learning - Observed inference time (s)": { - "description": "min=0.751, mean=0.751, max=0.751, sum=1.503 (2)", - "tab": "Efficiency", - "score": 0.7514570632151195 - }, - "Machine Learning - # eval": { - "description": "min=112, mean=112, max=112, sum=224 (2)", - "tab": "General information", - "score": 112.0 - }, - "Machine Learning - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Machine Learning - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Machine Learning - # prompt tokens": { - "description": "min=734.83, mean=734.83, max=734.83, sum=1469.661 (2)", - "tab": "General information", - "score": 734.8303571428571 - }, - "Machine Learning - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" - } - } - }, - { - "evaluation_name": "Management", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - 
"url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Management", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.709, - "details": { - "description": "min=0.709, mean=0.709, max=0.709, sum=1.417 (2)", - "tab": "Accuracy", - "Management - Observed inference time (s)": { - "description": "min=0.934, mean=0.934, max=0.934, sum=1.868 (2)", - "tab": "Efficiency", - "score": 0.9339890294862025 - }, - "Management - # eval": { - "description": "min=103, mean=103, max=103, sum=206 (2)", - "tab": "General information", - "score": 103.0 - }, - "Management - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Management - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Management - # prompt tokens": { - "description": "min=315.359, mean=315.359, max=315.359, sum=630.718 (2)", - "tab": "General information", - "score": 315.3592233009709 - }, - "Management - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" - } - } - }, - { - "evaluation_name": "Marketing", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Marketing", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.833, - "details": { - "description": "min=0.833, mean=0.833, max=0.833, sum=1.667 (2)", - "tab": "Accuracy", - "Marketing - Observed inference time (s)": { - "description": "min=1.072, mean=1.072, max=1.072, sum=2.144 (2)", - "tab": "Efficiency", - "score": 1.0717963163669293 - }, - "Marketing - # eval": { - "description": "min=234, mean=234, max=234, sum=468 (2)", - "tab": "General information", - "score": 234.0 - }, - "Marketing - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Marketing - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Marketing - # prompt tokens": { - "description": "min=463.423, mean=463.423, max=463.423, sum=926.846 (2)", - "tab": "General information", - "score": 463.4230769230769 - }, - "Marketing - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" - } - } - }, - { - "evaluation_name": "Medical Genetics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Medical Genetics", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.68, - "details": { - "description": "min=0.68, mean=0.68, max=0.68, sum=1.36 (2)", - "tab": "Accuracy", - "Medical Genetics - Observed inference time (s)": { - "description": "min=0.929, mean=0.929, max=0.929, sum=1.859 (2)", - "tab": "Efficiency", - "score": 0.9293915629386902 - }, - "Medical Genetics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Medical Genetics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Medical Genetics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Medical Genetics - # prompt tokens": { - "description": "min=405.71, mean=405.71, max=405.71, sum=811.42 (2)", - "tab": "General information", - "score": 405.71 - }, - "Medical Genetics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" - } - } - }, - { - "evaluation_name": "Miscellaneous", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Miscellaneous", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.72, - "details": { - "description": "min=0.72, mean=0.72, max=0.72, sum=1.441 (2)", - "tab": "Accuracy", - "Miscellaneous - Observed inference time (s)": { - "description": "min=0.874, mean=0.874, max=0.874, sum=1.747 (2)", - "tab": "Efficiency", - "score": 0.8736470007500582 - }, - "Miscellaneous - # eval": { - "description": "min=783, mean=783, max=783, sum=1566 (2)", - "tab": "General information", - "score": 783.0 - }, - "Miscellaneous - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Miscellaneous - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Miscellaneous - # prompt tokens": { - "description": "min=348.519, mean=348.519, max=348.519, sum=697.037 (2)", - "tab": "General information", - "score": 348.51851851851853 - }, - "Miscellaneous - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" - } - } - }, - { - "evaluation_name": "Moral Scenarios", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Moral Scenarios", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.33, - "details": { - "description": "min=0.33, mean=0.33, max=0.33, 
sum=0.659 (2)", - "tab": "Accuracy", - "Moral Disputes - Observed inference time (s)": { - "description": "min=0.772, mean=0.772, max=0.772, sum=1.545 (2)", - "tab": "Efficiency", - "score": 0.7723477258847627 - }, - "Moral Scenarios - Observed inference time (s)": { - "description": "min=0.887, mean=0.887, max=0.887, sum=1.774 (2)", - "tab": "Efficiency", - "score": 0.8867556284259818 - }, - "Moral Disputes - # eval": { - "description": "min=346, mean=346, max=346, sum=692 (2)", - "tab": "General information", - "score": 346.0 - }, - "Moral Disputes - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Disputes - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Disputes - # prompt tokens": { - "description": "min=540.038, mean=540.038, max=540.038, sum=1080.075 (2)", - "tab": "General information", - "score": 540.0375722543353 - }, - "Moral Disputes - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Moral Scenarios - # eval": { - "description": "min=895, mean=895, max=895, sum=1790 (2)", - "tab": "General information", - "score": 895.0 - }, - "Moral Scenarios - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Scenarios - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # prompt tokens": { - "description": "min=745.516, mean=745.516, max=745.516, sum=1491.032 (2)", - "tab": "General information", - "score": 745.5162011173185 - }, - "Moral Scenarios - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" - } - } - }, - { - "evaluation_name": "Nutrition", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Nutrition", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.657, - "details": { - "description": "min=0.657, mean=0.657, max=0.657, sum=1.314 (2)", - "tab": "Accuracy", - "Nutrition - Observed inference time (s)": { - "description": "min=0.982, mean=0.982, max=0.982, sum=1.964 (2)", - "tab": "Efficiency", - "score": 0.9817679053038554 - }, - "Nutrition - # eval": { - "description": "min=306, mean=306, max=306, sum=612 (2)", - "tab": "General information", - "score": 306.0 - }, - "Nutrition - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Nutrition - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Nutrition - # prompt tokens": { - "description": "min=680.69, mean=680.69, max=680.69, sum=1361.379 (2)", - "tab": "General information", - "score": 680.6895424836601 - }, - "Nutrition - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - 
}, - "generation_config": { - "additional_details": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" - } - } - }, - { - "evaluation_name": "Prehistory", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Prehistory", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.642, - "details": { - "description": "min=0.642, mean=0.642, max=0.642, sum=1.284 (2)", - "tab": "Accuracy", - "Prehistory - Observed inference time (s)": { - "description": "min=0.752, mean=0.752, max=0.752, sum=1.505 (2)", - "tab": "Efficiency", - "score": 0.7522576863383069 - }, - "Prehistory - # eval": { - "description": "min=324, mean=324, max=324, sum=648 (2)", - "tab": "General information", - "score": 324.0 - }, - "Prehistory - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Prehistory - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Prehistory - # prompt tokens": { - "description": "min=602.145, mean=602.145, max=602.145, sum=1204.29 (2)", - "tab": "General information", - "score": 602.145061728395 - }, - "Prehistory - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" - } - } - }, - { - "evaluation_name": "Public Relations", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Public Relations", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6, - "details": { - "description": "min=0.6, mean=0.6, max=0.6, sum=1.2 (2)", - "tab": "Accuracy", - "Public Relations - Observed inference time (s)": { - "description": "min=1.121, mean=1.121, max=1.121, sum=2.241 (2)", - "tab": "Efficiency", - "score": 1.120634336905046 - }, - "Public Relations - # eval": { - "description": "min=110, mean=110, max=110, sum=220 (2)", - "tab": "General information", - "score": 110.0 - }, - "Public Relations - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Public Relations - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Public Relations - # prompt tokens": { - "description": "min=462.036, mean=462.036, max=462.036, sum=924.073 (2)", - "tab": "General information", - "score": 462.03636363636366 - }, - "Public Relations - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" - } - } - }, - { - 
"evaluation_name": "Security Studies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Security Studies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.731, - "details": { - "description": "min=0.731, mean=0.731, max=0.731, sum=1.461 (2)", - "tab": "Accuracy", - "Security Studies - Observed inference time (s)": { - "description": "min=0.837, mean=0.837, max=0.837, sum=1.674 (2)", - "tab": "Efficiency", - "score": 0.8369822920585165 - }, - "Security Studies - # eval": { - "description": "min=245, mean=245, max=245, sum=490 (2)", - "tab": "General information", - "score": 245.0 - }, - "Security Studies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Security Studies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Security Studies - # prompt tokens": { - "description": "min=1315.865, mean=1315.865, max=1315.865, sum=2631.731 (2)", - "tab": "General information", - "score": 1315.865306122449 - }, - "Security Studies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" - } - } - }, - { - "evaluation_name": "Sociology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Sociology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.831, - "details": { - "description": "min=0.831, mean=0.831, max=0.831, sum=1.662 (2)", - "tab": "Accuracy", - "Sociology - Observed inference time (s)": { - "description": "min=0.909, mean=0.909, max=0.909, sum=1.819 (2)", - "tab": "Efficiency", - "score": 0.9092605125844775 - }, - "Sociology - # eval": { - "description": "min=201, mean=201, max=201, sum=402 (2)", - "tab": "General information", - "score": 201.0 - }, - "Sociology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Sociology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Sociology - # prompt tokens": { - "description": "min=487.95, mean=487.95, max=487.95, sum=975.9 (2)", - "tab": "General information", - "score": 487.9502487562189 - }, - "Sociology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" - } - } - }, - { - "evaluation_name": "Virology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Virology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.44, - "details": { - "description": "min=0.44, mean=0.44, max=0.44, sum=0.88 (2)", - "tab": "Accuracy", - "Virology - Observed inference time (s)": { - "description": "min=0.885, mean=0.885, max=0.885, sum=1.771 (2)", - "tab": "Efficiency", - "score": 0.8854893704494798 - }, - "Virology - # eval": { - "description": "min=166, mean=166, max=166, sum=332 (2)", - "tab": "General information", - "score": 166.0 - }, - "Virology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Virology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Virology - # prompt tokens": { - "description": "min=395.349, mean=395.349, max=395.349, sum=790.699 (2)", - "tab": "General information", - "score": 395.34939759036143 - }, - "Virology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" - } - } - }, - { - "evaluation_name": "World Religions", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on World Religions", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.789, - "details": { - "description": "min=0.789, mean=0.789, max=0.789, sum=1.579 (2)", - "tab": "Accuracy", - "World Religions - Observed inference time (s)": { - "description": "min=0.863, mean=0.863, max=0.863, sum=1.726 (2)", - "tab": "Efficiency", - "score": 0.8629393619403505 - }, - "World Religions - # eval": { - "description": "min=171, mean=171, max=171, sum=342 (2)", - "tab": "General information", - "score": 171.0 - }, - "World Religions - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "World Religions - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "World Religions - # prompt tokens": { - "description": "min=308.924, mean=308.924, max=308.924, sum=617.848 (2)", - "tab": "General information", - "score": 308.92397660818716 - }, - "World Religions - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" - } - } - }, - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms 
on average (over columns).",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.213,
-        "details": {
-          "tab": "Efficiency"
-        }
-      },
-      "generation_config": {
-        "additional_details": {}
-      }
-    }
-  ]
-}
\ No newline at end of file
diff --git a/data/helm_mmlu/mistralai/mistral-large-2402/2d18fd88-73b5-4d4c-a1cc-e66a20316605.json b/data/helm_mmlu/mistralai/mistral-large-2402/2d18fd88-73b5-4d4c-a1cc-e66a20316605.json
deleted file mode 100644
index 6b7873124..000000000
--- a/data/helm_mmlu/mistralai/mistral-large-2402/2d18fd88-73b5-4d4c-a1cc-e66a20316605.json
+++ /dev/null
@@ -1,3021 +0,0 @@
-{
-  "schema_version": "0.2.0",
-  "evaluation_id": "helm_mmlu/mistralai_mistral-large-2402/1770835937.459157",
-  "retrieved_timestamp": "1770835937.459157",
-  "source_metadata": {
-    "source_name": "helm_mmlu",
-    "source_type": "documentation",
-    "source_organization_name": "crfm",
-    "evaluator_relationship": "third_party"
-  },
-  "model_info": {
-    "name": "Mistral Large 2402",
-    "id": "mistralai/mistral-large-2402",
-    "developer": "mistralai",
-    "inference_platform": "unknown"
-  },
-  "evaluation_results": [
-    {
-      "evaluation_name": "MMLU All Subjects",
-      "source_data": {
-        "dataset_name": "helm_mmlu",
-        "source_type": "url",
-        "url": [
-          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
-        ]
-      },
-      "metric_config": {
-        "evaluation_description": "EM on MMLU All Subjects",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.688,
-        "details": {
-          "description": "min=0.211, mean=0.688, max=0.964, sum=78.413 (114)",
-          "tab": "Accuracy",
-          "MMLU All Subjects - Observed inference time (s)": {
-            "description": "min=0.356, mean=0.546, max=1.633, sum=62.26 (114)",
-            "tab": "Efficiency",
-            "score": 0.5461372164599003
-          },
-          "MMLU All Subjects - # eval": {
-            "description": "min=100, mean=246.351, max=1534, sum=28084 (114)",
-            "tab": "General information",
-            "score": 246.35087719298247
-          },
-          "MMLU All Subjects - # train": {
-            "description": "min=5, mean=5, max=5, sum=570 (114)",
-            "tab": "General information",
-            "score": 5.0
-          },
-          "MMLU All Subjects - truncated": {
-            "description": "min=0, mean=0, max=0, sum=0 (114)",
-            "tab": "General information",
-            "score": 0.0
-          },
-          "MMLU All Subjects - # prompt tokens": {
-            "description": "min=308.924, mean=696.273, max=3089.109, sum=79375.178 (114)",
-            "tab": "General information",
-            "score": 696.2734899593811
-          },
-          "MMLU All Subjects - # output tokens": {
-            "description": "min=1, mean=1, max=1, sum=114 (114)",
-            "tab": "General information",
-            "score": 1.0
-          }
-        }
-      },
-      "generation_config": {
-        "additional_details": {
-          "subject": [
-            "abstract_algebra",
-            "anatomy",
-            "astronomy",
-            "business_ethics",
-            "clinical_knowledge",
-            "college_biology",
-            "college_chemistry",
-            "college_computer_science",
-            "college_mathematics",
-            "college_medicine",
-            "college_physics",
-            "computer_security",
-            "conceptual_physics",
-            "econometrics",
-            "electrical_engineering",
-            "elementary_mathematics",
-            "formal_logic",
-            "global_facts",
-            "high_school_biology",
-            "high_school_chemistry",
-            "high_school_computer_science",
-            "high_school_european_history",
-            "high_school_geography",
-            "high_school_government_and_politics",
-            "high_school_macroeconomics",
-            "high_school_mathematics",
-            "high_school_microeconomics",
-            "high_school_physics",
-            "high_school_psychology",
-            "high_school_statistics",
-            "high_school_us_history",
-            "high_school_world_history",
-            "human_aging",
-            "human_sexuality",
-            "international_law",
-            "jurisprudence",
-            "logical_fallacies",
-            "machine_learning",
-            "management",
-            "marketing",
-            "medical_genetics",
-            "miscellaneous",
-            "moral_disputes",
-            "moral_scenarios",
-            "nutrition",
-            "philosophy",
-            "prehistory",
-            "professional_accounting",
-            "professional_law",
-            "professional_medicine",
-            "professional_psychology",
-            "public_relations",
-            "security_studies",
-            "sociology",
-            "us_foreign_policy",
-            "virology",
-            "world_religions"
-          ],
-          "method": "multiple_choice_joint",
-          "eval_split": "test",
-          "groups": [
-            "mmlu_abstract_algebra",
-            "mmlu_anatomy",
-            "mmlu_astronomy",
-            "mmlu_business_ethics",
-            "mmlu_clinical_knowledge",
-            "mmlu_college_biology",
-            "mmlu_college_chemistry",
-            "mmlu_college_computer_science",
-            "mmlu_college_mathematics",
-            "mmlu_college_medicine",
-            "mmlu_college_physics",
-            "mmlu_computer_security",
-            "mmlu_conceptual_physics",
-            "mmlu_econometrics",
-            "mmlu_electrical_engineering",
-            "mmlu_elementary_mathematics",
-            "mmlu_formal_logic",
-            "mmlu_global_facts",
-            "mmlu_high_school_biology",
-            "mmlu_high_school_chemistry",
-            "mmlu_high_school_computer_science",
-            "mmlu_high_school_european_history",
-            "mmlu_high_school_geography",
-            "mmlu_high_school_government_and_politics",
-            "mmlu_high_school_macroeconomics",
-            "mmlu_high_school_mathematics",
-            "mmlu_high_school_microeconomics",
-            "mmlu_high_school_physics",
-            "mmlu_high_school_psychology",
-            "mmlu_high_school_statistics",
-            "mmlu_high_school_us_history",
-            "mmlu_high_school_world_history",
-            "mmlu_human_aging",
-            "mmlu_human_sexuality",
-            "mmlu_international_law",
-            "mmlu_jurisprudence",
-            "mmlu_logical_fallacies",
-            "mmlu_machine_learning",
-            "mmlu_management",
-            "mmlu_marketing",
-            "mmlu_medical_genetics",
-            "mmlu_miscellaneous",
-            "mmlu_moral_disputes",
-            "mmlu_moral_scenarios",
-            "mmlu_nutrition",
-            "mmlu_philosophy",
-            "mmlu_prehistory",
-            "mmlu_professional_accounting",
-            "mmlu_professional_law",
-            "mmlu_professional_medicine",
-            "mmlu_professional_psychology",
-            "mmlu_public_relations",
-            "mmlu_security_studies",
-            "mmlu_sociology",
-            "mmlu_us_foreign_policy",
-            "mmlu_virology",
-            "mmlu_world_religions"
-          ]
-        }
-      }
-    },
-    {
-      "evaluation_name": "Abstract Algebra",
-      "source_data": {
-        "dataset_name": "helm_mmlu",
-        "source_type": "url",
-        "url": [
-          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
-        ]
-      },
-      "metric_config": {
-        "evaluation_description": "EM on Abstract Algebra",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.45,
-        "details": {
-          "description": "min=0.45, mean=0.45, max=0.45, sum=0.9 (2)",
-          "tab": "Accuracy",
-          "Abstract Algebra - Observed inference time (s)": {
-            "description": "min=1.48, mean=1.48, max=1.48, sum=2.959 (2)",
-            "tab": "Efficiency",
-            "score": 1.4797466564178468
-          },
-          "Abstract Algebra - # eval": {
-            "description": "min=100, mean=100, max=100, sum=200 (2)",
-            "tab": "General information",
-            "score": 100.0
-          },
-          "Abstract Algebra - # train": {
-            "description": "min=5, mean=5, max=5, sum=10 (2)",
-            "tab": "General information",
-            "score": 5.0
-          },
-          "Abstract Algebra - truncated": {
-            "description": "min=0, mean=0, max=0, sum=0 (2)",
-            "tab": "General information",
-            "score": 0.0
-          },
-          "Abstract Algebra - # prompt tokens": {
-            "description": "min=402.44, mean=402.44, max=402.44, sum=804.88 (2)",
-            "tab": "General information",
-            "score": 402.44
-          },
-          "Abstract Algebra - # output tokens": {
-            "description": "min=1, mean=1, max=1, sum=2 (2)",
-            "tab": "General information",
-            "score": 1.0
-          }
-        }
-      },
-      "generation_config": {
-        "additional_details": {
-          "subject": "abstract_algebra",
-          "method": "multiple_choice_joint",
-          "eval_split": "test",
-          "groups": "mmlu_abstract_algebra"
-        }
-      }
-    },
-    {
-      "evaluation_name": "Anatomy",
-      "source_data": {
-        "dataset_name": "helm_mmlu",
-        "source_type": "url",
-        "url": [
-          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
-        ]
-      },
-      "metric_config": {
-        "evaluation_description": "EM on Anatomy",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.674,
-        "details": {
-          "description": "min=0.674, mean=0.674, max=0.674, sum=1.348 (2)",
-          "tab": "Accuracy",
-          "Anatomy - Observed inference time (s)": {
-            "description": "min=0.484, mean=0.484, max=0.484, sum=0.968 (2)",
-            "tab": "Efficiency",
-            "score": 0.4840934417865895
-          },
-          "Anatomy - # eval": {
-            "description": "min=135, mean=135, max=135, sum=270 (2)",
-            "tab": "General information",
-            "score": 135.0
-          },
-          "Anatomy - # train": {
-            "description": "min=5, mean=5, max=5, sum=10 (2)",
-            "tab": "General information",
-            "score": 5.0
-          },
-          "Anatomy - truncated": {
-            "description": "min=0, mean=0, max=0, sum=0 (2)",
-            "tab": "General information",
-            "score": 0.0
-          },
-          "Anatomy - # prompt tokens": {
-            "description": "min=407.089, mean=407.089, max=407.089, sum=814.178 (2)",
-            "tab": "General information",
-            "score": 407.0888888888889
-          },
-          "Anatomy - # output tokens": {
-            "description": "min=1, mean=1, max=1, sum=2 (2)",
-            "tab": "General information",
-            "score": 1.0
-          }
-        }
-      },
-      "generation_config": {
-        "additional_details": {
-          "subject": "anatomy",
-          "method": "multiple_choice_joint",
-          "eval_split": "test",
-          "groups": "mmlu_anatomy"
-        }
-      }
-    },
-    {
-      "evaluation_name": "College Physics",
-      "source_data": {
-        "dataset_name": "helm_mmlu",
-        "source_type": "url",
-        "url": [
-          "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
-        ]
-      },
-      "metric_config": {
-        "evaluation_description": "EM on College Physics",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.373,
-        "details": {
-          "description": "min=0.373, mean=0.373, max=0.373, sum=0.745 (2)",
-          "tab": "Accuracy",
-          "College Chemistry - Observed inference time (s)": {
-            "description": "min=1.226, mean=1.226, max=1.226, sum=2.452 (2)",
-            "tab": "Efficiency",
-            "score": 1.2259348821640015
-          },
-          "College Biology - Observed inference time (s)": {
-            "description": "min=0.438, mean=0.438, max=0.438, sum=0.875 (2)",
-            "tab": "Efficiency",
-            "score": 0.43758388525909847
-          },
-          "College Computer Science - Observed inference time (s)": {
-            "description": "min=0.412, mean=0.412, max=0.412, sum=0.825 (2)",
-            "tab": "Efficiency",
-            "score": 0.41238118410110475
-          },
-          "College Mathematics - Observed inference time (s)": {
-            "description": "min=0.443, mean=0.443, max=0.443, sum=0.886 (2)",
-            "tab": "Efficiency",
-            "score": 0.44315950393676756
-          },
-          "College Medicine - Observed inference time (s)": {
-            "description": "min=0.425, mean=0.425, max=0.425, sum=0.849 (2)",
-            "tab": "Efficiency",
-            "score": 0.4246950163317554
-
}, - "College Physics - Observed inference time (s)": { - "description": "min=0.511, mean=0.511, max=0.511, sum=1.021 (2)", - "tab": "Efficiency", - "score": 0.510722931693582 - }, - "College Chemistry - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Chemistry - # prompt tokens": { - "description": "min=627.71, mean=627.71, max=627.71, sum=1255.42 (2)", - "tab": "General information", - "score": 627.71 - }, - "College Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Biology - # eval": { - "description": "min=144, mean=144, max=144, sum=288 (2)", - "tab": "General information", - "score": 144.0 - }, - "College Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # prompt tokens": { - "description": "min=550.799, mean=550.799, max=550.799, sum=1101.597 (2)", - "tab": "General information", - "score": 550.7986111111111 - }, - "College Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer Science - # prompt tokens": { - "description": "min=902.17, mean=902.17, max=902.17, sum=1804.34 (2)", - "tab": "General information", - "score": 902.17 - }, - "College Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Mathematics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # prompt tokens": { - "description": "min=658.31, mean=658.31, max=658.31, sum=1316.62 (2)", - "tab": "General information", - "score": 658.31 - }, - "College Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Medicine - # eval": { - "description": "min=173, mean=173, max=173, sum=346 (2)", - "tab": "General information", - "score": 173.0 - }, - "College Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Medicine - truncated": { - 
"description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Medicine - # prompt tokens": { - "description": "min=592.41, mean=592.41, max=592.41, sum=1184.821 (2)", - "tab": "General information", - "score": 592.4104046242775 - }, - "College Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Physics - # eval": { - "description": "min=102, mean=102, max=102, sum=204 (2)", - "tab": "General information", - "score": 102.0 - }, - "College Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Physics - # prompt tokens": { - "description": "min=551.029, mean=551.029, max=551.029, sum=1102.059 (2)", - "tab": "General information", - "score": 551.0294117647059 - }, - "College Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" - } - } - }, - { - "evaluation_name": "Computer Security", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Computer Security", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8, - "details": { - "description": "min=0.8, mean=0.8, max=0.8, sum=1.6 (2)", - "tab": "Accuracy", - "Computer Security - Observed inference time (s)": { - "description": "min=1.535, mean=1.535, max=1.535, sum=3.071 (2)", - "tab": "Efficiency", - "score": 1.5353856110572814 - }, - "Computer Security - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Computer Security - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Computer Security - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Computer Security - # prompt tokens": { - "description": "min=424.94, mean=424.94, max=424.94, sum=849.88 (2)", - "tab": "General information", - "score": 424.94 - }, - "Computer Security - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" - } - } - }, - { - "evaluation_name": "Econometrics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Econometrics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.64, - 
"details": { - "description": "min=0.64, mean=0.64, max=0.64, sum=1.281 (2)", - "tab": "Accuracy", - "Econometrics - Observed inference time (s)": { - "description": "min=1.383, mean=1.383, max=1.383, sum=2.766 (2)", - "tab": "Efficiency", - "score": 1.382804548531248 - }, - "Econometrics - # eval": { - "description": "min=114, mean=114, max=114, sum=228 (2)", - "tab": "General information", - "score": 114.0 - }, - "Econometrics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Econometrics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Econometrics - # prompt tokens": { - "description": "min=687.175, mean=687.175, max=687.175, sum=1374.351 (2)", - "tab": "General information", - "score": 687.1754385964912 - }, - "Econometrics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" - } - } - }, - { - "evaluation_name": "Global Facts", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Global Facts", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.34, - "details": { - "description": "min=0.34, mean=0.34, max=0.34, sum=0.68 (2)", - "tab": "Accuracy", - "Global Facts - Observed inference time (s)": { - "description": "min=0.492, mean=0.492, max=0.492, sum=0.984 (2)", - "tab": "Efficiency", - "score": 0.49177081823349 - }, - "Global Facts - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Global Facts - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Global Facts - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Global Facts - # prompt tokens": { - "description": "min=483.47, mean=483.47, max=483.47, sum=966.94 (2)", - "tab": "General information", - "score": 483.47 - }, - "Global Facts - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" - } - } - }, - { - "evaluation_name": "Jurisprudence", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Jurisprudence", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.815, - "details": { - "description": "min=0.815, mean=0.815, max=0.815, sum=1.63 (2)", - "tab": "Accuracy", - "Jurisprudence - Observed inference time (s)": { - "description": "min=0.505, mean=0.505, max=0.505, sum=1.01 (2)", - "tab": 
"Efficiency", - "score": 0.5051956353364168 - }, - "Jurisprudence - # eval": { - "description": "min=108, mean=108, max=108, sum=216 (2)", - "tab": "General information", - "score": 108.0 - }, - "Jurisprudence - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Jurisprudence - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Jurisprudence - # prompt tokens": { - "description": "min=451.093, mean=451.093, max=451.093, sum=902.185 (2)", - "tab": "General information", - "score": 451.0925925925926 - }, - "Jurisprudence - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" - } - } - }, - { - "evaluation_name": "Philosophy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Philosophy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.794, - "details": { - "description": "min=0.794, mean=0.794, max=0.794, sum=1.588 (2)", - "tab": "Accuracy", - "Philosophy - Observed inference time (s)": { - "description": "min=0.506, mean=0.506, max=0.506, sum=1.011 (2)", - "tab": "Efficiency", - "score": 0.5055920081123279 - }, - "Philosophy - # eval": { - "description": "min=311, mean=311, max=311, sum=622 (2)", - "tab": "General information", - "score": 311.0 - }, - "Philosophy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Philosophy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Philosophy - # prompt tokens": { - "description": "min=373.82, mean=373.82, max=373.82, sum=747.64 (2)", - "tab": "General information", - "score": 373.81993569131834 - }, - "Philosophy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" - } - } - }, - { - "evaluation_name": "Professional Psychology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Professional Psychology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.809, - "details": { - "description": "min=0.809, mean=0.809, max=0.809, sum=1.618 (2)", - "tab": "Accuracy", - "Professional Medicine - Observed inference time (s)": { - "description": "min=0.478, mean=0.478, max=0.478, sum=0.956 (2)", - "tab": "Efficiency", - "score": 0.4777693476747064 - }, - "Professional Accounting - Observed inference time (s)": { - "description": "min=0.443, mean=0.443, max=0.443, sum=0.886 (2)", - "tab": 
"Efficiency", - "score": 0.4430855546437257 - }, - "Professional Law - Observed inference time (s)": { - "description": "min=0.493, mean=0.493, max=0.493, sum=0.987 (2)", - "tab": "Efficiency", - "score": 0.4934647888372588 - }, - "Professional Psychology - Observed inference time (s)": { - "description": "min=0.483, mean=0.483, max=0.483, sum=0.966 (2)", - "tab": "Efficiency", - "score": 0.4830952575004179 - }, - "Professional Medicine - # eval": { - "description": "min=272, mean=272, max=272, sum=544 (2)", - "tab": "General information", - "score": 272.0 - }, - "Professional Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Medicine - # prompt tokens": { - "description": "min=1279.143, mean=1279.143, max=1279.143, sum=2558.287 (2)", - "tab": "General information", - "score": 1279.1433823529412 - }, - "Professional Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Accounting - # eval": { - "description": "min=282, mean=282, max=282, sum=564 (2)", - "tab": "General information", - "score": 282.0 - }, - "Professional Accounting - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Accounting - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Accounting - # prompt tokens": { - "description": "min=796.496, mean=796.496, max=796.496, sum=1592.993 (2)", - "tab": "General information", - "score": 796.4964539007092 - }, - "Professional Accounting - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Law - # eval": { - "description": "min=1534, mean=1534, max=1534, sum=3068 (2)", - "tab": "General information", - "score": 1534.0 - }, - "Professional Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # prompt tokens": { - "description": "min=1849.711, mean=1849.711, max=1849.711, sum=3699.421 (2)", - "tab": "General information", - "score": 1849.7105606258149 - }, - "Professional Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Psychology - # eval": { - "description": "min=612, mean=612, max=612, sum=1224 (2)", - "tab": "General information", - "score": 612.0 - }, - "Professional Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # prompt tokens": { - "description": "min=645.278, mean=645.278, max=645.278, sum=1290.556 (2)", - "tab": "General information", - "score": 645.2777777777778 - }, - "Professional Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - 
} - }, - "generation_config": { - "additional_details": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" - } - } - }, - { - "evaluation_name": "Us Foreign Policy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Us Foreign Policy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.92, - "details": { - "description": "min=0.92, mean=0.92, max=0.92, sum=1.84 (2)", - "tab": "Accuracy", - "Us Foreign Policy - Observed inference time (s)": { - "description": "min=1.633, mean=1.633, max=1.633, sum=3.266 (2)", - "tab": "Efficiency", - "score": 1.6332264852523803 - }, - "Us Foreign Policy - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Us Foreign Policy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Us Foreign Policy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Us Foreign Policy - # prompt tokens": { - "description": "min=473.19, mean=473.19, max=473.19, sum=946.38 (2)", - "tab": "General information", - "score": 473.19 - }, - "Us Foreign Policy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" - } - } - }, - { - "evaluation_name": "Astronomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Astronomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.842, - "details": { - "description": "min=0.842, mean=0.842, max=0.842, sum=1.684 (2)", - "tab": "Accuracy", - "Astronomy - Observed inference time (s)": { - "description": "min=0.45, mean=0.45, max=0.45, sum=0.901 (2)", - "tab": "Efficiency", - "score": 0.4503253243471447 - }, - "Astronomy - # eval": { - "description": "min=152, mean=152, max=152, sum=304 (2)", - "tab": "General information", - "score": 152.0 - }, - "Astronomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Astronomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Astronomy - # prompt tokens": { - "description": "min=665.987, mean=665.987, max=665.987, sum=1331.974 (2)", - "tab": "General information", - "score": 665.9868421052631 - }, - "Astronomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" - } - 
} - }, - { - "evaluation_name": "Business Ethics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Business Ethics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.67, - "details": { - "description": "min=0.67, mean=0.67, max=0.67, sum=1.34 (2)", - "tab": "Accuracy", - "Business Ethics - Observed inference time (s)": { - "description": "min=0.411, mean=0.411, max=0.411, sum=0.821 (2)", - "tab": "Efficiency", - "score": 0.4105031824111938 - }, - "Business Ethics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Business Ethics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Business Ethics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Business Ethics - # prompt tokens": { - "description": "min=644.6, mean=644.6, max=644.6, sum=1289.2 (2)", - "tab": "General information", - "score": 644.6 - }, - "Business Ethics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" - } - } - }, - { - "evaluation_name": "Clinical Knowledge", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Clinical Knowledge", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.751, - "details": { - "description": "min=0.751, mean=0.751, max=0.751, sum=1.502 (2)", - "tab": "Accuracy", - "Clinical Knowledge - Observed inference time (s)": { - "description": "min=0.521, mean=0.521, max=0.521, sum=1.042 (2)", - "tab": "Efficiency", - "score": 0.5210292402303444 - }, - "Clinical Knowledge - # eval": { - "description": "min=265, mean=265, max=265, sum=530 (2)", - "tab": "General information", - "score": 265.0 - }, - "Clinical Knowledge - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Clinical Knowledge - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Clinical Knowledge - # prompt tokens": { - "description": "min=487.174, mean=487.174, max=487.174, sum=974.347 (2)", - "tab": "General information", - "score": 487.1735849056604 - }, - "Clinical Knowledge - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" - } - } - }, - { - "evaluation_name": "Conceptual Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Conceptual Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.574, - "details": { - "description": "min=0.574, mean=0.574, max=0.574, sum=1.149 (2)", - "tab": "Accuracy", - "Conceptual Physics - Observed inference time (s)": { - "description": "min=0.418, mean=0.418, max=0.418, sum=0.835 (2)", - "tab": "Efficiency", - "score": 0.41761813873940323 - }, - "Conceptual Physics - # eval": { - "description": "min=235, mean=235, max=235, sum=470 (2)", - "tab": "General information", - "score": 235.0 - }, - "Conceptual Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Conceptual Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Conceptual Physics - # prompt tokens": { - "description": "min=334.285, mean=334.285, max=334.285, sum=668.57 (2)", - "tab": "General information", - "score": 334.2851063829787 - }, - "Conceptual Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" - } - } - }, - { - "evaluation_name": "Electrical Engineering", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Electrical Engineering", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.545, - "details": { - "description": "min=0.545, mean=0.545, max=0.545, sum=1.09 (2)", - "tab": "Accuracy", - "Electrical Engineering - Observed inference time (s)": { - "description": "min=0.54, mean=0.54, max=0.54, sum=1.08 (2)", - "tab": "Efficiency", - "score": 0.5400767852520121 - }, - "Electrical Engineering - # eval": { - "description": "min=145, mean=145, max=145, sum=290 (2)", - "tab": "General information", - "score": 145.0 - }, - "Electrical Engineering - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Electrical Engineering - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Electrical Engineering - # prompt tokens": { - "description": "min=501.379, mean=501.379, max=501.379, sum=1002.759 (2)", - "tab": "General information", - "score": 501.37931034482756 - }, - "Electrical Engineering - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" - } - } - }, - { - "evaluation_name": "Elementary Mathematics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Elementary Mathematics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.508, - "details": { - "description": "min=0.508, mean=0.508, max=0.508, sum=1.016 (2)", - "tab": "Accuracy", - "Elementary Mathematics - Observed inference time (s)": { - "description": "min=0.434, mean=0.434, max=0.434, sum=0.868 (2)", - "tab": "Efficiency", - "score": 0.4338057312385115 - }, - "Elementary Mathematics - # eval": { - "description": "min=378, mean=378, max=378, sum=756 (2)", - "tab": "General information", - "score": 378.0 - }, - "Elementary Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Elementary Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Elementary Mathematics - # prompt tokens": { - "description": "min=613.386, mean=613.386, max=613.386, sum=1226.772 (2)", - "tab": "General information", - "score": 613.3862433862433 - }, - "Elementary Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" - } - } - }, - { - "evaluation_name": "Formal Logic", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Formal Logic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.532, - "details": { - "description": "min=0.532, mean=0.532, max=0.532, sum=1.063 (2)", - "tab": "Accuracy", - "Formal Logic - Observed inference time (s)": { - "description": "min=0.512, mean=0.512, max=0.512, sum=1.024 (2)", - "tab": "Efficiency", - "score": 0.5122278436781869 - }, - "Formal Logic - # eval": { - "description": "min=126, mean=126, max=126, sum=252 (2)", - "tab": "General information", - "score": 126.0 - }, - "Formal Logic - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Formal Logic - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Formal Logic - # prompt tokens": { - "description": "min=718.984, mean=718.984, max=718.984, sum=1437.968 (2)", - "tab": "General information", - "score": 718.984126984127 - }, - "Formal Logic - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" - } - } - }, - { - "evaluation_name": "High School World History", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on High School World History", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.886, - "details": { - "description": "min=0.886, mean=0.886, max=0.886, sum=1.772 (2)", - "tab": "Accuracy", - "High School Biology - Observed inference time (s)": { - "description": "min=0.526, mean=0.526, max=0.526, sum=1.052 (2)", - "tab": "Efficiency", - "score": 0.5259702259494412 - }, - "High School Chemistry - Observed inference time (s)": { - "description": "min=0.402, mean=0.402, max=0.402, sum=0.803 (2)", - "tab": "Efficiency", - "score": 0.4016201167271055 - }, - "High School Computer Science - Observed inference time (s)": { - "description": "min=0.398, mean=0.398, max=0.398, sum=0.797 (2)", - "tab": "Efficiency", - "score": 0.3984186482429504 - }, - "High School European History - Observed inference time (s)": { - "description": "min=0.649, mean=0.649, max=0.649, sum=1.298 (2)", - "tab": "Efficiency", - "score": 0.6488189350474964 - }, - "High School Geography - Observed inference time (s)": { - "description": "min=0.449, mean=0.449, max=0.449, sum=0.897 (2)", - "tab": "Efficiency", - "score": 0.44867861752558236 - }, - "High School Government And Politics - Observed inference time (s)": { - "description": "min=0.441, mean=0.441, max=0.441, sum=0.883 (2)", - "tab": "Efficiency", - "score": 0.44147809675938104 - }, - "High School Macroeconomics - Observed inference time (s)": { - "description": "min=0.456, mean=0.456, max=0.456, sum=0.912 (2)", - "tab": "Efficiency", - "score": 0.45610924195020625 - }, - "High School Mathematics - Observed inference time (s)": { - "description": "min=0.427, mean=0.427, max=0.427, sum=0.854 (2)", - "tab": "Efficiency", - "score": 0.4269448068406847 - }, - "High School Microeconomics - Observed inference time (s)": { - "description": "min=0.402, mean=0.402, max=0.402, sum=0.805 (2)", - "tab": "Efficiency", - "score": 0.4023913435575341 - }, - "High School Physics - Observed inference time (s)": { - "description": "min=0.43, mean=0.43, max=0.43, sum=0.861 (2)", - "tab": "Efficiency", - "score": 0.43034561738273164 - }, - "High School Psychology - Observed inference time (s)": { - "description": "min=0.428, mean=0.428, max=0.428, sum=0.856 (2)", - "tab": "Efficiency", - "score": 0.4278128755201987 - }, - "High School Statistics - Observed inference time (s)": { - "description": "min=0.421, mean=0.421, max=0.421, sum=0.842 (2)", - "tab": "Efficiency", - "score": 0.42108922203381854 - }, - "High School US History - Observed inference time (s)": { - "description": "min=0.54, mean=0.54, max=0.54, sum=1.08 (2)", - "tab": "Efficiency", - "score": 0.5401732255430782 - }, - "High School World History - Observed inference time (s)": { - "description": "min=0.471, mean=0.471, max=0.471, sum=0.943 (2)", - "tab": "Efficiency", - "score": 0.47126107075043366 - }, - "High School Biology - # eval": { - "description": "min=310, mean=310, max=310, sum=620 (2)", - "tab": "General information", - "score": 310.0 - }, - "High School Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Biology - # prompt 
tokens": { - "description": "min=600.561, mean=600.561, max=600.561, sum=1201.123 (2)", - "tab": "General information", - "score": 600.5612903225806 - }, - "High School Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Chemistry - # eval": { - "description": "min=203, mean=203, max=203, sum=406 (2)", - "tab": "General information", - "score": 203.0 - }, - "High School Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # prompt tokens": { - "description": "min=572.798, mean=572.798, max=572.798, sum=1145.596 (2)", - "tab": "General information", - "score": 572.7980295566502 - }, - "High School Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "High School Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # prompt tokens": { - "description": "min=988.24, mean=988.24, max=988.24, sum=1976.48 (2)", - "tab": "General information", - "score": 988.24 - }, - "High School Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School European History - # eval": { - "description": "min=165, mean=165, max=165, sum=330 (2)", - "tab": "General information", - "score": 165.0 - }, - "High School European History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School European History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School European History - # prompt tokens": { - "description": "min=3089.109, mean=3089.109, max=3089.109, sum=6178.218 (2)", - "tab": "General information", - "score": 3089.109090909091 - }, - "High School European History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Geography - # eval": { - "description": "min=198, mean=198, max=198, sum=396 (2)", - "tab": "General information", - "score": 198.0 - }, - "High School Geography - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Geography - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Geography - # prompt tokens": { - "description": "min=429.207, mean=429.207, max=429.207, sum=858.414 (2)", - "tab": "General information", - "score": 429.2070707070707 - }, - "High School Geography - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Government And Politics - # eval": { - 
"description": "min=193, mean=193, max=193, sum=386 (2)", - "tab": "General information", - "score": 193.0 - }, - "High School Government And Politics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Government And Politics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # prompt tokens": { - "description": "min=514.808, mean=514.808, max=514.808, sum=1029.617 (2)", - "tab": "General information", - "score": 514.8082901554404 - }, - "High School Government And Politics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Macroeconomics - # eval": { - "description": "min=390, mean=390, max=390, sum=780 (2)", - "tab": "General information", - "score": 390.0 - }, - "High School Macroeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Macroeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - # prompt tokens": { - "description": "min=423.815, mean=423.815, max=423.815, sum=847.631 (2)", - "tab": "General information", - "score": 423.81538461538463 - }, - "High School Macroeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Mathematics - # eval": { - "description": "min=270, mean=270, max=270, sum=540 (2)", - "tab": "General information", - "score": 270.0 - }, - "High School Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # prompt tokens": { - "description": "min=584.13, mean=584.13, max=584.13, sum=1168.259 (2)", - "tab": "General information", - "score": 584.1296296296297 - }, - "High School Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Microeconomics - # eval": { - "description": "min=238, mean=238, max=238, sum=476 (2)", - "tab": "General information", - "score": 238.0 - }, - "High School Microeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Microeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Microeconomics - # prompt tokens": { - "description": "min=443.345, mean=443.345, max=443.345, sum=886.689 (2)", - "tab": "General information", - "score": 443.34453781512605 - }, - "High School Microeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Physics - # eval": { - "description": "min=151, mean=151, max=151, sum=302 (2)", - "tab": "General information", - "score": 151.0 - }, - "High School Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Physics - truncated": { - "description": 
"min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # prompt tokens": { - "description": "min=622.775, mean=622.775, max=622.775, sum=1245.55 (2)", - "tab": "General information", - "score": 622.774834437086 - }, - "High School Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Psychology - # eval": { - "description": "min=545, mean=545, max=545, sum=1090 (2)", - "tab": "General information", - "score": 545.0 - }, - "High School Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # prompt tokens": { - "description": "min=558.873, mean=558.873, max=558.873, sum=1117.747 (2)", - "tab": "General information", - "score": 558.8733944954129 - }, - "High School Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Statistics - # eval": { - "description": "min=216, mean=216, max=216, sum=432 (2)", - "tab": "General information", - "score": 216.0 - }, - "High School Statistics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Statistics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Statistics - # prompt tokens": { - "description": "min=913.644, mean=913.644, max=913.644, sum=1827.287 (2)", - "tab": "General information", - "score": 913.6435185185185 - }, - "High School Statistics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School US History - # eval": { - "description": "min=204, mean=204, max=204, sum=408 (2)", - "tab": "General information", - "score": 204.0 - }, - "High School US History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School US History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # prompt tokens": { - "description": "min=2477.446, mean=2477.446, max=2477.446, sum=4954.892 (2)", - "tab": "General information", - "score": 2477.4460784313724 - }, - "High School US History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School World History - # eval": { - "description": "min=237, mean=237, max=237, sum=474 (2)", - "tab": "General information", - "score": 237.0 - }, - "High School World History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School World History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School World History - # prompt tokens": { - "description": "min=1585.553, mean=1585.553, max=1585.553, sum=3171.105 (2)", - "tab": "General information", - "score": 1585.5527426160338 - }, - "High School World History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": 
"General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" - } - } - }, - { - "evaluation_name": "Human Sexuality", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Human Sexuality", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.847, - "details": { - "description": "min=0.847, mean=0.847, max=0.847, sum=1.695 (2)", - "tab": "Accuracy", - "Human Aging - Observed inference time (s)": { - "description": "min=0.401, mean=0.401, max=0.401, sum=0.803 (2)", - "tab": "Efficiency", - "score": 0.4013588674399885 - }, - "Human Sexuality - Observed inference time (s)": { - "description": "min=0.356, mean=0.356, max=0.356, sum=0.711 (2)", - "tab": "Efficiency", - "score": 0.3556434161790455 - }, - "Human Aging - # eval": { - "description": "min=223, mean=223, max=223, sum=446 (2)", - "tab": "General information", - "score": 223.0 - }, - "Human Aging - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Aging - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Aging - # prompt tokens": { - "description": "min=353.152, mean=353.152, max=353.152, sum=706.305 (2)", - "tab": "General information", - "score": 353.15246636771303 - }, - "Human Aging - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Human Sexuality - # eval": { - "description": "min=131, mean=131, max=131, sum=262 (2)", - "tab": "General information", - "score": 131.0 - }, - "Human Sexuality - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Sexuality - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # prompt tokens": { - "description": "min=394.748, mean=394.748, max=394.748, sum=789.496 (2)", - "tab": "General information", - "score": 394.7480916030534 - }, - "Human Sexuality - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" - } - } - }, - { - "evaluation_name": "International Law", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on International Law", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.868, - "details": { - "description": "min=0.868, mean=0.868, max=0.868, sum=1.736 (2)", - "tab": "Accuracy", - "International Law - Observed inference time (s)": { - "description": "min=0.404, mean=0.404, 
max=0.404, sum=0.808 (2)", - "tab": "Efficiency", - "score": 0.40404871081517746 - }, - "International Law - # eval": { - "description": "min=121, mean=121, max=121, sum=242 (2)", - "tab": "General information", - "score": 121.0 - }, - "International Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "International Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "International Law - # prompt tokens": { - "description": "min=720.182, mean=720.182, max=720.182, sum=1440.364 (2)", - "tab": "General information", - "score": 720.1818181818181 - }, - "International Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" - } - } - }, - { - "evaluation_name": "Logical Fallacies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Logical Fallacies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.81, - "details": { - "description": "min=0.81, mean=0.81, max=0.81, sum=1.62 (2)", - "tab": "Accuracy", - "Logical Fallacies - Observed inference time (s)": { - "description": "min=0.409, mean=0.409, max=0.409, sum=0.818 (2)", - "tab": "Efficiency", - "score": 0.4088362228650988 - }, - "Logical Fallacies - # eval": { - "description": "min=163, mean=163, max=163, sum=326 (2)", - "tab": "General information", - "score": 163.0 - }, - "Logical Fallacies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Logical Fallacies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Logical Fallacies - # prompt tokens": { - "description": "min=486.779, mean=486.779, max=486.779, sum=973.558 (2)", - "tab": "General information", - "score": 486.77914110429447 - }, - "Logical Fallacies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" - } - } - }, - { - "evaluation_name": "Machine Learning", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Machine Learning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.562, - "details": { - "description": "min=0.562, mean=0.562, max=0.562, sum=1.125 (2)", - "tab": "Accuracy", - "Machine Learning - Observed inference time (s)": { - "description": "min=0.401, mean=0.401, max=0.401, sum=0.802 (2)", - "tab": "Efficiency", - "score": 0.40122431090899874 - }, - "Machine Learning - # 
eval": { - "description": "min=112, mean=112, max=112, sum=224 (2)", - "tab": "General information", - "score": 112.0 - }, - "Machine Learning - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Machine Learning - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Machine Learning - # prompt tokens": { - "description": "min=734.83, mean=734.83, max=734.83, sum=1469.661 (2)", - "tab": "General information", - "score": 734.8303571428571 - }, - "Machine Learning - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" - } - } - }, - { - "evaluation_name": "Management", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Management", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.854, - "details": { - "description": "min=0.854, mean=0.854, max=0.854, sum=1.709 (2)", - "tab": "Accuracy", - "Management - Observed inference time (s)": { - "description": "min=0.614, mean=0.614, max=0.614, sum=1.228 (2)", - "tab": "Efficiency", - "score": 0.6141544730917922 - }, - "Management - # eval": { - "description": "min=103, mean=103, max=103, sum=206 (2)", - "tab": "General information", - "score": 103.0 - }, - "Management - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Management - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Management - # prompt tokens": { - "description": "min=315.359, mean=315.359, max=315.359, sum=630.718 (2)", - "tab": "General information", - "score": 315.3592233009709 - }, - "Management - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" - } - } - }, - { - "evaluation_name": "Marketing", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Marketing", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.897, - "details": { - "description": "min=0.897, mean=0.897, max=0.897, sum=1.795 (2)", - "tab": "Accuracy", - "Marketing - Observed inference time (s)": { - "description": "min=0.464, mean=0.464, max=0.464, sum=0.928 (2)", - "tab": "Efficiency", - "score": 0.46382204895345575 - }, - "Marketing - # eval": { - "description": "min=234, mean=234, max=234, sum=468 (2)", - "tab": "General information", - "score": 234.0 - }, - "Marketing - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": 
"General information", - "score": 5.0 - }, - "Marketing - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Marketing - # prompt tokens": { - "description": "min=463.423, mean=463.423, max=463.423, sum=926.846 (2)", - "tab": "General information", - "score": 463.4230769230769 - }, - "Marketing - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" - } - } - }, - { - "evaluation_name": "Medical Genetics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Medical Genetics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.74, - "details": { - "description": "min=0.74, mean=0.74, max=0.74, sum=1.48 (2)", - "tab": "Accuracy", - "Medical Genetics - Observed inference time (s)": { - "description": "min=0.433, mean=0.433, max=0.433, sum=0.867 (2)", - "tab": "Efficiency", - "score": 0.4333249735832214 - }, - "Medical Genetics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Medical Genetics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Medical Genetics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Medical Genetics - # prompt tokens": { - "description": "min=405.71, mean=405.71, max=405.71, sum=811.42 (2)", - "tab": "General information", - "score": 405.71 - }, - "Medical Genetics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" - } - } - }, - { - "evaluation_name": "Miscellaneous", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Miscellaneous", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9, - "details": { - "description": "min=0.9, mean=0.9, max=0.9, sum=1.801 (2)", - "tab": "Accuracy", - "Miscellaneous - Observed inference time (s)": { - "description": "min=0.477, mean=0.477, max=0.477, sum=0.955 (2)", - "tab": "Efficiency", - "score": 0.477321812323988 - }, - "Miscellaneous - # eval": { - "description": "min=783, mean=783, max=783, sum=1566 (2)", - "tab": "General information", - "score": 783.0 - }, - "Miscellaneous - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Miscellaneous - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Miscellaneous - # 
prompt tokens": { - "description": "min=348.519, mean=348.519, max=348.519, sum=697.037 (2)", - "tab": "General information", - "score": 348.51851851851853 - }, - "Miscellaneous - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" - } - } - }, - { - "evaluation_name": "Moral Scenarios", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Moral Scenarios", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.579, - "details": { - "description": "min=0.579, mean=0.579, max=0.579, sum=1.158 (2)", - "tab": "Accuracy", - "Moral Disputes - Observed inference time (s)": { - "description": "min=0.497, mean=0.497, max=0.497, sum=0.995 (2)", - "tab": "Efficiency", - "score": 0.4974138419752176 - }, - "Moral Scenarios - Observed inference time (s)": { - "description": "min=0.451, mean=0.451, max=0.451, sum=0.902 (2)", - "tab": "Efficiency", - "score": 0.45121243466212096 - }, - "Moral Disputes - # eval": { - "description": "min=346, mean=346, max=346, sum=692 (2)", - "tab": "General information", - "score": 346.0 - }, - "Moral Disputes - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Disputes - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Disputes - # prompt tokens": { - "description": "min=540.038, mean=540.038, max=540.038, sum=1080.075 (2)", - "tab": "General information", - "score": 540.0375722543353 - }, - "Moral Disputes - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Moral Scenarios - # eval": { - "description": "min=895, mean=895, max=895, sum=1790 (2)", - "tab": "General information", - "score": 895.0 - }, - "Moral Scenarios - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Scenarios - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # prompt tokens": { - "description": "min=745.516, mean=745.516, max=745.516, sum=1491.032 (2)", - "tab": "General information", - "score": 745.5162011173185 - }, - "Moral Scenarios - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" - } - } - }, - { - "evaluation_name": "Nutrition", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Nutrition", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.791, - "details": { - "description": "min=0.791, mean=0.791, max=0.791, sum=1.582 (2)", - "tab": "Accuracy", - "Nutrition - Observed inference time (s)": { - "description": "min=0.463, mean=0.463, max=0.463, sum=0.927 (2)", - "tab": "Efficiency", - "score": 0.46336324308432786 - }, - "Nutrition - # eval": { - "description": "min=306, mean=306, max=306, sum=612 (2)", - "tab": "General information", - "score": 306.0 - }, - "Nutrition - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Nutrition - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Nutrition - # prompt tokens": { - "description": "min=680.69, mean=680.69, max=680.69, sum=1361.379 (2)", - "tab": "General information", - "score": 680.6895424836601 - }, - "Nutrition - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" - } - } - }, - { - "evaluation_name": "Prehistory", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Prehistory", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.904, - "details": { - "description": "min=0.904, mean=0.904, max=0.904, sum=1.809 (2)", - "tab": "Accuracy", - "Prehistory - Observed inference time (s)": { - "description": "min=0.393, mean=0.393, max=0.393, sum=0.786 (2)", - "tab": "Efficiency", - "score": 0.3928193273367705 - }, - "Prehistory - # eval": { - "description": "min=324, mean=324, max=324, sum=648 (2)", - "tab": "General information", - "score": 324.0 - }, - "Prehistory - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Prehistory - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Prehistory - # prompt tokens": { - "description": "min=602.145, mean=602.145, max=602.145, sum=1204.29 (2)", - "tab": "General information", - "score": 602.145061728395 - }, - "Prehistory - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" - } - } - }, - { - "evaluation_name": "Public Relations", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Public Relations", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.709, - "details": { - "description": "min=0.709, mean=0.709, max=0.709, sum=1.418 (2)", - "tab": "Accuracy", - "Public Relations - Observed inference time (s)": { - "description": "min=0.535, mean=0.535, 
max=0.535, sum=1.069 (2)", - "tab": "Efficiency", - "score": 0.534747780453075 - }, - "Public Relations - # eval": { - "description": "min=110, mean=110, max=110, sum=220 (2)", - "tab": "General information", - "score": 110.0 - }, - "Public Relations - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Public Relations - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Public Relations - # prompt tokens": { - "description": "min=462.036, mean=462.036, max=462.036, sum=924.073 (2)", - "tab": "General information", - "score": 462.03636363636366 - }, - "Public Relations - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" - } - } - }, - { - "evaluation_name": "Security Studies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Security Studies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.824, - "details": { - "description": "min=0.824, mean=0.824, max=0.824, sum=1.649 (2)", - "tab": "Accuracy", - "Security Studies - Observed inference time (s)": { - "description": "min=0.446, mean=0.446, max=0.446, sum=0.891 (2)", - "tab": "Efficiency", - "score": 0.44565339964263295 - }, - "Security Studies - # eval": { - "description": "min=245, mean=245, max=245, sum=490 (2)", - "tab": "General information", - "score": 245.0 - }, - "Security Studies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Security Studies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Security Studies - # prompt tokens": { - "description": "min=1315.865, mean=1315.865, max=1315.865, sum=2631.731 (2)", - "tab": "General information", - "score": 1315.865306122449 - }, - "Security Studies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" - } - } - }, - { - "evaluation_name": "Sociology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Sociology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.93, - "details": { - "description": "min=0.93, mean=0.93, max=0.93, sum=1.861 (2)", - "tab": "Accuracy", - "Sociology - Observed inference time (s)": { - "description": "min=0.442, mean=0.442, max=0.442, sum=0.884 (2)", - "tab": "Efficiency", - "score": 0.44217372296461416 - }, - "Sociology - # eval": { - "description": "min=201, 
mean=201, max=201, sum=402 (2)", - "tab": "General information", - "score": 201.0 - }, - "Sociology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Sociology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Sociology - # prompt tokens": { - "description": "min=487.95, mean=487.95, max=487.95, sum=975.9 (2)", - "tab": "General information", - "score": 487.9502487562189 - }, - "Sociology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" - } - } - }, - { - "evaluation_name": "Virology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Virology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.554, - "details": { - "description": "min=0.554, mean=0.554, max=0.554, sum=1.108 (2)", - "tab": "Accuracy", - "Virology - Observed inference time (s)": { - "description": "min=0.436, mean=0.436, max=0.436, sum=0.871 (2)", - "tab": "Efficiency", - "score": 0.435666641557073 - }, - "Virology - # eval": { - "description": "min=166, mean=166, max=166, sum=332 (2)", - "tab": "General information", - "score": 166.0 - }, - "Virology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Virology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Virology - # prompt tokens": { - "description": "min=395.349, mean=395.349, max=395.349, sum=790.699 (2)", - "tab": "General information", - "score": 395.34939759036143 - }, - "Virology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" - } - } - }, - { - "evaluation_name": "World Religions", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on World Religions", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.883, - "details": { - "description": "min=0.883, mean=0.883, max=0.883, sum=1.766 (2)", - "tab": "Accuracy", - "World Religions - Observed inference time (s)": { - "description": "min=0.411, mean=0.411, max=0.411, sum=0.821 (2)", - "tab": "Efficiency", - "score": 0.4106302637802927 - }, - "World Religions - # eval": { - "description": "min=171, mean=171, max=171, sum=342 (2)", - "tab": "General information", - "score": 171.0 - }, - "World Religions - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "World Religions - truncated": 
{ - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "World Religions - # prompt tokens": { - "description": "min=308.924, mean=308.924, max=308.924, sum=617.848 (2)", - "tab": "General information", - "score": 308.92397660818716 - }, - "World Religions - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" - } - } - }, - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.464, - "details": { - "tab": "Efficiency" - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_mmlu/mistralai/mistral-large-2407/567918be-be6f-4e41-b613-727828fe8a44.json b/data/helm_mmlu/mistralai/mistral-large-2407/567918be-be6f-4e41-b613-727828fe8a44.json deleted file mode 100644 index 58aa6a379..000000000 --- a/data/helm_mmlu/mistralai/mistral-large-2407/567918be-be6f-4e41-b613-727828fe8a44.json +++ /dev/null @@ -1,3021 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/mistralai_mistral-large-2407/1770835937.459157", - "retrieved_timestamp": "1770835937.459157", - "source_metadata": { - "source_name": "helm_mmlu", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral Large 2 2407", - "id": "mistralai/mistral-large-2407", - "developer": "mistralai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "MMLU All Subjects", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU All Subjects", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8, - "details": { - "description": "min=0.37, mean=0.8, max=0.969, sum=91.197 (114)", - "tab": "Accuracy", - "MMLU All Subjects - Observed inference time (s)": { - "description": "min=0.672, mean=0.798, max=1.025, sum=90.977 (114)", - "tab": "Efficiency", - "score": 0.798047748433812 - }, - "MMLU All Subjects - # eval": { - "description": "min=100, mean=246.351, max=1534, sum=28084 (114)", - "tab": "General information", - "score": 246.35087719298247 - }, - "MMLU All Subjects - # train": { - "description": "min=5, mean=5, max=5, sum=570 (114)", - "tab": "General information", - "score": 5.0 - }, - "MMLU All Subjects - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (114)", - "tab": "General information", - "score": 0.0 - }, - "MMLU All Subjects - # prompt tokens": { - "description": "min=318.924, mean=706.273, max=3099.109, sum=80515.178 (114)", - "tab": "General information", - "score": 
706.2734899593811 - }, - "MMLU All Subjects - # output tokens": { - "description": "min=1, mean=1, max=1, sum=114 (114)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] - } - } - }, - { - "evaluation_name": "Abstract Algebra", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - 
"evaluation_description": "EM on Abstract Algebra", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7, - "details": { - "description": "min=0.7, mean=0.7, max=0.7, sum=1.4 (2)", - "tab": "Accuracy", - "Abstract Algebra - Observed inference time (s)": { - "description": "min=0.732, mean=0.732, max=0.732, sum=1.464 (2)", - "tab": "Efficiency", - "score": 0.7317730689048767 - }, - "Abstract Algebra - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Abstract Algebra - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Abstract Algebra - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Abstract Algebra - # prompt tokens": { - "description": "min=412.44, mean=412.44, max=412.44, sum=824.88 (2)", - "tab": "General information", - "score": 412.44 - }, - "Abstract Algebra - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" - } - } - }, - { - "evaluation_name": "Anatomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Anatomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.785, - "details": { - "description": "min=0.785, mean=0.785, max=0.785, sum=1.57 (2)", - "tab": "Accuracy", - "Anatomy - Observed inference time (s)": { - "description": "min=0.808, mean=0.808, max=0.808, sum=1.616 (2)", - "tab": "Efficiency", - "score": 0.807829690862585 - }, - "Anatomy - # eval": { - "description": "min=135, mean=135, max=135, sum=270 (2)", - "tab": "General information", - "score": 135.0 - }, - "Anatomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Anatomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Anatomy - # prompt tokens": { - "description": "min=417.089, mean=417.089, max=417.089, sum=834.178 (2)", - "tab": "General information", - "score": 417.0888888888889 - }, - "Anatomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" - } - } - }, - { - "evaluation_name": "College Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on College Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.559, - "details": { - "description": "min=0.559, 
mean=0.559, max=0.559, sum=1.118 (2)", - "tab": "Accuracy", - "College Chemistry - Observed inference time (s)": { - "description": "min=0.838, mean=0.838, max=0.838, sum=1.676 (2)", - "tab": "Efficiency", - "score": 0.8380094933509826 - }, - "College Biology - Observed inference time (s)": { - "description": "min=0.768, mean=0.768, max=0.768, sum=1.535 (2)", - "tab": "Efficiency", - "score": 0.76766570409139 - }, - "College Computer Science - Observed inference time (s)": { - "description": "min=0.853, mean=0.853, max=0.853, sum=1.706 (2)", - "tab": "Efficiency", - "score": 0.8529829049110412 - }, - "College Mathematics - Observed inference time (s)": { - "description": "min=0.771, mean=0.771, max=0.771, sum=1.542 (2)", - "tab": "Efficiency", - "score": 0.7712302732467652 - }, - "College Medicine - Observed inference time (s)": { - "description": "min=0.672, mean=0.672, max=0.672, sum=1.344 (2)", - "tab": "Efficiency", - "score": 0.6721915785287846 - }, - "College Physics - Observed inference time (s)": { - "description": "min=0.674, mean=0.674, max=0.674, sum=1.347 (2)", - "tab": "Efficiency", - "score": 0.6735490116418577 - }, - "College Chemistry - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Chemistry - # prompt tokens": { - "description": "min=637.71, mean=637.71, max=637.71, sum=1275.42 (2)", - "tab": "General information", - "score": 637.71 - }, - "College Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Biology - # eval": { - "description": "min=144, mean=144, max=144, sum=288 (2)", - "tab": "General information", - "score": 144.0 - }, - "College Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # prompt tokens": { - "description": "min=560.799, mean=560.799, max=560.799, sum=1121.597 (2)", - "tab": "General information", - "score": 560.7986111111111 - }, - "College Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer Science - # prompt tokens": { - "description": "min=912.17, mean=912.17, max=912.17, sum=1824.34 (2)", - "tab": "General information", - "score": 912.17 - }, - "College Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Mathematics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General 
information", - "score": 100.0 - }, - "College Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # prompt tokens": { - "description": "min=668.31, mean=668.31, max=668.31, sum=1336.62 (2)", - "tab": "General information", - "score": 668.31 - }, - "College Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Medicine - # eval": { - "description": "min=173, mean=173, max=173, sum=346 (2)", - "tab": "General information", - "score": 173.0 - }, - "College Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Medicine - # prompt tokens": { - "description": "min=602.41, mean=602.41, max=602.41, sum=1204.821 (2)", - "tab": "General information", - "score": 602.4104046242775 - }, - "College Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Physics - # eval": { - "description": "min=102, mean=102, max=102, sum=204 (2)", - "tab": "General information", - "score": 102.0 - }, - "College Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Physics - # prompt tokens": { - "description": "min=561.029, mean=561.029, max=561.029, sum=1122.059 (2)", - "tab": "General information", - "score": 561.0294117647059 - }, - "College Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" - } - } - }, - { - "evaluation_name": "Computer Security", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Computer Security", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.81, - "details": { - "description": "min=0.81, mean=0.81, max=0.81, sum=1.62 (2)", - "tab": "Accuracy", - "Computer Security - Observed inference time (s)": { - "description": "min=0.933, mean=0.933, max=0.933, sum=1.866 (2)", - "tab": "Efficiency", - "score": 0.9331179332733154 - }, - "Computer Security - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Computer Security - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Computer Security - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Computer Security - 
# prompt tokens": { - "description": "min=434.94, mean=434.94, max=434.94, sum=869.88 (2)", - "tab": "General information", - "score": 434.94 - }, - "Computer Security - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" - } - } - }, - { - "evaluation_name": "Econometrics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Econometrics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.693, - "details": { - "description": "min=0.693, mean=0.693, max=0.693, sum=1.386 (2)", - "tab": "Accuracy", - "Econometrics - Observed inference time (s)": { - "description": "min=0.684, mean=0.684, max=0.684, sum=1.368 (2)", - "tab": "Efficiency", - "score": 0.6842389587770429 - }, - "Econometrics - # eval": { - "description": "min=114, mean=114, max=114, sum=228 (2)", - "tab": "General information", - "score": 114.0 - }, - "Econometrics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Econometrics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Econometrics - # prompt tokens": { - "description": "min=697.175, mean=697.175, max=697.175, sum=1394.351 (2)", - "tab": "General information", - "score": 697.1754385964912 - }, - "Econometrics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" - } - } - }, - { - "evaluation_name": "Global Facts", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Global Facts", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.56, - "details": { - "description": "min=0.56, mean=0.56, max=0.56, sum=1.12 (2)", - "tab": "Accuracy", - "Global Facts - Observed inference time (s)": { - "description": "min=0.745, mean=0.745, max=0.745, sum=1.489 (2)", - "tab": "Efficiency", - "score": 0.744694242477417 - }, - "Global Facts - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Global Facts - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Global Facts - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Global Facts - # prompt tokens": { - "description": "min=493.47, mean=493.47, max=493.47, sum=986.94 (2)", - "tab": "General information", - "score": 493.47 - }, - "Global Facts - # output tokens": { - "description": "min=1, 
mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" - } - } - }, - { - "evaluation_name": "Jurisprudence", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Jurisprudence", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.861, - "details": { - "description": "min=0.861, mean=0.861, max=0.861, sum=1.722 (2)", - "tab": "Accuracy", - "Jurisprudence - Observed inference time (s)": { - "description": "min=0.751, mean=0.751, max=0.751, sum=1.503 (2)", - "tab": "Efficiency", - "score": 0.751495877901713 - }, - "Jurisprudence - # eval": { - "description": "min=108, mean=108, max=108, sum=216 (2)", - "tab": "General information", - "score": 108.0 - }, - "Jurisprudence - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Jurisprudence - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Jurisprudence - # prompt tokens": { - "description": "min=461.093, mean=461.093, max=461.093, sum=922.185 (2)", - "tab": "General information", - "score": 461.0925925925926 - }, - "Jurisprudence - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" - } - } - }, - { - "evaluation_name": "Philosophy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Philosophy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.826, - "details": { - "description": "min=0.826, mean=0.826, max=0.826, sum=1.653 (2)", - "tab": "Accuracy", - "Philosophy - Observed inference time (s)": { - "description": "min=0.804, mean=0.804, max=0.804, sum=1.609 (2)", - "tab": "Efficiency", - "score": 0.8043544453439988 - }, - "Philosophy - # eval": { - "description": "min=311, mean=311, max=311, sum=622 (2)", - "tab": "General information", - "score": 311.0 - }, - "Philosophy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Philosophy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Philosophy - # prompt tokens": { - "description": "min=383.82, mean=383.82, max=383.82, sum=767.64 (2)", - "tab": "General information", - "score": 383.81993569131834 - }, - "Philosophy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "philosophy", - "method": "multiple_choice_joint", - 
"eval_split": "test", - "groups": "mmlu_philosophy" - } - } - }, - { - "evaluation_name": "Professional Psychology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Professional Psychology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.861, - "details": { - "description": "min=0.861, mean=0.861, max=0.861, sum=1.722 (2)", - "tab": "Accuracy", - "Professional Medicine - Observed inference time (s)": { - "description": "min=0.741, mean=0.741, max=0.741, sum=1.481 (2)", - "tab": "Efficiency", - "score": 0.7406316355747335 - }, - "Professional Accounting - Observed inference time (s)": { - "description": "min=0.807, mean=0.807, max=0.807, sum=1.615 (2)", - "tab": "Efficiency", - "score": 0.8074929325293142 - }, - "Professional Law - Observed inference time (s)": { - "description": "min=0.774, mean=0.774, max=0.774, sum=1.548 (2)", - "tab": "Efficiency", - "score": 0.7742255473851847 - }, - "Professional Psychology - Observed inference time (s)": { - "description": "min=0.923, mean=0.923, max=0.923, sum=1.846 (2)", - "tab": "Efficiency", - "score": 0.9228381756084417 - }, - "Professional Medicine - # eval": { - "description": "min=272, mean=272, max=272, sum=544 (2)", - "tab": "General information", - "score": 272.0 - }, - "Professional Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Medicine - # prompt tokens": { - "description": "min=1289.143, mean=1289.143, max=1289.143, sum=2578.287 (2)", - "tab": "General information", - "score": 1289.1433823529412 - }, - "Professional Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Accounting - # eval": { - "description": "min=282, mean=282, max=282, sum=564 (2)", - "tab": "General information", - "score": 282.0 - }, - "Professional Accounting - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Accounting - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Accounting - # prompt tokens": { - "description": "min=806.496, mean=806.496, max=806.496, sum=1612.993 (2)", - "tab": "General information", - "score": 806.4964539007092 - }, - "Professional Accounting - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Law - # eval": { - "description": "min=1534, mean=1534, max=1534, sum=3068 (2)", - "tab": "General information", - "score": 1534.0 - }, - "Professional Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # prompt tokens": { - "description": "min=1859.711, mean=1859.711, max=1859.711, sum=3719.421 (2)", - "tab": "General 
information", - "score": 1859.7105606258149 - }, - "Professional Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Psychology - # eval": { - "description": "min=612, mean=612, max=612, sum=1224 (2)", - "tab": "General information", - "score": 612.0 - }, - "Professional Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # prompt tokens": { - "description": "min=655.278, mean=655.278, max=655.278, sum=1310.556 (2)", - "tab": "General information", - "score": 655.2777777777778 - }, - "Professional Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" - } - } - }, - { - "evaluation_name": "Us Foreign Policy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Us Foreign Policy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9, - "details": { - "description": "min=0.9, mean=0.9, max=0.9, sum=1.8 (2)", - "tab": "Accuracy", - "Us Foreign Policy - Observed inference time (s)": { - "description": "min=0.756, mean=0.756, max=0.756, sum=1.512 (2)", - "tab": "Efficiency", - "score": 0.7560967636108399 - }, - "Us Foreign Policy - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Us Foreign Policy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Us Foreign Policy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Us Foreign Policy - # prompt tokens": { - "description": "min=483.19, mean=483.19, max=483.19, sum=966.38 (2)", - "tab": "General information", - "score": 483.19 - }, - "Us Foreign Policy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" - } - } - }, - { - "evaluation_name": "Astronomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Astronomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.921, - "details": { - "description": "min=0.921, mean=0.921, max=0.921, sum=1.842 (2)", - "tab": "Accuracy", - "Astronomy - Observed inference time (s)": { - "description": "min=1.025, mean=1.025, 
max=1.025, sum=2.049 (2)", - "tab": "Efficiency", - "score": 1.0245175393004167 - }, - "Astronomy - # eval": { - "description": "min=152, mean=152, max=152, sum=304 (2)", - "tab": "General information", - "score": 152.0 - }, - "Astronomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Astronomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Astronomy - # prompt tokens": { - "description": "min=675.987, mean=675.987, max=675.987, sum=1351.974 (2)", - "tab": "General information", - "score": 675.9868421052631 - }, - "Astronomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" - } - } - }, - { - "evaluation_name": "Business Ethics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Business Ethics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.79, - "details": { - "description": "min=0.79, mean=0.79, max=0.79, sum=1.58 (2)", - "tab": "Accuracy", - "Business Ethics - Observed inference time (s)": { - "description": "min=0.923, mean=0.923, max=0.923, sum=1.846 (2)", - "tab": "Efficiency", - "score": 0.9228822708129882 - }, - "Business Ethics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Business Ethics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Business Ethics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Business Ethics - # prompt tokens": { - "description": "min=654.6, mean=654.6, max=654.6, sum=1309.2 (2)", - "tab": "General information", - "score": 654.6 - }, - "Business Ethics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" - } - } - }, - { - "evaluation_name": "Clinical Knowledge", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Clinical Knowledge", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.864, - "details": { - "description": "min=0.864, mean=0.864, max=0.864, sum=1.728 (2)", - "tab": "Accuracy", - "Clinical Knowledge - Observed inference time (s)": { - "description": "min=0.789, mean=0.789, max=0.789, sum=1.578 (2)", - "tab": "Efficiency", - "score": 0.7888300931678628 - }, - "Clinical Knowledge - # eval": { - "description": "min=265, mean=265, max=265, sum=530 (2)", - "tab": "General 
information", - "score": 265.0 - }, - "Clinical Knowledge - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Clinical Knowledge - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Clinical Knowledge - # prompt tokens": { - "description": "min=497.174, mean=497.174, max=497.174, sum=994.347 (2)", - "tab": "General information", - "score": 497.1735849056604 - }, - "Clinical Knowledge - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" - } - } - }, - { - "evaluation_name": "Conceptual Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Conceptual Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.864, - "details": { - "description": "min=0.864, mean=0.864, max=0.864, sum=1.728 (2)", - "tab": "Accuracy", - "Conceptual Physics - Observed inference time (s)": { - "description": "min=0.821, mean=0.821, max=0.821, sum=1.643 (2)", - "tab": "Efficiency", - "score": 0.8212997264050422 - }, - "Conceptual Physics - # eval": { - "description": "min=235, mean=235, max=235, sum=470 (2)", - "tab": "General information", - "score": 235.0 - }, - "Conceptual Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Conceptual Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Conceptual Physics - # prompt tokens": { - "description": "min=344.285, mean=344.285, max=344.285, sum=688.57 (2)", - "tab": "General information", - "score": 344.2851063829787 - }, - "Conceptual Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" - } - } - }, - { - "evaluation_name": "Electrical Engineering", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Electrical Engineering", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.793, - "details": { - "description": "min=0.793, mean=0.793, max=0.793, sum=1.586 (2)", - "tab": "Accuracy", - "Electrical Engineering - Observed inference time (s)": { - "description": "min=0.702, mean=0.702, max=0.702, sum=1.404 (2)", - "tab": "Efficiency", - "score": 0.701846879104088 - }, - "Electrical Engineering - # eval": { - "description": "min=145, mean=145, max=145, sum=290 (2)", - "tab": "General information", - "score": 145.0 - }, - "Electrical Engineering - # 
train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Electrical Engineering - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Electrical Engineering - # prompt tokens": { - "description": "min=511.379, mean=511.379, max=511.379, sum=1022.759 (2)", - "tab": "General information", - "score": 511.37931034482756 - }, - "Electrical Engineering - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" - } - } - }, - { - "evaluation_name": "Elementary Mathematics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Elementary Mathematics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.799, - "details": { - "description": "min=0.799, mean=0.799, max=0.799, sum=1.598 (2)", - "tab": "Accuracy", - "Elementary Mathematics - Observed inference time (s)": { - "description": "min=0.788, mean=0.788, max=0.788, sum=1.577 (2)", - "tab": "Efficiency", - "score": 0.7884082762652604 - }, - "Elementary Mathematics - # eval": { - "description": "min=378, mean=378, max=378, sum=756 (2)", - "tab": "General information", - "score": 378.0 - }, - "Elementary Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Elementary Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Elementary Mathematics - # prompt tokens": { - "description": "min=623.386, mean=623.386, max=623.386, sum=1246.772 (2)", - "tab": "General information", - "score": 623.3862433862433 - }, - "Elementary Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" - } - } - }, - { - "evaluation_name": "Formal Logic", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Formal Logic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.579, - "details": { - "description": "min=0.579, mean=0.579, max=0.579, sum=1.159 (2)", - "tab": "Accuracy", - "Formal Logic - Observed inference time (s)": { - "description": "min=0.763, mean=0.763, max=0.763, sum=1.526 (2)", - "tab": "Efficiency", - "score": 0.7629275567947872 - }, - "Formal Logic - # eval": { - "description": "min=126, mean=126, max=126, sum=252 (2)", - "tab": "General information", - "score": 126.0 - }, - "Formal Logic - # train": { - "description": "min=5, mean=5, max=5, 
sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Formal Logic - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Formal Logic - # prompt tokens": { - "description": "min=728.984, mean=728.984, max=728.984, sum=1457.968 (2)", - "tab": "General information", - "score": 728.984126984127 - }, - "Formal Logic - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" - } - } - }, - { - "evaluation_name": "High School World History", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on High School World History", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.92, - "details": { - "description": "min=0.92, mean=0.92, max=0.92, sum=1.84 (2)", - "tab": "Accuracy", - "High School Biology - Observed inference time (s)": { - "description": "min=0.685, mean=0.685, max=0.685, sum=1.371 (2)", - "tab": "Efficiency", - "score": 0.6854658296031336 - }, - "High School Chemistry - Observed inference time (s)": { - "description": "min=0.756, mean=0.756, max=0.756, sum=1.513 (2)", - "tab": "Efficiency", - "score": 0.7563052259642502 - }, - "High School Computer Science - Observed inference time (s)": { - "description": "min=0.717, mean=0.717, max=0.717, sum=1.435 (2)", - "tab": "Efficiency", - "score": 0.7174343037605285 - }, - "High School European History - Observed inference time (s)": { - "description": "min=0.767, mean=0.767, max=0.767, sum=1.535 (2)", - "tab": "Efficiency", - "score": 0.7674274748021906 - }, - "High School Geography - Observed inference time (s)": { - "description": "min=0.7, mean=0.7, max=0.7, sum=1.4 (2)", - "tab": "Efficiency", - "score": 0.6998175286283397 - }, - "High School Government And Politics - Observed inference time (s)": { - "description": "min=0.824, mean=0.824, max=0.824, sum=1.648 (2)", - "tab": "Efficiency", - "score": 0.8241880792410262 - }, - "High School Macroeconomics - Observed inference time (s)": { - "description": "min=0.863, mean=0.863, max=0.863, sum=1.726 (2)", - "tab": "Efficiency", - "score": 0.8630072312477307 - }, - "High School Mathematics - Observed inference time (s)": { - "description": "min=0.815, mean=0.815, max=0.815, sum=1.631 (2)", - "tab": "Efficiency", - "score": 0.8153338502954554 - }, - "High School Microeconomics - Observed inference time (s)": { - "description": "min=0.818, mean=0.818, max=0.818, sum=1.637 (2)", - "tab": "Efficiency", - "score": 0.8183944405627852 - }, - "High School Physics - Observed inference time (s)": { - "description": "min=0.83, mean=0.83, max=0.83, sum=1.659 (2)", - "tab": "Efficiency", - "score": 0.8296057877951111 - }, - "High School Psychology - Observed inference time (s)": { - "description": "min=0.818, mean=0.818, max=0.818, sum=1.636 (2)", - "tab": "Efficiency", - "score": 0.8179746304083308 - }, - "High School Statistics - Observed inference time (s)": { - "description": "min=0.775, mean=0.775, max=0.775, sum=1.55 (2)", - "tab": "Efficiency", - "score": 0.7749874878812719 - }, - 
"High School US History - Observed inference time (s)": { - "description": "min=0.743, mean=0.743, max=0.743, sum=1.486 (2)", - "tab": "Efficiency", - "score": 0.7428295682458317 - }, - "High School World History - Observed inference time (s)": { - "description": "min=0.832, mean=0.832, max=0.832, sum=1.663 (2)", - "tab": "Efficiency", - "score": 0.8316668367587061 - }, - "High School Biology - # eval": { - "description": "min=310, mean=310, max=310, sum=620 (2)", - "tab": "General information", - "score": 310.0 - }, - "High School Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Biology - # prompt tokens": { - "description": "min=610.561, mean=610.561, max=610.561, sum=1221.123 (2)", - "tab": "General information", - "score": 610.5612903225806 - }, - "High School Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Chemistry - # eval": { - "description": "min=203, mean=203, max=203, sum=406 (2)", - "tab": "General information", - "score": 203.0 - }, - "High School Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # prompt tokens": { - "description": "min=582.798, mean=582.798, max=582.798, sum=1165.596 (2)", - "tab": "General information", - "score": 582.7980295566502 - }, - "High School Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "High School Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # prompt tokens": { - "description": "min=998.24, mean=998.24, max=998.24, sum=1996.48 (2)", - "tab": "General information", - "score": 998.24 - }, - "High School Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School European History - # eval": { - "description": "min=165, mean=165, max=165, sum=330 (2)", - "tab": "General information", - "score": 165.0 - }, - "High School European History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School European History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School European History - # prompt tokens": { - "description": "min=3099.109, mean=3099.109, max=3099.109, sum=6198.218 (2)", - "tab": "General information", - "score": 3099.109090909091 - }, - "High School European History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - 
"High School Geography - # eval": { - "description": "min=198, mean=198, max=198, sum=396 (2)", - "tab": "General information", - "score": 198.0 - }, - "High School Geography - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Geography - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Geography - # prompt tokens": { - "description": "min=439.207, mean=439.207, max=439.207, sum=878.414 (2)", - "tab": "General information", - "score": 439.2070707070707 - }, - "High School Geography - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Government And Politics - # eval": { - "description": "min=193, mean=193, max=193, sum=386 (2)", - "tab": "General information", - "score": 193.0 - }, - "High School Government And Politics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Government And Politics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # prompt tokens": { - "description": "min=524.808, mean=524.808, max=524.808, sum=1049.617 (2)", - "tab": "General information", - "score": 524.8082901554404 - }, - "High School Government And Politics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Macroeconomics - # eval": { - "description": "min=390, mean=390, max=390, sum=780 (2)", - "tab": "General information", - "score": 390.0 - }, - "High School Macroeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Macroeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - # prompt tokens": { - "description": "min=433.815, mean=433.815, max=433.815, sum=867.631 (2)", - "tab": "General information", - "score": 433.81538461538463 - }, - "High School Macroeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Mathematics - # eval": { - "description": "min=270, mean=270, max=270, sum=540 (2)", - "tab": "General information", - "score": 270.0 - }, - "High School Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # prompt tokens": { - "description": "min=594.13, mean=594.13, max=594.13, sum=1188.259 (2)", - "tab": "General information", - "score": 594.1296296296297 - }, - "High School Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Microeconomics - # eval": { - "description": "min=238, mean=238, max=238, sum=476 (2)", - "tab": "General information", - "score": 238.0 - }, - "High School Microeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School 
Microeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Microeconomics - # prompt tokens": { - "description": "min=453.345, mean=453.345, max=453.345, sum=906.689 (2)", - "tab": "General information", - "score": 453.34453781512605 - }, - "High School Microeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Physics - # eval": { - "description": "min=151, mean=151, max=151, sum=302 (2)", - "tab": "General information", - "score": 151.0 - }, - "High School Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # prompt tokens": { - "description": "min=632.775, mean=632.775, max=632.775, sum=1265.55 (2)", - "tab": "General information", - "score": 632.774834437086 - }, - "High School Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Psychology - # eval": { - "description": "min=545, mean=545, max=545, sum=1090 (2)", - "tab": "General information", - "score": 545.0 - }, - "High School Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # prompt tokens": { - "description": "min=568.873, mean=568.873, max=568.873, sum=1137.747 (2)", - "tab": "General information", - "score": 568.8733944954129 - }, - "High School Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Statistics - # eval": { - "description": "min=216, mean=216, max=216, sum=432 (2)", - "tab": "General information", - "score": 216.0 - }, - "High School Statistics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Statistics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Statistics - # prompt tokens": { - "description": "min=923.644, mean=923.644, max=923.644, sum=1847.287 (2)", - "tab": "General information", - "score": 923.6435185185185 - }, - "High School Statistics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School US History - # eval": { - "description": "min=204, mean=204, max=204, sum=408 (2)", - "tab": "General information", - "score": 204.0 - }, - "High School US History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School US History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # prompt tokens": { - "description": "min=2487.446, mean=2487.446, max=2487.446, sum=4974.892 (2)", - "tab": "General information", - "score": 2487.4460784313724 - }, - "High School US History - # output tokens": { - "description": "min=1, mean=1, 
max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School World History - # eval": { - "description": "min=237, mean=237, max=237, sum=474 (2)", - "tab": "General information", - "score": 237.0 - }, - "High School World History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School World History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School World History - # prompt tokens": { - "description": "min=1595.553, mean=1595.553, max=1595.553, sum=3191.105 (2)", - "tab": "General information", - "score": 1595.5527426160338 - }, - "High School World History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" - } - } - }, - { - "evaluation_name": "Human Sexuality", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Human Sexuality", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.924, - "details": { - "description": "min=0.924, mean=0.924, max=0.924, sum=1.847 (2)", - "tab": "Accuracy", - "Human Aging - Observed inference time (s)": { - "description": "min=0.819, mean=0.819, max=0.819, sum=1.639 (2)", - "tab": "Efficiency", - "score": 0.8192698356816587 - }, - "Human Sexuality - Observed inference time (s)": { - "description": "min=0.733, mean=0.733, max=0.733, sum=1.466 (2)", - "tab": "Efficiency", - "score": 0.732998116325786 - }, - "Human Aging - # eval": { - "description": "min=223, mean=223, max=223, sum=446 (2)", - "tab": "General information", - "score": 223.0 - }, - "Human Aging - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Aging - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Aging - # prompt tokens": { - "description": "min=363.152, mean=363.152, max=363.152, sum=726.305 (2)", - "tab": "General information", - "score": 363.15246636771303 - }, - "Human Aging - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Human Sexuality - # eval": { - "description": "min=131, mean=131, max=131, sum=262 (2)", - "tab": "General information", - "score": 131.0 - }, - "Human Sexuality - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Sexuality - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # prompt tokens": { - "description": "min=404.748, mean=404.748, max=404.748, sum=809.496 (2)", - "tab": "General information", - "score": 404.7480916030534 - }, - "Human Sexuality - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - 
"additional_details": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" - } - } - }, - { - "evaluation_name": "International Law", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on International Law", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.926, - "details": { - "description": "min=0.926, mean=0.926, max=0.926, sum=1.851 (2)", - "tab": "Accuracy", - "International Law - Observed inference time (s)": { - "description": "min=0.852, mean=0.852, max=0.852, sum=1.705 (2)", - "tab": "Efficiency", - "score": 0.8524710600041161 - }, - "International Law - # eval": { - "description": "min=121, mean=121, max=121, sum=242 (2)", - "tab": "General information", - "score": 121.0 - }, - "International Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "International Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "International Law - # prompt tokens": { - "description": "min=730.182, mean=730.182, max=730.182, sum=1460.364 (2)", - "tab": "General information", - "score": 730.1818181818181 - }, - "International Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" - } - } - }, - { - "evaluation_name": "Logical Fallacies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Logical Fallacies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.847, - "details": { - "description": "min=0.847, mean=0.847, max=0.847, sum=1.693 (2)", - "tab": "Accuracy", - "Logical Fallacies - Observed inference time (s)": { - "description": "min=0.944, mean=0.944, max=0.944, sum=1.887 (2)", - "tab": "Efficiency", - "score": 0.9436116130805454 - }, - "Logical Fallacies - # eval": { - "description": "min=163, mean=163, max=163, sum=326 (2)", - "tab": "General information", - "score": 163.0 - }, - "Logical Fallacies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Logical Fallacies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Logical Fallacies - # prompt tokens": { - "description": "min=496.779, mean=496.779, max=496.779, sum=993.558 (2)", - "tab": "General information", - "score": 496.77914110429447 - }, - "Logical Fallacies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - 
"eval_split": "test", - "groups": "mmlu_logical_fallacies" - } - } - }, - { - "evaluation_name": "Machine Learning", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Machine Learning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.661, - "details": { - "description": "min=0.661, mean=0.661, max=0.661, sum=1.321 (2)", - "tab": "Accuracy", - "Machine Learning - Observed inference time (s)": { - "description": "min=0.874, mean=0.874, max=0.874, sum=1.748 (2)", - "tab": "Efficiency", - "score": 0.8740715363195964 - }, - "Machine Learning - # eval": { - "description": "min=112, mean=112, max=112, sum=224 (2)", - "tab": "General information", - "score": 112.0 - }, - "Machine Learning - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Machine Learning - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Machine Learning - # prompt tokens": { - "description": "min=744.83, mean=744.83, max=744.83, sum=1489.661 (2)", - "tab": "General information", - "score": 744.8303571428571 - }, - "Machine Learning - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" - } - } - }, - { - "evaluation_name": "Management", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Management", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.883, - "details": { - "description": "min=0.883, mean=0.883, max=0.883, sum=1.767 (2)", - "tab": "Accuracy", - "Management - Observed inference time (s)": { - "description": "min=0.79, mean=0.79, max=0.79, sum=1.58 (2)", - "tab": "Efficiency", - "score": 0.7901336544925727 - }, - "Management - # eval": { - "description": "min=103, mean=103, max=103, sum=206 (2)", - "tab": "General information", - "score": 103.0 - }, - "Management - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Management - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Management - # prompt tokens": { - "description": "min=325.359, mean=325.359, max=325.359, sum=650.718 (2)", - "tab": "General information", - "score": 325.3592233009709 - }, - "Management - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" - } - } - }, - { - "evaluation_name": "Marketing", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": 
[ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Marketing", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.94, - "details": { - "description": "min=0.94, mean=0.94, max=0.94, sum=1.88 (2)", - "tab": "Accuracy", - "Marketing - Observed inference time (s)": { - "description": "min=0.884, mean=0.884, max=0.884, sum=1.768 (2)", - "tab": "Efficiency", - "score": 0.88404920977405 - }, - "Marketing - # eval": { - "description": "min=234, mean=234, max=234, sum=468 (2)", - "tab": "General information", - "score": 234.0 - }, - "Marketing - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Marketing - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Marketing - # prompt tokens": { - "description": "min=473.423, mean=473.423, max=473.423, sum=946.846 (2)", - "tab": "General information", - "score": 473.4230769230769 - }, - "Marketing - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" - } - } - }, - { - "evaluation_name": "Medical Genetics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Medical Genetics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9, - "details": { - "description": "min=0.9, mean=0.9, max=0.9, sum=1.8 (2)", - "tab": "Accuracy", - "Medical Genetics - Observed inference time (s)": { - "description": "min=0.77, mean=0.77, max=0.77, sum=1.54 (2)", - "tab": "Efficiency", - "score": 0.7701838827133178 - }, - "Medical Genetics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Medical Genetics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Medical Genetics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Medical Genetics - # prompt tokens": { - "description": "min=415.71, mean=415.71, max=415.71, sum=831.42 (2)", - "tab": "General information", - "score": 415.71 - }, - "Medical Genetics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" - } - } - }, - { - "evaluation_name": "Miscellaneous", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Miscellaneous", - "lower_is_better": false, 
- "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.936, - "details": { - "description": "min=0.936, mean=0.936, max=0.936, sum=1.872 (2)", - "tab": "Accuracy", - "Miscellaneous - Observed inference time (s)": { - "description": "min=0.71, mean=0.71, max=0.71, sum=1.419 (2)", - "tab": "Efficiency", - "score": 0.7095236696045975 - }, - "Miscellaneous - # eval": { - "description": "min=783, mean=783, max=783, sum=1566 (2)", - "tab": "General information", - "score": 783.0 - }, - "Miscellaneous - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Miscellaneous - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Miscellaneous - # prompt tokens": { - "description": "min=358.519, mean=358.519, max=358.519, sum=717.037 (2)", - "tab": "General information", - "score": 358.51851851851853 - }, - "Miscellaneous - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" - } - } - }, - { - "evaluation_name": "Moral Scenarios", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Moral Scenarios", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.839, - "details": { - "description": "min=0.839, mean=0.839, max=0.839, sum=1.678 (2)", - "tab": "Accuracy", - "Moral Disputes - Observed inference time (s)": { - "description": "min=0.861, mean=0.861, max=0.861, sum=1.721 (2)", - "tab": "Efficiency", - "score": 0.8607459598883039 - }, - "Moral Scenarios - Observed inference time (s)": { - "description": "min=0.831, mean=0.831, max=0.831, sum=1.663 (2)", - "tab": "Efficiency", - "score": 0.8314023547998354 - }, - "Moral Disputes - # eval": { - "description": "min=346, mean=346, max=346, sum=692 (2)", - "tab": "General information", - "score": 346.0 - }, - "Moral Disputes - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Disputes - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Disputes - # prompt tokens": { - "description": "min=550.038, mean=550.038, max=550.038, sum=1100.075 (2)", - "tab": "General information", - "score": 550.0375722543353 - }, - "Moral Disputes - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Moral Scenarios - # eval": { - "description": "min=895, mean=895, max=895, sum=1790 (2)", - "tab": "General information", - "score": 895.0 - }, - "Moral Scenarios - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Scenarios - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # prompt tokens": { - "description": "min=755.516, mean=755.516, max=755.516, sum=1511.032 (2)", - 
"tab": "General information", - "score": 755.5162011173185 - }, - "Moral Scenarios - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" - } - } - }, - { - "evaluation_name": "Nutrition", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Nutrition", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.827, - "details": { - "description": "min=0.827, mean=0.827, max=0.827, sum=1.654 (2)", - "tab": "Accuracy", - "Nutrition - Observed inference time (s)": { - "description": "min=0.816, mean=0.816, max=0.816, sum=1.632 (2)", - "tab": "Efficiency", - "score": 0.8157819338094175 - }, - "Nutrition - # eval": { - "description": "min=306, mean=306, max=306, sum=612 (2)", - "tab": "General information", - "score": 306.0 - }, - "Nutrition - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Nutrition - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Nutrition - # prompt tokens": { - "description": "min=690.69, mean=690.69, max=690.69, sum=1381.379 (2)", - "tab": "General information", - "score": 690.6895424836601 - }, - "Nutrition - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" - } - } - }, - { - "evaluation_name": "Prehistory", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Prehistory", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.92, - "details": { - "description": "min=0.92, mean=0.92, max=0.92, sum=1.84 (2)", - "tab": "Accuracy", - "Prehistory - Observed inference time (s)": { - "description": "min=0.927, mean=0.927, max=0.927, sum=1.854 (2)", - "tab": "Efficiency", - "score": 0.9269687445075424 - }, - "Prehistory - # eval": { - "description": "min=324, mean=324, max=324, sum=648 (2)", - "tab": "General information", - "score": 324.0 - }, - "Prehistory - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Prehistory - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Prehistory - # prompt tokens": { - "description": "min=612.145, mean=612.145, max=612.145, sum=1224.29 (2)", - "tab": "General information", - "score": 612.145061728395 - }, - "Prehistory - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - 
"additional_details": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" - } - } - }, - { - "evaluation_name": "Public Relations", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Public Relations", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.764, - "details": { - "description": "min=0.764, mean=0.764, max=0.764, sum=1.527 (2)", - "tab": "Accuracy", - "Public Relations - Observed inference time (s)": { - "description": "min=0.75, mean=0.75, max=0.75, sum=1.5 (2)", - "tab": "Efficiency", - "score": 0.7498581886291504 - }, - "Public Relations - # eval": { - "description": "min=110, mean=110, max=110, sum=220 (2)", - "tab": "General information", - "score": 110.0 - }, - "Public Relations - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Public Relations - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Public Relations - # prompt tokens": { - "description": "min=472.036, mean=472.036, max=472.036, sum=944.073 (2)", - "tab": "General information", - "score": 472.03636363636366 - }, - "Public Relations - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" - } - } - }, - { - "evaluation_name": "Security Studies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Security Studies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.865, - "details": { - "description": "min=0.865, mean=0.865, max=0.865, sum=1.731 (2)", - "tab": "Accuracy", - "Security Studies - Observed inference time (s)": { - "description": "min=0.73, mean=0.73, max=0.73, sum=1.459 (2)", - "tab": "Efficiency", - "score": 0.7295293778789287 - }, - "Security Studies - # eval": { - "description": "min=245, mean=245, max=245, sum=490 (2)", - "tab": "General information", - "score": 245.0 - }, - "Security Studies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Security Studies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Security Studies - # prompt tokens": { - "description": "min=1325.865, mean=1325.865, max=1325.865, sum=2651.731 (2)", - "tab": "General information", - "score": 1325.865306122449 - }, - "Security Studies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": 
"mmlu_security_studies" - } - } - }, - { - "evaluation_name": "Sociology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Sociology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.91, - "details": { - "description": "min=0.91, mean=0.91, max=0.91, sum=1.821 (2)", - "tab": "Accuracy", - "Sociology - Observed inference time (s)": { - "description": "min=0.751, mean=0.751, max=0.751, sum=1.501 (2)", - "tab": "Efficiency", - "score": 0.750605917688626 - }, - "Sociology - # eval": { - "description": "min=201, mean=201, max=201, sum=402 (2)", - "tab": "General information", - "score": 201.0 - }, - "Sociology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Sociology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Sociology - # prompt tokens": { - "description": "min=497.95, mean=497.95, max=497.95, sum=995.9 (2)", - "tab": "General information", - "score": 497.9502487562189 - }, - "Sociology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" - } - } - }, - { - "evaluation_name": "Virology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Virology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.59, - "details": { - "description": "min=0.59, mean=0.59, max=0.59, sum=1.181 (2)", - "tab": "Accuracy", - "Virology - Observed inference time (s)": { - "description": "min=0.824, mean=0.824, max=0.824, sum=1.648 (2)", - "tab": "Efficiency", - "score": 0.8238025544637657 - }, - "Virology - # eval": { - "description": "min=166, mean=166, max=166, sum=332 (2)", - "tab": "General information", - "score": 166.0 - }, - "Virology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Virology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Virology - # prompt tokens": { - "description": "min=405.349, mean=405.349, max=405.349, sum=810.699 (2)", - "tab": "General information", - "score": 405.34939759036143 - }, - "Virology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" - } - } - }, - { - "evaluation_name": "World Religions", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - 
}, - "metric_config": { - "evaluation_description": "EM on World Religions", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.865, - "details": { - "description": "min=0.865, mean=0.865, max=0.865, sum=1.731 (2)", - "tab": "Accuracy", - "World Religions - Observed inference time (s)": { - "description": "min=0.834, mean=0.834, max=0.834, sum=1.668 (2)", - "tab": "Efficiency", - "score": 0.8341451960000378 - }, - "World Religions - # eval": { - "description": "min=171, mean=171, max=171, sum=342 (2)", - "tab": "General information", - "score": 171.0 - }, - "World Religions - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "World Religions - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "World Religions - # prompt tokens": { - "description": "min=318.924, mean=318.924, max=318.924, sum=637.848 (2)", - "tab": "General information", - "score": 318.92397660818716 - }, - "World Religions - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" - } - } - }, - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.24, - "details": { - "tab": "Efficiency" - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_mmlu/mistralai/mistral-small-2402/c2be131b-808c-4947-b24f-69ef6af499d7.json b/data/helm_mmlu/mistralai/mistral-small-2402/c2be131b-808c-4947-b24f-69ef6af499d7.json deleted file mode 100644 index 457d9ed2a..000000000 --- a/data/helm_mmlu/mistralai/mistral-small-2402/c2be131b-808c-4947-b24f-69ef6af499d7.json +++ /dev/null @@ -1,3021 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/mistralai_mistral-small-2402/1770835937.459157", - "retrieved_timestamp": "1770835937.459157", - "source_metadata": { - "source_name": "helm_mmlu", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral Small 2402", - "id": "mistralai/mistral-small-2402", - "developer": "mistralai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "MMLU All Subjects", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU All Subjects", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.687, - "details": { - "description": "min=0.215, mean=0.687, max=0.948, sum=78.352 (114)", - "tab": 
"Accuracy", - "MMLU All Subjects - Observed inference time (s)": { - "description": "min=0.279, mean=0.486, max=1.477, sum=55.362 (114)", - "tab": "Efficiency", - "score": 0.4856315259373381 - }, - "MMLU All Subjects - # eval": { - "description": "min=100, mean=246.351, max=1534, sum=28084 (114)", - "tab": "General information", - "score": 246.35087719298247 - }, - "MMLU All Subjects - # train": { - "description": "min=5, mean=5, max=5, sum=570 (114)", - "tab": "General information", - "score": 5.0 - }, - "MMLU All Subjects - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (114)", - "tab": "General information", - "score": 0.0 - }, - "MMLU All Subjects - # prompt tokens": { - "description": "min=308.924, mean=696.273, max=3089.109, sum=79375.178 (114)", - "tab": "General information", - "score": 696.2734899593811 - }, - "MMLU All Subjects - # output tokens": { - "description": "min=1, mean=1, max=1, sum=114 (114)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - 
"mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] - } - } - }, - { - "evaluation_name": "Abstract Algebra", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Abstract Algebra", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.26, - "details": { - "description": "min=0.26, mean=0.26, max=0.26, sum=0.52 (2)", - "tab": "Accuracy", - "Abstract Algebra - Observed inference time (s)": { - "description": "min=1.31, mean=1.31, max=1.31, sum=2.621 (2)", - "tab": "Efficiency", - "score": 1.3102962040901185 - }, - "Abstract Algebra - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Abstract Algebra - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Abstract Algebra - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Abstract Algebra - # prompt tokens": { - "description": "min=402.44, mean=402.44, max=402.44, sum=804.88 (2)", - "tab": "General information", - "score": 402.44 - }, - "Abstract Algebra - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" - } - } - }, - { - "evaluation_name": "Anatomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Anatomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.674, - "details": { - "description": "min=0.674, mean=0.674, max=0.674, sum=1.348 (2)", - "tab": "Accuracy", - "Anatomy - Observed inference time (s)": { - "description": "min=0.359, mean=0.359, max=0.359, sum=0.719 (2)", - "tab": "Efficiency", - "score": 0.35931493441263834 - }, - "Anatomy - # eval": { - "description": "min=135, mean=135, max=135, sum=270 (2)", - "tab": "General information", - "score": 135.0 - }, - "Anatomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Anatomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Anatomy - # prompt tokens": { - "description": "min=407.089, mean=407.089, max=407.089, sum=814.178 (2)", - "tab": "General information", - "score": 407.0888888888889 - }, - "Anatomy 
- # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" - } - } - }, - { - "evaluation_name": "College Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on College Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.402, - "details": { - "description": "min=0.402, mean=0.402, max=0.402, sum=0.804 (2)", - "tab": "Accuracy", - "College Chemistry - Observed inference time (s)": { - "description": "min=1.152, mean=1.152, max=1.152, sum=2.304 (2)", - "tab": "Efficiency", - "score": 1.151910934448242 - }, - "College Biology - Observed inference time (s)": { - "description": "min=0.358, mean=0.358, max=0.358, sum=0.716 (2)", - "tab": "Efficiency", - "score": 0.3582056214412053 - }, - "College Computer Science - Observed inference time (s)": { - "description": "min=0.295, mean=0.295, max=0.295, sum=0.59 (2)", - "tab": "Efficiency", - "score": 0.29487616300582886 - }, - "College Mathematics - Observed inference time (s)": { - "description": "min=0.448, mean=0.448, max=0.448, sum=0.896 (2)", - "tab": "Efficiency", - "score": 0.44812692165374757 - }, - "College Medicine - Observed inference time (s)": { - "description": "min=0.367, mean=0.367, max=0.367, sum=0.734 (2)", - "tab": "Efficiency", - "score": 0.3668311620723305 - }, - "College Physics - Observed inference time (s)": { - "description": "min=0.375, mean=0.375, max=0.375, sum=0.75 (2)", - "tab": "Efficiency", - "score": 0.37511497852849024 - }, - "College Chemistry - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Chemistry - # prompt tokens": { - "description": "min=627.71, mean=627.71, max=627.71, sum=1255.42 (2)", - "tab": "General information", - "score": 627.71 - }, - "College Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Biology - # eval": { - "description": "min=144, mean=144, max=144, sum=288 (2)", - "tab": "General information", - "score": 144.0 - }, - "College Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # prompt tokens": { - "description": "min=550.799, mean=550.799, max=550.799, sum=1101.597 (2)", - "tab": "General information", - "score": 550.7986111111111 - }, - "College Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Computer Science - # eval": { - "description": "min=100, mean=100, 
max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer Science - # prompt tokens": { - "description": "min=902.17, mean=902.17, max=902.17, sum=1804.34 (2)", - "tab": "General information", - "score": 902.17 - }, - "College Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Mathematics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # prompt tokens": { - "description": "min=658.31, mean=658.31, max=658.31, sum=1316.62 (2)", - "tab": "General information", - "score": 658.31 - }, - "College Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Medicine - # eval": { - "description": "min=173, mean=173, max=173, sum=346 (2)", - "tab": "General information", - "score": 173.0 - }, - "College Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Medicine - # prompt tokens": { - "description": "min=592.41, mean=592.41, max=592.41, sum=1184.821 (2)", - "tab": "General information", - "score": 592.4104046242775 - }, - "College Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Physics - # eval": { - "description": "min=102, mean=102, max=102, sum=204 (2)", - "tab": "General information", - "score": 102.0 - }, - "College Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Physics - # prompt tokens": { - "description": "min=551.029, mean=551.029, max=551.029, sum=1102.059 (2)", - "tab": "General information", - "score": 551.0294117647059 - }, - "College Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" - } - } - }, - { - "evaluation_name": "Computer Security", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Computer Security", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.77, - "details": { - "description": "min=0.77, mean=0.77, max=0.77, sum=1.54 (2)", - "tab": "Accuracy", - "Computer Security - Observed inference time (s)": { - "description": "min=1.022, mean=1.022, max=1.022, sum=2.044 (2)", - "tab": "Efficiency", - "score": 1.0222336649894714 - }, - "Computer Security - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Computer Security - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Computer Security - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Computer Security - # prompt tokens": { - "description": "min=424.94, mean=424.94, max=424.94, sum=849.88 (2)", - "tab": "General information", - "score": 424.94 - }, - "Computer Security - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" - } - } - }, - { - "evaluation_name": "Econometrics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Econometrics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.614, - "details": { - "description": "min=0.614, mean=0.614, max=0.614, sum=1.228 (2)", - "tab": "Accuracy", - "Econometrics - Observed inference time (s)": { - "description": "min=1.477, mean=1.477, max=1.477, sum=2.954 (2)", - "tab": "Efficiency", - "score": 1.4771089867541665 - }, - "Econometrics - # eval": { - "description": "min=114, mean=114, max=114, sum=228 (2)", - "tab": "General information", - "score": 114.0 - }, - "Econometrics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Econometrics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Econometrics - # prompt tokens": { - "description": "min=687.175, mean=687.175, max=687.175, sum=1374.351 (2)", - "tab": "General information", - "score": 687.1754385964912 - }, - "Econometrics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" - } - } - }, - { - "evaluation_name": "Global Facts", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Global Facts", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.45, - "details": { - "description": "min=0.45, mean=0.45, max=0.45, sum=0.9 (2)", - "tab": 
"Accuracy", - "Global Facts - Observed inference time (s)": { - "description": "min=0.364, mean=0.364, max=0.364, sum=0.728 (2)", - "tab": "Efficiency", - "score": 0.36384799242019655 - }, - "Global Facts - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Global Facts - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Global Facts - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Global Facts - # prompt tokens": { - "description": "min=483.47, mean=483.47, max=483.47, sum=966.94 (2)", - "tab": "General information", - "score": 483.47 - }, - "Global Facts - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" - } - } - }, - { - "evaluation_name": "Jurisprudence", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Jurisprudence", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.833, - "details": { - "description": "min=0.833, mean=0.833, max=0.833, sum=1.667 (2)", - "tab": "Accuracy", - "Jurisprudence - Observed inference time (s)": { - "description": "min=0.435, mean=0.435, max=0.435, sum=0.871 (2)", - "tab": "Efficiency", - "score": 0.4353830130011947 - }, - "Jurisprudence - # eval": { - "description": "min=108, mean=108, max=108, sum=216 (2)", - "tab": "General information", - "score": 108.0 - }, - "Jurisprudence - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Jurisprudence - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Jurisprudence - # prompt tokens": { - "description": "min=451.093, mean=451.093, max=451.093, sum=902.185 (2)", - "tab": "General information", - "score": 451.0925925925926 - }, - "Jurisprudence - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" - } - } - }, - { - "evaluation_name": "Philosophy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Philosophy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.765, - "details": { - "description": "min=0.765, mean=0.765, max=0.765, sum=1.531 (2)", - "tab": "Accuracy", - "Philosophy - Observed inference time (s)": { - "description": "min=0.438, mean=0.438, max=0.438, sum=0.877 (2)", - "tab": "Efficiency", - "score": 0.43847233306173344 - }, - "Philosophy - # eval": { 
- "description": "min=311, mean=311, max=311, sum=622 (2)", - "tab": "General information", - "score": 311.0 - }, - "Philosophy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Philosophy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Philosophy - # prompt tokens": { - "description": "min=373.82, mean=373.82, max=373.82, sum=747.64 (2)", - "tab": "General information", - "score": 373.81993569131834 - }, - "Philosophy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" - } - } - }, - { - "evaluation_name": "Professional Psychology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Professional Psychology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.768, - "details": { - "description": "min=0.768, mean=0.768, max=0.768, sum=1.536 (2)", - "tab": "Accuracy", - "Professional Medicine - Observed inference time (s)": { - "description": "min=0.459, mean=0.459, max=0.459, sum=0.919 (2)", - "tab": "Efficiency", - "score": 0.45927367666188407 - }, - "Professional Accounting - Observed inference time (s)": { - "description": "min=0.447, mean=0.447, max=0.447, sum=0.895 (2)", - "tab": "Efficiency", - "score": 0.447448378759073 - }, - "Professional Law - Observed inference time (s)": { - "description": "min=0.408, mean=0.408, max=0.408, sum=0.816 (2)", - "tab": "Efficiency", - "score": 0.407953996390998 - }, - "Professional Psychology - Observed inference time (s)": { - "description": "min=0.42, mean=0.42, max=0.42, sum=0.839 (2)", - "tab": "Efficiency", - "score": 0.41963181386586107 - }, - "Professional Medicine - # eval": { - "description": "min=272, mean=272, max=272, sum=544 (2)", - "tab": "General information", - "score": 272.0 - }, - "Professional Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Medicine - # prompt tokens": { - "description": "min=1279.143, mean=1279.143, max=1279.143, sum=2558.287 (2)", - "tab": "General information", - "score": 1279.1433823529412 - }, - "Professional Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Accounting - # eval": { - "description": "min=282, mean=282, max=282, sum=564 (2)", - "tab": "General information", - "score": 282.0 - }, - "Professional Accounting - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Accounting - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Accounting - # prompt tokens": { - "description": "min=796.496, mean=796.496, max=796.496, 
sum=1592.993 (2)", - "tab": "General information", - "score": 796.4964539007092 - }, - "Professional Accounting - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Law - # eval": { - "description": "min=1534, mean=1534, max=1534, sum=3068 (2)", - "tab": "General information", - "score": 1534.0 - }, - "Professional Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # prompt tokens": { - "description": "min=1849.711, mean=1849.711, max=1849.711, sum=3699.421 (2)", - "tab": "General information", - "score": 1849.7105606258149 - }, - "Professional Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Psychology - # eval": { - "description": "min=612, mean=612, max=612, sum=1224 (2)", - "tab": "General information", - "score": 612.0 - }, - "Professional Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # prompt tokens": { - "description": "min=645.278, mean=645.278, max=645.278, sum=1290.556 (2)", - "tab": "General information", - "score": 645.2777777777778 - }, - "Professional Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" - } - } - }, - { - "evaluation_name": "Us Foreign Policy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Us Foreign Policy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.89, - "details": { - "description": "min=0.89, mean=0.89, max=0.89, sum=1.78 (2)", - "tab": "Accuracy", - "Us Foreign Policy - Observed inference time (s)": { - "description": "min=1.347, mean=1.347, max=1.347, sum=2.693 (2)", - "tab": "Efficiency", - "score": 1.3467011404037477 - }, - "Us Foreign Policy - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Us Foreign Policy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Us Foreign Policy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Us Foreign Policy - # prompt tokens": { - "description": "min=473.19, mean=473.19, max=473.19, sum=946.38 (2)", - "tab": "General information", - "score": 473.19 - }, - "Us Foreign Policy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - 
"additional_details": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" - } - } - }, - { - "evaluation_name": "Astronomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Astronomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.77, - "details": { - "description": "min=0.77, mean=0.77, max=0.77, sum=1.539 (2)", - "tab": "Accuracy", - "Astronomy - Observed inference time (s)": { - "description": "min=0.345, mean=0.345, max=0.345, sum=0.689 (2)", - "tab": "Efficiency", - "score": 0.3447367345031939 - }, - "Astronomy - # eval": { - "description": "min=152, mean=152, max=152, sum=304 (2)", - "tab": "General information", - "score": 152.0 - }, - "Astronomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Astronomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Astronomy - # prompt tokens": { - "description": "min=665.987, mean=665.987, max=665.987, sum=1331.974 (2)", - "tab": "General information", - "score": 665.9868421052631 - }, - "Astronomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" - } - } - }, - { - "evaluation_name": "Business Ethics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Business Ethics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.71, - "details": { - "description": "min=0.71, mean=0.71, max=0.71, sum=1.42 (2)", - "tab": "Accuracy", - "Business Ethics - Observed inference time (s)": { - "description": "min=0.45, mean=0.45, max=0.45, sum=0.9 (2)", - "tab": "Efficiency", - "score": 0.4499172067642212 - }, - "Business Ethics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Business Ethics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Business Ethics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Business Ethics - # prompt tokens": { - "description": "min=644.6, mean=644.6, max=644.6, sum=1289.2 (2)", - "tab": "General information", - "score": 644.6 - }, - "Business Ethics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" - } - } - }, - { - "evaluation_name": "Clinical Knowledge", - "source_data": { - 
"dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Clinical Knowledge", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.766, - "details": { - "description": "min=0.766, mean=0.766, max=0.766, sum=1.532 (2)", - "tab": "Accuracy", - "Clinical Knowledge - Observed inference time (s)": { - "description": "min=0.436, mean=0.436, max=0.436, sum=0.873 (2)", - "tab": "Efficiency", - "score": 0.4363225082181535 - }, - "Clinical Knowledge - # eval": { - "description": "min=265, mean=265, max=265, sum=530 (2)", - "tab": "General information", - "score": 265.0 - }, - "Clinical Knowledge - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Clinical Knowledge - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Clinical Knowledge - # prompt tokens": { - "description": "min=487.174, mean=487.174, max=487.174, sum=974.347 (2)", - "tab": "General information", - "score": 487.1735849056604 - }, - "Clinical Knowledge - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" - } - } - }, - { - "evaluation_name": "Conceptual Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Conceptual Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.685, - "details": { - "description": "min=0.685, mean=0.685, max=0.685, sum=1.37 (2)", - "tab": "Accuracy", - "Conceptual Physics - Observed inference time (s)": { - "description": "min=0.391, mean=0.391, max=0.391, sum=0.781 (2)", - "tab": "Efficiency", - "score": 0.3906106086487466 - }, - "Conceptual Physics - # eval": { - "description": "min=235, mean=235, max=235, sum=470 (2)", - "tab": "General information", - "score": 235.0 - }, - "Conceptual Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Conceptual Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Conceptual Physics - # prompt tokens": { - "description": "min=334.285, mean=334.285, max=334.285, sum=668.57 (2)", - "tab": "General information", - "score": 334.2851063829787 - }, - "Conceptual Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" - } - } - }, - { - "evaluation_name": "Electrical Engineering", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Electrical Engineering", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.628, - "details": { - "description": "min=0.628, mean=0.628, max=0.628, sum=1.255 (2)", - "tab": "Accuracy", - "Electrical Engineering - Observed inference time (s)": { - "description": "min=0.434, mean=0.434, max=0.434, sum=0.868 (2)", - "tab": "Efficiency", - "score": 0.4342194343435353 - }, - "Electrical Engineering - # eval": { - "description": "min=145, mean=145, max=145, sum=290 (2)", - "tab": "General information", - "score": 145.0 - }, - "Electrical Engineering - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Electrical Engineering - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Electrical Engineering - # prompt tokens": { - "description": "min=501.379, mean=501.379, max=501.379, sum=1002.759 (2)", - "tab": "General information", - "score": 501.37931034482756 - }, - "Electrical Engineering - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" - } - } - }, - { - "evaluation_name": "Elementary Mathematics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Elementary Mathematics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.415, - "details": { - "description": "min=0.415, mean=0.415, max=0.415, sum=0.831 (2)", - "tab": "Accuracy", - "Elementary Mathematics - Observed inference time (s)": { - "description": "min=0.434, mean=0.434, max=0.434, sum=0.869 (2)", - "tab": "Efficiency", - "score": 0.43446689244931336 - }, - "Elementary Mathematics - # eval": { - "description": "min=378, mean=378, max=378, sum=756 (2)", - "tab": "General information", - "score": 378.0 - }, - "Elementary Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Elementary Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Elementary Mathematics - # prompt tokens": { - "description": "min=613.386, mean=613.386, max=613.386, sum=1226.772 (2)", - "tab": "General information", - "score": 613.3862433862433 - }, - "Elementary Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" - } - } - }, - { - "evaluation_name": "Formal Logic", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Formal Logic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.516, - "details": { - "description": "min=0.516, mean=0.516, max=0.516, sum=1.032 (2)", - "tab": "Accuracy", - "Formal Logic - Observed inference time (s)": { - "description": "min=0.443, mean=0.443, max=0.443, sum=0.887 (2)", - "tab": "Efficiency", - "score": 0.4434795303950234 - }, - "Formal Logic - # eval": { - "description": "min=126, mean=126, max=126, sum=252 (2)", - "tab": "General information", - "score": 126.0 - }, - "Formal Logic - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Formal Logic - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Formal Logic - # prompt tokens": { - "description": "min=718.984, mean=718.984, max=718.984, sum=1437.968 (2)", - "tab": "General information", - "score": 718.984126984127 - }, - "Formal Logic - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" - } - } - }, - { - "evaluation_name": "High School World History", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on High School World History", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.857, - "details": { - "description": "min=0.857, mean=0.857, max=0.857, sum=1.713 (2)", - "tab": "Accuracy", - "High School Biology - Observed inference time (s)": { - "description": "min=0.374, mean=0.374, max=0.374, sum=0.749 (2)", - "tab": "Efficiency", - "score": 0.3742693070442446 - }, - "High School Chemistry - Observed inference time (s)": { - "description": "min=0.384, mean=0.384, max=0.384, sum=0.768 (2)", - "tab": "Efficiency", - "score": 0.3839088602019061 - }, - "High School Computer Science - Observed inference time (s)": { - "description": "min=0.423, mean=0.423, max=0.423, sum=0.846 (2)", - "tab": "Efficiency", - "score": 0.4230046820640564 - }, - "High School European History - Observed inference time (s)": { - "description": "min=0.456, mean=0.456, max=0.456, sum=0.911 (2)", - "tab": "Efficiency", - "score": 0.4556852485194351 - }, - "High School Geography - Observed inference time (s)": { - "description": "min=0.443, mean=0.443, max=0.443, sum=0.885 (2)", - "tab": "Efficiency", - "score": 0.44265695533367116 - }, - "High School Government And Politics - Observed inference time (s)": { - "description": "min=0.48, mean=0.48, max=0.48, sum=0.96 (2)", - "tab": "Efficiency", - "score": 0.47987033666106704 - }, - "High School Macroeconomics - Observed inference time (s)": { - "description": "min=0.366, mean=0.366, max=0.366, sum=0.731 (2)", - "tab": "Efficiency", - "score": 0.3655165384977292 - }, - "High School Mathematics - Observed inference time (s)": { - "description": "min=0.433, 
mean=0.433, max=0.433, sum=0.865 (2)", - "tab": "Efficiency", - "score": 0.4325918674468994 - }, - "High School Microeconomics - Observed inference time (s)": { - "description": "min=0.415, mean=0.415, max=0.415, sum=0.83 (2)", - "tab": "Efficiency", - "score": 0.41513349929777515 - }, - "High School Physics - Observed inference time (s)": { - "description": "min=0.417, mean=0.417, max=0.417, sum=0.834 (2)", - "tab": "Efficiency", - "score": 0.41723605496993915 - }, - "High School Psychology - Observed inference time (s)": { - "description": "min=0.448, mean=0.448, max=0.448, sum=0.896 (2)", - "tab": "Efficiency", - "score": 0.44808799017459977 - }, - "High School Statistics - Observed inference time (s)": { - "description": "min=0.402, mean=0.402, max=0.402, sum=0.805 (2)", - "tab": "Efficiency", - "score": 0.4024901666023113 - }, - "High School US History - Observed inference time (s)": { - "description": "min=0.383, mean=0.383, max=0.383, sum=0.767 (2)", - "tab": "Efficiency", - "score": 0.3834606175329171 - }, - "High School World History - Observed inference time (s)": { - "description": "min=0.399, mean=0.399, max=0.399, sum=0.798 (2)", - "tab": "Efficiency", - "score": 0.39886615648551327 - }, - "High School Biology - # eval": { - "description": "min=310, mean=310, max=310, sum=620 (2)", - "tab": "General information", - "score": 310.0 - }, - "High School Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Biology - # prompt tokens": { - "description": "min=600.561, mean=600.561, max=600.561, sum=1201.123 (2)", - "tab": "General information", - "score": 600.5612903225806 - }, - "High School Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Chemistry - # eval": { - "description": "min=203, mean=203, max=203, sum=406 (2)", - "tab": "General information", - "score": 203.0 - }, - "High School Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # prompt tokens": { - "description": "min=572.798, mean=572.798, max=572.798, sum=1145.596 (2)", - "tab": "General information", - "score": 572.7980295566502 - }, - "High School Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "High School Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # prompt tokens": { - "description": "min=988.24, mean=988.24, max=988.24, sum=1976.48 (2)", - "tab": "General information", - "score": 988.24 - }, - "High School Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General 
information", - "score": 1.0 - }, - "High School European History - # eval": { - "description": "min=165, mean=165, max=165, sum=330 (2)", - "tab": "General information", - "score": 165.0 - }, - "High School European History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School European History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School European History - # prompt tokens": { - "description": "min=3089.109, mean=3089.109, max=3089.109, sum=6178.218 (2)", - "tab": "General information", - "score": 3089.109090909091 - }, - "High School European History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Geography - # eval": { - "description": "min=198, mean=198, max=198, sum=396 (2)", - "tab": "General information", - "score": 198.0 - }, - "High School Geography - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Geography - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Geography - # prompt tokens": { - "description": "min=429.207, mean=429.207, max=429.207, sum=858.414 (2)", - "tab": "General information", - "score": 429.2070707070707 - }, - "High School Geography - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Government And Politics - # eval": { - "description": "min=193, mean=193, max=193, sum=386 (2)", - "tab": "General information", - "score": 193.0 - }, - "High School Government And Politics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Government And Politics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # prompt tokens": { - "description": "min=514.808, mean=514.808, max=514.808, sum=1029.617 (2)", - "tab": "General information", - "score": 514.8082901554404 - }, - "High School Government And Politics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Macroeconomics - # eval": { - "description": "min=390, mean=390, max=390, sum=780 (2)", - "tab": "General information", - "score": 390.0 - }, - "High School Macroeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Macroeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - # prompt tokens": { - "description": "min=423.815, mean=423.815, max=423.815, sum=847.631 (2)", - "tab": "General information", - "score": 423.81538461538463 - }, - "High School Macroeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Mathematics - # eval": { - "description": "min=270, mean=270, max=270, sum=540 (2)", - "tab": "General information", - "score": 270.0 - }, - "High School Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": 
"General information", - "score": 5.0 - }, - "High School Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # prompt tokens": { - "description": "min=584.13, mean=584.13, max=584.13, sum=1168.259 (2)", - "tab": "General information", - "score": 584.1296296296297 - }, - "High School Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Microeconomics - # eval": { - "description": "min=238, mean=238, max=238, sum=476 (2)", - "tab": "General information", - "score": 238.0 - }, - "High School Microeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Microeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Microeconomics - # prompt tokens": { - "description": "min=443.345, mean=443.345, max=443.345, sum=886.689 (2)", - "tab": "General information", - "score": 443.34453781512605 - }, - "High School Microeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Physics - # eval": { - "description": "min=151, mean=151, max=151, sum=302 (2)", - "tab": "General information", - "score": 151.0 - }, - "High School Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # prompt tokens": { - "description": "min=622.775, mean=622.775, max=622.775, sum=1245.55 (2)", - "tab": "General information", - "score": 622.774834437086 - }, - "High School Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Psychology - # eval": { - "description": "min=545, mean=545, max=545, sum=1090 (2)", - "tab": "General information", - "score": 545.0 - }, - "High School Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # prompt tokens": { - "description": "min=558.873, mean=558.873, max=558.873, sum=1117.747 (2)", - "tab": "General information", - "score": 558.8733944954129 - }, - "High School Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Statistics - # eval": { - "description": "min=216, mean=216, max=216, sum=432 (2)", - "tab": "General information", - "score": 216.0 - }, - "High School Statistics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Statistics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Statistics - # prompt tokens": { - "description": "min=913.644, mean=913.644, max=913.644, sum=1827.287 (2)", - "tab": "General information", - "score": 913.6435185185185 - }, - "High School 
Statistics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School US History - # eval": { - "description": "min=204, mean=204, max=204, sum=408 (2)", - "tab": "General information", - "score": 204.0 - }, - "High School US History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School US History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # prompt tokens": { - "description": "min=2477.446, mean=2477.446, max=2477.446, sum=4954.892 (2)", - "tab": "General information", - "score": 2477.4460784313724 - }, - "High School US History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School World History - # eval": { - "description": "min=237, mean=237, max=237, sum=474 (2)", - "tab": "General information", - "score": 237.0 - }, - "High School World History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School World History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School World History - # prompt tokens": { - "description": "min=1585.553, mean=1585.553, max=1585.553, sum=3171.105 (2)", - "tab": "General information", - "score": 1585.5527426160338 - }, - "High School World History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" - } - } - }, - { - "evaluation_name": "Human Sexuality", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Human Sexuality", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.824, - "details": { - "description": "min=0.824, mean=0.824, max=0.824, sum=1.649 (2)", - "tab": "Accuracy", - "Human Aging - Observed inference time (s)": { - "description": "min=0.332, mean=0.332, max=0.332, sum=0.664 (2)", - "tab": "Efficiency", - "score": 0.33194801304907007 - }, - "Human Sexuality - Observed inference time (s)": { - "description": "min=0.358, mean=0.358, max=0.358, sum=0.716 (2)", - "tab": "Efficiency", - "score": 0.3579711095067381 - }, - "Human Aging - # eval": { - "description": "min=223, mean=223, max=223, sum=446 (2)", - "tab": "General information", - "score": 223.0 - }, - "Human Aging - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Aging - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Aging - # prompt tokens": { - "description": "min=353.152, mean=353.152, max=353.152, sum=706.305 (2)", - "tab": "General information", - "score": 353.15246636771303 - }, - "Human Aging - # output tokens": { - "description": "min=1, mean=1, max=1, 
sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Human Sexuality - # eval": { - "description": "min=131, mean=131, max=131, sum=262 (2)", - "tab": "General information", - "score": 131.0 - }, - "Human Sexuality - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Sexuality - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # prompt tokens": { - "description": "min=394.748, mean=394.748, max=394.748, sum=789.496 (2)", - "tab": "General information", - "score": 394.7480916030534 - }, - "Human Sexuality - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" - } - } - }, - { - "evaluation_name": "International Law", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on International Law", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.826, - "details": { - "description": "min=0.826, mean=0.826, max=0.826, sum=1.653 (2)", - "tab": "Accuracy", - "International Law - Observed inference time (s)": { - "description": "min=0.378, mean=0.378, max=0.378, sum=0.755 (2)", - "tab": "Efficiency", - "score": 0.37766425668700665 - }, - "International Law - # eval": { - "description": "min=121, mean=121, max=121, sum=242 (2)", - "tab": "General information", - "score": 121.0 - }, - "International Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "International Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "International Law - # prompt tokens": { - "description": "min=720.182, mean=720.182, max=720.182, sum=1440.364 (2)", - "tab": "General information", - "score": 720.1818181818181 - }, - "International Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" - } - } - }, - { - "evaluation_name": "Logical Fallacies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Logical Fallacies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.804, - "details": { - "description": "min=0.804, mean=0.804, max=0.804, sum=1.607 (2)", - "tab": "Accuracy", - "Logical Fallacies - Observed inference time (s)": { - "description": "min=0.39, mean=0.39, max=0.39, sum=0.781 (2)", - "tab": "Efficiency", - "score": 0.3902764905449803 - }, - "Logical Fallacies - # eval": { - "description": 
"min=163, mean=163, max=163, sum=326 (2)", - "tab": "General information", - "score": 163.0 - }, - "Logical Fallacies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Logical Fallacies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Logical Fallacies - # prompt tokens": { - "description": "min=486.779, mean=486.779, max=486.779, sum=973.558 (2)", - "tab": "General information", - "score": 486.77914110429447 - }, - "Logical Fallacies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" - } - } - }, - { - "evaluation_name": "Machine Learning", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Machine Learning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.562, - "details": { - "description": "min=0.562, mean=0.562, max=0.562, sum=1.125 (2)", - "tab": "Accuracy", - "Machine Learning - Observed inference time (s)": { - "description": "min=0.393, mean=0.393, max=0.393, sum=0.785 (2)", - "tab": "Efficiency", - "score": 0.3927395024469921 - }, - "Machine Learning - # eval": { - "description": "min=112, mean=112, max=112, sum=224 (2)", - "tab": "General information", - "score": 112.0 - }, - "Machine Learning - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Machine Learning - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Machine Learning - # prompt tokens": { - "description": "min=734.83, mean=734.83, max=734.83, sum=1469.661 (2)", - "tab": "General information", - "score": 734.8303571428571 - }, - "Machine Learning - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" - } - } - }, - { - "evaluation_name": "Management", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Management", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.786, - "details": { - "description": "min=0.786, mean=0.786, max=0.786, sum=1.573 (2)", - "tab": "Accuracy", - "Management - Observed inference time (s)": { - "description": "min=0.518, mean=0.518, max=0.518, sum=1.035 (2)", - "tab": "Efficiency", - "score": 0.5177000564278909 - }, - "Management - # eval": { - "description": "min=103, mean=103, max=103, sum=206 (2)", - "tab": "General information", - "score": 103.0 - }, - "Management - # train": { - "description": 
"min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Management - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Management - # prompt tokens": { - "description": "min=315.359, mean=315.359, max=315.359, sum=630.718 (2)", - "tab": "General information", - "score": 315.3592233009709 - }, - "Management - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" - } - } - }, - { - "evaluation_name": "Marketing", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Marketing", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.906, - "details": { - "description": "min=0.906, mean=0.906, max=0.906, sum=1.812 (2)", - "tab": "Accuracy", - "Marketing - Observed inference time (s)": { - "description": "min=0.425, mean=0.425, max=0.425, sum=0.85 (2)", - "tab": "Efficiency", - "score": 0.42478426195617414 - }, - "Marketing - # eval": { - "description": "min=234, mean=234, max=234, sum=468 (2)", - "tab": "General information", - "score": 234.0 - }, - "Marketing - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Marketing - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Marketing - # prompt tokens": { - "description": "min=463.423, mean=463.423, max=463.423, sum=926.846 (2)", - "tab": "General information", - "score": 463.4230769230769 - }, - "Marketing - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" - } - } - }, - { - "evaluation_name": "Medical Genetics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Medical Genetics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.75, - "details": { - "description": "min=0.75, mean=0.75, max=0.75, sum=1.5 (2)", - "tab": "Accuracy", - "Medical Genetics - Observed inference time (s)": { - "description": "min=0.279, mean=0.279, max=0.279, sum=0.557 (2)", - "tab": "Efficiency", - "score": 0.2786110520362854 - }, - "Medical Genetics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Medical Genetics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Medical Genetics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - 
"Medical Genetics - # prompt tokens": { - "description": "min=405.71, mean=405.71, max=405.71, sum=811.42 (2)", - "tab": "General information", - "score": 405.71 - }, - "Medical Genetics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" - } - } - }, - { - "evaluation_name": "Miscellaneous", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Miscellaneous", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.844, - "details": { - "description": "min=0.844, mean=0.844, max=0.844, sum=1.688 (2)", - "tab": "Accuracy", - "Miscellaneous - Observed inference time (s)": { - "description": "min=0.4, mean=0.4, max=0.4, sum=0.8 (2)", - "tab": "Efficiency", - "score": 0.3998657326436439 - }, - "Miscellaneous - # eval": { - "description": "min=783, mean=783, max=783, sum=1566 (2)", - "tab": "General information", - "score": 783.0 - }, - "Miscellaneous - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Miscellaneous - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Miscellaneous - # prompt tokens": { - "description": "min=348.519, mean=348.519, max=348.519, sum=697.037 (2)", - "tab": "General information", - "score": 348.51851851851853 - }, - "Miscellaneous - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" - } - } - }, - { - "evaluation_name": "Moral Scenarios", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Moral Scenarios", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.575, - "details": { - "description": "min=0.575, mean=0.575, max=0.575, sum=1.151 (2)", - "tab": "Accuracy", - "Moral Disputes - Observed inference time (s)": { - "description": "min=0.474, mean=0.474, max=0.474, sum=0.949 (2)", - "tab": "Efficiency", - "score": 0.4744071271378181 - }, - "Moral Scenarios - Observed inference time (s)": { - "description": "min=0.4, mean=0.4, max=0.4, sum=0.799 (2)", - "tab": "Efficiency", - "score": 0.39967524166213736 - }, - "Moral Disputes - # eval": { - "description": "min=346, mean=346, max=346, sum=692 (2)", - "tab": "General information", - "score": 346.0 - }, - "Moral Disputes - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Disputes - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral 
- "Moral Disputes - # prompt tokens": {
- "description": "min=540.038, mean=540.038, max=540.038, sum=1080.075 (2)",
- "tab": "General information",
- "score": 540.0375722543353
- },
- "Moral Disputes - # output tokens": {
- "description": "min=1, mean=1, max=1, sum=2 (2)",
- "tab": "General information",
- "score": 1.0
- },
- "Moral Scenarios - # eval": {
- "description": "min=895, mean=895, max=895, sum=1790 (2)",
- "tab": "General information",
- "score": 895.0
- },
- "Moral Scenarios - # train": {
- "description": "min=5, mean=5, max=5, sum=10 (2)",
- "tab": "General information",
- "score": 5.0
- },
- "Moral Scenarios - truncated": {
- "description": "min=0, mean=0, max=0, sum=0 (2)",
- "tab": "General information",
- "score": 0.0
- },
- "Moral Scenarios - # prompt tokens": {
- "description": "min=745.516, mean=745.516, max=745.516, sum=1491.032 (2)",
- "tab": "General information",
- "score": 745.5162011173185
- },
- "Moral Scenarios - # output tokens": {
- "description": "min=1, mean=1, max=1, sum=2 (2)",
- "tab": "General information",
- "score": 1.0
- }
- }
- },
- "generation_config": {
- "additional_details": {
- "subject": "moral_scenarios",
- "method": "multiple_choice_joint",
- "eval_split": "test",
- "groups": "mmlu_moral_scenarios"
- }
- }
- },
- {
- "evaluation_name": "Nutrition",
- "source_data": {
- "dataset_name": "helm_mmlu",
- "source_type": "url",
- "url": [
- "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
- ]
- },
- "metric_config": {
- "evaluation_description": "EM on Nutrition",
- "lower_is_better": false,
- "score_type": "continuous",
- "min_score": 0.0,
- "max_score": 1.0
- },
- "score_details": {
- "score": 0.761,
- "details": {
- "description": "min=0.761, mean=0.761, max=0.761, sum=1.523 (2)",
- "tab": "Accuracy",
- "Nutrition - Observed inference time (s)": {
- "description": "min=0.421, mean=0.421, max=0.421, sum=0.843 (2)",
- "tab": "Efficiency",
- "score": 0.42128828927582385
- },
- "Nutrition - # eval": {
- "description": "min=306, mean=306, max=306, sum=612 (2)",
- "tab": "General information",
- "score": 306.0
- },
- "Nutrition - # train": {
- "description": "min=5, mean=5, max=5, sum=10 (2)",
- "tab": "General information",
- "score": 5.0
- },
- "Nutrition - truncated": {
- "description": "min=0, mean=0, max=0, sum=0 (2)",
- "tab": "General information",
- "score": 0.0
- },
- "Nutrition - # prompt tokens": {
- "description": "min=680.69, mean=680.69, max=680.69, sum=1361.379 (2)",
- "tab": "General information",
- "score": 680.6895424836601
- },
- "Nutrition - # output tokens": {
- "description": "min=1, mean=1, max=1, sum=2 (2)",
- "tab": "General information",
- "score": 1.0
- }
- }
- },
- "generation_config": {
- "additional_details": {
- "subject": "nutrition",
- "method": "multiple_choice_joint",
- "eval_split": "test",
- "groups": "mmlu_nutrition"
- }
- }
- },
- {
- "evaluation_name": "Prehistory",
- "source_data": {
- "dataset_name": "helm_mmlu",
- "source_type": "url",
- "url": [
- "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
- ]
- },
- "metric_config": {
- "evaluation_description": "EM on Prehistory",
- "lower_is_better": false,
- "score_type": "continuous",
- "min_score": 0.0,
- "max_score": 1.0
- },
- "score_details": {
- "score": 0.802,
- "details": {
- "description": "min=0.802, mean=0.802, max=0.802, sum=1.605 (2)",
- "tab": "Accuracy",
- "Prehistory - Observed inference time (s)": {
- "description": "min=0.438, mean=0.438, max=0.438, sum=0.875 (2)",
- "tab": "Efficiency",
- "score": 0.43764398863286147
- },
- "Prehistory - # eval": {
- "description": "min=324, mean=324, max=324, sum=648 (2)",
- "tab": "General information",
- "score": 324.0
- },
- "Prehistory - # train": {
- "description": "min=5, mean=5, max=5, sum=10 (2)",
- "tab": "General information",
- "score": 5.0
- },
- "Prehistory - truncated": {
- "description": "min=0, mean=0, max=0, sum=0 (2)",
- "tab": "General information",
- "score": 0.0
- },
- "Prehistory - # prompt tokens": {
- "description": "min=602.145, mean=602.145, max=602.145, sum=1204.29 (2)",
- "tab": "General information",
- "score": 602.145061728395
- },
- "Prehistory - # output tokens": {
- "description": "min=1, mean=1, max=1, sum=2 (2)",
- "tab": "General information",
- "score": 1.0
- }
- }
- },
- "generation_config": {
- "additional_details": {
- "subject": "prehistory",
- "method": "multiple_choice_joint",
- "eval_split": "test",
- "groups": "mmlu_prehistory"
- }
- }
- },
- {
- "evaluation_name": "Public Relations",
- "source_data": {
- "dataset_name": "helm_mmlu",
- "source_type": "url",
- "url": [
- "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
- ]
- },
- "metric_config": {
- "evaluation_description": "EM on Public Relations",
- "lower_is_better": false,
- "score_type": "continuous",
- "min_score": 0.0,
- "max_score": 1.0
- },
- "score_details": {
- "score": 0.773,
- "details": {
- "description": "min=0.773, mean=0.773, max=0.773, sum=1.545 (2)",
- "tab": "Accuracy",
- "Public Relations - Observed inference time (s)": {
- "description": "min=0.464, mean=0.464, max=0.464, sum=0.929 (2)",
- "tab": "Efficiency",
- "score": 0.464488469470631
- },
- "Public Relations - # eval": {
- "description": "min=110, mean=110, max=110, sum=220 (2)",
- "tab": "General information",
- "score": 110.0
- },
- "Public Relations - # train": {
- "description": "min=5, mean=5, max=5, sum=10 (2)",
- "tab": "General information",
- "score": 5.0
- },
- "Public Relations - truncated": {
- "description": "min=0, mean=0, max=0, sum=0 (2)",
- "tab": "General information",
- "score": 0.0
- },
- "Public Relations - # prompt tokens": {
- "description": "min=462.036, mean=462.036, max=462.036, sum=924.073 (2)",
- "tab": "General information",
- "score": 462.03636363636366
- },
- "Public Relations - # output tokens": {
- "description": "min=1, mean=1, max=1, sum=2 (2)",
- "tab": "General information",
- "score": 1.0
- }
- }
- },
- "generation_config": {
- "additional_details": {
- "subject": "public_relations",
- "method": "multiple_choice_joint",
- "eval_split": "test",
- "groups": "mmlu_public_relations"
- }
- }
- },
- {
- "evaluation_name": "Security Studies",
- "source_data": {
- "dataset_name": "helm_mmlu",
- "source_type": "url",
- "url": [
- "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
- ]
- },
- "metric_config": {
- "evaluation_description": "EM on Security Studies",
- "lower_is_better": false,
- "score_type": "continuous",
- "min_score": 0.0,
- "max_score": 1.0
- },
- "score_details": {
- "score": 0.788,
- "details": {
- "description": "min=0.788, mean=0.788, max=0.788, sum=1.576 (2)",
- "tab": "Accuracy",
- "Security Studies - Observed inference time (s)": {
- "description": "min=0.431, mean=0.431, max=0.431, sum=0.862 (2)",
- "tab": "Efficiency",
- "score": 0.43111481179996414
- },
- "Security Studies - # eval": {
- "description": "min=245, mean=245, max=245, sum=490 (2)",
- "tab": "General information",
- "score": 245.0
- },
- "Security Studies - # train": {
- "description": "min=5, mean=5, max=5, sum=10 (2)",
- "tab": "General information",
- "score": 5.0
- },
- "Security Studies - truncated": {
- "description": "min=0, mean=0, max=0, sum=0 (2)",
- "tab": "General information",
- "score": 0.0
- },
- "Security Studies - # prompt tokens": {
- "description": "min=1315.865, mean=1315.865, max=1315.865, sum=2631.731 (2)",
- "tab": "General information",
- "score": 1315.865306122449
- },
- "Security Studies - # output tokens": {
- "description": "min=1, mean=1, max=1, sum=2 (2)",
- "tab": "General information",
- "score": 1.0
- }
- }
- },
- "generation_config": {
- "additional_details": {
- "subject": "security_studies",
- "method": "multiple_choice_joint",
- "eval_split": "test",
- "groups": "mmlu_security_studies"
- }
- }
- },
- {
- "evaluation_name": "Sociology",
- "source_data": {
- "dataset_name": "helm_mmlu",
- "source_type": "url",
- "url": [
- "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
- ]
- },
- "metric_config": {
- "evaluation_description": "EM on Sociology",
- "lower_is_better": false,
- "score_type": "continuous",
- "min_score": 0.0,
- "max_score": 1.0
- },
- "score_details": {
- "score": 0.871,
- "details": {
- "description": "min=0.871, mean=0.871, max=0.871, sum=1.741 (2)",
- "tab": "Accuracy",
- "Sociology - Observed inference time (s)": {
- "description": "min=0.399, mean=0.399, max=0.399, sum=0.799 (2)",
- "tab": "Efficiency",
- "score": 0.3994969099908326
- },
- "Sociology - # eval": {
- "description": "min=201, mean=201, max=201, sum=402 (2)",
- "tab": "General information",
- "score": 201.0
- },
- "Sociology - # train": {
- "description": "min=5, mean=5, max=5, sum=10 (2)",
- "tab": "General information",
- "score": 5.0
- },
- "Sociology - truncated": {
- "description": "min=0, mean=0, max=0, sum=0 (2)",
- "tab": "General information",
- "score": 0.0
- },
- "Sociology - # prompt tokens": {
- "description": "min=487.95, mean=487.95, max=487.95, sum=975.9 (2)",
- "tab": "General information",
- "score": 487.9502487562189
- },
- "Sociology - # output tokens": {
- "description": "min=1, mean=1, max=1, sum=2 (2)",
- "tab": "General information",
- "score": 1.0
- }
- }
- },
- "generation_config": {
- "additional_details": {
- "subject": "sociology",
- "method": "multiple_choice_joint",
- "eval_split": "test",
- "groups": "mmlu_sociology"
- }
- }
- },
- {
- "evaluation_name": "Virology",
- "source_data": {
- "dataset_name": "helm_mmlu",
- "source_type": "url",
- "url": [
- "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
- ]
- },
- "metric_config": {
- "evaluation_description": "EM on Virology",
- "lower_is_better": false,
- "score_type": "continuous",
- "min_score": 0.0,
- "max_score": 1.0
- },
- "score_details": {
- "score": 0.542,
- "details": {
- "description": "min=0.542, mean=0.542, max=0.542, sum=1.084 (2)",
- "tab": "Accuracy",
- "Virology - Observed inference time (s)": {
- "description": "min=0.538, mean=0.538, max=0.538, sum=1.076 (2)",
- "tab": "Efficiency",
- "score": 0.5377652975450079
- },
- "Virology - # eval": {
- "description": "min=166, mean=166, max=166, sum=332 (2)",
- "tab": "General information",
- "score": 166.0
- },
- "Virology - # train": {
- "description": "min=5, mean=5, max=5, sum=10 (2)",
- "tab": "General information",
- "score": 5.0
- },
- "Virology - truncated": {
truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Virology - # prompt tokens": { - "description": "min=395.349, mean=395.349, max=395.349, sum=790.699 (2)", - "tab": "General information", - "score": 395.34939759036143 - }, - "Virology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" - } - } - }, - { - "evaluation_name": "World Religions", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on World Religions", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.848, - "details": { - "description": "min=0.848, mean=0.848, max=0.848, sum=1.696 (2)", - "tab": "Accuracy", - "World Religions - Observed inference time (s)": { - "description": "min=0.404, mean=0.404, max=0.404, sum=0.809 (2)", - "tab": "Efficiency", - "score": 0.4042932554992319 - }, - "World Religions - # eval": { - "description": "min=171, mean=171, max=171, sum=342 (2)", - "tab": "General information", - "score": 171.0 - }, - "World Religions - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "World Religions - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "World Religions - # prompt tokens": { - "description": "min=308.924, mean=308.924, max=308.924, sum=617.848 (2)", - "tab": "General information", - "score": 308.92397660818716 - }, - "World Religions - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" - } - } - }, - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.54, - "details": { - "tab": "Efficiency" - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_mmlu/mistralai/mixtral-8x22b/24955250-a2e9-475f-a866-30a835579e03.json b/data/helm_mmlu/mistralai/mixtral-8x22b/24955250-a2e9-475f-a866-30a835579e03.json deleted file mode 100644 index c7ab33c35..000000000 --- a/data/helm_mmlu/mistralai/mixtral-8x22b/24955250-a2e9-475f-a866-30a835579e03.json +++ /dev/null @@ -1,3021 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/mistralai_mixtral-8x22b/1770835937.459157", - "retrieved_timestamp": "1770835937.459157", - "source_metadata": { - "source_name": "helm_mmlu", - "source_type": 
"documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mixtral 8x22B", - "id": "mistralai/mixtral-8x22b", - "developer": "mistralai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "MMLU All Subjects", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU All Subjects", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.778, - "details": { - "description": "min=0.463, mean=0.778, max=0.974, sum=88.715 (114)", - "tab": "Accuracy", - "MMLU All Subjects - Observed inference time (s)": { - "description": "min=0.284, mean=0.555, max=4.852, sum=63.286 (114)", - "tab": "Efficiency", - "score": 0.5551394123775506 - }, - "MMLU All Subjects - # eval": { - "description": "min=100, mean=246.351, max=1534, sum=28084 (114)", - "tab": "General information", - "score": 246.35087719298247 - }, - "MMLU All Subjects - # train": { - "description": "min=5, mean=5, max=5, sum=570 (114)", - "tab": "General information", - "score": 5.0 - }, - "MMLU All Subjects - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (114)", - "tab": "General information", - "score": 0.0 - }, - "MMLU All Subjects - # prompt tokens": { - "description": "min=308.924, mean=696.273, max=3089.109, sum=79375.178 (114)", - "tab": "General information", - "score": 696.2734899593811 - }, - "MMLU All Subjects - # output tokens": { - "description": "min=1, mean=1, max=1, sum=114 (114)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - 
"mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] - } - } - }, - { - "evaluation_name": "Abstract Algebra", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Abstract Algebra", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.48, - "details": { - "description": "min=0.48, mean=0.48, max=0.48, sum=0.96 (2)", - "tab": "Accuracy", - "Abstract Algebra - Observed inference time (s)": { - "description": "min=0.313, mean=0.313, max=0.313, sum=0.626 (2)", - "tab": "Efficiency", - "score": 0.31304038524627686 - }, - "Abstract Algebra - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Abstract Algebra - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Abstract Algebra - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Abstract Algebra - # prompt tokens": { - "description": "min=402.44, mean=402.44, max=402.44, sum=804.88 (2)", - "tab": "General information", - "score": 402.44 - }, - "Abstract Algebra - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" - } - } - }, - { - "evaluation_name": "Anatomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Anatomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.741, - "details": { - 
"description": "min=0.741, mean=0.741, max=0.741, sum=1.481 (2)", - "tab": "Accuracy", - "Anatomy - Observed inference time (s)": { - "description": "min=0.331, mean=0.331, max=0.331, sum=0.662 (2)", - "tab": "Efficiency", - "score": 0.3308721118503147 - }, - "Anatomy - # eval": { - "description": "min=135, mean=135, max=135, sum=270 (2)", - "tab": "General information", - "score": 135.0 - }, - "Anatomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Anatomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Anatomy - # prompt tokens": { - "description": "min=407.089, mean=407.089, max=407.089, sum=814.178 (2)", - "tab": "General information", - "score": 407.0888888888889 - }, - "Anatomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" - } - } - }, - { - "evaluation_name": "College Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on College Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.569, - "details": { - "description": "min=0.569, mean=0.569, max=0.569, sum=1.137 (2)", - "tab": "Accuracy", - "College Chemistry - Observed inference time (s)": { - "description": "min=0.358, mean=0.358, max=0.358, sum=0.716 (2)", - "tab": "Efficiency", - "score": 0.35782508373260496 - }, - "College Biology - Observed inference time (s)": { - "description": "min=0.336, mean=0.336, max=0.336, sum=0.671 (2)", - "tab": "Efficiency", - "score": 0.33555712799231213 - }, - "College Computer Science - Observed inference time (s)": { - "description": "min=0.409, mean=0.409, max=0.409, sum=0.819 (2)", - "tab": "Efficiency", - "score": 0.40926079750061034 - }, - "College Mathematics - Observed inference time (s)": { - "description": "min=0.382, mean=0.382, max=0.382, sum=0.765 (2)", - "tab": "Efficiency", - "score": 0.3824312686920166 - }, - "College Medicine - Observed inference time (s)": { - "description": "min=0.336, mean=0.336, max=0.336, sum=0.671 (2)", - "tab": "Efficiency", - "score": 0.33573296993454066 - }, - "College Physics - Observed inference time (s)": { - "description": "min=0.347, mean=0.347, max=0.347, sum=0.694 (2)", - "tab": "Efficiency", - "score": 0.34694373841379206 - }, - "College Chemistry - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Chemistry - # prompt tokens": { - "description": "min=627.71, mean=627.71, max=627.71, sum=1255.42 (2)", - "tab": "General information", - "score": 627.71 - }, - "College Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - 
"score": 1.0 - }, - "College Biology - # eval": { - "description": "min=144, mean=144, max=144, sum=288 (2)", - "tab": "General information", - "score": 144.0 - }, - "College Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # prompt tokens": { - "description": "min=550.799, mean=550.799, max=550.799, sum=1101.597 (2)", - "tab": "General information", - "score": 550.7986111111111 - }, - "College Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer Science - # prompt tokens": { - "description": "min=902.17, mean=902.17, max=902.17, sum=1804.34 (2)", - "tab": "General information", - "score": 902.17 - }, - "College Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Mathematics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # prompt tokens": { - "description": "min=658.31, mean=658.31, max=658.31, sum=1316.62 (2)", - "tab": "General information", - "score": 658.31 - }, - "College Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Medicine - # eval": { - "description": "min=173, mean=173, max=173, sum=346 (2)", - "tab": "General information", - "score": 173.0 - }, - "College Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Medicine - # prompt tokens": { - "description": "min=592.41, mean=592.41, max=592.41, sum=1184.821 (2)", - "tab": "General information", - "score": 592.4104046242775 - }, - "College Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Physics - # eval": { - "description": "min=102, mean=102, max=102, sum=204 (2)", - "tab": "General information", - "score": 102.0 - }, - "College Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Physics - # prompt tokens": { - "description": 
"min=551.029, mean=551.029, max=551.029, sum=1102.059 (2)", - "tab": "General information", - "score": 551.0294117647059 - }, - "College Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" - } - } - }, - { - "evaluation_name": "Computer Security", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Computer Security", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.84, - "details": { - "description": "min=0.84, mean=0.84, max=0.84, sum=1.68 (2)", - "tab": "Accuracy", - "Computer Security - Observed inference time (s)": { - "description": "min=0.344, mean=0.344, max=0.344, sum=0.689 (2)", - "tab": "Efficiency", - "score": 0.3443935012817383 - }, - "Computer Security - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Computer Security - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Computer Security - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Computer Security - # prompt tokens": { - "description": "min=424.94, mean=424.94, max=424.94, sum=849.88 (2)", - "tab": "General information", - "score": 424.94 - }, - "Computer Security - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" - } - } - }, - { - "evaluation_name": "Econometrics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Econometrics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.667, - "details": { - "description": "min=0.667, mean=0.667, max=0.667, sum=1.333 (2)", - "tab": "Accuracy", - "Econometrics - Observed inference time (s)": { - "description": "min=0.359, mean=0.359, max=0.359, sum=0.719 (2)", - "tab": "Efficiency", - "score": 0.359416033092298 - }, - "Econometrics - # eval": { - "description": "min=114, mean=114, max=114, sum=228 (2)", - "tab": "General information", - "score": 114.0 - }, - "Econometrics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Econometrics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Econometrics - # prompt tokens": { - "description": "min=687.175, mean=687.175, max=687.175, sum=1374.351 (2)", - "tab": "General information", - "score": 687.1754385964912 - }, - "Econometrics - # output tokens": { - 
"description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" - } - } - }, - { - "evaluation_name": "Global Facts", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Global Facts", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.56, - "details": { - "description": "min=0.56, mean=0.56, max=0.56, sum=1.12 (2)", - "tab": "Accuracy", - "Global Facts - Observed inference time (s)": { - "description": "min=0.349, mean=0.349, max=0.349, sum=0.699 (2)", - "tab": "Efficiency", - "score": 0.34949236392974853 - }, - "Global Facts - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Global Facts - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Global Facts - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Global Facts - # prompt tokens": { - "description": "min=483.47, mean=483.47, max=483.47, sum=966.94 (2)", - "tab": "General information", - "score": 483.47 - }, - "Global Facts - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" - } - } - }, - { - "evaluation_name": "Jurisprudence", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Jurisprudence", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.852, - "details": { - "description": "min=0.852, mean=0.852, max=0.852, sum=1.704 (2)", - "tab": "Accuracy", - "Jurisprudence - Observed inference time (s)": { - "description": "min=0.308, mean=0.308, max=0.308, sum=0.616 (2)", - "tab": "Efficiency", - "score": 0.30799298153983223 - }, - "Jurisprudence - # eval": { - "description": "min=108, mean=108, max=108, sum=216 (2)", - "tab": "General information", - "score": 108.0 - }, - "Jurisprudence - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Jurisprudence - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Jurisprudence - # prompt tokens": { - "description": "min=451.093, mean=451.093, max=451.093, sum=902.185 (2)", - "tab": "General information", - "score": 451.0925925925926 - }, - "Jurisprudence - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "jurisprudence", - "method": 
"multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" - } - } - }, - { - "evaluation_name": "Philosophy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Philosophy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.842, - "details": { - "description": "min=0.842, mean=0.842, max=0.842, sum=1.685 (2)", - "tab": "Accuracy", - "Philosophy - Observed inference time (s)": { - "description": "min=0.423, mean=0.423, max=0.423, sum=0.846 (2)", - "tab": "Efficiency", - "score": 0.4229524595561135 - }, - "Philosophy - # eval": { - "description": "min=311, mean=311, max=311, sum=622 (2)", - "tab": "General information", - "score": 311.0 - }, - "Philosophy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Philosophy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Philosophy - # prompt tokens": { - "description": "min=373.82, mean=373.82, max=373.82, sum=747.64 (2)", - "tab": "General information", - "score": 373.81993569131834 - }, - "Philosophy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" - } - } - }, - { - "evaluation_name": "Professional Psychology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Professional Psychology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.845, - "details": { - "description": "min=0.845, mean=0.845, max=0.845, sum=1.69 (2)", - "tab": "Accuracy", - "Professional Medicine - Observed inference time (s)": { - "description": "min=0.461, mean=0.461, max=0.461, sum=0.921 (2)", - "tab": "Efficiency", - "score": 0.4606352711425108 - }, - "Professional Accounting - Observed inference time (s)": { - "description": "min=0.36, mean=0.36, max=0.36, sum=0.72 (2)", - "tab": "Efficiency", - "score": 0.3601941665013631 - }, - "Professional Law - Observed inference time (s)": { - "description": "min=0.578, mean=0.578, max=0.578, sum=1.156 (2)", - "tab": "Efficiency", - "score": 0.5780843218115815 - }, - "Professional Psychology - Observed inference time (s)": { - "description": "min=0.359, mean=0.359, max=0.359, sum=0.718 (2)", - "tab": "Efficiency", - "score": 0.3589704905460083 - }, - "Professional Medicine - # eval": { - "description": "min=272, mean=272, max=272, sum=544 (2)", - "tab": "General information", - "score": 272.0 - }, - "Professional Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Medicine - # prompt 
tokens": { - "description": "min=1279.143, mean=1279.143, max=1279.143, sum=2558.287 (2)", - "tab": "General information", - "score": 1279.1433823529412 - }, - "Professional Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Accounting - # eval": { - "description": "min=282, mean=282, max=282, sum=564 (2)", - "tab": "General information", - "score": 282.0 - }, - "Professional Accounting - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Accounting - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Accounting - # prompt tokens": { - "description": "min=796.496, mean=796.496, max=796.496, sum=1592.993 (2)", - "tab": "General information", - "score": 796.4964539007092 - }, - "Professional Accounting - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Law - # eval": { - "description": "min=1534, mean=1534, max=1534, sum=3068 (2)", - "tab": "General information", - "score": 1534.0 - }, - "Professional Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # prompt tokens": { - "description": "min=1849.711, mean=1849.711, max=1849.711, sum=3699.421 (2)", - "tab": "General information", - "score": 1849.7105606258149 - }, - "Professional Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Psychology - # eval": { - "description": "min=612, mean=612, max=612, sum=1224 (2)", - "tab": "General information", - "score": 612.0 - }, - "Professional Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # prompt tokens": { - "description": "min=645.278, mean=645.278, max=645.278, sum=1290.556 (2)", - "tab": "General information", - "score": 645.2777777777778 - }, - "Professional Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" - } - } - }, - { - "evaluation_name": "Us Foreign Policy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Us Foreign Policy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.95, - "details": { - "description": "min=0.95, mean=0.95, max=0.95, sum=1.9 (2)", - "tab": "Accuracy", - "Us Foreign Policy - Observed inference time (s)": { - "description": "min=0.348, mean=0.348, max=0.348, 
sum=0.696 (2)", - "tab": "Efficiency", - "score": 0.3477613878250122 - }, - "Us Foreign Policy - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Us Foreign Policy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Us Foreign Policy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Us Foreign Policy - # prompt tokens": { - "description": "min=473.19, mean=473.19, max=473.19, sum=946.38 (2)", - "tab": "General information", - "score": 473.19 - }, - "Us Foreign Policy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" - } - } - }, - { - "evaluation_name": "Astronomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Astronomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.882, - "details": { - "description": "min=0.882, mean=0.882, max=0.882, sum=1.763 (2)", - "tab": "Accuracy", - "Astronomy - Observed inference time (s)": { - "description": "min=0.347, mean=0.347, max=0.347, sum=0.694 (2)", - "tab": "Efficiency", - "score": 0.34718117117881775 - }, - "Astronomy - # eval": { - "description": "min=152, mean=152, max=152, sum=304 (2)", - "tab": "General information", - "score": 152.0 - }, - "Astronomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Astronomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Astronomy - # prompt tokens": { - "description": "min=665.987, mean=665.987, max=665.987, sum=1331.974 (2)", - "tab": "General information", - "score": 665.9868421052631 - }, - "Astronomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" - } - } - }, - { - "evaluation_name": "Business Ethics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Business Ethics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.74, - "details": { - "description": "min=0.74, mean=0.74, max=0.74, sum=1.48 (2)", - "tab": "Accuracy", - "Business Ethics - Observed inference time (s)": { - "description": "min=0.672, mean=0.672, max=0.672, sum=1.345 (2)", - "tab": "Efficiency", - "score": 0.6724735307693481 - }, - "Business Ethics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 
- },
- "Business Ethics - # train": {
- "description": "min=5, mean=5, max=5, sum=10 (2)",
- "tab": "General information",
- "score": 5.0
- },
- "Business Ethics - truncated": {
- "description": "min=0, mean=0, max=0, sum=0 (2)",
- "tab": "General information",
- "score": 0.0
- },
- "Business Ethics - # prompt tokens": {
- "description": "min=644.6, mean=644.6, max=644.6, sum=1289.2 (2)",
- "tab": "General information",
- "score": 644.6
- },
- "Business Ethics - # output tokens": {
- "description": "min=1, mean=1, max=1, sum=2 (2)",
- "tab": "General information",
- "score": 1.0
- }
- }
- },
- "generation_config": {
- "additional_details": {
- "subject": "business_ethics",
- "method": "multiple_choice_joint",
- "eval_split": "test",
- "groups": "mmlu_business_ethics"
- }
- }
- },
- {
- "evaluation_name": "Clinical Knowledge",
- "source_data": {
- "dataset_name": "helm_mmlu",
- "source_type": "url",
- "url": [
- "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
- ]
- },
- "metric_config": {
- "evaluation_description": "EM on Clinical Knowledge",
- "lower_is_better": false,
- "score_type": "continuous",
- "min_score": 0.0,
- "max_score": 1.0
- },
- "score_details": {
- "score": 0.819,
- "details": {
- "description": "min=0.819, mean=0.819, max=0.819, sum=1.638 (2)",
- "tab": "Accuracy",
- "Clinical Knowledge - Observed inference time (s)": {
- "description": "min=0.476, mean=0.476, max=0.476, sum=0.953 (2)",
- "tab": "Efficiency",
- "score": 0.4764475804454875
- },
- "Clinical Knowledge - # eval": {
- "description": "min=265, mean=265, max=265, sum=530 (2)",
- "tab": "General information",
- "score": 265.0
- },
- "Clinical Knowledge - # train": {
- "description": "min=5, mean=5, max=5, sum=10 (2)",
- "tab": "General information",
- "score": 5.0
- },
- "Clinical Knowledge - truncated": {
- "description": "min=0, mean=0, max=0, sum=0 (2)",
- "tab": "General information",
- "score": 0.0
- },
- "Clinical Knowledge - # prompt tokens": {
- "description": "min=487.174, mean=487.174, max=487.174, sum=974.347 (2)",
- "tab": "General information",
- "score": 487.1735849056604
- },
- "Clinical Knowledge - # output tokens": {
- "description": "min=1, mean=1, max=1, sum=2 (2)",
- "tab": "General information",
- "score": 1.0
- }
- }
- },
- "generation_config": {
- "additional_details": {
- "subject": "clinical_knowledge",
- "method": "multiple_choice_joint",
- "eval_split": "test",
- "groups": "mmlu_clinical_knowledge"
- }
- }
- },
- {
- "evaluation_name": "Conceptual Physics",
- "source_data": {
- "dataset_name": "helm_mmlu",
- "source_type": "url",
- "url": [
- "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
- ]
- },
- "metric_config": {
- "evaluation_description": "EM on Conceptual Physics",
- "lower_is_better": false,
- "score_type": "continuous",
- "min_score": 0.0,
- "max_score": 1.0
- },
- "score_details": {
- "score": 0.796,
- "details": {
- "description": "min=0.796, mean=0.796, max=0.796, sum=1.591 (2)",
- "tab": "Accuracy",
- "Conceptual Physics - Observed inference time (s)": {
- "description": "min=0.327, mean=0.327, max=0.327, sum=0.654 (2)",
- "tab": "Efficiency",
- "score": 0.3271778279162468
- },
- "Conceptual Physics - # eval": {
- "description": "min=235, mean=235, max=235, sum=470 (2)",
- "tab": "General information",
- "score": 235.0
- },
- "Conceptual Physics - # train": {
- "description": "min=5, mean=5, max=5, sum=10 (2)",
- "tab": "General information",
information", - "score": 5.0 - }, - "Conceptual Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Conceptual Physics - # prompt tokens": { - "description": "min=334.285, mean=334.285, max=334.285, sum=668.57 (2)", - "tab": "General information", - "score": 334.2851063829787 - }, - "Conceptual Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" - } - } - }, - { - "evaluation_name": "Electrical Engineering", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Electrical Engineering", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.766, - "details": { - "description": "min=0.766, mean=0.766, max=0.766, sum=1.531 (2)", - "tab": "Accuracy", - "Electrical Engineering - Observed inference time (s)": { - "description": "min=0.579, mean=0.579, max=0.579, sum=1.158 (2)", - "tab": "Efficiency", - "score": 0.5787854655035611 - }, - "Electrical Engineering - # eval": { - "description": "min=145, mean=145, max=145, sum=290 (2)", - "tab": "General information", - "score": 145.0 - }, - "Electrical Engineering - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Electrical Engineering - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Electrical Engineering - # prompt tokens": { - "description": "min=501.379, mean=501.379, max=501.379, sum=1002.759 (2)", - "tab": "General information", - "score": 501.37931034482756 - }, - "Electrical Engineering - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" - } - } - }, - { - "evaluation_name": "Elementary Mathematics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Elementary Mathematics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.622, - "details": { - "description": "min=0.622, mean=0.622, max=0.622, sum=1.243 (2)", - "tab": "Accuracy", - "Elementary Mathematics - Observed inference time (s)": { - "description": "min=4.852, mean=4.852, max=4.852, sum=9.703 (2)", - "tab": "Efficiency", - "score": 4.851643589438584 - }, - "Elementary Mathematics - # eval": { - "description": "min=378, mean=378, max=378, sum=756 (2)", - "tab": "General information", - "score": 378.0 - }, - "Elementary Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 
- },
- "Elementary Mathematics - truncated": {
- "description": "min=0, mean=0, max=0, sum=0 (2)",
- "tab": "General information",
- "score": 0.0
- },
- "Elementary Mathematics - # prompt tokens": {
- "description": "min=613.386, mean=613.386, max=613.386, sum=1226.772 (2)",
- "tab": "General information",
- "score": 613.3862433862433
- },
- "Elementary Mathematics - # output tokens": {
- "description": "min=1, mean=1, max=1, sum=2 (2)",
- "tab": "General information",
- "score": 1.0
- }
- }
- },
- "generation_config": {
- "additional_details": {
- "subject": "elementary_mathematics",
- "method": "multiple_choice_joint",
- "eval_split": "test",
- "groups": "mmlu_elementary_mathematics"
- }
- }
- },
- {
- "evaluation_name": "Formal Logic",
- "source_data": {
- "dataset_name": "helm_mmlu",
- "source_type": "url",
- "url": [
- "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
- ]
- },
- "metric_config": {
- "evaluation_description": "EM on Formal Logic",
- "lower_is_better": false,
- "score_type": "continuous",
- "min_score": 0.0,
- "max_score": 1.0
- },
- "score_details": {
- "score": 0.627,
- "details": {
- "description": "min=0.627, mean=0.627, max=0.627, sum=1.254 (2)",
- "tab": "Accuracy",
- "Formal Logic - Observed inference time (s)": {
- "description": "min=4.348, mean=4.348, max=4.348, sum=8.696 (2)",
- "tab": "Efficiency",
- "score": 4.34797261631678
- },
- "Formal Logic - # eval": {
- "description": "min=126, mean=126, max=126, sum=252 (2)",
- "tab": "General information",
- "score": 126.0
- },
- "Formal Logic - # train": {
- "description": "min=5, mean=5, max=5, sum=10 (2)",
- "tab": "General information",
- "score": 5.0
- },
- "Formal Logic - truncated": {
- "description": "min=0, mean=0, max=0, sum=0 (2)",
- "tab": "General information",
- "score": 0.0
- },
- "Formal Logic - # prompt tokens": {
- "description": "min=718.984, mean=718.984, max=718.984, sum=1437.968 (2)",
- "tab": "General information",
- "score": 718.984126984127
- },
- "Formal Logic - # output tokens": {
- "description": "min=1, mean=1, max=1, sum=2 (2)",
- "tab": "General information",
- "score": 1.0
- }
- }
- },
- "generation_config": {
- "additional_details": {
- "subject": "formal_logic",
- "method": "multiple_choice_joint",
- "eval_split": "test",
- "groups": "mmlu_formal_logic"
- }
- }
- },
- {
- "evaluation_name": "High School World History",
- "source_data": {
- "dataset_name": "helm_mmlu",
- "source_type": "url",
- "url": [
- "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
- ]
- },
- "metric_config": {
- "evaluation_description": "EM on High School World History",
- "lower_is_better": false,
- "score_type": "continuous",
- "min_score": 0.0,
- "max_score": 1.0
- },
- "score_details": {
- "score": 0.895,
- "details": {
- "description": "min=0.895, mean=0.895, max=0.895, sum=1.789 (2)",
- "tab": "Accuracy",
- "High School Biology - Observed inference time (s)": {
- "description": "min=0.306, mean=0.306, max=0.306, sum=0.613 (2)",
- "tab": "Efficiency",
- "score": 0.30645533454033635
- },
- "High School Chemistry - Observed inference time (s)": {
- "description": "min=0.362, mean=0.362, max=0.362, sum=0.724 (2)",
- "tab": "Efficiency",
- "score": 0.3618842803785954
- },
- "High School Computer Science - Observed inference time (s)": {
- "description": "min=0.432, mean=0.432, max=0.432, sum=0.864 (2)",
- "tab": "Efficiency",
- "score": 0.43201621770858767
- },
- "High School European History - Observed inference time (s)": {
- "description": "min=0.874, mean=0.874, max=0.874, sum=1.747 (2)",
- "tab": "Efficiency",
- "score": 0.8736377629366788
- },
- "High School Geography - Observed inference time (s)": {
- "description": "min=0.373, mean=0.373, max=0.373, sum=0.746 (2)",
- "tab": "Efficiency",
- "score": 0.3727773331632518
- },
- "High School Government And Politics - Observed inference time (s)": {
- "description": "min=0.38, mean=0.38, max=0.38, sum=0.76 (2)",
- "tab": "Efficiency",
- "score": 0.380075985903567
- },
- "High School Macroeconomics - Observed inference time (s)": {
- "description": "min=0.313, mean=0.313, max=0.313, sum=0.626 (2)",
- "tab": "Efficiency",
- "score": 0.3130294726445125
- },
- "High School Mathematics - Observed inference time (s)": {
- "description": "min=0.454, mean=0.454, max=0.454, sum=0.909 (2)",
- "tab": "Efficiency",
- "score": 0.4543530375869186
- },
- "High School Microeconomics - Observed inference time (s)": {
- "description": "min=0.475, mean=0.475, max=0.475, sum=0.95 (2)",
- "tab": "Efficiency",
- "score": 0.4752031294237666
- },
- "High School Physics - Observed inference time (s)": {
- "description": "min=0.341, mean=0.341, max=0.341, sum=0.683 (2)",
- "tab": "Efficiency",
- "score": 0.3413255830474247
- },
- "High School Psychology - Observed inference time (s)": {
- "description": "min=0.33, mean=0.33, max=0.33, sum=0.66 (2)",
- "tab": "Efficiency",
- "score": 0.32982436013877936
- },
- "High School Statistics - Observed inference time (s)": {
- "description": "min=0.406, mean=0.406, max=0.406, sum=0.812 (2)",
- "tab": "Efficiency",
- "score": 0.4059625698460473
- },
- "High School US History - Observed inference time (s)": {
- "description": "min=0.744, mean=0.744, max=0.744, sum=1.488 (2)",
- "tab": "Efficiency",
- "score": 0.7440984506233066
- },
- "High School World History - Observed inference time (s)": {
- "description": "min=0.521, mean=0.521, max=0.521, sum=1.043 (2)",
- "tab": "Efficiency",
- "score": 0.5214709360388261
- },
- "High School Biology - # eval": {
- "description": "min=310, mean=310, max=310, sum=620 (2)",
- "tab": "General information",
- "score": 310.0
- },
- "High School Biology - # train": {
- "description": "min=5, mean=5, max=5, sum=10 (2)",
- "tab": "General information",
- "score": 5.0
- },
- "High School Biology - truncated": {
- "description": "min=0, mean=0, max=0, sum=0 (2)",
- "tab": "General information",
- "score": 0.0
- },
- "High School Biology - # prompt tokens": {
- "description": "min=600.561, mean=600.561, max=600.561, sum=1201.123 (2)",
- "tab": "General information",
- "score": 600.5612903225806
- },
- "High School Biology - # output tokens": {
- "description": "min=1, mean=1, max=1, sum=2 (2)",
- "tab": "General information",
- "score": 1.0
- },
- "High School Chemistry - # eval": {
- "description": "min=203, mean=203, max=203, sum=406 (2)",
- "tab": "General information",
- "score": 203.0
- },
- "High School Chemistry - # train": {
- "description": "min=5, mean=5, max=5, sum=10 (2)",
- "tab": "General information",
- "score": 5.0
- },
- "High School Chemistry - truncated": {
- "description": "min=0, mean=0, max=0, sum=0 (2)",
- "tab": "General information",
- "score": 0.0
- },
- "High School Chemistry - # prompt tokens": {
- "description": "min=572.798, mean=572.798, max=572.798, sum=1145.596 (2)",
- "tab": "General information",
- "score": 572.7980295566502
- },
- "High School Chemistry - # output tokens": {
- "description": "min=1, mean=1, max=1, sum=2 (2)",
"tab": "General information", - "score": 1.0 - }, - "High School Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "High School Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # prompt tokens": { - "description": "min=988.24, mean=988.24, max=988.24, sum=1976.48 (2)", - "tab": "General information", - "score": 988.24 - }, - "High School Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School European History - # eval": { - "description": "min=165, mean=165, max=165, sum=330 (2)", - "tab": "General information", - "score": 165.0 - }, - "High School European History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School European History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School European History - # prompt tokens": { - "description": "min=3089.109, mean=3089.109, max=3089.109, sum=6178.218 (2)", - "tab": "General information", - "score": 3089.109090909091 - }, - "High School European History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Geography - # eval": { - "description": "min=198, mean=198, max=198, sum=396 (2)", - "tab": "General information", - "score": 198.0 - }, - "High School Geography - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Geography - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Geography - # prompt tokens": { - "description": "min=429.207, mean=429.207, max=429.207, sum=858.414 (2)", - "tab": "General information", - "score": 429.2070707070707 - }, - "High School Geography - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Government And Politics - # eval": { - "description": "min=193, mean=193, max=193, sum=386 (2)", - "tab": "General information", - "score": 193.0 - }, - "High School Government And Politics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Government And Politics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # prompt tokens": { - "description": "min=514.808, mean=514.808, max=514.808, sum=1029.617 (2)", - "tab": "General information", - "score": 514.8082901554404 - }, - "High School Government And Politics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Macroeconomics - # eval": { - "description": "min=390, mean=390, max=390, sum=780 (2)", - "tab": "General information", - "score": 390.0 - }, - "High School Macroeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 
(2)", - "tab": "General information", - "score": 5.0 - }, - "High School Macroeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - # prompt tokens": { - "description": "min=423.815, mean=423.815, max=423.815, sum=847.631 (2)", - "tab": "General information", - "score": 423.81538461538463 - }, - "High School Macroeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Mathematics - # eval": { - "description": "min=270, mean=270, max=270, sum=540 (2)", - "tab": "General information", - "score": 270.0 - }, - "High School Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # prompt tokens": { - "description": "min=584.13, mean=584.13, max=584.13, sum=1168.259 (2)", - "tab": "General information", - "score": 584.1296296296297 - }, - "High School Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Microeconomics - # eval": { - "description": "min=238, mean=238, max=238, sum=476 (2)", - "tab": "General information", - "score": 238.0 - }, - "High School Microeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Microeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Microeconomics - # prompt tokens": { - "description": "min=443.345, mean=443.345, max=443.345, sum=886.689 (2)", - "tab": "General information", - "score": 443.34453781512605 - }, - "High School Microeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Physics - # eval": { - "description": "min=151, mean=151, max=151, sum=302 (2)", - "tab": "General information", - "score": 151.0 - }, - "High School Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # prompt tokens": { - "description": "min=622.775, mean=622.775, max=622.775, sum=1245.55 (2)", - "tab": "General information", - "score": 622.774834437086 - }, - "High School Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Psychology - # eval": { - "description": "min=545, mean=545, max=545, sum=1090 (2)", - "tab": "General information", - "score": 545.0 - }, - "High School Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # prompt tokens": { - "description": "min=558.873, mean=558.873, max=558.873, sum=1117.747 (2)", - "tab": "General information", - "score": 
558.8733944954129 - }, - "High School Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Statistics - # eval": { - "description": "min=216, mean=216, max=216, sum=432 (2)", - "tab": "General information", - "score": 216.0 - }, - "High School Statistics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Statistics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Statistics - # prompt tokens": { - "description": "min=913.644, mean=913.644, max=913.644, sum=1827.287 (2)", - "tab": "General information", - "score": 913.6435185185185 - }, - "High School Statistics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School US History - # eval": { - "description": "min=204, mean=204, max=204, sum=408 (2)", - "tab": "General information", - "score": 204.0 - }, - "High School US History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School US History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # prompt tokens": { - "description": "min=2477.446, mean=2477.446, max=2477.446, sum=4954.892 (2)", - "tab": "General information", - "score": 2477.4460784313724 - }, - "High School US History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School World History - # eval": { - "description": "min=237, mean=237, max=237, sum=474 (2)", - "tab": "General information", - "score": 237.0 - }, - "High School World History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School World History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School World History - # prompt tokens": { - "description": "min=1585.553, mean=1585.553, max=1585.553, sum=3171.105 (2)", - "tab": "General information", - "score": 1585.5527426160338 - }, - "High School World History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" - } - } - }, - { - "evaluation_name": "Human Sexuality", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Human Sexuality", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.885, - "details": { - "description": "min=0.885, mean=0.885, max=0.885, sum=1.771 (2)", - "tab": "Accuracy", - "Human Aging - Observed inference time (s)": { - "description": "min=0.304, mean=0.304, max=0.304, sum=0.608 (2)", - "tab": "Efficiency", - "score": 0.3039867247166655 - }, - "Human Sexuality - 
Observed inference time (s)": { - "description": "min=0.313, mean=0.313, max=0.313, sum=0.627 (2)", - "tab": "Efficiency", - "score": 0.3133269229918036 - }, - "Human Aging - # eval": { - "description": "min=223, mean=223, max=223, sum=446 (2)", - "tab": "General information", - "score": 223.0 - }, - "Human Aging - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Aging - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Aging - # prompt tokens": { - "description": "min=353.152, mean=353.152, max=353.152, sum=706.305 (2)", - "tab": "General information", - "score": 353.15246636771303 - }, - "Human Aging - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Human Sexuality - # eval": { - "description": "min=131, mean=131, max=131, sum=262 (2)", - "tab": "General information", - "score": 131.0 - }, - "Human Sexuality - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Sexuality - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # prompt tokens": { - "description": "min=394.748, mean=394.748, max=394.748, sum=789.496 (2)", - "tab": "General information", - "score": 394.7480916030534 - }, - "Human Sexuality - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" - } - } - }, - { - "evaluation_name": "International Law", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on International Law", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.917, - "details": { - "description": "min=0.917, mean=0.917, max=0.917, sum=1.835 (2)", - "tab": "Accuracy", - "International Law - Observed inference time (s)": { - "description": "min=0.346, mean=0.346, max=0.346, sum=0.691 (2)", - "tab": "Efficiency", - "score": 0.34560450049471264 - }, - "International Law - # eval": { - "description": "min=121, mean=121, max=121, sum=242 (2)", - "tab": "General information", - "score": 121.0 - }, - "International Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "International Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "International Law - # prompt tokens": { - "description": "min=720.182, mean=720.182, max=720.182, sum=1440.364 (2)", - "tab": "General information", - "score": 720.1818181818181 - }, - "International Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" 
- } - } - }, - { - "evaluation_name": "Logical Fallacies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Logical Fallacies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.877, - "details": { - "description": "min=0.877, mean=0.877, max=0.877, sum=1.755 (2)", - "tab": "Accuracy", - "Logical Fallacies - Observed inference time (s)": { - "description": "min=0.357, mean=0.357, max=0.357, sum=0.713 (2)", - "tab": "Efficiency", - "score": 0.35657415360760836 - }, - "Logical Fallacies - # eval": { - "description": "min=163, mean=163, max=163, sum=326 (2)", - "tab": "General information", - "score": 163.0 - }, - "Logical Fallacies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Logical Fallacies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Logical Fallacies - # prompt tokens": { - "description": "min=486.779, mean=486.779, max=486.779, sum=973.558 (2)", - "tab": "General information", - "score": 486.77914110429447 - }, - "Logical Fallacies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" - } - } - }, - { - "evaluation_name": "Machine Learning", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Machine Learning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.661, - "details": { - "description": "min=0.661, mean=0.661, max=0.661, sum=1.321 (2)", - "tab": "Accuracy", - "Machine Learning - Observed inference time (s)": { - "description": "min=0.375, mean=0.375, max=0.375, sum=0.751 (2)", - "tab": "Efficiency", - "score": 0.37532309123447966 - }, - "Machine Learning - # eval": { - "description": "min=112, mean=112, max=112, sum=224 (2)", - "tab": "General information", - "score": 112.0 - }, - "Machine Learning - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Machine Learning - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Machine Learning - # prompt tokens": { - "description": "min=734.83, mean=734.83, max=734.83, sum=1469.661 (2)", - "tab": "General information", - "score": 734.8303571428571 - }, - "Machine Learning - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" - } - } - }, - { - "evaluation_name": "Management", - "source_data": { - "dataset_name": "helm_mmlu", - 
"source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Management", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.883, - "details": { - "description": "min=0.883, mean=0.883, max=0.883, sum=1.767 (2)", - "tab": "Accuracy", - "Management - Observed inference time (s)": { - "description": "min=0.284, mean=0.284, max=0.284, sum=0.567 (2)", - "tab": "Efficiency", - "score": 0.2837195535307949 - }, - "Management - # eval": { - "description": "min=103, mean=103, max=103, sum=206 (2)", - "tab": "General information", - "score": 103.0 - }, - "Management - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Management - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Management - # prompt tokens": { - "description": "min=315.359, mean=315.359, max=315.359, sum=630.718 (2)", - "tab": "General information", - "score": 315.3592233009709 - }, - "Management - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" - } - } - }, - { - "evaluation_name": "Marketing", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Marketing", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.915, - "details": { - "description": "min=0.915, mean=0.915, max=0.915, sum=1.829 (2)", - "tab": "Accuracy", - "Marketing - Observed inference time (s)": { - "description": "min=0.477, mean=0.477, max=0.477, sum=0.955 (2)", - "tab": "Efficiency", - "score": 0.47738775534507555 - }, - "Marketing - # eval": { - "description": "min=234, mean=234, max=234, sum=468 (2)", - "tab": "General information", - "score": 234.0 - }, - "Marketing - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Marketing - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Marketing - # prompt tokens": { - "description": "min=463.423, mean=463.423, max=463.423, sum=926.846 (2)", - "tab": "General information", - "score": 463.4230769230769 - }, - "Marketing - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" - } - } - }, - { - "evaluation_name": "Medical Genetics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Medical Genetics", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.85, - "details": { - "description": "min=0.85, mean=0.85, max=0.85, sum=1.7 (2)", - "tab": "Accuracy", - "Medical Genetics - Observed inference time (s)": { - "description": "min=0.358, mean=0.358, max=0.358, sum=0.715 (2)", - "tab": "Efficiency", - "score": 0.35768274068832395 - }, - "Medical Genetics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Medical Genetics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Medical Genetics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Medical Genetics - # prompt tokens": { - "description": "min=405.71, mean=405.71, max=405.71, sum=811.42 (2)", - "tab": "General information", - "score": 405.71 - }, - "Medical Genetics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" - } - } - }, - { - "evaluation_name": "Miscellaneous", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Miscellaneous", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.899, - "details": { - "description": "min=0.899, mean=0.899, max=0.899, sum=1.798 (2)", - "tab": "Accuracy", - "Miscellaneous - Observed inference time (s)": { - "description": "min=0.3, mean=0.3, max=0.3, sum=0.599 (2)", - "tab": "Efficiency", - "score": 0.29965735912931984 - }, - "Miscellaneous - # eval": { - "description": "min=783, mean=783, max=783, sum=1566 (2)", - "tab": "General information", - "score": 783.0 - }, - "Miscellaneous - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Miscellaneous - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Miscellaneous - # prompt tokens": { - "description": "min=348.519, mean=348.519, max=348.519, sum=697.037 (2)", - "tab": "General information", - "score": 348.51851851851853 - }, - "Miscellaneous - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" - } - } - }, - { - "evaluation_name": "Moral Scenarios", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Moral Scenarios", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.646, - "details": { - "description": 
"min=0.646, mean=0.646, max=0.646, sum=1.292 (2)", - "tab": "Accuracy", - "Moral Disputes - Observed inference time (s)": { - "description": "min=0.435, mean=0.435, max=0.435, sum=0.87 (2)", - "tab": "Efficiency", - "score": 0.43506465757513324 - }, - "Moral Scenarios - Observed inference time (s)": { - "description": "min=0.365, mean=0.365, max=0.365, sum=0.729 (2)", - "tab": "Efficiency", - "score": 0.36451081030861626 - }, - "Moral Disputes - # eval": { - "description": "min=346, mean=346, max=346, sum=692 (2)", - "tab": "General information", - "score": 346.0 - }, - "Moral Disputes - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Disputes - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Disputes - # prompt tokens": { - "description": "min=540.038, mean=540.038, max=540.038, sum=1080.075 (2)", - "tab": "General information", - "score": 540.0375722543353 - }, - "Moral Disputes - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Moral Scenarios - # eval": { - "description": "min=895, mean=895, max=895, sum=1790 (2)", - "tab": "General information", - "score": 895.0 - }, - "Moral Scenarios - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Scenarios - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # prompt tokens": { - "description": "min=745.516, mean=745.516, max=745.516, sum=1491.032 (2)", - "tab": "General information", - "score": 745.5162011173185 - }, - "Moral Scenarios - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" - } - } - }, - { - "evaluation_name": "Nutrition", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Nutrition", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.866, - "details": { - "description": "min=0.866, mean=0.866, max=0.866, sum=1.732 (2)", - "tab": "Accuracy", - "Nutrition - Observed inference time (s)": { - "description": "min=0.458, mean=0.458, max=0.458, sum=0.916 (2)", - "tab": "Efficiency", - "score": 0.4579993447447135 - }, - "Nutrition - # eval": { - "description": "min=306, mean=306, max=306, sum=612 (2)", - "tab": "General information", - "score": 306.0 - }, - "Nutrition - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Nutrition - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Nutrition - # prompt tokens": { - "description": "min=680.69, mean=680.69, max=680.69, sum=1361.379 (2)", - "tab": "General information", - "score": 680.6895424836601 - }, - "Nutrition - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General 
information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" - } - } - }, - { - "evaluation_name": "Prehistory", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Prehistory", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.87, - "details": { - "description": "min=0.87, mean=0.87, max=0.87, sum=1.741 (2)", - "tab": "Accuracy", - "Prehistory - Observed inference time (s)": { - "description": "min=0.45, mean=0.45, max=0.45, sum=0.901 (2)", - "tab": "Efficiency", - "score": 0.4504210890075307 - }, - "Prehistory - # eval": { - "description": "min=324, mean=324, max=324, sum=648 (2)", - "tab": "General information", - "score": 324.0 - }, - "Prehistory - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Prehistory - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Prehistory - # prompt tokens": { - "description": "min=602.145, mean=602.145, max=602.145, sum=1204.29 (2)", - "tab": "General information", - "score": 602.145061728395 - }, - "Prehistory - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" - } - } - }, - { - "evaluation_name": "Public Relations", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Public Relations", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.755, - "details": { - "description": "min=0.755, mean=0.755, max=0.755, sum=1.509 (2)", - "tab": "Accuracy", - "Public Relations - Observed inference time (s)": { - "description": "min=0.651, mean=0.651, max=0.651, sum=1.302 (2)", - "tab": "Efficiency", - "score": 0.6507512135939164 - }, - "Public Relations - # eval": { - "description": "min=110, mean=110, max=110, sum=220 (2)", - "tab": "General information", - "score": 110.0 - }, - "Public Relations - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Public Relations - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Public Relations - # prompt tokens": { - "description": "min=462.036, mean=462.036, max=462.036, sum=924.073 (2)", - "tab": "General information", - "score": 462.03636363636366 - }, - "Public Relations - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": 
"mmlu_public_relations" - } - } - }, - { - "evaluation_name": "Security Studies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Security Studies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.865, - "details": { - "description": "min=0.865, mean=0.865, max=0.865, sum=1.731 (2)", - "tab": "Accuracy", - "Security Studies - Observed inference time (s)": { - "description": "min=0.46, mean=0.46, max=0.46, sum=0.919 (2)", - "tab": "Efficiency", - "score": 0.4596467952339017 - }, - "Security Studies - # eval": { - "description": "min=245, mean=245, max=245, sum=490 (2)", - "tab": "General information", - "score": 245.0 - }, - "Security Studies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Security Studies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Security Studies - # prompt tokens": { - "description": "min=1315.865, mean=1315.865, max=1315.865, sum=2631.731 (2)", - "tab": "General information", - "score": 1315.865306122449 - }, - "Security Studies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" - } - } - }, - { - "evaluation_name": "Sociology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Sociology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.92, - "details": { - "description": "min=0.92, mean=0.92, max=0.92, sum=1.841 (2)", - "tab": "Accuracy", - "Sociology - Observed inference time (s)": { - "description": "min=0.495, mean=0.495, max=0.495, sum=0.989 (2)", - "tab": "Efficiency", - "score": 0.4945164248717958 - }, - "Sociology - # eval": { - "description": "min=201, mean=201, max=201, sum=402 (2)", - "tab": "General information", - "score": 201.0 - }, - "Sociology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Sociology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Sociology - # prompt tokens": { - "description": "min=487.95, mean=487.95, max=487.95, sum=975.9 (2)", - "tab": "General information", - "score": 487.9502487562189 - }, - "Sociology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" - } - } - }, - { - "evaluation_name": "Virology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Virology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.596, - "details": { - "description": "min=0.596, mean=0.596, max=0.596, sum=1.193 (2)", - "tab": "Accuracy", - "Virology - Observed inference time (s)": { - "description": "min=0.304, mean=0.304, max=0.304, sum=0.608 (2)", - "tab": "Efficiency", - "score": 0.3041278597820236 - }, - "Virology - # eval": { - "description": "min=166, mean=166, max=166, sum=332 (2)", - "tab": "General information", - "score": 166.0 - }, - "Virology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Virology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Virology - # prompt tokens": { - "description": "min=395.349, mean=395.349, max=395.349, sum=790.699 (2)", - "tab": "General information", - "score": 395.34939759036143 - }, - "Virology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" - } - } - }, - { - "evaluation_name": "World Religions", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on World Religions", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.901, - "details": { - "description": "min=0.901, mean=0.901, max=0.901, sum=1.801 (2)", - "tab": "Accuracy", - "World Religions - Observed inference time (s)": { - "description": "min=0.297, mean=0.297, max=0.297, sum=0.595 (2)", - "tab": "Efficiency", - "score": 0.29729281252587747 - }, - "World Religions - # eval": { - "description": "min=171, mean=171, max=171, sum=342 (2)", - "tab": "General information", - "score": 171.0 - }, - "World Religions - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "World Religions - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "World Religions - # prompt tokens": { - "description": "min=308.924, mean=308.924, max=308.924, sum=617.848 (2)", - "tab": "General information", - "score": 308.92397660818716 - }, - "World Religions - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" - } - } - }, - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model 
outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.598, - "details": { - "tab": "Efficiency" - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -}
\ No newline at end of file
diff --git a/data/helm_mmlu/mistralai/mixtral-8x7b-32kseqlen/de6f7e19-b54a-4bd3-b624-29f66afbee15.json b/data/helm_mmlu/mistralai/mixtral-8x7b-32kseqlen/de6f7e19-b54a-4bd3-b624-29f66afbee15.json
deleted file mode 100644
index 3ed7c6104..000000000
--- a/data/helm_mmlu/mistralai/mixtral-8x7b-32kseqlen/de6f7e19-b54a-4bd3-b624-29f66afbee15.json
+++ /dev/null
@@ -1,3021 +0,0 @@
-{ - "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/mistralai_mixtral-8x7b-32kseqlen/1770835937.459157", - "retrieved_timestamp": "1770835937.459157", - "source_metadata": { - "source_name": "helm_mmlu", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mixtral 8x7B 32K seqlen", - "id": "mistralai/mixtral-8x7b-32kseqlen", - "developer": "mistralai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "MMLU All Subjects", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU All Subjects", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.717, - "details": { - "description": "min=0.38, mean=0.717, max=0.933, sum=81.767 (114)", - "tab": "Accuracy", - "MMLU All Subjects - Observed inference time (s)": { - "description": "min=0.303, mean=0.364, max=0.667, sum=41.491 (114)", - "tab": "Efficiency", - "score": 0.36396022974729103 - }, - "MMLU All Subjects - # eval": { - "description": "min=100, mean=246.351, max=1534, sum=28084 (114)", - "tab": "General information", - "score": 246.35087719298247 - }, - "MMLU All Subjects - # train": { - "description": "min=5, mean=5, max=5, sum=570 (114)", - "tab": "General information", - "score": 5.0 - }, - "MMLU All Subjects - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (114)", - "tab": "General information", - "score": 0.0 - }, - "MMLU All Subjects - # prompt tokens": { - "description": "min=308.924, mean=696.273, max=3089.109, sum=79375.178 (114)", - "tab": "General information", - "score": 696.2734899593811 - }, - "MMLU All Subjects - # output tokens": { - "description": "min=1, mean=1, max=1, sum=114 (114)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", -
"high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] - } - } - }, - { - "evaluation_name": "Abstract Algebra", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Abstract Algebra", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.38, - "details": { - "description": "min=0.38, mean=0.38, max=0.38, sum=0.76 (2)", - "tab": "Accuracy", - "Abstract Algebra - Observed inference time (s)": { - "description": "min=0.332, mean=0.332, max=0.332, sum=0.665 (2)", - "tab": "Efficiency", - "score": 0.3324201321601868 - }, - "Abstract Algebra - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Abstract Algebra - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Abstract Algebra - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Abstract Algebra - # 
prompt tokens": { - "description": "min=402.44, mean=402.44, max=402.44, sum=804.88 (2)", - "tab": "General information", - "score": 402.44 - }, - "Abstract Algebra - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" - } - } - }, - { - "evaluation_name": "Anatomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Anatomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.696, - "details": { - "description": "min=0.696, mean=0.696, max=0.696, sum=1.393 (2)", - "tab": "Accuracy", - "Anatomy - Observed inference time (s)": { - "description": "min=0.338, mean=0.338, max=0.338, sum=0.676 (2)", - "tab": "Efficiency", - "score": 0.33777406480577254 - }, - "Anatomy - # eval": { - "description": "min=135, mean=135, max=135, sum=270 (2)", - "tab": "General information", - "score": 135.0 - }, - "Anatomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Anatomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Anatomy - # prompt tokens": { - "description": "min=407.089, mean=407.089, max=407.089, sum=814.178 (2)", - "tab": "General information", - "score": 407.0888888888889 - }, - "Anatomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" - } - } - }, - { - "evaluation_name": "College Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on College Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.51, - "details": { - "description": "min=0.51, mean=0.51, max=0.51, sum=1.02 (2)", - "tab": "Accuracy", - "College Chemistry - Observed inference time (s)": { - "description": "min=0.386, mean=0.386, max=0.386, sum=0.773 (2)", - "tab": "Efficiency", - "score": 0.386492395401001 - }, - "College Biology - Observed inference time (s)": { - "description": "min=0.366, mean=0.366, max=0.366, sum=0.733 (2)", - "tab": "Efficiency", - "score": 0.3663763701915741 - }, - "College Computer Science - Observed inference time (s)": { - "description": "min=0.367, mean=0.367, max=0.367, sum=0.735 (2)", - "tab": "Efficiency", - "score": 0.36740577936172486 - }, - "College Mathematics - Observed inference time (s)": { - "description": "min=0.356, mean=0.356, max=0.356, sum=0.712 (2)", - "tab": "Efficiency", - "score": 0.35591145277023317 - }, - "College Medicine - Observed inference time (s)": { - "description": "min=0.347, mean=0.347, max=0.347, sum=0.695 (2)", - "tab": 
"Efficiency", - "score": 0.347429724787012 - }, - "College Physics - Observed inference time (s)": { - "description": "min=0.361, mean=0.361, max=0.361, sum=0.721 (2)", - "tab": "Efficiency", - "score": 0.3606654686086318 - }, - "College Chemistry - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Chemistry - # prompt tokens": { - "description": "min=627.71, mean=627.71, max=627.71, sum=1255.42 (2)", - "tab": "General information", - "score": 627.71 - }, - "College Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Biology - # eval": { - "description": "min=144, mean=144, max=144, sum=288 (2)", - "tab": "General information", - "score": 144.0 - }, - "College Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # prompt tokens": { - "description": "min=550.799, mean=550.799, max=550.799, sum=1101.597 (2)", - "tab": "General information", - "score": 550.7986111111111 - }, - "College Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer Science - # prompt tokens": { - "description": "min=902.17, mean=902.17, max=902.17, sum=1804.34 (2)", - "tab": "General information", - "score": 902.17 - }, - "College Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Mathematics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # prompt tokens": { - "description": "min=658.31, mean=658.31, max=658.31, sum=1316.62 (2)", - "tab": "General information", - "score": 658.31 - }, - "College Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Medicine - # eval": { - "description": "min=173, mean=173, max=173, sum=346 (2)", - "tab": "General information", - "score": 173.0 - }, - "College Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - 
"College Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Medicine - # prompt tokens": { - "description": "min=592.41, mean=592.41, max=592.41, sum=1184.821 (2)", - "tab": "General information", - "score": 592.4104046242775 - }, - "College Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Physics - # eval": { - "description": "min=102, mean=102, max=102, sum=204 (2)", - "tab": "General information", - "score": 102.0 - }, - "College Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Physics - # prompt tokens": { - "description": "min=551.029, mean=551.029, max=551.029, sum=1102.059 (2)", - "tab": "General information", - "score": 551.0294117647059 - }, - "College Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" - } - } - }, - { - "evaluation_name": "Computer Security", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Computer Security", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.81, - "details": { - "description": "min=0.81, mean=0.81, max=0.81, sum=1.62 (2)", - "tab": "Accuracy", - "Computer Security - Observed inference time (s)": { - "description": "min=0.342, mean=0.342, max=0.342, sum=0.684 (2)", - "tab": "Efficiency", - "score": 0.34211899518966676 - }, - "Computer Security - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Computer Security - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Computer Security - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Computer Security - # prompt tokens": { - "description": "min=424.94, mean=424.94, max=424.94, sum=849.88 (2)", - "tab": "General information", - "score": 424.94 - }, - "Computer Security - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" - } - } - }, - { - "evaluation_name": "Econometrics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Econometrics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.605, - "details": { - "description": "min=0.605, mean=0.605, max=0.605, sum=1.211 (2)", - "tab": "Accuracy", - "Econometrics - Observed inference time (s)": { - "description": "min=0.354, mean=0.354, max=0.354, sum=0.708 (2)", - "tab": "Efficiency", - "score": 0.3541024630529839 - }, - "Econometrics - # eval": { - "description": "min=114, mean=114, max=114, sum=228 (2)", - "tab": "General information", - "score": 114.0 - }, - "Econometrics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Econometrics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Econometrics - # prompt tokens": { - "description": "min=687.175, mean=687.175, max=687.175, sum=1374.351 (2)", - "tab": "General information", - "score": 687.1754385964912 - }, - "Econometrics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" - } - } - }, - { - "evaluation_name": "Global Facts", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Global Facts", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.46, - "details": { - "description": "min=0.46, mean=0.46, max=0.46, sum=0.92 (2)", - "tab": "Accuracy", - "Global Facts - Observed inference time (s)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=1.335 (2)", - "tab": "Efficiency", - "score": 0.667280240058899 - }, - "Global Facts - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Global Facts - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Global Facts - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Global Facts - # prompt tokens": { - "description": "min=483.47, mean=483.47, max=483.47, sum=966.94 (2)", - "tab": "General information", - "score": 483.47 - }, - "Global Facts - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" - } - } - }, - { - "evaluation_name": "Jurisprudence", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Jurisprudence", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.833, - "details": { - "description": "min=0.833, mean=0.833, max=0.833, sum=1.667 (2)", - "tab": "Accuracy", - "Jurisprudence - Observed inference time (s)": { - "description": "min=0.338, 
mean=0.338, max=0.338, sum=0.677 (2)", - "tab": "Efficiency", - "score": 0.3384844925668504 - }, - "Jurisprudence - # eval": { - "description": "min=108, mean=108, max=108, sum=216 (2)", - "tab": "General information", - "score": 108.0 - }, - "Jurisprudence - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Jurisprudence - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Jurisprudence - # prompt tokens": { - "description": "min=451.093, mean=451.093, max=451.093, sum=902.185 (2)", - "tab": "General information", - "score": 451.0925925925926 - }, - "Jurisprudence - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" - } - } - }, - { - "evaluation_name": "Philosophy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Philosophy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.797, - "details": { - "description": "min=0.797, mean=0.797, max=0.797, sum=1.595 (2)", - "tab": "Accuracy", - "Philosophy - Observed inference time (s)": { - "description": "min=0.323, mean=0.323, max=0.323, sum=0.645 (2)", - "tab": "Efficiency", - "score": 0.322712682067773 - }, - "Philosophy - # eval": { - "description": "min=311, mean=311, max=311, sum=622 (2)", - "tab": "General information", - "score": 311.0 - }, - "Philosophy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Philosophy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Philosophy - # prompt tokens": { - "description": "min=373.82, mean=373.82, max=373.82, sum=747.64 (2)", - "tab": "General information", - "score": 373.81993569131834 - }, - "Philosophy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" - } - } - }, - { - "evaluation_name": "Professional Psychology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Professional Psychology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.779, - "details": { - "description": "min=0.779, mean=0.779, max=0.779, sum=1.559 (2)", - "tab": "Accuracy", - "Professional Medicine - Observed inference time (s)": { - "description": "min=0.416, mean=0.416, max=0.416, sum=0.832 (2)", - "tab": "Efficiency", - "score": 0.41612808669314666 - }, - "Professional Accounting - Observed inference time (s)": { - "description": "min=0.346, 
mean=0.346, max=0.346, sum=0.691 (2)", - "tab": "Efficiency", - "score": 0.34556762884694636 - }, - "Professional Law - Observed inference time (s)": { - "description": "min=0.44, mean=0.44, max=0.44, sum=0.879 (2)", - "tab": "Efficiency", - "score": 0.4395133182309286 - }, - "Professional Psychology - Observed inference time (s)": { - "description": "min=0.328, mean=0.328, max=0.328, sum=0.655 (2)", - "tab": "Efficiency", - "score": 0.3276863078665889 - }, - "Professional Medicine - # eval": { - "description": "min=272, mean=272, max=272, sum=544 (2)", - "tab": "General information", - "score": 272.0 - }, - "Professional Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Medicine - # prompt tokens": { - "description": "min=1279.143, mean=1279.143, max=1279.143, sum=2558.287 (2)", - "tab": "General information", - "score": 1279.1433823529412 - }, - "Professional Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Accounting - # eval": { - "description": "min=282, mean=282, max=282, sum=564 (2)", - "tab": "General information", - "score": 282.0 - }, - "Professional Accounting - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Accounting - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Accounting - # prompt tokens": { - "description": "min=796.496, mean=796.496, max=796.496, sum=1592.993 (2)", - "tab": "General information", - "score": 796.4964539007092 - }, - "Professional Accounting - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Law - # eval": { - "description": "min=1534, mean=1534, max=1534, sum=3068 (2)", - "tab": "General information", - "score": 1534.0 - }, - "Professional Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # prompt tokens": { - "description": "min=1849.711, mean=1849.711, max=1849.711, sum=3699.421 (2)", - "tab": "General information", - "score": 1849.7105606258149 - }, - "Professional Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Psychology - # eval": { - "description": "min=612, mean=612, max=612, sum=1224 (2)", - "tab": "General information", - "score": 612.0 - }, - "Professional Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # prompt tokens": { - "description": "min=645.278, mean=645.278, max=645.278, sum=1290.556 (2)", - "tab": "General information", - "score": 645.2777777777778 - }, - "Professional Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - 
"tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" - } - } - }, - { - "evaluation_name": "Us Foreign Policy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Us Foreign Policy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.93, - "details": { - "description": "min=0.93, mean=0.93, max=0.93, sum=1.86 (2)", - "tab": "Accuracy", - "Us Foreign Policy - Observed inference time (s)": { - "description": "min=0.318, mean=0.318, max=0.318, sum=0.637 (2)", - "tab": "Efficiency", - "score": 0.3183705282211304 - }, - "Us Foreign Policy - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Us Foreign Policy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Us Foreign Policy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Us Foreign Policy - # prompt tokens": { - "description": "min=473.19, mean=473.19, max=473.19, sum=946.38 (2)", - "tab": "General information", - "score": 473.19 - }, - "Us Foreign Policy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" - } - } - }, - { - "evaluation_name": "Astronomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Astronomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.829, - "details": { - "description": "min=0.829, mean=0.829, max=0.829, sum=1.658 (2)", - "tab": "Accuracy", - "Astronomy - Observed inference time (s)": { - "description": "min=0.365, mean=0.365, max=0.365, sum=0.73 (2)", - "tab": "Efficiency", - "score": 0.36493434560926336 - }, - "Astronomy - # eval": { - "description": "min=152, mean=152, max=152, sum=304 (2)", - "tab": "General information", - "score": 152.0 - }, - "Astronomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Astronomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Astronomy - # prompt tokens": { - "description": "min=665.987, mean=665.987, max=665.987, sum=1331.974 (2)", - "tab": "General information", - "score": 665.9868421052631 - }, - "Astronomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "astronomy", - "method": "multiple_choice_joint", - 
"eval_split": "test", - "groups": "mmlu_astronomy" - } - } - }, - { - "evaluation_name": "Business Ethics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Business Ethics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.72, - "details": { - "description": "min=0.72, mean=0.72, max=0.72, sum=1.44 (2)", - "tab": "Accuracy", - "Business Ethics - Observed inference time (s)": { - "description": "min=0.365, mean=0.365, max=0.365, sum=0.73 (2)", - "tab": "Efficiency", - "score": 0.3650094985961914 - }, - "Business Ethics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Business Ethics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Business Ethics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Business Ethics - # prompt tokens": { - "description": "min=644.6, mean=644.6, max=644.6, sum=1289.2 (2)", - "tab": "General information", - "score": 644.6 - }, - "Business Ethics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" - } - } - }, - { - "evaluation_name": "Clinical Knowledge", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Clinical Knowledge", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.785, - "details": { - "description": "min=0.785, mean=0.785, max=0.785, sum=1.57 (2)", - "tab": "Accuracy", - "Clinical Knowledge - Observed inference time (s)": { - "description": "min=0.335, mean=0.335, max=0.335, sum=0.671 (2)", - "tab": "Efficiency", - "score": 0.33542148392155485 - }, - "Clinical Knowledge - # eval": { - "description": "min=265, mean=265, max=265, sum=530 (2)", - "tab": "General information", - "score": 265.0 - }, - "Clinical Knowledge - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Clinical Knowledge - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Clinical Knowledge - # prompt tokens": { - "description": "min=487.174, mean=487.174, max=487.174, sum=974.347 (2)", - "tab": "General information", - "score": 487.1735849056604 - }, - "Clinical Knowledge - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" - } - } - }, - { - "evaluation_name": "Conceptual Physics", - "source_data": { - 
"dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Conceptual Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.681, - "details": { - "description": "min=0.681, mean=0.681, max=0.681, sum=1.362 (2)", - "tab": "Accuracy", - "Conceptual Physics - Observed inference time (s)": { - "description": "min=0.339, mean=0.339, max=0.339, sum=0.679 (2)", - "tab": "Efficiency", - "score": 0.3393338994776949 - }, - "Conceptual Physics - # eval": { - "description": "min=235, mean=235, max=235, sum=470 (2)", - "tab": "General information", - "score": 235.0 - }, - "Conceptual Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Conceptual Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Conceptual Physics - # prompt tokens": { - "description": "min=334.285, mean=334.285, max=334.285, sum=668.57 (2)", - "tab": "General information", - "score": 334.2851063829787 - }, - "Conceptual Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" - } - } - }, - { - "evaluation_name": "Electrical Engineering", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Electrical Engineering", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.676, - "details": { - "description": "min=0.676, mean=0.676, max=0.676, sum=1.352 (2)", - "tab": "Accuracy", - "Electrical Engineering - Observed inference time (s)": { - "description": "min=0.351, mean=0.351, max=0.351, sum=0.702 (2)", - "tab": "Efficiency", - "score": 0.35104844159093396 - }, - "Electrical Engineering - # eval": { - "description": "min=145, mean=145, max=145, sum=290 (2)", - "tab": "General information", - "score": 145.0 - }, - "Electrical Engineering - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Electrical Engineering - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Electrical Engineering - # prompt tokens": { - "description": "min=501.379, mean=501.379, max=501.379, sum=1002.759 (2)", - "tab": "General information", - "score": 501.37931034482756 - }, - "Electrical Engineering - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" - } - } - }, - { - "evaluation_name": "Elementary Mathematics", - "source_data": { - "dataset_name": "helm_mmlu", - 
"source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Elementary Mathematics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.476, - "details": { - "description": "min=0.476, mean=0.476, max=0.476, sum=0.952 (2)", - "tab": "Accuracy", - "Elementary Mathematics - Observed inference time (s)": { - "description": "min=0.43, mean=0.43, max=0.43, sum=0.86 (2)", - "tab": "Efficiency", - "score": 0.4298846198137475 - }, - "Elementary Mathematics - # eval": { - "description": "min=378, mean=378, max=378, sum=756 (2)", - "tab": "General information", - "score": 378.0 - }, - "Elementary Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Elementary Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Elementary Mathematics - # prompt tokens": { - "description": "min=613.386, mean=613.386, max=613.386, sum=1226.772 (2)", - "tab": "General information", - "score": 613.3862433862433 - }, - "Elementary Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" - } - } - }, - { - "evaluation_name": "Formal Logic", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Formal Logic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.532, - "details": { - "description": "min=0.532, mean=0.532, max=0.532, sum=1.063 (2)", - "tab": "Accuracy", - "Formal Logic - Observed inference time (s)": { - "description": "min=0.37, mean=0.37, max=0.37, sum=0.741 (2)", - "tab": "Efficiency", - "score": 0.37032828255305217 - }, - "Formal Logic - # eval": { - "description": "min=126, mean=126, max=126, sum=252 (2)", - "tab": "General information", - "score": 126.0 - }, - "Formal Logic - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Formal Logic - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Formal Logic - # prompt tokens": { - "description": "min=718.984, mean=718.984, max=718.984, sum=1437.968 (2)", - "tab": "General information", - "score": 718.984126984127 - }, - "Formal Logic - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" - } - } - }, - { - "evaluation_name": "High School World History", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on High School World History", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.886, - "details": { - "description": "min=0.886, mean=0.886, max=0.886, sum=1.772 (2)", - "tab": "Accuracy", - "High School Biology - Observed inference time (s)": { - "description": "min=0.328, mean=0.328, max=0.328, sum=0.657 (2)", - "tab": "Efficiency", - "score": 0.3284358686016452 - }, - "High School Chemistry - Observed inference time (s)": { - "description": "min=0.326, mean=0.326, max=0.326, sum=0.652 (2)", - "tab": "Efficiency", - "score": 0.32620196624342446 - }, - "High School Computer Science - Observed inference time (s)": { - "description": "min=0.485, mean=0.485, max=0.485, sum=0.969 (2)", - "tab": "Efficiency", - "score": 0.48452038288116456 - }, - "High School European History - Observed inference time (s)": { - "description": "min=0.641, mean=0.641, max=0.641, sum=1.283 (2)", - "tab": "Efficiency", - "score": 0.6413424491882325 - }, - "High School Geography - Observed inference time (s)": { - "description": "min=0.327, mean=0.327, max=0.327, sum=0.653 (2)", - "tab": "Efficiency", - "score": 0.3266212759595929 - }, - "High School Government And Politics - Observed inference time (s)": { - "description": "min=0.337, mean=0.337, max=0.337, sum=0.675 (2)", - "tab": "Efficiency", - "score": 0.33742881191826857 - }, - "High School Macroeconomics - Observed inference time (s)": { - "description": "min=0.327, mean=0.327, max=0.327, sum=0.654 (2)", - "tab": "Efficiency", - "score": 0.3271804552811843 - }, - "High School Mathematics - Observed inference time (s)": { - "description": "min=0.328, mean=0.328, max=0.328, sum=0.655 (2)", - "tab": "Efficiency", - "score": 0.3277335458331638 - }, - "High School Microeconomics - Observed inference time (s)": { - "description": "min=0.329, mean=0.329, max=0.329, sum=0.658 (2)", - "tab": "Efficiency", - "score": 0.3291829443779312 - }, - "High School Physics - Observed inference time (s)": { - "description": "min=0.337, mean=0.337, max=0.337, sum=0.674 (2)", - "tab": "Efficiency", - "score": 0.33715188266425733 - }, - "High School Psychology - Observed inference time (s)": { - "description": "min=0.396, mean=0.396, max=0.396, sum=0.792 (2)", - "tab": "Efficiency", - "score": 0.39586829351722647 - }, - "High School Statistics - Observed inference time (s)": { - "description": "min=0.376, mean=0.376, max=0.376, sum=0.753 (2)", - "tab": "Efficiency", - "score": 0.37643481846208926 - }, - "High School US History - Observed inference time (s)": { - "description": "min=0.531, mean=0.531, max=0.531, sum=1.062 (2)", - "tab": "Efficiency", - "score": 0.531247288573022 - }, - "High School World History - Observed inference time (s)": { - "description": "min=0.44, mean=0.44, max=0.44, sum=0.88 (2)", - "tab": "Efficiency", - "score": 0.44013202341297003 - }, - "High School Biology - # eval": { - "description": "min=310, mean=310, max=310, sum=620 (2)", - "tab": "General information", - "score": 310.0 - }, - "High School Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Biology - # 
prompt tokens": { - "description": "min=600.561, mean=600.561, max=600.561, sum=1201.123 (2)", - "tab": "General information", - "score": 600.5612903225806 - }, - "High School Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Chemistry - # eval": { - "description": "min=203, mean=203, max=203, sum=406 (2)", - "tab": "General information", - "score": 203.0 - }, - "High School Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # prompt tokens": { - "description": "min=572.798, mean=572.798, max=572.798, sum=1145.596 (2)", - "tab": "General information", - "score": 572.7980295566502 - }, - "High School Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "High School Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # prompt tokens": { - "description": "min=988.24, mean=988.24, max=988.24, sum=1976.48 (2)", - "tab": "General information", - "score": 988.24 - }, - "High School Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School European History - # eval": { - "description": "min=165, mean=165, max=165, sum=330 (2)", - "tab": "General information", - "score": 165.0 - }, - "High School European History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School European History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School European History - # prompt tokens": { - "description": "min=3089.109, mean=3089.109, max=3089.109, sum=6178.218 (2)", - "tab": "General information", - "score": 3089.109090909091 - }, - "High School European History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Geography - # eval": { - "description": "min=198, mean=198, max=198, sum=396 (2)", - "tab": "General information", - "score": 198.0 - }, - "High School Geography - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Geography - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Geography - # prompt tokens": { - "description": "min=429.207, mean=429.207, max=429.207, sum=858.414 (2)", - "tab": "General information", - "score": 429.2070707070707 - }, - "High School Geography - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Government And Politics - # eval": { - 
"description": "min=193, mean=193, max=193, sum=386 (2)", - "tab": "General information", - "score": 193.0 - }, - "High School Government And Politics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Government And Politics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # prompt tokens": { - "description": "min=514.808, mean=514.808, max=514.808, sum=1029.617 (2)", - "tab": "General information", - "score": 514.8082901554404 - }, - "High School Government And Politics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Macroeconomics - # eval": { - "description": "min=390, mean=390, max=390, sum=780 (2)", - "tab": "General information", - "score": 390.0 - }, - "High School Macroeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Macroeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - # prompt tokens": { - "description": "min=423.815, mean=423.815, max=423.815, sum=847.631 (2)", - "tab": "General information", - "score": 423.81538461538463 - }, - "High School Macroeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Mathematics - # eval": { - "description": "min=270, mean=270, max=270, sum=540 (2)", - "tab": "General information", - "score": 270.0 - }, - "High School Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # prompt tokens": { - "description": "min=584.13, mean=584.13, max=584.13, sum=1168.259 (2)", - "tab": "General information", - "score": 584.1296296296297 - }, - "High School Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Microeconomics - # eval": { - "description": "min=238, mean=238, max=238, sum=476 (2)", - "tab": "General information", - "score": 238.0 - }, - "High School Microeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Microeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Microeconomics - # prompt tokens": { - "description": "min=443.345, mean=443.345, max=443.345, sum=886.689 (2)", - "tab": "General information", - "score": 443.34453781512605 - }, - "High School Microeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Physics - # eval": { - "description": "min=151, mean=151, max=151, sum=302 (2)", - "tab": "General information", - "score": 151.0 - }, - "High School Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Physics - truncated": { - "description": 
"min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # prompt tokens": { - "description": "min=622.775, mean=622.775, max=622.775, sum=1245.55 (2)", - "tab": "General information", - "score": 622.774834437086 - }, - "High School Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Psychology - # eval": { - "description": "min=545, mean=545, max=545, sum=1090 (2)", - "tab": "General information", - "score": 545.0 - }, - "High School Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # prompt tokens": { - "description": "min=558.873, mean=558.873, max=558.873, sum=1117.747 (2)", - "tab": "General information", - "score": 558.8733944954129 - }, - "High School Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Statistics - # eval": { - "description": "min=216, mean=216, max=216, sum=432 (2)", - "tab": "General information", - "score": 216.0 - }, - "High School Statistics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Statistics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Statistics - # prompt tokens": { - "description": "min=913.644, mean=913.644, max=913.644, sum=1827.287 (2)", - "tab": "General information", - "score": 913.6435185185185 - }, - "High School Statistics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School US History - # eval": { - "description": "min=204, mean=204, max=204, sum=408 (2)", - "tab": "General information", - "score": 204.0 - }, - "High School US History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School US History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # prompt tokens": { - "description": "min=2477.446, mean=2477.446, max=2477.446, sum=4954.892 (2)", - "tab": "General information", - "score": 2477.4460784313724 - }, - "High School US History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School World History - # eval": { - "description": "min=237, mean=237, max=237, sum=474 (2)", - "tab": "General information", - "score": 237.0 - }, - "High School World History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School World History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School World History - # prompt tokens": { - "description": "min=1585.553, mean=1585.553, max=1585.553, sum=3171.105 (2)", - "tab": "General information", - "score": 1585.5527426160338 - }, - "High School World History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": 
"General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" - } - } - }, - { - "evaluation_name": "Human Sexuality", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Human Sexuality", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.87, - "details": { - "description": "min=0.87, mean=0.87, max=0.87, sum=1.74 (2)", - "tab": "Accuracy", - "Human Aging - Observed inference time (s)": { - "description": "min=0.303, mean=0.303, max=0.303, sum=0.607 (2)", - "tab": "Efficiency", - "score": 0.30348238068311206 - }, - "Human Sexuality - Observed inference time (s)": { - "description": "min=0.304, mean=0.304, max=0.304, sum=0.608 (2)", - "tab": "Efficiency", - "score": 0.30424233429304515 - }, - "Human Aging - # eval": { - "description": "min=223, mean=223, max=223, sum=446 (2)", - "tab": "General information", - "score": 223.0 - }, - "Human Aging - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Aging - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Aging - # prompt tokens": { - "description": "min=353.152, mean=353.152, max=353.152, sum=706.305 (2)", - "tab": "General information", - "score": 353.15246636771303 - }, - "Human Aging - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Human Sexuality - # eval": { - "description": "min=131, mean=131, max=131, sum=262 (2)", - "tab": "General information", - "score": 131.0 - }, - "Human Sexuality - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Sexuality - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # prompt tokens": { - "description": "min=394.748, mean=394.748, max=394.748, sum=789.496 (2)", - "tab": "General information", - "score": 394.7480916030534 - }, - "Human Sexuality - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" - } - } - }, - { - "evaluation_name": "International Law", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on International Law", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.86, - "details": { - "description": "min=0.86, mean=0.86, max=0.86, sum=1.719 (2)", - "tab": "Accuracy", - "International Law - Observed inference time (s)": { - "description": "min=0.354, mean=0.354, max=0.354, 
sum=0.708 (2)", - "tab": "Efficiency", - "score": 0.354031091879222 - }, - "International Law - # eval": { - "description": "min=121, mean=121, max=121, sum=242 (2)", - "tab": "General information", - "score": 121.0 - }, - "International Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "International Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "International Law - # prompt tokens": { - "description": "min=720.182, mean=720.182, max=720.182, sum=1440.364 (2)", - "tab": "General information", - "score": 720.1818181818181 - }, - "International Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" - } - } - }, - { - "evaluation_name": "Logical Fallacies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Logical Fallacies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.767, - "details": { - "description": "min=0.767, mean=0.767, max=0.767, sum=1.534 (2)", - "tab": "Accuracy", - "Logical Fallacies - Observed inference time (s)": { - "description": "min=0.334, mean=0.334, max=0.334, sum=0.668 (2)", - "tab": "Efficiency", - "score": 0.3338228237409533 - }, - "Logical Fallacies - # eval": { - "description": "min=163, mean=163, max=163, sum=326 (2)", - "tab": "General information", - "score": 163.0 - }, - "Logical Fallacies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Logical Fallacies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Logical Fallacies - # prompt tokens": { - "description": "min=486.779, mean=486.779, max=486.779, sum=973.558 (2)", - "tab": "General information", - "score": 486.77914110429447 - }, - "Logical Fallacies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" - } - } - }, - { - "evaluation_name": "Machine Learning", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Machine Learning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.509, - "details": { - "description": "min=0.509, mean=0.509, max=0.509, sum=1.018 (2)", - "tab": "Accuracy", - "Machine Learning - Observed inference time (s)": { - "description": "min=0.349, mean=0.349, max=0.349, sum=0.697 (2)", - "tab": "Efficiency", - "score": 0.34853318120752064 - }, - "Machine Learning - # eval": { 
- "description": "min=112, mean=112, max=112, sum=224 (2)", - "tab": "General information", - "score": 112.0 - }, - "Machine Learning - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Machine Learning - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Machine Learning - # prompt tokens": { - "description": "min=734.83, mean=734.83, max=734.83, sum=1469.661 (2)", - "tab": "General information", - "score": 734.8303571428571 - }, - "Machine Learning - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" - } - } - }, - { - "evaluation_name": "Management", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Management", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.845, - "details": { - "description": "min=0.845, mean=0.845, max=0.845, sum=1.689 (2)", - "tab": "Accuracy", - "Management - Observed inference time (s)": { - "description": "min=0.325, mean=0.325, max=0.325, sum=0.651 (2)", - "tab": "Efficiency", - "score": 0.32549439124690677 - }, - "Management - # eval": { - "description": "min=103, mean=103, max=103, sum=206 (2)", - "tab": "General information", - "score": 103.0 - }, - "Management - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Management - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Management - # prompt tokens": { - "description": "min=315.359, mean=315.359, max=315.359, sum=630.718 (2)", - "tab": "General information", - "score": 315.3592233009709 - }, - "Management - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" - } - } - }, - { - "evaluation_name": "Marketing", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Marketing", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.923, - "details": { - "description": "min=0.923, mean=0.923, max=0.923, sum=1.846 (2)", - "tab": "Accuracy", - "Marketing - Observed inference time (s)": { - "description": "min=0.316, mean=0.316, max=0.316, sum=0.631 (2)", - "tab": "Efficiency", - "score": 0.315602661198021 - }, - "Marketing - # eval": { - "description": "min=234, mean=234, max=234, sum=468 (2)", - "tab": "General information", - "score": 234.0 - }, - "Marketing - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General 
information", - "score": 5.0 - }, - "Marketing - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Marketing - # prompt tokens": { - "description": "min=463.423, mean=463.423, max=463.423, sum=926.846 (2)", - "tab": "General information", - "score": 463.4230769230769 - }, - "Marketing - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" - } - } - }, - { - "evaluation_name": "Medical Genetics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Medical Genetics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.76, - "details": { - "description": "min=0.76, mean=0.76, max=0.76, sum=1.52 (2)", - "tab": "Accuracy", - "Medical Genetics - Observed inference time (s)": { - "description": "min=0.316, mean=0.316, max=0.316, sum=0.632 (2)", - "tab": "Efficiency", - "score": 0.3161799097061157 - }, - "Medical Genetics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Medical Genetics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Medical Genetics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Medical Genetics - # prompt tokens": { - "description": "min=405.71, mean=405.71, max=405.71, sum=811.42 (2)", - "tab": "General information", - "score": 405.71 - }, - "Medical Genetics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" - } - } - }, - { - "evaluation_name": "Miscellaneous", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Miscellaneous", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.881, - "details": { - "description": "min=0.881, mean=0.881, max=0.881, sum=1.762 (2)", - "tab": "Accuracy", - "Miscellaneous - Observed inference time (s)": { - "description": "min=0.323, mean=0.323, max=0.323, sum=0.645 (2)", - "tab": "Efficiency", - "score": 0.32256904598396857 - }, - "Miscellaneous - # eval": { - "description": "min=783, mean=783, max=783, sum=1566 (2)", - "tab": "General information", - "score": 783.0 - }, - "Miscellaneous - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Miscellaneous - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Miscellaneous - # 
prompt tokens": { - "description": "min=348.519, mean=348.519, max=348.519, sum=697.037 (2)", - "tab": "General information", - "score": 348.51851851851853 - }, - "Miscellaneous - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" - } - } - }, - { - "evaluation_name": "Moral Scenarios", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Moral Scenarios", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.444, - "details": { - "description": "min=0.444, mean=0.444, max=0.444, sum=0.887 (2)", - "tab": "Accuracy", - "Moral Disputes - Observed inference time (s)": { - "description": "min=0.304, mean=0.304, max=0.304, sum=0.607 (2)", - "tab": "Efficiency", - "score": 0.3035011126126857 - }, - "Moral Scenarios - Observed inference time (s)": { - "description": "min=0.345, mean=0.345, max=0.345, sum=0.69 (2)", - "tab": "Efficiency", - "score": 0.34521307439111465 - }, - "Moral Disputes - # eval": { - "description": "min=346, mean=346, max=346, sum=692 (2)", - "tab": "General information", - "score": 346.0 - }, - "Moral Disputes - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Disputes - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Disputes - # prompt tokens": { - "description": "min=540.038, mean=540.038, max=540.038, sum=1080.075 (2)", - "tab": "General information", - "score": 540.0375722543353 - }, - "Moral Disputes - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Moral Scenarios - # eval": { - "description": "min=895, mean=895, max=895, sum=1790 (2)", - "tab": "General information", - "score": 895.0 - }, - "Moral Scenarios - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Scenarios - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # prompt tokens": { - "description": "min=745.516, mean=745.516, max=745.516, sum=1491.032 (2)", - "tab": "General information", - "score": 745.5162011173185 - }, - "Moral Scenarios - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" - } - } - }, - { - "evaluation_name": "Nutrition", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Nutrition", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.83, - "details": { - "description": "min=0.83, mean=0.83, max=0.83, sum=1.66 (2)", - "tab": "Accuracy", - "Nutrition - Observed inference time (s)": { - "description": "min=0.353, mean=0.353, max=0.353, sum=0.706 (2)", - "tab": "Efficiency", - "score": 0.3528824195363163 - }, - "Nutrition - # eval": { - "description": "min=306, mean=306, max=306, sum=612 (2)", - "tab": "General information", - "score": 306.0 - }, - "Nutrition - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Nutrition - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Nutrition - # prompt tokens": { - "description": "min=680.69, mean=680.69, max=680.69, sum=1361.379 (2)", - "tab": "General information", - "score": 680.6895424836601 - }, - "Nutrition - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" - } - } - }, - { - "evaluation_name": "Prehistory", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Prehistory", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.849, - "details": { - "description": "min=0.849, mean=0.849, max=0.849, sum=1.698 (2)", - "tab": "Accuracy", - "Prehistory - Observed inference time (s)": { - "description": "min=0.33, mean=0.33, max=0.33, sum=0.66 (2)", - "tab": "Efficiency", - "score": 0.32980361028953836 - }, - "Prehistory - # eval": { - "description": "min=324, mean=324, max=324, sum=648 (2)", - "tab": "General information", - "score": 324.0 - }, - "Prehistory - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Prehistory - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Prehistory - # prompt tokens": { - "description": "min=602.145, mean=602.145, max=602.145, sum=1204.29 (2)", - "tab": "General information", - "score": 602.145061728395 - }, - "Prehistory - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" - } - } - }, - { - "evaluation_name": "Public Relations", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Public Relations", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.682, - "details": { - "description": "min=0.682, mean=0.682, max=0.682, sum=1.364 (2)", - "tab": "Accuracy", - "Public Relations - Observed inference time (s)": { - "description": "min=0.321, mean=0.321, max=0.321, 
sum=0.643 (2)", - "tab": "Efficiency", - "score": 0.32145483710549094 - }, - "Public Relations - # eval": { - "description": "min=110, mean=110, max=110, sum=220 (2)", - "tab": "General information", - "score": 110.0 - }, - "Public Relations - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Public Relations - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Public Relations - # prompt tokens": { - "description": "min=462.036, mean=462.036, max=462.036, sum=924.073 (2)", - "tab": "General information", - "score": 462.03636363636366 - }, - "Public Relations - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" - } - } - }, - { - "evaluation_name": "Security Studies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Security Studies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.792, - "details": { - "description": "min=0.792, mean=0.792, max=0.792, sum=1.584 (2)", - "tab": "Accuracy", - "Security Studies - Observed inference time (s)": { - "description": "min=0.391, mean=0.391, max=0.391, sum=0.783 (2)", - "tab": "Efficiency", - "score": 0.3913051323014863 - }, - "Security Studies - # eval": { - "description": "min=245, mean=245, max=245, sum=490 (2)", - "tab": "General information", - "score": 245.0 - }, - "Security Studies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Security Studies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Security Studies - # prompt tokens": { - "description": "min=1315.865, mean=1315.865, max=1315.865, sum=2631.731 (2)", - "tab": "General information", - "score": 1315.865306122449 - }, - "Security Studies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" - } - } - }, - { - "evaluation_name": "Sociology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Sociology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.871, - "details": { - "description": "min=0.871, mean=0.871, max=0.871, sum=1.741 (2)", - "tab": "Accuracy", - "Sociology - Observed inference time (s)": { - "description": "min=0.326, mean=0.326, max=0.326, sum=0.652 (2)", - "tab": "Efficiency", - "score": 0.326159788008353 - }, - "Sociology - # eval": { - "description": "min=201, mean=201, 
max=201, sum=402 (2)", - "tab": "General information", - "score": 201.0 - }, - "Sociology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Sociology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Sociology - # prompt tokens": { - "description": "min=487.95, mean=487.95, max=487.95, sum=975.9 (2)", - "tab": "General information", - "score": 487.9502487562189 - }, - "Sociology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" - } - } - }, - { - "evaluation_name": "Virology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Virology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.506, - "details": { - "description": "min=0.506, mean=0.506, max=0.506, sum=1.012 (2)", - "tab": "Accuracy", - "Virology - Observed inference time (s)": { - "description": "min=0.343, mean=0.343, max=0.343, sum=0.686 (2)", - "tab": "Efficiency", - "score": 0.34297854210956985 - }, - "Virology - # eval": { - "description": "min=166, mean=166, max=166, sum=332 (2)", - "tab": "General information", - "score": 166.0 - }, - "Virology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Virology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Virology - # prompt tokens": { - "description": "min=395.349, mean=395.349, max=395.349, sum=790.699 (2)", - "tab": "General information", - "score": 395.34939759036143 - }, - "Virology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" - } - } - }, - { - "evaluation_name": "World Religions", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on World Religions", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.871, - "details": { - "description": "min=0.871, mean=0.871, max=0.871, sum=1.743 (2)", - "tab": "Accuracy", - "World Religions - Observed inference time (s)": { - "description": "min=0.317, mean=0.317, max=0.317, sum=0.633 (2)", - "tab": "Efficiency", - "score": 0.3165940499445151 - }, - "World Religions - # eval": { - "description": "min=171, mean=171, max=171, sum=342 (2)", - "tab": "General information", - "score": 171.0 - }, - "World Religions - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "World Religions - truncated": { - 
"description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "World Religions - # prompt tokens": { - "description": "min=308.924, mean=308.924, max=308.924, sum=617.848 (2)", - "tab": "General information", - "score": 308.92397660818716 - }, - "World Religions - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" - } - } - }, - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.689, - "details": { - "tab": "Efficiency" - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_mmlu/mistralai/open-mistral-nemo-2407/e4c3032d-04e0-414b-a7e9-e30756d82000.json b/data/helm_mmlu/mistralai/open-mistral-nemo-2407/e4c3032d-04e0-414b-a7e9-e30756d82000.json deleted file mode 100644 index e5aec6b67..000000000 --- a/data/helm_mmlu/mistralai/open-mistral-nemo-2407/e4c3032d-04e0-414b-a7e9-e30756d82000.json +++ /dev/null @@ -1,3021 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/mistralai_open-mistral-nemo-2407/1770835937.459157", - "retrieved_timestamp": "1770835937.459157", - "source_metadata": { - "source_name": "helm_mmlu", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral NeMo 2402", - "id": "mistralai/open-mistral-nemo-2407", - "developer": "mistralai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "MMLU All Subjects", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU All Subjects", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.653, - "details": { - "description": "min=0.29, mean=0.653, max=0.912, sum=74.476 (114)", - "tab": "Accuracy", - "MMLU All Subjects - Observed inference time (s)": { - "description": "min=0.57, mean=0.852, max=1.185, sum=97.097 (114)", - "tab": "Efficiency", - "score": 0.8517321572873682 - }, - "MMLU All Subjects - # eval": { - "description": "min=100, mean=246.351, max=1534, sum=28084 (114)", - "tab": "General information", - "score": 246.35087719298247 - }, - "MMLU All Subjects - # train": { - "description": "min=5, mean=5, max=5, sum=570 (114)", - "tab": "General information", - "score": 5.0 - }, - "MMLU All Subjects - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (114)", - "tab": "General information", - "score": 0.0 - }, - "MMLU All Subjects - # prompt tokens": { - "description": "min=275.181, mean=627.375, max=2825.394, sum=71520.789 (114)", - "tab": "General information", 
- "score": 627.3753397392697 - }, - "MMLU All Subjects - # output tokens": { - "description": "min=1, mean=1, max=1, sum=114 (114)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] - } - } - }, - { - "evaluation_name": "Abstract Algebra", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - 
"metric_config": { - "evaluation_description": "EM on Abstract Algebra", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.29, - "details": { - "description": "min=0.29, mean=0.29, max=0.29, sum=0.58 (2)", - "tab": "Accuracy", - "Abstract Algebra - Observed inference time (s)": { - "description": "min=0.643, mean=0.643, max=0.643, sum=1.286 (2)", - "tab": "Efficiency", - "score": 0.6429726719856262 - }, - "Abstract Algebra - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Abstract Algebra - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Abstract Algebra - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Abstract Algebra - # prompt tokens": { - "description": "min=377.89, mean=377.89, max=377.89, sum=755.78 (2)", - "tab": "General information", - "score": 377.89 - }, - "Abstract Algebra - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" - } - } - }, - { - "evaluation_name": "Anatomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Anatomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.607, - "details": { - "description": "min=0.607, mean=0.607, max=0.607, sum=1.215 (2)", - "tab": "Accuracy", - "Anatomy - Observed inference time (s)": { - "description": "min=0.784, mean=0.784, max=0.784, sum=1.569 (2)", - "tab": "Efficiency", - "score": 0.7843294850102177 - }, - "Anatomy - # eval": { - "description": "min=135, mean=135, max=135, sum=270 (2)", - "tab": "General information", - "score": 135.0 - }, - "Anatomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Anatomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Anatomy - # prompt tokens": { - "description": "min=334.711, mean=334.711, max=334.711, sum=669.422 (2)", - "tab": "General information", - "score": 334.7111111111111 - }, - "Anatomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" - } - } - }, - { - "evaluation_name": "College Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on College Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.373, - "details": { - 
"description": "min=0.373, mean=0.373, max=0.373, sum=0.745 (2)", - "tab": "Accuracy", - "College Chemistry - Observed inference time (s)": { - "description": "min=0.635, mean=0.635, max=0.635, sum=1.27 (2)", - "tab": "Efficiency", - "score": 0.6347627878189087 - }, - "College Biology - Observed inference time (s)": { - "description": "min=0.743, mean=0.743, max=0.743, sum=1.487 (2)", - "tab": "Efficiency", - "score": 0.7433112810055414 - }, - "College Computer Science - Observed inference time (s)": { - "description": "min=0.764, mean=0.764, max=0.764, sum=1.529 (2)", - "tab": "Efficiency", - "score": 0.7643197441101074 - }, - "College Mathematics - Observed inference time (s)": { - "description": "min=0.807, mean=0.807, max=0.807, sum=1.614 (2)", - "tab": "Efficiency", - "score": 0.8069064331054687 - }, - "College Medicine - Observed inference time (s)": { - "description": "min=0.913, mean=0.913, max=0.913, sum=1.825 (2)", - "tab": "Efficiency", - "score": 0.9125060442555157 - }, - "College Physics - Observed inference time (s)": { - "description": "min=0.792, mean=0.792, max=0.792, sum=1.584 (2)", - "tab": "Efficiency", - "score": 0.7920899648292392 - }, - "College Chemistry - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Chemistry - # prompt tokens": { - "description": "min=563.78, mean=563.78, max=563.78, sum=1127.56 (2)", - "tab": "General information", - "score": 563.78 - }, - "College Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Biology - # eval": { - "description": "min=144, mean=144, max=144, sum=288 (2)", - "tab": "General information", - "score": 144.0 - }, - "College Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # prompt tokens": { - "description": "min=471.931, mean=471.931, max=471.931, sum=943.861 (2)", - "tab": "General information", - "score": 471.93055555555554 - }, - "College Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer Science - # prompt tokens": { - "description": "min=844.21, mean=844.21, max=844.21, sum=1688.42 (2)", - "tab": "General information", - "score": 844.21 - }, - "College Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Mathematics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - 
"tab": "General information", - "score": 100.0 - }, - "College Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # prompt tokens": { - "description": "min=609.39, mean=609.39, max=609.39, sum=1218.78 (2)", - "tab": "General information", - "score": 609.39 - }, - "College Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Medicine - # eval": { - "description": "min=173, mean=173, max=173, sum=346 (2)", - "tab": "General information", - "score": 173.0 - }, - "College Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Medicine - # prompt tokens": { - "description": "min=497.538, mean=497.538, max=497.538, sum=995.075 (2)", - "tab": "General information", - "score": 497.53757225433526 - }, - "College Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Physics - # eval": { - "description": "min=102, mean=102, max=102, sum=204 (2)", - "tab": "General information", - "score": 102.0 - }, - "College Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Physics - # prompt tokens": { - "description": "min=512.941, mean=512.941, max=512.941, sum=1025.882 (2)", - "tab": "General information", - "score": 512.9411764705883 - }, - "College Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" - } - } - }, - { - "evaluation_name": "Computer Security", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Computer Security", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.81, - "details": { - "description": "min=0.81, mean=0.81, max=0.81, sum=1.62 (2)", - "tab": "Accuracy", - "Computer Security - Observed inference time (s)": { - "description": "min=1.011, mean=1.011, max=1.011, sum=2.023 (2)", - "tab": "Efficiency", - "score": 1.0114419960975647 - }, - "Computer Security - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Computer Security - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Computer Security - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - 
"Computer Security - # prompt tokens": { - "description": "min=395.27, mean=395.27, max=395.27, sum=790.54 (2)", - "tab": "General information", - "score": 395.27 - }, - "Computer Security - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" - } - } - }, - { - "evaluation_name": "Econometrics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Econometrics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.561, - "details": { - "description": "min=0.561, mean=0.561, max=0.561, sum=1.123 (2)", - "tab": "Accuracy", - "Econometrics - Observed inference time (s)": { - "description": "min=0.766, mean=0.766, max=0.766, sum=1.531 (2)", - "tab": "Efficiency", - "score": 0.7657254641516167 - }, - "Econometrics - # eval": { - "description": "min=114, mean=114, max=114, sum=228 (2)", - "tab": "General information", - "score": 114.0 - }, - "Econometrics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Econometrics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Econometrics - # prompt tokens": { - "description": "min=631.851, mean=631.851, max=631.851, sum=1263.702 (2)", - "tab": "General information", - "score": 631.8508771929825 - }, - "Econometrics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" - } - } - }, - { - "evaluation_name": "Global Facts", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Global Facts", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4, - "details": { - "description": "min=0.4, mean=0.4, max=0.4, sum=0.8 (2)", - "tab": "Accuracy", - "Global Facts - Observed inference time (s)": { - "description": "min=0.842, mean=0.842, max=0.842, sum=1.683 (2)", - "tab": "Efficiency", - "score": 0.8416926956176758 - }, - "Global Facts - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Global Facts - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Global Facts - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Global Facts - # prompt tokens": { - "description": "min=433.39, mean=433.39, max=433.39, sum=866.78 (2)", - "tab": "General information", - "score": 433.39 - }, - "Global Facts - # output tokens": { - 
"description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" - } - } - }, - { - "evaluation_name": "Jurisprudence", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Jurisprudence", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.796, - "details": { - "description": "min=0.796, mean=0.796, max=0.796, sum=1.593 (2)", - "tab": "Accuracy", - "Jurisprudence - Observed inference time (s)": { - "description": "min=0.837, mean=0.837, max=0.837, sum=1.674 (2)", - "tab": "Efficiency", - "score": 0.8370662177050555 - }, - "Jurisprudence - # eval": { - "description": "min=108, mean=108, max=108, sum=216 (2)", - "tab": "General information", - "score": 108.0 - }, - "Jurisprudence - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Jurisprudence - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Jurisprudence - # prompt tokens": { - "description": "min=399.019, mean=399.019, max=399.019, sum=798.037 (2)", - "tab": "General information", - "score": 399.01851851851853 - }, - "Jurisprudence - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" - } - } - }, - { - "evaluation_name": "Philosophy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Philosophy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.733, - "details": { - "description": "min=0.733, mean=0.733, max=0.733, sum=1.466 (2)", - "tab": "Accuracy", - "Philosophy - Observed inference time (s)": { - "description": "min=0.877, mean=0.877, max=0.877, sum=1.755 (2)", - "tab": "Efficiency", - "score": 0.8774675686643054 - }, - "Philosophy - # eval": { - "description": "min=311, mean=311, max=311, sum=622 (2)", - "tab": "General information", - "score": 311.0 - }, - "Philosophy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Philosophy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Philosophy - # prompt tokens": { - "description": "min=331.354, mean=331.354, max=331.354, sum=662.707 (2)", - "tab": "General information", - "score": 331.35369774919616 - }, - "Philosophy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "philosophy", - "method": 
"multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" - } - } - }, - { - "evaluation_name": "Professional Psychology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Professional Psychology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.588, - "details": { - "description": "min=0.588, mean=0.588, max=0.588, sum=1.176 (2)", - "tab": "Accuracy", - "Professional Medicine - Observed inference time (s)": { - "description": "min=0.731, mean=0.731, max=0.731, sum=1.462 (2)", - "tab": "Efficiency", - "score": 0.7308363747947356 - }, - "Professional Accounting - Observed inference time (s)": { - "description": "min=0.825, mean=0.825, max=0.825, sum=1.649 (2)", - "tab": "Efficiency", - "score": 0.824517419152226 - }, - "Professional Law - Observed inference time (s)": { - "description": "min=0.812, mean=0.812, max=0.812, sum=1.625 (2)", - "tab": "Efficiency", - "score": 0.8123439646761917 - }, - "Professional Psychology - Observed inference time (s)": { - "description": "min=0.757, mean=0.757, max=0.757, sum=1.515 (2)", - "tab": "Efficiency", - "score": 0.757308129391639 - }, - "Professional Medicine - # eval": { - "description": "min=272, mean=272, max=272, sum=544 (2)", - "tab": "General information", - "score": 272.0 - }, - "Professional Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Medicine - # prompt tokens": { - "description": "min=1091.357, mean=1091.357, max=1091.357, sum=2182.713 (2)", - "tab": "General information", - "score": 1091.3566176470588 - }, - "Professional Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Accounting - # eval": { - "description": "min=282, mean=282, max=282, sum=564 (2)", - "tab": "General information", - "score": 282.0 - }, - "Professional Accounting - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Accounting - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Accounting - # prompt tokens": { - "description": "min=749.039, mean=749.039, max=749.039, sum=1498.078 (2)", - "tab": "General information", - "score": 749.0390070921986 - }, - "Professional Accounting - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Law - # eval": { - "description": "min=1534, mean=1534, max=1534, sum=3068 (2)", - "tab": "General information", - "score": 1534.0 - }, - "Professional Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # prompt tokens": { - "description": "min=1710.472, mean=1710.472, max=1710.472, sum=3420.944 (2)", - 
"tab": "General information", - "score": 1710.4719687092568 - }, - "Professional Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Psychology - # eval": { - "description": "min=612, mean=612, max=612, sum=1224 (2)", - "tab": "General information", - "score": 612.0 - }, - "Professional Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # prompt tokens": { - "description": "min=584.748, mean=584.748, max=584.748, sum=1169.497 (2)", - "tab": "General information", - "score": 584.7483660130719 - }, - "Professional Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" - } - } - }, - { - "evaluation_name": "Us Foreign Policy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Us Foreign Policy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.89, - "details": { - "description": "min=0.89, mean=0.89, max=0.89, sum=1.78 (2)", - "tab": "Accuracy", - "Us Foreign Policy - Observed inference time (s)": { - "description": "min=0.853, mean=0.853, max=0.853, sum=1.706 (2)", - "tab": "Efficiency", - "score": 0.8529575586318969 - }, - "Us Foreign Policy - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Us Foreign Policy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Us Foreign Policy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Us Foreign Policy - # prompt tokens": { - "description": "min=430.83, mean=430.83, max=430.83, sum=861.66 (2)", - "tab": "General information", - "score": 430.83 - }, - "Us Foreign Policy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" - } - } - }, - { - "evaluation_name": "Astronomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Astronomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.691, - "details": { - "description": "min=0.691, mean=0.691, max=0.691, sum=1.382 (2)", - "tab": "Accuracy", - "Astronomy - Observed inference time (s)": { - "description": 
"min=0.947, mean=0.947, max=0.947, sum=1.895 (2)", - "tab": "Efficiency", - "score": 0.9474252227105593 - }, - "Astronomy - # eval": { - "description": "min=152, mean=152, max=152, sum=304 (2)", - "tab": "General information", - "score": 152.0 - }, - "Astronomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Astronomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Astronomy - # prompt tokens": { - "description": "min=587.053, mean=587.053, max=587.053, sum=1174.105 (2)", - "tab": "General information", - "score": 587.0526315789474 - }, - "Astronomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" - } - } - }, - { - "evaluation_name": "Business Ethics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Business Ethics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.49, - "details": { - "description": "min=0.49, mean=0.49, max=0.49, sum=0.98 (2)", - "tab": "Accuracy", - "Business Ethics - Observed inference time (s)": { - "description": "min=0.719, mean=0.719, max=0.719, sum=1.438 (2)", - "tab": "Efficiency", - "score": 0.7189487242698669 - }, - "Business Ethics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Business Ethics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Business Ethics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Business Ethics - # prompt tokens": { - "description": "min=597.68, mean=597.68, max=597.68, sum=1195.36 (2)", - "tab": "General information", - "score": 597.68 - }, - "Business Ethics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" - } - } - }, - { - "evaluation_name": "Clinical Knowledge", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Clinical Knowledge", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.736, - "details": { - "description": "min=0.736, mean=0.736, max=0.736, sum=1.472 (2)", - "tab": "Accuracy", - "Clinical Knowledge - Observed inference time (s)": { - "description": "min=0.822, mean=0.822, max=0.822, sum=1.643 (2)", - "tab": "Efficiency", - "score": 0.8215559176678927 - }, - "Clinical Knowledge - # eval": { - "description": "min=265, mean=265, max=265, sum=530 
(2)", - "tab": "General information", - "score": 265.0 - }, - "Clinical Knowledge - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Clinical Knowledge - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Clinical Knowledge - # prompt tokens": { - "description": "min=382.989, mean=382.989, max=382.989, sum=765.977 (2)", - "tab": "General information", - "score": 382.98867924528304 - }, - "Clinical Knowledge - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" - } - } - }, - { - "evaluation_name": "Conceptual Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Conceptual Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.647, - "details": { - "description": "min=0.647, mean=0.647, max=0.647, sum=1.294 (2)", - "tab": "Accuracy", - "Conceptual Physics - Observed inference time (s)": { - "description": "min=0.788, mean=0.788, max=0.788, sum=1.576 (2)", - "tab": "Efficiency", - "score": 0.7878646302730479 - }, - "Conceptual Physics - # eval": { - "description": "min=235, mean=235, max=235, sum=470 (2)", - "tab": "General information", - "score": 235.0 - }, - "Conceptual Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Conceptual Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Conceptual Physics - # prompt tokens": { - "description": "min=301.336, mean=301.336, max=301.336, sum=602.672 (2)", - "tab": "General information", - "score": 301.336170212766 - }, - "Conceptual Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" - } - } - }, - { - "evaluation_name": "Electrical Engineering", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Electrical Engineering", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.531, - "details": { - "description": "min=0.531, mean=0.531, max=0.531, sum=1.062 (2)", - "tab": "Accuracy", - "Electrical Engineering - Observed inference time (s)": { - "description": "min=0.658, mean=0.658, max=0.658, sum=1.316 (2)", - "tab": "Efficiency", - "score": 0.6578493726664576 - }, - "Electrical Engineering - # eval": { - "description": "min=145, mean=145, max=145, sum=290 (2)", - "tab": "General information", - "score": 145.0 - }, - 
"Electrical Engineering - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Electrical Engineering - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Electrical Engineering - # prompt tokens": { - "description": "min=464.697, mean=464.697, max=464.697, sum=929.393 (2)", - "tab": "General information", - "score": 464.6965517241379 - }, - "Electrical Engineering - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" - } - } - }, - { - "evaluation_name": "Elementary Mathematics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Elementary Mathematics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.439, - "details": { - "description": "min=0.439, mean=0.439, max=0.439, sum=0.878 (2)", - "tab": "Accuracy", - "Elementary Mathematics - Observed inference time (s)": { - "description": "min=0.712, mean=0.712, max=0.712, sum=1.423 (2)", - "tab": "Efficiency", - "score": 0.7115525694751235 - }, - "Elementary Mathematics - # eval": { - "description": "min=378, mean=378, max=378, sum=756 (2)", - "tab": "General information", - "score": 378.0 - }, - "Elementary Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Elementary Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Elementary Mathematics - # prompt tokens": { - "description": "min=580.741, mean=580.741, max=580.741, sum=1161.481 (2)", - "tab": "General information", - "score": 580.7407407407408 - }, - "Elementary Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" - } - } - }, - { - "evaluation_name": "Formal Logic", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Formal Logic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.405, - "details": { - "description": "min=0.405, mean=0.405, max=0.405, sum=0.81 (2)", - "tab": "Accuracy", - "Formal Logic - Observed inference time (s)": { - "description": "min=1.185, mean=1.185, max=1.185, sum=2.37 (2)", - "tab": "Efficiency", - "score": 1.1852161146345592 - }, - "Formal Logic - # eval": { - "description": "min=126, mean=126, max=126, sum=252 (2)", - "tab": "General information", - "score": 126.0 - }, - "Formal Logic - # train": { - "description": 
"min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Formal Logic - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Formal Logic - # prompt tokens": { - "description": "min=619.929, mean=619.929, max=619.929, sum=1239.857 (2)", - "tab": "General information", - "score": 619.9285714285714 - }, - "Formal Logic - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" - } - } - }, - { - "evaluation_name": "High School World History", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on High School World History", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.848, - "details": { - "description": "min=0.848, mean=0.848, max=0.848, sum=1.696 (2)", - "tab": "Accuracy", - "High School Biology - Observed inference time (s)": { - "description": "min=0.97, mean=0.97, max=0.97, sum=1.94 (2)", - "tab": "Efficiency", - "score": 0.9699527340550577 - }, - "High School Chemistry - Observed inference time (s)": { - "description": "min=0.751, mean=0.751, max=0.751, sum=1.503 (2)", - "tab": "Efficiency", - "score": 0.751325937327493 - }, - "High School Computer Science - Observed inference time (s)": { - "description": "min=0.764, mean=0.764, max=0.764, sum=1.528 (2)", - "tab": "Efficiency", - "score": 0.7637556600570679 - }, - "High School European History - Observed inference time (s)": { - "description": "min=0.796, mean=0.796, max=0.796, sum=1.592 (2)", - "tab": "Efficiency", - "score": 0.7959829893979159 - }, - "High School Geography - Observed inference time (s)": { - "description": "min=0.686, mean=0.686, max=0.686, sum=1.373 (2)", - "tab": "Efficiency", - "score": 0.686434592863526 - }, - "High School Government And Politics - Observed inference time (s)": { - "description": "min=0.837, mean=0.837, max=0.837, sum=1.674 (2)", - "tab": "Efficiency", - "score": 0.8370978684005342 - }, - "High School Macroeconomics - Observed inference time (s)": { - "description": "min=1.045, mean=1.045, max=1.045, sum=2.09 (2)", - "tab": "Efficiency", - "score": 1.045194720610594 - }, - "High School Mathematics - Observed inference time (s)": { - "description": "min=0.839, mean=0.839, max=0.839, sum=1.677 (2)", - "tab": "Efficiency", - "score": 0.8386335717307196 - }, - "High School Microeconomics - Observed inference time (s)": { - "description": "min=0.901, mean=0.901, max=0.901, sum=1.802 (2)", - "tab": "Efficiency", - "score": 0.9010114108814913 - }, - "High School Physics - Observed inference time (s)": { - "description": "min=0.93, mean=0.93, max=0.93, sum=1.86 (2)", - "tab": "Efficiency", - "score": 0.9301499767808725 - }, - "High School Psychology - Observed inference time (s)": { - "description": "min=0.975, mean=0.975, max=0.975, sum=1.95 (2)", - "tab": "Efficiency", - "score": 0.9747656953444175 - }, - "High School Statistics - Observed inference time (s)": { - "description": "min=0.798, mean=0.798, max=0.798, sum=1.595 (2)", - "tab": "Efficiency", - 
"score": 0.7976611223485734 - }, - "High School US History - Observed inference time (s)": { - "description": "min=1.112, mean=1.112, max=1.112, sum=2.225 (2)", - "tab": "Efficiency", - "score": 1.1124158618496913 - }, - "High School World History - Observed inference time (s)": { - "description": "min=0.942, mean=0.942, max=0.942, sum=1.883 (2)", - "tab": "Efficiency", - "score": 0.9417288112237987 - }, - "High School Biology - # eval": { - "description": "min=310, mean=310, max=310, sum=620 (2)", - "tab": "General information", - "score": 310.0 - }, - "High School Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Biology - # prompt tokens": { - "description": "min=500.994, mean=500.994, max=500.994, sum=1001.987 (2)", - "tab": "General information", - "score": 500.9935483870968 - }, - "High School Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Chemistry - # eval": { - "description": "min=203, mean=203, max=203, sum=406 (2)", - "tab": "General information", - "score": 203.0 - }, - "High School Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # prompt tokens": { - "description": "min=507.995, mean=507.995, max=507.995, sum=1015.99 (2)", - "tab": "General information", - "score": 507.9950738916256 - }, - "High School Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "High School Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # prompt tokens": { - "description": "min=888.78, mean=888.78, max=888.78, sum=1777.56 (2)", - "tab": "General information", - "score": 888.78 - }, - "High School Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School European History - # eval": { - "description": "min=165, mean=165, max=165, sum=330 (2)", - "tab": "General information", - "score": 165.0 - }, - "High School European History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School European History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School European History - # prompt tokens": { - "description": "min=2825.394, mean=2825.394, max=2825.394, sum=5650.788 (2)", - "tab": "General information", - "score": 2825.3939393939395 - }, - "High School European History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General 
information", - "score": 1.0 - }, - "High School Geography - # eval": { - "description": "min=198, mean=198, max=198, sum=396 (2)", - "tab": "General information", - "score": 198.0 - }, - "High School Geography - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Geography - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Geography - # prompt tokens": { - "description": "min=372.207, mean=372.207, max=372.207, sum=744.414 (2)", - "tab": "General information", - "score": 372.2070707070707 - }, - "High School Geography - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Government And Politics - # eval": { - "description": "min=193, mean=193, max=193, sum=386 (2)", - "tab": "General information", - "score": 193.0 - }, - "High School Government And Politics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Government And Politics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # prompt tokens": { - "description": "min=471.202, mean=471.202, max=471.202, sum=942.404 (2)", - "tab": "General information", - "score": 471.2020725388601 - }, - "High School Government And Politics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Macroeconomics - # eval": { - "description": "min=390, mean=390, max=390, sum=780 (2)", - "tab": "General information", - "score": 390.0 - }, - "High School Macroeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Macroeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - # prompt tokens": { - "description": "min=379.21, mean=379.21, max=379.21, sum=758.421 (2)", - "tab": "General information", - "score": 379.2102564102564 - }, - "High School Macroeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Mathematics - # eval": { - "description": "min=270, mean=270, max=270, sum=540 (2)", - "tab": "General information", - "score": 270.0 - }, - "High School Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # prompt tokens": { - "description": "min=555.807, mean=555.807, max=555.807, sum=1111.615 (2)", - "tab": "General information", - "score": 555.8074074074074 - }, - "High School Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Microeconomics - # eval": { - "description": "min=238, mean=238, max=238, sum=476 (2)", - "tab": "General information", - "score": 238.0 - }, - "High School Microeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - 
"score": 5.0 - }, - "High School Microeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Microeconomics - # prompt tokens": { - "description": "min=405.95, mean=405.95, max=405.95, sum=811.899 (2)", - "tab": "General information", - "score": 405.9495798319328 - }, - "High School Microeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Physics - # eval": { - "description": "min=151, mean=151, max=151, sum=302 (2)", - "tab": "General information", - "score": 151.0 - }, - "High School Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # prompt tokens": { - "description": "min=584.272, mean=584.272, max=584.272, sum=1168.543 (2)", - "tab": "General information", - "score": 584.2715231788079 - }, - "High School Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Psychology - # eval": { - "description": "min=545, mean=545, max=545, sum=1090 (2)", - "tab": "General information", - "score": 545.0 - }, - "High School Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # prompt tokens": { - "description": "min=487.532, mean=487.532, max=487.532, sum=975.064 (2)", - "tab": "General information", - "score": 487.5321100917431 - }, - "High School Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Statistics - # eval": { - "description": "min=216, mean=216, max=216, sum=432 (2)", - "tab": "General information", - "score": 216.0 - }, - "High School Statistics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Statistics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Statistics - # prompt tokens": { - "description": "min=831.926, mean=831.926, max=831.926, sum=1663.852 (2)", - "tab": "General information", - "score": 831.925925925926 - }, - "High School Statistics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School US History - # eval": { - "description": "min=204, mean=204, max=204, sum=408 (2)", - "tab": "General information", - "score": 204.0 - }, - "High School US History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School US History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # prompt tokens": { - "description": "min=2271.559, mean=2271.559, max=2271.559, sum=4543.118 (2)", - "tab": "General information", - "score": 2271.5588235294117 - }, - "High School US History - # output tokens": { - 
"description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School World History - # eval": { - "description": "min=237, mean=237, max=237, sum=474 (2)", - "tab": "General information", - "score": 237.0 - }, - "High School World History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School World History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School World History - # prompt tokens": { - "description": "min=1458.937, mean=1458.937, max=1458.937, sum=2917.873 (2)", - "tab": "General information", - "score": 1458.9367088607594 - }, - "High School World History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" - } - } - }, - { - "evaluation_name": "Human Sexuality", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Human Sexuality", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.702, - "details": { - "description": "min=0.702, mean=0.702, max=0.702, sum=1.405 (2)", - "tab": "Accuracy", - "Human Aging - Observed inference time (s)": { - "description": "min=0.851, mean=0.851, max=0.851, sum=1.703 (2)", - "tab": "Efficiency", - "score": 0.8512581602874892 - }, - "Human Sexuality - Observed inference time (s)": { - "description": "min=0.57, mean=0.57, max=0.57, sum=1.139 (2)", - "tab": "Efficiency", - "score": 0.569578381895109 - }, - "Human Aging - # eval": { - "description": "min=223, mean=223, max=223, sum=446 (2)", - "tab": "General information", - "score": 223.0 - }, - "Human Aging - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Aging - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Aging - # prompt tokens": { - "description": "min=320.296, mean=320.296, max=320.296, sum=640.592 (2)", - "tab": "General information", - "score": 320.29596412556054 - }, - "Human Aging - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Human Sexuality - # eval": { - "description": "min=131, mean=131, max=131, sum=262 (2)", - "tab": "General information", - "score": 131.0 - }, - "Human Sexuality - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Sexuality - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # prompt tokens": { - "description": "min=345.45, mean=345.45, max=345.45, sum=690.901 (2)", - "tab": "General information", - "score": 345.4503816793893 - }, - "Human Sexuality - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - 
"generation_config": { - "additional_details": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" - } - } - }, - { - "evaluation_name": "International Law", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on International Law", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.769, - "details": { - "description": "min=0.769, mean=0.769, max=0.769, sum=1.537 (2)", - "tab": "Accuracy", - "International Law - Observed inference time (s)": { - "description": "min=0.779, mean=0.779, max=0.779, sum=1.558 (2)", - "tab": "Efficiency", - "score": 0.7790698473118554 - }, - "International Law - # eval": { - "description": "min=121, mean=121, max=121, sum=242 (2)", - "tab": "General information", - "score": 121.0 - }, - "International Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "International Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "International Law - # prompt tokens": { - "description": "min=649.017, mean=649.017, max=649.017, sum=1298.033 (2)", - "tab": "General information", - "score": 649.0165289256198 - }, - "International Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" - } - } - }, - { - "evaluation_name": "Logical Fallacies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Logical Fallacies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.791, - "details": { - "description": "min=0.791, mean=0.791, max=0.791, sum=1.583 (2)", - "tab": "Accuracy", - "Logical Fallacies - Observed inference time (s)": { - "description": "min=1.077, mean=1.077, max=1.077, sum=2.154 (2)", - "tab": "Efficiency", - "score": 1.0772201810146402 - }, - "Logical Fallacies - # eval": { - "description": "min=163, mean=163, max=163, sum=326 (2)", - "tab": "General information", - "score": 163.0 - }, - "Logical Fallacies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Logical Fallacies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Logical Fallacies - # prompt tokens": { - "description": "min=449.595, mean=449.595, max=449.595, sum=899.19 (2)", - "tab": "General information", - "score": 449.5950920245399 - }, - "Logical Fallacies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "logical_fallacies", - "method": 
"multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" - } - } - }, - { - "evaluation_name": "Machine Learning", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Machine Learning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.402, - "details": { - "description": "min=0.402, mean=0.402, max=0.402, sum=0.804 (2)", - "tab": "Accuracy", - "Machine Learning - Observed inference time (s)": { - "description": "min=1.123, mean=1.123, max=1.123, sum=2.246 (2)", - "tab": "Efficiency", - "score": 1.1229032427072525 - }, - "Machine Learning - # eval": { - "description": "min=112, mean=112, max=112, sum=224 (2)", - "tab": "General information", - "score": 112.0 - }, - "Machine Learning - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Machine Learning - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Machine Learning - # prompt tokens": { - "description": "min=684.696, mean=684.696, max=684.696, sum=1369.393 (2)", - "tab": "General information", - "score": 684.6964285714286 - }, - "Machine Learning - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" - } - } - }, - { - "evaluation_name": "Management", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Management", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.796, - "details": { - "description": "min=0.796, mean=0.796, max=0.796, sum=1.592 (2)", - "tab": "Accuracy", - "Management - Observed inference time (s)": { - "description": "min=0.786, mean=0.786, max=0.786, sum=1.571 (2)", - "tab": "Efficiency", - "score": 0.7855723436596325 - }, - "Management - # eval": { - "description": "min=103, mean=103, max=103, sum=206 (2)", - "tab": "General information", - "score": 103.0 - }, - "Management - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Management - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Management - # prompt tokens": { - "description": "min=286.272, mean=286.272, max=286.272, sum=572.544 (2)", - "tab": "General information", - "score": 286.2718446601942 - }, - "Management - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" - } - } - }, - { - "evaluation_name": "Marketing", - "source_data": { - "dataset_name": 
"helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Marketing", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.889, - "details": { - "description": "min=0.889, mean=0.889, max=0.889, sum=1.778 (2)", - "tab": "Accuracy", - "Marketing - Observed inference time (s)": { - "description": "min=0.859, mean=0.859, max=0.859, sum=1.719 (2)", - "tab": "Efficiency", - "score": 0.8593697160737127 - }, - "Marketing - # eval": { - "description": "min=234, mean=234, max=234, sum=468 (2)", - "tab": "General information", - "score": 234.0 - }, - "Marketing - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Marketing - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Marketing - # prompt tokens": { - "description": "min=408.308, mean=408.308, max=408.308, sum=816.615 (2)", - "tab": "General information", - "score": 408.3076923076923 - }, - "Marketing - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" - } - } - }, - { - "evaluation_name": "Medical Genetics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Medical Genetics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.78, - "details": { - "description": "min=0.78, mean=0.78, max=0.78, sum=1.56 (2)", - "tab": "Accuracy", - "Medical Genetics - Observed inference time (s)": { - "description": "min=0.829, mean=0.829, max=0.829, sum=1.658 (2)", - "tab": "Efficiency", - "score": 0.8288634467124939 - }, - "Medical Genetics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Medical Genetics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Medical Genetics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Medical Genetics - # prompt tokens": { - "description": "min=332.56, mean=332.56, max=332.56, sum=665.12 (2)", - "tab": "General information", - "score": 332.56 - }, - "Medical Genetics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" - } - } - }, - { - "evaluation_name": "Miscellaneous", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - 
"evaluation_description": "EM on Miscellaneous", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.861, - "details": { - "description": "min=0.861, mean=0.861, max=0.861, sum=1.722 (2)", - "tab": "Accuracy", - "Miscellaneous - Observed inference time (s)": { - "description": "min=0.849, mean=0.849, max=0.849, sum=1.698 (2)", - "tab": "Efficiency", - "score": 0.8490832494440967 - }, - "Miscellaneous - # eval": { - "description": "min=783, mean=783, max=783, sum=1566 (2)", - "tab": "General information", - "score": 783.0 - }, - "Miscellaneous - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Miscellaneous - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Miscellaneous - # prompt tokens": { - "description": "min=307.041, mean=307.041, max=307.041, sum=614.082 (2)", - "tab": "General information", - "score": 307.04086845466156 - }, - "Miscellaneous - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" - } - } - }, - { - "evaluation_name": "Moral Scenarios", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Moral Scenarios", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.381, - "details": { - "description": "min=0.381, mean=0.381, max=0.381, sum=0.762 (2)", - "tab": "Accuracy", - "Moral Disputes - Observed inference time (s)": { - "description": "min=0.808, mean=0.808, max=0.808, sum=1.615 (2)", - "tab": "Efficiency", - "score": 0.8076560903835848 - }, - "Moral Scenarios - Observed inference time (s)": { - "description": "min=0.868, mean=0.868, max=0.868, sum=1.735 (2)", - "tab": "Efficiency", - "score": 0.8676496551023515 - }, - "Moral Disputes - # eval": { - "description": "min=346, mean=346, max=346, sum=692 (2)", - "tab": "General information", - "score": 346.0 - }, - "Moral Disputes - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Disputes - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Disputes - # prompt tokens": { - "description": "min=480.821, mean=480.821, max=480.821, sum=961.642 (2)", - "tab": "General information", - "score": 480.8208092485549 - }, - "Moral Disputes - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Moral Scenarios - # eval": { - "description": "min=895, mean=895, max=895, sum=1790 (2)", - "tab": "General information", - "score": 895.0 - }, - "Moral Scenarios - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Scenarios - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # prompt tokens": { - 
"description": "min=672.035, mean=672.035, max=672.035, sum=1344.069 (2)", - "tab": "General information", - "score": 672.0346368715084 - }, - "Moral Scenarios - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" - } - } - }, - { - "evaluation_name": "Nutrition", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Nutrition", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.709, - "details": { - "description": "min=0.709, mean=0.709, max=0.709, sum=1.418 (2)", - "tab": "Accuracy", - "Nutrition - Observed inference time (s)": { - "description": "min=0.903, mean=0.903, max=0.903, sum=1.807 (2)", - "tab": "Efficiency", - "score": 0.9033067834143546 - }, - "Nutrition - # eval": { - "description": "min=306, mean=306, max=306, sum=612 (2)", - "tab": "General information", - "score": 306.0 - }, - "Nutrition - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Nutrition - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Nutrition - # prompt tokens": { - "description": "min=590.154, mean=590.154, max=590.154, sum=1180.307 (2)", - "tab": "General information", - "score": 590.1535947712418 - }, - "Nutrition - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" - } - } - }, - { - "evaluation_name": "Prehistory", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Prehistory", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.765, - "details": { - "description": "min=0.765, mean=0.765, max=0.765, sum=1.531 (2)", - "tab": "Accuracy", - "Prehistory - Observed inference time (s)": { - "description": "min=0.849, mean=0.849, max=0.849, sum=1.698 (2)", - "tab": "Efficiency", - "score": 0.8491357167561849 - }, - "Prehistory - # eval": { - "description": "min=324, mean=324, max=324, sum=648 (2)", - "tab": "General information", - "score": 324.0 - }, - "Prehistory - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Prehistory - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Prehistory - # prompt tokens": { - "description": "min=526.04, mean=526.04, max=526.04, sum=1052.08 (2)", - "tab": "General information", - "score": 526.0401234567901 - }, - "Prehistory - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": 
"General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" - } - } - }, - { - "evaluation_name": "Public Relations", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Public Relations", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.718, - "details": { - "description": "min=0.718, mean=0.718, max=0.718, sum=1.436 (2)", - "tab": "Accuracy", - "Public Relations - Observed inference time (s)": { - "description": "min=1.141, mean=1.141, max=1.141, sum=2.281 (2)", - "tab": "Efficiency", - "score": 1.1407060449773616 - }, - "Public Relations - # eval": { - "description": "min=110, mean=110, max=110, sum=220 (2)", - "tab": "General information", - "score": 110.0 - }, - "Public Relations - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Public Relations - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Public Relations - # prompt tokens": { - "description": "min=409.045, mean=409.045, max=409.045, sum=818.091 (2)", - "tab": "General information", - "score": 409.04545454545456 - }, - "Public Relations - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" - } - } - }, - { - "evaluation_name": "Security Studies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Security Studies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.771, - "details": { - "description": "min=0.771, mean=0.771, max=0.771, sum=1.543 (2)", - "tab": "Accuracy", - "Security Studies - Observed inference time (s)": { - "description": "min=0.958, mean=0.958, max=0.958, sum=1.915 (2)", - "tab": "Efficiency", - "score": 0.9576426525505222 - }, - "Security Studies - # eval": { - "description": "min=245, mean=245, max=245, sum=490 (2)", - "tab": "General information", - "score": 245.0 - }, - "Security Studies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Security Studies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Security Studies - # prompt tokens": { - "description": "min=1186.502, mean=1186.502, max=1186.502, sum=2373.004 (2)", - "tab": "General information", - "score": 1186.5020408163266 - }, - "Security Studies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": 
"security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" - } - } - }, - { - "evaluation_name": "Sociology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Sociology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.726, - "details": { - "description": "min=0.726, mean=0.726, max=0.726, sum=1.453 (2)", - "tab": "Accuracy", - "Sociology - Observed inference time (s)": { - "description": "min=0.781, mean=0.781, max=0.781, sum=1.562 (2)", - "tab": "Efficiency", - "score": 0.781044238835425 - }, - "Sociology - # eval": { - "description": "min=201, mean=201, max=201, sum=402 (2)", - "tab": "General information", - "score": 201.0 - }, - "Sociology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Sociology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Sociology - # prompt tokens": { - "description": "min=455.348, mean=455.348, max=455.348, sum=910.697 (2)", - "tab": "General information", - "score": 455.3482587064677 - }, - "Sociology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" - } - } - }, - { - "evaluation_name": "Virology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Virology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.56, - "details": { - "description": "min=0.56, mean=0.56, max=0.56, sum=1.12 (2)", - "tab": "Accuracy", - "Virology - Observed inference time (s)": { - "description": "min=1.059, mean=1.059, max=1.059, sum=2.118 (2)", - "tab": "Efficiency", - "score": 1.0589684750660355 - }, - "Virology - # eval": { - "description": "min=166, mean=166, max=166, sum=332 (2)", - "tab": "General information", - "score": 166.0 - }, - "Virology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Virology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Virology - # prompt tokens": { - "description": "min=336.127, mean=336.127, max=336.127, sum=672.253 (2)", - "tab": "General information", - "score": 336.1265060240964 - }, - "Virology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" - } - } - }, - { - "evaluation_name": "World Religions", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on World Religions", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.789, - "details": { - "description": "min=0.789, mean=0.789, max=0.789, sum=1.579 (2)", - "tab": "Accuracy", - "World Religions - Observed inference time (s)": { - "description": "min=0.891, mean=0.891, max=0.891, sum=1.781 (2)", - "tab": "Efficiency", - "score": 0.8906254336150766 - }, - "World Religions - # eval": { - "description": "min=171, mean=171, max=171, sum=342 (2)", - "tab": "General information", - "score": 171.0 - }, - "World Religions - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "World Religions - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "World Religions - # prompt tokens": { - "description": "min=275.181, mean=275.181, max=275.181, sum=550.363 (2)", - "tab": "General information", - "score": 275.1812865497076 - }, - "World Religions - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" - } - } - }, - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.215, - "details": { - "tab": "Efficiency" - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_mmlu/openai/gpt-3.5-turbo-0125/e9a41d4b-56c7-47f0-b439-72ad1e463000.json b/data/helm_mmlu/openai/gpt-3.5-turbo-0125/e9a41d4b-56c7-47f0-b439-72ad1e463000.json deleted file mode 100644 index e429d6dbc..000000000 --- a/data/helm_mmlu/openai/gpt-3.5-turbo-0125/e9a41d4b-56c7-47f0-b439-72ad1e463000.json +++ /dev/null @@ -1,3021 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/openai_gpt-3.5-turbo-0125/1770835937.459157", - "retrieved_timestamp": "1770835937.459157", - "source_metadata": { - "source_name": "helm_mmlu", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "GPT-3.5 Turbo 0125", - "id": "openai/gpt-3.5-turbo-0125", - "developer": "openai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "MMLU All Subjects", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU All Subjects", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 
0.673, - "details": { - "description": "min=0.307, mean=0.673, max=0.922, sum=76.686 (114)", - "tab": "Accuracy", - "MMLU All Subjects - Observed inference time (s)": { - "description": "min=0.396, mean=0.476, max=1.242, sum=54.283 (114)", - "tab": "Efficiency", - "score": 0.4761648045252673 - }, - "MMLU All Subjects - # eval": { - "description": "min=100, mean=246.351, max=1534, sum=28084 (114)", - "tab": "General information", - "score": 246.35087719298247 - }, - "MMLU All Subjects - # train": { - "description": "min=5, mean=5, max=5, sum=570 (114)", - "tab": "General information", - "score": 5.0 - }, - "MMLU All Subjects - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (114)", - "tab": "General information", - "score": 0.0 - }, - "MMLU All Subjects - # prompt tokens": { - "description": "min=275.561, mean=614.852, max=2798.073, sum=70093.086 (114)", - "tab": "General information", - "score": 614.851634217556 - }, - "MMLU All Subjects - # output tokens": { - "description": "min=1, mean=1, max=1, sum=114 (114)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - 
"mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] - } - } - }, - { - "evaluation_name": "Abstract Algebra", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Abstract Algebra", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.31, - "details": { - "description": "min=0.31, mean=0.31, max=0.31, sum=0.62 (2)", - "tab": "Accuracy", - "Abstract Algebra - Observed inference time (s)": { - "description": "min=0.47, mean=0.47, max=0.47, sum=0.94 (2)", - "tab": "Efficiency", - "score": 0.4701289844512939 - }, - "Abstract Algebra - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Abstract Algebra - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Abstract Algebra - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Abstract Algebra - # prompt tokens": { - "description": "min=373.44, mean=373.44, max=373.44, sum=746.88 (2)", - "tab": "General information", - "score": 373.44 - }, - "Abstract Algebra - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" - } - } - }, - { - "evaluation_name": "Anatomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Anatomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.696, - "details": { - "description": "min=0.696, mean=0.696, max=0.696, sum=1.393 (2)", - "tab": "Accuracy", - "Anatomy - Observed inference time (s)": { - "description": "min=0.422, mean=0.422, max=0.422, sum=0.844 (2)", - "tab": "Efficiency", - "score": 0.42177006050392435 - }, - "Anatomy - # eval": { - "description": "min=135, mean=135, max=135, sum=270 (2)", - "tab": "General information", - "score": 135.0 - }, - "Anatomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Anatomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Anatomy - # prompt tokens": { - "description": "min=353.978, mean=353.978, 
max=353.978, sum=707.956 (2)", - "tab": "General information", - "score": 353.97777777777776 - }, - "Anatomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" - } - } - }, - { - "evaluation_name": "College Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on College Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.471, - "details": { - "description": "min=0.471, mean=0.471, max=0.471, sum=0.941 (2)", - "tab": "Accuracy", - "College Chemistry - Observed inference time (s)": { - "description": "min=0.428, mean=0.428, max=0.428, sum=0.856 (2)", - "tab": "Efficiency", - "score": 0.42796642541885377 - }, - "College Biology - Observed inference time (s)": { - "description": "min=0.474, mean=0.474, max=0.474, sum=0.949 (2)", - "tab": "Efficiency", - "score": 0.47431788014041054 - }, - "College Computer Science - Observed inference time (s)": { - "description": "min=0.52, mean=0.52, max=0.52, sum=1.04 (2)", - "tab": "Efficiency", - "score": 0.5200183248519897 - }, - "College Mathematics - Observed inference time (s)": { - "description": "min=0.448, mean=0.448, max=0.448, sum=0.897 (2)", - "tab": "Efficiency", - "score": 0.4484861779212952 - }, - "College Medicine - Observed inference time (s)": { - "description": "min=0.423, mean=0.423, max=0.423, sum=0.846 (2)", - "tab": "Efficiency", - "score": 0.4230213785447137 - }, - "College Physics - Observed inference time (s)": { - "description": "min=0.415, mean=0.415, max=0.415, sum=0.83 (2)", - "tab": "Efficiency", - "score": 0.4148852918662277 - }, - "College Chemistry - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Chemistry - # prompt tokens": { - "description": "min=549.4, mean=549.4, max=549.4, sum=1098.8 (2)", - "tab": "General information", - "score": 549.4 - }, - "College Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Biology - # eval": { - "description": "min=144, mean=144, max=144, sum=288 (2)", - "tab": "General information", - "score": 144.0 - }, - "College Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # prompt tokens": { - "description": "min=473.917, mean=473.917, max=473.917, sum=947.833 (2)", - "tab": "General information", - "score": 473.9166666666667 - }, - "College Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - 
"score": 1.0 - }, - "College Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer Science - # prompt tokens": { - "description": "min=828.39, mean=828.39, max=828.39, sum=1656.78 (2)", - "tab": "General information", - "score": 828.39 - }, - "College Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Mathematics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # prompt tokens": { - "description": "min=594.52, mean=594.52, max=594.52, sum=1189.04 (2)", - "tab": "General information", - "score": 594.52 - }, - "College Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Medicine - # eval": { - "description": "min=173, mean=173, max=173, sum=346 (2)", - "tab": "General information", - "score": 173.0 - }, - "College Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Medicine - # prompt tokens": { - "description": "min=502.728, mean=502.728, max=502.728, sum=1005.457 (2)", - "tab": "General information", - "score": 502.728323699422 - }, - "College Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Physics - # eval": { - "description": "min=102, mean=102, max=102, sum=204 (2)", - "tab": "General information", - "score": 102.0 - }, - "College Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Physics - # prompt tokens": { - "description": "min=503.608, mean=503.608, max=503.608, sum=1007.216 (2)", - "tab": "General information", - "score": 503.6078431372549 - }, - "College Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" - } - } - }, - { - "evaluation_name": "Computer Security", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - 
"evaluation_description": "EM on Computer Security", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.78, - "details": { - "description": "min=0.78, mean=0.78, max=0.78, sum=1.56 (2)", - "tab": "Accuracy", - "Computer Security - Observed inference time (s)": { - "description": "min=0.444, mean=0.444, max=0.444, sum=0.887 (2)", - "tab": "Efficiency", - "score": 0.44357073068618774 - }, - "Computer Security - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Computer Security - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Computer Security - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Computer Security - # prompt tokens": { - "description": "min=378.54, mean=378.54, max=378.54, sum=757.08 (2)", - "tab": "General information", - "score": 378.54 - }, - "Computer Security - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" - } - } - }, - { - "evaluation_name": "Econometrics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Econometrics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.474, - "details": { - "description": "min=0.474, mean=0.474, max=0.474, sum=0.947 (2)", - "tab": "Accuracy", - "Econometrics - Observed inference time (s)": { - "description": "min=0.418, mean=0.418, max=0.418, sum=0.836 (2)", - "tab": "Efficiency", - "score": 0.4179882564042744 - }, - "Econometrics - # eval": { - "description": "min=114, mean=114, max=114, sum=228 (2)", - "tab": "General information", - "score": 114.0 - }, - "Econometrics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Econometrics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Econometrics - # prompt tokens": { - "description": "min=614.43, mean=614.43, max=614.43, sum=1228.86 (2)", - "tab": "General information", - "score": 614.4298245614035 - }, - "Econometrics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" - } - } - }, - { - "evaluation_name": "Global Facts", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Global Facts", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 
0.39, - "details": { - "description": "min=0.39, mean=0.39, max=0.39, sum=0.78 (2)", - "tab": "Accuracy", - "Global Facts - Observed inference time (s)": { - "description": "min=0.432, mean=0.432, max=0.432, sum=0.863 (2)", - "tab": "Efficiency", - "score": 0.4315228652954102 - }, - "Global Facts - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Global Facts - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Global Facts - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Global Facts - # prompt tokens": { - "description": "min=399.71, mean=399.71, max=399.71, sum=799.42 (2)", - "tab": "General information", - "score": 399.71 - }, - "Global Facts - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" - } - } - }, - { - "evaluation_name": "Jurisprudence", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Jurisprudence", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.806, - "details": { - "description": "min=0.806, mean=0.806, max=0.806, sum=1.611 (2)", - "tab": "Accuracy", - "Jurisprudence - Observed inference time (s)": { - "description": "min=0.509, mean=0.509, max=0.509, sum=1.017 (2)", - "tab": "Efficiency", - "score": 0.5086877279811435 - }, - "Jurisprudence - # eval": { - "description": "min=108, mean=108, max=108, sum=216 (2)", - "tab": "General information", - "score": 108.0 - }, - "Jurisprudence - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Jurisprudence - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Jurisprudence - # prompt tokens": { - "description": "min=394.639, mean=394.639, max=394.639, sum=789.278 (2)", - "tab": "General information", - "score": 394.6388888888889 - }, - "Jurisprudence - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" - } - } - }, - { - "evaluation_name": "Philosophy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Philosophy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.746, - "details": { - "description": "min=0.746, mean=0.746, max=0.746, sum=1.492 (2)", - "tab": "Accuracy", - "Philosophy - Observed inference time (s)": { - "description": "min=0.472, mean=0.472, max=0.472, sum=0.944 
(2)", - "tab": "Efficiency", - "score": 0.4717828660149283 - }, - "Philosophy - # eval": { - "description": "min=311, mean=311, max=311, sum=622 (2)", - "tab": "General information", - "score": 311.0 - }, - "Philosophy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Philosophy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Philosophy - # prompt tokens": { - "description": "min=329.084, mean=329.084, max=329.084, sum=658.167 (2)", - "tab": "General information", - "score": 329.08360128617363 - }, - "Philosophy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" - } - } - }, - { - "evaluation_name": "Professional Psychology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Professional Psychology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.722, - "details": { - "description": "min=0.722, mean=0.722, max=0.722, sum=1.444 (2)", - "tab": "Accuracy", - "Professional Medicine - Observed inference time (s)": { - "description": "min=0.485, mean=0.485, max=0.485, sum=0.971 (2)", - "tab": "Efficiency", - "score": 0.4853776947540395 - }, - "Professional Accounting - Observed inference time (s)": { - "description": "min=0.423, mean=0.423, max=0.423, sum=0.846 (2)", - "tab": "Efficiency", - "score": 0.42316425692105125 - }, - "Professional Law - Observed inference time (s)": { - "description": "min=0.442, mean=0.442, max=0.442, sum=0.883 (2)", - "tab": "Efficiency", - "score": 0.4417385995932011 - }, - "Professional Psychology - Observed inference time (s)": { - "description": "min=0.422, mean=0.422, max=0.422, sum=0.843 (2)", - "tab": "Efficiency", - "score": 0.42156751132478903 - }, - "Professional Medicine - # eval": { - "description": "min=272, mean=272, max=272, sum=544 (2)", - "tab": "General information", - "score": 272.0 - }, - "Professional Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Medicine - # prompt tokens": { - "description": "min=1094.585, mean=1094.585, max=1094.585, sum=2189.169 (2)", - "tab": "General information", - "score": 1094.5845588235295 - }, - "Professional Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Accounting - # eval": { - "description": "min=282, mean=282, max=282, sum=564 (2)", - "tab": "General information", - "score": 282.0 - }, - "Professional Accounting - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Accounting - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - 
"Professional Accounting - # prompt tokens": { - "description": "min=658.592, mean=658.592, max=658.592, sum=1317.184 (2)", - "tab": "General information", - "score": 658.5921985815603 - }, - "Professional Accounting - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Law - # eval": { - "description": "min=1534, mean=1534, max=1534, sum=3068 (2)", - "tab": "General information", - "score": 1534.0 - }, - "Professional Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # prompt tokens": { - "description": "min=1637.787, mean=1637.787, max=1637.787, sum=3275.574 (2)", - "tab": "General information", - "score": 1637.7868318122555 - }, - "Professional Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Psychology - # eval": { - "description": "min=612, mean=612, max=612, sum=1224 (2)", - "tab": "General information", - "score": 612.0 - }, - "Professional Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # prompt tokens": { - "description": "min=575.114, mean=575.114, max=575.114, sum=1150.229 (2)", - "tab": "General information", - "score": 575.1143790849674 - }, - "Professional Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" - } - } - }, - { - "evaluation_name": "Us Foreign Policy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Us Foreign Policy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.89, - "details": { - "description": "min=0.89, mean=0.89, max=0.89, sum=1.78 (2)", - "tab": "Accuracy", - "Us Foreign Policy - Observed inference time (s)": { - "description": "min=0.456, mean=0.456, max=0.456, sum=0.911 (2)", - "tab": "Efficiency", - "score": 0.4557087206840515 - }, - "Us Foreign Policy - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Us Foreign Policy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Us Foreign Policy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Us Foreign Policy - # prompt tokens": { - "description": "min=422.79, mean=422.79, max=422.79, sum=845.58 (2)", - "tab": "General information", - "score": 422.79 - }, - "Us Foreign Policy - # output tokens": { - "description": "min=1, mean=1, max=1, 
sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" - } - } - }, - { - "evaluation_name": "Astronomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Astronomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.75, - "details": { - "description": "min=0.75, mean=0.75, max=0.75, sum=1.5 (2)", - "tab": "Accuracy", - "Astronomy - Observed inference time (s)": { - "description": "min=0.421, mean=0.421, max=0.421, sum=0.842 (2)", - "tab": "Efficiency", - "score": 0.42091869994213704 - }, - "Astronomy - # eval": { - "description": "min=152, mean=152, max=152, sum=304 (2)", - "tab": "General information", - "score": 152.0 - }, - "Astronomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Astronomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Astronomy - # prompt tokens": { - "description": "min=579.691, mean=579.691, max=579.691, sum=1159.382 (2)", - "tab": "General information", - "score": 579.6907894736842 - }, - "Astronomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" - } - } - }, - { - "evaluation_name": "Business Ethics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Business Ethics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.75, - "details": { - "description": "min=0.75, mean=0.75, max=0.75, sum=1.5 (2)", - "tab": "Accuracy", - "Business Ethics - Observed inference time (s)": { - "description": "min=0.453, mean=0.453, max=0.453, sum=0.906 (2)", - "tab": "Efficiency", - "score": 0.4530529642105103 - }, - "Business Ethics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Business Ethics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Business Ethics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Business Ethics - # prompt tokens": { - "description": "min=569.52, mean=569.52, max=569.52, sum=1139.04 (2)", - "tab": "General information", - "score": 569.52 - }, - "Business Ethics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - 
"groups": "mmlu_business_ethics" - } - } - }, - { - "evaluation_name": "Clinical Knowledge", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Clinical Knowledge", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.755, - "details": { - "description": "min=0.755, mean=0.755, max=0.755, sum=1.509 (2)", - "tab": "Accuracy", - "Clinical Knowledge - Observed inference time (s)": { - "description": "min=0.418, mean=0.418, max=0.418, sum=0.837 (2)", - "tab": "Efficiency", - "score": 0.41833644812961795 - }, - "Clinical Knowledge - # eval": { - "description": "min=265, mean=265, max=265, sum=530 (2)", - "tab": "General information", - "score": 265.0 - }, - "Clinical Knowledge - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Clinical Knowledge - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Clinical Knowledge - # prompt tokens": { - "description": "min=397.947, mean=397.947, max=397.947, sum=795.894 (2)", - "tab": "General information", - "score": 397.94716981132075 - }, - "Clinical Knowledge - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" - } - } - }, - { - "evaluation_name": "Conceptual Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Conceptual Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.634, - "details": { - "description": "min=0.634, mean=0.634, max=0.634, sum=1.268 (2)", - "tab": "Accuracy", - "Conceptual Physics - Observed inference time (s)": { - "description": "min=0.808, mean=0.808, max=0.808, sum=1.616 (2)", - "tab": "Efficiency", - "score": 0.8081990150695152 - }, - "Conceptual Physics - # eval": { - "description": "min=235, mean=235, max=235, sum=470 (2)", - "tab": "General information", - "score": 235.0 - }, - "Conceptual Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Conceptual Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Conceptual Physics - # prompt tokens": { - "description": "min=304.838, mean=304.838, max=304.838, sum=609.677 (2)", - "tab": "General information", - "score": 304.83829787234043 - }, - "Conceptual Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" - } - } - }, - { - "evaluation_name": "Electrical 
Engineering", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Electrical Engineering", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.669, - "details": { - "description": "min=0.669, mean=0.669, max=0.669, sum=1.338 (2)", - "tab": "Accuracy", - "Electrical Engineering - Observed inference time (s)": { - "description": "min=1.242, mean=1.242, max=1.242, sum=2.485 (2)", - "tab": "Efficiency", - "score": 1.2423763686213 - }, - "Electrical Engineering - # eval": { - "description": "min=145, mean=145, max=145, sum=290 (2)", - "tab": "General information", - "score": 145.0 - }, - "Electrical Engineering - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Electrical Engineering - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Electrical Engineering - # prompt tokens": { - "description": "min=440.641, mean=440.641, max=440.641, sum=881.283 (2)", - "tab": "General information", - "score": 440.6413793103448 - }, - "Electrical Engineering - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" - } - } - }, - { - "evaluation_name": "Elementary Mathematics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Elementary Mathematics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.534, - "details": { - "description": "min=0.534, mean=0.534, max=0.534, sum=1.069 (2)", - "tab": "Accuracy", - "Elementary Mathematics - Observed inference time (s)": { - "description": "min=0.436, mean=0.436, max=0.436, sum=0.872 (2)", - "tab": "Efficiency", - "score": 0.4359189442225865 - }, - "Elementary Mathematics - # eval": { - "description": "min=378, mean=378, max=378, sum=756 (2)", - "tab": "General information", - "score": 378.0 - }, - "Elementary Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Elementary Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Elementary Mathematics - # prompt tokens": { - "description": "min=531.862, mean=531.862, max=531.862, sum=1063.725 (2)", - "tab": "General information", - "score": 531.8624338624338 - }, - "Elementary Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" - } - } - }, - { - "evaluation_name": "Formal Logic", - 
"source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Formal Logic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.444, - "details": { - "description": "min=0.444, mean=0.444, max=0.444, sum=0.889 (2)", - "tab": "Accuracy", - "Formal Logic - Observed inference time (s)": { - "description": "min=0.431, mean=0.431, max=0.431, sum=0.861 (2)", - "tab": "Efficiency", - "score": 0.43056895051683697 - }, - "Formal Logic - # eval": { - "description": "min=126, mean=126, max=126, sum=252 (2)", - "tab": "General information", - "score": 126.0 - }, - "Formal Logic - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Formal Logic - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Formal Logic - # prompt tokens": { - "description": "min=606.762, mean=606.762, max=606.762, sum=1213.524 (2)", - "tab": "General information", - "score": 606.7619047619048 - }, - "Formal Logic - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" - } - } - }, - { - "evaluation_name": "High School World History", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on High School World History", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.819, - "details": { - "description": "min=0.819, mean=0.819, max=0.819, sum=1.637 (2)", - "tab": "Accuracy", - "High School Biology - Observed inference time (s)": { - "description": "min=0.572, mean=0.572, max=0.572, sum=1.143 (2)", - "tab": "Efficiency", - "score": 0.5715394450772193 - }, - "High School Chemistry - Observed inference time (s)": { - "description": "min=0.491, mean=0.491, max=0.491, sum=0.981 (2)", - "tab": "Efficiency", - "score": 0.49073645046779085 - }, - "High School Computer Science - Observed inference time (s)": { - "description": "min=0.433, mean=0.433, max=0.433, sum=0.865 (2)", - "tab": "Efficiency", - "score": 0.43273836851119996 - }, - "High School European History - Observed inference time (s)": { - "description": "min=0.489, mean=0.489, max=0.489, sum=0.977 (2)", - "tab": "Efficiency", - "score": 0.48863930413217255 - }, - "High School Geography - Observed inference time (s)": { - "description": "min=0.436, mean=0.436, max=0.436, sum=0.872 (2)", - "tab": "Efficiency", - "score": 0.4360258868246367 - }, - "High School Government And Politics - Observed inference time (s)": { - "description": "min=0.484, mean=0.484, max=0.484, sum=0.967 (2)", - "tab": "Efficiency", - "score": 0.4836950153884492 - }, - "High School Macroeconomics - Observed inference time (s)": { - "description": "min=0.422, mean=0.422, max=0.422, sum=0.843 (2)", - "tab": "Efficiency", - "score": 0.4215013412328867 - 
}, - "High School Mathematics - Observed inference time (s)": { - "description": "min=0.49, mean=0.49, max=0.49, sum=0.979 (2)", - "tab": "Efficiency", - "score": 0.48968876291204383 - }, - "High School Microeconomics - Observed inference time (s)": { - "description": "min=0.432, mean=0.432, max=0.432, sum=0.864 (2)", - "tab": "Efficiency", - "score": 0.4320918882594389 - }, - "High School Physics - Observed inference time (s)": { - "description": "min=0.466, mean=0.466, max=0.466, sum=0.932 (2)", - "tab": "Efficiency", - "score": 0.4659363955061957 - }, - "High School Psychology - Observed inference time (s)": { - "description": "min=0.443, mean=0.443, max=0.443, sum=0.887 (2)", - "tab": "Efficiency", - "score": 0.4434620769745713 - }, - "High School Statistics - Observed inference time (s)": { - "description": "min=0.431, mean=0.431, max=0.431, sum=0.862 (2)", - "tab": "Efficiency", - "score": 0.43081507749027675 - }, - "High School US History - Observed inference time (s)": { - "description": "min=0.486, mean=0.486, max=0.486, sum=0.971 (2)", - "tab": "Efficiency", - "score": 0.4857361819229874 - }, - "High School World History - Observed inference time (s)": { - "description": "min=0.441, mean=0.441, max=0.441, sum=0.882 (2)", - "tab": "Efficiency", - "score": 0.44100493620216596 - }, - "High School Biology - # eval": { - "description": "min=310, mean=310, max=310, sum=620 (2)", - "tab": "General information", - "score": 310.0 - }, - "High School Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Biology - # prompt tokens": { - "description": "min=513.677, mean=513.677, max=513.677, sum=1027.355 (2)", - "tab": "General information", - "score": 513.6774193548387 - }, - "High School Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Chemistry - # eval": { - "description": "min=203, mean=203, max=203, sum=406 (2)", - "tab": "General information", - "score": 203.0 - }, - "High School Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # prompt tokens": { - "description": "min=496.714, mean=496.714, max=496.714, sum=993.429 (2)", - "tab": "General information", - "score": 496.7142857142857 - }, - "High School Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "High School Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # prompt tokens": { - "description": "min=867.78, mean=867.78, max=867.78, sum=1735.56 (2)", - "tab": "General information", - "score": 867.78 - }, - "High School Computer Science - # output 
tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School European History - # eval": { - "description": "min=165, mean=165, max=165, sum=330 (2)", - "tab": "General information", - "score": 165.0 - }, - "High School European History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School European History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School European History - # prompt tokens": { - "description": "min=2798.073, mean=2798.073, max=2798.073, sum=5596.145 (2)", - "tab": "General information", - "score": 2798.072727272727 - }, - "High School European History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Geography - # eval": { - "description": "min=198, mean=198, max=198, sum=396 (2)", - "tab": "General information", - "score": 198.0 - }, - "High School Geography - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Geography - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Geography - # prompt tokens": { - "description": "min=372.045, mean=372.045, max=372.045, sum=744.091 (2)", - "tab": "General information", - "score": 372.04545454545456 - }, - "High School Geography - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Government And Politics - # eval": { - "description": "min=193, mean=193, max=193, sum=386 (2)", - "tab": "General information", - "score": 193.0 - }, - "High School Government And Politics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Government And Politics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # prompt tokens": { - "description": "min=465.824, mean=465.824, max=465.824, sum=931.648 (2)", - "tab": "General information", - "score": 465.8238341968912 - }, - "High School Government And Politics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Macroeconomics - # eval": { - "description": "min=390, mean=390, max=390, sum=780 (2)", - "tab": "General information", - "score": 390.0 - }, - "High School Macroeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Macroeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - # prompt tokens": { - "description": "min=371.562, mean=371.562, max=371.562, sum=743.123 (2)", - "tab": "General information", - "score": 371.5615384615385 - }, - "High School Macroeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Mathematics - # eval": { - "description": "min=270, mean=270, max=270, sum=540 (2)", - "tab": "General information", - "score": 270.0 - }, - "High School Mathematics 
- # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # prompt tokens": { - "description": "min=532.374, mean=532.374, max=532.374, sum=1064.748 (2)", - "tab": "General information", - "score": 532.3740740740741 - }, - "High School Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Microeconomics - # eval": { - "description": "min=238, mean=238, max=238, sum=476 (2)", - "tab": "General information", - "score": 238.0 - }, - "High School Microeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Microeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Microeconomics - # prompt tokens": { - "description": "min=399.025, mean=399.025, max=399.025, sum=798.05 (2)", - "tab": "General information", - "score": 399.02521008403363 - }, - "High School Microeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Physics - # eval": { - "description": "min=151, mean=151, max=151, sum=302 (2)", - "tab": "General information", - "score": 151.0 - }, - "High School Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # prompt tokens": { - "description": "min=560.464, mean=560.464, max=560.464, sum=1120.927 (2)", - "tab": "General information", - "score": 560.4635761589404 - }, - "High School Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Psychology - # eval": { - "description": "min=545, mean=545, max=545, sum=1090 (2)", - "tab": "General information", - "score": 545.0 - }, - "High School Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # prompt tokens": { - "description": "min=495.246, mean=495.246, max=495.246, sum=990.492 (2)", - "tab": "General information", - "score": 495.24587155963303 - }, - "High School Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Statistics - # eval": { - "description": "min=216, mean=216, max=216, sum=432 (2)", - "tab": "General information", - "score": 216.0 - }, - "High School Statistics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Statistics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Statistics - # prompt tokens": { - "description": "min=795.699, mean=795.699, max=795.699, sum=1591.398 (2)", - "tab": 
"General information", - "score": 795.699074074074 - }, - "High School Statistics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School US History - # eval": { - "description": "min=204, mean=204, max=204, sum=408 (2)", - "tab": "General information", - "score": 204.0 - }, - "High School US History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School US History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # prompt tokens": { - "description": "min=2217.809, mean=2217.809, max=2217.809, sum=4435.618 (2)", - "tab": "General information", - "score": 2217.8088235294117 - }, - "High School US History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School World History - # eval": { - "description": "min=237, mean=237, max=237, sum=474 (2)", - "tab": "General information", - "score": 237.0 - }, - "High School World History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School World History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School World History - # prompt tokens": { - "description": "min=1428.27, mean=1428.27, max=1428.27, sum=2856.54 (2)", - "tab": "General information", - "score": 1428.2700421940929 - }, - "High School World History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" - } - } - }, - { - "evaluation_name": "Human Sexuality", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Human Sexuality", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.779, - "details": { - "description": "min=0.779, mean=0.779, max=0.779, sum=1.557 (2)", - "tab": "Accuracy", - "Human Aging - Observed inference time (s)": { - "description": "min=0.423, mean=0.423, max=0.423, sum=0.846 (2)", - "tab": "Efficiency", - "score": 0.42309954027423946 - }, - "Human Sexuality - Observed inference time (s)": { - "description": "min=0.417, mean=0.417, max=0.417, sum=0.833 (2)", - "tab": "Efficiency", - "score": 0.4166541681944869 - }, - "Human Aging - # eval": { - "description": "min=223, mean=223, max=223, sum=446 (2)", - "tab": "General information", - "score": 223.0 - }, - "Human Aging - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Aging - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Aging - # prompt tokens": { - "description": "min=319.906, mean=319.906, max=319.906, sum=639.812 (2)", - "tab": "General information", - "score": 319.90582959641256 - }, - "Human 
Aging - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Human Sexuality - # eval": { - "description": "min=131, mean=131, max=131, sum=262 (2)", - "tab": "General information", - "score": 131.0 - }, - "Human Sexuality - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Sexuality - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # prompt tokens": { - "description": "min=341.183, mean=341.183, max=341.183, sum=682.366 (2)", - "tab": "General information", - "score": 341.1832061068702 - }, - "Human Sexuality - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" - } - } - }, - { - "evaluation_name": "International Law", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on International Law", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.81, - "details": { - "description": "min=0.81, mean=0.81, max=0.81, sum=1.62 (2)", - "tab": "Accuracy", - "International Law - Observed inference time (s)": { - "description": "min=0.525, mean=0.525, max=0.525, sum=1.05 (2)", - "tab": "Efficiency", - "score": 0.5249163257189033 - }, - "International Law - # eval": { - "description": "min=121, mean=121, max=121, sum=242 (2)", - "tab": "General information", - "score": 121.0 - }, - "International Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "International Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "International Law - # prompt tokens": { - "description": "min=639.851, mean=639.851, max=639.851, sum=1279.702 (2)", - "tab": "General information", - "score": 639.8512396694215 - }, - "International Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" - } - } - }, - { - "evaluation_name": "Logical Fallacies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Logical Fallacies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.779, - "details": { - "description": "min=0.779, mean=0.779, max=0.779, sum=1.558 (2)", - "tab": "Accuracy", - "Logical Fallacies - Observed inference time (s)": { - "description": "min=0.504, mean=0.504, max=0.504, sum=1.008 (2)", - "tab": "Efficiency", - "score": 
0.5038382904661214 - }, - "Logical Fallacies - # eval": { - "description": "min=163, mean=163, max=163, sum=326 (2)", - "tab": "General information", - "score": 163.0 - }, - "Logical Fallacies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Logical Fallacies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Logical Fallacies - # prompt tokens": { - "description": "min=449.595, mean=449.595, max=449.595, sum=899.19 (2)", - "tab": "General information", - "score": 449.5950920245399 - }, - "Logical Fallacies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" - } - } - }, - { - "evaluation_name": "Machine Learning", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Machine Learning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.455, - "details": { - "description": "min=0.455, mean=0.455, max=0.455, sum=0.911 (2)", - "tab": "Accuracy", - "Machine Learning - Observed inference time (s)": { - "description": "min=0.437, mean=0.437, max=0.437, sum=0.875 (2)", - "tab": "Efficiency", - "score": 0.4374160830463682 - }, - "Machine Learning - # eval": { - "description": "min=112, mean=112, max=112, sum=224 (2)", - "tab": "General information", - "score": 112.0 - }, - "Machine Learning - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Machine Learning - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Machine Learning - # prompt tokens": { - "description": "min=668.054, mean=668.054, max=668.054, sum=1336.107 (2)", - "tab": "General information", - "score": 668.0535714285714 - }, - "Machine Learning - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" - } - } - }, - { - "evaluation_name": "Management", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Management", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.835, - "details": { - "description": "min=0.835, mean=0.835, max=0.835, sum=1.67 (2)", - "tab": "Accuracy", - "Management - Observed inference time (s)": { - "description": "min=0.458, mean=0.458, max=0.458, sum=0.917 (2)", - "tab": "Efficiency", - "score": 0.4584047493425388 - }, - "Management - # eval": { - "description": "min=103, mean=103, max=103, sum=206 (2)", - "tab": "General 
information", - "score": 103.0 - }, - "Management - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Management - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Management - # prompt tokens": { - "description": "min=283.796, mean=283.796, max=283.796, sum=567.592 (2)", - "tab": "General information", - "score": 283.79611650485435 - }, - "Management - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" - } - } - }, - { - "evaluation_name": "Marketing", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Marketing", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.91, - "details": { - "description": "min=0.91, mean=0.91, max=0.91, sum=1.821 (2)", - "tab": "Accuracy", - "Marketing - Observed inference time (s)": { - "description": "min=0.421, mean=0.421, max=0.421, sum=0.842 (2)", - "tab": "Efficiency", - "score": 0.4209032700611995 - }, - "Marketing - # eval": { - "description": "min=234, mean=234, max=234, sum=468 (2)", - "tab": "General information", - "score": 234.0 - }, - "Marketing - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Marketing - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Marketing - # prompt tokens": { - "description": "min=404.218, mean=404.218, max=404.218, sum=808.436 (2)", - "tab": "General information", - "score": 404.21794871794873 - }, - "Marketing - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" - } - } - }, - { - "evaluation_name": "Medical Genetics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Medical Genetics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.73, - "details": { - "description": "min=0.73, mean=0.73, max=0.73, sum=1.46 (2)", - "tab": "Accuracy", - "Medical Genetics - Observed inference time (s)": { - "description": "min=0.489, mean=0.489, max=0.489, sum=0.979 (2)", - "tab": "Efficiency", - "score": 0.48938191413879395 - }, - "Medical Genetics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Medical Genetics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Medical Genetics - truncated": { - "description": "min=0, 
mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Medical Genetics - # prompt tokens": { - "description": "min=341, mean=341, max=341, sum=682 (2)", - "tab": "General information", - "score": 341.0 - }, - "Medical Genetics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" - } - } - }, - { - "evaluation_name": "Miscellaneous", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Miscellaneous", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.89, - "details": { - "description": "min=0.89, mean=0.89, max=0.89, sum=1.78 (2)", - "tab": "Accuracy", - "Miscellaneous - Observed inference time (s)": { - "description": "min=0.421, mean=0.421, max=0.421, sum=0.841 (2)", - "tab": "Efficiency", - "score": 0.4205615121590528 - }, - "Miscellaneous - # eval": { - "description": "min=783, mean=783, max=783, sum=1566 (2)", - "tab": "General information", - "score": 783.0 - }, - "Miscellaneous - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Miscellaneous - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Miscellaneous - # prompt tokens": { - "description": "min=299.925, mean=299.925, max=299.925, sum=599.849 (2)", - "tab": "General information", - "score": 299.92464878671774 - }, - "Miscellaneous - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" - } - } - }, - { - "evaluation_name": "Moral Scenarios", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Moral Scenarios", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.355, - "details": { - "description": "min=0.355, mean=0.355, max=0.355, sum=0.711 (2)", - "tab": "Accuracy", - "Moral Disputes - Observed inference time (s)": { - "description": "min=0.439, mean=0.439, max=0.439, sum=0.878 (2)", - "tab": "Efficiency", - "score": 0.43890244423309505 - }, - "Moral Scenarios - Observed inference time (s)": { - "description": "min=0.422, mean=0.422, max=0.422, sum=0.843 (2)", - "tab": "Efficiency", - "score": 0.4216500338229387 - }, - "Moral Disputes - # eval": { - "description": "min=346, mean=346, max=346, sum=692 (2)", - "tab": "General information", - "score": 346.0 - }, - "Moral Disputes - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Disputes - truncated": { - "description": "min=0, mean=0, max=0, 
sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Disputes - # prompt tokens": { - "description": "min=476.145, mean=476.145, max=476.145, sum=952.289 (2)", - "tab": "General information", - "score": 476.1445086705202 - }, - "Moral Disputes - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Moral Scenarios - # eval": { - "description": "min=895, mean=895, max=895, sum=1790 (2)", - "tab": "General information", - "score": 895.0 - }, - "Moral Scenarios - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Scenarios - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # prompt tokens": { - "description": "min=656.455, mean=656.455, max=656.455, sum=1312.909 (2)", - "tab": "General information", - "score": 656.454748603352 - }, - "Moral Scenarios - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" - } - } - }, - { - "evaluation_name": "Nutrition", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Nutrition", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.748, - "details": { - "description": "min=0.748, mean=0.748, max=0.748, sum=1.497 (2)", - "tab": "Accuracy", - "Nutrition - Observed inference time (s)": { - "description": "min=0.438, mean=0.438, max=0.438, sum=0.876 (2)", - "tab": "Efficiency", - "score": 0.4378981278612723 - }, - "Nutrition - # eval": { - "description": "min=306, mean=306, max=306, sum=612 (2)", - "tab": "General information", - "score": 306.0 - }, - "Nutrition - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Nutrition - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Nutrition - # prompt tokens": { - "description": "min=586.817, mean=586.817, max=586.817, sum=1173.634 (2)", - "tab": "General information", - "score": 586.8169934640523 - }, - "Nutrition - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" - } - } - }, - { - "evaluation_name": "Prehistory", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Prehistory", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.735, - "details": { - "description": "min=0.735, mean=0.735, max=0.735, sum=1.469 (2)", - "tab": "Accuracy", - 
"Prehistory - Observed inference time (s)": { - "description": "min=0.462, mean=0.462, max=0.462, sum=0.924 (2)", - "tab": "Efficiency", - "score": 0.4620003163078685 - }, - "Prehistory - # eval": { - "description": "min=324, mean=324, max=324, sum=648 (2)", - "tab": "General information", - "score": 324.0 - }, - "Prehistory - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Prehistory - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Prehistory - # prompt tokens": { - "description": "min=514.559, mean=514.559, max=514.559, sum=1029.117 (2)", - "tab": "General information", - "score": 514.5586419753087 - }, - "Prehistory - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" - } - } - }, - { - "evaluation_name": "Public Relations", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Public Relations", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.727, - "details": { - "description": "min=0.727, mean=0.727, max=0.727, sum=1.455 (2)", - "tab": "Accuracy", - "Public Relations - Observed inference time (s)": { - "description": "min=0.685, mean=0.685, max=0.685, sum=1.371 (2)", - "tab": "Efficiency", - "score": 0.6854934020475908 - }, - "Public Relations - # eval": { - "description": "min=110, mean=110, max=110, sum=220 (2)", - "tab": "General information", - "score": 110.0 - }, - "Public Relations - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Public Relations - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Public Relations - # prompt tokens": { - "description": "min=405.318, mean=405.318, max=405.318, sum=810.636 (2)", - "tab": "General information", - "score": 405.3181818181818 - }, - "Public Relations - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" - } - } - }, - { - "evaluation_name": "Security Studies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Security Studies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.751, - "details": { - "description": "min=0.751, mean=0.751, max=0.751, sum=1.502 (2)", - "tab": "Accuracy", - "Security Studies - Observed inference time (s)": { - "description": "min=0.539, mean=0.539, max=0.539, sum=1.077 (2)", - "tab": "Efficiency", - "score": 0.5387308393205915 
- }, - "Security Studies - # eval": { - "description": "min=245, mean=245, max=245, sum=490 (2)", - "tab": "General information", - "score": 245.0 - }, - "Security Studies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Security Studies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Security Studies - # prompt tokens": { - "description": "min=1164.473, mean=1164.473, max=1164.473, sum=2328.947 (2)", - "tab": "General information", - "score": 1164.4734693877551 - }, - "Security Studies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" - } - } - }, - { - "evaluation_name": "Sociology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Sociology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.861, - "details": { - "description": "min=0.861, mean=0.861, max=0.861, sum=1.721 (2)", - "tab": "Accuracy", - "Sociology - Observed inference time (s)": { - "description": "min=0.428, mean=0.428, max=0.428, sum=0.856 (2)", - "tab": "Efficiency", - "score": 0.42779283025371495 - }, - "Sociology - # eval": { - "description": "min=201, mean=201, max=201, sum=402 (2)", - "tab": "General information", - "score": 201.0 - }, - "Sociology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Sociology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Sociology - # prompt tokens": { - "description": "min=445.522, mean=445.522, max=445.522, sum=891.045 (2)", - "tab": "General information", - "score": 445.5223880597015 - }, - "Sociology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" - } - } - }, - { - "evaluation_name": "Virology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Virology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.536, - "details": { - "description": "min=0.536, mean=0.536, max=0.536, sum=1.072 (2)", - "tab": "Accuracy", - "Virology - Observed inference time (s)": { - "description": "min=0.396, mean=0.396, max=0.396, sum=0.791 (2)", - "tab": "Efficiency", - "score": 0.39562296723744955 - }, - "Virology - # eval": { - "description": "min=166, mean=166, max=166, sum=332 (2)", - "tab": "General information", - "score": 166.0 - }, - "Virology - # train": { - "description": "min=5, mean=5, max=5, 
sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Virology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Virology - # prompt tokens": { - "description": "min=343.09, mean=343.09, max=343.09, sum=686.181 (2)", - "tab": "General information", - "score": 343.0903614457831 - }, - "Virology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" - } - } - }, - { - "evaluation_name": "World Religions", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on World Religions", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.842, - "details": { - "description": "min=0.842, mean=0.842, max=0.842, sum=1.684 (2)", - "tab": "Accuracy", - "World Religions - Observed inference time (s)": { - "description": "min=0.413, mean=0.413, max=0.413, sum=0.827 (2)", - "tab": "Efficiency", - "score": 0.41344076848169514 - }, - "World Religions - # eval": { - "description": "min=171, mean=171, max=171, sum=342 (2)", - "tab": "General information", - "score": 171.0 - }, - "World Religions - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "World Religions - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "World Religions - # prompt tokens": { - "description": "min=275.561, mean=275.561, max=275.561, sum=551.123 (2)", - "tab": "General information", - "score": 275.56140350877195 - }, - "World Religions - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" - } - } - }, - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.493, - "details": { - "tab": "Efficiency" - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_mmlu/openai/gpt-3.5-turbo-0613/a2b7c0ec-fb74-4698-80ad-f054039ecb3f.json b/data/helm_mmlu/openai/gpt-3.5-turbo-0613/a2b7c0ec-fb74-4698-80ad-f054039ecb3f.json deleted file mode 100644 index 92faf2169..000000000 --- a/data/helm_mmlu/openai/gpt-3.5-turbo-0613/a2b7c0ec-fb74-4698-80ad-f054039ecb3f.json +++ /dev/null @@ -1,3021 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/openai_gpt-3.5-turbo-0613/1770835937.459157", - "retrieved_timestamp": 
"1770835937.459157", - "source_metadata": { - "source_name": "helm_mmlu", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "GPT-3.5 Turbo 0613", - "id": "openai/gpt-3.5-turbo-0613", - "developer": "openai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "MMLU All Subjects", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU All Subjects", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.689, - "details": { - "description": "min=0.33, mean=0.689, max=0.922, sum=78.524 (114)", - "tab": "Accuracy", - "MMLU All Subjects - Observed inference time (s)": { - "description": "min=0.171, mean=0.411, max=0.659, sum=46.797 (114)", - "tab": "Efficiency", - "score": 0.41050392458578394 - }, - "MMLU All Subjects - # eval": { - "description": "min=100, mean=246.351, max=1534, sum=28084 (114)", - "tab": "General information", - "score": 246.35087719298247 - }, - "MMLU All Subjects - # train": { - "description": "min=5, mean=5, max=5, sum=570 (114)", - "tab": "General information", - "score": 5.0 - }, - "MMLU All Subjects - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (114)", - "tab": "General information", - "score": 0.0 - }, - "MMLU All Subjects - # prompt tokens": { - "description": "min=268.561, mean=607.852, max=2791.073, sum=69295.086 (114)", - "tab": "General information", - "score": 607.851634217556 - }, - "MMLU All Subjects - # output tokens": { - "description": "min=1, mean=1, max=1, sum=114 (114)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", 
- "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] - } - } - }, - { - "evaluation_name": "Abstract Algebra", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Abstract Algebra", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.38, - "details": { - "description": "min=0.38, mean=0.38, max=0.38, sum=0.76 (2)", - "tab": "Accuracy", - "Abstract Algebra - Observed inference time (s)": { - "description": "min=0.177, mean=0.177, max=0.177, sum=0.353 (2)", - "tab": "Efficiency", - "score": 0.17670444011688233 - }, - "Abstract Algebra - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Abstract Algebra - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Abstract Algebra - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Abstract Algebra - # prompt tokens": { - "description": "min=366.44, mean=366.44, max=366.44, sum=732.88 (2)", - "tab": "General information", - "score": 366.44 - }, - "Abstract Algebra - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" - } - } - }, - { - "evaluation_name": "Anatomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Anatomy", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.659, - "details": { - "description": "min=0.659, mean=0.659, max=0.659, sum=1.319 (2)", - "tab": "Accuracy", - "Anatomy - Observed inference time (s)": { - "description": "min=0.448, mean=0.448, max=0.448, sum=0.896 (2)", - "tab": "Efficiency", - "score": 0.448052688881203 - }, - "Anatomy - # eval": { - "description": "min=135, mean=135, max=135, sum=270 (2)", - "tab": "General information", - "score": 135.0 - }, - "Anatomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Anatomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Anatomy - # prompt tokens": { - "description": "min=346.978, mean=346.978, max=346.978, sum=693.956 (2)", - "tab": "General information", - "score": 346.97777777777776 - }, - "Anatomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" - } - } - }, - { - "evaluation_name": "College Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on College Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.461, - "details": { - "description": "min=0.461, mean=0.461, max=0.461, sum=0.922 (2)", - "tab": "Accuracy", - "College Chemistry - Observed inference time (s)": { - "description": "min=0.174, mean=0.174, max=0.174, sum=0.349 (2)", - "tab": "Efficiency", - "score": 0.17441444158554076 - }, - "College Biology - Observed inference time (s)": { - "description": "min=0.435, mean=0.435, max=0.435, sum=0.871 (2)", - "tab": "Efficiency", - "score": 0.43541959755950504 - }, - "College Computer Science - Observed inference time (s)": { - "description": "min=0.508, mean=0.508, max=0.508, sum=1.015 (2)", - "tab": "Efficiency", - "score": 0.5075832653045654 - }, - "College Mathematics - Observed inference time (s)": { - "description": "min=0.419, mean=0.419, max=0.419, sum=0.839 (2)", - "tab": "Efficiency", - "score": 0.41928773641586303 - }, - "College Medicine - Observed inference time (s)": { - "description": "min=0.389, mean=0.389, max=0.389, sum=0.777 (2)", - "tab": "Efficiency", - "score": 0.3885422951913293 - }, - "College Physics - Observed inference time (s)": { - "description": "min=0.37, mean=0.37, max=0.37, sum=0.74 (2)", - "tab": "Efficiency", - "score": 0.3700263453464882 - }, - "College Chemistry - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Chemistry - # prompt tokens": { - "description": "min=542.4, mean=542.4, max=542.4, sum=1084.8 (2)", - "tab": "General information", - "score": 542.4 - }, - "College Chemistry - # output tokens": { - 
"description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Biology - # eval": { - "description": "min=144, mean=144, max=144, sum=288 (2)", - "tab": "General information", - "score": 144.0 - }, - "College Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # prompt tokens": { - "description": "min=466.917, mean=466.917, max=466.917, sum=933.833 (2)", - "tab": "General information", - "score": 466.9166666666667 - }, - "College Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer Science - # prompt tokens": { - "description": "min=821.39, mean=821.39, max=821.39, sum=1642.78 (2)", - "tab": "General information", - "score": 821.39 - }, - "College Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Mathematics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # prompt tokens": { - "description": "min=587.52, mean=587.52, max=587.52, sum=1175.04 (2)", - "tab": "General information", - "score": 587.52 - }, - "College Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Medicine - # eval": { - "description": "min=173, mean=173, max=173, sum=346 (2)", - "tab": "General information", - "score": 173.0 - }, - "College Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Medicine - # prompt tokens": { - "description": "min=495.728, mean=495.728, max=495.728, sum=991.457 (2)", - "tab": "General information", - "score": 495.728323699422 - }, - "College Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Physics - # eval": { - "description": "min=102, mean=102, max=102, sum=204 (2)", - "tab": "General information", - "score": 102.0 - }, - "College Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - 
"score": 0.0 - }, - "College Physics - # prompt tokens": { - "description": "min=496.608, mean=496.608, max=496.608, sum=993.216 (2)", - "tab": "General information", - "score": 496.6078431372549 - }, - "College Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" - } - } - }, - { - "evaluation_name": "Computer Security", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Computer Security", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.81, - "details": { - "description": "min=0.81, mean=0.81, max=0.81, sum=1.62 (2)", - "tab": "Accuracy", - "Computer Security - Observed inference time (s)": { - "description": "min=0.171, mean=0.171, max=0.171, sum=0.342 (2)", - "tab": "Efficiency", - "score": 0.17102816104888915 - }, - "Computer Security - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Computer Security - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Computer Security - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Computer Security - # prompt tokens": { - "description": "min=371.54, mean=371.54, max=371.54, sum=743.08 (2)", - "tab": "General information", - "score": 371.54 - }, - "Computer Security - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" - } - } - }, - { - "evaluation_name": "Econometrics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Econometrics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5, - "details": { - "description": "min=0.5, mean=0.5, max=0.5, sum=1 (2)", - "tab": "Accuracy", - "Econometrics - Observed inference time (s)": { - "description": "min=0.176, mean=0.176, max=0.176, sum=0.353 (2)", - "tab": "Efficiency", - "score": 0.1764866866563496 - }, - "Econometrics - # eval": { - "description": "min=114, mean=114, max=114, sum=228 (2)", - "tab": "General information", - "score": 114.0 - }, - "Econometrics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Econometrics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Econometrics - # prompt tokens": { - "description": "min=607.43, mean=607.43, max=607.43, sum=1214.86 (2)", - "tab": "General information", - "score": 
607.4298245614035 - }, - "Econometrics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" - } - } - }, - { - "evaluation_name": "Global Facts", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Global Facts", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.37, - "details": { - "description": "min=0.37, mean=0.37, max=0.37, sum=0.74 (2)", - "tab": "Accuracy", - "Global Facts - Observed inference time (s)": { - "description": "min=0.439, mean=0.439, max=0.439, sum=0.879 (2)", - "tab": "Efficiency", - "score": 0.4393133974075317 - }, - "Global Facts - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Global Facts - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Global Facts - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Global Facts - # prompt tokens": { - "description": "min=392.71, mean=392.71, max=392.71, sum=785.42 (2)", - "tab": "General information", - "score": 392.71 - }, - "Global Facts - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" - } - } - }, - { - "evaluation_name": "Jurisprudence", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Jurisprudence", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.806, - "details": { - "description": "min=0.806, mean=0.806, max=0.806, sum=1.611 (2)", - "tab": "Accuracy", - "Jurisprudence - Observed inference time (s)": { - "description": "min=0.373, mean=0.373, max=0.373, sum=0.747 (2)", - "tab": "Efficiency", - "score": 0.37349939346313477 - }, - "Jurisprudence - # eval": { - "description": "min=108, mean=108, max=108, sum=216 (2)", - "tab": "General information", - "score": 108.0 - }, - "Jurisprudence - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Jurisprudence - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Jurisprudence - # prompt tokens": { - "description": "min=387.639, mean=387.639, max=387.639, sum=775.278 (2)", - "tab": "General information", - "score": 387.6388888888889 - }, - "Jurisprudence - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - 
"additional_details": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" - } - } - }, - { - "evaluation_name": "Philosophy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Philosophy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.759, - "details": { - "description": "min=0.759, mean=0.759, max=0.759, sum=1.518 (2)", - "tab": "Accuracy", - "Philosophy - Observed inference time (s)": { - "description": "min=0.382, mean=0.382, max=0.382, sum=0.763 (2)", - "tab": "Efficiency", - "score": 0.3817227730030415 - }, - "Philosophy - # eval": { - "description": "min=311, mean=311, max=311, sum=622 (2)", - "tab": "General information", - "score": 311.0 - }, - "Philosophy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Philosophy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Philosophy - # prompt tokens": { - "description": "min=322.084, mean=322.084, max=322.084, sum=644.167 (2)", - "tab": "General information", - "score": 322.08360128617363 - }, - "Philosophy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" - } - } - }, - { - "evaluation_name": "Professional Psychology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Professional Psychology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.732, - "details": { - "description": "min=0.732, mean=0.732, max=0.732, sum=1.464 (2)", - "tab": "Accuracy", - "Professional Medicine - Observed inference time (s)": { - "description": "min=0.485, mean=0.485, max=0.485, sum=0.969 (2)", - "tab": "Efficiency", - "score": 0.48464199637665467 - }, - "Professional Accounting - Observed inference time (s)": { - "description": "min=0.439, mean=0.439, max=0.439, sum=0.878 (2)", - "tab": "Efficiency", - "score": 0.4387922709715282 - }, - "Professional Law - Observed inference time (s)": { - "description": "min=0.506, mean=0.506, max=0.506, sum=1.012 (2)", - "tab": "Efficiency", - "score": 0.5061173195012079 - }, - "Professional Psychology - Observed inference time (s)": { - "description": "min=0.468, mean=0.468, max=0.468, sum=0.935 (2)", - "tab": "Efficiency", - "score": 0.4675601058536106 - }, - "Professional Medicine - # eval": { - "description": "min=272, mean=272, max=272, sum=544 (2)", - "tab": "General information", - "score": 272.0 - }, - "Professional Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General 
information", - "score": 0.0 - }, - "Professional Medicine - # prompt tokens": { - "description": "min=1087.585, mean=1087.585, max=1087.585, sum=2175.169 (2)", - "tab": "General information", - "score": 1087.5845588235295 - }, - "Professional Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Accounting - # eval": { - "description": "min=282, mean=282, max=282, sum=564 (2)", - "tab": "General information", - "score": 282.0 - }, - "Professional Accounting - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Accounting - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Accounting - # prompt tokens": { - "description": "min=651.592, mean=651.592, max=651.592, sum=1303.184 (2)", - "tab": "General information", - "score": 651.5921985815603 - }, - "Professional Accounting - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Law - # eval": { - "description": "min=1534, mean=1534, max=1534, sum=3068 (2)", - "tab": "General information", - "score": 1534.0 - }, - "Professional Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # prompt tokens": { - "description": "min=1630.787, mean=1630.787, max=1630.787, sum=3261.574 (2)", - "tab": "General information", - "score": 1630.7868318122555 - }, - "Professional Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Psychology - # eval": { - "description": "min=612, mean=612, max=612, sum=1224 (2)", - "tab": "General information", - "score": 612.0 - }, - "Professional Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # prompt tokens": { - "description": "min=568.114, mean=568.114, max=568.114, sum=1136.229 (2)", - "tab": "General information", - "score": 568.1143790849674 - }, - "Professional Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" - } - } - }, - { - "evaluation_name": "Us Foreign Policy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Us Foreign Policy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.88, - "details": { - "description": "min=0.88, mean=0.88, max=0.88, sum=1.76 (2)", - "tab": "Accuracy", - "Us Foreign Policy - Observed 
inference time (s)": { - "description": "min=0.177, mean=0.177, max=0.177, sum=0.353 (2)", - "tab": "Efficiency", - "score": 0.17667593240737914 - }, - "Us Foreign Policy - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Us Foreign Policy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Us Foreign Policy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Us Foreign Policy - # prompt tokens": { - "description": "min=415.79, mean=415.79, max=415.79, sum=831.58 (2)", - "tab": "General information", - "score": 415.79 - }, - "Us Foreign Policy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" - } - } - }, - { - "evaluation_name": "Astronomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Astronomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.763, - "details": { - "description": "min=0.763, mean=0.763, max=0.763, sum=1.526 (2)", - "tab": "Accuracy", - "Astronomy - Observed inference time (s)": { - "description": "min=0.442, mean=0.442, max=0.442, sum=0.885 (2)", - "tab": "Efficiency", - "score": 0.44235374111878245 - }, - "Astronomy - # eval": { - "description": "min=152, mean=152, max=152, sum=304 (2)", - "tab": "General information", - "score": 152.0 - }, - "Astronomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Astronomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Astronomy - # prompt tokens": { - "description": "min=572.691, mean=572.691, max=572.691, sum=1145.382 (2)", - "tab": "General information", - "score": 572.6907894736842 - }, - "Astronomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" - } - } - }, - { - "evaluation_name": "Business Ethics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Business Ethics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.75, - "details": { - "description": "min=0.75, mean=0.75, max=0.75, sum=1.5 (2)", - "tab": "Accuracy", - "Business Ethics - Observed inference time (s)": { - "description": "min=0.573, mean=0.573, max=0.573, sum=1.147 (2)", - "tab": "Efficiency", - "score": 0.5733751010894775 - }, - "Business Ethics - # eval": { - "description": "min=100, 
mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Business Ethics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Business Ethics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Business Ethics - # prompt tokens": { - "description": "min=562.52, mean=562.52, max=562.52, sum=1125.04 (2)", - "tab": "General information", - "score": 562.52 - }, - "Business Ethics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" - } - } - }, - { - "evaluation_name": "Clinical Knowledge", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Clinical Knowledge", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.777, - "details": { - "description": "min=0.777, mean=0.777, max=0.777, sum=1.555 (2)", - "tab": "Accuracy", - "Clinical Knowledge - Observed inference time (s)": { - "description": "min=0.418, mean=0.418, max=0.418, sum=0.837 (2)", - "tab": "Efficiency", - "score": 0.4183455800110439 - }, - "Clinical Knowledge - # eval": { - "description": "min=265, mean=265, max=265, sum=530 (2)", - "tab": "General information", - "score": 265.0 - }, - "Clinical Knowledge - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Clinical Knowledge - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Clinical Knowledge - # prompt tokens": { - "description": "min=390.947, mean=390.947, max=390.947, sum=781.894 (2)", - "tab": "General information", - "score": 390.94716981132075 - }, - "Clinical Knowledge - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" - } - } - }, - { - "evaluation_name": "Conceptual Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Conceptual Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.613, - "details": { - "description": "min=0.613, mean=0.613, max=0.613, sum=1.226 (2)", - "tab": "Accuracy", - "Conceptual Physics - Observed inference time (s)": { - "description": "min=0.386, mean=0.386, max=0.386, sum=0.771 (2)", - "tab": "Efficiency", - "score": 0.3856722780998717 - }, - "Conceptual Physics - # eval": { - "description": "min=235, mean=235, max=235, sum=470 (2)", - "tab": "General information", - "score": 235.0 - }, - "Conceptual Physics - # 
train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Conceptual Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Conceptual Physics - # prompt tokens": { - "description": "min=297.838, mean=297.838, max=297.838, sum=595.677 (2)", - "tab": "General information", - "score": 297.83829787234043 - }, - "Conceptual Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" - } - } - }, - { - "evaluation_name": "Electrical Engineering", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Electrical Engineering", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.648, - "details": { - "description": "min=0.648, mean=0.648, max=0.648, sum=1.297 (2)", - "tab": "Accuracy", - "Electrical Engineering - Observed inference time (s)": { - "description": "min=0.434, mean=0.434, max=0.434, sum=0.867 (2)", - "tab": "Efficiency", - "score": 0.43367810249328614 - }, - "Electrical Engineering - # eval": { - "description": "min=145, mean=145, max=145, sum=290 (2)", - "tab": "General information", - "score": 145.0 - }, - "Electrical Engineering - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Electrical Engineering - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Electrical Engineering - # prompt tokens": { - "description": "min=433.641, mean=433.641, max=433.641, sum=867.283 (2)", - "tab": "General information", - "score": 433.6413793103448 - }, - "Electrical Engineering - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" - } - } - }, - { - "evaluation_name": "Elementary Mathematics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Elementary Mathematics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5, - "details": { - "description": "min=0.5, mean=0.5, max=0.5, sum=1 (2)", - "tab": "Accuracy", - "Elementary Mathematics - Observed inference time (s)": { - "description": "min=0.386, mean=0.386, max=0.386, sum=0.771 (2)", - "tab": "Efficiency", - "score": 0.3857186824556381 - }, - "Elementary Mathematics - # eval": { - "description": "min=378, mean=378, max=378, sum=756 (2)", - "tab": "General information", - "score": 378.0 - }, - "Elementary Mathematics - # train": { - "description": "min=5, 
mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Elementary Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Elementary Mathematics - # prompt tokens": { - "description": "min=524.862, mean=524.862, max=524.862, sum=1049.725 (2)", - "tab": "General information", - "score": 524.8624338624338 - }, - "Elementary Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" - } - } - }, - { - "evaluation_name": "Formal Logic", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Formal Logic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.397, - "details": { - "description": "min=0.397, mean=0.397, max=0.397, sum=0.794 (2)", - "tab": "Accuracy", - "Formal Logic - Observed inference time (s)": { - "description": "min=0.411, mean=0.411, max=0.411, sum=0.822 (2)", - "tab": "Efficiency", - "score": 0.4109457277116321 - }, - "Formal Logic - # eval": { - "description": "min=126, mean=126, max=126, sum=252 (2)", - "tab": "General information", - "score": 126.0 - }, - "Formal Logic - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Formal Logic - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Formal Logic - # prompt tokens": { - "description": "min=599.762, mean=599.762, max=599.762, sum=1199.524 (2)", - "tab": "General information", - "score": 599.7619047619048 - }, - "Formal Logic - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" - } - } - }, - { - "evaluation_name": "High School World History", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on High School World History", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.857, - "details": { - "description": "min=0.857, mean=0.857, max=0.857, sum=1.713 (2)", - "tab": "Accuracy", - "High School Biology - Observed inference time (s)": { - "description": "min=0.389, mean=0.389, max=0.389, sum=0.777 (2)", - "tab": "Efficiency", - "score": 0.38858610660799087 - }, - "High School Chemistry - Observed inference time (s)": { - "description": "min=0.396, mean=0.396, max=0.396, sum=0.792 (2)", - "tab": "Efficiency", - "score": 0.39599566624082366 - }, - "High School Computer Science - Observed inference time (s)": { - "description": "min=0.495, mean=0.495, max=0.495, sum=0.99 (2)", 
- "tab": "Efficiency", - "score": 0.495233371257782 - }, - "High School European History - Observed inference time (s)": { - "description": "min=0.539, mean=0.539, max=0.539, sum=1.077 (2)", - "tab": "Efficiency", - "score": 0.5386766448165431 - }, - "High School Geography - Observed inference time (s)": { - "description": "min=0.367, mean=0.367, max=0.367, sum=0.735 (2)", - "tab": "Efficiency", - "score": 0.36738430129157174 - }, - "High School Government And Politics - Observed inference time (s)": { - "description": "min=0.39, mean=0.39, max=0.39, sum=0.78 (2)", - "tab": "Efficiency", - "score": 0.38988350463037047 - }, - "High School Macroeconomics - Observed inference time (s)": { - "description": "min=0.36, mean=0.36, max=0.36, sum=0.721 (2)", - "tab": "Efficiency", - "score": 0.3604950317969689 - }, - "High School Mathematics - Observed inference time (s)": { - "description": "min=0.388, mean=0.388, max=0.388, sum=0.777 (2)", - "tab": "Efficiency", - "score": 0.38829568756951227 - }, - "High School Microeconomics - Observed inference time (s)": { - "description": "min=0.372, mean=0.372, max=0.372, sum=0.743 (2)", - "tab": "Efficiency", - "score": 0.37170837205999036 - }, - "High School Physics - Observed inference time (s)": { - "description": "min=0.38, mean=0.38, max=0.38, sum=0.76 (2)", - "tab": "Efficiency", - "score": 0.3798852077383079 - }, - "High School Psychology - Observed inference time (s)": { - "description": "min=0.395, mean=0.395, max=0.395, sum=0.79 (2)", - "tab": "Efficiency", - "score": 0.3950107355730249 - }, - "High School Statistics - Observed inference time (s)": { - "description": "min=0.477, mean=0.477, max=0.477, sum=0.954 (2)", - "tab": "Efficiency", - "score": 0.4768963897669757 - }, - "High School US History - Observed inference time (s)": { - "description": "min=0.528, mean=0.528, max=0.528, sum=1.056 (2)", - "tab": "Efficiency", - "score": 0.5277850253909242 - }, - "High School World History - Observed inference time (s)": { - "description": "min=0.517, mean=0.517, max=0.517, sum=1.034 (2)", - "tab": "Efficiency", - "score": 0.5169116002094897 - }, - "High School Biology - # eval": { - "description": "min=310, mean=310, max=310, sum=620 (2)", - "tab": "General information", - "score": 310.0 - }, - "High School Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Biology - # prompt tokens": { - "description": "min=506.677, mean=506.677, max=506.677, sum=1013.355 (2)", - "tab": "General information", - "score": 506.6774193548387 - }, - "High School Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Chemistry - # eval": { - "description": "min=203, mean=203, max=203, sum=406 (2)", - "tab": "General information", - "score": 203.0 - }, - "High School Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # prompt tokens": { - "description": "min=489.714, mean=489.714, max=489.714, sum=979.429 (2)", - "tab": "General information", - "score": 489.7142857142857 - }, - "High School Chemistry - # 
output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "High School Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # prompt tokens": { - "description": "min=860.78, mean=860.78, max=860.78, sum=1721.56 (2)", - "tab": "General information", - "score": 860.78 - }, - "High School Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School European History - # eval": { - "description": "min=165, mean=165, max=165, sum=330 (2)", - "tab": "General information", - "score": 165.0 - }, - "High School European History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School European History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School European History - # prompt tokens": { - "description": "min=2791.073, mean=2791.073, max=2791.073, sum=5582.145 (2)", - "tab": "General information", - "score": 2791.072727272727 - }, - "High School European History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Geography - # eval": { - "description": "min=198, mean=198, max=198, sum=396 (2)", - "tab": "General information", - "score": 198.0 - }, - "High School Geography - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Geography - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Geography - # prompt tokens": { - "description": "min=365.045, mean=365.045, max=365.045, sum=730.091 (2)", - "tab": "General information", - "score": 365.04545454545456 - }, - "High School Geography - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Government And Politics - # eval": { - "description": "min=193, mean=193, max=193, sum=386 (2)", - "tab": "General information", - "score": 193.0 - }, - "High School Government And Politics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Government And Politics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # prompt tokens": { - "description": "min=458.824, mean=458.824, max=458.824, sum=917.648 (2)", - "tab": "General information", - "score": 458.8238341968912 - }, - "High School Government And Politics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Macroeconomics - # eval": { - "description": "min=390, mean=390, max=390, sum=780 (2)", - "tab": "General information", - "score": 390.0 - }, - "High School 
Macroeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Macroeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - # prompt tokens": { - "description": "min=364.562, mean=364.562, max=364.562, sum=729.123 (2)", - "tab": "General information", - "score": 364.5615384615385 - }, - "High School Macroeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Mathematics - # eval": { - "description": "min=270, mean=270, max=270, sum=540 (2)", - "tab": "General information", - "score": 270.0 - }, - "High School Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # prompt tokens": { - "description": "min=525.374, mean=525.374, max=525.374, sum=1050.748 (2)", - "tab": "General information", - "score": 525.3740740740741 - }, - "High School Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Microeconomics - # eval": { - "description": "min=238, mean=238, max=238, sum=476 (2)", - "tab": "General information", - "score": 238.0 - }, - "High School Microeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Microeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Microeconomics - # prompt tokens": { - "description": "min=392.025, mean=392.025, max=392.025, sum=784.05 (2)", - "tab": "General information", - "score": 392.02521008403363 - }, - "High School Microeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Physics - # eval": { - "description": "min=151, mean=151, max=151, sum=302 (2)", - "tab": "General information", - "score": 151.0 - }, - "High School Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # prompt tokens": { - "description": "min=553.464, mean=553.464, max=553.464, sum=1106.927 (2)", - "tab": "General information", - "score": 553.4635761589404 - }, - "High School Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Psychology - # eval": { - "description": "min=545, mean=545, max=545, sum=1090 (2)", - "tab": "General information", - "score": 545.0 - }, - "High School Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # prompt tokens": { - "description": "min=488.246, mean=488.246, max=488.246, 
sum=976.492 (2)", - "tab": "General information", - "score": 488.24587155963303 - }, - "High School Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Statistics - # eval": { - "description": "min=216, mean=216, max=216, sum=432 (2)", - "tab": "General information", - "score": 216.0 - }, - "High School Statistics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Statistics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Statistics - # prompt tokens": { - "description": "min=788.699, mean=788.699, max=788.699, sum=1577.398 (2)", - "tab": "General information", - "score": 788.699074074074 - }, - "High School Statistics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School US History - # eval": { - "description": "min=204, mean=204, max=204, sum=408 (2)", - "tab": "General information", - "score": 204.0 - }, - "High School US History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School US History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # prompt tokens": { - "description": "min=2210.809, mean=2210.809, max=2210.809, sum=4421.618 (2)", - "tab": "General information", - "score": 2210.8088235294117 - }, - "High School US History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School World History - # eval": { - "description": "min=237, mean=237, max=237, sum=474 (2)", - "tab": "General information", - "score": 237.0 - }, - "High School World History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School World History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School World History - # prompt tokens": { - "description": "min=1421.27, mean=1421.27, max=1421.27, sum=2842.54 (2)", - "tab": "General information", - "score": 1421.2700421940929 - }, - "High School World History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" - } - } - }, - { - "evaluation_name": "Human Sexuality", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Human Sexuality", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.786, - "details": { - "description": "min=0.786, mean=0.786, max=0.786, sum=1.573 (2)", - "tab": "Accuracy", - "Human Aging - Observed inference time (s)": { - "description": "min=0.38, mean=0.38, max=0.38, sum=0.76 (2)", - "tab": "Efficiency", - 
"score": 0.3799830274197018 - }, - "Human Sexuality - Observed inference time (s)": { - "description": "min=0.391, mean=0.391, max=0.391, sum=0.783 (2)", - "tab": "Efficiency", - "score": 0.3914412269155488 - }, - "Human Aging - # eval": { - "description": "min=223, mean=223, max=223, sum=446 (2)", - "tab": "General information", - "score": 223.0 - }, - "Human Aging - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Aging - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Aging - # prompt tokens": { - "description": "min=312.906, mean=312.906, max=312.906, sum=625.812 (2)", - "tab": "General information", - "score": 312.90582959641256 - }, - "Human Aging - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Human Sexuality - # eval": { - "description": "min=131, mean=131, max=131, sum=262 (2)", - "tab": "General information", - "score": 131.0 - }, - "Human Sexuality - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Sexuality - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # prompt tokens": { - "description": "min=334.183, mean=334.183, max=334.183, sum=668.366 (2)", - "tab": "General information", - "score": 334.1832061068702 - }, - "Human Sexuality - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" - } - } - }, - { - "evaluation_name": "International Law", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on International Law", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.843, - "details": { - "description": "min=0.843, mean=0.843, max=0.843, sum=1.686 (2)", - "tab": "Accuracy", - "International Law - Observed inference time (s)": { - "description": "min=0.556, mean=0.556, max=0.556, sum=1.113 (2)", - "tab": "Efficiency", - "score": 0.5563427140890074 - }, - "International Law - # eval": { - "description": "min=121, mean=121, max=121, sum=242 (2)", - "tab": "General information", - "score": 121.0 - }, - "International Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "International Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "International Law - # prompt tokens": { - "description": "min=632.851, mean=632.851, max=632.851, sum=1265.702 (2)", - "tab": "General information", - "score": 632.8512396694215 - }, - "International Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "international_law", - "method": "multiple_choice_joint", - 
"eval_split": "test", - "groups": "mmlu_international_law" - } - } - }, - { - "evaluation_name": "Logical Fallacies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Logical Fallacies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.791, - "details": { - "description": "min=0.791, mean=0.791, max=0.791, sum=1.583 (2)", - "tab": "Accuracy", - "Logical Fallacies - Observed inference time (s)": { - "description": "min=0.405, mean=0.405, max=0.405, sum=0.811 (2)", - "tab": "Efficiency", - "score": 0.4053135386273905 - }, - "Logical Fallacies - # eval": { - "description": "min=163, mean=163, max=163, sum=326 (2)", - "tab": "General information", - "score": 163.0 - }, - "Logical Fallacies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Logical Fallacies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Logical Fallacies - # prompt tokens": { - "description": "min=442.595, mean=442.595, max=442.595, sum=885.19 (2)", - "tab": "General information", - "score": 442.5950920245399 - }, - "Logical Fallacies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" - } - } - }, - { - "evaluation_name": "Machine Learning", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Machine Learning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.455, - "details": { - "description": "min=0.455, mean=0.455, max=0.455, sum=0.911 (2)", - "tab": "Accuracy", - "Machine Learning - Observed inference time (s)": { - "description": "min=0.46, mean=0.46, max=0.46, sum=0.92 (2)", - "tab": "Efficiency", - "score": 0.45983841376645224 - }, - "Machine Learning - # eval": { - "description": "min=112, mean=112, max=112, sum=224 (2)", - "tab": "General information", - "score": 112.0 - }, - "Machine Learning - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Machine Learning - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Machine Learning - # prompt tokens": { - "description": "min=661.054, mean=661.054, max=661.054, sum=1322.107 (2)", - "tab": "General information", - "score": 661.0535714285714 - }, - "Machine Learning - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" - } - } - }, - { - "evaluation_name": "Management", - 
"source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Management", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.845, - "details": { - "description": "min=0.845, mean=0.845, max=0.845, sum=1.689 (2)", - "tab": "Accuracy", - "Management - Observed inference time (s)": { - "description": "min=0.386, mean=0.386, max=0.386, sum=0.773 (2)", - "tab": "Efficiency", - "score": 0.38629551535671197 - }, - "Management - # eval": { - "description": "min=103, mean=103, max=103, sum=206 (2)", - "tab": "General information", - "score": 103.0 - }, - "Management - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Management - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Management - # prompt tokens": { - "description": "min=276.796, mean=276.796, max=276.796, sum=553.592 (2)", - "tab": "General information", - "score": 276.79611650485435 - }, - "Management - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" - } - } - }, - { - "evaluation_name": "Marketing", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Marketing", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.91, - "details": { - "description": "min=0.91, mean=0.91, max=0.91, sum=1.821 (2)", - "tab": "Accuracy", - "Marketing - Observed inference time (s)": { - "description": "min=0.391, mean=0.391, max=0.391, sum=0.781 (2)", - "tab": "Efficiency", - "score": 0.3906826453331189 - }, - "Marketing - # eval": { - "description": "min=234, mean=234, max=234, sum=468 (2)", - "tab": "General information", - "score": 234.0 - }, - "Marketing - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Marketing - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Marketing - # prompt tokens": { - "description": "min=397.218, mean=397.218, max=397.218, sum=794.436 (2)", - "tab": "General information", - "score": 397.21794871794873 - }, - "Marketing - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" - } - } - }, - { - "evaluation_name": "Medical Genetics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - 
"evaluation_description": "EM on Medical Genetics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8, - "details": { - "description": "min=0.8, mean=0.8, max=0.8, sum=1.6 (2)", - "tab": "Accuracy", - "Medical Genetics - Observed inference time (s)": { - "description": "min=0.378, mean=0.378, max=0.378, sum=0.756 (2)", - "tab": "Efficiency", - "score": 0.3778671717643738 - }, - "Medical Genetics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Medical Genetics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Medical Genetics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Medical Genetics - # prompt tokens": { - "description": "min=334, mean=334, max=334, sum=668 (2)", - "tab": "General information", - "score": 334.0 - }, - "Medical Genetics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" - } - } - }, - { - "evaluation_name": "Miscellaneous", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Miscellaneous", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.893, - "details": { - "description": "min=0.893, mean=0.893, max=0.893, sum=1.785 (2)", - "tab": "Accuracy", - "Miscellaneous - Observed inference time (s)": { - "description": "min=0.367, mean=0.367, max=0.367, sum=0.735 (2)", - "tab": "Efficiency", - "score": 0.36739401007368494 - }, - "Miscellaneous - # eval": { - "description": "min=783, mean=783, max=783, sum=1566 (2)", - "tab": "General information", - "score": 783.0 - }, - "Miscellaneous - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Miscellaneous - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Miscellaneous - # prompt tokens": { - "description": "min=292.925, mean=292.925, max=292.925, sum=585.849 (2)", - "tab": "General information", - "score": 292.92464878671774 - }, - "Miscellaneous - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" - } - } - }, - { - "evaluation_name": "Moral Scenarios", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Moral Scenarios", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.404, 
- "details": { - "description": "min=0.404, mean=0.404, max=0.404, sum=0.809 (2)", - "tab": "Accuracy", - "Moral Disputes - Observed inference time (s)": { - "description": "min=0.387, mean=0.387, max=0.387, sum=0.773 (2)", - "tab": "Efficiency", - "score": 0.38658536858641346 - }, - "Moral Scenarios - Observed inference time (s)": { - "description": "min=0.609, mean=0.609, max=0.609, sum=1.217 (2)", - "tab": "Efficiency", - "score": 0.6085127204490107 - }, - "Moral Disputes - # eval": { - "description": "min=346, mean=346, max=346, sum=692 (2)", - "tab": "General information", - "score": 346.0 - }, - "Moral Disputes - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Disputes - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Disputes - # prompt tokens": { - "description": "min=469.145, mean=469.145, max=469.145, sum=938.289 (2)", - "tab": "General information", - "score": 469.1445086705202 - }, - "Moral Disputes - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Moral Scenarios - # eval": { - "description": "min=895, mean=895, max=895, sum=1790 (2)", - "tab": "General information", - "score": 895.0 - }, - "Moral Scenarios - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Scenarios - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # prompt tokens": { - "description": "min=649.455, mean=649.455, max=649.455, sum=1298.909 (2)", - "tab": "General information", - "score": 649.454748603352 - }, - "Moral Scenarios - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" - } - } - }, - { - "evaluation_name": "Nutrition", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Nutrition", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.758, - "details": { - "description": "min=0.758, mean=0.758, max=0.758, sum=1.516 (2)", - "tab": "Accuracy", - "Nutrition - Observed inference time (s)": { - "description": "min=0.659, mean=0.659, max=0.659, sum=1.319 (2)", - "tab": "Efficiency", - "score": 0.6593383916842392 - }, - "Nutrition - # eval": { - "description": "min=306, mean=306, max=306, sum=612 (2)", - "tab": "General information", - "score": 306.0 - }, - "Nutrition - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Nutrition - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Nutrition - # prompt tokens": { - "description": "min=579.817, mean=579.817, max=579.817, sum=1159.634 (2)", - "tab": "General information", - "score": 579.8169934640523 - }, - "Nutrition - # output tokens": { - "description": "min=1, mean=1, max=1, 
sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" - } - } - }, - { - "evaluation_name": "Prehistory", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Prehistory", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.787, - "details": { - "description": "min=0.787, mean=0.787, max=0.787, sum=1.574 (2)", - "tab": "Accuracy", - "Prehistory - Observed inference time (s)": { - "description": "min=0.414, mean=0.414, max=0.414, sum=0.828 (2)", - "tab": "Efficiency", - "score": 0.4140352636207769 - }, - "Prehistory - # eval": { - "description": "min=324, mean=324, max=324, sum=648 (2)", - "tab": "General information", - "score": 324.0 - }, - "Prehistory - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Prehistory - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Prehistory - # prompt tokens": { - "description": "min=507.559, mean=507.559, max=507.559, sum=1015.117 (2)", - "tab": "General information", - "score": 507.55864197530866 - }, - "Prehistory - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" - } - } - }, - { - "evaluation_name": "Public Relations", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Public Relations", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.745, - "details": { - "description": "min=0.745, mean=0.745, max=0.745, sum=1.491 (2)", - "tab": "Accuracy", - "Public Relations - Observed inference time (s)": { - "description": "min=0.373, mean=0.373, max=0.373, sum=0.746 (2)", - "tab": "Efficiency", - "score": 0.3731096332723444 - }, - "Public Relations - # eval": { - "description": "min=110, mean=110, max=110, sum=220 (2)", - "tab": "General information", - "score": 110.0 - }, - "Public Relations - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Public Relations - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Public Relations - # prompt tokens": { - "description": "min=398.318, mean=398.318, max=398.318, sum=796.636 (2)", - "tab": "General information", - "score": 398.3181818181818 - }, - "Public Relations - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "public_relations", - "method": "multiple_choice_joint", - 
"eval_split": "test", - "groups": "mmlu_public_relations" - } - } - }, - { - "evaluation_name": "Security Studies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Security Studies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8, - "details": { - "description": "min=0.8, mean=0.8, max=0.8, sum=1.6 (2)", - "tab": "Accuracy", - "Security Studies - Observed inference time (s)": { - "description": "min=0.496, mean=0.496, max=0.496, sum=0.993 (2)", - "tab": "Efficiency", - "score": 0.4963450723764848 - }, - "Security Studies - # eval": { - "description": "min=245, mean=245, max=245, sum=490 (2)", - "tab": "General information", - "score": 245.0 - }, - "Security Studies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Security Studies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Security Studies - # prompt tokens": { - "description": "min=1157.473, mean=1157.473, max=1157.473, sum=2314.947 (2)", - "tab": "General information", - "score": 1157.4734693877551 - }, - "Security Studies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" - } - } - }, - { - "evaluation_name": "Sociology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Sociology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.871, - "details": { - "description": "min=0.871, mean=0.871, max=0.871, sum=1.741 (2)", - "tab": "Accuracy", - "Sociology - Observed inference time (s)": { - "description": "min=0.376, mean=0.376, max=0.376, sum=0.753 (2)", - "tab": "Efficiency", - "score": 0.3763423120204489 - }, - "Sociology - # eval": { - "description": "min=201, mean=201, max=201, sum=402 (2)", - "tab": "General information", - "score": 201.0 - }, - "Sociology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Sociology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Sociology - # prompt tokens": { - "description": "min=438.522, mean=438.522, max=438.522, sum=877.045 (2)", - "tab": "General information", - "score": 438.5223880597015 - }, - "Sociology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" - } - } - }, - { - "evaluation_name": "Virology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Virology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.542, - "details": { - "description": "min=0.542, mean=0.542, max=0.542, sum=1.084 (2)", - "tab": "Accuracy", - "Virology - Observed inference time (s)": { - "description": "min=0.387, mean=0.387, max=0.387, sum=0.775 (2)", - "tab": "Efficiency", - "score": 0.3873033107045185 - }, - "Virology - # eval": { - "description": "min=166, mean=166, max=166, sum=332 (2)", - "tab": "General information", - "score": 166.0 - }, - "Virology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Virology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Virology - # prompt tokens": { - "description": "min=336.09, mean=336.09, max=336.09, sum=672.181 (2)", - "tab": "General information", - "score": 336.0903614457831 - }, - "Virology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" - } - } - }, - { - "evaluation_name": "World Religions", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on World Religions", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.836, - "details": { - "description": "min=0.836, mean=0.836, max=0.836, sum=1.673 (2)", - "tab": "Accuracy", - "World Religions - Observed inference time (s)": { - "description": "min=0.403, mean=0.403, max=0.403, sum=0.807 (2)", - "tab": "Efficiency", - "score": 0.4032876603087487 - }, - "World Religions - # eval": { - "description": "min=171, mean=171, max=171, sum=342 (2)", - "tab": "General information", - "score": 171.0 - }, - "World Religions - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "World Religions - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "World Religions - # prompt tokens": { - "description": "min=268.561, mean=268.561, max=268.561, sum=537.123 (2)", - "tab": "General information", - "score": 268.56140350877195 - }, - "World Religions - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" - } - } - }, - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms 
on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.589, - "details": { - "tab": "Efficiency" - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_mmlu/openai/gpt-4-0613/fd6aea24-dc18-41ce-bc19-23f461a39032.json b/data/helm_mmlu/openai/gpt-4-0613/fd6aea24-dc18-41ce-bc19-23f461a39032.json deleted file mode 100644 index 6ccc418f3..000000000 --- a/data/helm_mmlu/openai/gpt-4-0613/fd6aea24-dc18-41ce-bc19-23f461a39032.json +++ /dev/null @@ -1,3021 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/openai_gpt-4-0613/1770835937.459157", - "retrieved_timestamp": "1770835937.459157", - "source_metadata": { - "source_name": "helm_mmlu", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "GPT-4 0613", - "id": "openai/gpt-4-0613", - "developer": "openai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "MMLU All Subjects", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU All Subjects", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.824, - "details": { - "description": "min=0.54, mean=0.824, max=0.99, sum=93.978 (114)", - "tab": "Accuracy", - "MMLU All Subjects - Observed inference time (s)": { - "description": "min=0.364, mean=0.447, max=0.579, sum=51.005 (114)", - "tab": "Efficiency", - "score": 0.4474144183932911 - }, - "MMLU All Subjects - # eval": { - "description": "min=100, mean=246.351, max=1534, sum=28084 (114)", - "tab": "General information", - "score": 246.35087719298247 - }, - "MMLU All Subjects - # train": { - "description": "min=5, mean=5, max=5, sum=570 (114)", - "tab": "General information", - "score": 5.0 - }, - "MMLU All Subjects - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (114)", - "tab": "General information", - "score": 0.0 - }, - "MMLU All Subjects - # prompt tokens": { - "description": "min=268.561, mean=607.852, max=2791.073, sum=69295.086 (114)", - "tab": "General information", - "score": 607.851634217556 - }, - "MMLU All Subjects - # output tokens": { - "description": "min=1, mean=1, max=1, sum=114 (114)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - 
"high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] - } - } - }, - { - "evaluation_name": "Abstract Algebra", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Abstract Algebra", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.63, - "details": { - "description": "min=0.63, mean=0.63, max=0.63, sum=1.26 (2)", - "tab": "Accuracy", - "Abstract Algebra - Observed inference time (s)": { - "description": "min=0.393, mean=0.393, max=0.393, sum=0.787 (2)", - "tab": "Efficiency", - "score": 0.39332568168640136 - }, - "Abstract Algebra - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Abstract Algebra - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Abstract Algebra - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Abstract Algebra - # prompt tokens": { - "description": "min=366.44, mean=366.44, max=366.44, sum=732.88 (2)", - "tab": "General 
information", - "score": 366.44 - }, - "Abstract Algebra - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" - } - } - }, - { - "evaluation_name": "Anatomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Anatomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8, - "details": { - "description": "min=0.8, mean=0.8, max=0.8, sum=1.6 (2)", - "tab": "Accuracy", - "Anatomy - Observed inference time (s)": { - "description": "min=0.545, mean=0.545, max=0.545, sum=1.09 (2)", - "tab": "Efficiency", - "score": 0.5451150911825674 - }, - "Anatomy - # eval": { - "description": "min=135, mean=135, max=135, sum=270 (2)", - "tab": "General information", - "score": 135.0 - }, - "Anatomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Anatomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Anatomy - # prompt tokens": { - "description": "min=346.978, mean=346.978, max=346.978, sum=693.956 (2)", - "tab": "General information", - "score": 346.97777777777776 - }, - "Anatomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" - } - } - }, - { - "evaluation_name": "College Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on College Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.627, - "details": { - "description": "min=0.627, mean=0.627, max=0.627, sum=1.255 (2)", - "tab": "Accuracy", - "College Chemistry - Observed inference time (s)": { - "description": "min=0.389, mean=0.389, max=0.389, sum=0.778 (2)", - "tab": "Efficiency", - "score": 0.3888898015022278 - }, - "College Biology - Observed inference time (s)": { - "description": "min=0.433, mean=0.433, max=0.433, sum=0.866 (2)", - "tab": "Efficiency", - "score": 0.43280420700709027 - }, - "College Computer Science - Observed inference time (s)": { - "description": "min=0.492, mean=0.492, max=0.492, sum=0.984 (2)", - "tab": "Efficiency", - "score": 0.49212974786758423 - }, - "College Mathematics - Observed inference time (s)": { - "description": "min=0.435, mean=0.435, max=0.435, sum=0.871 (2)", - "tab": "Efficiency", - "score": 0.4354128074645996 - }, - "College Medicine - Observed inference time (s)": { - "description": "min=0.431, mean=0.431, max=0.431, sum=0.861 (2)", - "tab": "Efficiency", - "score": 0.4306242893196944 - }, - "College Physics - Observed inference time (s)": { - 
"description": "min=0.415, mean=0.415, max=0.415, sum=0.83 (2)", - "tab": "Efficiency", - "score": 0.41519686287524654 - }, - "College Chemistry - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Chemistry - # prompt tokens": { - "description": "min=542.4, mean=542.4, max=542.4, sum=1084.8 (2)", - "tab": "General information", - "score": 542.4 - }, - "College Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Biology - # eval": { - "description": "min=144, mean=144, max=144, sum=288 (2)", - "tab": "General information", - "score": 144.0 - }, - "College Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # prompt tokens": { - "description": "min=466.917, mean=466.917, max=466.917, sum=933.833 (2)", - "tab": "General information", - "score": 466.9166666666667 - }, - "College Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer Science - # prompt tokens": { - "description": "min=821.39, mean=821.39, max=821.39, sum=1642.78 (2)", - "tab": "General information", - "score": 821.39 - }, - "College Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Mathematics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # prompt tokens": { - "description": "min=587.52, mean=587.52, max=587.52, sum=1175.04 (2)", - "tab": "General information", - "score": 587.52 - }, - "College Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Medicine - # eval": { - "description": "min=173, mean=173, max=173, sum=346 (2)", - "tab": "General information", - "score": 173.0 - }, - "College Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General 
information", - "score": 0.0 - }, - "College Medicine - # prompt tokens": { - "description": "min=495.728, mean=495.728, max=495.728, sum=991.457 (2)", - "tab": "General information", - "score": 495.728323699422 - }, - "College Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Physics - # eval": { - "description": "min=102, mean=102, max=102, sum=204 (2)", - "tab": "General information", - "score": 102.0 - }, - "College Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Physics - # prompt tokens": { - "description": "min=496.608, mean=496.608, max=496.608, sum=993.216 (2)", - "tab": "General information", - "score": 496.6078431372549 - }, - "College Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" - } - } - }, - { - "evaluation_name": "Computer Security", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Computer Security", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.86, - "details": { - "description": "min=0.86, mean=0.86, max=0.86, sum=1.72 (2)", - "tab": "Accuracy", - "Computer Security - Observed inference time (s)": { - "description": "min=0.373, mean=0.373, max=0.373, sum=0.746 (2)", - "tab": "Efficiency", - "score": 0.3729291558265686 - }, - "Computer Security - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Computer Security - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Computer Security - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Computer Security - # prompt tokens": { - "description": "min=371.54, mean=371.54, max=371.54, sum=743.08 (2)", - "tab": "General information", - "score": 371.54 - }, - "Computer Security - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" - } - } - }, - { - "evaluation_name": "Econometrics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Econometrics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.684, - "details": { - "description": "min=0.684, mean=0.684, max=0.684, 
sum=1.368 (2)", - "tab": "Accuracy", - "Econometrics - Observed inference time (s)": { - "description": "min=0.364, mean=0.364, max=0.364, sum=0.729 (2)", - "tab": "Efficiency", - "score": 0.36447873241023016 - }, - "Econometrics - # eval": { - "description": "min=114, mean=114, max=114, sum=228 (2)", - "tab": "General information", - "score": 114.0 - }, - "Econometrics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Econometrics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Econometrics - # prompt tokens": { - "description": "min=607.43, mean=607.43, max=607.43, sum=1214.86 (2)", - "tab": "General information", - "score": 607.4298245614035 - }, - "Econometrics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" - } - } - }, - { - "evaluation_name": "Global Facts", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Global Facts", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.62, - "details": { - "description": "min=0.62, mean=0.62, max=0.62, sum=1.24 (2)", - "tab": "Accuracy", - "Global Facts - Observed inference time (s)": { - "description": "min=0.476, mean=0.476, max=0.476, sum=0.952 (2)", - "tab": "Efficiency", - "score": 0.4758000469207764 - }, - "Global Facts - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Global Facts - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Global Facts - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Global Facts - # prompt tokens": { - "description": "min=392.71, mean=392.71, max=392.71, sum=785.42 (2)", - "tab": "General information", - "score": 392.71 - }, - "Global Facts - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" - } - } - }, - { - "evaluation_name": "Jurisprudence", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Jurisprudence", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.889, - "details": { - "description": "min=0.889, mean=0.889, max=0.889, sum=1.778 (2)", - "tab": "Accuracy", - "Jurisprudence - Observed inference time (s)": { - "description": "min=0.439, mean=0.439, max=0.439, sum=0.878 (2)", - "tab": "Efficiency", - "score": 0.43886900389636 - }, - 
"Jurisprudence - # eval": { - "description": "min=108, mean=108, max=108, sum=216 (2)", - "tab": "General information", - "score": 108.0 - }, - "Jurisprudence - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Jurisprudence - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Jurisprudence - # prompt tokens": { - "description": "min=387.639, mean=387.639, max=387.639, sum=775.278 (2)", - "tab": "General information", - "score": 387.6388888888889 - }, - "Jurisprudence - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" - } - } - }, - { - "evaluation_name": "Philosophy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Philosophy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.859, - "details": { - "description": "min=0.859, mean=0.859, max=0.859, sum=1.717 (2)", - "tab": "Accuracy", - "Philosophy - Observed inference time (s)": { - "description": "min=0.403, mean=0.403, max=0.403, sum=0.807 (2)", - "tab": "Efficiency", - "score": 0.40341131480177117 - }, - "Philosophy - # eval": { - "description": "min=311, mean=311, max=311, sum=622 (2)", - "tab": "General information", - "score": 311.0 - }, - "Philosophy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Philosophy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Philosophy - # prompt tokens": { - "description": "min=322.084, mean=322.084, max=322.084, sum=644.167 (2)", - "tab": "General information", - "score": 322.08360128617363 - }, - "Philosophy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" - } - } - }, - { - "evaluation_name": "Professional Psychology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Professional Psychology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.891, - "details": { - "description": "min=0.891, mean=0.891, max=0.891, sum=1.781 (2)", - "tab": "Accuracy", - "Professional Medicine - Observed inference time (s)": { - "description": "min=0.483, mean=0.483, max=0.483, sum=0.966 (2)", - "tab": "Efficiency", - "score": 0.48306868356816907 - }, - "Professional Accounting - Observed inference time (s)": { - "description": "min=0.444, mean=0.444, max=0.444, sum=0.888 (2)", - "tab": "Efficiency", - "score": 0.44407470006469296 - }, 
- "Professional Law - Observed inference time (s)": { - "description": "min=0.578, mean=0.578, max=0.578, sum=1.157 (2)", - "tab": "Efficiency", - "score": 0.578451920053017 - }, - "Professional Psychology - Observed inference time (s)": { - "description": "min=0.469, mean=0.469, max=0.469, sum=0.938 (2)", - "tab": "Efficiency", - "score": 0.4690242421393301 - }, - "Professional Medicine - # eval": { - "description": "min=272, mean=272, max=272, sum=544 (2)", - "tab": "General information", - "score": 272.0 - }, - "Professional Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Medicine - # prompt tokens": { - "description": "min=1087.585, mean=1087.585, max=1087.585, sum=2175.169 (2)", - "tab": "General information", - "score": 1087.5845588235295 - }, - "Professional Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Accounting - # eval": { - "description": "min=282, mean=282, max=282, sum=564 (2)", - "tab": "General information", - "score": 282.0 - }, - "Professional Accounting - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Accounting - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Accounting - # prompt tokens": { - "description": "min=651.592, mean=651.592, max=651.592, sum=1303.184 (2)", - "tab": "General information", - "score": 651.5921985815603 - }, - "Professional Accounting - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Law - # eval": { - "description": "min=1534, mean=1534, max=1534, sum=3068 (2)", - "tab": "General information", - "score": 1534.0 - }, - "Professional Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # prompt tokens": { - "description": "min=1630.787, mean=1630.787, max=1630.787, sum=3261.574 (2)", - "tab": "General information", - "score": 1630.7868318122555 - }, - "Professional Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Psychology - # eval": { - "description": "min=612, mean=612, max=612, sum=1224 (2)", - "tab": "General information", - "score": 612.0 - }, - "Professional Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # prompt tokens": { - "description": "min=568.114, mean=568.114, max=568.114, sum=1136.229 (2)", - "tab": "General information", - "score": 568.1143790849674 - }, - "Professional Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - 
"additional_details": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" - } - } - }, - { - "evaluation_name": "Us Foreign Policy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Us Foreign Policy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.95, - "details": { - "description": "min=0.95, mean=0.95, max=0.95, sum=1.9 (2)", - "tab": "Accuracy", - "Us Foreign Policy - Observed inference time (s)": { - "description": "min=0.434, mean=0.434, max=0.434, sum=0.869 (2)", - "tab": "Efficiency", - "score": 0.43441893100738527 - }, - "Us Foreign Policy - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Us Foreign Policy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Us Foreign Policy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Us Foreign Policy - # prompt tokens": { - "description": "min=415.79, mean=415.79, max=415.79, sum=831.58 (2)", - "tab": "General information", - "score": 415.79 - }, - "Us Foreign Policy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" - } - } - }, - { - "evaluation_name": "Astronomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Astronomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.934, - "details": { - "description": "min=0.934, mean=0.934, max=0.934, sum=1.868 (2)", - "tab": "Accuracy", - "Astronomy - Observed inference time (s)": { - "description": "min=0.472, mean=0.472, max=0.472, sum=0.944 (2)", - "tab": "Efficiency", - "score": 0.4718977307018481 - }, - "Astronomy - # eval": { - "description": "min=152, mean=152, max=152, sum=304 (2)", - "tab": "General information", - "score": 152.0 - }, - "Astronomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Astronomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Astronomy - # prompt tokens": { - "description": "min=572.691, mean=572.691, max=572.691, sum=1145.382 (2)", - "tab": "General information", - "score": 572.6907894736842 - }, - "Astronomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" - } - } - }, - { - "evaluation_name": 
"Business Ethics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Business Ethics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.79, - "details": { - "description": "min=0.79, mean=0.79, max=0.79, sum=1.58 (2)", - "tab": "Accuracy", - "Business Ethics - Observed inference time (s)": { - "description": "min=0.477, mean=0.477, max=0.477, sum=0.953 (2)", - "tab": "Efficiency", - "score": 0.4765148901939392 - }, - "Business Ethics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Business Ethics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Business Ethics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Business Ethics - # prompt tokens": { - "description": "min=562.52, mean=562.52, max=562.52, sum=1125.04 (2)", - "tab": "General information", - "score": 562.52 - }, - "Business Ethics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" - } - } - }, - { - "evaluation_name": "Clinical Knowledge", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Clinical Knowledge", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.845, - "details": { - "description": "min=0.845, mean=0.845, max=0.845, sum=1.691 (2)", - "tab": "Accuracy", - "Clinical Knowledge - Observed inference time (s)": { - "description": "min=0.415, mean=0.415, max=0.415, sum=0.829 (2)", - "tab": "Efficiency", - "score": 0.414557883424579 - }, - "Clinical Knowledge - # eval": { - "description": "min=265, mean=265, max=265, sum=530 (2)", - "tab": "General information", - "score": 265.0 - }, - "Clinical Knowledge - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Clinical Knowledge - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Clinical Knowledge - # prompt tokens": { - "description": "min=390.947, mean=390.947, max=390.947, sum=781.894 (2)", - "tab": "General information", - "score": 390.94716981132075 - }, - "Clinical Knowledge - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" - } - } - }, - { - "evaluation_name": "Conceptual Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Conceptual Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.868, - "details": { - "description": "min=0.868, mean=0.868, max=0.868, sum=1.736 (2)", - "tab": "Accuracy", - "Conceptual Physics - Observed inference time (s)": { - "description": "min=0.384, mean=0.384, max=0.384, sum=0.767 (2)", - "tab": "Efficiency", - "score": 0.3836827186827964 - }, - "Conceptual Physics - # eval": { - "description": "min=235, mean=235, max=235, sum=470 (2)", - "tab": "General information", - "score": 235.0 - }, - "Conceptual Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Conceptual Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Conceptual Physics - # prompt tokens": { - "description": "min=297.838, mean=297.838, max=297.838, sum=595.677 (2)", - "tab": "General information", - "score": 297.83829787234043 - }, - "Conceptual Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" - } - } - }, - { - "evaluation_name": "Electrical Engineering", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Electrical Engineering", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.786, - "details": { - "description": "min=0.786, mean=0.786, max=0.786, sum=1.572 (2)", - "tab": "Accuracy", - "Electrical Engineering - Observed inference time (s)": { - "description": "min=0.399, mean=0.399, max=0.399, sum=0.798 (2)", - "tab": "Efficiency", - "score": 0.39915286919166304 - }, - "Electrical Engineering - # eval": { - "description": "min=145, mean=145, max=145, sum=290 (2)", - "tab": "General information", - "score": 145.0 - }, - "Electrical Engineering - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Electrical Engineering - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Electrical Engineering - # prompt tokens": { - "description": "min=433.641, mean=433.641, max=433.641, sum=867.283 (2)", - "tab": "General information", - "score": 433.6413793103448 - }, - "Electrical Engineering - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" - } - } - }, - { - "evaluation_name": "Elementary Mathematics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Elementary Mathematics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.807, - "details": { - "description": "min=0.807, mean=0.807, max=0.807, sum=1.614 (2)", - "tab": "Accuracy", - "Elementary Mathematics - Observed inference time (s)": { - "description": "min=0.423, mean=0.423, max=0.423, sum=0.845 (2)", - "tab": "Efficiency", - "score": 0.4225258120784053 - }, - "Elementary Mathematics - # eval": { - "description": "min=378, mean=378, max=378, sum=756 (2)", - "tab": "General information", - "score": 378.0 - }, - "Elementary Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Elementary Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Elementary Mathematics - # prompt tokens": { - "description": "min=524.862, mean=524.862, max=524.862, sum=1049.725 (2)", - "tab": "General information", - "score": 524.8624338624338 - }, - "Elementary Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" - } - } - }, - { - "evaluation_name": "Formal Logic", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Formal Logic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.643, - "details": { - "description": "min=0.643, mean=0.643, max=0.643, sum=1.286 (2)", - "tab": "Accuracy", - "Formal Logic - Observed inference time (s)": { - "description": "min=0.486, mean=0.486, max=0.486, sum=0.973 (2)", - "tab": "Efficiency", - "score": 0.48647683007376535 - }, - "Formal Logic - # eval": { - "description": "min=126, mean=126, max=126, sum=252 (2)", - "tab": "General information", - "score": 126.0 - }, - "Formal Logic - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Formal Logic - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Formal Logic - # prompt tokens": { - "description": "min=599.762, mean=599.762, max=599.762, sum=1199.524 (2)", - "tab": "General information", - "score": 599.7619047619048 - }, - "Formal Logic - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" - } - } - }, - { - "evaluation_name": "High School World History", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on High School World History", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.945, - "details": { - "description": "min=0.945, mean=0.945, max=0.945, sum=1.89 (2)", - "tab": "Accuracy", - "High School Biology - Observed inference time (s)": { - "description": "min=0.436, mean=0.436, max=0.436, sum=0.872 (2)", - "tab": "Efficiency", - "score": 0.4360047817230225 - }, - "High School Chemistry - Observed inference time (s)": { - "description": "min=0.413, mean=0.413, max=0.413, sum=0.827 (2)", - "tab": "Efficiency", - "score": 0.41338158710836775 - }, - "High School Computer Science - Observed inference time (s)": { - "description": "min=0.5, mean=0.5, max=0.5, sum=1.001 (2)", - "tab": "Efficiency", - "score": 0.5002665758132935 - }, - "High School European History - Observed inference time (s)": { - "description": "min=0.579, mean=0.579, max=0.579, sum=1.158 (2)", - "tab": "Efficiency", - "score": 0.578774525902488 - }, - "High School Geography - Observed inference time (s)": { - "description": "min=0.414, mean=0.414, max=0.414, sum=0.829 (2)", - "tab": "Efficiency", - "score": 0.4142996747084338 - }, - "High School Government And Politics - Observed inference time (s)": { - "description": "min=0.43, mean=0.43, max=0.43, sum=0.86 (2)", - "tab": "Efficiency", - "score": 0.43005221001224814 - }, - "High School Macroeconomics - Observed inference time (s)": { - "description": "min=0.416, mean=0.416, max=0.416, sum=0.832 (2)", - "tab": "Efficiency", - "score": 0.4160928750649477 - }, - "High School Mathematics - Observed inference time (s)": { - "description": "min=0.423, mean=0.423, max=0.423, sum=0.846 (2)", - "tab": "Efficiency", - "score": 0.4231933620240953 - }, - "High School Microeconomics - Observed inference time (s)": { - "description": "min=0.474, mean=0.474, max=0.474, sum=0.948 (2)", - "tab": "Efficiency", - "score": 0.4740273321376127 - }, - "High School Physics - Observed inference time (s)": { - "description": "min=0.462, mean=0.462, max=0.462, sum=0.924 (2)", - "tab": "Efficiency", - "score": 0.4620048778736039 - }, - "High School Psychology - Observed inference time (s)": { - "description": "min=0.407, mean=0.407, max=0.407, sum=0.813 (2)", - "tab": "Efficiency", - "score": 0.40661886022725235 - }, - "High School Statistics - Observed inference time (s)": { - "description": "min=0.463, mean=0.463, max=0.463, sum=0.926 (2)", - "tab": "Efficiency", - "score": 0.46296725780875597 - }, - "High School US History - Observed inference time (s)": { - "description": "min=0.546, mean=0.546, max=0.546, sum=1.091 (2)", - "tab": "Efficiency", - "score": 0.5456923538563299 - }, - "High School World History - Observed inference time (s)": { - "description": "min=0.517, mean=0.517, max=0.517, sum=1.033 (2)", - "tab": "Efficiency", - "score": 0.5166646488608188 - }, - "High School Biology - # eval": { - "description": "min=310, mean=310, max=310, sum=620 (2)", - "tab": "General information", - "score": 310.0 - }, - "High School Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Biology - # prompt 
tokens": { - "description": "min=506.677, mean=506.677, max=506.677, sum=1013.355 (2)", - "tab": "General information", - "score": 506.6774193548387 - }, - "High School Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Chemistry - # eval": { - "description": "min=203, mean=203, max=203, sum=406 (2)", - "tab": "General information", - "score": 203.0 - }, - "High School Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # prompt tokens": { - "description": "min=489.714, mean=489.714, max=489.714, sum=979.429 (2)", - "tab": "General information", - "score": 489.7142857142857 - }, - "High School Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "High School Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # prompt tokens": { - "description": "min=860.78, mean=860.78, max=860.78, sum=1721.56 (2)", - "tab": "General information", - "score": 860.78 - }, - "High School Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School European History - # eval": { - "description": "min=165, mean=165, max=165, sum=330 (2)", - "tab": "General information", - "score": 165.0 - }, - "High School European History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School European History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School European History - # prompt tokens": { - "description": "min=2791.073, mean=2791.073, max=2791.073, sum=5582.145 (2)", - "tab": "General information", - "score": 2791.072727272727 - }, - "High School European History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Geography - # eval": { - "description": "min=198, mean=198, max=198, sum=396 (2)", - "tab": "General information", - "score": 198.0 - }, - "High School Geography - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Geography - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Geography - # prompt tokens": { - "description": "min=365.045, mean=365.045, max=365.045, sum=730.091 (2)", - "tab": "General information", - "score": 365.04545454545456 - }, - "High School Geography - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Government And Politics - # eval": { - 
"description": "min=193, mean=193, max=193, sum=386 (2)", - "tab": "General information", - "score": 193.0 - }, - "High School Government And Politics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Government And Politics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # prompt tokens": { - "description": "min=458.824, mean=458.824, max=458.824, sum=917.648 (2)", - "tab": "General information", - "score": 458.8238341968912 - }, - "High School Government And Politics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Macroeconomics - # eval": { - "description": "min=390, mean=390, max=390, sum=780 (2)", - "tab": "General information", - "score": 390.0 - }, - "High School Macroeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Macroeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - # prompt tokens": { - "description": "min=364.562, mean=364.562, max=364.562, sum=729.123 (2)", - "tab": "General information", - "score": 364.5615384615385 - }, - "High School Macroeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Mathematics - # eval": { - "description": "min=270, mean=270, max=270, sum=540 (2)", - "tab": "General information", - "score": 270.0 - }, - "High School Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # prompt tokens": { - "description": "min=525.374, mean=525.374, max=525.374, sum=1050.748 (2)", - "tab": "General information", - "score": 525.3740740740741 - }, - "High School Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Microeconomics - # eval": { - "description": "min=238, mean=238, max=238, sum=476 (2)", - "tab": "General information", - "score": 238.0 - }, - "High School Microeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Microeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Microeconomics - # prompt tokens": { - "description": "min=392.025, mean=392.025, max=392.025, sum=784.05 (2)", - "tab": "General information", - "score": 392.02521008403363 - }, - "High School Microeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Physics - # eval": { - "description": "min=151, mean=151, max=151, sum=302 (2)", - "tab": "General information", - "score": 151.0 - }, - "High School Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Physics - truncated": { - "description": 
"min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # prompt tokens": { - "description": "min=553.464, mean=553.464, max=553.464, sum=1106.927 (2)", - "tab": "General information", - "score": 553.4635761589404 - }, - "High School Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Psychology - # eval": { - "description": "min=545, mean=545, max=545, sum=1090 (2)", - "tab": "General information", - "score": 545.0 - }, - "High School Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # prompt tokens": { - "description": "min=488.246, mean=488.246, max=488.246, sum=976.492 (2)", - "tab": "General information", - "score": 488.24587155963303 - }, - "High School Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Statistics - # eval": { - "description": "min=216, mean=216, max=216, sum=432 (2)", - "tab": "General information", - "score": 216.0 - }, - "High School Statistics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Statistics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Statistics - # prompt tokens": { - "description": "min=788.699, mean=788.699, max=788.699, sum=1577.398 (2)", - "tab": "General information", - "score": 788.699074074074 - }, - "High School Statistics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School US History - # eval": { - "description": "min=204, mean=204, max=204, sum=408 (2)", - "tab": "General information", - "score": 204.0 - }, - "High School US History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School US History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # prompt tokens": { - "description": "min=2210.809, mean=2210.809, max=2210.809, sum=4421.618 (2)", - "tab": "General information", - "score": 2210.8088235294117 - }, - "High School US History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School World History - # eval": { - "description": "min=237, mean=237, max=237, sum=474 (2)", - "tab": "General information", - "score": 237.0 - }, - "High School World History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School World History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School World History - # prompt tokens": { - "description": "min=1421.27, mean=1421.27, max=1421.27, sum=2842.54 (2)", - "tab": "General information", - "score": 1421.2700421940929 - }, - "High School World History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": 
"General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" - } - } - }, - { - "evaluation_name": "Human Sexuality", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Human Sexuality", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.908, - "details": { - "description": "min=0.908, mean=0.908, max=0.908, sum=1.817 (2)", - "tab": "Accuracy", - "Human Aging - Observed inference time (s)": { - "description": "min=0.406, mean=0.406, max=0.406, sum=0.812 (2)", - "tab": "Efficiency", - "score": 0.4058152218036053 - }, - "Human Sexuality - Observed inference time (s)": { - "description": "min=0.466, mean=0.466, max=0.466, sum=0.932 (2)", - "tab": "Efficiency", - "score": 0.46620041541470825 - }, - "Human Aging - # eval": { - "description": "min=223, mean=223, max=223, sum=446 (2)", - "tab": "General information", - "score": 223.0 - }, - "Human Aging - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Aging - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Aging - # prompt tokens": { - "description": "min=312.906, mean=312.906, max=312.906, sum=625.812 (2)", - "tab": "General information", - "score": 312.90582959641256 - }, - "Human Aging - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Human Sexuality - # eval": { - "description": "min=131, mean=131, max=131, sum=262 (2)", - "tab": "General information", - "score": 131.0 - }, - "Human Sexuality - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Sexuality - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # prompt tokens": { - "description": "min=334.183, mean=334.183, max=334.183, sum=668.366 (2)", - "tab": "General information", - "score": 334.1832061068702 - }, - "Human Sexuality - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" - } - } - }, - { - "evaluation_name": "International Law", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on International Law", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.917, - "details": { - "description": "min=0.917, mean=0.917, max=0.917, sum=1.835 (2)", - "tab": "Accuracy", - "International Law - Observed inference time (s)": { - "description": "min=0.461, mean=0.461, 
max=0.461, sum=0.922 (2)", - "tab": "Efficiency", - "score": 0.4608367139642889 - }, - "International Law - # eval": { - "description": "min=121, mean=121, max=121, sum=242 (2)", - "tab": "General information", - "score": 121.0 - }, - "International Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "International Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "International Law - # prompt tokens": { - "description": "min=632.851, mean=632.851, max=632.851, sum=1265.702 (2)", - "tab": "General information", - "score": 632.8512396694215 - }, - "International Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" - } - } - }, - { - "evaluation_name": "Logical Fallacies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Logical Fallacies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.871, - "details": { - "description": "min=0.871, mean=0.871, max=0.871, sum=1.742 (2)", - "tab": "Accuracy", - "Logical Fallacies - Observed inference time (s)": { - "description": "min=0.432, mean=0.432, max=0.432, sum=0.864 (2)", - "tab": "Efficiency", - "score": 0.4321035870745138 - }, - "Logical Fallacies - # eval": { - "description": "min=163, mean=163, max=163, sum=326 (2)", - "tab": "General information", - "score": 163.0 - }, - "Logical Fallacies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Logical Fallacies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Logical Fallacies - # prompt tokens": { - "description": "min=442.595, mean=442.595, max=442.595, sum=885.19 (2)", - "tab": "General information", - "score": 442.5950920245399 - }, - "Logical Fallacies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" - } - } - }, - { - "evaluation_name": "Machine Learning", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Machine Learning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.759, - "details": { - "description": "min=0.759, mean=0.759, max=0.759, sum=1.518 (2)", - "tab": "Accuracy", - "Machine Learning - Observed inference time (s)": { - "description": "min=0.463, mean=0.463, max=0.463, sum=0.926 (2)", - "tab": "Efficiency", - "score": 0.46302694933755056 - }, - "Machine Learning - # 
eval": { - "description": "min=112, mean=112, max=112, sum=224 (2)", - "tab": "General information", - "score": 112.0 - }, - "Machine Learning - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Machine Learning - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Machine Learning - # prompt tokens": { - "description": "min=661.054, mean=661.054, max=661.054, sum=1322.107 (2)", - "tab": "General information", - "score": 661.0535714285714 - }, - "Machine Learning - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" - } - } - }, - { - "evaluation_name": "Management", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Management", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.932, - "details": { - "description": "min=0.932, mean=0.932, max=0.932, sum=1.864 (2)", - "tab": "Accuracy", - "Management - Observed inference time (s)": { - "description": "min=0.446, mean=0.446, max=0.446, sum=0.891 (2)", - "tab": "Efficiency", - "score": 0.4455798760201167 - }, - "Management - # eval": { - "description": "min=103, mean=103, max=103, sum=206 (2)", - "tab": "General information", - "score": 103.0 - }, - "Management - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Management - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Management - # prompt tokens": { - "description": "min=276.796, mean=276.796, max=276.796, sum=553.592 (2)", - "tab": "General information", - "score": 276.79611650485435 - }, - "Management - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" - } - } - }, - { - "evaluation_name": "Marketing", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Marketing", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.962, - "details": { - "description": "min=0.962, mean=0.962, max=0.962, sum=1.923 (2)", - "tab": "Accuracy", - "Marketing - Observed inference time (s)": { - "description": "min=0.421, mean=0.421, max=0.421, sum=0.843 (2)", - "tab": "Efficiency", - "score": 0.4213859372668796 - }, - "Marketing - # eval": { - "description": "min=234, mean=234, max=234, sum=468 (2)", - "tab": "General information", - "score": 234.0 - }, - "Marketing - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": 
"General information", - "score": 5.0 - }, - "Marketing - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Marketing - # prompt tokens": { - "description": "min=397.218, mean=397.218, max=397.218, sum=794.436 (2)", - "tab": "General information", - "score": 397.21794871794873 - }, - "Marketing - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" - } - } - }, - { - "evaluation_name": "Medical Genetics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Medical Genetics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.94, - "details": { - "description": "min=0.94, mean=0.94, max=0.94, sum=1.88 (2)", - "tab": "Accuracy", - "Medical Genetics - Observed inference time (s)": { - "description": "min=0.411, mean=0.411, max=0.411, sum=0.823 (2)", - "tab": "Efficiency", - "score": 0.41135803937911986 - }, - "Medical Genetics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Medical Genetics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Medical Genetics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Medical Genetics - # prompt tokens": { - "description": "min=334, mean=334, max=334, sum=668 (2)", - "tab": "General information", - "score": 334.0 - }, - "Medical Genetics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" - } - } - }, - { - "evaluation_name": "Miscellaneous", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Miscellaneous", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.949, - "details": { - "description": "min=0.949, mean=0.949, max=0.949, sum=1.898 (2)", - "tab": "Accuracy", - "Miscellaneous - Observed inference time (s)": { - "description": "min=0.451, mean=0.451, max=0.451, sum=0.901 (2)", - "tab": "Efficiency", - "score": 0.4505587230088001 - }, - "Miscellaneous - # eval": { - "description": "min=783, mean=783, max=783, sum=1566 (2)", - "tab": "General information", - "score": 783.0 - }, - "Miscellaneous - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Miscellaneous - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Miscellaneous - # 
prompt tokens": { - "description": "min=292.925, mean=292.925, max=292.925, sum=585.849 (2)", - "tab": "General information", - "score": 292.92464878671774 - }, - "Miscellaneous - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" - } - } - }, - { - "evaluation_name": "Moral Scenarios", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Moral Scenarios", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.902, - "details": { - "description": "min=0.902, mean=0.902, max=0.902, sum=1.803 (2)", - "tab": "Accuracy", - "Moral Disputes - Observed inference time (s)": { - "description": "min=0.428, mean=0.428, max=0.428, sum=0.856 (2)", - "tab": "Efficiency", - "score": 0.4281756044123214 - }, - "Moral Scenarios - Observed inference time (s)": { - "description": "min=0.445, mean=0.445, max=0.445, sum=0.89 (2)", - "tab": "Efficiency", - "score": 0.44513606945229645 - }, - "Moral Disputes - # eval": { - "description": "min=346, mean=346, max=346, sum=692 (2)", - "tab": "General information", - "score": 346.0 - }, - "Moral Disputes - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Disputes - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Disputes - # prompt tokens": { - "description": "min=469.145, mean=469.145, max=469.145, sum=938.289 (2)", - "tab": "General information", - "score": 469.1445086705202 - }, - "Moral Disputes - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Moral Scenarios - # eval": { - "description": "min=895, mean=895, max=895, sum=1790 (2)", - "tab": "General information", - "score": 895.0 - }, - "Moral Scenarios - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Scenarios - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # prompt tokens": { - "description": "min=649.455, mean=649.455, max=649.455, sum=1298.909 (2)", - "tab": "General information", - "score": 649.454748603352 - }, - "Moral Scenarios - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" - } - } - }, - { - "evaluation_name": "Nutrition", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Nutrition", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.892, - "details": { - "description": "min=0.892, mean=0.892, max=0.892, sum=1.784 (2)", - "tab": "Accuracy", - "Nutrition - Observed inference time (s)": { - "description": "min=0.446, mean=0.446, max=0.446, sum=0.892 (2)", - "tab": "Efficiency", - "score": 0.4460979816960354 - }, - "Nutrition - # eval": { - "description": "min=306, mean=306, max=306, sum=612 (2)", - "tab": "General information", - "score": 306.0 - }, - "Nutrition - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Nutrition - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Nutrition - # prompt tokens": { - "description": "min=579.817, mean=579.817, max=579.817, sum=1159.634 (2)", - "tab": "General information", - "score": 579.8169934640523 - }, - "Nutrition - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" - } - } - }, - { - "evaluation_name": "Prehistory", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Prehistory", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.926, - "details": { - "description": "min=0.926, mean=0.926, max=0.926, sum=1.852 (2)", - "tab": "Accuracy", - "Prehistory - Observed inference time (s)": { - "description": "min=0.426, mean=0.426, max=0.426, sum=0.852 (2)", - "tab": "Efficiency", - "score": 0.42610209665180726 - }, - "Prehistory - # eval": { - "description": "min=324, mean=324, max=324, sum=648 (2)", - "tab": "General information", - "score": 324.0 - }, - "Prehistory - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Prehistory - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Prehistory - # prompt tokens": { - "description": "min=507.559, mean=507.559, max=507.559, sum=1015.117 (2)", - "tab": "General information", - "score": 507.55864197530866 - }, - "Prehistory - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" - } - } - }, - { - "evaluation_name": "Public Relations", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Public Relations", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.745, - "details": { - "description": "min=0.745, mean=0.745, max=0.745, sum=1.491 (2)", - "tab": "Accuracy", - "Public Relations - Observed inference time (s)": { - "description": "min=0.496, mean=0.496, 
max=0.496, sum=0.992 (2)", - "tab": "Efficiency", - "score": 0.49601870450106533 - }, - "Public Relations - # eval": { - "description": "min=110, mean=110, max=110, sum=220 (2)", - "tab": "General information", - "score": 110.0 - }, - "Public Relations - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Public Relations - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Public Relations - # prompt tokens": { - "description": "min=398.318, mean=398.318, max=398.318, sum=796.636 (2)", - "tab": "General information", - "score": 398.3181818181818 - }, - "Public Relations - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" - } - } - }, - { - "evaluation_name": "Security Studies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Security Studies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.861, - "details": { - "description": "min=0.861, mean=0.861, max=0.861, sum=1.722 (2)", - "tab": "Accuracy", - "Security Studies - Observed inference time (s)": { - "description": "min=0.471, mean=0.471, max=0.471, sum=0.941 (2)", - "tab": "Efficiency", - "score": 0.47064581306613223 - }, - "Security Studies - # eval": { - "description": "min=245, mean=245, max=245, sum=490 (2)", - "tab": "General information", - "score": 245.0 - }, - "Security Studies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Security Studies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Security Studies - # prompt tokens": { - "description": "min=1157.473, mean=1157.473, max=1157.473, sum=2314.947 (2)", - "tab": "General information", - "score": 1157.4734693877551 - }, - "Security Studies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" - } - } - }, - { - "evaluation_name": "Sociology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Sociology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.93, - "details": { - "description": "min=0.93, mean=0.93, max=0.93, sum=1.861 (2)", - "tab": "Accuracy", - "Sociology - Observed inference time (s)": { - "description": "min=0.43, mean=0.43, max=0.43, sum=0.86 (2)", - "tab": "Efficiency", - "score": 0.42976075143956427 - }, - "Sociology - # eval": { - "description": "min=201, mean=201, 
max=201, sum=402 (2)", - "tab": "General information", - "score": 201.0 - }, - "Sociology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Sociology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Sociology - # prompt tokens": { - "description": "min=438.522, mean=438.522, max=438.522, sum=877.045 (2)", - "tab": "General information", - "score": 438.5223880597015 - }, - "Sociology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" - } - } - }, - { - "evaluation_name": "Virology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Virology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.596, - "details": { - "description": "min=0.596, mean=0.596, max=0.596, sum=1.193 (2)", - "tab": "Accuracy", - "Virology - Observed inference time (s)": { - "description": "min=0.42, mean=0.42, max=0.42, sum=0.84 (2)", - "tab": "Efficiency", - "score": 0.42023470890091125 - }, - "Virology - # eval": { - "description": "min=166, mean=166, max=166, sum=332 (2)", - "tab": "General information", - "score": 166.0 - }, - "Virology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Virology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Virology - # prompt tokens": { - "description": "min=336.09, mean=336.09, max=336.09, sum=672.181 (2)", - "tab": "General information", - "score": 336.0903614457831 - }, - "Virology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" - } - } - }, - { - "evaluation_name": "World Religions", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on World Religions", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.877, - "details": { - "description": "min=0.877, mean=0.877, max=0.877, sum=1.754 (2)", - "tab": "Accuracy", - "World Religions - Observed inference time (s)": { - "description": "min=0.451, mean=0.451, max=0.451, sum=0.901 (2)", - "tab": "Efficiency", - "score": 0.4507097779658803 - }, - "World Religions - # eval": { - "description": "min=171, mean=171, max=171, sum=342 (2)", - "tab": "General information", - "score": 171.0 - }, - "World Religions - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "World Religions - truncated": { - 
"description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "World Religions - # prompt tokens": { - "description": "min=268.561, mean=268.561, max=268.561, sum=537.123 (2)", - "tab": "General information", - "score": 268.56140350877195 - }, - "World Religions - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" - } - } - }, - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.517, - "details": { - "tab": "Efficiency" - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_mmlu/openai/gpt-4-1106-preview/625d33ce-a320-4bfd-a962-451b8c22d392.json b/data/helm_mmlu/openai/gpt-4-1106-preview/625d33ce-a320-4bfd-a962-451b8c22d392.json deleted file mode 100644 index 610be9719..000000000 --- a/data/helm_mmlu/openai/gpt-4-1106-preview/625d33ce-a320-4bfd-a962-451b8c22d392.json +++ /dev/null @@ -1,3021 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/openai_gpt-4-1106-preview/1770835937.459157", - "retrieved_timestamp": "1770835937.459157", - "source_metadata": { - "source_name": "helm_mmlu", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "GPT-4 Turbo 1106 preview", - "id": "openai/gpt-4-1106-preview", - "developer": "openai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "MMLU All Subjects", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU All Subjects", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.796, - "details": { - "description": "min=0.093, mean=0.796, max=0.979, sum=90.688 (114)", - "tab": "Accuracy", - "MMLU All Subjects - Observed inference time (s)": { - "description": "min=0.397, mean=0.537, max=0.852, sum=61.247 (114)", - "tab": "Efficiency", - "score": 0.5372507053364665 - }, - "MMLU All Subjects - # eval": { - "description": "min=100, mean=246.351, max=1534, sum=28084 (114)", - "tab": "General information", - "score": 246.35087719298247 - }, - "MMLU All Subjects - # train": { - "description": "min=5, mean=5, max=5, sum=570 (114)", - "tab": "General information", - "score": 5.0 - }, - "MMLU All Subjects - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (114)", - "tab": "General information", - "score": 0.0 - }, - "MMLU All Subjects - # prompt tokens": { - "description": "min=268.561, mean=607.852, max=2791.073, sum=69295.086 (114)", - "tab": "General information", - "score": 607.851634217556 - 
}, - "MMLU All Subjects - # output tokens": { - "description": "min=1, mean=1, max=1, sum=114 (114)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] - } - } - }, - { - "evaluation_name": "Abstract Algebra", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - 
"evaluation_description": "EM on Abstract Algebra", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.53, - "details": { - "description": "min=0.53, mean=0.53, max=0.53, sum=1.06 (2)", - "tab": "Accuracy", - "Abstract Algebra - Observed inference time (s)": { - "description": "min=0.425, mean=0.425, max=0.425, sum=0.85 (2)", - "tab": "Efficiency", - "score": 0.42504594564437864 - }, - "Abstract Algebra - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Abstract Algebra - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Abstract Algebra - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Abstract Algebra - # prompt tokens": { - "description": "min=366.44, mean=366.44, max=366.44, sum=732.88 (2)", - "tab": "General information", - "score": 366.44 - }, - "Abstract Algebra - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" - } - } - }, - { - "evaluation_name": "Anatomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Anatomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.807, - "details": { - "description": "min=0.807, mean=0.807, max=0.807, sum=1.615 (2)", - "tab": "Accuracy", - "Anatomy - Observed inference time (s)": { - "description": "min=0.569, mean=0.569, max=0.569, sum=1.138 (2)", - "tab": "Efficiency", - "score": 0.5691532982720269 - }, - "Anatomy - # eval": { - "description": "min=135, mean=135, max=135, sum=270 (2)", - "tab": "General information", - "score": 135.0 - }, - "Anatomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Anatomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Anatomy - # prompt tokens": { - "description": "min=346.978, mean=346.978, max=346.978, sum=693.956 (2)", - "tab": "General information", - "score": 346.97777777777776 - }, - "Anatomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" - } - } - }, - { - "evaluation_name": "College Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on College Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.402, - "details": { - "description": "min=0.402, 
mean=0.402, max=0.402, sum=0.804 (2)", - "tab": "Accuracy", - "College Chemistry - Observed inference time (s)": { - "description": "min=0.457, mean=0.457, max=0.457, sum=0.913 (2)", - "tab": "Efficiency", - "score": 0.456736900806427 - }, - "College Biology - Observed inference time (s)": { - "description": "min=0.444, mean=0.444, max=0.444, sum=0.888 (2)", - "tab": "Efficiency", - "score": 0.44404302537441254 - }, - "College Computer Science - Observed inference time (s)": { - "description": "min=0.516, mean=0.516, max=0.516, sum=1.033 (2)", - "tab": "Efficiency", - "score": 0.516348373889923 - }, - "College Mathematics - Observed inference time (s)": { - "description": "min=0.534, mean=0.534, max=0.534, sum=1.067 (2)", - "tab": "Efficiency", - "score": 0.5335026264190674 - }, - "College Medicine - Observed inference time (s)": { - "description": "min=0.491, mean=0.491, max=0.491, sum=0.982 (2)", - "tab": "Efficiency", - "score": 0.4908691348368033 - }, - "College Physics - Observed inference time (s)": { - "description": "min=0.75, mean=0.75, max=0.75, sum=1.499 (2)", - "tab": "Efficiency", - "score": 0.7497045245825076 - }, - "College Chemistry - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Chemistry - # prompt tokens": { - "description": "min=542.4, mean=542.4, max=542.4, sum=1084.8 (2)", - "tab": "General information", - "score": 542.4 - }, - "College Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Biology - # eval": { - "description": "min=144, mean=144, max=144, sum=288 (2)", - "tab": "General information", - "score": 144.0 - }, - "College Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # prompt tokens": { - "description": "min=466.917, mean=466.917, max=466.917, sum=933.833 (2)", - "tab": "General information", - "score": 466.9166666666667 - }, - "College Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer Science - # prompt tokens": { - "description": "min=821.39, mean=821.39, max=821.39, sum=1642.78 (2)", - "tab": "General information", - "score": 821.39 - }, - "College Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Mathematics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - 
"score": 100.0 - }, - "College Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # prompt tokens": { - "description": "min=587.52, mean=587.52, max=587.52, sum=1175.04 (2)", - "tab": "General information", - "score": 587.52 - }, - "College Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Medicine - # eval": { - "description": "min=173, mean=173, max=173, sum=346 (2)", - "tab": "General information", - "score": 173.0 - }, - "College Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Medicine - # prompt tokens": { - "description": "min=495.728, mean=495.728, max=495.728, sum=991.457 (2)", - "tab": "General information", - "score": 495.728323699422 - }, - "College Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Physics - # eval": { - "description": "min=102, mean=102, max=102, sum=204 (2)", - "tab": "General information", - "score": 102.0 - }, - "College Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Physics - # prompt tokens": { - "description": "min=496.608, mean=496.608, max=496.608, sum=993.216 (2)", - "tab": "General information", - "score": 496.6078431372549 - }, - "College Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" - } - } - }, - { - "evaluation_name": "Computer Security", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Computer Security", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.86, - "details": { - "description": "min=0.86, mean=0.86, max=0.86, sum=1.72 (2)", - "tab": "Accuracy", - "Computer Security - Observed inference time (s)": { - "description": "min=0.442, mean=0.442, max=0.442, sum=0.884 (2)", - "tab": "Efficiency", - "score": 0.4418716287612915 - }, - "Computer Security - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Computer Security - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Computer Security - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Computer Security - # prompt 
tokens": { - "description": "min=371.54, mean=371.54, max=371.54, sum=743.08 (2)", - "tab": "General information", - "score": 371.54 - }, - "Computer Security - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" - } - } - }, - { - "evaluation_name": "Econometrics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Econometrics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.675, - "details": { - "description": "min=0.675, mean=0.675, max=0.675, sum=1.351 (2)", - "tab": "Accuracy", - "Econometrics - Observed inference time (s)": { - "description": "min=0.515, mean=0.515, max=0.515, sum=1.03 (2)", - "tab": "Efficiency", - "score": 0.5149402095560442 - }, - "Econometrics - # eval": { - "description": "min=114, mean=114, max=114, sum=228 (2)", - "tab": "General information", - "score": 114.0 - }, - "Econometrics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Econometrics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Econometrics - # prompt tokens": { - "description": "min=607.43, mean=607.43, max=607.43, sum=1214.86 (2)", - "tab": "General information", - "score": 607.4298245614035 - }, - "Econometrics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" - } - } - }, - { - "evaluation_name": "Global Facts", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Global Facts", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.58, - "details": { - "description": "min=0.58, mean=0.58, max=0.58, sum=1.16 (2)", - "tab": "Accuracy", - "Global Facts - Observed inference time (s)": { - "description": "min=0.486, mean=0.486, max=0.486, sum=0.973 (2)", - "tab": "Efficiency", - "score": 0.4863955807685852 - }, - "Global Facts - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Global Facts - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Global Facts - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Global Facts - # prompt tokens": { - "description": "min=392.71, mean=392.71, max=392.71, sum=785.42 (2)", - "tab": "General information", - "score": 392.71 - }, - "Global Facts - # output tokens": { - "description": "min=1, mean=1, max=1, 
sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" - } - } - }, - { - "evaluation_name": "Jurisprudence", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Jurisprudence", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.889, - "details": { - "description": "min=0.889, mean=0.889, max=0.889, sum=1.778 (2)", - "tab": "Accuracy", - "Jurisprudence - Observed inference time (s)": { - "description": "min=0.731, mean=0.731, max=0.731, sum=1.462 (2)", - "tab": "Efficiency", - "score": 0.7311423023541769 - }, - "Jurisprudence - # eval": { - "description": "min=108, mean=108, max=108, sum=216 (2)", - "tab": "General information", - "score": 108.0 - }, - "Jurisprudence - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Jurisprudence - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Jurisprudence - # prompt tokens": { - "description": "min=387.639, mean=387.639, max=387.639, sum=775.278 (2)", - "tab": "General information", - "score": 387.6388888888889 - }, - "Jurisprudence - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" - } - } - }, - { - "evaluation_name": "Philosophy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Philosophy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.852, - "details": { - "description": "min=0.852, mean=0.852, max=0.852, sum=1.704 (2)", - "tab": "Accuracy", - "Philosophy - Observed inference time (s)": { - "description": "min=0.486, mean=0.486, max=0.486, sum=0.973 (2)", - "tab": "Efficiency", - "score": 0.4863421380328212 - }, - "Philosophy - # eval": { - "description": "min=311, mean=311, max=311, sum=622 (2)", - "tab": "General information", - "score": 311.0 - }, - "Philosophy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Philosophy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Philosophy - # prompt tokens": { - "description": "min=322.084, mean=322.084, max=322.084, sum=644.167 (2)", - "tab": "General information", - "score": 322.08360128617363 - }, - "Philosophy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", 
- "groups": "mmlu_philosophy" - } - } - }, - { - "evaluation_name": "Professional Psychology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Professional Psychology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.887, - "details": { - "description": "min=0.887, mean=0.887, max=0.887, sum=1.775 (2)", - "tab": "Accuracy", - "Professional Medicine - Observed inference time (s)": { - "description": "min=0.551, mean=0.551, max=0.551, sum=1.103 (2)", - "tab": "Efficiency", - "score": 0.5514215528964996 - }, - "Professional Accounting - Observed inference time (s)": { - "description": "min=0.54, mean=0.54, max=0.54, sum=1.079 (2)", - "tab": "Efficiency", - "score": 0.5395518828791084 - }, - "Professional Law - Observed inference time (s)": { - "description": "min=0.616, mean=0.616, max=0.616, sum=1.232 (2)", - "tab": "Efficiency", - "score": 0.6162493903447317 - }, - "Professional Psychology - Observed inference time (s)": { - "description": "min=0.563, mean=0.563, max=0.563, sum=1.126 (2)", - "tab": "Efficiency", - "score": 0.5629562961509804 - }, - "Professional Medicine - # eval": { - "description": "min=272, mean=272, max=272, sum=544 (2)", - "tab": "General information", - "score": 272.0 - }, - "Professional Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Medicine - # prompt tokens": { - "description": "min=1087.585, mean=1087.585, max=1087.585, sum=2175.169 (2)", - "tab": "General information", - "score": 1087.5845588235295 - }, - "Professional Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Accounting - # eval": { - "description": "min=282, mean=282, max=282, sum=564 (2)", - "tab": "General information", - "score": 282.0 - }, - "Professional Accounting - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Accounting - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Accounting - # prompt tokens": { - "description": "min=651.592, mean=651.592, max=651.592, sum=1303.184 (2)", - "tab": "General information", - "score": 651.5921985815603 - }, - "Professional Accounting - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Law - # eval": { - "description": "min=1534, mean=1534, max=1534, sum=3068 (2)", - "tab": "General information", - "score": 1534.0 - }, - "Professional Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # prompt tokens": { - "description": "min=1630.787, mean=1630.787, max=1630.787, sum=3261.574 (2)", - "tab": "General information", - "score": 
1630.7868318122555 - }, - "Professional Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Psychology - # eval": { - "description": "min=612, mean=612, max=612, sum=1224 (2)", - "tab": "General information", - "score": 612.0 - }, - "Professional Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # prompt tokens": { - "description": "min=568.114, mean=568.114, max=568.114, sum=1136.229 (2)", - "tab": "General information", - "score": 568.1143790849674 - }, - "Professional Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" - } - } - }, - { - "evaluation_name": "Us Foreign Policy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Us Foreign Policy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.96, - "details": { - "description": "min=0.96, mean=0.96, max=0.96, sum=1.92 (2)", - "tab": "Accuracy", - "Us Foreign Policy - Observed inference time (s)": { - "description": "min=0.397, mean=0.397, max=0.397, sum=0.794 (2)", - "tab": "Efficiency", - "score": 0.39724321842193605 - }, - "Us Foreign Policy - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Us Foreign Policy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Us Foreign Policy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Us Foreign Policy - # prompt tokens": { - "description": "min=415.79, mean=415.79, max=415.79, sum=831.58 (2)", - "tab": "General information", - "score": 415.79 - }, - "Us Foreign Policy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" - } - } - }, - { - "evaluation_name": "Astronomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Astronomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.941, - "details": { - "description": "min=0.941, mean=0.941, max=0.941, sum=1.882 (2)", - "tab": "Accuracy", - "Astronomy - Observed inference time (s)": { - "description": "min=0.519, mean=0.519, max=0.519, sum=1.038 
(2)", - "tab": "Efficiency", - "score": 0.5192367622726842 - }, - "Astronomy - # eval": { - "description": "min=152, mean=152, max=152, sum=304 (2)", - "tab": "General information", - "score": 152.0 - }, - "Astronomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Astronomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Astronomy - # prompt tokens": { - "description": "min=572.691, mean=572.691, max=572.691, sum=1145.382 (2)", - "tab": "General information", - "score": 572.6907894736842 - }, - "Astronomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" - } - } - }, - { - "evaluation_name": "Business Ethics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Business Ethics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.78, - "details": { - "description": "min=0.78, mean=0.78, max=0.78, sum=1.56 (2)", - "tab": "Accuracy", - "Business Ethics - Observed inference time (s)": { - "description": "min=0.495, mean=0.495, max=0.495, sum=0.99 (2)", - "tab": "Efficiency", - "score": 0.49495640993118284 - }, - "Business Ethics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Business Ethics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Business Ethics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Business Ethics - # prompt tokens": { - "description": "min=562.52, mean=562.52, max=562.52, sum=1125.04 (2)", - "tab": "General information", - "score": 562.52 - }, - "Business Ethics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" - } - } - }, - { - "evaluation_name": "Clinical Knowledge", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Clinical Knowledge", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.864, - "details": { - "description": "min=0.864, mean=0.864, max=0.864, sum=1.728 (2)", - "tab": "Accuracy", - "Clinical Knowledge - Observed inference time (s)": { - "description": "min=0.642, mean=0.642, max=0.642, sum=1.284 (2)", - "tab": "Efficiency", - "score": 0.6421918509141454 - }, - "Clinical Knowledge - # eval": { - "description": "min=265, mean=265, max=265, sum=530 (2)", - "tab": "General information", - 
"score": 265.0 - }, - "Clinical Knowledge - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Clinical Knowledge - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Clinical Knowledge - # prompt tokens": { - "description": "min=390.947, mean=390.947, max=390.947, sum=781.894 (2)", - "tab": "General information", - "score": 390.94716981132075 - }, - "Clinical Knowledge - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" - } - } - }, - { - "evaluation_name": "Conceptual Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Conceptual Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.894, - "details": { - "description": "min=0.894, mean=0.894, max=0.894, sum=1.787 (2)", - "tab": "Accuracy", - "Conceptual Physics - Observed inference time (s)": { - "description": "min=0.586, mean=0.586, max=0.586, sum=1.172 (2)", - "tab": "Efficiency", - "score": 0.5859095319788507 - }, - "Conceptual Physics - # eval": { - "description": "min=235, mean=235, max=235, sum=470 (2)", - "tab": "General information", - "score": 235.0 - }, - "Conceptual Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Conceptual Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Conceptual Physics - # prompt tokens": { - "description": "min=297.838, mean=297.838, max=297.838, sum=595.677 (2)", - "tab": "General information", - "score": 297.83829787234043 - }, - "Conceptual Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" - } - } - }, - { - "evaluation_name": "Electrical Engineering", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Electrical Engineering", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.772, - "details": { - "description": "min=0.772, mean=0.772, max=0.772, sum=1.545 (2)", - "tab": "Accuracy", - "Electrical Engineering - Observed inference time (s)": { - "description": "min=0.507, mean=0.507, max=0.507, sum=1.014 (2)", - "tab": "Efficiency", - "score": 0.5071375830420133 - }, - "Electrical Engineering - # eval": { - "description": "min=145, mean=145, max=145, sum=290 (2)", - "tab": "General information", - "score": 145.0 - }, - "Electrical Engineering - # train": { - 
"description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Electrical Engineering - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Electrical Engineering - # prompt tokens": { - "description": "min=433.641, mean=433.641, max=433.641, sum=867.283 (2)", - "tab": "General information", - "score": 433.6413793103448 - }, - "Electrical Engineering - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" - } - } - }, - { - "evaluation_name": "Elementary Mathematics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Elementary Mathematics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.638, - "details": { - "description": "min=0.638, mean=0.638, max=0.638, sum=1.275 (2)", - "tab": "Accuracy", - "Elementary Mathematics - Observed inference time (s)": { - "description": "min=0.486, mean=0.486, max=0.486, sum=0.972 (2)", - "tab": "Efficiency", - "score": 0.48600239034682985 - }, - "Elementary Mathematics - # eval": { - "description": "min=378, mean=378, max=378, sum=756 (2)", - "tab": "General information", - "score": 378.0 - }, - "Elementary Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Elementary Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Elementary Mathematics - # prompt tokens": { - "description": "min=524.862, mean=524.862, max=524.862, sum=1049.725 (2)", - "tab": "General information", - "score": 524.8624338624338 - }, - "Elementary Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" - } - } - }, - { - "evaluation_name": "Formal Logic", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Formal Logic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.651, - "details": { - "description": "min=0.651, mean=0.651, max=0.651, sum=1.302 (2)", - "tab": "Accuracy", - "Formal Logic - Observed inference time (s)": { - "description": "min=0.491, mean=0.491, max=0.491, sum=0.983 (2)", - "tab": "Efficiency", - "score": 0.4912937557886517 - }, - "Formal Logic - # eval": { - "description": "min=126, mean=126, max=126, sum=252 (2)", - "tab": "General information", - "score": 126.0 - }, - "Formal Logic - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - 
"tab": "General information", - "score": 5.0 - }, - "Formal Logic - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Formal Logic - # prompt tokens": { - "description": "min=599.762, mean=599.762, max=599.762, sum=1199.524 (2)", - "tab": "General information", - "score": 599.7619047619048 - }, - "Formal Logic - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" - } - } - }, - { - "evaluation_name": "High School World History", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on High School World History", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.958, - "details": { - "description": "min=0.958, mean=0.958, max=0.958, sum=1.916 (2)", - "tab": "Accuracy", - "High School Biology - Observed inference time (s)": { - "description": "min=0.572, mean=0.572, max=0.572, sum=1.144 (2)", - "tab": "Efficiency", - "score": 0.5719813362244637 - }, - "High School Chemistry - Observed inference time (s)": { - "description": "min=0.656, mean=0.656, max=0.656, sum=1.312 (2)", - "tab": "Efficiency", - "score": 0.6560086276143643 - }, - "High School Computer Science - Observed inference time (s)": { - "description": "min=0.568, mean=0.568, max=0.568, sum=1.137 (2)", - "tab": "Efficiency", - "score": 0.5683712005615235 - }, - "High School European History - Observed inference time (s)": { - "description": "min=0.64, mean=0.64, max=0.64, sum=1.28 (2)", - "tab": "Efficiency", - "score": 0.6399081995992949 - }, - "High School Geography - Observed inference time (s)": { - "description": "min=0.471, mean=0.471, max=0.471, sum=0.943 (2)", - "tab": "Efficiency", - "score": 0.47148694173254146 - }, - "High School Government And Politics - Observed inference time (s)": { - "description": "min=0.42, mean=0.42, max=0.42, sum=0.84 (2)", - "tab": "Efficiency", - "score": 0.420210268831006 - }, - "High School Macroeconomics - Observed inference time (s)": { - "description": "min=0.445, mean=0.445, max=0.445, sum=0.89 (2)", - "tab": "Efficiency", - "score": 0.4451567802673731 - }, - "High School Mathematics - Observed inference time (s)": { - "description": "min=0.434, mean=0.434, max=0.434, sum=0.868 (2)", - "tab": "Efficiency", - "score": 0.43410645679191306 - }, - "High School Microeconomics - Observed inference time (s)": { - "description": "min=0.656, mean=0.656, max=0.656, sum=1.312 (2)", - "tab": "Efficiency", - "score": 0.6560712812327537 - }, - "High School Physics - Observed inference time (s)": { - "description": "min=0.574, mean=0.574, max=0.574, sum=1.148 (2)", - "tab": "Efficiency", - "score": 0.5739512143545593 - }, - "High School Psychology - Observed inference time (s)": { - "description": "min=0.446, mean=0.446, max=0.446, sum=0.892 (2)", - "tab": "Efficiency", - "score": 0.4460442779261038 - }, - "High School Statistics - Observed inference time (s)": { - "description": "min=0.586, mean=0.586, max=0.586, sum=1.171 (2)", - "tab": "Efficiency", - "score": 0.5855172486216934 - }, - "High 
School US History - Observed inference time (s)": { - "description": "min=0.579, mean=0.579, max=0.579, sum=1.158 (2)", - "tab": "Efficiency", - "score": 0.5790434245969734 - }, - "High School World History - Observed inference time (s)": { - "description": "min=0.643, mean=0.643, max=0.643, sum=1.285 (2)", - "tab": "Efficiency", - "score": 0.6425194448559596 - }, - "High School Biology - # eval": { - "description": "min=310, mean=310, max=310, sum=620 (2)", - "tab": "General information", - "score": 310.0 - }, - "High School Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Biology - # prompt tokens": { - "description": "min=506.677, mean=506.677, max=506.677, sum=1013.355 (2)", - "tab": "General information", - "score": 506.6774193548387 - }, - "High School Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Chemistry - # eval": { - "description": "min=203, mean=203, max=203, sum=406 (2)", - "tab": "General information", - "score": 203.0 - }, - "High School Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # prompt tokens": { - "description": "min=489.714, mean=489.714, max=489.714, sum=979.429 (2)", - "tab": "General information", - "score": 489.7142857142857 - }, - "High School Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "High School Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # prompt tokens": { - "description": "min=860.78, mean=860.78, max=860.78, sum=1721.56 (2)", - "tab": "General information", - "score": 860.78 - }, - "High School Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School European History - # eval": { - "description": "min=165, mean=165, max=165, sum=330 (2)", - "tab": "General information", - "score": 165.0 - }, - "High School European History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School European History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School European History - # prompt tokens": { - "description": "min=2791.073, mean=2791.073, max=2791.073, sum=5582.145 (2)", - "tab": "General information", - "score": 2791.072727272727 - }, - "High School European History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High 
School Geography - # eval": { - "description": "min=198, mean=198, max=198, sum=396 (2)", - "tab": "General information", - "score": 198.0 - }, - "High School Geography - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Geography - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Geography - # prompt tokens": { - "description": "min=365.045, mean=365.045, max=365.045, sum=730.091 (2)", - "tab": "General information", - "score": 365.04545454545456 - }, - "High School Geography - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Government And Politics - # eval": { - "description": "min=193, mean=193, max=193, sum=386 (2)", - "tab": "General information", - "score": 193.0 - }, - "High School Government And Politics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Government And Politics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # prompt tokens": { - "description": "min=458.824, mean=458.824, max=458.824, sum=917.648 (2)", - "tab": "General information", - "score": 458.8238341968912 - }, - "High School Government And Politics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Macroeconomics - # eval": { - "description": "min=390, mean=390, max=390, sum=780 (2)", - "tab": "General information", - "score": 390.0 - }, - "High School Macroeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Macroeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - # prompt tokens": { - "description": "min=364.562, mean=364.562, max=364.562, sum=729.123 (2)", - "tab": "General information", - "score": 364.5615384615385 - }, - "High School Macroeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Mathematics - # eval": { - "description": "min=270, mean=270, max=270, sum=540 (2)", - "tab": "General information", - "score": 270.0 - }, - "High School Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # prompt tokens": { - "description": "min=525.374, mean=525.374, max=525.374, sum=1050.748 (2)", - "tab": "General information", - "score": 525.3740740740741 - }, - "High School Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Microeconomics - # eval": { - "description": "min=238, mean=238, max=238, sum=476 (2)", - "tab": "General information", - "score": 238.0 - }, - "High School Microeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School 
Microeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Microeconomics - # prompt tokens": { - "description": "min=392.025, mean=392.025, max=392.025, sum=784.05 (2)", - "tab": "General information", - "score": 392.02521008403363 - }, - "High School Microeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Physics - # eval": { - "description": "min=151, mean=151, max=151, sum=302 (2)", - "tab": "General information", - "score": 151.0 - }, - "High School Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # prompt tokens": { - "description": "min=553.464, mean=553.464, max=553.464, sum=1106.927 (2)", - "tab": "General information", - "score": 553.4635761589404 - }, - "High School Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Psychology - # eval": { - "description": "min=545, mean=545, max=545, sum=1090 (2)", - "tab": "General information", - "score": 545.0 - }, - "High School Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # prompt tokens": { - "description": "min=488.246, mean=488.246, max=488.246, sum=976.492 (2)", - "tab": "General information", - "score": 488.24587155963303 - }, - "High School Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Statistics - # eval": { - "description": "min=216, mean=216, max=216, sum=432 (2)", - "tab": "General information", - "score": 216.0 - }, - "High School Statistics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Statistics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Statistics - # prompt tokens": { - "description": "min=788.699, mean=788.699, max=788.699, sum=1577.398 (2)", - "tab": "General information", - "score": 788.699074074074 - }, - "High School Statistics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School US History - # eval": { - "description": "min=204, mean=204, max=204, sum=408 (2)", - "tab": "General information", - "score": 204.0 - }, - "High School US History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School US History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # prompt tokens": { - "description": "min=2210.809, mean=2210.809, max=2210.809, sum=4421.618 (2)", - "tab": "General information", - "score": 2210.8088235294117 - }, - "High School US History - # output tokens": { - "description": "min=1, mean=1, 
max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School World History - # eval": { - "description": "min=237, mean=237, max=237, sum=474 (2)", - "tab": "General information", - "score": 237.0 - }, - "High School World History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School World History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School World History - # prompt tokens": { - "description": "min=1421.27, mean=1421.27, max=1421.27, sum=2842.54 (2)", - "tab": "General information", - "score": 1421.2700421940929 - }, - "High School World History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" - } - } - }, - { - "evaluation_name": "Human Sexuality", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Human Sexuality", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.908, - "details": { - "description": "min=0.908, mean=0.908, max=0.908, sum=1.817 (2)", - "tab": "Accuracy", - "Human Aging - Observed inference time (s)": { - "description": "min=0.472, mean=0.472, max=0.472, sum=0.944 (2)", - "tab": "Efficiency", - "score": 0.47213134316585526 - }, - "Human Sexuality - Observed inference time (s)": { - "description": "min=0.515, mean=0.515, max=0.515, sum=1.03 (2)", - "tab": "Efficiency", - "score": 0.5152236923916649 - }, - "Human Aging - # eval": { - "description": "min=223, mean=223, max=223, sum=446 (2)", - "tab": "General information", - "score": 223.0 - }, - "Human Aging - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Aging - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Aging - # prompt tokens": { - "description": "min=312.906, mean=312.906, max=312.906, sum=625.812 (2)", - "tab": "General information", - "score": 312.90582959641256 - }, - "Human Aging - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Human Sexuality - # eval": { - "description": "min=131, mean=131, max=131, sum=262 (2)", - "tab": "General information", - "score": 131.0 - }, - "Human Sexuality - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Sexuality - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # prompt tokens": { - "description": "min=334.183, mean=334.183, max=334.183, sum=668.366 (2)", - "tab": "General information", - "score": 334.1832061068702 - }, - "Human Sexuality - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - 
"additional_details": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" - } - } - }, - { - "evaluation_name": "International Law", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on International Law", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.926, - "details": { - "description": "min=0.926, mean=0.926, max=0.926, sum=1.851 (2)", - "tab": "Accuracy", - "International Law - Observed inference time (s)": { - "description": "min=0.523, mean=0.523, max=0.523, sum=1.046 (2)", - "tab": "Efficiency", - "score": 0.5229926621618349 - }, - "International Law - # eval": { - "description": "min=121, mean=121, max=121, sum=242 (2)", - "tab": "General information", - "score": 121.0 - }, - "International Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "International Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "International Law - # prompt tokens": { - "description": "min=632.851, mean=632.851, max=632.851, sum=1265.702 (2)", - "tab": "General information", - "score": 632.8512396694215 - }, - "International Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" - } - } - }, - { - "evaluation_name": "Logical Fallacies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Logical Fallacies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.865, - "details": { - "description": "min=0.865, mean=0.865, max=0.865, sum=1.73 (2)", - "tab": "Accuracy", - "Logical Fallacies - Observed inference time (s)": { - "description": "min=0.699, mean=0.699, max=0.699, sum=1.398 (2)", - "tab": "Efficiency", - "score": 0.6990647155083031 - }, - "Logical Fallacies - # eval": { - "description": "min=163, mean=163, max=163, sum=326 (2)", - "tab": "General information", - "score": 163.0 - }, - "Logical Fallacies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Logical Fallacies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Logical Fallacies - # prompt tokens": { - "description": "min=442.595, mean=442.595, max=442.595, sum=885.19 (2)", - "tab": "General information", - "score": 442.5950920245399 - }, - "Logical Fallacies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": 
"test", - "groups": "mmlu_logical_fallacies" - } - } - }, - { - "evaluation_name": "Machine Learning", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Machine Learning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.723, - "details": { - "description": "min=0.723, mean=0.723, max=0.723, sum=1.446 (2)", - "tab": "Accuracy", - "Machine Learning - Observed inference time (s)": { - "description": "min=0.606, mean=0.606, max=0.606, sum=1.211 (2)", - "tab": "Efficiency", - "score": 0.6055374975715365 - }, - "Machine Learning - # eval": { - "description": "min=112, mean=112, max=112, sum=224 (2)", - "tab": "General information", - "score": 112.0 - }, - "Machine Learning - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Machine Learning - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Machine Learning - # prompt tokens": { - "description": "min=661.054, mean=661.054, max=661.054, sum=1322.107 (2)", - "tab": "General information", - "score": 661.0535714285714 - }, - "Machine Learning - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" - } - } - }, - { - "evaluation_name": "Management", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Management", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.913, - "details": { - "description": "min=0.913, mean=0.913, max=0.913, sum=1.825 (2)", - "tab": "Accuracy", - "Management - Observed inference time (s)": { - "description": "min=0.576, mean=0.576, max=0.576, sum=1.152 (2)", - "tab": "Efficiency", - "score": 0.5760108475546235 - }, - "Management - # eval": { - "description": "min=103, mean=103, max=103, sum=206 (2)", - "tab": "General information", - "score": 103.0 - }, - "Management - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Management - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Management - # prompt tokens": { - "description": "min=276.796, mean=276.796, max=276.796, sum=553.592 (2)", - "tab": "General information", - "score": 276.79611650485435 - }, - "Management - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" - } - } - }, - { - "evaluation_name": "Marketing", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Marketing", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.932, - "details": { - "description": "min=0.932, mean=0.932, max=0.932, sum=1.863 (2)", - "tab": "Accuracy", - "Marketing - Observed inference time (s)": { - "description": "min=0.495, mean=0.495, max=0.495, sum=0.991 (2)", - "tab": "Efficiency", - "score": 0.49540983204148775 - }, - "Marketing - # eval": { - "description": "min=234, mean=234, max=234, sum=468 (2)", - "tab": "General information", - "score": 234.0 - }, - "Marketing - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Marketing - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Marketing - # prompt tokens": { - "description": "min=397.218, mean=397.218, max=397.218, sum=794.436 (2)", - "tab": "General information", - "score": 397.21794871794873 - }, - "Marketing - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" - } - } - }, - { - "evaluation_name": "Medical Genetics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Medical Genetics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.93, - "details": { - "description": "min=0.93, mean=0.93, max=0.93, sum=1.86 (2)", - "tab": "Accuracy", - "Medical Genetics - Observed inference time (s)": { - "description": "min=0.541, mean=0.541, max=0.541, sum=1.082 (2)", - "tab": "Efficiency", - "score": 0.5407642388343811 - }, - "Medical Genetics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Medical Genetics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Medical Genetics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Medical Genetics - # prompt tokens": { - "description": "min=334, mean=334, max=334, sum=668 (2)", - "tab": "General information", - "score": 334.0 - }, - "Medical Genetics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" - } - } - }, - { - "evaluation_name": "Miscellaneous", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Miscellaneous", - "lower_is_better": false, 
- "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.946, - "details": { - "description": "min=0.946, mean=0.946, max=0.946, sum=1.893 (2)", - "tab": "Accuracy", - "Miscellaneous - Observed inference time (s)": { - "description": "min=0.474, mean=0.474, max=0.474, sum=0.947 (2)", - "tab": "Efficiency", - "score": 0.4736132238103055 - }, - "Miscellaneous - # eval": { - "description": "min=783, mean=783, max=783, sum=1566 (2)", - "tab": "General information", - "score": 783.0 - }, - "Miscellaneous - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Miscellaneous - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Miscellaneous - # prompt tokens": { - "description": "min=292.925, mean=292.925, max=292.925, sum=585.849 (2)", - "tab": "General information", - "score": 292.92464878671774 - }, - "Miscellaneous - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" - } - } - }, - { - "evaluation_name": "Moral Scenarios", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Moral Scenarios", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.816, - "details": { - "description": "min=0.816, mean=0.816, max=0.816, sum=1.631 (2)", - "tab": "Accuracy", - "Moral Disputes - Observed inference time (s)": { - "description": "min=0.451, mean=0.451, max=0.451, sum=0.901 (2)", - "tab": "Efficiency", - "score": 0.45068276686475456 - }, - "Moral Scenarios - Observed inference time (s)": { - "description": "min=0.545, mean=0.545, max=0.545, sum=1.09 (2)", - "tab": "Efficiency", - "score": 0.5448215519249773 - }, - "Moral Disputes - # eval": { - "description": "min=346, mean=346, max=346, sum=692 (2)", - "tab": "General information", - "score": 346.0 - }, - "Moral Disputes - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Disputes - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Disputes - # prompt tokens": { - "description": "min=469.145, mean=469.145, max=469.145, sum=938.289 (2)", - "tab": "General information", - "score": 469.1445086705202 - }, - "Moral Disputes - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Moral Scenarios - # eval": { - "description": "min=895, mean=895, max=895, sum=1790 (2)", - "tab": "General information", - "score": 895.0 - }, - "Moral Scenarios - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Scenarios - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # prompt tokens": { - "description": "min=649.455, mean=649.455, max=649.455, sum=1298.909 (2)", - 
"tab": "General information", - "score": 649.454748603352 - }, - "Moral Scenarios - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" - } - } - }, - { - "evaluation_name": "Nutrition", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Nutrition", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.879, - "details": { - "description": "min=0.879, mean=0.879, max=0.879, sum=1.758 (2)", - "tab": "Accuracy", - "Nutrition - Observed inference time (s)": { - "description": "min=0.441, mean=0.441, max=0.441, sum=0.882 (2)", - "tab": "Efficiency", - "score": 0.4411514296251185 - }, - "Nutrition - # eval": { - "description": "min=306, mean=306, max=306, sum=612 (2)", - "tab": "General information", - "score": 306.0 - }, - "Nutrition - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Nutrition - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Nutrition - # prompt tokens": { - "description": "min=579.817, mean=579.817, max=579.817, sum=1159.634 (2)", - "tab": "General information", - "score": 579.8169934640523 - }, - "Nutrition - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" - } - } - }, - { - "evaluation_name": "Prehistory", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Prehistory", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.917, - "details": { - "description": "min=0.917, mean=0.917, max=0.917, sum=1.833 (2)", - "tab": "Accuracy", - "Prehistory - Observed inference time (s)": { - "description": "min=0.489, mean=0.489, max=0.489, sum=0.978 (2)", - "tab": "Efficiency", - "score": 0.4891524300163175 - }, - "Prehistory - # eval": { - "description": "min=324, mean=324, max=324, sum=648 (2)", - "tab": "General information", - "score": 324.0 - }, - "Prehistory - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Prehistory - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Prehistory - # prompt tokens": { - "description": "min=507.559, mean=507.559, max=507.559, sum=1015.117 (2)", - "tab": "General information", - "score": 507.55864197530866 - }, - "Prehistory - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - 
"additional_details": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" - } - } - }, - { - "evaluation_name": "Public Relations", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Public Relations", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.782, - "details": { - "description": "min=0.782, mean=0.782, max=0.782, sum=1.564 (2)", - "tab": "Accuracy", - "Public Relations - Observed inference time (s)": { - "description": "min=0.46, mean=0.46, max=0.46, sum=0.92 (2)", - "tab": "Efficiency", - "score": 0.46012504534287885 - }, - "Public Relations - # eval": { - "description": "min=110, mean=110, max=110, sum=220 (2)", - "tab": "General information", - "score": 110.0 - }, - "Public Relations - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Public Relations - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Public Relations - # prompt tokens": { - "description": "min=398.318, mean=398.318, max=398.318, sum=796.636 (2)", - "tab": "General information", - "score": 398.3181818181818 - }, - "Public Relations - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" - } - } - }, - { - "evaluation_name": "Security Studies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Security Studies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.841, - "details": { - "description": "min=0.841, mean=0.841, max=0.841, sum=1.682 (2)", - "tab": "Accuracy", - "Security Studies - Observed inference time (s)": { - "description": "min=0.546, mean=0.546, max=0.546, sum=1.093 (2)", - "tab": "Efficiency", - "score": 0.546490309189777 - }, - "Security Studies - # eval": { - "description": "min=245, mean=245, max=245, sum=490 (2)", - "tab": "General information", - "score": 245.0 - }, - "Security Studies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Security Studies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Security Studies - # prompt tokens": { - "description": "min=1157.473, mean=1157.473, max=1157.473, sum=2314.947 (2)", - "tab": "General information", - "score": 1157.4734693877551 - }, - "Security Studies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": 
"mmlu_security_studies" - } - } - }, - { - "evaluation_name": "Sociology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Sociology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.925, - "details": { - "description": "min=0.925, mean=0.925, max=0.925, sum=1.851 (2)", - "tab": "Accuracy", - "Sociology - Observed inference time (s)": { - "description": "min=0.441, mean=0.441, max=0.441, sum=0.882 (2)", - "tab": "Efficiency", - "score": 0.4410626805243801 - }, - "Sociology - # eval": { - "description": "min=201, mean=201, max=201, sum=402 (2)", - "tab": "General information", - "score": 201.0 - }, - "Sociology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Sociology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Sociology - # prompt tokens": { - "description": "min=438.522, mean=438.522, max=438.522, sum=877.045 (2)", - "tab": "General information", - "score": 438.5223880597015 - }, - "Sociology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" - } - } - }, - { - "evaluation_name": "Virology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Virology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.59, - "details": { - "description": "min=0.59, mean=0.59, max=0.59, sum=1.181 (2)", - "tab": "Accuracy", - "Virology - Observed inference time (s)": { - "description": "min=0.852, mean=0.852, max=0.852, sum=1.704 (2)", - "tab": "Efficiency", - "score": 0.851962562066963 - }, - "Virology - # eval": { - "description": "min=166, mean=166, max=166, sum=332 (2)", - "tab": "General information", - "score": 166.0 - }, - "Virology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Virology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Virology - # prompt tokens": { - "description": "min=336.09, mean=336.09, max=336.09, sum=672.181 (2)", - "tab": "General information", - "score": 336.0903614457831 - }, - "Virology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" - } - } - }, - { - "evaluation_name": "World Religions", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - 
] - }, - "metric_config": { - "evaluation_description": "EM on World Religions", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.854, - "details": { - "description": "min=0.854, mean=0.854, max=0.854, sum=1.708 (2)", - "tab": "Accuracy", - "World Religions - Observed inference time (s)": { - "description": "min=0.566, mean=0.566, max=0.566, sum=1.133 (2)", - "tab": "Efficiency", - "score": 0.5664703581068251 - }, - "World Religions - # eval": { - "description": "min=171, mean=171, max=171, sum=342 (2)", - "tab": "General information", - "score": 171.0 - }, - "World Religions - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "World Religions - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "World Religions - # prompt tokens": { - "description": "min=268.561, mean=268.561, max=268.561, sum=537.123 (2)", - "tab": "General information", - "score": 268.56140350877195 - }, - "World Religions - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" - } - } - }, - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.416, - "details": { - "tab": "Efficiency" - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_mmlu/openai/gpt-4-turbo-2024-04-09/e51be257-610e-4d38-b58a-a3b29fc06a83.json b/data/helm_mmlu/openai/gpt-4-turbo-2024-04-09/e51be257-610e-4d38-b58a-a3b29fc06a83.json deleted file mode 100644 index a348a9fb9..000000000 --- a/data/helm_mmlu/openai/gpt-4-turbo-2024-04-09/e51be257-610e-4d38-b58a-a3b29fc06a83.json +++ /dev/null @@ -1,3021 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/openai_gpt-4-turbo-2024-04-09/1770835937.459157", - "retrieved_timestamp": "1770835937.459157", - "source_metadata": { - "source_name": "helm_mmlu", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "GPT-4 Turbo 2024-04-09", - "id": "openai/gpt-4-turbo-2024-04-09", - "developer": "openai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "MMLU All Subjects", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU All Subjects", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.813, - "details": { - "description": "min=0.515, mean=0.813, max=0.974, sum=92.65 (114)", - 
"tab": "Accuracy", - "MMLU All Subjects - Observed inference time (s)": { - "description": "min=0.479, mean=0.617, max=0.934, sum=70.3 (114)", - "tab": "Efficiency", - "score": 0.6166649052297876 - }, - "MMLU All Subjects - # eval": { - "description": "min=100, mean=246.351, max=1534, sum=28084 (114)", - "tab": "General information", - "score": 246.35087719298247 - }, - "MMLU All Subjects - # train": { - "description": "min=5, mean=5, max=5, sum=570 (114)", - "tab": "General information", - "score": 5.0 - }, - "MMLU All Subjects - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (114)", - "tab": "General information", - "score": 0.0 - }, - "MMLU All Subjects - # prompt tokens": { - "description": "min=275.561, mean=614.852, max=2798.073, sum=70093.086 (114)", - "tab": "General information", - "score": 614.851634217556 - }, - "MMLU All Subjects - # output tokens": { - "description": "min=1, mean=1, max=1, sum=114 (114)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - 
"mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] - } - } - }, - { - "evaluation_name": "Abstract Algebra", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Abstract Algebra", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.56, - "details": { - "description": "min=0.56, mean=0.56, max=0.56, sum=1.12 (2)", - "tab": "Accuracy", - "Abstract Algebra - Observed inference time (s)": { - "description": "min=0.54, mean=0.54, max=0.54, sum=1.08 (2)", - "tab": "Efficiency", - "score": 0.539907853603363 - }, - "Abstract Algebra - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Abstract Algebra - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Abstract Algebra - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Abstract Algebra - # prompt tokens": { - "description": "min=373.44, mean=373.44, max=373.44, sum=746.88 (2)", - "tab": "General information", - "score": 373.44 - }, - "Abstract Algebra - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" - } - } - }, - { - "evaluation_name": "Anatomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Anatomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.822, - "details": { - "description": "min=0.822, mean=0.822, max=0.822, sum=1.644 (2)", - "tab": "Accuracy", - "Anatomy - Observed inference time (s)": { - "description": "min=0.53, mean=0.53, max=0.53, sum=1.06 (2)", - "tab": "Efficiency", - "score": 0.5299274744810881 - }, - "Anatomy - # eval": { - "description": "min=135, mean=135, max=135, sum=270 (2)", - "tab": "General information", - "score": 135.0 - }, - "Anatomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Anatomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Anatomy - # prompt tokens": { - "description": "min=353.978, mean=353.978, max=353.978, sum=707.956 (2)", - "tab": "General information", - "score": 353.97777777777776 - }, - "Anatomy - # 
output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" - } - } - }, - { - "evaluation_name": "College Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on College Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.539, - "details": { - "description": "min=0.539, mean=0.539, max=0.539, sum=1.078 (2)", - "tab": "Accuracy", - "College Chemistry - Observed inference time (s)": { - "description": "min=0.549, mean=0.549, max=0.549, sum=1.099 (2)", - "tab": "Efficiency", - "score": 0.5493535542488098 - }, - "College Biology - Observed inference time (s)": { - "description": "min=0.6, mean=0.6, max=0.6, sum=1.199 (2)", - "tab": "Efficiency", - "score": 0.5995734184980392 - }, - "College Computer Science - Observed inference time (s)": { - "description": "min=0.691, mean=0.691, max=0.691, sum=1.382 (2)", - "tab": "Efficiency", - "score": 0.6911867094039917 - }, - "College Mathematics - Observed inference time (s)": { - "description": "min=0.609, mean=0.609, max=0.609, sum=1.219 (2)", - "tab": "Efficiency", - "score": 0.6092576813697815 - }, - "College Medicine - Observed inference time (s)": { - "description": "min=0.67, mean=0.67, max=0.67, sum=1.34 (2)", - "tab": "Efficiency", - "score": 0.6697626251705809 - }, - "College Physics - Observed inference time (s)": { - "description": "min=0.706, mean=0.706, max=0.706, sum=1.412 (2)", - "tab": "Efficiency", - "score": 0.7058592660754335 - }, - "College Chemistry - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Chemistry - # prompt tokens": { - "description": "min=549.4, mean=549.4, max=549.4, sum=1098.8 (2)", - "tab": "General information", - "score": 549.4 - }, - "College Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Biology - # eval": { - "description": "min=144, mean=144, max=144, sum=288 (2)", - "tab": "General information", - "score": 144.0 - }, - "College Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # prompt tokens": { - "description": "min=473.917, mean=473.917, max=473.917, sum=947.833 (2)", - "tab": "General information", - "score": 473.9166666666667 - }, - "College Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", 
- "tab": "General information", - "score": 100.0 - }, - "College Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer Science - # prompt tokens": { - "description": "min=828.39, mean=828.39, max=828.39, sum=1656.78 (2)", - "tab": "General information", - "score": 828.39 - }, - "College Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Mathematics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # prompt tokens": { - "description": "min=594.52, mean=594.52, max=594.52, sum=1189.04 (2)", - "tab": "General information", - "score": 594.52 - }, - "College Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Medicine - # eval": { - "description": "min=173, mean=173, max=173, sum=346 (2)", - "tab": "General information", - "score": 173.0 - }, - "College Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Medicine - # prompt tokens": { - "description": "min=502.728, mean=502.728, max=502.728, sum=1005.457 (2)", - "tab": "General information", - "score": 502.728323699422 - }, - "College Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Physics - # eval": { - "description": "min=102, mean=102, max=102, sum=204 (2)", - "tab": "General information", - "score": 102.0 - }, - "College Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Physics - # prompt tokens": { - "description": "min=503.608, mean=503.608, max=503.608, sum=1007.216 (2)", - "tab": "General information", - "score": 503.6078431372549 - }, - "College Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" - } - } - }, - { - "evaluation_name": "Computer Security", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Computer Security", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 
0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.83, - "details": { - "description": "min=0.83, mean=0.83, max=0.83, sum=1.66 (2)", - "tab": "Accuracy", - "Computer Security - Observed inference time (s)": { - "description": "min=0.53, mean=0.53, max=0.53, sum=1.061 (2)", - "tab": "Efficiency", - "score": 0.5303381824493408 - }, - "Computer Security - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Computer Security - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Computer Security - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Computer Security - # prompt tokens": { - "description": "min=378.54, mean=378.54, max=378.54, sum=757.08 (2)", - "tab": "General information", - "score": 378.54 - }, - "Computer Security - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" - } - } - }, - { - "evaluation_name": "Econometrics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Econometrics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.675, - "details": { - "description": "min=0.675, mean=0.675, max=0.675, sum=1.351 (2)", - "tab": "Accuracy", - "Econometrics - Observed inference time (s)": { - "description": "min=0.572, mean=0.572, max=0.572, sum=1.144 (2)", - "tab": "Efficiency", - "score": 0.5721135453173989 - }, - "Econometrics - # eval": { - "description": "min=114, mean=114, max=114, sum=228 (2)", - "tab": "General information", - "score": 114.0 - }, - "Econometrics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Econometrics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Econometrics - # prompt tokens": { - "description": "min=614.43, mean=614.43, max=614.43, sum=1228.86 (2)", - "tab": "General information", - "score": 614.4298245614035 - }, - "Econometrics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" - } - } - }, - { - "evaluation_name": "Global Facts", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Global Facts", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.58, - "details": { - "description": "min=0.58, mean=0.58, max=0.58, sum=1.16 (2)", - "tab": "Accuracy", - "Global Facts - 
Observed inference time (s)": { - "description": "min=0.479, mean=0.479, max=0.479, sum=0.958 (2)", - "tab": "Efficiency", - "score": 0.47900029182434084 - }, - "Global Facts - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Global Facts - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Global Facts - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Global Facts - # prompt tokens": { - "description": "min=399.71, mean=399.71, max=399.71, sum=799.42 (2)", - "tab": "General information", - "score": 399.71 - }, - "Global Facts - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" - } - } - }, - { - "evaluation_name": "Jurisprudence", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Jurisprudence", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.88, - "details": { - "description": "min=0.88, mean=0.88, max=0.88, sum=1.759 (2)", - "tab": "Accuracy", - "Jurisprudence - Observed inference time (s)": { - "description": "min=0.539, mean=0.539, max=0.539, sum=1.079 (2)", - "tab": "Efficiency", - "score": 0.5393155504156042 - }, - "Jurisprudence - # eval": { - "description": "min=108, mean=108, max=108, sum=216 (2)", - "tab": "General information", - "score": 108.0 - }, - "Jurisprudence - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Jurisprudence - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Jurisprudence - # prompt tokens": { - "description": "min=394.639, mean=394.639, max=394.639, sum=789.278 (2)", - "tab": "General information", - "score": 394.6388888888889 - }, - "Jurisprudence - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" - } - } - }, - { - "evaluation_name": "Philosophy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Philosophy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.868, - "details": { - "description": "min=0.868, mean=0.868, max=0.868, sum=1.736 (2)", - "tab": "Accuracy", - "Philosophy - Observed inference time (s)": { - "description": "min=0.543, mean=0.543, max=0.543, sum=1.087 (2)", - "tab": "Efficiency", - "score": 0.5434573969273705 - }, - "Philosophy - # eval": { - "description": "min=311, 
mean=311, max=311, sum=622 (2)", - "tab": "General information", - "score": 311.0 - }, - "Philosophy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Philosophy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Philosophy - # prompt tokens": { - "description": "min=329.084, mean=329.084, max=329.084, sum=658.167 (2)", - "tab": "General information", - "score": 329.08360128617363 - }, - "Philosophy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" - } - } - }, - { - "evaluation_name": "Professional Psychology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Professional Psychology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.873, - "details": { - "description": "min=0.873, mean=0.873, max=0.873, sum=1.745 (2)", - "tab": "Accuracy", - "Professional Medicine - Observed inference time (s)": { - "description": "min=0.579, mean=0.579, max=0.579, sum=1.159 (2)", - "tab": "Efficiency", - "score": 0.5794552100055358 - }, - "Professional Accounting - Observed inference time (s)": { - "description": "min=0.59, mean=0.59, max=0.59, sum=1.18 (2)", - "tab": "Efficiency", - "score": 0.5898241354218612 - }, - "Professional Law - Observed inference time (s)": { - "description": "min=0.639, mean=0.639, max=0.639, sum=1.278 (2)", - "tab": "Efficiency", - "score": 0.6388053317424371 - }, - "Professional Psychology - Observed inference time (s)": { - "description": "min=0.671, mean=0.671, max=0.671, sum=1.342 (2)", - "tab": "Efficiency", - "score": 0.6712259284031936 - }, - "Professional Medicine - # eval": { - "description": "min=272, mean=272, max=272, sum=544 (2)", - "tab": "General information", - "score": 272.0 - }, - "Professional Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Medicine - # prompt tokens": { - "description": "min=1094.585, mean=1094.585, max=1094.585, sum=2189.169 (2)", - "tab": "General information", - "score": 1094.5845588235295 - }, - "Professional Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Accounting - # eval": { - "description": "min=282, mean=282, max=282, sum=564 (2)", - "tab": "General information", - "score": 282.0 - }, - "Professional Accounting - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Accounting - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Accounting - # prompt tokens": { - "description": "min=658.592, mean=658.592, max=658.592, sum=1317.184 (2)", - 
"tab": "General information", - "score": 658.5921985815603 - }, - "Professional Accounting - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Law - # eval": { - "description": "min=1534, mean=1534, max=1534, sum=3068 (2)", - "tab": "General information", - "score": 1534.0 - }, - "Professional Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # prompt tokens": { - "description": "min=1637.787, mean=1637.787, max=1637.787, sum=3275.574 (2)", - "tab": "General information", - "score": 1637.7868318122555 - }, - "Professional Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Psychology - # eval": { - "description": "min=612, mean=612, max=612, sum=1224 (2)", - "tab": "General information", - "score": 612.0 - }, - "Professional Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # prompt tokens": { - "description": "min=575.114, mean=575.114, max=575.114, sum=1150.229 (2)", - "tab": "General information", - "score": 575.1143790849674 - }, - "Professional Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" - } - } - }, - { - "evaluation_name": "Us Foreign Policy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Us Foreign Policy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.96, - "details": { - "description": "min=0.96, mean=0.96, max=0.96, sum=1.92 (2)", - "tab": "Accuracy", - "Us Foreign Policy - Observed inference time (s)": { - "description": "min=0.558, mean=0.558, max=0.558, sum=1.115 (2)", - "tab": "Efficiency", - "score": 0.557673556804657 - }, - "Us Foreign Policy - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Us Foreign Policy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Us Foreign Policy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Us Foreign Policy - # prompt tokens": { - "description": "min=422.79, mean=422.79, max=422.79, sum=845.58 (2)", - "tab": "General information", - "score": 422.79 - }, - "Us Foreign Policy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - 
"subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" - } - } - }, - { - "evaluation_name": "Astronomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Astronomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.941, - "details": { - "description": "min=0.941, mean=0.941, max=0.941, sum=1.882 (2)", - "tab": "Accuracy", - "Astronomy - Observed inference time (s)": { - "description": "min=0.666, mean=0.666, max=0.666, sum=1.332 (2)", - "tab": "Efficiency", - "score": 0.6662032525790366 - }, - "Astronomy - # eval": { - "description": "min=152, mean=152, max=152, sum=304 (2)", - "tab": "General information", - "score": 152.0 - }, - "Astronomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Astronomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Astronomy - # prompt tokens": { - "description": "min=579.691, mean=579.691, max=579.691, sum=1159.382 (2)", - "tab": "General information", - "score": 579.6907894736842 - }, - "Astronomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" - } - } - }, - { - "evaluation_name": "Business Ethics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Business Ethics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.82, - "details": { - "description": "min=0.82, mean=0.82, max=0.82, sum=1.64 (2)", - "tab": "Accuracy", - "Business Ethics - Observed inference time (s)": { - "description": "min=0.598, mean=0.598, max=0.598, sum=1.196 (2)", - "tab": "Efficiency", - "score": 0.5981367039680481 - }, - "Business Ethics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Business Ethics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Business Ethics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Business Ethics - # prompt tokens": { - "description": "min=569.52, mean=569.52, max=569.52, sum=1139.04 (2)", - "tab": "General information", - "score": 569.52 - }, - "Business Ethics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" - } - } - }, - { - "evaluation_name": "Clinical Knowledge", - "source_data": { - 
"dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Clinical Knowledge", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.83, - "details": { - "description": "min=0.83, mean=0.83, max=0.83, sum=1.66 (2)", - "tab": "Accuracy", - "Clinical Knowledge - Observed inference time (s)": { - "description": "min=0.591, mean=0.591, max=0.591, sum=1.183 (2)", - "tab": "Efficiency", - "score": 0.5912713131814633 - }, - "Clinical Knowledge - # eval": { - "description": "min=265, mean=265, max=265, sum=530 (2)", - "tab": "General information", - "score": 265.0 - }, - "Clinical Knowledge - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Clinical Knowledge - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Clinical Knowledge - # prompt tokens": { - "description": "min=397.947, mean=397.947, max=397.947, sum=795.894 (2)", - "tab": "General information", - "score": 397.94716981132075 - }, - "Clinical Knowledge - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" - } - } - }, - { - "evaluation_name": "Conceptual Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Conceptual Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.894, - "details": { - "description": "min=0.894, mean=0.894, max=0.894, sum=1.787 (2)", - "tab": "Accuracy", - "Conceptual Physics - Observed inference time (s)": { - "description": "min=0.685, mean=0.685, max=0.685, sum=1.369 (2)", - "tab": "Efficiency", - "score": 0.684603402969685 - }, - "Conceptual Physics - # eval": { - "description": "min=235, mean=235, max=235, sum=470 (2)", - "tab": "General information", - "score": 235.0 - }, - "Conceptual Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Conceptual Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Conceptual Physics - # prompt tokens": { - "description": "min=304.838, mean=304.838, max=304.838, sum=609.677 (2)", - "tab": "General information", - "score": 304.83829787234043 - }, - "Conceptual Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" - } - } - }, - { - "evaluation_name": "Electrical Engineering", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Electrical Engineering", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.752, - "details": { - "description": "min=0.752, mean=0.752, max=0.752, sum=1.503 (2)", - "tab": "Accuracy", - "Electrical Engineering - Observed inference time (s)": { - "description": "min=0.649, mean=0.649, max=0.649, sum=1.297 (2)", - "tab": "Efficiency", - "score": 0.6487039006989578 - }, - "Electrical Engineering - # eval": { - "description": "min=145, mean=145, max=145, sum=290 (2)", - "tab": "General information", - "score": 145.0 - }, - "Electrical Engineering - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Electrical Engineering - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Electrical Engineering - # prompt tokens": { - "description": "min=440.641, mean=440.641, max=440.641, sum=881.283 (2)", - "tab": "General information", - "score": 440.6413793103448 - }, - "Electrical Engineering - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" - } - } - }, - { - "evaluation_name": "Elementary Mathematics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Elementary Mathematics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.72, - "details": { - "description": "min=0.72, mean=0.72, max=0.72, sum=1.439 (2)", - "tab": "Accuracy", - "Elementary Mathematics - Observed inference time (s)": { - "description": "min=0.708, mean=0.708, max=0.708, sum=1.417 (2)", - "tab": "Efficiency", - "score": 0.708430844009238 - }, - "Elementary Mathematics - # eval": { - "description": "min=378, mean=378, max=378, sum=756 (2)", - "tab": "General information", - "score": 378.0 - }, - "Elementary Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Elementary Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Elementary Mathematics - # prompt tokens": { - "description": "min=531.862, mean=531.862, max=531.862, sum=1063.725 (2)", - "tab": "General information", - "score": 531.8624338624338 - }, - "Elementary Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" - } - } - }, - { - "evaluation_name": "Formal Logic", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Formal Logic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.706, - "details": { - "description": "min=0.706, mean=0.706, max=0.706, sum=1.413 (2)", - "tab": "Accuracy", - "Formal Logic - Observed inference time (s)": { - "description": "min=0.635, mean=0.635, max=0.635, sum=1.27 (2)", - "tab": "Efficiency", - "score": 0.6347800322941372 - }, - "Formal Logic - # eval": { - "description": "min=126, mean=126, max=126, sum=252 (2)", - "tab": "General information", - "score": 126.0 - }, - "Formal Logic - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Formal Logic - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Formal Logic - # prompt tokens": { - "description": "min=606.762, mean=606.762, max=606.762, sum=1213.524 (2)", - "tab": "General information", - "score": 606.7619047619048 - }, - "Formal Logic - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" - } - } - }, - { - "evaluation_name": "High School World History", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on High School World History", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.941, - "details": { - "description": "min=0.941, mean=0.941, max=0.941, sum=1.882 (2)", - "tab": "Accuracy", - "High School Biology - Observed inference time (s)": { - "description": "min=0.674, mean=0.674, max=0.674, sum=1.348 (2)", - "tab": "Efficiency", - "score": 0.6741217144073979 - }, - "High School Chemistry - Observed inference time (s)": { - "description": "min=0.673, mean=0.673, max=0.673, sum=1.346 (2)", - "tab": "Efficiency", - "score": 0.6728476491467706 - }, - "High School Computer Science - Observed inference time (s)": { - "description": "min=0.626, mean=0.626, max=0.626, sum=1.252 (2)", - "tab": "Efficiency", - "score": 0.6261640882492066 - }, - "High School European History - Observed inference time (s)": { - "description": "min=0.747, mean=0.747, max=0.747, sum=1.495 (2)", - "tab": "Efficiency", - "score": 0.7474224538514108 - }, - "High School Geography - Observed inference time (s)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=1.335 (2)", - "tab": "Efficiency", - "score": 0.6672574221485793 - }, - "High School Government And Politics - Observed inference time (s)": { - "description": "min=0.683, mean=0.683, max=0.683, sum=1.366 (2)", - "tab": "Efficiency", - "score": 0.6831059715290762 - }, - "High School Macroeconomics - Observed inference time (s)": { - "description": "min=0.613, mean=0.613, max=0.613, sum=1.226 (2)", - "tab": "Efficiency", - "score": 0.6132381714307344 - }, - "High School Mathematics - Observed inference time (s)": { - "description": 
"min=0.594, mean=0.594, max=0.594, sum=1.188 (2)", - "tab": "Efficiency", - "score": 0.5939316025486698 - }, - "High School Microeconomics - Observed inference time (s)": { - "description": "min=0.585, mean=0.585, max=0.585, sum=1.169 (2)", - "tab": "Efficiency", - "score": 0.5845635728675778 - }, - "High School Physics - Observed inference time (s)": { - "description": "min=0.934, mean=0.934, max=0.934, sum=1.868 (2)", - "tab": "Efficiency", - "score": 0.9341671135251886 - }, - "High School Psychology - Observed inference time (s)": { - "description": "min=0.741, mean=0.741, max=0.741, sum=1.482 (2)", - "tab": "Efficiency", - "score": 0.7410666920723171 - }, - "High School Statistics - Observed inference time (s)": { - "description": "min=0.72, mean=0.72, max=0.72, sum=1.439 (2)", - "tab": "Efficiency", - "score": 0.7196061655327126 - }, - "High School US History - Observed inference time (s)": { - "description": "min=0.745, mean=0.745, max=0.745, sum=1.491 (2)", - "tab": "Efficiency", - "score": 0.7454434785188413 - }, - "High School World History - Observed inference time (s)": { - "description": "min=0.667, mean=0.667, max=0.667, sum=1.333 (2)", - "tab": "Efficiency", - "score": 0.6665283818788166 - }, - "High School Biology - # eval": { - "description": "min=310, mean=310, max=310, sum=620 (2)", - "tab": "General information", - "score": 310.0 - }, - "High School Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Biology - # prompt tokens": { - "description": "min=513.677, mean=513.677, max=513.677, sum=1027.355 (2)", - "tab": "General information", - "score": 513.6774193548387 - }, - "High School Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Chemistry - # eval": { - "description": "min=203, mean=203, max=203, sum=406 (2)", - "tab": "General information", - "score": 203.0 - }, - "High School Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # prompt tokens": { - "description": "min=496.714, mean=496.714, max=496.714, sum=993.429 (2)", - "tab": "General information", - "score": 496.7142857142857 - }, - "High School Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "High School Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # prompt tokens": { - "description": "min=867.78, mean=867.78, max=867.78, sum=1735.56 (2)", - "tab": "General information", - "score": 867.78 - }, - "High School Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General 
information", - "score": 1.0 - }, - "High School European History - # eval": { - "description": "min=165, mean=165, max=165, sum=330 (2)", - "tab": "General information", - "score": 165.0 - }, - "High School European History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School European History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School European History - # prompt tokens": { - "description": "min=2798.073, mean=2798.073, max=2798.073, sum=5596.145 (2)", - "tab": "General information", - "score": 2798.072727272727 - }, - "High School European History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Geography - # eval": { - "description": "min=198, mean=198, max=198, sum=396 (2)", - "tab": "General information", - "score": 198.0 - }, - "High School Geography - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Geography - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Geography - # prompt tokens": { - "description": "min=372.045, mean=372.045, max=372.045, sum=744.091 (2)", - "tab": "General information", - "score": 372.04545454545456 - }, - "High School Geography - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Government And Politics - # eval": { - "description": "min=193, mean=193, max=193, sum=386 (2)", - "tab": "General information", - "score": 193.0 - }, - "High School Government And Politics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Government And Politics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # prompt tokens": { - "description": "min=465.824, mean=465.824, max=465.824, sum=931.648 (2)", - "tab": "General information", - "score": 465.8238341968912 - }, - "High School Government And Politics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Macroeconomics - # eval": { - "description": "min=390, mean=390, max=390, sum=780 (2)", - "tab": "General information", - "score": 390.0 - }, - "High School Macroeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Macroeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - # prompt tokens": { - "description": "min=371.562, mean=371.562, max=371.562, sum=743.123 (2)", - "tab": "General information", - "score": 371.5615384615385 - }, - "High School Macroeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Mathematics - # eval": { - "description": "min=270, mean=270, max=270, sum=540 (2)", - "tab": "General information", - "score": 270.0 - }, - "High School Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": 
"General information", - "score": 5.0 - }, - "High School Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # prompt tokens": { - "description": "min=532.374, mean=532.374, max=532.374, sum=1064.748 (2)", - "tab": "General information", - "score": 532.3740740740741 - }, - "High School Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Microeconomics - # eval": { - "description": "min=238, mean=238, max=238, sum=476 (2)", - "tab": "General information", - "score": 238.0 - }, - "High School Microeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Microeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Microeconomics - # prompt tokens": { - "description": "min=399.025, mean=399.025, max=399.025, sum=798.05 (2)", - "tab": "General information", - "score": 399.02521008403363 - }, - "High School Microeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Physics - # eval": { - "description": "min=151, mean=151, max=151, sum=302 (2)", - "tab": "General information", - "score": 151.0 - }, - "High School Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # prompt tokens": { - "description": "min=560.464, mean=560.464, max=560.464, sum=1120.927 (2)", - "tab": "General information", - "score": 560.4635761589404 - }, - "High School Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Psychology - # eval": { - "description": "min=545, mean=545, max=545, sum=1090 (2)", - "tab": "General information", - "score": 545.0 - }, - "High School Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # prompt tokens": { - "description": "min=495.246, mean=495.246, max=495.246, sum=990.492 (2)", - "tab": "General information", - "score": 495.24587155963303 - }, - "High School Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Statistics - # eval": { - "description": "min=216, mean=216, max=216, sum=432 (2)", - "tab": "General information", - "score": 216.0 - }, - "High School Statistics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Statistics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Statistics - # prompt tokens": { - "description": "min=795.699, mean=795.699, max=795.699, sum=1591.398 (2)", - "tab": "General information", - "score": 795.699074074074 - }, - "High School 
Statistics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School US History - # eval": { - "description": "min=204, mean=204, max=204, sum=408 (2)", - "tab": "General information", - "score": 204.0 - }, - "High School US History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School US History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # prompt tokens": { - "description": "min=2217.809, mean=2217.809, max=2217.809, sum=4435.618 (2)", - "tab": "General information", - "score": 2217.8088235294117 - }, - "High School US History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School World History - # eval": { - "description": "min=237, mean=237, max=237, sum=474 (2)", - "tab": "General information", - "score": 237.0 - }, - "High School World History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School World History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School World History - # prompt tokens": { - "description": "min=1428.27, mean=1428.27, max=1428.27, sum=2856.54 (2)", - "tab": "General information", - "score": 1428.2700421940929 - }, - "High School World History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" - } - } - }, - { - "evaluation_name": "Human Sexuality", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Human Sexuality", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.901, - "details": { - "description": "min=0.901, mean=0.901, max=0.901, sum=1.802 (2)", - "tab": "Accuracy", - "Human Aging - Observed inference time (s)": { - "description": "min=0.656, mean=0.656, max=0.656, sum=1.313 (2)", - "tab": "Efficiency", - "score": 0.6564141239286003 - }, - "Human Sexuality - Observed inference time (s)": { - "description": "min=0.613, mean=0.613, max=0.613, sum=1.226 (2)", - "tab": "Efficiency", - "score": 0.6131143715545422 - }, - "Human Aging - # eval": { - "description": "min=223, mean=223, max=223, sum=446 (2)", - "tab": "General information", - "score": 223.0 - }, - "Human Aging - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Aging - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Aging - # prompt tokens": { - "description": "min=319.906, mean=319.906, max=319.906, sum=639.812 (2)", - "tab": "General information", - "score": 319.90582959641256 - }, - "Human Aging - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 
(2)", - "tab": "General information", - "score": 1.0 - }, - "Human Sexuality - # eval": { - "description": "min=131, mean=131, max=131, sum=262 (2)", - "tab": "General information", - "score": 131.0 - }, - "Human Sexuality - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Sexuality - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # prompt tokens": { - "description": "min=341.183, mean=341.183, max=341.183, sum=682.366 (2)", - "tab": "General information", - "score": 341.1832061068702 - }, - "Human Sexuality - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" - } - } - }, - { - "evaluation_name": "International Law", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on International Law", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.942, - "details": { - "description": "min=0.942, mean=0.942, max=0.942, sum=1.884 (2)", - "tab": "Accuracy", - "International Law - Observed inference time (s)": { - "description": "min=0.63, mean=0.63, max=0.63, sum=1.26 (2)", - "tab": "Efficiency", - "score": 0.6297830116650289 - }, - "International Law - # eval": { - "description": "min=121, mean=121, max=121, sum=242 (2)", - "tab": "General information", - "score": 121.0 - }, - "International Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "International Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "International Law - # prompt tokens": { - "description": "min=639.851, mean=639.851, max=639.851, sum=1279.702 (2)", - "tab": "General information", - "score": 639.8512396694215 - }, - "International Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" - } - } - }, - { - "evaluation_name": "Logical Fallacies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Logical Fallacies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.871, - "details": { - "description": "min=0.871, mean=0.871, max=0.871, sum=1.742 (2)", - "tab": "Accuracy", - "Logical Fallacies - Observed inference time (s)": { - "description": "min=0.585, mean=0.585, max=0.585, sum=1.171 (2)", - "tab": "Efficiency", - "score": 0.585445927695994 - }, - "Logical Fallacies - # eval": { - "description": "min=163, 
mean=163, max=163, sum=326 (2)", - "tab": "General information", - "score": 163.0 - }, - "Logical Fallacies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Logical Fallacies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Logical Fallacies - # prompt tokens": { - "description": "min=449.595, mean=449.595, max=449.595, sum=899.19 (2)", - "tab": "General information", - "score": 449.5950920245399 - }, - "Logical Fallacies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" - } - } - }, - { - "evaluation_name": "Machine Learning", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Machine Learning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.741, - "details": { - "description": "min=0.741, mean=0.741, max=0.741, sum=1.482 (2)", - "tab": "Accuracy", - "Machine Learning - Observed inference time (s)": { - "description": "min=0.718, mean=0.718, max=0.718, sum=1.436 (2)", - "tab": "Efficiency", - "score": 0.718035706451961 - }, - "Machine Learning - # eval": { - "description": "min=112, mean=112, max=112, sum=224 (2)", - "tab": "General information", - "score": 112.0 - }, - "Machine Learning - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Machine Learning - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Machine Learning - # prompt tokens": { - "description": "min=668.054, mean=668.054, max=668.054, sum=1336.107 (2)", - "tab": "General information", - "score": 668.0535714285714 - }, - "Machine Learning - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" - } - } - }, - { - "evaluation_name": "Management", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Management", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.883, - "details": { - "description": "min=0.883, mean=0.883, max=0.883, sum=1.767 (2)", - "tab": "Accuracy", - "Management - Observed inference time (s)": { - "description": "min=0.592, mean=0.592, max=0.592, sum=1.184 (2)", - "tab": "Efficiency", - "score": 0.5921963488013999 - }, - "Management - # eval": { - "description": "min=103, mean=103, max=103, sum=206 (2)", - "tab": "General information", - "score": 103.0 - }, - "Management - # train": { - "description": "min=5, 
mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Management - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Management - # prompt tokens": { - "description": "min=283.796, mean=283.796, max=283.796, sum=567.592 (2)", - "tab": "General information", - "score": 283.79611650485435 - }, - "Management - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" - } - } - }, - { - "evaluation_name": "Marketing", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Marketing", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.949, - "details": { - "description": "min=0.949, mean=0.949, max=0.949, sum=1.897 (2)", - "tab": "Accuracy", - "Marketing - Observed inference time (s)": { - "description": "min=0.588, mean=0.588, max=0.588, sum=1.176 (2)", - "tab": "Efficiency", - "score": 0.5880082672477788 - }, - "Marketing - # eval": { - "description": "min=234, mean=234, max=234, sum=468 (2)", - "tab": "General information", - "score": 234.0 - }, - "Marketing - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Marketing - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Marketing - # prompt tokens": { - "description": "min=404.218, mean=404.218, max=404.218, sum=808.436 (2)", - "tab": "General information", - "score": 404.21794871794873 - }, - "Marketing - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" - } - } - }, - { - "evaluation_name": "Medical Genetics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Medical Genetics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.92, - "details": { - "description": "min=0.92, mean=0.92, max=0.92, sum=1.84 (2)", - "tab": "Accuracy", - "Medical Genetics - Observed inference time (s)": { - "description": "min=0.52, mean=0.52, max=0.52, sum=1.04 (2)", - "tab": "Efficiency", - "score": 0.5201336288452149 - }, - "Medical Genetics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Medical Genetics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Medical Genetics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Medical 
Genetics - # prompt tokens": { - "description": "min=341, mean=341, max=341, sum=682 (2)", - "tab": "General information", - "score": 341.0 - }, - "Medical Genetics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" - } - } - }, - { - "evaluation_name": "Miscellaneous", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Miscellaneous", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.945, - "details": { - "description": "min=0.945, mean=0.945, max=0.945, sum=1.89 (2)", - "tab": "Accuracy", - "Miscellaneous - Observed inference time (s)": { - "description": "min=0.565, mean=0.565, max=0.565, sum=1.13 (2)", - "tab": "Efficiency", - "score": 0.5650817577561809 - }, - "Miscellaneous - # eval": { - "description": "min=783, mean=783, max=783, sum=1566 (2)", - "tab": "General information", - "score": 783.0 - }, - "Miscellaneous - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Miscellaneous - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Miscellaneous - # prompt tokens": { - "description": "min=299.925, mean=299.925, max=299.925, sum=599.849 (2)", - "tab": "General information", - "score": 299.92464878671774 - }, - "Miscellaneous - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" - } - } - }, - { - "evaluation_name": "Moral Scenarios", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Moral Scenarios", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.803, - "details": { - "description": "min=0.803, mean=0.803, max=0.803, sum=1.607 (2)", - "tab": "Accuracy", - "Moral Disputes - Observed inference time (s)": { - "description": "min=0.564, mean=0.564, max=0.564, sum=1.129 (2)", - "tab": "Efficiency", - "score": 0.5643301023913256 - }, - "Moral Scenarios - Observed inference time (s)": { - "description": "min=0.599, mean=0.599, max=0.599, sum=1.197 (2)", - "tab": "Efficiency", - "score": 0.5985688052363902 - }, - "Moral Disputes - # eval": { - "description": "min=346, mean=346, max=346, sum=692 (2)", - "tab": "General information", - "score": 346.0 - }, - "Moral Disputes - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Disputes - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Disputes - # 
prompt tokens": { - "description": "min=476.145, mean=476.145, max=476.145, sum=952.289 (2)", - "tab": "General information", - "score": 476.1445086705202 - }, - "Moral Disputes - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Moral Scenarios - # eval": { - "description": "min=895, mean=895, max=895, sum=1790 (2)", - "tab": "General information", - "score": 895.0 - }, - "Moral Scenarios - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Scenarios - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # prompt tokens": { - "description": "min=656.455, mean=656.455, max=656.455, sum=1312.909 (2)", - "tab": "General information", - "score": 656.454748603352 - }, - "Moral Scenarios - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" - } - } - }, - { - "evaluation_name": "Nutrition", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Nutrition", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.892, - "details": { - "description": "min=0.892, mean=0.892, max=0.892, sum=1.784 (2)", - "tab": "Accuracy", - "Nutrition - Observed inference time (s)": { - "description": "min=0.532, mean=0.532, max=0.532, sum=1.063 (2)", - "tab": "Efficiency", - "score": 0.5316595968857311 - }, - "Nutrition - # eval": { - "description": "min=306, mean=306, max=306, sum=612 (2)", - "tab": "General information", - "score": 306.0 - }, - "Nutrition - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Nutrition - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Nutrition - # prompt tokens": { - "description": "min=586.817, mean=586.817, max=586.817, sum=1173.634 (2)", - "tab": "General information", - "score": 586.8169934640523 - }, - "Nutrition - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" - } - } - }, - { - "evaluation_name": "Prehistory", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Prehistory", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.92, - "details": { - "description": "min=0.92, mean=0.92, max=0.92, sum=1.84 (2)", - "tab": "Accuracy", - "Prehistory - Observed inference time (s)": { - "description": "min=0.54, mean=0.54, 
max=0.54, sum=1.079 (2)", - "tab": "Efficiency", - "score": 0.5397091279795141 - }, - "Prehistory - # eval": { - "description": "min=324, mean=324, max=324, sum=648 (2)", - "tab": "General information", - "score": 324.0 - }, - "Prehistory - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Prehistory - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Prehistory - # prompt tokens": { - "description": "min=514.559, mean=514.559, max=514.559, sum=1029.117 (2)", - "tab": "General information", - "score": 514.5586419753087 - }, - "Prehistory - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" - } - } - }, - { - "evaluation_name": "Public Relations", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Public Relations", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.755, - "details": { - "description": "min=0.755, mean=0.755, max=0.755, sum=1.509 (2)", - "tab": "Accuracy", - "Public Relations - Observed inference time (s)": { - "description": "min=0.584, mean=0.584, max=0.584, sum=1.168 (2)", - "tab": "Efficiency", - "score": 0.5840315688740123 - }, - "Public Relations - # eval": { - "description": "min=110, mean=110, max=110, sum=220 (2)", - "tab": "General information", - "score": 110.0 - }, - "Public Relations - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Public Relations - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Public Relations - # prompt tokens": { - "description": "min=405.318, mean=405.318, max=405.318, sum=810.636 (2)", - "tab": "General information", - "score": 405.3181818181818 - }, - "Public Relations - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" - } - } - }, - { - "evaluation_name": "Security Studies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Security Studies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8, - "details": { - "description": "min=0.8, mean=0.8, max=0.8, sum=1.6 (2)", - "tab": "Accuracy", - "Security Studies - Observed inference time (s)": { - "description": "min=0.529, mean=0.529, max=0.529, sum=1.058 (2)", - "tab": "Efficiency", - "score": 0.529095221538933 - }, - "Security Studies - # eval": { - "description": "min=245, mean=245, max=245, sum=490 (2)", - 
"tab": "General information", - "score": 245.0 - }, - "Security Studies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Security Studies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Security Studies - # prompt tokens": { - "description": "min=1164.473, mean=1164.473, max=1164.473, sum=2328.947 (2)", - "tab": "General information", - "score": 1164.4734693877551 - }, - "Security Studies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" - } - } - }, - { - "evaluation_name": "Sociology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Sociology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.915, - "details": { - "description": "min=0.915, mean=0.915, max=0.915, sum=1.831 (2)", - "tab": "Accuracy", - "Sociology - Observed inference time (s)": { - "description": "min=0.52, mean=0.52, max=0.52, sum=1.04 (2)", - "tab": "Efficiency", - "score": 0.5199050891458692 - }, - "Sociology - # eval": { - "description": "min=201, mean=201, max=201, sum=402 (2)", - "tab": "General information", - "score": 201.0 - }, - "Sociology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Sociology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Sociology - # prompt tokens": { - "description": "min=445.522, mean=445.522, max=445.522, sum=891.045 (2)", - "tab": "General information", - "score": 445.5223880597015 - }, - "Sociology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" - } - } - }, - { - "evaluation_name": "Virology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Virology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.602, - "details": { - "description": "min=0.602, mean=0.602, max=0.602, sum=1.205 (2)", - "tab": "Accuracy", - "Virology - Observed inference time (s)": { - "description": "min=0.523, mean=0.523, max=0.523, sum=1.045 (2)", - "tab": "Efficiency", - "score": 0.5226844951330897 - }, - "Virology - # eval": { - "description": "min=166, mean=166, max=166, sum=332 (2)", - "tab": "General information", - "score": 166.0 - }, - "Virology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Virology - truncated": { - "description": 
"min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Virology - # prompt tokens": { - "description": "min=343.09, mean=343.09, max=343.09, sum=686.181 (2)", - "tab": "General information", - "score": 343.0903614457831 - }, - "Virology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" - } - } - }, - { - "evaluation_name": "World Religions", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on World Religions", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.848, - "details": { - "description": "min=0.848, mean=0.848, max=0.848, sum=1.696 (2)", - "tab": "Accuracy", - "World Religions - Observed inference time (s)": { - "description": "min=0.494, mean=0.494, max=0.494, sum=0.988 (2)", - "tab": "Efficiency", - "score": 0.49407080739562276 - }, - "World Religions - # eval": { - "description": "min=171, mean=171, max=171, sum=342 (2)", - "tab": "General information", - "score": 171.0 - }, - "World Religions - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "World Religions - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "World Religions - # prompt tokens": { - "description": "min=275.561, mean=275.561, max=275.561, sum=551.123 (2)", - "tab": "General information", - "score": 275.56140350877195 - }, - "World Religions - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" - } - } - }, - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.351, - "details": { - "tab": "Efficiency" - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_mmlu/openai/gpt-4o-2024-05-13/9e0b9f48-f913-4bbe-a135-59e596c9e479.json b/data/helm_mmlu/openai/gpt-4o-2024-05-13/9e0b9f48-f913-4bbe-a135-59e596c9e479.json deleted file mode 100644 index 76ba53d53..000000000 --- a/data/helm_mmlu/openai/gpt-4o-2024-05-13/9e0b9f48-f913-4bbe-a135-59e596c9e479.json +++ /dev/null @@ -1,3021 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/openai_gpt-4o-2024-05-13/1770835937.459157", - "retrieved_timestamp": "1770835937.459157", - "source_metadata": { - "source_name": "helm_mmlu", - "source_type": "documentation", - 
"source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "GPT-4o 2024-05-13", - "id": "openai/gpt-4o-2024-05-13", - "developer": "openai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "MMLU All Subjects", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU All Subjects", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.842, - "details": { - "description": "min=0.47, mean=0.842, max=0.979, sum=95.957 (114)", - "tab": "Accuracy", - "MMLU All Subjects - Observed inference time (s)": { - "description": "min=0.314, mean=0.37, max=0.515, sum=42.144 (114)", - "tab": "Efficiency", - "score": 0.3696883367683005 - }, - "MMLU All Subjects - # eval": { - "description": "min=100, mean=246.351, max=1534, sum=28084 (114)", - "tab": "General information", - "score": 246.35087719298247 - }, - "MMLU All Subjects - # train": { - "description": "min=5, mean=5, max=5, sum=570 (114)", - "tab": "General information", - "score": 5.0 - }, - "MMLU All Subjects - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (114)", - "tab": "General information", - "score": 0.0 - }, - "MMLU All Subjects - # prompt tokens": { - "description": "min=267.936, mean=612.332, max=2793.83, sum=69805.818 (114)", - "tab": "General information", - "score": 612.3317391408493 - }, - "MMLU All Subjects - # output tokens": { - "description": "min=1, mean=1, max=1, sum=114 (114)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - 
"mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] - } - } - }, - { - "evaluation_name": "Abstract Algebra", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Abstract Algebra", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.66, - "details": { - "description": "min=0.66, mean=0.66, max=0.66, sum=1.32 (2)", - "tab": "Accuracy", - "Abstract Algebra - Observed inference time (s)": { - "description": "min=0.381, mean=0.381, max=0.381, sum=0.761 (2)", - "tab": "Efficiency", - "score": 0.38067533016204835 - }, - "Abstract Algebra - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Abstract Algebra - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Abstract Algebra - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Abstract Algebra - # prompt tokens": { - "description": "min=374.53, mean=374.53, max=374.53, sum=749.06 (2)", - "tab": "General information", - "score": 374.53 - }, - "Abstract Algebra - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" - } - } - }, - { - "evaluation_name": "Anatomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Anatomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.911, - "details": { - 
"description": "min=0.911, mean=0.911, max=0.911, sum=1.822 (2)", - "tab": "Accuracy", - "Anatomy - Observed inference time (s)": { - "description": "min=0.333, mean=0.333, max=0.333, sum=0.666 (2)", - "tab": "Efficiency", - "score": 0.3328125264909532 - }, - "Anatomy - # eval": { - "description": "min=135, mean=135, max=135, sum=270 (2)", - "tab": "General information", - "score": 135.0 - }, - "Anatomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Anatomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Anatomy - # prompt tokens": { - "description": "min=350.6, mean=350.6, max=350.6, sum=701.2 (2)", - "tab": "General information", - "score": 350.6 - }, - "Anatomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" - } - } - }, - { - "evaluation_name": "College Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on College Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.686, - "details": { - "description": "min=0.686, mean=0.686, max=0.686, sum=1.373 (2)", - "tab": "Accuracy", - "College Chemistry - Observed inference time (s)": { - "description": "min=0.473, mean=0.473, max=0.473, sum=0.947 (2)", - "tab": "Efficiency", - "score": 0.4733888053894043 - }, - "College Biology - Observed inference time (s)": { - "description": "min=0.428, mean=0.428, max=0.428, sum=0.855 (2)", - "tab": "Efficiency", - "score": 0.4276181277301576 - }, - "College Computer Science - Observed inference time (s)": { - "description": "min=0.367, mean=0.367, max=0.367, sum=0.734 (2)", - "tab": "Efficiency", - "score": 0.36701245784759523 - }, - "College Mathematics - Observed inference time (s)": { - "description": "min=0.332, mean=0.332, max=0.332, sum=0.665 (2)", - "tab": "Efficiency", - "score": 0.3324534225463867 - }, - "College Medicine - Observed inference time (s)": { - "description": "min=0.365, mean=0.365, max=0.365, sum=0.73 (2)", - "tab": "Efficiency", - "score": 0.3647800649521668 - }, - "College Physics - Observed inference time (s)": { - "description": "min=0.349, mean=0.349, max=0.349, sum=0.699 (2)", - "tab": "Efficiency", - "score": 0.3492975866093355 - }, - "College Chemistry - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Chemistry - # prompt tokens": { - "description": "min=552.07, mean=552.07, max=552.07, sum=1104.14 (2)", - "tab": "General information", - "score": 552.07 - }, - "College Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College 
Biology - # eval": { - "description": "min=144, mean=144, max=144, sum=288 (2)", - "tab": "General information", - "score": 144.0 - }, - "College Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # prompt tokens": { - "description": "min=468.056, mean=468.056, max=468.056, sum=936.111 (2)", - "tab": "General information", - "score": 468.05555555555554 - }, - "College Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer Science - # prompt tokens": { - "description": "min=828.39, mean=828.39, max=828.39, sum=1656.78 (2)", - "tab": "General information", - "score": 828.39 - }, - "College Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Mathematics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # prompt tokens": { - "description": "min=594.44, mean=594.44, max=594.44, sum=1188.88 (2)", - "tab": "General information", - "score": 594.44 - }, - "College Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Medicine - # eval": { - "description": "min=173, mean=173, max=173, sum=346 (2)", - "tab": "General information", - "score": 173.0 - }, - "College Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Medicine - # prompt tokens": { - "description": "min=499.566, mean=499.566, max=499.566, sum=999.133 (2)", - "tab": "General information", - "score": 499.5664739884393 - }, - "College Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Physics - # eval": { - "description": "min=102, mean=102, max=102, sum=204 (2)", - "tab": "General information", - "score": 102.0 - }, - "College Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Physics - # prompt tokens": { - "description": "min=502.412, mean=502.412, 
max=502.412, sum=1004.824 (2)", - "tab": "General information", - "score": 502.4117647058824 - }, - "College Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" - } - } - }, - { - "evaluation_name": "Computer Security", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Computer Security", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.85, - "details": { - "description": "min=0.85, mean=0.85, max=0.85, sum=1.7 (2)", - "tab": "Accuracy", - "Computer Security - Observed inference time (s)": { - "description": "min=0.36, mean=0.36, max=0.36, sum=0.72 (2)", - "tab": "Efficiency", - "score": 0.35994538068771365 - }, - "Computer Security - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Computer Security - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Computer Security - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Computer Security - # prompt tokens": { - "description": "min=373.42, mean=373.42, max=373.42, sum=746.84 (2)", - "tab": "General information", - "score": 373.42 - }, - "Computer Security - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" - } - } - }, - { - "evaluation_name": "Econometrics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Econometrics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.693, - "details": { - "description": "min=0.693, mean=0.693, max=0.693, sum=1.386 (2)", - "tab": "Accuracy", - "Econometrics - Observed inference time (s)": { - "description": "min=0.354, mean=0.354, max=0.354, sum=0.709 (2)", - "tab": "Efficiency", - "score": 0.3544190766518576 - }, - "Econometrics - # eval": { - "description": "min=114, mean=114, max=114, sum=228 (2)", - "tab": "General information", - "score": 114.0 - }, - "Econometrics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Econometrics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Econometrics - # prompt tokens": { - "description": "min=613.228, mean=613.228, max=613.228, sum=1226.456 (2)", - "tab": "General information", - "score": 613.2280701754386 - }, - "Econometrics - # output tokens": { - "description": "min=1, mean=1, 
max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" - } - } - }, - { - "evaluation_name": "Global Facts", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Global Facts", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.64, - "details": { - "description": "min=0.64, mean=0.64, max=0.64, sum=1.28 (2)", - "tab": "Accuracy", - "Global Facts - Observed inference time (s)": { - "description": "min=0.326, mean=0.326, max=0.326, sum=0.653 (2)", - "tab": "Efficiency", - "score": 0.3264468240737915 - }, - "Global Facts - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Global Facts - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Global Facts - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Global Facts - # prompt tokens": { - "description": "min=399.69, mean=399.69, max=399.69, sum=799.38 (2)", - "tab": "General information", - "score": 399.69 - }, - "Global Facts - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" - } - } - }, - { - "evaluation_name": "Jurisprudence", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Jurisprudence", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.898, - "details": { - "description": "min=0.898, mean=0.898, max=0.898, sum=1.796 (2)", - "tab": "Accuracy", - "Jurisprudence - Observed inference time (s)": { - "description": "min=0.407, mean=0.407, max=0.407, sum=0.815 (2)", - "tab": "Efficiency", - "score": 0.40749982330534196 - }, - "Jurisprudence - # eval": { - "description": "min=108, mean=108, max=108, sum=216 (2)", - "tab": "General information", - "score": 108.0 - }, - "Jurisprudence - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Jurisprudence - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Jurisprudence - # prompt tokens": { - "description": "min=391.231, mean=391.231, max=391.231, sum=782.463 (2)", - "tab": "General information", - "score": 391.23148148148147 - }, - "Jurisprudence - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": 
"test", - "groups": "mmlu_jurisprudence" - } - } - }, - { - "evaluation_name": "Philosophy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Philosophy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9, - "details": { - "description": "min=0.9, mean=0.9, max=0.9, sum=1.801 (2)", - "tab": "Accuracy", - "Philosophy - Observed inference time (s)": { - "description": "min=0.482, mean=0.482, max=0.482, sum=0.963 (2)", - "tab": "Efficiency", - "score": 0.48153685373508665 - }, - "Philosophy - # eval": { - "description": "min=311, mean=311, max=311, sum=622 (2)", - "tab": "General information", - "score": 311.0 - }, - "Philosophy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Philosophy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Philosophy - # prompt tokens": { - "description": "min=327.92, mean=327.92, max=327.92, sum=655.839 (2)", - "tab": "General information", - "score": 327.91961414790995 - }, - "Philosophy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" - } - } - }, - { - "evaluation_name": "Professional Psychology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Professional Psychology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.905, - "details": { - "description": "min=0.905, mean=0.905, max=0.905, sum=1.81 (2)", - "tab": "Accuracy", - "Professional Medicine - Observed inference time (s)": { - "description": "min=0.386, mean=0.386, max=0.386, sum=0.772 (2)", - "tab": "Efficiency", - "score": 0.3862454724662444 - }, - "Professional Accounting - Observed inference time (s)": { - "description": "min=0.347, mean=0.347, max=0.347, sum=0.694 (2)", - "tab": "Efficiency", - "score": 0.3472177982330322 - }, - "Professional Law - Observed inference time (s)": { - "description": "min=0.474, mean=0.474, max=0.474, sum=0.947 (2)", - "tab": "Efficiency", - "score": 0.47372100343915596 - }, - "Professional Psychology - Observed inference time (s)": { - "description": "min=0.33, mean=0.33, max=0.33, sum=0.661 (2)", - "tab": "Efficiency", - "score": 0.330327843528947 - }, - "Professional Medicine - # eval": { - "description": "min=272, mean=272, max=272, sum=544 (2)", - "tab": "General information", - "score": 272.0 - }, - "Professional Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Medicine - # prompt tokens": { - "description": "min=1071.18, 
mean=1071.18, max=1071.18, sum=2142.36 (2)", - "tab": "General information", - "score": 1071.1801470588234 - }, - "Professional Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Accounting - # eval": { - "description": "min=282, mean=282, max=282, sum=564 (2)", - "tab": "General information", - "score": 282.0 - }, - "Professional Accounting - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Accounting - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Accounting - # prompt tokens": { - "description": "min=657.206, mean=657.206, max=657.206, sum=1314.411 (2)", - "tab": "General information", - "score": 657.2056737588653 - }, - "Professional Accounting - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Law - # eval": { - "description": "min=1534, mean=1534, max=1534, sum=3068 (2)", - "tab": "General information", - "score": 1534.0 - }, - "Professional Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # prompt tokens": { - "description": "min=1629.344, mean=1629.344, max=1629.344, sum=3258.687 (2)", - "tab": "General information", - "score": 1629.3435462842242 - }, - "Professional Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Psychology - # eval": { - "description": "min=612, mean=612, max=612, sum=1224 (2)", - "tab": "General information", - "score": 612.0 - }, - "Professional Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # prompt tokens": { - "description": "min=574.518, mean=574.518, max=574.518, sum=1149.036 (2)", - "tab": "General information", - "score": 574.5179738562091 - }, - "Professional Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" - } - } - }, - { - "evaluation_name": "Us Foreign Policy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Us Foreign Policy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.96, - "details": { - "description": "min=0.96, mean=0.96, max=0.96, sum=1.92 (2)", - "tab": "Accuracy", - "Us Foreign Policy - Observed inference time (s)": { - "description": "min=0.336, mean=0.336, max=0.336, sum=0.672 (2)", - "tab": "Efficiency", - 
"score": 0.335811505317688 - }, - "Us Foreign Policy - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Us Foreign Policy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Us Foreign Policy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Us Foreign Policy - # prompt tokens": { - "description": "min=421.71, mean=421.71, max=421.71, sum=843.42 (2)", - "tab": "General information", - "score": 421.71 - }, - "Us Foreign Policy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" - } - } - }, - { - "evaluation_name": "Astronomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Astronomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.941, - "details": { - "description": "min=0.941, mean=0.941, max=0.941, sum=1.882 (2)", - "tab": "Accuracy", - "Astronomy - Observed inference time (s)": { - "description": "min=0.349, mean=0.349, max=0.349, sum=0.697 (2)", - "tab": "Efficiency", - "score": 0.34870150528456034 - }, - "Astronomy - # eval": { - "description": "min=152, mean=152, max=152, sum=304 (2)", - "tab": "General information", - "score": 152.0 - }, - "Astronomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Astronomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Astronomy - # prompt tokens": { - "description": "min=577.349, mean=577.349, max=577.349, sum=1154.697 (2)", - "tab": "General information", - "score": 577.3486842105264 - }, - "Astronomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" - } - } - }, - { - "evaluation_name": "Business Ethics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Business Ethics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.85, - "details": { - "description": "min=0.85, mean=0.85, max=0.85, sum=1.7 (2)", - "tab": "Accuracy", - "Business Ethics - Observed inference time (s)": { - "description": "min=0.345, mean=0.345, max=0.345, sum=0.69 (2)", - "tab": "Efficiency", - "score": 0.3450936794281006 - }, - "Business Ethics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Business Ethics - # train": { - 
"description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Business Ethics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Business Ethics - # prompt tokens": { - "description": "min=565.7, mean=565.7, max=565.7, sum=1131.4 (2)", - "tab": "General information", - "score": 565.7 - }, - "Business Ethics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" - } - } - }, - { - "evaluation_name": "Clinical Knowledge", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Clinical Knowledge", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.894, - "details": { - "description": "min=0.894, mean=0.894, max=0.894, sum=1.789 (2)", - "tab": "Accuracy", - "Clinical Knowledge - Observed inference time (s)": { - "description": "min=0.331, mean=0.331, max=0.331, sum=0.662 (2)", - "tab": "Efficiency", - "score": 0.33114023748433813 - }, - "Clinical Knowledge - # eval": { - "description": "min=265, mean=265, max=265, sum=530 (2)", - "tab": "General information", - "score": 265.0 - }, - "Clinical Knowledge - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Clinical Knowledge - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Clinical Knowledge - # prompt tokens": { - "description": "min=400.985, mean=400.985, max=400.985, sum=801.97 (2)", - "tab": "General information", - "score": 400.98490566037736 - }, - "Clinical Knowledge - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" - } - } - }, - { - "evaluation_name": "Conceptual Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Conceptual Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.911, - "details": { - "description": "min=0.911, mean=0.911, max=0.911, sum=1.821 (2)", - "tab": "Accuracy", - "Conceptual Physics - Observed inference time (s)": { - "description": "min=0.346, mean=0.346, max=0.346, sum=0.693 (2)", - "tab": "Efficiency", - "score": 0.34625059391589874 - }, - "Conceptual Physics - # eval": { - "description": "min=235, mean=235, max=235, sum=470 (2)", - "tab": "General information", - "score": 235.0 - }, - "Conceptual Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Conceptual 
Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Conceptual Physics - # prompt tokens": { - "description": "min=304.677, mean=304.677, max=304.677, sum=609.353 (2)", - "tab": "General information", - "score": 304.67659574468087 - }, - "Conceptual Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" - } - } - }, - { - "evaluation_name": "Electrical Engineering", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Electrical Engineering", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.807, - "details": { - "description": "min=0.807, mean=0.807, max=0.807, sum=1.614 (2)", - "tab": "Accuracy", - "Electrical Engineering - Observed inference time (s)": { - "description": "min=0.359, mean=0.359, max=0.359, sum=0.717 (2)", - "tab": "Efficiency", - "score": 0.35874251661629514 - }, - "Electrical Engineering - # eval": { - "description": "min=145, mean=145, max=145, sum=290 (2)", - "tab": "General information", - "score": 145.0 - }, - "Electrical Engineering - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Electrical Engineering - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Electrical Engineering - # prompt tokens": { - "description": "min=439.228, mean=439.228, max=439.228, sum=878.455 (2)", - "tab": "General information", - "score": 439.22758620689655 - }, - "Electrical Engineering - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" - } - } - }, - { - "evaluation_name": "Elementary Mathematics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Elementary Mathematics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.741, - "details": { - "description": "min=0.741, mean=0.741, max=0.741, sum=1.481 (2)", - "tab": "Accuracy", - "Elementary Mathematics - Observed inference time (s)": { - "description": "min=0.36, mean=0.36, max=0.36, sum=0.721 (2)", - "tab": "Efficiency", - "score": 0.360492156926917 - }, - "Elementary Mathematics - # eval": { - "description": "min=378, mean=378, max=378, sum=756 (2)", - "tab": "General information", - "score": 378.0 - }, - "Elementary Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Elementary Mathematics - truncated": { 
- "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Elementary Mathematics - # prompt tokens": { - "description": "min=532.683, mean=532.683, max=532.683, sum=1065.365 (2)", - "tab": "General information", - "score": 532.6825396825396 - }, - "Elementary Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" - } - } - }, - { - "evaluation_name": "Formal Logic", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Formal Logic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.683, - "details": { - "description": "min=0.683, mean=0.683, max=0.683, sum=1.365 (2)", - "tab": "Accuracy", - "Formal Logic - Observed inference time (s)": { - "description": "min=0.458, mean=0.458, max=0.458, sum=0.915 (2)", - "tab": "Efficiency", - "score": 0.4577372566102043 - }, - "Formal Logic - # eval": { - "description": "min=126, mean=126, max=126, sum=252 (2)", - "tab": "General information", - "score": 126.0 - }, - "Formal Logic - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Formal Logic - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Formal Logic - # prompt tokens": { - "description": "min=604.492, mean=604.492, max=604.492, sum=1208.984 (2)", - "tab": "General information", - "score": 604.4920634920635 - }, - "Formal Logic - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" - } - } - }, - { - "evaluation_name": "High School World History", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on High School World History", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.945, - "details": { - "description": "min=0.945, mean=0.945, max=0.945, sum=1.89 (2)", - "tab": "Accuracy", - "High School Biology - Observed inference time (s)": { - "description": "min=0.422, mean=0.422, max=0.422, sum=0.844 (2)", - "tab": "Efficiency", - "score": 0.42223084818932316 - }, - "High School Chemistry - Observed inference time (s)": { - "description": "min=0.352, mean=0.352, max=0.352, sum=0.703 (2)", - "tab": "Efficiency", - "score": 0.3515606560730582 - }, - "High School Computer Science - Observed inference time (s)": { - "description": "min=0.39, mean=0.39, max=0.39, sum=0.78 (2)", - "tab": "Efficiency", - "score": 0.39000784397125243 - }, - "High School European History - Observed inference time (s)": { - 
"description": "min=0.515, mean=0.515, max=0.515, sum=1.029 (2)", - "tab": "Efficiency", - "score": 0.5147185542366721 - }, - "High School Geography - Observed inference time (s)": { - "description": "min=0.349, mean=0.349, max=0.349, sum=0.697 (2)", - "tab": "Efficiency", - "score": 0.34874117615247013 - }, - "High School Government And Politics - Observed inference time (s)": { - "description": "min=0.425, mean=0.425, max=0.425, sum=0.85 (2)", - "tab": "Efficiency", - "score": 0.4252293505199215 - }, - "High School Macroeconomics - Observed inference time (s)": { - "description": "min=0.342, mean=0.342, max=0.342, sum=0.684 (2)", - "tab": "Efficiency", - "score": 0.3419678932581192 - }, - "High School Mathematics - Observed inference time (s)": { - "description": "min=0.348, mean=0.348, max=0.348, sum=0.697 (2)", - "tab": "Efficiency", - "score": 0.3482617440047088 - }, - "High School Microeconomics - Observed inference time (s)": { - "description": "min=0.41, mean=0.41, max=0.41, sum=0.819 (2)", - "tab": "Efficiency", - "score": 0.4096046676154898 - }, - "High School Physics - Observed inference time (s)": { - "description": "min=0.365, mean=0.365, max=0.365, sum=0.731 (2)", - "tab": "Efficiency", - "score": 0.36535484427647874 - }, - "High School Psychology - Observed inference time (s)": { - "description": "min=0.344, mean=0.344, max=0.344, sum=0.687 (2)", - "tab": "Efficiency", - "score": 0.3435875463923183 - }, - "High School Statistics - Observed inference time (s)": { - "description": "min=0.343, mean=0.343, max=0.343, sum=0.687 (2)", - "tab": "Efficiency", - "score": 0.3434795880759204 - }, - "High School US History - Observed inference time (s)": { - "description": "min=0.508, mean=0.508, max=0.508, sum=1.016 (2)", - "tab": "Efficiency", - "score": 0.5077870616725847 - }, - "High School World History - Observed inference time (s)": { - "description": "min=0.445, mean=0.445, max=0.445, sum=0.891 (2)", - "tab": "Efficiency", - "score": 0.44530287473010616 - }, - "High School Biology - # eval": { - "description": "min=310, mean=310, max=310, sum=620 (2)", - "tab": "General information", - "score": 310.0 - }, - "High School Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Biology - # prompt tokens": { - "description": "min=504.874, mean=504.874, max=504.874, sum=1009.748 (2)", - "tab": "General information", - "score": 504.8741935483871 - }, - "High School Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Chemistry - # eval": { - "description": "min=203, mean=203, max=203, sum=406 (2)", - "tab": "General information", - "score": 203.0 - }, - "High School Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # prompt tokens": { - "description": "min=495.34, mean=495.34, max=495.34, sum=990.68 (2)", - "tab": "General information", - "score": 495.3399014778325 - }, - "High School Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - 
"High School Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "High School Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # prompt tokens": { - "description": "min=865.8, mean=865.8, max=865.8, sum=1731.6 (2)", - "tab": "General information", - "score": 865.8 - }, - "High School Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School European History - # eval": { - "description": "min=165, mean=165, max=165, sum=330 (2)", - "tab": "General information", - "score": 165.0 - }, - "High School European History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School European History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School European History - # prompt tokens": { - "description": "min=2793.83, mean=2793.83, max=2793.83, sum=5587.661 (2)", - "tab": "General information", - "score": 2793.830303030303 - }, - "High School European History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Geography - # eval": { - "description": "min=198, mean=198, max=198, sum=396 (2)", - "tab": "General information", - "score": 198.0 - }, - "High School Geography - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Geography - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Geography - # prompt tokens": { - "description": "min=372.783, mean=372.783, max=372.783, sum=745.566 (2)", - "tab": "General information", - "score": 372.7828282828283 - }, - "High School Geography - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Government And Politics - # eval": { - "description": "min=193, mean=193, max=193, sum=386 (2)", - "tab": "General information", - "score": 193.0 - }, - "High School Government And Politics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Government And Politics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # prompt tokens": { - "description": "min=463.01, mean=463.01, max=463.01, sum=926.021 (2)", - "tab": "General information", - "score": 463.0103626943005 - }, - "High School Government And Politics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Macroeconomics - # eval": { - "description": "min=390, mean=390, max=390, sum=780 (2)", - "tab": "General information", - "score": 390.0 - }, - "High School Macroeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High 
School Macroeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - # prompt tokens": { - "description": "min=371.451, mean=371.451, max=371.451, sum=742.903 (2)", - "tab": "General information", - "score": 371.4512820512821 - }, - "High School Macroeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Mathematics - # eval": { - "description": "min=270, mean=270, max=270, sum=540 (2)", - "tab": "General information", - "score": 270.0 - }, - "High School Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # prompt tokens": { - "description": "min=532.456, mean=532.456, max=532.456, sum=1064.911 (2)", - "tab": "General information", - "score": 532.4555555555555 - }, - "High School Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Microeconomics - # eval": { - "description": "min=238, mean=238, max=238, sum=476 (2)", - "tab": "General information", - "score": 238.0 - }, - "High School Microeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Microeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Microeconomics - # prompt tokens": { - "description": "min=398.739, mean=398.739, max=398.739, sum=797.479 (2)", - "tab": "General information", - "score": 398.73949579831935 - }, - "High School Microeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Physics - # eval": { - "description": "min=151, mean=151, max=151, sum=302 (2)", - "tab": "General information", - "score": 151.0 - }, - "High School Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # prompt tokens": { - "description": "min=560.238, mean=560.238, max=560.238, sum=1120.477 (2)", - "tab": "General information", - "score": 560.2384105960265 - }, - "High School Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Psychology - # eval": { - "description": "min=545, mean=545, max=545, sum=1090 (2)", - "tab": "General information", - "score": 545.0 - }, - "High School Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # prompt tokens": { - "description": "min=492.917, mean=492.917, max=492.917, sum=985.835 (2)", - "tab": "General information", - "score": 492.91743119266056 - }, - "High School Psychology - # output tokens": { - 
"description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Statistics - # eval": { - "description": "min=216, mean=216, max=216, sum=432 (2)", - "tab": "General information", - "score": 216.0 - }, - "High School Statistics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Statistics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Statistics - # prompt tokens": { - "description": "min=787.574, mean=787.574, max=787.574, sum=1575.148 (2)", - "tab": "General information", - "score": 787.574074074074 - }, - "High School Statistics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School US History - # eval": { - "description": "min=204, mean=204, max=204, sum=408 (2)", - "tab": "General information", - "score": 204.0 - }, - "High School US History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School US History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # prompt tokens": { - "description": "min=2220.005, mean=2220.005, max=2220.005, sum=4440.01 (2)", - "tab": "General information", - "score": 2220.0049019607845 - }, - "High School US History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School World History - # eval": { - "description": "min=237, mean=237, max=237, sum=474 (2)", - "tab": "General information", - "score": 237.0 - }, - "High School World History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School World History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School World History - # prompt tokens": { - "description": "min=1424.439, mean=1424.439, max=1424.439, sum=2848.878 (2)", - "tab": "General information", - "score": 1424.4388185654009 - }, - "High School World History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" - } - } - }, - { - "evaluation_name": "Human Sexuality", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Human Sexuality", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.908, - "details": { - "description": "min=0.908, mean=0.908, max=0.908, sum=1.817 (2)", - "tab": "Accuracy", - "Human Aging - Observed inference time (s)": { - "description": "min=0.397, mean=0.397, max=0.397, sum=0.793 (2)", - "tab": "Efficiency", - "score": 0.39673851637562296 - }, - "Human Sexuality - Observed inference time (s)": { - "description": "min=0.372, mean=0.372, 
max=0.372, sum=0.744 (2)", - "tab": "Efficiency", - "score": 0.37223931305281077 - }, - "Human Aging - # eval": { - "description": "min=223, mean=223, max=223, sum=446 (2)", - "tab": "General information", - "score": 223.0 - }, - "Human Aging - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Aging - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Aging - # prompt tokens": { - "description": "min=316.453, mean=316.453, max=316.453, sum=632.906 (2)", - "tab": "General information", - "score": 316.4529147982063 - }, - "Human Aging - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Human Sexuality - # eval": { - "description": "min=131, mean=131, max=131, sum=262 (2)", - "tab": "General information", - "score": 131.0 - }, - "Human Sexuality - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Sexuality - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # prompt tokens": { - "description": "min=335.695, mean=335.695, max=335.695, sum=671.389 (2)", - "tab": "General information", - "score": 335.69465648854964 - }, - "Human Sexuality - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" - } - } - }, - { - "evaluation_name": "International Law", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on International Law", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.934, - "details": { - "description": "min=0.934, mean=0.934, max=0.934, sum=1.868 (2)", - "tab": "Accuracy", - "International Law - Observed inference time (s)": { - "description": "min=0.337, mean=0.337, max=0.337, sum=0.674 (2)", - "tab": "Efficiency", - "score": 0.336965306731295 - }, - "International Law - # eval": { - "description": "min=121, mean=121, max=121, sum=242 (2)", - "tab": "General information", - "score": 121.0 - }, - "International Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "International Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "International Law - # prompt tokens": { - "description": "min=639.504, mean=639.504, max=639.504, sum=1279.008 (2)", - "tab": "General information", - "score": 639.5041322314049 - }, - "International Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" - } - } - }, - { - "evaluation_name": "Logical Fallacies", - 
"source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Logical Fallacies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.883, - "details": { - "description": "min=0.883, mean=0.883, max=0.883, sum=1.767 (2)", - "tab": "Accuracy", - "Logical Fallacies - Observed inference time (s)": { - "description": "min=0.321, mean=0.321, max=0.321, sum=0.643 (2)", - "tab": "Efficiency", - "score": 0.3214270746781051 - }, - "Logical Fallacies - # eval": { - "description": "min=163, mean=163, max=163, sum=326 (2)", - "tab": "General information", - "score": 163.0 - }, - "Logical Fallacies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Logical Fallacies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Logical Fallacies - # prompt tokens": { - "description": "min=445.84, mean=445.84, max=445.84, sum=891.681 (2)", - "tab": "General information", - "score": 445.840490797546 - }, - "Logical Fallacies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" - } - } - }, - { - "evaluation_name": "Machine Learning", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Machine Learning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.768, - "details": { - "description": "min=0.768, mean=0.768, max=0.768, sum=1.536 (2)", - "tab": "Accuracy", - "Machine Learning - Observed inference time (s)": { - "description": "min=0.328, mean=0.328, max=0.328, sum=0.657 (2)", - "tab": "Efficiency", - "score": 0.3284116280930383 - }, - "Machine Learning - # eval": { - "description": "min=112, mean=112, max=112, sum=224 (2)", - "tab": "General information", - "score": 112.0 - }, - "Machine Learning - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Machine Learning - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Machine Learning - # prompt tokens": { - "description": "min=666.205, mean=666.205, max=666.205, sum=1332.411 (2)", - "tab": "General information", - "score": 666.2053571428571 - }, - "Machine Learning - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" - } - } - }, - { - "evaluation_name": "Management", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Management", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.942, - "details": { - "description": "min=0.942, mean=0.942, max=0.942, sum=1.883 (2)", - "tab": "Accuracy", - "Management - Observed inference time (s)": { - "description": "min=0.32, mean=0.32, max=0.32, sum=0.64 (2)", - "tab": "Efficiency", - "score": 0.32008614354920617 - }, - "Management - # eval": { - "description": "min=103, mean=103, max=103, sum=206 (2)", - "tab": "General information", - "score": 103.0 - }, - "Management - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Management - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Management - # prompt tokens": { - "description": "min=279.485, mean=279.485, max=279.485, sum=558.971 (2)", - "tab": "General information", - "score": 279.4854368932039 - }, - "Management - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" - } - } - }, - { - "evaluation_name": "Marketing", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Marketing", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.936, - "details": { - "description": "min=0.936, mean=0.936, max=0.936, sum=1.872 (2)", - "tab": "Accuracy", - "Marketing - Observed inference time (s)": { - "description": "min=0.337, mean=0.337, max=0.337, sum=0.675 (2)", - "tab": "Efficiency", - "score": 0.3374974228378035 - }, - "Marketing - # eval": { - "description": "min=234, mean=234, max=234, sum=468 (2)", - "tab": "General information", - "score": 234.0 - }, - "Marketing - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Marketing - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Marketing - # prompt tokens": { - "description": "min=399.85, mean=399.85, max=399.85, sum=799.701 (2)", - "tab": "General information", - "score": 399.85042735042737 - }, - "Marketing - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" - } - } - }, - { - "evaluation_name": "Medical Genetics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Medical Genetics", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.96, - "details": { - "description": "min=0.96, mean=0.96, max=0.96, sum=1.92 (2)", - "tab": "Accuracy", - "Medical Genetics - Observed inference time (s)": { - "description": "min=0.33, mean=0.33, max=0.33, sum=0.66 (2)", - "tab": "Efficiency", - "score": 0.33016372203826905 - }, - "Medical Genetics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Medical Genetics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Medical Genetics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Medical Genetics - # prompt tokens": { - "description": "min=343.23, mean=343.23, max=343.23, sum=686.46 (2)", - "tab": "General information", - "score": 343.23 - }, - "Medical Genetics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" - } - } - }, - { - "evaluation_name": "Miscellaneous", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Miscellaneous", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.954, - "details": { - "description": "min=0.954, mean=0.954, max=0.954, sum=1.908 (2)", - "tab": "Accuracy", - "Miscellaneous - Observed inference time (s)": { - "description": "min=0.336, mean=0.336, max=0.336, sum=0.672 (2)", - "tab": "Efficiency", - "score": 0.335910246898997 - }, - "Miscellaneous - # eval": { - "description": "min=783, mean=783, max=783, sum=1566 (2)", - "tab": "General information", - "score": 783.0 - }, - "Miscellaneous - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Miscellaneous - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Miscellaneous - # prompt tokens": { - "description": "min=296.479, mean=296.479, max=296.479, sum=592.958 (2)", - "tab": "General information", - "score": 296.47892720306515 - }, - "Miscellaneous - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" - } - } - }, - { - "evaluation_name": "Moral Scenarios", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Moral Scenarios", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.841, - "details": { - "description": "min=0.841, mean=0.841, max=0.841, sum=1.683 (2)", 
- "tab": "Accuracy", - "Moral Disputes - Observed inference time (s)": { - "description": "min=0.333, mean=0.333, max=0.333, sum=0.667 (2)", - "tab": "Efficiency", - "score": 0.3332573719796418 - }, - "Moral Scenarios - Observed inference time (s)": { - "description": "min=0.344, mean=0.344, max=0.344, sum=0.687 (2)", - "tab": "Efficiency", - "score": 0.3436078146183291 - }, - "Moral Disputes - # eval": { - "description": "min=346, mean=346, max=346, sum=692 (2)", - "tab": "General information", - "score": 346.0 - }, - "Moral Disputes - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Disputes - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Disputes - # prompt tokens": { - "description": "min=474.835, mean=474.835, max=474.835, sum=949.671 (2)", - "tab": "General information", - "score": 474.83526011560696 - }, - "Moral Disputes - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Moral Scenarios - # eval": { - "description": "min=895, mean=895, max=895, sum=1790 (2)", - "tab": "General information", - "score": 895.0 - }, - "Moral Scenarios - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Scenarios - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # prompt tokens": { - "description": "min=655.068, mean=655.068, max=655.068, sum=1310.136 (2)", - "tab": "General information", - "score": 655.068156424581 - }, - "Moral Scenarios - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" - } - } - }, - { - "evaluation_name": "Nutrition", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Nutrition", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.899, - "details": { - "description": "min=0.899, mean=0.899, max=0.899, sum=1.797 (2)", - "tab": "Accuracy", - "Nutrition - Observed inference time (s)": { - "description": "min=0.368, mean=0.368, max=0.368, sum=0.737 (2)", - "tab": "Efficiency", - "score": 0.36828617722380397 - }, - "Nutrition - # eval": { - "description": "min=306, mean=306, max=306, sum=612 (2)", - "tab": "General information", - "score": 306.0 - }, - "Nutrition - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Nutrition - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Nutrition - # prompt tokens": { - "description": "min=581.997, mean=581.997, max=581.997, sum=1163.993 (2)", - "tab": "General information", - "score": 581.9967320261438 - }, - "Nutrition - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - 
"generation_config": { - "additional_details": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" - } - } - }, - { - "evaluation_name": "Prehistory", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Prehistory", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.938, - "details": { - "description": "min=0.938, mean=0.938, max=0.938, sum=1.877 (2)", - "tab": "Accuracy", - "Prehistory - Observed inference time (s)": { - "description": "min=0.318, mean=0.318, max=0.318, sum=0.635 (2)", - "tab": "Efficiency", - "score": 0.31765871430620735 - }, - "Prehistory - # eval": { - "description": "min=324, mean=324, max=324, sum=648 (2)", - "tab": "General information", - "score": 324.0 - }, - "Prehistory - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Prehistory - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Prehistory - # prompt tokens": { - "description": "min=513.944, mean=513.944, max=513.944, sum=1027.889 (2)", - "tab": "General information", - "score": 513.9444444444445 - }, - "Prehistory - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" - } - } - }, - { - "evaluation_name": "Public Relations", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Public Relations", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.809, - "details": { - "description": "min=0.809, mean=0.809, max=0.809, sum=1.618 (2)", - "tab": "Accuracy", - "Public Relations - Observed inference time (s)": { - "description": "min=0.35, mean=0.35, max=0.35, sum=0.699 (2)", - "tab": "Efficiency", - "score": 0.3496434450149536 - }, - "Public Relations - # eval": { - "description": "min=110, mean=110, max=110, sum=220 (2)", - "tab": "General information", - "score": 110.0 - }, - "Public Relations - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Public Relations - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Public Relations - # prompt tokens": { - "description": "min=402.918, mean=402.918, max=402.918, sum=805.836 (2)", - "tab": "General information", - "score": 402.91818181818184 - }, - "Public Relations - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" - } - } - }, - { - 
"evaluation_name": "Security Studies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Security Studies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.837, - "details": { - "description": "min=0.837, mean=0.837, max=0.837, sum=1.673 (2)", - "tab": "Accuracy", - "Security Studies - Observed inference time (s)": { - "description": "min=0.35, mean=0.35, max=0.35, sum=0.7 (2)", - "tab": "Efficiency", - "score": 0.3501845612817881 - }, - "Security Studies - # eval": { - "description": "min=245, mean=245, max=245, sum=490 (2)", - "tab": "General information", - "score": 245.0 - }, - "Security Studies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Security Studies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Security Studies - # prompt tokens": { - "description": "min=1166.686, mean=1166.686, max=1166.686, sum=2333.371 (2)", - "tab": "General information", - "score": 1166.6857142857143 - }, - "Security Studies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" - } - } - }, - { - "evaluation_name": "Sociology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Sociology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.94, - "details": { - "description": "min=0.94, mean=0.94, max=0.94, sum=1.881 (2)", - "tab": "Accuracy", - "Sociology - Observed inference time (s)": { - "description": "min=0.347, mean=0.347, max=0.347, sum=0.693 (2)", - "tab": "Efficiency", - "score": 0.346723644294549 - }, - "Sociology - # eval": { - "description": "min=201, mean=201, max=201, sum=402 (2)", - "tab": "General information", - "score": 201.0 - }, - "Sociology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Sociology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Sociology - # prompt tokens": { - "description": "min=444.269, mean=444.269, max=444.269, sum=888.537 (2)", - "tab": "General information", - "score": 444.2686567164179 - }, - "Sociology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" - } - } - }, - { - "evaluation_name": "Virology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Virology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.596, - "details": { - "description": "min=0.596, mean=0.596, max=0.596, sum=1.193 (2)", - "tab": "Accuracy", - "Virology - Observed inference time (s)": { - "description": "min=0.314, mean=0.314, max=0.314, sum=0.628 (2)", - "tab": "Efficiency", - "score": 0.3142197634800371 - }, - "Virology - # eval": { - "description": "min=166, mean=166, max=166, sum=332 (2)", - "tab": "General information", - "score": 166.0 - }, - "Virology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Virology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Virology - # prompt tokens": { - "description": "min=334.434, mean=334.434, max=334.434, sum=668.867 (2)", - "tab": "General information", - "score": 334.43373493975906 - }, - "Virology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" - } - } - }, - { - "evaluation_name": "World Religions", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on World Religions", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.889, - "details": { - "description": "min=0.889, mean=0.889, max=0.889, sum=1.778 (2)", - "tab": "Accuracy", - "World Religions - Observed inference time (s)": { - "description": "min=0.332, mean=0.332, max=0.332, sum=0.664 (2)", - "tab": "Efficiency", - "score": 0.3320118307370191 - }, - "World Religions - # eval": { - "description": "min=171, mean=171, max=171, sum=342 (2)", - "tab": "General information", - "score": 171.0 - }, - "World Religions - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "World Religions - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "World Religions - # prompt tokens": { - "description": "min=267.936, mean=267.936, max=267.936, sum=535.871 (2)", - "tab": "General information", - "score": 267.9356725146199 - }, - "World Religions - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" - } - } - }, - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model 
outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.671, - "details": { - "tab": "Efficiency" - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_mmlu/openai/gpt-4o-2024-08-06/189e6cc5-1c8f-4712-8dda-c108f18f836d.json b/data/helm_mmlu/openai/gpt-4o-2024-08-06/189e6cc5-1c8f-4712-8dda-c108f18f836d.json deleted file mode 100644 index 2d538eb02..000000000 --- a/data/helm_mmlu/openai/gpt-4o-2024-08-06/189e6cc5-1c8f-4712-8dda-c108f18f836d.json +++ /dev/null @@ -1,3021 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/openai_gpt-4o-2024-08-06/1770835937.459157", - "retrieved_timestamp": "1770835937.459157", - "source_metadata": { - "source_name": "helm_mmlu", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "GPT-4o 2024-08-06", - "id": "openai/gpt-4o-2024-08-06", - "developer": "openai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "MMLU All Subjects", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU All Subjects", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.843, - "details": { - "description": "min=0.481, mean=0.843, max=0.984, sum=96.141 (114)", - "tab": "Accuracy", - "MMLU All Subjects - Observed inference time (s)": { - "description": "min=0.301, mean=0.459, max=0.88, sum=52.346 (114)", - "tab": "Efficiency", - "score": 0.45917774780314197 - }, - "MMLU All Subjects - # eval": { - "description": "min=100, mean=246.351, max=1534, sum=28084 (114)", - "tab": "General information", - "score": 246.35087719298247 - }, - "MMLU All Subjects - # train": { - "description": "min=5, mean=5, max=5, sum=570 (114)", - "tab": "General information", - "score": 5.0 - }, - "MMLU All Subjects - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (114)", - "tab": "General information", - "score": 0.0 - }, - "MMLU All Subjects - # prompt tokens": { - "description": "min=267.936, mean=612.332, max=2793.83, sum=69805.818 (114)", - "tab": "General information", - "score": 612.3317391408493 - }, - "MMLU All Subjects - # output tokens": { - "description": "min=1, mean=1, max=1, sum=114 (114)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - 
"high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] - } - } - }, - { - "evaluation_name": "Abstract Algebra", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Abstract Algebra", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.58, - "details": { - "description": "min=0.58, mean=0.58, max=0.58, sum=1.16 (2)", - "tab": "Accuracy", - "Abstract Algebra - Observed inference time (s)": { - "description": "min=0.335, mean=0.335, max=0.335, sum=0.67 (2)", - "tab": "Efficiency", - "score": 0.3350093102455139 - }, - "Abstract Algebra - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Abstract Algebra - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Abstract Algebra - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Abstract Algebra - # prompt tokens": { - "description": "min=374.53, 
mean=374.53, max=374.53, sum=749.06 (2)", - "tab": "General information", - "score": 374.53 - }, - "Abstract Algebra - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" - } - } - }, - { - "evaluation_name": "Anatomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Anatomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.911, - "details": { - "description": "min=0.911, mean=0.911, max=0.911, sum=1.822 (2)", - "tab": "Accuracy", - "Anatomy - Observed inference time (s)": { - "description": "min=0.508, mean=0.508, max=0.508, sum=1.015 (2)", - "tab": "Efficiency", - "score": 0.5075124228442157 - }, - "Anatomy - # eval": { - "description": "min=135, mean=135, max=135, sum=270 (2)", - "tab": "General information", - "score": 135.0 - }, - "Anatomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Anatomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Anatomy - # prompt tokens": { - "description": "min=350.6, mean=350.6, max=350.6, sum=701.2 (2)", - "tab": "General information", - "score": 350.6 - }, - "Anatomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" - } - } - }, - { - "evaluation_name": "College Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on College Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.686, - "details": { - "description": "min=0.686, mean=0.686, max=0.686, sum=1.373 (2)", - "tab": "Accuracy", - "College Chemistry - Observed inference time (s)": { - "description": "min=0.409, mean=0.409, max=0.409, sum=0.818 (2)", - "tab": "Efficiency", - "score": 0.4090025806427002 - }, - "College Biology - Observed inference time (s)": { - "description": "min=0.41, mean=0.41, max=0.41, sum=0.82 (2)", - "tab": "Efficiency", - "score": 0.40991874204741585 - }, - "College Computer Science - Observed inference time (s)": { - "description": "min=0.869, mean=0.869, max=0.869, sum=1.739 (2)", - "tab": "Efficiency", - "score": 0.8693285202980041 - }, - "College Mathematics - Observed inference time (s)": { - "description": "min=0.482, mean=0.482, max=0.482, sum=0.964 (2)", - "tab": "Efficiency", - "score": 0.4821875333786011 - }, - "College Medicine - Observed inference time (s)": { - "description": "min=0.396, mean=0.396, max=0.396, sum=0.791 (2)", - "tab": "Efficiency", - "score": 0.3955839837906678 - }, - "College Physics - 
Observed inference time (s)": { - "description": "min=0.531, mean=0.531, max=0.531, sum=1.062 (2)", - "tab": "Efficiency", - "score": 0.5307925659067491 - }, - "College Chemistry - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Chemistry - # prompt tokens": { - "description": "min=552.07, mean=552.07, max=552.07, sum=1104.14 (2)", - "tab": "General information", - "score": 552.07 - }, - "College Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Biology - # eval": { - "description": "min=144, mean=144, max=144, sum=288 (2)", - "tab": "General information", - "score": 144.0 - }, - "College Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # prompt tokens": { - "description": "min=468.056, mean=468.056, max=468.056, sum=936.111 (2)", - "tab": "General information", - "score": 468.05555555555554 - }, - "College Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer Science - # prompt tokens": { - "description": "min=828.39, mean=828.39, max=828.39, sum=1656.78 (2)", - "tab": "General information", - "score": 828.39 - }, - "College Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Mathematics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # prompt tokens": { - "description": "min=594.44, mean=594.44, max=594.44, sum=1188.88 (2)", - "tab": "General information", - "score": 594.44 - }, - "College Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Medicine - # eval": { - "description": "min=173, mean=173, max=173, sum=346 (2)", - "tab": "General information", - "score": 173.0 - }, - "College Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Medicine - truncated": { - "description": "min=0, mean=0, 
max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Medicine - # prompt tokens": { - "description": "min=499.566, mean=499.566, max=499.566, sum=999.133 (2)", - "tab": "General information", - "score": 499.5664739884393 - }, - "College Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Physics - # eval": { - "description": "min=102, mean=102, max=102, sum=204 (2)", - "tab": "General information", - "score": 102.0 - }, - "College Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Physics - # prompt tokens": { - "description": "min=502.412, mean=502.412, max=502.412, sum=1004.824 (2)", - "tab": "General information", - "score": 502.4117647058824 - }, - "College Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" - } - } - }, - { - "evaluation_name": "Computer Security", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Computer Security", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.85, - "details": { - "description": "min=0.85, mean=0.85, max=0.85, sum=1.7 (2)", - "tab": "Accuracy", - "Computer Security - Observed inference time (s)": { - "description": "min=0.502, mean=0.502, max=0.502, sum=1.004 (2)", - "tab": "Efficiency", - "score": 0.5020688962936402 - }, - "Computer Security - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Computer Security - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Computer Security - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Computer Security - # prompt tokens": { - "description": "min=373.42, mean=373.42, max=373.42, sum=746.84 (2)", - "tab": "General information", - "score": 373.42 - }, - "Computer Security - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" - } - } - }, - { - "evaluation_name": "Econometrics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Econometrics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.711, - "details": { - "description": 
"min=0.711, mean=0.711, max=0.711, sum=1.421 (2)", - "tab": "Accuracy", - "Econometrics - Observed inference time (s)": { - "description": "min=0.445, mean=0.445, max=0.445, sum=0.89 (2)", - "tab": "Efficiency", - "score": 0.44516249497731525 - }, - "Econometrics - # eval": { - "description": "min=114, mean=114, max=114, sum=228 (2)", - "tab": "General information", - "score": 114.0 - }, - "Econometrics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Econometrics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Econometrics - # prompt tokens": { - "description": "min=613.228, mean=613.228, max=613.228, sum=1226.456 (2)", - "tab": "General information", - "score": 613.2280701754386 - }, - "Econometrics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" - } - } - }, - { - "evaluation_name": "Global Facts", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Global Facts", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.69, - "details": { - "description": "min=0.69, mean=0.69, max=0.69, sum=1.38 (2)", - "tab": "Accuracy", - "Global Facts - Observed inference time (s)": { - "description": "min=0.301, mean=0.301, max=0.301, sum=0.602 (2)", - "tab": "Efficiency", - "score": 0.3012181663513184 - }, - "Global Facts - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Global Facts - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Global Facts - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Global Facts - # prompt tokens": { - "description": "min=399.69, mean=399.69, max=399.69, sum=799.38 (2)", - "tab": "General information", - "score": 399.69 - }, - "Global Facts - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" - } - } - }, - { - "evaluation_name": "Jurisprudence", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Jurisprudence", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.907, - "details": { - "description": "min=0.907, mean=0.907, max=0.907, sum=1.815 (2)", - "tab": "Accuracy", - "Jurisprudence - Observed inference time (s)": { - "description": "min=0.388, mean=0.388, max=0.388, sum=0.776 (2)", - "tab": "Efficiency", - 
"score": 0.3880515495936076 - }, - "Jurisprudence - # eval": { - "description": "min=108, mean=108, max=108, sum=216 (2)", - "tab": "General information", - "score": 108.0 - }, - "Jurisprudence - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Jurisprudence - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Jurisprudence - # prompt tokens": { - "description": "min=391.231, mean=391.231, max=391.231, sum=782.463 (2)", - "tab": "General information", - "score": 391.23148148148147 - }, - "Jurisprudence - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" - } - } - }, - { - "evaluation_name": "Philosophy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Philosophy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.894, - "details": { - "description": "min=0.894, mean=0.894, max=0.894, sum=1.788 (2)", - "tab": "Accuracy", - "Philosophy - Observed inference time (s)": { - "description": "min=0.483, mean=0.483, max=0.483, sum=0.965 (2)", - "tab": "Efficiency", - "score": 0.48272855795464714 - }, - "Philosophy - # eval": { - "description": "min=311, mean=311, max=311, sum=622 (2)", - "tab": "General information", - "score": 311.0 - }, - "Philosophy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Philosophy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Philosophy - # prompt tokens": { - "description": "min=327.92, mean=327.92, max=327.92, sum=655.839 (2)", - "tab": "General information", - "score": 327.91961414790995 - }, - "Philosophy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" - } - } - }, - { - "evaluation_name": "Professional Psychology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Professional Psychology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.899, - "details": { - "description": "min=0.899, mean=0.899, max=0.899, sum=1.797 (2)", - "tab": "Accuracy", - "Professional Medicine - Observed inference time (s)": { - "description": "min=0.448, mean=0.448, max=0.448, sum=0.897 (2)", - "tab": "Efficiency", - "score": 0.4483548367724699 - }, - "Professional Accounting - Observed inference time (s)": { - "description": "min=0.419, mean=0.419, max=0.419, sum=0.839 (2)", - "tab": "Efficiency", - 
"score": 0.4192587585313946 - }, - "Professional Law - Observed inference time (s)": { - "description": "min=0.462, mean=0.462, max=0.462, sum=0.924 (2)", - "tab": "Efficiency", - "score": 0.462134175381418 - }, - "Professional Psychology - Observed inference time (s)": { - "description": "min=0.518, mean=0.518, max=0.518, sum=1.036 (2)", - "tab": "Efficiency", - "score": 0.5180651210491953 - }, - "Professional Medicine - # eval": { - "description": "min=272, mean=272, max=272, sum=544 (2)", - "tab": "General information", - "score": 272.0 - }, - "Professional Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Medicine - # prompt tokens": { - "description": "min=1071.18, mean=1071.18, max=1071.18, sum=2142.36 (2)", - "tab": "General information", - "score": 1071.1801470588234 - }, - "Professional Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Accounting - # eval": { - "description": "min=282, mean=282, max=282, sum=564 (2)", - "tab": "General information", - "score": 282.0 - }, - "Professional Accounting - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Accounting - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Accounting - # prompt tokens": { - "description": "min=657.206, mean=657.206, max=657.206, sum=1314.411 (2)", - "tab": "General information", - "score": 657.2056737588653 - }, - "Professional Accounting - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Law - # eval": { - "description": "min=1534, mean=1534, max=1534, sum=3068 (2)", - "tab": "General information", - "score": 1534.0 - }, - "Professional Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # prompt tokens": { - "description": "min=1629.344, mean=1629.344, max=1629.344, sum=3258.687 (2)", - "tab": "General information", - "score": 1629.3435462842242 - }, - "Professional Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Psychology - # eval": { - "description": "min=612, mean=612, max=612, sum=1224 (2)", - "tab": "General information", - "score": 612.0 - }, - "Professional Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # prompt tokens": { - "description": "min=574.518, mean=574.518, max=574.518, sum=1149.036 (2)", - "tab": "General information", - "score": 574.5179738562091 - }, - "Professional Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - 
"generation_config": { - "additional_details": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" - } - } - }, - { - "evaluation_name": "Us Foreign Policy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Us Foreign Policy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.95, - "details": { - "description": "min=0.95, mean=0.95, max=0.95, sum=1.9 (2)", - "tab": "Accuracy", - "Us Foreign Policy - Observed inference time (s)": { - "description": "min=0.512, mean=0.512, max=0.512, sum=1.025 (2)", - "tab": "Efficiency", - "score": 0.5122887134552002 - }, - "Us Foreign Policy - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Us Foreign Policy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Us Foreign Policy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Us Foreign Policy - # prompt tokens": { - "description": "min=421.71, mean=421.71, max=421.71, sum=843.42 (2)", - "tab": "General information", - "score": 421.71 - }, - "Us Foreign Policy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" - } - } - }, - { - "evaluation_name": "Astronomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Astronomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.947, - "details": { - "description": "min=0.947, mean=0.947, max=0.947, sum=1.895 (2)", - "tab": "Accuracy", - "Astronomy - Observed inference time (s)": { - "description": "min=0.435, mean=0.435, max=0.435, sum=0.869 (2)", - "tab": "Efficiency", - "score": 0.4347311226945174 - }, - "Astronomy - # eval": { - "description": "min=152, mean=152, max=152, sum=304 (2)", - "tab": "General information", - "score": 152.0 - }, - "Astronomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Astronomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Astronomy - # prompt tokens": { - "description": "min=577.349, mean=577.349, max=577.349, sum=1154.697 (2)", - "tab": "General information", - "score": 577.3486842105264 - }, - "Astronomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" - } - } - }, 
- { - "evaluation_name": "Business Ethics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Business Ethics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.89, - "details": { - "description": "min=0.89, mean=0.89, max=0.89, sum=1.78 (2)", - "tab": "Accuracy", - "Business Ethics - Observed inference time (s)": { - "description": "min=0.52, mean=0.52, max=0.52, sum=1.04 (2)", - "tab": "Efficiency", - "score": 0.5199928903579711 - }, - "Business Ethics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Business Ethics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Business Ethics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Business Ethics - # prompt tokens": { - "description": "min=565.7, mean=565.7, max=565.7, sum=1131.4 (2)", - "tab": "General information", - "score": 565.7 - }, - "Business Ethics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" - } - } - }, - { - "evaluation_name": "Clinical Knowledge", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Clinical Knowledge", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.894, - "details": { - "description": "min=0.894, mean=0.894, max=0.894, sum=1.789 (2)", - "tab": "Accuracy", - "Clinical Knowledge - Observed inference time (s)": { - "description": "min=0.307, mean=0.307, max=0.307, sum=0.613 (2)", - "tab": "Efficiency", - "score": 0.3066561905842907 - }, - "Clinical Knowledge - # eval": { - "description": "min=265, mean=265, max=265, sum=530 (2)", - "tab": "General information", - "score": 265.0 - }, - "Clinical Knowledge - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Clinical Knowledge - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Clinical Knowledge - # prompt tokens": { - "description": "min=400.985, mean=400.985, max=400.985, sum=801.97 (2)", - "tab": "General information", - "score": 400.98490566037736 - }, - "Clinical Knowledge - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" - } - } - }, - { - "evaluation_name": "Conceptual Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Conceptual Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.923, - "details": { - "description": "min=0.923, mean=0.923, max=0.923, sum=1.847 (2)", - "tab": "Accuracy", - "Conceptual Physics - Observed inference time (s)": { - "description": "min=0.381, mean=0.381, max=0.381, sum=0.763 (2)", - "tab": "Efficiency", - "score": 0.3812521427235705 - }, - "Conceptual Physics - # eval": { - "description": "min=235, mean=235, max=235, sum=470 (2)", - "tab": "General information", - "score": 235.0 - }, - "Conceptual Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Conceptual Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Conceptual Physics - # prompt tokens": { - "description": "min=304.677, mean=304.677, max=304.677, sum=609.353 (2)", - "tab": "General information", - "score": 304.67659574468087 - }, - "Conceptual Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" - } - } - }, - { - "evaluation_name": "Electrical Engineering", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Electrical Engineering", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.793, - "details": { - "description": "min=0.793, mean=0.793, max=0.793, sum=1.586 (2)", - "tab": "Accuracy", - "Electrical Engineering - Observed inference time (s)": { - "description": "min=0.437, mean=0.437, max=0.437, sum=0.874 (2)", - "tab": "Efficiency", - "score": 0.4368692447399271 - }, - "Electrical Engineering - # eval": { - "description": "min=145, mean=145, max=145, sum=290 (2)", - "tab": "General information", - "score": 145.0 - }, - "Electrical Engineering - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Electrical Engineering - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Electrical Engineering - # prompt tokens": { - "description": "min=439.228, mean=439.228, max=439.228, sum=878.455 (2)", - "tab": "General information", - "score": 439.22758620689655 - }, - "Electrical Engineering - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" - } - } - }, - { - "evaluation_name": "Elementary Mathematics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Elementary Mathematics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.775, - "details": { - "description": "min=0.775, mean=0.775, max=0.775, sum=1.55 (2)", - "tab": "Accuracy", - "Elementary Mathematics - Observed inference time (s)": { - "description": "min=0.374, mean=0.374, max=0.374, sum=0.747 (2)", - "tab": "Efficiency", - "score": 0.37356801449306426 - }, - "Elementary Mathematics - # eval": { - "description": "min=378, mean=378, max=378, sum=756 (2)", - "tab": "General information", - "score": 378.0 - }, - "Elementary Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Elementary Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Elementary Mathematics - # prompt tokens": { - "description": "min=532.683, mean=532.683, max=532.683, sum=1065.365 (2)", - "tab": "General information", - "score": 532.6825396825396 - }, - "Elementary Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" - } - } - }, - { - "evaluation_name": "Formal Logic", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Formal Logic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.675, - "details": { - "description": "min=0.675, mean=0.675, max=0.675, sum=1.349 (2)", - "tab": "Accuracy", - "Formal Logic - Observed inference time (s)": { - "description": "min=0.341, mean=0.341, max=0.341, sum=0.683 (2)", - "tab": "Efficiency", - "score": 0.3414205180274116 - }, - "Formal Logic - # eval": { - "description": "min=126, mean=126, max=126, sum=252 (2)", - "tab": "General information", - "score": 126.0 - }, - "Formal Logic - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Formal Logic - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Formal Logic - # prompt tokens": { - "description": "min=604.492, mean=604.492, max=604.492, sum=1208.984 (2)", - "tab": "General information", - "score": 604.4920634920635 - }, - "Formal Logic - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" - } - } - }, - { - "evaluation_name": "High School World History", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on High School World History", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.941, - "details": { - "description": "min=0.941, mean=0.941, max=0.941, sum=1.882 (2)", - "tab": "Accuracy", - "High School Biology - Observed inference time (s)": { - "description": "min=0.511, mean=0.511, max=0.511, sum=1.021 (2)", - "tab": "Efficiency", - "score": 0.5105965960410334 - }, - "High School Chemistry - Observed inference time (s)": { - "description": "min=0.338, mean=0.338, max=0.338, sum=0.676 (2)", - "tab": "Efficiency", - "score": 0.3379564614131533 - }, - "High School Computer Science - Observed inference time (s)": { - "description": "min=0.397, mean=0.397, max=0.397, sum=0.794 (2)", - "tab": "Efficiency", - "score": 0.3969814705848694 - }, - "High School European History - Observed inference time (s)": { - "description": "min=0.594, mean=0.594, max=0.594, sum=1.189 (2)", - "tab": "Efficiency", - "score": 0.5944608587207216 - }, - "High School Geography - Observed inference time (s)": { - "description": "min=0.353, mean=0.353, max=0.353, sum=0.706 (2)", - "tab": "Efficiency", - "score": 0.3532402262543187 - }, - "High School Government And Politics - Observed inference time (s)": { - "description": "min=0.88, mean=0.88, max=0.88, sum=1.76 (2)", - "tab": "Efficiency", - "score": 0.8798744147305662 - }, - "High School Macroeconomics - Observed inference time (s)": { - "description": "min=0.501, mean=0.501, max=0.501, sum=1.003 (2)", - "tab": "Efficiency", - "score": 0.501340057911017 - }, - "High School Mathematics - Observed inference time (s)": { - "description": "min=0.472, mean=0.472, max=0.472, sum=0.944 (2)", - "tab": "Efficiency", - "score": 0.4721549925980745 - }, - "High School Microeconomics - Observed inference time (s)": { - "description": "min=0.406, mean=0.406, max=0.406, sum=0.812 (2)", - "tab": "Efficiency", - "score": 0.4058714473948759 - }, - "High School Physics - Observed inference time (s)": { - "description": "min=0.484, mean=0.484, max=0.484, sum=0.968 (2)", - "tab": "Efficiency", - "score": 0.48384577075377205 - }, - "High School Psychology - Observed inference time (s)": { - "description": "min=0.532, mean=0.532, max=0.532, sum=1.063 (2)", - "tab": "Efficiency", - "score": 0.5316181160988064 - }, - "High School Statistics - Observed inference time (s)": { - "description": "min=0.518, mean=0.518, max=0.518, sum=1.036 (2)", - "tab": "Efficiency", - "score": 0.5179998201352579 - }, - "High School US History - Observed inference time (s)": { - "description": "min=0.573, mean=0.573, max=0.573, sum=1.147 (2)", - "tab": "Efficiency", - "score": 0.5734734535217285 - }, - "High School World History - Observed inference time (s)": { - "description": "min=0.461, mean=0.461, max=0.461, sum=0.923 (2)", - "tab": "Efficiency", - "score": 0.4614185592796229 - }, - "High School Biology - # eval": { - "description": "min=310, mean=310, max=310, sum=620 (2)", - "tab": "General information", - "score": 310.0 - }, - "High School Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Biology - # prompt 
tokens": { - "description": "min=504.874, mean=504.874, max=504.874, sum=1009.748 (2)", - "tab": "General information", - "score": 504.8741935483871 - }, - "High School Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Chemistry - # eval": { - "description": "min=203, mean=203, max=203, sum=406 (2)", - "tab": "General information", - "score": 203.0 - }, - "High School Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # prompt tokens": { - "description": "min=495.34, mean=495.34, max=495.34, sum=990.68 (2)", - "tab": "General information", - "score": 495.3399014778325 - }, - "High School Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "High School Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # prompt tokens": { - "description": "min=865.8, mean=865.8, max=865.8, sum=1731.6 (2)", - "tab": "General information", - "score": 865.8 - }, - "High School Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School European History - # eval": { - "description": "min=165, mean=165, max=165, sum=330 (2)", - "tab": "General information", - "score": 165.0 - }, - "High School European History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School European History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School European History - # prompt tokens": { - "description": "min=2793.83, mean=2793.83, max=2793.83, sum=5587.661 (2)", - "tab": "General information", - "score": 2793.830303030303 - }, - "High School European History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Geography - # eval": { - "description": "min=198, mean=198, max=198, sum=396 (2)", - "tab": "General information", - "score": 198.0 - }, - "High School Geography - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Geography - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Geography - # prompt tokens": { - "description": "min=372.783, mean=372.783, max=372.783, sum=745.566 (2)", - "tab": "General information", - "score": 372.7828282828283 - }, - "High School Geography - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Government And Politics - # eval": { - "description": "min=193, 
mean=193, max=193, sum=386 (2)", - "tab": "General information", - "score": 193.0 - }, - "High School Government And Politics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Government And Politics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # prompt tokens": { - "description": "min=463.01, mean=463.01, max=463.01, sum=926.021 (2)", - "tab": "General information", - "score": 463.0103626943005 - }, - "High School Government And Politics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Macroeconomics - # eval": { - "description": "min=390, mean=390, max=390, sum=780 (2)", - "tab": "General information", - "score": 390.0 - }, - "High School Macroeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Macroeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - # prompt tokens": { - "description": "min=371.451, mean=371.451, max=371.451, sum=742.903 (2)", - "tab": "General information", - "score": 371.4512820512821 - }, - "High School Macroeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Mathematics - # eval": { - "description": "min=270, mean=270, max=270, sum=540 (2)", - "tab": "General information", - "score": 270.0 - }, - "High School Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # prompt tokens": { - "description": "min=532.456, mean=532.456, max=532.456, sum=1064.911 (2)", - "tab": "General information", - "score": 532.4555555555555 - }, - "High School Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Microeconomics - # eval": { - "description": "min=238, mean=238, max=238, sum=476 (2)", - "tab": "General information", - "score": 238.0 - }, - "High School Microeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Microeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Microeconomics - # prompt tokens": { - "description": "min=398.739, mean=398.739, max=398.739, sum=797.479 (2)", - "tab": "General information", - "score": 398.73949579831935 - }, - "High School Microeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Physics - # eval": { - "description": "min=151, mean=151, max=151, sum=302 (2)", - "tab": "General information", - "score": 151.0 - }, - "High School Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 
(2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # prompt tokens": { - "description": "min=560.238, mean=560.238, max=560.238, sum=1120.477 (2)", - "tab": "General information", - "score": 560.2384105960265 - }, - "High School Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Psychology - # eval": { - "description": "min=545, mean=545, max=545, sum=1090 (2)", - "tab": "General information", - "score": 545.0 - }, - "High School Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # prompt tokens": { - "description": "min=492.917, mean=492.917, max=492.917, sum=985.835 (2)", - "tab": "General information", - "score": 492.91743119266056 - }, - "High School Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Statistics - # eval": { - "description": "min=216, mean=216, max=216, sum=432 (2)", - "tab": "General information", - "score": 216.0 - }, - "High School Statistics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Statistics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Statistics - # prompt tokens": { - "description": "min=787.574, mean=787.574, max=787.574, sum=1575.148 (2)", - "tab": "General information", - "score": 787.574074074074 - }, - "High School Statistics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School US History - # eval": { - "description": "min=204, mean=204, max=204, sum=408 (2)", - "tab": "General information", - "score": 204.0 - }, - "High School US History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School US History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # prompt tokens": { - "description": "min=2220.005, mean=2220.005, max=2220.005, sum=4440.01 (2)", - "tab": "General information", - "score": 2220.0049019607845 - }, - "High School US History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School World History - # eval": { - "description": "min=237, mean=237, max=237, sum=474 (2)", - "tab": "General information", - "score": 237.0 - }, - "High School World History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School World History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School World History - # prompt tokens": { - "description": "min=1424.439, mean=1424.439, max=1424.439, sum=2848.878 (2)", - "tab": "General information", - "score": 1424.4388185654009 - }, - "High School World History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - 
"score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" - } - } - }, - { - "evaluation_name": "Human Sexuality", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Human Sexuality", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.901, - "details": { - "description": "min=0.901, mean=0.901, max=0.901, sum=1.802 (2)", - "tab": "Accuracy", - "Human Aging - Observed inference time (s)": { - "description": "min=0.403, mean=0.403, max=0.403, sum=0.807 (2)", - "tab": "Efficiency", - "score": 0.4033327327180871 - }, - "Human Sexuality - Observed inference time (s)": { - "description": "min=0.397, mean=0.397, max=0.397, sum=0.794 (2)", - "tab": "Efficiency", - "score": 0.3971163625935562 - }, - "Human Aging - # eval": { - "description": "min=223, mean=223, max=223, sum=446 (2)", - "tab": "General information", - "score": 223.0 - }, - "Human Aging - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Aging - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Aging - # prompt tokens": { - "description": "min=316.453, mean=316.453, max=316.453, sum=632.906 (2)", - "tab": "General information", - "score": 316.4529147982063 - }, - "Human Aging - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Human Sexuality - # eval": { - "description": "min=131, mean=131, max=131, sum=262 (2)", - "tab": "General information", - "score": 131.0 - }, - "Human Sexuality - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Sexuality - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # prompt tokens": { - "description": "min=335.695, mean=335.695, max=335.695, sum=671.389 (2)", - "tab": "General information", - "score": 335.69465648854964 - }, - "Human Sexuality - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" - } - } - }, - { - "evaluation_name": "International Law", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on International Law", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.942, - "details": { - "description": "min=0.942, mean=0.942, max=0.942, sum=1.884 (2)", - "tab": "Accuracy", - "International Law - Observed inference time (s)": { - "description": "min=0.437, mean=0.437, max=0.437, sum=0.875 (2)", - 
"tab": "Efficiency", - "score": 0.4373398063596615 - }, - "International Law - # eval": { - "description": "min=121, mean=121, max=121, sum=242 (2)", - "tab": "General information", - "score": 121.0 - }, - "International Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "International Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "International Law - # prompt tokens": { - "description": "min=639.504, mean=639.504, max=639.504, sum=1279.008 (2)", - "tab": "General information", - "score": 639.5041322314049 - }, - "International Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" - } - } - }, - { - "evaluation_name": "Logical Fallacies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Logical Fallacies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.902, - "details": { - "description": "min=0.902, mean=0.902, max=0.902, sum=1.804 (2)", - "tab": "Accuracy", - "Logical Fallacies - Observed inference time (s)": { - "description": "min=0.445, mean=0.445, max=0.445, sum=0.89 (2)", - "tab": "Efficiency", - "score": 0.44485992888000114 - }, - "Logical Fallacies - # eval": { - "description": "min=163, mean=163, max=163, sum=326 (2)", - "tab": "General information", - "score": 163.0 - }, - "Logical Fallacies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Logical Fallacies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Logical Fallacies - # prompt tokens": { - "description": "min=445.84, mean=445.84, max=445.84, sum=891.681 (2)", - "tab": "General information", - "score": 445.840490797546 - }, - "Logical Fallacies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" - } - } - }, - { - "evaluation_name": "Machine Learning", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Machine Learning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.777, - "details": { - "description": "min=0.777, mean=0.777, max=0.777, sum=1.554 (2)", - "tab": "Accuracy", - "Machine Learning - Observed inference time (s)": { - "description": "min=0.414, mean=0.414, max=0.414, sum=0.829 (2)", - "tab": "Efficiency", - "score": 0.41432228897299084 - }, - "Machine Learning - # eval": { - "description": 
"min=112, mean=112, max=112, sum=224 (2)", - "tab": "General information", - "score": 112.0 - }, - "Machine Learning - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Machine Learning - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Machine Learning - # prompt tokens": { - "description": "min=666.205, mean=666.205, max=666.205, sum=1332.411 (2)", - "tab": "General information", - "score": 666.2053571428571 - }, - "Machine Learning - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" - } - } - }, - { - "evaluation_name": "Management", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Management", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.913, - "details": { - "description": "min=0.913, mean=0.913, max=0.913, sum=1.825 (2)", - "tab": "Accuracy", - "Management - Observed inference time (s)": { - "description": "min=0.46, mean=0.46, max=0.46, sum=0.92 (2)", - "tab": "Efficiency", - "score": 0.4598746878429524 - }, - "Management - # eval": { - "description": "min=103, mean=103, max=103, sum=206 (2)", - "tab": "General information", - "score": 103.0 - }, - "Management - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Management - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Management - # prompt tokens": { - "description": "min=279.485, mean=279.485, max=279.485, sum=558.971 (2)", - "tab": "General information", - "score": 279.4854368932039 - }, - "Management - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" - } - } - }, - { - "evaluation_name": "Marketing", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Marketing", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.94, - "details": { - "description": "min=0.94, mean=0.94, max=0.94, sum=1.88 (2)", - "tab": "Accuracy", - "Marketing - Observed inference time (s)": { - "description": "min=0.481, mean=0.481, max=0.481, sum=0.962 (2)", - "tab": "Efficiency", - "score": 0.4812224573559231 - }, - "Marketing - # eval": { - "description": "min=234, mean=234, max=234, sum=468 (2)", - "tab": "General information", - "score": 234.0 - }, - "Marketing - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 
5.0 - }, - "Marketing - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Marketing - # prompt tokens": { - "description": "min=399.85, mean=399.85, max=399.85, sum=799.701 (2)", - "tab": "General information", - "score": 399.85042735042737 - }, - "Marketing - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" - } - } - }, - { - "evaluation_name": "Medical Genetics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Medical Genetics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.98, - "details": { - "description": "min=0.98, mean=0.98, max=0.98, sum=1.96 (2)", - "tab": "Accuracy", - "Medical Genetics - Observed inference time (s)": { - "description": "min=0.425, mean=0.425, max=0.425, sum=0.85 (2)", - "tab": "Efficiency", - "score": 0.42490904808044433 - }, - "Medical Genetics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Medical Genetics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Medical Genetics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Medical Genetics - # prompt tokens": { - "description": "min=343.23, mean=343.23, max=343.23, sum=686.46 (2)", - "tab": "General information", - "score": 343.23 - }, - "Medical Genetics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" - } - } - }, - { - "evaluation_name": "Miscellaneous", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Miscellaneous", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.958, - "details": { - "description": "min=0.958, mean=0.958, max=0.958, sum=1.916 (2)", - "tab": "Accuracy", - "Miscellaneous - Observed inference time (s)": { - "description": "min=0.457, mean=0.457, max=0.457, sum=0.915 (2)", - "tab": "Efficiency", - "score": 0.457414278734385 - }, - "Miscellaneous - # eval": { - "description": "min=783, mean=783, max=783, sum=1566 (2)", - "tab": "General information", - "score": 783.0 - }, - "Miscellaneous - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Miscellaneous - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Miscellaneous - # prompt tokens": { - 
"description": "min=296.479, mean=296.479, max=296.479, sum=592.958 (2)", - "tab": "General information", - "score": 296.47892720306515 - }, - "Miscellaneous - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" - } - } - }, - { - "evaluation_name": "Moral Scenarios", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Moral Scenarios", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.802, - "details": { - "description": "min=0.802, mean=0.802, max=0.802, sum=1.604 (2)", - "tab": "Accuracy", - "Moral Disputes - Observed inference time (s)": { - "description": "min=0.364, mean=0.364, max=0.364, sum=0.727 (2)", - "tab": "Efficiency", - "score": 0.3637407087866282 - }, - "Moral Scenarios - Observed inference time (s)": { - "description": "min=0.462, mean=0.462, max=0.462, sum=0.924 (2)", - "tab": "Efficiency", - "score": 0.46217673823820143 - }, - "Moral Disputes - # eval": { - "description": "min=346, mean=346, max=346, sum=692 (2)", - "tab": "General information", - "score": 346.0 - }, - "Moral Disputes - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Disputes - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Disputes - # prompt tokens": { - "description": "min=474.835, mean=474.835, max=474.835, sum=949.671 (2)", - "tab": "General information", - "score": 474.83526011560696 - }, - "Moral Disputes - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Moral Scenarios - # eval": { - "description": "min=895, mean=895, max=895, sum=1790 (2)", - "tab": "General information", - "score": 895.0 - }, - "Moral Scenarios - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Scenarios - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # prompt tokens": { - "description": "min=655.068, mean=655.068, max=655.068, sum=1310.136 (2)", - "tab": "General information", - "score": 655.068156424581 - }, - "Moral Scenarios - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" - } - } - }, - { - "evaluation_name": "Nutrition", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Nutrition", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.905, - "details": { - "description": "min=0.905, mean=0.905, max=0.905, sum=1.81 (2)", - "tab": "Accuracy", - "Nutrition - Observed inference time (s)": { - "description": "min=0.423, mean=0.423, max=0.423, sum=0.847 (2)", - "tab": "Efficiency", - "score": 0.42327408541261763 - }, - "Nutrition - # eval": { - "description": "min=306, mean=306, max=306, sum=612 (2)", - "tab": "General information", - "score": 306.0 - }, - "Nutrition - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Nutrition - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Nutrition - # prompt tokens": { - "description": "min=581.997, mean=581.997, max=581.997, sum=1163.993 (2)", - "tab": "General information", - "score": 581.9967320261438 - }, - "Nutrition - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" - } - } - }, - { - "evaluation_name": "Prehistory", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Prehistory", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.935, - "details": { - "description": "min=0.935, mean=0.935, max=0.935, sum=1.87 (2)", - "tab": "Accuracy", - "Prehistory - Observed inference time (s)": { - "description": "min=0.486, mean=0.486, max=0.486, sum=0.972 (2)", - "tab": "Efficiency", - "score": 0.48604018452726766 - }, - "Prehistory - # eval": { - "description": "min=324, mean=324, max=324, sum=648 (2)", - "tab": "General information", - "score": 324.0 - }, - "Prehistory - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Prehistory - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Prehistory - # prompt tokens": { - "description": "min=513.944, mean=513.944, max=513.944, sum=1027.889 (2)", - "tab": "General information", - "score": 513.9444444444445 - }, - "Prehistory - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" - } - } - }, - { - "evaluation_name": "Public Relations", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Public Relations", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.782, - "details": { - "description": "min=0.782, mean=0.782, max=0.782, sum=1.564 (2)", - "tab": "Accuracy", - "Public Relations - Observed inference time (s)": { - "description": "min=0.472, mean=0.472, max=0.472, sum=0.944 
(2)", - "tab": "Efficiency", - "score": 0.47211467786268757 - }, - "Public Relations - # eval": { - "description": "min=110, mean=110, max=110, sum=220 (2)", - "tab": "General information", - "score": 110.0 - }, - "Public Relations - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Public Relations - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Public Relations - # prompt tokens": { - "description": "min=402.918, mean=402.918, max=402.918, sum=805.836 (2)", - "tab": "General information", - "score": 402.91818181818184 - }, - "Public Relations - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" - } - } - }, - { - "evaluation_name": "Security Studies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Security Studies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.833, - "details": { - "description": "min=0.833, mean=0.833, max=0.833, sum=1.665 (2)", - "tab": "Accuracy", - "Security Studies - Observed inference time (s)": { - "description": "min=0.452, mean=0.452, max=0.452, sum=0.905 (2)", - "tab": "Efficiency", - "score": 0.45247335336646255 - }, - "Security Studies - # eval": { - "description": "min=245, mean=245, max=245, sum=490 (2)", - "tab": "General information", - "score": 245.0 - }, - "Security Studies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Security Studies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Security Studies - # prompt tokens": { - "description": "min=1166.686, mean=1166.686, max=1166.686, sum=2333.371 (2)", - "tab": "General information", - "score": 1166.6857142857143 - }, - "Security Studies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" - } - } - }, - { - "evaluation_name": "Sociology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Sociology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.945, - "details": { - "description": "min=0.945, mean=0.945, max=0.945, sum=1.891 (2)", - "tab": "Accuracy", - "Sociology - Observed inference time (s)": { - "description": "min=0.479, mean=0.479, max=0.479, sum=0.958 (2)", - "tab": "Efficiency", - "score": 0.4788183940583794 - }, - "Sociology - # eval": { - "description": "min=201, mean=201, max=201, 
sum=402 (2)", - "tab": "General information", - "score": 201.0 - }, - "Sociology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Sociology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Sociology - # prompt tokens": { - "description": "min=444.269, mean=444.269, max=444.269, sum=888.537 (2)", - "tab": "General information", - "score": 444.2686567164179 - }, - "Sociology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" - } - } - }, - { - "evaluation_name": "Virology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Virology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.578, - "details": { - "description": "min=0.578, mean=0.578, max=0.578, sum=1.157 (2)", - "tab": "Accuracy", - "Virology - Observed inference time (s)": { - "description": "min=0.473, mean=0.473, max=0.473, sum=0.945 (2)", - "tab": "Efficiency", - "score": 0.47254319794206734 - }, - "Virology - # eval": { - "description": "min=166, mean=166, max=166, sum=332 (2)", - "tab": "General information", - "score": 166.0 - }, - "Virology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Virology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Virology - # prompt tokens": { - "description": "min=334.434, mean=334.434, max=334.434, sum=668.867 (2)", - "tab": "General information", - "score": 334.43373493975906 - }, - "Virology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" - } - } - }, - { - "evaluation_name": "World Religions", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on World Religions", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.883, - "details": { - "description": "min=0.883, mean=0.883, max=0.883, sum=1.766 (2)", - "tab": "Accuracy", - "World Religions - Observed inference time (s)": { - "description": "min=0.408, mean=0.408, max=0.408, sum=0.815 (2)", - "tab": "Efficiency", - "score": 0.4075693944741411 - }, - "World Religions - # eval": { - "description": "min=171, mean=171, max=171, sum=342 (2)", - "tab": "General information", - "score": 171.0 - }, - "World Religions - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "World Religions - truncated": { - 
"description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "World Religions - # prompt tokens": { - "description": "min=267.936, mean=267.936, max=267.936, sum=535.871 (2)", - "tab": "General information", - "score": 267.9356725146199 - }, - "World Religions - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" - } - } - }, - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.52, - "details": { - "tab": "Efficiency" - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_mmlu/openai/gpt-4o-mini-2024-07-18/4f043e7b-dfb5-4de5-a034-c4b0a335a8b3.json b/data/helm_mmlu/openai/gpt-4o-mini-2024-07-18/4f043e7b-dfb5-4de5-a034-c4b0a335a8b3.json deleted file mode 100644 index 7753003a8..000000000 --- a/data/helm_mmlu/openai/gpt-4o-mini-2024-07-18/4f043e7b-dfb5-4de5-a034-c4b0a335a8b3.json +++ /dev/null @@ -1,3021 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/openai_gpt-4o-mini-2024-07-18/1770835937.459157", - "retrieved_timestamp": "1770835937.459157", - "source_metadata": { - "source_name": "helm_mmlu", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "GPT-4o mini 2024-07-18", - "id": "openai/gpt-4o-mini-2024-07-18", - "developer": "openai", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "MMLU All Subjects", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU All Subjects", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.767, - "details": { - "description": "min=0.419, mean=0.767, max=0.959, sum=87.464 (114)", - "tab": "Accuracy", - "MMLU All Subjects - Observed inference time (s)": { - "description": "min=0.26, mean=0.334, max=0.733, sum=38.043 (114)", - "tab": "Efficiency", - "score": 0.3337143530055209 - }, - "MMLU All Subjects - # eval": { - "description": "min=100, mean=246.351, max=1534, sum=28084 (114)", - "tab": "General information", - "score": 246.35087719298247 - }, - "MMLU All Subjects - # train": { - "description": "min=5, mean=5, max=5, sum=570 (114)", - "tab": "General information", - "score": 5.0 - }, - "MMLU All Subjects - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (114)", - "tab": "General information", - "score": 0.0 - }, - "MMLU All Subjects - # prompt tokens": { - "description": "min=267.936, mean=612.332, max=2793.83, sum=69805.818 (114)", - "tab": "General information", - "score": 
612.3317391408493 - }, - "MMLU All Subjects - # output tokens": { - "description": "min=1, mean=1, max=1, sum=114 (114)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] - } - } - }, - { - "evaluation_name": "Abstract Algebra", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - 
"evaluation_description": "EM on Abstract Algebra", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.42, - "details": { - "description": "min=0.42, mean=0.42, max=0.42, sum=0.84 (2)", - "tab": "Accuracy", - "Abstract Algebra - Observed inference time (s)": { - "description": "min=0.292, mean=0.292, max=0.292, sum=0.584 (2)", - "tab": "Efficiency", - "score": 0.29186195611953736 - }, - "Abstract Algebra - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Abstract Algebra - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Abstract Algebra - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Abstract Algebra - # prompt tokens": { - "description": "min=374.53, mean=374.53, max=374.53, sum=749.06 (2)", - "tab": "General information", - "score": 374.53 - }, - "Abstract Algebra - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" - } - } - }, - { - "evaluation_name": "Anatomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Anatomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.77, - "details": { - "description": "min=0.77, mean=0.77, max=0.77, sum=1.541 (2)", - "tab": "Accuracy", - "Anatomy - Observed inference time (s)": { - "description": "min=0.282, mean=0.282, max=0.282, sum=0.564 (2)", - "tab": "Efficiency", - "score": 0.282137664159139 - }, - "Anatomy - # eval": { - "description": "min=135, mean=135, max=135, sum=270 (2)", - "tab": "General information", - "score": 135.0 - }, - "Anatomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Anatomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Anatomy - # prompt tokens": { - "description": "min=350.6, mean=350.6, max=350.6, sum=701.2 (2)", - "tab": "General information", - "score": 350.6 - }, - "Anatomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" - } - } - }, - { - "evaluation_name": "College Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on College Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.559, - "details": { - "description": "min=0.559, mean=0.559, max=0.559, 
sum=1.118 (2)", - "tab": "Accuracy", - "College Chemistry - Observed inference time (s)": { - "description": "min=0.309, mean=0.309, max=0.309, sum=0.618 (2)", - "tab": "Efficiency", - "score": 0.30902551651000976 - }, - "College Biology - Observed inference time (s)": { - "description": "min=0.315, mean=0.315, max=0.315, sum=0.63 (2)", - "tab": "Efficiency", - "score": 0.31521839068995583 - }, - "College Computer Science - Observed inference time (s)": { - "description": "min=0.321, mean=0.321, max=0.321, sum=0.641 (2)", - "tab": "Efficiency", - "score": 0.3206118988990784 - }, - "College Mathematics - Observed inference time (s)": { - "description": "min=0.31, mean=0.31, max=0.31, sum=0.621 (2)", - "tab": "Efficiency", - "score": 0.31047542572021486 - }, - "College Medicine - Observed inference time (s)": { - "description": "min=0.313, mean=0.313, max=0.313, sum=0.625 (2)", - "tab": "Efficiency", - "score": 0.31259707081524624 - }, - "College Physics - Observed inference time (s)": { - "description": "min=0.287, mean=0.287, max=0.287, sum=0.573 (2)", - "tab": "Efficiency", - "score": 0.2866650983399036 - }, - "College Chemistry - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Chemistry - # prompt tokens": { - "description": "min=552.07, mean=552.07, max=552.07, sum=1104.14 (2)", - "tab": "General information", - "score": 552.07 - }, - "College Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Biology - # eval": { - "description": "min=144, mean=144, max=144, sum=288 (2)", - "tab": "General information", - "score": 144.0 - }, - "College Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # prompt tokens": { - "description": "min=468.056, mean=468.056, max=468.056, sum=936.111 (2)", - "tab": "General information", - "score": 468.05555555555554 - }, - "College Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer Science - # prompt tokens": { - "description": "min=828.39, mean=828.39, max=828.39, sum=1656.78 (2)", - "tab": "General information", - "score": 828.39 - }, - "College Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Mathematics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - 
}, - "College Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # prompt tokens": { - "description": "min=594.44, mean=594.44, max=594.44, sum=1188.88 (2)", - "tab": "General information", - "score": 594.44 - }, - "College Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Medicine - # eval": { - "description": "min=173, mean=173, max=173, sum=346 (2)", - "tab": "General information", - "score": 173.0 - }, - "College Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Medicine - # prompt tokens": { - "description": "min=499.566, mean=499.566, max=499.566, sum=999.133 (2)", - "tab": "General information", - "score": 499.5664739884393 - }, - "College Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Physics - # eval": { - "description": "min=102, mean=102, max=102, sum=204 (2)", - "tab": "General information", - "score": 102.0 - }, - "College Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Physics - # prompt tokens": { - "description": "min=502.412, mean=502.412, max=502.412, sum=1004.824 (2)", - "tab": "General information", - "score": 502.4117647058824 - }, - "College Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" - } - } - }, - { - "evaluation_name": "Computer Security", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Computer Security", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.85, - "details": { - "description": "min=0.85, mean=0.85, max=0.85, sum=1.7 (2)", - "tab": "Accuracy", - "Computer Security - Observed inference time (s)": { - "description": "min=0.297, mean=0.297, max=0.297, sum=0.594 (2)", - "tab": "Efficiency", - "score": 0.29681269884109496 - }, - "Computer Security - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Computer Security - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Computer Security - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Computer Security - # prompt tokens": { - 
"description": "min=373.42, mean=373.42, max=373.42, sum=746.84 (2)", - "tab": "General information", - "score": 373.42 - }, - "Computer Security - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" - } - } - }, - { - "evaluation_name": "Econometrics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Econometrics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.649, - "details": { - "description": "min=0.649, mean=0.649, max=0.649, sum=1.298 (2)", - "tab": "Accuracy", - "Econometrics - Observed inference time (s)": { - "description": "min=0.299, mean=0.299, max=0.299, sum=0.599 (2)", - "tab": "Efficiency", - "score": 0.29936775199153964 - }, - "Econometrics - # eval": { - "description": "min=114, mean=114, max=114, sum=228 (2)", - "tab": "General information", - "score": 114.0 - }, - "Econometrics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Econometrics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Econometrics - # prompt tokens": { - "description": "min=613.228, mean=613.228, max=613.228, sum=1226.456 (2)", - "tab": "General information", - "score": 613.2280701754386 - }, - "Econometrics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" - } - } - }, - { - "evaluation_name": "Global Facts", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Global Facts", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.45, - "details": { - "description": "min=0.45, mean=0.45, max=0.45, sum=0.9 (2)", - "tab": "Accuracy", - "Global Facts - Observed inference time (s)": { - "description": "min=0.27, mean=0.27, max=0.27, sum=0.539 (2)", - "tab": "Efficiency", - "score": 0.269585702419281 - }, - "Global Facts - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Global Facts - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Global Facts - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Global Facts - # prompt tokens": { - "description": "min=399.69, mean=399.69, max=399.69, sum=799.38 (2)", - "tab": "General information", - "score": 399.69 - }, - "Global Facts - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", 
- "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" - } - } - }, - { - "evaluation_name": "Jurisprudence", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Jurisprudence", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.87, - "details": { - "description": "min=0.87, mean=0.87, max=0.87, sum=1.741 (2)", - "tab": "Accuracy", - "Jurisprudence - Observed inference time (s)": { - "description": "min=0.305, mean=0.305, max=0.305, sum=0.61 (2)", - "tab": "Efficiency", - "score": 0.3047747744454278 - }, - "Jurisprudence - # eval": { - "description": "min=108, mean=108, max=108, sum=216 (2)", - "tab": "General information", - "score": 108.0 - }, - "Jurisprudence - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Jurisprudence - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Jurisprudence - # prompt tokens": { - "description": "min=391.231, mean=391.231, max=391.231, sum=782.463 (2)", - "tab": "General information", - "score": 391.23148148148147 - }, - "Jurisprudence - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" - } - } - }, - { - "evaluation_name": "Philosophy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Philosophy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.772, - "details": { - "description": "min=0.772, mean=0.772, max=0.772, sum=1.543 (2)", - "tab": "Accuracy", - "Philosophy - Observed inference time (s)": { - "description": "min=0.289, mean=0.289, max=0.289, sum=0.578 (2)", - "tab": "Efficiency", - "score": 0.28879288308490125 - }, - "Philosophy - # eval": { - "description": "min=311, mean=311, max=311, sum=622 (2)", - "tab": "General information", - "score": 311.0 - }, - "Philosophy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Philosophy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Philosophy - # prompt tokens": { - "description": "min=327.92, mean=327.92, max=327.92, sum=655.839 (2)", - "tab": "General information", - "score": 327.91961414790995 - }, - "Philosophy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": 
"mmlu_philosophy" - } - } - }, - { - "evaluation_name": "Professional Psychology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Professional Psychology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.833, - "details": { - "description": "min=0.833, mean=0.833, max=0.833, sum=1.667 (2)", - "tab": "Accuracy", - "Professional Medicine - Observed inference time (s)": { - "description": "min=0.306, mean=0.306, max=0.306, sum=0.612 (2)", - "tab": "Efficiency", - "score": 0.30609772924114675 - }, - "Professional Accounting - Observed inference time (s)": { - "description": "min=0.312, mean=0.312, max=0.312, sum=0.624 (2)", - "tab": "Efficiency", - "score": 0.31189272336080565 - }, - "Professional Law - Observed inference time (s)": { - "description": "min=0.327, mean=0.327, max=0.327, sum=0.654 (2)", - "tab": "Efficiency", - "score": 0.32692549234885127 - }, - "Professional Psychology - Observed inference time (s)": { - "description": "min=0.429, mean=0.429, max=0.429, sum=0.858 (2)", - "tab": "Efficiency", - "score": 0.42903122792836107 - }, - "Professional Medicine - # eval": { - "description": "min=272, mean=272, max=272, sum=544 (2)", - "tab": "General information", - "score": 272.0 - }, - "Professional Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Medicine - # prompt tokens": { - "description": "min=1071.18, mean=1071.18, max=1071.18, sum=2142.36 (2)", - "tab": "General information", - "score": 1071.1801470588234 - }, - "Professional Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Accounting - # eval": { - "description": "min=282, mean=282, max=282, sum=564 (2)", - "tab": "General information", - "score": 282.0 - }, - "Professional Accounting - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Accounting - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Accounting - # prompt tokens": { - "description": "min=657.206, mean=657.206, max=657.206, sum=1314.411 (2)", - "tab": "General information", - "score": 657.2056737588653 - }, - "Professional Accounting - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Law - # eval": { - "description": "min=1534, mean=1534, max=1534, sum=3068 (2)", - "tab": "General information", - "score": 1534.0 - }, - "Professional Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # prompt tokens": { - "description": "min=1629.344, mean=1629.344, max=1629.344, sum=3258.687 (2)", - "tab": "General information", - "score": 1629.3435462842242 - 
}, - "Professional Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Psychology - # eval": { - "description": "min=612, mean=612, max=612, sum=1224 (2)", - "tab": "General information", - "score": 612.0 - }, - "Professional Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # prompt tokens": { - "description": "min=574.518, mean=574.518, max=574.518, sum=1149.036 (2)", - "tab": "General information", - "score": 574.5179738562091 - }, - "Professional Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" - } - } - }, - { - "evaluation_name": "Us Foreign Policy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Us Foreign Policy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.91, - "details": { - "description": "min=0.91, mean=0.91, max=0.91, sum=1.82 (2)", - "tab": "Accuracy", - "Us Foreign Policy - Observed inference time (s)": { - "description": "min=0.299, mean=0.299, max=0.299, sum=0.599 (2)", - "tab": "Efficiency", - "score": 0.29943873405456545 - }, - "Us Foreign Policy - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Us Foreign Policy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Us Foreign Policy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Us Foreign Policy - # prompt tokens": { - "description": "min=421.71, mean=421.71, max=421.71, sum=843.42 (2)", - "tab": "General information", - "score": 421.71 - }, - "Us Foreign Policy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" - } - } - }, - { - "evaluation_name": "Astronomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Astronomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.849, - "details": { - "description": "min=0.849, mean=0.849, max=0.849, sum=1.697 (2)", - "tab": "Accuracy", - "Astronomy - Observed inference time (s)": { - "description": "min=0.306, mean=0.306, max=0.306, sum=0.612 (2)", - "tab": 
"Efficiency", - "score": 0.30577954336216573 - }, - "Astronomy - # eval": { - "description": "min=152, mean=152, max=152, sum=304 (2)", - "tab": "General information", - "score": 152.0 - }, - "Astronomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Astronomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Astronomy - # prompt tokens": { - "description": "min=577.349, mean=577.349, max=577.349, sum=1154.697 (2)", - "tab": "General information", - "score": 577.3486842105264 - }, - "Astronomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" - } - } - }, - { - "evaluation_name": "Business Ethics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Business Ethics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.79, - "details": { - "description": "min=0.79, mean=0.79, max=0.79, sum=1.58 (2)", - "tab": "Accuracy", - "Business Ethics - Observed inference time (s)": { - "description": "min=0.301, mean=0.301, max=0.301, sum=0.602 (2)", - "tab": "Efficiency", - "score": 0.3009026026725769 - }, - "Business Ethics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Business Ethics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Business Ethics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Business Ethics - # prompt tokens": { - "description": "min=565.7, mean=565.7, max=565.7, sum=1131.4 (2)", - "tab": "General information", - "score": 565.7 - }, - "Business Ethics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" - } - } - }, - { - "evaluation_name": "Clinical Knowledge", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Clinical Knowledge", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.845, - "details": { - "description": "min=0.845, mean=0.845, max=0.845, sum=1.691 (2)", - "tab": "Accuracy", - "Clinical Knowledge - Observed inference time (s)": { - "description": "min=0.292, mean=0.292, max=0.292, sum=0.585 (2)", - "tab": "Efficiency", - "score": 0.29226316685946485 - }, - "Clinical Knowledge - # eval": { - "description": "min=265, mean=265, max=265, sum=530 (2)", - "tab": "General information", - "score": 265.0 - }, - 
"Clinical Knowledge - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Clinical Knowledge - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Clinical Knowledge - # prompt tokens": { - "description": "min=400.985, mean=400.985, max=400.985, sum=801.97 (2)", - "tab": "General information", - "score": 400.98490566037736 - }, - "Clinical Knowledge - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" - } - } - }, - { - "evaluation_name": "Conceptual Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Conceptual Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.791, - "details": { - "description": "min=0.791, mean=0.791, max=0.791, sum=1.583 (2)", - "tab": "Accuracy", - "Conceptual Physics - Observed inference time (s)": { - "description": "min=0.26, mean=0.26, max=0.26, sum=0.52 (2)", - "tab": "Efficiency", - "score": 0.26024563261803163 - }, - "Conceptual Physics - # eval": { - "description": "min=235, mean=235, max=235, sum=470 (2)", - "tab": "General information", - "score": 235.0 - }, - "Conceptual Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Conceptual Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Conceptual Physics - # prompt tokens": { - "description": "min=304.677, mean=304.677, max=304.677, sum=609.353 (2)", - "tab": "General information", - "score": 304.67659574468087 - }, - "Conceptual Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" - } - } - }, - { - "evaluation_name": "Electrical Engineering", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Electrical Engineering", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.731, - "details": { - "description": "min=0.731, mean=0.731, max=0.731, sum=1.462 (2)", - "tab": "Accuracy", - "Electrical Engineering - Observed inference time (s)": { - "description": "min=0.287, mean=0.287, max=0.287, sum=0.575 (2)", - "tab": "Efficiency", - "score": 0.287484780673323 - }, - "Electrical Engineering - # eval": { - "description": "min=145, mean=145, max=145, sum=290 (2)", - "tab": "General information", - "score": 145.0 - }, - "Electrical Engineering - # train": { - "description": "min=5, mean=5, 
max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Electrical Engineering - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Electrical Engineering - # prompt tokens": { - "description": "min=439.228, mean=439.228, max=439.228, sum=878.455 (2)", - "tab": "General information", - "score": 439.22758620689655 - }, - "Electrical Engineering - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" - } - } - }, - { - "evaluation_name": "Elementary Mathematics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Elementary Mathematics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.651, - "details": { - "description": "min=0.651, mean=0.651, max=0.651, sum=1.302 (2)", - "tab": "Accuracy", - "Elementary Mathematics - Observed inference time (s)": { - "description": "min=0.306, mean=0.306, max=0.306, sum=0.612 (2)", - "tab": "Efficiency", - "score": 0.305813713679238 - }, - "Elementary Mathematics - # eval": { - "description": "min=378, mean=378, max=378, sum=756 (2)", - "tab": "General information", - "score": 378.0 - }, - "Elementary Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Elementary Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Elementary Mathematics - # prompt tokens": { - "description": "min=532.683, mean=532.683, max=532.683, sum=1065.365 (2)", - "tab": "General information", - "score": 532.6825396825396 - }, - "Elementary Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" - } - } - }, - { - "evaluation_name": "Formal Logic", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Formal Logic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.556, - "details": { - "description": "min=0.556, mean=0.556, max=0.556, sum=1.111 (2)", - "tab": "Accuracy", - "Formal Logic - Observed inference time (s)": { - "description": "min=0.355, mean=0.355, max=0.355, sum=0.711 (2)", - "tab": "Efficiency", - "score": 0.3554064962599013 - }, - "Formal Logic - # eval": { - "description": "min=126, mean=126, max=126, sum=252 (2)", - "tab": "General information", - "score": 126.0 - }, - "Formal Logic - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - 
"score": 5.0 - }, - "Formal Logic - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Formal Logic - # prompt tokens": { - "description": "min=604.492, mean=604.492, max=604.492, sum=1208.984 (2)", - "tab": "General information", - "score": 604.4920634920635 - }, - "Formal Logic - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" - } - } - }, - { - "evaluation_name": "High School World History", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on High School World History", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.903, - "details": { - "description": "min=0.903, mean=0.903, max=0.903, sum=1.806 (2)", - "tab": "Accuracy", - "High School Biology - Observed inference time (s)": { - "description": "min=0.383, mean=0.383, max=0.383, sum=0.765 (2)", - "tab": "Efficiency", - "score": 0.3826789717520437 - }, - "High School Chemistry - Observed inference time (s)": { - "description": "min=0.336, mean=0.336, max=0.336, sum=0.672 (2)", - "tab": "Efficiency", - "score": 0.3358421137767472 - }, - "High School Computer Science - Observed inference time (s)": { - "description": "min=0.357, mean=0.357, max=0.357, sum=0.714 (2)", - "tab": "Efficiency", - "score": 0.3572020483016968 - }, - "High School European History - Observed inference time (s)": { - "description": "min=0.442, mean=0.442, max=0.442, sum=0.883 (2)", - "tab": "Efficiency", - "score": 0.44169029033545293 - }, - "High School Geography - Observed inference time (s)": { - "description": "min=0.331, mean=0.331, max=0.331, sum=0.663 (2)", - "tab": "Efficiency", - "score": 0.33136808029328935 - }, - "High School Government And Politics - Observed inference time (s)": { - "description": "min=0.31, mean=0.31, max=0.31, sum=0.62 (2)", - "tab": "Efficiency", - "score": 0.31024189563612864 - }, - "High School Macroeconomics - Observed inference time (s)": { - "description": "min=0.302, mean=0.302, max=0.302, sum=0.605 (2)", - "tab": "Efficiency", - "score": 0.30249478022257487 - }, - "High School Mathematics - Observed inference time (s)": { - "description": "min=0.273, mean=0.273, max=0.273, sum=0.546 (2)", - "tab": "Efficiency", - "score": 0.2731299541614674 - }, - "High School Microeconomics - Observed inference time (s)": { - "description": "min=0.343, mean=0.343, max=0.343, sum=0.687 (2)", - "tab": "Efficiency", - "score": 0.34336654078058837 - }, - "High School Physics - Observed inference time (s)": { - "description": "min=0.277, mean=0.277, max=0.277, sum=0.554 (2)", - "tab": "Efficiency", - "score": 0.27723274167799794 - }, - "High School Psychology - Observed inference time (s)": { - "description": "min=0.342, mean=0.342, max=0.342, sum=0.684 (2)", - "tab": "Efficiency", - "score": 0.3419263616614385 - }, - "High School Statistics - Observed inference time (s)": { - "description": "min=0.415, mean=0.415, max=0.415, sum=0.83 (2)", - "tab": "Efficiency", - "score": 0.41491677584471526 - }, - "High School US History - 
Observed inference time (s)": { - "description": "min=0.367, mean=0.367, max=0.367, sum=0.735 (2)", - "tab": "Efficiency", - "score": 0.3674813041500017 - }, - "High School World History - Observed inference time (s)": { - "description": "min=0.339, mean=0.339, max=0.339, sum=0.678 (2)", - "tab": "Efficiency", - "score": 0.33923840120371884 - }, - "High School Biology - # eval": { - "description": "min=310, mean=310, max=310, sum=620 (2)", - "tab": "General information", - "score": 310.0 - }, - "High School Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Biology - # prompt tokens": { - "description": "min=504.874, mean=504.874, max=504.874, sum=1009.748 (2)", - "tab": "General information", - "score": 504.8741935483871 - }, - "High School Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Chemistry - # eval": { - "description": "min=203, mean=203, max=203, sum=406 (2)", - "tab": "General information", - "score": 203.0 - }, - "High School Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # prompt tokens": { - "description": "min=495.34, mean=495.34, max=495.34, sum=990.68 (2)", - "tab": "General information", - "score": 495.3399014778325 - }, - "High School Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "High School Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # prompt tokens": { - "description": "min=865.8, mean=865.8, max=865.8, sum=1731.6 (2)", - "tab": "General information", - "score": 865.8 - }, - "High School Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School European History - # eval": { - "description": "min=165, mean=165, max=165, sum=330 (2)", - "tab": "General information", - "score": 165.0 - }, - "High School European History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School European History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School European History - # prompt tokens": { - "description": "min=2793.83, mean=2793.83, max=2793.83, sum=5587.661 (2)", - "tab": "General information", - "score": 2793.830303030303 - }, - "High School European History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Geography - # eval": { - 
"description": "min=198, mean=198, max=198, sum=396 (2)", - "tab": "General information", - "score": 198.0 - }, - "High School Geography - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Geography - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Geography - # prompt tokens": { - "description": "min=372.783, mean=372.783, max=372.783, sum=745.566 (2)", - "tab": "General information", - "score": 372.7828282828283 - }, - "High School Geography - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Government And Politics - # eval": { - "description": "min=193, mean=193, max=193, sum=386 (2)", - "tab": "General information", - "score": 193.0 - }, - "High School Government And Politics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Government And Politics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # prompt tokens": { - "description": "min=463.01, mean=463.01, max=463.01, sum=926.021 (2)", - "tab": "General information", - "score": 463.0103626943005 - }, - "High School Government And Politics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Macroeconomics - # eval": { - "description": "min=390, mean=390, max=390, sum=780 (2)", - "tab": "General information", - "score": 390.0 - }, - "High School Macroeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Macroeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - # prompt tokens": { - "description": "min=371.451, mean=371.451, max=371.451, sum=742.903 (2)", - "tab": "General information", - "score": 371.4512820512821 - }, - "High School Macroeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Mathematics - # eval": { - "description": "min=270, mean=270, max=270, sum=540 (2)", - "tab": "General information", - "score": 270.0 - }, - "High School Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # prompt tokens": { - "description": "min=532.456, mean=532.456, max=532.456, sum=1064.911 (2)", - "tab": "General information", - "score": 532.4555555555555 - }, - "High School Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Microeconomics - # eval": { - "description": "min=238, mean=238, max=238, sum=476 (2)", - "tab": "General information", - "score": 238.0 - }, - "High School Microeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Microeconomics - truncated": { - 
"description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Microeconomics - # prompt tokens": { - "description": "min=398.739, mean=398.739, max=398.739, sum=797.479 (2)", - "tab": "General information", - "score": 398.73949579831935 - }, - "High School Microeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Physics - # eval": { - "description": "min=151, mean=151, max=151, sum=302 (2)", - "tab": "General information", - "score": 151.0 - }, - "High School Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # prompt tokens": { - "description": "min=560.238, mean=560.238, max=560.238, sum=1120.477 (2)", - "tab": "General information", - "score": 560.2384105960265 - }, - "High School Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Psychology - # eval": { - "description": "min=545, mean=545, max=545, sum=1090 (2)", - "tab": "General information", - "score": 545.0 - }, - "High School Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # prompt tokens": { - "description": "min=492.917, mean=492.917, max=492.917, sum=985.835 (2)", - "tab": "General information", - "score": 492.91743119266056 - }, - "High School Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Statistics - # eval": { - "description": "min=216, mean=216, max=216, sum=432 (2)", - "tab": "General information", - "score": 216.0 - }, - "High School Statistics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Statistics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Statistics - # prompt tokens": { - "description": "min=787.574, mean=787.574, max=787.574, sum=1575.148 (2)", - "tab": "General information", - "score": 787.574074074074 - }, - "High School Statistics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School US History - # eval": { - "description": "min=204, mean=204, max=204, sum=408 (2)", - "tab": "General information", - "score": 204.0 - }, - "High School US History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School US History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # prompt tokens": { - "description": "min=2220.005, mean=2220.005, max=2220.005, sum=4440.01 (2)", - "tab": "General information", - "score": 2220.0049019607845 - }, - "High School US History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General 
information", - "score": 1.0 - }, - "High School World History - # eval": { - "description": "min=237, mean=237, max=237, sum=474 (2)", - "tab": "General information", - "score": 237.0 - }, - "High School World History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School World History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School World History - # prompt tokens": { - "description": "min=1424.439, mean=1424.439, max=1424.439, sum=2848.878 (2)", - "tab": "General information", - "score": 1424.4388185654009 - }, - "High School World History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" - } - } - }, - { - "evaluation_name": "Human Sexuality", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Human Sexuality", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.863, - "details": { - "description": "min=0.863, mean=0.863, max=0.863, sum=1.725 (2)", - "tab": "Accuracy", - "Human Aging - Observed inference time (s)": { - "description": "min=0.305, mean=0.305, max=0.305, sum=0.61 (2)", - "tab": "Efficiency", - "score": 0.30522876897734913 - }, - "Human Sexuality - Observed inference time (s)": { - "description": "min=0.303, mean=0.303, max=0.303, sum=0.606 (2)", - "tab": "Efficiency", - "score": 0.30280636285097545 - }, - "Human Aging - # eval": { - "description": "min=223, mean=223, max=223, sum=446 (2)", - "tab": "General information", - "score": 223.0 - }, - "Human Aging - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Aging - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Aging - # prompt tokens": { - "description": "min=316.453, mean=316.453, max=316.453, sum=632.906 (2)", - "tab": "General information", - "score": 316.4529147982063 - }, - "Human Aging - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Human Sexuality - # eval": { - "description": "min=131, mean=131, max=131, sum=262 (2)", - "tab": "General information", - "score": 131.0 - }, - "Human Sexuality - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Sexuality - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # prompt tokens": { - "description": "min=335.695, mean=335.695, max=335.695, sum=671.389 (2)", - "tab": "General information", - "score": 335.69465648854964 - }, - "Human Sexuality - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": 
"human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" - } - } - }, - { - "evaluation_name": "International Law", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on International Law", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.926, - "details": { - "description": "min=0.926, mean=0.926, max=0.926, sum=1.851 (2)", - "tab": "Accuracy", - "International Law - Observed inference time (s)": { - "description": "min=0.343, mean=0.343, max=0.343, sum=0.685 (2)", - "tab": "Efficiency", - "score": 0.3425306268959991 - }, - "International Law - # eval": { - "description": "min=121, mean=121, max=121, sum=242 (2)", - "tab": "General information", - "score": 121.0 - }, - "International Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "International Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "International Law - # prompt tokens": { - "description": "min=639.504, mean=639.504, max=639.504, sum=1279.008 (2)", - "tab": "General information", - "score": 639.5041322314049 - }, - "International Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" - } - } - }, - { - "evaluation_name": "Logical Fallacies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Logical Fallacies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.871, - "details": { - "description": "min=0.871, mean=0.871, max=0.871, sum=1.742 (2)", - "tab": "Accuracy", - "Logical Fallacies - Observed inference time (s)": { - "description": "min=0.297, mean=0.297, max=0.297, sum=0.595 (2)", - "tab": "Efficiency", - "score": 0.29739713961361375 - }, - "Logical Fallacies - # eval": { - "description": "min=163, mean=163, max=163, sum=326 (2)", - "tab": "General information", - "score": 163.0 - }, - "Logical Fallacies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Logical Fallacies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Logical Fallacies - # prompt tokens": { - "description": "min=445.84, mean=445.84, max=445.84, sum=891.681 (2)", - "tab": "General information", - "score": 445.840490797546 - }, - "Logical Fallacies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": 
"mmlu_logical_fallacies" - } - } - }, - { - "evaluation_name": "Machine Learning", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Machine Learning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.616, - "details": { - "description": "min=0.616, mean=0.616, max=0.616, sum=1.232 (2)", - "tab": "Accuracy", - "Machine Learning - Observed inference time (s)": { - "description": "min=0.297, mean=0.297, max=0.297, sum=0.594 (2)", - "tab": "Efficiency", - "score": 0.2970866986683437 - }, - "Machine Learning - # eval": { - "description": "min=112, mean=112, max=112, sum=224 (2)", - "tab": "General information", - "score": 112.0 - }, - "Machine Learning - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Machine Learning - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Machine Learning - # prompt tokens": { - "description": "min=666.205, mean=666.205, max=666.205, sum=1332.411 (2)", - "tab": "General information", - "score": 666.2053571428571 - }, - "Machine Learning - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" - } - } - }, - { - "evaluation_name": "Management", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Management", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.845, - "details": { - "description": "min=0.845, mean=0.845, max=0.845, sum=1.689 (2)", - "tab": "Accuracy", - "Management - Observed inference time (s)": { - "description": "min=0.305, mean=0.305, max=0.305, sum=0.611 (2)", - "tab": "Efficiency", - "score": 0.3053626088262762 - }, - "Management - # eval": { - "description": "min=103, mean=103, max=103, sum=206 (2)", - "tab": "General information", - "score": 103.0 - }, - "Management - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Management - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Management - # prompt tokens": { - "description": "min=279.485, mean=279.485, max=279.485, sum=558.971 (2)", - "tab": "General information", - "score": 279.4854368932039 - }, - "Management - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" - } - } - }, - { - "evaluation_name": "Marketing", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Marketing", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.927, - "details": { - "description": "min=0.927, mean=0.927, max=0.927, sum=1.855 (2)", - "tab": "Accuracy", - "Marketing - Observed inference time (s)": { - "description": "min=0.306, mean=0.306, max=0.306, sum=0.612 (2)", - "tab": "Efficiency", - "score": 0.3060942073153634 - }, - "Marketing - # eval": { - "description": "min=234, mean=234, max=234, sum=468 (2)", - "tab": "General information", - "score": 234.0 - }, - "Marketing - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Marketing - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Marketing - # prompt tokens": { - "description": "min=399.85, mean=399.85, max=399.85, sum=799.701 (2)", - "tab": "General information", - "score": 399.85042735042737 - }, - "Marketing - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" - } - } - }, - { - "evaluation_name": "Medical Genetics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Medical Genetics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.89, - "details": { - "description": "min=0.89, mean=0.89, max=0.89, sum=1.78 (2)", - "tab": "Accuracy", - "Medical Genetics - Observed inference time (s)": { - "description": "min=0.311, mean=0.311, max=0.311, sum=0.622 (2)", - "tab": "Efficiency", - "score": 0.31078683137893676 - }, - "Medical Genetics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Medical Genetics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Medical Genetics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Medical Genetics - # prompt tokens": { - "description": "min=343.23, mean=343.23, max=343.23, sum=686.46 (2)", - "tab": "General information", - "score": 343.23 - }, - "Medical Genetics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" - } - } - }, - { - "evaluation_name": "Miscellaneous", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Miscellaneous", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.913, - "details": { - "description": "min=0.913, mean=0.913, max=0.913, sum=1.826 (2)", - "tab": "Accuracy", - "Miscellaneous - Observed inference time (s)": { - "description": "min=0.302, mean=0.302, max=0.302, sum=0.604 (2)", - "tab": "Efficiency", - "score": 0.3020631249989282 - }, - "Miscellaneous - # eval": { - "description": "min=783, mean=783, max=783, sum=1566 (2)", - "tab": "General information", - "score": 783.0 - }, - "Miscellaneous - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Miscellaneous - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Miscellaneous - # prompt tokens": { - "description": "min=296.479, mean=296.479, max=296.479, sum=592.958 (2)", - "tab": "General information", - "score": 296.47892720306515 - }, - "Miscellaneous - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" - } - } - }, - { - "evaluation_name": "Moral Scenarios", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Moral Scenarios", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.485, - "details": { - "description": "min=0.485, mean=0.485, max=0.485, sum=0.97 (2)", - "tab": "Accuracy", - "Moral Disputes - Observed inference time (s)": { - "description": "min=0.316, mean=0.316, max=0.316, sum=0.631 (2)", - "tab": "Efficiency", - "score": 0.31556026577260454 - }, - "Moral Scenarios - Observed inference time (s)": { - "description": "min=0.318, mean=0.318, max=0.318, sum=0.637 (2)", - "tab": "Efficiency", - "score": 0.3183864769322912 - }, - "Moral Disputes - # eval": { - "description": "min=346, mean=346, max=346, sum=692 (2)", - "tab": "General information", - "score": 346.0 - }, - "Moral Disputes - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Disputes - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Disputes - # prompt tokens": { - "description": "min=474.835, mean=474.835, max=474.835, sum=949.671 (2)", - "tab": "General information", - "score": 474.83526011560696 - }, - "Moral Disputes - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Moral Scenarios - # eval": { - "description": "min=895, mean=895, max=895, sum=1790 (2)", - "tab": "General information", - "score": 895.0 - }, - "Moral Scenarios - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Scenarios - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # prompt tokens": { - "description": "min=655.068, mean=655.068, 
max=655.068, sum=1310.136 (2)", - "tab": "General information", - "score": 655.068156424581 - }, - "Moral Scenarios - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" - } - } - }, - { - "evaluation_name": "Nutrition", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Nutrition", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.827, - "details": { - "description": "min=0.827, mean=0.827, max=0.827, sum=1.654 (2)", - "tab": "Accuracy", - "Nutrition - Observed inference time (s)": { - "description": "min=0.31, mean=0.31, max=0.31, sum=0.621 (2)", - "tab": "Efficiency", - "score": 0.3104910164876701 - }, - "Nutrition - # eval": { - "description": "min=306, mean=306, max=306, sum=612 (2)", - "tab": "General information", - "score": 306.0 - }, - "Nutrition - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Nutrition - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Nutrition - # prompt tokens": { - "description": "min=581.997, mean=581.997, max=581.997, sum=1163.993 (2)", - "tab": "General information", - "score": 581.9967320261438 - }, - "Nutrition - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" - } - } - }, - { - "evaluation_name": "Prehistory", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Prehistory", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.833, - "details": { - "description": "min=0.833, mean=0.833, max=0.833, sum=1.667 (2)", - "tab": "Accuracy", - "Prehistory - Observed inference time (s)": { - "description": "min=0.311, mean=0.311, max=0.311, sum=0.621 (2)", - "tab": "Efficiency", - "score": 0.3106661284411395 - }, - "Prehistory - # eval": { - "description": "min=324, mean=324, max=324, sum=648 (2)", - "tab": "General information", - "score": 324.0 - }, - "Prehistory - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Prehistory - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Prehistory - # prompt tokens": { - "description": "min=513.944, mean=513.944, max=513.944, sum=1027.889 (2)", - "tab": "General information", - "score": 513.9444444444445 - }, - "Prehistory - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - 
}, - "generation_config": { - "additional_details": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" - } - } - }, - { - "evaluation_name": "Public Relations", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Public Relations", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.791, - "details": { - "description": "min=0.791, mean=0.791, max=0.791, sum=1.582 (2)", - "tab": "Accuracy", - "Public Relations - Observed inference time (s)": { - "description": "min=0.303, mean=0.303, max=0.303, sum=0.606 (2)", - "tab": "Efficiency", - "score": 0.30300807519392536 - }, - "Public Relations - # eval": { - "description": "min=110, mean=110, max=110, sum=220 (2)", - "tab": "General information", - "score": 110.0 - }, - "Public Relations - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Public Relations - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Public Relations - # prompt tokens": { - "description": "min=402.918, mean=402.918, max=402.918, sum=805.836 (2)", - "tab": "General information", - "score": 402.91818181818184 - }, - "Public Relations - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" - } - } - }, - { - "evaluation_name": "Security Studies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Security Studies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.788, - "details": { - "description": "min=0.788, mean=0.788, max=0.788, sum=1.576 (2)", - "tab": "Accuracy", - "Security Studies - Observed inference time (s)": { - "description": "min=0.733, mean=0.733, max=0.733, sum=1.466 (2)", - "tab": "Efficiency", - "score": 0.733092721627683 - }, - "Security Studies - # eval": { - "description": "min=245, mean=245, max=245, sum=490 (2)", - "tab": "General information", - "score": 245.0 - }, - "Security Studies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Security Studies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Security Studies - # prompt tokens": { - "description": "min=1166.686, mean=1166.686, max=1166.686, sum=2333.371 (2)", - "tab": "General information", - "score": 1166.6857142857143 - }, - "Security Studies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "security_studies", - "method": "multiple_choice_joint", - 
"eval_split": "test", - "groups": "mmlu_security_studies" - } - } - }, - { - "evaluation_name": "Sociology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Sociology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9, - "details": { - "description": "min=0.9, mean=0.9, max=0.9, sum=1.801 (2)", - "tab": "Accuracy", - "Sociology - Observed inference time (s)": { - "description": "min=0.361, mean=0.361, max=0.361, sum=0.722 (2)", - "tab": "Efficiency", - "score": 0.3608738794848694 - }, - "Sociology - # eval": { - "description": "min=201, mean=201, max=201, sum=402 (2)", - "tab": "General information", - "score": 201.0 - }, - "Sociology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Sociology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Sociology - # prompt tokens": { - "description": "min=444.269, mean=444.269, max=444.269, sum=888.537 (2)", - "tab": "General information", - "score": 444.2686567164179 - }, - "Sociology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" - } - } - }, - { - "evaluation_name": "Virology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Virology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.536, - "details": { - "description": "min=0.536, mean=0.536, max=0.536, sum=1.072 (2)", - "tab": "Accuracy", - "Virology - Observed inference time (s)": { - "description": "min=0.489, mean=0.489, max=0.489, sum=0.978 (2)", - "tab": "Efficiency", - "score": 0.48897463298705685 - }, - "Virology - # eval": { - "description": "min=166, mean=166, max=166, sum=332 (2)", - "tab": "General information", - "score": 166.0 - }, - "Virology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Virology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Virology - # prompt tokens": { - "description": "min=334.434, mean=334.434, max=334.434, sum=668.867 (2)", - "tab": "General information", - "score": 334.43373493975906 - }, - "Virology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" - } - } - }, - { - "evaluation_name": "World Religions", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on World Religions", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.86, - "details": { - "description": "min=0.86, mean=0.86, max=0.86, sum=1.719 (2)", - "tab": "Accuracy", - "World Religions - Observed inference time (s)": { - "description": "min=0.623, mean=0.623, max=0.623, sum=1.247 (2)", - "tab": "Efficiency", - "score": 0.6232896199700428 - }, - "World Religions - # eval": { - "description": "min=171, mean=171, max=171, sum=342 (2)", - "tab": "General information", - "score": 171.0 - }, - "World Religions - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "World Religions - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "World Religions - # prompt tokens": { - "description": "min=267.936, mean=267.936, max=267.936, sum=535.871 (2)", - "tab": "General information", - "score": 267.9356725146199 - }, - "World Religions - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" - } - } - }, - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.774, - "details": { - "tab": "Efficiency" - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_mmlu/qwen/qwen1.5-110b-chat/ff9b6c57-cadd-4d5d-92cb-62be63939b1b.json b/data/helm_mmlu/qwen/qwen1.5-110b-chat/ff9b6c57-cadd-4d5d-92cb-62be63939b1b.json deleted file mode 100644 index 4b924f5af..000000000 --- a/data/helm_mmlu/qwen/qwen1.5-110b-chat/ff9b6c57-cadd-4d5d-92cb-62be63939b1b.json +++ /dev/null @@ -1,3021 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/qwen_qwen1.5-110b-chat/1770835937.459157", - "retrieved_timestamp": "1770835937.459157", - "source_metadata": { - "source_name": "helm_mmlu", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen1.5 Chat 110B", - "id": "qwen/qwen1.5-110b-chat", - "developer": "qwen", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "MMLU All Subjects", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU All Subjects", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.768, - "details": { - 
"description": "min=0.478, mean=0.768, max=0.984, sum=87.534 (114)", - "tab": "Accuracy", - "MMLU All Subjects - Observed inference time (s)": { - "description": "min=0.229, mean=0.287, max=0.751, sum=32.77 (114)", - "tab": "Efficiency", - "score": 0.2874531237731517 - }, - "MMLU All Subjects - # eval": { - "description": "min=100, mean=246.351, max=1534, sum=28084 (114)", - "tab": "General information", - "score": 246.35087719298247 - }, - "MMLU All Subjects - # train": { - "description": "min=5, mean=5, max=5, sum=570 (114)", - "tab": "General information", - "score": 5.0 - }, - "MMLU All Subjects - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (114)", - "tab": "General information", - "score": 0.0 - }, - "MMLU All Subjects - # prompt tokens": { - "description": "min=276.07, mean=625.598, max=2814.903, sum=71318.198 (114)", - "tab": "General information", - "score": 625.5982315160392 - }, - "MMLU All Subjects - # output tokens": { - "description": "min=1, mean=1, max=1, sum=114 (114)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - 
"mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] - } - } - }, - { - "evaluation_name": "Abstract Algebra", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Abstract Algebra", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.57, - "details": { - "description": "min=0.57, mean=0.57, max=0.57, sum=1.14 (2)", - "tab": "Accuracy", - "Abstract Algebra - Observed inference time (s)": { - "description": "min=0.23, mean=0.23, max=0.23, sum=0.459 (2)", - "tab": "Efficiency", - "score": 0.22966567754745484 - }, - "Abstract Algebra - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Abstract Algebra - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Abstract Algebra - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Abstract Algebra - # prompt tokens": { - "description": "min=378.19, mean=378.19, max=378.19, sum=756.38 (2)", - "tab": "General information", - "score": 378.19 - }, - "Abstract Algebra - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" - } - } - }, - { - "evaluation_name": "Anatomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Anatomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.696, - "details": { - "description": "min=0.696, mean=0.696, max=0.696, sum=1.393 (2)", - "tab": "Accuracy", - "Anatomy - Observed inference time (s)": { - "description": "min=0.26, mean=0.26, max=0.26, sum=0.52 (2)", - "tab": "Efficiency", - "score": 0.2600334096837927 - }, - "Anatomy - # eval": { - "description": "min=135, mean=135, max=135, sum=270 (2)", - "tab": "General information", - "score": 135.0 - }, - "Anatomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Anatomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Anatomy - # prompt tokens": { - "description": "min=353.978, mean=353.978, max=353.978, sum=707.956 (2)", - 
"tab": "General information", - "score": 353.97777777777776 - }, - "Anatomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" - } - } - }, - { - "evaluation_name": "College Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on College Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.51, - "details": { - "description": "min=0.51, mean=0.51, max=0.51, sum=1.02 (2)", - "tab": "Accuracy", - "College Chemistry - Observed inference time (s)": { - "description": "min=0.257, mean=0.257, max=0.257, sum=0.513 (2)", - "tab": "Efficiency", - "score": 0.2566096520423889 - }, - "College Biology - Observed inference time (s)": { - "description": "min=0.296, mean=0.296, max=0.296, sum=0.592 (2)", - "tab": "Efficiency", - "score": 0.2957576380835639 - }, - "College Computer Science - Observed inference time (s)": { - "description": "min=0.326, mean=0.326, max=0.326, sum=0.652 (2)", - "tab": "Efficiency", - "score": 0.3260823440551758 - }, - "College Mathematics - Observed inference time (s)": { - "description": "min=0.299, mean=0.299, max=0.299, sum=0.598 (2)", - "tab": "Efficiency", - "score": 0.2992465353012085 - }, - "College Medicine - Observed inference time (s)": { - "description": "min=0.269, mean=0.269, max=0.269, sum=0.538 (2)", - "tab": "Efficiency", - "score": 0.2690960313543419 - }, - "College Physics - Observed inference time (s)": { - "description": "min=0.281, mean=0.281, max=0.281, sum=0.562 (2)", - "tab": "Efficiency", - "score": 0.28119626699709427 - }, - "College Chemistry - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Chemistry - # prompt tokens": { - "description": "min=568.25, mean=568.25, max=568.25, sum=1136.5 (2)", - "tab": "General information", - "score": 568.25 - }, - "College Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Biology - # eval": { - "description": "min=144, mean=144, max=144, sum=288 (2)", - "tab": "General information", - "score": 144.0 - }, - "College Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # prompt tokens": { - "description": "min=486.979, mean=486.979, max=486.979, sum=973.958 (2)", - "tab": "General information", - "score": 486.9791666666667 - }, - "College Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College 
Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer Science - # prompt tokens": { - "description": "min=838.58, mean=838.58, max=838.58, sum=1677.16 (2)", - "tab": "General information", - "score": 838.58 - }, - "College Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Mathematics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # prompt tokens": { - "description": "min=607.7, mean=607.7, max=607.7, sum=1215.4 (2)", - "tab": "General information", - "score": 607.7 - }, - "College Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Medicine - # eval": { - "description": "min=173, mean=173, max=173, sum=346 (2)", - "tab": "General information", - "score": 173.0 - }, - "College Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Medicine - # prompt tokens": { - "description": "min=506.098, mean=506.098, max=506.098, sum=1012.197 (2)", - "tab": "General information", - "score": 506.0982658959538 - }, - "College Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Physics - # eval": { - "description": "min=102, mean=102, max=102, sum=204 (2)", - "tab": "General information", - "score": 102.0 - }, - "College Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Physics - # prompt tokens": { - "description": "min=516.265, mean=516.265, max=516.265, sum=1032.529 (2)", - "tab": "General information", - "score": 516.2647058823529 - }, - "College Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" - } - } - }, - { - "evaluation_name": "Computer Security", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Computer 
Security", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.82, - "details": { - "description": "min=0.82, mean=0.82, max=0.82, sum=1.64 (2)", - "tab": "Accuracy", - "Computer Security - Observed inference time (s)": { - "description": "min=0.277, mean=0.277, max=0.277, sum=0.555 (2)", - "tab": "Efficiency", - "score": 0.2773160576820374 - }, - "Computer Security - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Computer Security - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Computer Security - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Computer Security - # prompt tokens": { - "description": "min=386.64, mean=386.64, max=386.64, sum=773.28 (2)", - "tab": "General information", - "score": 386.64 - }, - "Computer Security - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" - } - } - }, - { - "evaluation_name": "Econometrics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Econometrics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.64, - "details": { - "description": "min=0.64, mean=0.64, max=0.64, sum=1.281 (2)", - "tab": "Accuracy", - "Econometrics - Observed inference time (s)": { - "description": "min=0.248, mean=0.248, max=0.248, sum=0.496 (2)", - "tab": "Efficiency", - "score": 0.24817464017031485 - }, - "Econometrics - # eval": { - "description": "min=114, mean=114, max=114, sum=228 (2)", - "tab": "General information", - "score": 114.0 - }, - "Econometrics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Econometrics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Econometrics - # prompt tokens": { - "description": "min=627.939, mean=627.939, max=627.939, sum=1255.877 (2)", - "tab": "General information", - "score": 627.938596491228 - }, - "Econometrics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" - } - } - }, - { - "evaluation_name": "Global Facts", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Global Facts", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.51, - "details": { - "description": 
"min=0.51, mean=0.51, max=0.51, sum=1.02 (2)", - "tab": "Accuracy", - "Global Facts - Observed inference time (s)": { - "description": "min=0.257, mean=0.257, max=0.257, sum=0.514 (2)", - "tab": "Efficiency", - "score": 0.25695453643798827 - }, - "Global Facts - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Global Facts - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Global Facts - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Global Facts - # prompt tokens": { - "description": "min=429.06, mean=429.06, max=429.06, sum=858.12 (2)", - "tab": "General information", - "score": 429.06 - }, - "Global Facts - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" - } - } - }, - { - "evaluation_name": "Jurisprudence", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Jurisprudence", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.833, - "details": { - "description": "min=0.833, mean=0.833, max=0.833, sum=1.667 (2)", - "tab": "Accuracy", - "Jurisprudence - Observed inference time (s)": { - "description": "min=0.256, mean=0.256, max=0.256, sum=0.512 (2)", - "tab": "Efficiency", - "score": 0.25610714267801354 - }, - "Jurisprudence - # eval": { - "description": "min=108, mean=108, max=108, sum=216 (2)", - "tab": "General information", - "score": 108.0 - }, - "Jurisprudence - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Jurisprudence - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Jurisprudence - # prompt tokens": { - "description": "min=394.713, mean=394.713, max=394.713, sum=789.426 (2)", - "tab": "General information", - "score": 394.712962962963 - }, - "Jurisprudence - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" - } - } - }, - { - "evaluation_name": "Philosophy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Philosophy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.823, - "details": { - "description": "min=0.823, mean=0.823, max=0.823, sum=1.646 (2)", - "tab": "Accuracy", - "Philosophy - Observed inference time (s)": { - "description": "min=0.233, mean=0.233, max=0.233, sum=0.465 (2)", - "tab": "Efficiency", - 
"score": 0.2326939565959084 - }, - "Philosophy - # eval": { - "description": "min=311, mean=311, max=311, sum=622 (2)", - "tab": "General information", - "score": 311.0 - }, - "Philosophy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Philosophy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Philosophy - # prompt tokens": { - "description": "min=329.09, mean=329.09, max=329.09, sum=658.18 (2)", - "tab": "General information", - "score": 329.09003215434086 - }, - "Philosophy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" - } - } - }, - { - "evaluation_name": "Professional Psychology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Professional Psychology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.82, - "details": { - "description": "min=0.82, mean=0.82, max=0.82, sum=1.641 (2)", - "tab": "Accuracy", - "Professional Medicine - Observed inference time (s)": { - "description": "min=0.396, mean=0.396, max=0.396, sum=0.792 (2)", - "tab": "Efficiency", - "score": 0.39590225675526786 - }, - "Professional Accounting - Observed inference time (s)": { - "description": "min=0.243, mean=0.243, max=0.243, sum=0.486 (2)", - "tab": "Efficiency", - "score": 0.24316950554543354 - }, - "Professional Law - Observed inference time (s)": { - "description": "min=0.319, mean=0.319, max=0.319, sum=0.638 (2)", - "tab": "Efficiency", - "score": 0.31920133731200456 - }, - "Professional Psychology - Observed inference time (s)": { - "description": "min=0.245, mean=0.245, max=0.245, sum=0.491 (2)", - "tab": "Efficiency", - "score": 0.2452772462290097 - }, - "Professional Medicine - # eval": { - "description": "min=272, mean=272, max=272, sum=544 (2)", - "tab": "General information", - "score": 272.0 - }, - "Professional Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Medicine - # prompt tokens": { - "description": "min=1125.199, mean=1125.199, max=1125.199, sum=2250.397 (2)", - "tab": "General information", - "score": 1125.1985294117646 - }, - "Professional Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Accounting - # eval": { - "description": "min=282, mean=282, max=282, sum=564 (2)", - "tab": "General information", - "score": 282.0 - }, - "Professional Accounting - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Accounting - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Accounting - # prompt tokens": 
{ - "description": "min=739.34, mean=739.34, max=739.34, sum=1478.681 (2)", - "tab": "General information", - "score": 739.3404255319149 - }, - "Professional Accounting - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Law - # eval": { - "description": "min=1534, mean=1534, max=1534, sum=3068 (2)", - "tab": "General information", - "score": 1534.0 - }, - "Professional Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # prompt tokens": { - "description": "min=1663.969, mean=1663.969, max=1663.969, sum=3327.939 (2)", - "tab": "General information", - "score": 1663.9693611473272 - }, - "Professional Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Psychology - # eval": { - "description": "min=612, mean=612, max=612, sum=1224 (2)", - "tab": "General information", - "score": 612.0 - }, - "Professional Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # prompt tokens": { - "description": "min=581.417, mean=581.417, max=581.417, sum=1162.833 (2)", - "tab": "General information", - "score": 581.4166666666666 - }, - "Professional Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" - } - } - }, - { - "evaluation_name": "Us Foreign Policy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Us Foreign Policy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.87, - "details": { - "description": "min=0.87, mean=0.87, max=0.87, sum=1.74 (2)", - "tab": "Accuracy", - "Us Foreign Policy - Observed inference time (s)": { - "description": "min=0.229, mean=0.229, max=0.229, sum=0.459 (2)", - "tab": "Efficiency", - "score": 0.22928016662597656 - }, - "Us Foreign Policy - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Us Foreign Policy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Us Foreign Policy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Us Foreign Policy - # prompt tokens": { - "description": "min=428.16, mean=428.16, max=428.16, sum=856.32 (2)", - "tab": "General information", - "score": 428.16 - }, - "Us Foreign Policy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - 
"score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" - } - } - }, - { - "evaluation_name": "Astronomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Astronomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.901, - "details": { - "description": "min=0.901, mean=0.901, max=0.901, sum=1.803 (2)", - "tab": "Accuracy", - "Astronomy - Observed inference time (s)": { - "description": "min=0.306, mean=0.306, max=0.306, sum=0.612 (2)", - "tab": "Efficiency", - "score": 0.3059707331029992 - }, - "Astronomy - # eval": { - "description": "min=152, mean=152, max=152, sum=304 (2)", - "tab": "General information", - "score": 152.0 - }, - "Astronomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Astronomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Astronomy - # prompt tokens": { - "description": "min=589.849, mean=589.849, max=589.849, sum=1179.697 (2)", - "tab": "General information", - "score": 589.8486842105264 - }, - "Astronomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" - } - } - }, - { - "evaluation_name": "Business Ethics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Business Ethics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8, - "details": { - "description": "min=0.8, mean=0.8, max=0.8, sum=1.6 (2)", - "tab": "Accuracy", - "Business Ethics - Observed inference time (s)": { - "description": "min=0.311, mean=0.311, max=0.311, sum=0.622 (2)", - "tab": "Efficiency", - "score": 0.31108115911483764 - }, - "Business Ethics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Business Ethics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Business Ethics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Business Ethics - # prompt tokens": { - "description": "min=569.87, mean=569.87, max=569.87, sum=1139.74 (2)", - "tab": "General information", - "score": 569.87 - }, - "Business Ethics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" - } - } - }, - { - 
"evaluation_name": "Clinical Knowledge", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Clinical Knowledge", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.766, - "details": { - "description": "min=0.766, mean=0.766, max=0.766, sum=1.532 (2)", - "tab": "Accuracy", - "Clinical Knowledge - Observed inference time (s)": { - "description": "min=0.268, mean=0.268, max=0.268, sum=0.536 (2)", - "tab": "Efficiency", - "score": 0.26778328283777775 - }, - "Clinical Knowledge - # eval": { - "description": "min=265, mean=265, max=265, sum=530 (2)", - "tab": "General information", - "score": 265.0 - }, - "Clinical Knowledge - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Clinical Knowledge - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Clinical Knowledge - # prompt tokens": { - "description": "min=400.623, mean=400.623, max=400.623, sum=801.245 (2)", - "tab": "General information", - "score": 400.62264150943395 - }, - "Clinical Knowledge - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" - } - } - }, - { - "evaluation_name": "Conceptual Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Conceptual Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.838, - "details": { - "description": "min=0.838, mean=0.838, max=0.838, sum=1.677 (2)", - "tab": "Accuracy", - "Conceptual Physics - Observed inference time (s)": { - "description": "min=0.267, mean=0.267, max=0.267, sum=0.533 (2)", - "tab": "Efficiency", - "score": 0.26653050361795627 - }, - "Conceptual Physics - # eval": { - "description": "min=235, mean=235, max=235, sum=470 (2)", - "tab": "General information", - "score": 235.0 - }, - "Conceptual Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Conceptual Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Conceptual Physics - # prompt tokens": { - "description": "min=305.494, mean=305.494, max=305.494, sum=610.987 (2)", - "tab": "General information", - "score": 305.4936170212766 - }, - "Conceptual Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" - } - } - }, - { - "evaluation_name": "Electrical Engineering", - "source_data": { - "dataset_name": 
"helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Electrical Engineering", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.752, - "details": { - "description": "min=0.752, mean=0.752, max=0.752, sum=1.503 (2)", - "tab": "Accuracy", - "Electrical Engineering - Observed inference time (s)": { - "description": "min=0.24, mean=0.24, max=0.24, sum=0.481 (2)", - "tab": "Efficiency", - "score": 0.24032716751098632 - }, - "Electrical Engineering - # eval": { - "description": "min=145, mean=145, max=145, sum=290 (2)", - "tab": "General information", - "score": 145.0 - }, - "Electrical Engineering - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Electrical Engineering - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Electrical Engineering - # prompt tokens": { - "description": "min=463.8, mean=463.8, max=463.8, sum=927.6 (2)", - "tab": "General information", - "score": 463.8 - }, - "Electrical Engineering - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" - } - } - }, - { - "evaluation_name": "Elementary Mathematics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Elementary Mathematics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.669, - "details": { - "description": "min=0.669, mean=0.669, max=0.669, sum=1.339 (2)", - "tab": "Accuracy", - "Elementary Mathematics - Observed inference time (s)": { - "description": "min=0.286, mean=0.286, max=0.286, sum=0.571 (2)", - "tab": "Efficiency", - "score": 0.28569977939444247 - }, - "Elementary Mathematics - # eval": { - "description": "min=378, mean=378, max=378, sum=756 (2)", - "tab": "General information", - "score": 378.0 - }, - "Elementary Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Elementary Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Elementary Mathematics - # prompt tokens": { - "description": "min=577.119, mean=577.119, max=577.119, sum=1154.238 (2)", - "tab": "General information", - "score": 577.1190476190476 - }, - "Elementary Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" - } - } - }, - { - "evaluation_name": "Formal Logic", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", 
- "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Formal Logic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.643, - "details": { - "description": "min=0.643, mean=0.643, max=0.643, sum=1.286 (2)", - "tab": "Accuracy", - "Formal Logic - Observed inference time (s)": { - "description": "min=0.284, mean=0.284, max=0.284, sum=0.567 (2)", - "tab": "Efficiency", - "score": 0.2836597722674173 - }, - "Formal Logic - # eval": { - "description": "min=126, mean=126, max=126, sum=252 (2)", - "tab": "General information", - "score": 126.0 - }, - "Formal Logic - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Formal Logic - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Formal Logic - # prompt tokens": { - "description": "min=604.667, mean=604.667, max=604.667, sum=1209.333 (2)", - "tab": "General information", - "score": 604.6666666666666 - }, - "Formal Logic - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" - } - } - }, - { - "evaluation_name": "High School World History", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on High School World History", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.903, - "details": { - "description": "min=0.903, mean=0.903, max=0.903, sum=1.806 (2)", - "tab": "Accuracy", - "High School Biology - Observed inference time (s)": { - "description": "min=0.283, mean=0.283, max=0.283, sum=0.566 (2)", - "tab": "Efficiency", - "score": 0.2828109118246263 - }, - "High School Chemistry - Observed inference time (s)": { - "description": "min=0.293, mean=0.293, max=0.293, sum=0.586 (2)", - "tab": "Efficiency", - "score": 0.29298263935032737 - }, - "High School Computer Science - Observed inference time (s)": { - "description": "min=0.307, mean=0.307, max=0.307, sum=0.615 (2)", - "tab": "Efficiency", - "score": 0.30738641500473024 - }, - "High School European History - Observed inference time (s)": { - "description": "min=0.593, mean=0.593, max=0.593, sum=1.186 (2)", - "tab": "Efficiency", - "score": 0.5927927941987009 - }, - "High School Geography - Observed inference time (s)": { - "description": "min=0.277, mean=0.277, max=0.277, sum=0.553 (2)", - "tab": "Efficiency", - "score": 0.2765737639533149 - }, - "High School Government And Politics - Observed inference time (s)": { - "description": "min=0.253, mean=0.253, max=0.253, sum=0.505 (2)", - "tab": "Efficiency", - "score": 0.2526841929539498 - }, - "High School Macroeconomics - Observed inference time (s)": { - "description": "min=0.264, mean=0.264, max=0.264, sum=0.527 (2)", - "tab": "Efficiency", - "score": 0.2636140242601052 - }, - "High School Mathematics - Observed inference time (s)": { - 
"description": "min=0.289, mean=0.289, max=0.289, sum=0.578 (2)", - "tab": "Efficiency", - "score": 0.28875163837715434 - }, - "High School Microeconomics - Observed inference time (s)": { - "description": "min=0.254, mean=0.254, max=0.254, sum=0.508 (2)", - "tab": "Efficiency", - "score": 0.2539960216073429 - }, - "High School Physics - Observed inference time (s)": { - "description": "min=0.281, mean=0.281, max=0.281, sum=0.562 (2)", - "tab": "Efficiency", - "score": 0.28084811943256305 - }, - "High School Psychology - Observed inference time (s)": { - "description": "min=0.244, mean=0.244, max=0.244, sum=0.489 (2)", - "tab": "Efficiency", - "score": 0.24437280532416947 - }, - "High School Statistics - Observed inference time (s)": { - "description": "min=0.34, mean=0.34, max=0.34, sum=0.679 (2)", - "tab": "Efficiency", - "score": 0.3396394296928688 - }, - "High School US History - Observed inference time (s)": { - "description": "min=0.416, mean=0.416, max=0.416, sum=0.832 (2)", - "tab": "Efficiency", - "score": 0.4159782189948886 - }, - "High School World History - Observed inference time (s)": { - "description": "min=0.751, mean=0.751, max=0.751, sum=1.501 (2)", - "tab": "Efficiency", - "score": 0.7505324741959069 - }, - "High School Biology - # eval": { - "description": "min=310, mean=310, max=310, sum=620 (2)", - "tab": "General information", - "score": 310.0 - }, - "High School Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Biology - # prompt tokens": { - "description": "min=513.916, mean=513.916, max=513.916, sum=1027.832 (2)", - "tab": "General information", - "score": 513.916129032258 - }, - "High School Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Chemistry - # eval": { - "description": "min=203, mean=203, max=203, sum=406 (2)", - "tab": "General information", - "score": 203.0 - }, - "High School Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # prompt tokens": { - "description": "min=517.261, mean=517.261, max=517.261, sum=1034.522 (2)", - "tab": "General information", - "score": 517.2610837438424 - }, - "High School Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "High School Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # prompt tokens": { - "description": "min=878.46, mean=878.46, max=878.46, sum=1756.92 (2)", - "tab": "General information", - "score": 878.46 - }, - "High School Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - 
"tab": "General information", - "score": 1.0 - }, - "High School European History - # eval": { - "description": "min=165, mean=165, max=165, sum=330 (2)", - "tab": "General information", - "score": 165.0 - }, - "High School European History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School European History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School European History - # prompt tokens": { - "description": "min=2814.903, mean=2814.903, max=2814.903, sum=5629.806 (2)", - "tab": "General information", - "score": 2814.9030303030304 - }, - "High School European History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Geography - # eval": { - "description": "min=198, mean=198, max=198, sum=396 (2)", - "tab": "General information", - "score": 198.0 - }, - "High School Geography - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Geography - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Geography - # prompt tokens": { - "description": "min=372.217, mean=372.217, max=372.217, sum=744.434 (2)", - "tab": "General information", - "score": 372.2171717171717 - }, - "High School Geography - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Government And Politics - # eval": { - "description": "min=193, mean=193, max=193, sum=386 (2)", - "tab": "General information", - "score": 193.0 - }, - "High School Government And Politics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Government And Politics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # prompt tokens": { - "description": "min=467.311, mean=467.311, max=467.311, sum=934.622 (2)", - "tab": "General information", - "score": 467.31088082901556 - }, - "High School Government And Politics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Macroeconomics - # eval": { - "description": "min=390, mean=390, max=390, sum=780 (2)", - "tab": "General information", - "score": 390.0 - }, - "High School Macroeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Macroeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - # prompt tokens": { - "description": "min=374.349, mean=374.349, max=374.349, sum=748.697 (2)", - "tab": "General information", - "score": 374.34871794871793 - }, - "High School Macroeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Mathematics - # eval": { - "description": "min=270, mean=270, max=270, sum=540 (2)", - "tab": "General information", - "score": 270.0 - }, - "High School Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 
(2)", - "tab": "General information", - "score": 5.0 - }, - "High School Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # prompt tokens": { - "description": "min=565.326, mean=565.326, max=565.326, sum=1130.652 (2)", - "tab": "General information", - "score": 565.325925925926 - }, - "High School Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Microeconomics - # eval": { - "description": "min=238, mean=238, max=238, sum=476 (2)", - "tab": "General information", - "score": 238.0 - }, - "High School Microeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Microeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Microeconomics - # prompt tokens": { - "description": "min=402.277, mean=402.277, max=402.277, sum=804.555 (2)", - "tab": "General information", - "score": 402.2773109243698 - }, - "High School Microeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Physics - # eval": { - "description": "min=151, mean=151, max=151, sum=302 (2)", - "tab": "General information", - "score": 151.0 - }, - "High School Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # prompt tokens": { - "description": "min=580.536, mean=580.536, max=580.536, sum=1161.073 (2)", - "tab": "General information", - "score": 580.5364238410596 - }, - "High School Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Psychology - # eval": { - "description": "min=545, mean=545, max=545, sum=1090 (2)", - "tab": "General information", - "score": 545.0 - }, - "High School Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # prompt tokens": { - "description": "min=495.521, mean=495.521, max=495.521, sum=991.042 (2)", - "tab": "General information", - "score": 495.52110091743117 - }, - "High School Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Statistics - # eval": { - "description": "min=216, mean=216, max=216, sum=432 (2)", - "tab": "General information", - "score": 216.0 - }, - "High School Statistics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Statistics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Statistics - # prompt tokens": { - "description": "min=830.477, mean=830.477, max=830.477, sum=1660.954 (2)", - "tab": "General information", - "score": 830.4768518518518 - }, - 
"High School Statistics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School US History - # eval": { - "description": "min=204, mean=204, max=204, sum=408 (2)", - "tab": "General information", - "score": 204.0 - }, - "High School US History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School US History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # prompt tokens": { - "description": "min=2237.176, mean=2237.176, max=2237.176, sum=4474.353 (2)", - "tab": "General information", - "score": 2237.176470588235 - }, - "High School US History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School World History - # eval": { - "description": "min=237, mean=237, max=237, sum=474 (2)", - "tab": "General information", - "score": 237.0 - }, - "High School World History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School World History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School World History - # prompt tokens": { - "description": "min=1448.354, mean=1448.354, max=1448.354, sum=2896.709 (2)", - "tab": "General information", - "score": 1448.3544303797469 - }, - "High School World History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" - } - } - }, - { - "evaluation_name": "Human Sexuality", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Human Sexuality", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.855, - "details": { - "description": "min=0.855, mean=0.855, max=0.855, sum=1.71 (2)", - "tab": "Accuracy", - "Human Aging - Observed inference time (s)": { - "description": "min=0.245, mean=0.245, max=0.245, sum=0.49 (2)", - "tab": "Efficiency", - "score": 0.24486422538757324 - }, - "Human Sexuality - Observed inference time (s)": { - "description": "min=0.254, mean=0.254, max=0.254, sum=0.508 (2)", - "tab": "Efficiency", - "score": 0.25416288121056013 - }, - "Human Aging - # eval": { - "description": "min=223, mean=223, max=223, sum=446 (2)", - "tab": "General information", - "score": 223.0 - }, - "Human Aging - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Aging - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Aging - # prompt tokens": { - "description": "min=322.121, mean=322.121, max=322.121, sum=644.242 (2)", - "tab": "General information", - "score": 322.1210762331838 - }, - "Human Aging - # output tokens": { - "description": "min=1, mean=1, 
max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Human Sexuality - # eval": { - "description": "min=131, mean=131, max=131, sum=262 (2)", - "tab": "General information", - "score": 131.0 - }, - "Human Sexuality - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Sexuality - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # prompt tokens": { - "description": "min=341.504, mean=341.504, max=341.504, sum=683.008 (2)", - "tab": "General information", - "score": 341.5038167938931 - }, - "Human Sexuality - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" - } - } - }, - { - "evaluation_name": "International Law", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on International Law", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.876, - "details": { - "description": "min=0.876, mean=0.876, max=0.876, sum=1.752 (2)", - "tab": "Accuracy", - "International Law - Observed inference time (s)": { - "description": "min=0.277, mean=0.277, max=0.277, sum=0.555 (2)", - "tab": "Efficiency", - "score": 0.2773902613269396 - }, - "International Law - # eval": { - "description": "min=121, mean=121, max=121, sum=242 (2)", - "tab": "General information", - "score": 121.0 - }, - "International Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "International Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "International Law - # prompt tokens": { - "description": "min=640.579, mean=640.579, max=640.579, sum=1281.157 (2)", - "tab": "General information", - "score": 640.5785123966942 - }, - "International Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" - } - } - }, - { - "evaluation_name": "Logical Fallacies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Logical Fallacies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.828, - "details": { - "description": "min=0.828, mean=0.828, max=0.828, sum=1.656 (2)", - "tab": "Accuracy", - "Logical Fallacies - Observed inference time (s)": { - "description": "min=0.248, mean=0.248, max=0.248, sum=0.496 (2)", - "tab": "Efficiency", - "score": 0.24794307661934134 - }, - "Logical Fallacies - # eval": { - 
"description": "min=163, mean=163, max=163, sum=326 (2)", - "tab": "General information", - "score": 163.0 - }, - "Logical Fallacies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Logical Fallacies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Logical Fallacies - # prompt tokens": { - "description": "min=449.632, mean=449.632, max=449.632, sum=899.264 (2)", - "tab": "General information", - "score": 449.6319018404908 - }, - "Logical Fallacies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" - } - } - }, - { - "evaluation_name": "Machine Learning", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Machine Learning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.634, - "details": { - "description": "min=0.634, mean=0.634, max=0.634, sum=1.268 (2)", - "tab": "Accuracy", - "Machine Learning - Observed inference time (s)": { - "description": "min=0.284, mean=0.284, max=0.284, sum=0.567 (2)", - "tab": "Efficiency", - "score": 0.2835228868893215 - }, - "Machine Learning - # eval": { - "description": "min=112, mean=112, max=112, sum=224 (2)", - "tab": "General information", - "score": 112.0 - }, - "Machine Learning - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Machine Learning - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Machine Learning - # prompt tokens": { - "description": "min=681.848, mean=681.848, max=681.848, sum=1363.696 (2)", - "tab": "General information", - "score": 681.8482142857143 - }, - "Machine Learning - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" - } - } - }, - { - "evaluation_name": "Management", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Management", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.835, - "details": { - "description": "min=0.835, mean=0.835, max=0.835, sum=1.67 (2)", - "tab": "Accuracy", - "Management - Observed inference time (s)": { - "description": "min=0.28, mean=0.28, max=0.28, sum=0.56 (2)", - "tab": "Efficiency", - "score": 0.28018068804324253 - }, - "Management - # eval": { - "description": "min=103, mean=103, max=103, sum=206 (2)", - "tab": "General information", - "score": 103.0 - }, - "Management - # train": { - 
"description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Management - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Management - # prompt tokens": { - "description": "min=283.854, mean=283.854, max=283.854, sum=567.709 (2)", - "tab": "General information", - "score": 283.8543689320388 - }, - "Management - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" - } - } - }, - { - "evaluation_name": "Marketing", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Marketing", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.919, - "details": { - "description": "min=0.919, mean=0.919, max=0.919, sum=1.838 (2)", - "tab": "Accuracy", - "Marketing - Observed inference time (s)": { - "description": "min=0.254, mean=0.254, max=0.254, sum=0.509 (2)", - "tab": "Efficiency", - "score": 0.2544598365441347 - }, - "Marketing - # eval": { - "description": "min=234, mean=234, max=234, sum=468 (2)", - "tab": "General information", - "score": 234.0 - }, - "Marketing - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Marketing - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Marketing - # prompt tokens": { - "description": "min=404.415, mean=404.415, max=404.415, sum=808.829 (2)", - "tab": "General information", - "score": 404.4145299145299 - }, - "Marketing - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" - } - } - }, - { - "evaluation_name": "Medical Genetics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Medical Genetics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.85, - "details": { - "description": "min=0.85, mean=0.85, max=0.85, sum=1.7 (2)", - "tab": "Accuracy", - "Medical Genetics - Observed inference time (s)": { - "description": "min=0.27, mean=0.27, max=0.27, sum=0.541 (2)", - "tab": "Efficiency", - "score": 0.27034429311752317 - }, - "Medical Genetics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Medical Genetics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Medical Genetics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 
0.0 - }, - "Medical Genetics - # prompt tokens": { - "description": "min=342.35, mean=342.35, max=342.35, sum=684.7 (2)", - "tab": "General information", - "score": 342.35 - }, - "Medical Genetics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" - } - } - }, - { - "evaluation_name": "Miscellaneous", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Miscellaneous", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.934, - "details": { - "description": "min=0.934, mean=0.934, max=0.934, sum=1.867 (2)", - "tab": "Accuracy", - "Miscellaneous - Observed inference time (s)": { - "description": "min=0.246, mean=0.246, max=0.246, sum=0.492 (2)", - "tab": "Efficiency", - "score": 0.24603491085242493 - }, - "Miscellaneous - # eval": { - "description": "min=783, mean=783, max=783, sum=1566 (2)", - "tab": "General information", - "score": 783.0 - }, - "Miscellaneous - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Miscellaneous - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Miscellaneous - # prompt tokens": { - "description": "min=303.7, mean=303.7, max=303.7, sum=607.4 (2)", - "tab": "General information", - "score": 303.6998722860792 - }, - "Miscellaneous - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" - } - } - }, - { - "evaluation_name": "Moral Scenarios", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Moral Scenarios", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.783, - "details": { - "description": "min=0.783, mean=0.783, max=0.783, sum=1.566 (2)", - "tab": "Accuracy", - "Moral Disputes - Observed inference time (s)": { - "description": "min=0.256, mean=0.256, max=0.256, sum=0.513 (2)", - "tab": "Efficiency", - "score": 0.2563680651559995 - }, - "Moral Scenarios - Observed inference time (s)": { - "description": "min=0.257, mean=0.257, max=0.257, sum=0.514 (2)", - "tab": "Efficiency", - "score": 0.25722797329865354 - }, - "Moral Disputes - # eval": { - "description": "min=346, mean=346, max=346, sum=692 (2)", - "tab": "General information", - "score": 346.0 - }, - "Moral Disputes - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Disputes - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, 
- "Moral Disputes - # prompt tokens": { - "description": "min=476.182, mean=476.182, max=476.182, sum=952.364 (2)", - "tab": "General information", - "score": 476.1820809248555 - }, - "Moral Disputes - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Moral Scenarios - # eval": { - "description": "min=895, mean=895, max=895, sum=1790 (2)", - "tab": "General information", - "score": 895.0 - }, - "Moral Scenarios - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Scenarios - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # prompt tokens": { - "description": "min=668.494, mean=668.494, max=668.494, sum=1336.988 (2)", - "tab": "General information", - "score": 668.4938547486033 - }, - "Moral Scenarios - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" - } - } - }, - { - "evaluation_name": "Nutrition", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Nutrition", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.804, - "details": { - "description": "min=0.804, mean=0.804, max=0.804, sum=1.608 (2)", - "tab": "Accuracy", - "Nutrition - Observed inference time (s)": { - "description": "min=0.271, mean=0.271, max=0.271, sum=0.542 (2)", - "tab": "Efficiency", - "score": 0.27095749721028445 - }, - "Nutrition - # eval": { - "description": "min=306, mean=306, max=306, sum=612 (2)", - "tab": "General information", - "score": 306.0 - }, - "Nutrition - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Nutrition - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Nutrition - # prompt tokens": { - "description": "min=599.637, mean=599.637, max=599.637, sum=1199.275 (2)", - "tab": "General information", - "score": 599.6372549019608 - }, - "Nutrition - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" - } - } - }, - { - "evaluation_name": "Prehistory", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Prehistory", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.867, - "details": { - "description": "min=0.867, mean=0.867, max=0.867, sum=1.735 (2)", - "tab": "Accuracy", - "Prehistory - Observed inference time (s)": { - "description": 
"min=0.242, mean=0.242, max=0.242, sum=0.483 (2)", - "tab": "Efficiency", - "score": 0.2415844319779196 - }, - "Prehistory - # eval": { - "description": "min=324, mean=324, max=324, sum=648 (2)", - "tab": "General information", - "score": 324.0 - }, - "Prehistory - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Prehistory - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Prehistory - # prompt tokens": { - "description": "min=528.364, mean=528.364, max=528.364, sum=1056.728 (2)", - "tab": "General information", - "score": 528.3641975308642 - }, - "Prehistory - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" - } - } - }, - { - "evaluation_name": "Public Relations", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Public Relations", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.773, - "details": { - "description": "min=0.773, mean=0.773, max=0.773, sum=1.545 (2)", - "tab": "Accuracy", - "Public Relations - Observed inference time (s)": { - "description": "min=0.25, mean=0.25, max=0.25, sum=0.5 (2)", - "tab": "Efficiency", - "score": 0.2501691276376898 - }, - "Public Relations - # eval": { - "description": "min=110, mean=110, max=110, sum=220 (2)", - "tab": "General information", - "score": 110.0 - }, - "Public Relations - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Public Relations - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Public Relations - # prompt tokens": { - "description": "min=408.427, mean=408.427, max=408.427, sum=816.855 (2)", - "tab": "General information", - "score": 408.42727272727274 - }, - "Public Relations - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" - } - } - }, - { - "evaluation_name": "Security Studies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Security Studies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.735, - "details": { - "description": "min=0.735, mean=0.735, max=0.735, sum=1.469 (2)", - "tab": "Accuracy", - "Security Studies - Observed inference time (s)": { - "description": "min=0.283, mean=0.283, max=0.283, sum=0.565 (2)", - "tab": "Efficiency", - "score": 0.28266452769843897 - }, - "Security Studies - # eval": { - "description": "min=245, 
mean=245, max=245, sum=490 (2)", - "tab": "General information", - "score": 245.0 - }, - "Security Studies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Security Studies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Security Studies - # prompt tokens": { - "description": "min=1166.931, mean=1166.931, max=1166.931, sum=2333.861 (2)", - "tab": "General information", - "score": 1166.930612244898 - }, - "Security Studies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" - } - } - }, - { - "evaluation_name": "Sociology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Sociology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.866, - "details": { - "description": "min=0.866, mean=0.866, max=0.866, sum=1.731 (2)", - "tab": "Accuracy", - "Sociology - Observed inference time (s)": { - "description": "min=0.258, mean=0.258, max=0.258, sum=0.516 (2)", - "tab": "Efficiency", - "score": 0.258230237818476 - }, - "Sociology - # eval": { - "description": "min=201, mean=201, max=201, sum=402 (2)", - "tab": "General information", - "score": 201.0 - }, - "Sociology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Sociology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Sociology - # prompt tokens": { - "description": "min=450.1, mean=450.1, max=450.1, sum=900.199 (2)", - "tab": "General information", - "score": 450.0995024875622 - }, - "Sociology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" - } - } - }, - { - "evaluation_name": "Virology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Virology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.542, - "details": { - "description": "min=0.542, mean=0.542, max=0.542, sum=1.084 (2)", - "tab": "Accuracy", - "Virology - Observed inference time (s)": { - "description": "min=0.248, mean=0.248, max=0.248, sum=0.495 (2)", - "tab": "Efficiency", - "score": 0.24754508719386825 - }, - "Virology - # eval": { - "description": "min=166, mean=166, max=166, sum=332 (2)", - "tab": "General information", - "score": 166.0 - }, - "Virology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Virology 
- truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Virology - # prompt tokens": { - "description": "min=343.819, mean=343.819, max=343.819, sum=687.639 (2)", - "tab": "General information", - "score": 343.8192771084337 - }, - "Virology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" - } - } - }, - { - "evaluation_name": "World Religions", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on World Religions", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.871, - "details": { - "description": "min=0.871, mean=0.871, max=0.871, sum=1.743 (2)", - "tab": "Accuracy", - "World Religions - Observed inference time (s)": { - "description": "min=0.235, mean=0.235, max=0.235, sum=0.471 (2)", - "tab": "Efficiency", - "score": 0.23539779897321733 - }, - "World Religions - # eval": { - "description": "min=171, mean=171, max=171, sum=342 (2)", - "tab": "General information", - "score": 171.0 - }, - "World Religions - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "World Religions - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "World Religions - # prompt tokens": { - "description": "min=276.07, mean=276.07, max=276.07, sum=552.14 (2)", - "tab": "General information", - "score": 276.0701754385965 - }, - "World Religions - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" - } - } - }, - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.875, - "details": { - "tab": "Efficiency" - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_mmlu/qwen/qwen1.5-14b/fa6a6772-671b-402e-9480-d61e0fb4a61e.json b/data/helm_mmlu/qwen/qwen1.5-14b/fa6a6772-671b-402e-9480-d61e0fb4a61e.json deleted file mode 100644 index 9bfc87f91..000000000 --- a/data/helm_mmlu/qwen/qwen1.5-14b/fa6a6772-671b-402e-9480-d61e0fb4a61e.json +++ /dev/null @@ -1,3021 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/qwen_qwen1.5-14b/1770835937.459157", - "retrieved_timestamp": "1770835937.459157", - "source_metadata": { - "source_name": "helm_mmlu", - "source_type": "documentation", - 
"source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen1.5 14B", - "id": "qwen/qwen1.5-14b", - "developer": "qwen", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "MMLU All Subjects", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU All Subjects", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.686, - "details": { - "description": "min=0.368, mean=0.686, max=0.893, sum=78.254 (114)", - "tab": "Accuracy", - "MMLU All Subjects - Observed inference time (s)": { - "description": "min=0.282, mean=0.321, max=0.549, sum=36.618 (114)", - "tab": "Efficiency", - "score": 0.3212107113231387 - }, - "MMLU All Subjects - # eval": { - "description": "min=100, mean=246.351, max=1534, sum=28084 (114)", - "tab": "General information", - "score": 246.35087719298247 - }, - "MMLU All Subjects - # train": { - "description": "min=5, mean=5, max=5, sum=570 (114)", - "tab": "General information", - "score": 5.0 - }, - "MMLU All Subjects - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (114)", - "tab": "General information", - "score": 0.0 - }, - "MMLU All Subjects - # prompt tokens": { - "description": "min=269.07, mean=618.598, max=2807.903, sum=70520.198 (114)", - "tab": "General information", - "score": 618.5982315160392 - }, - "MMLU All Subjects - # output tokens": { - "description": "min=1, mean=1, max=1, sum=114 (114)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - 
"mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] - } - } - }, - { - "evaluation_name": "Abstract Algebra", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Abstract Algebra", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4, - "details": { - "description": "min=0.4, mean=0.4, max=0.4, sum=0.8 (2)", - "tab": "Accuracy", - "Abstract Algebra - Observed inference time (s)": { - "description": "min=0.285, mean=0.285, max=0.285, sum=0.569 (2)", - "tab": "Efficiency", - "score": 0.28459527969360354 - }, - "Abstract Algebra - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Abstract Algebra - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Abstract Algebra - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Abstract Algebra - # prompt tokens": { - "description": "min=371.19, mean=371.19, max=371.19, sum=742.38 (2)", - "tab": "General information", - "score": 371.19 - }, - "Abstract Algebra - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" - } - } - }, - { - "evaluation_name": "Anatomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Anatomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.637, - "details": { - "description": "min=0.637, 
mean=0.637, max=0.637, sum=1.274 (2)", - "tab": "Accuracy", - "Anatomy - Observed inference time (s)": { - "description": "min=0.332, mean=0.332, max=0.332, sum=0.663 (2)", - "tab": "Efficiency", - "score": 0.33150761745594165 - }, - "Anatomy - # eval": { - "description": "min=135, mean=135, max=135, sum=270 (2)", - "tab": "General information", - "score": 135.0 - }, - "Anatomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Anatomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Anatomy - # prompt tokens": { - "description": "min=346.978, mean=346.978, max=346.978, sum=693.956 (2)", - "tab": "General information", - "score": 346.97777777777776 - }, - "Anatomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" - } - } - }, - { - "evaluation_name": "College Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on College Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.48, - "details": { - "description": "min=0.48, mean=0.48, max=0.48, sum=0.961 (2)", - "tab": "Accuracy", - "College Chemistry - Observed inference time (s)": { - "description": "min=0.335, mean=0.335, max=0.335, sum=0.67 (2)", - "tab": "Efficiency", - "score": 0.33498176813125613 - }, - "College Biology - Observed inference time (s)": { - "description": "min=0.295, mean=0.295, max=0.295, sum=0.589 (2)", - "tab": "Efficiency", - "score": 0.2946729362010956 - }, - "College Computer Science - Observed inference time (s)": { - "description": "min=0.336, mean=0.336, max=0.336, sum=0.673 (2)", - "tab": "Efficiency", - "score": 0.3364031720161438 - }, - "College Mathematics - Observed inference time (s)": { - "description": "min=0.324, mean=0.324, max=0.324, sum=0.648 (2)", - "tab": "Efficiency", - "score": 0.3238637447357178 - }, - "College Medicine - Observed inference time (s)": { - "description": "min=0.306, mean=0.306, max=0.306, sum=0.611 (2)", - "tab": "Efficiency", - "score": 0.3055199033263102 - }, - "College Physics - Observed inference time (s)": { - "description": "min=0.311, mean=0.311, max=0.311, sum=0.622 (2)", - "tab": "Efficiency", - "score": 0.31105106250912534 - }, - "College Chemistry - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Chemistry - # prompt tokens": { - "description": "min=561.25, mean=561.25, max=561.25, sum=1122.5 (2)", - "tab": "General information", - "score": 561.25 - }, - "College Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Biology 
- # eval": { - "description": "min=144, mean=144, max=144, sum=288 (2)", - "tab": "General information", - "score": 144.0 - }, - "College Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # prompt tokens": { - "description": "min=479.979, mean=479.979, max=479.979, sum=959.958 (2)", - "tab": "General information", - "score": 479.9791666666667 - }, - "College Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer Science - # prompt tokens": { - "description": "min=831.58, mean=831.58, max=831.58, sum=1663.16 (2)", - "tab": "General information", - "score": 831.58 - }, - "College Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Mathematics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # prompt tokens": { - "description": "min=600.7, mean=600.7, max=600.7, sum=1201.4 (2)", - "tab": "General information", - "score": 600.7 - }, - "College Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Medicine - # eval": { - "description": "min=173, mean=173, max=173, sum=346 (2)", - "tab": "General information", - "score": 173.0 - }, - "College Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Medicine - # prompt tokens": { - "description": "min=499.098, mean=499.098, max=499.098, sum=998.197 (2)", - "tab": "General information", - "score": 499.0982658959538 - }, - "College Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Physics - # eval": { - "description": "min=102, mean=102, max=102, sum=204 (2)", - "tab": "General information", - "score": 102.0 - }, - "College Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Physics - # prompt tokens": { - "description": "min=509.265, mean=509.265, max=509.265, 
sum=1018.529 (2)", - "tab": "General information", - "score": 509.2647058823529 - }, - "College Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" - } - } - }, - { - "evaluation_name": "Computer Security", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Computer Security", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.84, - "details": { - "description": "min=0.84, mean=0.84, max=0.84, sum=1.68 (2)", - "tab": "Accuracy", - "Computer Security - Observed inference time (s)": { - "description": "min=0.299, mean=0.299, max=0.299, sum=0.598 (2)", - "tab": "Efficiency", - "score": 0.2989851474761963 - }, - "Computer Security - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Computer Security - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Computer Security - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Computer Security - # prompt tokens": { - "description": "min=379.64, mean=379.64, max=379.64, sum=759.28 (2)", - "tab": "General information", - "score": 379.64 - }, - "Computer Security - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" - } - } - }, - { - "evaluation_name": "Econometrics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Econometrics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.561, - "details": { - "description": "min=0.561, mean=0.561, max=0.561, sum=1.123 (2)", - "tab": "Accuracy", - "Econometrics - Observed inference time (s)": { - "description": "min=0.312, mean=0.312, max=0.312, sum=0.624 (2)", - "tab": "Efficiency", - "score": 0.3118862185561866 - }, - "Econometrics - # eval": { - "description": "min=114, mean=114, max=114, sum=228 (2)", - "tab": "General information", - "score": 114.0 - }, - "Econometrics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Econometrics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Econometrics - # prompt tokens": { - "description": "min=620.939, mean=620.939, max=620.939, sum=1241.877 (2)", - "tab": "General information", - "score": 620.938596491228 - }, - "Econometrics - # output tokens": { - "description": "min=1, mean=1, max=1, 
sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" - } - } - }, - { - "evaluation_name": "Global Facts", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Global Facts", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.49, - "details": { - "description": "min=0.49, mean=0.49, max=0.49, sum=0.98 (2)", - "tab": "Accuracy", - "Global Facts - Observed inference time (s)": { - "description": "min=0.306, mean=0.306, max=0.306, sum=0.611 (2)", - "tab": "Efficiency", - "score": 0.30553135871887205 - }, - "Global Facts - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Global Facts - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Global Facts - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Global Facts - # prompt tokens": { - "description": "min=422.06, mean=422.06, max=422.06, sum=844.12 (2)", - "tab": "General information", - "score": 422.06 - }, - "Global Facts - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" - } - } - }, - { - "evaluation_name": "Jurisprudence", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Jurisprudence", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.769, - "details": { - "description": "min=0.769, mean=0.769, max=0.769, sum=1.537 (2)", - "tab": "Accuracy", - "Jurisprudence - Observed inference time (s)": { - "description": "min=0.309, mean=0.309, max=0.309, sum=0.618 (2)", - "tab": "Efficiency", - "score": 0.3092155566921941 - }, - "Jurisprudence - # eval": { - "description": "min=108, mean=108, max=108, sum=216 (2)", - "tab": "General information", - "score": 108.0 - }, - "Jurisprudence - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Jurisprudence - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Jurisprudence - # prompt tokens": { - "description": "min=387.713, mean=387.713, max=387.713, sum=775.426 (2)", - "tab": "General information", - "score": 387.712962962963 - }, - "Jurisprudence - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - 
"groups": "mmlu_jurisprudence" - } - } - }, - { - "evaluation_name": "Philosophy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Philosophy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.717, - "details": { - "description": "min=0.717, mean=0.717, max=0.717, sum=1.434 (2)", - "tab": "Accuracy", - "Philosophy - Observed inference time (s)": { - "description": "min=0.311, mean=0.311, max=0.311, sum=0.622 (2)", - "tab": "Efficiency", - "score": 0.3108927659283114 - }, - "Philosophy - # eval": { - "description": "min=311, mean=311, max=311, sum=622 (2)", - "tab": "General information", - "score": 311.0 - }, - "Philosophy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Philosophy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Philosophy - # prompt tokens": { - "description": "min=322.09, mean=322.09, max=322.09, sum=644.18 (2)", - "tab": "General information", - "score": 322.09003215434086 - }, - "Philosophy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" - } - } - }, - { - "evaluation_name": "Professional Psychology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Professional Psychology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.699, - "details": { - "description": "min=0.699, mean=0.699, max=0.699, sum=1.399 (2)", - "tab": "Accuracy", - "Professional Medicine - Observed inference time (s)": { - "description": "min=0.475, mean=0.475, max=0.475, sum=0.951 (2)", - "tab": "Efficiency", - "score": 0.47532147870344277 - }, - "Professional Accounting - Observed inference time (s)": { - "description": "min=0.319, mean=0.319, max=0.319, sum=0.638 (2)", - "tab": "Efficiency", - "score": 0.31895153404127624 - }, - "Professional Law - Observed inference time (s)": { - "description": "min=0.4, mean=0.4, max=0.4, sum=0.8 (2)", - "tab": "Efficiency", - "score": 0.4000247932941382 - }, - "Professional Psychology - Observed inference time (s)": { - "description": "min=0.301, mean=0.301, max=0.301, sum=0.602 (2)", - "tab": "Efficiency", - "score": 0.3012406826019287 - }, - "Professional Medicine - # eval": { - "description": "min=272, mean=272, max=272, sum=544 (2)", - "tab": "General information", - "score": 272.0 - }, - "Professional Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Medicine - # prompt tokens": { - "description": "min=1118.199, mean=1118.199, 
max=1118.199, sum=2236.397 (2)", - "tab": "General information", - "score": 1118.1985294117646 - }, - "Professional Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Accounting - # eval": { - "description": "min=282, mean=282, max=282, sum=564 (2)", - "tab": "General information", - "score": 282.0 - }, - "Professional Accounting - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Accounting - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Accounting - # prompt tokens": { - "description": "min=732.34, mean=732.34, max=732.34, sum=1464.681 (2)", - "tab": "General information", - "score": 732.3404255319149 - }, - "Professional Accounting - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Law - # eval": { - "description": "min=1534, mean=1534, max=1534, sum=3068 (2)", - "tab": "General information", - "score": 1534.0 - }, - "Professional Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # prompt tokens": { - "description": "min=1656.969, mean=1656.969, max=1656.969, sum=3313.939 (2)", - "tab": "General information", - "score": 1656.9693611473272 - }, - "Professional Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Psychology - # eval": { - "description": "min=612, mean=612, max=612, sum=1224 (2)", - "tab": "General information", - "score": 612.0 - }, - "Professional Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # prompt tokens": { - "description": "min=574.417, mean=574.417, max=574.417, sum=1148.833 (2)", - "tab": "General information", - "score": 574.4166666666666 - }, - "Professional Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" - } - } - }, - { - "evaluation_name": "Us Foreign Policy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Us Foreign Policy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.87, - "details": { - "description": "min=0.87, mean=0.87, max=0.87, sum=1.74 (2)", - "tab": "Accuracy", - "Us Foreign Policy - Observed inference time (s)": { - "description": "min=0.319, mean=0.319, max=0.319, sum=0.638 (2)", - "tab": "Efficiency", - "score": 
0.31888857364654544 - }, - "Us Foreign Policy - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Us Foreign Policy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Us Foreign Policy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Us Foreign Policy - # prompt tokens": { - "description": "min=421.16, mean=421.16, max=421.16, sum=842.32 (2)", - "tab": "General information", - "score": 421.16 - }, - "Us Foreign Policy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" - } - } - }, - { - "evaluation_name": "Astronomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Astronomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.724, - "details": { - "description": "min=0.724, mean=0.724, max=0.724, sum=1.447 (2)", - "tab": "Accuracy", - "Astronomy - Observed inference time (s)": { - "description": "min=0.295, mean=0.295, max=0.295, sum=0.589 (2)", - "tab": "Efficiency", - "score": 0.29459338125429657 - }, - "Astronomy - # eval": { - "description": "min=152, mean=152, max=152, sum=304 (2)", - "tab": "General information", - "score": 152.0 - }, - "Astronomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Astronomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Astronomy - # prompt tokens": { - "description": "min=582.849, mean=582.849, max=582.849, sum=1165.697 (2)", - "tab": "General information", - "score": 582.8486842105264 - }, - "Astronomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" - } - } - }, - { - "evaluation_name": "Business Ethics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Business Ethics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.75, - "details": { - "description": "min=0.75, mean=0.75, max=0.75, sum=1.5 (2)", - "tab": "Accuracy", - "Business Ethics - Observed inference time (s)": { - "description": "min=0.323, mean=0.323, max=0.323, sum=0.647 (2)", - "tab": "Efficiency", - "score": 0.32330512285232543 - }, - "Business Ethics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Business Ethics - # train": { - 
"description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Business Ethics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Business Ethics - # prompt tokens": { - "description": "min=562.87, mean=562.87, max=562.87, sum=1125.74 (2)", - "tab": "General information", - "score": 562.87 - }, - "Business Ethics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" - } - } - }, - { - "evaluation_name": "Clinical Knowledge", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Clinical Knowledge", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.736, - "details": { - "description": "min=0.736, mean=0.736, max=0.736, sum=1.472 (2)", - "tab": "Accuracy", - "Clinical Knowledge - Observed inference time (s)": { - "description": "min=0.299, mean=0.299, max=0.299, sum=0.598 (2)", - "tab": "Efficiency", - "score": 0.2987864755234628 - }, - "Clinical Knowledge - # eval": { - "description": "min=265, mean=265, max=265, sum=530 (2)", - "tab": "General information", - "score": 265.0 - }, - "Clinical Knowledge - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Clinical Knowledge - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Clinical Knowledge - # prompt tokens": { - "description": "min=393.623, mean=393.623, max=393.623, sum=787.245 (2)", - "tab": "General information", - "score": 393.62264150943395 - }, - "Clinical Knowledge - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" - } - } - }, - { - "evaluation_name": "Conceptual Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Conceptual Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.694, - "details": { - "description": "min=0.694, mean=0.694, max=0.694, sum=1.387 (2)", - "tab": "Accuracy", - "Conceptual Physics - Observed inference time (s)": { - "description": "min=0.287, mean=0.287, max=0.287, sum=0.575 (2)", - "tab": "Efficiency", - "score": 0.2873024098416592 - }, - "Conceptual Physics - # eval": { - "description": "min=235, mean=235, max=235, sum=470 (2)", - "tab": "General information", - "score": 235.0 - }, - "Conceptual Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - 
"Conceptual Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Conceptual Physics - # prompt tokens": { - "description": "min=298.494, mean=298.494, max=298.494, sum=596.987 (2)", - "tab": "General information", - "score": 298.4936170212766 - }, - "Conceptual Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" - } - } - }, - { - "evaluation_name": "Electrical Engineering", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Electrical Engineering", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.683, - "details": { - "description": "min=0.683, mean=0.683, max=0.683, sum=1.366 (2)", - "tab": "Accuracy", - "Electrical Engineering - Observed inference time (s)": { - "description": "min=0.286, mean=0.286, max=0.286, sum=0.573 (2)", - "tab": "Efficiency", - "score": 0.2863943790567332 - }, - "Electrical Engineering - # eval": { - "description": "min=145, mean=145, max=145, sum=290 (2)", - "tab": "General information", - "score": 145.0 - }, - "Electrical Engineering - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Electrical Engineering - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Electrical Engineering - # prompt tokens": { - "description": "min=456.8, mean=456.8, max=456.8, sum=913.6 (2)", - "tab": "General information", - "score": 456.8 - }, - "Electrical Engineering - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" - } - } - }, - { - "evaluation_name": "Elementary Mathematics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Elementary Mathematics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.603, - "details": { - "description": "min=0.603, mean=0.603, max=0.603, sum=1.206 (2)", - "tab": "Accuracy", - "Elementary Mathematics - Observed inference time (s)": { - "description": "min=0.317, mean=0.317, max=0.317, sum=0.635 (2)", - "tab": "Efficiency", - "score": 0.3172515391041993 - }, - "Elementary Mathematics - # eval": { - "description": "min=378, mean=378, max=378, sum=756 (2)", - "tab": "General information", - "score": 378.0 - }, - "Elementary Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Elementary Mathematics - truncated": { - 
"description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Elementary Mathematics - # prompt tokens": { - "description": "min=570.119, mean=570.119, max=570.119, sum=1140.238 (2)", - "tab": "General information", - "score": 570.1190476190476 - }, - "Elementary Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" - } - } - }, - { - "evaluation_name": "Formal Logic", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Formal Logic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.492, - "details": { - "description": "min=0.492, mean=0.492, max=0.492, sum=0.984 (2)", - "tab": "Accuracy", - "Formal Logic - Observed inference time (s)": { - "description": "min=0.317, mean=0.317, max=0.317, sum=0.634 (2)", - "tab": "Efficiency", - "score": 0.31694961918724907 - }, - "Formal Logic - # eval": { - "description": "min=126, mean=126, max=126, sum=252 (2)", - "tab": "General information", - "score": 126.0 - }, - "Formal Logic - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Formal Logic - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Formal Logic - # prompt tokens": { - "description": "min=597.667, mean=597.667, max=597.667, sum=1195.333 (2)", - "tab": "General information", - "score": 597.6666666666666 - }, - "Formal Logic - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" - } - } - }, - { - "evaluation_name": "High School World History", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on High School World History", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.84, - "details": { - "description": "min=0.84, mean=0.84, max=0.84, sum=1.679 (2)", - "tab": "Accuracy", - "High School Biology - Observed inference time (s)": { - "description": "min=0.303, mean=0.303, max=0.303, sum=0.605 (2)", - "tab": "Efficiency", - "score": 0.3025627659213158 - }, - "High School Chemistry - Observed inference time (s)": { - "description": "min=0.311, mean=0.311, max=0.311, sum=0.622 (2)", - "tab": "Efficiency", - "score": 0.3108991178972968 - }, - "High School Computer Science - Observed inference time (s)": { - "description": "min=0.305, mean=0.305, max=0.305, sum=0.61 (2)", - "tab": "Efficiency", - "score": 0.30484641551971436 - }, - "High School European History - Observed inference time (s)": { - 
"description": "min=0.549, mean=0.549, max=0.549, sum=1.098 (2)", - "tab": "Efficiency", - "score": 0.548761223301743 - }, - "High School Geography - Observed inference time (s)": { - "description": "min=0.312, mean=0.312, max=0.312, sum=0.624 (2)", - "tab": "Efficiency", - "score": 0.3120840137655085 - }, - "High School Government And Politics - Observed inference time (s)": { - "description": "min=0.3, mean=0.3, max=0.3, sum=0.599 (2)", - "tab": "Efficiency", - "score": 0.29960165616761836 - }, - "High School Macroeconomics - Observed inference time (s)": { - "description": "min=0.294, mean=0.294, max=0.294, sum=0.588 (2)", - "tab": "Efficiency", - "score": 0.29392006519513253 - }, - "High School Mathematics - Observed inference time (s)": { - "description": "min=0.312, mean=0.312, max=0.312, sum=0.625 (2)", - "tab": "Efficiency", - "score": 0.3124903016620212 - }, - "High School Microeconomics - Observed inference time (s)": { - "description": "min=0.282, mean=0.282, max=0.282, sum=0.565 (2)", - "tab": "Efficiency", - "score": 0.28235371273104887 - }, - "High School Physics - Observed inference time (s)": { - "description": "min=0.308, mean=0.308, max=0.308, sum=0.615 (2)", - "tab": "Efficiency", - "score": 0.30758162681630113 - }, - "High School Psychology - Observed inference time (s)": { - "description": "min=0.317, mean=0.317, max=0.317, sum=0.634 (2)", - "tab": "Efficiency", - "score": 0.3172066456680998 - }, - "High School Statistics - Observed inference time (s)": { - "description": "min=0.335, mean=0.335, max=0.335, sum=0.67 (2)", - "tab": "Efficiency", - "score": 0.33508766580511024 - }, - "High School US History - Observed inference time (s)": { - "description": "min=0.453, mean=0.453, max=0.453, sum=0.906 (2)", - "tab": "Efficiency", - "score": 0.4531192370489532 - }, - "High School World History - Observed inference time (s)": { - "description": "min=0.386, mean=0.386, max=0.386, sum=0.771 (2)", - "tab": "Efficiency", - "score": 0.3856232206529706 - }, - "High School Biology - # eval": { - "description": "min=310, mean=310, max=310, sum=620 (2)", - "tab": "General information", - "score": 310.0 - }, - "High School Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Biology - # prompt tokens": { - "description": "min=506.916, mean=506.916, max=506.916, sum=1013.832 (2)", - "tab": "General information", - "score": 506.9161290322581 - }, - "High School Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Chemistry - # eval": { - "description": "min=203, mean=203, max=203, sum=406 (2)", - "tab": "General information", - "score": 203.0 - }, - "High School Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # prompt tokens": { - "description": "min=510.261, mean=510.261, max=510.261, sum=1020.522 (2)", - "tab": "General information", - "score": 510.2610837438424 - }, - "High School Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - 
}, - "High School Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "High School Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # prompt tokens": { - "description": "min=871.46, mean=871.46, max=871.46, sum=1742.92 (2)", - "tab": "General information", - "score": 871.46 - }, - "High School Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School European History - # eval": { - "description": "min=165, mean=165, max=165, sum=330 (2)", - "tab": "General information", - "score": 165.0 - }, - "High School European History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School European History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School European History - # prompt tokens": { - "description": "min=2807.903, mean=2807.903, max=2807.903, sum=5615.806 (2)", - "tab": "General information", - "score": 2807.9030303030304 - }, - "High School European History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Geography - # eval": { - "description": "min=198, mean=198, max=198, sum=396 (2)", - "tab": "General information", - "score": 198.0 - }, - "High School Geography - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Geography - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Geography - # prompt tokens": { - "description": "min=365.217, mean=365.217, max=365.217, sum=730.434 (2)", - "tab": "General information", - "score": 365.2171717171717 - }, - "High School Geography - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Government And Politics - # eval": { - "description": "min=193, mean=193, max=193, sum=386 (2)", - "tab": "General information", - "score": 193.0 - }, - "High School Government And Politics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Government And Politics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # prompt tokens": { - "description": "min=460.311, mean=460.311, max=460.311, sum=920.622 (2)", - "tab": "General information", - "score": 460.31088082901556 - }, - "High School Government And Politics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Macroeconomics - # eval": { - "description": "min=390, mean=390, max=390, sum=780 (2)", - "tab": "General information", - "score": 390.0 - }, - "High School Macroeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - 
"score": 5.0 - }, - "High School Macroeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - # prompt tokens": { - "description": "min=367.349, mean=367.349, max=367.349, sum=734.697 (2)", - "tab": "General information", - "score": 367.34871794871793 - }, - "High School Macroeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Mathematics - # eval": { - "description": "min=270, mean=270, max=270, sum=540 (2)", - "tab": "General information", - "score": 270.0 - }, - "High School Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # prompt tokens": { - "description": "min=558.326, mean=558.326, max=558.326, sum=1116.652 (2)", - "tab": "General information", - "score": 558.325925925926 - }, - "High School Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Microeconomics - # eval": { - "description": "min=238, mean=238, max=238, sum=476 (2)", - "tab": "General information", - "score": 238.0 - }, - "High School Microeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Microeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Microeconomics - # prompt tokens": { - "description": "min=395.277, mean=395.277, max=395.277, sum=790.555 (2)", - "tab": "General information", - "score": 395.2773109243698 - }, - "High School Microeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Physics - # eval": { - "description": "min=151, mean=151, max=151, sum=302 (2)", - "tab": "General information", - "score": 151.0 - }, - "High School Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # prompt tokens": { - "description": "min=573.536, mean=573.536, max=573.536, sum=1147.073 (2)", - "tab": "General information", - "score": 573.5364238410596 - }, - "High School Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Psychology - # eval": { - "description": "min=545, mean=545, max=545, sum=1090 (2)", - "tab": "General information", - "score": 545.0 - }, - "High School Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # prompt tokens": { - "description": "min=488.521, mean=488.521, max=488.521, sum=977.042 (2)", - "tab": "General information", - "score": 488.52110091743117 - }, - "High School Psychology - # 
output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Statistics - # eval": { - "description": "min=216, mean=216, max=216, sum=432 (2)", - "tab": "General information", - "score": 216.0 - }, - "High School Statistics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Statistics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Statistics - # prompt tokens": { - "description": "min=823.477, mean=823.477, max=823.477, sum=1646.954 (2)", - "tab": "General information", - "score": 823.4768518518518 - }, - "High School Statistics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School US History - # eval": { - "description": "min=204, mean=204, max=204, sum=408 (2)", - "tab": "General information", - "score": 204.0 - }, - "High School US History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School US History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # prompt tokens": { - "description": "min=2230.176, mean=2230.176, max=2230.176, sum=4460.353 (2)", - "tab": "General information", - "score": 2230.176470588235 - }, - "High School US History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School World History - # eval": { - "description": "min=237, mean=237, max=237, sum=474 (2)", - "tab": "General information", - "score": 237.0 - }, - "High School World History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School World History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School World History - # prompt tokens": { - "description": "min=1441.354, mean=1441.354, max=1441.354, sum=2882.709 (2)", - "tab": "General information", - "score": 1441.3544303797469 - }, - "High School World History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" - } - } - }, - { - "evaluation_name": "Human Sexuality", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Human Sexuality", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.756, - "details": { - "description": "min=0.756, mean=0.756, max=0.756, sum=1.511 (2)", - "tab": "Accuracy", - "Human Aging - Observed inference time (s)": { - "description": "min=0.29, mean=0.29, max=0.29, sum=0.58 (2)", - "tab": "Efficiency", - "score": 0.29016303160799994 - }, - "Human Sexuality - Observed inference time (s)": { - "description": "min=0.322, 
mean=0.322, max=0.322, sum=0.645 (2)", - "tab": "Efficiency", - "score": 0.3224487978083487 - }, - "Human Aging - # eval": { - "description": "min=223, mean=223, max=223, sum=446 (2)", - "tab": "General information", - "score": 223.0 - }, - "Human Aging - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Aging - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Aging - # prompt tokens": { - "description": "min=315.121, mean=315.121, max=315.121, sum=630.242 (2)", - "tab": "General information", - "score": 315.1210762331838 - }, - "Human Aging - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Human Sexuality - # eval": { - "description": "min=131, mean=131, max=131, sum=262 (2)", - "tab": "General information", - "score": 131.0 - }, - "Human Sexuality - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Sexuality - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # prompt tokens": { - "description": "min=334.504, mean=334.504, max=334.504, sum=669.008 (2)", - "tab": "General information", - "score": 334.5038167938931 - }, - "Human Sexuality - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" - } - } - }, - { - "evaluation_name": "International Law", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on International Law", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.826, - "details": { - "description": "min=0.826, mean=0.826, max=0.826, sum=1.653 (2)", - "tab": "Accuracy", - "International Law - Observed inference time (s)": { - "description": "min=0.308, mean=0.308, max=0.308, sum=0.615 (2)", - "tab": "Efficiency", - "score": 0.307678321176324 - }, - "International Law - # eval": { - "description": "min=121, mean=121, max=121, sum=242 (2)", - "tab": "General information", - "score": 121.0 - }, - "International Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "International Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "International Law - # prompt tokens": { - "description": "min=633.579, mean=633.579, max=633.579, sum=1267.157 (2)", - "tab": "General information", - "score": 633.5785123966942 - }, - "International Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" - } - } - }, - { - "evaluation_name": "Logical Fallacies", - 
"source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Logical Fallacies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.736, - "details": { - "description": "min=0.736, mean=0.736, max=0.736, sum=1.472 (2)", - "tab": "Accuracy", - "Logical Fallacies - Observed inference time (s)": { - "description": "min=0.305, mean=0.305, max=0.305, sum=0.61 (2)", - "tab": "Efficiency", - "score": 0.3051488355624895 - }, - "Logical Fallacies - # eval": { - "description": "min=163, mean=163, max=163, sum=326 (2)", - "tab": "General information", - "score": 163.0 - }, - "Logical Fallacies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Logical Fallacies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Logical Fallacies - # prompt tokens": { - "description": "min=442.632, mean=442.632, max=442.632, sum=885.264 (2)", - "tab": "General information", - "score": 442.6319018404908 - }, - "Logical Fallacies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" - } - } - }, - { - "evaluation_name": "Machine Learning", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Machine Learning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.509, - "details": { - "description": "min=0.509, mean=0.509, max=0.509, sum=1.018 (2)", - "tab": "Accuracy", - "Machine Learning - Observed inference time (s)": { - "description": "min=0.308, mean=0.308, max=0.308, sum=0.616 (2)", - "tab": "Efficiency", - "score": 0.3079095014504024 - }, - "Machine Learning - # eval": { - "description": "min=112, mean=112, max=112, sum=224 (2)", - "tab": "General information", - "score": 112.0 - }, - "Machine Learning - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Machine Learning - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Machine Learning - # prompt tokens": { - "description": "min=674.848, mean=674.848, max=674.848, sum=1349.696 (2)", - "tab": "General information", - "score": 674.8482142857143 - }, - "Machine Learning - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" - } - } - }, - { - "evaluation_name": "Management", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Management", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.816, - "details": { - "description": "min=0.816, mean=0.816, max=0.816, sum=1.631 (2)", - "tab": "Accuracy", - "Management - Observed inference time (s)": { - "description": "min=0.317, mean=0.317, max=0.317, sum=0.633 (2)", - "tab": "Efficiency", - "score": 0.316567536696647 - }, - "Management - # eval": { - "description": "min=103, mean=103, max=103, sum=206 (2)", - "tab": "General information", - "score": 103.0 - }, - "Management - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Management - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Management - # prompt tokens": { - "description": "min=276.854, mean=276.854, max=276.854, sum=553.709 (2)", - "tab": "General information", - "score": 276.8543689320388 - }, - "Management - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" - } - } - }, - { - "evaluation_name": "Marketing", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Marketing", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.893, - "details": { - "description": "min=0.893, mean=0.893, max=0.893, sum=1.786 (2)", - "tab": "Accuracy", - "Marketing - Observed inference time (s)": { - "description": "min=0.31, mean=0.31, max=0.31, sum=0.621 (2)", - "tab": "Efficiency", - "score": 0.3104041937070015 - }, - "Marketing - # eval": { - "description": "min=234, mean=234, max=234, sum=468 (2)", - "tab": "General information", - "score": 234.0 - }, - "Marketing - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Marketing - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Marketing - # prompt tokens": { - "description": "min=397.415, mean=397.415, max=397.415, sum=794.829 (2)", - "tab": "General information", - "score": 397.4145299145299 - }, - "Marketing - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" - } - } - }, - { - "evaluation_name": "Medical Genetics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Medical Genetics", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.76, - "details": { - "description": "min=0.76, mean=0.76, max=0.76, sum=1.52 (2)", - "tab": "Accuracy", - "Medical Genetics - Observed inference time (s)": { - "description": "min=0.302, mean=0.302, max=0.302, sum=0.603 (2)", - "tab": "Efficiency", - "score": 0.30150007486343383 - }, - "Medical Genetics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Medical Genetics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Medical Genetics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Medical Genetics - # prompt tokens": { - "description": "min=335.35, mean=335.35, max=335.35, sum=670.7 (2)", - "tab": "General information", - "score": 335.35 - }, - "Medical Genetics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" - } - } - }, - { - "evaluation_name": "Miscellaneous", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Miscellaneous", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.835, - "details": { - "description": "min=0.835, mean=0.835, max=0.835, sum=1.67 (2)", - "tab": "Accuracy", - "Miscellaneous - Observed inference time (s)": { - "description": "min=0.294, mean=0.294, max=0.294, sum=0.588 (2)", - "tab": "Efficiency", - "score": 0.29396778352720376 - }, - "Miscellaneous - # eval": { - "description": "min=783, mean=783, max=783, sum=1566 (2)", - "tab": "General information", - "score": 783.0 - }, - "Miscellaneous - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Miscellaneous - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Miscellaneous - # prompt tokens": { - "description": "min=296.7, mean=296.7, max=296.7, sum=593.4 (2)", - "tab": "General information", - "score": 296.6998722860792 - }, - "Miscellaneous - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" - } - } - }, - { - "evaluation_name": "Moral Scenarios", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Moral Scenarios", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.368, - "details": { - "description": "min=0.368, mean=0.368, max=0.368, sum=0.735 (2)", - 
"tab": "Accuracy", - "Moral Disputes - Observed inference time (s)": { - "description": "min=0.304, mean=0.304, max=0.304, sum=0.608 (2)", - "tab": "Efficiency", - "score": 0.30380174465951204 - }, - "Moral Scenarios - Observed inference time (s)": { - "description": "min=0.301, mean=0.301, max=0.301, sum=0.601 (2)", - "tab": "Efficiency", - "score": 0.3006620183337334 - }, - "Moral Disputes - # eval": { - "description": "min=346, mean=346, max=346, sum=692 (2)", - "tab": "General information", - "score": 346.0 - }, - "Moral Disputes - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Disputes - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Disputes - # prompt tokens": { - "description": "min=469.182, mean=469.182, max=469.182, sum=938.364 (2)", - "tab": "General information", - "score": 469.1820809248555 - }, - "Moral Disputes - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Moral Scenarios - # eval": { - "description": "min=895, mean=895, max=895, sum=1790 (2)", - "tab": "General information", - "score": 895.0 - }, - "Moral Scenarios - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Scenarios - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # prompt tokens": { - "description": "min=661.494, mean=661.494, max=661.494, sum=1322.988 (2)", - "tab": "General information", - "score": 661.4938547486033 - }, - "Moral Scenarios - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" - } - } - }, - { - "evaluation_name": "Nutrition", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Nutrition", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.742, - "details": { - "description": "min=0.742, mean=0.742, max=0.742, sum=1.484 (2)", - "tab": "Accuracy", - "Nutrition - Observed inference time (s)": { - "description": "min=0.319, mean=0.319, max=0.319, sum=0.639 (2)", - "tab": "Efficiency", - "score": 0.31930122655980725 - }, - "Nutrition - # eval": { - "description": "min=306, mean=306, max=306, sum=612 (2)", - "tab": "General information", - "score": 306.0 - }, - "Nutrition - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Nutrition - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Nutrition - # prompt tokens": { - "description": "min=592.637, mean=592.637, max=592.637, sum=1185.275 (2)", - "tab": "General information", - "score": 592.6372549019608 - }, - "Nutrition - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - 
"generation_config": { - "additional_details": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" - } - } - }, - { - "evaluation_name": "Prehistory", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Prehistory", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.71, - "details": { - "description": "min=0.71, mean=0.71, max=0.71, sum=1.42 (2)", - "tab": "Accuracy", - "Prehistory - Observed inference time (s)": { - "description": "min=0.313, mean=0.313, max=0.313, sum=0.625 (2)", - "tab": "Efficiency", - "score": 0.3125371013158633 - }, - "Prehistory - # eval": { - "description": "min=324, mean=324, max=324, sum=648 (2)", - "tab": "General information", - "score": 324.0 - }, - "Prehistory - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Prehistory - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Prehistory - # prompt tokens": { - "description": "min=521.364, mean=521.364, max=521.364, sum=1042.728 (2)", - "tab": "General information", - "score": 521.3641975308642 - }, - "Prehistory - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" - } - } - }, - { - "evaluation_name": "Public Relations", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Public Relations", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.655, - "details": { - "description": "min=0.655, mean=0.655, max=0.655, sum=1.309 (2)", - "tab": "Accuracy", - "Public Relations - Observed inference time (s)": { - "description": "min=0.296, mean=0.296, max=0.296, sum=0.592 (2)", - "tab": "Efficiency", - "score": 0.29603702588514846 - }, - "Public Relations - # eval": { - "description": "min=110, mean=110, max=110, sum=220 (2)", - "tab": "General information", - "score": 110.0 - }, - "Public Relations - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Public Relations - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Public Relations - # prompt tokens": { - "description": "min=401.427, mean=401.427, max=401.427, sum=802.855 (2)", - "tab": "General information", - "score": 401.42727272727274 - }, - "Public Relations - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" - } - } - }, - { - 
"evaluation_name": "Security Studies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Security Studies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8, - "details": { - "description": "min=0.8, mean=0.8, max=0.8, sum=1.6 (2)", - "tab": "Accuracy", - "Security Studies - Observed inference time (s)": { - "description": "min=0.352, mean=0.352, max=0.352, sum=0.704 (2)", - "tab": "Efficiency", - "score": 0.3521312304905483 - }, - "Security Studies - # eval": { - "description": "min=245, mean=245, max=245, sum=490 (2)", - "tab": "General information", - "score": 245.0 - }, - "Security Studies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Security Studies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Security Studies - # prompt tokens": { - "description": "min=1159.931, mean=1159.931, max=1159.931, sum=2319.861 (2)", - "tab": "General information", - "score": 1159.930612244898 - }, - "Security Studies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" - } - } - }, - { - "evaluation_name": "Sociology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Sociology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.841, - "details": { - "description": "min=0.841, mean=0.841, max=0.841, sum=1.682 (2)", - "tab": "Accuracy", - "Sociology - Observed inference time (s)": { - "description": "min=0.304, mean=0.304, max=0.304, sum=0.609 (2)", - "tab": "Efficiency", - "score": 0.3044381426341498 - }, - "Sociology - # eval": { - "description": "min=201, mean=201, max=201, sum=402 (2)", - "tab": "General information", - "score": 201.0 - }, - "Sociology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Sociology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Sociology - # prompt tokens": { - "description": "min=443.1, mean=443.1, max=443.1, sum=886.199 (2)", - "tab": "General information", - "score": 443.0995024875622 - }, - "Sociology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" - } - } - }, - { - "evaluation_name": "Virology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Virology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.458, - "details": { - "description": "min=0.458, mean=0.458, max=0.458, sum=0.916 (2)", - "tab": "Accuracy", - "Virology - Observed inference time (s)": { - "description": "min=0.297, mean=0.297, max=0.297, sum=0.595 (2)", - "tab": "Efficiency", - "score": 0.297343333083463 - }, - "Virology - # eval": { - "description": "min=166, mean=166, max=166, sum=332 (2)", - "tab": "General information", - "score": 166.0 - }, - "Virology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Virology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Virology - # prompt tokens": { - "description": "min=336.819, mean=336.819, max=336.819, sum=673.639 (2)", - "tab": "General information", - "score": 336.8192771084337 - }, - "Virology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" - } - } - }, - { - "evaluation_name": "World Religions", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on World Religions", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.842, - "details": { - "description": "min=0.842, mean=0.842, max=0.842, sum=1.684 (2)", - "tab": "Accuracy", - "World Religions - Observed inference time (s)": { - "description": "min=0.303, mean=0.303, max=0.303, sum=0.605 (2)", - "tab": "Efficiency", - "score": 0.3027164573557893 - }, - "World Religions - # eval": { - "description": "min=171, mean=171, max=171, sum=342 (2)", - "tab": "General information", - "score": 171.0 - }, - "World Religions - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "World Religions - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "World Religions - # prompt tokens": { - "description": "min=269.07, mean=269.07, max=269.07, sum=538.14 (2)", - "tab": "General information", - "score": 269.0701754385965 - }, - "World Religions - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" - } - } - }, - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on 
average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.796, - "details": { - "tab": "Efficiency" - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_mmlu/qwen/qwen1.5-32b/b5279e94-ae7f-4671-9315-874e162a24fd.json b/data/helm_mmlu/qwen/qwen1.5-32b/b5279e94-ae7f-4671-9315-874e162a24fd.json deleted file mode 100644 index d1a9f19e1..000000000 --- a/data/helm_mmlu/qwen/qwen1.5-32b/b5279e94-ae7f-4671-9315-874e162a24fd.json +++ /dev/null @@ -1,3021 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/qwen_qwen1.5-32b/1770835937.459157", - "retrieved_timestamp": "1770835937.459157", - "source_metadata": { - "source_name": "helm_mmlu", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen1.5 32B", - "id": "qwen/qwen1.5-32b", - "developer": "qwen", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "MMLU All Subjects", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU All Subjects", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.744, - "details": { - "description": "min=0.4, mean=0.744, max=0.974, sum=84.853 (114)", - "tab": "Accuracy", - "MMLU All Subjects - Observed inference time (s)": { - "description": "min=0.294, mean=0.413, max=0.973, sum=47.06 (114)", - "tab": "Efficiency", - "score": 0.41280544410672226 - }, - "MMLU All Subjects - # eval": { - "description": "min=100, mean=246.351, max=1534, sum=28084 (114)", - "tab": "General information", - "score": 246.35087719298247 - }, - "MMLU All Subjects - # train": { - "description": "min=5, mean=5, max=5, sum=570 (114)", - "tab": "General information", - "score": 5.0 - }, - "MMLU All Subjects - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (114)", - "tab": "General information", - "score": 0.0 - }, - "MMLU All Subjects - # prompt tokens": { - "description": "min=269.07, mean=618.598, max=2807.903, sum=70520.198 (114)", - "tab": "General information", - "score": 618.5982315160392 - }, - "MMLU All Subjects - # output tokens": { - "description": "min=1, mean=1, max=1, sum=114 (114)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - 
"high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] - } - } - }, - { - "evaluation_name": "Abstract Algebra", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Abstract Algebra", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4, - "details": { - "description": "min=0.4, mean=0.4, max=0.4, sum=0.8 (2)", - "tab": "Accuracy", - "Abstract Algebra - Observed inference time (s)": { - "description": "min=0.337, mean=0.337, max=0.337, sum=0.675 (2)", - "tab": "Efficiency", - "score": 0.33740817070007323 - }, - "Abstract Algebra - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Abstract Algebra - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Abstract Algebra - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Abstract Algebra - # prompt tokens": { - "description": "min=371.19, mean=371.19, max=371.19, sum=742.38 (2)", - "tab": "General 
information",
-          "score": 371.19
-        },
-        "Abstract Algebra - # output tokens": {
-          "description": "min=1, mean=1, max=1, sum=2 (2)",
-          "tab": "General information",
-          "score": 1.0
-        }
-      }
-    },
-    "generation_config": {
-      "additional_details": {
-        "subject": "abstract_algebra",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_abstract_algebra"
-      }
-    }
-  },
-  {
-    "evaluation_name": "Anatomy",
-    "source_data": {
-      "dataset_name": "helm_mmlu",
-      "source_type": "url",
-      "url": [
-        "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
-      ]
-    },
-    "metric_config": {
-      "evaluation_description": "EM on Anatomy",
-      "lower_is_better": false,
-      "score_type": "continuous",
-      "min_score": 0.0,
-      "max_score": 1.0
-    },
-    "score_details": {
-      "score": 0.644,
-      "details": {
-        "description": "min=0.644, mean=0.644, max=0.644, sum=1.289 (2)",
-        "tab": "Accuracy",
-        "Anatomy - Observed inference time (s)": {
-          "description": "min=0.353, mean=0.353, max=0.353, sum=0.706 (2)",
-          "tab": "Efficiency",
-          "score": 0.35299032705801503
-        },
-        "Anatomy - # eval": {
-          "description": "min=135, mean=135, max=135, sum=270 (2)",
-          "tab": "General information",
-          "score": 135.0
-        },
-        "Anatomy - # train": {
-          "description": "min=5, mean=5, max=5, sum=10 (2)",
-          "tab": "General information",
-          "score": 5.0
-        },
-        "Anatomy - truncated": {
-          "description": "min=0, mean=0, max=0, sum=0 (2)",
-          "tab": "General information",
-          "score": 0.0
-        },
-        "Anatomy - # prompt tokens": {
-          "description": "min=346.978, mean=346.978, max=346.978, sum=693.956 (2)",
-          "tab": "General information",
-          "score": 346.97777777777776
-        },
-        "Anatomy - # output tokens": {
-          "description": "min=1, mean=1, max=1, sum=2 (2)",
-          "tab": "General information",
-          "score": 1.0
-        }
-      }
-    },
-    "generation_config": {
-      "additional_details": {
-        "subject": "anatomy",
-        "method": "multiple_choice_joint",
-        "eval_split": "test",
-        "groups": "mmlu_anatomy"
-      }
-    }
-  },

[The remaining deleted records in this span repeat the Anatomy schema above verbatim and differ only in the values tabulated below. Every record points at the same source_data URL (helm_mmlu, release v1.13.0, mmlu_subjects.json), uses metric_config "EM on <Subject>" (continuous, range 0.0-1.0, lower_is_better=false), and carries generation_config additional_details of the form {subject: <snake_case subject>, method: multiple_choice_joint, eval_split: test, groups: mmlu_<subject>}. For every subject, # train = 5, truncated = 0, and # output tokens = 1. The description strings are redundant aggregates over the same two runs, so min = mean = max = score and sum = 2 × score, e.g. Anatomy: "min=0.644, mean=0.644, max=0.644, sum=1.289 (2)". Detail blocks for related subjects are grouped under one host record: the College subjects under "College Physics", the Professional subjects under "Professional Psychology", the High School subjects under "High School World History", Human Aging under "Human Sexuality", and Moral Disputes under "Moral Scenarios"; an EM of "-" below means that subject's own EM record sits outside this span.]

Subject                               EM      Inference time (s)  # eval  # prompt tokens
College Physics                       0.51    0.616               102     509.265
College Chemistry                     -       0.338               100     561.25
College Biology                       -       0.570               144     479.979
College Computer Science              -       0.407               100     831.58
College Mathematics                   -       0.683               100     600.7
College Medicine                      -       0.340               173     499.098
Computer Security                     0.77    0.339               100     379.64
Econometrics                          0.561   0.367               114     620.939
Global Facts                          0.47    0.650               100     422.06
Jurisprudence                         0.843   0.301               108     387.713
Philosophy                            0.826   0.297               311     322.09
Professional Psychology               0.75    0.388               612     574.417
Professional Medicine                 -       0.465               272     1118.199
Professional Accounting               -       0.381               282     732.34
Professional Law                      -       0.649               1534    1656.969
Us Foreign Policy                     0.91    0.343               100     421.16
Astronomy                             0.855   0.337               152     582.849
Business Ethics                       0.77    0.356               100     562.87
Clinical Knowledge                    0.781   0.319               265     393.623
Conceptual Physics                    0.766   0.436               235     298.494
Electrical Engineering                0.731   0.321               145     456.8
Elementary Mathematics                0.685   0.352               378     570.119
Formal Logic                          0.524   0.370               126     597.667
High School World History             0.869   0.551               237     1441.354
High School Biology                   -       0.320               310     506.916
High School Chemistry                 -       0.369               203     510.261
High School Computer Science          -       0.724               100     871.46
High School European History          -       0.973               165     2807.903
High School Geography                 -       0.307               198     365.217
High School Government And Politics   -       0.338               193     460.311
High School Macroeconomics            -       0.410               390     367.349
High School Mathematics               -       0.363               270     558.326
High School Microeconomics            -       0.314               238     395.277
High School Physics                   -       0.368               151     573.536
High School Psychology                -       0.452               545     488.521
High School Statistics                -       0.378               216     823.477
High School US History                -       0.888               204     2230.176
Human Sexuality                       0.847   0.323               131     334.504
Human Aging                           -       0.314               223     315.121
International Law                     0.884   0.382               121     633.579
Logical Fallacies                     0.822   0.313               163     442.632
Machine Learning                      0.616   0.359               112     674.848
Management                            0.874   0.633               103     276.854
Marketing                             0.936   0.328               234     397.415
Medical Genetics                      0.85    0.294               100     335.35
Miscellaneous                         0.884   0.347               783     296.7
Moral Scenarios                       0.545   0.396               895     661.494
Moral Disputes                        -       0.334               346     469.182
Nutrition                             0.81    0.338               306     592.637
Prehistory                            0.83    0.327               324     521.364
Public Relations                      0.664   0.305               110     401.427
Security Studies                      0.829   0.478               245     1159.931

(Inference time and # prompt tokens are rounded as in the records' own description strings; full-precision values appear verbatim in the reconstructed Anatomy record above and the Sociology record below, which continues past this span.)

-  {
-    "evaluation_name": "Sociology",
-    "source_data": {
-      "dataset_name": "helm_mmlu",
-      "source_type": "url",
-      "url": [
-        "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json"
-      ]
-    },
-    "metric_config": {
-      "evaluation_description": "EM on Sociology",
-      "lower_is_better": false,
-      "score_type": "continuous",
-      "min_score": 0.0,
-      "max_score": 1.0
-    },
-    "score_details": {
-      "score": 0.881,
-      "details": {
-        "description": "min=0.881, mean=0.881, max=0.881, sum=1.761 (2)",
-        "tab": "Accuracy",
-        "Sociology - Observed inference time (s)": {
-          "description": "min=0.341, mean=0.341, max=0.341, sum=0.681 (2)",
-          "tab": "Efficiency",
-          "score": 0.3407213664173487
-        },
-        "Sociology - # eval": {
-          "description": "min=201, mean=201,
max=201, sum=402 (2)", - "tab": "General information", - "score": 201.0 - }, - "Sociology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Sociology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Sociology - # prompt tokens": { - "description": "min=443.1, mean=443.1, max=443.1, sum=886.199 (2)", - "tab": "General information", - "score": 443.0995024875622 - }, - "Sociology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" - } - } - }, - { - "evaluation_name": "Virology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Virology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.578, - "details": { - "description": "min=0.578, mean=0.578, max=0.578, sum=1.157 (2)", - "tab": "Accuracy", - "Virology - Observed inference time (s)": { - "description": "min=0.329, mean=0.329, max=0.329, sum=0.658 (2)", - "tab": "Efficiency", - "score": 0.3289937297981906 - }, - "Virology - # eval": { - "description": "min=166, mean=166, max=166, sum=332 (2)", - "tab": "General information", - "score": 166.0 - }, - "Virology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Virology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Virology - # prompt tokens": { - "description": "min=336.819, mean=336.819, max=336.819, sum=673.639 (2)", - "tab": "General information", - "score": 336.8192771084337 - }, - "Virology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" - } - } - }, - { - "evaluation_name": "World Religions", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on World Religions", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.854, - "details": { - "description": "min=0.854, mean=0.854, max=0.854, sum=1.708 (2)", - "tab": "Accuracy", - "World Religions - Observed inference time (s)": { - "description": "min=0.32, mean=0.32, max=0.32, sum=0.64 (2)", - "tab": "Efficiency", - "score": 0.31992746933161864 - }, - "World Religions - # eval": { - "description": "min=171, mean=171, max=171, sum=342 (2)", - "tab": "General information", - "score": 171.0 - }, - "World Religions - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "World Religions - truncated": { - 
"description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "World Religions - # prompt tokens": { - "description": "min=269.07, mean=269.07, max=269.07, sum=538.14 (2)", - "tab": "General information", - "score": 269.0701754385965 - }, - "World Religions - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" - } - } - }, - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.624, - "details": { - "tab": "Efficiency" - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_mmlu/qwen/qwen1.5-72b/de00e8da-9c83-40df-b642-b94719ce1ac2.json b/data/helm_mmlu/qwen/qwen1.5-72b/de00e8da-9c83-40df-b642-b94719ce1ac2.json deleted file mode 100644 index 94c5e4e80..000000000 --- a/data/helm_mmlu/qwen/qwen1.5-72b/de00e8da-9c83-40df-b642-b94719ce1ac2.json +++ /dev/null @@ -1,3021 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/qwen_qwen1.5-72b/1770835937.459157", - "retrieved_timestamp": "1770835937.459157", - "source_metadata": { - "source_name": "helm_mmlu", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen1.5 72B", - "id": "qwen/qwen1.5-72b", - "developer": "qwen", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "MMLU All Subjects", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU All Subjects", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.774, - "details": { - "description": "min=0.44, mean=0.774, max=0.99, sum=88.227 (114)", - "tab": "Accuracy", - "MMLU All Subjects - Observed inference time (s)": { - "description": "min=0.323, mean=0.375, max=0.713, sum=42.762 (114)", - "tab": "Efficiency", - "score": 0.37510459085651054 - }, - "MMLU All Subjects - # eval": { - "description": "min=100, mean=246.351, max=1534, sum=28084 (114)", - "tab": "General information", - "score": 246.35087719298247 - }, - "MMLU All Subjects - # train": { - "description": "min=5, mean=5, max=5, sum=570 (114)", - "tab": "General information", - "score": 5.0 - }, - "MMLU All Subjects - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (114)", - "tab": "General information", - "score": 0.0 - }, - "MMLU All Subjects - # prompt tokens": { - "description": "min=269.07, mean=618.598, max=2807.903, sum=70520.198 (114)", - "tab": "General information", - "score": 618.5982315160392 - }, - "MMLU All Subjects - # output tokens": { - "description": 
"min=1, mean=1, max=1, sum=114 (114)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] - } - } - }, - { - "evaluation_name": "Abstract Algebra", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Abstract Algebra", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.44, - "details": { - "description": "min=0.44, mean=0.44, max=0.44, sum=0.88 (2)", - "tab": "Accuracy", - "Abstract Algebra - Observed inference time (s)": { - "description": "min=0.348, mean=0.348, max=0.348, sum=0.696 (2)", - "tab": "Efficiency", - "score": 0.3480935263633728 - }, - "Abstract Algebra - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Abstract Algebra - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Abstract Algebra - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Abstract Algebra - # prompt tokens": { - "description": "min=371.19, mean=371.19, max=371.19, sum=742.38 (2)", - "tab": "General information", - "score": 371.19 - }, - "Abstract Algebra - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" - } - } - }, - { - "evaluation_name": "Anatomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Anatomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.733, - "details": { - "description": "min=0.733, mean=0.733, max=0.733, sum=1.467 (2)", - "tab": "Accuracy", - "Anatomy - Observed inference time (s)": { - "description": "min=0.342, mean=0.342, max=0.342, sum=0.685 (2)", - "tab": "Efficiency", - "score": 0.3424220985836453 - }, - "Anatomy - # eval": { - "description": "min=135, mean=135, max=135, sum=270 (2)", - "tab": "General information", - "score": 135.0 - }, - "Anatomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Anatomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Anatomy - # prompt tokens": { - "description": "min=346.978, mean=346.978, max=346.978, sum=693.956 (2)", - "tab": "General information", - "score": 346.97777777777776 - }, - "Anatomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" - } - } - }, - { - "evaluation_name": "College Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on College Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.559, - "details": { - "description": "min=0.559, mean=0.559, max=0.559, sum=1.118 (2)", - "tab": "Accuracy", - "College Chemistry - 
Observed inference time (s)": { - "description": "min=0.396, mean=0.396, max=0.396, sum=0.791 (2)", - "tab": "Efficiency", - "score": 0.39563153505325316 - }, - "College Biology - Observed inference time (s)": { - "description": "min=0.349, mean=0.349, max=0.349, sum=0.698 (2)", - "tab": "Efficiency", - "score": 0.3488144195742077 - }, - "College Computer Science - Observed inference time (s)": { - "description": "min=0.398, mean=0.398, max=0.398, sum=0.797 (2)", - "tab": "Efficiency", - "score": 0.39839950799942014 - }, - "College Mathematics - Observed inference time (s)": { - "description": "min=0.372, mean=0.372, max=0.372, sum=0.743 (2)", - "tab": "Efficiency", - "score": 0.3715039682388306 - }, - "College Medicine - Observed inference time (s)": { - "description": "min=0.346, mean=0.346, max=0.346, sum=0.693 (2)", - "tab": "Efficiency", - "score": 0.34641625977665014 - }, - "College Physics - Observed inference time (s)": { - "description": "min=0.384, mean=0.384, max=0.384, sum=0.768 (2)", - "tab": "Efficiency", - "score": 0.38388992290870816 - }, - "College Chemistry - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Chemistry - # prompt tokens": { - "description": "min=561.25, mean=561.25, max=561.25, sum=1122.5 (2)", - "tab": "General information", - "score": 561.25 - }, - "College Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Biology - # eval": { - "description": "min=144, mean=144, max=144, sum=288 (2)", - "tab": "General information", - "score": 144.0 - }, - "College Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # prompt tokens": { - "description": "min=479.979, mean=479.979, max=479.979, sum=959.958 (2)", - "tab": "General information", - "score": 479.9791666666667 - }, - "College Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer Science - # prompt tokens": { - "description": "min=831.58, mean=831.58, max=831.58, sum=1663.16 (2)", - "tab": "General information", - "score": 831.58 - }, - "College Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Mathematics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Mathematics - # train": { - "description": 
"min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # prompt tokens": { - "description": "min=600.7, mean=600.7, max=600.7, sum=1201.4 (2)", - "tab": "General information", - "score": 600.7 - }, - "College Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Medicine - # eval": { - "description": "min=173, mean=173, max=173, sum=346 (2)", - "tab": "General information", - "score": 173.0 - }, - "College Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Medicine - # prompt tokens": { - "description": "min=499.098, mean=499.098, max=499.098, sum=998.197 (2)", - "tab": "General information", - "score": 499.0982658959538 - }, - "College Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Physics - # eval": { - "description": "min=102, mean=102, max=102, sum=204 (2)", - "tab": "General information", - "score": 102.0 - }, - "College Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Physics - # prompt tokens": { - "description": "min=509.265, mean=509.265, max=509.265, sum=1018.529 (2)", - "tab": "General information", - "score": 509.2647058823529 - }, - "College Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" - } - } - }, - { - "evaluation_name": "Computer Security", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Computer Security", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.81, - "details": { - "description": "min=0.81, mean=0.81, max=0.81, sum=1.62 (2)", - "tab": "Accuracy", - "Computer Security - Observed inference time (s)": { - "description": "min=0.338, mean=0.338, max=0.338, sum=0.676 (2)", - "tab": "Efficiency", - "score": 0.3379603147506714 - }, - "Computer Security - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Computer Security - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Computer Security - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Computer Security - # prompt tokens": { - "description": "min=379.64, mean=379.64, max=379.64, sum=759.28 (2)", 
- "tab": "General information", - "score": 379.64 - }, - "Computer Security - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" - } - } - }, - { - "evaluation_name": "Econometrics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Econometrics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.544, - "details": { - "description": "min=0.544, mean=0.544, max=0.544, sum=1.088 (2)", - "tab": "Accuracy", - "Econometrics - Observed inference time (s)": { - "description": "min=0.386, mean=0.386, max=0.386, sum=0.772 (2)", - "tab": "Efficiency", - "score": 0.3857871189452054 - }, - "Econometrics - # eval": { - "description": "min=114, mean=114, max=114, sum=228 (2)", - "tab": "General information", - "score": 114.0 - }, - "Econometrics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Econometrics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Econometrics - # prompt tokens": { - "description": "min=620.939, mean=620.939, max=620.939, sum=1241.877 (2)", - "tab": "General information", - "score": 620.938596491228 - }, - "Econometrics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" - } - } - }, - { - "evaluation_name": "Global Facts", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Global Facts", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.56, - "details": { - "description": "min=0.56, mean=0.56, max=0.56, sum=1.12 (2)", - "tab": "Accuracy", - "Global Facts - Observed inference time (s)": { - "description": "min=0.335, mean=0.335, max=0.335, sum=0.669 (2)", - "tab": "Efficiency", - "score": 0.3347077107429504 - }, - "Global Facts - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Global Facts - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Global Facts - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Global Facts - # prompt tokens": { - "description": "min=422.06, mean=422.06, max=422.06, sum=844.12 (2)", - "tab": "General information", - "score": 422.06 - }, - "Global Facts - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - 
"generation_config": { - "additional_details": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" - } - } - }, - { - "evaluation_name": "Jurisprudence", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Jurisprudence", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.824, - "details": { - "description": "min=0.824, mean=0.824, max=0.824, sum=1.648 (2)", - "tab": "Accuracy", - "Jurisprudence - Observed inference time (s)": { - "description": "min=0.351, mean=0.351, max=0.351, sum=0.702 (2)", - "tab": "Efficiency", - "score": 0.3512495689921909 - }, - "Jurisprudence - # eval": { - "description": "min=108, mean=108, max=108, sum=216 (2)", - "tab": "General information", - "score": 108.0 - }, - "Jurisprudence - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Jurisprudence - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Jurisprudence - # prompt tokens": { - "description": "min=387.713, mean=387.713, max=387.713, sum=775.426 (2)", - "tab": "General information", - "score": 387.712962962963 - }, - "Jurisprudence - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" - } - } - }, - { - "evaluation_name": "Philosophy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Philosophy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.83, - "details": { - "description": "min=0.83, mean=0.83, max=0.83, sum=1.659 (2)", - "tab": "Accuracy", - "Philosophy - Observed inference time (s)": { - "description": "min=0.35, mean=0.35, max=0.35, sum=0.7 (2)", - "tab": "Efficiency", - "score": 0.34987031455208634 - }, - "Philosophy - # eval": { - "description": "min=311, mean=311, max=311, sum=622 (2)", - "tab": "General information", - "score": 311.0 - }, - "Philosophy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Philosophy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Philosophy - # prompt tokens": { - "description": "min=322.09, mean=322.09, max=322.09, sum=644.18 (2)", - "tab": "General information", - "score": 322.09003215434086 - }, - "Philosophy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" - } - } - }, - { - "evaluation_name": "Professional 
Psychology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Professional Psychology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.809, - "details": { - "description": "min=0.809, mean=0.809, max=0.809, sum=1.618 (2)", - "tab": "Accuracy", - "Professional Medicine - Observed inference time (s)": { - "description": "min=0.426, mean=0.426, max=0.426, sum=0.852 (2)", - "tab": "Efficiency", - "score": 0.4260168829384972 - }, - "Professional Accounting - Observed inference time (s)": { - "description": "min=0.375, mean=0.375, max=0.375, sum=0.75 (2)", - "tab": "Efficiency", - "score": 0.3750799666059778 - }, - "Professional Law - Observed inference time (s)": { - "description": "min=0.501, mean=0.501, max=0.501, sum=1.002 (2)", - "tab": "Efficiency", - "score": 0.501238130839272 - }, - "Professional Psychology - Observed inference time (s)": { - "description": "min=0.359, mean=0.359, max=0.359, sum=0.719 (2)", - "tab": "Efficiency", - "score": 0.3593972987598843 - }, - "Professional Medicine - # eval": { - "description": "min=272, mean=272, max=272, sum=544 (2)", - "tab": "General information", - "score": 272.0 - }, - "Professional Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Medicine - # prompt tokens": { - "description": "min=1118.199, mean=1118.199, max=1118.199, sum=2236.397 (2)", - "tab": "General information", - "score": 1118.1985294117646 - }, - "Professional Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Accounting - # eval": { - "description": "min=282, mean=282, max=282, sum=564 (2)", - "tab": "General information", - "score": 282.0 - }, - "Professional Accounting - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Accounting - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Accounting - # prompt tokens": { - "description": "min=732.34, mean=732.34, max=732.34, sum=1464.681 (2)", - "tab": "General information", - "score": 732.3404255319149 - }, - "Professional Accounting - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Law - # eval": { - "description": "min=1534, mean=1534, max=1534, sum=3068 (2)", - "tab": "General information", - "score": 1534.0 - }, - "Professional Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # prompt tokens": { - "description": "min=1656.969, mean=1656.969, max=1656.969, sum=3313.939 (2)", - "tab": "General information", - "score": 1656.9693611473272 - }, - "Professional Law - # output tokens": { - "description": "min=1, 
mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Psychology - # eval": { - "description": "min=612, mean=612, max=612, sum=1224 (2)", - "tab": "General information", - "score": 612.0 - }, - "Professional Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # prompt tokens": { - "description": "min=574.417, mean=574.417, max=574.417, sum=1148.833 (2)", - "tab": "General information", - "score": 574.4166666666666 - }, - "Professional Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" - } - } - }, - { - "evaluation_name": "Us Foreign Policy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Us Foreign Policy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.94, - "details": { - "description": "min=0.94, mean=0.94, max=0.94, sum=1.88 (2)", - "tab": "Accuracy", - "Us Foreign Policy - Observed inference time (s)": { - "description": "min=0.352, mean=0.352, max=0.352, sum=0.703 (2)", - "tab": "Efficiency", - "score": 0.3515354657173157 - }, - "Us Foreign Policy - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Us Foreign Policy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Us Foreign Policy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Us Foreign Policy - # prompt tokens": { - "description": "min=421.16, mean=421.16, max=421.16, sum=842.32 (2)", - "tab": "General information", - "score": 421.16 - }, - "Us Foreign Policy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" - } - } - }, - { - "evaluation_name": "Astronomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Astronomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.868, - "details": { - "description": "min=0.868, mean=0.868, max=0.868, sum=1.737 (2)", - "tab": "Accuracy", - "Astronomy - Observed inference time (s)": { - "description": "min=0.373, mean=0.373, max=0.373, sum=0.746 (2)", - "tab": "Efficiency", - "score": 0.3729873691734515 - }, - "Astronomy - # eval": { - 
"description": "min=152, mean=152, max=152, sum=304 (2)", - "tab": "General information", - "score": 152.0 - }, - "Astronomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Astronomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Astronomy - # prompt tokens": { - "description": "min=582.849, mean=582.849, max=582.849, sum=1165.697 (2)", - "tab": "General information", - "score": 582.8486842105264 - }, - "Astronomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" - } - } - }, - { - "evaluation_name": "Business Ethics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Business Ethics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.79, - "details": { - "description": "min=0.79, mean=0.79, max=0.79, sum=1.58 (2)", - "tab": "Accuracy", - "Business Ethics - Observed inference time (s)": { - "description": "min=0.405, mean=0.405, max=0.405, sum=0.81 (2)", - "tab": "Efficiency", - "score": 0.40487982749938967 - }, - "Business Ethics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Business Ethics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Business Ethics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Business Ethics - # prompt tokens": { - "description": "min=562.87, mean=562.87, max=562.87, sum=1125.74 (2)", - "tab": "General information", - "score": 562.87 - }, - "Business Ethics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" - } - } - }, - { - "evaluation_name": "Clinical Knowledge", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Clinical Knowledge", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.834, - "details": { - "description": "min=0.834, mean=0.834, max=0.834, sum=1.668 (2)", - "tab": "Accuracy", - "Clinical Knowledge - Observed inference time (s)": { - "description": "min=0.349, mean=0.349, max=0.349, sum=0.698 (2)", - "tab": "Efficiency", - "score": 0.34907986892844145 - }, - "Clinical Knowledge - # eval": { - "description": "min=265, mean=265, max=265, sum=530 (2)", - "tab": "General information", - "score": 265.0 - }, - "Clinical Knowledge - # train": { - "description": "min=5, mean=5, max=5, 
sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Clinical Knowledge - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Clinical Knowledge - # prompt tokens": { - "description": "min=393.623, mean=393.623, max=393.623, sum=787.245 (2)", - "tab": "General information", - "score": 393.62264150943395 - }, - "Clinical Knowledge - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" - } - } - }, - { - "evaluation_name": "Conceptual Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Conceptual Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.821, - "details": { - "description": "min=0.821, mean=0.821, max=0.821, sum=1.643 (2)", - "tab": "Accuracy", - "Conceptual Physics - Observed inference time (s)": { - "description": "min=0.329, mean=0.329, max=0.329, sum=0.658 (2)", - "tab": "Efficiency", - "score": 0.3290608903194996 - }, - "Conceptual Physics - # eval": { - "description": "min=235, mean=235, max=235, sum=470 (2)", - "tab": "General information", - "score": 235.0 - }, - "Conceptual Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Conceptual Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Conceptual Physics - # prompt tokens": { - "description": "min=298.494, mean=298.494, max=298.494, sum=596.987 (2)", - "tab": "General information", - "score": 298.4936170212766 - }, - "Conceptual Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" - } - } - }, - { - "evaluation_name": "Electrical Engineering", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Electrical Engineering", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.779, - "details": { - "description": "min=0.779, mean=0.779, max=0.779, sum=1.559 (2)", - "tab": "Accuracy", - "Electrical Engineering - Observed inference time (s)": { - "description": "min=0.323, mean=0.323, max=0.323, sum=0.646 (2)", - "tab": "Efficiency", - "score": 0.32275488458830737 - }, - "Electrical Engineering - # eval": { - "description": "min=145, mean=145, max=145, sum=290 (2)", - "tab": "General information", - "score": 145.0 - }, - "Electrical Engineering - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, 
- "Electrical Engineering - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Electrical Engineering - # prompt tokens": { - "description": "min=456.8, mean=456.8, max=456.8, sum=913.6 (2)", - "tab": "General information", - "score": 456.8 - }, - "Electrical Engineering - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" - } - } - }, - { - "evaluation_name": "Elementary Mathematics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Elementary Mathematics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.696, - "details": { - "description": "min=0.696, mean=0.696, max=0.696, sum=1.392 (2)", - "tab": "Accuracy", - "Elementary Mathematics - Observed inference time (s)": { - "description": "min=0.365, mean=0.365, max=0.365, sum=0.73 (2)", - "tab": "Efficiency", - "score": 0.364848568325951 - }, - "Elementary Mathematics - # eval": { - "description": "min=378, mean=378, max=378, sum=756 (2)", - "tab": "General information", - "score": 378.0 - }, - "Elementary Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Elementary Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Elementary Mathematics - # prompt tokens": { - "description": "min=570.119, mean=570.119, max=570.119, sum=1140.238 (2)", - "tab": "General information", - "score": 570.1190476190476 - }, - "Elementary Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" - } - } - }, - { - "evaluation_name": "Formal Logic", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Formal Logic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.556, - "details": { - "description": "min=0.556, mean=0.556, max=0.556, sum=1.111 (2)", - "tab": "Accuracy", - "Formal Logic - Observed inference time (s)": { - "description": "min=0.359, mean=0.359, max=0.359, sum=0.718 (2)", - "tab": "Efficiency", - "score": 0.3588152726491292 - }, - "Formal Logic - # eval": { - "description": "min=126, mean=126, max=126, sum=252 (2)", - "tab": "General information", - "score": 126.0 - }, - "Formal Logic - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Formal Logic - truncated": { - "description": "min=0, mean=0, max=0, sum=0 
(2)", - "tab": "General information", - "score": 0.0 - }, - "Formal Logic - # prompt tokens": { - "description": "min=597.667, mean=597.667, max=597.667, sum=1195.333 (2)", - "tab": "General information", - "score": 597.6666666666666 - }, - "Formal Logic - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" - } - } - }, - { - "evaluation_name": "High School World History", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on High School World History", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.899, - "details": { - "description": "min=0.899, mean=0.899, max=0.899, sum=1.797 (2)", - "tab": "Accuracy", - "High School Biology - Observed inference time (s)": { - "description": "min=0.365, mean=0.365, max=0.365, sum=0.729 (2)", - "tab": "Efficiency", - "score": 0.3646186044139247 - }, - "High School Chemistry - Observed inference time (s)": { - "description": "min=0.366, mean=0.366, max=0.366, sum=0.731 (2)", - "tab": "Efficiency", - "score": 0.36553433728335527 - }, - "High School Computer Science - Observed inference time (s)": { - "description": "min=0.381, mean=0.381, max=0.381, sum=0.761 (2)", - "tab": "Efficiency", - "score": 0.38066073894500735 - }, - "High School European History - Observed inference time (s)": { - "description": "min=0.713, mean=0.713, max=0.713, sum=1.426 (2)", - "tab": "Efficiency", - "score": 0.7130387075019605 - }, - "High School Geography - Observed inference time (s)": { - "description": "min=0.36, mean=0.36, max=0.36, sum=0.72 (2)", - "tab": "Efficiency", - "score": 0.36007895975401905 - }, - "High School Government And Politics - Observed inference time (s)": { - "description": "min=0.336, mean=0.336, max=0.336, sum=0.672 (2)", - "tab": "Efficiency", - "score": 0.3358402029837969 - }, - "High School Macroeconomics - Observed inference time (s)": { - "description": "min=0.332, mean=0.332, max=0.332, sum=0.663 (2)", - "tab": "Efficiency", - "score": 0.3316040589259221 - }, - "High School Mathematics - Observed inference time (s)": { - "description": "min=0.374, mean=0.374, max=0.374, sum=0.747 (2)", - "tab": "Efficiency", - "score": 0.3736002833754928 - }, - "High School Microeconomics - Observed inference time (s)": { - "description": "min=0.325, mean=0.325, max=0.325, sum=0.649 (2)", - "tab": "Efficiency", - "score": 0.32468783655086486 - }, - "High School Physics - Observed inference time (s)": { - "description": "min=0.392, mean=0.392, max=0.392, sum=0.785 (2)", - "tab": "Efficiency", - "score": 0.3924832533526894 - }, - "High School Psychology - Observed inference time (s)": { - "description": "min=0.36, mean=0.36, max=0.36, sum=0.721 (2)", - "tab": "Efficiency", - "score": 0.3602875184575352 - }, - "High School Statistics - Observed inference time (s)": { - "description": "min=0.399, mean=0.399, max=0.399, sum=0.798 (2)", - "tab": "Efficiency", - "score": 0.39876955968362315 - }, - "High School US History - Observed inference time (s)": { - "description": "min=0.554, mean=0.554, max=0.554, sum=1.107 (2)", - 
"tab": "Efficiency", - "score": 0.5536784272567898 - }, - "High School World History - Observed inference time (s)": { - "description": "min=0.475, mean=0.475, max=0.475, sum=0.949 (2)", - "tab": "Efficiency", - "score": 0.474577054695741 - }, - "High School Biology - # eval": { - "description": "min=310, mean=310, max=310, sum=620 (2)", - "tab": "General information", - "score": 310.0 - }, - "High School Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Biology - # prompt tokens": { - "description": "min=506.916, mean=506.916, max=506.916, sum=1013.832 (2)", - "tab": "General information", - "score": 506.9161290322581 - }, - "High School Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Chemistry - # eval": { - "description": "min=203, mean=203, max=203, sum=406 (2)", - "tab": "General information", - "score": 203.0 - }, - "High School Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # prompt tokens": { - "description": "min=510.261, mean=510.261, max=510.261, sum=1020.522 (2)", - "tab": "General information", - "score": 510.2610837438424 - }, - "High School Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "High School Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # prompt tokens": { - "description": "min=871.46, mean=871.46, max=871.46, sum=1742.92 (2)", - "tab": "General information", - "score": 871.46 - }, - "High School Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School European History - # eval": { - "description": "min=165, mean=165, max=165, sum=330 (2)", - "tab": "General information", - "score": 165.0 - }, - "High School European History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School European History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School European History - # prompt tokens": { - "description": "min=2807.903, mean=2807.903, max=2807.903, sum=5615.806 (2)", - "tab": "General information", - "score": 2807.9030303030304 - }, - "High School European History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Geography - # eval": { - "description": "min=198, mean=198, max=198, sum=396 (2)", - "tab": "General information", - 
"score": 198.0 - }, - "High School Geography - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Geography - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Geography - # prompt tokens": { - "description": "min=365.217, mean=365.217, max=365.217, sum=730.434 (2)", - "tab": "General information", - "score": 365.2171717171717 - }, - "High School Geography - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Government And Politics - # eval": { - "description": "min=193, mean=193, max=193, sum=386 (2)", - "tab": "General information", - "score": 193.0 - }, - "High School Government And Politics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Government And Politics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # prompt tokens": { - "description": "min=460.311, mean=460.311, max=460.311, sum=920.622 (2)", - "tab": "General information", - "score": 460.31088082901556 - }, - "High School Government And Politics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Macroeconomics - # eval": { - "description": "min=390, mean=390, max=390, sum=780 (2)", - "tab": "General information", - "score": 390.0 - }, - "High School Macroeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Macroeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - # prompt tokens": { - "description": "min=367.349, mean=367.349, max=367.349, sum=734.697 (2)", - "tab": "General information", - "score": 367.34871794871793 - }, - "High School Macroeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Mathematics - # eval": { - "description": "min=270, mean=270, max=270, sum=540 (2)", - "tab": "General information", - "score": 270.0 - }, - "High School Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # prompt tokens": { - "description": "min=558.326, mean=558.326, max=558.326, sum=1116.652 (2)", - "tab": "General information", - "score": 558.325925925926 - }, - "High School Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Microeconomics - # eval": { - "description": "min=238, mean=238, max=238, sum=476 (2)", - "tab": "General information", - "score": 238.0 - }, - "High School Microeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Microeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, 
- "High School Microeconomics - # prompt tokens": { - "description": "min=395.277, mean=395.277, max=395.277, sum=790.555 (2)", - "tab": "General information", - "score": 395.2773109243698 - }, - "High School Microeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Physics - # eval": { - "description": "min=151, mean=151, max=151, sum=302 (2)", - "tab": "General information", - "score": 151.0 - }, - "High School Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # prompt tokens": { - "description": "min=573.536, mean=573.536, max=573.536, sum=1147.073 (2)", - "tab": "General information", - "score": 573.5364238410596 - }, - "High School Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Psychology - # eval": { - "description": "min=545, mean=545, max=545, sum=1090 (2)", - "tab": "General information", - "score": 545.0 - }, - "High School Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # prompt tokens": { - "description": "min=488.521, mean=488.521, max=488.521, sum=977.042 (2)", - "tab": "General information", - "score": 488.52110091743117 - }, - "High School Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Statistics - # eval": { - "description": "min=216, mean=216, max=216, sum=432 (2)", - "tab": "General information", - "score": 216.0 - }, - "High School Statistics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Statistics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Statistics - # prompt tokens": { - "description": "min=823.477, mean=823.477, max=823.477, sum=1646.954 (2)", - "tab": "General information", - "score": 823.4768518518518 - }, - "High School Statistics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School US History - # eval": { - "description": "min=204, mean=204, max=204, sum=408 (2)", - "tab": "General information", - "score": 204.0 - }, - "High School US History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School US History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # prompt tokens": { - "description": "min=2230.176, mean=2230.176, max=2230.176, sum=4460.353 (2)", - "tab": "General information", - "score": 2230.176470588235 - }, - "High School US History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School World History - # eval": { - "description": 
"min=237, mean=237, max=237, sum=474 (2)", - "tab": "General information", - "score": 237.0 - }, - "High School World History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School World History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School World History - # prompt tokens": { - "description": "min=1441.354, mean=1441.354, max=1441.354, sum=2882.709 (2)", - "tab": "General information", - "score": 1441.3544303797469 - }, - "High School World History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" - } - } - }, - { - "evaluation_name": "Human Sexuality", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Human Sexuality", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.878, - "details": { - "description": "min=0.878, mean=0.878, max=0.878, sum=1.756 (2)", - "tab": "Accuracy", - "Human Aging - Observed inference time (s)": { - "description": "min=0.346, mean=0.346, max=0.346, sum=0.692 (2)", - "tab": "Efficiency", - "score": 0.34584820110167086 - }, - "Human Sexuality - Observed inference time (s)": { - "description": "min=0.357, mean=0.357, max=0.357, sum=0.714 (2)", - "tab": "Efficiency", - "score": 0.35706568856275717 - }, - "Human Aging - # eval": { - "description": "min=223, mean=223, max=223, sum=446 (2)", - "tab": "General information", - "score": 223.0 - }, - "Human Aging - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Aging - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Aging - # prompt tokens": { - "description": "min=315.121, mean=315.121, max=315.121, sum=630.242 (2)", - "tab": "General information", - "score": 315.1210762331838 - }, - "Human Aging - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Human Sexuality - # eval": { - "description": "min=131, mean=131, max=131, sum=262 (2)", - "tab": "General information", - "score": 131.0 - }, - "Human Sexuality - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Sexuality - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # prompt tokens": { - "description": "min=334.504, mean=334.504, max=334.504, sum=669.008 (2)", - "tab": "General information", - "score": 334.5038167938931 - }, - "Human Sexuality - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": 
"mmlu_human_sexuality" - } - } - }, - { - "evaluation_name": "International Law", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on International Law", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.909, - "details": { - "description": "min=0.909, mean=0.909, max=0.909, sum=1.818 (2)", - "tab": "Accuracy", - "International Law - Observed inference time (s)": { - "description": "min=0.375, mean=0.375, max=0.375, sum=0.75 (2)", - "tab": "Efficiency", - "score": 0.37501588931753616 - }, - "International Law - # eval": { - "description": "min=121, mean=121, max=121, sum=242 (2)", - "tab": "General information", - "score": 121.0 - }, - "International Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "International Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "International Law - # prompt tokens": { - "description": "min=633.579, mean=633.579, max=633.579, sum=1267.157 (2)", - "tab": "General information", - "score": 633.5785123966942 - }, - "International Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" - } - } - }, - { - "evaluation_name": "Logical Fallacies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Logical Fallacies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.853, - "details": { - "description": "min=0.853, mean=0.853, max=0.853, sum=1.706 (2)", - "tab": "Accuracy", - "Logical Fallacies - Observed inference time (s)": { - "description": "min=0.347, mean=0.347, max=0.347, sum=0.694 (2)", - "tab": "Efficiency", - "score": 0.34693217131257786 - }, - "Logical Fallacies - # eval": { - "description": "min=163, mean=163, max=163, sum=326 (2)", - "tab": "General information", - "score": 163.0 - }, - "Logical Fallacies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Logical Fallacies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Logical Fallacies - # prompt tokens": { - "description": "min=442.632, mean=442.632, max=442.632, sum=885.264 (2)", - "tab": "General information", - "score": 442.6319018404908 - }, - "Logical Fallacies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" - } - } - }, - { - "evaluation_name": "Machine Learning", - "source_data": { - 
"dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Machine Learning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.67, - "details": { - "description": "min=0.67, mean=0.67, max=0.67, sum=1.339 (2)", - "tab": "Accuracy", - "Machine Learning - Observed inference time (s)": { - "description": "min=0.36, mean=0.36, max=0.36, sum=0.719 (2)", - "tab": "Efficiency", - "score": 0.3595333376101085 - }, - "Machine Learning - # eval": { - "description": "min=112, mean=112, max=112, sum=224 (2)", - "tab": "General information", - "score": 112.0 - }, - "Machine Learning - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Machine Learning - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Machine Learning - # prompt tokens": { - "description": "min=674.848, mean=674.848, max=674.848, sum=1349.696 (2)", - "tab": "General information", - "score": 674.8482142857143 - }, - "Machine Learning - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" - } - } - }, - { - "evaluation_name": "Management", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Management", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.854, - "details": { - "description": "min=0.854, mean=0.854, max=0.854, sum=1.709 (2)", - "tab": "Accuracy", - "Management - Observed inference time (s)": { - "description": "min=0.346, mean=0.346, max=0.346, sum=0.692 (2)", - "tab": "Efficiency", - "score": 0.3462491313230644 - }, - "Management - # eval": { - "description": "min=103, mean=103, max=103, sum=206 (2)", - "tab": "General information", - "score": 103.0 - }, - "Management - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Management - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Management - # prompt tokens": { - "description": "min=276.854, mean=276.854, max=276.854, sum=553.709 (2)", - "tab": "General information", - "score": 276.8543689320388 - }, - "Management - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" - } - } - }, - { - "evaluation_name": "Marketing", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - 
"metric_config": { - "evaluation_description": "EM on Marketing", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.949, - "details": { - "description": "min=0.949, mean=0.949, max=0.949, sum=1.897 (2)", - "tab": "Accuracy", - "Marketing - Observed inference time (s)": { - "description": "min=0.35, mean=0.35, max=0.35, sum=0.7 (2)", - "tab": "Efficiency", - "score": 0.3498607089376857 - }, - "Marketing - # eval": { - "description": "min=234, mean=234, max=234, sum=468 (2)", - "tab": "General information", - "score": 234.0 - }, - "Marketing - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Marketing - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Marketing - # prompt tokens": { - "description": "min=397.415, mean=397.415, max=397.415, sum=794.829 (2)", - "tab": "General information", - "score": 397.4145299145299 - }, - "Marketing - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" - } - } - }, - { - "evaluation_name": "Medical Genetics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Medical Genetics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.87, - "details": { - "description": "min=0.87, mean=0.87, max=0.87, sum=1.74 (2)", - "tab": "Accuracy", - "Medical Genetics - Observed inference time (s)": { - "description": "min=0.343, mean=0.343, max=0.343, sum=0.686 (2)", - "tab": "Efficiency", - "score": 0.3427603816986084 - }, - "Medical Genetics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Medical Genetics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Medical Genetics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Medical Genetics - # prompt tokens": { - "description": "min=335.35, mean=335.35, max=335.35, sum=670.7 (2)", - "tab": "General information", - "score": 335.35 - }, - "Medical Genetics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" - } - } - }, - { - "evaluation_name": "Miscellaneous", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Miscellaneous", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.921, - 
"details": { - "description": "min=0.921, mean=0.921, max=0.921, sum=1.842 (2)", - "tab": "Accuracy", - "Miscellaneous - Observed inference time (s)": { - "description": "min=0.343, mean=0.343, max=0.343, sum=0.687 (2)", - "tab": "Efficiency", - "score": 0.3433326785744074 - }, - "Miscellaneous - # eval": { - "description": "min=783, mean=783, max=783, sum=1566 (2)", - "tab": "General information", - "score": 783.0 - }, - "Miscellaneous - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Miscellaneous - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Miscellaneous - # prompt tokens": { - "description": "min=296.7, mean=296.7, max=296.7, sum=593.4 (2)", - "tab": "General information", - "score": 296.6998722860792 - }, - "Miscellaneous - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" - } - } - }, - { - "evaluation_name": "Moral Scenarios", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Moral Scenarios", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.669, - "details": { - "description": "min=0.669, mean=0.669, max=0.669, sum=1.339 (2)", - "tab": "Accuracy", - "Moral Disputes - Observed inference time (s)": { - "description": "min=0.347, mean=0.347, max=0.347, sum=0.693 (2)", - "tab": "Efficiency", - "score": 0.34657375729841994 - }, - "Moral Scenarios - Observed inference time (s)": { - "description": "min=0.374, mean=0.374, max=0.374, sum=0.749 (2)", - "tab": "Efficiency", - "score": 0.37438980161144747 - }, - "Moral Disputes - # eval": { - "description": "min=346, mean=346, max=346, sum=692 (2)", - "tab": "General information", - "score": 346.0 - }, - "Moral Disputes - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Disputes - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Disputes - # prompt tokens": { - "description": "min=469.182, mean=469.182, max=469.182, sum=938.364 (2)", - "tab": "General information", - "score": 469.1820809248555 - }, - "Moral Disputes - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Moral Scenarios - # eval": { - "description": "min=895, mean=895, max=895, sum=1790 (2)", - "tab": "General information", - "score": 895.0 - }, - "Moral Scenarios - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Scenarios - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # prompt tokens": { - "description": "min=661.494, mean=661.494, max=661.494, sum=1322.988 (2)", - "tab": "General information", - "score": 661.4938547486033 - }, - "Moral Scenarios - # output tokens": { - "description": 
"min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" - } - } - }, - { - "evaluation_name": "Nutrition", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Nutrition", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.859, - "details": { - "description": "min=0.859, mean=0.859, max=0.859, sum=1.719 (2)", - "tab": "Accuracy", - "Nutrition - Observed inference time (s)": { - "description": "min=0.372, mean=0.372, max=0.372, sum=0.744 (2)", - "tab": "Efficiency", - "score": 0.3719378265680051 - }, - "Nutrition - # eval": { - "description": "min=306, mean=306, max=306, sum=612 (2)", - "tab": "General information", - "score": 306.0 - }, - "Nutrition - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Nutrition - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Nutrition - # prompt tokens": { - "description": "min=592.637, mean=592.637, max=592.637, sum=1185.275 (2)", - "tab": "General information", - "score": 592.6372549019608 - }, - "Nutrition - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" - } - } - }, - { - "evaluation_name": "Prehistory", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Prehistory", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.88, - "details": { - "description": "min=0.88, mean=0.88, max=0.88, sum=1.759 (2)", - "tab": "Accuracy", - "Prehistory - Observed inference time (s)": { - "description": "min=0.36, mean=0.36, max=0.36, sum=0.72 (2)", - "tab": "Efficiency", - "score": 0.35996099313100177 - }, - "Prehistory - # eval": { - "description": "min=324, mean=324, max=324, sum=648 (2)", - "tab": "General information", - "score": 324.0 - }, - "Prehistory - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Prehistory - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Prehistory - # prompt tokens": { - "description": "min=521.364, mean=521.364, max=521.364, sum=1042.728 (2)", - "tab": "General information", - "score": 521.3641975308642 - }, - "Prehistory - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": 
"mmlu_prehistory" - } - } - }, - { - "evaluation_name": "Public Relations", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Public Relations", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.755, - "details": { - "description": "min=0.755, mean=0.755, max=0.755, sum=1.509 (2)", - "tab": "Accuracy", - "Public Relations - Observed inference time (s)": { - "description": "min=0.34, mean=0.34, max=0.34, sum=0.68 (2)", - "tab": "Efficiency", - "score": 0.340008375861428 - }, - "Public Relations - # eval": { - "description": "min=110, mean=110, max=110, sum=220 (2)", - "tab": "General information", - "score": 110.0 - }, - "Public Relations - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Public Relations - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Public Relations - # prompt tokens": { - "description": "min=401.427, mean=401.427, max=401.427, sum=802.855 (2)", - "tab": "General information", - "score": 401.42727272727274 - }, - "Public Relations - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" - } - } - }, - { - "evaluation_name": "Security Studies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Security Studies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.824, - "details": { - "description": "min=0.824, mean=0.824, max=0.824, sum=1.649 (2)", - "tab": "Accuracy", - "Security Studies - Observed inference time (s)": { - "description": "min=0.432, mean=0.432, max=0.432, sum=0.864 (2)", - "tab": "Efficiency", - "score": 0.43211937923820654 - }, - "Security Studies - # eval": { - "description": "min=245, mean=245, max=245, sum=490 (2)", - "tab": "General information", - "score": 245.0 - }, - "Security Studies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Security Studies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Security Studies - # prompt tokens": { - "description": "min=1159.931, mean=1159.931, max=1159.931, sum=2319.861 (2)", - "tab": "General information", - "score": 1159.930612244898 - }, - "Security Studies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" - } - } - }, - { - "evaluation_name": "Sociology", - "source_data": { - "dataset_name": "helm_mmlu", - 
"source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Sociology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9, - "details": { - "description": "min=0.9, mean=0.9, max=0.9, sum=1.801 (2)", - "tab": "Accuracy", - "Sociology - Observed inference time (s)": { - "description": "min=0.353, mean=0.353, max=0.353, sum=0.707 (2)", - "tab": "Efficiency", - "score": 0.35334858491053034 - }, - "Sociology - # eval": { - "description": "min=201, mean=201, max=201, sum=402 (2)", - "tab": "General information", - "score": 201.0 - }, - "Sociology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Sociology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Sociology - # prompt tokens": { - "description": "min=443.1, mean=443.1, max=443.1, sum=886.199 (2)", - "tab": "General information", - "score": 443.0995024875622 - }, - "Sociology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" - } - } - }, - { - "evaluation_name": "Virology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Virology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.584, - "details": { - "description": "min=0.584, mean=0.584, max=0.584, sum=1.169 (2)", - "tab": "Accuracy", - "Virology - Observed inference time (s)": { - "description": "min=0.338, mean=0.338, max=0.338, sum=0.676 (2)", - "tab": "Efficiency", - "score": 0.33793931696788376 - }, - "Virology - # eval": { - "description": "min=166, mean=166, max=166, sum=332 (2)", - "tab": "General information", - "score": 166.0 - }, - "Virology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Virology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Virology - # prompt tokens": { - "description": "min=336.819, mean=336.819, max=336.819, sum=673.639 (2)", - "tab": "General information", - "score": 336.8192771084337 - }, - "Virology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" - } - } - }, - { - "evaluation_name": "World Religions", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on World Religions", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.883, - "details": { - "description": "min=0.883, mean=0.883, max=0.883, sum=1.766 (2)", - "tab": "Accuracy", - "World Religions - Observed inference time (s)": { - "description": "min=0.358, mean=0.358, max=0.358, sum=0.716 (2)", - "tab": "Efficiency", - "score": 0.358185218788727 - }, - "World Religions - # eval": { - "description": "min=171, mean=171, max=171, sum=342 (2)", - "tab": "General information", - "score": 171.0 - }, - "World Religions - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "World Religions - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "World Religions - # prompt tokens": { - "description": "min=269.07, mean=269.07, max=269.07, sum=538.14 (2)", - "tab": "General information", - "score": 269.0701754385965 - }, - "World Religions - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" - } - } - }, - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.65, - "details": { - "tab": "Efficiency" - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_mmlu/qwen/qwen1.5-7b/119b645f-04c8-4979-bff2-d1e4fdc2a7bc.json b/data/helm_mmlu/qwen/qwen1.5-7b/119b645f-04c8-4979-bff2-d1e4fdc2a7bc.json deleted file mode 100644 index 166da7894..000000000 --- a/data/helm_mmlu/qwen/qwen1.5-7b/119b645f-04c8-4979-bff2-d1e4fdc2a7bc.json +++ /dev/null @@ -1,3021 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/qwen_qwen1.5-7b/1770835937.459157", - "retrieved_timestamp": "1770835937.459157", - "source_metadata": { - "source_name": "helm_mmlu", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen1.5 7B", - "id": "qwen/qwen1.5-7b", - "developer": "qwen", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "MMLU All Subjects", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU All Subjects", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.626, - "details": { - "description": "min=0.364, mean=0.626, max=0.863, sum=71.339 (114)", - "tab": "Accuracy", - "MMLU All Subjects - Observed inference time (s)": { - "description": "min=0.269, mean=0.302, max=0.42, sum=34.377 (114)", - "tab": "Efficiency", - "score": 0.3015485066726155 - }, - "MMLU All 
Subjects - # eval": { - "description": "min=100, mean=246.351, max=1534, sum=28084 (114)", - "tab": "General information", - "score": 246.35087719298247 - }, - "MMLU All Subjects - # train": { - "description": "min=5, mean=5, max=5, sum=570 (114)", - "tab": "General information", - "score": 5.0 - }, - "MMLU All Subjects - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (114)", - "tab": "General information", - "score": 0.0 - }, - "MMLU All Subjects - # prompt tokens": { - "description": "min=269.07, mean=618.598, max=2807.903, sum=70520.198 (114)", - "tab": "General information", - "score": 618.5982315160392 - }, - "MMLU All Subjects - # output tokens": { - "description": "min=1, mean=1, max=1, sum=114 (114)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", 
- "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] - } - } - }, - { - "evaluation_name": "Abstract Algebra", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Abstract Algebra", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.39, - "details": { - "description": "min=0.39, mean=0.39, max=0.39, sum=0.78 (2)", - "tab": "Accuracy", - "Abstract Algebra - Observed inference time (s)": { - "description": "min=0.281, mean=0.281, max=0.281, sum=0.562 (2)", - "tab": "Efficiency", - "score": 0.28086970567703246 - }, - "Abstract Algebra - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Abstract Algebra - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Abstract Algebra - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Abstract Algebra - # prompt tokens": { - "description": "min=371.19, mean=371.19, max=371.19, sum=742.38 (2)", - "tab": "General information", - "score": 371.19 - }, - "Abstract Algebra - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" - } - } - }, - { - "evaluation_name": "Anatomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Anatomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.526, - "details": { - "description": "min=0.526, mean=0.526, max=0.526, sum=1.052 (2)", - "tab": "Accuracy", - "Anatomy - Observed inference time (s)": { - "description": "min=0.286, mean=0.286, max=0.286, sum=0.572 (2)", - "tab": "Efficiency", - "score": 0.2861745004300718 - }, - "Anatomy - # eval": { - "description": "min=135, mean=135, max=135, sum=270 (2)", - "tab": "General information", - "score": 135.0 - }, - "Anatomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Anatomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Anatomy - # prompt tokens": { - "description": "min=346.978, mean=346.978, max=346.978, sum=693.956 (2)", - "tab": "General information", - "score": 346.97777777777776 - }, - "Anatomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "anatomy", - 
"method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" - } - } - }, - { - "evaluation_name": "College Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on College Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.471, - "details": { - "description": "min=0.471, mean=0.471, max=0.471, sum=0.941 (2)", - "tab": "Accuracy", - "College Chemistry - Observed inference time (s)": { - "description": "min=0.296, mean=0.296, max=0.296, sum=0.592 (2)", - "tab": "Efficiency", - "score": 0.2962386703491211 - }, - "College Biology - Observed inference time (s)": { - "description": "min=0.312, mean=0.312, max=0.312, sum=0.624 (2)", - "tab": "Efficiency", - "score": 0.3117961171600554 - }, - "College Computer Science - Observed inference time (s)": { - "description": "min=0.3, mean=0.3, max=0.3, sum=0.599 (2)", - "tab": "Efficiency", - "score": 0.299501326084137 - }, - "College Mathematics - Observed inference time (s)": { - "description": "min=0.303, mean=0.303, max=0.303, sum=0.607 (2)", - "tab": "Efficiency", - "score": 0.3033126187324524 - }, - "College Medicine - Observed inference time (s)": { - "description": "min=0.289, mean=0.289, max=0.289, sum=0.577 (2)", - "tab": "Efficiency", - "score": 0.2886359746745556 - }, - "College Physics - Observed inference time (s)": { - "description": "min=0.322, mean=0.322, max=0.322, sum=0.643 (2)", - "tab": "Efficiency", - "score": 0.32153993026882993 - }, - "College Chemistry - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Chemistry - # prompt tokens": { - "description": "min=561.25, mean=561.25, max=561.25, sum=1122.5 (2)", - "tab": "General information", - "score": 561.25 - }, - "College Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Biology - # eval": { - "description": "min=144, mean=144, max=144, sum=288 (2)", - "tab": "General information", - "score": 144.0 - }, - "College Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # prompt tokens": { - "description": "min=479.979, mean=479.979, max=479.979, sum=959.958 (2)", - "tab": "General information", - "score": 479.9791666666667 - }, - "College Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - 
}, - "College Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer Science - # prompt tokens": { - "description": "min=831.58, mean=831.58, max=831.58, sum=1663.16 (2)", - "tab": "General information", - "score": 831.58 - }, - "College Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Mathematics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # prompt tokens": { - "description": "min=600.7, mean=600.7, max=600.7, sum=1201.4 (2)", - "tab": "General information", - "score": 600.7 - }, - "College Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Medicine - # eval": { - "description": "min=173, mean=173, max=173, sum=346 (2)", - "tab": "General information", - "score": 173.0 - }, - "College Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Medicine - # prompt tokens": { - "description": "min=499.098, mean=499.098, max=499.098, sum=998.197 (2)", - "tab": "General information", - "score": 499.0982658959538 - }, - "College Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Physics - # eval": { - "description": "min=102, mean=102, max=102, sum=204 (2)", - "tab": "General information", - "score": 102.0 - }, - "College Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Physics - # prompt tokens": { - "description": "min=509.265, mean=509.265, max=509.265, sum=1018.529 (2)", - "tab": "General information", - "score": 509.2647058823529 - }, - "College Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" - } - } - }, - { - "evaluation_name": "Computer Security", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Computer Security", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.76, - "details": { - "description": "min=0.76, mean=0.76, max=0.76, sum=1.52 (2)", - "tab": "Accuracy", - "Computer Security - Observed 
inference time (s)": { - "description": "min=0.298, mean=0.298, max=0.298, sum=0.597 (2)", - "tab": "Efficiency", - "score": 0.2982983756065369 - }, - "Computer Security - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Computer Security - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Computer Security - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Computer Security - # prompt tokens": { - "description": "min=379.64, mean=379.64, max=379.64, sum=759.28 (2)", - "tab": "General information", - "score": 379.64 - }, - "Computer Security - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" - } - } - }, - { - "evaluation_name": "Econometrics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Econometrics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.447, - "details": { - "description": "min=0.447, mean=0.447, max=0.447, sum=0.895 (2)", - "tab": "Accuracy", - "Econometrics - Observed inference time (s)": { - "description": "min=0.283, mean=0.283, max=0.283, sum=0.566 (2)", - "tab": "Efficiency", - "score": 0.282820323057342 - }, - "Econometrics - # eval": { - "description": "min=114, mean=114, max=114, sum=228 (2)", - "tab": "General information", - "score": 114.0 - }, - "Econometrics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Econometrics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Econometrics - # prompt tokens": { - "description": "min=620.939, mean=620.939, max=620.939, sum=1241.877 (2)", - "tab": "General information", - "score": 620.938596491228 - }, - "Econometrics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" - } - } - }, - { - "evaluation_name": "Global Facts", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Global Facts", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4, - "details": { - "description": "min=0.4, mean=0.4, max=0.4, sum=0.8 (2)", - "tab": "Accuracy", - "Global Facts - Observed inference time (s)": { - "description": "min=0.294, mean=0.294, max=0.294, sum=0.588 (2)", - "tab": "Efficiency", - "score": 0.2939557838439941 - }, - "Global Facts - # eval": { - "description": 
"min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Global Facts - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Global Facts - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Global Facts - # prompt tokens": { - "description": "min=422.06, mean=422.06, max=422.06, sum=844.12 (2)", - "tab": "General information", - "score": 422.06 - }, - "Global Facts - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" - } - } - }, - { - "evaluation_name": "Jurisprudence", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Jurisprudence", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.778, - "details": { - "description": "min=0.778, mean=0.778, max=0.778, sum=1.556 (2)", - "tab": "Accuracy", - "Jurisprudence - Observed inference time (s)": { - "description": "min=0.297, mean=0.297, max=0.297, sum=0.593 (2)", - "tab": "Efficiency", - "score": 0.2966193402255023 - }, - "Jurisprudence - # eval": { - "description": "min=108, mean=108, max=108, sum=216 (2)", - "tab": "General information", - "score": 108.0 - }, - "Jurisprudence - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Jurisprudence - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Jurisprudence - # prompt tokens": { - "description": "min=387.713, mean=387.713, max=387.713, sum=775.426 (2)", - "tab": "General information", - "score": 387.712962962963 - }, - "Jurisprudence - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" - } - } - }, - { - "evaluation_name": "Philosophy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Philosophy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.691, - "details": { - "description": "min=0.691, mean=0.691, max=0.691, sum=1.383 (2)", - "tab": "Accuracy", - "Philosophy - Observed inference time (s)": { - "description": "min=0.287, mean=0.287, max=0.287, sum=0.575 (2)", - "tab": "Efficiency", - "score": 0.28725898534155353 - }, - "Philosophy - # eval": { - "description": "min=311, mean=311, max=311, sum=622 (2)", - "tab": "General information", - "score": 311.0 - }, - "Philosophy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - 
"score": 5.0 - }, - "Philosophy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Philosophy - # prompt tokens": { - "description": "min=322.09, mean=322.09, max=322.09, sum=644.18 (2)", - "tab": "General information", - "score": 322.09003215434086 - }, - "Philosophy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" - } - } - }, - { - "evaluation_name": "Professional Psychology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Professional Psychology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.603, - "details": { - "description": "min=0.603, mean=0.603, max=0.603, sum=1.206 (2)", - "tab": "Accuracy", - "Professional Medicine - Observed inference time (s)": { - "description": "min=0.309, mean=0.309, max=0.309, sum=0.617 (2)", - "tab": "Efficiency", - "score": 0.30863527515355277 - }, - "Professional Accounting - Observed inference time (s)": { - "description": "min=0.293, mean=0.293, max=0.293, sum=0.585 (2)", - "tab": "Efficiency", - "score": 0.2926285613513162 - }, - "Professional Law - Observed inference time (s)": { - "description": "min=0.323, mean=0.323, max=0.323, sum=0.645 (2)", - "tab": "Efficiency", - "score": 0.32274515889925004 - }, - "Professional Psychology - Observed inference time (s)": { - "description": "min=0.303, mean=0.303, max=0.303, sum=0.607 (2)", - "tab": "Efficiency", - "score": 0.30344173058964846 - }, - "Professional Medicine - # eval": { - "description": "min=272, mean=272, max=272, sum=544 (2)", - "tab": "General information", - "score": 272.0 - }, - "Professional Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Medicine - # prompt tokens": { - "description": "min=1118.199, mean=1118.199, max=1118.199, sum=2236.397 (2)", - "tab": "General information", - "score": 1118.1985294117646 - }, - "Professional Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Accounting - # eval": { - "description": "min=282, mean=282, max=282, sum=564 (2)", - "tab": "General information", - "score": 282.0 - }, - "Professional Accounting - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Accounting - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Accounting - # prompt tokens": { - "description": "min=732.34, mean=732.34, max=732.34, sum=1464.681 (2)", - "tab": "General information", - "score": 732.3404255319149 - }, - "Professional Accounting - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 
1.0 - }, - "Professional Law - # eval": { - "description": "min=1534, mean=1534, max=1534, sum=3068 (2)", - "tab": "General information", - "score": 1534.0 - }, - "Professional Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # prompt tokens": { - "description": "min=1656.969, mean=1656.969, max=1656.969, sum=3313.939 (2)", - "tab": "General information", - "score": 1656.9693611473272 - }, - "Professional Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Psychology - # eval": { - "description": "min=612, mean=612, max=612, sum=1224 (2)", - "tab": "General information", - "score": 612.0 - }, - "Professional Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # prompt tokens": { - "description": "min=574.417, mean=574.417, max=574.417, sum=1148.833 (2)", - "tab": "General information", - "score": 574.4166666666666 - }, - "Professional Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" - } - } - }, - { - "evaluation_name": "Us Foreign Policy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Us Foreign Policy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.84, - "details": { - "description": "min=0.84, mean=0.84, max=0.84, sum=1.68 (2)", - "tab": "Accuracy", - "Us Foreign Policy - Observed inference time (s)": { - "description": "min=0.289, mean=0.289, max=0.289, sum=0.578 (2)", - "tab": "Efficiency", - "score": 0.28910151720046995 - }, - "Us Foreign Policy - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Us Foreign Policy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Us Foreign Policy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Us Foreign Policy - # prompt tokens": { - "description": "min=421.16, mean=421.16, max=421.16, sum=842.32 (2)", - "tab": "General information", - "score": 421.16 - }, - "Us Foreign Policy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" - } - } - }, - { - "evaluation_name": "Astronomy", - "source_data": { - 
"dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Astronomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.671, - "details": { - "description": "min=0.671, mean=0.671, max=0.671, sum=1.342 (2)", - "tab": "Accuracy", - "Astronomy - Observed inference time (s)": { - "description": "min=0.307, mean=0.307, max=0.307, sum=0.614 (2)", - "tab": "Efficiency", - "score": 0.30717346699614273 - }, - "Astronomy - # eval": { - "description": "min=152, mean=152, max=152, sum=304 (2)", - "tab": "General information", - "score": 152.0 - }, - "Astronomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Astronomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Astronomy - # prompt tokens": { - "description": "min=582.849, mean=582.849, max=582.849, sum=1165.697 (2)", - "tab": "General information", - "score": 582.8486842105264 - }, - "Astronomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" - } - } - }, - { - "evaluation_name": "Business Ethics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Business Ethics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.69, - "details": { - "description": "min=0.69, mean=0.69, max=0.69, sum=1.38 (2)", - "tab": "Accuracy", - "Business Ethics - Observed inference time (s)": { - "description": "min=0.306, mean=0.306, max=0.306, sum=0.612 (2)", - "tab": "Efficiency", - "score": 0.3062057161331177 - }, - "Business Ethics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Business Ethics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Business Ethics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Business Ethics - # prompt tokens": { - "description": "min=562.87, mean=562.87, max=562.87, sum=1125.74 (2)", - "tab": "General information", - "score": 562.87 - }, - "Business Ethics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" - } - } - }, - { - "evaluation_name": "Clinical Knowledge", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - 
"evaluation_description": "EM on Clinical Knowledge", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.691, - "details": { - "description": "min=0.691, mean=0.691, max=0.691, sum=1.381 (2)", - "tab": "Accuracy", - "Clinical Knowledge - Observed inference time (s)": { - "description": "min=0.295, mean=0.295, max=0.295, sum=0.589 (2)", - "tab": "Efficiency", - "score": 0.2947473319071644 - }, - "Clinical Knowledge - # eval": { - "description": "min=265, mean=265, max=265, sum=530 (2)", - "tab": "General information", - "score": 265.0 - }, - "Clinical Knowledge - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Clinical Knowledge - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Clinical Knowledge - # prompt tokens": { - "description": "min=393.623, mean=393.623, max=393.623, sum=787.245 (2)", - "tab": "General information", - "score": 393.62264150943395 - }, - "Clinical Knowledge - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" - } - } - }, - { - "evaluation_name": "Conceptual Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Conceptual Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.579, - "details": { - "description": "min=0.579, mean=0.579, max=0.579, sum=1.157 (2)", - "tab": "Accuracy", - "Conceptual Physics - Observed inference time (s)": { - "description": "min=0.28, mean=0.28, max=0.28, sum=0.561 (2)", - "tab": "Efficiency", - "score": 0.2803657531738281 - }, - "Conceptual Physics - # eval": { - "description": "min=235, mean=235, max=235, sum=470 (2)", - "tab": "General information", - "score": 235.0 - }, - "Conceptual Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Conceptual Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Conceptual Physics - # prompt tokens": { - "description": "min=298.494, mean=298.494, max=298.494, sum=596.987 (2)", - "tab": "General information", - "score": 298.4936170212766 - }, - "Conceptual Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" - } - } - }, - { - "evaluation_name": "Electrical Engineering", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Electrical Engineering", - "lower_is_better": 
false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.572, - "details": { - "description": "min=0.572, mean=0.572, max=0.572, sum=1.145 (2)", - "tab": "Accuracy", - "Electrical Engineering - Observed inference time (s)": { - "description": "min=0.269, mean=0.269, max=0.269, sum=0.539 (2)", - "tab": "Efficiency", - "score": 0.2693853361853238 - }, - "Electrical Engineering - # eval": { - "description": "min=145, mean=145, max=145, sum=290 (2)", - "tab": "General information", - "score": 145.0 - }, - "Electrical Engineering - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Electrical Engineering - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Electrical Engineering - # prompt tokens": { - "description": "min=456.8, mean=456.8, max=456.8, sum=913.6 (2)", - "tab": "General information", - "score": 456.8 - }, - "Electrical Engineering - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" - } - } - }, - { - "evaluation_name": "Elementary Mathematics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Elementary Mathematics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5, - "details": { - "description": "min=0.5, mean=0.5, max=0.5, sum=1 (2)", - "tab": "Accuracy", - "Elementary Mathematics - Observed inference time (s)": { - "description": "min=0.294, mean=0.294, max=0.294, sum=0.588 (2)", - "tab": "Efficiency", - "score": 0.2938981220204994 - }, - "Elementary Mathematics - # eval": { - "description": "min=378, mean=378, max=378, sum=756 (2)", - "tab": "General information", - "score": 378.0 - }, - "Elementary Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Elementary Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Elementary Mathematics - # prompt tokens": { - "description": "min=570.119, mean=570.119, max=570.119, sum=1140.238 (2)", - "tab": "General information", - "score": 570.1190476190476 - }, - "Elementary Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" - } - } - }, - { - "evaluation_name": "Formal Logic", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Formal Logic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, 
- "max_score": 1.0 - }, - "score_details": { - "score": 0.397, - "details": { - "description": "min=0.397, mean=0.397, max=0.397, sum=0.794 (2)", - "tab": "Accuracy", - "Formal Logic - Observed inference time (s)": { - "description": "min=0.3, mean=0.3, max=0.3, sum=0.601 (2)", - "tab": "Efficiency", - "score": 0.300293557227604 - }, - "Formal Logic - # eval": { - "description": "min=126, mean=126, max=126, sum=252 (2)", - "tab": "General information", - "score": 126.0 - }, - "Formal Logic - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Formal Logic - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Formal Logic - # prompt tokens": { - "description": "min=597.667, mean=597.667, max=597.667, sum=1195.333 (2)", - "tab": "General information", - "score": 597.6666666666666 - }, - "Formal Logic - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" - } - } - }, - { - "evaluation_name": "High School World History", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on High School World History", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.789, - "details": { - "description": "min=0.789, mean=0.789, max=0.789, sum=1.578 (2)", - "tab": "Accuracy", - "High School Biology - Observed inference time (s)": { - "description": "min=0.303, mean=0.303, max=0.303, sum=0.605 (2)", - "tab": "Efficiency", - "score": 0.30256526470184325 - }, - "High School Chemistry - Observed inference time (s)": { - "description": "min=0.293, mean=0.293, max=0.293, sum=0.585 (2)", - "tab": "Efficiency", - "score": 0.29262745321677824 - }, - "High School Computer Science - Observed inference time (s)": { - "description": "min=0.304, mean=0.304, max=0.304, sum=0.608 (2)", - "tab": "Efficiency", - "score": 0.3042095494270325 - }, - "High School European History - Observed inference time (s)": { - "description": "min=0.42, mean=0.42, max=0.42, sum=0.839 (2)", - "tab": "Efficiency", - "score": 0.4195035573207971 - }, - "High School Geography - Observed inference time (s)": { - "description": "min=0.303, mean=0.303, max=0.303, sum=0.605 (2)", - "tab": "Efficiency", - "score": 0.3027432386321251 - }, - "High School Government And Politics - Observed inference time (s)": { - "description": "min=0.294, mean=0.294, max=0.294, sum=0.589 (2)", - "tab": "Efficiency", - "score": 0.29444977903613156 - }, - "High School Macroeconomics - Observed inference time (s)": { - "description": "min=0.291, mean=0.291, max=0.291, sum=0.582 (2)", - "tab": "Efficiency", - "score": 0.2909054010342329 - }, - "High School Mathematics - Observed inference time (s)": { - "description": "min=0.293, mean=0.293, max=0.293, sum=0.585 (2)", - "tab": "Efficiency", - "score": 0.29262985565044264 - }, - "High School Microeconomics - Observed inference time (s)": { - "description": "min=0.304, mean=0.304, max=0.304, sum=0.608 (2)", - "tab": "Efficiency", - "score": 
0.3041165916859603 - }, - "High School Physics - Observed inference time (s)": { - "description": "min=0.301, mean=0.301, max=0.301, sum=0.603 (2)", - "tab": "Efficiency", - "score": 0.3013988425400083 - }, - "High School Psychology - Observed inference time (s)": { - "description": "min=0.309, mean=0.309, max=0.309, sum=0.618 (2)", - "tab": "Efficiency", - "score": 0.3090610066685108 - }, - "High School Statistics - Observed inference time (s)": { - "description": "min=0.318, mean=0.318, max=0.318, sum=0.635 (2)", - "tab": "Efficiency", - "score": 0.31764531577074967 - }, - "High School US History - Observed inference time (s)": { - "description": "min=0.364, mean=0.364, max=0.364, sum=0.727 (2)", - "tab": "Efficiency", - "score": 0.3635554044854407 - }, - "High School World History - Observed inference time (s)": { - "description": "min=0.323, mean=0.323, max=0.323, sum=0.646 (2)", - "tab": "Efficiency", - "score": 0.32297819073190165 - }, - "High School Biology - # eval": { - "description": "min=310, mean=310, max=310, sum=620 (2)", - "tab": "General information", - "score": 310.0 - }, - "High School Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Biology - # prompt tokens": { - "description": "min=506.916, mean=506.916, max=506.916, sum=1013.832 (2)", - "tab": "General information", - "score": 506.9161290322581 - }, - "High School Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Chemistry - # eval": { - "description": "min=203, mean=203, max=203, sum=406 (2)", - "tab": "General information", - "score": 203.0 - }, - "High School Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # prompt tokens": { - "description": "min=510.261, mean=510.261, max=510.261, sum=1020.522 (2)", - "tab": "General information", - "score": 510.2610837438424 - }, - "High School Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "High School Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # prompt tokens": { - "description": "min=871.46, mean=871.46, max=871.46, sum=1742.92 (2)", - "tab": "General information", - "score": 871.46 - }, - "High School Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School European History - # eval": { - "description": "min=165, mean=165, max=165, sum=330 (2)", - "tab": "General information", - "score": 165.0 - }, - "High School European History - # train": { - "description": "min=5, mean=5, 
max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School European History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School European History - # prompt tokens": { - "description": "min=2807.903, mean=2807.903, max=2807.903, sum=5615.806 (2)", - "tab": "General information", - "score": 2807.9030303030304 - }, - "High School European History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Geography - # eval": { - "description": "min=198, mean=198, max=198, sum=396 (2)", - "tab": "General information", - "score": 198.0 - }, - "High School Geography - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Geography - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Geography - # prompt tokens": { - "description": "min=365.217, mean=365.217, max=365.217, sum=730.434 (2)", - "tab": "General information", - "score": 365.2171717171717 - }, - "High School Geography - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Government And Politics - # eval": { - "description": "min=193, mean=193, max=193, sum=386 (2)", - "tab": "General information", - "score": 193.0 - }, - "High School Government And Politics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Government And Politics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # prompt tokens": { - "description": "min=460.311, mean=460.311, max=460.311, sum=920.622 (2)", - "tab": "General information", - "score": 460.31088082901556 - }, - "High School Government And Politics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Macroeconomics - # eval": { - "description": "min=390, mean=390, max=390, sum=780 (2)", - "tab": "General information", - "score": 390.0 - }, - "High School Macroeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Macroeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - # prompt tokens": { - "description": "min=367.349, mean=367.349, max=367.349, sum=734.697 (2)", - "tab": "General information", - "score": 367.34871794871793 - }, - "High School Macroeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Mathematics - # eval": { - "description": "min=270, mean=270, max=270, sum=540 (2)", - "tab": "General information", - "score": 270.0 - }, - "High School Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # prompt tokens": { - "description": "min=558.326, 
mean=558.326, max=558.326, sum=1116.652 (2)", - "tab": "General information", - "score": 558.325925925926 - }, - "High School Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Microeconomics - # eval": { - "description": "min=238, mean=238, max=238, sum=476 (2)", - "tab": "General information", - "score": 238.0 - }, - "High School Microeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Microeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Microeconomics - # prompt tokens": { - "description": "min=395.277, mean=395.277, max=395.277, sum=790.555 (2)", - "tab": "General information", - "score": 395.2773109243698 - }, - "High School Microeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Physics - # eval": { - "description": "min=151, mean=151, max=151, sum=302 (2)", - "tab": "General information", - "score": 151.0 - }, - "High School Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # prompt tokens": { - "description": "min=573.536, mean=573.536, max=573.536, sum=1147.073 (2)", - "tab": "General information", - "score": 573.5364238410596 - }, - "High School Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Psychology - # eval": { - "description": "min=545, mean=545, max=545, sum=1090 (2)", - "tab": "General information", - "score": 545.0 - }, - "High School Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # prompt tokens": { - "description": "min=488.521, mean=488.521, max=488.521, sum=977.042 (2)", - "tab": "General information", - "score": 488.52110091743117 - }, - "High School Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Statistics - # eval": { - "description": "min=216, mean=216, max=216, sum=432 (2)", - "tab": "General information", - "score": 216.0 - }, - "High School Statistics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Statistics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Statistics - # prompt tokens": { - "description": "min=823.477, mean=823.477, max=823.477, sum=1646.954 (2)", - "tab": "General information", - "score": 823.4768518518518 - }, - "High School Statistics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School US History - # eval": { - "description": "min=204, mean=204, max=204, sum=408 (2)", - "tab": "General information", - 
"score": 204.0 - }, - "High School US History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School US History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # prompt tokens": { - "description": "min=2230.176, mean=2230.176, max=2230.176, sum=4460.353 (2)", - "tab": "General information", - "score": 2230.176470588235 - }, - "High School US History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School World History - # eval": { - "description": "min=237, mean=237, max=237, sum=474 (2)", - "tab": "General information", - "score": 237.0 - }, - "High School World History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School World History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School World History - # prompt tokens": { - "description": "min=1441.354, mean=1441.354, max=1441.354, sum=2882.709 (2)", - "tab": "General information", - "score": 1441.3544303797469 - }, - "High School World History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" - } - } - }, - { - "evaluation_name": "Human Sexuality", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Human Sexuality", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.695, - "details": { - "description": "min=0.695, mean=0.695, max=0.695, sum=1.389 (2)", - "tab": "Accuracy", - "Human Aging - Observed inference time (s)": { - "description": "min=0.289, mean=0.289, max=0.289, sum=0.578 (2)", - "tab": "Efficiency", - "score": 0.28891397057092777 - }, - "Human Sexuality - Observed inference time (s)": { - "description": "min=0.298, mean=0.298, max=0.298, sum=0.596 (2)", - "tab": "Efficiency", - "score": 0.2980237170940137 - }, - "Human Aging - # eval": { - "description": "min=223, mean=223, max=223, sum=446 (2)", - "tab": "General information", - "score": 223.0 - }, - "Human Aging - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Aging - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Aging - # prompt tokens": { - "description": "min=315.121, mean=315.121, max=315.121, sum=630.242 (2)", - "tab": "General information", - "score": 315.1210762331838 - }, - "Human Aging - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Human Sexuality - # eval": { - "description": "min=131, mean=131, max=131, sum=262 (2)", - "tab": "General information", - "score": 131.0 - }, - "Human Sexuality - # train": { - "description": "min=5, mean=5, 
max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Sexuality - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # prompt tokens": { - "description": "min=334.504, mean=334.504, max=334.504, sum=669.008 (2)", - "tab": "General information", - "score": 334.5038167938931 - }, - "Human Sexuality - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" - } - } - }, - { - "evaluation_name": "International Law", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on International Law", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.76, - "details": { - "description": "min=0.76, mean=0.76, max=0.76, sum=1.521 (2)", - "tab": "Accuracy", - "International Law - Observed inference time (s)": { - "description": "min=0.299, mean=0.299, max=0.299, sum=0.599 (2)", - "tab": "Efficiency", - "score": 0.2993730572629566 - }, - "International Law - # eval": { - "description": "min=121, mean=121, max=121, sum=242 (2)", - "tab": "General information", - "score": 121.0 - }, - "International Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "International Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "International Law - # prompt tokens": { - "description": "min=633.579, mean=633.579, max=633.579, sum=1267.157 (2)", - "tab": "General information", - "score": 633.5785123966942 - }, - "International Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" - } - } - }, - { - "evaluation_name": "Logical Fallacies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Logical Fallacies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.706, - "details": { - "description": "min=0.706, mean=0.706, max=0.706, sum=1.411 (2)", - "tab": "Accuracy", - "Logical Fallacies - Observed inference time (s)": { - "description": "min=0.283, mean=0.283, max=0.283, sum=0.566 (2)", - "tab": "Efficiency", - "score": 0.28320794456575543 - }, - "Logical Fallacies - # eval": { - "description": "min=163, mean=163, max=163, sum=326 (2)", - "tab": "General information", - "score": 163.0 - }, - "Logical Fallacies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Logical Fallacies - truncated": { - 
"description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Logical Fallacies - # prompt tokens": { - "description": "min=442.632, mean=442.632, max=442.632, sum=885.264 (2)", - "tab": "General information", - "score": 442.6319018404908 - }, - "Logical Fallacies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" - } - } - }, - { - "evaluation_name": "Machine Learning", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Machine Learning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.411, - "details": { - "description": "min=0.411, mean=0.411, max=0.411, sum=0.821 (2)", - "tab": "Accuracy", - "Machine Learning - Observed inference time (s)": { - "description": "min=0.292, mean=0.292, max=0.292, sum=0.583 (2)", - "tab": "Efficiency", - "score": 0.2917012700012752 - }, - "Machine Learning - # eval": { - "description": "min=112, mean=112, max=112, sum=224 (2)", - "tab": "General information", - "score": 112.0 - }, - "Machine Learning - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Machine Learning - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Machine Learning - # prompt tokens": { - "description": "min=674.848, mean=674.848, max=674.848, sum=1349.696 (2)", - "tab": "General information", - "score": 674.8482142857143 - }, - "Machine Learning - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" - } - } - }, - { - "evaluation_name": "Management", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Management", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.816, - "details": { - "description": "min=0.816, mean=0.816, max=0.816, sum=1.631 (2)", - "tab": "Accuracy", - "Management - Observed inference time (s)": { - "description": "min=0.304, mean=0.304, max=0.304, sum=0.607 (2)", - "tab": "Efficiency", - "score": 0.3037459641984365 - }, - "Management - # eval": { - "description": "min=103, mean=103, max=103, sum=206 (2)", - "tab": "General information", - "score": 103.0 - }, - "Management - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Management - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Management - # prompt tokens": { - "description": 
"min=276.854, mean=276.854, max=276.854, sum=553.709 (2)", - "tab": "General information", - "score": 276.8543689320388 - }, - "Management - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" - } - } - }, - { - "evaluation_name": "Marketing", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Marketing", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.863, - "details": { - "description": "min=0.863, mean=0.863, max=0.863, sum=1.726 (2)", - "tab": "Accuracy", - "Marketing - Observed inference time (s)": { - "description": "min=0.304, mean=0.304, max=0.304, sum=0.608 (2)", - "tab": "Efficiency", - "score": 0.30402050364730704 - }, - "Marketing - # eval": { - "description": "min=234, mean=234, max=234, sum=468 (2)", - "tab": "General information", - "score": 234.0 - }, - "Marketing - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Marketing - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Marketing - # prompt tokens": { - "description": "min=397.415, mean=397.415, max=397.415, sum=794.829 (2)", - "tab": "General information", - "score": 397.4145299145299 - }, - "Marketing - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" - } - } - }, - { - "evaluation_name": "Medical Genetics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Medical Genetics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.69, - "details": { - "description": "min=0.69, mean=0.69, max=0.69, sum=1.38 (2)", - "tab": "Accuracy", - "Medical Genetics - Observed inference time (s)": { - "description": "min=0.308, mean=0.308, max=0.308, sum=0.616 (2)", - "tab": "Efficiency", - "score": 0.3079418969154358 - }, - "Medical Genetics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Medical Genetics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Medical Genetics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Medical Genetics - # prompt tokens": { - "description": "min=335.35, mean=335.35, max=335.35, sum=670.7 (2)", - "tab": "General information", - "score": 335.35 - }, - "Medical Genetics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": 
"General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" - } - } - }, - { - "evaluation_name": "Miscellaneous", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Miscellaneous", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.765, - "details": { - "description": "min=0.765, mean=0.765, max=0.765, sum=1.53 (2)", - "tab": "Accuracy", - "Miscellaneous - Observed inference time (s)": { - "description": "min=0.287, mean=0.287, max=0.287, sum=0.575 (2)", - "tab": "Efficiency", - "score": 0.2874623727372171 - }, - "Miscellaneous - # eval": { - "description": "min=783, mean=783, max=783, sum=1566 (2)", - "tab": "General information", - "score": 783.0 - }, - "Miscellaneous - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Miscellaneous - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Miscellaneous - # prompt tokens": { - "description": "min=296.7, mean=296.7, max=296.7, sum=593.4 (2)", - "tab": "General information", - "score": 296.6998722860792 - }, - "Miscellaneous - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" - } - } - }, - { - "evaluation_name": "Moral Scenarios", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Moral Scenarios", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.372, - "details": { - "description": "min=0.372, mean=0.372, max=0.372, sum=0.744 (2)", - "tab": "Accuracy", - "Moral Disputes - Observed inference time (s)": { - "description": "min=0.294, mean=0.294, max=0.294, sum=0.587 (2)", - "tab": "Efficiency", - "score": 0.29359787530292664 - }, - "Moral Scenarios - Observed inference time (s)": { - "description": "min=0.298, mean=0.298, max=0.298, sum=0.596 (2)", - "tab": "Efficiency", - "score": 0.2979323072806417 - }, - "Moral Disputes - # eval": { - "description": "min=346, mean=346, max=346, sum=692 (2)", - "tab": "General information", - "score": 346.0 - }, - "Moral Disputes - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Disputes - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Disputes - # prompt tokens": { - "description": "min=469.182, mean=469.182, max=469.182, sum=938.364 (2)", - "tab": "General information", - "score": 469.1820809248555 - }, - "Moral Disputes - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - 
"tab": "General information", - "score": 1.0 - }, - "Moral Scenarios - # eval": { - "description": "min=895, mean=895, max=895, sum=1790 (2)", - "tab": "General information", - "score": 895.0 - }, - "Moral Scenarios - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Scenarios - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # prompt tokens": { - "description": "min=661.494, mean=661.494, max=661.494, sum=1322.988 (2)", - "tab": "General information", - "score": 661.4938547486033 - }, - "Moral Scenarios - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" - } - } - }, - { - "evaluation_name": "Nutrition", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Nutrition", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.696, - "details": { - "description": "min=0.696, mean=0.696, max=0.696, sum=1.392 (2)", - "tab": "Accuracy", - "Nutrition - Observed inference time (s)": { - "description": "min=0.293, mean=0.293, max=0.293, sum=0.586 (2)", - "tab": "Efficiency", - "score": 0.29277056572484034 - }, - "Nutrition - # eval": { - "description": "min=306, mean=306, max=306, sum=612 (2)", - "tab": "General information", - "score": 306.0 - }, - "Nutrition - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Nutrition - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Nutrition - # prompt tokens": { - "description": "min=592.637, mean=592.637, max=592.637, sum=1185.275 (2)", - "tab": "General information", - "score": 592.6372549019608 - }, - "Nutrition - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" - } - } - }, - { - "evaluation_name": "Prehistory", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Prehistory", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.688, - "details": { - "description": "min=0.688, mean=0.688, max=0.688, sum=1.377 (2)", - "tab": "Accuracy", - "Prehistory - Observed inference time (s)": { - "description": "min=0.301, mean=0.301, max=0.301, sum=0.602 (2)", - "tab": "Efficiency", - "score": 0.30120949097621585 - }, - "Prehistory - # eval": { - "description": "min=324, mean=324, max=324, sum=648 (2)", - "tab": "General information", - "score": 324.0 - }, - "Prehistory - # train": 
{ - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Prehistory - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Prehistory - # prompt tokens": { - "description": "min=521.364, mean=521.364, max=521.364, sum=1042.728 (2)", - "tab": "General information", - "score": 521.3641975308642 - }, - "Prehistory - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" - } - } - }, - { - "evaluation_name": "Public Relations", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Public Relations", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.627, - "details": { - "description": "min=0.627, mean=0.627, max=0.627, sum=1.255 (2)", - "tab": "Accuracy", - "Public Relations - Observed inference time (s)": { - "description": "min=0.308, mean=0.308, max=0.308, sum=0.616 (2)", - "tab": "Efficiency", - "score": 0.30815364880995316 - }, - "Public Relations - # eval": { - "description": "min=110, mean=110, max=110, sum=220 (2)", - "tab": "General information", - "score": 110.0 - }, - "Public Relations - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Public Relations - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Public Relations - # prompt tokens": { - "description": "min=401.427, mean=401.427, max=401.427, sum=802.855 (2)", - "tab": "General information", - "score": 401.42727272727274 - }, - "Public Relations - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" - } - } - }, - { - "evaluation_name": "Security Studies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Security Studies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.727, - "details": { - "description": "min=0.727, mean=0.727, max=0.727, sum=1.453 (2)", - "tab": "Accuracy", - "Security Studies - Observed inference time (s)": { - "description": "min=0.296, mean=0.296, max=0.296, sum=0.592 (2)", - "tab": "Efficiency", - "score": 0.2958566675380785 - }, - "Security Studies - # eval": { - "description": "min=245, mean=245, max=245, sum=490 (2)", - "tab": "General information", - "score": 245.0 - }, - "Security Studies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Security Studies - truncated": { - 
"description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Security Studies - # prompt tokens": { - "description": "min=1159.931, mean=1159.931, max=1159.931, sum=2319.861 (2)", - "tab": "General information", - "score": 1159.930612244898 - }, - "Security Studies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" - } - } - }, - { - "evaluation_name": "Sociology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Sociology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.836, - "details": { - "description": "min=0.836, mean=0.836, max=0.836, sum=1.672 (2)", - "tab": "Accuracy", - "Sociology - Observed inference time (s)": { - "description": "min=0.299, mean=0.299, max=0.299, sum=0.598 (2)", - "tab": "Efficiency", - "score": 0.29908941278410195 - }, - "Sociology - # eval": { - "description": "min=201, mean=201, max=201, sum=402 (2)", - "tab": "General information", - "score": 201.0 - }, - "Sociology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Sociology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Sociology - # prompt tokens": { - "description": "min=443.1, mean=443.1, max=443.1, sum=886.199 (2)", - "tab": "General information", - "score": 443.0995024875622 - }, - "Sociology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" - } - } - }, - { - "evaluation_name": "Virology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Virology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.488, - "details": { - "description": "min=0.488, mean=0.488, max=0.488, sum=0.976 (2)", - "tab": "Accuracy", - "Virology - Observed inference time (s)": { - "description": "min=0.286, mean=0.286, max=0.286, sum=0.572 (2)", - "tab": "Efficiency", - "score": 0.2861345144639532 - }, - "Virology - # eval": { - "description": "min=166, mean=166, max=166, sum=332 (2)", - "tab": "General information", - "score": 166.0 - }, - "Virology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Virology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Virology - # prompt tokens": { - "description": "min=336.819, mean=336.819, max=336.819, sum=673.639 (2)", - "tab": "General information", - 
"score": 336.8192771084337 - }, - "Virology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" - } - } - }, - { - "evaluation_name": "World Religions", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on World Religions", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.778, - "details": { - "description": "min=0.778, mean=0.778, max=0.778, sum=1.556 (2)", - "tab": "Accuracy", - "World Religions - Observed inference time (s)": { - "description": "min=0.315, mean=0.315, max=0.315, sum=0.63 (2)", - "tab": "Efficiency", - "score": 0.3150970712739822 - }, - "World Religions - # eval": { - "description": "min=171, mean=171, max=171, sum=342 (2)", - "tab": "General information", - "score": 171.0 - }, - "World Religions - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "World Religions - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "World Religions - # prompt tokens": { - "description": "min=269.07, mean=269.07, max=269.07, sum=538.14 (2)", - "tab": "General information", - "score": 269.0701754385965 - }, - "World Religions - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" - } - } - }, - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.843, - "details": { - "tab": "Efficiency" - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_mmlu/qwen/qwen2-72b-instruct/80aabdf4-60b7-493b-98d8-1854f1c41c10.json b/data/helm_mmlu/qwen/qwen2-72b-instruct/80aabdf4-60b7-493b-98d8-1854f1c41c10.json deleted file mode 100644 index 6f8b955e0..000000000 --- a/data/helm_mmlu/qwen/qwen2-72b-instruct/80aabdf4-60b7-493b-98d8-1854f1c41c10.json +++ /dev/null @@ -1,3021 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/qwen_qwen2-72b-instruct/1770835937.459157", - "retrieved_timestamp": "1770835937.459157", - "source_metadata": { - "source_name": "helm_mmlu", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2 Instruct 72B", - "id": "qwen/qwen2-72b-instruct", - "developer": "qwen", - "inference_platform": "unknown" - }, - "evaluation_results": [ 
- { - "evaluation_name": "MMLU All Subjects", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU All Subjects", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.824, - "details": { - "description": "min=0.52, mean=0.824, max=0.979, sum=93.879 (114)", - "tab": "Accuracy", - "MMLU All Subjects - Observed inference time (s)": { - "description": "min=0.195, mean=0.359, max=2.502, sum=40.898 (114)", - "tab": "Efficiency", - "score": 0.3587521754503106 - }, - "MMLU All Subjects - # eval": { - "description": "min=100, mean=246.351, max=1534, sum=28084 (114)", - "tab": "General information", - "score": 246.35087719298247 - }, - "MMLU All Subjects - # train": { - "description": "min=5, mean=5, max=5, sum=570 (114)", - "tab": "General information", - "score": 5.0 - }, - "MMLU All Subjects - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (114)", - "tab": "General information", - "score": 0.0 - }, - "MMLU All Subjects - # prompt tokens": { - "description": "min=276.07, mean=625.598, max=2814.903, sum=71318.198 (114)", - "tab": "General information", - "score": 625.5982315160392 - }, - "MMLU All Subjects - # output tokens": { - "description": "min=1, mean=1, max=1, sum=114 (114)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - 
"mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] - } - } - }, - { - "evaluation_name": "Abstract Algebra", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Abstract Algebra", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.67, - "details": { - "description": "min=0.67, mean=0.67, max=0.67, sum=1.34 (2)", - "tab": "Accuracy", - "Abstract Algebra - Observed inference time (s)": { - "description": "min=0.395, mean=0.395, max=0.395, sum=0.79 (2)", - "tab": "Efficiency", - "score": 0.3948828268051148 - }, - "Abstract Algebra - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Abstract Algebra - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Abstract Algebra - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Abstract Algebra - # prompt tokens": { - "description": "min=378.19, mean=378.19, max=378.19, sum=756.38 (2)", - "tab": "General information", - "score": 378.19 - }, - "Abstract Algebra - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" - } - } - }, - { - "evaluation_name": "Anatomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Anatomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.793, - "details": { - "description": "min=0.793, mean=0.793, max=0.793, sum=1.585 (2)", - "tab": "Accuracy", - "Anatomy - Observed inference time (s)": { - "description": "min=0.266, mean=0.266, max=0.266, sum=0.531 (2)", - "tab": "Efficiency", - "score": 0.2657013893127441 - }, - "Anatomy - # eval": { 
- "description": "min=135, mean=135, max=135, sum=270 (2)", - "tab": "General information", - "score": 135.0 - }, - "Anatomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Anatomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Anatomy - # prompt tokens": { - "description": "min=353.978, mean=353.978, max=353.978, sum=707.956 (2)", - "tab": "General information", - "score": 353.97777777777776 - }, - "Anatomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" - } - } - }, - { - "evaluation_name": "College Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on College Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.598, - "details": { - "description": "min=0.598, mean=0.598, max=0.598, sum=1.196 (2)", - "tab": "Accuracy", - "College Chemistry - Observed inference time (s)": { - "description": "min=0.249, mean=0.249, max=0.249, sum=0.498 (2)", - "tab": "Efficiency", - "score": 0.24894725322723388 - }, - "College Biology - Observed inference time (s)": { - "description": "min=0.298, mean=0.298, max=0.298, sum=0.596 (2)", - "tab": "Efficiency", - "score": 0.2977961285246743 - }, - "College Computer Science - Observed inference time (s)": { - "description": "min=0.321, mean=0.321, max=0.321, sum=0.642 (2)", - "tab": "Efficiency", - "score": 0.3207618069648743 - }, - "College Mathematics - Observed inference time (s)": { - "description": "min=0.334, mean=0.334, max=0.334, sum=0.667 (2)", - "tab": "Efficiency", - "score": 0.3337481117248535 - }, - "College Medicine - Observed inference time (s)": { - "description": "min=0.234, mean=0.234, max=0.234, sum=0.468 (2)", - "tab": "Efficiency", - "score": 0.2340707227673834 - }, - "College Physics - Observed inference time (s)": { - "description": "min=0.25, mean=0.25, max=0.25, sum=0.5 (2)", - "tab": "Efficiency", - "score": 0.25010308097390566 - }, - "College Chemistry - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Chemistry - # prompt tokens": { - "description": "min=568.25, mean=568.25, max=568.25, sum=1136.5 (2)", - "tab": "General information", - "score": 568.25 - }, - "College Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Biology - # eval": { - "description": "min=144, mean=144, max=144, sum=288 (2)", - "tab": "General information", - "score": 144.0 - }, - "College Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 
5.0 - }, - "College Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # prompt tokens": { - "description": "min=486.979, mean=486.979, max=486.979, sum=973.958 (2)", - "tab": "General information", - "score": 486.9791666666667 - }, - "College Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer Science - # prompt tokens": { - "description": "min=838.58, mean=838.58, max=838.58, sum=1677.16 (2)", - "tab": "General information", - "score": 838.58 - }, - "College Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Mathematics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # prompt tokens": { - "description": "min=607.7, mean=607.7, max=607.7, sum=1215.4 (2)", - "tab": "General information", - "score": 607.7 - }, - "College Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Medicine - # eval": { - "description": "min=173, mean=173, max=173, sum=346 (2)", - "tab": "General information", - "score": 173.0 - }, - "College Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Medicine - # prompt tokens": { - "description": "min=506.098, mean=506.098, max=506.098, sum=1012.197 (2)", - "tab": "General information", - "score": 506.0982658959538 - }, - "College Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Physics - # eval": { - "description": "min=102, mean=102, max=102, sum=204 (2)", - "tab": "General information", - "score": 102.0 - }, - "College Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Physics - # prompt tokens": { - "description": "min=516.265, mean=516.265, max=516.265, sum=1032.529 (2)", - "tab": "General information", - "score": 516.2647058823529 - }, - "College Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - 
"additional_details": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" - } - } - }, - { - "evaluation_name": "Computer Security", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Computer Security", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.85, - "details": { - "description": "min=0.85, mean=0.85, max=0.85, sum=1.7 (2)", - "tab": "Accuracy", - "Computer Security - Observed inference time (s)": { - "description": "min=0.281, mean=0.281, max=0.281, sum=0.563 (2)", - "tab": "Efficiency", - "score": 0.2812828135490417 - }, - "Computer Security - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Computer Security - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Computer Security - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Computer Security - # prompt tokens": { - "description": "min=386.64, mean=386.64, max=386.64, sum=773.28 (2)", - "tab": "General information", - "score": 386.64 - }, - "Computer Security - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" - } - } - }, - { - "evaluation_name": "Econometrics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Econometrics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.737, - "details": { - "description": "min=0.737, mean=0.737, max=0.737, sum=1.474 (2)", - "tab": "Accuracy", - "Econometrics - Observed inference time (s)": { - "description": "min=0.265, mean=0.265, max=0.265, sum=0.53 (2)", - "tab": "Efficiency", - "score": 0.26492034552390115 - }, - "Econometrics - # eval": { - "description": "min=114, mean=114, max=114, sum=228 (2)", - "tab": "General information", - "score": 114.0 - }, - "Econometrics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Econometrics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Econometrics - # prompt tokens": { - "description": "min=627.939, mean=627.939, max=627.939, sum=1255.877 (2)", - "tab": "General information", - "score": 627.938596491228 - }, - "Econometrics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" - } - } - }, - { - 
"evaluation_name": "Global Facts", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Global Facts", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.58, - "details": { - "description": "min=0.58, mean=0.58, max=0.58, sum=1.16 (2)", - "tab": "Accuracy", - "Global Facts - Observed inference time (s)": { - "description": "min=0.254, mean=0.254, max=0.254, sum=0.507 (2)", - "tab": "Efficiency", - "score": 0.25351563215255735 - }, - "Global Facts - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Global Facts - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Global Facts - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Global Facts - # prompt tokens": { - "description": "min=429.06, mean=429.06, max=429.06, sum=858.12 (2)", - "tab": "General information", - "score": 429.06 - }, - "Global Facts - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" - } - } - }, - { - "evaluation_name": "Jurisprudence", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Jurisprudence", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.87, - "details": { - "description": "min=0.87, mean=0.87, max=0.87, sum=1.741 (2)", - "tab": "Accuracy", - "Jurisprudence - Observed inference time (s)": { - "description": "min=0.257, mean=0.257, max=0.257, sum=0.513 (2)", - "tab": "Efficiency", - "score": 0.256509714656406 - }, - "Jurisprudence - # eval": { - "description": "min=108, mean=108, max=108, sum=216 (2)", - "tab": "General information", - "score": 108.0 - }, - "Jurisprudence - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Jurisprudence - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Jurisprudence - # prompt tokens": { - "description": "min=394.713, mean=394.713, max=394.713, sum=789.426 (2)", - "tab": "General information", - "score": 394.712962962963 - }, - "Jurisprudence - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" - } - } - }, - { - "evaluation_name": "Philosophy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Philosophy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.859, - "details": { - "description": "min=0.859, mean=0.859, max=0.859, sum=1.717 (2)", - "tab": "Accuracy", - "Philosophy - Observed inference time (s)": { - "description": "min=0.204, mean=0.204, max=0.204, sum=0.409 (2)", - "tab": "Efficiency", - "score": 0.20427469348600824 - }, - "Philosophy - # eval": { - "description": "min=311, mean=311, max=311, sum=622 (2)", - "tab": "General information", - "score": 311.0 - }, - "Philosophy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Philosophy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Philosophy - # prompt tokens": { - "description": "min=329.09, mean=329.09, max=329.09, sum=658.18 (2)", - "tab": "General information", - "score": 329.09003215434086 - }, - "Philosophy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" - } - } - }, - { - "evaluation_name": "Professional Psychology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Professional Psychology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.886, - "details": { - "description": "min=0.886, mean=0.886, max=0.886, sum=1.771 (2)", - "tab": "Accuracy", - "Professional Medicine - Observed inference time (s)": { - "description": "min=0.507, mean=0.507, max=0.507, sum=1.014 (2)", - "tab": "Efficiency", - "score": 0.5070785135030746 - }, - "Professional Accounting - Observed inference time (s)": { - "description": "min=0.31, mean=0.31, max=0.31, sum=0.621 (2)", - "tab": "Efficiency", - "score": 0.31040529579135545 - }, - "Professional Law - Observed inference time (s)": { - "description": "min=0.407, mean=0.407, max=0.407, sum=0.814 (2)", - "tab": "Efficiency", - "score": 0.40680916352875074 - }, - "Professional Psychology - Observed inference time (s)": { - "description": "min=0.324, mean=0.324, max=0.324, sum=0.647 (2)", - "tab": "Efficiency", - "score": 0.32369842482548133 - }, - "Professional Medicine - # eval": { - "description": "min=272, mean=272, max=272, sum=544 (2)", - "tab": "General information", - "score": 272.0 - }, - "Professional Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Medicine - # prompt tokens": { - "description": "min=1125.199, mean=1125.199, max=1125.199, sum=2250.397 (2)", - "tab": "General information", - "score": 1125.1985294117646 - }, - "Professional Medicine - # output tokens": { - "description": 
"min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Accounting - # eval": { - "description": "min=282, mean=282, max=282, sum=564 (2)", - "tab": "General information", - "score": 282.0 - }, - "Professional Accounting - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Accounting - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Accounting - # prompt tokens": { - "description": "min=739.34, mean=739.34, max=739.34, sum=1478.681 (2)", - "tab": "General information", - "score": 739.3404255319149 - }, - "Professional Accounting - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Law - # eval": { - "description": "min=1534, mean=1534, max=1534, sum=3068 (2)", - "tab": "General information", - "score": 1534.0 - }, - "Professional Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # prompt tokens": { - "description": "min=1663.969, mean=1663.969, max=1663.969, sum=3327.939 (2)", - "tab": "General information", - "score": 1663.9693611473272 - }, - "Professional Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Psychology - # eval": { - "description": "min=612, mean=612, max=612, sum=1224 (2)", - "tab": "General information", - "score": 612.0 - }, - "Professional Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # prompt tokens": { - "description": "min=581.417, mean=581.417, max=581.417, sum=1162.833 (2)", - "tab": "General information", - "score": 581.4166666666666 - }, - "Professional Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" - } - } - }, - { - "evaluation_name": "Us Foreign Policy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Us Foreign Policy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.94, - "details": { - "description": "min=0.94, mean=0.94, max=0.94, sum=1.88 (2)", - "tab": "Accuracy", - "Us Foreign Policy - Observed inference time (s)": { - "description": "min=0.195, mean=0.195, max=0.195, sum=0.389 (2)", - "tab": "Efficiency", - "score": 0.19451653003692626 - }, - "Us Foreign Policy - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, 
- "Us Foreign Policy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Us Foreign Policy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Us Foreign Policy - # prompt tokens": { - "description": "min=428.16, mean=428.16, max=428.16, sum=856.32 (2)", - "tab": "General information", - "score": 428.16 - }, - "Us Foreign Policy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" - } - } - }, - { - "evaluation_name": "Astronomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Astronomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.934, - "details": { - "description": "min=0.934, mean=0.934, max=0.934, sum=1.868 (2)", - "tab": "Accuracy", - "Astronomy - Observed inference time (s)": { - "description": "min=0.32, mean=0.32, max=0.32, sum=0.641 (2)", - "tab": "Efficiency", - "score": 0.32045089571099533 - }, - "Astronomy - # eval": { - "description": "min=152, mean=152, max=152, sum=304 (2)", - "tab": "General information", - "score": 152.0 - }, - "Astronomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Astronomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Astronomy - # prompt tokens": { - "description": "min=589.849, mean=589.849, max=589.849, sum=1179.697 (2)", - "tab": "General information", - "score": 589.8486842105264 - }, - "Astronomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" - } - } - }, - { - "evaluation_name": "Business Ethics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Business Ethics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.82, - "details": { - "description": "min=0.82, mean=0.82, max=0.82, sum=1.64 (2)", - "tab": "Accuracy", - "Business Ethics - Observed inference time (s)": { - "description": "min=0.351, mean=0.351, max=0.351, sum=0.701 (2)", - "tab": "Efficiency", - "score": 0.350736882686615 - }, - "Business Ethics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Business Ethics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Business Ethics - truncated": { - "description": "min=0, mean=0, max=0, 
sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Business Ethics - # prompt tokens": { - "description": "min=569.87, mean=569.87, max=569.87, sum=1139.74 (2)", - "tab": "General information", - "score": 569.87 - }, - "Business Ethics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" - } - } - }, - { - "evaluation_name": "Clinical Knowledge", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Clinical Knowledge", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.868, - "details": { - "description": "min=0.868, mean=0.868, max=0.868, sum=1.736 (2)", - "tab": "Accuracy", - "Clinical Knowledge - Observed inference time (s)": { - "description": "min=0.26, mean=0.26, max=0.26, sum=0.52 (2)", - "tab": "Efficiency", - "score": 0.2597639983555056 - }, - "Clinical Knowledge - # eval": { - "description": "min=265, mean=265, max=265, sum=530 (2)", - "tab": "General information", - "score": 265.0 - }, - "Clinical Knowledge - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Clinical Knowledge - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Clinical Knowledge - # prompt tokens": { - "description": "min=400.623, mean=400.623, max=400.623, sum=801.245 (2)", - "tab": "General information", - "score": 400.62264150943395 - }, - "Clinical Knowledge - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" - } - } - }, - { - "evaluation_name": "Conceptual Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Conceptual Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.872, - "details": { - "description": "min=0.872, mean=0.872, max=0.872, sum=1.745 (2)", - "tab": "Accuracy", - "Conceptual Physics - Observed inference time (s)": { - "description": "min=0.242, mean=0.242, max=0.242, sum=0.484 (2)", - "tab": "Efficiency", - "score": 0.2420806296328281 - }, - "Conceptual Physics - # eval": { - "description": "min=235, mean=235, max=235, sum=470 (2)", - "tab": "General information", - "score": 235.0 - }, - "Conceptual Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Conceptual Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Conceptual Physics - # prompt tokens": { - 
"description": "min=305.494, mean=305.494, max=305.494, sum=610.987 (2)", - "tab": "General information", - "score": 305.4936170212766 - }, - "Conceptual Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" - } - } - }, - { - "evaluation_name": "Electrical Engineering", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Electrical Engineering", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.793, - "details": { - "description": "min=0.793, mean=0.793, max=0.793, sum=1.586 (2)", - "tab": "Accuracy", - "Electrical Engineering - Observed inference time (s)": { - "description": "min=0.235, mean=0.235, max=0.235, sum=0.47 (2)", - "tab": "Efficiency", - "score": 0.23504354542699354 - }, - "Electrical Engineering - # eval": { - "description": "min=145, mean=145, max=145, sum=290 (2)", - "tab": "General information", - "score": 145.0 - }, - "Electrical Engineering - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Electrical Engineering - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Electrical Engineering - # prompt tokens": { - "description": "min=463.8, mean=463.8, max=463.8, sum=927.6 (2)", - "tab": "General information", - "score": 463.8 - }, - "Electrical Engineering - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" - } - } - }, - { - "evaluation_name": "Elementary Mathematics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Elementary Mathematics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.825, - "details": { - "description": "min=0.825, mean=0.825, max=0.825, sum=1.651 (2)", - "tab": "Accuracy", - "Elementary Mathematics - Observed inference time (s)": { - "description": "min=0.261, mean=0.261, max=0.261, sum=0.523 (2)", - "tab": "Efficiency", - "score": 0.2613614286695208 - }, - "Elementary Mathematics - # eval": { - "description": "min=378, mean=378, max=378, sum=756 (2)", - "tab": "General information", - "score": 378.0 - }, - "Elementary Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Elementary Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Elementary Mathematics - # prompt tokens": { - "description": "min=577.119, 
mean=577.119, max=577.119, sum=1154.238 (2)", - "tab": "General information", - "score": 577.1190476190476 - }, - "Elementary Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" - } - } - }, - { - "evaluation_name": "Formal Logic", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Formal Logic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.667, - "details": { - "description": "min=0.667, mean=0.667, max=0.667, sum=1.333 (2)", - "tab": "Accuracy", - "Formal Logic - Observed inference time (s)": { - "description": "min=0.333, mean=0.333, max=0.333, sum=0.666 (2)", - "tab": "Efficiency", - "score": 0.3330562947288392 - }, - "Formal Logic - # eval": { - "description": "min=126, mean=126, max=126, sum=252 (2)", - "tab": "General information", - "score": 126.0 - }, - "Formal Logic - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Formal Logic - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Formal Logic - # prompt tokens": { - "description": "min=604.667, mean=604.667, max=604.667, sum=1209.333 (2)", - "tab": "General information", - "score": 604.6666666666666 - }, - "Formal Logic - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" - } - } - }, - { - "evaluation_name": "High School World History", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on High School World History", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.932, - "details": { - "description": "min=0.932, mean=0.932, max=0.932, sum=1.865 (2)", - "tab": "Accuracy", - "High School Biology - Observed inference time (s)": { - "description": "min=0.247, mean=0.247, max=0.247, sum=0.495 (2)", - "tab": "Efficiency", - "score": 0.24744614170443627 - }, - "High School Chemistry - Observed inference time (s)": { - "description": "min=0.301, mean=0.301, max=0.301, sum=0.602 (2)", - "tab": "Efficiency", - "score": 0.3010592906933113 - }, - "High School Computer Science - Observed inference time (s)": { - "description": "min=0.29, mean=0.29, max=0.29, sum=0.581 (2)", - "tab": "Efficiency", - "score": 0.2903395962715149 - }, - "High School European History - Observed inference time (s)": { - "description": "min=0.629, mean=0.629, max=0.629, sum=1.258 (2)", - "tab": "Efficiency", - "score": 0.6291334065524015 - }, - "High School Geography - Observed inference time (s)": 
{ - "description": "min=0.457, mean=0.457, max=0.457, sum=0.913 (2)", - "tab": "Efficiency", - "score": 0.4567244630871397 - }, - "High School Government And Politics - Observed inference time (s)": { - "description": "min=0.249, mean=0.249, max=0.249, sum=0.498 (2)", - "tab": "Efficiency", - "score": 0.24882311524504824 - }, - "High School Macroeconomics - Observed inference time (s)": { - "description": "min=0.245, mean=0.245, max=0.245, sum=0.489 (2)", - "tab": "Efficiency", - "score": 0.24466082010513696 - }, - "High School Mathematics - Observed inference time (s)": { - "description": "min=0.257, mean=0.257, max=0.257, sum=0.514 (2)", - "tab": "Efficiency", - "score": 0.2570408988881994 - }, - "High School Microeconomics - Observed inference time (s)": { - "description": "min=0.27, mean=0.27, max=0.27, sum=0.539 (2)", - "tab": "Efficiency", - "score": 0.26973113893460826 - }, - "High School Physics - Observed inference time (s)": { - "description": "min=0.285, mean=0.285, max=0.285, sum=0.57 (2)", - "tab": "Efficiency", - "score": 0.2847776444542487 - }, - "High School Psychology - Observed inference time (s)": { - "description": "min=0.32, mean=0.32, max=0.32, sum=0.641 (2)", - "tab": "Efficiency", - "score": 0.32032192956416977 - }, - "High School Statistics - Observed inference time (s)": { - "description": "min=0.357, mean=0.357, max=0.357, sum=0.714 (2)", - "tab": "Efficiency", - "score": 0.3567825931089896 - }, - "High School US History - Observed inference time (s)": { - "description": "min=2.502, mean=2.502, max=2.502, sum=5.003 (2)", - "tab": "Efficiency", - "score": 2.501642145362555 - }, - "High School World History - Observed inference time (s)": { - "description": "min=2.182, mean=2.182, max=2.182, sum=4.364 (2)", - "tab": "Efficiency", - "score": 2.18210094890514 - }, - "High School Biology - # eval": { - "description": "min=310, mean=310, max=310, sum=620 (2)", - "tab": "General information", - "score": 310.0 - }, - "High School Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Biology - # prompt tokens": { - "description": "min=513.916, mean=513.916, max=513.916, sum=1027.832 (2)", - "tab": "General information", - "score": 513.916129032258 - }, - "High School Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Chemistry - # eval": { - "description": "min=203, mean=203, max=203, sum=406 (2)", - "tab": "General information", - "score": 203.0 - }, - "High School Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # prompt tokens": { - "description": "min=517.261, mean=517.261, max=517.261, sum=1034.522 (2)", - "tab": "General information", - "score": 517.2610837438424 - }, - "High School Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "High School Computer 
Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # prompt tokens": { - "description": "min=878.46, mean=878.46, max=878.46, sum=1756.92 (2)", - "tab": "General information", - "score": 878.46 - }, - "High School Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School European History - # eval": { - "description": "min=165, mean=165, max=165, sum=330 (2)", - "tab": "General information", - "score": 165.0 - }, - "High School European History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School European History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School European History - # prompt tokens": { - "description": "min=2814.903, mean=2814.903, max=2814.903, sum=5629.806 (2)", - "tab": "General information", - "score": 2814.9030303030304 - }, - "High School European History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Geography - # eval": { - "description": "min=198, mean=198, max=198, sum=396 (2)", - "tab": "General information", - "score": 198.0 - }, - "High School Geography - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Geography - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Geography - # prompt tokens": { - "description": "min=372.217, mean=372.217, max=372.217, sum=744.434 (2)", - "tab": "General information", - "score": 372.2171717171717 - }, - "High School Geography - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Government And Politics - # eval": { - "description": "min=193, mean=193, max=193, sum=386 (2)", - "tab": "General information", - "score": 193.0 - }, - "High School Government And Politics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Government And Politics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # prompt tokens": { - "description": "min=467.311, mean=467.311, max=467.311, sum=934.622 (2)", - "tab": "General information", - "score": 467.31088082901556 - }, - "High School Government And Politics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Macroeconomics - # eval": { - "description": "min=390, mean=390, max=390, sum=780 (2)", - "tab": "General information", - "score": 390.0 - }, - "High School Macroeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Macroeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School 
Macroeconomics - # prompt tokens": { - "description": "min=374.349, mean=374.349, max=374.349, sum=748.697 (2)", - "tab": "General information", - "score": 374.34871794871793 - }, - "High School Macroeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Mathematics - # eval": { - "description": "min=270, mean=270, max=270, sum=540 (2)", - "tab": "General information", - "score": 270.0 - }, - "High School Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # prompt tokens": { - "description": "min=565.326, mean=565.326, max=565.326, sum=1130.652 (2)", - "tab": "General information", - "score": 565.325925925926 - }, - "High School Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Microeconomics - # eval": { - "description": "min=238, mean=238, max=238, sum=476 (2)", - "tab": "General information", - "score": 238.0 - }, - "High School Microeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Microeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Microeconomics - # prompt tokens": { - "description": "min=402.277, mean=402.277, max=402.277, sum=804.555 (2)", - "tab": "General information", - "score": 402.2773109243698 - }, - "High School Microeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Physics - # eval": { - "description": "min=151, mean=151, max=151, sum=302 (2)", - "tab": "General information", - "score": 151.0 - }, - "High School Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # prompt tokens": { - "description": "min=580.536, mean=580.536, max=580.536, sum=1161.073 (2)", - "tab": "General information", - "score": 580.5364238410596 - }, - "High School Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Psychology - # eval": { - "description": "min=545, mean=545, max=545, sum=1090 (2)", - "tab": "General information", - "score": 545.0 - }, - "High School Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # prompt tokens": { - "description": "min=495.521, mean=495.521, max=495.521, sum=991.042 (2)", - "tab": "General information", - "score": 495.52110091743117 - }, - "High School Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Statistics - # eval": { - "description": 
"min=216, mean=216, max=216, sum=432 (2)", - "tab": "General information", - "score": 216.0 - }, - "High School Statistics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Statistics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Statistics - # prompt tokens": { - "description": "min=830.477, mean=830.477, max=830.477, sum=1660.954 (2)", - "tab": "General information", - "score": 830.4768518518518 - }, - "High School Statistics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School US History - # eval": { - "description": "min=204, mean=204, max=204, sum=408 (2)", - "tab": "General information", - "score": 204.0 - }, - "High School US History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School US History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # prompt tokens": { - "description": "min=2237.176, mean=2237.176, max=2237.176, sum=4474.353 (2)", - "tab": "General information", - "score": 2237.176470588235 - }, - "High School US History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School World History - # eval": { - "description": "min=237, mean=237, max=237, sum=474 (2)", - "tab": "General information", - "score": 237.0 - }, - "High School World History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School World History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School World History - # prompt tokens": { - "description": "min=1448.354, mean=1448.354, max=1448.354, sum=2896.709 (2)", - "tab": "General information", - "score": 1448.3544303797469 - }, - "High School World History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" - } - } - }, - { - "evaluation_name": "Human Sexuality", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Human Sexuality", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.893, - "details": { - "description": "min=0.893, mean=0.893, max=0.893, sum=1.786 (2)", - "tab": "Accuracy", - "Human Aging - Observed inference time (s)": { - "description": "min=0.275, mean=0.275, max=0.275, sum=0.55 (2)", - "tab": "Efficiency", - "score": 0.2751739634526685 - }, - "Human Sexuality - Observed inference time (s)": { - "description": "min=0.327, mean=0.327, max=0.327, sum=0.655 (2)", - "tab": "Efficiency", - "score": 0.32726097470931426 - }, - "Human Aging - # eval": { - "description": "min=223, mean=223, max=223, 
sum=446 (2)", - "tab": "General information", - "score": 223.0 - }, - "Human Aging - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Aging - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Aging - # prompt tokens": { - "description": "min=322.121, mean=322.121, max=322.121, sum=644.242 (2)", - "tab": "General information", - "score": 322.1210762331838 - }, - "Human Aging - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Human Sexuality - # eval": { - "description": "min=131, mean=131, max=131, sum=262 (2)", - "tab": "General information", - "score": 131.0 - }, - "Human Sexuality - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Sexuality - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # prompt tokens": { - "description": "min=341.504, mean=341.504, max=341.504, sum=683.008 (2)", - "tab": "General information", - "score": 341.5038167938931 - }, - "Human Sexuality - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" - } - } - }, - { - "evaluation_name": "International Law", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on International Law", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.893, - "details": { - "description": "min=0.893, mean=0.893, max=0.893, sum=1.785 (2)", - "tab": "Accuracy", - "International Law - Observed inference time (s)": { - "description": "min=0.297, mean=0.297, max=0.297, sum=0.594 (2)", - "tab": "Efficiency", - "score": 0.2972275757592572 - }, - "International Law - # eval": { - "description": "min=121, mean=121, max=121, sum=242 (2)", - "tab": "General information", - "score": 121.0 - }, - "International Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "International Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "International Law - # prompt tokens": { - "description": "min=640.579, mean=640.579, max=640.579, sum=1281.157 (2)", - "tab": "General information", - "score": 640.5785123966942 - }, - "International Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" - } - } - }, - { - "evaluation_name": "Logical Fallacies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Logical Fallacies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.914, - "details": { - "description": "min=0.914, mean=0.914, max=0.914, sum=1.828 (2)", - "tab": "Accuracy", - "Logical Fallacies - Observed inference time (s)": { - "description": "min=0.218, mean=0.218, max=0.218, sum=0.436 (2)", - "tab": "Efficiency", - "score": 0.21798631311194297 - }, - "Logical Fallacies - # eval": { - "description": "min=163, mean=163, max=163, sum=326 (2)", - "tab": "General information", - "score": 163.0 - }, - "Logical Fallacies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Logical Fallacies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Logical Fallacies - # prompt tokens": { - "description": "min=449.632, mean=449.632, max=449.632, sum=899.264 (2)", - "tab": "General information", - "score": 449.6319018404908 - }, - "Logical Fallacies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" - } - } - }, - { - "evaluation_name": "Machine Learning", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Machine Learning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.768, - "details": { - "description": "min=0.768, mean=0.768, max=0.768, sum=1.536 (2)", - "tab": "Accuracy", - "Machine Learning - Observed inference time (s)": { - "description": "min=0.223, mean=0.223, max=0.223, sum=0.446 (2)", - "tab": "Efficiency", - "score": 0.22287436042513167 - }, - "Machine Learning - # eval": { - "description": "min=112, mean=112, max=112, sum=224 (2)", - "tab": "General information", - "score": 112.0 - }, - "Machine Learning - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Machine Learning - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Machine Learning - # prompt tokens": { - "description": "min=681.848, mean=681.848, max=681.848, sum=1363.696 (2)", - "tab": "General information", - "score": 681.8482142857143 - }, - "Machine Learning - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" - } - } - }, - { - "evaluation_name": "Management", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - 
}, - "metric_config": { - "evaluation_description": "EM on Management", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.903, - "details": { - "description": "min=0.903, mean=0.903, max=0.903, sum=1.806 (2)", - "tab": "Accuracy", - "Management - Observed inference time (s)": { - "description": "min=0.239, mean=0.239, max=0.239, sum=0.478 (2)", - "tab": "Efficiency", - "score": 0.23922002662732764 - }, - "Management - # eval": { - "description": "min=103, mean=103, max=103, sum=206 (2)", - "tab": "General information", - "score": 103.0 - }, - "Management - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Management - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Management - # prompt tokens": { - "description": "min=283.854, mean=283.854, max=283.854, sum=567.709 (2)", - "tab": "General information", - "score": 283.8543689320388 - }, - "Management - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" - } - } - }, - { - "evaluation_name": "Marketing", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Marketing", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.953, - "details": { - "description": "min=0.953, mean=0.953, max=0.953, sum=1.906 (2)", - "tab": "Accuracy", - "Marketing - Observed inference time (s)": { - "description": "min=0.257, mean=0.257, max=0.257, sum=0.514 (2)", - "tab": "Efficiency", - "score": 0.2568996777901283 - }, - "Marketing - # eval": { - "description": "min=234, mean=234, max=234, sum=468 (2)", - "tab": "General information", - "score": 234.0 - }, - "Marketing - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Marketing - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Marketing - # prompt tokens": { - "description": "min=404.415, mean=404.415, max=404.415, sum=808.829 (2)", - "tab": "General information", - "score": 404.4145299145299 - }, - "Marketing - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" - } - } - }, - { - "evaluation_name": "Medical Genetics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Medical Genetics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9, - "details": { - "description": 
"min=0.9, mean=0.9, max=0.9, sum=1.8 (2)", - "tab": "Accuracy", - "Medical Genetics - Observed inference time (s)": { - "description": "min=0.267, mean=0.267, max=0.267, sum=0.534 (2)", - "tab": "Efficiency", - "score": 0.26675461292266844 - }, - "Medical Genetics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Medical Genetics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Medical Genetics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Medical Genetics - # prompt tokens": { - "description": "min=342.35, mean=342.35, max=342.35, sum=684.7 (2)", - "tab": "General information", - "score": 342.35 - }, - "Medical Genetics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" - } - } - }, - { - "evaluation_name": "Miscellaneous", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Miscellaneous", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.943, - "details": { - "description": "min=0.943, mean=0.943, max=0.943, sum=1.885 (2)", - "tab": "Accuracy", - "Miscellaneous - Observed inference time (s)": { - "description": "min=0.227, mean=0.227, max=0.227, sum=0.453 (2)", - "tab": "Efficiency", - "score": 0.22672867470469663 - }, - "Miscellaneous - # eval": { - "description": "min=783, mean=783, max=783, sum=1566 (2)", - "tab": "General information", - "score": 783.0 - }, - "Miscellaneous - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Miscellaneous - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Miscellaneous - # prompt tokens": { - "description": "min=303.7, mean=303.7, max=303.7, sum=607.4 (2)", - "tab": "General information", - "score": 303.6998722860792 - }, - "Miscellaneous - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" - } - } - }, - { - "evaluation_name": "Moral Scenarios", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Moral Scenarios", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.815, - "details": { - "description": "min=0.815, mean=0.815, max=0.815, sum=1.629 (2)", - "tab": "Accuracy", - "Moral Disputes - Observed inference time (s)": { - "description": "min=0.237, mean=0.237, max=0.237, sum=0.473 
(2)", - "tab": "Efficiency", - "score": 0.23662481900584492 - }, - "Moral Scenarios - Observed inference time (s)": { - "description": "min=0.242, mean=0.242, max=0.242, sum=0.483 (2)", - "tab": "Efficiency", - "score": 0.241705964264257 - }, - "Moral Disputes - # eval": { - "description": "min=346, mean=346, max=346, sum=692 (2)", - "tab": "General information", - "score": 346.0 - }, - "Moral Disputes - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Disputes - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Disputes - # prompt tokens": { - "description": "min=476.182, mean=476.182, max=476.182, sum=952.364 (2)", - "tab": "General information", - "score": 476.1820809248555 - }, - "Moral Disputes - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Moral Scenarios - # eval": { - "description": "min=895, mean=895, max=895, sum=1790 (2)", - "tab": "General information", - "score": 895.0 - }, - "Moral Scenarios - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Scenarios - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # prompt tokens": { - "description": "min=668.494, mean=668.494, max=668.494, sum=1336.988 (2)", - "tab": "General information", - "score": 668.4938547486033 - }, - "Moral Scenarios - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" - } - } - }, - { - "evaluation_name": "Nutrition", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Nutrition", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.902, - "details": { - "description": "min=0.902, mean=0.902, max=0.902, sum=1.804 (2)", - "tab": "Accuracy", - "Nutrition - Observed inference time (s)": { - "description": "min=0.25, mean=0.25, max=0.25, sum=0.5 (2)", - "tab": "Efficiency", - "score": 0.2500531182569616 - }, - "Nutrition - # eval": { - "description": "min=306, mean=306, max=306, sum=612 (2)", - "tab": "General information", - "score": 306.0 - }, - "Nutrition - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Nutrition - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Nutrition - # prompt tokens": { - "description": "min=599.637, mean=599.637, max=599.637, sum=1199.275 (2)", - "tab": "General information", - "score": 599.6372549019608 - }, - "Nutrition - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": 
"mmlu_nutrition" - } - } - }, - { - "evaluation_name": "Prehistory", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Prehistory", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.914, - "details": { - "description": "min=0.914, mean=0.914, max=0.914, sum=1.827 (2)", - "tab": "Accuracy", - "Prehistory - Observed inference time (s)": { - "description": "min=0.257, mean=0.257, max=0.257, sum=0.515 (2)", - "tab": "Efficiency", - "score": 0.25728267504845137 - }, - "Prehistory - # eval": { - "description": "min=324, mean=324, max=324, sum=648 (2)", - "tab": "General information", - "score": 324.0 - }, - "Prehistory - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Prehistory - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Prehistory - # prompt tokens": { - "description": "min=528.364, mean=528.364, max=528.364, sum=1056.728 (2)", - "tab": "General information", - "score": 528.3641975308642 - }, - "Prehistory - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" - } - } - }, - { - "evaluation_name": "Public Relations", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Public Relations", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.745, - "details": { - "description": "min=0.745, mean=0.745, max=0.745, sum=1.491 (2)", - "tab": "Accuracy", - "Public Relations - Observed inference time (s)": { - "description": "min=0.219, mean=0.219, max=0.219, sum=0.437 (2)", - "tab": "Efficiency", - "score": 0.2186152393167669 - }, - "Public Relations - # eval": { - "description": "min=110, mean=110, max=110, sum=220 (2)", - "tab": "General information", - "score": 110.0 - }, - "Public Relations - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Public Relations - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Public Relations - # prompt tokens": { - "description": "min=408.427, mean=408.427, max=408.427, sum=816.855 (2)", - "tab": "General information", - "score": 408.42727272727274 - }, - "Public Relations - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" - } - } - }, - { - "evaluation_name": "Security Studies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Security Studies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.837, - "details": { - "description": "min=0.837, mean=0.837, max=0.837, sum=1.673 (2)", - "tab": "Accuracy", - "Security Studies - Observed inference time (s)": { - "description": "min=0.298, mean=0.298, max=0.298, sum=0.595 (2)", - "tab": "Efficiency", - "score": 0.29758678261114624 - }, - "Security Studies - # eval": { - "description": "min=245, mean=245, max=245, sum=490 (2)", - "tab": "General information", - "score": 245.0 - }, - "Security Studies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Security Studies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Security Studies - # prompt tokens": { - "description": "min=1166.931, mean=1166.931, max=1166.931, sum=2333.861 (2)", - "tab": "General information", - "score": 1166.930612244898 - }, - "Security Studies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" - } - } - }, - { - "evaluation_name": "Sociology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Sociology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.935, - "details": { - "description": "min=0.935, mean=0.935, max=0.935, sum=1.871 (2)", - "tab": "Accuracy", - "Sociology - Observed inference time (s)": { - "description": "min=0.228, mean=0.228, max=0.228, sum=0.457 (2)", - "tab": "Efficiency", - "score": 0.22830370172339293 - }, - "Sociology - # eval": { - "description": "min=201, mean=201, max=201, sum=402 (2)", - "tab": "General information", - "score": 201.0 - }, - "Sociology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Sociology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Sociology - # prompt tokens": { - "description": "min=450.1, mean=450.1, max=450.1, sum=900.199 (2)", - "tab": "General information", - "score": 450.0995024875622 - }, - "Sociology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" - } - } - }, - { - "evaluation_name": "Virology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Virology", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.56, - "details": { - "description": "min=0.56, mean=0.56, max=0.56, sum=1.12 (2)", - "tab": "Accuracy", - "Virology - Observed inference time (s)": { - "description": "min=0.25, mean=0.25, max=0.25, sum=0.499 (2)", - "tab": "Efficiency", - "score": 0.24956520206956978 - }, - "Virology - # eval": { - "description": "min=166, mean=166, max=166, sum=332 (2)", - "tab": "General information", - "score": 166.0 - }, - "Virology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Virology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Virology - # prompt tokens": { - "description": "min=343.819, mean=343.819, max=343.819, sum=687.639 (2)", - "tab": "General information", - "score": 343.8192771084337 - }, - "Virology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" - } - } - }, - { - "evaluation_name": "World Religions", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on World Religions", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.848, - "details": { - "description": "min=0.848, mean=0.848, max=0.848, sum=1.696 (2)", - "tab": "Accuracy", - "World Religions - Observed inference time (s)": { - "description": "min=0.197, mean=0.197, max=0.197, sum=0.394 (2)", - "tab": "Efficiency", - "score": 0.19691006342569986 - }, - "World Religions - # eval": { - "description": "min=171, mean=171, max=171, sum=342 (2)", - "tab": "General information", - "score": 171.0 - }, - "World Religions - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "World Religions - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "World Religions - # prompt tokens": { - "description": "min=276.07, mean=276.07, max=276.07, sum=552.14 (2)", - "tab": "General information", - "score": 276.0701754385965 - }, - "World Religions - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" - } - } - }, - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.826, - "details": { - "tab": 
"Efficiency" - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_mmlu/qwen/qwen2.5-72b-instruct-turbo/29958cee-32c9-4d51-8f14-72db4273459f.json b/data/helm_mmlu/qwen/qwen2.5-72b-instruct-turbo/29958cee-32c9-4d51-8f14-72db4273459f.json deleted file mode 100644 index a61d620fd..000000000 --- a/data/helm_mmlu/qwen/qwen2.5-72b-instruct-turbo/29958cee-32c9-4d51-8f14-72db4273459f.json +++ /dev/null @@ -1,3021 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/qwen_qwen2.5-72b-instruct-turbo/1770835937.459157", - "retrieved_timestamp": "1770835937.459157", - "source_metadata": { - "source_name": "helm_mmlu", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5 Instruct Turbo 72B", - "id": "qwen/qwen2.5-72b-instruct-turbo", - "developer": "qwen", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "MMLU All Subjects", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU All Subjects", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.834, - "details": { - "description": "min=0.584, mean=0.834, max=0.99, sum=95.044 (114)", - "tab": "Accuracy", - "MMLU All Subjects - Observed inference time (s)": { - "description": "min=0.277, mean=0.504, max=1.68, sum=57.492 (114)", - "tab": "Efficiency", - "score": 0.5043123259817794 - }, - "MMLU All Subjects - # eval": { - "description": "min=100, mean=246.351, max=1534, sum=28084 (114)", - "tab": "General information", - "score": 246.35087719298247 - }, - "MMLU All Subjects - # train": { - "description": "min=5, mean=5, max=5, sum=570 (114)", - "tab": "General information", - "score": 5.0 - }, - "MMLU All Subjects - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (114)", - "tab": "General information", - "score": 0.0 - }, - "MMLU All Subjects - # prompt tokens": { - "description": "min=276.07, mean=625.598, max=2814.903, sum=71318.198 (114)", - "tab": "General information", - "score": 625.5982315160392 - }, - "MMLU All Subjects - # output tokens": { - "description": "min=1, mean=1, max=1, sum=114 (114)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - 
"logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] - } - } - }, - { - "evaluation_name": "Abstract Algebra", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Abstract Algebra", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.68, - "details": { - "description": "min=0.68, mean=0.68, max=0.68, sum=1.36 (2)", - "tab": "Accuracy", - "Abstract Algebra - Observed inference time (s)": { - "description": "min=0.438, mean=0.438, max=0.438, sum=0.877 (2)", - "tab": "Efficiency", - "score": 0.438259596824646 - }, - "Abstract Algebra - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Abstract Algebra - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Abstract Algebra - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Abstract Algebra - # prompt tokens": { - "description": "min=378.19, mean=378.19, max=378.19, sum=756.38 (2)", - "tab": "General information", - "score": 378.19 - }, - "Abstract Algebra - # output tokens": { - "description": "min=1, mean=1, 
max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" - } - } - }, - { - "evaluation_name": "Anatomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Anatomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.822, - "details": { - "description": "min=0.822, mean=0.822, max=0.822, sum=1.644 (2)", - "tab": "Accuracy", - "Anatomy - Observed inference time (s)": { - "description": "min=0.365, mean=0.365, max=0.365, sum=0.729 (2)", - "tab": "Efficiency", - "score": 0.3645249543366609 - }, - "Anatomy - # eval": { - "description": "min=135, mean=135, max=135, sum=270 (2)", - "tab": "General information", - "score": 135.0 - }, - "Anatomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Anatomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Anatomy - # prompt tokens": { - "description": "min=353.978, mean=353.978, max=353.978, sum=707.956 (2)", - "tab": "General information", - "score": 353.97777777777776 - }, - "Anatomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" - } - } - }, - { - "evaluation_name": "College Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on College Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.588, - "details": { - "description": "min=0.588, mean=0.588, max=0.588, sum=1.176 (2)", - "tab": "Accuracy", - "College Chemistry - Observed inference time (s)": { - "description": "min=0.519, mean=0.519, max=0.519, sum=1.038 (2)", - "tab": "Efficiency", - "score": 0.5187593793869019 - }, - "College Biology - Observed inference time (s)": { - "description": "min=0.406, mean=0.406, max=0.406, sum=0.811 (2)", - "tab": "Efficiency", - "score": 0.40557659500175053 - }, - "College Computer Science - Observed inference time (s)": { - "description": "min=0.485, mean=0.485, max=0.485, sum=0.97 (2)", - "tab": "Efficiency", - "score": 0.48524248123168945 - }, - "College Mathematics - Observed inference time (s)": { - "description": "min=0.566, mean=0.566, max=0.566, sum=1.132 (2)", - "tab": "Efficiency", - "score": 0.5662378907203675 - }, - "College Medicine - Observed inference time (s)": { - "description": "min=0.528, mean=0.528, max=0.528, sum=1.055 (2)", - "tab": "Efficiency", - "score": 0.5277049872227487 - }, - "College Physics - Observed inference time (s)": { - "description": "min=0.45, mean=0.45, max=0.45, sum=0.9 (2)", - "tab": "Efficiency", - "score": 0.4500672326368444 - 
}, - "College Chemistry - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Chemistry - # prompt tokens": { - "description": "min=568.25, mean=568.25, max=568.25, sum=1136.5 (2)", - "tab": "General information", - "score": 568.25 - }, - "College Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Biology - # eval": { - "description": "min=144, mean=144, max=144, sum=288 (2)", - "tab": "General information", - "score": 144.0 - }, - "College Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # prompt tokens": { - "description": "min=486.979, mean=486.979, max=486.979, sum=973.958 (2)", - "tab": "General information", - "score": 486.9791666666667 - }, - "College Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer Science - # prompt tokens": { - "description": "min=838.58, mean=838.58, max=838.58, sum=1677.16 (2)", - "tab": "General information", - "score": 838.58 - }, - "College Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Mathematics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # prompt tokens": { - "description": "min=607.7, mean=607.7, max=607.7, sum=1215.4 (2)", - "tab": "General information", - "score": 607.7 - }, - "College Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Medicine - # eval": { - "description": "min=173, mean=173, max=173, sum=346 (2)", - "tab": "General information", - "score": 173.0 - }, - "College Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Medicine - # prompt tokens": { - "description": "min=506.098, mean=506.098, 
max=506.098, sum=1012.197 (2)", - "tab": "General information", - "score": 506.0982658959538 - }, - "College Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Physics - # eval": { - "description": "min=102, mean=102, max=102, sum=204 (2)", - "tab": "General information", - "score": 102.0 - }, - "College Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Physics - # prompt tokens": { - "description": "min=516.265, mean=516.265, max=516.265, sum=1032.529 (2)", - "tab": "General information", - "score": 516.2647058823529 - }, - "College Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" - } - } - }, - { - "evaluation_name": "Computer Security", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Computer Security", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.86, - "details": { - "description": "min=0.86, mean=0.86, max=0.86, sum=1.72 (2)", - "tab": "Accuracy", - "Computer Security - Observed inference time (s)": { - "description": "min=0.506, mean=0.506, max=0.506, sum=1.011 (2)", - "tab": "Efficiency", - "score": 0.5056298255920411 - }, - "Computer Security - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Computer Security - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Computer Security - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Computer Security - # prompt tokens": { - "description": "min=386.64, mean=386.64, max=386.64, sum=773.28 (2)", - "tab": "General information", - "score": 386.64 - }, - "Computer Security - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" - } - } - }, - { - "evaluation_name": "Econometrics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Econometrics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.728, - "details": { - "description": "min=0.728, mean=0.728, max=0.728, sum=1.456 (2)", - "tab": "Accuracy", - "Econometrics - Observed inference time (s)": { - "description": "min=0.646, 
mean=0.646, max=0.646, sum=1.293 (2)", - "tab": "Efficiency", - "score": 0.6464532927462929 - }, - "Econometrics - # eval": { - "description": "min=114, mean=114, max=114, sum=228 (2)", - "tab": "General information", - "score": 114.0 - }, - "Econometrics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Econometrics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Econometrics - # prompt tokens": { - "description": "min=627.939, mean=627.939, max=627.939, sum=1255.877 (2)", - "tab": "General information", - "score": 627.938596491228 - }, - "Econometrics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" - } - } - }, - { - "evaluation_name": "Global Facts", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Global Facts", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.61, - "details": { - "description": "min=0.61, mean=0.61, max=0.61, sum=1.22 (2)", - "tab": "Accuracy", - "Global Facts - Observed inference time (s)": { - "description": "min=0.517, mean=0.517, max=0.517, sum=1.035 (2)", - "tab": "Efficiency", - "score": 0.5174938654899597 - }, - "Global Facts - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Global Facts - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Global Facts - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Global Facts - # prompt tokens": { - "description": "min=429.06, mean=429.06, max=429.06, sum=858.12 (2)", - "tab": "General information", - "score": 429.06 - }, - "Global Facts - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" - } - } - }, - { - "evaluation_name": "Jurisprudence", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Jurisprudence", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.87, - "details": { - "description": "min=0.87, mean=0.87, max=0.87, sum=1.741 (2)", - "tab": "Accuracy", - "Jurisprudence - Observed inference time (s)": { - "description": "min=0.552, mean=0.552, max=0.552, sum=1.105 (2)", - "tab": "Efficiency", - "score": 0.55242551918383 - }, - "Jurisprudence - # eval": { - "description": "min=108, mean=108, max=108, sum=216 (2)", - "tab": "General information", - "score": 
108.0 - }, - "Jurisprudence - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Jurisprudence - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Jurisprudence - # prompt tokens": { - "description": "min=394.713, mean=394.713, max=394.713, sum=789.426 (2)", - "tab": "General information", - "score": 394.712962962963 - }, - "Jurisprudence - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" - } - } - }, - { - "evaluation_name": "Philosophy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Philosophy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.839, - "details": { - "description": "min=0.839, mean=0.839, max=0.839, sum=1.678 (2)", - "tab": "Accuracy", - "Philosophy - Observed inference time (s)": { - "description": "min=1.352, mean=1.352, max=1.352, sum=2.704 (2)", - "tab": "Efficiency", - "score": 1.3517981679493207 - }, - "Philosophy - # eval": { - "description": "min=311, mean=311, max=311, sum=622 (2)", - "tab": "General information", - "score": 311.0 - }, - "Philosophy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Philosophy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Philosophy - # prompt tokens": { - "description": "min=329.09, mean=329.09, max=329.09, sum=658.18 (2)", - "tab": "General information", - "score": 329.09003215434086 - }, - "Philosophy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" - } - } - }, - { - "evaluation_name": "Professional Psychology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Professional Psychology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.864, - "details": { - "description": "min=0.864, mean=0.864, max=0.864, sum=1.729 (2)", - "tab": "Accuracy", - "Professional Medicine - Observed inference time (s)": { - "description": "min=1.02, mean=1.02, max=1.02, sum=2.039 (2)", - "tab": "Efficiency", - "score": 1.019735706203124 - }, - "Professional Accounting - Observed inference time (s)": { - "description": "min=0.282, mean=0.282, max=0.282, sum=0.565 (2)", - "tab": "Efficiency", - "score": 0.2822888328673992 - }, - "Professional Law - Observed inference time (s)": { - "description": "min=1.68, mean=1.68, max=1.68, sum=3.36 (2)", - "tab": "Efficiency", - 
"score": 1.6800112862630494 - }, - "Professional Psychology - Observed inference time (s)": { - "description": "min=0.573, mean=0.573, max=0.573, sum=1.145 (2)", - "tab": "Efficiency", - "score": 0.5726091144910825 - }, - "Professional Medicine - # eval": { - "description": "min=272, mean=272, max=272, sum=544 (2)", - "tab": "General information", - "score": 272.0 - }, - "Professional Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Medicine - # prompt tokens": { - "description": "min=1125.199, mean=1125.199, max=1125.199, sum=2250.397 (2)", - "tab": "General information", - "score": 1125.1985294117646 - }, - "Professional Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Accounting - # eval": { - "description": "min=282, mean=282, max=282, sum=564 (2)", - "tab": "General information", - "score": 282.0 - }, - "Professional Accounting - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Accounting - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Accounting - # prompt tokens": { - "description": "min=739.34, mean=739.34, max=739.34, sum=1478.681 (2)", - "tab": "General information", - "score": 739.3404255319149 - }, - "Professional Accounting - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Law - # eval": { - "description": "min=1534, mean=1534, max=1534, sum=3068 (2)", - "tab": "General information", - "score": 1534.0 - }, - "Professional Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # prompt tokens": { - "description": "min=1663.969, mean=1663.969, max=1663.969, sum=3327.939 (2)", - "tab": "General information", - "score": 1663.9693611473272 - }, - "Professional Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Psychology - # eval": { - "description": "min=612, mean=612, max=612, sum=1224 (2)", - "tab": "General information", - "score": 612.0 - }, - "Professional Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # prompt tokens": { - "description": "min=581.417, mean=581.417, max=581.417, sum=1162.833 (2)", - "tab": "General information", - "score": 581.4166666666666 - }, - "Professional Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": 
"mmlu_professional_psychology" - } - } - }, - { - "evaluation_name": "Us Foreign Policy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Us Foreign Policy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.96, - "details": { - "description": "min=0.96, mean=0.96, max=0.96, sum=1.92 (2)", - "tab": "Accuracy", - "Us Foreign Policy - Observed inference time (s)": { - "description": "min=0.789, mean=0.789, max=0.789, sum=1.578 (2)", - "tab": "Efficiency", - "score": 0.7888539290428161 - }, - "Us Foreign Policy - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Us Foreign Policy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Us Foreign Policy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Us Foreign Policy - # prompt tokens": { - "description": "min=428.16, mean=428.16, max=428.16, sum=856.32 (2)", - "tab": "General information", - "score": 428.16 - }, - "Us Foreign Policy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" - } - } - }, - { - "evaluation_name": "Astronomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Astronomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.934, - "details": { - "description": "min=0.934, mean=0.934, max=0.934, sum=1.868 (2)", - "tab": "Accuracy", - "Astronomy - Observed inference time (s)": { - "description": "min=0.991, mean=0.991, max=0.991, sum=1.983 (2)", - "tab": "Efficiency", - "score": 0.9913477442766491 - }, - "Astronomy - # eval": { - "description": "min=152, mean=152, max=152, sum=304 (2)", - "tab": "General information", - "score": 152.0 - }, - "Astronomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Astronomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Astronomy - # prompt tokens": { - "description": "min=589.849, mean=589.849, max=589.849, sum=1179.697 (2)", - "tab": "General information", - "score": 589.8486842105264 - }, - "Astronomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" - } - } - }, - { - "evaluation_name": "Business Ethics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Business Ethics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.85, - "details": { - "description": "min=0.85, mean=0.85, max=0.85, sum=1.7 (2)", - "tab": "Accuracy", - "Business Ethics - Observed inference time (s)": { - "description": "min=0.581, mean=0.581, max=0.581, sum=1.163 (2)", - "tab": "Efficiency", - "score": 0.5813773083686828 - }, - "Business Ethics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Business Ethics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Business Ethics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Business Ethics - # prompt tokens": { - "description": "min=569.87, mean=569.87, max=569.87, sum=1139.74 (2)", - "tab": "General information", - "score": 569.87 - }, - "Business Ethics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" - } - } - }, - { - "evaluation_name": "Clinical Knowledge", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Clinical Knowledge", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.872, - "details": { - "description": "min=0.872, mean=0.872, max=0.872, sum=1.743 (2)", - "tab": "Accuracy", - "Clinical Knowledge - Observed inference time (s)": { - "description": "min=0.74, mean=0.74, max=0.74, sum=1.48 (2)", - "tab": "Efficiency", - "score": 0.7399316436839554 - }, - "Clinical Knowledge - # eval": { - "description": "min=265, mean=265, max=265, sum=530 (2)", - "tab": "General information", - "score": 265.0 - }, - "Clinical Knowledge - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Clinical Knowledge - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Clinical Knowledge - # prompt tokens": { - "description": "min=400.623, mean=400.623, max=400.623, sum=801.245 (2)", - "tab": "General information", - "score": 400.62264150943395 - }, - "Clinical Knowledge - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" - } - } - }, - { - "evaluation_name": "Conceptual Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - 
"metric_config": { - "evaluation_description": "EM on Conceptual Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.885, - "details": { - "description": "min=0.885, mean=0.885, max=0.885, sum=1.77 (2)", - "tab": "Accuracy", - "Conceptual Physics - Observed inference time (s)": { - "description": "min=0.321, mean=0.321, max=0.321, sum=0.643 (2)", - "tab": "Efficiency", - "score": 0.32127690010882437 - }, - "Conceptual Physics - # eval": { - "description": "min=235, mean=235, max=235, sum=470 (2)", - "tab": "General information", - "score": 235.0 - }, - "Conceptual Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Conceptual Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Conceptual Physics - # prompt tokens": { - "description": "min=305.494, mean=305.494, max=305.494, sum=610.987 (2)", - "tab": "General information", - "score": 305.4936170212766 - }, - "Conceptual Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" - } - } - }, - { - "evaluation_name": "Electrical Engineering", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Electrical Engineering", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8, - "details": { - "description": "min=0.8, mean=0.8, max=0.8, sum=1.6 (2)", - "tab": "Accuracy", - "Electrical Engineering - Observed inference time (s)": { - "description": "min=0.513, mean=0.513, max=0.513, sum=1.026 (2)", - "tab": "Efficiency", - "score": 0.5130313610208446 - }, - "Electrical Engineering - # eval": { - "description": "min=145, mean=145, max=145, sum=290 (2)", - "tab": "General information", - "score": 145.0 - }, - "Electrical Engineering - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Electrical Engineering - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Electrical Engineering - # prompt tokens": { - "description": "min=463.8, mean=463.8, max=463.8, sum=927.6 (2)", - "tab": "General information", - "score": 463.8 - }, - "Electrical Engineering - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" - } - } - }, - { - "evaluation_name": "Elementary Mathematics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Elementary 
Mathematics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.87, - "details": { - "description": "min=0.87, mean=0.87, max=0.87, sum=1.741 (2)", - "tab": "Accuracy", - "Elementary Mathematics - Observed inference time (s)": { - "description": "min=1.022, mean=1.022, max=1.022, sum=2.044 (2)", - "tab": "Efficiency", - "score": 1.0221643580330744 - }, - "Elementary Mathematics - # eval": { - "description": "min=378, mean=378, max=378, sum=756 (2)", - "tab": "General information", - "score": 378.0 - }, - "Elementary Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Elementary Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Elementary Mathematics - # prompt tokens": { - "description": "min=577.119, mean=577.119, max=577.119, sum=1154.238 (2)", - "tab": "General information", - "score": 577.1190476190476 - }, - "Elementary Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" - } - } - }, - { - "evaluation_name": "Formal Logic", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Formal Logic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.73, - "details": { - "description": "min=0.73, mean=0.73, max=0.73, sum=1.46 (2)", - "tab": "Accuracy", - "Formal Logic - Observed inference time (s)": { - "description": "min=0.489, mean=0.489, max=0.489, sum=0.978 (2)", - "tab": "Efficiency", - "score": 0.48887844501979766 - }, - "Formal Logic - # eval": { - "description": "min=126, mean=126, max=126, sum=252 (2)", - "tab": "General information", - "score": 126.0 - }, - "Formal Logic - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Formal Logic - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Formal Logic - # prompt tokens": { - "description": "min=604.667, mean=604.667, max=604.667, sum=1209.333 (2)", - "tab": "General information", - "score": 604.6666666666666 - }, - "Formal Logic - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" - } - } - }, - { - "evaluation_name": "High School World History", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on High School World History", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 
1.0 - }, - "score_details": { - "score": 0.92, - "details": { - "description": "min=0.92, mean=0.92, max=0.92, sum=1.84 (2)", - "tab": "Accuracy", - "High School Biology - Observed inference time (s)": { - "description": "min=0.342, mean=0.342, max=0.342, sum=0.685 (2)", - "tab": "Efficiency", - "score": 0.34227523111527963 - }, - "High School Chemistry - Observed inference time (s)": { - "description": "min=0.336, mean=0.336, max=0.336, sum=0.673 (2)", - "tab": "Efficiency", - "score": 0.3364456193200473 - }, - "High School Computer Science - Observed inference time (s)": { - "description": "min=0.384, mean=0.384, max=0.384, sum=0.768 (2)", - "tab": "Efficiency", - "score": 0.38405280351638793 - }, - "High School European History - Observed inference time (s)": { - "description": "min=0.582, mean=0.582, max=0.582, sum=1.165 (2)", - "tab": "Efficiency", - "score": 0.5822634451317065 - }, - "High School Geography - Observed inference time (s)": { - "description": "min=0.366, mean=0.366, max=0.366, sum=0.731 (2)", - "tab": "Efficiency", - "score": 0.3657490508724945 - }, - "High School Government And Politics - Observed inference time (s)": { - "description": "min=0.388, mean=0.388, max=0.388, sum=0.776 (2)", - "tab": "Efficiency", - "score": 0.3882344139672314 - }, - "High School Macroeconomics - Observed inference time (s)": { - "description": "min=0.311, mean=0.311, max=0.311, sum=0.623 (2)", - "tab": "Efficiency", - "score": 0.31144848542335707 - }, - "High School Mathematics - Observed inference time (s)": { - "description": "min=0.364, mean=0.364, max=0.364, sum=0.727 (2)", - "tab": "Efficiency", - "score": 0.3636930130146168 - }, - "High School Microeconomics - Observed inference time (s)": { - "description": "min=0.572, mean=0.572, max=0.572, sum=1.145 (2)", - "tab": "Efficiency", - "score": 0.5723558383829453 - }, - "High School Physics - Observed inference time (s)": { - "description": "min=0.891, mean=0.891, max=0.891, sum=1.782 (2)", - "tab": "Efficiency", - "score": 0.8909238490047834 - }, - "High School Psychology - Observed inference time (s)": { - "description": "min=0.312, mean=0.312, max=0.312, sum=0.623 (2)", - "tab": "Efficiency", - "score": 0.31171117397623327 - }, - "High School Statistics - Observed inference time (s)": { - "description": "min=0.376, mean=0.376, max=0.376, sum=0.751 (2)", - "tab": "Efficiency", - "score": 0.3756344163859332 - }, - "High School US History - Observed inference time (s)": { - "description": "min=0.453, mean=0.453, max=0.453, sum=0.907 (2)", - "tab": "Efficiency", - "score": 0.45333802466299017 - }, - "High School World History - Observed inference time (s)": { - "description": "min=0.526, mean=0.526, max=0.526, sum=1.051 (2)", - "tab": "Efficiency", - "score": 0.5255286924949678 - }, - "High School Biology - # eval": { - "description": "min=310, mean=310, max=310, sum=620 (2)", - "tab": "General information", - "score": 310.0 - }, - "High School Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Biology - # prompt tokens": { - "description": "min=513.916, mean=513.916, max=513.916, sum=1027.832 (2)", - "tab": "General information", - "score": 513.916129032258 - }, - "High School Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - 
}, - "High School Chemistry - # eval": { - "description": "min=203, mean=203, max=203, sum=406 (2)", - "tab": "General information", - "score": 203.0 - }, - "High School Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # prompt tokens": { - "description": "min=517.261, mean=517.261, max=517.261, sum=1034.522 (2)", - "tab": "General information", - "score": 517.2610837438424 - }, - "High School Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "High School Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # prompt tokens": { - "description": "min=878.46, mean=878.46, max=878.46, sum=1756.92 (2)", - "tab": "General information", - "score": 878.46 - }, - "High School Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School European History - # eval": { - "description": "min=165, mean=165, max=165, sum=330 (2)", - "tab": "General information", - "score": 165.0 - }, - "High School European History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School European History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School European History - # prompt tokens": { - "description": "min=2814.903, mean=2814.903, max=2814.903, sum=5629.806 (2)", - "tab": "General information", - "score": 2814.9030303030304 - }, - "High School European History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Geography - # eval": { - "description": "min=198, mean=198, max=198, sum=396 (2)", - "tab": "General information", - "score": 198.0 - }, - "High School Geography - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Geography - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Geography - # prompt tokens": { - "description": "min=372.217, mean=372.217, max=372.217, sum=744.434 (2)", - "tab": "General information", - "score": 372.2171717171717 - }, - "High School Geography - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Government And Politics - # eval": { - "description": "min=193, mean=193, max=193, sum=386 (2)", - "tab": "General information", - "score": 193.0 - }, - "High School Government And Politics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Government And Politics - 
truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # prompt tokens": { - "description": "min=467.311, mean=467.311, max=467.311, sum=934.622 (2)", - "tab": "General information", - "score": 467.31088082901556 - }, - "High School Government And Politics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Macroeconomics - # eval": { - "description": "min=390, mean=390, max=390, sum=780 (2)", - "tab": "General information", - "score": 390.0 - }, - "High School Macroeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Macroeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - # prompt tokens": { - "description": "min=374.349, mean=374.349, max=374.349, sum=748.697 (2)", - "tab": "General information", - "score": 374.34871794871793 - }, - "High School Macroeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Mathematics - # eval": { - "description": "min=270, mean=270, max=270, sum=540 (2)", - "tab": "General information", - "score": 270.0 - }, - "High School Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # prompt tokens": { - "description": "min=565.326, mean=565.326, max=565.326, sum=1130.652 (2)", - "tab": "General information", - "score": 565.325925925926 - }, - "High School Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Microeconomics - # eval": { - "description": "min=238, mean=238, max=238, sum=476 (2)", - "tab": "General information", - "score": 238.0 - }, - "High School Microeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Microeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Microeconomics - # prompt tokens": { - "description": "min=402.277, mean=402.277, max=402.277, sum=804.555 (2)", - "tab": "General information", - "score": 402.2773109243698 - }, - "High School Microeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Physics - # eval": { - "description": "min=151, mean=151, max=151, sum=302 (2)", - "tab": "General information", - "score": 151.0 - }, - "High School Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # prompt tokens": { - "description": "min=580.536, mean=580.536, max=580.536, sum=1161.073 (2)", - "tab": "General information", - "score": 580.5364238410596 - }, - "High School Physics - # output 
tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Psychology - # eval": { - "description": "min=545, mean=545, max=545, sum=1090 (2)", - "tab": "General information", - "score": 545.0 - }, - "High School Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # prompt tokens": { - "description": "min=495.521, mean=495.521, max=495.521, sum=991.042 (2)", - "tab": "General information", - "score": 495.52110091743117 - }, - "High School Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Statistics - # eval": { - "description": "min=216, mean=216, max=216, sum=432 (2)", - "tab": "General information", - "score": 216.0 - }, - "High School Statistics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Statistics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Statistics - # prompt tokens": { - "description": "min=830.477, mean=830.477, max=830.477, sum=1660.954 (2)", - "tab": "General information", - "score": 830.4768518518518 - }, - "High School Statistics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School US History - # eval": { - "description": "min=204, mean=204, max=204, sum=408 (2)", - "tab": "General information", - "score": 204.0 - }, - "High School US History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School US History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # prompt tokens": { - "description": "min=2237.176, mean=2237.176, max=2237.176, sum=4474.353 (2)", - "tab": "General information", - "score": 2237.176470588235 - }, - "High School US History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School World History - # eval": { - "description": "min=237, mean=237, max=237, sum=474 (2)", - "tab": "General information", - "score": 237.0 - }, - "High School World History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School World History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School World History - # prompt tokens": { - "description": "min=1448.354, mean=1448.354, max=1448.354, sum=2896.709 (2)", - "tab": "General information", - "score": 1448.3544303797469 - }, - "High School World History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" - } - } - }, - { - "evaluation_name": "Human Sexuality", - 
"source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Human Sexuality", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.878, - "details": { - "description": "min=0.878, mean=0.878, max=0.878, sum=1.756 (2)", - "tab": "Accuracy", - "Human Aging - Observed inference time (s)": { - "description": "min=0.428, mean=0.428, max=0.428, sum=0.856 (2)", - "tab": "Efficiency", - "score": 0.42812311168208783 - }, - "Human Sexuality - Observed inference time (s)": { - "description": "min=0.318, mean=0.318, max=0.318, sum=0.635 (2)", - "tab": "Efficiency", - "score": 0.3175856612110866 - }, - "Human Aging - # eval": { - "description": "min=223, mean=223, max=223, sum=446 (2)", - "tab": "General information", - "score": 223.0 - }, - "Human Aging - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Aging - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Aging - # prompt tokens": { - "description": "min=322.121, mean=322.121, max=322.121, sum=644.242 (2)", - "tab": "General information", - "score": 322.1210762331838 - }, - "Human Aging - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Human Sexuality - # eval": { - "description": "min=131, mean=131, max=131, sum=262 (2)", - "tab": "General information", - "score": 131.0 - }, - "Human Sexuality - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Sexuality - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # prompt tokens": { - "description": "min=341.504, mean=341.504, max=341.504, sum=683.008 (2)", - "tab": "General information", - "score": 341.5038167938931 - }, - "Human Sexuality - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" - } - } - }, - { - "evaluation_name": "International Law", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on International Law", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.893, - "details": { - "description": "min=0.893, mean=0.893, max=0.893, sum=1.785 (2)", - "tab": "Accuracy", - "International Law - Observed inference time (s)": { - "description": "min=0.425, mean=0.425, max=0.425, sum=0.85 (2)", - "tab": "Efficiency", - "score": 0.4248029200498723 - }, - "International Law - # eval": { - "description": "min=121, mean=121, max=121, sum=242 (2)", - "tab": "General information", - "score": 121.0 - }, - "International Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 
(2)", - "tab": "General information", - "score": 5.0 - }, - "International Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "International Law - # prompt tokens": { - "description": "min=640.579, mean=640.579, max=640.579, sum=1281.157 (2)", - "tab": "General information", - "score": 640.5785123966942 - }, - "International Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" - } - } - }, - { - "evaluation_name": "Logical Fallacies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Logical Fallacies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.89, - "details": { - "description": "min=0.89, mean=0.89, max=0.89, sum=1.779 (2)", - "tab": "Accuracy", - "Logical Fallacies - Observed inference time (s)": { - "description": "min=0.346, mean=0.346, max=0.346, sum=0.692 (2)", - "tab": "Efficiency", - "score": 0.3458571419394089 - }, - "Logical Fallacies - # eval": { - "description": "min=163, mean=163, max=163, sum=326 (2)", - "tab": "General information", - "score": 163.0 - }, - "Logical Fallacies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Logical Fallacies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Logical Fallacies - # prompt tokens": { - "description": "min=449.632, mean=449.632, max=449.632, sum=899.264 (2)", - "tab": "General information", - "score": 449.6319018404908 - }, - "Logical Fallacies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" - } - } - }, - { - "evaluation_name": "Machine Learning", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Machine Learning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.777, - "details": { - "description": "min=0.777, mean=0.777, max=0.777, sum=1.554 (2)", - "tab": "Accuracy", - "Machine Learning - Observed inference time (s)": { - "description": "min=0.348, mean=0.348, max=0.348, sum=0.697 (2)", - "tab": "Efficiency", - "score": 0.3483003888811384 - }, - "Machine Learning - # eval": { - "description": "min=112, mean=112, max=112, sum=224 (2)", - "tab": "General information", - "score": 112.0 - }, - "Machine Learning - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Machine Learning - truncated": { - "description": 
"min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Machine Learning - # prompt tokens": { - "description": "min=681.848, mean=681.848, max=681.848, sum=1363.696 (2)", - "tab": "General information", - "score": 681.8482142857143 - }, - "Machine Learning - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" - } - } - }, - { - "evaluation_name": "Management", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Management", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.913, - "details": { - "description": "min=0.913, mean=0.913, max=0.913, sum=1.825 (2)", - "tab": "Accuracy", - "Management - Observed inference time (s)": { - "description": "min=0.293, mean=0.293, max=0.293, sum=0.587 (2)", - "tab": "Efficiency", - "score": 0.2933675108604061 - }, - "Management - # eval": { - "description": "min=103, mean=103, max=103, sum=206 (2)", - "tab": "General information", - "score": 103.0 - }, - "Management - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Management - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Management - # prompt tokens": { - "description": "min=283.854, mean=283.854, max=283.854, sum=567.709 (2)", - "tab": "General information", - "score": 283.8543689320388 - }, - "Management - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" - } - } - }, - { - "evaluation_name": "Marketing", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Marketing", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.953, - "details": { - "description": "min=0.953, mean=0.953, max=0.953, sum=1.906 (2)", - "tab": "Accuracy", - "Marketing - Observed inference time (s)": { - "description": "min=0.475, mean=0.475, max=0.475, sum=0.949 (2)", - "tab": "Efficiency", - "score": 0.4746182779980521 - }, - "Marketing - # eval": { - "description": "min=234, mean=234, max=234, sum=468 (2)", - "tab": "General information", - "score": 234.0 - }, - "Marketing - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Marketing - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Marketing - # prompt tokens": { - "description": "min=404.415, mean=404.415, max=404.415, sum=808.829 (2)", - "tab": "General 
information", - "score": 404.4145299145299 - }, - "Marketing - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" - } - } - }, - { - "evaluation_name": "Medical Genetics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Medical Genetics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.92, - "details": { - "description": "min=0.92, mean=0.92, max=0.92, sum=1.84 (2)", - "tab": "Accuracy", - "Medical Genetics - Observed inference time (s)": { - "description": "min=0.311, mean=0.311, max=0.311, sum=0.622 (2)", - "tab": "Efficiency", - "score": 0.3110049200057983 - }, - "Medical Genetics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Medical Genetics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Medical Genetics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Medical Genetics - # prompt tokens": { - "description": "min=342.35, mean=342.35, max=342.35, sum=684.7 (2)", - "tab": "General information", - "score": 342.35 - }, - "Medical Genetics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" - } - } - }, - { - "evaluation_name": "Miscellaneous", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Miscellaneous", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.932, - "details": { - "description": "min=0.932, mean=0.932, max=0.932, sum=1.865 (2)", - "tab": "Accuracy", - "Miscellaneous - Observed inference time (s)": { - "description": "min=0.345, mean=0.345, max=0.345, sum=0.689 (2)", - "tab": "Efficiency", - "score": 0.3445042967035091 - }, - "Miscellaneous - # eval": { - "description": "min=783, mean=783, max=783, sum=1566 (2)", - "tab": "General information", - "score": 783.0 - }, - "Miscellaneous - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Miscellaneous - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Miscellaneous - # prompt tokens": { - "description": "min=303.7, mean=303.7, max=303.7, sum=607.4 (2)", - "tab": "General information", - "score": 303.6998722860792 - }, - "Miscellaneous - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - 
}, - "generation_config": { - "additional_details": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" - } - } - }, - { - "evaluation_name": "Moral Scenarios", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Moral Scenarios", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.787, - "details": { - "description": "min=0.787, mean=0.787, max=0.787, sum=1.573 (2)", - "tab": "Accuracy", - "Moral Disputes - Observed inference time (s)": { - "description": "min=0.291, mean=0.291, max=0.291, sum=0.583 (2)", - "tab": "Efficiency", - "score": 0.2913500532249495 - }, - "Moral Scenarios - Observed inference time (s)": { - "description": "min=0.32, mean=0.32, max=0.32, sum=0.641 (2)", - "tab": "Efficiency", - "score": 0.32045427327715487 - }, - "Moral Disputes - # eval": { - "description": "min=346, mean=346, max=346, sum=692 (2)", - "tab": "General information", - "score": 346.0 - }, - "Moral Disputes - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Disputes - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Disputes - # prompt tokens": { - "description": "min=476.182, mean=476.182, max=476.182, sum=952.364 (2)", - "tab": "General information", - "score": 476.1820809248555 - }, - "Moral Disputes - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Moral Scenarios - # eval": { - "description": "min=895, mean=895, max=895, sum=1790 (2)", - "tab": "General information", - "score": 895.0 - }, - "Moral Scenarios - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Scenarios - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # prompt tokens": { - "description": "min=668.494, mean=668.494, max=668.494, sum=1336.988 (2)", - "tab": "General information", - "score": 668.4938547486033 - }, - "Moral Scenarios - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" - } - } - }, - { - "evaluation_name": "Nutrition", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Nutrition", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.886, - "details": { - "description": "min=0.886, mean=0.886, max=0.886, sum=1.771 (2)", - "tab": "Accuracy", - "Nutrition - Observed inference time (s)": { - "description": "min=0.293, mean=0.293, max=0.293, sum=0.585 (2)", - "tab": "Efficiency", - "score": 0.29262306565552754 - }, 
- "Nutrition - # eval": { - "description": "min=306, mean=306, max=306, sum=612 (2)", - "tab": "General information", - "score": 306.0 - }, - "Nutrition - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Nutrition - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Nutrition - # prompt tokens": { - "description": "min=599.637, mean=599.637, max=599.637, sum=1199.275 (2)", - "tab": "General information", - "score": 599.6372549019608 - }, - "Nutrition - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" - } - } - }, - { - "evaluation_name": "Prehistory", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Prehistory", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.91, - "details": { - "description": "min=0.91, mean=0.91, max=0.91, sum=1.821 (2)", - "tab": "Accuracy", - "Prehistory - Observed inference time (s)": { - "description": "min=0.34, mean=0.34, max=0.34, sum=0.681 (2)", - "tab": "Efficiency", - "score": 0.340311410986347 - }, - "Prehistory - # eval": { - "description": "min=324, mean=324, max=324, sum=648 (2)", - "tab": "General information", - "score": 324.0 - }, - "Prehistory - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Prehistory - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Prehistory - # prompt tokens": { - "description": "min=528.364, mean=528.364, max=528.364, sum=1056.728 (2)", - "tab": "General information", - "score": 528.3641975308642 - }, - "Prehistory - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" - } - } - }, - { - "evaluation_name": "Public Relations", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Public Relations", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.782, - "details": { - "description": "min=0.782, mean=0.782, max=0.782, sum=1.564 (2)", - "tab": "Accuracy", - "Public Relations - Observed inference time (s)": { - "description": "min=0.277, mean=0.277, max=0.277, sum=0.554 (2)", - "tab": "Efficiency", - "score": 0.2769838809967041 - }, - "Public Relations - # eval": { - "description": "min=110, mean=110, max=110, sum=220 (2)", - "tab": "General information", - "score": 110.0 - }, - "Public Relations - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - 
"tab": "General information", - "score": 5.0 - }, - "Public Relations - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Public Relations - # prompt tokens": { - "description": "min=408.427, mean=408.427, max=408.427, sum=816.855 (2)", - "tab": "General information", - "score": 408.42727272727274 - }, - "Public Relations - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" - } - } - }, - { - "evaluation_name": "Security Studies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Security Studies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.849, - "details": { - "description": "min=0.849, mean=0.849, max=0.849, sum=1.698 (2)", - "tab": "Accuracy", - "Security Studies - Observed inference time (s)": { - "description": "min=0.377, mean=0.377, max=0.377, sum=0.754 (2)", - "tab": "Efficiency", - "score": 0.3771621781952527 - }, - "Security Studies - # eval": { - "description": "min=245, mean=245, max=245, sum=490 (2)", - "tab": "General information", - "score": 245.0 - }, - "Security Studies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Security Studies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Security Studies - # prompt tokens": { - "description": "min=1166.931, mean=1166.931, max=1166.931, sum=2333.861 (2)", - "tab": "General information", - "score": 1166.930612244898 - }, - "Security Studies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" - } - } - }, - { - "evaluation_name": "Sociology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Sociology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.925, - "details": { - "description": "min=0.925, mean=0.925, max=0.925, sum=1.851 (2)", - "tab": "Accuracy", - "Sociology - Observed inference time (s)": { - "description": "min=0.291, mean=0.291, max=0.291, sum=0.582 (2)", - "tab": "Efficiency", - "score": 0.2910151019025205 - }, - "Sociology - # eval": { - "description": "min=201, mean=201, max=201, sum=402 (2)", - "tab": "General information", - "score": 201.0 - }, - "Sociology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Sociology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General 
information", - "score": 0.0 - }, - "Sociology - # prompt tokens": { - "description": "min=450.1, mean=450.1, max=450.1, sum=900.199 (2)", - "tab": "General information", - "score": 450.0995024875622 - }, - "Sociology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" - } - } - }, - { - "evaluation_name": "Virology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Virology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.584, - "details": { - "description": "min=0.584, mean=0.584, max=0.584, sum=1.169 (2)", - "tab": "Accuracy", - "Virology - Observed inference time (s)": { - "description": "min=0.351, mean=0.351, max=0.351, sum=0.702 (2)", - "tab": "Efficiency", - "score": 0.35115946631833733 - }, - "Virology - # eval": { - "description": "min=166, mean=166, max=166, sum=332 (2)", - "tab": "General information", - "score": 166.0 - }, - "Virology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Virology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Virology - # prompt tokens": { - "description": "min=343.819, mean=343.819, max=343.819, sum=687.639 (2)", - "tab": "General information", - "score": 343.8192771084337 - }, - "Virology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" - } - } - }, - { - "evaluation_name": "World Religions", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on World Religions", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.901, - "details": { - "description": "min=0.901, mean=0.901, max=0.901, sum=1.801 (2)", - "tab": "Accuracy", - "World Religions - Observed inference time (s)": { - "description": "min=0.381, mean=0.381, max=0.381, sum=0.762 (2)", - "tab": "Efficiency", - "score": 0.3812444461019416 - }, - "World Religions - # eval": { - "description": "min=171, mean=171, max=171, sum=342 (2)", - "tab": "General information", - "score": 171.0 - }, - "World Religions - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "World Religions - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "World Religions - # prompt tokens": { - "description": "min=276.07, mean=276.07, max=276.07, sum=552.14 (2)", - "tab": "General information", - "score": 276.0701754385965 - }, - "World Religions - # output 
tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" - } - } - }, - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.548, - "details": { - "tab": "Efficiency" - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_mmlu/qwen/qwen2.5-7b-instruct-turbo/72537b16-feda-4e5e-a477-f415650db847.json b/data/helm_mmlu/qwen/qwen2.5-7b-instruct-turbo/72537b16-feda-4e5e-a477-f415650db847.json deleted file mode 100644 index c045e519d..000000000 --- a/data/helm_mmlu/qwen/qwen2.5-7b-instruct-turbo/72537b16-feda-4e5e-a477-f415650db847.json +++ /dev/null @@ -1,3021 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/qwen_qwen2.5-7b-instruct-turbo/1770835937.459157", - "retrieved_timestamp": "1770835937.459157", - "source_metadata": { - "source_name": "helm_mmlu", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5 Instruct Turbo 7B", - "id": "qwen/qwen2.5-7b-instruct-turbo", - "developer": "qwen", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "MMLU All Subjects", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU All Subjects", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.729, - "details": { - "description": "min=0.42, mean=0.729, max=0.919, sum=83.073 (114)", - "tab": "Accuracy", - "MMLU All Subjects - Observed inference time (s)": { - "description": "min=0.162, mean=0.242, max=0.44, sum=27.616 (114)", - "tab": "Efficiency", - "score": 0.24224721190343979 - }, - "MMLU All Subjects - # eval": { - "description": "min=100, mean=246.351, max=1534, sum=28084 (114)", - "tab": "General information", - "score": 246.35087719298247 - }, - "MMLU All Subjects - # train": { - "description": "min=5, mean=5, max=5, sum=570 (114)", - "tab": "General information", - "score": 5.0 - }, - "MMLU All Subjects - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (114)", - "tab": "General information", - "score": 0.0 - }, - "MMLU All Subjects - # prompt tokens": { - "description": "min=276.07, mean=625.598, max=2814.903, sum=71318.198 (114)", - "tab": "General information", - "score": 625.5982315160392 - }, - "MMLU All Subjects - # output tokens": { - "description": "min=1, mean=1, max=1, sum=114 (114)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - 
"business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] - } - } - }, - { - "evaluation_name": "Abstract Algebra", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Abstract Algebra", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.49, - "details": { - "description": "min=0.49, mean=0.49, max=0.49, sum=0.98 (2)", - "tab": "Accuracy", - 
"Abstract Algebra - Observed inference time (s)": { - "description": "min=0.431, mean=0.431, max=0.431, sum=0.863 (2)", - "tab": "Efficiency", - "score": 0.43148461580276487 - }, - "Abstract Algebra - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Abstract Algebra - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Abstract Algebra - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Abstract Algebra - # prompt tokens": { - "description": "min=378.19, mean=378.19, max=378.19, sum=756.38 (2)", - "tab": "General information", - "score": 378.19 - }, - "Abstract Algebra - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" - } - } - }, - { - "evaluation_name": "Anatomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Anatomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.689, - "details": { - "description": "min=0.689, mean=0.689, max=0.689, sum=1.378 (2)", - "tab": "Accuracy", - "Anatomy - Observed inference time (s)": { - "description": "min=0.333, mean=0.333, max=0.333, sum=0.667 (2)", - "tab": "Efficiency", - "score": 0.3332981339207402 - }, - "Anatomy - # eval": { - "description": "min=135, mean=135, max=135, sum=270 (2)", - "tab": "General information", - "score": 135.0 - }, - "Anatomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Anatomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Anatomy - # prompt tokens": { - "description": "min=353.978, mean=353.978, max=353.978, sum=707.956 (2)", - "tab": "General information", - "score": 353.97777777777776 - }, - "Anatomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" - } - } - }, - { - "evaluation_name": "College Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on College Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.51, - "details": { - "description": "min=0.51, mean=0.51, max=0.51, sum=1.02 (2)", - "tab": "Accuracy", - "College Chemistry - Observed inference time (s)": { - "description": "min=0.285, mean=0.285, max=0.285, sum=0.571 (2)", - "tab": "Efficiency", - "score": 0.28538883924484254 - }, - "College Biology - Observed inference time (s)": { - 
"description": "min=0.31, mean=0.31, max=0.31, sum=0.619 (2)", - "tab": "Efficiency", - "score": 0.309537861082289 - }, - "College Computer Science - Observed inference time (s)": { - "description": "min=0.302, mean=0.302, max=0.302, sum=0.604 (2)", - "tab": "Efficiency", - "score": 0.30183048248291017 - }, - "College Mathematics - Observed inference time (s)": { - "description": "min=0.279, mean=0.279, max=0.279, sum=0.558 (2)", - "tab": "Efficiency", - "score": 0.2791933488845825 - }, - "College Medicine - Observed inference time (s)": { - "description": "min=0.303, mean=0.303, max=0.303, sum=0.607 (2)", - "tab": "Efficiency", - "score": 0.3032711007002461 - }, - "College Physics - Observed inference time (s)": { - "description": "min=0.3, mean=0.3, max=0.3, sum=0.599 (2)", - "tab": "Efficiency", - "score": 0.2996697425842285 - }, - "College Chemistry - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Chemistry - # prompt tokens": { - "description": "min=568.25, mean=568.25, max=568.25, sum=1136.5 (2)", - "tab": "General information", - "score": 568.25 - }, - "College Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Biology - # eval": { - "description": "min=144, mean=144, max=144, sum=288 (2)", - "tab": "General information", - "score": 144.0 - }, - "College Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # prompt tokens": { - "description": "min=486.979, mean=486.979, max=486.979, sum=973.958 (2)", - "tab": "General information", - "score": 486.9791666666667 - }, - "College Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer Science - # prompt tokens": { - "description": "min=838.58, mean=838.58, max=838.58, sum=1677.16 (2)", - "tab": "General information", - "score": 838.58 - }, - "College Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Mathematics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 
- }, - "College Mathematics - # prompt tokens": { - "description": "min=607.7, mean=607.7, max=607.7, sum=1215.4 (2)", - "tab": "General information", - "score": 607.7 - }, - "College Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Medicine - # eval": { - "description": "min=173, mean=173, max=173, sum=346 (2)", - "tab": "General information", - "score": 173.0 - }, - "College Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Medicine - # prompt tokens": { - "description": "min=506.098, mean=506.098, max=506.098, sum=1012.197 (2)", - "tab": "General information", - "score": 506.0982658959538 - }, - "College Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Physics - # eval": { - "description": "min=102, mean=102, max=102, sum=204 (2)", - "tab": "General information", - "score": 102.0 - }, - "College Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Physics - # prompt tokens": { - "description": "min=516.265, mean=516.265, max=516.265, sum=1032.529 (2)", - "tab": "General information", - "score": 516.2647058823529 - }, - "College Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" - } - } - }, - { - "evaluation_name": "Computer Security", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Computer Security", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.79, - "details": { - "description": "min=0.79, mean=0.79, max=0.79, sum=1.58 (2)", - "tab": "Accuracy", - "Computer Security - Observed inference time (s)": { - "description": "min=0.352, mean=0.352, max=0.352, sum=0.705 (2)", - "tab": "Efficiency", - "score": 0.3522661328315735 - }, - "Computer Security - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Computer Security - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Computer Security - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Computer Security - # prompt tokens": { - "description": "min=386.64, mean=386.64, max=386.64, sum=773.28 (2)", - "tab": "General information", - "score": 386.64 - }, - "Computer Security - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - 
"generation_config": { - "additional_details": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" - } - } - }, - { - "evaluation_name": "Econometrics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Econometrics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.64, - "details": { - "description": "min=0.64, mean=0.64, max=0.64, sum=1.281 (2)", - "tab": "Accuracy", - "Econometrics - Observed inference time (s)": { - "description": "min=0.346, mean=0.346, max=0.346, sum=0.691 (2)", - "tab": "Efficiency", - "score": 0.34558368356604324 - }, - "Econometrics - # eval": { - "description": "min=114, mean=114, max=114, sum=228 (2)", - "tab": "General information", - "score": 114.0 - }, - "Econometrics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Econometrics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Econometrics - # prompt tokens": { - "description": "min=627.939, mean=627.939, max=627.939, sum=1255.877 (2)", - "tab": "General information", - "score": 627.938596491228 - }, - "Econometrics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" - } - } - }, - { - "evaluation_name": "Global Facts", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Global Facts", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.42, - "details": { - "description": "min=0.42, mean=0.42, max=0.42, sum=0.84 (2)", - "tab": "Accuracy", - "Global Facts - Observed inference time (s)": { - "description": "min=0.315, mean=0.315, max=0.315, sum=0.63 (2)", - "tab": "Efficiency", - "score": 0.314766480922699 - }, - "Global Facts - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Global Facts - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Global Facts - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Global Facts - # prompt tokens": { - "description": "min=429.06, mean=429.06, max=429.06, sum=858.12 (2)", - "tab": "General information", - "score": 429.06 - }, - "Global Facts - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" - } - } - }, - { - "evaluation_name": 
"Jurisprudence", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Jurisprudence", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.796, - "details": { - "description": "min=0.796, mean=0.796, max=0.796, sum=1.593 (2)", - "tab": "Accuracy", - "Jurisprudence - Observed inference time (s)": { - "description": "min=0.321, mean=0.321, max=0.321, sum=0.642 (2)", - "tab": "Efficiency", - "score": 0.32116924391852486 - }, - "Jurisprudence - # eval": { - "description": "min=108, mean=108, max=108, sum=216 (2)", - "tab": "General information", - "score": 108.0 - }, - "Jurisprudence - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Jurisprudence - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Jurisprudence - # prompt tokens": { - "description": "min=394.713, mean=394.713, max=394.713, sum=789.426 (2)", - "tab": "General information", - "score": 394.712962962963 - }, - "Jurisprudence - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" - } - } - }, - { - "evaluation_name": "Philosophy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Philosophy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.746, - "details": { - "description": "min=0.746, mean=0.746, max=0.746, sum=1.492 (2)", - "tab": "Accuracy", - "Philosophy - Observed inference time (s)": { - "description": "min=0.44, mean=0.44, max=0.44, sum=0.88 (2)", - "tab": "Efficiency", - "score": 0.4401504610129108 - }, - "Philosophy - # eval": { - "description": "min=311, mean=311, max=311, sum=622 (2)", - "tab": "General information", - "score": 311.0 - }, - "Philosophy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Philosophy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Philosophy - # prompt tokens": { - "description": "min=329.09, mean=329.09, max=329.09, sum=658.18 (2)", - "tab": "General information", - "score": 329.09003215434086 - }, - "Philosophy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" - } - } - }, - { - "evaluation_name": "Professional Psychology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Professional Psychology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.757, - "details": { - "description": "min=0.757, mean=0.757, max=0.757, sum=1.513 (2)", - "tab": "Accuracy", - "Professional Medicine - Observed inference time (s)": { - "description": "min=0.394, mean=0.394, max=0.394, sum=0.788 (2)", - "tab": "Efficiency", - "score": 0.393971232806935 - }, - "Professional Accounting - Observed inference time (s)": { - "description": "min=0.185, mean=0.185, max=0.185, sum=0.371 (2)", - "tab": "Efficiency", - "score": 0.18525678553479782 - }, - "Professional Law - Observed inference time (s)": { - "description": "min=0.205, mean=0.205, max=0.205, sum=0.409 (2)", - "tab": "Efficiency", - "score": 0.20459390463698485 - }, - "Professional Psychology - Observed inference time (s)": { - "description": "min=0.166, mean=0.166, max=0.166, sum=0.332 (2)", - "tab": "Efficiency", - "score": 0.16597708611706502 - }, - "Professional Medicine - # eval": { - "description": "min=272, mean=272, max=272, sum=544 (2)", - "tab": "General information", - "score": 272.0 - }, - "Professional Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Medicine - # prompt tokens": { - "description": "min=1125.199, mean=1125.199, max=1125.199, sum=2250.397 (2)", - "tab": "General information", - "score": 1125.1985294117646 - }, - "Professional Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Accounting - # eval": { - "description": "min=282, mean=282, max=282, sum=564 (2)", - "tab": "General information", - "score": 282.0 - }, - "Professional Accounting - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Accounting - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Accounting - # prompt tokens": { - "description": "min=739.34, mean=739.34, max=739.34, sum=1478.681 (2)", - "tab": "General information", - "score": 739.3404255319149 - }, - "Professional Accounting - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Law - # eval": { - "description": "min=1534, mean=1534, max=1534, sum=3068 (2)", - "tab": "General information", - "score": 1534.0 - }, - "Professional Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # prompt tokens": { - "description": "min=1663.969, mean=1663.969, max=1663.969, sum=3327.939 (2)", - "tab": "General information", - "score": 1663.9693611473272 - }, - "Professional Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional 
Psychology - # eval": { - "description": "min=612, mean=612, max=612, sum=1224 (2)", - "tab": "General information", - "score": 612.0 - }, - "Professional Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # prompt tokens": { - "description": "min=581.417, mean=581.417, max=581.417, sum=1162.833 (2)", - "tab": "General information", - "score": 581.4166666666666 - }, - "Professional Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" - } - } - }, - { - "evaluation_name": "Us Foreign Policy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Us Foreign Policy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.86, - "details": { - "description": "min=0.86, mean=0.86, max=0.86, sum=1.72 (2)", - "tab": "Accuracy", - "Us Foreign Policy - Observed inference time (s)": { - "description": "min=0.33, mean=0.33, max=0.33, sum=0.66 (2)", - "tab": "Efficiency", - "score": 0.33019849777221677 - }, - "Us Foreign Policy - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Us Foreign Policy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Us Foreign Policy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Us Foreign Policy - # prompt tokens": { - "description": "min=428.16, mean=428.16, max=428.16, sum=856.32 (2)", - "tab": "General information", - "score": 428.16 - }, - "Us Foreign Policy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" - } - } - }, - { - "evaluation_name": "Astronomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Astronomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.836, - "details": { - "description": "min=0.836, mean=0.836, max=0.836, sum=1.671 (2)", - "tab": "Accuracy", - "Astronomy - Observed inference time (s)": { - "description": "min=0.314, mean=0.314, max=0.314, sum=0.629 (2)", - "tab": "Efficiency", - "score": 0.3143457660549565 - }, - "Astronomy - # eval": { - "description": "min=152, mean=152, max=152, sum=304 (2)", - "tab": "General information", - 
"score": 152.0 - }, - "Astronomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Astronomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Astronomy - # prompt tokens": { - "description": "min=589.849, mean=589.849, max=589.849, sum=1179.697 (2)", - "tab": "General information", - "score": 589.8486842105264 - }, - "Astronomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" - } - } - }, - { - "evaluation_name": "Business Ethics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Business Ethics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.82, - "details": { - "description": "min=0.82, mean=0.82, max=0.82, sum=1.64 (2)", - "tab": "Accuracy", - "Business Ethics - Observed inference time (s)": { - "description": "min=0.308, mean=0.308, max=0.308, sum=0.615 (2)", - "tab": "Efficiency", - "score": 0.3076848840713501 - }, - "Business Ethics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Business Ethics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Business Ethics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Business Ethics - # prompt tokens": { - "description": "min=569.87, mean=569.87, max=569.87, sum=1139.74 (2)", - "tab": "General information", - "score": 569.87 - }, - "Business Ethics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" - } - } - }, - { - "evaluation_name": "Clinical Knowledge", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Clinical Knowledge", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.785, - "details": { - "description": "min=0.785, mean=0.785, max=0.785, sum=1.57 (2)", - "tab": "Accuracy", - "Clinical Knowledge - Observed inference time (s)": { - "description": "min=0.335, mean=0.335, max=0.335, sum=0.67 (2)", - "tab": "Efficiency", - "score": 0.33518469288664043 - }, - "Clinical Knowledge - # eval": { - "description": "min=265, mean=265, max=265, sum=530 (2)", - "tab": "General information", - "score": 265.0 - }, - "Clinical Knowledge - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Clinical Knowledge - 
truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Clinical Knowledge - # prompt tokens": { - "description": "min=400.623, mean=400.623, max=400.623, sum=801.245 (2)", - "tab": "General information", - "score": 400.62264150943395 - }, - "Clinical Knowledge - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" - } - } - }, - { - "evaluation_name": "Conceptual Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Conceptual Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.736, - "details": { - "description": "min=0.736, mean=0.736, max=0.736, sum=1.472 (2)", - "tab": "Accuracy", - "Conceptual Physics - Observed inference time (s)": { - "description": "min=0.253, mean=0.253, max=0.253, sum=0.506 (2)", - "tab": "Efficiency", - "score": 0.2531234142628122 - }, - "Conceptual Physics - # eval": { - "description": "min=235, mean=235, max=235, sum=470 (2)", - "tab": "General information", - "score": 235.0 - }, - "Conceptual Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Conceptual Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Conceptual Physics - # prompt tokens": { - "description": "min=305.494, mean=305.494, max=305.494, sum=610.987 (2)", - "tab": "General information", - "score": 305.4936170212766 - }, - "Conceptual Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" - } - } - }, - { - "evaluation_name": "Electrical Engineering", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Electrical Engineering", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.717, - "details": { - "description": "min=0.717, mean=0.717, max=0.717, sum=1.434 (2)", - "tab": "Accuracy", - "Electrical Engineering - Observed inference time (s)": { - "description": "min=0.198, mean=0.198, max=0.198, sum=0.396 (2)", - "tab": "Efficiency", - "score": 0.19794883070320918 - }, - "Electrical Engineering - # eval": { - "description": "min=145, mean=145, max=145, sum=290 (2)", - "tab": "General information", - "score": 145.0 - }, - "Electrical Engineering - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Electrical Engineering - truncated": { - "description": "min=0, mean=0, max=0, sum=0 
(2)", - "tab": "General information", - "score": 0.0 - }, - "Electrical Engineering - # prompt tokens": { - "description": "min=463.8, mean=463.8, max=463.8, sum=927.6 (2)", - "tab": "General information", - "score": 463.8 - }, - "Electrical Engineering - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" - } - } - }, - { - "evaluation_name": "Elementary Mathematics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Elementary Mathematics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.643, - "details": { - "description": "min=0.643, mean=0.643, max=0.643, sum=1.286 (2)", - "tab": "Accuracy", - "Elementary Mathematics - Observed inference time (s)": { - "description": "min=0.202, mean=0.202, max=0.202, sum=0.404 (2)", - "tab": "Efficiency", - "score": 0.2021035529949047 - }, - "Elementary Mathematics - # eval": { - "description": "min=378, mean=378, max=378, sum=756 (2)", - "tab": "General information", - "score": 378.0 - }, - "Elementary Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Elementary Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Elementary Mathematics - # prompt tokens": { - "description": "min=577.119, mean=577.119, max=577.119, sum=1154.238 (2)", - "tab": "General information", - "score": 577.1190476190476 - }, - "Elementary Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" - } - } - }, - { - "evaluation_name": "Formal Logic", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Formal Logic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.587, - "details": { - "description": "min=0.587, mean=0.587, max=0.587, sum=1.175 (2)", - "tab": "Accuracy", - "Formal Logic - Observed inference time (s)": { - "description": "min=0.197, mean=0.197, max=0.197, sum=0.393 (2)", - "tab": "Efficiency", - "score": 0.196545644411965 - }, - "Formal Logic - # eval": { - "description": "min=126, mean=126, max=126, sum=252 (2)", - "tab": "General information", - "score": 126.0 - }, - "Formal Logic - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Formal Logic - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Formal Logic - # prompt 
tokens": { - "description": "min=604.667, mean=604.667, max=604.667, sum=1209.333 (2)", - "tab": "General information", - "score": 604.6666666666666 - }, - "Formal Logic - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" - } - } - }, - { - "evaluation_name": "High School World History", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on High School World History", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.878, - "details": { - "description": "min=0.878, mean=0.878, max=0.878, sum=1.755 (2)", - "tab": "Accuracy", - "High School Biology - Observed inference time (s)": { - "description": "min=0.192, mean=0.192, max=0.192, sum=0.384 (2)", - "tab": "Efficiency", - "score": 0.19177444058079873 - }, - "High School Chemistry - Observed inference time (s)": { - "description": "min=0.236, mean=0.236, max=0.236, sum=0.472 (2)", - "tab": "Efficiency", - "score": 0.23597407693346145 - }, - "High School Computer Science - Observed inference time (s)": { - "description": "min=0.202, mean=0.202, max=0.202, sum=0.404 (2)", - "tab": "Efficiency", - "score": 0.20180433988571167 - }, - "High School European History - Observed inference time (s)": { - "description": "min=0.313, mean=0.313, max=0.313, sum=0.626 (2)", - "tab": "Efficiency", - "score": 0.3130656791455818 - }, - "High School Geography - Observed inference time (s)": { - "description": "min=0.215, mean=0.215, max=0.215, sum=0.43 (2)", - "tab": "Efficiency", - "score": 0.21512896725625702 - }, - "High School Government And Politics - Observed inference time (s)": { - "description": "min=0.192, mean=0.192, max=0.192, sum=0.384 (2)", - "tab": "Efficiency", - "score": 0.19191643611137113 - }, - "High School Macroeconomics - Observed inference time (s)": { - "description": "min=0.204, mean=0.204, max=0.204, sum=0.409 (2)", - "tab": "Efficiency", - "score": 0.20429076965038592 - }, - "High School Mathematics - Observed inference time (s)": { - "description": "min=0.234, mean=0.234, max=0.234, sum=0.468 (2)", - "tab": "Efficiency", - "score": 0.2337868098859434 - }, - "High School Microeconomics - Observed inference time (s)": { - "description": "min=0.184, mean=0.184, max=0.184, sum=0.367 (2)", - "tab": "Efficiency", - "score": 0.18365505863638484 - }, - "High School Physics - Observed inference time (s)": { - "description": "min=0.194, mean=0.194, max=0.194, sum=0.388 (2)", - "tab": "Efficiency", - "score": 0.19382640068104726 - }, - "High School Psychology - Observed inference time (s)": { - "description": "min=0.203, mean=0.203, max=0.203, sum=0.405 (2)", - "tab": "Efficiency", - "score": 0.20258700432033713 - }, - "High School Statistics - Observed inference time (s)": { - "description": "min=0.226, mean=0.226, max=0.226, sum=0.451 (2)", - "tab": "Efficiency", - "score": 0.22551235446223505 - }, - "High School US History - Observed inference time (s)": { - "description": "min=0.249, mean=0.249, max=0.249, sum=0.498 (2)", - "tab": "Efficiency", - "score": 0.2492340417469249 - }, - "High School 
World History - Observed inference time (s)": { - "description": "min=0.231, mean=0.231, max=0.231, sum=0.462 (2)", - "tab": "Efficiency", - "score": 0.23088843812419393 - }, - "High School Biology - # eval": { - "description": "min=310, mean=310, max=310, sum=620 (2)", - "tab": "General information", - "score": 310.0 - }, - "High School Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Biology - # prompt tokens": { - "description": "min=513.916, mean=513.916, max=513.916, sum=1027.832 (2)", - "tab": "General information", - "score": 513.916129032258 - }, - "High School Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Chemistry - # eval": { - "description": "min=203, mean=203, max=203, sum=406 (2)", - "tab": "General information", - "score": 203.0 - }, - "High School Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # prompt tokens": { - "description": "min=517.261, mean=517.261, max=517.261, sum=1034.522 (2)", - "tab": "General information", - "score": 517.2610837438424 - }, - "High School Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "High School Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # prompt tokens": { - "description": "min=878.46, mean=878.46, max=878.46, sum=1756.92 (2)", - "tab": "General information", - "score": 878.46 - }, - "High School Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School European History - # eval": { - "description": "min=165, mean=165, max=165, sum=330 (2)", - "tab": "General information", - "score": 165.0 - }, - "High School European History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School European History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School European History - # prompt tokens": { - "description": "min=2814.903, mean=2814.903, max=2814.903, sum=5629.806 (2)", - "tab": "General information", - "score": 2814.9030303030304 - }, - "High School European History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Geography - # eval": { - "description": "min=198, mean=198, max=198, sum=396 (2)", - "tab": "General information", - "score": 198.0 - }, - "High School Geography - # train": { - 
"description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Geography - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Geography - # prompt tokens": { - "description": "min=372.217, mean=372.217, max=372.217, sum=744.434 (2)", - "tab": "General information", - "score": 372.2171717171717 - }, - "High School Geography - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Government And Politics - # eval": { - "description": "min=193, mean=193, max=193, sum=386 (2)", - "tab": "General information", - "score": 193.0 - }, - "High School Government And Politics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Government And Politics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # prompt tokens": { - "description": "min=467.311, mean=467.311, max=467.311, sum=934.622 (2)", - "tab": "General information", - "score": 467.31088082901556 - }, - "High School Government And Politics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Macroeconomics - # eval": { - "description": "min=390, mean=390, max=390, sum=780 (2)", - "tab": "General information", - "score": 390.0 - }, - "High School Macroeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Macroeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - # prompt tokens": { - "description": "min=374.349, mean=374.349, max=374.349, sum=748.697 (2)", - "tab": "General information", - "score": 374.34871794871793 - }, - "High School Macroeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Mathematics - # eval": { - "description": "min=270, mean=270, max=270, sum=540 (2)", - "tab": "General information", - "score": 270.0 - }, - "High School Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # prompt tokens": { - "description": "min=565.326, mean=565.326, max=565.326, sum=1130.652 (2)", - "tab": "General information", - "score": 565.325925925926 - }, - "High School Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Microeconomics - # eval": { - "description": "min=238, mean=238, max=238, sum=476 (2)", - "tab": "General information", - "score": 238.0 - }, - "High School Microeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Microeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Microeconomics - # prompt tokens": { - 
"description": "min=402.277, mean=402.277, max=402.277, sum=804.555 (2)", - "tab": "General information", - "score": 402.2773109243698 - }, - "High School Microeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Physics - # eval": { - "description": "min=151, mean=151, max=151, sum=302 (2)", - "tab": "General information", - "score": 151.0 - }, - "High School Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # prompt tokens": { - "description": "min=580.536, mean=580.536, max=580.536, sum=1161.073 (2)", - "tab": "General information", - "score": 580.5364238410596 - }, - "High School Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Psychology - # eval": { - "description": "min=545, mean=545, max=545, sum=1090 (2)", - "tab": "General information", - "score": 545.0 - }, - "High School Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # prompt tokens": { - "description": "min=495.521, mean=495.521, max=495.521, sum=991.042 (2)", - "tab": "General information", - "score": 495.52110091743117 - }, - "High School Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Statistics - # eval": { - "description": "min=216, mean=216, max=216, sum=432 (2)", - "tab": "General information", - "score": 216.0 - }, - "High School Statistics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Statistics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Statistics - # prompt tokens": { - "description": "min=830.477, mean=830.477, max=830.477, sum=1660.954 (2)", - "tab": "General information", - "score": 830.4768518518518 - }, - "High School Statistics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School US History - # eval": { - "description": "min=204, mean=204, max=204, sum=408 (2)", - "tab": "General information", - "score": 204.0 - }, - "High School US History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School US History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # prompt tokens": { - "description": "min=2237.176, mean=2237.176, max=2237.176, sum=4474.353 (2)", - "tab": "General information", - "score": 2237.176470588235 - }, - "High School US History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School World History - # eval": { - "description": "min=237, mean=237, max=237, sum=474 (2)", - "tab": "General 
information", - "score": 237.0 - }, - "High School World History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School World History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School World History - # prompt tokens": { - "description": "min=1448.354, mean=1448.354, max=1448.354, sum=2896.709 (2)", - "tab": "General information", - "score": 1448.3544303797469 - }, - "High School World History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" - } - } - }, - { - "evaluation_name": "Human Sexuality", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Human Sexuality", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.794, - "details": { - "description": "min=0.794, mean=0.794, max=0.794, sum=1.588 (2)", - "tab": "Accuracy", - "Human Aging - Observed inference time (s)": { - "description": "min=0.206, mean=0.206, max=0.206, sum=0.411 (2)", - "tab": "Efficiency", - "score": 0.20559344591046663 - }, - "Human Sexuality - Observed inference time (s)": { - "description": "min=0.191, mean=0.191, max=0.191, sum=0.381 (2)", - "tab": "Efficiency", - "score": 0.19073554941716084 - }, - "Human Aging - # eval": { - "description": "min=223, mean=223, max=223, sum=446 (2)", - "tab": "General information", - "score": 223.0 - }, - "Human Aging - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Aging - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Aging - # prompt tokens": { - "description": "min=322.121, mean=322.121, max=322.121, sum=644.242 (2)", - "tab": "General information", - "score": 322.1210762331838 - }, - "Human Aging - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Human Sexuality - # eval": { - "description": "min=131, mean=131, max=131, sum=262 (2)", - "tab": "General information", - "score": 131.0 - }, - "Human Sexuality - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Sexuality - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # prompt tokens": { - "description": "min=341.504, mean=341.504, max=341.504, sum=683.008 (2)", - "tab": "General information", - "score": 341.5038167938931 - }, - "Human Sexuality - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" - } - } - }, - { - "evaluation_name": 
"International Law", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on International Law", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.86, - "details": { - "description": "min=0.86, mean=0.86, max=0.86, sum=1.719 (2)", - "tab": "Accuracy", - "International Law - Observed inference time (s)": { - "description": "min=0.23, mean=0.23, max=0.23, sum=0.46 (2)", - "tab": "Efficiency", - "score": 0.22999596792804308 - }, - "International Law - # eval": { - "description": "min=121, mean=121, max=121, sum=242 (2)", - "tab": "General information", - "score": 121.0 - }, - "International Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "International Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "International Law - # prompt tokens": { - "description": "min=640.579, mean=640.579, max=640.579, sum=1281.157 (2)", - "tab": "General information", - "score": 640.5785123966942 - }, - "International Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" - } - } - }, - { - "evaluation_name": "Logical Fallacies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Logical Fallacies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.773, - "details": { - "description": "min=0.773, mean=0.773, max=0.773, sum=1.546 (2)", - "tab": "Accuracy", - "Logical Fallacies - Observed inference time (s)": { - "description": "min=0.201, mean=0.201, max=0.201, sum=0.401 (2)", - "tab": "Efficiency", - "score": 0.2005681289485627 - }, - "Logical Fallacies - # eval": { - "description": "min=163, mean=163, max=163, sum=326 (2)", - "tab": "General information", - "score": 163.0 - }, - "Logical Fallacies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Logical Fallacies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Logical Fallacies - # prompt tokens": { - "description": "min=449.632, mean=449.632, max=449.632, sum=899.264 (2)", - "tab": "General information", - "score": 449.6319018404908 - }, - "Logical Fallacies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" - } - } - }, - { - "evaluation_name": "Machine Learning", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Machine Learning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.554, - "details": { - "description": "min=0.554, mean=0.554, max=0.554, sum=1.107 (2)", - "tab": "Accuracy", - "Machine Learning - Observed inference time (s)": { - "description": "min=0.232, mean=0.232, max=0.232, sum=0.463 (2)", - "tab": "Efficiency", - "score": 0.23156332118170603 - }, - "Machine Learning - # eval": { - "description": "min=112, mean=112, max=112, sum=224 (2)", - "tab": "General information", - "score": 112.0 - }, - "Machine Learning - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Machine Learning - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Machine Learning - # prompt tokens": { - "description": "min=681.848, mean=681.848, max=681.848, sum=1363.696 (2)", - "tab": "General information", - "score": 681.8482142857143 - }, - "Machine Learning - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" - } - } - }, - { - "evaluation_name": "Management", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Management", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.845, - "details": { - "description": "min=0.845, mean=0.845, max=0.845, sum=1.689 (2)", - "tab": "Accuracy", - "Management - Observed inference time (s)": { - "description": "min=0.197, mean=0.197, max=0.197, sum=0.394 (2)", - "tab": "Efficiency", - "score": 0.19694008410555644 - }, - "Management - # eval": { - "description": "min=103, mean=103, max=103, sum=206 (2)", - "tab": "General information", - "score": 103.0 - }, - "Management - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Management - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Management - # prompt tokens": { - "description": "min=283.854, mean=283.854, max=283.854, sum=567.709 (2)", - "tab": "General information", - "score": 283.8543689320388 - }, - "Management - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" - } - } - }, - { - "evaluation_name": "Marketing", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on 
Marketing", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.919, - "details": { - "description": "min=0.919, mean=0.919, max=0.919, sum=1.838 (2)", - "tab": "Accuracy", - "Marketing - Observed inference time (s)": { - "description": "min=0.184, mean=0.184, max=0.184, sum=0.368 (2)", - "tab": "Efficiency", - "score": 0.18401269525544256 - }, - "Marketing - # eval": { - "description": "min=234, mean=234, max=234, sum=468 (2)", - "tab": "General information", - "score": 234.0 - }, - "Marketing - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Marketing - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Marketing - # prompt tokens": { - "description": "min=404.415, mean=404.415, max=404.415, sum=808.829 (2)", - "tab": "General information", - "score": 404.4145299145299 - }, - "Marketing - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" - } - } - }, - { - "evaluation_name": "Medical Genetics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Medical Genetics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.85, - "details": { - "description": "min=0.85, mean=0.85, max=0.85, sum=1.7 (2)", - "tab": "Accuracy", - "Medical Genetics - Observed inference time (s)": { - "description": "min=0.176, mean=0.176, max=0.176, sum=0.351 (2)", - "tab": "Efficiency", - "score": 0.17553309679031373 - }, - "Medical Genetics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Medical Genetics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Medical Genetics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Medical Genetics - # prompt tokens": { - "description": "min=342.35, mean=342.35, max=342.35, sum=684.7 (2)", - "tab": "General information", - "score": 342.35 - }, - "Medical Genetics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" - } - } - }, - { - "evaluation_name": "Miscellaneous", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Miscellaneous", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.852, - "details": { - "description": "min=0.852, mean=0.852, 
max=0.852, sum=1.704 (2)", - "tab": "Accuracy", - "Miscellaneous - Observed inference time (s)": { - "description": "min=0.174, mean=0.174, max=0.174, sum=0.347 (2)", - "tab": "Efficiency", - "score": 0.17373346399377892 - }, - "Miscellaneous - # eval": { - "description": "min=783, mean=783, max=783, sum=1566 (2)", - "tab": "General information", - "score": 783.0 - }, - "Miscellaneous - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Miscellaneous - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Miscellaneous - # prompt tokens": { - "description": "min=303.7, mean=303.7, max=303.7, sum=607.4 (2)", - "tab": "General information", - "score": 303.6998722860792 - }, - "Miscellaneous - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" - } - } - }, - { - "evaluation_name": "Moral Scenarios", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Moral Scenarios", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.511, - "details": { - "description": "min=0.511, mean=0.511, max=0.511, sum=1.021 (2)", - "tab": "Accuracy", - "Moral Disputes - Observed inference time (s)": { - "description": "min=0.168, mean=0.168, max=0.168, sum=0.337 (2)", - "tab": "Efficiency", - "score": 0.16836080041234894 - }, - "Moral Scenarios - Observed inference time (s)": { - "description": "min=0.171, mean=0.171, max=0.171, sum=0.342 (2)", - "tab": "Efficiency", - "score": 0.1708347949235799 - }, - "Moral Disputes - # eval": { - "description": "min=346, mean=346, max=346, sum=692 (2)", - "tab": "General information", - "score": 346.0 - }, - "Moral Disputes - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Disputes - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Disputes - # prompt tokens": { - "description": "min=476.182, mean=476.182, max=476.182, sum=952.364 (2)", - "tab": "General information", - "score": 476.1820809248555 - }, - "Moral Disputes - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Moral Scenarios - # eval": { - "description": "min=895, mean=895, max=895, sum=1790 (2)", - "tab": "General information", - "score": 895.0 - }, - "Moral Scenarios - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Scenarios - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # prompt tokens": { - "description": "min=668.494, mean=668.494, max=668.494, sum=1336.988 (2)", - "tab": "General information", - "score": 668.4938547486033 - }, - "Moral Scenarios - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General 
information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" - } - } - }, - { - "evaluation_name": "Nutrition", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Nutrition", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.778, - "details": { - "description": "min=0.778, mean=0.778, max=0.778, sum=1.556 (2)", - "tab": "Accuracy", - "Nutrition - Observed inference time (s)": { - "description": "min=0.168, mean=0.168, max=0.168, sum=0.337 (2)", - "tab": "Efficiency", - "score": 0.16839487724054872 - }, - "Nutrition - # eval": { - "description": "min=306, mean=306, max=306, sum=612 (2)", - "tab": "General information", - "score": 306.0 - }, - "Nutrition - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Nutrition - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Nutrition - # prompt tokens": { - "description": "min=599.637, mean=599.637, max=599.637, sum=1199.275 (2)", - "tab": "General information", - "score": 599.6372549019608 - }, - "Nutrition - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" - } - } - }, - { - "evaluation_name": "Prehistory", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Prehistory", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.836, - "details": { - "description": "min=0.836, mean=0.836, max=0.836, sum=1.673 (2)", - "tab": "Accuracy", - "Prehistory - Observed inference time (s)": { - "description": "min=0.168, mean=0.168, max=0.168, sum=0.337 (2)", - "tab": "Efficiency", - "score": 0.16826030795956837 - }, - "Prehistory - # eval": { - "description": "min=324, mean=324, max=324, sum=648 (2)", - "tab": "General information", - "score": 324.0 - }, - "Prehistory - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Prehistory - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Prehistory - # prompt tokens": { - "description": "min=528.364, mean=528.364, max=528.364, sum=1056.728 (2)", - "tab": "General information", - "score": 528.3641975308642 - }, - "Prehistory - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" - } - } - }, - { - 
"evaluation_name": "Public Relations", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Public Relations", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.709, - "details": { - "description": "min=0.709, mean=0.709, max=0.709, sum=1.418 (2)", - "tab": "Accuracy", - "Public Relations - Observed inference time (s)": { - "description": "min=0.164, mean=0.164, max=0.164, sum=0.328 (2)", - "tab": "Efficiency", - "score": 0.1641989447853782 - }, - "Public Relations - # eval": { - "description": "min=110, mean=110, max=110, sum=220 (2)", - "tab": "General information", - "score": 110.0 - }, - "Public Relations - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Public Relations - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Public Relations - # prompt tokens": { - "description": "min=408.427, mean=408.427, max=408.427, sum=816.855 (2)", - "tab": "General information", - "score": 408.42727272727274 - }, - "Public Relations - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" - } - } - }, - { - "evaluation_name": "Security Studies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Security Studies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.682, - "details": { - "description": "min=0.682, mean=0.682, max=0.682, sum=1.363 (2)", - "tab": "Accuracy", - "Security Studies - Observed inference time (s)": { - "description": "min=0.174, mean=0.174, max=0.174, sum=0.349 (2)", - "tab": "Efficiency", - "score": 0.1744946577111069 - }, - "Security Studies - # eval": { - "description": "min=245, mean=245, max=245, sum=490 (2)", - "tab": "General information", - "score": 245.0 - }, - "Security Studies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Security Studies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Security Studies - # prompt tokens": { - "description": "min=1166.931, mean=1166.931, max=1166.931, sum=2333.861 (2)", - "tab": "General information", - "score": 1166.930612244898 - }, - "Security Studies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" - } - } - }, - { - "evaluation_name": "Sociology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Sociology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.861, - "details": { - "description": "min=0.861, mean=0.861, max=0.861, sum=1.721 (2)", - "tab": "Accuracy", - "Sociology - Observed inference time (s)": { - "description": "min=0.19, mean=0.19, max=0.19, sum=0.381 (2)", - "tab": "Efficiency", - "score": 0.1903395510431546 - }, - "Sociology - # eval": { - "description": "min=201, mean=201, max=201, sum=402 (2)", - "tab": "General information", - "score": 201.0 - }, - "Sociology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Sociology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Sociology - # prompt tokens": { - "description": "min=450.1, mean=450.1, max=450.1, sum=900.199 (2)", - "tab": "General information", - "score": 450.0995024875622 - }, - "Sociology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" - } - } - }, - { - "evaluation_name": "Virology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Virology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.578, - "details": { - "description": "min=0.578, mean=0.578, max=0.578, sum=1.157 (2)", - "tab": "Accuracy", - "Virology - Observed inference time (s)": { - "description": "min=0.174, mean=0.174, max=0.174, sum=0.348 (2)", - "tab": "Efficiency", - "score": 0.1741443513387657 - }, - "Virology - # eval": { - "description": "min=166, mean=166, max=166, sum=332 (2)", - "tab": "General information", - "score": 166.0 - }, - "Virology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Virology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Virology - # prompt tokens": { - "description": "min=343.819, mean=343.819, max=343.819, sum=687.639 (2)", - "tab": "General information", - "score": 343.8192771084337 - }, - "Virology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" - } - } - }, - { - "evaluation_name": "World Religions", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on World Religions", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.83, - "details": { - "description": "min=0.83, mean=0.83, max=0.83, sum=1.661 (2)", - "tab": "Accuracy", - "World Religions - Observed inference time (s)": { - "description": "min=0.162, mean=0.162, max=0.162, sum=0.325 (2)", - "tab": "Efficiency", - "score": 0.16239780292176365 - }, - "World Religions - # eval": { - "description": "min=171, mean=171, max=171, sum=342 (2)", - "tab": "General information", - "score": 171.0 - }, - "World Religions - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "World Religions - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "World Religions - # prompt tokens": { - "description": "min=276.07, mean=276.07, max=276.07, sum=552.14 (2)", - "tab": "General information", - "score": 276.0701754385965 - }, - "World Religions - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" - } - } - }, - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.887, - "details": { - "tab": "Efficiency" - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_mmlu/snowflake/snowflake-arctic-instruct/7df68af5-667a-4125-9c12-e71fb5af0a74.json b/data/helm_mmlu/snowflake/snowflake-arctic-instruct/7df68af5-667a-4125-9c12-e71fb5af0a74.json deleted file mode 100644 index 0afa77758..000000000 --- a/data/helm_mmlu/snowflake/snowflake-arctic-instruct/7df68af5-667a-4125-9c12-e71fb5af0a74.json +++ /dev/null @@ -1,3021 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/snowflake_snowflake-arctic-instruct/1770835937.459157", - "retrieved_timestamp": "1770835937.459157", - "source_metadata": { - "source_name": "helm_mmlu", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Arctic Instruct", - "id": "snowflake/snowflake-arctic-instruct", - "developer": "snowflake", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "MMLU All Subjects", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU All Subjects", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.677, - "details": { - "description": "min=0.28, mean=0.677, max=0.912, sum=77.129 (114)", - "tab": "Accuracy", - "MMLU All Subjects - Observed inference time (s)": { - "description": "min=0.35, mean=0.42, max=0.544, sum=47.89 (114)", - 
"tab": "Efficiency", - "score": 0.4200856614493726 - }, - "MMLU All Subjects - # eval": { - "description": "min=100, mean=246.351, max=1534, sum=28084 (114)", - "tab": "General information", - "score": 246.35087719298247 - }, - "MMLU All Subjects - # train": { - "description": "min=5, mean=5, max=5, sum=570 (114)", - "tab": "General information", - "score": 5.0 - }, - "MMLU All Subjects - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (114)", - "tab": "General information", - "score": 0.0 - }, - "MMLU All Subjects - # prompt tokens": { - "description": "min=304.474, mean=706.682, max=3159.636, sum=80561.749 (114)", - "tab": "General information", - "score": 706.6820126388612 - }, - "MMLU All Subjects - # output tokens": { - "description": "min=1, mean=1, max=1, sum=114 (114)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - 
"mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] - } - } - }, - { - "evaluation_name": "Abstract Algebra", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Abstract Algebra", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.35, - "details": { - "description": "min=0.35, mean=0.35, max=0.35, sum=0.7 (2)", - "tab": "Accuracy", - "Abstract Algebra - Observed inference time (s)": { - "description": "min=0.377, mean=0.377, max=0.377, sum=0.753 (2)", - "tab": "Efficiency", - "score": 0.37665764808654784 - }, - "Abstract Algebra - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Abstract Algebra - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Abstract Algebra - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Abstract Algebra - # prompt tokens": { - "description": "min=397.65, mean=397.65, max=397.65, sum=795.3 (2)", - "tab": "General information", - "score": 397.65 - }, - "Abstract Algebra - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" - } - } - }, - { - "evaluation_name": "Anatomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Anatomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.652, - "details": { - "description": "min=0.652, mean=0.652, max=0.652, sum=1.304 (2)", - "tab": "Accuracy", - "Anatomy - Observed inference time (s)": { - "description": "min=0.365, mean=0.365, max=0.365, sum=0.731 (2)", - "tab": "Efficiency", - "score": 0.3654881194785789 - }, - "Anatomy - # eval": { - "description": "min=135, mean=135, max=135, sum=270 (2)", - "tab": "General information", - "score": 135.0 - }, - "Anatomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Anatomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Anatomy - # prompt tokens": { - "description": "min=409.133, mean=409.133, max=409.133, sum=818.267 (2)", - "tab": "General information", - "score": 409.1333333333333 - }, - "Anatomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - 
"generation_config": { - "additional_details": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" - } - } - }, - { - "evaluation_name": "College Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on College Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.461, - "details": { - "description": "min=0.461, mean=0.461, max=0.461, sum=0.922 (2)", - "tab": "Accuracy", - "College Chemistry - Observed inference time (s)": { - "description": "min=0.35, mean=0.35, max=0.35, sum=0.701 (2)", - "tab": "Efficiency", - "score": 0.3502761268615723 - }, - "College Biology - Observed inference time (s)": { - "description": "min=0.421, mean=0.421, max=0.421, sum=0.842 (2)", - "tab": "Efficiency", - "score": 0.421069688267178 - }, - "College Computer Science - Observed inference time (s)": { - "description": "min=0.427, mean=0.427, max=0.427, sum=0.853 (2)", - "tab": "Efficiency", - "score": 0.4266632032394409 - }, - "College Mathematics - Observed inference time (s)": { - "description": "min=0.429, mean=0.429, max=0.429, sum=0.858 (2)", - "tab": "Efficiency", - "score": 0.42887043952941895 - }, - "College Medicine - Observed inference time (s)": { - "description": "min=0.434, mean=0.434, max=0.434, sum=0.869 (2)", - "tab": "Efficiency", - "score": 0.4343285574389331 - }, - "College Physics - Observed inference time (s)": { - "description": "min=0.421, mean=0.421, max=0.421, sum=0.842 (2)", - "tab": "Efficiency", - "score": 0.4209739086674709 - }, - "College Chemistry - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Chemistry - # prompt tokens": { - "description": "min=622.43, mean=622.43, max=622.43, sum=1244.86 (2)", - "tab": "General information", - "score": 622.43 - }, - "College Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Biology - # eval": { - "description": "min=144, mean=144, max=144, sum=288 (2)", - "tab": "General information", - "score": 144.0 - }, - "College Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # prompt tokens": { - "description": "min=553.632, mean=553.632, max=553.632, sum=1107.264 (2)", - "tab": "General information", - "score": 553.6319444444445 - }, - "College Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Computer Science - # train": { - "description": 
"min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer Science - # prompt tokens": { - "description": "min=901.14, mean=901.14, max=901.14, sum=1802.28 (2)", - "tab": "General information", - "score": 901.14 - }, - "College Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Mathematics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # prompt tokens": { - "description": "min=646.96, mean=646.96, max=646.96, sum=1293.92 (2)", - "tab": "General information", - "score": 646.96 - }, - "College Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Medicine - # eval": { - "description": "min=173, mean=173, max=173, sum=346 (2)", - "tab": "General information", - "score": 173.0 - }, - "College Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Medicine - # prompt tokens": { - "description": "min=608.671, mean=608.671, max=608.671, sum=1217.341 (2)", - "tab": "General information", - "score": 608.6705202312139 - }, - "College Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Physics - # eval": { - "description": "min=102, mean=102, max=102, sum=204 (2)", - "tab": "General information", - "score": 102.0 - }, - "College Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Physics - # prompt tokens": { - "description": "min=551.873, mean=551.873, max=551.873, sum=1103.745 (2)", - "tab": "General information", - "score": 551.8725490196078 - }, - "College Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" - } - } - }, - { - "evaluation_name": "Computer Security", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Computer Security", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.84, - "details": { - "description": "min=0.84, 
mean=0.84, max=0.84, sum=1.68 (2)", - "tab": "Accuracy", - "Computer Security - Observed inference time (s)": { - "description": "min=0.412, mean=0.412, max=0.412, sum=0.825 (2)", - "tab": "Efficiency", - "score": 0.41247488737106325 - }, - "Computer Security - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Computer Security - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Computer Security - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Computer Security - # prompt tokens": { - "description": "min=428.17, mean=428.17, max=428.17, sum=856.34 (2)", - "tab": "General information", - "score": 428.17 - }, - "Computer Security - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" - } - } - }, - { - "evaluation_name": "Econometrics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Econometrics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5, - "details": { - "description": "min=0.5, mean=0.5, max=0.5, sum=1 (2)", - "tab": "Accuracy", - "Econometrics - Observed inference time (s)": { - "description": "min=0.436, mean=0.436, max=0.436, sum=0.873 (2)", - "tab": "Efficiency", - "score": 0.436487873395284 - }, - "Econometrics - # eval": { - "description": "min=114, mean=114, max=114, sum=228 (2)", - "tab": "General information", - "score": 114.0 - }, - "Econometrics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Econometrics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Econometrics - # prompt tokens": { - "description": "min=684.675, mean=684.675, max=684.675, sum=1369.351 (2)", - "tab": "General information", - "score": 684.6754385964912 - }, - "Econometrics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" - } - } - }, - { - "evaluation_name": "Global Facts", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Global Facts", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.39, - "details": { - "description": "min=0.39, mean=0.39, max=0.39, sum=0.78 (2)", - "tab": "Accuracy", - "Global Facts - Observed inference time (s)": { - "description": "min=0.42, mean=0.42, max=0.42, sum=0.839 (2)", - "tab": "Efficiency", - 
"score": 0.41951879262924197 - }, - "Global Facts - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Global Facts - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Global Facts - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Global Facts - # prompt tokens": { - "description": "min=484.54, mean=484.54, max=484.54, sum=969.08 (2)", - "tab": "General information", - "score": 484.54 - }, - "Global Facts - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" - } - } - }, - { - "evaluation_name": "Jurisprudence", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Jurisprudence", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.741, - "details": { - "description": "min=0.741, mean=0.741, max=0.741, sum=1.481 (2)", - "tab": "Accuracy", - "Jurisprudence - Observed inference time (s)": { - "description": "min=0.422, mean=0.422, max=0.422, sum=0.843 (2)", - "tab": "Efficiency", - "score": 0.421647725281892 - }, - "Jurisprudence - # eval": { - "description": "min=108, mean=108, max=108, sum=216 (2)", - "tab": "General information", - "score": 108.0 - }, - "Jurisprudence - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Jurisprudence - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Jurisprudence - # prompt tokens": { - "description": "min=449.898, mean=449.898, max=449.898, sum=899.796 (2)", - "tab": "General information", - "score": 449.89814814814815 - }, - "Jurisprudence - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" - } - } - }, - { - "evaluation_name": "Philosophy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Philosophy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.752, - "details": { - "description": "min=0.752, mean=0.752, max=0.752, sum=1.505 (2)", - "tab": "Accuracy", - "Philosophy - Observed inference time (s)": { - "description": "min=0.418, mean=0.418, max=0.418, sum=0.837 (2)", - "tab": "Efficiency", - "score": 0.418486426497579 - }, - "Philosophy - # eval": { - "description": "min=311, mean=311, max=311, sum=622 (2)", - "tab": "General information", - "score": 311.0 - }, - "Philosophy - # train": { - 
"description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Philosophy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Philosophy - # prompt tokens": { - "description": "min=372.122, mean=372.122, max=372.122, sum=744.244 (2)", - "tab": "General information", - "score": 372.12218649517683 - }, - "Philosophy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" - } - } - }, - { - "evaluation_name": "Professional Psychology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Professional Psychology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.724, - "details": { - "description": "min=0.724, mean=0.724, max=0.724, sum=1.448 (2)", - "tab": "Accuracy", - "Professional Medicine - Observed inference time (s)": { - "description": "min=0.445, mean=0.445, max=0.445, sum=0.89 (2)", - "tab": "Efficiency", - "score": 0.4448305149288738 - }, - "Professional Accounting - Observed inference time (s)": { - "description": "min=0.443, mean=0.443, max=0.443, sum=0.887 (2)", - "tab": "Efficiency", - "score": 0.44340477683019974 - }, - "Professional Law - Observed inference time (s)": { - "description": "min=0.531, mean=0.531, max=0.531, sum=1.062 (2)", - "tab": "Efficiency", - "score": 0.531202322345669 - }, - "Professional Psychology - Observed inference time (s)": { - "description": "min=0.423, mean=0.423, max=0.423, sum=0.847 (2)", - "tab": "Efficiency", - "score": 0.42342418120577446 - }, - "Professional Medicine - # eval": { - "description": "min=272, mean=272, max=272, sum=544 (2)", - "tab": "General information", - "score": 272.0 - }, - "Professional Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Medicine - # prompt tokens": { - "description": "min=1330.647, mean=1330.647, max=1330.647, sum=2661.294 (2)", - "tab": "General information", - "score": 1330.6470588235295 - }, - "Professional Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Accounting - # eval": { - "description": "min=282, mean=282, max=282, sum=564 (2)", - "tab": "General information", - "score": 282.0 - }, - "Professional Accounting - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Accounting - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Accounting - # prompt tokens": { - "description": "min=823.277, mean=823.277, max=823.277, sum=1646.553 (2)", - "tab": "General information", - "score": 823.2765957446809 - }, - "Professional Accounting - # output tokens": { - 
"description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Law - # eval": { - "description": "min=1534, mean=1534, max=1534, sum=3068 (2)", - "tab": "General information", - "score": 1534.0 - }, - "Professional Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # prompt tokens": { - "description": "min=1915.007, mean=1915.007, max=1915.007, sum=3830.014 (2)", - "tab": "General information", - "score": 1915.0071707953064 - }, - "Professional Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Psychology - # eval": { - "description": "min=612, mean=612, max=612, sum=1224 (2)", - "tab": "General information", - "score": 612.0 - }, - "Professional Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # prompt tokens": { - "description": "min=650.078, mean=650.078, max=650.078, sum=1300.157 (2)", - "tab": "General information", - "score": 650.0784313725491 - }, - "Professional Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" - } - } - }, - { - "evaluation_name": "Us Foreign Policy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Us Foreign Policy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.88, - "details": { - "description": "min=0.88, mean=0.88, max=0.88, sum=1.76 (2)", - "tab": "Accuracy", - "Us Foreign Policy - Observed inference time (s)": { - "description": "min=0.424, mean=0.424, max=0.424, sum=0.848 (2)", - "tab": "Efficiency", - "score": 0.42398189067840575 - }, - "Us Foreign Policy - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Us Foreign Policy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Us Foreign Policy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Us Foreign Policy - # prompt tokens": { - "description": "min=479.81, mean=479.81, max=479.81, sum=959.62 (2)", - "tab": "General information", - "score": 479.81 - }, - "Us Foreign Policy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": 
"mmlu_us_foreign_policy" - } - } - }, - { - "evaluation_name": "Astronomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Astronomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.763, - "details": { - "description": "min=0.763, mean=0.763, max=0.763, sum=1.526 (2)", - "tab": "Accuracy", - "Astronomy - Observed inference time (s)": { - "description": "min=0.424, mean=0.424, max=0.424, sum=0.848 (2)", - "tab": "Efficiency", - "score": 0.42381788398090164 - }, - "Astronomy - # eval": { - "description": "min=152, mean=152, max=152, sum=304 (2)", - "tab": "General information", - "score": 152.0 - }, - "Astronomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Astronomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Astronomy - # prompt tokens": { - "description": "min=681.079, mean=681.079, max=681.079, sum=1362.158 (2)", - "tab": "General information", - "score": 681.078947368421 - }, - "Astronomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" - } - } - }, - { - "evaluation_name": "Business Ethics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Business Ethics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.69, - "details": { - "description": "min=0.69, mean=0.69, max=0.69, sum=1.38 (2)", - "tab": "Accuracy", - "Business Ethics - Observed inference time (s)": { - "description": "min=0.432, mean=0.432, max=0.432, sum=0.863 (2)", - "tab": "Efficiency", - "score": 0.4315712761878967 - }, - "Business Ethics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Business Ethics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Business Ethics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Business Ethics - # prompt tokens": { - "description": "min=674.44, mean=674.44, max=674.44, sum=1348.88 (2)", - "tab": "General information", - "score": 674.44 - }, - "Business Ethics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" - } - } - }, - { - "evaluation_name": "Clinical Knowledge", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Clinical Knowledge", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.781, - "details": { - "description": "min=0.781, mean=0.781, max=0.781, sum=1.562 (2)", - "tab": "Accuracy", - "Clinical Knowledge - Observed inference time (s)": { - "description": "min=0.42, mean=0.42, max=0.42, sum=0.841 (2)", - "tab": "Efficiency", - "score": 0.4204666920428006 - }, - "Clinical Knowledge - # eval": { - "description": "min=265, mean=265, max=265, sum=530 (2)", - "tab": "General information", - "score": 265.0 - }, - "Clinical Knowledge - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Clinical Knowledge - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Clinical Knowledge - # prompt tokens": { - "description": "min=487.374, mean=487.374, max=487.374, sum=974.747 (2)", - "tab": "General information", - "score": 487.3735849056604 - }, - "Clinical Knowledge - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" - } - } - }, - { - "evaluation_name": "Conceptual Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Conceptual Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.634, - "details": { - "description": "min=0.634, mean=0.634, max=0.634, sum=1.268 (2)", - "tab": "Accuracy", - "Conceptual Physics - Observed inference time (s)": { - "description": "min=0.412, mean=0.412, max=0.412, sum=0.824 (2)", - "tab": "Efficiency", - "score": 0.4118805824442113 - }, - "Conceptual Physics - # eval": { - "description": "min=235, mean=235, max=235, sum=470 (2)", - "tab": "General information", - "score": 235.0 - }, - "Conceptual Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Conceptual Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Conceptual Physics - # prompt tokens": { - "description": "min=333.153, mean=333.153, max=333.153, sum=666.306 (2)", - "tab": "General information", - "score": 333.1531914893617 - }, - "Conceptual Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" - } - } - }, - { - "evaluation_name": "Electrical Engineering", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Electrical Engineering", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.662, - "details": { - "description": "min=0.662, mean=0.662, max=0.662, sum=1.324 (2)", - "tab": "Accuracy", - "Electrical Engineering - Observed inference time (s)": { - "description": "min=0.428, mean=0.428, max=0.428, sum=0.856 (2)", - "tab": "Efficiency", - "score": 0.42821227435407966 - }, - "Electrical Engineering - # eval": { - "description": "min=145, mean=145, max=145, sum=290 (2)", - "tab": "General information", - "score": 145.0 - }, - "Electrical Engineering - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Electrical Engineering - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Electrical Engineering - # prompt tokens": { - "description": "min=497.779, mean=497.779, max=497.779, sum=995.559 (2)", - "tab": "General information", - "score": 497.7793103448276 - }, - "Electrical Engineering - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" - } - } - }, - { - "evaluation_name": "Elementary Mathematics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Elementary Mathematics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.481, - "details": { - "description": "min=0.481, mean=0.481, max=0.481, sum=0.963 (2)", - "tab": "Accuracy", - "Elementary Mathematics - Observed inference time (s)": { - "description": "min=0.427, mean=0.427, max=0.427, sum=0.853 (2)", - "tab": "Efficiency", - "score": 0.4265344634888664 - }, - "Elementary Mathematics - # eval": { - "description": "min=378, mean=378, max=378, sum=756 (2)", - "tab": "General information", - "score": 378.0 - }, - "Elementary Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Elementary Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Elementary Mathematics - # prompt tokens": { - "description": "min=609.156, mean=609.156, max=609.156, sum=1218.312 (2)", - "tab": "General information", - "score": 609.1560846560847 - }, - "Elementary Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" - } - } - }, - { - "evaluation_name": "Formal Logic", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Formal Logic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.444, - "details": { - "description": "min=0.444, mean=0.444, max=0.444, sum=0.889 (2)", - "tab": "Accuracy", - "Formal Logic - Observed inference time (s)": { - "description": "min=0.411, mean=0.411, max=0.411, sum=0.821 (2)", - "tab": "Efficiency", - "score": 0.4107102117841206 - }, - "Formal Logic - # eval": { - "description": "min=126, mean=126, max=126, sum=252 (2)", - "tab": "General information", - "score": 126.0 - }, - "Formal Logic - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Formal Logic - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Formal Logic - # prompt tokens": { - "description": "min=691.81, mean=691.81, max=691.81, sum=1383.619 (2)", - "tab": "General information", - "score": 691.8095238095239 - }, - "Formal Logic - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" - } - } - }, - { - "evaluation_name": "High School World History", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on High School World History", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.827, - "details": { - "description": "min=0.827, mean=0.827, max=0.827, sum=1.654 (2)", - "tab": "Accuracy", - "High School Biology - Observed inference time (s)": { - "description": "min=0.424, mean=0.424, max=0.424, sum=0.847 (2)", - "tab": "Efficiency", - "score": 0.42357982127897204 - }, - "High School Chemistry - Observed inference time (s)": { - "description": "min=0.412, mean=0.412, max=0.412, sum=0.825 (2)", - "tab": "Efficiency", - "score": 0.41242665375394777 - }, - "High School Computer Science - Observed inference time (s)": { - "description": "min=0.445, mean=0.445, max=0.445, sum=0.89 (2)", - "tab": "Efficiency", - "score": 0.44495458364486695 - }, - "High School European History - Observed inference time (s)": { - "description": "min=0.544, mean=0.544, max=0.544, sum=1.088 (2)", - "tab": "Efficiency", - "score": 0.5441486705433238 - }, - "High School Geography - Observed inference time (s)": { - "description": "min=0.415, mean=0.415, max=0.415, sum=0.83 (2)", - "tab": "Efficiency", - "score": 0.4149725003675981 - }, - "High School Government And Politics - Observed inference time (s)": { - "description": "min=0.383, mean=0.383, max=0.383, sum=0.766 (2)", - "tab": "Efficiency", - "score": 0.38312110629106433 - }, - "High School Macroeconomics - Observed inference time (s)": { - "description": "min=0.403, mean=0.403, max=0.403, sum=0.807 (2)", - "tab": "Efficiency", - "score": 0.4034240123553154 - }, - "High School Mathematics - Observed inference time (s)": { - "description": "min=0.39, 
mean=0.39, max=0.39, sum=0.779 (2)", - "tab": "Efficiency", - "score": 0.38954139285617406 - }, - "High School Microeconomics - Observed inference time (s)": { - "description": "min=0.399, mean=0.399, max=0.399, sum=0.798 (2)", - "tab": "Efficiency", - "score": 0.3992174813727371 - }, - "High School Physics - Observed inference time (s)": { - "description": "min=0.409, mean=0.409, max=0.409, sum=0.819 (2)", - "tab": "Efficiency", - "score": 0.40926165138648835 - }, - "High School Psychology - Observed inference time (s)": { - "description": "min=0.408, mean=0.408, max=0.408, sum=0.816 (2)", - "tab": "Efficiency", - "score": 0.4081065694126514 - }, - "High School Statistics - Observed inference time (s)": { - "description": "min=0.417, mean=0.417, max=0.417, sum=0.833 (2)", - "tab": "Efficiency", - "score": 0.4166152830477114 - }, - "High School US History - Observed inference time (s)": { - "description": "min=0.45, mean=0.45, max=0.45, sum=0.901 (2)", - "tab": "Efficiency", - "score": 0.4504043985815609 - }, - "High School World History - Observed inference time (s)": { - "description": "min=0.416, mean=0.416, max=0.416, sum=0.833 (2)", - "tab": "Efficiency", - "score": 0.4162542166086189 - }, - "High School Biology - # eval": { - "description": "min=310, mean=310, max=310, sum=620 (2)", - "tab": "General information", - "score": 310.0 - }, - "High School Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Biology - # prompt tokens": { - "description": "min=596.894, mean=596.894, max=596.894, sum=1193.787 (2)", - "tab": "General information", - "score": 596.8935483870968 - }, - "High School Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Chemistry - # eval": { - "description": "min=203, mean=203, max=203, sum=406 (2)", - "tab": "General information", - "score": 203.0 - }, - "High School Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # prompt tokens": { - "description": "min=568.665, mean=568.665, max=568.665, sum=1137.33 (2)", - "tab": "General information", - "score": 568.6650246305419 - }, - "High School Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "High School Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # prompt tokens": { - "description": "min=988.57, mean=988.57, max=988.57, sum=1977.14 (2)", - "tab": "General information", - "score": 988.57 - }, - "High School Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - 
"score": 1.0 - }, - "High School European History - # eval": { - "description": "min=165, mean=165, max=165, sum=330 (2)", - "tab": "General information", - "score": 165.0 - }, - "High School European History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School European History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School European History - # prompt tokens": { - "description": "min=3159.636, mean=3159.636, max=3159.636, sum=6319.273 (2)", - "tab": "General information", - "score": 3159.6363636363635 - }, - "High School European History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Geography - # eval": { - "description": "min=198, mean=198, max=198, sum=396 (2)", - "tab": "General information", - "score": 198.0 - }, - "High School Geography - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Geography - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Geography - # prompt tokens": { - "description": "min=436.657, mean=436.657, max=436.657, sum=873.313 (2)", - "tab": "General information", - "score": 436.65656565656565 - }, - "High School Geography - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Government And Politics - # eval": { - "description": "min=193, mean=193, max=193, sum=386 (2)", - "tab": "General information", - "score": 193.0 - }, - "High School Government And Politics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Government And Politics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # prompt tokens": { - "description": "min=527.927, mean=527.927, max=527.927, sum=1055.855 (2)", - "tab": "General information", - "score": 527.9274611398964 - }, - "High School Government And Politics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Macroeconomics - # eval": { - "description": "min=390, mean=390, max=390, sum=780 (2)", - "tab": "General information", - "score": 390.0 - }, - "High School Macroeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Macroeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - # prompt tokens": { - "description": "min=445.662, mean=445.662, max=445.662, sum=891.323 (2)", - "tab": "General information", - "score": 445.66153846153844 - }, - "High School Macroeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Mathematics - # eval": { - "description": "min=270, mean=270, max=270, sum=540 (2)", - "tab": "General information", - "score": 270.0 - }, - "High School Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General 
information", - "score": 5.0 - }, - "High School Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # prompt tokens": { - "description": "min=579.181, mean=579.181, max=579.181, sum=1158.363 (2)", - "tab": "General information", - "score": 579.1814814814815 - }, - "High School Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Microeconomics - # eval": { - "description": "min=238, mean=238, max=238, sum=476 (2)", - "tab": "General information", - "score": 238.0 - }, - "High School Microeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Microeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Microeconomics - # prompt tokens": { - "description": "min=449.492, mean=449.492, max=449.492, sum=898.983 (2)", - "tab": "General information", - "score": 449.49159663865544 - }, - "High School Microeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Physics - # eval": { - "description": "min=151, mean=151, max=151, sum=302 (2)", - "tab": "General information", - "score": 151.0 - }, - "High School Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # prompt tokens": { - "description": "min=621.788, mean=621.788, max=621.788, sum=1243.576 (2)", - "tab": "General information", - "score": 621.7880794701987 - }, - "High School Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Psychology - # eval": { - "description": "min=545, mean=545, max=545, sum=1090 (2)", - "tab": "General information", - "score": 545.0 - }, - "High School Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # prompt tokens": { - "description": "min=585.919, mean=585.919, max=585.919, sum=1171.839 (2)", - "tab": "General information", - "score": 585.9192660550459 - }, - "High School Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Statistics - # eval": { - "description": "min=216, mean=216, max=216, sum=432 (2)", - "tab": "General information", - "score": 216.0 - }, - "High School Statistics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Statistics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Statistics - # prompt tokens": { - "description": "min=908.208, mean=908.208, max=908.208, sum=1816.417 (2)", - "tab": "General information", - "score": 908.2083333333334 - }, - "High School Statistics 
- # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School US History - # eval": { - "description": "min=204, mean=204, max=204, sum=408 (2)", - "tab": "General information", - "score": 204.0 - }, - "High School US History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School US History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # prompt tokens": { - "description": "min=2535.324, mean=2535.324, max=2535.324, sum=5070.647 (2)", - "tab": "General information", - "score": 2535.323529411765 - }, - "High School US History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School World History - # eval": { - "description": "min=237, mean=237, max=237, sum=474 (2)", - "tab": "General information", - "score": 237.0 - }, - "High School World History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School World History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School World History - # prompt tokens": { - "description": "min=1638.219, mean=1638.219, max=1638.219, sum=3276.439 (2)", - "tab": "General information", - "score": 1638.2194092827003 - }, - "High School World History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" - } - } - }, - { - "evaluation_name": "Human Sexuality", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Human Sexuality", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.847, - "details": { - "description": "min=0.847, mean=0.847, max=0.847, sum=1.695 (2)", - "tab": "Accuracy", - "Human Aging - Observed inference time (s)": { - "description": "min=0.401, mean=0.401, max=0.401, sum=0.802 (2)", - "tab": "Efficiency", - "score": 0.4010318255745242 - }, - "Human Sexuality - Observed inference time (s)": { - "description": "min=0.393, mean=0.393, max=0.393, sum=0.787 (2)", - "tab": "Efficiency", - "score": 0.39331119843111695 - }, - "Human Aging - # eval": { - "description": "min=223, mean=223, max=223, sum=446 (2)", - "tab": "General information", - "score": 223.0 - }, - "Human Aging - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Aging - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Aging - # prompt tokens": { - "description": "min=361.26, mean=361.26, max=361.26, sum=722.52 (2)", - "tab": "General information", - "score": 361.26008968609864 - }, - "Human Aging - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - 
"tab": "General information", - "score": 1.0 - }, - "Human Sexuality - # eval": { - "description": "min=131, mean=131, max=131, sum=262 (2)", - "tab": "General information", - "score": 131.0 - }, - "Human Sexuality - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Sexuality - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # prompt tokens": { - "description": "min=403.382, mean=403.382, max=403.382, sum=806.763 (2)", - "tab": "General information", - "score": 403.381679389313 - }, - "Human Sexuality - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" - } - } - }, - { - "evaluation_name": "International Law", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on International Law", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.826, - "details": { - "description": "min=0.826, mean=0.826, max=0.826, sum=1.653 (2)", - "tab": "Accuracy", - "International Law - Observed inference time (s)": { - "description": "min=0.42, mean=0.42, max=0.42, sum=0.841 (2)", - "tab": "Efficiency", - "score": 0.42040472779392213 - }, - "International Law - # eval": { - "description": "min=121, mean=121, max=121, sum=242 (2)", - "tab": "General information", - "score": 121.0 - }, - "International Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "International Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "International Law - # prompt tokens": { - "description": "min=729.463, mean=729.463, max=729.463, sum=1458.926 (2)", - "tab": "General information", - "score": 729.4628099173553 - }, - "International Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" - } - } - }, - { - "evaluation_name": "Logical Fallacies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Logical Fallacies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.779, - "details": { - "description": "min=0.779, mean=0.779, max=0.779, sum=1.558 (2)", - "tab": "Accuracy", - "Logical Fallacies - Observed inference time (s)": { - "description": "min=0.404, mean=0.404, max=0.404, sum=0.809 (2)", - "tab": "Efficiency", - "score": 0.4043445353127696 - }, - "Logical Fallacies - # eval": { - "description": "min=163, mean=163, 
max=163, sum=326 (2)", - "tab": "General information", - "score": 163.0 - }, - "Logical Fallacies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Logical Fallacies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Logical Fallacies - # prompt tokens": { - "description": "min=502.755, mean=502.755, max=502.755, sum=1005.509 (2)", - "tab": "General information", - "score": 502.7546012269939 - }, - "Logical Fallacies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" - } - } - }, - { - "evaluation_name": "Machine Learning", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Machine Learning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.473, - "details": { - "description": "min=0.473, mean=0.473, max=0.473, sum=0.946 (2)", - "tab": "Accuracy", - "Machine Learning - Observed inference time (s)": { - "description": "min=0.421, mean=0.421, max=0.421, sum=0.842 (2)", - "tab": "Efficiency", - "score": 0.42122456644262585 - }, - "Machine Learning - # eval": { - "description": "min=112, mean=112, max=112, sum=224 (2)", - "tab": "General information", - "score": 112.0 - }, - "Machine Learning - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Machine Learning - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Machine Learning - # prompt tokens": { - "description": "min=730.402, mean=730.402, max=730.402, sum=1460.804 (2)", - "tab": "General information", - "score": 730.4017857142857 - }, - "Machine Learning - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" - } - } - }, - { - "evaluation_name": "Management", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Management", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.796, - "details": { - "description": "min=0.796, mean=0.796, max=0.796, sum=1.592 (2)", - "tab": "Accuracy", - "Management - Observed inference time (s)": { - "description": "min=0.392, mean=0.392, max=0.392, sum=0.785 (2)", - "tab": "Efficiency", - "score": 0.392485206566968 - }, - "Management - # eval": { - "description": "min=103, mean=103, max=103, sum=206 (2)", - "tab": "General information", - "score": 103.0 - }, - "Management - # train": { - "description": "min=5, mean=5, 
max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Management - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Management - # prompt tokens": { - "description": "min=315.777, mean=315.777, max=315.777, sum=631.553 (2)", - "tab": "General information", - "score": 315.77669902912623 - }, - "Management - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" - } - } - }, - { - "evaluation_name": "Marketing", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Marketing", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.902, - "details": { - "description": "min=0.902, mean=0.902, max=0.902, sum=1.803 (2)", - "tab": "Accuracy", - "Marketing - Observed inference time (s)": { - "description": "min=0.407, mean=0.407, max=0.407, sum=0.813 (2)", - "tab": "Efficiency", - "score": 0.406507401384859 - }, - "Marketing - # eval": { - "description": "min=234, mean=234, max=234, sum=468 (2)", - "tab": "General information", - "score": 234.0 - }, - "Marketing - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Marketing - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Marketing - # prompt tokens": { - "description": "min=472.628, mean=472.628, max=472.628, sum=945.256 (2)", - "tab": "General information", - "score": 472.62820512820514 - }, - "Marketing - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" - } - } - }, - { - "evaluation_name": "Medical Genetics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Medical Genetics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.76, - "details": { - "description": "min=0.76, mean=0.76, max=0.76, sum=1.52 (2)", - "tab": "Accuracy", - "Medical Genetics - Observed inference time (s)": { - "description": "min=0.417, mean=0.417, max=0.417, sum=0.835 (2)", - "tab": "Efficiency", - "score": 0.41734427213668823 - }, - "Medical Genetics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Medical Genetics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Medical Genetics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Medical 
Genetics - # prompt tokens": { - "description": "min=408.14, mean=408.14, max=408.14, sum=816.28 (2)", - "tab": "General information", - "score": 408.14 - }, - "Medical Genetics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" - } - } - }, - { - "evaluation_name": "Miscellaneous", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Miscellaneous", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.875, - "details": { - "description": "min=0.875, mean=0.875, max=0.875, sum=1.75 (2)", - "tab": "Accuracy", - "Miscellaneous - Observed inference time (s)": { - "description": "min=0.407, mean=0.407, max=0.407, sum=0.814 (2)", - "tab": "Efficiency", - "score": 0.40693108880200146 - }, - "Miscellaneous - # eval": { - "description": "min=783, mean=783, max=783, sum=1566 (2)", - "tab": "General information", - "score": 783.0 - }, - "Miscellaneous - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Miscellaneous - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Miscellaneous - # prompt tokens": { - "description": "min=345.913, mean=345.913, max=345.913, sum=691.826 (2)", - "tab": "General information", - "score": 345.9131545338442 - }, - "Miscellaneous - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" - } - } - }, - { - "evaluation_name": "Moral Scenarios", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Moral Scenarios", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.28, - "details": { - "description": "min=0.28, mean=0.28, max=0.28, sum=0.561 (2)", - "tab": "Accuracy", - "Moral Disputes - Observed inference time (s)": { - "description": "min=0.424, mean=0.424, max=0.424, sum=0.848 (2)", - "tab": "Efficiency", - "score": 0.4239204674097844 - }, - "Moral Scenarios - Observed inference time (s)": { - "description": "min=0.433, mean=0.433, max=0.433, sum=0.866 (2)", - "tab": "Efficiency", - "score": 0.43297034721800737 - }, - "Moral Disputes - # eval": { - "description": "min=346, mean=346, max=346, sum=692 (2)", - "tab": "General information", - "score": 346.0 - }, - "Moral Disputes - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Disputes - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral 
Disputes - # prompt tokens": { - "description": "min=542.506, mean=542.506, max=542.506, sum=1085.012 (2)", - "tab": "General information", - "score": 542.5057803468208 - }, - "Moral Disputes - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Moral Scenarios - # eval": { - "description": "min=895, mean=895, max=895, sum=1790 (2)", - "tab": "General information", - "score": 895.0 - }, - "Moral Scenarios - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Scenarios - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # prompt tokens": { - "description": "min=756.479, mean=756.479, max=756.479, sum=1512.959 (2)", - "tab": "General information", - "score": 756.4793296089385 - }, - "Moral Scenarios - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" - } - } - }, - { - "evaluation_name": "Nutrition", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Nutrition", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.725, - "details": { - "description": "min=0.725, mean=0.725, max=0.725, sum=1.451 (2)", - "tab": "Accuracy", - "Nutrition - Observed inference time (s)": { - "description": "min=0.417, mean=0.417, max=0.417, sum=0.835 (2)", - "tab": "Efficiency", - "score": 0.41727598430284485 - }, - "Nutrition - # eval": { - "description": "min=306, mean=306, max=306, sum=612 (2)", - "tab": "General information", - "score": 306.0 - }, - "Nutrition - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Nutrition - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Nutrition - # prompt tokens": { - "description": "min=695.922, mean=695.922, max=695.922, sum=1391.843 (2)", - "tab": "General information", - "score": 695.9215686274509 - }, - "Nutrition - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" - } - } - }, - { - "evaluation_name": "Prehistory", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Prehistory", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.79, - "details": { - "description": "min=0.79, mean=0.79, max=0.79, sum=1.58 (2)", - "tab": "Accuracy", - "Prehistory - Observed inference time (s)": { - "description": "min=0.43, 
mean=0.43, max=0.43, sum=0.861 (2)", - "tab": "Efficiency", - "score": 0.4303552037403907 - }, - "Prehistory - # eval": { - "description": "min=324, mean=324, max=324, sum=648 (2)", - "tab": "General information", - "score": 324.0 - }, - "Prehistory - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Prehistory - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Prehistory - # prompt tokens": { - "description": "min=619.185, mean=619.185, max=619.185, sum=1238.37 (2)", - "tab": "General information", - "score": 619.1851851851852 - }, - "Prehistory - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" - } - } - }, - { - "evaluation_name": "Public Relations", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Public Relations", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.664, - "details": { - "description": "min=0.664, mean=0.664, max=0.664, sum=1.327 (2)", - "tab": "Accuracy", - "Public Relations - Observed inference time (s)": { - "description": "min=0.428, mean=0.428, max=0.428, sum=0.855 (2)", - "tab": "Efficiency", - "score": 0.42750670259649104 - }, - "Public Relations - # eval": { - "description": "min=110, mean=110, max=110, sum=220 (2)", - "tab": "General information", - "score": 110.0 - }, - "Public Relations - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Public Relations - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Public Relations - # prompt tokens": { - "description": "min=474.827, mean=474.827, max=474.827, sum=949.655 (2)", - "tab": "General information", - "score": 474.8272727272727 - }, - "Public Relations - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" - } - } - }, - { - "evaluation_name": "Security Studies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Security Studies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.78, - "details": { - "description": "min=0.78, mean=0.78, max=0.78, sum=1.559 (2)", - "tab": "Accuracy", - "Security Studies - Observed inference time (s)": { - "description": "min=0.466, mean=0.466, max=0.466, sum=0.933 (2)", - "tab": "Efficiency", - "score": 0.4662662194699657 - }, - "Security Studies - # eval": { - "description": "min=245, mean=245, 
max=245, sum=490 (2)", - "tab": "General information", - "score": 245.0 - }, - "Security Studies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Security Studies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Security Studies - # prompt tokens": { - "description": "min=1377.531, mean=1377.531, max=1377.531, sum=2755.061 (2)", - "tab": "General information", - "score": 1377.530612244898 - }, - "Security Studies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" - } - } - }, - { - "evaluation_name": "Sociology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Sociology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.891, - "details": { - "description": "min=0.891, mean=0.891, max=0.891, sum=1.781 (2)", - "tab": "Accuracy", - "Sociology - Observed inference time (s)": { - "description": "min=0.416, mean=0.416, max=0.416, sum=0.832 (2)", - "tab": "Efficiency", - "score": 0.4159522590352528 - }, - "Sociology - # eval": { - "description": "min=201, mean=201, max=201, sum=402 (2)", - "tab": "General information", - "score": 201.0 - }, - "Sociology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Sociology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Sociology - # prompt tokens": { - "description": "min=508.478, mean=508.478, max=508.478, sum=1016.955 (2)", - "tab": "General information", - "score": 508.4776119402985 - }, - "Sociology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" - } - } - }, - { - "evaluation_name": "Virology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Virology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.536, - "details": { - "description": "min=0.536, mean=0.536, max=0.536, sum=1.072 (2)", - "tab": "Accuracy", - "Virology - Observed inference time (s)": { - "description": "min=0.405, mean=0.405, max=0.405, sum=0.809 (2)", - "tab": "Efficiency", - "score": 0.40467354332108096 - }, - "Virology - # eval": { - "description": "min=166, mean=166, max=166, sum=332 (2)", - "tab": "General information", - "score": 166.0 - }, - "Virology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Virology - 
truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Virology - # prompt tokens": { - "description": "min=405.108, mean=405.108, max=405.108, sum=810.217 (2)", - "tab": "General information", - "score": 405.10843373493975 - }, - "Virology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" - } - } - }, - { - "evaluation_name": "World Religions", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on World Religions", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.854, - "details": { - "description": "min=0.854, mean=0.854, max=0.854, sum=1.708 (2)", - "tab": "Accuracy", - "World Religions - Observed inference time (s)": { - "description": "min=0.393, mean=0.393, max=0.393, sum=0.787 (2)", - "tab": "Efficiency", - "score": 0.39336834455791275 - }, - "World Religions - # eval": { - "description": "min=171, mean=171, max=171, sum=342 (2)", - "tab": "General information", - "score": 171.0 - }, - "World Religions - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "World Religions - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "World Religions - # prompt tokens": { - "description": "min=304.474, mean=304.474, max=304.474, sum=608.947 (2)", - "tab": "General information", - "score": 304.4736842105263 - }, - "World Religions - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" - } - } - }, - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.565, - "details": { - "tab": "Efficiency" - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_mmlu/upstage/solar-pro-241126/1845eb8b-4c94-4d22-8771-012f7230dc62.json b/data/helm_mmlu/upstage/solar-pro-241126/1845eb8b-4c94-4d22-8771-012f7230dc62.json deleted file mode 100644 index 2c0cfc48a..000000000 --- a/data/helm_mmlu/upstage/solar-pro-241126/1845eb8b-4c94-4d22-8771-012f7230dc62.json +++ /dev/null @@ -1,3021 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/upstage_solar-pro-241126/1770835937.459157", - "retrieved_timestamp": "1770835937.459157", - "source_metadata": { - "source_name": "helm_mmlu", - "source_type": 
"documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Solar Pro", - "id": "upstage/solar-pro-241126", - "developer": "upstage", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "MMLU All Subjects", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU All Subjects", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.776, - "details": { - "description": "min=0.44, mean=0.776, max=0.97, sum=88.521 (114)", - "tab": "Accuracy", - "MMLU All Subjects - Observed inference time (s)": { - "description": "min=0.415, mean=0.5, max=1.447, sum=56.972 (114)", - "tab": "Efficiency", - "score": 0.4997569605932576 - }, - "MMLU All Subjects - # eval": { - "description": "min=100, mean=246.351, max=1534, sum=28084 (114)", - "tab": "General information", - "score": 246.35087719298247 - }, - "MMLU All Subjects - # train": { - "description": "min=5, mean=5, max=5, sum=570 (114)", - "tab": "General information", - "score": 5.0 - }, - "MMLU All Subjects - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (114)", - "tab": "General information", - "score": 0.0 - }, - "MMLU All Subjects - # prompt tokens": { - "description": "min=313.474, mean=715.682, max=3168.636, sum=81587.749 (114)", - "tab": "General information", - "score": 715.6820126388612 - }, - "MMLU All Subjects - # output tokens": { - "description": "min=1, mean=1, max=1, sum=114 (114)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - 
"mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] - } - } - }, - { - "evaluation_name": "Abstract Algebra", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Abstract Algebra", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.46, - "details": { - "description": "min=0.46, mean=0.46, max=0.46, sum=0.92 (2)", - "tab": "Accuracy", - "Abstract Algebra - Observed inference time (s)": { - "description": "min=0.471, mean=0.471, max=0.471, sum=0.941 (2)", - "tab": "Efficiency", - "score": 0.47064422845840453 - }, - "Abstract Algebra - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Abstract Algebra - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Abstract Algebra - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Abstract Algebra - # prompt tokens": { - "description": "min=406.65, mean=406.65, max=406.65, sum=813.3 (2)", - "tab": "General information", - "score": 406.65 - }, - "Abstract Algebra - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" - } - } - }, - { - "evaluation_name": "Anatomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Anatomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.719, - "details": { - 
"description": "min=0.719, mean=0.719, max=0.719, sum=1.437 (2)", - "tab": "Accuracy", - "Anatomy - Observed inference time (s)": { - "description": "min=0.526, mean=0.526, max=0.526, sum=1.052 (2)", - "tab": "Efficiency", - "score": 0.5261570206394902 - }, - "Anatomy - # eval": { - "description": "min=135, mean=135, max=135, sum=270 (2)", - "tab": "General information", - "score": 135.0 - }, - "Anatomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Anatomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Anatomy - # prompt tokens": { - "description": "min=418.133, mean=418.133, max=418.133, sum=836.267 (2)", - "tab": "General information", - "score": 418.1333333333333 - }, - "Anatomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" - } - } - }, - { - "evaluation_name": "College Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on College Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.559, - "details": { - "description": "min=0.559, mean=0.559, max=0.559, sum=1.118 (2)", - "tab": "Accuracy", - "College Chemistry - Observed inference time (s)": { - "description": "min=0.538, mean=0.538, max=0.538, sum=1.077 (2)", - "tab": "Efficiency", - "score": 0.5384537291526794 - }, - "College Biology - Observed inference time (s)": { - "description": "min=0.443, mean=0.443, max=0.443, sum=0.886 (2)", - "tab": "Efficiency", - "score": 0.44289560781584847 - }, - "College Computer Science - Observed inference time (s)": { - "description": "min=0.436, mean=0.436, max=0.436, sum=0.872 (2)", - "tab": "Efficiency", - "score": 0.4359678840637207 - }, - "College Mathematics - Observed inference time (s)": { - "description": "min=0.432, mean=0.432, max=0.432, sum=0.865 (2)", - "tab": "Efficiency", - "score": 0.4324680757522583 - }, - "College Medicine - Observed inference time (s)": { - "description": "min=0.435, mean=0.435, max=0.435, sum=0.869 (2)", - "tab": "Efficiency", - "score": 0.4347288250234086 - }, - "College Physics - Observed inference time (s)": { - "description": "min=0.432, mean=0.432, max=0.432, sum=0.863 (2)", - "tab": "Efficiency", - "score": 0.43169068121442605 - }, - "College Chemistry - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Chemistry - # prompt tokens": { - "description": "min=631.43, mean=631.43, max=631.43, sum=1262.86 (2)", - "tab": "General information", - "score": 631.43 - }, - "College Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - 
"score": 1.0 - }, - "College Biology - # eval": { - "description": "min=144, mean=144, max=144, sum=288 (2)", - "tab": "General information", - "score": 144.0 - }, - "College Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # prompt tokens": { - "description": "min=562.632, mean=562.632, max=562.632, sum=1125.264 (2)", - "tab": "General information", - "score": 562.6319444444445 - }, - "College Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer Science - # prompt tokens": { - "description": "min=910.14, mean=910.14, max=910.14, sum=1820.28 (2)", - "tab": "General information", - "score": 910.14 - }, - "College Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Mathematics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # prompt tokens": { - "description": "min=655.96, mean=655.96, max=655.96, sum=1311.92 (2)", - "tab": "General information", - "score": 655.96 - }, - "College Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Medicine - # eval": { - "description": "min=173, mean=173, max=173, sum=346 (2)", - "tab": "General information", - "score": 173.0 - }, - "College Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Medicine - # prompt tokens": { - "description": "min=617.671, mean=617.671, max=617.671, sum=1235.341 (2)", - "tab": "General information", - "score": 617.6705202312139 - }, - "College Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Physics - # eval": { - "description": "min=102, mean=102, max=102, sum=204 (2)", - "tab": "General information", - "score": 102.0 - }, - "College Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Physics - # prompt tokens": { - "description": 
"min=560.873, mean=560.873, max=560.873, sum=1121.745 (2)", - "tab": "General information", - "score": 560.8725490196078 - }, - "College Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" - } - } - }, - { - "evaluation_name": "Computer Security", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Computer Security", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.82, - "details": { - "description": "min=0.82, mean=0.82, max=0.82, sum=1.64 (2)", - "tab": "Accuracy", - "Computer Security - Observed inference time (s)": { - "description": "min=0.765, mean=0.765, max=0.765, sum=1.53 (2)", - "tab": "Efficiency", - "score": 0.7652230095863343 - }, - "Computer Security - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Computer Security - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Computer Security - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Computer Security - # prompt tokens": { - "description": "min=437.17, mean=437.17, max=437.17, sum=874.34 (2)", - "tab": "General information", - "score": 437.17 - }, - "Computer Security - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" - } - } - }, - { - "evaluation_name": "Econometrics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Econometrics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.605, - "details": { - "description": "min=0.605, mean=0.605, max=0.605, sum=1.211 (2)", - "tab": "Accuracy", - "Econometrics - Observed inference time (s)": { - "description": "min=0.429, mean=0.429, max=0.429, sum=0.858 (2)", - "tab": "Efficiency", - "score": 0.4288227077116046 - }, - "Econometrics - # eval": { - "description": "min=114, mean=114, max=114, sum=228 (2)", - "tab": "General information", - "score": 114.0 - }, - "Econometrics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Econometrics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Econometrics - # prompt tokens": { - "description": "min=693.675, mean=693.675, max=693.675, sum=1387.351 (2)", - "tab": "General information", - "score": 693.6754385964912 - }, - "Econometrics - # output tokens": { - 
"description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" - } - } - }, - { - "evaluation_name": "Global Facts", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Global Facts", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5, - "details": { - "description": "min=0.5, mean=0.5, max=0.5, sum=1 (2)", - "tab": "Accuracy", - "Global Facts - Observed inference time (s)": { - "description": "min=0.43, mean=0.43, max=0.43, sum=0.859 (2)", - "tab": "Efficiency", - "score": 0.4296323895454407 - }, - "Global Facts - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Global Facts - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Global Facts - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Global Facts - # prompt tokens": { - "description": "min=493.54, mean=493.54, max=493.54, sum=987.08 (2)", - "tab": "General information", - "score": 493.54 - }, - "Global Facts - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" - } - } - }, - { - "evaluation_name": "Jurisprudence", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Jurisprudence", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.898, - "details": { - "description": "min=0.898, mean=0.898, max=0.898, sum=1.796 (2)", - "tab": "Accuracy", - "Jurisprudence - Observed inference time (s)": { - "description": "min=0.421, mean=0.421, max=0.421, sum=0.841 (2)", - "tab": "Efficiency", - "score": 0.4206738162923742 - }, - "Jurisprudence - # eval": { - "description": "min=108, mean=108, max=108, sum=216 (2)", - "tab": "General information", - "score": 108.0 - }, - "Jurisprudence - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Jurisprudence - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Jurisprudence - # prompt tokens": { - "description": "min=458.898, mean=458.898, max=458.898, sum=917.796 (2)", - "tab": "General information", - "score": 458.89814814814815 - }, - "Jurisprudence - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", 
- "eval_split": "test", - "groups": "mmlu_jurisprudence" - } - } - }, - { - "evaluation_name": "Philosophy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Philosophy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.817, - "details": { - "description": "min=0.817, mean=0.817, max=0.817, sum=1.633 (2)", - "tab": "Accuracy", - "Philosophy - Observed inference time (s)": { - "description": "min=0.436, mean=0.436, max=0.436, sum=0.871 (2)", - "tab": "Efficiency", - "score": 0.43559602372516004 - }, - "Philosophy - # eval": { - "description": "min=311, mean=311, max=311, sum=622 (2)", - "tab": "General information", - "score": 311.0 - }, - "Philosophy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Philosophy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Philosophy - # prompt tokens": { - "description": "min=381.122, mean=381.122, max=381.122, sum=762.244 (2)", - "tab": "General information", - "score": 381.12218649517683 - }, - "Philosophy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" - } - } - }, - { - "evaluation_name": "Professional Psychology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Professional Psychology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.85, - "details": { - "description": "min=0.85, mean=0.85, max=0.85, sum=1.699 (2)", - "tab": "Accuracy", - "Professional Medicine - Observed inference time (s)": { - "description": "min=0.497, mean=0.497, max=0.497, sum=0.994 (2)", - "tab": "Efficiency", - "score": 0.4968351388678831 - }, - "Professional Accounting - Observed inference time (s)": { - "description": "min=0.459, mean=0.459, max=0.459, sum=0.917 (2)", - "tab": "Efficiency", - "score": 0.4586718564337872 - }, - "Professional Law - Observed inference time (s)": { - "description": "min=1.016, mean=1.016, max=1.016, sum=2.033 (2)", - "tab": "Efficiency", - "score": 1.016288014092377 - }, - "Professional Psychology - Observed inference time (s)": { - "description": "min=0.443, mean=0.443, max=0.443, sum=0.885 (2)", - "tab": "Efficiency", - "score": 0.4426119109384375 - }, - "Professional Medicine - # eval": { - "description": "min=272, mean=272, max=272, sum=544 (2)", - "tab": "General information", - "score": 272.0 - }, - "Professional Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Medicine - # prompt tokens": { - "description": 
"min=1339.647, mean=1339.647, max=1339.647, sum=2679.294 (2)", - "tab": "General information", - "score": 1339.6470588235295 - }, - "Professional Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Accounting - # eval": { - "description": "min=282, mean=282, max=282, sum=564 (2)", - "tab": "General information", - "score": 282.0 - }, - "Professional Accounting - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Accounting - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Accounting - # prompt tokens": { - "description": "min=832.277, mean=832.277, max=832.277, sum=1664.553 (2)", - "tab": "General information", - "score": 832.2765957446809 - }, - "Professional Accounting - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Law - # eval": { - "description": "min=1534, mean=1534, max=1534, sum=3068 (2)", - "tab": "General information", - "score": 1534.0 - }, - "Professional Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # prompt tokens": { - "description": "min=1924.007, mean=1924.007, max=1924.007, sum=3848.014 (2)", - "tab": "General information", - "score": 1924.0071707953064 - }, - "Professional Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Psychology - # eval": { - "description": "min=612, mean=612, max=612, sum=1224 (2)", - "tab": "General information", - "score": 612.0 - }, - "Professional Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # prompt tokens": { - "description": "min=659.078, mean=659.078, max=659.078, sum=1318.157 (2)", - "tab": "General information", - "score": 659.0784313725491 - }, - "Professional Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" - } - } - }, - { - "evaluation_name": "Us Foreign Policy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Us Foreign Policy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.97, - "details": { - "description": "min=0.97, mean=0.97, max=0.97, sum=1.94 (2)", - "tab": "Accuracy", - "Us Foreign Policy - Observed inference time (s)": { - "description": "min=0.441, mean=0.441, max=0.441, sum=0.882 (2)", - "tab": 
"Efficiency", - "score": 0.44084484577178956 - }, - "Us Foreign Policy - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Us Foreign Policy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Us Foreign Policy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Us Foreign Policy - # prompt tokens": { - "description": "min=488.81, mean=488.81, max=488.81, sum=977.62 (2)", - "tab": "General information", - "score": 488.81 - }, - "Us Foreign Policy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" - } - } - }, - { - "evaluation_name": "Astronomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Astronomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.868, - "details": { - "description": "min=0.868, mean=0.868, max=0.868, sum=1.737 (2)", - "tab": "Accuracy", - "Astronomy - Observed inference time (s)": { - "description": "min=0.446, mean=0.446, max=0.446, sum=0.892 (2)", - "tab": "Efficiency", - "score": 0.4461362079570168 - }, - "Astronomy - # eval": { - "description": "min=152, mean=152, max=152, sum=304 (2)", - "tab": "General information", - "score": 152.0 - }, - "Astronomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Astronomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Astronomy - # prompt tokens": { - "description": "min=690.079, mean=690.079, max=690.079, sum=1380.158 (2)", - "tab": "General information", - "score": 690.078947368421 - }, - "Astronomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" - } - } - }, - { - "evaluation_name": "Business Ethics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Business Ethics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8, - "details": { - "description": "min=0.8, mean=0.8, max=0.8, sum=1.6 (2)", - "tab": "Accuracy", - "Business Ethics - Observed inference time (s)": { - "description": "min=0.436, mean=0.436, max=0.436, sum=0.872 (2)", - "tab": "Efficiency", - "score": 0.4362391257286072 - }, - "Business Ethics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Business Ethics - 
# train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Business Ethics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Business Ethics - # prompt tokens": { - "description": "min=683.44, mean=683.44, max=683.44, sum=1366.88 (2)", - "tab": "General information", - "score": 683.44 - }, - "Business Ethics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" - } - } - }, - { - "evaluation_name": "Clinical Knowledge", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Clinical Knowledge", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.808, - "details": { - "description": "min=0.808, mean=0.808, max=0.808, sum=1.615 (2)", - "tab": "Accuracy", - "Clinical Knowledge - Observed inference time (s)": { - "description": "min=0.427, mean=0.427, max=0.427, sum=0.855 (2)", - "tab": "Efficiency", - "score": 0.42739290561316146 - }, - "Clinical Knowledge - # eval": { - "description": "min=265, mean=265, max=265, sum=530 (2)", - "tab": "General information", - "score": 265.0 - }, - "Clinical Knowledge - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Clinical Knowledge - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Clinical Knowledge - # prompt tokens": { - "description": "min=496.374, mean=496.374, max=496.374, sum=992.747 (2)", - "tab": "General information", - "score": 496.3735849056604 - }, - "Clinical Knowledge - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" - } - } - }, - { - "evaluation_name": "Conceptual Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Conceptual Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.826, - "details": { - "description": "min=0.826, mean=0.826, max=0.826, sum=1.651 (2)", - "tab": "Accuracy", - "Conceptual Physics - Observed inference time (s)": { - "description": "min=0.452, mean=0.452, max=0.452, sum=0.904 (2)", - "tab": "Efficiency", - "score": 0.4520118307560048 - }, - "Conceptual Physics - # eval": { - "description": "min=235, mean=235, max=235, sum=470 (2)", - "tab": "General information", - "score": 235.0 - }, - "Conceptual Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - 
}, - "Conceptual Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Conceptual Physics - # prompt tokens": { - "description": "min=342.153, mean=342.153, max=342.153, sum=684.306 (2)", - "tab": "General information", - "score": 342.1531914893617 - }, - "Conceptual Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" - } - } - }, - { - "evaluation_name": "Electrical Engineering", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Electrical Engineering", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.697, - "details": { - "description": "min=0.697, mean=0.697, max=0.697, sum=1.393 (2)", - "tab": "Accuracy", - "Electrical Engineering - Observed inference time (s)": { - "description": "min=0.487, mean=0.487, max=0.487, sum=0.974 (2)", - "tab": "Efficiency", - "score": 0.4870024582435345 - }, - "Electrical Engineering - # eval": { - "description": "min=145, mean=145, max=145, sum=290 (2)", - "tab": "General information", - "score": 145.0 - }, - "Electrical Engineering - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Electrical Engineering - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Electrical Engineering - # prompt tokens": { - "description": "min=506.779, mean=506.779, max=506.779, sum=1013.559 (2)", - "tab": "General information", - "score": 506.7793103448276 - }, - "Electrical Engineering - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" - } - } - }, - { - "evaluation_name": "Elementary Mathematics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Elementary Mathematics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.611, - "details": { - "description": "min=0.611, mean=0.611, max=0.611, sum=1.222 (2)", - "tab": "Accuracy", - "Elementary Mathematics - Observed inference time (s)": { - "description": "min=0.457, mean=0.457, max=0.457, sum=0.915 (2)", - "tab": "Efficiency", - "score": 0.4574742739793485 - }, - "Elementary Mathematics - # eval": { - "description": "min=378, mean=378, max=378, sum=756 (2)", - "tab": "General information", - "score": 378.0 - }, - "Elementary Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Elementary 
Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Elementary Mathematics - # prompt tokens": { - "description": "min=618.156, mean=618.156, max=618.156, sum=1236.312 (2)", - "tab": "General information", - "score": 618.1560846560847 - }, - "Elementary Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" - } - } - }, - { - "evaluation_name": "Formal Logic", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Formal Logic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.579, - "details": { - "description": "min=0.579, mean=0.579, max=0.579, sum=1.159 (2)", - "tab": "Accuracy", - "Formal Logic - Observed inference time (s)": { - "description": "min=0.445, mean=0.445, max=0.445, sum=0.889 (2)", - "tab": "Efficiency", - "score": 0.44462628780849395 - }, - "Formal Logic - # eval": { - "description": "min=126, mean=126, max=126, sum=252 (2)", - "tab": "General information", - "score": 126.0 - }, - "Formal Logic - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Formal Logic - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Formal Logic - # prompt tokens": { - "description": "min=700.81, mean=700.81, max=700.81, sum=1401.619 (2)", - "tab": "General information", - "score": 700.8095238095239 - }, - "Formal Logic - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" - } - } - }, - { - "evaluation_name": "High School World History", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on High School World History", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.907, - "details": { - "description": "min=0.907, mean=0.907, max=0.907, sum=1.814 (2)", - "tab": "Accuracy", - "High School Biology - Observed inference time (s)": { - "description": "min=0.44, mean=0.44, max=0.44, sum=0.879 (2)", - "tab": "Efficiency", - "score": 0.4396143251849759 - }, - "High School Chemistry - Observed inference time (s)": { - "description": "min=0.433, mean=0.433, max=0.433, sum=0.865 (2)", - "tab": "Efficiency", - "score": 0.4325766810055437 - }, - "High School Computer Science - Observed inference time (s)": { - "description": "min=0.444, mean=0.444, max=0.444, sum=0.887 (2)", - "tab": "Efficiency", - "score": 0.4435269355773926 - }, - "High School European History - 
Observed inference time (s)": { - "description": "min=1.447, mean=1.447, max=1.447, sum=2.894 (2)", - "tab": "Efficiency", - "score": 1.44696401682767 - }, - "High School Geography - Observed inference time (s)": { - "description": "min=0.427, mean=0.427, max=0.427, sum=0.854 (2)", - "tab": "Efficiency", - "score": 0.4269573845044531 - }, - "High School Government And Politics - Observed inference time (s)": { - "description": "min=0.442, mean=0.442, max=0.442, sum=0.885 (2)", - "tab": "Efficiency", - "score": 0.4422582035855308 - }, - "High School Macroeconomics - Observed inference time (s)": { - "description": "min=0.423, mean=0.423, max=0.423, sum=0.846 (2)", - "tab": "Efficiency", - "score": 0.4230540263347137 - }, - "High School Mathematics - Observed inference time (s)": { - "description": "min=0.438, mean=0.438, max=0.438, sum=0.877 (2)", - "tab": "Efficiency", - "score": 0.4383223215738932 - }, - "High School Microeconomics - Observed inference time (s)": { - "description": "min=0.443, mean=0.443, max=0.443, sum=0.887 (2)", - "tab": "Efficiency", - "score": 0.4434382264353648 - }, - "High School Physics - Observed inference time (s)": { - "description": "min=0.448, mean=0.448, max=0.448, sum=0.896 (2)", - "tab": "Efficiency", - "score": 0.4479467000392889 - }, - "High School Psychology - Observed inference time (s)": { - "description": "min=0.438, mean=0.438, max=0.438, sum=0.876 (2)", - "tab": "Efficiency", - "score": 0.43786543006197026 - }, - "High School Statistics - Observed inference time (s)": { - "description": "min=0.454, mean=0.454, max=0.454, sum=0.907 (2)", - "tab": "Efficiency", - "score": 0.45358082431334035 - }, - "High School US History - Observed inference time (s)": { - "description": "min=1.14, mean=1.14, max=1.14, sum=2.28 (2)", - "tab": "Efficiency", - "score": 1.13988286373662 - }, - "High School World History - Observed inference time (s)": { - "description": "min=0.833, mean=0.833, max=0.833, sum=1.666 (2)", - "tab": "Efficiency", - "score": 0.8329467803617067 - }, - "High School Biology - # eval": { - "description": "min=310, mean=310, max=310, sum=620 (2)", - "tab": "General information", - "score": 310.0 - }, - "High School Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Biology - # prompt tokens": { - "description": "min=605.894, mean=605.894, max=605.894, sum=1211.787 (2)", - "tab": "General information", - "score": 605.8935483870968 - }, - "High School Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Chemistry - # eval": { - "description": "min=203, mean=203, max=203, sum=406 (2)", - "tab": "General information", - "score": 203.0 - }, - "High School Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # prompt tokens": { - "description": "min=577.665, mean=577.665, max=577.665, sum=1155.33 (2)", - "tab": "General information", - "score": 577.6650246305419 - }, - "High School Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General 
information", - "score": 1.0 - }, - "High School Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "High School Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # prompt tokens": { - "description": "min=997.57, mean=997.57, max=997.57, sum=1995.14 (2)", - "tab": "General information", - "score": 997.57 - }, - "High School Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School European History - # eval": { - "description": "min=165, mean=165, max=165, sum=330 (2)", - "tab": "General information", - "score": 165.0 - }, - "High School European History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School European History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School European History - # prompt tokens": { - "description": "min=3168.636, mean=3168.636, max=3168.636, sum=6337.273 (2)", - "tab": "General information", - "score": 3168.6363636363635 - }, - "High School European History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Geography - # eval": { - "description": "min=198, mean=198, max=198, sum=396 (2)", - "tab": "General information", - "score": 198.0 - }, - "High School Geography - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Geography - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Geography - # prompt tokens": { - "description": "min=445.657, mean=445.657, max=445.657, sum=891.313 (2)", - "tab": "General information", - "score": 445.65656565656565 - }, - "High School Geography - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Government And Politics - # eval": { - "description": "min=193, mean=193, max=193, sum=386 (2)", - "tab": "General information", - "score": 193.0 - }, - "High School Government And Politics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Government And Politics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # prompt tokens": { - "description": "min=536.927, mean=536.927, max=536.927, sum=1073.855 (2)", - "tab": "General information", - "score": 536.9274611398964 - }, - "High School Government And Politics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Macroeconomics - # eval": { - "description": "min=390, mean=390, max=390, sum=780 (2)", - "tab": "General information", - "score": 390.0 - }, - "High School Macroeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": 
"General information", - "score": 5.0 - }, - "High School Macroeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - # prompt tokens": { - "description": "min=454.662, mean=454.662, max=454.662, sum=909.323 (2)", - "tab": "General information", - "score": 454.66153846153844 - }, - "High School Macroeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Mathematics - # eval": { - "description": "min=270, mean=270, max=270, sum=540 (2)", - "tab": "General information", - "score": 270.0 - }, - "High School Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # prompt tokens": { - "description": "min=588.181, mean=588.181, max=588.181, sum=1176.363 (2)", - "tab": "General information", - "score": 588.1814814814815 - }, - "High School Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Microeconomics - # eval": { - "description": "min=238, mean=238, max=238, sum=476 (2)", - "tab": "General information", - "score": 238.0 - }, - "High School Microeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Microeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Microeconomics - # prompt tokens": { - "description": "min=458.492, mean=458.492, max=458.492, sum=916.983 (2)", - "tab": "General information", - "score": 458.49159663865544 - }, - "High School Microeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Physics - # eval": { - "description": "min=151, mean=151, max=151, sum=302 (2)", - "tab": "General information", - "score": 151.0 - }, - "High School Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # prompt tokens": { - "description": "min=630.788, mean=630.788, max=630.788, sum=1261.576 (2)", - "tab": "General information", - "score": 630.7880794701987 - }, - "High School Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Psychology - # eval": { - "description": "min=545, mean=545, max=545, sum=1090 (2)", - "tab": "General information", - "score": 545.0 - }, - "High School Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # prompt tokens": { - "description": "min=594.919, mean=594.919, max=594.919, sum=1189.839 (2)", - "tab": "General information", - "score": 594.9192660550459 - }, - 
"High School Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Statistics - # eval": { - "description": "min=216, mean=216, max=216, sum=432 (2)", - "tab": "General information", - "score": 216.0 - }, - "High School Statistics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Statistics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Statistics - # prompt tokens": { - "description": "min=917.208, mean=917.208, max=917.208, sum=1834.417 (2)", - "tab": "General information", - "score": 917.2083333333334 - }, - "High School Statistics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School US History - # eval": { - "description": "min=204, mean=204, max=204, sum=408 (2)", - "tab": "General information", - "score": 204.0 - }, - "High School US History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School US History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # prompt tokens": { - "description": "min=2544.324, mean=2544.324, max=2544.324, sum=5088.647 (2)", - "tab": "General information", - "score": 2544.323529411765 - }, - "High School US History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School World History - # eval": { - "description": "min=237, mean=237, max=237, sum=474 (2)", - "tab": "General information", - "score": 237.0 - }, - "High School World History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School World History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School World History - # prompt tokens": { - "description": "min=1647.219, mean=1647.219, max=1647.219, sum=3294.439 (2)", - "tab": "General information", - "score": 1647.2194092827003 - }, - "High School World History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" - } - } - }, - { - "evaluation_name": "Human Sexuality", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Human Sexuality", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.847, - "details": { - "description": "min=0.847, mean=0.847, max=0.847, sum=1.695 (2)", - "tab": "Accuracy", - "Human Aging - Observed inference time (s)": { - "description": "min=0.436, mean=0.436, max=0.436, sum=0.873 (2)", - "tab": "Efficiency", - "score": 0.43635595539760164 - }, - "Human Sexuality - Observed inference time (s)": 
{ - "description": "min=0.434, mean=0.434, max=0.434, sum=0.869 (2)", - "tab": "Efficiency", - "score": 0.4343654235810724 - }, - "Human Aging - # eval": { - "description": "min=223, mean=223, max=223, sum=446 (2)", - "tab": "General information", - "score": 223.0 - }, - "Human Aging - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Aging - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Aging - # prompt tokens": { - "description": "min=370.26, mean=370.26, max=370.26, sum=740.52 (2)", - "tab": "General information", - "score": 370.26008968609864 - }, - "Human Aging - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Human Sexuality - # eval": { - "description": "min=131, mean=131, max=131, sum=262 (2)", - "tab": "General information", - "score": 131.0 - }, - "Human Sexuality - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Sexuality - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # prompt tokens": { - "description": "min=412.382, mean=412.382, max=412.382, sum=824.763 (2)", - "tab": "General information", - "score": 412.381679389313 - }, - "Human Sexuality - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" - } - } - }, - { - "evaluation_name": "International Law", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on International Law", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.901, - "details": { - "description": "min=0.901, mean=0.901, max=0.901, sum=1.802 (2)", - "tab": "Accuracy", - "International Law - Observed inference time (s)": { - "description": "min=0.461, mean=0.461, max=0.461, sum=0.922 (2)", - "tab": "Efficiency", - "score": 0.46112686346385107 - }, - "International Law - # eval": { - "description": "min=121, mean=121, max=121, sum=242 (2)", - "tab": "General information", - "score": 121.0 - }, - "International Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "International Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "International Law - # prompt tokens": { - "description": "min=738.463, mean=738.463, max=738.463, sum=1476.926 (2)", - "tab": "General information", - "score": 738.4628099173553 - }, - "International Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" - } - } - }, - { - 
"evaluation_name": "Logical Fallacies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Logical Fallacies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.865, - "details": { - "description": "min=0.865, mean=0.865, max=0.865, sum=1.73 (2)", - "tab": "Accuracy", - "Logical Fallacies - Observed inference time (s)": { - "description": "min=0.45, mean=0.45, max=0.45, sum=0.9 (2)", - "tab": "Efficiency", - "score": 0.44979269080366824 - }, - "Logical Fallacies - # eval": { - "description": "min=163, mean=163, max=163, sum=326 (2)", - "tab": "General information", - "score": 163.0 - }, - "Logical Fallacies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Logical Fallacies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Logical Fallacies - # prompt tokens": { - "description": "min=511.755, mean=511.755, max=511.755, sum=1023.509 (2)", - "tab": "General information", - "score": 511.7546012269939 - }, - "Logical Fallacies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" - } - } - }, - { - "evaluation_name": "Machine Learning", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Machine Learning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.616, - "details": { - "description": "min=0.616, mean=0.616, max=0.616, sum=1.232 (2)", - "tab": "Accuracy", - "Machine Learning - Observed inference time (s)": { - "description": "min=0.466, mean=0.466, max=0.466, sum=0.932 (2)", - "tab": "Efficiency", - "score": 0.46596066866602215 - }, - "Machine Learning - # eval": { - "description": "min=112, mean=112, max=112, sum=224 (2)", - "tab": "General information", - "score": 112.0 - }, - "Machine Learning - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Machine Learning - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Machine Learning - # prompt tokens": { - "description": "min=739.402, mean=739.402, max=739.402, sum=1478.804 (2)", - "tab": "General information", - "score": 739.4017857142857 - }, - "Machine Learning - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" - } - } - }, - { - "evaluation_name": "Management", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ 
- "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Management", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.864, - "details": { - "description": "min=0.864, mean=0.864, max=0.864, sum=1.728 (2)", - "tab": "Accuracy", - "Management - Observed inference time (s)": { - "description": "min=0.439, mean=0.439, max=0.439, sum=0.878 (2)", - "tab": "Efficiency", - "score": 0.43890966720951413 - }, - "Management - # eval": { - "description": "min=103, mean=103, max=103, sum=206 (2)", - "tab": "General information", - "score": 103.0 - }, - "Management - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Management - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Management - # prompt tokens": { - "description": "min=324.777, mean=324.777, max=324.777, sum=649.553 (2)", - "tab": "General information", - "score": 324.77669902912623 - }, - "Management - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" - } - } - }, - { - "evaluation_name": "Marketing", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Marketing", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.953, - "details": { - "description": "min=0.953, mean=0.953, max=0.953, sum=1.906 (2)", - "tab": "Accuracy", - "Marketing - Observed inference time (s)": { - "description": "min=0.443, mean=0.443, max=0.443, sum=0.885 (2)", - "tab": "Efficiency", - "score": 0.4425381727707692 - }, - "Marketing - # eval": { - "description": "min=234, mean=234, max=234, sum=468 (2)", - "tab": "General information", - "score": 234.0 - }, - "Marketing - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Marketing - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Marketing - # prompt tokens": { - "description": "min=481.628, mean=481.628, max=481.628, sum=963.256 (2)", - "tab": "General information", - "score": 481.62820512820514 - }, - "Marketing - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" - } - } - }, - { - "evaluation_name": "Medical Genetics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Medical Genetics", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.91, - "details": { - "description": "min=0.91, mean=0.91, max=0.91, sum=1.82 (2)", - "tab": "Accuracy", - "Medical Genetics - Observed inference time (s)": { - "description": "min=0.436, mean=0.436, max=0.436, sum=0.872 (2)", - "tab": "Efficiency", - "score": 0.43624018907546996 - }, - "Medical Genetics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Medical Genetics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Medical Genetics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Medical Genetics - # prompt tokens": { - "description": "min=417.14, mean=417.14, max=417.14, sum=834.28 (2)", - "tab": "General information", - "score": 417.14 - }, - "Medical Genetics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" - } - } - }, - { - "evaluation_name": "Miscellaneous", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Miscellaneous", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.888, - "details": { - "description": "min=0.888, mean=0.888, max=0.888, sum=1.775 (2)", - "tab": "Accuracy", - "Miscellaneous - Observed inference time (s)": { - "description": "min=0.434, mean=0.434, max=0.434, sum=0.868 (2)", - "tab": "Efficiency", - "score": 0.4337884417591119 - }, - "Miscellaneous - # eval": { - "description": "min=783, mean=783, max=783, sum=1566 (2)", - "tab": "General information", - "score": 783.0 - }, - "Miscellaneous - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Miscellaneous - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Miscellaneous - # prompt tokens": { - "description": "min=354.913, mean=354.913, max=354.913, sum=709.826 (2)", - "tab": "General information", - "score": 354.9131545338442 - }, - "Miscellaneous - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" - } - } - }, - { - "evaluation_name": "Moral Scenarios", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Moral Scenarios", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.811, - "details": { - "description": "min=0.811, mean=0.811, max=0.811, sum=1.622 
(2)", - "tab": "Accuracy", - "Moral Disputes - Observed inference time (s)": { - "description": "min=0.454, mean=0.454, max=0.454, sum=0.908 (2)", - "tab": "Efficiency", - "score": 0.4541343209371401 - }, - "Moral Scenarios - Observed inference time (s)": { - "description": "min=0.452, mean=0.452, max=0.452, sum=0.905 (2)", - "tab": "Efficiency", - "score": 0.4522555020934377 - }, - "Moral Disputes - # eval": { - "description": "min=346, mean=346, max=346, sum=692 (2)", - "tab": "General information", - "score": 346.0 - }, - "Moral Disputes - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Disputes - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Disputes - # prompt tokens": { - "description": "min=551.506, mean=551.506, max=551.506, sum=1103.012 (2)", - "tab": "General information", - "score": 551.5057803468208 - }, - "Moral Disputes - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Moral Scenarios - # eval": { - "description": "min=895, mean=895, max=895, sum=1790 (2)", - "tab": "General information", - "score": 895.0 - }, - "Moral Scenarios - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Scenarios - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # prompt tokens": { - "description": "min=765.479, mean=765.479, max=765.479, sum=1530.959 (2)", - "tab": "General information", - "score": 765.4793296089385 - }, - "Moral Scenarios - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" - } - } - }, - { - "evaluation_name": "Nutrition", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Nutrition", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.859, - "details": { - "description": "min=0.859, mean=0.859, max=0.859, sum=1.719 (2)", - "tab": "Accuracy", - "Nutrition - Observed inference time (s)": { - "description": "min=0.469, mean=0.469, max=0.469, sum=0.937 (2)", - "tab": "Efficiency", - "score": 0.46850453872306674 - }, - "Nutrition - # eval": { - "description": "min=306, mean=306, max=306, sum=612 (2)", - "tab": "General information", - "score": 306.0 - }, - "Nutrition - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Nutrition - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Nutrition - # prompt tokens": { - "description": "min=704.922, mean=704.922, max=704.922, sum=1409.843 (2)", - "tab": "General information", - "score": 704.9215686274509 - }, - "Nutrition - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - 
"generation_config": { - "additional_details": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" - } - } - }, - { - "evaluation_name": "Prehistory", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Prehistory", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.867, - "details": { - "description": "min=0.867, mean=0.867, max=0.867, sum=1.735 (2)", - "tab": "Accuracy", - "Prehistory - Observed inference time (s)": { - "description": "min=0.459, mean=0.459, max=0.459, sum=0.919 (2)", - "tab": "Efficiency", - "score": 0.45942840973536175 - }, - "Prehistory - # eval": { - "description": "min=324, mean=324, max=324, sum=648 (2)", - "tab": "General information", - "score": 324.0 - }, - "Prehistory - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Prehistory - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Prehistory - # prompt tokens": { - "description": "min=628.185, mean=628.185, max=628.185, sum=1256.37 (2)", - "tab": "General information", - "score": 628.1851851851852 - }, - "Prehistory - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" - } - } - }, - { - "evaluation_name": "Public Relations", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Public Relations", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.764, - "details": { - "description": "min=0.764, mean=0.764, max=0.764, sum=1.527 (2)", - "tab": "Accuracy", - "Public Relations - Observed inference time (s)": { - "description": "min=0.424, mean=0.424, max=0.424, sum=0.848 (2)", - "tab": "Efficiency", - "score": 0.4240685766393488 - }, - "Public Relations - # eval": { - "description": "min=110, mean=110, max=110, sum=220 (2)", - "tab": "General information", - "score": 110.0 - }, - "Public Relations - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Public Relations - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Public Relations - # prompt tokens": { - "description": "min=483.827, mean=483.827, max=483.827, sum=967.655 (2)", - "tab": "General information", - "score": 483.8272727272727 - }, - "Public Relations - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" - } - } - }, - { - 
"evaluation_name": "Security Studies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Security Studies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.82, - "details": { - "description": "min=0.82, mean=0.82, max=0.82, sum=1.641 (2)", - "tab": "Accuracy", - "Security Studies - Observed inference time (s)": { - "description": "min=0.5, mean=0.5, max=0.5, sum=1.001 (2)", - "tab": "Efficiency", - "score": 0.500300864784085 - }, - "Security Studies - # eval": { - "description": "min=245, mean=245, max=245, sum=490 (2)", - "tab": "General information", - "score": 245.0 - }, - "Security Studies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Security Studies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Security Studies - # prompt tokens": { - "description": "min=1386.531, mean=1386.531, max=1386.531, sum=2773.061 (2)", - "tab": "General information", - "score": 1386.530612244898 - }, - "Security Studies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" - } - } - }, - { - "evaluation_name": "Sociology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Sociology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.886, - "details": { - "description": "min=0.886, mean=0.886, max=0.886, sum=1.771 (2)", - "tab": "Accuracy", - "Sociology - Observed inference time (s)": { - "description": "min=0.44, mean=0.44, max=0.44, sum=0.879 (2)", - "tab": "Efficiency", - "score": 0.4395348717324176 - }, - "Sociology - # eval": { - "description": "min=201, mean=201, max=201, sum=402 (2)", - "tab": "General information", - "score": 201.0 - }, - "Sociology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Sociology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Sociology - # prompt tokens": { - "description": "min=517.478, mean=517.478, max=517.478, sum=1034.955 (2)", - "tab": "General information", - "score": 517.4776119402985 - }, - "Sociology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" - } - } - }, - { - "evaluation_name": "Virology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Virology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.572, - "details": { - "description": "min=0.572, mean=0.572, max=0.572, sum=1.145 (2)", - "tab": "Accuracy", - "Virology - Observed inference time (s)": { - "description": "min=0.426, mean=0.426, max=0.426, sum=0.852 (2)", - "tab": "Efficiency", - "score": 0.4260225296020508 - }, - "Virology - # eval": { - "description": "min=166, mean=166, max=166, sum=332 (2)", - "tab": "General information", - "score": 166.0 - }, - "Virology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Virology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Virology - # prompt tokens": { - "description": "min=414.108, mean=414.108, max=414.108, sum=828.217 (2)", - "tab": "General information", - "score": 414.10843373493975 - }, - "Virology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" - } - } - }, - { - "evaluation_name": "World Religions", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on World Religions", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.883, - "details": { - "description": "min=0.883, mean=0.883, max=0.883, sum=1.766 (2)", - "tab": "Accuracy", - "World Religions - Observed inference time (s)": { - "description": "min=0.415, mean=0.415, max=0.415, sum=0.83 (2)", - "tab": "Efficiency", - "score": 0.41479549212762484 - }, - "World Religions - # eval": { - "description": "min=171, mean=171, max=171, sum=342 (2)", - "tab": "General information", - "score": 171.0 - }, - "World Religions - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "World Religions - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "World Religions - # prompt tokens": { - "description": "min=313.474, mean=313.474, max=313.474, sum=626.947 (2)", - "tab": "General information", - "score": 313.4736842105263 - }, - "World Religions - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" - } - } - }, - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model 
outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.462, - "details": { - "tab": "Efficiency" - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_mmlu/writer/palmyra-x-004/b2c8cfd1-f09a-4616-8038-c7e1930bce74.json b/data/helm_mmlu/writer/palmyra-x-004/b2c8cfd1-f09a-4616-8038-c7e1930bce74.json deleted file mode 100644 index c204b253d..000000000 --- a/data/helm_mmlu/writer/palmyra-x-004/b2c8cfd1-f09a-4616-8038-c7e1930bce74.json +++ /dev/null @@ -1,3021 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/writer_palmyra-x-004/1770835937.459157", - "retrieved_timestamp": "1770835937.459157", - "source_metadata": { - "source_name": "helm_mmlu", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Palmyra-X-004", - "id": "writer/palmyra-x-004", - "developer": "writer", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "MMLU All Subjects", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on MMLU All Subjects", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.813, - "details": { - "description": "min=0.52, mean=0.813, max=0.959, sum=92.659 (114)", - "tab": "Accuracy", - "MMLU All Subjects - Observed inference time (s)": { - "description": "min=0.298, mean=0.535, max=2.946, sum=60.962 (114)", - "tab": "Efficiency", - "score": 0.5347547453538 - }, - "MMLU All Subjects - # eval": { - "description": "min=100, mean=246.351, max=1534, sum=28084 (114)", - "tab": "General information", - "score": 246.35087719298247 - }, - "MMLU All Subjects - # train": { - "description": "min=5, mean=5, max=5, sum=570 (114)", - "tab": "General information", - "score": 5.0 - }, - "MMLU All Subjects - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (114)", - "tab": "General information", - "score": 0.0 - }, - "MMLU All Subjects - # prompt tokens": { - "description": "min=274.52, mean=614.619, max=2797.885, sum=70066.61 (114)", - "tab": "General information", - "score": 614.6193817308517 - }, - "MMLU All Subjects - # output tokens": { - "description": "min=0.968, mean=0.991, max=1, sum=112.995 (114)", - "tab": "General information", - "score": 0.9911842955118555 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - 
"high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - "mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] - } - } - }, - { - "evaluation_name": "Abstract Algebra", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Abstract Algebra", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.75, - "details": { - "description": "min=0.75, mean=0.75, max=0.75, sum=1.5 (2)", - "tab": "Accuracy", - "Abstract Algebra - Observed inference time (s)": { - "description": "min=0.722, mean=0.722, max=0.722, sum=1.444 (2)", - "tab": "Efficiency", - "score": 0.7220739269256592 - }, - "Abstract Algebra - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Abstract Algebra - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Abstract Algebra - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Abstract Algebra - # prompt tokens": { - "description": "min=373.43, 
mean=373.43, max=373.43, sum=746.86 (2)", - "tab": "General information", - "score": 373.43 - }, - "Abstract Algebra - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" - } - } - }, - { - "evaluation_name": "Anatomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Anatomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.822, - "details": { - "description": "min=0.822, mean=0.822, max=0.822, sum=1.644 (2)", - "tab": "Accuracy", - "Anatomy - Observed inference time (s)": { - "description": "min=0.323, mean=0.323, max=0.323, sum=0.646 (2)", - "tab": "Efficiency", - "score": 0.3229873922136095 - }, - "Anatomy - # eval": { - "description": "min=135, mean=135, max=135, sum=270 (2)", - "tab": "General information", - "score": 135.0 - }, - "Anatomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Anatomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Anatomy - # prompt tokens": { - "description": "min=353.874, mean=353.874, max=353.874, sum=707.748 (2)", - "tab": "General information", - "score": 353.8740740740741 - }, - "Anatomy - # output tokens": { - "description": "min=0.993, mean=0.993, max=0.993, sum=1.985 (2)", - "tab": "General information", - "score": 0.9925925925925926 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" - } - } - }, - { - "evaluation_name": "College Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on College Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.647, - "details": { - "description": "min=0.647, mean=0.647, max=0.647, sum=1.294 (2)", - "tab": "Accuracy", - "College Chemistry - Observed inference time (s)": { - "description": "min=0.316, mean=0.316, max=0.316, sum=0.632 (2)", - "tab": "Efficiency", - "score": 0.316190505027771 - }, - "College Biology - Observed inference time (s)": { - "description": "min=2.087, mean=2.087, max=2.087, sum=4.175 (2)", - "tab": "Efficiency", - "score": 2.0873730795250998 - }, - "College Computer Science - Observed inference time (s)": { - "description": "min=1.575, mean=1.575, max=1.575, sum=3.15 (2)", - "tab": "Efficiency", - "score": 1.574983057975769 - }, - "College Mathematics - Observed inference time (s)": { - "description": "min=1.58, mean=1.58, max=1.58, sum=3.16 (2)", - "tab": "Efficiency", - "score": 1.5799101972579956 - }, - "College Medicine - Observed inference time (s)": { - "description": "min=1.786, mean=1.786, max=1.786, sum=3.572 (2)", - "tab": "Efficiency", - "score": 
1.786004883705536 - }, - "College Physics - Observed inference time (s)": { - "description": "min=1.112, mean=1.112, max=1.112, sum=2.225 (2)", - "tab": "Efficiency", - "score": 1.1123062372207642 - }, - "College Chemistry - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Chemistry - # prompt tokens": { - "description": "min=549.28, mean=549.28, max=549.28, sum=1098.56 (2)", - "tab": "General information", - "score": 549.28 - }, - "College Chemistry - # output tokens": { - "description": "min=0.97, mean=0.97, max=0.97, sum=1.94 (2)", - "tab": "General information", - "score": 0.97 - }, - "College Biology - # eval": { - "description": "min=144, mean=144, max=144, sum=288 (2)", - "tab": "General information", - "score": 144.0 - }, - "College Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # prompt tokens": { - "description": "min=473.875, mean=473.875, max=473.875, sum=947.75 (2)", - "tab": "General information", - "score": 473.875 - }, - "College Biology - # output tokens": { - "description": "min=0.993, mean=0.993, max=0.993, sum=1.986 (2)", - "tab": "General information", - "score": 0.9930555555555556 - }, - "College Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer Science - # prompt tokens": { - "description": "min=828.29, mean=828.29, max=828.29, sum=1656.58 (2)", - "tab": "General information", - "score": 828.29 - }, - "College Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Mathematics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # prompt tokens": { - "description": "min=594.51, mean=594.51, max=594.51, sum=1189.02 (2)", - "tab": "General information", - "score": 594.51 - }, - "College Mathematics - # output tokens": { - "description": "min=0.98, mean=0.98, max=0.98, sum=1.96 (2)", - "tab": "General information", - "score": 0.98 - }, - "College Medicine - # eval": { - "description": "min=173, mean=173, max=173, sum=346 (2)", - "tab": "General information", - "score": 173.0 - }, - "College Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - 
"score": 5.0 - }, - "College Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Medicine - # prompt tokens": { - "description": "min=502.705, mean=502.705, max=502.705, sum=1005.41 (2)", - "tab": "General information", - "score": 502.70520231213874 - }, - "College Medicine - # output tokens": { - "description": "min=0.994, mean=0.994, max=0.994, sum=1.988 (2)", - "tab": "General information", - "score": 0.9942196531791907 - }, - "College Physics - # eval": { - "description": "min=102, mean=102, max=102, sum=204 (2)", - "tab": "General information", - "score": 102.0 - }, - "College Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Physics - # prompt tokens": { - "description": "min=503.569, mean=503.569, max=503.569, sum=1007.137 (2)", - "tab": "General information", - "score": 503.5686274509804 - }, - "College Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" - } - } - }, - { - "evaluation_name": "Computer Security", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Computer Security", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.82, - "details": { - "description": "min=0.82, mean=0.82, max=0.82, sum=1.64 (2)", - "tab": "Accuracy", - "Computer Security - Observed inference time (s)": { - "description": "min=0.309, mean=0.309, max=0.309, sum=0.618 (2)", - "tab": "Efficiency", - "score": 0.3091639161109924 - }, - "Computer Security - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Computer Security - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Computer Security - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Computer Security - # prompt tokens": { - "description": "min=378.51, mean=378.51, max=378.51, sum=757.02 (2)", - "tab": "General information", - "score": 378.51 - }, - "Computer Security - # output tokens": { - "description": "min=0.99, mean=0.99, max=0.99, sum=1.98 (2)", - "tab": "General information", - "score": 0.99 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" - } - } - }, - { - "evaluation_name": "Econometrics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Econometrics", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.684, - "details": { - "description": "min=0.684, mean=0.684, max=0.684, sum=1.368 (2)", - "tab": "Accuracy", - "Econometrics - Observed inference time (s)": { - "description": "min=0.322, mean=0.322, max=0.322, sum=0.644 (2)", - "tab": "Efficiency", - "score": 0.32210456070147064 - }, - "Econometrics - # eval": { - "description": "min=114, mean=114, max=114, sum=228 (2)", - "tab": "General information", - "score": 114.0 - }, - "Econometrics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Econometrics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Econometrics - # prompt tokens": { - "description": "min=614.421, mean=614.421, max=614.421, sum=1228.842 (2)", - "tab": "General information", - "score": 614.421052631579 - }, - "Econometrics - # output tokens": { - "description": "min=0.991, mean=0.991, max=0.991, sum=1.982 (2)", - "tab": "General information", - "score": 0.9912280701754386 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" - } - } - }, - { - "evaluation_name": "Global Facts", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Global Facts", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.62, - "details": { - "description": "min=0.62, mean=0.62, max=0.62, sum=1.24 (2)", - "tab": "Accuracy", - "Global Facts - Observed inference time (s)": { - "description": "min=0.311, mean=0.311, max=0.311, sum=0.621 (2)", - "tab": "Efficiency", - "score": 0.31063568592071533 - }, - "Global Facts - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Global Facts - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Global Facts - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Global Facts - # prompt tokens": { - "description": "min=399.71, mean=399.71, max=399.71, sum=799.42 (2)", - "tab": "General information", - "score": 399.71 - }, - "Global Facts - # output tokens": { - "description": "min=0.98, mean=0.98, max=0.98, sum=1.96 (2)", - "tab": "General information", - "score": 0.98 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" - } - } - }, - { - "evaluation_name": "Jurisprudence", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Jurisprudence", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.843, - "details": { - "description": "min=0.843, mean=0.843, max=0.843, 
sum=1.685 (2)", - "tab": "Accuracy", - "Jurisprudence - Observed inference time (s)": { - "description": "min=0.298, mean=0.298, max=0.298, sum=0.597 (2)", - "tab": "Efficiency", - "score": 0.29833372433980304 - }, - "Jurisprudence - # eval": { - "description": "min=108, mean=108, max=108, sum=216 (2)", - "tab": "General information", - "score": 108.0 - }, - "Jurisprudence - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Jurisprudence - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Jurisprudence - # prompt tokens": { - "description": "min=394.63, mean=394.63, max=394.63, sum=789.259 (2)", - "tab": "General information", - "score": 394.6296296296296 - }, - "Jurisprudence - # output tokens": { - "description": "min=0.991, mean=0.991, max=0.991, sum=1.981 (2)", - "tab": "General information", - "score": 0.9907407407407407 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" - } - } - }, - { - "evaluation_name": "Philosophy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Philosophy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.83, - "details": { - "description": "min=0.83, mean=0.83, max=0.83, sum=1.659 (2)", - "tab": "Accuracy", - "Philosophy - Observed inference time (s)": { - "description": "min=0.306, mean=0.306, max=0.306, sum=0.612 (2)", - "tab": "Efficiency", - "score": 0.30590631187537093 - }, - "Philosophy - # eval": { - "description": "min=311, mean=311, max=311, sum=622 (2)", - "tab": "General information", - "score": 311.0 - }, - "Philosophy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Philosophy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Philosophy - # prompt tokens": { - "description": "min=329.084, mean=329.084, max=329.084, sum=658.167 (2)", - "tab": "General information", - "score": 329.08360128617363 - }, - "Philosophy - # output tokens": { - "description": "min=0.994, mean=0.994, max=0.994, sum=1.987 (2)", - "tab": "General information", - "score": 0.9935691318327974 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" - } - } - }, - { - "evaluation_name": "Professional Psychology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Professional Psychology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.845, - "details": { - "description": "min=0.845, mean=0.845, max=0.845, sum=1.69 (2)", - "tab": "Accuracy", - "Professional Medicine - Observed inference time (s)": { - "description": "min=0.42, mean=0.42, 
max=0.42, sum=0.841 (2)", - "tab": "Efficiency", - "score": 0.42044701295740466 - }, - "Professional Accounting - Observed inference time (s)": { - "description": "min=0.352, mean=0.352, max=0.352, sum=0.704 (2)", - "tab": "Efficiency", - "score": 0.35206349944391996 - }, - "Professional Law - Observed inference time (s)": { - "description": "min=2.946, mean=2.946, max=2.946, sum=5.892 (2)", - "tab": "Efficiency", - "score": 2.9459040923410784 - }, - "Professional Psychology - Observed inference time (s)": { - "description": "min=0.342, mean=0.342, max=0.342, sum=0.683 (2)", - "tab": "Efficiency", - "score": 0.34150391076904496 - }, - "Professional Medicine - # eval": { - "description": "min=272, mean=272, max=272, sum=544 (2)", - "tab": "General information", - "score": 272.0 - }, - "Professional Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Medicine - # prompt tokens": { - "description": "min=1094.489, mean=1094.489, max=1094.489, sum=2188.978 (2)", - "tab": "General information", - "score": 1094.4889705882354 - }, - "Professional Medicine - # output tokens": { - "description": "min=0.989, mean=0.989, max=0.989, sum=1.978 (2)", - "tab": "General information", - "score": 0.9889705882352942 - }, - "Professional Accounting - # eval": { - "description": "min=282, mean=282, max=282, sum=564 (2)", - "tab": "General information", - "score": 282.0 - }, - "Professional Accounting - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Accounting - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Accounting - # prompt tokens": { - "description": "min=658.585, mean=658.585, max=658.585, sum=1317.17 (2)", - "tab": "General information", - "score": 658.5851063829788 - }, - "Professional Accounting - # output tokens": { - "description": "min=0.968, mean=0.968, max=0.968, sum=1.936 (2)", - "tab": "General information", - "score": 0.9680851063829787 - }, - "Professional Law - # eval": { - "description": "min=1534, mean=1534, max=1534, sum=3068 (2)", - "tab": "General information", - "score": 1534.0 - }, - "Professional Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # prompt tokens": { - "description": "min=1637.601, mean=1637.601, max=1637.601, sum=3275.202 (2)", - "tab": "General information", - "score": 1637.6010430247718 - }, - "Professional Law - # output tokens": { - "description": "min=0.995, mean=0.995, max=0.995, sum=1.99 (2)", - "tab": "General information", - "score": 0.9947848761408083 - }, - "Professional Psychology - # eval": { - "description": "min=612, mean=612, max=612, sum=1224 (2)", - "tab": "General information", - "score": 612.0 - }, - "Professional Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # 
prompt tokens": { - "description": "min=575.098, mean=575.098, max=575.098, sum=1150.196 (2)", - "tab": "General information", - "score": 575.0980392156863 - }, - "Professional Psychology - # output tokens": { - "description": "min=0.993, mean=0.993, max=0.993, sum=1.987 (2)", - "tab": "General information", - "score": 0.9934640522875817 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" - } - } - }, - { - "evaluation_name": "Us Foreign Policy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Us Foreign Policy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.92, - "details": { - "description": "min=0.92, mean=0.92, max=0.92, sum=1.84 (2)", - "tab": "Accuracy", - "Us Foreign Policy - Observed inference time (s)": { - "description": "min=0.312, mean=0.312, max=0.312, sum=0.624 (2)", - "tab": "Efficiency", - "score": 0.31222330808639526 - }, - "Us Foreign Policy - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Us Foreign Policy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Us Foreign Policy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Us Foreign Policy - # prompt tokens": { - "description": "min=422.79, mean=422.79, max=422.79, sum=845.58 (2)", - "tab": "General information", - "score": 422.79 - }, - "Us Foreign Policy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" - } - } - }, - { - "evaluation_name": "Astronomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Astronomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.928, - "details": { - "description": "min=0.928, mean=0.928, max=0.928, sum=1.855 (2)", - "tab": "Accuracy", - "Astronomy - Observed inference time (s)": { - "description": "min=0.326, mean=0.326, max=0.326, sum=0.653 (2)", - "tab": "Efficiency", - "score": 0.3264871161235006 - }, - "Astronomy - # eval": { - "description": "min=152, mean=152, max=152, sum=304 (2)", - "tab": "General information", - "score": 152.0 - }, - "Astronomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Astronomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Astronomy - # prompt tokens": { - "description": "min=579.684, mean=579.684, max=579.684, sum=1159.368 (2)", - "tab": "General information", - 
"score": 579.6842105263158 - }, - "Astronomy - # output tokens": { - "description": "min=0.993, mean=0.993, max=0.993, sum=1.987 (2)", - "tab": "General information", - "score": 0.993421052631579 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" - } - } - }, - { - "evaluation_name": "Business Ethics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Business Ethics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.76, - "details": { - "description": "min=0.76, mean=0.76, max=0.76, sum=1.52 (2)", - "tab": "Accuracy", - "Business Ethics - Observed inference time (s)": { - "description": "min=0.321, mean=0.321, max=0.321, sum=0.643 (2)", - "tab": "Efficiency", - "score": 0.3212712168693542 - }, - "Business Ethics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Business Ethics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Business Ethics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Business Ethics - # prompt tokens": { - "description": "min=569.52, mean=569.52, max=569.52, sum=1139.04 (2)", - "tab": "General information", - "score": 569.52 - }, - "Business Ethics - # output tokens": { - "description": "min=0.98, mean=0.98, max=0.98, sum=1.96 (2)", - "tab": "General information", - "score": 0.98 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" - } - } - }, - { - "evaluation_name": "Clinical Knowledge", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Clinical Knowledge", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.879, - "details": { - "description": "min=0.879, mean=0.879, max=0.879, sum=1.758 (2)", - "tab": "Accuracy", - "Clinical Knowledge - Observed inference time (s)": { - "description": "min=0.477, mean=0.477, max=0.477, sum=0.953 (2)", - "tab": "Efficiency", - "score": 0.4765495894090185 - }, - "Clinical Knowledge - # eval": { - "description": "min=265, mean=265, max=265, sum=530 (2)", - "tab": "General information", - "score": 265.0 - }, - "Clinical Knowledge - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Clinical Knowledge - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Clinical Knowledge - # prompt tokens": { - "description": "min=397.928, mean=397.928, max=397.928, sum=795.857 (2)", - "tab": "General information", - "score": 397.92830188679244 - }, - "Clinical Knowledge - # output tokens": { - "description": "min=0.992, mean=0.992, 
max=0.992, sum=1.985 (2)", - "tab": "General information", - "score": 0.9924528301886792 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" - } - } - }, - { - "evaluation_name": "Conceptual Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Conceptual Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.885, - "details": { - "description": "min=0.885, mean=0.885, max=0.885, sum=1.77 (2)", - "tab": "Accuracy", - "Conceptual Physics - Observed inference time (s)": { - "description": "min=0.347, mean=0.347, max=0.347, sum=0.693 (2)", - "tab": "Efficiency", - "score": 0.3465714748869551 - }, - "Conceptual Physics - # eval": { - "description": "min=235, mean=235, max=235, sum=470 (2)", - "tab": "General information", - "score": 235.0 - }, - "Conceptual Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Conceptual Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Conceptual Physics - # prompt tokens": { - "description": "min=304.834, mean=304.834, max=304.834, sum=609.668 (2)", - "tab": "General information", - "score": 304.83404255319147 - }, - "Conceptual Physics - # output tokens": { - "description": "min=0.996, mean=0.996, max=0.996, sum=1.991 (2)", - "tab": "General information", - "score": 0.9957446808510638 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" - } - } - }, - { - "evaluation_name": "Electrical Engineering", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Electrical Engineering", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.793, - "details": { - "description": "min=0.793, mean=0.793, max=0.793, sum=1.586 (2)", - "tab": "Accuracy", - "Electrical Engineering - Observed inference time (s)": { - "description": "min=0.305, mean=0.305, max=0.305, sum=0.611 (2)", - "tab": "Efficiency", - "score": 0.3054168865598481 - }, - "Electrical Engineering - # eval": { - "description": "min=145, mean=145, max=145, sum=290 (2)", - "tab": "General information", - "score": 145.0 - }, - "Electrical Engineering - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Electrical Engineering - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Electrical Engineering - # prompt tokens": { - "description": "min=435.607, mean=435.607, max=435.607, sum=871.214 (2)", - "tab": "General information", - "score": 435.60689655172416 - }, - "Electrical Engineering - # output tokens": { - "description": "min=0.993, 
mean=0.993, max=0.993, sum=1.986 (2)", - "tab": "General information", - "score": 0.993103448275862 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" - } - } - }, - { - "evaluation_name": "Elementary Mathematics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Elementary Mathematics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.841, - "details": { - "description": "min=0.841, mean=0.841, max=0.841, sum=1.683 (2)", - "tab": "Accuracy", - "Elementary Mathematics - Observed inference time (s)": { - "description": "min=0.313, mean=0.313, max=0.313, sum=0.627 (2)", - "tab": "Efficiency", - "score": 0.31325215069705215 - }, - "Elementary Mathematics - # eval": { - "description": "min=378, mean=378, max=378, sum=756 (2)", - "tab": "General information", - "score": 378.0 - }, - "Elementary Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Elementary Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Elementary Mathematics - # prompt tokens": { - "description": "min=531.854, mean=531.854, max=531.854, sum=1063.709 (2)", - "tab": "General information", - "score": 531.8544973544973 - }, - "Elementary Mathematics - # output tokens": { - "description": "min=0.995, mean=0.995, max=0.995, sum=1.989 (2)", - "tab": "General information", - "score": 0.9947089947089947 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" - } - } - }, - { - "evaluation_name": "Formal Logic", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Formal Logic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.579, - "details": { - "description": "min=0.579, mean=0.579, max=0.579, sum=1.159 (2)", - "tab": "Accuracy", - "Formal Logic - Observed inference time (s)": { - "description": "min=1.035, mean=1.035, max=1.035, sum=2.07 (2)", - "tab": "Efficiency", - "score": 1.034958042795696 - }, - "Formal Logic - # eval": { - "description": "min=126, mean=126, max=126, sum=252 (2)", - "tab": "General information", - "score": 126.0 - }, - "Formal Logic - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Formal Logic - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Formal Logic - # prompt tokens": { - "description": "min=601.778, mean=601.778, max=601.778, sum=1203.556 (2)", - "tab": "General information", - "score": 601.7777777777778 - }, - "Formal Logic - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - 
"tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" - } - } - }, - { - "evaluation_name": "High School World History", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on High School World History", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.911, - "details": { - "description": "min=0.911, mean=0.911, max=0.911, sum=1.823 (2)", - "tab": "Accuracy", - "High School Biology - Observed inference time (s)": { - "description": "min=0.562, mean=0.562, max=0.562, sum=1.123 (2)", - "tab": "Efficiency", - "score": 0.561508382520368 - }, - "High School Chemistry - Observed inference time (s)": { - "description": "min=0.349, mean=0.349, max=0.349, sum=0.698 (2)", - "tab": "Efficiency", - "score": 0.34899539900530735 - }, - "High School Computer Science - Observed inference time (s)": { - "description": "min=0.423, mean=0.423, max=0.423, sum=0.845 (2)", - "tab": "Efficiency", - "score": 0.4227438974380493 - }, - "High School European History - Observed inference time (s)": { - "description": "min=0.899, mean=0.899, max=0.899, sum=1.799 (2)", - "tab": "Efficiency", - "score": 0.8994465018763687 - }, - "High School Geography - Observed inference time (s)": { - "description": "min=0.324, mean=0.324, max=0.324, sum=0.647 (2)", - "tab": "Efficiency", - "score": 0.3236422189558395 - }, - "High School Government And Politics - Observed inference time (s)": { - "description": "min=0.314, mean=0.314, max=0.314, sum=0.627 (2)", - "tab": "Efficiency", - "score": 0.31354672550537427 - }, - "High School Macroeconomics - Observed inference time (s)": { - "description": "min=0.314, mean=0.314, max=0.314, sum=0.628 (2)", - "tab": "Efficiency", - "score": 0.31394460568061244 - }, - "High School Mathematics - Observed inference time (s)": { - "description": "min=0.315, mean=0.315, max=0.315, sum=0.63 (2)", - "tab": "Efficiency", - "score": 0.3151667806837294 - }, - "High School Microeconomics - Observed inference time (s)": { - "description": "min=0.315, mean=0.315, max=0.315, sum=0.63 (2)", - "tab": "Efficiency", - "score": 0.3151869453301951 - }, - "High School Physics - Observed inference time (s)": { - "description": "min=0.32, mean=0.32, max=0.32, sum=0.639 (2)", - "tab": "Efficiency", - "score": 0.31971652302520953 - }, - "High School Psychology - Observed inference time (s)": { - "description": "min=0.315, mean=0.315, max=0.315, sum=0.63 (2)", - "tab": "Efficiency", - "score": 0.3149662079067405 - }, - "High School Statistics - Observed inference time (s)": { - "description": "min=0.386, mean=0.386, max=0.386, sum=0.772 (2)", - "tab": "Efficiency", - "score": 0.3859624167283376 - }, - "High School US History - Observed inference time (s)": { - "description": "min=0.651, mean=0.651, max=0.651, sum=1.303 (2)", - "tab": "Efficiency", - "score": 0.6513510615217919 - }, - "High School World History - Observed inference time (s)": { - "description": "min=0.472, mean=0.472, max=0.472, sum=0.945 (2)", - "tab": "Efficiency", - "score": 0.4723552480528626 - }, - "High School Biology - # eval": { - "description": "min=310, mean=310, max=310, 
sum=620 (2)", - "tab": "General information", - "score": 310.0 - }, - "High School Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Biology - # prompt tokens": { - "description": "min=513.671, mean=513.671, max=513.671, sum=1027.342 (2)", - "tab": "General information", - "score": 513.6709677419354 - }, - "High School Biology - # output tokens": { - "description": "min=0.994, mean=0.994, max=0.994, sum=1.987 (2)", - "tab": "General information", - "score": 0.9935483870967742 - }, - "High School Chemistry - # eval": { - "description": "min=203, mean=203, max=203, sum=406 (2)", - "tab": "General information", - "score": 203.0 - }, - "High School Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # prompt tokens": { - "description": "min=496.704, mean=496.704, max=496.704, sum=993.409 (2)", - "tab": "General information", - "score": 496.70443349753697 - }, - "High School Chemistry - # output tokens": { - "description": "min=0.985, mean=0.985, max=0.985, sum=1.97 (2)", - "tab": "General information", - "score": 0.9852216748768473 - }, - "High School Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "High School Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # prompt tokens": { - "description": "min=867.78, mean=867.78, max=867.78, sum=1735.56 (2)", - "tab": "General information", - "score": 867.78 - }, - "High School Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School European History - # eval": { - "description": "min=165, mean=165, max=165, sum=330 (2)", - "tab": "General information", - "score": 165.0 - }, - "High School European History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School European History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School European History - # prompt tokens": { - "description": "min=2797.885, mean=2797.885, max=2797.885, sum=5595.77 (2)", - "tab": "General information", - "score": 2797.8848484848486 - }, - "High School European History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Geography - # eval": { - "description": "min=198, mean=198, max=198, sum=396 (2)", - "tab": "General information", - "score": 198.0 - }, - "High School Geography - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Geography - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": 
"General information", - "score": 0.0 - }, - "High School Geography - # prompt tokens": { - "description": "min=372.035, mean=372.035, max=372.035, sum=744.071 (2)", - "tab": "General information", - "score": 372.0353535353535 - }, - "High School Geography - # output tokens": { - "description": "min=0.99, mean=0.99, max=0.99, sum=1.98 (2)", - "tab": "General information", - "score": 0.98989898989899 - }, - "High School Government And Politics - # eval": { - "description": "min=193, mean=193, max=193, sum=386 (2)", - "tab": "General information", - "score": 193.0 - }, - "High School Government And Politics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Government And Politics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # prompt tokens": { - "description": "min=465.824, mean=465.824, max=465.824, sum=931.648 (2)", - "tab": "General information", - "score": 465.8238341968912 - }, - "High School Government And Politics - # output tokens": { - "description": "min=0.979, mean=0.979, max=0.979, sum=1.959 (2)", - "tab": "General information", - "score": 0.9792746113989638 - }, - "High School Macroeconomics - # eval": { - "description": "min=390, mean=390, max=390, sum=780 (2)", - "tab": "General information", - "score": 390.0 - }, - "High School Macroeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Macroeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - # prompt tokens": { - "description": "min=370.908, mean=370.908, max=370.908, sum=741.815 (2)", - "tab": "General information", - "score": 370.9076923076923 - }, - "High School Macroeconomics - # output tokens": { - "description": "min=0.992, mean=0.992, max=0.992, sum=1.985 (2)", - "tab": "General information", - "score": 0.9923076923076923 - }, - "High School Mathematics - # eval": { - "description": "min=270, mean=270, max=270, sum=540 (2)", - "tab": "General information", - "score": 270.0 - }, - "High School Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # prompt tokens": { - "description": "min=532.356, mean=532.356, max=532.356, sum=1064.711 (2)", - "tab": "General information", - "score": 532.3555555555556 - }, - "High School Mathematics - # output tokens": { - "description": "min=0.993, mean=0.993, max=0.993, sum=1.985 (2)", - "tab": "General information", - "score": 0.9925925925925926 - }, - "High School Microeconomics - # eval": { - "description": "min=238, mean=238, max=238, sum=476 (2)", - "tab": "General information", - "score": 238.0 - }, - "High School Microeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Microeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Microeconomics - # prompt tokens": { - "description": "min=399.013, mean=399.013, max=399.013, sum=798.025 (2)", - "tab": "General 
information", - "score": 399.0126050420168 - }, - "High School Microeconomics - # output tokens": { - "description": "min=0.987, mean=0.987, max=0.987, sum=1.975 (2)", - "tab": "General information", - "score": 0.9873949579831933 - }, - "High School Physics - # eval": { - "description": "min=151, mean=151, max=151, sum=302 (2)", - "tab": "General information", - "score": 151.0 - }, - "High School Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # prompt tokens": { - "description": "min=560.457, mean=560.457, max=560.457, sum=1120.914 (2)", - "tab": "General information", - "score": 560.4569536423841 - }, - "High School Physics - # output tokens": { - "description": "min=0.974, mean=0.974, max=0.974, sum=1.947 (2)", - "tab": "General information", - "score": 0.9735099337748344 - }, - "High School Psychology - # eval": { - "description": "min=545, mean=545, max=545, sum=1090 (2)", - "tab": "General information", - "score": 545.0 - }, - "High School Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # prompt tokens": { - "description": "min=495.242, mean=495.242, max=495.242, sum=990.484 (2)", - "tab": "General information", - "score": 495.2422018348624 - }, - "High School Psychology - # output tokens": { - "description": "min=0.996, mean=0.996, max=0.996, sum=1.993 (2)", - "tab": "General information", - "score": 0.9963302752293578 - }, - "High School Statistics - # eval": { - "description": "min=216, mean=216, max=216, sum=432 (2)", - "tab": "General information", - "score": 216.0 - }, - "High School Statistics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Statistics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Statistics - # prompt tokens": { - "description": "min=795.639, mean=795.639, max=795.639, sum=1591.278 (2)", - "tab": "General information", - "score": 795.6388888888889 - }, - "High School Statistics - # output tokens": { - "description": "min=0.977, mean=0.977, max=0.977, sum=1.954 (2)", - "tab": "General information", - "score": 0.9768518518518519 - }, - "High School US History - # eval": { - "description": "min=204, mean=204, max=204, sum=408 (2)", - "tab": "General information", - "score": 204.0 - }, - "High School US History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School US History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # prompt tokens": { - "description": "min=2217.809, mean=2217.809, max=2217.809, sum=4435.618 (2)", - "tab": "General information", - "score": 2217.8088235294117 - }, - "High School US History - # output tokens": { - "description": "min=0.99, mean=0.99, max=0.99, sum=1.98 (2)", - "tab": "General information", - "score": 0.9901960784313726 - }, - "High School World History - # eval": { - "description": 
"min=237, mean=237, max=237, sum=474 (2)", - "tab": "General information", - "score": 237.0 - }, - "High School World History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School World History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School World History - # prompt tokens": { - "description": "min=1428.173, mean=1428.173, max=1428.173, sum=2856.346 (2)", - "tab": "General information", - "score": 1428.1729957805908 - }, - "High School World History - # output tokens": { - "description": "min=0.996, mean=0.996, max=0.996, sum=1.992 (2)", - "tab": "General information", - "score": 0.9957805907172996 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" - } - } - }, - { - "evaluation_name": "Human Sexuality", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Human Sexuality", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.924, - "details": { - "description": "min=0.924, mean=0.924, max=0.924, sum=1.847 (2)", - "tab": "Accuracy", - "Human Aging - Observed inference time (s)": { - "description": "min=0.322, mean=0.322, max=0.322, sum=0.644 (2)", - "tab": "Efficiency", - "score": 0.3221198432648663 - }, - "Human Sexuality - Observed inference time (s)": { - "description": "min=0.319, mean=0.319, max=0.319, sum=0.638 (2)", - "tab": "Efficiency", - "score": 0.31875184474100593 - }, - "Human Aging - # eval": { - "description": "min=223, mean=223, max=223, sum=446 (2)", - "tab": "General information", - "score": 223.0 - }, - "Human Aging - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Aging - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Aging - # prompt tokens": { - "description": "min=319.888, mean=319.888, max=319.888, sum=639.776 (2)", - "tab": "General information", - "score": 319.88789237668163 - }, - "Human Aging - # output tokens": { - "description": "min=0.996, mean=0.996, max=0.996, sum=1.991 (2)", - "tab": "General information", - "score": 0.9955156950672646 - }, - "Human Sexuality - # eval": { - "description": "min=131, mean=131, max=131, sum=262 (2)", - "tab": "General information", - "score": 131.0 - }, - "Human Sexuality - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Sexuality - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # prompt tokens": { - "description": "min=341.168, mean=341.168, max=341.168, sum=682.336 (2)", - "tab": "General information", - "score": 341.1679389312977 - }, - "Human Sexuality - # output tokens": { - "description": "min=0.992, mean=0.992, max=0.992, sum=1.985 (2)", - "tab": "General information", - "score": 0.9923664122137404 - } - } - }, - "generation_config": { - "additional_details": { - "subject": 
"human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" - } - } - }, - { - "evaluation_name": "International Law", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on International Law", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.901, - "details": { - "description": "min=0.901, mean=0.901, max=0.901, sum=1.802 (2)", - "tab": "Accuracy", - "International Law - Observed inference time (s)": { - "description": "min=0.336, mean=0.336, max=0.336, sum=0.671 (2)", - "tab": "Efficiency", - "score": 0.33550412989844963 - }, - "International Law - # eval": { - "description": "min=121, mean=121, max=121, sum=242 (2)", - "tab": "General information", - "score": 121.0 - }, - "International Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "International Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "International Law - # prompt tokens": { - "description": "min=639.818, mean=639.818, max=639.818, sum=1279.636 (2)", - "tab": "General information", - "score": 639.8181818181819 - }, - "International Law - # output tokens": { - "description": "min=0.983, mean=0.983, max=0.983, sum=1.967 (2)", - "tab": "General information", - "score": 0.9834710743801653 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" - } - } - }, - { - "evaluation_name": "Logical Fallacies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Logical Fallacies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.877, - "details": { - "description": "min=0.877, mean=0.877, max=0.877, sum=1.755 (2)", - "tab": "Accuracy", - "Logical Fallacies - Observed inference time (s)": { - "description": "min=0.312, mean=0.312, max=0.312, sum=0.624 (2)", - "tab": "Efficiency", - "score": 0.3120760069302986 - }, - "Logical Fallacies - # eval": { - "description": "min=163, mean=163, max=163, sum=326 (2)", - "tab": "General information", - "score": 163.0 - }, - "Logical Fallacies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Logical Fallacies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Logical Fallacies - # prompt tokens": { - "description": "min=449.564, mean=449.564, max=449.564, sum=899.129 (2)", - "tab": "General information", - "score": 449.5644171779141 - }, - "Logical Fallacies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": 
"test", - "groups": "mmlu_logical_fallacies" - } - } - }, - { - "evaluation_name": "Machine Learning", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Machine Learning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.679, - "details": { - "description": "min=0.679, mean=0.679, max=0.679, sum=1.357 (2)", - "tab": "Accuracy", - "Machine Learning - Observed inference time (s)": { - "description": "min=0.337, mean=0.337, max=0.337, sum=0.674 (2)", - "tab": "Efficiency", - "score": 0.3368471988609859 - }, - "Machine Learning - # eval": { - "description": "min=112, mean=112, max=112, sum=224 (2)", - "tab": "General information", - "score": 112.0 - }, - "Machine Learning - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Machine Learning - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Machine Learning - # prompt tokens": { - "description": "min=668.054, mean=668.054, max=668.054, sum=1336.107 (2)", - "tab": "General information", - "score": 668.0535714285714 - }, - "Machine Learning - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" - } - } - }, - { - "evaluation_name": "Management", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Management", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.903, - "details": { - "description": "min=0.903, mean=0.903, max=0.903, sum=1.806 (2)", - "tab": "Accuracy", - "Management - Observed inference time (s)": { - "description": "min=0.31, mean=0.31, max=0.31, sum=0.621 (2)", - "tab": "Efficiency", - "score": 0.3103753525076561 - }, - "Management - # eval": { - "description": "min=103, mean=103, max=103, sum=206 (2)", - "tab": "General information", - "score": 103.0 - }, - "Management - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Management - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Management - # prompt tokens": { - "description": "min=283.786, mean=283.786, max=283.786, sum=567.573 (2)", - "tab": "General information", - "score": 283.7864077669903 - }, - "Management - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" - } - } - }, - { - "evaluation_name": "Marketing", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - 
"https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Marketing", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.932, - "details": { - "description": "min=0.932, mean=0.932, max=0.932, sum=1.863 (2)", - "tab": "Accuracy", - "Marketing - Observed inference time (s)": { - "description": "min=0.314, mean=0.314, max=0.314, sum=0.628 (2)", - "tab": "Efficiency", - "score": 0.3138112644863944 - }, - "Marketing - # eval": { - "description": "min=234, mean=234, max=234, sum=468 (2)", - "tab": "General information", - "score": 234.0 - }, - "Marketing - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Marketing - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Marketing - # prompt tokens": { - "description": "min=404.218, mean=404.218, max=404.218, sum=808.436 (2)", - "tab": "General information", - "score": 404.21794871794873 - }, - "Marketing - # output tokens": { - "description": "min=0.991, mean=0.991, max=0.991, sum=1.983 (2)", - "tab": "General information", - "score": 0.9914529914529915 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" - } - } - }, - { - "evaluation_name": "Medical Genetics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Medical Genetics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.87, - "details": { - "description": "min=0.87, mean=0.87, max=0.87, sum=1.74 (2)", - "tab": "Accuracy", - "Medical Genetics - Observed inference time (s)": { - "description": "min=0.31, mean=0.31, max=0.31, sum=0.619 (2)", - "tab": "Efficiency", - "score": 0.3096977710723877 - }, - "Medical Genetics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Medical Genetics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Medical Genetics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Medical Genetics - # prompt tokens": { - "description": "min=340.99, mean=340.99, max=340.99, sum=681.98 (2)", - "tab": "General information", - "score": 340.99 - }, - "Medical Genetics - # output tokens": { - "description": "min=0.97, mean=0.97, max=0.97, sum=1.94 (2)", - "tab": "General information", - "score": 0.97 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" - } - } - }, - { - "evaluation_name": "Miscellaneous", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - 
"evaluation_description": "EM on Miscellaneous", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.934, - "details": { - "description": "min=0.934, mean=0.934, max=0.934, sum=1.867 (2)", - "tab": "Accuracy", - "Miscellaneous - Observed inference time (s)": { - "description": "min=0.311, mean=0.311, max=0.311, sum=0.621 (2)", - "tab": "Efficiency", - "score": 0.3106613128730316 - }, - "Miscellaneous - # eval": { - "description": "min=783, mean=783, max=783, sum=1566 (2)", - "tab": "General information", - "score": 783.0 - }, - "Miscellaneous - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Miscellaneous - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Miscellaneous - # prompt tokens": { - "description": "min=299.911, mean=299.911, max=299.911, sum=599.821 (2)", - "tab": "General information", - "score": 299.9106002554278 - }, - "Miscellaneous - # output tokens": { - "description": "min=0.99, mean=0.99, max=0.99, sum=1.98 (2)", - "tab": "General information", - "score": 0.9897828863346104 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" - } - } - }, - { - "evaluation_name": "Moral Scenarios", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Moral Scenarios", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.825, - "details": { - "description": "min=0.825, mean=0.825, max=0.825, sum=1.649 (2)", - "tab": "Accuracy", - "Moral Disputes - Observed inference time (s)": { - "description": "min=0.313, mean=0.313, max=0.313, sum=0.626 (2)", - "tab": "Efficiency", - "score": 0.31282479501184013 - }, - "Moral Scenarios - Observed inference time (s)": { - "description": "min=0.335, mean=0.335, max=0.335, sum=0.67 (2)", - "tab": "Efficiency", - "score": 0.3348748574709759 - }, - "Moral Disputes - # eval": { - "description": "min=346, mean=346, max=346, sum=692 (2)", - "tab": "General information", - "score": 346.0 - }, - "Moral Disputes - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Disputes - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Disputes - # prompt tokens": { - "description": "min=476.113, mean=476.113, max=476.113, sum=952.225 (2)", - "tab": "General information", - "score": 476.1127167630058 - }, - "Moral Disputes - # output tokens": { - "description": "min=0.994, mean=0.994, max=0.994, sum=1.988 (2)", - "tab": "General information", - "score": 0.9942196531791907 - }, - "Moral Scenarios - # eval": { - "description": "min=895, mean=895, max=895, sum=1790 (2)", - "tab": "General information", - "score": 895.0 - }, - "Moral Scenarios - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Scenarios - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - 
"score": 0.0 - }, - "Moral Scenarios - # prompt tokens": { - "description": "min=656.455, mean=656.455, max=656.455, sum=1312.909 (2)", - "tab": "General information", - "score": 656.454748603352 - }, - "Moral Scenarios - # output tokens": { - "description": "min=0.993, mean=0.993, max=0.993, sum=1.987 (2)", - "tab": "General information", - "score": 0.9932960893854749 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" - } - } - }, - { - "evaluation_name": "Nutrition", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Nutrition", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.869, - "details": { - "description": "min=0.869, mean=0.869, max=0.869, sum=1.739 (2)", - "tab": "Accuracy", - "Nutrition - Observed inference time (s)": { - "description": "min=0.332, mean=0.332, max=0.332, sum=0.664 (2)", - "tab": "Efficiency", - "score": 0.33182784311132496 - }, - "Nutrition - # eval": { - "description": "min=306, mean=306, max=306, sum=612 (2)", - "tab": "General information", - "score": 306.0 - }, - "Nutrition - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Nutrition - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Nutrition - # prompt tokens": { - "description": "min=586.814, mean=586.814, max=586.814, sum=1173.627 (2)", - "tab": "General information", - "score": 586.8137254901961 - }, - "Nutrition - # output tokens": { - "description": "min=0.997, mean=0.997, max=0.997, sum=1.993 (2)", - "tab": "General information", - "score": 0.9967320261437909 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" - } - } - }, - { - "evaluation_name": "Prehistory", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Prehistory", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.917, - "details": { - "description": "min=0.917, mean=0.917, max=0.917, sum=1.833 (2)", - "tab": "Accuracy", - "Prehistory - Observed inference time (s)": { - "description": "min=0.316, mean=0.316, max=0.316, sum=0.632 (2)", - "tab": "Efficiency", - "score": 0.3158548356574259 - }, - "Prehistory - # eval": { - "description": "min=324, mean=324, max=324, sum=648 (2)", - "tab": "General information", - "score": 324.0 - }, - "Prehistory - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Prehistory - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Prehistory - # prompt tokens": { - "description": "min=514.528, mean=514.528, max=514.528, sum=1029.056 (2)", - "tab": "General information", - 
"score": 514.5277777777778 - }, - "Prehistory - # output tokens": { - "description": "min=0.988, mean=0.988, max=0.988, sum=1.975 (2)", - "tab": "General information", - "score": 0.9876543209876543 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" - } - } - }, - { - "evaluation_name": "Public Relations", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Public Relations", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.791, - "details": { - "description": "min=0.791, mean=0.791, max=0.791, sum=1.582 (2)", - "tab": "Accuracy", - "Public Relations - Observed inference time (s)": { - "description": "min=0.328, mean=0.328, max=0.328, sum=0.657 (2)", - "tab": "Efficiency", - "score": 0.32829454161904076 - }, - "Public Relations - # eval": { - "description": "min=110, mean=110, max=110, sum=220 (2)", - "tab": "General information", - "score": 110.0 - }, - "Public Relations - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Public Relations - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Public Relations - # prompt tokens": { - "description": "min=405.318, mean=405.318, max=405.318, sum=810.636 (2)", - "tab": "General information", - "score": 405.3181818181818 - }, - "Public Relations - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" - } - } - }, - { - "evaluation_name": "Security Studies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Security Studies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.849, - "details": { - "description": "min=0.849, mean=0.849, max=0.849, sum=1.698 (2)", - "tab": "Accuracy", - "Security Studies - Observed inference time (s)": { - "description": "min=0.443, mean=0.443, max=0.443, sum=0.886 (2)", - "tab": "Efficiency", - "score": 0.44323594618816764 - }, - "Security Studies - # eval": { - "description": "min=245, mean=245, max=245, sum=490 (2)", - "tab": "General information", - "score": 245.0 - }, - "Security Studies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Security Studies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Security Studies - # prompt tokens": { - "description": "min=1164.473, mean=1164.473, max=1164.473, sum=2328.947 (2)", - "tab": "General information", - "score": 1164.4734693877551 - }, - "Security Studies - # output tokens": { - "description": "min=0.992, 
mean=0.992, max=0.992, sum=1.984 (2)", - "tab": "General information", - "score": 0.9918367346938776 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" - } - } - }, - { - "evaluation_name": "Sociology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Sociology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.915, - "details": { - "description": "min=0.915, mean=0.915, max=0.915, sum=1.831 (2)", - "tab": "Accuracy", - "Sociology - Observed inference time (s)": { - "description": "min=0.337, mean=0.337, max=0.337, sum=0.674 (2)", - "tab": "Efficiency", - "score": 0.336861949654954 - }, - "Sociology - # eval": { - "description": "min=201, mean=201, max=201, sum=402 (2)", - "tab": "General information", - "score": 201.0 - }, - "Sociology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Sociology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Sociology - # prompt tokens": { - "description": "min=445.517, mean=445.517, max=445.517, sum=891.035 (2)", - "tab": "General information", - "score": 445.51741293532336 - }, - "Sociology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" - } - } - }, - { - "evaluation_name": "Virology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Virology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.584, - "details": { - "description": "min=0.584, mean=0.584, max=0.584, sum=1.169 (2)", - "tab": "Accuracy", - "Virology - Observed inference time (s)": { - "description": "min=0.328, mean=0.328, max=0.328, sum=0.656 (2)", - "tab": "Efficiency", - "score": 0.32804813155208723 - }, - "Virology - # eval": { - "description": "min=166, mean=166, max=166, sum=332 (2)", - "tab": "General information", - "score": 166.0 - }, - "Virology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Virology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Virology - # prompt tokens": { - "description": "min=343.018, mean=343.018, max=343.018, sum=686.036 (2)", - "tab": "General information", - "score": 343.01807228915663 - }, - "Virology - # output tokens": { - "description": "min=0.994, mean=0.994, max=0.994, sum=1.988 (2)", - "tab": "General information", - "score": 0.9939759036144579 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "virology", - "method": "multiple_choice_joint", 
- "eval_split": "test", - "groups": "mmlu_virology" - } - } - }, - { - "evaluation_name": "World Religions", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on World Religions", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.842, - "details": { - "description": "min=0.842, mean=0.842, max=0.842, sum=1.684 (2)", - "tab": "Accuracy", - "World Religions - Observed inference time (s)": { - "description": "min=0.376, mean=0.376, max=0.376, sum=0.752 (2)", - "tab": "Efficiency", - "score": 0.3761981662951018 - }, - "World Religions - # eval": { - "description": "min=171, mean=171, max=171, sum=342 (2)", - "tab": "General information", - "score": 171.0 - }, - "World Religions - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "World Religions - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "World Religions - # prompt tokens": { - "description": "min=274.52, mean=274.52, max=274.52, sum=549.041 (2)", - "tab": "General information", - "score": 274.5204678362573 - }, - "World Religions - # output tokens": { - "description": "min=0.994, mean=0.994, max=0.994, sum=1.988 (2)", - "tab": "General information", - "score": 0.9941520467836257 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" - } - } - }, - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.629, - "details": { - "tab": "Efficiency" - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/helm_mmlu/writer/palmyra-x-v3/12976629-cefe-4329-b974-bb17f88d385d.json b/data/helm_mmlu/writer/palmyra-x-v3/12976629-cefe-4329-b974-bb17f88d385d.json deleted file mode 100644 index 2eef769c8..000000000 --- a/data/helm_mmlu/writer/palmyra-x-v3/12976629-cefe-4329-b974-bb17f88d385d.json +++ /dev/null @@ -1,3021 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "helm_mmlu/writer_palmyra-x-v3/1770835937.459157", - "retrieved_timestamp": "1770835937.459157", - "source_metadata": { - "source_name": "helm_mmlu", - "source_type": "documentation", - "source_organization_name": "crfm", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Palmyra X V3 72B", - "id": "writer/palmyra-x-v3", - "developer": "writer", - "inference_platform": "unknown" - }, - "evaluation_results": [ - { - "evaluation_name": "MMLU All Subjects", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { 
- "evaluation_description": "EM on MMLU All Subjects", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.786, - "details": { - "description": "min=0.493, mean=0.786, max=0.979, sum=89.625 (114)", - "tab": "Accuracy", - "MMLU All Subjects - Observed inference time (s)": { - "description": "min=0.555, mean=0.663, max=1.566, sum=75.544 (114)", - "tab": "Efficiency", - "score": 0.6626657480593275 - }, - "MMLU All Subjects - # eval": { - "description": "min=100, mean=246.351, max=1534, sum=28084 (114)", - "tab": "General information", - "score": 246.35087719298247 - }, - "MMLU All Subjects - # train": { - "description": "min=5, mean=5, max=5, sum=570 (114)", - "tab": "General information", - "score": 5.0 - }, - "MMLU All Subjects - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (114)", - "tab": "General information", - "score": 0.0 - }, - "MMLU All Subjects - # prompt tokens": { - "description": "min=277.386, mean=627.489, max=2844.03, sum=71533.746 (114)", - "tab": "General information", - "score": 627.4890026560713 - }, - "MMLU All Subjects - # output tokens": { - "description": "min=1, mean=1, max=1, sum=114 (114)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": [ - "abstract_algebra", - "anatomy", - "astronomy", - "business_ethics", - "clinical_knowledge", - "college_biology", - "college_chemistry", - "college_computer_science", - "college_mathematics", - "college_medicine", - "college_physics", - "computer_security", - "conceptual_physics", - "econometrics", - "electrical_engineering", - "elementary_mathematics", - "formal_logic", - "global_facts", - "high_school_biology", - "high_school_chemistry", - "high_school_computer_science", - "high_school_european_history", - "high_school_geography", - "high_school_government_and_politics", - "high_school_macroeconomics", - "high_school_mathematics", - "high_school_microeconomics", - "high_school_physics", - "high_school_psychology", - "high_school_statistics", - "high_school_us_history", - "high_school_world_history", - "human_aging", - "human_sexuality", - "international_law", - "jurisprudence", - "logical_fallacies", - "machine_learning", - "management", - "marketing", - "medical_genetics", - "miscellaneous", - "moral_disputes", - "moral_scenarios", - "nutrition", - "philosophy", - "prehistory", - "professional_accounting", - "professional_law", - "professional_medicine", - "professional_psychology", - "public_relations", - "security_studies", - "sociology", - "us_foreign_policy", - "virology", - "world_religions" - ], - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": [ - "mmlu_abstract_algebra", - "mmlu_anatomy", - "mmlu_astronomy", - "mmlu_business_ethics", - "mmlu_clinical_knowledge", - "mmlu_college_biology", - "mmlu_college_chemistry", - "mmlu_college_computer_science", - "mmlu_college_mathematics", - "mmlu_college_medicine", - "mmlu_college_physics", - "mmlu_computer_security", - "mmlu_conceptual_physics", - "mmlu_econometrics", - "mmlu_electrical_engineering", - "mmlu_elementary_mathematics", - "mmlu_formal_logic", - "mmlu_global_facts", - "mmlu_high_school_biology", - "mmlu_high_school_chemistry", - "mmlu_high_school_computer_science", - "mmlu_high_school_european_history", - "mmlu_high_school_geography", - "mmlu_high_school_government_and_politics", - "mmlu_high_school_macroeconomics", - "mmlu_high_school_mathematics", - 
"mmlu_high_school_microeconomics", - "mmlu_high_school_physics", - "mmlu_high_school_psychology", - "mmlu_high_school_statistics", - "mmlu_high_school_us_history", - "mmlu_high_school_world_history", - "mmlu_human_aging", - "mmlu_human_sexuality", - "mmlu_international_law", - "mmlu_jurisprudence", - "mmlu_logical_fallacies", - "mmlu_machine_learning", - "mmlu_management", - "mmlu_marketing", - "mmlu_medical_genetics", - "mmlu_miscellaneous", - "mmlu_moral_disputes", - "mmlu_moral_scenarios", - "mmlu_nutrition", - "mmlu_philosophy", - "mmlu_prehistory", - "mmlu_professional_accounting", - "mmlu_professional_law", - "mmlu_professional_medicine", - "mmlu_professional_psychology", - "mmlu_public_relations", - "mmlu_security_studies", - "mmlu_sociology", - "mmlu_us_foreign_policy", - "mmlu_virology", - "mmlu_world_religions" - ] - } - } - }, - { - "evaluation_name": "Abstract Algebra", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Abstract Algebra", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.53, - "details": { - "description": "min=0.53, mean=0.53, max=0.53, sum=1.06 (2)", - "tab": "Accuracy", - "Abstract Algebra - Observed inference time (s)": { - "description": "min=0.62, mean=0.62, max=0.62, sum=1.239 (2)", - "tab": "Efficiency", - "score": 0.6195793676376343 - }, - "Abstract Algebra - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Abstract Algebra - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Abstract Algebra - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Abstract Algebra - # prompt tokens": { - "description": "min=371.38, mean=371.38, max=371.38, sum=742.76 (2)", - "tab": "General information", - "score": 371.38 - }, - "Abstract Algebra - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "abstract_algebra", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_abstract_algebra" - } - } - }, - { - "evaluation_name": "Anatomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Anatomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.733, - "details": { - "description": "min=0.733, mean=0.733, max=0.733, sum=1.467 (2)", - "tab": "Accuracy", - "Anatomy - Observed inference time (s)": { - "description": "min=0.586, mean=0.586, max=0.586, sum=1.172 (2)", - "tab": "Efficiency", - "score": 0.5858598179287381 - }, - "Anatomy - # eval": { - "description": "min=135, mean=135, max=135, sum=270 (2)", - "tab": "General information", - "score": 135.0 - }, - "Anatomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Anatomy - 
truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Anatomy - # prompt tokens": { - "description": "min=372.081, mean=372.081, max=372.081, sum=744.163 (2)", - "tab": "General information", - "score": 372.0814814814815 - }, - "Anatomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "anatomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_anatomy" - } - } - }, - { - "evaluation_name": "College Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on College Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.549, - "details": { - "description": "min=0.549, mean=0.549, max=0.549, sum=1.098 (2)", - "tab": "Accuracy", - "College Chemistry - Observed inference time (s)": { - "description": "min=0.664, mean=0.664, max=0.664, sum=1.327 (2)", - "tab": "Efficiency", - "score": 0.6636523914337158 - }, - "College Biology - Observed inference time (s)": { - "description": "min=0.575, mean=0.575, max=0.575, sum=1.15 (2)", - "tab": "Efficiency", - "score": 0.5751992679304547 - }, - "College Computer Science - Observed inference time (s)": { - "description": "min=0.867, mean=0.867, max=0.867, sum=1.734 (2)", - "tab": "Efficiency", - "score": 0.8668097257614136 - }, - "College Mathematics - Observed inference time (s)": { - "description": "min=0.591, mean=0.591, max=0.591, sum=1.182 (2)", - "tab": "Efficiency", - "score": 0.5912106204032898 - }, - "College Medicine - Observed inference time (s)": { - "description": "min=0.593, mean=0.593, max=0.593, sum=1.186 (2)", - "tab": "Efficiency", - "score": 0.5927534434147653 - }, - "College Physics - Observed inference time (s)": { - "description": "min=0.58, mean=0.58, max=0.58, sum=1.159 (2)", - "tab": "Efficiency", - "score": 0.5796795171849868 - }, - "College Chemistry - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Chemistry - # prompt tokens": { - "description": "min=545.4, mean=545.4, max=545.4, sum=1090.8 (2)", - "tab": "General information", - "score": 545.4 - }, - "College Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Biology - # eval": { - "description": "min=144, mean=144, max=144, sum=288 (2)", - "tab": "General information", - "score": 144.0 - }, - "College Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Biology - # prompt tokens": { - "description": "min=482.278, mean=482.278, max=482.278, sum=964.556 
(2)", - "tab": "General information", - "score": 482.27777777777777 - }, - "College Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Computer Science - # prompt tokens": { - "description": "min=852.15, mean=852.15, max=852.15, sum=1704.3 (2)", - "tab": "General information", - "score": 852.15 - }, - "College Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Mathematics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "College Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Mathematics - # prompt tokens": { - "description": "min=611.53, mean=611.53, max=611.53, sum=1223.06 (2)", - "tab": "General information", - "score": 611.53 - }, - "College Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Medicine - # eval": { - "description": "min=173, mean=173, max=173, sum=346 (2)", - "tab": "General information", - "score": 173.0 - }, - "College Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Medicine - # prompt tokens": { - "description": "min=530.301, mean=530.301, max=530.301, sum=1060.601 (2)", - "tab": "General information", - "score": 530.3005780346821 - }, - "College Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "College Physics - # eval": { - "description": "min=102, mean=102, max=102, sum=204 (2)", - "tab": "General information", - "score": 102.0 - }, - "College Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "College Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "College Physics - # prompt tokens": { - "description": "min=489.324, mean=489.324, max=489.324, sum=978.647 (2)", - "tab": "General information", - "score": 489.3235294117647 - }, - "College Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "college_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_college_physics" - } - } - }, - { - "evaluation_name": "Computer Security", - "source_data": { - "dataset_name": 
"helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Computer Security", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.78, - "details": { - "description": "min=0.78, mean=0.78, max=0.78, sum=1.56 (2)", - "tab": "Accuracy", - "Computer Security - Observed inference time (s)": { - "description": "min=0.613, mean=0.613, max=0.613, sum=1.227 (2)", - "tab": "Efficiency", - "score": 0.613369300365448 - }, - "Computer Security - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Computer Security - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Computer Security - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Computer Security - # prompt tokens": { - "description": "min=387.4, mean=387.4, max=387.4, sum=774.8 (2)", - "tab": "General information", - "score": 387.4 - }, - "Computer Security - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "computer_security", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_computer_security" - } - } - }, - { - "evaluation_name": "Econometrics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Econometrics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.649, - "details": { - "description": "min=0.649, mean=0.649, max=0.649, sum=1.298 (2)", - "tab": "Accuracy", - "Econometrics - Observed inference time (s)": { - "description": "min=0.783, mean=0.783, max=0.783, sum=1.566 (2)", - "tab": "Efficiency", - "score": 0.7830351319229394 - }, - "Econometrics - # eval": { - "description": "min=114, mean=114, max=114, sum=228 (2)", - "tab": "General information", - "score": 114.0 - }, - "Econometrics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Econometrics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Econometrics - # prompt tokens": { - "description": "min=624.07, mean=624.07, max=624.07, sum=1248.14 (2)", - "tab": "General information", - "score": 624.0701754385965 - }, - "Econometrics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "econometrics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_econometrics" - } - } - }, - { - "evaluation_name": "Global Facts", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - 
"metric_config": { - "evaluation_description": "EM on Global Facts", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.53, - "details": { - "description": "min=0.53, mean=0.53, max=0.53, sum=1.06 (2)", - "tab": "Accuracy", - "Global Facts - Observed inference time (s)": { - "description": "min=0.586, mean=0.586, max=0.586, sum=1.172 (2)", - "tab": "Efficiency", - "score": 0.5858692646026611 - }, - "Global Facts - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Global Facts - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Global Facts - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Global Facts - # prompt tokens": { - "description": "min=398.42, mean=398.42, max=398.42, sum=796.84 (2)", - "tab": "General information", - "score": 398.42 - }, - "Global Facts - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "global_facts", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_global_facts" - } - } - }, - { - "evaluation_name": "Jurisprudence", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Jurisprudence", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.88, - "details": { - "description": "min=0.88, mean=0.88, max=0.88, sum=1.759 (2)", - "tab": "Accuracy", - "Jurisprudence - Observed inference time (s)": { - "description": "min=0.581, mean=0.581, max=0.581, sum=1.162 (2)", - "tab": "Efficiency", - "score": 0.5810460448265076 - }, - "Jurisprudence - # eval": { - "description": "min=108, mean=108, max=108, sum=216 (2)", - "tab": "General information", - "score": 108.0 - }, - "Jurisprudence - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Jurisprudence - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Jurisprudence - # prompt tokens": { - "description": "min=418.722, mean=418.722, max=418.722, sum=837.444 (2)", - "tab": "General information", - "score": 418.72222222222223 - }, - "Jurisprudence - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "jurisprudence", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_jurisprudence" - } - } - }, - { - "evaluation_name": "Philosophy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Philosophy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.836, - "details": 
{ - "description": "min=0.836, mean=0.836, max=0.836, sum=1.672 (2)", - "tab": "Accuracy", - "Philosophy - Observed inference time (s)": { - "description": "min=0.576, mean=0.576, max=0.576, sum=1.152 (2)", - "tab": "Efficiency", - "score": 0.5761417744627336 - }, - "Philosophy - # eval": { - "description": "min=311, mean=311, max=311, sum=622 (2)", - "tab": "General information", - "score": 311.0 - }, - "Philosophy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Philosophy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Philosophy - # prompt tokens": { - "description": "min=353.704, mean=353.704, max=353.704, sum=707.408 (2)", - "tab": "General information", - "score": 353.7041800643087 - }, - "Philosophy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "philosophy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_philosophy" - } - } - }, - { - "evaluation_name": "Professional Psychology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Professional Psychology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.858, - "details": { - "description": "min=0.858, mean=0.858, max=0.858, sum=1.716 (2)", - "tab": "Accuracy", - "Professional Medicine - Observed inference time (s)": { - "description": "min=0.884, mean=0.884, max=0.884, sum=1.768 (2)", - "tab": "Efficiency", - "score": 0.8839500090655159 - }, - "Professional Accounting - Observed inference time (s)": { - "description": "min=0.711, mean=0.711, max=0.711, sum=1.423 (2)", - "tab": "Efficiency", - "score": 0.7114707704976941 - }, - "Professional Law - Observed inference time (s)": { - "description": "min=0.981, mean=0.981, max=0.981, sum=1.962 (2)", - "tab": "Efficiency", - "score": 0.9809994663377785 - }, - "Professional Psychology - Observed inference time (s)": { - "description": "min=0.598, mean=0.598, max=0.598, sum=1.196 (2)", - "tab": "Efficiency", - "score": 0.5978598594665527 - }, - "Professional Medicine - # eval": { - "description": "min=272, mean=272, max=272, sum=544 (2)", - "tab": "General information", - "score": 272.0 - }, - "Professional Medicine - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Medicine - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Medicine - # prompt tokens": { - "description": "min=1118.287, mean=1118.287, max=1118.287, sum=2236.574 (2)", - "tab": "General information", - "score": 1118.2867647058824 - }, - "Professional Medicine - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Accounting - # eval": { - "description": "min=282, mean=282, max=282, sum=564 (2)", - "tab": "General information", - "score": 282.0 - }, - "Professional Accounting - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": 
"General information", - "score": 5.0 - }, - "Professional Accounting - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Accounting - # prompt tokens": { - "description": "min=660.72, mean=660.72, max=660.72, sum=1321.44 (2)", - "tab": "General information", - "score": 660.7198581560284 - }, - "Professional Accounting - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Law - # eval": { - "description": "min=1534, mean=1534, max=1534, sum=3068 (2)", - "tab": "General information", - "score": 1534.0 - }, - "Professional Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Law - # prompt tokens": { - "description": "min=1658.73, mean=1658.73, max=1658.73, sum=3317.46 (2)", - "tab": "General information", - "score": 1658.7301173402868 - }, - "Professional Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Professional Psychology - # eval": { - "description": "min=612, mean=612, max=612, sum=1224 (2)", - "tab": "General information", - "score": 612.0 - }, - "Professional Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Professional Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Professional Psychology - # prompt tokens": { - "description": "min=597.574, mean=597.574, max=597.574, sum=1195.147 (2)", - "tab": "General information", - "score": 597.5735294117648 - }, - "Professional Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "professional_psychology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_professional_psychology" - } - } - }, - { - "evaluation_name": "Us Foreign Policy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Us Foreign Policy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.96, - "details": { - "description": "min=0.96, mean=0.96, max=0.96, sum=1.92 (2)", - "tab": "Accuracy", - "Us Foreign Policy - Observed inference time (s)": { - "description": "min=0.604, mean=0.604, max=0.604, sum=1.207 (2)", - "tab": "Efficiency", - "score": 0.6037013912200928 - }, - "Us Foreign Policy - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Us Foreign Policy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Us Foreign Policy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Us Foreign Policy - # prompt tokens": { - "description": 
"min=433.12, mean=433.12, max=433.12, sum=866.24 (2)", - "tab": "General information", - "score": 433.12 - }, - "Us Foreign Policy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "us_foreign_policy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_us_foreign_policy" - } - } - }, - { - "evaluation_name": "Astronomy", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Astronomy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.862, - "details": { - "description": "min=0.862, mean=0.862, max=0.862, sum=1.724 (2)", - "tab": "Accuracy", - "Astronomy - Observed inference time (s)": { - "description": "min=0.593, mean=0.593, max=0.593, sum=1.186 (2)", - "tab": "Efficiency", - "score": 0.5929083667303386 - }, - "Astronomy - # eval": { - "description": "min=152, mean=152, max=152, sum=304 (2)", - "tab": "General information", - "score": 152.0 - }, - "Astronomy - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Astronomy - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Astronomy - # prompt tokens": { - "description": "min=600.112, mean=600.112, max=600.112, sum=1200.224 (2)", - "tab": "General information", - "score": 600.1118421052631 - }, - "Astronomy - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "astronomy", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_astronomy" - } - } - }, - { - "evaluation_name": "Business Ethics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Business Ethics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.83, - "details": { - "description": "min=0.83, mean=0.83, max=0.83, sum=1.66 (2)", - "tab": "Accuracy", - "Business Ethics - Observed inference time (s)": { - "description": "min=0.598, mean=0.598, max=0.598, sum=1.196 (2)", - "tab": "Efficiency", - "score": 0.5981829071044922 - }, - "Business Ethics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Business Ethics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Business Ethics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Business Ethics - # prompt tokens": { - "description": "min=589.46, mean=589.46, max=589.46, sum=1178.92 (2)", - "tab": "General information", - "score": 589.46 - }, - "Business Ethics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": 
"General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "business_ethics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_business_ethics" - } - } - }, - { - "evaluation_name": "Clinical Knowledge", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Clinical Knowledge", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.804, - "details": { - "description": "min=0.804, mean=0.804, max=0.804, sum=1.608 (2)", - "tab": "Accuracy", - "Clinical Knowledge - Observed inference time (s)": { - "description": "min=0.575, mean=0.575, max=0.575, sum=1.15 (2)", - "tab": "Efficiency", - "score": 0.5750116924069962 - }, - "Clinical Knowledge - # eval": { - "description": "min=265, mean=265, max=265, sum=530 (2)", - "tab": "General information", - "score": 265.0 - }, - "Clinical Knowledge - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Clinical Knowledge - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Clinical Knowledge - # prompt tokens": { - "description": "min=423.925, mean=423.925, max=423.925, sum=847.849 (2)", - "tab": "General information", - "score": 423.92452830188677 - }, - "Clinical Knowledge - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "clinical_knowledge", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_clinical_knowledge" - } - } - }, - { - "evaluation_name": "Conceptual Physics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Conceptual Physics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.809, - "details": { - "description": "min=0.809, mean=0.809, max=0.809, sum=1.617 (2)", - "tab": "Accuracy", - "Conceptual Physics - Observed inference time (s)": { - "description": "min=0.58, mean=0.58, max=0.58, sum=1.161 (2)", - "tab": "Efficiency", - "score": 0.5802780881841132 - }, - "Conceptual Physics - # eval": { - "description": "min=235, mean=235, max=235, sum=470 (2)", - "tab": "General information", - "score": 235.0 - }, - "Conceptual Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Conceptual Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Conceptual Physics - # prompt tokens": { - "description": "min=313.723, mean=313.723, max=313.723, sum=627.447 (2)", - "tab": "General information", - "score": 313.72340425531917 - }, - "Conceptual Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - 
"additional_details": { - "subject": "conceptual_physics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_conceptual_physics" - } - } - }, - { - "evaluation_name": "Electrical Engineering", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Electrical Engineering", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.772, - "details": { - "description": "min=0.772, mean=0.772, max=0.772, sum=1.545 (2)", - "tab": "Accuracy", - "Electrical Engineering - Observed inference time (s)": { - "description": "min=0.583, mean=0.583, max=0.583, sum=1.165 (2)", - "tab": "Efficiency", - "score": 0.5827381166918525 - }, - "Electrical Engineering - # eval": { - "description": "min=145, mean=145, max=145, sum=290 (2)", - "tab": "General information", - "score": 145.0 - }, - "Electrical Engineering - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Electrical Engineering - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Electrical Engineering - # prompt tokens": { - "description": "min=430.345, mean=430.345, max=430.345, sum=860.69 (2)", - "tab": "General information", - "score": 430.3448275862069 - }, - "Electrical Engineering - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "electrical_engineering", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_electrical_engineering" - } - } - }, - { - "evaluation_name": "Elementary Mathematics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Elementary Mathematics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.661, - "details": { - "description": "min=0.661, mean=0.661, max=0.661, sum=1.323 (2)", - "tab": "Accuracy", - "Elementary Mathematics - Observed inference time (s)": { - "description": "min=0.584, mean=0.584, max=0.584, sum=1.167 (2)", - "tab": "Efficiency", - "score": 0.5836543033993433 - }, - "Elementary Mathematics - # eval": { - "description": "min=378, mean=378, max=378, sum=756 (2)", - "tab": "General information", - "score": 378.0 - }, - "Elementary Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Elementary Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Elementary Mathematics - # prompt tokens": { - "description": "min=506.09, mean=506.09, max=506.09, sum=1012.18 (2)", - "tab": "General information", - "score": 506.0899470899471 - }, - "Elementary Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - 
"additional_details": { - "subject": "elementary_mathematics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_elementary_mathematics" - } - } - }, - { - "evaluation_name": "Formal Logic", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Formal Logic", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.659, - "details": { - "description": "min=0.659, mean=0.659, max=0.659, sum=1.317 (2)", - "tab": "Accuracy", - "Formal Logic - Observed inference time (s)": { - "description": "min=0.597, mean=0.597, max=0.597, sum=1.194 (2)", - "tab": "Efficiency", - "score": 0.5971027309932406 - }, - "Formal Logic - # eval": { - "description": "min=126, mean=126, max=126, sum=252 (2)", - "tab": "General information", - "score": 126.0 - }, - "Formal Logic - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Formal Logic - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Formal Logic - # prompt tokens": { - "description": "min=641, mean=641, max=641, sum=1282 (2)", - "tab": "General information", - "score": 641.0 - }, - "Formal Logic - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "formal_logic", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_formal_logic" - } - } - }, - { - "evaluation_name": "High School World History", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on High School World History", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.911, - "details": { - "description": "min=0.911, mean=0.911, max=0.911, sum=1.823 (2)", - "tab": "Accuracy", - "High School Biology - Observed inference time (s)": { - "description": "min=0.584, mean=0.584, max=0.584, sum=1.168 (2)", - "tab": "Efficiency", - "score": 0.5838540715555991 - }, - "High School Chemistry - Observed inference time (s)": { - "description": "min=0.579, mean=0.579, max=0.579, sum=1.159 (2)", - "tab": "Efficiency", - "score": 0.5794280843781721 - }, - "High School Computer Science - Observed inference time (s)": { - "description": "min=0.873, mean=0.873, max=0.873, sum=1.745 (2)", - "tab": "Efficiency", - "score": 0.8726636576652527 - }, - "High School European History - Observed inference time (s)": { - "description": "min=1.532, mean=1.532, max=1.532, sum=3.063 (2)", - "tab": "Efficiency", - "score": 1.5316768602891402 - }, - "High School Geography - Observed inference time (s)": { - "description": "min=0.568, mean=0.568, max=0.568, sum=1.135 (2)", - "tab": "Efficiency", - "score": 0.5675288703706529 - }, - "High School Government And Politics - Observed inference time (s)": { - "description": "min=0.574, mean=0.574, max=0.574, sum=1.147 (2)", - "tab": "Efficiency", - "score": 0.573576919773082 - }, 
- "High School Macroeconomics - Observed inference time (s)": { - "description": "min=0.608, mean=0.608, max=0.608, sum=1.215 (2)", - "tab": "Efficiency", - "score": 0.607545349536798 - }, - "High School Mathematics - Observed inference time (s)": { - "description": "min=0.594, mean=0.594, max=0.594, sum=1.187 (2)", - "tab": "Efficiency", - "score": 0.5936917472768712 - }, - "High School Microeconomics - Observed inference time (s)": { - "description": "min=0.561, mean=0.561, max=0.561, sum=1.123 (2)", - "tab": "Efficiency", - "score": 0.5614581979623362 - }, - "High School Physics - Observed inference time (s)": { - "description": "min=0.594, mean=0.594, max=0.594, sum=1.189 (2)", - "tab": "Efficiency", - "score": 0.5943679051683438 - }, - "High School Psychology - Observed inference time (s)": { - "description": "min=0.595, mean=0.595, max=0.595, sum=1.189 (2)", - "tab": "Efficiency", - "score": 0.5945224263252469 - }, - "High School Statistics - Observed inference time (s)": { - "description": "min=0.889, mean=0.889, max=0.889, sum=1.778 (2)", - "tab": "Efficiency", - "score": 0.8891873856385549 - }, - "High School US History - Observed inference time (s)": { - "description": "min=1.566, mean=1.566, max=1.566, sum=3.131 (2)", - "tab": "Efficiency", - "score": 1.5656375043532427 - }, - "High School World History - Observed inference time (s)": { - "description": "min=0.876, mean=0.876, max=0.876, sum=1.751 (2)", - "tab": "Efficiency", - "score": 0.8755375081476783 - }, - "High School Biology - # eval": { - "description": "min=310, mean=310, max=310, sum=620 (2)", - "tab": "General information", - "score": 310.0 - }, - "High School Biology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Biology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Biology - # prompt tokens": { - "description": "min=540.748, mean=540.748, max=540.748, sum=1081.497 (2)", - "tab": "General information", - "score": 540.7483870967742 - }, - "High School Biology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Chemistry - # eval": { - "description": "min=203, mean=203, max=203, sum=406 (2)", - "tab": "General information", - "score": 203.0 - }, - "High School Chemistry - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Chemistry - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Chemistry - # prompt tokens": { - "description": "min=495.65, mean=495.65, max=495.65, sum=991.3 (2)", - "tab": "General information", - "score": 495.6502463054187 - }, - "High School Chemistry - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Computer Science - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "High School Computer Science - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Computer Science - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Computer Science - # prompt 
tokens": { - "description": "min=904.15, mean=904.15, max=904.15, sum=1808.3 (2)", - "tab": "General information", - "score": 904.15 - }, - "High School Computer Science - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School European History - # eval": { - "description": "min=165, mean=165, max=165, sum=330 (2)", - "tab": "General information", - "score": 165.0 - }, - "High School European History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School European History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School European History - # prompt tokens": { - "description": "min=2844.03, mean=2844.03, max=2844.03, sum=5688.061 (2)", - "tab": "General information", - "score": 2844.030303030303 - }, - "High School European History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Geography - # eval": { - "description": "min=198, mean=198, max=198, sum=396 (2)", - "tab": "General information", - "score": 198.0 - }, - "High School Geography - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Geography - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Geography - # prompt tokens": { - "description": "min=397.646, mean=397.646, max=397.646, sum=795.293 (2)", - "tab": "General information", - "score": 397.64646464646466 - }, - "High School Geography - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Government And Politics - # eval": { - "description": "min=193, mean=193, max=193, sum=386 (2)", - "tab": "General information", - "score": 193.0 - }, - "High School Government And Politics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Government And Politics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Government And Politics - # prompt tokens": { - "description": "min=478.073, mean=478.073, max=478.073, sum=956.145 (2)", - "tab": "General information", - "score": 478.07253886010363 - }, - "High School Government And Politics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Macroeconomics - # eval": { - "description": "min=390, mean=390, max=390, sum=780 (2)", - "tab": "General information", - "score": 390.0 - }, - "High School Macroeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Macroeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Macroeconomics - # prompt tokens": { - "description": "min=391.987, mean=391.987, max=391.987, sum=783.974 (2)", - "tab": "General information", - "score": 391.9871794871795 - }, - "High School Macroeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - 
"High School Mathematics - # eval": { - "description": "min=270, mean=270, max=270, sum=540 (2)", - "tab": "General information", - "score": 270.0 - }, - "High School Mathematics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Mathematics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Mathematics - # prompt tokens": { - "description": "min=526.352, mean=526.352, max=526.352, sum=1052.704 (2)", - "tab": "General information", - "score": 526.3518518518518 - }, - "High School Mathematics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Microeconomics - # eval": { - "description": "min=238, mean=238, max=238, sum=476 (2)", - "tab": "General information", - "score": 238.0 - }, - "High School Microeconomics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Microeconomics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Microeconomics - # prompt tokens": { - "description": "min=411.055, mean=411.055, max=411.055, sum=822.109 (2)", - "tab": "General information", - "score": 411.0546218487395 - }, - "High School Microeconomics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Physics - # eval": { - "description": "min=151, mean=151, max=151, sum=302 (2)", - "tab": "General information", - "score": 151.0 - }, - "High School Physics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Physics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Physics - # prompt tokens": { - "description": "min=553.669, mean=553.669, max=553.669, sum=1107.338 (2)", - "tab": "General information", - "score": 553.6688741721854 - }, - "High School Physics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Psychology - # eval": { - "description": "min=545, mean=545, max=545, sum=1090 (2)", - "tab": "General information", - "score": 545.0 - }, - "High School Psychology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Psychology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School Psychology - # prompt tokens": { - "description": "min=516.842, mean=516.842, max=516.842, sum=1033.684 (2)", - "tab": "General information", - "score": 516.8422018348624 - }, - "High School Psychology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School Statistics - # eval": { - "description": "min=216, mean=216, max=216, sum=432 (2)", - "tab": "General information", - "score": 216.0 - }, - "High School Statistics - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School Statistics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - 
"tab": "General information", - "score": 0.0 - }, - "High School Statistics - # prompt tokens": { - "description": "min=805, mean=805, max=805, sum=1610 (2)", - "tab": "General information", - "score": 805.0 - }, - "High School Statistics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School US History - # eval": { - "description": "min=204, mean=204, max=204, sum=408 (2)", - "tab": "General information", - "score": 204.0 - }, - "High School US History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School US History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School US History - # prompt tokens": { - "description": "min=2242.25, mean=2242.25, max=2242.25, sum=4484.5 (2)", - "tab": "General information", - "score": 2242.25 - }, - "High School US History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "High School World History - # eval": { - "description": "min=237, mean=237, max=237, sum=474 (2)", - "tab": "General information", - "score": 237.0 - }, - "High School World History - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "High School World History - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "High School World History - # prompt tokens": { - "description": "min=1438.561, mean=1438.561, max=1438.561, sum=2877.122 (2)", - "tab": "General information", - "score": 1438.5611814345991 - }, - "High School World History - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "high_school_world_history", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_high_school_world_history" - } - } - }, - { - "evaluation_name": "Human Sexuality", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Human Sexuality", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.924, - "details": { - "description": "min=0.924, mean=0.924, max=0.924, sum=1.847 (2)", - "tab": "Accuracy", - "Human Aging - Observed inference time (s)": { - "description": "min=0.577, mean=0.577, max=0.577, sum=1.154 (2)", - "tab": "Efficiency", - "score": 0.5767963167797824 - }, - "Human Sexuality - Observed inference time (s)": { - "description": "min=0.564, mean=0.564, max=0.564, sum=1.127 (2)", - "tab": "Efficiency", - "score": 0.5637276700434793 - }, - "Human Aging - # eval": { - "description": "min=223, mean=223, max=223, sum=446 (2)", - "tab": "General information", - "score": 223.0 - }, - "Human Aging - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Aging - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Aging - # prompt tokens": { - 
"description": "min=324.48, mean=324.48, max=324.48, sum=648.96 (2)", - "tab": "General information", - "score": 324.47982062780267 - }, - "Human Aging - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Human Sexuality - # eval": { - "description": "min=131, mean=131, max=131, sum=262 (2)", - "tab": "General information", - "score": 131.0 - }, - "Human Sexuality - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Human Sexuality - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Human Sexuality - # prompt tokens": { - "description": "min=357.626, mean=357.626, max=357.626, sum=715.252 (2)", - "tab": "General information", - "score": 357.62595419847327 - }, - "Human Sexuality - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "human_sexuality", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_human_sexuality" - } - } - }, - { - "evaluation_name": "International Law", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on International Law", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.909, - "details": { - "description": "min=0.909, mean=0.909, max=0.909, sum=1.818 (2)", - "tab": "Accuracy", - "International Law - Observed inference time (s)": { - "description": "min=0.603, mean=0.603, max=0.603, sum=1.205 (2)", - "tab": "Efficiency", - "score": 0.6025364970372729 - }, - "International Law - # eval": { - "description": "min=121, mean=121, max=121, sum=242 (2)", - "tab": "General information", - "score": 121.0 - }, - "International Law - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "International Law - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "International Law - # prompt tokens": { - "description": "min=639.843, mean=639.843, max=639.843, sum=1279.686 (2)", - "tab": "General information", - "score": 639.8429752066115 - }, - "International Law - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "international_law", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_international_law" - } - } - }, - { - "evaluation_name": "Logical Fallacies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Logical Fallacies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.877, - "details": { - "description": "min=0.877, mean=0.877, max=0.877, sum=1.755 (2)", - "tab": "Accuracy", - "Logical Fallacies 
- Observed inference time (s)": { - "description": "min=0.577, mean=0.577, max=0.577, sum=1.154 (2)", - "tab": "Efficiency", - "score": 0.5770467907373159 - }, - "Logical Fallacies - # eval": { - "description": "min=163, mean=163, max=163, sum=326 (2)", - "tab": "General information", - "score": 163.0 - }, - "Logical Fallacies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Logical Fallacies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Logical Fallacies - # prompt tokens": { - "description": "min=454.227, mean=454.227, max=454.227, sum=908.454 (2)", - "tab": "General information", - "score": 454.2269938650307 - }, - "Logical Fallacies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "logical_fallacies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_logical_fallacies" - } - } - }, - { - "evaluation_name": "Machine Learning", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Machine Learning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.625, - "details": { - "description": "min=0.625, mean=0.625, max=0.625, sum=1.25 (2)", - "tab": "Accuracy", - "Machine Learning - Observed inference time (s)": { - "description": "min=0.612, mean=0.612, max=0.612, sum=1.223 (2)", - "tab": "Efficiency", - "score": 0.6116326642887933 - }, - "Machine Learning - # eval": { - "description": "min=112, mean=112, max=112, sum=224 (2)", - "tab": "General information", - "score": 112.0 - }, - "Machine Learning - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Machine Learning - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Machine Learning - # prompt tokens": { - "description": "min=671.598, mean=671.598, max=671.598, sum=1343.196 (2)", - "tab": "General information", - "score": 671.5982142857143 - }, - "Machine Learning - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "machine_learning", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_machine_learning" - } - } - }, - { - "evaluation_name": "Management", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Management", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.903, - "details": { - "description": "min=0.903, mean=0.903, max=0.903, sum=1.806 (2)", - "tab": "Accuracy", - "Management - Observed inference time (s)": { - "description": "min=0.555, mean=0.555, max=0.555, sum=1.111 (2)", - "tab": "Efficiency", - "score": 
0.5553541276061419 - }, - "Management - # eval": { - "description": "min=103, mean=103, max=103, sum=206 (2)", - "tab": "General information", - "score": 103.0 - }, - "Management - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Management - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Management - # prompt tokens": { - "description": "min=292.34, mean=292.34, max=292.34, sum=584.68 (2)", - "tab": "General information", - "score": 292.3398058252427 - }, - "Management - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "management", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_management" - } - } - }, - { - "evaluation_name": "Marketing", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Marketing", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.94, - "details": { - "description": "min=0.94, mean=0.94, max=0.94, sum=1.88 (2)", - "tab": "Accuracy", - "Marketing - Observed inference time (s)": { - "description": "min=0.567, mean=0.567, max=0.567, sum=1.133 (2)", - "tab": "Efficiency", - "score": 0.56665647131765 - }, - "Marketing - # eval": { - "description": "min=234, mean=234, max=234, sum=468 (2)", - "tab": "General information", - "score": 234.0 - }, - "Marketing - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Marketing - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Marketing - # prompt tokens": { - "description": "min=438.697, mean=438.697, max=438.697, sum=877.393 (2)", - "tab": "General information", - "score": 438.6965811965812 - }, - "Marketing - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "marketing", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_marketing" - } - } - }, - { - "evaluation_name": "Medical Genetics", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Medical Genetics", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.83, - "details": { - "description": "min=0.83, mean=0.83, max=0.83, sum=1.66 (2)", - "tab": "Accuracy", - "Medical Genetics - Observed inference time (s)": { - "description": "min=0.566, mean=0.566, max=0.566, sum=1.131 (2)", - "tab": "Efficiency", - "score": 0.5655512261390686 - }, - "Medical Genetics - # eval": { - "description": "min=100, mean=100, max=100, sum=200 (2)", - "tab": "General information", - "score": 100.0 - }, - "Medical Genetics - # train": { - "description": "min=5, mean=5, max=5, sum=10 
(2)", - "tab": "General information", - "score": 5.0 - }, - "Medical Genetics - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Medical Genetics - # prompt tokens": { - "description": "min=352.71, mean=352.71, max=352.71, sum=705.42 (2)", - "tab": "General information", - "score": 352.71 - }, - "Medical Genetics - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "medical_genetics", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_medical_genetics" - } - } - }, - { - "evaluation_name": "Miscellaneous", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Miscellaneous", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.894, - "details": { - "description": "min=0.894, mean=0.894, max=0.894, sum=1.788 (2)", - "tab": "Accuracy", - "Miscellaneous - Observed inference time (s)": { - "description": "min=0.571, mean=0.571, max=0.571, sum=1.142 (2)", - "tab": "Efficiency", - "score": 0.5712210739252668 - }, - "Miscellaneous - # eval": { - "description": "min=783, mean=783, max=783, sum=1566 (2)", - "tab": "General information", - "score": 783.0 - }, - "Miscellaneous - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Miscellaneous - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Miscellaneous - # prompt tokens": { - "description": "min=314.847, mean=314.847, max=314.847, sum=629.693 (2)", - "tab": "General information", - "score": 314.84674329501917 - }, - "Miscellaneous - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "miscellaneous", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_miscellaneous" - } - } - }, - { - "evaluation_name": "Moral Scenarios", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Moral Scenarios", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.562, - "details": { - "description": "min=0.562, mean=0.562, max=0.562, sum=1.124 (2)", - "tab": "Accuracy", - "Moral Disputes - Observed inference time (s)": { - "description": "min=0.572, mean=0.572, max=0.572, sum=1.145 (2)", - "tab": "Efficiency", - "score": 0.5724084032753299 - }, - "Moral Scenarios - Observed inference time (s)": { - "description": "min=0.583, mean=0.583, max=0.583, sum=1.166 (2)", - "tab": "Efficiency", - "score": 0.5827599754546607 - }, - "Moral Disputes - # eval": { - "description": "min=346, mean=346, max=346, sum=692 (2)", - "tab": "General information", - "score": 346.0 - }, - "Moral Disputes - # train": { - "description": "min=5, mean=5, max=5, 
sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Disputes - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Disputes - # prompt tokens": { - "description": "min=497.329, mean=497.329, max=497.329, sum=994.659 (2)", - "tab": "General information", - "score": 497.32947976878614 - }, - "Moral Disputes - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - }, - "Moral Scenarios - # eval": { - "description": "min=895, mean=895, max=895, sum=1790 (2)", - "tab": "General information", - "score": 895.0 - }, - "Moral Scenarios - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Moral Scenarios - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Moral Scenarios - # prompt tokens": { - "description": "min=664.482, mean=664.482, max=664.482, sum=1328.963 (2)", - "tab": "General information", - "score": 664.4815642458101 - }, - "Moral Scenarios - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "moral_scenarios", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_moral_scenarios" - } - } - }, - { - "evaluation_name": "Nutrition", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Nutrition", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.856, - "details": { - "description": "min=0.856, mean=0.856, max=0.856, sum=1.712 (2)", - "tab": "Accuracy", - "Nutrition - Observed inference time (s)": { - "description": "min=0.59, mean=0.59, max=0.59, sum=1.18 (2)", - "tab": "Efficiency", - "score": 0.5898437850615558 - }, - "Nutrition - # eval": { - "description": "min=306, mean=306, max=306, sum=612 (2)", - "tab": "General information", - "score": 306.0 - }, - "Nutrition - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Nutrition - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Nutrition - # prompt tokens": { - "description": "min=584.69, mean=584.69, max=584.69, sum=1169.379 (2)", - "tab": "General information", - "score": 584.6895424836601 - }, - "Nutrition - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "nutrition", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_nutrition" - } - } - }, - { - "evaluation_name": "Prehistory", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Prehistory", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.87, - "details": { - "description": "min=0.87, mean=0.87, max=0.87, sum=1.741 (2)", - "tab": "Accuracy", - "Prehistory - Observed inference time (s)": { - "description": "min=0.585, mean=0.585, max=0.585, sum=1.17 (2)", - "tab": "Efficiency", - "score": 0.5852300509994413 - }, - "Prehistory - # eval": { - "description": "min=324, mean=324, max=324, sum=648 (2)", - "tab": "General information", - "score": 324.0 - }, - "Prehistory - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Prehistory - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Prehistory - # prompt tokens": { - "description": "min=524.454, mean=524.454, max=524.454, sum=1048.907 (2)", - "tab": "General information", - "score": 524.4537037037037 - }, - "Prehistory - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "prehistory", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_prehistory" - } - } - }, - { - "evaluation_name": "Public Relations", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Public Relations", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.773, - "details": { - "description": "min=0.773, mean=0.773, max=0.773, sum=1.545 (2)", - "tab": "Accuracy", - "Public Relations - Observed inference time (s)": { - "description": "min=0.567, mean=0.567, max=0.567, sum=1.134 (2)", - "tab": "Efficiency", - "score": 0.5669147144664418 - }, - "Public Relations - # eval": { - "description": "min=110, mean=110, max=110, sum=220 (2)", - "tab": "General information", - "score": 110.0 - }, - "Public Relations - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Public Relations - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Public Relations - # prompt tokens": { - "description": "min=420.609, mean=420.609, max=420.609, sum=841.218 (2)", - "tab": "General information", - "score": 420.6090909090909 - }, - "Public Relations - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "public_relations", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_public_relations" - } - } - }, - { - "evaluation_name": "Security Studies", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Security Studies", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.833, - "details": { - "description": "min=0.833, mean=0.833, max=0.833, sum=1.665 (2)", - "tab": "Accuracy", - "Security Studies - Observed 
inference time (s)": { - "description": "min=0.864, mean=0.864, max=0.864, sum=1.728 (2)", - "tab": "Efficiency", - "score": 0.8641960144042968 - }, - "Security Studies - # eval": { - "description": "min=245, mean=245, max=245, sum=490 (2)", - "tab": "General information", - "score": 245.0 - }, - "Security Studies - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Security Studies - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Security Studies - # prompt tokens": { - "description": "min=1196.433, mean=1196.433, max=1196.433, sum=2392.865 (2)", - "tab": "General information", - "score": 1196.4326530612245 - }, - "Security Studies - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "security_studies", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_security_studies" - } - } - }, - { - "evaluation_name": "Sociology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Sociology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.91, - "details": { - "description": "min=0.91, mean=0.91, max=0.91, sum=1.821 (2)", - "tab": "Accuracy", - "Sociology - Observed inference time (s)": { - "description": "min=0.579, mean=0.579, max=0.579, sum=1.158 (2)", - "tab": "Efficiency", - "score": 0.5788582047419761 - }, - "Sociology - # eval": { - "description": "min=201, mean=201, max=201, sum=402 (2)", - "tab": "General information", - "score": 201.0 - }, - "Sociology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Sociology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Sociology - # prompt tokens": { - "description": "min=446.512, mean=446.512, max=446.512, sum=893.025 (2)", - "tab": "General information", - "score": 446.5124378109453 - }, - "Sociology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "sociology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_sociology" - } - } - }, - { - "evaluation_name": "Virology", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on Virology", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.572, - "details": { - "description": "min=0.572, mean=0.572, max=0.572, sum=1.145 (2)", - "tab": "Accuracy", - "Virology - Observed inference time (s)": { - "description": "min=0.569, mean=0.569, max=0.569, sum=1.138 (2)", - "tab": "Efficiency", - "score": 0.5690187689769699 - }, - "Virology - # eval": { - "description": "min=166, mean=166, max=166, 
sum=332 (2)", - "tab": "General information", - "score": 166.0 - }, - "Virology - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "Virology - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "Virology - # prompt tokens": { - "description": "min=352.753, mean=352.753, max=352.753, sum=705.506 (2)", - "tab": "General information", - "score": 352.7530120481928 - }, - "Virology - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "virology", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_virology" - } - } - }, - { - "evaluation_name": "World Religions", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "EM on World Religions", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.877, - "details": { - "description": "min=0.877, mean=0.877, max=0.877, sum=1.754 (2)", - "tab": "Accuracy", - "World Religions - Observed inference time (s)": { - "description": "min=0.579, mean=0.579, max=0.579, sum=1.159 (2)", - "tab": "Efficiency", - "score": 0.5794550257119518 - }, - "World Religions - # eval": { - "description": "min=171, mean=171, max=171, sum=342 (2)", - "tab": "General information", - "score": 171.0 - }, - "World Religions - # train": { - "description": "min=5, mean=5, max=5, sum=10 (2)", - "tab": "General information", - "score": 5.0 - }, - "World Religions - truncated": { - "description": "min=0, mean=0, max=0, sum=0 (2)", - "tab": "General information", - "score": 0.0 - }, - "World Religions - # prompt tokens": { - "description": "min=277.386, mean=277.386, max=277.386, sum=554.772 (2)", - "tab": "General information", - "score": 277.3859649122807 - }, - "World Religions - # output tokens": { - "description": "min=1, mean=1, max=1, sum=2 (2)", - "tab": "General information", - "score": 1.0 - } - } - }, - "generation_config": { - "additional_details": { - "subject": "world_religions", - "method": "multiple_choice_joint", - "eval_split": "test", - "groups": "mmlu_world_religions" - } - } - }, - { - "evaluation_name": "Mean win rate", - "source_data": { - "dataset_name": "helm_mmlu", - "source_type": "url", - "url": [ - "https://storage.googleapis.com/crfm-helm-public/mmlu/benchmark_output/releases/v1.13.0/groups/mmlu_subjects.json" - ] - }, - "metric_config": { - "evaluation_description": "How many models this model outperforms on average (over columns).", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.325, - "details": { - "tab": "Efficiency" - } - }, - "generation_config": { - "additional_details": {} - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/0-hero/Matter-0.2-7B-DPO/0d7928c3-c769-474e-8249-7a5c70c4c559.json b/data/hfopenllm_v2/0-hero/Matter-0.2-7B-DPO/0d7928c3-c769-474e-8249-7a5c70c4c559.json deleted file mode 100644 index f776710f3..000000000 --- a/data/hfopenllm_v2/0-hero/Matter-0.2-7B-DPO/0d7928c3-c769-474e-8249-7a5c70c4c559.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - 
"schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/0-hero_Matter-0.2-7B-DPO/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Matter-0.2-7B-DPO", - "id": "0-hero/Matter-0.2-7B-DPO", - "developer": "0-hero", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3303 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3596 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0144 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2592 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3814 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1164 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/01-ai/Yi-1.5-34B-32K/f63536ed-752b-4538-9b92-2514a617a4bf.json b/data/hfopenllm_v2/01-ai/Yi-1.5-34B-32K/f63536ed-752b-4538-9b92-2514a617a4bf.json deleted file mode 100644 index 7d0d73c85..000000000 --- a/data/hfopenllm_v2/01-ai/Yi-1.5-34B-32K/f63536ed-752b-4538-9b92-2514a617a4bf.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/01-ai_Yi-1.5-34B-32K/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - 
"evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Yi-1.5-34B-32K", - "id": "01-ai/Yi-1.5-34B-32K", - "developer": "01-ai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 34.389 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3119 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6016 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1541 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3633 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4398 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4709 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/01-ai/Yi-1.5-34B-Chat-16K/8ff13de2-ea43-4392-992f-ba70b6023e96.json b/data/hfopenllm_v2/01-ai/Yi-1.5-34B-Chat-16K/8ff13de2-ea43-4392-992f-ba70b6023e96.json deleted file mode 100644 index 8682b3811..000000000 --- a/data/hfopenllm_v2/01-ai/Yi-1.5-34B-Chat-16K/8ff13de2-ea43-4392-992f-ba70b6023e96.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/01-ai_Yi-1.5-34B-Chat-16K/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Yi-1.5-34B-Chat-16K", - "id": "01-ai/Yi-1.5-34B-Chat-16K", - "developer": "01-ai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": 
"LlamaForCausalLM", - "params_billions": 34.389 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4564 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.61 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2137 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3381 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4398 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4545 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/01-ai/Yi-1.5-34B-Chat/02bac8a7-bd09-4e73-979a-7dbaa7a8ed75.json b/data/hfopenllm_v2/01-ai/Yi-1.5-34B-Chat/02bac8a7-bd09-4e73-979a-7dbaa7a8ed75.json deleted file mode 100644 index 1a02c9bdc..000000000 --- a/data/hfopenllm_v2/01-ai/Yi-1.5-34B-Chat/02bac8a7-bd09-4e73-979a-7dbaa7a8ed75.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/01-ai_Yi-1.5-34B-Chat/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Yi-1.5-34B-Chat", - "id": "01-ai/Yi-1.5-34B-Chat", - "developer": "01-ai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 34.389 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on 
IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6067 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6084 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2772 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3649 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4282 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.452 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/01-ai/Yi-1.5-34B/74e4406d-b2b6-4c3f-b059-f52cccf1fff4.json b/data/hfopenllm_v2/01-ai/Yi-1.5-34B/74e4406d-b2b6-4c3f-b059-f52cccf1fff4.json deleted file mode 100644 index 948057bc5..000000000 --- a/data/hfopenllm_v2/01-ai/Yi-1.5-34B/74e4406d-b2b6-4c3f-b059-f52cccf1fff4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/01-ai_Yi-1.5-34B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Yi-1.5-34B", - "id": "01-ai/Yi-1.5-34B", - "developer": "01-ai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 34.389 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2841 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5976 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1533 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3658 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4236 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4666 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/01-ai/Yi-1.5-6B-Chat/ec8a6d6c-b8ea-48a3-9af6-d357e0057ec1.json b/data/hfopenllm_v2/01-ai/Yi-1.5-6B-Chat/ec8a6d6c-b8ea-48a3-9af6-d357e0057ec1.json deleted file mode 100644 index 3a37bdc49..000000000 --- a/data/hfopenllm_v2/01-ai/Yi-1.5-6B-Chat/ec8a6d6c-b8ea-48a3-9af6-d357e0057ec1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/01-ai_Yi-1.5-6B-Chat/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Yi-1.5-6B-Chat", - "id": "01-ai/Yi-1.5-6B-Chat", - "developer": "01-ai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 6.061 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5145 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4571 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - 
"source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1624 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.302 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4392 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3193 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/01-ai/Yi-1.5-6B/05307b41-d832-4533-99bd-c8608bf8e64c.json b/data/hfopenllm_v2/01-ai/Yi-1.5-6B/05307b41-d832-4533-99bd-c8608bf8e64c.json deleted file mode 100644 index 8abcdb009..000000000 --- a/data/hfopenllm_v2/01-ai/Yi-1.5-6B/05307b41-d832-4533-99bd-c8608bf8e64c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/01-ai_Yi-1.5-6B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Yi-1.5-6B", - "id": "01-ai/Yi-1.5-6B", - "developer": "01-ai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 6.061 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2617 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4493 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0665 - } - }, - { - 
"evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3138 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4374 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3144 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/01-ai/Yi-1.5-9B-32K/c09bd9b0-6f85-4120-94a9-b628c68bccb7.json b/data/hfopenllm_v2/01-ai/Yi-1.5-9B-32K/c09bd9b0-6f85-4120-94a9-b628c68bccb7.json deleted file mode 100644 index 510b97e1d..000000000 --- a/data/hfopenllm_v2/01-ai/Yi-1.5-9B-32K/c09bd9b0-6f85-4120-94a9-b628c68bccb7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/01-ai_Yi-1.5-9B-32K/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Yi-1.5-9B-32K", - "id": "01-ai/Yi-1.5-9B-32K", - "developer": "01-ai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.829 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2303 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4963 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.108 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - 
}, - "score_details": { - "score": 0.3591 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4186 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3765 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/01-ai/Yi-1.5-9B-Chat-16K/9f971385-1146-4436-91a6-0e52d4db1f07.json b/data/hfopenllm_v2/01-ai/Yi-1.5-9B-Chat-16K/9f971385-1146-4436-91a6-0e52d4db1f07.json deleted file mode 100644 index 67dbc9f74..000000000 --- a/data/hfopenllm_v2/01-ai/Yi-1.5-9B-Chat-16K/9f971385-1146-4436-91a6-0e52d4db1f07.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/01-ai_Yi-1.5-9B-Chat-16K/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Yi-1.5-9B-Chat-16K", - "id": "01-ai/Yi-1.5-9B-Chat-16K", - "developer": "01-ai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.829 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4214 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5153 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1782 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3087 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4099 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3994 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/01-ai/Yi-1.5-9B-Chat/80ed14ca-b4cd-4ceb-8fdb-24705e47bd0e.json b/data/hfopenllm_v2/01-ai/Yi-1.5-9B-Chat/80ed14ca-b4cd-4ceb-8fdb-24705e47bd0e.json deleted file mode 100644 index 9ac18fdab..000000000 --- a/data/hfopenllm_v2/01-ai/Yi-1.5-9B-Chat/80ed14ca-b4cd-4ceb-8fdb-24705e47bd0e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/01-ai_Yi-1.5-9B-Chat/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Yi-1.5-9B-Chat", - "id": "01-ai/Yi-1.5-9B-Chat", - "developer": "01-ai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.829 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6046 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5559 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2258 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3347 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4259 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - 
}, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3975 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/01-ai/Yi-1.5-9B/db88e3f5-58a9-4783-9093-a6df96483342.json b/data/hfopenllm_v2/01-ai/Yi-1.5-9B/db88e3f5-58a9-4783-9093-a6df96483342.json deleted file mode 100644 index 465841ce2..000000000 --- a/data/hfopenllm_v2/01-ai/Yi-1.5-9B/db88e3f5-58a9-4783-9093-a6df96483342.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/01-ai_Yi-1.5-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Yi-1.5-9B", - "id": "01-ai/Yi-1.5-9B", - "developer": "01-ai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.829 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2936 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5143 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.114 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3792 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4328 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3916 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/01-ai/Yi-34B-200K/8cd90f8a-d8dc-469b-95b9-260fcef804d2.json b/data/hfopenllm_v2/01-ai/Yi-34B-200K/8cd90f8a-d8dc-469b-95b9-260fcef804d2.json deleted file mode 100644 index ba5c90e1f..000000000 --- a/data/hfopenllm_v2/01-ai/Yi-34B-200K/8cd90f8a-d8dc-469b-95b9-260fcef804d2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/01-ai_Yi-34B-200K/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Yi-34B-200K", - "id": "01-ai/Yi-34B-200K", - "developer": "01-ai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 34.389 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1542 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5442 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0574 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3565 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3817 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4535 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/01-ai/Yi-34B-Chat/b2c82703-2b5c-407d-b84f-a8f8261ac894.json b/data/hfopenllm_v2/01-ai/Yi-34B-Chat/b2c82703-2b5c-407d-b84f-a8f8261ac894.json deleted file mode 100644 index a62cda89d..000000000 --- a/data/hfopenllm_v2/01-ai/Yi-34B-Chat/b2c82703-2b5c-407d-b84f-a8f8261ac894.json +++ /dev/null @@ 
-1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/01-ai_Yi-34B-Chat/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Yi-34B-Chat", - "id": "01-ai/Yi-34B-Chat", - "developer": "01-ai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 34.389 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4699 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5561 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0627 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3381 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3978 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4093 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/01-ai/Yi-34B/55462e67-5eca-4e9d-9095-51fcf12de5fa.json b/data/hfopenllm_v2/01-ai/Yi-34B/55462e67-5eca-4e9d-9095-51fcf12de5fa.json deleted file mode 100644 index 1781d005a..000000000 --- a/data/hfopenllm_v2/01-ai/Yi-34B/55462e67-5eca-4e9d-9095-51fcf12de5fa.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/01-ai_Yi-34B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": 
"third_party" - }, - "model_info": { - "name": "Yi-34B", - "id": "01-ai/Yi-34B", - "developer": "01-ai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 34.389 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3046 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5457 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0514 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3666 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4119 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4412 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/01-ai/Yi-6B-200K/25a119f0-5eaa-4fa9-8cd4-e0f437ada456.json b/data/hfopenllm_v2/01-ai/Yi-6B-200K/25a119f0-5eaa-4fa9-8cd4-e0f437ada456.json deleted file mode 100644 index 8ffbe3e70..000000000 --- a/data/hfopenllm_v2/01-ai/Yi-6B-200K/25a119f0-5eaa-4fa9-8cd4-e0f437ada456.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/01-ai_Yi-6B-200K/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Yi-6B-200K", - "id": "01-ai/Yi-6B-200K", - "developer": "01-ai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 6.061 - } - }, - "evaluation_results": [ - { - "evaluation_name": 
"IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0843 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4289 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0181 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2819 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4587 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2844 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/01-ai/Yi-6B-Chat/efc036b6-d8de-4393-87a1-d4f86fb44d91.json b/data/hfopenllm_v2/01-ai/Yi-6B-Chat/efc036b6-d8de-4393-87a1-d4f86fb44d91.json deleted file mode 100644 index a454f4866..000000000 --- a/data/hfopenllm_v2/01-ai/Yi-6B-Chat/efc036b6-d8de-4393-87a1-d4f86fb44d91.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/01-ai_Yi-6B-Chat/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Yi-6B-Chat", - "id": "01-ai/Yi-6B-Chat", - "developer": "01-ai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 6.061 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.3395 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4133 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0136 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2945 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3688 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3061 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/01-ai/Yi-6B/a5144406-eb85-43b2-a49d-be6b06d6b04a.json b/data/hfopenllm_v2/01-ai/Yi-6B/a5144406-eb85-43b2-a49d-be6b06d6b04a.json deleted file mode 100644 index b7fc1616d..000000000 --- a/data/hfopenllm_v2/01-ai/Yi-6B/a5144406-eb85-43b2-a49d-be6b06d6b04a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/01-ai_Yi-6B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Yi-6B", - "id": "01-ai/Yi-6B", - "developer": "01-ai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 6.061 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2893 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.4309 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0159 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2693 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3937 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2991 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/01-ai/Yi-9B-200K/900184ad-656d-416b-956f-5f6e3a991d1b.json b/data/hfopenllm_v2/01-ai/Yi-9B-200K/900184ad-656d-416b-956f-5f6e3a991d1b.json deleted file mode 100644 index de7d6ef80..000000000 --- a/data/hfopenllm_v2/01-ai/Yi-9B-200K/900184ad-656d-416b-956f-5f6e3a991d1b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/01-ai_Yi-9B-200K/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Yi-9B-200K", - "id": "01-ai/Yi-9B-200K", - "developer": "01-ai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.829 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2327 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4793 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0665 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3154 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4294 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3622 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/01-ai/Yi-9B/7a58954a-5d7d-4640-99fd-773249640237.json b/data/hfopenllm_v2/01-ai/Yi-9B/7a58954a-5d7d-4640-99fd-773249640237.json deleted file mode 100644 index f8eee73fd..000000000 --- a/data/hfopenllm_v2/01-ai/Yi-9B/7a58954a-5d7d-4640-99fd-773249640237.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/01-ai_Yi-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Yi-9B", - "id": "01-ai/Yi-9B", - "developer": "01-ai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.829 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2709 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.494 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0559 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on 
GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.318 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4054 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3574 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/01-ai/Yi-Coder-9B-Chat/4ea3146c-b912-424a-b0a9-7c37348348c8.json b/data/hfopenllm_v2/01-ai/Yi-Coder-9B-Chat/4ea3146c-b912-424a-b0a9-7c37348348c8.json deleted file mode 100644 index aa071d68a..000000000 --- a/data/hfopenllm_v2/01-ai/Yi-Coder-9B-Chat/4ea3146c-b912-424a-b0a9-7c37348348c8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/01-ai_Yi-Coder-9B-Chat/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Yi-Coder-9B-Chat", - "id": "01-ai/Yi-Coder-9B-Chat", - "developer": "01-ai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.829 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4817 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4814 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.04 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2475 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": 
"TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3992 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2425 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/1-800-LLMs/Qwen-2.5-14B-Hindi-Custom-Instruct/b0276278-6d86-49c0-a246-cd9110ac1deb.json b/data/hfopenllm_v2/1-800-LLMs/Qwen-2.5-14B-Hindi-Custom-Instruct/b0276278-6d86-49c0-a246-cd9110ac1deb.json deleted file mode 100644 index 14f396bdb..000000000 --- a/data/hfopenllm_v2/1-800-LLMs/Qwen-2.5-14B-Hindi-Custom-Instruct/b0276278-6d86-49c0-a246-cd9110ac1deb.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/1-800-LLMs_Qwen-2.5-14B-Hindi-Custom-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen-2.5-14B-Hindi-Custom-Instruct", - "id": "1-800-LLMs/Qwen-2.5-14B-Hindi-Custom-Instruct", - "developer": "1-800-LLMs", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3077 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6284 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3112 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.37 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4491 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5164 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/1-800-LLMs/Qwen-2.5-14B-Hindi/04216f67-1385-43bf-b7de-5bae7a60f379.json b/data/hfopenllm_v2/1-800-LLMs/Qwen-2.5-14B-Hindi/04216f67-1385-43bf-b7de-5bae7a60f379.json deleted file mode 100644 index 96c0dfd9a..000000000 --- a/data/hfopenllm_v2/1-800-LLMs/Qwen-2.5-14B-Hindi/04216f67-1385-43bf-b7de-5bae7a60f379.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/1-800-LLMs_Qwen-2.5-14B-Hindi/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen-2.5-14B-Hindi", - "id": "1-800-LLMs/Qwen-2.5-14B-Hindi", - "developer": "1-800-LLMs", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5826 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6524 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3331 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3624 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4489 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5263 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/1024m/PHI-4-Hindi/fbf7b76b-7ced-4217-8e14-1d02184e271c.json b/data/hfopenllm_v2/1024m/PHI-4-Hindi/fbf7b76b-7ced-4217-8e14-1d02184e271c.json deleted file mode 100644 index 0b0fe7e6b..000000000 --- a/data/hfopenllm_v2/1024m/PHI-4-Hindi/fbf7b76b-7ced-4217-8e14-1d02184e271c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/1024m_PHI-4-Hindi/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "PHI-4-Hindi", - "id": "1024m/PHI-4-Hindi", - "developer": "1024m", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0082 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.671 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2334 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3977 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4914 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5239 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/1024m/QWEN-14B-B100/74ac8aba-6dfb-464c-81b5-d02a9192b9cc.json b/data/hfopenllm_v2/1024m/QWEN-14B-B100/74ac8aba-6dfb-464c-81b5-d02a9192b9cc.json deleted file mode 100644 index 21260063a..000000000 --- a/data/hfopenllm_v2/1024m/QWEN-14B-B100/74ac8aba-6dfb-464c-81b5-d02a9192b9cc.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/1024m_QWEN-14B-B100/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "QWEN-14B-B100", - "id": "1024m/QWEN-14B-B100", - "developer": "1024m", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7762 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6533 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5438 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3507 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.41 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5179 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/152334H/miqu-1-70b-sf/295938e1-ade2-4d36-beca-3cbe506b5b90.json b/data/hfopenllm_v2/152334H/miqu-1-70b-sf/295938e1-ade2-4d36-beca-3cbe506b5b90.json deleted file mode 100644 index 133dae7a4..000000000 --- a/data/hfopenllm_v2/152334H/miqu-1-70b-sf/295938e1-ade2-4d36-beca-3cbe506b5b90.json 
+++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/152334H_miqu-1-70b-sf/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "miqu-1-70b-sf", - "id": "152334H/miqu-1-70b-sf", - "developer": "152334H", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 68.977 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5182 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6102 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1246 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3507 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4582 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4228 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/1TuanPham/T-VisStar-7B-v0.1/f331782f-ea09-41bd-8c6a-e964c88d7e09.json b/data/hfopenllm_v2/1TuanPham/T-VisStar-7B-v0.1/f331782f-ea09-41bd-8c6a-e964c88d7e09.json deleted file mode 100644 index 95ed6976e..000000000 --- a/data/hfopenllm_v2/1TuanPham/T-VisStar-7B-v0.1/f331782f-ea09-41bd-8c6a-e964c88d7e09.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/1TuanPham_T-VisStar-7B-v0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": 
"documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "T-VisStar-7B-v0.1", - "id": "1TuanPham/T-VisStar-7B-v0.1", - "developer": "1TuanPham", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 7.294 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3607 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5052 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0574 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2852 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4375 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3211 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/1TuanPham/T-VisStar-v0.1/e4e3d79a-1de9-43be-a029-0be4f60e472b.json b/data/hfopenllm_v2/1TuanPham/T-VisStar-v0.1/e4e3d79a-1de9-43be-a029-0be4f60e472b.json deleted file mode 100644 index 32ac26ea2..000000000 --- a/data/hfopenllm_v2/1TuanPham/T-VisStar-v0.1/e4e3d79a-1de9-43be-a029-0be4f60e472b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/1TuanPham_T-VisStar-v0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "T-VisStar-v0.1", - "id": "1TuanPham/T-VisStar-v0.1", - "developer": "1TuanPham", - "inference_platform": "unknown", - 
"additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 7.294 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3607 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5052 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0574 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2852 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4375 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3211 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/3rd-Degree-Burn/L-3.1-Science-Writer-8B/6914ac28-b543-4f36-81f1-f7491c018e3b.json b/data/hfopenllm_v2/3rd-Degree-Burn/L-3.1-Science-Writer-8B/6914ac28-b543-4f36-81f1-f7491c018e3b.json deleted file mode 100644 index c548f8a55..000000000 --- a/data/hfopenllm_v2/3rd-Degree-Burn/L-3.1-Science-Writer-8B/6914ac28-b543-4f36-81f1-f7491c018e3b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/3rd-Degree-Burn_L-3.1-Science-Writer-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "L-3.1-Science-Writer-8B", - "id": "3rd-Degree-Burn/L-3.1-Science-Writer-8B", - "developer": "3rd-Degree-Burn", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - 
"source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4263 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5041 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1035 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2743 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3959 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3649 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/3rd-Degree-Burn/Llama-3.1-8B-Squareroot-v1/b7378f41-46ab-41af-94cc-e7fb10738658.json b/data/hfopenllm_v2/3rd-Degree-Burn/Llama-3.1-8B-Squareroot-v1/b7378f41-46ab-41af-94cc-e7fb10738658.json deleted file mode 100644 index 44e2bc2aa..000000000 --- a/data/hfopenllm_v2/3rd-Degree-Burn/Llama-3.1-8B-Squareroot-v1/b7378f41-46ab-41af-94cc-e7fb10738658.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/3rd-Degree-Burn_Llama-3.1-8B-Squareroot-v1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.1-8B-Squareroot-v1", - "id": "3rd-Degree-Burn/Llama-3.1-8B-Squareroot-v1", - "developer": "3rd-Degree-Burn", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on 
IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2892 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3343 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0884 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2559 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3341 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1127 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/3rd-Degree-Burn/Llama-3.1-8B-Squareroot/acedae59-6192-4ac4-a354-d520ecd6ba36.json b/data/hfopenllm_v2/3rd-Degree-Burn/Llama-3.1-8B-Squareroot/acedae59-6192-4ac4-a354-d520ecd6ba36.json deleted file mode 100644 index 56cd9d2ea..000000000 --- a/data/hfopenllm_v2/3rd-Degree-Burn/Llama-3.1-8B-Squareroot/acedae59-6192-4ac4-a354-d520ecd6ba36.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/3rd-Degree-Burn_Llama-3.1-8B-Squareroot/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.1-8B-Squareroot", - "id": "3rd-Degree-Burn/Llama-3.1-8B-Squareroot", - "developer": "3rd-Degree-Burn", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2213 - } - }, - { - "evaluation_name": 
"BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3461 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2659 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2567 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3089 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.175 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/3rd-Degree-Burn/Llama-Squared-8B/ff105961-761d-4261-8a44-20acf2e7f440.json b/data/hfopenllm_v2/3rd-Degree-Burn/Llama-Squared-8B/ff105961-761d-4261-8a44-20acf2e7f440.json deleted file mode 100644 index 4375cd5c9..000000000 --- a/data/hfopenllm_v2/3rd-Degree-Burn/Llama-Squared-8B/ff105961-761d-4261-8a44-20acf2e7f440.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/3rd-Degree-Burn_Llama-Squared-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-Squared-8B", - "id": "3rd-Degree-Burn/Llama-Squared-8B", - "developer": "3rd-Degree-Burn", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2755 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4431 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0574 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2718 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3089 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2366 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/4season/final_model_test_v2/fa0901f6-514e-44ae-84dc-0b793f26169e.json b/data/hfopenllm_v2/4season/final_model_test_v2/fa0901f6-514e-44ae-84dc-0b793f26169e.json deleted file mode 100644 index 2c3cdad71..000000000 --- a/data/hfopenllm_v2/4season/final_model_test_v2/fa0901f6-514e-44ae-84dc-0b793f26169e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/4season_final_model_test_v2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "final_model_test_v2", - "id": "4season/final_model_test_v2", - "developer": "4season", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 21.421 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3191 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6342 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": 
"DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0838 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3272 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4314 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3528 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/AALF/FuseChat-Llama-3.1-8B-Instruct-preview/d2dff5df-343b-40f3-85de-14eb72dab050.json b/data/hfopenllm_v2/AALF/FuseChat-Llama-3.1-8B-Instruct-preview/d2dff5df-343b-40f3-85de-14eb72dab050.json deleted file mode 100644 index 6aa520b9d..000000000 --- a/data/hfopenllm_v2/AALF/FuseChat-Llama-3.1-8B-Instruct-preview/d2dff5df-343b-40f3-85de-14eb72dab050.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/AALF_FuseChat-Llama-3.1-8B-Instruct-preview/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "FuseChat-Llama-3.1-8B-Instruct-preview", - "id": "AALF/FuseChat-Llama-3.1-8B-Instruct-preview", - "developer": "AALF", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.719 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.512 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2477 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3054 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.382 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3733 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/AALF/FuseChat-Llama-3.1-8B-SFT-preview/8fa3010f-b7a1-4fc1-9156-ba70453add86.json b/data/hfopenllm_v2/AALF/FuseChat-Llama-3.1-8B-SFT-preview/8fa3010f-b7a1-4fc1-9156-ba70453add86.json deleted file mode 100644 index 5b5e4c7c1..000000000 --- a/data/hfopenllm_v2/AALF/FuseChat-Llama-3.1-8B-SFT-preview/8fa3010f-b7a1-4fc1-9156-ba70453add86.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/AALF_FuseChat-Llama-3.1-8B-SFT-preview/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "FuseChat-Llama-3.1-8B-SFT-preview", - "id": "AALF/FuseChat-Llama-3.1-8B-SFT-preview", - "developer": "AALF", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7281 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.524 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2251 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - 
"source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3045 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.402 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3743 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/AALF/gemma-2-27b-it-SimPO-37K-100steps/58034f99-3b01-46d6-aea9-90c75d073bb0.json b/data/hfopenllm_v2/AALF/gemma-2-27b-it-SimPO-37K-100steps/58034f99-3b01-46d6-aea9-90c75d073bb0.json deleted file mode 100644 index 410931148..000000000 --- a/data/hfopenllm_v2/AALF/gemma-2-27b-it-SimPO-37K-100steps/58034f99-3b01-46d6-aea9-90c75d073bb0.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/AALF_gemma-2-27b-it-SimPO-37K-100steps/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "gemma-2-27b-it-SimPO-37K-100steps", - "id": "AALF/gemma-2-27b-it-SimPO-37K-100steps", - "developer": "AALF", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 27.227 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2568 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3931 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0211 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", 
- "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2886 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3329 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2125 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/AALF/gemma-2-27b-it-SimPO-37K/e6c08c9c-6d01-45c7-8a24-219b756b8632.json b/data/hfopenllm_v2/AALF/gemma-2-27b-it-SimPO-37K/e6c08c9c-6d01-45c7-8a24-219b756b8632.json deleted file mode 100644 index 3cc0a4181..000000000 --- a/data/hfopenllm_v2/AALF/gemma-2-27b-it-SimPO-37K/e6c08c9c-6d01-45c7-8a24-219b756b8632.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/AALF_gemma-2-27b-it-SimPO-37K/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "gemma-2-27b-it-SimPO-37K", - "id": "AALF/gemma-2-27b-it-SimPO-37K", - "developer": "AALF", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 27.227 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2407 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3911 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0128 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2802 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3488 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1971 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/AELLM/gemma-2-aeria-infinity-9b/cd97ad01-1d20-4cbd-a9bb-2acf3d9fdcc7.json b/data/hfopenllm_v2/AELLM/gemma-2-aeria-infinity-9b/cd97ad01-1d20-4cbd-a9bb-2acf3d9fdcc7.json deleted file mode 100644 index 4e8d23811..000000000 --- a/data/hfopenllm_v2/AELLM/gemma-2-aeria-infinity-9b/cd97ad01-1d20-4cbd-a9bb-2acf3d9fdcc7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/AELLM_gemma-2-aeria-infinity-9b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "gemma-2-aeria-infinity-9b", - "id": "AELLM/gemma-2-aeria-infinity-9b", - "developer": "AELLM", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 9.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7594 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5983 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2145 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3339 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.402 - } - }, - { - 
"evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3862 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/AELLM/gemma-2-lyco-infinity-9b/95f44ef8-e5ba-4bdc-97a7-2c5a678b07be.json b/data/hfopenllm_v2/AELLM/gemma-2-lyco-infinity-9b/95f44ef8-e5ba-4bdc-97a7-2c5a678b07be.json deleted file mode 100644 index 8c6280d2f..000000000 --- a/data/hfopenllm_v2/AELLM/gemma-2-lyco-infinity-9b/95f44ef8-e5ba-4bdc-97a7-2c5a678b07be.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/AELLM_gemma-2-lyco-infinity-9b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "gemma-2-lyco-infinity-9b", - "id": "AELLM/gemma-2-lyco-infinity-9b", - "developer": "AELLM", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7316 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.584 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1707 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.328 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4006 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3787 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/AGI-0/Art-v0-3B/082f25f0-994c-438a-8086-b1e439aca466.json b/data/hfopenllm_v2/AGI-0/Art-v0-3B/082f25f0-994c-438a-8086-b1e439aca466.json deleted file mode 100644 index 521160bae..000000000 --- a/data/hfopenllm_v2/AGI-0/Art-v0-3B/082f25f0-994c-438a-8086-b1e439aca466.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/AGI-0_Art-v0-3B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Art-v0-3B", - "id": "AGI-0/Art-v0-3B", - "developer": "AGI-0", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.086 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3192 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3401 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2462 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2592 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3768 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1179 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/AGI-0/Artificium-llama3.1-8B-001/31423cbd-08cd-4079-b1c5-ba412acf1b51.json 
b/data/hfopenllm_v2/AGI-0/Artificium-llama3.1-8B-001/31423cbd-08cd-4079-b1c5-ba412acf1b51.json deleted file mode 100644 index 39747f5fd..000000000 --- a/data/hfopenllm_v2/AGI-0/Artificium-llama3.1-8B-001/31423cbd-08cd-4079-b1c5-ba412acf1b51.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/AGI-0_Artificium-llama3.1-8B-001/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Artificium-llama3.1-8B-001", - "id": "AGI-0/Artificium-llama3.1-8B-001", - "developer": "AGI-0", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5248 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4256 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.136 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2659 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3795 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3182 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/AGI-0/smartllama3.1-8B-001/2669bd86-da65-4d87-8464-bfa8c741ce0b.json b/data/hfopenllm_v2/AGI-0/smartllama3.1-8B-001/2669bd86-da65-4d87-8464-bfa8c741ce0b.json deleted file mode 100644 index 2a80fe730..000000000 --- a/data/hfopenllm_v2/AGI-0/smartllama3.1-8B-001/2669bd86-da65-4d87-8464-bfa8c741ce0b.json +++ 
/dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/AGI-0_smartllama3.1-8B-001/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "smartllama3.1-8B-001", - "id": "AGI-0/smartllama3.1-8B-001", - "developer": "AGI-0", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3518 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.467 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1299 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3062 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4386 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3487 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/AI-MO/NuminaMath-7B-CoT/ab2c19ff-5671-446f-b09e-731e2ae515ca.json b/data/hfopenllm_v2/AI-MO/NuminaMath-7B-CoT/ab2c19ff-5671-446f-b09e-731e2ae515ca.json deleted file mode 100644 index 885230cbc..000000000 --- a/data/hfopenllm_v2/AI-MO/NuminaMath-7B-CoT/ab2c19ff-5671-446f-b09e-731e2ae515ca.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/AI-MO_NuminaMath-7B-CoT/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - 
"source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "NuminaMath-7B-CoT", - "id": "AI-MO/NuminaMath-7B-CoT", - "developer": "AI-MO", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 6.91 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2689 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4314 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2696 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2659 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3303 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2868 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/AI-MO/NuminaMath-7B-TIR/36250dc3-cb51-43be-8ab0-6788eb5bda7c.json b/data/hfopenllm_v2/AI-MO/NuminaMath-7B-TIR/36250dc3-cb51-43be-8ab0-6788eb5bda7c.json deleted file mode 100644 index d1e8152de..000000000 --- a/data/hfopenllm_v2/AI-MO/NuminaMath-7B-TIR/36250dc3-cb51-43be-8ab0-6788eb5bda7c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/AI-MO_NuminaMath-7B-TIR/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "NuminaMath-7B-TIR", - "id": "AI-MO/NuminaMath-7B-TIR", - "developer": "AI-MO", - "inference_platform": "unknown", - "additional_details": { - "precision": 
"bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 6.91 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2756 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4144 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1609 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2584 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3509 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2733 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/AI-Sweden-Models/Llama-3-8B-instruct/cd616d6a-151f-4aaa-93b5-9c4a758f95b5.json b/data/hfopenllm_v2/AI-Sweden-Models/Llama-3-8B-instruct/cd616d6a-151f-4aaa-93b5-9c4a758f95b5.json deleted file mode 100644 index 4c342ae03..000000000 --- a/data/hfopenllm_v2/AI-Sweden-Models/Llama-3-8B-instruct/cd616d6a-151f-4aaa-93b5-9c4a758f95b5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/AI-Sweden-Models_Llama-3-8B-instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-8B-instruct", - "id": "AI-Sweden-Models/Llama-3-8B-instruct", - "developer": "AI-Sweden-Models", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - 
"source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2401 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4173 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0385 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2659 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4771 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2597 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/AI-Sweden-Models/gpt-sw3-40b/9cb09cae-9b1b-43b1-afbf-f44b0a44053c.json b/data/hfopenllm_v2/AI-Sweden-Models/gpt-sw3-40b/9cb09cae-9b1b-43b1-afbf-f44b0a44053c.json deleted file mode 100644 index d5848d50e..000000000 --- a/data/hfopenllm_v2/AI-Sweden-Models/gpt-sw3-40b/9cb09cae-9b1b-43b1-afbf-f44b0a44053c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/AI-Sweden-Models_gpt-sw3-40b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "gpt-sw3-40b", - "id": "AI-Sweden-Models/gpt-sw3-40b", - "developer": "AI-Sweden-Models", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "GPT2LMHeadModel", - "params_billions": 39.927 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": 
{ - "score": 0.147 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3268 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0174 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2349 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3632 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1276 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/AI4free/Dhanishtha/038c32da-add5-4299-ac17-df6ef3fdea58.json b/data/hfopenllm_v2/AI4free/Dhanishtha/038c32da-add5-4299-ac17-df6ef3fdea58.json deleted file mode 100644 index ead44ad6b..000000000 --- a/data/hfopenllm_v2/AI4free/Dhanishtha/038c32da-add5-4299-ac17-df6ef3fdea58.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/AI4free_Dhanishtha/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Dhanishtha", - "id": "AI4free/Dhanishtha", - "developer": "AI4free", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.777 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2451 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 
0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3404 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.256 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2525 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3569 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1643 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/AI4free/t2/25eb4bdf-beb4-4ad2-a5e9-3a2f31c46cb5.json b/data/hfopenllm_v2/AI4free/t2/25eb4bdf-beb4-4ad2-a5e9-3a2f31c46cb5.json deleted file mode 100644 index f06b40189..000000000 --- a/data/hfopenllm_v2/AI4free/t2/25eb4bdf-beb4-4ad2-a5e9-3a2f31c46cb5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/AI4free_t2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "t2", - "id": "AI4free/t2", - "developer": "AI4free", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.613 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3867 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.291 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": 
false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1896 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2576 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3846 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1144 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/AIDC-AI/Marco-o1/77655d60-872f-468a-acc6-d584ef5bf46a.json b/data/hfopenllm_v2/AIDC-AI/Marco-o1/77655d60-872f-468a-acc6-d584ef5bf46a.json deleted file mode 100644 index a71a6c124..000000000 --- a/data/hfopenllm_v2/AIDC-AI/Marco-o1/77655d60-872f-468a-acc6-d584ef5bf46a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/AIDC-AI_Marco-o1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Marco-o1", - "id": "AIDC-AI/Marco-o1", - "developer": "AIDC-AI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4771 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5364 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3746 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": 
"Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2592 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4138 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4117 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Aashraf995/Creative-7B-nerd/4de378c8-ccf6-4f0b-8287-3d138a8645b9.json b/data/hfopenllm_v2/Aashraf995/Creative-7B-nerd/4de378c8-ccf6-4f0b-8287-3d138a8645b9.json deleted file mode 100644 index 90198bb55..000000000 --- a/data/hfopenllm_v2/Aashraf995/Creative-7B-nerd/4de378c8-ccf6-4f0b-8287-3d138a8645b9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Aashraf995_Creative-7B-nerd/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Creative-7B-nerd", - "id": "Aashraf995/Creative-7B-nerd", - "developer": "Aashraf995", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4722 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5607 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3165 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3263 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - 
"source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4515 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4492 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Aashraf995/Gemma-Evo-10B/8039cadf-6644-44e7-8452-90e9c8069e28.json b/data/hfopenllm_v2/Aashraf995/Gemma-Evo-10B/8039cadf-6644-44e7-8452-90e9c8069e28.json deleted file mode 100644 index 915792a80..000000000 --- a/data/hfopenllm_v2/Aashraf995/Gemma-Evo-10B/8039cadf-6644-44e7-8452-90e9c8069e28.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Aashraf995_Gemma-Evo-10B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gemma-Evo-10B", - "id": "Aashraf995/Gemma-Evo-10B", - "developer": "Aashraf995", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7332 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6044 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2228 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.354 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4595 - } 
- }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4275 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Aashraf995/Qwen-Evo-7B/8914d89d-c873-4704-998e-dc807e96030b.json b/data/hfopenllm_v2/Aashraf995/Qwen-Evo-7B/8914d89d-c873-4704-998e-dc807e96030b.json deleted file mode 100644 index b958767ce..000000000 --- a/data/hfopenllm_v2/Aashraf995/Qwen-Evo-7B/8914d89d-c873-4704-998e-dc807e96030b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Aashraf995_Qwen-Evo-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen-Evo-7B", - "id": "Aashraf995/Qwen-Evo-7B", - "developer": "Aashraf995", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4757 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5709 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3142 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3255 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4541 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4462 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Aashraf995/QwenStock-14B/c2e9fc29-db07-4b49-a98a-084158831ac4.json b/data/hfopenllm_v2/Aashraf995/QwenStock-14B/c2e9fc29-db07-4b49-a98a-084158831ac4.json deleted file mode 100644 index 2181091a7..000000000 --- a/data/hfopenllm_v2/Aashraf995/QwenStock-14B/c2e9fc29-db07-4b49-a98a-084158831ac4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Aashraf995_QwenStock-14B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "QwenStock-14B", - "id": "Aashraf995/QwenStock-14B", - "developer": "Aashraf995", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5009 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.655 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3573 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3893 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4793 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5382 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/AbacusResearch/Jallabi-34B/58724539-6fc5-40d9-ba43-87410959894d.json 
b/data/hfopenllm_v2/AbacusResearch/Jallabi-34B/58724539-6fc5-40d9-ba43-87410959894d.json deleted file mode 100644 index 51b7d1999..000000000 --- a/data/hfopenllm_v2/AbacusResearch/Jallabi-34B/58724539-6fc5-40d9-ba43-87410959894d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/AbacusResearch_Jallabi-34B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Jallabi-34B", - "id": "AbacusResearch/Jallabi-34B", - "developer": "AbacusResearch", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 34.389 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3529 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6023 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0521 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3389 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4822 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4682 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Ahdoot/StructuredThinker-v0.3-MoreStructure/b13324cf-f6f5-4bf1-9cf3-c196120c4bcf.json b/data/hfopenllm_v2/Ahdoot/StructuredThinker-v0.3-MoreStructure/b13324cf-f6f5-4bf1-9cf3-c196120c4bcf.json deleted file mode 100644 index a962961f2..000000000 --- 
a/data/hfopenllm_v2/Ahdoot/StructuredThinker-v0.3-MoreStructure/b13324cf-f6f5-4bf1-9cf3-c196120c4bcf.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Ahdoot_StructuredThinker-v0.3-MoreStructure/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "StructuredThinker-v0.3-MoreStructure", - "id": "Ahdoot/StructuredThinker-v0.3-MoreStructure", - "developer": "Ahdoot", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.397 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4193 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4838 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2908 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.297 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4158 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.361 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Ahdoot/Test_StealthThinker/782b2df0-d1b3-414c-a4bd-59052a4441a9.json b/data/hfopenllm_v2/Ahdoot/Test_StealthThinker/782b2df0-d1b3-414c-a4bd-59052a4441a9.json deleted file mode 100644 index 717c7bcb8..000000000 --- a/data/hfopenllm_v2/Ahdoot/Test_StealthThinker/782b2df0-d1b3-414c-a4bd-59052a4441a9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/Ahdoot_Test_StealthThinker/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Test_StealthThinker", - "id": "Ahdoot/Test_StealthThinker", - "developer": "Ahdoot", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.086 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.422 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4647 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.179 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2961 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.428 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3597 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/AicoresSecurity/Cybernet-Sec-3B-R1-V0-Coder/b508e41e-0f1c-49ce-8b80-5e7ec82b8f15.json b/data/hfopenllm_v2/AicoresSecurity/Cybernet-Sec-3B-R1-V0-Coder/b508e41e-0f1c-49ce-8b80-5e7ec82b8f15.json deleted file mode 100644 index 8c4cb6a4c..000000000 --- a/data/hfopenllm_v2/AicoresSecurity/Cybernet-Sec-3B-R1-V0-Coder/b508e41e-0f1c-49ce-8b80-5e7ec82b8f15.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/AicoresSecurity_Cybernet-Sec-3B-R1-V0-Coder/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - 
"source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Cybernet-Sec-3B-R1-V0-Coder", - "id": "AicoresSecurity/Cybernet-Sec-3B-R1-V0-Coder", - "developer": "AicoresSecurity", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7098 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4478 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1488 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2718 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3408 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3178 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/AicoresSecurity/Cybernet-Sec-3B-R1-V0/2824e8d4-2749-4b18-a3a1-b987ed215ac6.json b/data/hfopenllm_v2/AicoresSecurity/Cybernet-Sec-3B-R1-V0/2824e8d4-2749-4b18-a3a1-b987ed215ac6.json deleted file mode 100644 index 5acadd460..000000000 --- a/data/hfopenllm_v2/AicoresSecurity/Cybernet-Sec-3B-R1-V0/2824e8d4-2749-4b18-a3a1-b987ed215ac6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/AicoresSecurity_Cybernet-Sec-3B-R1-V0/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Cybernet-Sec-3B-R1-V0", - "id": 
"AicoresSecurity/Cybernet-Sec-3B-R1-V0", - "developer": "AicoresSecurity", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6358 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4497 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1156 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2634 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3314 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.301 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/AicoresSecurity/Cybernet-Sec-3B-R1-V1.1/53176984-ba93-4a64-b81e-21f6e0f65bcd.json b/data/hfopenllm_v2/AicoresSecurity/Cybernet-Sec-3B-R1-V1.1/53176984-ba93-4a64-b81e-21f6e0f65bcd.json deleted file mode 100644 index 6634b986e..000000000 --- a/data/hfopenllm_v2/AicoresSecurity/Cybernet-Sec-3B-R1-V1.1/53176984-ba93-4a64-b81e-21f6e0f65bcd.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/AicoresSecurity_Cybernet-Sec-3B-R1-V1.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Cybernet-Sec-3B-R1-V1.1", - "id": "AicoresSecurity/Cybernet-Sec-3B-R1-V1.1", - "developer": "AicoresSecurity", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": 
"LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.673 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4392 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.176 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.271 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3541 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3088 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/AicoresSecurity/Cybernet-Sec-3B-R1-V1/53252698-7d17-4f2a-9106-3b744ae7a985.json b/data/hfopenllm_v2/AicoresSecurity/Cybernet-Sec-3B-R1-V1/53252698-7d17-4f2a-9106-3b744ae7a985.json deleted file mode 100644 index 0a69dcaf4..000000000 --- a/data/hfopenllm_v2/AicoresSecurity/Cybernet-Sec-3B-R1-V1/53252698-7d17-4f2a-9106-3b744ae7a985.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/AicoresSecurity_Cybernet-Sec-3B-R1-V1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Cybernet-Sec-3B-R1-V1", - "id": "AicoresSecurity/Cybernet-Sec-3B-R1-V1", - "developer": "AicoresSecurity", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - 
"hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6146 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4282 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1518 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2609 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3287 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2876 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Alepach/notHumpback-M0/6dd0f3a2-27ee-48f1-9d97-ef6954d298c8.json b/data/hfopenllm_v2/Alepach/notHumpback-M0/6dd0f3a2-27ee-48f1-9d97-ef6954d298c8.json deleted file mode 100644 index c2db43af7..000000000 --- a/data/hfopenllm_v2/Alepach/notHumpback-M0/6dd0f3a2-27ee-48f1-9d97-ef6954d298c8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Alepach_notHumpback-M0/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "notHumpback-M0", - "id": "Alepach/notHumpback-M0", - "developer": "Alepach", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.235 - } - }, - { - "evaluation_name": "BBH", - 
"source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2785 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0189 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2492 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3552 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1119 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Alepach/notHumpback-M1-v2/35f11d5e-88c4-4a95-8d06-a40bee648b00.json b/data/hfopenllm_v2/Alepach/notHumpback-M1-v2/35f11d5e-88c4-4a95-8d06-a40bee648b00.json deleted file mode 100644 index 63b777f1c..000000000 --- a/data/hfopenllm_v2/Alepach/notHumpback-M1-v2/35f11d5e-88c4-4a95-8d06-a40bee648b00.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Alepach_notHumpback-M1-v2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "notHumpback-M1-v2", - "id": "Alepach/notHumpback-M1-v2", - "developer": "Alepach", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2277 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 
1.0 - }, - "score_details": { - "score": 0.2776 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0219 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2601 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3473 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1119 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Alepach/notHumpback-M1/ba1193c0-42b8-487d-b9fd-ddbc1fd15359.json b/data/hfopenllm_v2/Alepach/notHumpback-M1/ba1193c0-42b8-487d-b9fd-ddbc1fd15359.json deleted file mode 100644 index 80d910c4c..000000000 --- a/data/hfopenllm_v2/Alepach/notHumpback-M1/ba1193c0-42b8-487d-b9fd-ddbc1fd15359.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Alepach_notHumpback-M1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "notHumpback-M1", - "id": "Alepach/notHumpback-M1", - "developer": "Alepach", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2207 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2882 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": 
"Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0159 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2374 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.342 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1091 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Alibaba-NLP/gte-Qwen2-7B-instruct/95733620-e1e7-4442-b9c3-a699165df5e7.json b/data/hfopenllm_v2/Alibaba-NLP/gte-Qwen2-7B-instruct/95733620-e1e7-4442-b9c3-a699165df5e7.json deleted file mode 100644 index 2205af9c9..000000000 --- a/data/hfopenllm_v2/Alibaba-NLP/gte-Qwen2-7B-instruct/95733620-e1e7-4442-b9c3-a699165df5e7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Alibaba-NLP_gte-Qwen2-7B-instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "gte-Qwen2-7B-instruct", - "id": "Alibaba-NLP/gte-Qwen2-7B-instruct", - "developer": "Alibaba-NLP", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.613 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2255 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4495 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0642 - } - }, - { - "evaluation_name": "GPQA", - 
"source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.245 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3559 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3321 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Alsebay/Qwen2.5-7B-test-novelist/cacfce0d-f5f1-4101-8065-f5f02eaab1fb.json b/data/hfopenllm_v2/Alsebay/Qwen2.5-7B-test-novelist/cacfce0d-f5f1-4101-8065-f5f02eaab1fb.json deleted file mode 100644 index ea8074956..000000000 --- a/data/hfopenllm_v2/Alsebay/Qwen2.5-7B-test-novelist/cacfce0d-f5f1-4101-8065-f5f02eaab1fb.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Alsebay_Qwen2.5-7B-test-novelist/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-7B-test-novelist", - "id": "Alsebay/Qwen2.5-7B-test-novelist", - "developer": "Alsebay", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5352 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5151 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2349 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2911 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4749 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3866 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Amaorynho/BBAI2006/72be5537-198a-43e9-9840-a803083158d3.json b/data/hfopenllm_v2/Amaorynho/BBAI2006/72be5537-198a-43e9-9840-a803083158d3.json deleted file mode 100644 index 0d3a4cdd2..000000000 --- a/data/hfopenllm_v2/Amaorynho/BBAI2006/72be5537-198a-43e9-9840-a803083158d3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Amaorynho_BBAI2006/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "BBAI2006", - "id": "Amaorynho/BBAI2006", - "developer": "Amaorynho", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.09 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1467 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2704 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2525 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", 
- "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3605 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1123 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Amaorynho/BBAI270V4/2e9a3443-970d-4f37-a356-277a11c81754.json b/data/hfopenllm_v2/Amaorynho/BBAI270V4/2e9a3443-970d-4f37-a356-277a11c81754.json deleted file mode 100644 index 2399e7c15..000000000 --- a/data/hfopenllm_v2/Amaorynho/BBAI270V4/2e9a3443-970d-4f37-a356-277a11c81754.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Amaorynho_BBAI270V4/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "BBAI270V4", - "id": "Amaorynho/BBAI270V4", - "developer": "Amaorynho", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.199 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3071 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0083 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2458 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3314 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1114 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Amaorynho/BBAIIFEV1/1188402f-aa1c-4306-b031-c92ff0a5dd64.json b/data/hfopenllm_v2/Amaorynho/BBAIIFEV1/1188402f-aa1c-4306-b031-c92ff0a5dd64.json deleted file mode 100644 index 1e9f99354..000000000 --- a/data/hfopenllm_v2/Amaorynho/BBAIIFEV1/1188402f-aa1c-4306-b031-c92ff0a5dd64.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Amaorynho_BBAIIFEV1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "BBAIIFEV1", - "id": "Amaorynho/BBAIIFEV1", - "developer": "Amaorynho", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8047 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5292 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1934 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3104 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4185 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3857 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/Amaorynho/BBAI_375/ee2f567a-6403-46d5-9a6b-bd029f81d660.json b/data/hfopenllm_v2/Amaorynho/BBAI_375/ee2f567a-6403-46d5-9a6b-bd029f81d660.json deleted file mode 100644 index e92063a97..000000000 --- a/data/hfopenllm_v2/Amaorynho/BBAI_375/ee2f567a-6403-46d5-9a6b-bd029f81d660.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Amaorynho_BBAI_375/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "BBAI_375", - "id": "Amaorynho/BBAI_375", - "developer": "Amaorynho", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.09 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1467 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2704 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2525 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3605 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1123 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Amu/t1-1.5B/d809fdff-f5ff-44f5-afc7-7e8af9ce2f93.json b/data/hfopenllm_v2/Amu/t1-1.5B/d809fdff-f5ff-44f5-afc7-7e8af9ce2f93.json deleted file mode 100644 index c7e1777b9..000000000 --- a/data/hfopenllm_v2/Amu/t1-1.5B/d809fdff-f5ff-44f5-afc7-7e8af9ce2f93.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - 
"schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Amu_t1-1.5B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "t1-1.5B", - "id": "Amu/t1-1.5B", - "developer": "Amu", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.777 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3394 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4008 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0514 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2433 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3517 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2566 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Amu/t1-3B/87d66efc-173f-4c14-b76c-d8b7e00d575d.json b/data/hfopenllm_v2/Amu/t1-3B/87d66efc-173f-4c14-b76c-d8b7e00d575d.json deleted file mode 100644 index 8c8a648ec..000000000 --- a/data/hfopenllm_v2/Amu/t1-3B/87d66efc-173f-4c14-b76c-d8b7e00d575d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Amu_t1-3B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "t1-3B", - 
"id": "Amu/t1-3B", - "developer": "Amu", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.397 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3328 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3999 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1375 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2408 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3435 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1284 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ArliAI/ArliAI-RPMax-12B-v1.1/47f62378-c3cc-408f-a0d1-71eb3f522f57.json b/data/hfopenllm_v2/ArliAI/ArliAI-RPMax-12B-v1.1/47f62378-c3cc-408f-a0d1-71eb3f522f57.json deleted file mode 100644 index 6c27ae339..000000000 --- a/data/hfopenllm_v2/ArliAI/ArliAI-RPMax-12B-v1.1/47f62378-c3cc-408f-a0d1-71eb3f522f57.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ArliAI_ArliAI-RPMax-12B-v1.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ArliAI-RPMax-12B-v1.1", - "id": "ArliAI/ArliAI-RPMax-12B-v1.1", - "developer": "ArliAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - 
"evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5349 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4752 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1125 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2819 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3618 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3384 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ArliAI/Llama-3.1-8B-ArliAI-RPMax-v1.1/dba8c12c-388d-4f8b-8ce8-83acfc4920c7.json b/data/hfopenllm_v2/ArliAI/Llama-3.1-8B-ArliAI-RPMax-v1.1/dba8c12c-388d-4f8b-8ce8-83acfc4920c7.json deleted file mode 100644 index c8b199daf..000000000 --- a/data/hfopenllm_v2/ArliAI/Llama-3.1-8B-ArliAI-RPMax-v1.1/dba8c12c-388d-4f8b-8ce8-83acfc4920c7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ArliAI_Llama-3.1-8B-ArliAI-RPMax-v1.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.1-8B-ArliAI-RPMax-v1.1", - "id": "ArliAI/Llama-3.1-8B-ArliAI-RPMax-v1.1", - "developer": "ArliAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy 
on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6359 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5016 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1314 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2836 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3577 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3551 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Arthur-LAGACHERIE/Precis-1B-Instruct/e4087285-1d1a-465e-ac88-91310e939710.json b/data/hfopenllm_v2/Arthur-LAGACHERIE/Precis-1B-Instruct/e4087285-1d1a-465e-ac88-91310e939710.json deleted file mode 100644 index d6fc53fc3..000000000 --- a/data/hfopenllm_v2/Arthur-LAGACHERIE/Precis-1B-Instruct/e4087285-1d1a-465e-ac88-91310e939710.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Arthur-LAGACHERIE_Precis-1B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Precis-1B-Instruct", - "id": "Arthur-LAGACHERIE/Precis-1B-Instruct", - "developer": "Arthur-LAGACHERIE", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.236 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3671 - } - }, - { - "evaluation_name": "BBH", - 
"source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3224 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0038 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2659 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3436 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1426 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Artples/L-MChat-7b/09f189d9-74fd-47bb-b5fb-7994cba56ae2.json b/data/hfopenllm_v2/Artples/L-MChat-7b/09f189d9-74fd-47bb-b5fb-7994cba56ae2.json deleted file mode 100644 index 68e074cae..000000000 --- a/data/hfopenllm_v2/Artples/L-MChat-7b/09f189d9-74fd-47bb-b5fb-7994cba56ae2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Artples_L-MChat-7b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "L-MChat-7b", - "id": "Artples/L-MChat-7b", - "developer": "Artples", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5297 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 
0.46 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0921 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3054 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4029 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3299 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Artples/L-MChat-Small/5754c262-6ddf-4f54-9722-22ff20a8d76f.json b/data/hfopenllm_v2/Artples/L-MChat-Small/5754c262-6ddf-4f54-9722-22ff20a8d76f.json deleted file mode 100644 index fd60828c3..000000000 --- a/data/hfopenllm_v2/Artples/L-MChat-Small/5754c262-6ddf-4f54-9722-22ff20a8d76f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Artples_L-MChat-Small/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "L-MChat-Small", - "id": "Artples/L-MChat-Small", - "developer": "Artples", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "PhiForCausalLM", - "params_billions": 2.78 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3287 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4823 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": 
false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0378 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2676 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3696 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2464 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Aryanne/QwentileSwap/cc1bd811-ec88-4514-8b47-4140ded4f03d.json b/data/hfopenllm_v2/Aryanne/QwentileSwap/cc1bd811-ec88-4514-8b47-4140ded4f03d.json deleted file mode 100644 index cd69fff0a..000000000 --- a/data/hfopenllm_v2/Aryanne/QwentileSwap/cc1bd811-ec88-4514-8b47-4140ded4f03d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Aryanne_QwentileSwap/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "QwentileSwap", - "id": "Aryanne/QwentileSwap", - "developer": "Aryanne", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 32.764 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7378 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7008 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4222 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3674 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.464 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5946 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Aryanne/SHBA/3f08155d-8551-4472-86fe-7988cd6df78b.json b/data/hfopenllm_v2/Aryanne/SHBA/3f08155d-8551-4472-86fe-7988cd6df78b.json deleted file mode 100644 index 5001e5134..000000000 --- a/data/hfopenllm_v2/Aryanne/SHBA/3f08155d-8551-4472-86fe-7988cd6df78b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Aryanne_SHBA/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SHBA", - "id": "Aryanne/SHBA", - "developer": "Aryanne", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7817 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5233 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1798 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3054 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4161 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3892 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Aryanne/SuperHeart/339e12fb-b4a4-4a4b-bb40-899b4ad833f9.json b/data/hfopenllm_v2/Aryanne/SuperHeart/339e12fb-b4a4-4a4b-bb40-899b4ad833f9.json deleted file mode 100644 index 9df9c1198..000000000 --- a/data/hfopenllm_v2/Aryanne/SuperHeart/339e12fb-b4a4-4a4b-bb40-899b4ad833f9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Aryanne_SuperHeart/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SuperHeart", - "id": "Aryanne/SuperHeart", - "developer": "Aryanne", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5192 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5215 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1563 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3012 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4436 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - 
"source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3912 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/AtAndDev/Qwen2.5-1.5B-continuous-learnt/4fd60e9c-5c90-492a-b24d-7ca6d1e91eae.json b/data/hfopenllm_v2/AtAndDev/Qwen2.5-1.5B-continuous-learnt/4fd60e9c-5c90-492a-b24d-7ca6d1e91eae.json deleted file mode 100644 index c1dcb320d..000000000 --- a/data/hfopenllm_v2/AtAndDev/Qwen2.5-1.5B-continuous-learnt/4fd60e9c-5c90-492a-b24d-7ca6d1e91eae.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/AtAndDev_Qwen2.5-1.5B-continuous-learnt/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-1.5B-continuous-learnt", - "id": "AtAndDev/Qwen2.5-1.5B-continuous-learnt", - "developer": "AtAndDev", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.544 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4605 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4258 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0748 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2659 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3636 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2812 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/AtAndDev/Qwen2.5-1.5B-continuous-learnt/7f8d935e-3782-4769-8bd0-ee8a0ce91cd6.json b/data/hfopenllm_v2/AtAndDev/Qwen2.5-1.5B-continuous-learnt/7f8d935e-3782-4769-8bd0-ee8a0ce91cd6.json deleted file mode 100644 index cd9fc36f7..000000000 --- a/data/hfopenllm_v2/AtAndDev/Qwen2.5-1.5B-continuous-learnt/7f8d935e-3782-4769-8bd0-ee8a0ce91cd6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/AtAndDev_Qwen2.5-1.5B-continuous-learnt/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-1.5B-continuous-learnt", - "id": "AtAndDev/Qwen2.5-1.5B-continuous-learnt", - "developer": "AtAndDev", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.544 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4511 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4275 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1473 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2701 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3623 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2806 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/Ateron/Glowing-Forest-12B/6fa07e60-9f82-4abc-aa45-4dfc0bcf9b8d.json b/data/hfopenllm_v2/Ateron/Glowing-Forest-12B/6fa07e60-9f82-4abc-aa45-4dfc0bcf9b8d.json deleted file mode 100644 index d15a6561f..000000000 --- a/data/hfopenllm_v2/Ateron/Glowing-Forest-12B/6fa07e60-9f82-4abc-aa45-4dfc0bcf9b8d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Ateron_Glowing-Forest-12B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Glowing-Forest-12B", - "id": "Ateron/Glowing-Forest-12B", - "developer": "Ateron", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3592 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5492 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0778 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3331 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4449 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3718 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Ateron/Lotus-Magpic/99a0022b-3fe7-4612-9cbb-cf082c1f6b70.json b/data/hfopenllm_v2/Ateron/Lotus-Magpic/99a0022b-3fe7-4612-9cbb-cf082c1f6b70.json deleted file mode 100644 index 63c101443..000000000 --- 
a/data/hfopenllm_v2/Ateron/Lotus-Magpic/99a0022b-3fe7-4612-9cbb-cf082c1f6b70.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Ateron_Lotus-Magpic/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Lotus-Magpic", - "id": "Ateron/Lotus-Magpic", - "developer": "Ateron", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6286 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5254 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0997 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3029 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4332 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3491 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Ateron/Way_of_MagPicaro/b1153714-d6fe-4ff9-ab8c-85b677d57f8f.json b/data/hfopenllm_v2/Ateron/Way_of_MagPicaro/b1153714-d6fe-4ff9-ab8c-85b677d57f8f.json deleted file mode 100644 index 7a1c07042..000000000 --- a/data/hfopenllm_v2/Ateron/Way_of_MagPicaro/b1153714-d6fe-4ff9-ab8c-85b677d57f8f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Ateron_Way_of_MagPicaro/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - 
"source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Way_of_MagPicaro", - "id": "Ateron/Way_of_MagPicaro", - "developer": "Ateron", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2637 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5427 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0589 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3339 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4649 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3536 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/AuraIndustries/Aura-4B/c3d39b6c-02af-410d-8a5c-224495b04572.json b/data/hfopenllm_v2/AuraIndustries/Aura-4B/c3d39b6c-02af-410d-8a5c-224495b04572.json deleted file mode 100644 index 259ade02b..000000000 --- a/data/hfopenllm_v2/AuraIndustries/Aura-4B/c3d39b6c-02af-410d-8a5c-224495b04572.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/AuraIndustries_Aura-4B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Aura-4B", - "id": "AuraIndustries/Aura-4B", - "developer": "AuraIndustries", - 
"inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 4.513 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3816 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.449 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0423 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2878 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3938 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2706 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/AuraIndustries/Aura-8B/0426fcba-3db4-492d-b622-e34ab8d3fc8f.json b/data/hfopenllm_v2/AuraIndustries/Aura-8B/0426fcba-3db4-492d-b622-e34ab8d3fc8f.json deleted file mode 100644 index 2bec15124..000000000 --- a/data/hfopenllm_v2/AuraIndustries/Aura-8B/0426fcba-3db4-492d-b622-e34ab8d3fc8f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/AuraIndustries_Aura-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Aura-8B", - "id": "AuraIndustries/Aura-8B", - "developer": "AuraIndustries", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": 
"hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7205 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5131 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1518 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2861 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4004 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3874 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/AuraIndustries/Aura-MoE-2x4B-v2/aa099cfe-ac9a-42dd-8357-f4d8115133ca.json b/data/hfopenllm_v2/AuraIndustries/Aura-MoE-2x4B-v2/aa099cfe-ac9a-42dd-8357-f4d8115133ca.json deleted file mode 100644 index 2a673ad65..000000000 --- a/data/hfopenllm_v2/AuraIndustries/Aura-MoE-2x4B-v2/aa099cfe-ac9a-42dd-8357-f4d8115133ca.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/AuraIndustries_Aura-MoE-2x4B-v2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Aura-MoE-2x4B-v2", - "id": "AuraIndustries/Aura-MoE-2x4B-v2", - "developer": "AuraIndustries", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MixtralForCausalLM", - "params_billions": 7.231 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.4778 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4315 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0317 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2878 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4101 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.261 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/AuraIndustries/Aura-MoE-2x4B/ccbc8a5e-9a97-452a-b023-cc996ffe31f1.json b/data/hfopenllm_v2/AuraIndustries/Aura-MoE-2x4B/ccbc8a5e-9a97-452a-b023-cc996ffe31f1.json deleted file mode 100644 index 460a41061..000000000 --- a/data/hfopenllm_v2/AuraIndustries/Aura-MoE-2x4B/ccbc8a5e-9a97-452a-b023-cc996ffe31f1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/AuraIndustries_Aura-MoE-2x4B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Aura-MoE-2x4B", - "id": "AuraIndustries/Aura-MoE-2x4B", - "developer": "AuraIndustries", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MixtralForCausalLM", - "params_billions": 7.231 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4601 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on 
BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4339 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.031 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2718 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4085 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.265 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Aurel9/testmerge-7b/b359a7a3-cf2c-4952-b308-333672dadcec.json b/data/hfopenllm_v2/Aurel9/testmerge-7b/b359a7a3-cf2c-4952-b308-333672dadcec.json deleted file mode 100644 index a8574b05b..000000000 --- a/data/hfopenllm_v2/Aurel9/testmerge-7b/b359a7a3-cf2c-4952-b308-333672dadcec.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Aurel9_testmerge-7b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "testmerge-7b", - "id": "Aurel9/testmerge-7b", - "developer": "Aurel9", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.398 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.519 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": 
"DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0657 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3003 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4659 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3053 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Ayush-Singh/Llama1B-sft-2/0864d5cf-d6fe-42bc-9059-9f2e5ff06b60.json b/data/hfopenllm_v2/Ayush-Singh/Llama1B-sft-2/0864d5cf-d6fe-42bc-9059-9f2e5ff06b60.json deleted file mode 100644 index cf11b9c2a..000000000 --- a/data/hfopenllm_v2/Ayush-Singh/Llama1B-sft-2/0864d5cf-d6fe-42bc-9059-9f2e5ff06b60.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Ayush-Singh_Llama1B-sft-2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama1B-sft-2", - "id": "Ayush-Singh/Llama1B-sft-2", - "developer": "Ayush-Singh", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.236 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1374 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2834 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { 
- "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2458 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3552 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1117 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Azure99/Blossom-V6-14B/e6ef2559-8a63-43e3-a60b-0d2b7256ad3d.json b/data/hfopenllm_v2/Azure99/Blossom-V6-14B/e6ef2559-8a63-43e3-a60b-0d2b7256ad3d.json deleted file mode 100644 index 374b26fce..000000000 --- a/data/hfopenllm_v2/Azure99/Blossom-V6-14B/e6ef2559-8a63-43e3-a60b-0d2b7256ad3d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Azure99_Blossom-V6-14B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Blossom-V6-14B", - "id": "Azure99/Blossom-V6-14B", - "developer": "Azure99", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6395 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5069 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5257 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.2626 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4035 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4544 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Azure99/Blossom-V6-7B/45d019ab-b23c-4fc3-baf5-d57576e9945c.json b/data/hfopenllm_v2/Azure99/Blossom-V6-7B/45d019ab-b23c-4fc3-baf5-d57576e9945c.json deleted file mode 100644 index e807205a9..000000000 --- a/data/hfopenllm_v2/Azure99/Blossom-V6-7B/45d019ab-b23c-4fc3-baf5-d57576e9945c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Azure99_Blossom-V6-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Blossom-V6-7B", - "id": "Azure99/Blossom-V6-7B", - "developer": "Azure99", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5538 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4974 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4585 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3045 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4301 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4144 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Azure99/blossom-v5-32b/e3cd7c32-e5a1-4cd6-a9dc-95364a8abe75.json b/data/hfopenllm_v2/Azure99/blossom-v5-32b/e3cd7c32-e5a1-4cd6-a9dc-95364a8abe75.json deleted file mode 100644 index c1499a18c..000000000 --- a/data/hfopenllm_v2/Azure99/blossom-v5-32b/e3cd7c32-e5a1-4cd6-a9dc-95364a8abe75.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Azure99_blossom-v5-32b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "blossom-v5-32b", - "id": "Azure99/blossom-v5-32b", - "developer": "Azure99", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 32.512 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5235 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5955 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1866 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3112 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.402 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": 
"TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4235 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Azure99/blossom-v5-llama3-8b/9be442e8-4b77-43e0-a981-887338e59b78.json b/data/hfopenllm_v2/Azure99/blossom-v5-llama3-8b/9be442e8-4b77-43e0-a981-887338e59b78.json deleted file mode 100644 index 6b1528202..000000000 --- a/data/hfopenllm_v2/Azure99/blossom-v5-llama3-8b/9be442e8-4b77-43e0-a981-887338e59b78.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Azure99_blossom-v5-llama3-8b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "blossom-v5-llama3-8b", - "id": "Azure99/blossom-v5-llama3-8b", - "developer": "Azure99", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4343 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4185 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0514 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2651 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.367 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2206 - } - } - ] -} 
\ No newline at end of file diff --git a/data/hfopenllm_v2/Azure99/blossom-v5.1-34b/a07b6326-f393-490e-b696-d8b45f593d4b.json b/data/hfopenllm_v2/Azure99/blossom-v5.1-34b/a07b6326-f393-490e-b696-d8b45f593d4b.json deleted file mode 100644 index 108854be5..000000000 --- a/data/hfopenllm_v2/Azure99/blossom-v5.1-34b/a07b6326-f393-490e-b696-d8b45f593d4b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Azure99_blossom-v5.1-34b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "blossom-v5.1-34b", - "id": "Azure99/blossom-v5.1-34b", - "developer": "Azure99", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 34.389 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5697 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6109 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2591 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3096 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3928 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4558 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Azure99/blossom-v5.1-9b/b66ed91a-98d5-407c-9896-9c2e2a31e9da.json b/data/hfopenllm_v2/Azure99/blossom-v5.1-9b/b66ed91a-98d5-407c-9896-9c2e2a31e9da.json deleted file mode 100644 index 812870f1b..000000000 --- 
a/data/hfopenllm_v2/Azure99/blossom-v5.1-9b/b66ed91a-98d5-407c-9896-9c2e2a31e9da.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Azure99_blossom-v5.1-9b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "blossom-v5.1-9b", - "id": "Azure99/blossom-v5.1-9b", - "developer": "Azure99", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.829 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5086 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5343 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2122 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3356 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3994 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3979 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/BAAI/Gemma2-9B-IT-Simpo-Infinity-Preference/9c70921d-956b-4727-9201-1addbd01bb8b.json b/data/hfopenllm_v2/BAAI/Gemma2-9B-IT-Simpo-Infinity-Preference/9c70921d-956b-4727-9201-1addbd01bb8b.json deleted file mode 100644 index 212643c5b..000000000 --- a/data/hfopenllm_v2/BAAI/Gemma2-9B-IT-Simpo-Infinity-Preference/9c70921d-956b-4727-9201-1addbd01bb8b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/BAAI_Gemma2-9B-IT-Simpo-Infinity-Preference/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gemma2-9B-IT-Simpo-Infinity-Preference", - "id": "BAAI/Gemma2-9B-IT-Simpo-Infinity-Preference", - "developer": "BAAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 9.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3176 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5979 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0974 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3398 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3966 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3869 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/BAAI/Infinity-Instruct-3M-0613-Llama3-70B/4ba6d51e-314a-4db4-9552-568a4093e01a.json b/data/hfopenllm_v2/BAAI/Infinity-Instruct-3M-0613-Llama3-70B/4ba6d51e-314a-4db4-9552-568a4093e01a.json deleted file mode 100644 index 3e94d0dd0..000000000 --- a/data/hfopenllm_v2/BAAI/Infinity-Instruct-3M-0613-Llama3-70B/4ba6d51e-314a-4db4-9552-568a4093e01a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/BAAI_Infinity-Instruct-3M-0613-Llama3-70B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF 
Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Infinity-Instruct-3M-0613-Llama3-70B", - "id": "BAAI/Infinity-Instruct-3M-0613-Llama3-70B", - "developer": "BAAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 70.554 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6821 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6642 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2153 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3582 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4523 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.473 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/BAAI/Infinity-Instruct-3M-0613-Mistral-7B/835f5056-56bf-4a6c-886f-fbe6f263ac07.json b/data/hfopenllm_v2/BAAI/Infinity-Instruct-3M-0613-Mistral-7B/835f5056-56bf-4a6c-886f-fbe6f263ac07.json deleted file mode 100644 index 9053fed79..000000000 --- a/data/hfopenllm_v2/BAAI/Infinity-Instruct-3M-0613-Mistral-7B/835f5056-56bf-4a6c-886f-fbe6f263ac07.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/BAAI_Infinity-Instruct-3M-0613-Mistral-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": 
"Infinity-Instruct-3M-0613-Mistral-7B", - "id": "BAAI/Infinity-Instruct-3M-0613-Mistral-7B", - "developer": "BAAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.532 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4958 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0816 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2961 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4351 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3161 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/BAAI/Infinity-Instruct-3M-0625-Llama3-70B/c2a63afa-9d25-41dc-b25f-848f5a640501.json b/data/hfopenllm_v2/BAAI/Infinity-Instruct-3M-0625-Llama3-70B/c2a63afa-9d25-41dc-b25f-848f5a640501.json deleted file mode 100644 index 090eaf46c..000000000 --- a/data/hfopenllm_v2/BAAI/Infinity-Instruct-3M-0625-Llama3-70B/c2a63afa-9d25-41dc-b25f-848f5a640501.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/BAAI_Infinity-Instruct-3M-0625-Llama3-70B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Infinity-Instruct-3M-0625-Llama3-70B", - "id": "BAAI/Infinity-Instruct-3M-0625-Llama3-70B", - "developer": "BAAI", - "inference_platform": "unknown", - "additional_details": 
{ - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 70.554 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7442 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.667 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2251 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3574 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4617 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4586 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/BAAI/Infinity-Instruct-3M-0625-Llama3-8B/f64f9d24-e448-4bb6-89c3-edb66499bac9.json b/data/hfopenllm_v2/BAAI/Infinity-Instruct-3M-0625-Llama3-8B/f64f9d24-e448-4bb6-89c3-edb66499bac9.json deleted file mode 100644 index 4f8d75121..000000000 --- a/data/hfopenllm_v2/BAAI/Infinity-Instruct-3M-0625-Llama3-8B/f64f9d24-e448-4bb6-89c3-edb66499bac9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/BAAI_Infinity-Instruct-3M-0625-Llama3-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Infinity-Instruct-3M-0625-Llama3-8B", - "id": "BAAI/Infinity-Instruct-3M-0625-Llama3-8B", - "developer": "BAAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - 
"dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.605 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4955 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0884 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2752 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3712 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3252 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/BAAI/Infinity-Instruct-3M-0625-Mistral-7B/2de14bfb-844a-4711-815e-8f63487a78fd.json b/data/hfopenllm_v2/BAAI/Infinity-Instruct-3M-0625-Mistral-7B/2de14bfb-844a-4711-815e-8f63487a78fd.json deleted file mode 100644 index ea49688f6..000000000 --- a/data/hfopenllm_v2/BAAI/Infinity-Instruct-3M-0625-Mistral-7B/2de14bfb-844a-4711-815e-8f63487a78fd.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/BAAI_Infinity-Instruct-3M-0625-Mistral-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Infinity-Instruct-3M-0625-Mistral-7B", - "id": "BAAI/Infinity-Instruct-3M-0625-Mistral-7B", - "developer": "BAAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5867 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.494 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0763 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2869 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4272 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.323 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/BAAI/Infinity-Instruct-3M-0625-Qwen2-7B/f953e0e2-ddca-42a2-a0f6-752a137bc6b5.json b/data/hfopenllm_v2/BAAI/Infinity-Instruct-3M-0625-Qwen2-7B/f953e0e2-ddca-42a2-a0f6-752a137bc6b5.json deleted file mode 100644 index ff2b44e64..000000000 --- a/data/hfopenllm_v2/BAAI/Infinity-Instruct-3M-0625-Qwen2-7B/f953e0e2-ddca-42a2-a0f6-752a137bc6b5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/BAAI_Infinity-Instruct-3M-0625-Qwen2-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Infinity-Instruct-3M-0625-Qwen2-7B", - "id": "BAAI/Infinity-Instruct-3M-0625-Qwen2-7B", - "developer": "BAAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5554 - } - }, - { - "evaluation_name": "BBH", - 
"source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5346 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1926 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3129 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3888 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.396 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/BAAI/Infinity-Instruct-3M-0625-Yi-1.5-9B/98187b98-0cc8-4756-9cb7-c53deb998f90.json b/data/hfopenllm_v2/BAAI/Infinity-Instruct-3M-0625-Yi-1.5-9B/98187b98-0cc8-4756-9cb7-c53deb998f90.json deleted file mode 100644 index 5553e65e0..000000000 --- a/data/hfopenllm_v2/BAAI/Infinity-Instruct-3M-0625-Yi-1.5-9B/98187b98-0cc8-4756-9cb7-c53deb998f90.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/BAAI_Infinity-Instruct-3M-0625-Yi-1.5-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Infinity-Instruct-3M-0625-Yi-1.5-9B", - "id": "BAAI/Infinity-Instruct-3M-0625-Yi-1.5-9B", - "developer": "BAAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.829 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5186 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5509 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1639 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.354 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4575 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4118 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/BAAI/Infinity-Instruct-7M-0729-Llama3_1-8B/8c79c60d-ebf4-4409-be4f-928a54cedd1d.json b/data/hfopenllm_v2/BAAI/Infinity-Instruct-7M-0729-Llama3_1-8B/8c79c60d-ebf4-4409-be4f-928a54cedd1d.json deleted file mode 100644 index de7d61cf3..000000000 --- a/data/hfopenllm_v2/BAAI/Infinity-Instruct-7M-0729-Llama3_1-8B/8c79c60d-ebf4-4409-be4f-928a54cedd1d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/BAAI_Infinity-Instruct-7M-0729-Llama3_1-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Infinity-Instruct-7M-0729-Llama3_1-8B", - "id": "BAAI/Infinity-Instruct-7M-0729-Llama3_1-8B", - "developer": "BAAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6132 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5077 - } - }, - { - 
"evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1276 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2928 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3578 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3224 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/BAAI/Infinity-Instruct-7M-0729-mistral-7B/5d5cebeb-faf0-4fdf-8749-6307080e82f2.json b/data/hfopenllm_v2/BAAI/Infinity-Instruct-7M-0729-mistral-7B/5d5cebeb-faf0-4fdf-8749-6307080e82f2.json deleted file mode 100644 index 27fce40f9..000000000 --- a/data/hfopenllm_v2/BAAI/Infinity-Instruct-7M-0729-mistral-7B/5d5cebeb-faf0-4fdf-8749-6307080e82f2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/BAAI_Infinity-Instruct-7M-0729-mistral-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Infinity-Instruct-7M-0729-mistral-7B", - "id": "BAAI/Infinity-Instruct-7M-0729-mistral-7B", - "developer": "BAAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6162 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4964 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, 
- "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0831 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2903 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4062 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3274 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/BAAI/Infinity-Instruct-7M-Gen-Llama3_1-70B/e926ce8f-45bb-4f3d-b579-ecadb3df6468.json b/data/hfopenllm_v2/BAAI/Infinity-Instruct-7M-Gen-Llama3_1-70B/e926ce8f-45bb-4f3d-b579-ecadb3df6468.json deleted file mode 100644 index c05f8ef93..000000000 --- a/data/hfopenllm_v2/BAAI/Infinity-Instruct-7M-Gen-Llama3_1-70B/e926ce8f-45bb-4f3d-b579-ecadb3df6468.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/BAAI_Infinity-Instruct-7M-Gen-Llama3_1-70B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Infinity-Instruct-7M-Gen-Llama3_1-70B", - "id": "BAAI/Infinity-Instruct-7M-Gen-Llama3_1-70B", - "developer": "BAAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 70.554 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7335 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6695 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.2523 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3758 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4539 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4607 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/BAAI/Infinity-Instruct-7M-Gen-Llama3_1-8B/070609d6-5f41-4712-9ad7-e215b1a6bb81.json b/data/hfopenllm_v2/BAAI/Infinity-Instruct-7M-Gen-Llama3_1-8B/070609d6-5f41-4712-9ad7-e215b1a6bb81.json deleted file mode 100644 index e21c1ef5d..000000000 --- a/data/hfopenllm_v2/BAAI/Infinity-Instruct-7M-Gen-Llama3_1-8B/070609d6-5f41-4712-9ad7-e215b1a6bb81.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/BAAI_Infinity-Instruct-7M-Gen-Llama3_1-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Infinity-Instruct-7M-Gen-Llama3_1-8B", - "id": "BAAI/Infinity-Instruct-7M-Gen-Llama3_1-8B", - "developer": "BAAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6132 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5077 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1276 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": 
"hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2928 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3578 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3224 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/BAAI/Infinity-Instruct-7M-Gen-mistral-7B/8d2909c7-37f2-4198-a1e2-4bf2ebc1444d.json b/data/hfopenllm_v2/BAAI/Infinity-Instruct-7M-Gen-mistral-7B/8d2909c7-37f2-4198-a1e2-4bf2ebc1444d.json deleted file mode 100644 index 4f2f9d23d..000000000 --- a/data/hfopenllm_v2/BAAI/Infinity-Instruct-7M-Gen-mistral-7B/8d2909c7-37f2-4198-a1e2-4bf2ebc1444d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/BAAI_Infinity-Instruct-7M-Gen-mistral-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Infinity-Instruct-7M-Gen-mistral-7B", - "id": "BAAI/Infinity-Instruct-7M-Gen-mistral-7B", - "developer": "BAAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6147 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4964 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0831 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2903 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4062 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3274 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/BAAI/OPI-Llama-3.1-8B-Instruct/53587959-25f9-43aa-a34b-f274d8bc93af.json b/data/hfopenllm_v2/BAAI/OPI-Llama-3.1-8B-Instruct/53587959-25f9-43aa-a34b-f274d8bc93af.json deleted file mode 100644 index e76dc81cd..000000000 --- a/data/hfopenllm_v2/BAAI/OPI-Llama-3.1-8B-Instruct/53587959-25f9-43aa-a34b-f274d8bc93af.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/BAAI_OPI-Llama-3.1-8B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "OPI-Llama-3.1-8B-Instruct", - "id": "BAAI/OPI-Llama-3.1-8B-Instruct", - "developer": "BAAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2075 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3551 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0136 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2743 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3233 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2124 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/BEE-spoke-data/Meta-Llama-3-8Bee/2a7f80ed-d404-4c81-b000-b65c83069121.json b/data/hfopenllm_v2/BEE-spoke-data/Meta-Llama-3-8Bee/2a7f80ed-d404-4c81-b000-b65c83069121.json deleted file mode 100644 index 3edbb6f6f..000000000 --- a/data/hfopenllm_v2/BEE-spoke-data/Meta-Llama-3-8Bee/2a7f80ed-d404-4c81-b000-b65c83069121.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/BEE-spoke-data_Meta-Llama-3-8Bee/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Meta-Llama-3-8Bee", - "id": "BEE-spoke-data/Meta-Llama-3-8Bee", - "developer": "BEE-spoke-data", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1951 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4626 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0483 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3138 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3654 - } - }, - { - 
"evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.322 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/BEE-spoke-data/smol_llama-101M-GQA/f0983645-4adb-4ddb-bf2f-33480cb7f421.json b/data/hfopenllm_v2/BEE-spoke-data/smol_llama-101M-GQA/f0983645-4adb-4ddb-bf2f-33480cb7f421.json deleted file mode 100644 index 1bb779c1b..000000000 --- a/data/hfopenllm_v2/BEE-spoke-data/smol_llama-101M-GQA/f0983645-4adb-4ddb-bf2f-33480cb7f421.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/BEE-spoke-data_smol_llama-101M-GQA/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "smol_llama-101M-GQA", - "id": "BEE-spoke-data/smol_llama-101M-GQA", - "developer": "BEE-spoke-data", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 0.101 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1384 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3018 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.006 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2576 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3713 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on 
MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1107 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/BEE-spoke-data/smol_llama-220M-GQA-fineweb_edu/161dadfe-4983-4f56-8a7d-9b97f1c5a3c7.json b/data/hfopenllm_v2/BEE-spoke-data/smol_llama-220M-GQA-fineweb_edu/161dadfe-4983-4f56-8a7d-9b97f1c5a3c7.json deleted file mode 100644 index d7af3cb33..000000000 --- a/data/hfopenllm_v2/BEE-spoke-data/smol_llama-220M-GQA-fineweb_edu/161dadfe-4983-4f56-8a7d-9b97f1c5a3c7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/BEE-spoke-data_smol_llama-220M-GQA-fineweb_edu/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "smol_llama-220M-GQA-fineweb_edu", - "id": "BEE-spoke-data/smol_llama-220M-GQA-fineweb_edu", - "developer": "BEE-spoke-data", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 0.218 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1988 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2929 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0068 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2592 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4368 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 
0.1127 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/BEE-spoke-data/smol_llama-220M-GQA/694a02f9-4729-4d0b-97ce-80adaef29be2.json b/data/hfopenllm_v2/BEE-spoke-data/smol_llama-220M-GQA/694a02f9-4729-4d0b-97ce-80adaef29be2.json deleted file mode 100644 index aa2c6a93b..000000000 --- a/data/hfopenllm_v2/BEE-spoke-data/smol_llama-220M-GQA/694a02f9-4729-4d0b-97ce-80adaef29be2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/BEE-spoke-data_smol_llama-220M-GQA/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "smol_llama-220M-GQA", - "id": "BEE-spoke-data/smol_llama-220M-GQA", - "developer": "BEE-spoke-data", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 0.218 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2386 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3032 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0106 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2559 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4059 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1149 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/BEE-spoke-data/smol_llama-220M-openhermes/0521f51d-22c1-4821-8f04-23c533411668.json 
b/data/hfopenllm_v2/BEE-spoke-data/smol_llama-220M-openhermes/0521f51d-22c1-4821-8f04-23c533411668.json deleted file mode 100644 index 7df19fae4..000000000 --- a/data/hfopenllm_v2/BEE-spoke-data/smol_llama-220M-openhermes/0521f51d-22c1-4821-8f04-23c533411668.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/BEE-spoke-data_smol_llama-220M-openhermes/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "smol_llama-220M-openhermes", - "id": "BEE-spoke-data/smol_llama-220M-openhermes", - "developer": "BEE-spoke-data", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 0.218 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1555 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3028 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0106 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2676 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3847 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.112 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/BEE-spoke-data/tFINE-900m-e16-d32-flan-infinity-instruct-7m-T2T_en-1024/8fdea71b-5e68-4a78-aefc-8a00650464c4.json b/data/hfopenllm_v2/BEE-spoke-data/tFINE-900m-e16-d32-flan-infinity-instruct-7m-T2T_en-1024/8fdea71b-5e68-4a78-aefc-8a00650464c4.json deleted file 
mode 100644 index 18703f8e9..000000000 --- a/data/hfopenllm_v2/BEE-spoke-data/tFINE-900m-e16-d32-flan-infinity-instruct-7m-T2T_en-1024/8fdea71b-5e68-4a78-aefc-8a00650464c4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/BEE-spoke-data_tFINE-900m-e16-d32-flan-infinity-instruct-7m-T2T_en-1024/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "tFINE-900m-e16-d32-flan-infinity-instruct-7m-T2T_en-1024", - "id": "BEE-spoke-data/tFINE-900m-e16-d32-flan-infinity-instruct-7m-T2T_en-1024", - "developer": "BEE-spoke-data", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "T5ForConditionalGeneration", - "params_billions": 0.887 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1321 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3138 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0106 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2542 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4393 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1237 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/BEE-spoke-data/tFINE-900m-e16-d32-flan/e2ba5674-9251-4a4e-9eb8-046c834da400.json b/data/hfopenllm_v2/BEE-spoke-data/tFINE-900m-e16-d32-flan/e2ba5674-9251-4a4e-9eb8-046c834da400.json deleted file mode 100644 index c330cd864..000000000 --- 
a/data/hfopenllm_v2/BEE-spoke-data/tFINE-900m-e16-d32-flan/e2ba5674-9251-4a4e-9eb8-046c834da400.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/BEE-spoke-data_tFINE-900m-e16-d32-flan/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "tFINE-900m-e16-d32-flan", - "id": "BEE-spoke-data/tFINE-900m-e16-d32-flan", - "developer": "BEE-spoke-data", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "T5ForConditionalGeneration", - "params_billions": 0.887 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1506 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3028 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0098 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2332 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3724 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1307 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/BEE-spoke-data/tFINE-900m-e16-d32-instruct_2e/4caafdb2-3065-40d4-b5a7-9deb41e1d8a7.json b/data/hfopenllm_v2/BEE-spoke-data/tFINE-900m-e16-d32-instruct_2e/4caafdb2-3065-40d4-b5a7-9deb41e1d8a7.json deleted file mode 100644 index 2509ffeaa..000000000 --- a/data/hfopenllm_v2/BEE-spoke-data/tFINE-900m-e16-d32-instruct_2e/4caafdb2-3065-40d4-b5a7-9deb41e1d8a7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": 
"0.2.0", - "evaluation_id": "hfopenllm_v2/BEE-spoke-data_tFINE-900m-e16-d32-instruct_2e/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "tFINE-900m-e16-d32-instruct_2e", - "id": "BEE-spoke-data/tFINE-900m-e16-d32-instruct_2e", - "developer": "BEE-spoke-data", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "T5ForConditionalGeneration", - "params_billions": 0.887 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1403 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3135 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0136 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2592 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4207 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1237 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/BEE-spoke-data/tFINE-900m-instruct-orpo/886e0b8b-b2dc-434f-a299-50f668006241.json b/data/hfopenllm_v2/BEE-spoke-data/tFINE-900m-instruct-orpo/886e0b8b-b2dc-434f-a299-50f668006241.json deleted file mode 100644 index 2061ed658..000000000 --- a/data/hfopenllm_v2/BEE-spoke-data/tFINE-900m-instruct-orpo/886e0b8b-b2dc-434f-a299-50f668006241.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/BEE-spoke-data_tFINE-900m-instruct-orpo/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - 
"source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "tFINE-900m-instruct-orpo", - "id": "BEE-spoke-data/tFINE-900m-instruct-orpo", - "developer": "BEE-spoke-data", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "T5ForConditionalGeneration", - "params_billions": 0.887 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.133 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3022 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0159 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2592 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3409 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1152 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/BSC-LT/salamandra-7b-instruct/7a6a9443-f331-4dfa-acf9-6aa30049bade.json b/data/hfopenllm_v2/BSC-LT/salamandra-7b-instruct/7a6a9443-f331-4dfa-acf9-6aa30049bade.json deleted file mode 100644 index 83c4bafbf..000000000 --- a/data/hfopenllm_v2/BSC-LT/salamandra-7b-instruct/7a6a9443-f331-4dfa-acf9-6aa30049bade.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/BSC-LT_salamandra-7b-instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": 
"salamandra-7b-instruct", - "id": "BSC-LT/salamandra-7b-instruct", - "developer": "BSC-LT", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 7.768 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2451 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3851 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0083 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2643 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4134 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1805 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/BSC-LT/salamandra-7b/6d523da4-ec4a-405b-a25d-afc7b1b5aefd.json b/data/hfopenllm_v2/BSC-LT/salamandra-7b/6d523da4-ec4a-405b-a25d-afc7b1b5aefd.json deleted file mode 100644 index c5d56580f..000000000 --- a/data/hfopenllm_v2/BSC-LT/salamandra-7b/6d523da4-ec4a-405b-a25d-afc7b1b5aefd.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/BSC-LT_salamandra-7b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "salamandra-7b", - "id": "BSC-LT/salamandra-7b", - "developer": "BSC-LT", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 7.768 - } - }, - "evaluation_results": [ - { - 
"evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1367 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3517 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0038 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2701 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3501 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1493 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Ba2han/Llama-Phi-3_DoRA/cfecfce3-090d-4c2e-826c-03c0c5337e98.json b/data/hfopenllm_v2/Ba2han/Llama-Phi-3_DoRA/cfecfce3-090d-4c2e-826c-03c0c5337e98.json deleted file mode 100644 index a299a3968..000000000 --- a/data/hfopenllm_v2/Ba2han/Llama-Phi-3_DoRA/cfecfce3-090d-4c2e-826c-03c0c5337e98.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Ba2han_Llama-Phi-3_DoRA/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-Phi-3_DoRA", - "id": "Ba2han/Llama-Phi-3_DoRA", - "developer": "Ba2han", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 3.821 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5131 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5515 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1216 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3263 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4069 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3915 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Baptiste-HUVELLE-10/LeTriomphant2.2_ECE_iLAB/5aa124dc-4abd-4c5f-b40a-a8d81af922eb.json b/data/hfopenllm_v2/Baptiste-HUVELLE-10/LeTriomphant2.2_ECE_iLAB/5aa124dc-4abd-4c5f-b40a-a8d81af922eb.json deleted file mode 100644 index b23494890..000000000 --- a/data/hfopenllm_v2/Baptiste-HUVELLE-10/LeTriomphant2.2_ECE_iLAB/5aa124dc-4abd-4c5f-b40a-a8d81af922eb.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Baptiste-HUVELLE-10_LeTriomphant2.2_ECE_iLAB/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "LeTriomphant2.2_ECE_iLAB", - "id": "Baptiste-HUVELLE-10/LeTriomphant2.2_ECE_iLAB", - "developer": "Baptiste-HUVELLE-10", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 72.706 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5076 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - 
"dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7256 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4449 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3993 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4626 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5851 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/BenevolenceMessiah/Qwen2.5-72B-2x-Instruct-TIES-v1.0/ec91b122-c8f5-4dfb-94fd-336ef78c3e14.json b/data/hfopenllm_v2/BenevolenceMessiah/Qwen2.5-72B-2x-Instruct-TIES-v1.0/ec91b122-c8f5-4dfb-94fd-336ef78c3e14.json deleted file mode 100644 index 7d796c600..000000000 --- a/data/hfopenllm_v2/BenevolenceMessiah/Qwen2.5-72B-2x-Instruct-TIES-v1.0/ec91b122-c8f5-4dfb-94fd-336ef78c3e14.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/BenevolenceMessiah_Qwen2.5-72B-2x-Instruct-TIES-v1.0/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-72B-2x-Instruct-TIES-v1.0", - "id": "BenevolenceMessiah/Qwen2.5-72B-2x-Instruct-TIES-v1.0", - "developer": "BenevolenceMessiah", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 72.7 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5473 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { 
- "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7273 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5785 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3674 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4207 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5628 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/BenevolenceMessiah/Yi-Coder-9B-Chat-Instruct-TIES-MoE-v1.0/114f246a-6049-40bf-ad86-9a822d13cf74.json b/data/hfopenllm_v2/BenevolenceMessiah/Yi-Coder-9B-Chat-Instruct-TIES-MoE-v1.0/114f246a-6049-40bf-ad86-9a822d13cf74.json deleted file mode 100644 index 49b763b11..000000000 --- a/data/hfopenllm_v2/BenevolenceMessiah/Yi-Coder-9B-Chat-Instruct-TIES-MoE-v1.0/114f246a-6049-40bf-ad86-9a822d13cf74.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/BenevolenceMessiah_Yi-Coder-9B-Chat-Instruct-TIES-MoE-v1.0/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Yi-Coder-9B-Chat-Instruct-TIES-MoE-v1.0", - "id": "BenevolenceMessiah/Yi-Coder-9B-Chat-Instruct-TIES-MoE-v1.0", - "developer": "BenevolenceMessiah", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MixtralForCausalLM", - "params_billions": 28.309 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3012 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": 
false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4909 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0415 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2626 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.408 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.268 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/BlackBeenie/Bloslain-8B-v0.2/82d28a3a-44f2-463f-a1b8-7e9079ec47b7.json b/data/hfopenllm_v2/BlackBeenie/Bloslain-8B-v0.2/82d28a3a-44f2-463f-a1b8-7e9079ec47b7.json deleted file mode 100644 index 40f3caa9f..000000000 --- a/data/hfopenllm_v2/BlackBeenie/Bloslain-8B-v0.2/82d28a3a-44f2-463f-a1b8-7e9079ec47b7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/BlackBeenie_Bloslain-8B-v0.2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Bloslain-8B-v0.2", - "id": "BlackBeenie/Bloslain-8B-v0.2", - "developer": "BlackBeenie", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5023 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5111 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - 
"hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.145 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3062 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4076 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3654 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/BlackBeenie/Llama-3.1-8B-OpenO1-SFT-v0.1/ed3c1349-a154-4866-890f-2b115ffaf127.json b/data/hfopenllm_v2/BlackBeenie/Llama-3.1-8B-OpenO1-SFT-v0.1/ed3c1349-a154-4866-890f-2b115ffaf127.json deleted file mode 100644 index ce32e3e07..000000000 --- a/data/hfopenllm_v2/BlackBeenie/Llama-3.1-8B-OpenO1-SFT-v0.1/ed3c1349-a154-4866-890f-2b115ffaf127.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/BlackBeenie_Llama-3.1-8B-OpenO1-SFT-v0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.1-8B-OpenO1-SFT-v0.1", - "id": "BlackBeenie/Llama-3.1-8B-OpenO1-SFT-v0.1", - "developer": "BlackBeenie", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5124 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4787 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1526 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2685 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3618 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3492 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/BlackBeenie/Llama-3.1-8B-pythonic-passthrough-merge/47942c55-5ddb-4fda-9c5b-34676ae2046a.json b/data/hfopenllm_v2/BlackBeenie/Llama-3.1-8B-pythonic-passthrough-merge/47942c55-5ddb-4fda-9c5b-34676ae2046a.json deleted file mode 100644 index 3447f8e90..000000000 --- a/data/hfopenllm_v2/BlackBeenie/Llama-3.1-8B-pythonic-passthrough-merge/47942c55-5ddb-4fda-9c5b-34676ae2046a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/BlackBeenie_Llama-3.1-8B-pythonic-passthrough-merge/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.1-8B-pythonic-passthrough-merge", - "id": "BlackBeenie/Llama-3.1-8B-pythonic-passthrough-merge", - "developer": "BlackBeenie", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 20.245 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2316 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3454 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0113 - } - }, - { - 
"evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2685 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3778 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1332 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/BlackBeenie/Neos-Gemma-2-9b/d860210b-4c8a-4d15-ad3a-4e39905f91ed.json b/data/hfopenllm_v2/BlackBeenie/Neos-Gemma-2-9b/d860210b-4c8a-4d15-ad3a-4e39905f91ed.json deleted file mode 100644 index 7ae80464f..000000000 --- a/data/hfopenllm_v2/BlackBeenie/Neos-Gemma-2-9b/d860210b-4c8a-4d15-ad3a-4e39905f91ed.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/BlackBeenie_Neos-Gemma-2-9b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Neos-Gemma-2-9b", - "id": "BlackBeenie/Neos-Gemma-2-9b", - "developer": "BlackBeenie", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 9.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5876 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5503 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0982 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.323 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3618 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3981 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/BlackBeenie/Neos-Llama-3.1-8B/d137f429-2b65-4ee9-9d66-3f619b270fad.json b/data/hfopenllm_v2/BlackBeenie/Neos-Llama-3.1-8B/d137f429-2b65-4ee9-9d66-3f619b270fad.json deleted file mode 100644 index d7f22bbd1..000000000 --- a/data/hfopenllm_v2/BlackBeenie/Neos-Llama-3.1-8B/d137f429-2b65-4ee9-9d66-3f619b270fad.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/BlackBeenie_Neos-Llama-3.1-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Neos-Llama-3.1-8B", - "id": "BlackBeenie/Neos-Llama-3.1-8B", - "developer": "BlackBeenie", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4944 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4425 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1322 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2685 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, 
- "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.375 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3262 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/BlackBeenie/Neos-Llama-3.1-base/1da10dfe-b0a3-4cb8-aaa3-e16d48f3aab4.json b/data/hfopenllm_v2/BlackBeenie/Neos-Llama-3.1-base/1da10dfe-b0a3-4cb8-aaa3-e16d48f3aab4.json deleted file mode 100644 index 4db1b8828..000000000 --- a/data/hfopenllm_v2/BlackBeenie/Neos-Llama-3.1-base/1da10dfe-b0a3-4cb8-aaa3-e16d48f3aab4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/BlackBeenie_Neos-Llama-3.1-base/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Neos-Llama-3.1-base", - "id": "BlackBeenie/Neos-Llama-3.1-base", - "developer": "BlackBeenie", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 4.65 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1751 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.293 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2374 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3499 - } - }, - { - 
"evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1112 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/BlackBeenie/Neos-Phi-3-14B-v0.1/6156a0d2-4c32-40b2-9624-ef0c7a6a95bb.json b/data/hfopenllm_v2/BlackBeenie/Neos-Phi-3-14B-v0.1/6156a0d2-4c32-40b2-9624-ef0c7a6a95bb.json deleted file mode 100644 index 5db8afdb2..000000000 --- a/data/hfopenllm_v2/BlackBeenie/Neos-Phi-3-14B-v0.1/6156a0d2-4c32-40b2-9624-ef0c7a6a95bb.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/BlackBeenie_Neos-Phi-3-14B-v0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Neos-Phi-3-14B-v0.1", - "id": "BlackBeenie/Neos-Phi-3-14B-v0.1", - "developer": "BlackBeenie", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Phi3ForCausalLM", - "params_billions": 13.96 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4022 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6212 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1782 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3054 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4125 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4564 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/BlackBeenie/llama-3-luminous-merged/676342d2-f37a-4b6a-967d-3ac750243470.json b/data/hfopenllm_v2/BlackBeenie/llama-3-luminous-merged/676342d2-f37a-4b6a-967d-3ac750243470.json deleted file mode 100644 index 2c979d2a9..000000000 --- a/data/hfopenllm_v2/BlackBeenie/llama-3-luminous-merged/676342d2-f37a-4b6a-967d-3ac750243470.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/BlackBeenie_llama-3-luminous-merged/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "llama-3-luminous-merged", - "id": "BlackBeenie/llama-3-luminous-merged", - "developer": "BlackBeenie", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4323 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5154 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0869 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2928 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4149 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3773 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/BlackBeenie/llama-3.1-8B-Galore-openassistant-guanaco/950b7108-0192-4875-b4e9-c3e43ab71e08.json b/data/hfopenllm_v2/BlackBeenie/llama-3.1-8B-Galore-openassistant-guanaco/950b7108-0192-4875-b4e9-c3e43ab71e08.json deleted file mode 100644 index 776b081c0..000000000 --- a/data/hfopenllm_v2/BlackBeenie/llama-3.1-8B-Galore-openassistant-guanaco/950b7108-0192-4875-b4e9-c3e43ab71e08.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/BlackBeenie_llama-3.1-8B-Galore-openassistant-guanaco/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "llama-3.1-8B-Galore-openassistant-guanaco", - "id": "BlackBeenie/llama-3.1-8B-Galore-openassistant-guanaco", - "developer": "BlackBeenie", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2635 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5213 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0665 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3003 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4406 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3206 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Bllossom/llama-3.2-Korean-Bllossom-AICA-5B/85672df5-2f35-43be-8648-9937c66872dc.json 
b/data/hfopenllm_v2/Bllossom/llama-3.2-Korean-Bllossom-AICA-5B/85672df5-2f35-43be-8648-9937c66872dc.json deleted file mode 100644 index 2f5bfbf50..000000000 --- a/data/hfopenllm_v2/Bllossom/llama-3.2-Korean-Bllossom-AICA-5B/85672df5-2f35-43be-8648-9937c66872dc.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Bllossom_llama-3.2-Korean-Bllossom-AICA-5B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "llama-3.2-Korean-Bllossom-AICA-5B", - "id": "Bllossom/llama-3.2-Korean-Bllossom-AICA-5B", - "developer": "Bllossom", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MllamaForConditionalGeneration", - "params_billions": 5.199 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5172 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4293 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1239 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2987 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3834 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.271 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/BoltMonkey/DreadMix/051c5642-3b23-4879-9d10-639d1b3127d7.json b/data/hfopenllm_v2/BoltMonkey/DreadMix/051c5642-3b23-4879-9d10-639d1b3127d7.json deleted file mode 100644 index 7ccd4a48e..000000000 --- 
a/data/hfopenllm_v2/BoltMonkey/DreadMix/051c5642-3b23-4879-9d10-639d1b3127d7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/BoltMonkey_DreadMix/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "DreadMix", - "id": "BoltMonkey/DreadMix", - "developer": "BoltMonkey", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7095 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5435 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1556 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2995 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4212 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.379 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/BoltMonkey/NeuralDaredevil-SuperNova-Lite-7B-DARETIES-abliterated/2acf0d12-7e0c-46dc-a079-ebc48a8818d3.json b/data/hfopenllm_v2/BoltMonkey/NeuralDaredevil-SuperNova-Lite-7B-DARETIES-abliterated/2acf0d12-7e0c-46dc-a079-ebc48a8818d3.json deleted file mode 100644 index 7730b26d1..000000000 --- a/data/hfopenllm_v2/BoltMonkey/NeuralDaredevil-SuperNova-Lite-7B-DARETIES-abliterated/2acf0d12-7e0c-46dc-a079-ebc48a8818d3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/BoltMonkey_NeuralDaredevil-SuperNova-Lite-7B-DARETIES-abliterated/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "NeuralDaredevil-SuperNova-Lite-7B-DARETIES-abliterated", - "id": "BoltMonkey/NeuralDaredevil-SuperNova-Lite-7B-DARETIES-abliterated", - "developer": "BoltMonkey", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7999 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5152 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1193 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.281 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4019 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3733 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/BoltMonkey/NeuralDaredevil-SuperNova-Lite-7B-DARETIES-abliterated/8ce42090-006e-4e08-8d3f-5b1eb0b8da0b.json b/data/hfopenllm_v2/BoltMonkey/NeuralDaredevil-SuperNova-Lite-7B-DARETIES-abliterated/8ce42090-006e-4e08-8d3f-5b1eb0b8da0b.json deleted file mode 100644 index 0ea3c09c9..000000000 --- a/data/hfopenllm_v2/BoltMonkey/NeuralDaredevil-SuperNova-Lite-7B-DARETIES-abliterated/8ce42090-006e-4e08-8d3f-5b1eb0b8da0b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/BoltMonkey_NeuralDaredevil-SuperNova-Lite-7B-DARETIES-abliterated/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "NeuralDaredevil-SuperNova-Lite-7B-DARETIES-abliterated", - "id": "BoltMonkey/NeuralDaredevil-SuperNova-Lite-7B-DARETIES-abliterated", - "developer": "BoltMonkey", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.459 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5185 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0937 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2743 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4083 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3631 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/BoltMonkey/SuperNeuralDreadDevil-8b/703df6c3-dae4-437f-9379-f8c264797adc.json b/data/hfopenllm_v2/BoltMonkey/SuperNeuralDreadDevil-8b/703df6c3-dae4-437f-9379-f8c264797adc.json deleted file mode 100644 index 00d308164..000000000 --- a/data/hfopenllm_v2/BoltMonkey/SuperNeuralDreadDevil-8b/703df6c3-dae4-437f-9379-f8c264797adc.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/BoltMonkey_SuperNeuralDreadDevil-8b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - 
"source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SuperNeuralDreadDevil-8b", - "id": "BoltMonkey/SuperNeuralDreadDevil-8b", - "developer": "BoltMonkey", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.771 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5286 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0929 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2919 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3977 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3679 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/BrainWave-ML/llama3.2-3B-maths-orpo/1e349ad3-d29b-4a4b-97e7-b82055e41b07.json b/data/hfopenllm_v2/BrainWave-ML/llama3.2-3B-maths-orpo/1e349ad3-d29b-4a4b-97e7-b82055e41b07.json deleted file mode 100644 index 4d4697ad4..000000000 --- a/data/hfopenllm_v2/BrainWave-ML/llama3.2-3B-maths-orpo/1e349ad3-d29b-4a4b-97e7-b82055e41b07.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/BrainWave-ML_llama3.2-3B-maths-orpo/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": 
"llama3.2-3B-maths-orpo", - "id": "BrainWave-ML/llama3.2-3B-maths-orpo", - "developer": "BrainWave-ML", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2049 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2912 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2592 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3575 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1168 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/BramVanroy/GEITje-7B-ultra/8f677a76-932c-4c35-9708-4b723226aa19.json b/data/hfopenllm_v2/BramVanroy/GEITje-7B-ultra/8f677a76-932c-4c35-9708-4b723226aa19.json deleted file mode 100644 index 876af8a9c..000000000 --- a/data/hfopenllm_v2/BramVanroy/GEITje-7B-ultra/8f677a76-932c-4c35-9708-4b723226aa19.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/BramVanroy_GEITje-7B-ultra/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "GEITje-7B-ultra", - "id": "BramVanroy/GEITje-7B-ultra", - "developer": "BramVanroy", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - 
"evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3723 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3776 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0159 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2626 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.329 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2011 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/BramVanroy/fietje-2-chat/ebfe625f-ff1f-45f9-826c-9351ea4134e1.json b/data/hfopenllm_v2/BramVanroy/fietje-2-chat/ebfe625f-ff1f-45f9-826c-9351ea4134e1.json deleted file mode 100644 index 3c12f59b5..000000000 --- a/data/hfopenllm_v2/BramVanroy/fietje-2-chat/ebfe625f-ff1f-45f9-826c-9351ea4134e1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/BramVanroy_fietje-2-chat/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "fietje-2-chat", - "id": "BramVanroy/fietje-2-chat", - "developer": "BramVanroy", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "PhiForCausalLM", - "params_billions": 2.775 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2917 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.415 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0189 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2399 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3528 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2055 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/BramVanroy/fietje-2-instruct/66e6a757-ac22-47f3-82ce-81af45e1d3cf.json b/data/hfopenllm_v2/BramVanroy/fietje-2-instruct/66e6a757-ac22-47f3-82ce-81af45e1d3cf.json deleted file mode 100644 index 667f588bf..000000000 --- a/data/hfopenllm_v2/BramVanroy/fietje-2-instruct/66e6a757-ac22-47f3-82ce-81af45e1d3cf.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/BramVanroy_fietje-2-instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "fietje-2-instruct", - "id": "BramVanroy/fietje-2-instruct", - "developer": "BramVanroy", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "PhiForCausalLM", - "params_billions": 2.775 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.279 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" 
- }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4136 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0227 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2332 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3369 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2104 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/BramVanroy/fietje-2/1cd840c7-d432-495c-a3df-af1fa6264259.json b/data/hfopenllm_v2/BramVanroy/fietje-2/1cd840c7-d432-495c-a3df-af1fa6264259.json deleted file mode 100644 index 9d714f7c1..000000000 --- a/data/hfopenllm_v2/BramVanroy/fietje-2/1cd840c7-d432-495c-a3df-af1fa6264259.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/BramVanroy_fietje-2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "fietje-2", - "id": "BramVanroy/fietje-2", - "developer": "BramVanroy", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "PhiForCausalLM", - "params_billions": 2.78 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2098 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4036 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH 
Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0159 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2542 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3696 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1986 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/CYFRAGOVPL/Llama-PLLuM-8B-base/066f520f-9a64-4564-abfc-6435732c3585.json b/data/hfopenllm_v2/CYFRAGOVPL/Llama-PLLuM-8B-base/066f520f-9a64-4564-abfc-6435732c3585.json deleted file mode 100644 index 72e310100..000000000 --- a/data/hfopenllm_v2/CYFRAGOVPL/Llama-PLLuM-8B-base/066f520f-9a64-4564-abfc-6435732c3585.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/CYFRAGOVPL_Llama-PLLuM-8B-base/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-PLLuM-8B-base", - "id": "CYFRAGOVPL/Llama-PLLuM-8B-base", - "developer": "CYFRAGOVPL", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2899 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.432 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0363 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2852 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.397 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2757 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/CYFRAGOVPL/Llama-PLLuM-8B-chat/aced5181-040a-48c0-bc5f-78d0de3afae8.json b/data/hfopenllm_v2/CYFRAGOVPL/Llama-PLLuM-8B-chat/aced5181-040a-48c0-bc5f-78d0de3afae8.json deleted file mode 100644 index 8c1a7d263..000000000 --- a/data/hfopenllm_v2/CYFRAGOVPL/Llama-PLLuM-8B-chat/aced5181-040a-48c0-bc5f-78d0de3afae8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/CYFRAGOVPL_Llama-PLLuM-8B-chat/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-PLLuM-8B-chat", - "id": "CYFRAGOVPL/Llama-PLLuM-8B-chat", - "developer": "CYFRAGOVPL", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3515 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4077 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.034 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2643 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4199 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2719 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/CYFRAGOVPL/PLLuM-12B-base/a4889a38-84d2-4ae1-b8a9-297b4400602d.json b/data/hfopenllm_v2/CYFRAGOVPL/PLLuM-12B-base/a4889a38-84d2-4ae1-b8a9-297b4400602d.json deleted file mode 100644 index ec8ff80a8..000000000 --- a/data/hfopenllm_v2/CYFRAGOVPL/PLLuM-12B-base/a4889a38-84d2-4ae1-b8a9-297b4400602d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/CYFRAGOVPL_PLLuM-12B-base/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "PLLuM-12B-base", - "id": "CYFRAGOVPL/PLLuM-12B-base", - "developer": "CYFRAGOVPL", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2821 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4391 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0287 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2903 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { 
- "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4142 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.274 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/CYFRAGOVPL/PLLuM-12B-chat/d540505a-c67b-4b72-a53a-c03aa6f8d3e7.json b/data/hfopenllm_v2/CYFRAGOVPL/PLLuM-12B-chat/d540505a-c67b-4b72-a53a-c03aa6f8d3e7.json deleted file mode 100644 index b607ed69c..000000000 --- a/data/hfopenllm_v2/CYFRAGOVPL/PLLuM-12B-chat/d540505a-c67b-4b72-a53a-c03aa6f8d3e7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/CYFRAGOVPL_PLLuM-12B-chat/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "PLLuM-12B-chat", - "id": "CYFRAGOVPL/PLLuM-12B-chat", - "developer": "CYFRAGOVPL", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3214 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4446 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0181 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2601 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.4115 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2872 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/CYFRAGOVPL/PLLuM-12B-nc-base/9859afee-02ca-4c48-acc8-acfd20c37e4e.json b/data/hfopenllm_v2/CYFRAGOVPL/PLLuM-12B-nc-base/9859afee-02ca-4c48-acc8-acfd20c37e4e.json deleted file mode 100644 index d200117b7..000000000 --- a/data/hfopenllm_v2/CYFRAGOVPL/PLLuM-12B-nc-base/9859afee-02ca-4c48-acc8-acfd20c37e4e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/CYFRAGOVPL_PLLuM-12B-nc-base/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "PLLuM-12B-nc-base", - "id": "CYFRAGOVPL/PLLuM-12B-nc-base", - "developer": "CYFRAGOVPL", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2405 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4277 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0219 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2701 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3645 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": 
"Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2559 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/CYFRAGOVPL/PLLuM-12B-nc-chat/e222d12b-c796-4890-a584-cd689bae7ea6.json b/data/hfopenllm_v2/CYFRAGOVPL/PLLuM-12B-nc-chat/e222d12b-c796-4890-a584-cd689bae7ea6.json deleted file mode 100644 index 2eaeab787..000000000 --- a/data/hfopenllm_v2/CYFRAGOVPL/PLLuM-12B-nc-chat/e222d12b-c796-4890-a584-cd689bae7ea6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/CYFRAGOVPL_PLLuM-12B-nc-chat/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "PLLuM-12B-nc-chat", - "id": "CYFRAGOVPL/PLLuM-12B-nc-chat", - "developer": "CYFRAGOVPL", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2834 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4576 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0121 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2827 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4354 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2597 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/CarrotAI/Llama-3.2-Rabbit-Ko-3B-Instruct-2412/c16850f8-0b80-4455-8f38-8ec453cd1d41.json b/data/hfopenllm_v2/CarrotAI/Llama-3.2-Rabbit-Ko-3B-Instruct-2412/c16850f8-0b80-4455-8f38-8ec453cd1d41.json deleted file mode 100644 index 49e80d94c..000000000 --- a/data/hfopenllm_v2/CarrotAI/Llama-3.2-Rabbit-Ko-3B-Instruct-2412/c16850f8-0b80-4455-8f38-8ec453cd1d41.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/CarrotAI_Llama-3.2-Rabbit-Ko-3B-Instruct-2412/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.2-Rabbit-Ko-3B-Instruct-2412", - "id": "CarrotAI/Llama-3.2-Rabbit-Ko-3B-Instruct-2412", - "developer": "CarrotAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4782 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4358 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.176 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2928 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3872 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3134 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/CarrotAI/Llama-3.2-Rabbit-Ko-3B-Instruct/0d400b0f-cc82-4c86-b600-93a31b133f9d.json 
b/data/hfopenllm_v2/CarrotAI/Llama-3.2-Rabbit-Ko-3B-Instruct/0d400b0f-cc82-4c86-b600-93a31b133f9d.json deleted file mode 100644 index 727b90c2f..000000000 --- a/data/hfopenllm_v2/CarrotAI/Llama-3.2-Rabbit-Ko-3B-Instruct/0d400b0f-cc82-4c86-b600-93a31b133f9d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/CarrotAI_Llama-3.2-Rabbit-Ko-3B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.2-Rabbit-Ko-3B-Instruct", - "id": "CarrotAI/Llama-3.2-Rabbit-Ko-3B-Instruct", - "developer": "CarrotAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7199 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4427 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2054 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.271 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3649 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2822 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Casual-Autopsy/L3-Umbral-Mind-RP-v2.0-8B/90f6f8f1-02fc-425a-8499-e9b43ae8ac59.json b/data/hfopenllm_v2/Casual-Autopsy/L3-Umbral-Mind-RP-v2.0-8B/90f6f8f1-02fc-425a-8499-e9b43ae8ac59.json deleted file mode 100644 index ea628405b..000000000 --- 
a/data/hfopenllm_v2/Casual-Autopsy/L3-Umbral-Mind-RP-v2.0-8B/90f6f8f1-02fc-425a-8499-e9b43ae8ac59.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Casual-Autopsy_L3-Umbral-Mind-RP-v2.0-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "L3-Umbral-Mind-RP-v2.0-8B", - "id": "Casual-Autopsy/L3-Umbral-Mind-RP-v2.0-8B", - "developer": "Casual-Autopsy", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7123 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5262 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1095 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2869 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3687 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3723 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/CausalLM/14B/6704d6bc-6d38-4c59-87a4-81d3eacde3b1.json b/data/hfopenllm_v2/CausalLM/14B/6704d6bc-6d38-4c59-87a4-81d3eacde3b1.json deleted file mode 100644 index aaa13fb97..000000000 --- a/data/hfopenllm_v2/CausalLM/14B/6704d6bc-6d38-4c59-87a4-81d3eacde3b1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/CausalLM_14B/1770682486.623709", - "retrieved_timestamp": 
"1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "14B", - "id": "CausalLM/14B", - "developer": "CausalLM", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2788 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.47 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0755 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3029 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4155 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3221 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/CausalLM/34b-beta/e8ad6ce4-7efc-499e-a2c9-9e0df898fbb9.json b/data/hfopenllm_v2/CausalLM/34b-beta/e8ad6ce4-7efc-499e-a2c9-9e0df898fbb9.json deleted file mode 100644 index 19e23bb71..000000000 --- a/data/hfopenllm_v2/CausalLM/34b-beta/e8ad6ce4-7efc-499e-a2c9-9e0df898fbb9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/CausalLM_34b-beta/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "34b-beta", - "id": "CausalLM/34b-beta", - "developer": "CausalLM", - "inference_platform": 
"unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 34.389 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3043 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5591 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0483 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3465 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3749 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5325 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/CausalLM/preview-1-hf/5e9c1273-536d-4280-8fff-9931f46dc968.json b/data/hfopenllm_v2/CausalLM/preview-1-hf/5e9c1273-536d-4280-8fff-9931f46dc968.json deleted file mode 100644 index 571a2c125..000000000 --- a/data/hfopenllm_v2/CausalLM/preview-1-hf/5e9c1273-536d-4280-8fff-9931f46dc968.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/CausalLM_preview-1-hf/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "preview-1-hf", - "id": "CausalLM/preview-1-hf", - "developer": "CausalLM", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "GlmForCausalLM", - "params_billions": 9.543 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": 
"google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5559 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3615 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0302 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2617 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3422 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3597 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Changgil/K2S3-14b-v0.2/460ca160-ac34-4091-ba2d-986b53532b55.json b/data/hfopenllm_v2/Changgil/K2S3-14b-v0.2/460ca160-ac34-4091-ba2d-986b53532b55.json deleted file mode 100644 index c087ab2dc..000000000 --- a/data/hfopenllm_v2/Changgil/K2S3-14b-v0.2/460ca160-ac34-4091-ba2d-986b53532b55.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Changgil_K2S3-14b-v0.2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "K2S3-14b-v0.2", - "id": "Changgil/K2S3-14b-v0.2", - "developer": "Changgil", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 14.352 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3243 - } - }, - { - "evaluation_name": "BBH", - 
"source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4613 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0574 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.281 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3923 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2644 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Changgil/K2S3-v0.1/ef9d2fab-07a2-44e2-aae2-ede5a2ff31d9.json b/data/hfopenllm_v2/Changgil/K2S3-v0.1/ef9d2fab-07a2-44e2-aae2-ede5a2ff31d9.json deleted file mode 100644 index efefe2dc0..000000000 --- a/data/hfopenllm_v2/Changgil/K2S3-v0.1/ef9d2fab-07a2-44e2-aae2-ede5a2ff31d9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Changgil_K2S3-v0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "K2S3-v0.1", - "id": "Changgil/K2S3-v0.1", - "developer": "Changgil", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 14.352 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3277 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 
0.4655 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0461 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2643 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4014 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2562 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ClaudioItaly/Albacus/a29a69d3-d64e-4463-aa52-0a9d6d012c98.json b/data/hfopenllm_v2/ClaudioItaly/Albacus/a29a69d3-d64e-4463-aa52-0a9d6d012c98.json deleted file mode 100644 index 8b6f83423..000000000 --- a/data/hfopenllm_v2/ClaudioItaly/Albacus/a29a69d3-d64e-4463-aa52-0a9d6d012c98.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ClaudioItaly_Albacus/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Albacus", - "id": "ClaudioItaly/Albacus", - "developer": "ClaudioItaly", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 8.987 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4667 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5113 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": 
false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.071 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2718 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4135 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3165 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ClaudioItaly/Book-Gut12B/4539c16e-1ac6-47f4-88eb-a09842497330.json b/data/hfopenllm_v2/ClaudioItaly/Book-Gut12B/4539c16e-1ac6-47f4-88eb-a09842497330.json deleted file mode 100644 index c0c13e7aa..000000000 --- a/data/hfopenllm_v2/ClaudioItaly/Book-Gut12B/4539c16e-1ac6-47f4-88eb-a09842497330.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ClaudioItaly_Book-Gut12B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Book-Gut12B", - "id": "ClaudioItaly/Book-Gut12B", - "developer": "ClaudioItaly", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3998 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5417 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.102 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.307 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4635 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.367 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ClaudioItaly/Evolutionstory-7B-v2.2/2ff33c55-1236-4c57-8809-2d3076e43cc7.json b/data/hfopenllm_v2/ClaudioItaly/Evolutionstory-7B-v2.2/2ff33c55-1236-4c57-8809-2d3076e43cc7.json deleted file mode 100644 index da6fa6915..000000000 --- a/data/hfopenllm_v2/ClaudioItaly/Evolutionstory-7B-v2.2/2ff33c55-1236-4c57-8809-2d3076e43cc7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ClaudioItaly_Evolutionstory-7B-v2.2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Evolutionstory-7B-v2.2", - "id": "ClaudioItaly/Evolutionstory-7B-v2.2", - "developer": "ClaudioItaly", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4814 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5108 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.071 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2752 - } 
- }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4135 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3159 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ClaudioItaly/intelligence-cod-rag-7b-v3/281ba822-49a2-4746-bc04-8de046439508.json b/data/hfopenllm_v2/ClaudioItaly/intelligence-cod-rag-7b-v3/281ba822-49a2-4746-bc04-8de046439508.json deleted file mode 100644 index 5635777d0..000000000 --- a/data/hfopenllm_v2/ClaudioItaly/intelligence-cod-rag-7b-v3/281ba822-49a2-4746-bc04-8de046439508.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ClaudioItaly_intelligence-cod-rag-7b-v3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "intelligence-cod-rag-7b-v3", - "id": "ClaudioItaly/intelligence-cod-rag-7b-v3", - "developer": "ClaudioItaly", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6898 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5366 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3807 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2727 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4153 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4195 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/CohereForAI/aya-23-35B/0606d916-95ea-4318-af0c-3942329071c6.json b/data/hfopenllm_v2/CohereForAI/aya-23-35B/0606d916-95ea-4318-af0c-3942329071c6.json deleted file mode 100644 index 3eb7bd011..000000000 --- a/data/hfopenllm_v2/CohereForAI/aya-23-35B/0606d916-95ea-4318-af0c-3942329071c6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/CohereForAI_aya-23-35B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "aya-23-35B", - "id": "CohereForAI/aya-23-35B", - "developer": "CohereForAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "CohereForCausalLM", - "params_billions": 34.981 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6462 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.54 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0347 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2945 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.431 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - 
"source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3356 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/CohereForAI/aya-23-8B/005159f0-da68-480d-972c-c160d145a682.json b/data/hfopenllm_v2/CohereForAI/aya-23-8B/005159f0-da68-480d-972c-c160d145a682.json deleted file mode 100644 index ff20417ff..000000000 --- a/data/hfopenllm_v2/CohereForAI/aya-23-8B/005159f0-da68-480d-972c-c160d145a682.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/CohereForAI_aya-23-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "aya-23-8B", - "id": "CohereForAI/aya-23-8B", - "developer": "CohereForAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "CohereForCausalLM", - "params_billions": 8.028 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4699 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4296 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0166 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2844 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3941 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2278 - } - } - ] 
-} \ No newline at end of file diff --git a/data/hfopenllm_v2/CohereForAI/aya-expanse-32b/2f6abb5d-52b3-44b0-b960-115793485fb1.json b/data/hfopenllm_v2/CohereForAI/aya-expanse-32b/2f6abb5d-52b3-44b0-b960-115793485fb1.json deleted file mode 100644 index 398bb4dca..000000000 --- a/data/hfopenllm_v2/CohereForAI/aya-expanse-32b/2f6abb5d-52b3-44b0-b960-115793485fb1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/CohereForAI_aya-expanse-32b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "aya-expanse-32b", - "id": "CohereForAI/aya-expanse-32b", - "developer": "CohereForAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "CohereForCausalLM", - "params_billions": 32.296 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7302 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5649 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1533 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3255 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3873 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.413 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/CohereForAI/aya-expanse-8b/6ffacad9-1a4d-472e-bbbf-0d64d068dd0d.json b/data/hfopenllm_v2/CohereForAI/aya-expanse-8b/6ffacad9-1a4d-472e-bbbf-0d64d068dd0d.json deleted file mode 100644 index 
724fe03bd..000000000 --- a/data/hfopenllm_v2/CohereForAI/aya-expanse-8b/6ffacad9-1a4d-472e-bbbf-0d64d068dd0d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/CohereForAI_aya-expanse-8b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "aya-expanse-8b", - "id": "CohereForAI/aya-expanse-8b", - "developer": "CohereForAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "CohereForCausalLM", - "params_billions": 8.028 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6359 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4977 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0861 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3029 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3729 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3004 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/CohereForAI/c4ai-command-r-plus-08-2024/26eadaf8-bfb8-4aad-a8a4-90699b6f0fcd.json b/data/hfopenllm_v2/CohereForAI/c4ai-command-r-plus-08-2024/26eadaf8-bfb8-4aad-a8a4-90699b6f0fcd.json deleted file mode 100644 index 294a861d5..000000000 --- a/data/hfopenllm_v2/CohereForAI/c4ai-command-r-plus-08-2024/26eadaf8-bfb8-4aad-a8a4-90699b6f0fcd.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/CohereForAI_c4ai-command-r-plus-08-2024/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "c4ai-command-r-plus-08-2024", - "id": "CohereForAI/c4ai-command-r-plus-08-2024", - "developer": "CohereForAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "CohereForCausalLM", - "params_billions": 103.811 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.754 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5996 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1239 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3507 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4829 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4421 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/CohereForAI/c4ai-command-r-plus/d4536913-5708-45e4-a024-45ae37fdae13.json b/data/hfopenllm_v2/CohereForAI/c4ai-command-r-plus/d4536913-5708-45e4-a024-45ae37fdae13.json deleted file mode 100644 index e568ee283..000000000 --- a/data/hfopenllm_v2/CohereForAI/c4ai-command-r-plus/d4536913-5708-45e4-a024-45ae37fdae13.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/CohereForAI_c4ai-command-r-plus/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - 
"source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "c4ai-command-r-plus", - "id": "CohereForAI/c4ai-command-r-plus", - "developer": "CohereForAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "CohereForCausalLM", - "params_billions": 103.811 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7664 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5815 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0801 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3054 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4807 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3992 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/CohereForAI/c4ai-command-r-v01/848860aa-7de3-4fae-afca-ac11224b96c5.json b/data/hfopenllm_v2/CohereForAI/c4ai-command-r-v01/848860aa-7de3-4fae-afca-ac11224b96c5.json deleted file mode 100644 index 5b2083324..000000000 --- a/data/hfopenllm_v2/CohereForAI/c4ai-command-r-v01/848860aa-7de3-4fae-afca-ac11224b96c5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/CohereForAI_c4ai-command-r-v01/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "c4ai-command-r-v01", - "id": "CohereForAI/c4ai-command-r-v01", - "developer": "CohereForAI", - "inference_platform": 
"unknown", - "additional_details": { - "precision": "float16", - "architecture": "CohereForCausalLM", - "params_billions": 34.981 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6748 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5406 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0347 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.307 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4517 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3369 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/CohereForAI/c4ai-command-r7b-12-2024/0241a8e3-d6e5-4ba5-afb9-862bde2ba851.json b/data/hfopenllm_v2/CohereForAI/c4ai-command-r7b-12-2024/0241a8e3-d6e5-4ba5-afb9-862bde2ba851.json deleted file mode 100644 index 0cc392d32..000000000 --- a/data/hfopenllm_v2/CohereForAI/c4ai-command-r7b-12-2024/0241a8e3-d6e5-4ba5-afb9-862bde2ba851.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/CohereForAI_c4ai-command-r7b-12-2024/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "c4ai-command-r7b-12-2024", - "id": "CohereForAI/c4ai-command-r7b-12-2024", - "developer": "CohereForAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Cohere2ForCausalLM", - "params_billions": 8.028 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - 
"source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7713 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5503 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2991 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3087 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4125 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3572 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Columbia-NLP/LION-Gemma-2b-dpo-v1.0/20b69120-d476-4e34-b3c6-8cef11d6ee78.json b/data/hfopenllm_v2/Columbia-NLP/LION-Gemma-2b-dpo-v1.0/20b69120-d476-4e34-b3c6-8cef11d6ee78.json deleted file mode 100644 index 047b3a3d1..000000000 --- a/data/hfopenllm_v2/Columbia-NLP/LION-Gemma-2b-dpo-v1.0/20b69120-d476-4e34-b3c6-8cef11d6ee78.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Columbia-NLP_LION-Gemma-2b-dpo-v1.0/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "LION-Gemma-2b-dpo-v1.0", - "id": "Columbia-NLP/LION-Gemma-2b-dpo-v1.0", - "developer": "Columbia-NLP", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "GemmaForCausalLM", - "params_billions": 2.506 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3102 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3881 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0536 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2534 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4081 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1665 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Columbia-NLP/LION-Gemma-2b-dpo-v1.0/696bbbfc-49dd-444e-a90b-76821845a726.json b/data/hfopenllm_v2/Columbia-NLP/LION-Gemma-2b-dpo-v1.0/696bbbfc-49dd-444e-a90b-76821845a726.json deleted file mode 100644 index 9deccf82b..000000000 --- a/data/hfopenllm_v2/Columbia-NLP/LION-Gemma-2b-dpo-v1.0/696bbbfc-49dd-444e-a90b-76821845a726.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Columbia-NLP_LION-Gemma-2b-dpo-v1.0/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "LION-Gemma-2b-dpo-v1.0", - "id": "Columbia-NLP/LION-Gemma-2b-dpo-v1.0", - "developer": "Columbia-NLP", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "GemmaForCausalLM", - "params_billions": 2.506 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3278 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": 
"hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.392 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0431 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2492 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.412 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1666 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Columbia-NLP/LION-Gemma-2b-odpo-v1.0/e6d974d3-467e-4fe7-bd84-79fc7c72cde2.json b/data/hfopenllm_v2/Columbia-NLP/LION-Gemma-2b-odpo-v1.0/e6d974d3-467e-4fe7-bd84-79fc7c72cde2.json deleted file mode 100644 index 7b19f887e..000000000 --- a/data/hfopenllm_v2/Columbia-NLP/LION-Gemma-2b-odpo-v1.0/e6d974d3-467e-4fe7-bd84-79fc7c72cde2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Columbia-NLP_LION-Gemma-2b-odpo-v1.0/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "LION-Gemma-2b-odpo-v1.0", - "id": "Columbia-NLP/LION-Gemma-2b-odpo-v1.0", - "developer": "Columbia-NLP", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "GemmaForCausalLM", - "params_billions": 2.506 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3066 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.3896 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0695 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2424 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4279 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1692 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Columbia-NLP/LION-Gemma-2b-sft-v1.0/b26ba2b7-1365-4b1c-a1be-35d588e02d36.json b/data/hfopenllm_v2/Columbia-NLP/LION-Gemma-2b-sft-v1.0/b26ba2b7-1365-4b1c-a1be-35d588e02d36.json deleted file mode 100644 index 2d900597c..000000000 --- a/data/hfopenllm_v2/Columbia-NLP/LION-Gemma-2b-sft-v1.0/b26ba2b7-1365-4b1c-a1be-35d588e02d36.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Columbia-NLP_LION-Gemma-2b-sft-v1.0/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "LION-Gemma-2b-sft-v1.0", - "id": "Columbia-NLP/LION-Gemma-2b-sft-v1.0", - "developer": "Columbia-NLP", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "GemmaForCausalLM", - "params_billions": 2.506 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3692 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3879 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": 
"DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.068 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2559 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4027 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1782 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Columbia-NLP/LION-LLaMA-3-8b-dpo-v1.0/64bd755d-ba4b-4559-ad8e-f56c697b1ae6.json b/data/hfopenllm_v2/Columbia-NLP/LION-LLaMA-3-8b-dpo-v1.0/64bd755d-ba4b-4559-ad8e-f56c697b1ae6.json deleted file mode 100644 index f22d9e63b..000000000 --- a/data/hfopenllm_v2/Columbia-NLP/LION-LLaMA-3-8b-dpo-v1.0/64bd755d-ba4b-4559-ad8e-f56c697b1ae6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Columbia-NLP_LION-LLaMA-3-8b-dpo-v1.0/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "LION-LLaMA-3-8b-dpo-v1.0", - "id": "Columbia-NLP/LION-LLaMA-3-8b-dpo-v1.0", - "developer": "Columbia-NLP", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4957 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5028 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.1171 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.281 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4097 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3219 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Columbia-NLP/LION-LLaMA-3-8b-odpo-v1.0/c4e572cb-1d12-4baf-a4d8-a55422692207.json b/data/hfopenllm_v2/Columbia-NLP/LION-LLaMA-3-8b-odpo-v1.0/c4e572cb-1d12-4baf-a4d8-a55422692207.json deleted file mode 100644 index 6e2795510..000000000 --- a/data/hfopenllm_v2/Columbia-NLP/LION-LLaMA-3-8b-odpo-v1.0/c4e572cb-1d12-4baf-a4d8-a55422692207.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Columbia-NLP_LION-LLaMA-3-8b-odpo-v1.0/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "LION-LLaMA-3-8b-odpo-v1.0", - "id": "Columbia-NLP/LION-LLaMA-3-8b-odpo-v1.0", - "developer": "Columbia-NLP", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3968 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5024 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1065 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": 
"Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2852 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4057 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3152 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Columbia-NLP/LION-LLaMA-3-8b-sft-v1.0/c6123e10-b1f9-49dc-888b-083881e6ef09.json b/data/hfopenllm_v2/Columbia-NLP/LION-LLaMA-3-8b-sft-v1.0/c6123e10-b1f9-49dc-888b-083881e6ef09.json deleted file mode 100644 index 1377d56ef..000000000 --- a/data/hfopenllm_v2/Columbia-NLP/LION-LLaMA-3-8b-sft-v1.0/c6123e10-b1f9-49dc-888b-083881e6ef09.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Columbia-NLP_LION-LLaMA-3-8b-sft-v1.0/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "LION-LLaMA-3-8b-sft-v1.0", - "id": "Columbia-NLP/LION-LLaMA-3-8b-sft-v1.0", - "developer": "Columbia-NLP", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3817 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5088 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.114 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.2777 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4503 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3237 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/CombinHorizon/Josiefied-abliteratedV4-Qwen2.5-14B-Inst-BaseMerge-TIES/e1647f10-fec5-463d-b8e5-6b2b880bd687.json b/data/hfopenllm_v2/CombinHorizon/Josiefied-abliteratedV4-Qwen2.5-14B-Inst-BaseMerge-TIES/e1647f10-fec5-463d-b8e5-6b2b880bd687.json deleted file mode 100644 index 739ac6a43..000000000 --- a/data/hfopenllm_v2/CombinHorizon/Josiefied-abliteratedV4-Qwen2.5-14B-Inst-BaseMerge-TIES/e1647f10-fec5-463d-b8e5-6b2b880bd687.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/CombinHorizon_Josiefied-abliteratedV4-Qwen2.5-14B-Inst-BaseMerge-TIES/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Josiefied-abliteratedV4-Qwen2.5-14B-Inst-BaseMerge-TIES", - "id": "CombinHorizon/Josiefied-abliteratedV4-Qwen2.5-14B-Inst-BaseMerge-TIES", - "developer": "CombinHorizon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.824 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.637 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5317 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.3247 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.426 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4979 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/CombinHorizon/Rombos-Qwen2.5-7B-Inst-BaseMerge-TIES/6d5fa235-8d69-456e-9f23-0f702760baf4.json b/data/hfopenllm_v2/CombinHorizon/Rombos-Qwen2.5-7B-Inst-BaseMerge-TIES/6d5fa235-8d69-456e-9f23-0f702760baf4.json deleted file mode 100644 index e1668a135..000000000 --- a/data/hfopenllm_v2/CombinHorizon/Rombos-Qwen2.5-7B-Inst-BaseMerge-TIES/6d5fa235-8d69-456e-9f23-0f702760baf4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/CombinHorizon_Rombos-Qwen2.5-7B-Inst-BaseMerge-TIES/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Rombos-Qwen2.5-7B-Inst-BaseMerge-TIES", - "id": "CombinHorizon/Rombos-Qwen2.5-7B-Inst-BaseMerge-TIES", - "developer": "CombinHorizon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7564 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5402 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4932 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2978 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": 
"MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4033 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4342 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/CombinHorizon/YiSM-blossom5.1-34B-SLERP/e8709a6a-a2b8-4b09-9342-d1aeae89de1f.json b/data/hfopenllm_v2/CombinHorizon/YiSM-blossom5.1-34B-SLERP/e8709a6a-a2b8-4b09-9342-d1aeae89de1f.json deleted file mode 100644 index 0da86caa5..000000000 --- a/data/hfopenllm_v2/CombinHorizon/YiSM-blossom5.1-34B-SLERP/e8709a6a-a2b8-4b09-9342-d1aeae89de1f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/CombinHorizon_YiSM-blossom5.1-34B-SLERP/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "YiSM-blossom5.1-34B-SLERP", - "id": "CombinHorizon/YiSM-blossom5.1-34B-SLERP", - "developer": "CombinHorizon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 34.389 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5033 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6208 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2153 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3557 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4413 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4741 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/CombinHorizon/huihui-ai-abliterated-Qwen2.5-32B-Inst-BaseMerge-TIES/603e95c9-7e7f-4892-93f7-92f92b256865.json b/data/hfopenllm_v2/CombinHorizon/huihui-ai-abliterated-Qwen2.5-32B-Inst-BaseMerge-TIES/603e95c9-7e7f-4892-93f7-92f92b256865.json deleted file mode 100644 index f4ca1a794..000000000 --- a/data/hfopenllm_v2/CombinHorizon/huihui-ai-abliterated-Qwen2.5-32B-Inst-BaseMerge-TIES/603e95c9-7e7f-4892-93f7-92f92b256865.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/CombinHorizon_huihui-ai-abliterated-Qwen2.5-32B-Inst-BaseMerge-TIES/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "huihui-ai-abliterated-Qwen2.5-32B-Inst-BaseMerge-TIES", - "id": "CombinHorizon/huihui-ai-abliterated-Qwen2.5-32B-Inst-BaseMerge-TIES", - "developer": "CombinHorizon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 32.764 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8206 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6929 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5944 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3389 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4207 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5721 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/CombinHorizon/huihui-ai-abliteratedV2-Qwen2.5-14B-Inst-BaseMerge-TIES/3e2fd38a-186e-49aa-915c-7eb3cde50562.json b/data/hfopenllm_v2/CombinHorizon/huihui-ai-abliteratedV2-Qwen2.5-14B-Inst-BaseMerge-TIES/3e2fd38a-186e-49aa-915c-7eb3cde50562.json deleted file mode 100644 index 859925816..000000000 --- a/data/hfopenllm_v2/CombinHorizon/huihui-ai-abliteratedV2-Qwen2.5-14B-Inst-BaseMerge-TIES/3e2fd38a-186e-49aa-915c-7eb3cde50562.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/CombinHorizon_huihui-ai-abliteratedV2-Qwen2.5-14B-Inst-BaseMerge-TIES/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "huihui-ai-abliteratedV2-Qwen2.5-14B-Inst-BaseMerge-TIES", - "id": "CombinHorizon/huihui-ai-abliteratedV2-Qwen2.5-14B-Inst-BaseMerge-TIES", - "developer": "CombinHorizon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8176 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6336 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5476 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3146 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.426 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.491 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/CombinHorizon/zetasepic-abliteratedV2-Qwen2.5-32B-Inst-BaseMerge-TIES/16d55e66-9015-4d72-81e4-3f14c42b0368.json b/data/hfopenllm_v2/CombinHorizon/zetasepic-abliteratedV2-Qwen2.5-32B-Inst-BaseMerge-TIES/16d55e66-9015-4d72-81e4-3f14c42b0368.json deleted file mode 100644 index f7f8dee1c..000000000 --- a/data/hfopenllm_v2/CombinHorizon/zetasepic-abliteratedV2-Qwen2.5-32B-Inst-BaseMerge-TIES/16d55e66-9015-4d72-81e4-3f14c42b0368.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/CombinHorizon_zetasepic-abliteratedV2-Qwen2.5-32B-Inst-BaseMerge-TIES/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "zetasepic-abliteratedV2-Qwen2.5-32B-Inst-BaseMerge-TIES", - "id": "CombinHorizon/zetasepic-abliteratedV2-Qwen2.5-32B-Inst-BaseMerge-TIES", - "developer": "CombinHorizon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 32.764 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8328 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6955 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5853 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3674 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4314 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5685 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ContactDoctor/Bio-Medical-3B-CoT-012025/696644b9-bd40-4047-bb85-0cb19510a96c.json b/data/hfopenllm_v2/ContactDoctor/Bio-Medical-3B-CoT-012025/696644b9-bd40-4047-bb85-0cb19510a96c.json deleted file mode 100644 index ea7e83334..000000000 --- a/data/hfopenllm_v2/ContactDoctor/Bio-Medical-3B-CoT-012025/696644b9-bd40-4047-bb85-0cb19510a96c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ContactDoctor_Bio-Medical-3B-CoT-012025/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Bio-Medical-3B-CoT-012025", - "id": "ContactDoctor/Bio-Medical-3B-CoT-012025", - "developer": "ContactDoctor", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.085 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3604 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4383 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2213 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3045 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3368 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - 
"source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2934 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ContactDoctor/Bio-Medical-Llama-3-8B/cbae8c39-0aec-4859-98bc-3b2d065833ad.json b/data/hfopenllm_v2/ContactDoctor/Bio-Medical-Llama-3-8B/cbae8c39-0aec-4859-98bc-3b2d065833ad.json deleted file mode 100644 index 0ff3b9020..000000000 --- a/data/hfopenllm_v2/ContactDoctor/Bio-Medical-Llama-3-8B/cbae8c39-0aec-4859-98bc-3b2d065833ad.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ContactDoctor_Bio-Medical-Llama-3-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Bio-Medical-Llama-3-8B", - "id": "ContactDoctor/Bio-Medical-Llama-3-8B", - "developer": "ContactDoctor", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 4.015 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4422 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4863 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0672 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3339 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3514 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3648 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/CoolSpring/Qwen2-0.5B-Abyme-merge2/15fb3cc7-1ba5-4ba5-ba02-8e8a9d2029d0.json b/data/hfopenllm_v2/CoolSpring/Qwen2-0.5B-Abyme-merge2/15fb3cc7-1ba5-4ba5-ba02-8e8a9d2029d0.json deleted file mode 100644 index 23736286b..000000000 --- a/data/hfopenllm_v2/CoolSpring/Qwen2-0.5B-Abyme-merge2/15fb3cc7-1ba5-4ba5-ba02-8e8a9d2029d0.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/CoolSpring_Qwen2-0.5B-Abyme-merge2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2-0.5B-Abyme-merge2", - "id": "CoolSpring/Qwen2-0.5B-Abyme-merge2", - "developer": "CoolSpring", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2022 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2994 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0332 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2601 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3687 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1489 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/CoolSpring/Qwen2-0.5B-Abyme-merge3/357f6051-b880-48bb-8e68-e4b0a7a0cbcc.json b/data/hfopenllm_v2/CoolSpring/Qwen2-0.5B-Abyme-merge3/357f6051-b880-48bb-8e68-e4b0a7a0cbcc.json deleted file mode 100644 index a155e6c18..000000000 --- a/data/hfopenllm_v2/CoolSpring/Qwen2-0.5B-Abyme-merge3/357f6051-b880-48bb-8e68-e4b0a7a0cbcc.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/CoolSpring_Qwen2-0.5B-Abyme-merge3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2-0.5B-Abyme-merge3", - "id": "CoolSpring/Qwen2-0.5B-Abyme-merge3", - "developer": "CoolSpring", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2386 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3003 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0317 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2643 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3501 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.15 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/CoolSpring/Qwen2-0.5B-Abyme/a50a542b-668e-47b1-a37e-805a58eea3d1.json b/data/hfopenllm_v2/CoolSpring/Qwen2-0.5B-Abyme/a50a542b-668e-47b1-a37e-805a58eea3d1.json deleted file mode 100644 index 
43ce6bc2b..000000000 --- a/data/hfopenllm_v2/CoolSpring/Qwen2-0.5B-Abyme/a50a542b-668e-47b1-a37e-805a58eea3d1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/CoolSpring_Qwen2-0.5B-Abyme/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2-0.5B-Abyme", - "id": "CoolSpring/Qwen2-0.5B-Abyme", - "developer": "CoolSpring", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.494 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1915 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2862 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0295 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2534 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3542 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1333 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Corianas/Neural-Mistral-7B/00f7bd51-0b31-446d-be8c-1e0dc0d82e54.json b/data/hfopenllm_v2/Corianas/Neural-Mistral-7B/00f7bd51-0b31-446d-be8c-1e0dc0d82e54.json deleted file mode 100644 index 002cf8d4c..000000000 --- a/data/hfopenllm_v2/Corianas/Neural-Mistral-7B/00f7bd51-0b31-446d-be8c-1e0dc0d82e54.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Corianas_Neural-Mistral-7B/1770682486.623709", - 
"retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Neural-Mistral-7B", - "id": "Corianas/Neural-Mistral-7B", - "developer": "Corianas", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5489 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4428 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0189 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2836 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3873 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2738 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Corianas/Quokka_2.7b/26782941-b918-44c5-a7f6-5f770e47c3d6.json b/data/hfopenllm_v2/Corianas/Quokka_2.7b/26782941-b918-44c5-a7f6-5f770e47c3d6.json deleted file mode 100644 index 04534ef20..000000000 --- a/data/hfopenllm_v2/Corianas/Quokka_2.7b/26782941-b918-44c5-a7f6-5f770e47c3d6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Corianas_Quokka_2.7b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Quokka_2.7b", - "id": 
"Corianas/Quokka_2.7b", - "developer": "Corianas", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "GPT2LMHeadModel", - "params_billions": 2.786 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1749 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3055 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0083 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2559 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3908 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1145 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Corianas/llama-3-reactor/5547ddaf-8fbb-4259-8b88-e946fc3d2404.json b/data/hfopenllm_v2/Corianas/llama-3-reactor/5547ddaf-8fbb-4259-8b88-e946fc3d2404.json deleted file mode 100644 index 0f76b0c3e..000000000 --- a/data/hfopenllm_v2/Corianas/llama-3-reactor/5547ddaf-8fbb-4259-8b88-e946fc3d2404.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Corianas_llama-3-reactor/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "llama-3-reactor", - "id": "Corianas/llama-3-reactor", - "developer": "Corianas", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": -1.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - 
"source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.23 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4457 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0468 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2978 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3977 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2801 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/CortexLM/btlm-7b-base-v0.2/bee5ea59-b97a-4783-b763-b6bd432d4558.json b/data/hfopenllm_v2/CortexLM/btlm-7b-base-v0.2/bee5ea59-b97a-4783-b763-b6bd432d4558.json deleted file mode 100644 index 9f5b604cb..000000000 --- a/data/hfopenllm_v2/CortexLM/btlm-7b-base-v0.2/bee5ea59-b97a-4783-b763-b6bd432d4558.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/CortexLM_btlm-7b-base-v0.2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "btlm-7b-base-v0.2", - "id": "CortexLM/btlm-7b-base-v0.2", - "developer": "CortexLM", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 6.885 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.1483 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4006 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0151 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2534 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3846 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.235 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Cran-May/SCE-2-24B/8150333f-8e79-4230-af8b-7ddb1d5eeb21.json b/data/hfopenllm_v2/Cran-May/SCE-2-24B/8150333f-8e79-4230-af8b-7ddb1d5eeb21.json deleted file mode 100644 index d8282b048..000000000 --- a/data/hfopenllm_v2/Cran-May/SCE-2-24B/8150333f-8e79-4230-af8b-7ddb1d5eeb21.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Cran-May_SCE-2-24B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SCE-2-24B", - "id": "Cran-May/SCE-2-24B", - "developer": "Cran-May", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 23.572 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5866 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6265 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1896 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3372 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4528 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4612 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Cran-May/SCE-3-24B/be8510a9-ecd4-4ac7-9930-3200cacb7b50.json b/data/hfopenllm_v2/Cran-May/SCE-3-24B/be8510a9-ecd4-4ac7-9930-3200cacb7b50.json deleted file mode 100644 index 17424927b..000000000 --- a/data/hfopenllm_v2/Cran-May/SCE-3-24B/be8510a9-ecd4-4ac7-9930-3200cacb7b50.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Cran-May_SCE-3-24B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SCE-3-24B", - "id": "Cran-May/SCE-3-24B", - "developer": "Cran-May", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 23.572 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5465 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5973 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - 
"metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1881 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3465 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4435 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4647 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Cran-May/T.E-8.1/887e4574-f876-4e75-afb8-e543bcb30020.json b/data/hfopenllm_v2/Cran-May/T.E-8.1/887e4574-f876-4e75-afb8-e543bcb30020.json deleted file mode 100644 index 18db9bb98..000000000 --- a/data/hfopenllm_v2/Cran-May/T.E-8.1/887e4574-f876-4e75-afb8-e543bcb30020.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Cran-May_T.E-8.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "T.E-8.1", - "id": "Cran-May/T.E-8.1", - "developer": "Cran-May", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7077 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5582 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4456 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": 
"hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3129 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4505 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4432 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Cran-May/merge_model_20250308_2/fd21d8bd-28cf-4b91-8075-c38a61f5f32a.json b/data/hfopenllm_v2/Cran-May/merge_model_20250308_2/fd21d8bd-28cf-4b91-8075-c38a61f5f32a.json deleted file mode 100644 index e21b806dc..000000000 --- a/data/hfopenllm_v2/Cran-May/merge_model_20250308_2/fd21d8bd-28cf-4b91-8075-c38a61f5f32a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Cran-May_merge_model_20250308_2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "merge_model_20250308_2", - "id": "Cran-May/merge_model_20250308_2", - "developer": "Cran-May", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5932 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6585 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4381 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.3909 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4794 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.542 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Cran-May/merge_model_20250308_3/c0f05e38-6592-478a-9c46-26567f24ff85.json b/data/hfopenllm_v2/Cran-May/merge_model_20250308_3/c0f05e38-6592-478a-9c46-26567f24ff85.json deleted file mode 100644 index 2aca4048b..000000000 --- a/data/hfopenllm_v2/Cran-May/merge_model_20250308_3/c0f05e38-6592-478a-9c46-26567f24ff85.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Cran-May_merge_model_20250308_3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "merge_model_20250308_3", - "id": "Cran-May/merge_model_20250308_3", - "developer": "Cran-May", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6018 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6271 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2545 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3221 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.432 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4962 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Cran-May/merge_model_20250308_4/06cc2913-8e05-44bf-a128-9a7c4aeff536.json b/data/hfopenllm_v2/Cran-May/merge_model_20250308_4/06cc2913-8e05-44bf-a128-9a7c4aeff536.json deleted file mode 100644 index 9d92eeeaa..000000000 --- a/data/hfopenllm_v2/Cran-May/merge_model_20250308_4/06cc2913-8e05-44bf-a128-9a7c4aeff536.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Cran-May_merge_model_20250308_4/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "merge_model_20250308_4", - "id": "Cran-May/merge_model_20250308_4", - "developer": "Cran-May", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.454 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6664 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4199 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3977 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4688 - } - }, - { - "evaluation_name": "MMLU-PRO", - 
"source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5367 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Cran-May/tempmotacilla-cinerea-0308/86368d5b-0509-4b52-b988-58bcf7e1043e.json b/data/hfopenllm_v2/Cran-May/tempmotacilla-cinerea-0308/86368d5b-0509-4b52-b988-58bcf7e1043e.json deleted file mode 100644 index 4adc26fd6..000000000 --- a/data/hfopenllm_v2/Cran-May/tempmotacilla-cinerea-0308/86368d5b-0509-4b52-b988-58bcf7e1043e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Cran-May_tempmotacilla-cinerea-0308/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "tempmotacilla-cinerea-0308", - "id": "Cran-May/tempmotacilla-cinerea-0308", - "developer": "Cran-May", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8085 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6551 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5551 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3624 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4208 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": 
false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.525 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/CreitinGameplays/Llama-3.1-8B-R1-v0.1/77b89fe6-464b-4017-a77f-8750e2668a82.json b/data/hfopenllm_v2/CreitinGameplays/Llama-3.1-8B-R1-v0.1/77b89fe6-464b-4017-a77f-8750e2668a82.json deleted file mode 100644 index 213246109..000000000 --- a/data/hfopenllm_v2/CreitinGameplays/Llama-3.1-8B-R1-v0.1/77b89fe6-464b-4017-a77f-8750e2668a82.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/CreitinGameplays_Llama-3.1-8B-R1-v0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.1-8B-R1-v0.1", - "id": "CreitinGameplays/Llama-3.1-8B-R1-v0.1", - "developer": "CreitinGameplays", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3235 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3057 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1813 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2584 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3622 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1252 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/CultriX/Qwen2.5-14B-Broca/d2e47d86-23dd-4c95-a7fb-99518615d09f.json b/data/hfopenllm_v2/CultriX/Qwen2.5-14B-Broca/d2e47d86-23dd-4c95-a7fb-99518615d09f.json deleted file mode 100644 index b78a75058..000000000 --- a/data/hfopenllm_v2/CultriX/Qwen2.5-14B-Broca/d2e47d86-23dd-4c95-a7fb-99518615d09f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/CultriX_Qwen2.5-14B-Broca/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-14B-Broca", - "id": "CultriX/Qwen2.5-14B-Broca", - "developer": "CultriX", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5604 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6527 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.358 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3867 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4767 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5364 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/CultriX/Qwen2.5-14B-BrocaV9/0a09891e-ac97-4c3a-8364-7106a851f1a8.json b/data/hfopenllm_v2/CultriX/Qwen2.5-14B-BrocaV9/0a09891e-ac97-4c3a-8364-7106a851f1a8.json deleted file mode 100644 index 31c50cf81..000000000 --- 
a/data/hfopenllm_v2/CultriX/Qwen2.5-14B-BrocaV9/0a09891e-ac97-4c3a-8364-7106a851f1a8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/CultriX_Qwen2.5-14B-BrocaV9/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-14B-BrocaV9", - "id": "CultriX/Qwen2.5-14B-BrocaV9", - "developer": "CultriX", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6763 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6391 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3814 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3641 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.469 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5331 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/CultriX/Qwen2.5-14B-Brocav3/eb41fe62-ac46-4630-bb2d-6b907f271737.json b/data/hfopenllm_v2/CultriX/Qwen2.5-14B-Brocav3/eb41fe62-ac46-4630-bb2d-6b907f271737.json deleted file mode 100644 index e98b2f9be..000000000 --- a/data/hfopenllm_v2/CultriX/Qwen2.5-14B-Brocav3/eb41fe62-ac46-4630-bb2d-6b907f271737.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/CultriX_Qwen2.5-14B-Brocav3/1770682486.623709", - "retrieved_timestamp": 
"1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-14B-Brocav3", - "id": "CultriX/Qwen2.5-14B-Brocav3", - "developer": "CultriX", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6952 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6452 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3875 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3591 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4756 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5317 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/CultriX/Qwen2.5-14B-Brocav6/d540a6c8-e9ec-4413-b9d2-dee68533c377.json b/data/hfopenllm_v2/CultriX/Qwen2.5-14B-Brocav6/d540a6c8-e9ec-4413-b9d2-dee68533c377.json deleted file mode 100644 index 9a923a074..000000000 --- a/data/hfopenllm_v2/CultriX/Qwen2.5-14B-Brocav6/d540a6c8-e9ec-4413-b9d2-dee68533c377.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/CultriX_Qwen2.5-14B-Brocav6/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-14B-Brocav6", - 
"id": "CultriX/Qwen2.5-14B-Brocav6", - "developer": "CultriX", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6995 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6389 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3875 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3674 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4742 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5319 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/CultriX/Qwen2.5-14B-Brocav7/5b1f413a-05c4-43be-bdbc-9de5728e8d0a.json b/data/hfopenllm_v2/CultriX/Qwen2.5-14B-Brocav7/5b1f413a-05c4-43be-bdbc-9de5728e8d0a.json deleted file mode 100644 index 3819754c9..000000000 --- a/data/hfopenllm_v2/CultriX/Qwen2.5-14B-Brocav7/5b1f413a-05c4-43be-bdbc-9de5728e8d0a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/CultriX_Qwen2.5-14B-Brocav7/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-14B-Brocav7", - "id": "CultriX/Qwen2.5-14B-Brocav7", - "developer": "CultriX", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - 
"evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6724 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6444 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3844 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3674 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4796 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5258 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/CultriX/Qwen2.5-14B-Emerged/6701738c-27e4-4bbd-b614-fbc297c3164f.json b/data/hfopenllm_v2/CultriX/Qwen2.5-14B-Emerged/6701738c-27e4-4bbd-b614-fbc297c3164f.json deleted file mode 100644 index 84b4c7784..000000000 --- a/data/hfopenllm_v2/CultriX/Qwen2.5-14B-Emerged/6701738c-27e4-4bbd-b614-fbc297c3164f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/CultriX_Qwen2.5-14B-Emerged/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-14B-Emerged", - "id": "CultriX/Qwen2.5-14B-Emerged", - "developer": "CultriX", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.626 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3248 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3574 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4691 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5186 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/CultriX/Qwen2.5-14B-Emergedv3/7f4563b4-0b25-49e7-ac1c-afaa28b0eda2.json b/data/hfopenllm_v2/CultriX/Qwen2.5-14B-Emergedv3/7f4563b4-0b25-49e7-ac1c-afaa28b0eda2.json deleted file mode 100644 index 0ba7935ad..000000000 --- a/data/hfopenllm_v2/CultriX/Qwen2.5-14B-Emergedv3/7f4563b4-0b25-49e7-ac1c-afaa28b0eda2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/CultriX_Qwen2.5-14B-Emergedv3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-14B-Emergedv3", - "id": "CultriX/Qwen2.5-14B-Emergedv3", - "developer": "CultriX", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6388 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6191 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4358 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3607 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4728 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5174 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/CultriX/Qwen2.5-14B-FinalMerge/32b6e4af-69ba-49b7-9367-dfafe3e390e8.json b/data/hfopenllm_v2/CultriX/Qwen2.5-14B-FinalMerge/32b6e4af-69ba-49b7-9367-dfafe3e390e8.json deleted file mode 100644 index 481bd01ec..000000000 --- a/data/hfopenllm_v2/CultriX/Qwen2.5-14B-FinalMerge/32b6e4af-69ba-49b7-9367-dfafe3e390e8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/CultriX_Qwen2.5-14B-FinalMerge/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-14B-FinalMerge", - "id": "CultriX/Qwen2.5-14B-FinalMerge", - "developer": "CultriX", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4891 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5715 - } - }, - { - "evaluation_name": 
"MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3814 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3549 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4379 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4574 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/CultriX/Qwen2.5-14B-Hyper/e16deaf7-da55-40ba-ac18-860fa3f14d34.json b/data/hfopenllm_v2/CultriX/Qwen2.5-14B-Hyper/e16deaf7-da55-40ba-ac18-860fa3f14d34.json deleted file mode 100644 index e702043d7..000000000 --- a/data/hfopenllm_v2/CultriX/Qwen2.5-14B-Hyper/e16deaf7-da55-40ba-ac18-860fa3f14d34.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/CultriX_Qwen2.5-14B-Hyper/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-14B-Hyper", - "id": "CultriX/Qwen2.5-14B-Hyper", - "developer": "CultriX", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5391 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6507 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3437 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3918 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4898 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5374 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/CultriX/Qwen2.5-14B-HyperMarck-dl/8a7a5886-0618-4615-9cdf-46f5d19a29fe.json b/data/hfopenllm_v2/CultriX/Qwen2.5-14B-HyperMarck-dl/8a7a5886-0618-4615-9cdf-46f5d19a29fe.json deleted file mode 100644 index 33af3ceb9..000000000 --- a/data/hfopenllm_v2/CultriX/Qwen2.5-14B-HyperMarck-dl/8a7a5886-0618-4615-9cdf-46f5d19a29fe.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/CultriX_Qwen2.5-14B-HyperMarck-dl/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-14B-HyperMarck-dl", - "id": "CultriX/Qwen2.5-14B-HyperMarck-dl", - "developer": "CultriX", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.665 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6096 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5287 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": 
"hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3674 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4416 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5091 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/CultriX/Qwen2.5-14B-Hyperionv3/66d18e5b-9ebc-4ab6-94fb-6d5c23c58672.json b/data/hfopenllm_v2/CultriX/Qwen2.5-14B-Hyperionv3/66d18e5b-9ebc-4ab6-94fb-6d5c23c58672.json deleted file mode 100644 index ad906fa69..000000000 --- a/data/hfopenllm_v2/CultriX/Qwen2.5-14B-Hyperionv3/66d18e5b-9ebc-4ab6-94fb-6d5c23c58672.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/CultriX_Qwen2.5-14B-Hyperionv3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-14B-Hyperionv3", - "id": "CultriX/Qwen2.5-14B-Hyperionv3", - "developer": "CultriX", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6836 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6522 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3701 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": 
{ - "score": 0.3708 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.473 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.534 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/CultriX/Qwen2.5-14B-Hyperionv4/a36aaaf6-2478-4b98-ad0c-2b06ddb8c308.json b/data/hfopenllm_v2/CultriX/Qwen2.5-14B-Hyperionv4/a36aaaf6-2478-4b98-ad0c-2b06ddb8c308.json deleted file mode 100644 index 2baa36524..000000000 --- a/data/hfopenllm_v2/CultriX/Qwen2.5-14B-Hyperionv4/a36aaaf6-2478-4b98-ad0c-2b06ddb8c308.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/CultriX_Qwen2.5-14B-Hyperionv4/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-14B-Hyperionv4", - "id": "CultriX/Qwen2.5-14B-Hyperionv4", - "developer": "CultriX", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5416 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6472 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3474 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3977 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4832 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5364 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/CultriX/Qwen2.5-14B-Hyperionv5/4a6237a7-019c-4310-971e-84b08d1b5067.json b/data/hfopenllm_v2/CultriX/Qwen2.5-14B-Hyperionv5/4a6237a7-019c-4310-971e-84b08d1b5067.json deleted file mode 100644 index bdfbba563..000000000 --- a/data/hfopenllm_v2/CultriX/Qwen2.5-14B-Hyperionv5/4a6237a7-019c-4310-971e-84b08d1b5067.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/CultriX_Qwen2.5-14B-Hyperionv5/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-14B-Hyperionv5", - "id": "CultriX/Qwen2.5-14B-Hyperionv5", - "developer": "CultriX", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6729 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6443 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3822 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3716 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4795 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - 
"source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5302 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/CultriX/Qwen2.5-14B-MegaMerge-pt2/996e781e-5939-41ac-b347-95c99037c34a.json b/data/hfopenllm_v2/CultriX/Qwen2.5-14B-MegaMerge-pt2/996e781e-5939-41ac-b347-95c99037c34a.json deleted file mode 100644 index 0815bb180..000000000 --- a/data/hfopenllm_v2/CultriX/Qwen2.5-14B-MegaMerge-pt2/996e781e-5939-41ac-b347-95c99037c34a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/CultriX_Qwen2.5-14B-MegaMerge-pt2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-14B-MegaMerge-pt2", - "id": "CultriX/Qwen2.5-14B-MegaMerge-pt2", - "developer": "CultriX", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5683 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6578 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3995 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3792 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4729 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.5421 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/CultriX/Qwen2.5-14B-MergeStock/e880fa0e-ae49-4398-91bd-eadf8695425f.json b/data/hfopenllm_v2/CultriX/Qwen2.5-14B-MergeStock/e880fa0e-ae49-4398-91bd-eadf8695425f.json deleted file mode 100644 index 64f818a07..000000000 --- a/data/hfopenllm_v2/CultriX/Qwen2.5-14B-MergeStock/e880fa0e-ae49-4398-91bd-eadf8695425f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/CultriX_Qwen2.5-14B-MergeStock/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-14B-MergeStock", - "id": "CultriX/Qwen2.5-14B-MergeStock", - "developer": "CultriX", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5685 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6579 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4147 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3733 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4676 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5396 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/CultriX/Qwen2.5-14B-ReasoningMerge/da04ff51-fbeb-41a8-ae5e-8ddf5925b792.json 
b/data/hfopenllm_v2/CultriX/Qwen2.5-14B-ReasoningMerge/da04ff51-fbeb-41a8-ae5e-8ddf5925b792.json deleted file mode 100644 index a2f6c4850..000000000 --- a/data/hfopenllm_v2/CultriX/Qwen2.5-14B-ReasoningMerge/da04ff51-fbeb-41a8-ae5e-8ddf5925b792.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/CultriX_Qwen2.5-14B-ReasoningMerge/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-14B-ReasoningMerge", - "id": "CultriX/Qwen2.5-14B-ReasoningMerge", - "developer": "CultriX", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4605 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6578 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5204 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4077 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5166 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5345 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/CultriX/Qwen2.5-14B-Ultimav2/6d709396-1ae1-4e5c-a03c-13c1e9425202.json b/data/hfopenllm_v2/CultriX/Qwen2.5-14B-Ultimav2/6d709396-1ae1-4e5c-a03c-13c1e9425202.json deleted file mode 100644 index 0693a1c39..000000000 --- 
a/data/hfopenllm_v2/CultriX/Qwen2.5-14B-Ultimav2/6d709396-1ae1-4e5c-a03c-13c1e9425202.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/CultriX_Qwen2.5-14B-Ultimav2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-14B-Ultimav2", - "id": "CultriX/Qwen2.5-14B-Ultimav2", - "developer": "CultriX", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.55 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6555 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3844 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3851 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4966 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5417 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/CultriX/Qwen2.5-14B-Unity/5b616df9-e15a-4f84-98b4-c2cb532c1b95.json b/data/hfopenllm_v2/CultriX/Qwen2.5-14B-Unity/5b616df9-e15a-4f84-98b4-c2cb532c1b95.json deleted file mode 100644 index 5dd3c6ef1..000000000 --- a/data/hfopenllm_v2/CultriX/Qwen2.5-14B-Unity/5b616df9-e15a-4f84-98b4-c2cb532c1b95.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/CultriX_Qwen2.5-14B-Unity/1770682486.623709", - "retrieved_timestamp": 
"1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-14B-Unity", - "id": "CultriX/Qwen2.5-14B-Unity", - "developer": "CultriX", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6739 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.602 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4313 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3473 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4679 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5076 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/CultriX/Qwen2.5-14B-Wernicke-SFT/0f6552d9-3cbe-447e-909b-068e5ceed4c9.json b/data/hfopenllm_v2/CultriX/Qwen2.5-14B-Wernicke-SFT/0f6552d9-3cbe-447e-909b-068e5ceed4c9.json deleted file mode 100644 index 05fdd29f4..000000000 --- a/data/hfopenllm_v2/CultriX/Qwen2.5-14B-Wernicke-SFT/0f6552d9-3cbe-447e-909b-068e5ceed4c9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/CultriX_Qwen2.5-14B-Wernicke-SFT/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": 
"Qwen2.5-14B-Wernicke-SFT", - "id": "CultriX/Qwen2.5-14B-Wernicke-SFT", - "developer": "CultriX", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4937 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6461 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3595 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.354 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.39 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.507 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/CultriX/Qwen2.5-14B-Wernicke-SLERP/2861aae0-d2ec-48f5-bd20-9e7bcaf8dabd.json b/data/hfopenllm_v2/CultriX/Qwen2.5-14B-Wernicke-SLERP/2861aae0-d2ec-48f5-bd20-9e7bcaf8dabd.json deleted file mode 100644 index 19347fafa..000000000 --- a/data/hfopenllm_v2/CultriX/Qwen2.5-14B-Wernicke-SLERP/2861aae0-d2ec-48f5-bd20-9e7bcaf8dabd.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/CultriX_Qwen2.5-14B-Wernicke-SLERP/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-14B-Wernicke-SLERP", - "id": "CultriX/Qwen2.5-14B-Wernicke-SLERP", - "developer": "CultriX", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": 
"Qwen2ForCausalLM", - "params_billions": 14.491 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5589 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6441 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4486 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.344 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.414 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5094 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/CultriX/Qwen2.5-14B-Wernicke/51a64f37-256c-4fe7-b28c-6117520f04ec.json b/data/hfopenllm_v2/CultriX/Qwen2.5-14B-Wernicke/51a64f37-256c-4fe7-b28c-6117520f04ec.json deleted file mode 100644 index 62b9abb48..000000000 --- a/data/hfopenllm_v2/CultriX/Qwen2.5-14B-Wernicke/51a64f37-256c-4fe7-b28c-6117520f04ec.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/CultriX_Qwen2.5-14B-Wernicke/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-14B-Wernicke", - "id": "CultriX/Qwen2.5-14B-Wernicke", - "developer": "CultriX", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5235 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6568 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3814 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3935 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4689 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5424 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/CultriX/Qwen2.5-14B-Wernickev3/03ce9c1d-38e8-4a6c-b293-57428a9d7c0e.json b/data/hfopenllm_v2/CultriX/Qwen2.5-14B-Wernickev3/03ce9c1d-38e8-4a6c-b293-57428a9d7c0e.json deleted file mode 100644 index ece044f89..000000000 --- a/data/hfopenllm_v2/CultriX/Qwen2.5-14B-Wernickev3/03ce9c1d-38e8-4a6c-b293-57428a9d7c0e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/CultriX_Qwen2.5-14B-Wernickev3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-14B-Wernickev3", - "id": "CultriX/Qwen2.5-14B-Wernickev3", - "developer": "CultriX", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7048 - } - }, - { - "evaluation_name": "BBH", - 
"source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6184 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3542 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3624 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4717 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5151 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/CultriX/Qwen2.5-14B-partialmergept1/3b0f5dea-db9b-4657-9807-6b3e56d38823.json b/data/hfopenllm_v2/CultriX/Qwen2.5-14B-partialmergept1/3b0f5dea-db9b-4657-9807-6b3e56d38823.json deleted file mode 100644 index a8c01a5f1..000000000 --- a/data/hfopenllm_v2/CultriX/Qwen2.5-14B-partialmergept1/3b0f5dea-db9b-4657-9807-6b3e56d38823.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/CultriX_Qwen2.5-14B-partialmergept1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-14B-partialmergept1", - "id": "CultriX/Qwen2.5-14B-partialmergept1", - "developer": "CultriX", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6337 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6151 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4539 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3616 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4757 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5208 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/CultriX/Qwenfinity-2.5-14B/2d19e9ff-e331-4171-ae90-47e44f3f8885.json b/data/hfopenllm_v2/CultriX/Qwenfinity-2.5-14B/2d19e9ff-e331-4171-ae90-47e44f3f8885.json deleted file mode 100644 index 4b70d0648..000000000 --- a/data/hfopenllm_v2/CultriX/Qwenfinity-2.5-14B/2d19e9ff-e331-4171-ae90-47e44f3f8885.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/CultriX_Qwenfinity-2.5-14B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwenfinity-2.5-14B", - "id": "CultriX/Qwenfinity-2.5-14B", - "developer": "CultriX", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4814 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5655 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": 
"DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4101 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.349 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4506 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4498 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/CultriX/Qwestion-14B/6bfb8b24-1abd-405b-b01d-7d7111705dbb.json b/data/hfopenllm_v2/CultriX/Qwestion-14B/6bfb8b24-1abd-405b-b01d-7d7111705dbb.json deleted file mode 100644 index 27242621d..000000000 --- a/data/hfopenllm_v2/CultriX/Qwestion-14B/6bfb8b24-1abd-405b-b01d-7d7111705dbb.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/CultriX_Qwestion-14B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwestion-14B", - "id": "CultriX/Qwestion-14B", - "developer": "CultriX", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6318 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.645 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3724 - } - }, - { - "evaluation_name": 
"GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3683 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4636 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5422 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/CultriX/SeQwence-14B-EvolMerge/c83e6b6c-c8be-4d97-9c65-2d883f88f37f.json b/data/hfopenllm_v2/CultriX/SeQwence-14B-EvolMerge/c83e6b6c-c8be-4d97-9c65-2d883f88f37f.json deleted file mode 100644 index 2ff8194ea..000000000 --- a/data/hfopenllm_v2/CultriX/SeQwence-14B-EvolMerge/c83e6b6c-c8be-4d97-9c65-2d883f88f37f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/CultriX_SeQwence-14B-EvolMerge/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SeQwence-14B-EvolMerge", - "id": "CultriX/SeQwence-14B-EvolMerge", - "developer": "CultriX", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5382 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6572 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3671 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3809 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4821 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5419 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/CultriX/SeQwence-14B-EvolMergev1/72569796-1b11-48cc-ada7-e8c09522dd54.json b/data/hfopenllm_v2/CultriX/SeQwence-14B-EvolMergev1/72569796-1b11-48cc-ada7-e8c09522dd54.json deleted file mode 100644 index eb95ee2a0..000000000 --- a/data/hfopenllm_v2/CultriX/SeQwence-14B-EvolMergev1/72569796-1b11-48cc-ada7-e8c09522dd54.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/CultriX_SeQwence-14B-EvolMergev1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SeQwence-14B-EvolMergev1", - "id": "CultriX/SeQwence-14B-EvolMergev1", - "developer": "CultriX", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5555 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6546 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4215 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3767 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": 
"TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4623 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5393 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/CultriX/SeQwence-14B-v5/58403e30-bd2b-4f4c-ad41-daa890c77d40.json b/data/hfopenllm_v2/CultriX/SeQwence-14B-v5/58403e30-bd2b-4f4c-ad41-daa890c77d40.json deleted file mode 100644 index 3cbe2fbd3..000000000 --- a/data/hfopenllm_v2/CultriX/SeQwence-14B-v5/58403e30-bd2b-4f4c-ad41-daa890c77d40.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/CultriX_SeQwence-14B-v5/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SeQwence-14B-v5", - "id": "CultriX/SeQwence-14B-v5", - "developer": "CultriX", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.592 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6517 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3308 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.37 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4714 - } - }, - { - "evaluation_name": "MMLU-PRO", - 
"source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5415 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/CultriX/SeQwence-14B/eb8e1f1d-c6b3-407c-b172-d240553d2f89.json b/data/hfopenllm_v2/CultriX/SeQwence-14B/eb8e1f1d-c6b3-407c-b172-d240553d2f89.json deleted file mode 100644 index c20b9bd39..000000000 --- a/data/hfopenllm_v2/CultriX/SeQwence-14B/eb8e1f1d-c6b3-407c-b172-d240553d2f89.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/CultriX_SeQwence-14B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SeQwence-14B", - "id": "CultriX/SeQwence-14B", - "developer": "CultriX", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5352 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6506 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3535 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3607 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4666 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.5419 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/CultriX/SeQwence-14Bv1/356d75a0-6520-46c1-afa9-7dbb2596a5c1.json b/data/hfopenllm_v2/CultriX/SeQwence-14Bv1/356d75a0-6520-46c1-afa9-7dbb2596a5c1.json deleted file mode 100644 index 4bc3ce623..000000000 --- a/data/hfopenllm_v2/CultriX/SeQwence-14Bv1/356d75a0-6520-46c1-afa9-7dbb2596a5c1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/CultriX_SeQwence-14Bv1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SeQwence-14Bv1", - "id": "CultriX/SeQwence-14Bv1", - "developer": "CultriX", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6678 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6345 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.361 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3616 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4704 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.532 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/CultriX/SeQwence-14Bv2/78681e0c-5fe2-4920-af7b-99345cea3efe.json b/data/hfopenllm_v2/CultriX/SeQwence-14Bv2/78681e0c-5fe2-4920-af7b-99345cea3efe.json deleted file mode 100644 
index a0e28a509..000000000 --- a/data/hfopenllm_v2/CultriX/SeQwence-14Bv2/78681e0c-5fe2-4920-af7b-99345cea3efe.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/CultriX_SeQwence-14Bv2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SeQwence-14Bv2", - "id": "CultriX/SeQwence-14Bv2", - "developer": "CultriX", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5786 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6305 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4758 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3607 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4601 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5334 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/CultriX/SeQwence-14Bv3/ba0ee5b4-070a-461d-a3d2-cd4036387cc9.json b/data/hfopenllm_v2/CultriX/SeQwence-14Bv3/ba0ee5b4-070a-461d-a3d2-cd4036387cc9.json deleted file mode 100644 index ff855fb3e..000000000 --- a/data/hfopenllm_v2/CultriX/SeQwence-14Bv3/ba0ee5b4-070a-461d-a3d2-cd4036387cc9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/CultriX_SeQwence-14Bv3/1770682486.623709", - "retrieved_timestamp": 
"1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SeQwence-14Bv3", - "id": "CultriX/SeQwence-14Bv3", - "developer": "CultriX", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5719 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6302 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4766 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3649 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4624 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5335 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DRXD1000/Atlas-7B/17d0d377-bca4-411c-be11-6c5cfce07798.json b/data/hfopenllm_v2/DRXD1000/Atlas-7B/17d0d377-bca4-411c-be11-6c5cfce07798.json deleted file mode 100644 index 3ae174b80..000000000 --- a/data/hfopenllm_v2/DRXD1000/Atlas-7B/17d0d377-bca4-411c-be11-6c5cfce07798.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DRXD1000_Atlas-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Atlas-7B", - "id": "DRXD1000/Atlas-7B", - "developer": "DRXD1000", - 
"inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 7.768 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3704 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3302 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0189 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2576 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3342 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1401 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DRXD1000/Phoenix-7B/d01a56a1-1eb9-4ccf-8c09-348b6ba5480b.json b/data/hfopenllm_v2/DRXD1000/Phoenix-7B/d01a56a1-1eb9-4ccf-8c09-348b6ba5480b.json deleted file mode 100644 index 9ea88c590..000000000 --- a/data/hfopenllm_v2/DRXD1000/Phoenix-7B/d01a56a1-1eb9-4ccf-8c09-348b6ba5480b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DRXD1000_Phoenix-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Phoenix-7B", - "id": "DRXD1000/Phoenix-7B", - "developer": "DRXD1000", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - 
"hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.321 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3932 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0166 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2785 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3849 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2343 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DUAL-GPO/zephyr-7b-ipo-0k-15k-i1/389821ff-d8e2-4d1d-8fb2-57a689867ac5.json b/data/hfopenllm_v2/DUAL-GPO/zephyr-7b-ipo-0k-15k-i1/389821ff-d8e2-4d1d-8fb2-57a689867ac5.json deleted file mode 100644 index 1cd2505fb..000000000 --- a/data/hfopenllm_v2/DUAL-GPO/zephyr-7b-ipo-0k-15k-i1/389821ff-d8e2-4d1d-8fb2-57a689867ac5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DUAL-GPO_zephyr-7b-ipo-0k-15k-i1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "zephyr-7b-ipo-0k-15k-i1", - "id": "DUAL-GPO/zephyr-7b-ipo-0k-15k-i1", - "developer": "DUAL-GPO", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "?", - "params_billions": 14.483 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2756 
- } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4473 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0302 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2911 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4173 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.313 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DZgas/GIGABATEMAN-7B/7913f782-29b0-48bd-bc62-37da9a5ac7d9.json b/data/hfopenllm_v2/DZgas/GIGABATEMAN-7B/7913f782-29b0-48bd-bc62-37da9a5ac7d9.json deleted file mode 100644 index 150aa45c9..000000000 --- a/data/hfopenllm_v2/DZgas/GIGABATEMAN-7B/7913f782-29b0-48bd-bc62-37da9a5ac7d9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DZgas_GIGABATEMAN-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "GIGABATEMAN-7B", - "id": "DZgas/GIGABATEMAN-7B", - "developer": "DZgas", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4607 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.5032 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0551 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2894 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4328 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3177 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Daemontatox/AetherDrake-SFT/b0930974-999e-4372-9d21-b9790e0bad4c.json b/data/hfopenllm_v2/Daemontatox/AetherDrake-SFT/b0930974-999e-4372-9d21-b9790e0bad4c.json deleted file mode 100644 index e4aedebfc..000000000 --- a/data/hfopenllm_v2/Daemontatox/AetherDrake-SFT/b0930974-999e-4372-9d21-b9790e0bad4c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Daemontatox_AetherDrake-SFT/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "AetherDrake-SFT", - "id": "Daemontatox/AetherDrake-SFT", - "developer": "Daemontatox", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4813 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4872 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - 
"metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1511 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3205 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4088 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3499 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Daemontatox/AetherSett/8265f577-f504-4a56-9cf0-42c34766559a.json b/data/hfopenllm_v2/Daemontatox/AetherSett/8265f577-f504-4a56-9cf0-42c34766559a.json deleted file mode 100644 index 66d790f66..000000000 --- a/data/hfopenllm_v2/Daemontatox/AetherSett/8265f577-f504-4a56-9cf0-42c34766559a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Daemontatox_AetherSett/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "AetherSett", - "id": "Daemontatox/AetherSett", - "developer": "Daemontatox", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.537 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5452 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3973 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - 
"dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3079 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4603 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4279 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Daemontatox/AetherTOT/82044cd2-1a46-406e-bc68-397ce41b29ea.json b/data/hfopenllm_v2/Daemontatox/AetherTOT/82044cd2-1a46-406e-bc68-397ce41b29ea.json deleted file mode 100644 index 859888bd9..000000000 --- a/data/hfopenllm_v2/Daemontatox/AetherTOT/82044cd2-1a46-406e-bc68-397ce41b29ea.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Daemontatox_AetherTOT/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "AetherTOT", - "id": "Daemontatox/AetherTOT", - "developer": "Daemontatox", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MllamaForConditionalGeneration", - "params_billions": 10.67 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4383 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5034 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1443 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": 
{ - "score": 0.3238 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4052 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3778 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Daemontatox/AetherTOT/de09e323-8cf1-4aa9-9537-e8ad30a8c297.json b/data/hfopenllm_v2/Daemontatox/AetherTOT/de09e323-8cf1-4aa9-9537-e8ad30a8c297.json deleted file mode 100644 index e4d9830af..000000000 --- a/data/hfopenllm_v2/Daemontatox/AetherTOT/de09e323-8cf1-4aa9-9537-e8ad30a8c297.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Daemontatox_AetherTOT/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "AetherTOT", - "id": "Daemontatox/AetherTOT", - "developer": "Daemontatox", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MllamaForConditionalGeneration", - "params_billions": 10.67 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4398 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5066 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1488 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3238 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4079 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3804 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Daemontatox/AetherUncensored/bfe543b4-ec38-488e-ae04-125cd358b61f.json b/data/hfopenllm_v2/Daemontatox/AetherUncensored/bfe543b4-ec38-488e-ae04-125cd358b61f.json deleted file mode 100644 index a8cb84de3..000000000 --- a/data/hfopenllm_v2/Daemontatox/AetherUncensored/bfe543b4-ec38-488e-ae04-125cd358b61f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Daemontatox_AetherUncensored/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "AetherUncensored", - "id": "Daemontatox/AetherUncensored", - "developer": "Daemontatox", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4042 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4463 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.145 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2886 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3747 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": 
"TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.271 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Daemontatox/Cogito-MIS/be36d8ae-b81c-4b4e-aa2f-5999c7582237.json b/data/hfopenllm_v2/Daemontatox/Cogito-MIS/be36d8ae-b81c-4b4e-aa2f-5999c7582237.json deleted file mode 100644 index 8a0c7af29..000000000 --- a/data/hfopenllm_v2/Daemontatox/Cogito-MIS/be36d8ae-b81c-4b4e-aa2f-5999c7582237.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Daemontatox_Cogito-MIS/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Cogito-MIS", - "id": "Daemontatox/Cogito-MIS", - "developer": "Daemontatox", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 23.572 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1815 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.506 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0861 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2567 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3768 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1435 - } - } - ] -} \ No newline at end of file diff 
--git a/data/hfopenllm_v2/Daemontatox/CogitoDistil/342b435f-89e9-48ad-ab0f-2c1f52f4571a.json b/data/hfopenllm_v2/Daemontatox/CogitoDistil/342b435f-89e9-48ad-ab0f-2c1f52f4571a.json deleted file mode 100644 index c37151cbd..000000000 --- a/data/hfopenllm_v2/Daemontatox/CogitoDistil/342b435f-89e9-48ad-ab0f-2c1f52f4571a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Daemontatox_CogitoDistil/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "CogitoDistil", - "id": "Daemontatox/CogitoDistil", - "developer": "Daemontatox", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2776 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3677 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3927 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2592 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3755 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2625 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Daemontatox/CogitoZ/b0c8737d-d838-4da1-909b-b218e22119dc.json b/data/hfopenllm_v2/Daemontatox/CogitoZ/b0c8737d-d838-4da1-909b-b218e22119dc.json deleted file mode 100644 index f78c0c39c..000000000 --- 
a/data/hfopenllm_v2/Daemontatox/CogitoZ/b0c8737d-d838-4da1-909b-b218e22119dc.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Daemontatox_CogitoZ/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "CogitoZ", - "id": "Daemontatox/CogitoZ", - "developer": "Daemontatox", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 32.764 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3967 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6734 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5242 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3951 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4793 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5593 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Daemontatox/CogitoZ14/4cd40f28-842f-44d5-9eb2-86238077fc55.json b/data/hfopenllm_v2/Daemontatox/CogitoZ14/4cd40f28-842f-44d5-9eb2-86238077fc55.json deleted file mode 100644 index e16a58952..000000000 --- a/data/hfopenllm_v2/Daemontatox/CogitoZ14/4cd40f28-842f-44d5-9eb2-86238077fc55.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Daemontatox_CogitoZ14/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - 
"source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "CogitoZ14", - "id": "Daemontatox/CogitoZ14", - "developer": "Daemontatox", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6637 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6298 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4222 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3163 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4059 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3999 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Daemontatox/DocumentCogito/0758051c-2d75-402e-af0e-769096cbb17c.json b/data/hfopenllm_v2/Daemontatox/DocumentCogito/0758051c-2d75-402e-af0e-769096cbb17c.json deleted file mode 100644 index b59d83df4..000000000 --- a/data/hfopenllm_v2/Daemontatox/DocumentCogito/0758051c-2d75-402e-af0e-769096cbb17c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Daemontatox_DocumentCogito/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "DocumentCogito", - "id": "Daemontatox/DocumentCogito", - "developer": "Daemontatox", - 
"inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MllamaForConditionalGeneration", - "params_billions": 10.67 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.777 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5187 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2198 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2936 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3911 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3738 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Daemontatox/DocumentCogito/c93f610b-fb97-4ad1-b8af-fc41c6d8da33.json b/data/hfopenllm_v2/Daemontatox/DocumentCogito/c93f610b-fb97-4ad1-b8af-fc41c6d8da33.json deleted file mode 100644 index 4744e7468..000000000 --- a/data/hfopenllm_v2/Daemontatox/DocumentCogito/c93f610b-fb97-4ad1-b8af-fc41c6d8da33.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Daemontatox_DocumentCogito/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "DocumentCogito", - "id": "Daemontatox/DocumentCogito", - "developer": "Daemontatox", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MllamaForConditionalGeneration", - "params_billions": 10.67 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { 
- "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5064 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5112 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1631 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3163 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3973 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3802 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Daemontatox/Llama3.3-70B-CogniLink/b8467118-d895-41fa-81c7-89892e1844d5.json b/data/hfopenllm_v2/Daemontatox/Llama3.3-70B-CogniLink/b8467118-d895-41fa-81c7-89892e1844d5.json deleted file mode 100644 index 7fa8b968c..000000000 --- a/data/hfopenllm_v2/Daemontatox/Llama3.3-70B-CogniLink/b8467118-d895-41fa-81c7-89892e1844d5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Daemontatox_Llama3.3-70B-CogniLink/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama3.3-70B-CogniLink", - "id": "Daemontatox/Llama3.3-70B-CogniLink", - "developer": "Daemontatox", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 70.554 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6931 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6668 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4139 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4455 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4877 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5173 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Daemontatox/Llama_cot/30d867bb-63c6-48d1-8d43-6c24f4cf44ba.json b/data/hfopenllm_v2/Daemontatox/Llama_cot/30d867bb-63c6-48d1-8d43-6c24f4cf44ba.json deleted file mode 100644 index e654aab90..000000000 --- a/data/hfopenllm_v2/Daemontatox/Llama_cot/30d867bb-63c6-48d1-8d43-6c24f4cf44ba.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Daemontatox_Llama_cot/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama_cot", - "id": "Daemontatox/Llama_cot", - "developer": "Daemontatox", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MllamaForConditionalGeneration", - "params_billions": 10.67 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7549 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4838 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2024 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2911 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3872 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3518 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Daemontatox/MawaredT1/89b92cda-c5b6-45ed-a534-361c9d34794a.json b/data/hfopenllm_v2/Daemontatox/MawaredT1/89b92cda-c5b6-45ed-a534-361c9d34794a.json deleted file mode 100644 index 9ac085f5f..000000000 --- a/data/hfopenllm_v2/Daemontatox/MawaredT1/89b92cda-c5b6-45ed-a534-361c9d34794a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Daemontatox_MawaredT1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MawaredT1", - "id": "Daemontatox/MawaredT1", - "developer": "Daemontatox", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4199 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5215 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - 
"source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3021 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3347 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4702 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4718 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Daemontatox/Mini_QwQ/48cdf76a-886d-41ec-8580-00ed4232b601.json b/data/hfopenllm_v2/Daemontatox/Mini_QwQ/48cdf76a-886d-41ec-8580-00ed4232b601.json deleted file mode 100644 index e1b1e912d..000000000 --- a/data/hfopenllm_v2/Daemontatox/Mini_QwQ/48cdf76a-886d-41ec-8580-00ed4232b601.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Daemontatox_Mini_QwQ/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mini_QwQ", - "id": "Daemontatox/Mini_QwQ", - "developer": "Daemontatox", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4497 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5549 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 
0.4192 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3037 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4682 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4373 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Daemontatox/NemoR/116272d4-d25d-49cb-80cb-ff26a0fb3cf4.json b/data/hfopenllm_v2/Daemontatox/NemoR/116272d4-d25d-49cb-80cb-ff26a0fb3cf4.json deleted file mode 100644 index 450d53bc4..000000000 --- a/data/hfopenllm_v2/Daemontatox/NemoR/116272d4-d25d-49cb-80cb-ff26a0fb3cf4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Daemontatox_NemoR/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "NemoR", - "id": "Daemontatox/NemoR", - "developer": "Daemontatox", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 6.124 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2287 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5194 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0831 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.3272 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3908 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.329 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Daemontatox/PathFinderAI2.0/bb103828-70fe-4767-9302-6750d839129e.json b/data/hfopenllm_v2/Daemontatox/PathFinderAI2.0/bb103828-70fe-4767-9302-6750d839129e.json deleted file mode 100644 index 32487f192..000000000 --- a/data/hfopenllm_v2/Daemontatox/PathFinderAI2.0/bb103828-70fe-4767-9302-6750d839129e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Daemontatox_PathFinderAI2.0/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "PathFinderAI2.0", - "id": "Daemontatox/PathFinderAI2.0", - "developer": "Daemontatox", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 32.764 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4541 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6658 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5076 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.302 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4216 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5547 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Daemontatox/PathFinderAi3.0/7b58ab54-239b-4e49-93f1-c3940df61474.json b/data/hfopenllm_v2/Daemontatox/PathFinderAi3.0/7b58ab54-239b-4e49-93f1-c3940df61474.json deleted file mode 100644 index 1fd11dd1d..000000000 --- a/data/hfopenllm_v2/Daemontatox/PathFinderAi3.0/7b58ab54-239b-4e49-93f1-c3940df61474.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Daemontatox_PathFinderAi3.0/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "PathFinderAi3.0", - "id": "Daemontatox/PathFinderAi3.0", - "developer": "Daemontatox", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 32.764 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4271 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6884 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5045 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4086 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4807 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - 
"dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5757 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Daemontatox/PathfinderAI/559067a2-816c-4091-893e-b1c7860171ec.json b/data/hfopenllm_v2/Daemontatox/PathfinderAI/559067a2-816c-4091-893e-b1c7860171ec.json deleted file mode 100644 index a2410c22f..000000000 --- a/data/hfopenllm_v2/Daemontatox/PathfinderAI/559067a2-816c-4091-893e-b1c7860171ec.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Daemontatox_PathfinderAI/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "PathfinderAI", - "id": "Daemontatox/PathfinderAI", - "developer": "Daemontatox", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 32.764 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4855 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6627 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4841 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3096 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4256 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.5542 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Daemontatox/PathfinderAI/ec502619-880b-4b7c-acfe-c43cf6514e3f.json b/data/hfopenllm_v2/Daemontatox/PathfinderAI/ec502619-880b-4b7c-acfe-c43cf6514e3f.json deleted file mode 100644 index 02616b562..000000000 --- a/data/hfopenllm_v2/Daemontatox/PathfinderAI/ec502619-880b-4b7c-acfe-c43cf6514e3f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Daemontatox_PathfinderAI/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "PathfinderAI", - "id": "Daemontatox/PathfinderAI", - "developer": "Daemontatox", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 32.764 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3745 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6668 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4758 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3943 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4858 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5593 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Daemontatox/Phi-4-COT/6941a5dd-2a70-4846-a5f6-b16ef2d56a03.json b/data/hfopenllm_v2/Daemontatox/Phi-4-COT/6941a5dd-2a70-4846-a5f6-b16ef2d56a03.json deleted file mode 
100644 index d40331320..000000000 --- a/data/hfopenllm_v2/Daemontatox/Phi-4-COT/6941a5dd-2a70-4846-a5f6-b16ef2d56a03.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Daemontatox_Phi-4-COT/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Phi-4-COT", - "id": "Daemontatox/Phi-4-COT", - "developer": "Daemontatox", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1793 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6173 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2243 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3356 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.453 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5005 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Daemontatox/PixelParse_AI/636e2f93-3242-491c-9df5-003aa1dacecf.json b/data/hfopenllm_v2/Daemontatox/PixelParse_AI/636e2f93-3242-491c-9df5-003aa1dacecf.json deleted file mode 100644 index c6518eaf3..000000000 --- a/data/hfopenllm_v2/Daemontatox/PixelParse_AI/636e2f93-3242-491c-9df5-003aa1dacecf.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Daemontatox_PixelParse_AI/1770682486.623709", - 
"retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "PixelParse_AI", - "id": "Daemontatox/PixelParse_AI", - "developer": "Daemontatox", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MllamaForConditionalGeneration", - "params_billions": 10.67 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4383 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5034 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1473 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3238 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4052 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3778 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Daemontatox/RA2.0/1f4efa23-816d-49be-8659-feb003f4b3ef.json b/data/hfopenllm_v2/Daemontatox/RA2.0/1f4efa23-816d-49be-8659-feb003f4b3ef.json deleted file mode 100644 index cdbe9004f..000000000 --- a/data/hfopenllm_v2/Daemontatox/RA2.0/1f4efa23-816d-49be-8659-feb003f4b3ef.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Daemontatox_RA2.0/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "RA2.0", - "id": 
"Daemontatox/RA2.0", - "developer": "Daemontatox", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3784 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4889 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3837 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3054 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4091 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2616 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Daemontatox/RA_Reasoner/d05be1e4-bcac-4b4a-bbde-8b17a5a71243.json b/data/hfopenllm_v2/Daemontatox/RA_Reasoner/d05be1e4-bcac-4b4a-bbde-8b17a5a71243.json deleted file mode 100644 index 01700c90a..000000000 --- a/data/hfopenllm_v2/Daemontatox/RA_Reasoner/d05be1e4-bcac-4b4a-bbde-8b17a5a71243.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Daemontatox_RA_Reasoner/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "RA_Reasoner", - "id": "Daemontatox/RA_Reasoner", - "developer": "Daemontatox", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 10.306 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - 
"source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5592 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6054 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2122 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3314 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3964 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.43 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Daemontatox/RA_Reasoner2.0/9ab53055-86f5-4a88-976f-015dd9c9e832.json b/data/hfopenllm_v2/Daemontatox/RA_Reasoner2.0/9ab53055-86f5-4a88-976f-015dd9c9e832.json deleted file mode 100644 index beab11efc..000000000 --- a/data/hfopenllm_v2/Daemontatox/RA_Reasoner2.0/9ab53055-86f5-4a88-976f-015dd9c9e832.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Daemontatox_RA_Reasoner2.0/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "RA_Reasoner2.0", - "id": "Daemontatox/RA_Reasoner2.0", - "developer": "Daemontatox", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 10.306 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.5366 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6062 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2311 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3247 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3884 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4353 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Daemontatox/ReasonTest/ba34083a-9b13-46d9-8f36-aa3ddd586711.json b/data/hfopenllm_v2/Daemontatox/ReasonTest/ba34083a-9b13-46d9-8f36-aa3ddd586711.json deleted file mode 100644 index 60c753137..000000000 --- a/data/hfopenllm_v2/Daemontatox/ReasonTest/ba34083a-9b13-46d9-8f36-aa3ddd586711.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Daemontatox_ReasonTest/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ReasonTest", - "id": "Daemontatox/ReasonTest", - "developer": "Daemontatox", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.808 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.408 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5435 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2137 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3188 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4315 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4272 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Daemontatox/Research_PathfinderAI/6a39d734-ad73-4c4a-9583-3563e336d4b3.json b/data/hfopenllm_v2/Daemontatox/Research_PathfinderAI/6a39d734-ad73-4c4a-9583-3563e336d4b3.json deleted file mode 100644 index fbb52784d..000000000 --- a/data/hfopenllm_v2/Daemontatox/Research_PathfinderAI/6a39d734-ad73-4c4a-9583-3563e336d4b3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Daemontatox_Research_PathfinderAI/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Research_PathfinderAI", - "id": "Daemontatox/Research_PathfinderAI", - "developer": "Daemontatox", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.777 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3457 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2872 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - 
"dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1699 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2408 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3394 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.113 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Daemontatox/SphinX/2af71e88-4931-4359-b92a-c64fa33df802.json b/data/hfopenllm_v2/Daemontatox/SphinX/2af71e88-4931-4359-b92a-c64fa33df802.json deleted file mode 100644 index 4195d2bc9..000000000 --- a/data/hfopenllm_v2/Daemontatox/SphinX/2af71e88-4931-4359-b92a-c64fa33df802.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Daemontatox_SphinX/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SphinX", - "id": "Daemontatox/SphinX", - "developer": "Daemontatox", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5725 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5441 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.3082 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2978 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4405 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4366 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Daemontatox/Sphinx2.0/bf9336a7-a7c4-420a-9dd0-68d8e0c815c4.json b/data/hfopenllm_v2/Daemontatox/Sphinx2.0/bf9336a7-a7c4-420a-9dd0-68d8e0c815c4.json deleted file mode 100644 index ab374a609..000000000 --- a/data/hfopenllm_v2/Daemontatox/Sphinx2.0/bf9336a7-a7c4-420a-9dd0-68d8e0c815c4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Daemontatox_Sphinx2.0/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Sphinx2.0", - "id": "Daemontatox/Sphinx2.0", - "developer": "Daemontatox", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7123 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6473 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4018 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2936 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.426 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5184 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Daemontatox/TinySphinx/2de872b2-10c7-44dd-91c3-f20205207da6.json b/data/hfopenllm_v2/Daemontatox/TinySphinx/2de872b2-10c7-44dd-91c3-f20205207da6.json deleted file mode 100644 index 8e4831568..000000000 --- a/data/hfopenllm_v2/Daemontatox/TinySphinx/2de872b2-10c7-44dd-91c3-f20205207da6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Daemontatox_TinySphinx/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "TinySphinx", - "id": "Daemontatox/TinySphinx", - "developer": "Daemontatox", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.247 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2567 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.331 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0431 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2735 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3328 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1698 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Daemontatox/TinySphinx2.0/5cabed09-d8ea-46c2-bb78-012dac954d6b.json b/data/hfopenllm_v2/Daemontatox/TinySphinx2.0/5cabed09-d8ea-46c2-bb78-012dac954d6b.json deleted file mode 100644 index 7a83a9405..000000000 --- a/data/hfopenllm_v2/Daemontatox/TinySphinx2.0/5cabed09-d8ea-46c2-bb78-012dac954d6b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Daemontatox_TinySphinx2.0/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "TinySphinx2.0", - "id": "Daemontatox/TinySphinx2.0", - "developer": "Daemontatox", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.247 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2535 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3168 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0325 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2685 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3382 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": 
"MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1731 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Daemontatox/Zirel-7B-Math/8236db6a-ff8a-4237-af5a-03bb258f8e59.json b/data/hfopenllm_v2/Daemontatox/Zirel-7B-Math/8236db6a-ff8a-4237-af5a-03bb258f8e59.json deleted file mode 100644 index 00c47b014..000000000 --- a/data/hfopenllm_v2/Daemontatox/Zirel-7B-Math/8236db6a-ff8a-4237-af5a-03bb258f8e59.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Daemontatox_Zirel-7B-Math/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Zirel-7B-Math", - "id": "Daemontatox/Zirel-7B-Math", - "developer": "Daemontatox", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6639 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5448 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1979 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3263 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4789 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.4237 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Daemontatox/Zirel_1.5/1a7b078e-bc1f-400f-a0cd-f7b535548f23.json b/data/hfopenllm_v2/Daemontatox/Zirel_1.5/1a7b078e-bc1f-400f-a0cd-f7b535548f23.json deleted file mode 100644 index 160e661a4..000000000 --- a/data/hfopenllm_v2/Daemontatox/Zirel_1.5/1a7b078e-bc1f-400f-a0cd-f7b535548f23.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Daemontatox_Zirel_1.5/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Zirel_1.5", - "id": "Daemontatox/Zirel_1.5", - "developer": "Daemontatox", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.544 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4168 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3985 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1133 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2601 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3658 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2143 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Daemontatox/mini-Cogito-R1/fdaf561c-567c-416d-a74a-ac3c07c5be5b.json b/data/hfopenllm_v2/Daemontatox/mini-Cogito-R1/fdaf561c-567c-416d-a74a-ac3c07c5be5b.json deleted file mode 100644 
index a9a68a873..000000000 --- a/data/hfopenllm_v2/Daemontatox/mini-Cogito-R1/fdaf561c-567c-416d-a74a-ac3c07c5be5b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Daemontatox_mini-Cogito-R1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "mini-Cogito-R1", - "id": "Daemontatox/mini-Cogito-R1", - "developer": "Daemontatox", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.777 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2298 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.328 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2749 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2869 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3447 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1482 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Daemontatox/mini_Pathfinder/58900b3b-303b-49c8-b807-7b8d06601568.json b/data/hfopenllm_v2/Daemontatox/mini_Pathfinder/58900b3b-303b-49c8-b807-7b8d06601568.json deleted file mode 100644 index 08611e986..000000000 --- a/data/hfopenllm_v2/Daemontatox/mini_Pathfinder/58900b3b-303b-49c8-b807-7b8d06601568.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/Daemontatox_mini_Pathfinder/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "mini_Pathfinder", - "id": "Daemontatox/mini_Pathfinder", - "developer": "Daemontatox", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2962 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3956 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4751 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2584 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3781 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2809 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Dampfinchen/Llama-3.1-8B-Ultra-Instruct/7ac5a45a-7b41-4f63-8556-8737638a00ea.json b/data/hfopenllm_v2/Dampfinchen/Llama-3.1-8B-Ultra-Instruct/7ac5a45a-7b41-4f63-8556-8737638a00ea.json deleted file mode 100644 index d6ac101ed..000000000 --- a/data/hfopenllm_v2/Dampfinchen/Llama-3.1-8B-Ultra-Instruct/7ac5a45a-7b41-4f63-8556-8737638a00ea.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Dampfinchen_Llama-3.1-8B-Ultra-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - 
"source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.1-8B-Ultra-Instruct", - "id": "Dampfinchen/Llama-3.1-8B-Ultra-Instruct", - "developer": "Dampfinchen", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8081 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5258 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2205 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2919 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4003 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3826 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Danielbrdz/Barcenas-10b/3cb55475-30c8-43c8-8d7d-394450fdc117.json b/data/hfopenllm_v2/Danielbrdz/Barcenas-10b/3cb55475-30c8-43c8-8d7d-394450fdc117.json deleted file mode 100644 index 104efe008..000000000 --- a/data/hfopenllm_v2/Danielbrdz/Barcenas-10b/3cb55475-30c8-43c8-8d7d-394450fdc117.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Danielbrdz_Barcenas-10b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Barcenas-10b", - "id": "Danielbrdz/Barcenas-10b", - "developer": "Danielbrdz", - "inference_platform": "unknown", - 
"additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 10.306 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6608 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6121 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2153 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3414 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4135 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4361 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Danielbrdz/Barcenas-14b-Phi-3-medium-ORPO/f5e140ff-0c0e-4769-8116-63cf50255773.json b/data/hfopenllm_v2/Danielbrdz/Barcenas-14b-Phi-3-medium-ORPO/f5e140ff-0c0e-4769-8116-63cf50255773.json deleted file mode 100644 index 4dc6150c8..000000000 --- a/data/hfopenllm_v2/Danielbrdz/Barcenas-14b-Phi-3-medium-ORPO/f5e140ff-0c0e-4769-8116-63cf50255773.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Danielbrdz_Barcenas-14b-Phi-3-medium-ORPO/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Barcenas-14b-Phi-3-medium-ORPO", - "id": "Danielbrdz/Barcenas-14b-Phi-3-medium-ORPO", - "developer": "Danielbrdz", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 13.96 - } - }, - "evaluation_results": [ - { - 
"evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4799 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6536 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2024 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3263 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4808 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4723 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Danielbrdz/Barcenas-14b-phi-4-v2/df85ec6e-1325-40ce-8087-d960a1d767dd.json b/data/hfopenllm_v2/Danielbrdz/Barcenas-14b-phi-4-v2/df85ec6e-1325-40ce-8087-d960a1d767dd.json deleted file mode 100644 index 604f4b71a..000000000 --- a/data/hfopenllm_v2/Danielbrdz/Barcenas-14b-phi-4-v2/df85ec6e-1325-40ce-8087-d960a1d767dd.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Danielbrdz_Barcenas-14b-phi-4-v2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Barcenas-14b-phi-4-v2", - "id": "Danielbrdz/Barcenas-14b-phi-4-v2", - "developer": "Danielbrdz", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2775 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6573 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3218 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3784 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4399 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5244 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Danielbrdz/Barcenas-14b-phi-4/a7bd3fff-f01e-46ca-af85-5b4ac6ae7320.json b/data/hfopenllm_v2/Danielbrdz/Barcenas-14b-phi-4/a7bd3fff-f01e-46ca-af85-5b4ac6ae7320.json deleted file mode 100644 index e976af74e..000000000 --- a/data/hfopenllm_v2/Danielbrdz/Barcenas-14b-phi-4/a7bd3fff-f01e-46ca-af85-5b4ac6ae7320.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Danielbrdz_Barcenas-14b-phi-4/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Barcenas-14b-phi-4", - "id": "Danielbrdz/Barcenas-14b-phi-4", - "developer": "Danielbrdz", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0498 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": 
"hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6769 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2583 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3834 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5097 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5175 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Danielbrdz/Barcenas-3b-GRPO/11842dd9-0572-41ef-aaa0-8d19f3420efc.json b/data/hfopenllm_v2/Danielbrdz/Barcenas-3b-GRPO/11842dd9-0572-41ef-aaa0-8d19f3420efc.json deleted file mode 100644 index 85744ceae..000000000 --- a/data/hfopenllm_v2/Danielbrdz/Barcenas-3b-GRPO/11842dd9-0572-41ef-aaa0-8d19f3420efc.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Danielbrdz_Barcenas-3b-GRPO/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Barcenas-3b-GRPO", - "id": "Danielbrdz/Barcenas-3b-GRPO", - "developer": "Danielbrdz", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5444 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4414 - 
} - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1375 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2903 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3576 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3037 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Danielbrdz/Barcenas-Llama3-8b-ORPO/01abccec-1cea-4060-89be-289987d0a2ce.json b/data/hfopenllm_v2/Danielbrdz/Barcenas-Llama3-8b-ORPO/01abccec-1cea-4060-89be-289987d0a2ce.json deleted file mode 100644 index a208f17df..000000000 --- a/data/hfopenllm_v2/Danielbrdz/Barcenas-Llama3-8b-ORPO/01abccec-1cea-4060-89be-289987d0a2ce.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Danielbrdz_Barcenas-Llama3-8b-ORPO/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Barcenas-Llama3-8b-ORPO", - "id": "Danielbrdz/Barcenas-Llama3-8b-ORPO", - "developer": "Danielbrdz", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7372 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4987 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - 
"evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0657 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.307 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.419 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.383 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Danielbrdz/Barcenas-R1-Qwen-1.5b/dce8226c-57bd-4255-b813-8a70494f0a1a.json b/data/hfopenllm_v2/Danielbrdz/Barcenas-R1-Qwen-1.5b/dce8226c-57bd-4255-b813-8a70494f0a1a.json deleted file mode 100644 index 03948309d..000000000 --- a/data/hfopenllm_v2/Danielbrdz/Barcenas-R1-Qwen-1.5b/dce8226c-57bd-4255-b813-8a70494f0a1a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Danielbrdz_Barcenas-R1-Qwen-1.5b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Barcenas-R1-Qwen-1.5b", - "id": "Danielbrdz/Barcenas-R1-Qwen-1.5b", - "developer": "Danielbrdz", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.777 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2428 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3587 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3497 - } - }, - { - "evaluation_name": 
"GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3037 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3541 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1909 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Dans-DiscountModels/12b-mn-dans-reasoning-test-2/7f80e69c-eec6-49ac-a088-6248ee25f736.json b/data/hfopenllm_v2/Dans-DiscountModels/12b-mn-dans-reasoning-test-2/7f80e69c-eec6-49ac-a088-6248ee25f736.json deleted file mode 100644 index f352222ea..000000000 --- a/data/hfopenllm_v2/Dans-DiscountModels/12b-mn-dans-reasoning-test-2/7f80e69c-eec6-49ac-a088-6248ee25f736.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Dans-DiscountModels_12b-mn-dans-reasoning-test-2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "12b-mn-dans-reasoning-test-2", - "id": "Dans-DiscountModels/12b-mn-dans-reasoning-test-2", - "developer": "Dans-DiscountModels", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3711 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4807 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0634 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2735 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3702 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2507 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Dans-DiscountModels/12b-mn-dans-reasoning-test-3/e0267a2c-dfc5-456e-864d-b5b0ad1fa508.json b/data/hfopenllm_v2/Dans-DiscountModels/12b-mn-dans-reasoning-test-3/e0267a2c-dfc5-456e-864d-b5b0ad1fa508.json deleted file mode 100644 index 570947dbd..000000000 --- a/data/hfopenllm_v2/Dans-DiscountModels/12b-mn-dans-reasoning-test-3/e0267a2c-dfc5-456e-864d-b5b0ad1fa508.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Dans-DiscountModels_12b-mn-dans-reasoning-test-3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "12b-mn-dans-reasoning-test-3", - "id": "Dans-DiscountModels/12b-mn-dans-reasoning-test-3", - "developer": "Dans-DiscountModels", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5053 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4839 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0778 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.271 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4168 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2516 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Dans-DiscountModels/Dans-Instruct-CoreCurriculum-12b-ChatML/e6ad37be-28f4-43b4-9df1-b7b47d31232e.json b/data/hfopenllm_v2/Dans-DiscountModels/Dans-Instruct-CoreCurriculum-12b-ChatML/e6ad37be-28f4-43b4-9df1-b7b47d31232e.json deleted file mode 100644 index fc2a36d87..000000000 --- a/data/hfopenllm_v2/Dans-DiscountModels/Dans-Instruct-CoreCurriculum-12b-ChatML/e6ad37be-28f4-43b4-9df1-b7b47d31232e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Dans-DiscountModels_Dans-Instruct-CoreCurriculum-12b-ChatML/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Dans-Instruct-CoreCurriculum-12b-ChatML", - "id": "Dans-DiscountModels/Dans-Instruct-CoreCurriculum-12b-ChatML", - "developer": "Dans-DiscountModels", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2111 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4792 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0431 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.2802 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3606 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2805 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Dans-DiscountModels/Dans-Instruct-Mix-8b-ChatML-V0.1.0/5514368a-1f7d-4cd0-b7f7-d116b753f975.json b/data/hfopenllm_v2/Dans-DiscountModels/Dans-Instruct-Mix-8b-ChatML-V0.1.0/5514368a-1f7d-4cd0-b7f7-d116b753f975.json deleted file mode 100644 index 83903ba4f..000000000 --- a/data/hfopenllm_v2/Dans-DiscountModels/Dans-Instruct-Mix-8b-ChatML-V0.1.0/5514368a-1f7d-4cd0-b7f7-d116b753f975.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Dans-DiscountModels_Dans-Instruct-Mix-8b-ChatML-V0.1.0/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Dans-Instruct-Mix-8b-ChatML-V0.1.0", - "id": "Dans-DiscountModels/Dans-Instruct-Mix-8b-ChatML-V0.1.0", - "developer": "Dans-DiscountModels", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0668 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4775 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0672 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2861 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": 
"MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3786 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3284 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Dans-DiscountModels/Dans-Instruct-Mix-8b-ChatML-V0.1.1/c0e29cf8-897f-4e07-abb4-71c801d34301.json b/data/hfopenllm_v2/Dans-DiscountModels/Dans-Instruct-Mix-8b-ChatML-V0.1.1/c0e29cf8-897f-4e07-abb4-71c801d34301.json deleted file mode 100644 index 4a453f833..000000000 --- a/data/hfopenllm_v2/Dans-DiscountModels/Dans-Instruct-Mix-8b-ChatML-V0.1.1/c0e29cf8-897f-4e07-abb4-71c801d34301.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Dans-DiscountModels_Dans-Instruct-Mix-8b-ChatML-V0.1.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Dans-Instruct-Mix-8b-ChatML-V0.1.1", - "id": "Dans-DiscountModels/Dans-Instruct-Mix-8b-ChatML-V0.1.1", - "developer": "Dans-DiscountModels", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0911 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4749 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0597 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2911 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3825 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3279 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Dans-DiscountModels/Dans-Instruct-Mix-8b-ChatML-V0.2.0/68310379-65b2-482d-892b-f76547bce2b0.json b/data/hfopenllm_v2/Dans-DiscountModels/Dans-Instruct-Mix-8b-ChatML-V0.2.0/68310379-65b2-482d-892b-f76547bce2b0.json deleted file mode 100644 index a2a7418c6..000000000 --- a/data/hfopenllm_v2/Dans-DiscountModels/Dans-Instruct-Mix-8b-ChatML-V0.2.0/68310379-65b2-482d-892b-f76547bce2b0.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Dans-DiscountModels_Dans-Instruct-Mix-8b-ChatML-V0.2.0/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Dans-Instruct-Mix-8b-ChatML-V0.2.0", - "id": "Dans-DiscountModels/Dans-Instruct-Mix-8b-ChatML-V0.2.0", - "developer": "Dans-DiscountModels", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5064 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4624 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0733 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2936 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3644 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Dans-DiscountModels/Dans-Instruct-Mix-8b-ChatML/a034c4ec-d4cd-439b-8dbd-e67685ea7616.json b/data/hfopenllm_v2/Dans-DiscountModels/Dans-Instruct-Mix-8b-ChatML/a034c4ec-d4cd-439b-8dbd-e67685ea7616.json deleted file mode 100644 index d03dd14df..000000000 --- a/data/hfopenllm_v2/Dans-DiscountModels/Dans-Instruct-Mix-8b-ChatML/a034c4ec-d4cd-439b-8dbd-e67685ea7616.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Dans-DiscountModels_Dans-Instruct-Mix-8b-ChatML/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Dans-Instruct-Mix-8b-ChatML", - "id": "Dans-DiscountModels/Dans-Instruct-Mix-8b-ChatML", - "developer": "Dans-DiscountModels", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0825 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4738 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0551 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2945 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3918 - } - }, - { - "evaluation_name": "MMLU-PRO", 
- "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3288 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Dans-DiscountModels/Mistral-7b-v0.3-Test-E0.7/e4b761d3-bb84-4433-b9fb-4c92ecae6279.json b/data/hfopenllm_v2/Dans-DiscountModels/Mistral-7b-v0.3-Test-E0.7/e4b761d3-bb84-4433-b9fb-4c92ecae6279.json deleted file mode 100644 index 17f92f2fb..000000000 --- a/data/hfopenllm_v2/Dans-DiscountModels/Mistral-7b-v0.3-Test-E0.7/e4b761d3-bb84-4433-b9fb-4c92ecae6279.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Dans-DiscountModels_Mistral-7b-v0.3-Test-E0.7/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral-7b-v0.3-Test-E0.7", - "id": "Dans-DiscountModels/Mistral-7b-v0.3-Test-E0.7", - "developer": "Dans-DiscountModels", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5124 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.475 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.034 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2961 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4005 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2744 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Dans-DiscountModels/mistral-7b-test-merged/38d78d30-be6d-476c-a3aa-d9a40f570a56.json b/data/hfopenllm_v2/Dans-DiscountModels/mistral-7b-test-merged/38d78d30-be6d-476c-a3aa-d9a40f570a56.json deleted file mode 100644 index 530b81d04..000000000 --- a/data/hfopenllm_v2/Dans-DiscountModels/mistral-7b-test-merged/38d78d30-be6d-476c-a3aa-d9a40f570a56.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Dans-DiscountModels_mistral-7b-test-merged/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "mistral-7b-test-merged", - "id": "Dans-DiscountModels/mistral-7b-test-merged", - "developer": "Dans-DiscountModels", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6678 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4898 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0446 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2945 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3754 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.2978 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Darkknight535/OpenCrystal-12B-L3/36e60f6c-60f7-4b17-88fe-82810e195fc7.json b/data/hfopenllm_v2/Darkknight535/OpenCrystal-12B-L3/36e60f6c-60f7-4b17-88fe-82810e195fc7.json deleted file mode 100644 index 62f99b5b5..000000000 --- a/data/hfopenllm_v2/Darkknight535/OpenCrystal-12B-L3/36e60f6c-60f7-4b17-88fe-82810e195fc7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Darkknight535_OpenCrystal-12B-L3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "OpenCrystal-12B-L3", - "id": "Darkknight535/OpenCrystal-12B-L3", - "developer": "Darkknight535", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 11.52 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4071 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5223 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0899 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3062 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3657 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.364 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DavidAU/DeepHermes-3-Llama-3-8B-Preview-16.5B-Brainstorm/a6c647e8-ed24-4150-8563-dd9b20e21498.json 
b/data/hfopenllm_v2/DavidAU/DeepHermes-3-Llama-3-8B-Preview-16.5B-Brainstorm/a6c647e8-ed24-4150-8563-dd9b20e21498.json deleted file mode 100644 index 61ffee2d2..000000000 --- a/data/hfopenllm_v2/DavidAU/DeepHermes-3-Llama-3-8B-Preview-16.5B-Brainstorm/a6c647e8-ed24-4150-8563-dd9b20e21498.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DavidAU_DeepHermes-3-Llama-3-8B-Preview-16.5B-Brainstorm/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "DeepHermes-3-Llama-3-8B-Preview-16.5B-Brainstorm", - "id": "DavidAU/DeepHermes-3-Llama-3-8B-Preview-16.5B-Brainstorm", - "developer": "DavidAU", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 16.537 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3136 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4762 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1057 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3138 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3928 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3209 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DavidAU/DeepSeek-BlackRoot-R1-Distill-Llama-3.1-8B/b5a366ac-d736-4447-a2f1-98d0b84ba3bd.json 
b/data/hfopenllm_v2/DavidAU/DeepSeek-BlackRoot-R1-Distill-Llama-3.1-8B/b5a366ac-d736-4447-a2f1-98d0b84ba3bd.json deleted file mode 100644 index 408f2c43a..000000000 --- a/data/hfopenllm_v2/DavidAU/DeepSeek-BlackRoot-R1-Distill-Llama-3.1-8B/b5a366ac-d736-4447-a2f1-98d0b84ba3bd.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DavidAU_DeepSeek-BlackRoot-R1-Distill-Llama-3.1-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "DeepSeek-BlackRoot-R1-Distill-Llama-3.1-8B", - "id": "DavidAU/DeepSeek-BlackRoot-R1-Distill-Llama-3.1-8B", - "developer": "DavidAU", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3685 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4887 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0657 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.318 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.432 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2976 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DavidAU/DeepSeek-Grand-Horror-SMB-R1-Distill-Llama-3.1-16B/5d098dc6-8124-4d26-86ec-d54e6e09c3a6.json 
b/data/hfopenllm_v2/DavidAU/DeepSeek-Grand-Horror-SMB-R1-Distill-Llama-3.1-16B/5d098dc6-8124-4d26-86ec-d54e6e09c3a6.json deleted file mode 100644 index ba72ae399..000000000 --- a/data/hfopenllm_v2/DavidAU/DeepSeek-Grand-Horror-SMB-R1-Distill-Llama-3.1-16B/5d098dc6-8124-4d26-86ec-d54e6e09c3a6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DavidAU_DeepSeek-Grand-Horror-SMB-R1-Distill-Llama-3.1-16B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "DeepSeek-Grand-Horror-SMB-R1-Distill-Llama-3.1-16B", - "id": "DavidAU/DeepSeek-Grand-Horror-SMB-R1-Distill-Llama-3.1-16B", - "developer": "DavidAU", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 15.664 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2507 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4488 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0295 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3138 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4164 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2709 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DavidAU/DeepSeek-MOE-4X8B-R1-Distill-Llama-3.1-Deep-Thinker-Uncensored-24B/1137cbc4-d80b-4e21-bfeb-feab41dc80b2.json 
b/data/hfopenllm_v2/DavidAU/DeepSeek-MOE-4X8B-R1-Distill-Llama-3.1-Deep-Thinker-Uncensored-24B/1137cbc4-d80b-4e21-bfeb-feab41dc80b2.json deleted file mode 100644 index 55cc9cff5..000000000 --- a/data/hfopenllm_v2/DavidAU/DeepSeek-MOE-4X8B-R1-Distill-Llama-3.1-Deep-Thinker-Uncensored-24B/1137cbc4-d80b-4e21-bfeb-feab41dc80b2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DavidAU_DeepSeek-MOE-4X8B-R1-Distill-Llama-3.1-Deep-Thinker-Uncensored-24B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "DeepSeek-MOE-4X8B-R1-Distill-Llama-3.1-Deep-Thinker-Uncensored-24B", - "id": "DavidAU/DeepSeek-MOE-4X8B-R1-Distill-Llama-3.1-Deep-Thinker-Uncensored-24B", - "developer": "DavidAU", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MixtralForCausalLM", - "params_billions": 24.942 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3883 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4886 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0816 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.323 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4375 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3024 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/DavidAU/DeepSeek-MOE-4X8B-R1-Distill-Llama-3.1-Mad-Scientist-24B/097bbfbc-0ccd-4fd4-9e0c-9c192cba9e8b.json b/data/hfopenllm_v2/DavidAU/DeepSeek-MOE-4X8B-R1-Distill-Llama-3.1-Mad-Scientist-24B/097bbfbc-0ccd-4fd4-9e0c-9c192cba9e8b.json deleted file mode 100644 index e4618c8df..000000000 --- a/data/hfopenllm_v2/DavidAU/DeepSeek-MOE-4X8B-R1-Distill-Llama-3.1-Mad-Scientist-24B/097bbfbc-0ccd-4fd4-9e0c-9c192cba9e8b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DavidAU_DeepSeek-MOE-4X8B-R1-Distill-Llama-3.1-Mad-Scientist-24B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "DeepSeek-MOE-4X8B-R1-Distill-Llama-3.1-Mad-Scientist-24B", - "id": "DavidAU/DeepSeek-MOE-4X8B-R1-Distill-Llama-3.1-Mad-Scientist-24B", - "developer": "DavidAU", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MixtralForCausalLM", - "params_billions": 24.942 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3436 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4769 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0755 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3372 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4231 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.297 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/DavidAU/DeepSeek-R1-Distill-Qwen-25.5B-Brainstorm/db8c6169-bfc1-48bb-be53-fa93c673f051.json b/data/hfopenllm_v2/DavidAU/DeepSeek-R1-Distill-Qwen-25.5B-Brainstorm/db8c6169-bfc1-48bb-be53-fa93c673f051.json deleted file mode 100644 index a94c29f09..000000000 --- a/data/hfopenllm_v2/DavidAU/DeepSeek-R1-Distill-Qwen-25.5B-Brainstorm/db8c6169-bfc1-48bb-be53-fa93c673f051.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DavidAU_DeepSeek-R1-Distill-Qwen-25.5B-Brainstorm/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "DeepSeek-R1-Distill-Qwen-25.5B-Brainstorm", - "id": "DavidAU/DeepSeek-R1-Distill-Qwen-25.5B-Brainstorm", - "developer": "DavidAU", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 25.506 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3416 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5807 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5536 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3859 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5155 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4624 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/DavidAU/DeepSeek-V2-Grand-Horror-SMB-R1-Distill-Llama-3.1-Uncensored-16.5B/41437fc9-6d48-4317-a8de-ab4e63b2cf46.json b/data/hfopenllm_v2/DavidAU/DeepSeek-V2-Grand-Horror-SMB-R1-Distill-Llama-3.1-Uncensored-16.5B/41437fc9-6d48-4317-a8de-ab4e63b2cf46.json deleted file mode 100644 index 0f3796fae..000000000 --- a/data/hfopenllm_v2/DavidAU/DeepSeek-V2-Grand-Horror-SMB-R1-Distill-Llama-3.1-Uncensored-16.5B/41437fc9-6d48-4317-a8de-ab4e63b2cf46.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DavidAU_DeepSeek-V2-Grand-Horror-SMB-R1-Distill-Llama-3.1-Uncensored-16.5B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "DeepSeek-V2-Grand-Horror-SMB-R1-Distill-Llama-3.1-Uncensored-16.5B", - "id": "DavidAU/DeepSeek-V2-Grand-Horror-SMB-R1-Distill-Llama-3.1-Uncensored-16.5B", - "developer": "DavidAU", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 16.537 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2853 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4462 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0174 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3054 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4179 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2778 - } - } - ] -} \ No newline at 
end of file diff --git a/data/hfopenllm_v2/DavidAU/DeepThought-MOE-8X3B-R1-Llama-3.2-Reasoning-18B/e075f4fe-95e0-48f4-94c4-f6ebd3f4edaa.json b/data/hfopenllm_v2/DavidAU/DeepThought-MOE-8X3B-R1-Llama-3.2-Reasoning-18B/e075f4fe-95e0-48f4-94c4-f6ebd3f4edaa.json deleted file mode 100644 index 774de1d5e..000000000 --- a/data/hfopenllm_v2/DavidAU/DeepThought-MOE-8X3B-R1-Llama-3.2-Reasoning-18B/e075f4fe-95e0-48f4-94c4-f6ebd3f4edaa.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DavidAU_DeepThought-MOE-8X3B-R1-Llama-3.2-Reasoning-18B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "DeepThought-MOE-8X3B-R1-Llama-3.2-Reasoning-18B", - "id": "DavidAU/DeepThought-MOE-8X3B-R1-Llama-3.2-Reasoning-18B", - "developer": "DavidAU", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MixtralForCausalLM", - "params_billions": 18.405 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3793 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4232 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.108 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2794 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.356 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.272 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/DavidAU/Gemma-The-Writer-9B/3349d66c-e12b-49c1-a406-e0e77b697458.json b/data/hfopenllm_v2/DavidAU/Gemma-The-Writer-9B/3349d66c-e12b-49c1-a406-e0e77b697458.json deleted file mode 100644 index 1978b8349..000000000 --- a/data/hfopenllm_v2/DavidAU/Gemma-The-Writer-9B/3349d66c-e12b-49c1-a406-e0e77b697458.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DavidAU_Gemma-The-Writer-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gemma-The-Writer-9B", - "id": "DavidAU/Gemma-The-Writer-9B", - "developer": "DavidAU", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.174 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5905 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0876 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3456 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4099 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3979 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DavidAU/Gemma-The-Writer-DEADLINE-10B/7aa0ff6b-11a9-4554-a27f-e477a0ff77c7.json b/data/hfopenllm_v2/DavidAU/Gemma-The-Writer-DEADLINE-10B/7aa0ff6b-11a9-4554-a27f-e477a0ff77c7.json deleted file mode 100644 index 5af39ed94..000000000 --- 
a/data/hfopenllm_v2/DavidAU/Gemma-The-Writer-DEADLINE-10B/7aa0ff6b-11a9-4554-a27f-e477a0ff77c7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DavidAU_Gemma-The-Writer-DEADLINE-10B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gemma-The-Writer-DEADLINE-10B", - "id": "DavidAU/Gemma-The-Writer-DEADLINE-10B", - "developer": "DavidAU", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.952 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2332 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5896 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0989 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3423 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4189 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3946 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DavidAU/Gemma-The-Writer-J.GutenBerg-10B/ac749485-df6d-485e-8fa7-63bdfd744167.json b/data/hfopenllm_v2/DavidAU/Gemma-The-Writer-J.GutenBerg-10B/ac749485-df6d-485e-8fa7-63bdfd744167.json deleted file mode 100644 index f0e5db406..000000000 --- a/data/hfopenllm_v2/DavidAU/Gemma-The-Writer-J.GutenBerg-10B/ac749485-df6d-485e-8fa7-63bdfd744167.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/DavidAU_Gemma-The-Writer-J.GutenBerg-10B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gemma-The-Writer-J.GutenBerg-10B", - "id": "DavidAU/Gemma-The-Writer-J.GutenBerg-10B", - "developer": "DavidAU", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.034 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2858 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5909 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0921 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3381 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4176 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3947 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DavidAU/Gemma-The-Writer-Mighty-Sword-9B/54363a4b-312b-4035-a1c3-b5321311cec4.json b/data/hfopenllm_v2/DavidAU/Gemma-The-Writer-Mighty-Sword-9B/54363a4b-312b-4035-a1c3-b5321311cec4.json deleted file mode 100644 index fdb1da8cf..000000000 --- a/data/hfopenllm_v2/DavidAU/Gemma-The-Writer-Mighty-Sword-9B/54363a4b-312b-4035-a1c3-b5321311cec4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DavidAU_Gemma-The-Writer-Mighty-Sword-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", 
- "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gemma-The-Writer-Mighty-Sword-9B", - "id": "DavidAU/Gemma-The-Writer-Mighty-Sword-9B", - "developer": "DavidAU", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7528 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5912 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1911 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3482 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4112 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3968 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DavidAU/Gemma-The-Writer-N-Restless-Quill-10B-Uncensored/aa9e2b9e-cd25-4492-9801-eba7d40b4365.json b/data/hfopenllm_v2/DavidAU/Gemma-The-Writer-N-Restless-Quill-10B-Uncensored/aa9e2b9e-cd25-4492-9801-eba7d40b4365.json deleted file mode 100644 index 949b96b5e..000000000 --- a/data/hfopenllm_v2/DavidAU/Gemma-The-Writer-N-Restless-Quill-10B-Uncensored/aa9e2b9e-cd25-4492-9801-eba7d40b4365.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DavidAU_Gemma-The-Writer-N-Restless-Quill-10B-Uncensored/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": 
"third_party" - }, - "model_info": { - "name": "Gemma-The-Writer-N-Restless-Quill-10B-Uncensored", - "id": "DavidAU/Gemma-The-Writer-N-Restless-Quill-10B-Uncensored", - "developer": "DavidAU", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.034 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7071 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5922 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2296 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3414 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4163 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3966 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DavidAU/L3-DARKEST-PLANET-16.5B/c6b484b8-f6f3-4516-aff5-c2f6438c9047.json b/data/hfopenllm_v2/DavidAU/L3-DARKEST-PLANET-16.5B/c6b484b8-f6f3-4516-aff5-c2f6438c9047.json deleted file mode 100644 index adcdba506..000000000 --- a/data/hfopenllm_v2/DavidAU/L3-DARKEST-PLANET-16.5B/c6b484b8-f6f3-4516-aff5-c2f6438c9047.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DavidAU_L3-DARKEST-PLANET-16.5B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "L3-DARKEST-PLANET-16.5B", - "id": "DavidAU/L3-DARKEST-PLANET-16.5B", - "developer": "DavidAU", - "inference_platform": "unknown", - 
"additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 16.537 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6231 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.523 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0899 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2953 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3754 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.363 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DavidAU/L3-Dark-Planet-8B/c6c760c9-a345-4e25-b333-b403bf6db389.json b/data/hfopenllm_v2/DavidAU/L3-Dark-Planet-8B/c6c760c9-a345-4e25-b333-b403bf6db389.json deleted file mode 100644 index 2857bf738..000000000 --- a/data/hfopenllm_v2/DavidAU/L3-Dark-Planet-8B/c6c760c9-a345-4e25-b333-b403bf6db389.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DavidAU_L3-Dark-Planet-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "L3-Dark-Planet-8B", - "id": "DavidAU/L3-Dark-Planet-8B", - "developer": "DavidAU", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - 
"hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4134 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5084 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0823 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3003 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3616 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3737 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DavidAU/L3-Jamet-12.2B-MK.V-Blackroot-Instruct/65b2aa58-2c04-48f2-9ea3-c8fd97cb9dde.json b/data/hfopenllm_v2/DavidAU/L3-Jamet-12.2B-MK.V-Blackroot-Instruct/65b2aa58-2c04-48f2-9ea3-c8fd97cb9dde.json deleted file mode 100644 index d2d3fcadd..000000000 --- a/data/hfopenllm_v2/DavidAU/L3-Jamet-12.2B-MK.V-Blackroot-Instruct/65b2aa58-2c04-48f2-9ea3-c8fd97cb9dde.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DavidAU_L3-Jamet-12.2B-MK.V-Blackroot-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "L3-Jamet-12.2B-MK.V-Blackroot-Instruct", - "id": "DavidAU/L3-Jamet-12.2B-MK.V-Blackroot-Instruct", - "developer": "DavidAU", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 12.174 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3962 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4766 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0408 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2785 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.402 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3291 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DavidAU/L3-Lumimaid-12.2B-v0.1-OAS-Instruct/92903344-0dde-4f5a-a7d2-749a1ffe9cd3.json b/data/hfopenllm_v2/DavidAU/L3-Lumimaid-12.2B-v0.1-OAS-Instruct/92903344-0dde-4f5a-a7d2-749a1ffe9cd3.json deleted file mode 100644 index 0b5fe1f21..000000000 --- a/data/hfopenllm_v2/DavidAU/L3-Lumimaid-12.2B-v0.1-OAS-Instruct/92903344-0dde-4f5a-a7d2-749a1ffe9cd3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DavidAU_L3-Lumimaid-12.2B-v0.1-OAS-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "L3-Lumimaid-12.2B-v0.1-OAS-Instruct", - "id": "DavidAU/L3-Lumimaid-12.2B-v0.1-OAS-Instruct", - "developer": "DavidAU", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 12.174 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3924 - } - }, - { - "evaluation_name": "BBH", - 
"source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4693 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0461 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2768 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4194 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3142 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DavidAU/L3-SMB-Instruct-12.2B-F32/59ddd478-c1cd-4bd8-80c3-fdebe762414a.json b/data/hfopenllm_v2/DavidAU/L3-SMB-Instruct-12.2B-F32/59ddd478-c1cd-4bd8-80c3-fdebe762414a.json deleted file mode 100644 index 39a911ded..000000000 --- a/data/hfopenllm_v2/DavidAU/L3-SMB-Instruct-12.2B-F32/59ddd478-c1cd-4bd8-80c3-fdebe762414a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DavidAU_L3-SMB-Instruct-12.2B-F32/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "L3-SMB-Instruct-12.2B-F32", - "id": "DavidAU/L3-SMB-Instruct-12.2B-F32", - "developer": "DavidAU", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 12.174 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4303 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4786 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0468 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2819 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4087 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3312 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DavidAU/L3-Stheno-Maid-Blackroot-Grand-HORROR-16B/02f63fc6-9376-4fb5-b067-63493238cc27.json b/data/hfopenllm_v2/DavidAU/L3-Stheno-Maid-Blackroot-Grand-HORROR-16B/02f63fc6-9376-4fb5-b067-63493238cc27.json deleted file mode 100644 index abc99f61c..000000000 --- a/data/hfopenllm_v2/DavidAU/L3-Stheno-Maid-Blackroot-Grand-HORROR-16B/02f63fc6-9376-4fb5-b067-63493238cc27.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DavidAU_L3-Stheno-Maid-Blackroot-Grand-HORROR-16B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "L3-Stheno-Maid-Blackroot-Grand-HORROR-16B", - "id": "DavidAU/L3-Stheno-Maid-Blackroot-Grand-HORROR-16B", - "developer": "DavidAU", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 16.537 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3439 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4736 - } - }, - { - 
"evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0219 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.271 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4031 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.357 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DavidAU/L3-Stheno-v3.2-12.2B-Instruct/dd7597fd-27f5-4e77-a44f-b01d0db82719.json b/data/hfopenllm_v2/DavidAU/L3-Stheno-v3.2-12.2B-Instruct/dd7597fd-27f5-4e77-a44f-b01d0db82719.json deleted file mode 100644 index e2f7047b8..000000000 --- a/data/hfopenllm_v2/DavidAU/L3-Stheno-v3.2-12.2B-Instruct/dd7597fd-27f5-4e77-a44f-b01d0db82719.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DavidAU_L3-Stheno-v3.2-12.2B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "L3-Stheno-v3.2-12.2B-Instruct", - "id": "DavidAU/L3-Stheno-v3.2-12.2B-Instruct", - "developer": "DavidAU", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 12.174 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4028 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4846 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - 
"evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0506 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2752 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4103 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3345 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DavidAU/L3.1-Dark-Planet-SpinFire-Uncensored-8B/20cd0d60-eb0d-41bd-b37f-910a03dd7f82.json b/data/hfopenllm_v2/DavidAU/L3.1-Dark-Planet-SpinFire-Uncensored-8B/20cd0d60-eb0d-41bd-b37f-910a03dd7f82.json deleted file mode 100644 index c256c420f..000000000 --- a/data/hfopenllm_v2/DavidAU/L3.1-Dark-Planet-SpinFire-Uncensored-8B/20cd0d60-eb0d-41bd-b37f-910a03dd7f82.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DavidAU_L3.1-Dark-Planet-SpinFire-Uncensored-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "L3.1-Dark-Planet-SpinFire-Uncensored-8B", - "id": "DavidAU/L3.1-Dark-Planet-SpinFire-Uncensored-8B", - "developer": "DavidAU", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7043 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5261 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.0929 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2794 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3541 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.367 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DavidAU/L3.1-MOE-2X8B-Deepseek-DeepHermes-e32-uncensored-abliterated-13.7B/c4e9d045-3769-4828-a2ca-7fa508873089.json b/data/hfopenllm_v2/DavidAU/L3.1-MOE-2X8B-Deepseek-DeepHermes-e32-uncensored-abliterated-13.7B/c4e9d045-3769-4828-a2ca-7fa508873089.json deleted file mode 100644 index 0420b014e..000000000 --- a/data/hfopenllm_v2/DavidAU/L3.1-MOE-2X8B-Deepseek-DeepHermes-e32-uncensored-abliterated-13.7B/c4e9d045-3769-4828-a2ca-7fa508873089.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DavidAU_L3.1-MOE-2X8B-Deepseek-DeepHermes-e32-uncensored-abliterated-13.7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "L3.1-MOE-2X8B-Deepseek-DeepHermes-e32-uncensored-abliterated-13.7B", - "id": "DavidAU/L3.1-MOE-2X8B-Deepseek-DeepHermes-e32-uncensored-abliterated-13.7B", - "developer": "DavidAU", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MixtralForCausalLM", - "params_billions": 13.668 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3345 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4421 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2606 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3138 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3749 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2892 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DavidAU/Qwen2.5-MOE-2X1.5B-DeepSeek-Uncensored-Censored-4B/0a0501ec-4ecd-47c1-914b-d473f795cef2.json b/data/hfopenllm_v2/DavidAU/Qwen2.5-MOE-2X1.5B-DeepSeek-Uncensored-Censored-4B/0a0501ec-4ecd-47c1-914b-d473f795cef2.json deleted file mode 100644 index 0a0b5cf10..000000000 --- a/data/hfopenllm_v2/DavidAU/Qwen2.5-MOE-2X1.5B-DeepSeek-Uncensored-Censored-4B/0a0501ec-4ecd-47c1-914b-d473f795cef2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DavidAU_Qwen2.5-MOE-2X1.5B-DeepSeek-Uncensored-Censored-4B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-MOE-2X1.5B-DeepSeek-Uncensored-Censored-4B", - "id": "DavidAU/Qwen2.5-MOE-2X1.5B-DeepSeek-Uncensored-Censored-4B", - "developer": "DavidAU", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2MoeForCausalLM", - "params_billions": 4.089 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1783 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3033 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.0249 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2592 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3715 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1142 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DavidAU/Qwen2.5-MOE-2X7B-DeepSeek-Abliterated-Censored-19B/beca755f-203f-4bc8-b5cf-f9a9e3f8bd8f.json b/data/hfopenllm_v2/DavidAU/Qwen2.5-MOE-2X7B-DeepSeek-Abliterated-Censored-19B/beca755f-203f-4bc8-b5cf-f9a9e3f8bd8f.json deleted file mode 100644 index 3d03444da..000000000 --- a/data/hfopenllm_v2/DavidAU/Qwen2.5-MOE-2X7B-DeepSeek-Abliterated-Censored-19B/beca755f-203f-4bc8-b5cf-f9a9e3f8bd8f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DavidAU_Qwen2.5-MOE-2X7B-DeepSeek-Abliterated-Censored-19B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-MOE-2X7B-DeepSeek-Abliterated-Censored-19B", - "id": "DavidAU/Qwen2.5-MOE-2X7B-DeepSeek-Abliterated-Censored-19B", - "developer": "DavidAU", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2MoeForCausalLM", - "params_billions": 19.022 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2835 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3592 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2417 - } - }, - { - 
"evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2651 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3847 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1636 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DavidAU/Qwen2.5-MOE-6x1.5B-DeepSeek-Reasoning-e32/79e1e1c6-cbe0-43a9-a593-8e2119baaf77.json b/data/hfopenllm_v2/DavidAU/Qwen2.5-MOE-6x1.5B-DeepSeek-Reasoning-e32/79e1e1c6-cbe0-43a9-a593-8e2119baaf77.json deleted file mode 100644 index 60721bfc8..000000000 --- a/data/hfopenllm_v2/DavidAU/Qwen2.5-MOE-6x1.5B-DeepSeek-Reasoning-e32/79e1e1c6-cbe0-43a9-a593-8e2119baaf77.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DavidAU_Qwen2.5-MOE-6x1.5B-DeepSeek-Reasoning-e32/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-MOE-6x1.5B-DeepSeek-Reasoning-e32", - "id": "DavidAU/Qwen2.5-MOE-6x1.5B-DeepSeek-Reasoning-e32", - "developer": "DavidAU", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2MoeForCausalLM", - "params_billions": 8.714 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2107 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3286 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0665 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": 
"Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2475 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3404 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1122 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Davidsv/SUONG-1/def80b44-3d9a-46ba-bf5f-ffc81e50af2e.json b/data/hfopenllm_v2/Davidsv/SUONG-1/def80b44-3d9a-46ba-bf5f-ffc81e50af2e.json deleted file mode 100644 index 392a9563b..000000000 --- a/data/hfopenllm_v2/Davidsv/SUONG-1/def80b44-3d9a-46ba-bf5f-ffc81e50af2e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Davidsv_SUONG-1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SUONG-1", - "id": "Davidsv/SUONG-1", - "developer": "Davidsv", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 2.879 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2497 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2817 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2441 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - 
"source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3578 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1085 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DavieLion/Llama-3.2-1B-SPIN-iter0/5e1aa809-ef20-445e-a05b-eccd585d5991.json b/data/hfopenllm_v2/DavieLion/Llama-3.2-1B-SPIN-iter0/5e1aa809-ef20-445e-a05b-eccd585d5991.json deleted file mode 100644 index a70497e8f..000000000 --- a/data/hfopenllm_v2/DavieLion/Llama-3.2-1B-SPIN-iter0/5e1aa809-ef20-445e-a05b-eccd585d5991.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DavieLion_Llama-3.2-1B-SPIN-iter0/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.2-1B-SPIN-iter0", - "id": "DavieLion/Llama-3.2-1B-SPIN-iter0", - "developer": "DavieLion", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.236 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1507 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.293 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2534 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 
- }, - "score_details": { - "score": 0.3565 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1125 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DavieLion/Llama-3.2-1B-SPIN-iter0/7c2be651-ca56-4285-afc7-1bfe1c8ce11e.json b/data/hfopenllm_v2/DavieLion/Llama-3.2-1B-SPIN-iter0/7c2be651-ca56-4285-afc7-1bfe1c8ce11e.json deleted file mode 100644 index ff2f7e65f..000000000 --- a/data/hfopenllm_v2/DavieLion/Llama-3.2-1B-SPIN-iter0/7c2be651-ca56-4285-afc7-1bfe1c8ce11e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DavieLion_Llama-3.2-1B-SPIN-iter0/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.2-1B-SPIN-iter0", - "id": "DavieLion/Llama-3.2-1B-SPIN-iter0", - "developer": "DavieLion", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.236 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1549 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2937 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.006 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2576 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3565 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1128 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DavieLion/Llama-3.2-1B-SPIN-iter1/cfe4ea72-ddb9-49b5-9599-99f215e112e5.json b/data/hfopenllm_v2/DavieLion/Llama-3.2-1B-SPIN-iter1/cfe4ea72-ddb9-49b5-9599-99f215e112e5.json deleted file mode 100644 index 6cf1f5440..000000000 --- a/data/hfopenllm_v2/DavieLion/Llama-3.2-1B-SPIN-iter1/cfe4ea72-ddb9-49b5-9599-99f215e112e5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DavieLion_Llama-3.2-1B-SPIN-iter1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.2-1B-SPIN-iter1", - "id": "DavieLion/Llama-3.2-1B-SPIN-iter1", - "developer": "DavieLion", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.236 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1575 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.294 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0023 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2508 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3646 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1118 - } - } - ] 
-} \ No newline at end of file diff --git a/data/hfopenllm_v2/DavieLion/Llama-3.2-1B-SPIN-iter2/81d63d8e-88dd-4b16-b9b8-d07604878f8f.json b/data/hfopenllm_v2/DavieLion/Llama-3.2-1B-SPIN-iter2/81d63d8e-88dd-4b16-b9b8-d07604878f8f.json deleted file mode 100644 index b5a6f68d3..000000000 --- a/data/hfopenllm_v2/DavieLion/Llama-3.2-1B-SPIN-iter2/81d63d8e-88dd-4b16-b9b8-d07604878f8f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DavieLion_Llama-3.2-1B-SPIN-iter2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.2-1B-SPIN-iter2", - "id": "DavieLion/Llama-3.2-1B-SPIN-iter2", - "developer": "DavieLion", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.236 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1376 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.298 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0053 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2542 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3553 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1129 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DavieLion/Llama-3.2-1B-SPIN-iter3/81f8208b-f7e7-4685-bb84-321d9e097470.json 
b/data/hfopenllm_v2/DavieLion/Llama-3.2-1B-SPIN-iter3/81f8208b-f7e7-4685-bb84-321d9e097470.json deleted file mode 100644 index fe2d84160..000000000 --- a/data/hfopenllm_v2/DavieLion/Llama-3.2-1B-SPIN-iter3/81f8208b-f7e7-4685-bb84-321d9e097470.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DavieLion_Llama-3.2-1B-SPIN-iter3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.2-1B-SPIN-iter3", - "id": "DavieLion/Llama-3.2-1B-SPIN-iter3", - "developer": "DavieLion", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.236 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1324 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2972 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2643 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3527 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1129 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DavieLion/Llama-3.2-1B-SPIN-iter3/a0c9a434-9b8c-47c5-b511-9daac7901686.json b/data/hfopenllm_v2/DavieLion/Llama-3.2-1B-SPIN-iter3/a0c9a434-9b8c-47c5-b511-9daac7901686.json deleted file mode 100644 index e22bf8212..000000000 --- 
a/data/hfopenllm_v2/DavieLion/Llama-3.2-1B-SPIN-iter3/a0c9a434-9b8c-47c5-b511-9daac7901686.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DavieLion_Llama-3.2-1B-SPIN-iter3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.2-1B-SPIN-iter3", - "id": "DavieLion/Llama-3.2-1B-SPIN-iter3", - "developer": "DavieLion", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.236 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1336 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2975 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0068 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2534 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.35 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1128 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DavieLion/Lllma-3.2-1B/28b60eae-1b38-4404-8db1-3fb2997583f4.json b/data/hfopenllm_v2/DavieLion/Lllma-3.2-1B/28b60eae-1b38-4404-8db1-3fb2997583f4.json deleted file mode 100644 index 9df57bb74..000000000 --- a/data/hfopenllm_v2/DavieLion/Lllma-3.2-1B/28b60eae-1b38-4404-8db1-3fb2997583f4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DavieLion_Lllma-3.2-1B/1770682486.623709", - 
"retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Lllma-3.2-1B", - "id": "DavieLion/Lllma-3.2-1B", - "developer": "DavieLion", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.236 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1601 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2965 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0068 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2441 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3578 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1126 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DebateLabKIT/Llama-3.1-Argunaut-1-8B-SFT/746862a2-a90c-4612-91d0-f989b9eed1a5.json b/data/hfopenllm_v2/DebateLabKIT/Llama-3.1-Argunaut-1-8B-SFT/746862a2-a90c-4612-91d0-f989b9eed1a5.json deleted file mode 100644 index 64711247e..000000000 --- a/data/hfopenllm_v2/DebateLabKIT/Llama-3.1-Argunaut-1-8B-SFT/746862a2-a90c-4612-91d0-f989b9eed1a5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DebateLabKIT_Llama-3.1-Argunaut-1-8B-SFT/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": 
"third_party" - }, - "model_info": { - "name": "Llama-3.1-Argunaut-1-8B-SFT", - "id": "DebateLabKIT/Llama-3.1-Argunaut-1-8B-SFT", - "developer": "DebateLabKIT", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5519 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4824 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.145 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2836 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4503 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3472 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Deci/DeciLM-7B-instruct/715ee057-9c9a-4e04-991c-7040b1eef65b.json b/data/hfopenllm_v2/Deci/DeciLM-7B-instruct/715ee057-9c9a-4e04-991c-7040b1eef65b.json deleted file mode 100644 index 19a0bd337..000000000 --- a/data/hfopenllm_v2/Deci/DeciLM-7B-instruct/715ee057-9c9a-4e04-991c-7040b1eef65b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Deci_DeciLM-7B-instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "DeciLM-7B-instruct", - "id": "Deci/DeciLM-7B-instruct", - "developer": "Deci", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": 
"DeciLMForCausalLM", - "params_billions": 7.044 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.488 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.459 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0302 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2894 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3884 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2608 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Deci/DeciLM-7B/4dc1d103-3458-4b8c-9e63-b98effd69667.json b/data/hfopenllm_v2/Deci/DeciLM-7B/4dc1d103-3458-4b8c-9e63-b98effd69667.json deleted file mode 100644 index 7a095aa90..000000000 --- a/data/hfopenllm_v2/Deci/DeciLM-7B/4dc1d103-3458-4b8c-9e63-b98effd69667.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Deci_DeciLM-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "DeciLM-7B", - "id": "Deci/DeciLM-7B", - "developer": "Deci", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "DeciLMForCausalLM", - "params_billions": 7.044 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2813 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4423 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0287 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2953 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4359 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2692 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DeepAutoAI/Explore_Llama-3.1-8B-Inst/070ff2a5-9a5d-48cf-8517-1ad9b6642d59.json b/data/hfopenllm_v2/DeepAutoAI/Explore_Llama-3.1-8B-Inst/070ff2a5-9a5d-48cf-8517-1ad9b6642d59.json deleted file mode 100644 index 693d2501c..000000000 --- a/data/hfopenllm_v2/DeepAutoAI/Explore_Llama-3.1-8B-Inst/070ff2a5-9a5d-48cf-8517-1ad9b6642d59.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DeepAutoAI_Explore_Llama-3.1-8B-Inst/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Explore_Llama-3.1-8B-Inst", - "id": "DeepAutoAI/Explore_Llama-3.1-8B-Inst", - "developer": "DeepAutoAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7795 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - 
"source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5117 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2009 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2836 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.391 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3792 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DeepAutoAI/Explore_Llama-3.2-1B-Inst/8406a5b8-a87d-489b-b75b-00e9f675f09f.json b/data/hfopenllm_v2/DeepAutoAI/Explore_Llama-3.2-1B-Inst/8406a5b8-a87d-489b-b75b-00e9f675f09f.json deleted file mode 100644 index 2d0dd9463..000000000 --- a/data/hfopenllm_v2/DeepAutoAI/Explore_Llama-3.2-1B-Inst/8406a5b8-a87d-489b-b75b-00e9f675f09f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DeepAutoAI_Explore_Llama-3.2-1B-Inst/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Explore_Llama-3.2-1B-Inst", - "id": "DeepAutoAI/Explore_Llama-3.2-1B-Inst", - "developer": "DeepAutoAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.236 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5649 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 
0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3505 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0748 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2559 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3183 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1809 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DeepAutoAI/Explore_Llama-3.2-1B-Inst_v0/11e8f9b6-32ab-4b83-a601-e5644c0b2c39.json b/data/hfopenllm_v2/DeepAutoAI/Explore_Llama-3.2-1B-Inst_v0/11e8f9b6-32ab-4b83-a601-e5644c0b2c39.json deleted file mode 100644 index 6773c3179..000000000 --- a/data/hfopenllm_v2/DeepAutoAI/Explore_Llama-3.2-1B-Inst_v0/11e8f9b6-32ab-4b83-a601-e5644c0b2c39.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DeepAutoAI_Explore_Llama-3.2-1B-Inst_v0/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Explore_Llama-3.2-1B-Inst_v0", - "id": "DeepAutoAI/Explore_Llama-3.2-1B-Inst_v0", - "developer": "DeepAutoAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.236 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5597 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3365 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": 
"hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0597 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2634 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3103 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1804 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DeepAutoAI/Explore_Llama-3.2-1B-Inst_v1.1/6b542f5a-ea62-45ce-8e98-436a4d058877.json b/data/hfopenllm_v2/DeepAutoAI/Explore_Llama-3.2-1B-Inst_v1.1/6b542f5a-ea62-45ce-8e98-436a4d058877.json deleted file mode 100644 index a3c9bf2aa..000000000 --- a/data/hfopenllm_v2/DeepAutoAI/Explore_Llama-3.2-1B-Inst_v1.1/6b542f5a-ea62-45ce-8e98-436a4d058877.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DeepAutoAI_Explore_Llama-3.2-1B-Inst_v1.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Explore_Llama-3.2-1B-Inst_v1.1", - "id": "DeepAutoAI/Explore_Llama-3.2-1B-Inst_v1.1", - "developer": "DeepAutoAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.236 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5844 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3513 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": 
false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0718 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2626 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3117 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1818 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DeepAutoAI/Explore_Llama-3.2-1B-Inst_v1/9b280640-bfee-4730-acc3-386a54b2434c.json b/data/hfopenllm_v2/DeepAutoAI/Explore_Llama-3.2-1B-Inst_v1/9b280640-bfee-4730-acc3-386a54b2434c.json deleted file mode 100644 index 0e0834213..000000000 --- a/data/hfopenllm_v2/DeepAutoAI/Explore_Llama-3.2-1B-Inst_v1/9b280640-bfee-4730-acc3-386a54b2434c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DeepAutoAI_Explore_Llama-3.2-1B-Inst_v1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Explore_Llama-3.2-1B-Inst_v1", - "id": "DeepAutoAI/Explore_Llama-3.2-1B-Inst_v1", - "developer": "DeepAutoAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.236 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4999 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3141 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.031 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - 
"dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.245 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3781 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1269 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DeepAutoAI/causal_gpt2/eff5171b-6119-4013-8aa8-8a4f0215b045.json b/data/hfopenllm_v2/DeepAutoAI/causal_gpt2/eff5171b-6119-4013-8aa8-8a4f0215b045.json deleted file mode 100644 index f4355b69c..000000000 --- a/data/hfopenllm_v2/DeepAutoAI/causal_gpt2/eff5171b-6119-4013-8aa8-8a4f0215b045.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DeepAutoAI_causal_gpt2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "causal_gpt2", - "id": "DeepAutoAI/causal_gpt2", - "developer": "DeepAutoAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "GPT2LMHeadModel", - "params_billions": 0.124 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1813 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3026 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0053 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.2601 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.427 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1131 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DeepAutoAI/d2nwg_Llama-3.1-8B-Instruct-v0.0/471c5fed-f155-4521-9d9c-b5370ca91bec.json b/data/hfopenllm_v2/DeepAutoAI/d2nwg_Llama-3.1-8B-Instruct-v0.0/471c5fed-f155-4521-9d9c-b5370ca91bec.json deleted file mode 100644 index db64d4ff1..000000000 --- a/data/hfopenllm_v2/DeepAutoAI/d2nwg_Llama-3.1-8B-Instruct-v0.0/471c5fed-f155-4521-9d9c-b5370ca91bec.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DeepAutoAI_d2nwg_Llama-3.1-8B-Instruct-v0.0/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "d2nwg_Llama-3.1-8B-Instruct-v0.0", - "id": "DeepAutoAI/d2nwg_Llama-3.1-8B-Instruct-v0.0", - "developer": "DeepAutoAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7893 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.508 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1805 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2919 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - 
}, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4135 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3877 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DeepAutoAI/d2nwg_causal_gpt2/690be099-3ace-484f-b01f-2fe6b324d12a.json b/data/hfopenllm_v2/DeepAutoAI/d2nwg_causal_gpt2/690be099-3ace-484f-b01f-2fe6b324d12a.json deleted file mode 100644 index e1f3baeef..000000000 --- a/data/hfopenllm_v2/DeepAutoAI/d2nwg_causal_gpt2/690be099-3ace-484f-b01f-2fe6b324d12a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DeepAutoAI_d2nwg_causal_gpt2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "d2nwg_causal_gpt2", - "id": "DeepAutoAI/d2nwg_causal_gpt2", - "developer": "DeepAutoAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "GPT2LMHeadModel", - "params_billions": 0.124 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1916 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3027 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0045 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2576 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4297 - } - }, - { - "evaluation_name": 
"MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1151 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DeepAutoAI/d2nwg_causal_gpt2_v1/71fbd15f-5eec-40d9-84e8-07323f3ffac6.json b/data/hfopenllm_v2/DeepAutoAI/d2nwg_causal_gpt2_v1/71fbd15f-5eec-40d9-84e8-07323f3ffac6.json deleted file mode 100644 index 97cbbb48b..000000000 --- a/data/hfopenllm_v2/DeepAutoAI/d2nwg_causal_gpt2_v1/71fbd15f-5eec-40d9-84e8-07323f3ffac6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DeepAutoAI_d2nwg_causal_gpt2_v1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "d2nwg_causal_gpt2_v1", - "id": "DeepAutoAI/d2nwg_causal_gpt2_v1", - "developer": "DeepAutoAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "GPT2LMHeadModel", - "params_billions": 0.124 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1989 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2992 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0038 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2584 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4337 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1135 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DeepAutoAI/ldm_soup_Llama-3.1-8B-Inst/eb93dd3e-3d13-4234-bb66-f6177648aa2b.json b/data/hfopenllm_v2/DeepAutoAI/ldm_soup_Llama-3.1-8B-Inst/eb93dd3e-3d13-4234-bb66-f6177648aa2b.json deleted file mode 100644 index 02684506f..000000000 --- a/data/hfopenllm_v2/DeepAutoAI/ldm_soup_Llama-3.1-8B-Inst/eb93dd3e-3d13-4234-bb66-f6177648aa2b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DeepAutoAI_ldm_soup_Llama-3.1-8B-Inst/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ldm_soup_Llama-3.1-8B-Inst", - "id": "DeepAutoAI/ldm_soup_Llama-3.1-8B-Inst", - "developer": "DeepAutoAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8033 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5121 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1888 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2894 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4161 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3886 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/DeepAutoAI/ldm_soup_Llama-3.1-8B-Instruct-v0.0/f7ec1ed7-cc30-4879-8ab1-4909011553d5.json b/data/hfopenllm_v2/DeepAutoAI/ldm_soup_Llama-3.1-8B-Instruct-v0.0/f7ec1ed7-cc30-4879-8ab1-4909011553d5.json deleted file mode 100644 index 78c4abb58..000000000 --- a/data/hfopenllm_v2/DeepAutoAI/ldm_soup_Llama-3.1-8B-Instruct-v0.0/f7ec1ed7-cc30-4879-8ab1-4909011553d5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DeepAutoAI_ldm_soup_Llama-3.1-8B-Instruct-v0.0/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ldm_soup_Llama-3.1-8B-Instruct-v0.0", - "id": "DeepAutoAI/ldm_soup_Llama-3.1-8B-Instruct-v0.0", - "developer": "DeepAutoAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7889 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5125 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1918 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2911 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4121 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3895 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DeepAutoAI/ldm_soup_Llama-3.1-8B-Instruct-v0.1/3e100704-dbd3-4d05-b325-5bb4bc90e51c.json 
b/data/hfopenllm_v2/DeepAutoAI/ldm_soup_Llama-3.1-8B-Instruct-v0.1/3e100704-dbd3-4d05-b325-5bb4bc90e51c.json deleted file mode 100644 index dbebcb08a..000000000 --- a/data/hfopenllm_v2/DeepAutoAI/ldm_soup_Llama-3.1-8B-Instruct-v0.1/3e100704-dbd3-4d05-b325-5bb4bc90e51c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DeepAutoAI_ldm_soup_Llama-3.1-8B-Instruct-v0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ldm_soup_Llama-3.1-8B-Instruct-v0.1", - "id": "DeepAutoAI/ldm_soup_Llama-3.1-8B-Instruct-v0.1", - "developer": "DeepAutoAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7889 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5125 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1918 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2911 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4121 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3895 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DeepMount00/Lexora-Lite-3B/12f003ef-1098-4d3f-aed7-7343034157bc.json b/data/hfopenllm_v2/DeepMount00/Lexora-Lite-3B/12f003ef-1098-4d3f-aed7-7343034157bc.json deleted file mode 100644 index d3e936e6c..000000000 --- 
a/data/hfopenllm_v2/DeepMount00/Lexora-Lite-3B/12f003ef-1098-4d3f-aed7-7343034157bc.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DeepMount00_Lexora-Lite-3B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Lexora-Lite-3B", - "id": "DeepMount00/Lexora-Lite-3B", - "developer": "DeepMount00", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.086 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5776 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4873 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2304 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2743 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3966 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3602 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DeepMount00/Lexora-Lite-3B_v2/9de2e564-3a30-4f1c-80da-6432a245a64f.json b/data/hfopenllm_v2/DeepMount00/Lexora-Lite-3B_v2/9de2e564-3a30-4f1c-80da-6432a245a64f.json deleted file mode 100644 index 6c7c5b9aa..000000000 --- a/data/hfopenllm_v2/DeepMount00/Lexora-Lite-3B_v2/9de2e564-3a30-4f1c-80da-6432a245a64f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DeepMount00_Lexora-Lite-3B_v2/1770682486.623709", - 
"retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Lexora-Lite-3B_v2", - "id": "DeepMount00/Lexora-Lite-3B_v2", - "developer": "DeepMount00", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.086 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4943 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4812 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2281 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.271 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3822 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3544 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DeepMount00/Lexora-Medium-7B/dd5aaa3f-b24b-4a5b-852b-b80f4a6bf366.json b/data/hfopenllm_v2/DeepMount00/Lexora-Medium-7B/dd5aaa3f-b24b-4a5b-852b-b80f4a6bf366.json deleted file mode 100644 index 65cb6dbf1..000000000 --- a/data/hfopenllm_v2/DeepMount00/Lexora-Medium-7B/dd5aaa3f-b24b-4a5b-852b-b80f4a6bf366.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DeepMount00_Lexora-Medium-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - 
"name": "Lexora-Medium-7B", - "id": "DeepMount00/Lexora-Medium-7B", - "developer": "DeepMount00", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4103 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5145 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2221 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3054 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4439 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4325 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DeepMount00/Llama-3-8b-Ita/8d8b9fd2-43f6-4edc-8340-44d20824a7e7.json b/data/hfopenllm_v2/DeepMount00/Llama-3-8b-Ita/8d8b9fd2-43f6-4edc-8340-44d20824a7e7.json deleted file mode 100644 index c5cf4ed7b..000000000 --- a/data/hfopenllm_v2/DeepMount00/Llama-3-8b-Ita/8d8b9fd2-43f6-4edc-8340-44d20824a7e7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DeepMount00_Llama-3-8b-Ita/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-8b-Ita", - "id": "DeepMount00/Llama-3-8b-Ita", - "developer": "DeepMount00", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - 
"evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.753 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4936 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0665 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3054 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4268 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3852 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DeepMount00/Llama-3.1-8b-ITA/7fe45c20-a2c0-4acf-9425-651a1ec3b0d0.json b/data/hfopenllm_v2/DeepMount00/Llama-3.1-8b-ITA/7fe45c20-a2c0-4acf-9425-651a1ec3b0d0.json deleted file mode 100644 index 3e1809697..000000000 --- a/data/hfopenllm_v2/DeepMount00/Llama-3.1-8b-ITA/7fe45c20-a2c0-4acf-9425-651a1ec3b0d0.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DeepMount00_Llama-3.1-8b-Ita/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.1-8b-Ita", - "id": "DeepMount00/Llama-3.1-8b-Ita", - "developer": "DeepMount00", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Unknown", - "params_billions": 0.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": 
false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5365 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.517 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1707 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3062 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4487 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.396 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DeepMount00/Llama-3.1-8b-ITA/baf93ef6-56f3-4809-93f6-32dcf4730388.json b/data/hfopenllm_v2/DeepMount00/Llama-3.1-8b-ITA/baf93ef6-56f3-4809-93f6-32dcf4730388.json deleted file mode 100644 index 3d7b72221..000000000 --- a/data/hfopenllm_v2/DeepMount00/Llama-3.1-8b-ITA/baf93ef6-56f3-4809-93f6-32dcf4730388.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DeepMount00_Llama-3.1-8b-ITA/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.1-8b-ITA", - "id": "DeepMount00/Llama-3.1-8b-ITA", - "developer": "DeepMount00", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7917 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": 
"SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5109 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1088 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2878 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4136 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3876 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DeepMount00/Llama-3.1-Distilled/f6df14bd-207c-4fea-b789-c9f9aef749b3.json b/data/hfopenllm_v2/DeepMount00/Llama-3.1-Distilled/f6df14bd-207c-4fea-b789-c9f9aef749b3.json deleted file mode 100644 index 5d76d499b..000000000 --- a/data/hfopenllm_v2/DeepMount00/Llama-3.1-Distilled/f6df14bd-207c-4fea-b789-c9f9aef749b3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DeepMount00_Llama-3.1-Distilled/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.1-Distilled", - "id": "DeepMount00/Llama-3.1-Distilled", - "developer": "DeepMount00", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7844 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5101 - } 
- }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2032 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3037 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4058 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3782 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DeepMount00/Qwen2-1.5B-Ita/97766a7f-cf5b-46ae-b51e-5c5702ae000b.json b/data/hfopenllm_v2/DeepMount00/Qwen2-1.5B-Ita/97766a7f-cf5b-46ae-b51e-5c5702ae000b.json deleted file mode 100644 index dfa22cee3..000000000 --- a/data/hfopenllm_v2/DeepMount00/Qwen2-1.5B-Ita/97766a7f-cf5b-46ae-b51e-5c5702ae000b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DeepMount00_Qwen2-1.5B-Ita/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2-1.5B-Ita", - "id": "DeepMount00/Qwen2-1.5B-Ita", - "developer": "DeepMount00", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.544 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5173 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3981 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", 
- "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.114 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2626 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3504 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2772 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DeepMount00/Qwen2-1.5B-Ita_v2/d5cd2a1b-3def-4b33-a8fe-4b02e090db27.json b/data/hfopenllm_v2/DeepMount00/Qwen2-1.5B-Ita_v2/d5cd2a1b-3def-4b33-a8fe-4b02e090db27.json deleted file mode 100644 index 28635f94d..000000000 --- a/data/hfopenllm_v2/DeepMount00/Qwen2-1.5B-Ita_v2/d5cd2a1b-3def-4b33-a8fe-4b02e090db27.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DeepMount00_Qwen2-1.5B-Ita_v2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2-1.5B-Ita_v2", - "id": "DeepMount00/Qwen2-1.5B-Ita_v2", - "developer": "DeepMount00", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.544 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3954 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0967 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": 
"hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2592 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3702 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3032 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DeepMount00/Qwen2-1.5B-Ita_v3/275d4bf0-566c-4b50-86b9-38c7f45df143.json b/data/hfopenllm_v2/DeepMount00/Qwen2-1.5B-Ita_v3/275d4bf0-566c-4b50-86b9-38c7f45df143.json deleted file mode 100644 index 4d8e80e4f..000000000 --- a/data/hfopenllm_v2/DeepMount00/Qwen2-1.5B-Ita_v3/275d4bf0-566c-4b50-86b9-38c7f45df143.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DeepMount00_Qwen2-1.5B-Ita_v3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2-1.5B-Ita_v3", - "id": "DeepMount00/Qwen2-1.5B-Ita_v3", - "developer": "DeepMount00", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.544 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.489 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3948 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1042 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.2534 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3742 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3018 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DeepMount00/Qwen2-1.5B-Ita_v5/aa504db9-81f3-424f-b7d9-683ebe31f5d8.json b/data/hfopenllm_v2/DeepMount00/Qwen2-1.5B-Ita_v5/aa504db9-81f3-424f-b7d9-683ebe31f5d8.json deleted file mode 100644 index d9344d807..000000000 --- a/data/hfopenllm_v2/DeepMount00/Qwen2-1.5B-Ita_v5/aa504db9-81f3-424f-b7d9-683ebe31f5d8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DeepMount00_Qwen2-1.5B-Ita_v5/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2-1.5B-Ita_v5", - "id": "DeepMount00/Qwen2-1.5B-Ita_v5", - "developer": "DeepMount00", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.544 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4987 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4032 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1178 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2542 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3422 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2943 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DeepMount00/Qwen2-1.5B-Ita_v6/2cc209b7-ef10-435d-a840-b904ab741491.json b/data/hfopenllm_v2/DeepMount00/Qwen2-1.5B-Ita_v6/2cc209b7-ef10-435d-a840-b904ab741491.json deleted file mode 100644 index a299046f1..000000000 --- a/data/hfopenllm_v2/DeepMount00/Qwen2-1.5B-Ita_v6/2cc209b7-ef10-435d-a840-b904ab741491.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DeepMount00_Qwen2-1.5B-Ita_v6/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2-1.5B-Ita_v6", - "id": "DeepMount00/Qwen2-1.5B-Ita_v6", - "developer": "DeepMount00", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.497 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2999 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4249 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0846 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2827 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3755 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - 
"source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2872 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DeepMount00/Qwen2.5-7B-Instruct-MathCoder/9b9390ac-fd65-4a58-9834-5352aa340cdc.json b/data/hfopenllm_v2/DeepMount00/Qwen2.5-7B-Instruct-MathCoder/9b9390ac-fd65-4a58-9834-5352aa340cdc.json deleted file mode 100644 index b0e72b4e2..000000000 --- a/data/hfopenllm_v2/DeepMount00/Qwen2.5-7B-Instruct-MathCoder/9b9390ac-fd65-4a58-9834-5352aa340cdc.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DeepMount00_Qwen2.5-7B-Instruct-MathCoder/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-7B-Instruct-MathCoder", - "id": "DeepMount00/Qwen2.5-7B-Instruct-MathCoder", - "developer": "DeepMount00", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.153 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2998 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0008 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2626 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3806 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1118 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DeepMount00/mergekit-ties-okvgjfz/4efe5cd4-6b8a-4951-a63a-4c7dc390bbec.json b/data/hfopenllm_v2/DeepMount00/mergekit-ties-okvgjfz/4efe5cd4-6b8a-4951-a63a-4c7dc390bbec.json deleted file mode 100644 index 9daeaca32..000000000 --- a/data/hfopenllm_v2/DeepMount00/mergekit-ties-okvgjfz/4efe5cd4-6b8a-4951-a63a-4c7dc390bbec.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DeepMount00_mergekit-ties-okvgjfz/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "mergekit-ties-okvgjfz", - "id": "DeepMount00/mergekit-ties-okvgjfz", - "developer": "DeepMount00", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.153 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2998 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0008 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2626 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3806 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1118 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/Delta-Vector/Baldur-8B/4bc5a0db-1c88-4c61-9343-1d340305ecc5.json b/data/hfopenllm_v2/Delta-Vector/Baldur-8B/4bc5a0db-1c88-4c61-9343-1d340305ecc5.json deleted file mode 100644 index 4eaef8e92..000000000 --- a/data/hfopenllm_v2/Delta-Vector/Baldur-8B/4bc5a0db-1c88-4c61-9343-1d340305ecc5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Delta-Vector_Baldur-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Baldur-8B", - "id": "Delta-Vector/Baldur-8B", - "developer": "Delta-Vector", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4782 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5306 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1435 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.302 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4372 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3654 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Delta-Vector/Control-8B-V1.1/74527f51-dcec-4b82-8ba8-075c933404f5.json b/data/hfopenllm_v2/Delta-Vector/Control-8B-V1.1/74527f51-dcec-4b82-8ba8-075c933404f5.json deleted file mode 100644 index 5f8e589b6..000000000 --- 
a/data/hfopenllm_v2/Delta-Vector/Control-8B-V1.1/74527f51-dcec-4b82-8ba8-075c933404f5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Delta-Vector_Control-8B-V1.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Control-8B-V1.1", - "id": "Delta-Vector/Control-8B-V1.1", - "developer": "Delta-Vector", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5697 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4993 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1276 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.307 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4237 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3745 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Delta-Vector/Control-8B/ac31bc90-3854-4d38-925d-ef8dc7e75d24.json b/data/hfopenllm_v2/Delta-Vector/Control-8B/ac31bc90-3854-4d38-925d-ef8dc7e75d24.json deleted file mode 100644 index e9f8fccf8..000000000 --- a/data/hfopenllm_v2/Delta-Vector/Control-8B/ac31bc90-3854-4d38-925d-ef8dc7e75d24.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Delta-Vector_Control-8B/1770682486.623709", - "retrieved_timestamp": 
"1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Control-8B", - "id": "Delta-Vector/Control-8B", - "developer": "Delta-Vector", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.549 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5041 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.139 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3163 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4355 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3732 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Delta-Vector/Darkens-8B/88583cff-1adc-4b1b-8e68-07f0074d0ae2.json b/data/hfopenllm_v2/Delta-Vector/Darkens-8B/88583cff-1adc-4b1b-8e68-07f0074d0ae2.json deleted file mode 100644 index da2477729..000000000 --- a/data/hfopenllm_v2/Delta-Vector/Darkens-8B/88583cff-1adc-4b1b-8e68-07f0074d0ae2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Delta-Vector_Darkens-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Darkens-8B", - "id": "Delta-Vector/Darkens-8B", - 
"developer": "Delta-Vector", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 8.414 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2548 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5251 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0589 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3247 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4106 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3736 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Delta-Vector/Henbane-7b-attempt2/fadbac9e-7224-41d1-abfa-7039cbcba9f6.json b/data/hfopenllm_v2/Delta-Vector/Henbane-7b-attempt2/fadbac9e-7224-41d1-abfa-7039cbcba9f6.json deleted file mode 100644 index 18bdc7508..000000000 --- a/data/hfopenllm_v2/Delta-Vector/Henbane-7b-attempt2/fadbac9e-7224-41d1-abfa-7039cbcba9f6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Delta-Vector_Henbane-7b-attempt2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Henbane-7b-attempt2", - "id": "Delta-Vector/Henbane-7b-attempt2", - "developer": "Delta-Vector", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.0 - } - }, - "evaluation_results": [ - { - 
"evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4157 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5061 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2273 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2903 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3973 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4028 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Delta-Vector/Odin-9B/1fb90540-0fa0-44ca-ad67-1e3503f6b729.json b/data/hfopenllm_v2/Delta-Vector/Odin-9B/1fb90540-0fa0-44ca-ad67-1e3503f6b729.json deleted file mode 100644 index e6b6d563d..000000000 --- a/data/hfopenllm_v2/Delta-Vector/Odin-9B/1fb90540-0fa0-44ca-ad67-1e3503f6b729.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Delta-Vector_Odin-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Odin-9B", - "id": "Delta-Vector/Odin-9B", - "developer": "Delta-Vector", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 9.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.3692 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.544 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.145 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3414 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4648 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4047 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Delta-Vector/Tor-8B/047784e2-c1ee-40d9-a60d-e43504825801.json b/data/hfopenllm_v2/Delta-Vector/Tor-8B/047784e2-c1ee-40d9-a60d-e43504825801.json deleted file mode 100644 index 226245e5e..000000000 --- a/data/hfopenllm_v2/Delta-Vector/Tor-8B/047784e2-c1ee-40d9-a60d-e43504825801.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Delta-Vector_Tor-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Tor-8B", - "id": "Delta-Vector/Tor-8B", - "developer": "Delta-Vector", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 8.414 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2382 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": 
false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5209 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0589 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3238 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4092 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.373 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DevQuasar/DevQuasar-R1-Uncensored-Llama-8B/ee60453d-2d51-46f7-8a18-c651d590f0e7.json b/data/hfopenllm_v2/DevQuasar/DevQuasar-R1-Uncensored-Llama-8B/ee60453d-2d51-46f7-8a18-c651d590f0e7.json deleted file mode 100644 index 0c6a93423..000000000 --- a/data/hfopenllm_v2/DevQuasar/DevQuasar-R1-Uncensored-Llama-8B/ee60453d-2d51-46f7-8a18-c651d590f0e7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DevQuasar_DevQuasar-R1-Uncensored-Llama-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "DevQuasar-R1-Uncensored-Llama-8B", - "id": "DevQuasar/DevQuasar-R1-Uncensored-Llama-8B", - "developer": "DevQuasar", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3849 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5118 - } - }, - { - "evaluation_name": "MATH Level 5", - 
"source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3308 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3473 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4436 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3615 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Dongwei/DeepSeek-R1-Distill-Qwen-7B-GRPO/b0ac4b11-f7b4-4753-baae-310a92f08259.json b/data/hfopenllm_v2/Dongwei/DeepSeek-R1-Distill-Qwen-7B-GRPO/b0ac4b11-f7b4-4753-baae-310a92f08259.json deleted file mode 100644 index c68c27f32..000000000 --- a/data/hfopenllm_v2/Dongwei/DeepSeek-R1-Distill-Qwen-7B-GRPO/b0ac4b11-f7b4-4753-baae-310a92f08259.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Dongwei_DeepSeek-R1-Distill-Qwen-7B-GRPO/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "DeepSeek-R1-Distill-Qwen-7B-GRPO", - "id": "Dongwei/DeepSeek-R1-Distill-Qwen-7B-GRPO", - "developer": "Dongwei", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4038 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3443 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - 
"evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1956 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2794 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3663 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2322 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DoppelReflEx/L3-8B-R1-WolfCore-V1.5-test/324db8b3-38c7-4a2c-82e8-7bebfa38e760.json b/data/hfopenllm_v2/DoppelReflEx/L3-8B-R1-WolfCore-V1.5-test/324db8b3-38c7-4a2c-82e8-7bebfa38e760.json deleted file mode 100644 index 735b5e369..000000000 --- a/data/hfopenllm_v2/DoppelReflEx/L3-8B-R1-WolfCore-V1.5-test/324db8b3-38c7-4a2c-82e8-7bebfa38e760.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DoppelReflEx_L3-8B-R1-WolfCore-V1.5-test/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "L3-8B-R1-WolfCore-V1.5-test", - "id": "DoppelReflEx/L3-8B-R1-WolfCore-V1.5-test", - "developer": "DoppelReflEx", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3955 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5315 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.1231 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3263 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3841 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3728 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DoppelReflEx/L3-8B-R1-WolfCore/54dd9033-61b9-4f26-9cde-e04c7136524b.json b/data/hfopenllm_v2/DoppelReflEx/L3-8B-R1-WolfCore/54dd9033-61b9-4f26-9cde-e04c7136524b.json deleted file mode 100644 index 806bf87b7..000000000 --- a/data/hfopenllm_v2/DoppelReflEx/L3-8B-R1-WolfCore/54dd9033-61b9-4f26-9cde-e04c7136524b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DoppelReflEx_L3-8B-R1-WolfCore/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "L3-8B-R1-WolfCore", - "id": "DoppelReflEx/L3-8B-R1-WolfCore", - "developer": "DoppelReflEx", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3775 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5318 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1631 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3289 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4277 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3717 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DoppelReflEx/L3-8B-WolfCore/d0973d6c-373c-41cd-9e62-52470c044dac.json b/data/hfopenllm_v2/DoppelReflEx/L3-8B-WolfCore/d0973d6c-373c-41cd-9e62-52470c044dac.json deleted file mode 100644 index 1553f5b9c..000000000 --- a/data/hfopenllm_v2/DoppelReflEx/L3-8B-WolfCore/d0973d6c-373c-41cd-9e62-52470c044dac.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DoppelReflEx_L3-8B-WolfCore/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "L3-8B-WolfCore", - "id": "DoppelReflEx/L3-8B-WolfCore", - "developer": "DoppelReflEx", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4022 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5182 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0982 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3096 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - 
"hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3973 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3705 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DoppelReflEx/MN-12B-FoxFrame-test/da15da67-b316-4c2e-86a5-c1f88eece9cb.json b/data/hfopenllm_v2/DoppelReflEx/MN-12B-FoxFrame-test/da15da67-b316-4c2e-86a5-c1f88eece9cb.json deleted file mode 100644 index 49166bb43..000000000 --- a/data/hfopenllm_v2/DoppelReflEx/MN-12B-FoxFrame-test/da15da67-b316-4c2e-86a5-c1f88eece9cb.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DoppelReflEx_MN-12B-FoxFrame-test/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MN-12B-FoxFrame-test", - "id": "DoppelReflEx/MN-12B-FoxFrame-test", - "developer": "DoppelReflEx", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4222 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5456 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1397 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3079 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.4254 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3503 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DoppelReflEx/MN-12B-FoxFrame2-test/b0c34174-bfd0-4556-a3bf-92ec0ddf5ec4.json b/data/hfopenllm_v2/DoppelReflEx/MN-12B-FoxFrame2-test/b0c34174-bfd0-4556-a3bf-92ec0ddf5ec4.json deleted file mode 100644 index b9e5d4976..000000000 --- a/data/hfopenllm_v2/DoppelReflEx/MN-12B-FoxFrame2-test/b0c34174-bfd0-4556-a3bf-92ec0ddf5ec4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DoppelReflEx_MN-12B-FoxFrame2-test/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MN-12B-FoxFrame2-test", - "id": "DoppelReflEx/MN-12B-FoxFrame2-test", - "developer": "DoppelReflEx", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4319 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5485 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1405 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3146 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4252 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3569 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DoppelReflEx/MN-12B-FoxFrame3-test/bce7b15d-1670-46db-bdff-24fb38bc3fd9.json b/data/hfopenllm_v2/DoppelReflEx/MN-12B-FoxFrame3-test/bce7b15d-1670-46db-bdff-24fb38bc3fd9.json deleted file mode 100644 index 9a33bd885..000000000 --- a/data/hfopenllm_v2/DoppelReflEx/MN-12B-FoxFrame3-test/bce7b15d-1670-46db-bdff-24fb38bc3fd9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DoppelReflEx_MN-12B-FoxFrame3-test/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MN-12B-FoxFrame3-test", - "id": "DoppelReflEx/MN-12B-FoxFrame3-test", - "developer": "DoppelReflEx", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4323 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5395 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1322 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3012 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4598 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3529 - 
} - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DoppelReflEx/MN-12B-Kakigori/15e5e02f-27b9-4063-b601-42c2b17180f9.json b/data/hfopenllm_v2/DoppelReflEx/MN-12B-Kakigori/15e5e02f-27b9-4063-b601-42c2b17180f9.json deleted file mode 100644 index 14634831b..000000000 --- a/data/hfopenllm_v2/DoppelReflEx/MN-12B-Kakigori/15e5e02f-27b9-4063-b601-42c2b17180f9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DoppelReflEx_MN-12B-Kakigori/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MN-12B-Kakigori", - "id": "DoppelReflEx/MN-12B-Kakigori", - "developer": "DoppelReflEx", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3593 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5416 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1193 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3247 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4052 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3581 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DoppelReflEx/MN-12B-LilithFrame-Experiment-2/51b0c546-0dde-4668-a8b8-3b9753a31aa0.json 
b/data/hfopenllm_v2/DoppelReflEx/MN-12B-LilithFrame-Experiment-2/51b0c546-0dde-4668-a8b8-3b9753a31aa0.json deleted file mode 100644 index 91323e70b..000000000 --- a/data/hfopenllm_v2/DoppelReflEx/MN-12B-LilithFrame-Experiment-2/51b0c546-0dde-4668-a8b8-3b9753a31aa0.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DoppelReflEx_MN-12B-LilithFrame-Experiment-2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MN-12B-LilithFrame-Experiment-2", - "id": "DoppelReflEx/MN-12B-LilithFrame-Experiment-2", - "developer": "DoppelReflEx", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4299 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4983 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1073 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3255 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3804 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3276 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DoppelReflEx/MN-12B-LilithFrame-Experiment-3/45842b1c-cf68-44a7-928f-2da454cdd13f.json b/data/hfopenllm_v2/DoppelReflEx/MN-12B-LilithFrame-Experiment-3/45842b1c-cf68-44a7-928f-2da454cdd13f.json deleted file mode 100644 index 
e8a0610db..000000000 --- a/data/hfopenllm_v2/DoppelReflEx/MN-12B-LilithFrame-Experiment-3/45842b1c-cf68-44a7-928f-2da454cdd13f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DoppelReflEx_MN-12B-LilithFrame-Experiment-3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MN-12B-LilithFrame-Experiment-3", - "id": "DoppelReflEx/MN-12B-LilithFrame-Experiment-3", - "developer": "DoppelReflEx", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4128 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5468 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1344 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.328 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4039 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3604 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DoppelReflEx/MN-12B-LilithFrame-Experiment-4/c15cdefd-dbe3-432e-aab0-3c43540cd320.json b/data/hfopenllm_v2/DoppelReflEx/MN-12B-LilithFrame-Experiment-4/c15cdefd-dbe3-432e-aab0-3c43540cd320.json deleted file mode 100644 index 61d3a73e2..000000000 --- a/data/hfopenllm_v2/DoppelReflEx/MN-12B-LilithFrame-Experiment-4/c15cdefd-dbe3-432e-aab0-3c43540cd320.json +++ /dev/null @@ -1,132 
+0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DoppelReflEx_MN-12B-LilithFrame-Experiment-4/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MN-12B-LilithFrame-Experiment-4", - "id": "DoppelReflEx/MN-12B-LilithFrame-Experiment-4", - "developer": "DoppelReflEx", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3981 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5534 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1224 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3171 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4371 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3649 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DoppelReflEx/MN-12B-LilithFrame/1f489afa-a01d-40f3-836a-9e386c502d1d.json b/data/hfopenllm_v2/DoppelReflEx/MN-12B-LilithFrame/1f489afa-a01d-40f3-836a-9e386c502d1d.json deleted file mode 100644 index 87c8dd012..000000000 --- a/data/hfopenllm_v2/DoppelReflEx/MN-12B-LilithFrame/1f489afa-a01d-40f3-836a-9e386c502d1d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DoppelReflEx_MN-12B-LilithFrame/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { 
- "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MN-12B-LilithFrame", - "id": "DoppelReflEx/MN-12B-LilithFrame", - "developer": "DoppelReflEx", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.451 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4944 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1156 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3196 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3896 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3256 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DoppelReflEx/MN-12B-LilithFrame/94bcc87e-eb06-4321-9b72-2f99168cf92a.json b/data/hfopenllm_v2/DoppelReflEx/MN-12B-LilithFrame/94bcc87e-eb06-4321-9b72-2f99168cf92a.json deleted file mode 100644 index 8a84d871c..000000000 --- a/data/hfopenllm_v2/DoppelReflEx/MN-12B-LilithFrame/94bcc87e-eb06-4321-9b72-2f99168cf92a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DoppelReflEx_MN-12B-LilithFrame/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MN-12B-LilithFrame", - "id": 
"DoppelReflEx/MN-12B-LilithFrame", - "developer": "DoppelReflEx", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.436 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4956 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0589 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3205 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3843 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3237 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DoppelReflEx/MN-12B-Mimicore-GreenSnake/c0bc9811-4d7c-412f-a12b-3e6eab2e5a6f.json b/data/hfopenllm_v2/DoppelReflEx/MN-12B-Mimicore-GreenSnake/c0bc9811-4d7c-412f-a12b-3e6eab2e5a6f.json deleted file mode 100644 index c66fd0b26..000000000 --- a/data/hfopenllm_v2/DoppelReflEx/MN-12B-Mimicore-GreenSnake/c0bc9811-4d7c-412f-a12b-3e6eab2e5a6f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DoppelReflEx_MN-12B-Mimicore-GreenSnake/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MN-12B-Mimicore-GreenSnake", - "id": "DoppelReflEx/MN-12B-Mimicore-GreenSnake", - "developer": "DoppelReflEx", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": 
"MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.478 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5481 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.139 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3247 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4306 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3651 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DoppelReflEx/MN-12B-Mimicore-Nocturne/b5a8b278-69e9-41ba-89ee-8fd6b2d90a1c.json b/data/hfopenllm_v2/DoppelReflEx/MN-12B-Mimicore-Nocturne/b5a8b278-69e9-41ba-89ee-8fd6b2d90a1c.json deleted file mode 100644 index 8af1a665e..000000000 --- a/data/hfopenllm_v2/DoppelReflEx/MN-12B-Mimicore-Nocturne/b5a8b278-69e9-41ba-89ee-8fd6b2d90a1c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DoppelReflEx_MN-12B-Mimicore-Nocturne/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MN-12B-Mimicore-Nocturne", - "id": "DoppelReflEx/MN-12B-Mimicore-Nocturne", - "developer": "DoppelReflEx", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", 
- "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3957 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5703 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1057 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3196 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4569 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3634 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DoppelReflEx/MN-12B-Mimicore-Orochi-v2-Experiment/a3ad7f0f-64bd-42a1-bc7d-d7d4cbbd80fd.json b/data/hfopenllm_v2/DoppelReflEx/MN-12B-Mimicore-Orochi-v2-Experiment/a3ad7f0f-64bd-42a1-bc7d-d7d4cbbd80fd.json deleted file mode 100644 index eb3e1d5ef..000000000 --- a/data/hfopenllm_v2/DoppelReflEx/MN-12B-Mimicore-Orochi-v2-Experiment/a3ad7f0f-64bd-42a1-bc7d-d7d4cbbd80fd.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DoppelReflEx_MN-12B-Mimicore-Orochi-v2-Experiment/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MN-12B-Mimicore-Orochi-v2-Experiment", - "id": "DoppelReflEx/MN-12B-Mimicore-Orochi-v2-Experiment", - "developer": "DoppelReflEx", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2842 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5323 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0612 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2978 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4574 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3423 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DoppelReflEx/MN-12B-Mimicore-Orochi-v3-Experiment/f07c3a4a-2a8e-45c4-a726-be95726df2db.json b/data/hfopenllm_v2/DoppelReflEx/MN-12B-Mimicore-Orochi-v3-Experiment/f07c3a4a-2a8e-45c4-a726-be95726df2db.json deleted file mode 100644 index 368728f64..000000000 --- a/data/hfopenllm_v2/DoppelReflEx/MN-12B-Mimicore-Orochi-v3-Experiment/f07c3a4a-2a8e-45c4-a726-be95726df2db.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DoppelReflEx_MN-12B-Mimicore-Orochi-v3-Experiment/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MN-12B-Mimicore-Orochi-v3-Experiment", - "id": "DoppelReflEx/MN-12B-Mimicore-Orochi-v3-Experiment", - "developer": "DoppelReflEx", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.4102 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5438 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1216 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2928 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4438 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3396 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DoppelReflEx/MN-12B-Mimicore-Orochi-v4-Experiment/f36d56b8-cd77-4d69-a51d-39025bcfcdfd.json b/data/hfopenllm_v2/DoppelReflEx/MN-12B-Mimicore-Orochi-v4-Experiment/f36d56b8-cd77-4d69-a51d-39025bcfcdfd.json deleted file mode 100644 index e36c363a4..000000000 --- a/data/hfopenllm_v2/DoppelReflEx/MN-12B-Mimicore-Orochi-v4-Experiment/f36d56b8-cd77-4d69-a51d-39025bcfcdfd.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DoppelReflEx_MN-12B-Mimicore-Orochi-v4-Experiment/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MN-12B-Mimicore-Orochi-v4-Experiment", - "id": "DoppelReflEx/MN-12B-Mimicore-Orochi-v4-Experiment", - "developer": "DoppelReflEx", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4321 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": 
"hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5463 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1208 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3054 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4449 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.352 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DoppelReflEx/MN-12B-Mimicore-Orochi/65acabdc-ea5f-426c-820b-2b79f2b20b44.json b/data/hfopenllm_v2/DoppelReflEx/MN-12B-Mimicore-Orochi/65acabdc-ea5f-426c-820b-2b79f2b20b44.json deleted file mode 100644 index 443a4c901..000000000 --- a/data/hfopenllm_v2/DoppelReflEx/MN-12B-Mimicore-Orochi/65acabdc-ea5f-426c-820b-2b79f2b20b44.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DoppelReflEx_MN-12B-Mimicore-Orochi/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MN-12B-Mimicore-Orochi", - "id": "DoppelReflEx/MN-12B-Mimicore-Orochi", - "developer": "DoppelReflEx", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.462 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 
1.0 - }, - "score_details": { - "score": 0.5498 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.136 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3129 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4546 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3447 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DoppelReflEx/MN-12B-Mimicore-WhiteSnake-v2-Experiment-1/96b00cfa-1383-4b36-a043-17eb39678ffc.json b/data/hfopenllm_v2/DoppelReflEx/MN-12B-Mimicore-WhiteSnake-v2-Experiment-1/96b00cfa-1383-4b36-a043-17eb39678ffc.json deleted file mode 100644 index 995274f5e..000000000 --- a/data/hfopenllm_v2/DoppelReflEx/MN-12B-Mimicore-WhiteSnake-v2-Experiment-1/96b00cfa-1383-4b36-a043-17eb39678ffc.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DoppelReflEx_MN-12B-Mimicore-WhiteSnake-v2-Experiment-1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MN-12B-Mimicore-WhiteSnake-v2-Experiment-1", - "id": "DoppelReflEx/MN-12B-Mimicore-WhiteSnake-v2-Experiment-1", - "developer": "DoppelReflEx", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3909 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4866 - } - }, - { - "evaluation_name": "MATH 
Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0785 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3054 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.379 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3114 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DoppelReflEx/MN-12B-Mimicore-WhiteSnake-v2-Experiment-2/3b8a796e-6bde-4506-8335-bd3cc72482e1.json b/data/hfopenllm_v2/DoppelReflEx/MN-12B-Mimicore-WhiteSnake-v2-Experiment-2/3b8a796e-6bde-4506-8335-bd3cc72482e1.json deleted file mode 100644 index 037fde141..000000000 --- a/data/hfopenllm_v2/DoppelReflEx/MN-12B-Mimicore-WhiteSnake-v2-Experiment-2/3b8a796e-6bde-4506-8335-bd3cc72482e1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DoppelReflEx_MN-12B-Mimicore-WhiteSnake-v2-Experiment-2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MN-12B-Mimicore-WhiteSnake-v2-Experiment-2", - "id": "DoppelReflEx/MN-12B-Mimicore-WhiteSnake-v2-Experiment-2", - "developer": "DoppelReflEx", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3124 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5126 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": 
"hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1125 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2961 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3975 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3314 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DoppelReflEx/MN-12B-Mimicore-WhiteSnake-v2-Experiment-3/a93e99e2-ca13-4cdc-9904-7ae5cc82c623.json b/data/hfopenllm_v2/DoppelReflEx/MN-12B-Mimicore-WhiteSnake-v2-Experiment-3/a93e99e2-ca13-4cdc-9904-7ae5cc82c623.json deleted file mode 100644 index acc3b57f9..000000000 --- a/data/hfopenllm_v2/DoppelReflEx/MN-12B-Mimicore-WhiteSnake-v2-Experiment-3/a93e99e2-ca13-4cdc-9904-7ae5cc82c623.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DoppelReflEx_MN-12B-Mimicore-WhiteSnake-v2-Experiment-3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MN-12B-Mimicore-WhiteSnake-v2-Experiment-3", - "id": "DoppelReflEx/MN-12B-Mimicore-WhiteSnake-v2-Experiment-3", - "developer": "DoppelReflEx", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4302 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4812 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": 
{ - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0899 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.302 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3684 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3198 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DoppelReflEx/MN-12B-Mimicore-WhiteSnake-v2-Experiment-4/65d9e237-2757-459e-94e7-e382213e4eeb.json b/data/hfopenllm_v2/DoppelReflEx/MN-12B-Mimicore-WhiteSnake-v2-Experiment-4/65d9e237-2757-459e-94e7-e382213e4eeb.json deleted file mode 100644 index bd2328696..000000000 --- a/data/hfopenllm_v2/DoppelReflEx/MN-12B-Mimicore-WhiteSnake-v2-Experiment-4/65d9e237-2757-459e-94e7-e382213e4eeb.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DoppelReflEx_MN-12B-Mimicore-WhiteSnake-v2-Experiment-4/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MN-12B-Mimicore-WhiteSnake-v2-Experiment-4", - "id": "DoppelReflEx/MN-12B-Mimicore-WhiteSnake-v2-Experiment-4", - "developer": "DoppelReflEx", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4241 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5185 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, 
- "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.114 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3104 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4002 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3342 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DoppelReflEx/MN-12B-Mimicore-WhiteSnake/c3f44524-4c75-4cd0-9f5d-79c8b08f6f77.json b/data/hfopenllm_v2/DoppelReflEx/MN-12B-Mimicore-WhiteSnake/c3f44524-4c75-4cd0-9f5d-79c8b08f6f77.json deleted file mode 100644 index 2aafdb196..000000000 --- a/data/hfopenllm_v2/DoppelReflEx/MN-12B-Mimicore-WhiteSnake/c3f44524-4c75-4cd0-9f5d-79c8b08f6f77.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DoppelReflEx_MN-12B-Mimicore-WhiteSnake/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MN-12B-Mimicore-WhiteSnake", - "id": "DoppelReflEx/MN-12B-Mimicore-WhiteSnake", - "developer": "DoppelReflEx", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4438 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5605 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1314 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - 
"dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.318 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4569 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3658 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DoppelReflEx/MN-12B-Unleashed-Twilight/2e7d3674-d0b0-4b87-8bd8-8202114b7665.json b/data/hfopenllm_v2/DoppelReflEx/MN-12B-Unleashed-Twilight/2e7d3674-d0b0-4b87-8bd8-8202114b7665.json deleted file mode 100644 index c22d0071e..000000000 --- a/data/hfopenllm_v2/DoppelReflEx/MN-12B-Unleashed-Twilight/2e7d3674-d0b0-4b87-8bd8-8202114b7665.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DoppelReflEx_MN-12B-Unleashed-Twilight/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MN-12B-Unleashed-Twilight", - "id": "DoppelReflEx/MN-12B-Unleashed-Twilight", - "developer": "DoppelReflEx", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3505 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5521 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0959 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3289 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4384 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3678 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DoppelReflEx/MN-12B-WolFrame/30d21295-beb1-4179-8c6f-7bac79b29474.json b/data/hfopenllm_v2/DoppelReflEx/MN-12B-WolFrame/30d21295-beb1-4179-8c6f-7bac79b29474.json deleted file mode 100644 index 4c6187607..000000000 --- a/data/hfopenllm_v2/DoppelReflEx/MN-12B-WolFrame/30d21295-beb1-4179-8c6f-7bac79b29474.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DoppelReflEx_MN-12B-WolFrame/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MN-12B-WolFrame", - "id": "DoppelReflEx/MN-12B-WolFrame", - "developer": "DoppelReflEx", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4397 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5117 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1314 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3104 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": 
"TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4015 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3393 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DoppelReflEx/MiniusLight-24B-test/e2fc95de-b9d9-4043-b55c-aa2819d4f52f.json b/data/hfopenllm_v2/DoppelReflEx/MiniusLight-24B-test/e2fc95de-b9d9-4043-b55c-aa2819d4f52f.json deleted file mode 100644 index 3848f175e..000000000 --- a/data/hfopenllm_v2/DoppelReflEx/MiniusLight-24B-test/e2fc95de-b9d9-4043-b55c-aa2819d4f52f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DoppelReflEx_MiniusLight-24B-test/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MiniusLight-24B-test", - "id": "DoppelReflEx/MiniusLight-24B-test", - "developer": "DoppelReflEx", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 23.572 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0394 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6334 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0257 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3683 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.4093 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5182 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DoppelReflEx/MiniusLight-24B-v1b-test/7fbd7f97-baf9-4acd-ba0c-90ffbf0c47a5.json b/data/hfopenllm_v2/DoppelReflEx/MiniusLight-24B-v1b-test/7fbd7f97-baf9-4acd-ba0c-90ffbf0c47a5.json deleted file mode 100644 index fa3042ed0..000000000 --- a/data/hfopenllm_v2/DoppelReflEx/MiniusLight-24B-v1b-test/7fbd7f97-baf9-4acd-ba0c-90ffbf0c47a5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DoppelReflEx_MiniusLight-24B-v1b-test/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MiniusLight-24B-v1b-test", - "id": "DoppelReflEx/MiniusLight-24B-v1b-test", - "developer": "DoppelReflEx", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 23.572 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3791 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6617 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2394 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3792 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4557 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5365 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DoppelReflEx/MiniusLight-24B-v1c-test/336effcd-d8fc-4477-846f-70fc40bdc111.json b/data/hfopenllm_v2/DoppelReflEx/MiniusLight-24B-v1c-test/336effcd-d8fc-4477-846f-70fc40bdc111.json deleted file mode 100644 index c424357f5..000000000 --- a/data/hfopenllm_v2/DoppelReflEx/MiniusLight-24B-v1c-test/336effcd-d8fc-4477-846f-70fc40bdc111.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DoppelReflEx_MiniusLight-24B-v1c-test/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MiniusLight-24B-v1c-test", - "id": "DoppelReflEx/MiniusLight-24B-v1c-test", - "developer": "DoppelReflEx", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 23.572 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3786 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6753 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2968 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3951 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4634 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.5487 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DoppelReflEx/MiniusLight-24B-v1d-test/28f87820-d587-498e-b713-7c0af0cdc324.json b/data/hfopenllm_v2/DoppelReflEx/MiniusLight-24B-v1d-test/28f87820-d587-498e-b713-7c0af0cdc324.json deleted file mode 100644 index f4fb232bb..000000000 --- a/data/hfopenllm_v2/DoppelReflEx/MiniusLight-24B-v1d-test/28f87820-d587-498e-b713-7c0af0cdc324.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DoppelReflEx_MiniusLight-24B-v1d-test/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MiniusLight-24B-v1d-test", - "id": "DoppelReflEx/MiniusLight-24B-v1d-test", - "developer": "DoppelReflEx", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 23.572 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4032 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6712 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2946 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3951 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4621 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5489 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DoppelReflEx/MiniusLight-24B/f1b671ab-ebb3-43ec-86fa-832982d04cc1.json 
b/data/hfopenllm_v2/DoppelReflEx/MiniusLight-24B/f1b671ab-ebb3-43ec-86fa-832982d04cc1.json deleted file mode 100644 index 5cc2e4360..000000000 --- a/data/hfopenllm_v2/DoppelReflEx/MiniusLight-24B/f1b671ab-ebb3-43ec-86fa-832982d04cc1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DoppelReflEx_MiniusLight-24B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MiniusLight-24B", - "id": "DoppelReflEx/MiniusLight-24B", - "developer": "DoppelReflEx", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 23.572 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2577 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6256 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1261 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3582 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4319 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5091 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/Again-8B-Model_Stock/327cde83-d107-4455-bc03-7e03026c52e6.json b/data/hfopenllm_v2/DreadPoor/Again-8B-Model_Stock/327cde83-d107-4455-bc03-7e03026c52e6.json deleted file mode 100644 index e2c3518ef..000000000 --- a/data/hfopenllm_v2/DreadPoor/Again-8B-Model_Stock/327cde83-d107-4455-bc03-7e03026c52e6.json +++ 
/dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_Again-8B-Model_Stock/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Again-8B-Model_Stock", - "id": "DreadPoor/Again-8B-Model_Stock", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 4.015 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6724 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.531 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1201 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3012 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3987 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3518 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/Alita99-8B-LINEAR/7497b8fb-9a7d-46dc-868e-1a2bbcdc7860.json b/data/hfopenllm_v2/DreadPoor/Alita99-8B-LINEAR/7497b8fb-9a7d-46dc-868e-1a2bbcdc7860.json deleted file mode 100644 index de70affaf..000000000 --- a/data/hfopenllm_v2/DreadPoor/Alita99-8B-LINEAR/7497b8fb-9a7d-46dc-868e-1a2bbcdc7860.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_Alita99-8B-LINEAR/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - 
"source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Alita99-8B-LINEAR", - "id": "DreadPoor/Alita99-8B-LINEAR", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.719 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5442 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1647 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3163 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4266 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3809 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/AnotherTest/92c8afbe-7735-40c8-af0e-29da687c2070.json b/data/hfopenllm_v2/DreadPoor/AnotherTest/92c8afbe-7735-40c8-af0e-29da687c2070.json deleted file mode 100644 index c4e25da29..000000000 --- a/data/hfopenllm_v2/DreadPoor/AnotherTest/92c8afbe-7735-40c8-af0e-29da687c2070.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_AnotherTest/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "AnotherTest", - "id": "DreadPoor/AnotherTest", - "developer": "DreadPoor", - "inference_platform": "unknown", - 
"additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4701 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4683 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0619 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2978 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4213 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2875 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/Aspire-8B-model_stock/bca052ac-6556-49d8-94e3-f4bda560a5d3.json b/data/hfopenllm_v2/DreadPoor/Aspire-8B-model_stock/bca052ac-6556-49d8-94e3-f4bda560a5d3.json deleted file mode 100644 index 6e0e74802..000000000 --- a/data/hfopenllm_v2/DreadPoor/Aspire-8B-model_stock/bca052ac-6556-49d8-94e3-f4bda560a5d3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_Aspire-8B-model_stock/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Aspire-8B-model_stock", - "id": "DreadPoor/Aspire-8B-model_stock", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - 
"source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7141 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5278 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1495 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3146 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4212 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3763 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/Aspire_1.3-8B_model-stock/5f74fe6e-8575-4cea-959b-e6ba03c7e273.json b/data/hfopenllm_v2/DreadPoor/Aspire_1.3-8B_model-stock/5f74fe6e-8575-4cea-959b-e6ba03c7e273.json deleted file mode 100644 index 91197dc03..000000000 --- a/data/hfopenllm_v2/DreadPoor/Aspire_1.3-8B_model-stock/5f74fe6e-8575-4cea-959b-e6ba03c7e273.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_Aspire_1.3-8B_model-stock/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Aspire_1.3-8B_model-stock", - "id": "DreadPoor/Aspire_1.3-8B_model-stock", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, 
- "max_score": 1.0 - }, - "score_details": { - "score": 0.7062 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5302 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1692 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3079 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4105 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3716 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/Aspire_V2-8B-Model_Stock/b0f696f5-ed70-4293-999d-a9121192c137.json b/data/hfopenllm_v2/DreadPoor/Aspire_V2-8B-Model_Stock/b0f696f5-ed70-4293-999d-a9121192c137.json deleted file mode 100644 index 206f0a344..000000000 --- a/data/hfopenllm_v2/DreadPoor/Aspire_V2-8B-Model_Stock/b0f696f5-ed70-4293-999d-a9121192c137.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_Aspire_V2-8B-Model_Stock/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Aspire_V2-8B-Model_Stock", - "id": "DreadPoor/Aspire_V2-8B-Model_Stock", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7371 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.533 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.176 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3205 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3894 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3697 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/Aspire_V2.1-8B-Model_Stock/18751a6f-062c-4915-bbe0-ae222cf9ae0b.json b/data/hfopenllm_v2/DreadPoor/Aspire_V2.1-8B-Model_Stock/18751a6f-062c-4915-bbe0-ae222cf9ae0b.json deleted file mode 100644 index 2e1a6a761..000000000 --- a/data/hfopenllm_v2/DreadPoor/Aspire_V2.1-8B-Model_Stock/18751a6f-062c-4915-bbe0-ae222cf9ae0b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_Aspire_V2.1-8B-Model_Stock/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Aspire_V2.1-8B-Model_Stock", - "id": "DreadPoor/Aspire_V2.1-8B-Model_Stock", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7238 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5236 - 
} - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1767 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3096 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4136 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3801 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/Aspire_V2_ALT-8B-Model_Stock/398ebe04-638f-4a11-b99d-6778ff3ff97b.json b/data/hfopenllm_v2/DreadPoor/Aspire_V2_ALT-8B-Model_Stock/398ebe04-638f-4a11-b99d-6778ff3ff97b.json deleted file mode 100644 index 98689999d..000000000 --- a/data/hfopenllm_v2/DreadPoor/Aspire_V2_ALT-8B-Model_Stock/398ebe04-638f-4a11-b99d-6778ff3ff97b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_Aspire_V2_ALT-8B-Model_Stock/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Aspire_V2_ALT-8B-Model_Stock", - "id": "DreadPoor/Aspire_V2_ALT-8B-Model_Stock", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7381 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5266 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - 
"metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.173 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3247 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3975 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3727 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/Aspire_V2_ALT_ROW-8B-Model_Stock/b4f197f2-3456-4221-b222-10dfbbb50f56.json b/data/hfopenllm_v2/DreadPoor/Aspire_V2_ALT_ROW-8B-Model_Stock/b4f197f2-3456-4221-b222-10dfbbb50f56.json deleted file mode 100644 index 2a86794fd..000000000 --- a/data/hfopenllm_v2/DreadPoor/Aspire_V2_ALT_ROW-8B-Model_Stock/b4f197f2-3456-4221-b222-10dfbbb50f56.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_Aspire_V2_ALT_ROW-8B-Model_Stock/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Aspire_V2_ALT_ROW-8B-Model_Stock", - "id": "DreadPoor/Aspire_V2_ALT_ROW-8B-Model_Stock", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7381 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5266 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 
1.0 - }, - "score_details": { - "score": 0.173 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3247 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3975 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3727 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/Aspire_V3-8B-Model_Stock/0a2fa86a-f9b3-4a49-b215-4cd3ee9b4c22.json b/data/hfopenllm_v2/DreadPoor/Aspire_V3-8B-Model_Stock/0a2fa86a-f9b3-4a49-b215-4cd3ee9b4c22.json deleted file mode 100644 index f208a5c89..000000000 --- a/data/hfopenllm_v2/DreadPoor/Aspire_V3-8B-Model_Stock/0a2fa86a-f9b3-4a49-b215-4cd3ee9b4c22.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_Aspire_V3-8B-Model_Stock/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Aspire_V3-8B-Model_Stock", - "id": "DreadPoor/Aspire_V3-8B-Model_Stock", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5119 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5268 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1858 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3054 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4015 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3642 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/Aspire_V4-8B-Model_Stock/1561ec50-1cb9-47ce-9db1-09efe9c3fc61.json b/data/hfopenllm_v2/DreadPoor/Aspire_V4-8B-Model_Stock/1561ec50-1cb9-47ce-9db1-09efe9c3fc61.json deleted file mode 100644 index b90acd330..000000000 --- a/data/hfopenllm_v2/DreadPoor/Aspire_V4-8B-Model_Stock/1561ec50-1cb9-47ce-9db1-09efe9c3fc61.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_Aspire_V4-8B-Model_Stock/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Aspire_V4-8B-Model_Stock", - "id": "DreadPoor/Aspire_V4-8B-Model_Stock", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7694 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5314 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1926 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3045 - } - }, - 
{ - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3867 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3708 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/Aspire_V4_ALT-8B-Model_Stock/496525ff-394a-4b7b-9d93-f5b38d2a1ee3.json b/data/hfopenllm_v2/DreadPoor/Aspire_V4_ALT-8B-Model_Stock/496525ff-394a-4b7b-9d93-f5b38d2a1ee3.json deleted file mode 100644 index d418f3c08..000000000 --- a/data/hfopenllm_v2/DreadPoor/Aspire_V4_ALT-8B-Model_Stock/496525ff-394a-4b7b-9d93-f5b38d2a1ee3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_Aspire_V4_ALT-8B-Model_Stock/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Aspire_V4_ALT-8B-Model_Stock", - "id": "DreadPoor/Aspire_V4_ALT-8B-Model_Stock", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7366 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5268 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1813 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3205 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": 
"Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.392 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3682 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/Asymmetric_Linearity-8B-Model_Stock/37071760-d24c-43cc-9965-d8c7873c0ee8.json b/data/hfopenllm_v2/DreadPoor/Asymmetric_Linearity-8B-Model_Stock/37071760-d24c-43cc-9965-d8c7873c0ee8.json deleted file mode 100644 index 77c551bb3..000000000 --- a/data/hfopenllm_v2/DreadPoor/Asymmetric_Linearity-8B-Model_Stock/37071760-d24c-43cc-9965-d8c7873c0ee8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_Asymmetric_Linearity-8B-Model_Stock/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Asymmetric_Linearity-8B-Model_Stock", - "id": "DreadPoor/Asymmetric_Linearity-8B-Model_Stock", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 4.015 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7174 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5465 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1647 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3146 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.4199 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3844 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/Aurora_faustus-8B-LINEAR/91a71a49-5dd4-43b1-9e1c-fd9492236712.json b/data/hfopenllm_v2/DreadPoor/Aurora_faustus-8B-LINEAR/91a71a49-5dd4-43b1-9e1c-fd9492236712.json deleted file mode 100644 index ea3d38b72..000000000 --- a/data/hfopenllm_v2/DreadPoor/Aurora_faustus-8B-LINEAR/91a71a49-5dd4-43b1-9e1c-fd9492236712.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_Aurora_faustus-8B-LINEAR/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Aurora_faustus-8B-LINEAR", - "id": "DreadPoor/Aurora_faustus-8B-LINEAR", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7281 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5516 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1707 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.307 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4146 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3842 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/Aurora_faustus-8B-LORABLATED/d1d48abb-6dcf-4905-958f-c3a3e75feac6.json b/data/hfopenllm_v2/DreadPoor/Aurora_faustus-8B-LORABLATED/d1d48abb-6dcf-4905-958f-c3a3e75feac6.json deleted file mode 100644 index 1abc59610..000000000 --- a/data/hfopenllm_v2/DreadPoor/Aurora_faustus-8B-LORABLATED/d1d48abb-6dcf-4905-958f-c3a3e75feac6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_Aurora_faustus-8B-LORABLATED/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Aurora_faustus-8B-LORABLATED", - "id": "DreadPoor/Aurora_faustus-8B-LORABLATED", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7527 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5392 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1488 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.302 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4239 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3673 - } 
- } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/Aurora_faustus-8B-LORABLATED_ALT/68282f29-f56f-420b-bd1e-9cc54783c1a5.json b/data/hfopenllm_v2/DreadPoor/Aurora_faustus-8B-LORABLATED_ALT/68282f29-f56f-420b-bd1e-9cc54783c1a5.json deleted file mode 100644 index 12017ff39..000000000 --- a/data/hfopenllm_v2/DreadPoor/Aurora_faustus-8B-LORABLATED_ALT/68282f29-f56f-420b-bd1e-9cc54783c1a5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_Aurora_faustus-8B-LORABLATED_ALT/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Aurora_faustus-8B-LORABLATED_ALT", - "id": "DreadPoor/Aurora_faustus-8B-LORABLATED_ALT", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7378 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5388 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1586 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2987 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4225 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3694 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/Autumn_Dawn-8B-LINEAR/cd1c84dc-6c6e-4789-add7-0e3ca783b0ea.json 
b/data/hfopenllm_v2/DreadPoor/Autumn_Dawn-8B-LINEAR/cd1c84dc-6c6e-4789-add7-0e3ca783b0ea.json deleted file mode 100644 index ab92e181e..000000000 --- a/data/hfopenllm_v2/DreadPoor/Autumn_Dawn-8B-LINEAR/cd1c84dc-6c6e-4789-add7-0e3ca783b0ea.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_Autumn_Dawn-8B-LINEAR/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Autumn_Dawn-8B-LINEAR", - "id": "DreadPoor/Autumn_Dawn-8B-LINEAR", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7293 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5459 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1858 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2936 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4186 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3968 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/BaeZel-8B-LINEAR/22a9d3b8-ac45-4433-8926-5d28681af922.json b/data/hfopenllm_v2/DreadPoor/BaeZel-8B-LINEAR/22a9d3b8-ac45-4433-8926-5d28681af922.json deleted file mode 100644 index 52019043a..000000000 --- a/data/hfopenllm_v2/DreadPoor/BaeZel-8B-LINEAR/22a9d3b8-ac45-4433-8926-5d28681af922.json +++ 
/dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_BaeZel-8B-LINEAR/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "BaeZel-8B-LINEAR", - "id": "DreadPoor/BaeZel-8B-LINEAR", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7378 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5464 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1813 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3213 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4227 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3861 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/BaeZel-8B-Model_Stock/57c4b9eb-dffd-4623-a2d5-b2374d3c9109.json b/data/hfopenllm_v2/DreadPoor/BaeZel-8B-Model_Stock/57c4b9eb-dffd-4623-a2d5-b2374d3c9109.json deleted file mode 100644 index 0d62d9248..000000000 --- a/data/hfopenllm_v2/DreadPoor/BaeZel-8B-Model_Stock/57c4b9eb-dffd-4623-a2d5-b2374d3c9109.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_BaeZel-8B-Model_Stock/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - 
"source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "BaeZel-8B-Model_Stock", - "id": "DreadPoor/BaeZel-8B-Model_Stock", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7713 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5408 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1639 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3138 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4199 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.388 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/BaeZel_V2-8B-Model_Stock/24adbd8c-df3a-4b58-94e6-61a3dfa6828e.json b/data/hfopenllm_v2/DreadPoor/BaeZel_V2-8B-Model_Stock/24adbd8c-df3a-4b58-94e6-61a3dfa6828e.json deleted file mode 100644 index 92150ff2e..000000000 --- a/data/hfopenllm_v2/DreadPoor/BaeZel_V2-8B-Model_Stock/24adbd8c-df3a-4b58-94e6-61a3dfa6828e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_BaeZel_V2-8B-Model_Stock/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "BaeZel_V2-8B-Model_Stock", - "id": 
"DreadPoor/BaeZel_V2-8B-Model_Stock", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7677 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5374 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1798 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2995 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4186 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3947 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/BaeZel_V2_ALT-8B-Model_Stock/6ed62f64-c2be-4bca-b17d-bd0184a3d498.json b/data/hfopenllm_v2/DreadPoor/BaeZel_V2_ALT-8B-Model_Stock/6ed62f64-c2be-4bca-b17d-bd0184a3d498.json deleted file mode 100644 index 4fbb404d5..000000000 --- a/data/hfopenllm_v2/DreadPoor/BaeZel_V2_ALT-8B-Model_Stock/6ed62f64-c2be-4bca-b17d-bd0184a3d498.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_BaeZel_V2_ALT-8B-Model_Stock/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "BaeZel_V2_ALT-8B-Model_Stock", - "id": "DreadPoor/BaeZel_V2_ALT-8B-Model_Stock", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - 
"params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7677 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5374 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1798 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2995 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4186 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3947 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/BaeZel_V3-8B-Model_Stock/db9e4d03-03a8-4a10-8739-16bbcfbb06d4.json b/data/hfopenllm_v2/DreadPoor/BaeZel_V3-8B-Model_Stock/db9e4d03-03a8-4a10-8739-16bbcfbb06d4.json deleted file mode 100644 index 6a2b2bd8f..000000000 --- a/data/hfopenllm_v2/DreadPoor/BaeZel_V3-8B-Model_Stock/db9e4d03-03a8-4a10-8739-16bbcfbb06d4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_BaeZel_V3-8B-Model_Stock/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "BaeZel_V3-8B-Model_Stock", - "id": "DreadPoor/BaeZel_V3-8B-Model_Stock", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7832 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5392 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1896 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3205 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4174 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3888 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/Blunt_Edge-8B-SLERP/7b0fc4fe-51c8-4f01-b07b-5bca05b40859.json b/data/hfopenllm_v2/DreadPoor/Blunt_Edge-8B-SLERP/7b0fc4fe-51c8-4f01-b07b-5bca05b40859.json deleted file mode 100644 index c3a3e4356..000000000 --- a/data/hfopenllm_v2/DreadPoor/Blunt_Edge-8B-SLERP/7b0fc4fe-51c8-4f01-b07b-5bca05b40859.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_Blunt_Edge-8B-SLERP/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Blunt_Edge-8B-SLERP", - "id": "DreadPoor/Blunt_Edge-8B-SLERP", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7497 - } - }, - { - "evaluation_name": 
"BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5389 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1858 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3112 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4174 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3767 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/BulkUp/6f286418-d8e3-4c11-8941-cfe5a18b1037.json b/data/hfopenllm_v2/DreadPoor/BulkUp/6f286418-d8e3-4c11-8941-cfe5a18b1037.json deleted file mode 100644 index dfd176ab1..000000000 --- a/data/hfopenllm_v2/DreadPoor/BulkUp/6f286418-d8e3-4c11-8941-cfe5a18b1037.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_BulkUp/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "BulkUp", - "id": "DreadPoor/BulkUp", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1778 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.287 
- } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2475 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3447 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.111 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/Cadence-8B-LINEAR/b0a83b1f-3af2-45e8-9d88-d7302a529112.json b/data/hfopenllm_v2/DreadPoor/Cadence-8B-LINEAR/b0a83b1f-3af2-45e8-9d88-d7302a529112.json deleted file mode 100644 index 6b7ee9836..000000000 --- a/data/hfopenllm_v2/DreadPoor/Cadence-8B-LINEAR/b0a83b1f-3af2-45e8-9d88-d7302a529112.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_Cadence-8B-LINEAR/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Cadence-8B-LINEAR", - "id": "DreadPoor/Cadence-8B-LINEAR", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7682 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5433 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 
5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1677 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3029 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4173 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3803 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/Caelid-8B-Model_Stock/0462fce1-51b4-48d8-8278-a90048ffd637.json b/data/hfopenllm_v2/DreadPoor/Caelid-8B-Model_Stock/0462fce1-51b4-48d8-8278-a90048ffd637.json deleted file mode 100644 index 2d3dba8f9..000000000 --- a/data/hfopenllm_v2/DreadPoor/Caelid-8B-Model_Stock/0462fce1-51b4-48d8-8278-a90048ffd637.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_Caelid-8B-Model_Stock/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Caelid-8B-Model_Stock", - "id": "DreadPoor/Caelid-8B-Model_Stock", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7247 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.546 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1511 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - 
"source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3104 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4001 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3816 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/Casuar-9B-Model_Stock/e02f597c-c368-4223-ac90-c99d82c90634.json b/data/hfopenllm_v2/DreadPoor/Casuar-9B-Model_Stock/e02f597c-c368-4223-ac90-c99d82c90634.json deleted file mode 100644 index c97dcca00..000000000 --- a/data/hfopenllm_v2/DreadPoor/Casuar-9B-Model_Stock/e02f597c-c368-4223-ac90-c99d82c90634.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_Casuar-9B-Model_Stock/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Casuar-9B-Model_Stock", - "id": "DreadPoor/Casuar-9B-Model_Stock", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 9.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7765 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6107 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.213 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, 
- "score_details": { - "score": 0.3448 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4165 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4156 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/Condensed_Milk-8B-Model_Stock/32e63ffc-c64e-4562-ba99-14873f5bac2e.json b/data/hfopenllm_v2/DreadPoor/Condensed_Milk-8B-Model_Stock/32e63ffc-c64e-4562-ba99-14873f5bac2e.json deleted file mode 100644 index 3cd2321bc..000000000 --- a/data/hfopenllm_v2/DreadPoor/Condensed_Milk-8B-Model_Stock/32e63ffc-c64e-4562-ba99-14873f5bac2e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_Condensed_Milk-8B-Model_Stock/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Condensed_Milk-8B-Model_Stock", - "id": "DreadPoor/Condensed_Milk-8B-Model_Stock", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7536 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5435 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1745 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3213 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - 
}, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.416 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3876 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/CoolerCoder-8B-LINEAR/6af4faad-05c2-488b-9685-e11ae4e1cbf0.json b/data/hfopenllm_v2/DreadPoor/CoolerCoder-8B-LINEAR/6af4faad-05c2-488b-9685-e11ae4e1cbf0.json deleted file mode 100644 index c49036cd8..000000000 --- a/data/hfopenllm_v2/DreadPoor/CoolerCoder-8B-LINEAR/6af4faad-05c2-488b-9685-e11ae4e1cbf0.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_CoolerCoder-8B-LINEAR/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "CoolerCoder-8B-LINEAR", - "id": "DreadPoor/CoolerCoder-8B-LINEAR", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4519 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4762 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0793 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2903 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3964 - } - }, - { - 
"evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3159 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/Damasteel-8B-LINEAR/8aa7701b-7019-44a0-851f-cfc9108fdfbd.json b/data/hfopenllm_v2/DreadPoor/Damasteel-8B-LINEAR/8aa7701b-7019-44a0-851f-cfc9108fdfbd.json deleted file mode 100644 index 7160aa1de..000000000 --- a/data/hfopenllm_v2/DreadPoor/Damasteel-8B-LINEAR/8aa7701b-7019-44a0-851f-cfc9108fdfbd.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_Damasteel-8B-LINEAR/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Damasteel-8B-LINEAR", - "id": "DreadPoor/Damasteel-8B-LINEAR", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7384 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5388 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1669 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2987 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4212 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": 
false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3779 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/Dearly_Beloved-8B-TIES/a2f95fad-5ab5-47d0-b9aa-33358c673caf.json b/data/hfopenllm_v2/DreadPoor/Dearly_Beloved-8B-TIES/a2f95fad-5ab5-47d0-b9aa-33358c673caf.json deleted file mode 100644 index a39f8ede9..000000000 --- a/data/hfopenllm_v2/DreadPoor/Dearly_Beloved-8B-TIES/a2f95fad-5ab5-47d0-b9aa-33358c673caf.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_Dearly_Beloved-8B-TIES/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Dearly_Beloved-8B-TIES", - "id": "DreadPoor/Dearly_Beloved-8B-TIES", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8267 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.405 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2115 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2987 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4175 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2827 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/DreadPoor/Decayed-8B-LINEAR/aef73a77-9df7-4d4f-89ef-50905d326198.json b/data/hfopenllm_v2/DreadPoor/Decayed-8B-LINEAR/aef73a77-9df7-4d4f-89ef-50905d326198.json deleted file mode 100644 index 28e908428..000000000 --- a/data/hfopenllm_v2/DreadPoor/Decayed-8B-LINEAR/aef73a77-9df7-4d4f-89ef-50905d326198.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_Decayed-8B-LINEAR/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Decayed-8B-LINEAR", - "id": "DreadPoor/Decayed-8B-LINEAR", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7676 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5417 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1715 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3096 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4186 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3763 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/Derivative-8B-Model_Stock/e9ffdfb6-6f91-4bac-89d2-40b1eb43f3ee.json b/data/hfopenllm_v2/DreadPoor/Derivative-8B-Model_Stock/e9ffdfb6-6f91-4bac-89d2-40b1eb43f3ee.json deleted file mode 100644 index 5b83a9506..000000000 --- 
a/data/hfopenllm_v2/DreadPoor/Derivative-8B-Model_Stock/e9ffdfb6-6f91-4bac-89d2-40b1eb43f3ee.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_Derivative-8B-Model_Stock/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Derivative-8B-Model_Stock", - "id": "DreadPoor/Derivative-8B-Model_Stock", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7667 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5395 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.179 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3171 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.42 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3811 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/Derivative_V2-8B-Model_Stock/8ff39438-907c-465f-ac7a-5a25cfd8d824.json b/data/hfopenllm_v2/DreadPoor/Derivative_V2-8B-Model_Stock/8ff39438-907c-465f-ac7a-5a25cfd8d824.json deleted file mode 100644 index 332287ece..000000000 --- a/data/hfopenllm_v2/DreadPoor/Derivative_V2-8B-Model_Stock/8ff39438-907c-465f-ac7a-5a25cfd8d824.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/DreadPoor_Derivative_V2-8B-Model_Stock/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Derivative_V2-8B-Model_Stock", - "id": "DreadPoor/Derivative_V2-8B-Model_Stock", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7537 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5393 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1798 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.307 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4123 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3856 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/Derivative_V2_ALT-8B-Model_Stock/83d831c5-a74f-4699-9961-664a7a51b7b8.json b/data/hfopenllm_v2/DreadPoor/Derivative_V2_ALT-8B-Model_Stock/83d831c5-a74f-4699-9961-664a7a51b7b8.json deleted file mode 100644 index 2b2d015d1..000000000 --- a/data/hfopenllm_v2/DreadPoor/Derivative_V2_ALT-8B-Model_Stock/83d831c5-a74f-4699-9961-664a7a51b7b8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_Derivative_V2_ALT-8B-Model_Stock/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - 
"source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Derivative_V2_ALT-8B-Model_Stock", - "id": "DreadPoor/Derivative_V2_ALT-8B-Model_Stock", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.772 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5365 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1881 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3112 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4135 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3882 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/Derivative_V3-8B-Model_Stock/83fb88ec-f640-4c1e-b71c-53a123fc4c2e.json b/data/hfopenllm_v2/DreadPoor/Derivative_V3-8B-Model_Stock/83fb88ec-f640-4c1e-b71c-53a123fc4c2e.json deleted file mode 100644 index cfc2e036f..000000000 --- a/data/hfopenllm_v2/DreadPoor/Derivative_V3-8B-Model_Stock/83fb88ec-f640-4c1e-b71c-53a123fc4c2e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_Derivative_V3-8B-Model_Stock/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Derivative_V3-8B-Model_Stock", - 
"id": "DreadPoor/Derivative_V3-8B-Model_Stock", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6964 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5243 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1465 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2945 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.415 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3502 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/Elusive_Dragon_Heart-8B-LINEAR/3811cc34-45cb-4932-b862-39bf042331e0.json b/data/hfopenllm_v2/DreadPoor/Elusive_Dragon_Heart-8B-LINEAR/3811cc34-45cb-4932-b862-39bf042331e0.json deleted file mode 100644 index cf8031e7d..000000000 --- a/data/hfopenllm_v2/DreadPoor/Elusive_Dragon_Heart-8B-LINEAR/3811cc34-45cb-4932-b862-39bf042331e0.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_Elusive_Dragon_Heart-8B-LINEAR/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Elusive_Dragon_Heart-8B-LINEAR", - "id": "DreadPoor/Elusive_Dragon_Heart-8B-LINEAR", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": 
"LlamaForCausalLM", - "params_billions": 4.015 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7131 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5456 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.148 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3062 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4146 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3814 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/Emu_Eggs-9B-Model_Stock/5b2a16a1-7a2a-40b7-add6-b99378b6af00.json b/data/hfopenllm_v2/DreadPoor/Emu_Eggs-9B-Model_Stock/5b2a16a1-7a2a-40b7-add6-b99378b6af00.json deleted file mode 100644 index 34eb53856..000000000 --- a/data/hfopenllm_v2/DreadPoor/Emu_Eggs-9B-Model_Stock/5b2a16a1-7a2a-40b7-add6-b99378b6af00.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_Emu_Eggs-9B-Model_Stock/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Emu_Eggs-9B-Model_Stock", - "id": "DreadPoor/Emu_Eggs-9B-Model_Stock", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 9.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" 
- }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7607 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6052 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.21 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3331 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4071 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4227 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/Eunoia_Vespera-8B-LINEAR/1dc2a5bb-40b6-401e-8f1c-6110cb4c0f0d.json b/data/hfopenllm_v2/DreadPoor/Eunoia_Vespera-8B-LINEAR/1dc2a5bb-40b6-401e-8f1c-6110cb4c0f0d.json deleted file mode 100644 index be6c2967a..000000000 --- a/data/hfopenllm_v2/DreadPoor/Eunoia_Vespera-8B-LINEAR/1dc2a5bb-40b6-401e-8f1c-6110cb4c0f0d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_Eunoia_Vespera-8B-LINEAR/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Eunoia_Vespera-8B-LINEAR", - "id": "DreadPoor/Eunoia_Vespera-8B-LINEAR", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7235 - } 
- }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5399 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1541 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.307 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4185 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3839 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/Fu_sion_HA-8B-SLERP/742e0a1c-7496-4076-bdbf-ada0a8e528c2.json b/data/hfopenllm_v2/DreadPoor/Fu_sion_HA-8B-SLERP/742e0a1c-7496-4076-bdbf-ada0a8e528c2.json deleted file mode 100644 index cc65f1f79..000000000 --- a/data/hfopenllm_v2/DreadPoor/Fu_sion_HA-8B-SLERP/742e0a1c-7496-4076-bdbf-ada0a8e528c2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_Fu_sion_HA-8B-SLERP/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Fu_sion_HA-8B-SLERP", - "id": "DreadPoor/Fu_sion_HA-8B-SLERP", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7609 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5373 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1752 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.323 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.416 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3825 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/HOT_STINKING_GARBAGE/f0664035-3256-444c-b848-ef603e0d46b5.json b/data/hfopenllm_v2/DreadPoor/HOT_STINKING_GARBAGE/f0664035-3256-444c-b848-ef603e0d46b5.json deleted file mode 100644 index 0bd391e53..000000000 --- a/data/hfopenllm_v2/DreadPoor/HOT_STINKING_GARBAGE/f0664035-3256-444c-b848-ef603e0d46b5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_HOT_STINKING_GARBAGE/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "HOT_STINKING_GARBAGE", - "id": "DreadPoor/HOT_STINKING_GARBAGE", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5754 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4884 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", 
- "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0672 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2752 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.425 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3017 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/H_the_eighth-8B-LINEAR/9159aaa6-8663-491f-901a-74da4c343d20.json b/data/hfopenllm_v2/DreadPoor/H_the_eighth-8B-LINEAR/9159aaa6-8663-491f-901a-74da4c343d20.json deleted file mode 100644 index 506db7e4d..000000000 --- a/data/hfopenllm_v2/DreadPoor/H_the_eighth-8B-LINEAR/9159aaa6-8663-491f-901a-74da4c343d20.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_H_the_eighth-8B-LINEAR/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "H_the_eighth-8B-LINEAR", - "id": "DreadPoor/H_the_eighth-8B-LINEAR", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7469 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5384 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 
- }, - "score_details": { - "score": 0.1775 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.328 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4173 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3824 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/Happy_New_Year-8B-Model_Stock/5179b145-9fdb-4ab5-8cca-87966ecf6519.json b/data/hfopenllm_v2/DreadPoor/Happy_New_Year-8B-Model_Stock/5179b145-9fdb-4ab5-8cca-87966ecf6519.json deleted file mode 100644 index c37605ceb..000000000 --- a/data/hfopenllm_v2/DreadPoor/Happy_New_Year-8B-Model_Stock/5179b145-9fdb-4ab5-8cca-87966ecf6519.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_Happy_New_Year-8B-Model_Stock/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Happy_New_Year-8B-Model_Stock", - "id": "DreadPoor/Happy_New_Year-8B-Model_Stock", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7616 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5368 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1594 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": 
"Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3138 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4186 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3879 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/Heart_Stolen-8B-Model_Stock/da872193-1d25-4e8e-bc22-9138a9d121ba.json b/data/hfopenllm_v2/DreadPoor/Heart_Stolen-8B-Model_Stock/da872193-1d25-4e8e-bc22-9138a9d121ba.json deleted file mode 100644 index 5afe7e574..000000000 --- a/data/hfopenllm_v2/DreadPoor/Heart_Stolen-8B-Model_Stock/da872193-1d25-4e8e-bc22-9138a9d121ba.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_Heart_Stolen-8B-Model_Stock/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Heart_Stolen-8B-Model_Stock", - "id": "DreadPoor/Heart_Stolen-8B-Model_Stock", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7245 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5395 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1722 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.3171 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4162 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3794 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/Heart_Stolen-ALT-8B-Model_Stock/967fdd26-1f8a-40d6-8f7d-ca731c7ef2e3.json b/data/hfopenllm_v2/DreadPoor/Heart_Stolen-ALT-8B-Model_Stock/967fdd26-1f8a-40d6-8f7d-ca731c7ef2e3.json deleted file mode 100644 index 1724bbb8b..000000000 --- a/data/hfopenllm_v2/DreadPoor/Heart_Stolen-ALT-8B-Model_Stock/967fdd26-1f8a-40d6-8f7d-ca731c7ef2e3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_Heart_Stolen-ALT-8B-Model_Stock/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Heart_Stolen-ALT-8B-Model_Stock", - "id": "DreadPoor/Heart_Stolen-ALT-8B-Model_Stock", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7184 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5263 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1563 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3012 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": 
"TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4055 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3772 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/Here_We_Go_Again-8B-SLERP/dd615b4c-189e-4361-bcf4-879fd59b28a2.json b/data/hfopenllm_v2/DreadPoor/Here_We_Go_Again-8B-SLERP/dd615b4c-189e-4361-bcf4-879fd59b28a2.json deleted file mode 100644 index 040701af7..000000000 --- a/data/hfopenllm_v2/DreadPoor/Here_We_Go_Again-8B-SLERP/dd615b4c-189e-4361-bcf4-879fd59b28a2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_Here_We_Go_Again-8B-SLERP/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Here_We_Go_Again-8B-SLERP", - "id": "DreadPoor/Here_We_Go_Again-8B-SLERP", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 4.015 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7442 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.546 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.173 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3188 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { 
- "score": 0.4187 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3873 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/Howdy-8B-LINEAR/0aeee3e8-00ce-4f95-bbd9-307d93a194a4.json b/data/hfopenllm_v2/DreadPoor/Howdy-8B-LINEAR/0aeee3e8-00ce-4f95-bbd9-307d93a194a4.json deleted file mode 100644 index 6da88e2f6..000000000 --- a/data/hfopenllm_v2/DreadPoor/Howdy-8B-LINEAR/0aeee3e8-00ce-4f95-bbd9-307d93a194a4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_Howdy-8B-LINEAR/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Howdy-8B-LINEAR", - "id": "DreadPoor/Howdy-8B-LINEAR", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7378 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5384 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1775 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3146 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4121 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3807 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/Incidental-8B-Model_Stock/8c583b51-4349-48af-98d9-8eaaf43d60b6.json b/data/hfopenllm_v2/DreadPoor/Incidental-8B-Model_Stock/8c583b51-4349-48af-98d9-8eaaf43d60b6.json deleted file mode 100644 index f986ac603..000000000 --- a/data/hfopenllm_v2/DreadPoor/Incidental-8B-Model_Stock/8c583b51-4349-48af-98d9-8eaaf43d60b6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_Incidental-8B-Model_Stock/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Incidental-8B-Model_Stock", - "id": "DreadPoor/Incidental-8B-Model_Stock", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7482 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5452 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1616 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3029 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.424 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3873 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/DreadPoor/Irina-8B-model_stock/34aab556-5e97-4ea2-9ada-d17dc3624be2.json b/data/hfopenllm_v2/DreadPoor/Irina-8B-model_stock/34aab556-5e97-4ea2-9ada-d17dc3624be2.json deleted file mode 100644 index b23dc6544..000000000 --- a/data/hfopenllm_v2/DreadPoor/Irina-8B-model_stock/34aab556-5e97-4ea2-9ada-d17dc3624be2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_Irina-8B-model_stock/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Irina-8B-model_stock", - "id": "DreadPoor/Irina-8B-model_stock", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6799 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5237 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.102 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2844 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4003 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3574 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/Kindling-8B-Model_Stock/fbd9d5e3-15f7-45ce-92fb-368b3bfcc526.json b/data/hfopenllm_v2/DreadPoor/Kindling-8B-Model_Stock/fbd9d5e3-15f7-45ce-92fb-368b3bfcc526.json deleted file mode 100644 index 26f843f70..000000000 
--- a/data/hfopenllm_v2/DreadPoor/Kindling-8B-Model_Stock/fbd9d5e3-15f7-45ce-92fb-368b3bfcc526.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_Kindling-8B-Model_Stock/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Kindling-8B-Model_Stock", - "id": "DreadPoor/Kindling-8B-Model_Stock", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7308 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5492 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1752 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.318 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4068 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.383 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/L3.1-BaeZel-8B-Della/b177e329-ce6b-4bc6-aeac-1c01306e6b1f.json b/data/hfopenllm_v2/DreadPoor/L3.1-BaeZel-8B-Della/b177e329-ce6b-4bc6-aeac-1c01306e6b1f.json deleted file mode 100644 index 5d86d24a4..000000000 --- a/data/hfopenllm_v2/DreadPoor/L3.1-BaeZel-8B-Della/b177e329-ce6b-4bc6-aeac-1c01306e6b1f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/DreadPoor_L3.1-BaeZel-8B-Della/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "L3.1-BaeZel-8B-Della", - "id": "DreadPoor/L3.1-BaeZel-8B-Della", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.518 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5448 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1745 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3196 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.42 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3902 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/Laughing_Stock-8B-Model_Stock/7f371c11-e8f0-4233-b359-aac39c0a1110.json b/data/hfopenllm_v2/DreadPoor/Laughing_Stock-8B-Model_Stock/7f371c11-e8f0-4233-b359-aac39c0a1110.json deleted file mode 100644 index aec03b176..000000000 --- a/data/hfopenllm_v2/DreadPoor/Laughing_Stock-8B-Model_Stock/7f371c11-e8f0-4233-b359-aac39c0a1110.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_Laughing_Stock-8B-Model_Stock/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - 
"source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Laughing_Stock-8B-Model_Stock", - "id": "DreadPoor/Laughing_Stock-8B-Model_Stock", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.719 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5449 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1579 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2894 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4146 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3764 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/Lava_Lamp-8B-SLERP/9f758d4e-d121-4688-8ece-8dc67a499811.json b/data/hfopenllm_v2/DreadPoor/Lava_Lamp-8B-SLERP/9f758d4e-d121-4688-8ece-8dc67a499811.json deleted file mode 100644 index e0d95bfba..000000000 --- a/data/hfopenllm_v2/DreadPoor/Lava_Lamp-8B-SLERP/9f758d4e-d121-4688-8ece-8dc67a499811.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_Lava_Lamp-8B-SLERP/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Lava_Lamp-8B-SLERP", - "id": "DreadPoor/Lava_Lamp-8B-SLERP", - "developer": "DreadPoor", - "inference_platform": 
"unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7381 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5368 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1737 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3054 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4187 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.375 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/LemonP-8B-Model_Stock/903b8c71-d54d-4ce4-9845-71eb8ca8733a.json b/data/hfopenllm_v2/DreadPoor/LemonP-8B-Model_Stock/903b8c71-d54d-4ce4-9845-71eb8ca8733a.json deleted file mode 100644 index bd0f0baf1..000000000 --- a/data/hfopenllm_v2/DreadPoor/LemonP-8B-Model_Stock/903b8c71-d54d-4ce4-9845-71eb8ca8733a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_LemonP-8B-Model_Stock/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "LemonP-8B-Model_Stock", - "id": "DreadPoor/LemonP-8B-Model_Stock", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": 
"IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7676 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5439 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1767 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3029 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4081 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4004 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/Lydia_of_Whiterun-8B-LINEAR/9bdc17bf-7b81-49c8-81f5-c6dfa31b449b.json b/data/hfopenllm_v2/DreadPoor/Lydia_of_Whiterun-8B-LINEAR/9bdc17bf-7b81-49c8-81f5-c6dfa31b449b.json deleted file mode 100644 index d1ecbf772..000000000 --- a/data/hfopenllm_v2/DreadPoor/Lydia_of_Whiterun-8B-LINEAR/9bdc17bf-7b81-49c8-81f5-c6dfa31b449b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_Lydia_of_Whiterun-8B-LINEAR/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Lydia_of_Whiterun-8B-LINEAR", - "id": "DreadPoor/Lydia_of_Whiterun-8B-LINEAR", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7603 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.538 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1767 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3163 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4251 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3801 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/Matryoshka-8B-LINEAR/28109e00-87c1-4809-a4fc-dddebba52621.json b/data/hfopenllm_v2/DreadPoor/Matryoshka-8B-LINEAR/28109e00-87c1-4809-a4fc-dddebba52621.json deleted file mode 100644 index 81c216cdf..000000000 --- a/data/hfopenllm_v2/DreadPoor/Matryoshka-8B-LINEAR/28109e00-87c1-4809-a4fc-dddebba52621.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_Matryoshka-8B-LINEAR/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Matryoshka-8B-LINEAR", - "id": "DreadPoor/Matryoshka-8B-LINEAR", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7263 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" 
- }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5444 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1752 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3205 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4252 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3866 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/Mercury_In_Retrograde-8b-Model-Stock/6a21381b-426d-4a5d-ad6d-2aeb57ed14c5.json b/data/hfopenllm_v2/DreadPoor/Mercury_In_Retrograde-8b-Model-Stock/6a21381b-426d-4a5d-ad6d-2aeb57ed14c5.json deleted file mode 100644 index 84fcc3ee6..000000000 --- a/data/hfopenllm_v2/DreadPoor/Mercury_In_Retrograde-8b-Model-Stock/6a21381b-426d-4a5d-ad6d-2aeb57ed14c5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_Mercury_In_Retrograde-8b-Model-Stock/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mercury_In_Retrograde-8b-Model-Stock", - "id": "DreadPoor/Mercury_In_Retrograde-8b-Model-Stock", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7296 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 
0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5391 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1647 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3163 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4199 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3829 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/Minthy-8B-Model_Stock/03a8091c-473e-4fbe-af70-35f791a23a0f.json b/data/hfopenllm_v2/DreadPoor/Minthy-8B-Model_Stock/03a8091c-473e-4fbe-af70-35f791a23a0f.json deleted file mode 100644 index 8bace15e3..000000000 --- a/data/hfopenllm_v2/DreadPoor/Minthy-8B-Model_Stock/03a8091c-473e-4fbe-af70-35f791a23a0f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_Minthy-8B-Model_Stock/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Minthy-8B-Model_Stock", - "id": "DreadPoor/Minthy-8B-Model_Stock", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7658 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5353 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": 
"DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1918 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3037 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4094 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3993 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/Minthy_ALT-8B-Model_Stock/ed75e9ed-841b-4783-a201-bc72651afd0a.json b/data/hfopenllm_v2/DreadPoor/Minthy_ALT-8B-Model_Stock/ed75e9ed-841b-4783-a201-bc72651afd0a.json deleted file mode 100644 index bb6a90a55..000000000 --- a/data/hfopenllm_v2/DreadPoor/Minthy_ALT-8B-Model_Stock/ed75e9ed-841b-4783-a201-bc72651afd0a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_Minthy_ALT-8B-Model_Stock/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Minthy_ALT-8B-Model_Stock", - "id": "DreadPoor/Minthy_ALT-8B-Model_Stock", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6992 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5375 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.176 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3062 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4225 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3674 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/Minthy_V2-8B-Model_Stock/38cd418c-9770-49d2-8b30-ac47e445cee3.json b/data/hfopenllm_v2/DreadPoor/Minthy_V2-8B-Model_Stock/38cd418c-9770-49d2-8b30-ac47e445cee3.json deleted file mode 100644 index f602119ec..000000000 --- a/data/hfopenllm_v2/DreadPoor/Minthy_V2-8B-Model_Stock/38cd418c-9770-49d2-8b30-ac47e445cee3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_Minthy_V2-8B-Model_Stock/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Minthy_V2-8B-Model_Stock", - "id": "DreadPoor/Minthy_V2-8B-Model_Stock", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7126 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5491 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1594 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2945 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4199 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3737 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/Minus_Penus-8B-Model_Stock/d49b6a48-ae81-467d-87c5-b17f9ca306f8.json b/data/hfopenllm_v2/DreadPoor/Minus_Penus-8B-Model_Stock/d49b6a48-ae81-467d-87c5-b17f9ca306f8.json deleted file mode 100644 index 214eb3bc5..000000000 --- a/data/hfopenllm_v2/DreadPoor/Minus_Penus-8B-Model_Stock/d49b6a48-ae81-467d-87c5-b17f9ca306f8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_Minus_Penus-8B-Model_Stock/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Minus_Penus-8B-Model_Stock", - "id": "DreadPoor/Minus_Penus-8B-Model_Stock", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7311 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5344 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2002 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 
0.3096 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4019 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3752 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/Morphing-8B-Model_Stock/39b7e250-9f71-4833-941e-85692a48b6e6.json b/data/hfopenllm_v2/DreadPoor/Morphing-8B-Model_Stock/39b7e250-9f71-4833-941e-85692a48b6e6.json deleted file mode 100644 index 0295f0f91..000000000 --- a/data/hfopenllm_v2/DreadPoor/Morphing-8B-Model_Stock/39b7e250-9f71-4833-941e-85692a48b6e6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_Morphing-8B-Model_Stock/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Morphing-8B-Model_Stock", - "id": "DreadPoor/Morphing-8B-Model_Stock", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7445 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5397 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1888 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2936 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on 
MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4069 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3852 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/Not_Even_My_Final_Form-8B-Model_Stock/c0d102a2-ff8c-45ac-a825-31472b98b871.json b/data/hfopenllm_v2/DreadPoor/Not_Even_My_Final_Form-8B-Model_Stock/c0d102a2-ff8c-45ac-a825-31472b98b871.json deleted file mode 100644 index c07ee195d..000000000 --- a/data/hfopenllm_v2/DreadPoor/Not_Even_My_Final_Form-8B-Model_Stock/c0d102a2-ff8c-45ac-a825-31472b98b871.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_Not_Even_My_Final_Form-8B-Model_Stock/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Not_Even_My_Final_Form-8B-Model_Stock", - "id": "DreadPoor/Not_Even_My_Final_Form-8B-Model_Stock", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7722 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5351 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.176 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2953 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 
0.4147 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.384 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/Nother_One-8B-Model_Stock/7c5674a8-6a1c-483e-be9c-b0a6d00d3ac4.json b/data/hfopenllm_v2/DreadPoor/Nother_One-8B-Model_Stock/7c5674a8-6a1c-483e-be9c-b0a6d00d3ac4.json deleted file mode 100644 index 3a365d7fc..000000000 --- a/data/hfopenllm_v2/DreadPoor/Nother_One-8B-Model_Stock/7c5674a8-6a1c-483e-be9c-b0a6d00d3ac4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_Nother_One-8B-Model_Stock/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Nother_One-8B-Model_Stock", - "id": "DreadPoor/Nother_One-8B-Model_Stock", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6863 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5205 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1518 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2894 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.387 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3595 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/Noxis-8B-LINEAR/d34b899e-b067-4c9c-9fa2-439f8b2d589d.json b/data/hfopenllm_v2/DreadPoor/Noxis-8B-LINEAR/d34b899e-b067-4c9c-9fa2-439f8b2d589d.json deleted file mode 100644 index 73cd484d6..000000000 --- a/data/hfopenllm_v2/DreadPoor/Noxis-8B-LINEAR/d34b899e-b067-4c9c-9fa2-439f8b2d589d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_Noxis-8B-LINEAR/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Noxis-8B-LINEAR", - "id": "DreadPoor/Noxis-8B-LINEAR", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6913 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5421 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1979 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3188 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4231 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.366 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/DreadPoor/Nullsworn-12B-LINEAR/8c7b2332-510b-42d3-bcbb-e177c35d27d5.json b/data/hfopenllm_v2/DreadPoor/Nullsworn-12B-LINEAR/8c7b2332-510b-42d3-bcbb-e177c35d27d5.json deleted file mode 100644 index 4c95be2fd..000000000 --- a/data/hfopenllm_v2/DreadPoor/Nullsworn-12B-LINEAR/8c7b2332-510b-42d3-bcbb-e177c35d27d5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_Nullsworn-12B-LINEAR/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Nullsworn-12B-LINEAR", - "id": "DreadPoor/Nullsworn-12B-LINEAR", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4436 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5483 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1125 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3079 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.435 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3645 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/Nwah-8B-Model_Stock/685f107f-e431-4dba-a117-8d6f1dd2c296.json b/data/hfopenllm_v2/DreadPoor/Nwah-8B-Model_Stock/685f107f-e431-4dba-a117-8d6f1dd2c296.json deleted file mode 100644 index ed00a7392..000000000 --- 
a/data/hfopenllm_v2/DreadPoor/Nwah-8B-Model_Stock/685f107f-e431-4dba-a117-8d6f1dd2c296.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_Nwah-8B-Model_Stock/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Nwah-8B-Model_Stock", - "id": "DreadPoor/Nwah-8B-Model_Stock", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7716 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5384 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1798 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3104 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4039 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3807 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/ONeil-model_stock-8B/e1570804-85b6-4518-a099-5f21ab27d12c.json b/data/hfopenllm_v2/DreadPoor/ONeil-model_stock-8B/e1570804-85b6-4518-a099-5f21ab27d12c.json deleted file mode 100644 index 43888a5d9..000000000 --- a/data/hfopenllm_v2/DreadPoor/ONeil-model_stock-8B/e1570804-85b6-4518-a099-5f21ab27d12c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_ONeil-model_stock-8B/1770682486.623709", - 
"retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ONeil-model_stock-8B", - "id": "DreadPoor/ONeil-model_stock-8B", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6786 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5548 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1012 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3054 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4173 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3599 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/Oh_Boy-8B-LINEAR/a779ebec-76ab-4a1e-aa4f-d1a6adfe2d5c.json b/data/hfopenllm_v2/DreadPoor/Oh_Boy-8B-LINEAR/a779ebec-76ab-4a1e-aa4f-d1a6adfe2d5c.json deleted file mode 100644 index a4b064bc6..000000000 --- a/data/hfopenllm_v2/DreadPoor/Oh_Boy-8B-LINEAR/a779ebec-76ab-4a1e-aa4f-d1a6adfe2d5c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_Oh_Boy-8B-LINEAR/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": 
"Oh_Boy-8B-LINEAR", - "id": "DreadPoor/Oh_Boy-8B-LINEAR", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7503 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5375 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1782 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3079 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4108 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3849 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/OrangeJ-8B-Model_Stock/1ed7f6ed-d04d-4cfc-a36a-1ef0f72d4814.json b/data/hfopenllm_v2/DreadPoor/OrangeJ-8B-Model_Stock/1ed7f6ed-d04d-4cfc-a36a-1ef0f72d4814.json deleted file mode 100644 index 009d6a3f3..000000000 --- a/data/hfopenllm_v2/DreadPoor/OrangeJ-8B-Model_Stock/1ed7f6ed-d04d-4cfc-a36a-1ef0f72d4814.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_OrangeJ-8B-Model_Stock/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "OrangeJ-8B-Model_Stock", - "id": "DreadPoor/OrangeJ-8B-Model_Stock", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - 
"params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7841 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5413 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.176 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3012 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4028 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3969 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/Promissum_Mane-8B-LINEAR-lorablated/c901a9ee-069a-4e3e-ac52-3017d67d8800.json b/data/hfopenllm_v2/DreadPoor/Promissum_Mane-8B-LINEAR-lorablated/c901a9ee-069a-4e3e-ac52-3017d67d8800.json deleted file mode 100644 index fbaecf746..000000000 --- a/data/hfopenllm_v2/DreadPoor/Promissum_Mane-8B-LINEAR-lorablated/c901a9ee-069a-4e3e-ac52-3017d67d8800.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_Promissum_Mane-8B-LINEAR-lorablated/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Promissum_Mane-8B-LINEAR-lorablated", - "id": "DreadPoor/Promissum_Mane-8B-LINEAR-lorablated", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - 
"source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7156 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5435 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1533 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3037 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4198 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3739 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/Promissum_Mane-8B-LINEAR/08317b59-ff74-43c8-bea5-2a266c38816e.json b/data/hfopenllm_v2/DreadPoor/Promissum_Mane-8B-LINEAR/08317b59-ff74-43c8-bea5-2a266c38816e.json deleted file mode 100644 index c14c68ca6..000000000 --- a/data/hfopenllm_v2/DreadPoor/Promissum_Mane-8B-LINEAR/08317b59-ff74-43c8-bea5-2a266c38816e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_Promissum_Mane-8B-LINEAR/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Promissum_Mane-8B-LINEAR", - "id": "DreadPoor/Promissum_Mane-8B-LINEAR", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.715 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5458 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1556 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3045 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.42 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3851 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/RPMash-8B-Model_Stock/4106d4d3-344a-4c1f-b9ce-a3140d435013.json b/data/hfopenllm_v2/DreadPoor/RPMash-8B-Model_Stock/4106d4d3-344a-4c1f-b9ce-a3140d435013.json deleted file mode 100644 index 4c1772a72..000000000 --- a/data/hfopenllm_v2/DreadPoor/RPMash-8B-Model_Stock/4106d4d3-344a-4c1f-b9ce-a3140d435013.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_RPMash-8B-Model_Stock/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "RPMash-8B-Model_Stock", - "id": "DreadPoor/RPMash-8B-Model_Stock", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4564 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5169 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.108 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2869 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4054 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3604 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/RPMash_V3-8B-Model_Stock/2b308fad-8494-4056-8b84-82733cd2710a.json b/data/hfopenllm_v2/DreadPoor/RPMash_V3-8B-Model_Stock/2b308fad-8494-4056-8b84-82733cd2710a.json deleted file mode 100644 index 1c26373d3..000000000 --- a/data/hfopenllm_v2/DreadPoor/RPMash_V3-8B-Model_Stock/2b308fad-8494-4056-8b84-82733cd2710a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_RPMash_V3-8B-Model_Stock/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "RPMash_V3-8B-Model_Stock", - "id": "DreadPoor/RPMash_V3-8B-Model_Stock", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7049 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5217 - } - }, - { - "evaluation_name": 
"MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1042 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3003 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3778 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3614 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/Rusted_Gold-8B-LINEAR/93c867d0-4f10-440c-838c-91d1633fe584.json b/data/hfopenllm_v2/DreadPoor/Rusted_Gold-8B-LINEAR/93c867d0-4f10-440c-838c-91d1633fe584.json deleted file mode 100644 index f8b506d6b..000000000 --- a/data/hfopenllm_v2/DreadPoor/Rusted_Gold-8B-LINEAR/93c867d0-4f10-440c-838c-91d1633fe584.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_Rusted_Gold-8B-LINEAR/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Rusted_Gold-8B-LINEAR", - "id": "DreadPoor/Rusted_Gold-8B-LINEAR", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7296 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5387 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1934 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2987 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4178 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.378 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/Rusted_Platinum-8B-LINEAR/1a4a69c5-4acc-4ad9-adb2-bd9cf0fa2875.json b/data/hfopenllm_v2/DreadPoor/Rusted_Platinum-8B-LINEAR/1a4a69c5-4acc-4ad9-adb2-bd9cf0fa2875.json deleted file mode 100644 index 928a50b08..000000000 --- a/data/hfopenllm_v2/DreadPoor/Rusted_Platinum-8B-LINEAR/1a4a69c5-4acc-4ad9-adb2-bd9cf0fa2875.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_Rusted_Platinum-8B-LINEAR/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Rusted_Platinum-8B-LINEAR", - "id": "DreadPoor/Rusted_Platinum-8B-LINEAR", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.718 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5428 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1722 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - 
"dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2802 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3967 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.373 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/Rusted_Platinum-8B-Model_Stock/151226ba-9744-45bc-b923-30df57f7aa3e.json b/data/hfopenllm_v2/DreadPoor/Rusted_Platinum-8B-Model_Stock/151226ba-9744-45bc-b923-30df57f7aa3e.json deleted file mode 100644 index f71f0f094..000000000 --- a/data/hfopenllm_v2/DreadPoor/Rusted_Platinum-8B-Model_Stock/151226ba-9744-45bc-b923-30df57f7aa3e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_Rusted_Platinum-8B-Model_Stock/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Rusted_Platinum-8B-Model_Stock", - "id": "DreadPoor/Rusted_Platinum-8B-Model_Stock", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4408 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5243 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.102 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": 
false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2685 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3741 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3546 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/Sellen-8B-model_stock/98363657-0793-4eb3-94de-28961afc92ea.json b/data/hfopenllm_v2/DreadPoor/Sellen-8B-model_stock/98363657-0793-4eb3-94de-28961afc92ea.json deleted file mode 100644 index 1c2ffb709..000000000 --- a/data/hfopenllm_v2/DreadPoor/Sellen-8B-model_stock/98363657-0793-4eb3-94de-28961afc92ea.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_Sellen-8B-model_stock/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Sellen-8B-model_stock", - "id": "DreadPoor/Sellen-8B-model_stock", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7113 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5232 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1337 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2743 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": 
"hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.396 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.357 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/Something-8B-Model_Stock/a32b4ded-6bff-441e-afbd-736e6d8cce5c.json b/data/hfopenllm_v2/DreadPoor/Something-8B-Model_Stock/a32b4ded-6bff-441e-afbd-736e6d8cce5c.json deleted file mode 100644 index 7c20e3cb1..000000000 --- a/data/hfopenllm_v2/DreadPoor/Something-8B-Model_Stock/a32b4ded-6bff-441e-afbd-736e6d8cce5c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_Something-8B-Model_Stock/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Something-8B-Model_Stock", - "id": "DreadPoor/Something-8B-Model_Stock", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5043 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5395 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1798 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3171 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.4187 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3885 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/Spring_Dusk-8B-SCE/326bcf4a-02e9-4218-8bf2-55a94a79435e.json b/data/hfopenllm_v2/DreadPoor/Spring_Dusk-8B-SCE/326bcf4a-02e9-4218-8bf2-55a94a79435e.json deleted file mode 100644 index decc0df8c..000000000 --- a/data/hfopenllm_v2/DreadPoor/Spring_Dusk-8B-SCE/326bcf4a-02e9-4218-8bf2-55a94a79435e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_Spring_Dusk-8B-SCE/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Spring_Dusk-8B-SCE", - "id": "DreadPoor/Spring_Dusk-8B-SCE", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6515 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5635 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0763 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2878 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.46 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": 
"Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3436 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/Summer_Dawn-8B-SCE/145facc2-ab11-4c68-b841-762e0ad9bd5a.json b/data/hfopenllm_v2/DreadPoor/Summer_Dawn-8B-SCE/145facc2-ab11-4c68-b841-762e0ad9bd5a.json deleted file mode 100644 index 4227020d6..000000000 --- a/data/hfopenllm_v2/DreadPoor/Summer_Dawn-8B-SCE/145facc2-ab11-4c68-b841-762e0ad9bd5a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_Summer_Dawn-8B-SCE/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Summer_Dawn-8B-SCE", - "id": "DreadPoor/Summer_Dawn-8B-SCE", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6642 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5391 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1722 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3003 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.412 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3753 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/DreadPoor/Summer_Dusk-8B-TIES/d3e6aae6-9284-4309-8d8c-02c9e797a58b.json b/data/hfopenllm_v2/DreadPoor/Summer_Dusk-8B-TIES/d3e6aae6-9284-4309-8d8c-02c9e797a58b.json deleted file mode 100644 index 563ad7232..000000000 --- a/data/hfopenllm_v2/DreadPoor/Summer_Dusk-8B-TIES/d3e6aae6-9284-4309-8d8c-02c9e797a58b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_Summer_Dusk-8B-TIES/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Summer_Dusk-8B-TIES", - "id": "DreadPoor/Summer_Dusk-8B-TIES", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4922 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.536 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1805 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.307 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4267 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3856 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/Summer_Rain-8B-SCE/6ee8537c-90e8-4455-83ca-c8c375a5ead7.json b/data/hfopenllm_v2/DreadPoor/Summer_Rain-8B-SCE/6ee8537c-90e8-4455-83ca-c8c375a5ead7.json deleted file mode 100644 index 9e3634f9a..000000000 --- 
a/data/hfopenllm_v2/DreadPoor/Summer_Rain-8B-SCE/6ee8537c-90e8-4455-83ca-c8c375a5ead7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_Summer_Rain-8B-SCE/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Summer_Rain-8B-SCE", - "id": "DreadPoor/Summer_Rain-8B-SCE", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5459 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5846 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0702 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3037 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4477 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3551 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/Summer_Rain-8B-TIES/6efbfb38-57e5-46c7-b765-f7d0356afb97.json b/data/hfopenllm_v2/DreadPoor/Summer_Rain-8B-TIES/6efbfb38-57e5-46c7-b765-f7d0356afb97.json deleted file mode 100644 index 2e8e86b23..000000000 --- a/data/hfopenllm_v2/DreadPoor/Summer_Rain-8B-TIES/6efbfb38-57e5-46c7-b765-f7d0356afb97.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_Summer_Rain-8B-TIES/1770682486.623709", - 
"retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Summer_Rain-8B-TIES", - "id": "DreadPoor/Summer_Rain-8B-TIES", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5444 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5846 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0702 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3037 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4477 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3551 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/Sun-8B-Model_Stock/f4d418d9-1089-452d-9c7f-4cc4712e6ac7.json b/data/hfopenllm_v2/DreadPoor/Sun-8B-Model_Stock/f4d418d9-1089-452d-9c7f-4cc4712e6ac7.json deleted file mode 100644 index 0c4fbb59f..000000000 --- a/data/hfopenllm_v2/DreadPoor/Sun-8B-Model_Stock/f4d418d9-1089-452d-9c7f-4cc4712e6ac7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_Sun-8B-Model_Stock/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - 
"name": "Sun-8B-Model_Stock", - "id": "DreadPoor/Sun-8B-Model_Stock", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7758 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5264 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.21 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2995 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4098 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3835 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/Sweetened_Condensed_Milk-8B-Model_Stock/1c9b325b-92b3-499a-a3ea-026269c63c88.json b/data/hfopenllm_v2/DreadPoor/Sweetened_Condensed_Milk-8B-Model_Stock/1c9b325b-92b3-499a-a3ea-026269c63c88.json deleted file mode 100644 index 28f4c0025..000000000 --- a/data/hfopenllm_v2/DreadPoor/Sweetened_Condensed_Milk-8B-Model_Stock/1c9b325b-92b3-499a-a3ea-026269c63c88.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_Sweetened_Condensed_Milk-8B-Model_Stock/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Sweetened_Condensed_Milk-8B-Model_Stock", - "id": "DreadPoor/Sweetened_Condensed_Milk-8B-Model_Stock", - "developer": "DreadPoor", - "inference_platform": 
"unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7417 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5406 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1873 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3029 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4107 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3848 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/TEST02-Ignore/c546ccde-cef3-4de2-a49f-24517d76dde5.json b/data/hfopenllm_v2/DreadPoor/TEST02-Ignore/c546ccde-cef3-4de2-a49f-24517d76dde5.json deleted file mode 100644 index 875b97c44..000000000 --- a/data/hfopenllm_v2/DreadPoor/TEST02-Ignore/c546ccde-cef3-4de2-a49f-24517d76dde5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_TEST02-Ignore/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "TEST02-Ignore", - "id": "DreadPoor/TEST02-Ignore", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - 
"hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6119 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5602 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0869 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2844 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4199 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3468 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/TEST03-ignore/e85d3ccf-f48d-4e5c-b893-771a107773d4.json b/data/hfopenllm_v2/DreadPoor/TEST03-ignore/e85d3ccf-f48d-4e5c-b893-771a107773d4.json deleted file mode 100644 index f463020e8..000000000 --- a/data/hfopenllm_v2/DreadPoor/TEST03-ignore/e85d3ccf-f48d-4e5c-b893-771a107773d4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_TEST03-ignore/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "TEST03-ignore", - "id": "DreadPoor/TEST03-ignore", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6967 - } - }, - { - "evaluation_name": "BBH", 
- "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5383 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1654 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3087 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4186 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3789 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/TEST06-ignore/b8d22ade-874e-4ff3-9fcd-dbe14220d48b.json b/data/hfopenllm_v2/DreadPoor/TEST06-ignore/b8d22ade-874e-4ff3-9fcd-dbe14220d48b.json deleted file mode 100644 index b39933b07..000000000 --- a/data/hfopenllm_v2/DreadPoor/TEST06-ignore/b8d22ade-874e-4ff3-9fcd-dbe14220d48b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_TEST06-ignore/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "TEST06-ignore", - "id": "DreadPoor/TEST06-ignore", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7323 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.5509 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1178 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2869 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4225 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3615 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/TEST07-ignore/97e8e7e2-74a4-42a5-a0b1-250e47d3c3e6.json b/data/hfopenllm_v2/DreadPoor/TEST07-ignore/97e8e7e2-74a4-42a5-a0b1-250e47d3c3e6.json deleted file mode 100644 index 42a699d62..000000000 --- a/data/hfopenllm_v2/DreadPoor/TEST07-ignore/97e8e7e2-74a4-42a5-a0b1-250e47d3c3e6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_TEST07-ignore/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "TEST07-ignore", - "id": "DreadPoor/TEST07-ignore", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.74 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5561 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact 
Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1662 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3087 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4094 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.388 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/TEST08-ignore/b2d56bb6-a726-4e47-8bc6-c016a51aac5c.json b/data/hfopenllm_v2/DreadPoor/TEST08-ignore/b2d56bb6-a726-4e47-8bc6-c016a51aac5c.json deleted file mode 100644 index 9477b8a75..000000000 --- a/data/hfopenllm_v2/DreadPoor/TEST08-ignore/b2d56bb6-a726-4e47-8bc6-c016a51aac5c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_TEST08-ignore/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "TEST08-ignore", - "id": "DreadPoor/TEST08-ignore", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7467 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5454 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.182 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - 
"hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3129 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4081 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3853 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/Trinas_Nectar-8B-model_stock/3366f6d8-41bc-4c2c-a72c-bc0fd7dc8dd2.json b/data/hfopenllm_v2/DreadPoor/Trinas_Nectar-8B-model_stock/3366f6d8-41bc-4c2c-a72c-bc0fd7dc8dd2.json deleted file mode 100644 index bf826442f..000000000 --- a/data/hfopenllm_v2/DreadPoor/Trinas_Nectar-8B-model_stock/3366f6d8-41bc-4c2c-a72c-bc0fd7dc8dd2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_Trinas_Nectar-8B-model_stock/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Trinas_Nectar-8B-model_stock", - "id": "DreadPoor/Trinas_Nectar-8B-model_stock", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7259 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5256 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1526 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.2861 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4068 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3618 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/UNTESTED-VENN_1.2-8B-Model_Stock/7ba52efb-3890-4691-8740-9f051f1f645e.json b/data/hfopenllm_v2/DreadPoor/UNTESTED-VENN_1.2-8B-Model_Stock/7ba52efb-3890-4691-8740-9f051f1f645e.json deleted file mode 100644 index 44c3145e9..000000000 --- a/data/hfopenllm_v2/DreadPoor/UNTESTED-VENN_1.2-8B-Model_Stock/7ba52efb-3890-4691-8740-9f051f1f645e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_UNTESTED-VENN_1.2-8B-Model_Stock/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "UNTESTED-VENN_1.2-8B-Model_Stock", - "id": "DreadPoor/UNTESTED-VENN_1.2-8B-Model_Stock", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 4.015 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4718 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5475 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1541 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3154 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": 
"hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4449 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3787 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/VENN_1.2-8B-Model_Stock/7b192b49-057e-418a-b47d-44b0ec82a6b6.json b/data/hfopenllm_v2/DreadPoor/VENN_1.2-8B-Model_Stock/7b192b49-057e-418a-b47d-44b0ec82a6b6.json deleted file mode 100644 index 77da6fb17..000000000 --- a/data/hfopenllm_v2/DreadPoor/VENN_1.2-8B-Model_Stock/7b192b49-057e-418a-b47d-44b0ec82a6b6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_VENN_1.2-8B-Model_Stock/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "VENN_1.2-8B-Model_Stock", - "id": "DreadPoor/VENN_1.2-8B-Model_Stock", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 4.015 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7226 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5459 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1707 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.297 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.42 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3721 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/WIP-Acacia-8B-Model_Stock/f2120d53-bef6-44d6-84a6-a6f8e3537188.json b/data/hfopenllm_v2/DreadPoor/WIP-Acacia-8B-Model_Stock/f2120d53-bef6-44d6-84a6-a6f8e3537188.json deleted file mode 100644 index aadd0095c..000000000 --- a/data/hfopenllm_v2/DreadPoor/WIP-Acacia-8B-Model_Stock/f2120d53-bef6-44d6-84a6-a6f8e3537188.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_WIP-Acacia-8B-Model_Stock/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "WIP-Acacia-8B-Model_Stock", - "id": "DreadPoor/WIP-Acacia-8B-Model_Stock", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6246 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5195 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1669 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3062 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4226 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3737 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/WIP_Damascus-8B-TIES/f5408aa9-85c8-46e5-b225-0480b2e18e97.json b/data/hfopenllm_v2/DreadPoor/WIP_Damascus-8B-TIES/f5408aa9-85c8-46e5-b225-0480b2e18e97.json deleted file mode 100644 index e05a5b321..000000000 --- a/data/hfopenllm_v2/DreadPoor/WIP_Damascus-8B-TIES/f5408aa9-85c8-46e5-b225-0480b2e18e97.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_WIP_Damascus-8B-TIES/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "WIP_Damascus-8B-TIES", - "id": "DreadPoor/WIP_Damascus-8B-TIES", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4776 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5411 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1654 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.307 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4119 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3761 - } - } - ] -} \ No newline at 
end of file diff --git a/data/hfopenllm_v2/DreadPoor/Wannabe-8B-Model_Stock/c1918f55-286c-4b29-ac53-2ee8f9d36d9e.json b/data/hfopenllm_v2/DreadPoor/Wannabe-8B-Model_Stock/c1918f55-286c-4b29-ac53-2ee8f9d36d9e.json deleted file mode 100644 index f551e73cc..000000000 --- a/data/hfopenllm_v2/DreadPoor/Wannabe-8B-Model_Stock/c1918f55-286c-4b29-ac53-2ee8f9d36d9e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_Wannabe-8B-Model_Stock/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Wannabe-8B-Model_Stock", - "id": "DreadPoor/Wannabe-8B-Model_Stock", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7205 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.539 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1775 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3012 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4135 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3831 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/What_A_Thrill-8B-Model_Stock/52659d37-67f8-45b8-88e4-11917dc90488.json b/data/hfopenllm_v2/DreadPoor/What_A_Thrill-8B-Model_Stock/52659d37-67f8-45b8-88e4-11917dc90488.json 
deleted file mode 100644 index 00ab3ed08..000000000 --- a/data/hfopenllm_v2/DreadPoor/What_A_Thrill-8B-Model_Stock/52659d37-67f8-45b8-88e4-11917dc90488.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_What_A_Thrill-8B-Model_Stock/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "What_A_Thrill-8B-Model_Stock", - "id": "DreadPoor/What_A_Thrill-8B-Model_Stock", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7064 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5311 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.182 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.297 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.408 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3615 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/Winter-8B-SCE/556ae77c-effe-44ab-ac4a-1ad7cbd7c363.json b/data/hfopenllm_v2/DreadPoor/Winter-8B-SCE/556ae77c-effe-44ab-ac4a-1ad7cbd7c363.json deleted file mode 100644 index b9a6a545d..000000000 --- a/data/hfopenllm_v2/DreadPoor/Winter-8B-SCE/556ae77c-effe-44ab-ac4a-1ad7cbd7c363.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/DreadPoor_Winter-8B-SCE/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Winter-8B-SCE", - "id": "DreadPoor/Winter-8B-SCE", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7536 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5262 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1918 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2995 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4071 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3839 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/Winter_Dawn-8B-TIES/048fc971-3baf-4740-a132-2f9476d01b7a.json b/data/hfopenllm_v2/DreadPoor/Winter_Dawn-8B-TIES/048fc971-3baf-4740-a132-2f9476d01b7a.json deleted file mode 100644 index f48ac4b48..000000000 --- a/data/hfopenllm_v2/DreadPoor/Winter_Dawn-8B-TIES/048fc971-3baf-4740-a132-2f9476d01b7a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_Winter_Dawn-8B-TIES/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - 
"evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Winter_Dawn-8B-TIES", - "id": "DreadPoor/Winter_Dawn-8B-TIES", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5496 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5309 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1858 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3096 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4279 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.391 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/Winter_Dusk-8B-TIES/abd28d25-01e0-474d-be35-08d816d281f5.json b/data/hfopenllm_v2/DreadPoor/Winter_Dusk-8B-TIES/abd28d25-01e0-474d-be35-08d816d281f5.json deleted file mode 100644 index 1470af900..000000000 --- a/data/hfopenllm_v2/DreadPoor/Winter_Dusk-8B-TIES/abd28d25-01e0-474d-be35-08d816d281f5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_Winter_Dusk-8B-TIES/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Winter_Dusk-8B-TIES", - "id": "DreadPoor/Winter_Dusk-8B-TIES", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": 
"bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7153 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4952 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0718 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2995 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3688 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3478 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/Winter_Night-8B-Model_Stock/17f49724-6553-4baa-b354-45ffd0f2c844.json b/data/hfopenllm_v2/DreadPoor/Winter_Night-8B-Model_Stock/17f49724-6553-4baa-b354-45ffd0f2c844.json deleted file mode 100644 index 2969a6c18..000000000 --- a/data/hfopenllm_v2/DreadPoor/Winter_Night-8B-Model_Stock/17f49724-6553-4baa-b354-45ffd0f2c844.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_Winter_Night-8B-Model_Stock/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Winter_Night-8B-Model_Stock", - "id": "DreadPoor/Winter_Night-8B-Model_Stock", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - 
"source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.704 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5185 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1458 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3062 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3914 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3666 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/Yafune-8B-Model_Stock/3e60d982-d7d5-432b-962e-b7734cc90534.json b/data/hfopenllm_v2/DreadPoor/Yafune-8B-Model_Stock/3e60d982-d7d5-432b-962e-b7734cc90534.json deleted file mode 100644 index 25b9ce545..000000000 --- a/data/hfopenllm_v2/DreadPoor/Yafune-8B-Model_Stock/3e60d982-d7d5-432b-962e-b7734cc90534.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_Yafune-8B-Model_Stock/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Yafune-8B-Model_Stock", - "id": "DreadPoor/Yafune-8B-Model_Stock", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.7533 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5467 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1662 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3272 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4173 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3851 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/Yearn_V3-8B-Model_Stock/79a0fdf3-b432-4598-be62-f9eb57fa5a43.json b/data/hfopenllm_v2/DreadPoor/Yearn_V3-8B-Model_Stock/79a0fdf3-b432-4598-be62-f9eb57fa5a43.json deleted file mode 100644 index 74619217a..000000000 --- a/data/hfopenllm_v2/DreadPoor/Yearn_V3-8B-Model_Stock/79a0fdf3-b432-4598-be62-f9eb57fa5a43.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_Yearn_V3-8B-Model_Stock/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Yearn_V3-8B-Model_Stock", - "id": "DreadPoor/Yearn_V3-8B-Model_Stock", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.729 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5322 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1896 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3054 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3909 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3802 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/ZEUS-8B-V17-Abliterated_ALT/662566e0-2af3-40d6-90de-9b361bcae355.json b/data/hfopenllm_v2/DreadPoor/ZEUS-8B-V17-Abliterated_ALT/662566e0-2af3-40d6-90de-9b361bcae355.json deleted file mode 100644 index 02d4db078..000000000 --- a/data/hfopenllm_v2/DreadPoor/ZEUS-8B-V17-Abliterated_ALT/662566e0-2af3-40d6-90de-9b361bcae355.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_ZEUS-8B-V17-Abliterated_ALT/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ZEUS-8B-V17-Abliterated_ALT", - "id": "DreadPoor/ZEUS-8B-V17-Abliterated_ALT", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5511 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5231 - } - }, - { - 
"evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1903 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3079 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4149 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.389 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/Zelus-8B-Model_Stock/d81c0035-a0b1-426c-9080-8ccbf745642b.json b/data/hfopenllm_v2/DreadPoor/Zelus-8B-Model_Stock/d81c0035-a0b1-426c-9080-8ccbf745642b.json deleted file mode 100644 index 03e62bfa1..000000000 --- a/data/hfopenllm_v2/DreadPoor/Zelus-8B-Model_Stock/d81c0035-a0b1-426c-9080-8ccbf745642b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_Zelus-8B-Model_Stock/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Zelus-8B-Model_Stock", - "id": "DreadPoor/Zelus-8B-Model_Stock", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7788 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5307 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH 
Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1647 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3062 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4214 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3841 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/Zelus_V2-8B-Model_Stock/100bc243-158c-4e5c-918b-1439bf26fee8.json b/data/hfopenllm_v2/DreadPoor/Zelus_V2-8B-Model_Stock/100bc243-158c-4e5c-918b-1439bf26fee8.json deleted file mode 100644 index 059e73847..000000000 --- a/data/hfopenllm_v2/DreadPoor/Zelus_V2-8B-Model_Stock/100bc243-158c-4e5c-918b-1439bf26fee8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_Zelus_V2-8B-Model_Stock/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Zelus_V2-8B-Model_Stock", - "id": "DreadPoor/Zelus_V2-8B-Model_Stock", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7898 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5345 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2054 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - 
"dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3096 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3961 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3833 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/felix_dies-mistral-7B-model_stock/45e32080-1464-40e0-a232-310fdda967eb.json b/data/hfopenllm_v2/DreadPoor/felix_dies-mistral-7B-model_stock/45e32080-1464-40e0-a232-310fdda967eb.json deleted file mode 100644 index c0877447b..000000000 --- a/data/hfopenllm_v2/DreadPoor/felix_dies-mistral-7B-model_stock/45e32080-1464-40e0-a232-310fdda967eb.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_felix_dies-mistral-7B-model_stock/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "felix_dies-mistral-7B-model_stock", - "id": "DreadPoor/felix_dies-mistral-7B-model_stock", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3008 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4901 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0536 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2919 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4518 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3109 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/hakuchido-8B-MODEL_STOCK/e89b279f-d548-4aa8-b5e5-0bffdd98b840.json b/data/hfopenllm_v2/DreadPoor/hakuchido-8B-MODEL_STOCK/e89b279f-d548-4aa8-b5e5-0bffdd98b840.json deleted file mode 100644 index 81722a6a0..000000000 --- a/data/hfopenllm_v2/DreadPoor/hakuchido-8B-MODEL_STOCK/e89b279f-d548-4aa8-b5e5-0bffdd98b840.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_hakuchido-8B-MODEL_STOCK/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "hakuchido-8B-MODEL_STOCK", - "id": "DreadPoor/hakuchido-8B-MODEL_STOCK", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7375 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5398 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1949 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2953 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": 
"MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4175 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3782 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/ichor-8B-Model_Stock/777a53f9-891c-4f9e-99a8-bb1988f61f19.json b/data/hfopenllm_v2/DreadPoor/ichor-8B-Model_Stock/777a53f9-891c-4f9e-99a8-bb1988f61f19.json deleted file mode 100644 index c35d0a556..000000000 --- a/data/hfopenllm_v2/DreadPoor/ichor-8B-Model_Stock/777a53f9-891c-4f9e-99a8-bb1988f61f19.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_ichor-8B-Model_Stock/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ichor-8B-Model_Stock", - "id": "DreadPoor/ichor-8B-Model_Stock", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5386 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5084 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1088 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3238 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - 
}, - "score_details": { - "score": 0.4212 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3151 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/ichor_1.1-8B-Model_Stock/f15846b1-8eaa-411b-88f7-25064161af4e.json b/data/hfopenllm_v2/DreadPoor/ichor_1.1-8B-Model_Stock/f15846b1-8eaa-411b-88f7-25064161af4e.json deleted file mode 100644 index 07bce712f..000000000 --- a/data/hfopenllm_v2/DreadPoor/ichor_1.1-8B-Model_Stock/f15846b1-8eaa-411b-88f7-25064161af4e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_ichor_1.1-8B-Model_Stock/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ichor_1.1-8B-Model_Stock", - "id": "DreadPoor/ichor_1.1-8B-Model_Stock", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8096 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5281 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1775 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3062 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4068 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3856 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/inexpertus-8B-Model_Stock/e803fc85-fb98-4db8-aab0-a63100dcd5fc.json b/data/hfopenllm_v2/DreadPoor/inexpertus-8B-Model_Stock/e803fc85-fb98-4db8-aab0-a63100dcd5fc.json deleted file mode 100644 index dd5b49bf4..000000000 --- a/data/hfopenllm_v2/DreadPoor/inexpertus-8B-Model_Stock/e803fc85-fb98-4db8-aab0-a63100dcd5fc.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_inexpertus-8B-Model_Stock/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "inexpertus-8B-Model_Stock", - "id": "DreadPoor/inexpertus-8B-Model_Stock", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7795 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.528 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1707 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3096 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4118 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3791 - 
} - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/inexpertus_1.1-8B-LINEAR/50620749-5ecf-41eb-a131-611675560e07.json b/data/hfopenllm_v2/DreadPoor/inexpertus_1.1-8B-LINEAR/50620749-5ecf-41eb-a131-611675560e07.json deleted file mode 100644 index 0f7b2d86a..000000000 --- a/data/hfopenllm_v2/DreadPoor/inexpertus_1.1-8B-LINEAR/50620749-5ecf-41eb-a131-611675560e07.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_inexpertus_1.1-8B-LINEAR/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "inexpertus_1.1-8B-LINEAR", - "id": "DreadPoor/inexpertus_1.1-8B-LINEAR", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7527 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5525 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.173 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2978 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4173 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3827 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/inexpertus_1.2-8B-LINEAR/2d40a551-6440-4d71-87e4-639d486c1c5e.json 
b/data/hfopenllm_v2/DreadPoor/inexpertus_1.2-8B-LINEAR/2d40a551-6440-4d71-87e4-639d486c1c5e.json deleted file mode 100644 index 9628c12f8..000000000 --- a/data/hfopenllm_v2/DreadPoor/inexpertus_1.2-8B-LINEAR/2d40a551-6440-4d71-87e4-639d486c1c5e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_inexpertus_1.2-8B-LINEAR/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "inexpertus_1.2-8B-LINEAR", - "id": "DreadPoor/inexpertus_1.2-8B-LINEAR", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7348 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5523 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1586 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2953 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4133 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3788 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/mergekit-nuslerp-nqzkedi/22235942-2e3e-4ef4-b7a0-5800f507571a.json b/data/hfopenllm_v2/DreadPoor/mergekit-nuslerp-nqzkedi/22235942-2e3e-4ef4-b7a0-5800f507571a.json deleted file mode 100644 index 0f3089a37..000000000 --- 
a/data/hfopenllm_v2/DreadPoor/mergekit-nuslerp-nqzkedi/22235942-2e3e-4ef4-b7a0-5800f507571a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_mergekit-nuslerp-nqzkedi/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "mergekit-nuslerp-nqzkedi", - "id": "DreadPoor/mergekit-nuslerp-nqzkedi", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7765 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5362 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1881 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3012 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4225 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3919 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/remember_to_breathe-8b-Model-Stock/ac06867d-3a34-42f6-9e2e-226cf86748f6.json b/data/hfopenllm_v2/DreadPoor/remember_to_breathe-8b-Model-Stock/ac06867d-3a34-42f6-9e2e-226cf86748f6.json deleted file mode 100644 index 08420f429..000000000 --- a/data/hfopenllm_v2/DreadPoor/remember_to_breathe-8b-Model-Stock/ac06867d-3a34-42f6-9e2e-226cf86748f6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/DreadPoor_remember_to_breathe-8b-Model-Stock/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "remember_to_breathe-8b-Model-Stock", - "id": "DreadPoor/remember_to_breathe-8b-Model-Stock", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7104 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5412 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1488 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3012 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4145 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3761 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/test/394f1fc8-dc2c-4ff9-9ad0-7b3a8a8ddeb3.json b/data/hfopenllm_v2/DreadPoor/test/394f1fc8-dc2c-4ff9-9ad0-7b3a8a8ddeb3.json deleted file mode 100644 index 4d6f24d90..000000000 --- a/data/hfopenllm_v2/DreadPoor/test/394f1fc8-dc2c-4ff9-9ad0-7b3a8a8ddeb3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_test/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - 
"evaluator_relationship": "third_party" - }, - "model_info": { - "name": "test", - "id": "DreadPoor/test", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4937 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5372 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1934 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.271 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4351 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3647 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/test_ALT/03e52d4f-78d7-453c-9685-844dd1636904.json b/data/hfopenllm_v2/DreadPoor/test_ALT/03e52d4f-78d7-453c-9685-844dd1636904.json deleted file mode 100644 index 14344c072..000000000 --- a/data/hfopenllm_v2/DreadPoor/test_ALT/03e52d4f-78d7-453c-9685-844dd1636904.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_test_ALT/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "test_ALT", - "id": "DreadPoor/test_ALT", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - 
"evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4997 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.537 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1707 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2693 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4363 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3492 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/DreadPoor/tests_pending-do_not_use_yet/3ce136d5-be81-4b8c-a7dc-4e1346935d35.json b/data/hfopenllm_v2/DreadPoor/tests_pending-do_not_use_yet/3ce136d5-be81-4b8c-a7dc-4e1346935d35.json deleted file mode 100644 index e83a16a2c..000000000 --- a/data/hfopenllm_v2/DreadPoor/tests_pending-do_not_use_yet/3ce136d5-be81-4b8c-a7dc-4e1346935d35.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/DreadPoor_tests_pending-do_not_use_yet/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "tests_pending-do_not_use_yet", - "id": "DreadPoor/tests_pending-do_not_use_yet", - "developer": "DreadPoor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7691 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5408 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1979 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.297 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4005 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3827 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ECE-ILAB-PRYMMAL/ILAB-Merging-3B-V2/fb35accf-0c5d-4f72-8d73-ba366a41a76d.json b/data/hfopenllm_v2/ECE-ILAB-PRYMMAL/ILAB-Merging-3B-V2/fb35accf-0c5d-4f72-8d73-ba366a41a76d.json deleted file mode 100644 index 2a680a23b..000000000 --- a/data/hfopenllm_v2/ECE-ILAB-PRYMMAL/ILAB-Merging-3B-V2/fb35accf-0c5d-4f72-8d73-ba366a41a76d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ECE-ILAB-PRYMMAL_ILAB-Merging-3B-V2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ILAB-Merging-3B-V2", - "id": "ECE-ILAB-PRYMMAL/ILAB-Merging-3B-V2", - "developer": "ECE-ILAB-PRYMMAL", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Phi3ForCausalLM", - "params_billions": 3.821 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4029 - } - }, - { - 
"evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5402 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1518 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3054 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4332 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3861 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/EVA-UNIT-01/EVA-Qwen2.5-14B-v0.2/75e5ca5d-cce1-4463-b398-553399ce6833.json b/data/hfopenllm_v2/EVA-UNIT-01/EVA-Qwen2.5-14B-v0.2/75e5ca5d-cce1-4463-b398-553399ce6833.json deleted file mode 100644 index 12b4347c9..000000000 --- a/data/hfopenllm_v2/EVA-UNIT-01/EVA-Qwen2.5-14B-v0.2/75e5ca5d-cce1-4463-b398-553399ce6833.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/EVA-UNIT-01_EVA-Qwen2.5-14B-v0.2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "EVA-Qwen2.5-14B-v0.2", - "id": "EVA-UNIT-01/EVA-Qwen2.5-14B-v0.2", - "developer": "EVA-UNIT-01", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4038 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": 
false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.609 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3406 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3943 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4794 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5135 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/EVA-UNIT-01/EVA-Qwen2.5-72B-v0.2/c426bae7-b98d-4343-b419-ac8206196a95.json b/data/hfopenllm_v2/EVA-UNIT-01/EVA-Qwen2.5-72B-v0.2/c426bae7-b98d-4343-b419-ac8206196a95.json deleted file mode 100644 index 557addc60..000000000 --- a/data/hfopenllm_v2/EVA-UNIT-01/EVA-Qwen2.5-72B-v0.2/c426bae7-b98d-4343-b419-ac8206196a95.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/EVA-UNIT-01_EVA-Qwen2.5-72B-v0.2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "EVA-Qwen2.5-72B-v0.2", - "id": "EVA-UNIT-01/EVA-Qwen2.5-72B-v0.2", - "developer": "EVA-UNIT-01", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 72.706 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6879 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7088 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - 
"source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4313 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4086 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.472 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5813 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Edgerunners/meta-llama-3-8b-instruct-hf-ortho-baukit-34fail-3000total-bf16/b17de9f2-6f94-49f6-b908-fa983e8f8f9b.json b/data/hfopenllm_v2/Edgerunners/meta-llama-3-8b-instruct-hf-ortho-baukit-34fail-3000total-bf16/b17de9f2-6f94-49f6-b908-fa983e8f8f9b.json deleted file mode 100644 index 548eedb0d..000000000 --- a/data/hfopenllm_v2/Edgerunners/meta-llama-3-8b-instruct-hf-ortho-baukit-34fail-3000total-bf16/b17de9f2-6f94-49f6-b908-fa983e8f8f9b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Edgerunners_meta-llama-3-8b-instruct-hf-ortho-baukit-34fail-3000total-bf16/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "meta-llama-3-8b-instruct-hf-ortho-baukit-34fail-3000total-bf16", - "id": "Edgerunners/meta-llama-3-8b-instruct-hf-ortho-baukit-34fail-3000total-bf16", - "developer": "Edgerunners", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7147 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.498 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - 
"dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0906 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2601 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3342 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3636 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/EleutherAI/gpt-j-6b/58ba7ca1-8cca-4668-836b-824491d9cf01.json b/data/hfopenllm_v2/EleutherAI/gpt-j-6b/58ba7ca1-8cca-4668-836b-824491d9cf01.json deleted file mode 100644 index 3f4b4bd5e..000000000 --- a/data/hfopenllm_v2/EleutherAI/gpt-j-6b/58ba7ca1-8cca-4668-836b-824491d9cf01.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/EleutherAI_gpt-j-6b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "gpt-j-6b", - "id": "EleutherAI/gpt-j-6b", - "developer": "EleutherAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "GPTJForCausalLM", - "params_billions": 6.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2522 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3191 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.0136 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2458 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3658 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1241 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/EleutherAI/gpt-neo-1.3B/23da100a-13b9-42a7-ba79-234be551d0e4.json b/data/hfopenllm_v2/EleutherAI/gpt-neo-1.3B/23da100a-13b9-42a7-ba79-234be551d0e4.json deleted file mode 100644 index 14e41e155..000000000 --- a/data/hfopenllm_v2/EleutherAI/gpt-neo-1.3B/23da100a-13b9-42a7-ba79-234be551d0e4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/EleutherAI_gpt-neo-1.3B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "gpt-neo-1.3B", - "id": "EleutherAI/gpt-neo-1.3B", - "developer": "EleutherAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "GPTNeoForCausalLM", - "params_billions": 1.366 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2079 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3039 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0106 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": 
false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2559 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3817 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1164 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/EleutherAI/gpt-neo-125m/2d0c12b9-cff8-4366-a3ce-7772e4c098c9.json b/data/hfopenllm_v2/EleutherAI/gpt-neo-125m/2d0c12b9-cff8-4366-a3ce-7772e4c098c9.json deleted file mode 100644 index 58ad6f3da..000000000 --- a/data/hfopenllm_v2/EleutherAI/gpt-neo-125m/2d0c12b9-cff8-4366-a3ce-7772e4c098c9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/EleutherAI_gpt-neo-125m/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "gpt-neo-125m", - "id": "EleutherAI/gpt-neo-125m", - "developer": "EleutherAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "GPTNeoForCausalLM", - "params_billions": 0.15 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1905 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3115 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.006 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2534 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3593 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1026 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/EleutherAI/gpt-neo-2.7B/4b87eea2-169c-411e-9d15-caf6b7826590.json b/data/hfopenllm_v2/EleutherAI/gpt-neo-2.7B/4b87eea2-169c-411e-9d15-caf6b7826590.json deleted file mode 100644 index 409e0b82a..000000000 --- a/data/hfopenllm_v2/EleutherAI/gpt-neo-2.7B/4b87eea2-169c-411e-9d15-caf6b7826590.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/EleutherAI_gpt-neo-2.7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "gpt-neo-2.7B", - "id": "EleutherAI/gpt-neo-2.7B", - "developer": "EleutherAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "GPTNeoForCausalLM", - "params_billions": 2.718 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.259 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.314 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0106 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2659 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3554 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - 
"dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1163 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/EleutherAI/gpt-neox-20b/62a3cce2-4ff5-4dc9-beab-a06001fd82d9.json b/data/hfopenllm_v2/EleutherAI/gpt-neox-20b/62a3cce2-4ff5-4dc9-beab-a06001fd82d9.json deleted file mode 100644 index 8d26f484b..000000000 --- a/data/hfopenllm_v2/EleutherAI/gpt-neox-20b/62a3cce2-4ff5-4dc9-beab-a06001fd82d9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/EleutherAI_gpt-neox-20b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "gpt-neox-20b", - "id": "EleutherAI/gpt-neox-20b", - "developer": "EleutherAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "GPTNeoXForCausalLM", - "params_billions": 20.739 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2587 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3165 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0136 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2433 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3647 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.1155 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/EleutherAI/pythia-1.4b/0e5961e1-af27-4eee-8b9b-c82ee4ab61b1.json b/data/hfopenllm_v2/EleutherAI/pythia-1.4b/0e5961e1-af27-4eee-8b9b-c82ee4ab61b1.json deleted file mode 100644 index 0e25cd321..000000000 --- a/data/hfopenllm_v2/EleutherAI/pythia-1.4b/0e5961e1-af27-4eee-8b9b-c82ee4ab61b1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/EleutherAI_pythia-1.4b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "pythia-1.4b", - "id": "EleutherAI/pythia-1.4b", - "developer": "EleutherAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "GPTNeoXForCausalLM", - "params_billions": 1.515 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2371 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.315 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0151 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2617 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3538 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1123 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/EleutherAI/pythia-12b/b62352d4-e3b0-4b4d-8d68-e2d973d820c1.json b/data/hfopenllm_v2/EleutherAI/pythia-12b/b62352d4-e3b0-4b4d-8d68-e2d973d820c1.json deleted file mode 100644 index 
c70b94beb..000000000 --- a/data/hfopenllm_v2/EleutherAI/pythia-12b/b62352d4-e3b0-4b4d-8d68-e2d973d820c1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/EleutherAI_pythia-12b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "pythia-12b", - "id": "EleutherAI/pythia-12b", - "developer": "EleutherAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "GPTNeoXForCausalLM", - "params_billions": 12.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2471 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.318 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0166 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2466 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3647 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1109 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/EleutherAI/pythia-160m/7fadc486-767e-45ef-979d-74ecb858cb99.json b/data/hfopenllm_v2/EleutherAI/pythia-160m/7fadc486-767e-45ef-979d-74ecb858cb99.json deleted file mode 100644 index fc912f961..000000000 --- a/data/hfopenllm_v2/EleutherAI/pythia-160m/7fadc486-767e-45ef-979d-74ecb858cb99.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/EleutherAI_pythia-160m/1770682486.623709", - "retrieved_timestamp": 
"1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "pythia-160m", - "id": "EleutherAI/pythia-160m", - "developer": "EleutherAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "GPTNeoXForCausalLM", - "params_billions": 0.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1816 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.297 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0091 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2584 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4179 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.112 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/EleutherAI/pythia-1b/d0628e6f-a6f3-42eb-b9fc-e880ae8c0688.json b/data/hfopenllm_v2/EleutherAI/pythia-1b/d0628e6f-a6f3-42eb-b9fc-e880ae8c0688.json deleted file mode 100644 index daea284b8..000000000 --- a/data/hfopenllm_v2/EleutherAI/pythia-1b/d0628e6f-a6f3-42eb-b9fc-e880ae8c0688.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/EleutherAI_pythia-1b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "pythia-1b", - "id": "EleutherAI/pythia-1b", - "developer": 
"EleutherAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "GPTNeoXForCausalLM", - "params_billions": 1.079 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2208 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3004 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0091 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2567 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3552 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1136 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/EleutherAI/pythia-2.8b/0999a066-1151-4445-b130-00d8fe4a516e.json b/data/hfopenllm_v2/EleutherAI/pythia-2.8b/0999a066-1151-4445-b130-00d8fe4a516e.json deleted file mode 100644 index 7e836076c..000000000 --- a/data/hfopenllm_v2/EleutherAI/pythia-2.8b/0999a066-1151-4445-b130-00d8fe4a516e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/EleutherAI_pythia-2.8b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "pythia-2.8b", - "id": "EleutherAI/pythia-2.8b", - "developer": "EleutherAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "GPTNeoXForCausalLM", - "params_billions": 2.909 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - 
"source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2173 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3224 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0136 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.25 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3486 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1137 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/EleutherAI/pythia-410m/1efc09d8-6a5c-4d48-b76e-2e04ef97b676.json b/data/hfopenllm_v2/EleutherAI/pythia-410m/1efc09d8-6a5c-4d48-b76e-2e04ef97b676.json deleted file mode 100644 index 070f7c573..000000000 --- a/data/hfopenllm_v2/EleutherAI/pythia-410m/1efc09d8-6a5c-4d48-b76e-2e04ef97b676.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/EleutherAI_pythia-410m/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "pythia-410m", - "id": "EleutherAI/pythia-410m", - "developer": "EleutherAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "GPTNeoXForCausalLM", - "params_billions": 0.506 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2195 - } - }, - { - 
"evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3028 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0098 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2592 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3578 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1128 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/EleutherAI/pythia-6.9b/1a59412f-fe78-4ecf-8951-8f2996dd374f.json b/data/hfopenllm_v2/EleutherAI/pythia-6.9b/1a59412f-fe78-4ecf-8951-8f2996dd374f.json deleted file mode 100644 index 6f0eb37e2..000000000 --- a/data/hfopenllm_v2/EleutherAI/pythia-6.9b/1a59412f-fe78-4ecf-8951-8f2996dd374f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/EleutherAI_pythia-6.9b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "pythia-6.9b", - "id": "EleutherAI/pythia-6.9b", - "developer": "EleutherAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "GPTNeoXForCausalLM", - "params_billions": 6.9 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2281 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.3232 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0144 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2517 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3591 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1147 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Enno-Ai/EnnoAi-Pro-French-Llama-3-8B-v0.4/b5403311-2069-488d-af98-27da14496c15.json b/data/hfopenllm_v2/Enno-Ai/EnnoAi-Pro-French-Llama-3-8B-v0.4/b5403311-2069-488d-af98-27da14496c15.json deleted file mode 100644 index 43c0cbc2f..000000000 --- a/data/hfopenllm_v2/Enno-Ai/EnnoAi-Pro-French-Llama-3-8B-v0.4/b5403311-2069-488d-af98-27da14496c15.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Enno-Ai_EnnoAi-Pro-French-Llama-3-8B-v0.4/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "EnnoAi-Pro-French-Llama-3-8B-v0.4", - "id": "Enno-Ai/EnnoAi-Pro-French-Llama-3-8B-v0.4", - "developer": "Enno-Ai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.031 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4189 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4075 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - 
"source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0363 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.271 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.417 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2635 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Enno-Ai/EnnoAi-Pro-Llama-3-8B-v0.3/6c10c176-b2b6-4216-91c0-1444944612f7.json b/data/hfopenllm_v2/Enno-Ai/EnnoAi-Pro-Llama-3-8B-v0.3/6c10c176-b2b6-4216-91c0-1444944612f7.json deleted file mode 100644 index 3c0a9c0a6..000000000 --- a/data/hfopenllm_v2/Enno-Ai/EnnoAi-Pro-Llama-3-8B-v0.3/6c10c176-b2b6-4216-91c0-1444944612f7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Enno-Ai_EnnoAi-Pro-Llama-3-8B-v0.3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "EnnoAi-Pro-Llama-3-8B-v0.3", - "id": "Enno-Ai/EnnoAi-Pro-Llama-3-8B-v0.3", - "developer": "Enno-Ai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5083 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4101 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0483 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2651 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4236 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.299 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Enno-Ai/EnnoAi-Pro-Llama-3-8B/80ebd92e-d9b6-46ce-b77e-973c3f3f6051.json b/data/hfopenllm_v2/Enno-Ai/EnnoAi-Pro-Llama-3-8B/80ebd92e-d9b6-46ce-b77e-973c3f3f6051.json deleted file mode 100644 index ae3f57ad1..000000000 --- a/data/hfopenllm_v2/Enno-Ai/EnnoAi-Pro-Llama-3-8B/80ebd92e-d9b6-46ce-b77e-973c3f3f6051.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Enno-Ai_EnnoAi-Pro-Llama-3-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "EnnoAi-Pro-Llama-3-8B", - "id": "Enno-Ai/EnnoAi-Pro-Llama-3-8B", - "developer": "Enno-Ai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.031 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3195 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4152 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0219 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - 
}, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2617 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4071 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2151 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Enno-Ai/EnnoAi-Pro-Llama-3.1-8B-v0.9/0418e36f-17ea-46a2-bfeb-91cc0ff719bf.json b/data/hfopenllm_v2/Enno-Ai/EnnoAi-Pro-Llama-3.1-8B-v0.9/0418e36f-17ea-46a2-bfeb-91cc0ff719bf.json deleted file mode 100644 index 145d529e4..000000000 --- a/data/hfopenllm_v2/Enno-Ai/EnnoAi-Pro-Llama-3.1-8B-v0.9/0418e36f-17ea-46a2-bfeb-91cc0ff719bf.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Enno-Ai_EnnoAi-Pro-Llama-3.1-8B-v0.9/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "EnnoAi-Pro-Llama-3.1-8B-v0.9", - "id": "Enno-Ai/EnnoAi-Pro-Llama-3.1-8B-v0.9", - "developer": "Enno-Ai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4689 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.416 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0378 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 
0.2659 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3832 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2596 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/EnnoAi/EnnoAi-7B-French-Instruct-202502/4f5ba3fc-694a-45b1-ae9d-2c7d33e41519.json b/data/hfopenllm_v2/EnnoAi/EnnoAi-7B-French-Instruct-202502/4f5ba3fc-694a-45b1-ae9d-2c7d33e41519.json deleted file mode 100644 index 28ac26d3c..000000000 --- a/data/hfopenllm_v2/EnnoAi/EnnoAi-7B-French-Instruct-202502/4f5ba3fc-694a-45b1-ae9d-2c7d33e41519.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/EnnoAi_EnnoAi-7B-French-Instruct-202502/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "EnnoAi-7B-French-Instruct-202502", - "id": "EnnoAi/EnnoAi-7B-French-Instruct-202502", - "developer": "EnnoAi", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 7.456 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5564 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5575 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3724 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2953 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.46 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4013 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/EnnoAi/EnnoAi-Pro-Llama-3.1-8B-v1.0/8b0d1556-bbd5-49e3-b881-32224bc1aa9a.json b/data/hfopenllm_v2/EnnoAi/EnnoAi-Pro-Llama-3.1-8B-v1.0/8b0d1556-bbd5-49e3-b881-32224bc1aa9a.json deleted file mode 100644 index 610b4bd7f..000000000 --- a/data/hfopenllm_v2/EnnoAi/EnnoAi-Pro-Llama-3.1-8B-v1.0/8b0d1556-bbd5-49e3-b881-32224bc1aa9a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/EnnoAi_EnnoAi-Pro-Llama-3.1-8B-v1.0/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "EnnoAi-Pro-Llama-3.1-8B-v1.0", - "id": "EnnoAi/EnnoAi-Pro-Llama-3.1-8B-v1.0", - "developer": "EnnoAi", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4704 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.416 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0378 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2659 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3832 - } - }, - { - 
"evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2596 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Epiculous/Azure_Dusk-v0.2/524e634f-280c-4f3a-9f1f-bdda19fad740.json b/data/hfopenllm_v2/Epiculous/Azure_Dusk-v0.2/524e634f-280c-4f3a-9f1f-bdda19fad740.json deleted file mode 100644 index db3809f7b..000000000 --- a/data/hfopenllm_v2/Epiculous/Azure_Dusk-v0.2/524e634f-280c-4f3a-9f1f-bdda19fad740.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Epiculous_Azure_Dusk-v0.2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Azure_Dusk-v0.2", - "id": "Epiculous/Azure_Dusk-v0.2", - "developer": "Epiculous", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3467 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.412 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0295 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2609 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3835 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3034 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Epiculous/Crimson_Dawn-v0.2/cb82e92b-f207-4fbd-9bfe-43184769cdbd.json b/data/hfopenllm_v2/Epiculous/Crimson_Dawn-v0.2/cb82e92b-f207-4fbd-9bfe-43184769cdbd.json deleted file mode 100644 index 0f0cfc70f..000000000 --- a/data/hfopenllm_v2/Epiculous/Crimson_Dawn-v0.2/cb82e92b-f207-4fbd-9bfe-43184769cdbd.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Epiculous_Crimson_Dawn-v0.2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Crimson_Dawn-v0.2", - "id": "Epiculous/Crimson_Dawn-v0.2", - "developer": "Epiculous", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3103 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4482 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0431 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.276 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4152 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2721 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Epiculous/NovaSpark/0b674103-4e55-41f4-accb-b7be73671801.json 
b/data/hfopenllm_v2/Epiculous/NovaSpark/0b674103-4e55-41f4-accb-b7be73671801.json deleted file mode 100644 index 74391a8d3..000000000 --- a/data/hfopenllm_v2/Epiculous/NovaSpark/0b674103-4e55-41f4-accb-b7be73671801.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Epiculous_NovaSpark/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "NovaSpark", - "id": "Epiculous/NovaSpark", - "developer": "Epiculous", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6408 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5064 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1518 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2978 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3882 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3649 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Epiculous/Violet_Twilight-v0.2/fa0290e0-723f-4502-90b6-c77007fffc1f.json b/data/hfopenllm_v2/Epiculous/Violet_Twilight-v0.2/fa0290e0-723f-4502-90b6-c77007fffc1f.json deleted file mode 100644 index c08d3f9d1..000000000 --- a/data/hfopenllm_v2/Epiculous/Violet_Twilight-v0.2/fa0290e0-723f-4502-90b6-c77007fffc1f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": 
"0.2.0", - "evaluation_id": "hfopenllm_v2/Epiculous_Violet_Twilight-v0.2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Violet_Twilight-v0.2", - "id": "Epiculous/Violet_Twilight-v0.2", - "developer": "Epiculous", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4532 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4615 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0287 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2659 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4299 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3111 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/EpistemeAI/Alpaca-Llama3.1-8B/c3827ecd-d02a-4464-a098-110f4fb54516.json b/data/hfopenllm_v2/EpistemeAI/Alpaca-Llama3.1-8B/c3827ecd-d02a-4464-a098-110f4fb54516.json deleted file mode 100644 index 5e88a9d80..000000000 --- a/data/hfopenllm_v2/EpistemeAI/Alpaca-Llama3.1-8B/c3827ecd-d02a-4464-a098-110f4fb54516.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/EpistemeAI_Alpaca-Llama3.1-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - 
"source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Alpaca-Llama3.1-8B", - "id": "EpistemeAI/Alpaca-Llama3.1-8B", - "developer": "EpistemeAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1599 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4755 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0506 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2903 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3403 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3246 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/EpistemeAI/Athena-gemma-2-2b-it-Philos/af9700fe-20c0-4b7c-9f3a-c4d78fab7911.json b/data/hfopenllm_v2/EpistemeAI/Athena-gemma-2-2b-it-Philos/af9700fe-20c0-4b7c-9f3a-c4d78fab7911.json deleted file mode 100644 index 109e4335c..000000000 --- a/data/hfopenllm_v2/EpistemeAI/Athena-gemma-2-2b-it-Philos/af9700fe-20c0-4b7c-9f3a-c4d78fab7911.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/EpistemeAI_Athena-gemma-2-2b-it-Philos/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Athena-gemma-2-2b-it-Philos", - "id": "EpistemeAI/Athena-gemma-2-2b-it-Philos", - "developer": 
"EpistemeAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 2.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4621 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3795 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.037 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.281 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4314 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2248 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/EpistemeAI/Athena-gemma-2-2b-it/959a4e4d-211c-4e45-94f1-f8f877e0b36f.json b/data/hfopenllm_v2/EpistemeAI/Athena-gemma-2-2b-it/959a4e4d-211c-4e45-94f1-f8f877e0b36f.json deleted file mode 100644 index 6c1214fb7..000000000 --- a/data/hfopenllm_v2/EpistemeAI/Athena-gemma-2-2b-it/959a4e4d-211c-4e45-94f1-f8f877e0b36f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/EpistemeAI_Athena-gemma-2-2b-it/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Athena-gemma-2-2b-it", - "id": "EpistemeAI/Athena-gemma-2-2b-it", - "developer": "EpistemeAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 2.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - 
"source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3134 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4264 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0491 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2685 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4351 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2422 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/EpistemeAI/Athene-codegemma-2-7b-it-alpaca-v1.3/96a8b3c0-d6bc-41fe-8967-0d798669aa8e.json b/data/hfopenllm_v2/EpistemeAI/Athene-codegemma-2-7b-it-alpaca-v1.3/96a8b3c0-d6bc-41fe-8967-0d798669aa8e.json deleted file mode 100644 index 18bd5723e..000000000 --- a/data/hfopenllm_v2/EpistemeAI/Athene-codegemma-2-7b-it-alpaca-v1.3/96a8b3c0-d6bc-41fe-8967-0d798669aa8e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/EpistemeAI_Athene-codegemma-2-7b-it-alpaca-v1.3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Athene-codegemma-2-7b-it-alpaca-v1.3", - "id": "EpistemeAI/Athene-codegemma-2-7b-it-alpaca-v1.3", - "developer": "EpistemeAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "GemmaForCausalLM", - "params_billions": 7.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.403 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4332 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0619 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2802 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4503 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2587 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/EpistemeAI/DeepPhi-3.5-mini-instruct/ed5d2ca8-d551-493d-8877-348204ef91cc.json b/data/hfopenllm_v2/EpistemeAI/DeepPhi-3.5-mini-instruct/ed5d2ca8-d551-493d-8877-348204ef91cc.json deleted file mode 100644 index 378a472d1..000000000 --- a/data/hfopenllm_v2/EpistemeAI/DeepPhi-3.5-mini-instruct/ed5d2ca8-d551-493d-8877-348204ef91cc.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/EpistemeAI_DeepPhi-3.5-mini-instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "DeepPhi-3.5-mini-instruct", - "id": "EpistemeAI/DeepPhi-3.5-mini-instruct", - "developer": "EpistemeAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.821 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1326 - } - }, - { - 
"evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2882 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0068 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2332 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3656 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1103 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/EpistemeAI/DeepThinkers-Phi4/04e20a14-8346-4801-8515-189861c857cb.json b/data/hfopenllm_v2/EpistemeAI/DeepThinkers-Phi4/04e20a14-8346-4801-8515-189861c857cb.json deleted file mode 100644 index f41ea35a4..000000000 --- a/data/hfopenllm_v2/EpistemeAI/DeepThinkers-Phi4/04e20a14-8346-4801-8515-189861c857cb.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/EpistemeAI_DeepThinkers-Phi4/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "DeepThinkers-Phi4", - "id": "EpistemeAI/DeepThinkers-Phi4", - "developer": "EpistemeAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.694 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.679 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4585 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3406 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3981 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5258 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/EpistemeAI/FineLlama3.1-8B-Instruct/eec2da56-ba0a-418f-afe1-8a46882b9839.json b/data/hfopenllm_v2/EpistemeAI/FineLlama3.1-8B-Instruct/eec2da56-ba0a-418f-afe1-8a46882b9839.json deleted file mode 100644 index 7a0548790..000000000 --- a/data/hfopenllm_v2/EpistemeAI/FineLlama3.1-8B-Instruct/eec2da56-ba0a-418f-afe1-8a46882b9839.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/EpistemeAI_FineLlama3.1-8B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "FineLlama3.1-8B-Instruct", - "id": "EpistemeAI/FineLlama3.1-8B-Instruct", - "developer": "EpistemeAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "4bit", - "architecture": "?", - "params_billions": 14.483 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.08 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4557 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - 
"hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0347 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2802 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3482 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3113 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/EpistemeAI/Fireball-12B-v1.13a-philosophers/321cf68b-9220-4ada-89da-061341a20a9d.json b/data/hfopenllm_v2/EpistemeAI/Fireball-12B-v1.13a-philosophers/321cf68b-9220-4ada-89da-061341a20a9d.json deleted file mode 100644 index 2dc46794a..000000000 --- a/data/hfopenllm_v2/EpistemeAI/Fireball-12B-v1.13a-philosophers/321cf68b-9220-4ada-89da-061341a20a9d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/EpistemeAI_Fireball-12B-v1.13a-philosophers/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Fireball-12B-v1.13a-philosophers", - "id": "EpistemeAI/Fireball-12B-v1.13a-philosophers", - "developer": "EpistemeAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0876 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5103 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0461 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3012 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4081 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3367 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/EpistemeAI/Fireball-12B/86fda025-2345-4a40-9094-223b96b21f13.json b/data/hfopenllm_v2/EpistemeAI/Fireball-12B/86fda025-2345-4a40-9094-223b96b21f13.json deleted file mode 100644 index 0d7c7fd33..000000000 --- a/data/hfopenllm_v2/EpistemeAI/Fireball-12B/86fda025-2345-4a40-9094-223b96b21f13.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/EpistemeAI_Fireball-12B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Fireball-12B", - "id": "EpistemeAI/Fireball-12B", - "developer": "EpistemeAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1834 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5111 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0408 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2617 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4236 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3344 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/EpistemeAI/Fireball-Alpaca-Llama-3.1-8B-Philos-DPO-200/3c734233-9868-4ba6-83c0-2b63f2ce8980.json b/data/hfopenllm_v2/EpistemeAI/Fireball-Alpaca-Llama-3.1-8B-Philos-DPO-200/3c734233-9868-4ba6-83c0-2b63f2ce8980.json deleted file mode 100644 index 185221b63..000000000 --- a/data/hfopenllm_v2/EpistemeAI/Fireball-Alpaca-Llama-3.1-8B-Philos-DPO-200/3c734233-9868-4ba6-83c0-2b63f2ce8980.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/EpistemeAI_Fireball-Alpaca-Llama-3.1-8B-Philos-DPO-200/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Fireball-Alpaca-Llama-3.1-8B-Philos-DPO-200", - "id": "EpistemeAI/Fireball-Alpaca-Llama-3.1-8B-Philos-DPO-200", - "developer": "EpistemeAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4577 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4838 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1231 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3003 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3945 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3583 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/EpistemeAI/Fireball-Alpaca-Llama3.1.07-8B-Philos-Math-KTO-beta/7f5eca48-0ab9-4ef2-85c2-a7f1fe713afe.json b/data/hfopenllm_v2/EpistemeAI/Fireball-Alpaca-Llama3.1.07-8B-Philos-Math-KTO-beta/7f5eca48-0ab9-4ef2-85c2-a7f1fe713afe.json deleted file mode 100644 index a5c0fc181..000000000 --- a/data/hfopenllm_v2/EpistemeAI/Fireball-Alpaca-Llama3.1.07-8B-Philos-Math-KTO-beta/7f5eca48-0ab9-4ef2-85c2-a7f1fe713afe.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/EpistemeAI_Fireball-Alpaca-Llama3.1.07-8B-Philos-Math-KTO-beta/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Fireball-Alpaca-Llama3.1.07-8B-Philos-Math-KTO-beta", - "id": "EpistemeAI/Fireball-Alpaca-Llama3.1.07-8B-Philos-Math-KTO-beta", - "developer": "EpistemeAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7274 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4865 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1526 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.2802 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3619 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3543 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/EpistemeAI/Fireball-Alpaca-Llama3.1.08-8B-Philos-C-R2/f5e0e809-08b8-43dd-a44d-875f365610c3.json b/data/hfopenllm_v2/EpistemeAI/Fireball-Alpaca-Llama3.1.08-8B-Philos-C-R2/f5e0e809-08b8-43dd-a44d-875f365610c3.json deleted file mode 100644 index 5554b76ed..000000000 --- a/data/hfopenllm_v2/EpistemeAI/Fireball-Alpaca-Llama3.1.08-8B-Philos-C-R2/f5e0e809-08b8-43dd-a44d-875f365610c3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/EpistemeAI_Fireball-Alpaca-Llama3.1.08-8B-Philos-C-R2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Fireball-Alpaca-Llama3.1.08-8B-Philos-C-R2", - "id": "EpistemeAI/Fireball-Alpaca-Llama3.1.08-8B-Philos-C-R2", - "developer": "EpistemeAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4673 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4932 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1239 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2861 - } - }, - { - "evaluation_name": "MUSR", - 
"source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4624 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3352 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/EpistemeAI/Fireball-Meta-Llama-3.1-8B-Instruct-0.001-128K-auto/8d267135-a7e6-4ec5-ae09-66478804bb66.json b/data/hfopenllm_v2/EpistemeAI/Fireball-Meta-Llama-3.1-8B-Instruct-0.001-128K-auto/8d267135-a7e6-4ec5-ae09-66478804bb66.json deleted file mode 100644 index e11cd048a..000000000 --- a/data/hfopenllm_v2/EpistemeAI/Fireball-Meta-Llama-3.1-8B-Instruct-0.001-128K-auto/8d267135-a7e6-4ec5-ae09-66478804bb66.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/EpistemeAI_Fireball-Meta-Llama-3.1-8B-Instruct-0.001-128K-auto/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Fireball-Meta-Llama-3.1-8B-Instruct-0.001-128K-auto", - "id": "EpistemeAI/Fireball-Meta-Llama-3.1-8B-Instruct-0.001-128K-auto", - "developer": "EpistemeAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4432 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4824 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1329 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3121 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - 
"source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4066 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3516 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/EpistemeAI/Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.003-128K-code-ds-auto/4940ed0e-2c1e-4408-9806-49ceed30a69e.json b/data/hfopenllm_v2/EpistemeAI/Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.003-128K-code-ds-auto/4940ed0e-2c1e-4408-9806-49ceed30a69e.json deleted file mode 100644 index afd2c84cc..000000000 --- a/data/hfopenllm_v2/EpistemeAI/Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.003-128K-code-ds-auto/4940ed0e-2c1e-4408-9806-49ceed30a69e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/EpistemeAI_Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.003-128K-code-ds-auto/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.003-128K-code-ds-auto", - "id": "EpistemeAI/Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.003-128K-code-ds-auto", - "developer": "EpistemeAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7305 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4649 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1397 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2659 - } - }, - { - "evaluation_name": "MUSR", - 
"source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3209 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.348 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/EpistemeAI/Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.003-128K-code-ds-auto/5f6f7b7c-ef6a-4468-aae5-d7dfc25c5659.json b/data/hfopenllm_v2/EpistemeAI/Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.003-128K-code-ds-auto/5f6f7b7c-ef6a-4468-aae5-d7dfc25c5659.json deleted file mode 100644 index df9550996..000000000 --- a/data/hfopenllm_v2/EpistemeAI/Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.003-128K-code-ds-auto/5f6f7b7c-ef6a-4468-aae5-d7dfc25c5659.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/EpistemeAI_Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.003-128K-code-ds-auto/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.003-128K-code-ds-auto", - "id": "EpistemeAI/Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.003-128K-code-ds-auto", - "developer": "EpistemeAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7207 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.461 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1314 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2701 - } - }, - { 
- "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3432 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3354 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/EpistemeAI/Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.003-128K-code-ds/5244ee3c-7d65-434a-acfe-cdb277ff5264.json b/data/hfopenllm_v2/EpistemeAI/Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.003-128K-code-ds/5244ee3c-7d65-434a-acfe-cdb277ff5264.json deleted file mode 100644 index 1b365ab87..000000000 --- a/data/hfopenllm_v2/EpistemeAI/Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.003-128K-code-ds/5244ee3c-7d65-434a-acfe-cdb277ff5264.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/EpistemeAI_Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.003-128K-code-ds/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.003-128K-code-ds", - "id": "EpistemeAI/Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.003-128K-code-ds", - "developer": "EpistemeAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6691 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4668 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1337 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2727 - } - }, - 
{ - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3418 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3389 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/EpistemeAI/Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.003-128K-code/eba4644f-d455-4a23-a16f-8ecb038ffe7f.json b/data/hfopenllm_v2/EpistemeAI/Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.003-128K-code/eba4644f-d455-4a23-a16f-8ecb038ffe7f.json deleted file mode 100644 index 647383065..000000000 --- a/data/hfopenllm_v2/EpistemeAI/Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.003-128K-code/eba4644f-d455-4a23-a16f-8ecb038ffe7f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/EpistemeAI_Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.003-128K-code/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.003-128K-code", - "id": "EpistemeAI/Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.003-128K-code", - "developer": "EpistemeAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5975 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4904 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1337 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.302 - } - }, - { - 
"evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.401 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3423 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/EpistemeAI/Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.003-128K/fb270319-7010-4946-b60c-409aebe41aaa.json b/data/hfopenllm_v2/EpistemeAI/Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.003-128K/fb270319-7010-4946-b60c-409aebe41aaa.json deleted file mode 100644 index 19697de71..000000000 --- a/data/hfopenllm_v2/EpistemeAI/Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.003-128K/fb270319-7010-4946-b60c-409aebe41aaa.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/EpistemeAI_Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.003-128K/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.003-128K", - "id": "EpistemeAI/Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.003-128K", - "developer": "EpistemeAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4457 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4897 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1208 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2945 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - 
"dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3762 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3543 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/EpistemeAI/Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.004-128K-code-COT/d57bd77a-11cc-497c-b0bb-31c1ffa63dc2.json b/data/hfopenllm_v2/EpistemeAI/Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.004-128K-code-COT/d57bd77a-11cc-497c-b0bb-31c1ffa63dc2.json deleted file mode 100644 index b052f7972..000000000 --- a/data/hfopenllm_v2/EpistemeAI/Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.004-128K-code-COT/d57bd77a-11cc-497c-b0bb-31c1ffa63dc2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/EpistemeAI_Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.004-128K-code-COT/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.004-128K-code-COT", - "id": "EpistemeAI/Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.004-128K-code-COT", - "developer": "EpistemeAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4578 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4761 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1382 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2936 - } - }, - { - "evaluation_name": "MUSR", - 
"source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3881 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3471 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/EpistemeAI/Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.004-128K-code-ds-auto/0220984e-fe8c-4e72-bc3e-92b949ffe769.json b/data/hfopenllm_v2/EpistemeAI/Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.004-128K-code-ds-auto/0220984e-fe8c-4e72-bc3e-92b949ffe769.json deleted file mode 100644 index f8d75adfa..000000000 --- a/data/hfopenllm_v2/EpistemeAI/Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.004-128K-code-ds-auto/0220984e-fe8c-4e72-bc3e-92b949ffe769.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/EpistemeAI_Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.004-128K-code-ds-auto/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.004-128K-code-ds-auto", - "id": "EpistemeAI/Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.004-128K-code-ds-auto", - "developer": "EpistemeAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7205 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4818 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1435 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2483 - } - }, - 
{ - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.33 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3548 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/EpistemeAI/Fireball-Meta-Llama-3.1-8B-Instruct-Math/16482634-ec03-463a-9deb-2230ee955800.json b/data/hfopenllm_v2/EpistemeAI/Fireball-Meta-Llama-3.1-8B-Instruct-Math/16482634-ec03-463a-9deb-2230ee955800.json deleted file mode 100644 index af0d2a1e2..000000000 --- a/data/hfopenllm_v2/EpistemeAI/Fireball-Meta-Llama-3.1-8B-Instruct-Math/16482634-ec03-463a-9deb-2230ee955800.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/EpistemeAI_Fireball-Meta-Llama-3.1-8B-Instruct-Math/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Fireball-Meta-Llama-3.1-8B-Instruct-Math", - "id": "EpistemeAI/Fireball-Meta-Llama-3.1-8B-Instruct-Math", - "developer": "EpistemeAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4623 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4983 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.108 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2911 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": 
"TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3641 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3331 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/EpistemeAI/Fireball-Meta-Llama-3.2-8B-Instruct-agent-003-128k-code-DPO/4c1db32d-96fc-4a66-b083-530a3e75ad6d.json b/data/hfopenllm_v2/EpistemeAI/Fireball-Meta-Llama-3.2-8B-Instruct-agent-003-128k-code-DPO/4c1db32d-96fc-4a66-b083-530a3e75ad6d.json deleted file mode 100644 index a7e78720e..000000000 --- a/data/hfopenllm_v2/EpistemeAI/Fireball-Meta-Llama-3.2-8B-Instruct-agent-003-128k-code-DPO/4c1db32d-96fc-4a66-b083-530a3e75ad6d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/EpistemeAI_Fireball-Meta-Llama-3.2-8B-Instruct-agent-003-128k-code-DPO/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Fireball-Meta-Llama-3.2-8B-Instruct-agent-003-128k-code-DPO", - "id": "EpistemeAI/Fireball-Meta-Llama-3.2-8B-Instruct-agent-003-128k-code-DPO", - "developer": "EpistemeAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4611 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4801 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1254 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3003 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": 
"TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3998 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3521 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/EpistemeAI/Fireball-Mistral-Nemo-Base-2407-v1-DPO2/c0c5c846-395a-47ac-9e8e-e598939f317d.json b/data/hfopenllm_v2/EpistemeAI/Fireball-Mistral-Nemo-Base-2407-v1-DPO2/c0c5c846-395a-47ac-9e8e-e598939f317d.json deleted file mode 100644 index d0ad3364c..000000000 --- a/data/hfopenllm_v2/EpistemeAI/Fireball-Mistral-Nemo-Base-2407-v1-DPO2/c0c5c846-395a-47ac-9e8e-e598939f317d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/EpistemeAI_Fireball-Mistral-Nemo-Base-2407-v1-DPO2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Fireball-Mistral-Nemo-Base-2407-v1-DPO2", - "id": "EpistemeAI/Fireball-Mistral-Nemo-Base-2407-v1-DPO2", - "developer": "EpistemeAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1861 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4968 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0363 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2919 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, 
- "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.404 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3353 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/EpistemeAI/Fireball-R1-Llama-3.1-8B-Medical-COT/6b3f6b59-a8eb-48c2-acbc-92e8f34b2dd6.json b/data/hfopenllm_v2/EpistemeAI/Fireball-R1-Llama-3.1-8B-Medical-COT/6b3f6b59-a8eb-48c2-acbc-92e8f34b2dd6.json deleted file mode 100644 index 4e38880fc..000000000 --- a/data/hfopenllm_v2/EpistemeAI/Fireball-R1-Llama-3.1-8B-Medical-COT/6b3f6b59-a8eb-48c2-acbc-92e8f34b2dd6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/EpistemeAI_Fireball-R1-Llama-3.1-8B-Medical-COT/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Fireball-R1-Llama-3.1-8B-Medical-COT", - "id": "EpistemeAI/Fireball-R1-Llama-3.1-8B-Medical-COT", - "developer": "EpistemeAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3216 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3716 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.327 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2743 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3114 - } - }, - { - 
"evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1402 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/EpistemeAI/Fireball-R1-Llama-3.1-8B/d017e3bf-2abe-4b84-810e-e0eaf973adc3.json b/data/hfopenllm_v2/EpistemeAI/Fireball-R1-Llama-3.1-8B/d017e3bf-2abe-4b84-810e-e0eaf973adc3.json deleted file mode 100644 index 694f4c5d2..000000000 --- a/data/hfopenllm_v2/EpistemeAI/Fireball-R1-Llama-3.1-8B/d017e3bf-2abe-4b84-810e-e0eaf973adc3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/EpistemeAI_Fireball-R1-Llama-3.1-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Fireball-R1-Llama-3.1-8B", - "id": "EpistemeAI/Fireball-R1-Llama-3.1-8B", - "developer": "EpistemeAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4427 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3643 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3112 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2483 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3288 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on 
MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1115 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/EpistemeAI/Fireball-R1.1-Llama-3.1-8B/62a3ecb8-f6d1-429c-807f-5545b2a5897f.json b/data/hfopenllm_v2/EpistemeAI/Fireball-R1.1-Llama-3.1-8B/62a3ecb8-f6d1-429c-807f-5545b2a5897f.json deleted file mode 100644 index 81219a0cf..000000000 --- a/data/hfopenllm_v2/EpistemeAI/Fireball-R1.1-Llama-3.1-8B/62a3ecb8-f6d1-429c-807f-5545b2a5897f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/EpistemeAI_Fireball-R1.1-Llama-3.1-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Fireball-R1.1-Llama-3.1-8B", - "id": "EpistemeAI/Fireball-R1.1-Llama-3.1-8B", - "developer": "EpistemeAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3676 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3326 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1382 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2517 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3419 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1115 - } - } - ] -} \ No newline at end of file diff 
--git a/data/hfopenllm_v2/EpistemeAI/Llama-3.2-3B-Agent007-Coder/748557ce-1a49-4b3a-9c38-9007dc04aafb.json b/data/hfopenllm_v2/EpistemeAI/Llama-3.2-3B-Agent007-Coder/748557ce-1a49-4b3a-9c38-9007dc04aafb.json deleted file mode 100644 index c5b57ce3d..000000000 --- a/data/hfopenllm_v2/EpistemeAI/Llama-3.2-3B-Agent007-Coder/748557ce-1a49-4b3a-9c38-9007dc04aafb.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/EpistemeAI_Llama-3.2-3B-Agent007-Coder/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.2-3B-Agent007-Coder", - "id": "EpistemeAI/Llama-3.2-3B-Agent007-Coder", - "developer": "EpistemeAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.54 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4304 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.111 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2576 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3668 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2852 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/EpistemeAI/Mistral-Nemo-Instruct-12B-Philosophy-Math/95d43d01-a75e-4af4-a2cc-b60f832071d3.json 
b/data/hfopenllm_v2/EpistemeAI/Mistral-Nemo-Instruct-12B-Philosophy-Math/95d43d01-a75e-4af4-a2cc-b60f832071d3.json deleted file mode 100644 index 8cad61e00..000000000 --- a/data/hfopenllm_v2/EpistemeAI/Mistral-Nemo-Instruct-12B-Philosophy-Math/95d43d01-a75e-4af4-a2cc-b60f832071d3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/EpistemeAI_Mistral-Nemo-Instruct-12B-Philosophy-Math/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral-Nemo-Instruct-12B-Philosophy-Math", - "id": "EpistemeAI/Mistral-Nemo-Instruct-12B-Philosophy-Math", - "developer": "EpistemeAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0695 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5365 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0959 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3314 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4292 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3296 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/EpistemeAI/OpenReasoner-Llama-3.2-3B-rs1.0/4dc7c889-7839-4047-b48c-33be5b688e72.json b/data/hfopenllm_v2/EpistemeAI/OpenReasoner-Llama-3.2-3B-rs1.0/4dc7c889-7839-4047-b48c-33be5b688e72.json deleted file 
mode 100644 index 62e0df3f5..000000000 --- a/data/hfopenllm_v2/EpistemeAI/OpenReasoner-Llama-3.2-3B-rs1.0/4dc7c889-7839-4047-b48c-33be5b688e72.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/EpistemeAI_OpenReasoner-Llama-3.2-3B-rs1.0/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "OpenReasoner-Llama-3.2-3B-rs1.0", - "id": "EpistemeAI/OpenReasoner-Llama-3.2-3B-rs1.0", - "developer": "EpistemeAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7274 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4519 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1344 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2718 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3461 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3134 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/EpistemeAI/Polypsyche-Llama-3.1-8B-Instruct-Agent-0.003-128K-code-ds-auto-Empathy/751851c8-9a7f-4135-a106-eab4efbd0734.json b/data/hfopenllm_v2/EpistemeAI/Polypsyche-Llama-3.1-8B-Instruct-Agent-0.003-128K-code-ds-auto-Empathy/751851c8-9a7f-4135-a106-eab4efbd0734.json deleted file mode 100644 index afaf74d49..000000000 --- 
a/data/hfopenllm_v2/EpistemeAI/Polypsyche-Llama-3.1-8B-Instruct-Agent-0.003-128K-code-ds-auto-Empathy/751851c8-9a7f-4135-a106-eab4efbd0734.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/EpistemeAI_Polypsyche-Llama-3.1-8B-Instruct-Agent-0.003-128K-code-ds-auto-Empathy/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Polypsyche-Llama-3.1-8B-Instruct-Agent-0.003-128K-code-ds-auto-Empathy", - "id": "EpistemeAI/Polypsyche-Llama-3.1-8B-Instruct-Agent-0.003-128K-code-ds-auto-Empathy", - "developer": "EpistemeAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7101 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4628 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1397 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2768 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3195 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3311 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/EpistemeAI/Polypsyche-Llama-3.1-8B-Instruct-Agent-0.003-128K-code-ds-auto-Logic/2930e30c-9f2e-4248-ae3b-ed7ffbd12f8c.json 
b/data/hfopenllm_v2/EpistemeAI/Polypsyche-Llama-3.1-8B-Instruct-Agent-0.003-128K-code-ds-auto-Logic/2930e30c-9f2e-4248-ae3b-ed7ffbd12f8c.json deleted file mode 100644 index 9a4c900d1..000000000 --- a/data/hfopenllm_v2/EpistemeAI/Polypsyche-Llama-3.1-8B-Instruct-Agent-0.003-128K-code-ds-auto-Logic/2930e30c-9f2e-4248-ae3b-ed7ffbd12f8c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/EpistemeAI_Polypsyche-Llama-3.1-8B-Instruct-Agent-0.003-128K-code-ds-auto-Logic/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Polypsyche-Llama-3.1-8B-Instruct-Agent-0.003-128K-code-ds-auto-Logic", - "id": "EpistemeAI/Polypsyche-Llama-3.1-8B-Instruct-Agent-0.003-128K-code-ds-auto-Logic", - "developer": "EpistemeAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7122 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4566 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1246 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2844 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3235 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.335 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/EpistemeAI/Polypsyche-Llama-3.1-8B-Instruct-Agent-0.003-128K-code-ds-auto-divergent/c1acc460-aeb8-4a99-8ca5-376ab60fb74a.json b/data/hfopenllm_v2/EpistemeAI/Polypsyche-Llama-3.1-8B-Instruct-Agent-0.003-128K-code-ds-auto-divergent/c1acc460-aeb8-4a99-8ca5-376ab60fb74a.json deleted file mode 100644 index f3d4c197b..000000000 --- a/data/hfopenllm_v2/EpistemeAI/Polypsyche-Llama-3.1-8B-Instruct-Agent-0.003-128K-code-ds-auto-divergent/c1acc460-aeb8-4a99-8ca5-376ab60fb74a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/EpistemeAI_Polypsyche-Llama-3.1-8B-Instruct-Agent-0.003-128K-code-ds-auto-divergent/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Polypsyche-Llama-3.1-8B-Instruct-Agent-0.003-128K-code-ds-auto-divergent", - "id": "EpistemeAI/Polypsyche-Llama-3.1-8B-Instruct-Agent-0.003-128K-code-ds-auto-divergent", - "developer": "EpistemeAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6915 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4525 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1292 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2668 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3578 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": 
{ - "score": 0.329 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/EpistemeAI/Reasoning-Llama-3.1-CoT-RE1-NMT-V2-ORPO/33b8b64f-7da5-45aa-bf80-7145ef704229.json b/data/hfopenllm_v2/EpistemeAI/Reasoning-Llama-3.1-CoT-RE1-NMT-V2-ORPO/33b8b64f-7da5-45aa-bf80-7145ef704229.json deleted file mode 100644 index 40ecf34e1..000000000 --- a/data/hfopenllm_v2/EpistemeAI/Reasoning-Llama-3.1-CoT-RE1-NMT-V2-ORPO/33b8b64f-7da5-45aa-bf80-7145ef704229.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/EpistemeAI_Reasoning-Llama-3.1-CoT-RE1-NMT-V2-ORPO/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Reasoning-Llama-3.1-CoT-RE1-NMT-V2-ORPO", - "id": "EpistemeAI/Reasoning-Llama-3.1-CoT-RE1-NMT-V2-ORPO", - "developer": "EpistemeAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4553 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4804 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1292 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.307 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3931 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3598 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/EpistemeAI/Reasoning-Llama-3.1-CoT-RE1-NMT/2662d257-49e2-430d-b44f-b0b347c61271.json b/data/hfopenllm_v2/EpistemeAI/Reasoning-Llama-3.1-CoT-RE1-NMT/2662d257-49e2-430d-b44f-b0b347c61271.json deleted file mode 100644 index c9e3844b1..000000000 --- a/data/hfopenllm_v2/EpistemeAI/Reasoning-Llama-3.1-CoT-RE1-NMT/2662d257-49e2-430d-b44f-b0b347c61271.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/EpistemeAI_Reasoning-Llama-3.1-CoT-RE1-NMT/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Reasoning-Llama-3.1-CoT-RE1-NMT", - "id": "EpistemeAI/Reasoning-Llama-3.1-CoT-RE1-NMT", - "developer": "EpistemeAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4829 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4736 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1299 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2609 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3182 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3343 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/EpistemeAI/Reasoning-Llama-3.2-1B-Instruct-v1.2/870b639b-ee7a-4b13-872b-52657539c836.json 
b/data/hfopenllm_v2/EpistemeAI/Reasoning-Llama-3.2-1B-Instruct-v1.2/870b639b-ee7a-4b13-872b-52657539c836.json deleted file mode 100644 index 2e9d69e3c..000000000 --- a/data/hfopenllm_v2/EpistemeAI/Reasoning-Llama-3.2-1B-Instruct-v1.2/870b639b-ee7a-4b13-872b-52657539c836.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/EpistemeAI_Reasoning-Llama-3.2-1B-Instruct-v1.2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Reasoning-Llama-3.2-1B-Instruct-v1.2", - "id": "EpistemeAI/Reasoning-Llama-3.2-1B-Instruct-v1.2", - "developer": "EpistemeAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.236 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4087 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3324 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0506 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2609 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3222 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1179 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/EpistemeAI/Reasoning-Llama-3.2-1B-Instruct-v1.3/6ff20678-a335-4fa8-8126-9f96ce247f34.json b/data/hfopenllm_v2/EpistemeAI/Reasoning-Llama-3.2-1B-Instruct-v1.3/6ff20678-a335-4fa8-8126-9f96ce247f34.json deleted file mode 100644 index 
a23d5fb0b..000000000 --- a/data/hfopenllm_v2/EpistemeAI/Reasoning-Llama-3.2-1B-Instruct-v1.3/6ff20678-a335-4fa8-8126-9f96ce247f34.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/EpistemeAI_Reasoning-Llama-3.2-1B-Instruct-v1.3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Reasoning-Llama-3.2-1B-Instruct-v1.3", - "id": "EpistemeAI/Reasoning-Llama-3.2-1B-Instruct-v1.3", - "developer": "EpistemeAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.236 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3273 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3263 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0506 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2584 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.326 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1173 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/EpistemeAI/Reasoning-Llama-3.2-3B-Math-Instruct-RE1-ORPO/19c4ea89-896a-4577-a386-c2470eaf743f.json b/data/hfopenllm_v2/EpistemeAI/Reasoning-Llama-3.2-3B-Math-Instruct-RE1-ORPO/19c4ea89-896a-4577-a386-c2470eaf743f.json deleted file mode 100644 index 3b21e4f59..000000000 --- 
a/data/hfopenllm_v2/EpistemeAI/Reasoning-Llama-3.2-3B-Math-Instruct-RE1-ORPO/19c4ea89-896a-4577-a386-c2470eaf743f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/EpistemeAI_Reasoning-Llama-3.2-3B-Math-Instruct-RE1-ORPO/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Reasoning-Llama-3.2-3B-Math-Instruct-RE1-ORPO", - "id": "EpistemeAI/Reasoning-Llama-3.2-3B-Math-Instruct-RE1-ORPO", - "developer": "EpistemeAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.729 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4518 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1533 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2735 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3487 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.31 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/EpistemeAI/Reasoning-Llama-3.2-3B-Math-Instruct-RE1/22eb2479-16ff-4a56-b9e4-e8835da7ca0e.json b/data/hfopenllm_v2/EpistemeAI/Reasoning-Llama-3.2-3B-Math-Instruct-RE1/22eb2479-16ff-4a56-b9e4-e8835da7ca0e.json deleted file mode 100644 index ceb6ab2db..000000000 --- 
a/data/hfopenllm_v2/EpistemeAI/Reasoning-Llama-3.2-3B-Math-Instruct-RE1/22eb2479-16ff-4a56-b9e4-e8835da7ca0e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/EpistemeAI_Reasoning-Llama-3.2-3B-Math-Instruct-RE1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Reasoning-Llama-3.2-3B-Math-Instruct-RE1", - "id": "EpistemeAI/Reasoning-Llama-3.2-3B-Math-Instruct-RE1", - "developer": "EpistemeAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.512 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4381 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.108 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2643 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3435 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2789 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/EpistemeAI/ReasoningCore-1.0-3B-Instruct-r01-Reflect-Math/aca3f1fd-9c46-47f6-81c6-dc56a702c1de.json b/data/hfopenllm_v2/EpistemeAI/ReasoningCore-1.0-3B-Instruct-r01-Reflect-Math/aca3f1fd-9c46-47f6-81c6-dc56a702c1de.json deleted file mode 100644 index 3c0a3ed67..000000000 --- 
a/data/hfopenllm_v2/EpistemeAI/ReasoningCore-1.0-3B-Instruct-r01-Reflect-Math/aca3f1fd-9c46-47f6-81c6-dc56a702c1de.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/EpistemeAI_ReasoningCore-1.0-3B-Instruct-r01-Reflect-Math/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ReasoningCore-1.0-3B-Instruct-r01-Reflect-Math", - "id": "EpistemeAI/ReasoningCore-1.0-3B-Instruct-r01-Reflect-Math", - "developer": "EpistemeAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5903 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4364 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.148 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2601 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3314 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2823 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/EpistemeAI/ReasoningCore-3B-0/071ca686-5950-4af4-80f2-969b1008e370.json b/data/hfopenllm_v2/EpistemeAI/ReasoningCore-3B-0/071ca686-5950-4af4-80f2-969b1008e370.json deleted file mode 100644 index da61c8f6c..000000000 --- a/data/hfopenllm_v2/EpistemeAI/ReasoningCore-3B-0/071ca686-5950-4af4-80f2-969b1008e370.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - 
"schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/EpistemeAI_ReasoningCore-3B-0/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ReasoningCore-3B-0", - "id": "EpistemeAI/ReasoningCore-3B-0", - "developer": "EpistemeAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7341 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4446 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1586 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2727 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3554 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3172 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/EpistemeAI/ReasoningCore-3B-Instruct-r01-Reflect/78977c34-33f8-4037-86e0-dfce1d01c3f8.json b/data/hfopenllm_v2/EpistemeAI/ReasoningCore-3B-Instruct-r01-Reflect/78977c34-33f8-4037-86e0-dfce1d01c3f8.json deleted file mode 100644 index 99bcceb57..000000000 --- a/data/hfopenllm_v2/EpistemeAI/ReasoningCore-3B-Instruct-r01-Reflect/78977c34-33f8-4037-86e0-dfce1d01c3f8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/EpistemeAI_ReasoningCore-3B-Instruct-r01-Reflect/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - 
"source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ReasoningCore-3B-Instruct-r01-Reflect", - "id": "EpistemeAI/ReasoningCore-3B-Instruct-r01-Reflect", - "developer": "EpistemeAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7335 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.445 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1541 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2735 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3527 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3144 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/EpistemeAI/ReasoningCore-3B-R01/480e4294-c8d9-4088-9b8c-7a239d57f683.json b/data/hfopenllm_v2/EpistemeAI/ReasoningCore-3B-R01/480e4294-c8d9-4088-9b8c-7a239d57f683.json deleted file mode 100644 index 78ed2cc8e..000000000 --- a/data/hfopenllm_v2/EpistemeAI/ReasoningCore-3B-R01/480e4294-c8d9-4088-9b8c-7a239d57f683.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/EpistemeAI_ReasoningCore-3B-R01/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - 
"name": "ReasoningCore-3B-R01", - "id": "EpistemeAI/ReasoningCore-3B-R01", - "developer": "EpistemeAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2976 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4373 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1299 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2609 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3195 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2591 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/EpistemeAI/ReasoningCore-3B-RE1-V2/be9b21e8-90ce-451a-bcaf-2ebc7c72bc34.json b/data/hfopenllm_v2/EpistemeAI/ReasoningCore-3B-RE1-V2/be9b21e8-90ce-451a-bcaf-2ebc7c72bc34.json deleted file mode 100644 index a387e5176..000000000 --- a/data/hfopenllm_v2/EpistemeAI/ReasoningCore-3B-RE1-V2/be9b21e8-90ce-451a-bcaf-2ebc7c72bc34.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/EpistemeAI_ReasoningCore-3B-RE1-V2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ReasoningCore-3B-RE1-V2", - "id": "EpistemeAI/ReasoningCore-3B-RE1-V2", - "developer": "EpistemeAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": 
"LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7393 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4462 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1563 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2735 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3541 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3181 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/EpistemeAI/ReasoningCore-3B-RE1-V2A/b0054dd8-e62c-4d0c-9b18-090851c3a7e2.json b/data/hfopenllm_v2/EpistemeAI/ReasoningCore-3B-RE1-V2A/b0054dd8-e62c-4d0c-9b18-090851c3a7e2.json deleted file mode 100644 index 6f4873443..000000000 --- a/data/hfopenllm_v2/EpistemeAI/ReasoningCore-3B-RE1-V2A/b0054dd8-e62c-4d0c-9b18-090851c3a7e2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/EpistemeAI_ReasoningCore-3B-RE1-V2A/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ReasoningCore-3B-RE1-V2A", - "id": "EpistemeAI/ReasoningCore-3B-RE1-V2A", - "developer": "EpistemeAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": 
"google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5733 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.419 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0929 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2777 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3352 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2736 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/EpistemeAI/ReasoningCore-3B-RE1-V2B/985e479b-658a-4548-9b5e-c9c04b8838c1.json b/data/hfopenllm_v2/EpistemeAI/ReasoningCore-3B-RE1-V2B/985e479b-658a-4548-9b5e-c9c04b8838c1.json deleted file mode 100644 index 21879ed27..000000000 --- a/data/hfopenllm_v2/EpistemeAI/ReasoningCore-3B-RE1-V2B/985e479b-658a-4548-9b5e-c9c04b8838c1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/EpistemeAI_ReasoningCore-3B-RE1-V2B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ReasoningCore-3B-RE1-V2B", - "id": "EpistemeAI/ReasoningCore-3B-RE1-V2B", - "developer": "EpistemeAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": 
{ - "score": 0.5051 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4168 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1073 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2617 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3448 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2673 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/EpistemeAI/ReasoningCore-3B-RE1-V2C/d0ef8af4-156d-456d-9e33-b2cdb3f8c04e.json b/data/hfopenllm_v2/EpistemeAI/ReasoningCore-3B-RE1-V2C/d0ef8af4-156d-456d-9e33-b2cdb3f8c04e.json deleted file mode 100644 index abce8ee39..000000000 --- a/data/hfopenllm_v2/EpistemeAI/ReasoningCore-3B-RE1-V2C/d0ef8af4-156d-456d-9e33-b2cdb3f8c04e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/EpistemeAI_ReasoningCore-3B-RE1-V2C/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ReasoningCore-3B-RE1-V2C", - "id": "EpistemeAI/ReasoningCore-3B-RE1-V2C", - "developer": "EpistemeAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5057 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4177 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0974 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2609 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3422 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2691 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/EpistemeAI/ReasoningCore-3B-T1-V1/5050c787-2f95-4a17-a4b0-c094860627b5.json b/data/hfopenllm_v2/EpistemeAI/ReasoningCore-3B-T1-V1/5050c787-2f95-4a17-a4b0-c094860627b5.json deleted file mode 100644 index e1a337bdd..000000000 --- a/data/hfopenllm_v2/EpistemeAI/ReasoningCore-3B-T1-V1/5050c787-2f95-4a17-a4b0-c094860627b5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/EpistemeAI_ReasoningCore-3B-T1-V1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ReasoningCore-3B-T1-V1", - "id": "EpistemeAI/ReasoningCore-3B-T1-V1", - "developer": "EpistemeAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7208 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4517 - } - }, - { - "evaluation_name": 
"MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1458 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.276 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.354 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.312 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/EpistemeAI/ReasoningCore-3B-T1_1/bb5c8274-4324-47f2-94c5-d0c831ce0de7.json b/data/hfopenllm_v2/EpistemeAI/ReasoningCore-3B-T1_1/bb5c8274-4324-47f2-94c5-d0c831ce0de7.json deleted file mode 100644 index a3f8f7216..000000000 --- a/data/hfopenllm_v2/EpistemeAI/ReasoningCore-3B-T1_1/bb5c8274-4324-47f2-94c5-d0c831ce0de7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/EpistemeAI_ReasoningCore-3B-T1_1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ReasoningCore-3B-T1_1", - "id": "EpistemeAI/ReasoningCore-3B-T1_1", - "developer": "EpistemeAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7275 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4524 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 
5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1541 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.276 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3554 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3117 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/EpistemeAI2/Athene-codegemma-2-7b-it-alpaca-v1.2/8113a26a-5941-4f3d-872a-bdde5456ad97.json b/data/hfopenllm_v2/EpistemeAI2/Athene-codegemma-2-7b-it-alpaca-v1.2/8113a26a-5941-4f3d-872a-bdde5456ad97.json deleted file mode 100644 index 18f691dc4..000000000 --- a/data/hfopenllm_v2/EpistemeAI2/Athene-codegemma-2-7b-it-alpaca-v1.2/8113a26a-5941-4f3d-872a-bdde5456ad97.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/EpistemeAI2_Athene-codegemma-2-7b-it-alpaca-v1.2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Athene-codegemma-2-7b-it-alpaca-v1.2", - "id": "EpistemeAI2/Athene-codegemma-2-7b-it-alpaca-v1.2", - "developer": "EpistemeAI2", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "GemmaForCausalLM", - "params_billions": 7.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4351 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4175 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.0423 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.271 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.417 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2297 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/EpistemeAI2/Fireball-12B-v1.2/5b60047b-2e85-4a47-a31f-4c07f4bd2c30.json b/data/hfopenllm_v2/EpistemeAI2/Fireball-12B-v1.2/5b60047b-2e85-4a47-a31f-4c07f4bd2c30.json deleted file mode 100644 index 8a14100ea..000000000 --- a/data/hfopenllm_v2/EpistemeAI2/Fireball-12B-v1.2/5b60047b-2e85-4a47-a31f-4c07f4bd2c30.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/EpistemeAI2_Fireball-12B-v1.2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Fireball-12B-v1.2", - "id": "EpistemeAI2/Fireball-12B-v1.2", - "developer": "EpistemeAI2", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1355 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5019 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0415 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2987 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4173 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3337 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/EpistemeAI2/Fireball-Alpaca-Llama3.1-8B-Philos/88d79858-3a35-43eb-8da6-95b80b5deef6.json b/data/hfopenllm_v2/EpistemeAI2/Fireball-Alpaca-Llama3.1-8B-Philos/88d79858-3a35-43eb-8da6-95b80b5deef6.json deleted file mode 100644 index 1fef1a4b8..000000000 --- a/data/hfopenllm_v2/EpistemeAI2/Fireball-Alpaca-Llama3.1-8B-Philos/88d79858-3a35-43eb-8da6-95b80b5deef6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/EpistemeAI2_Fireball-Alpaca-Llama3.1-8B-Philos/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Fireball-Alpaca-Llama3.1-8B-Philos", - "id": "EpistemeAI2/Fireball-Alpaca-Llama3.1-8B-Philos", - "developer": "EpistemeAI2", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4986 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4978 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1186 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2928 - } - }, 
- { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4277 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3406 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/EpistemeAI2/Fireball-Alpaca-Llama3.1.01-8B-Philos/63266a49-01ea-40f1-83ef-778f391aff2b.json b/data/hfopenllm_v2/EpistemeAI2/Fireball-Alpaca-Llama3.1.01-8B-Philos/63266a49-01ea-40f1-83ef-778f391aff2b.json deleted file mode 100644 index 81024bca2..000000000 --- a/data/hfopenllm_v2/EpistemeAI2/Fireball-Alpaca-Llama3.1.01-8B-Philos/63266a49-01ea-40f1-83ef-778f391aff2b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/EpistemeAI2_Fireball-Alpaca-Llama3.1.01-8B-Philos/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Fireball-Alpaca-Llama3.1.01-8B-Philos", - "id": "EpistemeAI2/Fireball-Alpaca-Llama3.1.01-8B-Philos", - "developer": "EpistemeAI2", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4212 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4956 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.136 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2886 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": 
"TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4371 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3383 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/EpistemeAI2/Fireball-Alpaca-Llama3.1.03-8B-Philos/f0da069a-833f-489a-a923-c79542a3a9a6.json b/data/hfopenllm_v2/EpistemeAI2/Fireball-Alpaca-Llama3.1.03-8B-Philos/f0da069a-833f-489a-a923-c79542a3a9a6.json deleted file mode 100644 index b979e6ad4..000000000 --- a/data/hfopenllm_v2/EpistemeAI2/Fireball-Alpaca-Llama3.1.03-8B-Philos/f0da069a-833f-489a-a923-c79542a3a9a6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/EpistemeAI2_Fireball-Alpaca-Llama3.1.03-8B-Philos/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Fireball-Alpaca-Llama3.1.03-8B-Philos", - "id": "EpistemeAI2/Fireball-Alpaca-Llama3.1.03-8B-Philos", - "developer": "EpistemeAI2", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3881 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4951 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1284 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2785 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.428 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3355 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/EpistemeAI2/Fireball-Alpaca-Llama3.1.04-8B-Philos/205b9da8-d561-41ec-946e-1d2f9a43e437.json b/data/hfopenllm_v2/EpistemeAI2/Fireball-Alpaca-Llama3.1.04-8B-Philos/205b9da8-d561-41ec-946e-1d2f9a43e437.json deleted file mode 100644 index d4d82fcc6..000000000 --- a/data/hfopenllm_v2/EpistemeAI2/Fireball-Alpaca-Llama3.1.04-8B-Philos/205b9da8-d561-41ec-946e-1d2f9a43e437.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/EpistemeAI2_Fireball-Alpaca-Llama3.1.04-8B-Philos/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Fireball-Alpaca-Llama3.1.04-8B-Philos", - "id": "EpistemeAI2/Fireball-Alpaca-Llama3.1.04-8B-Philos", - "developer": "EpistemeAI2", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4084 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.493 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1201 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2903 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4372 - } - }, - { - 
"evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3403 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/EpistemeAI2/Fireball-Alpaca-Llama3.1.06-8B-Philos-dpo/2ea4da56-4b95-4222-a4e2-f57c73e0ee4e.json b/data/hfopenllm_v2/EpistemeAI2/Fireball-Alpaca-Llama3.1.06-8B-Philos-dpo/2ea4da56-4b95-4222-a4e2-f57c73e0ee4e.json deleted file mode 100644 index e5118311f..000000000 --- a/data/hfopenllm_v2/EpistemeAI2/Fireball-Alpaca-Llama3.1.06-8B-Philos-dpo/2ea4da56-4b95-4222-a4e2-f57c73e0ee4e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/EpistemeAI2_Fireball-Alpaca-Llama3.1.06-8B-Philos-dpo/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Fireball-Alpaca-Llama3.1.06-8B-Philos-dpo", - "id": "EpistemeAI2/Fireball-Alpaca-Llama3.1.06-8B-Philos-dpo", - "developer": "EpistemeAI2", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4866 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4881 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1307 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2978 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3932 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": 
"hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3615 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/EpistemeAI2/Fireball-Alpaca-Llama3.1.07-8B-Philos-Math/c086f693-cef1-4212-9c17-669b210f4caa.json b/data/hfopenllm_v2/EpistemeAI2/Fireball-Alpaca-Llama3.1.07-8B-Philos-Math/c086f693-cef1-4212-9c17-669b210f4caa.json deleted file mode 100644 index a1b7ae3fb..000000000 --- a/data/hfopenllm_v2/EpistemeAI2/Fireball-Alpaca-Llama3.1.07-8B-Philos-Math/c086f693-cef1-4212-9c17-669b210f4caa.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/EpistemeAI2_Fireball-Alpaca-Llama3.1.07-8B-Philos-Math/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Fireball-Alpaca-Llama3.1.07-8B-Philos-Math", - "id": "EpistemeAI2/Fireball-Alpaca-Llama3.1.07-8B-Philos-Math", - "developer": "EpistemeAI2", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5079 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4847 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1201 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2961 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4063 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3531 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/EpistemeAI2/Fireball-Alpaca-Llama3.1.08-8B-C-R1-KTO-Reflection/290995f2-9982-4f29-ac74-dc646905206c.json b/data/hfopenllm_v2/EpistemeAI2/Fireball-Alpaca-Llama3.1.08-8B-C-R1-KTO-Reflection/290995f2-9982-4f29-ac74-dc646905206c.json deleted file mode 100644 index 83634a0a2..000000000 --- a/data/hfopenllm_v2/EpistemeAI2/Fireball-Alpaca-Llama3.1.08-8B-C-R1-KTO-Reflection/290995f2-9982-4f29-ac74-dc646905206c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/EpistemeAI2_Fireball-Alpaca-Llama3.1.08-8B-C-R1-KTO-Reflection/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Fireball-Alpaca-Llama3.1.08-8B-C-R1-KTO-Reflection", - "id": "EpistemeAI2/Fireball-Alpaca-Llama3.1.08-8B-C-R1-KTO-Reflection", - "developer": "EpistemeAI2", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3952 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4955 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1246 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2995 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4048 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3593 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/EpistemeAI2/Fireball-Alpaca-Llama3.1.08-8B-Philos-C-R1/c60e65e6-d771-4c53-80d0-c1e09aa39377.json b/data/hfopenllm_v2/EpistemeAI2/Fireball-Alpaca-Llama3.1.08-8B-Philos-C-R1/c60e65e6-d771-4c53-80d0-c1e09aa39377.json deleted file mode 100644 index caf84f694..000000000 --- a/data/hfopenllm_v2/EpistemeAI2/Fireball-Alpaca-Llama3.1.08-8B-Philos-C-R1/c60e65e6-d771-4c53-80d0-c1e09aa39377.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/EpistemeAI2_Fireball-Alpaca-Llama3.1.08-8B-Philos-C-R1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Fireball-Alpaca-Llama3.1.08-8B-Philos-C-R1", - "id": "EpistemeAI2/Fireball-Alpaca-Llama3.1.08-8B-Philos-C-R1", - "developer": "EpistemeAI2", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5316 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4828 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1239 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.297 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4103 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.3523 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/EpistemeAI2/Fireball-Llama-3.1-8B-Philos-Reflection/fcff202d-3b4f-4ba9-b3f6-1122d8abcac1.json b/data/hfopenllm_v2/EpistemeAI2/Fireball-Llama-3.1-8B-Philos-Reflection/fcff202d-3b4f-4ba9-b3f6-1122d8abcac1.json deleted file mode 100644 index cbd4f8177..000000000 --- a/data/hfopenllm_v2/EpistemeAI2/Fireball-Llama-3.1-8B-Philos-Reflection/fcff202d-3b4f-4ba9-b3f6-1122d8abcac1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/EpistemeAI2_Fireball-Llama-3.1-8B-Philos-Reflection/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Fireball-Llama-3.1-8B-Philos-Reflection", - "id": "EpistemeAI2/Fireball-Llama-3.1-8B-Philos-Reflection", - "developer": "EpistemeAI2", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3596 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4898 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1284 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3079 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3957 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3551 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/EpistemeAI2/Fireball-MathMistral-Nemo-Base-2407-v2dpo/5f0fa37a-e829-402b-b2ab-c68ffa248b6e.json b/data/hfopenllm_v2/EpistemeAI2/Fireball-MathMistral-Nemo-Base-2407-v2dpo/5f0fa37a-e829-402b-b2ab-c68ffa248b6e.json deleted file mode 100644 index a3f2cd9b8..000000000 --- a/data/hfopenllm_v2/EpistemeAI2/Fireball-MathMistral-Nemo-Base-2407-v2dpo/5f0fa37a-e829-402b-b2ab-c68ffa248b6e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/EpistemeAI2_Fireball-MathMistral-Nemo-Base-2407-v2dpo/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Fireball-MathMistral-Nemo-Base-2407-v2dpo", - "id": "EpistemeAI2/Fireball-MathMistral-Nemo-Base-2407-v2dpo", - "developer": "EpistemeAI2", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 11.58 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3097 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4328 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.037 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2634 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.403 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1148 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/EpistemeAI2/Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.003-128K-code-math/a0b4a345-3530-4da2-8403-87259bbd1405.json b/data/hfopenllm_v2/EpistemeAI2/Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.003-128K-code-math/a0b4a345-3530-4da2-8403-87259bbd1405.json deleted file mode 100644 index fe44332dd..000000000 --- a/data/hfopenllm_v2/EpistemeAI2/Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.003-128K-code-math/a0b4a345-3530-4da2-8403-87259bbd1405.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/EpistemeAI2_Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.003-128K-code-math/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.003-128K-code-math", - "id": "EpistemeAI2/Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.003-128K-code-math", - "developer": "EpistemeAI2", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5515 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4808 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1352 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3045 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3693 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.342 - } - } - ] -} \ No newline at end 
of file diff --git a/data/hfopenllm_v2/EpistemeAI2/Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.005-128K-code-COT/3548f0ea-f3ab-4a0e-9c77-5ae62014ed44.json b/data/hfopenllm_v2/EpistemeAI2/Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.005-128K-code-COT/3548f0ea-f3ab-4a0e-9c77-5ae62014ed44.json deleted file mode 100644 index 0ddafa271..000000000 --- a/data/hfopenllm_v2/EpistemeAI2/Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.005-128K-code-COT/3548f0ea-f3ab-4a0e-9c77-5ae62014ed44.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/EpistemeAI2_Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.005-128K-code-COT/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.005-128K-code-COT", - "id": "EpistemeAI2/Fireball-Meta-Llama-3.1-8B-Instruct-Agent-0.005-128K-code-COT", - "developer": "EpistemeAI2", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4633 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4791 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1171 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3121 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3774 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3565 - } - } - ] -} \ No 
newline at end of file diff --git a/data/hfopenllm_v2/EpistemeAI2/Fireball-Phi-3-medium-4k-inst-Philos/707270e3-334b-4eba-84c0-2795ae53d79a.json b/data/hfopenllm_v2/EpistemeAI2/Fireball-Phi-3-medium-4k-inst-Philos/707270e3-334b-4eba-84c0-2795ae53d79a.json deleted file mode 100644 index 6b5e01456..000000000 --- a/data/hfopenllm_v2/EpistemeAI2/Fireball-Phi-3-medium-4k-inst-Philos/707270e3-334b-4eba-84c0-2795ae53d79a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/EpistemeAI2_Fireball-Phi-3-medium-4k-inst-Philos/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Fireball-Phi-3-medium-4k-inst-Philos", - "id": "EpistemeAI2/Fireball-Phi-3-medium-4k-inst-Philos", - "developer": "EpistemeAI2", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 13.96 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5313 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6178 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1707 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3322 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4139 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4599 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Eric111/CatunaMayo-DPO/c827bee3-a181-42bc-9387-ca132d59c8ba.json 
b/data/hfopenllm_v2/Eric111/CatunaMayo-DPO/c827bee3-a181-42bc-9387-ca132d59c8ba.json deleted file mode 100644 index f742caff8..000000000 --- a/data/hfopenllm_v2/Eric111/CatunaMayo-DPO/c827bee3-a181-42bc-9387-ca132d59c8ba.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Eric111_CatunaMayo-DPO/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "CatunaMayo-DPO", - "id": "Eric111/CatunaMayo-DPO", - "developer": "Eric111", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4215 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5224 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0816 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2919 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.445 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.317 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Eric111/CatunaMayo/d3e8949b-f6f8-459f-891b-f4900ff806cd.json b/data/hfopenllm_v2/Eric111/CatunaMayo/d3e8949b-f6f8-459f-891b-f4900ff806cd.json deleted file mode 100644 index 5cbeab7ab..000000000 --- a/data/hfopenllm_v2/Eric111/CatunaMayo/d3e8949b-f6f8-459f-891b-f4900ff806cd.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - 
"evaluation_id": "hfopenllm_v2/Eric111_CatunaMayo/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "CatunaMayo", - "id": "Eric111/CatunaMayo", - "developer": "Eric111", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4074 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5244 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0846 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2919 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.454 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3178 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Etherll/Chocolatine-3B-Instruct-DPO-Revised-Ties-v2/35d5f5e3-74eb-4eea-9f78-b7b8969830a2.json b/data/hfopenllm_v2/Etherll/Chocolatine-3B-Instruct-DPO-Revised-Ties-v2/35d5f5e3-74eb-4eea-9f78-b7b8969830a2.json deleted file mode 100644 index fb25872ce..000000000 --- a/data/hfopenllm_v2/Etherll/Chocolatine-3B-Instruct-DPO-Revised-Ties-v2/35d5f5e3-74eb-4eea-9f78-b7b8969830a2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Etherll_Chocolatine-3B-Instruct-DPO-Revised-Ties-v2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - 
"source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Chocolatine-3B-Instruct-DPO-Revised-Ties-v2", - "id": "Etherll/Chocolatine-3B-Instruct-DPO-Revised-Ties-v2", - "developer": "Etherll", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Phi3ForCausalLM", - "params_billions": 3.821 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.374 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5411 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1631 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3238 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4649 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3978 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Etherll/Chocolatine-3B-Instruct-DPO-Revised-Ties/4cf4479a-622a-4bc2-86f2-aa526216f24c.json b/data/hfopenllm_v2/Etherll/Chocolatine-3B-Instruct-DPO-Revised-Ties/4cf4479a-622a-4bc2-86f2-aa526216f24c.json deleted file mode 100644 index 34efe21cd..000000000 --- a/data/hfopenllm_v2/Etherll/Chocolatine-3B-Instruct-DPO-Revised-Ties/4cf4479a-622a-4bc2-86f2-aa526216f24c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Etherll_Chocolatine-3B-Instruct-DPO-Revised-Ties/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - 
"model_info": { - "name": "Chocolatine-3B-Instruct-DPO-Revised-Ties", - "id": "Etherll/Chocolatine-3B-Instruct-DPO-Revised-Ties", - "developer": "Etherll", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Phi3ForCausalLM", - "params_billions": 3.821 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3725 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5411 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1631 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3238 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4649 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3978 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Etherll/Herplete-LLM-Llama-3.1-8b-Ties/6ed27890-3e61-4c7d-8c94-a78c0b34ba32.json b/data/hfopenllm_v2/Etherll/Herplete-LLM-Llama-3.1-8b-Ties/6ed27890-3e61-4c7d-8c94-a78c0b34ba32.json deleted file mode 100644 index f9d679b28..000000000 --- a/data/hfopenllm_v2/Etherll/Herplete-LLM-Llama-3.1-8b-Ties/6ed27890-3e61-4c7d-8c94-a78c0b34ba32.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Etherll_Herplete-LLM-Llama-3.1-8b-Ties/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Herplete-LLM-Llama-3.1-8b-Ties", - "id": "Etherll/Herplete-LLM-Llama-3.1-8b-Ties", - "developer": "Etherll", - "inference_platform": "unknown", - 
"additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6164 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5338 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1601 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3171 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4017 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3752 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Etherll/Herplete-LLM-Llama-3.1-8b/87b5e360-7867-4edd-b45e-e7bb92a91b69.json b/data/hfopenllm_v2/Etherll/Herplete-LLM-Llama-3.1-8b/87b5e360-7867-4edd-b45e-e7bb92a91b69.json deleted file mode 100644 index c264e2240..000000000 --- a/data/hfopenllm_v2/Etherll/Herplete-LLM-Llama-3.1-8b/87b5e360-7867-4edd-b45e-e7bb92a91b69.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Etherll_Herplete-LLM-Llama-3.1-8b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Herplete-LLM-Llama-3.1-8b", - "id": "Etherll/Herplete-LLM-Llama-3.1-8b", - "developer": "Etherll", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": 
"IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4672 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5013 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0279 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2861 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.386 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3482 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Etherll/Herplete-LLM-Llama-3.1-8b/d93116b8-28ff-41ea-8273-56f7ae11cf18.json b/data/hfopenllm_v2/Etherll/Herplete-LLM-Llama-3.1-8b/d93116b8-28ff-41ea-8273-56f7ae11cf18.json deleted file mode 100644 index e86363c53..000000000 --- a/data/hfopenllm_v2/Etherll/Herplete-LLM-Llama-3.1-8b/d93116b8-28ff-41ea-8273-56f7ae11cf18.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Etherll_Herplete-LLM-Llama-3.1-8b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Herplete-LLM-Llama-3.1-8b", - "id": "Etherll/Herplete-LLM-Llama-3.1-8b", - "developer": "Etherll", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.6106 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5347 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1548 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3146 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3991 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3752 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Etherll/Qwen2.5-7B-della-test/ba8c2c17-64f6-4cdb-b3b9-8977ce1bdbe2.json b/data/hfopenllm_v2/Etherll/Qwen2.5-7B-della-test/ba8c2c17-64f6-4cdb-b3b9-8977ce1bdbe2.json deleted file mode 100644 index dffbcaa3e..000000000 --- a/data/hfopenllm_v2/Etherll/Qwen2.5-7B-della-test/ba8c2c17-64f6-4cdb-b3b9-8977ce1bdbe2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Etherll_Qwen2.5-7B-della-test/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-7B-della-test", - "id": "Etherll/Qwen2.5-7B-della-test", - "developer": "Etherll", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7625 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5447 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4894 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3087 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4047 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4361 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Etherll/Qwen2.5-Coder-7B-Instruct-Ties/5e5602cc-b4de-4247-aa6d-940817fc849b.json b/data/hfopenllm_v2/Etherll/Qwen2.5-Coder-7B-Instruct-Ties/5e5602cc-b4de-4247-aa6d-940817fc849b.json deleted file mode 100644 index 89117f4d7..000000000 --- a/data/hfopenllm_v2/Etherll/Qwen2.5-Coder-7B-Instruct-Ties/5e5602cc-b4de-4247-aa6d-940817fc849b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Etherll_Qwen2.5-Coder-7B-Instruct-Ties/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-Coder-7B-Instruct-Ties", - "id": "Etherll/Qwen2.5-Coder-7B-Instruct-Ties", - "developer": "Etherll", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5005 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4895 - } - 
}, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2915 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3297 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4373 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3503 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Etherll/Replete-LLM-V3-Llama-3.1-8b/cc5f27f5-36d8-49bb-9c9d-7879598bfe71.json b/data/hfopenllm_v2/Etherll/Replete-LLM-V3-Llama-3.1-8b/cc5f27f5-36d8-49bb-9c9d-7879598bfe71.json deleted file mode 100644 index 66e068f9c..000000000 --- a/data/hfopenllm_v2/Etherll/Replete-LLM-V3-Llama-3.1-8b/cc5f27f5-36d8-49bb-9c9d-7879598bfe71.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Etherll_Replete-LLM-V3-Llama-3.1-8b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Replete-LLM-V3-Llama-3.1-8b", - "id": "Etherll/Replete-LLM-V3-Llama-3.1-8b", - "developer": "Etherll", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5263 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4543 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - 
"evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2273 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2685 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3516 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.347 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Etherll/SuperHermes/aec03bd9-808a-4c3f-bbde-40bcac5775fb.json b/data/hfopenllm_v2/Etherll/SuperHermes/aec03bd9-808a-4c3f-bbde-40bcac5775fb.json deleted file mode 100644 index 688d993d6..000000000 --- a/data/hfopenllm_v2/Etherll/SuperHermes/aec03bd9-808a-4c3f-bbde-40bcac5775fb.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Etherll_SuperHermes/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SuperHermes", - "id": "Etherll/SuperHermes", - "developer": "Etherll", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5459 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.529 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1654 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": 
"hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3238 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.44 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3949 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Eurdem/Defne-llama3.1-8B/b4ae6f0b-8a6b-4c60-8eb2-3e202877bcf5.json b/data/hfopenllm_v2/Eurdem/Defne-llama3.1-8B/b4ae6f0b-8a6b-4c60-8eb2-3e202877bcf5.json deleted file mode 100644 index 39493bbf0..000000000 --- a/data/hfopenllm_v2/Eurdem/Defne-llama3.1-8B/b4ae6f0b-8a6b-4c60-8eb2-3e202877bcf5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Eurdem_Defne-llama3.1-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Defne-llama3.1-8B", - "id": "Eurdem/Defne-llama3.1-8B", - "developer": "Eurdem", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5036 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5321 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1601 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2961 - } - }, - { - 
"evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4331 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3866 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/FINGU-AI/Chocolatine-Fusion-14B/c68deb4d-73a8-40ab-b4e5-1773b7ec4ed8.json b/data/hfopenllm_v2/FINGU-AI/Chocolatine-Fusion-14B/c68deb4d-73a8-40ab-b4e5-1773b7ec4ed8.json deleted file mode 100644 index 35a5b2bfe..000000000 --- a/data/hfopenllm_v2/FINGU-AI/Chocolatine-Fusion-14B/c68deb4d-73a8-40ab-b4e5-1773b7ec4ed8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/FINGU-AI_Chocolatine-Fusion-14B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Chocolatine-Fusion-14B", - "id": "FINGU-AI/Chocolatine-Fusion-14B", - "developer": "FINGU-AI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 8.367 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6949 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6413 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3852 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3716 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.494 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5262 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/FINGU-AI/L3-8B/a93c5674-599b-429c-a322-3c6bc7248f45.json b/data/hfopenllm_v2/FINGU-AI/L3-8B/a93c5674-599b-429c-a322-3c6bc7248f45.json deleted file mode 100644 index c9e5228a0..000000000 --- a/data/hfopenllm_v2/FINGU-AI/L3-8B/a93c5674-599b-429c-a322-3c6bc7248f45.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/FINGU-AI_L3-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "L3-8B", - "id": "FINGU-AI/L3-8B", - "developer": "FINGU-AI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7517 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4986 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2545 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2953 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3828 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on 
MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3639 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/FINGU-AI/Phi-4-RRStock/5e6374a6-56bd-4bd9-b04b-30ec9cf234bc.json b/data/hfopenllm_v2/FINGU-AI/Phi-4-RRStock/5e6374a6-56bd-4bd9-b04b-30ec9cf234bc.json deleted file mode 100644 index e93edbe24..000000000 --- a/data/hfopenllm_v2/FINGU-AI/Phi-4-RRStock/5e6374a6-56bd-4bd9-b04b-30ec9cf234bc.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/FINGU-AI_Phi-4-RRStock/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Phi-4-RRStock", - "id": "FINGU-AI/Phi-4-RRStock", - "developer": "FINGU-AI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 6.652 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2855 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6443 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0582 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.38 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4479 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4883 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/FINGU-AI/Q-Small-3B/c3d2fc86-a5c4-4e92-bcf9-26096ca32ad4.json 
b/data/hfopenllm_v2/FINGU-AI/Q-Small-3B/c3d2fc86-a5c4-4e92-bcf9-26096ca32ad4.json deleted file mode 100644 index 61898827c..000000000 --- a/data/hfopenllm_v2/FINGU-AI/Q-Small-3B/c3d2fc86-a5c4-4e92-bcf9-26096ca32ad4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/FINGU-AI_Q-Small-3B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Q-Small-3B", - "id": "FINGU-AI/Q-Small-3B", - "developer": "FINGU-AI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.086 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4145 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4319 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0831 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2668 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4005 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.279 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/FINGU-AI/QwQ-Buddy-32B-Alpha/1b49cb06-3ee1-4945-aaed-12c868d9e45e.json b/data/hfopenllm_v2/FINGU-AI/QwQ-Buddy-32B-Alpha/1b49cb06-3ee1-4945-aaed-12c868d9e45e.json deleted file mode 100644 index a93048eac..000000000 --- a/data/hfopenllm_v2/FINGU-AI/QwQ-Buddy-32B-Alpha/1b49cb06-3ee1-4945-aaed-12c868d9e45e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": 
"0.2.0", - "evaluation_id": "hfopenllm_v2/FINGU-AI_QwQ-Buddy-32B-Alpha/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "QwQ-Buddy-32B-Alpha", - "id": "FINGU-AI/QwQ-Buddy-32B-Alpha", - "developer": "FINGU-AI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 19.662 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3446 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6424 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3852 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3792 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.506 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5294 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/FINGU-AI/RomboUltima-32B/65853bb5-ff3e-4880-8c32-ce9aabcadd7b.json b/data/hfopenllm_v2/FINGU-AI/RomboUltima-32B/65853bb5-ff3e-4880-8c32-ce9aabcadd7b.json deleted file mode 100644 index 05c3a1ad9..000000000 --- a/data/hfopenllm_v2/FINGU-AI/RomboUltima-32B/65853bb5-ff3e-4880-8c32-ce9aabcadd7b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/FINGU-AI_RomboUltima-32B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging 
Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "RomboUltima-32B", - "id": "FINGU-AI/RomboUltima-32B", - "developer": "FINGU-AI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 17.645 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6672 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6938 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5385 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3716 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4836 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5789 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/FINGU-AI/Ultimos-32B/7fecc176-debf-4bf7-b3f3-479d05678a1e.json b/data/hfopenllm_v2/FINGU-AI/Ultimos-32B/7fecc176-debf-4bf7-b3f3-479d05678a1e.json deleted file mode 100644 index d9153beab..000000000 --- a/data/hfopenllm_v2/FINGU-AI/Ultimos-32B/7fecc176-debf-4bf7-b3f3-479d05678a1e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/FINGU-AI_Ultimos-32B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Ultimos-32B", - "id": "FINGU-AI/Ultimos-32B", - "developer": "FINGU-AI", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - 
"params_billions": 9.604 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1592 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2906 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2492 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3286 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1111 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/FallenMerick/Chewy-Lemon-Cookie-11B/3c965626-a264-40db-93e1-cd7659d0662e.json b/data/hfopenllm_v2/FallenMerick/Chewy-Lemon-Cookie-11B/3c965626-a264-40db-93e1-cd7659d0662e.json deleted file mode 100644 index 21887b781..000000000 --- a/data/hfopenllm_v2/FallenMerick/Chewy-Lemon-Cookie-11B/3c965626-a264-40db-93e1-cd7659d0662e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/FallenMerick_Chewy-Lemon-Cookie-11B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Chewy-Lemon-Cookie-11B", - "id": "FallenMerick/Chewy-Lemon-Cookie-11B", - "developer": "FallenMerick", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 10.732 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4875 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5251 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0544 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2794 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4546 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3267 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Felladrin/Llama-160M-Chat-v1/50fa6f0c-d689-4380-b619-253209b5badc.json b/data/hfopenllm_v2/Felladrin/Llama-160M-Chat-v1/50fa6f0c-d689-4380-b619-253209b5badc.json deleted file mode 100644 index bb5ec6c1c..000000000 --- a/data/hfopenllm_v2/Felladrin/Llama-160M-Chat-v1/50fa6f0c-d689-4380-b619-253209b5badc.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Felladrin_Llama-160M-Chat-v1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-160M-Chat-v1", - "id": "Felladrin/Llama-160M-Chat-v1", - "developer": "Felladrin", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 0.162 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1575 - } - }, - { - "evaluation_name": "BBH", - 
"source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3036 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.006 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2576 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3661 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1136 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Felladrin/Minueza-32M-UltraChat/adb25c88-6113-4307-bbf0-d377f757bc18.json b/data/hfopenllm_v2/Felladrin/Minueza-32M-UltraChat/adb25c88-6113-4307-bbf0-d377f757bc18.json deleted file mode 100644 index 43e58afa9..000000000 --- a/data/hfopenllm_v2/Felladrin/Minueza-32M-UltraChat/adb25c88-6113-4307-bbf0-d377f757bc18.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Felladrin_Minueza-32M-UltraChat/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Minueza-32M-UltraChat", - "id": "Felladrin/Minueza-32M-UltraChat", - "developer": "Felladrin", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 0.033 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1376 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", 
- "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2941 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0045 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2559 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3742 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1133 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/FlofloB/100k_fineweb_continued_pretraining_Qwen2.5-0.5B-Instruct_Unsloth_merged_16bit/b9ac5e03-c878-4e46-a89c-1906f3b91dce.json b/data/hfopenllm_v2/FlofloB/100k_fineweb_continued_pretraining_Qwen2.5-0.5B-Instruct_Unsloth_merged_16bit/b9ac5e03-c878-4e46-a89c-1906f3b91dce.json deleted file mode 100644 index 1fd7e8ac3..000000000 --- a/data/hfopenllm_v2/FlofloB/100k_fineweb_continued_pretraining_Qwen2.5-0.5B-Instruct_Unsloth_merged_16bit/b9ac5e03-c878-4e46-a89c-1906f3b91dce.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/FlofloB_100k_fineweb_continued_pretraining_Qwen2.5-0.5B-Instruct_Unsloth_merged_16bit/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "100k_fineweb_continued_pretraining_Qwen2.5-0.5B-Instruct_Unsloth_merged_16bit", - "id": "FlofloB/100k_fineweb_continued_pretraining_Qwen2.5-0.5B-Instruct_Unsloth_merged_16bit", - "developer": "FlofloB", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.5 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3083 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3323 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0408 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2693 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3302 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1498 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/FlofloB/10k_continued_pretraining_Phi-3-mini-4k-instruct_Unsloth_merged_16bit/d6a6badf-4472-44b5-af9e-4282e4406a8e.json b/data/hfopenllm_v2/FlofloB/10k_continued_pretraining_Phi-3-mini-4k-instruct_Unsloth_merged_16bit/d6a6badf-4472-44b5-af9e-4282e4406a8e.json deleted file mode 100644 index 648617711..000000000 --- a/data/hfopenllm_v2/FlofloB/10k_continued_pretraining_Phi-3-mini-4k-instruct_Unsloth_merged_16bit/d6a6badf-4472-44b5-af9e-4282e4406a8e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/FlofloB_10k_continued_pretraining_Phi-3-mini-4k-instruct_Unsloth_merged_16bit/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "10k_continued_pretraining_Phi-3-mini-4k-instruct_Unsloth_merged_16bit", - "id": "FlofloB/10k_continued_pretraining_Phi-3-mini-4k-instruct_Unsloth_merged_16bit", - "developer": "FlofloB", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 16.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5097 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": 
"SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5215 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0974 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2995 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.431 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3769 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/FlofloB/10k_continued_pretraining_Qwen2.5-0.5B-Instruct_Unsloth_merged_16bit/92e62d3a-3091-4538-b6da-ba705e11687a.json b/data/hfopenllm_v2/FlofloB/10k_continued_pretraining_Qwen2.5-0.5B-Instruct_Unsloth_merged_16bit/92e62d3a-3091-4538-b6da-ba705e11687a.json deleted file mode 100644 index 7f912b4b9..000000000 --- a/data/hfopenllm_v2/FlofloB/10k_continued_pretraining_Qwen2.5-0.5B-Instruct_Unsloth_merged_16bit/92e62d3a-3091-4538-b6da-ba705e11687a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/FlofloB_10k_continued_pretraining_Qwen2.5-0.5B-Instruct_Unsloth_merged_16bit/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "10k_continued_pretraining_Qwen2.5-0.5B-Instruct_Unsloth_merged_16bit", - "id": "FlofloB/10k_continued_pretraining_Qwen2.5-0.5B-Instruct_Unsloth_merged_16bit", - "developer": "FlofloB", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.5 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2815 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - 
"source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3306 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.031 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2794 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3302 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1541 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/FlofloB/40k_continued_pretraining_Qwen2.5-0.5B-Instruct_Unsloth_merged_16bit/04f5fdc6-f1cd-4b2d-947a-86fee67b3b62.json b/data/hfopenllm_v2/FlofloB/40k_continued_pretraining_Qwen2.5-0.5B-Instruct_Unsloth_merged_16bit/04f5fdc6-f1cd-4b2d-947a-86fee67b3b62.json deleted file mode 100644 index cbbb13eef..000000000 --- a/data/hfopenllm_v2/FlofloB/40k_continued_pretraining_Qwen2.5-0.5B-Instruct_Unsloth_merged_16bit/04f5fdc6-f1cd-4b2d-947a-86fee67b3b62.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/FlofloB_40k_continued_pretraining_Qwen2.5-0.5B-Instruct_Unsloth_merged_16bit/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "40k_continued_pretraining_Qwen2.5-0.5B-Instruct_Unsloth_merged_16bit", - "id": "FlofloB/40k_continued_pretraining_Qwen2.5-0.5B-Instruct_Unsloth_merged_16bit", - "developer": "FlofloB", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.5 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3016 - } - }, - { - "evaluation_name": "BBH", - 
"source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3325 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0332 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2676 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3408 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1485 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/FlofloB/83k_continued_pretraining_Qwen2.5-0.5B-Instruct_Unsloth_merged_16bit/5013ccfc-6bc5-4862-898c-1ca781f92572.json b/data/hfopenllm_v2/FlofloB/83k_continued_pretraining_Qwen2.5-0.5B-Instruct_Unsloth_merged_16bit/5013ccfc-6bc5-4862-898c-1ca781f92572.json deleted file mode 100644 index b055b4a00..000000000 --- a/data/hfopenllm_v2/FlofloB/83k_continued_pretraining_Qwen2.5-0.5B-Instruct_Unsloth_merged_16bit/5013ccfc-6bc5-4862-898c-1ca781f92572.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/FlofloB_83k_continued_pretraining_Qwen2.5-0.5B-Instruct_Unsloth_merged_16bit/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "83k_continued_pretraining_Qwen2.5-0.5B-Instruct_Unsloth_merged_16bit", - "id": "FlofloB/83k_continued_pretraining_Qwen2.5-0.5B-Instruct_Unsloth_merged_16bit", - "developer": "FlofloB", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.5 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2869 - } - }, - 
{ - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3347 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0302 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2735 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3289 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1555 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/FlofloB/smollm2-135M_pretrained_1000k_fineweb/38fff98c-72b1-453c-a2cf-cf077dd19d10.json b/data/hfopenllm_v2/FlofloB/smollm2-135M_pretrained_1000k_fineweb/38fff98c-72b1-453c-a2cf-cf077dd19d10.json deleted file mode 100644 index fe9ff6a34..000000000 --- a/data/hfopenllm_v2/FlofloB/smollm2-135M_pretrained_1000k_fineweb/38fff98c-72b1-453c-a2cf-cf077dd19d10.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/FlofloB_smollm2-135M_pretrained_1000k_fineweb/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "smollm2-135M_pretrained_1000k_fineweb", - "id": "FlofloB/smollm2-135M_pretrained_1000k_fineweb", - "developer": "FlofloB", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 0.135 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1485 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2918 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0091 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2626 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3581 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1164 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/FlofloB/smollm2-135M_pretrained_1000k_fineweb_uncovai_human_removed/42911928-ef64-474b-828a-02ce3383773e.json b/data/hfopenllm_v2/FlofloB/smollm2-135M_pretrained_1000k_fineweb_uncovai_human_removed/42911928-ef64-474b-828a-02ce3383773e.json deleted file mode 100644 index 1a53297cc..000000000 --- a/data/hfopenllm_v2/FlofloB/smollm2-135M_pretrained_1000k_fineweb_uncovai_human_removed/42911928-ef64-474b-828a-02ce3383773e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/FlofloB_smollm2-135M_pretrained_1000k_fineweb_uncovai_human_removed/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "smollm2-135M_pretrained_1000k_fineweb_uncovai_human_removed", - "id": "FlofloB/smollm2-135M_pretrained_1000k_fineweb_uncovai_human_removed", - "developer": "FlofloB", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 0.135 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1554 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3066 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.006 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2508 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.358 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1143 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/FlofloB/smollm2-135M_pretrained_1000k_fineweb_uncovai_selected/7989d7d3-c5e9-43c6-80a1-6de51533f9bf.json b/data/hfopenllm_v2/FlofloB/smollm2-135M_pretrained_1000k_fineweb_uncovai_selected/7989d7d3-c5e9-43c6-80a1-6de51533f9bf.json deleted file mode 100644 index 36ed45fb4..000000000 --- a/data/hfopenllm_v2/FlofloB/smollm2-135M_pretrained_1000k_fineweb_uncovai_selected/7989d7d3-c5e9-43c6-80a1-6de51533f9bf.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/FlofloB_smollm2-135M_pretrained_1000k_fineweb_uncovai_selected/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "smollm2-135M_pretrained_1000k_fineweb_uncovai_selected", - "id": "FlofloB/smollm2-135M_pretrained_1000k_fineweb_uncovai_selected", - "developer": "FlofloB", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 0.135 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1468 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2932 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0068 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2659 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4048 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1157 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/FlofloB/smollm2-135M_pretrained_1200k_fineweb/5b9acd52-7eb6-4099-98be-ecd6cae07835.json b/data/hfopenllm_v2/FlofloB/smollm2-135M_pretrained_1200k_fineweb/5b9acd52-7eb6-4099-98be-ecd6cae07835.json deleted file mode 100644 index 0446e5349..000000000 --- a/data/hfopenllm_v2/FlofloB/smollm2-135M_pretrained_1200k_fineweb/5b9acd52-7eb6-4099-98be-ecd6cae07835.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/FlofloB_smollm2-135M_pretrained_1200k_fineweb/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "smollm2-135M_pretrained_1200k_fineweb", - "id": "FlofloB/smollm2-135M_pretrained_1200k_fineweb", - "developer": "FlofloB", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 0.135 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1581 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2941 - } - }, - { 
- "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0068 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2643 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3714 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1076 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/FlofloB/smollm2-135M_pretrained_1200k_fineweb_uncovai_human_removed/666bef5a-2d62-4743-bff1-07365716ab19.json b/data/hfopenllm_v2/FlofloB/smollm2-135M_pretrained_1200k_fineweb_uncovai_human_removed/666bef5a-2d62-4743-bff1-07365716ab19.json deleted file mode 100644 index 197afa5d3..000000000 --- a/data/hfopenllm_v2/FlofloB/smollm2-135M_pretrained_1200k_fineweb_uncovai_human_removed/666bef5a-2d62-4743-bff1-07365716ab19.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/FlofloB_smollm2-135M_pretrained_1200k_fineweb_uncovai_human_removed/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "smollm2-135M_pretrained_1200k_fineweb_uncovai_human_removed", - "id": "FlofloB/smollm2-135M_pretrained_1200k_fineweb_uncovai_human_removed", - "developer": "FlofloB", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 0.135 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1578 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.295 - } - }, - { - "evaluation_name": 
"MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0008 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2651 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.37 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1139 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/FlofloB/smollm2-135M_pretrained_1200k_fineweb_uncovai_selected/85de411c-2308-4824-bd6e-3327eeb6fe3e.json b/data/hfopenllm_v2/FlofloB/smollm2-135M_pretrained_1200k_fineweb_uncovai_selected/85de411c-2308-4824-bd6e-3327eeb6fe3e.json deleted file mode 100644 index bc92a7eb5..000000000 --- a/data/hfopenllm_v2/FlofloB/smollm2-135M_pretrained_1200k_fineweb_uncovai_selected/85de411c-2308-4824-bd6e-3327eeb6fe3e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/FlofloB_smollm2-135M_pretrained_1200k_fineweb_uncovai_selected/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "smollm2-135M_pretrained_1200k_fineweb_uncovai_selected", - "id": "FlofloB/smollm2-135M_pretrained_1200k_fineweb_uncovai_selected", - "developer": "FlofloB", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 0.135 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1585 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.296 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": 
"MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0076 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2634 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3567 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1164 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/FlofloB/smollm2-135M_pretrained_1400k_fineweb/df28c4c2-d6a4-4ab0-a1ac-faf00a93de99.json b/data/hfopenllm_v2/FlofloB/smollm2-135M_pretrained_1400k_fineweb/df28c4c2-d6a4-4ab0-a1ac-faf00a93de99.json deleted file mode 100644 index 86c5be30e..000000000 --- a/data/hfopenllm_v2/FlofloB/smollm2-135M_pretrained_1400k_fineweb/df28c4c2-d6a4-4ab0-a1ac-faf00a93de99.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/FlofloB_smollm2-135M_pretrained_1400k_fineweb/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "smollm2-135M_pretrained_1400k_fineweb", - "id": "FlofloB/smollm2-135M_pretrained_1400k_fineweb", - "developer": "FlofloB", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 0.135 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1764 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2922 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": 
"Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0113 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2659 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3873 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.108 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/FlofloB/smollm2-135M_pretrained_1400k_fineweb_uncovai_human_removed/6fb37ad0-b41b-4ad7-91a2-79bbb835d445.json b/data/hfopenllm_v2/FlofloB/smollm2-135M_pretrained_1400k_fineweb_uncovai_human_removed/6fb37ad0-b41b-4ad7-91a2-79bbb835d445.json deleted file mode 100644 index 4fd364781..000000000 --- a/data/hfopenllm_v2/FlofloB/smollm2-135M_pretrained_1400k_fineweb_uncovai_human_removed/6fb37ad0-b41b-4ad7-91a2-79bbb835d445.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/FlofloB_smollm2-135M_pretrained_1400k_fineweb_uncovai_human_removed/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "smollm2-135M_pretrained_1400k_fineweb_uncovai_human_removed", - "id": "FlofloB/smollm2-135M_pretrained_1400k_fineweb_uncovai_human_removed", - "developer": "FlofloB", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 0.135 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1707 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2992 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH 
Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0106 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2609 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3939 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1105 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/FlofloB/smollm2-135M_pretrained_1400k_fineweb_uncovai_selected/c41df02e-5aff-4de6-a1c4-d45b5585e29d.json b/data/hfopenllm_v2/FlofloB/smollm2-135M_pretrained_1400k_fineweb_uncovai_selected/c41df02e-5aff-4de6-a1c4-d45b5585e29d.json deleted file mode 100644 index c93f56f5d..000000000 --- a/data/hfopenllm_v2/FlofloB/smollm2-135M_pretrained_1400k_fineweb_uncovai_selected/c41df02e-5aff-4de6-a1c4-d45b5585e29d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/FlofloB_smollm2-135M_pretrained_1400k_fineweb_uncovai_selected/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "smollm2-135M_pretrained_1400k_fineweb_uncovai_selected", - "id": "FlofloB/smollm2-135M_pretrained_1400k_fineweb_uncovai_selected", - "developer": "FlofloB", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 0.135 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1538 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2917 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0106 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2685 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3741 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1137 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/FlofloB/smollm2-135M_pretrained_200k_fineweb_uncovai_human_removed/aa587b4a-9c19-4231-ba72-9b66446460f9.json b/data/hfopenllm_v2/FlofloB/smollm2-135M_pretrained_200k_fineweb_uncovai_human_removed/aa587b4a-9c19-4231-ba72-9b66446460f9.json deleted file mode 100644 index c51687eca..000000000 --- a/data/hfopenllm_v2/FlofloB/smollm2-135M_pretrained_200k_fineweb_uncovai_human_removed/aa587b4a-9c19-4231-ba72-9b66446460f9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/FlofloB_smollm2-135M_pretrained_200k_fineweb_uncovai_human_removed/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "smollm2-135M_pretrained_200k_fineweb_uncovai_human_removed", - "id": "FlofloB/smollm2-135M_pretrained_200k_fineweb_uncovai_human_removed", - "developer": "FlofloB", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 0.135 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1475 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3029 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0038 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2584 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3578 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.112 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/FlofloB/smollm2-135M_pretrained_200k_fineweb_uncovai_selected/be14e75e-4fb1-41aa-b168-1ec23eb305e0.json b/data/hfopenllm_v2/FlofloB/smollm2-135M_pretrained_200k_fineweb_uncovai_selected/be14e75e-4fb1-41aa-b168-1ec23eb305e0.json deleted file mode 100644 index 6785da5dc..000000000 --- a/data/hfopenllm_v2/FlofloB/smollm2-135M_pretrained_200k_fineweb_uncovai_selected/be14e75e-4fb1-41aa-b168-1ec23eb305e0.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/FlofloB_smollm2-135M_pretrained_200k_fineweb_uncovai_selected/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "smollm2-135M_pretrained_200k_fineweb_uncovai_selected", - "id": "FlofloB/smollm2-135M_pretrained_200k_fineweb_uncovai_selected", - "developer": "FlofloB", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 0.135 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1345 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2927 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, 
- "score_details": { - "score": 0.0076 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2508 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.366 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1131 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/FlofloB/smollm2-135M_pretrained_400k_fineweb/73be4a2b-28c9-4208-8107-3734fea25008.json b/data/hfopenllm_v2/FlofloB/smollm2-135M_pretrained_400k_fineweb/73be4a2b-28c9-4208-8107-3734fea25008.json deleted file mode 100644 index ec2adafe9..000000000 --- a/data/hfopenllm_v2/FlofloB/smollm2-135M_pretrained_400k_fineweb/73be4a2b-28c9-4208-8107-3734fea25008.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/FlofloB_smollm2-135M_pretrained_400k_fineweb/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "smollm2-135M_pretrained_400k_fineweb", - "id": "FlofloB/smollm2-135M_pretrained_400k_fineweb", - "developer": "FlofloB", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 0.135 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1511 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2972 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0121 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - 
"hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2525 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3794 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1163 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/FlofloB/smollm2-135M_pretrained_400k_fineweb_uncovai_human_removed/0bf2fa4e-3bcb-46ff-a068-f4c796123c6d.json b/data/hfopenllm_v2/FlofloB/smollm2-135M_pretrained_400k_fineweb_uncovai_human_removed/0bf2fa4e-3bcb-46ff-a068-f4c796123c6d.json deleted file mode 100644 index f405d336a..000000000 --- a/data/hfopenllm_v2/FlofloB/smollm2-135M_pretrained_400k_fineweb_uncovai_human_removed/0bf2fa4e-3bcb-46ff-a068-f4c796123c6d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/FlofloB_smollm2-135M_pretrained_400k_fineweb_uncovai_human_removed/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "smollm2-135M_pretrained_400k_fineweb_uncovai_human_removed", - "id": "FlofloB/smollm2-135M_pretrained_400k_fineweb_uncovai_human_removed", - "developer": "FlofloB", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 0.135 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1556 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3049 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0091 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": 
"Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.255 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.386 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1138 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/FlofloB/smollm2-135M_pretrained_400k_fineweb_uncovai_selected/9f8fc05a-8658-4ed3-994a-965e6882d242.json b/data/hfopenllm_v2/FlofloB/smollm2-135M_pretrained_400k_fineweb_uncovai_selected/9f8fc05a-8658-4ed3-994a-965e6882d242.json deleted file mode 100644 index 2de316a22..000000000 --- a/data/hfopenllm_v2/FlofloB/smollm2-135M_pretrained_400k_fineweb_uncovai_selected/9f8fc05a-8658-4ed3-994a-965e6882d242.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/FlofloB_smollm2-135M_pretrained_400k_fineweb_uncovai_selected/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "smollm2-135M_pretrained_400k_fineweb_uncovai_selected", - "id": "FlofloB/smollm2-135M_pretrained_400k_fineweb_uncovai_selected", - "developer": "FlofloB", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 0.135 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1584 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2925 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0068 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2542 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.382 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1158 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/FlofloB/smollm2-135M_pretrained_600k_fineweb/ced11f6e-490d-42e9-8f3e-00e22cfc2910.json b/data/hfopenllm_v2/FlofloB/smollm2-135M_pretrained_600k_fineweb/ced11f6e-490d-42e9-8f3e-00e22cfc2910.json deleted file mode 100644 index 13f664341..000000000 --- a/data/hfopenllm_v2/FlofloB/smollm2-135M_pretrained_600k_fineweb/ced11f6e-490d-42e9-8f3e-00e22cfc2910.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/FlofloB_smollm2-135M_pretrained_600k_fineweb/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "smollm2-135M_pretrained_600k_fineweb", - "id": "FlofloB/smollm2-135M_pretrained_600k_fineweb", - "developer": "FlofloB", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 0.135 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1639 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3014 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.006 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.2659 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3809 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1126 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/FlofloB/smollm2-135M_pretrained_600k_fineweb_uncovai_human_removed/70ba788b-fe8c-4667-a859-0fb122de22b9.json b/data/hfopenllm_v2/FlofloB/smollm2-135M_pretrained_600k_fineweb_uncovai_human_removed/70ba788b-fe8c-4667-a859-0fb122de22b9.json deleted file mode 100644 index 3978e6a37..000000000 --- a/data/hfopenllm_v2/FlofloB/smollm2-135M_pretrained_600k_fineweb_uncovai_human_removed/70ba788b-fe8c-4667-a859-0fb122de22b9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/FlofloB_smollm2-135M_pretrained_600k_fineweb_uncovai_human_removed/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "smollm2-135M_pretrained_600k_fineweb_uncovai_human_removed", - "id": "FlofloB/smollm2-135M_pretrained_600k_fineweb_uncovai_human_removed", - "developer": "FlofloB", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 0.135 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1641 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0091 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 
0.2626 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3793 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1147 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/FlofloB/smollm2-135M_pretrained_600k_fineweb_uncovai_selected/e93f2d5f-7ffc-44b8-b2dc-d07b73de44ab.json b/data/hfopenllm_v2/FlofloB/smollm2-135M_pretrained_600k_fineweb_uncovai_selected/e93f2d5f-7ffc-44b8-b2dc-d07b73de44ab.json deleted file mode 100644 index 4fb6c3205..000000000 --- a/data/hfopenllm_v2/FlofloB/smollm2-135M_pretrained_600k_fineweb_uncovai_selected/e93f2d5f-7ffc-44b8-b2dc-d07b73de44ab.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/FlofloB_smollm2-135M_pretrained_600k_fineweb_uncovai_selected/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "smollm2-135M_pretrained_600k_fineweb_uncovai_selected", - "id": "FlofloB/smollm2-135M_pretrained_600k_fineweb_uncovai_selected", - "developer": "FlofloB", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 0.135 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1606 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2983 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0076 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2609 - } - }, - { - "evaluation_name": "MUSR", - 
"source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3846 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1162 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/FlofloB/smollm2-135M_pretrained_800k_fineweb/15cacfe0-bdfb-4b87-a813-bfa70ff71984.json b/data/hfopenllm_v2/FlofloB/smollm2-135M_pretrained_800k_fineweb/15cacfe0-bdfb-4b87-a813-bfa70ff71984.json deleted file mode 100644 index 3d0b3cb65..000000000 --- a/data/hfopenllm_v2/FlofloB/smollm2-135M_pretrained_800k_fineweb/15cacfe0-bdfb-4b87-a813-bfa70ff71984.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/FlofloB_smollm2-135M_pretrained_800k_fineweb/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "smollm2-135M_pretrained_800k_fineweb", - "id": "FlofloB/smollm2-135M_pretrained_800k_fineweb", - "developer": "FlofloB", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 0.135 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1641 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2959 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0083 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2492 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": 
"Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3701 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1152 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/FlofloB/smollm2-135M_pretrained_800k_fineweb_uncovai_human_removed/cff00e2a-41e3-40d2-aab3-4bb3bd7d0d0e.json b/data/hfopenllm_v2/FlofloB/smollm2-135M_pretrained_800k_fineweb_uncovai_human_removed/cff00e2a-41e3-40d2-aab3-4bb3bd7d0d0e.json deleted file mode 100644 index ee1ff5fea..000000000 --- a/data/hfopenllm_v2/FlofloB/smollm2-135M_pretrained_800k_fineweb_uncovai_human_removed/cff00e2a-41e3-40d2-aab3-4bb3bd7d0d0e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/FlofloB_smollm2-135M_pretrained_800k_fineweb_uncovai_human_removed/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "smollm2-135M_pretrained_800k_fineweb_uncovai_human_removed", - "id": "FlofloB/smollm2-135M_pretrained_800k_fineweb_uncovai_human_removed", - "developer": "FlofloB", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 0.135 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1623 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3038 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0068 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2525 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3993 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1138 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/FlofloB/smollm2-135M_pretrained_800k_fineweb_uncovai_selected/e1eab0cf-2c6d-44b2-8aaf-a75347741529.json b/data/hfopenllm_v2/FlofloB/smollm2-135M_pretrained_800k_fineweb_uncovai_selected/e1eab0cf-2c6d-44b2-8aaf-a75347741529.json deleted file mode 100644 index c0e2e5309..000000000 --- a/data/hfopenllm_v2/FlofloB/smollm2-135M_pretrained_800k_fineweb_uncovai_selected/e1eab0cf-2c6d-44b2-8aaf-a75347741529.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/FlofloB_smollm2-135M_pretrained_800k_fineweb_uncovai_selected/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "smollm2-135M_pretrained_800k_fineweb_uncovai_selected", - "id": "FlofloB/smollm2-135M_pretrained_800k_fineweb_uncovai_selected", - "developer": "FlofloB", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 0.135 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1474 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2943 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0045 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2617 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", 
- "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3766 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.113 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/FlofloB/smollm2_pretrained_200k_fineweb/ed221db8-cf81-4257-8785-db9381eec5b7.json b/data/hfopenllm_v2/FlofloB/smollm2_pretrained_200k_fineweb/ed221db8-cf81-4257-8785-db9381eec5b7.json deleted file mode 100644 index efa15e0a4..000000000 --- a/data/hfopenllm_v2/FlofloB/smollm2_pretrained_200k_fineweb/ed221db8-cf81-4257-8785-db9381eec5b7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/FlofloB_smollm2_pretrained_200k_fineweb/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "smollm2_pretrained_200k_fineweb", - "id": "FlofloB/smollm2_pretrained_200k_fineweb", - "developer": "FlofloB", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 0.135 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1527 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2995 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0038 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2475 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3699 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - 
"source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1159 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/FlofloB/test_continued_pretraining_Phi-3-mini-4k-instruct_Unsloth_merged_16bit/b314468b-401a-4318-b022-c966bf3366aa.json b/data/hfopenllm_v2/FlofloB/test_continued_pretraining_Phi-3-mini-4k-instruct_Unsloth_merged_16bit/b314468b-401a-4318-b022-c966bf3366aa.json deleted file mode 100644 index 0db2c1292..000000000 --- a/data/hfopenllm_v2/FlofloB/test_continued_pretraining_Phi-3-mini-4k-instruct_Unsloth_merged_16bit/b314468b-401a-4318-b022-c966bf3366aa.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/FlofloB_test_continued_pretraining_Phi-3-mini-4k-instruct_Unsloth_merged_16bit/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "test_continued_pretraining_Phi-3-mini-4k-instruct_Unsloth_merged_16bit", - "id": "FlofloB/test_continued_pretraining_Phi-3-mini-4k-instruct_Unsloth_merged_16bit", - "developer": "FlofloB", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 16.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5215 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5241 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1103 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3112 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4244 - } - }, - { - "evaluation_name": "MMLU-PRO", - 
"source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3721 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/FuJhen/ft-openhermes-25-mistral-7b-irca-dpo-pairs/a0dbb2eb-66c7-48a3-a85c-725b49141edf.json b/data/hfopenllm_v2/FuJhen/ft-openhermes-25-mistral-7b-irca-dpo-pairs/a0dbb2eb-66c7-48a3-a85c-725b49141edf.json deleted file mode 100644 index 09f22c033..000000000 --- a/data/hfopenllm_v2/FuJhen/ft-openhermes-25-mistral-7b-irca-dpo-pairs/a0dbb2eb-66c7-48a3-a85c-725b49141edf.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/FuJhen_ft-openhermes-25-mistral-7b-irca-dpo-pairs/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ft-openhermes-25-mistral-7b-irca-dpo-pairs", - "id": "FuJhen/ft-openhermes-25-mistral-7b-irca-dpo-pairs", - "developer": "FuJhen", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "?", - "params_billions": 14.483 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.542 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4773 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0483 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2785 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4174 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2956 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/FuJhen/mistral-instruct-7B-DPO/812a36ec-4928-40a9-9aa8-ee39d7bb02f5.json b/data/hfopenllm_v2/FuJhen/mistral-instruct-7B-DPO/812a36ec-4928-40a9-9aa8-ee39d7bb02f5.json deleted file mode 100644 index a76f20076..000000000 --- a/data/hfopenllm_v2/FuJhen/mistral-instruct-7B-DPO/812a36ec-4928-40a9-9aa8-ee39d7bb02f5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/FuJhen_mistral-instruct-7B-DPO/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "mistral-instruct-7B-DPO", - "id": "FuJhen/mistral-instruct-7B-DPO", - "developer": "FuJhen", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "?", - "params_billions": 14.496 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4968 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4624 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0385 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2777 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4016 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3034 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/FuJhen/mistral_7b_v0.1_structedData_e2e/77af2424-0a23-49f3-97b0-316d04a33547.json b/data/hfopenllm_v2/FuJhen/mistral_7b_v0.1_structedData_e2e/77af2424-0a23-49f3-97b0-316d04a33547.json deleted file mode 100644 index fd9de9edd..000000000 --- a/data/hfopenllm_v2/FuJhen/mistral_7b_v0.1_structedData_e2e/77af2424-0a23-49f3-97b0-316d04a33547.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/FuJhen_mistral_7b_v0.1_structedData_e2e/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "mistral_7b_v0.1_structedData_e2e", - "id": "FuJhen/mistral_7b_v0.1_structedData_e2e", - "developer": "FuJhen", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "?", - "params_billions": 7.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1727 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4114 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0045 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2794 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3723 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2811 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/FuJhen/mistral_7b_v0.1_structedData_viggo/6f422676-2d7e-40ed-a5e3-4afc25564cfc.json 
b/data/hfopenllm_v2/FuJhen/mistral_7b_v0.1_structedData_viggo/6f422676-2d7e-40ed-a5e3-4afc25564cfc.json deleted file mode 100644 index 2e924bc72..000000000 --- a/data/hfopenllm_v2/FuJhen/mistral_7b_v0.1_structedData_viggo/6f422676-2d7e-40ed-a5e3-4afc25564cfc.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/FuJhen_mistral_7b_v0.1_structedData_viggo/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "mistral_7b_v0.1_structedData_viggo", - "id": "FuJhen/mistral_7b_v0.1_structedData_viggo", - "developer": "FuJhen", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "?", - "params_billions": 14.483 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1783 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4524 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0287 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2836 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3738 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2942 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/FuseAI/FuseChat-7B-v2.0/43923dd6-838a-4259-a938-7766dfd9c07e.json b/data/hfopenllm_v2/FuseAI/FuseChat-7B-v2.0/43923dd6-838a-4259-a938-7766dfd9c07e.json deleted file mode 100644 index c01a38f0d..000000000 --- 
a/data/hfopenllm_v2/FuseAI/FuseChat-7B-v2.0/43923dd6-838a-4259-a938-7766dfd9c07e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/FuseAI_FuseChat-7B-v2.0/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "FuseChat-7B-v2.0", - "id": "FuseAI/FuseChat-7B-v2.0", - "developer": "FuseAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3423 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4954 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0612 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.302 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4797 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3162 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/FuseAI/FuseChat-Llama-3.1-8B-Instruct/dba94a49-02b0-4e92-bd6c-c6bfc9be3cfb.json b/data/hfopenllm_v2/FuseAI/FuseChat-Llama-3.1-8B-Instruct/dba94a49-02b0-4e92-bd6c-c6bfc9be3cfb.json deleted file mode 100644 index b8ec7de60..000000000 --- a/data/hfopenllm_v2/FuseAI/FuseChat-Llama-3.1-8B-Instruct/dba94a49-02b0-4e92-bd6c-c6bfc9be3cfb.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/FuseAI_FuseChat-Llama-3.1-8B-Instruct/1770682486.623709", 
- "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "FuseChat-Llama-3.1-8B-Instruct", - "id": "FuseAI/FuseChat-Llama-3.1-8B-Instruct", - "developer": "FuseAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7205 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.512 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2477 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3054 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.382 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3733 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/FuseAI/FuseChat-Llama-3.2-3B-Instruct/16a782dc-0795-4281-aad6-4f664a0940ab.json b/data/hfopenllm_v2/FuseAI/FuseChat-Llama-3.2-3B-Instruct/16a782dc-0795-4281-aad6-4f664a0940ab.json deleted file mode 100644 index 879cf24a5..000000000 --- a/data/hfopenllm_v2/FuseAI/FuseChat-Llama-3.2-3B-Instruct/16a782dc-0795-4281-aad6-4f664a0940ab.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/FuseAI_FuseChat-Llama-3.2-3B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - 
"evaluator_relationship": "third_party" - }, - "model_info": { - "name": "FuseChat-Llama-3.2-3B-Instruct", - "id": "FuseAI/FuseChat-Llama-3.2-3B-Instruct", - "developer": "FuseAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6849 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4658 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2424 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2961 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3914 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3132 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/FuseAI/FuseChat-Qwen-2.5-7B-Instruct/5d24d4ad-9f37-4634-ba23-74fbc74fd298.json b/data/hfopenllm_v2/FuseAI/FuseChat-Qwen-2.5-7B-Instruct/5d24d4ad-9f37-4634-ba23-74fbc74fd298.json deleted file mode 100644 index 0a5c32d0b..000000000 --- a/data/hfopenllm_v2/FuseAI/FuseChat-Qwen-2.5-7B-Instruct/5d24d4ad-9f37-4634-ba23-74fbc74fd298.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/FuseAI_FuseChat-Qwen-2.5-7B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "FuseChat-Qwen-2.5-7B-Instruct", - "id": "FuseAI/FuseChat-Qwen-2.5-7B-Instruct", - "developer": "FuseAI", - "inference_platform": 
"unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5906 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5526 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4562 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2961 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3874 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4118 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/GalrionSoftworks/MN-LooseCannon-12B-v1/043cd315-fcb7-4871-ae79-dee3fdefaef0.json b/data/hfopenllm_v2/GalrionSoftworks/MN-LooseCannon-12B-v1/043cd315-fcb7-4871-ae79-dee3fdefaef0.json deleted file mode 100644 index a45337ab5..000000000 --- a/data/hfopenllm_v2/GalrionSoftworks/MN-LooseCannon-12B-v1/043cd315-fcb7-4871-ae79-dee3fdefaef0.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/GalrionSoftworks_MN-LooseCannon-12B-v1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MN-LooseCannon-12B-v1", - "id": "GalrionSoftworks/MN-LooseCannon-12B-v1", - "developer": "GalrionSoftworks", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": 
"IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5418 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5128 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0853 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2852 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4138 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3196 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/GalrionSoftworks/MagnusIntellectus-12B-v1/3c377d7e-14bc-4c82-9ada-7560552abbe4.json b/data/hfopenllm_v2/GalrionSoftworks/MagnusIntellectus-12B-v1/3c377d7e-14bc-4c82-9ada-7560552abbe4.json deleted file mode 100644 index 19eeb69d1..000000000 --- a/data/hfopenllm_v2/GalrionSoftworks/MagnusIntellectus-12B-v1/3c377d7e-14bc-4c82-9ada-7560552abbe4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/GalrionSoftworks_MagnusIntellectus-12B-v1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MagnusIntellectus-12B-v1", - "id": "GalrionSoftworks/MagnusIntellectus-12B-v1", - "developer": "GalrionSoftworks", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": 
"Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4421 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5323 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.065 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2844 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4428 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3421 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/GenVRadmin/AryaBhatta-GemmaOrca-2-Merged/43bb650b-8bb7-41b4-866a-cb2dad1499d6.json b/data/hfopenllm_v2/GenVRadmin/AryaBhatta-GemmaOrca-2-Merged/43bb650b-8bb7-41b4-866a-cb2dad1499d6.json deleted file mode 100644 index ce87f2d44..000000000 --- a/data/hfopenllm_v2/GenVRadmin/AryaBhatta-GemmaOrca-2-Merged/43bb650b-8bb7-41b4-866a-cb2dad1499d6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/GenVRadmin_AryaBhatta-GemmaOrca-2-Merged/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "AryaBhatta-GemmaOrca-2-Merged", - "id": "GenVRadmin/AryaBhatta-GemmaOrca-2-Merged", - "developer": "GenVRadmin", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "GemmaForCausalLM", - "params_billions": 8.538 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3064 - } - }, - { - 
"evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3887 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0498 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2685 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.455 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2384 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/GenVRadmin/AryaBhatta-GemmaOrca-Merged/bdf8f907-37ca-41ca-9a4e-f4dd446f895f.json b/data/hfopenllm_v2/GenVRadmin/AryaBhatta-GemmaOrca-Merged/bdf8f907-37ca-41ca-9a4e-f4dd446f895f.json deleted file mode 100644 index 54f086875..000000000 --- a/data/hfopenllm_v2/GenVRadmin/AryaBhatta-GemmaOrca-Merged/bdf8f907-37ca-41ca-9a4e-f4dd446f895f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/GenVRadmin_AryaBhatta-GemmaOrca-Merged/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "AryaBhatta-GemmaOrca-Merged", - "id": "GenVRadmin/AryaBhatta-GemmaOrca-Merged", - "developer": "GenVRadmin", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "GemmaForCausalLM", - "params_billions": 8.538 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3064 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": 
"Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4131 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0514 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2559 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3524 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2228 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/GenVRadmin/AryaBhatta-GemmaUltra-Merged/14a1872c-7afd-4cd4-ad87-853e4fc0847e.json b/data/hfopenllm_v2/GenVRadmin/AryaBhatta-GemmaUltra-Merged/14a1872c-7afd-4cd4-ad87-853e4fc0847e.json deleted file mode 100644 index 88dd8d560..000000000 --- a/data/hfopenllm_v2/GenVRadmin/AryaBhatta-GemmaUltra-Merged/14a1872c-7afd-4cd4-ad87-853e4fc0847e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/GenVRadmin_AryaBhatta-GemmaUltra-Merged/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "AryaBhatta-GemmaUltra-Merged", - "id": "GenVRadmin/AryaBhatta-GemmaUltra-Merged", - "developer": "GenVRadmin", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "GemmaForCausalLM", - "params_billions": 8.538 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3021 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4141 - } - }, - { - 
"evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0536 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2534 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4279 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2266 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/GenVRadmin/llama38bGenZ_Vikas-Merged/887e4ca9-ed48-4b33-b933-f8534a8d0377.json b/data/hfopenllm_v2/GenVRadmin/llama38bGenZ_Vikas-Merged/887e4ca9-ed48-4b33-b933-f8534a8d0377.json deleted file mode 100644 index 0c72024f8..000000000 --- a/data/hfopenllm_v2/GenVRadmin/llama38bGenZ_Vikas-Merged/887e4ca9-ed48-4b33-b933-f8534a8d0377.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/GenVRadmin_llama38bGenZ_Vikas-Merged/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "llama38bGenZ_Vikas-Merged", - "id": "GenVRadmin/llama38bGenZ_Vikas-Merged", - "developer": "GenVRadmin", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4536 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - 
"evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0574 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2953 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4402 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2622 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/GoToCompany/gemma2-9b-cpt-sahabatai-v1-instruct/c585488d-4043-482f-b1fa-4a61e96f7f0f.json b/data/hfopenllm_v2/GoToCompany/gemma2-9b-cpt-sahabatai-v1-instruct/c585488d-4043-482f-b1fa-4a61e96f7f0f.json deleted file mode 100644 index 99a5c1561..000000000 --- a/data/hfopenllm_v2/GoToCompany/gemma2-9b-cpt-sahabatai-v1-instruct/c585488d-4043-482f-b1fa-4a61e96f7f0f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/GoToCompany_gemma2-9b-cpt-sahabatai-v1-instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "gemma2-9b-cpt-sahabatai-v1-instruct", - "id": "GoToCompany/gemma2-9b-cpt-sahabatai-v1-instruct", - "developer": "GoToCompany", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 9.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6551 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5955 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.2054 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3347 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4779 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4264 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/GoToCompany/llama3-8b-cpt-sahabatai-v1-instruct/d64541f6-19ef-4f04-a991-93efec6fe24f.json b/data/hfopenllm_v2/GoToCompany/llama3-8b-cpt-sahabatai-v1-instruct/d64541f6-19ef-4f04-a991-93efec6fe24f.json deleted file mode 100644 index e5f54c812..000000000 --- a/data/hfopenllm_v2/GoToCompany/llama3-8b-cpt-sahabatai-v1-instruct/d64541f6-19ef-4f04-a991-93efec6fe24f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/GoToCompany_llama3-8b-cpt-sahabatai-v1-instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "llama3-8b-cpt-sahabatai-v1-instruct", - "id": "GoToCompany/llama3-8b-cpt-sahabatai-v1-instruct", - "developer": "GoToCompany", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5238 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4951 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1276 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": 
"GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2668 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4488 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3453 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Goekdeniz-Guelmez/Josiefied-Qwen2.5-0.5B-Instruct-abliterated-v1/1c13e194-8bee-4456-a249-f71e7e34b0eb.json b/data/hfopenllm_v2/Goekdeniz-Guelmez/Josiefied-Qwen2.5-0.5B-Instruct-abliterated-v1/1c13e194-8bee-4456-a249-f71e7e34b0eb.json deleted file mode 100644 index db3383a91..000000000 --- a/data/hfopenllm_v2/Goekdeniz-Guelmez/Josiefied-Qwen2.5-0.5B-Instruct-abliterated-v1/1c13e194-8bee-4456-a249-f71e7e34b0eb.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Goekdeniz-Guelmez_Josiefied-Qwen2.5-0.5B-Instruct-abliterated-v1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Josiefied-Qwen2.5-0.5B-Instruct-abliterated-v1", - "id": "Goekdeniz-Guelmez/Josiefied-Qwen2.5-0.5B-Instruct-abliterated-v1", - "developer": "Goekdeniz-Guelmez", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3417 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3292 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0023 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", 
- "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2576 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3249 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1638 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Goekdeniz-Guelmez/Josiefied-Qwen2.5-0.5B-Instruct-abliterated-v1/1d3db737-20e7-4da1-a311-e60de0b41c93.json b/data/hfopenllm_v2/Goekdeniz-Guelmez/Josiefied-Qwen2.5-0.5B-Instruct-abliterated-v1/1d3db737-20e7-4da1-a311-e60de0b41c93.json deleted file mode 100644 index a0e2c50db..000000000 --- a/data/hfopenllm_v2/Goekdeniz-Guelmez/Josiefied-Qwen2.5-0.5B-Instruct-abliterated-v1/1d3db737-20e7-4da1-a311-e60de0b41c93.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Goekdeniz-Guelmez_Josiefied-Qwen2.5-0.5B-Instruct-abliterated-v1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Josiefied-Qwen2.5-0.5B-Instruct-abliterated-v1", - "id": "Goekdeniz-Guelmez/Josiefied-Qwen2.5-0.5B-Instruct-abliterated-v1", - "developer": "Goekdeniz-Guelmez", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3472 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3268 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0891 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2517 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3262 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1641 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Goekdeniz-Guelmez/Josiefied-Qwen2.5-1.5B-Instruct-abliterated-v1/7b73d50e-358b-4961-8b58-63765ce5a82a.json b/data/hfopenllm_v2/Goekdeniz-Guelmez/Josiefied-Qwen2.5-1.5B-Instruct-abliterated-v1/7b73d50e-358b-4961-8b58-63765ce5a82a.json deleted file mode 100644 index 6da2363a1..000000000 --- a/data/hfopenllm_v2/Goekdeniz-Guelmez/Josiefied-Qwen2.5-1.5B-Instruct-abliterated-v1/7b73d50e-358b-4961-8b58-63765ce5a82a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Goekdeniz-Guelmez_Josiefied-Qwen2.5-1.5B-Instruct-abliterated-v1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Josiefied-Qwen2.5-1.5B-Instruct-abliterated-v1", - "id": "Goekdeniz-Guelmez/Josiefied-Qwen2.5-1.5B-Instruct-abliterated-v1", - "developer": "Goekdeniz-Guelmez", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.777 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4769 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4186 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2085 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2433 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3675 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2783 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Goekdeniz-Guelmez/Josiefied-Qwen2.5-1.5B-Instruct-abliterated-v2/81dfd69c-cf01-4114-8157-fd09af6f490c.json b/data/hfopenllm_v2/Goekdeniz-Guelmez/Josiefied-Qwen2.5-1.5B-Instruct-abliterated-v2/81dfd69c-cf01-4114-8157-fd09af6f490c.json deleted file mode 100644 index 5793c4f9c..000000000 --- a/data/hfopenllm_v2/Goekdeniz-Guelmez/Josiefied-Qwen2.5-1.5B-Instruct-abliterated-v2/81dfd69c-cf01-4114-8157-fd09af6f490c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Goekdeniz-Guelmez_Josiefied-Qwen2.5-1.5B-Instruct-abliterated-v2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Josiefied-Qwen2.5-1.5B-Instruct-abliterated-v2", - "id": "Goekdeniz-Guelmez/Josiefied-Qwen2.5-1.5B-Instruct-abliterated-v2", - "developer": "Goekdeniz-Guelmez", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.544 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4216 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4042 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1269 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy 
on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2399 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3769 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2562 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Goekdeniz-Guelmez/Josiefied-Qwen2.5-1.5B-Instruct-abliterated-v3/f38240ab-35e4-431e-b4d5-b1b0e1d57c5f.json b/data/hfopenllm_v2/Goekdeniz-Guelmez/Josiefied-Qwen2.5-1.5B-Instruct-abliterated-v3/f38240ab-35e4-431e-b4d5-b1b0e1d57c5f.json deleted file mode 100644 index c4deee9bd..000000000 --- a/data/hfopenllm_v2/Goekdeniz-Guelmez/Josiefied-Qwen2.5-1.5B-Instruct-abliterated-v3/f38240ab-35e4-431e-b4d5-b1b0e1d57c5f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Goekdeniz-Guelmez_Josiefied-Qwen2.5-1.5B-Instruct-abliterated-v3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Josiefied-Qwen2.5-1.5B-Instruct-abliterated-v3", - "id": "Goekdeniz-Guelmez/Josiefied-Qwen2.5-1.5B-Instruct-abliterated-v3", - "developer": "Goekdeniz-Guelmez", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.544 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4253 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4053 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1307 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, 
- "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2433 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3702 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2556 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Goekdeniz-Guelmez/Josiefied-Qwen2.5-14B-Instruct-abliterated-v4/01863b4f-9550-49c3-ad83-74c0bb535eb9.json b/data/hfopenllm_v2/Goekdeniz-Guelmez/Josiefied-Qwen2.5-14B-Instruct-abliterated-v4/01863b4f-9550-49c3-ad83-74c0bb535eb9.json deleted file mode 100644 index 42f6949ab..000000000 --- a/data/hfopenllm_v2/Goekdeniz-Guelmez/Josiefied-Qwen2.5-14B-Instruct-abliterated-v4/01863b4f-9550-49c3-ad83-74c0bb535eb9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Goekdeniz-Guelmez_Josiefied-Qwen2.5-14B-Instruct-abliterated-v4/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Josiefied-Qwen2.5-14B-Instruct-abliterated-v4", - "id": "Goekdeniz-Guelmez/Josiefied-Qwen2.5-14B-Instruct-abliterated-v4", - "developer": "Goekdeniz-Guelmez", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8292 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6356 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5423 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3423 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4287 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5018 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Goekdeniz-Guelmez/Josiefied-Qwen2.5-7B-Instruct-abliterated-v2/edd25437-38bc-443c-9da3-bc041270447e.json b/data/hfopenllm_v2/Goekdeniz-Guelmez/Josiefied-Qwen2.5-7B-Instruct-abliterated-v2/edd25437-38bc-443c-9da3-bc041270447e.json deleted file mode 100644 index 0afe4d3d1..000000000 --- a/data/hfopenllm_v2/Goekdeniz-Guelmez/Josiefied-Qwen2.5-7B-Instruct-abliterated-v2/edd25437-38bc-443c-9da3-bc041270447e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Goekdeniz-Guelmez_Josiefied-Qwen2.5-7B-Instruct-abliterated-v2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Josiefied-Qwen2.5-7B-Instruct-abliterated-v2", - "id": "Goekdeniz-Guelmez/Josiefied-Qwen2.5-7B-Instruct-abliterated-v2", - "developer": "Goekdeniz-Guelmez", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7814 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.531 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4532 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.2987 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4354 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.412 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Goekdeniz-Guelmez/j.o.s.i.e.v4o-1.5b-dpo-stage1-v1/31836d43-5022-488f-ba9e-379195809069.json b/data/hfopenllm_v2/Goekdeniz-Guelmez/j.o.s.i.e.v4o-1.5b-dpo-stage1-v1/31836d43-5022-488f-ba9e-379195809069.json deleted file mode 100644 index 85cf3c248..000000000 --- a/data/hfopenllm_v2/Goekdeniz-Guelmez/j.o.s.i.e.v4o-1.5b-dpo-stage1-v1/31836d43-5022-488f-ba9e-379195809069.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Goekdeniz-Guelmez_j.o.s.i.e.v4o-1.5b-dpo-stage1-v1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "j.o.s.i.e.v4o-1.5b-dpo-stage1-v1", - "id": "Goekdeniz-Guelmez/j.o.s.i.e.v4o-1.5b-dpo-stage1-v1", - "developer": "Goekdeniz-Guelmez", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.544 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4188 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4124 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1201 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2508 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": 
"MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3529 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2555 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Goekdeniz-Guelmez/josie-3b-v6.0/2a5a3ed6-7137-49e2-a141-497ceba88757.json b/data/hfopenllm_v2/Goekdeniz-Guelmez/josie-3b-v6.0/2a5a3ed6-7137-49e2-a141-497ceba88757.json deleted file mode 100644 index 492ac7214..000000000 --- a/data/hfopenllm_v2/Goekdeniz-Guelmez/josie-3b-v6.0/2a5a3ed6-7137-49e2-a141-497ceba88757.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Goekdeniz-Guelmez_josie-3b-v6.0/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "josie-3b-v6.0", - "id": "Goekdeniz-Guelmez/josie-3b-v6.0", - "developer": "Goekdeniz-Guelmez", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.086 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.601 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4496 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2938 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2903 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 
1.0 - }, - "score_details": { - "score": 0.3861 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.322 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Goekdeniz-Guelmez/josie-7b-v6.0-step2000/0b1c6aa6-b94e-4400-9b0d-c39aa1bcd808.json b/data/hfopenllm_v2/Goekdeniz-Guelmez/josie-7b-v6.0-step2000/0b1c6aa6-b94e-4400-9b0d-c39aa1bcd808.json deleted file mode 100644 index 55f207668..000000000 --- a/data/hfopenllm_v2/Goekdeniz-Guelmez/josie-7b-v6.0-step2000/0b1c6aa6-b94e-4400-9b0d-c39aa1bcd808.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Goekdeniz-Guelmez_josie-7b-v6.0-step2000/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "josie-7b-v6.0-step2000", - "id": "Goekdeniz-Guelmez/josie-7b-v6.0-step2000", - "developer": "Goekdeniz-Guelmez", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7598 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5107 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4237 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2768 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4539 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - 
"hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4012 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Goekdeniz-Guelmez/josie-7b-v6.0-step2000/69423132-adc9-4b97-b799-15f37de1d7e5.json b/data/hfopenllm_v2/Goekdeniz-Guelmez/josie-7b-v6.0-step2000/69423132-adc9-4b97-b799-15f37de1d7e5.json deleted file mode 100644 index 8561cb840..000000000 --- a/data/hfopenllm_v2/Goekdeniz-Guelmez/josie-7b-v6.0-step2000/69423132-adc9-4b97-b799-15f37de1d7e5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Goekdeniz-Guelmez_josie-7b-v6.0-step2000/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "josie-7b-v6.0-step2000", - "id": "Goekdeniz-Guelmez/josie-7b-v6.0-step2000", - "developer": "Goekdeniz-Guelmez", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7628 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5098 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2802 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4579 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 
0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4033 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Goekdeniz-Guelmez/josie-7b-v6.0/54d5bf0f-7c4c-40b1-bca6-5484ef8e2a04.json b/data/hfopenllm_v2/Goekdeniz-Guelmez/josie-7b-v6.0/54d5bf0f-7c4c-40b1-bca6-5484ef8e2a04.json deleted file mode 100644 index 359a9fc74..000000000 --- a/data/hfopenllm_v2/Goekdeniz-Guelmez/josie-7b-v6.0/54d5bf0f-7c4c-40b1-bca6-5484ef8e2a04.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Goekdeniz-Guelmez_josie-7b-v6.0/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "josie-7b-v6.0", - "id": "Goekdeniz-Guelmez/josie-7b-v6.0", - "developer": "Goekdeniz-Guelmez", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7412 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5105 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4358 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2827 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4154 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3807 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/GreenNode/GreenNode-small-9B-it/cfe8f9c7-e9bf-4a17-afa0-d5b8f46d24e7.json 
b/data/hfopenllm_v2/GreenNode/GreenNode-small-9B-it/cfe8f9c7-e9bf-4a17-afa0-d5b8f46d24e7.json deleted file mode 100644 index 847a2aed3..000000000 --- a/data/hfopenllm_v2/GreenNode/GreenNode-small-9B-it/cfe8f9c7-e9bf-4a17-afa0-d5b8f46d24e7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/GreenNode_GreenNode-small-9B-it/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "GreenNode-small-9B-it", - "id": "GreenNode/GreenNode-small-9B-it", - "developer": "GreenNode", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 9.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7436 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5994 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1745 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3196 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4204 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3927 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/GritLM/GritLM-7B-KTO/7fbc0323-1c78-46b6-a08a-6e5870c64e53.json b/data/hfopenllm_v2/GritLM/GritLM-7B-KTO/7fbc0323-1c78-46b6-a08a-6e5870c64e53.json deleted file mode 100644 index 054831436..000000000 --- a/data/hfopenllm_v2/GritLM/GritLM-7B-KTO/7fbc0323-1c78-46b6-a08a-6e5870c64e53.json +++ /dev/null @@ -1,132 
+0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/GritLM_GritLM-7B-KTO/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "GritLM-7B-KTO", - "id": "GritLM/GritLM-7B-KTO", - "developer": "GritLM", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.531 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4853 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0272 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2978 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.371 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.268 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/GritLM/GritLM-8x7B-KTO/1c769f0d-b99d-4b82-a529-f5264f7b3349.json b/data/hfopenllm_v2/GritLM/GritLM-8x7B-KTO/1c769f0d-b99d-4b82-a529-f5264f7b3349.json deleted file mode 100644 index 9c52b15ad..000000000 --- a/data/hfopenllm_v2/GritLM/GritLM-8x7B-KTO/1c769f0d-b99d-4b82-a529-f5264f7b3349.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/GritLM_GritLM-8x7B-KTO/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging 
Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "GritLM-8x7B-KTO", - "id": "GritLM/GritLM-8x7B-KTO", - "developer": "GritLM", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MixtralForCausalLM", - "params_billions": 46.703 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5714 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.582 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1224 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2961 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4217 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3648 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Groq/Llama-3-Groq-8B-Tool-Use/a9365685-e299-48e2-931a-c63e123a9e00.json b/data/hfopenllm_v2/Groq/Llama-3-Groq-8B-Tool-Use/a9365685-e299-48e2-931a-c63e123a9e00.json deleted file mode 100644 index 1407a69b5..000000000 --- a/data/hfopenllm_v2/Groq/Llama-3-Groq-8B-Tool-Use/a9365685-e299-48e2-931a-c63e123a9e00.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Groq_Llama-3-Groq-8B-Tool-Use/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-Groq-8B-Tool-Use", - "id": "Groq/Llama-3-Groq-8B-Tool-Use", - "developer": "Groq", - "inference_platform": "unknown", - "additional_details": { - "precision": 
"bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6098 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4863 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0604 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2676 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.366 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3399 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Gryphe/Pantheon-RP-1.0-8b-Llama-3/bdf2d61a-daa1-4b1f-9245-43ff263540fb.json b/data/hfopenllm_v2/Gryphe/Pantheon-RP-1.0-8b-Llama-3/bdf2d61a-daa1-4b1f-9245-43ff263540fb.json deleted file mode 100644 index 7c3f4b4a5..000000000 --- a/data/hfopenllm_v2/Gryphe/Pantheon-RP-1.0-8b-Llama-3/bdf2d61a-daa1-4b1f-9245-43ff263540fb.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Gryphe_Pantheon-RP-1.0-8b-Llama-3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Pantheon-RP-1.0-8b-Llama-3", - "id": "Gryphe/Pantheon-RP-1.0-8b-Llama-3", - "developer": "Gryphe", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - 
"hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3933 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4539 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0634 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.276 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3832 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3067 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Gryphe/Pantheon-RP-1.5-12b-Nemo/f0b4eef9-dab2-48e2-87f8-ad83ec33ec23.json b/data/hfopenllm_v2/Gryphe/Pantheon-RP-1.5-12b-Nemo/f0b4eef9-dab2-48e2-87f8-ad83ec33ec23.json deleted file mode 100644 index 5c8283525..000000000 --- a/data/hfopenllm_v2/Gryphe/Pantheon-RP-1.5-12b-Nemo/f0b4eef9-dab2-48e2-87f8-ad83ec33ec23.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Gryphe_Pantheon-RP-1.5-12b-Nemo/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Pantheon-RP-1.5-12b-Nemo", - "id": "Gryphe/Pantheon-RP-1.5-12b-Nemo", - "developer": "Gryphe", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.4763 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5196 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0491 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2727 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.442 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3302 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Gryphe/Pantheon-RP-1.6-12b-Nemo-KTO/29e10491-8c34-4b7a-a0bd-77f6ca0dc54c.json b/data/hfopenllm_v2/Gryphe/Pantheon-RP-1.6-12b-Nemo-KTO/29e10491-8c34-4b7a-a0bd-77f6ca0dc54c.json deleted file mode 100644 index 2a9bc0b72..000000000 --- a/data/hfopenllm_v2/Gryphe/Pantheon-RP-1.6-12b-Nemo-KTO/29e10491-8c34-4b7a-a0bd-77f6ca0dc54c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Gryphe_Pantheon-RP-1.6-12b-Nemo-KTO/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Pantheon-RP-1.6-12b-Nemo-KTO", - "id": "Gryphe/Pantheon-RP-1.6-12b-Nemo-KTO", - "developer": "Gryphe", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4636 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5277 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0529 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2953 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4248 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3382 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Gryphe/Pantheon-RP-1.6-12b-Nemo/c588d86a-80c4-46d1-93e0-b7fa8491f3b3.json b/data/hfopenllm_v2/Gryphe/Pantheon-RP-1.6-12b-Nemo/c588d86a-80c4-46d1-93e0-b7fa8491f3b3.json deleted file mode 100644 index 36f59a731..000000000 --- a/data/hfopenllm_v2/Gryphe/Pantheon-RP-1.6-12b-Nemo/c588d86a-80c4-46d1-93e0-b7fa8491f3b3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Gryphe_Pantheon-RP-1.6-12b-Nemo/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Pantheon-RP-1.6-12b-Nemo", - "id": "Gryphe/Pantheon-RP-1.6-12b-Nemo", - "developer": "Gryphe", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4481 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5204 - } - }, - { - "evaluation_name": "MATH Level 
5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0461 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2777 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4288 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3311 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Gryphe/Pantheon-RP-Pure-1.6.2-22b-Small/0b11eb9a-61c8-4af1-8335-24bef2597e5d.json b/data/hfopenllm_v2/Gryphe/Pantheon-RP-Pure-1.6.2-22b-Small/0b11eb9a-61c8-4af1-8335-24bef2597e5d.json deleted file mode 100644 index 672a5af34..000000000 --- a/data/hfopenllm_v2/Gryphe/Pantheon-RP-Pure-1.6.2-22b-Small/0b11eb9a-61c8-4af1-8335-24bef2597e5d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Gryphe_Pantheon-RP-Pure-1.6.2-22b-Small/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Pantheon-RP-Pure-1.6.2-22b-Small", - "id": "Gryphe/Pantheon-RP-Pure-1.6.2-22b-Small", - "developer": "Gryphe", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 22.247 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6931 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5305 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - 
"evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2024 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3289 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3765 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3942 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/GuilhermeNaturaUmana/Nature-Reason-1.2-reallysmall/7d31e5fd-700a-42a8-bea8-8989e8c52603.json b/data/hfopenllm_v2/GuilhermeNaturaUmana/Nature-Reason-1.2-reallysmall/7d31e5fd-700a-42a8-bea8-8989e8c52603.json deleted file mode 100644 index 51a98d10f..000000000 --- a/data/hfopenllm_v2/GuilhermeNaturaUmana/Nature-Reason-1.2-reallysmall/7d31e5fd-700a-42a8-bea8-8989e8c52603.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/GuilhermeNaturaUmana_Nature-Reason-1.2-reallysmall/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Nature-Reason-1.2-reallysmall", - "id": "GuilhermeNaturaUmana/Nature-Reason-1.2-reallysmall", - "developer": "GuilhermeNaturaUmana", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4791 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5649 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.25 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2995 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4439 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4408 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/GuilhermeNaturaUmana/Nature-Reason-1.2-reallysmall/f993880a-3c7c-4af9-a3ce-3c27207b9a3c.json b/data/hfopenllm_v2/GuilhermeNaturaUmana/Nature-Reason-1.2-reallysmall/f993880a-3c7c-4af9-a3ce-3c27207b9a3c.json deleted file mode 100644 index 36f7c0fb0..000000000 --- a/data/hfopenllm_v2/GuilhermeNaturaUmana/Nature-Reason-1.2-reallysmall/f993880a-3c7c-4af9-a3ce-3c27207b9a3c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/GuilhermeNaturaUmana_Nature-Reason-1.2-reallysmall/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Nature-Reason-1.2-reallysmall", - "id": "GuilhermeNaturaUmana/Nature-Reason-1.2-reallysmall", - "developer": "GuilhermeNaturaUmana", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4985 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5645 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2576 - } - }, - { - "evaluation_name": "GPQA", - 
"source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3003 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4373 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4429 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Gunulhona/Gemma-Ko-Merge-PEFT/2fae7e4a-8c28-4be8-9391-ca79077e32c2.json b/data/hfopenllm_v2/Gunulhona/Gemma-Ko-Merge-PEFT/2fae7e4a-8c28-4be8-9391-ca79077e32c2.json deleted file mode 100644 index 3d2506966..000000000 --- a/data/hfopenllm_v2/Gunulhona/Gemma-Ko-Merge-PEFT/2fae7e4a-8c28-4be8-9391-ca79077e32c2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Gunulhona_Gemma-Ko-Merge-PEFT/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gemma-Ko-Merge-PEFT", - "id": "Gunulhona/Gemma-Ko-Merge-PEFT", - "developer": "Gunulhona", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "?", - "params_billions": 20.318 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.288 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5154 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.3247 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.408 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3817 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Gunulhona/Gemma-Ko-Merge-PEFT/436e651e-6f04-44ff-ab3d-db8ed0d639bd.json b/data/hfopenllm_v2/Gunulhona/Gemma-Ko-Merge-PEFT/436e651e-6f04-44ff-ab3d-db8ed0d639bd.json deleted file mode 100644 index d5f8fff6e..000000000 --- a/data/hfopenllm_v2/Gunulhona/Gemma-Ko-Merge-PEFT/436e651e-6f04-44ff-ab3d-db8ed0d639bd.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Gunulhona_Gemma-Ko-Merge-PEFT/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gemma-Ko-Merge-PEFT", - "id": "Gunulhona/Gemma-Ko-Merge-PEFT", - "developer": "Gunulhona", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "?", - "params_billions": 20.318 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4441 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4863 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.307 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": 
"Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3986 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3098 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Gunulhona/Gemma-Ko-Merge/9fbccac2-c840-494e-a24d-a6f0c9a07b88.json b/data/hfopenllm_v2/Gunulhona/Gemma-Ko-Merge/9fbccac2-c840-494e-a24d-a6f0c9a07b88.json deleted file mode 100644 index 6f57f8828..000000000 --- a/data/hfopenllm_v2/Gunulhona/Gemma-Ko-Merge/9fbccac2-c840-494e-a24d-a6f0c9a07b88.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Gunulhona_Gemma-Ko-Merge/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gemma-Ko-Merge", - "id": "Gunulhona/Gemma-Ko-Merge", - "developer": "Gunulhona", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6416 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5813 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1881 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3356 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4047 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": 
"hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3879 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/HPAI-BSC/Llama3-Aloe-8B-Alpha/a4ee6a33-df51-4a4e-a13d-45488a094fd7.json b/data/hfopenllm_v2/HPAI-BSC/Llama3-Aloe-8B-Alpha/a4ee6a33-df51-4a4e-a13d-45488a094fd7.json deleted file mode 100644 index 97b6f883b..000000000 --- a/data/hfopenllm_v2/HPAI-BSC/Llama3-Aloe-8B-Alpha/a4ee6a33-df51-4a4e-a13d-45488a094fd7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/HPAI-BSC_Llama3-Aloe-8B-Alpha/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama3-Aloe-8B-Alpha", - "id": "HPAI-BSC/Llama3-Aloe-8B-Alpha", - "developer": "HPAI-BSC", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5081 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4831 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0612 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2945 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3673 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { 
- "score": 0.3295 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/HPAI-BSC/Llama3.1-Aloe-Beta-8B/a3923f10-e64c-4556-9616-4fe7072eff60.json b/data/hfopenllm_v2/HPAI-BSC/Llama3.1-Aloe-Beta-8B/a3923f10-e64c-4556-9616-4fe7072eff60.json deleted file mode 100644 index 00bf8e2ef..000000000 --- a/data/hfopenllm_v2/HPAI-BSC/Llama3.1-Aloe-Beta-8B/a3923f10-e64c-4556-9616-4fe7072eff60.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/HPAI-BSC_Llama3.1-Aloe-Beta-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama3.1-Aloe-Beta-8B", - "id": "HPAI-BSC/Llama3.1-Aloe-Beta-8B", - "developer": "HPAI-BSC", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7253 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5093 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1828 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2685 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3835 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.358 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/HPAI-BSC/Qwen2.5-Aloe-Beta-7B/ca15d972-9075-42df-884b-5d069f6ff425.json 
b/data/hfopenllm_v2/HPAI-BSC/Qwen2.5-Aloe-Beta-7B/ca15d972-9075-42df-884b-5d069f6ff425.json deleted file mode 100644 index 4e7791b20..000000000 --- a/data/hfopenllm_v2/HPAI-BSC/Qwen2.5-Aloe-Beta-7B/ca15d972-9075-42df-884b-5d069f6ff425.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/HPAI-BSC_Qwen2.5-Aloe-Beta-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-Aloe-Beta-7B", - "id": "HPAI-BSC/Qwen2.5-Aloe-Beta-7B", - "developer": "HPAI-BSC", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4554 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5049 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3542 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2911 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.426 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4354 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/HarbingerX/Zeitgeist-3b-V1.2/905909a5-abef-46bf-9392-c97873e229df.json b/data/hfopenllm_v2/HarbingerX/Zeitgeist-3b-V1.2/905909a5-abef-46bf-9392-c97873e229df.json deleted file mode 100644 index e6b388131..000000000 --- a/data/hfopenllm_v2/HarbingerX/Zeitgeist-3b-V1.2/905909a5-abef-46bf-9392-c97873e229df.json +++ 
/dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/HarbingerX_Zeitgeist-3b-V1.2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Zeitgeist-3b-V1.2", - "id": "HarbingerX/Zeitgeist-3b-V1.2", - "developer": "HarbingerX", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6754 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4441 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1012 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2777 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3579 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3056 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/HarbingerX/Zeitgeist-3b-V1/95bd05cf-8f59-409d-a99e-d249bad6c561.json b/data/hfopenllm_v2/HarbingerX/Zeitgeist-3b-V1/95bd05cf-8f59-409d-a99e-d249bad6c561.json deleted file mode 100644 index e04145e0b..000000000 --- a/data/hfopenllm_v2/HarbingerX/Zeitgeist-3b-V1/95bd05cf-8f59-409d-a99e-d249bad6c561.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/HarbingerX_Zeitgeist-3b-V1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": 
"documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Zeitgeist-3b-V1", - "id": "HarbingerX/Zeitgeist-3b-V1", - "developer": "HarbingerX", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6712 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4441 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1035 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2819 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3579 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3009 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Hastagaras/L3.2-JametMini-3B-MK.III/76b12246-33f6-4992-a0ab-38704dcf6345.json b/data/hfopenllm_v2/Hastagaras/L3.2-JametMini-3B-MK.III/76b12246-33f6-4992-a0ab-38704dcf6345.json deleted file mode 100644 index 9a45e8545..000000000 --- a/data/hfopenllm_v2/Hastagaras/L3.2-JametMini-3B-MK.III/76b12246-33f6-4992-a0ab-38704dcf6345.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Hastagaras_L3.2-JametMini-3B-MK.III/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "L3.2-JametMini-3B-MK.III", - "id": "Hastagaras/L3.2-JametMini-3B-MK.III", - "developer": 
"Hastagaras", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6183 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4539 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1458 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2827 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3686 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2983 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Hastagaras/Llama-3.1-Jamet-8B-MK.I/e4415806-0ec0-465a-b28f-9c8741436fb4.json b/data/hfopenllm_v2/Hastagaras/Llama-3.1-Jamet-8B-MK.I/e4415806-0ec0-465a-b28f-9c8741436fb4.json deleted file mode 100644 index 680f1f7a0..000000000 --- a/data/hfopenllm_v2/Hastagaras/Llama-3.1-Jamet-8B-MK.I/e4415806-0ec0-465a-b28f-9c8741436fb4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Hastagaras_Llama-3.1-Jamet-8B-MK.I/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.1-Jamet-8B-MK.I", - "id": "Hastagaras/Llama-3.1-Jamet-8B-MK.I", - "developer": "Hastagaras", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - 
"evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7338 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5049 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1269 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2743 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3726 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3482 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Hastagaras/Zabuza-8B-Llama-3.1/98e62ab5-d35a-42dd-904b-bed9c50f3745.json b/data/hfopenllm_v2/Hastagaras/Zabuza-8B-Llama-3.1/98e62ab5-d35a-42dd-904b-bed9c50f3745.json deleted file mode 100644 index a06055349..000000000 --- a/data/hfopenllm_v2/Hastagaras/Zabuza-8B-Llama-3.1/98e62ab5-d35a-42dd-904b-bed9c50f3745.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Hastagaras_Zabuza-8B-Llama-3.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Zabuza-8B-Llama-3.1", - "id": "Hastagaras/Zabuza-8B-Llama-3.1", - "developer": "Hastagaras", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6265 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4539 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0551 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2643 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3568 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2923 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/HelpingAI/Cipher-20B/8fb3596e-224e-492b-bdb6-a95a16656eb0.json b/data/hfopenllm_v2/HelpingAI/Cipher-20B/8fb3596e-224e-492b-bdb6-a95a16656eb0.json deleted file mode 100644 index a8c33014c..000000000 --- a/data/hfopenllm_v2/HelpingAI/Cipher-20B/8fb3596e-224e-492b-bdb6-a95a16656eb0.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/HelpingAI_Cipher-20B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Cipher-20B", - "id": "HelpingAI/Cipher-20B", - "developer": "HelpingAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 20.551 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5378 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6032 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1994 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2953 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4003 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3744 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/HelpingAI/Dhanishtha-Large/154203c4-d86e-4c36-806b-c45c5cc568ce.json b/data/hfopenllm_v2/HelpingAI/Dhanishtha-Large/154203c4-d86e-4c36-806b-c45c5cc568ce.json deleted file mode 100644 index d0d75bfd3..000000000 --- a/data/hfopenllm_v2/HelpingAI/Dhanishtha-Large/154203c4-d86e-4c36-806b-c45c5cc568ce.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/HelpingAI_Dhanishtha-Large/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Dhanishtha-Large", - "id": "HelpingAI/Dhanishtha-Large", - "developer": "HelpingAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.613 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2457 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4604 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - 
"dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3852 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3029 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3845 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2755 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/HelpingAI/Priya-10B/e42c01f7-2869-4103-bbfd-81aa5a15c140.json b/data/hfopenllm_v2/HelpingAI/Priya-10B/e42c01f7-2869-4103-bbfd-81aa5a15c140.json deleted file mode 100644 index 5c591c97d..000000000 --- a/data/hfopenllm_v2/HelpingAI/Priya-10B/e42c01f7-2869-4103-bbfd-81aa5a15c140.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/HelpingAI_Priya-10B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Priya-10B", - "id": "HelpingAI/Priya-10B", - "developer": "HelpingAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 10.211 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4043 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4441 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.0189 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2559 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3793 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2493 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/HelpingAI/Priya-3B/323d2f94-5e04-4627-9f74-129217f53eea.json b/data/hfopenllm_v2/HelpingAI/Priya-3B/323d2f94-5e04-4627-9f74-129217f53eea.json deleted file mode 100644 index ab36912a7..000000000 --- a/data/hfopenllm_v2/HelpingAI/Priya-3B/323d2f94-5e04-4627-9f74-129217f53eea.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/HelpingAI_Priya-3B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Priya-3B", - "id": "HelpingAI/Priya-3B", - "developer": "HelpingAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 2.81 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4526 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3961 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0144 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2567 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3713 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2339 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/HeraiHench/DeepSeek-R1-Qwen-Coder-8B/6bcc284b-8973-47d5-b5b1-1abb7a3242ee.json b/data/hfopenllm_v2/HeraiHench/DeepSeek-R1-Qwen-Coder-8B/6bcc284b-8973-47d5-b5b1-1abb7a3242ee.json deleted file mode 100644 index 83225ff07..000000000 --- a/data/hfopenllm_v2/HeraiHench/DeepSeek-R1-Qwen-Coder-8B/6bcc284b-8973-47d5-b5b1-1abb7a3242ee.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/HeraiHench_DeepSeek-R1-Qwen-Coder-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "DeepSeek-R1-Qwen-Coder-8B", - "id": "HeraiHench/DeepSeek-R1-Qwen-Coder-8B", - "developer": "HeraiHench", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 8.164 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1869 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2913 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2601 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": 
"hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3738 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1123 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/HeraiHench/Double-Down-Qwen-Math-7B/691cace3-5316-4f5b-8693-67efb24a0a06.json b/data/hfopenllm_v2/HeraiHench/Double-Down-Qwen-Math-7B/691cace3-5316-4f5b-8693-67efb24a0a06.json deleted file mode 100644 index 6515c0ca8..000000000 --- a/data/hfopenllm_v2/HeraiHench/Double-Down-Qwen-Math-7B/691cace3-5316-4f5b-8693-67efb24a0a06.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/HeraiHench_Double-Down-Qwen-Math-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Double-Down-Qwen-Math-7B", - "id": "HeraiHench/Double-Down-Qwen-Math-7B", - "developer": "HeraiHench", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.167 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2845 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0008 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2651 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 
- }, - "score_details": { - "score": 0.3737 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1112 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/HeraiHench/Marge-Qwen-Math-7B/d387b3dc-9e76-44a6-9a9f-132a4fd762b4.json b/data/hfopenllm_v2/HeraiHench/Marge-Qwen-Math-7B/d387b3dc-9e76-44a6-9a9f-132a4fd762b4.json deleted file mode 100644 index c9043860f..000000000 --- a/data/hfopenllm_v2/HeraiHench/Marge-Qwen-Math-7B/d387b3dc-9e76-44a6-9a9f-132a4fd762b4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/HeraiHench_Marge-Qwen-Math-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Marge-Qwen-Math-7B", - "id": "HeraiHench/Marge-Qwen-Math-7B", - "developer": "HeraiHench", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1262 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3069 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0053 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2391 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3939 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1056 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/HeraiHench/Phi-4-slerp-ReasoningRP-14B/f6f515d3-f5e9-4362-be51-bb8fc05527e6.json b/data/hfopenllm_v2/HeraiHench/Phi-4-slerp-ReasoningRP-14B/f6f515d3-f5e9-4362-be51-bb8fc05527e6.json deleted file mode 100644 index 287a631b7..000000000 --- a/data/hfopenllm_v2/HeraiHench/Phi-4-slerp-ReasoningRP-14B/f6f515d3-f5e9-4362-be51-bb8fc05527e6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/HeraiHench_Phi-4-slerp-ReasoningRP-14B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Phi-4-slerp-ReasoningRP-14B", - "id": "HeraiHench/Phi-4-slerp-ReasoningRP-14B", - "developer": "HeraiHench", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 9.207 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1575 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4196 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2936 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3116 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.19 - } - } 
- ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/HiroseKoichi/Llama-Salad-4x8B-V3/2e1e215f-b622-439f-a13f-531441e25ae3.json b/data/hfopenllm_v2/HiroseKoichi/Llama-Salad-4x8B-V3/2e1e215f-b622-439f-a13f-531441e25ae3.json deleted file mode 100644 index 68ae14b6f..000000000 --- a/data/hfopenllm_v2/HiroseKoichi/Llama-Salad-4x8B-V3/2e1e215f-b622-439f-a13f-531441e25ae3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/HiroseKoichi_Llama-Salad-4x8B-V3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-Salad-4x8B-V3", - "id": "HiroseKoichi/Llama-Salad-4x8B-V3", - "developer": "HiroseKoichi", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MixtralForCausalLM", - "params_billions": 24.942 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6654 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5245 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0959 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3029 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.374 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3518 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/HoangHa/Pensez-Llama3.1-8B/d50d66a9-a0c4-4b82-922c-9d012f1b50a1.json b/data/hfopenllm_v2/HoangHa/Pensez-Llama3.1-8B/d50d66a9-a0c4-4b82-922c-9d012f1b50a1.json 
deleted file mode 100644 index bdc98fec4..000000000 --- a/data/hfopenllm_v2/HoangHa/Pensez-Llama3.1-8B/d50d66a9-a0c4-4b82-922c-9d012f1b50a1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/HoangHa_Pensez-Llama3.1-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Pensez-Llama3.1-8B", - "id": "HoangHa/Pensez-Llama3.1-8B", - "developer": "HoangHa", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3887 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4669 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1148 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2886 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3597 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3126 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/HuggingFaceH4/zephyr-7b-alpha/ea7292a8-3f07-47be-b8ae-7d352ed1ecb6.json b/data/hfopenllm_v2/HuggingFaceH4/zephyr-7b-alpha/ea7292a8-3f07-47be-b8ae-7d352ed1ecb6.json deleted file mode 100644 index 6f52aa9ae..000000000 --- a/data/hfopenllm_v2/HuggingFaceH4/zephyr-7b-alpha/ea7292a8-3f07-47be-b8ae-7d352ed1ecb6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/HuggingFaceH4_zephyr-7b-alpha/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "zephyr-7b-alpha", - "id": "HuggingFaceH4/zephyr-7b-alpha", - "developer": "HuggingFaceH4", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5191 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4583 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0196 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2978 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.395 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2795 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/HuggingFaceH4/zephyr-7b-beta/4eedd6d4-279f-4660-8d71-708a27bb53e0.json b/data/hfopenllm_v2/HuggingFaceH4/zephyr-7b-beta/4eedd6d4-279f-4660-8d71-708a27bb53e0.json deleted file mode 100644 index 243694134..000000000 --- a/data/hfopenllm_v2/HuggingFaceH4/zephyr-7b-beta/4eedd6d4-279f-4660-8d71-708a27bb53e0.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/HuggingFaceH4_zephyr-7b-beta/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - 
"evaluator_relationship": "third_party" - }, - "model_info": { - "name": "zephyr-7b-beta", - "id": "HuggingFaceH4/zephyr-7b-beta", - "developer": "HuggingFaceH4", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.495 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4316 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0287 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2903 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3925 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2781 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/HuggingFaceH4/zephyr-7b-gemma-v0.1/9c0f67d1-f95d-4ca0-a234-2e09ac788f55.json b/data/hfopenllm_v2/HuggingFaceH4/zephyr-7b-gemma-v0.1/9c0f67d1-f95d-4ca0-a234-2e09ac788f55.json deleted file mode 100644 index 46badaeea..000000000 --- a/data/hfopenllm_v2/HuggingFaceH4/zephyr-7b-gemma-v0.1/9c0f67d1-f95d-4ca0-a234-2e09ac788f55.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/HuggingFaceH4_zephyr-7b-gemma-v0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "zephyr-7b-gemma-v0.1", - "id": "HuggingFaceH4/zephyr-7b-gemma-v0.1", - "developer": "HuggingFaceH4", - "inference_platform": "unknown", - 
"additional_details": { - "precision": "bfloat16", - "architecture": "GemmaForCausalLM", - "params_billions": 8.538 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3364 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4624 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0816 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2945 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.374 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2847 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/HuggingFaceH4/zephyr-orpo-141b-A35b-v0.1/e5c0fbc9-f424-4b04-839a-8335adaf89cc.json b/data/hfopenllm_v2/HuggingFaceH4/zephyr-orpo-141b-A35b-v0.1/e5c0fbc9-f424-4b04-839a-8335adaf89cc.json deleted file mode 100644 index 24a491daf..000000000 --- a/data/hfopenllm_v2/HuggingFaceH4/zephyr-orpo-141b-A35b-v0.1/e5c0fbc9-f424-4b04-839a-8335adaf89cc.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/HuggingFaceH4_zephyr-orpo-141b-A35b-v0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "zephyr-orpo-141b-A35b-v0.1", - "id": "HuggingFaceH4/zephyr-orpo-141b-A35b-v0.1", - "developer": "HuggingFaceH4", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MixtralForCausalLM", - "params_billions": 140.621 - } - }, - "evaluation_results": [ - { - "evaluation_name": 
"IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6511 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.629 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2047 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3784 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4465 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4586 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/HuggingFaceTB/SmolLM-1.7B-Instruct/d91107fa-eb8d-4d01-90a2-fc9831f337b2.json b/data/hfopenllm_v2/HuggingFaceTB/SmolLM-1.7B-Instruct/d91107fa-eb8d-4d01-90a2-fc9831f337b2.json deleted file mode 100644 index 0268b6bbe..000000000 --- a/data/hfopenllm_v2/HuggingFaceTB/SmolLM-1.7B-Instruct/d91107fa-eb8d-4d01-90a2-fc9831f337b2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/HuggingFaceTB_SmolLM-1.7B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SmolLM-1.7B-Instruct", - "id": "HuggingFaceTB/SmolLM-1.7B-Instruct", - "developer": "HuggingFaceTB", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.71 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": 
false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2348 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2885 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0211 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2601 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3487 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1166 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/HuggingFaceTB/SmolLM-1.7B/926999bf-1ba6-4321-82b2-fcced4336739.json b/data/hfopenllm_v2/HuggingFaceTB/SmolLM-1.7B/926999bf-1ba6-4321-82b2-fcced4336739.json deleted file mode 100644 index e9cd807d2..000000000 --- a/data/hfopenllm_v2/HuggingFaceTB/SmolLM-1.7B/926999bf-1ba6-4321-82b2-fcced4336739.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/HuggingFaceTB_SmolLM-1.7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SmolLM-1.7B", - "id": "HuggingFaceTB/SmolLM-1.7B", - "developer": "HuggingFaceTB", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.71 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2362 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, 
- "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3181 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0166 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2416 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3421 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1148 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/HuggingFaceTB/SmolLM-135M-Instruct/57d481bf-0db9-4208-afda-dcd20df13964.json b/data/hfopenllm_v2/HuggingFaceTB/SmolLM-135M-Instruct/57d481bf-0db9-4208-afda-dcd20df13964.json deleted file mode 100644 index 745d7547b..000000000 --- a/data/hfopenllm_v2/HuggingFaceTB/SmolLM-135M-Instruct/57d481bf-0db9-4208-afda-dcd20df13964.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/HuggingFaceTB_SmolLM-135M-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SmolLM-135M-Instruct", - "id": "HuggingFaceTB/SmolLM-135M-Instruct", - "developer": "HuggingFaceTB", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 0.135 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1214 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3015 - } - }, 
- { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0053 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2592 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3635 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1176 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/HuggingFaceTB/SmolLM-135M/eb417e47-fe63-4dc5-b3e5-28782f3782da.json b/data/hfopenllm_v2/HuggingFaceTB/SmolLM-135M/eb417e47-fe63-4dc5-b3e5-28782f3782da.json deleted file mode 100644 index b15e9c58d..000000000 --- a/data/hfopenllm_v2/HuggingFaceTB/SmolLM-135M/eb417e47-fe63-4dc5-b3e5-28782f3782da.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/HuggingFaceTB_SmolLM-135M/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SmolLM-135M", - "id": "HuggingFaceTB/SmolLM-135M", - "developer": "HuggingFaceTB", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 0.13 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2125 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3046 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0136 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2584 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4366 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1122 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/HuggingFaceTB/SmolLM-360M-Instruct/b0f516dd-7185-4906-87a5-3c6f019894d0.json b/data/hfopenllm_v2/HuggingFaceTB/SmolLM-360M-Instruct/b0f516dd-7185-4906-87a5-3c6f019894d0.json deleted file mode 100644 index c5826e27d..000000000 --- a/data/hfopenllm_v2/HuggingFaceTB/SmolLM-360M-Instruct/b0f516dd-7185-4906-87a5-3c6f019894d0.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/HuggingFaceTB_SmolLM-360M-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SmolLM-360M-Instruct", - "id": "HuggingFaceTB/SmolLM-360M-Instruct", - "developer": "HuggingFaceTB", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 0.362 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1952 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2885 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0181 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": 
"GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2643 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3472 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1166 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/HuggingFaceTB/SmolLM-360M/1e562944-a205-4ef7-aff1-3776595d131c.json b/data/hfopenllm_v2/HuggingFaceTB/SmolLM-360M/1e562944-a205-4ef7-aff1-3776595d131c.json deleted file mode 100644 index 3e49abf4e..000000000 --- a/data/hfopenllm_v2/HuggingFaceTB/SmolLM-360M/1e562944-a205-4ef7-aff1-3776595d131c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/HuggingFaceTB_SmolLM-360M/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SmolLM-360M", - "id": "HuggingFaceTB/SmolLM-360M", - "developer": "HuggingFaceTB", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 0.36 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2134 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3065 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0113 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.2676 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4018 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1124 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/HuggingFaceTB/SmolLM2-1.7B-Instruct/6ccaf08d-1b0a-4ca9-941e-a71e2dce5cb4.json b/data/hfopenllm_v2/HuggingFaceTB/SmolLM2-1.7B-Instruct/6ccaf08d-1b0a-4ca9-941e-a71e2dce5cb4.json deleted file mode 100644 index 7fcd1bd5f..000000000 --- a/data/hfopenllm_v2/HuggingFaceTB/SmolLM2-1.7B-Instruct/6ccaf08d-1b0a-4ca9-941e-a71e2dce5cb4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/HuggingFaceTB_SmolLM2-1.7B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SmolLM2-1.7B-Instruct", - "id": "HuggingFaceTB/SmolLM2-1.7B-Instruct", - "developer": "HuggingFaceTB", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.711 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5368 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3599 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0582 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2794 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3421 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2054 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/HuggingFaceTB/SmolLM2-1.7B/2064938d-9f05-4740-a4d4-2a2da0eac21d.json b/data/hfopenllm_v2/HuggingFaceTB/SmolLM2-1.7B/2064938d-9f05-4740-a4d4-2a2da0eac21d.json deleted file mode 100644 index c33283bc1..000000000 --- a/data/hfopenllm_v2/HuggingFaceTB/SmolLM2-1.7B/2064938d-9f05-4740-a4d4-2a2da0eac21d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/HuggingFaceTB_SmolLM2-1.7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SmolLM2-1.7B", - "id": "HuggingFaceTB/SmolLM2-1.7B", - "developer": "HuggingFaceTB", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.71 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.244 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3453 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0264 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2794 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3485 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - 
"dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2138 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/HuggingFaceTB/SmolLM2-135M-Instruct/43240184-8245-43ff-a971-678523918fe0.json b/data/hfopenllm_v2/HuggingFaceTB/SmolLM2-135M-Instruct/43240184-8245-43ff-a971-678523918fe0.json deleted file mode 100644 index a41b862d1..000000000 --- a/data/hfopenllm_v2/HuggingFaceTB/SmolLM2-135M-Instruct/43240184-8245-43ff-a971-678523918fe0.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/HuggingFaceTB_SmolLM2-135M-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SmolLM2-135M-Instruct", - "id": "HuggingFaceTB/SmolLM2-135M-Instruct", - "developer": "HuggingFaceTB", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 0.135 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0593 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3135 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0144 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2341 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3871 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1092 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/HuggingFaceTB/SmolLM2-135M-Instruct/b3b854b6-700c-4297-b335-6acc3c385f84.json b/data/hfopenllm_v2/HuggingFaceTB/SmolLM2-135M-Instruct/b3b854b6-700c-4297-b335-6acc3c385f84.json deleted file mode 100644 index 5a55c0fe8..000000000 --- a/data/hfopenllm_v2/HuggingFaceTB/SmolLM2-135M-Instruct/b3b854b6-700c-4297-b335-6acc3c385f84.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/HuggingFaceTB_SmolLM2-135M-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SmolLM2-135M-Instruct", - "id": "HuggingFaceTB/SmolLM2-135M-Instruct", - "developer": "HuggingFaceTB", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 0.135 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2883 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3124 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.003 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2357 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3662 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1115 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/HuggingFaceTB/SmolLM2-135M/a9d79c6a-f99a-4b60-8e37-ee2cdfe75f30.json b/data/hfopenllm_v2/HuggingFaceTB/SmolLM2-135M/a9d79c6a-f99a-4b60-8e37-ee2cdfe75f30.json deleted file mode 100644 index 08375897a..000000000 --- a/data/hfopenllm_v2/HuggingFaceTB/SmolLM2-135M/a9d79c6a-f99a-4b60-8e37-ee2cdfe75f30.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/HuggingFaceTB_SmolLM2-135M/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SmolLM2-135M", - "id": "HuggingFaceTB/SmolLM2-135M", - "developer": "HuggingFaceTB", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 0.135 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1818 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3044 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0121 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2483 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4112 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1095 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/HuggingFaceTB/SmolLM2-360M-Instruct/88e1dd78-d3bc-401b-88e9-d963bac181db.json b/data/hfopenllm_v2/HuggingFaceTB/SmolLM2-360M-Instruct/88e1dd78-d3bc-401b-88e9-d963bac181db.json deleted file mode 100644 index bfc21511e..000000000 --- 
a/data/hfopenllm_v2/HuggingFaceTB/SmolLM2-360M-Instruct/88e1dd78-d3bc-401b-88e9-d963bac181db.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/HuggingFaceTB_SmolLM2-360M-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SmolLM2-360M-Instruct", - "id": "HuggingFaceTB/SmolLM2-360M-Instruct", - "developer": "HuggingFaceTB", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 0.36 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3842 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3144 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0151 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.255 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3461 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1117 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/HuggingFaceTB/SmolLM2-360M-Instruct/a41bd607-f319-4063-a6e4-813f43e40568.json b/data/hfopenllm_v2/HuggingFaceTB/SmolLM2-360M-Instruct/a41bd607-f319-4063-a6e4-813f43e40568.json deleted file mode 100644 index 96d89080d..000000000 --- a/data/hfopenllm_v2/HuggingFaceTB/SmolLM2-360M-Instruct/a41bd607-f319-4063-a6e4-813f43e40568.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/HuggingFaceTB_SmolLM2-360M-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SmolLM2-360M-Instruct", - "id": "HuggingFaceTB/SmolLM2-360M-Instruct", - "developer": "HuggingFaceTB", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 0.362 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.083 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3053 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0083 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2651 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3423 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1126 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/HuggingFaceTB/SmolLM2-360M/8629aef1-c673-4b17-a9cc-b361a53bdaa7.json b/data/hfopenllm_v2/HuggingFaceTB/SmolLM2-360M/8629aef1-c673-4b17-a9cc-b361a53bdaa7.json deleted file mode 100644 index 95ae8687c..000000000 --- a/data/hfopenllm_v2/HuggingFaceTB/SmolLM2-360M/8629aef1-c673-4b17-a9cc-b361a53bdaa7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/HuggingFaceTB_SmolLM2-360M/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging 
Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SmolLM2-360M", - "id": "HuggingFaceTB/SmolLM2-360M", - "developer": "HuggingFaceTB", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 0.36 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2115 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3233 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0121 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2458 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3954 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1169 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/HumanLLMs/Humanish-LLama3-8B-Instruct/532c927a-dc0c-4e65-8ab0-7b9ddd889d89.json b/data/hfopenllm_v2/HumanLLMs/Humanish-LLama3-8B-Instruct/532c927a-dc0c-4e65-8ab0-7b9ddd889d89.json deleted file mode 100644 index c3eae323f..000000000 --- a/data/hfopenllm_v2/HumanLLMs/Humanish-LLama3-8B-Instruct/532c927a-dc0c-4e65-8ab0-7b9ddd889d89.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/HumanLLMs_Humanish-LLama3-8B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Humanish-LLama3-8B-Instruct", - "id": "HumanLLMs/Humanish-LLama3-8B-Instruct", - "developer": "HumanLLMs", - "inference_platform": 
"unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6498 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4968 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1027 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2559 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3582 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3702 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/HumanLLMs/Humanish-Mistral-Nemo-Instruct-2407/843f9927-9865-4066-9cc0-f0522d3b914f.json b/data/hfopenllm_v2/HumanLLMs/Humanish-Mistral-Nemo-Instruct-2407/843f9927-9865-4066-9cc0-f0522d3b914f.json deleted file mode 100644 index b2ce02227..000000000 --- a/data/hfopenllm_v2/HumanLLMs/Humanish-Mistral-Nemo-Instruct-2407/843f9927-9865-4066-9cc0-f0522d3b914f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/HumanLLMs_Humanish-Mistral-Nemo-Instruct-2407/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Humanish-Mistral-Nemo-Instruct-2407", - "id": "HumanLLMs/Humanish-Mistral-Nemo-Instruct-2407", - "developer": "HumanLLMs", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - 
"evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5451 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5262 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1367 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2878 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3968 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3521 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/HumanLLMs/Humanish-Qwen2.5-7B-Instruct/eeecb2cb-e286-443f-84aa-d825702a4ad8.json b/data/hfopenllm_v2/HumanLLMs/Humanish-Qwen2.5-7B-Instruct/eeecb2cb-e286-443f-84aa-d825702a4ad8.json deleted file mode 100644 index 899bd0d00..000000000 --- a/data/hfopenllm_v2/HumanLLMs/Humanish-Qwen2.5-7B-Instruct/eeecb2cb-e286-443f-84aa-d825702a4ad8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/HumanLLMs_Humanish-Qwen2.5-7B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Humanish-Qwen2.5-7B-Instruct", - "id": "HumanLLMs/Humanish-Qwen2.5-7B-Instruct", - "developer": "HumanLLMs", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7284 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5364 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2987 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3981 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4398 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/IDEA-CCNL/Ziya-LLaMA-13B-v1/36ab4f5a-b2cf-4d01-8283-9eaf2c90928f.json b/data/hfopenllm_v2/IDEA-CCNL/Ziya-LLaMA-13B-v1/36ab4f5a-b2cf-4d01-8283-9eaf2c90928f.json deleted file mode 100644 index 27a4a8e2c..000000000 --- a/data/hfopenllm_v2/IDEA-CCNL/Ziya-LLaMA-13B-v1/36ab4f5a-b2cf-4d01-8283-9eaf2c90928f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/IDEA-CCNL_Ziya-LLaMA-13B-v1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Ziya-LLaMA-13B-v1", - "id": "IDEA-CCNL/Ziya-LLaMA-13B-v1", - "developer": "IDEA-CCNL", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 13.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1697 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - 
"dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2877 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2492 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3751 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1101 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/INSAIT-Institute/BgGPT-Gemma-2-27B-IT-v1.0/c4e810f1-ffb3-4ece-b445-64e339761530.json b/data/hfopenllm_v2/INSAIT-Institute/BgGPT-Gemma-2-27B-IT-v1.0/c4e810f1-ffb3-4ece-b445-64e339761530.json deleted file mode 100644 index 225b86c3c..000000000 --- a/data/hfopenllm_v2/INSAIT-Institute/BgGPT-Gemma-2-27B-IT-v1.0/c4e810f1-ffb3-4ece-b445-64e339761530.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/INSAIT-Institute_BgGPT-Gemma-2-27B-IT-v1.0/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "BgGPT-Gemma-2-27B-IT-v1.0", - "id": "INSAIT-Institute/BgGPT-Gemma-2-27B-IT-v1.0", - "developer": "INSAIT-Institute", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 27.227 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2912 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2601 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3575 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1167 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/IlyaGusev/gemma-2-2b-it-abliterated/025725b6-0034-48c0-a720-5fc210e5e24b.json b/data/hfopenllm_v2/IlyaGusev/gemma-2-2b-it-abliterated/025725b6-0034-48c0-a720-5fc210e5e24b.json deleted file mode 100644 index 1daa4500b..000000000 --- a/data/hfopenllm_v2/IlyaGusev/gemma-2-2b-it-abliterated/025725b6-0034-48c0-a720-5fc210e5e24b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/IlyaGusev_gemma-2-2b-it-abliterated/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "gemma-2-2b-it-abliterated", - "id": "IlyaGusev/gemma-2-2b-it-abliterated", - "developer": "IlyaGusev", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 2.614 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5331 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4119 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - 
"dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0612 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2651 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3782 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2538 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/IlyaGusev/gemma-2-9b-it-abliterated/7bdd8928-c336-494e-9c87-de9ecc2749b8.json b/data/hfopenllm_v2/IlyaGusev/gemma-2-9b-it-abliterated/7bdd8928-c336-494e-9c87-de9ecc2749b8.json deleted file mode 100644 index b601a539f..000000000 --- a/data/hfopenllm_v2/IlyaGusev/gemma-2-9b-it-abliterated/7bdd8928-c336-494e-9c87-de9ecc2749b8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/IlyaGusev_gemma-2-9b-it-abliterated/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "gemma-2-9b-it-abliterated", - "id": "IlyaGusev/gemma-2-9b-it-abliterated", - "developer": "IlyaGusev", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 9.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7473 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5906 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1775 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3456 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4034 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3915 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Infinirc/Infinirc-Llama3-8B-2G-Release-v1.0/ff7369dc-3ff2-424b-80b0-e06a141b54f3.json b/data/hfopenllm_v2/Infinirc/Infinirc-Llama3-8B-2G-Release-v1.0/ff7369dc-3ff2-424b-80b0-e06a141b54f3.json deleted file mode 100644 index dde7937ec..000000000 --- a/data/hfopenllm_v2/Infinirc/Infinirc-Llama3-8B-2G-Release-v1.0/ff7369dc-3ff2-424b-80b0-e06a141b54f3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Infinirc_Infinirc-Llama3-8B-2G-Release-v1.0/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Infinirc-Llama3-8B-2G-Release-v1.0", - "id": "Infinirc/Infinirc-Llama3-8B-2G-Release-v1.0", - "developer": "Infinirc", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2024 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4351 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0166 - } - }, - { - 
"evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2995 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4609 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.216 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Intel/neural-chat-7b-v3-1/a6dc7253-75fd-4897-be85-8ac89fc11f8e.json b/data/hfopenllm_v2/Intel/neural-chat-7b-v3-1/a6dc7253-75fd-4897-be85-8ac89fc11f8e.json deleted file mode 100644 index c13200f68..000000000 --- a/data/hfopenllm_v2/Intel/neural-chat-7b-v3-1/a6dc7253-75fd-4897-be85-8ac89fc11f8e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Intel_neural-chat-7b-v3-1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "neural-chat-7b-v3-1", - "id": "Intel/neural-chat-7b-v3-1", - "developer": "Intel", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4687 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5052 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0355 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2903 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4979 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2678 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Intel/neural-chat-7b-v3-2/296ceacc-542a-4000-bf9b-ae59b33a53ce.json b/data/hfopenllm_v2/Intel/neural-chat-7b-v3-2/296ceacc-542a-4000-bf9b-ae59b33a53ce.json deleted file mode 100644 index 050bc7fae..000000000 --- a/data/hfopenllm_v2/Intel/neural-chat-7b-v3-2/296ceacc-542a-4000-bf9b-ae59b33a53ce.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Intel_neural-chat-7b-v3-2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "neural-chat-7b-v3-2", - "id": "Intel/neural-chat-7b-v3-2", - "developer": "Intel", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 7.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4988 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5032 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0476 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2903 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4895 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2667 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Intel/neural-chat-7b-v3-3/13870577-7579-48b4-9c92-202318ca6ecc.json b/data/hfopenllm_v2/Intel/neural-chat-7b-v3-3/13870577-7579-48b4-9c92-202318ca6ecc.json deleted file mode 100644 index f546afc25..000000000 --- a/data/hfopenllm_v2/Intel/neural-chat-7b-v3-3/13870577-7579-48b4-9c92-202318ca6ecc.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Intel_neural-chat-7b-v3-3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "neural-chat-7b-v3-3", - "id": "Intel/neural-chat-7b-v3-3", - "developer": "Intel", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 7.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4763 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4877 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0408 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2894 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.486 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": 
"MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2625 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Intel/neural-chat-7b-v3/6ebd2806-2623-4773-93bd-1036ff01cb8c.json b/data/hfopenllm_v2/Intel/neural-chat-7b-v3/6ebd2806-2623-4773-93bd-1036ff01cb8c.json deleted file mode 100644 index 937a77afe..000000000 --- a/data/hfopenllm_v2/Intel/neural-chat-7b-v3/6ebd2806-2623-4773-93bd-1036ff01cb8c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Intel_neural-chat-7b-v3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "neural-chat-7b-v3", - "id": "Intel/neural-chat-7b-v3", - "developer": "Intel", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 7.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2778 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5048 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0295 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2919 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5055 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.2699 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/IntervitensInc/internlm2_5-20b-llamafied/99d6a44b-d556-4674-8ade-a5b30cf99255.json b/data/hfopenllm_v2/IntervitensInc/internlm2_5-20b-llamafied/99d6a44b-d556-4674-8ade-a5b30cf99255.json deleted file mode 100644 index 9f01570fc..000000000 --- a/data/hfopenllm_v2/IntervitensInc/internlm2_5-20b-llamafied/99d6a44b-d556-4674-8ade-a5b30cf99255.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/IntervitensInc_internlm2_5-20b-llamafied/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "internlm2_5-20b-llamafied", - "id": "IntervitensInc/internlm2_5-20b-llamafied", - "developer": "IntervitensInc", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 19.861 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.341 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7478 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1715 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3381 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4475 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4051 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Invalid-Null/PeiYangMe-0.5/605118a3-316a-46b5-9719-f596e361a2a8.json 
b/data/hfopenllm_v2/Invalid-Null/PeiYangMe-0.5/605118a3-316a-46b5-9719-f596e361a2a8.json deleted file mode 100644 index cb284ea11..000000000 --- a/data/hfopenllm_v2/Invalid-Null/PeiYangMe-0.5/605118a3-316a-46b5-9719-f596e361a2a8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Invalid-Null_PeiYangMe-0.5/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "PeiYangMe-0.5", - "id": "Invalid-Null/PeiYangMe-0.5", - "developer": "Invalid-Null", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 6.061 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1409 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2791 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2441 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3738 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1109 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Invalid-Null/PeiYangMe-0.7/271d2829-fbd4-438e-9f09-59539af68c8b.json b/data/hfopenllm_v2/Invalid-Null/PeiYangMe-0.7/271d2829-fbd4-438e-9f09-59539af68c8b.json deleted file mode 100644 index b5d375fc5..000000000 --- a/data/hfopenllm_v2/Invalid-Null/PeiYangMe-0.7/271d2829-fbd4-438e-9f09-59539af68c8b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ 
- "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Invalid-Null_PeiYangMe-0.7/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "PeiYangMe-0.7", - "id": "Invalid-Null/PeiYangMe-0.7", - "developer": "Invalid-Null", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 6.061 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1491 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3028 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0113 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2332 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3857 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1101 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Isaak-Carter/JOSIEv4o-8b-stage1-v4/107bc549-75c1-4272-b567-f8ab9f6cd675.json b/data/hfopenllm_v2/Isaak-Carter/JOSIEv4o-8b-stage1-v4/107bc549-75c1-4272-b567-f8ab9f6cd675.json deleted file mode 100644 index ffdfacf77..000000000 --- a/data/hfopenllm_v2/Isaak-Carter/JOSIEv4o-8b-stage1-v4/107bc549-75c1-4272-b567-f8ab9f6cd675.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Isaak-Carter_JOSIEv4o-8b-stage1-v4/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": 
"documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "JOSIEv4o-8b-stage1-v4", - "id": "Isaak-Carter/JOSIEv4o-8b-stage1-v4", - "developer": "Isaak-Carter", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2477 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4758 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0453 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2911 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3641 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3292 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Isaak-Carter/JOSIEv4o-8b-stage1-v4/dfb451e9-c1c1-45a1-8082-155763366129.json b/data/hfopenllm_v2/Isaak-Carter/JOSIEv4o-8b-stage1-v4/dfb451e9-c1c1-45a1-8082-155763366129.json deleted file mode 100644 index 9fc986679..000000000 --- a/data/hfopenllm_v2/Isaak-Carter/JOSIEv4o-8b-stage1-v4/dfb451e9-c1c1-45a1-8082-155763366129.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Isaak-Carter_JOSIEv4o-8b-stage1-v4/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "JOSIEv4o-8b-stage1-v4", - "id": "Isaak-Carter/JOSIEv4o-8b-stage1-v4", - 
"developer": "Isaak-Carter", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2553 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4725 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0529 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2919 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3654 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3316 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Isaak-Carter/Josiefied-Qwen2.5-7B-Instruct-abliterated-v2/b2d80977-d079-42ec-b057-5aac530b9d70.json b/data/hfopenllm_v2/Isaak-Carter/Josiefied-Qwen2.5-7B-Instruct-abliterated-v2/b2d80977-d079-42ec-b057-5aac530b9d70.json deleted file mode 100644 index 839f443ca..000000000 --- a/data/hfopenllm_v2/Isaak-Carter/Josiefied-Qwen2.5-7B-Instruct-abliterated-v2/b2d80977-d079-42ec-b057-5aac530b9d70.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Isaak-Carter_Josiefied-Qwen2.5-7B-Instruct-abliterated-v2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Josiefied-Qwen2.5-7B-Instruct-abliterated-v2", - "id": "Isaak-Carter/Josiefied-Qwen2.5-7B-Instruct-abliterated-v2", - "developer": "Isaak-Carter", - "inference_platform": "unknown", - 
"additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7841 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5311 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4721 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2987 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4354 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4128 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Isaak-Carter/Josiefied-Qwen2.5-7B-Instruct-abliterated/16b33b80-3b4b-4edb-b89f-3d93dca8969c.json b/data/hfopenllm_v2/Isaak-Carter/Josiefied-Qwen2.5-7B-Instruct-abliterated/16b33b80-3b4b-4edb-b89f-3d93dca8969c.json deleted file mode 100644 index 7fb690341..000000000 --- a/data/hfopenllm_v2/Isaak-Carter/Josiefied-Qwen2.5-7B-Instruct-abliterated/16b33b80-3b4b-4edb-b89f-3d93dca8969c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Isaak-Carter_Josiefied-Qwen2.5-7B-Instruct-abliterated/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Josiefied-Qwen2.5-7B-Instruct-abliterated", - "id": "Isaak-Carter/Josiefied-Qwen2.5-7B-Instruct-abliterated", - "developer": "Isaak-Carter", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - 
"params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7317 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5396 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4924 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3029 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4087 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4276 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/J-LAB/Thynk_orpo/63c94e0a-4572-4b8a-bfe0-7f88bb847d7f.json b/data/hfopenllm_v2/J-LAB/Thynk_orpo/63c94e0a-4572-4b8a-bfe0-7f88bb847d7f.json deleted file mode 100644 index 9f7d195c4..000000000 --- a/data/hfopenllm_v2/J-LAB/Thynk_orpo/63c94e0a-4572-4b8a-bfe0-7f88bb847d7f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/J-LAB_Thynk_orpo/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Thynk_orpo", - "id": "J-LAB/Thynk_orpo", - "developer": "J-LAB", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.086 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2102 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4463 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.148 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2928 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4515 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3231 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JackFram/llama-160m/538f2b43-328c-456d-8a40-ff2b37924453.json b/data/hfopenllm_v2/JackFram/llama-160m/538f2b43-328c-456d-8a40-ff2b37924453.json deleted file mode 100644 index 94fe29570..000000000 --- a/data/hfopenllm_v2/JackFram/llama-160m/538f2b43-328c-456d-8a40-ff2b37924453.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JackFram_llama-160m/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "llama-160m", - "id": "JackFram/llama-160m", - "developer": "JackFram", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 0.162 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1791 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy 
on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2888 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0083 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2617 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3792 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1128 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JackFram/llama-68m/fb7a68e6-716e-48c6-96c0-d227735f9a7c.json b/data/hfopenllm_v2/JackFram/llama-68m/fb7a68e6-716e-48c6-96c0-d227735f9a7c.json deleted file mode 100644 index 620583267..000000000 --- a/data/hfopenllm_v2/JackFram/llama-68m/fb7a68e6-716e-48c6-96c0-d227735f9a7c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JackFram_llama-68m/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "llama-68m", - "id": "JackFram/llama-68m", - "developer": "JackFram", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 0.068 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1726 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2936 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": 
"DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.006 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2584 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.391 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1144 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Jacoby746/Casual-Magnum-34B/3593d4b8-5602-4cca-935f-a76e342f060a.json b/data/hfopenllm_v2/Jacoby746/Casual-Magnum-34B/3593d4b8-5602-4cca-935f-a76e342f060a.json deleted file mode 100644 index b0ceabd5f..000000000 --- a/data/hfopenllm_v2/Jacoby746/Casual-Magnum-34B/3593d4b8-5602-4cca-935f-a76e342f060a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Jacoby746_Casual-Magnum-34B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Casual-Magnum-34B", - "id": "Jacoby746/Casual-Magnum-34B", - "developer": "Jacoby746", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 34.389 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.193 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6032 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0921 
- } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3725 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4078 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5184 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Jacoby746/Inf-Silent-Kunoichi-v0.1-2x7B/72d503fc-b221-498e-811a-a806769175d6.json b/data/hfopenllm_v2/Jacoby746/Inf-Silent-Kunoichi-v0.1-2x7B/72d503fc-b221-498e-811a-a806769175d6.json deleted file mode 100644 index eeb08788f..000000000 --- a/data/hfopenllm_v2/Jacoby746/Inf-Silent-Kunoichi-v0.1-2x7B/72d503fc-b221-498e-811a-a806769175d6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Jacoby746_Inf-Silent-Kunoichi-v0.1-2x7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Inf-Silent-Kunoichi-v0.1-2x7B", - "id": "Jacoby746/Inf-Silent-Kunoichi-v0.1-2x7B", - "developer": "Jacoby746", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MixtralForCausalLM", - "params_billions": 12.879 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.388 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5185 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.071 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2894 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.428 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3271 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Jacoby746/Inf-Silent-Kunoichi-v0.2-2x7B/ad7d9698-d9e6-4f2d-9767-987835626c8c.json b/data/hfopenllm_v2/Jacoby746/Inf-Silent-Kunoichi-v0.2-2x7B/ad7d9698-d9e6-4f2d-9767-987835626c8c.json deleted file mode 100644 index 1870540b9..000000000 --- a/data/hfopenllm_v2/Jacoby746/Inf-Silent-Kunoichi-v0.2-2x7B/ad7d9698-d9e6-4f2d-9767-987835626c8c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Jacoby746_Inf-Silent-Kunoichi-v0.2-2x7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Inf-Silent-Kunoichi-v0.2-2x7B", - "id": "Jacoby746/Inf-Silent-Kunoichi-v0.2-2x7B", - "developer": "Jacoby746", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MixtralForCausalLM", - "params_billions": 12.879 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3636 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5209 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0627 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3003 
- } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.432 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3272 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Jacoby746/Proto-Athena-4x7B/98899942-fcf0-41de-8587-44d7429bea47.json b/data/hfopenllm_v2/Jacoby746/Proto-Athena-4x7B/98899942-fcf0-41de-8587-44d7429bea47.json deleted file mode 100644 index 290b886ee..000000000 --- a/data/hfopenllm_v2/Jacoby746/Proto-Athena-4x7B/98899942-fcf0-41de-8587-44d7429bea47.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Jacoby746_Proto-Athena-4x7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Proto-Athena-4x7B", - "id": "Jacoby746/Proto-Athena-4x7B", - "developer": "Jacoby746", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MixtralForCausalLM", - "params_billions": 24.154 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3703 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5107 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.065 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2945 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4348 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3206 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Jacoby746/Proto-Athena-v0.2-4x7B/bb51eb59-88f6-49c2-814a-11b2c80313d0.json b/data/hfopenllm_v2/Jacoby746/Proto-Athena-v0.2-4x7B/bb51eb59-88f6-49c2-814a-11b2c80313d0.json deleted file mode 100644 index 93635570d..000000000 --- a/data/hfopenllm_v2/Jacoby746/Proto-Athena-v0.2-4x7B/bb51eb59-88f6-49c2-814a-11b2c80313d0.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Jacoby746_Proto-Athena-v0.2-4x7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Proto-Athena-v0.2-4x7B", - "id": "Jacoby746/Proto-Athena-v0.2-4x7B", - "developer": "Jacoby746", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MixtralForCausalLM", - "params_billions": 24.154 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3752 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5068 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0634 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2987 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4213 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": 
"hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3197 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Jacoby746/Proto-Harpy-Blazing-Light-v0.1-2x7B/d8563f36-e299-4186-a5dc-9dae51824e1f.json b/data/hfopenllm_v2/Jacoby746/Proto-Harpy-Blazing-Light-v0.1-2x7B/d8563f36-e299-4186-a5dc-9dae51824e1f.json deleted file mode 100644 index 38b06bee1..000000000 --- a/data/hfopenllm_v2/Jacoby746/Proto-Harpy-Blazing-Light-v0.1-2x7B/d8563f36-e299-4186-a5dc-9dae51824e1f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Jacoby746_Proto-Harpy-Blazing-Light-v0.1-2x7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Proto-Harpy-Blazing-Light-v0.1-2x7B", - "id": "Jacoby746/Proto-Harpy-Blazing-Light-v0.1-2x7B", - "developer": "Jacoby746", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MixtralForCausalLM", - "params_billions": 12.879 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4905 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5187 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0748 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2953 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.445 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": 
false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3301 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Jacoby746/Proto-Harpy-Spark-v0.1-7B/43bc0528-7bc5-4eac-8848-c9995079450f.json b/data/hfopenllm_v2/Jacoby746/Proto-Harpy-Spark-v0.1-7B/43bc0528-7bc5-4eac-8848-c9995079450f.json deleted file mode 100644 index 220166198..000000000 --- a/data/hfopenllm_v2/Jacoby746/Proto-Harpy-Spark-v0.1-7B/43bc0528-7bc5-4eac-8848-c9995079450f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Jacoby746_Proto-Harpy-Spark-v0.1-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Proto-Harpy-Spark-v0.1-7B", - "id": "Jacoby746/Proto-Harpy-Spark-v0.1-7B", - "developer": "Jacoby746", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4333 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4736 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0619 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3054 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4317 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3069 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/JayHyeon/Qwen-0.5B-DPO-1epoch/ce19893b-a7e1-4f8e-96f2-eb9cee2afeac.json b/data/hfopenllm_v2/JayHyeon/Qwen-0.5B-DPO-1epoch/ce19893b-a7e1-4f8e-96f2-eb9cee2afeac.json deleted file mode 100644 index b932687e9..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen-0.5B-DPO-1epoch/ce19893b-a7e1-4f8e-96f2-eb9cee2afeac.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen-0.5B-DPO-1epoch/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen-0.5B-DPO-1epoch", - "id": "JayHyeon/Qwen-0.5B-DPO-1epoch", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2Model", - "params_billions": 0.494 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2647 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3191 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0287 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2525 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3352 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1558 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen-0.5B-DPO-5epoch/24629e14-d197-4a5b-adff-7840af652f22.json b/data/hfopenllm_v2/JayHyeon/Qwen-0.5B-DPO-5epoch/24629e14-d197-4a5b-adff-7840af652f22.json deleted file mode 100644 index ab62482c6..000000000 --- 
a/data/hfopenllm_v2/JayHyeon/Qwen-0.5B-DPO-5epoch/24629e14-d197-4a5b-adff-7840af652f22.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen-0.5B-DPO-5epoch/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen-0.5B-DPO-5epoch", - "id": "JayHyeon/Qwen-0.5B-DPO-5epoch", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2Model", - "params_billions": 0.494 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.257 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3112 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.04 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2433 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.338 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1533 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen-0.5B-IRPO-1epoch/9c3ea35c-2cf7-4c31-8b83-c69df3cd9448.json b/data/hfopenllm_v2/JayHyeon/Qwen-0.5B-IRPO-1epoch/9c3ea35c-2cf7-4c31-8b83-c69df3cd9448.json deleted file mode 100644 index bd901d9fc..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen-0.5B-IRPO-1epoch/9c3ea35c-2cf7-4c31-8b83-c69df3cd9448.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen-0.5B-IRPO-1epoch/1770682486.623709", - 
"retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen-0.5B-IRPO-1epoch", - "id": "JayHyeon/Qwen-0.5B-IRPO-1epoch", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2Model", - "params_billions": 0.494 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2589 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3164 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0317 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2466 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3286 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.15 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen-0.5B-IRPO-5epoch/46548403-6eb5-4f7a-874c-1327420f4cab.json b/data/hfopenllm_v2/JayHyeon/Qwen-0.5B-IRPO-5epoch/46548403-6eb5-4f7a-874c-1327420f4cab.json deleted file mode 100644 index 296d7cd98..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen-0.5B-IRPO-5epoch/46548403-6eb5-4f7a-874c-1327420f4cab.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen-0.5B-IRPO-5epoch/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - 
"name": "Qwen-0.5B-IRPO-5epoch", - "id": "JayHyeon/Qwen-0.5B-IRPO-5epoch", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2Model", - "params_billions": 0.494 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2487 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3189 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0325 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2399 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3287 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1507 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen-0.5B-eDPO-1epoch/0bd9c061-b7ee-4bc2-9deb-ea7eea012c49.json b/data/hfopenllm_v2/JayHyeon/Qwen-0.5B-eDPO-1epoch/0bd9c061-b7ee-4bc2-9deb-ea7eea012c49.json deleted file mode 100644 index e9415599f..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen-0.5B-eDPO-1epoch/0bd9c061-b7ee-4bc2-9deb-ea7eea012c49.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen-0.5B-eDPO-1epoch/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen-0.5B-eDPO-1epoch", - "id": "JayHyeon/Qwen-0.5B-eDPO-1epoch", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2Model", - "params_billions": 
0.494 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2623 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3181 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0347 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2424 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3327 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1553 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen-0.5B-eDPO-5epoch/aa2fe858-111c-45e8-b0d4-0048d7fc7ef7.json b/data/hfopenllm_v2/JayHyeon/Qwen-0.5B-eDPO-5epoch/aa2fe858-111c-45e8-b0d4-0048d7fc7ef7.json deleted file mode 100644 index c6919cc60..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen-0.5B-eDPO-5epoch/aa2fe858-111c-45e8-b0d4-0048d7fc7ef7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen-0.5B-eDPO-5epoch/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen-0.5B-eDPO-5epoch", - "id": "JayHyeon/Qwen-0.5B-eDPO-5epoch", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2Model", - "params_billions": 0.494 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy 
on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2477 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3096 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0234 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2492 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3326 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1523 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-Instruct-SFT-DPO-1epoch_v1/ad03cae6-b126-4157-a225-9576e4d651d0.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-Instruct-SFT-DPO-1epoch_v1/ad03cae6-b126-4157-a225-9576e4d651d0.json deleted file mode 100644 index 4ad70853d..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-Instruct-SFT-DPO-1epoch_v1/ad03cae6-b126-4157-a225-9576e4d651d0.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-Instruct-SFT-DPO-1epoch_v1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-0.5B-Instruct-SFT-DPO-1epoch_v1", - "id": "JayHyeon/Qwen2.5-0.5B-Instruct-SFT-DPO-1epoch_v1", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2Model", - "params_billions": 0.494 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.2469 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.326 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.065 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2727 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3434 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1575 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-Instruct-SFT-IRPO-1epoch_v1/0d57b65d-3dd4-4185-b8cf-531105e94b5e.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-Instruct-SFT-IRPO-1epoch_v1/0d57b65d-3dd4-4185-b8cf-531105e94b5e.json deleted file mode 100644 index b71ae80d9..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-Instruct-SFT-IRPO-1epoch_v1/0d57b65d-3dd4-4185-b8cf-531105e94b5e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-Instruct-SFT-IRPO-1epoch_v1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-0.5B-Instruct-SFT-IRPO-1epoch_v1", - "id": "JayHyeon/Qwen2.5-0.5B-Instruct-SFT-IRPO-1epoch_v1", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2Model", - "params_billions": 0.494 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2606 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - 
"hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3308 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0498 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2802 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3288 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1626 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-Instruct-SFT-MDPO-1epoch_v1/f8882044-6e71-4788-b2ee-f51f85e67ecc.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-Instruct-SFT-MDPO-1epoch_v1/f8882044-6e71-4788-b2ee-f51f85e67ecc.json deleted file mode 100644 index f890e44af..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-Instruct-SFT-MDPO-1epoch_v1/f8882044-6e71-4788-b2ee-f51f85e67ecc.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-Instruct-SFT-MDPO-1epoch_v1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-0.5B-Instruct-SFT-MDPO-1epoch_v1", - "id": "JayHyeon/Qwen2.5-0.5B-Instruct-SFT-MDPO-1epoch_v1", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2Model", - "params_billions": 0.494 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2529 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3262 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0566 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2685 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3301 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1576 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-Instruct-SFT/3c8f96c5-af91-4f41-a0b4-6e1b7d55d8ad.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-Instruct-SFT/3c8f96c5-af91-4f41-a0b4-6e1b7d55d8ad.json deleted file mode 100644 index 4ed5a1209..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-Instruct-SFT/3c8f96c5-af91-4f41-a0b4-6e1b7d55d8ad.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-Instruct-SFT/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-0.5B-Instruct-SFT", - "id": "JayHyeon/Qwen2.5-0.5B-Instruct-SFT", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2768 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3254 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - 
"source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0393 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2827 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3342 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.152 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-1e-4-2ep/e26743b9-4caf-46f8-bd5a-7e4445c850b1.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-1e-4-2ep/e26743b9-4caf-46f8-bd5a-7e4445c850b1.json deleted file mode 100644 index a9d17a4e6..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-1e-4-2ep/e26743b9-4caf-46f8-bd5a-7e4445c850b1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT-1e-4-2ep/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-0.5B-SFT-1e-4-2ep", - "id": "JayHyeon/Qwen2.5-0.5B-SFT-1e-4-2ep", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.214 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3172 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0264 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2466 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3473 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1537 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-1e-4-3ep/febd4016-3a30-4b26-93e5-f7b556781b9b.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-1e-4-3ep/febd4016-3a30-4b26-93e5-f7b556781b9b.json deleted file mode 100644 index 24b629e3d..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-1e-4-3ep/febd4016-3a30-4b26-93e5-f7b556781b9b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT-1e-4-3ep/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-0.5B-SFT-1e-4-3ep", - "id": "JayHyeon/Qwen2.5-0.5B-SFT-1e-4-3ep", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2257 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3064 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0264 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - 
"hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2483 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3661 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1532 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-1e-4-5ep/ae82125e-94ac-48ca-8240-807e4b7ef9a0.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-1e-4-5ep/ae82125e-94ac-48ca-8240-807e4b7ef9a0.json deleted file mode 100644 index 3a2866ea3..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-1e-4-5ep/ae82125e-94ac-48ca-8240-807e4b7ef9a0.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT-1e-4-5ep/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-0.5B-SFT-1e-4-5ep", - "id": "JayHyeon/Qwen2.5-0.5B-SFT-1e-4-5ep", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1987 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3104 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0196 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.2534 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3407 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1558 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-1e-4/5321fa0b-b010-4e1d-9f20-a97b56f4f937.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-1e-4/5321fa0b-b010-4e1d-9f20-a97b56f4f937.json deleted file mode 100644 index 844a39283..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-1e-4/5321fa0b-b010-4e1d-9f20-a97b56f4f937.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT-1e-4/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-0.5B-SFT-1e-4", - "id": "JayHyeon/Qwen2.5-0.5B-SFT-1e-4", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.202 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3017 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0189 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2508 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": 
"Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3446 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1619 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-1e-5-2ep/d25a4602-ea50-4a53-952c-112ba250123b.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-1e-5-2ep/d25a4602-ea50-4a53-952c-112ba250123b.json deleted file mode 100644 index 4799c0aa3..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-1e-5-2ep/d25a4602-ea50-4a53-952c-112ba250123b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT-1e-5-2ep/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-0.5B-SFT-1e-5-2ep", - "id": "JayHyeon/Qwen2.5-0.5B-SFT-1e-5-2ep", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1971 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3225 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0529 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2693 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3368 - } - }, - { - "evaluation_name": "MMLU-PRO", - 
"source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1651 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-1e-5-3ep/232e3fc4-5cd2-4515-9e15-acd7d56bc34d.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-1e-5-3ep/232e3fc4-5cd2-4515-9e15-acd7d56bc34d.json deleted file mode 100644 index 3305c120d..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-1e-5-3ep/232e3fc4-5cd2-4515-9e15-acd7d56bc34d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT-1e-5-3ep/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-0.5B-SFT-1e-5-3ep", - "id": "JayHyeon/Qwen2.5-0.5B-SFT-1e-5-3ep", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2241 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3247 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0536 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2701 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3353 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, 
- "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1689 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-1e-5-5ep/975f54fe-a581-4ce1-b0c1-7becb7605f09.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-1e-5-5ep/975f54fe-a581-4ce1-b0c1-7becb7605f09.json deleted file mode 100644 index 925baa726..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-1e-5-5ep/975f54fe-a581-4ce1-b0c1-7becb7605f09.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT-1e-5-5ep/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-0.5B-SFT-1e-5-5ep", - "id": "JayHyeon/Qwen2.5-0.5B-SFT-1e-5-5ep", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2292 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3259 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0521 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2794 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3235 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1688 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-1e-5/92ae4461-48bc-47fe-a3ad-ea4c3452d395.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-1e-5/92ae4461-48bc-47fe-a3ad-ea4c3452d395.json deleted file mode 100644 index 36e1434b4..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-1e-5/92ae4461-48bc-47fe-a3ad-ea4c3452d395.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT-1e-5/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-0.5B-SFT-1e-5", - "id": "JayHyeon/Qwen2.5-0.5B-SFT-1e-5", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1986 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.314 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0378 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2685 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.346 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1698 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-4-2ep/638e1cc0-9baf-4555-a278-4b21c46af86f.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-4-2ep/638e1cc0-9baf-4555-a278-4b21c46af86f.json deleted file mode 100644 index 
b6a076686..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-4-2ep/638e1cc0-9baf-4555-a278-4b21c46af86f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT-2e-4-2ep/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-0.5B-SFT-2e-4-2ep", - "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-4-2ep", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1831 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2984 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0249 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2424 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3568 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1484 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-4-3ep/cef4161a-4e1c-4a92-bca8-b07f957a13b1.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-4-3ep/cef4161a-4e1c-4a92-bca8-b07f957a13b1.json deleted file mode 100644 index ae6218e48..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-4-3ep/cef4161a-4e1c-4a92-bca8-b07f957a13b1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT-2e-4-3ep/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-0.5B-SFT-2e-4-3ep", - "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-4-3ep", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.199 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.311 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0151 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2609 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3449 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1416 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-4-5ep/715b556b-2bc0-4864-b4b1-b7413a5d45bc.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-4-5ep/715b556b-2bc0-4864-b4b1-b7413a5d45bc.json deleted file mode 100644 index e5bad0533..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-4-5ep/715b556b-2bc0-4864-b4b1-b7413a5d45bc.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT-2e-4-5ep/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - 
"source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-0.5B-SFT-2e-4-5ep", - "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-4-5ep", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1897 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2936 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0181 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2693 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3874 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1336 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-4/7552ad5c-5d1f-478b-a931-036083b2954e.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-4/7552ad5c-5d1f-478b-a931-036083b2954e.json deleted file mode 100644 index 43c9ef63c..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-4/7552ad5c-5d1f-478b-a931-036083b2954e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT-2e-4/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-0.5B-SFT-2e-4", - "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-4", - "developer": "JayHyeon", - 
"inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2034 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2936 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0242 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2576 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3434 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1413 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPOP_3e-7-3ep_0alp_5lam/7bb3ae9f-9bb3-4bf2-9d97-d7f4f30697ac.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPOP_3e-7-3ep_0alp_5lam/7bb3ae9f-9bb3-4bf2-9d97-d7f4f30697ac.json deleted file mode 100644 index a44e27ca9..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPOP_3e-7-3ep_0alp_5lam/7bb3ae9f-9bb3-4bf2-9d97-d7f4f30697ac.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-DPOP_3e-7-3ep_0alp_5lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-DPOP_3e-7-3ep_0alp_5lam", - "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPOP_3e-7-3ep_0alp_5lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": 
"bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2411 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3167 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0347 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.271 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3301 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1562 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPOP_5e-6-1ep_0alp_5lam/821d67e5-da8d-4383-8825-3bfa72a91fc9.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPOP_5e-6-1ep_0alp_5lam/821d67e5-da8d-4383-8825-3bfa72a91fc9.json deleted file mode 100644 index def874bd2..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPOP_5e-6-1ep_0alp_5lam/821d67e5-da8d-4383-8825-3bfa72a91fc9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-DPOP_5e-6-1ep_0alp_5lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-DPOP_5e-6-1ep_0alp_5lam", - "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPOP_5e-6-1ep_0alp_5lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2Model", - "params_billions": 0.494 - } 
- }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2369 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.326 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0453 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.276 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3355 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.157 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPOP_5e-6-2ep_0alp_5lam/c5bddcba-4a40-4fbb-93e8-aebd06a70a66.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPOP_5e-6-2ep_0alp_5lam/c5bddcba-4a40-4fbb-93e8-aebd06a70a66.json deleted file mode 100644 index 48999ddc5..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPOP_5e-6-2ep_0alp_5lam/c5bddcba-4a40-4fbb-93e8-aebd06a70a66.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-DPOP_5e-6-2ep_0alp_5lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-DPOP_5e-6-2ep_0alp_5lam", - "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPOP_5e-6-2ep_0alp_5lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2Model", - "params_billions": 0.494 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": 
{ - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2262 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3262 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0347 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2794 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3408 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1541 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPOP_5e-6-3ep_0alp_5lam/dc35237c-606d-4609-927a-566bea767312.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPOP_5e-6-3ep_0alp_5lam/dc35237c-606d-4609-927a-566bea767312.json deleted file mode 100644 index 4dd0076c1..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPOP_5e-6-3ep_0alp_5lam/dc35237c-606d-4609-927a-566bea767312.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-DPOP_5e-6-3ep_0alp_5lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-DPOP_5e-6-3ep_0alp_5lam", - "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPOP_5e-6-3ep_0alp_5lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": 
"google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2508 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3199 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0408 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.276 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3355 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1555 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPOP_5e-7-1ep_0alp_5lam/3924d1af-e167-4186-a34b-d9b4b8c26d59.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPOP_5e-7-1ep_0alp_5lam/3924d1af-e167-4186-a34b-d9b4b8c26d59.json deleted file mode 100644 index a8a95addb..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPOP_5e-7-1ep_0alp_5lam/3924d1af-e167-4186-a34b-d9b4b8c26d59.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-DPOP_5e-7-1ep_0alp_5lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-DPOP_5e-7-1ep_0alp_5lam", - "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPOP_5e-7-1ep_0alp_5lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2Model", - "params_billions": 0.494 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy 
on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.239 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3182 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.04 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2676 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3328 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.156 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPOP_5e-7-2ep_0alp_5lam/f733c4cc-90fc-4b31-bed3-c57dba6d4b6a.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPOP_5e-7-2ep_0alp_5lam/f733c4cc-90fc-4b31-bed3-c57dba6d4b6a.json deleted file mode 100644 index 8e804555f..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPOP_5e-7-2ep_0alp_5lam/f733c4cc-90fc-4b31-bed3-c57dba6d4b6a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-DPOP_5e-7-2ep_0alp_5lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-DPOP_5e-7-2ep_0alp_5lam", - "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPOP_5e-7-2ep_0alp_5lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2Model", - "params_billions": 0.494 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2423 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3154 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0347 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2676 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3328 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1548 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPOP_5e-7-3ep_0alp_5lam/08f933a0-b096-4271-890e-0df7e20d1d20.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPOP_5e-7-3ep_0alp_5lam/08f933a0-b096-4271-890e-0df7e20d1d20.json deleted file mode 100644 index 254d8727c..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPOP_5e-7-3ep_0alp_5lam/08f933a0-b096-4271-890e-0df7e20d1d20.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-DPOP_5e-7-3ep_0alp_5lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-DPOP_5e-7-3ep_0alp_5lam", - "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPOP_5e-7-3ep_0alp_5lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 
0.2493 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.319 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0438 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2651 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3341 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1561 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_1e-6-1ep_0alp_0lam/8434e448-ed77-45f2-9c31-39128912f842.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_1e-6-1ep_0alp_0lam/8434e448-ed77-45f2-9c31-39128912f842.json deleted file mode 100644 index 2af5272ab..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_1e-6-1ep_0alp_0lam/8434e448-ed77-45f2-9c31-39128912f842.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_1e-6-1ep_0alp_0lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_1e-6-1ep_0alp_0lam", - "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_1e-6-1ep_0alp_0lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2Model", - "params_billions": 0.494 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2542 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": 
"BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3167 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0408 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2718 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3289 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.158 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_1e-6-2ep_0alp_0lam/d801037b-1eb0-4058-9096-429e5237e015.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_1e-6-2ep_0alp_0lam/d801037b-1eb0-4058-9096-429e5237e015.json deleted file mode 100644 index b5090ee5f..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_1e-6-2ep_0alp_0lam/d801037b-1eb0-4058-9096-429e5237e015.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_1e-6-2ep_0alp_0lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_1e-6-2ep_0alp_0lam", - "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_1e-6-2ep_0alp_0lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2Model", - "params_billions": 0.494 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2451 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.316 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0408 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2743 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3302 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1561 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_1e-6-3ep_0alp_0lam/e0c46f18-598e-402f-8955-68e71fab67cd.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_1e-6-3ep_0alp_0lam/e0c46f18-598e-402f-8955-68e71fab67cd.json deleted file mode 100644 index eb8631855..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_1e-6-3ep_0alp_0lam/e0c46f18-598e-402f-8955-68e71fab67cd.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_1e-6-3ep_0alp_0lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_1e-6-3ep_0alp_0lam", - "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_1e-6-3ep_0alp_0lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2557 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3142 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.04 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2743 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3315 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1575 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_1e-7-2ep_0alp_0lam/4b987cb5-cf7c-4866-8cf0-9926f78c2de9.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_1e-7-2ep_0alp_0lam/4b987cb5-cf7c-4866-8cf0-9926f78c2de9.json deleted file mode 100644 index a7ae82558..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_1e-7-2ep_0alp_0lam/4b987cb5-cf7c-4866-8cf0-9926f78c2de9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_1e-7-2ep_0alp_0lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_1e-7-2ep_0alp_0lam", - "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_1e-7-2ep_0alp_0lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2Model", - "params_billions": 0.494 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2605 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.3167 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0363 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2701 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3341 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1577 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_1e-7-3ep_0alp_0lam/ec658058-1075-4918-9dc9-fc79d0dcf897.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_1e-7-3ep_0alp_0lam/ec658058-1075-4918-9dc9-fc79d0dcf897.json deleted file mode 100644 index 090ea8c8f..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_1e-7-3ep_0alp_0lam/ec658058-1075-4918-9dc9-fc79d0dcf897.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_1e-7-3ep_0alp_0lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_1e-7-3ep_0alp_0lam", - "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_1e-7-3ep_0alp_0lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2578 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3173 - } - }, - { - 
"evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0355 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2634 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3288 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1583 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_2e-6-1ep_0alp_0lam/b68baa86-3e1a-4888-98ba-2ecede79b4a7.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_2e-6-1ep_0alp_0lam/b68baa86-3e1a-4888-98ba-2ecede79b4a7.json deleted file mode 100644 index 0b37ef072..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_2e-6-1ep_0alp_0lam/b68baa86-3e1a-4888-98ba-2ecede79b4a7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_2e-6-1ep_0alp_0lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_2e-6-1ep_0alp_0lam", - "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_2e-6-1ep_0alp_0lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2Model", - "params_billions": 0.494 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2335 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3198 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 
5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0385 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2752 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3276 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1581 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_2e-6-2ep_0alp_0lam/0b11c8ab-2cfa-425d-9d81-d999f94401db.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_2e-6-2ep_0alp_0lam/0b11c8ab-2cfa-425d-9d81-d999f94401db.json deleted file mode 100644 index 8cc0efa5c..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_2e-6-2ep_0alp_0lam/0b11c8ab-2cfa-425d-9d81-d999f94401db.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_2e-6-2ep_0alp_0lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_2e-6-2ep_0alp_0lam", - "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_2e-6-2ep_0alp_0lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2Model", - "params_billions": 0.494 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2472 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3226 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" 
- }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0506 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.276 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3262 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1538 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_2e-6-3ep_0alp_0lam/a3e48db8-3679-4f19-853d-82a73ef49400.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_2e-6-3ep_0alp_0lam/a3e48db8-3679-4f19-853d-82a73ef49400.json deleted file mode 100644 index 9df394a48..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_2e-6-3ep_0alp_0lam/a3e48db8-3679-4f19-853d-82a73ef49400.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_2e-6-3ep_0alp_0lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_2e-6-3ep_0alp_0lam", - "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_2e-6-3ep_0alp_0lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2474 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3229 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 
5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0415 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2727 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3275 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1539 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_3e-6-1ep_0alp_0lam/7dbf35b2-80c1-4181-80f9-850ea51cead2.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_3e-6-1ep_0alp_0lam/7dbf35b2-80c1-4181-80f9-850ea51cead2.json deleted file mode 100644 index 839bdc602..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_3e-6-1ep_0alp_0lam/7dbf35b2-80c1-4181-80f9-850ea51cead2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_3e-6-1ep_0alp_0lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_3e-6-1ep_0alp_0lam", - "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_3e-6-1ep_0alp_0lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2Model", - "params_billions": 0.494 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2403 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3245 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.0431 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2819 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3262 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1573 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_3e-6-2ep_0alp_0lam/231f47db-1662-4313-9ff4-f32883f5615c.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_3e-6-2ep_0alp_0lam/231f47db-1662-4313-9ff4-f32883f5615c.json deleted file mode 100644 index 33c170571..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_3e-6-2ep_0alp_0lam/231f47db-1662-4313-9ff4-f32883f5615c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_3e-6-2ep_0alp_0lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_3e-6-2ep_0alp_0lam", - "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_3e-6-2ep_0alp_0lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2Model", - "params_billions": 0.494 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2368 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3224 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0461 - } - }, - { - 
"evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2743 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3355 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1516 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_3e-6-3ep_0alp_0lam/c79df898-14c6-4f00-9f65-0d01cd34ed61.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_3e-6-3ep_0alp_0lam/c79df898-14c6-4f00-9f65-0d01cd34ed61.json deleted file mode 100644 index f02cc4dee..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_3e-6-3ep_0alp_0lam/c79df898-14c6-4f00-9f65-0d01cd34ed61.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_3e-6-3ep_0alp_0lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_3e-6-3ep_0alp_0lam", - "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_3e-6-3ep_0alp_0lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2372 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3248 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0476 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - 
"source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2701 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3394 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.155 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_3e-7-3ep_0alp_0lam/2c52917f-c396-410d-bc78-c93c433797fc.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_3e-7-3ep_0alp_0lam/2c52917f-c396-410d-bc78-c93c433797fc.json deleted file mode 100644 index 1ff95edd2..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_3e-7-3ep_0alp_0lam/2c52917f-c396-410d-bc78-c93c433797fc.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_3e-7-3ep_0alp_0lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_3e-7-3ep_0alp_0lam", - "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_3e-7-3ep_0alp_0lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2499 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3181 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0415 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2651 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3288 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1574 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_5e-6-1ep_0alp_0lam/0f1d2925-4e1c-495b-94be-f3515fbd53d7.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_5e-6-1ep_0alp_0lam/0f1d2925-4e1c-495b-94be-f3515fbd53d7.json deleted file mode 100644 index 69688309d..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_5e-6-1ep_0alp_0lam/0f1d2925-4e1c-495b-94be-f3515fbd53d7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_5e-6-1ep_0alp_0lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_5e-6-1ep_0alp_0lam", - "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_5e-6-1ep_0alp_0lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2Model", - "params_billions": 0.494 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2381 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3242 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0498 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2743 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3328 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1572 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_5e-6-2ep_0alp_0lam/5cbb1972-9895-4689-9f6f-7e0037829a78.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_5e-6-2ep_0alp_0lam/5cbb1972-9895-4689-9f6f-7e0037829a78.json deleted file mode 100644 index 98d643888..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_5e-6-2ep_0alp_0lam/5cbb1972-9895-4689-9f6f-7e0037829a78.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_5e-6-2ep_0alp_0lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_5e-6-2ep_0alp_0lam", - "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_5e-6-2ep_0alp_0lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2Model", - "params_billions": 0.494 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2421 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3225 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.04 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.2802 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3408 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1496 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_5e-6-3ep_0alp_0lam/6bc42e37-1f31-47cb-97e4-9d0b28b53691.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_5e-6-3ep_0alp_0lam/6bc42e37-1f31-47cb-97e4-9d0b28b53691.json deleted file mode 100644 index 146ac953b..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_5e-6-3ep_0alp_0lam/6bc42e37-1f31-47cb-97e4-9d0b28b53691.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_5e-6-3ep_0alp_0lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_5e-6-3ep_0alp_0lam", - "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_5e-6-3ep_0alp_0lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2381 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3265 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0446 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.276 - } - }, - { - 
"evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3408 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1499 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_5e-7_1ep_0alp_0lam/a1573b95-59e6-4ae0-bc12-6ef6fee90b76.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_5e-7_1ep_0alp_0lam/a1573b95-59e6-4ae0-bc12-6ef6fee90b76.json deleted file mode 100644 index 23710b611..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_5e-7_1ep_0alp_0lam/a1573b95-59e6-4ae0-bc12-6ef6fee90b76.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_5e-7_1ep_0alp_0lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_5e-7_1ep_0alp_0lam", - "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_5e-7_1ep_0alp_0lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2Model", - "params_billions": 0.494 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2526 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3177 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0438 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2735 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - 
"source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3342 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1572 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_5e-7_2ep_0alp_0lam/78c61b39-3c76-4af9-8d5e-fcd67d6c8779.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_5e-7_2ep_0alp_0lam/78c61b39-3c76-4af9-8d5e-fcd67d6c8779.json deleted file mode 100644 index f02becf07..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_5e-7_2ep_0alp_0lam/78c61b39-3c76-4af9-8d5e-fcd67d6c8779.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_5e-7_2ep_0alp_0lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_5e-7_2ep_0alp_0lam", - "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_5e-7_2ep_0alp_0lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2Model", - "params_billions": 0.494 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2457 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.316 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0446 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2727 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3302 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1572 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_5e-7_3ep_0alp_0lam/e4c06400-da86-4448-b421-23476f50bdb3.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_5e-7_3ep_0alp_0lam/e4c06400-da86-4448-b421-23476f50bdb3.json deleted file mode 100644 index 77c01e879..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_5e-7_3ep_0alp_0lam/e4c06400-da86-4448-b421-23476f50bdb3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_5e-7_3ep_0alp_0lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_5e-7_3ep_0alp_0lam", - "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_5e-7_3ep_0alp_0lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2Model", - "params_billions": 0.494 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2442 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3194 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0483 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2735 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3315 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1567 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_7e-7_1ep_0alp_0lam/48f4c2a7-e819-4789-92ea-e02c5e92d3e4.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_7e-7_1ep_0alp_0lam/48f4c2a7-e819-4789-92ea-e02c5e92d3e4.json deleted file mode 100644 index 4b4f5231f..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_7e-7_1ep_0alp_0lam/48f4c2a7-e819-4789-92ea-e02c5e92d3e4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_7e-7_1ep_0alp_0lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_7e-7_1ep_0alp_0lam", - "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_7e-7_1ep_0alp_0lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2Model", - "params_billions": 0.494 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2604 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3178 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0355 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.276 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.3288 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1567 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_7e-7_2ep_0alp_0lam/cd9cbbac-f1ca-4193-88cc-e5968cc1bb62.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_7e-7_2ep_0alp_0lam/cd9cbbac-f1ca-4193-88cc-e5968cc1bb62.json deleted file mode 100644 index d5f2c4e6c..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_7e-7_2ep_0alp_0lam/cd9cbbac-f1ca-4193-88cc-e5968cc1bb62.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_7e-7_2ep_0alp_0lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_7e-7_2ep_0alp_0lam", - "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_7e-7_2ep_0alp_0lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2Model", - "params_billions": 0.494 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.249 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3173 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0393 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.271 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3302 - } - }, - { - "evaluation_name": "MMLU-PRO", - 
"source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1569 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_7e-7_3ep_0alp_0lam/ab3685ab-1795-4a0e-8ee4-4f509616d1b8.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_7e-7_3ep_0alp_0lam/ab3685ab-1795-4a0e-8ee4-4f509616d1b8.json deleted file mode 100644 index 03d43d960..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_7e-7_3ep_0alp_0lam/ab3685ab-1795-4a0e-8ee4-4f509616d1b8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_7e-7_3ep_0alp_0lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_7e-7_3ep_0alp_0lam", - "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-DPO_7e-7_3ep_0alp_0lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2Model", - "params_billions": 0.494 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2604 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.315 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0378 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2743 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3342 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - 
"hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1566 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_1e-7-1ep_1alp_0lam/9018f443-a63f-4e07-b10b-272f66d1eb0d.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_1e-7-1ep_1alp_0lam/9018f443-a63f-4e07-b10b-272f66d1eb0d.json deleted file mode 100644 index b053ce369..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_1e-7-1ep_1alp_0lam/9018f443-a63f-4e07-b10b-272f66d1eb0d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_1e-7-1ep_1alp_0lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_1e-7-1ep_1alp_0lam", - "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_1e-7-1ep_1alp_0lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2Model", - "params_billions": 0.494 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.255 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3211 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0491 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2701 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3288 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1571 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_1e-7-2ep_1alp_0lam/548d1536-b941-43a9-a60b-ae5448b70933.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_1e-7-2ep_1alp_0lam/548d1536-b941-43a9-a60b-ae5448b70933.json deleted file mode 100644 index 85329973f..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_1e-7-2ep_1alp_0lam/548d1536-b941-43a9-a60b-ae5448b70933.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_1e-7-2ep_1alp_0lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_1e-7-2ep_1alp_0lam", - "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_1e-7-2ep_1alp_0lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2Model", - "params_billions": 0.494 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2478 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3198 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0423 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2668 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3315 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1587 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_1e-7-3ep_1alp_0lam/99853109-17d9-46fa-a502-e4c977c1fb8f.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_1e-7-3ep_1alp_0lam/99853109-17d9-46fa-a502-e4c977c1fb8f.json deleted file mode 100644 index 76daaf914..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_1e-7-3ep_1alp_0lam/99853109-17d9-46fa-a502-e4c977c1fb8f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_1e-7-3ep_1alp_0lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_1e-7-3ep_1alp_0lam", - "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_1e-7-3ep_1alp_0lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2475 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3225 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.04 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.271 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3301 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.1556 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_3e-7-3ep_1alp_0lam/e171a0a0-f46d-404f-84e8-539155284e17.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_3e-7-3ep_1alp_0lam/e171a0a0-f46d-404f-84e8-539155284e17.json deleted file mode 100644 index b9590f6bc..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_3e-7-3ep_1alp_0lam/e171a0a0-f46d-404f-84e8-539155284e17.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_3e-7-3ep_1alp_0lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_3e-7-3ep_1alp_0lam", - "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_3e-7-3ep_1alp_0lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.259 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3185 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0363 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2727 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3275 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1586 - } - } - ] -} \ No newline at end of 
file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_5e-6-1ep_1alp_0lam/eadd93e5-5770-4d4a-a1b2-6e732a82ce34.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_5e-6-1ep_1alp_0lam/eadd93e5-5770-4d4a-a1b2-6e732a82ce34.json deleted file mode 100644 index 70f5c52d6..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_5e-6-1ep_1alp_0lam/eadd93e5-5770-4d4a-a1b2-6e732a82ce34.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_5e-6-1ep_1alp_0lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_5e-6-1ep_1alp_0lam", - "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_5e-6-1ep_1alp_0lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2Model", - "params_billions": 0.494 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2323 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3179 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0453 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2827 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3262 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1548 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_5e-6-2ep_1alp_0lam/151cb8c4-0a7d-4886-80ea-560902e1f932.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_5e-6-2ep_1alp_0lam/151cb8c4-0a7d-4886-80ea-560902e1f932.json deleted file mode 100644 index 6d9e81842..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_5e-6-2ep_1alp_0lam/151cb8c4-0a7d-4886-80ea-560902e1f932.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_5e-6-2ep_1alp_0lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_5e-6-2ep_1alp_0lam", - "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_5e-6-2ep_1alp_0lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2Model", - "params_billions": 0.494 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2315 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.326 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0415 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2701 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3383 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1521 - } - } - ] -} \ No newline at end of file diff --git 
diff --git a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_5e-6-3ep_1alp_0lam/1acb97c4-a9d2-4ec8-9486-77eb6857646c.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_5e-6-3ep_1alp_0lam/1acb97c4-a9d2-4ec8-9486-77eb6857646c.json
deleted file mode 100644
index 95db25ea7..000000000
--- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_5e-6-3ep_1alp_0lam/1acb97c4-a9d2-4ec8-9486-77eb6857646c.json
+++ /dev/null
@@ -1,132 +0,0 @@
-{
-  "schema_version": "0.2.0",
-  "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_5e-6-3ep_1alp_0lam/1770682486.623709",
-  "retrieved_timestamp": "1770682486.623709",
-  "source_metadata": {
-    "source_name": "HF Open LLM v2",
-    "source_type": "documentation",
-    "source_organization_name": "Hugging Face",
-    "evaluator_relationship": "third_party"
-  },
-  "model_info": {
-    "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_5e-6-3ep_1alp_0lam",
-    "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_5e-6-3ep_1alp_0lam",
-    "developer": "JayHyeon",
-    "inference_platform": "unknown",
-    "additional_details": {
-      "precision": "bfloat16",
-      "architecture": "Qwen2ForCausalLM",
-      "params_billions": 0.63
-    }
-  },
-  "evaluation_results": [
-    {
-      "evaluation_name": "IFEval",
-      "source_data": {
-        "dataset_name": "IFEval",
-        "source_type": "hf_dataset",
-        "hf_repo": "google/IFEval"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on IFEval",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.2298
-      }
-    },
-    {
-      "evaluation_name": "BBH",
-      "source_data": {
-        "dataset_name": "BBH",
-        "source_type": "hf_dataset",
-        "hf_repo": "SaylorTwift/bbh"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on BBH",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.332
-      }
-    },
-    {
-      "evaluation_name": "MATH Level 5",
-      "source_data": {
-        "dataset_name": "MATH Level 5",
-        "source_type": "hf_dataset",
-        "hf_repo": "DigitalLearningGmbH/MATH-lighteval"
-      },
-      "metric_config": {
-        "evaluation_description": "Exact Match on MATH Level 5",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.0431
-      }
-    },
-    {
-      "evaluation_name": "GPQA",
-      "source_data": {
-        "dataset_name": "GPQA",
-        "source_type": "hf_dataset",
-        "hf_repo": "Idavidrein/gpqa"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on GPQA",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.2659
-      }
-    },
-    {
-      "evaluation_name": "MUSR",
-      "source_data": {
-        "dataset_name": "MUSR",
-        "source_type": "hf_dataset",
-        "hf_repo": "TAUR-Lab/MuSR"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on MUSR",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.3329
-      }
-    },
-    {
-      "evaluation_name": "MMLU-PRO",
-      "source_data": {
-        "dataset_name": "MMLU-PRO",
-        "source_type": "hf_dataset",
-        "hf_repo": "TIGER-Lab/MMLU-Pro"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on MMLU-PRO",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.1567
-      }
-    }
-  ]
-}
\ No newline at end of file
diff --git a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_5e-7-1ep_1alp_0lam/1d803ac5-3ca6-4cb0-bcd1-779eaea1562d.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_5e-7-1ep_1alp_0lam/1d803ac5-3ca6-4cb0-bcd1-779eaea1562d.json
deleted file mode 100644
index 5f441e414..000000000
--- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_5e-7-1ep_1alp_0lam/1d803ac5-3ca6-4cb0-bcd1-779eaea1562d.json
+++ /dev/null
@@ -1,132 +0,0 @@
-{
-  "schema_version": "0.2.0",
-  "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_5e-7-1ep_1alp_0lam/1770682486.623709",
-  "retrieved_timestamp": "1770682486.623709",
-  "source_metadata": {
-    "source_name": "HF Open LLM v2",
-    "source_type": "documentation",
-    "source_organization_name": "Hugging Face",
-    "evaluator_relationship": "third_party"
-  },
-  "model_info": {
-    "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_5e-7-1ep_1alp_0lam",
-    "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_5e-7-1ep_1alp_0lam",
-    "developer": "JayHyeon",
-    "inference_platform": "unknown",
-    "additional_details": {
-      "precision": "bfloat16",
-      "architecture": "Qwen2Model",
-      "params_billions": 0.494
-    }
-  },
-  "evaluation_results": [
-    {
-      "evaluation_name": "IFEval",
-      "source_data": {
-        "dataset_name": "IFEval",
-        "source_type": "hf_dataset",
-        "hf_repo": "google/IFEval"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on IFEval",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.2469
-      }
-    },
-    {
-      "evaluation_name": "BBH",
-      "source_data": {
-        "dataset_name": "BBH",
-        "source_type": "hf_dataset",
-        "hf_repo": "SaylorTwift/bbh"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on BBH",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.3179
-      }
-    },
-    {
-      "evaluation_name": "MATH Level 5",
-      "source_data": {
-        "dataset_name": "MATH Level 5",
-        "source_type": "hf_dataset",
-        "hf_repo": "DigitalLearningGmbH/MATH-lighteval"
-      },
-      "metric_config": {
-        "evaluation_description": "Exact Match on MATH Level 5",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.0415
-      }
-    },
-    {
-      "evaluation_name": "GPQA",
-      "source_data": {
-        "dataset_name": "GPQA",
-        "source_type": "hf_dataset",
-        "hf_repo": "Idavidrein/gpqa"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on GPQA",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.2794
-      }
-    },
-    {
-      "evaluation_name": "MUSR",
-      "source_data": {
-        "dataset_name": "MUSR",
-        "source_type": "hf_dataset",
-        "hf_repo": "TAUR-Lab/MuSR"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on MUSR",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.3302
-      }
-    },
-    {
-      "evaluation_name": "MMLU-PRO",
-      "source_data": {
-        "dataset_name": "MMLU-PRO",
-        "source_type": "hf_dataset",
-        "hf_repo": "TIGER-Lab/MMLU-Pro"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on MMLU-PRO",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.1575
-      }
-    }
-  ]
-}
\ No newline at end of file
diff --git a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_5e-7-2ep_1alp_0lam/81562e50-23c5-4ef1-b98c-b40625f3b8c6.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_5e-7-2ep_1alp_0lam/81562e50-23c5-4ef1-b98c-b40625f3b8c6.json
deleted file mode 100644
index 1cd8d7c85..000000000
--- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_5e-7-2ep_1alp_0lam/81562e50-23c5-4ef1-b98c-b40625f3b8c6.json
+++ /dev/null
@@ -1,132 +0,0 @@
-{
-  "schema_version": "0.2.0",
-  "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_5e-7-2ep_1alp_0lam/1770682486.623709",
-  "retrieved_timestamp": "1770682486.623709",
-  "source_metadata": {
-    "source_name": "HF Open LLM v2",
-    "source_type": "documentation",
-    "source_organization_name": "Hugging Face",
-    "evaluator_relationship": "third_party"
-  },
-  "model_info": {
-    "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_5e-7-2ep_1alp_0lam",
-    "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_5e-7-2ep_1alp_0lam",
-    "developer": "JayHyeon",
-    "inference_platform": "unknown",
-    "additional_details": {
-      "precision": "bfloat16",
-      "architecture": "Qwen2Model",
-      "params_billions": 0.494
-    }
-  },
-  "evaluation_results": [
-    {
-      "evaluation_name": "IFEval",
-      "source_data": {
-        "dataset_name": "IFEval",
-        "source_type": "hf_dataset",
-        "hf_repo": "google/IFEval"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on IFEval",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.252
-      }
-    },
-    {
-      "evaluation_name": "BBH",
-      "source_data": {
-        "dataset_name": "BBH",
-        "source_type": "hf_dataset",
-        "hf_repo": "SaylorTwift/bbh"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on BBH",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.3168
-      }
-    },
-    {
-      "evaluation_name": "MATH Level 5",
-      "source_data": {
-        "dataset_name": "MATH Level 5",
-        "source_type": "hf_dataset",
-        "hf_repo": "DigitalLearningGmbH/MATH-lighteval"
-      },
-      "metric_config": {
-        "evaluation_description": "Exact Match on MATH Level 5",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.037
-      }
-    },
-    {
-      "evaluation_name": "GPQA",
-      "source_data": {
-        "dataset_name": "GPQA",
-        "source_type": "hf_dataset",
-        "hf_repo": "Idavidrein/gpqa"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on GPQA",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.2752
-      }
-    },
-    {
-      "evaluation_name": "MUSR",
-      "source_data": {
-        "dataset_name": "MUSR",
-        "source_type": "hf_dataset",
-        "hf_repo": "TAUR-Lab/MuSR"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on MUSR",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.3328
-      }
-    },
-    {
-      "evaluation_name": "MMLU-PRO",
-      "source_data": {
-        "dataset_name": "MMLU-PRO",
-        "source_type": "hf_dataset",
-        "hf_repo": "TIGER-Lab/MMLU-Pro"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on MMLU-PRO",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.1576
-      }
-    }
-  ]
-}
\ No newline at end of file
diff --git a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_5e-7-3ep_1alp_0lam/95fa292a-ee64-4844-9646-ce3cc7f730d2.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_5e-7-3ep_1alp_0lam/95fa292a-ee64-4844-9646-ce3cc7f730d2.json
deleted file mode 100644
index 361a9000e..000000000
--- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_5e-7-3ep_1alp_0lam/95fa292a-ee64-4844-9646-ce3cc7f730d2.json
+++ /dev/null
@@ -1,132 +0,0 @@
-{
-  "schema_version": "0.2.0",
-  "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_5e-7-3ep_1alp_0lam/1770682486.623709",
-  "retrieved_timestamp": "1770682486.623709",
-  "source_metadata": {
-    "source_name": "HF Open LLM v2",
-    "source_type": "documentation",
-    "source_organization_name": "Hugging Face",
-    "evaluator_relationship": "third_party"
-  },
-  "model_info": {
-    "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_5e-7-3ep_1alp_0lam",
-    "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-IRPO_5e-7-3ep_1alp_0lam",
-    "developer": "JayHyeon",
-    "inference_platform": "unknown",
-    "additional_details": {
-      "precision": "bfloat16",
-      "architecture": "Qwen2ForCausalLM",
-      "params_billions": 0.63
-    }
-  },
-  "evaluation_results": [
-    {
-      "evaluation_name": "IFEval",
-      "source_data": {
-        "dataset_name": "IFEval",
-        "source_type": "hf_dataset",
-        "hf_repo": "google/IFEval"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on IFEval",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.2666
-      }
-    },
-    {
-      "evaluation_name": "BBH",
-      "source_data": {
-        "dataset_name": "BBH",
-        "source_type": "hf_dataset",
-        "hf_repo": "SaylorTwift/bbh"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on BBH",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.3191
-      }
-    },
-    {
-      "evaluation_name": "MATH Level 5",
-      "source_data": {
-        "dataset_name": "MATH Level 5",
-        "source_type": "hf_dataset",
-        "hf_repo": "DigitalLearningGmbH/MATH-lighteval"
-      },
-      "metric_config": {
-        "evaluation_description": "Exact Match on MATH Level 5",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.0347
-      }
-    },
-    {
-      "evaluation_name": "GPQA",
-      "source_data": {
-        "dataset_name": "GPQA",
-        "source_type": "hf_dataset",
-        "hf_repo": "Idavidrein/gpqa"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on GPQA",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.2718
-      }
-    },
-    {
-      "evaluation_name": "MUSR",
-      "source_data": {
-        "dataset_name": "MUSR",
-        "source_type": "hf_dataset",
-        "hf_repo": "TAUR-Lab/MuSR"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on MUSR",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.3289
-      }
-    },
-    {
-      "evaluation_name": "MMLU-PRO",
-      "source_data": {
-        "dataset_name": "MMLU-PRO",
-        "source_type": "hf_dataset",
-        "hf_repo": "TIGER-Lab/MMLU-Pro"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on MMLU-PRO",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.1567
-      }
-    }
-  ]
-}
\ No newline at end of file
diff --git a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_0.5_1e-7-1ep_0alp_0lam/4d14c584-b5a1-41cd-9605-78088dfebd7f.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_0.5_1e-7-1ep_0alp_0lam/4d14c584-b5a1-41cd-9605-78088dfebd7f.json
deleted file mode 100644
index a144afa1e..000000000
--- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_0.5_1e-7-1ep_0alp_0lam/4d14c584-b5a1-41cd-9605-78088dfebd7f.json
+++ /dev/null
@@ -1,132 +0,0 @@
-{
-  "schema_version": "0.2.0",
-  "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_0.5_1e-7-1ep_0alp_0lam/1770682486.623709",
-  "retrieved_timestamp": "1770682486.623709",
-  "source_metadata": {
-    "source_name": "HF Open LLM v2",
-    "source_type": "documentation",
-    "source_organization_name": "Hugging Face",
-    "evaluator_relationship": "third_party"
-  },
-  "model_info": {
-    "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_0.5_1e-7-1ep_0alp_0lam",
-    "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_0.5_1e-7-1ep_0alp_0lam",
-    "developer": "JayHyeon",
-    "inference_platform": "unknown",
-    "additional_details": {
-      "precision": "bfloat16",
-      "architecture": "Qwen2Model",
-      "params_billions": 0.494
-    }
-  },
-  "evaluation_results": [
-    {
-      "evaluation_name": "IFEval",
-      "source_data": {
-        "dataset_name": "IFEval",
-        "source_type": "hf_dataset",
-        "hf_repo": "google/IFEval"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on IFEval",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.2499
-      }
-    },
-    {
-      "evaluation_name": "BBH",
-      "source_data": {
-        "dataset_name": "BBH",
-        "source_type": "hf_dataset",
-        "hf_repo": "SaylorTwift/bbh"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on BBH",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.3178
-      }
-    },
-    {
-      "evaluation_name": "MATH Level 5",
-      "source_data": {
-        "dataset_name": "MATH Level 5",
-        "source_type": "hf_dataset",
-        "hf_repo": "DigitalLearningGmbH/MATH-lighteval"
-      },
-      "metric_config": {
-        "evaluation_description": "Exact Match on MATH Level 5",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.037
-      }
-    },
-    {
-      "evaluation_name": "GPQA",
-      "source_data": {
-        "dataset_name": "GPQA",
-        "source_type": "hf_dataset",
-        "hf_repo": "Idavidrein/gpqa"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on GPQA",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.2651
-      }
-    },
-    {
-      "evaluation_name": "MUSR",
-      "source_data": {
-        "dataset_name": "MUSR",
-        "source_type": "hf_dataset",
-        "hf_repo": "TAUR-Lab/MuSR"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on MUSR",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.3341
-      }
-    },
-    {
-      "evaluation_name": "MMLU-PRO",
-      "source_data": {
-        "dataset_name": "MMLU-PRO",
-        "source_type": "hf_dataset",
-        "hf_repo": "TIGER-Lab/MMLU-Pro"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on MMLU-PRO",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.1562
-      }
-    }
-  ]
-}
\ No newline at end of file
diff --git a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_0.5_1e-7-2ep_0alp_0lam/1415d3d9-d7f8-48ef-8a2f-aa675c4c14db.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_0.5_1e-7-2ep_0alp_0lam/1415d3d9-d7f8-48ef-8a2f-aa675c4c14db.json
deleted file mode 100644
index fa1682646..000000000
--- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_0.5_1e-7-2ep_0alp_0lam/1415d3d9-d7f8-48ef-8a2f-aa675c4c14db.json
+++ /dev/null
@@ -1,132 +0,0 @@
-{
-  "schema_version": "0.2.0",
-  "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_0.5_1e-7-2ep_0alp_0lam/1770682486.623709",
-  "retrieved_timestamp": "1770682486.623709",
-  "source_metadata": {
-    "source_name": "HF Open LLM v2",
-    "source_type": "documentation",
-    "source_organization_name": "Hugging Face",
-    "evaluator_relationship": "third_party"
-  },
-  "model_info": {
-    "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_0.5_1e-7-2ep_0alp_0lam",
-    "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_0.5_1e-7-2ep_0alp_0lam",
-    "developer": "JayHyeon",
-    "inference_platform": "unknown",
-    "additional_details": {
-      "precision": "bfloat16",
-      "architecture": "Qwen2Model",
-      "params_billions": 0.494
-    }
-  },
-  "evaluation_results": [
-    {
-      "evaluation_name": "IFEval",
-      "source_data": {
-        "dataset_name": "IFEval",
-        "source_type": "hf_dataset",
-        "hf_repo": "google/IFEval"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on IFEval",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.2417
-      }
-    },
-    {
-      "evaluation_name": "BBH",
-      "source_data": {
-        "dataset_name": "BBH",
-        "source_type": "hf_dataset",
-        "hf_repo": "SaylorTwift/bbh"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on BBH",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.3178
-      }
-    },
-    {
-      "evaluation_name": "MATH Level 5",
-      "source_data": {
-        "dataset_name": "MATH Level 5",
-        "source_type": "hf_dataset",
-        "hf_repo": "DigitalLearningGmbH/MATH-lighteval"
-      },
-      "metric_config": {
-        "evaluation_description": "Exact Match on MATH Level 5",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.04
-      }
-    },
-    {
-      "evaluation_name": "GPQA",
-      "source_data": {
-        "dataset_name": "GPQA",
-        "source_type": "hf_dataset",
-        "hf_repo": "Idavidrein/gpqa"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on GPQA",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.2685
-      }
-    },
-    {
-      "evaluation_name": "MUSR",
-      "source_data": {
-        "dataset_name": "MUSR",
-        "source_type": "hf_dataset",
-        "hf_repo": "TAUR-Lab/MuSR"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on MUSR",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.3328
-      }
-    },
-    {
-      "evaluation_name": "MMLU-PRO",
-      "source_data": {
-        "dataset_name": "MMLU-PRO",
-        "source_type": "hf_dataset",
-        "hf_repo": "TIGER-Lab/MMLU-Pro"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on MMLU-PRO",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.1575
-      }
-    }
-  ]
-}
\ No newline at end of file
diff --git a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_0.5_1e-7-3ep_0alp_0lam/4b0ab369-e72f-4229-b449-3a21ee9d2c95.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_0.5_1e-7-3ep_0alp_0lam/4b0ab369-e72f-4229-b449-3a21ee9d2c95.json
deleted file mode 100644
index cbb23bb4c..000000000
--- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_0.5_1e-7-3ep_0alp_0lam/4b0ab369-e72f-4229-b449-3a21ee9d2c95.json
+++ /dev/null
@@ -1,132 +0,0 @@
-{
-  "schema_version": "0.2.0",
-  "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_0.5_1e-7-3ep_0alp_0lam/1770682486.623709",
-  "retrieved_timestamp": "1770682486.623709",
-  "source_metadata": {
-    "source_name": "HF Open LLM v2",
-    "source_type": "documentation",
-    "source_organization_name": "Hugging Face",
-    "evaluator_relationship": "third_party"
-  },
-  "model_info": {
-    "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_0.5_1e-7-3ep_0alp_0lam",
-    "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_0.5_1e-7-3ep_0alp_0lam",
-    "developer": "JayHyeon",
-    "inference_platform": "unknown",
-    "additional_details": {
-      "precision": "bfloat16",
-      "architecture": "Qwen2ForCausalLM",
-      "params_billions": 0.63
-    }
-  },
-  "evaluation_results": [
-    {
-      "evaluation_name": "IFEval",
-      "source_data": {
-        "dataset_name": "IFEval",
-        "source_type": "hf_dataset",
-        "hf_repo": "google/IFEval"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on IFEval",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.2562
-      }
-    },
-    {
-      "evaluation_name": "BBH",
-      "source_data": {
-        "dataset_name": "BBH",
-        "source_type": "hf_dataset",
-        "hf_repo": "SaylorTwift/bbh"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on BBH",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.319
-      }
-    },
-    {
-      "evaluation_name": "MATH Level 5",
-      "source_data": {
-        "dataset_name": "MATH Level 5",
-        "source_type": "hf_dataset",
-        "hf_repo": "DigitalLearningGmbH/MATH-lighteval"
-      },
-      "metric_config": {
-        "evaluation_description": "Exact Match on MATH Level 5",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.0423
-      }
-    },
-    {
-      "evaluation_name": "GPQA",
-      "source_data": {
-        "dataset_name": "GPQA",
-        "source_type": "hf_dataset",
-        "hf_repo": "Idavidrein/gpqa"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on GPQA",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.2659
-      }
-    },
-    {
-      "evaluation_name": "MUSR",
-      "source_data": {
-        "dataset_name": "MUSR",
-        "source_type": "hf_dataset",
-        "hf_repo": "TAUR-Lab/MuSR"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on MUSR",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.3341
-      }
-    },
-    {
-      "evaluation_name": "MMLU-PRO",
-      "source_data": {
-        "dataset_name": "MMLU-PRO",
-        "source_type": "hf_dataset",
-        "hf_repo": "TIGER-Lab/MMLU-Pro"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on MMLU-PRO",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.1576
-      }
-    }
-  ]
-}
\ No newline at end of file
diff --git a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_1e-6-3ep_0alp_0lam/478b6c1f-3329-4c9b-9d90-59b8b551c1af.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_1e-6-3ep_0alp_0lam/478b6c1f-3329-4c9b-9d90-59b8b551c1af.json
deleted file mode 100644
index fc688f9c5..000000000
--- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_1e-6-3ep_0alp_0lam/478b6c1f-3329-4c9b-9d90-59b8b551c1af.json
+++ /dev/null
@@ -1,132 +0,0 @@
-{
-  "schema_version": "0.2.0",
-  "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_1e-6-3ep_0alp_0lam/1770682486.623709",
-  "retrieved_timestamp": "1770682486.623709",
-  "source_metadata": {
-    "source_name": "HF Open LLM v2",
-    "source_type": "documentation",
-    "source_organization_name": "Hugging Face",
-    "evaluator_relationship": "third_party"
-  },
-  "model_info": {
-    "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_1e-6-3ep_0alp_0lam",
-    "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_1e-6-3ep_0alp_0lam",
-    "developer": "JayHyeon",
-    "inference_platform": "unknown",
-    "additional_details": {
-      "precision": "bfloat16",
-      "architecture": "Qwen2ForCausalLM",
-      "params_billions": 0.63
-    }
-  },
-  "evaluation_results": [
-    {
-      "evaluation_name": "IFEval",
-      "source_data": {
-        "dataset_name": "IFEval",
-        "source_type": "hf_dataset",
-        "hf_repo": "google/IFEval"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on IFEval",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.2408
-      }
-    },
-    {
-      "evaluation_name": "BBH",
-      "source_data": {
-        "dataset_name": "BBH",
-        "source_type": "hf_dataset",
-        "hf_repo": "SaylorTwift/bbh"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on BBH",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.3165
-      }
-    },
-    {
-      "evaluation_name": "MATH Level 5",
-      "source_data": {
-        "dataset_name": "MATH Level 5",
-        "source_type": "hf_dataset",
-        "hf_repo": "DigitalLearningGmbH/MATH-lighteval"
-      },
-      "metric_config": {
-        "evaluation_description": "Exact Match on MATH Level 5",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.0431
-      }
-    },
-    {
-      "evaluation_name": "GPQA",
-      "source_data": {
-        "dataset_name": "GPQA",
-        "source_type": "hf_dataset",
-        "hf_repo": "Idavidrein/gpqa"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on GPQA",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.2735
-      }
-    },
-    {
-      "evaluation_name": "MUSR",
-      "source_data": {
-        "dataset_name": "MUSR",
-        "source_type": "hf_dataset",
-        "hf_repo": "TAUR-Lab/MuSR"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on MUSR",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.3315
-      }
-    },
-    {
-      "evaluation_name": "MMLU-PRO",
-      "source_data": {
-        "dataset_name": "MMLU-PRO",
-        "source_type": "hf_dataset",
-        "hf_repo": "TIGER-Lab/MMLU-Pro"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on MMLU-PRO",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.1557
-      }
-    }
-  ]
-}
\ No newline at end of file
diff --git a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_1e-6_1ep_0alp_0lam/212f8dd2-3c61-45bd-a3de-2326334feb73.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_1e-6_1ep_0alp_0lam/212f8dd2-3c61-45bd-a3de-2326334feb73.json
deleted file mode 100644
index 4c5ef642e..000000000
--- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_1e-6_1ep_0alp_0lam/212f8dd2-3c61-45bd-a3de-2326334feb73.json
+++ /dev/null
@@ -1,132 +0,0 @@
-{
-  "schema_version": "0.2.0",
-  "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_1e-6_1ep_0alp_0lam/1770682486.623709",
-  "retrieved_timestamp": "1770682486.623709",
-  "source_metadata": {
-    "source_name": "HF Open LLM v2",
-    "source_type": "documentation",
-    "source_organization_name": "Hugging Face",
-    "evaluator_relationship": "third_party"
-  },
-  "model_info": {
-    "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_1e-6_1ep_0alp_0lam",
-    "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_1e-6_1ep_0alp_0lam",
-    "developer": "JayHyeon",
-    "inference_platform": "unknown",
-    "additional_details": {
-      "precision": "bfloat16",
-      "architecture": "Qwen2Model",
-      "params_billions": 0.494
-    }
-  },
-  "evaluation_results": [
-    {
-      "evaluation_name": "IFEval",
-      "source_data": {
-        "dataset_name": "IFEval",
-        "source_type": "hf_dataset",
-        "hf_repo": "google/IFEval"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on IFEval",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.2481
-      }
-    },
-    {
-      "evaluation_name": "BBH",
-      "source_data": {
-        "dataset_name": "BBH",
-        "source_type": "hf_dataset",
-        "hf_repo": "SaylorTwift/bbh"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on BBH",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.3204
-      }
-    },
-    {
-      "evaluation_name": "MATH Level 5",
-      "source_data": {
-        "dataset_name": "MATH Level 5",
-        "source_type": "hf_dataset",
-        "hf_repo": "DigitalLearningGmbH/MATH-lighteval"
-      },
-      "metric_config": {
-        "evaluation_description": "Exact Match on MATH Level 5",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.0476
-      }
-    },
-    {
-      "evaluation_name": "GPQA",
-      "source_data": {
-        "dataset_name": "GPQA",
-        "source_type": "hf_dataset",
-        "hf_repo": "Idavidrein/gpqa"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on GPQA",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.276
-      }
-    },
-    {
-      "evaluation_name": "MUSR",
-      "source_data": {
-        "dataset_name": "MUSR",
-        "source_type": "hf_dataset",
-        "hf_repo": "TAUR-Lab/MuSR"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on MUSR",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.3302
-      }
-    },
-    {
-      "evaluation_name": "MMLU-PRO",
-      "source_data": {
-        "dataset_name": "MMLU-PRO",
-        "source_type": "hf_dataset",
-        "hf_repo": "TIGER-Lab/MMLU-Pro"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on MMLU-PRO",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.1592
-      }
-    }
-  ]
-}
\ No newline at end of file
diff --git a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_1e-6_2ep_0alp_0lam/9251282e-f72f-406e-a2cf-e7063516f624.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_1e-6_2ep_0alp_0lam/9251282e-f72f-406e-a2cf-e7063516f624.json
deleted file mode 100644
index 98dde13cb..000000000
--- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_1e-6_2ep_0alp_0lam/9251282e-f72f-406e-a2cf-e7063516f624.json
+++ /dev/null
@@ -1,132 +0,0 @@
-{
-  "schema_version": "0.2.0",
-  "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_1e-6_2ep_0alp_0lam/1770682486.623709",
-  "retrieved_timestamp": "1770682486.623709",
-  "source_metadata": {
-    "source_name": "HF Open LLM v2",
-    "source_type": "documentation",
-    "source_organization_name": "Hugging Face",
-    "evaluator_relationship": "third_party"
-  },
-  "model_info": {
-    "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_1e-6_2ep_0alp_0lam",
-    "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_1e-6_2ep_0alp_0lam",
-    "developer": "JayHyeon",
-    "inference_platform": "unknown",
-    "additional_details": {
-      "precision": "bfloat16",
-      "architecture": "Qwen2Model",
-      "params_billions": 0.494
-    }
-  },
-  "evaluation_results": [
-    {
-      "evaluation_name": "IFEval",
-      "source_data": {
-        "dataset_name": "IFEval",
-        "source_type": "hf_dataset",
-        "hf_repo": "google/IFEval"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on IFEval",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.2545
-      }
-    },
-    {
-      "evaluation_name": "BBH",
-      "source_data": {
-        "dataset_name": "BBH",
-        "source_type": "hf_dataset",
-        "hf_repo": "SaylorTwift/bbh"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on BBH",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.3186
-      }
-    },
-    {
-      "evaluation_name": "MATH Level 5",
-      "source_data": {
-        "dataset_name": "MATH Level 5",
-        "source_type": "hf_dataset",
-        "hf_repo": "DigitalLearningGmbH/MATH-lighteval"
-      },
-      "metric_config": {
-        "evaluation_description": "Exact Match on MATH Level 5",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.0498
-      }
-    },
-    {
-      "evaluation_name": "GPQA",
-      "source_data": {
-        "dataset_name": "GPQA",
-        "source_type": "hf_dataset",
-        "hf_repo": "Idavidrein/gpqa"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on GPQA",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.2718
-      }
-    },
-    {
-      "evaluation_name": "MUSR",
-      "source_data": {
-        "dataset_name": "MUSR",
-        "source_type": "hf_dataset",
-        "hf_repo": "TAUR-Lab/MuSR"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on MUSR",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.3289
-      }
-    },
-    {
-      "evaluation_name": "MMLU-PRO",
-      "source_data": {
-        "dataset_name": "MMLU-PRO",
-        "source_type": "hf_dataset",
-        "hf_repo": "TIGER-Lab/MMLU-Pro"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on MMLU-PRO",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.1561
-      }
-    }
-  ]
-}
\ No newline at end of file
diff --git a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_2e-6-3ep_0alp_0lam/91a3c739-7e16-4d21-8879-bb2fd4d4c6ad.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_2e-6-3ep_0alp_0lam/91a3c739-7e16-4d21-8879-bb2fd4d4c6ad.json
deleted file mode 100644
index b16eabb54..000000000
--- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_2e-6-3ep_0alp_0lam/91a3c739-7e16-4d21-8879-bb2fd4d4c6ad.json
+++ /dev/null
@@ -1,132 +0,0 @@
-{
-  "schema_version": "0.2.0",
-  "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_2e-6-3ep_0alp_0lam/1770682486.623709",
-  "retrieved_timestamp": "1770682486.623709",
-  "source_metadata": {
-    "source_name": "HF Open LLM v2",
-    "source_type": "documentation",
-    "source_organization_name": "Hugging Face",
-    "evaluator_relationship": "third_party"
-  },
-  "model_info": {
-    "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_2e-6-3ep_0alp_0lam",
-    "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_2e-6-3ep_0alp_0lam",
-    "developer": "JayHyeon",
-    "inference_platform": "unknown",
-    "additional_details": {
-      "precision": "bfloat16",
-      "architecture": "Qwen2ForCausalLM",
-      "params_billions": 0.63
-    }
-  },
-  "evaluation_results": [
-    {
-      "evaluation_name": "IFEval",
-      "source_data": {
-        "dataset_name": "IFEval",
-        "source_type": "hf_dataset",
-        "hf_repo": "google/IFEval"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on IFEval",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.252
-      }
-    },
-    {
-      "evaluation_name": "BBH",
-      "source_data": {
-        "dataset_name": "BBH",
-        "source_type": "hf_dataset",
-        "hf_repo": "SaylorTwift/bbh"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on BBH",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.3204
-      }
-    },
-    {
-      "evaluation_name": "MATH Level 5",
-      "source_data": {
-        "dataset_name": "MATH Level 5",
-        "source_type": "hf_dataset",
-        "hf_repo": "DigitalLearningGmbH/MATH-lighteval"
-      },
-      "metric_config": {
-        "evaluation_description": "Exact Match on MATH Level 5",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.0393
-      }
-    },
-    {
-      "evaluation_name": "GPQA",
-      "source_data": {
-        "dataset_name": "GPQA",
-        "source_type": "hf_dataset",
-        "hf_repo": "Idavidrein/gpqa"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on GPQA",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.2727
-      }
-    },
-    {
-      "evaluation_name": "MUSR",
-      "source_data": {
-        "dataset_name": "MUSR",
-        "source_type": "hf_dataset",
-        "hf_repo": "TAUR-Lab/MuSR"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on MUSR",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.3262
-      }
-    },
-    {
-      "evaluation_name": "MMLU-PRO",
-      "source_data": {
-        "dataset_name": "MMLU-PRO",
-        "source_type": "hf_dataset",
-        "hf_repo": "TIGER-Lab/MMLU-Pro"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on MMLU-PRO",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.1538
-      }
-    }
-  ]
-}
\ No newline at end of file
diff --git a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_2e-6_1ep_0alp_0lam/aaa78d8f-6050-4b5d-bb67-da6c9d1ee065.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_2e-6_1ep_0alp_0lam/aaa78d8f-6050-4b5d-bb67-da6c9d1ee065.json
deleted file mode 100644
index 2f7a799fb..000000000
--- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_2e-6_1ep_0alp_0lam/aaa78d8f-6050-4b5d-bb67-da6c9d1ee065.json
+++ /dev/null
@@ -1,132 +0,0 @@
-{
-  "schema_version": "0.2.0",
-  "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_2e-6_1ep_0alp_0lam/1770682486.623709",
-  "retrieved_timestamp": "1770682486.623709",
-  "source_metadata": {
-    "source_name": "HF Open LLM v2",
-    "source_type": "documentation",
-    "source_organization_name": "Hugging Face",
-    "evaluator_relationship": "third_party"
-  },
-  "model_info": {
-    "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_2e-6_1ep_0alp_0lam",
-    "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_2e-6_1ep_0alp_0lam",
-    "developer": "JayHyeon",
-    "inference_platform": "unknown",
-    "additional_details": {
-      "precision": "bfloat16",
-      "architecture": "Qwen2Model",
-      "params_billions": 0.494
-    }
-  },
-  "evaluation_results": [
-    {
-      "evaluation_name": "IFEval",
-      "source_data": {
-        "dataset_name": "IFEval",
-        "source_type": "hf_dataset",
-        "hf_repo": "google/IFEval"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on IFEval",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.2315
-      }
-    },
-    {
-      "evaluation_name": "BBH",
-      "source_data": {
-        "dataset_name": "BBH",
-        "source_type": "hf_dataset",
-        "hf_repo": "SaylorTwift/bbh"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on BBH",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.3213
-      }
-    },
-    {
-      "evaluation_name": "MATH Level 5",
-      "source_data": {
-        "dataset_name": "MATH Level 5",
-        "source_type": "hf_dataset",
-        "hf_repo": "DigitalLearningGmbH/MATH-lighteval"
-      },
-      "metric_config": {
-        "evaluation_description": "Exact Match on MATH Level 5",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.0453
-      }
-    },
-    {
-      "evaluation_name": "GPQA",
-      "source_data": {
-        "dataset_name": "GPQA",
-        "source_type": "hf_dataset",
-        "hf_repo": "Idavidrein/gpqa"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on GPQA",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.2802
-      }
-    },
-    {
-      "evaluation_name": "MUSR",
-      "source_data": {
-        "dataset_name": "MUSR",
-        "source_type": "hf_dataset",
-        "hf_repo": "TAUR-Lab/MuSR"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on MUSR",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.3222
-      }
-    },
-    {
-      "evaluation_name": "MMLU-PRO",
-      "source_data": {
-        "dataset_name": "MMLU-PRO",
-        "source_type": "hf_dataset",
-        "hf_repo": "TIGER-Lab/MMLU-Pro"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on MMLU-PRO",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.1582
-      }
-    }
-  ]
-}
\ No newline at end of file
diff --git a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_2e-6_2ep_0alp_0lam/1f0430fe-24ff-4ef6-8577-ee5bfa74f18b.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_2e-6_2ep_0alp_0lam/1f0430fe-24ff-4ef6-8577-ee5bfa74f18b.json
deleted file mode 100644
index 928372798..000000000
--- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_2e-6_2ep_0alp_0lam/1f0430fe-24ff-4ef6-8577-ee5bfa74f18b.json
+++ /dev/null
@@ -1,132 +0,0 @@
-{
-  "schema_version": "0.2.0",
-  "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_2e-6_2ep_0alp_0lam/1770682486.623709",
-  "retrieved_timestamp": "1770682486.623709",
-  "source_metadata": {
-    "source_name": "HF Open LLM v2",
-    "source_type": "documentation",
-    "source_organization_name": "Hugging Face",
-    "evaluator_relationship": "third_party"
-  },
-  "model_info": {
-    "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_2e-6_2ep_0alp_0lam",
-    "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_2e-6_2ep_0alp_0lam",
-    "developer": "JayHyeon",
-    "inference_platform": "unknown",
-    "additional_details": {
-      "precision": "bfloat16",
-      "architecture": "Qwen2Model",
-      "params_billions": 0.494
-    }
-  },
-  "evaluation_results": [
-    {
-      "evaluation_name": "IFEval",
-      "source_data": {
-        "dataset_name": "IFEval",
-        "source_type": "hf_dataset",
-        "hf_repo": "google/IFEval"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on IFEval",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.2515
-      }
-    },
-    {
-      "evaluation_name": "BBH",
-      "source_data": {
-        "dataset_name": "BBH",
-        "source_type": "hf_dataset",
-        "hf_repo": "SaylorTwift/bbh"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on BBH",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.3187
-      }
-    },
-    {
-      "evaluation_name": "MATH Level 5",
-      "source_data": {
-        "dataset_name": "MATH Level 5",
-        "source_type": "hf_dataset",
-        "hf_repo": "DigitalLearningGmbH/MATH-lighteval"
-      },
-      "metric_config": {
-        "evaluation_description": "Exact Match on MATH Level 5",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.0431
-      }
-    },
-    {
-      "evaluation_name": "GPQA",
-      "source_data": {
-        "dataset_name": "GPQA",
-        "source_type": "hf_dataset",
-        "hf_repo": "Idavidrein/gpqa"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on GPQA",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.2718
-      }
-    },
-    {
-      "evaluation_name": "MUSR",
-      "source_data": {
-        "dataset_name": "MUSR",
-        "source_type": "hf_dataset",
-        "hf_repo": "TAUR-Lab/MuSR"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on MUSR",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.3289
-      }
-    },
-    {
-      "evaluation_name": "MMLU-PRO",
-      "source_data": {
-        "dataset_name": "MMLU-PRO",
-        "source_type": "hf_dataset",
-        "hf_repo": "TIGER-Lab/MMLU-Pro"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on MMLU-PRO",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.1539
-      }
-    }
-  ]
-}
\ No newline at end of file
diff --git a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_3e-6-1ep_0alp_0lam/f374772b-2685-41e2-a455-9002e48e3739.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_3e-6-1ep_0alp_0lam/f374772b-2685-41e2-a455-9002e48e3739.json
deleted file mode 100644
index feb9ce218..000000000
--- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_3e-6-1ep_0alp_0lam/f374772b-2685-41e2-a455-9002e48e3739.json
+++ /dev/null
@@ -1,132 +0,0 @@
-{
-  "schema_version": "0.2.0",
-  "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_3e-6-1ep_0alp_0lam/1770682486.623709",
-  "retrieved_timestamp": "1770682486.623709",
-  "source_metadata": {
-    "source_name": "HF Open LLM v2",
-    "source_type": "documentation",
-    "source_organization_name": "Hugging Face",
-    "evaluator_relationship": "third_party"
-  },
-  "model_info": {
-    "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_3e-6-1ep_0alp_0lam",
-    "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_3e-6-1ep_0alp_0lam",
-    "developer": "JayHyeon",
-    "inference_platform": "unknown",
-    "additional_details": {
-      "precision": "bfloat16",
-      "architecture": "Qwen2Model",
-      "params_billions": 0.494
-    }
-  },
-  "evaluation_results": [
-    {
-      "evaluation_name": "IFEval",
-      "source_data": {
-        "dataset_name": "IFEval",
-        "source_type": "hf_dataset",
-        "hf_repo": "google/IFEval"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on IFEval",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.2472
-      }
-    },
-    {
-      "evaluation_name": "BBH",
-      "source_data": {
-        "dataset_name": "BBH",
-        "source_type": "hf_dataset",
-        "hf_repo": "SaylorTwift/bbh"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on BBH",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.3213
-      }
-    },
-    {
-      "evaluation_name": "MATH Level 5",
-      "source_data": {
-        "dataset_name": "MATH Level 5",
-        "source_type": "hf_dataset",
-        "hf_repo": "DigitalLearningGmbH/MATH-lighteval"
-      },
-      "metric_config": {
-        "evaluation_description": "Exact Match on MATH Level 5",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.0347
-      }
-    },
-    {
-      "evaluation_name": "GPQA",
-      "source_data": {
-        "dataset_name": "GPQA",
-        "source_type": "hf_dataset",
-        "hf_repo": "Idavidrein/gpqa"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on GPQA",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.2727
-      }
-    },
-    {
-      "evaluation_name": "MUSR",
-      "source_data": {
-        "dataset_name": "MUSR",
-        "source_type": "hf_dataset",
-        "hf_repo": "TAUR-Lab/MuSR"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on MUSR",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.3262
-      }
-    },
-    {
-      "evaluation_name": "MMLU-PRO",
-      "source_data": {
-        "dataset_name": "MMLU-PRO",
-        "source_type": "hf_dataset",
-        "hf_repo": "TIGER-Lab/MMLU-Pro"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on MMLU-PRO",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.1588
-      }
-    }
-  ]
-}
\ No newline at end of file
diff --git a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_3e-6-2ep_0alp_0lam/6db801f8-5253-47c0-b87e-6779bff42f6b.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_3e-6-2ep_0alp_0lam/6db801f8-5253-47c0-b87e-6779bff42f6b.json
deleted file mode 100644
index 70dc4e5f1..000000000
--- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_3e-6-2ep_0alp_0lam/6db801f8-5253-47c0-b87e-6779bff42f6b.json
+++ /dev/null
@@ -1,132 +0,0 @@
-{
-  "schema_version": "0.2.0",
-  "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_3e-6-2ep_0alp_0lam/1770682486.623709",
-  "retrieved_timestamp": "1770682486.623709",
-  "source_metadata": {
-    "source_name": "HF Open LLM v2",
-    "source_type": "documentation",
-    "source_organization_name": "Hugging Face",
-    "evaluator_relationship": "third_party"
-  },
-  "model_info": {
-    "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_3e-6-2ep_0alp_0lam",
-    "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_3e-6-2ep_0alp_0lam",
-    "developer": "JayHyeon",
-    "inference_platform": "unknown",
-    "additional_details": {
-      "precision": "bfloat16",
-      "architecture": "Qwen2Model",
-      "params_billions": 0.494
-    }
-  },
-  "evaluation_results": [
-    {
-      "evaluation_name": "IFEval",
-      "source_data": {
-        "dataset_name": "IFEval",
-        "source_type": "hf_dataset",
-        "hf_repo": "google/IFEval"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on IFEval",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.246
-      }
-    },
-    {
-      "evaluation_name": "BBH",
-      "source_data": {
-        "dataset_name": "BBH",
-        "source_type": "hf_dataset",
-        "hf_repo": "SaylorTwift/bbh"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on BBH",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.3234
-      }
-    },
-    {
-      "evaluation_name": "MATH Level 5",
-      "source_data": {
-        "dataset_name": "MATH Level 5",
-        "source_type": "hf_dataset",
-        "hf_repo": "DigitalLearningGmbH/MATH-lighteval"
-      },
-      "metric_config": {
-        "evaluation_description": "Exact Match on MATH Level 5",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.0378
-      }
-    },
-    {
-      "evaluation_name": "GPQA",
-      "source_data": {
-        "dataset_name": "GPQA",
-        "source_type": "hf_dataset",
-        "hf_repo": "Idavidrein/gpqa"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on GPQA",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.2794
-      }
-    },
-    {
-      "evaluation_name": "MUSR",
-      "source_data": {
-        "dataset_name": "MUSR",
-        "source_type": "hf_dataset",
-        "hf_repo": "TAUR-Lab/MuSR"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on MUSR",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.3302
-      }
-    },
-    {
-      "evaluation_name": "MMLU-PRO",
-      "source_data": {
-        "dataset_name": "MMLU-PRO",
-        "source_type": "hf_dataset",
-        "hf_repo": "TIGER-Lab/MMLU-Pro"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on MMLU-PRO",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.1533
-      }
-    }
-  ]
-}
\ No newline at end of file
diff --git a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_3e-6-3ep_0alp_0lam/0d704671-c0b6-4296-85b5-eaf972d6be6a.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_3e-6-3ep_0alp_0lam/0d704671-c0b6-4296-85b5-eaf972d6be6a.json
deleted file mode 100644
index 7fff16277..000000000
--- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_3e-6-3ep_0alp_0lam/0d704671-c0b6-4296-85b5-eaf972d6be6a.json
+++ /dev/null
@@ -1,132 +0,0 @@
-{
-  "schema_version": "0.2.0",
-  "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_3e-6-3ep_0alp_0lam/1770682486.623709",
-  "retrieved_timestamp": "1770682486.623709",
-  "source_metadata": {
-    "source_name": "HF Open LLM v2",
-    "source_type": "documentation",
-    "source_organization_name": "Hugging Face",
-    "evaluator_relationship": "third_party"
-  },
-  "model_info": {
-    "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_3e-6-3ep_0alp_0lam",
-    "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_3e-6-3ep_0alp_0lam",
-    "developer": "JayHyeon",
-    "inference_platform": "unknown",
-    "additional_details": {
-      "precision": "bfloat16",
-      "architecture": "Qwen2ForCausalLM",
-      "params_billions": 0.63
-    }
-  },
-  "evaluation_results": [
-    {
-      "evaluation_name": "IFEval",
-      "source_data": {
-        "dataset_name": "IFEval",
-        "source_type": "hf_dataset",
-        "hf_repo": "google/IFEval"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on IFEval",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.2524
-      }
-    },
-    {
-      "evaluation_name": "BBH",
-      "source_data": {
-        "dataset_name": "BBH",
-        "source_type": "hf_dataset",
-        "hf_repo": "SaylorTwift/bbh"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on BBH",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.3256
-      }
-    },
-    {
-      "evaluation_name": "MATH Level 5",
-      "source_data": {
-        "dataset_name": "MATH Level 5",
-        "source_type": "hf_dataset",
-        "hf_repo": "DigitalLearningGmbH/MATH-lighteval"
-      },
-      "metric_config": {
-        "evaluation_description": "Exact Match on MATH Level 5",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.0536
-      }
-    },
-    {
-      "evaluation_name": "GPQA",
-      "source_data": {
-        "dataset_name": "GPQA",
-        "source_type": "hf_dataset",
-        "hf_repo": "Idavidrein/gpqa"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on GPQA",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.2777
-      }
-    },
-    {
-      "evaluation_name": "MUSR",
-      "source_data": {
-        "dataset_name": "MUSR",
-        "source_type": "hf_dataset",
-        "hf_repo": "TAUR-Lab/MuSR"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on MUSR",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.3368
-      }
-    },
-    {
-      "evaluation_name": "MMLU-PRO",
-      "source_data": {
-        "dataset_name": "MMLU-PRO",
-        "source_type": "hf_dataset",
-        "hf_repo": "TIGER-Lab/MMLU-Pro"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on MMLU-PRO",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.1531
-      }
-    }
-  ]
-}
\ No newline at end of file
diff --git a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_5e-6-1ep_0alp_0lam/7e31545f-0865-4843-914b-a71f8a84314f.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_5e-6-1ep_0alp_0lam/7e31545f-0865-4843-914b-a71f8a84314f.json
deleted file mode 100644
index b0805fce1..000000000
--- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_5e-6-1ep_0alp_0lam/7e31545f-0865-4843-914b-a71f8a84314f.json
+++ /dev/null
@@ -1,132 +0,0 @@
-{
-  "schema_version": "0.2.0",
-  "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_5e-6-1ep_0alp_0lam/1770682486.623709",
-  "retrieved_timestamp": "1770682486.623709",
-  "source_metadata": {
-    "source_name": "HF Open LLM v2",
-    "source_type": "documentation",
-    "source_organization_name": "Hugging Face",
-    "evaluator_relationship": "third_party"
-  },
-  "model_info": {
-    "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_5e-6-1ep_0alp_0lam",
-    "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_5e-6-1ep_0alp_0lam",
-    "developer": "JayHyeon",
-    "inference_platform": "unknown",
-    "additional_details": {
-      "precision": "bfloat16",
-      "architecture": "Qwen2Model",
-      "params_billions": 0.494
-    }
-  },
-  "evaluation_results": [
-    {
-      "evaluation_name": "IFEval",
-      "source_data": {
-        "dataset_name": "IFEval",
-        "source_type": "hf_dataset",
-        "hf_repo": "google/IFEval"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on IFEval",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.2265
-      }
-    },
-    {
-      "evaluation_name": "BBH",
-      "source_data": {
-        "dataset_name": "BBH",
-        "source_type": "hf_dataset",
-        "hf_repo": "SaylorTwift/bbh"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on BBH",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.3252
-      }
-    },
-    {
-      "evaluation_name": "MATH Level 5",
-      "source_data": {
-        "dataset_name": "MATH Level 5",
-        "source_type": "hf_dataset",
-        "hf_repo": "DigitalLearningGmbH/MATH-lighteval"
-      },
-      "metric_config": {
-        "evaluation_description": "Exact Match on MATH Level 5",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.0476
-      }
-    },
-    {
-      "evaluation_name": "GPQA",
-      "source_data": {
-        "dataset_name": "GPQA",
-        "source_type": "hf_dataset",
-        "hf_repo": "Idavidrein/gpqa"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on GPQA",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.2735
-      }
-    },
-    {
-      "evaluation_name": "MUSR",
-      "source_data": {
-        "dataset_name": "MUSR",
-        "source_type": "hf_dataset",
-        "hf_repo": "TAUR-Lab/MuSR"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on MUSR",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.3262
-      }
-    },
-    {
-      "evaluation_name": "MMLU-PRO",
-      "source_data": {
-        "dataset_name": "MMLU-PRO",
-        "source_type": "hf_dataset",
-        "hf_repo": "TIGER-Lab/MMLU-Pro"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on MMLU-PRO",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.1568
-      }
-    }
-  ]
-}
\ No newline at end of file
a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_5e-6-2ep_0alp_0lam/431c7130-5a19-4a71-8a92-fea9726769ac.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_5e-6-2ep_0alp_0lam/431c7130-5a19-4a71-8a92-fea9726769ac.json deleted file mode 100644 index a3d21ddaa..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_5e-6-2ep_0alp_0lam/431c7130-5a19-4a71-8a92-fea9726769ac.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_5e-6-2ep_0alp_0lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_5e-6-2ep_0alp_0lam", - "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_5e-6-2ep_0alp_0lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2Model", - "params_billions": 0.494 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2302 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3224 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0438 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2768 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3408 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.15 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_5e-6-3ep_0alp_0lam/ca850c4a-14d0-4145-9977-0d33e6e3e362.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_5e-6-3ep_0alp_0lam/ca850c4a-14d0-4145-9977-0d33e6e3e362.json deleted file mode 100644 index 38c8a17c7..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_5e-6-3ep_0alp_0lam/ca850c4a-14d0-4145-9977-0d33e6e3e362.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_5e-6-3ep_0alp_0lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_5e-6-3ep_0alp_0lam", - "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_5e-6-3ep_0alp_0lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2524 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3278 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0408 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2777 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3395 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1521 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_5e-7-3ep_0alp_0lam/7389caa3-6d8f-43e3-b3f2-d9320e56f621.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_5e-7-3ep_0alp_0lam/7389caa3-6d8f-43e3-b3f2-d9320e56f621.json deleted file mode 100644 index a5198f971..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_5e-7-3ep_0alp_0lam/7389caa3-6d8f-43e3-b3f2-d9320e56f621.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_5e-7-3ep_0alp_0lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_5e-7-3ep_0alp_0lam", - "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_5e-7-3ep_0alp_0lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2658 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3175 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0363 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2617 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3302 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1575 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_5e-7_1ep_0alp_0lam/1e822b0f-0d80-4613-983b-ebd2e6fbfcd6.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_5e-7_1ep_0alp_0lam/1e822b0f-0d80-4613-983b-ebd2e6fbfcd6.json deleted file mode 100644 index 1707361ce..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_5e-7_1ep_0alp_0lam/1e822b0f-0d80-4613-983b-ebd2e6fbfcd6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_5e-7_1ep_0alp_0lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_5e-7_1ep_0alp_0lam", - "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_5e-7_1ep_0alp_0lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2Model", - "params_billions": 0.494 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2487 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3189 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0378 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2718 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3275 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1595 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_5e-7_2ep_0alp_0lam/1206f592-e6f7-4e7d-83cd-cbe82b37ec58.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_5e-7_2ep_0alp_0lam/1206f592-e6f7-4e7d-83cd-cbe82b37ec58.json deleted file mode 100644 index 375805022..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_5e-7_2ep_0alp_0lam/1206f592-e6f7-4e7d-83cd-cbe82b37ec58.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_5e-7_2ep_0alp_0lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_5e-7_2ep_0alp_0lam", - "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_5e-7_2ep_0alp_0lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2Model", - "params_billions": 0.494 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.256 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3159 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0378 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2768 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3275 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1562 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_7e-7-3ep_0alp_0lam/e4085c6a-bc16-4328-a724-4b9838b55faa.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_7e-7-3ep_0alp_0lam/e4085c6a-bc16-4328-a724-4b9838b55faa.json deleted file mode 100644 index 67bf5ff22..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_7e-7-3ep_0alp_0lam/e4085c6a-bc16-4328-a724-4b9838b55faa.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_7e-7-3ep_0alp_0lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_7e-7-3ep_0alp_0lam", - "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_7e-7-3ep_0alp_0lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2499 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3156 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.04 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2701 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3302 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1556 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_7e-7_1ep_0alp_0lam/b929b955-1fbb-43d0-add1-4d58fdc4097c.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_7e-7_1ep_0alp_0lam/b929b955-1fbb-43d0-add1-4d58fdc4097c.json deleted file mode 100644 index 0387f9f97..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_7e-7_1ep_0alp_0lam/b929b955-1fbb-43d0-add1-4d58fdc4097c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_7e-7_1ep_0alp_0lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_7e-7_1ep_0alp_0lam", - "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_7e-7_1ep_0alp_0lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2Model", - "params_billions": 0.494 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2496 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3177 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0453 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2626 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3315 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1567 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_7e-7_2ep_0alp_0lam/df723a0f-9a32-42f3-9421-780159f7d821.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_7e-7_2ep_0alp_0lam/df723a0f-9a32-42f3-9421-780159f7d821.json deleted file mode 100644 index 03229f6c3..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_7e-7_2ep_0alp_0lam/df723a0f-9a32-42f3-9421-780159f7d821.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_7e-7_2ep_0alp_0lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_7e-7_2ep_0alp_0lam", - "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep-MDPO_7e-7_2ep_0alp_0lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2Model", - "params_billions": 0.494 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2515 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3172 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0438 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2701 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3275 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1553 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep/c1046d2c-0b5b-4ab7-b173-8d5b5ecbc07d.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep/c1046d2c-0b5b-4ab7-b173-8d5b5ecbc07d.json deleted file mode 100644 index d40e539a7..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep/c1046d2c-0b5b-4ab7-b173-8d5b5ecbc07d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-2ep/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-0.5B-SFT-2e-5-2ep", - "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-2ep", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2201 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3217 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0408 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2777 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3367 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.171 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-3ep/60c02070-7554-4764-8a02-841ca75a0d5c.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-3ep/60c02070-7554-4764-8a02-841ca75a0d5c.json deleted file mode 100644 
index c536b41df..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-3ep/60c02070-7554-4764-8a02-841ca75a0d5c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-3ep/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-0.5B-SFT-2e-5-3ep", - "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-3ep", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2281 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.324 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0453 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2617 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3301 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1746 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-5ep-MDPO_5e-7_3ep_0alp_0lam/d243f226-149b-4824-837e-e80ab68bae9d.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-5ep-MDPO_5e-7_3ep_0alp_0lam/d243f226-149b-4824-837e-e80ab68bae9d.json deleted file mode 100644 index a45e23e39..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-5ep-MDPO_5e-7_3ep_0alp_0lam/d243f226-149b-4824-837e-e80ab68bae9d.json +++ /dev/null @@ 
-1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-5ep-MDPO_5e-7_3ep_0alp_0lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-0.5B-SFT-2e-5-5ep-MDPO_5e-7_3ep_0alp_0lam", - "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-5ep-MDPO_5e-7_3ep_0alp_0lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2526 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3238 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0393 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2676 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3528 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1574 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-5ep-MDPO_5e-7_3ep_0alp_0lam_1ep/4f9361d0-2ad9-44da-a1d9-876d43451ae6.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-5ep-MDPO_5e-7_3ep_0alp_0lam_1ep/4f9361d0-2ad9-44da-a1d9-876d43451ae6.json deleted file mode 100644 index d516e44ee..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-5ep-MDPO_5e-7_3ep_0alp_0lam_1ep/4f9361d0-2ad9-44da-a1d9-876d43451ae6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - 
"evaluation_id": "hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-5ep-MDPO_5e-7_3ep_0alp_0lam_1ep/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-0.5B-SFT-2e-5-5ep-MDPO_5e-7_3ep_0alp_0lam_1ep", - "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-5ep-MDPO_5e-7_3ep_0alp_0lam_1ep", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2Model", - "params_billions": 0.494 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2481 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3175 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0385 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2626 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3475 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1597 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-5ep-MDPO_5e-7_3ep_0alp_0lam_2ep/6c6e9ebc-f83d-48d5-b69f-be43d4167a0e.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-5ep-MDPO_5e-7_3ep_0alp_0lam_2ep/6c6e9ebc-f83d-48d5-b69f-be43d4167a0e.json deleted file mode 100644 index fc27247a4..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-5ep-MDPO_5e-7_3ep_0alp_0lam_2ep/6c6e9ebc-f83d-48d5-b69f-be43d4167a0e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-5ep-MDPO_5e-7_3ep_0alp_0lam_2ep/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-0.5B-SFT-2e-5-5ep-MDPO_5e-7_3ep_0alp_0lam_2ep", - "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-5ep-MDPO_5e-7_3ep_0alp_0lam_2ep", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2Model", - "params_billions": 0.494 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2548 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3199 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0385 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2651 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3435 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1562 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-5ep-MDPO_7e-7_3ep_0alp_0lam/7cd2c0da-15b8-4ad6-8cad-feb68631c079.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-5ep-MDPO_7e-7_3ep_0alp_0lam/7cd2c0da-15b8-4ad6-8cad-feb68631c079.json deleted file mode 100644 index 60222a1b0..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-5ep-MDPO_7e-7_3ep_0alp_0lam/7cd2c0da-15b8-4ad6-8cad-feb68631c079.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-5ep-MDPO_7e-7_3ep_0alp_0lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-0.5B-SFT-2e-5-5ep-MDPO_7e-7_3ep_0alp_0lam", - "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-5ep-MDPO_7e-7_3ep_0alp_0lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2423 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3219 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.034 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2701 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3515 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1563 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-5ep-MDPO_7e-7_3ep_0alp_0lam_1ep/36b84cf2-d221-4e9a-b728-37dc2bf7e1d6.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-5ep-MDPO_7e-7_3ep_0alp_0lam_1ep/36b84cf2-d221-4e9a-b728-37dc2bf7e1d6.json deleted file mode 100644 index 97123425b..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-5ep-MDPO_7e-7_3ep_0alp_0lam_1ep/36b84cf2-d221-4e9a-b728-37dc2bf7e1d6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-5ep-MDPO_7e-7_3ep_0alp_0lam_1ep/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-0.5B-SFT-2e-5-5ep-MDPO_7e-7_3ep_0alp_0lam_1ep", - "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-5ep-MDPO_7e-7_3ep_0alp_0lam_1ep", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2Model", - "params_billions": 0.494 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2493 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3191 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0393 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2685 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3475 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1592 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-5ep-MDPO_7e-7_3ep_0alp_0lam_2ep/1fd0d1db-1d75-4b10-bae8-33023c2c7466.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-5ep-MDPO_7e-7_3ep_0alp_0lam_2ep/1fd0d1db-1d75-4b10-bae8-33023c2c7466.json deleted file mode 100644 index 591bed95d..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-5ep-MDPO_7e-7_3ep_0alp_0lam_2ep/1fd0d1db-1d75-4b10-bae8-33023c2c7466.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-5ep-MDPO_7e-7_3ep_0alp_0lam_2ep/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-0.5B-SFT-2e-5-5ep-MDPO_7e-7_3ep_0alp_0lam_2ep", - "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-5ep-MDPO_7e-7_3ep_0alp_0lam_2ep", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2Model", - "params_billions": 0.494 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2478 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3218 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0415 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2693 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3515 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1556 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-5ep/c6c02512-6c91-4818-a084-c48915fd83de.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-5ep/c6c02512-6c91-4818-a084-c48915fd83de.json deleted file mode 100644 index 6bd7dcd08..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5-5ep/c6c02512-6c91-4818-a084-c48915fd83de.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT-2e-5-5ep/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - 
"source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-0.5B-SFT-2e-5-5ep", - "id": "JayHyeon/Qwen2.5-0.5B-SFT-2e-5-5ep", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2348 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3308 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0506 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2643 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3409 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1695 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5/326affa2-9ea4-4fc9-b60f-d2abeb7493c3.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5/326affa2-9ea4-4fc9-b60f-d2abeb7493c3.json deleted file mode 100644 index 02a1ed3dd..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-2e-5/326affa2-9ea4-4fc9-b60f-d2abeb7493c3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT-2e-5/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-0.5B-SFT-2e-5", - "id": 
"JayHyeon/Qwen2.5-0.5B-SFT-2e-5", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2068 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3204 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.037 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2693 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3487 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1678 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-5e-5-2ep/b3a190d1-5b86-4439-a21e-1f118239db82.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-5e-5-2ep/b3a190d1-5b86-4439-a21e-1f118239db82.json deleted file mode 100644 index 3ce9e8679..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-5e-5-2ep/b3a190d1-5b86-4439-a21e-1f118239db82.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT-5e-5-2ep/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-0.5B-SFT-5e-5-2ep", - "id": "JayHyeon/Qwen2.5-0.5B-SFT-5e-5-2ep", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - 
}, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2175 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.318 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0378 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2601 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3368 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1627 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-5e-5-3ep/b37a7db5-b26f-4a82-b27c-6c3a2ba72fda.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-5e-5-3ep/b37a7db5-b26f-4a82-b27c-6c3a2ba72fda.json deleted file mode 100644 index ce21c0bef..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-5e-5-3ep/b37a7db5-b26f-4a82-b27c-6c3a2ba72fda.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT-5e-5-3ep/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-0.5B-SFT-5e-5-3ep", - "id": "JayHyeon/Qwen2.5-0.5B-SFT-5e-5-3ep", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2199 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3297 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0302 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2534 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3593 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1651 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-5e-5-5ep/05a59445-b816-4982-9b1a-1c2394ffbaa9.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-5e-5-5ep/05a59445-b816-4982-9b1a-1c2394ffbaa9.json deleted file mode 100644 index 8649fd379..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-5e-5-5ep/05a59445-b816-4982-9b1a-1c2394ffbaa9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT-5e-5-5ep/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-0.5B-SFT-5e-5-5ep", - "id": "JayHyeon/Qwen2.5-0.5B-SFT-5e-5-5ep", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2077 - } - }, - { - 
"evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3276 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0272 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2685 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3766 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1587 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-5e-5/ff952579-e92d-4af8-9497-f49fed5efba0.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-5e-5/ff952579-e92d-4af8-9497-f49fed5efba0.json deleted file mode 100644 index c0f65b58b..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-5e-5/ff952579-e92d-4af8-9497-f49fed5efba0.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT-5e-5/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-0.5B-SFT-5e-5", - "id": "JayHyeon/Qwen2.5-0.5B-SFT-5e-5", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.201 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3109 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.034 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2676 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3381 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1672 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-7e-5-2ep/b541ede0-6de9-4557-8280-43567fd3dd96.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-7e-5-2ep/b541ede0-6de9-4557-8280-43567fd3dd96.json deleted file mode 100644 index b702f2d51..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-7e-5-2ep/b541ede0-6de9-4557-8280-43567fd3dd96.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT-7e-5-2ep/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-0.5B-SFT-7e-5-2ep", - "id": "JayHyeon/Qwen2.5-0.5B-SFT-7e-5-2ep", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2156 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.31 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - 
"source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0393 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2424 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3367 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1567 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-7e-5-3ep/8514f601-0bb2-4639-90cc-29e96088e7de.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-7e-5-3ep/8514f601-0bb2-4639-90cc-29e96088e7de.json deleted file mode 100644 index 686bff7aa..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-7e-5-3ep/8514f601-0bb2-4639-90cc-29e96088e7de.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT-7e-5-3ep/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-0.5B-SFT-7e-5-3ep", - "id": "JayHyeon/Qwen2.5-0.5B-SFT-7e-5-3ep", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2381 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3199 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0332 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2366 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3554 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1522 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-7e-5-5ep/57e6d0cf-943a-4b83-a1f4-4f03b5066523.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-7e-5-5ep/57e6d0cf-943a-4b83-a1f4-4f03b5066523.json deleted file mode 100644 index 138ef3aec..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-7e-5-5ep/57e6d0cf-943a-4b83-a1f4-4f03b5066523.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT-7e-5-5ep/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-0.5B-SFT-7e-5-5ep", - "id": "JayHyeon/Qwen2.5-0.5B-SFT-7e-5-5ep", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.212 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.32 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0219 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - 
"hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2458 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3713 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1628 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-7e-5/ec205127-21c0-4edf-bb3a-ec8ccac4fcdb.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-7e-5/ec205127-21c0-4edf-bb3a-ec8ccac4fcdb.json deleted file mode 100644 index 0c78b2864..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-7e-5/ec205127-21c0-4edf-bb3a-ec8ccac4fcdb.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT-7e-5/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-0.5B-SFT-7e-5", - "id": "JayHyeon/Qwen2.5-0.5B-SFT-7e-5", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2093 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3158 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0302 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 
0.2567 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3367 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1622 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-DPO-1epoch_v1/14b260e6-4300-43ec-b7af-587a2f5b03fb.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-DPO-1epoch_v1/14b260e6-4300-43ec-b7af-587a2f5b03fb.json deleted file mode 100644 index 0329a0fc0..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-DPO-1epoch_v1/14b260e6-4300-43ec-b7af-587a2f5b03fb.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT-DPO-1epoch_v1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-0.5B-SFT-DPO-1epoch_v1", - "id": "JayHyeon/Qwen2.5-0.5B-SFT-DPO-1epoch_v1", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2Model", - "params_billions": 0.494 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2025 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3268 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0363 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2727 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3209 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.133 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-MDPO-1epoch_v1/53de1fc9-7097-4103-b731-588a7bf39f80.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-MDPO-1epoch_v1/53de1fc9-7097-4103-b731-588a7bf39f80.json deleted file mode 100644 index 2d3c81041..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT-MDPO-1epoch_v1/53de1fc9-7097-4103-b731-588a7bf39f80.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT-MDPO-1epoch_v1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-0.5B-SFT-MDPO-1epoch_v1", - "id": "JayHyeon/Qwen2.5-0.5B-SFT-MDPO-1epoch_v1", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2Model", - "params_billions": 0.494 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1964 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3293 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0468 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.276 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3262 - 
} - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1337 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT/1a1031c5-3ec2-4d12-93eb-e0a3b0448ed4.json b/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT/1a1031c5-3ec2-4d12-93eb-e0a3b0448ed4.json deleted file mode 100644 index 2c42031ee..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen2.5-0.5B-SFT/1a1031c5-3ec2-4d12-93eb-e0a3b0448ed4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen2.5-0.5B-SFT/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-0.5B-SFT", - "id": "JayHyeon/Qwen2.5-0.5B-SFT", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1964 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3121 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0272 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2785 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3394 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1673 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-DPOP_1e-6-3ep_0alp_5lam/51b62d59-f39c-49ca-af0a-73df6440e29d.json b/data/hfopenllm_v2/JayHyeon/Qwen_0.5-DPOP_1e-6-3ep_0alp_5lam/51b62d59-f39c-49ca-af0a-73df6440e29d.json deleted file mode 100644 index b18776596..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-DPOP_1e-6-3ep_0alp_5lam/51b62d59-f39c-49ca-af0a-73df6440e29d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen_0.5-DPOP_1e-6-3ep_0alp_5lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen_0.5-DPOP_1e-6-3ep_0alp_5lam", - "id": "JayHyeon/Qwen_0.5-DPOP_1e-6-3ep_0alp_5lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2532 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.314 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0491 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2743 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3315 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1566 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-DPOP_1e-7-3ep_0alp_5lam/622a0ae1-0eb5-49f0-bc44-d396c7233e27.json b/data/hfopenllm_v2/JayHyeon/Qwen_0.5-DPOP_1e-7-3ep_0alp_5lam/622a0ae1-0eb5-49f0-bc44-d396c7233e27.json deleted file mode 100644 index 8a1f01917..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-DPOP_1e-7-3ep_0alp_5lam/622a0ae1-0eb5-49f0-bc44-d396c7233e27.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen_0.5-DPOP_1e-7-3ep_0alp_5lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen_0.5-DPOP_1e-7-3ep_0alp_5lam", - "id": "JayHyeon/Qwen_0.5-DPOP_1e-7-3ep_0alp_5lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.267 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3189 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0408 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2668 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3288 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1562 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-DPOP_3e-6-1ep_0alp_5lam/71291a41-283e-42ca-b192-7b759e3c3712.json 
b/data/hfopenllm_v2/JayHyeon/Qwen_0.5-DPOP_3e-6-1ep_0alp_5lam/71291a41-283e-42ca-b192-7b759e3c3712.json deleted file mode 100644 index 9c3e33edf..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-DPOP_3e-6-1ep_0alp_5lam/71291a41-283e-42ca-b192-7b759e3c3712.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen_0.5-DPOP_3e-6-1ep_0alp_5lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen_0.5-DPOP_3e-6-1ep_0alp_5lam", - "id": "JayHyeon/Qwen_0.5-DPOP_3e-6-1ep_0alp_5lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2Model", - "params_billions": 0.494 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2481 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3261 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0438 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2601 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3368 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1565 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-DPOP_3e-6-2ep_0alp_5lam/7e504fef-b304-4c1a-856d-06e56a8869d7.json b/data/hfopenllm_v2/JayHyeon/Qwen_0.5-DPOP_3e-6-2ep_0alp_5lam/7e504fef-b304-4c1a-856d-06e56a8869d7.json deleted file mode 100644 index 1841fefbe..000000000 --- 
a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-DPOP_3e-6-2ep_0alp_5lam/7e504fef-b304-4c1a-856d-06e56a8869d7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen_0.5-DPOP_3e-6-2ep_0alp_5lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen_0.5-DPOP_3e-6-2ep_0alp_5lam", - "id": "JayHyeon/Qwen_0.5-DPOP_3e-6-2ep_0alp_5lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2Model", - "params_billions": 0.494 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2383 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3218 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0431 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2794 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3342 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1503 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-DPOP_3e-6-3ep_0alp_5lam/f8258f5e-8826-4fe1-b9d3-61708e79d4ab.json b/data/hfopenllm_v2/JayHyeon/Qwen_0.5-DPOP_3e-6-3ep_0alp_5lam/f8258f5e-8826-4fe1-b9d3-61708e79d4ab.json deleted file mode 100644 index 0f50c78cd..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-DPOP_3e-6-3ep_0alp_5lam/f8258f5e-8826-4fe1-b9d3-61708e79d4ab.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - 
"evaluation_id": "hfopenllm_v2/JayHyeon_Qwen_0.5-DPOP_3e-6-3ep_0alp_5lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen_0.5-DPOP_3e-6-3ep_0alp_5lam", - "id": "JayHyeon/Qwen_0.5-DPOP_3e-6-3ep_0alp_5lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2471 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3224 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.04 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2701 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3328 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1533 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-DPOP_3e-7-1ep_0alp_5lam/099ce031-1e11-4a07-bac1-03bef9b915d6.json b/data/hfopenllm_v2/JayHyeon/Qwen_0.5-DPOP_3e-7-1ep_0alp_5lam/099ce031-1e11-4a07-bac1-03bef9b915d6.json deleted file mode 100644 index 0b464aa1e..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-DPOP_3e-7-1ep_0alp_5lam/099ce031-1e11-4a07-bac1-03bef9b915d6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen_0.5-DPOP_3e-7-1ep_0alp_5lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": 
"HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen_0.5-DPOP_3e-7-1ep_0alp_5lam", - "id": "JayHyeon/Qwen_0.5-DPOP_3e-7-1ep_0alp_5lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2Model", - "params_billions": 0.494 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2447 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3181 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0438 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2617 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3341 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1565 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-DPOP_3e-7-2ep_0alp_5lam/75ff25fd-e5f7-4380-b192-cbc8a8ee95aa.json b/data/hfopenllm_v2/JayHyeon/Qwen_0.5-DPOP_3e-7-2ep_0alp_5lam/75ff25fd-e5f7-4380-b192-cbc8a8ee95aa.json deleted file mode 100644 index c6cea6b5c..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-DPOP_3e-7-2ep_0alp_5lam/75ff25fd-e5f7-4380-b192-cbc8a8ee95aa.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen_0.5-DPOP_3e-7-2ep_0alp_5lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": 
"Qwen_0.5-DPOP_3e-7-2ep_0alp_5lam", - "id": "JayHyeon/Qwen_0.5-DPOP_3e-7-2ep_0alp_5lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2Model", - "params_billions": 0.494 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2551 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3194 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0446 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2617 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3262 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1567 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-DPOP_3e-7-3ep_0alp_5lam/cbc43c7a-d8ac-4b03-a383-703f7fa51757.json b/data/hfopenllm_v2/JayHyeon/Qwen_0.5-DPOP_3e-7-3ep_0alp_5lam/cbc43c7a-d8ac-4b03-a383-703f7fa51757.json deleted file mode 100644 index 6baf33d2a..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-DPOP_3e-7-3ep_0alp_5lam/cbc43c7a-d8ac-4b03-a383-703f7fa51757.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen_0.5-DPOP_3e-7-3ep_0alp_5lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen_0.5-DPOP_3e-7-3ep_0alp_5lam", - "id": "JayHyeon/Qwen_0.5-DPOP_3e-7-3ep_0alp_5lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - 
"precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2538 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3153 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0415 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2676 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3261 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1583 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-DPOP_5e-7-1ep_0alp_5lam/72d7f252-1bff-40ad-9ec8-1ac2a2e02a8e.json b/data/hfopenllm_v2/JayHyeon/Qwen_0.5-DPOP_5e-7-1ep_0alp_5lam/72d7f252-1bff-40ad-9ec8-1ac2a2e02a8e.json deleted file mode 100644 index b8795c78b..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-DPOP_5e-7-1ep_0alp_5lam/72d7f252-1bff-40ad-9ec8-1ac2a2e02a8e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen_0.5-DPOP_5e-7-1ep_0alp_5lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen_0.5-DPOP_5e-7-1ep_0alp_5lam", - "id": "JayHyeon/Qwen_0.5-DPOP_5e-7-1ep_0alp_5lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2Model", - "params_billions": 0.494 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - 
"dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2402 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3168 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0378 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2718 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3328 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1568 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-DPOP_5e-7-2ep_0alp_5lam/5eb10878-11e6-43ad-9bb5-658a3495129c.json b/data/hfopenllm_v2/JayHyeon/Qwen_0.5-DPOP_5e-7-2ep_0alp_5lam/5eb10878-11e6-43ad-9bb5-658a3495129c.json deleted file mode 100644 index 6f4423d45..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-DPOP_5e-7-2ep_0alp_5lam/5eb10878-11e6-43ad-9bb5-658a3495129c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen_0.5-DPOP_5e-7-2ep_0alp_5lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen_0.5-DPOP_5e-7-2ep_0alp_5lam", - "id": "JayHyeon/Qwen_0.5-DPOP_5e-7-2ep_0alp_5lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2Model", - "params_billions": 0.494 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2484 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3211 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0438 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2701 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3288 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1573 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-DPOP_5e-7-3ep_0alp_5lam/23b29cd4-cfd0-49f1-8959-c3aa8be9722f.json b/data/hfopenllm_v2/JayHyeon/Qwen_0.5-DPOP_5e-7-3ep_0alp_5lam/23b29cd4-cfd0-49f1-8959-c3aa8be9722f.json deleted file mode 100644 index e97c3a4f4..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-DPOP_5e-7-3ep_0alp_5lam/23b29cd4-cfd0-49f1-8959-c3aa8be9722f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen_0.5-DPOP_5e-7-3ep_0alp_5lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen_0.5-DPOP_5e-7-3ep_0alp_5lam", - "id": "JayHyeon/Qwen_0.5-DPOP_5e-7-3ep_0alp_5lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2578 - } - }, - { - "evaluation_name": 
"BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3203 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0423 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.271 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3289 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1583 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-DPO_1e-6-3ep_0alp_0lam/03db2532-f8e0-41e9-ac0c-ff2913f4b12a.json b/data/hfopenllm_v2/JayHyeon/Qwen_0.5-DPO_1e-6-3ep_0alp_0lam/03db2532-f8e0-41e9-ac0c-ff2913f4b12a.json deleted file mode 100644 index 5542366cc..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-DPO_1e-6-3ep_0alp_0lam/03db2532-f8e0-41e9-ac0c-ff2913f4b12a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen_0.5-DPO_1e-6-3ep_0alp_0lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen_0.5-DPO_1e-6-3ep_0alp_0lam", - "id": "JayHyeon/Qwen_0.5-DPO_1e-6-3ep_0alp_0lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2316 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on 
BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3258 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0529 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2693 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3221 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.158 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-DPO_1e-7-3ep_0alp_0lam/273f0d50-aa4e-4469-8360-2ce0a2e1a850.json b/data/hfopenllm_v2/JayHyeon/Qwen_0.5-DPO_1e-7-3ep_0alp_0lam/273f0d50-aa4e-4469-8360-2ce0a2e1a850.json deleted file mode 100644 index fcfa337b4..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-DPO_1e-7-3ep_0alp_0lam/273f0d50-aa4e-4469-8360-2ce0a2e1a850.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen_0.5-DPO_1e-7-3ep_0alp_0lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen_0.5-DPO_1e-7-3ep_0alp_0lam", - "id": "JayHyeon/Qwen_0.5-DPO_1e-7-3ep_0alp_0lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.236 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3225 - } - }, - { - "evaluation_name": 
"MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0438 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.271 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3222 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1596 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-DPO_3e-6-1ep_0alp_0lam/79a48e79-d59b-4f86-a8f4-3af174a9ee0b.json b/data/hfopenllm_v2/JayHyeon/Qwen_0.5-DPO_3e-6-1ep_0alp_0lam/79a48e79-d59b-4f86-a8f4-3af174a9ee0b.json deleted file mode 100644 index 8cad12a77..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-DPO_3e-6-1ep_0alp_0lam/79a48e79-d59b-4f86-a8f4-3af174a9ee0b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen_0.5-DPO_3e-6-1ep_0alp_0lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen_0.5-DPO_3e-6-1ep_0alp_0lam", - "id": "JayHyeon/Qwen_0.5-DPO_3e-6-1ep_0alp_0lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2Model", - "params_billions": 0.494 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2337 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3132 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - 
"evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0347 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2609 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3235 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1533 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-DPO_3e-6-2ep_0alp_0lam/9da9a0e6-257a-41f6-b3a3-e3279a4924db.json b/data/hfopenllm_v2/JayHyeon/Qwen_0.5-DPO_3e-6-2ep_0alp_0lam/9da9a0e6-257a-41f6-b3a3-e3279a4924db.json deleted file mode 100644 index 8fcff6407..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-DPO_3e-6-2ep_0alp_0lam/9da9a0e6-257a-41f6-b3a3-e3279a4924db.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen_0.5-DPO_3e-6-2ep_0alp_0lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen_0.5-DPO_3e-6-2ep_0alp_0lam", - "id": "JayHyeon/Qwen_0.5-DPO_3e-6-2ep_0alp_0lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2Model", - "params_billions": 0.494 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2569 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3276 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.0544 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2718 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3156 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1565 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-DPO_3e-6-3ep_0alp_0lam/dfed058c-48b2-4e1e-9a29-624771e3e9dd.json b/data/hfopenllm_v2/JayHyeon/Qwen_0.5-DPO_3e-6-3ep_0alp_0lam/dfed058c-48b2-4e1e-9a29-624771e3e9dd.json deleted file mode 100644 index 1408d7de5..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-DPO_3e-6-3ep_0alp_0lam/dfed058c-48b2-4e1e-9a29-624771e3e9dd.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen_0.5-DPO_3e-6-3ep_0alp_0lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen_0.5-DPO_3e-6-3ep_0alp_0lam", - "id": "JayHyeon/Qwen_0.5-DPO_3e-6-3ep_0alp_0lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.246 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3267 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0431 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2685 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3209 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1543 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-DPO_3e-7-1ep_0alp_0lam/bcb53a8a-1670-400c-aab6-bd8ed2ebcdf4.json b/data/hfopenllm_v2/JayHyeon/Qwen_0.5-DPO_3e-7-1ep_0alp_0lam/bcb53a8a-1670-400c-aab6-bd8ed2ebcdf4.json deleted file mode 100644 index 2ee2160cf..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-DPO_3e-7-1ep_0alp_0lam/bcb53a8a-1670-400c-aab6-bd8ed2ebcdf4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen_0.5-DPO_3e-7-1ep_0alp_0lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen_0.5-DPO_3e-7-1ep_0alp_0lam", - "id": "JayHyeon/Qwen_0.5-DPO_3e-7-1ep_0alp_0lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2Model", - "params_billions": 0.494 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2529 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3229 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0551 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": 
{ - "score": 0.2676 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3195 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1597 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-DPO_3e-7-2ep_0alp_0lam/8438a108-0d5d-48b6-b73a-981d13329daa.json b/data/hfopenllm_v2/JayHyeon/Qwen_0.5-DPO_3e-7-2ep_0alp_0lam/8438a108-0d5d-48b6-b73a-981d13329daa.json deleted file mode 100644 index 404b429a1..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-DPO_3e-7-2ep_0alp_0lam/8438a108-0d5d-48b6-b73a-981d13329daa.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen_0.5-DPO_3e-7-2ep_0alp_0lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen_0.5-DPO_3e-7-2ep_0alp_0lam", - "id": "JayHyeon/Qwen_0.5-DPO_3e-7-2ep_0alp_0lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2Model", - "params_billions": 0.494 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2505 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3256 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0476 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2718 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3195 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1599 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-DPO_3e-7-3ep_0alp_0lam/88616292-1e38-4481-af30-6b60e28fb097.json b/data/hfopenllm_v2/JayHyeon/Qwen_0.5-DPO_3e-7-3ep_0alp_0lam/88616292-1e38-4481-af30-6b60e28fb097.json deleted file mode 100644 index 6805ba8e2..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-DPO_3e-7-3ep_0alp_0lam/88616292-1e38-4481-af30-6b60e28fb097.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen_0.5-DPO_3e-7-3ep_0alp_0lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen_0.5-DPO_3e-7-3ep_0alp_0lam", - "id": "JayHyeon/Qwen_0.5-DPO_3e-7-3ep_0alp_0lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2387 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3258 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0446 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2743 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.3169 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1589 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-DPO_5e-7-1ep_0alp_0lam/44094907-0b09-4706-a117-116a7e10a6e5.json b/data/hfopenllm_v2/JayHyeon/Qwen_0.5-DPO_5e-7-1ep_0alp_0lam/44094907-0b09-4706-a117-116a7e10a6e5.json deleted file mode 100644 index 7ed0e9876..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-DPO_5e-7-1ep_0alp_0lam/44094907-0b09-4706-a117-116a7e10a6e5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen_0.5-DPO_5e-7-1ep_0alp_0lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen_0.5-DPO_5e-7-1ep_0alp_0lam", - "id": "JayHyeon/Qwen_0.5-DPO_5e-7-1ep_0alp_0lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2532 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3218 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0634 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2685 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3209 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": 
"TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1593 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-DPO_5e-7-2ep_0alp_0lam/d19e8078-87e9-4760-9b91-6b5f478820e1.json b/data/hfopenllm_v2/JayHyeon/Qwen_0.5-DPO_5e-7-2ep_0alp_0lam/d19e8078-87e9-4760-9b91-6b5f478820e1.json deleted file mode 100644 index 97e1295f7..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-DPO_5e-7-2ep_0alp_0lam/d19e8078-87e9-4760-9b91-6b5f478820e1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen_0.5-DPO_5e-7-2ep_0alp_0lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen_0.5-DPO_5e-7-2ep_0alp_0lam", - "id": "JayHyeon/Qwen_0.5-DPO_5e-7-2ep_0alp_0lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2Model", - "params_billions": 0.494 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2456 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3299 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0536 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.271 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3181 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.1602 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-DPO_5e-7-3ep_0alp_0lam/896464f1-01bc-4370-8d90-3368323b2908.json b/data/hfopenllm_v2/JayHyeon/Qwen_0.5-DPO_5e-7-3ep_0alp_0lam/896464f1-01bc-4370-8d90-3368323b2908.json deleted file mode 100644 index b8bf7d1e7..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-DPO_5e-7-3ep_0alp_0lam/896464f1-01bc-4370-8d90-3368323b2908.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen_0.5-DPO_5e-7-3ep_0alp_0lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen_0.5-DPO_5e-7-3ep_0alp_0lam", - "id": "JayHyeon/Qwen_0.5-DPO_5e-7-3ep_0alp_0lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2423 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3271 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0514 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2743 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3181 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1595 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-IPO_5e-7-1ep_0alp_0lam/9889f0b9-9051-485c-bd44-32b1e56b865c.json b/data/hfopenllm_v2/JayHyeon/Qwen_0.5-IPO_5e-7-1ep_0alp_0lam/9889f0b9-9051-485c-bd44-32b1e56b865c.json deleted file mode 100644 index 71c059874..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-IPO_5e-7-1ep_0alp_0lam/9889f0b9-9051-485c-bd44-32b1e56b865c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen_0.5-IPO_5e-7-1ep_0alp_0lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen_0.5-IPO_5e-7-1ep_0alp_0lam", - "id": "JayHyeon/Qwen_0.5-IPO_5e-7-1ep_0alp_0lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2574 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3279 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0559 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2693 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3169 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1651 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-IPO_5e-7-3ep_0alp_0lam/6563ce79-6df4-4c78-89e2-064f1250d898.json 
b/data/hfopenllm_v2/JayHyeon/Qwen_0.5-IPO_5e-7-3ep_0alp_0lam/6563ce79-6df4-4c78-89e2-064f1250d898.json deleted file mode 100644 index 04b6f52a6..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-IPO_5e-7-3ep_0alp_0lam/6563ce79-6df4-4c78-89e2-064f1250d898.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen_0.5-IPO_5e-7-3ep_0alp_0lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen_0.5-IPO_5e-7-3ep_0alp_0lam", - "id": "JayHyeon/Qwen_0.5-IPO_5e-7-3ep_0alp_0lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3072 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3264 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0582 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2567 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3156 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1624 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-IRPO_1e-6-3ep_1alp_0lam/b1778755-e6e6-47e2-925d-44d786c4ff62.json b/data/hfopenllm_v2/JayHyeon/Qwen_0.5-IRPO_1e-6-3ep_1alp_0lam/b1778755-e6e6-47e2-925d-44d786c4ff62.json deleted file mode 100644 index 0810520c3..000000000 --- 
+++ /dev/null
@@ -1,132 +0,0 @@
-{
-  "schema_version": "0.2.0",
-  "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen_0.5-IRPO_1e-6-3ep_1alp_0lam/1770682486.623709",
-  "retrieved_timestamp": "1770682486.623709",
-  "source_metadata": {
-    "source_name": "HF Open LLM v2",
-    "source_type": "documentation",
-    "source_organization_name": "Hugging Face",
-    "evaluator_relationship": "third_party"
-  },
-  "model_info": {
-    "name": "Qwen_0.5-IRPO_1e-6-3ep_1alp_0lam",
-    "id": "JayHyeon/Qwen_0.5-IRPO_1e-6-3ep_1alp_0lam",
-    "developer": "JayHyeon",
-    "inference_platform": "unknown",
-    "additional_details": {
-      "precision": "bfloat16",
-      "architecture": "Qwen2ForCausalLM",
-      "params_billions": 0.63
-    }
-  },
-  "evaluation_results": [
-    {
-      "evaluation_name": "IFEval",
-      "source_data": {
-        "dataset_name": "IFEval",
-        "source_type": "hf_dataset",
-        "hf_repo": "google/IFEval"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on IFEval",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.2551
-      }
-    },
-    {
-      "evaluation_name": "BBH",
-      "source_data": {
-        "dataset_name": "BBH",
-        "source_type": "hf_dataset",
-        "hf_repo": "SaylorTwift/bbh"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on BBH",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.3242
-      }
-    },
-    {
-      "evaluation_name": "MATH Level 5",
-      "source_data": {
-        "dataset_name": "MATH Level 5",
-        "source_type": "hf_dataset",
-        "hf_repo": "DigitalLearningGmbH/MATH-lighteval"
-      },
-      "metric_config": {
-        "evaluation_description": "Exact Match on MATH Level 5",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.0468
-      }
-    },
-    {
-      "evaluation_name": "GPQA",
-      "source_data": {
-        "dataset_name": "GPQA",
-        "source_type": "hf_dataset",
-        "hf_repo": "Idavidrein/gpqa"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on GPQA",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.2668
-      }
-    },
-    {
-      "evaluation_name": "MUSR",
-      "source_data": {
-        "dataset_name": "MUSR",
-        "source_type": "hf_dataset",
-        "hf_repo": "TAUR-Lab/MuSR"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on MUSR",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.3182
-      }
-    },
-    {
-      "evaluation_name": "MMLU-PRO",
-      "source_data": {
-        "dataset_name": "MMLU-PRO",
-        "source_type": "hf_dataset",
-        "hf_repo": "TIGER-Lab/MMLU-Pro"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on MMLU-PRO",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.1574
-      }
-    }
-  ]
-}
\ No newline at end of file
diff --git a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-IRPO_1e-7-3ep_1alp_0lam/3ae923b8-e9f4-472e-8d5e-54fa5f42ce01.json b/data/hfopenllm_v2/JayHyeon/Qwen_0.5-IRPO_1e-7-3ep_1alp_0lam/3ae923b8-e9f4-472e-8d5e-54fa5f42ce01.json
deleted file mode 100644
index c28c2cd0f..000000000
--- a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-IRPO_1e-7-3ep_1alp_0lam/3ae923b8-e9f4-472e-8d5e-54fa5f42ce01.json
+++ /dev/null
@@ -1,132 +0,0 @@
-{
-  "schema_version": "0.2.0",
-  "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen_0.5-IRPO_1e-7-3ep_1alp_0lam/1770682486.623709",
"evaluation_id": "hfopenllm_v2/JayHyeon_Qwen_0.5-IRPO_1e-7-3ep_1alp_0lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen_0.5-IRPO_1e-7-3ep_1alp_0lam", - "id": "JayHyeon/Qwen_0.5-IRPO_1e-7-3ep_1alp_0lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2636 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3198 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0514 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.276 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3262 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1586 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-IRPO_3e-6-1ep_1alp_0lam/40831e23-0a9e-4bdc-a365-9399b6b82ff9.json b/data/hfopenllm_v2/JayHyeon/Qwen_0.5-IRPO_3e-6-1ep_1alp_0lam/40831e23-0a9e-4bdc-a365-9399b6b82ff9.json deleted file mode 100644 index 69728b3a1..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-IRPO_3e-6-1ep_1alp_0lam/40831e23-0a9e-4bdc-a365-9399b6b82ff9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen_0.5-IRPO_3e-6-1ep_1alp_0lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - 
"source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen_0.5-IRPO_3e-6-1ep_1alp_0lam", - "id": "JayHyeon/Qwen_0.5-IRPO_3e-6-1ep_1alp_0lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2Model", - "params_billions": 0.494 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2323 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3255 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.037 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2508 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3169 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1612 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-IRPO_3e-6-2ep_1alp_0lam/4a60fa82-34dc-4b0c-9102-65adac5039e4.json b/data/hfopenllm_v2/JayHyeon/Qwen_0.5-IRPO_3e-6-2ep_1alp_0lam/4a60fa82-34dc-4b0c-9102-65adac5039e4.json deleted file mode 100644 index 0bf9d0574..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-IRPO_3e-6-2ep_1alp_0lam/4a60fa82-34dc-4b0c-9102-65adac5039e4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen_0.5-IRPO_3e-6-2ep_1alp_0lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - 
"name": "Qwen_0.5-IRPO_3e-6-2ep_1alp_0lam", - "id": "JayHyeon/Qwen_0.5-IRPO_3e-6-2ep_1alp_0lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2Model", - "params_billions": 0.494 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2414 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3314 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0347 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2517 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3342 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1532 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-IRPO_3e-6-3ep_1alp_0lam/75ff2c43-dd19-48ae-9ba3-f99cdbadda1c.json b/data/hfopenllm_v2/JayHyeon/Qwen_0.5-IRPO_3e-6-3ep_1alp_0lam/75ff2c43-dd19-48ae-9ba3-f99cdbadda1c.json deleted file mode 100644 index 493b1d463..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-IRPO_3e-6-3ep_1alp_0lam/75ff2c43-dd19-48ae-9ba3-f99cdbadda1c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen_0.5-IRPO_3e-6-3ep_1alp_0lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen_0.5-IRPO_3e-6-3ep_1alp_0lam", - "id": "JayHyeon/Qwen_0.5-IRPO_3e-6-3ep_1alp_0lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - 
"additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2678 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3362 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0514 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2542 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3382 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1561 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-IRPO_3e-7-1ep_1alp_0lam/d7962833-660a-4b9b-9836-8a2f3251f38e.json b/data/hfopenllm_v2/JayHyeon/Qwen_0.5-IRPO_3e-7-1ep_1alp_0lam/d7962833-660a-4b9b-9836-8a2f3251f38e.json deleted file mode 100644 index 0290bbad0..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-IRPO_3e-7-1ep_1alp_0lam/d7962833-660a-4b9b-9836-8a2f3251f38e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen_0.5-IRPO_3e-7-1ep_1alp_0lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen_0.5-IRPO_3e-7-1ep_1alp_0lam", - "id": "JayHyeon/Qwen_0.5-IRPO_3e-7-1ep_1alp_0lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2Model", - "params_billions": 0.494 - } - }, - "evaluation_results": [ - { - "evaluation_name": 
"IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2561 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3231 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0536 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2718 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3196 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1589 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-IRPO_3e-7-3ep_1alp_0lam/ad8ecabf-a868-496e-892b-582efb54fa6a.json b/data/hfopenllm_v2/JayHyeon/Qwen_0.5-IRPO_3e-7-3ep_1alp_0lam/ad8ecabf-a868-496e-892b-582efb54fa6a.json deleted file mode 100644 index 5edc518ea..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-IRPO_3e-7-3ep_1alp_0lam/ad8ecabf-a868-496e-892b-582efb54fa6a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen_0.5-IRPO_3e-7-3ep_1alp_0lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen_0.5-IRPO_3e-7-3ep_1alp_0lam", - "id": "JayHyeon/Qwen_0.5-IRPO_3e-7-3ep_1alp_0lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": 
"Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2639 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3257 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0476 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2701 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3209 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1587 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-IRPO_5e-7-1ep_1alp_0lam/49f25d3d-80c9-4723-8fa9-1501d44d70aa.json b/data/hfopenllm_v2/JayHyeon/Qwen_0.5-IRPO_5e-7-1ep_1alp_0lam/49f25d3d-80c9-4723-8fa9-1501d44d70aa.json deleted file mode 100644 index 2d9b9aac9..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-IRPO_5e-7-1ep_1alp_0lam/49f25d3d-80c9-4723-8fa9-1501d44d70aa.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen_0.5-IRPO_5e-7-1ep_1alp_0lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen_0.5-IRPO_5e-7-1ep_1alp_0lam", - "id": "JayHyeon/Qwen_0.5-IRPO_5e-7-1ep_1alp_0lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2Model", - "params_billions": 0.494 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2518 - } - }, - { - 
"evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3214 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0574 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2735 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3169 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1585 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-IRPO_5e-7-2ep_1alp_0lam/70ea520c-3e0c-4412-9dbe-40a00801335c.json b/data/hfopenllm_v2/JayHyeon/Qwen_0.5-IRPO_5e-7-2ep_1alp_0lam/70ea520c-3e0c-4412-9dbe-40a00801335c.json deleted file mode 100644 index 47326edc5..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-IRPO_5e-7-2ep_1alp_0lam/70ea520c-3e0c-4412-9dbe-40a00801335c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen_0.5-IRPO_5e-7-2ep_1alp_0lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen_0.5-IRPO_5e-7-2ep_1alp_0lam", - "id": "JayHyeon/Qwen_0.5-IRPO_5e-7-2ep_1alp_0lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2Model", - "params_billions": 0.494 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2438 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3266 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0619 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2727 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3196 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1554 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-IRPO_5e-7-3ep_1alp_0lam/8e7f8bad-812b-4f6c-8dea-1cf44584c300.json b/data/hfopenllm_v2/JayHyeon/Qwen_0.5-IRPO_5e-7-3ep_1alp_0lam/8e7f8bad-812b-4f6c-8dea-1cf44584c300.json deleted file mode 100644 index 2ecc04e03..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-IRPO_5e-7-3ep_1alp_0lam/8e7f8bad-812b-4f6c-8dea-1cf44584c300.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen_0.5-IRPO_5e-7-3ep_1alp_0lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen_0.5-IRPO_5e-7-3ep_1alp_0lam", - "id": "JayHyeon/Qwen_0.5-IRPO_5e-7-3ep_1alp_0lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2465 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.3246 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0529 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2718 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3182 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1563 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-MDPO_0.1_3e-6-3ep_0alp_0lam/3b39a8f0-c5ba-4f74-9d27-bf5b389e038c.json b/data/hfopenllm_v2/JayHyeon/Qwen_0.5-MDPO_0.1_3e-6-3ep_0alp_0lam/3b39a8f0-c5ba-4f74-9d27-bf5b389e038c.json deleted file mode 100644 index fdc8cfb2c..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-MDPO_0.1_3e-6-3ep_0alp_0lam/3b39a8f0-c5ba-4f74-9d27-bf5b389e038c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen_0.5-MDPO_0.1_3e-6-3ep_0alp_0lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen_0.5-MDPO_0.1_3e-6-3ep_0alp_0lam", - "id": "JayHyeon/Qwen_0.5-MDPO_0.1_3e-6-3ep_0alp_0lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2506 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3261 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - 
"hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0498 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2819 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3382 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1522 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-MDPO_0.1_5e-7-3ep_0alp_0lam/702a14d5-a7fd-4926-ab26-e4c3b7f5eda7.json b/data/hfopenllm_v2/JayHyeon/Qwen_0.5-MDPO_0.1_5e-7-3ep_0alp_0lam/702a14d5-a7fd-4926-ab26-e4c3b7f5eda7.json deleted file mode 100644 index ed6db3b8d..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-MDPO_0.1_5e-7-3ep_0alp_0lam/702a14d5-a7fd-4926-ab26-e4c3b7f5eda7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen_0.5-MDPO_0.1_5e-7-3ep_0alp_0lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen_0.5-MDPO_0.1_5e-7-3ep_0alp_0lam", - "id": "JayHyeon/Qwen_0.5-MDPO_0.1_5e-7-3ep_0alp_0lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2457 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.318 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": 
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.0347
-      }
-    },
-    {
-      "evaluation_name": "GPQA",
-      "source_data": {
-        "dataset_name": "GPQA",
-        "source_type": "hf_dataset",
-        "hf_repo": "Idavidrein/gpqa"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on GPQA",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.2634
-      }
-    },
-    {
-      "evaluation_name": "MUSR",
-      "source_data": {
-        "dataset_name": "MUSR",
-        "source_type": "hf_dataset",
-        "hf_repo": "TAUR-Lab/MuSR"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on MUSR",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.3315
-      }
-    },
-    {
-      "evaluation_name": "MMLU-PRO",
-      "source_data": {
-        "dataset_name": "MMLU-PRO",
-        "source_type": "hf_dataset",
-        "hf_repo": "TIGER-Lab/MMLU-Pro"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on MMLU-PRO",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.1566
-      }
-    }
-  ]
-}
\ No newline at end of file
diff --git a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-MDPO_0.3_3e-6-3ep_0alp_0lam/20e5d087-7b20-4a39-81da-7334354b61f0.json b/data/hfopenllm_v2/JayHyeon/Qwen_0.5-MDPO_0.3_3e-6-3ep_0alp_0lam/20e5d087-7b20-4a39-81da-7334354b61f0.json
deleted file mode 100644
index 2b13ae37e..000000000
--- a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-MDPO_0.3_3e-6-3ep_0alp_0lam/20e5d087-7b20-4a39-81da-7334354b61f0.json
+++ /dev/null
@@ -1,132 +0,0 @@
-{
-  "schema_version": "0.2.0",
-  "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen_0.5-MDPO_0.3_3e-6-3ep_0alp_0lam/1770682486.623709",
-  "retrieved_timestamp": "1770682486.623709",
-  "source_metadata": {
-    "source_name": "HF Open LLM v2",
-    "source_type": "documentation",
-    "source_organization_name": "Hugging Face",
-    "evaluator_relationship": "third_party"
-  },
-  "model_info": {
-    "name": "Qwen_0.5-MDPO_0.3_3e-6-3ep_0alp_0lam",
-    "id": "JayHyeon/Qwen_0.5-MDPO_0.3_3e-6-3ep_0alp_0lam",
-    "developer": "JayHyeon",
-    "inference_platform": "unknown",
-    "additional_details": {
-      "precision": "bfloat16",
-      "architecture": "Qwen2ForCausalLM",
-      "params_billions": 0.63
-    }
-  },
-  "evaluation_results": [
-    {
-      "evaluation_name": "IFEval",
-      "source_data": {
-        "dataset_name": "IFEval",
-        "source_type": "hf_dataset",
-        "hf_repo": "google/IFEval"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on IFEval",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.2454
-      }
-    },
-    {
-      "evaluation_name": "BBH",
-      "source_data": {
-        "dataset_name": "BBH",
-        "source_type": "hf_dataset",
-        "hf_repo": "SaylorTwift/bbh"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on BBH",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.3216
-      }
-    },
-    {
-      "evaluation_name": "MATH Level 5",
-      "source_data": {
-        "dataset_name": "MATH Level 5",
-        "source_type": "hf_dataset",
-        "hf_repo": "DigitalLearningGmbH/MATH-lighteval"
-      },
-      "metric_config": {
-        "evaluation_description": "Exact Match on MATH Level 5",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.0506
-      }
-    },
-    {
-      "evaluation_name": "GPQA",
"evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2802 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3382 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1544 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-MDPO_0.3_5e-7-3ep_0alp_0lam/4c5a769c-0472-402c-8e97-d24e5b302bac.json b/data/hfopenllm_v2/JayHyeon/Qwen_0.5-MDPO_0.3_5e-7-3ep_0alp_0lam/4c5a769c-0472-402c-8e97-d24e5b302bac.json deleted file mode 100644 index 455e40c44..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-MDPO_0.3_5e-7-3ep_0alp_0lam/4c5a769c-0472-402c-8e97-d24e5b302bac.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen_0.5-MDPO_0.3_5e-7-3ep_0alp_0lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen_0.5-MDPO_0.3_5e-7-3ep_0alp_0lam", - "id": "JayHyeon/Qwen_0.5-MDPO_0.3_5e-7-3ep_0alp_0lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2342 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3189 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.04 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2701 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3302 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.158 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-MDPO_0.5_1e-5-3ep_0alp_0lam/96166735-ed03-4931-81c9-d3daed1913d9.json b/data/hfopenllm_v2/JayHyeon/Qwen_0.5-MDPO_0.5_1e-5-3ep_0alp_0lam/96166735-ed03-4931-81c9-d3daed1913d9.json deleted file mode 100644 index 4438ad84e..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-MDPO_0.5_1e-5-3ep_0alp_0lam/96166735-ed03-4931-81c9-d3daed1913d9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen_0.5-MDPO_0.5_1e-5-3ep_0alp_0lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen_0.5-MDPO_0.5_1e-5-3ep_0alp_0lam", - "id": "JayHyeon/Qwen_0.5-MDPO_0.5_1e-5-3ep_0alp_0lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.232 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3234 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0393 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.2743 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3369 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1543 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-MDPO_0.5_3e-7-1ep_0alp_0lam/06d9b1e3-d054-4fa5-bf1f-9d6149e5111c.json b/data/hfopenllm_v2/JayHyeon/Qwen_0.5-MDPO_0.5_3e-7-1ep_0alp_0lam/06d9b1e3-d054-4fa5-bf1f-9d6149e5111c.json deleted file mode 100644 index 9827aba86..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-MDPO_0.5_3e-7-1ep_0alp_0lam/06d9b1e3-d054-4fa5-bf1f-9d6149e5111c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen_0.5-MDPO_0.5_3e-7-1ep_0alp_0lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen_0.5-MDPO_0.5_3e-7-1ep_0alp_0lam", - "id": "JayHyeon/Qwen_0.5-MDPO_0.5_3e-7-1ep_0alp_0lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2Model", - "params_billions": 0.494 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2418 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3175 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0423 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2626 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - 
"source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3288 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.158 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-MDPO_0.5_3e-7-2ep_0alp_0lam/776fd8d8-9846-4359-97d4-2340425d1315.json b/data/hfopenllm_v2/JayHyeon/Qwen_0.5-MDPO_0.5_3e-7-2ep_0alp_0lam/776fd8d8-9846-4359-97d4-2340425d1315.json deleted file mode 100644 index 9a8c26353..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-MDPO_0.5_3e-7-2ep_0alp_0lam/776fd8d8-9846-4359-97d4-2340425d1315.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen_0.5-MDPO_0.5_3e-7-2ep_0alp_0lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen_0.5-MDPO_0.5_3e-7-2ep_0alp_0lam", - "id": "JayHyeon/Qwen_0.5-MDPO_0.5_3e-7-2ep_0alp_0lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2Model", - "params_billions": 0.494 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2493 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3197 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0423 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2701 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": 
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.3315
-      }
-    },
-    {
-      "evaluation_name": "MMLU-PRO",
-      "source_data": {
-        "dataset_name": "MMLU-PRO",
-        "source_type": "hf_dataset",
-        "hf_repo": "TIGER-Lab/MMLU-Pro"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on MMLU-PRO",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.1571
-      }
-    }
-  ]
-}
\ No newline at end of file
diff --git a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-MDPO_0.5_3e-7-3ep_0alp_0lam/197ae1c5-c9b1-4912-91a3-8ccacddc1be6.json b/data/hfopenllm_v2/JayHyeon/Qwen_0.5-MDPO_0.5_3e-7-3ep_0alp_0lam/197ae1c5-c9b1-4912-91a3-8ccacddc1be6.json
deleted file mode 100644
index ecb5d143e..000000000
--- a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-MDPO_0.5_3e-7-3ep_0alp_0lam/197ae1c5-c9b1-4912-91a3-8ccacddc1be6.json
+++ /dev/null
@@ -1,132 +0,0 @@
-{
-  "schema_version": "0.2.0",
-  "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen_0.5-MDPO_0.5_3e-7-3ep_0alp_0lam/1770682486.623709",
-  "retrieved_timestamp": "1770682486.623709",
-  "source_metadata": {
-    "source_name": "HF Open LLM v2",
-    "source_type": "documentation",
-    "source_organization_name": "Hugging Face",
-    "evaluator_relationship": "third_party"
-  },
-  "model_info": {
-    "name": "Qwen_0.5-MDPO_0.5_3e-7-3ep_0alp_0lam",
-    "id": "JayHyeon/Qwen_0.5-MDPO_0.5_3e-7-3ep_0alp_0lam",
-    "developer": "JayHyeon",
-    "inference_platform": "unknown",
-    "additional_details": {
-      "precision": "bfloat16",
-      "architecture": "Qwen2ForCausalLM",
-      "params_billions": 0.63
-    }
-  },
-  "evaluation_results": [
-    {
-      "evaluation_name": "IFEval",
-      "source_data": {
-        "dataset_name": "IFEval",
-        "source_type": "hf_dataset",
-        "hf_repo": "google/IFEval"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on IFEval",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.252
-      }
-    },
-    {
-      "evaluation_name": "BBH",
-      "source_data": {
-        "dataset_name": "BBH",
-        "source_type": "hf_dataset",
-        "hf_repo": "SaylorTwift/bbh"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on BBH",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.3198
-      }
-    },
-    {
-      "evaluation_name": "MATH Level 5",
-      "source_data": {
-        "dataset_name": "MATH Level 5",
-        "source_type": "hf_dataset",
-        "hf_repo": "DigitalLearningGmbH/MATH-lighteval"
-      },
-      "metric_config": {
-        "evaluation_description": "Exact Match on MATH Level 5",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.0423
-      }
-    },
-    {
-      "evaluation_name": "GPQA",
-      "source_data": {
-        "dataset_name": "GPQA",
-        "source_type": "hf_dataset",
-        "hf_repo": "Idavidrein/gpqa"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on GPQA",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.2634
-      }
-    },
-    {
-      "evaluation_name": "MUSR",
-      "source_data": {
-        "dataset_name": "MUSR",
-        "source_type": "hf_dataset",
-        "hf_repo": "TAUR-Lab/MuSR"
-      },
-      "metric_config": {
-        "evaluation_description": "Accuracy on MUSR",
-        "lower_is_better": false,
-        "score_type": "continuous",
-        "min_score": 0.0,
-        "max_score": 1.0
-      },
-      "score_details": {
-        "score": 0.3262
-      }
-    },
-    {
-      "evaluation_name": "MMLU-PRO",
"MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1551 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-MDPO_0.5_4e-6-3ep_0alp_0lam/1fffd3d9-1c6b-4965-84e6-980bb0a13af3.json b/data/hfopenllm_v2/JayHyeon/Qwen_0.5-MDPO_0.5_4e-6-3ep_0alp_0lam/1fffd3d9-1c6b-4965-84e6-980bb0a13af3.json deleted file mode 100644 index 0e4125637..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-MDPO_0.5_4e-6-3ep_0alp_0lam/1fffd3d9-1c6b-4965-84e6-980bb0a13af3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen_0.5-MDPO_0.5_4e-6-3ep_0alp_0lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen_0.5-MDPO_0.5_4e-6-3ep_0alp_0lam", - "id": "JayHyeon/Qwen_0.5-MDPO_0.5_4e-6-3ep_0alp_0lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.258 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3248 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0476 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2752 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3422 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1539 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-MDPO_0.5_6e-6-3ep_0alp_0lam/57e8aaf0-f10b-4024-9f93-7b7f13f3ab10.json b/data/hfopenllm_v2/JayHyeon/Qwen_0.5-MDPO_0.5_6e-6-3ep_0alp_0lam/57e8aaf0-f10b-4024-9f93-7b7f13f3ab10.json deleted file mode 100644 index a66e2fd4a..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-MDPO_0.5_6e-6-3ep_0alp_0lam/57e8aaf0-f10b-4024-9f93-7b7f13f3ab10.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen_0.5-MDPO_0.5_6e-6-3ep_0alp_0lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen_0.5-MDPO_0.5_6e-6-3ep_0alp_0lam", - "id": "JayHyeon/Qwen_0.5-MDPO_0.5_6e-6-3ep_0alp_0lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.232 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3265 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0385 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.271 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3395 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.1537 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-MDPO_0.5_7e-6-3ep_0alp_0lam/304d5bee-df2d-40fc-b4a0-e3d99178f4bd.json b/data/hfopenllm_v2/JayHyeon/Qwen_0.5-MDPO_0.5_7e-6-3ep_0alp_0lam/304d5bee-df2d-40fc-b4a0-e3d99178f4bd.json deleted file mode 100644 index a651a40c3..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-MDPO_0.5_7e-6-3ep_0alp_0lam/304d5bee-df2d-40fc-b4a0-e3d99178f4bd.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen_0.5-MDPO_0.5_7e-6-3ep_0alp_0lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen_0.5-MDPO_0.5_7e-6-3ep_0alp_0lam", - "id": "JayHyeon/Qwen_0.5-MDPO_0.5_7e-6-3ep_0alp_0lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2488 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3273 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0461 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2718 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3342 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1531 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-MDPO_0.5_7e-7-3ep_0alp_0lam/6126d30d-e2dd-4b8b-9cb3-acdc76084bbb.json b/data/hfopenllm_v2/JayHyeon/Qwen_0.5-MDPO_0.5_7e-7-3ep_0alp_0lam/6126d30d-e2dd-4b8b-9cb3-acdc76084bbb.json deleted file mode 100644 index 19c99bb89..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-MDPO_0.5_7e-7-3ep_0alp_0lam/6126d30d-e2dd-4b8b-9cb3-acdc76084bbb.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen_0.5-MDPO_0.5_7e-7-3ep_0alp_0lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen_0.5-MDPO_0.5_7e-7-3ep_0alp_0lam", - "id": "JayHyeon/Qwen_0.5-MDPO_0.5_7e-7-3ep_0alp_0lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2524 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.313 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0446 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.271 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3289 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1564 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-MDPO_0.7_3e-6-3ep_0alp_0lam/fc7284d9-a73f-4562-a781-5cb87247183f.json 
b/data/hfopenllm_v2/JayHyeon/Qwen_0.5-MDPO_0.7_3e-6-3ep_0alp_0lam/fc7284d9-a73f-4562-a781-5cb87247183f.json deleted file mode 100644 index 4c3d00e8d..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-MDPO_0.7_3e-6-3ep_0alp_0lam/fc7284d9-a73f-4562-a781-5cb87247183f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen_0.5-MDPO_0.7_3e-6-3ep_0alp_0lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen_0.5-MDPO_0.7_3e-6-3ep_0alp_0lam", - "id": "JayHyeon/Qwen_0.5-MDPO_0.7_3e-6-3ep_0alp_0lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2514 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3221 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0438 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2752 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3315 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1538 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-MDPO_0.7_5e-7-3ep_0alp_0lam/26ab447c-a850-4197-983a-a0dca4532029.json b/data/hfopenllm_v2/JayHyeon/Qwen_0.5-MDPO_0.7_5e-7-3ep_0alp_0lam/26ab447c-a850-4197-983a-a0dca4532029.json deleted file mode 100644 index 
c06a3ff06..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-MDPO_0.7_5e-7-3ep_0alp_0lam/26ab447c-a850-4197-983a-a0dca4532029.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen_0.5-MDPO_0.7_5e-7-3ep_0alp_0lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen_0.5-MDPO_0.7_5e-7-3ep_0alp_0lam", - "id": "JayHyeon/Qwen_0.5-MDPO_0.7_5e-7-3ep_0alp_0lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2457 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.318 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0385 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2668 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3275 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1572 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-MDPO_0.9_5e-7-3ep_0alp_0lam/ee9e2131-aa99-49e1-9814-f0664614354b.json b/data/hfopenllm_v2/JayHyeon/Qwen_0.5-MDPO_0.9_5e-7-3ep_0alp_0lam/ee9e2131-aa99-49e1-9814-f0664614354b.json deleted file mode 100644 index 56da94f33..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-MDPO_0.9_5e-7-3ep_0alp_0lam/ee9e2131-aa99-49e1-9814-f0664614354b.json +++ /dev/null @@ 
-1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen_0.5-MDPO_0.9_5e-7-3ep_0alp_0lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen_0.5-MDPO_0.9_5e-7-3ep_0alp_0lam", - "id": "JayHyeon/Qwen_0.5-MDPO_0.9_5e-7-3ep_0alp_0lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2636 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3181 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0476 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2659 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3235 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1574 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-VDPO_3e-6-1ep_3vpo_const/23c472f7-f060-4a69-8f72-12490675825a.json b/data/hfopenllm_v2/JayHyeon/Qwen_0.5-VDPO_3e-6-1ep_3vpo_const/23c472f7-f060-4a69-8f72-12490675825a.json deleted file mode 100644 index 070354a8d..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-VDPO_3e-6-1ep_3vpo_const/23c472f7-f060-4a69-8f72-12490675825a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen_0.5-VDPO_3e-6-1ep_3vpo_const/1770682486.623709", - 
"retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen_0.5-VDPO_3e-6-1ep_3vpo_const", - "id": "JayHyeon/Qwen_0.5-VDPO_3e-6-1ep_3vpo_const", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2483 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3174 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0378 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2542 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3328 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1558 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-VDPO_5e-7-1ep_0alp_0lam/04172bef-c06b-4c08-b2af-9e1fe4d97664.json b/data/hfopenllm_v2/JayHyeon/Qwen_0.5-VDPO_5e-7-1ep_0alp_0lam/04172bef-c06b-4c08-b2af-9e1fe4d97664.json deleted file mode 100644 index c722b726e..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-VDPO_5e-7-1ep_0alp_0lam/04172bef-c06b-4c08-b2af-9e1fe4d97664.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen_0.5-VDPO_5e-7-1ep_0alp_0lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging 
Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen_0.5-VDPO_5e-7-1ep_0alp_0lam", - "id": "JayHyeon/Qwen_0.5-VDPO_5e-7-1ep_0alp_0lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2518 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3218 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0529 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2718 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3235 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1595 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-VDPO_5e-7-1ep_10vpo_const/3436355a-d2fe-411f-a764-4cb8284deb4c.json b/data/hfopenllm_v2/JayHyeon/Qwen_0.5-VDPO_5e-7-1ep_10vpo_const/3436355a-d2fe-411f-a764-4cb8284deb4c.json deleted file mode 100644 index 7ac21a2a4..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-VDPO_5e-7-1ep_10vpo_const/3436355a-d2fe-411f-a764-4cb8284deb4c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen_0.5-VDPO_5e-7-1ep_10vpo_const/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen_0.5-VDPO_5e-7-1ep_10vpo_const", - "id": 
"JayHyeon/Qwen_0.5-VDPO_5e-7-1ep_10vpo_const", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2536 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3234 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0491 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.276 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3236 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1597 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-VDPO_5e-7-1ep_1vpo_const/265655c0-2ead-4dd7-8c7e-4bee69d51bce.json b/data/hfopenllm_v2/JayHyeon/Qwen_0.5-VDPO_5e-7-1ep_1vpo_const/265655c0-2ead-4dd7-8c7e-4bee69d51bce.json deleted file mode 100644 index d1101d0a7..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-VDPO_5e-7-1ep_1vpo_const/265655c0-2ead-4dd7-8c7e-4bee69d51bce.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen_0.5-VDPO_5e-7-1ep_1vpo_const/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen_0.5-VDPO_5e-7-1ep_1vpo_const", - "id": "JayHyeon/Qwen_0.5-VDPO_5e-7-1ep_1vpo_const", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - 
"architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2448 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.324 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0604 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2752 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3249 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1587 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-VDPO_5e-7-1ep_3vpo_const/645cae82-9e7b-4d1b-b944-e3783089c1c1.json b/data/hfopenllm_v2/JayHyeon/Qwen_0.5-VDPO_5e-7-1ep_3vpo_const/645cae82-9e7b-4d1b-b944-e3783089c1c1.json deleted file mode 100644 index f0b7804e2..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-VDPO_5e-7-1ep_3vpo_const/645cae82-9e7b-4d1b-b944-e3783089c1c1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen_0.5-VDPO_5e-7-1ep_3vpo_const/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen_0.5-VDPO_5e-7-1ep_3vpo_const", - "id": "JayHyeon/Qwen_0.5-VDPO_5e-7-1ep_3vpo_const", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": 
"IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2505 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3227 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0468 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.271 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3209 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1589 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-VDPO_5e-7-3ep_0alp_0lam/ab658117-7c6b-428f-8f60-bf88a1d8a5bc.json b/data/hfopenllm_v2/JayHyeon/Qwen_0.5-VDPO_5e-7-3ep_0alp_0lam/ab658117-7c6b-428f-8f60-bf88a1d8a5bc.json deleted file mode 100644 index a26877761..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-VDPO_5e-7-3ep_0alp_0lam/ab658117-7c6b-428f-8f60-bf88a1d8a5bc.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen_0.5-VDPO_5e-7-3ep_0alp_0lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen_0.5-VDPO_5e-7-3ep_0alp_0lam", - "id": "JayHyeon/Qwen_0.5-VDPO_5e-7-3ep_0alp_0lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2472 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3255 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0498 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2752 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3208 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1587 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-VDPO_5e-7-3ep_1vpo_const/03c4b5ce-3b22-4d9f-bf60-b626b52a114b.json b/data/hfopenllm_v2/JayHyeon/Qwen_0.5-VDPO_5e-7-3ep_1vpo_const/03c4b5ce-3b22-4d9f-bf60-b626b52a114b.json deleted file mode 100644 index 5b2d375d1..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-VDPO_5e-7-3ep_1vpo_const/03c4b5ce-3b22-4d9f-bf60-b626b52a114b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen_0.5-VDPO_5e-7-3ep_1vpo_const/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen_0.5-VDPO_5e-7-3ep_1vpo_const", - "id": "JayHyeon/Qwen_0.5-VDPO_5e-7-3ep_1vpo_const", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2417 - } - }, - { - "evaluation_name": "BBH", - "source_data": { 
- "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3256 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0582 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2727 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3275 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1562 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-VDPO_5e-7-3ep_3vpo_const/ce7e3a31-c65b-4521-b685-fcbd067c75d9.json b/data/hfopenllm_v2/JayHyeon/Qwen_0.5-VDPO_5e-7-3ep_3vpo_const/ce7e3a31-c65b-4521-b685-fcbd067c75d9.json deleted file mode 100644 index 271b74801..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-VDPO_5e-7-3ep_3vpo_const/ce7e3a31-c65b-4521-b685-fcbd067c75d9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen_0.5-VDPO_5e-7-3ep_3vpo_const/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen_0.5-VDPO_5e-7-3ep_3vpo_const", - "id": "JayHyeon/Qwen_0.5-VDPO_5e-7-3ep_3vpo_const", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2527 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3235 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0536 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2785 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3235 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.158 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-VIPO_5e-7-1ep_0alp_0lam/adb53e2c-5dee-4840-8eae-e0186c6e103f.json b/data/hfopenllm_v2/JayHyeon/Qwen_0.5-VIPO_5e-7-1ep_0alp_0lam/adb53e2c-5dee-4840-8eae-e0186c6e103f.json deleted file mode 100644 index 67643af45..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-VIPO_5e-7-1ep_0alp_0lam/adb53e2c-5dee-4840-8eae-e0186c6e103f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen_0.5-VIPO_5e-7-1ep_0alp_0lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen_0.5-VIPO_5e-7-1ep_0alp_0lam", - "id": "JayHyeon/Qwen_0.5-VIPO_5e-7-1ep_0alp_0lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2669 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3314 - } - }, - { - "evaluation_name": 
"MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.071 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2676 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3168 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1634 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-VIPO_5e-7-1ep_10vpo_const/ba89563d-f53a-4bf0-91e1-92ac950523d8.json b/data/hfopenllm_v2/JayHyeon/Qwen_0.5-VIPO_5e-7-1ep_10vpo_const/ba89563d-f53a-4bf0-91e1-92ac950523d8.json deleted file mode 100644 index fc2edfe9a..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-VIPO_5e-7-1ep_10vpo_const/ba89563d-f53a-4bf0-91e1-92ac950523d8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen_0.5-VIPO_5e-7-1ep_10vpo_const/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen_0.5-VIPO_5e-7-1ep_10vpo_const", - "id": "JayHyeon/Qwen_0.5-VIPO_5e-7-1ep_10vpo_const", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2702 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.33 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - 
"metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.074 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2752 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3208 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1635 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-VIPO_5e-7-1ep_1vpo_const/3fc0ad8d-4bb2-401a-9baf-b94b39b7e1aa.json b/data/hfopenllm_v2/JayHyeon/Qwen_0.5-VIPO_5e-7-1ep_1vpo_const/3fc0ad8d-4bb2-401a-9baf-b94b39b7e1aa.json deleted file mode 100644 index 21e635518..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-VIPO_5e-7-1ep_1vpo_const/3fc0ad8d-4bb2-401a-9baf-b94b39b7e1aa.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen_0.5-VIPO_5e-7-1ep_1vpo_const/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen_0.5-VIPO_5e-7-1ep_1vpo_const", - "id": "JayHyeon/Qwen_0.5-VIPO_5e-7-1ep_1vpo_const", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.248 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3309 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 
1.0 - }, - "score_details": { - "score": 0.068 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2643 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3208 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1649 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-VIPO_5e-7-1ep_30vpo_const/ed816bcb-bbe9-48ae-a6ac-3603779a985f.json b/data/hfopenllm_v2/JayHyeon/Qwen_0.5-VIPO_5e-7-1ep_30vpo_const/ed816bcb-bbe9-48ae-a6ac-3603779a985f.json deleted file mode 100644 index a5f23395a..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-VIPO_5e-7-1ep_30vpo_const/ed816bcb-bbe9-48ae-a6ac-3603779a985f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen_0.5-VIPO_5e-7-1ep_30vpo_const/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen_0.5-VIPO_5e-7-1ep_30vpo_const", - "id": "JayHyeon/Qwen_0.5-VIPO_5e-7-1ep_30vpo_const", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2622 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3282 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.074 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", 
- "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2693 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3221 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1634 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-VIPO_5e-7-1ep_3vpo_const/f347ed24-066a-4cba-8478-f03628cb2b5b.json b/data/hfopenllm_v2/JayHyeon/Qwen_0.5-VIPO_5e-7-1ep_3vpo_const/f347ed24-066a-4cba-8478-f03628cb2b5b.json deleted file mode 100644 index 1a374d513..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-VIPO_5e-7-1ep_3vpo_const/f347ed24-066a-4cba-8478-f03628cb2b5b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen_0.5-VIPO_5e-7-1ep_3vpo_const/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen_0.5-VIPO_5e-7-1ep_3vpo_const", - "id": "JayHyeon/Qwen_0.5-VIPO_5e-7-1ep_3vpo_const", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2609 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3298 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.065 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2701 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3168 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1651 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-VIPO_5e-7-3ep_0alp_0lam/ffddfea0-d17e-44e7-8931-a9601e9cb26b.json b/data/hfopenllm_v2/JayHyeon/Qwen_0.5-VIPO_5e-7-3ep_0alp_0lam/ffddfea0-d17e-44e7-8931-a9601e9cb26b.json deleted file mode 100644 index bf35e8693..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-VIPO_5e-7-3ep_0alp_0lam/ffddfea0-d17e-44e7-8931-a9601e9cb26b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen_0.5-VIPO_5e-7-3ep_0alp_0lam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen_0.5-VIPO_5e-7-3ep_0alp_0lam", - "id": "JayHyeon/Qwen_0.5-VIPO_5e-7-3ep_0alp_0lam", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.293 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.322 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0627 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2685 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - 
"source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3116 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1591 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-VIPO_5e-7-3ep_10vpo_const/ec351fa1-78c2-48c6-83f0-7c2a9b2f0731.json b/data/hfopenllm_v2/JayHyeon/Qwen_0.5-VIPO_5e-7-3ep_10vpo_const/ec351fa1-78c2-48c6-83f0-7c2a9b2f0731.json deleted file mode 100644 index 45a06565e..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-VIPO_5e-7-3ep_10vpo_const/ec351fa1-78c2-48c6-83f0-7c2a9b2f0731.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen_0.5-VIPO_5e-7-3ep_10vpo_const/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen_0.5-VIPO_5e-7-3ep_10vpo_const", - "id": "JayHyeon/Qwen_0.5-VIPO_5e-7-3ep_10vpo_const", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2881 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3255 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0725 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2752 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3102 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1582 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-VIPO_5e-7-3ep_1vpo_const/a0038c34-130b-49dc-a93f-94706a3dad50.json b/data/hfopenllm_v2/JayHyeon/Qwen_0.5-VIPO_5e-7-3ep_1vpo_const/a0038c34-130b-49dc-a93f-94706a3dad50.json deleted file mode 100644 index 8033e9669..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-VIPO_5e-7-3ep_1vpo_const/a0038c34-130b-49dc-a93f-94706a3dad50.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen_0.5-VIPO_5e-7-3ep_1vpo_const/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen_0.5-VIPO_5e-7-3ep_1vpo_const", - "id": "JayHyeon/Qwen_0.5-VIPO_5e-7-3ep_1vpo_const", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2887 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3237 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0748 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2802 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3142 - } - }, - { - "evaluation_name": "MMLU-PRO", - 
"source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1609 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-VIPO_5e-7-3ep_30vpo_const/cbd5ea42-1e5b-4984-bdcf-e60fbfb9d692.json b/data/hfopenllm_v2/JayHyeon/Qwen_0.5-VIPO_5e-7-3ep_30vpo_const/cbd5ea42-1e5b-4984-bdcf-e60fbfb9d692.json deleted file mode 100644 index 85441076b..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-VIPO_5e-7-3ep_30vpo_const/cbd5ea42-1e5b-4984-bdcf-e60fbfb9d692.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen_0.5-VIPO_5e-7-3ep_30vpo_const/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen_0.5-VIPO_5e-7-3ep_30vpo_const", - "id": "JayHyeon/Qwen_0.5-VIPO_5e-7-3ep_30vpo_const", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2905 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3254 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.077 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2735 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3129 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": 
"Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1574 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-VIPO_5e-7-3ep_3vpo_const/b902e2b2-a0b3-4467-b076-b98717c40d74.json b/data/hfopenllm_v2/JayHyeon/Qwen_0.5-VIPO_5e-7-3ep_3vpo_const/b902e2b2-a0b3-4467-b076-b98717c40d74.json deleted file mode 100644 index cb1e7dc31..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-VIPO_5e-7-3ep_3vpo_const/b902e2b2-a0b3-4467-b076-b98717c40d74.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen_0.5-VIPO_5e-7-3ep_3vpo_const/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen_0.5-VIPO_5e-7-3ep_3vpo_const", - "id": "JayHyeon/Qwen_0.5-VIPO_5e-7-3ep_3vpo_const", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2905 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3238 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0702 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2735 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3089 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1592 - } - 
} - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-cDPO_5e-7-3ep_0vpo_const_0.1/4c749665-59ff-49df-a193-0262f66e6003.json b/data/hfopenllm_v2/JayHyeon/Qwen_0.5-cDPO_5e-7-3ep_0vpo_const_0.1/4c749665-59ff-49df-a193-0262f66e6003.json deleted file mode 100644 index 7f560d07a..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-cDPO_5e-7-3ep_0vpo_const_0.1/4c749665-59ff-49df-a193-0262f66e6003.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen_0.5-cDPO_5e-7-3ep_0vpo_const_0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen_0.5-cDPO_5e-7-3ep_0vpo_const_0.1", - "id": "JayHyeon/Qwen_0.5-cDPO_5e-7-3ep_0vpo_const_0.1", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2393 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3244 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0514 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2777 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3222 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1573 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-cDPO_5e-7-3ep_0vpo_const_0.3/c99899c6-95e1-4dea-ac12-f8df49728a3b.json b/data/hfopenllm_v2/JayHyeon/Qwen_0.5-cDPO_5e-7-3ep_0vpo_const_0.3/c99899c6-95e1-4dea-ac12-f8df49728a3b.json deleted file mode 100644 index 52430793e..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-cDPO_5e-7-3ep_0vpo_const_0.3/c99899c6-95e1-4dea-ac12-f8df49728a3b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen_0.5-cDPO_5e-7-3ep_0vpo_const_0.3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen_0.5-cDPO_5e-7-3ep_0vpo_const_0.3", - "id": "JayHyeon/Qwen_0.5-cDPO_5e-7-3ep_0vpo_const_0.3", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2475 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3209 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0461 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.281 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3275 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1567 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-rDPO_3e-6-1ep_0vpo_const_0.1/13deca9f-073e-444b-bf79-35e816f7c312.json 
b/data/hfopenllm_v2/JayHyeon/Qwen_0.5-rDPO_3e-6-1ep_0vpo_const_0.1/13deca9f-073e-444b-bf79-35e816f7c312.json deleted file mode 100644 index d31366b10..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-rDPO_3e-6-1ep_0vpo_const_0.1/13deca9f-073e-444b-bf79-35e816f7c312.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen_0.5-rDPO_3e-6-1ep_0vpo_const_0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen_0.5-rDPO_3e-6-1ep_0vpo_const_0.1", - "id": "JayHyeon/Qwen_0.5-rDPO_3e-6-1ep_0vpo_const_0.1", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2321 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3278 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0476 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2576 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3022 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1496 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-rDPO_5e-7-3ep_0vpo_const_0.1/c8adc0a5-f4bf-4f88-984c-aba506eae6a9.json b/data/hfopenllm_v2/JayHyeon/Qwen_0.5-rDPO_5e-7-3ep_0vpo_const_0.1/c8adc0a5-f4bf-4f88-984c-aba506eae6a9.json deleted file mode 100644 index 
9fb481bab..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-rDPO_5e-7-3ep_0vpo_const_0.1/c8adc0a5-f4bf-4f88-984c-aba506eae6a9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen_0.5-rDPO_5e-7-3ep_0vpo_const_0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen_0.5-rDPO_5e-7-3ep_0vpo_const_0.1", - "id": "JayHyeon/Qwen_0.5-rDPO_5e-7-3ep_0vpo_const_0.1", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2542 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3253 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0529 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.271 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3181 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1609 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-rDPO_5e-7-3ep_0vpo_const_0.3/b146daaf-ce1f-4520-bc19-21ce8679b220.json b/data/hfopenllm_v2/JayHyeon/Qwen_0.5-rDPO_5e-7-3ep_0vpo_const_0.3/b146daaf-ce1f-4520-bc19-21ce8679b220.json deleted file mode 100644 index 345fd539c..000000000 --- a/data/hfopenllm_v2/JayHyeon/Qwen_0.5-rDPO_5e-7-3ep_0vpo_const_0.3/b146daaf-ce1f-4520-bc19-21ce8679b220.json +++ /dev/null 
@@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JayHyeon_Qwen_0.5-rDPO_5e-7-3ep_0vpo_const_0.3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen_0.5-rDPO_5e-7-3ep_0vpo_const_0.3", - "id": "JayHyeon/Qwen_0.5-rDPO_5e-7-3ep_0vpo_const_0.3", - "developer": "JayHyeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2739 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3245 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0461 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2508 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3089 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1597 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Jimmy19991222/Llama-3-Instruct-8B-SimPO-v0.2/45e1d037-1ed0-472c-a311-c651fde270fc.json b/data/hfopenllm_v2/Jimmy19991222/Llama-3-Instruct-8B-SimPO-v0.2/45e1d037-1ed0-472c-a311-c651fde270fc.json deleted file mode 100644 index 36efeda0f..000000000 --- a/data/hfopenllm_v2/Jimmy19991222/Llama-3-Instruct-8B-SimPO-v0.2/45e1d037-1ed0-472c-a311-c651fde270fc.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Jimmy19991222_Llama-3-Instruct-8B-SimPO-v0.2/1770682486.623709", - 
"retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-Instruct-8B-SimPO-v0.2", - "id": "Jimmy19991222/Llama-3-Instruct-8B-SimPO-v0.2", - "developer": "Jimmy19991222", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.654 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4984 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0619 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3146 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4013 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3686 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Jimmy19991222/llama-3-8b-instruct-gapo-v2-bert-f1-beta10-gamma0.3-lr1.0e-6-1minus-rerun/3f4ce54a-01f3-4c23-a4ba-22d47e0344dc.json b/data/hfopenllm_v2/Jimmy19991222/llama-3-8b-instruct-gapo-v2-bert-f1-beta10-gamma0.3-lr1.0e-6-1minus-rerun/3f4ce54a-01f3-4c23-a4ba-22d47e0344dc.json deleted file mode 100644 index ea821d05a..000000000 --- a/data/hfopenllm_v2/Jimmy19991222/llama-3-8b-instruct-gapo-v2-bert-f1-beta10-gamma0.3-lr1.0e-6-1minus-rerun/3f4ce54a-01f3-4c23-a4ba-22d47e0344dc.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/Jimmy19991222_llama-3-8b-instruct-gapo-v2-bert-f1-beta10-gamma0.3-lr1.0e-6-1minus-rerun/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "llama-3-8b-instruct-gapo-v2-bert-f1-beta10-gamma0.3-lr1.0e-6-1minus-rerun", - "id": "Jimmy19991222/llama-3-8b-instruct-gapo-v2-bert-f1-beta10-gamma0.3-lr1.0e-6-1minus-rerun", - "developer": "Jimmy19991222", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6717 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.488 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0604 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2945 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4041 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3634 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Jimmy19991222/llama-3-8b-instruct-gapo-v2-bert_f1-beta10-gamma0.3-lr1.0e-6-scale-log/470d52be-9dbd-4714-b004-f65cc82d245f.json b/data/hfopenllm_v2/Jimmy19991222/llama-3-8b-instruct-gapo-v2-bert_f1-beta10-gamma0.3-lr1.0e-6-scale-log/470d52be-9dbd-4714-b004-f65cc82d245f.json deleted file mode 100644 index 49a213fd9..000000000 --- 
a/data/hfopenllm_v2/Jimmy19991222/llama-3-8b-instruct-gapo-v2-bert_f1-beta10-gamma0.3-lr1.0e-6-scale-log/470d52be-9dbd-4714-b004-f65cc82d245f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Jimmy19991222_llama-3-8b-instruct-gapo-v2-bert_f1-beta10-gamma0.3-lr1.0e-6-scale-log/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "llama-3-8b-instruct-gapo-v2-bert_f1-beta10-gamma0.3-lr1.0e-6-scale-log", - "id": "Jimmy19991222/llama-3-8b-instruct-gapo-v2-bert_f1-beta10-gamma0.3-lr1.0e-6-scale-log", - "developer": "Jimmy19991222", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6556 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4935 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0544 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3045 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3658 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Jimmy19991222/llama-3-8b-instruct-gapo-v2-bert_p-beta10-gamma0.3-lr1.0e-6-scale-log/c836fd05-1969-439c-91e1-fd0cab816f6c.json 
b/data/hfopenllm_v2/Jimmy19991222/llama-3-8b-instruct-gapo-v2-bert_p-beta10-gamma0.3-lr1.0e-6-scale-log/c836fd05-1969-439c-91e1-fd0cab816f6c.json deleted file mode 100644 index 539534c00..000000000 --- a/data/hfopenllm_v2/Jimmy19991222/llama-3-8b-instruct-gapo-v2-bert_p-beta10-gamma0.3-lr1.0e-6-scale-log/c836fd05-1969-439c-91e1-fd0cab816f6c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Jimmy19991222_llama-3-8b-instruct-gapo-v2-bert_p-beta10-gamma0.3-lr1.0e-6-scale-log/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "llama-3-8b-instruct-gapo-v2-bert_p-beta10-gamma0.3-lr1.0e-6-scale-log", - "id": "Jimmy19991222/llama-3-8b-instruct-gapo-v2-bert_p-beta10-gamma0.3-lr1.0e-6-scale-log", - "developer": "Jimmy19991222", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6315 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4916 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.065 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2861 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3935 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3611 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/Jimmy19991222/llama-3-8b-instruct-gapo-v2-bleu-beta0.1-no-length-scale-gamma0.4/14774c6b-eb03-4abc-92df-1e7a196ca8a4.json b/data/hfopenllm_v2/Jimmy19991222/llama-3-8b-instruct-gapo-v2-bleu-beta0.1-no-length-scale-gamma0.4/14774c6b-eb03-4abc-92df-1e7a196ca8a4.json deleted file mode 100644 index 80d5fde9f..000000000 --- a/data/hfopenllm_v2/Jimmy19991222/llama-3-8b-instruct-gapo-v2-bleu-beta0.1-no-length-scale-gamma0.4/14774c6b-eb03-4abc-92df-1e7a196ca8a4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Jimmy19991222_llama-3-8b-instruct-gapo-v2-bleu-beta0.1-no-length-scale-gamma0.4/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "llama-3-8b-instruct-gapo-v2-bleu-beta0.1-no-length-scale-gamma0.4", - "id": "Jimmy19991222/llama-3-8b-instruct-gapo-v2-bleu-beta0.1-no-length-scale-gamma0.4", - "developer": "Jimmy19991222", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6285 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4986 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0514 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2928 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4014 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3545 - 
} - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Jimmy19991222/llama-3-8b-instruct-gapo-v2-rouge2-beta10-1minus-gamma0.3-rerun/5293ae0c-8022-44d4-b2f5-4f5390dff93e.json b/data/hfopenllm_v2/Jimmy19991222/llama-3-8b-instruct-gapo-v2-rouge2-beta10-1minus-gamma0.3-rerun/5293ae0c-8022-44d4-b2f5-4f5390dff93e.json deleted file mode 100644 index ce4d8a93a..000000000 --- a/data/hfopenllm_v2/Jimmy19991222/llama-3-8b-instruct-gapo-v2-rouge2-beta10-1minus-gamma0.3-rerun/5293ae0c-8022-44d4-b2f5-4f5390dff93e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Jimmy19991222_llama-3-8b-instruct-gapo-v2-rouge2-beta10-1minus-gamma0.3-rerun/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "llama-3-8b-instruct-gapo-v2-rouge2-beta10-1minus-gamma0.3-rerun", - "id": "Jimmy19991222/llama-3-8b-instruct-gapo-v2-rouge2-beta10-1minus-gamma0.3-rerun", - "developer": "Jimmy19991222", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6678 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.494 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0612 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3062 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3987 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.3658 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Jimmy19991222/llama-3-8b-instruct-gapo-v2-rouge2-beta10-gamma0.3-lr1.0e-6-scale-log/9020f91f-a8f0-447d-af68-247aa81a25c6.json b/data/hfopenllm_v2/Jimmy19991222/llama-3-8b-instruct-gapo-v2-rouge2-beta10-gamma0.3-lr1.0e-6-scale-log/9020f91f-a8f0-447d-af68-247aa81a25c6.json deleted file mode 100644 index 4e211ab7c..000000000 --- a/data/hfopenllm_v2/Jimmy19991222/llama-3-8b-instruct-gapo-v2-rouge2-beta10-gamma0.3-lr1.0e-6-scale-log/9020f91f-a8f0-447d-af68-247aa81a25c6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Jimmy19991222_llama-3-8b-instruct-gapo-v2-rouge2-beta10-gamma0.3-lr1.0e-6-scale-log/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "llama-3-8b-instruct-gapo-v2-rouge2-beta10-gamma0.3-lr1.0e-6-scale-log", - "id": "Jimmy19991222/llama-3-8b-instruct-gapo-v2-rouge2-beta10-gamma0.3-lr1.0e-6-scale-log", - "developer": "Jimmy19991222", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6605 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4916 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0657 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3037 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3664 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Jimmy19991222/llama-3-8b-instruct-gapo-v2-rougeL-beta10-gamma0.3-lr1.0e-6-scale-log/0cd6837a-8c3f-4529-9ea0-8755e1725467.json b/data/hfopenllm_v2/Jimmy19991222/llama-3-8b-instruct-gapo-v2-rougeL-beta10-gamma0.3-lr1.0e-6-scale-log/0cd6837a-8c3f-4529-9ea0-8755e1725467.json deleted file mode 100644 index b0cd90258..000000000 --- a/data/hfopenllm_v2/Jimmy19991222/llama-3-8b-instruct-gapo-v2-rougeL-beta10-gamma0.3-lr1.0e-6-scale-log/0cd6837a-8c3f-4529-9ea0-8755e1725467.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Jimmy19991222_llama-3-8b-instruct-gapo-v2-rougeL-beta10-gamma0.3-lr1.0e-6-scale-log/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "llama-3-8b-instruct-gapo-v2-rougeL-beta10-gamma0.3-lr1.0e-6-scale-log", - "id": "Jimmy19991222/llama-3-8b-instruct-gapo-v2-rougeL-beta10-gamma0.3-lr1.0e-6-scale-log", - "developer": "Jimmy19991222", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6492 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4952 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0642 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.302 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3961 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3711 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Joseph717171/Hermes-3-Llama-3.1-8B_TIES_with_Base_Embeds_Initialized_to_Special_Instruct_Toks_dtypeF32/7cb17011-cf77-4e86-b67f-84e6ff4b8086.json b/data/hfopenllm_v2/Joseph717171/Hermes-3-Llama-3.1-8B_TIES_with_Base_Embeds_Initialized_to_Special_Instruct_Toks_dtypeF32/7cb17011-cf77-4e86-b67f-84e6ff4b8086.json deleted file mode 100644 index 25c8b3965..000000000 --- a/data/hfopenllm_v2/Joseph717171/Hermes-3-Llama-3.1-8B_TIES_with_Base_Embeds_Initialized_to_Special_Instruct_Toks_dtypeF32/7cb17011-cf77-4e86-b67f-84e6ff4b8086.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Joseph717171_Hermes-3-Llama-3.1-8B_TIES_with_Base_Embeds_Initialized_to_Special_Instruct_Toks_dtypeF32/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Hermes-3-Llama-3.1-8B_TIES_with_Base_Embeds_Initialized_to_Special_Instruct_Toks_dtypeF32", - "id": "Joseph717171/Hermes-3-Llama-3.1-8B_TIES_with_Base_Embeds_Initialized_to_Special_Instruct_Toks_dtypeF32", - "developer": "Joseph717171", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6185 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5177 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0514 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2827 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 
0.4369 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3144 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Joseph717171/Llama-3.1-SuperNova-8B-Lite_TIES_with_Base/086831f9-c677-428b-a997-4da58733633c.json b/data/hfopenllm_v2/Joseph717171/Llama-3.1-SuperNova-8B-Lite_TIES_with_Base/086831f9-c677-428b-a997-4da58733633c.json deleted file mode 100644 index eb6e8c12a..000000000 --- a/data/hfopenllm_v2/Joseph717171/Llama-3.1-SuperNova-8B-Lite_TIES_with_Base/086831f9-c677-428b-a997-4da58733633c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Joseph717171_Llama-3.1-SuperNova-8B-Lite_TIES_with_Base/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.1-SuperNova-8B-Lite_TIES_with_Base", - "id": "Joseph717171/Llama-3.1-SuperNova-8B-Lite_TIES_with_Base", - "developer": "Joseph717171", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8096 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5147 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1835 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3096 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.411 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": 
"MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.388 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Josephgflowers/Cinder-Phi-2-V1-F16-gguf/d71893b8-b82c-490b-a700-b579d64e0610.json b/data/hfopenllm_v2/Josephgflowers/Cinder-Phi-2-V1-F16-gguf/d71893b8-b82c-490b-a700-b579d64e0610.json deleted file mode 100644 index 536e2fe85..000000000 --- a/data/hfopenllm_v2/Josephgflowers/Cinder-Phi-2-V1-F16-gguf/d71893b8-b82c-490b-a700-b579d64e0610.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Josephgflowers_Cinder-Phi-2-V1-F16-gguf/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Cinder-Phi-2-V1-F16-gguf", - "id": "Josephgflowers/Cinder-Phi-2-V1-F16-gguf", - "developer": "Josephgflowers", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "PhiForCausalLM", - "params_billions": 2.78 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2357 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4397 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0242 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2819 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3435 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2161 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Josephgflowers/Differential-Attention-Liquid-Metal-Tinyllama/9893689f-c27d-4148-a27f-cd07b07e98b7.json b/data/hfopenllm_v2/Josephgflowers/Differential-Attention-Liquid-Metal-Tinyllama/9893689f-c27d-4148-a27f-cd07b07e98b7.json deleted file mode 100644 index dab2ee4a6..000000000 --- a/data/hfopenllm_v2/Josephgflowers/Differential-Attention-Liquid-Metal-Tinyllama/9893689f-c27d-4148-a27f-cd07b07e98b7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Josephgflowers_Differential-Attention-Liquid-Metal-Tinyllama/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Differential-Attention-Liquid-Metal-Tinyllama", - "id": "Josephgflowers/Differential-Attention-Liquid-Metal-Tinyllama", - "developer": "Josephgflowers", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.1 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2227 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2926 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0325 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2508 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3356 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 
1.0 - }, - "score_details": { - "score": 0.1214 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Josephgflowers/TinyLlama-Cinder-Agent-v1/90f2df23-a9ec-44be-ade5-89b59cb7368a.json b/data/hfopenllm_v2/Josephgflowers/TinyLlama-Cinder-Agent-v1/90f2df23-a9ec-44be-ade5-89b59cb7368a.json deleted file mode 100644 index 7c3cb09c4..000000000 --- a/data/hfopenllm_v2/Josephgflowers/TinyLlama-Cinder-Agent-v1/90f2df23-a9ec-44be-ade5-89b59cb7368a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Josephgflowers_TinyLlama-Cinder-Agent-v1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "TinyLlama-Cinder-Agent-v1", - "id": "Josephgflowers/TinyLlama-Cinder-Agent-v1", - "developer": "Josephgflowers", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.1 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.267 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3116 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0347 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2441 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3395 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1161 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/Josephgflowers/TinyLlama-v1.1-Cinders-World/afd545da-390a-478a-b0f5-ea819f088f27.json b/data/hfopenllm_v2/Josephgflowers/TinyLlama-v1.1-Cinders-World/afd545da-390a-478a-b0f5-ea819f088f27.json deleted file mode 100644 index 3ac65a5c2..000000000 --- a/data/hfopenllm_v2/Josephgflowers/TinyLlama-v1.1-Cinders-World/afd545da-390a-478a-b0f5-ea819f088f27.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Josephgflowers_TinyLlama-v1.1-Cinders-World/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "TinyLlama-v1.1-Cinders-World", - "id": "Josephgflowers/TinyLlama-v1.1-Cinders-World", - "developer": "Josephgflowers", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.1 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2469 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2998 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0347 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2441 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3356 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1198 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Josephgflowers/TinyLlama_v1.1_math_code-world-test-1/ce776f68-856f-4aee-b7e4-e55d15e8d714.json 
b/data/hfopenllm_v2/Josephgflowers/TinyLlama_v1.1_math_code-world-test-1/ce776f68-856f-4aee-b7e4-e55d15e8d714.json deleted file mode 100644 index 489ca17d9..000000000 --- a/data/hfopenllm_v2/Josephgflowers/TinyLlama_v1.1_math_code-world-test-1/ce776f68-856f-4aee-b7e4-e55d15e8d714.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Josephgflowers_TinyLlama_v1.1_math_code-world-test-1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "TinyLlama_v1.1_math_code-world-test-1", - "id": "Josephgflowers/TinyLlama_v1.1_math_code-world-test-1", - "developer": "Josephgflowers", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.1 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0078 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3146 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0196 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2341 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3499 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1132 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Josephgflowers/Tinyllama-STEM-Cinder-Agent-v1/9b015729-524c-44f3-9c2c-c42981d7a61e.json b/data/hfopenllm_v2/Josephgflowers/Tinyllama-STEM-Cinder-Agent-v1/9b015729-524c-44f3-9c2c-c42981d7a61e.json deleted 
file mode 100644 index 586b23dba..000000000 --- a/data/hfopenllm_v2/Josephgflowers/Tinyllama-STEM-Cinder-Agent-v1/9b015729-524c-44f3-9c2c-c42981d7a61e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Josephgflowers_Tinyllama-STEM-Cinder-Agent-v1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Tinyllama-STEM-Cinder-Agent-v1", - "id": "Josephgflowers/Tinyllama-STEM-Cinder-Agent-v1", - "developer": "Josephgflowers", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.1 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2126 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3084 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0672 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2349 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3341 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1086 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Josephgflowers/Tinyllama-r1/56a54ffc-4692-496c-95df-8e4ad19d4d95.json b/data/hfopenllm_v2/Josephgflowers/Tinyllama-r1/56a54ffc-4692-496c-95df-8e4ad19d4d95.json deleted file mode 100644 index 5e761ffc2..000000000 --- a/data/hfopenllm_v2/Josephgflowers/Tinyllama-r1/56a54ffc-4692-496c-95df-8e4ad19d4d95.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - 
"schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Josephgflowers_Tinyllama-r1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Tinyllama-r1", - "id": "Josephgflowers/Tinyllama-r1", - "developer": "Josephgflowers", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.1 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2119 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3015 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0325 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2567 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3315 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1134 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JungZoona/T3Q-Qwen2.5-14B-Instruct-1M-e3/4b105969-2ce5-4c62-89ef-efd392c2ca89.json b/data/hfopenllm_v2/JungZoona/T3Q-Qwen2.5-14B-Instruct-1M-e3/4b105969-2ce5-4c62-89ef-efd392c2ca89.json deleted file mode 100644 index 6864f9d79..000000000 --- a/data/hfopenllm_v2/JungZoona/T3Q-Qwen2.5-14B-Instruct-1M-e3/4b105969-2ce5-4c62-89ef-efd392c2ca89.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JungZoona_T3Q-Qwen2.5-14B-Instruct-1M-e3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM 
v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "T3Q-Qwen2.5-14B-Instruct-1M-e3", - "id": "JungZoona/T3Q-Qwen2.5-14B-Instruct-1M-e3", - "developer": "JungZoona", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Unknown", - "params_billions": 0.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7324 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7586 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2863 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4169 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5911 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5884 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/JungZoona/T3Q-qwen2.5-14b-v1.0-e3/31af79b1-48c1-4399-9d16-8582c92996ee.json b/data/hfopenllm_v2/JungZoona/T3Q-qwen2.5-14b-v1.0-e3/31af79b1-48c1-4399-9d16-8582c92996ee.json deleted file mode 100644 index ebb5f9b2d..000000000 --- a/data/hfopenllm_v2/JungZoona/T3Q-qwen2.5-14b-v1.0-e3/31af79b1-48c1-4399-9d16-8582c92996ee.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/JungZoona_T3Q-qwen2.5-14b-v1.0-e3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "T3Q-qwen2.5-14b-v1.0-e3", - "id": 
"JungZoona/T3Q-qwen2.5-14b-v1.0-e3", - "developer": "JungZoona", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7324 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7586 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2863 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4169 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5911 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5884 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Junhoee/Qwen-Megumin/59a67f29-cb7d-497c-b7bb-1764a665ae33.json b/data/hfopenllm_v2/Junhoee/Qwen-Megumin/59a67f29-cb7d-497c-b7bb-1764a665ae33.json deleted file mode 100644 index 8f5c2b802..000000000 --- a/data/hfopenllm_v2/Junhoee/Qwen-Megumin/59a67f29-cb7d-497c-b7bb-1764a665ae33.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Junhoee_Qwen-Megumin/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen-Megumin", - "id": "Junhoee/Qwen-Megumin", - "developer": "Junhoee", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "?", - "params_billions": 15.231 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - 
"dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7141 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5285 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4902 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2961 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.398 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4199 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/KSU-HW-SEC/Llama3-70b-SVA-FT-1415/fe57367c-74b7-483e-af54-4f404cbea75b.json b/data/hfopenllm_v2/KSU-HW-SEC/Llama3-70b-SVA-FT-1415/fe57367c-74b7-483e-af54-4f404cbea75b.json deleted file mode 100644 index d5244c3c9..000000000 --- a/data/hfopenllm_v2/KSU-HW-SEC/Llama3-70b-SVA-FT-1415/fe57367c-74b7-483e-af54-4f404cbea75b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/KSU-HW-SEC_Llama3-70b-SVA-FT-1415/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama3-70b-SVA-FT-1415", - "id": "KSU-HW-SEC/Llama3-70b-SVA-FT-1415", - "developer": "KSU-HW-SEC", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 70.554 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.618 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.665 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2198 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.375 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4565 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5243 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/KSU-HW-SEC/Llama3-70b-SVA-FT-500/fda2277b-1513-416e-b586-ed05920a0bb4.json b/data/hfopenllm_v2/KSU-HW-SEC/Llama3-70b-SVA-FT-500/fda2277b-1513-416e-b586-ed05920a0bb4.json deleted file mode 100644 index 12fd56cb7..000000000 --- a/data/hfopenllm_v2/KSU-HW-SEC/Llama3-70b-SVA-FT-500/fda2277b-1513-416e-b586-ed05920a0bb4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/KSU-HW-SEC_Llama3-70b-SVA-FT-500/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama3-70b-SVA-FT-500", - "id": "KSU-HW-SEC/Llama3-70b-SVA-FT-500", - "developer": "KSU-HW-SEC", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 70.554 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6105 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - 
}, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6692 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2137 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3809 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4511 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5227 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/KSU-HW-SEC/Llama3-70b-SVA-FT-final/b3dde216-f80a-4664-aadc-b5f5dd3e5895.json b/data/hfopenllm_v2/KSU-HW-SEC/Llama3-70b-SVA-FT-final/b3dde216-f80a-4664-aadc-b5f5dd3e5895.json deleted file mode 100644 index 54dad4185..000000000 --- a/data/hfopenllm_v2/KSU-HW-SEC/Llama3-70b-SVA-FT-final/b3dde216-f80a-4664-aadc-b5f5dd3e5895.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/KSU-HW-SEC_Llama3-70b-SVA-FT-final/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama3-70b-SVA-FT-final", - "id": "KSU-HW-SEC/Llama3-70b-SVA-FT-final", - "developer": "KSU-HW-SEC", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 70.554 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6165 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.665 - } - 
}, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2198 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.375 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4565 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5243 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/KSU-HW-SEC/Llama3.1-70b-SVA-FT-1000step/07ed6241-fd1a-46eb-91fd-92a4a8f6bd15.json b/data/hfopenllm_v2/KSU-HW-SEC/Llama3.1-70b-SVA-FT-1000step/07ed6241-fd1a-46eb-91fd-92a4a8f6bd15.json deleted file mode 100644 index 39cc4b352..000000000 --- a/data/hfopenllm_v2/KSU-HW-SEC/Llama3.1-70b-SVA-FT-1000step/07ed6241-fd1a-46eb-91fd-92a4a8f6bd15.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/KSU-HW-SEC_Llama3.1-70b-SVA-FT-1000step/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama3.1-70b-SVA-FT-1000step", - "id": "KSU-HW-SEC/Llama3.1-70b-SVA-FT-1000step", - "developer": "KSU-HW-SEC", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 70.554 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7238 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6903 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - 
"metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.321 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.396 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4592 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5252 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Khetterman/DarkAtom-12B-v3/ba76c356-cd6a-4636-8ab1-18bb9df69881.json b/data/hfopenllm_v2/Khetterman/DarkAtom-12B-v3/ba76c356-cd6a-4636-8ab1-18bb9df69881.json deleted file mode 100644 index 45d9e2c04..000000000 --- a/data/hfopenllm_v2/Khetterman/DarkAtom-12B-v3/ba76c356-cd6a-4636-8ab1-18bb9df69881.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Khetterman_DarkAtom-12B-v3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "DarkAtom-12B-v3", - "id": "Khetterman/DarkAtom-12B-v3", - "developer": "Khetterman", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6173 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5154 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.111 - } - }, - { - "evaluation_name": "GPQA", - 
"source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2978 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4468 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3546 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Khetterman/Kosmos-8B-v1/c6ae54a1-2821-48d1-b689-bbb85aaa70a6.json b/data/hfopenllm_v2/Khetterman/Kosmos-8B-v1/c6ae54a1-2821-48d1-b689-bbb85aaa70a6.json deleted file mode 100644 index 9f41fea54..000000000 --- a/data/hfopenllm_v2/Khetterman/Kosmos-8B-v1/c6ae54a1-2821-48d1-b689-bbb85aaa70a6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Khetterman_Kosmos-8B-v1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Kosmos-8B-v1", - "id": "Khetterman/Kosmos-8B-v1", - "developer": "Khetterman", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4129 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5234 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0989 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.2987 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3919 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3669 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Kimargin/GPT-NEO-1.3B-wiki/6f296f0e-80ca-49b7-94e7-cb45b795c715.json b/data/hfopenllm_v2/Kimargin/GPT-NEO-1.3B-wiki/6f296f0e-80ca-49b7-94e7-cb45b795c715.json deleted file mode 100644 index 2495d87be..000000000 --- a/data/hfopenllm_v2/Kimargin/GPT-NEO-1.3B-wiki/6f296f0e-80ca-49b7-94e7-cb45b795c715.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Kimargin_GPT-NEO-1.3B-wiki/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "GPT-NEO-1.3B-wiki", - "id": "Kimargin/GPT-NEO-1.3B-wiki", - "developer": "Kimargin", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "GPTNeoForCausalLM", - "params_billions": 1.316 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1921 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3026 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0144 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.245 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3883 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1099 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/KingNish/Qwen2.5-0.5b-Test-ft/b5509e11-820a-4ad4-8c6a-0294762502a8.json b/data/hfopenllm_v2/KingNish/Qwen2.5-0.5b-Test-ft/b5509e11-820a-4ad4-8c6a-0294762502a8.json deleted file mode 100644 index c989aeb0c..000000000 --- a/data/hfopenllm_v2/KingNish/Qwen2.5-0.5b-Test-ft/b5509e11-820a-4ad4-8c6a-0294762502a8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/KingNish_Qwen2.5-0.5b-Test-ft/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-0.5b-Test-ft", - "id": "KingNish/Qwen2.5-0.5b-Test-ft", - "developer": "KingNish", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.494 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2671 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3232 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0355 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2634 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3421 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - 
"source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1689 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/KingNish/Reasoning-0.5b/90d73665-8d83-4e74-ab7d-29b1d3b6181b.json b/data/hfopenllm_v2/KingNish/Reasoning-0.5b/90d73665-8d83-4e74-ab7d-29b1d3b6181b.json deleted file mode 100644 index 57493bf65..000000000 --- a/data/hfopenllm_v2/KingNish/Reasoning-0.5b/90d73665-8d83-4e74-ab7d-29b1d3b6181b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/KingNish_Reasoning-0.5b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Reasoning-0.5b", - "id": "KingNish/Reasoning-0.5b", - "developer": "KingNish", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.494 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2174 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3354 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0219 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2676 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3513 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1641 - 
} - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/KingNish/Reasoning-Llama-3b-v0.1/72387647-cbac-4b72-9c22-db7029a39457.json b/data/hfopenllm_v2/KingNish/Reasoning-Llama-3b-v0.1/72387647-cbac-4b72-9c22-db7029a39457.json deleted file mode 100644 index e30cf7c55..000000000 --- a/data/hfopenllm_v2/KingNish/Reasoning-Llama-3b-v0.1/72387647-cbac-4b72-9c22-db7029a39457.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/KingNish_Reasoning-Llama-3b-v0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Reasoning-Llama-3b-v0.1", - "id": "KingNish/Reasoning-Llama-3b-v0.1", - "developer": "KingNish", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6225 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4343 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1299 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2592 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3168 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3029 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/KingNish/qwen-1b-continued-v2.1/6219ec01-4b6a-4acd-aee1-96c3e8e48643.json 
b/data/hfopenllm_v2/KingNish/qwen-1b-continued-v2.1/6219ec01-4b6a-4acd-aee1-96c3e8e48643.json deleted file mode 100644 index 143abb8e4..000000000 --- a/data/hfopenllm_v2/KingNish/qwen-1b-continued-v2.1/6219ec01-4b6a-4acd-aee1-96c3e8e48643.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/KingNish_qwen-1b-continued-v2.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "qwen-1b-continued-v2.1", - "id": "KingNish/qwen-1b-continued-v2.1", - "developer": "KingNish", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.277 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1127 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3042 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0091 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2676 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4154 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1278 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/KingNish/qwen-1b-continued-v2.2/5c323d7c-25cd-4718-8a1f-54d986cadaf2.json b/data/hfopenllm_v2/KingNish/qwen-1b-continued-v2.2/5c323d7c-25cd-4718-8a1f-54d986cadaf2.json deleted file mode 100644 index 04f12a579..000000000 --- 
a/data/hfopenllm_v2/KingNish/qwen-1b-continued-v2.2/5c323d7c-25cd-4718-8a1f-54d986cadaf2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/KingNish_qwen-1b-continued-v2.2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "qwen-1b-continued-v2.2", - "id": "KingNish/qwen-1b-continued-v2.2", - "developer": "KingNish", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.277 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1413 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3059 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0151 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2567 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3513 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1262 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/KingNish/qwen-1b-continued-v2/adfab21a-941b-4efc-8b63-fdfb3074ba9b.json b/data/hfopenllm_v2/KingNish/qwen-1b-continued-v2/adfab21a-941b-4efc-8b63-fdfb3074ba9b.json deleted file mode 100644 index 7b2c2ff2f..000000000 --- a/data/hfopenllm_v2/KingNish/qwen-1b-continued-v2/adfab21a-941b-4efc-8b63-fdfb3074ba9b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/KingNish_qwen-1b-continued-v2/1770682486.623709", - 
"retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "qwen-1b-continued-v2", - "id": "KingNish/qwen-1b-continued-v2", - "developer": "KingNish", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.277 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1579 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3119 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0106 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.25 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3393 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1193 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/KingNish/qwen-1b-continued/350d00a4-7501-4130-a069-323530bc9729.json b/data/hfopenllm_v2/KingNish/qwen-1b-continued/350d00a4-7501-4130-a069-323530bc9729.json deleted file mode 100644 index 7d5687eaf..000000000 --- a/data/hfopenllm_v2/KingNish/qwen-1b-continued/350d00a4-7501-4130-a069-323530bc9729.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/KingNish_qwen-1b-continued/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": 
"qwen-1b-continued", - "id": "KingNish/qwen-1b-continued", - "developer": "KingNish", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.277 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1255 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2991 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0091 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2676 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3859 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1261 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Kquant03/CognitiveFusion2-4x7B-BF16/ea809d28-178e-4a0b-ab5a-34739077c5ff.json b/data/hfopenllm_v2/Kquant03/CognitiveFusion2-4x7B-BF16/ea809d28-178e-4a0b-ab5a-34739077c5ff.json deleted file mode 100644 index 49d973710..000000000 --- a/data/hfopenllm_v2/Kquant03/CognitiveFusion2-4x7B-BF16/ea809d28-178e-4a0b-ab5a-34739077c5ff.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Kquant03_CognitiveFusion2-4x7B-BF16/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "CognitiveFusion2-4x7B-BF16", - "id": "Kquant03/CognitiveFusion2-4x7B-BF16", - "developer": "Kquant03", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": 
"MixtralForCausalLM", - "params_billions": 24.154 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3567 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4108 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0574 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2861 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4146 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2793 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Kquant03/L3-Pneuma-8B/243d5ccd-58f3-4da5-8718-553f3f456490.json b/data/hfopenllm_v2/Kquant03/L3-Pneuma-8B/243d5ccd-58f3-4da5-8718-553f3f456490.json deleted file mode 100644 index 408f75c77..000000000 --- a/data/hfopenllm_v2/Kquant03/L3-Pneuma-8B/243d5ccd-58f3-4da5-8718-553f3f456490.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Kquant03_L3-Pneuma-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "L3-Pneuma-8B", - "id": "Kquant03/L3-Pneuma-8B", - "developer": "Kquant03", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on 
IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2374 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4955 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0506 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.307 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4172 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3184 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Krystalan/DRT-o1-14B/a45537a7-76a6-4855-b83b-abe965f13460.json b/data/hfopenllm_v2/Krystalan/DRT-o1-14B/a45537a7-76a6-4855-b83b-abe965f13460.json deleted file mode 100644 index 970337ee9..000000000 --- a/data/hfopenllm_v2/Krystalan/DRT-o1-14B/a45537a7-76a6-4855-b83b-abe965f13460.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Krystalan_DRT-o1-14B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "DRT-o1-14B", - "id": "Krystalan/DRT-o1-14B", - "developer": "Krystalan", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4068 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, 
- "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6379 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4826 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3523 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4795 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5179 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Krystalan/DRT-o1-7B/9be911b6-b9f4-47b1-849d-62eb20c9e944.json b/data/hfopenllm_v2/Krystalan/DRT-o1-7B/9be911b6-b9f4-47b1-849d-62eb20c9e944.json deleted file mode 100644 index e93c9a10d..000000000 --- a/data/hfopenllm_v2/Krystalan/DRT-o1-7B/9be911b6-b9f4-47b1-849d-62eb20c9e944.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Krystalan_DRT-o1-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "DRT-o1-7B", - "id": "Krystalan/DRT-o1-7B", - "developer": "Krystalan", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3928 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5468 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 
5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4479 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3213 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5087 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4151 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Kukedlc/NeuralExperiment-7b-MagicCoder-v7.5/33d7d5f0-cbee-4a26-b5e8-48bdd12492cf.json b/data/hfopenllm_v2/Kukedlc/NeuralExperiment-7b-MagicCoder-v7.5/33d7d5f0-cbee-4a26-b5e8-48bdd12492cf.json deleted file mode 100644 index 0cf70abae..000000000 --- a/data/hfopenllm_v2/Kukedlc/NeuralExperiment-7b-MagicCoder-v7.5/33d7d5f0-cbee-4a26-b5e8-48bdd12492cf.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Kukedlc_NeuralExperiment-7b-MagicCoder-v7.5/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "NeuralExperiment-7b-MagicCoder-v7.5", - "id": "Kukedlc/NeuralExperiment-7b-MagicCoder-v7.5", - "developer": "Kukedlc", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4553 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3988 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH 
Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0665 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2961 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4282 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2824 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Kukedlc/NeuralLLaMa-3-8b-DT-v0.1/4355fbdd-ac72-4f26-8e07-b7e8d774d238.json b/data/hfopenllm_v2/Kukedlc/NeuralLLaMa-3-8b-DT-v0.1/4355fbdd-ac72-4f26-8e07-b7e8d774d238.json deleted file mode 100644 index cdfa1a3cb..000000000 --- a/data/hfopenllm_v2/Kukedlc/NeuralLLaMa-3-8b-DT-v0.1/4355fbdd-ac72-4f26-8e07-b7e8d774d238.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Kukedlc_NeuralLLaMa-3-8b-DT-v0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "NeuralLLaMa-3-8b-DT-v0.1", - "id": "Kukedlc/NeuralLLaMa-3-8b-DT-v0.1", - "developer": "Kukedlc", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4371 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4987 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0808 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": 
"GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3029 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4071 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3792 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Kukedlc/NeuralLLaMa-3-8b-ORPO-v0.3/4bffc633-e20c-4874-b7db-d1b7dabb8070.json b/data/hfopenllm_v2/Kukedlc/NeuralLLaMa-3-8b-ORPO-v0.3/4bffc633-e20c-4874-b7db-d1b7dabb8070.json deleted file mode 100644 index 76e148b2f..000000000 --- a/data/hfopenllm_v2/Kukedlc/NeuralLLaMa-3-8b-ORPO-v0.3/4bffc633-e20c-4874-b7db-d1b7dabb8070.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Kukedlc_NeuralLLaMa-3-8b-ORPO-v0.3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "NeuralLLaMa-3-8b-ORPO-v0.3", - "id": "Kukedlc/NeuralLLaMa-3-8b-ORPO-v0.3", - "developer": "Kukedlc", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5276 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4557 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0483 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 
0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2391 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.37 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3057 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Kukedlc/NeuralSynthesis-7B-v0.1/2d5c844d-d950-4254-bac2-0a986659c541.json b/data/hfopenllm_v2/Kukedlc/NeuralSynthesis-7B-v0.1/2d5c844d-d950-4254-bac2-0a986659c541.json deleted file mode 100644 index 79df21f7a..000000000 --- a/data/hfopenllm_v2/Kukedlc/NeuralSynthesis-7B-v0.1/2d5c844d-d950-4254-bac2-0a986659c541.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Kukedlc_NeuralSynthesis-7B-v0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "NeuralSynthesis-7B-v0.1", - "id": "Kukedlc/NeuralSynthesis-7B-v0.1", - "developer": "Kukedlc", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4185 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5145 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0634 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.281 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4333 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3049 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Kukedlc/NeuralSynthesis-7B-v0.3/f6e74b3c-9ee4-40c3-bf92-35d965503a04.json b/data/hfopenllm_v2/Kukedlc/NeuralSynthesis-7B-v0.3/f6e74b3c-9ee4-40c3-bf92-35d965503a04.json deleted file mode 100644 index 869b589d4..000000000 --- a/data/hfopenllm_v2/Kukedlc/NeuralSynthesis-7B-v0.3/f6e74b3c-9ee4-40c3-bf92-35d965503a04.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Kukedlc_NeuralSynthesis-7B-v0.3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "NeuralSynthesis-7B-v0.3", - "id": "Kukedlc/NeuralSynthesis-7B-v0.3", - "developer": "Kukedlc", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4078 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5138 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0778 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2802 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4346 - } - }, - { - 
"evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.305 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Kukedlc/NeuralSynthesis-7b-v0.4-slerp/8f1d2600-7347-48b8-9759-11570598459d.json b/data/hfopenllm_v2/Kukedlc/NeuralSynthesis-7b-v0.4-slerp/8f1d2600-7347-48b8-9759-11570598459d.json deleted file mode 100644 index 950dab5d0..000000000 --- a/data/hfopenllm_v2/Kukedlc/NeuralSynthesis-7b-v0.4-slerp/8f1d2600-7347-48b8-9759-11570598459d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Kukedlc_NeuralSynthesis-7b-v0.4-slerp/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "NeuralSynthesis-7b-v0.4-slerp", - "id": "Kukedlc/NeuralSynthesis-7b-v0.4-slerp", - "developer": "Kukedlc", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3947 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5143 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0627 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2777 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4332 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": 
"Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3043 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Kukedlc/Qwen-2.5-7b-Spanish-o1-CoT/cd653bfd-2c06-4224-aeeb-bf591995a69e.json b/data/hfopenllm_v2/Kukedlc/Qwen-2.5-7b-Spanish-o1-CoT/cd653bfd-2c06-4224-aeeb-bf591995a69e.json deleted file mode 100644 index 21324748e..000000000 --- a/data/hfopenllm_v2/Kukedlc/Qwen-2.5-7b-Spanish-o1-CoT/cd653bfd-2c06-4224-aeeb-bf591995a69e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Kukedlc_Qwen-2.5-7b-Spanish-o1-CoT/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen-2.5-7b-Spanish-o1-CoT", - "id": "Kukedlc/Qwen-2.5-7b-Spanish-o1-CoT", - "developer": "Kukedlc", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.421 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5602 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2727 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3205 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4777 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4363 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/Kumar955/Hemanth-llm/cdf1fcc7-429d-44bd-b76c-d26ee743f6fe.json b/data/hfopenllm_v2/Kumar955/Hemanth-llm/cdf1fcc7-429d-44bd-b76c-d26ee743f6fe.json deleted file mode 100644 index c93b19301..000000000 --- a/data/hfopenllm_v2/Kumar955/Hemanth-llm/cdf1fcc7-429d-44bd-b76c-d26ee743f6fe.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Kumar955_Hemanth-llm/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Hemanth-llm", - "id": "Kumar955/Hemanth-llm", - "developer": "Kumar955", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5045 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5225 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0702 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2827 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4486 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3113 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/L-RAGE/3_PRYMMAL-ECE-7B-SLERP-V1/4828bd36-5453-4383-8985-08d04a7ebecd.json b/data/hfopenllm_v2/L-RAGE/3_PRYMMAL-ECE-7B-SLERP-V1/4828bd36-5453-4383-8985-08d04a7ebecd.json deleted file mode 100644 index 22101047a..000000000 --- 
a/data/hfopenllm_v2/L-RAGE/3_PRYMMAL-ECE-7B-SLERP-V1/4828bd36-5453-4383-8985-08d04a7ebecd.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/L-RAGE_3_PRYMMAL-ECE-7B-SLERP-V1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "3_PRYMMAL-ECE-7B-SLERP-V1", - "id": "L-RAGE/3_PRYMMAL-ECE-7B-SLERP-V1", - "developer": "L-RAGE", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.777 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2742 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4228 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.108 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2819 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3841 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2925 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/LEESM/llama-2-7b-hf-lora-oki100p/4c2baa59-c2f1-4779-9d21-1f69c0821968.json b/data/hfopenllm_v2/LEESM/llama-2-7b-hf-lora-oki100p/4c2baa59-c2f1-4779-9d21-1f69c0821968.json deleted file mode 100644 index 6b1aef1c0..000000000 --- a/data/hfopenllm_v2/LEESM/llama-2-7b-hf-lora-oki100p/4c2baa59-c2f1-4779-9d21-1f69c0821968.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/LEESM_llama-2-7b-hf-lora-oki100p/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "llama-2-7b-hf-lora-oki100p", - "id": "LEESM/llama-2-7b-hf-lora-oki100p", - "developer": "LEESM", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 6.738 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2513 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3492 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0166 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2693 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3687 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1856 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/LEESM/llama-2-7b-hf-lora-oki10p/555c1079-c4d0-4b9e-9d2d-769e7ba32429.json b/data/hfopenllm_v2/LEESM/llama-2-7b-hf-lora-oki10p/555c1079-c4d0-4b9e-9d2d-769e7ba32429.json deleted file mode 100644 index a80706c7c..000000000 --- a/data/hfopenllm_v2/LEESM/llama-2-7b-hf-lora-oki10p/555c1079-c4d0-4b9e-9d2d-769e7ba32429.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/LEESM_llama-2-7b-hf-lora-oki10p/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": 
"Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "llama-2-7b-hf-lora-oki10p", - "id": "LEESM/llama-2-7b-hf-lora-oki10p", - "developer": "LEESM", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 6.738 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.227 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3531 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0166 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2542 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3475 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1679 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/LEESM/llama-3-8b-bnb-4b-kowiki231101/58a4a1c6-0ee4-4524-9ca1-b40870f1d600.json b/data/hfopenllm_v2/LEESM/llama-3-8b-bnb-4b-kowiki231101/58a4a1c6-0ee4-4524-9ca1-b40870f1d600.json deleted file mode 100644 index 87aab7f15..000000000 --- a/data/hfopenllm_v2/LEESM/llama-3-8b-bnb-4b-kowiki231101/58a4a1c6-0ee4-4524-9ca1-b40870f1d600.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/LEESM_llama-3-8b-bnb-4b-kowiki231101/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "llama-3-8b-bnb-4b-kowiki231101", - "id": "LEESM/llama-3-8b-bnb-4b-kowiki231101", - "developer": "LEESM", - 
"inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1685 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4131 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0136 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.271 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3551 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2425 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/LEESM/llama-3-Korean-Bllossom-8B-trexlab-oki10p/eea2a38a-4f1b-48d0-894c-09974894f264.json b/data/hfopenllm_v2/LEESM/llama-3-Korean-Bllossom-8B-trexlab-oki10p/eea2a38a-4f1b-48d0-894c-09974894f264.json deleted file mode 100644 index 65d39674d..000000000 --- a/data/hfopenllm_v2/LEESM/llama-3-Korean-Bllossom-8B-trexlab-oki10p/eea2a38a-4f1b-48d0-894c-09974894f264.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/LEESM_llama-3-Korean-Bllossom-8B-trexlab-oki10p/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "llama-3-Korean-Bllossom-8B-trexlab-oki10p", - "id": "LEESM/llama-3-Korean-Bllossom-8B-trexlab-oki10p", - "developer": "LEESM", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - 
"params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2137 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4343 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0468 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2752 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3869 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3177 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct/3d8063ab-0ad5-43e4-83ff-90b46dee766f.json b/data/hfopenllm_v2/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct/3d8063ab-0ad5-43e4-83ff-90b46dee766f.json deleted file mode 100644 index 53d2d42f2..000000000 --- a/data/hfopenllm_v2/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct/3d8063ab-0ad5-43e4-83ff-90b46dee766f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/LGAI-EXAONE_EXAONE-3.0-7.8B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "EXAONE-3.0-7.8B-Instruct", - "id": "LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct", - "developer": "LGAI-EXAONE", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "ExaoneForCausalLM", - "params_billions": 7.8 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, 
- "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7193 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4174 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3044 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2659 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3661 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3577 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/LGAI-EXAONE/EXAONE-3.5-2.4B-Instruct/da5e0284-7c44-42d4-a110-a23880de277f.json b/data/hfopenllm_v2/LGAI-EXAONE/EXAONE-3.5-2.4B-Instruct/da5e0284-7c44-42d4-a110-a23880de277f.json deleted file mode 100644 index ae4eb863d..000000000 --- a/data/hfopenllm_v2/LGAI-EXAONE/EXAONE-3.5-2.4B-Instruct/da5e0284-7c44-42d4-a110-a23880de277f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/LGAI-EXAONE_EXAONE-3.5-2.4B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "EXAONE-3.5-2.4B-Instruct", - "id": "LGAI-EXAONE/EXAONE-3.5-2.4B-Instruct", - "developer": "LGAI-EXAONE", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "ExaoneForCausalLM", - "params_billions": 2.405 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 
0.795 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4092 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3678 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2659 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3661 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.328 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/LGAI-EXAONE/EXAONE-3.5-32B-Instruct/bef017bb-47b1-48e4-93c4-3b222a16af7a.json b/data/hfopenllm_v2/LGAI-EXAONE/EXAONE-3.5-32B-Instruct/bef017bb-47b1-48e4-93c4-3b222a16af7a.json deleted file mode 100644 index 5bbd9c37d..000000000 --- a/data/hfopenllm_v2/LGAI-EXAONE/EXAONE-3.5-32B-Instruct/bef017bb-47b1-48e4-93c4-3b222a16af7a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/LGAI-EXAONE_EXAONE-3.5-32B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "EXAONE-3.5-32B-Instruct", - "id": "LGAI-EXAONE/EXAONE-3.5-32B-Instruct", - "developer": "LGAI-EXAONE", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "ExaoneForCausalLM", - "params_billions": 32.003 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8392 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": 
"Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5761 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5128 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2878 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3807 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4637 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/LGAI-EXAONE/EXAONE-3.5-7.8B-Instruct/401c83b0-b7d2-4987-9e46-f127fdbb595f.json b/data/hfopenllm_v2/LGAI-EXAONE/EXAONE-3.5-7.8B-Instruct/401c83b0-b7d2-4987-9e46-f127fdbb595f.json deleted file mode 100644 index 48cb86a10..000000000 --- a/data/hfopenllm_v2/LGAI-EXAONE/EXAONE-3.5-7.8B-Instruct/401c83b0-b7d2-4987-9e46-f127fdbb595f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/LGAI-EXAONE_EXAONE-3.5-7.8B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "EXAONE-3.5-7.8B-Instruct", - "id": "LGAI-EXAONE/EXAONE-3.5-7.8B-Instruct", - "developer": "LGAI-EXAONE", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "ExaoneForCausalLM", - "params_billions": 7.818 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8136 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4728 - } - }, - { - "evaluation_name": "MATH Level 
5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4751 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2576 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3779 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4133 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/LLM360/K2-Chat/c6fde59b-73ed-4179-a907-076be068b262.json b/data/hfopenllm_v2/LLM360/K2-Chat/c6fde59b-73ed-4179-a907-076be068b262.json deleted file mode 100644 index 6ec8b2598..000000000 --- a/data/hfopenllm_v2/LLM360/K2-Chat/c6fde59b-73ed-4179-a907-076be068b262.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/LLM360_K2-Chat/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "K2-Chat", - "id": "LLM360/K2-Chat", - "developer": "LLM360", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 65.286 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5152 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5358 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.1035 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3062 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.457 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3371 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/LLM360/K2/90997fea-6c67-493e-bd8e-5327cfb33ea4.json b/data/hfopenllm_v2/LLM360/K2/90997fea-6c67-493e-bd8e-5327cfb33ea4.json deleted file mode 100644 index c8a7cd176..000000000 --- a/data/hfopenllm_v2/LLM360/K2/90997fea-6c67-493e-bd8e-5327cfb33ea4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/LLM360_K2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "K2", - "id": "LLM360/K2", - "developer": "LLM360", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 65.286 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2252 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4972 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0272 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.2768 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.398 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3004 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/LLM4Binary/llm4decompile-1.3b-v2/08957d63-7462-44ff-9dd8-060a5801a31b.json b/data/hfopenllm_v2/LLM4Binary/llm4decompile-1.3b-v2/08957d63-7462-44ff-9dd8-060a5801a31b.json deleted file mode 100644 index 85cc8161c..000000000 --- a/data/hfopenllm_v2/LLM4Binary/llm4decompile-1.3b-v2/08957d63-7462-44ff-9dd8-060a5801a31b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/LLM4Binary_llm4decompile-1.3b-v2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "llm4decompile-1.3b-v2", - "id": "LLM4Binary/llm4decompile-1.3b-v2", - "developer": "LLM4Binary", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.346 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2268 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3272 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0128 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2357 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4072 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1209 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Lambent/qwen2.5-reinstruct-alternate-lumen-14B/a434f569-e7d6-4464-afa8-6104be43fa06.json b/data/hfopenllm_v2/Lambent/qwen2.5-reinstruct-alternate-lumen-14B/a434f569-e7d6-4464-afa8-6104be43fa06.json deleted file mode 100644 index 105590f52..000000000 --- a/data/hfopenllm_v2/Lambent/qwen2.5-reinstruct-alternate-lumen-14B/a434f569-e7d6-4464-afa8-6104be43fa06.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Lambent_qwen2.5-reinstruct-alternate-lumen-14B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "qwen2.5-reinstruct-alternate-lumen-14B", - "id": "Lambent/qwen2.5-reinstruct-alternate-lumen-14B", - "developer": "Lambent", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4794 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6459 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4622 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3767 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - 
}, - "score_details": { - "score": 0.477 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5388 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Langboat/Mengzi3-8B-Chat/e32ed251-e817-409f-b4c3-8f168f1ff822.json b/data/hfopenllm_v2/Langboat/Mengzi3-8B-Chat/e32ed251-e817-409f-b4c3-8f168f1ff822.json deleted file mode 100644 index 1c7e4d12f..000000000 --- a/data/hfopenllm_v2/Langboat/Mengzi3-8B-Chat/e32ed251-e817-409f-b4c3-8f168f1ff822.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Langboat_Mengzi3-8B-Chat/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mengzi3-8B-Chat", - "id": "Langboat/Mengzi3-8B-Chat", - "developer": "Langboat", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.514 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4684 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0906 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2743 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4078 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", 
- "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3142 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Lawnakk/BBA100/1d9a65a3-d2bb-48a7-8a00-8e4a79c36db2.json b/data/hfopenllm_v2/Lawnakk/BBA100/1d9a65a3-d2bb-48a7-8a00-8e4a79c36db2.json deleted file mode 100644 index 9cbc0affe..000000000 --- a/data/hfopenllm_v2/Lawnakk/BBA100/1d9a65a3-d2bb-48a7-8a00-8e4a79c36db2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Lawnakk_BBA100/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "BBA100", - "id": "Lawnakk/BBA100", - "developer": "Lawnakk", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2076 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2826 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0098 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2441 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.402 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1122 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Lawnakk/BBALAW1.0/608398da-ae2a-4be2-aaf9-6ec8899aa63d.json 
b/data/hfopenllm_v2/Lawnakk/BBALAW1.0/608398da-ae2a-4be2-aaf9-6ec8899aa63d.json deleted file mode 100644 index 11de02d19..000000000 --- a/data/hfopenllm_v2/Lawnakk/BBALAW1.0/608398da-ae2a-4be2-aaf9-6ec8899aa63d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Lawnakk_BBALAW1.0/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "BBALAW1.0", - "id": "Lawnakk/BBALAW1.0", - "developer": "Lawnakk", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 4.353 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1351 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2828 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2559 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3526 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1128 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Lawnakk/BBALAW1.2/80e04641-be7d-4351-a4f6-1318981ef834.json b/data/hfopenllm_v2/Lawnakk/BBALAW1.2/80e04641-be7d-4351-a4f6-1318981ef834.json deleted file mode 100644 index 0b4de2bab..000000000 --- a/data/hfopenllm_v2/Lawnakk/BBALAW1.2/80e04641-be7d-4351-a4f6-1318981ef834.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/Lawnakk_BBALAW1.2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "BBALAW1.2", - "id": "Lawnakk/BBALAW1.2", - "developer": "Lawnakk", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 4.353 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1354 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2811 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2643 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3579 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1105 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Lawnakk/BBALAW1.3/e74222c6-636c-4075-8d4d-30c73fa70fda.json b/data/hfopenllm_v2/Lawnakk/BBALAW1.3/e74222c6-636c-4075-8d4d-30c73fa70fda.json deleted file mode 100644 index a9204af31..000000000 --- a/data/hfopenllm_v2/Lawnakk/BBALAW1.3/e74222c6-636c-4075-8d4d-30c73fa70fda.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Lawnakk_BBALAW1.3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": 
"BBALAW1.3", - "id": "Lawnakk/BBALAW1.3", - "developer": "Lawnakk", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 4.353 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1354 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2827 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2609 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3619 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1094 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Lawnakk/BBALAW1.6/aed80361-9304-44a0-934a-52976d7f1bf3.json b/data/hfopenllm_v2/Lawnakk/BBALAW1.6/aed80361-9304-44a0-934a-52976d7f1bf3.json deleted file mode 100644 index 02391ea7b..000000000 --- a/data/hfopenllm_v2/Lawnakk/BBALAW1.6/aed80361-9304-44a0-934a-52976d7f1bf3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Lawnakk_BBALAW1.6/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "BBALAW1.6", - "id": "Lawnakk/BBALAW1.6", - "developer": "Lawnakk", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - 
"dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5245 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5554 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3603 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3238 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4368 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4507 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Lawnakk/BBALAW1.61/709bd280-b03e-4908-808f-34566bc968f4.json b/data/hfopenllm_v2/Lawnakk/BBALAW1.61/709bd280-b03e-4908-808f-34566bc968f4.json deleted file mode 100644 index d518940c4..000000000 --- a/data/hfopenllm_v2/Lawnakk/BBALAW1.61/709bd280-b03e-4908-808f-34566bc968f4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Lawnakk_BBALAW1.61/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "BBALAW1.61", - "id": "Lawnakk/BBALAW1.61", - "developer": "Lawnakk", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5771 - } - }, - 
{ - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5549 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3663 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3171 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4355 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4471 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Lawnakk/BBALAW1.62/66c495b3-4b09-42ad-b742-4d753c3bde7a.json b/data/hfopenllm_v2/Lawnakk/BBALAW1.62/66c495b3-4b09-42ad-b742-4d753c3bde7a.json deleted file mode 100644 index 0ca80591d..000000000 --- a/data/hfopenllm_v2/Lawnakk/BBALAW1.62/66c495b3-4b09-42ad-b742-4d753c3bde7a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Lawnakk_BBALAW1.62/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "BBALAW1.62", - "id": "Lawnakk/BBALAW1.62", - "developer": "Lawnakk", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5046 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.5581 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2825 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3196 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4343 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4545 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Lawnakk/BBALAW1.63/e24f7be6-3051-4990-8b93-121aec5402eb.json b/data/hfopenllm_v2/Lawnakk/BBALAW1.63/e24f7be6-3051-4990-8b93-121aec5402eb.json deleted file mode 100644 index cb01d3410..000000000 --- a/data/hfopenllm_v2/Lawnakk/BBALAW1.63/e24f7be6-3051-4990-8b93-121aec5402eb.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Lawnakk_BBALAW1.63/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "BBALAW1.63", - "id": "Lawnakk/BBALAW1.63", - "developer": "Lawnakk", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.613 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4407 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5541 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3701 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3121 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4303 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4471 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Lawnakk/BBALAW1.64/0321571b-4246-4490-bd6c-7b106eb8e15a.json b/data/hfopenllm_v2/Lawnakk/BBALAW1.64/0321571b-4246-4490-bd6c-7b106eb8e15a.json deleted file mode 100644 index 227047148..000000000 --- a/data/hfopenllm_v2/Lawnakk/BBALAW1.64/0321571b-4246-4490-bd6c-7b106eb8e15a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Lawnakk_BBALAW1.64/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "BBALAW1.64", - "id": "Lawnakk/BBALAW1.64", - "developer": "Lawnakk", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1395 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2779 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2483 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3447 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1115 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Lawnakk/BBALAW1/54dbf947-ab18-40dd-9cd7-a496289b2e72.json b/data/hfopenllm_v2/Lawnakk/BBALAW1/54dbf947-ab18-40dd-9cd7-a496289b2e72.json deleted file mode 100644 index f7c7740c8..000000000 --- a/data/hfopenllm_v2/Lawnakk/BBALAW1/54dbf947-ab18-40dd-9cd7-a496289b2e72.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Lawnakk_BBALAW1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "BBALAW1", - "id": "Lawnakk/BBALAW1", - "developer": "Lawnakk", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1905 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2872 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0098 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2433 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": 
"TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4153 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1121 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/LenguajeNaturalAI/leniachat-gemma-2b-v0/d841e204-ed6a-439d-8408-d5cfb3b38dae.json b/data/hfopenllm_v2/LenguajeNaturalAI/leniachat-gemma-2b-v0/d841e204-ed6a-439d-8408-d5cfb3b38dae.json deleted file mode 100644 index 6394ebbd7..000000000 --- a/data/hfopenllm_v2/LenguajeNaturalAI/leniachat-gemma-2b-v0/d841e204-ed6a-439d-8408-d5cfb3b38dae.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/LenguajeNaturalAI_leniachat-gemma-2b-v0/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "leniachat-gemma-2b-v0", - "id": "LenguajeNaturalAI/leniachat-gemma-2b-v0", - "developer": "LenguajeNaturalAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "GemmaForCausalLM", - "params_billions": 2.506 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.215 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3074 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0113 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2659 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - 
}, - "score_details": { - "score": 0.3659 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.117 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/LenguajeNaturalAI/leniachat-qwen2-1.5B-v0/96b57891-83e3-4948-ad48-64a2a370e166.json b/data/hfopenllm_v2/LenguajeNaturalAI/leniachat-qwen2-1.5B-v0/96b57891-83e3-4948-ad48-64a2a370e166.json deleted file mode 100644 index 1f7ec34a2..000000000 --- a/data/hfopenllm_v2/LenguajeNaturalAI/leniachat-qwen2-1.5B-v0/96b57891-83e3-4948-ad48-64a2a370e166.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/LenguajeNaturalAI_leniachat-qwen2-1.5B-v0/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "leniachat-qwen2-1.5B-v0", - "id": "LenguajeNaturalAI/leniachat-qwen2-1.5B-v0", - "developer": "LenguajeNaturalAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.543 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2221 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3684 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0128 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2617 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.375 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - 
"hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.188 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/LeroyDyer/CheckPoint_A/30301818-6dad-45f9-acfb-a68ccc7c0609.json b/data/hfopenllm_v2/LeroyDyer/CheckPoint_A/30301818-6dad-45f9-acfb-a68ccc7c0609.json deleted file mode 100644 index 009c8048b..000000000 --- a/data/hfopenllm_v2/LeroyDyer/CheckPoint_A/30301818-6dad-45f9-acfb-a68ccc7c0609.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/LeroyDyer_CheckPoint_A/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "CheckPoint_A", - "id": "LeroyDyer/CheckPoint_A", - "developer": "LeroyDyer", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4513 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4748 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0589 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2836 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4231 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.288 - } - } - ] -} \ No newline at end of 
file diff --git a/data/hfopenllm_v2/LeroyDyer/CheckPoint_B/50743107-30de-4c5d-bf83-cc003af8a5db.json b/data/hfopenllm_v2/LeroyDyer/CheckPoint_B/50743107-30de-4c5d-bf83-cc003af8a5db.json deleted file mode 100644 index c87445cfa..000000000 --- a/data/hfopenllm_v2/LeroyDyer/CheckPoint_B/50743107-30de-4c5d-bf83-cc003af8a5db.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/LeroyDyer_CheckPoint_B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "CheckPoint_B", - "id": "LeroyDyer/CheckPoint_B", - "developer": "LeroyDyer", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.444 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.478 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0718 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2903 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3898 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2907 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/LeroyDyer/CheckPoint_C/625ee1b3-e0a1-4a86-83a4-6e66b380f864.json b/data/hfopenllm_v2/LeroyDyer/CheckPoint_C/625ee1b3-e0a1-4a86-83a4-6e66b380f864.json deleted file mode 100644 index 27af86b1d..000000000 --- 
a/data/hfopenllm_v2/LeroyDyer/CheckPoint_C/625ee1b3-e0a1-4a86-83a4-6e66b380f864.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/LeroyDyer_CheckPoint_C/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "CheckPoint_C", - "id": "LeroyDyer/CheckPoint_C", - "developer": "LeroyDyer", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3477 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4586 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0551 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.271 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4346 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3021 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/LeroyDyer/CheckPoint_R1/89fda762-1989-4850-837c-f79ef538c58c.json b/data/hfopenllm_v2/LeroyDyer/CheckPoint_R1/89fda762-1989-4850-837c-f79ef538c58c.json deleted file mode 100644 index 5131c24dd..000000000 --- a/data/hfopenllm_v2/LeroyDyer/CheckPoint_R1/89fda762-1989-4850-837c-f79ef538c58c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/LeroyDyer_CheckPoint_R1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - 
"source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "CheckPoint_R1", - "id": "LeroyDyer/CheckPoint_R1", - "developer": "LeroyDyer", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1728 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4225 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0431 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2743 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4031 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2205 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/LeroyDyer/LCARS_AI_001/1de1f906-0e36-4f79-b159-16ef8ee33ab3.json b/data/hfopenllm_v2/LeroyDyer/LCARS_AI_001/1de1f906-0e36-4f79-b159-16ef8ee33ab3.json deleted file mode 100644 index f13e5d9d1..000000000 --- a/data/hfopenllm_v2/LeroyDyer/LCARS_AI_001/1de1f906-0e36-4f79-b159-16ef8ee33ab3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/LeroyDyer_LCARS_AI_001/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "LCARS_AI_001", - "id": "LeroyDyer/LCARS_AI_001", - "developer": 
"LeroyDyer", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3109 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4258 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0234 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2634 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4384 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.267 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/LeroyDyer/LCARS_AI_1x4_003_SuperAI/d8588222-9e4b-47c1-9f86-92f47c9c8e38.json b/data/hfopenllm_v2/LeroyDyer/LCARS_AI_1x4_003_SuperAI/d8588222-9e4b-47c1-9f86-92f47c9c8e38.json deleted file mode 100644 index a513ad18e..000000000 --- a/data/hfopenllm_v2/LeroyDyer/LCARS_AI_1x4_003_SuperAI/d8588222-9e4b-47c1-9f86-92f47c9c8e38.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/LeroyDyer_LCARS_AI_1x4_003_SuperAI/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "LCARS_AI_1x4_003_SuperAI", - "id": "LeroyDyer/LCARS_AI_1x4_003_SuperAI", - "developer": "LeroyDyer", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MixtralForCausalLM", - "params_billions": 24.154 - } - }, - "evaluation_results": [ - { - 
"evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4111 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.492 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0574 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2827 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4506 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2972 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/LeroyDyer/LCARS_AI_StarTrek_Computer/15e6e6e6-39fa-424f-ba12-5f209cd4b2cc.json b/data/hfopenllm_v2/LeroyDyer/LCARS_AI_StarTrek_Computer/15e6e6e6-39fa-424f-ba12-5f209cd4b2cc.json deleted file mode 100644 index c31ece219..000000000 --- a/data/hfopenllm_v2/LeroyDyer/LCARS_AI_StarTrek_Computer/15e6e6e6-39fa-424f-ba12-5f209cd4b2cc.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/LeroyDyer_LCARS_AI_StarTrek_Computer/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "LCARS_AI_StarTrek_Computer", - "id": "LeroyDyer/LCARS_AI_StarTrek_Computer", - "developer": "LeroyDyer", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on 
IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3583 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4446 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0408 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2676 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.395 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2458 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/LeroyDyer/LCARS_TOP_SCORE/81225b85-1523-49c1-b770-897112d2e6ae.json b/data/hfopenllm_v2/LeroyDyer/LCARS_TOP_SCORE/81225b85-1523-49c1-b770-897112d2e6ae.json deleted file mode 100644 index 6e727e47a..000000000 --- a/data/hfopenllm_v2/LeroyDyer/LCARS_TOP_SCORE/81225b85-1523-49c1-b770-897112d2e6ae.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/LeroyDyer_LCARS_TOP_SCORE/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "LCARS_TOP_SCORE", - "id": "LeroyDyer/LCARS_TOP_SCORE", - "developer": "LeroyDyer", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4371 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - 
"hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5127 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0672 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2861 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4293 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3031 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/LeroyDyer/Mixtral_AI_SwahiliTron_7b/254deaf7-a253-4d41-a10d-1143f86b288c.json b/data/hfopenllm_v2/LeroyDyer/Mixtral_AI_SwahiliTron_7b/254deaf7-a253-4d41-a10d-1143f86b288c.json deleted file mode 100644 index 3ffc415fc..000000000 --- a/data/hfopenllm_v2/LeroyDyer/Mixtral_AI_SwahiliTron_7b/254deaf7-a253-4d41-a10d-1143f86b288c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/LeroyDyer_Mixtral_AI_SwahiliTron_7b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mixtral_AI_SwahiliTron_7b", - "id": "LeroyDyer/Mixtral_AI_SwahiliTron_7b", - "developer": "LeroyDyer", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1534 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.3055 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0136 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2651 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.342 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1208 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/LeroyDyer/SpydazWebAI_Human_AGI/ba0b66f5-724a-4a6b-ac20-a36d530a8b4b.json b/data/hfopenllm_v2/LeroyDyer/SpydazWebAI_Human_AGI/ba0b66f5-724a-4a6b-ac20-a36d530a8b4b.json deleted file mode 100644 index 93ca501b0..000000000 --- a/data/hfopenllm_v2/LeroyDyer/SpydazWebAI_Human_AGI/ba0b66f5-724a-4a6b-ac20-a36d530a8b4b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/LeroyDyer_SpydazWebAI_Human_AGI/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SpydazWebAI_Human_AGI", - "id": "LeroyDyer/SpydazWebAI_Human_AGI", - "developer": "LeroyDyer", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3388 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3375 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - 
"metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0144 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2827 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3966 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1479 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/LeroyDyer/SpydazWebAI_Human_AGI_001/eed0b3b4-e277-49ee-aed5-f3599b2d5653.json b/data/hfopenllm_v2/LeroyDyer/SpydazWebAI_Human_AGI_001/eed0b3b4-e277-49ee-aed5-f3599b2d5653.json deleted file mode 100644 index 03d018a11..000000000 --- a/data/hfopenllm_v2/LeroyDyer/SpydazWebAI_Human_AGI_001/eed0b3b4-e277-49ee-aed5-f3599b2d5653.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/LeroyDyer_SpydazWebAI_Human_AGI_001/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SpydazWebAI_Human_AGI_001", - "id": "LeroyDyer/SpydazWebAI_Human_AGI_001", - "developer": "LeroyDyer", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3118 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3433 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.0196 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2987 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3994 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1426 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/LeroyDyer/SpydazWeb_AI_CyberTron_Ultra_7b/96a21b6e-ed47-40fb-85cd-15924330e60d.json b/data/hfopenllm_v2/LeroyDyer/SpydazWeb_AI_CyberTron_Ultra_7b/96a21b6e-ed47-40fb-85cd-15924330e60d.json deleted file mode 100644 index faca52fe7..000000000 --- a/data/hfopenllm_v2/LeroyDyer/SpydazWeb_AI_CyberTron_Ultra_7b/96a21b6e-ed47-40fb-85cd-15924330e60d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/LeroyDyer_SpydazWeb_AI_CyberTron_Ultra_7b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SpydazWeb_AI_CyberTron_Ultra_7b", - "id": "LeroyDyer/SpydazWeb_AI_CyberTron_Ultra_7b", - "developer": "LeroyDyer", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1556 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4811 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0136 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2928 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4136 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2866 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/LeroyDyer/SpydazWeb_AI_HumanAGI_001_M2/f41f5471-6384-4510-85d2-41f236082583.json b/data/hfopenllm_v2/LeroyDyer/SpydazWeb_AI_HumanAGI_001_M2/f41f5471-6384-4510-85d2-41f236082583.json deleted file mode 100644 index 0b361a56c..000000000 --- a/data/hfopenllm_v2/LeroyDyer/SpydazWeb_AI_HumanAGI_001_M2/f41f5471-6384-4510-85d2-41f236082583.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/LeroyDyer_SpydazWeb_AI_HumanAGI_001_M2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SpydazWeb_AI_HumanAGI_001_M2", - "id": "LeroyDyer/SpydazWeb_AI_HumanAGI_001_M2", - "developer": "LeroyDyer", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.394 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4888 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0385 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.2894 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4503 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3005 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/LeroyDyer/SpydazWeb_AI_HumanAGI_002/2728eccc-525f-4350-901b-dbc352c78014.json b/data/hfopenllm_v2/LeroyDyer/SpydazWeb_AI_HumanAGI_002/2728eccc-525f-4350-901b-dbc352c78014.json deleted file mode 100644 index 0eb676741..000000000 --- a/data/hfopenllm_v2/LeroyDyer/SpydazWeb_AI_HumanAGI_002/2728eccc-525f-4350-901b-dbc352c78014.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/LeroyDyer_SpydazWeb_AI_HumanAGI_002/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SpydazWeb_AI_HumanAGI_002", - "id": "LeroyDyer/SpydazWeb_AI_HumanAGI_002", - "developer": "LeroyDyer", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4088 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5044 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0665 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2869 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4865 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3059 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/LeroyDyer/SpydazWeb_AI_HumanAI_001/3e7ae935-46c3-427c-8713-41c659c1828a.json b/data/hfopenllm_v2/LeroyDyer/SpydazWeb_AI_HumanAI_001/3e7ae935-46c3-427c-8713-41c659c1828a.json deleted file mode 100644 index f6494cb5b..000000000 --- a/data/hfopenllm_v2/LeroyDyer/SpydazWeb_AI_HumanAI_001/3e7ae935-46c3-427c-8713-41c659c1828a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/LeroyDyer_SpydazWeb_AI_HumanAI_001/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SpydazWeb_AI_HumanAI_001", - "id": "LeroyDyer/SpydazWeb_AI_HumanAI_001", - "developer": "LeroyDyer", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2252 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3344 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0166 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2886 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.386 - } - }, - { - 
"evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1271 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/LeroyDyer/SpydazWeb_AI_HumanAI_006/66782676-c942-4aff-b754-b96cd96cf1f9.json b/data/hfopenllm_v2/LeroyDyer/SpydazWeb_AI_HumanAI_006/66782676-c942-4aff-b754-b96cd96cf1f9.json deleted file mode 100644 index 9408883d4..000000000 --- a/data/hfopenllm_v2/LeroyDyer/SpydazWeb_AI_HumanAI_006/66782676-c942-4aff-b754-b96cd96cf1f9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/LeroyDyer_SpydazWeb_AI_HumanAI_006/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SpydazWeb_AI_HumanAI_006", - "id": "LeroyDyer/SpydazWeb_AI_HumanAI_006", - "developer": "LeroyDyer", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.143 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3302 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0106 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2802 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3568 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on 
MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1135 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/LeroyDyer/SpydazWeb_AI_HumanAI_007/941a9e27-2ac4-4dab-a6d0-cb9319c79a27.json b/data/hfopenllm_v2/LeroyDyer/SpydazWeb_AI_HumanAI_007/941a9e27-2ac4-4dab-a6d0-cb9319c79a27.json deleted file mode 100644 index fd4a7fc65..000000000 --- a/data/hfopenllm_v2/LeroyDyer/SpydazWeb_AI_HumanAI_007/941a9e27-2ac4-4dab-a6d0-cb9319c79a27.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/LeroyDyer_SpydazWeb_AI_HumanAI_007/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SpydazWeb_AI_HumanAI_007", - "id": "LeroyDyer/SpydazWeb_AI_HumanAI_007", - "developer": "LeroyDyer", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3352 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3416 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0227 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2886 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4096 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1352 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/LeroyDyer/SpydazWeb_AI_HumanAI_009_CHAT/caf93f75-530e-4f4d-9cc0-2cf9b0a7f2ff.json b/data/hfopenllm_v2/LeroyDyer/SpydazWeb_AI_HumanAI_009_CHAT/caf93f75-530e-4f4d-9cc0-2cf9b0a7f2ff.json deleted file mode 100644 index 67f16d3b2..000000000 --- a/data/hfopenllm_v2/LeroyDyer/SpydazWeb_AI_HumanAI_009_CHAT/caf93f75-530e-4f4d-9cc0-2cf9b0a7f2ff.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/LeroyDyer_SpydazWeb_AI_HumanAI_009_CHAT/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SpydazWeb_AI_HumanAI_009_CHAT", - "id": "LeroyDyer/SpydazWeb_AI_HumanAI_009_CHAT", - "developer": "LeroyDyer", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2973 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3307 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0166 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.281 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4138 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1433 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/LeroyDyer/SpydazWeb_AI_HumanAI_010_CHAT/d3ca0458-ee97-4a4c-a6a9-066880ffefb5.json 
b/data/hfopenllm_v2/LeroyDyer/SpydazWeb_AI_HumanAI_010_CHAT/d3ca0458-ee97-4a4c-a6a9-066880ffefb5.json deleted file mode 100644 index 3fc4a754a..000000000 --- a/data/hfopenllm_v2/LeroyDyer/SpydazWeb_AI_HumanAI_010_CHAT/d3ca0458-ee97-4a4c-a6a9-066880ffefb5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/LeroyDyer_SpydazWeb_AI_HumanAI_010_CHAT/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SpydazWeb_AI_HumanAI_010_CHAT", - "id": "LeroyDyer/SpydazWeb_AI_HumanAI_010_CHAT", - "developer": "LeroyDyer", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2507 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3336 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0181 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2592 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4137 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.143 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/LeroyDyer/SpydazWeb_AI_HumanAI_011_INSTRUCT/615bf89b-9357-46f4-82ed-f49b0021da01.json b/data/hfopenllm_v2/LeroyDyer/SpydazWeb_AI_HumanAI_011_INSTRUCT/615bf89b-9357-46f4-82ed-f49b0021da01.json deleted file mode 100644 index 877f891f4..000000000 --- 
a/data/hfopenllm_v2/LeroyDyer/SpydazWeb_AI_HumanAI_011_INSTRUCT/615bf89b-9357-46f4-82ed-f49b0021da01.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/LeroyDyer_SpydazWeb_AI_HumanAI_011_INSTRUCT/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SpydazWeb_AI_HumanAI_011_INSTRUCT", - "id": "LeroyDyer/SpydazWeb_AI_HumanAI_011_INSTRUCT", - "developer": "LeroyDyer", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3149 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3523 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0144 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2794 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3831 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1595 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/LeroyDyer/SpydazWeb_AI_HumanAI_011_INSTRUCT_ML/06398630-23ad-4000-8ea2-fcca230568d7.json b/data/hfopenllm_v2/LeroyDyer/SpydazWeb_AI_HumanAI_011_INSTRUCT_ML/06398630-23ad-4000-8ea2-fcca230568d7.json deleted file mode 100644 index 5534cf616..000000000 --- a/data/hfopenllm_v2/LeroyDyer/SpydazWeb_AI_HumanAI_011_INSTRUCT_ML/06398630-23ad-4000-8ea2-fcca230568d7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - 
"schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/LeroyDyer_SpydazWeb_AI_HumanAI_011_INSTRUCT_ML/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SpydazWeb_AI_HumanAI_011_INSTRUCT_ML", - "id": "LeroyDyer/SpydazWeb_AI_HumanAI_011_INSTRUCT_ML", - "developer": "LeroyDyer", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3752 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3984 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0257 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2928 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4239 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2019 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/LeroyDyer/SpydazWeb_AI_HumanAI_011_INSTRUCT_ML_r1/bdfa30f8-da0f-418f-adaf-caafda4c81a5.json b/data/hfopenllm_v2/LeroyDyer/SpydazWeb_AI_HumanAI_011_INSTRUCT_ML_r1/bdfa30f8-da0f-418f-adaf-caafda4c81a5.json deleted file mode 100644 index 9e0ca9e6d..000000000 --- a/data/hfopenllm_v2/LeroyDyer/SpydazWeb_AI_HumanAI_011_INSTRUCT_ML_r1/bdfa30f8-da0f-418f-adaf-caafda4c81a5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/LeroyDyer_SpydazWeb_AI_HumanAI_011_INSTRUCT_ML_r1/1770682486.623709", - 
"retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SpydazWeb_AI_HumanAI_011_INSTRUCT_ML_r1", - "id": "LeroyDyer/SpydazWeb_AI_HumanAI_011_INSTRUCT_ML_r1", - "developer": "LeroyDyer", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.405 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4858 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0551 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2928 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3921 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2956 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/LeroyDyer/SpydazWeb_AI_HumanAI_012_INSTRUCT_IA/bd5e550c-5355-4e01-bafc-2ca89899253a.json b/data/hfopenllm_v2/LeroyDyer/SpydazWeb_AI_HumanAI_012_INSTRUCT_IA/bd5e550c-5355-4e01-bafc-2ca89899253a.json deleted file mode 100644 index 05508c5e5..000000000 --- a/data/hfopenllm_v2/LeroyDyer/SpydazWeb_AI_HumanAI_012_INSTRUCT_IA/bd5e550c-5355-4e01-bafc-2ca89899253a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/LeroyDyer_SpydazWeb_AI_HumanAI_012_INSTRUCT_IA/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - 
"source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SpydazWeb_AI_HumanAI_012_INSTRUCT_IA", - "id": "LeroyDyer/SpydazWeb_AI_HumanAI_012_INSTRUCT_IA", - "developer": "LeroyDyer", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3066 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4577 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0446 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2995 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4254 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2318 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/LeroyDyer/SpydazWeb_AI_HumanAI_012_INSTRUCT_IA/f842ad5b-24f0-419b-9d65-5a6ff1f5e04b.json b/data/hfopenllm_v2/LeroyDyer/SpydazWeb_AI_HumanAI_012_INSTRUCT_IA/f842ad5b-24f0-419b-9d65-5a6ff1f5e04b.json deleted file mode 100644 index 6bafa117f..000000000 --- a/data/hfopenllm_v2/LeroyDyer/SpydazWeb_AI_HumanAI_012_INSTRUCT_IA/f842ad5b-24f0-419b-9d65-5a6ff1f5e04b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/LeroyDyer_SpydazWeb_AI_HumanAI_012_INSTRUCT_IA/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": 
"SpydazWeb_AI_HumanAI_012_INSTRUCT_IA", - "id": "LeroyDyer/SpydazWeb_AI_HumanAI_012_INSTRUCT_IA", - "developer": "LeroyDyer", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3036 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4575 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0446 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3012 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4253 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2329 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/LeroyDyer/SpydazWeb_AI_HumanAI_012_INSTRUCT_MX/3a09590f-28f3-4161-8a93-d42cec62aa90.json b/data/hfopenllm_v2/LeroyDyer/SpydazWeb_AI_HumanAI_012_INSTRUCT_MX/3a09590f-28f3-4161-8a93-d42cec62aa90.json deleted file mode 100644 index fad8789d2..000000000 --- a/data/hfopenllm_v2/LeroyDyer/SpydazWeb_AI_HumanAI_012_INSTRUCT_MX/3a09590f-28f3-4161-8a93-d42cec62aa90.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/LeroyDyer_SpydazWeb_AI_HumanAI_012_INSTRUCT_MX/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SpydazWeb_AI_HumanAI_012_INSTRUCT_MX", - "id": "LeroyDyer/SpydazWeb_AI_HumanAI_012_INSTRUCT_MX", - "developer": "LeroyDyer", - 
"inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3066 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3158 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0151 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2911 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3444 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1107 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/LeroyDyer/SpydazWeb_AI_HumanAI_012_INSTRUCT_XA/0f6b76ca-c4b8-40b2-a3af-2ea1c3650933.json b/data/hfopenllm_v2/LeroyDyer/SpydazWeb_AI_HumanAI_012_INSTRUCT_XA/0f6b76ca-c4b8-40b2-a3af-2ea1c3650933.json deleted file mode 100644 index 33826b74e..000000000 --- a/data/hfopenllm_v2/LeroyDyer/SpydazWeb_AI_HumanAI_012_INSTRUCT_XA/0f6b76ca-c4b8-40b2-a3af-2ea1c3650933.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/LeroyDyer_SpydazWeb_AI_HumanAI_012_INSTRUCT_XA/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SpydazWeb_AI_HumanAI_012_INSTRUCT_XA", - "id": "LeroyDyer/SpydazWeb_AI_HumanAI_012_INSTRUCT_XA", - "developer": "LeroyDyer", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - 
"params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3579 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4477 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0423 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3096 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4134 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2376 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/LeroyDyer/SpydazWeb_AI_HumanAI_012_INSTRUCT_XA/f276ad54-4e3b-4718-ae1f-0479565e4565.json b/data/hfopenllm_v2/LeroyDyer/SpydazWeb_AI_HumanAI_012_INSTRUCT_XA/f276ad54-4e3b-4718-ae1f-0479565e4565.json deleted file mode 100644 index 4d6c638c9..000000000 --- a/data/hfopenllm_v2/LeroyDyer/SpydazWeb_AI_HumanAI_012_INSTRUCT_XA/f276ad54-4e3b-4718-ae1f-0479565e4565.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/LeroyDyer_SpydazWeb_AI_HumanAI_012_INSTRUCT_XA/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SpydazWeb_AI_HumanAI_012_INSTRUCT_XA", - "id": "LeroyDyer/SpydazWeb_AI_HumanAI_012_INSTRUCT_XA", - "developer": "LeroyDyer", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - 
"source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3798 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4483 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.04 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3129 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4148 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2389 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/LeroyDyer/SpydazWeb_AI_HumanAI_RP/dec20396-6555-4773-bf02-2cd1fcedda89.json b/data/hfopenllm_v2/LeroyDyer/SpydazWeb_AI_HumanAI_RP/dec20396-6555-4773-bf02-2cd1fcedda89.json deleted file mode 100644 index 35280576b..000000000 --- a/data/hfopenllm_v2/LeroyDyer/SpydazWeb_AI_HumanAI_RP/dec20396-6555-4773-bf02-2cd1fcedda89.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/LeroyDyer_SpydazWeb_AI_HumanAI_RP/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SpydazWeb_AI_HumanAI_RP", - "id": "LeroyDyer/SpydazWeb_AI_HumanAI_RP", - "developer": "LeroyDyer", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.2541 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3323 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0128 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2752 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3883 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1324 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/LeroyDyer/SpydazWeb_AI_HumanAI_TextVision/eebc33e1-0016-4adf-815a-72653a34c01b.json b/data/hfopenllm_v2/LeroyDyer/SpydazWeb_AI_HumanAI_TextVision/eebc33e1-0016-4adf-815a-72653a34c01b.json deleted file mode 100644 index b478f0a27..000000000 --- a/data/hfopenllm_v2/LeroyDyer/SpydazWeb_AI_HumanAI_TextVision/eebc33e1-0016-4adf-815a-72653a34c01b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/LeroyDyer_SpydazWeb_AI_HumanAI_TextVision/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SpydazWeb_AI_HumanAI_TextVision", - "id": "LeroyDyer/SpydazWeb_AI_HumanAI_TextVision", - "developer": "LeroyDyer", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3063 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": 
"hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3354 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0144 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2919 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3938 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1387 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/LeroyDyer/SpydazWeb_HumanAI_M1/803c3898-c1a6-4832-ac3a-a86139489810.json b/data/hfopenllm_v2/LeroyDyer/SpydazWeb_HumanAI_M1/803c3898-c1a6-4832-ac3a-a86139489810.json deleted file mode 100644 index 0bd353a92..000000000 --- a/data/hfopenllm_v2/LeroyDyer/SpydazWeb_HumanAI_M1/803c3898-c1a6-4832-ac3a-a86139489810.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/LeroyDyer_SpydazWeb_HumanAI_M1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SpydazWeb_HumanAI_M1", - "id": "LeroyDyer/SpydazWeb_HumanAI_M1", - "developer": "LeroyDyer", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3582 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { 
- "score": 0.3563 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0249 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2676 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3671 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1663 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/LeroyDyer/SpydazWeb_HumanAI_M2/bfaa3d3e-66fd-4477-85af-4b83f13ff05b.json b/data/hfopenllm_v2/LeroyDyer/SpydazWeb_HumanAI_M2/bfaa3d3e-66fd-4477-85af-4b83f13ff05b.json deleted file mode 100644 index 3df076af2..000000000 --- a/data/hfopenllm_v2/LeroyDyer/SpydazWeb_HumanAI_M2/bfaa3d3e-66fd-4477-85af-4b83f13ff05b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/LeroyDyer_SpydazWeb_HumanAI_M2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SpydazWeb_HumanAI_M2", - "id": "LeroyDyer/SpydazWeb_HumanAI_M2", - "developer": "LeroyDyer", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.375 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3931 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - 
"evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0287 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2794 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3751 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.201 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/LeroyDyer/SpydazWeb_HumanAI_M3/99debdd2-1dea-4eb6-be5c-c144656cfe20.json b/data/hfopenllm_v2/LeroyDyer/SpydazWeb_HumanAI_M3/99debdd2-1dea-4eb6-be5c-c144656cfe20.json deleted file mode 100644 index 5211d3064..000000000 --- a/data/hfopenllm_v2/LeroyDyer/SpydazWeb_HumanAI_M3/99debdd2-1dea-4eb6-be5c-c144656cfe20.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/LeroyDyer_SpydazWeb_HumanAI_M3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SpydazWeb_HumanAI_M3", - "id": "LeroyDyer/SpydazWeb_HumanAI_M3", - "developer": "LeroyDyer", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1579 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3127 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0091 - } - }, - { - "evaluation_name": "GPQA", - 
"source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.271 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3914 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1149 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_12/ad67bb88-7f74-4eb4-b771-0b3b60be4416.json b/data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_12/ad67bb88-7f74-4eb4-b771-0b3b60be4416.json deleted file mode 100644 index 8fb0a66ae..000000000 --- a/data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_12/ad67bb88-7f74-4eb4-b771-0b3b60be4416.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/LeroyDyer__Spydaz_Web_AI_12/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "_Spydaz_Web_AI_12", - "id": "LeroyDyer/_Spydaz_Web_AI_12", - "developer": "LeroyDyer", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2765 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3163 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0136 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.2685 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3582 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1137 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_14/af2f579d-1e8a-47d8-8e44-a599bee83e37.json b/data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_14/af2f579d-1e8a-47d8-8e44-a599bee83e37.json deleted file mode 100644 index 6d3423046..000000000 --- a/data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_14/af2f579d-1e8a-47d8-8e44-a599bee83e37.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/LeroyDyer__Spydaz_Web_AI_14/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "_Spydaz_Web_AI_14", - "id": "LeroyDyer/_Spydaz_Web_AI_14", - "developer": "LeroyDyer", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1812 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2989 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0121 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2659 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3395 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1139 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_AGI_R1_001/763c840e-ea73-453e-8e54-5f4fd6fda9cd.json b/data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_AGI_R1_001/763c840e-ea73-453e-8e54-5f4fd6fda9cd.json deleted file mode 100644 index 5e467d10a..000000000 --- a/data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_AGI_R1_001/763c840e-ea73-453e-8e54-5f4fd6fda9cd.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/LeroyDyer__Spydaz_Web_AI_AGI_R1_001/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "_Spydaz_Web_AI_AGI_R1_001", - "id": "LeroyDyer/_Spydaz_Web_AI_AGI_R1_001", - "developer": "LeroyDyer", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4505 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4609 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0634 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2676 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4256 - } - }, - { - 
"evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2734 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_AGI_R1_002/4fb40ac4-a637-4b9a-b69d-ba551c0f0938.json b/data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_AGI_R1_002/4fb40ac4-a637-4b9a-b69d-ba551c0f0938.json deleted file mode 100644 index bedce984a..000000000 --- a/data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_AGI_R1_002/4fb40ac4-a637-4b9a-b69d-ba551c0f0938.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/LeroyDyer__Spydaz_Web_AI_AGI_R1_002/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "_Spydaz_Web_AI_AGI_R1_002", - "id": "LeroyDyer/_Spydaz_Web_AI_AGI_R1_002", - "developer": "LeroyDyer", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5307 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4683 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0582 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2685 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4255 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy 
on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2894 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_AGI_R1_MUSR/ffc4ef41-4a28-4816-be54-8ffd8e153073.json b/data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_AGI_R1_MUSR/ffc4ef41-4a28-4816-be54-8ffd8e153073.json deleted file mode 100644 index ccb933fa1..000000000 --- a/data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_AGI_R1_MUSR/ffc4ef41-4a28-4816-be54-8ffd8e153073.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/LeroyDyer__Spydaz_Web_AI_AGI_R1_MUSR/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "_Spydaz_Web_AI_AGI_R1_MUSR", - "id": "LeroyDyer/_Spydaz_Web_AI_AGI_R1_MUSR", - "developer": "LeroyDyer", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4786 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4672 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0604 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2844 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4869 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2828 - } - } - ] -} \ No newline at end of file diff 
--git a/data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_AGI_R1_MasterCoder/f75fe902-f1c7-4e6c-87d6-128688db8d94.json b/data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_AGI_R1_MasterCoder/f75fe902-f1c7-4e6c-87d6-128688db8d94.json deleted file mode 100644 index b1dc3496d..000000000 --- a/data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_AGI_R1_MasterCoder/f75fe902-f1c7-4e6c-87d6-128688db8d94.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/LeroyDyer__Spydaz_Web_AI_AGI_R1_MasterCoder/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "_Spydaz_Web_AI_AGI_R1_MasterCoder", - "id": "LeroyDyer/_Spydaz_Web_AI_AGI_R1_MasterCoder", - "developer": "LeroyDyer", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4143 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4689 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0612 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.276 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.472 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2719 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_AGI_R1_Math_001/dbd3098b-4532-441b-a81c-072c52579be6.json 
b/data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_AGI_R1_Math_001/dbd3098b-4532-441b-a81c-072c52579be6.json deleted file mode 100644 index 96985ae3e..000000000 --- a/data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_AGI_R1_Math_001/dbd3098b-4532-441b-a81c-072c52579be6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/LeroyDyer__Spydaz_Web_AI_AGI_R1_Math_001/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "_Spydaz_Web_AI_AGI_R1_Math_001", - "id": "LeroyDyer/_Spydaz_Web_AI_AGI_R1_Math_001", - "developer": "LeroyDyer", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4571 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4818 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0695 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2768 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4778 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2681 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_AGI_R1_Math_003/438e4aa3-5e02-446e-bd3a-07ef724d24ff.json b/data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_AGI_R1_Math_003/438e4aa3-5e02-446e-bd3a-07ef724d24ff.json deleted file mode 100644 index d2edae906..000000000 --- 
a/data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_AGI_R1_Math_003/438e4aa3-5e02-446e-bd3a-07ef724d24ff.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/LeroyDyer__Spydaz_Web_AI_AGI_R1_Math_003/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "_Spydaz_Web_AI_AGI_R1_Math_003", - "id": "LeroyDyer/_Spydaz_Web_AI_AGI_R1_Math_003", - "developer": "LeroyDyer", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.62 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4756 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0695 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.281 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4202 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2999 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_AGI_R1_Math_AdvancedStudent/027fdc55-61eb-416c-b6ad-4408912d151b.json b/data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_AGI_R1_Math_AdvancedStudent/027fdc55-61eb-416c-b6ad-4408912d151b.json deleted file mode 100644 index e3bc05a1b..000000000 --- a/data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_AGI_R1_Math_AdvancedStudent/027fdc55-61eb-416c-b6ad-4408912d151b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - 
"schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/LeroyDyer__Spydaz_Web_AI_AGI_R1_Math_AdvancedStudent/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "_Spydaz_Web_AI_AGI_R1_Math_AdvancedStudent", - "id": "LeroyDyer/_Spydaz_Web_AI_AGI_R1_Math_AdvancedStudent", - "developer": "LeroyDyer", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5951 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4927 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0544 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2919 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5198 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_AGI_R1_Math_Student/37a4895d-def5-494d-9b62-d8c97ba9350b.json b/data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_AGI_R1_Math_Student/37a4895d-def5-494d-9b62-d8c97ba9350b.json deleted file mode 100644 index d67104ff1..000000000 --- a/data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_AGI_R1_Math_Student/37a4895d-def5-494d-9b62-d8c97ba9350b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/LeroyDyer__Spydaz_Web_AI_AGI_R1_Math_Student/1770682486.623709", - 
"retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "_Spydaz_Web_AI_AGI_R1_Math_Student", - "id": "LeroyDyer/_Spydaz_Web_AI_AGI_R1_Math_Student", - "developer": "LeroyDyer", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5736 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4881 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0514 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2903 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5098 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2927 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_AGI_R1_Math_Teacher/0d53c27e-962c-428f-b540-35ab027883a8.json b/data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_AGI_R1_Math_Teacher/0d53c27e-962c-428f-b540-35ab027883a8.json deleted file mode 100644 index 173bb8257..000000000 --- a/data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_AGI_R1_Math_Teacher/0d53c27e-962c-428f-b540-35ab027883a8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/LeroyDyer__Spydaz_Web_AI_AGI_R1_Math_Teacher/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - 
"source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "_Spydaz_Web_AI_AGI_R1_Math_Teacher", - "id": "LeroyDyer/_Spydaz_Web_AI_AGI_R1_Math_Teacher", - "developer": "LeroyDyer", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5772 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4805 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0544 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2861 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5222 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2956 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_AGI_R1_OmG_001/6f7b2d91-24d6-442c-93a5-9afc88e9a308.json b/data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_AGI_R1_OmG_001/6f7b2d91-24d6-442c-93a5-9afc88e9a308.json deleted file mode 100644 index a4540e902..000000000 --- a/data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_AGI_R1_OmG_001/6f7b2d91-24d6-442c-93a5-9afc88e9a308.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/LeroyDyer__Spydaz_Web_AI_AGI_R1_OmG_001/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "_Spydaz_Web_AI_AGI_R1_OmG_001", - "id": 
"LeroyDyer/_Spydaz_Web_AI_AGI_R1_OmG_001", - "developer": "LeroyDyer", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5818 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4908 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0506 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3003 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4486 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2906 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_AGI_R1_OmG_002/21793520-7d1a-4040-bb96-fa7fe98ae580.json b/data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_AGI_R1_OmG_002/21793520-7d1a-4040-bb96-fa7fe98ae580.json deleted file mode 100644 index c42060e34..000000000 --- a/data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_AGI_R1_OmG_002/21793520-7d1a-4040-bb96-fa7fe98ae580.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/LeroyDyer__Spydaz_Web_AI_AGI_R1_OmG_002/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "_Spydaz_Web_AI_AGI_R1_OmG_002", - "id": "LeroyDyer/_Spydaz_Web_AI_AGI_R1_OmG_002", - "developer": "LeroyDyer", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": 
"MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5462 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4655 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0498 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2785 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4511 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2867 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_AGI_R1_OmG_Coder/59d53c40-5b16-4a70-a693-5fb554cf7614.json b/data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_AGI_R1_OmG_Coder/59d53c40-5b16-4a70-a693-5fb554cf7614.json deleted file mode 100644 index 3df532609..000000000 --- a/data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_AGI_R1_OmG_Coder/59d53c40-5b16-4a70-a693-5fb554cf7614.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/LeroyDyer__Spydaz_Web_AI_AGI_R1_OmG_Coder/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "_Spydaz_Web_AI_AGI_R1_OmG_Coder", - "id": "LeroyDyer/_Spydaz_Web_AI_AGI_R1_OmG_Coder", - "developer": "LeroyDyer", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - 
"source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4924 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4638 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0544 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2735 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5625 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.289 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_AGI_R1_OmG_Math/b28a569c-6bdf-4547-a2ce-c3e224764be3.json b/data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_AGI_R1_OmG_Math/b28a569c-6bdf-4547-a2ce-c3e224764be3.json deleted file mode 100644 index 481f7a4c2..000000000 --- a/data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_AGI_R1_OmG_Math/b28a569c-6bdf-4547-a2ce-c3e224764be3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/LeroyDyer__Spydaz_Web_AI_AGI_R1_OmG_Math/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "_Spydaz_Web_AI_AGI_R1_OmG_Math", - "id": "LeroyDyer/_Spydaz_Web_AI_AGI_R1_OmG_Math", - "developer": "LeroyDyer", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5033 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4677 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0476 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2827 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4326 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2913 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_AGI_R1_OmG_MathMaster/2de129c8-2259-4367-a619-85d9e8f61e06.json b/data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_AGI_R1_OmG_MathMaster/2de129c8-2259-4367-a619-85d9e8f61e06.json deleted file mode 100644 index b5ee95324..000000000 --- a/data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_AGI_R1_OmG_MathMaster/2de129c8-2259-4367-a619-85d9e8f61e06.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/LeroyDyer__Spydaz_Web_AI_AGI_R1_OmG_MathMaster/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "_Spydaz_Web_AI_AGI_R1_OmG_MathMaster", - "id": "LeroyDyer/_Spydaz_Web_AI_AGI_R1_OmG_MathMaster", - "developer": "LeroyDyer", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5558 - } - }, - { - "evaluation_name": "BBH", - 
"source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4742 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0536 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2878 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.451 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2672 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_AGI_R1_Student_Coder/c242030f-fb2b-42dc-a5d1-687273b17282.json b/data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_AGI_R1_Student_Coder/c242030f-fb2b-42dc-a5d1-687273b17282.json deleted file mode 100644 index d5484f88e..000000000 --- a/data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_AGI_R1_Student_Coder/c242030f-fb2b-42dc-a5d1-687273b17282.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/LeroyDyer__Spydaz_Web_AI_AGI_R1_Student_Coder/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "_Spydaz_Web_AI_AGI_R1_Student_Coder", - "id": "LeroyDyer/_Spydaz_Web_AI_AGI_R1_Student_Coder", - "developer": "LeroyDyer", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.545 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4651 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0657 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2844 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4388 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2768 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_AGI_R1_Teacher_Coder/3b3fdb16-b6e1-40c8-9ac0-02f1f2207eb7.json b/data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_AGI_R1_Teacher_Coder/3b3fdb16-b6e1-40c8-9ac0-02f1f2207eb7.json deleted file mode 100644 index f0fe06eb1..000000000 --- a/data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_AGI_R1_Teacher_Coder/3b3fdb16-b6e1-40c8-9ac0-02f1f2207eb7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/LeroyDyer__Spydaz_Web_AI_AGI_R1_Teacher_Coder/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "_Spydaz_Web_AI_AGI_R1_Teacher_Coder", - "id": "LeroyDyer/_Spydaz_Web_AI_AGI_R1_Teacher_Coder", - "developer": "LeroyDyer", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5082 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.4797 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.065 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2911 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4338 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2845 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_AGI_R1_Top_Student/ef6e8e0d-7ba4-45ea-aaf7-617f68f2e97c.json b/data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_AGI_R1_Top_Student/ef6e8e0d-7ba4-45ea-aaf7-617f68f2e97c.json deleted file mode 100644 index bf973d06a..000000000 --- a/data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_AGI_R1_Top_Student/ef6e8e0d-7ba4-45ea-aaf7-617f68f2e97c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/LeroyDyer__Spydaz_Web_AI_AGI_R1_Top_Student/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "_Spydaz_Web_AI_AGI_R1_Top_Student", - "id": "LeroyDyer/_Spydaz_Web_AI_AGI_R1_Top_Student", - "developer": "LeroyDyer", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.604 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4988 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": 
"hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0725 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2727 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5398 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3024 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_AGI_R1_X1/f8c131a4-1fee-4694-8753-88853418ef4b.json b/data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_AGI_R1_X1/f8c131a4-1fee-4694-8753-88853418ef4b.json deleted file mode 100644 index 6d959d14f..000000000 --- a/data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_AGI_R1_X1/f8c131a4-1fee-4694-8753-88853418ef4b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/LeroyDyer__Spydaz_Web_AI_AGI_R1_X1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "_Spydaz_Web_AI_AGI_R1_X1", - "id": "LeroyDyer/_Spydaz_Web_AI_AGI_R1_X1", - "developer": "LeroyDyer", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4273 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4759 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0566 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2601 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4232 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2891 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_AGI_R1_X2/27dec9ff-fb18-43dd-949f-7c0587a5858f.json b/data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_AGI_R1_X2/27dec9ff-fb18-43dd-949f-7c0587a5858f.json deleted file mode 100644 index 0a0055521..000000000 --- a/data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_AGI_R1_X2/27dec9ff-fb18-43dd-949f-7c0587a5858f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/LeroyDyer__Spydaz_Web_AI_AGI_R1_X2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "_Spydaz_Web_AI_AGI_R1_X2", - "id": "LeroyDyer/_Spydaz_Web_AI_AGI_R1_X2", - "developer": "LeroyDyer", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5434 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4786 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0612 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": 
"Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2978 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4695 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2921 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_AGI_RP_R1/060df34d-ab67-43e1-bd56-ebaceb77abd3.json b/data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_AGI_RP_R1/060df34d-ab67-43e1-bd56-ebaceb77abd3.json deleted file mode 100644 index b431b930c..000000000 --- a/data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_AGI_RP_R1/060df34d-ab67-43e1-bd56-ebaceb77abd3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/LeroyDyer__Spydaz_Web_AI_AGI_RP_R1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "_Spydaz_Web_AI_AGI_RP_R1", - "id": "LeroyDyer/_Spydaz_Web_AI_AGI_RP_R1", - "developer": "LeroyDyer", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5426 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4701 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0604 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { 
- "score": 0.2693 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4201 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2894 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_BIBLE_002/a6357673-3daa-4593-8593-2b65a7d5477e.json b/data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_BIBLE_002/a6357673-3daa-4593-8593-2b65a7d5477e.json deleted file mode 100644 index a62a6bb3e..000000000 --- a/data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_BIBLE_002/a6357673-3daa-4593-8593-2b65a7d5477e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/LeroyDyer__Spydaz_Web_AI_BIBLE_002/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "_Spydaz_Web_AI_BIBLE_002", - "id": "LeroyDyer/_Spydaz_Web_AI_BIBLE_002", - "developer": "LeroyDyer", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2195 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3289 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0174 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2844 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3407 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1368 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_ChatML_002/121d4877-1955-48db-a23a-6b0ad0623b9e.json b/data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_ChatML_002/121d4877-1955-48db-a23a-6b0ad0623b9e.json deleted file mode 100644 index d97ec510d..000000000 --- a/data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_ChatML_002/121d4877-1955-48db-a23a-6b0ad0623b9e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/LeroyDyer__Spydaz_Web_AI_ChatML_002/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "_Spydaz_Web_AI_ChatML_002", - "id": "LeroyDyer/_Spydaz_Web_AI_ChatML_002", - "developer": "LeroyDyer", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2412 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3106 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0113 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2576 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3623 - } - }, - { - 
"evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1095 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_ChatQA/1f1eab02-219e-4ad8-af50-e103541e1c9d.json b/data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_ChatQA/1f1eab02-219e-4ad8-af50-e103541e1c9d.json deleted file mode 100644 index 21baf209e..000000000 --- a/data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_ChatQA/1f1eab02-219e-4ad8-af50-e103541e1c9d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/LeroyDyer__Spydaz_Web_AI_ChatQA/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "_Spydaz_Web_AI_ChatQA", - "id": "LeroyDyer/_Spydaz_Web_AI_ChatQA", - "developer": "LeroyDyer", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1415 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3236 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0098 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2659 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3447 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1475 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_ChatQA_003/b4cccfb3-1c17-48a3-a211-a26c44de757f.json b/data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_ChatQA_003/b4cccfb3-1c17-48a3-a211-a26c44de757f.json deleted file mode 100644 index 5a698b13e..000000000 --- a/data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_ChatQA_003/b4cccfb3-1c17-48a3-a211-a26c44de757f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/LeroyDyer__Spydaz_Web_AI_ChatQA_003/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "_Spydaz_Web_AI_ChatQA_003", - "id": "LeroyDyer/_Spydaz_Web_AI_ChatQA_003", - "developer": "LeroyDyer", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2209 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3172 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0106 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.271 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3818 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1133 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_TEMP_/05e97a86-681d-42a2-8a47-beade25d8fc9.json b/data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_TEMP_/05e97a86-681d-42a2-8a47-beade25d8fc9.json deleted file mode 100644 index 09c825d37..000000000 --- a/data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_TEMP_/05e97a86-681d-42a2-8a47-beade25d8fc9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/LeroyDyer__Spydaz_Web_AI_TEMP_/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "_Spydaz_Web_AI_TEMP_", - "id": "LeroyDyer/_Spydaz_Web_AI_TEMP_", - "developer": "LeroyDyer", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4795 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4957 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1239 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2794 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4218 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3121 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_Top_Teacher_/6c0899b4-f066-45f6-827d-11c535ef0634.json b/data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_Top_Teacher_/6c0899b4-f066-45f6-827d-11c535ef0634.json deleted file mode 100644 index 
c36cff39f..000000000 --- a/data/hfopenllm_v2/LeroyDyer/_Spydaz_Web_AI_Top_Teacher_/6c0899b4-f066-45f6-827d-11c535ef0634.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/LeroyDyer__Spydaz_Web_AI_Top_Teacher_/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "_Spydaz_Web_AI_Top_Teacher_", - "id": "LeroyDyer/_Spydaz_Web_AI_Top_Teacher_", - "developer": "LeroyDyer", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4404 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4891 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1156 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2777 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4366 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.315 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/LightningRodLabs/Flashlight-v1.0/f9660557-b9f6-4ecc-b260-c245f0e62b5b.json b/data/hfopenllm_v2/LightningRodLabs/Flashlight-v1.0/f9660557-b9f6-4ecc-b260-c245f0e62b5b.json deleted file mode 100644 index b9c99ac4d..000000000 --- a/data/hfopenllm_v2/LightningRodLabs/Flashlight-v1.0/f9660557-b9f6-4ecc-b260-c245f0e62b5b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/LightningRodLabs_Flashlight-v1.0/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Flashlight-v1.0", - "id": "LightningRodLabs/Flashlight-v1.0", - "developer": "LightningRodLabs", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6745 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6877 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.497 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3423 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4101 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5402 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/LightningRodLabs/Flashlight-v1.1/89168032-5840-4c2c-821e-b3d717ade46f.json b/data/hfopenllm_v2/LightningRodLabs/Flashlight-v1.1/89168032-5840-4c2c-821e-b3d717ade46f.json deleted file mode 100644 index 4c4c25448..000000000 --- a/data/hfopenllm_v2/LightningRodLabs/Flashlight-v1.1/89168032-5840-4c2c-821e-b3d717ade46f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/LightningRodLabs_Flashlight-v1.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - 
"source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Flashlight-v1.1", - "id": "LightningRodLabs/Flashlight-v1.1", - "developer": "LightningRodLabs", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Phi3ForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6721 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6901 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5325 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3398 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4048 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5416 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/LightningRodLabs/Flashlight-v1.2/10d0aa63-67d9-4dba-9bdc-db7ab3b4547d.json b/data/hfopenllm_v2/LightningRodLabs/Flashlight-v1.2/10d0aa63-67d9-4dba-9bdc-db7ab3b4547d.json deleted file mode 100644 index 5dd6485a7..000000000 --- a/data/hfopenllm_v2/LightningRodLabs/Flashlight-v1.2/10d0aa63-67d9-4dba-9bdc-db7ab3b4547d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/LightningRodLabs_Flashlight-v1.2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Flashlight-v1.2", - "id": "LightningRodLabs/Flashlight-v1.2", - "developer": "LightningRodLabs", - 
"inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.436 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3265 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1556 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2357 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4554 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2485 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Lil-R/2_PRYMMAL-ECE-2B-SLERP-V1/6f66ae5b-8cb6-4263-98a4-4a1eddfaca10.json b/data/hfopenllm_v2/Lil-R/2_PRYMMAL-ECE-2B-SLERP-V1/6f66ae5b-8cb6-4263-98a4-4a1eddfaca10.json deleted file mode 100644 index 9f4103403..000000000 --- a/data/hfopenllm_v2/Lil-R/2_PRYMMAL-ECE-2B-SLERP-V1/6f66ae5b-8cb6-4263-98a4-4a1eddfaca10.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Lil-R_2_PRYMMAL-ECE-2B-SLERP-V1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "2_PRYMMAL-ECE-2B-SLERP-V1", - "id": "Lil-R/2_PRYMMAL-ECE-2B-SLERP-V1", - "developer": "Lil-R", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 2.614 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - 
"source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5823 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4287 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0914 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3062 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4375 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2678 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Lil-R/2_PRYMMAL-ECE-2B-SLERP-V2/5e715199-7030-47b4-89c6-83ba0968c07c.json b/data/hfopenllm_v2/Lil-R/2_PRYMMAL-ECE-2B-SLERP-V2/5e715199-7030-47b4-89c6-83ba0968c07c.json deleted file mode 100644 index 343a94755..000000000 --- a/data/hfopenllm_v2/Lil-R/2_PRYMMAL-ECE-2B-SLERP-V2/5e715199-7030-47b4-89c6-83ba0968c07c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Lil-R_2_PRYMMAL-ECE-2B-SLERP-V2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "2_PRYMMAL-ECE-2B-SLERP-V2", - "id": "Lil-R/2_PRYMMAL-ECE-2B-SLERP-V2", - "developer": "Lil-R", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 2.614 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5543 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4376 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0944 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2978 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4482 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2744 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Lil-R/2_PRYMMAL-ECE-7B-SLERP-V1/3fca39e8-443d-47da-a858-83a68c18eec9.json b/data/hfopenllm_v2/Lil-R/2_PRYMMAL-ECE-7B-SLERP-V1/3fca39e8-443d-47da-a858-83a68c18eec9.json deleted file mode 100644 index e5d8c93e6..000000000 --- a/data/hfopenllm_v2/Lil-R/2_PRYMMAL-ECE-7B-SLERP-V1/3fca39e8-443d-47da-a858-83a68c18eec9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Lil-R_2_PRYMMAL-ECE-7B-SLERP-V1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "2_PRYMMAL-ECE-7B-SLERP-V1", - "id": "Lil-R/2_PRYMMAL-ECE-7B-SLERP-V1", - "developer": "Lil-R", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1073 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": 
"SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3053 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0008 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2508 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3911 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1124 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Lil-R/2_PRYMMAL-ECE-7B-SLERP-V2/b7518bd2-d3af-49e6-823a-f8d507e8e60f.json b/data/hfopenllm_v2/Lil-R/2_PRYMMAL-ECE-7B-SLERP-V2/b7518bd2-d3af-49e6-823a-f8d507e8e60f.json deleted file mode 100644 index 6709db764..000000000 --- a/data/hfopenllm_v2/Lil-R/2_PRYMMAL-ECE-7B-SLERP-V2/b7518bd2-d3af-49e6-823a-f8d507e8e60f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Lil-R_2_PRYMMAL-ECE-7B-SLERP-V2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "2_PRYMMAL-ECE-7B-SLERP-V2", - "id": "Lil-R/2_PRYMMAL-ECE-7B-SLERP-V2", - "developer": "Lil-R", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1073 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3053 - } 
- }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0008 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2508 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3911 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1124 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Lil-R/2_PRYMMAL-ECE-7B-SLERP-V3/fa399f16-1652-430c-be19-afaf5ab96be1.json b/data/hfopenllm_v2/Lil-R/2_PRYMMAL-ECE-7B-SLERP-V3/fa399f16-1652-430c-be19-afaf5ab96be1.json deleted file mode 100644 index 04700dc51..000000000 --- a/data/hfopenllm_v2/Lil-R/2_PRYMMAL-ECE-7B-SLERP-V3/fa399f16-1652-430c-be19-afaf5ab96be1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Lil-R_2_PRYMMAL-ECE-7B-SLERP-V3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "2_PRYMMAL-ECE-7B-SLERP-V3", - "id": "Lil-R/2_PRYMMAL-ECE-7B-SLERP-V3", - "developer": "Lil-R", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2235 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3578 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": 
"Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.006 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2567 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4107 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1817 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Lil-R/2_PRYMMAL-ECE-7B-SLERP/cbe5032b-122c-4a0b-a099-50e998a4bc77.json b/data/hfopenllm_v2/Lil-R/2_PRYMMAL-ECE-7B-SLERP/cbe5032b-122c-4a0b-a099-50e998a4bc77.json deleted file mode 100644 index ad0887f0c..000000000 --- a/data/hfopenllm_v2/Lil-R/2_PRYMMAL-ECE-7B-SLERP/cbe5032b-122c-4a0b-a099-50e998a4bc77.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Lil-R_2_PRYMMAL-ECE-7B-SLERP/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "2_PRYMMAL-ECE-7B-SLERP", - "id": "Lil-R/2_PRYMMAL-ECE-7B-SLERP", - "developer": "Lil-R", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5577 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5557 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3633 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": 
"GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3104 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4396 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4507 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Lil-R/PRYMMAL-ECE-1B-SLERP-V1/fd8c3209-dcc0-4d27-a3aa-d0f76ef86f8d.json b/data/hfopenllm_v2/Lil-R/PRYMMAL-ECE-1B-SLERP-V1/fd8c3209-dcc0-4d27-a3aa-d0f76ef86f8d.json deleted file mode 100644 index 82de476ba..000000000 --- a/data/hfopenllm_v2/Lil-R/PRYMMAL-ECE-1B-SLERP-V1/fd8c3209-dcc0-4d27-a3aa-d0f76ef86f8d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Lil-R_PRYMMAL-ECE-1B-SLERP-V1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "PRYMMAL-ECE-1B-SLERP-V1", - "id": "Lil-R/PRYMMAL-ECE-1B-SLERP-V1", - "developer": "Lil-R", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.777 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2874 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.419 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1035 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.276 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3974 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2926 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Lil-R/PRYMMAL-ECE-7B-SLERP-V8/1a18d49c-ad7b-4823-abbc-7191e9d659cd.json b/data/hfopenllm_v2/Lil-R/PRYMMAL-ECE-7B-SLERP-V8/1a18d49c-ad7b-4823-abbc-7191e9d659cd.json deleted file mode 100644 index 1bab75b49..000000000 --- a/data/hfopenllm_v2/Lil-R/PRYMMAL-ECE-7B-SLERP-V8/1a18d49c-ad7b-4823-abbc-7191e9d659cd.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Lil-R_PRYMMAL-ECE-7B-SLERP-V8/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "PRYMMAL-ECE-7B-SLERP-V8", - "id": "Lil-R/PRYMMAL-ECE-7B-SLERP-V8", - "developer": "Lil-R", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1258 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2955 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0098 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.25 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on 
MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3631 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1128 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/LilRg/10PRYMMAL-3B-slerp/9e2c614e-1104-43a6-9e8f-b7851562e01a.json b/data/hfopenllm_v2/LilRg/10PRYMMAL-3B-slerp/9e2c614e-1104-43a6-9e8f-b7851562e01a.json deleted file mode 100644 index 3ab3feb1c..000000000 --- a/data/hfopenllm_v2/LilRg/10PRYMMAL-3B-slerp/9e2c614e-1104-43a6-9e8f-b7851562e01a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/LilRg_10PRYMMAL-3B-slerp/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "10PRYMMAL-3B-slerp", - "id": "LilRg/10PRYMMAL-3B-slerp", - "developer": "LilRg", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Phi3ForCausalLM", - "params_billions": 3.821 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1946 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.532 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1495 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3213 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4529 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - 
"hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3881 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/LilRg/ECE-1B-merge-PRYMMAL/7d4b83ab-9c9d-46e5-8cbf-b8afcf781230.json b/data/hfopenllm_v2/LilRg/ECE-1B-merge-PRYMMAL/7d4b83ab-9c9d-46e5-8cbf-b8afcf781230.json deleted file mode 100644 index 93bc8cf86..000000000 --- a/data/hfopenllm_v2/LilRg/ECE-1B-merge-PRYMMAL/7d4b83ab-9c9d-46e5-8cbf-b8afcf781230.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/LilRg_ECE-1B-merge-PRYMMAL/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ECE-1B-merge-PRYMMAL", - "id": "LilRg/ECE-1B-merge-PRYMMAL", - "developer": "LilRg", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.777 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2712 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4235 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1012 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.281 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3801 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2906 - } - } - ] -} 
\ No newline at end of file diff --git a/data/hfopenllm_v2/LilRg/ECE_Finetunning/a42b5d7e-be7f-4cde-aaf0-001e2cf05a44.json b/data/hfopenllm_v2/LilRg/ECE_Finetunning/a42b5d7e-be7f-4cde-aaf0-001e2cf05a44.json deleted file mode 100644 index fd422f5f3..000000000 --- a/data/hfopenllm_v2/LilRg/ECE_Finetunning/a42b5d7e-be7f-4cde-aaf0-001e2cf05a44.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/LilRg_ECE_Finetunning/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ECE_Finetunning", - "id": "LilRg/ECE_Finetunning", - "developer": "LilRg", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "?", - "params_billions": 16.061 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0445 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4732 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0453 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2827 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3839 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3191 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/LilRg/PRYMMAL-6B-slerp/21f6688c-be52-4352-9c95-d37c0a5f6c94.json b/data/hfopenllm_v2/LilRg/PRYMMAL-6B-slerp/21f6688c-be52-4352-9c95-d37c0a5f6c94.json deleted file mode 100644 index ecbfdcf37..000000000 --- 
a/data/hfopenllm_v2/LilRg/PRYMMAL-6B-slerp/21f6688c-be52-4352-9c95-d37c0a5f6c94.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/LilRg_PRYMMAL-6B-slerp/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "PRYMMAL-6B-slerp", - "id": "LilRg/PRYMMAL-6B-slerp", - "developer": "LilRg", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.293 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1153 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2868 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2458 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3698 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1108 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/LilRg/PRYMMAL-ECE-7B-SLERP-V3/e92ba586-7bee-4a9b-b388-e35efde3d36f.json b/data/hfopenllm_v2/LilRg/PRYMMAL-ECE-7B-SLERP-V3/e92ba586-7bee-4a9b-b388-e35efde3d36f.json deleted file mode 100644 index e89c0598f..000000000 --- a/data/hfopenllm_v2/LilRg/PRYMMAL-ECE-7B-SLERP-V3/e92ba586-7bee-4a9b-b388-e35efde3d36f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/LilRg_PRYMMAL-ECE-7B-SLERP-V3/1770682486.623709", - "retrieved_timestamp": 
"1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "PRYMMAL-ECE-7B-SLERP-V3", - "id": "LilRg/PRYMMAL-ECE-7B-SLERP-V3", - "developer": "LilRg", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1243 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2957 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0098 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2567 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3671 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1127 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/LilRg/PRYMMAL-ECE-7B-SLERP-V4/45ed0bb3-efbf-4a32-9735-d814aa08790a.json b/data/hfopenllm_v2/LilRg/PRYMMAL-ECE-7B-SLERP-V4/45ed0bb3-efbf-4a32-9735-d814aa08790a.json deleted file mode 100644 index caaf555af..000000000 --- a/data/hfopenllm_v2/LilRg/PRYMMAL-ECE-7B-SLERP-V4/45ed0bb3-efbf-4a32-9735-d814aa08790a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/LilRg_PRYMMAL-ECE-7B-SLERP-V4/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": 
"PRYMMAL-ECE-7B-SLERP-V4", - "id": "LilRg/PRYMMAL-ECE-7B-SLERP-V4", - "developer": "LilRg", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1249 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2957 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0098 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2567 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3671 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1127 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/LilRg/PRYMMAL-ECE-7B-SLERP-V5/eff28375-89a7-4970-9342-428b07d0c6f4.json b/data/hfopenllm_v2/LilRg/PRYMMAL-ECE-7B-SLERP-V5/eff28375-89a7-4970-9342-428b07d0c6f4.json deleted file mode 100644 index 17c5c7e82..000000000 --- a/data/hfopenllm_v2/LilRg/PRYMMAL-ECE-7B-SLERP-V5/eff28375-89a7-4970-9342-428b07d0c6f4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/LilRg_PRYMMAL-ECE-7B-SLERP-V5/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "PRYMMAL-ECE-7B-SLERP-V5", - "id": "LilRg/PRYMMAL-ECE-7B-SLERP-V5", - "developer": "LilRg", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - 
} - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1249 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2957 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0098 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2567 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3671 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1127 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/LilRg/PRYMMAL-ECE-7B-SLERP-V6/23877e30-b8fb-45ea-a803-47df757ea909.json b/data/hfopenllm_v2/LilRg/PRYMMAL-ECE-7B-SLERP-V6/23877e30-b8fb-45ea-a803-47df757ea909.json deleted file mode 100644 index ec37cc975..000000000 --- a/data/hfopenllm_v2/LilRg/PRYMMAL-ECE-7B-SLERP-V6/23877e30-b8fb-45ea-a803-47df757ea909.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/LilRg_PRYMMAL-ECE-7B-SLERP-V6/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "PRYMMAL-ECE-7B-SLERP-V6", - "id": "LilRg/PRYMMAL-ECE-7B-SLERP-V6", - "developer": "LilRg", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on 
IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1243 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2957 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0098 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2567 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3671 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1127 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/LilRg/PRYMMAL-ECE-7B-SLERP-V7/8bc25d04-9cc5-4551-a9c5-ce185c7ad974.json b/data/hfopenllm_v2/LilRg/PRYMMAL-ECE-7B-SLERP-V7/8bc25d04-9cc5-4551-a9c5-ce185c7ad974.json deleted file mode 100644 index 06e5be7ee..000000000 --- a/data/hfopenllm_v2/LilRg/PRYMMAL-ECE-7B-SLERP-V7/8bc25d04-9cc5-4551-a9c5-ce185c7ad974.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/LilRg_PRYMMAL-ECE-7B-SLERP-V7/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "PRYMMAL-ECE-7B-SLERP-V7", - "id": "LilRg/PRYMMAL-ECE-7B-SLERP-V7", - "developer": "LilRg", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1249 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - 
"source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2957 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0098 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2567 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3671 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1127 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/LilRg/PRYMMAL-slerp-Merge/d2d4b5a5-109d-4d26-a166-3d97b341584e.json b/data/hfopenllm_v2/LilRg/PRYMMAL-slerp-Merge/d2d4b5a5-109d-4d26-a166-3d97b341584e.json deleted file mode 100644 index 3ff38766a..000000000 --- a/data/hfopenllm_v2/LilRg/PRYMMAL-slerp-Merge/d2d4b5a5-109d-4d26-a166-3d97b341584e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/LilRg_PRYMMAL-slerp-Merge/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "PRYMMAL-slerp-Merge", - "id": "LilRg/PRYMMAL-slerp-Merge", - "developer": "LilRg", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Phi3ForCausalLM", - "params_billions": 3.821 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3044 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 
0.5364 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1616 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3205 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4635 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3863 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/LimYeri/CodeMind-Llama3-8B-unsloth_v2-merged/ac404d92-7a06-4758-ab1d-fcf840c2b995.json b/data/hfopenllm_v2/LimYeri/CodeMind-Llama3-8B-unsloth_v2-merged/ac404d92-7a06-4758-ab1d-fcf840c2b995.json deleted file mode 100644 index c0faabec4..000000000 --- a/data/hfopenllm_v2/LimYeri/CodeMind-Llama3-8B-unsloth_v2-merged/ac404d92-7a06-4758-ab1d-fcf840c2b995.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/LimYeri_CodeMind-Llama3-8B-unsloth_v2-merged/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "CodeMind-Llama3-8B-unsloth_v2-merged", - "id": "LimYeri/CodeMind-Llama3-8B-unsloth_v2-merged", - "developer": "LimYeri", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6946 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.486 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": 
"DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0665 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2651 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3316 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3506 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/LimYeri/CodeMind-Llama3-8B-unsloth_v3-merged/95ea7fbf-d3f2-4fc1-ba17-05549f6e4d25.json b/data/hfopenllm_v2/LimYeri/CodeMind-Llama3-8B-unsloth_v3-merged/95ea7fbf-d3f2-4fc1-ba17-05549f6e4d25.json deleted file mode 100644 index 535530ceb..000000000 --- a/data/hfopenllm_v2/LimYeri/CodeMind-Llama3-8B-unsloth_v3-merged/95ea7fbf-d3f2-4fc1-ba17-05549f6e4d25.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/LimYeri_CodeMind-Llama3-8B-unsloth_v3-merged/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "CodeMind-Llama3-8B-unsloth_v3-merged", - "id": "LimYeri/CodeMind-Llama3-8B-unsloth_v3-merged", - "developer": "LimYeri", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6763 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4908 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.068 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2584 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3356 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3496 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/LimYeri/CodeMind-Llama3-8B-unsloth_v4-one-DPO-merged/c101e272-24d2-44db-9b0f-2ed4d17cec41.json b/data/hfopenllm_v2/LimYeri/CodeMind-Llama3-8B-unsloth_v4-one-DPO-merged/c101e272-24d2-44db-9b0f-2ed4d17cec41.json deleted file mode 100644 index c1155f027..000000000 --- a/data/hfopenllm_v2/LimYeri/CodeMind-Llama3-8B-unsloth_v4-one-DPO-merged/c101e272-24d2-44db-9b0f-2ed4d17cec41.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/LimYeri_CodeMind-Llama3-8B-unsloth_v4-one-DPO-merged/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "CodeMind-Llama3-8B-unsloth_v4-one-DPO-merged", - "id": "LimYeri/CodeMind-Llama3-8B-unsloth_v4-one-DPO-merged", - "developer": "LimYeri", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6492 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4853 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.068 - 
} - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2685 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3608 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3354 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/LimYeri/CodeMind-Llama3-8B-unsloth_v4-one-merged/2cb789c7-dddf-42b2-8fdf-4cbd5132946c.json b/data/hfopenllm_v2/LimYeri/CodeMind-Llama3-8B-unsloth_v4-one-merged/2cb789c7-dddf-42b2-8fdf-4cbd5132946c.json deleted file mode 100644 index 14803d71b..000000000 --- a/data/hfopenllm_v2/LimYeri/CodeMind-Llama3-8B-unsloth_v4-one-merged/2cb789c7-dddf-42b2-8fdf-4cbd5132946c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/LimYeri_CodeMind-Llama3-8B-unsloth_v4-one-merged/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "CodeMind-Llama3-8B-unsloth_v4-one-merged", - "id": "LimYeri/CodeMind-Llama3-8B-unsloth_v4-one-merged", - "developer": "LimYeri", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3211 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4739 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0551 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": 
"Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3096 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4069 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3353 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/LimYeri/CodeMind-Llama3.1-8B-unsloth-merged/a414aefd-ce24-49a9-b431-0c6014ebfbd8.json b/data/hfopenllm_v2/LimYeri/CodeMind-Llama3.1-8B-unsloth-merged/a414aefd-ce24-49a9-b431-0c6014ebfbd8.json deleted file mode 100644 index 323a4d3be..000000000 --- a/data/hfopenllm_v2/LimYeri/CodeMind-Llama3.1-8B-unsloth-merged/a414aefd-ce24-49a9-b431-0c6014ebfbd8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/LimYeri_CodeMind-Llama3.1-8B-unsloth-merged/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "CodeMind-Llama3.1-8B-unsloth-merged", - "id": "LimYeri/CodeMind-Llama3.1-8B-unsloth-merged", - "developer": "LimYeri", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.649 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4695 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1088 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 
0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2643 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3752 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.334 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Locutusque/CollectiveLM-Falcon-3-7B/91fcb6a3-d351-48c8-87e8-e2a06642e925.json b/data/hfopenllm_v2/Locutusque/CollectiveLM-Falcon-3-7B/91fcb6a3-d351-48c8-87e8-e2a06642e925.json deleted file mode 100644 index 95e472469..000000000 --- a/data/hfopenllm_v2/Locutusque/CollectiveLM-Falcon-3-7B/91fcb6a3-d351-48c8-87e8-e2a06642e925.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Locutusque_CollectiveLM-Falcon-3-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "CollectiveLM-Falcon-3-7B", - "id": "Locutusque/CollectiveLM-Falcon-3-7B", - "developer": "Locutusque", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 7.456 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3918 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5105 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2183 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3255 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": 
"TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3887 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3599 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Locutusque/Hercules-6.0-Llama-3.1-8B/3cd90efa-ddf0-43c4-884c-84337ded14b2.json b/data/hfopenllm_v2/Locutusque/Hercules-6.0-Llama-3.1-8B/3cd90efa-ddf0-43c4-884c-84337ded14b2.json deleted file mode 100644 index 200257158..000000000 --- a/data/hfopenllm_v2/Locutusque/Hercules-6.0-Llama-3.1-8B/3cd90efa-ddf0-43c4-884c-84337ded14b2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Locutusque_Hercules-6.0-Llama-3.1-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Hercules-6.0-Llama-3.1-8B", - "id": "Locutusque/Hercules-6.0-Llama-3.1-8B", - "developer": "Locutusque", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.663 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4813 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1669 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2643 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.3621 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3615 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Locutusque/Hercules-6.1-Llama-3.1-8B/c66c21e9-a332-40f9-ae87-bdd78a25d753.json b/data/hfopenllm_v2/Locutusque/Hercules-6.1-Llama-3.1-8B/c66c21e9-a332-40f9-ae87-bdd78a25d753.json deleted file mode 100644 index c70788d79..000000000 --- a/data/hfopenllm_v2/Locutusque/Hercules-6.1-Llama-3.1-8B/c66c21e9-a332-40f9-ae87-bdd78a25d753.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Locutusque_Hercules-6.1-Llama-3.1-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Hercules-6.1-Llama-3.1-8B", - "id": "Locutusque/Hercules-6.1-Llama-3.1-8B", - "developer": "Locutusque", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6007 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4656 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.176 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2609 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3553 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3669 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Locutusque/Llama-3-NeuralHercules-5.0-8B/0b4def91-29df-45d9-8dd4-c4097ec47ba3.json b/data/hfopenllm_v2/Locutusque/Llama-3-NeuralHercules-5.0-8B/0b4def91-29df-45d9-8dd4-c4097ec47ba3.json deleted file mode 100644 index 289875aa0..000000000 --- a/data/hfopenllm_v2/Locutusque/Llama-3-NeuralHercules-5.0-8B/0b4def91-29df-45d9-8dd4-c4097ec47ba3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Locutusque_Llama-3-NeuralHercules-5.0-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-NeuralHercules-5.0-8B", - "id": "Locutusque/Llama-3-NeuralHercules-5.0-8B", - "developer": "Locutusque", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4489 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.394 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0431 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2685 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3881 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.2933 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Locutusque/Llama-3-Yggdrasil-2.0-8B/2cbf258c-369e-4b1c-863f-43cf97c3a7a4.json b/data/hfopenllm_v2/Locutusque/Llama-3-Yggdrasil-2.0-8B/2cbf258c-369e-4b1c-863f-43cf97c3a7a4.json deleted file mode 100644 index 04066b41d..000000000 --- a/data/hfopenllm_v2/Locutusque/Llama-3-Yggdrasil-2.0-8B/2cbf258c-369e-4b1c-863f-43cf97c3a7a4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Locutusque_Llama-3-Yggdrasil-2.0-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-Yggdrasil-2.0-8B", - "id": "Locutusque/Llama-3-Yggdrasil-2.0-8B", - "developer": "Locutusque", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5371 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4772 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0831 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2626 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3977 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3167 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Locutusque/TinyMistral-248M-v2.5/8372889e-f9cd-4cf7-aec0-8e18d5c627e3.json 
b/data/hfopenllm_v2/Locutusque/TinyMistral-248M-v2.5/8372889e-f9cd-4cf7-aec0-8e18d5c627e3.json deleted file mode 100644 index f44380bf2..000000000 --- a/data/hfopenllm_v2/Locutusque/TinyMistral-248M-v2.5/8372889e-f9cd-4cf7-aec0-8e18d5c627e3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Locutusque_TinyMistral-248M-v2.5/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "TinyMistral-248M-v2.5", - "id": "Locutusque/TinyMistral-248M-v2.5", - "developer": "Locutusque", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 0.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1336 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3039 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0098 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2508 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3782 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1135 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Luni/StarDust-12b-v1/ce4cc270-57da-4d08-9130-62508b409cb2.json b/data/hfopenllm_v2/Luni/StarDust-12b-v1/ce4cc270-57da-4d08-9130-62508b409cb2.json deleted file mode 100644 index cba3a4a90..000000000 --- a/data/hfopenllm_v2/Luni/StarDust-12b-v1/ce4cc270-57da-4d08-9130-62508b409cb2.json +++ /dev/null @@ 
-1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Luni_StarDust-12b-v1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "StarDust-12b-v1", - "id": "Luni/StarDust-12b-v1", - "developer": "Luni", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5459 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5366 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0763 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.276 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4324 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3412 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Luni/StarDust-12b-v2/4cfedb8f-0e47-4008-9bc5-fb15e4afa607.json b/data/hfopenllm_v2/Luni/StarDust-12b-v2/4cfedb8f-0e47-4008-9bc5-fb15e4afa607.json deleted file mode 100644 index b3c1d3a2d..000000000 --- a/data/hfopenllm_v2/Luni/StarDust-12b-v2/4cfedb8f-0e47-4008-9bc5-fb15e4afa607.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Luni_StarDust-12b-v2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging 
Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "StarDust-12b-v2", - "id": "Luni/StarDust-12b-v2", - "developer": "Luni", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5629 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5419 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0687 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2936 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4338 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3439 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v3/de3c949d-bab5-4430-bdd1-48e1b7860934.json b/data/hfopenllm_v2/Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v3/de3c949d-bab5-4430-bdd1-48e1b7860934.json deleted file mode 100644 index a9ed180e4..000000000 --- a/data/hfopenllm_v2/Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v3/de3c949d-bab5-4430-bdd1-48e1b7860934.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Lunzima_NQLSG-Qwen2.5-14B-MegaFusion-v3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "NQLSG-Qwen2.5-14B-MegaFusion-v3", - "id": "Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v3", - "developer": "Lunzima", - "inference_platform": 
"unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7049 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6478 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4162 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3817 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4808 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5394 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v4/011e53cd-409f-479b-9c3d-bfce75a1277b.json b/data/hfopenllm_v2/Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v4/011e53cd-409f-479b-9c3d-bfce75a1277b.json deleted file mode 100644 index 77effc1ca..000000000 --- a/data/hfopenllm_v2/Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v4/011e53cd-409f-479b-9c3d-bfce75a1277b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Lunzima_NQLSG-Qwen2.5-14B-MegaFusion-v4/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "NQLSG-Qwen2.5-14B-MegaFusion-v4", - "id": "Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v4", - "developer": "Lunzima", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - 
"evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6943 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.642 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3467 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3716 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4769 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5252 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v5/1ff40e45-5be4-4625-9f66-5599a829903d.json b/data/hfopenllm_v2/Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v5/1ff40e45-5be4-4625-9f66-5599a829903d.json deleted file mode 100644 index 5eaad0555..000000000 --- a/data/hfopenllm_v2/Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v5/1ff40e45-5be4-4625-9f66-5599a829903d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Lunzima_NQLSG-Qwen2.5-14B-MegaFusion-v5/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "NQLSG-Qwen2.5-14B-MegaFusion-v5", - "id": "Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v5", - "developer": "Lunzima", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7485 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6467 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4358 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3624 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4473 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.514 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v6-cpt/fed97d94-2949-4383-8f25-fa79bd413508.json b/data/hfopenllm_v2/Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v6-cpt/fed97d94-2949-4383-8f25-fa79bd413508.json deleted file mode 100644 index 264dd2b3d..000000000 --- a/data/hfopenllm_v2/Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v6-cpt/fed97d94-2949-4383-8f25-fa79bd413508.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Lunzima_NQLSG-Qwen2.5-14B-MegaFusion-v6-cpt/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "NQLSG-Qwen2.5-14B-MegaFusion-v6-cpt", - "id": "Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v6-cpt", - "developer": "Lunzima", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.4663 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6215 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3316 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3758 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4937 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5204 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v6/f4820bc8-7dfd-4439-af95-21b6cc9367ac.json b/data/hfopenllm_v2/Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v6/f4820bc8-7dfd-4439-af95-21b6cc9367ac.json deleted file mode 100644 index 37d866ec4..000000000 --- a/data/hfopenllm_v2/Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v6/f4820bc8-7dfd-4439-af95-21b6cc9367ac.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Lunzima_NQLSG-Qwen2.5-14B-MegaFusion-v6/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "NQLSG-Qwen2.5-14B-MegaFusion-v6", - "id": "Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v6", - "developer": "Lunzima", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7043 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - 
}, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6458 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3958 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3775 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4768 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5392 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v7-rebase/36e576bb-de50-49ec-a91f-f134c11bbe38.json b/data/hfopenllm_v2/Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v7-rebase/36e576bb-de50-49ec-a91f-f134c11bbe38.json deleted file mode 100644 index fbc0a8b94..000000000 --- a/data/hfopenllm_v2/Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v7-rebase/36e576bb-de50-49ec-a91f-f134c11bbe38.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Lunzima_NQLSG-Qwen2.5-14B-MegaFusion-v7-rebase/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "NQLSG-Qwen2.5-14B-MegaFusion-v7-rebase", - "id": "Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v7-rebase", - "developer": "Lunzima", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6931 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 
0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6423 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3406 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.375 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4888 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5277 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v7/0edd388b-7a1b-4334-9b72-52d84653ff67.json b/data/hfopenllm_v2/Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v7/0edd388b-7a1b-4334-9b72-52d84653ff67.json deleted file mode 100644 index a93457aeb..000000000 --- a/data/hfopenllm_v2/Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v7/0edd388b-7a1b-4334-9b72-52d84653ff67.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Lunzima_NQLSG-Qwen2.5-14B-MegaFusion-v7/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "NQLSG-Qwen2.5-14B-MegaFusion-v7", - "id": "Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v7", - "developer": "Lunzima", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6794 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6531 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": 
"hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4101 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3792 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4834 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5376 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v8.5/b3199674-328e-41a0-9aa4-bf39aec735bc.json b/data/hfopenllm_v2/Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v8.5/b3199674-328e-41a0-9aa4-bf39aec735bc.json deleted file mode 100644 index 948f8741b..000000000 --- a/data/hfopenllm_v2/Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v8.5/b3199674-328e-41a0-9aa4-bf39aec735bc.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Lunzima_NQLSG-Qwen2.5-14B-MegaFusion-v8.5/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "NQLSG-Qwen2.5-14B-MegaFusion-v8.5", - "id": "Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v8.5", - "developer": "Lunzima", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5929 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6451 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": 
false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3656 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.38 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.477 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.529 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v8.6/52db4d79-7040-4525-934e-0f33e4acec63.json b/data/hfopenllm_v2/Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v8.6/52db4d79-7040-4525-934e-0f33e4acec63.json deleted file mode 100644 index 0b2f0fd81..000000000 --- a/data/hfopenllm_v2/Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v8.6/52db4d79-7040-4525-934e-0f33e4acec63.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Lunzima_NQLSG-Qwen2.5-14B-MegaFusion-v8.6/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "NQLSG-Qwen2.5-14B-MegaFusion-v8.6", - "id": "Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v8.6", - "developer": "Lunzima", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5919 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6457 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4071 - } - }, - { - "evaluation_name": "GPQA", - "source_data": 
{ - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3842 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4953 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.54 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v8.7/ee34821e-9182-433f-a8b0-745711e23738.json b/data/hfopenllm_v2/Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v8.7/ee34821e-9182-433f-a8b0-745711e23738.json deleted file mode 100644 index 5a366fbb8..000000000 --- a/data/hfopenllm_v2/Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v8.7/ee34821e-9182-433f-a8b0-745711e23738.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Lunzima_NQLSG-Qwen2.5-14B-MegaFusion-v8.7/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "NQLSG-Qwen2.5-14B-MegaFusion-v8.7", - "id": "Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v8.7", - "developer": "Lunzima", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7875 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6483 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5408 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3515 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4381 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5242 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v8.8/10ef0990-5356-432f-b24c-dd107188ec5f.json b/data/hfopenllm_v2/Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v8.8/10ef0990-5356-432f-b24c-dd107188ec5f.json deleted file mode 100644 index be4b45ebc..000000000 --- a/data/hfopenllm_v2/Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v8.8/10ef0990-5356-432f-b24c-dd107188ec5f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Lunzima_NQLSG-Qwen2.5-14B-MegaFusion-v8.8/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "NQLSG-Qwen2.5-14B-MegaFusion-v8.8", - "id": "Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v8.8", - "developer": "Lunzima", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7028 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6566 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4237 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3758 - } - }, - { - "evaluation_name": 
"MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4912 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5323 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v8.9/47de680d-33b1-4441-92da-4b97a5fc513f.json b/data/hfopenllm_v2/Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v8.9/47de680d-33b1-4441-92da-4b97a5fc513f.json deleted file mode 100644 index f10ff96d2..000000000 --- a/data/hfopenllm_v2/Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v8.9/47de680d-33b1-4441-92da-4b97a5fc513f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Lunzima_NQLSG-Qwen2.5-14B-MegaFusion-v8.9/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "NQLSG-Qwen2.5-14B-MegaFusion-v8.9", - "id": "Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v8.9", - "developer": "Lunzima", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7993 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6483 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.537 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3297 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": 
"Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4328 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5199 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v8/96ac0351-2ade-4d76-bcf9-bc0f633f8694.json b/data/hfopenllm_v2/Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v8/96ac0351-2ade-4d76-bcf9-bc0f633f8694.json deleted file mode 100644 index e0b9d6239..000000000 --- a/data/hfopenllm_v2/Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v8/96ac0351-2ade-4d76-bcf9-bc0f633f8694.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Lunzima_NQLSG-Qwen2.5-14B-MegaFusion-v8/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "NQLSG-Qwen2.5-14B-MegaFusion-v8", - "id": "Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v8", - "developer": "Lunzima", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7875 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6419 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5559 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3356 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4394 - } - }, - { - 
"evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5206 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v9-stock/31aae266-c14b-451f-8bab-62ee7d5d382e.json b/data/hfopenllm_v2/Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v9-stock/31aae266-c14b-451f-8bab-62ee7d5d382e.json deleted file mode 100644 index 0e46ee73a..000000000 --- a/data/hfopenllm_v2/Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v9-stock/31aae266-c14b-451f-8bab-62ee7d5d382e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Lunzima_NQLSG-Qwen2.5-14B-MegaFusion-v9-stock/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "NQLSG-Qwen2.5-14B-MegaFusion-v9-stock", - "id": "Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v9-stock", - "developer": "Lunzima", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6514 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6571 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4184 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3842 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.482 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5412 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v9.1/f6edb102-e867-46d1-afdc-3c45166bd510.json b/data/hfopenllm_v2/Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v9.1/f6edb102-e867-46d1-afdc-3c45166bd510.json deleted file mode 100644 index 45123ecdc..000000000 --- a/data/hfopenllm_v2/Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v9.1/f6edb102-e867-46d1-afdc-3c45166bd510.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Lunzima_NQLSG-Qwen2.5-14B-MegaFusion-v9.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "NQLSG-Qwen2.5-14B-MegaFusion-v9.1", - "id": "Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v9.1", - "developer": "Lunzima", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8003 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6555 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5468 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3431 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4354 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.5251 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v9.2/8b7756cc-9af3-4f98-84ac-7fef4c1bdaa0.json b/data/hfopenllm_v2/Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v9.2/8b7756cc-9af3-4f98-84ac-7fef4c1bdaa0.json deleted file mode 100644 index 3c1854b85..000000000 --- a/data/hfopenllm_v2/Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v9.2/8b7756cc-9af3-4f98-84ac-7fef4c1bdaa0.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Lunzima_NQLSG-Qwen2.5-14B-MegaFusion-v9.2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "NQLSG-Qwen2.5-14B-MegaFusion-v9.2", - "id": "Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v9.2", - "developer": "Lunzima", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7862 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6538 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5332 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3557 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4381 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5283 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v9/dcf33a22-5e57-4476-a2cb-ebd60407a920.json b/data/hfopenllm_v2/Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v9/dcf33a22-5e57-4476-a2cb-ebd60407a920.json deleted file mode 100644 index f8f18e3db..000000000 --- a/data/hfopenllm_v2/Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v9/dcf33a22-5e57-4476-a2cb-ebd60407a920.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Lunzima_NQLSG-Qwen2.5-14B-MegaFusion-v9/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "NQLSG-Qwen2.5-14B-MegaFusion-v9", - "id": "Lunzima/NQLSG-Qwen2.5-14B-MegaFusion-v9", - "developer": "Lunzima", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5235 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6546 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4366 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3884 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4806 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5422 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Lunzima/NQLSG-Qwen2.5-14B-OriginalFusion/15659480-be0b-41c8-a463-873be444b194.json 
b/data/hfopenllm_v2/Lunzima/NQLSG-Qwen2.5-14B-OriginalFusion/15659480-be0b-41c8-a463-873be444b194.json deleted file mode 100644 index 179cbfc2b..000000000 --- a/data/hfopenllm_v2/Lunzima/NQLSG-Qwen2.5-14B-OriginalFusion/15659480-be0b-41c8-a463-873be444b194.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Lunzima_NQLSG-Qwen2.5-14B-OriginalFusion/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "NQLSG-Qwen2.5-14B-OriginalFusion", - "id": "Lunzima/NQLSG-Qwen2.5-14B-OriginalFusion", - "developer": "Lunzima", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6142 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6592 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4275 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3809 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5122 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5239 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Lyte/Llama-3.1-8B-Instruct-Reasoner-1o1_v0.3/0444c1bf-a3d3-4d23-bc6c-0a98c4dc1e9d.json b/data/hfopenllm_v2/Lyte/Llama-3.1-8B-Instruct-Reasoner-1o1_v0.3/0444c1bf-a3d3-4d23-bc6c-0a98c4dc1e9d.json deleted file mode 100644 index 3b3f7cc5e..000000000 --- 
a/data/hfopenllm_v2/Lyte/Llama-3.1-8B-Instruct-Reasoner-1o1_v0.3/0444c1bf-a3d3-4d23-bc6c-0a98c4dc1e9d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Lyte_Llama-3.1-8B-Instruct-Reasoner-1o1_v0.3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.1-8B-Instruct-Reasoner-1o1_v0.3", - "id": "Lyte/Llama-3.1-8B-Instruct-Reasoner-1o1_v0.3", - "developer": "Lyte", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7098 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.495 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1903 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2701 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3461 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3618 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Lyte/Llama-3.2-1B-Instruct-COT-RL-Expriement1-EP04/93aa3a13-5069-410f-a1df-6944e0231e0e.json b/data/hfopenllm_v2/Lyte/Llama-3.2-1B-Instruct-COT-RL-Expriement1-EP04/93aa3a13-5069-410f-a1df-6944e0231e0e.json deleted file mode 100644 index 1d73f7a54..000000000 --- a/data/hfopenllm_v2/Lyte/Llama-3.2-1B-Instruct-COT-RL-Expriement1-EP04/93aa3a13-5069-410f-a1df-6944e0231e0e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - 
"schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Lyte_Llama-3.2-1B-Instruct-COT-RL-Expriement1-EP04/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.2-1B-Instruct-COT-RL-Expriement1-EP04", - "id": "Lyte/Llama-3.2-1B-Instruct-COT-RL-Expriement1-EP04", - "developer": "Lyte", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.236 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5774 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3515 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0801 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2601 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3236 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1843 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Lyte/Llama-3.2-3B-Overthinker/427ea7d0-c1f1-4cfe-b6a7-555262a7a317.json b/data/hfopenllm_v2/Lyte/Llama-3.2-3B-Overthinker/427ea7d0-c1f1-4cfe-b6a7-555262a7a317.json deleted file mode 100644 index 4e1649e92..000000000 --- a/data/hfopenllm_v2/Lyte/Llama-3.2-3B-Overthinker/427ea7d0-c1f1-4cfe-b6a7-555262a7a317.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Lyte_Llama-3.2-3B-Overthinker/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - 
"source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.2-3B-Overthinker", - "id": "Lyte/Llama-3.2-3B-Overthinker", - "developer": "Lyte", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6408 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.432 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1563 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2592 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3419 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2985 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/M4-ai/TinyMistral-248M-v3/c6dbe372-7a3c-487c-87c0-fb324c39f8c9.json b/data/hfopenllm_v2/M4-ai/TinyMistral-248M-v3/c6dbe372-7a3c-487c-87c0-fb324c39f8c9.json deleted file mode 100644 index cbe484e9b..000000000 --- a/data/hfopenllm_v2/M4-ai/TinyMistral-248M-v3/c6dbe372-7a3c-487c-87c0-fb324c39f8c9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/M4-ai_TinyMistral-248M-v3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "TinyMistral-248M-v3", - "id": "M4-ai/TinyMistral-248M-v3", - "developer": 
"M4-ai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 0.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1639 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2885 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0045 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2408 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3793 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1132 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/MEscriva/ECE-PRYMMAL-0.5B-FT-V5-MUSR-Mathis/cf8d99c8-8790-4bdf-bfc2-1a6d1fe35916.json b/data/hfopenllm_v2/MEscriva/ECE-PRYMMAL-0.5B-FT-V5-MUSR-Mathis/cf8d99c8-8790-4bdf-bfc2-1a6d1fe35916.json deleted file mode 100644 index 6ab96529f..000000000 --- a/data/hfopenllm_v2/MEscriva/ECE-PRYMMAL-0.5B-FT-V5-MUSR-Mathis/cf8d99c8-8790-4bdf-bfc2-1a6d1fe35916.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/MEscriva_ECE-PRYMMAL-0.5B-FT-V5-MUSR-Mathis/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ECE-PRYMMAL-0.5B-FT-V5-MUSR-Mathis", - "id": "MEscriva/ECE-PRYMMAL-0.5B-FT-V5-MUSR-Mathis", - "developer": "MEscriva", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "?", - "params_billions": 0.63 - } - }, - 
"evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0866 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3057 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0106 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2517 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4017 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1154 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/MLP-KTLim/llama-3-Korean-Bllossom-8B/5b5d42d7-8012-46f1-826f-32d839806048.json b/data/hfopenllm_v2/MLP-KTLim/llama-3-Korean-Bllossom-8B/5b5d42d7-8012-46f1-826f-32d839806048.json deleted file mode 100644 index a8760b3f6..000000000 --- a/data/hfopenllm_v2/MLP-KTLim/llama-3-Korean-Bllossom-8B/5b5d42d7-8012-46f1-826f-32d839806048.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/MLP-KTLim_llama-3-Korean-Bllossom-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "llama-3-Korean-Bllossom-8B", - "id": "MLP-KTLim/llama-3-Korean-Bllossom-8B", - "developer": "MLP-KTLim", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5113 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.49 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.102 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2626 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3675 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3594 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/MTSAIR/Cotype-Nano/5e1bf2cb-55c4-4806-89af-cb9953c7c1b1.json b/data/hfopenllm_v2/MTSAIR/Cotype-Nano/5e1bf2cb-55c4-4806-89af-cb9953c7c1b1.json deleted file mode 100644 index fb6c77edb..000000000 --- a/data/hfopenllm_v2/MTSAIR/Cotype-Nano/5e1bf2cb-55c4-4806-89af-cb9953c7c1b1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/MTSAIR_Cotype-Nano/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Cotype-Nano", - "id": "MTSAIR/Cotype-Nano", - "developer": "MTSAIR", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.544 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3748 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - 
"hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3865 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0974 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2701 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3289 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2477 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/MTSAIR/MultiVerse_70B/21ee4b33-9829-4cca-9603-c30fd4a1f7ff.json b/data/hfopenllm_v2/MTSAIR/MultiVerse_70B/21ee4b33-9829-4cca-9603-c30fd4a1f7ff.json deleted file mode 100644 index 4f7b2b0b1..000000000 --- a/data/hfopenllm_v2/MTSAIR/MultiVerse_70B/21ee4b33-9829-4cca-9603-c30fd4a1f7ff.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/MTSAIR_MultiVerse_70B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MultiVerse_70B", - "id": "MTSAIR/MultiVerse_70B", - "developer": "MTSAIR", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 72.289 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5249 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6183 - } - }, - { - "evaluation_name": "MATH Level 5", - 
"source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1926 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.354 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.474 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.486 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Magpie-Align/Llama-3-8B-Magpie-Align-SFT-v0.1/c6c14a8b-0e9f-4b97-b9f3-27c7250fb8f2.json b/data/hfopenllm_v2/Magpie-Align/Llama-3-8B-Magpie-Align-SFT-v0.1/c6c14a8b-0e9f-4b97-b9f3-27c7250fb8f2.json deleted file mode 100644 index 115db7b96..000000000 --- a/data/hfopenllm_v2/Magpie-Align/Llama-3-8B-Magpie-Align-SFT-v0.1/c6c14a8b-0e9f-4b97-b9f3-27c7250fb8f2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Magpie-Align_Llama-3-8B-Magpie-Align-SFT-v0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-8B-Magpie-Align-SFT-v0.1", - "id": "Magpie-Align/Llama-3-8B-Magpie-Align-SFT-v0.1", - "developer": "Magpie-Align", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4361 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4615 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": 
{ - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0574 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2626 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3277 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2863 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Magpie-Align/Llama-3-8B-Magpie-Align-SFT-v0.3/6586fa94-9f43-4814-8c8a-8ed244ac94e7.json b/data/hfopenllm_v2/Magpie-Align/Llama-3-8B-Magpie-Align-SFT-v0.3/6586fa94-9f43-4814-8c8a-8ed244ac94e7.json deleted file mode 100644 index 20e5b11c3..000000000 --- a/data/hfopenllm_v2/Magpie-Align/Llama-3-8B-Magpie-Align-SFT-v0.3/6586fa94-9f43-4814-8c8a-8ed244ac94e7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Magpie-Align_Llama-3-8B-Magpie-Align-SFT-v0.3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-8B-Magpie-Align-SFT-v0.3", - "id": "Magpie-Align/Llama-3-8B-Magpie-Align-SFT-v0.3", - "developer": "Magpie-Align", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5064 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4572 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.0733 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2659 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3424 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2902 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Magpie-Align/Llama-3-8B-Magpie-Align-v0.1/df7d7db2-867e-47f0-9abf-d71b79e97630.json b/data/hfopenllm_v2/Magpie-Align/Llama-3-8B-Magpie-Align-v0.1/df7d7db2-867e-47f0-9abf-d71b79e97630.json deleted file mode 100644 index bbdb79de2..000000000 --- a/data/hfopenllm_v2/Magpie-Align/Llama-3-8B-Magpie-Align-v0.1/df7d7db2-867e-47f0-9abf-d71b79e97630.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Magpie-Align_Llama-3-8B-Magpie-Align-v0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-8B-Magpie-Align-v0.1", - "id": "Magpie-Align/Llama-3-8B-Magpie-Align-v0.1", - "developer": "Magpie-Align", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4118 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4811 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.034 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": 
"hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2752 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3047 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3006 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Magpie-Align/Llama-3-8B-Magpie-Align-v0.1/e2502e7e-3a10-49f3-b5c6-b20496fed998.json b/data/hfopenllm_v2/Magpie-Align/Llama-3-8B-Magpie-Align-v0.1/e2502e7e-3a10-49f3-b5c6-b20496fed998.json deleted file mode 100644 index 5f345d139..000000000 --- a/data/hfopenllm_v2/Magpie-Align/Llama-3-8B-Magpie-Align-v0.1/e2502e7e-3a10-49f3-b5c6-b20496fed998.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Magpie-Align_Llama-3-8B-Magpie-Align-v0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-8B-Magpie-Align-v0.1", - "id": "Magpie-Align/Llama-3-8B-Magpie-Align-v0.1", - "developer": "Magpie-Align", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4027 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4789 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0461 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", 
- "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2768 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3087 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3001 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Magpie-Align/Llama-3-8B-Magpie-Align-v0.3/51cde18f-09b0-4b66-a962-811ee49e192f.json b/data/hfopenllm_v2/Magpie-Align/Llama-3-8B-Magpie-Align-v0.3/51cde18f-09b0-4b66-a962-811ee49e192f.json deleted file mode 100644 index f30e78827..000000000 --- a/data/hfopenllm_v2/Magpie-Align/Llama-3-8B-Magpie-Align-v0.3/51cde18f-09b0-4b66-a962-811ee49e192f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Magpie-Align_Llama-3-8B-Magpie-Align-v0.3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-8B-Magpie-Align-v0.3", - "id": "Magpie-Align/Llama-3-8B-Magpie-Align-v0.3", - "developer": "Magpie-Align", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4497 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.457 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0566 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2651 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - 
"source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3406 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3134 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Magpie-Align/Llama-3.1-8B-Magpie-Align-SFT-v0.1/4ea48b42-8026-4799-b35d-46757fd2753f.json b/data/hfopenllm_v2/Magpie-Align/Llama-3.1-8B-Magpie-Align-SFT-v0.1/4ea48b42-8026-4799-b35d-46757fd2753f.json deleted file mode 100644 index aeb4cabcb..000000000 --- a/data/hfopenllm_v2/Magpie-Align/Llama-3.1-8B-Magpie-Align-SFT-v0.1/4ea48b42-8026-4799-b35d-46757fd2753f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Magpie-Align_Llama-3.1-8B-Magpie-Align-SFT-v0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.1-8B-Magpie-Align-SFT-v0.1", - "id": "Magpie-Align/Llama-3.1-8B-Magpie-Align-SFT-v0.1", - "developer": "Magpie-Align", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4782 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4764 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0899 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2609 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3397 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2943 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Magpie-Align/Llama-3.1-8B-Magpie-Align-v0.1/52e9b4ae-9119-4f26-87e4-6532d1148ecd.json b/data/hfopenllm_v2/Magpie-Align/Llama-3.1-8B-Magpie-Align-v0.1/52e9b4ae-9119-4f26-87e4-6532d1148ecd.json deleted file mode 100644 index a67ee6b2e..000000000 --- a/data/hfopenllm_v2/Magpie-Align/Llama-3.1-8B-Magpie-Align-v0.1/52e9b4ae-9119-4f26-87e4-6532d1148ecd.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Magpie-Align_Llama-3.1-8B-Magpie-Align-v0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.1-8B-Magpie-Align-v0.1", - "id": "Magpie-Align/Llama-3.1-8B-Magpie-Align-v0.1", - "developer": "Magpie-Align", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4458 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4622 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0665 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2634 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3141 - } - }, - { - 
"evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3262 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Magpie-Align/MagpieLM-8B-Chat-v0.1/4bda68c0-cc09-4945-961b-48776b7b5fc8.json b/data/hfopenllm_v2/Magpie-Align/MagpieLM-8B-Chat-v0.1/4bda68c0-cc09-4945-961b-48776b7b5fc8.json deleted file mode 100644 index 590475654..000000000 --- a/data/hfopenllm_v2/Magpie-Align/MagpieLM-8B-Chat-v0.1/4bda68c0-cc09-4945-961b-48776b7b5fc8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Magpie-Align_MagpieLM-8B-Chat-v0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MagpieLM-8B-Chat-v0.1", - "id": "Magpie-Align/MagpieLM-8B-Chat-v0.1", - "developer": "Magpie-Align", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3701 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4172 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0612 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2617 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3501 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on 
MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3195 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Magpie-Align/MagpieLM-8B-SFT-v0.1/18ea0ad0-a216-4906-a96c-c8b040398dbd.json b/data/hfopenllm_v2/Magpie-Align/MagpieLM-8B-SFT-v0.1/18ea0ad0-a216-4906-a96c-c8b040398dbd.json deleted file mode 100644 index 3ffe20bca..000000000 --- a/data/hfopenllm_v2/Magpie-Align/MagpieLM-8B-SFT-v0.1/18ea0ad0-a216-4906-a96c-c8b040398dbd.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Magpie-Align_MagpieLM-8B-SFT-v0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MagpieLM-8B-SFT-v0.1", - "id": "Magpie-Align/MagpieLM-8B-SFT-v0.1", - "developer": "Magpie-Align", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4721 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4553 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0755 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2676 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3649 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.299 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/MagusCorp/grpo_lora_enem_llama3_7b/1e2321f6-93bd-4acf-9f5b-c82807a40233.json b/data/hfopenllm_v2/MagusCorp/grpo_lora_enem_llama3_7b/1e2321f6-93bd-4acf-9f5b-c82807a40233.json deleted file mode 100644 index 7178d1d47..000000000 --- a/data/hfopenllm_v2/MagusCorp/grpo_lora_enem_llama3_7b/1e2321f6-93bd-4acf-9f5b-c82807a40233.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/MagusCorp_grpo_lora_enem_llama3_7b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "grpo_lora_enem_llama3_7b", - "id": "MagusCorp/grpo_lora_enem_llama3_7b", - "developer": "MagusCorp", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4724 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4801 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1216 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3096 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3971 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3574 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ManoloPueblo/ContentCuisine_1-7B-slerp/13032961-52a1-43cf-b69d-1802c43e1bcc.json b/data/hfopenllm_v2/ManoloPueblo/ContentCuisine_1-7B-slerp/13032961-52a1-43cf-b69d-1802c43e1bcc.json deleted file mode 
100644 index 6a60d0dd4..000000000 --- a/data/hfopenllm_v2/ManoloPueblo/ContentCuisine_1-7B-slerp/13032961-52a1-43cf-b69d-1802c43e1bcc.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ManoloPueblo_ContentCuisine_1-7B-slerp/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ContentCuisine_1-7B-slerp", - "id": "ManoloPueblo/ContentCuisine_1-7B-slerp", - "developer": "ManoloPueblo", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3907 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5188 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0733 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3029 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4672 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3054 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ManoloPueblo/LLM_MERGE_CC2/9d444061-2c29-499a-8906-77ef58aba34d.json b/data/hfopenllm_v2/ManoloPueblo/LLM_MERGE_CC2/9d444061-2c29-499a-8906-77ef58aba34d.json deleted file mode 100644 index 2ebddc2a3..000000000 --- a/data/hfopenllm_v2/ManoloPueblo/LLM_MERGE_CC2/9d444061-2c29-499a-8906-77ef58aba34d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/ManoloPueblo_LLM_MERGE_CC2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "LLM_MERGE_CC2", - "id": "ManoloPueblo/LLM_MERGE_CC2", - "developer": "ManoloPueblo", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3853 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5209 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0642 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3045 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4593 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3032 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ManoloPueblo/LLM_MERGE_CC3/1ffdf6b0-b3a3-432a-a0e4-69b4d447bb76.json b/data/hfopenllm_v2/ManoloPueblo/LLM_MERGE_CC3/1ffdf6b0-b3a3-432a-a0e4-69b4d447bb76.json deleted file mode 100644 index 7e1671f54..000000000 --- a/data/hfopenllm_v2/ManoloPueblo/LLM_MERGE_CC3/1ffdf6b0-b3a3-432a-a0e4-69b4d447bb76.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ManoloPueblo_LLM_MERGE_CC3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - 
"evaluator_relationship": "third_party" - }, - "model_info": { - "name": "LLM_MERGE_CC3", - "id": "ManoloPueblo/LLM_MERGE_CC3", - "developer": "ManoloPueblo", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3959 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5246 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0793 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3096 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4672 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3156 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/MarinaraSpaghetti/NemoReRemix-12B/8ce733ea-e6e9-4f9b-ab28-f93202507265.json b/data/hfopenllm_v2/MarinaraSpaghetti/NemoReRemix-12B/8ce733ea-e6e9-4f9b-ab28-f93202507265.json deleted file mode 100644 index 805297be2..000000000 --- a/data/hfopenllm_v2/MarinaraSpaghetti/NemoReRemix-12B/8ce733ea-e6e9-4f9b-ab28-f93202507265.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/MarinaraSpaghetti_NemoReRemix-12B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "NemoReRemix-12B", - "id": "MarinaraSpaghetti/NemoReRemix-12B", - "developer": "MarinaraSpaghetti", - "inference_platform": "unknown", - "additional_details": { - 
"precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3343 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5537 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0906 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.318 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4501 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3598 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/MarinaraSpaghetti/Nemomix-v4.0-12B/0e88aa91-609c-4d2d-9296-25b06eeb0342.json b/data/hfopenllm_v2/MarinaraSpaghetti/Nemomix-v4.0-12B/0e88aa91-609c-4d2d-9296-25b06eeb0342.json deleted file mode 100644 index 6824fedbe..000000000 --- a/data/hfopenllm_v2/MarinaraSpaghetti/Nemomix-v4.0-12B/0e88aa91-609c-4d2d-9296-25b06eeb0342.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/MarinaraSpaghetti_Nemomix-v4.0-12B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Nemomix-v4.0-12B", - "id": "MarinaraSpaghetti/Nemomix-v4.0-12B", - "developer": "MarinaraSpaghetti", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - 
"source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5575 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5275 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.108 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2919 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4244 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3613 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Marsouuu/MiniMathExpert-2_61B-ECE-PRYMMAL-Martial/3e235ea0-3f04-4d99-9db2-7cafcbdbac6f.json b/data/hfopenllm_v2/Marsouuu/MiniMathExpert-2_61B-ECE-PRYMMAL-Martial/3e235ea0-3f04-4d99-9db2-7cafcbdbac6f.json deleted file mode 100644 index a4acc5d5f..000000000 --- a/data/hfopenllm_v2/Marsouuu/MiniMathExpert-2_61B-ECE-PRYMMAL-Martial/3e235ea0-3f04-4d99-9db2-7cafcbdbac6f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Marsouuu_MiniMathExpert-2_61B-ECE-PRYMMAL-Martial/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MiniMathExpert-2_61B-ECE-PRYMMAL-Martial", - "id": "Marsouuu/MiniMathExpert-2_61B-ECE-PRYMMAL-Martial", - "developer": "Marsouuu", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 2.614 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on 
IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2548 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3953 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.074 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2752 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4083 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2274 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Marsouuu/MiniQwenMathExpert-ECE-PRYMMAL-Martial/5e31a55c-f222-4192-b031-27bb40ba56fa.json b/data/hfopenllm_v2/Marsouuu/MiniQwenMathExpert-ECE-PRYMMAL-Martial/5e31a55c-f222-4192-b031-27bb40ba56fa.json deleted file mode 100644 index fe44d4d77..000000000 --- a/data/hfopenllm_v2/Marsouuu/MiniQwenMathExpert-ECE-PRYMMAL-Martial/5e31a55c-f222-4192-b031-27bb40ba56fa.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Marsouuu_MiniQwenMathExpert-ECE-PRYMMAL-Martial/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MiniQwenMathExpert-ECE-PRYMMAL-Martial", - "id": "Marsouuu/MiniQwenMathExpert-ECE-PRYMMAL-Martial", - "developer": "Marsouuu", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.777 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.2795 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.423 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.114 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2819 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3867 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2922 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Marsouuu/MistralBase-4x7B-MoE-ECE-PRYMMAL-Martial/11fd4b70-4ea7-4bee-8caf-8921d4c89f24.json b/data/hfopenllm_v2/Marsouuu/MistralBase-4x7B-MoE-ECE-PRYMMAL-Martial/11fd4b70-4ea7-4bee-8caf-8921d4c89f24.json deleted file mode 100644 index 8cb29fb63..000000000 --- a/data/hfopenllm_v2/Marsouuu/MistralBase-4x7B-MoE-ECE-PRYMMAL-Martial/11fd4b70-4ea7-4bee-8caf-8921d4c89f24.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Marsouuu_MistralBase-4x7B-MoE-ECE-PRYMMAL-Martial/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MistralBase-4x7B-MoE-ECE-PRYMMAL-Martial", - "id": "Marsouuu/MistralBase-4x7B-MoE-ECE-PRYMMAL-Martial", - "developer": "Marsouuu", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MixtralForCausalLM", - "params_billions": 24.16 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1697 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": 
"hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3464 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0144 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2592 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3991 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1379 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Marsouuu/general3B-ECE-PRYMMAL-Martial/8e721067-898d-45ca-b4f5-9f523c4ce3d3.json b/data/hfopenllm_v2/Marsouuu/general3B-ECE-PRYMMAL-Martial/8e721067-898d-45ca-b4f5-9f523c4ce3d3.json deleted file mode 100644 index 7e5d5f9c9..000000000 --- a/data/hfopenllm_v2/Marsouuu/general3B-ECE-PRYMMAL-Martial/8e721067-898d-45ca-b4f5-9f523c4ce3d3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Marsouuu_general3B-ECE-PRYMMAL-Martial/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "general3B-ECE-PRYMMAL-Martial", - "id": "Marsouuu/general3B-ECE-PRYMMAL-Martial", - "developer": "Marsouuu", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Phi3ForCausalLM", - "params_billions": 3.821 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2722 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 
0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5394 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1548 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3196 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4701 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3876 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Marsouuu/general3Bv2-ECE-PRYMMAL-Martial/be5d5480-ce4c-4ade-8c6a-c08cd2826909.json b/data/hfopenllm_v2/Marsouuu/general3Bv2-ECE-PRYMMAL-Martial/be5d5480-ce4c-4ade-8c6a-c08cd2826909.json deleted file mode 100644 index 2888af590..000000000 --- a/data/hfopenllm_v2/Marsouuu/general3Bv2-ECE-PRYMMAL-Martial/be5d5480-ce4c-4ade-8c6a-c08cd2826909.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Marsouuu_general3Bv2-ECE-PRYMMAL-Martial/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "general3Bv2-ECE-PRYMMAL-Martial", - "id": "Marsouuu/general3Bv2-ECE-PRYMMAL-Martial", - "developer": "Marsouuu", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5693 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5637 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - 
"source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3671 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3104 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4396 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4498 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Marsouuu/lareneg1_78B-ECE-PRYMMAL-Martial/54dec074-29f8-4863-be37-2c08f6f2c3cb.json b/data/hfopenllm_v2/Marsouuu/lareneg1_78B-ECE-PRYMMAL-Martial/54dec074-29f8-4863-be37-2c08f6f2c3cb.json deleted file mode 100644 index 21ecaf0b1..000000000 --- a/data/hfopenllm_v2/Marsouuu/lareneg1_78B-ECE-PRYMMAL-Martial/54dec074-29f8-4863-be37-2c08f6f2c3cb.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Marsouuu_lareneg1_78B-ECE-PRYMMAL-Martial/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "lareneg1_78B-ECE-PRYMMAL-Martial", - "id": "Marsouuu/lareneg1_78B-ECE-PRYMMAL-Martial", - "developer": "Marsouuu", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.777 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2795 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.423 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.114 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2819 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3867 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2922 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Marsouuu/lareneg3B-ECE-PRYMMAL-Martial/88a15025-556b-469d-be77-c773f2c61038.json b/data/hfopenllm_v2/Marsouuu/lareneg3B-ECE-PRYMMAL-Martial/88a15025-556b-469d-be77-c773f2c61038.json deleted file mode 100644 index f634cf074..000000000 --- a/data/hfopenllm_v2/Marsouuu/lareneg3B-ECE-PRYMMAL-Martial/88a15025-556b-469d-be77-c773f2c61038.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Marsouuu_lareneg3B-ECE-PRYMMAL-Martial/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "lareneg3B-ECE-PRYMMAL-Martial", - "id": "Marsouuu/lareneg3B-ECE-PRYMMAL-Martial", - "developer": "Marsouuu", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Phi3ForCausalLM", - "params_billions": 3.821 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3303 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5453 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1518 - } - }, - { - "evaluation_name": "GPQA", - 
"source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3247 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4725 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3767 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Marsouuu/lareneg3Bv2-ECE-PRYMMAL-Martial/b4f4596b-17e5-40bf-ae60-0b17492ba9f8.json b/data/hfopenllm_v2/Marsouuu/lareneg3Bv2-ECE-PRYMMAL-Martial/b4f4596b-17e5-40bf-ae60-0b17492ba9f8.json deleted file mode 100644 index 829e669c7..000000000 --- a/data/hfopenllm_v2/Marsouuu/lareneg3Bv2-ECE-PRYMMAL-Martial/b4f4596b-17e5-40bf-ae60-0b17492ba9f8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Marsouuu_lareneg3Bv2-ECE-PRYMMAL-Martial/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "lareneg3Bv2-ECE-PRYMMAL-Martial", - "id": "Marsouuu/lareneg3Bv2-ECE-PRYMMAL-Martial", - "developer": "Marsouuu", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5753 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5623 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3656 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3196 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4369 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4511 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/MaziyarPanahi/Calme-4x7B-MoE-v0.1/97ce858e-a64f-4881-b6d0-0a2c0814336d.json b/data/hfopenllm_v2/MaziyarPanahi/Calme-4x7B-MoE-v0.1/97ce858e-a64f-4881-b6d0-0a2c0814336d.json deleted file mode 100644 index e5c5e6482..000000000 --- a/data/hfopenllm_v2/MaziyarPanahi/Calme-4x7B-MoE-v0.1/97ce858e-a64f-4881-b6d0-0a2c0814336d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/MaziyarPanahi_Calme-4x7B-MoE-v0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Calme-4x7B-MoE-v0.1", - "id": "MaziyarPanahi/Calme-4x7B-MoE-v0.1", - "developer": "MaziyarPanahi", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MixtralForCausalLM", - "params_billions": 24.154 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4315 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5103 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0801 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2819 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": 
"MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4199 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3057 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/MaziyarPanahi/Calme-4x7B-MoE-v0.2/1becd83e-e9b8-49c1-a137-80c5a8dbdf0d.json b/data/hfopenllm_v2/MaziyarPanahi/Calme-4x7B-MoE-v0.2/1becd83e-e9b8-49c1-a137-80c5a8dbdf0d.json deleted file mode 100644 index d772d36c9..000000000 --- a/data/hfopenllm_v2/MaziyarPanahi/Calme-4x7B-MoE-v0.2/1becd83e-e9b8-49c1-a137-80c5a8dbdf0d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/MaziyarPanahi_Calme-4x7B-MoE-v0.2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Calme-4x7B-MoE-v0.2", - "id": "MaziyarPanahi/Calme-4x7B-MoE-v0.2", - "developer": "MaziyarPanahi", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MixtralForCausalLM", - "params_billions": 24.154 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4294 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5111 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.074 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2794 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.4318 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3058 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/MaziyarPanahi/Llama-3-70B-Instruct-v0.1/337bb321-9c6e-4751-9c9b-d8ba0120dd07.json b/data/hfopenllm_v2/MaziyarPanahi/Llama-3-70B-Instruct-v0.1/337bb321-9c6e-4751-9c9b-d8ba0120dd07.json deleted file mode 100644 index b12b4dd7c..000000000 --- a/data/hfopenllm_v2/MaziyarPanahi/Llama-3-70B-Instruct-v0.1/337bb321-9c6e-4751-9c9b-d8ba0120dd07.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/MaziyarPanahi_Llama-3-70B-Instruct-v0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-70B-Instruct-v0.1", - "id": "MaziyarPanahi/Llama-3-70B-Instruct-v0.1", - "developer": "MaziyarPanahi", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 70.554 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4714 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5366 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1805 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2844 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4433 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", 
- "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4618 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/MaziyarPanahi/Llama-3-8B-Instruct-v0.10/cfa95cc9-5bb1-4921-97c7-078f2f929a2f.json b/data/hfopenllm_v2/MaziyarPanahi/Llama-3-8B-Instruct-v0.10/cfa95cc9-5bb1-4921-97c7-078f2f929a2f.json deleted file mode 100644 index 2084bb4eb..000000000 --- a/data/hfopenllm_v2/MaziyarPanahi/Llama-3-8B-Instruct-v0.10/cfa95cc9-5bb1-4921-97c7-078f2f929a2f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/MaziyarPanahi_Llama-3-8B-Instruct-v0.10/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-8B-Instruct-v0.10", - "id": "MaziyarPanahi/Llama-3-8B-Instruct-v0.10", - "developer": "MaziyarPanahi", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7667 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4924 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0574 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3087 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4214 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 
0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3862 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/MaziyarPanahi/Llama-3-8B-Instruct-v0.8/6d5ba3c4-a0c2-40cd-9766-68d36d21c5b6.json b/data/hfopenllm_v2/MaziyarPanahi/Llama-3-8B-Instruct-v0.8/6d5ba3c4-a0c2-40cd-9766-68d36d21c5b6.json deleted file mode 100644 index 9ec03b55c..000000000 --- a/data/hfopenllm_v2/MaziyarPanahi/Llama-3-8B-Instruct-v0.8/6d5ba3c4-a0c2-40cd-9766-68d36d21c5b6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/MaziyarPanahi_Llama-3-8B-Instruct-v0.8/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-8B-Instruct-v0.8", - "id": "MaziyarPanahi/Llama-3-8B-Instruct-v0.8", - "developer": "MaziyarPanahi", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7528 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4963 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0778 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3054 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4202 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3853 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/MaziyarPanahi/Llama-3-8B-Instruct-v0.9/6cc4404a-f3e1-47b9-b56b-34e4269e1261.json b/data/hfopenllm_v2/MaziyarPanahi/Llama-3-8B-Instruct-v0.9/6cc4404a-f3e1-47b9-b56b-34e4269e1261.json deleted file mode 100644 index 9730bfb53..000000000 --- a/data/hfopenllm_v2/MaziyarPanahi/Llama-3-8B-Instruct-v0.9/6cc4404a-f3e1-47b9-b56b-34e4269e1261.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/MaziyarPanahi_Llama-3-8B-Instruct-v0.9/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-8B-Instruct-v0.9", - "id": "MaziyarPanahi/Llama-3-8B-Instruct-v0.9", - "developer": "MaziyarPanahi", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.763 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4936 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0733 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3079 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4148 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3846 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/MaziyarPanahi/Qwen1.5-MoE-A2.7B-Wikihow/8d820e43-ff42-4247-9ad0-4ed8e70672b4.json 
b/data/hfopenllm_v2/MaziyarPanahi/Qwen1.5-MoE-A2.7B-Wikihow/8d820e43-ff42-4247-9ad0-4ed8e70672b4.json deleted file mode 100644 index 25b553bee..000000000 --- a/data/hfopenllm_v2/MaziyarPanahi/Qwen1.5-MoE-A2.7B-Wikihow/8d820e43-ff42-4247-9ad0-4ed8e70672b4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/MaziyarPanahi_Qwen1.5-MoE-A2.7B-Wikihow/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen1.5-MoE-A2.7B-Wikihow", - "id": "MaziyarPanahi/Qwen1.5-MoE-A2.7B-Wikihow", - "developer": "MaziyarPanahi", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2MoeForCausalLM", - "params_billions": 14.316 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2954 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.392 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0823 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2752 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3502 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.238 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/MaziyarPanahi/Qwen2-7B-Instruct-v0.1/d858ce8e-6a4b-46b1-8d51-03ebc2d8aaec.json b/data/hfopenllm_v2/MaziyarPanahi/Qwen2-7B-Instruct-v0.1/d858ce8e-6a4b-46b1-8d51-03ebc2d8aaec.json deleted file mode 100644 index 1ec344d87..000000000 --- 
a/data/hfopenllm_v2/MaziyarPanahi/Qwen2-7B-Instruct-v0.1/d858ce8e-6a4b-46b1-8d51-03ebc2d8aaec.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/MaziyarPanahi_Qwen2-7B-Instruct-v0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2-7B-Instruct-v0.1", - "id": "MaziyarPanahi/Qwen2-7B-Instruct-v0.1", - "developer": "MaziyarPanahi", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3352 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5123 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2213 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2852 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4435 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3857 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/MaziyarPanahi/Qwen2-7B-Instruct-v0.8/9813dd88-ff70-4d9e-86c5-9b73444275c5.json b/data/hfopenllm_v2/MaziyarPanahi/Qwen2-7B-Instruct-v0.8/9813dd88-ff70-4d9e-86c5-9b73444275c5.json deleted file mode 100644 index 1b9f32279..000000000 --- a/data/hfopenllm_v2/MaziyarPanahi/Qwen2-7B-Instruct-v0.8/9813dd88-ff70-4d9e-86c5-9b73444275c5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/MaziyarPanahi_Qwen2-7B-Instruct-v0.8/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2-7B-Instruct-v0.8", - "id": "MaziyarPanahi/Qwen2-7B-Instruct-v0.8", - "developer": "MaziyarPanahi", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2775 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4637 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1767 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2936 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4293 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3566 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/MaziyarPanahi/calme-2.1-llama3.1-70b/ac677432-e7d1-4439-9c05-426059c285ef.json b/data/hfopenllm_v2/MaziyarPanahi/calme-2.1-llama3.1-70b/ac677432-e7d1-4439-9c05-426059c285ef.json deleted file mode 100644 index 3872599b8..000000000 --- a/data/hfopenllm_v2/MaziyarPanahi/calme-2.1-llama3.1-70b/ac677432-e7d1-4439-9c05-426059c285ef.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/MaziyarPanahi_calme-2.1-llama3.1-70b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": 
"documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "calme-2.1-llama3.1-70b", - "id": "MaziyarPanahi/calme-2.1-llama3.1-70b", - "developer": "MaziyarPanahi", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 70.554 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8434 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6448 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4101 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.328 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.438 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5283 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/MaziyarPanahi/calme-2.1-phi3-4b/018f270f-3cfe-403c-a236-483038a0b04e.json b/data/hfopenllm_v2/MaziyarPanahi/calme-2.1-phi3-4b/018f270f-3cfe-403c-a236-483038a0b04e.json deleted file mode 100644 index d1bb5a3bb..000000000 --- a/data/hfopenllm_v2/MaziyarPanahi/calme-2.1-phi3-4b/018f270f-3cfe-403c-a236-483038a0b04e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/MaziyarPanahi_calme-2.1-phi3-4b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "calme-2.1-phi3-4b", - "id": "MaziyarPanahi/calme-2.1-phi3-4b", - "developer": 
"MaziyarPanahi", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Phi3ForCausalLM", - "params_billions": 3.821 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5525 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5595 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1314 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3297 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4015 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3746 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/MaziyarPanahi/calme-2.1-phi3.5-4b/718a40ea-26b1-4cf4-9584-57be798640ae.json b/data/hfopenllm_v2/MaziyarPanahi/calme-2.1-phi3.5-4b/718a40ea-26b1-4cf4-9584-57be798640ae.json deleted file mode 100644 index 6ca32abe5..000000000 --- a/data/hfopenllm_v2/MaziyarPanahi/calme-2.1-phi3.5-4b/718a40ea-26b1-4cf4-9584-57be798640ae.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/MaziyarPanahi_calme-2.1-phi3.5-4b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "calme-2.1-phi3.5-4b", - "id": "MaziyarPanahi/calme-2.1-phi3.5-4b", - "developer": "MaziyarPanahi", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Phi3ForCausalLM", - "params_billions": 3.821 - } - }, - "evaluation_results": [ - { - 
"evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5659 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5484 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2039 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.344 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3995 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3935 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/MaziyarPanahi/calme-2.1-qwen2-72b/207a28a9-ae24-4a31-be95-96296b2e466d.json b/data/hfopenllm_v2/MaziyarPanahi/calme-2.1-qwen2-72b/207a28a9-ae24-4a31-be95-96296b2e466d.json deleted file mode 100644 index 847508496..000000000 --- a/data/hfopenllm_v2/MaziyarPanahi/calme-2.1-qwen2-72b/207a28a9-ae24-4a31-be95-96296b2e466d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/MaziyarPanahi_calme-2.1-qwen2-72b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "calme-2.1-qwen2-72b", - "id": "MaziyarPanahi/calme-2.1-qwen2-72b", - "developer": "MaziyarPanahi", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 72.699 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8163 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6966 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4079 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3809 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4732 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5415 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/MaziyarPanahi/calme-2.1-qwen2-7b/72efedb8-d456-41ed-b1ae-4887cb6c18f8.json b/data/hfopenllm_v2/MaziyarPanahi/calme-2.1-qwen2-7b/72efedb8-d456-41ed-b1ae-4887cb6c18f8.json deleted file mode 100644 index e64c522fa..000000000 --- a/data/hfopenllm_v2/MaziyarPanahi/calme-2.1-qwen2-7b/72efedb8-d456-41ed-b1ae-4887cb6c18f8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/MaziyarPanahi_calme-2.1-qwen2-7b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "calme-2.1-qwen2-7b", - "id": "MaziyarPanahi/calme-2.1-qwen2-7b", - "developer": "MaziyarPanahi", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3816 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - 
"source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5046 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2311 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2894 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4437 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3693 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/MaziyarPanahi/calme-2.1-qwen2.5-72b/ac91fb37-5742-4a3d-b93a-86c63b90cad5.json b/data/hfopenllm_v2/MaziyarPanahi/calme-2.1-qwen2.5-72b/ac91fb37-5742-4a3d-b93a-86c63b90cad5.json deleted file mode 100644 index d23d575cb..000000000 --- a/data/hfopenllm_v2/MaziyarPanahi/calme-2.1-qwen2.5-72b/ac91fb37-5742-4a3d-b93a-86c63b90cad5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/MaziyarPanahi_calme-2.1-qwen2.5-72b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "calme-2.1-qwen2.5-72b", - "id": "MaziyarPanahi/calme-2.1-qwen2.5-72b", - "developer": "MaziyarPanahi", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 72.7 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8662 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.7262 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5914 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3633 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4298 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5619 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/MaziyarPanahi/calme-2.1-rys-78b/c71d025d-e954-4420-b397-e07c3644d1f4.json b/data/hfopenllm_v2/MaziyarPanahi/calme-2.1-rys-78b/c71d025d-e954-4420-b397-e07c3644d1f4.json deleted file mode 100644 index d2b0fca22..000000000 --- a/data/hfopenllm_v2/MaziyarPanahi/calme-2.1-rys-78b/c71d025d-e954-4420-b397-e07c3644d1f4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/MaziyarPanahi_calme-2.1-rys-78b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "calme-2.1-rys-78b", - "id": "MaziyarPanahi/calme-2.1-rys-78b", - "developer": "MaziyarPanahi", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 77.965 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8136 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7098 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": 
"DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3943 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3943 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4693 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5444 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/MaziyarPanahi/calme-2.2-llama3-70b/968c3759-de5f-4255-ba95-cafc7a3c70a7.json b/data/hfopenllm_v2/MaziyarPanahi/calme-2.2-llama3-70b/968c3759-de5f-4255-ba95-cafc7a3c70a7.json deleted file mode 100644 index 6a8c1b645..000000000 --- a/data/hfopenllm_v2/MaziyarPanahi/calme-2.2-llama3-70b/968c3759-de5f-4255-ba95-cafc7a3c70a7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/MaziyarPanahi_calme-2.2-llama3-70b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "calme-2.2-llama3-70b", - "id": "MaziyarPanahi/calme-2.2-llama3-70b", - "developer": "MaziyarPanahi", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 70.554 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8208 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6435 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 
1.0 - }, - "score_details": { - "score": 0.2394 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3414 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4446 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5207 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/MaziyarPanahi/calme-2.2-llama3.1-70b/5e23b2f7-33f7-4e49-b73a-a02b8650ee0d.json b/data/hfopenllm_v2/MaziyarPanahi/calme-2.2-llama3.1-70b/5e23b2f7-33f7-4e49-b73a-a02b8650ee0d.json deleted file mode 100644 index 27dee1772..000000000 --- a/data/hfopenllm_v2/MaziyarPanahi/calme-2.2-llama3.1-70b/5e23b2f7-33f7-4e49-b73a-a02b8650ee0d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/MaziyarPanahi_calme-2.2-llama3.1-70b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "calme-2.2-llama3.1-70b", - "id": "MaziyarPanahi/calme-2.2-llama3.1-70b", - "developer": "MaziyarPanahi", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 70.554 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8593 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6793 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4366 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3247 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4542 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5415 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/MaziyarPanahi/calme-2.2-phi3-4b/1b6c64f6-acf8-4cff-bcae-6e8b3725c6f1.json b/data/hfopenllm_v2/MaziyarPanahi/calme-2.2-phi3-4b/1b6c64f6-acf8-4cff-bcae-6e8b3725c6f1.json deleted file mode 100644 index 76e64261b..000000000 --- a/data/hfopenllm_v2/MaziyarPanahi/calme-2.2-phi3-4b/1b6c64f6-acf8-4cff-bcae-6e8b3725c6f1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/MaziyarPanahi_calme-2.2-phi3-4b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "calme-2.2-phi3-4b", - "id": "MaziyarPanahi/calme-2.2-phi3-4b", - "developer": "MaziyarPanahi", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Phi3ForCausalLM", - "params_billions": 3.821 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5069 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.553 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.145 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3213 - } - }, - { - 
"evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3976 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3814 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/MaziyarPanahi/calme-2.2-qwen2-72b/7908f572-8886-4add-ae84-b4ec0ec17c26.json b/data/hfopenllm_v2/MaziyarPanahi/calme-2.2-qwen2-72b/7908f572-8886-4add-ae84-b4ec0ec17c26.json deleted file mode 100644 index ac04e14a8..000000000 --- a/data/hfopenllm_v2/MaziyarPanahi/calme-2.2-qwen2-72b/7908f572-8886-4add-ae84-b4ec0ec17c26.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/MaziyarPanahi_calme-2.2-qwen2-72b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "calme-2.2-qwen2-72b", - "id": "MaziyarPanahi/calme-2.2-qwen2-72b", - "developer": "MaziyarPanahi", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 72.706 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8008 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.694 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4532 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3742 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4508 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5435 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/MaziyarPanahi/calme-2.2-qwen2-7b/9e04ec5c-2208-4569-9b63-4768ed4262b9.json b/data/hfopenllm_v2/MaziyarPanahi/calme-2.2-qwen2-7b/9e04ec5c-2208-4569-9b63-4768ed4262b9.json deleted file mode 100644 index 1a6f153fa..000000000 --- a/data/hfopenllm_v2/MaziyarPanahi/calme-2.2-qwen2-7b/9e04ec5c-2208-4569-9b63-4768ed4262b9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/MaziyarPanahi_calme-2.2-qwen2-7b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "calme-2.2-qwen2-7b", - "id": "MaziyarPanahi/calme-2.2-qwen2-7b", - "developer": "MaziyarPanahi", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3597 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5215 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2145 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2911 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4358 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": 
"MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3899 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/MaziyarPanahi/calme-2.2-qwen2.5-72b/ee2c8beb-6566-4b19-91d0-8e48c12a3fdf.json b/data/hfopenllm_v2/MaziyarPanahi/calme-2.2-qwen2.5-72b/ee2c8beb-6566-4b19-91d0-8e48c12a3fdf.json deleted file mode 100644 index 6b2655c95..000000000 --- a/data/hfopenllm_v2/MaziyarPanahi/calme-2.2-qwen2.5-72b/ee2c8beb-6566-4b19-91d0-8e48c12a3fdf.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/MaziyarPanahi_calme-2.2-qwen2.5-72b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "calme-2.2-qwen2.5-72b", - "id": "MaziyarPanahi/calme-2.2-qwen2.5-72b", - "developer": "MaziyarPanahi", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 72.7 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8477 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7276 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5891 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3591 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4207 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", 
- "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5618 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/MaziyarPanahi/calme-2.2-rys-78b/c7579616-0c21-443a-a149-0c51a0ae92ac.json b/data/hfopenllm_v2/MaziyarPanahi/calme-2.2-rys-78b/c7579616-0c21-443a-a149-0c51a0ae92ac.json deleted file mode 100644 index 554ca11d3..000000000 --- a/data/hfopenllm_v2/MaziyarPanahi/calme-2.2-rys-78b/c7579616-0c21-443a-a149-0c51a0ae92ac.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/MaziyarPanahi_calme-2.2-rys-78b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "calme-2.2-rys-78b", - "id": "MaziyarPanahi/calme-2.2-rys-78b", - "developer": "MaziyarPanahi", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 77.965 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7986 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7081 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4071 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4069 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4536 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5386 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/MaziyarPanahi/calme-2.3-llama3-70b/ef7a1429-db2f-433b-a606-339a9d868e7a.json 
b/data/hfopenllm_v2/MaziyarPanahi/calme-2.3-llama3-70b/ef7a1429-db2f-433b-a606-339a9d868e7a.json deleted file mode 100644 index 84b0cfd4e..000000000 --- a/data/hfopenllm_v2/MaziyarPanahi/calme-2.3-llama3-70b/ef7a1429-db2f-433b-a606-339a9d868e7a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/MaziyarPanahi_calme-2.3-llama3-70b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "calme-2.3-llama3-70b", - "id": "MaziyarPanahi/calme-2.3-llama3-70b", - "developer": "MaziyarPanahi", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 70.554 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.801 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6399 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2326 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3381 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4261 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5204 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/MaziyarPanahi/calme-2.3-llama3.1-70b/f531e13c-79ed-45da-a246-857fd2c884c1.json b/data/hfopenllm_v2/MaziyarPanahi/calme-2.3-llama3.1-70b/f531e13c-79ed-45da-a246-857fd2c884c1.json deleted file mode 100644 index 9e97146e6..000000000 --- 
a/data/hfopenllm_v2/MaziyarPanahi/calme-2.3-llama3.1-70b/f531e13c-79ed-45da-a246-857fd2c884c1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/MaziyarPanahi_calme-2.3-llama3.1-70b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "calme-2.3-llama3.1-70b", - "id": "MaziyarPanahi/calme-2.3-llama3.1-70b", - "developer": "MaziyarPanahi", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 70.554 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8605 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6872 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3927 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.344 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4568 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5363 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/MaziyarPanahi/calme-2.3-phi3-4b/0f525d93-663a-442c-9a51-1ad3a5054172.json b/data/hfopenllm_v2/MaziyarPanahi/calme-2.3-phi3-4b/0f525d93-663a-442c-9a51-1ad3a5054172.json deleted file mode 100644 index bc2d4c497..000000000 --- a/data/hfopenllm_v2/MaziyarPanahi/calme-2.3-phi3-4b/0f525d93-663a-442c-9a51-1ad3a5054172.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/MaziyarPanahi_calme-2.3-phi3-4b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "calme-2.3-phi3-4b", - "id": "MaziyarPanahi/calme-2.3-phi3-4b", - "developer": "MaziyarPanahi", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Phi3ForCausalLM", - "params_billions": 3.821 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4926 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5538 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1473 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.318 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3988 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3828 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/MaziyarPanahi/calme-2.3-qwen2-72b/15af21e1-3193-47fa-a3fc-1f087216d4d9.json b/data/hfopenllm_v2/MaziyarPanahi/calme-2.3-qwen2-72b/15af21e1-3193-47fa-a3fc-1f087216d4d9.json deleted file mode 100644 index f937b7720..000000000 --- a/data/hfopenllm_v2/MaziyarPanahi/calme-2.3-qwen2-72b/15af21e1-3193-47fa-a3fc-1f087216d4d9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/MaziyarPanahi_calme-2.3-qwen2-72b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - 
"source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "calme-2.3-qwen2-72b", - "id": "MaziyarPanahi/calme-2.3-qwen2-72b", - "developer": "MaziyarPanahi", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 72.706 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.385 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6576 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3172 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3716 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4112 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5419 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/MaziyarPanahi/calme-2.3-qwen2-7b/67b270d9-3422-4770-9957-7bde65acca0a.json b/data/hfopenllm_v2/MaziyarPanahi/calme-2.3-qwen2-7b/67b270d9-3422-4770-9957-7bde65acca0a.json deleted file mode 100644 index ed4ee9992..000000000 --- a/data/hfopenllm_v2/MaziyarPanahi/calme-2.3-qwen2-7b/67b270d9-3422-4770-9957-7bde65acca0a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/MaziyarPanahi_calme-2.3-qwen2-7b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "calme-2.3-qwen2-7b", - "id": "MaziyarPanahi/calme-2.3-qwen2-7b", - "developer": "MaziyarPanahi", - 
"inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3825 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5064 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2069 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.297 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4422 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3611 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/MaziyarPanahi/calme-2.3-rys-78b/e2d38bcc-9133-4051-82d0-4e4fd66e00f8.json b/data/hfopenllm_v2/MaziyarPanahi/calme-2.3-rys-78b/e2d38bcc-9133-4051-82d0-4e4fd66e00f8.json deleted file mode 100644 index 90960df8c..000000000 --- a/data/hfopenllm_v2/MaziyarPanahi/calme-2.3-rys-78b/e2d38bcc-9133-4051-82d0-4e4fd66e00f8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/MaziyarPanahi_calme-2.3-rys-78b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "calme-2.3-rys-78b", - "id": "MaziyarPanahi/calme-2.3-rys-78b", - "developer": "MaziyarPanahi", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 77.965 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - 
"source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8066 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7108 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.398 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4044 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4549 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5475 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/MaziyarPanahi/calme-2.4-llama3-70b/4ff256af-73c7-4a5a-96da-19546a786c59.json b/data/hfopenllm_v2/MaziyarPanahi/calme-2.4-llama3-70b/4ff256af-73c7-4a5a-96da-19546a786c59.json deleted file mode 100644 index 5cac03e59..000000000 --- a/data/hfopenllm_v2/MaziyarPanahi/calme-2.4-llama3-70b/4ff256af-73c7-4a5a-96da-19546a786c59.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/MaziyarPanahi_calme-2.4-llama3-70b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "calme-2.4-llama3-70b", - "id": "MaziyarPanahi/calme-2.4-llama3-70b", - "developer": "MaziyarPanahi", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 70.554 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5027 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6418 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2447 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3398 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4288 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5204 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/MaziyarPanahi/calme-2.4-qwen2-7b/225cbeef-1d0d-40fc-949d-4ba6696fb690.json b/data/hfopenllm_v2/MaziyarPanahi/calme-2.4-qwen2-7b/225cbeef-1d0d-40fc-949d-4ba6696fb690.json deleted file mode 100644 index b84becf6e..000000000 --- a/data/hfopenllm_v2/MaziyarPanahi/calme-2.4-qwen2-7b/225cbeef-1d0d-40fc-949d-4ba6696fb690.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/MaziyarPanahi_calme-2.4-qwen2-7b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "calme-2.4-qwen2-7b", - "id": "MaziyarPanahi/calme-2.4-qwen2-7b", - "developer": "MaziyarPanahi", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.33 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - 
"hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5101 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2032 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2836 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4453 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3977 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/MaziyarPanahi/calme-2.4-rys-78b/24fcd662-5abb-4bf8-b8df-1c21b048cd92.json b/data/hfopenllm_v2/MaziyarPanahi/calme-2.4-rys-78b/24fcd662-5abb-4bf8-b8df-1c21b048cd92.json deleted file mode 100644 index 6535c029d..000000000 --- a/data/hfopenllm_v2/MaziyarPanahi/calme-2.4-rys-78b/24fcd662-5abb-4bf8-b8df-1c21b048cd92.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/MaziyarPanahi_calme-2.4-rys-78b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "calme-2.4-rys-78b", - "id": "MaziyarPanahi/calme-2.4-rys-78b", - "developer": "MaziyarPanahi", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 77.965 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8011 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.728 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4071 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4027 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5771 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7002 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/MaziyarPanahi/calme-2.5-qwen2-7b/7badcb45-7826-4fd1-b964-c697fbda76cc.json b/data/hfopenllm_v2/MaziyarPanahi/calme-2.5-qwen2-7b/7badcb45-7826-4fd1-b964-c697fbda76cc.json deleted file mode 100644 index 76077262d..000000000 --- a/data/hfopenllm_v2/MaziyarPanahi/calme-2.5-qwen2-7b/7badcb45-7826-4fd1-b964-c697fbda76cc.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/MaziyarPanahi_calme-2.5-qwen2-7b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "calme-2.5-qwen2-7b", - "id": "MaziyarPanahi/calme-2.5-qwen2-7b", - "developer": "MaziyarPanahi", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3145 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4887 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - 
"evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2258 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3104 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4565 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3682 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/MaziyarPanahi/calme-2.6-qwen2-7b/bfb532f1-3319-46ff-80ae-0ca783a18bb6.json b/data/hfopenllm_v2/MaziyarPanahi/calme-2.6-qwen2-7b/bfb532f1-3319-46ff-80ae-0ca783a18bb6.json deleted file mode 100644 index 901ba7227..000000000 --- a/data/hfopenllm_v2/MaziyarPanahi/calme-2.6-qwen2-7b/bfb532f1-3319-46ff-80ae-0ca783a18bb6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/MaziyarPanahi_calme-2.6-qwen2-7b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "calme-2.6-qwen2-7b", - "id": "MaziyarPanahi/calme-2.6-qwen2-7b", - "developer": "MaziyarPanahi", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3443 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.493 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1216 - } - }, - { - 
"evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2844 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4586 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3732 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/MaziyarPanahi/calme-2.7-qwen2-7b/ea304515-b41f-4e96-a0ec-78c897ebf9a4.json b/data/hfopenllm_v2/MaziyarPanahi/calme-2.7-qwen2-7b/ea304515-b41f-4e96-a0ec-78c897ebf9a4.json deleted file mode 100644 index 4bb85f9fc..000000000 --- a/data/hfopenllm_v2/MaziyarPanahi/calme-2.7-qwen2-7b/ea304515-b41f-4e96-a0ec-78c897ebf9a4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/MaziyarPanahi_calme-2.7-qwen2-7b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "calme-2.7-qwen2-7b", - "id": "MaziyarPanahi/calme-2.7-qwen2-7b", - "developer": "MaziyarPanahi", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3592 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4883 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1382 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": 
false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2911 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4824 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3705 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/MaziyarPanahi/calme-3.1-baguette-3b/1fe79ea5-1922-4a5e-8857-1c832353b0a6.json b/data/hfopenllm_v2/MaziyarPanahi/calme-3.1-baguette-3b/1fe79ea5-1922-4a5e-8857-1c832353b0a6.json deleted file mode 100644 index 1217267f3..000000000 --- a/data/hfopenllm_v2/MaziyarPanahi/calme-3.1-baguette-3b/1fe79ea5-1922-4a5e-8857-1c832353b0a6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/MaziyarPanahi_calme-3.1-baguette-3b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "calme-3.1-baguette-3b", - "id": "MaziyarPanahi/calme-3.1-baguette-3b", - "developer": "MaziyarPanahi", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.085 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6234 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4683 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.256 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2861 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - 
"source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4008 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3399 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/MaziyarPanahi/calme-3.1-instruct-3b/9098d70f-cbcd-4f6c-bcba-0b1da743396e.json b/data/hfopenllm_v2/MaziyarPanahi/calme-3.1-instruct-3b/9098d70f-cbcd-4f6c-bcba-0b1da743396e.json deleted file mode 100644 index dfeeb7050..000000000 --- a/data/hfopenllm_v2/MaziyarPanahi/calme-3.1-instruct-3b/9098d70f-cbcd-4f6c-bcba-0b1da743396e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/MaziyarPanahi_calme-3.1-instruct-3b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "calme-3.1-instruct-3b", - "id": "MaziyarPanahi/calme-3.1-instruct-3b", - "developer": "MaziyarPanahi", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.085 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4336 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4813 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1775 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2861 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.3952 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3557 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/MaziyarPanahi/calme-3.1-instruct-78b/df4ed9e0-30bc-4a3f-b7a2-8955cbb38d31.json b/data/hfopenllm_v2/MaziyarPanahi/calme-3.1-instruct-78b/df4ed9e0-30bc-4a3f-b7a2-8955cbb38d31.json deleted file mode 100644 index 8dfc528f4..000000000 --- a/data/hfopenllm_v2/MaziyarPanahi/calme-3.1-instruct-78b/df4ed9e0-30bc-4a3f-b7a2-8955cbb38d31.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/MaziyarPanahi_calme-3.1-instruct-78b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "calme-3.1-instruct-78b", - "id": "MaziyarPanahi/calme-3.1-instruct-78b", - "developer": "MaziyarPanahi", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 77.965 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8136 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7305 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3927 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.396 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5891 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": 
"TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7185 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/MaziyarPanahi/calme-3.1-llamaloi-3b/f68957d5-20a1-438f-9931-6a787aaed467.json b/data/hfopenllm_v2/MaziyarPanahi/calme-3.1-llamaloi-3b/f68957d5-20a1-438f-9931-6a787aaed467.json deleted file mode 100644 index d3b477ce7..000000000 --- a/data/hfopenllm_v2/MaziyarPanahi/calme-3.1-llamaloi-3b/f68957d5-20a1-438f-9931-6a787aaed467.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/MaziyarPanahi_calme-3.1-llamaloi-3b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "calme-3.1-llamaloi-3b", - "id": "MaziyarPanahi/calme-3.1-llamaloi-3b", - "developer": "MaziyarPanahi", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7375 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4587 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.173 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.281 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3515 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.3205 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/MaziyarPanahi/calme-3.2-baguette-3b/416e0c04-9119-4230-ba71-b0f47e2d4997.json b/data/hfopenllm_v2/MaziyarPanahi/calme-3.2-baguette-3b/416e0c04-9119-4230-ba71-b0f47e2d4997.json deleted file mode 100644 index 339284c10..000000000 --- a/data/hfopenllm_v2/MaziyarPanahi/calme-3.2-baguette-3b/416e0c04-9119-4230-ba71-b0f47e2d4997.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/MaziyarPanahi_calme-3.2-baguette-3b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "calme-3.2-baguette-3b", - "id": "MaziyarPanahi/calme-3.2-baguette-3b", - "developer": "MaziyarPanahi", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.085 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6338 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4709 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2825 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2945 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4021 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3338 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/MaziyarPanahi/calme-3.2-instruct-3b/d57780e2-154e-437d-ac2f-0007e1f9140e.json 
b/data/hfopenllm_v2/MaziyarPanahi/calme-3.2-instruct-3b/d57780e2-154e-437d-ac2f-0007e1f9140e.json deleted file mode 100644 index b0efacce9..000000000 --- a/data/hfopenllm_v2/MaziyarPanahi/calme-3.2-instruct-3b/d57780e2-154e-437d-ac2f-0007e1f9140e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/MaziyarPanahi_calme-3.2-instruct-3b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "calme-3.2-instruct-3b", - "id": "MaziyarPanahi/calme-3.2-instruct-3b", - "developer": "MaziyarPanahi", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.086 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5533 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4866 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2168 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2836 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4047 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3653 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/MaziyarPanahi/calme-3.2-instruct-78b/027d464b-1375-4de7-aa57-e1473d16ba89.json b/data/hfopenllm_v2/MaziyarPanahi/calme-3.2-instruct-78b/027d464b-1375-4de7-aa57-e1473d16ba89.json deleted file mode 100644 index 41995ee2f..000000000 --- 
a/data/hfopenllm_v2/MaziyarPanahi/calme-3.2-instruct-78b/027d464b-1375-4de7-aa57-e1473d16ba89.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/MaziyarPanahi_calme-3.2-instruct-78b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "calme-3.2-instruct-78b", - "id": "MaziyarPanahi/calme-3.2-instruct-78b", - "developer": "MaziyarPanahi", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 77.965 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8063 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7319 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4033 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4027 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6024 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7303 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/MaziyarPanahi/calme-3.3-baguette-3b/a81f20fa-57e8-498c-a162-6d8a9be09ee6.json b/data/hfopenllm_v2/MaziyarPanahi/calme-3.3-baguette-3b/a81f20fa-57e8-498c-a162-6d8a9be09ee6.json deleted file mode 100644 index 3d87a48e4..000000000 --- a/data/hfopenllm_v2/MaziyarPanahi/calme-3.3-baguette-3b/a81f20fa-57e8-498c-a162-6d8a9be09ee6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/MaziyarPanahi_calme-3.3-baguette-3b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "calme-3.3-baguette-3b", - "id": "MaziyarPanahi/calme-3.3-baguette-3b", - "developer": "MaziyarPanahi", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.086 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.636 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4678 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3807 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2802 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3928 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3342 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/MaziyarPanahi/calme-3.3-instruct-3b/d72ddbff-8ff7-446f-a74a-10a46bce6e3e.json b/data/hfopenllm_v2/MaziyarPanahi/calme-3.3-instruct-3b/d72ddbff-8ff7-446f-a74a-10a46bce6e3e.json deleted file mode 100644 index 49ca1c4b5..000000000 --- a/data/hfopenllm_v2/MaziyarPanahi/calme-3.3-instruct-3b/d72ddbff-8ff7-446f-a74a-10a46bce6e3e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/MaziyarPanahi_calme-3.3-instruct-3b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - 
"source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "calme-3.3-instruct-3b", - "id": "MaziyarPanahi/calme-3.3-instruct-3b", - "developer": "MaziyarPanahi", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.086 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6423 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4693 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3739 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2827 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4074 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3305 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Minami-su/Amara-o1-7B-Qwen/f681d612-f574-4641-b34e-95b6de97f9e8.json b/data/hfopenllm_v2/Minami-su/Amara-o1-7B-Qwen/f681d612-f574-4641-b34e-95b6de97f9e8.json deleted file mode 100644 index ad3d67280..000000000 --- a/data/hfopenllm_v2/Minami-su/Amara-o1-7B-Qwen/f681d612-f574-4641-b34e-95b6de97f9e8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Minami-su_Amara-o1-7B-Qwen/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Amara-o1-7B-Qwen", - "id": "Minami-su/Amara-o1-7B-Qwen", - "developer": "Minami-su", - "inference_platform": "unknown", - 
"additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.739 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5199 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5181 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2936 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4007 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4083 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Minami-su/Amara-o2-7B-Qwen/cae1adaf-e424-4dcd-943b-5bbb708aca57.json b/data/hfopenllm_v2/Minami-su/Amara-o2-7B-Qwen/cae1adaf-e424-4dcd-943b-5bbb708aca57.json deleted file mode 100644 index d8fef8df3..000000000 --- a/data/hfopenllm_v2/Minami-su/Amara-o2-7B-Qwen/cae1adaf-e424-4dcd-943b-5bbb708aca57.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Minami-su_Amara-o2-7B-Qwen/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Amara-o2-7B-Qwen", - "id": "Minami-su/Amara-o2-7B-Qwen", - "developer": "Minami-su", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - 
"hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7147 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5173 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4086 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2634 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3781 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4165 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Minami-su/test-7B-00/969ac825-92f2-448c-899a-226e69dee377.json b/data/hfopenllm_v2/Minami-su/test-7B-00/969ac825-92f2-448c-899a-226e69dee377.json deleted file mode 100644 index 813aba2e0..000000000 --- a/data/hfopenllm_v2/Minami-su/test-7B-00/969ac825-92f2-448c-899a-226e69dee377.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Minami-su_test-7B-00/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "test-7B-00", - "id": "Minami-su/test-7B-00", - "developer": "Minami-su", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.669 - } - }, - { - "evaluation_name": "BBH", - "source_data": 
{ - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4466 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4517 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3029 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4126 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3588 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Minami-su/test-7B-01/e108ad28-c155-4162-852c-0f588a136bdc.json b/data/hfopenllm_v2/Minami-su/test-7B-01/e108ad28-c155-4162-852c-0f588a136bdc.json deleted file mode 100644 index 878626d0b..000000000 --- a/data/hfopenllm_v2/Minami-su/test-7B-01/e108ad28-c155-4162-852c-0f588a136bdc.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Minami-su_test-7B-01/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "test-7B-01", - "id": "Minami-su/test-7B-01", - "developer": "Minami-su", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6736 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 
0.4422 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4554 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.307 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4153 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3536 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Minami-su/test-v2-7B-00/93cfeba9-7d31-45b4-a6e2-99a5f318f5b3.json b/data/hfopenllm_v2/Minami-su/test-v2-7B-00/93cfeba9-7d31-45b4-a6e2-99a5f318f5b3.json deleted file mode 100644 index 0b0afc074..000000000 --- a/data/hfopenllm_v2/Minami-su/test-v2-7B-00/93cfeba9-7d31-45b4-a6e2-99a5f318f5b3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Minami-su_test-v2-7B-00/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "test-v2-7B-00", - "id": "Minami-su/test-v2-7B-00", - "developer": "Minami-su", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6747 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4416 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4418 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2919 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4154 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3472 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ModelCloud/Llama-3.2-1B-Instruct-gptqmodel-4bit-vortex-v1/c1b16b84-9392-48f3-b483-0a9786925506.json b/data/hfopenllm_v2/ModelCloud/Llama-3.2-1B-Instruct-gptqmodel-4bit-vortex-v1/c1b16b84-9392-48f3-b483-0a9786925506.json deleted file mode 100644 index 0a18b9f48..000000000 --- a/data/hfopenllm_v2/ModelCloud/Llama-3.2-1B-Instruct-gptqmodel-4bit-vortex-v1/c1b16b84-9392-48f3-b483-0a9786925506.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ModelCloud_Llama-3.2-1B-Instruct-gptqmodel-4bit-vortex-v1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.2-1B-Instruct-gptqmodel-4bit-vortex-v1", - "id": "ModelCloud/Llama-3.2-1B-Instruct-gptqmodel-4bit-vortex-v1", - "developer": "ModelCloud", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 5.453 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5269 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3253 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.0604 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2534 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3249 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1764 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ModelSpace/GemmaX2-28-9B-v0.1/b0c6e08d-b426-49d5-8a66-ee3d70131b62.json b/data/hfopenllm_v2/ModelSpace/GemmaX2-28-9B-v0.1/b0c6e08d-b426-49d5-8a66-ee3d70131b62.json deleted file mode 100644 index 3ce7f3cf6..000000000 --- a/data/hfopenllm_v2/ModelSpace/GemmaX2-28-9B-v0.1/b0c6e08d-b426-49d5-8a66-ee3d70131b62.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ModelSpace_GemmaX2-28-9B-v0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "GemmaX2-28-9B-v0.1", - "id": "ModelSpace/GemmaX2-28-9B-v0.1", - "developer": "ModelSpace", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0039 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3687 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0272 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2768 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3537 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2231 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/MoonRide/Llama-3.2-3B-Khelavaster/6a6651a3-b34e-404d-ac25-42c151fb9ba3.json b/data/hfopenllm_v2/MoonRide/Llama-3.2-3B-Khelavaster/6a6651a3-b34e-404d-ac25-42c151fb9ba3.json deleted file mode 100644 index 7575cd4c1..000000000 --- a/data/hfopenllm_v2/MoonRide/Llama-3.2-3B-Khelavaster/6a6651a3-b34e-404d-ac25-42c151fb9ba3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/MoonRide_Llama-3.2-3B-Khelavaster/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.2-3B-Khelavaster", - "id": "MoonRide/Llama-3.2-3B-Khelavaster", - "developer": "MoonRide", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.607 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4925 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4516 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1616 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2777 - } - }, - { - "evaluation_name": 
"MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3699 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3122 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Mostafa8Mehrabi/llama-3.2-1b-Insomnia-ChatBot-merged/da63b789-5571-4ed8-976e-146d385b18e2.json b/data/hfopenllm_v2/Mostafa8Mehrabi/llama-3.2-1b-Insomnia-ChatBot-merged/da63b789-5571-4ed8-976e-146d385b18e2.json deleted file mode 100644 index 47726cb82..000000000 --- a/data/hfopenllm_v2/Mostafa8Mehrabi/llama-3.2-1b-Insomnia-ChatBot-merged/da63b789-5571-4ed8-976e-146d385b18e2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Mostafa8Mehrabi_llama-3.2-1b-Insomnia-ChatBot-merged/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "llama-3.2-1b-Insomnia-ChatBot-merged", - "id": "Mostafa8Mehrabi/llama-3.2-1b-Insomnia-ChatBot-merged", - "developer": "Mostafa8Mehrabi", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.236 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1321 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3004 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0076 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2366 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": 
"TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3382 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1131 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/MrRobotoAI/MrRoboto-ProLong-8b-v4i/87b900e7-3bab-4e60-b0ef-349667cb2656.json b/data/hfopenllm_v2/MrRobotoAI/MrRoboto-ProLong-8b-v4i/87b900e7-3bab-4e60-b0ef-349667cb2656.json deleted file mode 100644 index 87434c005..000000000 --- a/data/hfopenllm_v2/MrRobotoAI/MrRoboto-ProLong-8b-v4i/87b900e7-3bab-4e60-b0ef-349667cb2656.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/MrRobotoAI_MrRoboto-ProLong-8b-v4i/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MrRoboto-ProLong-8b-v4i", - "id": "MrRobotoAI/MrRoboto-ProLong-8b-v4i", - "developer": "MrRobotoAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 4.015 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3835 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4585 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0551 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2894 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.4014 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3068 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/MrRobotoAI/MrRoboto-ProLongBASE-pt8-unaligned-8b/c9fd4740-4990-4174-b782-9b63c34d6407.json b/data/hfopenllm_v2/MrRobotoAI/MrRoboto-ProLongBASE-pt8-unaligned-8b/c9fd4740-4990-4174-b782-9b63c34d6407.json deleted file mode 100644 index 91f97c6f4..000000000 --- a/data/hfopenllm_v2/MrRobotoAI/MrRoboto-ProLongBASE-pt8-unaligned-8b/c9fd4740-4990-4174-b782-9b63c34d6407.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/MrRobotoAI_MrRoboto-ProLongBASE-pt8-unaligned-8b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MrRoboto-ProLongBASE-pt8-unaligned-8b", - "id": "MrRobotoAI/MrRoboto-ProLongBASE-pt8-unaligned-8b", - "developer": "MrRobotoAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 4.015 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3475 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4515 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0423 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.281 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4279 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": 
"hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2566 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/MultivexAI/Gladiator-Mini-Exp-1211-3B/2582a049-e940-408b-b2d9-7a7bdf470e49.json b/data/hfopenllm_v2/MultivexAI/Gladiator-Mini-Exp-1211-3B/2582a049-e940-408b-b2d9-7a7bdf470e49.json deleted file mode 100644 index 5001bc314..000000000 --- a/data/hfopenllm_v2/MultivexAI/Gladiator-Mini-Exp-1211-3B/2582a049-e940-408b-b2d9-7a7bdf470e49.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/MultivexAI_Gladiator-Mini-Exp-1211-3B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gladiator-Mini-Exp-1211-3B", - "id": "MultivexAI/Gladiator-Mini-Exp-1211-3B", - "developer": "MultivexAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6876 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4484 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1375 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2727 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.326 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 
0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3152 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/MultivexAI/Gladiator-Mini-Exp-1221-3B-Instruct-V2/99310118-d2ec-4647-85db-fcc22aee9161.json b/data/hfopenllm_v2/MultivexAI/Gladiator-Mini-Exp-1221-3B-Instruct-V2/99310118-d2ec-4647-85db-fcc22aee9161.json deleted file mode 100644 index 22f18ce11..000000000 --- a/data/hfopenllm_v2/MultivexAI/Gladiator-Mini-Exp-1221-3B-Instruct-V2/99310118-d2ec-4647-85db-fcc22aee9161.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/MultivexAI_Gladiator-Mini-Exp-1221-3B-Instruct-V2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gladiator-Mini-Exp-1221-3B-Instruct-V2", - "id": "MultivexAI/Gladiator-Mini-Exp-1221-3B-Instruct-V2", - "developer": "MultivexAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6215 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4389 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1412 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2634 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3008 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3025 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/MultivexAI/Gladiator-Mini-Exp-1221-3B-Instruct/bedd12e4-da18-4ca6-ba51-6d13e1c80bae.json b/data/hfopenllm_v2/MultivexAI/Gladiator-Mini-Exp-1221-3B-Instruct/bedd12e4-da18-4ca6-ba51-6d13e1c80bae.json deleted file mode 100644 index b849a1f53..000000000 --- a/data/hfopenllm_v2/MultivexAI/Gladiator-Mini-Exp-1221-3B-Instruct/bedd12e4-da18-4ca6-ba51-6d13e1c80bae.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/MultivexAI_Gladiator-Mini-Exp-1221-3B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gladiator-Mini-Exp-1221-3B-Instruct", - "id": "MultivexAI/Gladiator-Mini-Exp-1221-3B-Instruct", - "developer": "MultivexAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6079 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.437 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1352 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2634 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3115 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3049 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/MultivexAI/Gladiator-Mini-Exp-1222-3B-Instruct/6767e14a-bbfa-4a0d-8120-1f48a565474e.json 
b/data/hfopenllm_v2/MultivexAI/Gladiator-Mini-Exp-1222-3B-Instruct/6767e14a-bbfa-4a0d-8120-1f48a565474e.json deleted file mode 100644 index 37852b3f6..000000000 --- a/data/hfopenllm_v2/MultivexAI/Gladiator-Mini-Exp-1222-3B-Instruct/6767e14a-bbfa-4a0d-8120-1f48a565474e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/MultivexAI_Gladiator-Mini-Exp-1222-3B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gladiator-Mini-Exp-1222-3B-Instruct", - "id": "MultivexAI/Gladiator-Mini-Exp-1222-3B-Instruct", - "developer": "MultivexAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6163 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4373 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1412 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2634 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3128 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3017 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/MultivexAI/Phi-3.5-Mini-Instruct-MultiVex-v0.25-GGUF/70260aac-1bbf-4913-9dcc-58633d055314.json b/data/hfopenllm_v2/MultivexAI/Phi-3.5-Mini-Instruct-MultiVex-v0.25-GGUF/70260aac-1bbf-4913-9dcc-58633d055314.json deleted file mode 100644 
index f8ab724bd..000000000 --- a/data/hfopenllm_v2/MultivexAI/Phi-3.5-Mini-Instruct-MultiVex-v0.25-GGUF/70260aac-1bbf-4913-9dcc-58633d055314.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/MultivexAI_Phi-3.5-Mini-Instruct-MultiVex-v0.25-GGUF/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Phi-3.5-Mini-Instruct-MultiVex-v0.25-GGUF", - "id": "MultivexAI/Phi-3.5-Mini-Instruct-MultiVex-v0.25-GGUF", - "developer": "MultivexAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.821 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.144 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2908 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.006 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.255 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3642 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1109 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Mxode/NanoLM-0.3B-Instruct-v1.1/fba6e1a2-c197-4731-91ea-f6d059ba8b16.json b/data/hfopenllm_v2/Mxode/NanoLM-0.3B-Instruct-v1.1/fba6e1a2-c197-4731-91ea-f6d059ba8b16.json deleted file mode 100644 index 9fe318e68..000000000 --- a/data/hfopenllm_v2/Mxode/NanoLM-0.3B-Instruct-v1.1/fba6e1a2-c197-4731-91ea-f6d059ba8b16.json +++ /dev/null @@ -1,132 +0,0 
@@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Mxode_NanoLM-0.3B-Instruct-v1.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "NanoLM-0.3B-Instruct-v1.1", - "id": "Mxode/NanoLM-0.3B-Instruct-v1.1", - "developer": "Mxode", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.315 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1783 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3014 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0136 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.25 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4273 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1121 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Mxode/NanoLM-0.3B-Instruct-v1/22e74d0c-70d6-43c5-be4d-62842d93fedf.json b/data/hfopenllm_v2/Mxode/NanoLM-0.3B-Instruct-v1/22e74d0c-70d6-43c5-be4d-62842d93fedf.json deleted file mode 100644 index fd4f80330..000000000 --- a/data/hfopenllm_v2/Mxode/NanoLM-0.3B-Instruct-v1/22e74d0c-70d6-43c5-be4d-62842d93fedf.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Mxode_NanoLM-0.3B-Instruct-v1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": 
"documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "NanoLM-0.3B-Instruct-v1", - "id": "Mxode/NanoLM-0.3B-Instruct-v1", - "developer": "Mxode", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.315 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1537 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3028 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0144 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2718 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4155 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1105 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Mxode/NanoLM-0.3B-Instruct-v2/f7c33065-1da1-4da4-81c7-f2c9307b6e9b.json b/data/hfopenllm_v2/Mxode/NanoLM-0.3B-Instruct-v2/f7c33065-1da1-4da4-81c7-f2c9307b6e9b.json deleted file mode 100644 index f92c110e8..000000000 --- a/data/hfopenllm_v2/Mxode/NanoLM-0.3B-Instruct-v2/f7c33065-1da1-4da4-81c7-f2c9307b6e9b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Mxode_NanoLM-0.3B-Instruct-v2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "NanoLM-0.3B-Instruct-v2", - "id": "Mxode/NanoLM-0.3B-Instruct-v2", - "developer": "Mxode", - 
"inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.315 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1668 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2921 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0068 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2609 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3955 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1134 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Mxode/NanoLM-1B-Instruct-v1.1/ecdb4661-426a-46be-aefc-7e04483cebc0.json b/data/hfopenllm_v2/Mxode/NanoLM-1B-Instruct-v1.1/ecdb4661-426a-46be-aefc-7e04483cebc0.json deleted file mode 100644 index a4f75b709..000000000 --- a/data/hfopenllm_v2/Mxode/NanoLM-1B-Instruct-v1.1/ecdb4661-426a-46be-aefc-7e04483cebc0.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Mxode_NanoLM-1B-Instruct-v1.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "NanoLM-1B-Instruct-v1.1", - "id": "Mxode/NanoLM-1B-Instruct-v1.1", - "developer": "Mxode", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.076 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - 
"dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2395 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3184 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0363 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2634 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3433 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1215 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Mxode/NanoLM-1B-Instruct-v2/236976b3-af46-45ac-a8a5-f5897e3468a1.json b/data/hfopenllm_v2/Mxode/NanoLM-1B-Instruct-v2/236976b3-af46-45ac-a8a5-f5897e3468a1.json deleted file mode 100644 index 3c6004de5..000000000 --- a/data/hfopenllm_v2/Mxode/NanoLM-1B-Instruct-v2/236976b3-af46-45ac-a8a5-f5897e3468a1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Mxode_NanoLM-1B-Instruct-v2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "NanoLM-1B-Instruct-v2", - "id": "Mxode/NanoLM-1B-Instruct-v2", - "developer": "Mxode", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.076 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 
- }, - "score_details": { - "score": 0.263 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3123 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0415 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2634 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3552 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1238 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/NAPS-ai/naps-gemma-2-27b-v-0.1.0/fd175296-a5f6-4914-80e9-b8b75bc659de.json b/data/hfopenllm_v2/NAPS-ai/naps-gemma-2-27b-v-0.1.0/fd175296-a5f6-4914-80e9-b8b75bc659de.json deleted file mode 100644 index 55309810d..000000000 --- a/data/hfopenllm_v2/NAPS-ai/naps-gemma-2-27b-v-0.1.0/fd175296-a5f6-4914-80e9-b8b75bc659de.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/NAPS-ai_naps-gemma-2-27b-v-0.1.0/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "naps-gemma-2-27b-v-0.1.0", - "id": "NAPS-ai/naps-gemma-2-27b-v-0.1.0", - "developer": "NAPS-ai", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 27.227 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2912 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2601 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3575 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1168 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/NAPS-ai/naps-gemma-2-27b-v0.1.0/d910bbaa-d55c-4b00-9320-856a8a6713c0.json b/data/hfopenllm_v2/NAPS-ai/naps-gemma-2-27b-v0.1.0/d910bbaa-d55c-4b00-9320-856a8a6713c0.json deleted file mode 100644 index 8fed04ca8..000000000 --- a/data/hfopenllm_v2/NAPS-ai/naps-gemma-2-27b-v0.1.0/d910bbaa-d55c-4b00-9320-856a8a6713c0.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/NAPS-ai_naps-gemma-2-27b-v0.1.0/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "naps-gemma-2-27b-v0.1.0", - "id": "NAPS-ai/naps-gemma-2-27b-v0.1.0", - "developer": "NAPS-ai", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 27.227 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2912 - } - }, - { - "evaluation_name": "MATH Level 5", - 
"source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2601 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3575 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1168 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/NAPS-ai/naps-llama-3_1-8b-instruct-v0.3/99a5f123-5d2e-469b-884e-c9a64c6bc197.json b/data/hfopenllm_v2/NAPS-ai/naps-llama-3_1-8b-instruct-v0.3/99a5f123-5d2e-469b-884e-c9a64c6bc197.json deleted file mode 100644 index 027340ffc..000000000 --- a/data/hfopenllm_v2/NAPS-ai/naps-llama-3_1-8b-instruct-v0.3/99a5f123-5d2e-469b-884e-c9a64c6bc197.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/NAPS-ai_naps-llama-3_1-8b-instruct-v0.3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "naps-llama-3_1-8b-instruct-v0.3", - "id": "NAPS-ai/naps-llama-3_1-8b-instruct-v0.3", - "developer": "NAPS-ai", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5391 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4901 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact 
Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1903 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2995 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3787 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3398 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/NAPS-ai/naps-llama-3_1-8b-instruct-v0.4/ed17a715-f0ae-461c-9618-ac952c450ec5.json b/data/hfopenllm_v2/NAPS-ai/naps-llama-3_1-8b-instruct-v0.4/ed17a715-f0ae-461c-9618-ac952c450ec5.json deleted file mode 100644 index 778af4ae2..000000000 --- a/data/hfopenllm_v2/NAPS-ai/naps-llama-3_1-8b-instruct-v0.4/ed17a715-f0ae-461c-9618-ac952c450ec5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/NAPS-ai_naps-llama-3_1-8b-instruct-v0.4/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "naps-llama-3_1-8b-instruct-v0.4", - "id": "NAPS-ai/naps-llama-3_1-8b-instruct-v0.4", - "developer": "NAPS-ai", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7344 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4862 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1964 - } - }, - { - 
"evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2794 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4421 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3475 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/NAPS-ai/naps-llama-3_1-instruct-v0.5.0/3dd2a474-9ea8-4e26-8986-5bcc67c78c39.json b/data/hfopenllm_v2/NAPS-ai/naps-llama-3_1-instruct-v0.5.0/3dd2a474-9ea8-4e26-8986-5bcc67c78c39.json deleted file mode 100644 index 3ae07feca..000000000 --- a/data/hfopenllm_v2/NAPS-ai/naps-llama-3_1-instruct-v0.5.0/3dd2a474-9ea8-4e26-8986-5bcc67c78c39.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/NAPS-ai_naps-llama-3_1-instruct-v0.5.0/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "naps-llama-3_1-instruct-v0.5.0", - "id": "NAPS-ai/naps-llama-3_1-instruct-v0.5.0", - "developer": "NAPS-ai", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.502 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4148 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0363 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": 
"Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2685 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3713 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2614 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/NAPS-ai/naps-llama-3_1_instruct-v0.6.0/b39e14a6-c05f-4e88-b2d4-63a199aa61a1.json b/data/hfopenllm_v2/NAPS-ai/naps-llama-3_1_instruct-v0.6.0/b39e14a6-c05f-4e88-b2d4-63a199aa61a1.json deleted file mode 100644 index 5d319d532..000000000 --- a/data/hfopenllm_v2/NAPS-ai/naps-llama-3_1_instruct-v0.6.0/b39e14a6-c05f-4e88-b2d4-63a199aa61a1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/NAPS-ai_naps-llama-3_1_instruct-v0.6.0/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "naps-llama-3_1_instruct-v0.6.0", - "id": "NAPS-ai/naps-llama-3_1_instruct-v0.6.0", - "developer": "NAPS-ai", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.328 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4528 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0642 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2819 - } - }, - { - "evaluation_name": 
"MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3739 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3241 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/NAPS-ai/naps-llama3.1-70B-v0.2-fp16/39893637-552a-48d8-9b83-433415eb26c3.json b/data/hfopenllm_v2/NAPS-ai/naps-llama3.1-70B-v0.2-fp16/39893637-552a-48d8-9b83-433415eb26c3.json deleted file mode 100644 index fa9e2a0a2..000000000 --- a/data/hfopenllm_v2/NAPS-ai/naps-llama3.1-70B-v0.2-fp16/39893637-552a-48d8-9b83-433415eb26c3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/NAPS-ai_naps-llama3.1-70B-v0.2-fp16/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "naps-llama3.1-70B-v0.2-fp16", - "id": "NAPS-ai/naps-llama3.1-70B-v0.2-fp16", - "developer": "NAPS-ai", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 70.761 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1845 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3041 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2391 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, 
- "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3486 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1099 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/NCSOFT/Llama-VARCO-8B-Instruct/f9549713-f487-4e26-bfeb-ec6d394b7014.json b/data/hfopenllm_v2/NCSOFT/Llama-VARCO-8B-Instruct/f9549713-f487-4e26-bfeb-ec6d394b7014.json deleted file mode 100644 index 515a08f92..000000000 --- a/data/hfopenllm_v2/NCSOFT/Llama-VARCO-8B-Instruct/f9549713-f487-4e26-bfeb-ec6d394b7014.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/NCSOFT_Llama-VARCO-8B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-VARCO-8B-Instruct", - "id": "NCSOFT/Llama-VARCO-8B-Instruct", - "developer": "NCSOFT", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.447 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5023 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1065 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.297 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3841 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - 
"hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.319 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/NJS26/NJS_777/02579c41-f117-4412-9c00-ee7db3e9ab97.json b/data/hfopenllm_v2/NJS26/NJS_777/02579c41-f117-4412-9c00-ee7db3e9ab97.json deleted file mode 100644 index bf0dad5d2..000000000 --- a/data/hfopenllm_v2/NJS26/NJS_777/02579c41-f117-4412-9c00-ee7db3e9ab97.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/NJS26_NJS_777/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "NJS_777", - "id": "NJS26/NJS_777", - "developer": "NJS26", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 10.362 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1881 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2178 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2064 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3538 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1163 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/NLPark/AnFeng_v3.1-Avocet/bfa1d761-00aa-4438-a5de-972d934c63d5.json b/data/hfopenllm_v2/NLPark/AnFeng_v3.1-Avocet/bfa1d761-00aa-4438-a5de-972d934c63d5.json deleted file mode 100644 index d4c3c3a62..000000000 --- a/data/hfopenllm_v2/NLPark/AnFeng_v3.1-Avocet/bfa1d761-00aa-4438-a5de-972d934c63d5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/NLPark_AnFeng_v3.1-Avocet/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "AnFeng_v3.1-Avocet", - "id": "NLPark/AnFeng_v3.1-Avocet", - "developer": "NLPark", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 34.393 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5096 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5829 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1594 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3247 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4476 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4438 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/NLPark/B-and-W_Flycatcher-3AD1E/20a84d88-05c2-4e02-8c84-2afa84cc659f.json b/data/hfopenllm_v2/NLPark/B-and-W_Flycatcher-3AD1E/20a84d88-05c2-4e02-8c84-2afa84cc659f.json deleted file mode 100644 index 17941cc70..000000000 --- 
a/data/hfopenllm_v2/NLPark/B-and-W_Flycatcher-3AD1E/20a84d88-05c2-4e02-8c84-2afa84cc659f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/NLPark_B-and-W_Flycatcher-3AD1E/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "B-and-W_Flycatcher-3AD1E", - "id": "NLPark/B-and-W_Flycatcher-3AD1E", - "developer": "NLPark", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4908 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6065 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2379 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3305 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4423 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4741 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/NLPark/Shi-Ci-Robin-Test_3AD80/84eedce3-3a93-4630-b914-aa281fd2efda.json b/data/hfopenllm_v2/NLPark/Shi-Ci-Robin-Test_3AD80/84eedce3-3a93-4630-b914-aa281fd2efda.json deleted file mode 100644 index b7883ce1e..000000000 --- a/data/hfopenllm_v2/NLPark/Shi-Ci-Robin-Test_3AD80/84eedce3-3a93-4630-b914-aa281fd2efda.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/NLPark_Shi-Ci-Robin-Test_3AD80/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Shi-Ci-Robin-Test_3AD80", - "id": "NLPark/Shi-Ci-Robin-Test_3AD80", - "developer": "NLPark", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 70.554 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7227 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6705 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3157 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3599 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4696 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5121 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/NTQAI/NxMobileLM-1.5B-SFT/b3b7b62f-ac82-4ef9-9634-afb81645ec19.json b/data/hfopenllm_v2/NTQAI/NxMobileLM-1.5B-SFT/b3b7b62f-ac82-4ef9-9634-afb81645ec19.json deleted file mode 100644 index ddf2ce282..000000000 --- a/data/hfopenllm_v2/NTQAI/NxMobileLM-1.5B-SFT/b3b7b62f-ac82-4ef9-9634-afb81645ec19.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/NTQAI_NxMobileLM-1.5B-SFT/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - 
"evaluator_relationship": "third_party" - }, - "model_info": { - "name": "NxMobileLM-1.5B-SFT", - "id": "NTQAI/NxMobileLM-1.5B-SFT", - "developer": "NTQAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.544 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6392 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3957 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0846 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2592 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3555 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2817 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/NTQAI/Nxcode-CQ-7B-orpo/283c5166-b9c5-4d20-9653-0cd0346d87c1.json b/data/hfopenllm_v2/NTQAI/Nxcode-CQ-7B-orpo/283c5166-b9c5-4d20-9653-0cd0346d87c1.json deleted file mode 100644 index 61343896e..000000000 --- a/data/hfopenllm_v2/NTQAI/Nxcode-CQ-7B-orpo/283c5166-b9c5-4d20-9653-0cd0346d87c1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/NTQAI_Nxcode-CQ-7B-orpo/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Nxcode-CQ-7B-orpo", - "id": "NTQAI/Nxcode-CQ-7B-orpo", - "developer": "NTQAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - 
"params_billions": 7.25 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4007 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4143 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0219 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2542 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.394 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1612 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/NYTK/PULI-GPTrio/478b54cd-6410-41e5-8a53-4e46bcd9d7af.json b/data/hfopenllm_v2/NYTK/PULI-GPTrio/478b54cd-6410-41e5-8a53-4e46bcd9d7af.json deleted file mode 100644 index 513daf9dc..000000000 --- a/data/hfopenllm_v2/NYTK/PULI-GPTrio/478b54cd-6410-41e5-8a53-4e46bcd9d7af.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/NYTK_PULI-GPTrio/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "PULI-GPTrio", - "id": "NYTK/PULI-GPTrio", - "developer": "NYTK", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "GPTNeoXForCausalLM", - "params_billions": 7.673 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.218 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.306 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0121 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2659 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3819 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1137 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/NYTK/PULI-LlumiX-32K/de2ae7a9-93eb-4149-b3ff-b5b7dfba29c4.json b/data/hfopenllm_v2/NYTK/PULI-LlumiX-32K/de2ae7a9-93eb-4149-b3ff-b5b7dfba29c4.json deleted file mode 100644 index 373fe82fa..000000000 --- a/data/hfopenllm_v2/NYTK/PULI-LlumiX-32K/de2ae7a9-93eb-4149-b3ff-b5b7dfba29c4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/NYTK_PULI-LlumiX-32K/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "PULI-LlumiX-32K", - "id": "NYTK/PULI-LlumiX-32K", - "developer": "NYTK", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 6.738 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.17 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": 
"Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3189 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0128 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2534 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3964 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1681 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Naveenpoliasetty/llama3-8B-V2/ef5aa9db-804b-4a53-9c22-9c99f6c69eeb.json b/data/hfopenllm_v2/Naveenpoliasetty/llama3-8B-V2/ef5aa9db-804b-4a53-9c22-9c99f6c69eeb.json deleted file mode 100644 index 6dbee699d..000000000 --- a/data/hfopenllm_v2/Naveenpoliasetty/llama3-8B-V2/ef5aa9db-804b-4a53-9c22-9c99f6c69eeb.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Naveenpoliasetty_llama3-8B-V2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "llama3-8B-V2", - "id": "Naveenpoliasetty/llama3-8B-V2", - "developer": "Naveenpoliasetty", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4123 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5189 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": 
"MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0785 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2903 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4081 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3738 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/NbAiLab/nb-llama-3.1-8B-Instruct/553fd36d-08dd-46a3-ab04-77b9039e7921.json b/data/hfopenllm_v2/NbAiLab/nb-llama-3.1-8B-Instruct/553fd36d-08dd-46a3-ab04-77b9039e7921.json deleted file mode 100644 index b291ca255..000000000 --- a/data/hfopenllm_v2/NbAiLab/nb-llama-3.1-8B-Instruct/553fd36d-08dd-46a3-ab04-77b9039e7921.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/NbAiLab_nb-llama-3.1-8B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "nb-llama-3.1-8B-Instruct", - "id": "NbAiLab/nb-llama-3.1-8B-Instruct", - "developer": "NbAiLab", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3625 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3247 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0227 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2735 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3208 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1197 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/NbAiLab/nb-llama-3.1-8B-sft/e2bae853-cc0f-456a-a635-98d5f87ac47c.json b/data/hfopenllm_v2/NbAiLab/nb-llama-3.1-8B-sft/e2bae853-cc0f-456a-a635-98d5f87ac47c.json deleted file mode 100644 index c5be98399..000000000 --- a/data/hfopenllm_v2/NbAiLab/nb-llama-3.1-8B-sft/e2bae853-cc0f-456a-a635-98d5f87ac47c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/NbAiLab_nb-llama-3.1-8B-sft/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "nb-llama-3.1-8B-sft", - "id": "NbAiLab/nb-llama-3.1-8B-sft", - "developer": "NbAiLab", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3616 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3282 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0219 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2542 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3287 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1222 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Nekochu/Llama-3.1-8B-German-ORPO/d6c5f196-c97b-4a0a-81b0-59143ec4b10e.json b/data/hfopenllm_v2/Nekochu/Llama-3.1-8B-German-ORPO/d6c5f196-c97b-4a0a-81b0-59143ec4b10e.json deleted file mode 100644 index 95ea21ef4..000000000 --- a/data/hfopenllm_v2/Nekochu/Llama-3.1-8B-German-ORPO/d6c5f196-c97b-4a0a-81b0-59143ec4b10e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Nekochu_Llama-3.1-8B-German-ORPO/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.1-8B-German-ORPO", - "id": "Nekochu/Llama-3.1-8B-German-ORPO", - "developer": "Nekochu", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4611 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4983 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1171 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3163 - } - }, - { - 
"evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4647 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3393 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Nekochu/Llama-3.1-8B-french-DPO/5d92e02f-b590-4b6b-8c64-30690f79e916.json b/data/hfopenllm_v2/Nekochu/Llama-3.1-8B-french-DPO/5d92e02f-b590-4b6b-8c64-30690f79e916.json deleted file mode 100644 index ccbbfa608..000000000 --- a/data/hfopenllm_v2/Nekochu/Llama-3.1-8B-french-DPO/5d92e02f-b590-4b6b-8c64-30690f79e916.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Nekochu_Llama-3.1-8B-french-DPO/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.1-8B-french-DPO", - "id": "Nekochu/Llama-3.1-8B-french-DPO", - "developer": "Nekochu", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4656 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5111 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0974 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2911 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4216 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3414 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Nekochu/Luminia-13B-v3/e10f38df-b5d5-47c6-924f-563c6f8a6616.json b/data/hfopenllm_v2/Nekochu/Luminia-13B-v3/e10f38df-b5d5-47c6-924f-563c6f8a6616.json deleted file mode 100644 index 5012f1af5..000000000 --- a/data/hfopenllm_v2/Nekochu/Luminia-13B-v3/e10f38df-b5d5-47c6-924f-563c6f8a6616.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Nekochu_Luminia-13B-v3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Luminia-13B-v3", - "id": "Nekochu/Luminia-13B-v3", - "developer": "Nekochu", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 13.016 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2523 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4112 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0181 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2701 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3983 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2215 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Nekochu/Luminia-8B-RP/27257dc9-750c-4673-8865-986434bc5c0e.json b/data/hfopenllm_v2/Nekochu/Luminia-8B-RP/27257dc9-750c-4673-8865-986434bc5c0e.json deleted file mode 100644 index 1ed86080e..000000000 --- a/data/hfopenllm_v2/Nekochu/Luminia-8B-RP/27257dc9-750c-4673-8865-986434bc5c0e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Nekochu_Luminia-8B-RP/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Luminia-8B-RP", - "id": "Nekochu/Luminia-8B-RP", - "developer": "Nekochu", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5574 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5218 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.136 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.297 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3998 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3631 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/NeverSleep/Lumimaid-v0.2-12B/e599f3f8-e5eb-4bfe-a102-efc5a967434d.json b/data/hfopenllm_v2/NeverSleep/Lumimaid-v0.2-12B/e599f3f8-e5eb-4bfe-a102-efc5a967434d.json deleted file mode 100644 index 69ad8d773..000000000 --- a/data/hfopenllm_v2/NeverSleep/Lumimaid-v0.2-12B/e599f3f8-e5eb-4bfe-a102-efc5a967434d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/NeverSleep_Lumimaid-v0.2-12B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Lumimaid-v0.2-12B", - "id": "NeverSleep/Lumimaid-v0.2-12B", - "developer": "NeverSleep", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1099 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5396 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0566 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3146 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4821 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3511 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/NeverSleep/Lumimaid-v0.2-8B/8e56f2dd-49d0-4eff-beea-53d01cd96f0e.json b/data/hfopenllm_v2/NeverSleep/Lumimaid-v0.2-8B/8e56f2dd-49d0-4eff-beea-53d01cd96f0e.json deleted file mode 100644 index e40a35f70..000000000 --- 
a/data/hfopenllm_v2/NeverSleep/Lumimaid-v0.2-8B/8e56f2dd-49d0-4eff-beea-53d01cd96f0e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/NeverSleep_Lumimaid-v0.2-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Lumimaid-v0.2-8B", - "id": "NeverSleep/Lumimaid-v0.2-8B", - "developer": "NeverSleep", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5038 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5238 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1435 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3112 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4303 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3636 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Nexesenex/Dolphin3.0-Llama3.1-1B-abliterated/f1a2b5d0-2c8a-4bbc-8bc5-0484485c2dad.json b/data/hfopenllm_v2/Nexesenex/Dolphin3.0-Llama3.1-1B-abliterated/f1a2b5d0-2c8a-4bbc-8bc5-0484485c2dad.json deleted file mode 100644 index f3573a1fa..000000000 --- a/data/hfopenllm_v2/Nexesenex/Dolphin3.0-Llama3.1-1B-abliterated/f1a2b5d0-2c8a-4bbc-8bc5-0484485c2dad.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/Nexesenex_Dolphin3.0-Llama3.1-1B-abliterated/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Dolphin3.0-Llama3.1-1B-abliterated", - "id": "Nexesenex/Dolphin3.0-Llama3.1-1B-abliterated", - "developer": "Nexesenex", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.236 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5312 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3241 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0385 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2408 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3237 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1373 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_DeepDive_3_Prev_v1.0/2c12ee67-0c77-4cb2-9e88-1c731ed55c3f.json b/data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_DeepDive_3_Prev_v1.0/2c12ee67-0c77-4cb2-9e88-1c731ed55c3f.json deleted file mode 100644 index cd74ed8ee..000000000 --- a/data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_DeepDive_3_Prev_v1.0/2c12ee67-0c77-4cb2-9e88-1c731ed55c3f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Nexesenex_Llama_3.1_8b_DeepDive_3_Prev_v1.0/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - 
"source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama_3.1_8b_DeepDive_3_Prev_v1.0", - "id": "Nexesenex/Llama_3.1_8b_DeepDive_3_Prev_v1.0", - "developer": "Nexesenex", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6809 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5155 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1866 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2911 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3666 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3438 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_DeepDive_3_R1_Prev_v1.0/567f8f54-225f-4d9b-be06-f24091adc1e6.json b/data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_DeepDive_3_R1_Prev_v1.0/567f8f54-225f-4d9b-be06-f24091adc1e6.json deleted file mode 100644 index 7e171d5ad..000000000 --- a/data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_DeepDive_3_R1_Prev_v1.0/567f8f54-225f-4d9b-be06-f24091adc1e6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Nexesenex_Llama_3.1_8b_DeepDive_3_R1_Prev_v1.0/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": 
"third_party" - }, - "model_info": { - "name": "Llama_3.1_8b_DeepDive_3_R1_Prev_v1.0", - "id": "Nexesenex/Llama_3.1_8b_DeepDive_3_R1_Prev_v1.0", - "developer": "Nexesenex", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7101 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.512 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1926 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3003 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3758 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3441 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_DobHerWild_R1_v1.1R/ebb59730-9522-4c45-8f42-c0d941fd728c.json b/data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_DobHerWild_R1_v1.1R/ebb59730-9522-4c45-8f42-c0d941fd728c.json deleted file mode 100644 index 2ed4134e3..000000000 --- a/data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_DobHerWild_R1_v1.1R/ebb59730-9522-4c45-8f42-c0d941fd728c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Nexesenex_Llama_3.1_8b_DobHerWild_R1_v1.1R/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama_3.1_8b_DobHerWild_R1_v1.1R", - "id": "Nexesenex/Llama_3.1_8b_DobHerWild_R1_v1.1R", - "developer": 
"Nexesenex", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.76 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5257 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2319 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2995 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3852 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3688 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_DoberWild_v2.01/2c44fa8c-ebd3-4ea6-8578-61da38965c09.json b/data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_DoberWild_v2.01/2c44fa8c-ebd3-4ea6-8578-61da38965c09.json deleted file mode 100644 index d6626e33b..000000000 --- a/data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_DoberWild_v2.01/2c44fa8c-ebd3-4ea6-8578-61da38965c09.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Nexesenex_Llama_3.1_8b_DoberWild_v2.01/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama_3.1_8b_DoberWild_v2.01", - "id": "Nexesenex/Llama_3.1_8b_DoberWild_v2.01", - "developer": "Nexesenex", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.031 - } - }, - 
"evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7996 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5251 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2002 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3029 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4012 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3791 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_DoberWild_v2.02/3ef26b8c-6bfb-457b-a160-a65c3cc8b0c6.json b/data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_DoberWild_v2.02/3ef26b8c-6bfb-457b-a160-a65c3cc8b0c6.json deleted file mode 100644 index f57ee9701..000000000 --- a/data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_DoberWild_v2.02/3ef26b8c-6bfb-457b-a160-a65c3cc8b0c6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Nexesenex_Llama_3.1_8b_DoberWild_v2.02/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama_3.1_8b_DoberWild_v2.02", - "id": "Nexesenex/Llama_3.1_8b_DoberWild_v2.02", - "developer": "Nexesenex", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7746 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5313 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1994 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2945 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3946 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3764 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_DoberWild_v2.03/0ab721ba-fbda-44ca-a349-1d3abfaabe62.json b/data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_DoberWild_v2.03/0ab721ba-fbda-44ca-a349-1d3abfaabe62.json deleted file mode 100644 index 7306b4374..000000000 --- a/data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_DoberWild_v2.03/0ab721ba-fbda-44ca-a349-1d3abfaabe62.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Nexesenex_Llama_3.1_8b_DoberWild_v2.03/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama_3.1_8b_DoberWild_v2.03", - "id": "Nexesenex/Llama_3.1_8b_DoberWild_v2.03", - "developer": "Nexesenex", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7764 - } - 
}, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5294 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2077 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3045 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3906 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3722 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_DodoWild_v2.01/2fea1128-4f0c-40d8-be87-72c42c0648fb.json b/data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_DodoWild_v2.01/2fea1128-4f0c-40d8-be87-72c42c0648fb.json deleted file mode 100644 index a603278e4..000000000 --- a/data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_DodoWild_v2.01/2fea1128-4f0c-40d8-be87-72c42c0648fb.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Nexesenex_Llama_3.1_8b_DodoWild_v2.01/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama_3.1_8b_DodoWild_v2.01", - "id": "Nexesenex/Llama_3.1_8b_DodoWild_v2.01", - "developer": "Nexesenex", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.031 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7978 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": 
"Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5253 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1986 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3037 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.409 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3738 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_DodoWild_v2.02/db9dc9d2-4aa2-43d0-9f2e-15fbd05af62c.json b/data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_DodoWild_v2.02/db9dc9d2-4aa2-43d0-9f2e-15fbd05af62c.json deleted file mode 100644 index afa87e2bb..000000000 --- a/data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_DodoWild_v2.02/db9dc9d2-4aa2-43d0-9f2e-15fbd05af62c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Nexesenex_Llama_3.1_8b_DodoWild_v2.02/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama_3.1_8b_DodoWild_v2.02", - "id": "Nexesenex/Llama_3.1_8b_DodoWild_v2.02", - "developer": "Nexesenex", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8017 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5262 - } - }, - { - "evaluation_name": "MATH 
Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2273 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3045 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3971 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3761 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_DodoWild_v2.03/28399fd0-840c-49d3-8179-407ed83d3bfc.json b/data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_DodoWild_v2.03/28399fd0-840c-49d3-8179-407ed83d3bfc.json deleted file mode 100644 index 7503f6ebc..000000000 --- a/data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_DodoWild_v2.03/28399fd0-840c-49d3-8179-407ed83d3bfc.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Nexesenex_Llama_3.1_8b_DodoWild_v2.03/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama_3.1_8b_DodoWild_v2.03", - "id": "Nexesenex/Llama_3.1_8b_DodoWild_v2.03", - "developer": "Nexesenex", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7941 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5308 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": 
"Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2221 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3079 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3959 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3786 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_DodoWild_v2.10/d7108c13-e14a-4366-9a39-204f853b1bee.json b/data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_DodoWild_v2.10/d7108c13-e14a-4366-9a39-204f853b1bee.json deleted file mode 100644 index cda2e5b7a..000000000 --- a/data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_DodoWild_v2.10/d7108c13-e14a-4366-9a39-204f853b1bee.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Nexesenex_Llama_3.1_8b_DodoWild_v2.10/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama_3.1_8b_DodoWild_v2.10", - "id": "Nexesenex/Llama_3.1_8b_DodoWild_v2.10", - "developer": "Nexesenex", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8054 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5278 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1971 - } - }, - { - 
"evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2961 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4157 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3855 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_Dolermed_R1_V1.01/56152d05-9273-4701-8c0a-723e2cab618d.json b/data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_Dolermed_R1_V1.01/56152d05-9273-4701-8c0a-723e2cab618d.json deleted file mode 100644 index 9d16585a7..000000000 --- a/data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_Dolermed_R1_V1.01/56152d05-9273-4701-8c0a-723e2cab618d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Nexesenex_Llama_3.1_8b_Dolermed_R1_V1.01/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama_3.1_8b_Dolermed_R1_V1.01", - "id": "Nexesenex/Llama_3.1_8b_Dolermed_R1_V1.01", - "developer": "Nexesenex", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7534 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5312 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2017 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3054 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3747 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3733 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_Dolermed_R1_V1.03/55d2f23d-cb6c-42d2-8b57-837451d3c6df.json b/data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_Dolermed_R1_V1.03/55d2f23d-cb6c-42d2-8b57-837451d3c6df.json deleted file mode 100644 index d30924818..000000000 --- a/data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_Dolermed_R1_V1.03/55d2f23d-cb6c-42d2-8b57-837451d3c6df.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Nexesenex_Llama_3.1_8b_Dolermed_R1_V1.03/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama_3.1_8b_Dolermed_R1_V1.03", - "id": "Nexesenex/Llama_3.1_8b_Dolermed_R1_V1.03", - "developer": "Nexesenex", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7564 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5316 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2092 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 
0.318 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.38 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.372 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_Dolermed_V1.01/7479ae87-e795-4e20-848a-291614176def.json b/data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_Dolermed_V1.01/7479ae87-e795-4e20-848a-291614176def.json deleted file mode 100644 index 3796fa305..000000000 --- a/data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_Dolermed_V1.01/7479ae87-e795-4e20-848a-291614176def.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Nexesenex_Llama_3.1_8b_Dolermed_V1.01/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama_3.1_8b_Dolermed_V1.01", - "id": "Nexesenex/Llama_3.1_8b_Dolermed_V1.01", - "developer": "Nexesenex", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.031 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5087 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5194 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1344 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2945 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3945 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.357 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_Dolerstormed_V1.04/04ceb40e-bde8-487b-9d29-dc8f681af9be.json b/data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_Dolerstormed_V1.04/04ceb40e-bde8-487b-9d29-dc8f681af9be.json deleted file mode 100644 index baf40f42c..000000000 --- a/data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_Dolerstormed_V1.04/04ceb40e-bde8-487b-9d29-dc8f681af9be.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Nexesenex_Llama_3.1_8b_Dolerstormed_V1.04/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama_3.1_8b_Dolerstormed_V1.04", - "id": "Nexesenex/Llama_3.1_8b_Dolerstormed_V1.04", - "developer": "Nexesenex", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7889 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5195 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1926 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3221 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.403 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3889 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_Hermedash_R1_V1.04/e26b00b0-d9df-4ce2-a649-b19f8957b8ce.json b/data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_Hermedash_R1_V1.04/e26b00b0-d9df-4ce2-a649-b19f8957b8ce.json deleted file mode 100644 index 59b222af2..000000000 --- a/data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_Hermedash_R1_V1.04/e26b00b0-d9df-4ce2-a649-b19f8957b8ce.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Nexesenex_Llama_3.1_8b_Hermedash_R1_V1.04/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama_3.1_8b_Hermedash_R1_V1.04", - "id": "Nexesenex/Llama_3.1_8b_Hermedash_R1_V1.04", - "developer": "Nexesenex", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7872 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5192 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1866 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.323 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4111 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - 
}, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3882 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_Hermedive_R1_V1.01/9954194c-69b5-4eb4-8b32-859845548cb0.json b/data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_Hermedive_R1_V1.01/9954194c-69b5-4eb4-8b32-859845548cb0.json deleted file mode 100644 index db5b6578a..000000000 --- a/data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_Hermedive_R1_V1.01/9954194c-69b5-4eb4-8b32-859845548cb0.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Nexesenex_Llama_3.1_8b_Hermedive_R1_V1.01/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama_3.1_8b_Hermedive_R1_V1.01", - "id": "Nexesenex/Llama_3.1_8b_Hermedive_R1_V1.01", - "developer": "Nexesenex", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5001 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5171 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1775 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2827 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4008 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - 
}, - "score_details": { - "score": 0.3427 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_Hermedive_R1_V1.03/2afbc279-242a-4276-85f0-facd29c2d89b.json b/data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_Hermedive_R1_V1.03/2afbc279-242a-4276-85f0-facd29c2d89b.json deleted file mode 100644 index 18a502512..000000000 --- a/data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_Hermedive_R1_V1.03/2afbc279-242a-4276-85f0-facd29c2d89b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Nexesenex_Llama_3.1_8b_Hermedive_R1_V1.03/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama_3.1_8b_Hermedive_R1_V1.03", - "id": "Nexesenex/Llama_3.1_8b_Hermedive_R1_V1.03", - "developer": "Nexesenex", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6648 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5141 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1858 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2978 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3613 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3488 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_Hermedive_V1.01/ba307ad4-3647-4785-9bf1-cd4dacf3c71f.json b/data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_Hermedive_V1.01/ba307ad4-3647-4785-9bf1-cd4dacf3c71f.json deleted file mode 100644 index fba2698e9..000000000 --- a/data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_Hermedive_V1.01/ba307ad4-3647-4785-9bf1-cd4dacf3c71f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Nexesenex_Llama_3.1_8b_Hermedive_V1.01/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama_3.1_8b_Hermedive_V1.01", - "id": "Nexesenex/Llama_3.1_8b_Hermedive_V1.01", - "developer": "Nexesenex", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.031 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5062 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4918 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1647 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2894 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3697 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3551 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_Mediver_V1.01/d03c73ca-7364-4517-aea4-f0ac564c49df.json 
b/data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_Mediver_V1.01/d03c73ca-7364-4517-aea4-f0ac564c49df.json deleted file mode 100644 index 9a356ff72..000000000 --- a/data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_Mediver_V1.01/d03c73ca-7364-4517-aea4-f0ac564c49df.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Nexesenex_Llama_3.1_8b_Mediver_V1.01/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama_3.1_8b_Mediver_V1.01", - "id": "Nexesenex/Llama_3.1_8b_Mediver_V1.01", - "developer": "Nexesenex", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.031 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1885 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4415 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0015 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2777 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3898 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2994 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_Medusa_v1.01/1dd4b82a-ca80-4c9c-8800-f97ab2b9cbe7.json b/data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_Medusa_v1.01/1dd4b82a-ca80-4c9c-8800-f97ab2b9cbe7.json deleted file mode 100644 index 08ab1831d..000000000 --- 
a/data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_Medusa_v1.01/1dd4b82a-ca80-4c9c-8800-f97ab2b9cbe7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Nexesenex_Llama_3.1_8b_Medusa_v1.01/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama_3.1_8b_Medusa_v1.01", - "id": "Nexesenex/Llama_3.1_8b_Medusa_v1.01", - "developer": "Nexesenex", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.031 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7685 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5018 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1465 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2919 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4067 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3531 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_Smarteaz_0.2_R1/f2363099-c39a-4874-bf77-ccc0fa087680.json b/data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_Smarteaz_0.2_R1/f2363099-c39a-4874-bf77-ccc0fa087680.json deleted file mode 100644 index 8f98cf96c..000000000 --- a/data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_Smarteaz_0.2_R1/f2363099-c39a-4874-bf77-ccc0fa087680.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/Nexesenex_Llama_3.1_8b_Smarteaz_0.2_R1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama_3.1_8b_Smarteaz_0.2_R1", - "id": "Nexesenex/Llama_3.1_8b_Smarteaz_0.2_R1", - "developer": "Nexesenex", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6346 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5113 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2606 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3003 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4188 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3645 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_Smarteaz_V1.01/596eeee8-3600-4f8a-8888-978b610eb2ca.json b/data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_Smarteaz_V1.01/596eeee8-3600-4f8a-8888-978b610eb2ca.json deleted file mode 100644 index c7e67d274..000000000 --- a/data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_Smarteaz_V1.01/596eeee8-3600-4f8a-8888-978b610eb2ca.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Nexesenex_Llama_3.1_8b_Smarteaz_V1.01/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": 
"documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama_3.1_8b_Smarteaz_V1.01", - "id": "Nexesenex/Llama_3.1_8b_Smarteaz_V1.01", - "developer": "Nexesenex", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8151 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5241 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2341 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3096 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3789 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3736 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_Stormeder_v1.04/595ddba1-c450-4b69-85b7-0e3118c8c6c7.json b/data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_Stormeder_v1.04/595ddba1-c450-4b69-85b7-0e3118c8c6c7.json deleted file mode 100644 index 0c3c8b8b1..000000000 --- a/data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_Stormeder_v1.04/595ddba1-c450-4b69-85b7-0e3118c8c6c7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Nexesenex_Llama_3.1_8b_Stormeder_v1.04/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama_3.1_8b_Stormeder_v1.04", - "id": 
"Nexesenex/Llama_3.1_8b_Stormeder_v1.04", - "developer": "Nexesenex", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7853 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5207 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.185 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3205 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3949 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3852 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_Typhoon_v1.03/64890314-bba0-4fb2-8c21-38b413cff4c8.json b/data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_Typhoon_v1.03/64890314-bba0-4fb2-8c21-38b413cff4c8.json deleted file mode 100644 index 30c9c3431..000000000 --- a/data/hfopenllm_v2/Nexesenex/Llama_3.1_8b_Typhoon_v1.03/64890314-bba0-4fb2-8c21-38b413cff4c8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Nexesenex_Llama_3.1_8b_Typhoon_v1.03/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama_3.1_8b_Typhoon_v1.03", - "id": "Nexesenex/Llama_3.1_8b_Typhoon_v1.03", - "developer": "Nexesenex", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - 
"params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8078 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5314 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2273 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.307 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3815 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3842 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Nexesenex/Llama_3.2_1b_AquaSyn_0.1/470b8b0d-fbaf-408c-a28e-57d1b294f8a8.json b/data/hfopenllm_v2/Nexesenex/Llama_3.2_1b_AquaSyn_0.1/470b8b0d-fbaf-408c-a28e-57d1b294f8a8.json deleted file mode 100644 index 421f9e1fb..000000000 --- a/data/hfopenllm_v2/Nexesenex/Llama_3.2_1b_AquaSyn_0.1/470b8b0d-fbaf-408c-a28e-57d1b294f8a8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Nexesenex_Llama_3.2_1b_AquaSyn_0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama_3.2_1b_AquaSyn_0.1", - "id": "Nexesenex/Llama_3.2_1b_AquaSyn_0.1", - "developer": "Nexesenex", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.498 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2741 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3284 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0219 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2483 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.346 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1378 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Nexesenex/Llama_3.2_1b_AquaSyn_0.11/00a1579e-8636-4eca-9a63-c0b067a5f3dc.json b/data/hfopenllm_v2/Nexesenex/Llama_3.2_1b_AquaSyn_0.11/00a1579e-8636-4eca-9a63-c0b067a5f3dc.json deleted file mode 100644 index 9ccdfaea8..000000000 --- a/data/hfopenllm_v2/Nexesenex/Llama_3.2_1b_AquaSyn_0.11/00a1579e-8636-4eca-9a63-c0b067a5f3dc.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Nexesenex_Llama_3.2_1b_AquaSyn_0.11/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama_3.2_1b_AquaSyn_0.11", - "id": "Nexesenex/Llama_3.2_1b_AquaSyn_0.11", - "developer": "Nexesenex", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.498 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2431 - 
} - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3112 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0234 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2651 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3368 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1116 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Nexesenex/Llama_3.2_1b_Dolto_0.1/a52cc4c9-6d60-4083-ac77-591e247d86c9.json b/data/hfopenllm_v2/Nexesenex/Llama_3.2_1b_Dolto_0.1/a52cc4c9-6d60-4083-ac77-591e247d86c9.json deleted file mode 100644 index 8e04a3f3b..000000000 --- a/data/hfopenllm_v2/Nexesenex/Llama_3.2_1b_Dolto_0.1/a52cc4c9-6d60-4083-ac77-591e247d86c9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Nexesenex_Llama_3.2_1b_Dolto_0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama_3.2_1b_Dolto_0.1", - "id": "Nexesenex/Llama_3.2_1b_Dolto_0.1", - "developer": "Nexesenex", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.498 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5434 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.335 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.037 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2374 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3421 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1364 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Nexesenex/Llama_3.2_1b_Odyssea_V1.01/ac5c321a-d35a-4e0f-a1be-bcc0b7109f91.json b/data/hfopenllm_v2/Nexesenex/Llama_3.2_1b_Odyssea_V1.01/ac5c321a-d35a-4e0f-a1be-bcc0b7109f91.json deleted file mode 100644 index d44a7f415..000000000 --- a/data/hfopenllm_v2/Nexesenex/Llama_3.2_1b_Odyssea_V1.01/ac5c321a-d35a-4e0f-a1be-bcc0b7109f91.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Nexesenex_Llama_3.2_1b_Odyssea_V1.01/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama_3.2_1b_Odyssea_V1.01", - "id": "Nexesenex/Llama_3.2_1b_Odyssea_V1.01", - "developer": "Nexesenex", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.498 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2495 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3045 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { 
- "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0174 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2559 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.342 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1152 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Nexesenex/Llama_3.2_1b_Odyssea_V1/c4d11b01-ae5b-4198-b102-07160f100a41.json b/data/hfopenllm_v2/Nexesenex/Llama_3.2_1b_Odyssea_V1/c4d11b01-ae5b-4198-b102-07160f100a41.json deleted file mode 100644 index 4efe276e5..000000000 --- a/data/hfopenllm_v2/Nexesenex/Llama_3.2_1b_Odyssea_V1/c4d11b01-ae5b-4198-b102-07160f100a41.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Nexesenex_Llama_3.2_1b_Odyssea_V1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama_3.2_1b_Odyssea_V1", - "id": "Nexesenex/Llama_3.2_1b_Odyssea_V1", - "developer": "Nexesenex", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.498 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2553 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.301 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": 
false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0144 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2584 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3394 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1153 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Nexesenex/Llama_3.2_1b_OpenTree_R1_0.1/19405ead-2263-4613-8053-43beeafb4bfc.json b/data/hfopenllm_v2/Nexesenex/Llama_3.2_1b_OpenTree_R1_0.1/19405ead-2263-4613-8053-43beeafb4bfc.json deleted file mode 100644 index d058404ed..000000000 --- a/data/hfopenllm_v2/Nexesenex/Llama_3.2_1b_OpenTree_R1_0.1/19405ead-2263-4613-8053-43beeafb4bfc.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Nexesenex_Llama_3.2_1b_OpenTree_R1_0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama_3.2_1b_OpenTree_R1_0.1", - "id": "Nexesenex/Llama_3.2_1b_OpenTree_R1_0.1", - "developer": "Nexesenex", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.498 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5366 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.328 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0476 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - 
"dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2525 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3131 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1675 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Nexesenex/Llama_3.2_1b_OrcaSun_V1/6c698a60-a813-4be7-b55f-b684029b492d.json b/data/hfopenllm_v2/Nexesenex/Llama_3.2_1b_OrcaSun_V1/6c698a60-a813-4be7-b55f-b684029b492d.json deleted file mode 100644 index 377ee7131..000000000 --- a/data/hfopenllm_v2/Nexesenex/Llama_3.2_1b_OrcaSun_V1/6c698a60-a813-4be7-b55f-b684029b492d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Nexesenex_Llama_3.2_1b_OrcaSun_V1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama_3.2_1b_OrcaSun_V1", - "id": "Nexesenex/Llama_3.2_1b_OrcaSun_V1", - "developer": "Nexesenex", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.498 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5949 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.355 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0597 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2366 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.338 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1904 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Nexesenex/Llama_3.2_1b_RandomLego_RP_R1_0.1/b67c4a44-7787-45e2-b88c-5d7e8e496fa3.json b/data/hfopenllm_v2/Nexesenex/Llama_3.2_1b_RandomLego_RP_R1_0.1/b67c4a44-7787-45e2-b88c-5d7e8e496fa3.json deleted file mode 100644 index 16d96b077..000000000 --- a/data/hfopenllm_v2/Nexesenex/Llama_3.2_1b_RandomLego_RP_R1_0.1/b67c4a44-7787-45e2-b88c-5d7e8e496fa3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Nexesenex_Llama_3.2_1b_RandomLego_RP_R1_0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama_3.2_1b_RandomLego_RP_R1_0.1", - "id": "Nexesenex/Llama_3.2_1b_RandomLego_RP_R1_0.1", - "developer": "Nexesenex", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.498 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5543 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3428 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0566 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.25 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": 
"MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3249 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1563 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Nexesenex/Llama_3.2_1b_SunOrca_V1/a20a529e-c52e-41b7-a8ee-909167048bfb.json b/data/hfopenllm_v2/Nexesenex/Llama_3.2_1b_SunOrca_V1/a20a529e-c52e-41b7-a8ee-909167048bfb.json deleted file mode 100644 index 2c9dd24ef..000000000 --- a/data/hfopenllm_v2/Nexesenex/Llama_3.2_1b_SunOrca_V1/a20a529e-c52e-41b7-a8ee-909167048bfb.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Nexesenex_Llama_3.2_1b_SunOrca_V1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama_3.2_1b_SunOrca_V1", - "id": "Nexesenex/Llama_3.2_1b_SunOrca_V1", - "developer": "Nexesenex", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.498 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.543 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3431 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0672 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2743 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.3262 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1884 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Nexesenex/Llama_3.2_1b_Sydonia_0.1/2735e6f4-839f-4ab1-8ede-3447891b1b26.json b/data/hfopenllm_v2/Nexesenex/Llama_3.2_1b_Sydonia_0.1/2735e6f4-839f-4ab1-8ede-3447891b1b26.json deleted file mode 100644 index 794afd099..000000000 --- a/data/hfopenllm_v2/Nexesenex/Llama_3.2_1b_Sydonia_0.1/2735e6f4-839f-4ab1-8ede-3447891b1b26.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Nexesenex_Llama_3.2_1b_Sydonia_0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama_3.2_1b_Sydonia_0.1", - "id": "Nexesenex/Llama_3.2_1b_Sydonia_0.1", - "developer": "Nexesenex", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.498 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2197 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3121 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0204 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2282 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3382 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": 
"TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1224 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Nexesenex/Llama_3.2_1b_Syneridol_0.2/e74e7e7f-8550-4cba-97cd-2626c82d6b29.json b/data/hfopenllm_v2/Nexesenex/Llama_3.2_1b_Syneridol_0.2/e74e7e7f-8550-4cba-97cd-2626c82d6b29.json deleted file mode 100644 index b39cc7c79..000000000 --- a/data/hfopenllm_v2/Nexesenex/Llama_3.2_1b_Syneridol_0.2/e74e7e7f-8550-4cba-97cd-2626c82d6b29.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Nexesenex_Llama_3.2_1b_Syneridol_0.2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama_3.2_1b_Syneridol_0.2", - "id": "Nexesenex/Llama_3.2_1b_Syneridol_0.2", - "developer": "Nexesenex", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.498 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2157 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3139 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0219 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2349 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3343 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.1227 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Nexesenex/Llama_3.2_1b_Synopsys_0.1/14f4c00d-8915-413d-8e85-79f395127682.json b/data/hfopenllm_v2/Nexesenex/Llama_3.2_1b_Synopsys_0.1/14f4c00d-8915-413d-8e85-79f395127682.json deleted file mode 100644 index 0b4dbadaf..000000000 --- a/data/hfopenllm_v2/Nexesenex/Llama_3.2_1b_Synopsys_0.1/14f4c00d-8915-413d-8e85-79f395127682.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Nexesenex_Llama_3.2_1b_Synopsys_0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama_3.2_1b_Synopsys_0.1", - "id": "Nexesenex/Llama_3.2_1b_Synopsys_0.1", - "developer": "Nexesenex", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.498 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1764 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3162 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0166 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2391 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3461 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1231 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Nexesenex/Llama_3.2_1b_Synopsys_0.11/9119b586-d3b2-4ce0-a243-d584e2087184.json 
b/data/hfopenllm_v2/Nexesenex/Llama_3.2_1b_Synopsys_0.11/9119b586-d3b2-4ce0-a243-d584e2087184.json deleted file mode 100644 index ff16ce048..000000000 --- a/data/hfopenllm_v2/Nexesenex/Llama_3.2_1b_Synopsys_0.11/9119b586-d3b2-4ce0-a243-d584e2087184.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Nexesenex_Llama_3.2_1b_Synopsys_0.11/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama_3.2_1b_Synopsys_0.11", - "id": "Nexesenex/Llama_3.2_1b_Synopsys_0.11", - "developer": "Nexesenex", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.498 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2842 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3102 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0128 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2626 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3513 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1123 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Nexesenex/Llama_3.2_3b_Kermes_v1/629f3f1a-f8ee-4d1b-b604-7bbd35c6517b.json b/data/hfopenllm_v2/Nexesenex/Llama_3.2_3b_Kermes_v1/629f3f1a-f8ee-4d1b-b604-7bbd35c6517b.json deleted file mode 100644 index 66193208d..000000000 --- 
a/data/hfopenllm_v2/Nexesenex/Llama_3.2_3b_Kermes_v1/629f3f1a-f8ee-4d1b-b604-7bbd35c6517b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Nexesenex_Llama_3.2_3b_Kermes_v1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama_3.2_3b_Kermes_v1", - "id": "Nexesenex/Llama_3.2_3b_Kermes_v1", - "developer": "Nexesenex", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4852 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.441 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.031 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2735 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.407 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2547 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Nexesenex/Llama_3.2_3b_Kermes_v2.1/a6ac828c-904b-413a-a5fa-a5ed06a28143.json b/data/hfopenllm_v2/Nexesenex/Llama_3.2_3b_Kermes_v2.1/a6ac828c-904b-413a-a5fa-a5ed06a28143.json deleted file mode 100644 index feb5d98bb..000000000 --- a/data/hfopenllm_v2/Nexesenex/Llama_3.2_3b_Kermes_v2.1/a6ac828c-904b-413a-a5fa-a5ed06a28143.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/Nexesenex_Llama_3.2_3b_Kermes_v2.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama_3.2_3b_Kermes_v2.1", - "id": "Nexesenex/Llama_3.2_3b_Kermes_v2.1", - "developer": "Nexesenex", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5584 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4464 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0521 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2794 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3964 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2692 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Nexesenex/Llama_3.2_3b_Kermes_v2/251a3ef9-c7ae-4d79-8a60-4bc021a3f001.json b/data/hfopenllm_v2/Nexesenex/Llama_3.2_3b_Kermes_v2/251a3ef9-c7ae-4d79-8a60-4bc021a3f001.json deleted file mode 100644 index 294b9bac9..000000000 --- a/data/hfopenllm_v2/Nexesenex/Llama_3.2_3b_Kermes_v2/251a3ef9-c7ae-4d79-8a60-4bc021a3f001.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Nexesenex_Llama_3.2_3b_Kermes_v2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - 
"source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama_3.2_3b_Kermes_v2", - "id": "Nexesenex/Llama_3.2_3b_Kermes_v2", - "developer": "Nexesenex", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5754 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4455 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0544 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2651 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3778 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2734 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Nexesenex/Nemotron_W_4b_Halo_0.1/962b48a3-23d7-4104-b34d-4e5c2af31d58.json b/data/hfopenllm_v2/Nexesenex/Nemotron_W_4b_Halo_0.1/962b48a3-23d7-4104-b34d-4e5c2af31d58.json deleted file mode 100644 index d9a752984..000000000 --- a/data/hfopenllm_v2/Nexesenex/Nemotron_W_4b_Halo_0.1/962b48a3-23d7-4104-b34d-4e5c2af31d58.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Nexesenex_Nemotron_W_4b_Halo_0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Nemotron_W_4b_Halo_0.1", - "id": "Nexesenex/Nemotron_W_4b_Halo_0.1", - "developer": "Nexesenex", - 
"inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 4.513 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3627 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4135 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0423 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2802 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4165 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2505 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Nexesenex/Nemotron_W_4b_MagLight_0.1/e4b0be31-6f9a-4a57-b433-e561da9bd827.json b/data/hfopenllm_v2/Nexesenex/Nemotron_W_4b_MagLight_0.1/e4b0be31-6f9a-4a57-b433-e561da9bd827.json deleted file mode 100644 index 7186f9001..000000000 --- a/data/hfopenllm_v2/Nexesenex/Nemotron_W_4b_MagLight_0.1/e4b0be31-6f9a-4a57-b433-e561da9bd827.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Nexesenex_Nemotron_W_4b_MagLight_0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Nemotron_W_4b_MagLight_0.1", - "id": "Nexesenex/Nemotron_W_4b_MagLight_0.1", - "developer": "Nexesenex", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 4.513 - } - }, - "evaluation_results": [ - { - 
"evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.423 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4231 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.04 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2836 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4112 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2545 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Nexesenex/Qwen_2.5_3b_Smarteaz_0.01a/9a31f208-b7d8-4baa-b96e-99926ecb35af.json b/data/hfopenllm_v2/Nexesenex/Qwen_2.5_3b_Smarteaz_0.01a/9a31f208-b7d8-4baa-b96e-99926ecb35af.json deleted file mode 100644 index 8ea2f8444..000000000 --- a/data/hfopenllm_v2/Nexesenex/Qwen_2.5_3b_Smarteaz_0.01a/9a31f208-b7d8-4baa-b96e-99926ecb35af.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Nexesenex_Qwen_2.5_3b_Smarteaz_0.01a/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen_2.5_3b_Smarteaz_0.01a", - "id": "Nexesenex/Qwen_2.5_3b_Smarteaz_0.01a", - "developer": "Nexesenex", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.085 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on 
IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4012 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4637 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1805 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2777 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.432 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.286 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Nexesenex/pankajmathur_orca_mini_v9_6_1B-instruct-Abliterated-LPL/8d933df1-60cb-471d-bfc3-b11c93150203.json b/data/hfopenllm_v2/Nexesenex/pankajmathur_orca_mini_v9_6_1B-instruct-Abliterated-LPL/8d933df1-60cb-471d-bfc3-b11c93150203.json deleted file mode 100644 index 494bbaf5c..000000000 --- a/data/hfopenllm_v2/Nexesenex/pankajmathur_orca_mini_v9_6_1B-instruct-Abliterated-LPL/8d933df1-60cb-471d-bfc3-b11c93150203.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Nexesenex_pankajmathur_orca_mini_v9_6_1B-instruct-Abliterated-LPL/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "pankajmathur_orca_mini_v9_6_1B-instruct-Abliterated-LPL", - "id": "Nexesenex/pankajmathur_orca_mini_v9_6_1B-instruct-Abliterated-LPL", - "developer": "Nexesenex", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.236 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": 
false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.589 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3562 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0748 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2668 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3396 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1803 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Nexusflow/NexusRaven-V2-13B/35315c3a-ec06-433a-b3fa-ae7a4a59b7ea.json b/data/hfopenllm_v2/Nexusflow/NexusRaven-V2-13B/35315c3a-ec06-433a-b3fa-ae7a4a59b7ea.json deleted file mode 100644 index f72a2ef92..000000000 --- a/data/hfopenllm_v2/Nexusflow/NexusRaven-V2-13B/35315c3a-ec06-433a-b3fa-ae7a4a59b7ea.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Nexusflow_NexusRaven-V2-13B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "NexusRaven-V2-13B", - "id": "Nexusflow/NexusRaven-V2-13B", - "developer": "Nexusflow", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 13.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1791 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": 
"SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3949 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0295 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2601 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3737 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1872 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/NikolaSigmoid/AceMath-1.5B-Instruct-1epoch/3530db9a-0d61-4cf8-9fff-b15f6488c845.json b/data/hfopenllm_v2/NikolaSigmoid/AceMath-1.5B-Instruct-1epoch/3530db9a-0d61-4cf8-9fff-b15f6488c845.json deleted file mode 100644 index 24a1eb210..000000000 --- a/data/hfopenllm_v2/NikolaSigmoid/AceMath-1.5B-Instruct-1epoch/3530db9a-0d61-4cf8-9fff-b15f6488c845.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/NikolaSigmoid_AceMath-1.5B-Instruct-1epoch/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "AceMath-1.5B-Instruct-1epoch", - "id": "NikolaSigmoid/AceMath-1.5B-Instruct-1epoch", - "developer": "NikolaSigmoid", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.791 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2849 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, 
- "max_score": 1.0 - }, - "score_details": { - "score": 0.4263 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3051 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2777 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3925 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2376 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/NikolaSigmoid/AceMath-1.5B-Instruct-dolphin-r1-200/7d9901e0-eafe-4d49-a5bb-fab059708bcb.json b/data/hfopenllm_v2/NikolaSigmoid/AceMath-1.5B-Instruct-dolphin-r1-200/7d9901e0-eafe-4d49-a5bb-fab059708bcb.json deleted file mode 100644 index 797f0d872..000000000 --- a/data/hfopenllm_v2/NikolaSigmoid/AceMath-1.5B-Instruct-dolphin-r1-200/7d9901e0-eafe-4d49-a5bb-fab059708bcb.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/NikolaSigmoid_AceMath-1.5B-Instruct-dolphin-r1-200/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "AceMath-1.5B-Instruct-dolphin-r1-200", - "id": "NikolaSigmoid/AceMath-1.5B-Instruct-dolphin-r1-200", - "developer": "NikolaSigmoid", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.928 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1808 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2815 - } - }, - { - "evaluation_name": "MATH Level 5", - 
"source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2559 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.375 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1143 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/NikolaSigmoid/DeepSeek-R1-Distill-Qwen-1.5B-500/ee7f9025-bb2c-4902-b8e2-bfac2b63d2fd.json b/data/hfopenllm_v2/NikolaSigmoid/DeepSeek-R1-Distill-Qwen-1.5B-500/ee7f9025-bb2c-4902-b8e2-bfac2b63d2fd.json deleted file mode 100644 index 149206bb9..000000000 --- a/data/hfopenllm_v2/NikolaSigmoid/DeepSeek-R1-Distill-Qwen-1.5B-500/ee7f9025-bb2c-4902-b8e2-bfac2b63d2fd.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/NikolaSigmoid_DeepSeek-R1-Distill-Qwen-1.5B-500/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "DeepSeek-R1-Distill-Qwen-1.5B-500", - "id": "NikolaSigmoid/DeepSeek-R1-Distill-Qwen-1.5B-500", - "developer": "NikolaSigmoid", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.157 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1749 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2602 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - 
"metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2458 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.338 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1125 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/NikolaSigmoid/acemath-200/6157f79e-2673-4ad6-99d7-e5cf5e4e1db2.json b/data/hfopenllm_v2/NikolaSigmoid/acemath-200/6157f79e-2673-4ad6-99d7-e5cf5e4e1db2.json deleted file mode 100644 index 947e95059..000000000 --- a/data/hfopenllm_v2/NikolaSigmoid/acemath-200/6157f79e-2673-4ad6-99d7-e5cf5e4e1db2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/NikolaSigmoid_acemath-200/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "acemath-200", - "id": "NikolaSigmoid/acemath-200", - "developer": "NikolaSigmoid", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.791 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2849 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4263 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3051 - } - }, - { - "evaluation_name": "GPQA", - 
"source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2777 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3925 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2376 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/NikolaSigmoid/phi-4-14b/0aa7572c-1aa6-4997-a2a2-3b557fbde639.json b/data/hfopenllm_v2/NikolaSigmoid/phi-4-14b/0aa7572c-1aa6-4997-a2a2-3b557fbde639.json deleted file mode 100644 index 61e7f3fc3..000000000 --- a/data/hfopenllm_v2/NikolaSigmoid/phi-4-14b/0aa7572c-1aa6-4997-a2a2-3b557fbde639.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/NikolaSigmoid_phi-4-14b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "phi-4-14b", - "id": "NikolaSigmoid/phi-4-14b", - "developer": "NikolaSigmoid", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "", - "params_billions": 14.704 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0561 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6695 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2938 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.4035 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5047 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5278 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/NikolaSigmoid/phi-4-1steps/6f5df760-2d3e-47b1-b55e-4031a5f11d41.json b/data/hfopenllm_v2/NikolaSigmoid/phi-4-1steps/6f5df760-2d3e-47b1-b55e-4031a5f11d41.json deleted file mode 100644 index eba7dc221..000000000 --- a/data/hfopenllm_v2/NikolaSigmoid/phi-4-1steps/6f5df760-2d3e-47b1-b55e-4031a5f11d41.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/NikolaSigmoid_phi-4-1steps/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "phi-4-1steps", - "id": "NikolaSigmoid/phi-4-1steps", - "developer": "NikolaSigmoid", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "", - "params_billions": 14.704 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0528 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6707 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2983 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4018 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": 
false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5021 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5273 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/NikolaSigmoid/phi-4-300steps/ac676b03-c3ce-4ff1-83fc-5c8db82f1497.json b/data/hfopenllm_v2/NikolaSigmoid/phi-4-300steps/ac676b03-c3ce-4ff1-83fc-5c8db82f1497.json deleted file mode 100644 index d245536d7..000000000 --- a/data/hfopenllm_v2/NikolaSigmoid/phi-4-300steps/ac676b03-c3ce-4ff1-83fc-5c8db82f1497.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/NikolaSigmoid_phi-4-300steps/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "phi-4-300steps", - "id": "NikolaSigmoid/phi-4-300steps", - "developer": "NikolaSigmoid", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "", - "params_billions": 14.704 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0561 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6701 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2946 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4052 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5034 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": 
"TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5288 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Nitral-AI/Captain-Eris-BMO_Violent-GRPO-v0.420/2229cdf8-3ecb-4f11-8824-9c3bfbf6f968.json b/data/hfopenllm_v2/Nitral-AI/Captain-Eris-BMO_Violent-GRPO-v0.420/2229cdf8-3ecb-4f11-8824-9c3bfbf6f968.json deleted file mode 100644 index 021208aeb..000000000 --- a/data/hfopenllm_v2/Nitral-AI/Captain-Eris-BMO_Violent-GRPO-v0.420/2229cdf8-3ecb-4f11-8824-9c3bfbf6f968.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Nitral-AI_Captain-Eris-BMO_Violent-GRPO-v0.420/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Captain-Eris-BMO_Violent-GRPO-v0.420", - "id": "Nitral-AI/Captain-Eris-BMO_Violent-GRPO-v0.420", - "developer": "Nitral-AI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6313 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5079 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1314 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3096 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4228 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3596 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Nitral-AI/Captain-Eris_BMO-Violent-12B/95ebc5b8-a541-4fca-9e7c-692720e73362.json b/data/hfopenllm_v2/Nitral-AI/Captain-Eris_BMO-Violent-12B/95ebc5b8-a541-4fca-9e7c-692720e73362.json deleted file mode 100644 index 52a63c6ba..000000000 --- a/data/hfopenllm_v2/Nitral-AI/Captain-Eris_BMO-Violent-12B/95ebc5b8-a541-4fca-9e7c-692720e73362.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Nitral-AI_Captain-Eris_BMO-Violent-12B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Captain-Eris_BMO-Violent-12B", - "id": "Nitral-AI/Captain-Eris_BMO-Violent-12B", - "developer": "Nitral-AI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6152 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5104 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1367 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3096 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4255 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3571 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/Nitral-AI/Captain-Eris_Violet-GRPO-v0.420/09a2508d-a171-493f-9ff2-e7f375815c91.json b/data/hfopenllm_v2/Nitral-AI/Captain-Eris_Violet-GRPO-v0.420/09a2508d-a171-493f-9ff2-e7f375815c91.json deleted file mode 100644 index 4d63cdc2b..000000000 --- a/data/hfopenllm_v2/Nitral-AI/Captain-Eris_Violet-GRPO-v0.420/09a2508d-a171-493f-9ff2-e7f375815c91.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Nitral-AI_Captain-Eris_Violet-GRPO-v0.420/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Captain-Eris_Violet-GRPO-v0.420", - "id": "Nitral-AI/Captain-Eris_Violet-GRPO-v0.420", - "developer": "Nitral-AI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6262 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5159 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.108 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2987 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4279 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3535 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Nitral-AI/Captain-Eris_Violet-V0.420-12B/12a4a921-5859-4fd6-9d64-677a7d8ef696.json 
b/data/hfopenllm_v2/Nitral-AI/Captain-Eris_Violet-V0.420-12B/12a4a921-5859-4fd6-9d64-677a7d8ef696.json deleted file mode 100644 index 8679f45f6..000000000 --- a/data/hfopenllm_v2/Nitral-AI/Captain-Eris_Violet-V0.420-12B/12a4a921-5859-4fd6-9d64-677a7d8ef696.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Nitral-AI_Captain-Eris_Violet-V0.420-12B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Captain-Eris_Violet-V0.420-12B", - "id": "Nitral-AI/Captain-Eris_Violet-V0.420-12B", - "developer": "Nitral-AI", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4339 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5478 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1073 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3112 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4331 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3723 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Nitral-AI/Captain_BMO-12B/b79f12d0-cdfc-4c9d-a88b-40612dcbf64d.json b/data/hfopenllm_v2/Nitral-AI/Captain_BMO-12B/b79f12d0-cdfc-4c9d-a88b-40612dcbf64d.json deleted file mode 100644 index ab08d38f3..000000000 --- 
a/data/hfopenllm_v2/Nitral-AI/Captain_BMO-12B/b79f12d0-cdfc-4c9d-a88b-40612dcbf64d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Nitral-AI_Captain_BMO-12B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Captain_BMO-12B", - "id": "Nitral-AI/Captain_BMO-12B", - "developer": "Nitral-AI", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4751 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5286 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1397 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3196 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3748 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3569 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Nitral-AI/Hathor_Stable-v0.2-L3-8B/d162cf7c-3ef4-420f-aab4-789a98b1195a.json b/data/hfopenllm_v2/Nitral-AI/Hathor_Stable-v0.2-L3-8B/d162cf7c-3ef4-420f-aab4-789a98b1195a.json deleted file mode 100644 index 74bd3f7c0..000000000 --- a/data/hfopenllm_v2/Nitral-AI/Hathor_Stable-v0.2-L3-8B/d162cf7c-3ef4-420f-aab4-789a98b1195a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Nitral-AI_Hathor_Stable-v0.2-L3-8B/1770682486.623709", - 
"retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Hathor_Stable-v0.2-L3-8B", - "id": "Nitral-AI/Hathor_Stable-v0.2-L3-8B", - "developer": "Nitral-AI", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7175 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5286 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.105 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2869 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3781 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3696 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Nitral-AI/Hathor_Tahsin-L3-8B-v0.85/7e49018e-5e2d-4cdb-be5b-2ac04ec84bf5.json b/data/hfopenllm_v2/Nitral-AI/Hathor_Tahsin-L3-8B-v0.85/7e49018e-5e2d-4cdb-be5b-2ac04ec84bf5.json deleted file mode 100644 index b03d999be..000000000 --- a/data/hfopenllm_v2/Nitral-AI/Hathor_Tahsin-L3-8B-v0.85/7e49018e-5e2d-4cdb-be5b-2ac04ec84bf5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Nitral-AI_Hathor_Tahsin-L3-8B-v0.85/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": 
"third_party" - }, - "model_info": { - "name": "Hathor_Tahsin-L3-8B-v0.85", - "id": "Nitral-AI/Hathor_Tahsin-L3-8B-v0.85", - "developer": "Nitral-AI", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.711 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5279 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1005 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2852 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3647 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.372 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Nitral-AI/Nera_Noctis-12B/24677f2a-ea89-4289-bcb6-13699de9782f.json b/data/hfopenllm_v2/Nitral-AI/Nera_Noctis-12B/24677f2a-ea89-4289-bcb6-13699de9782f.json deleted file mode 100644 index 2cbdf842f..000000000 --- a/data/hfopenllm_v2/Nitral-AI/Nera_Noctis-12B/24677f2a-ea89-4289-bcb6-13699de9782f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Nitral-AI_Nera_Noctis-12B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Nera_Noctis-12B", - "id": "Nitral-AI/Nera_Noctis-12B", - "developer": "Nitral-AI", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": 
"MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4562 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5194 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0876 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2634 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3979 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3468 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Nohobby/MS-Schisandra-22B-v0.1/3e09df3c-2224-4a29-8e55-18a485db2b25.json b/data/hfopenllm_v2/Nohobby/MS-Schisandra-22B-v0.1/3e09df3c-2224-4a29-8e55-18a485db2b25.json deleted file mode 100644 index a6a01b8ab..000000000 --- a/data/hfopenllm_v2/Nohobby/MS-Schisandra-22B-v0.1/3e09df3c-2224-4a29-8e55-18a485db2b25.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Nohobby_MS-Schisandra-22B-v0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MS-Schisandra-22B-v0.1", - "id": "Nohobby/MS-Schisandra-22B-v0.1", - "developer": "Nohobby", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 22.247 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6331 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.579 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2228 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3322 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3928 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4096 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Nohobby/MS-Schisandra-22B-v0.2/cc0bd236-8fc4-43d3-a18f-4b2afb112946.json b/data/hfopenllm_v2/Nohobby/MS-Schisandra-22B-v0.2/cc0bd236-8fc4-43d3-a18f-4b2afb112946.json deleted file mode 100644 index c97104188..000000000 --- a/data/hfopenllm_v2/Nohobby/MS-Schisandra-22B-v0.2/cc0bd236-8fc4-43d3-a18f-4b2afb112946.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Nohobby_MS-Schisandra-22B-v0.2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MS-Schisandra-22B-v0.2", - "id": "Nohobby/MS-Schisandra-22B-v0.2", - "developer": "Nohobby", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 22.247 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6383 - } - }, - { - 
"evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5841 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2032 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3356 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4075 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4136 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Norquinal/Alpha/5afd4c0f-b61d-452f-8c48-d298780d91d5.json b/data/hfopenllm_v2/Norquinal/Alpha/5afd4c0f-b61d-452f-8c48-d298780d91d5.json deleted file mode 100644 index a242e53cd..000000000 --- a/data/hfopenllm_v2/Norquinal/Alpha/5afd4c0f-b61d-452f-8c48-d298780d91d5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Norquinal_Alpha/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Alpha", - "id": "Norquinal/Alpha", - "developer": "Norquinal", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2803 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.3374 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0574 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2651 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3631 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3003 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Norquinal/Bravo/eac52141-4fd8-4e21-9c78-920ab8933e5a.json b/data/hfopenllm_v2/Norquinal/Bravo/eac52141-4fd8-4e21-9c78-920ab8933e5a.json deleted file mode 100644 index 4a140030a..000000000 --- a/data/hfopenllm_v2/Norquinal/Bravo/eac52141-4fd8-4e21-9c78-920ab8933e5a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Norquinal_Bravo/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Bravo", - "id": "Norquinal/Bravo", - "developer": "Norquinal", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3025 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3558 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0574 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2819 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3869 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3127 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Norquinal/Charlie/8449837f-64ac-4293-b1f8-210e62779202.json b/data/hfopenllm_v2/Norquinal/Charlie/8449837f-64ac-4293-b1f8-210e62779202.json deleted file mode 100644 index eb7553fa4..000000000 --- a/data/hfopenllm_v2/Norquinal/Charlie/8449837f-64ac-4293-b1f8-210e62779202.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Norquinal_Charlie/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Charlie", - "id": "Norquinal/Charlie", - "developer": "Norquinal", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3061 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3515 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0582 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.271 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3737 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3093 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Norquinal/Delta/ab8a665c-8234-484f-a8a9-8ee79d73edff.json b/data/hfopenllm_v2/Norquinal/Delta/ab8a665c-8234-484f-a8a9-8ee79d73edff.json deleted file mode 100644 index bbcdf56d1..000000000 --- a/data/hfopenllm_v2/Norquinal/Delta/ab8a665c-8234-484f-a8a9-8ee79d73edff.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Norquinal_Delta/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Delta", - "id": "Norquinal/Delta", - "developer": "Norquinal", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2538 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3435 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0612 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2609 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3777 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2959 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Norquinal/Echo/a954242f-41a6-49d7-a71d-3bfe940cdb92.json b/data/hfopenllm_v2/Norquinal/Echo/a954242f-41a6-49d7-a71d-3bfe940cdb92.json deleted file mode 100644 index b0482210d..000000000 --- a/data/hfopenllm_v2/Norquinal/Echo/a954242f-41a6-49d7-a71d-3bfe940cdb92.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Norquinal_Echo/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Echo", - "id": "Norquinal/Echo", - "developer": "Norquinal", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3158 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.353 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0574 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2794 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3804 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": 
"TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3095 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Norquinal/Foxtrot/6d1c518f-3f42-49eb-9208-b30e27e7e87e.json b/data/hfopenllm_v2/Norquinal/Foxtrot/6d1c518f-3f42-49eb-9208-b30e27e7e87e.json deleted file mode 100644 index eb075ffce..000000000 --- a/data/hfopenllm_v2/Norquinal/Foxtrot/6d1c518f-3f42-49eb-9208-b30e27e7e87e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Norquinal_Foxtrot/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Foxtrot", - "id": "Norquinal/Foxtrot", - "developer": "Norquinal", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3012 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3558 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0582 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2869 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3804 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.305 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/Norquinal/Golf/87931db7-42a4-48df-b5a5-8bd934061dbe.json b/data/hfopenllm_v2/Norquinal/Golf/87931db7-42a4-48df-b5a5-8bd934061dbe.json deleted file mode 100644 index a262d2157..000000000 --- a/data/hfopenllm_v2/Norquinal/Golf/87931db7-42a4-48df-b5a5-8bd934061dbe.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Norquinal_Golf/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Golf", - "id": "Norquinal/Golf", - "developer": "Norquinal", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3534 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3533 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0536 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2903 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.338 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3056 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Norquinal/Hotel/54088dbc-04cc-4b35-b4e1-e495b7cfd47f.json b/data/hfopenllm_v2/Norquinal/Hotel/54088dbc-04cc-4b35-b4e1-e495b7cfd47f.json deleted file mode 100644 index 21264522c..000000000 --- a/data/hfopenllm_v2/Norquinal/Hotel/54088dbc-04cc-4b35-b4e1-e495b7cfd47f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - 
"schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Norquinal_Hotel/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Hotel", - "id": "Norquinal/Hotel", - "developer": "Norquinal", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3215 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3679 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0529 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2794 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3288 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3157 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/NotASI/FineTome-Llama3.2-1B-0929/7129efad-8ab2-4f7a-b6ed-055989b3e131.json b/data/hfopenllm_v2/NotASI/FineTome-Llama3.2-1B-0929/7129efad-8ab2-4f7a-b6ed-055989b3e131.json deleted file mode 100644 index 0d3997068..000000000 --- a/data/hfopenllm_v2/NotASI/FineTome-Llama3.2-1B-0929/7129efad-8ab2-4f7a-b6ed-055989b3e131.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/NotASI_FineTome-Llama3.2-1B-0929/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": 
"Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "FineTome-Llama3.2-1B-0929", - "id": "NotASI/FineTome-Llama3.2-1B-0929", - "developer": "NotASI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.236 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3991 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3246 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0363 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2727 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3488 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1429 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/NotASI/FineTome-Llama3.2-3B-1002/cfc6f85f-e4b6-4164-b7eb-4efb888e1ba5.json b/data/hfopenllm_v2/NotASI/FineTome-Llama3.2-3B-1002/cfc6f85f-e4b6-4164-b7eb-4efb888e1ba5.json deleted file mode 100644 index acaee36c4..000000000 --- a/data/hfopenllm_v2/NotASI/FineTome-Llama3.2-3B-1002/cfc6f85f-e4b6-4164-b7eb-4efb888e1ba5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/NotASI_FineTome-Llama3.2-3B-1002/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "FineTome-Llama3.2-3B-1002", - "id": "NotASI/FineTome-Llama3.2-3B-1002", - "developer": "NotASI", - "inference_platform": "unknown", - 
"additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5474 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4319 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0627 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2508 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3685 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2437 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/NotASI/FineTome-v1.5-Llama3.2-1B-1007/0f053a45-cd79-4e51-9b4c-ae5c51006c17.json b/data/hfopenllm_v2/NotASI/FineTome-v1.5-Llama3.2-1B-1007/0f053a45-cd79-4e51-9b4c-ae5c51006c17.json deleted file mode 100644 index eb26e5748..000000000 --- a/data/hfopenllm_v2/NotASI/FineTome-v1.5-Llama3.2-1B-1007/0f053a45-cd79-4e51-9b4c-ae5c51006c17.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/NotASI_FineTome-v1.5-Llama3.2-1B-1007/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "FineTome-v1.5-Llama3.2-1B-1007", - "id": "NotASI/FineTome-v1.5-Llama3.2-1B-1007", - "developer": "NotASI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.236 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": 
{ - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3924 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3241 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0317 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.25 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3475 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1427 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/NotASI/FineTome-v1.5-Llama3.2-3B-1007/d8002b35-1454-4635-a31e-b419c7000b53.json b/data/hfopenllm_v2/NotASI/FineTome-v1.5-Llama3.2-3B-1007/d8002b35-1454-4635-a31e-b419c7000b53.json deleted file mode 100644 index d99453b0a..000000000 --- a/data/hfopenllm_v2/NotASI/FineTome-v1.5-Llama3.2-3B-1007/d8002b35-1454-4635-a31e-b419c7000b53.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/NotASI_FineTome-v1.5-Llama3.2-3B-1007/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "FineTome-v1.5-Llama3.2-3B-1007", - "id": "NotASI/FineTome-v1.5-Llama3.2-3B-1007", - "developer": "NotASI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5508 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4312 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0642 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2617 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3645 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2448 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/NousResearch/DeepHermes-3-Mistral-24B-Preview/4c08530e-d529-49a1-a3fe-2351c422981a.json b/data/hfopenllm_v2/NousResearch/DeepHermes-3-Mistral-24B-Preview/4c08530e-d529-49a1-a3fe-2351c422981a.json deleted file mode 100644 index df49d3613..000000000 --- a/data/hfopenllm_v2/NousResearch/DeepHermes-3-Mistral-24B-Preview/4c08530e-d529-49a1-a3fe-2351c422981a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/NousResearch_DeepHermes-3-Mistral-24B-Preview/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "DeepHermes-3-Mistral-24B-Preview", - "id": "NousResearch/DeepHermes-3-Mistral-24B-Preview", - "developer": "NousResearch", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 23.572 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4536 - } - }, - { - "evaluation_name": 
"BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6488 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2576 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.37 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4503 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.459 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/NousResearch/Hermes-2-Pro-Llama-3-8B/d16879dc-7ed7-49c4-aca6-4c9cd3b3a350.json b/data/hfopenllm_v2/NousResearch/Hermes-2-Pro-Llama-3-8B/d16879dc-7ed7-49c4-aca6-4c9cd3b3a350.json deleted file mode 100644 index bbff9c46a..000000000 --- a/data/hfopenllm_v2/NousResearch/Hermes-2-Pro-Llama-3-8B/d16879dc-7ed7-49c4-aca6-4c9cd3b3a350.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/NousResearch_Hermes-2-Pro-Llama-3-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Hermes-2-Pro-Llama-3-8B", - "id": "NousResearch/Hermes-2-Pro-Llama-3-8B", - "developer": "NousResearch", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.031 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5362 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": 
false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5071 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0838 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2928 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4262 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3052 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/NousResearch/Hermes-2-Pro-Mistral-7B/70656b13-e0a2-4ef4-af43-0d9995d57af6.json b/data/hfopenllm_v2/NousResearch/Hermes-2-Pro-Mistral-7B/70656b13-e0a2-4ef4-af43-0d9995d57af6.json deleted file mode 100644 index ec8e4de73..000000000 --- a/data/hfopenllm_v2/NousResearch/Hermes-2-Pro-Mistral-7B/70656b13-e0a2-4ef4-af43-0d9995d57af6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/NousResearch_Hermes-2-Pro-Mistral-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Hermes-2-Pro-Mistral-7B", - "id": "NousResearch/Hermes-2-Pro-Mistral-7B", - "developer": "NousResearch", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5668 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4995 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - 
"dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0604 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2735 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4376 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2946 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/NousResearch/Hermes-2-Theta-Llama-3-8B/6544f1ca-02a6-4e58-98f0-e19cc6082682.json b/data/hfopenllm_v2/NousResearch/Hermes-2-Theta-Llama-3-8B/6544f1ca-02a6-4e58-98f0-e19cc6082682.json deleted file mode 100644 index b3a330363..000000000 --- a/data/hfopenllm_v2/NousResearch/Hermes-2-Theta-Llama-3-8B/6544f1ca-02a6-4e58-98f0-e19cc6082682.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/NousResearch_Hermes-2-Theta-Llama-3-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Hermes-2-Theta-Llama-3-8B", - "id": "NousResearch/Hermes-2-Theta-Llama-3-8B", - "developer": "NousResearch", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6518 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5207 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 
5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0967 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3037 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3949 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3369 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/NousResearch/Hermes-3-Llama-3.1-70B/5cd3796f-fb31-49c1-a974-019c5c5b20ae.json b/data/hfopenllm_v2/NousResearch/Hermes-3-Llama-3.1-70B/5cd3796f-fb31-49c1-a974-019c5c5b20ae.json deleted file mode 100644 index 75935fc92..000000000 --- a/data/hfopenllm_v2/NousResearch/Hermes-3-Llama-3.1-70B/5cd3796f-fb31-49c1-a974-019c5c5b20ae.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/NousResearch_Hermes-3-Llama-3.1-70B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Hermes-3-Llama-3.1-70B", - "id": "NousResearch/Hermes-3-Llama-3.1-70B", - "developer": "NousResearch", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 70.554 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7661 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6756 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.21 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - 
"dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3616 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4949 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4727 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/NousResearch/Hermes-3-Llama-3.1-8B/49eff9ad-90c9-43b1-a1f5-cf371ac4b39b.json b/data/hfopenllm_v2/NousResearch/Hermes-3-Llama-3.1-8B/49eff9ad-90c9-43b1-a1f5-cf371ac4b39b.json deleted file mode 100644 index 0ebe2c0ab..000000000 --- a/data/hfopenllm_v2/NousResearch/Hermes-3-Llama-3.1-8B/49eff9ad-90c9-43b1-a1f5-cf371ac4b39b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/NousResearch_Hermes-3-Llama-3.1-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Hermes-3-Llama-3.1-8B", - "id": "NousResearch/Hermes-3-Llama-3.1-8B", - "developer": "NousResearch", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.617 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5177 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0476 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2978 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4369 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3139 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/NousResearch/Hermes-3-Llama-3.2-3B/59720f7e-7e09-483f-8332-8dc7aa19ae78.json b/data/hfopenllm_v2/NousResearch/Hermes-3-Llama-3.2-3B/59720f7e-7e09-483f-8332-8dc7aa19ae78.json deleted file mode 100644 index 248fd1575..000000000 --- a/data/hfopenllm_v2/NousResearch/Hermes-3-Llama-3.2-3B/59720f7e-7e09-483f-8332-8dc7aa19ae78.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/NousResearch_Hermes-3-Llama-3.2-3B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Hermes-3-Llama-3.2-3B", - "id": "NousResearch/Hermes-3-Llama-3.2-3B", - "developer": "NousResearch", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3825 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4352 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0393 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2752 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": 
"TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.403 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2544 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/NousResearch/Nous-Hermes-2-Mistral-7B-DPO/a3a89e4a-0589-4776-a1da-227552482e94.json b/data/hfopenllm_v2/NousResearch/Nous-Hermes-2-Mistral-7B-DPO/a3a89e4a-0589-4776-a1da-227552482e94.json deleted file mode 100644 index c982b95bd..000000000 --- a/data/hfopenllm_v2/NousResearch/Nous-Hermes-2-Mistral-7B-DPO/a3a89e4a-0589-4776-a1da-227552482e94.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/NousResearch_Nous-Hermes-2-Mistral-7B-DPO/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Nous-Hermes-2-Mistral-7B-DPO", - "id": "NousResearch/Nous-Hermes-2-Mistral-7B-DPO", - "developer": "NousResearch", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5763 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4853 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0476 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2928 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.4 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3015 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO/b3c04d1f-80e3-4d86-9779-c5e4bbce6f35.json b/data/hfopenllm_v2/NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO/b3c04d1f-80e3-4d86-9779-c5e4bbce6f35.json deleted file mode 100644 index 44ac522b4..000000000 --- a/data/hfopenllm_v2/NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO/b3c04d1f-80e3-4d86-9779-c5e4bbce6f35.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/NousResearch_Nous-Hermes-2-Mixtral-8x7B-DPO/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Nous-Hermes-2-Mixtral-8x7B-DPO", - "id": "NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO", - "developer": "NousResearch", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MixtralForCausalLM", - "params_billions": 46.703 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5897 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5539 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1224 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3213 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4595 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - 
"source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3666 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/NousResearch/Nous-Hermes-2-Mixtral-8x7B-SFT/448fda35-bfdc-42ae-90f9-d44383e0a454.json b/data/hfopenllm_v2/NousResearch/Nous-Hermes-2-Mixtral-8x7B-SFT/448fda35-bfdc-42ae-90f9-d44383e0a454.json deleted file mode 100644 index d14264647..000000000 --- a/data/hfopenllm_v2/NousResearch/Nous-Hermes-2-Mixtral-8x7B-SFT/448fda35-bfdc-42ae-90f9-d44383e0a454.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/NousResearch_Nous-Hermes-2-Mixtral-8x7B-SFT/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Nous-Hermes-2-Mixtral-8x7B-SFT", - "id": "NousResearch/Nous-Hermes-2-Mixtral-8x7B-SFT", - "developer": "NousResearch", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MixtralForCausalLM", - "params_billions": 46.703 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5731 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5058 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0211 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.302 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4214 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": 
false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3066 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/NousResearch/Nous-Hermes-2-SOLAR-10.7B/0d97542e-82b6-4f27-9822-62b67e7690c2.json b/data/hfopenllm_v2/NousResearch/Nous-Hermes-2-SOLAR-10.7B/0d97542e-82b6-4f27-9822-62b67e7690c2.json deleted file mode 100644 index d0f799596..000000000 --- a/data/hfopenllm_v2/NousResearch/Nous-Hermes-2-SOLAR-10.7B/0d97542e-82b6-4f27-9822-62b67e7690c2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/NousResearch_Nous-Hermes-2-SOLAR-10.7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Nous-Hermes-2-SOLAR-10.7B", - "id": "NousResearch/Nous-Hermes-2-SOLAR-10.7B", - "developer": "NousResearch", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 10.732 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5279 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5414 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0574 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2936 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4373 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3458 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/NousResearch/Nous-Hermes-llama-2-7b/2725bd69-839d-4427-8e05-0e289fff70de.json b/data/hfopenllm_v2/NousResearch/Nous-Hermes-llama-2-7b/2725bd69-839d-4427-8e05-0e289fff70de.json deleted file mode 100644 index 07b68eec5..000000000 --- a/data/hfopenllm_v2/NousResearch/Nous-Hermes-llama-2-7b/2725bd69-839d-4427-8e05-0e289fff70de.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/NousResearch_Nous-Hermes-llama-2-7b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Nous-Hermes-llama-2-7b", - "id": "NousResearch/Nous-Hermes-llama-2-7b", - "developer": "NousResearch", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 6.738 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1729 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3824 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0091 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2634 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4257 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.194 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/NousResearch/Yarn-Llama-2-13b-128k/adb71488-adb8-4848-bf1d-aecd04cb6718.json b/data/hfopenllm_v2/NousResearch/Yarn-Llama-2-13b-128k/adb71488-adb8-4848-bf1d-aecd04cb6718.json deleted file mode 
100644 index d612c4d8c..000000000 --- a/data/hfopenllm_v2/NousResearch/Yarn-Llama-2-13b-128k/adb71488-adb8-4848-bf1d-aecd04cb6718.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/NousResearch_Yarn-Llama-2-13b-128k/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Yarn-Llama-2-13b-128k", - "id": "NousResearch/Yarn-Llama-2-13b-128k", - "developer": "NousResearch", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 13.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1655 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3827 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0174 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2584 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3458 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.232 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/NousResearch/Yarn-Llama-2-7b-128k/c7736577-c4c3-4233-9308-a4bb9b2dbb89.json b/data/hfopenllm_v2/NousResearch/Yarn-Llama-2-7b-128k/c7736577-c4c3-4233-9308-a4bb9b2dbb89.json deleted file mode 100644 index ca3a9ae2e..000000000 --- a/data/hfopenllm_v2/NousResearch/Yarn-Llama-2-7b-128k/c7736577-c4c3-4233-9308-a4bb9b2dbb89.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/NousResearch_Yarn-Llama-2-7b-128k/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Yarn-Llama-2-7b-128k", - "id": "NousResearch/Yarn-Llama-2-7b-128k", - "developer": "NousResearch", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 7.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1485 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3248 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0151 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2601 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3967 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1791 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/NousResearch/Yarn-Llama-2-7b-64k/76fe52f4-9fa5-4ccb-8c92-7bd9eb9886ee.json b/data/hfopenllm_v2/NousResearch/Yarn-Llama-2-7b-64k/76fe52f4-9fa5-4ccb-8c92-7bd9eb9886ee.json deleted file mode 100644 index a38d9e7b9..000000000 --- a/data/hfopenllm_v2/NousResearch/Yarn-Llama-2-7b-64k/76fe52f4-9fa5-4ccb-8c92-7bd9eb9886ee.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/NousResearch_Yarn-Llama-2-7b-64k/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - 
"source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Yarn-Llama-2-7b-64k", - "id": "NousResearch/Yarn-Llama-2-7b-64k", - "developer": "NousResearch", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 7.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.17 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3326 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0159 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2643 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3939 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1799 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/NousResearch/Yarn-Mistral-7b-128k/1d92e45f-c5a5-4dd6-a61f-8e0f7246117a.json b/data/hfopenllm_v2/NousResearch/Yarn-Mistral-7b-128k/1d92e45f-c5a5-4dd6-a61f-8e0f7246117a.json deleted file mode 100644 index 8725d08a6..000000000 --- a/data/hfopenllm_v2/NousResearch/Yarn-Mistral-7b-128k/1d92e45f-c5a5-4dd6-a61f-8e0f7246117a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/NousResearch_Yarn-Mistral-7b-128k/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Yarn-Mistral-7b-128k", - "id": "NousResearch/Yarn-Mistral-7b-128k", - "developer": "NousResearch", - 
"inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1934 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4314 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0317 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2987 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4071 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2893 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/NousResearch/Yarn-Mistral-7b-64k/5e1513f1-4375-4380-85fa-b96a419c013b.json b/data/hfopenllm_v2/NousResearch/Yarn-Mistral-7b-64k/5e1513f1-4375-4380-85fa-b96a419c013b.json deleted file mode 100644 index 6002e3e56..000000000 --- a/data/hfopenllm_v2/NousResearch/Yarn-Mistral-7b-64k/5e1513f1-4375-4380-85fa-b96a419c013b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/NousResearch_Yarn-Mistral-7b-64k/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Yarn-Mistral-7b-64k", - "id": "NousResearch/Yarn-Mistral-7b-64k", - "developer": "NousResearch", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - 
"source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.208 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4293 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.037 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2903 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4124 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2914 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/NousResearch/Yarn-Solar-10b-32k/fadbf3b2-283a-4f8e-9acf-463d75924b97.json b/data/hfopenllm_v2/NousResearch/Yarn-Solar-10b-32k/fadbf3b2-283a-4f8e-9acf-463d75924b97.json deleted file mode 100644 index a3f6cc8a8..000000000 --- a/data/hfopenllm_v2/NousResearch/Yarn-Solar-10b-32k/fadbf3b2-283a-4f8e-9acf-463d75924b97.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/NousResearch_Yarn-Solar-10b-32k/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Yarn-Solar-10b-32k", - "id": "NousResearch/Yarn-Solar-10b-32k", - "developer": "NousResearch", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 10.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", 
- "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1942 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4987 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0302 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3029 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4146 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3272 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/NousResearch/Yarn-Solar-10b-64k/c04ffe5b-c313-4249-83bb-bbe07ad6fc69.json b/data/hfopenllm_v2/NousResearch/Yarn-Solar-10b-64k/c04ffe5b-c313-4249-83bb-bbe07ad6fc69.json deleted file mode 100644 index 6b8acf7a6..000000000 --- a/data/hfopenllm_v2/NousResearch/Yarn-Solar-10b-64k/c04ffe5b-c313-4249-83bb-bbe07ad6fc69.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/NousResearch_Yarn-Solar-10b-64k/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Yarn-Solar-10b-64k", - "id": "NousResearch/Yarn-Solar-10b-64k", - "developer": "NousResearch", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 10.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1989 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4922 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0287 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.302 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4014 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3148 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Novaciano/ASTAROTH-3.2-1B/a9aa164e-386b-4987-9f49-2dde64ade45c.json b/data/hfopenllm_v2/Novaciano/ASTAROTH-3.2-1B/a9aa164e-386b-4987-9f49-2dde64ade45c.json deleted file mode 100644 index da6642839..000000000 --- a/data/hfopenllm_v2/Novaciano/ASTAROTH-3.2-1B/a9aa164e-386b-4987-9f49-2dde64ade45c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Novaciano_ASTAROTH-3.2-1B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ASTAROTH-3.2-1B", - "id": "Novaciano/ASTAROTH-3.2-1B", - "developer": "Novaciano", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.498 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5613 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3543 - } - }, - { - "evaluation_name": "MATH Level 5", - 
"source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0733 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2559 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3142 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1909 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Novaciano/BLAST_PROCESSING-3.2-1B/e4c1b3ef-e1db-4eca-b818-f3b1680cc5f0.json b/data/hfopenllm_v2/Novaciano/BLAST_PROCESSING-3.2-1B/e4c1b3ef-e1db-4eca-b818-f3b1680cc5f0.json deleted file mode 100644 index 3f01feccd..000000000 --- a/data/hfopenllm_v2/Novaciano/BLAST_PROCESSING-3.2-1B/e4c1b3ef-e1db-4eca-b818-f3b1680cc5f0.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Novaciano_BLAST_PROCESSING-3.2-1B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "BLAST_PROCESSING-3.2-1B", - "id": "Novaciano/BLAST_PROCESSING-3.2-1B", - "developer": "Novaciano", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.498 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3922 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.346 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0748 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2659 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3351 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1941 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Novaciano/Cerberus-3.2-1B/1ab95edc-ea3c-4d3f-9f59-dc7f7468adb9.json b/data/hfopenllm_v2/Novaciano/Cerberus-3.2-1B/1ab95edc-ea3c-4d3f-9f59-dc7f7468adb9.json deleted file mode 100644 index 850ea6a63..000000000 --- a/data/hfopenllm_v2/Novaciano/Cerberus-3.2-1B/1ab95edc-ea3c-4d3f-9f59-dc7f7468adb9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Novaciano_Cerberus-3.2-1B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Cerberus-3.2-1B", - "id": "Novaciano/Cerberus-3.2-1B", - "developer": "Novaciano", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.236 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5017 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4165 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0582 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": 
"Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2584 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3289 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1663 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Novaciano/Cultist-3.2-1B/80a81bbc-6edf-48b9-afb7-e4e0a03753d8.json b/data/hfopenllm_v2/Novaciano/Cultist-3.2-1B/80a81bbc-6edf-48b9-afb7-e4e0a03753d8.json deleted file mode 100644 index 25acd6b95..000000000 --- a/data/hfopenllm_v2/Novaciano/Cultist-3.2-1B/80a81bbc-6edf-48b9-afb7-e4e0a03753d8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Novaciano_Cultist-3.2-1B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Cultist-3.2-1B", - "id": "Novaciano/Cultist-3.2-1B", - "developer": "Novaciano", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.498 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5295 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3399 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0589 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2609 - } - }, - { - "evaluation_name": "MUSR", - 
"source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.333 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1714 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Novaciano/FuseChat-3.2-1B-GRPO_Creative_RP/afb24bf8-3c47-4278-9b84-19b05017745b.json b/data/hfopenllm_v2/Novaciano/FuseChat-3.2-1B-GRPO_Creative_RP/afb24bf8-3c47-4278-9b84-19b05017745b.json deleted file mode 100644 index cfdd2f4d5..000000000 --- a/data/hfopenllm_v2/Novaciano/FuseChat-3.2-1B-GRPO_Creative_RP/afb24bf8-3c47-4278-9b84-19b05017745b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Novaciano_FuseChat-3.2-1B-GRPO_Creative_RP/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "FuseChat-3.2-1B-GRPO_Creative_RP", - "id": "Novaciano/FuseChat-3.2-1B-GRPO_Creative_RP", - "developer": "Novaciano", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.236 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5598 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3488 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0801 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2559 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on 
MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3329 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1735 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Novaciano/Fusetrix-3.2-1B-GRPO_RP_Creative/4f8cda4d-959b-41ab-a79d-d2b35968eb89.json b/data/hfopenllm_v2/Novaciano/Fusetrix-3.2-1B-GRPO_RP_Creative/4f8cda4d-959b-41ab-a79d-d2b35968eb89.json deleted file mode 100644 index 7acb2bdcc..000000000 --- a/data/hfopenllm_v2/Novaciano/Fusetrix-3.2-1B-GRPO_RP_Creative/4f8cda4d-959b-41ab-a79d-d2b35968eb89.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Novaciano_Fusetrix-3.2-1B-GRPO_RP_Creative/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Fusetrix-3.2-1B-GRPO_RP_Creative", - "id": "Novaciano/Fusetrix-3.2-1B-GRPO_RP_Creative", - "developer": "Novaciano", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.236 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5366 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3435 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1148 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.25 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3209 - } - }, - { - 
"evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1758 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Novaciano/Fusetrix-Dolphin-3.2-1B-GRPO_Creative_RP/2818aa8c-5c73-4de9-bcbe-fd8f68e8bc6b.json b/data/hfopenllm_v2/Novaciano/Fusetrix-Dolphin-3.2-1B-GRPO_Creative_RP/2818aa8c-5c73-4de9-bcbe-fd8f68e8bc6b.json deleted file mode 100644 index 4b1fc2436..000000000 --- a/data/hfopenllm_v2/Novaciano/Fusetrix-Dolphin-3.2-1B-GRPO_Creative_RP/2818aa8c-5c73-4de9-bcbe-fd8f68e8bc6b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Novaciano_Fusetrix-Dolphin-3.2-1B-GRPO_Creative_RP/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Fusetrix-Dolphin-3.2-1B-GRPO_Creative_RP", - "id": "Novaciano/Fusetrix-Dolphin-3.2-1B-GRPO_Creative_RP", - "developer": "Novaciano", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.236 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5343 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3502 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.105 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2685 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3183 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - 
"hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1823 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Novaciano/HarmfulProject-3.2-1B/6a683ead-0f3e-449b-9ae1-8afc9f1ab33d.json b/data/hfopenllm_v2/Novaciano/HarmfulProject-3.2-1B/6a683ead-0f3e-449b-9ae1-8afc9f1ab33d.json deleted file mode 100644 index 2b69d3c05..000000000 --- a/data/hfopenllm_v2/Novaciano/HarmfulProject-3.2-1B/6a683ead-0f3e-449b-9ae1-8afc9f1ab33d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Novaciano_HarmfulProject-3.2-1B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "HarmfulProject-3.2-1B", - "id": "Novaciano/HarmfulProject-3.2-1B", - "developer": "Novaciano", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.498 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3874 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3274 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0476 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2668 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3419 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.1823 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Novaciano/LEWD-Mental-Cultist-3.2-1B/38cb02a8-862d-40e1-922a-e65f537df87e.json b/data/hfopenllm_v2/Novaciano/LEWD-Mental-Cultist-3.2-1B/38cb02a8-862d-40e1-922a-e65f537df87e.json deleted file mode 100644 index c5ba6cc4d..000000000 --- a/data/hfopenllm_v2/Novaciano/LEWD-Mental-Cultist-3.2-1B/38cb02a8-862d-40e1-922a-e65f537df87e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Novaciano_LEWD-Mental-Cultist-3.2-1B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "LEWD-Mental-Cultist-3.2-1B", - "id": "Novaciano/LEWD-Mental-Cultist-3.2-1B", - "developer": "Novaciano", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.498 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5309 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3513 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0529 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2567 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3223 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1769 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Novaciano/La_Mejor_Mezcla-3.2-1B/f816e2a7-2629-4abe-9ed0-3d1299e95194.json 
b/data/hfopenllm_v2/Novaciano/La_Mejor_Mezcla-3.2-1B/f816e2a7-2629-4abe-9ed0-3d1299e95194.json deleted file mode 100644 index 6d6061a90..000000000 --- a/data/hfopenllm_v2/Novaciano/La_Mejor_Mezcla-3.2-1B/f816e2a7-2629-4abe-9ed0-3d1299e95194.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Novaciano_La_Mejor_Mezcla-3.2-1B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "La_Mejor_Mezcla-3.2-1B", - "id": "Novaciano/La_Mejor_Mezcla-3.2-1B", - "developer": "Novaciano", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.498 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.551 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3488 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0899 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2576 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3196 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1829 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Novaciano/Sigil-Of-Satan-3.2-1B/286fae5b-544a-4033-9092-d633fc80f47b.json b/data/hfopenllm_v2/Novaciano/Sigil-Of-Satan-3.2-1B/286fae5b-544a-4033-9092-d633fc80f47b.json deleted file mode 100644 index b801b00b1..000000000 --- 
a/data/hfopenllm_v2/Novaciano/Sigil-Of-Satan-3.2-1B/286fae5b-544a-4033-9092-d633fc80f47b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Novaciano_Sigil-Of-Satan-3.2-1B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Sigil-Of-Satan-3.2-1B", - "id": "Novaciano/Sigil-Of-Satan-3.2-1B", - "developer": "Novaciano", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.498 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5494 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3546 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0544 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2609 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3276 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1855 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/NucleusAI/nucleus-22B-token-500B/93477bf6-ea00-418b-8a2f-975a9554263e.json b/data/hfopenllm_v2/NucleusAI/nucleus-22B-token-500B/93477bf6-ea00-418b-8a2f-975a9554263e.json deleted file mode 100644 index 9e4f3e608..000000000 --- a/data/hfopenllm_v2/NucleusAI/nucleus-22B-token-500B/93477bf6-ea00-418b-8a2f-975a9554263e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/NucleusAI_nucleus-22B-token-500B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "nucleus-22B-token-500B", - "id": "NucleusAI/nucleus-22B-token-500B", - "developer": "NucleusAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 21.828 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0257 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.292 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.25 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3511 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1162 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/NyxKrage/Microsoft_Phi-4/3d7c6576-f99c-4bb3-94fa-4f713e2898f6.json b/data/hfopenllm_v2/NyxKrage/Microsoft_Phi-4/3d7c6576-f99c-4bb3-94fa-4f713e2898f6.json deleted file mode 100644 index a36e190d6..000000000 --- a/data/hfopenllm_v2/NyxKrage/Microsoft_Phi-4/3d7c6576-f99c-4bb3-94fa-4f713e2898f6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/NyxKrage_Microsoft_Phi-4/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - 
"evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Microsoft_Phi-4", - "id": "NyxKrage/Microsoft_Phi-4", - "developer": "NyxKrage", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Phi3ForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0585 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6691 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2991 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.406 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5034 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5287 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/OEvortex/Emotional-llama-8B/d1e9a242-941f-4461-b75b-7043c2c01ef7.json b/data/hfopenllm_v2/OEvortex/Emotional-llama-8B/d1e9a242-941f-4461-b75b-7043c2c01ef7.json deleted file mode 100644 index 66b1bbf40..000000000 --- a/data/hfopenllm_v2/OEvortex/Emotional-llama-8B/d1e9a242-941f-4461-b75b-7043c2c01ef7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/OEvortex_Emotional-llama-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Emotional-llama-8B", - "id": "OEvortex/Emotional-llama-8B", - "developer": "OEvortex", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": 
"LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3516 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4839 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0816 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2945 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3659 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3535 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/OEvortex/HelpingAI-15B/e39661af-ad93-41d7-8892-1230064f1a1c.json b/data/hfopenllm_v2/OEvortex/HelpingAI-15B/e39661af-ad93-41d7-8892-1230064f1a1c.json deleted file mode 100644 index 58003624d..000000000 --- a/data/hfopenllm_v2/OEvortex/HelpingAI-15B/e39661af-ad93-41d7-8892-1230064f1a1c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/OEvortex_HelpingAI-15B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "HelpingAI-15B", - "id": "OEvortex/HelpingAI-15B", - "developer": "OEvortex", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 15.323 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on 
IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.203 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2936 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2576 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3619 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1111 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/OEvortex/HelpingAI-3B-reloaded/595b61b2-5220-48f6-91a0-3aa0d37c63d8.json b/data/hfopenllm_v2/OEvortex/HelpingAI-3B-reloaded/595b61b2-5220-48f6-91a0-3aa0d37c63d8.json deleted file mode 100644 index c50a519bb..000000000 --- a/data/hfopenllm_v2/OEvortex/HelpingAI-3B-reloaded/595b61b2-5220-48f6-91a0-3aa0d37c63d8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/OEvortex_HelpingAI-3B-reloaded/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "HelpingAI-3B-reloaded", - "id": "OEvortex/HelpingAI-3B-reloaded", - "developer": "OEvortex", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 2.81 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4647 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - 
"source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4129 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0136 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2634 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3524 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2595 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/OEvortex/HelpingAI2-9B/3173263e-2a42-4e8d-956e-8175ef464e76.json b/data/hfopenllm_v2/OEvortex/HelpingAI2-9B/3173263e-2a42-4e8d-956e-8175ef464e76.json deleted file mode 100644 index daf836ff0..000000000 --- a/data/hfopenllm_v2/OEvortex/HelpingAI2-9B/3173263e-2a42-4e8d-956e-8175ef464e76.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/OEvortex_HelpingAI2-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "HelpingAI2-9B", - "id": "OEvortex/HelpingAI2-9B", - "developer": "OEvortex", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.903 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4413 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4845 - } - }, - { - 
"evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0589 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2584 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3711 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.29 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/OEvortex/HelpingAI2.5-10B/f77f8291-1573-4fb6-a984-1cc099c09621.json b/data/hfopenllm_v2/OEvortex/HelpingAI2.5-10B/f77f8291-1573-4fb6-a984-1cc099c09621.json deleted file mode 100644 index e0efa3c20..000000000 --- a/data/hfopenllm_v2/OEvortex/HelpingAI2.5-10B/f77f8291-1573-4fb6-a984-1cc099c09621.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/OEvortex_HelpingAI2.5-10B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "HelpingAI2.5-10B", - "id": "OEvortex/HelpingAI2.5-10B", - "developer": "OEvortex", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 10.211 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3277 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4496 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0204 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2693 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3738 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2575 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/OliveiraJLT/Sagui-7B-Instruct-v0.1/c4681e14-513c-4e5e-af8c-88ca11849176.json b/data/hfopenllm_v2/OliveiraJLT/Sagui-7B-Instruct-v0.1/c4681e14-513c-4e5e-af8c-88ca11849176.json deleted file mode 100644 index 1f0b31152..000000000 --- a/data/hfopenllm_v2/OliveiraJLT/Sagui-7B-Instruct-v0.1/c4681e14-513c-4e5e-af8c-88ca11849176.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/OliveiraJLT_Sagui-7B-Instruct-v0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Sagui-7B-Instruct-v0.1", - "id": "OliveiraJLT/Sagui-7B-Instruct-v0.1", - "developer": "OliveiraJLT", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 6.738 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2892 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3111 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0151 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": 
"GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2424 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4191 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1485 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Omkar1102/code-yi/0c220edd-2563-4fec-99a4-ef8c210ca5ce.json b/data/hfopenllm_v2/Omkar1102/code-yi/0c220edd-2563-4fec-99a4-ef8c210ca5ce.json deleted file mode 100644 index 032bf1e35..000000000 --- a/data/hfopenllm_v2/Omkar1102/code-yi/0c220edd-2563-4fec-99a4-ef8c210ca5ce.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Omkar1102_code-yi/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "code-yi", - "id": "Omkar1102/code-yi", - "developer": "Omkar1102", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 2.084 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2254 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.275 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2576 - } - }, - { - "evaluation_name": 
"MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3762 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1123 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Omkar1102/code-yi/bd7ef5a7-aa75-4eb4-8860-aec63f8bf9d1.json b/data/hfopenllm_v2/Omkar1102/code-yi/bd7ef5a7-aa75-4eb4-8860-aec63f8bf9d1.json deleted file mode 100644 index aa34a77d8..000000000 --- a/data/hfopenllm_v2/Omkar1102/code-yi/bd7ef5a7-aa75-4eb4-8860-aec63f8bf9d1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Omkar1102_code-yi/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "code-yi", - "id": "Omkar1102/code-yi", - "developer": "Omkar1102", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 2.084 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2148 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.276 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2508 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 
0.3802 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1126 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/OmnicromsBrain/NeuralStar_FusionWriter_4x7b/85c20522-03c0-4dac-a1c8-2945e4bf0e0e.json b/data/hfopenllm_v2/OmnicromsBrain/NeuralStar_FusionWriter_4x7b/85c20522-03c0-4dac-a1c8-2945e4bf0e0e.json deleted file mode 100644 index 0f861c536..000000000 --- a/data/hfopenllm_v2/OmnicromsBrain/NeuralStar_FusionWriter_4x7b/85c20522-03c0-4dac-a1c8-2945e4bf0e0e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/OmnicromsBrain_NeuralStar_FusionWriter_4x7b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "NeuralStar_FusionWriter_4x7b", - "id": "OmnicromsBrain/NeuralStar_FusionWriter_4x7b", - "developer": "OmnicromsBrain", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MixtralForCausalLM", - "params_billions": 24.154 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5964 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4776 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0491 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2785 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4019 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": 
"TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2606 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/OnlyCheeini/greesychat-turbo/f180fddd-077f-43f9-b2d9-38c5f33be44d.json b/data/hfopenllm_v2/OnlyCheeini/greesychat-turbo/f180fddd-077f-43f9-b2d9-38c5f33be44d.json deleted file mode 100644 index f63187b2d..000000000 --- a/data/hfopenllm_v2/OnlyCheeini/greesychat-turbo/f180fddd-077f-43f9-b2d9-38c5f33be44d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/OnlyCheeini_greesychat-turbo/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "greesychat-turbo", - "id": "OnlyCheeini/greesychat-turbo", - "developer": "OnlyCheeini", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0233 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3092 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2601 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3314 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1138 - } - } - ] -} \ 
No newline at end of file diff --git a/data/hfopenllm_v2/Open-Orca/Mistral-7B-OpenOrca/ef384329-8406-4767-ac1a-3eba3131f726.json b/data/hfopenllm_v2/Open-Orca/Mistral-7B-OpenOrca/ef384329-8406-4767-ac1a-3eba3131f726.json deleted file mode 100644 index db0c0f414..000000000 --- a/data/hfopenllm_v2/Open-Orca/Mistral-7B-OpenOrca/ef384329-8406-4767-ac1a-3eba3131f726.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Open-Orca_Mistral-7B-OpenOrca/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral-7B-OpenOrca", - "id": "Open-Orca/Mistral-7B-OpenOrca", - "developer": "Open-Orca", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4978 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4768 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0355 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2718 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3858 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2653 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/OpenAssistant/oasst-sft-1-pythia-12b/2ddeae27-77d3-413c-a6e1-9de0f3980c4e.json b/data/hfopenllm_v2/OpenAssistant/oasst-sft-1-pythia-12b/2ddeae27-77d3-413c-a6e1-9de0f3980c4e.json deleted file 
mode 100644 index db24eb0bb..000000000 --- a/data/hfopenllm_v2/OpenAssistant/oasst-sft-1-pythia-12b/2ddeae27-77d3-413c-a6e1-9de0f3980c4e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/OpenAssistant_oasst-sft-1-pythia-12b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "oasst-sft-1-pythia-12b", - "id": "OpenAssistant/oasst-sft-1-pythia-12b", - "developer": "OpenAssistant", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "GPTNeoXForCausalLM", - "params_billions": 12.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1055 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3147 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0151 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2576 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3327 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1113 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/OpenBuddy/openbuddy-falcon3-10b-v24.2-131k/38b2dbbe-be86-4ef0-a39b-89841f662141.json b/data/hfopenllm_v2/OpenBuddy/openbuddy-falcon3-10b-v24.2-131k/38b2dbbe-be86-4ef0-a39b-89841f662141.json deleted file mode 100644 index 133f23b3d..000000000 --- a/data/hfopenllm_v2/OpenBuddy/openbuddy-falcon3-10b-v24.2-131k/38b2dbbe-be86-4ef0-a39b-89841f662141.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - 
"schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/OpenBuddy_openbuddy-falcon3-10b-v24.2-131k/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "openbuddy-falcon3-10b-v24.2-131k", - "id": "OpenBuddy/openbuddy-falcon3-10b-v24.2-131k", - "developer": "OpenBuddy", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 10.34 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5086 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6004 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.213 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2995 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4186 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3834 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/OpenBuddy/openbuddy-llama3-70b-v21.2-32k/999a8091-22bd-4c08-bee1-772202e7edde.json b/data/hfopenllm_v2/OpenBuddy/openbuddy-llama3-70b-v21.2-32k/999a8091-22bd-4c08-bee1-772202e7edde.json deleted file mode 100644 index e76e849ca..000000000 --- a/data/hfopenllm_v2/OpenBuddy/openbuddy-llama3-70b-v21.2-32k/999a8091-22bd-4c08-bee1-772202e7edde.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/OpenBuddy_openbuddy-llama3-70b-v21.2-32k/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - 
"source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "openbuddy-llama3-70b-v21.2-32k", - "id": "OpenBuddy/openbuddy-llama3-70b-v21.2-32k", - "developer": "OpenBuddy", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 70.554 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.701 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6507 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2032 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3423 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.458 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4832 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/OpenBuddy/openbuddy-llama3-8b-v21.1-8k/fda91d98-d259-430c-929b-78852cab64ec.json b/data/hfopenllm_v2/OpenBuddy/openbuddy-llama3-8b-v21.1-8k/fda91d98-d259-430c-929b-78852cab64ec.json deleted file mode 100644 index 38eedcf18..000000000 --- a/data/hfopenllm_v2/OpenBuddy/openbuddy-llama3-8b-v21.1-8k/fda91d98-d259-430c-929b-78852cab64ec.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/OpenBuddy_openbuddy-llama3-8b-v21.1-8k/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - 
"model_info": { - "name": "openbuddy-llama3-8b-v21.1-8k", - "id": "OpenBuddy/openbuddy-llama3-8b-v21.1-8k", - "developer": "OpenBuddy", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.557 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4788 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0431 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.271 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3988 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2955 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/OpenBuddy/openbuddy-llama3-8b-v21.2-32k/535bfa4f-ab63-4832-9f17-7b245ff2b2af.json b/data/hfopenllm_v2/OpenBuddy/openbuddy-llama3-8b-v21.2-32k/535bfa4f-ab63-4832-9f17-7b245ff2b2af.json deleted file mode 100644 index abbc1f40c..000000000 --- a/data/hfopenllm_v2/OpenBuddy/openbuddy-llama3-8b-v21.2-32k/535bfa4f-ab63-4832-9f17-7b245ff2b2af.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/OpenBuddy_openbuddy-llama3-8b-v21.2-32k/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "openbuddy-llama3-8b-v21.2-32k", - "id": "OpenBuddy/openbuddy-llama3-8b-v21.2-32k", - "developer": "OpenBuddy", - "inference_platform": "unknown", - 
"additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6192 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4856 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0785 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2794 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3779 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3299 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/OpenBuddy/openbuddy-llama3.1-70b-v22.1-131k/681a6cc5-5519-4b13-8b50-93adcab4a3f7.json b/data/hfopenllm_v2/OpenBuddy/openbuddy-llama3.1-70b-v22.1-131k/681a6cc5-5519-4b13-8b50-93adcab4a3f7.json deleted file mode 100644 index e126a1c30..000000000 --- a/data/hfopenllm_v2/OpenBuddy/openbuddy-llama3.1-70b-v22.1-131k/681a6cc5-5519-4b13-8b50-93adcab4a3f7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/OpenBuddy_openbuddy-llama3.1-70b-v22.1-131k/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "openbuddy-llama3.1-70b-v22.1-131k", - "id": "OpenBuddy/openbuddy-llama3.1-70b-v22.1-131k", - "developer": "OpenBuddy", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 70.554 - } - }, - "evaluation_results": [ - { - 
"evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7333 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6698 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.395 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.375 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.463 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5304 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/OpenBuddy/openbuddy-llama3.1-8b-v22.2-131k/141dd12c-6901-4a96-a051-f35647ddcc73.json b/data/hfopenllm_v2/OpenBuddy/openbuddy-llama3.1-8b-v22.2-131k/141dd12c-6901-4a96-a051-f35647ddcc73.json deleted file mode 100644 index c11010c3b..000000000 --- a/data/hfopenllm_v2/OpenBuddy/openbuddy-llama3.1-8b-v22.2-131k/141dd12c-6901-4a96-a051-f35647ddcc73.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/OpenBuddy_openbuddy-llama3.1-8b-v22.2-131k/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "openbuddy-llama3.1-8b-v22.2-131k", - "id": "OpenBuddy/openbuddy-llama3.1-8b-v22.2-131k", - "developer": "OpenBuddy", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6657 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5007 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1148 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2794 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4081 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.331 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/OpenBuddy/openbuddy-llama3.1-8b-v22.3-131k/5b095779-aacc-41f3-9a3f-83f64a1c0d4c.json b/data/hfopenllm_v2/OpenBuddy/openbuddy-llama3.1-8b-v22.3-131k/5b095779-aacc-41f3-9a3f-83f64a1c0d4c.json deleted file mode 100644 index e33abbdfc..000000000 --- a/data/hfopenllm_v2/OpenBuddy/openbuddy-llama3.1-8b-v22.3-131k/5b095779-aacc-41f3-9a3f-83f64a1c0d4c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/OpenBuddy_openbuddy-llama3.1-8b-v22.3-131k/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "openbuddy-llama3.1-8b-v22.3-131k", - "id": "OpenBuddy/openbuddy-llama3.1-8b-v22.3-131k", - "developer": "OpenBuddy", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { 
- "score": 0.5997 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5066 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1208 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2794 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4015 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3277 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/OpenBuddy/openbuddy-llama3.2-1b-v23.1-131k/7a88c95a-b253-4f36-8fde-1b0158bbf0b6.json b/data/hfopenllm_v2/OpenBuddy/openbuddy-llama3.2-1b-v23.1-131k/7a88c95a-b253-4f36-8fde-1b0158bbf0b6.json deleted file mode 100644 index 55a086e2b..000000000 --- a/data/hfopenllm_v2/OpenBuddy/openbuddy-llama3.2-1b-v23.1-131k/7a88c95a-b253-4f36-8fde-1b0158bbf0b6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/OpenBuddy_openbuddy-llama3.2-1b-v23.1-131k/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "openbuddy-llama3.2-1b-v23.1-131k", - "id": "OpenBuddy/openbuddy-llama3.2-1b-v23.1-131k", - "developer": "OpenBuddy", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.498 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.359 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, 
- "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3267 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0249 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2584 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3342 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.184 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/OpenBuddy/openbuddy-llama3.2-3b-v23.2-131k/7938a00e-4e11-4223-a900-fa53df168ab7.json b/data/hfopenllm_v2/OpenBuddy/openbuddy-llama3.2-3b-v23.2-131k/7938a00e-4e11-4223-a900-fa53df168ab7.json deleted file mode 100644 index c20150989..000000000 --- a/data/hfopenllm_v2/OpenBuddy/openbuddy-llama3.2-3b-v23.2-131k/7938a00e-4e11-4223-a900-fa53df168ab7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/OpenBuddy_openbuddy-llama3.2-3b-v23.2-131k/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "openbuddy-llama3.2-3b-v23.2-131k", - "id": "OpenBuddy/openbuddy-llama3.2-3b-v23.2-131k", - "developer": "OpenBuddy", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.607 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4319 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, 
- "score_details": { - "score": 0.4073 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0264 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.276 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3263 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2479 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/OpenBuddy/openbuddy-llama3.3-70b-v24.1-131k/8f966b4e-1baf-445f-9f10-4ba6b47aaf9b.json b/data/hfopenllm_v2/OpenBuddy/openbuddy-llama3.3-70b-v24.1-131k/8f966b4e-1baf-445f-9f10-4ba6b47aaf9b.json deleted file mode 100644 index 97d54e843..000000000 --- a/data/hfopenllm_v2/OpenBuddy/openbuddy-llama3.3-70b-v24.1-131k/8f966b4e-1baf-445f-9f10-4ba6b47aaf9b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/OpenBuddy_openbuddy-llama3.3-70b-v24.1-131k/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "openbuddy-llama3.3-70b-v24.1-131k", - "id": "OpenBuddy/openbuddy-llama3.3-70b-v24.1-131k", - "developer": "OpenBuddy", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 70.554 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8121 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6858 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": 
"hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4411 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4346 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4869 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5327 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/OpenBuddy/openbuddy-mixtral-7bx8-v18.1-32k/a334d998-21a5-4108-96e3-9935507a9f8f.json b/data/hfopenllm_v2/OpenBuddy/openbuddy-mixtral-7bx8-v18.1-32k/a334d998-21a5-4108-96e3-9935507a9f8f.json deleted file mode 100644 index 25e129f68..000000000 --- a/data/hfopenllm_v2/OpenBuddy/openbuddy-mixtral-7bx8-v18.1-32k/a334d998-21a5-4108-96e3-9935507a9f8f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/OpenBuddy_openbuddy-mixtral-7bx8-v18.1-32k/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "openbuddy-mixtral-7bx8-v18.1-32k", - "id": "OpenBuddy/openbuddy-mixtral-7bx8-v18.1-32k", - "developer": "OpenBuddy", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MixtralForCausalLM", - "params_billions": 46.741 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5493 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4656 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.108 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3045 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3831 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3804 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/OpenBuddy/openbuddy-nemotron-70b-v23.1-131k/941e27c6-81da-4ce1-b1c8-544c1426cd11.json b/data/hfopenllm_v2/OpenBuddy/openbuddy-nemotron-70b-v23.1-131k/941e27c6-81da-4ce1-b1c8-544c1426cd11.json deleted file mode 100644 index 83c193063..000000000 --- a/data/hfopenllm_v2/OpenBuddy/openbuddy-nemotron-70b-v23.1-131k/941e27c6-81da-4ce1-b1c8-544c1426cd11.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/OpenBuddy_openbuddy-nemotron-70b-v23.1-131k/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "openbuddy-nemotron-70b-v23.1-131k", - "id": "OpenBuddy/openbuddy-nemotron-70b-v23.1-131k", - "developer": "OpenBuddy", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 70.554 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7555 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6749 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.321 - } - }, - { - 
"evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3633 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4538 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5175 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/OpenBuddy/openbuddy-nemotron-70b-v23.2-131k/e409a374-685b-482d-82e4-2436dca37309.json b/data/hfopenllm_v2/OpenBuddy/openbuddy-nemotron-70b-v23.2-131k/e409a374-685b-482d-82e4-2436dca37309.json deleted file mode 100644 index d6fb1a342..000000000 --- a/data/hfopenllm_v2/OpenBuddy/openbuddy-nemotron-70b-v23.2-131k/e409a374-685b-482d-82e4-2436dca37309.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/OpenBuddy_openbuddy-nemotron-70b-v23.2-131k/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "openbuddy-nemotron-70b-v23.2-131k", - "id": "OpenBuddy/openbuddy-nemotron-70b-v23.2-131k", - "developer": "OpenBuddy", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 70.554 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7227 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6705 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3157 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3599 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4696 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5121 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/OpenBuddy/openbuddy-qwen2.5llamaify-14b-v23.1-200k/84713625-97b6-4fad-982d-41b5c500d73a.json b/data/hfopenllm_v2/OpenBuddy/openbuddy-qwen2.5llamaify-14b-v23.1-200k/84713625-97b6-4fad-982d-41b5c500d73a.json deleted file mode 100644 index d8cf89afa..000000000 --- a/data/hfopenllm_v2/OpenBuddy/openbuddy-qwen2.5llamaify-14b-v23.1-200k/84713625-97b6-4fad-982d-41b5c500d73a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/OpenBuddy_openbuddy-qwen2.5llamaify-14b-v23.1-200k/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "openbuddy-qwen2.5llamaify-14b-v23.1-200k", - "id": "OpenBuddy/openbuddy-qwen2.5llamaify-14b-v23.1-200k", - "developer": "OpenBuddy", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6309 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6013 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2538 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 
0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3331 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.424 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4673 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/OpenBuddy/openbuddy-qwen2.5llamaify-14b-v23.3-200k/b7edd9ab-a018-4b2f-9b01-b56cbe98abda.json b/data/hfopenllm_v2/OpenBuddy/openbuddy-qwen2.5llamaify-14b-v23.3-200k/b7edd9ab-a018-4b2f-9b01-b56cbe98abda.json deleted file mode 100644 index e24023046..000000000 --- a/data/hfopenllm_v2/OpenBuddy/openbuddy-qwen2.5llamaify-14b-v23.3-200k/b7edd9ab-a018-4b2f-9b01-b56cbe98abda.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/OpenBuddy_openbuddy-qwen2.5llamaify-14b-v23.3-200k/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "openbuddy-qwen2.5llamaify-14b-v23.3-200k", - "id": "OpenBuddy/openbuddy-qwen2.5llamaify-14b-v23.3-200k", - "developer": "OpenBuddy", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6131 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6081 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2311 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3272 - } - }, - { - "evaluation_name": "MUSR", - 
"source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4346 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4795 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/OpenBuddy/openbuddy-qwen2.5llamaify-7b-v23.1-200k/ec896115-21ef-4337-9fdd-32a04c574a05.json b/data/hfopenllm_v2/OpenBuddy/openbuddy-qwen2.5llamaify-7b-v23.1-200k/ec896115-21ef-4337-9fdd-32a04c574a05.json deleted file mode 100644 index 56c75d371..000000000 --- a/data/hfopenllm_v2/OpenBuddy/openbuddy-qwen2.5llamaify-7b-v23.1-200k/ec896115-21ef-4337-9fdd-32a04c574a05.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/OpenBuddy_openbuddy-qwen2.5llamaify-7b-v23.1-200k/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "openbuddy-qwen2.5llamaify-7b-v23.1-200k", - "id": "OpenBuddy/openbuddy-qwen2.5llamaify-7b-v23.1-200k", - "developer": "OpenBuddy", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 7.615 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5673 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5509 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1888 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3146 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": 
{ - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4363 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3948 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/OpenBuddy/openbuddy-qwq-32b-v24.1-200k/d8e5f49b-7bf3-41d4-a91e-c566219609f6.json b/data/hfopenllm_v2/OpenBuddy/openbuddy-qwq-32b-v24.1-200k/d8e5f49b-7bf3-41d4-a91e-c566219609f6.json deleted file mode 100644 index 45a21bb0f..000000000 --- a/data/hfopenllm_v2/OpenBuddy/openbuddy-qwq-32b-v24.1-200k/d8e5f49b-7bf3-41d4-a91e-c566219609f6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/OpenBuddy_openbuddy-qwq-32b-v24.1-200k/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "openbuddy-qwq-32b-v24.1-200k", - "id": "OpenBuddy/openbuddy-qwq-32b-v24.1-200k", - "developer": "OpenBuddy", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 32.764 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5937 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6798 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3739 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3809 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4849 - 
} - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.549 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/OpenBuddy/openbuddy-qwq-32b-v24.2-200k/ce1a92a3-6bec-410f-ab42-c567c5d23856.json b/data/hfopenllm_v2/OpenBuddy/openbuddy-qwq-32b-v24.2-200k/ce1a92a3-6bec-410f-ab42-c567c5d23856.json deleted file mode 100644 index 74da68126..000000000 --- a/data/hfopenllm_v2/OpenBuddy/openbuddy-qwq-32b-v24.2-200k/ce1a92a3-6bec-410f-ab42-c567c5d23856.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/OpenBuddy_openbuddy-qwq-32b-v24.2-200k/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "openbuddy-qwq-32b-v24.2-200k", - "id": "OpenBuddy/openbuddy-qwq-32b-v24.2-200k", - "developer": "OpenBuddy", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 32.764 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.597 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6772 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3776 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3767 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4718 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5446 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/OpenBuddy/openbuddy-yi1.5-34b-v21.3-32k/0a125470-b50f-4ca0-90dc-1f6b69c3ccd4.json b/data/hfopenllm_v2/OpenBuddy/openbuddy-yi1.5-34b-v21.3-32k/0a125470-b50f-4ca0-90dc-1f6b69c3ccd4.json deleted file mode 100644 index ee9bc7580..000000000 --- a/data/hfopenllm_v2/OpenBuddy/openbuddy-yi1.5-34b-v21.3-32k/0a125470-b50f-4ca0-90dc-1f6b69c3ccd4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/OpenBuddy_openbuddy-yi1.5-34b-v21.3-32k/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "openbuddy-yi1.5-34b-v21.3-32k", - "id": "OpenBuddy/openbuddy-yi1.5-34b-v21.3-32k", - "developer": "OpenBuddy", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 34.407 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.542 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6163 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1782 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.349 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4439 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 
0.4599 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/OpenBuddy/openbuddy-zero-14b-v22.3-32k/aeee0165-ac7e-4da6-8102-ba60f43587de.json b/data/hfopenllm_v2/OpenBuddy/openbuddy-zero-14b-v22.3-32k/aeee0165-ac7e-4da6-8102-ba60f43587de.json deleted file mode 100644 index fe9fb2d74..000000000 --- a/data/hfopenllm_v2/OpenBuddy/openbuddy-zero-14b-v22.3-32k/aeee0165-ac7e-4da6-8102-ba60f43587de.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/OpenBuddy_openbuddy-zero-14b-v22.3-32k/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "openbuddy-zero-14b-v22.3-32k", - "id": "OpenBuddy/openbuddy-zero-14b-v22.3-32k", - "developer": "OpenBuddy", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.022 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3753 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.486 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0937 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.307 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4166 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3187 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/OpenBuddy/openbuddy-zero-3b-v21.2-32k/b47b8666-2556-45df-ba5b-9a5e94186784.json 
b/data/hfopenllm_v2/OpenBuddy/openbuddy-zero-3b-v21.2-32k/b47b8666-2556-45df-ba5b-9a5e94186784.json deleted file mode 100644 index 8ba48e7a5..000000000 --- a/data/hfopenllm_v2/OpenBuddy/openbuddy-zero-3b-v21.2-32k/b47b8666-2556-45df-ba5b-9a5e94186784.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/OpenBuddy_openbuddy-zero-3b-v21.2-32k/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "openbuddy-zero-3b-v21.2-32k", - "id": "OpenBuddy/openbuddy-zero-3b-v21.2-32k", - "developer": "OpenBuddy", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 4.769 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3802 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3935 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0189 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2601 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3566 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2034 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/OpenBuddy/openbuddy-zero-56b-v21.2-32k/0bde5d57-39be-4497-a2a8-d08d3c8d65f4.json b/data/hfopenllm_v2/OpenBuddy/openbuddy-zero-56b-v21.2-32k/0bde5d57-39be-4497-a2a8-d08d3c8d65f4.json deleted file mode 100644 index bcede16a0..000000000 --- 
a/data/hfopenllm_v2/OpenBuddy/openbuddy-zero-56b-v21.2-32k/0bde5d57-39be-4497-a2a8-d08d3c8d65f4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/OpenBuddy_openbuddy-zero-56b-v21.2-32k/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "openbuddy-zero-56b-v21.2-32k", - "id": "OpenBuddy/openbuddy-zero-56b-v21.2-32k", - "developer": "OpenBuddy", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 56.707 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5057 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6128 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1624 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.318 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4305 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4399 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/OpenGenerativeAI/Bifrost-14B/86599961-3ec2-4837-89a4-809f1dd7226c.json b/data/hfopenllm_v2/OpenGenerativeAI/Bifrost-14B/86599961-3ec2-4837-89a4-809f1dd7226c.json deleted file mode 100644 index e930a3e87..000000000 --- a/data/hfopenllm_v2/OpenGenerativeAI/Bifrost-14B/86599961-3ec2-4837-89a4-809f1dd7226c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/OpenGenerativeAI_Bifrost-14B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Bifrost-14B", - "id": "OpenGenerativeAI/Bifrost-14B", - "developer": "OpenGenerativeAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6615 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6845 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2356 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3792 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4624 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5074 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/OpenGenerativeAI/Bifrost/dc3ca25e-41b2-4206-afaa-7d2d10fd27a7.json b/data/hfopenllm_v2/OpenGenerativeAI/Bifrost/dc3ca25e-41b2-4206-afaa-7d2d10fd27a7.json deleted file mode 100644 index 9269d76e2..000000000 --- a/data/hfopenllm_v2/OpenGenerativeAI/Bifrost/dc3ca25e-41b2-4206-afaa-7d2d10fd27a7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/OpenGenerativeAI_Bifrost/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - 
"evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Bifrost", - "id": "OpenGenerativeAI/Bifrost", - "developer": "OpenGenerativeAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6348 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6849 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2545 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3683 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4598 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.516 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/OpenLLM-France/Lucie-7B-Instruct-human-data/cd77d407-3be3-4b84-8a73-34a15744de93.json b/data/hfopenllm_v2/OpenLLM-France/Lucie-7B-Instruct-human-data/cd77d407-3be3-4b84-8a73-34a15744de93.json deleted file mode 100644 index 1f47bfd03..000000000 --- a/data/hfopenllm_v2/OpenLLM-France/Lucie-7B-Instruct-human-data/cd77d407-3be3-4b84-8a73-34a15744de93.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/OpenLLM-France_Lucie-7B-Instruct-human-data/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Lucie-7B-Instruct-human-data", - "id": "OpenLLM-France/Lucie-7B-Instruct-human-data", - "developer": "OpenLLM-France", - 
"inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 6.707 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2946 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3284 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0219 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2752 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3729 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.143 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/OpenLLM-France/Lucie-7B-Instruct-v1.1/1cd20db5-0225-4724-b1f9-7c32eae456e1.json b/data/hfopenllm_v2/OpenLLM-France/Lucie-7B-Instruct-v1.1/1cd20db5-0225-4724-b1f9-7c32eae456e1.json deleted file mode 100644 index 0a5903536..000000000 --- a/data/hfopenllm_v2/OpenLLM-France/Lucie-7B-Instruct-v1.1/1cd20db5-0225-4724-b1f9-7c32eae456e1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/OpenLLM-France_Lucie-7B-Instruct-v1.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Lucie-7B-Instruct-v1.1", - "id": "OpenLLM-France/Lucie-7B-Instruct-v1.1", - "developer": "OpenLLM-France", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 6.707 - } - }, - "evaluation_results": [ - { - 
"evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3039 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3816 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0317 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2819 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.375 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1864 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/OpenLLM-France/Lucie-7B-Instruct/dfc45dc3-51e6-454b-aee9-ea6b0714f0ca.json b/data/hfopenllm_v2/OpenLLM-France/Lucie-7B-Instruct/dfc45dc3-51e6-454b-aee9-ea6b0714f0ca.json deleted file mode 100644 index 16a3cc3c6..000000000 --- a/data/hfopenllm_v2/OpenLLM-France/Lucie-7B-Instruct/dfc45dc3-51e6-454b-aee9-ea6b0714f0ca.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/OpenLLM-France_Lucie-7B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Lucie-7B-Instruct", - "id": "OpenLLM-France/Lucie-7B-Instruct", - "developer": "OpenLLM-France", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 6.707 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2796 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3254 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0166 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2794 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3662 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1556 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/OpenLLM-France/Lucie-7B/3da2a408-672c-47b8-be32-61f56a15e9f3.json b/data/hfopenllm_v2/OpenLLM-France/Lucie-7B/3da2a408-672c-47b8-be32-61f56a15e9f3.json deleted file mode 100644 index be1189595..000000000 --- a/data/hfopenllm_v2/OpenLLM-France/Lucie-7B/3da2a408-672c-47b8-be32-61f56a15e9f3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/OpenLLM-France_Lucie-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Lucie-7B", - "id": "OpenLLM-France/Lucie-7B", - "developer": "OpenLLM-France", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 6.707 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2496 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": 
"SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3492 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0144 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2727 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3923 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1498 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/OpenLeecher/llama3-8b-lima/94700c3c-f18d-4f96-a794-65bcf483fca9.json b/data/hfopenllm_v2/OpenLeecher/llama3-8b-lima/94700c3c-f18d-4f96-a794-65bcf483fca9.json deleted file mode 100644 index 63e1ec99a..000000000 --- a/data/hfopenllm_v2/OpenLeecher/llama3-8b-lima/94700c3c-f18d-4f96-a794-65bcf483fca9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/OpenLeecher_llama3-8b-lima/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "llama3-8b-lima", - "id": "OpenLeecher/llama3-8b-lima", - "developer": "OpenLeecher", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4371 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4296 - } - }, - { - "evaluation_name": 
"MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0506 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2383 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3713 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2626 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/OpenScholar/Llama-3.1_OpenScholar-8B/6f3481d4-076f-45bd-8564-d485109c7a63.json b/data/hfopenllm_v2/OpenScholar/Llama-3.1_OpenScholar-8B/6f3481d4-076f-45bd-8564-d485109c7a63.json deleted file mode 100644 index 38d1da240..000000000 --- a/data/hfopenllm_v2/OpenScholar/Llama-3.1_OpenScholar-8B/6f3481d4-076f-45bd-8564-d485109c7a63.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/OpenScholar_Llama-3.1_OpenScholar-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.1_OpenScholar-8B", - "id": "OpenScholar/Llama-3.1_OpenScholar-8B", - "developer": "OpenScholar", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6064 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5208 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": 
"Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1654 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2819 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4275 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3708 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Orenguteng/Llama-3.1-8B-Lexi-Uncensored-V2/9f5ca3b2-747a-4fd0-b382-bf7ef503ba25.json b/data/hfopenllm_v2/Orenguteng/Llama-3.1-8B-Lexi-Uncensored-V2/9f5ca3b2-747a-4fd0-b382-bf7ef503ba25.json deleted file mode 100644 index 3461ecd6d..000000000 --- a/data/hfopenllm_v2/Orenguteng/Llama-3.1-8B-Lexi-Uncensored-V2/9f5ca3b2-747a-4fd0-b382-bf7ef503ba25.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Orenguteng_Llama-3.1-8B-Lexi-Uncensored-V2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.1-8B-Lexi-Uncensored-V2", - "id": "Orenguteng/Llama-3.1-8B-Lexi-Uncensored-V2", - "developer": "Orenguteng", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7792 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5084 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 
0.1971 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2827 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3843 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3781 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Orenguteng/Llama-3.1-8B-Lexi-Uncensored/f1932041-263a-4841-9c8b-c6cc9fa50c21.json b/data/hfopenllm_v2/Orenguteng/Llama-3.1-8B-Lexi-Uncensored/f1932041-263a-4841-9c8b-c6cc9fa50c21.json deleted file mode 100644 index e8f2ea31c..000000000 --- a/data/hfopenllm_v2/Orenguteng/Llama-3.1-8B-Lexi-Uncensored/f1932041-263a-4841-9c8b-c6cc9fa50c21.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Orenguteng_Llama-3.1-8B-Lexi-Uncensored/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.1-8B-Lexi-Uncensored", - "id": "Orenguteng/Llama-3.1-8B-Lexi-Uncensored", - "developer": "Orenguteng", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7777 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5057 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1571 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2718 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3871 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.379 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Orion-zhen/Qwen2.5-7B-Instruct-Uncensored/691bef38-bc9e-4f8d-b774-9d7c62eec72b.json b/data/hfopenllm_v2/Orion-zhen/Qwen2.5-7B-Instruct-Uncensored/691bef38-bc9e-4f8d-b774-9d7c62eec72b.json deleted file mode 100644 index 1eff33056..000000000 --- a/data/hfopenllm_v2/Orion-zhen/Qwen2.5-7B-Instruct-Uncensored/691bef38-bc9e-4f8d-b774-9d7c62eec72b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Orion-zhen_Qwen2.5-7B-Instruct-Uncensored/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-7B-Instruct-Uncensored", - "id": "Orion-zhen/Qwen2.5-7B-Instruct-Uncensored", - "developer": "Orion-zhen", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7204 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5474 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4773 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.3029 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4361 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4427 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Orion-zhen/phi-4-abliterated/5795f693-9ebc-47c6-9d2c-185dd0d32044.json b/data/hfopenllm_v2/Orion-zhen/phi-4-abliterated/5795f693-9ebc-47c6-9d2c-185dd0d32044.json deleted file mode 100644 index 93b48af5e..000000000 --- a/data/hfopenllm_v2/Orion-zhen/phi-4-abliterated/5795f693-9ebc-47c6-9d2c-185dd0d32044.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Orion-zhen_phi-4-abliterated/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "phi-4-abliterated", - "id": "Orion-zhen/phi-4-abliterated", - "developer": "Orion-zhen", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Phi3ForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0576 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6698 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3021 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4044 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5006 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5292 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/P0x0/Astra-v1-12B/eb83f474-0d3d-488c-bc0f-93e5d1dfb2f3.json b/data/hfopenllm_v2/P0x0/Astra-v1-12B/eb83f474-0d3d-488c-bc0f-93e5d1dfb2f3.json deleted file mode 100644 index 0751e8835..000000000 --- a/data/hfopenllm_v2/P0x0/Astra-v1-12B/eb83f474-0d3d-488c-bc0f-93e5d1dfb2f3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/P0x0_Astra-v1-12B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Astra-v1-12B", - "id": "P0x0/Astra-v1-12B", - "developer": "P0x0", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2806 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5215 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1133 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3138 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4052 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3461 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/PJMixers-Dev/L3.2-Instruct-Thinking-v0.1-1B/f93b2053-11c4-4868-860f-90fbfe8288fc.json b/data/hfopenllm_v2/PJMixers-Dev/L3.2-Instruct-Thinking-v0.1-1B/f93b2053-11c4-4868-860f-90fbfe8288fc.json deleted file mode 100644 index b26fc54c0..000000000 --- a/data/hfopenllm_v2/PJMixers-Dev/L3.2-Instruct-Thinking-v0.1-1B/f93b2053-11c4-4868-860f-90fbfe8288fc.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/PJMixers-Dev_L3.2-Instruct-Thinking-v0.1-1B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "L3.2-Instruct-Thinking-v0.1-1B", - "id": "PJMixers-Dev/L3.2-Instruct-Thinking-v0.1-1B", - "developer": "PJMixers-Dev", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.236 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4628 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3302 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0544 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2576 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3262 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 
1.0 - }, - "score_details": { - "score": 0.1483 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/PJMixers-Dev/LLaMa-3.1-Instruct-Interleaved-Zeroed-13B/8984fe95-9fd3-48ff-aa5f-18df63ecd6bb.json b/data/hfopenllm_v2/PJMixers-Dev/LLaMa-3.1-Instruct-Interleaved-Zeroed-13B/8984fe95-9fd3-48ff-aa5f-18df63ecd6bb.json deleted file mode 100644 index a3d8dd0e7..000000000 --- a/data/hfopenllm_v2/PJMixers-Dev/LLaMa-3.1-Instruct-Interleaved-Zeroed-13B/8984fe95-9fd3-48ff-aa5f-18df63ecd6bb.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/PJMixers-Dev_LLaMa-3.1-Instruct-Interleaved-Zeroed-13B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "LLaMa-3.1-Instruct-Interleaved-Zeroed-13B", - "id": "PJMixers-Dev/LLaMa-3.1-Instruct-Interleaved-Zeroed-13B", - "developer": "PJMixers-Dev", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 13.047 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7871 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5073 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2002 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2919 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.387 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3767 - } - } - ] -} \ No newline at end of file diff 
--git a/data/hfopenllm_v2/PJMixers-Dev/LLaMa-3.1-RomboTiesTest-8B/a0f6f5de-578c-4290-85b5-c51aed985074.json b/data/hfopenllm_v2/PJMixers-Dev/LLaMa-3.1-RomboTiesTest-8B/a0f6f5de-578c-4290-85b5-c51aed985074.json deleted file mode 100644 index 1df7617e1..000000000 --- a/data/hfopenllm_v2/PJMixers-Dev/LLaMa-3.1-RomboTiesTest-8B/a0f6f5de-578c-4290-85b5-c51aed985074.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/PJMixers-Dev_LLaMa-3.1-RomboTiesTest-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "LLaMa-3.1-RomboTiesTest-8B", - "id": "PJMixers-Dev/LLaMa-3.1-RomboTiesTest-8B", - "developer": "PJMixers-Dev", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 4.015 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7825 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5073 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2002 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2919 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.387 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3767 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/PJMixers-Dev/LLaMa-3.1-RomboTiesTest2-8B/8ccc76ff-25c9-4706-b6a8-31b49f8be813.json 
b/data/hfopenllm_v2/PJMixers-Dev/LLaMa-3.1-RomboTiesTest2-8B/8ccc76ff-25c9-4706-b6a8-31b49f8be813.json deleted file mode 100644 index 3d4794050..000000000 --- a/data/hfopenllm_v2/PJMixers-Dev/LLaMa-3.1-RomboTiesTest2-8B/8ccc76ff-25c9-4706-b6a8-31b49f8be813.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/PJMixers-Dev_LLaMa-3.1-RomboTiesTest2-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "LLaMa-3.1-RomboTiesTest2-8B", - "id": "PJMixers-Dev/LLaMa-3.1-RomboTiesTest2-8B", - "developer": "PJMixers-Dev", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 4.015 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7825 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5073 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2002 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2919 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.387 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3767 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/PJMixers-Dev/LLaMa-3.2-Instruct-JankMix-v0.1-SFT-3B/924f8b31-506d-4df2-8a7b-d0cd66d55f6d.json b/data/hfopenllm_v2/PJMixers-Dev/LLaMa-3.2-Instruct-JankMix-v0.1-SFT-3B/924f8b31-506d-4df2-8a7b-d0cd66d55f6d.json deleted file mode 100644 index e62bf8d2d..000000000 --- 
a/data/hfopenllm_v2/PJMixers-Dev/LLaMa-3.2-Instruct-JankMix-v0.1-SFT-3B/924f8b31-506d-4df2-8a7b-d0cd66d55f6d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/PJMixers-Dev_LLaMa-3.2-Instruct-JankMix-v0.1-SFT-3B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "LLaMa-3.2-Instruct-JankMix-v0.1-SFT-3B", - "id": "PJMixers-Dev/LLaMa-3.2-Instruct-JankMix-v0.1-SFT-3B", - "developer": "PJMixers-Dev", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6931 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4556 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1216 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2743 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.37 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3127 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/PJMixers-Dev/LLaMa-3.2-Instruct-JankMix-v0.2-SFT-3B/8e7dfd9f-350d-406c-811d-453f1744dd53.json b/data/hfopenllm_v2/PJMixers-Dev/LLaMa-3.2-Instruct-JankMix-v0.2-SFT-3B/8e7dfd9f-350d-406c-811d-453f1744dd53.json deleted file mode 100644 index ddc4092b0..000000000 --- a/data/hfopenllm_v2/PJMixers-Dev/LLaMa-3.2-Instruct-JankMix-v0.2-SFT-3B/8e7dfd9f-350d-406c-811d-453f1744dd53.json +++ 
/dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/PJMixers-Dev_LLaMa-3.2-Instruct-JankMix-v0.2-SFT-3B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "LLaMa-3.2-Instruct-JankMix-v0.2-SFT-3B", - "id": "PJMixers-Dev/LLaMa-3.2-Instruct-JankMix-v0.2-SFT-3B", - "developer": "PJMixers-Dev", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6292 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4581 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1299 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2727 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3659 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3115 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/PJMixers-Dev/LLaMa-3.2-Instruct-JankMix-v0.2-SFT-HailMary-v0.1-KTO-3B/b713d1d2-351f-43a1-b77d-27723e1d4267.json b/data/hfopenllm_v2/PJMixers-Dev/LLaMa-3.2-Instruct-JankMix-v0.2-SFT-HailMary-v0.1-KTO-3B/b713d1d2-351f-43a1-b77d-27723e1d4267.json deleted file mode 100644 index 68fb66922..000000000 --- a/data/hfopenllm_v2/PJMixers-Dev/LLaMa-3.2-Instruct-JankMix-v0.2-SFT-HailMary-v0.1-KTO-3B/b713d1d2-351f-43a1-b77d-27723e1d4267.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - 
"evaluation_id": "hfopenllm_v2/PJMixers-Dev_LLaMa-3.2-Instruct-JankMix-v0.2-SFT-HailMary-v0.1-KTO-3B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "LLaMa-3.2-Instruct-JankMix-v0.2-SFT-HailMary-v0.1-KTO-3B", - "id": "PJMixers-Dev/LLaMa-3.2-Instruct-JankMix-v0.2-SFT-HailMary-v0.1-KTO-3B", - "developer": "PJMixers-Dev", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6504 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4511 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1261 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2718 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3687 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3108 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/PJMixers-Dev/LLaMa-3.2-Instruct-JankMixBread-v0.1-3B/322a9442-174f-4223-b839-6f8f9664d5e5.json b/data/hfopenllm_v2/PJMixers-Dev/LLaMa-3.2-Instruct-JankMixBread-v0.1-3B/322a9442-174f-4223-b839-6f8f9664d5e5.json deleted file mode 100644 index b3614926d..000000000 --- a/data/hfopenllm_v2/PJMixers-Dev/LLaMa-3.2-Instruct-JankMixBread-v0.1-3B/322a9442-174f-4223-b839-6f8f9664d5e5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/PJMixers-Dev_LLaMa-3.2-Instruct-JankMixBread-v0.1-3B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "LLaMa-3.2-Instruct-JankMixBread-v0.1-3B", - "id": "PJMixers-Dev/LLaMa-3.2-Instruct-JankMixBread-v0.1-3B", - "developer": "PJMixers-Dev", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5041 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4483 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1307 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2827 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3516 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3083 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/PJMixers-Dev/Qwen2.5-RomboTiesTest-7B/b12e71d1-c435-4172-a28f-38e26791dadb.json b/data/hfopenllm_v2/PJMixers-Dev/Qwen2.5-RomboTiesTest-7B/b12e71d1-c435-4172-a28f-38e26791dadb.json deleted file mode 100644 index 16e94042b..000000000 --- a/data/hfopenllm_v2/PJMixers-Dev/Qwen2.5-RomboTiesTest-7B/b12e71d1-c435-4172-a28f-38e26791dadb.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/PJMixers-Dev_Qwen2.5-RomboTiesTest-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - 
"source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-RomboTiesTest-7B", - "id": "PJMixers-Dev/Qwen2.5-RomboTiesTest-7B", - "developer": "PJMixers-Dev", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.808 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7558 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5399 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4962 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2978 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4034 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4285 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/PJMixers/LLaMa-3-CursedStock-v2.0-8B/ad33b0e8-39c8-4118-81bd-bc86b482f122.json b/data/hfopenllm_v2/PJMixers/LLaMa-3-CursedStock-v2.0-8B/ad33b0e8-39c8-4118-81bd-bc86b482f122.json deleted file mode 100644 index 4d593a418..000000000 --- a/data/hfopenllm_v2/PJMixers/LLaMa-3-CursedStock-v2.0-8B/ad33b0e8-39c8-4118-81bd-bc86b482f122.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/PJMixers_LLaMa-3-CursedStock-v2.0-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": 
"LLaMa-3-CursedStock-v2.0-8B", - "id": "PJMixers/LLaMa-3-CursedStock-v2.0-8B", - "developer": "PJMixers", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6331 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5271 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0944 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2743 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3856 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3556 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Parissa3/test-model/db8a7864-293b-45e9-995b-5301071c902d.json b/data/hfopenllm_v2/Parissa3/test-model/db8a7864-293b-45e9-995b-5301071c902d.json deleted file mode 100644 index f98fab749..000000000 --- a/data/hfopenllm_v2/Parissa3/test-model/db8a7864-293b-45e9-995b-5301071c902d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Parissa3_test-model/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "test-model", - "id": "Parissa3/test-model", - "developer": "Parissa3", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - 
"evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3883 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5194 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.065 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2945 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4685 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3057 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Pinkstack/PARM-V1.5-base-QwQ-Qwen-2.5-o1-3B/31e3beea-28dc-4b47-a5e9-5fafc89226db.json b/data/hfopenllm_v2/Pinkstack/PARM-V1.5-base-QwQ-Qwen-2.5-o1-3B/31e3beea-28dc-4b47-a5e9-5fafc89226db.json deleted file mode 100644 index d4a800079..000000000 --- a/data/hfopenllm_v2/Pinkstack/PARM-V1.5-base-QwQ-Qwen-2.5-o1-3B/31e3beea-28dc-4b47-a5e9-5fafc89226db.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Pinkstack_PARM-V1.5-base-QwQ-Qwen-2.5-o1-3B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "PARM-V1.5-base-QwQ-Qwen-2.5-o1-3B", - "id": "Pinkstack/PARM-V1.5-base-QwQ-Qwen-2.5-o1-3B", - "developer": "Pinkstack", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.086 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5085 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4711 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1692 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.297 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4479 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3511 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Pinkstack/SuperThoughts-CoT-14B-16k-o1-QwQ/49315a95-394f-4508-8e6c-7c1d5547c257.json b/data/hfopenllm_v2/Pinkstack/SuperThoughts-CoT-14B-16k-o1-QwQ/49315a95-394f-4508-8e6c-7c1d5547c257.json deleted file mode 100644 index 13d84a54a..000000000 --- a/data/hfopenllm_v2/Pinkstack/SuperThoughts-CoT-14B-16k-o1-QwQ/49315a95-394f-4508-8e6c-7c1d5547c257.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Pinkstack_SuperThoughts-CoT-14B-16k-o1-QwQ/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SuperThoughts-CoT-14B-16k-o1-QwQ", - "id": "Pinkstack/SuperThoughts-CoT-14B-16k-o1-QwQ", - "developer": "Pinkstack", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { 
- "score": 0.0515 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.672 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4199 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3926 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4914 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5268 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Pinkstack/Superthoughts-lite-1.8B-experimental-o1/375d3a94-97af-47ef-82af-afd7581663d4.json b/data/hfopenllm_v2/Pinkstack/Superthoughts-lite-1.8B-experimental-o1/375d3a94-97af-47ef-82af-afd7581663d4.json deleted file mode 100644 index c0e931e91..000000000 --- a/data/hfopenllm_v2/Pinkstack/Superthoughts-lite-1.8B-experimental-o1/375d3a94-97af-47ef-82af-afd7581663d4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Pinkstack_Superthoughts-lite-1.8B-experimental-o1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Superthoughts-lite-1.8B-experimental-o1", - "id": "Pinkstack/Superthoughts-lite-1.8B-experimental-o1", - "developer": "Pinkstack", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.812 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0375 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": 
"hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3435 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0317 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2752 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3354 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1851 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Pinkstack/Superthoughts-lite-v1/77cfe896-4aa1-4bcd-a39a-f437c3f7e738.json b/data/hfopenllm_v2/Pinkstack/Superthoughts-lite-v1/77cfe896-4aa1-4bcd-a39a-f437c3f7e738.json deleted file mode 100644 index 7b240a43f..000000000 --- a/data/hfopenllm_v2/Pinkstack/Superthoughts-lite-v1/77cfe896-4aa1-4bcd-a39a-f437c3f7e738.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Pinkstack_Superthoughts-lite-v1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Superthoughts-lite-v1", - "id": "Pinkstack/Superthoughts-lite-v1", - "developer": "Pinkstack", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.711 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1659 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.3466 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0295 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.281 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3672 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1755 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/PocketDoc/Dans-Instruct-CoreCurriculum-12b/3d69ec7d-9999-4e16-8dc9-99fad35e156e.json b/data/hfopenllm_v2/PocketDoc/Dans-Instruct-CoreCurriculum-12b/3d69ec7d-9999-4e16-8dc9-99fad35e156e.json deleted file mode 100644 index cb170495e..000000000 --- a/data/hfopenllm_v2/PocketDoc/Dans-Instruct-CoreCurriculum-12b/3d69ec7d-9999-4e16-8dc9-99fad35e156e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/PocketDoc_Dans-Instruct-CoreCurriculum-12b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Dans-Instruct-CoreCurriculum-12b", - "id": "PocketDoc/Dans-Instruct-CoreCurriculum-12b", - "developer": "PocketDoc", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2191 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3789 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": 
"hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0544 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2827 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4096 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1219 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/PocketDoc/Dans-PersonalityEngine-V1.1.0-12b/d2a7459b-8a12-4529-b978-c7237979f16b.json b/data/hfopenllm_v2/PocketDoc/Dans-PersonalityEngine-V1.1.0-12b/d2a7459b-8a12-4529-b978-c7237979f16b.json deleted file mode 100644 index 54790f749..000000000 --- a/data/hfopenllm_v2/PocketDoc/Dans-PersonalityEngine-V1.1.0-12b/d2a7459b-8a12-4529-b978-c7237979f16b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/PocketDoc_Dans-PersonalityEngine-V1.1.0-12b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Dans-PersonalityEngine-V1.1.0-12b", - "id": "PocketDoc/Dans-PersonalityEngine-V1.1.0-12b", - "developer": "PocketDoc", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7075 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5361 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.105 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2869 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4587 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3262 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/PocketDoc/Dans-PersonalityEngine-V1.2.0-24b/e7a228ad-69de-471a-9f31-6bdc7221999c.json b/data/hfopenllm_v2/PocketDoc/Dans-PersonalityEngine-V1.2.0-24b/e7a228ad-69de-471a-9f31-6bdc7221999c.json deleted file mode 100644 index 55de50f7e..000000000 --- a/data/hfopenllm_v2/PocketDoc/Dans-PersonalityEngine-V1.2.0-24b/e7a228ad-69de-471a-9f31-6bdc7221999c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/PocketDoc_Dans-PersonalityEngine-V1.2.0-24b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Dans-PersonalityEngine-V1.2.0-24b", - "id": "PocketDoc/Dans-PersonalityEngine-V1.2.0-24b", - "developer": "PocketDoc", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 23.572 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7886 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6421 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2455 - } - }, - { - 
"evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3188 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.43 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5026 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/PocketDoc/Dans-PersonalityEngine-v1.0.0-8b/9196ae39-adb0-4d53-8399-0ccd4d628065.json b/data/hfopenllm_v2/PocketDoc/Dans-PersonalityEngine-v1.0.0-8b/9196ae39-adb0-4d53-8399-0ccd4d628065.json deleted file mode 100644 index 28534a7f1..000000000 --- a/data/hfopenllm_v2/PocketDoc/Dans-PersonalityEngine-v1.0.0-8b/9196ae39-adb0-4d53-8399-0ccd4d628065.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/PocketDoc_Dans-PersonalityEngine-v1.0.0-8b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Dans-PersonalityEngine-v1.0.0-8b", - "id": "PocketDoc/Dans-PersonalityEngine-v1.0.0-8b", - "developer": "PocketDoc", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4982 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4733 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0816 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2852 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3542 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3065 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/PocketDoc/Dans-SakuraKaze-V1.0.0-12b/ea318f99-a1ab-41ed-ae5d-39c62ac40e1b.json b/data/hfopenllm_v2/PocketDoc/Dans-SakuraKaze-V1.0.0-12b/ea318f99-a1ab-41ed-ae5d-39c62ac40e1b.json deleted file mode 100644 index 4fbe90b2e..000000000 --- a/data/hfopenllm_v2/PocketDoc/Dans-SakuraKaze-V1.0.0-12b/ea318f99-a1ab-41ed-ae5d-39c62ac40e1b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/PocketDoc_Dans-SakuraKaze-V1.0.0-12b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Dans-SakuraKaze-V1.0.0-12b", - "id": "PocketDoc/Dans-SakuraKaze-V1.0.0-12b", - "developer": "PocketDoc", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.652 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5405 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0929 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2936 - } - }, - { - 
"evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4745 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.356 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/PowerInfer/SmallThinker-3B-Preview/05f69fd6-a77e-478d-ad86-3e83e615e892.json b/data/hfopenllm_v2/PowerInfer/SmallThinker-3B-Preview/05f69fd6-a77e-478d-ad86-3e83e615e892.json deleted file mode 100644 index d49ad061e..000000000 --- a/data/hfopenllm_v2/PowerInfer/SmallThinker-3B-Preview/05f69fd6-a77e-478d-ad86-3e83e615e892.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/PowerInfer_SmallThinker-3B-Preview/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SmallThinker-3B-Preview", - "id": "PowerInfer/SmallThinker-3B-Preview", - "developer": "PowerInfer", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.397 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.62 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4495 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2779 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2609 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3525 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3018 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/PranavHarshan/LaMistral-V4/5b8e9508-befb-4674-bd84-9c722a0864ce.json b/data/hfopenllm_v2/PranavHarshan/LaMistral-V4/5b8e9508-befb-4674-bd84-9c722a0864ce.json deleted file mode 100644 index 8281de999..000000000 --- a/data/hfopenllm_v2/PranavHarshan/LaMistral-V4/5b8e9508-befb-4674-bd84-9c722a0864ce.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/PranavHarshan_LaMistral-V4/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "LaMistral-V4", - "id": "PranavHarshan/LaMistral-V4", - "developer": "PranavHarshan", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6239 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5184 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0687 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.328 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3643 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - 
"hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3599 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/PranavHarshan/MedNarra-X1/8beb3730-23e8-4b89-933d-2d3f1a1d1365.json b/data/hfopenllm_v2/PranavHarshan/MedNarra-X1/8beb3730-23e8-4b89-933d-2d3f1a1d1365.json deleted file mode 100644 index 3cf872d63..000000000 --- a/data/hfopenllm_v2/PranavHarshan/MedNarra-X1/8beb3730-23e8-4b89-933d-2d3f1a1d1365.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/PranavHarshan_MedNarra-X1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MedNarra-X1", - "id": "PranavHarshan/MedNarra-X1", - "developer": "PranavHarshan", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4338 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4637 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0438 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3079 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.354 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3431 - } - } - ] -} \ No 
newline at end of file diff --git a/data/hfopenllm_v2/Pretergeek/OpenChat-3.5-0106_10.7B_48Layers-Appended/07417712-1933-4920-8964-67ba74bf6d01.json b/data/hfopenllm_v2/Pretergeek/OpenChat-3.5-0106_10.7B_48Layers-Appended/07417712-1933-4920-8964-67ba74bf6d01.json deleted file mode 100644 index d98817118..000000000 --- a/data/hfopenllm_v2/Pretergeek/OpenChat-3.5-0106_10.7B_48Layers-Appended/07417712-1933-4920-8964-67ba74bf6d01.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Pretergeek_OpenChat-3.5-0106_10.7B_48Layers-Appended/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "OpenChat-3.5-0106_10.7B_48Layers-Appended", - "id": "Pretergeek/OpenChat-3.5-0106_10.7B_48Layers-Appended", - "developer": "Pretergeek", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 10.732 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5961 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.462 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0793 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.307 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4254 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.329 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/Pretergeek/OpenChat-3.5-0106_10.7B_48Layers-Interleaved/ae4cc05d-a65a-4f18-a99c-f133603686d1.json b/data/hfopenllm_v2/Pretergeek/OpenChat-3.5-0106_10.7B_48Layers-Interleaved/ae4cc05d-a65a-4f18-a99c-f133603686d1.json deleted file mode 100644 index 80b91addb..000000000 --- a/data/hfopenllm_v2/Pretergeek/OpenChat-3.5-0106_10.7B_48Layers-Interleaved/ae4cc05d-a65a-4f18-a99c-f133603686d1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Pretergeek_OpenChat-3.5-0106_10.7B_48Layers-Interleaved/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "OpenChat-3.5-0106_10.7B_48Layers-Interleaved", - "id": "Pretergeek/OpenChat-3.5-0106_10.7B_48Layers-Interleaved", - "developer": "Pretergeek", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 10.732 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5961 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.462 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0778 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3045 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4254 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3299 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/Pretergeek/OpenChat-3.5-0106_32K-PoSE/54df4d3e-0ef0-4e30-aa46-b47a4589a34c.json b/data/hfopenllm_v2/Pretergeek/OpenChat-3.5-0106_32K-PoSE/54df4d3e-0ef0-4e30-aa46-b47a4589a34c.json deleted file mode 100644 index f50ca0914..000000000 --- a/data/hfopenllm_v2/Pretergeek/OpenChat-3.5-0106_32K-PoSE/54df4d3e-0ef0-4e30-aa46-b47a4589a34c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Pretergeek_OpenChat-3.5-0106_32K-PoSE/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "OpenChat-3.5-0106_32K-PoSE", - "id": "Pretergeek/OpenChat-3.5-0106_32K-PoSE", - "developer": "Pretergeek", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3969 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3471 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0264 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.276 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4205 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2031 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Pretergeek/OpenChat-3.5-0106_8.11B_36Layers-Appended/a717d466-9157-4991-8459-f39847d914a2.json 
b/data/hfopenllm_v2/Pretergeek/OpenChat-3.5-0106_8.11B_36Layers-Appended/a717d466-9157-4991-8459-f39847d914a2.json deleted file mode 100644 index a32aae183..000000000 --- a/data/hfopenllm_v2/Pretergeek/OpenChat-3.5-0106_8.11B_36Layers-Appended/a717d466-9157-4991-8459-f39847d914a2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Pretergeek_OpenChat-3.5-0106_8.11B_36Layers-Appended/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "OpenChat-3.5-0106_8.11B_36Layers-Appended", - "id": "Pretergeek/OpenChat-3.5-0106_8.11B_36Layers-Appended", - "developer": "Pretergeek", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 8.114 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5976 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.462 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0793 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.307 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4254 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.329 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Pretergeek/OpenChat-3.5-0106_8.11B_36Layers-Interleaved/15a8789b-27de-49d1-b3e5-9b1fc9b5694e.json 
b/data/hfopenllm_v2/Pretergeek/OpenChat-3.5-0106_8.11B_36Layers-Interleaved/15a8789b-27de-49d1-b3e5-9b1fc9b5694e.json deleted file mode 100644 index eea5d5d83..000000000 --- a/data/hfopenllm_v2/Pretergeek/OpenChat-3.5-0106_8.11B_36Layers-Interleaved/15a8789b-27de-49d1-b3e5-9b1fc9b5694e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Pretergeek_OpenChat-3.5-0106_8.11B_36Layers-Interleaved/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "OpenChat-3.5-0106_8.11B_36Layers-Interleaved", - "id": "Pretergeek/OpenChat-3.5-0106_8.11B_36Layers-Interleaved", - "developer": "Pretergeek", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 8.114 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5961 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4621 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0778 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3045 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4241 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3299 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Pretergeek/OpenChat-3.5-0106_8.99B_40Layers-Appended/921562fe-cc21-4ff3-93de-a62e1d4bf7e7.json 
b/data/hfopenllm_v2/Pretergeek/OpenChat-3.5-0106_8.99B_40Layers-Appended/921562fe-cc21-4ff3-93de-a62e1d4bf7e7.json deleted file mode 100644 index 8ecc037ff..000000000 --- a/data/hfopenllm_v2/Pretergeek/OpenChat-3.5-0106_8.99B_40Layers-Appended/921562fe-cc21-4ff3-93de-a62e1d4bf7e7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Pretergeek_OpenChat-3.5-0106_8.99B_40Layers-Appended/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "OpenChat-3.5-0106_8.99B_40Layers-Appended", - "id": "Pretergeek/OpenChat-3.5-0106_8.99B_40Layers-Appended", - "developer": "Pretergeek", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 8.987 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5961 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.462 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0793 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.307 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4254 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.329 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Pretergeek/OpenChat-3.5-0106_8.99B_40Layers-Interleaved/863969d9-e567-43cc-a0a9-7f80eaba374a.json 
b/data/hfopenllm_v2/Pretergeek/OpenChat-3.5-0106_8.99B_40Layers-Interleaved/863969d9-e567-43cc-a0a9-7f80eaba374a.json deleted file mode 100644 index dde7d977d..000000000 --- a/data/hfopenllm_v2/Pretergeek/OpenChat-3.5-0106_8.99B_40Layers-Interleaved/863969d9-e567-43cc-a0a9-7f80eaba374a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Pretergeek_OpenChat-3.5-0106_8.99B_40Layers-Interleaved/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "OpenChat-3.5-0106_8.99B_40Layers-Interleaved", - "id": "Pretergeek/OpenChat-3.5-0106_8.99B_40Layers-Interleaved", - "developer": "Pretergeek", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 8.987 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5976 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4621 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0778 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3045 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4241 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3299 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Pretergeek/OpenChat-3.5-0106_9.86B_44Layers-Appended/2987fa45-363e-4a07-8e9f-db01586a135b.json 
b/data/hfopenllm_v2/Pretergeek/OpenChat-3.5-0106_9.86B_44Layers-Appended/2987fa45-363e-4a07-8e9f-db01586a135b.json deleted file mode 100644 index 465c9e212..000000000 --- a/data/hfopenllm_v2/Pretergeek/OpenChat-3.5-0106_9.86B_44Layers-Appended/2987fa45-363e-4a07-8e9f-db01586a135b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Pretergeek_OpenChat-3.5-0106_9.86B_44Layers-Appended/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "OpenChat-3.5-0106_9.86B_44Layers-Appended", - "id": "Pretergeek/OpenChat-3.5-0106_9.86B_44Layers-Appended", - "developer": "Pretergeek", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 9.859 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5961 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.462 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0793 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.307 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4254 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.329 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Pretergeek/openchat-3.5-0106_Rebased_Mistral-7B-v0.2/3488de21-d9a6-49e8-ba8f-d9beee9bdabe.json 
b/data/hfopenllm_v2/Pretergeek/openchat-3.5-0106_Rebased_Mistral-7B-v0.2/3488de21-d9a6-49e8-ba8f-d9beee9bdabe.json deleted file mode 100644 index cf3bb48e6..000000000 --- a/data/hfopenllm_v2/Pretergeek/openchat-3.5-0106_Rebased_Mistral-7B-v0.2/3488de21-d9a6-49e8-ba8f-d9beee9bdabe.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Pretergeek_openchat-3.5-0106_Rebased_Mistral-7B-v0.2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "openchat-3.5-0106_Rebased_Mistral-7B-v0.2", - "id": "Pretergeek/openchat-3.5-0106_Rebased_Mistral-7B-v0.2", - "developer": "Pretergeek", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3706 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3627 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0453 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2718 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.484 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.283 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/PrimeIntellect/INTELLECT-1-Instruct/0cacf042-6b62-4b67-8821-97cd703788d0.json b/data/hfopenllm_v2/PrimeIntellect/INTELLECT-1-Instruct/0cacf042-6b62-4b67-8821-97cd703788d0.json deleted file mode 100644 
index 10fb00e0e..000000000 --- a/data/hfopenllm_v2/PrimeIntellect/INTELLECT-1-Instruct/0cacf042-6b62-4b67-8821-97cd703788d0.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/PrimeIntellect_INTELLECT-1-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "INTELLECT-1-Instruct", - "id": "PrimeIntellect/INTELLECT-1-Instruct", - "developer": "PrimeIntellect", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 10.211 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.287 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0227 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2483 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3577 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1064 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/PrimeIntellect/INTELLECT-1/9f0dfceb-1332-447a-bf6f-6c6c40686a6f.json b/data/hfopenllm_v2/PrimeIntellect/INTELLECT-1/9f0dfceb-1332-447a-bf6f-6c6c40686a6f.json deleted file mode 100644 index 978dc1bf9..000000000 --- a/data/hfopenllm_v2/PrimeIntellect/INTELLECT-1/9f0dfceb-1332-447a-bf6f-6c6c40686a6f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/PrimeIntellect_INTELLECT-1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "INTELLECT-1", - "id": "PrimeIntellect/INTELLECT-1", - "developer": "PrimeIntellect", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 10.211 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1757 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.276 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2534 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3339 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1123 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/PrimeIntellect/INTELLECT-1/c1308f95-6d55-4ff6-b14e-1bd09b467d99.json b/data/hfopenllm_v2/PrimeIntellect/INTELLECT-1/c1308f95-6d55-4ff6-b14e-1bd09b467d99.json deleted file mode 100644 index 5993f7b41..000000000 --- a/data/hfopenllm_v2/PrimeIntellect/INTELLECT-1/c1308f95-6d55-4ff6-b14e-1bd09b467d99.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/PrimeIntellect_INTELLECT-1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - 
"evaluator_relationship": "third_party" - }, - "model_info": { - "name": "INTELLECT-1", - "id": "PrimeIntellect/INTELLECT-1", - "developer": "PrimeIntellect", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 10.211 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1757 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.274 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.25 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3753 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.112 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/PuxAI/LUA_model/4ab16120-8d39-4dea-aa76-5c249506848d.json b/data/hfopenllm_v2/PuxAI/LUA_model/4ab16120-8d39-4dea-aa76-5c249506848d.json deleted file mode 100644 index 31a3fa054..000000000 --- a/data/hfopenllm_v2/PuxAI/LUA_model/4ab16120-8d39-4dea-aa76-5c249506848d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/PuxAI_LUA_model/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "LUA_model", - "id": "PuxAI/LUA_model", - "developer": "PuxAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 7.386 - } - }, - 
"evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2282 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2877 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2601 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3484 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1123 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/PygmalionAI/pygmalion-6b/f9647ea0-6464-4aa0-b1ea-a994a7bcca3c.json b/data/hfopenllm_v2/PygmalionAI/pygmalion-6b/f9647ea0-6464-4aa0-b1ea-a994a7bcca3c.json deleted file mode 100644 index e8e7d6daf..000000000 --- a/data/hfopenllm_v2/PygmalionAI/pygmalion-6b/f9647ea0-6464-4aa0-b1ea-a994a7bcca3c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/PygmalionAI_pygmalion-6b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "pygmalion-6b", - "id": "PygmalionAI/pygmalion-6b", - "developer": "PygmalionAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "GPTJForCausalLM", - "params_billions": 6.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2091 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3199 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0083 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2492 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3684 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1184 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Q-bert/MetaMath-1B/c5ef47ab-2e73-43d6-b9ea-1ee7e50d9df8.json b/data/hfopenllm_v2/Q-bert/MetaMath-1B/c5ef47ab-2e73-43d6-b9ea-1ee7e50d9df8.json deleted file mode 100644 index c76a87d90..000000000 --- a/data/hfopenllm_v2/Q-bert/MetaMath-1B/c5ef47ab-2e73-43d6-b9ea-1ee7e50d9df8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Q-bert_MetaMath-1B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MetaMath-1B", - "id": "Q-bert/MetaMath-1B", - "developer": "Q-bert", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.236 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.53 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": 
"Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3451 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0627 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2517 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3289 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1495 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Quazim0t0/1up-14b/9ef7a4a0-b751-45ff-ab1f-d50687a3f4c3.json b/data/hfopenllm_v2/Quazim0t0/1up-14b/9ef7a4a0-b751-45ff-ab1f-d50687a3f4c3.json deleted file mode 100644 index 3622e1d5d..000000000 --- a/data/hfopenllm_v2/Quazim0t0/1up-14b/9ef7a4a0-b751-45ff-ab1f-d50687a3f4c3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Quazim0t0_1up-14b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "1up-14b", - "id": "Quazim0t0/1up-14b", - "developer": "Quazim0t0", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6888 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6921 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": 
"DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4162 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3624 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4583 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5406 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Quazim0t0/Adamant-14B-sce/8b303795-557b-4fa1-bbc6-d36bd77ee739.json b/data/hfopenllm_v2/Quazim0t0/Adamant-14B-sce/8b303795-557b-4fa1-bbc6-d36bd77ee739.json deleted file mode 100644 index 620dc27f3..000000000 --- a/data/hfopenllm_v2/Quazim0t0/Adamant-14B-sce/8b303795-557b-4fa1-bbc6-d36bd77ee739.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Quazim0t0_Adamant-14B-sce/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Adamant-14B-sce", - "id": "Quazim0t0/Adamant-14B-sce", - "developer": "Quazim0t0", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6858 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6859 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3988 - } - }, 
- { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3507 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4558 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5372 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Quazim0t0/Alice-14B/7fec288e-0b0d-45c0-b0e6-17b905cd7ea3.json b/data/hfopenllm_v2/Quazim0t0/Alice-14B/7fec288e-0b0d-45c0-b0e6-17b905cd7ea3.json deleted file mode 100644 index bb5caa0bb..000000000 --- a/data/hfopenllm_v2/Quazim0t0/Alice-14B/7fec288e-0b0d-45c0-b0e6-17b905cd7ea3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Quazim0t0_Alice-14B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Alice-14B", - "id": "Quazim0t0/Alice-14B", - "developer": "Quazim0t0", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6836 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6938 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4569 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 
1.0 - }, - "score_details": { - "score": 0.3515 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4479 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5419 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Quazim0t0/Alien-CoT-14B-sce/5a09783b-82da-43ae-a607-2cfea550d931.json b/data/hfopenllm_v2/Quazim0t0/Alien-CoT-14B-sce/5a09783b-82da-43ae-a607-2cfea550d931.json deleted file mode 100644 index e5578e0bc..000000000 --- a/data/hfopenllm_v2/Quazim0t0/Alien-CoT-14B-sce/5a09783b-82da-43ae-a607-2cfea550d931.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Quazim0t0_Alien-CoT-14B-sce/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Alien-CoT-14B-sce", - "id": "Quazim0t0/Alien-CoT-14B-sce", - "developer": "Quazim0t0", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0749 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6395 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5204 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3918 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy 
on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4785 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.517 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Quazim0t0/Aura-8B-Linear/6c2d191a-a2d1-459c-b2e2-5766bec62ce7.json b/data/hfopenllm_v2/Quazim0t0/Aura-8B-Linear/6c2d191a-a2d1-459c-b2e2-5766bec62ce7.json deleted file mode 100644 index 3a532220d..000000000 --- a/data/hfopenllm_v2/Quazim0t0/Aura-8B-Linear/6c2d191a-a2d1-459c-b2e2-5766bec62ce7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Quazim0t0_Aura-8B-Linear/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Aura-8B-Linear", - "id": "Quazim0t0/Aura-8B-Linear", - "developer": "Quazim0t0", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7948 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5074 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1805 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2693 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3687 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - 
"hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3801 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Quazim0t0/Casa-14b-sce/121cb5fc-2fa2-4718-b325-c40014802e40.json b/data/hfopenllm_v2/Quazim0t0/Casa-14b-sce/121cb5fc-2fa2-4718-b325-c40014802e40.json deleted file mode 100644 index 90b774872..000000000 --- a/data/hfopenllm_v2/Quazim0t0/Casa-14b-sce/121cb5fc-2fa2-4718-b325-c40014802e40.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Quazim0t0_Casa-14b-sce/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Casa-14b-sce", - "id": "Quazim0t0/Casa-14b-sce", - "developer": "Quazim0t0", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6718 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6891 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4985 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3339 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4323 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5408 - } - } - ] -} \ No newline at end of 
file diff --git a/data/hfopenllm_v2/Quazim0t0/Casa-14b-sce/8bbfa040-b16e-4116-ad3e-b3e4e58a7de6.json b/data/hfopenllm_v2/Quazim0t0/Casa-14b-sce/8bbfa040-b16e-4116-ad3e-b3e4e58a7de6.json deleted file mode 100644 index 1cd19c932..000000000 --- a/data/hfopenllm_v2/Quazim0t0/Casa-14b-sce/8bbfa040-b16e-4116-ad3e-b3e4e58a7de6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Quazim0t0_Casa-14b-sce/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Casa-14b-sce", - "id": "Quazim0t0/Casa-14b-sce", - "developer": "Quazim0t0", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6654 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6901 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4698 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3331 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.431 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5426 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Quazim0t0/Charlie-8B-Linear/c8891914-c9fb-4b4d-9592-826f04520e7b.json b/data/hfopenllm_v2/Quazim0t0/Charlie-8B-Linear/c8891914-c9fb-4b4d-9592-826f04520e7b.json deleted file mode 100644 index 4db9f876c..000000000 --- 
a/data/hfopenllm_v2/Quazim0t0/Charlie-8B-Linear/c8891914-c9fb-4b4d-9592-826f04520e7b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Quazim0t0_Charlie-8B-Linear/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Charlie-8B-Linear", - "id": "Quazim0t0/Charlie-8B-Linear", - "developer": "Quazim0t0", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7381 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5141 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2651 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.271 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3485 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3573 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Quazim0t0/Chromatic-8b-sce/e77ffcb3-c7d8-4700-b4ea-fe4e5ba94223.json b/data/hfopenllm_v2/Quazim0t0/Chromatic-8b-sce/e77ffcb3-c7d8-4700-b4ea-fe4e5ba94223.json deleted file mode 100644 index 3005f0449..000000000 --- a/data/hfopenllm_v2/Quazim0t0/Chromatic-8b-sce/e77ffcb3-c7d8-4700-b4ea-fe4e5ba94223.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Quazim0t0_Chromatic-8b-sce/1770682486.623709", - "retrieved_timestamp": 
"1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Chromatic-8b-sce", - "id": "Quazim0t0/Chromatic-8b-sce", - "developer": "Quazim0t0", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5085 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5063 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1556 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3196 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4051 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3755 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Quazim0t0/CoT_Phi/da237415-f34e-4cbb-9a94-3ff621f3df8d.json b/data/hfopenllm_v2/Quazim0t0/CoT_Phi/da237415-f34e-4cbb-9a94-3ff621f3df8d.json deleted file mode 100644 index c2af11b27..000000000 --- a/data/hfopenllm_v2/Quazim0t0/CoT_Phi/da237415-f34e-4cbb-9a94-3ff621f3df8d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Quazim0t0_CoT_Phi/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "CoT_Phi", - "id": "Quazim0t0/CoT_Phi", - "developer": "Quazim0t0", 
- "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6159 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6751 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3308 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3582 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4244 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4901 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Quazim0t0/Dyson-14b/479f3bfa-d614-46a9-88c7-9891852b0d8c.json b/data/hfopenllm_v2/Quazim0t0/Dyson-14b/479f3bfa-d614-46a9-88c7-9891852b0d8c.json deleted file mode 100644 index 05cb9e800..000000000 --- a/data/hfopenllm_v2/Quazim0t0/Dyson-14b/479f3bfa-d614-46a9-88c7-9891852b0d8c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Quazim0t0_Dyson-14b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Dyson-14b", - "id": "Quazim0t0/Dyson-14b", - "developer": "Quazim0t0", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - 
"hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5857 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6863 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5393 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3138 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4259 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5399 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Quazim0t0/Edu-14B-Linear/f5f0c7da-fb03-4023-81a7-801b0729a19d.json b/data/hfopenllm_v2/Quazim0t0/Edu-14B-Linear/f5f0c7da-fb03-4023-81a7-801b0729a19d.json deleted file mode 100644 index ce336e83e..000000000 --- a/data/hfopenllm_v2/Quazim0t0/Edu-14B-Linear/f5f0c7da-fb03-4023-81a7-801b0729a19d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Quazim0t0_Edu-14B-Linear/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Edu-14B-Linear", - "id": "Quazim0t0/Edu-14B-Linear", - "developer": "Quazim0t0", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6158 - } - }, - { - "evaluation_name": 
"BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6758 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2447 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3171 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4378 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5086 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Quazim0t0/Fugazi14b/40f51424-2922-498d-bbbc-d500667a8554.json b/data/hfopenllm_v2/Quazim0t0/Fugazi14b/40f51424-2922-498d-bbbc-d500667a8554.json deleted file mode 100644 index d3ba7ceca..000000000 --- a/data/hfopenllm_v2/Quazim0t0/Fugazi14b/40f51424-2922-498d-bbbc-d500667a8554.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Quazim0t0_Fugazi14b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Fugazi14b", - "id": "Quazim0t0/Fugazi14b", - "developer": "Quazim0t0", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6998 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": 
{ - "score": 0.6941 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4653 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3515 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4546 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5417 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Quazim0t0/GZA-14B-sce/4f25d177-6bcf-4864-87a4-1beb21a7373d.json b/data/hfopenllm_v2/Quazim0t0/GZA-14B-sce/4f25d177-6bcf-4864-87a4-1beb21a7373d.json deleted file mode 100644 index f440931a3..000000000 --- a/data/hfopenllm_v2/Quazim0t0/GZA-14B-sce/4f25d177-6bcf-4864-87a4-1beb21a7373d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Quazim0t0_GZA-14B-sce/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "GZA-14B-sce", - "id": "Quazim0t0/GZA-14B-sce", - "developer": "Quazim0t0", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6274 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6687 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4721 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.302 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4285 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5232 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Quazim0t0/Geedorah-14B/b160ab1f-be6b-4dfa-8fa9-36fc65a64782.json b/data/hfopenllm_v2/Quazim0t0/Geedorah-14B/b160ab1f-be6b-4dfa-8fa9-36fc65a64782.json deleted file mode 100644 index 36ede1fd0..000000000 --- a/data/hfopenllm_v2/Quazim0t0/Geedorah-14B/b160ab1f-be6b-4dfa-8fa9-36fc65a64782.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Quazim0t0_Geedorah-14B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Geedorah-14B", - "id": "Quazim0t0/Geedorah-14B", - "developer": "Quazim0t0", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6873 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6964 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4449 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - 
}, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3473 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4547 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5421 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Quazim0t0/GivingTree-8b-sce/d497a7e3-11c2-4e0c-8788-091caabede56.json b/data/hfopenllm_v2/Quazim0t0/GivingTree-8b-sce/d497a7e3-11c2-4e0c-8788-091caabede56.json deleted file mode 100644 index cab765743..000000000 --- a/data/hfopenllm_v2/Quazim0t0/GivingTree-8b-sce/d497a7e3-11c2-4e0c-8788-091caabede56.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Quazim0t0_GivingTree-8b-sce/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "GivingTree-8b-sce", - "id": "Quazim0t0/GivingTree-8b-sce", - "developer": "Quazim0t0", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5006 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.504 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1526 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3221 - } - }, - { - "evaluation_name": "MUSR", - 
"source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4051 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3761 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Quazim0t0/GuiltySpark-14B-ties/4a55bcf2-e1c1-4fce-8f79-472dae869b26.json b/data/hfopenllm_v2/Quazim0t0/GuiltySpark-14B-ties/4a55bcf2-e1c1-4fce-8f79-472dae869b26.json deleted file mode 100644 index 00575bbe7..000000000 --- a/data/hfopenllm_v2/Quazim0t0/GuiltySpark-14B-ties/4a55bcf2-e1c1-4fce-8f79-472dae869b26.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Quazim0t0_GuiltySpark-14B-ties/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "GuiltySpark-14B-ties", - "id": "Quazim0t0/GuiltySpark-14B-ties", - "developer": "Quazim0t0", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6854 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6914 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3837 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3649 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4557 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.54 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Quazim0t0/Halo-14B-sce/5b00dd5e-0ad3-4ea0-aa0d-2327d610e6a6.json b/data/hfopenllm_v2/Quazim0t0/Halo-14B-sce/5b00dd5e-0ad3-4ea0-aa0d-2327d610e6a6.json deleted file mode 100644 index 8957c0b18..000000000 --- a/data/hfopenllm_v2/Quazim0t0/Halo-14B-sce/5b00dd5e-0ad3-4ea0-aa0d-2327d610e6a6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Quazim0t0_Halo-14B-sce/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Halo-14B-sce", - "id": "Quazim0t0/Halo-14B-sce", - "developer": "Quazim0t0", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6754 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6876 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.429 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3473 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4401 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5376 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Quazim0t0/Heretic1.5b/1c80d383-1ccb-4f32-a63d-dd3954fe5f6b.json b/data/hfopenllm_v2/Quazim0t0/Heretic1.5b/1c80d383-1ccb-4f32-a63d-dd3954fe5f6b.json deleted file mode 100644 index 171487262..000000000 --- a/data/hfopenllm_v2/Quazim0t0/Heretic1.5b/1c80d383-1ccb-4f32-a63d-dd3954fe5f6b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Quazim0t0_Heretic1.5b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Heretic1.5b", - "id": "Quazim0t0/Heretic1.5b", - "developer": "Quazim0t0", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.73 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2062 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3529 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.244 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2685 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3511 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1728 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/Quazim0t0/Hyde-14b-sce/75065074-7ef6-41ac-be7c-496cc458640a.json b/data/hfopenllm_v2/Quazim0t0/Hyde-14b-sce/75065074-7ef6-41ac-be7c-496cc458640a.json deleted file mode 100644 index 305107133..000000000 --- a/data/hfopenllm_v2/Quazim0t0/Hyde-14b-sce/75065074-7ef6-41ac-be7c-496cc458640a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Quazim0t0_Hyde-14b-sce/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Hyde-14b-sce", - "id": "Quazim0t0/Hyde-14b-sce", - "developer": "Quazim0t0", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6715 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6885 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2734 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3414 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4141 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.53 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Quazim0t0/Imagine-v0.5-16bit/49a0287b-48d7-44db-bf20-a084919d332f.json b/data/hfopenllm_v2/Quazim0t0/Imagine-v0.5-16bit/49a0287b-48d7-44db-bf20-a084919d332f.json deleted file mode 100644 index f52a4474a..000000000 --- 
a/data/hfopenllm_v2/Quazim0t0/Imagine-v0.5-16bit/49a0287b-48d7-44db-bf20-a084919d332f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Quazim0t0_Imagine-v0.5-16bit/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Imagine-v0.5-16bit", - "id": "Quazim0t0/Imagine-v0.5-16bit", - "developer": "Quazim0t0", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2759 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6769 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1397 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3649 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4349 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5354 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Quazim0t0/Imbue-14b/7b2861ee-58f9-4ac9-99ee-2ec663e1b157.json b/data/hfopenllm_v2/Quazim0t0/Imbue-14b/7b2861ee-58f9-4ac9-99ee-2ec663e1b157.json deleted file mode 100644 index 8fbf439a2..000000000 --- a/data/hfopenllm_v2/Quazim0t0/Imbue-14b/7b2861ee-58f9-4ac9-99ee-2ec663e1b157.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Quazim0t0_Imbue-14b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - 
"source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Imbue-14b", - "id": "Quazim0t0/Imbue-14b", - "developer": "Quazim0t0", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.52 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6845 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5317 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3129 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4167 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5402 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Quazim0t0/Insom/628542f9-fac6-42a7-8ec5-5cd93f977a7e.json b/data/hfopenllm_v2/Quazim0t0/Insom/628542f9-fac6-42a7-8ec5-5cd93f977a7e.json deleted file mode 100644 index b3398ce09..000000000 --- a/data/hfopenllm_v2/Quazim0t0/Insom/628542f9-fac6-42a7-8ec5-5cd93f977a7e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Quazim0t0_Insom/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Insom", - "id": "Quazim0t0/Insom", - "developer": "Quazim0t0", - "inference_platform": "unknown", - 
"additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6818 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6881 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3852 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3498 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4311 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5352 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Quazim0t0/InspectorDeck-14B-sce/5b0924ae-cf52-4245-a687-91e4b1742c16.json b/data/hfopenllm_v2/Quazim0t0/InspectorDeck-14B-sce/5b0924ae-cf52-4245-a687-91e4b1742c16.json deleted file mode 100644 index 6da062e91..000000000 --- a/data/hfopenllm_v2/Quazim0t0/InspectorDeck-14B-sce/5b0924ae-cf52-4245-a687-91e4b1742c16.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Quazim0t0_InspectorDeck-14B-sce/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "InspectorDeck-14B-sce", - "id": "Quazim0t0/InspectorDeck-14B-sce", - "developer": "Quazim0t0", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - 
"source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3241 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6668 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3165 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.297 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3982 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5261 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Quazim0t0/Jekyl-8b-sce/459c2b98-c3af-4334-a4bc-13334efe49b8.json b/data/hfopenllm_v2/Quazim0t0/Jekyl-8b-sce/459c2b98-c3af-4334-a4bc-13334efe49b8.json deleted file mode 100644 index 6119a139c..000000000 --- a/data/hfopenllm_v2/Quazim0t0/Jekyl-8b-sce/459c2b98-c3af-4334-a4bc-13334efe49b8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Quazim0t0_Jekyl-8b-sce/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Jekyl-8b-sce", - "id": "Quazim0t0/Jekyl-8b-sce", - "developer": "Quazim0t0", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4697 - } - }, - { - 
"evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4994 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1616 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3381 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4197 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3686 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Quazim0t0/Jigsaw-14B-Linear/b2780aa3-d299-4180-8441-dd54e94255cb.json b/data/hfopenllm_v2/Quazim0t0/Jigsaw-14B-Linear/b2780aa3-d299-4180-8441-dd54e94255cb.json deleted file mode 100644 index ab44c9d65..000000000 --- a/data/hfopenllm_v2/Quazim0t0/Jigsaw-14B-Linear/b2780aa3-d299-4180-8441-dd54e94255cb.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Quazim0t0_Jigsaw-14B-Linear/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Jigsaw-14B-Linear", - "id": "Quazim0t0/Jigsaw-14B-Linear", - "developer": "Quazim0t0", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.648 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6865 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2651 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3406 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4483 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5234 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Quazim0t0/Katana-8b-sce/f55d398d-0555-4e89-a37c-def04741a0dd.json b/data/hfopenllm_v2/Quazim0t0/Katana-8b-sce/f55d398d-0555-4e89-a37c-def04741a0dd.json deleted file mode 100644 index 3136f0083..000000000 --- a/data/hfopenllm_v2/Quazim0t0/Katana-8b-sce/f55d398d-0555-4e89-a37c-def04741a0dd.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Quazim0t0_Katana-8b-sce/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Katana-8b-sce", - "id": "Quazim0t0/Katana-8b-sce", - "developer": "Quazim0t0", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5107 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5075 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - 
}, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1511 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3247 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4038 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3771 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Quazim0t0/Knot-CoT-14B-sce/63caf8f8-9e55-4ef6-ae76-ee7184a50675.json b/data/hfopenllm_v2/Quazim0t0/Knot-CoT-14B-sce/63caf8f8-9e55-4ef6-ae76-ee7184a50675.json deleted file mode 100644 index 12f493aa0..000000000 --- a/data/hfopenllm_v2/Quazim0t0/Knot-CoT-14B-sce/63caf8f8-9e55-4ef6-ae76-ee7184a50675.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Quazim0t0_Knot-CoT-14B-sce/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Knot-CoT-14B-sce", - "id": "Quazim0t0/Knot-CoT-14B-sce", - "developer": "Quazim0t0", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4832 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6616 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3995 - } - }, - { - "evaluation_name": "GPQA", 
- "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2936 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.414 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5154 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Quazim0t0/Lineage-14B/f82ccde3-bd3b-499c-8b8c-182822392cea.json b/data/hfopenllm_v2/Quazim0t0/Lineage-14B/f82ccde3-bd3b-499c-8b8c-182822392cea.json deleted file mode 100644 index 2aa939b3a..000000000 --- a/data/hfopenllm_v2/Quazim0t0/Lineage-14B/f82ccde3-bd3b-499c-8b8c-182822392cea.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Quazim0t0_Lineage-14B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Lineage-14B", - "id": "Quazim0t0/Lineage-14B", - "developer": "Quazim0t0", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.707 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6934 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4245 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.3599 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4597 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5411 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Quazim0t0/Lo-Phi-14b/8a52fb4a-d6ae-4c8d-aed0-2137e0a83ea1.json b/data/hfopenllm_v2/Quazim0t0/Lo-Phi-14b/8a52fb4a-d6ae-4c8d-aed0-2137e0a83ea1.json deleted file mode 100644 index 26d2ee23e..000000000 --- a/data/hfopenllm_v2/Quazim0t0/Lo-Phi-14b/8a52fb4a-d6ae-4c8d-aed0-2137e0a83ea1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Quazim0t0_Lo-Phi-14b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Lo-Phi-14b", - "id": "Quazim0t0/Lo-Phi-14b", - "developer": "Quazim0t0", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4941 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6852 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5196 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.328 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4232 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5369 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Quazim0t0/Loke-14B-sce/b7cbc2fb-2c52-4c13-9266-52103421f2ee.json b/data/hfopenllm_v2/Quazim0t0/Loke-14B-sce/b7cbc2fb-2c52-4c13-9266-52103421f2ee.json deleted file mode 100644 index 65be87e3f..000000000 --- a/data/hfopenllm_v2/Quazim0t0/Loke-14B-sce/b7cbc2fb-2c52-4c13-9266-52103421f2ee.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Quazim0t0_Loke-14B-sce/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Loke-14B-sce", - "id": "Quazim0t0/Loke-14B-sce", - "developer": "Quazim0t0", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6848 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6924 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3905 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3649 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4637 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5401 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Quazim0t0/MFDOOM-14B/f4474361-e897-4dbb-a89e-5451a4724474.json b/data/hfopenllm_v2/Quazim0t0/MFDOOM-14B/f4474361-e897-4dbb-a89e-5451a4724474.json deleted file mode 100644 index dc38a1ad9..000000000 --- a/data/hfopenllm_v2/Quazim0t0/MFDOOM-14B/f4474361-e897-4dbb-a89e-5451a4724474.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Quazim0t0_MFDOOM-14B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MFDOOM-14B", - "id": "Quazim0t0/MFDOOM-14B", - "developer": "Quazim0t0", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6736 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6916 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5264 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.323 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4377 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5426 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/Quazim0t0/MFGRIMM-14B/de257b5e-4629-4f8a-b08d-d2ca372593e2.json b/data/hfopenllm_v2/Quazim0t0/MFGRIMM-14B/de257b5e-4629-4f8a-b08d-d2ca372593e2.json deleted file mode 100644 index ed96fac2b..000000000 --- a/data/hfopenllm_v2/Quazim0t0/MFGRIMM-14B/de257b5e-4629-4f8a-b08d-d2ca372593e2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Quazim0t0_MFGRIMM-14B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MFGRIMM-14B", - "id": "Quazim0t0/MFGRIMM-14B", - "developer": "Quazim0t0", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6894 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6909 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.506 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3339 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4361 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5416 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Quazim0t0/Math_Phi4_Reason/a37aada3-104a-488a-898f-245ff257de46.json b/data/hfopenllm_v2/Quazim0t0/Math_Phi4_Reason/a37aada3-104a-488a-898f-245ff257de46.json deleted file mode 100644 index 79be95de5..000000000 --- 
a/data/hfopenllm_v2/Quazim0t0/Math_Phi4_Reason/a37aada3-104a-488a-898f-245ff257de46.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Quazim0t0_Math_Phi4_Reason/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Math_Phi4_Reason", - "id": "Quazim0t0/Math_Phi4_Reason", - "developer": "Quazim0t0", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.322 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.624 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3278 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2903 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4034 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.503 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Quazim0t0/Mithril-14B-sce/d9d655d1-d94c-483a-a3a2-ca196e1391d1.json b/data/hfopenllm_v2/Quazim0t0/Mithril-14B-sce/d9d655d1-d94c-483a-a3a2-ca196e1391d1.json deleted file mode 100644 index 0346b1864..000000000 --- a/data/hfopenllm_v2/Quazim0t0/Mithril-14B-sce/d9d655d1-d94c-483a-a3a2-ca196e1391d1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Quazim0t0_Mithril-14B-sce/1770682486.623709", - "retrieved_timestamp": 
"1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mithril-14B-sce", - "id": "Quazim0t0/Mithril-14B-sce", - "developer": "Quazim0t0", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6958 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6926 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3822 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3691 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4611 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5403 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Quazim0t0/Mononoke-14B-sce/77bf7126-0cb9-43ef-8d23-5f1395f91642.json b/data/hfopenllm_v2/Quazim0t0/Mononoke-14B-sce/77bf7126-0cb9-43ef-8d23-5f1395f91642.json deleted file mode 100644 index 24ae15f0e..000000000 --- a/data/hfopenllm_v2/Quazim0t0/Mononoke-14B-sce/77bf7126-0cb9-43ef-8d23-5f1395f91642.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Quazim0t0_Mononoke-14B-sce/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mononoke-14B-sce", - "id": 
"Quazim0t0/Mononoke-14B-sce", - "developer": "Quazim0t0", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3502 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6744 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4698 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.323 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4155 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5298 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Quazim0t0/Motion-8B-Linear/73f410be-3084-4994-8406-f8ac70880626.json b/data/hfopenllm_v2/Quazim0t0/Motion-8B-Linear/73f410be-3084-4994-8406-f8ac70880626.json deleted file mode 100644 index 6934a4cda..000000000 --- a/data/hfopenllm_v2/Quazim0t0/Motion-8B-Linear/73f410be-3084-4994-8406-f8ac70880626.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Quazim0t0_Motion-8B-Linear/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Motion-8B-Linear", - "id": "Quazim0t0/Motion-8B-Linear", - "developer": "Quazim0t0", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - 
"evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7686 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5084 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1888 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.271 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3606 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3785 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Quazim0t0/Mouse-9B/24caad7a-15fa-4820-91cc-0f544a34d173.json b/data/hfopenllm_v2/Quazim0t0/Mouse-9B/24caad7a-15fa-4820-91cc-0f544a34d173.json deleted file mode 100644 index cd04f17a6..000000000 --- a/data/hfopenllm_v2/Quazim0t0/Mouse-9B/24caad7a-15fa-4820-91cc-0f544a34d173.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Quazim0t0_Mouse-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mouse-9B", - "id": "Quazim0t0/Mouse-9B", - "developer": "Quazim0t0", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 9.207 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, 
- "score_details": { - "score": 0.1325 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2979 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0053 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2542 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.347 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1139 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Quazim0t0/Nova-14b-sce/e087b221-f813-4688-8d98-17980f98ac5b.json b/data/hfopenllm_v2/Quazim0t0/Nova-14b-sce/e087b221-f813-4688-8d98-17980f98ac5b.json deleted file mode 100644 index 5316de52d..000000000 --- a/data/hfopenllm_v2/Quazim0t0/Nova-14b-sce/e087b221-f813-4688-8d98-17980f98ac5b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Quazim0t0_Nova-14b-sce/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Nova-14b-sce", - "id": "Quazim0t0/Nova-14b-sce", - "developer": "Quazim0t0", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7022 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6935 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4162 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3633 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4571 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5413 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Quazim0t0/NovaScotia-14b-stock/f4d03bff-3b34-497f-a17f-0379bc562f11.json b/data/hfopenllm_v2/Quazim0t0/NovaScotia-14b-stock/f4d03bff-3b34-497f-a17f-0379bc562f11.json deleted file mode 100644 index b80b45290..000000000 --- a/data/hfopenllm_v2/Quazim0t0/NovaScotia-14b-stock/f4d03bff-3b34-497f-a17f-0379bc562f11.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Quazim0t0_NovaScotia-14b-stock/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "NovaScotia-14b-stock", - "id": "Quazim0t0/NovaScotia-14b-stock", - "developer": "Quazim0t0", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6787 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6935 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": 
"hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.463 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.349 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4493 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5409 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Quazim0t0/ODB-14B-sce/2ca21612-ea90-41f3-b618-3ea81c09c3ae.json b/data/hfopenllm_v2/Quazim0t0/ODB-14B-sce/2ca21612-ea90-41f3-b618-3ea81c09c3ae.json deleted file mode 100644 index 78c79f5cf..000000000 --- a/data/hfopenllm_v2/Quazim0t0/ODB-14B-sce/2ca21612-ea90-41f3-b618-3ea81c09c3ae.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Quazim0t0_ODB-14B-sce/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ODB-14B-sce", - "id": "Quazim0t0/ODB-14B-sce", - "developer": "Quazim0t0", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Unknown", - "params_billions": 0.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2922 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6559 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2545 - } - }, - { - 
"evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2659 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3929 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5207 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Quazim0t0/ODB-14B-sce/d4dc2088-9911-4966-afe9-022df89dd522.json b/data/hfopenllm_v2/Quazim0t0/ODB-14B-sce/d4dc2088-9911-4966-afe9-022df89dd522.json deleted file mode 100644 index a373a9579..000000000 --- a/data/hfopenllm_v2/Quazim0t0/ODB-14B-sce/d4dc2088-9911-4966-afe9-022df89dd522.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Quazim0t0_ODB-14b-sce/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ODB-14b-sce", - "id": "Quazim0t0/ODB-14b-sce", - "developer": "Quazim0t0", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7016 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6942 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4116 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.3624 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4571 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5411 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Quazim0t0/Oasis-14B-ties/ad03a075-8f24-46f6-ae04-5a04eb7061c1.json b/data/hfopenllm_v2/Quazim0t0/Oasis-14B-ties/ad03a075-8f24-46f6-ae04-5a04eb7061c1.json deleted file mode 100644 index 5c2da9f94..000000000 --- a/data/hfopenllm_v2/Quazim0t0/Oasis-14B-ties/ad03a075-8f24-46f6-ae04-5a04eb7061c1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Quazim0t0_Oasis-14B-ties/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Oasis-14B-ties", - "id": "Quazim0t0/Oasis-14B-ties", - "developer": "Quazim0t0", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6937 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6915 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3754 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3649 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on 
MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4571 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5405 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Quazim0t0/Origami-14B-sce/2d1da226-e65c-48a0-aabb-46b1cf670a82.json b/data/hfopenllm_v2/Quazim0t0/Origami-14B-sce/2d1da226-e65c-48a0-aabb-46b1cf670a82.json deleted file mode 100644 index c9e2044e6..000000000 --- a/data/hfopenllm_v2/Quazim0t0/Origami-14B-sce/2d1da226-e65c-48a0-aabb-46b1cf670a82.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Quazim0t0_Origami-14B-sce/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Origami-14B-sce", - "id": "Quazim0t0/Origami-14B-sce", - "developer": "Quazim0t0", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3259 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.662 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2915 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2836 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4035 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": 
"hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5244 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Quazim0t0/Phi4.Turn.R1Distill.16bit/7fb3a035-2b83-4a58-818f-16fe6d9a8ab3.json b/data/hfopenllm_v2/Quazim0t0/Phi4.Turn.R1Distill.16bit/7fb3a035-2b83-4a58-818f-16fe6d9a8ab3.json deleted file mode 100644 index 75adf6c43..000000000 --- a/data/hfopenllm_v2/Quazim0t0/Phi4.Turn.R1Distill.16bit/7fb3a035-2b83-4a58-818f-16fe6d9a8ab3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Quazim0t0_Phi4.Turn.R1Distill.16bit/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Phi4.Turn.R1Distill.16bit", - "id": "Quazim0t0/Phi4.Turn.R1Distill.16bit", - "developer": "Quazim0t0", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3126 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6563 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2311 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2945 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3902 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.5257 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Quazim0t0/Phi4.Turn.R1Distill_v1.5.1-Tensors/87018726-9f81-47b1-883e-609afea7fb37.json b/data/hfopenllm_v2/Quazim0t0/Phi4.Turn.R1Distill_v1.5.1-Tensors/87018726-9f81-47b1-883e-609afea7fb37.json deleted file mode 100644 index 5b87ab22e..000000000 --- a/data/hfopenllm_v2/Quazim0t0/Phi4.Turn.R1Distill_v1.5.1-Tensors/87018726-9f81-47b1-883e-609afea7fb37.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Quazim0t0_Phi4.Turn.R1Distill_v1.5.1-Tensors/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Phi4.Turn.R1Distill_v1.5.1-Tensors", - "id": "Quazim0t0/Phi4.Turn.R1Distill_v1.5.1-Tensors", - "developer": "Quazim0t0", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2995 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6456 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.219 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2685 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3929 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5117 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/Quazim0t0/Phi4Basis-14B-sce/292b9333-96c7-4fc7-bf35-78bbce9f10d3.json b/data/hfopenllm_v2/Quazim0t0/Phi4Basis-14B-sce/292b9333-96c7-4fc7-bf35-78bbce9f10d3.json deleted file mode 100644 index 0f7720935..000000000 --- a/data/hfopenllm_v2/Quazim0t0/Phi4Basis-14B-sce/292b9333-96c7-4fc7-bf35-78bbce9f10d3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Quazim0t0_Phi4Basis-14B-sce/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Phi4Basis-14B-sce", - "id": "Quazim0t0/Phi4Basis-14B-sce", - "developer": "Quazim0t0", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6502 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6909 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4789 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3289 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4338 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.539 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Quazim0t0/Ponder-14B-linear/b44224c3-ed2c-4120-9e2a-e6286358a4da.json b/data/hfopenllm_v2/Quazim0t0/Ponder-14B-linear/b44224c3-ed2c-4120-9e2a-e6286358a4da.json deleted file mode 100644 index e66c3086b..000000000 --- 
a/data/hfopenllm_v2/Quazim0t0/Ponder-14B-linear/b44224c3-ed2c-4120-9e2a-e6286358a4da.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Quazim0t0_Ponder-14B-linear/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Ponder-14B-linear", - "id": "Quazim0t0/Ponder-14B-linear", - "developer": "Quazim0t0", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6906 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6943 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4282 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3582 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4558 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5408 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Quazim0t0/RZA-14B-sce/f7a2c9af-c55c-4307-bfef-1ca709525d82.json b/data/hfopenllm_v2/Quazim0t0/RZA-14B-sce/f7a2c9af-c55c-4307-bfef-1ca709525d82.json deleted file mode 100644 index eedf689a1..000000000 --- a/data/hfopenllm_v2/Quazim0t0/RZA-14B-sce/f7a2c9af-c55c-4307-bfef-1ca709525d82.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Quazim0t0_RZA-14B-sce/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - 
"source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "RZA-14B-sce", - "id": "Quazim0t0/RZA-14B-sce", - "developer": "Quazim0t0", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4774 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6686 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5189 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2903 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4113 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5383 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Quazim0t0/Rosemary-14b/d9655f35-edfd-4c53-b359-559870e8019e.json b/data/hfopenllm_v2/Quazim0t0/Rosemary-14b/d9655f35-edfd-4c53-b359-559870e8019e.json deleted file mode 100644 index 38d6ae792..000000000 --- a/data/hfopenllm_v2/Quazim0t0/Rosemary-14b/d9655f35-edfd-4c53-b359-559870e8019e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Quazim0t0_Rosemary-14b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Rosemary-14b", - "id": "Quazim0t0/Rosemary-14b", - "developer": "Quazim0t0", - 
"inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6915 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6955 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4388 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3565 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4492 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5396 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Quazim0t0/Rune-14b/afdd962d-652a-4395-92f7-c16dc874a779.json b/data/hfopenllm_v2/Quazim0t0/Rune-14b/afdd962d-652a-4395-92f7-c16dc874a779.json deleted file mode 100644 index b20f38d6e..000000000 --- a/data/hfopenllm_v2/Quazim0t0/Rune-14b/afdd962d-652a-4395-92f7-c16dc874a779.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Quazim0t0_Rune-14b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Rune-14b", - "id": "Quazim0t0/Rune-14b", - "developer": "Quazim0t0", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": 
"google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7016 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6937 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4585 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3515 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4533 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5411 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Quazim0t0/SZA-14B-sce/2594e917-3ebd-428b-8f36-cb0da668695d.json b/data/hfopenllm_v2/Quazim0t0/SZA-14B-sce/2594e917-3ebd-428b-8f36-cb0da668695d.json deleted file mode 100644 index b8cc374a1..000000000 --- a/data/hfopenllm_v2/Quazim0t0/SZA-14B-sce/2594e917-3ebd-428b-8f36-cb0da668695d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Quazim0t0_SZA-14B-sce/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SZA-14B-sce", - "id": "Quazim0t0/SZA-14B-sce", - "developer": "Quazim0t0", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5659 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - 
"dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6889 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5242 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3305 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4339 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5353 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Quazim0t0/Sake-20b/91a86644-ad96-4c66-8691-1c0b531b572c.json b/data/hfopenllm_v2/Quazim0t0/Sake-20b/91a86644-ad96-4c66-8691-1c0b531b572c.json deleted file mode 100644 index a8bf797b3..000000000 --- a/data/hfopenllm_v2/Quazim0t0/Sake-20b/91a86644-ad96-4c66-8691-1c0b531b572c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Quazim0t0_Sake-20b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Sake-20b", - "id": "Quazim0t0/Sake-20b", - "developer": "Quazim0t0", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 21.475 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6693 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.677 - } - }, - { - 
"evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4653 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3188 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4494 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5391 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Quazim0t0/Spok-14b-sce/331f56ce-5e45-46d8-9143-3f66be20b699.json b/data/hfopenllm_v2/Quazim0t0/Spok-14b-sce/331f56ce-5e45-46d8-9143-3f66be20b699.json deleted file mode 100644 index ee6d0d2f3..000000000 --- a/data/hfopenllm_v2/Quazim0t0/Spok-14b-sce/331f56ce-5e45-46d8-9143-3f66be20b699.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Quazim0t0_Spok-14b-sce/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Spok-14b-sce", - "id": "Quazim0t0/Spok-14b-sce", - "developer": "Quazim0t0", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6682 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6899 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2719 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3456 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4141 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5298 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Quazim0t0/Sumatra-20b/6138ebe0-8483-4cfb-8d95-b334bb09e831.json b/data/hfopenllm_v2/Quazim0t0/Sumatra-20b/6138ebe0-8483-4cfb-8d95-b334bb09e831.json deleted file mode 100644 index e73a1374c..000000000 --- a/data/hfopenllm_v2/Quazim0t0/Sumatra-20b/6138ebe0-8483-4cfb-8d95-b334bb09e831.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Quazim0t0_Sumatra-20b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Sumatra-20b", - "id": "Quazim0t0/Sumatra-20b", - "developer": "Quazim0t0", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 21.475 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6738 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6855 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3671 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3263 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.456 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5415 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Quazim0t0/SuperNova14b/4d16dd47-42d1-4ea6-8f1b-dc50648bceab.json b/data/hfopenllm_v2/Quazim0t0/SuperNova14b/4d16dd47-42d1-4ea6-8f1b-dc50648bceab.json deleted file mode 100644 index ae0580d15..000000000 --- a/data/hfopenllm_v2/Quazim0t0/SuperNova14b/4d16dd47-42d1-4ea6-8f1b-dc50648bceab.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Quazim0t0_SuperNova14b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SuperNova14b", - "id": "Quazim0t0/SuperNova14b", - "developer": "Quazim0t0", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7076 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6937 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4396 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3523 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": 
"hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4545 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5435 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Quazim0t0/TB0-8B-sce/a6b0f2bf-08da-472f-b858-8be967a44cdc.json b/data/hfopenllm_v2/Quazim0t0/TB0-8B-sce/a6b0f2bf-08da-472f-b858-8be967a44cdc.json deleted file mode 100644 index 3f9a2e43e..000000000 --- a/data/hfopenllm_v2/Quazim0t0/TB0-8B-sce/a6b0f2bf-08da-472f-b858-8be967a44cdc.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Quazim0t0_TB0-8B-sce/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "TB0-8B-sce", - "id": "Quazim0t0/TB0-8B-sce", - "developer": "Quazim0t0", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5107 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5075 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1511 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3247 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4038 - } - }, - { - "evaluation_name": 
"MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3771 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Quazim0t0/TBL-8B-sce/57c7553d-f3e5-4a31-8c16-66aae570d8ec.json b/data/hfopenllm_v2/Quazim0t0/TBL-8B-sce/57c7553d-f3e5-4a31-8c16-66aae570d8ec.json deleted file mode 100644 index dbe71df0d..000000000 --- a/data/hfopenllm_v2/Quazim0t0/TBL-8B-sce/57c7553d-f3e5-4a31-8c16-66aae570d8ec.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Quazim0t0_TBL-8B-sce/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "TBL-8B-sce", - "id": "Quazim0t0/TBL-8B-sce", - "developer": "Quazim0t0", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4581 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5008 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1533 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3339 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4236 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - 
}, - "score_details": { - "score": 0.3689 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Quazim0t0/ThinkPhi1.1-Tensors/58c31bdd-f86f-4fbb-8549-191bb9f46f02.json b/data/hfopenllm_v2/Quazim0t0/ThinkPhi1.1-Tensors/58c31bdd-f86f-4fbb-8549-191bb9f46f02.json deleted file mode 100644 index 2cd01867f..000000000 --- a/data/hfopenllm_v2/Quazim0t0/ThinkPhi1.1-Tensors/58c31bdd-f86f-4fbb-8549-191bb9f46f02.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Quazim0t0_ThinkPhi1.1-Tensors/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ThinkPhi1.1-Tensors", - "id": "Quazim0t0/ThinkPhi1.1-Tensors", - "developer": "Quazim0t0", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3908 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6449 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.182 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2987 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.418 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4908 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Quazim0t0/Venti-20b/dd25c1dd-0edf-44ca-b18c-633dbd47368f.json 
b/data/hfopenllm_v2/Quazim0t0/Venti-20b/dd25c1dd-0edf-44ca-b18c-633dbd47368f.json deleted file mode 100644 index 34f0d31fe..000000000 --- a/data/hfopenllm_v2/Quazim0t0/Venti-20b/dd25c1dd-0edf-44ca-b18c-633dbd47368f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Quazim0t0_Venti-20b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Venti-20b", - "id": "Quazim0t0/Venti-20b", - "developer": "Quazim0t0", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 21.475 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6641 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6901 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3391 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3322 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.448 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5386 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Quazim0t0/Venti-Blend-sce/2a030613-b5f7-4393-ac39-d2d072c913dc.json b/data/hfopenllm_v2/Quazim0t0/Venti-Blend-sce/2a030613-b5f7-4393-ac39-d2d072c913dc.json deleted file mode 100644 index 2f424d5ab..000000000 --- a/data/hfopenllm_v2/Quazim0t0/Venti-Blend-sce/2a030613-b5f7-4393-ac39-d2d072c913dc.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - 
"evaluation_id": "hfopenllm_v2/Quazim0t0_Venti-Blend-sce/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Venti-Blend-sce", - "id": "Quazim0t0/Venti-Blend-sce", - "developer": "Quazim0t0", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 21.475 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6879 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6843 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4056 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3163 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4389 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5414 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Quazim0t0/Vine-14b-sce/f8c73290-c400-4f1f-a00a-516592497b0d.json b/data/hfopenllm_v2/Quazim0t0/Vine-14b-sce/f8c73290-c400-4f1f-a00a-516592497b0d.json deleted file mode 100644 index 947f1d53a..000000000 --- a/data/hfopenllm_v2/Quazim0t0/Vine-14b-sce/f8c73290-c400-4f1f-a00a-516592497b0d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Quazim0t0_Vine-14b-sce/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - 
"evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Vine-14b-sce", - "id": "Quazim0t0/Vine-14b-sce", - "developer": "Quazim0t0", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6733 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6891 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5008 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3339 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4323 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5408 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Quazim0t0/Wendy-14B/b31908fc-5e7e-45d6-835f-4e86a05b23fb.json b/data/hfopenllm_v2/Quazim0t0/Wendy-14B/b31908fc-5e7e-45d6-835f-4e86a05b23fb.json deleted file mode 100644 index 385c8931d..000000000 --- a/data/hfopenllm_v2/Quazim0t0/Wendy-14B/b31908fc-5e7e-45d6-835f-4e86a05b23fb.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Quazim0t0_Wendy-14B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Wendy-14B", - "id": "Quazim0t0/Wendy-14B", - "developer": "Quazim0t0", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } 
- }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6772 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6958 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4834 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3322 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4428 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5435 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Quazim0t0/Wu-14b-sce/4320cb98-7f9f-4510-bb88-448ce231bae8.json b/data/hfopenllm_v2/Quazim0t0/Wu-14b-sce/4320cb98-7f9f-4510-bb88-448ce231bae8.json deleted file mode 100644 index 8fbcfb8f2..000000000 --- a/data/hfopenllm_v2/Quazim0t0/Wu-14b-sce/4320cb98-7f9f-4510-bb88-448ce231bae8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Quazim0t0_Wu-14b-sce/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Wu-14b-sce", - "id": "Quazim0t0/Wu-14b-sce", - "developer": "Quazim0t0", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6718 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6885 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2613 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3465 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4114 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5293 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Quazim0t0/bloom-14b-stock/28b986d1-2e67-4462-9165-6cb8f260b6c6.json b/data/hfopenllm_v2/Quazim0t0/bloom-14b-stock/28b986d1-2e67-4462-9165-6cb8f260b6c6.json deleted file mode 100644 index f549634bc..000000000 --- a/data/hfopenllm_v2/Quazim0t0/bloom-14b-stock/28b986d1-2e67-4462-9165-6cb8f260b6c6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Quazim0t0_bloom-14b-stock/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "bloom-14b-stock", - "id": "Quazim0t0/bloom-14b-stock", - "developer": "Quazim0t0", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6575 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6878 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4811 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3314 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.431 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5373 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Quazim0t0/caramel-14B/fe1e21cb-7934-4022-a74a-777172310021.json b/data/hfopenllm_v2/Quazim0t0/caramel-14B/fe1e21cb-7934-4022-a74a-777172310021.json deleted file mode 100644 index fc8e481dc..000000000 --- a/data/hfopenllm_v2/Quazim0t0/caramel-14B/fe1e21cb-7934-4022-a74a-777172310021.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Quazim0t0_caramel-14B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "caramel-14B", - "id": "Quazim0t0/caramel-14B", - "developer": "Quazim0t0", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6745 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6919 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - 
"source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4713 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3448 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4454 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5436 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Quazim0t0/graphite-14b-sce/90871638-b828-484d-8822-95ffceb20909.json b/data/hfopenllm_v2/Quazim0t0/graphite-14b-sce/90871638-b828-484d-8822-95ffceb20909.json deleted file mode 100644 index b45a1e117..000000000 --- a/data/hfopenllm_v2/Quazim0t0/graphite-14b-sce/90871638-b828-484d-8822-95ffceb20909.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Quazim0t0_graphite-14b-sce/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "graphite-14b-sce", - "id": "Quazim0t0/graphite-14b-sce", - "developer": "Quazim0t0", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3217 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6631 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, 
- "score_details": { - "score": 0.3006 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2894 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3981 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.528 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Quazim0t0/mocha-14B/04a98dfb-8e96-444c-8df4-ed7cf72a26ea.json b/data/hfopenllm_v2/Quazim0t0/mocha-14B/04a98dfb-8e96-444c-8df4-ed7cf72a26ea.json deleted file mode 100644 index c48422312..000000000 --- a/data/hfopenllm_v2/Quazim0t0/mocha-14B/04a98dfb-8e96-444c-8df4-ed7cf72a26ea.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Quazim0t0_mocha-14B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "mocha-14B", - "id": "Quazim0t0/mocha-14B", - "developer": "Quazim0t0", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5893 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6895 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5264 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3305 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4272 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5384 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Quazim0t0/mosaic-14b-sce/8c5c22af-f230-4d34-b80d-f42ef27e1675.json b/data/hfopenllm_v2/Quazim0t0/mosaic-14b-sce/8c5c22af-f230-4d34-b80d-f42ef27e1675.json deleted file mode 100644 index e02cc70c3..000000000 --- a/data/hfopenllm_v2/Quazim0t0/mosaic-14b-sce/8c5c22af-f230-4d34-b80d-f42ef27e1675.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Quazim0t0_mosaic-14b-sce/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "mosaic-14b-sce", - "id": "Quazim0t0/mosaic-14b-sce", - "developer": "Quazim0t0", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6876 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6907 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4026 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3624 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4558 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5396 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Quazim0t0/tesseract-14b-stock/f3466a90-541b-4a08-a9c6-d5a79b2299b0.json b/data/hfopenllm_v2/Quazim0t0/tesseract-14b-stock/f3466a90-541b-4a08-a9c6-d5a79b2299b0.json deleted file mode 100644 index f5ff78824..000000000 --- a/data/hfopenllm_v2/Quazim0t0/tesseract-14b-stock/f3466a90-541b-4a08-a9c6-d5a79b2299b0.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Quazim0t0_tesseract-14b-stock/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "tesseract-14b-stock", - "id": "Quazim0t0/tesseract-14b-stock", - "developer": "Quazim0t0", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5848 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.688 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5144 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3272 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4232 - } - }, - { - "evaluation_name": "MMLU-PRO", - 
"source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5389 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Quazim0t0/time-14b-stock/ef9ee5ae-d92b-4143-af1b-d62a7c3c7fd4.json b/data/hfopenllm_v2/Quazim0t0/time-14b-stock/ef9ee5ae-d92b-4143-af1b-d62a7c3c7fd4.json deleted file mode 100644 index 763d06ba9..000000000 --- a/data/hfopenllm_v2/Quazim0t0/time-14b-stock/ef9ee5ae-d92b-4143-af1b-d62a7c3c7fd4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Quazim0t0_time-14b-stock/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "time-14b-stock", - "id": "Quazim0t0/time-14b-stock", - "developer": "Quazim0t0", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6699 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6897 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5083 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3347 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4323 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.5419 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Qwen/QwQ-32B-Preview/859af708-ac37-4749-bc06-73d92338d1f5.json b/data/hfopenllm_v2/Qwen/QwQ-32B-Preview/859af708-ac37-4749-bc06-73d92338d1f5.json deleted file mode 100644 index 9072fe047..000000000 --- a/data/hfopenllm_v2/Qwen/QwQ-32B-Preview/859af708-ac37-4749-bc06-73d92338d1f5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Qwen_QwQ-32B-Preview/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "QwQ-32B-Preview", - "id": "Qwen/QwQ-32B-Preview", - "developer": "Qwen", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 32.764 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4035 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6691 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4494 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2819 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.411 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5678 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Qwen/QwQ-32B/e274380d-e0f7-47c3-afc3-e603e6cecf9e.json b/data/hfopenllm_v2/Qwen/QwQ-32B/e274380d-e0f7-47c3-afc3-e603e6cecf9e.json deleted file mode 100644 index 
8a460b8de..000000000 --- a/data/hfopenllm_v2/Qwen/QwQ-32B/e274380d-e0f7-47c3-afc3-e603e6cecf9e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Qwen_QwQ-32B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "QwQ-32B", - "id": "Qwen/QwQ-32B", - "developer": "Qwen", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 32.764 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3977 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2983 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1609 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2601 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4206 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1196 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Qwen/Qwen1.5-0.5B-Chat/19810be8-ea81-4db5-9854-1830b05a5732.json b/data/hfopenllm_v2/Qwen/Qwen1.5-0.5B-Chat/19810be8-ea81-4db5-9854-1830b05a5732.json deleted file mode 100644 index d12f434fb..000000000 --- a/data/hfopenllm_v2/Qwen/Qwen1.5-0.5B-Chat/19810be8-ea81-4db5-9854-1830b05a5732.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Qwen_Qwen1.5-0.5B-Chat/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - 
"source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen1.5-0.5B-Chat", - "id": "Qwen/Qwen1.5-0.5B-Chat", - "developer": "Qwen", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.62 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1807 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3167 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0068 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2693 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3837 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1213 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Qwen/Qwen1.5-0.5B/1258c282-3672-4b42-9d4d-117568e17bf5.json b/data/hfopenllm_v2/Qwen/Qwen1.5-0.5B/1258c282-3672-4b42-9d4d-117568e17bf5.json deleted file mode 100644 index b3a7d8e15..000000000 --- a/data/hfopenllm_v2/Qwen/Qwen1.5-0.5B/1258c282-3672-4b42-9d4d-117568e17bf5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Qwen_Qwen1.5-0.5B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen1.5-0.5B", - "id": "Qwen/Qwen1.5-0.5B", - "developer": "Qwen", - "inference_platform": "unknown", - 
"additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.62 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1706 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3154 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0174 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2542 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3616 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1307 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Qwen/Qwen1.5-1.8B-Chat/9b9f6e01-238e-4893-b398-4e1c83c44dfa.json b/data/hfopenllm_v2/Qwen/Qwen1.5-1.8B-Chat/9b9f6e01-238e-4893-b398-4e1c83c44dfa.json deleted file mode 100644 index 5d7e32fc8..000000000 --- a/data/hfopenllm_v2/Qwen/Qwen1.5-1.8B-Chat/9b9f6e01-238e-4893-b398-4e1c83c44dfa.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Qwen_Qwen1.5-1.8B-Chat/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen1.5-1.8B-Chat", - "id": "Qwen/Qwen1.5-1.8B-Chat", - "developer": "Qwen", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.837 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": 
"google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2019 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3256 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0196 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2978 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.426 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1804 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Qwen/Qwen1.5-1.8B/b267621b-dbba-4c4a-bb9f-fa85734d0f59.json b/data/hfopenllm_v2/Qwen/Qwen1.5-1.8B/b267621b-dbba-4c4a-bb9f-fa85734d0f59.json deleted file mode 100644 index c4b4b7c05..000000000 --- a/data/hfopenllm_v2/Qwen/Qwen1.5-1.8B/b267621b-dbba-4c4a-bb9f-fa85734d0f59.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Qwen_Qwen1.5-1.8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen1.5-1.8B", - "id": "Qwen/Qwen1.5-1.8B", - "developer": "Qwen", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.837 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2154 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - 
"source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3476 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0317 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3054 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3605 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1882 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Qwen/Qwen1.5-110B-Chat/a7e4e787-8e95-48a0-9d50-53ba9f05cd1c.json b/data/hfopenllm_v2/Qwen/Qwen1.5-110B-Chat/a7e4e787-8e95-48a0-9d50-53ba9f05cd1c.json deleted file mode 100644 index 1539af866..000000000 --- a/data/hfopenllm_v2/Qwen/Qwen1.5-110B-Chat/a7e4e787-8e95-48a0-9d50-53ba9f05cd1c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Qwen_Qwen1.5-110B-Chat/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen1.5-110B-Chat", - "id": "Qwen/Qwen1.5-110B-Chat", - "developer": "Qwen", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 111.21 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5939 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6184 - } - }, - { - 
"evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2341 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3414 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4522 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4825 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Qwen/Qwen1.5-110B/3d39dcab-55df-4ad3-bdc8-03ae684e4390.json b/data/hfopenllm_v2/Qwen/Qwen1.5-110B/3d39dcab-55df-4ad3-bdc8-03ae684e4390.json deleted file mode 100644 index 3d0d4bcdf..000000000 --- a/data/hfopenllm_v2/Qwen/Qwen1.5-110B/3d39dcab-55df-4ad3-bdc8-03ae684e4390.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Qwen_Qwen1.5-110B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen1.5-110B", - "id": "Qwen/Qwen1.5-110B", - "developer": "Qwen", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 111.21 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3422 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.61 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.247 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3523 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4408 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5361 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Qwen/Qwen1.5-14B-Chat/1b499881-9edb-4626-a919-977393d6bef1.json b/data/hfopenllm_v2/Qwen/Qwen1.5-14B-Chat/1b499881-9edb-4626-a919-977393d6bef1.json deleted file mode 100644 index c47867900..000000000 --- a/data/hfopenllm_v2/Qwen/Qwen1.5-14B-Chat/1b499881-9edb-4626-a919-977393d6bef1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Qwen_Qwen1.5-14B-Chat/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen1.5-14B-Chat", - "id": "Qwen/Qwen1.5-14B-Chat", - "developer": "Qwen", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.167 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4768 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5229 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1526 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy 
on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2701 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.44 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3618 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Qwen/Qwen1.5-14B/84b8970c-6c29-4ee1-93b8-c97e4a7c4950.json b/data/hfopenllm_v2/Qwen/Qwen1.5-14B/84b8970c-6c29-4ee1-93b8-c97e4a7c4950.json deleted file mode 100644 index d12634623..000000000 --- a/data/hfopenllm_v2/Qwen/Qwen1.5-14B/84b8970c-6c29-4ee1-93b8-c97e4a7c4950.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Qwen_Qwen1.5-14B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen1.5-14B", - "id": "Qwen/Qwen1.5-14B", - "developer": "Qwen", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.167 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2905 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.508 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2024 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2945 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4186 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3644 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Qwen/Qwen1.5-32B-Chat/2e070663-2622-4a8e-bd39-7f0ef9df399e.json b/data/hfopenllm_v2/Qwen/Qwen1.5-32B-Chat/2e070663-2622-4a8e-bd39-7f0ef9df399e.json deleted file mode 100644 index 695a36dcd..000000000 --- a/data/hfopenllm_v2/Qwen/Qwen1.5-32B-Chat/2e070663-2622-4a8e-bd39-7f0ef9df399e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Qwen_Qwen1.5-32B-Chat/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen1.5-32B-Chat", - "id": "Qwen/Qwen1.5-32B-Chat", - "developer": "Qwen", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 32.512 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5532 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6067 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1956 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3062 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.416 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - 
"source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4457 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Qwen/Qwen1.5-32B/047fa91e-2dc7-4881-8254-3dfbd4a2ff1b.json b/data/hfopenllm_v2/Qwen/Qwen1.5-32B/047fa91e-2dc7-4881-8254-3dfbd4a2ff1b.json deleted file mode 100644 index ff8b2ab29..000000000 --- a/data/hfopenllm_v2/Qwen/Qwen1.5-32B/047fa91e-2dc7-4881-8254-3dfbd4a2ff1b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Qwen_Qwen1.5-32B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen1.5-32B", - "id": "Qwen/Qwen1.5-32B", - "developer": "Qwen", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 32.512 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3297 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5715 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3029 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3297 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4278 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.45 - } - } - ] -} \ No newline at end of file 
diff --git a/data/hfopenllm_v2/Qwen/Qwen1.5-4B-Chat/6d73016e-078e-4ffe-b2ae-5b829d1456df.json b/data/hfopenllm_v2/Qwen/Qwen1.5-4B-Chat/6d73016e-078e-4ffe-b2ae-5b829d1456df.json deleted file mode 100644 index 3d23aaf52..000000000 --- a/data/hfopenllm_v2/Qwen/Qwen1.5-4B-Chat/6d73016e-078e-4ffe-b2ae-5b829d1456df.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Qwen_Qwen1.5-4B-Chat/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen1.5-4B-Chat", - "id": "Qwen/Qwen1.5-4B-Chat", - "developer": "Qwen", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.95 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3157 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4006 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0279 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2668 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3978 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2396 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Qwen/Qwen1.5-4B/0b68b5bd-d22c-4194-9ddf-f22e9181f84d.json b/data/hfopenllm_v2/Qwen/Qwen1.5-4B/0b68b5bd-d22c-4194-9ddf-f22e9181f84d.json deleted file mode 100644 index b317e8f1c..000000000 --- a/data/hfopenllm_v2/Qwen/Qwen1.5-4B/0b68b5bd-d22c-4194-9ddf-f22e9181f84d.json 
+++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Qwen_Qwen1.5-4B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen1.5-4B", - "id": "Qwen/Qwen1.5-4B", - "developer": "Qwen", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.95 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2445 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4054 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0529 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2768 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3604 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.246 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Qwen/Qwen1.5-7B-Chat/03d51d90-fd15-42b7-ad5f-c7326cc642a7.json b/data/hfopenllm_v2/Qwen/Qwen1.5-7B-Chat/03d51d90-fd15-42b7-ad5f-c7326cc642a7.json deleted file mode 100644 index 7865c5891..000000000 --- a/data/hfopenllm_v2/Qwen/Qwen1.5-7B-Chat/03d51d90-fd15-42b7-ad5f-c7326cc642a7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Qwen_Qwen1.5-7B-Chat/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging 
Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen1.5-7B-Chat", - "id": "Qwen/Qwen1.5-7B-Chat", - "developer": "Qwen", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.721 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4371 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.451 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0627 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3029 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3779 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2951 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Qwen/Qwen1.5-7B/d3e5c939-c53a-49d6-80cd-34420dbb176a.json b/data/hfopenllm_v2/Qwen/Qwen1.5-7B/d3e5c939-c53a-49d6-80cd-34420dbb176a.json deleted file mode 100644 index 29ccd9046..000000000 --- a/data/hfopenllm_v2/Qwen/Qwen1.5-7B/d3e5c939-c53a-49d6-80cd-34420dbb176a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Qwen_Qwen1.5-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen1.5-7B", - "id": "Qwen/Qwen1.5-7B", - "developer": "Qwen", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.721 - } - }, - 
"evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2684 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.456 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0929 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2987 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4103 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2916 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Qwen/Qwen1.5-MoE-A2.7B-Chat/ab321358-26f9-4577-a5fb-1f5d4b8784b4.json b/data/hfopenllm_v2/Qwen/Qwen1.5-MoE-A2.7B-Chat/ab321358-26f9-4577-a5fb-1f5d4b8784b4.json deleted file mode 100644 index c414d02d4..000000000 --- a/data/hfopenllm_v2/Qwen/Qwen1.5-MoE-A2.7B-Chat/ab321358-26f9-4577-a5fb-1f5d4b8784b4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Qwen_Qwen1.5-MoE-A2.7B-Chat/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen1.5-MoE-A2.7B-Chat", - "id": "Qwen/Qwen1.5-MoE-A2.7B-Chat", - "developer": "Qwen", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2MoeForCausalLM", - "params_billions": 14.316 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3795 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4272 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0634 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2743 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3899 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2923 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Qwen/Qwen1.5-MoE-A2.7B/a43aae68-f12c-4a6d-b846-c498cf35f6cd.json b/data/hfopenllm_v2/Qwen/Qwen1.5-MoE-A2.7B/a43aae68-f12c-4a6d-b846-c498cf35f6cd.json deleted file mode 100644 index 07658bbfa..000000000 --- a/data/hfopenllm_v2/Qwen/Qwen1.5-MoE-A2.7B/a43aae68-f12c-4a6d-b846-c498cf35f6cd.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Qwen_Qwen1.5-MoE-A2.7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen1.5-MoE-A2.7B", - "id": "Qwen/Qwen1.5-MoE-A2.7B", - "developer": "Qwen", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2MoeForCausalLM", - "params_billions": 14.316 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.266 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" 
- }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4114 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0929 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2592 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4013 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2778 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Qwen/Qwen2-0.5B-Instruct/b84615c0-43c4-49ec-83fe-5d3f8e6026af.json b/data/hfopenllm_v2/Qwen/Qwen2-0.5B-Instruct/b84615c0-43c4-49ec-83fe-5d3f8e6026af.json deleted file mode 100644 index d6e75e508..000000000 --- a/data/hfopenllm_v2/Qwen/Qwen2-0.5B-Instruct/b84615c0-43c4-49ec-83fe-5d3f8e6026af.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Qwen_Qwen2-0.5B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2-0.5B-Instruct", - "id": "Qwen/Qwen2-0.5B-Instruct", - "developer": "Qwen", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.494 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2247 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3173 - } - }, - { - "evaluation_name": "MATH Level 5", - 
"source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0287 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2466 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3353 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1531 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Qwen/Qwen2-0.5B/7e687d24-9e12-4ecf-b283-e222efb9473a.json b/data/hfopenllm_v2/Qwen/Qwen2-0.5B/7e687d24-9e12-4ecf-b283-e222efb9473a.json deleted file mode 100644 index 164002250..000000000 --- a/data/hfopenllm_v2/Qwen/Qwen2-0.5B/7e687d24-9e12-4ecf-b283-e222efb9473a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Qwen_Qwen2-0.5B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2-0.5B", - "id": "Qwen/Qwen2-0.5B", - "developer": "Qwen", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.494 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1873 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3239 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.0264 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2609 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3752 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.172 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Qwen/Qwen2-1.5B-Instruct/4aea143c-28fd-48bb-b911-37ac3fe58220.json b/data/hfopenllm_v2/Qwen/Qwen2-1.5B-Instruct/4aea143c-28fd-48bb-b911-37ac3fe58220.json deleted file mode 100644 index 29c2d61f9..000000000 --- a/data/hfopenllm_v2/Qwen/Qwen2-1.5B-Instruct/4aea143c-28fd-48bb-b911-37ac3fe58220.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Qwen_Qwen2-1.5B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2-1.5B-Instruct", - "id": "Qwen/Qwen2-1.5B-Instruct", - "developer": "Qwen", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.544 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3371 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3852 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0718 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2617 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4293 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2501 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Qwen/Qwen2-1.5B/34a8daec-bfff-4cf4-9011-0542b30c1d10.json b/data/hfopenllm_v2/Qwen/Qwen2-1.5B/34a8daec-bfff-4cf4-9011-0542b30c1d10.json deleted file mode 100644 index 290263529..000000000 --- a/data/hfopenllm_v2/Qwen/Qwen2-1.5B/34a8daec-bfff-4cf4-9011-0542b30c1d10.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Qwen_Qwen2-1.5B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2-1.5B", - "id": "Qwen/Qwen2-1.5B", - "developer": "Qwen", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.544 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2113 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3575 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0702 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2643 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3658 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2552 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Qwen/Qwen2-57B-A14B-Instruct/3e919d7b-53db-41fb-ac93-224e2768b9c6.json b/data/hfopenllm_v2/Qwen/Qwen2-57B-A14B-Instruct/3e919d7b-53db-41fb-ac93-224e2768b9c6.json deleted file mode 100644 index c5c5b3781..000000000 --- a/data/hfopenllm_v2/Qwen/Qwen2-57B-A14B-Instruct/3e919d7b-53db-41fb-ac93-224e2768b9c6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Qwen_Qwen2-57B-A14B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2-57B-A14B-Instruct", - "id": "Qwen/Qwen2-57B-A14B-Instruct", - "developer": "Qwen", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2MoeForCausalLM", - "params_billions": 57.409 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6338 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5888 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2817 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3314 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4361 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": 
{ - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4575 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Qwen/Qwen2-57B-A14B/66becca1-d92b-409f-ab56-44d05cac66fd.json b/data/hfopenllm_v2/Qwen/Qwen2-57B-A14B/66becca1-d92b-409f-ab56-44d05cac66fd.json deleted file mode 100644 index b8c0fe259..000000000 --- a/data/hfopenllm_v2/Qwen/Qwen2-57B-A14B/66becca1-d92b-409f-ab56-44d05cac66fd.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Qwen_Qwen2-57B-A14B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2-57B-A14B", - "id": "Qwen/Qwen2-57B-A14B", - "developer": "Qwen", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2MoeForCausalLM", - "params_billions": 57.409 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3113 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5618 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1866 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3062 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4174 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.4916 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Qwen/Qwen2-72B-Instruct/6293b269-7c4c-44da-bd85-e51954c173a1.json b/data/hfopenllm_v2/Qwen/Qwen2-72B-Instruct/6293b269-7c4c-44da-bd85-e51954c173a1.json deleted file mode 100644 index f6edd8027..000000000 --- a/data/hfopenllm_v2/Qwen/Qwen2-72B-Instruct/6293b269-7c4c-44da-bd85-e51954c173a1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Qwen_Qwen2-72B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2-72B-Instruct", - "id": "Qwen/Qwen2-72B-Instruct", - "developer": "Qwen", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 72.706 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7989 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6977 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4177 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3725 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.456 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5403 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Qwen/Qwen2-72B/add3b058-e7bc-4b7b-bb98-0d7039979072.json b/data/hfopenllm_v2/Qwen/Qwen2-72B/add3b058-e7bc-4b7b-bb98-0d7039979072.json deleted file mode 100644 index fbfd29d40..000000000 --- 
a/data/hfopenllm_v2/Qwen/Qwen2-72B/add3b058-e7bc-4b7b-bb98-0d7039979072.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Qwen_Qwen2-72B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2-72B", - "id": "Qwen/Qwen2-72B", - "developer": "Qwen", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 72.706 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3824 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6617 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3112 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3943 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4704 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5731 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Qwen/Qwen2-7B-Instruct/db0b6b3f-e5a9-4367-ab87-e58d5c6ccd81.json b/data/hfopenllm_v2/Qwen/Qwen2-7B-Instruct/db0b6b3f-e5a9-4367-ab87-e58d5c6ccd81.json deleted file mode 100644 index 14d4c60a1..000000000 --- a/data/hfopenllm_v2/Qwen/Qwen2-7B-Instruct/db0b6b3f-e5a9-4367-ab87-e58d5c6ccd81.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Qwen_Qwen2-7B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open 
LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2-7B-Instruct", - "id": "Qwen/Qwen2-7B-Instruct", - "developer": "Qwen", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5679 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5545 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2764 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2978 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3928 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3847 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Qwen/Qwen2-7B/54b055d0-80ae-4bba-b729-bd77b3ec7502.json b/data/hfopenllm_v2/Qwen/Qwen2-7B/54b055d0-80ae-4bba-b729-bd77b3ec7502.json deleted file mode 100644 index e3631e1e9..000000000 --- a/data/hfopenllm_v2/Qwen/Qwen2-7B/54b055d0-80ae-4bba-b729-bd77b3ec7502.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Qwen_Qwen2-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2-7B", - "id": "Qwen/Qwen2-7B", - "developer": "Qwen", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - 
"architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3149 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5315 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2039 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3045 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4439 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4183 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Qwen/Qwen2-Math-72B-Instruct/5c22d0b3-5082-4c6e-865c-71da03cf9378.json b/data/hfopenllm_v2/Qwen/Qwen2-Math-72B-Instruct/5c22d0b3-5082-4c6e-865c-71da03cf9378.json deleted file mode 100644 index a24afa6b4..000000000 --- a/data/hfopenllm_v2/Qwen/Qwen2-Math-72B-Instruct/5c22d0b3-5082-4c6e-865c-71da03cf9378.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Qwen_Qwen2-Math-72B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2-Math-72B-Instruct", - "id": "Qwen/Qwen2-Math-72B-Instruct", - "developer": "Qwen", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 72.706 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5694 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6343 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5536 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3683 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4517 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4273 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Qwen/Qwen2-Math-7B/f8e5ee9f-519d-4ed8-bd2a-88897075f401.json b/data/hfopenllm_v2/Qwen/Qwen2-Math-7B/f8e5ee9f-519d-4ed8-bd2a-88897075f401.json deleted file mode 100644 index 93ad6f156..000000000 --- a/data/hfopenllm_v2/Qwen/Qwen2-Math-7B/f8e5ee9f-519d-4ed8-bd2a-88897075f401.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Qwen_Qwen2-Math-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2-Math-7B", - "id": "Qwen/Qwen2-Math-7B", - "developer": "Qwen", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2687 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": 
"hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.387 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2477 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2634 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3593 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1197 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Qwen/Qwen2-VL-72B-Instruct/b74c3215-7bd5-42d1-9193-f4c9c6a8bec2.json b/data/hfopenllm_v2/Qwen/Qwen2-VL-72B-Instruct/b74c3215-7bd5-42d1-9193-f4c9c6a8bec2.json deleted file mode 100644 index 30ca6ebbd..000000000 --- a/data/hfopenllm_v2/Qwen/Qwen2-VL-72B-Instruct/b74c3215-7bd5-42d1-9193-f4c9c6a8bec2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Qwen_Qwen2-VL-72B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2-VL-72B-Instruct", - "id": "Qwen/Qwen2-VL-72B-Instruct", - "developer": "Qwen", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2VLForConditionalGeneration", - "params_billions": 73.406 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5982 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.6946 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3444 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3876 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4492 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5717 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Qwen/Qwen2-VL-7B-Instruct/27df1e06-463b-4519-87eb-a1666ad3f98c.json b/data/hfopenllm_v2/Qwen/Qwen2-VL-7B-Instruct/27df1e06-463b-4519-87eb-a1666ad3f98c.json deleted file mode 100644 index 276403439..000000000 --- a/data/hfopenllm_v2/Qwen/Qwen2-VL-7B-Instruct/27df1e06-463b-4519-87eb-a1666ad3f98c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Qwen_Qwen2-VL-7B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2-VL-7B-Instruct", - "id": "Qwen/Qwen2-VL-7B-Instruct", - "developer": "Qwen", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2VLForConditionalGeneration", - "params_billions": 8.291 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4599 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5465 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": 
"Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1986 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3196 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4375 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4095 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Qwen/Qwen2.5-0.5B-Instruct/9d975b05-7bee-462d-a33a-afa0d5af94d4.json b/data/hfopenllm_v2/Qwen/Qwen2.5-0.5B-Instruct/9d975b05-7bee-462d-a33a-afa0d5af94d4.json deleted file mode 100644 index d78bde574..000000000 --- a/data/hfopenllm_v2/Qwen/Qwen2.5-0.5B-Instruct/9d975b05-7bee-462d-a33a-afa0d5af94d4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Qwen_Qwen2.5-0.5B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-0.5B-Instruct", - "id": "Qwen/Qwen2.5-0.5B-Instruct", - "developer": "Qwen", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.494 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3153 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3322 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1035 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - 
"source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2592 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3342 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.172 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Qwen/Qwen2.5-0.5B-Instruct/9ef9135a-473e-43a5-a460-fd3ec50226f9.json b/data/hfopenllm_v2/Qwen/Qwen2.5-0.5B-Instruct/9ef9135a-473e-43a5-a460-fd3ec50226f9.json deleted file mode 100644 index 38a9f628f..000000000 --- a/data/hfopenllm_v2/Qwen/Qwen2.5-0.5B-Instruct/9ef9135a-473e-43a5-a460-fd3ec50226f9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Qwen_Qwen2.5-0.5B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-0.5B-Instruct", - "id": "Qwen/Qwen2.5-0.5B-Instruct", - "developer": "Qwen", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.5 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3071 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3341 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 
0.2576 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3329 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1697 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Qwen/Qwen2.5-0.5B/c57cae01-328e-447b-8945-e3cd2c4b8a7b.json b/data/hfopenllm_v2/Qwen/Qwen2.5-0.5B/c57cae01-328e-447b-8945-e3cd2c4b8a7b.json deleted file mode 100644 index 7aa39af3d..000000000 --- a/data/hfopenllm_v2/Qwen/Qwen2.5-0.5B/c57cae01-328e-447b-8945-e3cd2c4b8a7b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Qwen_Qwen2.5-0.5B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-0.5B", - "id": "Qwen/Qwen2.5-0.5B", - "developer": "Qwen", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.5 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1627 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3275 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0393 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2466 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 
1.0 - }, - "score_details": { - "score": 0.3433 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1906 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Qwen/Qwen2.5-1.5B-Instruct/494c86cf-7f37-49d8-8160-b81859552c87.json b/data/hfopenllm_v2/Qwen/Qwen2.5-1.5B-Instruct/494c86cf-7f37-49d8-8160-b81859552c87.json deleted file mode 100644 index 6d0e76598..000000000 --- a/data/hfopenllm_v2/Qwen/Qwen2.5-1.5B-Instruct/494c86cf-7f37-49d8-8160-b81859552c87.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Qwen_Qwen2.5-1.5B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-1.5B-Instruct", - "id": "Qwen/Qwen2.5-1.5B-Instruct", - "developer": "Qwen", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.5 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4476 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4289 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2205 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2559 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3663 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": 
"Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2799 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Qwen/Qwen2.5-1.5B/6de5e76e-4297-4bcd-b06e-f63fa28da0e0.json b/data/hfopenllm_v2/Qwen/Qwen2.5-1.5B/6de5e76e-4297-4bcd-b06e-f63fa28da0e0.json deleted file mode 100644 index 86834fae0..000000000 --- a/data/hfopenllm_v2/Qwen/Qwen2.5-1.5B/6de5e76e-4297-4bcd-b06e-f63fa28da0e0.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Qwen_Qwen2.5-1.5B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-1.5B", - "id": "Qwen/Qwen2.5-1.5B", - "developer": "Qwen", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.5 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2674 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4078 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0914 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2852 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3576 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2855 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Qwen/Qwen2.5-14B-Instruct-1M/9b10cd14-82f3-4b36-a4be-5092127d68c3.json 
b/data/hfopenllm_v2/Qwen/Qwen2.5-14B-Instruct-1M/9b10cd14-82f3-4b36-a4be-5092127d68c3.json deleted file mode 100644 index 7ccfcf4c1..000000000 --- a/data/hfopenllm_v2/Qwen/Qwen2.5-14B-Instruct-1M/9b10cd14-82f3-4b36-a4be-5092127d68c3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Qwen_Qwen2.5-14B-Instruct-1M/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-14B-Instruct-1M", - "id": "Qwen/Qwen2.5-14B-Instruct-1M", - "developer": "Qwen", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8414 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6198 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5302 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3431 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.418 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.485 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Qwen/Qwen2.5-14B-Instruct/bbd94181-0523-4543-80a7-056b041e03b7.json b/data/hfopenllm_v2/Qwen/Qwen2.5-14B-Instruct/bbd94181-0523-4543-80a7-056b041e03b7.json deleted file mode 100644 index ffc120545..000000000 --- a/data/hfopenllm_v2/Qwen/Qwen2.5-14B-Instruct/bbd94181-0523-4543-80a7-056b041e03b7.json +++ /dev/null @@ -1,132 
+0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Qwen_Qwen2.5-14B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-14B-Instruct", - "id": "Qwen/Qwen2.5-14B-Instruct", - "developer": "Qwen", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8158 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.639 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5476 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3221 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4101 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4904 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Qwen/Qwen2.5-14B/e10d8573-e201-460e-a931-49a1b13ceeea.json b/data/hfopenllm_v2/Qwen/Qwen2.5-14B/e10d8573-e201-460e-a931-49a1b13ceeea.json deleted file mode 100644 index 20c1af18c..000000000 --- a/data/hfopenllm_v2/Qwen/Qwen2.5-14B/e10d8573-e201-460e-a931-49a1b13ceeea.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Qwen_Qwen2.5-14B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - 
"evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-14B", - "id": "Qwen/Qwen2.5-14B", - "developer": "Qwen", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3694 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6161 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.29 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3817 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4502 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5249 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Qwen/Qwen2.5-32B-Instruct/e2ca9477-2414-4b8a-8d22-68f9ced54ae5.json b/data/hfopenllm_v2/Qwen/Qwen2.5-32B-Instruct/e2ca9477-2414-4b8a-8d22-68f9ced54ae5.json deleted file mode 100644 index d6d218cf5..000000000 --- a/data/hfopenllm_v2/Qwen/Qwen2.5-32B-Instruct/e2ca9477-2414-4b8a-8d22-68f9ced54ae5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Qwen_Qwen2.5-32B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-32B-Instruct", - "id": "Qwen/Qwen2.5-32B-Instruct", - "developer": "Qwen", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - 
"params_billions": 32.764 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8346 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6913 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6254 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3381 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4261 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5667 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Qwen/Qwen2.5-32B/831246b8-5433-48e6-ba11-8a4239373106.json b/data/hfopenllm_v2/Qwen/Qwen2.5-32B/831246b8-5433-48e6-ba11-8a4239373106.json deleted file mode 100644 index ba1f6cfae..000000000 --- a/data/hfopenllm_v2/Qwen/Qwen2.5-32B/831246b8-5433-48e6-ba11-8a4239373106.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Qwen_Qwen2.5-32B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-32B", - "id": "Qwen/Qwen2.5-32B", - "developer": "Qwen", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 32.764 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4077 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6771 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3565 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4119 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4978 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5805 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Qwen/Qwen2.5-3B-Instruct/8277994c-8bf5-4ece-9f34-4fe9a4310bbf.json b/data/hfopenllm_v2/Qwen/Qwen2.5-3B-Instruct/8277994c-8bf5-4ece-9f34-4fe9a4310bbf.json deleted file mode 100644 index a44b6ae5b..000000000 --- a/data/hfopenllm_v2/Qwen/Qwen2.5-3B-Instruct/8277994c-8bf5-4ece-9f34-4fe9a4310bbf.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Qwen_Qwen2.5-3B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-3B-Instruct", - "id": "Qwen/Qwen2.5-3B-Instruct", - "developer": "Qwen", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6475 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4693 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3678 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2727 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3968 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3255 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Qwen/Qwen2.5-3B/5aabc7c5-eb3a-42e0-8b40-0a08004f6e1a.json b/data/hfopenllm_v2/Qwen/Qwen2.5-3B/5aabc7c5-eb3a-42e0-8b40-0a08004f6e1a.json deleted file mode 100644 index 229075fd2..000000000 --- a/data/hfopenllm_v2/Qwen/Qwen2.5-3B/5aabc7c5-eb3a-42e0-8b40-0a08004f6e1a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Qwen_Qwen2.5-3B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-3B", - "id": "Qwen/Qwen2.5-3B", - "developer": "Qwen", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.086 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.269 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4612 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": 
"DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.148 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2978 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4303 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3203 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Qwen/Qwen2.5-72B-Instruct/cbb73c83-ad94-4973-9bf5-a5e7ca4d1653.json b/data/hfopenllm_v2/Qwen/Qwen2.5-72B-Instruct/cbb73c83-ad94-4973-9bf5-a5e7ca4d1653.json deleted file mode 100644 index 9fcc6ecb3..000000000 --- a/data/hfopenllm_v2/Qwen/Qwen2.5-72B-Instruct/cbb73c83-ad94-4973-9bf5-a5e7ca4d1653.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Qwen_Qwen2.5-72B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-72B-Instruct", - "id": "Qwen/Qwen2.5-72B-Instruct", - "developer": "Qwen", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 72.706 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8638 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7273 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5982 - } - }, 
- { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.375 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4206 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5626 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Qwen/Qwen2.5-72B/3ed06a16-d5fe-43d3-a369-f4ed29fb3a5d.json b/data/hfopenllm_v2/Qwen/Qwen2.5-72B/3ed06a16-d5fe-43d3-a369-f4ed29fb3a5d.json deleted file mode 100644 index af273092b..000000000 --- a/data/hfopenllm_v2/Qwen/Qwen2.5-72B/3ed06a16-d5fe-43d3-a369-f4ed29fb3a5d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Qwen_Qwen2.5-72B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-72B", - "id": "Qwen/Qwen2.5-72B", - "developer": "Qwen", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 72.706 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4137 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6797 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3912 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.4052 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4771 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5968 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Qwen/Qwen2.5-7B-Instruct-1M/fc817789-2f44-4d2b-b40e-2422fe33d104.json b/data/hfopenllm_v2/Qwen/Qwen2.5-7B-Instruct-1M/fc817789-2f44-4d2b-b40e-2422fe33d104.json deleted file mode 100644 index 776013deb..000000000 --- a/data/hfopenllm_v2/Qwen/Qwen2.5-7B-Instruct-1M/fc817789-2f44-4d2b-b40e-2422fe33d104.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Qwen_Qwen2.5-7B-Instruct-1M/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-7B-Instruct-1M", - "id": "Qwen/Qwen2.5-7B-Instruct-1M", - "developer": "Qwen", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7448 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5404 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4335 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2978 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4087 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3505 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Qwen/Qwen2.5-7B-Instruct/5e1c8723-7c43-4d8f-8c7c-386c2eb6b9cf.json b/data/hfopenllm_v2/Qwen/Qwen2.5-7B-Instruct/5e1c8723-7c43-4d8f-8c7c-386c2eb6b9cf.json deleted file mode 100644 index 66802b86b..000000000 --- a/data/hfopenllm_v2/Qwen/Qwen2.5-7B-Instruct/5e1c8723-7c43-4d8f-8c7c-386c2eb6b9cf.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Qwen_Qwen2.5-7B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-7B-Instruct", - "id": "Qwen/Qwen2.5-7B-Instruct", - "developer": "Qwen", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7585 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5394 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2911 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.402 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": 
"TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4287 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Qwen/Qwen2.5-7B/b6740747-19ac-4a9c-892f-6556013ddc8b.json b/data/hfopenllm_v2/Qwen/Qwen2.5-7B/b6740747-19ac-4a9c-892f-6556013ddc8b.json deleted file mode 100644 index 744db06b8..000000000 --- a/data/hfopenllm_v2/Qwen/Qwen2.5-7B/b6740747-19ac-4a9c-892f-6556013ddc8b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Qwen_Qwen2.5-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-7B", - "id": "Qwen/Qwen2.5-7B", - "developer": "Qwen", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3374 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5416 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2508 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3247 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4424 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4365 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/Qwen/Qwen2.5-Coder-14B-Instruct/3263ab46-09ae-4c24-9332-b6874d0d0330.json b/data/hfopenllm_v2/Qwen/Qwen2.5-Coder-14B-Instruct/3263ab46-09ae-4c24-9332-b6874d0d0330.json deleted file mode 100644 index efdce1948..000000000 --- a/data/hfopenllm_v2/Qwen/Qwen2.5-Coder-14B-Instruct/3263ab46-09ae-4c24-9332-b6874d0d0330.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Qwen_Qwen2.5-Coder-14B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-Coder-14B-Instruct", - "id": "Qwen/Qwen2.5-Coder-14B-Instruct", - "developer": "Qwen", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6908 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.614 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3248 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3045 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3915 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3939 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Qwen/Qwen2.5-Coder-14B/a8706a7e-5693-4768-a955-a448549d2e77.json b/data/hfopenllm_v2/Qwen/Qwen2.5-Coder-14B/a8706a7e-5693-4768-a955-a448549d2e77.json deleted file mode 100644 index b59e9c9ed..000000000 --- 
a/data/hfopenllm_v2/Qwen/Qwen2.5-Coder-14B/a8706a7e-5693-4768-a955-a448549d2e77.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Qwen_Qwen2.5-Coder-14B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-Coder-14B", - "id": "Qwen/Qwen2.5-Coder-14B", - "developer": "Qwen", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3473 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5865 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2251 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2928 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3874 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4521 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Qwen/Qwen2.5-Coder-32B-Instruct/3c932329-0440-4799-886f-10bc4a5aeb09.json b/data/hfopenllm_v2/Qwen/Qwen2.5-Coder-32B-Instruct/3c932329-0440-4799-886f-10bc4a5aeb09.json deleted file mode 100644 index 61b3a04a0..000000000 --- a/data/hfopenllm_v2/Qwen/Qwen2.5-Coder-32B-Instruct/3c932329-0440-4799-886f-10bc4a5aeb09.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Qwen_Qwen2.5-Coder-32B-Instruct/1770682486.623709", - "retrieved_timestamp": 
"1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-Coder-32B-Instruct", - "id": "Qwen/Qwen2.5-Coder-32B-Instruct", - "developer": "Qwen", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 32.764 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7265 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6625 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4955 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.349 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4386 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4413 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Qwen/Qwen2.5-Coder-32B/b1e42d9d-827d-4109-8d1b-182694033b21.json b/data/hfopenllm_v2/Qwen/Qwen2.5-Coder-32B/b1e42d9d-827d-4109-8d1b-182694033b21.json deleted file mode 100644 index 636fd2224..000000000 --- a/data/hfopenllm_v2/Qwen/Qwen2.5-Coder-32B/b1e42d9d-827d-4109-8d1b-182694033b21.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Qwen_Qwen2.5-Coder-32B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-Coder-32B", - "id": 
"Qwen/Qwen2.5-Coder-32B", - "developer": "Qwen", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 32.764 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4363 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6404 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3089 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3465 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4528 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5303 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Qwen/Qwen2.5-Coder-7B-Instruct/0c6f0d92-3ee0-48d7-b3fc-70149911a51d.json b/data/hfopenllm_v2/Qwen/Qwen2.5-Coder-7B-Instruct/0c6f0d92-3ee0-48d7-b3fc-70149911a51d.json deleted file mode 100644 index 00f370a8a..000000000 --- a/data/hfopenllm_v2/Qwen/Qwen2.5-Coder-7B-Instruct/0c6f0d92-3ee0-48d7-b3fc-70149911a51d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Qwen_Qwen2.5-Coder-7B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-Coder-7B-Instruct", - "id": "Qwen/Qwen2.5-Coder-7B-Instruct", - "developer": "Qwen", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - 
"evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6147 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4999 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.031 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2936 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4099 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3354 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Qwen/Qwen2.5-Coder-7B-Instruct/73b07681-8e10-414e-8922-650908f9cf6a.json b/data/hfopenllm_v2/Qwen/Qwen2.5-Coder-7B-Instruct/73b07681-8e10-414e-8922-650908f9cf6a.json deleted file mode 100644 index f33929085..000000000 --- a/data/hfopenllm_v2/Qwen/Qwen2.5-Coder-7B-Instruct/73b07681-8e10-414e-8922-650908f9cf6a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Qwen_Qwen2.5-Coder-7B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-Coder-7B-Instruct", - "id": "Qwen/Qwen2.5-Coder-7B-Instruct", - "developer": "Qwen", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6101 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5008 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3716 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2919 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4073 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3352 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Qwen/Qwen2.5-Coder-7B/8b1549f8-0602-4538-842c-abe9dca7baff.json b/data/hfopenllm_v2/Qwen/Qwen2.5-Coder-7B/8b1549f8-0602-4538-842c-abe9dca7baff.json deleted file mode 100644 index db8e13ac7..000000000 --- a/data/hfopenllm_v2/Qwen/Qwen2.5-Coder-7B/8b1549f8-0602-4538-842c-abe9dca7baff.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Qwen_Qwen2.5-Coder-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-Coder-7B", - "id": "Qwen/Qwen2.5-Coder-7B", - "developer": "Qwen", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3446 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4856 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1918 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2592 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3449 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3679 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Qwen/Qwen2.5-Math-1.5B-Instruct/ad395ad4-0f9f-4b49-83c9-b89fa6b6dd89.json b/data/hfopenllm_v2/Qwen/Qwen2.5-Math-1.5B-Instruct/ad395ad4-0f9f-4b49-83c9-b89fa6b6dd89.json deleted file mode 100644 index 376ff5457..000000000 --- a/data/hfopenllm_v2/Qwen/Qwen2.5-Math-1.5B-Instruct/ad395ad4-0f9f-4b49-83c9-b89fa6b6dd89.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Qwen_Qwen2.5-Math-1.5B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-Math-1.5B-Instruct", - "id": "Qwen/Qwen2.5-Math-1.5B-Instruct", - "developer": "Qwen", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.544 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1856 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3752 - } - }, - { - "evaluation_name": "MATH Level 5", 
- "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2628 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2651 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3685 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1801 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Qwen/Qwen2.5-Math-72B-Instruct/14c01681-fbef-49c4-b737-a7baaa02d393.json b/data/hfopenllm_v2/Qwen/Qwen2.5-Math-72B-Instruct/14c01681-fbef-49c4-b737-a7baaa02d393.json deleted file mode 100644 index 0366bc367..000000000 --- a/data/hfopenllm_v2/Qwen/Qwen2.5-Math-72B-Instruct/14c01681-fbef-49c4-b737-a7baaa02d393.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Qwen_Qwen2.5-Math-72B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-Math-72B-Instruct", - "id": "Qwen/Qwen2.5-Math-72B-Instruct", - "developer": "Qwen", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 72.706 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4003 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6452 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": 
false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6239 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3314 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4473 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4812 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Qwen/Qwen2.5-Math-7B-Instruct/3ad495c0-da8e-4776-8d05-bc7dce1fe120.json b/data/hfopenllm_v2/Qwen/Qwen2.5-Math-7B-Instruct/3ad495c0-da8e-4776-8d05-bc7dce1fe120.json deleted file mode 100644 index c41423d24..000000000 --- a/data/hfopenllm_v2/Qwen/Qwen2.5-Math-7B-Instruct/3ad495c0-da8e-4776-8d05-bc7dce1fe120.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Qwen_Qwen2.5-Math-7B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-Math-7B-Instruct", - "id": "Qwen/Qwen2.5-Math-7B-Instruct", - "developer": "Qwen", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2636 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4388 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5808 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - 
"hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2617 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3647 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.282 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Qwen/Qwen2.5-Math-7B/0762ca9e-f0d4-408e-9992-e91a10e0e65f.json b/data/hfopenllm_v2/Qwen/Qwen2.5-Math-7B/0762ca9e-f0d4-408e-9992-e91a10e0e65f.json deleted file mode 100644 index e97af5279..000000000 --- a/data/hfopenllm_v2/Qwen/Qwen2.5-Math-7B/0762ca9e-f0d4-408e-9992-e91a10e0e65f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Qwen_Qwen2.5-Math-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-Math-7B", - "id": "Qwen/Qwen2.5-Math-7B", - "developer": "Qwen", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.246 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4455 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3051 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2936 - } - }, - { - "evaluation_name": "MUSR", - "source_data": 
{ - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3781 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2718 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/RDson/WomboCombo-R1-Coder-14B-Preview/ec6c1d05-cea7-445c-bed3-9eee1e1ff03d.json b/data/hfopenllm_v2/RDson/WomboCombo-R1-Coder-14B-Preview/ec6c1d05-cea7-445c-bed3-9eee1e1ff03d.json deleted file mode 100644 index 8ed8e2c46..000000000 --- a/data/hfopenllm_v2/RDson/WomboCombo-R1-Coder-14B-Preview/ec6c1d05-cea7-445c-bed3-9eee1e1ff03d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/RDson_WomboCombo-R1-Coder-14B-Preview/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "WomboCombo-R1-Coder-14B-Preview", - "id": "RDson/WomboCombo-R1-Coder-14B-Preview", - "developer": "RDson", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6286 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6392 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5989 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3213 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4844 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5168 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/RESMPDEV/EVA-Qwen2.5-1.5B-FRFR/1fc39812-77fb-4d0c-b9fb-706e94c40afe.json b/data/hfopenllm_v2/RESMPDEV/EVA-Qwen2.5-1.5B-FRFR/1fc39812-77fb-4d0c-b9fb-706e94c40afe.json deleted file mode 100644 index 44c5b7786..000000000 --- a/data/hfopenllm_v2/RESMPDEV/EVA-Qwen2.5-1.5B-FRFR/1fc39812-77fb-4d0c-b9fb-706e94c40afe.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/RESMPDEV_EVA-Qwen2.5-1.5B-FRFR/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "EVA-Qwen2.5-1.5B-FRFR", - "id": "RESMPDEV/EVA-Qwen2.5-1.5B-FRFR", - "developer": "RESMPDEV", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.544 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3082 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3932 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1027 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2794 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3539 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - 
"hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.277 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/RESMPDEV/Qwen2-Wukong-0.5B/fdc3c502-53ad-4bf7-85ce-51eaed72754b.json b/data/hfopenllm_v2/RESMPDEV/Qwen2-Wukong-0.5B/fdc3c502-53ad-4bf7-85ce-51eaed72754b.json deleted file mode 100644 index 7c90e36d8..000000000 --- a/data/hfopenllm_v2/RESMPDEV/Qwen2-Wukong-0.5B/fdc3c502-53ad-4bf7-85ce-51eaed72754b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/RESMPDEV_Qwen2-Wukong-0.5B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2-Wukong-0.5B", - "id": "RESMPDEV/Qwen2-Wukong-0.5B", - "developer": "RESMPDEV", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1854 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3085 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0015 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2366 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3525 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1327 - } - } - ] -} \ 
No newline at end of file diff --git a/data/hfopenllm_v2/RLHFlow/ArmoRM-Llama3-8B-v0.1/3f74c1c7-f349-4193-95cf-b0033112fea0.json b/data/hfopenllm_v2/RLHFlow/ArmoRM-Llama3-8B-v0.1/3f74c1c7-f349-4193-95cf-b0033112fea0.json deleted file mode 100644 index 0705593b8..000000000 --- a/data/hfopenllm_v2/RLHFlow/ArmoRM-Llama3-8B-v0.1/3f74c1c7-f349-4193-95cf-b0033112fea0.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/RLHFlow_ArmoRM-Llama3-8B-v0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ArmoRM-Llama3-8B-v0.1", - "id": "RLHFlow/ArmoRM-Llama3-8B-v0.1", - "developer": "RLHFlow", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForRewardModelWithGating", - "params_billions": 7.511 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1897 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2876 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2492 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3948 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1078 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/RLHFlow/LLaMA3-iterative-DPO-final/36a803da-83ab-4c49-8855-9344aaa7a68b.json b/data/hfopenllm_v2/RLHFlow/LLaMA3-iterative-DPO-final/36a803da-83ab-4c49-8855-9344aaa7a68b.json 
deleted file mode 100644 index 49dd63ad0..000000000 --- a/data/hfopenllm_v2/RLHFlow/LLaMA3-iterative-DPO-final/36a803da-83ab-4c49-8855-9344aaa7a68b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/RLHFlow_LLaMA3-iterative-DPO-final/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "LLaMA3-iterative-DPO-final", - "id": "RLHFlow/LLaMA3-iterative-DPO-final", - "developer": "RLHFlow", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.534 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5058 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0884 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2836 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3673 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3257 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/RWKV/rwkv-raven-14b/df986996-249e-49f9-b074-91e8dcdf62e2.json b/data/hfopenllm_v2/RWKV/rwkv-raven-14b/df986996-249e-49f9-b074-91e8dcdf62e2.json deleted file mode 100644 index 608ee4b4e..000000000 --- a/data/hfopenllm_v2/RWKV/rwkv-raven-14b/df986996-249e-49f9-b074-91e8dcdf62e2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/RWKV_rwkv-raven-14b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "rwkv-raven-14b", - "id": "RWKV/rwkv-raven-14b", - "developer": "RWKV", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "RwkvForCausalLM", - "params_billions": 14.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0768 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3307 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0045 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.229 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3951 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.115 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Rakuten/RakutenAI-2.0-mini-instruct/90f007e9-e323-4a82-b276-ac1b928030ca.json b/data/hfopenllm_v2/Rakuten/RakutenAI-2.0-mini-instruct/90f007e9-e323-4a82-b276-ac1b928030ca.json deleted file mode 100644 index 559b245ad..000000000 --- a/data/hfopenllm_v2/Rakuten/RakutenAI-2.0-mini-instruct/90f007e9-e323-4a82-b276-ac1b928030ca.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Rakuten_RakutenAI-2.0-mini-instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - 
"evaluator_relationship": "third_party" - }, - "model_info": { - "name": "RakutenAI-2.0-mini-instruct", - "id": "Rakuten/RakutenAI-2.0-mini-instruct", - "developer": "Rakuten", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 1.535 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6794 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2867 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0521 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2668 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3249 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1118 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Rakuten/RakutenAI-7B-chat/2b627f93-5cc7-4a5e-b682-d129396362e5.json b/data/hfopenllm_v2/Rakuten/RakutenAI-7B-chat/2b627f93-5cc7-4a5e-b682-d129396362e5.json deleted file mode 100644 index 29a9ba71c..000000000 --- a/data/hfopenllm_v2/Rakuten/RakutenAI-7B-chat/2b627f93-5cc7-4a5e-b682-d129396362e5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Rakuten_RakutenAI-7B-chat/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "RakutenAI-7B-chat", - "id": "Rakuten/RakutenAI-7B-chat", - "developer": "Rakuten", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - 
"architecture": "MistralForCausalLM", - "params_billions": 7.373 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2686 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4316 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0295 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2567 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.379 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2798 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Rakuten/RakutenAI-7B/2fde07ac-d218-4cc6-947e-8ceb87eedbee.json b/data/hfopenllm_v2/Rakuten/RakutenAI-7B/2fde07ac-d218-4cc6-947e-8ceb87eedbee.json deleted file mode 100644 index 7c3ef32cc..000000000 --- a/data/hfopenllm_v2/Rakuten/RakutenAI-7B/2fde07ac-d218-4cc6-947e-8ceb87eedbee.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Rakuten_RakutenAI-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "RakutenAI-7B", - "id": "Rakuten/RakutenAI-7B", - "developer": "Rakuten", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 7.373 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": 
"Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1556 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4315 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0196 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2894 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3738 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2877 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Replete-AI/L3-Pneuma-8B/2a141bfe-4632-4058-a232-1f2c5540c41f.json b/data/hfopenllm_v2/Replete-AI/L3-Pneuma-8B/2a141bfe-4632-4058-a232-1f2c5540c41f.json deleted file mode 100644 index 4c2435507..000000000 --- a/data/hfopenllm_v2/Replete-AI/L3-Pneuma-8B/2a141bfe-4632-4058-a232-1f2c5540c41f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Replete-AI_L3-Pneuma-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "L3-Pneuma-8B", - "id": "Replete-AI/L3-Pneuma-8B", - "developer": "Replete-AI", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2413 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - 
"hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4909 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0544 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.318 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4105 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3176 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Replete-AI/L3.1-Pneuma-8B/fa2d74a5-e8f6-4a1c-9310-a9b16c2e59d1.json b/data/hfopenllm_v2/Replete-AI/L3.1-Pneuma-8B/fa2d74a5-e8f6-4a1c-9310-a9b16c2e59d1.json deleted file mode 100644 index dc4c32906..000000000 --- a/data/hfopenllm_v2/Replete-AI/L3.1-Pneuma-8B/fa2d74a5-e8f6-4a1c-9310-a9b16c2e59d1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Replete-AI_L3.1-Pneuma-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "L3.1-Pneuma-8B", - "id": "Replete-AI/L3.1-Pneuma-8B", - "developer": "Replete-AI", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7076 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.505 - } - }, - { - 
"evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2198 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3029 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3871 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3691 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Replete-AI/Llama3-8B-Instruct-Replete-Adapted/c7c0ceff-9273-4cc3-8f8e-bd93181590ba.json b/data/hfopenllm_v2/Replete-AI/Llama3-8B-Instruct-Replete-Adapted/c7c0ceff-9273-4cc3-8f8e-bd93181590ba.json deleted file mode 100644 index 1449f9a9c..000000000 --- a/data/hfopenllm_v2/Replete-AI/Llama3-8B-Instruct-Replete-Adapted/c7c0ceff-9273-4cc3-8f8e-bd93181590ba.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Replete-AI_Llama3-8B-Instruct-Replete-Adapted/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama3-8B-Instruct-Replete-Adapted", - "id": "Replete-AI/Llama3-8B-Instruct-Replete-Adapted", - "developer": "Replete-AI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6915 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.487 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": 
"DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.071 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.281 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3634 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3391 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Replete-AI/Replete-Coder-Instruct-8b-Merged/c439478a-1734-4038-aa8b-bb2d12ec022d.json b/data/hfopenllm_v2/Replete-AI/Replete-Coder-Instruct-8b-Merged/c439478a-1734-4038-aa8b-bb2d12ec022d.json deleted file mode 100644 index f5afd447c..000000000 --- a/data/hfopenllm_v2/Replete-AI/Replete-Coder-Instruct-8b-Merged/c439478a-1734-4038-aa8b-bb2d12ec022d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Replete-AI_Replete-Coder-Instruct-8b-Merged/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Replete-Coder-Instruct-8b-Merged", - "id": "Replete-AI/Replete-Coder-Instruct-8b-Merged", - "developer": "Replete-AI", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5388 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4462 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0778 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2693 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.366 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1805 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Replete-AI/Replete-Coder-Llama3-8B/4a36f73a-9495-4ea2-863c-220b8ca6bf99.json b/data/hfopenllm_v2/Replete-AI/Replete-Coder-Llama3-8B/4a36f73a-9495-4ea2-863c-220b8ca6bf99.json deleted file mode 100644 index c28e41a33..000000000 --- a/data/hfopenllm_v2/Replete-AI/Replete-Coder-Llama3-8B/4a36f73a-9495-4ea2-863c-220b8ca6bf99.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Replete-AI_Replete-Coder-Llama3-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Replete-Coder-Llama3-8B", - "id": "Replete-AI/Replete-Coder-Llama3-8B", - "developer": "Replete-AI", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4729 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3271 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0476 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - 
"hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2609 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3953 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1331 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Replete-AI/Replete-Coder-Qwen2-1.5b/faa9d3b9-343a-4a9e-82c5-6bc81bc87b9c.json b/data/hfopenllm_v2/Replete-AI/Replete-Coder-Qwen2-1.5b/faa9d3b9-343a-4a9e-82c5-6bc81bc87b9c.json deleted file mode 100644 index 5845dc173..000000000 --- a/data/hfopenllm_v2/Replete-AI/Replete-Coder-Qwen2-1.5b/faa9d3b9-343a-4a9e-82c5-6bc81bc87b9c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Replete-AI_Replete-Coder-Qwen2-1.5b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Replete-Coder-Qwen2-1.5b", - "id": "Replete-AI/Replete-Coder-Qwen2-1.5b", - "developer": "Replete-AI", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.544 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3014 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3475 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0385 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.2685 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4073 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2147 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Replete-AI/Replete-LLM-Qwen2-7b/a55bf380-d567-4228-b30c-57e9df31e844.json b/data/hfopenllm_v2/Replete-AI/Replete-LLM-Qwen2-7b/a55bf380-d567-4228-b30c-57e9df31e844.json deleted file mode 100644 index fcb0416cd..000000000 --- a/data/hfopenllm_v2/Replete-AI/Replete-LLM-Qwen2-7b/a55bf380-d567-4228-b30c-57e9df31e844.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Replete-AI_Replete-LLM-Qwen2-7b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Replete-LLM-Qwen2-7b", - "id": "Replete-AI/Replete-LLM-Qwen2-7b", - "developer": "Replete-AI", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0932 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2977 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2475 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": 
"Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3941 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1157 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Replete-AI/Replete-LLM-Qwen2-7b/dfd92311-4f3d-4355-8ccf-a59f29914b8f.json b/data/hfopenllm_v2/Replete-AI/Replete-LLM-Qwen2-7b/dfd92311-4f3d-4355-8ccf-a59f29914b8f.json deleted file mode 100644 index 2013766f3..000000000 --- a/data/hfopenllm_v2/Replete-AI/Replete-LLM-Qwen2-7b/dfd92311-4f3d-4355-8ccf-a59f29914b8f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Replete-AI_Replete-LLM-Qwen2-7b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Replete-LLM-Qwen2-7b", - "id": "Replete-AI/Replete-LLM-Qwen2-7b", - "developer": "Replete-AI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0905 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2985 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2534 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3848 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - 
"dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1158 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Replete-AI/Replete-LLM-Qwen2-7b_Beta-Preview/d98e190e-5b5f-46eb-b701-e32d2dbef3a0.json b/data/hfopenllm_v2/Replete-AI/Replete-LLM-Qwen2-7b_Beta-Preview/d98e190e-5b5f-46eb-b701-e32d2dbef3a0.json deleted file mode 100644 index 43cd838cf..000000000 --- a/data/hfopenllm_v2/Replete-AI/Replete-LLM-Qwen2-7b_Beta-Preview/d98e190e-5b5f-46eb-b701-e32d2dbef3a0.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Replete-AI_Replete-LLM-Qwen2-7b_Beta-Preview/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Replete-LLM-Qwen2-7b_Beta-Preview", - "id": "Replete-AI/Replete-LLM-Qwen2-7b_Beta-Preview", - "developer": "Replete-AI", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0858 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2929 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2483 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3981 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on 
MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1285 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Replete-AI/Replete-LLM-V2-Llama-3.1-8b/32edb764-2a42-4efe-ac86-9eda81942b84.json b/data/hfopenllm_v2/Replete-AI/Replete-LLM-V2-Llama-3.1-8b/32edb764-2a42-4efe-ac86-9eda81942b84.json deleted file mode 100644 index f78d9e7b1..000000000 --- a/data/hfopenllm_v2/Replete-AI/Replete-LLM-V2-Llama-3.1-8b/32edb764-2a42-4efe-ac86-9eda81942b84.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Replete-AI_Replete-LLM-V2-Llama-3.1-8b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Replete-LLM-V2-Llama-3.1-8b", - "id": "Replete-AI/Replete-LLM-V2-Llama-3.1-8b", - "developer": "Replete-AI", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5515 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5339 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1405 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3138 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4001 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3753 - } - } - ] -} \ No newline at end of file 
diff --git a/data/hfopenllm_v2/RezVortex/JAJUKA-WEWILLNEVERFORGETYOU-3B/36855ebd-2030-4d5d-9c42-ca049244e694.json b/data/hfopenllm_v2/RezVortex/JAJUKA-WEWILLNEVERFORGETYOU-3B/36855ebd-2030-4d5d-9c42-ca049244e694.json deleted file mode 100644 index bf5d8960d..000000000 --- a/data/hfopenllm_v2/RezVortex/JAJUKA-WEWILLNEVERFORGETYOU-3B/36855ebd-2030-4d5d-9c42-ca049244e694.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/RezVortex_JAJUKA-WEWILLNEVERFORGETYOU-3B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "JAJUKA-WEWILLNEVERFORGETYOU-3B", - "id": "RezVortex/JAJUKA-WEWILLNEVERFORGETYOU-3B", - "developer": "RezVortex", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6858 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4619 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1548 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2576 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.363 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3143 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/RezVortex/Jajuka-3b/9651a0a1-4004-42f3-ad8f-2aebb38ec967.json b/data/hfopenllm_v2/RezVortex/Jajuka-3b/9651a0a1-4004-42f3-ad8f-2aebb38ec967.json deleted 
file mode 100644 index 797c96c6d..000000000 --- a/data/hfopenllm_v2/RezVortex/Jajuka-3b/9651a0a1-4004-42f3-ad8f-2aebb38ec967.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/RezVortex_Jajuka-3b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Jajuka-3b", - "id": "RezVortex/Jajuka-3b", - "developer": "RezVortex", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6925 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4594 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1594 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2659 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3671 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3137 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Ro-xe/FMixIA-7B-DARE-0/a59e55dc-e2b5-43be-8469-49eee0e98d55.json b/data/hfopenllm_v2/Ro-xe/FMixIA-7B-DARE-0/a59e55dc-e2b5-43be-8469-49eee0e98d55.json deleted file mode 100644 index b2648eb78..000000000 --- a/data/hfopenllm_v2/Ro-xe/FMixIA-7B-DARE-0/a59e55dc-e2b5-43be-8469-49eee0e98d55.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Ro-xe_FMixIA-7B-DARE-0/1770682486.623709", - "retrieved_timestamp": 
"1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "FMixIA-7B-DARE-0", - "id": "Ro-xe/FMixIA-7B-DARE-0", - "developer": "Ro-xe", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3341 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5035 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0529 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2894 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4545 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3016 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Ro-xe/FMixIA-7B-SLERP-27/a956e306-f184-4dbc-ac7a-3793ae735801.json b/data/hfopenllm_v2/Ro-xe/FMixIA-7B-SLERP-27/a956e306-f184-4dbc-ac7a-3793ae735801.json deleted file mode 100644 index f8798e128..000000000 --- a/data/hfopenllm_v2/Ro-xe/FMixIA-7B-SLERP-27/a956e306-f184-4dbc-ac7a-3793ae735801.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Ro-xe_FMixIA-7B-SLERP-27/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "FMixIA-7B-SLERP-27", - "id": 
"Ro-xe/FMixIA-7B-SLERP-27", - "developer": "Ro-xe", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3765 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5151 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0634 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2953 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4412 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3008 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Ro-xe/FMixIA-7B-TIES-1/c05cc6ce-12fd-491d-b41b-57cc14b6d34a.json b/data/hfopenllm_v2/Ro-xe/FMixIA-7B-TIES-1/c05cc6ce-12fd-491d-b41b-57cc14b6d34a.json deleted file mode 100644 index 1a47f9d40..000000000 --- a/data/hfopenllm_v2/Ro-xe/FMixIA-7B-TIES-1/c05cc6ce-12fd-491d-b41b-57cc14b6d34a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Ro-xe_FMixIA-7B-TIES-1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "FMixIA-7B-TIES-1", - "id": "Ro-xe/FMixIA-7B-TIES-1", - "developer": "Ro-xe", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - 
"source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3453 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5092 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0566 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2886 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4689 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2992 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Ro-xe/FMixIA-FrankenMerge-9.5B-PT-9/415875b7-fe10-47e7-aca0-029c2f51c067.json b/data/hfopenllm_v2/Ro-xe/FMixIA-FrankenMerge-9.5B-PT-9/415875b7-fe10-47e7-aca0-029c2f51c067.json deleted file mode 100644 index 2f038af4f..000000000 --- a/data/hfopenllm_v2/Ro-xe/FMixIA-FrankenMerge-9.5B-PT-9/415875b7-fe10-47e7-aca0-029c2f51c067.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Ro-xe_FMixIA-FrankenMerge-9.5B-PT-9/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "FMixIA-FrankenMerge-9.5B-PT-9", - "id": "Ro-xe/FMixIA-FrankenMerge-9.5B-PT-9", - "developer": "Ro-xe", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.141 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.194 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5088 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.003 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3079 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.417 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3657 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Rombo-Org/Rombo-LLM-V2.5-Qwen-7b/c505ee64-3d3b-48e2-9c8a-f59609a758e9.json b/data/hfopenllm_v2/Rombo-Org/Rombo-LLM-V2.5-Qwen-7b/c505ee64-3d3b-48e2-9c8a-f59609a758e9.json deleted file mode 100644 index 92335ffa4..000000000 --- a/data/hfopenllm_v2/Rombo-Org/Rombo-LLM-V2.5-Qwen-7b/c505ee64-3d3b-48e2-9c8a-f59609a758e9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Rombo-Org_Rombo-LLM-V2.5-Qwen-7b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Rombo-LLM-V2.5-Qwen-7b", - "id": "Rombo-Org/Rombo-LLM-V2.5-Qwen-7b", - "developer": "Rombo-Org", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7482 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - 
"hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.54 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5068 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3012 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.398 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4283 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/RubielLabarta/LogoS-7Bx2-MoE-13B-v0.2/00003185-c291-40c5-bba1-f87eae0afc08.json b/data/hfopenllm_v2/RubielLabarta/LogoS-7Bx2-MoE-13B-v0.2/00003185-c291-40c5-bba1-f87eae0afc08.json deleted file mode 100644 index 14bcc9da2..000000000 --- a/data/hfopenllm_v2/RubielLabarta/LogoS-7Bx2-MoE-13B-v0.2/00003185-c291-40c5-bba1-f87eae0afc08.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/RubielLabarta_LogoS-7Bx2-MoE-13B-v0.2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "LogoS-7Bx2-MoE-13B-v0.2", - "id": "RubielLabarta/LogoS-7Bx2-MoE-13B-v0.2", - "developer": "RubielLabarta", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MixtralForCausalLM", - "params_billions": 12.879 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4379 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 
- }, - "score_details": { - "score": 0.5207 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0574 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2777 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4226 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3088 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/SaisExperiments/Evil-Alpaca-3B-L3.2/328f61d7-677b-4a06-b464-0da42153f9ae.json b/data/hfopenllm_v2/SaisExperiments/Evil-Alpaca-3B-L3.2/328f61d7-677b-4a06-b464-0da42153f9ae.json deleted file mode 100644 index 2ee369a00..000000000 --- a/data/hfopenllm_v2/SaisExperiments/Evil-Alpaca-3B-L3.2/328f61d7-677b-4a06-b464-0da42153f9ae.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/SaisExperiments_Evil-Alpaca-3B-L3.2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Evil-Alpaca-3B-L3.2", - "id": "SaisExperiments/Evil-Alpaca-3B-L3.2", - "developer": "SaisExperiments", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3251 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4341 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": 
"DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0702 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2634 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4198 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2621 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/SaisExperiments/Gemma-2-2B-Opus-Instruct/9cb5b8fd-062c-4161-9301-640980d21b9f.json b/data/hfopenllm_v2/SaisExperiments/Gemma-2-2B-Opus-Instruct/9cb5b8fd-062c-4161-9301-640980d21b9f.json deleted file mode 100644 index b9f60ffdb..000000000 --- a/data/hfopenllm_v2/SaisExperiments/Gemma-2-2B-Opus-Instruct/9cb5b8fd-062c-4161-9301-640980d21b9f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/SaisExperiments_Gemma-2-2B-Opus-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gemma-2-2B-Opus-Instruct", - "id": "SaisExperiments/Gemma-2-2B-Opus-Instruct", - "developer": "SaisExperiments", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 2.614 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.475 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4293 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0506 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2836 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4057 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.265 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/SaisExperiments/Gemma-2-2B-Stheno-Filtered/09284b75-a2f9-40ea-8135-7aa61c626fa2.json b/data/hfopenllm_v2/SaisExperiments/Gemma-2-2B-Stheno-Filtered/09284b75-a2f9-40ea-8135-7aa61c626fa2.json deleted file mode 100644 index cef011e36..000000000 --- a/data/hfopenllm_v2/SaisExperiments/Gemma-2-2B-Stheno-Filtered/09284b75-a2f9-40ea-8135-7aa61c626fa2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/SaisExperiments_Gemma-2-2B-Stheno-Filtered/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gemma-2-2B-Stheno-Filtered", - "id": "SaisExperiments/Gemma-2-2B-Stheno-Filtered", - "developer": "SaisExperiments", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 2.614 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4197 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4149 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0461 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - 
"source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2701 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4003 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.263 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/SaisExperiments/Not-So-Small-Alpaca-24B/e2502331-6ac3-43bc-8218-259b44333283.json b/data/hfopenllm_v2/SaisExperiments/Not-So-Small-Alpaca-24B/e2502331-6ac3-43bc-8218-259b44333283.json deleted file mode 100644 index f23479676..000000000 --- a/data/hfopenllm_v2/SaisExperiments/Not-So-Small-Alpaca-24B/e2502331-6ac3-43bc-8218-259b44333283.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/SaisExperiments_Not-So-Small-Alpaca-24B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Not-So-Small-Alpaca-24B", - "id": "SaisExperiments/Not-So-Small-Alpaca-24B", - "developer": "SaisExperiments", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 23.572 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6244 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5339 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1828 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3591 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4282 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3694 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/SaisExperiments/QwOwO-7B-V1/8dde454d-aa48-4ee1-b5c6-f3353087d492.json b/data/hfopenllm_v2/SaisExperiments/QwOwO-7B-V1/8dde454d-aa48-4ee1-b5c6-f3353087d492.json deleted file mode 100644 index 858e45e66..000000000 --- a/data/hfopenllm_v2/SaisExperiments/QwOwO-7B-V1/8dde454d-aa48-4ee1-b5c6-f3353087d492.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/SaisExperiments_QwOwO-7B-V1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "QwOwO-7B-V1", - "id": "SaisExperiments/QwOwO-7B-V1", - "developer": "SaisExperiments", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4556 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5431 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.386 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2601 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3835 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4224 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/SaisExperiments/RightSheep-Llama3.2-3B/662c8ed2-2407-4606-ac1e-ec7ade185d2d.json b/data/hfopenllm_v2/SaisExperiments/RightSheep-Llama3.2-3B/662c8ed2-2407-4606-ac1e-ec7ade185d2d.json deleted file mode 100644 index 8ded05543..000000000 --- a/data/hfopenllm_v2/SaisExperiments/RightSheep-Llama3.2-3B/662c8ed2-2407-4606-ac1e-ec7ade185d2d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/SaisExperiments_RightSheep-Llama3.2-3B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "RightSheep-Llama3.2-3B", - "id": "SaisExperiments/RightSheep-Llama3.2-3B", - "developer": "SaisExperiments", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4156 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4241 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0808 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2869 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.3767 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.254 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Sakalti/Anemoi-3B/332aef8c-7c62-463e-ba3c-07ae0205d457.json b/data/hfopenllm_v2/Sakalti/Anemoi-3B/332aef8c-7c62-463e-ba3c-07ae0205d457.json deleted file mode 100644 index 311f81095..000000000 --- a/data/hfopenllm_v2/Sakalti/Anemoi-3B/332aef8c-7c62-463e-ba3c-07ae0205d457.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Sakalti_Anemoi-3B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Anemoi-3B", - "id": "Sakalti/Anemoi-3B", - "developer": "Sakalti", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.397 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3804 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4922 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1775 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3054 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4371 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3766 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Sakalti/Euphrates-14B/cfdfcf21-e445-430e-a295-946cb8c3fce9.json b/data/hfopenllm_v2/Sakalti/Euphrates-14B/cfdfcf21-e445-430e-a295-946cb8c3fce9.json deleted file mode 100644 index 228187fd8..000000000 --- a/data/hfopenllm_v2/Sakalti/Euphrates-14B/cfdfcf21-e445-430e-a295-946cb8c3fce9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Sakalti_Euphrates-14B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Euphrates-14B", - "id": "Sakalti/Euphrates-14B", - "developer": "Sakalti", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2647 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6138 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3051 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3935 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4516 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5255 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Sakalti/Llama3.2-3B-Uranus-1/a5606b92-aa2d-44e3-a92c-47d0b38fef9c.json 
b/data/hfopenllm_v2/Sakalti/Llama3.2-3B-Uranus-1/a5606b92-aa2d-44e3-a92c-47d0b38fef9c.json deleted file mode 100644 index 077b4b960..000000000 --- a/data/hfopenllm_v2/Sakalti/Llama3.2-3B-Uranus-1/a5606b92-aa2d-44e3-a92c-47d0b38fef9c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Sakalti_Llama3.2-3B-Uranus-1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama3.2-3B-Uranus-1", - "id": "Sakalti/Llama3.2-3B-Uranus-1", - "developer": "Sakalti", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5335 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4437 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1495 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.297 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3669 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3094 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Sakalti/Magro-7B-v1.1/465d473c-ef28-4725-8cac-02f2a031b22c.json b/data/hfopenllm_v2/Sakalti/Magro-7B-v1.1/465d473c-ef28-4725-8cac-02f2a031b22c.json deleted file mode 100644 index 748ddb98f..000000000 --- a/data/hfopenllm_v2/Sakalti/Magro-7B-v1.1/465d473c-ef28-4725-8cac-02f2a031b22c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - 
"schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Sakalti_Magro-7B-v1.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Magro-7B-v1.1", - "id": "Sakalti/Magro-7B-v1.1", - "developer": "Sakalti", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1204 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4179 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0249 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2961 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4433 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2764 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Sakalti/Neptuno-3B/2c636544-8676-4eee-8bcd-d623be0275be.json b/data/hfopenllm_v2/Sakalti/Neptuno-3B/2c636544-8676-4eee-8bcd-d623be0275be.json deleted file mode 100644 index 6e3a0d4d0..000000000 --- a/data/hfopenllm_v2/Sakalti/Neptuno-3B/2c636544-8676-4eee-8bcd-d623be0275be.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Sakalti_Neptuno-3B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - 
"evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Neptuno-3B", - "id": "Sakalti/Neptuno-3B", - "developer": "Sakalti", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.397 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4296 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4834 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2553 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2961 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4002 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3773 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Sakalti/Neptuno-Alpha/8b332fac-1cfa-498b-853a-52ec5492ddc7.json b/data/hfopenllm_v2/Sakalti/Neptuno-Alpha/8b332fac-1cfa-498b-853a-52ec5492ddc7.json deleted file mode 100644 index c98619fdf..000000000 --- a/data/hfopenllm_v2/Sakalti/Neptuno-Alpha/8b332fac-1cfa-498b-853a-52ec5492ddc7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Sakalti_Neptuno-Alpha/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Neptuno-Alpha", - "id": "Sakalti/Neptuno-Alpha", - "developer": "Sakalti", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.397 - 
} - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.378 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4925 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1835 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.307 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4371 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3767 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Sakalti/Oxyge1-33B/2bf1b38b-e90b-4fa8-b19e-47d93ff9ab4e.json b/data/hfopenllm_v2/Sakalti/Oxyge1-33B/2bf1b38b-e90b-4fa8-b19e-47d93ff9ab4e.json deleted file mode 100644 index eb5bc1d97..000000000 --- a/data/hfopenllm_v2/Sakalti/Oxyge1-33B/2bf1b38b-e90b-4fa8-b19e-47d93ff9ab4e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Sakalti_Oxyge1-33B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Oxyge1-33B", - "id": "Sakalti/Oxyge1-33B", - "developer": "Sakalti", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 32.764 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4548 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7033 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4962 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3826 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5008 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5909 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Sakalti/Phi3.5-Comets-3.8B/69bb0243-75b2-4858-ba6b-5e70cfb516a7.json b/data/hfopenllm_v2/Sakalti/Phi3.5-Comets-3.8B/69bb0243-75b2-4858-ba6b-5e70cfb516a7.json deleted file mode 100644 index 0dfa27332..000000000 --- a/data/hfopenllm_v2/Sakalti/Phi3.5-Comets-3.8B/69bb0243-75b2-4858-ba6b-5e70cfb516a7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Sakalti_Phi3.5-Comets-3.8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Phi3.5-Comets-3.8B", - "id": "Sakalti/Phi3.5-Comets-3.8B", - "developer": "Sakalti", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Phi3ForCausalLM", - "params_billions": 3.821 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2094 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3335 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0008 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2492 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3764 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1153 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Sakalti/Qwen2.5-1B-Instruct/4bb7e325-8741-4c09-81f6-9efdb30ef5a5.json b/data/hfopenllm_v2/Sakalti/Qwen2.5-1B-Instruct/4bb7e325-8741-4c09-81f6-9efdb30ef5a5.json deleted file mode 100644 index 6c0cb747d..000000000 --- a/data/hfopenllm_v2/Sakalti/Qwen2.5-1B-Instruct/4bb7e325-8741-4c09-81f6-9efdb30ef5a5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Sakalti_Qwen2.5-1B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-1B-Instruct", - "id": "Sakalti/Qwen2.5-1B-Instruct", - "developer": "Sakalti", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.988 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1751 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3027 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - 
"dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.006 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2559 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3369 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1213 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Sakalti/QwenTest-7/87878b74-22ce-4554-914c-03e486d13de3.json b/data/hfopenllm_v2/Sakalti/QwenTest-7/87878b74-22ce-4554-914c-03e486d13de3.json deleted file mode 100644 index 5e4bcbff5..000000000 --- a/data/hfopenllm_v2/Sakalti/QwenTest-7/87878b74-22ce-4554-914c-03e486d13de3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Sakalti_QwenTest-7/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "QwenTest-7", - "id": "Sakalti/QwenTest-7", - "developer": "Sakalti", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.988 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1672 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3063 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.0038 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2601 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3422 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1212 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Sakalti/SJT-0.5B/5030f8d4-f216-4f78-84f1-dd03b0324bb0.json b/data/hfopenllm_v2/Sakalti/SJT-0.5B/5030f8d4-f216-4f78-84f1-dd03b0324bb0.json deleted file mode 100644 index 8cfc9ba99..000000000 --- a/data/hfopenllm_v2/Sakalti/SJT-0.5B/5030f8d4-f216-4f78-84f1-dd03b0324bb0.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Sakalti_SJT-0.5B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SJT-0.5B", - "id": "Sakalti/SJT-0.5B", - "developer": "Sakalti", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2425 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3306 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0521 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2718 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3196 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1891 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Sakalti/SJT-1.5B-Alpha-1.1/c5e244fd-e85e-4fbb-9703-b8e733fb91bf.json b/data/hfopenllm_v2/Sakalti/SJT-1.5B-Alpha-1.1/c5e244fd-e85e-4fbb-9703-b8e733fb91bf.json deleted file mode 100644 index b6b2e229c..000000000 --- a/data/hfopenllm_v2/Sakalti/SJT-1.5B-Alpha-1.1/c5e244fd-e85e-4fbb-9703-b8e733fb91bf.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Sakalti_SJT-1.5B-Alpha-1.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SJT-1.5B-Alpha-1.1", - "id": "Sakalti/SJT-1.5B-Alpha-1.1", - "developer": "Sakalti", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.777 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3439 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4243 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0959 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2894 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4239 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2966 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Sakalti/SJT-1.5B-Alpha/38261a01-62df-42b2-9b1d-f924598e70ef.json b/data/hfopenllm_v2/Sakalti/SJT-1.5B-Alpha/38261a01-62df-42b2-9b1d-f924598e70ef.json deleted file mode 100644 index b438fe370..000000000 --- a/data/hfopenllm_v2/Sakalti/SJT-1.5B-Alpha/38261a01-62df-42b2-9b1d-f924598e70ef.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Sakalti_SJT-1.5B-Alpha/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SJT-1.5B-Alpha", - "id": "Sakalti/SJT-1.5B-Alpha", - "developer": "Sakalti", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.777 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3449 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4241 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0997 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2919 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4226 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - 
"source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2961 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Sakalti/SJT-1.7B/5736f0b5-3903-4774-a84a-c3db260d36e4.json b/data/hfopenllm_v2/Sakalti/SJT-1.7B/5736f0b5-3903-4774-a84a-c3db260d36e4.json deleted file mode 100644 index b6d4f8db5..000000000 --- a/data/hfopenllm_v2/Sakalti/SJT-1.7B/5736f0b5-3903-4774-a84a-c3db260d36e4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Sakalti_SJT-1.7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SJT-1.7B", - "id": "Sakalti/SJT-1.7B", - "developer": "Sakalti", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.684 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1776 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2934 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0015 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2416 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3964 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1133 - } - } - ] -} \ No newline at end of file 
diff --git a/data/hfopenllm_v2/Sakalti/SJT-14B/70134d58-972e-49c9-8cde-4ba2691d3dc3.json b/data/hfopenllm_v2/Sakalti/SJT-14B/70134d58-972e-49c9-8cde-4ba2691d3dc3.json deleted file mode 100644 index ff7d304d7..000000000 --- a/data/hfopenllm_v2/Sakalti/SJT-14B/70134d58-972e-49c9-8cde-4ba2691d3dc3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Sakalti_SJT-14B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SJT-14B", - "id": "Sakalti/SJT-14B", - "developer": "Sakalti", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5494 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6536 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3844 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3867 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4766 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5381 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Sakalti/SJT-2.4B/d4bb1440-2064-4752-bcb3-c9cec234fd1b.json b/data/hfopenllm_v2/Sakalti/SJT-2.4B/d4bb1440-2064-4752-bcb3-c9cec234fd1b.json deleted file mode 100644 index b6553ae79..000000000 --- a/data/hfopenllm_v2/Sakalti/SJT-2.4B/d4bb1440-2064-4752-bcb3-c9cec234fd1b.json +++ /dev/null @@ -1,132 
+0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Sakalti_SJT-2.4B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SJT-2.4B", - "id": "Sakalti/SJT-2.4B", - "developer": "Sakalti", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 2.432 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2804 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.349 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0219 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2559 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3699 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1858 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Sakalti/SJT-24B-Alpha/d9e6059e-d20b-4465-b7ba-2ee3a72562b6.json b/data/hfopenllm_v2/Sakalti/SJT-24B-Alpha/d9e6059e-d20b-4465-b7ba-2ee3a72562b6.json deleted file mode 100644 index d24b4279e..000000000 --- a/data/hfopenllm_v2/Sakalti/SJT-24B-Alpha/d9e6059e-d20b-4465-b7ba-2ee3a72562b6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Sakalti_SJT-24B-Alpha/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - 
"evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SJT-24B-Alpha", - "id": "Sakalti/SJT-24B-Alpha", - "developer": "Sakalti", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 24.125 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3206 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6081 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.253 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3809 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4595 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4857 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Sakalti/SJT-2B-V1.1/f8b02d65-c8a0-43eb-b48e-d1e1f7f363d6.json b/data/hfopenllm_v2/Sakalti/SJT-2B-V1.1/f8b02d65-c8a0-43eb-b48e-d1e1f7f363d6.json deleted file mode 100644 index 1042b7d30..000000000 --- a/data/hfopenllm_v2/Sakalti/SJT-2B-V1.1/f8b02d65-c8a0-43eb-b48e-d1e1f7f363d6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Sakalti_SJT-2B-V1.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SJT-2B-V1.1", - "id": "Sakalti/SJT-2B-V1.1", - "developer": "Sakalti", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 2.614 - } - 
}, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3977 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3984 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0483 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2676 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4299 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2124 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Sakalti/SJT-2B/7bf23db0-877c-4700-95c8-e35dee5e57b4.json b/data/hfopenllm_v2/Sakalti/SJT-2B/7bf23db0-877c-4700-95c8-e35dee5e57b4.json deleted file mode 100644 index 344d5d801..000000000 --- a/data/hfopenllm_v2/Sakalti/SJT-2B/7bf23db0-877c-4700-95c8-e35dee5e57b4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Sakalti_SJT-2B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SJT-2B", - "id": "Sakalti/SJT-2B", - "developer": "Sakalti", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 2.614 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.2151 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2936 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0008 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2416 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3564 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1187 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Sakalti/SJT-3.7B/07f8351e-c7c6-463f-9e91-ee1d3bb2b35c.json b/data/hfopenllm_v2/Sakalti/SJT-3.7B/07f8351e-c7c6-463f-9e91-ee1d3bb2b35c.json deleted file mode 100644 index c851ca6fd..000000000 --- a/data/hfopenllm_v2/Sakalti/SJT-3.7B/07f8351e-c7c6-463f-9e91-ee1d3bb2b35c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Sakalti_SJT-3.7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SJT-3.7B", - "id": "Sakalti/SJT-3.7B", - "developer": "Sakalti", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.783 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1078 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3393 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0121 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2559 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3617 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1505 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Sakalti/SJT-4B/8535ffae-f39d-46ed-89bb-a1656885db91.json b/data/hfopenllm_v2/Sakalti/SJT-4B/8535ffae-f39d-46ed-89bb-a1656885db91.json deleted file mode 100644 index 841001a9a..000000000 --- a/data/hfopenllm_v2/Sakalti/SJT-4B/8535ffae-f39d-46ed-89bb-a1656885db91.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Sakalti_SJT-4B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SJT-4B", - "id": "Sakalti/SJT-4B", - "developer": "Sakalti", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.821 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4077 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4886 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": 
"Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1156 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2945 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.478 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3281 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Sakalti/SJT-7.5B/5e832121-9a67-44d9-973d-fffdb1b37975.json b/data/hfopenllm_v2/Sakalti/SJT-7.5B/5e832121-9a67-44d9-973d-fffdb1b37975.json deleted file mode 100644 index 157236e81..000000000 --- a/data/hfopenllm_v2/Sakalti/SJT-7.5B/5e832121-9a67-44d9-973d-fffdb1b37975.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Sakalti_SJT-7.5B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SJT-7.5B", - "id": "Sakalti/SJT-7.5B", - "developer": "Sakalti", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 7.456 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4223 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5367 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2168 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3263 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4399 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3951 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Sakalti/SJT-7B-V1.1-Multilingal/92d3f67d-a026-49e3-a440-68c10fb358ae.json b/data/hfopenllm_v2/Sakalti/SJT-7B-V1.1-Multilingal/92d3f67d-a026-49e3-a440-68c10fb358ae.json deleted file mode 100644 index 8dfb066a7..000000000 --- a/data/hfopenllm_v2/Sakalti/SJT-7B-V1.1-Multilingal/92d3f67d-a026-49e3-a440-68c10fb358ae.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Sakalti_SJT-7B-V1.1-Multilingal/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SJT-7B-V1.1-Multilingal", - "id": "Sakalti/SJT-7B-V1.1-Multilingal", - "developer": "Sakalti", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.613 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1949 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.292 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0045 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2601 - } - }, - { - 
"evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3621 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1137 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Sakalti/SJT-7B-V1.1/9d0baaef-bd31-4a96-bb2a-e92b62b748d2.json b/data/hfopenllm_v2/Sakalti/SJT-7B-V1.1/9d0baaef-bd31-4a96-bb2a-e92b62b748d2.json deleted file mode 100644 index 7f654b278..000000000 --- a/data/hfopenllm_v2/Sakalti/SJT-7B-V1.1/9d0baaef-bd31-4a96-bb2a-e92b62b748d2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Sakalti_SJT-7B-V1.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SJT-7B-V1.1", - "id": "Sakalti/SJT-7B-V1.1", - "developer": "Sakalti", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.613 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4703 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5419 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2432 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3339 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, 
- "score_details": { - "score": 0.4411 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4412 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Sakalti/SJT-8B-V1.1/489e8e84-5e30-46fa-a421-f52308f051e7.json b/data/hfopenllm_v2/Sakalti/SJT-8B-V1.1/489e8e84-5e30-46fa-a421-f52308f051e7.json deleted file mode 100644 index 011bbc7a2..000000000 --- a/data/hfopenllm_v2/Sakalti/SJT-8B-V1.1/489e8e84-5e30-46fa-a421-f52308f051e7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Sakalti_SJT-8B-V1.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SJT-8B-V1.1", - "id": "Sakalti/SJT-8B-V1.1", - "developer": "Sakalti", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 8.545 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4621 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5121 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2069 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3364 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4266 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4231 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Sakalti/SJT-8B/a208f807-c930-4e81-8ebd-dcbb4db76442.json b/data/hfopenllm_v2/Sakalti/SJT-8B/a208f807-c930-4e81-8ebd-dcbb4db76442.json deleted file mode 100644 index effa93eb9..000000000 --- a/data/hfopenllm_v2/Sakalti/SJT-8B/a208f807-c930-4e81-8ebd-dcbb4db76442.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Sakalti_SJT-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SJT-8B", - "id": "Sakalti/SJT-8B", - "developer": "Sakalti", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 8.548 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6535 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5282 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2538 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3297 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.408 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4266 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Sakalti/SJT-900M/4956539d-a255-4c56-877f-257e463fa3e4.json b/data/hfopenllm_v2/Sakalti/SJT-900M/4956539d-a255-4c56-877f-257e463fa3e4.json deleted file 
mode 100644 index 73e5c563b..000000000 --- a/data/hfopenllm_v2/Sakalti/SJT-900M/4956539d-a255-4c56-877f-257e463fa3e4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Sakalti_SJT-900M/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SJT-900M", - "id": "Sakalti/SJT-900M", - "developer": "Sakalti", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.899 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.241 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3169 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0136 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2534 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3595 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1142 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Sakalti/SJT-Moe2x7.5B/3451eb65-020c-4e34-9128-7410e6b293cd.json b/data/hfopenllm_v2/Sakalti/SJT-Moe2x7.5B/3451eb65-020c-4e34-9128-7410e6b293cd.json deleted file mode 100644 index af8b9e5e0..000000000 --- a/data/hfopenllm_v2/Sakalti/SJT-Moe2x7.5B/3451eb65-020c-4e34-9128-7410e6b293cd.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Sakalti_SJT-Moe2x7.5B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - 
"source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SJT-Moe2x7.5B", - "id": "Sakalti/SJT-Moe2x7.5B", - "developer": "Sakalti", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MixtralForCausalLM", - "params_billions": 13.401 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4117 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5371 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2145 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3263 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4399 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3954 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Sakalti/SJTPass-2/b5cd0061-e4dd-4049-a51e-b16490e69120.json b/data/hfopenllm_v2/Sakalti/SJTPass-2/b5cd0061-e4dd-4049-a51e-b16490e69120.json deleted file mode 100644 index b2c0d1e00..000000000 --- a/data/hfopenllm_v2/Sakalti/SJTPass-2/b5cd0061-e4dd-4049-a51e-b16490e69120.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Sakalti_SJTPass-2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SJTPass-2", - "id": "Sakalti/SJTPass-2", - "developer": "Sakalti", - "inference_platform": 
"unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.24 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3302 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0529 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2727 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3222 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1902 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Sakalti/SJTPass-4/c4686af6-0b7b-4df3-9152-14a3ef087b7f.json b/data/hfopenllm_v2/Sakalti/SJTPass-4/c4686af6-0b7b-4df3-9152-14a3ef087b7f.json deleted file mode 100644 index fbfa47dbe..000000000 --- a/data/hfopenllm_v2/Sakalti/SJTPass-4/c4686af6-0b7b-4df3-9152-14a3ef087b7f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Sakalti_SJTPass-4/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SJTPass-4", - "id": "Sakalti/SJTPass-4", - "developer": "Sakalti", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.167 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1913 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2964 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0023 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2601 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3898 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1083 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Sakalti/SJTPass-5/155885ca-11e7-4cd2-b26c-53e001e2a6f9.json b/data/hfopenllm_v2/Sakalti/SJTPass-5/155885ca-11e7-4cd2-b26c-53e001e2a6f9.json deleted file mode 100644 index 8136444ac..000000000 --- a/data/hfopenllm_v2/Sakalti/SJTPass-5/155885ca-11e7-4cd2-b26c-53e001e2a6f9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Sakalti_SJTPass-5/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SJTPass-5", - "id": "Sakalti/SJTPass-5", - "developer": "Sakalti", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.809 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2425 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": 
"hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3103 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0159 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2668 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3794 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1327 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Sakalti/Saba-Passthrough-2/d9ca5411-def6-43b3-a522-595131d8e5e6.json b/data/hfopenllm_v2/Sakalti/Saba-Passthrough-2/d9ca5411-def6-43b3-a522-595131d8e5e6.json deleted file mode 100644 index 4e51d28f5..000000000 --- a/data/hfopenllm_v2/Sakalti/Saba-Passthrough-2/d9ca5411-def6-43b3-a522-595131d8e5e6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Sakalti_Saba-Passthrough-2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Saba-Passthrough-2", - "id": "Sakalti/Saba-Passthrough-2", - "developer": "Sakalti", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.087 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1691 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3672 - } - 
}, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0008 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2634 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3844 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2077 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Sakalti/Saba1-1.8B/e54553ab-0897-4cb5-9213-5bb72758d2b5.json b/data/hfopenllm_v2/Sakalti/Saba1-1.8B/e54553ab-0897-4cb5-9213-5bb72758d2b5.json deleted file mode 100644 index db3cc96a9..000000000 --- a/data/hfopenllm_v2/Sakalti/Saba1-1.8B/e54553ab-0897-4cb5-9213-5bb72758d2b5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Sakalti_Saba1-1.8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Saba1-1.8B", - "id": "Sakalti/Saba1-1.8B", - "developer": "Sakalti", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.777 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3333 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4147 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1541 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2827 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4239 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2926 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Sakalti/Saba1-7B/eed48cdc-18db-4c03-84bf-d2d50e3328b0.json b/data/hfopenllm_v2/Sakalti/Saba1-7B/eed48cdc-18db-4c03-84bf-d2d50e3328b0.json deleted file mode 100644 index 9e78fd60a..000000000 --- a/data/hfopenllm_v2/Sakalti/Saba1-7B/eed48cdc-18db-4c03-84bf-d2d50e3328b0.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Sakalti_Saba1-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Saba1-7B", - "id": "Sakalti/Saba1-7B", - "developer": "Sakalti", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4585 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5489 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3663 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3163 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4793 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4376 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Sakalti/Saba1.5-1.5B/d7952aef-37e2-4c15-a1a4-598690773bbb.json b/data/hfopenllm_v2/Sakalti/Saba1.5-1.5B/d7952aef-37e2-4c15-a1a4-598690773bbb.json deleted file mode 100644 index d446d42e8..000000000 --- a/data/hfopenllm_v2/Sakalti/Saba1.5-1.5B/d7952aef-37e2-4c15-a1a4-598690773bbb.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Sakalti_Saba1.5-1.5B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Saba1.5-1.5B", - "id": "Sakalti/Saba1.5-1.5B", - "developer": "Sakalti", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.544 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3333 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4147 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1541 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2827 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4239 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2926 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Sakalti/Saba1.5-Pro-3B/5e1e1376-bb22-4fc9-a1d6-3f2fe7d302b9.json b/data/hfopenllm_v2/Sakalti/Saba1.5-Pro-3B/5e1e1376-bb22-4fc9-a1d6-3f2fe7d302b9.json deleted file mode 100644 index a268ff87b..000000000 --- a/data/hfopenllm_v2/Sakalti/Saba1.5-Pro-3B/5e1e1376-bb22-4fc9-a1d6-3f2fe7d302b9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Sakalti_Saba1.5-Pro-3B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Saba1.5-Pro-3B", - "id": "Sakalti/Saba1.5-Pro-3B", - "developer": "Sakalti", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 2.9 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2386 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3623 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0272 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2685 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4405 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": 
"MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1958 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Sakalti/Saba2-14B-Preview/cfdae559-f3f1-4a78-b4cc-fbfb8bb37b16.json b/data/hfopenllm_v2/Sakalti/Saba2-14B-Preview/cfdae559-f3f1-4a78-b4cc-fbfb8bb37b16.json deleted file mode 100644 index 2d5a6e567..000000000 --- a/data/hfopenllm_v2/Sakalti/Saba2-14B-Preview/cfdae559-f3f1-4a78-b4cc-fbfb8bb37b16.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Sakalti_Saba2-14B-Preview/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Saba2-14B-Preview", - "id": "Sakalti/Saba2-14B-Preview", - "developer": "Sakalti", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4722 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6496 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3127 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3826 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4781 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.5384 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Sakalti/Saba2-3B/a12208ce-e9e1-4476-8054-0d565efad92c.json b/data/hfopenllm_v2/Sakalti/Saba2-3B/a12208ce-e9e1-4476-8054-0d565efad92c.json deleted file mode 100644 index 1efd679c7..000000000 --- a/data/hfopenllm_v2/Sakalti/Saba2-3B/a12208ce-e9e1-4476-8054-0d565efad92c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Sakalti_Saba2-3B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Saba2-3B", - "id": "Sakalti/Saba2-3B", - "developer": "Sakalti", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.086 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2865 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2801 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.006 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2617 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3927 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.121 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Sakalti/Sailor-japanese/f46e1eeb-8b8b-4d47-9510-445109b5518b.json b/data/hfopenllm_v2/Sakalti/Sailor-japanese/f46e1eeb-8b8b-4d47-9510-445109b5518b.json deleted file mode 100644 index 353e25d9c..000000000 --- 
a/data/hfopenllm_v2/Sakalti/Sailor-japanese/f46e1eeb-8b8b-4d47-9510-445109b5518b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Sakalti_Sailor-japanese/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Sailor-japanese", - "id": "Sakalti/Sailor-japanese", - "developer": "Sakalti", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.494 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1605 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2913 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.003 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2534 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3912 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1164 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Sakalti/Saka-1.5B/7dc4970f-ce35-4ffa-9052-2ab40abb1e55.json b/data/hfopenllm_v2/Sakalti/Saka-1.5B/7dc4970f-ce35-4ffa-9052-2ab40abb1e55.json deleted file mode 100644 index bae9cee17..000000000 --- a/data/hfopenllm_v2/Sakalti/Saka-1.5B/7dc4970f-ce35-4ffa-9052-2ab40abb1e55.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Sakalti_Saka-1.5B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - 
"source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Saka-1.5B", - "id": "Sakalti/Saka-1.5B", - "developer": "Sakalti", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.777 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2726 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3988 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0801 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2903 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3739 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2415 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Sakalti/Saka-14B/823e886a-1431-4078-81a3-4b941983461d.json b/data/hfopenllm_v2/Sakalti/Saka-14B/823e886a-1431-4078-81a3-4b941983461d.json deleted file mode 100644 index a2883bea2..000000000 --- a/data/hfopenllm_v2/Sakalti/Saka-14B/823e886a-1431-4078-81a3-4b941983461d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Sakalti_Saka-14B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Saka-14B", - "id": "Sakalti/Saka-14B", - "developer": "Sakalti", - "inference_platform": "unknown", - "additional_details": { - 
"precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7174 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6497 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4094 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.396 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4886 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5396 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Sakalti/Saka-24B/583609f0-de5b-43cd-a667-bb2c36679fd2.json b/data/hfopenllm_v2/Sakalti/Saka-24B/583609f0-de5b-43cd-a667-bb2c36679fd2.json deleted file mode 100644 index 873a3e63b..000000000 --- a/data/hfopenllm_v2/Sakalti/Saka-24B/583609f0-de5b-43cd-a667-bb2c36679fd2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Sakalti_Saka-24B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Saka-24B", - "id": "Sakalti/Saka-24B", - "developer": "Sakalti", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 23.572 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": 
"Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3819 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6072 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1805 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3423 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4541 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4766 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Sakalti/Saka-7.2B/2d2cea8b-167e-4d63-b01c-537f372672f9.json b/data/hfopenllm_v2/Sakalti/Saka-7.2B/2d2cea8b-167e-4d63-b01c-537f372672f9.json deleted file mode 100644 index fc53daeea..000000000 --- a/data/hfopenllm_v2/Sakalti/Saka-7.2B/2d2cea8b-167e-4d63-b01c-537f372672f9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Sakalti_Saka-7.2B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Saka-7.2B", - "id": "Sakalti/Saka-7.2B", - "developer": "Sakalti", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 7.292 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1545 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2945 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2391 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3711 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.116 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Sakalti/Saka-7.6B/f584f596-3a17-404a-81a2-3033ad38cad6.json b/data/hfopenllm_v2/Sakalti/Saka-7.6B/f584f596-3a17-404a-81a2-3033ad38cad6.json deleted file mode 100644 index 045f0e9ed..000000000 --- a/data/hfopenllm_v2/Sakalti/Saka-7.6B/f584f596-3a17-404a-81a2-3033ad38cad6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Sakalti_Saka-7.6B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Saka-7.6B", - "id": "Sakalti/Saka-7.6B", - "developer": "Sakalti", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4524 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5655 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - 
"source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3255 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3163 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4489 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.454 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Sakalti/SakaMoe-3x1.6B-Instruct/ebb0930f-92be-4e1b-a2a6-779f69d2151c.json b/data/hfopenllm_v2/Sakalti/SakaMoe-3x1.6B-Instruct/ebb0930f-92be-4e1b-a2a6-779f69d2151c.json deleted file mode 100644 index e8b9b5045..000000000 --- a/data/hfopenllm_v2/Sakalti/SakaMoe-3x1.6B-Instruct/ebb0930f-92be-4e1b-a2a6-779f69d2151c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Sakalti_SakaMoe-3x1.6B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SakaMoe-3x1.6B-Instruct", - "id": "Sakalti/SakaMoe-3x1.6B-Instruct", - "developer": "Sakalti", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2MoeForCausalLM", - "params_billions": 1.572 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2371 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3282 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0544 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2668 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3342 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1882 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Sakalti/SakalFusion-7B-Alpha/b8926567-e208-442e-8ba8-c6dd4ecc5c4a.json b/data/hfopenllm_v2/Sakalti/SakalFusion-7B-Alpha/b8926567-e208-442e-8ba8-c6dd4ecc5c4a.json deleted file mode 100644 index d9f49d39c..000000000 --- a/data/hfopenllm_v2/Sakalti/SakalFusion-7B-Alpha/b8926567-e208-442e-8ba8-c6dd4ecc5c4a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Sakalti_SakalFusion-7B-Alpha/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SakalFusion-7B-Alpha", - "id": "Sakalti/SakalFusion-7B-Alpha", - "developer": "Sakalti", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.529 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5591 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3844 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": 
{ - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3255 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4581 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4474 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Sakalti/SakalFusion-7B-Beta/4bf6efe1-81fc-48f6-96ba-8df9ffbef2f2.json b/data/hfopenllm_v2/Sakalti/SakalFusion-7B-Beta/4bf6efe1-81fc-48f6-96ba-8df9ffbef2f2.json deleted file mode 100644 index 55e8681c6..000000000 --- a/data/hfopenllm_v2/Sakalti/SakalFusion-7B-Beta/4bf6efe1-81fc-48f6-96ba-8df9ffbef2f2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Sakalti_SakalFusion-7B-Beta/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SakalFusion-7B-Beta", - "id": "Sakalti/SakalFusion-7B-Beta", - "developer": "Sakalti", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.613 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1809 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2881 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2433 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - 
"dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3872 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.109 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Sakalti/Tara-3.8B-v1.1/05ffcb7a-2694-4276-bf45-73e1110bc494.json b/data/hfopenllm_v2/Sakalti/Tara-3.8B-v1.1/05ffcb7a-2694-4276-bf45-73e1110bc494.json deleted file mode 100644 index 1aeae7ac8..000000000 --- a/data/hfopenllm_v2/Sakalti/Tara-3.8B-v1.1/05ffcb7a-2694-4276-bf45-73e1110bc494.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Sakalti_Tara-3.8B-v1.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Tara-3.8B-v1.1", - "id": "Sakalti/Tara-3.8B-v1.1", - "developer": "Sakalti", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.821 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4062 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4886 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1156 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2945 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 
0.478 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3281 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Sakalti/light-1.1-3B/dc3b944b-a57a-44ab-87ac-8e1882b7bcce.json b/data/hfopenllm_v2/Sakalti/light-1.1-3B/dc3b944b-a57a-44ab-87ac-8e1882b7bcce.json deleted file mode 100644 index 731f06557..000000000 --- a/data/hfopenllm_v2/Sakalti/light-1.1-3B/dc3b944b-a57a-44ab-87ac-8e1882b7bcce.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Sakalti_light-1.1-3B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "light-1.1-3B", - "id": "Sakalti/light-1.1-3B", - "developer": "Sakalti", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.086 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2735 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2803 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0113 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2617 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3901 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", 
- "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1209 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Sakalti/light-3B/154f70b4-d77c-4d1b-b85c-bc81fe8162bd.json b/data/hfopenllm_v2/Sakalti/light-3B/154f70b4-d77c-4d1b-b85c-bc81fe8162bd.json deleted file mode 100644 index 08a907de4..000000000 --- a/data/hfopenllm_v2/Sakalti/light-3B/154f70b4-d77c-4d1b-b85c-bc81fe8162bd.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Sakalti_light-3B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "light-3B", - "id": "Sakalti/light-3B", - "developer": "Sakalti", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.397 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5337 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4831 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2591 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2953 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4015 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3775 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Sakalti/light-3b-beta/998316d2-389a-4ce0-b0b0-0430c1361de7.json b/data/hfopenllm_v2/Sakalti/light-3b-beta/998316d2-389a-4ce0-b0b0-0430c1361de7.json deleted file mode 
100644 index e8992c2b0..000000000 --- a/data/hfopenllm_v2/Sakalti/light-3b-beta/998316d2-389a-4ce0-b0b0-0430c1361de7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Sakalti_light-3b-beta/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "light-3b-beta", - "id": "Sakalti/light-3b-beta", - "developer": "Sakalti", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.397 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5485 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4815 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2772 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2978 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4015 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3758 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Sakalti/light-7b-beta/ce803cde-6e23-433c-a4d2-38c5cb5ba14b.json b/data/hfopenllm_v2/Sakalti/light-7b-beta/ce803cde-6e23-433c-a4d2-38c5cb5ba14b.json deleted file mode 100644 index 0e0fb6d0e..000000000 --- a/data/hfopenllm_v2/Sakalti/light-7b-beta/ce803cde-6e23-433c-a4d2-38c5cb5ba14b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Sakalti_light-7b-beta/1770682486.623709", - "retrieved_timestamp": 
"1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "light-7b-beta", - "id": "Sakalti/light-7b-beta", - "developer": "Sakalti", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6234 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5548 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3769 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3213 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4291 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4456 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Sakalti/llama-3-yanyuedao-8b-instruct/2519485b-47cd-497c-a349-9e69db0266f3.json b/data/hfopenllm_v2/Sakalti/llama-3-yanyuedao-8b-instruct/2519485b-47cd-497c-a349-9e69db0266f3.json deleted file mode 100644 index 6cdfb156a..000000000 --- a/data/hfopenllm_v2/Sakalti/llama-3-yanyuedao-8b-instruct/2519485b-47cd-497c-a349-9e69db0266f3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Sakalti_llama-3-yanyuedao-8b-instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - 
"name": "llama-3-yanyuedao-8b-instruct", - "id": "Sakalti/llama-3-yanyuedao-8b-instruct", - "developer": "Sakalti", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2186 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.435 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0385 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2903 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4199 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2911 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Sakalti/magro-7B/56d86e26-4ee6-4652-9b7b-a538238a24d4.json b/data/hfopenllm_v2/Sakalti/magro-7B/56d86e26-4ee6-4652-9b7b-a538238a24d4.json deleted file mode 100644 index 412d9d54e..000000000 --- a/data/hfopenllm_v2/Sakalti/magro-7B/56d86e26-4ee6-4652-9b7b-a538238a24d4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Sakalti_magro-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "magro-7B", - "id": "Sakalti/magro-7B", - "developer": "Sakalti", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - 
"evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1344 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4186 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0204 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2953 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.446 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2765 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Sakalti/mergekit-01/416b89e4-5e8a-4131-9403-e8967a4127b8.json b/data/hfopenllm_v2/Sakalti/mergekit-01/416b89e4-5e8a-4131-9403-e8967a4127b8.json deleted file mode 100644 index 6209d9fb6..000000000 --- a/data/hfopenllm_v2/Sakalti/mergekit-01/416b89e4-5e8a-4131-9403-e8967a4127b8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Sakalti_mergekit-01/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "mergekit-01", - "id": "Sakalti/mergekit-01", - "developer": "Sakalti", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 
- }, - "score_details": { - "score": 0.6234 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5548 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3769 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3213 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4291 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4456 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Sakalti/mergekit-della_linear-vmeykci/347a90e8-d8b7-4266-8242-ceac865796a0.json b/data/hfopenllm_v2/Sakalti/mergekit-della_linear-vmeykci/347a90e8-d8b7-4266-8242-ceac865796a0.json deleted file mode 100644 index 17a180e43..000000000 --- a/data/hfopenllm_v2/Sakalti/mergekit-della_linear-vmeykci/347a90e8-d8b7-4266-8242-ceac865796a0.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Sakalti_mergekit-della_linear-vmeykci/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "mergekit-della_linear-vmeykci", - "id": "Sakalti/mergekit-della_linear-vmeykci", - "developer": "Sakalti", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.613 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1126 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2816 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0106 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2634 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3897 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1089 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Sakalti/model-3/389f7ab8-b30e-4d0c-b9a4-625e74a1f73f.json b/data/hfopenllm_v2/Sakalti/model-3/389f7ab8-b30e-4d0c-b9a4-625e74a1f73f.json deleted file mode 100644 index 768fa5f46..000000000 --- a/data/hfopenllm_v2/Sakalti/model-3/389f7ab8-b30e-4d0c-b9a4-625e74a1f73f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Sakalti_model-3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "model-3", - "id": "Sakalti/model-3", - "developer": "Sakalti", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6264 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5542 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": 
"hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3708 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3213 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4264 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4455 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Sakalti/qwen2.5-2.3B/6ae33b7f-53a1-45c5-8b0b-d462188c3f9d.json b/data/hfopenllm_v2/Sakalti/qwen2.5-2.3B/6ae33b7f-53a1-45c5-8b0b-d462188c3f9d.json deleted file mode 100644 index a8265114f..000000000 --- a/data/hfopenllm_v2/Sakalti/qwen2.5-2.3B/6ae33b7f-53a1-45c5-8b0b-d462188c3f9d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Sakalti_qwen2.5-2.3B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "qwen2.5-2.3B", - "id": "Sakalti/qwen2.5-2.3B", - "developer": "Sakalti", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2Model", - "params_billions": 2.339 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1288 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2849 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0053 - } - }, - { - 
"evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2517 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3857 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1173 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Sakalti/tara-3.8B/d96fb0b2-7cba-4cc4-a5f4-b8a451754857.json b/data/hfopenllm_v2/Sakalti/tara-3.8B/d96fb0b2-7cba-4cc4-a5f4-b8a451754857.json deleted file mode 100644 index 87315ad45..000000000 --- a/data/hfopenllm_v2/Sakalti/tara-3.8B/d96fb0b2-7cba-4cc4-a5f4-b8a451754857.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Sakalti_tara-3.8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "tara-3.8B", - "id": "Sakalti/tara-3.8B", - "developer": "Sakalti", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.821 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4077 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4886 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1156 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.2945 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.478 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3281 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Sakalti/ultiima-14B-v0.2/f8d362f6-eafc-4d11-bc40-d169d69d3a95.json b/data/hfopenllm_v2/Sakalti/ultiima-14B-v0.2/f8d362f6-eafc-4d11-bc40-d169d69d3a95.json deleted file mode 100644 index 8ee1c7c15..000000000 --- a/data/hfopenllm_v2/Sakalti/ultiima-14B-v0.2/f8d362f6-eafc-4d11-bc40-d169d69d3a95.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Sakalti_ultiima-14B-v0.2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ultiima-14B-v0.2", - "id": "Sakalti/ultiima-14B-v0.2", - "developer": "Sakalti", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.707 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6472 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3995 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3826 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": 
false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4794 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5387 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Sakalti/ultiima-14B-v0.3/4bacd3dd-44c2-42d8-98c0-3eeb920dc0f0.json b/data/hfopenllm_v2/Sakalti/ultiima-14B-v0.3/4bacd3dd-44c2-42d8-98c0-3eeb920dc0f0.json deleted file mode 100644 index 86e48a32c..000000000 --- a/data/hfopenllm_v2/Sakalti/ultiima-14B-v0.3/4bacd3dd-44c2-42d8-98c0-3eeb920dc0f0.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Sakalti_ultiima-14B-v0.3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ultiima-14B-v0.3", - "id": "Sakalti/ultiima-14B-v0.3", - "developer": "Sakalti", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.704 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6398 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3965 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3767 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4754 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": 
"TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5337 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Sakalti/ultiima-14B-v0.4/de073f45-0d14-4f8a-9d3b-d4fd961186b8.json b/data/hfopenllm_v2/Sakalti/ultiima-14B-v0.4/de073f45-0d14-4f8a-9d3b-d4fd961186b8.json deleted file mode 100644 index 172ff4245..000000000 --- a/data/hfopenllm_v2/Sakalti/ultiima-14B-v0.4/de073f45-0d14-4f8a-9d3b-d4fd961186b8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Sakalti_ultiima-14B-v0.4/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ultiima-14B-v0.4", - "id": "Sakalti/ultiima-14B-v0.4", - "developer": "Sakalti", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3008 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.642 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3535 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.396 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4886 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5278 - } - } - ] -} \ No newline at end of 
file diff --git a/data/hfopenllm_v2/Sakalti/ultiima-14B/fd88d234-b3f9-4f48-896c-af58f1a69880.json b/data/hfopenllm_v2/Sakalti/ultiima-14B/fd88d234-b3f9-4f48-896c-af58f1a69880.json deleted file mode 100644 index 640b74564..000000000 --- a/data/hfopenllm_v2/Sakalti/ultiima-14B/fd88d234-b3f9-4f48-896c-af58f1a69880.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Sakalti_ultiima-14B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ultiima-14B", - "id": "Sakalti/ultiima-14B", - "developer": "Sakalti", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5701 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6491 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4698 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3742 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4718 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5381 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Sakalti/ultiima-32B/273745b1-3761-463e-b9ab-7860968064eb.json b/data/hfopenllm_v2/Sakalti/ultiima-32B/273745b1-3761-463e-b9ab-7860968064eb.json deleted file mode 100644 index d7a87927c..000000000 --- 
a/data/hfopenllm_v2/Sakalti/ultiima-32B/273745b1-3761-463e-b9ab-7860968064eb.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Sakalti_ultiima-32B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ultiima-32B", - "id": "Sakalti/ultiima-32B", - "developer": "Sakalti", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 32.764 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6854 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7037 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4962 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3809 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4995 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.591 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Sakalti/ultiima-72B-v1.5/101d84d3-e741-4eb2-bd8a-db6c12022fe2.json b/data/hfopenllm_v2/Sakalti/ultiima-72B-v1.5/101d84d3-e741-4eb2-bd8a-db6c12022fe2.json deleted file mode 100644 index aec4261bf..000000000 --- a/data/hfopenllm_v2/Sakalti/ultiima-72B-v1.5/101d84d3-e741-4eb2-bd8a-db6c12022fe2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Sakalti_ultiima-72B-v1.5/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { 
- "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ultiima-72B-v1.5", - "id": "Sakalti/ultiima-72B-v1.5", - "developer": "Sakalti", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 72.706 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.655 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7392 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4396 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4136 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4691 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6054 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Sakalti/ultiima-72B/9c82deca-1998-4506-b038-c5dd592324d8.json b/data/hfopenllm_v2/Sakalti/ultiima-72B/9c82deca-1998-4506-b038-c5dd592324d8.json deleted file mode 100644 index 8be07e143..000000000 --- a/data/hfopenllm_v2/Sakalti/ultiima-72B/9c82deca-1998-4506-b038-c5dd592324d8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Sakalti_ultiima-72B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ultiima-72B", - "id": "Sakalti/ultiima-72B", - "developer": "Sakalti", - "inference_platform": "unknown", - 
"additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 72.706 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.714 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7218 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5355 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4144 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4652 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5906 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Salesforce/LLaMA-3-8B-SFR-Iterative-DPO-R/da620a94-4c0d-4c50-9619-10e12001fb5d.json b/data/hfopenllm_v2/Salesforce/LLaMA-3-8B-SFR-Iterative-DPO-R/da620a94-4c0d-4c50-9619-10e12001fb5d.json deleted file mode 100644 index d2ff7565c..000000000 --- a/data/hfopenllm_v2/Salesforce/LLaMA-3-8B-SFR-Iterative-DPO-R/da620a94-4c0d-4c50-9619-10e12001fb5d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Salesforce_LLaMA-3-8B-SFR-Iterative-DPO-R/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "LLaMA-3-8B-SFR-Iterative-DPO-R", - "id": "Salesforce/LLaMA-3-8B-SFR-Iterative-DPO-R", - "developer": "Salesforce", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": 
"IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3816 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5012 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0914 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2878 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3633 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3172 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/SanjiWatsuki/Kunoichi-DPO-v2-7B/51dade8f-34e7-4237-8691-22655249bf76.json b/data/hfopenllm_v2/SanjiWatsuki/Kunoichi-DPO-v2-7B/51dade8f-34e7-4237-8691-22655249bf76.json deleted file mode 100644 index ea148741d..000000000 --- a/data/hfopenllm_v2/SanjiWatsuki/Kunoichi-DPO-v2-7B/51dade8f-34e7-4237-8691-22655249bf76.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/SanjiWatsuki_Kunoichi-DPO-v2-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Kunoichi-DPO-v2-7B", - "id": "SanjiWatsuki/Kunoichi-DPO-v2-7B", - "developer": "SanjiWatsuki", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5431 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4416 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0763 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2961 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4188 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3107 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/SanjiWatsuki/Silicon-Maid-7B/cdd59385-0a54-4ca1-b24d-9316a70f2875.json b/data/hfopenllm_v2/SanjiWatsuki/Silicon-Maid-7B/cdd59385-0a54-4ca1-b24d-9316a70f2875.json deleted file mode 100644 index f3201e23e..000000000 --- a/data/hfopenllm_v2/SanjiWatsuki/Silicon-Maid-7B/cdd59385-0a54-4ca1-b24d-9316a70f2875.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/SanjiWatsuki_Silicon-Maid-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Silicon-Maid-7B", - "id": "SanjiWatsuki/Silicon-Maid-7B", - "developer": "SanjiWatsuki", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5368 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": 
"SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4128 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.065 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2903 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4188 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3083 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Sao10K/70B-L3.3-Cirrus-x1/514a3103-e8a1-49e8-b9da-a85963f5b3dd.json b/data/hfopenllm_v2/Sao10K/70B-L3.3-Cirrus-x1/514a3103-e8a1-49e8-b9da-a85963f5b3dd.json deleted file mode 100644 index c45307b07..000000000 --- a/data/hfopenllm_v2/Sao10K/70B-L3.3-Cirrus-x1/514a3103-e8a1-49e8-b9da-a85963f5b3dd.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Sao10K_70B-L3.3-Cirrus-x1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "70B-L3.3-Cirrus-x1", - "id": "Sao10K/70B-L3.3-Cirrus-x1", - "developer": "Sao10K", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 70.554 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6681 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7029 - } - }, - { - "evaluation_name": "MATH 
Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3739 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4497 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4842 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5378 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Sao10K/Fimbulvetr-11B-v2/daafaafa-1e00-4433-95f3-91c169598ebd.json b/data/hfopenllm_v2/Sao10K/Fimbulvetr-11B-v2/daafaafa-1e00-4433-95f3-91c169598ebd.json deleted file mode 100644 index 3f9b9daef..000000000 --- a/data/hfopenllm_v2/Sao10K/Fimbulvetr-11B-v2/daafaafa-1e00-4433-95f3-91c169598ebd.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Sao10K_Fimbulvetr-11B-v2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Fimbulvetr-11B-v2", - "id": "Sao10K/Fimbulvetr-11B-v2", - "developer": "Sao10K", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 10.732 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.51 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4544 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.068 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2919 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4354 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3301 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Sao10K/L3-70B-Euryale-v2.1/50e53ad5-8693-44c1-b5c7-45b91d7e0ae4.json b/data/hfopenllm_v2/Sao10K/L3-70B-Euryale-v2.1/50e53ad5-8693-44c1-b5c7-45b91d7e0ae4.json deleted file mode 100644 index 6cc9b0ce2..000000000 --- a/data/hfopenllm_v2/Sao10K/L3-70B-Euryale-v2.1/50e53ad5-8693-44c1-b5c7-45b91d7e0ae4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Sao10K_L3-70B-Euryale-v2.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "L3-70B-Euryale-v2.1", - "id": "Sao10K/L3-70B-Euryale-v2.1", - "developer": "Sao10K", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 70.554 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7384 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6471 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2137 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3314 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4209 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5104 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Sao10K/L3-70B-Euryale-v2.1/bda5d02f-7973-41a3-8f8e-4e33a12b74e0.json b/data/hfopenllm_v2/Sao10K/L3-70B-Euryale-v2.1/bda5d02f-7973-41a3-8f8e-4e33a12b74e0.json deleted file mode 100644 index f445ab801..000000000 --- a/data/hfopenllm_v2/Sao10K/L3-70B-Euryale-v2.1/bda5d02f-7973-41a3-8f8e-4e33a12b74e0.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Sao10K_L3-70B-Euryale-v2.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "L3-70B-Euryale-v2.1", - "id": "Sao10K/L3-70B-Euryale-v2.1", - "developer": "Sao10K", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 70.554 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7281 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6503 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2243 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3314 - } - }, - { - "evaluation_name": "MUSR", - 
"source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4196 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5096 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Sao10K/L3-8B-Lunaris-v1/99ff5ca5-4409-4d9c-9ec0-4cf392afeff2.json b/data/hfopenllm_v2/Sao10K/L3-8B-Lunaris-v1/99ff5ca5-4409-4d9c-9ec0-4cf392afeff2.json deleted file mode 100644 index a7a1e4096..000000000 --- a/data/hfopenllm_v2/Sao10K/L3-8B-Lunaris-v1/99ff5ca5-4409-4d9c-9ec0-4cf392afeff2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Sao10K_L3-8B-Lunaris-v1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "L3-8B-Lunaris-v1", - "id": "Sao10K/L3-8B-Lunaris-v1", - "developer": "Sao10K", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6895 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5235 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0906 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3012 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.3727 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3787 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Sao10K/L3-8B-Niitama-v1/362f5875-4dbc-4e68-90ce-789f692bb533.json b/data/hfopenllm_v2/Sao10K/L3-8B-Niitama-v1/362f5875-4dbc-4e68-90ce-789f692bb533.json deleted file mode 100644 index 70117da5c..000000000 --- a/data/hfopenllm_v2/Sao10K/L3-8B-Niitama-v1/362f5875-4dbc-4e68-90ce-789f692bb533.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Sao10K_L3-8B-Niitama-v1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "L3-8B-Niitama-v1", - "id": "Sao10K/L3-8B-Niitama-v1", - "developer": "Sao10K", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6791 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5303 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0982 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3079 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3807 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3701 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Sao10K/L3-8B-Stheno-v3.2/fdb5faf6-2cdd-42bb-b154-d6e93b2348bf.json b/data/hfopenllm_v2/Sao10K/L3-8B-Stheno-v3.2/fdb5faf6-2cdd-42bb-b154-d6e93b2348bf.json deleted file mode 100644 index cb2eabee5..000000000 --- a/data/hfopenllm_v2/Sao10K/L3-8B-Stheno-v3.2/fdb5faf6-2cdd-42bb-b154-d6e93b2348bf.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Sao10K_L3-8B-Stheno-v3.2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "L3-8B-Stheno-v3.2", - "id": "Sao10K/L3-8B-Stheno-v3.2", - "developer": "Sao10K", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6873 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5228 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0929 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3104 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3794 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3768 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/Sao10K/L3-8B-Stheno-v3.3-32K/93f829b8-b8d9-4389-a210-2a38c3a30edb.json b/data/hfopenllm_v2/Sao10K/L3-8B-Stheno-v3.3-32K/93f829b8-b8d9-4389-a210-2a38c3a30edb.json deleted file mode 100644 index 317e3d07a..000000000 --- a/data/hfopenllm_v2/Sao10K/L3-8B-Stheno-v3.3-32K/93f829b8-b8d9-4389-a210-2a38c3a30edb.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Sao10K_L3-8B-Stheno-v3.3-32K/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "L3-8B-Stheno-v3.3-32K", - "id": "Sao10K/L3-8B-Stheno-v3.3-32K", - "developer": "Sao10K", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4604 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3844 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0144 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2567 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3725 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1896 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Sao10K/MN-12B-Lyra-v3/6ec3554d-377b-4bf6-88ef-8a4c9e70f485.json b/data/hfopenllm_v2/Sao10K/MN-12B-Lyra-v3/6ec3554d-377b-4bf6-88ef-8a4c9e70f485.json deleted file mode 100644 index 6ca309c31..000000000 --- 
a/data/hfopenllm_v2/Sao10K/MN-12B-Lyra-v3/6ec3554d-377b-4bf6-88ef-8a4c9e70f485.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Sao10K_MN-12B-Lyra-v3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MN-12B-Lyra-v3", - "id": "Sao10K/MN-12B-Lyra-v3", - "developer": "Sao10K", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4486 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4804 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0937 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2777 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4019 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3249 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Saxo/Linkbricks-Horizon-AI-Avengers-V1-32B/70d749cf-2e92-4847-86de-7964fc8eb990.json b/data/hfopenllm_v2/Saxo/Linkbricks-Horizon-AI-Avengers-V1-32B/70d749cf-2e92-4847-86de-7964fc8eb990.json deleted file mode 100644 index 991d8a526..000000000 --- a/data/hfopenllm_v2/Saxo/Linkbricks-Horizon-AI-Avengers-V1-32B/70d749cf-2e92-4847-86de-7964fc8eb990.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/Saxo_Linkbricks-Horizon-AI-Avengers-V1-32B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Linkbricks-Horizon-AI-Avengers-V1-32B", - "id": "Saxo/Linkbricks-Horizon-AI-Avengers-V1-32B", - "developer": "Saxo", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 32.76 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7972 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7001 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6027 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3624 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4538 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5793 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Saxo/Linkbricks-Horizon-AI-Avengers-V2-32B/623f2b04-6cd7-4ea0-8844-badb0ff6c9c6.json b/data/hfopenllm_v2/Saxo/Linkbricks-Horizon-AI-Avengers-V2-32B/623f2b04-6cd7-4ea0-8844-badb0ff6c9c6.json deleted file mode 100644 index f34b82b65..000000000 --- a/data/hfopenllm_v2/Saxo/Linkbricks-Horizon-AI-Avengers-V2-32B/623f2b04-6cd7-4ea0-8844-badb0ff6c9c6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Saxo_Linkbricks-Horizon-AI-Avengers-V2-32B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF 
Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Linkbricks-Horizon-AI-Avengers-V2-32B", - "id": "Saxo/Linkbricks-Horizon-AI-Avengers-V2-32B", - "developer": "Saxo", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 32.76 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7956 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7023 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5665 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2659 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4166 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.572 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Saxo/Linkbricks-Horizon-AI-Avengers-V3-32B/e1aca741-2765-4e47-b6a1-49f3d9532432.json b/data/hfopenllm_v2/Saxo/Linkbricks-Horizon-AI-Avengers-V3-32B/e1aca741-2765-4e47-b6a1-49f3d9532432.json deleted file mode 100644 index 32459a45a..000000000 --- a/data/hfopenllm_v2/Saxo/Linkbricks-Horizon-AI-Avengers-V3-32B/e1aca741-2765-4e47-b6a1-49f3d9532432.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Saxo_Linkbricks-Horizon-AI-Avengers-V3-32B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": 
"Linkbricks-Horizon-AI-Avengers-V3-32B", - "id": "Saxo/Linkbricks-Horizon-AI-Avengers-V3-32B", - "developer": "Saxo", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 32.764 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8249 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6913 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6178 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3381 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4275 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5664 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Saxo/Linkbricks-Horizon-AI-Avengers-V4-32B/4f42366e-e6aa-4974-9a40-5781e350616d.json b/data/hfopenllm_v2/Saxo/Linkbricks-Horizon-AI-Avengers-V4-32B/4f42366e-e6aa-4974-9a40-5781e350616d.json deleted file mode 100644 index cba352636..000000000 --- a/data/hfopenllm_v2/Saxo/Linkbricks-Horizon-AI-Avengers-V4-32B/4f42366e-e6aa-4974-9a40-5781e350616d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Saxo_Linkbricks-Horizon-AI-Avengers-V4-32B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Linkbricks-Horizon-AI-Avengers-V4-32B", - "id": "Saxo/Linkbricks-Horizon-AI-Avengers-V4-32B", - "developer": "Saxo", - "inference_platform": "unknown", - 
"additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 32.764 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7631 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.692 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5363 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3616 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4643 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5752 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Saxo/Linkbricks-Horizon-AI-Avengers-V5-32B/4ec2231d-c012-4ad3-830c-8ff86c977202.json b/data/hfopenllm_v2/Saxo/Linkbricks-Horizon-AI-Avengers-V5-32B/4ec2231d-c012-4ad3-830c-8ff86c977202.json deleted file mode 100644 index 9e429543e..000000000 --- a/data/hfopenllm_v2/Saxo/Linkbricks-Horizon-AI-Avengers-V5-32B/4ec2231d-c012-4ad3-830c-8ff86c977202.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Saxo_Linkbricks-Horizon-AI-Avengers-V5-32B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Linkbricks-Horizon-AI-Avengers-V5-32B", - "id": "Saxo/Linkbricks-Horizon-AI-Avengers-V5-32B", - "developer": "Saxo", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 32.764 - } - }, - "evaluation_results": [ - { - 
"evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7516 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6929 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5461 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3557 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4709 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5762 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Saxo/Linkbricks-Horizon-AI-Avengers-V6-32B/1d2e5513-bd0c-4795-8487-f5266c6e368f.json b/data/hfopenllm_v2/Saxo/Linkbricks-Horizon-AI-Avengers-V6-32B/1d2e5513-bd0c-4795-8487-f5266c6e368f.json deleted file mode 100644 index 1faf938ef..000000000 --- a/data/hfopenllm_v2/Saxo/Linkbricks-Horizon-AI-Avengers-V6-32B/1d2e5513-bd0c-4795-8487-f5266c6e368f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Saxo_Linkbricks-Horizon-AI-Avengers-V6-32B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Linkbricks-Horizon-AI-Avengers-V6-32B", - "id": "Saxo/Linkbricks-Horizon-AI-Avengers-V6-32B", - "developer": "Saxo", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 32.76 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8209 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.689 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6224 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3347 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4274 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5672 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Saxo/Linkbricks-Horizon-AI-Korean-Avengers-V2-27B/104172b7-86f5-410a-a454-63e1cfbeb87f.json b/data/hfopenllm_v2/Saxo/Linkbricks-Horizon-AI-Korean-Avengers-V2-27B/104172b7-86f5-410a-a454-63e1cfbeb87f.json deleted file mode 100644 index f8ce92b90..000000000 --- a/data/hfopenllm_v2/Saxo/Linkbricks-Horizon-AI-Korean-Avengers-V2-27B/104172b7-86f5-410a-a454-63e1cfbeb87f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Saxo_Linkbricks-Horizon-AI-Korean-Avengers-V2-27B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Linkbricks-Horizon-AI-Korean-Avengers-V2-27B", - "id": "Saxo/Linkbricks-Horizon-AI-Korean-Avengers-V2-27B", - "developer": "Saxo", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 27.227 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, 
- "max_score": 1.0 - }, - "score_details": { - "score": 0.8146 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6463 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2802 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3473 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4139 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4599 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Saxo/Linkbricks-Horizon-AI-Korean-Avengers-V3-27B/d28e04ac-7d18-43fb-80b8-82c0662fec79.json b/data/hfopenllm_v2/Saxo/Linkbricks-Horizon-AI-Korean-Avengers-V3-27B/d28e04ac-7d18-43fb-80b8-82c0662fec79.json deleted file mode 100644 index 2df6b6d76..000000000 --- a/data/hfopenllm_v2/Saxo/Linkbricks-Horizon-AI-Korean-Avengers-V3-27B/d28e04ac-7d18-43fb-80b8-82c0662fec79.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Saxo_Linkbricks-Horizon-AI-Korean-Avengers-V3-27B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Linkbricks-Horizon-AI-Korean-Avengers-V3-27B", - "id": "Saxo/Linkbricks-Horizon-AI-Korean-Avengers-V3-27B", - "developer": "Saxo", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 27.227 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8142 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - 
"dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6404 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2492 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3591 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4467 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4524 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Saxo/Linkbricks-Horizon-AI-Korean-Superb-22B/20bb3819-9d85-4d84-99ba-65e33965f0c5.json b/data/hfopenllm_v2/Saxo/Linkbricks-Horizon-AI-Korean-Superb-22B/20bb3819-9d85-4d84-99ba-65e33965f0c5.json deleted file mode 100644 index f703921c4..000000000 --- a/data/hfopenllm_v2/Saxo/Linkbricks-Horizon-AI-Korean-Superb-22B/20bb3819-9d85-4d84-99ba-65e33965f0c5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Saxo_Linkbricks-Horizon-AI-Korean-Superb-22B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Linkbricks-Horizon-AI-Korean-Superb-22B", - "id": "Saxo/Linkbricks-Horizon-AI-Korean-Superb-22B", - "developer": "Saxo", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 22.247 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6767 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on 
BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5626 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2372 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3263 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3908 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3871 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Saxo/Linkbricks-Horizon-AI-Korean-Superb-27B/3a4bdf58-0137-4d85-b567-59b3fed3dad5.json b/data/hfopenllm_v2/Saxo/Linkbricks-Horizon-AI-Korean-Superb-27B/3a4bdf58-0137-4d85-b567-59b3fed3dad5.json deleted file mode 100644 index d43e428ac..000000000 --- a/data/hfopenllm_v2/Saxo/Linkbricks-Horizon-AI-Korean-Superb-27B/3a4bdf58-0137-4d85-b567-59b3fed3dad5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Saxo_Linkbricks-Horizon-AI-Korean-Superb-27B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Linkbricks-Horizon-AI-Korean-Superb-27B", - "id": "Saxo/Linkbricks-Horizon-AI-Korean-Superb-27B", - "developer": "Saxo", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 27.227 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7768 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6518 - } - }, 
- { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2719 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3599 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4791 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4647 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Saxo/Linkbricks-Horizon-AI-Superb-27B/04f843ba-947c-4732-979c-2aeae7d34e5a.json b/data/hfopenllm_v2/Saxo/Linkbricks-Horizon-AI-Superb-27B/04f843ba-947c-4732-979c-2aeae7d34e5a.json deleted file mode 100644 index f7e4a247b..000000000 --- a/data/hfopenllm_v2/Saxo/Linkbricks-Horizon-AI-Superb-27B/04f843ba-947c-4732-979c-2aeae7d34e5a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Saxo_Linkbricks-Horizon-AI-Superb-27B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Linkbricks-Horizon-AI-Superb-27B", - "id": "Saxo/Linkbricks-Horizon-AI-Superb-27B", - "developer": "Saxo", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 27.227 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7302 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6186 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - 
"metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2221 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3574 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.465 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.406 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/SeaLLMs/SeaLLM-7B-v2.5/173a31d3-7d12-4ab1-a963-005a81aee767.json b/data/hfopenllm_v2/SeaLLMs/SeaLLM-7B-v2.5/173a31d3-7d12-4ab1-a963-005a81aee767.json deleted file mode 100644 index 03259aebd..000000000 --- a/data/hfopenllm_v2/SeaLLMs/SeaLLM-7B-v2.5/173a31d3-7d12-4ab1-a963-005a81aee767.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/SeaLLMs_SeaLLM-7B-v2.5/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SeaLLM-7B-v2.5", - "id": "SeaLLMs/SeaLLM-7B-v2.5", - "developer": "SeaLLMs", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "GemmaForCausalLM", - "params_billions": 8.538 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4522 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.498 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1088 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - 
"dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.276 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4203 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3203 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/SeaLLMs/SeaLLM-7B-v2/d0555736-b614-43ca-91d7-8264e3566872.json b/data/hfopenllm_v2/SeaLLMs/SeaLLM-7B-v2/d0555736-b614-43ca-91d7-8264e3566872.json deleted file mode 100644 index 98630be5e..000000000 --- a/data/hfopenllm_v2/SeaLLMs/SeaLLM-7B-v2/d0555736-b614-43ca-91d7-8264e3566872.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/SeaLLMs_SeaLLM-7B-v2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SeaLLM-7B-v2", - "id": "SeaLLMs/SeaLLM-7B-v2", - "developer": "SeaLLMs", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.376 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3671 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4902 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0853 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2785 
- } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.407 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3083 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/SeaLLMs/SeaLLMs-v3-7B-Chat/4b7b13b7-4aee-4462-87e6-aa6c15068236.json b/data/hfopenllm_v2/SeaLLMs/SeaLLMs-v3-7B-Chat/4b7b13b7-4aee-4462-87e6-aa6c15068236.json deleted file mode 100644 index 05981a658..000000000 --- a/data/hfopenllm_v2/SeaLLMs/SeaLLMs-v3-7B-Chat/4b7b13b7-4aee-4462-87e6-aa6c15068236.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/SeaLLMs_SeaLLMs-v3-7B-Chat/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SeaLLMs-v3-7B-Chat", - "id": "SeaLLMs/SeaLLMs-v3-7B-Chat", - "developer": "SeaLLMs", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4377 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5266 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1858 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2987 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4174 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3895 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/SenseLLM/ReflectionCoder-CL-34B/4b1f9ce5-bb12-42e3-b0e0-afaa784b0c4c.json b/data/hfopenllm_v2/SenseLLM/ReflectionCoder-CL-34B/4b1f9ce5-bb12-42e3-b0e0-afaa784b0c4c.json deleted file mode 100644 index e86dab4c2..000000000 --- a/data/hfopenllm_v2/SenseLLM/ReflectionCoder-CL-34B/4b1f9ce5-bb12-42e3-b0e0-afaa784b0c4c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/SenseLLM_ReflectionCoder-CL-34B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ReflectionCoder-CL-34B", - "id": "SenseLLM/ReflectionCoder-CL-34B", - "developer": "SenseLLM", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 33.744 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4008 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3953 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0332 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2508 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4155 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - 
"hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1424 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/SenseLLM/ReflectionCoder-DS-33B/acbcd5a5-bcd8-4209-b35f-425feada7e8b.json b/data/hfopenllm_v2/SenseLLM/ReflectionCoder-DS-33B/acbcd5a5-bcd8-4209-b35f-425feada7e8b.json deleted file mode 100644 index 20a36d5af..000000000 --- a/data/hfopenllm_v2/SenseLLM/ReflectionCoder-DS-33B/acbcd5a5-bcd8-4209-b35f-425feada7e8b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/SenseLLM_ReflectionCoder-DS-33B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ReflectionCoder-DS-33B", - "id": "SenseLLM/ReflectionCoder-DS-33B", - "developer": "SenseLLM", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 33.34 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3787 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3449 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0302 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2743 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3343 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.1202 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/SentientAGI/Dobby-Mini-Leashed-Llama-3.1-8B/cb9a415f-1a02-46ad-a731-bf825ddd78ae.json b/data/hfopenllm_v2/SentientAGI/Dobby-Mini-Leashed-Llama-3.1-8B/cb9a415f-1a02-46ad-a731-bf825ddd78ae.json deleted file mode 100644 index df38f37c7..000000000 --- a/data/hfopenllm_v2/SentientAGI/Dobby-Mini-Leashed-Llama-3.1-8B/cb9a415f-1a02-46ad-a731-bf825ddd78ae.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/SentientAGI_Dobby-Mini-Leashed-Llama-3.1-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Dobby-Mini-Leashed-Llama-3.1-8B", - "id": "SentientAGI/Dobby-Mini-Leashed-Llama-3.1-8B", - "developer": "SentientAGI", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7847 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5138 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1858 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.302 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4254 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3694 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/SentientAGI/Dobby-Mini-Unhinged-Llama-3.1-8B/92cde6db-47f4-43c6-9ad5-643c35faa226.json b/data/hfopenllm_v2/SentientAGI/Dobby-Mini-Unhinged-Llama-3.1-8B/92cde6db-47f4-43c6-9ad5-643c35faa226.json deleted file mode 100644 index 771f354ce..000000000 --- a/data/hfopenllm_v2/SentientAGI/Dobby-Mini-Unhinged-Llama-3.1-8B/92cde6db-47f4-43c6-9ad5-643c35faa226.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/SentientAGI_Dobby-Mini-Unhinged-Llama-3.1-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Dobby-Mini-Unhinged-Llama-3.1-8B", - "id": "SentientAGI/Dobby-Mini-Unhinged-Llama-3.1-8B", - "developer": "SentientAGI", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7457 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5142 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1563 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3062 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4013 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3585 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/SeppeV/SmolLM_pretrained_with_sft_trained_with_1pc_data_on_a_preference_dpo/5e88a037-f9bd-4b39-944f-f0781bb7884f.json 
b/data/hfopenllm_v2/SeppeV/SmolLM_pretrained_with_sft_trained_with_1pc_data_on_a_preference_dpo/5e88a037-f9bd-4b39-944f-f0781bb7884f.json deleted file mode 100644 index d4f48f999..000000000 --- a/data/hfopenllm_v2/SeppeV/SmolLM_pretrained_with_sft_trained_with_1pc_data_on_a_preference_dpo/5e88a037-f9bd-4b39-944f-f0781bb7884f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/SeppeV_SmolLM_pretrained_with_sft_trained_with_1pc_data_on_a_preference_dpo/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SmolLM_pretrained_with_sft_trained_with_1pc_data_on_a_preference_dpo", - "id": "SeppeV/SmolLM_pretrained_with_sft_trained_with_1pc_data_on_a_preference_dpo", - "developer": "SeppeV", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 0.135 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0955 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3073 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0121 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2592 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4032 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1161 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Sharathhebbar24/SSH_355M/d4b08f5d-5add-49f4-b8db-c1a12e0a5313.json 
b/data/hfopenllm_v2/Sharathhebbar24/SSH_355M/d4b08f5d-5add-49f4-b8db-c1a12e0a5313.json deleted file mode 100644 index ab1e96d43..000000000 --- a/data/hfopenllm_v2/Sharathhebbar24/SSH_355M/d4b08f5d-5add-49f4-b8db-c1a12e0a5313.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Sharathhebbar24_SSH_355M/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SSH_355M", - "id": "Sharathhebbar24/SSH_355M", - "developer": "Sharathhebbar24", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "GPT2LMHeadModel", - "params_billions": 0.355 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1424 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3099 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0091 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2584 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4178 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1176 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Sharathhebbar24/chat_gpt2_dpo/ac5adf39-f0a4-439b-9873-9141e0a554b1.json b/data/hfopenllm_v2/Sharathhebbar24/chat_gpt2_dpo/ac5adf39-f0a4-439b-9873-9141e0a554b1.json deleted file mode 100644 index 3c6c53f61..000000000 --- a/data/hfopenllm_v2/Sharathhebbar24/chat_gpt2_dpo/ac5adf39-f0a4-439b-9873-9141e0a554b1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ 
- "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Sharathhebbar24_chat_gpt2_dpo/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "chat_gpt2_dpo", - "id": "Sharathhebbar24/chat_gpt2_dpo", - "developer": "Sharathhebbar24", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "GPT2LMHeadModel", - "params_billions": 0.124 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0986 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2902 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0053 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2601 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3818 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1142 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Shreyash2010/Uma-4x4B-Instruct-v0.1/62965c92-cdf4-4a3b-b035-990abaab615c.json b/data/hfopenllm_v2/Shreyash2010/Uma-4x4B-Instruct-v0.1/62965c92-cdf4-4a3b-b035-990abaab615c.json deleted file mode 100644 index 5d50114d7..000000000 --- a/data/hfopenllm_v2/Shreyash2010/Uma-4x4B-Instruct-v0.1/62965c92-cdf4-4a3b-b035-990abaab615c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Shreyash2010_Uma-4x4B-Instruct-v0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - 
"source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Uma-4x4B-Instruct-v0.1", - "id": "Shreyash2010/Uma-4x4B-Instruct-v0.1", - "developer": "Shreyash2010", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "?", - "params_billions": 3.821 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5517 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5512 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1775 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3347 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4441 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.387 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Sicarius-Prototyping/Brainy_LLAMA/3866ece8-d70a-4061-9e86-0798ecd98bd6.json b/data/hfopenllm_v2/Sicarius-Prototyping/Brainy_LLAMA/3866ece8-d70a-4061-9e86-0798ecd98bd6.json deleted file mode 100644 index 26ba36c75..000000000 --- a/data/hfopenllm_v2/Sicarius-Prototyping/Brainy_LLAMA/3866ece8-d70a-4061-9e86-0798ecd98bd6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Sicarius-Prototyping_Brainy_LLAMA/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Brainy_LLAMA", - "id": "Sicarius-Prototyping/Brainy_LLAMA", - "developer": 
"Sicarius-Prototyping", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5204 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5117 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1337 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3138 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4143 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3849 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Sicarius-Prototyping/Micropenis_1B/ff484d0e-bb14-4a80-ae29-2351b03cf278.json b/data/hfopenllm_v2/Sicarius-Prototyping/Micropenis_1B/ff484d0e-bb14-4a80-ae29-2351b03cf278.json deleted file mode 100644 index 61ea14597..000000000 --- a/data/hfopenllm_v2/Sicarius-Prototyping/Micropenis_1B/ff484d0e-bb14-4a80-ae29-2351b03cf278.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Sicarius-Prototyping_Micropenis_1B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Micropenis_1B", - "id": "Sicarius-Prototyping/Micropenis_1B", - "developer": "Sicarius-Prototyping", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 0.618 - } - }, - "evaluation_results": [ - { - 
"evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3461 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3372 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0461 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2626 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3325 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.186 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Sicarius-Prototyping/bacon_and_food/06ac1718-fe71-4e05-a47f-1200e067336c.json b/data/hfopenllm_v2/Sicarius-Prototyping/bacon_and_food/06ac1718-fe71-4e05-a47f-1200e067336c.json deleted file mode 100644 index a582b1eff..000000000 --- a/data/hfopenllm_v2/Sicarius-Prototyping/bacon_and_food/06ac1718-fe71-4e05-a47f-1200e067336c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Sicarius-Prototyping_bacon_and_food/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "bacon_and_food", - "id": "Sicarius-Prototyping/bacon_and_food", - "developer": "Sicarius-Prototyping", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.586 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4725 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0982 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3096 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3884 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3263 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/SicariusSicariiStuff/2B-ad/4ddb1616-7889-45ef-96de-823fee338e1d.json b/data/hfopenllm_v2/SicariusSicariiStuff/2B-ad/4ddb1616-7889-45ef-96de-823fee338e1d.json deleted file mode 100644 index 01e28f2ac..000000000 --- a/data/hfopenllm_v2/SicariusSicariiStuff/2B-ad/4ddb1616-7889-45ef-96de-823fee338e1d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/SicariusSicariiStuff_2B-ad/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "2B-ad", - "id": "SicariusSicariiStuff/2B-ad", - "developer": "SicariusSicariiStuff", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 3.204 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4379 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - 
"hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4092 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0506 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.281 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4015 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2662 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/SicariusSicariiStuff/2B_or_not_2B/487dd91b-5bc4-4355-90d3-c82ecc789ab3.json b/data/hfopenllm_v2/SicariusSicariiStuff/2B_or_not_2B/487dd91b-5bc4-4355-90d3-c82ecc789ab3.json deleted file mode 100644 index 55d96ac62..000000000 --- a/data/hfopenllm_v2/SicariusSicariiStuff/2B_or_not_2B/487dd91b-5bc4-4355-90d3-c82ecc789ab3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/SicariusSicariiStuff_2B_or_not_2B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "2B_or_not_2B", - "id": "SicariusSicariiStuff/2B_or_not_2B", - "developer": "SicariusSicariiStuff", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "GemmaForCausalLM", - "params_billions": 2.506 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2062 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { 
- "score": 0.3416 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0196 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2475 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3791 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1399 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/SicariusSicariiStuff/Dusk_Rainbow/a74e86d9-8b94-4f60-8f0c-73cc4b04d905.json b/data/hfopenllm_v2/SicariusSicariiStuff/Dusk_Rainbow/a74e86d9-8b94-4f60-8f0c-73cc4b04d905.json deleted file mode 100644 index 0d3c0a4eb..000000000 --- a/data/hfopenllm_v2/SicariusSicariiStuff/Dusk_Rainbow/a74e86d9-8b94-4f60-8f0c-73cc4b04d905.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/SicariusSicariiStuff_Dusk_Rainbow/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Dusk_Rainbow", - "id": "SicariusSicariiStuff/Dusk_Rainbow", - "developer": "SicariusSicariiStuff", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3588 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4772 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - 
"metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0748 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3087 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4025 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3443 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/SicariusSicariiStuff/Eximius_Persona_5B/9a9239ab-9e0e-449b-bd1b-6ec280fad505.json b/data/hfopenllm_v2/SicariusSicariiStuff/Eximius_Persona_5B/9a9239ab-9e0e-449b-bd1b-6ec280fad505.json deleted file mode 100644 index 0bf2ac15d..000000000 --- a/data/hfopenllm_v2/SicariusSicariiStuff/Eximius_Persona_5B/9a9239ab-9e0e-449b-bd1b-6ec280fad505.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/SicariusSicariiStuff_Eximius_Persona_5B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Eximius_Persona_5B", - "id": "SicariusSicariiStuff/Eximius_Persona_5B", - "developer": "SicariusSicariiStuff", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 5.821 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.656 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4512 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.102 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2643 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3818 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.314 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/SicariusSicariiStuff/Impish_LLAMA_3B/2c710cd5-75a6-46b7-8356-212da7bf864d.json b/data/hfopenllm_v2/SicariusSicariiStuff/Impish_LLAMA_3B/2c710cd5-75a6-46b7-8356-212da7bf864d.json deleted file mode 100644 index a3193370f..000000000 --- a/data/hfopenllm_v2/SicariusSicariiStuff/Impish_LLAMA_3B/2c710cd5-75a6-46b7-8356-212da7bf864d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/SicariusSicariiStuff_Impish_LLAMA_3B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Impish_LLAMA_3B", - "id": "SicariusSicariiStuff/Impish_LLAMA_3B", - "developer": "SicariusSicariiStuff", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.463 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4091 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1125 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2878 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3673 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2941 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/SicariusSicariiStuff/Impish_Mind_8B/377d5240-73b5-48d0-bbdc-0960ad1d9069.json b/data/hfopenllm_v2/SicariusSicariiStuff/Impish_Mind_8B/377d5240-73b5-48d0-bbdc-0960ad1d9069.json deleted file mode 100644 index 4fcce76c3..000000000 --- a/data/hfopenllm_v2/SicariusSicariiStuff/Impish_Mind_8B/377d5240-73b5-48d0-bbdc-0960ad1d9069.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/SicariusSicariiStuff_Impish_Mind_8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Impish_Mind_8B", - "id": "SicariusSicariiStuff/Impish_Mind_8B", - "developer": "SicariusSicariiStuff", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3179 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4674 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.105 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3045 - } - 
}, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.407 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3309 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/SicariusSicariiStuff/Impish_QWEN_14B-1M/9f31a6da-c5bd-4143-b2f9-715c0e9f7b74.json b/data/hfopenllm_v2/SicariusSicariiStuff/Impish_QWEN_14B-1M/9f31a6da-c5bd-4143-b2f9-715c0e9f7b74.json deleted file mode 100644 index 644abf1f0..000000000 --- a/data/hfopenllm_v2/SicariusSicariiStuff/Impish_QWEN_14B-1M/9f31a6da-c5bd-4143-b2f9-715c0e9f7b74.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/SicariusSicariiStuff_Impish_QWEN_14B-1M/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Impish_QWEN_14B-1M", - "id": "SicariusSicariiStuff/Impish_QWEN_14B-1M", - "developer": "SicariusSicariiStuff", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7868 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6283 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3965 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3507 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4615 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5044 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/SicariusSicariiStuff/Impish_QWEN_7B-1M/104a0157-c614-44cf-b6cc-9f15dab4b187.json b/data/hfopenllm_v2/SicariusSicariiStuff/Impish_QWEN_7B-1M/104a0157-c614-44cf-b6cc-9f15dab4b187.json deleted file mode 100644 index 30b1d2ab9..000000000 --- a/data/hfopenllm_v2/SicariusSicariiStuff/Impish_QWEN_7B-1M/104a0157-c614-44cf-b6cc-9f15dab4b187.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/SicariusSicariiStuff_Impish_QWEN_7B-1M/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Impish_QWEN_7B-1M", - "id": "SicariusSicariiStuff/Impish_QWEN_7B-1M", - "developer": "SicariusSicariiStuff", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6382 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5372 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3089 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2961 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4074 - } - 
}, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4265 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/SicariusSicariiStuff/LLAMA-3_8B_Unaligned_BETA/bb379093-c169-44bd-ac86-edb8ab8fc225.json b/data/hfopenllm_v2/SicariusSicariiStuff/LLAMA-3_8B_Unaligned_BETA/bb379093-c169-44bd-ac86-edb8ab8fc225.json deleted file mode 100644 index be20ff82c..000000000 --- a/data/hfopenllm_v2/SicariusSicariiStuff/LLAMA-3_8B_Unaligned_BETA/bb379093-c169-44bd-ac86-edb8ab8fc225.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/SicariusSicariiStuff_LLAMA-3_8B_Unaligned_BETA/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "LLAMA-3_8B_Unaligned_BETA", - "id": "SicariusSicariiStuff/LLAMA-3_8B_Unaligned_BETA", - "developer": "SicariusSicariiStuff", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3713 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4717 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0838 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3054 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4119 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": 
"TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3465 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/SicariusSicariiStuff/Phi-Line_14B/e29001c0-17c0-4deb-8ca2-ce9ad06d8cb3.json b/data/hfopenllm_v2/SicariusSicariiStuff/Phi-Line_14B/e29001c0-17c0-4deb-8ca2-ce9ad06d8cb3.json deleted file mode 100644 index d361cd930..000000000 --- a/data/hfopenllm_v2/SicariusSicariiStuff/Phi-Line_14B/e29001c0-17c0-4deb-8ca2-ce9ad06d8cb3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/SicariusSicariiStuff_Phi-Line_14B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Phi-Line_14B", - "id": "SicariusSicariiStuff/Phi-Line_14B", - "developer": "SicariusSicariiStuff", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6496 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6154 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.386 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3532 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4479 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.5454 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/SicariusSicariiStuff/Phi-lthy4/43d87bf5-2620-4f8e-a8b6-f86fc157d987.json b/data/hfopenllm_v2/SicariusSicariiStuff/Phi-lthy4/43d87bf5-2620-4f8e-a8b6-f86fc157d987.json deleted file mode 100644 index 9e1a945d7..000000000 --- a/data/hfopenllm_v2/SicariusSicariiStuff/Phi-lthy4/43d87bf5-2620-4f8e-a8b6-f86fc157d987.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/SicariusSicariiStuff_Phi-lthy4/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Phi-lthy4", - "id": "SicariusSicariiStuff/Phi-lthy4", - "developer": "SicariusSicariiStuff", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 11.933 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7679 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5879 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1367 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2869 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4083 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4333 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/SicariusSicariiStuff/Qwen2.5-14B_Uncencored/735d9d75-d9d1-4553-b7cf-f8e7c2e65218.json 
b/data/hfopenllm_v2/SicariusSicariiStuff/Qwen2.5-14B_Uncencored/735d9d75-d9d1-4553-b7cf-f8e7c2e65218.json deleted file mode 100644 index becab3e7a..000000000 --- a/data/hfopenllm_v2/SicariusSicariiStuff/Qwen2.5-14B_Uncencored/735d9d75-d9d1-4553-b7cf-f8e7c2e65218.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/SicariusSicariiStuff_Qwen2.5-14B_Uncencored/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-14B_Uncencored", - "id": "SicariusSicariiStuff/Qwen2.5-14B_Uncencored", - "developer": "SicariusSicariiStuff", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3158 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6309 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.318 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3817 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4517 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5266 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/SicariusSicariiStuff/Qwen2.5-14B_Uncensored/0c6dcc87-343c-4973-a589-3e3393829184.json b/data/hfopenllm_v2/SicariusSicariiStuff/Qwen2.5-14B_Uncensored/0c6dcc87-343c-4973-a589-3e3393829184.json deleted file mode 100644 index ffe8535f1..000000000 --- 
a/data/hfopenllm_v2/SicariusSicariiStuff/Qwen2.5-14B_Uncensored/0c6dcc87-343c-4973-a589-3e3393829184.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/SicariusSicariiStuff_Qwen2.5-14B_Uncensored/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-14B_Uncensored", - "id": "SicariusSicariiStuff/Qwen2.5-14B_Uncensored", - "developer": "SicariusSicariiStuff", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3173 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6309 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.318 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3817 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4517 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5266 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/SicariusSicariiStuff/Qwen2.5-14B_Uncensored_Instruct/7c1d1657-e9ae-433f-be9d-523431bfc7ae.json b/data/hfopenllm_v2/SicariusSicariiStuff/Qwen2.5-14B_Uncensored_Instruct/7c1d1657-e9ae-433f-be9d-523431bfc7ae.json deleted file mode 100644 index 79cd730d2..000000000 --- a/data/hfopenllm_v2/SicariusSicariiStuff/Qwen2.5-14B_Uncensored_Instruct/7c1d1657-e9ae-433f-be9d-523431bfc7ae.json +++ /dev/null @@ -1,132 +0,0 @@ 
-{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/SicariusSicariiStuff_Qwen2.5-14B_Uncensored_Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-14B_Uncensored_Instruct", - "id": "SicariusSicariiStuff/Qwen2.5-14B_Uncensored_Instruct", - "developer": "SicariusSicariiStuff", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3789 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5937 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3285 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3297 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3697 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5127 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/SicariusSicariiStuff/Redemption_Wind_24B/0b2d9a65-c028-4f4b-a280-dc0c35ac9516.json b/data/hfopenllm_v2/SicariusSicariiStuff/Redemption_Wind_24B/0b2d9a65-c028-4f4b-a280-dc0c35ac9516.json deleted file mode 100644 index 9778a9581..000000000 --- a/data/hfopenllm_v2/SicariusSicariiStuff/Redemption_Wind_24B/0b2d9a65-c028-4f4b-a280-dc0c35ac9516.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/SicariusSicariiStuff_Redemption_Wind_24B/1770682486.623709", - 
"retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Redemption_Wind_24B", - "id": "SicariusSicariiStuff/Redemption_Wind_24B", - "developer": "SicariusSicariiStuff", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 23.572 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2501 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6428 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1858 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3834 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4262 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5432 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/SicariusSicariiStuff/Winged_Imp_8B/e87e1d3f-1476-499d-a9f3-b6463b429262.json b/data/hfopenllm_v2/SicariusSicariiStuff/Winged_Imp_8B/e87e1d3f-1476-499d-a9f3-b6463b429262.json deleted file mode 100644 index d8868496d..000000000 --- a/data/hfopenllm_v2/SicariusSicariiStuff/Winged_Imp_8B/e87e1d3f-1476-499d-a9f3-b6463b429262.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/SicariusSicariiStuff_Winged_Imp_8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - 
"evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Winged_Imp_8B", - "id": "SicariusSicariiStuff/Winged_Imp_8B", - "developer": "SicariusSicariiStuff", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.743 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.512 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1201 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2827 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4148 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3639 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/SicariusSicariiStuff/Wingless_Imp_8B/246e8450-3c53-4bde-99bb-5663f751e88e.json b/data/hfopenllm_v2/SicariusSicariiStuff/Wingless_Imp_8B/246e8450-3c53-4bde-99bb-5663f751e88e.json deleted file mode 100644 index 822e85e02..000000000 --- a/data/hfopenllm_v2/SicariusSicariiStuff/Wingless_Imp_8B/246e8450-3c53-4bde-99bb-5663f751e88e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/SicariusSicariiStuff_Wingless_Imp_8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Wingless_Imp_8B", - "id": "SicariusSicariiStuff/Wingless_Imp_8B", - "developer": "SicariusSicariiStuff", - "inference_platform": 
"unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.743 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.512 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1201 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2827 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4148 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3639 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/SicariusSicariiStuff/Zion_Alpha/496b9e45-2f64-456e-b35e-12a94c5643b1.json b/data/hfopenllm_v2/SicariusSicariiStuff/Zion_Alpha/496b9e45-2f64-456e-b35e-12a94c5643b1.json deleted file mode 100644 index 87bd4943e..000000000 --- a/data/hfopenllm_v2/SicariusSicariiStuff/Zion_Alpha/496b9e45-2f64-456e-b35e-12a94c5643b1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/SicariusSicariiStuff_Zion_Alpha/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Zion_Alpha", - "id": "SicariusSicariiStuff/Zion_Alpha", - "developer": "SicariusSicariiStuff", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": 
"IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3324 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4932 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0521 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2903 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4727 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3132 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/SicariusSicariiStuff/dn_ep02/05890047-a95a-433e-b6b6-fb037592cdd1.json b/data/hfopenllm_v2/SicariusSicariiStuff/dn_ep02/05890047-a95a-433e-b6b6-fb037592cdd1.json deleted file mode 100644 index 1653d6cec..000000000 --- a/data/hfopenllm_v2/SicariusSicariiStuff/dn_ep02/05890047-a95a-433e-b6b6-fb037592cdd1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/SicariusSicariiStuff_dn_ep02/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "dn_ep02", - "id": "SicariusSicariiStuff/dn_ep02", - "developer": "SicariusSicariiStuff", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.5064 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5266 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.142 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3154 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4316 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3998 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/SkyOrbis/SKY-Ko-Llama3.1-8B-lora-epoch1/4a30580c-1d25-49d4-984d-2d28ef3a5656.json b/data/hfopenllm_v2/SkyOrbis/SKY-Ko-Llama3.1-8B-lora-epoch1/4a30580c-1d25-49d4-984d-2d28ef3a5656.json deleted file mode 100644 index 79a1347e0..000000000 --- a/data/hfopenllm_v2/SkyOrbis/SKY-Ko-Llama3.1-8B-lora-epoch1/4a30580c-1d25-49d4-984d-2d28ef3a5656.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/SkyOrbis_SKY-Ko-Llama3.1-8B-lora-epoch1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SKY-Ko-Llama3.1-8B-lora-epoch1", - "id": "SkyOrbis/SKY-Ko-Llama3.1-8B-lora-epoch1", - "developer": "SkyOrbis", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5058 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, 
- "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5088 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1548 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3213 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3998 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3777 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/SkyOrbis/SKY-Ko-Llama3.1-8B-lora/696d7966-d140-4f43-91df-54f02247b34f.json b/data/hfopenllm_v2/SkyOrbis/SKY-Ko-Llama3.1-8B-lora/696d7966-d140-4f43-91df-54f02247b34f.json deleted file mode 100644 index 3e699beeb..000000000 --- a/data/hfopenllm_v2/SkyOrbis/SKY-Ko-Llama3.1-8B-lora/696d7966-d140-4f43-91df-54f02247b34f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/SkyOrbis_SKY-Ko-Llama3.1-8B-lora/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SKY-Ko-Llama3.1-8B-lora", - "id": "SkyOrbis/SKY-Ko-Llama3.1-8B-lora", - "developer": "SkyOrbis", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5058 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5088 - } - }, - { - 
"evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1548 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3213 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3998 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3777 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/SkyOrbis/SKY-Ko-Llama3.2-1B-lora-epoch3/fdf10ab8-e3f9-49e6-8fd0-ed116868c217.json b/data/hfopenllm_v2/SkyOrbis/SKY-Ko-Llama3.2-1B-lora-epoch3/fdf10ab8-e3f9-49e6-8fd0-ed116868c217.json deleted file mode 100644 index cc2e232f1..000000000 --- a/data/hfopenllm_v2/SkyOrbis/SKY-Ko-Llama3.2-1B-lora-epoch3/fdf10ab8-e3f9-49e6-8fd0-ed116868c217.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/SkyOrbis_SKY-Ko-Llama3.2-1B-lora-epoch3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SKY-Ko-Llama3.2-1B-lora-epoch3", - "id": "SkyOrbis/SKY-Ko-Llama3.2-1B-lora-epoch3", - "developer": "SkyOrbis", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.236 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3247 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3167 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - 
"metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0272 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2517 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3382 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1279 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/SkyOrbis/SKY-Ko-Llama3.2-1B-lora-epoch5/9ac16d1f-d894-414d-8a14-110e971d0ba6.json b/data/hfopenllm_v2/SkyOrbis/SKY-Ko-Llama3.2-1B-lora-epoch5/9ac16d1f-d894-414d-8a14-110e971d0ba6.json deleted file mode 100644 index c6e0028d4..000000000 --- a/data/hfopenllm_v2/SkyOrbis/SKY-Ko-Llama3.2-1B-lora-epoch5/9ac16d1f-d894-414d-8a14-110e971d0ba6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/SkyOrbis_SKY-Ko-Llama3.2-1B-lora-epoch5/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SKY-Ko-Llama3.2-1B-lora-epoch5", - "id": "SkyOrbis/SKY-Ko-Llama3.2-1B-lora-epoch5", - "developer": "SkyOrbis", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.236 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.436 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3406 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.0521 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2592 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3471 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1946 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/SkyOrbis/SKY-Ko-Llama3.2-1B-lora-v2-epoch3/2eb01e0e-8f7b-4956-9a2d-b32ecaa936f6.json b/data/hfopenllm_v2/SkyOrbis/SKY-Ko-Llama3.2-1B-lora-v2-epoch3/2eb01e0e-8f7b-4956-9a2d-b32ecaa936f6.json deleted file mode 100644 index 13bfe0eca..000000000 --- a/data/hfopenllm_v2/SkyOrbis/SKY-Ko-Llama3.2-1B-lora-v2-epoch3/2eb01e0e-8f7b-4956-9a2d-b32ecaa936f6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/SkyOrbis_SKY-Ko-Llama3.2-1B-lora-v2-epoch3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SKY-Ko-Llama3.2-1B-lora-v2-epoch3", - "id": "SkyOrbis/SKY-Ko-Llama3.2-1B-lora-v2-epoch3", - "developer": "SkyOrbis", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.236 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.436 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3406 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0521 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": 
"Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2592 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3471 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1946 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/SkyOrbis/SKY-Ko-Llama3.2-1B-lora-v2-epoch5/3b221b0e-6158-471f-bcd2-b09514f28bd7.json b/data/hfopenllm_v2/SkyOrbis/SKY-Ko-Llama3.2-1B-lora-v2-epoch5/3b221b0e-6158-471f-bcd2-b09514f28bd7.json deleted file mode 100644 index e22db53cd..000000000 --- a/data/hfopenllm_v2/SkyOrbis/SKY-Ko-Llama3.2-1B-lora-v2-epoch5/3b221b0e-6158-471f-bcd2-b09514f28bd7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/SkyOrbis_SKY-Ko-Llama3.2-1B-lora-v2-epoch5/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SKY-Ko-Llama3.2-1B-lora-v2-epoch5", - "id": "SkyOrbis/SKY-Ko-Llama3.2-1B-lora-v2-epoch5", - "developer": "SkyOrbis", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.236 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4247 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3397 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0506 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, 
- "max_score": 1.0 - }, - "score_details": { - "score": 0.2542 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3458 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1946 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/SkyOrbis/SKY-Ko-Llama3.2-3B-lora-epoch1/c8af8428-aab6-4d19-b185-2b437c0334fa.json b/data/hfopenllm_v2/SkyOrbis/SKY-Ko-Llama3.2-3B-lora-epoch1/c8af8428-aab6-4d19-b185-2b437c0334fa.json deleted file mode 100644 index e29268df8..000000000 --- a/data/hfopenllm_v2/SkyOrbis/SKY-Ko-Llama3.2-3B-lora-epoch1/c8af8428-aab6-4d19-b185-2b437c0334fa.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/SkyOrbis_SKY-Ko-Llama3.2-3B-lora-epoch1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SKY-Ko-Llama3.2-3B-lora-epoch1", - "id": "SkyOrbis/SKY-Ko-Llama3.2-3B-lora-epoch1", - "developer": "SkyOrbis", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5331 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.44 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1458 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2919 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - 
"hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3522 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3004 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/SkyOrbis/SKY-Ko-Llama3.2-3B-lora-epoch2/c617d12b-c37f-47ef-9704-e19774c67aeb.json b/data/hfopenllm_v2/SkyOrbis/SKY-Ko-Llama3.2-3B-lora-epoch2/c617d12b-c37f-47ef-9704-e19774c67aeb.json deleted file mode 100644 index 69fae55da..000000000 --- a/data/hfopenllm_v2/SkyOrbis/SKY-Ko-Llama3.2-3B-lora-epoch2/c617d12b-c37f-47ef-9704-e19774c67aeb.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/SkyOrbis_SKY-Ko-Llama3.2-3B-lora-epoch2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SKY-Ko-Llama3.2-3B-lora-epoch2", - "id": "SkyOrbis/SKY-Ko-Llama3.2-3B-lora-epoch2", - "developer": "SkyOrbis", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5331 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.44 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1458 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2919 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.3522 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3004 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/SkyOrbis/SKY-Ko-Llama3.2-3B-lora-epoch3/577f31e2-1808-45e2-a528-5933019cfa85.json b/data/hfopenllm_v2/SkyOrbis/SKY-Ko-Llama3.2-3B-lora-epoch3/577f31e2-1808-45e2-a528-5933019cfa85.json deleted file mode 100644 index 44054c6cc..000000000 --- a/data/hfopenllm_v2/SkyOrbis/SKY-Ko-Llama3.2-3B-lora-epoch3/577f31e2-1808-45e2-a528-5933019cfa85.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/SkyOrbis_SKY-Ko-Llama3.2-3B-lora-epoch3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SKY-Ko-Llama3.2-3B-lora-epoch3", - "id": "SkyOrbis/SKY-Ko-Llama3.2-3B-lora-epoch3", - "developer": "SkyOrbis", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5331 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.44 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1458 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2919 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3522 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - 
"hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3004 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/SkyOrbis/SKY-Ko-Qwen2.5-3B-Instruct/7bd7f5c8-be9e-473e-be18-03ad22a195ee.json b/data/hfopenllm_v2/SkyOrbis/SKY-Ko-Qwen2.5-3B-Instruct/7bd7f5c8-be9e-473e-be18-03ad22a195ee.json deleted file mode 100644 index db8427937..000000000 --- a/data/hfopenllm_v2/SkyOrbis/SKY-Ko-Qwen2.5-3B-Instruct/7bd7f5c8-be9e-473e-be18-03ad22a195ee.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/SkyOrbis_SKY-Ko-Qwen2.5-3B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SKY-Ko-Qwen2.5-3B-Instruct", - "id": "SkyOrbis/SKY-Ko-Qwen2.5-3B-Instruct", - "developer": "SkyOrbis", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.086 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3534 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4265 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0695 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2794 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4024 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, 
- "score_details": { - "score": 0.2812 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/SkyOrbis/SKY-Ko-Qwen2.5-7B-Instruct-SFT-step-15000/5036a549-5583-4775-935a-1a12b6de3e7d.json b/data/hfopenllm_v2/SkyOrbis/SKY-Ko-Qwen2.5-7B-Instruct-SFT-step-15000/5036a549-5583-4775-935a-1a12b6de3e7d.json deleted file mode 100644 index ce701d5f4..000000000 --- a/data/hfopenllm_v2/SkyOrbis/SKY-Ko-Qwen2.5-7B-Instruct-SFT-step-15000/5036a549-5583-4775-935a-1a12b6de3e7d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/SkyOrbis_SKY-Ko-Qwen2.5-7B-Instruct-SFT-step-15000/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SKY-Ko-Qwen2.5-7B-Instruct-SFT-step-15000", - "id": "SkyOrbis/SKY-Ko-Qwen2.5-7B-Instruct-SFT-step-15000", - "developer": "SkyOrbis", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3819 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5078 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1866 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3272 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4436 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3914 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/SkyOrbis/SKY-Ko-Qwen2.5-7B-Instruct-SFT-step-5000/5c0ffff9-542c-424e-88e9-89584e686e12.json b/data/hfopenllm_v2/SkyOrbis/SKY-Ko-Qwen2.5-7B-Instruct-SFT-step-5000/5c0ffff9-542c-424e-88e9-89584e686e12.json deleted file mode 100644 index 08f3d6990..000000000 --- a/data/hfopenllm_v2/SkyOrbis/SKY-Ko-Qwen2.5-7B-Instruct-SFT-step-5000/5c0ffff9-542c-424e-88e9-89584e686e12.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/SkyOrbis_SKY-Ko-Qwen2.5-7B-Instruct-SFT-step-5000/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SKY-Ko-Qwen2.5-7B-Instruct-SFT-step-5000", - "id": "SkyOrbis/SKY-Ko-Qwen2.5-7B-Instruct-SFT-step-5000", - "developer": "SkyOrbis", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3812 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.539 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.21 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3029 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4238 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4238 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Skywork/Skywork-Reward-Gemma-2-27B-v0.2/5c6a045d-2c90-4938-9185-9c1a0f82903a.json 
b/data/hfopenllm_v2/Skywork/Skywork-Reward-Gemma-2-27B-v0.2/5c6a045d-2c90-4938-9185-9c1a0f82903a.json deleted file mode 100644 index ef679628f..000000000 --- a/data/hfopenllm_v2/Skywork/Skywork-Reward-Gemma-2-27B-v0.2/5c6a045d-2c90-4938-9185-9c1a0f82903a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Skywork_Skywork-Reward-Gemma-2-27B-v0.2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Skywork-Reward-Gemma-2-27B-v0.2", - "id": "Skywork/Skywork-Reward-Gemma-2-27B-v0.2", - "developer": "Skywork", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForSequenceClassification", - "params_billions": 27.227 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7807 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.636 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2273 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.344 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4231 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4103 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Skywork/Skywork-o1-Open-Llama-3.1-8B/02480176-2058-4e71-a970-9698be8d235e.json b/data/hfopenllm_v2/Skywork/Skywork-o1-Open-Llama-3.1-8B/02480176-2058-4e71-a970-9698be8d235e.json deleted file mode 100644 index 92fd0cf95..000000000 --- 
a/data/hfopenllm_v2/Skywork/Skywork-o1-Open-Llama-3.1-8B/02480176-2058-4e71-a970-9698be8d235e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Skywork_Skywork-o1-Open-Llama-3.1-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Skywork-o1-Open-Llama-3.1-8B", - "id": "Skywork/Skywork-o1-Open-Llama-3.1-8B", - "developer": "Skywork", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3518 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4516 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5211 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2592 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3156 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.203 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Solshine/Brimful-merged-replete/4be1e5b4-254c-4287-907d-cc845042de37.json b/data/hfopenllm_v2/Solshine/Brimful-merged-replete/4be1e5b4-254c-4287-907d-cc845042de37.json deleted file mode 100644 index 8374a1a8d..000000000 --- a/data/hfopenllm_v2/Solshine/Brimful-merged-replete/4be1e5b4-254c-4287-907d-cc845042de37.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/Solshine_Brimful-merged-replete/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Brimful-merged-replete", - "id": "Solshine/Brimful-merged-replete", - "developer": "Solshine", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 12.277 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1761 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2883 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.003 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2576 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3421 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1085 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Solshine/Llama-3-1-big-thoughtful-passthrough-merge-2/21b51852-5cad-414e-92d5-31878f025d67.json b/data/hfopenllm_v2/Solshine/Llama-3-1-big-thoughtful-passthrough-merge-2/21b51852-5cad-414e-92d5-31878f025d67.json deleted file mode 100644 index 92595ba08..000000000 --- a/data/hfopenllm_v2/Solshine/Llama-3-1-big-thoughtful-passthrough-merge-2/21b51852-5cad-414e-92d5-31878f025d67.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Solshine_Llama-3-1-big-thoughtful-passthrough-merge-2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - 
"source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-1-big-thoughtful-passthrough-merge-2", - "id": "Solshine/Llama-3-1-big-thoughtful-passthrough-merge-2", - "developer": "Solshine", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 18.5 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2547 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3209 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0106 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2592 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3889 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1185 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Sorawiz/Gemma-9B-Base/9eb07d4a-1f01-4696-9137-d477ffca43be.json b/data/hfopenllm_v2/Sorawiz/Gemma-9B-Base/9eb07d4a-1f01-4696-9137-d477ffca43be.json deleted file mode 100644 index 57b9d2ddf..000000000 --- a/data/hfopenllm_v2/Sorawiz/Gemma-9B-Base/9eb07d4a-1f01-4696-9137-d477ffca43be.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Sorawiz_Gemma-9B-Base/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gemma-9B-Base", - "id": 
"Sorawiz/Gemma-9B-Base", - "developer": "Sorawiz", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1667 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.593 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0982 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3398 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4045 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4235 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Sorawiz/Gemma-Creative-9B-Base/4236485b-aa92-4bc4-a652-17ed3231ecf4.json b/data/hfopenllm_v2/Sorawiz/Gemma-Creative-9B-Base/4236485b-aa92-4bc4-a652-17ed3231ecf4.json deleted file mode 100644 index 5a311b551..000000000 --- a/data/hfopenllm_v2/Sorawiz/Gemma-Creative-9B-Base/4236485b-aa92-4bc4-a652-17ed3231ecf4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Sorawiz_Gemma-Creative-9B-Base/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gemma-Creative-9B-Base", - "id": "Sorawiz/Gemma-Creative-9B-Base", - "developer": "Sorawiz", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ 
- { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1515 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5459 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0778 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3297 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4019 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4008 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Sourjayon/DeepSeek-R1-8b-Sify/9c0d6b71-8c6a-4294-961c-972a002b847f.json b/data/hfopenllm_v2/Sourjayon/DeepSeek-R1-8b-Sify/9c0d6b71-8c6a-4294-961c-972a002b847f.json deleted file mode 100644 index 509be1166..000000000 --- a/data/hfopenllm_v2/Sourjayon/DeepSeek-R1-8b-Sify/9c0d6b71-8c6a-4294-961c-972a002b847f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Sourjayon_DeepSeek-R1-8b-Sify/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "DeepSeek-R1-8b-Sify", - "id": "Sourjayon/DeepSeek-R1-8b-Sify", - "developer": "Sourjayon", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3679 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3379 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2447 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2525 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3303 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1981 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Sourjayon/DeepSeek-R1-ForumNXT/d1e906d5-8f0d-49c2-88c3-cf71774de600.json b/data/hfopenllm_v2/Sourjayon/DeepSeek-R1-ForumNXT/d1e906d5-8f0d-49c2-88c3-cf71774de600.json deleted file mode 100644 index e4a1e60b8..000000000 --- a/data/hfopenllm_v2/Sourjayon/DeepSeek-R1-ForumNXT/d1e906d5-8f0d-49c2-88c3-cf71774de600.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Sourjayon_DeepSeek-R1-ForumNXT/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "DeepSeek-R1-ForumNXT", - "id": "Sourjayon/DeepSeek-R1-ForumNXT", - "developer": "Sourjayon", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.777 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2603 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": 
"SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.331 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2576 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2743 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3392 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1648 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/SpaceYL/ECE_Poirot/798e4f83-6262-4d5b-a854-6ff114167209.json b/data/hfopenllm_v2/SpaceYL/ECE_Poirot/798e4f83-6262-4d5b-a854-6ff114167209.json deleted file mode 100644 index fa2400717..000000000 --- a/data/hfopenllm_v2/SpaceYL/ECE_Poirot/798e4f83-6262-4d5b-a854-6ff114167209.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/SpaceYL_ECE_Poirot/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ECE_Poirot", - "id": "SpaceYL/ECE_Poirot", - "developer": "SpaceYL", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.544 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3107 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4262 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - 
"dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0914 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2978 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4026 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2883 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Spestly/Athena-1-3B/dd2603d5-e99e-4778-95d0-159c788626cf.json b/data/hfopenllm_v2/Spestly/Athena-1-3B/dd2603d5-e99e-4778-95d0-159c788626cf.json deleted file mode 100644 index da9add401..000000000 --- a/data/hfopenllm_v2/Spestly/Athena-1-3B/dd2603d5-e99e-4778-95d0-159c788626cf.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Spestly_Athena-1-3B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Athena-1-3B", - "id": "Spestly/Athena-1-3B", - "developer": "Spestly", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.086 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5569 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4702 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.2379 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2936 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4362 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3519 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Spestly/Atlas-Pro-1.5B-Preview/41c71990-e79d-447f-b082-63c96fd67a1f.json b/data/hfopenllm_v2/Spestly/Atlas-Pro-1.5B-Preview/41c71990-e79d-447f-b082-63c96fd67a1f.json deleted file mode 100644 index 81aa9d6c5..000000000 --- a/data/hfopenllm_v2/Spestly/Atlas-Pro-1.5B-Preview/41c71990-e79d-447f-b082-63c96fd67a1f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Spestly_Atlas-Pro-1.5B-Preview/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Atlas-Pro-1.5B-Preview", - "id": "Spestly/Atlas-Pro-1.5B-Preview", - "developer": "Spestly", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.777 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.243 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3499 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3195 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": 
"Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.297 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3354 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1925 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Spestly/Atlas-Pro-7B-Preview/b9e25948-2871-4b6c-933b-8a731e48e81b.json b/data/hfopenllm_v2/Spestly/Atlas-Pro-7B-Preview/b9e25948-2871-4b6c-933b-8a731e48e81b.json deleted file mode 100644 index 5b58d6fbc..000000000 --- a/data/hfopenllm_v2/Spestly/Atlas-Pro-7B-Preview/b9e25948-2871-4b6c-933b-8a731e48e81b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Spestly_Atlas-Pro-7B-Preview/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Atlas-Pro-7B-Preview", - "id": "Spestly/Atlas-Pro-7B-Preview", - "developer": "Spestly", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3154 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4668 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5083 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3372 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - 
"source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3911 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.297 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Stark2008/GutenLaserPi/7c70df74-2bc2-40e0-b0f4-77be1a7e044c.json b/data/hfopenllm_v2/Stark2008/GutenLaserPi/7c70df74-2bc2-40e0-b0f4-77be1a7e044c.json deleted file mode 100644 index 1770ffa03..000000000 --- a/data/hfopenllm_v2/Stark2008/GutenLaserPi/7c70df74-2bc2-40e0-b0f4-77be1a7e044c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Stark2008_GutenLaserPi/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "GutenLaserPi", - "id": "Stark2008/GutenLaserPi", - "developer": "Stark2008", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4227 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5212 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0785 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2869 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.462 - } - }, - { - 
"evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3106 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Stark2008/LayleleFlamPi/ea71bdd5-3aa1-4d26-9256-5aeb2f79fa8c.json b/data/hfopenllm_v2/Stark2008/LayleleFlamPi/ea71bdd5-3aa1-4d26-9256-5aeb2f79fa8c.json deleted file mode 100644 index 9aef1f6e6..000000000 --- a/data/hfopenllm_v2/Stark2008/LayleleFlamPi/ea71bdd5-3aa1-4d26-9256-5aeb2f79fa8c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Stark2008_LayleleFlamPi/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "LayleleFlamPi", - "id": "Stark2008/LayleleFlamPi", - "developer": "Stark2008", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4284 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5116 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0665 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2852 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4608 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", 
- "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3093 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Stark2008/VisFlamCat/b0e9c0ca-cd56-42c8-96ed-477884bfd9f9.json b/data/hfopenllm_v2/Stark2008/VisFlamCat/b0e9c0ca-cd56-42c8-96ed-477884bfd9f9.json deleted file mode 100644 index afdd9f905..000000000 --- a/data/hfopenllm_v2/Stark2008/VisFlamCat/b0e9c0ca-cd56-42c8-96ed-477884bfd9f9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Stark2008_VisFlamCat/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "VisFlamCat", - "id": "Stark2008/VisFlamCat", - "developer": "Stark2008", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4366 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5217 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0763 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2903 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4463 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3144 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Steelskull/L3.3-MS-Nevoria-70b/7395fcde-49dd-47f4-a8ea-463eda40f5e3.json 
b/data/hfopenllm_v2/Steelskull/L3.3-MS-Nevoria-70b/7395fcde-49dd-47f4-a8ea-463eda40f5e3.json deleted file mode 100644 index 26de03a08..000000000 --- a/data/hfopenllm_v2/Steelskull/L3.3-MS-Nevoria-70b/7395fcde-49dd-47f4-a8ea-463eda40f5e3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Steelskull_L3.3-MS-Nevoria-70b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "L3.3-MS-Nevoria-70b", - "id": "Steelskull/L3.3-MS-Nevoria-70b", - "developer": "Steelskull", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 70.554 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6963 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6998 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3958 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4706 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4682 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5535 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Steelskull/L3.3-Nevoria-R1-70b/a130087f-566f-4405-b662-1102f1664c49.json b/data/hfopenllm_v2/Steelskull/L3.3-Nevoria-R1-70b/a130087f-566f-4405-b662-1102f1664c49.json deleted file mode 100644 index ff75741dd..000000000 --- 
a/data/hfopenllm_v2/Steelskull/L3.3-Nevoria-R1-70b/a130087f-566f-4405-b662-1102f1664c49.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Steelskull_L3.3-Nevoria-R1-70b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "L3.3-Nevoria-R1-70b", - "id": "Steelskull/L3.3-Nevoria-R1-70b", - "developer": "Steelskull", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 70.554 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6024 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6972 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.463 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.469 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4775 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5463 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/StelleX/Qwen2.5_Math_7B_Cot/3be58cf3-4761-4459-9f3c-eabf812a3c19.json b/data/hfopenllm_v2/StelleX/Qwen2.5_Math_7B_Cot/3be58cf3-4761-4459-9f3c-eabf812a3c19.json deleted file mode 100644 index e315d4ecc..000000000 --- a/data/hfopenllm_v2/StelleX/Qwen2.5_Math_7B_Cot/3be58cf3-4761-4459-9f3c-eabf812a3c19.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/StelleX_Qwen2.5_Math_7B_Cot/1770682486.623709", - 
"retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5_Math_7B_Cot", - "id": "StelleX/Qwen2.5_Math_7B_Cot", - "developer": "StelleX", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2143 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4313 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3263 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2945 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3924 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.281 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/StelleX/Vorisatex-7B-preview/dbdd71ad-db5b-4b4b-8856-68b55adbe127.json b/data/hfopenllm_v2/StelleX/Vorisatex-7B-preview/dbdd71ad-db5b-4b4b-8856-68b55adbe127.json deleted file mode 100644 index 7a0e33163..000000000 --- a/data/hfopenllm_v2/StelleX/Vorisatex-7B-preview/dbdd71ad-db5b-4b4b-8856-68b55adbe127.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/StelleX_Vorisatex-7B-preview/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": 
"Vorisatex-7B-preview", - "id": "StelleX/Vorisatex-7B-preview", - "developer": "StelleX", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.613 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1515 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3112 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0287 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2517 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4192 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1166 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/SultanR/SmolTulu-1.7b-Instruct/da159a16-48a0-45e3-ad4d-bdc9e8b5288c.json b/data/hfopenllm_v2/SultanR/SmolTulu-1.7b-Instruct/da159a16-48a0-45e3-ad4d-bdc9e8b5288c.json deleted file mode 100644 index 0304b53c6..000000000 --- a/data/hfopenllm_v2/SultanR/SmolTulu-1.7b-Instruct/da159a16-48a0-45e3-ad4d-bdc9e8b5288c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/SultanR_SmolTulu-1.7b-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SmolTulu-1.7b-Instruct", - "id": "SultanR/SmolTulu-1.7b-Instruct", - "developer": "SultanR", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 
1.711 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6541 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3713 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0793 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2693 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.354 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.171 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/SultanR/SmolTulu-1.7b-Reinforced/77d5f51e-5ad2-42a6-a32c-060cd844b949.json b/data/hfopenllm_v2/SultanR/SmolTulu-1.7b-Reinforced/77d5f51e-5ad2-42a6-a32c-060cd844b949.json deleted file mode 100644 index 37b00c3a4..000000000 --- a/data/hfopenllm_v2/SultanR/SmolTulu-1.7b-Reinforced/77d5f51e-5ad2-42a6-a32c-060cd844b949.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/SultanR_SmolTulu-1.7b-Reinforced/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SmolTulu-1.7b-Reinforced", - "id": "SultanR/SmolTulu-1.7b-Reinforced", - "developer": "SultanR", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.711 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6791 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3552 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0718 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.276 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3406 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1763 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/SultanR/SmolTulu-1.7b-it-v0/724cc582-cc83-474b-9606-70dbc22f3581.json b/data/hfopenllm_v2/SultanR/SmolTulu-1.7b-it-v0/724cc582-cc83-474b-9606-70dbc22f3581.json deleted file mode 100644 index 4d26688c4..000000000 --- a/data/hfopenllm_v2/SultanR/SmolTulu-1.7b-it-v0/724cc582-cc83-474b-9606-70dbc22f3581.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/SultanR_SmolTulu-1.7b-it-v0/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SmolTulu-1.7b-it-v0", - "id": "SultanR/SmolTulu-1.7b-it-v0", - "developer": "SultanR", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.711 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6541 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - 
"dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3713 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0793 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2693 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.354 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.171 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Supichi/BBA-123/8a1b2aae-d717-4b49-8ed2-a7ee2cee1940.json b/data/hfopenllm_v2/Supichi/BBA-123/8a1b2aae-d717-4b49-8ed2-a7ee2cee1940.json deleted file mode 100644 index c72fad264..000000000 --- a/data/hfopenllm_v2/Supichi/BBA-123/8a1b2aae-d717-4b49-8ed2-a7ee2cee1940.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Supichi_BBA-123/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "BBA-123", - "id": "Supichi/BBA-123", - "developer": "Supichi", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 17.161 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.208 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.292 - } - }, - { - "evaluation_name": 
"MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2601 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3499 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1167 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Supichi/BBA99/0dfb062d-a6ec-42a6-a9f9-6f6424bbdf0c.json b/data/hfopenllm_v2/Supichi/BBA99/0dfb062d-a6ec-42a6-a9f9-6f6424bbdf0c.json deleted file mode 100644 index 7dab217a1..000000000 --- a/data/hfopenllm_v2/Supichi/BBA99/0dfb062d-a6ec-42a6-a9f9-6f6424bbdf0c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Supichi_BBA99/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "BBA99", - "id": "Supichi/BBA99", - "developer": "Supichi", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 17.161 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1407 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2769 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2634 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3218 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1112 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Supichi/BBAIK29/ab2512fa-2335-4817-9a76-3259690bbc67.json b/data/hfopenllm_v2/Supichi/BBAIK29/ab2512fa-2335-4817-9a76-3259690bbc67.json deleted file mode 100644 index 90e50a5c5..000000000 --- a/data/hfopenllm_v2/Supichi/BBAIK29/ab2512fa-2335-4817-9a76-3259690bbc67.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Supichi_BBAIK29/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "BBAIK29", - "id": "Supichi/BBAIK29", - "developer": "Supichi", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4588 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.559 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3678 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 
0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3121 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4501 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4469 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Supichi/BBAI_135_Gemma/fe7f1442-b7db-42d5-bc83-b8afd1d0c802.json b/data/hfopenllm_v2/Supichi/BBAI_135_Gemma/fe7f1442-b7db-42d5-bc83-b8afd1d0c802.json deleted file mode 100644 index f1fa97e95..000000000 --- a/data/hfopenllm_v2/Supichi/BBAI_135_Gemma/fe7f1442-b7db-42d5-bc83-b8afd1d0c802.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Supichi_BBAI_135_Gemma/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "BBAI_135_Gemma", - "id": "Supichi/BBAI_135_Gemma", - "developer": "Supichi", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 19.3 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0656 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3568 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2676 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3805 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1672 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Supichi/BBAI_250_Xia0_gZ/0e14484a-69d7-423e-bf6c-33d0992f408c.json b/data/hfopenllm_v2/Supichi/BBAI_250_Xia0_gZ/0e14484a-69d7-423e-bf6c-33d0992f408c.json deleted file mode 100644 index dbf736418..000000000 --- a/data/hfopenllm_v2/Supichi/BBAI_250_Xia0_gZ/0e14484a-69d7-423e-bf6c-33d0992f408c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Supichi_BBAI_250_Xia0_gZ/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "BBAI_250_Xia0_gZ", - "id": "Supichi/BBAI_250_Xia0_gZ", - "developer": "Supichi", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4685 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5568 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.364 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3213 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4579 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": 
"TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4465 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Supichi/BBAI_275_Tsunami_gZ/881eaa2c-af5f-4e84-8807-d0835c10ebd2.json b/data/hfopenllm_v2/Supichi/BBAI_275_Tsunami_gZ/881eaa2c-af5f-4e84-8807-d0835c10ebd2.json deleted file mode 100644 index c2465ba88..000000000 --- a/data/hfopenllm_v2/Supichi/BBAI_275_Tsunami_gZ/881eaa2c-af5f-4e84-8807-d0835c10ebd2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Supichi_BBAI_275_Tsunami_gZ/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "BBAI_275_Tsunami_gZ", - "id": "Supichi/BBAI_275_Tsunami_gZ", - "developer": "Supichi", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.537 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5531 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3285 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3213 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4448 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4492 - } - } - ] -} \ No 
newline at end of file diff --git a/data/hfopenllm_v2/Supichi/BBAI_525_Tsu_gZ_Xia0/ef8a7079-9d13-42b7-ab2d-b72df5ae5d95.json b/data/hfopenllm_v2/Supichi/BBAI_525_Tsu_gZ_Xia0/ef8a7079-9d13-42b7-ab2d-b72df5ae5d95.json deleted file mode 100644 index 8e1fdf952..000000000 --- a/data/hfopenllm_v2/Supichi/BBAI_525_Tsu_gZ_Xia0/ef8a7079-9d13-42b7-ab2d-b72df5ae5d95.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Supichi_BBAI_525_Tsu_gZ_Xia0/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "BBAI_525_Tsu_gZ_Xia0", - "id": "Supichi/BBAI_525_Tsu_gZ_Xia0", - "developer": "Supichi", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5339 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5562 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3429 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3121 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4474 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4477 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Supichi/BBAI_78B_Calme_3_1_Ties/db8d3fc4-58f4-4f07-8c27-c73a4a4719fb.json b/data/hfopenllm_v2/Supichi/BBAI_78B_Calme_3_1_Ties/db8d3fc4-58f4-4f07-8c27-c73a4a4719fb.json deleted file mode 100644 index 
7bb33875a..000000000 --- a/data/hfopenllm_v2/Supichi/BBAI_78B_Calme_3_1_Ties/db8d3fc4-58f4-4f07-8c27-c73a4a4719fb.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Supichi_BBAI_78B_Calme_3_1_Ties/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "BBAI_78B_Calme_3_1_Ties", - "id": "Supichi/BBAI_78B_Calme_3_1_Ties", - "developer": "Supichi", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 27.06 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1828 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2828 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.229 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.31 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1144 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Supichi/BBAI_QWEEN_V000000_LUMEN_14B/0c44a429-e705-4794-b702-1a731e52df90.json b/data/hfopenllm_v2/Supichi/BBAI_QWEEN_V000000_LUMEN_14B/0c44a429-e705-4794-b702-1a731e52df90.json deleted file mode 100644 index 4b3c57482..000000000 --- a/data/hfopenllm_v2/Supichi/BBAI_QWEEN_V000000_LUMEN_14B/0c44a429-e705-4794-b702-1a731e52df90.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/Supichi_BBAI_QWEEN_V000000_LUMEN_14B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "BBAI_QWEEN_V000000_LUMEN_14B", - "id": "Supichi/BBAI_QWEEN_V000000_LUMEN_14B", - "developer": "Supichi", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 10.366 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1815 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2297 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2315 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3445 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.116 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Supichi/HF_TOKEN/92b3d2c1-61f4-432a-82a7-43b4367f7ef0.json b/data/hfopenllm_v2/Supichi/HF_TOKEN/92b3d2c1-61f4-432a-82a7-43b4367f7ef0.json deleted file mode 100644 index 3690ec90c..000000000 --- a/data/hfopenllm_v2/Supichi/HF_TOKEN/92b3d2c1-61f4-432a-82a7-43b4367f7ef0.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Supichi_HF_TOKEN/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": 
"third_party" - }, - "model_info": { - "name": "HF_TOKEN", - "id": "Supichi/HF_TOKEN", - "developer": "Supichi", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 17.161 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.138 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2764 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0008 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2634 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3272 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.111 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Supichi/NJS26/5703e81d-055c-459b-8202-80ec382a8d5b.json b/data/hfopenllm_v2/Supichi/NJS26/5703e81d-055c-459b-8202-80ec382a8d5b.json deleted file mode 100644 index fb56689db..000000000 --- a/data/hfopenllm_v2/Supichi/NJS26/5703e81d-055c-459b-8202-80ec382a8d5b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Supichi_NJS26/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "NJS26", - "id": "Supichi/NJS26", - "developer": "Supichi", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - 
"source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0448 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.478 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0325 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.318 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3854 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3037 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Svak/MN-12B-Inferor-v0.0/f6260b6e-52a2-4142-93ba-5393807fa0d4.json b/data/hfopenllm_v2/Svak/MN-12B-Inferor-v0.0/f6260b6e-52a2-4142-93ba-5393807fa0d4.json deleted file mode 100644 index cf81cb961..000000000 --- a/data/hfopenllm_v2/Svak/MN-12B-Inferor-v0.0/f6260b6e-52a2-4142-93ba-5393807fa0d4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Svak_MN-12B-Inferor-v0.0/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MN-12B-Inferor-v0.0", - "id": "Svak/MN-12B-Inferor-v0.0", - "developer": "Svak", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 
- }, - "score_details": { - "score": 0.5708 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5195 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.102 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3087 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4639 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3559 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Svak/MN-12B-Inferor-v0.1/83b84506-4826-48de-a6fe-2af6ae5d425a.json b/data/hfopenllm_v2/Svak/MN-12B-Inferor-v0.1/83b84506-4826-48de-a6fe-2af6ae5d425a.json deleted file mode 100644 index 30b42165b..000000000 --- a/data/hfopenllm_v2/Svak/MN-12B-Inferor-v0.1/83b84506-4826-48de-a6fe-2af6ae5d425a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Svak_MN-12B-Inferor-v0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MN-12B-Inferor-v0.1", - "id": "Svak/MN-12B-Inferor-v0.1", - "developer": "Svak", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6347 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5147 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1261 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3255 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4351 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3662 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Syed-Hasan-8503/Phi-3-mini-4K-instruct-cpo-simpo/7483e260-9853-4d3f-aa10-187796d96de9.json b/data/hfopenllm_v2/Syed-Hasan-8503/Phi-3-mini-4K-instruct-cpo-simpo/7483e260-9853-4d3f-aa10-187796d96de9.json deleted file mode 100644 index e37c2fec5..000000000 --- a/data/hfopenllm_v2/Syed-Hasan-8503/Phi-3-mini-4K-instruct-cpo-simpo/7483e260-9853-4d3f-aa10-187796d96de9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Syed-Hasan-8503_Phi-3-mini-4K-instruct-cpo-simpo/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Phi-3-mini-4K-instruct-cpo-simpo", - "id": "Syed-Hasan-8503/Phi-3-mini-4K-instruct-cpo-simpo", - "developer": "Syed-Hasan-8503", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Phi3ForCausalLM", - "params_billions": 3.821 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5714 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 
0.5682 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1571 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3305 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3964 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3861 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/T145/KRONOS-8B-V1-P1/f9925806-4252-44e8-b67e-917737572bd4.json b/data/hfopenllm_v2/T145/KRONOS-8B-V1-P1/f9925806-4252-44e8-b67e-917737572bd4.json deleted file mode 100644 index a479386a3..000000000 --- a/data/hfopenllm_v2/T145/KRONOS-8B-V1-P1/f9925806-4252-44e8-b67e-917737572bd4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/T145_KRONOS-8B-V1-P1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "KRONOS-8B-V1-P1", - "id": "T145/KRONOS-8B-V1-P1", - "developer": "T145", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.785 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5085 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, 
- "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1979 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2953 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3881 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.376 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/T145/KRONOS-8B-V1-P2/70470e6c-8d66-4249-b762-a5a2e3589a53.json b/data/hfopenllm_v2/T145/KRONOS-8B-V1-P2/70470e6c-8d66-4249-b762-a5a2e3589a53.json deleted file mode 100644 index 9dacbbf53..000000000 --- a/data/hfopenllm_v2/T145/KRONOS-8B-V1-P2/70470e6c-8d66-4249-b762-a5a2e3589a53.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/T145_KRONOS-8B-V1-P2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "KRONOS-8B-V1-P2", - "id": "T145/KRONOS-8B-V1-P2", - "developer": "T145", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6724 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4772 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1601 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2919 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3568 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3453 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/T145/KRONOS-8B-V1-P3/d3abfe3c-ebfe-4dfd-b0db-93c14d32c585.json b/data/hfopenllm_v2/T145/KRONOS-8B-V1-P3/d3abfe3c-ebfe-4dfd-b0db-93c14d32c585.json deleted file mode 100644 index 4ae2c0b29..000000000 --- a/data/hfopenllm_v2/T145/KRONOS-8B-V1-P3/d3abfe3c-ebfe-4dfd-b0db-93c14d32c585.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/T145_KRONOS-8B-V1-P3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "KRONOS-8B-V1-P3", - "id": "T145/KRONOS-8B-V1-P3", - "developer": "T145", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7137 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5128 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1926 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2601 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": 
"hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3616 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3405 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/T145/KRONOS-8B-V2/a35b06bc-d759-421a-94cf-f408a98e9273.json b/data/hfopenllm_v2/T145/KRONOS-8B-V2/a35b06bc-d759-421a-94cf-f408a98e9273.json deleted file mode 100644 index 1869aec88..000000000 --- a/data/hfopenllm_v2/T145/KRONOS-8B-V2/a35b06bc-d759-421a-94cf-f408a98e9273.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/T145_KRONOS-8B-V2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "KRONOS-8B-V2", - "id": "T145/KRONOS-8B-V2", - "developer": "T145", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.518 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5133 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2266 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2987 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3829 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": 
{ - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3738 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/T145/KRONOS-8B-V3/bbac659c-7cf8-41d4-98d4-ded4c471bd98.json b/data/hfopenllm_v2/T145/KRONOS-8B-V3/bbac659c-7cf8-41d4-98d4-ded4c471bd98.json deleted file mode 100644 index 86f598a6c..000000000 --- a/data/hfopenllm_v2/T145/KRONOS-8B-V3/bbac659c-7cf8-41d4-98d4-ded4c471bd98.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/T145_KRONOS-8B-V3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "KRONOS-8B-V3", - "id": "T145/KRONOS-8B-V3", - "developer": "T145", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5475 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5119 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2598 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2886 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3922 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3738 - } 
- } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/T145/KRONOS-8B-V4/0c73f3a0-0a92-4b1c-abfa-6eb77138dacd.json b/data/hfopenllm_v2/T145/KRONOS-8B-V4/0c73f3a0-0a92-4b1c-abfa-6eb77138dacd.json deleted file mode 100644 index b0d2848fc..000000000 --- a/data/hfopenllm_v2/T145/KRONOS-8B-V4/0c73f3a0-0a92-4b1c-abfa-6eb77138dacd.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/T145_KRONOS-8B-V4/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "KRONOS-8B-V4", - "id": "T145/KRONOS-8B-V4", - "developer": "T145", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7889 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5092 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1949 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2894 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.383 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3786 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/T145/KRONOS-8B-V5/a7ab6f16-717f-4567-8057-a4a18e1a1e77.json b/data/hfopenllm_v2/T145/KRONOS-8B-V5/a7ab6f16-717f-4567-8057-a4a18e1a1e77.json deleted file mode 100644 index fb35a1d47..000000000 --- 
a/data/hfopenllm_v2/T145/KRONOS-8B-V5/a7ab6f16-717f-4567-8057-a4a18e1a1e77.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/T145_KRONOS-8B-V5/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "KRONOS-8B-V5", - "id": "T145/KRONOS-8B-V5", - "developer": "T145", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5405 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5089 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2689 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2903 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4055 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3759 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/T145/KRONOS-8B-V6/2abe2c9d-032d-469e-852b-114eca5e84f8.json b/data/hfopenllm_v2/T145/KRONOS-8B-V6/2abe2c9d-032d-469e-852b-114eca5e84f8.json deleted file mode 100644 index 55fd3a74f..000000000 --- a/data/hfopenllm_v2/T145/KRONOS-8B-V6/2abe2c9d-032d-469e-852b-114eca5e84f8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/T145_KRONOS-8B-V6/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - 
"source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "KRONOS-8B-V6", - "id": "T145/KRONOS-8B-V6", - "developer": "T145", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7022 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5034 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2598 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2794 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4121 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3501 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/T145/KRONOS-8B-V7/2e8a83dc-c760-4f42-a361-e02cf3a65427.json b/data/hfopenllm_v2/T145/KRONOS-8B-V7/2e8a83dc-c760-4f42-a361-e02cf3a65427.json deleted file mode 100644 index 5846cc83c..000000000 --- a/data/hfopenllm_v2/T145/KRONOS-8B-V7/2e8a83dc-c760-4f42-a361-e02cf3a65427.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/T145_KRONOS-8B-V7/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "KRONOS-8B-V7", - "id": "T145/KRONOS-8B-V7", - "developer": "T145", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - 
"architecture": "LlamaForCausalLM", - "params_billions": 4.015 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3529 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4526 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.111 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2668 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3671 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2697 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/T145/KRONOS-8B-V8/743dfe64-e7cd-493e-817d-8d5fcdc2ea24.json b/data/hfopenllm_v2/T145/KRONOS-8B-V8/743dfe64-e7cd-493e-817d-8d5fcdc2ea24.json deleted file mode 100644 index d7f05654d..000000000 --- a/data/hfopenllm_v2/T145/KRONOS-8B-V8/743dfe64-e7cd-493e-817d-8d5fcdc2ea24.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/T145_KRONOS-8B-V8/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "KRONOS-8B-V8", - "id": "T145/KRONOS-8B-V8", - "developer": "T145", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.777 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5094 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2047 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2894 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3869 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3782 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/T145/KRONOS-8B-V9/4e37c90b-65a8-4b71-bfc2-d63541fb8962.json b/data/hfopenllm_v2/T145/KRONOS-8B-V9/4e37c90b-65a8-4b71-bfc2-d63541fb8962.json deleted file mode 100644 index 405395324..000000000 --- a/data/hfopenllm_v2/T145/KRONOS-8B-V9/4e37c90b-65a8-4b71-bfc2-d63541fb8962.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/T145_KRONOS-8B-V9/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "KRONOS-8B-V9", - "id": "T145/KRONOS-8B-V9", - "developer": "T145", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7856 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5099 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1986 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2961 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3868 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3752 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/T145/Llama-3.1-8B-Instruct-Zeus/2e34d74e-1b69-4daf-8bee-77e5357fd439.json b/data/hfopenllm_v2/T145/Llama-3.1-8B-Instruct-Zeus/2e34d74e-1b69-4daf-8bee-77e5357fd439.json deleted file mode 100644 index 41ff9323c..000000000 --- a/data/hfopenllm_v2/T145/Llama-3.1-8B-Instruct-Zeus/2e34d74e-1b69-4daf-8bee-77e5357fd439.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/T145_Llama-3.1-8B-Instruct-Zeus/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.1-8B-Instruct-Zeus", - "id": "T145/Llama-3.1-8B-Instruct-Zeus", - "developer": "T145", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7941 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5174 - } - }, - { - "evaluation_name": "MATH Level 5", 
- "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1956 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3012 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3976 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3893 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/T145/Llama-3.1-8B-Zeus/0646e2f7-d2e6-42d3-8f09-f8daee302709.json b/data/hfopenllm_v2/T145/Llama-3.1-8B-Zeus/0646e2f7-d2e6-42d3-8f09-f8daee302709.json deleted file mode 100644 index 0756465ea..000000000 --- a/data/hfopenllm_v2/T145/Llama-3.1-8B-Zeus/0646e2f7-d2e6-42d3-8f09-f8daee302709.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/T145_Llama-3.1-8B-Zeus/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.1-8B-Zeus", - "id": "T145/Llama-3.1-8B-Zeus", - "developer": "T145", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3518 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3671 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0144 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2651 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3316 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1332 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/T145/Meta-Llama-3.1-8B-Instruct-TIES/c66b1ff8-9c04-4f9c-b83e-088f31f79590.json b/data/hfopenllm_v2/T145/Meta-Llama-3.1-8B-Instruct-TIES/c66b1ff8-9c04-4f9c-b83e-088f31f79590.json deleted file mode 100644 index d2e2124a8..000000000 --- a/data/hfopenllm_v2/T145/Meta-Llama-3.1-8B-Instruct-TIES/c66b1ff8-9c04-4f9c-b83e-088f31f79590.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/T145_Meta-Llama-3.1-8B-Instruct-TIES/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Meta-Llama-3.1-8B-Instruct-TIES", - "id": "T145/Meta-Llama-3.1-8B-Instruct-TIES", - "developer": "T145", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5424 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.507 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.21 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - 
"hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2945 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3843 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.378 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/T145/ZEUS-8B-V10/1bd2affc-9970-4149-b52b-51549b1f0029.json b/data/hfopenllm_v2/T145/ZEUS-8B-V10/1bd2affc-9970-4149-b52b-51549b1f0029.json deleted file mode 100644 index 0c3eeb4e0..000000000 --- a/data/hfopenllm_v2/T145/ZEUS-8B-V10/1bd2affc-9970-4149-b52b-51549b1f0029.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/T145_ZEUS-8B-V10/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ZEUS-8B-V10", - "id": "T145/ZEUS-8B-V10", - "developer": "T145", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7707 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.527 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2115 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3247 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": 
"MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3898 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3904 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/T145/ZEUS-8B-V11/f0479d74-4684-4b41-a63b-16d7fe0e3290.json b/data/hfopenllm_v2/T145/ZEUS-8B-V11/f0479d74-4684-4b41-a63b-16d7fe0e3290.json deleted file mode 100644 index 65e7ba095..000000000 --- a/data/hfopenllm_v2/T145/ZEUS-8B-V11/f0479d74-4684-4b41-a63b-16d7fe0e3290.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/T145_ZEUS-8B-V11/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ZEUS-8B-V11", - "id": "T145/ZEUS-8B-V11", - "developer": "T145", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.81 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5162 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1964 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3146 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3807 - } - }, - { - "evaluation_name": "MMLU-PRO", 
- "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3884 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/T145/ZEUS-8B-V12/95deb890-a15d-4c71-8151-ed45c3dfb87f.json b/data/hfopenllm_v2/T145/ZEUS-8B-V12/95deb890-a15d-4c71-8151-ed45c3dfb87f.json deleted file mode 100644 index 2173c77bb..000000000 --- a/data/hfopenllm_v2/T145/ZEUS-8B-V12/95deb890-a15d-4c71-8151-ed45c3dfb87f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/T145_ZEUS-8B-V12/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ZEUS-8B-V12", - "id": "T145/ZEUS-8B-V12", - "developer": "T145", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7816 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5254 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2115 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3205 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3858 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 
0.3912 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/T145/ZEUS-8B-V13-abliterated/1c07fc4c-a773-4e03-bb14-7144e7815c01.json b/data/hfopenllm_v2/T145/ZEUS-8B-V13-abliterated/1c07fc4c-a773-4e03-bb14-7144e7815c01.json deleted file mode 100644 index 4f8852135..000000000 --- a/data/hfopenllm_v2/T145/ZEUS-8B-V13-abliterated/1c07fc4c-a773-4e03-bb14-7144e7815c01.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/T145_ZEUS-8B-V13-abliterated/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ZEUS-8B-V13-abliterated", - "id": "T145/ZEUS-8B-V13-abliterated", - "developer": "T145", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7878 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5198 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.179 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3112 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3871 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3872 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/T145/ZEUS-8B-V13/e7e8388e-db3c-4881-b67c-5177c60562b9.json b/data/hfopenllm_v2/T145/ZEUS-8B-V13/e7e8388e-db3c-4881-b67c-5177c60562b9.json deleted file mode 100644 index 
f1a974e92..000000000 --- a/data/hfopenllm_v2/T145/ZEUS-8B-V13/e7e8388e-db3c-4881-b67c-5177c60562b9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/T145_ZEUS-8B-V13/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ZEUS-8B-V13", - "id": "T145/ZEUS-8B-V13", - "developer": "T145", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7904 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5277 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2137 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3238 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3845 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3911 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/T145/ZEUS-8B-V14/c4923208-2a47-45f2-a74a-4483e4b99bee.json b/data/hfopenllm_v2/T145/ZEUS-8B-V14/c4923208-2a47-45f2-a74a-4483e4b99bee.json deleted file mode 100644 index 0b3a25678..000000000 --- a/data/hfopenllm_v2/T145/ZEUS-8B-V14/c4923208-2a47-45f2-a74a-4483e4b99bee.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/T145_ZEUS-8B-V14/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": 
"HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ZEUS-8B-V14", - "id": "T145/ZEUS-8B-V14", - "developer": "T145", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7709 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5275 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.213 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3205 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3844 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3914 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/T145/ZEUS-8B-V15/b5f06a78-5b57-45a5-93be-4f3c1b36f208.json b/data/hfopenllm_v2/T145/ZEUS-8B-V15/b5f06a78-5b57-45a5-93be-4f3c1b36f208.json deleted file mode 100644 index b2f90345b..000000000 --- a/data/hfopenllm_v2/T145/ZEUS-8B-V15/b5f06a78-5b57-45a5-93be-4f3c1b36f208.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/T145_ZEUS-8B-V15/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ZEUS-8B-V15", - "id": "T145/ZEUS-8B-V15", - "developer": "T145", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", 
- "architecture": "LlamaForCausalLM", - "params_billions": 4.015 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7013 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5538 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2304 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.276 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.402 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4059 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/T145/ZEUS-8B-V16/835f19d3-515c-4bc4-ab96-5cb5bece45dc.json b/data/hfopenllm_v2/T145/ZEUS-8B-V16/835f19d3-515c-4bc4-ab96-5cb5bece45dc.json deleted file mode 100644 index 5181ab5f4..000000000 --- a/data/hfopenllm_v2/T145/ZEUS-8B-V16/835f19d3-515c-4bc4-ab96-5cb5bece45dc.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/T145_ZEUS-8B-V16/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ZEUS-8B-V16", - "id": "T145/ZEUS-8B-V16", - "developer": "T145", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7925 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5266 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2205 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.307 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3951 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3926 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/T145/ZEUS-8B-V17-abliterated-V2/7dd96382-6fc1-4a39-924b-d9034b5b0839.json b/data/hfopenllm_v2/T145/ZEUS-8B-V17-abliterated-V2/7dd96382-6fc1-4a39-924b-d9034b5b0839.json deleted file mode 100644 index 978451253..000000000 --- a/data/hfopenllm_v2/T145/ZEUS-8B-V17-abliterated-V2/7dd96382-6fc1-4a39-924b-d9034b5b0839.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/T145_ZEUS-8B-V17-abliterated-V2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ZEUS-8B-V17-abliterated-V2", - "id": "T145/ZEUS-8B-V17-abliterated-V2", - "developer": "T145", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6532 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - 
"source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4928 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1118 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2735 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3407 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3402 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/T145/ZEUS-8B-V17-abliterated-V4/77a666a2-a9b2-43cc-8e64-67172f4ab6c8.json b/data/hfopenllm_v2/T145/ZEUS-8B-V17-abliterated-V4/77a666a2-a9b2-43cc-8e64-67172f4ab6c8.json deleted file mode 100644 index cee81252f..000000000 --- a/data/hfopenllm_v2/T145/ZEUS-8B-V17-abliterated-V4/77a666a2-a9b2-43cc-8e64-67172f4ab6c8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/T145_ZEUS-8B-V17-abliterated-V4/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ZEUS-8B-V17-abliterated-V4", - "id": "T145/ZEUS-8B-V17-abliterated-V4", - "developer": "T145", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7228 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.5169 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0937 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2836 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4187 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3774 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/T145/ZEUS-8B-V17-abliterated/e3eae267-46ab-4433-a8f3-2a2f8448299b.json b/data/hfopenllm_v2/T145/ZEUS-8B-V17-abliterated/e3eae267-46ab-4433-a8f3-2a2f8448299b.json deleted file mode 100644 index f3ac720d6..000000000 --- a/data/hfopenllm_v2/T145/ZEUS-8B-V17-abliterated/e3eae267-46ab-4433-a8f3-2a2f8448299b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/T145_ZEUS-8B-V17-abliterated/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ZEUS-8B-V17-abliterated", - "id": "T145/ZEUS-8B-V17-abliterated", - "developer": "T145", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 7.594 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7576 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.52 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - 
"evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0438 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3037 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4269 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3622 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/T145/ZEUS-8B-V17/e31308c4-8eb2-4a72-8127-18049d58b814.json b/data/hfopenllm_v2/T145/ZEUS-8B-V17/e31308c4-8eb2-4a72-8127-18049d58b814.json deleted file mode 100644 index 7b6a3680c..000000000 --- a/data/hfopenllm_v2/T145/ZEUS-8B-V17/e31308c4-8eb2-4a72-8127-18049d58b814.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/T145_ZEUS-8B-V17/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ZEUS-8B-V17", - "id": "T145/ZEUS-8B-V17", - "developer": "T145", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7941 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5251 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2243 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - 
"hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3221 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4016 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3935 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/T145/ZEUS-8B-V18/c7098a7a-e865-4ecd-b511-abeb2c0872bd.json b/data/hfopenllm_v2/T145/ZEUS-8B-V18/c7098a7a-e865-4ecd-b511-abeb2c0872bd.json deleted file mode 100644 index 4f93f40a0..000000000 --- a/data/hfopenllm_v2/T145/ZEUS-8B-V18/c7098a7a-e865-4ecd-b511-abeb2c0872bd.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/T145_ZEUS-8B-V18/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ZEUS-8B-V18", - "id": "T145/ZEUS-8B-V18", - "developer": "T145", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7834 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.527 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2183 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3213 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": 
"MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4043 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3942 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/T145/ZEUS-8B-V19/b3a8c734-e63a-47f7-af2c-a3b6518802fa.json b/data/hfopenllm_v2/T145/ZEUS-8B-V19/b3a8c734-e63a-47f7-af2c-a3b6518802fa.json deleted file mode 100644 index 893297bba..000000000 --- a/data/hfopenllm_v2/T145/ZEUS-8B-V19/b3a8c734-e63a-47f7-af2c-a3b6518802fa.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/T145_ZEUS-8B-V19/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ZEUS-8B-V19", - "id": "T145/ZEUS-8B-V19", - "developer": "T145", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7883 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5276 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2205 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3221 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4043 - } - }, - { - "evaluation_name": 
"MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3934 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/T145/ZEUS-8B-V2-ORPO/35937965-2791-4f75-8954-5a2280381c91.json b/data/hfopenllm_v2/T145/ZEUS-8B-V2-ORPO/35937965-2791-4f75-8954-5a2280381c91.json deleted file mode 100644 index 970e9dcd6..000000000 --- a/data/hfopenllm_v2/T145/ZEUS-8B-V2-ORPO/35937965-2791-4f75-8954-5a2280381c91.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/T145_ZEUS-8B-V2-ORPO/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ZEUS-8B-V2-ORPO", - "id": "T145/ZEUS-8B-V2-ORPO", - "developer": "T145", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 4.015 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7187 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5075 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1828 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3104 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3935 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 
- }, - "score_details": { - "score": 0.3678 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/T145/ZEUS-8B-V2-abliterated/4ab806fe-738d-4f5b-89e4-004134d2f7fe.json b/data/hfopenllm_v2/T145/ZEUS-8B-V2-abliterated/4ab806fe-738d-4f5b-89e4-004134d2f7fe.json deleted file mode 100644 index 1e938c379..000000000 --- a/data/hfopenllm_v2/T145/ZEUS-8B-V2-abliterated/4ab806fe-738d-4f5b-89e4-004134d2f7fe.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/T145_ZEUS-8B-V2-abliterated/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ZEUS-8B-V2-abliterated", - "id": "T145/ZEUS-8B-V2-abliterated", - "developer": "T145", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7895 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5129 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2115 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3129 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3911 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3825 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/T145/ZEUS-8B-V2/a937e27e-b757-4de7-b679-01ac29d8bb22.json b/data/hfopenllm_v2/T145/ZEUS-8B-V2/a937e27e-b757-4de7-b679-01ac29d8bb22.json 
deleted file mode 100644 index d403098af..000000000 --- a/data/hfopenllm_v2/T145/ZEUS-8B-V2/a937e27e-b757-4de7-b679-01ac29d8bb22.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/T145_ZEUS-8B-V2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ZEUS-8B-V2", - "id": "T145/ZEUS-8B-V2", - "developer": "T145", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8029 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5194 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.216 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.302 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.391 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3896 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/T145/ZEUS-8B-V20/1d906aab-33a6-4ffe-8a63-694482d83d09.json b/data/hfopenllm_v2/T145/ZEUS-8B-V20/1d906aab-33a6-4ffe-8a63-694482d83d09.json deleted file mode 100644 index 269fa1ea5..000000000 --- a/data/hfopenllm_v2/T145/ZEUS-8B-V20/1d906aab-33a6-4ffe-8a63-694482d83d09.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/T145_ZEUS-8B-V20/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - 
"source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ZEUS-8B-V20", - "id": "T145/ZEUS-8B-V20", - "developer": "T145", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7956 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5244 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.219 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.323 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4043 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.393 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/T145/ZEUS-8B-V21/9e101298-6482-4ae8-83e4-b948ba8fa550.json b/data/hfopenllm_v2/T145/ZEUS-8B-V21/9e101298-6482-4ae8-83e4-b948ba8fa550.json deleted file mode 100644 index 55ec61818..000000000 --- a/data/hfopenllm_v2/T145/ZEUS-8B-V21/9e101298-6482-4ae8-83e4-b948ba8fa550.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/T145_ZEUS-8B-V21/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ZEUS-8B-V21", - "id": "T145/ZEUS-8B-V21", - "developer": "T145", - "inference_platform": "unknown", - 
"additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3785 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3398 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1594 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2643 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3262 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1714 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/T145/ZEUS-8B-V22/3818710d-80a9-4e7d-90e3-f06afffb71ac.json b/data/hfopenllm_v2/T145/ZEUS-8B-V22/3818710d-80a9-4e7d-90e3-f06afffb71ac.json deleted file mode 100644 index 35e2f1e24..000000000 --- a/data/hfopenllm_v2/T145/ZEUS-8B-V22/3818710d-80a9-4e7d-90e3-f06afffb71ac.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/T145_ZEUS-8B-V22/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ZEUS-8B-V22", - "id": "T145/ZEUS-8B-V22", - "developer": "T145", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7995 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5245 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2228 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.328 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.399 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3938 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/T145/ZEUS-8B-V23/a18ec0c4-6f3f-4904-b69c-e40770df169e.json b/data/hfopenllm_v2/T145/ZEUS-8B-V23/a18ec0c4-6f3f-4904-b69c-e40770df169e.json deleted file mode 100644 index 9423e9265..000000000 --- a/data/hfopenllm_v2/T145/ZEUS-8B-V23/a18ec0c4-6f3f-4904-b69c-e40770df169e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/T145_ZEUS-8B-V23/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ZEUS-8B-V23", - "id": "T145/ZEUS-8B-V23", - "developer": "T145", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7621 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": 
"SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5195 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.182 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3096 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3922 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3666 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/T145/ZEUS-8B-V24/529c2bd4-6b8e-4e3c-8737-c0b794444d13.json b/data/hfopenllm_v2/T145/ZEUS-8B-V24/529c2bd4-6b8e-4e3c-8737-c0b794444d13.json deleted file mode 100644 index cebba5e48..000000000 --- a/data/hfopenllm_v2/T145/ZEUS-8B-V24/529c2bd4-6b8e-4e3c-8737-c0b794444d13.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/T145_ZEUS-8B-V24/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ZEUS-8B-V24", - "id": "T145/ZEUS-8B-V24", - "developer": "T145", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4778 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 
5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1458 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2617 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3729 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3285 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/T145/ZEUS-8B-V25/9e994362-a1d1-48f7-9db1-dd9d532b9f35.json b/data/hfopenllm_v2/T145/ZEUS-8B-V25/9e994362-a1d1-48f7-9db1-dd9d532b9f35.json deleted file mode 100644 index 7c03079a9..000000000 --- a/data/hfopenllm_v2/T145/ZEUS-8B-V25/9e994362-a1d1-48f7-9db1-dd9d532b9f35.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/T145_ZEUS-8B-V25/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ZEUS-8B-V25", - "id": "T145/ZEUS-8B-V25", - "developer": "T145", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.332 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4547 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2039 - } - }, - { - 
"evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2643 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3488 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2885 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/T145/ZEUS-8B-V26/cf35b7db-f675-4362-8916-36b0582b64f4.json b/data/hfopenllm_v2/T145/ZEUS-8B-V26/cf35b7db-f675-4362-8916-36b0582b64f4.json deleted file mode 100644 index 1c35fccff..000000000 --- a/data/hfopenllm_v2/T145/ZEUS-8B-V26/cf35b7db-f675-4362-8916-36b0582b64f4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/T145_ZEUS-8B-V26/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ZEUS-8B-V26", - "id": "T145/ZEUS-8B-V26", - "developer": "T145", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6708 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5232 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1246 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.2953 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4016 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3907 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/T145/ZEUS-8B-V27/79ee7e34-36cd-4024-8978-86c1b059ae5f.json b/data/hfopenllm_v2/T145/ZEUS-8B-V27/79ee7e34-36cd-4024-8978-86c1b059ae5f.json deleted file mode 100644 index 6b17c7f6a..000000000 --- a/data/hfopenllm_v2/T145/ZEUS-8B-V27/79ee7e34-36cd-4024-8978-86c1b059ae5f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/T145_ZEUS-8B-V27/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ZEUS-8B-V27", - "id": "T145/ZEUS-8B-V27", - "developer": "T145", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6544 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.523 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1344 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3079 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3977 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3902 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/T145/ZEUS-8B-V28/9ec4fb99-ed4d-416e-9342-0c036aadd35d.json b/data/hfopenllm_v2/T145/ZEUS-8B-V28/9ec4fb99-ed4d-416e-9342-0c036aadd35d.json deleted file mode 100644 index 1ebb0754d..000000000 --- a/data/hfopenllm_v2/T145/ZEUS-8B-V28/9ec4fb99-ed4d-416e-9342-0c036aadd35d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/T145_ZEUS-8B-V28/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ZEUS-8B-V28", - "id": "T145/ZEUS-8B-V28", - "developer": "T145", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6353 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5254 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1269 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3037 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3896 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3902 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/T145/ZEUS-8B-V29/8788e4fa-04c5-4f7c-bb4e-523287901f71.json b/data/hfopenllm_v2/T145/ZEUS-8B-V29/8788e4fa-04c5-4f7c-bb4e-523287901f71.json deleted file mode 100644 index a0483b34c..000000000 --- a/data/hfopenllm_v2/T145/ZEUS-8B-V29/8788e4fa-04c5-4f7c-bb4e-523287901f71.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/T145_ZEUS-8B-V29/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ZEUS-8B-V29", - "id": "T145/ZEUS-8B-V29", - "developer": "T145", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7418 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5253 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1601 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3263 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4003 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.392 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/T145/ZEUS-8B-V2L1/18097bf4-5149-40e9-9850-558c3f143ed8.json 
b/data/hfopenllm_v2/T145/ZEUS-8B-V2L1/18097bf4-5149-40e9-9850-558c3f143ed8.json deleted file mode 100644 index e6c750d51..000000000 --- a/data/hfopenllm_v2/T145/ZEUS-8B-V2L1/18097bf4-5149-40e9-9850-558c3f143ed8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/T145_ZEUS-8B-V2L1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ZEUS-8B-V2L1", - "id": "T145/ZEUS-8B-V2L1", - "developer": "T145", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3192 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5013 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1239 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3129 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3882 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3638 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/T145/ZEUS-8B-V2L2/b5942721-5c30-4c49-a6e1-fb5419539652.json b/data/hfopenllm_v2/T145/ZEUS-8B-V2L2/b5942721-5c30-4c49-a6e1-fb5419539652.json deleted file mode 100644 index d9184ab9f..000000000 --- a/data/hfopenllm_v2/T145/ZEUS-8B-V2L2/b5942721-5c30-4c49-a6e1-fb5419539652.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/T145_ZEUS-8B-V2L2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ZEUS-8B-V2L2", - "id": "T145/ZEUS-8B-V2L2", - "developer": "T145", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8021 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5203 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2017 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2995 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3975 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3884 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/T145/ZEUS-8B-V3/76d27de3-0309-4e4b-8d0d-0e402bde0a31.json b/data/hfopenllm_v2/T145/ZEUS-8B-V3/76d27de3-0309-4e4b-8d0d-0e402bde0a31.json deleted file mode 100644 index 5781ec510..000000000 --- a/data/hfopenllm_v2/T145/ZEUS-8B-V3/76d27de3-0309-4e4b-8d0d-0e402bde0a31.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/T145_ZEUS-8B-V3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ZEUS-8B-V3", - 
"id": "T145/ZEUS-8B-V3", - "developer": "T145", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7887 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5265 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1677 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3221 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4017 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3804 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/T145/ZEUS-8B-V30/5c0553ff-4910-45a9-aa8d-3a76af098403.json b/data/hfopenllm_v2/T145/ZEUS-8B-V30/5c0553ff-4910-45a9-aa8d-3a76af098403.json deleted file mode 100644 index cf4cd7678..000000000 --- a/data/hfopenllm_v2/T145/ZEUS-8B-V30/5c0553ff-4910-45a9-aa8d-3a76af098403.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/T145_ZEUS-8B-V30/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ZEUS-8B-V30", - "id": "T145/ZEUS-8B-V30", - "developer": "T145", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - 
"source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7436 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5243 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1586 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3205 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4029 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3944 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/T145/ZEUS-8B-V4/fd97d1d9-a1b5-429d-b73d-1ea92ae1d61c.json b/data/hfopenllm_v2/T145/ZEUS-8B-V4/fd97d1d9-a1b5-429d-b73d-1ea92ae1d61c.json deleted file mode 100644 index b4583aa87..000000000 --- a/data/hfopenllm_v2/T145/ZEUS-8B-V4/fd97d1d9-a1b5-429d-b73d-1ea92ae1d61c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/T145_ZEUS-8B-V4/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ZEUS-8B-V4", - "id": "T145/ZEUS-8B-V4", - "developer": "T145", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7807 - } - }, - { - "evaluation_name": "BBH", - "source_data": 
{ - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5246 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1926 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.307 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4029 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3788 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/T145/ZEUS-8B-V6/f77aa103-5a09-409c-ad72-7992b6049f94.json b/data/hfopenllm_v2/T145/ZEUS-8B-V6/f77aa103-5a09-409c-ad72-7992b6049f94.json deleted file mode 100644 index 0d6b104c5..000000000 --- a/data/hfopenllm_v2/T145/ZEUS-8B-V6/f77aa103-5a09-409c-ad72-7992b6049f94.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/T145_ZEUS-8B-V6/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ZEUS-8B-V6", - "id": "T145/ZEUS-8B-V6", - "developer": "T145", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7838 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.524 - } - }, - { - 
"evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2024 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3045 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4068 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3759 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/T145/ZEUS-8B-V7/0afdaa1d-c1e7-4283-a2b3-f459c09df4a9.json b/data/hfopenllm_v2/T145/ZEUS-8B-V7/0afdaa1d-c1e7-4283-a2b3-f459c09df4a9.json deleted file mode 100644 index 4c23b2855..000000000 --- a/data/hfopenllm_v2/T145/ZEUS-8B-V7/0afdaa1d-c1e7-4283-a2b3-f459c09df4a9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/T145_ZEUS-8B-V7/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ZEUS-8B-V7", - "id": "T145/ZEUS-8B-V7", - "developer": "T145", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7786 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.507 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.148 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.297 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4162 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3812 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/T145/ZEUS-8B-V8/044ed79b-0c54-4a7a-94ba-a3f999adeb0d.json b/data/hfopenllm_v2/T145/ZEUS-8B-V8/044ed79b-0c54-4a7a-94ba-a3f999adeb0d.json deleted file mode 100644 index dac50a4bc..000000000 --- a/data/hfopenllm_v2/T145/ZEUS-8B-V8/044ed79b-0c54-4a7a-94ba-a3f999adeb0d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/T145_ZEUS-8B-V8/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ZEUS-8B-V8", - "id": "T145/ZEUS-8B-V8", - "developer": "T145", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7914 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5065 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1329 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2878 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4214 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3761 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/T145/ZEUS-8B-V9/ac6b884d-62ea-4ff5-8eee-cfce08869030.json b/data/hfopenllm_v2/T145/ZEUS-8B-V9/ac6b884d-62ea-4ff5-8eee-cfce08869030.json deleted file mode 100644 index 73c3bb0b4..000000000 --- a/data/hfopenllm_v2/T145/ZEUS-8B-V9/ac6b884d-62ea-4ff5-8eee-cfce08869030.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/T145_ZEUS-8B-V9/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ZEUS-8B-V9", - "id": "T145/ZEUS-8B-V9", - "developer": "T145", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5551 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5207 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2137 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2911 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3949 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3901 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/T145/qwen-2.5-3B-merge-test/8ffa696e-adef-4808-ba0e-bb04921a433d.json b/data/hfopenllm_v2/T145/qwen-2.5-3B-merge-test/8ffa696e-adef-4808-ba0e-bb04921a433d.json deleted file mode 100644 index 2f5ff3f18..000000000 --- a/data/hfopenllm_v2/T145/qwen-2.5-3B-merge-test/8ffa696e-adef-4808-ba0e-bb04921a433d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/T145_qwen-2.5-3B-merge-test/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "qwen-2.5-3B-merge-test", - "id": "T145/qwen-2.5-3B-merge-test", - "developer": "T145", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.397 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5751 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4842 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3202 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2852 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4007 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": 
"hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.329 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/THUDM/glm-4-9b-chat-1m-hf/8a2cfa62-5f13-447e-8d0f-2503e4962ac5.json b/data/hfopenllm_v2/THUDM/glm-4-9b-chat-1m-hf/8a2cfa62-5f13-447e-8d0f-2503e4962ac5.json deleted file mode 100644 index 2a4ac4794..000000000 --- a/data/hfopenllm_v2/THUDM/glm-4-9b-chat-1m-hf/8a2cfa62-5f13-447e-8d0f-2503e4962ac5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/THUDM_glm-4-9b-chat-1m-hf/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "glm-4-9b-chat-1m-hf", - "id": "THUDM/glm-4-9b-chat-1m-hf", - "developer": "THUDM", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "GlmForCausalLM", - "params_billions": 9.484 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5341 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3901 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0483 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2919 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3689 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1814 - } - } 
- ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/THUDM/glm-4-9b-chat-1m/4f24fc46-3686-41fa-bf25-a0e39b252cc9.json b/data/hfopenllm_v2/THUDM/glm-4-9b-chat-1m/4f24fc46-3686-41fa-bf25-a0e39b252cc9.json deleted file mode 100644 index cbe32d26e..000000000 --- a/data/hfopenllm_v2/THUDM/glm-4-9b-chat-1m/4f24fc46-3686-41fa-bf25-a0e39b252cc9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/THUDM_glm-4-9b-chat-1m/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "glm-4-9b-chat-1m", - "id": "THUDM/glm-4-9b-chat-1m", - "developer": "THUDM", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "ChatGLMModel", - "params_billions": 9.484 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.418 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3037 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3795 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3163 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/THUDM/glm-4-9b-chat-hf/b1375cb4-b0d5-4cb4-ad43-394ebd1a481f.json b/data/hfopenllm_v2/THUDM/glm-4-9b-chat-hf/b1375cb4-b0d5-4cb4-ad43-394ebd1a481f.json deleted file mode 100644 index f54072bac..000000000 --- 
a/data/hfopenllm_v2/THUDM/glm-4-9b-chat-hf/b1375cb4-b0d5-4cb4-ad43-394ebd1a481f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/THUDM_glm-4-9b-chat-hf/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "glm-4-9b-chat-hf", - "id": "THUDM/glm-4-9b-chat-hf", - "developer": "THUDM", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "GlmForCausalLM", - "params_billions": 9.4 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6513 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4432 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0846 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3029 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3593 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2774 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/THUDM/glm-4-9b-chat/4ce062da-acfc-4684-95c2-679cbe5a697b.json b/data/hfopenllm_v2/THUDM/glm-4-9b-chat/4ce062da-acfc-4684-95c2-679cbe5a697b.json deleted file mode 100644 index 01dceb960..000000000 --- a/data/hfopenllm_v2/THUDM/glm-4-9b-chat/4ce062da-acfc-4684-95c2-679cbe5a697b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/THUDM_glm-4-9b-chat/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - 
"source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "glm-4-9b-chat", - "id": "THUDM/glm-4-9b-chat", - "developer": "THUDM", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "ChatGLMModelM", - "params_billions": 9.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4736 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3138 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3994 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3167 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/THUDM/glm-4-9b/3d785765-befa-4e53-8672-769f7bb87dcd.json b/data/hfopenllm_v2/THUDM/glm-4-9b/3d785765-befa-4e53-8672-769f7bb87dcd.json deleted file mode 100644 index f7c95a078..000000000 --- a/data/hfopenllm_v2/THUDM/glm-4-9b/3d785765-befa-4e53-8672-769f7bb87dcd.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/THUDM_glm-4-9b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "glm-4-9b", - "id": "THUDM/glm-4-9b", - "developer": "THUDM", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", 
- "architecture": "ChatGLMModelM", - "params_billions": 9.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1426 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5528 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3163 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4386 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4145 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/TIGER-Lab/AceCodeRM-7B/ab0d3a24-19db-4d00-892e-bcb7c0f2f30f.json b/data/hfopenllm_v2/TIGER-Lab/AceCodeRM-7B/ab0d3a24-19db-4d00-892e-bcb7c0f2f30f.json deleted file mode 100644 index a74035d6b..000000000 --- a/data/hfopenllm_v2/TIGER-Lab/AceCodeRM-7B/ab0d3a24-19db-4d00-892e-bcb7c0f2f30f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/TIGER-Lab_AceCodeRM-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "AceCodeRM-7B", - "id": "TIGER-Lab/AceCodeRM-7B", - "developer": "TIGER-Lab", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalRM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": 
"Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5855 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4773 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3467 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3045 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4192 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3361 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/TIGER-Lab/AceCoder-Qwen2.5-7B-Ins-Rule/31f0b186-1805-42ff-86cf-d8455a66d538.json b/data/hfopenllm_v2/TIGER-Lab/AceCoder-Qwen2.5-7B-Ins-Rule/31f0b186-1805-42ff-86cf-d8455a66d538.json deleted file mode 100644 index 0ea92826f..000000000 --- a/data/hfopenllm_v2/TIGER-Lab/AceCoder-Qwen2.5-7B-Ins-Rule/31f0b186-1805-42ff-86cf-d8455a66d538.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/TIGER-Lab_AceCoder-Qwen2.5-7B-Ins-Rule/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "AceCoder-Qwen2.5-7B-Ins-Rule", - "id": "TIGER-Lab/AceCoder-Qwen2.5-7B-Ins-Rule", - "developer": "TIGER-Lab", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7424 - } - }, - { - 
"evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5404 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4992 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3012 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.398 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4322 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/TIGER-Lab/AceCoder-Qwen2.5-Coder-7B-Base-Rule/ed6b3e7e-d294-420d-b9b9-460a52cd0239.json b/data/hfopenllm_v2/TIGER-Lab/AceCoder-Qwen2.5-Coder-7B-Base-Rule/ed6b3e7e-d294-420d-b9b9-460a52cd0239.json deleted file mode 100644 index 60cf1a5b7..000000000 --- a/data/hfopenllm_v2/TIGER-Lab/AceCoder-Qwen2.5-Coder-7B-Base-Rule/ed6b3e7e-d294-420d-b9b9-460a52cd0239.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/TIGER-Lab_AceCoder-Qwen2.5-Coder-7B-Base-Rule/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "AceCoder-Qwen2.5-Coder-7B-Base-Rule", - "id": "TIGER-Lab/AceCoder-Qwen2.5-Coder-7B-Base-Rule", - "developer": "TIGER-Lab", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4408 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4902 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2017 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2718 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3449 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3745 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/TIGER-Lab/AceCoder-Qwen2.5-Coder-7B-Ins-Rule/91dec0c0-9854-4790-a0a5-e17d19636f17.json b/data/hfopenllm_v2/TIGER-Lab/AceCoder-Qwen2.5-Coder-7B-Ins-Rule/91dec0c0-9854-4790-a0a5-e17d19636f17.json deleted file mode 100644 index e2e2f2697..000000000 --- a/data/hfopenllm_v2/TIGER-Lab/AceCoder-Qwen2.5-Coder-7B-Ins-Rule/91dec0c0-9854-4790-a0a5-e17d19636f17.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/TIGER-Lab_AceCoder-Qwen2.5-Coder-7B-Ins-Rule/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "AceCoder-Qwen2.5-Coder-7B-Ins-Rule", - "id": "TIGER-Lab/AceCoder-Qwen2.5-Coder-7B-Ins-Rule", - "developer": "TIGER-Lab", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6222 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.5089 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3603 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2777 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4046 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3428 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/TIGER-Lab/MAmmoTH2-7B-Plus/599616fb-26c1-47e3-a98b-9ad922a95c08.json b/data/hfopenllm_v2/TIGER-Lab/MAmmoTH2-7B-Plus/599616fb-26c1-47e3-a98b-9ad922a95c08.json deleted file mode 100644 index 0a1e9887a..000000000 --- a/data/hfopenllm_v2/TIGER-Lab/MAmmoTH2-7B-Plus/599616fb-26c1-47e3-a98b-9ad922a95c08.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/TIGER-Lab_MAmmoTH2-7B-Plus/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MAmmoTH2-7B-Plus", - "id": "TIGER-Lab/MAmmoTH2-7B-Plus", - "developer": "TIGER-Lab", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5575 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4235 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - 
"metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1858 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2802 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4124 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3017 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/TIGER-Lab/Qwen2.5-Math-7B-CFT/aeee4365-c34d-46b9-8c98-29976010bb62.json b/data/hfopenllm_v2/TIGER-Lab/Qwen2.5-Math-7B-CFT/aeee4365-c34d-46b9-8c98-29976010bb62.json deleted file mode 100644 index 92ab6326a..000000000 --- a/data/hfopenllm_v2/TIGER-Lab/Qwen2.5-Math-7B-CFT/aeee4365-c34d-46b9-8c98-29976010bb62.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/TIGER-Lab_Qwen2.5-Math-7B-CFT/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-Math-7B-CFT", - "id": "TIGER-Lab/Qwen2.5-Math-7B-CFT", - "developer": "TIGER-Lab", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2777 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4637 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5574 - } - }, - { - 
"evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2861 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3887 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2945 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/TTTXXX01/Mistral-7B-Base-SimPO2-5e-7/1ec68708-94c9-4561-bb99-7f211d7a9950.json b/data/hfopenllm_v2/TTTXXX01/Mistral-7B-Base-SimPO2-5e-7/1ec68708-94c9-4561-bb99-7f211d7a9950.json deleted file mode 100644 index f4ed2d887..000000000 --- a/data/hfopenllm_v2/TTTXXX01/Mistral-7B-Base-SimPO2-5e-7/1ec68708-94c9-4561-bb99-7f211d7a9950.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/TTTXXX01_Mistral-7B-Base-SimPO2-5e-7/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral-7B-Base-SimPO2-5e-7", - "id": "TTTXXX01/Mistral-7B-Base-SimPO2-5e-7", - "developer": "TTTXXX01", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4392 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.432 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0264 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on 
GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2978 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3604 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2766 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Tarek07/Progenitor-V1.1-LLaMa-70B/0b53e7b4-0e91-40a2-911b-cd0d415e9fad.json b/data/hfopenllm_v2/Tarek07/Progenitor-V1.1-LLaMa-70B/0b53e7b4-0e91-40a2-911b-cd0d415e9fad.json deleted file mode 100644 index 80dd34f48..000000000 --- a/data/hfopenllm_v2/Tarek07/Progenitor-V1.1-LLaMa-70B/0b53e7b4-0e91-40a2-911b-cd0d415e9fad.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Tarek07_Progenitor-V1.1-LLaMa-70B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Progenitor-V1.1-LLaMa-70B", - "id": "Tarek07/Progenitor-V1.1-LLaMa-70B", - "developer": "Tarek07", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 70.554 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6906 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6971 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3573 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4581 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - 
"dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4736 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5465 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Tarek07/Thalassic-Alpha-LLaMa-70B/91bcd646-fe3d-458b-a426-a6a8863d69a0.json b/data/hfopenllm_v2/Tarek07/Thalassic-Alpha-LLaMa-70B/91bcd646-fe3d-458b-a426-a6a8863d69a0.json deleted file mode 100644 index f5ea7f4d8..000000000 --- a/data/hfopenllm_v2/Tarek07/Thalassic-Alpha-LLaMa-70B/91bcd646-fe3d-458b-a426-a6a8863d69a0.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Tarek07_Thalassic-Alpha-LLaMa-70B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Thalassic-Alpha-LLaMa-70B", - "id": "Tarek07/Thalassic-Alpha-LLaMa-70B", - "developer": "Tarek07", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 70.554 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7003 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.694 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.315 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4438 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4802 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5435 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/TeeZee/DoubleBagel-57B-v1.0/2e0458cc-e092-4770-bd80-00dff169d754.json b/data/hfopenllm_v2/TeeZee/DoubleBagel-57B-v1.0/2e0458cc-e092-4770-bd80-00dff169d754.json deleted file mode 100644 index b1e13e640..000000000 --- a/data/hfopenllm_v2/TeeZee/DoubleBagel-57B-v1.0/2e0458cc-e092-4770-bd80-00dff169d754.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/TeeZee_DoubleBagel-57B-v1.0/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "DoubleBagel-57B-v1.0", - "id": "TeeZee/DoubleBagel-57B-v1.0", - "developer": "TeeZee", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 56.703 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2336 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3251 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0098 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.276 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4315 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1478 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Telugu-LLM-Labs/Indic-gemma-2b-finetuned-sft-Navarasa-2.0/d56ef415-0edf-4fde-8277-ae44b4bb4ed2.json b/data/hfopenllm_v2/Telugu-LLM-Labs/Indic-gemma-2b-finetuned-sft-Navarasa-2.0/d56ef415-0edf-4fde-8277-ae44b4bb4ed2.json deleted file mode 100644 index 2039e5d97..000000000 --- a/data/hfopenllm_v2/Telugu-LLM-Labs/Indic-gemma-2b-finetuned-sft-Navarasa-2.0/d56ef415-0edf-4fde-8277-ae44b4bb4ed2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Telugu-LLM-Labs_Indic-gemma-2b-finetuned-sft-Navarasa-2.0/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Indic-gemma-2b-finetuned-sft-Navarasa-2.0", - "id": "Telugu-LLM-Labs/Indic-gemma-2b-finetuned-sft-Navarasa-2.0", - "developer": "Telugu-LLM-Labs", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "GemmaForCausalLM", - "params_billions": 2.506 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2103 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3241 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0272 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2433 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3899 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1279 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Telugu-LLM-Labs/Indic-gemma-7b-finetuned-sft-Navarasa-2.0/a0a1beb8-ee9a-4e88-b939-6e0104ed76a7.json b/data/hfopenllm_v2/Telugu-LLM-Labs/Indic-gemma-7b-finetuned-sft-Navarasa-2.0/a0a1beb8-ee9a-4e88-b939-6e0104ed76a7.json deleted file mode 100644 index a9a9aa606..000000000 --- a/data/hfopenllm_v2/Telugu-LLM-Labs/Indic-gemma-7b-finetuned-sft-Navarasa-2.0/a0a1beb8-ee9a-4e88-b939-6e0104ed76a7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Telugu-LLM-Labs_Indic-gemma-7b-finetuned-sft-Navarasa-2.0/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Indic-gemma-7b-finetuned-sft-Navarasa-2.0", - "id": "Telugu-LLM-Labs/Indic-gemma-7b-finetuned-sft-Navarasa-2.0", - "developer": "Telugu-LLM-Labs", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "GemmaForCausalLM", - "params_billions": 8.538 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3237 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4023 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0257 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2701 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4083 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.235 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/TencentARC/LLaMA-Pro-8B-Instruct/f9b7c3ee-ea8b-42f0-a55a-6171d4e3d0ea.json b/data/hfopenllm_v2/TencentARC/LLaMA-Pro-8B-Instruct/f9b7c3ee-ea8b-42f0-a55a-6171d4e3d0ea.json deleted file mode 100644 index 2c3b0edf0..000000000 --- a/data/hfopenllm_v2/TencentARC/LLaMA-Pro-8B-Instruct/f9b7c3ee-ea8b-42f0-a55a-6171d4e3d0ea.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/TencentARC_LLaMA-Pro-8B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "LLaMA-Pro-8B-Instruct", - "id": "TencentARC/LLaMA-Pro-8B-Instruct", - "developer": "TencentARC", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.357 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4486 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4224 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0249 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2743 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.419 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1946 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/TencentARC/LLaMA-Pro-8B/2c8c6c6a-ce95-4d11-a33a-d547859fee11.json 
b/data/hfopenllm_v2/TencentARC/LLaMA-Pro-8B/2c8c6c6a-ce95-4d11-a33a-d547859fee11.json deleted file mode 100644 index 5af85f4e1..000000000 --- a/data/hfopenllm_v2/TencentARC/LLaMA-Pro-8B/2c8c6c6a-ce95-4d11-a33a-d547859fee11.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/TencentARC_LLaMA-Pro-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "LLaMA-Pro-8B", - "id": "TencentARC/LLaMA-Pro-8B", - "developer": "TencentARC", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.357 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2277 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3484 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0189 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2601 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4018 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1811 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/TencentARC/MetaMath-Mistral-Pro/47858744-3378-4ed4-9101-8acbc3a53cda.json b/data/hfopenllm_v2/TencentARC/MetaMath-Mistral-Pro/47858744-3378-4ed4-9101-8acbc3a53cda.json deleted file mode 100644 index 666b6ee5b..000000000 --- a/data/hfopenllm_v2/TencentARC/MetaMath-Mistral-Pro/47858744-3378-4ed4-9101-8acbc3a53cda.json +++ /dev/null @@ -1,132 +0,0 @@ 
-{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/TencentARC_MetaMath-Mistral-Pro/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MetaMath-Mistral-Pro", - "id": "TencentARC/MetaMath-Mistral-Pro", - "developer": "TencentARC", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 8.987 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2119 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4413 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0763 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2693 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3524 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2472 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/TencentARC/Mistral_Pro_8B_v0.1/2aaeaaa7-89ed-4666-b0a5-8c1320ec4ec5.json b/data/hfopenllm_v2/TencentARC/Mistral_Pro_8B_v0.1/2aaeaaa7-89ed-4666-b0a5-8c1320ec4ec5.json deleted file mode 100644 index 4307e8288..000000000 --- a/data/hfopenllm_v2/TencentARC/Mistral_Pro_8B_v0.1/2aaeaaa7-89ed-4666-b0a5-8c1320ec4ec5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/TencentARC_Mistral_Pro_8B_v0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": 
"documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral_Pro_8B_v0.1", - "id": "TencentARC/Mistral_Pro_8B_v0.1", - "developer": "TencentARC", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 8.987 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2115 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4526 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0566 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2802 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4242 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2765 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/TheDrummer/Cydonia-22B-v1.2/23ae6a72-5a1f-4961-8662-feb4d8ad8a26.json b/data/hfopenllm_v2/TheDrummer/Cydonia-22B-v1.2/23ae6a72-5a1f-4961-8662-feb4d8ad8a26.json deleted file mode 100644 index 66b44eb6d..000000000 --- a/data/hfopenllm_v2/TheDrummer/Cydonia-22B-v1.2/23ae6a72-5a1f-4961-8662-feb4d8ad8a26.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/TheDrummer_Cydonia-22B-v1.2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Cydonia-22B-v1.2", - "id": "TheDrummer/Cydonia-22B-v1.2", - "developer": "TheDrummer", - "inference_platform": 
"unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 22.247 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5635 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5809 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2032 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3305 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4022 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4141 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/TheDrummer/Gemmasutra-9B-v1/312ec315-6175-4f99-8741-97d97eb26b47.json b/data/hfopenllm_v2/TheDrummer/Gemmasutra-9B-v1/312ec315-6175-4f99-8741-97d97eb26b47.json deleted file mode 100644 index d22fd945e..000000000 --- a/data/hfopenllm_v2/TheDrummer/Gemmasutra-9B-v1/312ec315-6175-4f99-8741-97d97eb26b47.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/TheDrummer_Gemmasutra-9B-v1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gemmasutra-9B-v1", - "id": "TheDrummer/Gemmasutra-9B-v1", - "developer": "TheDrummer", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - 
"source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2416 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5887 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0831 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3104 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4846 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4045 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/TheDrummer/Gemmasutra-Mini-2B-v1/7869bbe3-fd17-4e6d-9546-94d3df5e83ef.json b/data/hfopenllm_v2/TheDrummer/Gemmasutra-Mini-2B-v1/7869bbe3-fd17-4e6d-9546-94d3df5e83ef.json deleted file mode 100644 index ebce47bf5..000000000 --- a/data/hfopenllm_v2/TheDrummer/Gemmasutra-Mini-2B-v1/7869bbe3-fd17-4e6d-9546-94d3df5e83ef.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/TheDrummer_Gemmasutra-Mini-2B-v1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gemmasutra-Mini-2B-v1", - "id": "TheDrummer/Gemmasutra-Mini-2B-v1", - "developer": "TheDrummer", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 2.614 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 
1.0 - }, - "score_details": { - "score": 0.2549 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3575 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0378 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.271 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.349 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2055 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/TheDrummer/Llama-3SOME-8B-v2/68c9fb85-f90e-442f-aa96-458dabe30b39.json b/data/hfopenllm_v2/TheDrummer/Llama-3SOME-8B-v2/68c9fb85-f90e-442f-aa96-458dabe30b39.json deleted file mode 100644 index 4ed973176..000000000 --- a/data/hfopenllm_v2/TheDrummer/Llama-3SOME-8B-v2/68c9fb85-f90e-442f-aa96-458dabe30b39.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/TheDrummer_Llama-3SOME-8B-v2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3SOME-8B-v2", - "id": "TheDrummer/Llama-3SOME-8B-v2", - "developer": "TheDrummer", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4508 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": 
"Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5203 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0937 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.302 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3833 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3753 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/TheDrummer/Ministrations-8B-v1/6891d1dd-0e1a-42e8-9206-64a4c71854f9.json b/data/hfopenllm_v2/TheDrummer/Ministrations-8B-v1/6891d1dd-0e1a-42e8-9206-64a4c71854f9.json deleted file mode 100644 index f103c0ed1..000000000 --- a/data/hfopenllm_v2/TheDrummer/Ministrations-8B-v1/6891d1dd-0e1a-42e8-9206-64a4c71854f9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/TheDrummer_Ministrations-8B-v1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Ministrations-8B-v1", - "id": "TheDrummer/Ministrations-8B-v1", - "developer": "TheDrummer", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 8.02 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2822 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4877 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - 
"dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1843 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3247 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4449 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3644 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/TheDrummer/Rocinante-12B-v1/c62eb6b3-2a3d-45bd-acdf-bad717e51766.json b/data/hfopenllm_v2/TheDrummer/Rocinante-12B-v1/c62eb6b3-2a3d-45bd-acdf-bad717e51766.json deleted file mode 100644 index f5272f067..000000000 --- a/data/hfopenllm_v2/TheDrummer/Rocinante-12B-v1/c62eb6b3-2a3d-45bd-acdf-bad717e51766.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/TheDrummer_Rocinante-12B-v1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Rocinante-12B-v1", - "id": "TheDrummer/Rocinante-12B-v1", - "developer": "TheDrummer", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6076 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5065 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", 
- "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1269 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2911 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4017 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3477 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/TheDrummer/Tiger-Gemma-9B-v1/55d4a6ae-44e5-4a1b-9509-299fbc6c3a36.json b/data/hfopenllm_v2/TheDrummer/Tiger-Gemma-9B-v1/55d4a6ae-44e5-4a1b-9509-299fbc6c3a36.json deleted file mode 100644 index d5d96478e..000000000 --- a/data/hfopenllm_v2/TheDrummer/Tiger-Gemma-9B-v1/55d4a6ae-44e5-4a1b-9509-299fbc6c3a36.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/TheDrummer_Tiger-Gemma-9B-v1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Tiger-Gemma-9B-v1", - "id": "TheDrummer/Tiger-Gemma-9B-v1", - "developer": "TheDrummer", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 9.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7282 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5704 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1835 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3389 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4162 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4118 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/TheDrummer/Tiger-Gemma-9B-v2/227e3e19-29d6-414f-b538-9f6f89d47677.json b/data/hfopenllm_v2/TheDrummer/Tiger-Gemma-9B-v2/227e3e19-29d6-414f-b538-9f6f89d47677.json deleted file mode 100644 index 8fccf8d00..000000000 --- a/data/hfopenllm_v2/TheDrummer/Tiger-Gemma-9B-v2/227e3e19-29d6-414f-b538-9f6f89d47677.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/TheDrummer_Tiger-Gemma-9B-v2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Tiger-Gemma-9B-v2", - "id": "TheDrummer/Tiger-Gemma-9B-v2", - "developer": "TheDrummer", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 9.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6986 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5617 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.182 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3398 - } - }, - { - "evaluation_name": "MUSR", - 
"source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4084 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4112 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/TheDrummer/Tiger-Gemma-9B-v3/e922ac2c-e8d0-48f2-99fc-da70c925136c.json b/data/hfopenllm_v2/TheDrummer/Tiger-Gemma-9B-v3/e922ac2c-e8d0-48f2-99fc-da70c925136c.json deleted file mode 100644 index 4a7777424..000000000 --- a/data/hfopenllm_v2/TheDrummer/Tiger-Gemma-9B-v3/e922ac2c-e8d0-48f2-99fc-da70c925136c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/TheDrummer_Tiger-Gemma-9B-v3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Tiger-Gemma-9B-v3", - "id": "TheDrummer/Tiger-Gemma-9B-v3", - "developer": "TheDrummer", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 9.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6821 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5812 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1624 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3389 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 
0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4004 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4059 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/TheDrunkenSnail/Daughter-of-Rhodia-12B/59f93c1c-3712-4ee2-a3d2-999e5acc2ee5.json b/data/hfopenllm_v2/TheDrunkenSnail/Daughter-of-Rhodia-12B/59f93c1c-3712-4ee2-a3d2-999e5acc2ee5.json deleted file mode 100644 index c592a549e..000000000 --- a/data/hfopenllm_v2/TheDrunkenSnail/Daughter-of-Rhodia-12B/59f93c1c-3712-4ee2-a3d2-999e5acc2ee5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/TheDrunkenSnail_Daughter-of-Rhodia-12B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Daughter-of-Rhodia-12B", - "id": "TheDrunkenSnail/Daughter-of-Rhodia-12B", - "developer": "TheDrunkenSnail", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6904 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5179 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1224 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3171 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4348 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": 
"hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3641 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/TheDrunkenSnail/Mother-of-Rhodia-12B/a98dcf1e-6abb-402b-9e0c-da7c23b74bde.json b/data/hfopenllm_v2/TheDrunkenSnail/Mother-of-Rhodia-12B/a98dcf1e-6abb-402b-9e0c-da7c23b74bde.json deleted file mode 100644 index e5301b44e..000000000 --- a/data/hfopenllm_v2/TheDrunkenSnail/Mother-of-Rhodia-12B/a98dcf1e-6abb-402b-9e0c-da7c23b74bde.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/TheDrunkenSnail_Mother-of-Rhodia-12B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mother-of-Rhodia-12B", - "id": "TheDrunkenSnail/Mother-of-Rhodia-12B", - "developer": "TheDrunkenSnail", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6505 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4948 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1224 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2987 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4124 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 
0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3551 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/TheDrunkenSnail/Son-of-Rhodia/a889f561-0d8a-4345-9131-0a897ec215ac.json b/data/hfopenllm_v2/TheDrunkenSnail/Son-of-Rhodia/a889f561-0d8a-4345-9131-0a897ec215ac.json deleted file mode 100644 index b5fd619fc..000000000 --- a/data/hfopenllm_v2/TheDrunkenSnail/Son-of-Rhodia/a889f561-0d8a-4345-9131-0a897ec215ac.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/TheDrunkenSnail_Son-of-Rhodia/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Son-of-Rhodia", - "id": "TheDrunkenSnail/Son-of-Rhodia", - "developer": "TheDrunkenSnail", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7046 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5097 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1314 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3129 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4203 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3608 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/TheHierophant/Underground-Cognitive-V0.3-test/6402facc-6258-43a4-a0fd-78e21765c504.json 
b/data/hfopenllm_v2/TheHierophant/Underground-Cognitive-V0.3-test/6402facc-6258-43a4-a0fd-78e21765c504.json deleted file mode 100644 index 53650d960..000000000 --- a/data/hfopenllm_v2/TheHierophant/Underground-Cognitive-V0.3-test/6402facc-6258-43a4-a0fd-78e21765c504.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/TheHierophant_Underground-Cognitive-V0.3-test/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Underground-Cognitive-V0.3-test", - "id": "TheHierophant/Underground-Cognitive-V0.3-test", - "developer": "TheHierophant", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 10.732 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4808 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.529 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0589 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2987 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4351 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3318 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/TheTsar1209/nemo-carpmuscle-v0.1/29fbd2e0-e08a-48f4-905e-d2aa54886915.json b/data/hfopenllm_v2/TheTsar1209/nemo-carpmuscle-v0.1/29fbd2e0-e08a-48f4-905e-d2aa54886915.json deleted file mode 100644 index 2f11b0d16..000000000 --- 
a/data/hfopenllm_v2/TheTsar1209/nemo-carpmuscle-v0.1/29fbd2e0-e08a-48f4-905e-d2aa54886915.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/TheTsar1209_nemo-carpmuscle-v0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "nemo-carpmuscle-v0.1", - "id": "TheTsar1209/nemo-carpmuscle-v0.1", - "developer": "TheTsar1209", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2276 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5084 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0476 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.297 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4135 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3406 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/TheTsar1209/qwen-carpmuscle-r-v0.3/313e0379-d3ea-4f5a-8e06-4b0a94317487.json b/data/hfopenllm_v2/TheTsar1209/qwen-carpmuscle-r-v0.3/313e0379-d3ea-4f5a-8e06-4b0a94317487.json deleted file mode 100644 index cd556b29c..000000000 --- a/data/hfopenllm_v2/TheTsar1209/qwen-carpmuscle-r-v0.3/313e0379-d3ea-4f5a-8e06-4b0a94317487.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/TheTsar1209_qwen-carpmuscle-r-v0.3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "qwen-carpmuscle-r-v0.3", - "id": "TheTsar1209/qwen-carpmuscle-r-v0.3", - "developer": "TheTsar1209", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4455 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6227 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3006 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3507 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4278 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5103 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/TheTsar1209/qwen-carpmuscle-v0.1/f326fbd0-5f92-4324-a587-1f08cf7da208.json b/data/hfopenllm_v2/TheTsar1209/qwen-carpmuscle-v0.1/f326fbd0-5f92-4324-a587-1f08cf7da208.json deleted file mode 100644 index ee6712ad0..000000000 --- a/data/hfopenllm_v2/TheTsar1209/qwen-carpmuscle-v0.1/f326fbd0-5f92-4324-a587-1f08cf7da208.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/TheTsar1209_qwen-carpmuscle-v0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - 
"source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "qwen-carpmuscle-v0.1", - "id": "TheTsar1209/qwen-carpmuscle-v0.1", - "developer": "TheTsar1209", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5622 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6434 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2628 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.344 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4161 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.52 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/TheTsar1209/qwen-carpmuscle-v0.2/d61310e9-5267-4a87-8e24-ae25172cd64e.json b/data/hfopenllm_v2/TheTsar1209/qwen-carpmuscle-v0.2/d61310e9-5267-4a87-8e24-ae25172cd64e.json deleted file mode 100644 index 13326c754..000000000 --- a/data/hfopenllm_v2/TheTsar1209/qwen-carpmuscle-v0.2/d61310e9-5267-4a87-8e24-ae25172cd64e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/TheTsar1209_qwen-carpmuscle-v0.2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "qwen-carpmuscle-v0.2", - "id": "TheTsar1209/qwen-carpmuscle-v0.2", - "developer": "TheTsar1209", - 
"inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5257 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6387 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2832 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3557 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4346 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5147 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/TheTsar1209/qwen-carpmuscle-v0.3/60953e5e-523d-43c0-ad00-f746308030b1.json b/data/hfopenllm_v2/TheTsar1209/qwen-carpmuscle-v0.3/60953e5e-523d-43c0-ad00-f746308030b1.json deleted file mode 100644 index 246e41951..000000000 --- a/data/hfopenllm_v2/TheTsar1209/qwen-carpmuscle-v0.3/60953e5e-523d-43c0-ad00-f746308030b1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/TheTsar1209_qwen-carpmuscle-v0.3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "qwen-carpmuscle-v0.3", - "id": "TheTsar1209/qwen-carpmuscle-v0.3", - "developer": "TheTsar1209", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - 
"source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4476 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6152 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3134 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3565 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4132 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5062 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/TheTsar1209/qwen-carpmuscle-v0.4.1/5afd8861-d7cb-45cd-af1b-6db966cb56e0.json b/data/hfopenllm_v2/TheTsar1209/qwen-carpmuscle-v0.4.1/5afd8861-d7cb-45cd-af1b-6db966cb56e0.json deleted file mode 100644 index 255cd662b..000000000 --- a/data/hfopenllm_v2/TheTsar1209/qwen-carpmuscle-v0.4.1/5afd8861-d7cb-45cd-af1b-6db966cb56e0.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/TheTsar1209_qwen-carpmuscle-v0.4.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "qwen-carpmuscle-v0.4.1", - "id": "TheTsar1209/qwen-carpmuscle-v0.4.1", - "developer": "TheTsar1209", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.736 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6507 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2779 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3456 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4489 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5191 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/TheTsar1209/qwen-carpmuscle-v0.4/c3972df1-4414-4c71-b473-fb9459cf085b.json b/data/hfopenllm_v2/TheTsar1209/qwen-carpmuscle-v0.4/c3972df1-4414-4c71-b473-fb9459cf085b.json deleted file mode 100644 index 9edb7f769..000000000 --- a/data/hfopenllm_v2/TheTsar1209/qwen-carpmuscle-v0.4/c3972df1-4414-4c71-b473-fb9459cf085b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/TheTsar1209_qwen-carpmuscle-v0.4/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "qwen-carpmuscle-v0.4", - "id": "TheTsar1209/qwen-carpmuscle-v0.4", - "developer": "TheTsar1209", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7202 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - 
"hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6454 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2772 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3523 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4516 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5144 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Tijmen2/cosmosage-v3/b89d54b7-2329-4608-b9f6-07017e63f1cd.json b/data/hfopenllm_v2/Tijmen2/cosmosage-v3/b89d54b7-2329-4608-b9f6-07017e63f1cd.json deleted file mode 100644 index 39d760183..000000000 --- a/data/hfopenllm_v2/Tijmen2/cosmosage-v3/b89d54b7-2329-4608-b9f6-07017e63f1cd.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Tijmen2_cosmosage-v3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "cosmosage-v3", - "id": "Tijmen2/cosmosage-v3", - "developer": "Tijmen2", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4482 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4551 - } - }, - { - "evaluation_name": "MATH Level 5", - 
"source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0506 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2827 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4199 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2486 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/TinyLlama/TinyLlama-1.1B-Chat-v0.1/50389350-af23-41ba-af46-5ffe338ff9d2.json b/data/hfopenllm_v2/TinyLlama/TinyLlama-1.1B-Chat-v0.1/50389350-af23-41ba-af46-5ffe338ff9d2.json deleted file mode 100644 index a2c39dd23..000000000 --- a/data/hfopenllm_v2/TinyLlama/TinyLlama-1.1B-Chat-v0.1/50389350-af23-41ba-af46-5ffe338ff9d2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/TinyLlama_TinyLlama-1.1B-Chat-v0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "TinyLlama-1.1B-Chat-v0.1", - "id": "TinyLlama/TinyLlama-1.1B-Chat-v0.1", - "developer": "TinyLlama", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.1 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1479 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3084 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.006 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.229 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3592 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1098 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/TinyLlama/TinyLlama-1.1B-Chat-v0.5/b8f8f045-2306-43ad-8fa0-6a8bdb494db6.json b/data/hfopenllm_v2/TinyLlama/TinyLlama-1.1B-Chat-v0.5/b8f8f045-2306-43ad-8fa0-6a8bdb494db6.json deleted file mode 100644 index f0a1f9188..000000000 --- a/data/hfopenllm_v2/TinyLlama/TinyLlama-1.1B-Chat-v0.5/b8f8f045-2306-43ad-8fa0-6a8bdb494db6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/TinyLlama_TinyLlama-1.1B-Chat-v0.5/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "TinyLlama-1.1B-Chat-v0.5", - "id": "TinyLlama/TinyLlama-1.1B-Chat-v0.5", - "developer": "TinyLlama", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.1 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1634 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3105 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0038 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": 
"GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2483 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3661 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1096 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/TinyLlama/TinyLlama-1.1B-Chat-v0.6/7cd59011-75d7-4497-956c-322d5d609c5f.json b/data/hfopenllm_v2/TinyLlama/TinyLlama-1.1B-Chat-v0.6/7cd59011-75d7-4497-956c-322d5d609c5f.json deleted file mode 100644 index 07b269a48..000000000 --- a/data/hfopenllm_v2/TinyLlama/TinyLlama-1.1B-Chat-v0.6/7cd59011-75d7-4497-956c-322d5d609c5f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/TinyLlama_TinyLlama-1.1B-Chat-v0.6/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "TinyLlama-1.1B-Chat-v0.6", - "id": "TinyLlama/TinyLlama-1.1B-Chat-v0.6", - "developer": "TinyLlama", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.1 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1574 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3067 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0159 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 
0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2584 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3422 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1149 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/TinyLlama/TinyLlama-1.1B-Chat-v1.0/1313d865-9c5b-45d2-ad64-629c65f07f2c.json b/data/hfopenllm_v2/TinyLlama/TinyLlama-1.1B-Chat-v1.0/1313d865-9c5b-45d2-ad64-629c65f07f2c.json deleted file mode 100644 index 03ae524fb..000000000 --- a/data/hfopenllm_v2/TinyLlama/TinyLlama-1.1B-Chat-v1.0/1313d865-9c5b-45d2-ad64-629c65f07f2c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/TinyLlama_TinyLlama-1.1B-Chat-v1.0/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "TinyLlama-1.1B-Chat-v1.0", - "id": "TinyLlama/TinyLlama-1.1B-Chat-v1.0", - "developer": "TinyLlama", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.1 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0596 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3104 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0151 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.25 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3515 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1101 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T/0efc2583-bf21-4b60-96cc-716928768eb1.json b/data/hfopenllm_v2/TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T/0efc2583-bf21-4b60-96cc-716928768eb1.json deleted file mode 100644 index 36550ed25..000000000 --- a/data/hfopenllm_v2/TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T/0efc2583-bf21-4b60-96cc-716928768eb1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/TinyLlama_TinyLlama-1.1B-intermediate-step-1431k-3T/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "TinyLlama-1.1B-intermediate-step-1431k-3T", - "id": "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T", - "developer": "TinyLlama", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.1 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2277 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3071 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0121 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2525 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.338 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.112 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/TinyLlama/TinyLlama_v1.1/be0a2737-19a0-4401-998a-a03663467133.json b/data/hfopenllm_v2/TinyLlama/TinyLlama_v1.1/be0a2737-19a0-4401-998a-a03663467133.json deleted file mode 100644 index 0ccc783b3..000000000 --- a/data/hfopenllm_v2/TinyLlama/TinyLlama_v1.1/be0a2737-19a0-4401-998a-a03663467133.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/TinyLlama_TinyLlama_v1.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "TinyLlama_v1.1", - "id": "TinyLlama/TinyLlama_v1.1", - "developer": "TinyLlama", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.1 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2001 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3024 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0121 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2458 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.37 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { 
- "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1049 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ToastyPigeon/Sto-vo-kor-12B/71720e07-2de0-4402-bdfd-102150c61765.json b/data/hfopenllm_v2/ToastyPigeon/Sto-vo-kor-12B/71720e07-2de0-4402-bdfd-102150c61765.json deleted file mode 100644 index 65bce7ce5..000000000 --- a/data/hfopenllm_v2/ToastyPigeon/Sto-vo-kor-12B/71720e07-2de0-4402-bdfd-102150c61765.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ToastyPigeon_Sto-vo-kor-12B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Sto-vo-kor-12B", - "id": "ToastyPigeon/Sto-vo-kor-12B", - "developer": "ToastyPigeon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5501 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5065 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1088 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3054 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3938 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3398 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/Trappu/Magnum-Picaro-0.7-v2-12b/38c84c69-5cdb-4f24-820d-4b39c5b118ff.json b/data/hfopenllm_v2/Trappu/Magnum-Picaro-0.7-v2-12b/38c84c69-5cdb-4f24-820d-4b39c5b118ff.json deleted file mode 100644 index c9c3a7878..000000000 --- a/data/hfopenllm_v2/Trappu/Magnum-Picaro-0.7-v2-12b/38c84c69-5cdb-4f24-820d-4b39c5b118ff.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Trappu_Magnum-Picaro-0.7-v2-12b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Magnum-Picaro-0.7-v2-12b", - "id": "Trappu/Magnum-Picaro-0.7-v2-12b", - "developer": "Trappu", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3003 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5507 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0665 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.323 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4727 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.358 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Trappu/Nemo-Picaro-12B/de9d274d-f213-4037-9711-3e9d3dbbcc96.json b/data/hfopenllm_v2/Trappu/Nemo-Picaro-12B/de9d274d-f213-4037-9711-3e9d3dbbcc96.json deleted file mode 100644 index 0162cd45d..000000000 --- 
a/data/hfopenllm_v2/Trappu/Nemo-Picaro-12B/de9d274d-f213-4037-9711-3e9d3dbbcc96.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Trappu_Nemo-Picaro-12B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Nemo-Picaro-12B", - "id": "Trappu/Nemo-Picaro-12B", - "developer": "Trappu", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2577 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.549 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0846 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3272 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4726 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3605 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Tremontaine/L3-12B-Lunaris-v1/92381da4-b9d1-43c4-a5c9-59f375017e11.json b/data/hfopenllm_v2/Tremontaine/L3-12B-Lunaris-v1/92381da4-b9d1-43c4-a5c9-59f375017e11.json deleted file mode 100644 index 8a5408f5a..000000000 --- a/data/hfopenllm_v2/Tremontaine/L3-12B-Lunaris-v1/92381da4-b9d1-43c4-a5c9-59f375017e11.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Tremontaine_L3-12B-Lunaris-v1/1770682486.623709", - "retrieved_timestamp": 
"1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "L3-12B-Lunaris-v1", - "id": "Tremontaine/L3-12B-Lunaris-v1", - "developer": "Tremontaine", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 11.52 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6909 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.523 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0876 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3096 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3674 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3775 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Triangle104/Annunaki-12b/44ab6a50-027d-47df-a518-5aa944eb2a61.json b/data/hfopenllm_v2/Triangle104/Annunaki-12b/44ab6a50-027d-47df-a518-5aa944eb2a61.json deleted file mode 100644 index d0a2060be..000000000 --- a/data/hfopenllm_v2/Triangle104/Annunaki-12b/44ab6a50-027d-47df-a518-5aa944eb2a61.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Triangle104_Annunaki-12b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Annunaki-12b", - "id": 
"Triangle104/Annunaki-12b", - "developer": "Triangle104", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3872 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5499 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1216 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3213 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4409 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3721 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Triangle104/BigTalker-Lite-8B/2a1947d7-74e0-43d0-931d-b2862348e90a.json b/data/hfopenllm_v2/Triangle104/BigTalker-Lite-8B/2a1947d7-74e0-43d0-931d-b2862348e90a.json deleted file mode 100644 index c521e8873..000000000 --- a/data/hfopenllm_v2/Triangle104/BigTalker-Lite-8B/2a1947d7-74e0-43d0-931d-b2862348e90a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Triangle104_BigTalker-Lite-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "BigTalker-Lite-8B", - "id": "Triangle104/BigTalker-Lite-8B", - "developer": "Triangle104", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ 
- { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3689 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5308 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.102 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3104 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4208 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3431 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Triangle104/Chatty-Harry_V2.0/3677b71c-387d-4182-b15d-c3525bc7bc36.json b/data/hfopenllm_v2/Triangle104/Chatty-Harry_V2.0/3677b71c-387d-4182-b15d-c3525bc7bc36.json deleted file mode 100644 index 2703e6bd2..000000000 --- a/data/hfopenllm_v2/Triangle104/Chatty-Harry_V2.0/3677b71c-387d-4182-b15d-c3525bc7bc36.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Triangle104_Chatty-Harry_V2.0/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Chatty-Harry_V2.0", - "id": "Triangle104/Chatty-Harry_V2.0", - "developer": "Triangle104", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, 
- "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3326 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5319 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.139 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.323 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4078 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3683 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Triangle104/Chatty-Harry_V3.0/6b125a8e-5b53-48ca-8875-926249879f39.json b/data/hfopenllm_v2/Triangle104/Chatty-Harry_V3.0/6b125a8e-5b53-48ca-8875-926249879f39.json deleted file mode 100644 index 58fe5380d..000000000 --- a/data/hfopenllm_v2/Triangle104/Chatty-Harry_V3.0/6b125a8e-5b53-48ca-8875-926249879f39.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Triangle104_Chatty-Harry_V3.0/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Chatty-Harry_V3.0", - "id": "Triangle104/Chatty-Harry_V3.0", - "developer": "Triangle104", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3675 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": 
"SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5526 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1125 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.323 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4408 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3702 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Triangle104/Chronos-Prism_V1.0/af851d4b-69d4-49a9-a160-a180146c3963.json b/data/hfopenllm_v2/Triangle104/Chronos-Prism_V1.0/af851d4b-69d4-49a9-a160-a180146c3963.json deleted file mode 100644 index a35b89646..000000000 --- a/data/hfopenllm_v2/Triangle104/Chronos-Prism_V1.0/af851d4b-69d4-49a9-a160-a180146c3963.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Triangle104_Chronos-Prism_V1.0/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Chronos-Prism_V1.0", - "id": "Triangle104/Chronos-Prism_V1.0", - "developer": "Triangle104", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3259 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5554 - } - }, 
- { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1201 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3096 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4263 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3673 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Triangle104/DS-Distilled-Hermes-Llama-3.1/7aa6ce37-c0e4-48ce-b9db-f158ac47d366.json b/data/hfopenllm_v2/Triangle104/DS-Distilled-Hermes-Llama-3.1/7aa6ce37-c0e4-48ce-b9db-f158ac47d366.json deleted file mode 100644 index bb6dc9f21..000000000 --- a/data/hfopenllm_v2/Triangle104/DS-Distilled-Hermes-Llama-3.1/7aa6ce37-c0e4-48ce-b9db-f158ac47d366.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Triangle104_DS-Distilled-Hermes-Llama-3.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "DS-Distilled-Hermes-Llama-3.1", - "id": "Triangle104/DS-Distilled-Hermes-Llama-3.1", - "developer": "Triangle104", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3229 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5117 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - 
}, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2931 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3188 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4039 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.311 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Triangle104/DS-Distilled-Hermes-Llama-3.1_TIES/1bce093e-27c0-41ad-aad6-b656f6773ed5.json b/data/hfopenllm_v2/Triangle104/DS-Distilled-Hermes-Llama-3.1_TIES/1bce093e-27c0-41ad-aad6-b656f6773ed5.json deleted file mode 100644 index 5e892b1b2..000000000 --- a/data/hfopenllm_v2/Triangle104/DS-Distilled-Hermes-Llama-3.1_TIES/1bce093e-27c0-41ad-aad6-b656f6773ed5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Triangle104_DS-Distilled-Hermes-Llama-3.1_TIES/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "DS-Distilled-Hermes-Llama-3.1_TIES", - "id": "Triangle104/DS-Distilled-Hermes-Llama-3.1_TIES", - "developer": "Triangle104", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1364 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2928 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0091 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.245 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3621 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1104 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Triangle104/DS-R1-Distill-Q2.5-10B-Harmony/5c6cffab-ef72-4e12-808c-c26ee8ec6999.json b/data/hfopenllm_v2/Triangle104/DS-R1-Distill-Q2.5-10B-Harmony/5c6cffab-ef72-4e12-808c-c26ee8ec6999.json deleted file mode 100644 index 8e46d73b5..000000000 --- a/data/hfopenllm_v2/Triangle104/DS-R1-Distill-Q2.5-10B-Harmony/5c6cffab-ef72-4e12-808c-c26ee8ec6999.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Triangle104_DS-R1-Distill-Q2.5-10B-Harmony/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "DS-R1-Distill-Q2.5-10B-Harmony", - "id": "Triangle104/DS-R1-Distill-Q2.5-10B-Harmony", - "developer": "Triangle104", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 10.366 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1751 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2643 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - 
"source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2106 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3128 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1173 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Triangle104/DS-R1-Distill-Q2.5-14B-Harmony_V0.1/e288a874-f750-4a90-be07-616094c220cf.json b/data/hfopenllm_v2/Triangle104/DS-R1-Distill-Q2.5-14B-Harmony_V0.1/e288a874-f750-4a90-be07-616094c220cf.json deleted file mode 100644 index 350f712ec..000000000 --- a/data/hfopenllm_v2/Triangle104/DS-R1-Distill-Q2.5-14B-Harmony_V0.1/e288a874-f750-4a90-be07-616094c220cf.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Triangle104_DS-R1-Distill-Q2.5-14B-Harmony_V0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "DS-R1-Distill-Q2.5-14B-Harmony_V0.1", - "id": "Triangle104/DS-R1-Distill-Q2.5-14B-Harmony_V0.1", - "developer": "Triangle104", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4515 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5783 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5551 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3935 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5567 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4601 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Triangle104/DS-R1-Distill-Q2.5-7B-RP/0607da8d-3f4e-468a-91a6-b975261a87c0.json b/data/hfopenllm_v2/Triangle104/DS-R1-Distill-Q2.5-7B-RP/0607da8d-3f4e-468a-91a6-b975261a87c0.json deleted file mode 100644 index 0817ae888..000000000 --- a/data/hfopenllm_v2/Triangle104/DS-R1-Distill-Q2.5-7B-RP/0607da8d-3f4e-468a-91a6-b975261a87c0.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Triangle104_DS-R1-Distill-Q2.5-7B-RP/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "DS-R1-Distill-Q2.5-7B-RP", - "id": "Triangle104/DS-R1-Distill-Q2.5-7B-RP", - "developer": "Triangle104", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3445 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4383 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4683 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3138 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - 
"dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.403 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2891 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Triangle104/DS-R1-Llama-8B-Harmony/be2cc2fd-c8e7-4421-b8c8-d3b937272d0d.json b/data/hfopenllm_v2/Triangle104/DS-R1-Llama-8B-Harmony/be2cc2fd-c8e7-4421-b8c8-d3b937272d0d.json deleted file mode 100644 index c407ea12b..000000000 --- a/data/hfopenllm_v2/Triangle104/DS-R1-Llama-8B-Harmony/be2cc2fd-c8e7-4421-b8c8-d3b937272d0d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Triangle104_DS-R1-Llama-8B-Harmony/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "DS-R1-Llama-8B-Harmony", - "id": "Triangle104/DS-R1-Llama-8B-Harmony", - "developer": "Triangle104", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3566 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4154 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4282 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2919 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3762 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2744 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Triangle104/DSR1-Distill-Llama-Lit-8B/15ffe64e-72fd-4e65-8632-babf137a386d.json b/data/hfopenllm_v2/Triangle104/DSR1-Distill-Llama-Lit-8B/15ffe64e-72fd-4e65-8632-babf137a386d.json deleted file mode 100644 index db0609b3f..000000000 --- a/data/hfopenllm_v2/Triangle104/DSR1-Distill-Llama-Lit-8B/15ffe64e-72fd-4e65-8632-babf137a386d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Triangle104_DSR1-Distill-Llama-Lit-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "DSR1-Distill-Llama-Lit-8B", - "id": "Triangle104/DSR1-Distill-Llama-Lit-8B", - "developer": "Triangle104", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1885 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4284 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.352 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3029 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3535 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": 
"hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2798 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Triangle104/DSR1-Distill-Qwen-7B-RP/ce1c0d4f-f5a3-49e7-ab77-65ff51bbd0ca.json b/data/hfopenllm_v2/Triangle104/DSR1-Distill-Qwen-7B-RP/ce1c0d4f-f5a3-49e7-ab77-65ff51bbd0ca.json deleted file mode 100644 index 78fbf1df3..000000000 --- a/data/hfopenllm_v2/Triangle104/DSR1-Distill-Qwen-7B-RP/ce1c0d4f-f5a3-49e7-ab77-65ff51bbd0ca.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Triangle104_DSR1-Distill-Qwen-7B-RP/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "DSR1-Distill-Qwen-7B-RP", - "id": "Triangle104/DSR1-Distill-Qwen-7B-RP", - "developer": "Triangle104", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.613 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3609 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4326 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4804 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3196 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4045 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.3028 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Triangle104/Dark-Chivalry_V1.0/b5afab38-13ba-4abd-9d04-a433c41061c5.json b/data/hfopenllm_v2/Triangle104/Dark-Chivalry_V1.0/b5afab38-13ba-4abd-9d04-a433c41061c5.json deleted file mode 100644 index 48178e07b..000000000 --- a/data/hfopenllm_v2/Triangle104/Dark-Chivalry_V1.0/b5afab38-13ba-4abd-9d04-a433c41061c5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Triangle104_Dark-Chivalry_V1.0/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Dark-Chivalry_V1.0", - "id": "Triangle104/Dark-Chivalry_V1.0", - "developer": "Triangle104", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4326 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4974 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1314 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2936 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4182 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3444 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Triangle104/Distilled-DarkPlanet-Allades-8B/a862c2a5-f66b-4d09-ac57-6cbe565f9f35.json 
b/data/hfopenllm_v2/Triangle104/Distilled-DarkPlanet-Allades-8B/a862c2a5-f66b-4d09-ac57-6cbe565f9f35.json deleted file mode 100644 index b8c909e09..000000000 --- a/data/hfopenllm_v2/Triangle104/Distilled-DarkPlanet-Allades-8B/a862c2a5-f66b-4d09-ac57-6cbe565f9f35.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Triangle104_Distilled-DarkPlanet-Allades-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Distilled-DarkPlanet-Allades-8B", - "id": "Triangle104/Distilled-DarkPlanet-Allades-8B", - "developer": "Triangle104", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.346 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4634 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4003 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3054 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3538 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2901 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Triangle104/Distilled-DarkPlanet-Allades-8B_TIES/d8254f6c-8110-44d3-800e-101fc731d779.json b/data/hfopenllm_v2/Triangle104/Distilled-DarkPlanet-Allades-8B_TIES/d8254f6c-8110-44d3-800e-101fc731d779.json deleted file mode 100644 index 
ab8e49bea..000000000 --- a/data/hfopenllm_v2/Triangle104/Distilled-DarkPlanet-Allades-8B_TIES/d8254f6c-8110-44d3-800e-101fc731d779.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Triangle104_Distilled-DarkPlanet-Allades-8B_TIES/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Distilled-DarkPlanet-Allades-8B_TIES", - "id": "Triangle104/Distilled-DarkPlanet-Allades-8B_TIES", - "developer": "Triangle104", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3892 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5042 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0906 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3146 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3868 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3401 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Triangle104/Distilled-Whiskey-8b/ccbcd5a7-2b98-4d90-ace1-3ad5971a5f18.json b/data/hfopenllm_v2/Triangle104/Distilled-Whiskey-8b/ccbcd5a7-2b98-4d90-ace1-3ad5971a5f18.json deleted file mode 100644 index 56ce9bac6..000000000 --- a/data/hfopenllm_v2/Triangle104/Distilled-Whiskey-8b/ccbcd5a7-2b98-4d90-ace1-3ad5971a5f18.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - 
"schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Triangle104_Distilled-Whiskey-8b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Distilled-Whiskey-8b", - "id": "Triangle104/Distilled-Whiskey-8b", - "developer": "Triangle104", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3448 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5028 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2545 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3314 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4172 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3367 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Triangle104/Dolphin3-Llama3.2-Smart/c208b19b-4ecf-4fad-b931-54f65d4b711b.json b/data/hfopenllm_v2/Triangle104/Dolphin3-Llama3.2-Smart/c208b19b-4ecf-4fad-b931-54f65d4b711b.json deleted file mode 100644 index b54a4d6b2..000000000 --- a/data/hfopenllm_v2/Triangle104/Dolphin3-Llama3.2-Smart/c208b19b-4ecf-4fad-b931-54f65d4b711b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Triangle104_Dolphin3-Llama3.2-Smart/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - 
"source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Dolphin3-Llama3.2-Smart", - "id": "Triangle104/Dolphin3-Llama3.2-Smart", - "developer": "Triangle104", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4137 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3975 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0438 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2693 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3922 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2195 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Triangle104/Gemmadevi-Stock-10B/debaf4a0-c734-47ea-bea0-2ddc65dc397d.json b/data/hfopenllm_v2/Triangle104/Gemmadevi-Stock-10B/debaf4a0-c734-47ea-bea0-2ddc65dc397d.json deleted file mode 100644 index edaf3ab86..000000000 --- a/data/hfopenllm_v2/Triangle104/Gemmadevi-Stock-10B/debaf4a0-c734-47ea-bea0-2ddc65dc397d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Triangle104_Gemmadevi-Stock-10B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gemmadevi-Stock-10B", - "id": "Triangle104/Gemmadevi-Stock-10B", - 
"developer": "Triangle104", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1582 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6066 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0967 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3532 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4621 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4262 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Triangle104/Hermes-Llama-3.2-CoT-Summary/0eeb5962-ccc0-407b-92e6-7cf17c00941f.json b/data/hfopenllm_v2/Triangle104/Hermes-Llama-3.2-CoT-Summary/0eeb5962-ccc0-407b-92e6-7cf17c00941f.json deleted file mode 100644 index 2948054fa..000000000 --- a/data/hfopenllm_v2/Triangle104/Hermes-Llama-3.2-CoT-Summary/0eeb5962-ccc0-407b-92e6-7cf17c00941f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Triangle104_Hermes-Llama-3.2-CoT-Summary/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Hermes-Llama-3.2-CoT-Summary", - "id": "Triangle104/Hermes-Llama-3.2-CoT-Summary", - "developer": "Triangle104", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 
- } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.483 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.42 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0831 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2559 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3575 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2901 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Triangle104/Hermes-Llama-3.2-CoT/4b60e863-482c-4f91-8cd1-6c993d3c5988.json b/data/hfopenllm_v2/Triangle104/Hermes-Llama-3.2-CoT/4b60e863-482c-4f91-8cd1-6c993d3c5988.json deleted file mode 100644 index 156711b1f..000000000 --- a/data/hfopenllm_v2/Triangle104/Hermes-Llama-3.2-CoT/4b60e863-482c-4f91-8cd1-6c993d3c5988.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Triangle104_Hermes-Llama-3.2-CoT/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Hermes-Llama-3.2-CoT", - "id": "Triangle104/Hermes-Llama-3.2-CoT", - "developer": "Triangle104", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": 
"Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4178 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4616 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0952 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2794 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3698 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2947 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Triangle104/Hermes3-L3.1-DirtyHarry-8B/f5f0bc72-427d-4703-aab1-1bb1bea73895.json b/data/hfopenllm_v2/Triangle104/Hermes3-L3.1-DirtyHarry-8B/f5f0bc72-427d-4703-aab1-1bb1bea73895.json deleted file mode 100644 index 92f5b47d3..000000000 --- a/data/hfopenllm_v2/Triangle104/Hermes3-L3.1-DirtyHarry-8B/f5f0bc72-427d-4703-aab1-1bb1bea73895.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Triangle104_Hermes3-L3.1-DirtyHarry-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Hermes3-L3.1-DirtyHarry-8B", - "id": "Triangle104/Hermes3-L3.1-DirtyHarry-8B", - "developer": "Triangle104", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3242 - } - }, - { - 
"evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5066 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0718 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.302 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4069 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3339 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Triangle104/Herodotos-14B/aae7f543-7b5b-435f-a506-e3ab901a8c5a.json b/data/hfopenllm_v2/Triangle104/Herodotos-14B/aae7f543-7b5b-435f-a506-e3ab901a8c5a.json deleted file mode 100644 index 1a742c4be..000000000 --- a/data/hfopenllm_v2/Triangle104/Herodotos-14B/aae7f543-7b5b-435f-a506-e3ab901a8c5a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Triangle104_Herodotos-14B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Herodotos-14B", - "id": "Triangle104/Herodotos-14B", - "developer": "Triangle104", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4667 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6435 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5045 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3733 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4795 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.529 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Triangle104/Herodotos-14B_V0.1/6e6ff4c3-3cfd-4790-80c4-544d9cbe47e2.json b/data/hfopenllm_v2/Triangle104/Herodotos-14B_V0.1/6e6ff4c3-3cfd-4790-80c4-544d9cbe47e2.json deleted file mode 100644 index cc2ae0230..000000000 --- a/data/hfopenllm_v2/Triangle104/Herodotos-14B_V0.1/6e6ff4c3-3cfd-4790-80c4-544d9cbe47e2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Triangle104_Herodotos-14B_V0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Herodotos-14B_V0.1", - "id": "Triangle104/Herodotos-14B_V0.1", - "developer": "Triangle104", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1879 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3017 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": 
"DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.224 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3684 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1164 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Triangle104/L3.1-8B-Dusky-Ink/3ee76278-89d4-44fb-a449-717534b00161.json b/data/hfopenllm_v2/Triangle104/L3.1-8B-Dusky-Ink/3ee76278-89d4-44fb-a449-717534b00161.json deleted file mode 100644 index c76643578..000000000 --- a/data/hfopenllm_v2/Triangle104/L3.1-8B-Dusky-Ink/3ee76278-89d4-44fb-a449-717534b00161.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Triangle104_L3.1-8B-Dusky-Ink/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "L3.1-8B-Dusky-Ink", - "id": "Triangle104/L3.1-8B-Dusky-Ink", - "developer": "Triangle104", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.453 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5098 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.1231 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2894 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4224 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3683 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Triangle104/L3.1-8B-Dusky-Ink_v0.r1/fa2854d3-9e2f-4f79-ac8c-e1cb5a638745.json b/data/hfopenllm_v2/Triangle104/L3.1-8B-Dusky-Ink_v0.r1/fa2854d3-9e2f-4f79-ac8c-e1cb5a638745.json deleted file mode 100644 index 021c2c4e6..000000000 --- a/data/hfopenllm_v2/Triangle104/L3.1-8B-Dusky-Ink_v0.r1/fa2854d3-9e2f-4f79-ac8c-e1cb5a638745.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Triangle104_L3.1-8B-Dusky-Ink_v0.r1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "L3.1-8B-Dusky-Ink_v0.r1", - "id": "Triangle104/L3.1-8B-Dusky-Ink_v0.r1", - "developer": "Triangle104", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1985 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4337 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0431 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3037 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3988 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3206 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Triangle104/LThreePointOne-8B-HermesBlackroot/9ddaa721-bf3a-416a-9be8-291188793cc9.json b/data/hfopenllm_v2/Triangle104/LThreePointOne-8B-HermesBlackroot/9ddaa721-bf3a-416a-9be8-291188793cc9.json deleted file mode 100644 index e187b6fe9..000000000 --- a/data/hfopenllm_v2/Triangle104/LThreePointOne-8B-HermesBlackroot/9ddaa721-bf3a-416a-9be8-291188793cc9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Triangle104_LThreePointOne-8B-HermesBlackroot/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "LThreePointOne-8B-HermesBlackroot", - "id": "Triangle104/LThreePointOne-8B-HermesBlackroot", - "developer": "Triangle104", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1792 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4998 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0196 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.307 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3586 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3285 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Triangle104/LThreePointOne-8B-HermesInk/d659077d-7261-4c69-862c-d61be21662a2.json b/data/hfopenllm_v2/Triangle104/LThreePointOne-8B-HermesInk/d659077d-7261-4c69-862c-d61be21662a2.json deleted file mode 100644 index d949801cc..000000000 --- a/data/hfopenllm_v2/Triangle104/LThreePointOne-8B-HermesInk/d659077d-7261-4c69-862c-d61be21662a2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Triangle104_LThreePointOne-8B-HermesInk/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "LThreePointOne-8B-HermesInk", - "id": "Triangle104/LThreePointOne-8B-HermesInk", - "developer": "Triangle104", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4031 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5223 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1722 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.323 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4129 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3467 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Triangle104/Llama3.1-Allades-Lit-8b/e87ba227-c55e-4666-949d-b45913f8336b.json b/data/hfopenllm_v2/Triangle104/Llama3.1-Allades-Lit-8b/e87ba227-c55e-4666-949d-b45913f8336b.json deleted file mode 100644 index 8839950e8..000000000 --- a/data/hfopenllm_v2/Triangle104/Llama3.1-Allades-Lit-8b/e87ba227-c55e-4666-949d-b45913f8336b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Triangle104_Llama3.1-Allades-Lit-8b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama3.1-Allades-Lit-8b", - "id": "Triangle104/Llama3.1-Allades-Lit-8b", - "developer": "Triangle104", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2461 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4183 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0023 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2844 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3708 - } 
- }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2724 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Triangle104/Llama3.1-cc-Lit-8b/077f683a-af6f-4a71-b599-b9b269546b7c.json b/data/hfopenllm_v2/Triangle104/Llama3.1-cc-Lit-8b/077f683a-af6f-4a71-b599-b9b269546b7c.json deleted file mode 100644 index 1c20850c6..000000000 --- a/data/hfopenllm_v2/Triangle104/Llama3.1-cc-Lit-8b/077f683a-af6f-4a71-b599-b9b269546b7c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Triangle104_Llama3.1-cc-Lit-8b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama3.1-cc-Lit-8b", - "id": "Triangle104/Llama3.1-cc-Lit-8b", - "developer": "Triangle104", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2993 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3848 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.003 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2777 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3854 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3004 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Triangle104/Minerva-1.5b/54808b08-d10d-4a06-ab60-8d99039311b8.json b/data/hfopenllm_v2/Triangle104/Minerva-1.5b/54808b08-d10d-4a06-ab60-8d99039311b8.json deleted file mode 100644 index fc5d9a10e..000000000 --- a/data/hfopenllm_v2/Triangle104/Minerva-1.5b/54808b08-d10d-4a06-ab60-8d99039311b8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Triangle104_Minerva-1.5b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Minerva-1.5b", - "id": "Triangle104/Minerva-1.5b", - "developer": "Triangle104", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.777 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2694 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4026 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1027 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3104 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3655 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2698 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/Triangle104/Minerva-1.5b_V0.2/138e6fdb-7092-4ee6-be82-7bb86c1fc759.json b/data/hfopenllm_v2/Triangle104/Minerva-1.5b_V0.2/138e6fdb-7092-4ee6-be82-7bb86c1fc759.json deleted file mode 100644 index 5d8751a69..000000000 --- a/data/hfopenllm_v2/Triangle104/Minerva-1.5b_V0.2/138e6fdb-7092-4ee6-be82-7bb86c1fc759.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Triangle104_Minerva-1.5b_V0.2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Minerva-1.5b_V0.2", - "id": "Triangle104/Minerva-1.5b_V0.2", - "developer": "Triangle104", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.777 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3083 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3989 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.114 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2852 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.396 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2911 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Triangle104/Minerva-10b/1b27423f-62cc-4189-a293-5af84ef1f2c8.json b/data/hfopenllm_v2/Triangle104/Minerva-10b/1b27423f-62cc-4189-a293-5af84ef1f2c8.json deleted file mode 100644 index 90b1e2629..000000000 --- 
a/data/hfopenllm_v2/Triangle104/Minerva-10b/1b27423f-62cc-4189-a293-5af84ef1f2c8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Triangle104_Minerva-10b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Minerva-10b", - "id": "Triangle104/Minerva-10b", - "developer": "Triangle104", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 10.067 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1879 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4462 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.281 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3627 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2318 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Triangle104/Minerva-14b-V0.1/f5468512-d2c7-4486-9d31-bef61225af52.json b/data/hfopenllm_v2/Triangle104/Minerva-14b-V0.1/f5468512-d2c7-4486-9d31-bef61225af52.json deleted file mode 100644 index 4c579dddf..000000000 --- a/data/hfopenllm_v2/Triangle104/Minerva-14b-V0.1/f5468512-d2c7-4486-9d31-bef61225af52.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Triangle104_Minerva-14b-V0.1/1770682486.623709", - "retrieved_timestamp": 
"1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Minerva-14b-V0.1", - "id": "Triangle104/Minerva-14b-V0.1", - "developer": "Triangle104", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0861 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.609 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3051 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3658 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.47 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5118 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Triangle104/Minerva-14b/0e0ec1a9-76aa-4d7e-9c0e-946d6b000a6a.json b/data/hfopenllm_v2/Triangle104/Minerva-14b/0e0ec1a9-76aa-4d7e-9c0e-946d6b000a6a.json deleted file mode 100644 index 814ca6f10..000000000 --- a/data/hfopenllm_v2/Triangle104/Minerva-14b/0e0ec1a9-76aa-4d7e-9c0e-946d6b000a6a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Triangle104_Minerva-14b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Minerva-14b", - "id": 
"Triangle104/Minerva-14b", - "developer": "Triangle104", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3468 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6301 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3051 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3742 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4766 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5194 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Triangle104/Minerva-7b/07b87b98-0d61-4479-937f-7447565b4631.json b/data/hfopenllm_v2/Triangle104/Minerva-7b/07b87b98-0d61-4479-937f-7447565b4631.json deleted file mode 100644 index 6e62d362b..000000000 --- a/data/hfopenllm_v2/Triangle104/Minerva-7b/07b87b98-0d61-4479-937f-7447565b4631.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Triangle104_Minerva-7b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Minerva-7b", - "id": "Triangle104/Minerva-7b", - "developer": "Triangle104", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - 
"source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3724 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5498 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.284 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.323 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4143 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4444 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Triangle104/Minerva-8b/85b11b91-d686-49e9-8db0-971dd7cafb75.json b/data/hfopenllm_v2/Triangle104/Minerva-8b/85b11b91-d686-49e9-8db0-971dd7cafb75.json deleted file mode 100644 index fa931f160..000000000 --- a/data/hfopenllm_v2/Triangle104/Minerva-8b/85b11b91-d686-49e9-8db0-971dd7cafb75.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Triangle104_Minerva-8b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Minerva-8b", - "id": "Triangle104/Minerva-8b", - "developer": "Triangle104", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.1721 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4669 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0045 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3121 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4273 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3089 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Triangle104/Mistral-Redemption-Arc/21bac032-a092-4afa-8d29-ebdefb3a0650.json b/data/hfopenllm_v2/Triangle104/Mistral-Redemption-Arc/21bac032-a092-4afa-8d29-ebdefb3a0650.json deleted file mode 100644 index 5d71ad3d3..000000000 --- a/data/hfopenllm_v2/Triangle104/Mistral-Redemption-Arc/21bac032-a092-4afa-8d29-ebdefb3a0650.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Triangle104_Mistral-Redemption-Arc/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral-Redemption-Arc", - "id": "Triangle104/Mistral-Redemption-Arc", - "developer": "Triangle104", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 23.572 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4029 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6255 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4101 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3473 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4595 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.451 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Triangle104/Mistral-Small-24b-Harmony/29e3a687-429f-4f33-ae5f-48db85127364.json b/data/hfopenllm_v2/Triangle104/Mistral-Small-24b-Harmony/29e3a687-429f-4f33-ae5f-48db85127364.json deleted file mode 100644 index 9760eb31d..000000000 --- a/data/hfopenllm_v2/Triangle104/Mistral-Small-24b-Harmony/29e3a687-429f-4f33-ae5f-48db85127364.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Triangle104_Mistral-Small-24b-Harmony/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral-Small-24b-Harmony", - "id": "Triangle104/Mistral-Small-24b-Harmony", - "developer": "Triangle104", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 23.572 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1687 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6434 - } - }, - { 
- "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1911 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3842 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4276 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5431 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Triangle104/Pans_Gutenbergum_V0.1/d98493a6-f237-4565-8508-9e4cc3188d2d.json b/data/hfopenllm_v2/Triangle104/Pans_Gutenbergum_V0.1/d98493a6-f237-4565-8508-9e4cc3188d2d.json deleted file mode 100644 index 6cd483d58..000000000 --- a/data/hfopenllm_v2/Triangle104/Pans_Gutenbergum_V0.1/d98493a6-f237-4565-8508-9e4cc3188d2d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Triangle104_Pans_Gutenbergum_V0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Pans_Gutenbergum_V0.1", - "id": "Triangle104/Pans_Gutenbergum_V0.1", - "developer": "Triangle104", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3097 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5541 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - 
"evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1057 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.323 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4528 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3697 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Triangle104/Pans_Gutenbergum_V0.2/2def6fbd-7488-4e9f-a822-2405d4f7a315.json b/data/hfopenllm_v2/Triangle104/Pans_Gutenbergum_V0.2/2def6fbd-7488-4e9f-a822-2405d4f7a315.json deleted file mode 100644 index 9dd3b30ff..000000000 --- a/data/hfopenllm_v2/Triangle104/Pans_Gutenbergum_V0.2/2def6fbd-7488-4e9f-a822-2405d4f7a315.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Triangle104_Pans_Gutenbergum_V0.2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Pans_Gutenbergum_V0.2", - "id": "Triangle104/Pans_Gutenbergum_V0.2", - "developer": "Triangle104", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3215 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5526 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0687 - } - }, - { - 
"evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3121 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4673 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3585 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Triangle104/Pantheon_ChatWaifu_V0.2/819143d4-9538-48b9-b7af-128bc15c518a.json b/data/hfopenllm_v2/Triangle104/Pantheon_ChatWaifu_V0.2/819143d4-9538-48b9-b7af-128bc15c518a.json deleted file mode 100644 index 643b17008..000000000 --- a/data/hfopenllm_v2/Triangle104/Pantheon_ChatWaifu_V0.2/819143d4-9538-48b9-b7af-128bc15c518a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Triangle104_Pantheon_ChatWaifu_V0.2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Pantheon_ChatWaifu_V0.2", - "id": "Triangle104/Pantheon_ChatWaifu_V0.2", - "developer": "Triangle104", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2683 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5532 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0566 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.318 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4755 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3442 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Triangle104/Phi-4-AbliteratedRP/c29d47af-a9de-4edb-acac-6763c0d44ca3.json b/data/hfopenllm_v2/Triangle104/Phi-4-AbliteratedRP/c29d47af-a9de-4edb-acac-6763c0d44ca3.json deleted file mode 100644 index b786d9979..000000000 --- a/data/hfopenllm_v2/Triangle104/Phi-4-AbliteratedRP/c29d47af-a9de-4edb-acac-6763c0d44ca3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Triangle104_Phi-4-AbliteratedRP/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Phi-4-AbliteratedRP", - "id": "Triangle104/Phi-4-AbliteratedRP", - "developer": "Triangle104", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Phi3ForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4923 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6709 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3074 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3951 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - 
"source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5098 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5308 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Triangle104/Phi4-RP-o1-Ablit/22bf3fb7-9235-4a57-b8fd-c85b12047b0e.json b/data/hfopenllm_v2/Triangle104/Phi4-RP-o1-Ablit/22bf3fb7-9235-4a57-b8fd-c85b12047b0e.json deleted file mode 100644 index 82381bf4b..000000000 --- a/data/hfopenllm_v2/Triangle104/Phi4-RP-o1-Ablit/22bf3fb7-9235-4a57-b8fd-c85b12047b0e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Triangle104_Phi4-RP-o1-Ablit/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Phi4-RP-o1-Ablit", - "id": "Triangle104/Phi4-RP-o1-Ablit", - "developer": "Triangle104", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0239 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.663 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3882 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3633 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { 
- "score": 0.4754 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5105 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Triangle104/Phi4-RP-o1/2bea7014-460d-470b-918f-468b58d70fd6.json b/data/hfopenllm_v2/Triangle104/Phi4-RP-o1/2bea7014-460d-470b-918f-468b58d70fd6.json deleted file mode 100644 index 49827c7aa..000000000 --- a/data/hfopenllm_v2/Triangle104/Phi4-RP-o1/2bea7014-460d-470b-918f-468b58d70fd6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Triangle104_Phi4-RP-o1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Phi4-RP-o1", - "id": "Triangle104/Phi4-RP-o1", - "developer": "Triangle104", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.022 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6653 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3776 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3733 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4756 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5111 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Triangle104/Porpoise-R1-Llama3.2-3b/3927a5dd-002b-441a-b769-ba68547cd5f3.json b/data/hfopenllm_v2/Triangle104/Porpoise-R1-Llama3.2-3b/3927a5dd-002b-441a-b769-ba68547cd5f3.json deleted file mode 100644 index 341da9ea4..000000000 --- a/data/hfopenllm_v2/Triangle104/Porpoise-R1-Llama3.2-3b/3927a5dd-002b-441a-b769-ba68547cd5f3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Triangle104_Porpoise-R1-Llama3.2-3b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Porpoise-R1-Llama3.2-3b", - "id": "Triangle104/Porpoise-R1-Llama3.2-3b", - "developer": "Triangle104", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4352 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3824 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0423 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2668 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3576 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2117 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/Triangle104/Q2.5-14B-Instruct-1M-Harmony/476fc734-dedd-4192-aa59-eb2f9dabf16b.json b/data/hfopenllm_v2/Triangle104/Q2.5-14B-Instruct-1M-Harmony/476fc734-dedd-4192-aa59-eb2f9dabf16b.json deleted file mode 100644 index b32ca4aa3..000000000 --- a/data/hfopenllm_v2/Triangle104/Q2.5-14B-Instruct-1M-Harmony/476fc734-dedd-4192-aa59-eb2f9dabf16b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Triangle104_Q2.5-14B-Instruct-1M-Harmony/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Q2.5-14B-Instruct-1M-Harmony", - "id": "Triangle104/Q2.5-14B-Instruct-1M-Harmony", - "developer": "Triangle104", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5986 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6339 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3769 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.375 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4795 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5075 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Triangle104/Q2.5-AthensCOT/817e2fbe-0866-489f-b987-391228a68c53.json b/data/hfopenllm_v2/Triangle104/Q2.5-AthensCOT/817e2fbe-0866-489f-b987-391228a68c53.json 
deleted file mode 100644 index 46f11cded..000000000 --- a/data/hfopenllm_v2/Triangle104/Q2.5-AthensCOT/817e2fbe-0866-489f-b987-391228a68c53.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Triangle104_Q2.5-AthensCOT/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Q2.5-AthensCOT", - "id": "Triangle104/Q2.5-AthensCOT", - "developer": "Triangle104", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4573 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5542 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2915 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3003 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4578 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4379 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Triangle104/Q2.5-CodeR1-3B/f25f5eb1-ff22-4be3-a639-a9d25207078f.json b/data/hfopenllm_v2/Triangle104/Q2.5-CodeR1-3B/f25f5eb1-ff22-4be3-a639-a9d25207078f.json deleted file mode 100644 index 61a0bc9d8..000000000 --- a/data/hfopenllm_v2/Triangle104/Q2.5-CodeR1-3B/f25f5eb1-ff22-4be3-a639-a9d25207078f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/Triangle104_Q2.5-CodeR1-3B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Q2.5-CodeR1-3B", - "id": "Triangle104/Q2.5-CodeR1-3B", - "developer": "Triangle104", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.085 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3588 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4661 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1639 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3037 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4315 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2979 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Triangle104/Q2.5-EVACOT-7b/f71d1c31-184b-46be-a288-bdc92f0ebe09.json b/data/hfopenllm_v2/Triangle104/Q2.5-EVACOT-7b/f71d1c31-184b-46be-a288-bdc92f0ebe09.json deleted file mode 100644 index 2603a2922..000000000 --- a/data/hfopenllm_v2/Triangle104/Q2.5-EVACOT-7b/f71d1c31-184b-46be-a288-bdc92f0ebe09.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Triangle104_Q2.5-EVACOT-7b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - 
"evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Q2.5-EVACOT-7b", - "id": "Triangle104/Q2.5-EVACOT-7b", - "developer": "Triangle104", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5784 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5506 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2825 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.318 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4499 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4331 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Triangle104/Q2.5-EvaHumane-RP/0d9547b3-7bef-4815-9c44-7d714fe81bbb.json b/data/hfopenllm_v2/Triangle104/Q2.5-EvaHumane-RP/0d9547b3-7bef-4815-9c44-7d714fe81bbb.json deleted file mode 100644 index 356d6f7ee..000000000 --- a/data/hfopenllm_v2/Triangle104/Q2.5-EvaHumane-RP/0d9547b3-7bef-4815-9c44-7d714fe81bbb.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Triangle104_Q2.5-EvaHumane-RP/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Q2.5-EvaHumane-RP", - "id": "Triangle104/Q2.5-EvaHumane-RP", - "developer": "Triangle104", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - 
"architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3676 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5328 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2923 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3188 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4276 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4412 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Triangle104/Q2.5-Humane-RP/22dbc5a2-0ff6-4566-9bfd-e5ce314be597.json b/data/hfopenllm_v2/Triangle104/Q2.5-Humane-RP/22dbc5a2-0ff6-4566-9bfd-e5ce314be597.json deleted file mode 100644 index 99a18a092..000000000 --- a/data/hfopenllm_v2/Triangle104/Q2.5-Humane-RP/22dbc5a2-0ff6-4566-9bfd-e5ce314be597.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Triangle104_Q2.5-Humane-RP/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Q2.5-Humane-RP", - "id": "Triangle104/Q2.5-Humane-RP", - "developer": "Triangle104", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { 
- "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4412 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5649 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3391 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3188 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4528 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4492 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Triangle104/Q2.5-Instruct-1M_Harmony/afedb249-f1a5-42d6-b6c0-54b2cc303f64.json b/data/hfopenllm_v2/Triangle104/Q2.5-Instruct-1M_Harmony/afedb249-f1a5-42d6-b6c0-54b2cc303f64.json deleted file mode 100644 index cd9816ee7..000000000 --- a/data/hfopenllm_v2/Triangle104/Q2.5-Instruct-1M_Harmony/afedb249-f1a5-42d6-b6c0-54b2cc303f64.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Triangle104_Q2.5-Instruct-1M_Harmony/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Q2.5-Instruct-1M_Harmony", - "id": "Triangle104/Q2.5-Instruct-1M_Harmony", - "developer": "Triangle104", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6038 - } - }, - { - 
"evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5373 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3323 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.323 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4688 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4366 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Triangle104/Q2.5-R1-3B/61b1bf5e-6aa4-4e90-af2c-dcf5fc9903f2.json b/data/hfopenllm_v2/Triangle104/Q2.5-R1-3B/61b1bf5e-6aa4-4e90-af2c-dcf5fc9903f2.json deleted file mode 100644 index 7e4fafbbe..000000000 --- a/data/hfopenllm_v2/Triangle104/Q2.5-R1-3B/61b1bf5e-6aa4-4e90-af2c-dcf5fc9903f2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Triangle104_Q2.5-R1-3B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Q2.5-R1-3B", - "id": "Triangle104/Q2.5-R1-3B", - "developer": "Triangle104", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.085 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4214 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.4812 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2674 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3096 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.432 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3813 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Triangle104/Q2.5-R1-7B/c0adc04c-1e02-4891-a5a1-1fab0ddf18ca.json b/data/hfopenllm_v2/Triangle104/Q2.5-R1-7B/c0adc04c-1e02-4891-a5a1-1fab0ddf18ca.json deleted file mode 100644 index 7d190bdde..000000000 --- a/data/hfopenllm_v2/Triangle104/Q2.5-R1-7B/c0adc04c-1e02-4891-a5a1-1fab0ddf18ca.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Triangle104_Q2.5-R1-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Q2.5-R1-7B", - "id": "Triangle104/Q2.5-R1-7B", - "developer": "Triangle104", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.613 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1346 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3007 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - 
"evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0166 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2525 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3607 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.118 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Triangle104/Robo-Gutenberg_V1.0/cc57e6f0-ab55-4ab9-983c-63d74632d016.json b/data/hfopenllm_v2/Triangle104/Robo-Gutenberg_V1.0/cc57e6f0-ab55-4ab9-983c-63d74632d016.json deleted file mode 100644 index 96cf9ac6e..000000000 --- a/data/hfopenllm_v2/Triangle104/Robo-Gutenberg_V1.0/cc57e6f0-ab55-4ab9-983c-63d74632d016.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Triangle104_Robo-Gutenberg_V1.0/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Robo-Gutenberg_V1.0", - "id": "Triangle104/Robo-Gutenberg_V1.0", - "developer": "Triangle104", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6008 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6537 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4562 - } - }, - { - "evaluation_name": 
"GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3859 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4744 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5391 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Triangle104/Rocinante-Prism_V2.0/0d3c5fdb-c4a5-4436-b9d4-f0f42cb4db96.json b/data/hfopenllm_v2/Triangle104/Rocinante-Prism_V2.0/0d3c5fdb-c4a5-4436-b9d4-f0f42cb4db96.json deleted file mode 100644 index 670f07b44..000000000 --- a/data/hfopenllm_v2/Triangle104/Rocinante-Prism_V2.0/0d3c5fdb-c4a5-4436-b9d4-f0f42cb4db96.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Triangle104_Rocinante-Prism_V2.0/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Rocinante-Prism_V2.0", - "id": "Triangle104/Rocinante-Prism_V2.0", - "developer": "Triangle104", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2616 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5361 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.111 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3205 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.445 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.364 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Triangle104/Rocinante-Prism_V2.1/a6ec2934-e9fd-481d-8f00-932603bc6e0a.json b/data/hfopenllm_v2/Triangle104/Rocinante-Prism_V2.1/a6ec2934-e9fd-481d-8f00-932603bc6e0a.json deleted file mode 100644 index 5d588967a..000000000 --- a/data/hfopenllm_v2/Triangle104/Rocinante-Prism_V2.1/a6ec2934-e9fd-481d-8f00-932603bc6e0a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Triangle104_Rocinante-Prism_V2.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Rocinante-Prism_V2.1", - "id": "Triangle104/Rocinante-Prism_V2.1", - "developer": "Triangle104", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2558 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5333 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1125 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3196 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", 
- "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.449 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3651 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Triangle104/RomboHermes3-R1-Llama3.2-3b/e2553c93-60df-4126-9e64-ecd4a5003389.json b/data/hfopenllm_v2/Triangle104/RomboHermes3-R1-Llama3.2-3b/e2553c93-60df-4126-9e64-ecd4a5003389.json deleted file mode 100644 index 1d3c45558..000000000 --- a/data/hfopenllm_v2/Triangle104/RomboHermes3-R1-Llama3.2-3b/e2553c93-60df-4126-9e64-ecd4a5003389.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Triangle104_RomboHermes3-R1-Llama3.2-3b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "RomboHermes3-R1-Llama3.2-3b", - "id": "Triangle104/RomboHermes3-R1-Llama3.2-3b", - "developer": "Triangle104", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3007 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4264 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0816 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2836 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.3657 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2957 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Triangle104/Rombos-Novasky-7B_V1c/e7c2fb42-e82a-4dac-9cc3-a9f41ab54e0f.json b/data/hfopenllm_v2/Triangle104/Rombos-Novasky-7B_V1c/e7c2fb42-e82a-4dac-9cc3-a9f41ab54e0f.json deleted file mode 100644 index d45631435..000000000 --- a/data/hfopenllm_v2/Triangle104/Rombos-Novasky-7B_V1c/e7c2fb42-e82a-4dac-9cc3-a9f41ab54e0f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Triangle104_Rombos-Novasky-7B_V1c/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Rombos-Novasky-7B_V1c", - "id": "Triangle104/Rombos-Novasky-7B_V1c", - "developer": "Triangle104", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.408 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4349 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0853 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2961 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4465 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, 
- "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2738 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Triangle104/Set-70b/a807ee8c-509e-4b6d-a414-df24444d8a0a.json b/data/hfopenllm_v2/Triangle104/Set-70b/a807ee8c-509e-4b6d-a414-df24444d8a0a.json deleted file mode 100644 index c29b4d9d5..000000000 --- a/data/hfopenllm_v2/Triangle104/Set-70b/a807ee8c-509e-4b6d-a414-df24444d8a0a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Triangle104_Set-70b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Set-70b", - "id": "Triangle104/Set-70b", - "developer": "Triangle104", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 70.554 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7643 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7014 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.364 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4463 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4696 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5442 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/Tsunami-th/Tsunami-0.5-7B-Instruct/2199024b-7944-4950-8335-32a536efad02.json b/data/hfopenllm_v2/Tsunami-th/Tsunami-0.5-7B-Instruct/2199024b-7944-4950-8335-32a536efad02.json deleted file mode 100644 index 8d21eff8a..000000000 --- a/data/hfopenllm_v2/Tsunami-th/Tsunami-0.5-7B-Instruct/2199024b-7944-4950-8335-32a536efad02.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Tsunami-th_Tsunami-0.5-7B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Tsunami-0.5-7B-Instruct", - "id": "Tsunami-th/Tsunami-0.5-7B-Instruct", - "developer": "Tsunami-th", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.74 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5524 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5045 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3087 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4257 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4413 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Tsunami-th/Tsunami-0.5x-7B-Instruct/97919c86-6161-4548-95b9-d44263a29f8a.json b/data/hfopenllm_v2/Tsunami-th/Tsunami-0.5x-7B-Instruct/97919c86-6161-4548-95b9-d44263a29f8a.json deleted file mode 
100644 index 2f1a0d0d7..000000000 --- a/data/hfopenllm_v2/Tsunami-th/Tsunami-0.5x-7B-Instruct/97919c86-6161-4548-95b9-d44263a29f8a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Tsunami-th_Tsunami-0.5x-7B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Tsunami-0.5x-7B-Instruct", - "id": "Tsunami-th/Tsunami-0.5x-7B-Instruct", - "developer": "Tsunami-th", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7099 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5593 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4207 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3146 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4667 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4458 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Tsunami-th/Tsunami-1.0-14B-Instruct/c40c1a46-2e30-4cf1-bcf3-a316a793fbcd.json b/data/hfopenllm_v2/Tsunami-th/Tsunami-1.0-14B-Instruct/c40c1a46-2e30-4cf1-bcf3-a316a793fbcd.json deleted file mode 100644 index 640901714..000000000 --- a/data/hfopenllm_v2/Tsunami-th/Tsunami-1.0-14B-Instruct/c40c1a46-2e30-4cf1-bcf3-a316a793fbcd.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - 
"evaluation_id": "hfopenllm_v2/Tsunami-th_Tsunami-1.0-14B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Tsunami-1.0-14B-Instruct", - "id": "Tsunami-th/Tsunami-1.0-14B-Instruct", - "developer": "Tsunami-th", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7829 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6439 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4585 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3565 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4459 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5249 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Tsunami-th/Tsunami-1.0-7B-Instruct/c1294268-b5f5-4d64-b91a-147f58a21a47.json b/data/hfopenllm_v2/Tsunami-th/Tsunami-1.0-7B-Instruct/c1294268-b5f5-4d64-b91a-147f58a21a47.json deleted file mode 100644 index 25c1c6f74..000000000 --- a/data/hfopenllm_v2/Tsunami-th/Tsunami-1.0-7B-Instruct/c1294268-b5f5-4d64-b91a-147f58a21a47.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Tsunami-th_Tsunami-1.0-7B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": 
"documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Tsunami-1.0-7B-Instruct", - "id": "Tsunami-th/Tsunami-1.0-7B-Instruct", - "developer": "Tsunami-th", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7309 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5491 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4335 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3129 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4493 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4424 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/UCLA-AGI/Gemma-2-9B-It-SPPO-Iter1/2b029e6d-a0b8-4b6c-b62d-144b8dc4f739.json b/data/hfopenllm_v2/UCLA-AGI/Gemma-2-9B-It-SPPO-Iter1/2b029e6d-a0b8-4b6c-b62d-144b8dc4f739.json deleted file mode 100644 index b9787022d..000000000 --- a/data/hfopenllm_v2/UCLA-AGI/Gemma-2-9B-It-SPPO-Iter1/2b029e6d-a0b8-4b6c-b62d-144b8dc4f739.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/UCLA-AGI_Gemma-2-9B-It-SPPO-Iter1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gemma-2-9B-It-SPPO-Iter1", - "id": "UCLA-AGI/Gemma-2-9B-It-SPPO-Iter1", - 
"developer": "UCLA-AGI", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 9.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3082 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5969 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0899 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3364 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4099 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3907 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/UCLA-AGI/Gemma-2-9B-It-SPPO-Iter2/b926ca6c-60c9-4353-9671-0453b46d0222.json b/data/hfopenllm_v2/UCLA-AGI/Gemma-2-9B-It-SPPO-Iter2/b926ca6c-60c9-4353-9671-0453b46d0222.json deleted file mode 100644 index 51f77c00b..000000000 --- a/data/hfopenllm_v2/UCLA-AGI/Gemma-2-9B-It-SPPO-Iter2/b926ca6c-60c9-4353-9671-0453b46d0222.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/UCLA-AGI_Gemma-2-9B-It-SPPO-Iter2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gemma-2-9B-It-SPPO-Iter2", - "id": "UCLA-AGI/Gemma-2-9B-It-SPPO-Iter2", - "developer": "UCLA-AGI", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 9.242 - } - }, - "evaluation_results": [ - { - 
"evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.31 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.599 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0808 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3347 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4139 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.387 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/UCLA-AGI/Gemma-2-9B-It-SPPO-Iter3/44db30b4-2010-4f96-a39e-9ccc8568374f.json b/data/hfopenllm_v2/UCLA-AGI/Gemma-2-9B-It-SPPO-Iter3/44db30b4-2010-4f96-a39e-9ccc8568374f.json deleted file mode 100644 index 6ac45353a..000000000 --- a/data/hfopenllm_v2/UCLA-AGI/Gemma-2-9B-It-SPPO-Iter3/44db30b4-2010-4f96-a39e-9ccc8568374f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/UCLA-AGI_Gemma-2-9B-It-SPPO-Iter3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gemma-2-9B-It-SPPO-Iter3", - "id": "UCLA-AGI/Gemma-2-9B-It-SPPO-Iter3", - "developer": "UCLA-AGI", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 9.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3167 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6007 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.071 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3389 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4166 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3826 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/UCLA-AGI/Llama-3-Instruct-8B-SPPO-Iter1/2210d673-d417-46be-aeca-de48cd846e01.json b/data/hfopenllm_v2/UCLA-AGI/Llama-3-Instruct-8B-SPPO-Iter1/2210d673-d417-46be-aeca-de48cd846e01.json deleted file mode 100644 index 336128396..000000000 --- a/data/hfopenllm_v2/UCLA-AGI/Llama-3-Instruct-8B-SPPO-Iter1/2210d673-d417-46be-aeca-de48cd846e01.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/UCLA-AGI_Llama-3-Instruct-8B-SPPO-Iter1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-Instruct-8B-SPPO-Iter1", - "id": "UCLA-AGI/Llama-3-Instruct-8B-SPPO-Iter1", - "developer": "UCLA-AGI", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7299 - } - }, - { - "evaluation_name": "BBH", - 
"source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5058 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1148 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2676 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3568 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3711 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/UCLA-AGI/Llama-3-Instruct-8B-SPPO-Iter2/892d27cc-dfb3-40c7-ae0f-a7cd06784808.json b/data/hfopenllm_v2/UCLA-AGI/Llama-3-Instruct-8B-SPPO-Iter2/892d27cc-dfb3-40c7-ae0f-a7cd06784808.json deleted file mode 100644 index 22ac5bc50..000000000 --- a/data/hfopenllm_v2/UCLA-AGI/Llama-3-Instruct-8B-SPPO-Iter2/892d27cc-dfb3-40c7-ae0f-a7cd06784808.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/UCLA-AGI_Llama-3-Instruct-8B-SPPO-Iter2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-Instruct-8B-SPPO-Iter2", - "id": "UCLA-AGI/Llama-3-Instruct-8B-SPPO-Iter2", - "developer": "UCLA-AGI", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6989 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5089 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1035 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2668 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3594 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3692 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/UCLA-AGI/Llama-3-Instruct-8B-SPPO-Iter3/49b3f293-721d-4d44-9748-88d1ce275050.json b/data/hfopenllm_v2/UCLA-AGI/Llama-3-Instruct-8B-SPPO-Iter3/49b3f293-721d-4d44-9748-88d1ce275050.json deleted file mode 100644 index 973c80332..000000000 --- a/data/hfopenllm_v2/UCLA-AGI/Llama-3-Instruct-8B-SPPO-Iter3/49b3f293-721d-4d44-9748-88d1ce275050.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/UCLA-AGI_Llama-3-Instruct-8B-SPPO-Iter3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-Instruct-8B-SPPO-Iter3", - "id": "UCLA-AGI/Llama-3-Instruct-8B-SPPO-Iter3", - "developer": "UCLA-AGI", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6834 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.508 - } - }, - { - "evaluation_name": "MATH Level 5", 
- "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0959 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2651 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3661 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3644 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/UCLA-AGI/Llama-3-Instruct-8B-SPPO-Iter3/70fb41fe-46af-49e3-8270-5882e12f710f.json b/data/hfopenllm_v2/UCLA-AGI/Llama-3-Instruct-8B-SPPO-Iter3/70fb41fe-46af-49e3-8270-5882e12f710f.json deleted file mode 100644 index 9aab7af07..000000000 --- a/data/hfopenllm_v2/UCLA-AGI/Llama-3-Instruct-8B-SPPO-Iter3/70fb41fe-46af-49e3-8270-5882e12f710f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/UCLA-AGI_Llama-3-Instruct-8B-SPPO-Iter3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-Instruct-8B-SPPO-Iter3", - "id": "UCLA-AGI/Llama-3-Instruct-8B-SPPO-Iter3", - "developer": "UCLA-AGI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6703 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5076 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": 
"Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0718 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2651 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3647 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3658 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/UCLA-AGI/Mistral7B-PairRM-SPPO-Iter1/13e2489f-9d96-4f68-8e22-c937604c2145.json b/data/hfopenllm_v2/UCLA-AGI/Mistral7B-PairRM-SPPO-Iter1/13e2489f-9d96-4f68-8e22-c937604c2145.json deleted file mode 100644 index 44afb2750..000000000 --- a/data/hfopenllm_v2/UCLA-AGI/Mistral7B-PairRM-SPPO-Iter1/13e2489f-9d96-4f68-8e22-c937604c2145.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/UCLA-AGI_Mistral7B-PairRM-SPPO-Iter1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral7B-PairRM-SPPO-Iter1", - "id": "UCLA-AGI/Mistral7B-PairRM-SPPO-Iter1", - "developer": "UCLA-AGI", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5047 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4468 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0249 - } - }, - { - 
"evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2836 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3992 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2695 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/UCLA-AGI/Mistral7B-PairRM-SPPO-Iter2/0c386ea0-4706-4a6f-994c-b6ee21dbce92.json b/data/hfopenllm_v2/UCLA-AGI/Mistral7B-PairRM-SPPO-Iter2/0c386ea0-4706-4a6f-994c-b6ee21dbce92.json deleted file mode 100644 index b7fd6fe39..000000000 --- a/data/hfopenllm_v2/UCLA-AGI/Mistral7B-PairRM-SPPO-Iter2/0c386ea0-4706-4a6f-994c-b6ee21dbce92.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/UCLA-AGI_Mistral7B-PairRM-SPPO-Iter2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral7B-PairRM-SPPO-Iter2", - "id": "UCLA-AGI/Mistral7B-PairRM-SPPO-Iter2", - "developer": "UCLA-AGI", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4446 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4466 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0219 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on 
GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2886 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4085 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2677 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/UCLA-AGI/Mistral7B-PairRM-SPPO-Iter3/a8d5a193-6c87-4b5b-8ea3-b3ab78e73104.json b/data/hfopenllm_v2/UCLA-AGI/Mistral7B-PairRM-SPPO-Iter3/a8d5a193-6c87-4b5b-8ea3-b3ab78e73104.json deleted file mode 100644 index 34544a27f..000000000 --- a/data/hfopenllm_v2/UCLA-AGI/Mistral7B-PairRM-SPPO-Iter3/a8d5a193-6c87-4b5b-8ea3-b3ab78e73104.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/UCLA-AGI_Mistral7B-PairRM-SPPO-Iter3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral7B-PairRM-SPPO-Iter3", - "id": "UCLA-AGI/Mistral7B-PairRM-SPPO-Iter3", - "developer": "UCLA-AGI", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4351 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4397 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0234 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2752 - } - }, - { - "evaluation_name": "MUSR", - 
"source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4071 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2658 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/UCLA-AGI/Mistral7B-PairRM-SPPO/4018f4bd-492a-4814-9a7a-1f0c376f2d2e.json b/data/hfopenllm_v2/UCLA-AGI/Mistral7B-PairRM-SPPO/4018f4bd-492a-4814-9a7a-1f0c376f2d2e.json deleted file mode 100644 index d29d93827..000000000 --- a/data/hfopenllm_v2/UCLA-AGI/Mistral7B-PairRM-SPPO/4018f4bd-492a-4814-9a7a-1f0c376f2d2e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/UCLA-AGI_Mistral7B-PairRM-SPPO/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral7B-PairRM-SPPO", - "id": "UCLA-AGI/Mistral7B-PairRM-SPPO", - "developer": "UCLA-AGI", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4355 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4439 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.031 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.281 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3965 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2621 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/UKzExecution/LlamaExecutor-8B-3.0.5/568072cb-118d-41af-bfe8-fa14cb4c7348.json b/data/hfopenllm_v2/UKzExecution/LlamaExecutor-8B-3.0.5/568072cb-118d-41af-bfe8-fa14cb4c7348.json deleted file mode 100644 index d992c32d0..000000000 --- a/data/hfopenllm_v2/UKzExecution/LlamaExecutor-8B-3.0.5/568072cb-118d-41af-bfe8-fa14cb4c7348.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/UKzExecution_LlamaExecutor-8B-3.0.5/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "LlamaExecutor-8B-3.0.5", - "id": "UKzExecution/LlamaExecutor-8B-3.0.5", - "developer": "UKzExecution", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7403 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5006 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.102 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2559 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3754 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - 
"hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3625 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Unbabel/TowerInstruct-Mistral-7B-v0.2/a6d08766-8c36-41bf-8bbc-acdfdc3f8e23.json b/data/hfopenllm_v2/Unbabel/TowerInstruct-Mistral-7B-v0.2/a6d08766-8c36-41bf-8bbc-acdfdc3f8e23.json deleted file mode 100644 index 21f0ff869..000000000 --- a/data/hfopenllm_v2/Unbabel/TowerInstruct-Mistral-7B-v0.2/a6d08766-8c36-41bf-8bbc-acdfdc3f8e23.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Unbabel_TowerInstruct-Mistral-7B-v0.2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "TowerInstruct-Mistral-7B-v0.2", - "id": "Unbabel/TowerInstruct-Mistral-7B-v0.2", - "developer": "Unbabel", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2843 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3882 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0204 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2475 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4522 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.1968 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Undi95/MG-FinalMix-72B/2504fed5-c8a1-4ffc-8ce5-9559aa8c4325.json b/data/hfopenllm_v2/Undi95/MG-FinalMix-72B/2504fed5-c8a1-4ffc-8ce5-9559aa8c4325.json deleted file mode 100644 index c0dc3e377..000000000 --- a/data/hfopenllm_v2/Undi95/MG-FinalMix-72B/2504fed5-c8a1-4ffc-8ce5-9559aa8c4325.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Undi95_MG-FinalMix-72B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MG-FinalMix-72B", - "id": "Undi95/MG-FinalMix-72B", - "developer": "Undi95", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 72.706 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8014 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6973 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3973 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3851 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4823 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5427 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Undi95/Phi4-abliterated/359dde31-d9dc-4c22-b829-77df652dcc73.json b/data/hfopenllm_v2/Undi95/Phi4-abliterated/359dde31-d9dc-4c22-b829-77df652dcc73.json 
deleted file mode 100644 index 8b09b9cab..000000000 --- a/data/hfopenllm_v2/Undi95/Phi4-abliterated/359dde31-d9dc-4c22-b829-77df652dcc73.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Undi95_Phi4-abliterated/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Phi4-abliterated", - "id": "Undi95/Phi4-abliterated", - "developer": "Undi95", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Phi3ForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6618 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6809 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3701 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3305 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4034 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5281 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/V3N0M/Jenna-Tiny-2.0/34a79823-b993-402a-89a7-538e126ee02a.json b/data/hfopenllm_v2/V3N0M/Jenna-Tiny-2.0/34a79823-b993-402a-89a7-538e126ee02a.json deleted file mode 100644 index 05681b21a..000000000 --- a/data/hfopenllm_v2/V3N0M/Jenna-Tiny-2.0/34a79823-b993-402a-89a7-538e126ee02a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/V3N0M_Jenna-Tiny-2.0/1770682486.623709", - 
"retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Jenna-Tiny-2.0", - "id": "V3N0M/Jenna-Tiny-2.0", - "developer": "V3N0M", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 0.631 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2309 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3148 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0121 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.25 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3367 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1147 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/VAGOsolutions/Llama-3-SauerkrautLM-70b-Instruct/f392c5c3-9bee-4111-9a22-6a1b706fd2ad.json b/data/hfopenllm_v2/VAGOsolutions/Llama-3-SauerkrautLM-70b-Instruct/f392c5c3-9bee-4111-9a22-6a1b706fd2ad.json deleted file mode 100644 index a2a4784dd..000000000 --- a/data/hfopenllm_v2/VAGOsolutions/Llama-3-SauerkrautLM-70b-Instruct/f392c5c3-9bee-4111-9a22-6a1b706fd2ad.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/VAGOsolutions_Llama-3-SauerkrautLM-70b-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - 
"evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-SauerkrautLM-70b-Instruct", - "id": "VAGOsolutions/Llama-3-SauerkrautLM-70b-Instruct", - "developer": "VAGOsolutions", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 70.554 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8045 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6663 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2281 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.328 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4339 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5392 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/VAGOsolutions/Llama-3-SauerkrautLM-8b-Instruct/73bbdd22-4e5f-496b-b39f-290d8e0d2aa4.json b/data/hfopenllm_v2/VAGOsolutions/Llama-3-SauerkrautLM-8b-Instruct/73bbdd22-4e5f-496b-b39f-290d8e0d2aa4.json deleted file mode 100644 index af84212e1..000000000 --- a/data/hfopenllm_v2/VAGOsolutions/Llama-3-SauerkrautLM-8b-Instruct/73bbdd22-4e5f-496b-b39f-290d8e0d2aa4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/VAGOsolutions_Llama-3-SauerkrautLM-8b-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-SauerkrautLM-8b-Instruct", - "id": 
"VAGOsolutions/Llama-3-SauerkrautLM-8b-Instruct", - "developer": "VAGOsolutions", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7445 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4943 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0665 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3087 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4241 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3857 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/VAGOsolutions/Llama-3.1-SauerkrautLM-70b-Instruct/72a66eae-9c94-40e3-b3c9-211303e5cba8.json b/data/hfopenllm_v2/VAGOsolutions/Llama-3.1-SauerkrautLM-70b-Instruct/72a66eae-9c94-40e3-b3c9-211303e5cba8.json deleted file mode 100644 index c22f37f6c..000000000 --- a/data/hfopenllm_v2/VAGOsolutions/Llama-3.1-SauerkrautLM-70b-Instruct/72a66eae-9c94-40e3-b3c9-211303e5cba8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/VAGOsolutions_Llama-3.1-SauerkrautLM-70b-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.1-SauerkrautLM-70b-Instruct", - "id": "VAGOsolutions/Llama-3.1-SauerkrautLM-70b-Instruct", - "developer": "VAGOsolutions", - "inference_platform": "unknown", - 
"additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 70.554 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8656 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7006 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3693 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3414 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4711 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5335 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/VAGOsolutions/Llama-3.1-SauerkrautLM-8b-Instruct/ef7390b5-599b-4354-805b-9486e4ce34fa.json b/data/hfopenllm_v2/VAGOsolutions/Llama-3.1-SauerkrautLM-8b-Instruct/ef7390b5-599b-4354-805b-9486e4ce34fa.json deleted file mode 100644 index c83915153..000000000 --- a/data/hfopenllm_v2/VAGOsolutions/Llama-3.1-SauerkrautLM-8b-Instruct/ef7390b5-599b-4354-805b-9486e4ce34fa.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/VAGOsolutions_Llama-3.1-SauerkrautLM-8b-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.1-SauerkrautLM-8b-Instruct", - "id": "VAGOsolutions/Llama-3.1-SauerkrautLM-8b-Instruct", - "developer": "VAGOsolutions", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - 
"evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8017 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5115 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1941 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2903 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4148 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.389 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/VAGOsolutions/SauerkrautLM-1.5b/57f964c3-0504-4b60-9539-ce0e369816ea.json b/data/hfopenllm_v2/VAGOsolutions/SauerkrautLM-1.5b/57f964c3-0504-4b60-9539-ce0e369816ea.json deleted file mode 100644 index f33769597..000000000 --- a/data/hfopenllm_v2/VAGOsolutions/SauerkrautLM-1.5b/57f964c3-0504-4b60-9539-ce0e369816ea.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/VAGOsolutions_SauerkrautLM-1.5b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SauerkrautLM-1.5b", - "id": "VAGOsolutions/SauerkrautLM-1.5b", - "developer": "VAGOsolutions", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.544 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on 
IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2404 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3704 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0363 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.271 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3739 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2151 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/VAGOsolutions/SauerkrautLM-7b-HerO/4e6c0336-5d94-4417-a194-92a4d6f38481.json b/data/hfopenllm_v2/VAGOsolutions/SauerkrautLM-7b-HerO/4e6c0336-5d94-4417-a194-92a4d6f38481.json deleted file mode 100644 index dde4ae4ac..000000000 --- a/data/hfopenllm_v2/VAGOsolutions/SauerkrautLM-7b-HerO/4e6c0336-5d94-4417-a194-92a4d6f38481.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/VAGOsolutions_SauerkrautLM-7b-HerO/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SauerkrautLM-7b-HerO", - "id": "VAGOsolutions/SauerkrautLM-7b-HerO", - "developer": "VAGOsolutions", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5346 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - 
"dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4904 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0393 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2727 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3924 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3046 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/VAGOsolutions/SauerkrautLM-7b-LaserChat/fe38dea8-92f4-4fb2-afdf-c5932d7c9e27.json b/data/hfopenllm_v2/VAGOsolutions/SauerkrautLM-7b-LaserChat/fe38dea8-92f4-4fb2-afdf-c5932d7c9e27.json deleted file mode 100644 index e0f9d6aca..000000000 --- a/data/hfopenllm_v2/VAGOsolutions/SauerkrautLM-7b-LaserChat/fe38dea8-92f4-4fb2-afdf-c5932d7c9e27.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/VAGOsolutions_SauerkrautLM-7b-LaserChat/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SauerkrautLM-7b-LaserChat", - "id": "VAGOsolutions/SauerkrautLM-7b-LaserChat", - "developer": "VAGOsolutions", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5988 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, 
- "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4543 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0778 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3003 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4148 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3305 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/VAGOsolutions/SauerkrautLM-Gemma-2b/5ced7497-5a05-40d2-80cb-cae63ca62022.json b/data/hfopenllm_v2/VAGOsolutions/SauerkrautLM-Gemma-2b/5ced7497-5a05-40d2-80cb-cae63ca62022.json deleted file mode 100644 index 66467bc2c..000000000 --- a/data/hfopenllm_v2/VAGOsolutions/SauerkrautLM-Gemma-2b/5ced7497-5a05-40d2-80cb-cae63ca62022.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/VAGOsolutions_SauerkrautLM-Gemma-2b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SauerkrautLM-Gemma-2b", - "id": "VAGOsolutions/SauerkrautLM-Gemma-2b", - "developer": "VAGOsolutions", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "GemmaForCausalLM", - "params_billions": 2.506 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2475 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3416 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 
5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0279 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2567 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3676 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1469 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/VAGOsolutions/SauerkrautLM-Gemma-7b/52a66aaa-193a-48ca-b693-4dcab811eaa3.json b/data/hfopenllm_v2/VAGOsolutions/SauerkrautLM-Gemma-7b/52a66aaa-193a-48ca-b693-4dcab811eaa3.json deleted file mode 100644 index f14465432..000000000 --- a/data/hfopenllm_v2/VAGOsolutions/SauerkrautLM-Gemma-7b/52a66aaa-193a-48ca-b693-4dcab811eaa3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/VAGOsolutions_SauerkrautLM-Gemma-7b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SauerkrautLM-Gemma-7b", - "id": "VAGOsolutions/SauerkrautLM-Gemma-7b", - "developer": "VAGOsolutions", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "GemmaForCausalLM", - "params_billions": 8.538 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3407 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4188 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0672 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2861 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3594 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2961 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/VAGOsolutions/SauerkrautLM-Mixtral-8x7B-Instruct/e0e4bcef-cb73-436b-9353-b18ade293e8b.json b/data/hfopenllm_v2/VAGOsolutions/SauerkrautLM-Mixtral-8x7B-Instruct/e0e4bcef-cb73-436b-9353-b18ade293e8b.json deleted file mode 100644 index 56dde9357..000000000 --- a/data/hfopenllm_v2/VAGOsolutions/SauerkrautLM-Mixtral-8x7B-Instruct/e0e4bcef-cb73-436b-9353-b18ade293e8b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/VAGOsolutions_SauerkrautLM-Mixtral-8x7B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SauerkrautLM-Mixtral-8x7B-Instruct", - "id": "VAGOsolutions/SauerkrautLM-Mixtral-8x7B-Instruct", - "developer": "VAGOsolutions", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MixtralForCausalLM", - "params_billions": 46.703 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5602 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5277 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0982 - } - }, - { - 
"evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2978 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4204 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.365 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/VAGOsolutions/SauerkrautLM-Nemo-12b-Instruct/1ae45791-7e47-4083-bd72-4530fa26893c.json b/data/hfopenllm_v2/VAGOsolutions/SauerkrautLM-Nemo-12b-Instruct/1ae45791-7e47-4083-bd72-4530fa26893c.json deleted file mode 100644 index fa2eb1b02..000000000 --- a/data/hfopenllm_v2/VAGOsolutions/SauerkrautLM-Nemo-12b-Instruct/1ae45791-7e47-4083-bd72-4530fa26893c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/VAGOsolutions_SauerkrautLM-Nemo-12b-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SauerkrautLM-Nemo-12b-Instruct", - "id": "VAGOsolutions/SauerkrautLM-Nemo-12b-Instruct", - "developer": "VAGOsolutions", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6113 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5214 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1224 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3096 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4469 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3385 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/VAGOsolutions/SauerkrautLM-Phi-3-medium/b2731f04-a9bd-4e36-a545-85be5b66f5a7.json b/data/hfopenllm_v2/VAGOsolutions/SauerkrautLM-Phi-3-medium/b2731f04-a9bd-4e36-a545-85be5b66f5a7.json deleted file mode 100644 index b7b0c2153..000000000 --- a/data/hfopenllm_v2/VAGOsolutions/SauerkrautLM-Phi-3-medium/b2731f04-a9bd-4e36-a545-85be5b66f5a7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/VAGOsolutions_SauerkrautLM-Phi-3-medium/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SauerkrautLM-Phi-3-medium", - "id": "VAGOsolutions/SauerkrautLM-Phi-3-medium", - "developer": "VAGOsolutions", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 13.96 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4409 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6433 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1601 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.3347 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4845 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4665 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/VAGOsolutions/SauerkrautLM-SOLAR-Instruct/ed6de552-d04b-4d51-8456-610e2cb41d85.json b/data/hfopenllm_v2/VAGOsolutions/SauerkrautLM-SOLAR-Instruct/ed6de552-d04b-4d51-8456-610e2cb41d85.json deleted file mode 100644 index feb5d2dda..000000000 --- a/data/hfopenllm_v2/VAGOsolutions/SauerkrautLM-SOLAR-Instruct/ed6de552-d04b-4d51-8456-610e2cb41d85.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/VAGOsolutions_SauerkrautLM-SOLAR-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SauerkrautLM-SOLAR-Instruct", - "id": "VAGOsolutions/SauerkrautLM-SOLAR-Instruct", - "developer": "VAGOsolutions", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 10.732 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4917 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5169 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0634 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3054 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": 
"TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3965 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3183 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/VAGOsolutions/SauerkrautLM-gemma-2-2b-it/3e08a589-d2b3-487b-900e-85725522a2e4.json b/data/hfopenllm_v2/VAGOsolutions/SauerkrautLM-gemma-2-2b-it/3e08a589-d2b3-487b-900e-85725522a2e4.json deleted file mode 100644 index 49079f005..000000000 --- a/data/hfopenllm_v2/VAGOsolutions/SauerkrautLM-gemma-2-2b-it/3e08a589-d2b3-487b-900e-85725522a2e4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/VAGOsolutions_SauerkrautLM-gemma-2-2b-it/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SauerkrautLM-gemma-2-2b-it", - "id": "VAGOsolutions/SauerkrautLM-gemma-2-2b-it", - "developer": "VAGOsolutions", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 2.614 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1321 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4241 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0219 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2727 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.3995 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2693 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/VAGOsolutions/SauerkrautLM-gemma-2-9b-it/b2717503-d081-40ee-b1ed-fcadaf239049.json b/data/hfopenllm_v2/VAGOsolutions/SauerkrautLM-gemma-2-9b-it/b2717503-d081-40ee-b1ed-fcadaf239049.json deleted file mode 100644 index e6e7502ff..000000000 --- a/data/hfopenllm_v2/VAGOsolutions/SauerkrautLM-gemma-2-9b-it/b2717503-d081-40ee-b1ed-fcadaf239049.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/VAGOsolutions_SauerkrautLM-gemma-2-9b-it/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SauerkrautLM-gemma-2-9b-it", - "id": "VAGOsolutions/SauerkrautLM-gemma-2-9b-it", - "developer": "VAGOsolutions", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 9.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3024 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6073 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0838 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3272 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4318 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": 
"hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4091 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/VAGOsolutions/SauerkrautLM-v2-14b-DPO/9915eb01-5c45-42b6-82a3-ad782411642f.json b/data/hfopenllm_v2/VAGOsolutions/SauerkrautLM-v2-14b-DPO/9915eb01-5c45-42b6-82a3-ad782411642f.json deleted file mode 100644 index 7d69802d9..000000000 --- a/data/hfopenllm_v2/VAGOsolutions/SauerkrautLM-v2-14b-DPO/9915eb01-5c45-42b6-82a3-ad782411642f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/VAGOsolutions_SauerkrautLM-v2-14b-DPO/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SauerkrautLM-v2-14b-DPO", - "id": "VAGOsolutions/SauerkrautLM-v2-14b-DPO", - "developer": "VAGOsolutions", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7412 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.656 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3165 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3196 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4375 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 
0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5117 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/VAGOsolutions/SauerkrautLM-v2-14b-SFT/190eb7ca-46db-4e1d-8b71-9bb20af74ede.json b/data/hfopenllm_v2/VAGOsolutions/SauerkrautLM-v2-14b-SFT/190eb7ca-46db-4e1d-8b71-9bb20af74ede.json deleted file mode 100644 index ac7db90ff..000000000 --- a/data/hfopenllm_v2/VAGOsolutions/SauerkrautLM-v2-14b-SFT/190eb7ca-46db-4e1d-8b71-9bb20af74ede.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/VAGOsolutions_SauerkrautLM-v2-14b-SFT/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SauerkrautLM-v2-14b-SFT", - "id": "VAGOsolutions/SauerkrautLM-v2-14b-SFT", - "developer": "VAGOsolutions", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6949 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.621 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3285 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3356 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4179 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5205 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/VIRNECT/llama-3-Korean-8B-r-v-0.1/86b9077d-9ec3-411d-84c5-326ba97742c1.json b/data/hfopenllm_v2/VIRNECT/llama-3-Korean-8B-r-v-0.1/86b9077d-9ec3-411d-84c5-326ba97742c1.json deleted file mode 100644 index 40a4383a3..000000000 --- a/data/hfopenllm_v2/VIRNECT/llama-3-Korean-8B-r-v-0.1/86b9077d-9ec3-411d-84c5-326ba97742c1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/VIRNECT_llama-3-Korean-8B-r-v-0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "llama-3-Korean-8B-r-v-0.1", - "id": "VIRNECT/llama-3-Korean-8B-r-v-0.1", - "developer": "VIRNECT", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "?", - "params_billions": 16.061 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4916 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4806 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0861 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2424 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3675 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.326 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/VIRNECT/llama-3-Korean-8B/18bfa50c-20be-4027-8ee7-f6cd1411c882.json b/data/hfopenllm_v2/VIRNECT/llama-3-Korean-8B/18bfa50c-20be-4027-8ee7-f6cd1411c882.json deleted file mode 100644 index f155f09ea..000000000 --- 
a/data/hfopenllm_v2/VIRNECT/llama-3-Korean-8B/18bfa50c-20be-4027-8ee7-f6cd1411c882.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/VIRNECT_llama-3-Korean-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "llama-3-Korean-8B", - "id": "VIRNECT/llama-3-Korean-8B", - "developer": "VIRNECT", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5058 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4908 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0929 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.271 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3662 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3539 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/VIRNECT/llama-3-Korean-8B/eb1a099a-48c7-412b-b62f-143537c41f06.json b/data/hfopenllm_v2/VIRNECT/llama-3-Korean-8B/eb1a099a-48c7-412b-b62f-143537c41f06.json deleted file mode 100644 index 92a869ede..000000000 --- a/data/hfopenllm_v2/VIRNECT/llama-3-Korean-8B/eb1a099a-48c7-412b-b62f-143537c41f06.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/VIRNECT_llama-3-Korean-8B/1770682486.623709", - "retrieved_timestamp": 
"1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "llama-3-Korean-8B", - "id": "VIRNECT/llama-3-Korean-8B", - "developer": "VIRNECT", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5021 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4918 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.108 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.271 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3648 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3536 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ValiantLabs/Llama3-70B-Fireplace/e530a4b7-c2f6-4bad-bab5-2895e950ed63.json b/data/hfopenllm_v2/ValiantLabs/Llama3-70B-Fireplace/e530a4b7-c2f6-4bad-bab5-2895e950ed63.json deleted file mode 100644 index 60a42d8a2..000000000 --- a/data/hfopenllm_v2/ValiantLabs/Llama3-70B-Fireplace/e530a4b7-c2f6-4bad-bab5-2895e950ed63.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ValiantLabs_Llama3-70B-Fireplace/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": 
"Llama3-70B-Fireplace", - "id": "ValiantLabs/Llama3-70B-Fireplace", - "developer": "ValiantLabs", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 70.554 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7774 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6489 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2145 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3549 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4449 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4893 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ValiantLabs/Llama3-70B-ShiningValiant2/52ad7152-feea-46a6-b2d8-20e1a70514ce.json b/data/hfopenllm_v2/ValiantLabs/Llama3-70B-ShiningValiant2/52ad7152-feea-46a6-b2d8-20e1a70514ce.json deleted file mode 100644 index 7ec6b966f..000000000 --- a/data/hfopenllm_v2/ValiantLabs/Llama3-70B-ShiningValiant2/52ad7152-feea-46a6-b2d8-20e1a70514ce.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ValiantLabs_Llama3-70B-ShiningValiant2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama3-70B-ShiningValiant2", - "id": "ValiantLabs/Llama3-70B-ShiningValiant2", - "developer": "ValiantLabs", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - 
"architecture": "LlamaForCausalLM", - "params_billions": 70.554 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6122 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6338 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2077 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3305 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4326 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4898 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ValiantLabs/Llama3.1-70B-ShiningValiant2/a61162a6-ef3e-46f4-8aa2-241547fadea2.json b/data/hfopenllm_v2/ValiantLabs/Llama3.1-70B-ShiningValiant2/a61162a6-ef3e-46f4-8aa2-241547fadea2.json deleted file mode 100644 index 3438ef88c..000000000 --- a/data/hfopenllm_v2/ValiantLabs/Llama3.1-70B-ShiningValiant2/a61162a6-ef3e-46f4-8aa2-241547fadea2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ValiantLabs_Llama3.1-70B-ShiningValiant2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama3.1-70B-ShiningValiant2", - "id": "ValiantLabs/Llama3.1-70B-ShiningValiant2", - "developer": "ValiantLabs", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 70.554 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", 
- "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5355 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6738 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2915 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3926 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4681 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5173 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ValiantLabs/Llama3.1-8B-Cobalt/9f208aef-8544-47c8-bb1f-a3841aff208b.json b/data/hfopenllm_v2/ValiantLabs/Llama3.1-8B-Cobalt/9f208aef-8544-47c8-bb1f-a3841aff208b.json deleted file mode 100644 index e206b8681..000000000 --- a/data/hfopenllm_v2/ValiantLabs/Llama3.1-8B-Cobalt/9f208aef-8544-47c8-bb1f-a3841aff208b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ValiantLabs_Llama3.1-8B-Cobalt/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama3.1-8B-Cobalt", - "id": "ValiantLabs/Llama3.1-8B-Cobalt", - "developer": "ValiantLabs", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.7168 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4911 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1533 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2861 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3512 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3663 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ValiantLabs/Llama3.1-8B-Cobalt/da237ab6-df39-460f-9efc-e1649e1ac202.json b/data/hfopenllm_v2/ValiantLabs/Llama3.1-8B-Cobalt/da237ab6-df39-460f-9efc-e1649e1ac202.json deleted file mode 100644 index aa9dee935..000000000 --- a/data/hfopenllm_v2/ValiantLabs/Llama3.1-8B-Cobalt/da237ab6-df39-460f-9efc-e1649e1ac202.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ValiantLabs_Llama3.1-8B-Cobalt/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama3.1-8B-Cobalt", - "id": "ValiantLabs/Llama3.1-8B-Cobalt", - "developer": "ValiantLabs", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3496 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": 
"Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4947 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1269 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3037 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3959 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3644 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ValiantLabs/Llama3.1-8B-Enigma/c81b3193-9d01-4590-8b72-da97aa3c9dc4.json b/data/hfopenllm_v2/ValiantLabs/Llama3.1-8B-Enigma/c81b3193-9d01-4590-8b72-da97aa3c9dc4.json deleted file mode 100644 index c8bdca2cb..000000000 --- a/data/hfopenllm_v2/ValiantLabs/Llama3.1-8B-Enigma/c81b3193-9d01-4590-8b72-da97aa3c9dc4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ValiantLabs_Llama3.1-8B-Enigma/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama3.1-8B-Enigma", - "id": "ValiantLabs/Llama3.1-8B-Enigma", - "developer": "ValiantLabs", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2681 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4478 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - 
"dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0891 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2878 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4196 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3409 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ValiantLabs/Llama3.1-8B-Esper2/1a9ffe50-69ae-48bc-b636-89431391eb37.json b/data/hfopenllm_v2/ValiantLabs/Llama3.1-8B-Esper2/1a9ffe50-69ae-48bc-b636-89431391eb37.json deleted file mode 100644 index ba7ad45fc..000000000 --- a/data/hfopenllm_v2/ValiantLabs/Llama3.1-8B-Esper2/1a9ffe50-69ae-48bc-b636-89431391eb37.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ValiantLabs_Llama3.1-8B-Esper2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama3.1-8B-Esper2", - "id": "ValiantLabs/Llama3.1-8B-Esper2", - "developer": "ValiantLabs", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2567 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.447 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0589 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2727 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3561 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2904 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ValiantLabs/Llama3.1-8B-Fireplace2/b0c67359-1da0-4f55-aa1c-f54f88038bd7.json b/data/hfopenllm_v2/ValiantLabs/Llama3.1-8B-Fireplace2/b0c67359-1da0-4f55-aa1c-f54f88038bd7.json deleted file mode 100644 index 2f2d8e167..000000000 --- a/data/hfopenllm_v2/ValiantLabs/Llama3.1-8B-Fireplace2/b0c67359-1da0-4f55-aa1c-f54f88038bd7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ValiantLabs_Llama3.1-8B-Fireplace2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama3.1-8B-Fireplace2", - "id": "ValiantLabs/Llama3.1-8B-Fireplace2", - "developer": "ValiantLabs", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5483 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.461 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0582 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - 
"hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2886 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3433 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2407 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ValiantLabs/Llama3.1-8B-Fireplace2/c700798b-583a-41be-94dd-382669bb495f.json b/data/hfopenllm_v2/ValiantLabs/Llama3.1-8B-Fireplace2/c700798b-583a-41be-94dd-382669bb495f.json deleted file mode 100644 index fca031505..000000000 --- a/data/hfopenllm_v2/ValiantLabs/Llama3.1-8B-Fireplace2/c700798b-583a-41be-94dd-382669bb495f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ValiantLabs_Llama3.1-8B-Fireplace2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama3.1-8B-Fireplace2", - "id": "ValiantLabs/Llama3.1-8B-Fireplace2", - "developer": "ValiantLabs", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5328 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4613 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0876 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.2894 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3367 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2424 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ValiantLabs/Llama3.1-8B-ShiningValiant2/3c0b9735-2ef1-4f27-b94a-f246eb57b73c.json b/data/hfopenllm_v2/ValiantLabs/Llama3.1-8B-ShiningValiant2/3c0b9735-2ef1-4f27-b94a-f246eb57b73c.json deleted file mode 100644 index 2c1716490..000000000 --- a/data/hfopenllm_v2/ValiantLabs/Llama3.1-8B-ShiningValiant2/3c0b9735-2ef1-4f27-b94a-f246eb57b73c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ValiantLabs_Llama3.1-8B-ShiningValiant2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama3.1-8B-ShiningValiant2", - "id": "ValiantLabs/Llama3.1-8B-ShiningValiant2", - "developer": "ValiantLabs", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6496 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4774 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0566 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3104 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, 
- "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3909 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3382 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ValiantLabs/Llama3.1-8B-ShiningValiant2/e8c9501b-c985-4b78-a902-a1a030c72e60.json b/data/hfopenllm_v2/ValiantLabs/Llama3.1-8B-ShiningValiant2/e8c9501b-c985-4b78-a902-a1a030c72e60.json deleted file mode 100644 index b45bd8cf9..000000000 --- a/data/hfopenllm_v2/ValiantLabs/Llama3.1-8B-ShiningValiant2/e8c9501b-c985-4b78-a902-a1a030c72e60.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ValiantLabs_Llama3.1-8B-ShiningValiant2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama3.1-8B-ShiningValiant2", - "id": "ValiantLabs/Llama3.1-8B-ShiningValiant2", - "developer": "ValiantLabs", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2678 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4429 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0521 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.302 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": 
{ - "score": 0.3959 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2927 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ValiantLabs/Llama3.2-3B-Enigma/df978fce-3373-4073-8c44-d6a83df1d9d1.json b/data/hfopenllm_v2/ValiantLabs/Llama3.2-3B-Enigma/df978fce-3373-4073-8c44-d6a83df1d9d1.json deleted file mode 100644 index be7b22dec..000000000 --- a/data/hfopenllm_v2/ValiantLabs/Llama3.2-3B-Enigma/df978fce-3373-4073-8c44-d6a83df1d9d1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ValiantLabs_Llama3.2-3B-Enigma/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama3.2-3B-Enigma", - "id": "ValiantLabs/Llama3.2-3B-Enigma", - "developer": "ValiantLabs", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2786 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3723 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0438 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2617 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3921 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": 
"Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2428 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ValiantLabs/Llama3.2-3B-Esper2/e46ee8d9-81af-4259-8fef-3d3113fb6168.json b/data/hfopenllm_v2/ValiantLabs/Llama3.2-3B-Esper2/e46ee8d9-81af-4259-8fef-3d3113fb6168.json deleted file mode 100644 index 40bf9a025..000000000 --- a/data/hfopenllm_v2/ValiantLabs/Llama3.2-3B-Esper2/e46ee8d9-81af-4259-8fef-3d3113fb6168.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ValiantLabs_Llama3.2-3B-Esper2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama3.2-3B-Esper2", - "id": "ValiantLabs/Llama3.2-3B-Esper2", - "developer": "ValiantLabs", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.275 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3808 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0363 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2701 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.355 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2257 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/ValiantLabs/Llama3.2-3B-ShiningValiant2/aa6ab404-89ef-4336-b811-7c8064e26107.json b/data/hfopenllm_v2/ValiantLabs/Llama3.2-3B-ShiningValiant2/aa6ab404-89ef-4336-b811-7c8064e26107.json deleted file mode 100644 index aad1ebcc3..000000000 --- a/data/hfopenllm_v2/ValiantLabs/Llama3.2-3B-ShiningValiant2/aa6ab404-89ef-4336-b811-7c8064e26107.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ValiantLabs_Llama3.2-3B-ShiningValiant2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama3.2-3B-ShiningValiant2", - "id": "ValiantLabs/Llama3.2-3B-ShiningValiant2", - "developer": "ValiantLabs", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2625 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4226 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0823 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2802 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3866 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2829 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Vikhrmodels/Vikhr-Llama3.1-8B-Instruct-R-21-09-24/a14e6c79-4a78-4c02-a7ca-35e783f32be1.json 
b/data/hfopenllm_v2/Vikhrmodels/Vikhr-Llama3.1-8B-Instruct-R-21-09-24/a14e6c79-4a78-4c02-a7ca-35e783f32be1.json deleted file mode 100644 index 3f2934658..000000000 --- a/data/hfopenllm_v2/Vikhrmodels/Vikhr-Llama3.1-8B-Instruct-R-21-09-24/a14e6c79-4a78-4c02-a7ca-35e783f32be1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Vikhrmodels_Vikhr-Llama3.1-8B-Instruct-R-21-09-24/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Vikhr-Llama3.1-8B-Instruct-R-21-09-24", - "id": "Vikhrmodels/Vikhr-Llama3.1-8B-Instruct-R-21-09-24", - "developer": "Vikhrmodels", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6431 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5272 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2175 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.245 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3754 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3547 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Vikhrmodels/Vikhr-Nemo-12B-Instruct-R-21-09-24/ba1fb85b-bbc0-46ac-95d7-e61b91f65c2b.json b/data/hfopenllm_v2/Vikhrmodels/Vikhr-Nemo-12B-Instruct-R-21-09-24/ba1fb85b-bbc0-46ac-95d7-e61b91f65c2b.json deleted file mode 100644 
index 5191b77e3..000000000 --- a/data/hfopenllm_v2/Vikhrmodels/Vikhr-Nemo-12B-Instruct-R-21-09-24/ba1fb85b-bbc0-46ac-95d7-e61b91f65c2b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Vikhrmodels_Vikhr-Nemo-12B-Instruct-R-21-09-24/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Vikhr-Nemo-12B-Instruct-R-21-09-24", - "id": "Vikhrmodels/Vikhr-Nemo-12B-Instruct-R-21-09-24", - "developer": "Vikhrmodels", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5999 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5212 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1715 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2911 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4073 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3398 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Weyaxi/Bagel-Hermes-2x34B/f6312fc7-c7a8-45dc-a57c-91f56b4ca28a.json b/data/hfopenllm_v2/Weyaxi/Bagel-Hermes-2x34B/f6312fc7-c7a8-45dc-a57c-91f56b4ca28a.json deleted file mode 100644 index a15c3dfdc..000000000 --- a/data/hfopenllm_v2/Weyaxi/Bagel-Hermes-2x34B/f6312fc7-c7a8-45dc-a57c-91f56b4ca28a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - 
"evaluation_id": "hfopenllm_v2/Weyaxi_Bagel-Hermes-2x34B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Bagel-Hermes-2x34B", - "id": "Weyaxi/Bagel-Hermes-2x34B", - "developer": "Weyaxi", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MixtralForCausalLM", - "params_billions": 60.814 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5432 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4917 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0604 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.328 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4517 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4589 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Weyaxi/Bagel-Hermes-34B-Slerp/335f5c32-f3f0-4a16-8c9d-8f07b2aae54a.json b/data/hfopenllm_v2/Weyaxi/Bagel-Hermes-34B-Slerp/335f5c32-f3f0-4a16-8c9d-8f07b2aae54a.json deleted file mode 100644 index b2da0af11..000000000 --- a/data/hfopenllm_v2/Weyaxi/Bagel-Hermes-34B-Slerp/335f5c32-f3f0-4a16-8c9d-8f07b2aae54a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Weyaxi_Bagel-Hermes-34B-Slerp/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging 
Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Bagel-Hermes-34B-Slerp", - "id": "Weyaxi/Bagel-Hermes-34B-Slerp", - "developer": "Weyaxi", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 34.389 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4603 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5922 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0604 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3347 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4622 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4703 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Weyaxi/Einstein-v4-7B/b7c7a907-7ecc-4d5b-bc6f-8b8d82954b21.json b/data/hfopenllm_v2/Weyaxi/Einstein-v4-7B/b7c7a907-7ecc-4d5b-bc6f-8b8d82954b21.json deleted file mode 100644 index 42818be4e..000000000 --- a/data/hfopenllm_v2/Weyaxi/Einstein-v4-7B/b7c7a907-7ecc-4d5b-bc6f-8b8d82954b21.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Weyaxi_Einstein-v4-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Einstein-v4-7B", - "id": "Weyaxi/Einstein-v4-7B", - "developer": "Weyaxi", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": 
"MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4708 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3849 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0189 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2819 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4682 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2259 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Weyaxi/Einstein-v6.1-Llama3-8B/112f01a2-f0fb-4257-86bf-61c9a184eb92.json b/data/hfopenllm_v2/Weyaxi/Einstein-v6.1-Llama3-8B/112f01a2-f0fb-4257-86bf-61c9a184eb92.json deleted file mode 100644 index f43785b03..000000000 --- a/data/hfopenllm_v2/Weyaxi/Einstein-v6.1-Llama3-8B/112f01a2-f0fb-4257-86bf-61c9a184eb92.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Weyaxi_Einstein-v6.1-Llama3-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Einstein-v6.1-Llama3-8B", - "id": "Weyaxi/Einstein-v6.1-Llama3-8B", - "developer": "Weyaxi", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4568 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5008 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.068 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2819 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4213 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3131 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Weyaxi/Einstein-v6.1-developed-by-Weyaxi-Llama3-8B/2d9410d6-7162-4811-bf7d-9de2c2b48fd2.json b/data/hfopenllm_v2/Weyaxi/Einstein-v6.1-developed-by-Weyaxi-Llama3-8B/2d9410d6-7162-4811-bf7d-9de2c2b48fd2.json deleted file mode 100644 index 4375030ce..000000000 --- a/data/hfopenllm_v2/Weyaxi/Einstein-v6.1-developed-by-Weyaxi-Llama3-8B/2d9410d6-7162-4811-bf7d-9de2c2b48fd2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Weyaxi_Einstein-v6.1-developed-by-Weyaxi-Llama3-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Einstein-v6.1-developed-by-Weyaxi-Llama3-8B", - "id": "Weyaxi/Einstein-v6.1-developed-by-Weyaxi-Llama3-8B", - "developer": "Weyaxi", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3927 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5044 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0718 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2735 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4332 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3093 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Weyaxi/Einstein-v7-Qwen2-7B/16ff8fa3-4676-473c-99ad-908ddb59d8ed.json b/data/hfopenllm_v2/Weyaxi/Einstein-v7-Qwen2-7B/16ff8fa3-4676-473c-99ad-908ddb59d8ed.json deleted file mode 100644 index 3fd32bab2..000000000 --- a/data/hfopenllm_v2/Weyaxi/Einstein-v7-Qwen2-7B/16ff8fa3-4676-473c-99ad-908ddb59d8ed.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Weyaxi_Einstein-v7-Qwen2-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Einstein-v7-Qwen2-7B", - "id": "Weyaxi/Einstein-v7-Qwen2-7B", - "developer": "Weyaxi", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.41 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5161 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1994 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2995 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.44 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4096 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Weyaxi/Einstein-v8-Llama3.2-1B/9b153ac9-f95b-419b-b7f9-beccd769ddad.json b/data/hfopenllm_v2/Weyaxi/Einstein-v8-Llama3.2-1B/9b153ac9-f95b-419b-b7f9-beccd769ddad.json deleted file mode 100644 index 96e3825ec..000000000 --- a/data/hfopenllm_v2/Weyaxi/Einstein-v8-Llama3.2-1B/9b153ac9-f95b-419b-b7f9-beccd769ddad.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Weyaxi_Einstein-v8-Llama3.2-1B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Einstein-v8-Llama3.2-1B", - "id": "Weyaxi/Einstein-v8-Llama3.2-1B", - "developer": "Weyaxi", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.236 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1862 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3018 - } - }, - { - "evaluation_name": 
"MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0008 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2584 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3618 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1161 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Weyaxi/SauerkrautLM-UNA-SOLAR-Instruct/8a5df3c2-eb71-4e12-b013-fb43685f2916.json b/data/hfopenllm_v2/Weyaxi/SauerkrautLM-UNA-SOLAR-Instruct/8a5df3c2-eb71-4e12-b013-fb43685f2916.json deleted file mode 100644 index e6c896811..000000000 --- a/data/hfopenllm_v2/Weyaxi/SauerkrautLM-UNA-SOLAR-Instruct/8a5df3c2-eb71-4e12-b013-fb43685f2916.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Weyaxi_SauerkrautLM-UNA-SOLAR-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SauerkrautLM-UNA-SOLAR-Instruct", - "id": "Weyaxi/SauerkrautLM-UNA-SOLAR-Instruct", - "developer": "Weyaxi", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 10.732 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4573 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5166 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - 
"evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0461 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3112 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3979 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3153 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/WizardLMTeam/WizardLM-13B-V1.0/35fa3213-5c08-4b19-ae76-237fdd25444e.json b/data/hfopenllm_v2/WizardLMTeam/WizardLM-13B-V1.0/35fa3213-5c08-4b19-ae76-237fdd25444e.json deleted file mode 100644 index ea8afc65e..000000000 --- a/data/hfopenllm_v2/WizardLMTeam/WizardLM-13B-V1.0/35fa3213-5c08-4b19-ae76-237fdd25444e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/WizardLMTeam_WizardLM-13B-V1.0/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "WizardLM-13B-V1.0", - "id": "WizardLMTeam/WizardLM-13B-V1.0", - "developer": "WizardLMTeam", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 13.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.185 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2913 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - 
"source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2592 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3497 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1166 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/WizardLMTeam/WizardLM-13B-V1.2/242ce55f-1471-435e-bcd7-d28b5fc87fc4.json b/data/hfopenllm_v2/WizardLMTeam/WizardLM-13B-V1.2/242ce55f-1471-435e-bcd7-d28b5fc87fc4.json deleted file mode 100644 index 9777bbd90..000000000 --- a/data/hfopenllm_v2/WizardLMTeam/WizardLM-13B-V1.2/242ce55f-1471-435e-bcd7-d28b5fc87fc4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/WizardLMTeam_WizardLM-13B-V1.2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "WizardLM-13B-V1.2", - "id": "WizardLMTeam/WizardLM-13B-V1.2", - "developer": "WizardLMTeam", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 13.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3392 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4462 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0189 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2609 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4378 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2519 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/WizardLMTeam/WizardLM-70B-V1.0/95f509f2-5e67-404a-968d-f7488d684e32.json b/data/hfopenllm_v2/WizardLMTeam/WizardLM-70B-V1.0/95f509f2-5e67-404a-968d-f7488d684e32.json deleted file mode 100644 index 8f43a920a..000000000 --- a/data/hfopenllm_v2/WizardLMTeam/WizardLM-70B-V1.0/95f509f2-5e67-404a-968d-f7488d684e32.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/WizardLMTeam_WizardLM-70B-V1.0/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "WizardLM-70B-V1.0", - "id": "WizardLMTeam/WizardLM-70B-V1.0", - "developer": "WizardLMTeam", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 70.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4951 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.559 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0393 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2659 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4391 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3447 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Wladastic/Mini-Think-Base-1B/bcbcdfe9-0663-417c-9a29-60906e63db8f.json b/data/hfopenllm_v2/Wladastic/Mini-Think-Base-1B/bcbcdfe9-0663-417c-9a29-60906e63db8f.json deleted file mode 100644 index ab878d450..000000000 --- a/data/hfopenllm_v2/Wladastic/Mini-Think-Base-1B/bcbcdfe9-0663-417c-9a29-60906e63db8f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Wladastic_Mini-Think-Base-1B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mini-Think-Base-1B", - "id": "Wladastic/Mini-Think-Base-1B", - "developer": "Wladastic", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.236 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5588 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3574 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0733 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2634 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3275 - } - }, - { - "evaluation_name": "MMLU-PRO", 
- "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1772 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Xclbr7/Arcanum-12b/d95a7493-2f99-4c10-8067-711c7388af7d.json b/data/hfopenllm_v2/Xclbr7/Arcanum-12b/d95a7493-2f99-4c10-8067-711c7388af7d.json deleted file mode 100644 index 7bafc055c..000000000 --- a/data/hfopenllm_v2/Xclbr7/Arcanum-12b/d95a7493-2f99-4c10-8067-711c7388af7d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Xclbr7_Arcanum-12b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Arcanum-12b", - "id": "Xclbr7/Arcanum-12b", - "developer": "Xclbr7", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2907 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5265 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1193 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3205 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.417 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": 
{ - "score": 0.3586 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Xclbr7/Hyena-12b/789848a0-6d8a-4583-93c3-a72df74d0071.json b/data/hfopenllm_v2/Xclbr7/Hyena-12b/789848a0-6d8a-4583-93c3-a72df74d0071.json deleted file mode 100644 index eff456eeb..000000000 --- a/data/hfopenllm_v2/Xclbr7/Hyena-12b/789848a0-6d8a-4583-93c3-a72df74d0071.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Xclbr7_Hyena-12b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Hyena-12b", - "id": "Xclbr7/Hyena-12b", - "developer": "Xclbr7", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3404 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5457 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1133 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2978 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3984 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3439 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Xclbr7/caliburn-12b/14af87df-0fc5-46e1-9d0b-c25c8b6a7ce7.json b/data/hfopenllm_v2/Xclbr7/caliburn-12b/14af87df-0fc5-46e1-9d0b-c25c8b6a7ce7.json deleted file mode 100644 index 48b753f2c..000000000 --- 
a/data/hfopenllm_v2/Xclbr7/caliburn-12b/14af87df-0fc5-46e1-9d0b-c25c8b6a7ce7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Xclbr7_caliburn-12b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "caliburn-12b", - "id": "Xclbr7/caliburn-12b", - "developer": "Xclbr7", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3576 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5519 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1125 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3364 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4292 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3675 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Xclbr7/caliburn-v2-12b/379f559f-9bfa-444f-b477-562c25b4c299.json b/data/hfopenllm_v2/Xclbr7/caliburn-v2-12b/379f559f-9bfa-444f-b477-562c25b4c299.json deleted file mode 100644 index 475721de8..000000000 --- a/data/hfopenllm_v2/Xclbr7/caliburn-v2-12b/379f559f-9bfa-444f-b477-562c25b4c299.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Xclbr7_caliburn-v2-12b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - 
"source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "caliburn-v2-12b", - "id": "Xclbr7/caliburn-v2-12b", - "developer": "Xclbr7", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2967 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5141 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.105 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3263 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.437 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3784 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Xiaojian9992024/Llama3.2-1B-THREADRIPPER-v0.2/effb6a3d-c98f-4c3a-be77-902c61cda21b.json b/data/hfopenllm_v2/Xiaojian9992024/Llama3.2-1B-THREADRIPPER-v0.2/effb6a3d-c98f-4c3a-be77-902c61cda21b.json deleted file mode 100644 index b05dd8d93..000000000 --- a/data/hfopenllm_v2/Xiaojian9992024/Llama3.2-1B-THREADRIPPER-v0.2/effb6a3d-c98f-4c3a-be77-902c61cda21b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Xiaojian9992024_Llama3.2-1B-THREADRIPPER-v0.2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": 
"Llama3.2-1B-THREADRIPPER-v0.2", - "id": "Xiaojian9992024/Llama3.2-1B-THREADRIPPER-v0.2", - "developer": "Xiaojian9992024", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.236 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5318 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3528 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0657 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2659 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3316 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1745 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Xiaojian9992024/Llama3.2-1B-THREADRIPPER/6c1c1405-afa4-412d-ba1f-49dc1cac4509.json b/data/hfopenllm_v2/Xiaojian9992024/Llama3.2-1B-THREADRIPPER/6c1c1405-afa4-412d-ba1f-49dc1cac4509.json deleted file mode 100644 index cd6966cd3..000000000 --- a/data/hfopenllm_v2/Xiaojian9992024/Llama3.2-1B-THREADRIPPER/6c1c1405-afa4-412d-ba1f-49dc1cac4509.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Xiaojian9992024_Llama3.2-1B-THREADRIPPER/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama3.2-1B-THREADRIPPER", - "id": "Xiaojian9992024/Llama3.2-1B-THREADRIPPER", - "developer": "Xiaojian9992024", - "inference_platform": "unknown", - 
"additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.236 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5576 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3544 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.074 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2609 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.313 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1763 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Xiaojian9992024/Phi-4-Megatron-Empathetic/6f4ed7c2-c775-4fd2-8600-4cea523f53e4.json b/data/hfopenllm_v2/Xiaojian9992024/Phi-4-Megatron-Empathetic/6f4ed7c2-c775-4fd2-8600-4cea523f53e4.json deleted file mode 100644 index b61bf3793..000000000 --- a/data/hfopenllm_v2/Xiaojian9992024/Phi-4-Megatron-Empathetic/6f4ed7c2-c775-4fd2-8600-4cea523f53e4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Xiaojian9992024_Phi-4-Megatron-Empathetic/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Phi-4-Megatron-Empathetic", - "id": "Xiaojian9992024/Phi-4-Megatron-Empathetic", - "developer": "Xiaojian9992024", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": 
"IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0173 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6673 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2696 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3859 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5071 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5082 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Xiaojian9992024/Phi-4-mini-UNOFFICAL/5fd5206b-186a-43b9-a4f4-07e75aa0293a.json b/data/hfopenllm_v2/Xiaojian9992024/Phi-4-mini-UNOFFICAL/5fd5206b-186a-43b9-a4f4-07e75aa0293a.json deleted file mode 100644 index 76e78b910..000000000 --- a/data/hfopenllm_v2/Xiaojian9992024/Phi-4-mini-UNOFFICAL/5fd5206b-186a-43b9-a4f4-07e75aa0293a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Xiaojian9992024_Phi-4-mini-UNOFFICAL/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Phi-4-mini-UNOFFICAL", - "id": "Xiaojian9992024/Phi-4-mini-UNOFFICAL", - "developer": "Xiaojian9992024", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Phi3ForCausalLM", - "params_billions": 3.754 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1273 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2944 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2408 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3368 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1144 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Xiaojian9992024/Qwen2.5-7B-MS-Destroyer/b707ecbf-0658-4226-803d-53456d16d54b.json b/data/hfopenllm_v2/Xiaojian9992024/Qwen2.5-7B-MS-Destroyer/b707ecbf-0658-4226-803d-53456d16d54b.json deleted file mode 100644 index ddb71d89b..000000000 --- a/data/hfopenllm_v2/Xiaojian9992024/Qwen2.5-7B-MS-Destroyer/b707ecbf-0658-4226-803d-53456d16d54b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Xiaojian9992024_Qwen2.5-7B-MS-Destroyer/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-7B-MS-Destroyer", - "id": "Xiaojian9992024/Qwen2.5-7B-MS-Destroyer", - "developer": "Xiaojian9992024", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.613 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7296 - } - }, - { - "evaluation_name": "BBH", - 
"source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.547 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4592 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3045 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.427 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4412 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Xiaojian9992024/Qwen2.5-Dyanka-7B-Preview-v0.2/dca1ee57-5e86-4532-a2f3-ac6a619ca576.json b/data/hfopenllm_v2/Xiaojian9992024/Qwen2.5-Dyanka-7B-Preview-v0.2/dca1ee57-5e86-4532-a2f3-ac6a619ca576.json deleted file mode 100644 index cb19a6f74..000000000 --- a/data/hfopenllm_v2/Xiaojian9992024/Qwen2.5-Dyanka-7B-Preview-v0.2/dca1ee57-5e86-4532-a2f3-ac6a619ca576.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Xiaojian9992024_Qwen2.5-Dyanka-7B-Preview-v0.2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-Dyanka-7B-Preview-v0.2", - "id": "Xiaojian9992024/Qwen2.5-Dyanka-7B-Preview-v0.2", - "developer": "Xiaojian9992024", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6702 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5374 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4721 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2936 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4467 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4371 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Xiaojian9992024/Qwen2.5-Dyanka-7B-Preview/1233476a-7839-4a22-a7ca-1d0f237d8888.json b/data/hfopenllm_v2/Xiaojian9992024/Qwen2.5-Dyanka-7B-Preview/1233476a-7839-4a22-a7ca-1d0f237d8888.json deleted file mode 100644 index 61a8f0cb6..000000000 --- a/data/hfopenllm_v2/Xiaojian9992024/Qwen2.5-Dyanka-7B-Preview/1233476a-7839-4a22-a7ca-1d0f237d8888.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Xiaojian9992024_Qwen2.5-Dyanka-7B-Preview/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-Dyanka-7B-Preview", - "id": "Xiaojian9992024/Qwen2.5-Dyanka-7B-Preview", - "developer": "Xiaojian9992024", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.764 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.5543 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4879 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3171 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4481 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4376 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Xiaojian9992024/Qwen2.5-THREADRIPPER-Medium-Censored/5c4bdeca-5ef8-4002-8f82-67d49b5ff722.json b/data/hfopenllm_v2/Xiaojian9992024/Qwen2.5-THREADRIPPER-Medium-Censored/5c4bdeca-5ef8-4002-8f82-67d49b5ff722.json deleted file mode 100644 index 6183b3899..000000000 --- a/data/hfopenllm_v2/Xiaojian9992024/Qwen2.5-THREADRIPPER-Medium-Censored/5c4bdeca-5ef8-4002-8f82-67d49b5ff722.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Xiaojian9992024_Qwen2.5-THREADRIPPER-Medium-Censored/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-THREADRIPPER-Medium-Censored", - "id": "Xiaojian9992024/Qwen2.5-THREADRIPPER-Medium-Censored", - "developer": "Xiaojian9992024", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8112 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6431 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH 
Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.534 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3347 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.414 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4929 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Xiaojian9992024/Qwen2.5-THREADRIPPER-Small-AnniversaryEdition/18f5fd6c-2b79-4d48-b7e9-18845db16271.json b/data/hfopenllm_v2/Xiaojian9992024/Qwen2.5-THREADRIPPER-Small-AnniversaryEdition/18f5fd6c-2b79-4d48-b7e9-18845db16271.json deleted file mode 100644 index 889a250b8..000000000 --- a/data/hfopenllm_v2/Xiaojian9992024/Qwen2.5-THREADRIPPER-Small-AnniversaryEdition/18f5fd6c-2b79-4d48-b7e9-18845db16271.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Xiaojian9992024_Qwen2.5-THREADRIPPER-Small-AnniversaryEdition/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-THREADRIPPER-Small-AnniversaryEdition", - "id": "Xiaojian9992024/Qwen2.5-THREADRIPPER-Small-AnniversaryEdition", - "developer": "Xiaojian9992024", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7404 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5465 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": 
"DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5076 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2685 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3807 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4393 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Xiaojian9992024/Qwen2.5-THREADRIPPER-Small/a9039374-fa5a-4b8b-800f-5f4651cf812d.json b/data/hfopenllm_v2/Xiaojian9992024/Qwen2.5-THREADRIPPER-Small/a9039374-fa5a-4b8b-800f-5f4651cf812d.json deleted file mode 100644 index 5724e01ef..000000000 --- a/data/hfopenllm_v2/Xiaojian9992024/Qwen2.5-THREADRIPPER-Small/a9039374-fa5a-4b8b-800f-5f4651cf812d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Xiaojian9992024_Qwen2.5-THREADRIPPER-Small/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-THREADRIPPER-Small", - "id": "Xiaojian9992024/Qwen2.5-THREADRIPPER-Small", - "developer": "Xiaojian9992024", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7689 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.549 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4736 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3104 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4349 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4357 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Xiaojian9992024/Qwen2.5-Ultra-1.5B-25.02-Exp/3f9704b4-bf25-40da-b6dc-b927c3569f40.json b/data/hfopenllm_v2/Xiaojian9992024/Qwen2.5-Ultra-1.5B-25.02-Exp/3f9704b4-bf25-40da-b6dc-b927c3569f40.json deleted file mode 100644 index 35e891112..000000000 --- a/data/hfopenllm_v2/Xiaojian9992024/Qwen2.5-Ultra-1.5B-25.02-Exp/3f9704b4-bf25-40da-b6dc-b927c3569f40.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Xiaojian9992024_Qwen2.5-Ultra-1.5B-25.02-Exp/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-Ultra-1.5B-25.02-Exp", - "id": "Xiaojian9992024/Qwen2.5-Ultra-1.5B-25.02-Exp", - "developer": "Xiaojian9992024", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.544 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4073 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4066 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0831 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { 
- "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2584 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3383 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2641 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Xiaojian9992024/Reflection-L3.2-JametMiniMix-3B/a8f858d8-a792-409f-b79d-948a19e2aa87.json b/data/hfopenllm_v2/Xiaojian9992024/Reflection-L3.2-JametMiniMix-3B/a8f858d8-a792-409f-b79d-948a19e2aa87.json deleted file mode 100644 index 163b699a4..000000000 --- a/data/hfopenllm_v2/Xiaojian9992024/Reflection-L3.2-JametMiniMix-3B/a8f858d8-a792-409f-b79d-948a19e2aa87.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Xiaojian9992024_Reflection-L3.2-JametMiniMix-3B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Reflection-L3.2-JametMiniMix-3B", - "id": "Xiaojian9992024/Reflection-L3.2-JametMiniMix-3B", - "developer": "Xiaojian9992024", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4619 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.439 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1193 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2945 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3667 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2988 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Xkev/Llama-3.2V-11B-cot/5c34a168-b8cf-436b-a3b7-a2d1feadffb9.json b/data/hfopenllm_v2/Xkev/Llama-3.2V-11B-cot/5c34a168-b8cf-436b-a3b7-a2d1feadffb9.json deleted file mode 100644 index 2b105fc64..000000000 --- a/data/hfopenllm_v2/Xkev/Llama-3.2V-11B-cot/5c34a168-b8cf-436b-a3b7-a2d1feadffb9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Xkev_Llama-3.2V-11B-cot/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.2V-11B-cot", - "id": "Xkev/Llama-3.2V-11B-cot", - "developer": "Xkev", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MllamaForConditionalGeneration", - "params_billions": 10.67 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4158 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4959 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1556 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2953 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": 
"MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4159 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3587 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/YOYO-AI/Qwen2.5-14B-1M-YOYO-V3/77092cfe-9820-45e8-94c5-31d27f1daa7c.json b/data/hfopenllm_v2/YOYO-AI/Qwen2.5-14B-1M-YOYO-V3/77092cfe-9820-45e8-94c5-31d27f1daa7c.json deleted file mode 100644 index 750d1c03c..000000000 --- a/data/hfopenllm_v2/YOYO-AI/Qwen2.5-14B-1M-YOYO-V3/77092cfe-9820-45e8-94c5-31d27f1daa7c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/YOYO-AI_Qwen2.5-14B-1M-YOYO-V3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-14B-1M-YOYO-V3", - "id": "YOYO-AI/Qwen2.5-14B-1M-YOYO-V3", - "developer": "YOYO-AI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8398 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6448 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5355 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3289 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - 
}, - "score_details": { - "score": 0.4141 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5207 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/YOYO-AI/Qwen2.5-14B-YOYO-0505/cab8fed8-de68-4fa5-b4fc-d9483fc56571.json b/data/hfopenllm_v2/YOYO-AI/Qwen2.5-14B-YOYO-0505/cab8fed8-de68-4fa5-b4fc-d9483fc56571.json deleted file mode 100644 index 7845c81e9..000000000 --- a/data/hfopenllm_v2/YOYO-AI/Qwen2.5-14B-YOYO-0505/cab8fed8-de68-4fa5-b4fc-d9483fc56571.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/YOYO-AI_Qwen2.5-14B-YOYO-0505/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-14B-YOYO-0505", - "id": "YOYO-AI/Qwen2.5-14B-YOYO-0505", - "developer": "YOYO-AI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5883 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6539 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4434 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3733 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4757 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5371 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/YOYO-AI/Qwen2.5-14B-YOYO-0510-v2/a8103350-b208-4856-8e7b-8ea8918ba0d1.json b/data/hfopenllm_v2/YOYO-AI/Qwen2.5-14B-YOYO-0510-v2/a8103350-b208-4856-8e7b-8ea8918ba0d1.json deleted file mode 100644 index 7d6288961..000000000 --- a/data/hfopenllm_v2/YOYO-AI/Qwen2.5-14B-YOYO-0510-v2/a8103350-b208-4856-8e7b-8ea8918ba0d1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/YOYO-AI_Qwen2.5-14B-YOYO-0510-v2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-14B-YOYO-0510-v2", - "id": "YOYO-AI/Qwen2.5-14B-YOYO-0510-v2", - "developer": "YOYO-AI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5947 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6553 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4441 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3817 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4744 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5381 - } - } - ] -} \ No newline at end of 
file diff --git a/data/hfopenllm_v2/YOYO-AI/Qwen2.5-14B-YOYO-0805/e849c03c-c569-4059-8fc5-6a98cf391342.json b/data/hfopenllm_v2/YOYO-AI/Qwen2.5-14B-YOYO-0805/e849c03c-c569-4059-8fc5-6a98cf391342.json deleted file mode 100644 index 60cbcc72d..000000000 --- a/data/hfopenllm_v2/YOYO-AI/Qwen2.5-14B-YOYO-0805/e849c03c-c569-4059-8fc5-6a98cf391342.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/YOYO-AI_Qwen2.5-14B-YOYO-0805/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-14B-YOYO-0805", - "id": "YOYO-AI/Qwen2.5-14B-YOYO-0805", - "developer": "YOYO-AI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5883 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6539 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4434 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3733 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4757 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5371 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/YOYO-AI/Qwen2.5-14B-YOYO-1005-v2/f1d8bffa-61fc-47d5-85cf-48cebcb31af5.json b/data/hfopenllm_v2/YOYO-AI/Qwen2.5-14B-YOYO-1005-v2/f1d8bffa-61fc-47d5-85cf-48cebcb31af5.json deleted file mode 100644 index 
62d6d9013..000000000 --- a/data/hfopenllm_v2/YOYO-AI/Qwen2.5-14B-YOYO-1005-v2/f1d8bffa-61fc-47d5-85cf-48cebcb31af5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/YOYO-AI_Qwen2.5-14B-YOYO-1005-v2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-14B-YOYO-1005-v2", - "id": "YOYO-AI/Qwen2.5-14B-YOYO-1005-v2", - "developer": "YOYO-AI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5953 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6551 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4434 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3842 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4731 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5372 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/YOYO-AI/Qwen2.5-14B-YOYO-1005/97bdb352-2e9d-4cc5-8b70-55348ef3a217.json b/data/hfopenllm_v2/YOYO-AI/Qwen2.5-14B-YOYO-1005/97bdb352-2e9d-4cc5-8b70-55348ef3a217.json deleted file mode 100644 index 0d6a7c3b6..000000000 --- a/data/hfopenllm_v2/YOYO-AI/Qwen2.5-14B-YOYO-1005/97bdb352-2e9d-4cc5-8b70-55348ef3a217.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/YOYO-AI_Qwen2.5-14B-YOYO-1005/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-14B-YOYO-1005", - "id": "YOYO-AI/Qwen2.5-14B-YOYO-1005", - "developer": "YOYO-AI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5972 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6542 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4524 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3809 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.473 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5382 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/YOYO-AI/Qwen2.5-14B-YOYO-1010-v2/78053a33-24c8-4e9f-8791-f127f21eec1c.json b/data/hfopenllm_v2/YOYO-AI/Qwen2.5-14B-YOYO-1010-v2/78053a33-24c8-4e9f-8791-f127f21eec1c.json deleted file mode 100644 index ba66678f4..000000000 --- a/data/hfopenllm_v2/YOYO-AI/Qwen2.5-14B-YOYO-1010-v2/78053a33-24c8-4e9f-8791-f127f21eec1c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/YOYO-AI_Qwen2.5-14B-YOYO-1010-v2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": 
"Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-14B-YOYO-1010-v2", - "id": "YOYO-AI/Qwen2.5-14B-YOYO-1010-v2", - "developer": "YOYO-AI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5947 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6553 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4441 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3817 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4744 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5381 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/YOYO-AI/Qwen2.5-14B-YOYO-1010/03082966-87ba-4560-a784-5d8677003500.json b/data/hfopenllm_v2/YOYO-AI/Qwen2.5-14B-YOYO-1010/03082966-87ba-4560-a784-5d8677003500.json deleted file mode 100644 index 0f3f65bb4..000000000 --- a/data/hfopenllm_v2/YOYO-AI/Qwen2.5-14B-YOYO-1010/03082966-87ba-4560-a784-5d8677003500.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/YOYO-AI_Qwen2.5-14B-YOYO-1010/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-14B-YOYO-1010", - "id": "YOYO-AI/Qwen2.5-14B-YOYO-1010", - "developer": "YOYO-AI", - "inference_platform": "unknown", - "additional_details": 
{ - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5899 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.654 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4509 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3834 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4744 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5376 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/YOYO-AI/Qwen2.5-14B-YOYO-1010/97f26b20-db66-4a30-ba2a-c18a31081271.json b/data/hfopenllm_v2/YOYO-AI/Qwen2.5-14B-YOYO-1010/97f26b20-db66-4a30-ba2a-c18a31081271.json deleted file mode 100644 index 2b3d5ce24..000000000 --- a/data/hfopenllm_v2/YOYO-AI/Qwen2.5-14B-YOYO-1010/97f26b20-db66-4a30-ba2a-c18a31081271.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/YOYO-AI_Qwen2.5-14B-YOYO-1010/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-14B-YOYO-1010", - "id": "YOYO-AI/Qwen2.5-14B-YOYO-1010", - "developer": "YOYO-AI", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - 
"hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7905 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6406 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3163 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4181 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4944 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/YOYO-AI/Qwen2.5-14B-YOYO-SCE/85f9ccda-8c47-4fa1-9d47-e9da4730b077.json b/data/hfopenllm_v2/YOYO-AI/Qwen2.5-14B-YOYO-SCE/85f9ccda-8c47-4fa1-9d47-e9da4730b077.json deleted file mode 100644 index b49788cb1..000000000 --- a/data/hfopenllm_v2/YOYO-AI/Qwen2.5-14B-YOYO-SCE/85f9ccda-8c47-4fa1-9d47-e9da4730b077.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/YOYO-AI_Qwen2.5-14B-YOYO-SCE/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-14B-YOYO-SCE", - "id": "YOYO-AI/Qwen2.5-14B-YOYO-SCE", - "developer": "YOYO-AI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5844 - } - }, - { 
- "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6489 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4615 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3742 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4704 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5381 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/YOYO-AI/Qwen2.5-14B-YOYO-V4-p1/2a57d6f4-643b-4b30-8d67-03032d454887.json b/data/hfopenllm_v2/YOYO-AI/Qwen2.5-14B-YOYO-V4-p1/2a57d6f4-643b-4b30-8d67-03032d454887.json deleted file mode 100644 index 84039bdda..000000000 --- a/data/hfopenllm_v2/YOYO-AI/Qwen2.5-14B-YOYO-V4-p1/2a57d6f4-643b-4b30-8d67-03032d454887.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/YOYO-AI_Qwen2.5-14B-YOYO-V4-p1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-14B-YOYO-V4-p1", - "id": "YOYO-AI/Qwen2.5-14B-YOYO-V4-p1", - "developer": "YOYO-AI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8203 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6516 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5332 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3456 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4194 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.502 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/YOYO-AI/Qwen2.5-14B-YOYO-V4-p2/d333f360-c1c3-4916-8480-4a1fc490875a.json b/data/hfopenllm_v2/YOYO-AI/Qwen2.5-14B-YOYO-V4-p2/d333f360-c1c3-4916-8480-4a1fc490875a.json deleted file mode 100644 index 54ba3e37e..000000000 --- a/data/hfopenllm_v2/YOYO-AI/Qwen2.5-14B-YOYO-V4-p2/d333f360-c1c3-4916-8480-4a1fc490875a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/YOYO-AI_Qwen2.5-14B-YOYO-V4-p2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-14B-YOYO-V4-p2", - "id": "YOYO-AI/Qwen2.5-14B-YOYO-V4-p2", - "developer": "YOYO-AI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8048 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6339 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": 
"hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5166 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3272 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4435 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4968 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/YOYO-AI/Qwen2.5-14B-YOYO-V4/37a41261-a7b0-44b2-916f-770cdfa0ad39.json b/data/hfopenllm_v2/YOYO-AI/Qwen2.5-14B-YOYO-V4/37a41261-a7b0-44b2-916f-770cdfa0ad39.json deleted file mode 100644 index 40718d9ee..000000000 --- a/data/hfopenllm_v2/YOYO-AI/Qwen2.5-14B-YOYO-V4/37a41261-a7b0-44b2-916f-770cdfa0ad39.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/YOYO-AI_Qwen2.5-14B-YOYO-V4/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-14B-YOYO-V4", - "id": "YOYO-AI/Qwen2.5-14B-YOYO-V4", - "developer": "YOYO-AI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8398 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.649 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.5347 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3221 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4115 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.517 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/YOYO-AI/Qwen2.5-14B-YOYO-latest-V2/c46cd6cc-b56d-44c5-a03c-b49381ba3462.json b/data/hfopenllm_v2/YOYO-AI/Qwen2.5-14B-YOYO-latest-V2/c46cd6cc-b56d-44c5-a03c-b49381ba3462.json deleted file mode 100644 index 285272c1d..000000000 --- a/data/hfopenllm_v2/YOYO-AI/Qwen2.5-14B-YOYO-latest-V2/c46cd6cc-b56d-44c5-a03c-b49381ba3462.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/YOYO-AI_Qwen2.5-14B-YOYO-latest-V2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-14B-YOYO-latest-V2", - "id": "YOYO-AI/Qwen2.5-14B-YOYO-latest-V2", - "developer": "YOYO-AI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7771 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6299 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5159 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.354 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4299 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5224 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/YOYO-AI/Qwen2.5-14B-YOYO-latest/612b6226-c25d-42e0-bcd7-be7faa844530.json b/data/hfopenllm_v2/YOYO-AI/Qwen2.5-14B-YOYO-latest/612b6226-c25d-42e0-bcd7-be7faa844530.json deleted file mode 100644 index 045738a4d..000000000 --- a/data/hfopenllm_v2/YOYO-AI/Qwen2.5-14B-YOYO-latest/612b6226-c25d-42e0-bcd7-be7faa844530.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/YOYO-AI_Qwen2.5-14B-YOYO-latest/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-14B-YOYO-latest", - "id": "YOYO-AI/Qwen2.5-14B-YOYO-latest", - "developer": "YOYO-AI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5911 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6656 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4418 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3826 - } - }, - { - "evaluation_name": "MUSR", - 
"source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4691 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5371 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/YOYO-AI/Qwen2.5-14B-it-restore/2fc7a4d6-88e0-4f11-9110-dc53942870a4.json b/data/hfopenllm_v2/YOYO-AI/Qwen2.5-14B-it-restore/2fc7a4d6-88e0-4f11-9110-dc53942870a4.json deleted file mode 100644 index 69cc906a3..000000000 --- a/data/hfopenllm_v2/YOYO-AI/Qwen2.5-14B-it-restore/2fc7a4d6-88e0-4f11-9110-dc53942870a4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/YOYO-AI_Qwen2.5-14B-it-restore/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-14B-it-restore", - "id": "YOYO-AI/Qwen2.5-14B-it-restore", - "developer": "YOYO-AI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8209 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6388 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.537 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3372 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4087 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.49 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/YOYO-AI/Qwen2.5-7B-it-restore/34665752-58d8-48ee-81a6-f1a068c23026.json b/data/hfopenllm_v2/YOYO-AI/Qwen2.5-7B-it-restore/34665752-58d8-48ee-81a6-f1a068c23026.json deleted file mode 100644 index 8d8fd9130..000000000 --- a/data/hfopenllm_v2/YOYO-AI/Qwen2.5-7B-it-restore/34665752-58d8-48ee-81a6-f1a068c23026.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/YOYO-AI_Qwen2.5-7B-it-restore/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-7B-it-restore", - "id": "YOYO-AI/Qwen2.5-7B-it-restore", - "developer": "YOYO-AI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.613 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7531 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5407 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3012 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4007 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4288 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/YOYO-AI/Qwen2.5-Coder-14B-YOYO-1010/cc0767b5-4aaa-4418-8f68-72a721323e9c.json b/data/hfopenllm_v2/YOYO-AI/Qwen2.5-Coder-14B-YOYO-1010/cc0767b5-4aaa-4418-8f68-72a721323e9c.json deleted file mode 100644 index 72afbd4ba..000000000 --- a/data/hfopenllm_v2/YOYO-AI/Qwen2.5-Coder-14B-YOYO-1010/cc0767b5-4aaa-4418-8f68-72a721323e9c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/YOYO-AI_Qwen2.5-Coder-14B-YOYO-1010/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-Coder-14B-YOYO-1010", - "id": "YOYO-AI/Qwen2.5-Coder-14B-YOYO-1010", - "developer": "YOYO-AI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5336 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6187 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3218 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3523 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4422 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4075 
- } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/YOYO-AI/ZYH-LLM-Qwen2.5-14B-V2/ea507a41-1654-4515-94cc-ce2e38800c61.json b/data/hfopenllm_v2/YOYO-AI/ZYH-LLM-Qwen2.5-14B-V2/ea507a41-1654-4515-94cc-ce2e38800c61.json deleted file mode 100644 index 6a8e45bb1..000000000 --- a/data/hfopenllm_v2/YOYO-AI/ZYH-LLM-Qwen2.5-14B-V2/ea507a41-1654-4515-94cc-ce2e38800c61.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/YOYO-AI_ZYH-LLM-Qwen2.5-14B-V2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ZYH-LLM-Qwen2.5-14B-V2", - "id": "YOYO-AI/ZYH-LLM-Qwen2.5-14B-V2", - "developer": "YOYO-AI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5071 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6452 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3542 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3792 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4689 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5372 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/YOYO-AI/ZYH-LLM-Qwen2.5-14B-V3/c44e773f-4cca-4780-bdd4-f486e65c18e0.json b/data/hfopenllm_v2/YOYO-AI/ZYH-LLM-Qwen2.5-14B-V3/c44e773f-4cca-4780-bdd4-f486e65c18e0.json 
deleted file mode 100644 index e0448d4a8..000000000 --- a/data/hfopenllm_v2/YOYO-AI/ZYH-LLM-Qwen2.5-14B-V3/c44e773f-4cca-4780-bdd4-f486e65c18e0.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/YOYO-AI_ZYH-LLM-Qwen2.5-14B-V3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ZYH-LLM-Qwen2.5-14B-V3", - "id": "YOYO-AI/ZYH-LLM-Qwen2.5-14B-V3", - "developer": "YOYO-AI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8578 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6359 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5272 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3322 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4022 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4881 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/YOYO-AI/ZYH-LLM-Qwen2.5-14B-V4/f8a46bda-d53b-484e-8832-7939f7d0762d.json b/data/hfopenllm_v2/YOYO-AI/ZYH-LLM-Qwen2.5-14B-V4/f8a46bda-d53b-484e-8832-7939f7d0762d.json deleted file mode 100644 index 4f4bd31c8..000000000 --- a/data/hfopenllm_v2/YOYO-AI/ZYH-LLM-Qwen2.5-14B-V4/f8a46bda-d53b-484e-8832-7939f7d0762d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/YOYO-AI_ZYH-LLM-Qwen2.5-14B-V4/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ZYH-LLM-Qwen2.5-14B-V4", - "id": "YOYO-AI/ZYH-LLM-Qwen2.5-14B-V4", - "developer": "YOYO-AI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8365 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6515 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5393 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3146 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4434 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5204 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/YOYO-AI/ZYH-LLM-Qwen2.5-14B/c3968a2d-4a9a-4f62-8bea-a3b4b6dcd378.json b/data/hfopenllm_v2/YOYO-AI/ZYH-LLM-Qwen2.5-14B/c3968a2d-4a9a-4f62-8bea-a3b4b6dcd378.json deleted file mode 100644 index 7cecb6db8..000000000 --- a/data/hfopenllm_v2/YOYO-AI/ZYH-LLM-Qwen2.5-14B/c3968a2d-4a9a-4f62-8bea-a3b4b6dcd378.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/YOYO-AI_ZYH-LLM-Qwen2.5-14B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - 
"evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ZYH-LLM-Qwen2.5-14B", - "id": "YOYO-AI/ZYH-LLM-Qwen2.5-14B", - "developer": "YOYO-AI", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5941 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6644 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4116 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3859 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4757 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5351 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Yash21/TinyYi-7B-Test/da18242c-d6bb-4a0a-a2f9-2e42099f4e8a.json b/data/hfopenllm_v2/Yash21/TinyYi-7B-Test/da18242c-d6bb-4a0a-a2f9-2e42099f4e8a.json deleted file mode 100644 index f956d0abc..000000000 --- a/data/hfopenllm_v2/Yash21/TinyYi-7B-Test/da18242c-d6bb-4a0a-a2f9-2e42099f4e8a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Yash21_TinyYi-7B-Test/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "TinyYi-7B-Test", - "id": "Yash21/TinyYi-7B-Test", - "developer": "Yash21", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - 
"params_billions": 6.061 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1856 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.291 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2643 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3364 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1091 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Youlln/1PARAMMYL-8B-ModelStock/ac078124-85d9-4715-bf7c-1428b1063732.json b/data/hfopenllm_v2/Youlln/1PARAMMYL-8B-ModelStock/ac078124-85d9-4715-bf7c-1428b1063732.json deleted file mode 100644 index 0dfc3106e..000000000 --- a/data/hfopenllm_v2/Youlln/1PARAMMYL-8B-ModelStock/ac078124-85d9-4715-bf7c-1428b1063732.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Youlln_1PARAMMYL-8B-ModelStock/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "1PARAMMYL-8B-ModelStock", - "id": "Youlln/1PARAMMYL-8B-ModelStock", - "developer": "Youlln", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5371 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5216 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1488 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3238 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4409 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Youlln/2PRYMMAL-Yi1.5-6B-SLERP/9c1dcd75-8491-4890-ac6f-000868099a3e.json b/data/hfopenllm_v2/Youlln/2PRYMMAL-Yi1.5-6B-SLERP/9c1dcd75-8491-4890-ac6f-000868099a3e.json deleted file mode 100644 index 9c066ab38..000000000 --- a/data/hfopenllm_v2/Youlln/2PRYMMAL-Yi1.5-6B-SLERP/9c1dcd75-8491-4890-ac6f-000868099a3e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Youlln_2PRYMMAL-Yi1.5-6B-SLERP/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "2PRYMMAL-Yi1.5-6B-SLERP", - "id": "Youlln/2PRYMMAL-Yi1.5-6B-SLERP", - "developer": "Youlln", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 6.061 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2826 - } - }, - { - "evaluation_name": "BBH", - 
"source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4665 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1133 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.307 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4756 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.317 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Youlln/3PRYMMAL-PHI3-3B-SLERP/7850fc57-49c7-4124-b7c6-e1e7bb2bc726.json b/data/hfopenllm_v2/Youlln/3PRYMMAL-PHI3-3B-SLERP/7850fc57-49c7-4124-b7c6-e1e7bb2bc726.json deleted file mode 100644 index d99b33828..000000000 --- a/data/hfopenllm_v2/Youlln/3PRYMMAL-PHI3-3B-SLERP/7850fc57-49c7-4124-b7c6-e1e7bb2bc726.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Youlln_3PRYMMAL-PHI3-3B-SLERP/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "3PRYMMAL-PHI3-3B-SLERP", - "id": "Youlln/3PRYMMAL-PHI3-3B-SLERP", - "developer": "Youlln", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Phi3ForCausalLM", - "params_billions": 3.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3656 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 
0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5422 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1715 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3263 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4648 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4002 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Youlln/4PRYMMAL-GEMMA2-9B-SLERP/8f38374e-f373-4639-9278-24441ebd0325.json b/data/hfopenllm_v2/Youlln/4PRYMMAL-GEMMA2-9B-SLERP/8f38374e-f373-4639-9278-24441ebd0325.json deleted file mode 100644 index 676bdf378..000000000 --- a/data/hfopenllm_v2/Youlln/4PRYMMAL-GEMMA2-9B-SLERP/8f38374e-f373-4639-9278-24441ebd0325.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Youlln_4PRYMMAL-GEMMA2-9B-SLERP/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "4PRYMMAL-GEMMA2-9B-SLERP", - "id": "Youlln/4PRYMMAL-GEMMA2-9B-SLERP", - "developer": "Youlln", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 9.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2714 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5923 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": 
"DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0906 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3305 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4672 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.421 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Youlln/ECE-MIRAGE-1-12B/c007938e-3427-4896-8493-1500abdfbd2b.json b/data/hfopenllm_v2/Youlln/ECE-MIRAGE-1-12B/c007938e-3427-4896-8493-1500abdfbd2b.json deleted file mode 100644 index 7f4f92c00..000000000 --- a/data/hfopenllm_v2/Youlln/ECE-MIRAGE-1-12B/c007938e-3427-4896-8493-1500abdfbd2b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Youlln_ECE-MIRAGE-1-12B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ECE-MIRAGE-1-12B", - "id": "Youlln/ECE-MIRAGE-1-12B", - "developer": "Youlln", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 15.21 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.207 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3011 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - 
"evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2634 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3219 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.111 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Youlln/ECE-MIRAGE-1-15B/df81dc0d-6c72-49e9-862b-02e9b6642cb6.json b/data/hfopenllm_v2/Youlln/ECE-MIRAGE-1-15B/df81dc0d-6c72-49e9-862b-02e9b6642cb6.json deleted file mode 100644 index 331e6d3a6..000000000 --- a/data/hfopenllm_v2/Youlln/ECE-MIRAGE-1-15B/df81dc0d-6c72-49e9-862b-02e9b6642cb6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Youlln_ECE-MIRAGE-1-15B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ECE-MIRAGE-1-15B", - "id": "Youlln/ECE-MIRAGE-1-15B", - "developer": "Youlln", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 15.21 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.207 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3011 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.2634 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3219 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.111 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Youlln/ECE-PRYMMAL-0.5B-FT-V3-MUSR/46c96d8e-568c-48f8-a74b-9dd4b4195037.json b/data/hfopenllm_v2/Youlln/ECE-PRYMMAL-0.5B-FT-V3-MUSR/46c96d8e-568c-48f8-a74b-9dd4b4195037.json deleted file mode 100644 index 89fe56fd5..000000000 --- a/data/hfopenllm_v2/Youlln/ECE-PRYMMAL-0.5B-FT-V3-MUSR/46c96d8e-568c-48f8-a74b-9dd4b4195037.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Youlln_ECE-PRYMMAL-0.5B-FT-V3-MUSR/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ECE-PRYMMAL-0.5B-FT-V3-MUSR", - "id": "Youlln/ECE-PRYMMAL-0.5B-FT-V3-MUSR", - "developer": "Youlln", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.494 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1533 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3041 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0242 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2492 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.366 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1645 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Youlln/ECE-PRYMMAL-0.5B-FT-V3/1f4f7181-8a81-49f4-9e81-925d5d69a37c.json b/data/hfopenllm_v2/Youlln/ECE-PRYMMAL-0.5B-FT-V3/1f4f7181-8a81-49f4-9e81-925d5d69a37c.json deleted file mode 100644 index 19716554b..000000000 --- a/data/hfopenllm_v2/Youlln/ECE-PRYMMAL-0.5B-FT-V3/1f4f7181-8a81-49f4-9e81-925d5d69a37c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Youlln_ECE-PRYMMAL-0.5B-FT-V3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ECE-PRYMMAL-0.5B-FT-V3", - "id": "Youlln/ECE-PRYMMAL-0.5B-FT-V3", - "developer": "Youlln", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.494 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1642 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3093 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.003 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2576 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3644 - } - }, - { - "evaluation_name": 
"MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1161 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Youlln/ECE-PRYMMAL-0.5B-FT-V4-MUSR/3ea343b6-93f6-4c61-a164-3db95d13cbdf.json b/data/hfopenllm_v2/Youlln/ECE-PRYMMAL-0.5B-FT-V4-MUSR/3ea343b6-93f6-4c61-a164-3db95d13cbdf.json deleted file mode 100644 index 8d60c846e..000000000 --- a/data/hfopenllm_v2/Youlln/ECE-PRYMMAL-0.5B-FT-V4-MUSR/3ea343b6-93f6-4c61-a164-3db95d13cbdf.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Youlln_ECE-PRYMMAL-0.5B-FT-V4-MUSR/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ECE-PRYMMAL-0.5B-FT-V4-MUSR", - "id": "Youlln/ECE-PRYMMAL-0.5B-FT-V4-MUSR", - "developer": "Youlln", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.494 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1138 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3038 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0121 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2701 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3529 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1321 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Youlln/ECE-PRYMMAL-0.5B-SLERP-V2/a9ea8bb5-05fc-4da3-8e00-f53ab8ea6af5.json b/data/hfopenllm_v2/Youlln/ECE-PRYMMAL-0.5B-SLERP-V2/a9ea8bb5-05fc-4da3-8e00-f53ab8ea6af5.json deleted file mode 100644 index 76582f768..000000000 --- a/data/hfopenllm_v2/Youlln/ECE-PRYMMAL-0.5B-SLERP-V2/a9ea8bb5-05fc-4da3-8e00-f53ab8ea6af5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Youlln_ECE-PRYMMAL-0.5B-SLERP-V2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ECE-PRYMMAL-0.5B-SLERP-V2", - "id": "Youlln/ECE-PRYMMAL-0.5B-SLERP-V2", - "developer": "Youlln", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.494 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1612 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2935 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0008 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2743 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3831 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1095 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/Youlln/ECE-PRYMMAL-0.5B-SLERP-V3/0ea74ce5-43c9-43eb-92bc-3d928062d9e0.json b/data/hfopenllm_v2/Youlln/ECE-PRYMMAL-0.5B-SLERP-V3/0ea74ce5-43c9-43eb-92bc-3d928062d9e0.json deleted file mode 100644 index cf85a25de..000000000 --- a/data/hfopenllm_v2/Youlln/ECE-PRYMMAL-0.5B-SLERP-V3/0ea74ce5-43c9-43eb-92bc-3d928062d9e0.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Youlln_ECE-PRYMMAL-0.5B-SLERP-V3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ECE-PRYMMAL-0.5B-SLERP-V3", - "id": "Youlln/ECE-PRYMMAL-0.5B-SLERP-V3", - "developer": "Youlln", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.494 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.167 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2938 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2517 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3541 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1087 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Youlln/ECE-PRYMMAL-YL-1B-SLERP-V1/6896faa7-7204-4091-8f4e-9cc0b53d673a.json b/data/hfopenllm_v2/Youlln/ECE-PRYMMAL-YL-1B-SLERP-V1/6896faa7-7204-4091-8f4e-9cc0b53d673a.json deleted file mode 100644 index 
af72a7152..000000000 --- a/data/hfopenllm_v2/Youlln/ECE-PRYMMAL-YL-1B-SLERP-V1/6896faa7-7204-4091-8f4e-9cc0b53d673a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Youlln_ECE-PRYMMAL-YL-1B-SLERP-V1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ECE-PRYMMAL-YL-1B-SLERP-V1", - "id": "Youlln/ECE-PRYMMAL-YL-1B-SLERP-V1", - "developer": "Youlln", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.544 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3251 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4209 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1073 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2911 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4266 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2936 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Youlln/ECE-PRYMMAL-YL-1B-SLERP-V2/88064453-fd8c-4bd9-adf1-39f43972bec1.json b/data/hfopenllm_v2/Youlln/ECE-PRYMMAL-YL-1B-SLERP-V2/88064453-fd8c-4bd9-adf1-39f43972bec1.json deleted file mode 100644 index b5b27cb03..000000000 --- a/data/hfopenllm_v2/Youlln/ECE-PRYMMAL-YL-1B-SLERP-V2/88064453-fd8c-4bd9-adf1-39f43972bec1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/Youlln_ECE-PRYMMAL-YL-1B-SLERP-V2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ECE-PRYMMAL-YL-1B-SLERP-V2", - "id": "Youlln/ECE-PRYMMAL-YL-1B-SLERP-V2", - "developer": "Youlln", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.544 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3251 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4209 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1073 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2911 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4266 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2936 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Youlln/ECE-PRYMMAL-YL-7B-SLERP-V4/a18ade45-acba-4059-b969-445e529a82e2.json b/data/hfopenllm_v2/Youlln/ECE-PRYMMAL-YL-7B-SLERP-V4/a18ade45-acba-4059-b969-445e529a82e2.json deleted file mode 100644 index b630ec36d..000000000 --- a/data/hfopenllm_v2/Youlln/ECE-PRYMMAL-YL-7B-SLERP-V4/a18ade45-acba-4059-b969-445e529a82e2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Youlln_ECE-PRYMMAL-YL-7B-SLERP-V4/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - 
"source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ECE-PRYMMAL-YL-7B-SLERP-V4", - "id": "Youlln/ECE-PRYMMAL-YL-7B-SLERP-V4", - "developer": "Youlln", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.251 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.377 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0536 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2651 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3745 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2132 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Youlln/ECE-PRYMMAL0.5-FT/6c0e4132-71e7-44af-95fc-83b0a6be2a82.json b/data/hfopenllm_v2/Youlln/ECE-PRYMMAL0.5-FT/6c0e4132-71e7-44af-95fc-83b0a6be2a82.json deleted file mode 100644 index 67a332158..000000000 --- a/data/hfopenllm_v2/Youlln/ECE-PRYMMAL0.5-FT/6c0e4132-71e7-44af-95fc-83b0a6be2a82.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Youlln_ECE-PRYMMAL0.5-FT/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ECE-PRYMMAL0.5-FT", - "id": "Youlln/ECE-PRYMMAL0.5-FT", - "developer": "Youlln", - "inference_platform": "unknown", - "additional_details": { 
- "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.494 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1851 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3132 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0234 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2559 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3301 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1477 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Youlln/ECE-PRYMMAL0.5B-Youri/5d9ab422-4f4f-460d-bd39-51266b43d7e5.json b/data/hfopenllm_v2/Youlln/ECE-PRYMMAL0.5B-Youri/5d9ab422-4f4f-460d-bd39-51266b43d7e5.json deleted file mode 100644 index 21bba30b8..000000000 --- a/data/hfopenllm_v2/Youlln/ECE-PRYMMAL0.5B-Youri/5d9ab422-4f4f-460d-bd39-51266b43d7e5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Youlln_ECE-PRYMMAL0.5B-Youri/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ECE-PRYMMAL0.5B-Youri", - "id": "Youlln/ECE-PRYMMAL0.5B-Youri", - "developer": "Youlln", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": 
"google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1446 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2817 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2433 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3697 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1095 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Youlln/ECE-PRYMMAL1B-FT-V1/cda03c45-0782-40cc-a17d-67d808657b83.json b/data/hfopenllm_v2/Youlln/ECE-PRYMMAL1B-FT-V1/cda03c45-0782-40cc-a17d-67d808657b83.json deleted file mode 100644 index 1633dc538..000000000 --- a/data/hfopenllm_v2/Youlln/ECE-PRYMMAL1B-FT-V1/cda03c45-0782-40cc-a17d-67d808657b83.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Youlln_ECE-PRYMMAL1B-FT-V1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ECE-PRYMMAL1B-FT-V1", - "id": "Youlln/ECE-PRYMMAL1B-FT-V1", - "developer": "Youlln", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.544 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2144 - } - }, - { - "evaluation_name": 
"BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4033 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0642 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2785 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3417 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2743 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Youlln/ECE-Qwen0.5B-FT-V2/50f5451b-41c4-4ba5-8bee-ee8a2deb7e79.json b/data/hfopenllm_v2/Youlln/ECE-Qwen0.5B-FT-V2/50f5451b-41c4-4ba5-8bee-ee8a2deb7e79.json deleted file mode 100644 index 01985d185..000000000 --- a/data/hfopenllm_v2/Youlln/ECE-Qwen0.5B-FT-V2/50f5451b-41c4-4ba5-8bee-ee8a2deb7e79.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Youlln_ECE-Qwen0.5B-FT-V2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ECE-Qwen0.5B-FT-V2", - "id": "Youlln/ECE-Qwen0.5B-FT-V2", - "developer": "Youlln", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.494 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2526 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.329 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0204 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2668 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3063 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1666 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Youlln/ECE.EIFFEIL.ia-0.5B-SLERP/cf758994-6e94-434d-bf68-74cca188b5e8.json b/data/hfopenllm_v2/Youlln/ECE.EIFFEIL.ia-0.5B-SLERP/cf758994-6e94-434d-bf68-74cca188b5e8.json deleted file mode 100644 index 69c6a2377..000000000 --- a/data/hfopenllm_v2/Youlln/ECE.EIFFEIL.ia-0.5B-SLERP/cf758994-6e94-434d-bf68-74cca188b5e8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Youlln_ECE.EIFFEIL.ia-0.5B-SLERP/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ECE.EIFFEIL.ia-0.5B-SLERP", - "id": "Youlln/ECE.EIFFEIL.ia-0.5B-SLERP", - "developer": "Youlln", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2561 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3306 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": 
"DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0597 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2651 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3102 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1903 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/YoungPanda/qwenqwen/611f9549-0788-44e9-8125-18df06cd80d6.json b/data/hfopenllm_v2/YoungPanda/qwenqwen/611f9549-0788-44e9-8125-18df06cd80d6.json deleted file mode 100644 index 92e685942..000000000 --- a/data/hfopenllm_v2/YoungPanda/qwenqwen/611f9549-0788-44e9-8125-18df06cd80d6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/YoungPanda_qwenqwen/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "qwenqwen", - "id": "YoungPanda/qwenqwen", - "developer": "YoungPanda", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2MoeForCausalLM", - "params_billions": 14.316 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1264 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3379 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0355 - } - }, - { - "evaluation_name": 
"GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.25 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3434 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1168 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Yuma42/KangalKhan-RawRuby-7B/59cf23ba-027d-4bac-a0e1-526376396b4d.json b/data/hfopenllm_v2/Yuma42/KangalKhan-RawRuby-7B/59cf23ba-027d-4bac-a0e1-526376396b4d.json deleted file mode 100644 index 2c57a54ac..000000000 --- a/data/hfopenllm_v2/Yuma42/KangalKhan-RawRuby-7B/59cf23ba-027d-4bac-a0e1-526376396b4d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Yuma42_KangalKhan-RawRuby-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "KangalKhan-RawRuby-7B", - "id": "Yuma42/KangalKhan-RawRuby-7B", - "developer": "Yuma42", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5477 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4755 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0665 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2878 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.395 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3023 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Yuma42/Llama3.1-IgneousIguana-8B/1f02bbd3-ddaf-4db6-b7f8-31bad8ffac66.json b/data/hfopenllm_v2/Yuma42/Llama3.1-IgneousIguana-8B/1f02bbd3-ddaf-4db6-b7f8-31bad8ffac66.json deleted file mode 100644 index 168e27964..000000000 --- a/data/hfopenllm_v2/Yuma42/Llama3.1-IgneousIguana-8B/1f02bbd3-ddaf-4db6-b7f8-31bad8ffac66.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Yuma42_Llama3.1-IgneousIguana-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama3.1-IgneousIguana-8B", - "id": "Yuma42/Llama3.1-IgneousIguana-8B", - "developer": "Yuma42", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8133 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5191 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2198 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3104 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - 
}, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4203 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3974 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Yuma42/Llama3.1-SuperHawk-8B/1e737e28-d926-43e8-9e4c-e39fa91d7977.json b/data/hfopenllm_v2/Yuma42/Llama3.1-SuperHawk-8B/1e737e28-d926-43e8-9e4c-e39fa91d7977.json deleted file mode 100644 index 75ea95504..000000000 --- a/data/hfopenllm_v2/Yuma42/Llama3.1-SuperHawk-8B/1e737e28-d926-43e8-9e4c-e39fa91d7977.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Yuma42_Llama3.1-SuperHawk-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama3.1-SuperHawk-8B", - "id": "Yuma42/Llama3.1-SuperHawk-8B", - "developer": "Yuma42", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7986 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.52 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2349 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3129 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4084 - } - }, - { - "evaluation_name": 
"MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3945 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/Z1-Coder/Z1-Coder-7B/43ef8eee-5d8a-47e7-ac71-1a898421370a.json b/data/hfopenllm_v2/Z1-Coder/Z1-Coder-7B/43ef8eee-5d8a-47e7-ac71-1a898421370a.json deleted file mode 100644 index 49916e11f..000000000 --- a/data/hfopenllm_v2/Z1-Coder/Z1-Coder-7B/43ef8eee-5d8a-47e7-ac71-1a898421370a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/Z1-Coder_Z1-Coder-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Z1-Coder-7B", - "id": "Z1-Coder/Z1-Coder-7B", - "developer": "Z1-Coder", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.613 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3215 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4842 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3248 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2727 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3622 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - 
}, - "score_details": { - "score": 0.3759 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ZHLiu627/zephyr-7b-gemma-dpo-avg/d8d03c71-942f-4aff-8a5e-5c265c639b44.json b/data/hfopenllm_v2/ZHLiu627/zephyr-7b-gemma-dpo-avg/d8d03c71-942f-4aff-8a5e-5c265c639b44.json deleted file mode 100644 index 8775cade5..000000000 --- a/data/hfopenllm_v2/ZHLiu627/zephyr-7b-gemma-dpo-avg/d8d03c71-942f-4aff-8a5e-5c265c639b44.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ZHLiu627_zephyr-7b-gemma-dpo-avg/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "zephyr-7b-gemma-dpo-avg", - "id": "ZHLiu627/zephyr-7b-gemma-dpo-avg", - "developer": "ZHLiu627", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "GemmaForCausalLM", - "params_billions": 8.538 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.309 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4149 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0453 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2785 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4107 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2851 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ZHLiu627/zephyr-7b-gemma-rpo-avg/96262938-1146-4993-92a1-a2ddb2519f8a.json 
b/data/hfopenllm_v2/ZHLiu627/zephyr-7b-gemma-rpo-avg/96262938-1146-4993-92a1-a2ddb2519f8a.json deleted file mode 100644 index 399559c0f..000000000 --- a/data/hfopenllm_v2/ZHLiu627/zephyr-7b-gemma-rpo-avg/96262938-1146-4993-92a1-a2ddb2519f8a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ZHLiu627_zephyr-7b-gemma-rpo-avg/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "zephyr-7b-gemma-rpo-avg", - "id": "ZHLiu627/zephyr-7b-gemma-rpo-avg", - "developer": "ZHLiu627", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "GemmaForCausalLM", - "params_billions": 8.538 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3006 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4183 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0498 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2768 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4081 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2831 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ZeroXClem/L3-Aspire-Heart-Matrix-8B/292d7cfb-3e3c-47d8-8cca-33507f9ff081.json b/data/hfopenllm_v2/ZeroXClem/L3-Aspire-Heart-Matrix-8B/292d7cfb-3e3c-47d8-8cca-33507f9ff081.json deleted file mode 100644 index 8a789283c..000000000 --- 
a/data/hfopenllm_v2/ZeroXClem/L3-Aspire-Heart-Matrix-8B/292d7cfb-3e3c-47d8-8cca-33507f9ff081.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ZeroXClem_L3-Aspire-Heart-Matrix-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "L3-Aspire-Heart-Matrix-8B", - "id": "ZeroXClem/L3-Aspire-Heart-Matrix-8B", - "developer": "ZeroXClem", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4834 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5384 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1828 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3247 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4187 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3785 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ZeroXClem/Llama-3.1-8B-AthenaSky-MegaMix/3f29c10f-57ef-435b-85df-2cae30ae72fa.json b/data/hfopenllm_v2/ZeroXClem/Llama-3.1-8B-AthenaSky-MegaMix/3f29c10f-57ef-435b-85df-2cae30ae72fa.json deleted file mode 100644 index a87472533..000000000 --- a/data/hfopenllm_v2/ZeroXClem/Llama-3.1-8B-AthenaSky-MegaMix/3f29c10f-57ef-435b-85df-2cae30ae72fa.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/ZeroXClem_Llama-3.1-8B-AthenaSky-MegaMix/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.1-8B-AthenaSky-MegaMix", - "id": "ZeroXClem/Llama-3.1-8B-AthenaSky-MegaMix", - "developer": "ZeroXClem", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6301 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5163 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2795 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2777 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3538 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3504 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ZeroXClem/Llama-3.1-8B-RainbowLight-EtherealMix/d7f022fe-86cb-4e4e-a672-62c2dc8cffd3.json b/data/hfopenllm_v2/ZeroXClem/Llama-3.1-8B-RainbowLight-EtherealMix/d7f022fe-86cb-4e4e-a672-62c2dc8cffd3.json deleted file mode 100644 index eb07cdad6..000000000 --- a/data/hfopenllm_v2/ZeroXClem/Llama-3.1-8B-RainbowLight-EtherealMix/d7f022fe-86cb-4e4e-a672-62c2dc8cffd3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ZeroXClem_Llama-3.1-8B-RainbowLight-EtherealMix/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - 
"source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.1-8B-RainbowLight-EtherealMix", - "id": "ZeroXClem/Llama-3.1-8B-RainbowLight-EtherealMix", - "developer": "ZeroXClem", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4973 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5155 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1216 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2869 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3947 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.363 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ZeroXClem/Llama-3.1-8B-SpecialTitanFusion/baa35c90-c494-4dff-af28-cb549e40bed8.json b/data/hfopenllm_v2/ZeroXClem/Llama-3.1-8B-SpecialTitanFusion/baa35c90-c494-4dff-af28-cb549e40bed8.json deleted file mode 100644 index 4a6a4d956..000000000 --- a/data/hfopenllm_v2/ZeroXClem/Llama-3.1-8B-SpecialTitanFusion/baa35c90-c494-4dff-af28-cb549e40bed8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ZeroXClem_Llama-3.1-8B-SpecialTitanFusion/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - 
"model_info": { - "name": "Llama-3.1-8B-SpecialTitanFusion", - "id": "ZeroXClem/Llama-3.1-8B-SpecialTitanFusion", - "developer": "ZeroXClem", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7402 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5439 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2334 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2995 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3874 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3621 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ZeroXClem/Llama-3.1-8B-SuperNova-EtherealHermes/2fdc3186-6791-4550-ac4f-a1a5a5a1d514.json b/data/hfopenllm_v2/ZeroXClem/Llama-3.1-8B-SuperNova-EtherealHermes/2fdc3186-6791-4550-ac4f-a1a5a5a1d514.json deleted file mode 100644 index 554e92142..000000000 --- a/data/hfopenllm_v2/ZeroXClem/Llama-3.1-8B-SuperNova-EtherealHermes/2fdc3186-6791-4550-ac4f-a1a5a5a1d514.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ZeroXClem_Llama-3.1-8B-SuperNova-EtherealHermes/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.1-8B-SuperNova-EtherealHermes", - "id": "ZeroXClem/Llama-3.1-8B-SuperNova-EtherealHermes", - "developer": 
"ZeroXClem", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7339 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5244 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1745 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2928 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4066 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3745 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ZeroXClem/Llama-3.1-8B-SuperTulu-LexiNova/f687df8b-42b5-4d94-b741-1b516d9221b2.json b/data/hfopenllm_v2/ZeroXClem/Llama-3.1-8B-SuperTulu-LexiNova/f687df8b-42b5-4d94-b741-1b516d9221b2.json deleted file mode 100644 index 561402998..000000000 --- a/data/hfopenllm_v2/ZeroXClem/Llama-3.1-8B-SuperTulu-LexiNova/f687df8b-42b5-4d94-b741-1b516d9221b2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ZeroXClem_Llama-3.1-8B-SuperTulu-LexiNova/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.1-8B-SuperTulu-LexiNova", - "id": "ZeroXClem/Llama-3.1-8B-SuperTulu-LexiNova", - "developer": "ZeroXClem", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - 
"evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4165 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5079 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.253 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2861 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3971 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3368 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ZeroXClem/Qwen-2.5-Aether-SlerpFusion-7B/c3a8a952-6869-4eee-a59f-4ae33ac72986.json b/data/hfopenllm_v2/ZeroXClem/Qwen-2.5-Aether-SlerpFusion-7B/c3a8a952-6869-4eee-a59f-4ae33ac72986.json deleted file mode 100644 index 352964db5..000000000 --- a/data/hfopenllm_v2/ZeroXClem/Qwen-2.5-Aether-SlerpFusion-7B/c3a8a952-6869-4eee-a59f-4ae33ac72986.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ZeroXClem_Qwen-2.5-Aether-SlerpFusion-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen-2.5-Aether-SlerpFusion-7B", - "id": "ZeroXClem/Qwen-2.5-Aether-SlerpFusion-7B", - "developer": "ZeroXClem", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6262 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5462 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2734 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2987 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4178 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4327 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ZeroXClem/Qwen2.5-7B-CelestialHarmony-1M/a7a74117-71e4-49b2-bd65-add82c9165d8.json b/data/hfopenllm_v2/ZeroXClem/Qwen2.5-7B-CelestialHarmony-1M/a7a74117-71e4-49b2-bd65-add82c9165d8.json deleted file mode 100644 index 1324b28ff..000000000 --- a/data/hfopenllm_v2/ZeroXClem/Qwen2.5-7B-CelestialHarmony-1M/a7a74117-71e4-49b2-bd65-add82c9165d8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ZeroXClem_Qwen2.5-7B-CelestialHarmony-1M/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-7B-CelestialHarmony-1M", - "id": "ZeroXClem/Qwen2.5-7B-CelestialHarmony-1M", - "developer": "ZeroXClem", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.613 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.5944 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5431 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3474 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3188 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4595 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4387 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ZeroXClem/Qwen2.5-7B-HomerAnvita-NerdMix/04ee694c-0c89-4f25-b10f-315a24743ba2.json b/data/hfopenllm_v2/ZeroXClem/Qwen2.5-7B-HomerAnvita-NerdMix/04ee694c-0c89-4f25-b10f-315a24743ba2.json deleted file mode 100644 index 444e35d1b..000000000 --- a/data/hfopenllm_v2/ZeroXClem/Qwen2.5-7B-HomerAnvita-NerdMix/04ee694c-0c89-4f25-b10f-315a24743ba2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ZeroXClem_Qwen2.5-7B-HomerAnvita-NerdMix/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-7B-HomerAnvita-NerdMix", - "id": "ZeroXClem/Qwen2.5-7B-HomerAnvita-NerdMix", - "developer": "ZeroXClem", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7708 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": 
"SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5541 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3837 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3196 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4391 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4432 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ZeroXClem/Qwen2.5-7B-HomerCreative-Mix/47fd4acb-acc3-4f12-8af5-c425d3754c38.json b/data/hfopenllm_v2/ZeroXClem/Qwen2.5-7B-HomerCreative-Mix/47fd4acb-acc3-4f12-8af5-c425d3754c38.json deleted file mode 100644 index 3f9b0472a..000000000 --- a/data/hfopenllm_v2/ZeroXClem/Qwen2.5-7B-HomerCreative-Mix/47fd4acb-acc3-4f12-8af5-c425d3754c38.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ZeroXClem_Qwen2.5-7B-HomerCreative-Mix/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-7B-HomerCreative-Mix", - "id": "ZeroXClem/Qwen2.5-7B-HomerCreative-Mix", - "developer": "ZeroXClem", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7835 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, 
- "score_details": { - "score": 0.5548 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3565 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2995 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.435 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4447 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ZeroXClem/Qwen2.5-7B-Qandora-CySec/e19577f5-d1ba-45ad-8500-d18ae2b14440.json b/data/hfopenllm_v2/ZeroXClem/Qwen2.5-7B-Qandora-CySec/e19577f5-d1ba-45ad-8500-d18ae2b14440.json deleted file mode 100644 index b81b82415..000000000 --- a/data/hfopenllm_v2/ZeroXClem/Qwen2.5-7B-Qandora-CySec/e19577f5-d1ba-45ad-8500-d18ae2b14440.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ZeroXClem_Qwen2.5-7B-Qandora-CySec/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-7B-Qandora-CySec", - "id": "ZeroXClem/Qwen2.5-7B-Qandora-CySec", - "developer": "ZeroXClem", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6773 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.549 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": 
"DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2931 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3003 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4286 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4485 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ZeusLabs/L3-Aethora-15B-V2/e86443cd-453b-4ca0-8e7e-054764fe4bb9.json b/data/hfopenllm_v2/ZeusLabs/L3-Aethora-15B-V2/e86443cd-453b-4ca0-8e7e-054764fe4bb9.json deleted file mode 100644 index 575ec1a14..000000000 --- a/data/hfopenllm_v2/ZeusLabs/L3-Aethora-15B-V2/e86443cd-453b-4ca0-8e7e-054764fe4bb9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ZeusLabs_L3-Aethora-15B-V2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "L3-Aethora-15B-V2", - "id": "ZeusLabs/L3-Aethora-15B-V2", - "developer": "ZeusLabs", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 15.01 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7208 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5011 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0808 - 
} - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2878 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3871 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.35 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ZhangShenao/SELM-Llama-3-8B-Instruct-iter-3/24cd9977-f3fb-4619-aea1-59e1a36b2a5e.json b/data/hfopenllm_v2/ZhangShenao/SELM-Llama-3-8B-Instruct-iter-3/24cd9977-f3fb-4619-aea1-59e1a36b2a5e.json deleted file mode 100644 index 63d3d967e..000000000 --- a/data/hfopenllm_v2/ZhangShenao/SELM-Llama-3-8B-Instruct-iter-3/24cd9977-f3fb-4619-aea1-59e1a36b2a5e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ZhangShenao_SELM-Llama-3-8B-Instruct-iter-3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SELM-Llama-3-8B-Instruct-iter-3", - "id": "ZhangShenao/SELM-Llama-3-8B-Instruct-iter-3", - "developer": "ZhangShenao", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6903 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5046 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0861 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2584 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3845 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3783 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/aaditya/Llama3-OpenBioLLM-70B/1401f0d9-6f4c-41d2-819f-eb9487c5c1e6.json b/data/hfopenllm_v2/aaditya/Llama3-OpenBioLLM-70B/1401f0d9-6f4c-41d2-819f-eb9487c5c1e6.json deleted file mode 100644 index 82061858d..000000000 --- a/data/hfopenllm_v2/aaditya/Llama3-OpenBioLLM-70B/1401f0d9-6f4c-41d2-819f-eb9487c5c1e6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/aaditya_Llama3-OpenBioLLM-70B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama3-OpenBioLLM-70B", - "id": "aaditya/Llama3-OpenBioLLM-70B", - "developer": "aaditya", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 70.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7597 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6399 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1971 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.323 - } - }, - { - "evaluation_name": "MUSR", 
- "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4417 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4867 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/abacusai/Dracarys-72B-Instruct/4b1f2aab-ef92-4231-9bdd-96918b26914c.json b/data/hfopenllm_v2/abacusai/Dracarys-72B-Instruct/4b1f2aab-ef92-4231-9bdd-96918b26914c.json deleted file mode 100644 index 46a1a26ce..000000000 --- a/data/hfopenllm_v2/abacusai/Dracarys-72B-Instruct/4b1f2aab-ef92-4231-9bdd-96918b26914c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/abacusai_Dracarys-72B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Dracarys-72B-Instruct", - "id": "abacusai/Dracarys-72B-Instruct", - "developer": "abacusai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 72.706 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7856 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6944 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3965 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3909 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4558 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5456 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/abacusai/Liberated-Qwen1.5-14B/4956e127-14a1-405e-a0e0-76fe94ea727b.json b/data/hfopenllm_v2/abacusai/Liberated-Qwen1.5-14B/4956e127-14a1-405e-a0e0-76fe94ea727b.json deleted file mode 100644 index afc41b73e..000000000 --- a/data/hfopenllm_v2/abacusai/Liberated-Qwen1.5-14B/4956e127-14a1-405e-a0e0-76fe94ea727b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/abacusai_Liberated-Qwen1.5-14B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Liberated-Qwen1.5-14B", - "id": "abacusai/Liberated-Qwen1.5-14B", - "developer": "abacusai", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3631 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4948 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1601 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2836 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4175 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - 
}, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3512 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/abacusai/Llama-3-Smaug-8B/90fb6e40-88f7-4ce2-ae99-308d87e69718.json b/data/hfopenllm_v2/abacusai/Llama-3-Smaug-8B/90fb6e40-88f7-4ce2-ae99-308d87e69718.json deleted file mode 100644 index aee6436ea..000000000 --- a/data/hfopenllm_v2/abacusai/Llama-3-Smaug-8B/90fb6e40-88f7-4ce2-ae99-308d87e69718.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/abacusai_Llama-3-Smaug-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-Smaug-8B", - "id": "abacusai/Llama-3-Smaug-8B", - "developer": "abacusai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4867 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4931 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0853 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2483 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3622 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3185 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/abacusai/Smaug-34B-v0.1/cdad0f08-1c60-4493-bed0-9733894b367a.json b/data/hfopenllm_v2/abacusai/Smaug-34B-v0.1/cdad0f08-1c60-4493-bed0-9733894b367a.json deleted file mode 100644 index a0a920244..000000000 --- a/data/hfopenllm_v2/abacusai/Smaug-34B-v0.1/cdad0f08-1c60-4493-bed0-9733894b367a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/abacusai_Smaug-34B-v0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Smaug-34B-v0.1", - "id": "abacusai/Smaug-34B-v0.1", - "developer": "abacusai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 34.389 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5016 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5358 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0718 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3297 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3979 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4543 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/abacusai/Smaug-72B-v0.1/8e83b4f7-736f-4e03-8256-2a1fc421b04f.json b/data/hfopenllm_v2/abacusai/Smaug-72B-v0.1/8e83b4f7-736f-4e03-8256-2a1fc421b04f.json deleted file mode 100644 index d0562870a..000000000 --- 
a/data/hfopenllm_v2/abacusai/Smaug-72B-v0.1/8e83b4f7-736f-4e03-8256-2a1fc421b04f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/abacusai_Smaug-72B-v0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Smaug-72B-v0.1", - "id": "abacusai/Smaug-72B-v0.1", - "developer": "abacusai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 72.289 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5167 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5996 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1911 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3238 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4473 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4624 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/abacusai/Smaug-Llama-3-70B-Instruct-32K/f0d6639d-8485-4bcd-b069-046a747dfbfa.json b/data/hfopenllm_v2/abacusai/Smaug-Llama-3-70B-Instruct-32K/f0d6639d-8485-4bcd-b069-046a747dfbfa.json deleted file mode 100644 index cb2b7cd68..000000000 --- a/data/hfopenllm_v2/abacusai/Smaug-Llama-3-70B-Instruct-32K/f0d6639d-8485-4bcd-b069-046a747dfbfa.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/abacusai_Smaug-Llama-3-70B-Instruct-32K/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Smaug-Llama-3-70B-Instruct-32K", - "id": "abacusai/Smaug-Llama-3-70B-Instruct-32K", - "developer": "abacusai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 70.554 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7761 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6493 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2749 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2961 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4208 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4765 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/abacusai/Smaug-Mixtral-v0.1/d1fe36ba-04f8-4110-8c39-81d393c4cbfc.json b/data/hfopenllm_v2/abacusai/Smaug-Mixtral-v0.1/d1fe36ba-04f8-4110-8c39-81d393c4cbfc.json deleted file mode 100644 index e773b34ab..000000000 --- a/data/hfopenllm_v2/abacusai/Smaug-Mixtral-v0.1/d1fe36ba-04f8-4110-8c39-81d393c4cbfc.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/abacusai_Smaug-Mixtral-v0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - 
"source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Smaug-Mixtral-v0.1", - "id": "abacusai/Smaug-Mixtral-v0.1", - "developer": "abacusai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MixtralForCausalLM", - "params_billions": 46.703 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5554 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5162 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0952 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3012 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4298 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3352 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/abacusai/Smaug-Qwen2-72B-Instruct/5a8ab5fb-ec1e-490c-b643-e3b9d49f5d34.json b/data/hfopenllm_v2/abacusai/Smaug-Qwen2-72B-Instruct/5a8ab5fb-ec1e-490c-b643-e3b9d49f5d34.json deleted file mode 100644 index 064650e3a..000000000 --- a/data/hfopenllm_v2/abacusai/Smaug-Qwen2-72B-Instruct/5a8ab5fb-ec1e-490c-b643-e3b9d49f5d34.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/abacusai_Smaug-Qwen2-72B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Smaug-Qwen2-72B-Instruct", - "id": "abacusai/Smaug-Qwen2-72B-Instruct", - "developer": "abacusai", - 
"inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 72.706 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7825 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.691 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4131 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3616 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4401 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.519 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/abacusai/bigstral-12b-32k/de944f89-d2d4-4b01-b4b5-e7cbd1d8d1ae.json b/data/hfopenllm_v2/abacusai/bigstral-12b-32k/de944f89-d2d4-4b01-b4b5-e7cbd1d8d1ae.json deleted file mode 100644 index 0c72ea1cb..000000000 --- a/data/hfopenllm_v2/abacusai/bigstral-12b-32k/de944f89-d2d4-4b01-b4b5-e7cbd1d8d1ae.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/abacusai_bigstral-12b-32k/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "bigstral-12b-32k", - "id": "abacusai/bigstral-12b-32k", - "developer": "abacusai", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 12.476 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - 
"source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4194 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.47 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0151 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2928 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.456 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2641 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/abacusai/bigyi-15b/db96601a-2f7f-438f-915b-55fee0e0d1d1.json b/data/hfopenllm_v2/abacusai/bigyi-15b/db96601a-2f7f-438f-915b-55fee0e0d1d1.json deleted file mode 100644 index 78bf1bb7e..000000000 --- a/data/hfopenllm_v2/abacusai/bigyi-15b/db96601a-2f7f-438f-915b-55fee0e0d1d1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/abacusai_bigyi-15b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "bigyi-15b", - "id": "abacusai/bigyi-15b", - "developer": "abacusai", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 15.058 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2094 - } - }, - { - "evaluation_name": "BBH", 
- "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4345 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0295 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3096 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3538 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3003 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/abhishek/autotrain-0tmgq-5tpbg/27912f7d-7033-4b7c-b93a-af1673ce4a9b.json b/data/hfopenllm_v2/abhishek/autotrain-0tmgq-5tpbg/27912f7d-7033-4b7c-b93a-af1673ce4a9b.json deleted file mode 100644 index 7ebb312b2..000000000 --- a/data/hfopenllm_v2/abhishek/autotrain-0tmgq-5tpbg/27912f7d-7033-4b7c-b93a-af1673ce4a9b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/abhishek_autotrain-0tmgq-5tpbg/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "autotrain-0tmgq-5tpbg", - "id": "abhishek/autotrain-0tmgq-5tpbg", - "developer": "abhishek", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 0.135 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1957 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3135 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2517 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.365 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1151 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/abhishek/autotrain-0tmgq-5tpbg/da58a484-4a45-4a70-a651-031ada8023d5.json b/data/hfopenllm_v2/abhishek/autotrain-0tmgq-5tpbg/da58a484-4a45-4a70-a651-031ada8023d5.json deleted file mode 100644 index f682d2d9d..000000000 --- a/data/hfopenllm_v2/abhishek/autotrain-0tmgq-5tpbg/da58a484-4a45-4a70-a651-031ada8023d5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/abhishek_autotrain-0tmgq-5tpbg/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "autotrain-0tmgq-5tpbg", - "id": "abhishek/autotrain-0tmgq-5tpbg", - "developer": "abhishek", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 0.135 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1952 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3127 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": 
"DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0128 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2592 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3584 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1144 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/abhishek/autotrain-llama3-70b-orpo-v1/e8bd221d-8a89-4e3c-8815-0bff27574053.json b/data/hfopenllm_v2/abhishek/autotrain-llama3-70b-orpo-v1/e8bd221d-8a89-4e3c-8815-0bff27574053.json deleted file mode 100644 index 3051b9372..000000000 --- a/data/hfopenllm_v2/abhishek/autotrain-llama3-70b-orpo-v1/e8bd221d-8a89-4e3c-8815-0bff27574053.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/abhishek_autotrain-llama3-70b-orpo-v1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "autotrain-llama3-70b-orpo-v1", - "id": "abhishek/autotrain-llama3-70b-orpo-v1", - "developer": "abhishek", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 70.554 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4233 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5998 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, 
- "max_score": 1.0 - }, - "score_details": { - "score": 0.0106 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2441 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3579 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1122 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/abhishek/autotrain-llama3-70b-orpo-v2/ffc21c2a-59fb-4ad8-88a4-930879b6eba0.json b/data/hfopenllm_v2/abhishek/autotrain-llama3-70b-orpo-v2/ffc21c2a-59fb-4ad8-88a4-930879b6eba0.json deleted file mode 100644 index cfef2a069..000000000 --- a/data/hfopenllm_v2/abhishek/autotrain-llama3-70b-orpo-v2/ffc21c2a-59fb-4ad8-88a4-930879b6eba0.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/abhishek_autotrain-llama3-70b-orpo-v2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "autotrain-llama3-70b-orpo-v2", - "id": "abhishek/autotrain-llama3-70b-orpo-v2", - "developer": "abhishek", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 70.554 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5406 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5899 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2107 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": 
"Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2936 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4113 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4818 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/abhishek/autotrain-llama3-orpo-v2/1e506afa-0d08-45d6-9242-b06104aa67e8.json b/data/hfopenllm_v2/abhishek/autotrain-llama3-orpo-v2/1e506afa-0d08-45d6-9242-b06104aa67e8.json deleted file mode 100644 index 6b46e2ceb..000000000 --- a/data/hfopenllm_v2/abhishek/autotrain-llama3-orpo-v2/1e506afa-0d08-45d6-9242-b06104aa67e8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/abhishek_autotrain-llama3-orpo-v2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "autotrain-llama3-orpo-v2", - "id": "abhishek/autotrain-llama3-orpo-v2", - "developer": "abhishek", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4372 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3159 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0468 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.2668 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3792 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2218 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/abhishek/autotrain-vr4a1-e5mms/7d66bb93-cb2f-4be6-b133-1f0325be58e1.json b/data/hfopenllm_v2/abhishek/autotrain-vr4a1-e5mms/7d66bb93-cb2f-4be6-b133-1f0325be58e1.json deleted file mode 100644 index c2368e06c..000000000 --- a/data/hfopenllm_v2/abhishek/autotrain-vr4a1-e5mms/7d66bb93-cb2f-4be6-b133-1f0325be58e1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/abhishek_autotrain-vr4a1-e5mms/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "autotrain-vr4a1-e5mms", - "id": "abhishek/autotrain-vr4a1-e5mms", - "developer": "abhishek", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "?", - "params_billions": 16.061 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2142 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5001 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1412 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3196 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3891 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3667 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/abideen/MedPhi-4-14B-v1/936f3c5f-7817-4118-96c8-e4061d4560fb.json b/data/hfopenllm_v2/abideen/MedPhi-4-14B-v1/936f3c5f-7817-4118-96c8-e4061d4560fb.json deleted file mode 100644 index 73811a080..000000000 --- a/data/hfopenllm_v2/abideen/MedPhi-4-14B-v1/936f3c5f-7817-4118-96c8-e4061d4560fb.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/abideen_MedPhi-4-14B-v1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MedPhi-4-14B-v1", - "id": "abideen/MedPhi-4-14B-v1", - "developer": "abideen", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Phi3ForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6277 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6897 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2931 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.344 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4155 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": 
"TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5338 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/adamo1139/Yi-34B-200K-AEZAKMI-v2/7d36ceed-2a1b-4b20-88ae-0a609cc161e9.json b/data/hfopenllm_v2/adamo1139/Yi-34B-200K-AEZAKMI-v2/7d36ceed-2a1b-4b20-88ae-0a609cc161e9.json deleted file mode 100644 index a4e2aa16d..000000000 --- a/data/hfopenllm_v2/adamo1139/Yi-34B-200K-AEZAKMI-v2/7d36ceed-2a1b-4b20-88ae-0a609cc161e9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/adamo1139_Yi-34B-200K-AEZAKMI-v2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Yi-34B-200K-AEZAKMI-v2", - "id": "adamo1139/Yi-34B-200K-AEZAKMI-v2", - "developer": "adamo1139", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 34.389 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4555 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5384 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0566 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3322 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3886 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.4513 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/adriszmar/QAIMath-Qwen2.5-7B-TIES/77cace56-503f-4531-a4eb-0178a68cc283.json b/data/hfopenllm_v2/adriszmar/QAIMath-Qwen2.5-7B-TIES/77cace56-503f-4531-a4eb-0178a68cc283.json deleted file mode 100644 index e2903971c..000000000 --- a/data/hfopenllm_v2/adriszmar/QAIMath-Qwen2.5-7B-TIES/77cace56-503f-4531-a4eb-0178a68cc283.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/adriszmar_QAIMath-Qwen2.5-7B-TIES/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "QAIMath-Qwen2.5-7B-TIES", - "id": "adriszmar/QAIMath-Qwen2.5-7B-TIES", - "developer": "adriszmar", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1685 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3124 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0015 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2492 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3963 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1066 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/adriszmar/QAIMath-Qwen2.5-7B-TIES/9e49b710-2413-42f3-8943-bc9dbf68cb3c.json 
b/data/hfopenllm_v2/adriszmar/QAIMath-Qwen2.5-7B-TIES/9e49b710-2413-42f3-8943-bc9dbf68cb3c.json deleted file mode 100644 index dbd70e39f..000000000 --- a/data/hfopenllm_v2/adriszmar/QAIMath-Qwen2.5-7B-TIES/9e49b710-2413-42f3-8943-bc9dbf68cb3c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/adriszmar_QAIMath-Qwen2.5-7B-TIES/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "QAIMath-Qwen2.5-7B-TIES", - "id": "adriszmar/QAIMath-Qwen2.5-7B-TIES", - "developer": "adriszmar", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1746 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3126 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.245 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4096 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1087 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/aevalone/distill_qw_test/9a5b3564-97df-4661-a171-37322386ac4d.json b/data/hfopenllm_v2/aevalone/distill_qw_test/9a5b3564-97df-4661-a171-37322386ac4d.json deleted file mode 100644 index 4e684cee1..000000000 --- a/data/hfopenllm_v2/aevalone/distill_qw_test/9a5b3564-97df-4661-a171-37322386ac4d.json +++ 
/dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/aevalone_distill_qw_test/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "distill_qw_test", - "id": "aevalone/distill_qw_test", - "developer": "aevalone", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7409 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5246 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4781 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3003 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.386 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4092 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/agentlans/Gemma2-9B-AdvancedFuse/0fc0450d-cdf1-44b5-a809-202d1dd6b5e3.json b/data/hfopenllm_v2/agentlans/Gemma2-9B-AdvancedFuse/0fc0450d-cdf1-44b5-a809-202d1dd6b5e3.json deleted file mode 100644 index 65f6c605e..000000000 --- a/data/hfopenllm_v2/agentlans/Gemma2-9B-AdvancedFuse/0fc0450d-cdf1-44b5-a809-202d1dd6b5e3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/agentlans_Gemma2-9B-AdvancedFuse/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - 
"source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gemma2-9B-AdvancedFuse", - "id": "agentlans/Gemma2-9B-AdvancedFuse", - "developer": "agentlans", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 9.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1543 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5859 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1005 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3347 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4231 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/agentlans/Llama-3.2-1B-Instruct-CrashCourse12K/7f06c78c-f95e-4e50-aa57-da0579adcdae.json b/data/hfopenllm_v2/agentlans/Llama-3.2-1B-Instruct-CrashCourse12K/7f06c78c-f95e-4e50-aa57-da0579adcdae.json deleted file mode 100644 index 9800d3c2e..000000000 --- a/data/hfopenllm_v2/agentlans/Llama-3.2-1B-Instruct-CrashCourse12K/7f06c78c-f95e-4e50-aa57-da0579adcdae.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/agentlans_Llama-3.2-1B-Instruct-CrashCourse12K/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": 
"Llama-3.2-1B-Instruct-CrashCourse12K", - "id": "agentlans/Llama-3.2-1B-Instruct-CrashCourse12K", - "developer": "agentlans", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.236 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5395 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3548 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.071 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2408 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.321 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1809 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/agentlans/Llama3.1-8B-drill/06e55e47-9995-4fa2-877a-c728e9f9f1a1.json b/data/hfopenllm_v2/agentlans/Llama3.1-8B-drill/06e55e47-9995-4fa2-877a-c728e9f9f1a1.json deleted file mode 100644 index 3dd94e869..000000000 --- a/data/hfopenllm_v2/agentlans/Llama3.1-8B-drill/06e55e47-9995-4fa2-877a-c728e9f9f1a1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/agentlans_Llama3.1-8B-drill/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama3.1-8B-drill", - "id": "agentlans/Llama3.1-8B-drill", - "developer": "agentlans", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - 
"params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7652 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5016 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1715 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2676 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3672 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3776 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/agentlans/Llama3.1-Daredevilish-Instruct/39af1e0a-d1e3-4372-bc18-d07f3dff09f0.json b/data/hfopenllm_v2/agentlans/Llama3.1-Daredevilish-Instruct/39af1e0a-d1e3-4372-bc18-d07f3dff09f0.json deleted file mode 100644 index 8cbe2b1db..000000000 --- a/data/hfopenllm_v2/agentlans/Llama3.1-Daredevilish-Instruct/39af1e0a-d1e3-4372-bc18-d07f3dff09f0.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/agentlans_Llama3.1-Daredevilish-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama3.1-Daredevilish-Instruct", - "id": "agentlans/Llama3.1-Daredevilish-Instruct", - "developer": "agentlans", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - 
"hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7926 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5235 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1722 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.307 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3911 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3877 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/agentlans/Llama3.1-Daredevilish/f32d59d6-8ab9-4b7d-ad9d-f62ce6d559bd.json b/data/hfopenllm_v2/agentlans/Llama3.1-Daredevilish/f32d59d6-8ab9-4b7d-ad9d-f62ce6d559bd.json deleted file mode 100644 index 8fc5cbbdf..000000000 --- a/data/hfopenllm_v2/agentlans/Llama3.1-Daredevilish/f32d59d6-8ab9-4b7d-ad9d-f62ce6d559bd.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/agentlans_Llama3.1-Daredevilish/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama3.1-Daredevilish", - "id": "agentlans/Llama3.1-Daredevilish", - "developer": "agentlans", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 
0.6292 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5013 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1292 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3012 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4091 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3697 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/agentlans/Llama3.1-LexiHermes-SuperStorm/7ddc3aef-c6c5-4d04-8473-3b3bba219d7f.json b/data/hfopenllm_v2/agentlans/Llama3.1-LexiHermes-SuperStorm/7ddc3aef-c6c5-4d04-8473-3b3bba219d7f.json deleted file mode 100644 index 1c22faf44..000000000 --- a/data/hfopenllm_v2/agentlans/Llama3.1-LexiHermes-SuperStorm/7ddc3aef-c6c5-4d04-8473-3b3bba219d7f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/agentlans_Llama3.1-LexiHermes-SuperStorm/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama3.1-LexiHermes-SuperStorm", - "id": "agentlans/Llama3.1-LexiHermes-SuperStorm", - "developer": "agentlans", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7835 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5266 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1616 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.323 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3963 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3844 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/agentlans/Llama3.1-SuperDeepFuse-CrashCourse12K/ce80ac07-22d2-4883-ac6c-40b080e00b81.json b/data/hfopenllm_v2/agentlans/Llama3.1-SuperDeepFuse-CrashCourse12K/ce80ac07-22d2-4883-ac6c-40b080e00b81.json deleted file mode 100644 index 806df32bb..000000000 --- a/data/hfopenllm_v2/agentlans/Llama3.1-SuperDeepFuse-CrashCourse12K/ce80ac07-22d2-4883-ac6c-40b080e00b81.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/agentlans_Llama3.1-SuperDeepFuse-CrashCourse12K/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama3.1-SuperDeepFuse-CrashCourse12K", - "id": "agentlans/Llama3.1-SuperDeepFuse-CrashCourse12K", - "developer": "agentlans", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7187 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 
- }, - "score_details": { - "score": 0.5216 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1805 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3129 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4026 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3631 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/agentlans/Llama3.1-SuperDeepFuse/cbece170-f872-485f-a6c2-5db17ced73bc.json b/data/hfopenllm_v2/agentlans/Llama3.1-SuperDeepFuse/cbece170-f872-485f-a6c2-5db17ced73bc.json deleted file mode 100644 index 0f8c139a6..000000000 --- a/data/hfopenllm_v2/agentlans/Llama3.1-SuperDeepFuse/cbece170-f872-485f-a6c2-5db17ced73bc.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/agentlans_Llama3.1-SuperDeepFuse/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama3.1-SuperDeepFuse", - "id": "agentlans/Llama3.1-SuperDeepFuse", - "developer": "agentlans", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7762 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5049 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - 
}, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1828 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2743 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3699 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3775 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/agentlans/Qwen2.5-0.5B-Instruct-CrashCourse-dropout/c1fd751b-c6c3-4350-9618-f4b4840e1b69.json b/data/hfopenllm_v2/agentlans/Qwen2.5-0.5B-Instruct-CrashCourse-dropout/c1fd751b-c6c3-4350-9618-f4b4840e1b69.json deleted file mode 100644 index 8da33be6f..000000000 --- a/data/hfopenllm_v2/agentlans/Qwen2.5-0.5B-Instruct-CrashCourse-dropout/c1fd751b-c6c3-4350-9618-f4b4840e1b69.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/agentlans_Qwen2.5-0.5B-Instruct-CrashCourse-dropout/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-0.5B-Instruct-CrashCourse-dropout", - "id": "agentlans/Qwen2.5-0.5B-Instruct-CrashCourse-dropout", - "developer": "agentlans", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.494 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2949 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3312 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0423 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2634 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3342 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1608 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ahmeda335/13_outOf_32_pruned_layers_llama3.1-8b/bfd28b91-3a72-4417-b52b-804d2cbae12f.json b/data/hfopenllm_v2/ahmeda335/13_outOf_32_pruned_layers_llama3.1-8b/bfd28b91-3a72-4417-b52b-804d2cbae12f.json deleted file mode 100644 index d883b1cb1..000000000 --- a/data/hfopenllm_v2/ahmeda335/13_outOf_32_pruned_layers_llama3.1-8b/bfd28b91-3a72-4417-b52b-804d2cbae12f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ahmeda335_13_outOf_32_pruned_layers_llama3.1-8b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "13_outOf_32_pruned_layers_llama3.1-8b", - "id": "ahmeda335/13_outOf_32_pruned_layers_llama3.1-8b", - "developer": "ahmeda335", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 5.195 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1748 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2883 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - 
"evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2592 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3803 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1129 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ai21labs/Jamba-v0.1/32c26cbc-3697-47a6-bd12-18187df9dda9.json b/data/hfopenllm_v2/ai21labs/Jamba-v0.1/32c26cbc-3697-47a6-bd12-18187df9dda9.json deleted file mode 100644 index 61193fc54..000000000 --- a/data/hfopenllm_v2/ai21labs/Jamba-v0.1/32c26cbc-3697-47a6-bd12-18187df9dda9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ai21labs_Jamba-v0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Jamba-v0.1", - "id": "ai21labs/Jamba-v0.1", - "developer": "ai21labs", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "JambaForCausalLM", - "params_billions": 51.57 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2026 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3602 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0159 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - 
}, - "score_details": { - "score": 0.2685 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.359 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2492 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ai4bharat/Airavata/02280b9f-bc01-4e44-9d09-1e4ae8c0438b.json b/data/hfopenllm_v2/ai4bharat/Airavata/02280b9f-bc01-4e44-9d09-1e4ae8c0438b.json deleted file mode 100644 index 0badacdc8..000000000 --- a/data/hfopenllm_v2/ai4bharat/Airavata/02280b9f-bc01-4e44-9d09-1e4ae8c0438b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ai4bharat_Airavata/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Airavata", - "id": "ai4bharat/Airavata", - "developer": "ai4bharat", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 6.87 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0559 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3628 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0181 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2743 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3763 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1635 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/aixonlab/Aether-12b/a57d2d49-5ccf-48f5-8035-b1d480c80f40.json b/data/hfopenllm_v2/aixonlab/Aether-12b/a57d2d49-5ccf-48f5-8035-b1d480c80f40.json deleted file mode 100644 index 0edbd3648..000000000 --- a/data/hfopenllm_v2/aixonlab/Aether-12b/a57d2d49-5ccf-48f5-8035-b1d480c80f40.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/aixonlab_Aether-12b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Aether-12b", - "id": "aixonlab/Aether-12b", - "developer": "aixonlab", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2347 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5179 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1065 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3163 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3829 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.341 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/aixonlab/Grey-12b/6b5a3c69-f8dd-4952-96fc-b6e4dec1ed9d.json b/data/hfopenllm_v2/aixonlab/Grey-12b/6b5a3c69-f8dd-4952-96fc-b6e4dec1ed9d.json deleted file mode 100644 index ae2b8e520..000000000 --- a/data/hfopenllm_v2/aixonlab/Grey-12b/6b5a3c69-f8dd-4952-96fc-b6e4dec1ed9d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/aixonlab_Grey-12b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Grey-12b", - "id": "aixonlab/Grey-12b", - "developer": "aixonlab", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3968 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5699 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0982 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3003 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4516 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3779 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/aixonlab/Zara-14b-v1.2/fe0665dd-b976-4d90-b16b-6c2acfef15ff.json b/data/hfopenllm_v2/aixonlab/Zara-14b-v1.2/fe0665dd-b976-4d90-b16b-6c2acfef15ff.json deleted file mode 100644 index fff6f1b78..000000000 --- a/data/hfopenllm_v2/aixonlab/Zara-14b-v1.2/fe0665dd-b976-4d90-b16b-6c2acfef15ff.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/aixonlab_Zara-14b-v1.2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Zara-14b-v1.2", - "id": "aixonlab/Zara-14b-v1.2", - "developer": "aixonlab", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6197 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6405 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3535 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3817 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4675 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5263 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/akhadangi/Llama3.2.1B.0.01-First/8c6bdc44-fd29-45e7-b161-2c8e07ef2935.json b/data/hfopenllm_v2/akhadangi/Llama3.2.1B.0.01-First/8c6bdc44-fd29-45e7-b161-2c8e07ef2935.json deleted file mode 100644 index d89f7563d..000000000 --- 
a/data/hfopenllm_v2/akhadangi/Llama3.2.1B.0.01-First/8c6bdc44-fd29-45e7-b161-2c8e07ef2935.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/akhadangi_Llama3.2.1B.0.01-First/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama3.2.1B.0.01-First", - "id": "akhadangi/Llama3.2.1B.0.01-First", - "developer": "akhadangi", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.236 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0814 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3189 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0181 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2483 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3194 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1197 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/akhadangi/Llama3.2.1B.0.01-Last/e7c70ff9-59ad-4d09-8af0-ef9cf16d1dfa.json b/data/hfopenllm_v2/akhadangi/Llama3.2.1B.0.01-Last/e7c70ff9-59ad-4d09-8af0-ef9cf16d1dfa.json deleted file mode 100644 index c7af9c6fd..000000000 --- a/data/hfopenllm_v2/akhadangi/Llama3.2.1B.0.01-Last/e7c70ff9-59ad-4d09-8af0-ef9cf16d1dfa.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/akhadangi_Llama3.2.1B.0.01-Last/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama3.2.1B.0.01-Last", - "id": "akhadangi/Llama3.2.1B.0.01-Last", - "developer": "akhadangi", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.236 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0917 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3159 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0136 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2433 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3206 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1227 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/akhadangi/Llama3.2.1B.0.1-First/26c4c993-ae49-42a0-be0a-f157be9f7d58.json b/data/hfopenllm_v2/akhadangi/Llama3.2.1B.0.1-First/26c4c993-ae49-42a0-be0a-f157be9f7d58.json deleted file mode 100644 index 79629d7c0..000000000 --- a/data/hfopenllm_v2/akhadangi/Llama3.2.1B.0.1-First/26c4c993-ae49-42a0-be0a-f157be9f7d58.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/akhadangi_Llama3.2.1B.0.1-First/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": 
"Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama3.2.1B.0.1-First", - "id": "akhadangi/Llama3.2.1B.0.1-First", - "developer": "akhadangi", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.236 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1001 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.312 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0211 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.245 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3301 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1169 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/akhadangi/Llama3.2.1B.0.1-Last/19adf124-c120-4e97-80cf-49c40a66eb81.json b/data/hfopenllm_v2/akhadangi/Llama3.2.1B.0.1-Last/19adf124-c120-4e97-80cf-49c40a66eb81.json deleted file mode 100644 index 75815fcac..000000000 --- a/data/hfopenllm_v2/akhadangi/Llama3.2.1B.0.1-Last/19adf124-c120-4e97-80cf-49c40a66eb81.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/akhadangi_Llama3.2.1B.0.1-Last/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama3.2.1B.0.1-Last", - "id": "akhadangi/Llama3.2.1B.0.1-Last", - "developer": "akhadangi", - "inference_platform": "unknown", - 
"additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.236 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.095 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3164 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0211 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2383 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3341 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1178 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/akhadangi/Llama3.2.1B.BaseFiT/66bc5d38-8d25-4934-bce8-41ce4ea0e385.json b/data/hfopenllm_v2/akhadangi/Llama3.2.1B.BaseFiT/66bc5d38-8d25-4934-bce8-41ce4ea0e385.json deleted file mode 100644 index 37dbf8a34..000000000 --- a/data/hfopenllm_v2/akhadangi/Llama3.2.1B.BaseFiT/66bc5d38-8d25-4934-bce8-41ce4ea0e385.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/akhadangi_Llama3.2.1B.BaseFiT/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama3.2.1B.BaseFiT", - "id": "akhadangi/Llama3.2.1B.BaseFiT", - "developer": "akhadangi", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.236 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": 
"hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0883 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3175 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0242 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2534 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3221 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1172 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/akjindal53244/Llama-3.1-Storm-8B/541eafe5-807e-44b0-b652-a0752210fc71.json b/data/hfopenllm_v2/akjindal53244/Llama-3.1-Storm-8B/541eafe5-807e-44b0-b652-a0752210fc71.json deleted file mode 100644 index a6b87b5af..000000000 --- a/data/hfopenllm_v2/akjindal53244/Llama-3.1-Storm-8B/541eafe5-807e-44b0-b652-a0752210fc71.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/akjindal53244_Llama-3.1-Storm-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.1-Storm-8B", - "id": "akjindal53244/Llama-3.1-Storm-8B", - "developer": "akjindal53244", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.8051 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5189 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1722 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3263 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4028 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3803 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/akjindal53244/Llama-3.1-Storm-8B/845a2484-9f17-4c0e-b06b-6250992298bc.json b/data/hfopenllm_v2/akjindal53244/Llama-3.1-Storm-8B/845a2484-9f17-4c0e-b06b-6250992298bc.json deleted file mode 100644 index 8d84dffa8..000000000 --- a/data/hfopenllm_v2/akjindal53244/Llama-3.1-Storm-8B/845a2484-9f17-4c0e-b06b-6250992298bc.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/akjindal53244_Llama-3.1-Storm-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.1-Storm-8B", - "id": "akjindal53244/Llama-3.1-Storm-8B", - "developer": "akjindal53244", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8033 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5196 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1624 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3096 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4028 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3812 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/alcholjung/llama3_medical_tuned/e62b6b26-5f3c-42c9-9541-bb8b23caee66.json b/data/hfopenllm_v2/alcholjung/llama3_medical_tuned/e62b6b26-5f3c-42c9-9541-bb8b23caee66.json deleted file mode 100644 index e59f216fd..000000000 --- a/data/hfopenllm_v2/alcholjung/llama3_medical_tuned/e62b6b26-5f3c-42c9-9541-bb8b23caee66.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/alcholjung_llama3_medical_tuned/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "llama3_medical_tuned", - "id": "alcholjung/llama3_medical_tuned", - "developer": "alcholjung", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "?", - "params_billions": 16.061 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0106 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4513 - } - }, - { - "evaluation_name": "MATH Level 5", - 
"source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0468 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2861 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.466 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2946 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allenai/Llama-3.1-Tulu-3-70B-DPO/ec773b66-24fd-4b6f-ac9c-ebcd355e4be7.json b/data/hfopenllm_v2/allenai/Llama-3.1-Tulu-3-70B-DPO/ec773b66-24fd-4b6f-ac9c-ebcd355e4be7.json deleted file mode 100644 index a5182fb56..000000000 --- a/data/hfopenllm_v2/allenai/Llama-3.1-Tulu-3-70B-DPO/ec773b66-24fd-4b6f-ac9c-ebcd355e4be7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allenai_Llama-3.1-Tulu-3-70B-DPO/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.1-Tulu-3-70B-DPO", - "id": "allenai/Llama-3.1-Tulu-3-70B-DPO", - "developer": "allenai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 70.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8282 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6146 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4494 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3758 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4923 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4633 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allenai/Llama-3.1-Tulu-3-70B-SFT/a70b8356-94ce-4f0d-b44a-2215076eed5e.json b/data/hfopenllm_v2/allenai/Llama-3.1-Tulu-3-70B-SFT/a70b8356-94ce-4f0d-b44a-2215076eed5e.json deleted file mode 100644 index 1684989ec..000000000 --- a/data/hfopenllm_v2/allenai/Llama-3.1-Tulu-3-70B-SFT/a70b8356-94ce-4f0d-b44a-2215076eed5e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allenai_Llama-3.1-Tulu-3-70B-SFT/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.1-Tulu-3-70B-SFT", - "id": "allenai/Llama-3.1-Tulu-3-70B-SFT", - "developer": "allenai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 70.554 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8051 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5951 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3316 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - 
"source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3448 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5026 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4624 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allenai/Llama-3.1-Tulu-3-70B/b182807d-587e-4702-bf30-dab11983b8db.json b/data/hfopenllm_v2/allenai/Llama-3.1-Tulu-3-70B/b182807d-587e-4702-bf30-dab11983b8db.json deleted file mode 100644 index 80ba532bf..000000000 --- a/data/hfopenllm_v2/allenai/Llama-3.1-Tulu-3-70B/b182807d-587e-4702-bf30-dab11983b8db.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allenai_Llama-3.1-Tulu-3-70B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.1-Tulu-3-70B", - "id": "allenai/Llama-3.1-Tulu-3-70B", - "developer": "allenai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 70.554 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8291 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6164 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4502 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.3733 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4948 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4645 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allenai/Llama-3.1-Tulu-3-70B/c1f0944a-c44c-42e9-90ba-a847509cbd66.json b/data/hfopenllm_v2/allenai/Llama-3.1-Tulu-3-70B/c1f0944a-c44c-42e9-90ba-a847509cbd66.json deleted file mode 100644 index 2e028f4fd..000000000 --- a/data/hfopenllm_v2/allenai/Llama-3.1-Tulu-3-70B/c1f0944a-c44c-42e9-90ba-a847509cbd66.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allenai_Llama-3.1-Tulu-3-70B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.1-Tulu-3-70B", - "id": "allenai/Llama-3.1-Tulu-3-70B", - "developer": "allenai", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 70.554 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8379 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6157 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3829 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3733 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on 
MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4988 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4656 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allenai/Llama-3.1-Tulu-3-8B-DPO/64bb8530-7071-402e-ba9b-1d15ecbe275c.json b/data/hfopenllm_v2/allenai/Llama-3.1-Tulu-3-8B-DPO/64bb8530-7071-402e-ba9b-1d15ecbe275c.json deleted file mode 100644 index 012b3e9c5..000000000 --- a/data/hfopenllm_v2/allenai/Llama-3.1-Tulu-3-8B-DPO/64bb8530-7071-402e-ba9b-1d15ecbe275c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allenai_Llama-3.1-Tulu-3-8B-DPO/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.1-Tulu-3-8B-DPO", - "id": "allenai/Llama-3.1-Tulu-3-8B-DPO", - "developer": "allenai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8029 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4079 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2364 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2936 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4161 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": 
"MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2898 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allenai/Llama-3.1-Tulu-3-8B-RM/4f1fc265-f8b7-47e6-a9e6-cfa61b89ad4a.json b/data/hfopenllm_v2/allenai/Llama-3.1-Tulu-3-8B-RM/4f1fc265-f8b7-47e6-a9e6-cfa61b89ad4a.json deleted file mode 100644 index 9397e2c60..000000000 --- a/data/hfopenllm_v2/allenai/Llama-3.1-Tulu-3-8B-RM/4f1fc265-f8b7-47e6-a9e6-cfa61b89ad4a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allenai_Llama-3.1-Tulu-3-8B-RM/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.1-Tulu-3-8B-RM", - "id": "allenai/Llama-3.1-Tulu-3-8B-RM", - "developer": "allenai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForSequenceClassification", - "params_billions": 8.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.167 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.295 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2567 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3764 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.1082 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allenai/Llama-3.1-Tulu-3-8B-SFT/1420df5c-690e-4b01-b99c-c21c793689ae.json b/data/hfopenllm_v2/allenai/Llama-3.1-Tulu-3-8B-SFT/1420df5c-690e-4b01-b99c-c21c793689ae.json deleted file mode 100644 index f10b3e75a..000000000 --- a/data/hfopenllm_v2/allenai/Llama-3.1-Tulu-3-8B-SFT/1420df5c-690e-4b01-b99c-c21c793689ae.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allenai_Llama-3.1-Tulu-3-8B-SFT/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.1-Tulu-3-8B-SFT", - "id": "allenai/Llama-3.1-Tulu-3-8B-SFT", - "developer": "allenai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7403 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3872 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1178 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2777 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4268 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2812 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allenai/Llama-3.1-Tulu-3-8B/aa9d0b0e-cb3f-452e-bc85-f7cf172d2b8b.json 
b/data/hfopenllm_v2/allenai/Llama-3.1-Tulu-3-8B/aa9d0b0e-cb3f-452e-bc85-f7cf172d2b8b.json deleted file mode 100644 index 32b0d8b0b..000000000 --- a/data/hfopenllm_v2/allenai/Llama-3.1-Tulu-3-8B/aa9d0b0e-cb3f-452e-bc85-f7cf172d2b8b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allenai_Llama-3.1-Tulu-3-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.1-Tulu-3-8B", - "id": "allenai/Llama-3.1-Tulu-3-8B", - "developer": "allenai", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8255 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4061 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2115 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.297 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4175 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2821 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allenai/Llama-3.1-Tulu-3-8B/dfabd777-8620-40e3-b19c-a9227f57b638.json b/data/hfopenllm_v2/allenai/Llama-3.1-Tulu-3-8B/dfabd777-8620-40e3-b19c-a9227f57b638.json deleted file mode 100644 index fe1b7eb80..000000000 --- a/data/hfopenllm_v2/allenai/Llama-3.1-Tulu-3-8B/dfabd777-8620-40e3-b19c-a9227f57b638.json +++ /dev/null @@ -1,132 
+0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allenai_Llama-3.1-Tulu-3-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.1-Tulu-3-8B", - "id": "allenai/Llama-3.1-Tulu-3-8B", - "developer": "allenai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8267 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.405 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1964 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2987 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4175 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2827 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allenai/OLMo-1.7-7B-hf/08fe3877-ab04-426a-9e27-72ec4ff8ffc3.json b/data/hfopenllm_v2/allenai/OLMo-1.7-7B-hf/08fe3877-ab04-426a-9e27-72ec4ff8ffc3.json deleted file mode 100644 index 6810a21ef..000000000 --- a/data/hfopenllm_v2/allenai/OLMo-1.7-7B-hf/08fe3877-ab04-426a-9e27-72ec4ff8ffc3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allenai_OLMo-1.7-7B-hf/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - 
"source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "OLMo-1.7-7B-hf", - "id": "allenai/OLMo-1.7-7B-hf", - "developer": "allenai", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Unknown", - "params_billions": 0.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1569 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3014 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0023 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.255 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3475 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1124 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allenai/OLMo-1B-hf/4b264bb0-bd7e-4b15-9591-50b5a521f100.json b/data/hfopenllm_v2/allenai/OLMo-1B-hf/4b264bb0-bd7e-4b15-9591-50b5a521f100.json deleted file mode 100644 index 667c017f1..000000000 --- a/data/hfopenllm_v2/allenai/OLMo-1B-hf/4b264bb0-bd7e-4b15-9591-50b5a521f100.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allenai_OLMo-1B-hf/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "OLMo-1B-hf", - "id": "allenai/OLMo-1B-hf", - "developer": "allenai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "OlmoForCausalLM", - 
"params_billions": 1.177 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2182 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3052 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0174 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2617 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4098 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1174 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allenai/OLMo-2-1124-7B-Instruct/a8cfe336-0c3e-401c-a1e9-d951e64918ec.json b/data/hfopenllm_v2/allenai/OLMo-2-1124-7B-Instruct/a8cfe336-0c3e-401c-a1e9-d951e64918ec.json deleted file mode 100644 index a7c81df8a..000000000 --- a/data/hfopenllm_v2/allenai/OLMo-2-1124-7B-Instruct/a8cfe336-0c3e-401c-a1e9-d951e64918ec.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allenai_OLMo-2-1124-7B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "OLMo-2-1124-7B-Instruct", - "id": "allenai/OLMo-2-1124-7B-Instruct", - "developer": "allenai", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Olmo2ForCausalLM", - "params_billions": 7.299 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7244 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4022 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1488 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2785 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3508 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2672 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allenai/OLMo-7B-Instruct-hf/5e66c653-41b1-46de-b677-ffd8426ba5ec.json b/data/hfopenllm_v2/allenai/OLMo-7B-Instruct-hf/5e66c653-41b1-46de-b677-ffd8426ba5ec.json deleted file mode 100644 index d0864d416..000000000 --- a/data/hfopenllm_v2/allenai/OLMo-7B-Instruct-hf/5e66c653-41b1-46de-b677-ffd8426ba5ec.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allenai_OLMo-7B-Instruct-hf/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "OLMo-7B-Instruct-hf", - "id": "allenai/OLMo-7B-Instruct-hf", - "developer": "allenai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "OlmoForCausalLM", - "params_billions": 7.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3473 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - 
"dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3706 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0136 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.271 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3765 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1785 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allenai/OLMo-7B-hf/9f0f0914-1f7a-468e-8a2e-7ae122fd064d.json b/data/hfopenllm_v2/allenai/OLMo-7B-hf/9f0f0914-1f7a-468e-8a2e-7ae122fd064d.json deleted file mode 100644 index f93b86dbe..000000000 --- a/data/hfopenllm_v2/allenai/OLMo-7B-hf/9f0f0914-1f7a-468e-8a2e-7ae122fd064d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allenai_OLMo-7B-hf/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "OLMo-7B-hf", - "id": "allenai/OLMo-7B-hf", - "developer": "allenai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "OlmoForCausalLM", - "params_billions": 6.888 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2719 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3279 - } - }, - { - 
"evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0121 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2727 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3487 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1173 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allenai/OLMoE-1B-7B-0125-Instruct/cc64a143-4f1e-42ee-ade1-fafc4b316336.json b/data/hfopenllm_v2/allenai/OLMoE-1B-7B-0125-Instruct/cc64a143-4f1e-42ee-ade1-fafc4b316336.json deleted file mode 100644 index d3d1048ff..000000000 --- a/data/hfopenllm_v2/allenai/OLMoE-1B-7B-0125-Instruct/cc64a143-4f1e-42ee-ade1-fafc4b316336.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allenai_OLMoE-1B-7B-0125-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "OLMoE-1B-7B-0125-Instruct", - "id": "allenai/OLMoE-1B-7B-0125-Instruct", - "developer": "allenai", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "OlmoeForCausalLM", - "params_billions": 6.919 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6757 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3825 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": 
"Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0899 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2601 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3636 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1915 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allenai/OLMoE-1B-7B-0924-Instruct/cf322e64-2682-4a9a-a48f-c4ec47b852f2.json b/data/hfopenllm_v2/allenai/OLMoE-1B-7B-0924-Instruct/cf322e64-2682-4a9a-a48f-c4ec47b852f2.json deleted file mode 100644 index 9edb56ae9..000000000 --- a/data/hfopenllm_v2/allenai/OLMoE-1B-7B-0924-Instruct/cf322e64-2682-4a9a-a48f-c4ec47b852f2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allenai_OLMoE-1B-7B-0924-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "OLMoE-1B-7B-0924-Instruct", - "id": "allenai/OLMoE-1B-7B-0924-Instruct", - "developer": "allenai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "OlmoeForCausalLM", - "params_billions": 6.919 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4667 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3902 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0279 - } - }, - { - "evaluation_name": "GPQA", - 
"source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2676 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3848 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1876 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allenai/OLMoE-1B-7B-0924/30b32261-b24a-49e3-ba57-172dc1d03ba0.json b/data/hfopenllm_v2/allenai/OLMoE-1B-7B-0924/30b32261-b24a-49e3-ba57-172dc1d03ba0.json deleted file mode 100644 index 9a8c2b352..000000000 --- a/data/hfopenllm_v2/allenai/OLMoE-1B-7B-0924/30b32261-b24a-49e3-ba57-172dc1d03ba0.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allenai_OLMoE-1B-7B-0924/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "OLMoE-1B-7B-0924", - "id": "allenai/OLMoE-1B-7B-0924", - "developer": "allenai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "OlmoeForCausalLM", - "params_billions": 6.919 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2185 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3393 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0166 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - 
}, - "score_details": { - "score": 0.2475 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3488 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.174 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allknowingroger/Chocolatine-24B/0681c01d-23f3-4b8b-9516-a5cc41761fc4.json b/data/hfopenllm_v2/allknowingroger/Chocolatine-24B/0681c01d-23f3-4b8b-9516-a5cc41761fc4.json deleted file mode 100644 index 159a92fad..000000000 --- a/data/hfopenllm_v2/allknowingroger/Chocolatine-24B/0681c01d-23f3-4b8b-9516-a5cc41761fc4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allknowingroger_Chocolatine-24B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Chocolatine-24B", - "id": "allknowingroger/Chocolatine-24B", - "developer": "allknowingroger", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Phi3ForCausalLM", - "params_billions": 24.184 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1958 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6191 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0008 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3255 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4323 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4566 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allknowingroger/Gemma2Slerp1-2.6B/7693ed8a-f76d-482b-92c1-f11810e522ca.json b/data/hfopenllm_v2/allknowingroger/Gemma2Slerp1-2.6B/7693ed8a-f76d-482b-92c1-f11810e522ca.json deleted file mode 100644 index 7d68b1a32..000000000 --- a/data/hfopenllm_v2/allknowingroger/Gemma2Slerp1-2.6B/7693ed8a-f76d-482b-92c1-f11810e522ca.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allknowingroger_Gemma2Slerp1-2.6B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gemma2Slerp1-2.6B", - "id": "allknowingroger/Gemma2Slerp1-2.6B", - "developer": "allknowingroger", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 2.614 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5354 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4343 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1065 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2836 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4562 - } - }, - { - "evaluation_name": 
"MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2689 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allknowingroger/Gemma2Slerp1-27B/f8dc0128-c606-490a-b965-59d5377dd778.json b/data/hfopenllm_v2/allknowingroger/Gemma2Slerp1-27B/f8dc0128-c606-490a-b965-59d5377dd778.json deleted file mode 100644 index 8516f0711..000000000 --- a/data/hfopenllm_v2/allknowingroger/Gemma2Slerp1-27B/f8dc0128-c606-490a-b965-59d5377dd778.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allknowingroger_Gemma2Slerp1-27B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gemma2Slerp1-27B", - "id": "allknowingroger/Gemma2Slerp1-27B", - "developer": "allknowingroger", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 27.227 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7186 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6399 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2583 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3641 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4767 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": 
false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4456 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allknowingroger/Gemma2Slerp2-2.6B/844547f7-658f-41dd-ab4c-dc0569030e59.json b/data/hfopenllm_v2/allknowingroger/Gemma2Slerp2-2.6B/844547f7-658f-41dd-ab4c-dc0569030e59.json deleted file mode 100644 index 58ea93670..000000000 --- a/data/hfopenllm_v2/allknowingroger/Gemma2Slerp2-2.6B/844547f7-658f-41dd-ab4c-dc0569030e59.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allknowingroger_Gemma2Slerp2-2.6B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gemma2Slerp2-2.6B", - "id": "allknowingroger/Gemma2Slerp2-2.6B", - "developer": "allknowingroger", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 2.614 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5747 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4308 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0906 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3054 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4468 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2696 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/allknowingroger/Gemma2Slerp2-27B/75c291b5-6d60-4bde-8621-f865196a6ecc.json b/data/hfopenllm_v2/allknowingroger/Gemma2Slerp2-27B/75c291b5-6d60-4bde-8621-f865196a6ecc.json deleted file mode 100644 index 921a675ca..000000000 --- a/data/hfopenllm_v2/allknowingroger/Gemma2Slerp2-27B/75c291b5-6d60-4bde-8621-f865196a6ecc.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allknowingroger_Gemma2Slerp2-27B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gemma2Slerp2-27B", - "id": "allknowingroger/Gemma2Slerp2-27B", - "developer": "allknowingroger", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 27.227 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7546 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6557 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2787 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.37 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4621 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4623 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allknowingroger/Gemma2Slerp3-27B/36d54b12-594f-47fe-9637-a9b740416c5c.json b/data/hfopenllm_v2/allknowingroger/Gemma2Slerp3-27B/36d54b12-594f-47fe-9637-a9b740416c5c.json deleted file mode 100644 index 
59baa2f21..000000000 --- a/data/hfopenllm_v2/allknowingroger/Gemma2Slerp3-27B/36d54b12-594f-47fe-9637-a9b740416c5c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allknowingroger_Gemma2Slerp3-27B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gemma2Slerp3-27B", - "id": "allknowingroger/Gemma2Slerp3-27B", - "developer": "allknowingroger", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 27.227 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7426 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.65 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2742 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3549 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.474 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4641 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allknowingroger/Gemma2Slerp4-27B/57733383-9573-463d-a467-068d2685014c.json b/data/hfopenllm_v2/allknowingroger/Gemma2Slerp4-27B/57733383-9573-463d-a467-068d2685014c.json deleted file mode 100644 index 5c534391b..000000000 --- a/data/hfopenllm_v2/allknowingroger/Gemma2Slerp4-27B/57733383-9573-463d-a467-068d2685014c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/allknowingroger_Gemma2Slerp4-27B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gemma2Slerp4-27B", - "id": "allknowingroger/Gemma2Slerp4-27B", - "developer": "allknowingroger", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 27.227 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7497 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.653 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2719 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3666 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4502 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4649 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allknowingroger/GemmaSlerp-9B/eda1ac9a-98e1-496f-bdeb-1e256b52c14a.json b/data/hfopenllm_v2/allknowingroger/GemmaSlerp-9B/eda1ac9a-98e1-496f-bdeb-1e256b52c14a.json deleted file mode 100644 index edc562705..000000000 --- a/data/hfopenllm_v2/allknowingroger/GemmaSlerp-9B/eda1ac9a-98e1-496f-bdeb-1e256b52c14a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allknowingroger_GemmaSlerp-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": 
"Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "GemmaSlerp-9B", - "id": "allknowingroger/GemmaSlerp-9B", - "developer": "allknowingroger", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 9.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7043 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5921 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.216 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.344 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4673 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4161 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allknowingroger/GemmaSlerp2-9B/00b8bfda-c6b1-4e1f-b68c-bff7335e2dff.json b/data/hfopenllm_v2/allknowingroger/GemmaSlerp2-9B/00b8bfda-c6b1-4e1f-b68c-bff7335e2dff.json deleted file mode 100644 index 126811a1c..000000000 --- a/data/hfopenllm_v2/allknowingroger/GemmaSlerp2-9B/00b8bfda-c6b1-4e1f-b68c-bff7335e2dff.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allknowingroger_GemmaSlerp2-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "GemmaSlerp2-9B", - "id": "allknowingroger/GemmaSlerp2-9B", - "developer": "allknowingroger", - "inference_platform": "unknown", - "additional_details": 
{ - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 9.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7281 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5983 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2107 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3523 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4767 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4239 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allknowingroger/GemmaSlerp4-10B/0a3b9ad6-b853-471d-a292-413b30273034.json b/data/hfopenllm_v2/allknowingroger/GemmaSlerp4-10B/0a3b9ad6-b853-471d-a292-413b30273034.json deleted file mode 100644 index 4749c39dc..000000000 --- a/data/hfopenllm_v2/allknowingroger/GemmaSlerp4-10B/0a3b9ad6-b853-471d-a292-413b30273034.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allknowingroger_GemmaSlerp4-10B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "GemmaSlerp4-10B", - "id": "allknowingroger/GemmaSlerp4-10B", - "developer": "allknowingroger", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": 
"hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7326 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6028 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2243 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3532 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.454 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.425 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allknowingroger/GemmaSlerp5-10B/d61c3ace-e353-4c0b-9472-c9a1928809cc.json b/data/hfopenllm_v2/allknowingroger/GemmaSlerp5-10B/d61c3ace-e353-4c0b-9472-c9a1928809cc.json deleted file mode 100644 index 16732735f..000000000 --- a/data/hfopenllm_v2/allknowingroger/GemmaSlerp5-10B/d61c3ace-e353-4c0b-9472-c9a1928809cc.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allknowingroger_GemmaSlerp5-10B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "GemmaSlerp5-10B", - "id": "allknowingroger/GemmaSlerp5-10B", - "developer": "allknowingroger", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.7353 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6054 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2183 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3523 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4608 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4328 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allknowingroger/GemmaStock1-27B/2293a19a-b650-436d-9448-1b641e63d407.json b/data/hfopenllm_v2/allknowingroger/GemmaStock1-27B/2293a19a-b650-436d-9448-1b641e63d407.json deleted file mode 100644 index c77b2f811..000000000 --- a/data/hfopenllm_v2/allknowingroger/GemmaStock1-27B/2293a19a-b650-436d-9448-1b641e63d407.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allknowingroger_GemmaStock1-27B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "GemmaStock1-27B", - "id": "allknowingroger/GemmaStock1-27B", - "developer": "allknowingroger", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 27.227 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7509 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6566 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2636 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3641 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4527 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.473 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allknowingroger/HomerSlerp1-7B/c15b977c-c781-4b17-ac9f-25c77602c875.json b/data/hfopenllm_v2/allknowingroger/HomerSlerp1-7B/c15b977c-c781-4b17-ac9f-25c77602c875.json deleted file mode 100644 index a693fa309..000000000 --- a/data/hfopenllm_v2/allknowingroger/HomerSlerp1-7B/c15b977c-c781-4b17-ac9f-25c77602c875.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allknowingroger_HomerSlerp1-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "HomerSlerp1-7B", - "id": "allknowingroger/HomerSlerp1-7B", - "developer": "allknowingroger", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4621 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5518 - } - }, - { - "evaluation_name": "MATH Level 5", - 
"source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2719 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.318 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4359 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4504 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allknowingroger/HomerSlerp2-7B/42c191be-c0ae-4170-8b6f-565053ae7d9c.json b/data/hfopenllm_v2/allknowingroger/HomerSlerp2-7B/42c191be-c0ae-4170-8b6f-565053ae7d9c.json deleted file mode 100644 index a4cdc4f0b..000000000 --- a/data/hfopenllm_v2/allknowingroger/HomerSlerp2-7B/42c191be-c0ae-4170-8b6f-565053ae7d9c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allknowingroger_HomerSlerp2-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "HomerSlerp2-7B", - "id": "allknowingroger/HomerSlerp2-7B", - "developer": "allknowingroger", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4487 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5649 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": 
false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2968 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3196 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4356 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4515 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allknowingroger/HomerSlerp3-7B/f5cb910d-6e5b-404a-a751-d5cb90668150.json b/data/hfopenllm_v2/allknowingroger/HomerSlerp3-7B/f5cb910d-6e5b-404a-a751-d5cb90668150.json deleted file mode 100644 index 776bb888f..000000000 --- a/data/hfopenllm_v2/allknowingroger/HomerSlerp3-7B/f5cb910d-6e5b-404a-a751-d5cb90668150.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allknowingroger_HomerSlerp3-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "HomerSlerp3-7B", - "id": "allknowingroger/HomerSlerp3-7B", - "developer": "allknowingroger", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4363 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5598 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3021 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - 
"hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3171 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4462 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4535 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allknowingroger/HomerSlerp4-7B/de806e4c-dbf8-48cc-a0d8-033a61dfc777.json b/data/hfopenllm_v2/allknowingroger/HomerSlerp4-7B/de806e4c-dbf8-48cc-a0d8-033a61dfc777.json deleted file mode 100644 index 304635237..000000000 --- a/data/hfopenllm_v2/allknowingroger/HomerSlerp4-7B/de806e4c-dbf8-48cc-a0d8-033a61dfc777.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allknowingroger_HomerSlerp4-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "HomerSlerp4-7B", - "id": "allknowingroger/HomerSlerp4-7B", - "developer": "allknowingroger", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4374 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5571 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.327 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 
0.3196 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4408 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4472 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allknowingroger/LimyQstar-7B-slerp/59150b73-b05a-451e-ba3f-696d04effe05.json b/data/hfopenllm_v2/allknowingroger/LimyQstar-7B-slerp/59150b73-b05a-451e-ba3f-696d04effe05.json deleted file mode 100644 index 992931ce3..000000000 --- a/data/hfopenllm_v2/allknowingroger/LimyQstar-7B-slerp/59150b73-b05a-451e-ba3f-696d04effe05.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allknowingroger_LimyQstar-7B-slerp/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "LimyQstar-7B-slerp", - "id": "allknowingroger/LimyQstar-7B-slerp", - "developer": "allknowingroger", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3491 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5024 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0687 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2987 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": 
"Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4146 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3103 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allknowingroger/Llama3.1-60B/84926b81-360a-480c-b240-f154ec7fe0ba.json b/data/hfopenllm_v2/allknowingroger/Llama3.1-60B/84926b81-360a-480c-b240-f154ec7fe0ba.json deleted file mode 100644 index 37c1e779c..000000000 --- a/data/hfopenllm_v2/allknowingroger/Llama3.1-60B/84926b81-360a-480c-b240-f154ec7fe0ba.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allknowingroger_Llama3.1-60B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama3.1-60B", - "id": "allknowingroger/Llama3.1-60B", - "developer": "allknowingroger", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 61.997 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1815 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3242 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2945 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3596 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", 
- "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.331 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allknowingroger/Marco-01-slerp1-7B/8e6edb04-302b-4dfc-b38f-94b437c921a8.json b/data/hfopenllm_v2/allknowingroger/Marco-01-slerp1-7B/8e6edb04-302b-4dfc-b38f-94b437c921a8.json deleted file mode 100644 index d13ebcc1a..000000000 --- a/data/hfopenllm_v2/allknowingroger/Marco-01-slerp1-7B/8e6edb04-302b-4dfc-b38f-94b437c921a8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allknowingroger_Marco-01-slerp1-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Marco-01-slerp1-7B", - "id": "allknowingroger/Marco-01-slerp1-7B", - "developer": "allknowingroger", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4681 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5541 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3157 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3171 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4452 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 
0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4483 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allknowingroger/Meme-7B-slerp/db92c564-1cf9-43db-9e25-1f450c7b1e7f.json b/data/hfopenllm_v2/allknowingroger/Meme-7B-slerp/db92c564-1cf9-43db-9e25-1f450c7b1e7f.json deleted file mode 100644 index 2747ba0f5..000000000 --- a/data/hfopenllm_v2/allknowingroger/Meme-7B-slerp/db92c564-1cf9-43db-9e25-1f450c7b1e7f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allknowingroger_Meme-7B-slerp/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Meme-7B-slerp", - "id": "allknowingroger/Meme-7B-slerp", - "developer": "allknowingroger", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5164 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4661 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0438 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2861 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4223 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.281 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allknowingroger/Ministral-8B-slerp/e3796243-cbba-4ec2-ad7c-89547ad24342.json 
b/data/hfopenllm_v2/allknowingroger/Ministral-8B-slerp/e3796243-cbba-4ec2-ad7c-89547ad24342.json deleted file mode 100644 index ae54b512b..000000000 --- a/data/hfopenllm_v2/allknowingroger/Ministral-8B-slerp/e3796243-cbba-4ec2-ad7c-89547ad24342.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allknowingroger_Ministral-8B-slerp/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Ministral-8B-slerp", - "id": "allknowingroger/Ministral-8B-slerp", - "developer": "allknowingroger", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 7.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1961 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4686 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0038 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3121 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4285 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3119 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allknowingroger/MistralPhi3-11B/1479be90-df8f-4e1d-b9db-03e84000187a.json b/data/hfopenllm_v2/allknowingroger/MistralPhi3-11B/1479be90-df8f-4e1d-b9db-03e84000187a.json deleted file mode 100644 index 97d6fd6ec..000000000 --- 
a/data/hfopenllm_v2/allknowingroger/MistralPhi3-11B/1479be90-df8f-4e1d-b9db-03e84000187a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allknowingroger_MistralPhi3-11B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MistralPhi3-11B", - "id": "allknowingroger/MistralPhi3-11B", - "developer": "allknowingroger", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 11.234 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1943 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6234 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3322 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4267 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4688 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allknowingroger/Mistralmash1-7B-s/d2e6c48c-1c18-45a6-ba1a-b335325c980c.json b/data/hfopenllm_v2/allknowingroger/Mistralmash1-7B-s/d2e6c48c-1c18-45a6-ba1a-b335325c980c.json deleted file mode 100644 index c3b192913..000000000 --- a/data/hfopenllm_v2/allknowingroger/Mistralmash1-7B-s/d2e6c48c-1c18-45a6-ba1a-b335325c980c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/allknowingroger_Mistralmash1-7B-s/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistralmash1-7B-s", - "id": "allknowingroger/Mistralmash1-7B-s", - "developer": "allknowingroger", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3961 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5277 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0921 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2945 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4267 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3293 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allknowingroger/Mistralmash2-7B-s/f843e45a-f66b-4091-a964-75583c2d7fc5.json b/data/hfopenllm_v2/allknowingroger/Mistralmash2-7B-s/f843e45a-f66b-4091-a964-75583c2d7fc5.json deleted file mode 100644 index 5d5a5cc33..000000000 --- a/data/hfopenllm_v2/allknowingroger/Mistralmash2-7B-s/f843e45a-f66b-4091-a964-75583c2d7fc5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allknowingroger_Mistralmash2-7B-s/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - 
"source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistralmash2-7B-s", - "id": "allknowingroger/Mistralmash2-7B-s", - "developer": "allknowingroger", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4102 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5305 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0793 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2978 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4372 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3345 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allknowingroger/MixTAO-19B-pass/cbc3cd41-e187-4c4f-b207-37bceab423a4.json b/data/hfopenllm_v2/allknowingroger/MixTAO-19B-pass/cbc3cd41-e187-4c4f-b207-37bceab423a4.json deleted file mode 100644 index 860bc85e9..000000000 --- a/data/hfopenllm_v2/allknowingroger/MixTAO-19B-pass/cbc3cd41-e187-4c4f-b207-37bceab423a4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allknowingroger_MixTAO-19B-pass/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MixTAO-19B-pass", - "id": "allknowingroger/MixTAO-19B-pass", - "developer": "allknowingroger", - 
"inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MixtralForCausalLM", - "params_billions": 19.188 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3814 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5128 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0612 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2844 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4783 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3105 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allknowingroger/MixTaoTruthful-13B-slerp/0f124566-5e94-4233-9a3f-5ff9cfdf160c.json b/data/hfopenllm_v2/allknowingroger/MixTaoTruthful-13B-slerp/0f124566-5e94-4233-9a3f-5ff9cfdf160c.json deleted file mode 100644 index 68ee45cec..000000000 --- a/data/hfopenllm_v2/allknowingroger/MixTaoTruthful-13B-slerp/0f124566-5e94-4233-9a3f-5ff9cfdf160c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allknowingroger_MixTaoTruthful-13B-slerp/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MixTaoTruthful-13B-slerp", - "id": "allknowingroger/MixTaoTruthful-13B-slerp", - "developer": "allknowingroger", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MixtralForCausalLM", - "params_billions": 12.879 - } - }, - 
"evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4139 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5207 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0665 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2844 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4292 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.31 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allknowingroger/MultiCalm-7B-slerp/98fabba8-7d70-4a1f-b03c-37e1a9ac94e8.json b/data/hfopenllm_v2/allknowingroger/MultiCalm-7B-slerp/98fabba8-7d70-4a1f-b03c-37e1a9ac94e8.json deleted file mode 100644 index 717e85ceb..000000000 --- a/data/hfopenllm_v2/allknowingroger/MultiCalm-7B-slerp/98fabba8-7d70-4a1f-b03c-37e1a9ac94e8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allknowingroger_MultiCalm-7B-slerp/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MultiCalm-7B-slerp", - "id": "allknowingroger/MultiCalm-7B-slerp", - "developer": "allknowingroger", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3927 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5122 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0619 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2827 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4319 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3033 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allknowingroger/MultiMash-12B-slerp/91522dad-529b-477c-8372-793f631e14b7.json b/data/hfopenllm_v2/allknowingroger/MultiMash-12B-slerp/91522dad-529b-477c-8372-793f631e14b7.json deleted file mode 100644 index 7be2befa5..000000000 --- a/data/hfopenllm_v2/allknowingroger/MultiMash-12B-slerp/91522dad-529b-477c-8372-793f631e14b7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allknowingroger_MultiMash-12B-slerp/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MultiMash-12B-slerp", - "id": "allknowingroger/MultiMash-12B-slerp", - "developer": "allknowingroger", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MixtralForCausalLM", - "params_billions": 12.879 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3974 - } - }, - { - 
"evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5142 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0808 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2768 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4438 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3068 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allknowingroger/MultiMash10-13B-slerp/cec22734-493c-4d11-ba86-6c7ae2005124.json b/data/hfopenllm_v2/allknowingroger/MultiMash10-13B-slerp/cec22734-493c-4d11-ba86-6c7ae2005124.json deleted file mode 100644 index 400c2095d..000000000 --- a/data/hfopenllm_v2/allknowingroger/MultiMash10-13B-slerp/cec22734-493c-4d11-ba86-6c7ae2005124.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allknowingroger_MultiMash10-13B-slerp/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MultiMash10-13B-slerp", - "id": "allknowingroger/MultiMash10-13B-slerp", - "developer": "allknowingroger", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MixtralForCausalLM", - "params_billions": 12.879 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4163 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy 
on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5186 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0718 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2861 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4318 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3117 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allknowingroger/MultiMash11-13B-slerp/704a6e19-0d86-42a5-b8f5-05a5856e9c29.json b/data/hfopenllm_v2/allknowingroger/MultiMash11-13B-slerp/704a6e19-0d86-42a5-b8f5-05a5856e9c29.json deleted file mode 100644 index d47ffa3a1..000000000 --- a/data/hfopenllm_v2/allknowingroger/MultiMash11-13B-slerp/704a6e19-0d86-42a5-b8f5-05a5856e9c29.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allknowingroger_MultiMash11-13B-slerp/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MultiMash11-13B-slerp", - "id": "allknowingroger/MultiMash11-13B-slerp", - "developer": "allknowingroger", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MixtralForCausalLM", - "params_billions": 12.879 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4251 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5194 - } - }, - { - "evaluation_name": "MATH Level 
5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0702 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2827 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4373 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3085 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allknowingroger/MultiMash2-12B-slerp/bc54349d-59e0-4ae4-94f9-3f5ae98261f4.json b/data/hfopenllm_v2/allknowingroger/MultiMash2-12B-slerp/bc54349d-59e0-4ae4-94f9-3f5ae98261f4.json deleted file mode 100644 index ee50b15c0..000000000 --- a/data/hfopenllm_v2/allknowingroger/MultiMash2-12B-slerp/bc54349d-59e0-4ae4-94f9-3f5ae98261f4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allknowingroger_MultiMash2-12B-slerp/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MultiMash2-12B-slerp", - "id": "allknowingroger/MultiMash2-12B-slerp", - "developer": "allknowingroger", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MixtralForCausalLM", - "params_billions": 12.879 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4261 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5134 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact 
Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0642 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2794 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4228 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3043 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allknowingroger/MultiMash5-12B-slerp/d20d533a-758b-477c-b4eb-073adaed640e.json b/data/hfopenllm_v2/allknowingroger/MultiMash5-12B-slerp/d20d533a-758b-477c-b4eb-073adaed640e.json deleted file mode 100644 index 195752d1b..000000000 --- a/data/hfopenllm_v2/allknowingroger/MultiMash5-12B-slerp/d20d533a-758b-477c-b4eb-073adaed640e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allknowingroger_MultiMash5-12B-slerp/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MultiMash5-12B-slerp", - "id": "allknowingroger/MultiMash5-12B-slerp", - "developer": "allknowingroger", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MixtralForCausalLM", - "params_billions": 12.879 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4142 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5145 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0634 - } - }, - { - "evaluation_name": 
"GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2777 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4203 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3028 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allknowingroger/MultiMash6-12B-slerp/f7c9ad0d-3fea-4bec-8ac3-46f01a3449fb.json b/data/hfopenllm_v2/allknowingroger/MultiMash6-12B-slerp/f7c9ad0d-3fea-4bec-8ac3-46f01a3449fb.json deleted file mode 100644 index a7902f000..000000000 --- a/data/hfopenllm_v2/allknowingroger/MultiMash6-12B-slerp/f7c9ad0d-3fea-4bec-8ac3-46f01a3449fb.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allknowingroger_MultiMash6-12B-slerp/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MultiMash6-12B-slerp", - "id": "allknowingroger/MultiMash6-12B-slerp", - "developer": "allknowingroger", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MixtralForCausalLM", - "params_billions": 12.879 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.43 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5196 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0725 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2743 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4306 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3091 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allknowingroger/MultiMash7-12B-slerp/9db1f823-e068-4a39-a5cc-b9c588099427.json b/data/hfopenllm_v2/allknowingroger/MultiMash7-12B-slerp/9db1f823-e068-4a39-a5cc-b9c588099427.json deleted file mode 100644 index 7142478be..000000000 --- a/data/hfopenllm_v2/allknowingroger/MultiMash7-12B-slerp/9db1f823-e068-4a39-a5cc-b9c588099427.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allknowingroger_MultiMash7-12B-slerp/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MultiMash7-12B-slerp", - "id": "allknowingroger/MultiMash7-12B-slerp", - "developer": "allknowingroger", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MixtralForCausalLM", - "params_billions": 12.879 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4213 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5111 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0695 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2785 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - 
"dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4279 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3029 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allknowingroger/MultiMash8-13B-slerp/23818b45-bf5f-48a2-982f-1e2a0d35aac8.json b/data/hfopenllm_v2/allknowingroger/MultiMash8-13B-slerp/23818b45-bf5f-48a2-982f-1e2a0d35aac8.json deleted file mode 100644 index 55e001fb8..000000000 --- a/data/hfopenllm_v2/allknowingroger/MultiMash8-13B-slerp/23818b45-bf5f-48a2-982f-1e2a0d35aac8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allknowingroger_MultiMash8-13B-slerp/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MultiMash8-13B-slerp", - "id": "allknowingroger/MultiMash8-13B-slerp", - "developer": "allknowingroger", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MixtralForCausalLM", - "params_billions": 12.879 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4321 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5178 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.077 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2886 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4424 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3126 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allknowingroger/MultiMash9-13B-slerp/de6eda66-b8f5-4b23-89e1-44bbac600953.json b/data/hfopenllm_v2/allknowingroger/MultiMash9-13B-slerp/de6eda66-b8f5-4b23-89e1-44bbac600953.json deleted file mode 100644 index b3fb05f58..000000000 --- a/data/hfopenllm_v2/allknowingroger/MultiMash9-13B-slerp/de6eda66-b8f5-4b23-89e1-44bbac600953.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allknowingroger_MultiMash9-13B-slerp/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MultiMash9-13B-slerp", - "id": "allknowingroger/MultiMash9-13B-slerp", - "developer": "allknowingroger", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MixtralForCausalLM", - "params_billions": 12.879 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4188 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5194 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0785 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2802 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4398 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - 
"source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.31 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allknowingroger/MultiMerge-7B-slerp/632974c2-57e2-41f9-8c00-671e07e7594b.json b/data/hfopenllm_v2/allknowingroger/MultiMerge-7B-slerp/632974c2-57e2-41f9-8c00-671e07e7594b.json deleted file mode 100644 index a78094980..000000000 --- a/data/hfopenllm_v2/allknowingroger/MultiMerge-7B-slerp/632974c2-57e2-41f9-8c00-671e07e7594b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allknowingroger_MultiMerge-7B-slerp/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MultiMerge-7B-slerp", - "id": "allknowingroger/MultiMerge-7B-slerp", - "developer": "allknowingroger", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3948 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.514 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0665 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2827 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.428 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 
0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3037 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allknowingroger/Multimash3-12B-slerp/e86dcf4f-6282-4aa6-b645-00f93a2e9077.json b/data/hfopenllm_v2/allknowingroger/Multimash3-12B-slerp/e86dcf4f-6282-4aa6-b645-00f93a2e9077.json deleted file mode 100644 index 67777cbfb..000000000 --- a/data/hfopenllm_v2/allknowingroger/Multimash3-12B-slerp/e86dcf4f-6282-4aa6-b645-00f93a2e9077.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allknowingroger_Multimash3-12B-slerp/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Multimash3-12B-slerp", - "id": "allknowingroger/Multimash3-12B-slerp", - "developer": "allknowingroger", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MixtralForCausalLM", - "params_billions": 12.879 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4437 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5177 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0627 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2802 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4344 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3068 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/allknowingroger/Multimerge-19B-pass/b20be5c9-9720-4076-b587-728549dd19af.json b/data/hfopenllm_v2/allknowingroger/Multimerge-19B-pass/b20be5c9-9720-4076-b587-728549dd19af.json deleted file mode 100644 index 1204ac4a5..000000000 --- a/data/hfopenllm_v2/allknowingroger/Multimerge-19B-pass/b20be5c9-9720-4076-b587-728549dd19af.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allknowingroger_Multimerge-19B-pass/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Multimerge-19B-pass", - "id": "allknowingroger/Multimerge-19B-pass", - "developer": "allknowingroger", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MixtralForCausalLM", - "params_billions": 19.188 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1773 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2892 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2592 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.343 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1169 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allknowingroger/MultiverseEx26-7B-slerp/5e193803-39d1-4f12-8726-ebbe5f71563c.json b/data/hfopenllm_v2/allknowingroger/MultiverseEx26-7B-slerp/5e193803-39d1-4f12-8726-ebbe5f71563c.json deleted 
file mode 100644 index b5df1f90d..000000000 --- a/data/hfopenllm_v2/allknowingroger/MultiverseEx26-7B-slerp/5e193803-39d1-4f12-8726-ebbe5f71563c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allknowingroger_MultiverseEx26-7B-slerp/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MultiverseEx26-7B-slerp", - "id": "allknowingroger/MultiverseEx26-7B-slerp", - "developer": "allknowingroger", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3939 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5134 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0755 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2827 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4293 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3035 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allknowingroger/NeuralWestSeverus-7B-slerp/61131a6c-f412-42bf-814b-7d711a840d44.json b/data/hfopenllm_v2/allknowingroger/NeuralWestSeverus-7B-slerp/61131a6c-f412-42bf-814b-7d711a840d44.json deleted file mode 100644 index b88992a9a..000000000 --- a/data/hfopenllm_v2/allknowingroger/NeuralWestSeverus-7B-slerp/61131a6c-f412-42bf-814b-7d711a840d44.json +++ /dev/null @@ -1,132 
+0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allknowingroger_NeuralWestSeverus-7B-slerp/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "NeuralWestSeverus-7B-slerp", - "id": "allknowingroger/NeuralWestSeverus-7B-slerp", - "developer": "allknowingroger", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4136 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5244 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0733 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.271 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4529 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3137 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allknowingroger/Neuralcoven-7B-slerp/535e72b1-17e0-40e3-9d66-d31f8ec70413.json b/data/hfopenllm_v2/allknowingroger/Neuralcoven-7B-slerp/535e72b1-17e0-40e3-9d66-d31f8ec70413.json deleted file mode 100644 index 035e133f8..000000000 --- a/data/hfopenllm_v2/allknowingroger/Neuralcoven-7B-slerp/535e72b1-17e0-40e3-9d66-d31f8ec70413.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allknowingroger_Neuralcoven-7B-slerp/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - 
"source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Neuralcoven-7B-slerp", - "id": "allknowingroger/Neuralcoven-7B-slerp", - "developer": "allknowingroger", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3859 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5303 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0785 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2852 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.429 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3294 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allknowingroger/Neuralmultiverse-7B-slerp/ea15479e-24a8-4924-a754-a8567c511e61.json b/data/hfopenllm_v2/allknowingroger/Neuralmultiverse-7B-slerp/ea15479e-24a8-4924-a754-a8567c511e61.json deleted file mode 100644 index 534b2a6eb..000000000 --- a/data/hfopenllm_v2/allknowingroger/Neuralmultiverse-7B-slerp/ea15479e-24a8-4924-a754-a8567c511e61.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allknowingroger_Neuralmultiverse-7B-slerp/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - 
"model_info": { - "name": "Neuralmultiverse-7B-slerp", - "id": "allknowingroger/Neuralmultiverse-7B-slerp", - "developer": "allknowingroger", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3769 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5166 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.065 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2844 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.428 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3042 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allknowingroger/Ph3della5-14B/5799f285-c61f-43a8-a6a6-053808cf4e8f.json b/data/hfopenllm_v2/allknowingroger/Ph3della5-14B/5799f285-c61f-43a8-a6a6-053808cf4e8f.json deleted file mode 100644 index 0a78188b8..000000000 --- a/data/hfopenllm_v2/allknowingroger/Ph3della5-14B/5799f285-c61f-43a8-a6a6-053808cf4e8f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allknowingroger_Ph3della5-14B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Ph3della5-14B", - "id": "allknowingroger/Ph3della5-14B", - "developer": "allknowingroger", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": 
"Phi3ForCausalLM", - "params_billions": 13.96 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4799 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6332 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1767 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3423 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4386 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4787 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allknowingroger/Ph3merge-14B/36feef44-3d3b-4102-8606-ee6420bddcff.json b/data/hfopenllm_v2/allknowingroger/Ph3merge-14B/36feef44-3d3b-4102-8606-ee6420bddcff.json deleted file mode 100644 index 9d397a68f..000000000 --- a/data/hfopenllm_v2/allknowingroger/Ph3merge-14B/36feef44-3d3b-4102-8606-ee6420bddcff.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allknowingroger_Ph3merge-14B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Ph3merge-14B", - "id": "allknowingroger/Ph3merge-14B", - "developer": "allknowingroger", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Phi3ForCausalLM", - "params_billions": 13.619 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2701 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6381 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0106 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3381 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4334 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4611 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allknowingroger/Ph3merge2-14B/fd55f19a-2c22-4f29-82e0-15b02f25b9a9.json b/data/hfopenllm_v2/allknowingroger/Ph3merge2-14B/fd55f19a-2c22-4f29-82e0-15b02f25b9a9.json deleted file mode 100644 index 3229ae0a1..000000000 --- a/data/hfopenllm_v2/allknowingroger/Ph3merge2-14B/fd55f19a-2c22-4f29-82e0-15b02f25b9a9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allknowingroger_Ph3merge2-14B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Ph3merge2-14B", - "id": "allknowingroger/Ph3merge2-14B", - "developer": "allknowingroger", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Phi3ForCausalLM", - "params_billions": 13.619 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1706 - } - }, - { - "evaluation_name": "BBH", - "source_data": 
{ - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3607 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2911 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3911 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1723 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allknowingroger/Ph3merge3-14B/18e5decd-c95e-43d2-9ba2-007ba32e216f.json b/data/hfopenllm_v2/allknowingroger/Ph3merge3-14B/18e5decd-c95e-43d2-9ba2-007ba32e216f.json deleted file mode 100644 index 56d85e1a4..000000000 --- a/data/hfopenllm_v2/allknowingroger/Ph3merge3-14B/18e5decd-c95e-43d2-9ba2-007ba32e216f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allknowingroger_Ph3merge3-14B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Ph3merge3-14B", - "id": "allknowingroger/Ph3merge3-14B", - "developer": "allknowingroger", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Phi3ForCausalLM", - "params_billions": 13.619 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1645 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.3597 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2852 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4082 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1647 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allknowingroger/Ph3task1-14B/85a4996e-8c44-4e4f-9478-19a8c5513617.json b/data/hfopenllm_v2/allknowingroger/Ph3task1-14B/85a4996e-8c44-4e4f-9478-19a8c5513617.json deleted file mode 100644 index 7b39272a0..000000000 --- a/data/hfopenllm_v2/allknowingroger/Ph3task1-14B/85a4996e-8c44-4e4f-9478-19a8c5513617.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allknowingroger_Ph3task1-14B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Ph3task1-14B", - "id": "allknowingroger/Ph3task1-14B", - "developer": "allknowingroger", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Phi3ForCausalLM", - "params_billions": 13.96 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4695 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6318 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - 
"metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1669 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3507 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4508 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4734 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allknowingroger/Ph3task2-14B/db6d57c8-df0b-407e-b937-67c55b513a5f.json b/data/hfopenllm_v2/allknowingroger/Ph3task2-14B/db6d57c8-df0b-407e-b937-67c55b513a5f.json deleted file mode 100644 index 32356567b..000000000 --- a/data/hfopenllm_v2/allknowingroger/Ph3task2-14B/db6d57c8-df0b-407e-b937-67c55b513a5f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allknowingroger_Ph3task2-14B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Ph3task2-14B", - "id": "allknowingroger/Ph3task2-14B", - "developer": "allknowingroger", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Phi3ForCausalLM", - "params_billions": 13.96 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4713 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6098 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1465 - } - }, - { - "evaluation_name": 
"GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3305 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4535 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.446 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allknowingroger/Ph3task3-14B/89ac933d-0a7c-40e6-8fa7-35bb6205e44b.json b/data/hfopenllm_v2/allknowingroger/Ph3task3-14B/89ac933d-0a7c-40e6-8fa7-35bb6205e44b.json deleted file mode 100644 index 817fe3f3d..000000000 --- a/data/hfopenllm_v2/allknowingroger/Ph3task3-14B/89ac933d-0a7c-40e6-8fa7-35bb6205e44b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allknowingroger_Ph3task3-14B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Ph3task3-14B", - "id": "allknowingroger/Ph3task3-14B", - "developer": "allknowingroger", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Phi3ForCausalLM", - "params_billions": 13.96 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4962 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6298 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.176 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3414 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4426 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4771 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allknowingroger/Ph3unsloth-3B-slerp/c79e690f-3e09-4fac-9412-937a3b7ef352.json b/data/hfopenllm_v2/allknowingroger/Ph3unsloth-3B-slerp/c79e690f-3e09-4fac-9412-937a3b7ef352.json deleted file mode 100644 index ba7ad3c78..000000000 --- a/data/hfopenllm_v2/allknowingroger/Ph3unsloth-3B-slerp/c79e690f-3e09-4fac-9412-937a3b7ef352.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allknowingroger_Ph3unsloth-3B-slerp/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Ph3unsloth-3B-slerp", - "id": "allknowingroger/Ph3unsloth-3B-slerp", - "developer": "allknowingroger", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 3.821 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1894 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5468 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1012 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3247 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - 
"hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4528 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3701 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allknowingroger/Phi3mash1-17B-pass/ce74b7e3-8505-4c79-a7de-12d1e6b47155.json b/data/hfopenllm_v2/allknowingroger/Phi3mash1-17B-pass/ce74b7e3-8505-4c79-a7de-12d1e6b47155.json deleted file mode 100644 index fb008b871..000000000 --- a/data/hfopenllm_v2/allknowingroger/Phi3mash1-17B-pass/ce74b7e3-8505-4c79-a7de-12d1e6b47155.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allknowingroger_Phi3mash1-17B-pass/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Phi3mash1-17B-pass", - "id": "allknowingroger/Phi3mash1-17B-pass", - "developer": "allknowingroger", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 16.687 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1884 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6129 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3196 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.4451 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4589 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allknowingroger/Quen2-65B/3c562d8a-2df9-4d3f-9699-bfaee4a1ce2b.json b/data/hfopenllm_v2/allknowingroger/Quen2-65B/3c562d8a-2df9-4d3f-9699-bfaee4a1ce2b.json deleted file mode 100644 index bd16b28e4..000000000 --- a/data/hfopenllm_v2/allknowingroger/Quen2-65B/3c562d8a-2df9-4d3f-9699-bfaee4a1ce2b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allknowingroger_Quen2-65B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Quen2-65B", - "id": "allknowingroger/Quen2-65B", - "developer": "allknowingroger", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 63.923 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1758 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2757 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2357 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3209 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", 
- "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1114 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allknowingroger/Qwen2.5-42B-AGI/152b0cbe-e27b-4438-8326-e67f4e70e600.json b/data/hfopenllm_v2/allknowingroger/Qwen2.5-42B-AGI/152b0cbe-e27b-4438-8326-e67f4e70e600.json deleted file mode 100644 index a56e8fc53..000000000 --- a/data/hfopenllm_v2/allknowingroger/Qwen2.5-42B-AGI/152b0cbe-e27b-4438-8326-e67f4e70e600.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allknowingroger_Qwen2.5-42B-AGI/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-42B-AGI", - "id": "allknowingroger/Qwen2.5-42B-AGI", - "developer": "allknowingroger", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 42.516 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1913 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2942 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2601 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.362 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1168 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/allknowingroger/Qwen2.5-7B-task2/c733c91f-79a9-49e5-9398-3a424ee1940a.json b/data/hfopenllm_v2/allknowingroger/Qwen2.5-7B-task2/c733c91f-79a9-49e5-9398-3a424ee1940a.json deleted file mode 100644 index c7409a22c..000000000 --- a/data/hfopenllm_v2/allknowingroger/Qwen2.5-7B-task2/c733c91f-79a9-49e5-9398-3a424ee1940a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allknowingroger_Qwen2.5-7B-task2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-7B-task2", - "id": "allknowingroger/Qwen2.5-7B-task2", - "developer": "allknowingroger", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4527 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5626 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.355 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3163 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.437 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4517 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allknowingroger/Qwen2.5-7B-task3/32d7b6c6-de5c-4864-a446-97dccce378c5.json b/data/hfopenllm_v2/allknowingroger/Qwen2.5-7B-task3/32d7b6c6-de5c-4864-a446-97dccce378c5.json deleted file mode 100644 index 
0200a240d..000000000 --- a/data/hfopenllm_v2/allknowingroger/Qwen2.5-7B-task3/32d7b6c6-de5c-4864-a446-97dccce378c5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allknowingroger_Qwen2.5-7B-task3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-7B-task3", - "id": "allknowingroger/Qwen2.5-7B-task3", - "developer": "allknowingroger", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5129 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5398 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2606 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3171 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4356 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4501 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allknowingroger/Qwen2.5-7B-task4/7b22d02b-5bfd-4243-9ad9-c858d0af55a6.json b/data/hfopenllm_v2/allknowingroger/Qwen2.5-7B-task4/7b22d02b-5bfd-4243-9ad9-c858d0af55a6.json deleted file mode 100644 index fcd61a787..000000000 --- a/data/hfopenllm_v2/allknowingroger/Qwen2.5-7B-task4/7b22d02b-5bfd-4243-9ad9-c858d0af55a6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/allknowingroger_Qwen2.5-7B-task4/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-7B-task4", - "id": "allknowingroger/Qwen2.5-7B-task4", - "developer": "allknowingroger", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5005 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5583 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3112 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3205 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4395 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4561 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allknowingroger/Qwen2.5-7B-task7/99650529-55d9-42b0-b812-761a30277e5e.json b/data/hfopenllm_v2/allknowingroger/Qwen2.5-7B-task7/99650529-55d9-42b0-b812-761a30277e5e.json deleted file mode 100644 index 3af393c86..000000000 --- a/data/hfopenllm_v2/allknowingroger/Qwen2.5-7B-task7/99650529-55d9-42b0-b812-761a30277e5e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allknowingroger_Qwen2.5-7B-task7/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - 
"source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-7B-task7", - "id": "allknowingroger/Qwen2.5-7B-task7", - "developer": "allknowingroger", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4284 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5552 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.065 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3205 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4326 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4133 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allknowingroger/Qwen2.5-7B-task8/81abbc2a-791b-4a39-bb46-97edfa14b9c0.json b/data/hfopenllm_v2/allknowingroger/Qwen2.5-7B-task8/81abbc2a-791b-4a39-bb46-97edfa14b9c0.json deleted file mode 100644 index fcd83a7a4..000000000 --- a/data/hfopenllm_v2/allknowingroger/Qwen2.5-7B-task8/81abbc2a-791b-4a39-bb46-97edfa14b9c0.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allknowingroger_Qwen2.5-7B-task8/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-7B-task8", - "id": "allknowingroger/Qwen2.5-7B-task8", - "developer": "allknowingroger", - 
"inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4645 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5525 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3527 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3205 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4514 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4433 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allknowingroger/Qwen2.5-slerp-14B/c658e535-7098-40fc-bea0-f5734d8f4ca9.json b/data/hfopenllm_v2/allknowingroger/Qwen2.5-slerp-14B/c658e535-7098-40fc-bea0-f5734d8f4ca9.json deleted file mode 100644 index 9b625885c..000000000 --- a/data/hfopenllm_v2/allknowingroger/Qwen2.5-slerp-14B/c658e535-7098-40fc-bea0-f5734d8f4ca9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allknowingroger_Qwen2.5-slerp-14B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-slerp-14B", - "id": "allknowingroger/Qwen2.5-slerp-14B", - "developer": "allknowingroger", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - 
"source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4928 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6512 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4622 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3674 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4744 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5379 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allknowingroger/QwenSlerp12-7B/9e0656e9-9b82-4f6d-b00a-c09cf9cbc105.json b/data/hfopenllm_v2/allknowingroger/QwenSlerp12-7B/9e0656e9-9b82-4f6d-b00a-c09cf9cbc105.json deleted file mode 100644 index e6896f1a4..000000000 --- a/data/hfopenllm_v2/allknowingroger/QwenSlerp12-7B/9e0656e9-9b82-4f6d-b00a-c09cf9cbc105.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allknowingroger_QwenSlerp12-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "QwenSlerp12-7B", - "id": "allknowingroger/QwenSlerp12-7B", - "developer": "allknowingroger", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5076 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5556 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2946 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3154 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4595 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4461 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allknowingroger/QwenSlerp4-14B/07c36058-e0e8-48ea-85f3-0a2cb2fe3443.json b/data/hfopenllm_v2/allknowingroger/QwenSlerp4-14B/07c36058-e0e8-48ea-85f3-0a2cb2fe3443.json deleted file mode 100644 index 38cf85407..000000000 --- a/data/hfopenllm_v2/allknowingroger/QwenSlerp4-14B/07c36058-e0e8-48ea-85f3-0a2cb2fe3443.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allknowingroger_QwenSlerp4-14B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "QwenSlerp4-14B", - "id": "allknowingroger/QwenSlerp4-14B", - "developer": "allknowingroger", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6328 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6483 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3693 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3725 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.465 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5436 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allknowingroger/QwenSlerp5-14B/c41d8925-b56b-458e-b1a9-27dbbcaee149.json b/data/hfopenllm_v2/allknowingroger/QwenSlerp5-14B/c41d8925-b56b-458e-b1a9-27dbbcaee149.json deleted file mode 100644 index 615bfa0a9..000000000 --- a/data/hfopenllm_v2/allknowingroger/QwenSlerp5-14B/c41d8925-b56b-458e-b1a9-27dbbcaee149.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allknowingroger_QwenSlerp5-14B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "QwenSlerp5-14B", - "id": "allknowingroger/QwenSlerp5-14B", - "developer": "allknowingroger", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7119 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6357 - } - }, - { - "evaluation_name": 
"MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3565 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3649 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4675 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5391 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allknowingroger/QwenSlerp6-14B/9136feb4-5c3e-48b3-bc70-c7816b8b189b.json b/data/hfopenllm_v2/allknowingroger/QwenSlerp6-14B/9136feb4-5c3e-48b3-bc70-c7816b8b189b.json deleted file mode 100644 index 32bf693fb..000000000 --- a/data/hfopenllm_v2/allknowingroger/QwenSlerp6-14B/9136feb4-5c3e-48b3-bc70-c7816b8b189b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allknowingroger_QwenSlerp6-14B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "QwenSlerp6-14B", - "id": "allknowingroger/QwenSlerp6-14B", - "developer": "allknowingroger", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6867 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6384 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3724 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3733 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.469 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5406 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allknowingroger/QwenStock1-14B/c395ef02-9a50-4696-aad2-bcb32ba05f67.json b/data/hfopenllm_v2/allknowingroger/QwenStock1-14B/c395ef02-9a50-4696-aad2-bcb32ba05f67.json deleted file mode 100644 index dd2bb81bf..000000000 --- a/data/hfopenllm_v2/allknowingroger/QwenStock1-14B/c395ef02-9a50-4696-aad2-bcb32ba05f67.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allknowingroger_QwenStock1-14B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "QwenStock1-14B", - "id": "allknowingroger/QwenStock1-14B", - "developer": "allknowingroger", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5634 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6528 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3769 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - 
"source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3767 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.473 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5418 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allknowingroger/QwenStock2-14B/93f47969-556a-4fd4-b7bb-4d1c861a8d71.json b/data/hfopenllm_v2/allknowingroger/QwenStock2-14B/93f47969-556a-4fd4-b7bb-4d1c861a8d71.json deleted file mode 100644 index 83694e0b6..000000000 --- a/data/hfopenllm_v2/allknowingroger/QwenStock2-14B/93f47969-556a-4fd4-b7bb-4d1c861a8d71.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allknowingroger_QwenStock2-14B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "QwenStock2-14B", - "id": "allknowingroger/QwenStock2-14B", - "developer": "allknowingroger", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5563 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6569 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3882 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.3792 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4756 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5406 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allknowingroger/QwenStock3-14B/349ae559-6c1f-4b2f-954c-e83cba1e603a.json b/data/hfopenllm_v2/allknowingroger/QwenStock3-14B/349ae559-6c1f-4b2f-954c-e83cba1e603a.json deleted file mode 100644 index 5d15474d8..000000000 --- a/data/hfopenllm_v2/allknowingroger/QwenStock3-14B/349ae559-6c1f-4b2f-954c-e83cba1e603a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allknowingroger_QwenStock3-14B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "QwenStock3-14B", - "id": "allknowingroger/QwenStock3-14B", - "developer": "allknowingroger", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5615 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6565 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3776 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3784 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": 
"Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4756 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5428 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allknowingroger/Qwenslerp2-14B/3e43c3f6-645b-4ab3-b684-b23eb67bc5d9.json b/data/hfopenllm_v2/allknowingroger/Qwenslerp2-14B/3e43c3f6-645b-4ab3-b684-b23eb67bc5d9.json deleted file mode 100644 index 74527b77f..000000000 --- a/data/hfopenllm_v2/allknowingroger/Qwenslerp2-14B/3e43c3f6-645b-4ab3-b684-b23eb67bc5d9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allknowingroger_Qwenslerp2-14B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwenslerp2-14B", - "id": "allknowingroger/Qwenslerp2-14B", - "developer": "allknowingroger", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5007 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6555 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4456 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3683 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4729 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - 
"dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5403 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allknowingroger/Qwenslerp2-7B/500c8cd4-fe4e-44f3-86b7-b0efd387ab92.json b/data/hfopenllm_v2/allknowingroger/Qwenslerp2-7B/500c8cd4-fe4e-44f3-86b7-b0efd387ab92.json deleted file mode 100644 index 0d48394c6..000000000 --- a/data/hfopenllm_v2/allknowingroger/Qwenslerp2-7B/500c8cd4-fe4e-44f3-86b7-b0efd387ab92.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allknowingroger_Qwenslerp2-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwenslerp2-7B", - "id": "allknowingroger/Qwenslerp2-7B", - "developer": "allknowingroger", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5294 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5609 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3421 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3129 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4356 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 
0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4515 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allknowingroger/Qwenslerp3-14B/340a3ebb-bc06-404f-84e7-aeccc016fd32.json b/data/hfopenllm_v2/allknowingroger/Qwenslerp3-14B/340a3ebb-bc06-404f-84e7-aeccc016fd32.json deleted file mode 100644 index 3dedb719e..000000000 --- a/data/hfopenllm_v2/allknowingroger/Qwenslerp3-14B/340a3ebb-bc06-404f-84e7-aeccc016fd32.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allknowingroger_Qwenslerp3-14B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwenslerp3-14B", - "id": "allknowingroger/Qwenslerp3-14B", - "developer": "allknowingroger", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5052 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6521 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4464 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.375 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4676 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5395 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allknowingroger/Qwenslerp3-7B/a6426f88-d7cc-4e6a-a2b5-76e59a52a6de.json 
b/data/hfopenllm_v2/allknowingroger/Qwenslerp3-7B/a6426f88-d7cc-4e6a-a2b5-76e59a52a6de.json deleted file mode 100644 index c20870016..000000000 --- a/data/hfopenllm_v2/allknowingroger/Qwenslerp3-7B/a6426f88-d7cc-4e6a-a2b5-76e59a52a6de.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allknowingroger_Qwenslerp3-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwenslerp3-7B", - "id": "allknowingroger/Qwenslerp3-7B", - "developer": "allknowingroger", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5018 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.558 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3218 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3247 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4515 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4542 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allknowingroger/ROGERphi-7B-slerp/bdd05c8f-b895-4c91-9a9f-a608a4259cbd.json b/data/hfopenllm_v2/allknowingroger/ROGERphi-7B-slerp/bdd05c8f-b895-4c91-9a9f-a608a4259cbd.json deleted file mode 100644 index 0ef272faf..000000000 --- 
a/data/hfopenllm_v2/allknowingroger/ROGERphi-7B-slerp/bdd05c8f-b895-4c91-9a9f-a608a4259cbd.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allknowingroger_ROGERphi-7B-slerp/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ROGERphi-7B-slerp", - "id": "allknowingroger/ROGERphi-7B-slerp", - "developer": "allknowingroger", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3861 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5196 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0733 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2886 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4685 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3053 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allknowingroger/RogerMerge-7B-slerp/0e1e45d4-2747-480d-9b1f-2b200e250271.json b/data/hfopenllm_v2/allknowingroger/RogerMerge-7B-slerp/0e1e45d4-2747-480d-9b1f-2b200e250271.json deleted file mode 100644 index e44fe42b8..000000000 --- a/data/hfopenllm_v2/allknowingroger/RogerMerge-7B-slerp/0e1e45d4-2747-480d-9b1f-2b200e250271.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/allknowingroger_RogerMerge-7B-slerp/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "RogerMerge-7B-slerp", - "id": "allknowingroger/RogerMerge-7B-slerp", - "developer": "allknowingroger", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3933 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.516 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0687 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2802 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.432 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.303 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allknowingroger/Rombos-LLM-V2.5-Qwen-42b/00f3f9ca-ae7d-4e62-9e7e-6bd202dbed59.json b/data/hfopenllm_v2/allknowingroger/Rombos-LLM-V2.5-Qwen-42b/00f3f9ca-ae7d-4e62-9e7e-6bd202dbed59.json deleted file mode 100644 index 213c2250c..000000000 --- a/data/hfopenllm_v2/allknowingroger/Rombos-LLM-V2.5-Qwen-42b/00f3f9ca-ae7d-4e62-9e7e-6bd202dbed59.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allknowingroger_Rombos-LLM-V2.5-Qwen-42b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": 
"documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Rombos-LLM-V2.5-Qwen-42b", - "id": "allknowingroger/Rombos-LLM-V2.5-Qwen-42b", - "developer": "allknowingroger", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 42.516 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1879 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2969 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2626 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3633 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1168 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allknowingroger/Strangecoven-7B-slerp/c9e57ab2-c2a4-4935-b976-4bf24647b777.json b/data/hfopenllm_v2/allknowingroger/Strangecoven-7B-slerp/c9e57ab2-c2a4-4935-b976-4bf24647b777.json deleted file mode 100644 index a4348c83f..000000000 --- a/data/hfopenllm_v2/allknowingroger/Strangecoven-7B-slerp/c9e57ab2-c2a4-4935-b976-4bf24647b777.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allknowingroger_Strangecoven-7B-slerp/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Strangecoven-7B-slerp", - "id": 
"allknowingroger/Strangecoven-7B-slerp", - "developer": "allknowingroger", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3746 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5368 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0763 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2894 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4199 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3364 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allknowingroger/Weirdslerp2-25B/c22436a2-ec60-4220-82b3-123618165eb2.json b/data/hfopenllm_v2/allknowingroger/Weirdslerp2-25B/c22436a2-ec60-4220-82b3-123618165eb2.json deleted file mode 100644 index a8975282d..000000000 --- a/data/hfopenllm_v2/allknowingroger/Weirdslerp2-25B/c22436a2-ec60-4220-82b3-123618165eb2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allknowingroger_Weirdslerp2-25B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Weirdslerp2-25B", - "id": "allknowingroger/Weirdslerp2-25B", - "developer": "allknowingroger", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 25.204 - } 
- }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1754 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2874 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2492 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3524 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1128 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allknowingroger/WestlakeMaziyar-7B-slerp/1f990438-dd84-44d2-99f9-a10035ecd652.json b/data/hfopenllm_v2/allknowingroger/WestlakeMaziyar-7B-slerp/1f990438-dd84-44d2-99f9-a10035ecd652.json deleted file mode 100644 index 75ae48ca3..000000000 --- a/data/hfopenllm_v2/allknowingroger/WestlakeMaziyar-7B-slerp/1f990438-dd84-44d2-99f9-a10035ecd652.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allknowingroger_WestlakeMaziyar-7B-slerp/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "WestlakeMaziyar-7B-slerp", - "id": "allknowingroger/WestlakeMaziyar-7B-slerp", - "developer": "allknowingroger", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4838 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5245 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0665 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3037 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4474 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3078 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allknowingroger/YamMaths-7B-slerp/f4564f5e-3595-466e-8201-0e2a4c50ff0d.json b/data/hfopenllm_v2/allknowingroger/YamMaths-7B-slerp/f4564f5e-3595-466e-8201-0e2a4c50ff0d.json deleted file mode 100644 index ab0338d1d..000000000 --- a/data/hfopenllm_v2/allknowingroger/YamMaths-7B-slerp/f4564f5e-3595-466e-8201-0e2a4c50ff0d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allknowingroger_YamMaths-7B-slerp/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "YamMaths-7B-slerp", - "id": "allknowingroger/YamMaths-7B-slerp", - "developer": "allknowingroger", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4148 - } - }, - 
{ - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5156 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0853 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2802 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4384 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3131 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allknowingroger/Yi-1.5-34B/040def3a-702d-4868-b429-39697ca36207.json b/data/hfopenllm_v2/allknowingroger/Yi-1.5-34B/040def3a-702d-4868-b429-39697ca36207.json deleted file mode 100644 index 73e379495..000000000 --- a/data/hfopenllm_v2/allknowingroger/Yi-1.5-34B/040def3a-702d-4868-b429-39697ca36207.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allknowingroger_Yi-1.5-34B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Yi-1.5-34B", - "id": "allknowingroger/Yi-1.5-34B", - "developer": "allknowingroger", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 34.389 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1639 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2827 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2584 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3857 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1095 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allknowingroger/Yi-blossom-40B/9e24fd65-56ec-4160-b299-b34d702a3231.json b/data/hfopenllm_v2/allknowingroger/Yi-blossom-40B/9e24fd65-56ec-4160-b299-b34d702a3231.json deleted file mode 100644 index 6517b6a6e..000000000 --- a/data/hfopenllm_v2/allknowingroger/Yi-blossom-40B/9e24fd65-56ec-4160-b299-b34d702a3231.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allknowingroger_Yi-blossom-40B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Yi-blossom-40B", - "id": "allknowingroger/Yi-blossom-40B", - "developer": "allknowingroger", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 18.769 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2009 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3215 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": 
"DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2743 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3843 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.108 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allknowingroger/Yibuddy-35B/216bf9f8-9521-4311-a40b-8a847271265c.json b/data/hfopenllm_v2/allknowingroger/Yibuddy-35B/216bf9f8-9521-4311-a40b-8a847271265c.json deleted file mode 100644 index a8d56323b..000000000 --- a/data/hfopenllm_v2/allknowingroger/Yibuddy-35B/216bf9f8-9521-4311-a40b-8a847271265c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allknowingroger_Yibuddy-35B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Yibuddy-35B", - "id": "allknowingroger/Yibuddy-35B", - "developer": "allknowingroger", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 34.389 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4235 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5916 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1571 
- } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3557 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4505 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4489 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allknowingroger/Yillama-40B/45f8c4fb-3591-44df-a4f0-57093b9bae23.json b/data/hfopenllm_v2/allknowingroger/Yillama-40B/45f8c4fb-3591-44df-a4f0-57093b9bae23.json deleted file mode 100644 index 925316eac..000000000 --- a/data/hfopenllm_v2/allknowingroger/Yillama-40B/45f8c4fb-3591-44df-a4f0-57093b9bae23.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allknowingroger_Yillama-40B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Yillama-40B", - "id": "allknowingroger/Yillama-40B", - "developer": "allknowingroger", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 34.389 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1697 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4063 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2827 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3501 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1981 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allknowingroger/Yislerp-34B/d17275ef-8a32-4fcb-94f4-fb24299ba50e.json b/data/hfopenllm_v2/allknowingroger/Yislerp-34B/d17275ef-8a32-4fcb-94f4-fb24299ba50e.json deleted file mode 100644 index 8c5bc4734..000000000 --- a/data/hfopenllm_v2/allknowingroger/Yislerp-34B/d17275ef-8a32-4fcb-94f4-fb24299ba50e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allknowingroger_Yislerp-34B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Yislerp-34B", - "id": "allknowingroger/Yislerp-34B", - "developer": "allknowingroger", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 34.389 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3692 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6159 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.216 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3582 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - 
}, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4566 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4751 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allknowingroger/Yislerp2-34B/61b79e7d-0f50-4cfe-825c-ed5b23d943f3.json b/data/hfopenllm_v2/allknowingroger/Yislerp2-34B/61b79e7d-0f50-4cfe-825c-ed5b23d943f3.json deleted file mode 100644 index 15c44b31d..000000000 --- a/data/hfopenllm_v2/allknowingroger/Yislerp2-34B/61b79e7d-0f50-4cfe-825c-ed5b23d943f3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allknowingroger_Yislerp2-34B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Yislerp2-34B", - "id": "allknowingroger/Yislerp2-34B", - "developer": "allknowingroger", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 34.389 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3999 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6246 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2296 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3641 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.453 - } - }, - { - "evaluation_name": 
"MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4724 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allknowingroger/Yunconglong-13B-slerp/113c3507-b738-4b06-ada8-da93b19c6ae2.json b/data/hfopenllm_v2/allknowingroger/Yunconglong-13B-slerp/113c3507-b738-4b06-ada8-da93b19c6ae2.json deleted file mode 100644 index ca824d478..000000000 --- a/data/hfopenllm_v2/allknowingroger/Yunconglong-13B-slerp/113c3507-b738-4b06-ada8-da93b19c6ae2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allknowingroger_Yunconglong-13B-slerp/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Yunconglong-13B-slerp", - "id": "allknowingroger/Yunconglong-13B-slerp", - "developer": "allknowingroger", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MixtralForCausalLM", - "params_billions": 12.879 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4242 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5166 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0544 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.281 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4161 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on 
MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3036 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allknowingroger/limyClown-7B-slerp/8835d5c1-8350-4d42-a753-82b94dffda3b.json b/data/hfopenllm_v2/allknowingroger/limyClown-7B-slerp/8835d5c1-8350-4d42-a753-82b94dffda3b.json deleted file mode 100644 index b54682c35..000000000 --- a/data/hfopenllm_v2/allknowingroger/limyClown-7B-slerp/8835d5c1-8350-4d42-a753-82b94dffda3b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allknowingroger_limyClown-7B-slerp/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "limyClown-7B-slerp", - "id": "allknowingroger/limyClown-7B-slerp", - "developer": "allknowingroger", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4017 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5148 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0687 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.281 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4293 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3038 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/allknowingroger/llama3-Jallabi-40B-s/dc3bbda7-5007-44c7-b1ba-af0c82d100ee.json b/data/hfopenllm_v2/allknowingroger/llama3-Jallabi-40B-s/dc3bbda7-5007-44c7-b1ba-af0c82d100ee.json deleted file mode 100644 index 3d7d9aa70..000000000 --- a/data/hfopenllm_v2/allknowingroger/llama3-Jallabi-40B-s/dc3bbda7-5007-44c7-b1ba-af0c82d100ee.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allknowingroger_llama3-Jallabi-40B-s/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "llama3-Jallabi-40B-s", - "id": "allknowingroger/llama3-Jallabi-40B-s", - "developer": "allknowingroger", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 18.769 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1921 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3252 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2374 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.375 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1088 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allknowingroger/llama3AnFeng-40B/0d24ee06-a6b4-4be7-b3ef-c4f53b4fc414.json b/data/hfopenllm_v2/allknowingroger/llama3AnFeng-40B/0d24ee06-a6b4-4be7-b3ef-c4f53b4fc414.json deleted file mode 
100644 index ec77ddbf3..000000000 --- a/data/hfopenllm_v2/allknowingroger/llama3AnFeng-40B/0d24ee06-a6b4-4be7-b3ef-c4f53b4fc414.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allknowingroger_llama3AnFeng-40B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "llama3AnFeng-40B", - "id": "allknowingroger/llama3AnFeng-40B", - "developer": "allknowingroger", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 39.971 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1742 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3794 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3062 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.394 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.198 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allura-org/L3.1-8b-RP-Ink/f2415b7a-2cd7-4a05-834b-7da992e1da1a.json b/data/hfopenllm_v2/allura-org/L3.1-8b-RP-Ink/f2415b7a-2cd7-4a05-834b-7da992e1da1a.json deleted file mode 100644 index acab59baa..000000000 --- a/data/hfopenllm_v2/allura-org/L3.1-8b-RP-Ink/f2415b7a-2cd7-4a05-834b-7da992e1da1a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/allura-org_L3.1-8b-RP-Ink/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "L3.1-8b-RP-Ink", - "id": "allura-org/L3.1-8b-RP-Ink", - "developer": "allura-org", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7811 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4828 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.148 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2643 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3608 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3428 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allura-org/MN-12b-RP-Ink/01af237f-40d8-4841-a90d-13dce6db8634.json b/data/hfopenllm_v2/allura-org/MN-12b-RP-Ink/01af237f-40d8-4841-a90d-13dce6db8634.json deleted file mode 100644 index 36c98b87c..000000000 --- a/data/hfopenllm_v2/allura-org/MN-12b-RP-Ink/01af237f-40d8-4841-a90d-13dce6db8634.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allura-org_MN-12b-RP-Ink/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": 
"third_party" - }, - "model_info": { - "name": "MN-12b-RP-Ink", - "id": "allura-org/MN-12b-RP-Ink", - "developer": "allura-org", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7186 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4834 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1186 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2852 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3818 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3514 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allura-org/MS-Meadowlark-22B/d69bb392-fd38-4f57-b567-24566896167b.json b/data/hfopenllm_v2/allura-org/MS-Meadowlark-22B/d69bb392-fd38-4f57-b567-24566896167b.json deleted file mode 100644 index 242f13b4f..000000000 --- a/data/hfopenllm_v2/allura-org/MS-Meadowlark-22B/d69bb392-fd38-4f57-b567-24566896167b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allura-org_MS-Meadowlark-22B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MS-Meadowlark-22B", - "id": "allura-org/MS-Meadowlark-22B", - "developer": "allura-org", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": 
"MistralForCausalLM", - "params_billions": 22.247 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6697 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5163 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1835 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3255 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3843 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3823 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allura-org/Mistral-Small-24b-Sertraline-0304/63503943-1c1e-4dac-9c41-4933fbb44b70.json b/data/hfopenllm_v2/allura-org/Mistral-Small-24b-Sertraline-0304/63503943-1c1e-4dac-9c41-4933fbb44b70.json deleted file mode 100644 index a1e249c7d..000000000 --- a/data/hfopenllm_v2/allura-org/Mistral-Small-24b-Sertraline-0304/63503943-1c1e-4dac-9c41-4933fbb44b70.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allura-org_Mistral-Small-24b-Sertraline-0304/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral-Small-24b-Sertraline-0304", - "id": "allura-org/Mistral-Small-24b-Sertraline-0304", - "developer": "allura-org", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 23.572 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - 
"dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.68 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6525 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2228 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3515 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4395 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5106 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allura-org/Mistral-Small-Sisyphus-24b-2503/80c5d343-41e6-45d7-8921-62586a3cd270.json b/data/hfopenllm_v2/allura-org/Mistral-Small-Sisyphus-24b-2503/80c5d343-41e6-45d7-8921-62586a3cd270.json deleted file mode 100644 index 7a4d2fa7f..000000000 --- a/data/hfopenllm_v2/allura-org/Mistral-Small-Sisyphus-24b-2503/80c5d343-41e6-45d7-8921-62586a3cd270.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allura-org_Mistral-Small-Sisyphus-24b-2503/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral-Small-Sisyphus-24b-2503", - "id": "allura-org/Mistral-Small-Sisyphus-24b-2503", - "developer": "allura-org", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 23.572 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6848 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.627 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.25 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2626 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3977 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5127 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allura-org/MoE-Girl-1BA-7BT/2c27d7f6-60fd-49f3-8666-784f2a16031b.json b/data/hfopenllm_v2/allura-org/MoE-Girl-1BA-7BT/2c27d7f6-60fd-49f3-8666-784f2a16031b.json deleted file mode 100644 index 93ee2f96c..000000000 --- a/data/hfopenllm_v2/allura-org/MoE-Girl-1BA-7BT/2c27d7f6-60fd-49f3-8666-784f2a16031b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allura-org_MoE-Girl-1BA-7BT/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MoE-Girl-1BA-7BT", - "id": "allura-org/MoE-Girl-1BA-7BT", - "developer": "allura-org", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "OlmoeForCausalLM", - "params_billions": 6.919 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2705 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - 
"hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3139 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0151 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2584 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3436 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1218 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allura-org/TQ2.5-14B-Aletheia-v1/cbcc1e64-8455-4382-8999-654d1757bbd6.json b/data/hfopenllm_v2/allura-org/TQ2.5-14B-Aletheia-v1/cbcc1e64-8455-4382-8999-654d1757bbd6.json deleted file mode 100644 index 200aa0dee..000000000 --- a/data/hfopenllm_v2/allura-org/TQ2.5-14B-Aletheia-v1/cbcc1e64-8455-4382-8999-654d1757bbd6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allura-org_TQ2.5-14B-Aletheia-v1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "TQ2.5-14B-Aletheia-v1", - "id": "allura-org/TQ2.5-14B-Aletheia-v1", - "developer": "allura-org", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.753 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.6585 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3399 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3624 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4452 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5241 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allura-org/TQ2.5-14B-Neon-v1/1bea4f6b-7a41-4907-baca-430c7ea179e9.json b/data/hfopenllm_v2/allura-org/TQ2.5-14B-Neon-v1/1bea4f6b-7a41-4907-baca-430c7ea179e9.json deleted file mode 100644 index 42bdffdcf..000000000 --- a/data/hfopenllm_v2/allura-org/TQ2.5-14B-Neon-v1/1bea4f6b-7a41-4907-baca-430c7ea179e9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allura-org_TQ2.5-14B-Neon-v1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "TQ2.5-14B-Neon-v1", - "id": "allura-org/TQ2.5-14B-Neon-v1", - "developer": "allura-org", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6754 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6553 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": 
"Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3603 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3716 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.461 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5253 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/allura-org/Teleut-7b/298ce89b-966c-4f4e-9da5-3803a395188f.json b/data/hfopenllm_v2/allura-org/Teleut-7b/298ce89b-966c-4f4e-9da5-3803a395188f.json deleted file mode 100644 index 9486047d3..000000000 --- a/data/hfopenllm_v2/allura-org/Teleut-7b/298ce89b-966c-4f4e-9da5-3803a395188f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/allura-org_Teleut-7b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Teleut-7b", - "id": "allura-org/Teleut-7b", - "developer": "allura-org", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6379 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5141 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2409 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": 
"Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3263 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.464 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4131 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/aloobun/Meta-Llama-3-7B-28Layers/ea27a4d6-8c32-4b36-873d-1046ae6240e5.json b/data/hfopenllm_v2/aloobun/Meta-Llama-3-7B-28Layers/ea27a4d6-8c32-4b36-873d-1046ae6240e5.json deleted file mode 100644 index 1550e1cda..000000000 --- a/data/hfopenllm_v2/aloobun/Meta-Llama-3-7B-28Layers/ea27a4d6-8c32-4b36-873d-1046ae6240e5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/aloobun_Meta-Llama-3-7B-28Layers/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Meta-Llama-3-7B-28Layers", - "id": "aloobun/Meta-Llama-3-7B-28Layers", - "developer": "aloobun", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 7.158 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1964 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4437 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0279 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 
0.2945 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3589 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.316 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/aloobun/d-SmolLM2-360M/73d5905d-7825-43ba-8051-7e1f5639b857.json b/data/hfopenllm_v2/aloobun/d-SmolLM2-360M/73d5905d-7825-43ba-8051-7e1f5639b857.json deleted file mode 100644 index 9ae14979c..000000000 --- a/data/hfopenllm_v2/aloobun/d-SmolLM2-360M/73d5905d-7825-43ba-8051-7e1f5639b857.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/aloobun_d-SmolLM2-360M/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "d-SmolLM2-360M", - "id": "aloobun/d-SmolLM2-360M", - "developer": "aloobun", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 0.362 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2097 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3196 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0128 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2534 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3981 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1169 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/alpindale/WizardLM-2-8x22B/956b8589-a048-43be-9cfd-05658d3c57ca.json b/data/hfopenllm_v2/alpindale/WizardLM-2-8x22B/956b8589-a048-43be-9cfd-05658d3c57ca.json deleted file mode 100644 index 11db1c919..000000000 --- a/data/hfopenllm_v2/alpindale/WizardLM-2-8x22B/956b8589-a048-43be-9cfd-05658d3c57ca.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/alpindale_WizardLM-2-8x22B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "WizardLM-2-8x22B", - "id": "alpindale/WizardLM-2-8x22B", - "developer": "alpindale", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MixtralForCausalLM", - "params_billions": 140.621 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5272 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6377 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.25 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3817 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4387 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4596 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/alpindale/magnum-72b-v1/36f597b4-8f53-4b40-9c0e-c9284743e456.json b/data/hfopenllm_v2/alpindale/magnum-72b-v1/36f597b4-8f53-4b40-9c0e-c9284743e456.json deleted file mode 100644 index 6b8bdd31e..000000000 --- a/data/hfopenllm_v2/alpindale/magnum-72b-v1/36f597b4-8f53-4b40-9c0e-c9284743e456.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/alpindale_magnum-72b-v1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "magnum-72b-v1", - "id": "alpindale/magnum-72b-v1", - "developer": "alpindale", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 72.706 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7606 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6982 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.398 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3909 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4489 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5468 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/altomek/YiSM-34B-0rn/7b67e526-7588-4c62-9293-55e77851c4c7.json b/data/hfopenllm_v2/altomek/YiSM-34B-0rn/7b67e526-7588-4c62-9293-55e77851c4c7.json deleted file mode 100644 index f2755bff5..000000000 --- a/data/hfopenllm_v2/altomek/YiSM-34B-0rn/7b67e526-7588-4c62-9293-55e77851c4c7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/altomek_YiSM-34B-0rn/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "YiSM-34B-0rn", - "id": "altomek/YiSM-34B-0rn", - "developer": "altomek", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 34.389 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4284 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.614 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2281 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3716 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.445 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4696 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/amazon/MegaBeam-Mistral-7B-300k/8bc96d6d-0cd7-49c4-8112-7d8fb1c45199.json b/data/hfopenllm_v2/amazon/MegaBeam-Mistral-7B-300k/8bc96d6d-0cd7-49c4-8112-7d8fb1c45199.json deleted file mode 100644 index 6587a17ee..000000000 --- 
a/data/hfopenllm_v2/amazon/MegaBeam-Mistral-7B-300k/8bc96d6d-0cd7-49c4-8112-7d8fb1c45199.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/amazon_MegaBeam-Mistral-7B-300k/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MegaBeam-Mistral-7B-300k", - "id": "amazon/MegaBeam-Mistral-7B-300k", - "developer": "amazon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5203 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4228 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0211 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2735 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.398 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2549 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/amd/AMD-Llama-135m/6751a200-0bd9-498e-a991-ebe22375633d.json b/data/hfopenllm_v2/amd/AMD-Llama-135m/6751a200-0bd9-498e-a991-ebe22375633d.json deleted file mode 100644 index c580f04da..000000000 --- a/data/hfopenllm_v2/amd/AMD-Llama-135m/6751a200-0bd9-498e-a991-ebe22375633d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/amd_AMD-Llama-135m/1770682486.623709", - "retrieved_timestamp": 
"1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "AMD-Llama-135m", - "id": "amd/AMD-Llama-135m", - "developer": "amd", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 0.134 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1918 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2969 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0076 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2584 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3846 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1169 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/amd/AMD-Llama-135m/f41442e3-5aa7-4ca4-9e61-a5e13965a3e4.json b/data/hfopenllm_v2/amd/AMD-Llama-135m/f41442e3-5aa7-4ca4-9e61-a5e13965a3e4.json deleted file mode 100644 index ac33d4f61..000000000 --- a/data/hfopenllm_v2/amd/AMD-Llama-135m/f41442e3-5aa7-4ca4-9e61-a5e13965a3e4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/amd_AMD-Llama-135m/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "AMD-Llama-135m", - "id": "amd/AMD-Llama-135m", - "developer": "amd", - 
"inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 0.135 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1842 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2974 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0053 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2525 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.378 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1169 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/anakin87/gemma-2b-orpo/b105b62a-ce77-4387-b679-1adf2782b2f4.json b/data/hfopenllm_v2/anakin87/gemma-2b-orpo/b105b62a-ce77-4387-b679-1adf2782b2f4.json deleted file mode 100644 index 83e2b1b5b..000000000 --- a/data/hfopenllm_v2/anakin87/gemma-2b-orpo/b105b62a-ce77-4387-b679-1adf2782b2f4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/anakin87_gemma-2b-orpo/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "gemma-2b-orpo", - "id": "anakin87/gemma-2b-orpo", - "developer": "anakin87", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "GemmaForCausalLM", - "params_billions": 2.506 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": 
"hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2478 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3426 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0189 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2617 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3728 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1306 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/anthracite-org/magnum-v1-72b/72180fd7-bf34-4758-b02f-7d11859700c7.json b/data/hfopenllm_v2/anthracite-org/magnum-v1-72b/72180fd7-bf34-4758-b02f-7d11859700c7.json deleted file mode 100644 index 18b204986..000000000 --- a/data/hfopenllm_v2/anthracite-org/magnum-v1-72b/72180fd7-bf34-4758-b02f-7d11859700c7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/anthracite-org_magnum-v1-72b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "magnum-v1-72b", - "id": "anthracite-org/magnum-v1-72b", - "developer": "anthracite-org", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 72.706 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 
0.7606 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6982 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.398 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3909 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4489 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5486 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/anthracite-org/magnum-v2-12b/ac5aaa9c-79ab-4082-b8c5-084fba3e122a.json b/data/hfopenllm_v2/anthracite-org/magnum-v2-12b/ac5aaa9c-79ab-4082-b8c5-084fba3e122a.json deleted file mode 100644 index 26fb22a26..000000000 --- a/data/hfopenllm_v2/anthracite-org/magnum-v2-12b/ac5aaa9c-79ab-4082-b8c5-084fba3e122a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/anthracite-org_magnum-v2-12b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "magnum-v2-12b", - "id": "anthracite-org/magnum-v2-12b", - "developer": "anthracite-org", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3762 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": 
false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5021 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0544 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2911 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4179 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3167 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/anthracite-org/magnum-v2-72b/2d266d7f-8edd-40fd-adfc-597a7742167b.json b/data/hfopenllm_v2/anthracite-org/magnum-v2-72b/2d266d7f-8edd-40fd-adfc-597a7742167b.json deleted file mode 100644 index ea532bb60..000000000 --- a/data/hfopenllm_v2/anthracite-org/magnum-v2-72b/2d266d7f-8edd-40fd-adfc-597a7742167b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/anthracite-org_magnum-v2-72b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "magnum-v2-72b", - "id": "anthracite-org/magnum-v2-72b", - "developer": "anthracite-org", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 72.706 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.756 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7005 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", 
- "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3542 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3859 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4372 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5456 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/anthracite-org/magnum-v2.5-12b-kto/484ccbf2-87e2-423f-9de4-a4bd54291b54.json b/data/hfopenllm_v2/anthracite-org/magnum-v2.5-12b-kto/484ccbf2-87e2-423f-9de4-a4bd54291b54.json deleted file mode 100644 index d05197a33..000000000 --- a/data/hfopenllm_v2/anthracite-org/magnum-v2.5-12b-kto/484ccbf2-87e2-423f-9de4-a4bd54291b54.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/anthracite-org_magnum-v2.5-12b-kto/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "magnum-v2.5-12b-kto", - "id": "anthracite-org/magnum-v2.5-12b-kto", - "developer": "anthracite-org", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3866 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5077 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.0521 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2936 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4086 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3215 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/anthracite-org/magnum-v3-27b-kto/4de79504-f9e8-4235-9aad-d38f0799e081.json b/data/hfopenllm_v2/anthracite-org/magnum-v3-27b-kto/4de79504-f9e8-4235-9aad-d38f0799e081.json deleted file mode 100644 index 81b95e27b..000000000 --- a/data/hfopenllm_v2/anthracite-org/magnum-v3-27b-kto/4de79504-f9e8-4235-9aad-d38f0799e081.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/anthracite-org_magnum-v3-27b-kto/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "magnum-v3-27b-kto", - "id": "anthracite-org/magnum-v3-27b-kto", - "developer": "anthracite-org", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 27.227 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5675 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.586 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1813 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3557 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3855 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4238 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/anthracite-org/magnum-v3-34b/b4bde9d8-f50c-448c-ada4-5bc05f302c04.json b/data/hfopenllm_v2/anthracite-org/magnum-v3-34b/b4bde9d8-f50c-448c-ada4-5bc05f302c04.json deleted file mode 100644 index 84369bf89..000000000 --- a/data/hfopenllm_v2/anthracite-org/magnum-v3-34b/b4bde9d8-f50c-448c-ada4-5bc05f302c04.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/anthracite-org_magnum-v3-34b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "magnum-v3-34b", - "id": "anthracite-org/magnum-v3-34b", - "developer": "anthracite-org", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 34.389 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5115 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6088 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1949 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3607 - } - }, - { - "evaluation_name": "MUSR", - 
"source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3872 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4752 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/anthracite-org/magnum-v3-9b-chatml/5da3240b-b5e3-4333-ba61-925343b56043.json b/data/hfopenllm_v2/anthracite-org/magnum-v3-9b-chatml/5da3240b-b5e3-4333-ba61-925343b56043.json deleted file mode 100644 index 48d20512c..000000000 --- a/data/hfopenllm_v2/anthracite-org/magnum-v3-9b-chatml/5da3240b-b5e3-4333-ba61-925343b56043.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/anthracite-org_magnum-v3-9b-chatml/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "magnum-v3-9b-chatml", - "id": "anthracite-org/magnum-v3-9b-chatml", - "developer": "anthracite-org", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 9.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1275 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5428 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0695 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3456 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4432 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4242 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/anthracite-org/magnum-v3-9b-customgemma2/d6727b7d-cdf3-48d5-8e30-484e86ad60b6.json b/data/hfopenllm_v2/anthracite-org/magnum-v3-9b-customgemma2/d6727b7d-cdf3-48d5-8e30-484e86ad60b6.json deleted file mode 100644 index 08eeb4ea5..000000000 --- a/data/hfopenllm_v2/anthracite-org/magnum-v3-9b-customgemma2/d6727b7d-cdf3-48d5-8e30-484e86ad60b6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/anthracite-org_magnum-v3-9b-customgemma2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "magnum-v3-9b-customgemma2", - "id": "anthracite-org/magnum-v3-9b-customgemma2", - "developer": "anthracite-org", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 9.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1273 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.534 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0718 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3289 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4565 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - 
"dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4205 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/anthracite-org/magnum-v4-12b/15b86bbf-8d3b-474b-98f0-abb3972a7271.json b/data/hfopenllm_v2/anthracite-org/magnum-v4-12b/15b86bbf-8d3b-474b-98f0-abb3972a7271.json deleted file mode 100644 index 98e18e6b7..000000000 --- a/data/hfopenllm_v2/anthracite-org/magnum-v4-12b/15b86bbf-8d3b-474b-98f0-abb3972a7271.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/anthracite-org_magnum-v4-12b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "magnum-v4-12b", - "id": "anthracite-org/magnum-v4-12b", - "developer": "anthracite-org", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3393 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5177 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1178 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2961 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4093 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.3604 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/anthracite-org/magnum-v4-22b/c0b339f6-4a46-46eb-b2d0-945176afe676.json b/data/hfopenllm_v2/anthracite-org/magnum-v4-22b/c0b339f6-4a46-46eb-b2d0-945176afe676.json deleted file mode 100644 index 8cce65095..000000000 --- a/data/hfopenllm_v2/anthracite-org/magnum-v4-22b/c0b339f6-4a46-46eb-b2d0-945176afe676.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/anthracite-org_magnum-v4-22b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "magnum-v4-22b", - "id": "anthracite-org/magnum-v4-22b", - "developer": "anthracite-org", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 22.247 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5629 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5486 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2002 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.328 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4408 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.383 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/anthracite-org/magnum-v4-27b/79367289-6245-4bf0-99e9-42bc3ff7649c.json 
b/data/hfopenllm_v2/anthracite-org/magnum-v4-27b/79367289-6245-4bf0-99e9-42bc3ff7649c.json deleted file mode 100644 index fd3218df0..000000000 --- a/data/hfopenllm_v2/anthracite-org/magnum-v4-27b/79367289-6245-4bf0-99e9-42bc3ff7649c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/anthracite-org_magnum-v4-27b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "magnum-v4-27b", - "id": "anthracite-org/magnum-v4-27b", - "developer": "anthracite-org", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 27.227 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3454 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5867 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1798 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.37 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.438 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4376 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/anthracite-org/magnum-v4-9b/c3ec5505-1086-446a-9739-523810e93d13.json b/data/hfopenllm_v2/anthracite-org/magnum-v4-9b/c3ec5505-1086-446a-9739-523810e93d13.json deleted file mode 100644 index d1166e7fe..000000000 --- a/data/hfopenllm_v2/anthracite-org/magnum-v4-9b/c3ec5505-1086-446a-9739-523810e93d13.json +++ /dev/null @@ 
-1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/anthracite-org_magnum-v4-9b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "magnum-v4-9b", - "id": "anthracite-org/magnum-v4-9b", - "developer": "anthracite-org", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 9.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3503 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5336 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1307 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3473 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4516 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3953 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/apple/DCLM-7B/c6c5e462-d373-4536-afc3-b740fb7e300f.json b/data/hfopenllm_v2/apple/DCLM-7B/c6c5e462-d373-4536-afc3-b740fb7e300f.json deleted file mode 100644 index 92760e54b..000000000 --- a/data/hfopenllm_v2/apple/DCLM-7B/c6c5e462-d373-4536-afc3-b740fb7e300f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/apple_DCLM-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - 
"evaluator_relationship": "third_party" - }, - "model_info": { - "name": "DCLM-7B", - "id": "apple/DCLM-7B", - "developer": "apple", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "OpenLMModel", - "params_billions": 7.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2173 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4232 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.037 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3154 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3921 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3111 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/appvoid/arco-2-instruct/b7537abe-8177-4206-999f-5bb7e95c72c8.json b/data/hfopenllm_v2/appvoid/arco-2-instruct/b7537abe-8177-4206-999f-5bb7e95c72c8.json deleted file mode 100644 index c53efb7dc..000000000 --- a/data/hfopenllm_v2/appvoid/arco-2-instruct/b7537abe-8177-4206-999f-5bb7e95c72c8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/appvoid_arco-2-instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "arco-2-instruct", - "id": "appvoid/arco-2-instruct", - "developer": "appvoid", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 0.514 - } - 
}, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2164 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3133 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0128 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2383 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3496 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1113 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/appvoid/arco-2/eb2f6159-e37e-46db-9419-6a66cb7e539e.json b/data/hfopenllm_v2/appvoid/arco-2/eb2f6159-e37e-46db-9419-6a66cb7e539e.json deleted file mode 100644 index a19063325..000000000 --- a/data/hfopenllm_v2/appvoid/arco-2/eb2f6159-e37e-46db-9419-6a66cb7e539e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/appvoid_arco-2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "arco-2", - "id": "appvoid/arco-2", - "developer": "appvoid", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 0.514 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.1991 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3146 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0136 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2391 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3536 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1116 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/arcee-ai/Arcee-Blitz/0b2d0a06-2907-4258-be33-1591e18ac6a2.json b/data/hfopenllm_v2/arcee-ai/Arcee-Blitz/0b2d0a06-2907-4258-be33-1591e18ac6a2.json deleted file mode 100644 index 8851d8d7b..000000000 --- a/data/hfopenllm_v2/arcee-ai/Arcee-Blitz/0b2d0a06-2907-4258-be33-1591e18ac6a2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/arcee-ai_Arcee-Blitz/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Arcee-Blitz", - "id": "arcee-ai/Arcee-Blitz", - "developer": "arcee-ai", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 23.572 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5543 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6607 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3482 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3851 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5047 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6154 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/arcee-ai/Arcee-Maestro-7B-Preview/0284d867-45c4-4fe4-883c-8e3ea169d66c.json b/data/hfopenllm_v2/arcee-ai/Arcee-Maestro-7B-Preview/0284d867-45c4-4fe4-883c-8e3ea169d66c.json deleted file mode 100644 index 9080738a7..000000000 --- a/data/hfopenllm_v2/arcee-ai/Arcee-Maestro-7B-Preview/0284d867-45c4-4fe4-883c-8e3ea169d66c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/arcee-ai_Arcee-Maestro-7B-Preview/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Arcee-Maestro-7B-Preview", - "id": "arcee-ai/Arcee-Maestro-7B-Preview", - "developer": "arcee-ai", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.613 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.275 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4648 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": 
"MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4992 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3322 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3885 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3039 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/arcee-ai/Arcee-Nova/1a2da513-104e-4074-b3b7-601ab11bf6d8.json b/data/hfopenllm_v2/arcee-ai/Arcee-Nova/1a2da513-104e-4074-b3b7-601ab11bf6d8.json deleted file mode 100644 index ec647a01e..000000000 --- a/data/hfopenllm_v2/arcee-ai/Arcee-Nova/1a2da513-104e-4074-b3b7-601ab11bf6d8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/arcee-ai_Arcee-Nova/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Arcee-Nova", - "id": "arcee-ai/Arcee-Nova", - "developer": "arcee-ai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 72.706 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7907 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6942 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.4381 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3851 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4562 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5452 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/arcee-ai/Arcee-Spark/189db16b-5e78-439f-9f79-6eec979c3a79.json b/data/hfopenllm_v2/arcee-ai/Arcee-Spark/189db16b-5e78-439f-9f79-6eec979c3a79.json deleted file mode 100644 index 38ed0acf4..000000000 --- a/data/hfopenllm_v2/arcee-ai/Arcee-Spark/189db16b-5e78-439f-9f79-6eec979c3a79.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/arcee-ai_Arcee-Spark/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Arcee-Spark", - "id": "arcee-ai/Arcee-Spark", - "developer": "arcee-ai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5621 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5489 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2953 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.307 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4021 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3822 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/arcee-ai/Arcee-Spark/d751f1c5-5505-4c12-8d51-091538b49949.json b/data/hfopenllm_v2/arcee-ai/Arcee-Spark/d751f1c5-5505-4c12-8d51-091538b49949.json deleted file mode 100644 index f4c610574..000000000 --- a/data/hfopenllm_v2/arcee-ai/Arcee-Spark/d751f1c5-5505-4c12-8d51-091538b49949.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/arcee-ai_Arcee-Spark/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Arcee-Spark", - "id": "arcee-ai/Arcee-Spark", - "developer": "arcee-ai", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5718 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5481 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.114 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3062 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4008 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3813 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/arcee-ai/Llama-3.1-SuperNova-Lite/b6f9144f-57a0-4c18-9e52-ffccf2d8ca9c.json b/data/hfopenllm_v2/arcee-ai/Llama-3.1-SuperNova-Lite/b6f9144f-57a0-4c18-9e52-ffccf2d8ca9c.json deleted file mode 100644 index 894a616bf..000000000 --- a/data/hfopenllm_v2/arcee-ai/Llama-3.1-SuperNova-Lite/b6f9144f-57a0-4c18-9e52-ffccf2d8ca9c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/arcee-ai_Llama-3.1-SuperNova-Lite/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.1-SuperNova-Lite", - "id": "arcee-ai/Llama-3.1-SuperNova-Lite", - "developer": "arcee-ai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8017 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5152 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1828 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3062 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4163 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": 
"MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3877 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/arcee-ai/Llama-Spark/67dc7fb2-1455-4f60-9dcb-59a8197741d7.json b/data/hfopenllm_v2/arcee-ai/Llama-Spark/67dc7fb2-1455-4f60-9dcb-59a8197741d7.json deleted file mode 100644 index ffa60c546..000000000 --- a/data/hfopenllm_v2/arcee-ai/Llama-Spark/67dc7fb2-1455-4f60-9dcb-59a8197741d7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/arcee-ai_Llama-Spark/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-Spark", - "id": "arcee-ai/Llama-Spark", - "developer": "arcee-ai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7911 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5054 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.139 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2995 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3593 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3721 - } - } 
- ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/arcee-ai/SuperNova-Medius/7f4ab590-29fa-473a-b617-00135dd1d6ee.json b/data/hfopenllm_v2/arcee-ai/SuperNova-Medius/7f4ab590-29fa-473a-b617-00135dd1d6ee.json deleted file mode 100644 index 4067be61a..000000000 --- a/data/hfopenllm_v2/arcee-ai/SuperNova-Medius/7f4ab590-29fa-473a-b617-00135dd1d6ee.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/arcee-ai_SuperNova-Medius/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SuperNova-Medius", - "id": "arcee-ai/SuperNova-Medius", - "developer": "arcee-ai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7184 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6377 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.469 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3331 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4233 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5035 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/arcee-ai/Virtuoso-Lite/d67db62e-e21d-43c8-8b4c-bfa353e47636.json b/data/hfopenllm_v2/arcee-ai/Virtuoso-Lite/d67db62e-e21d-43c8-8b4c-bfa353e47636.json deleted file mode 100644 index e33d0d23e..000000000 --- 
a/data/hfopenllm_v2/arcee-ai/Virtuoso-Lite/d67db62e-e21d-43c8-8b4c-bfa353e47636.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/arcee-ai_Virtuoso-Lite/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Virtuoso-Lite", - "id": "arcee-ai/Virtuoso-Lite", - "developer": "arcee-ai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 10.306 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.81 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6099 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.253 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.344 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4595 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4441 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/arcee-ai/Virtuoso-Small-v2/85abff46-8ae5-4a75-9522-721793224363.json b/data/hfopenllm_v2/arcee-ai/Virtuoso-Small-v2/85abff46-8ae5-4a75-9522-721793224363.json deleted file mode 100644 index 5a503f2da..000000000 --- a/data/hfopenllm_v2/arcee-ai/Virtuoso-Small-v2/85abff46-8ae5-4a75-9522-721793224363.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/arcee-ai_Virtuoso-Small-v2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - 
"source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Virtuoso-Small-v2", - "id": "arcee-ai/Virtuoso-Small-v2", - "developer": "arcee-ai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8273 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6554 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.466 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3532 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4313 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5188 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/arcee-ai/Virtuoso-Small/1736bbd8-4457-4d55-8c0b-0ae6e001ee62.json b/data/hfopenllm_v2/arcee-ai/Virtuoso-Small/1736bbd8-4457-4d55-8c0b-0ae6e001ee62.json deleted file mode 100644 index 4e3877887..000000000 --- a/data/hfopenllm_v2/arcee-ai/Virtuoso-Small/1736bbd8-4457-4d55-8c0b-0ae6e001ee62.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/arcee-ai_Virtuoso-Small/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Virtuoso-Small", - "id": "arcee-ai/Virtuoso-Small", - 
"developer": "arcee-ai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7935 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6518 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4094 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3364 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4339 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5191 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/arcee-ai/raspberry-3B/4777e427-8d17-4e06-8cbf-0883c95bbfd8.json b/data/hfopenllm_v2/arcee-ai/raspberry-3B/4777e427-8d17-4e06-8cbf-0883c95bbfd8.json deleted file mode 100644 index e1c6d4dab..000000000 --- a/data/hfopenllm_v2/arcee-ai/raspberry-3B/4777e427-8d17-4e06-8cbf-0883c95bbfd8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/arcee-ai_raspberry-3B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "raspberry-3B", - "id": "arcee-ai/raspberry-3B", - "developer": "arcee-ai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.086 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", 
- "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3154 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4269 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1035 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2777 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4123 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2854 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/argilla-warehouse/Llama-3.1-8B-MagPie-Ultra/4df0b890-d4c5-408e-8994-88f7383e9235.json b/data/hfopenllm_v2/argilla-warehouse/Llama-3.1-8B-MagPie-Ultra/4df0b890-d4c5-408e-8994-88f7383e9235.json deleted file mode 100644 index c9896a281..000000000 --- a/data/hfopenllm_v2/argilla-warehouse/Llama-3.1-8B-MagPie-Ultra/4df0b890-d4c5-408e-8994-88f7383e9235.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/argilla-warehouse_Llama-3.1-8B-MagPie-Ultra/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.1-8B-MagPie-Ultra", - "id": "argilla-warehouse/Llama-3.1-8B-MagPie-Ultra", - "developer": "argilla-warehouse", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, 
- "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5757 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.462 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.077 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2668 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3543 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3144 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/argilla/notus-7b-v1/76a5a59d-f5fd-4fb0-849e-7db7772b555a.json b/data/hfopenllm_v2/argilla/notus-7b-v1/76a5a59d-f5fd-4fb0-849e-7db7772b555a.json deleted file mode 100644 index d4480efe6..000000000 --- a/data/hfopenllm_v2/argilla/notus-7b-v1/76a5a59d-f5fd-4fb0-849e-7db7772b555a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/argilla_notus-7b-v1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "notus-7b-v1", - "id": "argilla/notus-7b-v1", - "developer": "argilla", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5082 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4512 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0317 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2894 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3364 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3004 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/argilla/notux-8x7b-v1/6c8399d0-01ce-45cb-a20f-a49e4e760a1e.json b/data/hfopenllm_v2/argilla/notux-8x7b-v1/6c8399d0-01ce-45cb-a20f-a49e4e760a1e.json deleted file mode 100644 index 8c4516521..000000000 --- a/data/hfopenllm_v2/argilla/notux-8x7b-v1/6c8399d0-01ce-45cb-a20f-a49e4e760a1e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/argilla_notux-8x7b-v1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "notux-8x7b-v1", - "id": "argilla/notux-8x7b-v1", - "developer": "argilla", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MixtralForCausalLM", - "params_billions": 46.703 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5422 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5363 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - 
"source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0997 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3087 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4176 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.366 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/arisin/orca-platypus-13B-slerp/92c2c5ee-dfa2-4db3-8401-887d02cc21dd.json b/data/hfopenllm_v2/arisin/orca-platypus-13B-slerp/92c2c5ee-dfa2-4db3-8401-887d02cc21dd.json deleted file mode 100644 index 593200335..000000000 --- a/data/hfopenllm_v2/arisin/orca-platypus-13B-slerp/92c2c5ee-dfa2-4db3-8401-887d02cc21dd.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/arisin_orca-platypus-13B-slerp/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "orca-platypus-13B-slerp", - "id": "arisin/orca-platypus-13B-slerp", - "developer": "arisin", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 13.016 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2672 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4631 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, 
- "max_score": 1.0 - }, - "score_details": { - "score": 0.0159 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2987 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4253 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2592 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/arshiaafshani/Arsh-V1/b40ef568-f277-4d5c-87cd-53feaa71598b.json b/data/hfopenllm_v2/arshiaafshani/Arsh-V1/b40ef568-f277-4d5c-87cd-53feaa71598b.json deleted file mode 100644 index 9bb389c7b..000000000 --- a/data/hfopenllm_v2/arshiaafshani/Arsh-V1/b40ef568-f277-4d5c-87cd-53feaa71598b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/arshiaafshani_Arsh-V1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Arsh-V1", - "id": "arshiaafshani/Arsh-V1", - "developer": "arshiaafshani", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 13.96 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6043 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.674 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2621 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3733 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4899 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5257 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/asharsha30/LLAMA_Harsha_8_B_ORDP_10k/893d5149-c535-41c7-8a1a-26bb6b33e407.json b/data/hfopenllm_v2/asharsha30/LLAMA_Harsha_8_B_ORDP_10k/893d5149-c535-41c7-8a1a-26bb6b33e407.json deleted file mode 100644 index 1530afe24..000000000 --- a/data/hfopenllm_v2/asharsha30/LLAMA_Harsha_8_B_ORDP_10k/893d5149-c535-41c7-8a1a-26bb6b33e407.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/asharsha30_LLAMA_Harsha_8_B_ORDP_10k/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "LLAMA_Harsha_8_B_ORDP_10k", - "id": "asharsha30/LLAMA_Harsha_8_B_ORDP_10k", - "developer": "asharsha30", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3464 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4669 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0665 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2735 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - 
"dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3697 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.281 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ashercn97/a1-v0.0.1/0b649ed5-5af4-4910-b853-2408e3b58f1f.json b/data/hfopenllm_v2/ashercn97/a1-v0.0.1/0b649ed5-5af4-4910-b853-2408e3b58f1f.json deleted file mode 100644 index d679dcf37..000000000 --- a/data/hfopenllm_v2/ashercn97/a1-v0.0.1/0b649ed5-5af4-4910-b853-2408e3b58f1f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ashercn97_a1-v0.0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "a1-v0.0.1", - "id": "ashercn97/a1-v0.0.1", - "developer": "ashercn97", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2198 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5188 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2145 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3112 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.412 - } - }, - 
{ - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4165 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ashercn97/a1-v002/5c8edeba-5c65-4168-b67e-02143acbcafb.json b/data/hfopenllm_v2/ashercn97/a1-v002/5c8edeba-5c65-4168-b67e-02143acbcafb.json deleted file mode 100644 index ae1d8850d..000000000 --- a/data/hfopenllm_v2/ashercn97/a1-v002/5c8edeba-5c65-4168-b67e-02143acbcafb.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ashercn97_a1-v002/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "a1-v002", - "id": "ashercn97/a1-v002", - "developer": "ashercn97", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2585 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5261 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2341 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3188 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4159 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 
1.0 - }, - "score_details": { - "score": 0.4175 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/assskelad/smollm2-360M-sft_SmallThoughts/67e657ef-d602-4f58-b898-874a22f4a009.json b/data/hfopenllm_v2/assskelad/smollm2-360M-sft_SmallThoughts/67e657ef-d602-4f58-b898-874a22f4a009.json deleted file mode 100644 index 72305cf3d..000000000 --- a/data/hfopenllm_v2/assskelad/smollm2-360M-sft_SmallThoughts/67e657ef-d602-4f58-b898-874a22f4a009.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/assskelad_smollm2-360M-sft_SmallThoughts/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "smollm2-360M-sft_SmallThoughts", - "id": "assskelad/smollm2-360M-sft_SmallThoughts", - "developer": "assskelad", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 0.362 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2007 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.315 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0166 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2592 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3395 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1182 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/athirdpath/Llama-3.1-Instruct_NSFW-pretrained_e1-plus_reddit/53d2bf07-689a-4e69-a534-b288313c8481.json b/data/hfopenllm_v2/athirdpath/Llama-3.1-Instruct_NSFW-pretrained_e1-plus_reddit/53d2bf07-689a-4e69-a534-b288313c8481.json deleted file mode 100644 index f4805d33d..000000000 --- a/data/hfopenllm_v2/athirdpath/Llama-3.1-Instruct_NSFW-pretrained_e1-plus_reddit/53d2bf07-689a-4e69-a534-b288313c8481.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/athirdpath_Llama-3.1-Instruct_NSFW-pretrained_e1-plus_reddit/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.1-Instruct_NSFW-pretrained_e1-plus_reddit", - "id": "athirdpath/Llama-3.1-Instruct_NSFW-pretrained_e1-plus_reddit", - "developer": "athirdpath", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4521 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4939 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.102 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2919 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3864 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3565 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/automerger/YamshadowExperiment28-7B/34d6a184-d4d5-4609-8305-c0e2ee1c585b.json b/data/hfopenllm_v2/automerger/YamshadowExperiment28-7B/34d6a184-d4d5-4609-8305-c0e2ee1c585b.json deleted file mode 100644 index 2b426dfef..000000000 --- a/data/hfopenllm_v2/automerger/YamshadowExperiment28-7B/34d6a184-d4d5-4609-8305-c0e2ee1c585b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/automerger_YamshadowExperiment28-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "YamshadowExperiment28-7B", - "id": "automerger/YamshadowExperiment28-7B", - "developer": "automerger", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.407 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.515 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0612 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2869 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4306 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.306 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/avemio/GRAG-NEMO-12B-ORPO-HESSIAN-AI/39b627ab-3e64-42f7-a88d-abe5764fcf4d.json b/data/hfopenllm_v2/avemio/GRAG-NEMO-12B-ORPO-HESSIAN-AI/39b627ab-3e64-42f7-a88d-abe5764fcf4d.json deleted file 
mode 100644 index a782f820f..000000000 --- a/data/hfopenllm_v2/avemio/GRAG-NEMO-12B-ORPO-HESSIAN-AI/39b627ab-3e64-42f7-a88d-abe5764fcf4d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/avemio_GRAG-NEMO-12B-ORPO-HESSIAN-AI/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "GRAG-NEMO-12B-ORPO-HESSIAN-AI", - "id": "avemio/GRAG-NEMO-12B-ORPO-HESSIAN-AI", - "developer": "avemio", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2607 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2592 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3447 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1061 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/awnr/Mistral-7B-v0.1-signtensors-1-over-2/d8467b15-8a03-4cde-9fc5-5c08bdabb6c6.json b/data/hfopenllm_v2/awnr/Mistral-7B-v0.1-signtensors-1-over-2/d8467b15-8a03-4cde-9fc5-5c08bdabb6c6.json deleted file mode 100644 index edcb076d2..000000000 --- a/data/hfopenllm_v2/awnr/Mistral-7B-v0.1-signtensors-1-over-2/d8467b15-8a03-4cde-9fc5-5c08bdabb6c6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - 
"schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/awnr_Mistral-7B-v0.1-signtensors-1-over-2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral-7B-v0.1-signtensors-1-over-2", - "id": "awnr/Mistral-7B-v0.1-signtensors-1-over-2", - "developer": "awnr", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2179 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4423 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.034 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.307 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4006 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/awnr/Mistral-7B-v0.1-signtensors-1-over-4/85bc5976-0d40-4416-bbf8-9b1dbf372343.json b/data/hfopenllm_v2/awnr/Mistral-7B-v0.1-signtensors-1-over-4/85bc5976-0d40-4416-bbf8-9b1dbf372343.json deleted file mode 100644 index cddb44dc5..000000000 --- a/data/hfopenllm_v2/awnr/Mistral-7B-v0.1-signtensors-1-over-4/85bc5976-0d40-4416-bbf8-9b1dbf372343.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/awnr_Mistral-7B-v0.1-signtensors-1-over-4/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - 
"source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral-7B-v0.1-signtensors-1-over-4", - "id": "awnr/Mistral-7B-v0.1-signtensors-1-over-4", - "developer": "awnr", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2133 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3507 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0249 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2701 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.346 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2311 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/awnr/Mistral-7B-v0.1-signtensors-3-over-8/8c7e8e64-672e-4c7e-a808-a49f1792d3a8.json b/data/hfopenllm_v2/awnr/Mistral-7B-v0.1-signtensors-3-over-8/8c7e8e64-672e-4c7e-a808-a49f1792d3a8.json deleted file mode 100644 index d4c3cdd30..000000000 --- a/data/hfopenllm_v2/awnr/Mistral-7B-v0.1-signtensors-3-over-8/8c7e8e64-672e-4c7e-a808-a49f1792d3a8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/awnr_Mistral-7B-v0.1-signtensors-3-over-8/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": 
"third_party" - }, - "model_info": { - "name": "Mistral-7B-v0.1-signtensors-3-over-8", - "id": "awnr/Mistral-7B-v0.1-signtensors-3-over-8", - "developer": "awnr", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2394 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.43 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0332 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3037 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3818 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3001 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/awnr/Mistral-7B-v0.1-signtensors-5-over-16/de8651eb-16d1-46ee-a1df-b8c72caaf205.json b/data/hfopenllm_v2/awnr/Mistral-7B-v0.1-signtensors-5-over-16/de8651eb-16d1-46ee-a1df-b8c72caaf205.json deleted file mode 100644 index cde7a7057..000000000 --- a/data/hfopenllm_v2/awnr/Mistral-7B-v0.1-signtensors-5-over-16/de8651eb-16d1-46ee-a1df-b8c72caaf205.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/awnr_Mistral-7B-v0.1-signtensors-5-over-16/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral-7B-v0.1-signtensors-5-over-16", - "id": "awnr/Mistral-7B-v0.1-signtensors-5-over-16", - "developer": "awnr", - 
"inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2118 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4124 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0295 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.281 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3686 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2958 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/awnr/Mistral-7B-v0.1-signtensors-7-over-16/6a744db8-814f-4e8e-b6e5-0d096267dfa5.json b/data/hfopenllm_v2/awnr/Mistral-7B-v0.1-signtensors-7-over-16/6a744db8-814f-4e8e-b6e5-0d096267dfa5.json deleted file mode 100644 index 7819c9d49..000000000 --- a/data/hfopenllm_v2/awnr/Mistral-7B-v0.1-signtensors-7-over-16/6a744db8-814f-4e8e-b6e5-0d096267dfa5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/awnr_Mistral-7B-v0.1-signtensors-7-over-16/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral-7B-v0.1-signtensors-7-over-16", - "id": "awnr/Mistral-7B-v0.1-signtensors-7-over-16", - "developer": "awnr", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - 
"evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2294 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4316 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0385 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3037 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3952 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.303 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/aws-prototyping/MegaBeam-Mistral-7B-512k/028b7c37-770e-4356-a7c6-0cc74650d5fd.json b/data/hfopenllm_v2/aws-prototyping/MegaBeam-Mistral-7B-512k/028b7c37-770e-4356-a7c6-0cc74650d5fd.json deleted file mode 100644 index 8bff123e1..000000000 --- a/data/hfopenllm_v2/aws-prototyping/MegaBeam-Mistral-7B-512k/028b7c37-770e-4356-a7c6-0cc74650d5fd.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/aws-prototyping_MegaBeam-Mistral-7B-512k/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MegaBeam-Mistral-7B-512k", - "id": "aws-prototyping/MegaBeam-Mistral-7B-512k", - "developer": "aws-prototyping", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5973 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3662 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0287 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2827 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3994 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2589 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/axolotl-ai-co/romulus-mistral-nemo-12b-simpo/3b399c64-922a-48ba-9a25-862102749647.json b/data/hfopenllm_v2/axolotl-ai-co/romulus-mistral-nemo-12b-simpo/3b399c64-922a-48ba-9a25-862102749647.json deleted file mode 100644 index fa7e408a7..000000000 --- a/data/hfopenllm_v2/axolotl-ai-co/romulus-mistral-nemo-12b-simpo/3b399c64-922a-48ba-9a25-862102749647.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/axolotl-ai-co_romulus-mistral-nemo-12b-simpo/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "romulus-mistral-nemo-12b-simpo", - "id": "axolotl-ai-co/romulus-mistral-nemo-12b-simpo", - "developer": "axolotl-ai-co", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.6079 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5395 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.114 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2785 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4233 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3469 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/baconnier/Napoleon_24B_V0.0/d5e46a11-3e81-457d-9d26-9fd17f96f076.json b/data/hfopenllm_v2/baconnier/Napoleon_24B_V0.0/d5e46a11-3e81-457d-9d26-9fd17f96f076.json deleted file mode 100644 index e7ee5d6e9..000000000 --- a/data/hfopenllm_v2/baconnier/Napoleon_24B_V0.0/d5e46a11-3e81-457d-9d26-9fd17f96f076.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/baconnier_Napoleon_24B_V0.0/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Napoleon_24B_V0.0", - "id": "baconnier/Napoleon_24B_V0.0", - "developer": "baconnier", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 23.572 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1801 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6367 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2273 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3792 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.442 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.504 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/baconnier/Napoleon_24B_V0.2/b3abfbc1-911a-43b7-a338-efb25f746f9d.json b/data/hfopenllm_v2/baconnier/Napoleon_24B_V0.2/b3abfbc1-911a-43b7-a338-efb25f746f9d.json deleted file mode 100644 index d0b53963a..000000000 --- a/data/hfopenllm_v2/baconnier/Napoleon_24B_V0.2/b3abfbc1-911a-43b7-a338-efb25f746f9d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/baconnier_Napoleon_24B_V0.2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Napoleon_24B_V0.2", - "id": "baconnier/Napoleon_24B_V0.2", - "developer": "baconnier", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 23.572 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2527 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5911 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - 
"dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1435 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3381 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.446 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4357 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/baebee/7B-Cetacea/6b471ee0-9444-45ff-92cf-da624aa59bf6.json b/data/hfopenllm_v2/baebee/7B-Cetacea/6b471ee0-9444-45ff-92cf-da624aa59bf6.json deleted file mode 100644 index a0e865f81..000000000 --- a/data/hfopenllm_v2/baebee/7B-Cetacea/6b471ee0-9444-45ff-92cf-da624aa59bf6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/baebee_7B-Cetacea/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "7B-Cetacea", - "id": "baebee/7B-Cetacea", - "developer": "baebee", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5279 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4757 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": 
{ - "score": 0.0468 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2861 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4136 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2955 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/baebee/mergekit-model_stock-nzjnheg/b56bd924-0a63-4ca2-8f2f-97b581e47a36.json b/data/hfopenllm_v2/baebee/mergekit-model_stock-nzjnheg/b56bd924-0a63-4ca2-8f2f-97b581e47a36.json deleted file mode 100644 index bee951289..000000000 --- a/data/hfopenllm_v2/baebee/mergekit-model_stock-nzjnheg/b56bd924-0a63-4ca2-8f2f-97b581e47a36.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/baebee_mergekit-model_stock-nzjnheg/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "mergekit-model_stock-nzjnheg", - "id": "baebee/mergekit-model_stock-nzjnheg", - "developer": "baebee", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4844 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5287 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1677 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2802 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3847 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3699 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/baebee/mergekit-ties-fnjenli/bfe9098d-7207-4f8c-9a3f-549a29303b5f.json b/data/hfopenllm_v2/baebee/mergekit-ties-fnjenli/bfe9098d-7207-4f8c-9a3f-549a29303b5f.json deleted file mode 100644 index 73adf9632..000000000 --- a/data/hfopenllm_v2/baebee/mergekit-ties-fnjenli/bfe9098d-7207-4f8c-9a3f-549a29303b5f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/baebee_mergekit-ties-fnjenli/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "mergekit-ties-fnjenli", - "id": "baebee/mergekit-ties-fnjenli", - "developer": "baebee", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1988 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3024 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0023 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.245 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - 
"dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4019 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1129 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bamec66557/MISCHIEVOUS-12B-Mix_0.1v/7856172d-ec3e-4e71-befe-54952478e330.json b/data/hfopenllm_v2/bamec66557/MISCHIEVOUS-12B-Mix_0.1v/7856172d-ec3e-4e71-befe-54952478e330.json deleted file mode 100644 index 8956821d3..000000000 --- a/data/hfopenllm_v2/bamec66557/MISCHIEVOUS-12B-Mix_0.1v/7856172d-ec3e-4e71-befe-54952478e330.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bamec66557_MISCHIEVOUS-12B-Mix_0.1v/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MISCHIEVOUS-12B-Mix_0.1v", - "id": "bamec66557/MISCHIEVOUS-12B-Mix_0.1v", - "developer": "bamec66557", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3636 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5436 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1329 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.328 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4132 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3674 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bamec66557/MISCHIEVOUS-12B-Mix_0.2v/a68aada5-61bd-4a4c-a8e1-b9a2ace349df.json b/data/hfopenllm_v2/bamec66557/MISCHIEVOUS-12B-Mix_0.2v/a68aada5-61bd-4a4c-a8e1-b9a2ace349df.json deleted file mode 100644 index 4b040ddba..000000000 --- a/data/hfopenllm_v2/bamec66557/MISCHIEVOUS-12B-Mix_0.2v/a68aada5-61bd-4a4c-a8e1-b9a2ace349df.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bamec66557_MISCHIEVOUS-12B-Mix_0.2v/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MISCHIEVOUS-12B-Mix_0.2v", - "id": "bamec66557/MISCHIEVOUS-12B-Mix_0.2v", - "developer": "bamec66557", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3624 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5434 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1261 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3255 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4158 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": 
"hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3663 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bamec66557/MISCHIEVOUS-12B-Mix_0.3v/9d19c44f-4912-4c95-ab3f-2dddb055d932.json b/data/hfopenllm_v2/bamec66557/MISCHIEVOUS-12B-Mix_0.3v/9d19c44f-4912-4c95-ab3f-2dddb055d932.json deleted file mode 100644 index f38472040..000000000 --- a/data/hfopenllm_v2/bamec66557/MISCHIEVOUS-12B-Mix_0.3v/9d19c44f-4912-4c95-ab3f-2dddb055d932.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bamec66557_MISCHIEVOUS-12B-Mix_0.3v/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MISCHIEVOUS-12B-Mix_0.3v", - "id": "bamec66557/MISCHIEVOUS-12B-Mix_0.3v", - "developer": "bamec66557", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.387 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5431 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1337 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3196 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4131 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.3664 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bamec66557/MISCHIEVOUS-12B-Mix_0.4v/6cef3550-27d7-4073-b4bb-0f19a2c5f553.json b/data/hfopenllm_v2/bamec66557/MISCHIEVOUS-12B-Mix_0.4v/6cef3550-27d7-4073-b4bb-0f19a2c5f553.json deleted file mode 100644 index ac3a04ead..000000000 --- a/data/hfopenllm_v2/bamec66557/MISCHIEVOUS-12B-Mix_0.4v/6cef3550-27d7-4073-b4bb-0f19a2c5f553.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bamec66557_MISCHIEVOUS-12B-Mix_0.4v/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MISCHIEVOUS-12B-Mix_0.4v", - "id": "bamec66557/MISCHIEVOUS-12B-Mix_0.4v", - "developer": "bamec66557", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6508 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5094 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1352 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3171 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4176 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3683 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/bamec66557/MISCHIEVOUS-12B-Mix_0.5v/08ab8f6a-9aaf-4ab4-ada3-eb4a75f46995.json b/data/hfopenllm_v2/bamec66557/MISCHIEVOUS-12B-Mix_0.5v/08ab8f6a-9aaf-4ab4-ada3-eb4a75f46995.json deleted file mode 100644 index 95a75b4b9..000000000 --- a/data/hfopenllm_v2/bamec66557/MISCHIEVOUS-12B-Mix_0.5v/08ab8f6a-9aaf-4ab4-ada3-eb4a75f46995.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bamec66557_MISCHIEVOUS-12B-Mix_0.5v/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MISCHIEVOUS-12B-Mix_0.5v", - "id": "bamec66557/MISCHIEVOUS-12B-Mix_0.5v", - "developer": "bamec66557", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3746 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5422 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1367 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3205 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4132 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3661 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bamec66557/MISCHIEVOUS-12B-Mix_0.6v/622f9379-6a30-43ba-a7a8-fbd08c484fa5.json b/data/hfopenllm_v2/bamec66557/MISCHIEVOUS-12B-Mix_0.6v/622f9379-6a30-43ba-a7a8-fbd08c484fa5.json deleted file 
mode 100644 index a048d396f..000000000 --- a/data/hfopenllm_v2/bamec66557/MISCHIEVOUS-12B-Mix_0.6v/622f9379-6a30-43ba-a7a8-fbd08c484fa5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bamec66557_MISCHIEVOUS-12B-Mix_0.6v/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MISCHIEVOUS-12B-Mix_0.6v", - "id": "bamec66557/MISCHIEVOUS-12B-Mix_0.6v", - "developer": "bamec66557", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4366 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5449 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1254 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.328 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4185 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3662 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bamec66557/MISCHIEVOUS-12B-Mix_III_IV_V/24f728e6-de5e-44cc-8b6d-51e0065c1475.json b/data/hfopenllm_v2/bamec66557/MISCHIEVOUS-12B-Mix_III_IV_V/24f728e6-de5e-44cc-8b6d-51e0065c1475.json deleted file mode 100644 index 5db8c43d5..000000000 --- a/data/hfopenllm_v2/bamec66557/MISCHIEVOUS-12B-Mix_III_IV_V/24f728e6-de5e-44cc-8b6d-51e0065c1475.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": 
"0.2.0", - "evaluation_id": "hfopenllm_v2/bamec66557_MISCHIEVOUS-12B-Mix_III_IV_V/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MISCHIEVOUS-12B-Mix_III_IV_V", - "id": "bamec66557/MISCHIEVOUS-12B-Mix_III_IV_V", - "developer": "bamec66557", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4031 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5465 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1292 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3205 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4198 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3664 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bamec66557/MISCHIEVOUS-12B-Mix_III_ex_V/c3b2bf18-d355-40fc-a862-376c1b988305.json b/data/hfopenllm_v2/bamec66557/MISCHIEVOUS-12B-Mix_III_ex_V/c3b2bf18-d355-40fc-a862-376c1b988305.json deleted file mode 100644 index 33f7812de..000000000 --- a/data/hfopenllm_v2/bamec66557/MISCHIEVOUS-12B-Mix_III_ex_V/c3b2bf18-d355-40fc-a862-376c1b988305.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bamec66557_MISCHIEVOUS-12B-Mix_III_ex_V/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - 
"source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MISCHIEVOUS-12B-Mix_III_ex_V", - "id": "bamec66557/MISCHIEVOUS-12B-Mix_III_ex_V", - "developer": "bamec66557", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4316 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5449 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1322 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3205 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4198 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3649 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bamec66557/MISCHIEVOUS-12B-Mix_Neo/79474be5-2587-4087-a2cc-1337e3b696dd.json b/data/hfopenllm_v2/bamec66557/MISCHIEVOUS-12B-Mix_Neo/79474be5-2587-4087-a2cc-1337e3b696dd.json deleted file mode 100644 index ab7a30fec..000000000 --- a/data/hfopenllm_v2/bamec66557/MISCHIEVOUS-12B-Mix_Neo/79474be5-2587-4087-a2cc-1337e3b696dd.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bamec66557_MISCHIEVOUS-12B-Mix_Neo/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": 
"MISCHIEVOUS-12B-Mix_Neo", - "id": "bamec66557/MISCHIEVOUS-12B-Mix_Neo", - "developer": "bamec66557", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.625 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5078 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1329 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3163 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.415 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3685 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bamec66557/MISCHIEVOUS-12B/22ff2700-70c0-459e-96a2-0ce1710947bc.json b/data/hfopenllm_v2/bamec66557/MISCHIEVOUS-12B/22ff2700-70c0-459e-96a2-0ce1710947bc.json deleted file mode 100644 index 2f6482099..000000000 --- a/data/hfopenllm_v2/bamec66557/MISCHIEVOUS-12B/22ff2700-70c0-459e-96a2-0ce1710947bc.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bamec66557_MISCHIEVOUS-12B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MISCHIEVOUS-12B", - "id": "bamec66557/MISCHIEVOUS-12B", - "developer": "bamec66557", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - 
}, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3852 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5405 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1276 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3205 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4145 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3672 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bamec66557/Mistral-Nemo-VICIOUS_MESH-12B-2407/7d3a47a3-83d3-4f51-ab72-6a2fa5b5ef80.json b/data/hfopenllm_v2/bamec66557/Mistral-Nemo-VICIOUS_MESH-12B-2407/7d3a47a3-83d3-4f51-ab72-6a2fa5b5ef80.json deleted file mode 100644 index d39a7d3db..000000000 --- a/data/hfopenllm_v2/bamec66557/Mistral-Nemo-VICIOUS_MESH-12B-2407/7d3a47a3-83d3-4f51-ab72-6a2fa5b5ef80.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bamec66557_Mistral-Nemo-VICIOUS_MESH-12B-2407/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral-Nemo-VICIOUS_MESH-12B-2407", - "id": "bamec66557/Mistral-Nemo-VICIOUS_MESH-12B-2407", - "developer": "bamec66557", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - 
"hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6706 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5156 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1367 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3154 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.431 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3677 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bamec66557/NameLess-12B-prob/69dc0f8e-16d7-4907-9741-484eafa62b8c.json b/data/hfopenllm_v2/bamec66557/NameLess-12B-prob/69dc0f8e-16d7-4907-9741-484eafa62b8c.json deleted file mode 100644 index e2704ecca..000000000 --- a/data/hfopenllm_v2/bamec66557/NameLess-12B-prob/69dc0f8e-16d7-4907-9741-484eafa62b8c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bamec66557_NameLess-12B-prob/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "NameLess-12B-prob", - "id": "bamec66557/NameLess-12B-prob", - "developer": "bamec66557", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6602 - } - 
}, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5158 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1261 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3146 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4336 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3684 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bamec66557/VICIOUS_MESH-12B-0.1v/e516abc1-9c3c-4921-a385-e2533d45fed3.json b/data/hfopenllm_v2/bamec66557/VICIOUS_MESH-12B-0.1v/e516abc1-9c3c-4921-a385-e2533d45fed3.json deleted file mode 100644 index 6ba84e23e..000000000 --- a/data/hfopenllm_v2/bamec66557/VICIOUS_MESH-12B-0.1v/e516abc1-9c3c-4921-a385-e2533d45fed3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bamec66557_VICIOUS_MESH-12B-0.1v/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "VICIOUS_MESH-12B-0.1v", - "id": "bamec66557/VICIOUS_MESH-12B-0.1v", - "developer": "bamec66557", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 6.124 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3657 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5412 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1322 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3247 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4158 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3683 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bamec66557/VICIOUS_MESH-12B-0.X.ver/8baa5832-cc07-4a31-a815-0e8151426ea6.json b/data/hfopenllm_v2/bamec66557/VICIOUS_MESH-12B-0.X.ver/8baa5832-cc07-4a31-a815-0e8151426ea6.json deleted file mode 100644 index cd846c308..000000000 --- a/data/hfopenllm_v2/bamec66557/VICIOUS_MESH-12B-0.X.ver/8baa5832-cc07-4a31-a815-0e8151426ea6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bamec66557_VICIOUS_MESH-12B-0.X.ver/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "VICIOUS_MESH-12B-0.X.ver", - "id": "bamec66557/VICIOUS_MESH-12B-0.X.ver", - "developer": "bamec66557", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 6.124 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3776 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5416 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - 
"dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1201 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3213 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4198 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3671 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bamec66557/VICIOUS_MESH-12B-ALPHA/509fbca4-f405-4c27-85a9-1eea59025070.json b/data/hfopenllm_v2/bamec66557/VICIOUS_MESH-12B-ALPHA/509fbca4-f405-4c27-85a9-1eea59025070.json deleted file mode 100644 index 98b8dfc2e..000000000 --- a/data/hfopenllm_v2/bamec66557/VICIOUS_MESH-12B-ALPHA/509fbca4-f405-4c27-85a9-1eea59025070.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bamec66557_VICIOUS_MESH-12B-ALPHA/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "VICIOUS_MESH-12B-ALPHA", - "id": "bamec66557/VICIOUS_MESH-12B-ALPHA", - "developer": "bamec66557", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6365 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5094 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": 
false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1367 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3138 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4203 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3697 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bamec66557/VICIOUS_MESH-12B-BETA/6f45ed56-6bec-4439-9adb-e79fcd74667c.json b/data/hfopenllm_v2/bamec66557/VICIOUS_MESH-12B-BETA/6f45ed56-6bec-4439-9adb-e79fcd74667c.json deleted file mode 100644 index 5e63ad14b..000000000 --- a/data/hfopenllm_v2/bamec66557/VICIOUS_MESH-12B-BETA/6f45ed56-6bec-4439-9adb-e79fcd74667c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bamec66557_VICIOUS_MESH-12B-BETA/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "VICIOUS_MESH-12B-BETA", - "id": "bamec66557/VICIOUS_MESH-12B-BETA", - "developer": "bamec66557", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6721 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5156 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1329 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": 
"hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3163 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.431 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3679 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bamec66557/VICIOUS_MESH-12B-DELTA/512ff924-c1d3-4d75-a468-2bcdcda25cf6.json b/data/hfopenllm_v2/bamec66557/VICIOUS_MESH-12B-DELTA/512ff924-c1d3-4d75-a468-2bcdcda25cf6.json deleted file mode 100644 index d55e617b1..000000000 --- a/data/hfopenllm_v2/bamec66557/VICIOUS_MESH-12B-DELTA/512ff924-c1d3-4d75-a468-2bcdcda25cf6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bamec66557_VICIOUS_MESH-12B-DELTA/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "VICIOUS_MESH-12B-DELTA", - "id": "bamec66557/VICIOUS_MESH-12B-DELTA", - "developer": "bamec66557", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 6.124 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6469 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5055 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1375 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.3121 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4057 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3651 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bamec66557/VICIOUS_MESH-12B-DIGAMMA/86b561ae-c4d3-4293-a884-bcab26df026d.json b/data/hfopenllm_v2/bamec66557/VICIOUS_MESH-12B-DIGAMMA/86b561ae-c4d3-4293-a884-bcab26df026d.json deleted file mode 100644 index 33dd088e9..000000000 --- a/data/hfopenllm_v2/bamec66557/VICIOUS_MESH-12B-DIGAMMA/86b561ae-c4d3-4293-a884-bcab26df026d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bamec66557_VICIOUS_MESH-12B-DIGAMMA/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "VICIOUS_MESH-12B-DIGAMMA", - "id": "bamec66557/VICIOUS_MESH-12B-DIGAMMA", - "developer": "bamec66557", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 6.124 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6429 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5061 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1337 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3129 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { 
- "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4097 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3659 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bamec66557/VICIOUS_MESH-12B-EPSILON/516d1972-9731-4234-a4b3-b96423ebba5c.json b/data/hfopenllm_v2/bamec66557/VICIOUS_MESH-12B-EPSILON/516d1972-9731-4234-a4b3-b96423ebba5c.json deleted file mode 100644 index e7811c1a1..000000000 --- a/data/hfopenllm_v2/bamec66557/VICIOUS_MESH-12B-EPSILON/516d1972-9731-4234-a4b3-b96423ebba5c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bamec66557_VICIOUS_MESH-12B-EPSILON/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "VICIOUS_MESH-12B-EPSILON", - "id": "bamec66557/VICIOUS_MESH-12B-EPSILON", - "developer": "bamec66557", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 6.124 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6305 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5038 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1261 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3146 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.407 - } - }, - { - 
"evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3648 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bamec66557/VICIOUS_MESH-12B-GAMMA/274f6e02-c81f-4f2e-9747-e5de5cee1933.json b/data/hfopenllm_v2/bamec66557/VICIOUS_MESH-12B-GAMMA/274f6e02-c81f-4f2e-9747-e5de5cee1933.json deleted file mode 100644 index 1e3744cbe..000000000 --- a/data/hfopenllm_v2/bamec66557/VICIOUS_MESH-12B-GAMMA/274f6e02-c81f-4f2e-9747-e5de5cee1933.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bamec66557_VICIOUS_MESH-12B-GAMMA/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "VICIOUS_MESH-12B-GAMMA", - "id": "bamec66557/VICIOUS_MESH-12B-GAMMA", - "developer": "bamec66557", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6362 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5182 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1307 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3138 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4363 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on 
MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3666 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bamec66557/VICIOUS_MESH-12B-NEMO/61638b55-296b-40fd-a39f-cc2276d9f94a.json b/data/hfopenllm_v2/bamec66557/VICIOUS_MESH-12B-NEMO/61638b55-296b-40fd-a39f-cc2276d9f94a.json deleted file mode 100644 index 07670d514..000000000 --- a/data/hfopenllm_v2/bamec66557/VICIOUS_MESH-12B-NEMO/61638b55-296b-40fd-a39f-cc2276d9f94a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bamec66557_VICIOUS_MESH-12B-NEMO/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "VICIOUS_MESH-12B-NEMO", - "id": "bamec66557/VICIOUS_MESH-12B-NEMO", - "developer": "bamec66557", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4022 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5442 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1269 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3238 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4251 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3716 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/bamec66557/VICIOUS_MESH-12B-OMEGA/11c1b6fe-4815-415b-a4a8-d14073df6ee1.json b/data/hfopenllm_v2/bamec66557/VICIOUS_MESH-12B-OMEGA/11c1b6fe-4815-415b-a4a8-d14073df6ee1.json deleted file mode 100644 index 6ef877a1b..000000000 --- a/data/hfopenllm_v2/bamec66557/VICIOUS_MESH-12B-OMEGA/11c1b6fe-4815-415b-a4a8-d14073df6ee1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bamec66557_VICIOUS_MESH-12B-OMEGA/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "VICIOUS_MESH-12B-OMEGA", - "id": "bamec66557/VICIOUS_MESH-12B-OMEGA", - "developer": "bamec66557", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.67 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5166 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1344 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3154 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4323 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3677 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bamec66557/VICIOUS_MESH-12B-UNION/88e2cb24-288e-4f37-8753-f0daa825051c.json b/data/hfopenllm_v2/bamec66557/VICIOUS_MESH-12B-UNION/88e2cb24-288e-4f37-8753-f0daa825051c.json deleted file mode 100644 index 
149e4a3b1..000000000 --- a/data/hfopenllm_v2/bamec66557/VICIOUS_MESH-12B-UNION/88e2cb24-288e-4f37-8753-f0daa825051c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bamec66557_VICIOUS_MESH-12B-UNION/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "VICIOUS_MESH-12B-UNION", - "id": "bamec66557/VICIOUS_MESH-12B-UNION", - "developer": "bamec66557", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 6.124 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6429 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5107 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.139 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3121 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4257 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3672 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bamec66557/VICIOUS_MESH-12B/8a1a6c44-17fd-402e-a22e-e795a1f612e3.json b/data/hfopenllm_v2/bamec66557/VICIOUS_MESH-12B/8a1a6c44-17fd-402e-a22e-e795a1f612e3.json deleted file mode 100644 index d0b02080e..000000000 --- a/data/hfopenllm_v2/bamec66557/VICIOUS_MESH-12B/8a1a6c44-17fd-402e-a22e-e795a1f612e3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/bamec66557_VICIOUS_MESH-12B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "VICIOUS_MESH-12B", - "id": "bamec66557/VICIOUS_MESH-12B", - "developer": "bamec66557", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 6.124 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3716 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5436 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1344 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.328 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4105 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3679 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bamec66557/VICIOUS_MESH-12B_Razor/1121af0b-61fe-424a-bc66-3164bcb1d833.json b/data/hfopenllm_v2/bamec66557/VICIOUS_MESH-12B_Razor/1121af0b-61fe-424a-bc66-3164bcb1d833.json deleted file mode 100644 index c043f1c67..000000000 --- a/data/hfopenllm_v2/bamec66557/VICIOUS_MESH-12B_Razor/1121af0b-61fe-424a-bc66-3164bcb1d833.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bamec66557_VICIOUS_MESH-12B_Razor/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": 
"Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "VICIOUS_MESH-12B_Razor", - "id": "bamec66557/VICIOUS_MESH-12B_Razor", - "developer": "bamec66557", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 6.124 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3736 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5447 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1299 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.323 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4092 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3669 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bamec66557/mergekit-model_stock-zdaysvi/35300d67-7ee1-4874-b351-87f46267cec9.json b/data/hfopenllm_v2/bamec66557/mergekit-model_stock-zdaysvi/35300d67-7ee1-4874-b351-87f46267cec9.json deleted file mode 100644 index ddc69cf5a..000000000 --- a/data/hfopenllm_v2/bamec66557/mergekit-model_stock-zdaysvi/35300d67-7ee1-4874-b351-87f46267cec9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bamec66557_mergekit-model_stock-zdaysvi/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "mergekit-model_stock-zdaysvi", - "id": "bamec66557/mergekit-model_stock-zdaysvi", - "developer": 
"bamec66557", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 6.124 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6426 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5063 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1352 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3138 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4124 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3688 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bamec66557/mergekit-ties-sinbkow/6180b7b3-4b21-42aa-a62d-084a91568b43.json b/data/hfopenllm_v2/bamec66557/mergekit-ties-sinbkow/6180b7b3-4b21-42aa-a62d-084a91568b43.json deleted file mode 100644 index f59ec5045..000000000 --- a/data/hfopenllm_v2/bamec66557/mergekit-ties-sinbkow/6180b7b3-4b21-42aa-a62d-084a91568b43.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bamec66557_mergekit-ties-sinbkow/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "mergekit-ties-sinbkow", - "id": "bamec66557/mergekit-ties-sinbkow", - "developer": "bamec66557", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 6.124 - } - }, - "evaluation_results": [ - { - 
"evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6432 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5092 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.145 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3196 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4045 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3603 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/belztjti/dffghgjh/7414d344-0e67-424a-9e16-00de0487ce02.json b/data/hfopenllm_v2/belztjti/dffghgjh/7414d344-0e67-424a-9e16-00de0487ce02.json deleted file mode 100644 index 007ff11a3..000000000 --- a/data/hfopenllm_v2/belztjti/dffghgjh/7414d344-0e67-424a-9e16-00de0487ce02.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/belztjti_dffghgjh/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "dffghgjh", - "id": "belztjti/dffghgjh", - "developer": "belztjti", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "GlmForCausalLM", - "params_billions": 9.543 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.5784 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3582 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0234 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2634 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3475 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3422 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/belztjti/dtfgv/f5fcd407-080c-4cb7-a299-7a7f919c734d.json b/data/hfopenllm_v2/belztjti/dtfgv/f5fcd407-080c-4cb7-a299-7a7f919c734d.json deleted file mode 100644 index 05d2c4ee6..000000000 --- a/data/hfopenllm_v2/belztjti/dtfgv/f5fcd407-080c-4cb7-a299-7a7f919c734d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/belztjti_dtfgv/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "dtfgv", - "id": "belztjti/dtfgv", - "developer": "belztjti", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 9.543 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3345 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.3282 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0181 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2693 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3794 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1504 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/benhaotang/phi4-qwq-sky-t1/efe03731-6021-4dcf-b7fe-24cbf2d60fac.json b/data/hfopenllm_v2/benhaotang/phi4-qwq-sky-t1/efe03731-6021-4dcf-b7fe-24cbf2d60fac.json deleted file mode 100644 index 4a2fac32d..000000000 --- a/data/hfopenllm_v2/benhaotang/phi4-qwq-sky-t1/efe03731-6021-4dcf-b7fe-24cbf2d60fac.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/benhaotang_phi4-qwq-sky-t1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "phi4-qwq-sky-t1", - "id": "benhaotang/phi4-qwq-sky-t1", - "developer": "benhaotang", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.046 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6711 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - 
"metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4101 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3951 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.49 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5244 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/beomi/gemma-mling-7b/6ffed624-cc22-4b62-a447-3c02b0e43ded.json b/data/hfopenllm_v2/beomi/gemma-mling-7b/6ffed624-cc22-4b62-a447-3c02b0e43ded.json deleted file mode 100644 index dda19f01b..000000000 --- a/data/hfopenllm_v2/beomi/gemma-mling-7b/6ffed624-cc22-4b62-a447-3c02b0e43ded.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/beomi_gemma-mling-7b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "gemma-mling-7b", - "id": "beomi/gemma-mling-7b", - "developer": "beomi", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "GemmaForCausalLM", - "params_billions": 8.538 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2029 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4068 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0544 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": 
"GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.25 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3759 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2633 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/beowolx/CodeNinja-1.0-OpenChat-7B/ed867fa8-be8a-49b0-8c94-38085808b58b.json b/data/hfopenllm_v2/beowolx/CodeNinja-1.0-OpenChat-7B/ed867fa8-be8a-49b0-8c94-38085808b58b.json deleted file mode 100644 index 4c755301d..000000000 --- a/data/hfopenllm_v2/beowolx/CodeNinja-1.0-OpenChat-7B/ed867fa8-be8a-49b0-8c94-38085808b58b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/beowolx_CodeNinja-1.0-OpenChat-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "CodeNinja-1.0-OpenChat-7B", - "id": "beowolx/CodeNinja-1.0-OpenChat-7B", - "developer": "beowolx", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5447 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4441 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0672 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.2945 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4243 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3015 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/berkeley-nest/Starling-LM-7B-alpha/c8b9a56b-0933-4085-8d5f-a1d8294699db.json b/data/hfopenllm_v2/berkeley-nest/Starling-LM-7B-alpha/c8b9a56b-0933-4085-8d5f-a1d8294699db.json deleted file mode 100644 index 59ff6b3cf..000000000 --- a/data/hfopenllm_v2/berkeley-nest/Starling-LM-7B-alpha/c8b9a56b-0933-4085-8d5f-a1d8294699db.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/berkeley-nest_Starling-LM-7B-alpha/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Starling-LM-7B-alpha", - "id": "berkeley-nest/Starling-LM-7B-alpha", - "developer": "berkeley-nest", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.548 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.444 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0838 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.297 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.412 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3172 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bfuzzy1/Gunny/9b178661-ed9a-427d-b93c-b905b8089ad8.json b/data/hfopenllm_v2/bfuzzy1/Gunny/9b178661-ed9a-427d-b93c-b905b8089ad8.json deleted file mode 100644 index c2995af50..000000000 --- a/data/hfopenllm_v2/bfuzzy1/Gunny/9b178661-ed9a-427d-b93c-b905b8089ad8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bfuzzy1_Gunny/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gunny", - "id": "bfuzzy1/Gunny", - "developer": "bfuzzy1", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7129 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4546 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.173 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2785 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3583 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": 
"TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3039 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bfuzzy1/acheron-c/69588e07-7559-49c2-9423-19fd143e42f7.json b/data/hfopenllm_v2/bfuzzy1/acheron-c/69588e07-7559-49c2-9423-19fd143e42f7.json deleted file mode 100644 index ca1f71360..000000000 --- a/data/hfopenllm_v2/bfuzzy1/acheron-c/69588e07-7559-49c2-9423-19fd143e42f7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bfuzzy1_acheron-c/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "acheron-c", - "id": "bfuzzy1/acheron-c", - "developer": "bfuzzy1", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 0.514 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1929 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3026 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.003 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2475 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3382 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1172 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/bfuzzy1/acheron-d/317589da-d673-4f90-93e9-59983f2ef54b.json b/data/hfopenllm_v2/bfuzzy1/acheron-d/317589da-d673-4f90-93e9-59983f2ef54b.json deleted file mode 100644 index e74d10763..000000000 --- a/data/hfopenllm_v2/bfuzzy1/acheron-d/317589da-d673-4f90-93e9-59983f2ef54b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bfuzzy1_acheron-d/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "acheron-d", - "id": "bfuzzy1/acheron-d", - "developer": "bfuzzy1", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 0.514 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1925 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.314 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0151 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2366 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3497 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1134 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bfuzzy1/acheron-m/efab322e-ea15-4fe7-9bfc-15246003e59c.json b/data/hfopenllm_v2/bfuzzy1/acheron-m/efab322e-ea15-4fe7-9bfc-15246003e59c.json deleted file mode 100644 index 21c3ead2f..000000000 --- a/data/hfopenllm_v2/bfuzzy1/acheron-m/efab322e-ea15-4fe7-9bfc-15246003e59c.json +++ /dev/null @@ -1,132 
+0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bfuzzy1_acheron-m/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "acheron-m", - "id": "bfuzzy1/acheron-m", - "developer": "bfuzzy1", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 0.514 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1758 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2928 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0091 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2601 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3487 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1113 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bfuzzy1/acheron-m1a-llama/b1eac68e-b292-414b-9594-c921f8e10818.json b/data/hfopenllm_v2/bfuzzy1/acheron-m1a-llama/b1eac68e-b292-414b-9594-c921f8e10818.json deleted file mode 100644 index a0d0cda28..000000000 --- a/data/hfopenllm_v2/bfuzzy1/acheron-m1a-llama/b1eac68e-b292-414b-9594-c921f8e10818.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bfuzzy1_acheron-m1a-llama/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging 
Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "acheron-m1a-llama", - "id": "bfuzzy1/acheron-m1a-llama", - "developer": "bfuzzy1", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 0.514 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1125 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2956 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0076 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2601 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3633 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1146 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bfuzzy1/acheron/b7d08c65-8219-4067-9504-99e438a86038.json b/data/hfopenllm_v2/bfuzzy1/acheron/b7d08c65-8219-4067-9504-99e438a86038.json deleted file mode 100644 index 599c0add4..000000000 --- a/data/hfopenllm_v2/bfuzzy1/acheron/b7d08c65-8219-4067-9504-99e438a86038.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bfuzzy1_acheron/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "acheron", - "id": "bfuzzy1/acheron", - "developer": "bfuzzy1", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 0.514 - } - }, - 
"evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1983 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3108 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0166 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2391 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3511 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1096 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bfuzzy1/llambses-1/e9c5b479-0dce-4de3-84d6-90c7515337f1.json b/data/hfopenllm_v2/bfuzzy1/llambses-1/e9c5b479-0dce-4de3-84d6-90c7515337f1.json deleted file mode 100644 index f8f06d954..000000000 --- a/data/hfopenllm_v2/bfuzzy1/llambses-1/e9c5b479-0dce-4de3-84d6-90c7515337f1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bfuzzy1_llambses-1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "llambses-1", - "id": "bfuzzy1/llambses-1", - "developer": "bfuzzy1", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3554 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5047 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0687 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2978 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4529 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.314 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bhuvneshsaini/merged_model/3c766465-29db-4b3d-b42f-a3222b38a096.json b/data/hfopenllm_v2/bhuvneshsaini/merged_model/3c766465-29db-4b3d-b42f-a3222b38a096.json deleted file mode 100644 index 477afd11a..000000000 --- a/data/hfopenllm_v2/bhuvneshsaini/merged_model/3c766465-29db-4b3d-b42f-a3222b38a096.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bhuvneshsaini_merged_model/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "merged_model", - "id": "bhuvneshsaini/merged_model", - "developer": "bhuvneshsaini", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 4.715 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1813 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.336 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.25 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3497 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1445 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bigcode/starcoder2-15b/e6c85677-61ed-475b-85a5-48b91ec76bcf.json b/data/hfopenllm_v2/bigcode/starcoder2-15b/e6c85677-61ed-475b-85a5-48b91ec76bcf.json deleted file mode 100644 index 9a0162bcf..000000000 --- a/data/hfopenllm_v2/bigcode/starcoder2-15b/e6c85677-61ed-475b-85a5-48b91ec76bcf.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bigcode_starcoder2-15b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "starcoder2-15b", - "id": "bigcode/starcoder2-15b", - "developer": "bigcode", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Starcoder2ForCausalLM", - "params_billions": 15.958 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.278 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4448 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - 
"source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0597 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2735 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3501 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2353 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bigcode/starcoder2-3b/7b68fa5e-dbbf-4542-8767-6874aabf8f40.json b/data/hfopenllm_v2/bigcode/starcoder2-3b/7b68fa5e-dbbf-4542-8767-6874aabf8f40.json deleted file mode 100644 index 4f9f33d8f..000000000 --- a/data/hfopenllm_v2/bigcode/starcoder2-3b/7b68fa5e-dbbf-4542-8767-6874aabf8f40.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bigcode_starcoder2-3b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "starcoder2-3b", - "id": "bigcode/starcoder2-3b", - "developer": "bigcode", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Starcoder2ForCausalLM", - "params_billions": 3.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2037 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3509 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.0151 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2441 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3435 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1636 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bigcode/starcoder2-7b/c103b7f4-a432-42d6-86ef-cb369e0c16ff.json b/data/hfopenllm_v2/bigcode/starcoder2-7b/c103b7f4-a432-42d6-86ef-cb369e0c16ff.json deleted file mode 100644 index ba7f43161..000000000 --- a/data/hfopenllm_v2/bigcode/starcoder2-7b/c103b7f4-a432-42d6-86ef-cb369e0c16ff.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bigcode_starcoder2-7b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "starcoder2-7b", - "id": "bigcode/starcoder2-7b", - "developer": "bigcode", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Starcoder2ForCausalLM", - "params_billions": 7.174 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2209 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3661 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.031 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2517 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3793 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1642 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bigscience/bloom-1b1/643dda41-37d0-4c1e-b856-58b774612886.json b/data/hfopenllm_v2/bigscience/bloom-1b1/643dda41-37d0-4c1e-b856-58b774612886.json deleted file mode 100644 index 02b4a4909..000000000 --- a/data/hfopenllm_v2/bigscience/bloom-1b1/643dda41-37d0-4c1e-b856-58b774612886.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bigscience_bloom-1b1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "bloom-1b1", - "id": "bigscience/bloom-1b1", - "developer": "bigscience", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "BloomForCausalLM", - "params_billions": 1.065 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1373 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3107 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0053 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2592 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": 
"Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.37 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1108 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bigscience/bloom-1b7/ba2f284b-d7c6-4748-a8dc-4f80caa30c6c.json b/data/hfopenllm_v2/bigscience/bloom-1b7/ba2f284b-d7c6-4748-a8dc-4f80caa30c6c.json deleted file mode 100644 index e53021d83..000000000 --- a/data/hfopenllm_v2/bigscience/bloom-1b7/ba2f284b-d7c6-4748-a8dc-4f80caa30c6c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bigscience_bloom-1b7/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "bloom-1b7", - "id": "bigscience/bloom-1b7", - "developer": "bigscience", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "BloomForCausalLM", - "params_billions": 1.722 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1044 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3141 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0053 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2584 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3886 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": 
"TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1086 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bigscience/bloom-3b/16e30aa0-736a-4ef8-8ba6-78285b84546f.json b/data/hfopenllm_v2/bigscience/bloom-3b/16e30aa0-736a-4ef8-8ba6-78285b84546f.json deleted file mode 100644 index 0929bf77c..000000000 --- a/data/hfopenllm_v2/bigscience/bloom-3b/16e30aa0-736a-4ef8-8ba6-78285b84546f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bigscience_bloom-3b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "bloom-3b", - "id": "bigscience/bloom-3b", - "developer": "bigscience", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "BloomForCausalLM", - "params_billions": 3.003 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1271 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3063 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0083 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2399 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3981 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1133 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/bigscience/bloom-560m/73eb729d-adfd-4dee-9bde-04a31f5528f6.json b/data/hfopenllm_v2/bigscience/bloom-560m/73eb729d-adfd-4dee-9bde-04a31f5528f6.json deleted file mode 100644 index 2294ec54f..000000000 --- a/data/hfopenllm_v2/bigscience/bloom-560m/73eb729d-adfd-4dee-9bde-04a31f5528f6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bigscience_bloom-560m/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "bloom-560m", - "id": "bigscience/bloom-560m", - "developer": "bigscience", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "BloomForCausalLM", - "params_billions": 0.559 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.062 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3026 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0038 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2617 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4031 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1164 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bigscience/bloom-7b1/0daad2ae-92d0-4522-a067-20332f72c96f.json b/data/hfopenllm_v2/bigscience/bloom-7b1/0daad2ae-92d0-4522-a067-20332f72c96f.json deleted file mode 100644 index 2cfad20ec..000000000 --- 
a/data/hfopenllm_v2/bigscience/bloom-7b1/0daad2ae-92d0-4522-a067-20332f72c96f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bigscience_bloom-7b1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "bloom-7b1", - "id": "bigscience/bloom-7b1", - "developer": "bigscience", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "BloomForCausalLM", - "params_billions": 7.069 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1322 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3114 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0053 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2643 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3487 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1105 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bluuwhale/L3-SthenoMaid-8B-V1/a3e3849f-a289-4132-b4a8-f67d67ad46a1.json b/data/hfopenllm_v2/bluuwhale/L3-SthenoMaid-8B-V1/a3e3849f-a289-4132-b4a8-f67d67ad46a1.json deleted file mode 100644 index bc1d74cae..000000000 --- a/data/hfopenllm_v2/bluuwhale/L3-SthenoMaid-8B-V1/a3e3849f-a289-4132-b4a8-f67d67ad46a1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bluuwhale_L3-SthenoMaid-8B-V1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", 
- "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "L3-SthenoMaid-8B-V1", - "id": "bluuwhale/L3-SthenoMaid-8B-V1", - "developer": "bluuwhale", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7345 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5219 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.108 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2802 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3687 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3656 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bond005/meno-tiny-0.1/59a9ed26-a67a-4e76-8858-520400c90766.json b/data/hfopenllm_v2/bond005/meno-tiny-0.1/59a9ed26-a67a-4e76-8858-520400c90766.json deleted file mode 100644 index f9ab2bb8c..000000000 --- a/data/hfopenllm_v2/bond005/meno-tiny-0.1/59a9ed26-a67a-4e76-8858-520400c90766.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bond005_meno-tiny-0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "meno-tiny-0.1", - "id": "bond005/meno-tiny-0.1", - "developer": 
"bond005", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.544 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.455 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4263 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.139 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2819 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4185 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2786 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bosonai/Higgs-Llama-3-70B/6c5c61b4-8037-4b28-8616-1aefa7963eb8.json b/data/hfopenllm_v2/bosonai/Higgs-Llama-3-70B/6c5c61b4-8037-4b28-8616-1aefa7963eb8.json deleted file mode 100644 index 7f13e2ca9..000000000 --- a/data/hfopenllm_v2/bosonai/Higgs-Llama-3-70B/6c5c61b4-8037-4b28-8616-1aefa7963eb8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bosonai_Higgs-Llama-3-70B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Higgs-Llama-3-70B", - "id": "bosonai/Higgs-Llama-3-70B", - "developer": "bosonai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 70.554 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": 
"IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5561 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6258 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2523 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3666 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4471 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4902 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/braindao/DeepSeek-R1-Distill-Qwen-1.5B-Blunt/e9f9b836-fbdf-4996-9b35-2c8145a7f01b.json b/data/hfopenllm_v2/braindao/DeepSeek-R1-Distill-Qwen-1.5B-Blunt/e9f9b836-fbdf-4996-9b35-2c8145a7f01b.json deleted file mode 100644 index 4c23d4552..000000000 --- a/data/hfopenllm_v2/braindao/DeepSeek-R1-Distill-Qwen-1.5B-Blunt/e9f9b836-fbdf-4996-9b35-2c8145a7f01b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/braindao_DeepSeek-R1-Distill-Qwen-1.5B-Blunt/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "DeepSeek-R1-Distill-Qwen-1.5B-Blunt", - "id": "braindao/DeepSeek-R1-Distill-Qwen-1.5B-Blunt", - "developer": "braindao", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.777 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2611 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2774 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1382 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2475 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3595 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1184 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/braindao/DeepSeek-R1-Distill-Qwen-1.5B-Reflective/5b3dae43-5d5c-4d19-bd47-5c0f68ecbb81.json b/data/hfopenllm_v2/braindao/DeepSeek-R1-Distill-Qwen-1.5B-Reflective/5b3dae43-5d5c-4d19-bd47-5c0f68ecbb81.json deleted file mode 100644 index 7fd796153..000000000 --- a/data/hfopenllm_v2/braindao/DeepSeek-R1-Distill-Qwen-1.5B-Reflective/5b3dae43-5d5c-4d19-bd47-5c0f68ecbb81.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/braindao_DeepSeek-R1-Distill-Qwen-1.5B-Reflective/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "DeepSeek-R1-Distill-Qwen-1.5B-Reflective", - "id": "braindao/DeepSeek-R1-Distill-Qwen-1.5B-Reflective", - "developer": "braindao", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.777 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.3033 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2908 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1631 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2609 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3356 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.113 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/braindao/DeepSeek-R1-Distill-Qwen-14B-ABUB-ST/d5b31b1f-ace0-457f-bf8a-9041398b8344.json b/data/hfopenllm_v2/braindao/DeepSeek-R1-Distill-Qwen-14B-ABUB-ST/d5b31b1f-ace0-457f-bf8a-9041398b8344.json deleted file mode 100644 index 71e6c0585..000000000 --- a/data/hfopenllm_v2/braindao/DeepSeek-R1-Distill-Qwen-14B-ABUB-ST/d5b31b1f-ace0-457f-bf8a-9041398b8344.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/braindao_DeepSeek-R1-Distill-Qwen-14B-ABUB-ST/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "DeepSeek-R1-Distill-Qwen-14B-ABUB-ST", - "id": "braindao/DeepSeek-R1-Distill-Qwen-14B-ABUB-ST", - "developer": "braindao", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3752 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": 
"SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4927 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5015 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3448 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4221 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4243 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/braindao/DeepSeek-R1-Distill-Qwen-14B-Blunt-Uncensored-Blunt-Reflective/b34702cf-ffb8-4e75-9c9b-f5c52623d4c8.json b/data/hfopenllm_v2/braindao/DeepSeek-R1-Distill-Qwen-14B-Blunt-Uncensored-Blunt-Reflective/b34702cf-ffb8-4e75-9c9b-f5c52623d4c8.json deleted file mode 100644 index ce916802a..000000000 --- a/data/hfopenllm_v2/braindao/DeepSeek-R1-Distill-Qwen-14B-Blunt-Uncensored-Blunt-Reflective/b34702cf-ffb8-4e75-9c9b-f5c52623d4c8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/braindao_DeepSeek-R1-Distill-Qwen-14B-Blunt-Uncensored-Blunt-Reflective/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "DeepSeek-R1-Distill-Qwen-14B-Blunt-Uncensored-Blunt-Reflective", - "id": "braindao/DeepSeek-R1-Distill-Qwen-14B-Blunt-Uncensored-Blunt-Reflective", - "developer": "braindao", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.554 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - 
"hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3371 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2372 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2777 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4248 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1504 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/braindao/DeepSeek-R1-Distill-Qwen-14B-Blunt-Uncensored-Blunt/c701f1fd-166d-416b-8f78-edf17f2fecd4.json b/data/hfopenllm_v2/braindao/DeepSeek-R1-Distill-Qwen-14B-Blunt-Uncensored-Blunt/c701f1fd-166d-416b-8f78-edf17f2fecd4.json deleted file mode 100644 index d0206075b..000000000 --- a/data/hfopenllm_v2/braindao/DeepSeek-R1-Distill-Qwen-14B-Blunt-Uncensored-Blunt/c701f1fd-166d-416b-8f78-edf17f2fecd4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/braindao_DeepSeek-R1-Distill-Qwen-14B-Blunt-Uncensored-Blunt/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "DeepSeek-R1-Distill-Qwen-14B-Blunt-Uncensored-Blunt", - "id": "braindao/DeepSeek-R1-Distill-Qwen-14B-Blunt-Uncensored-Blunt", - "developer": "braindao", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5221 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3199 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2508 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2785 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4527 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1484 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/braindao/DeepSeek-R1-Distill-Qwen-14B-Blunt-Uncensored-Reflective/4217b403-e924-4f67-9b0e-ad1d4ed293a1.json b/data/hfopenllm_v2/braindao/DeepSeek-R1-Distill-Qwen-14B-Blunt-Uncensored-Reflective/4217b403-e924-4f67-9b0e-ad1d4ed293a1.json deleted file mode 100644 index 7a63e7232..000000000 --- a/data/hfopenllm_v2/braindao/DeepSeek-R1-Distill-Qwen-14B-Blunt-Uncensored-Reflective/4217b403-e924-4f67-9b0e-ad1d4ed293a1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/braindao_DeepSeek-R1-Distill-Qwen-14B-Blunt-Uncensored-Reflective/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "DeepSeek-R1-Distill-Qwen-14B-Blunt-Uncensored-Reflective", - "id": "braindao/DeepSeek-R1-Distill-Qwen-14B-Blunt-Uncensored-Reflective", - "developer": "braindao", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5139 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": 
"Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3013 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1473 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2878 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4433 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1289 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/braindao/DeepSeek-R1-Distill-Qwen-14B-Blunt-Uncensored/03816e41-5fb8-4815-ab9c-4108ab19a3bc.json b/data/hfopenllm_v2/braindao/DeepSeek-R1-Distill-Qwen-14B-Blunt-Uncensored/03816e41-5fb8-4815-ab9c-4108ab19a3bc.json deleted file mode 100644 index 93ba8529b..000000000 --- a/data/hfopenllm_v2/braindao/DeepSeek-R1-Distill-Qwen-14B-Blunt-Uncensored/03816e41-5fb8-4815-ab9c-4108ab19a3bc.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/braindao_DeepSeek-R1-Distill-Qwen-14B-Blunt-Uncensored/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "DeepSeek-R1-Distill-Qwen-14B-Blunt-Uncensored", - "id": "braindao/DeepSeek-R1-Distill-Qwen-14B-Blunt-Uncensored", - "developer": "braindao", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5422 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, 
- "max_score": 1.0 - }, - "score_details": { - "score": 0.317 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1631 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2827 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4487 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1431 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/braindao/DeepSeek-R1-Distill-Qwen-14B-Blunt/a763b10e-350a-4342-ade3-b782437ca3e2.json b/data/hfopenllm_v2/braindao/DeepSeek-R1-Distill-Qwen-14B-Blunt/a763b10e-350a-4342-ade3-b782437ca3e2.json deleted file mode 100644 index 511ffb178..000000000 --- a/data/hfopenllm_v2/braindao/DeepSeek-R1-Distill-Qwen-14B-Blunt/a763b10e-350a-4342-ade3-b782437ca3e2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/braindao_DeepSeek-R1-Distill-Qwen-14B-Blunt/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "DeepSeek-R1-Distill-Qwen-14B-Blunt", - "id": "braindao/DeepSeek-R1-Distill-Qwen-14B-Blunt", - "developer": "braindao", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5612 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3283 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 
5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1639 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3029 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4554 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1447 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/braindao/DeepSeek-R1-Distill-Qwen-14B-Reflective/9e806fd2-edbf-40e2-a008-834cee537bb6.json b/data/hfopenllm_v2/braindao/DeepSeek-R1-Distill-Qwen-14B-Reflective/9e806fd2-edbf-40e2-a008-834cee537bb6.json deleted file mode 100644 index c56a5b418..000000000 --- a/data/hfopenllm_v2/braindao/DeepSeek-R1-Distill-Qwen-14B-Reflective/9e806fd2-edbf-40e2-a008-834cee537bb6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/braindao_DeepSeek-R1-Distill-Qwen-14B-Reflective/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "DeepSeek-R1-Distill-Qwen-14B-Reflective", - "id": "braindao/DeepSeek-R1-Distill-Qwen-14B-Reflective", - "developer": "braindao", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.429 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3012 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - 
"evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1918 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2727 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4554 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1129 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/braindao/DeepSeek-R1-Distill-Qwen-14B/fbcf861c-62db-4079-bba6-becd4e231216.json b/data/hfopenllm_v2/braindao/DeepSeek-R1-Distill-Qwen-14B/fbcf861c-62db-4079-bba6-becd4e231216.json deleted file mode 100644 index cd51e1d40..000000000 --- a/data/hfopenllm_v2/braindao/DeepSeek-R1-Distill-Qwen-14B/fbcf861c-62db-4079-bba6-becd4e231216.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/braindao_DeepSeek-R1-Distill-Qwen-14B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "DeepSeek-R1-Distill-Qwen-14B", - "id": "braindao/DeepSeek-R1-Distill-Qwen-14B", - "developer": "braindao", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4172 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3033 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.176 - 
} - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2802 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4488 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1127 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/braindao/DeepSeek-R1-Distill-Qwen-7B-Blunt/22b591c0-3386-4bd5-860c-20c0c6001986.json b/data/hfopenllm_v2/braindao/DeepSeek-R1-Distill-Qwen-7B-Blunt/22b591c0-3386-4bd5-860c-20c0c6001986.json deleted file mode 100644 index e9582887e..000000000 --- a/data/hfopenllm_v2/braindao/DeepSeek-R1-Distill-Qwen-7B-Blunt/22b591c0-3386-4bd5-860c-20c0c6001986.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/braindao_DeepSeek-R1-Distill-Qwen-7B-Blunt/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "DeepSeek-R1-Distill-Qwen-7B-Blunt", - "id": "braindao/DeepSeek-R1-Distill-Qwen-7B-Blunt", - "developer": "braindao", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4266 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2902 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2145 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.271 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3885 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1169 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/braindao/DeepSeek-R1-Distill-Qwen-7B-ORPO-Uncensored/dfb9a9c4-114e-4188-9940-4d6df7e4815f.json b/data/hfopenllm_v2/braindao/DeepSeek-R1-Distill-Qwen-7B-ORPO-Uncensored/dfb9a9c4-114e-4188-9940-4d6df7e4815f.json deleted file mode 100644 index b9fd0837f..000000000 --- a/data/hfopenllm_v2/braindao/DeepSeek-R1-Distill-Qwen-7B-ORPO-Uncensored/dfb9a9c4-114e-4188-9940-4d6df7e4815f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/braindao_DeepSeek-R1-Distill-Qwen-7B-ORPO-Uncensored/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "DeepSeek-R1-Distill-Qwen-7B-ORPO-Uncensored", - "id": "braindao/DeepSeek-R1-Distill-Qwen-7B-ORPO-Uncensored", - "developer": "braindao", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3655 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2958 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1737 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2534 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3846 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1133 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/braindao/DeepSeek-R1-Distill-Qwen-7B-Reflective/38fd5f4d-0f3c-4dc2-b250-a9ee7090aac2.json b/data/hfopenllm_v2/braindao/DeepSeek-R1-Distill-Qwen-7B-Reflective/38fd5f4d-0f3c-4dc2-b250-a9ee7090aac2.json deleted file mode 100644 index dea2a0ad9..000000000 --- a/data/hfopenllm_v2/braindao/DeepSeek-R1-Distill-Qwen-7B-Reflective/38fd5f4d-0f3c-4dc2-b250-a9ee7090aac2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/braindao_DeepSeek-R1-Distill-Qwen-7B-Reflective/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "DeepSeek-R1-Distill-Qwen-7B-Reflective", - "id": "braindao/DeepSeek-R1-Distill-Qwen-7B-Reflective", - "developer": "braindao", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3922 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2907 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2024 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2542 - } - }, - { - "evaluation_name": "MUSR", 
- "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.39 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1155 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/braindao/DeepSeek-R1-Distill-Qwen-7B/e53cbc94-fc9f-4d53-ae28-26bc8c2caef8.json b/data/hfopenllm_v2/braindao/DeepSeek-R1-Distill-Qwen-7B/e53cbc94-fc9f-4d53-ae28-26bc8c2caef8.json deleted file mode 100644 index 256a50564..000000000 --- a/data/hfopenllm_v2/braindao/DeepSeek-R1-Distill-Qwen-7B/e53cbc94-fc9f-4d53-ae28-26bc8c2caef8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/braindao_DeepSeek-R1-Distill-Qwen-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "DeepSeek-R1-Distill-Qwen-7B", - "id": "braindao/DeepSeek-R1-Distill-Qwen-7B", - "developer": "braindao", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3968 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2887 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1918 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2617 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, 
- "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3767 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1141 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/braindao/Qwen2.5-14B-Instruct/2165e69a-c50c-419a-932e-909f53b73b71.json b/data/hfopenllm_v2/braindao/Qwen2.5-14B-Instruct/2165e69a-c50c-419a-932e-909f53b73b71.json deleted file mode 100644 index 5a58485ba..000000000 --- a/data/hfopenllm_v2/braindao/Qwen2.5-14B-Instruct/2165e69a-c50c-419a-932e-909f53b73b71.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/braindao_Qwen2.5-14B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-14B-Instruct", - "id": "braindao/Qwen2.5-14B-Instruct", - "developer": "braindao", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8143 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6404 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5529 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3289 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.414 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - 
"hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4889 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/braindao/Qwen2.5-14B/46430a07-15c8-4727-9102-2f471d4f1d3c.json b/data/hfopenllm_v2/braindao/Qwen2.5-14B/46430a07-15c8-4727-9102-2f471d4f1d3c.json deleted file mode 100644 index fd68ca929..000000000 --- a/data/hfopenllm_v2/braindao/Qwen2.5-14B/46430a07-15c8-4727-9102-2f471d4f1d3c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/braindao_Qwen2.5-14B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-14B", - "id": "braindao/Qwen2.5-14B", - "developer": "braindao", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5409 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5853 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2923 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3733 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4124 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4884 - } - } - ] -} \ No newline at end of file diff 
--git a/data/hfopenllm_v2/braindao/iq-code-evmind-0.5b/3c7f540a-c850-4e20-ad93-60e021d17133.json b/data/hfopenllm_v2/braindao/iq-code-evmind-0.5b/3c7f540a-c850-4e20-ad93-60e021d17133.json deleted file mode 100644 index 7ebae94fb..000000000 --- a/data/hfopenllm_v2/braindao/iq-code-evmind-0.5b/3c7f540a-c850-4e20-ad93-60e021d17133.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/braindao_iq-code-evmind-0.5b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "iq-code-evmind-0.5b", - "id": "braindao/iq-code-evmind-0.5b", - "developer": "braindao", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.494 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3216 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3164 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0242 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2416 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3304 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1189 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/brgx53/3Bgeneral-ECE-PRYMMAL-Martial/c3ab4f38-6f7b-4589-ae4f-21ace05b8c44.json b/data/hfopenllm_v2/brgx53/3Bgeneral-ECE-PRYMMAL-Martial/c3ab4f38-6f7b-4589-ae4f-21ace05b8c44.json deleted file mode 100644 index 
e96867233..000000000 --- a/data/hfopenllm_v2/brgx53/3Bgeneral-ECE-PRYMMAL-Martial/c3ab4f38-6f7b-4589-ae4f-21ace05b8c44.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/brgx53_3Bgeneral-ECE-PRYMMAL-Martial/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "3Bgeneral-ECE-PRYMMAL-Martial", - "id": "brgx53/3Bgeneral-ECE-PRYMMAL-Martial", - "developer": "brgx53", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Phi3ForCausalLM", - "params_billions": 3.821 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3289 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5458 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1314 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3247 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4373 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3934 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/brgx53/3Bgeneralv2-ECE-PRYMMAL-Martial/2708c0d6-03e7-4a17-b6b9-e16f3ddcf5bb.json b/data/hfopenllm_v2/brgx53/3Bgeneralv2-ECE-PRYMMAL-Martial/2708c0d6-03e7-4a17-b6b9-e16f3ddcf5bb.json deleted file mode 100644 index bebdea7ac..000000000 --- a/data/hfopenllm_v2/brgx53/3Bgeneralv2-ECE-PRYMMAL-Martial/2708c0d6-03e7-4a17-b6b9-e16f3ddcf5bb.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - 
"evaluation_id": "hfopenllm_v2/brgx53_3Bgeneralv2-ECE-PRYMMAL-Martial/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "3Bgeneralv2-ECE-PRYMMAL-Martial", - "id": "brgx53/3Bgeneralv2-ECE-PRYMMAL-Martial", - "developer": "brgx53", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5677 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5607 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3497 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3112 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4356 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4505 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/brgx53/3Blareneg-ECE-PRYMMAL-Martial/6427a5ef-8508-430d-970d-054fc485e754.json b/data/hfopenllm_v2/brgx53/3Blareneg-ECE-PRYMMAL-Martial/6427a5ef-8508-430d-970d-054fc485e754.json deleted file mode 100644 index 27aa2a2d2..000000000 --- a/data/hfopenllm_v2/brgx53/3Blareneg-ECE-PRYMMAL-Martial/6427a5ef-8508-430d-970d-054fc485e754.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/brgx53_3Blareneg-ECE-PRYMMAL-Martial/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - 
"source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "3Blareneg-ECE-PRYMMAL-Martial", - "id": "brgx53/3Blareneg-ECE-PRYMMAL-Martial", - "developer": "brgx53", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Phi3ForCausalLM", - "params_billions": 3.821 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2876 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5358 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1208 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3347 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4429 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4016 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/brgx53/3Blarenegv2-ECE-PRYMMAL-Martial/08984ad9-1e9b-4916-b214-af26dadfcc0b.json b/data/hfopenllm_v2/brgx53/3Blarenegv2-ECE-PRYMMAL-Martial/08984ad9-1e9b-4916-b214-af26dadfcc0b.json deleted file mode 100644 index 3114b7908..000000000 --- a/data/hfopenllm_v2/brgx53/3Blarenegv2-ECE-PRYMMAL-Martial/08984ad9-1e9b-4916-b214-af26dadfcc0b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/brgx53_3Blarenegv2-ECE-PRYMMAL-Martial/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "3Blarenegv2-ECE-PRYMMAL-Martial", - "id": 
"brgx53/3Blarenegv2-ECE-PRYMMAL-Martial", - "developer": "brgx53", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5662 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5607 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3497 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3112 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4356 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4505 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/brgx53/Barracuda-PRYMMAL-ECE-TW3/1dbb5d03-fdfa-4059-9d50-d037ada6b1ac.json b/data/hfopenllm_v2/brgx53/Barracuda-PRYMMAL-ECE-TW3/1dbb5d03-fdfa-4059-9d50-d037ada6b1ac.json deleted file mode 100644 index ee39d8a64..000000000 --- a/data/hfopenllm_v2/brgx53/Barracuda-PRYMMAL-ECE-TW3/1dbb5d03-fdfa-4059-9d50-d037ada6b1ac.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/brgx53_Barracuda-PRYMMAL-ECE-TW3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Barracuda-PRYMMAL-ECE-TW3", - "id": "brgx53/Barracuda-PRYMMAL-ECE-TW3", - "developer": "brgx53", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.544 - } - }, 
- "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.164 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3002 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0023 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2534 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3609 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1093 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/brgx53/LaConfiance-PRYMMAL-ECE-TW3/6bf42faa-c3e9-4069-bf93-ffd626062f0f.json b/data/hfopenllm_v2/brgx53/LaConfiance-PRYMMAL-ECE-TW3/6bf42faa-c3e9-4069-bf93-ffd626062f0f.json deleted file mode 100644 index f4004b73f..000000000 --- a/data/hfopenllm_v2/brgx53/LaConfiance-PRYMMAL-ECE-TW3/6bf42faa-c3e9-4069-bf93-ffd626062f0f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/brgx53_LaConfiance-PRYMMAL-ECE-TW3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "LaConfiance-PRYMMAL-ECE-TW3", - "id": "brgx53/LaConfiance-PRYMMAL-ECE-TW3", - "developer": "brgx53", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.777 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1579 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2962 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2517 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3846 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1146 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bunnycore/Best-Mix-Llama-3.1-8B/9feccbdc-18eb-4077-b50b-986db0047fc8.json b/data/hfopenllm_v2/bunnycore/Best-Mix-Llama-3.1-8B/9feccbdc-18eb-4077-b50b-986db0047fc8.json deleted file mode 100644 index b5b81fbcc..000000000 --- a/data/hfopenllm_v2/bunnycore/Best-Mix-Llama-3.1-8B/9feccbdc-18eb-4077-b50b-986db0047fc8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bunnycore_Best-Mix-Llama-3.1-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Best-Mix-Llama-3.1-8B", - "id": "bunnycore/Best-Mix-Llama-3.1-8B", - "developer": "bunnycore", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2067 - } - }, - { - "evaluation_name": "BBH", - 
"source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3432 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2054 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2651 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2929 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1565 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bunnycore/Blabbertron-1.0/a074c33f-782a-409c-987b-7dd62c65ccc7.json b/data/hfopenllm_v2/bunnycore/Blabbertron-1.0/a074c33f-782a-409c-987b-7dd62c65ccc7.json deleted file mode 100644 index 313ecbf7a..000000000 --- a/data/hfopenllm_v2/bunnycore/Blabbertron-1.0/a074c33f-782a-409c-987b-7dd62c65ccc7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bunnycore_Blabbertron-1.0/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Blabbertron-1.0", - "id": "bunnycore/Blabbertron-1.0", - "developer": "bunnycore", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.613 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7433 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 
1.0 - }, - "score_details": { - "score": 0.5497 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4924 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.302 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4337 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4354 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bunnycore/Blabbertron-1.1/2f2c0dea-dcd4-4e54-9f40-9fda4b91bd40.json b/data/hfopenllm_v2/bunnycore/Blabbertron-1.1/2f2c0dea-dcd4-4e54-9f40-9fda4b91bd40.json deleted file mode 100644 index 47f457e44..000000000 --- a/data/hfopenllm_v2/bunnycore/Blabbertron-1.1/2f2c0dea-dcd4-4e54-9f40-9fda4b91bd40.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bunnycore_Blabbertron-1.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Blabbertron-1.1", - "id": "bunnycore/Blabbertron-1.1", - "developer": "bunnycore", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.613 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7265 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5534 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - 
"evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4804 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3029 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4416 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4431 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bunnycore/CyberCore-Qwen-2.1-7B/84481fee-3727-427b-912a-30e2744df28a.json b/data/hfopenllm_v2/bunnycore/CyberCore-Qwen-2.1-7B/84481fee-3727-427b-912a-30e2744df28a.json deleted file mode 100644 index a37c47c92..000000000 --- a/data/hfopenllm_v2/bunnycore/CyberCore-Qwen-2.1-7B/84481fee-3727-427b-912a-30e2744df28a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bunnycore_CyberCore-Qwen-2.1-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "CyberCore-Qwen-2.1-7B", - "id": "bunnycore/CyberCore-Qwen-2.1-7B", - "developer": "bunnycore", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5766 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5572 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3588 - } - }, - { - "evaluation_name": 
"GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3079 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4145 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4445 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bunnycore/DeepQwen-3B-LCoT-SCE/aaa801dc-1a47-4009-9ad4-7129a8d4e651.json b/data/hfopenllm_v2/bunnycore/DeepQwen-3B-LCoT-SCE/aaa801dc-1a47-4009-9ad4-7129a8d4e651.json deleted file mode 100644 index ee38b50d8..000000000 --- a/data/hfopenllm_v2/bunnycore/DeepQwen-3B-LCoT-SCE/aaa801dc-1a47-4009-9ad4-7129a8d4e651.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bunnycore_DeepQwen-3B-LCoT-SCE/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "DeepQwen-3B-LCoT-SCE", - "id": "bunnycore/DeepQwen-3B-LCoT-SCE", - "developer": "bunnycore", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.396 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.449 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4512 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.247 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2626 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3514 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.329 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bunnycore/DeepSeek-R1-Distill-Qwen-7B-RRP-Ex/3ac92cbf-c85b-4e00-9ef9-4322f961591a.json b/data/hfopenllm_v2/bunnycore/DeepSeek-R1-Distill-Qwen-7B-RRP-Ex/3ac92cbf-c85b-4e00-9ef9-4322f961591a.json deleted file mode 100644 index b208debb8..000000000 --- a/data/hfopenllm_v2/bunnycore/DeepSeek-R1-Distill-Qwen-7B-RRP-Ex/3ac92cbf-c85b-4e00-9ef9-4322f961591a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bunnycore_DeepSeek-R1-Distill-Qwen-7B-RRP-Ex/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "DeepSeek-R1-Distill-Qwen-7B-RRP-Ex", - "id": "bunnycore/DeepSeek-R1-Distill-Qwen-7B-RRP-Ex", - "developer": "bunnycore", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3901 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3494 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1654 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2785 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { 
- "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3663 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2508 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bunnycore/DeepThinker-7B-Sce-v1/162b511b-4684-4595-9261-a33f3a4117f9.json b/data/hfopenllm_v2/bunnycore/DeepThinker-7B-Sce-v1/162b511b-4684-4595-9261-a33f3a4117f9.json deleted file mode 100644 index e66755432..000000000 --- a/data/hfopenllm_v2/bunnycore/DeepThinker-7B-Sce-v1/162b511b-4684-4595-9261-a33f3a4117f9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bunnycore_DeepThinker-7B-Sce-v1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "DeepThinker-7B-Sce-v1", - "id": "bunnycore/DeepThinker-7B-Sce-v1", - "developer": "bunnycore", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.613 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1218 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3018 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0098 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2517 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 
0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4194 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1123 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bunnycore/DeepThinker-7B-Sce-v2/20d5d59a-028d-4e34-9414-d9edaf2e59b8.json b/data/hfopenllm_v2/bunnycore/DeepThinker-7B-Sce-v2/20d5d59a-028d-4e34-9414-d9edaf2e59b8.json deleted file mode 100644 index 36edeced3..000000000 --- a/data/hfopenllm_v2/bunnycore/DeepThinker-7B-Sce-v2/20d5d59a-028d-4e34-9414-d9edaf2e59b8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bunnycore_DeepThinker-7B-Sce-v2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "DeepThinker-7B-Sce-v2", - "id": "bunnycore/DeepThinker-7B-Sce-v2", - "developer": "bunnycore", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.613 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1631 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3057 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0113 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2584 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4101 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1146 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bunnycore/FuseCyberMix-Qwen-2.5-7B-Instruct/a21b53fb-783b-440b-9f3d-d8ada3bd18ea.json b/data/hfopenllm_v2/bunnycore/FuseCyberMix-Qwen-2.5-7B-Instruct/a21b53fb-783b-440b-9f3d-d8ada3bd18ea.json deleted file mode 100644 index ae646dc09..000000000 --- a/data/hfopenllm_v2/bunnycore/FuseCyberMix-Qwen-2.5-7B-Instruct/a21b53fb-783b-440b-9f3d-d8ada3bd18ea.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bunnycore_FuseCyberMix-Qwen-2.5-7B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "FuseCyberMix-Qwen-2.5-7B-Instruct", - "id": "bunnycore/FuseCyberMix-Qwen-2.5-7B-Instruct", - "developer": "bunnycore", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7019 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5518 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4841 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.297 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.402 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 
1.0 - }, - "score_details": { - "score": 0.4337 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bunnycore/FuseQwQen-7B/0d2ab1e8-a2d7-45cf-b123-67bcab2d9dff.json b/data/hfopenllm_v2/bunnycore/FuseQwQen-7B/0d2ab1e8-a2d7-45cf-b123-67bcab2d9dff.json deleted file mode 100644 index 066c75efc..000000000 --- a/data/hfopenllm_v2/bunnycore/FuseQwQen-7B/0d2ab1e8-a2d7-45cf-b123-67bcab2d9dff.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bunnycore_FuseQwQen-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "FuseQwQen-7B", - "id": "bunnycore/FuseQwQen-7B", - "developer": "bunnycore", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7275 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5504 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4366 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2945 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4217 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4407 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bunnycore/FwF-Qwen-7B-0.1/6b4a37c8-c7e6-4156-9d6d-8cba51b74d82.json b/data/hfopenllm_v2/bunnycore/FwF-Qwen-7B-0.1/6b4a37c8-c7e6-4156-9d6d-8cba51b74d82.json deleted 
file mode 100644 index 13a373a0b..000000000 --- a/data/hfopenllm_v2/bunnycore/FwF-Qwen-7B-0.1/6b4a37c8-c7e6-4156-9d6d-8cba51b74d82.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bunnycore_FwF-Qwen-7B-0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "FwF-Qwen-7B-0.1", - "id": "bunnycore/FwF-Qwen-7B-0.1", - "developer": "bunnycore", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3005 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5019 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2764 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.271 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3952 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4061 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bunnycore/FwF-Qwen-7B-0.2/78582fec-2f69-4b37-8497-12ceb097b44b.json b/data/hfopenllm_v2/bunnycore/FwF-Qwen-7B-0.2/78582fec-2f69-4b37-8497-12ceb097b44b.json deleted file mode 100644 index c94f58add..000000000 --- a/data/hfopenllm_v2/bunnycore/FwF-Qwen-7B-0.2/78582fec-2f69-4b37-8497-12ceb097b44b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/bunnycore_FwF-Qwen-7B-0.2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "FwF-Qwen-7B-0.2", - "id": "bunnycore/FwF-Qwen-7B-0.2", - "developer": "bunnycore", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4479 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5596 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.426 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2903 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4218 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4382 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bunnycore/Gemma-2-2B-Smart/949bf65e-c2ae-4701-82f0-39d0c62a0e87.json b/data/hfopenllm_v2/bunnycore/Gemma-2-2B-Smart/949bf65e-c2ae-4701-82f0-39d0c62a0e87.json deleted file mode 100644 index 7d739968f..000000000 --- a/data/hfopenllm_v2/bunnycore/Gemma-2-2B-Smart/949bf65e-c2ae-4701-82f0-39d0c62a0e87.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bunnycore_Gemma-2-2B-Smart/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - 
"evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gemma-2-2B-Smart", - "id": "bunnycore/Gemma-2-2B-Smart", - "developer": "bunnycore", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 2.614 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1321 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3974 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0332 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2827 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4249 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2426 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bunnycore/Gemma2-9B-TitanFusion/8812151c-4301-4131-a414-d64d025e476e.json b/data/hfopenllm_v2/bunnycore/Gemma2-9B-TitanFusion/8812151c-4301-4131-a414-d64d025e476e.json deleted file mode 100644 index 7da88954d..000000000 --- a/data/hfopenllm_v2/bunnycore/Gemma2-9B-TitanFusion/8812151c-4301-4131-a414-d64d025e476e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bunnycore_Gemma2-9B-TitanFusion/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gemma2-9B-TitanFusion", - "id": "bunnycore/Gemma2-9B-TitanFusion", - "developer": "bunnycore", - "inference_platform": "unknown", - "additional_details": { - "precision": 
"bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1618 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5712 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.077 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3322 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4136 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.396 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bunnycore/HyperLlama-3.1-8B/2db1542f-a8da-4fb8-91a5-6dd1a942b55e.json b/data/hfopenllm_v2/bunnycore/HyperLlama-3.1-8B/2db1542f-a8da-4fb8-91a5-6dd1a942b55e.json deleted file mode 100644 index 0347a413c..000000000 --- a/data/hfopenllm_v2/bunnycore/HyperLlama-3.1-8B/2db1542f-a8da-4fb8-91a5-6dd1a942b55e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bunnycore_HyperLlama-3.1-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "HyperLlama-3.1-8B", - "id": "bunnycore/HyperLlama-3.1-8B", - "developer": "bunnycore", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7883 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5103 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1828 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2869 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3829 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3783 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bunnycore/Llama-3.1-8B-TitanFusion-Mix/9feeffb2-3763-4e43-933e-89100b76f7fa.json b/data/hfopenllm_v2/bunnycore/Llama-3.1-8B-TitanFusion-Mix/9feeffb2-3763-4e43-933e-89100b76f7fa.json deleted file mode 100644 index 0258259b7..000000000 --- a/data/hfopenllm_v2/bunnycore/Llama-3.1-8B-TitanFusion-Mix/9feeffb2-3763-4e43-933e-89100b76f7fa.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bunnycore_Llama-3.1-8B-TitanFusion-Mix/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.1-8B-TitanFusion-Mix", - "id": "bunnycore/Llama-3.1-8B-TitanFusion-Mix", - "developer": "bunnycore", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.4925 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5756 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1284 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2953 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4317 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3695 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bunnycore/Llama-3.1-8B-TitanFusion-v3/721102b5-ed5e-4631-8600-a6adfff0c784.json b/data/hfopenllm_v2/bunnycore/Llama-3.1-8B-TitanFusion-v3/721102b5-ed5e-4631-8600-a6adfff0c784.json deleted file mode 100644 index 9e1223273..000000000 --- a/data/hfopenllm_v2/bunnycore/Llama-3.1-8B-TitanFusion-v3/721102b5-ed5e-4631-8600-a6adfff0c784.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bunnycore_Llama-3.1-8B-TitanFusion-v3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.1-8B-TitanFusion-v3", - "id": "bunnycore/Llama-3.1-8B-TitanFusion-v3", - "developer": "bunnycore", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.481 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5262 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.142 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3087 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4302 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3806 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bunnycore/Llama-3.2-3B-All-Mix/18c185f7-5ca4-46ff-81c2-6c538f096409.json b/data/hfopenllm_v2/bunnycore/Llama-3.2-3B-All-Mix/18c185f7-5ca4-46ff-81c2-6c538f096409.json deleted file mode 100644 index e63ada024..000000000 --- a/data/hfopenllm_v2/bunnycore/Llama-3.2-3B-All-Mix/18c185f7-5ca4-46ff-81c2-6c538f096409.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bunnycore_Llama-3.2-3B-All-Mix/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.2-3B-All-Mix", - "id": "bunnycore/Llama-3.2-3B-All-Mix", - "developer": "bunnycore", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.607 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7226 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4508 - } - }, - { - "evaluation_name": "MATH Level 5", - 
"source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1503 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2626 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3287 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.316 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bunnycore/Llama-3.2-3B-Bespoke-Thought/7ab5911c-e229-43e5-a798-095287d0a597.json b/data/hfopenllm_v2/bunnycore/Llama-3.2-3B-Bespoke-Thought/7ab5911c-e229-43e5-a798-095287d0a597.json deleted file mode 100644 index 8d51fa17b..000000000 --- a/data/hfopenllm_v2/bunnycore/Llama-3.2-3B-Bespoke-Thought/7ab5911c-e229-43e5-a798-095287d0a597.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bunnycore_Llama-3.2-3B-Bespoke-Thought/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.2-3B-Bespoke-Thought", - "id": "bunnycore/Llama-3.2-3B-Bespoke-Thought", - "developer": "bunnycore", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4113 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4522 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact 
Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1647 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2659 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3302 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.311 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bunnycore/Llama-3.2-3B-Booval/f800c4e5-e918-45bb-8a12-3ca2a64c6b23.json b/data/hfopenllm_v2/bunnycore/Llama-3.2-3B-Booval/f800c4e5-e918-45bb-8a12-3ca2a64c6b23.json deleted file mode 100644 index 596f8d2f9..000000000 --- a/data/hfopenllm_v2/bunnycore/Llama-3.2-3B-Booval/f800c4e5-e918-45bb-8a12-3ca2a64c6b23.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bunnycore_Llama-3.2-3B-Booval/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.2-3B-Booval", - "id": "bunnycore/Llama-3.2-3B-Booval", - "developer": "bunnycore", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6669 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4514 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1269 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": 
"GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2668 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3394 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3058 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bunnycore/Llama-3.2-3B-Deep-Test/5fcf41bc-30dc-46a7-9cf2-4ce2c7a5850c.json b/data/hfopenllm_v2/bunnycore/Llama-3.2-3B-Deep-Test/5fcf41bc-30dc-46a7-9cf2-4ce2c7a5850c.json deleted file mode 100644 index cce4b4015..000000000 --- a/data/hfopenllm_v2/bunnycore/Llama-3.2-3B-Deep-Test/5fcf41bc-30dc-46a7-9cf2-4ce2c7a5850c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bunnycore_Llama-3.2-3B-Deep-Test/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.2-3B-Deep-Test", - "id": "bunnycore/Llama-3.2-3B-Deep-Test", - "developer": "bunnycore", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.607 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4652 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4531 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1284 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.2643 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3394 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3152 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bunnycore/Llama-3.2-3B-Deep-Test/d4b20ef4-734e-40a7-818e-f77e170d7437.json b/data/hfopenllm_v2/bunnycore/Llama-3.2-3B-Deep-Test/d4b20ef4-734e-40a7-818e-f77e170d7437.json deleted file mode 100644 index 89f5cb978..000000000 --- a/data/hfopenllm_v2/bunnycore/Llama-3.2-3B-Deep-Test/d4b20ef4-734e-40a7-818e-f77e170d7437.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bunnycore_Llama-3.2-3B-Deep-Test/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.2-3B-Deep-Test", - "id": "bunnycore/Llama-3.2-3B-Deep-Test", - "developer": "bunnycore", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.803 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1775 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.295 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2517 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { 
- "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3647 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1049 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bunnycore/Llama-3.2-3B-Della/e0996c96-c9e5-4d39-8e6d-1455ef1f9544.json b/data/hfopenllm_v2/bunnycore/Llama-3.2-3B-Della/e0996c96-c9e5-4d39-8e6d-1455ef1f9544.json deleted file mode 100644 index 10dca8528..000000000 --- a/data/hfopenllm_v2/bunnycore/Llama-3.2-3B-Della/e0996c96-c9e5-4d39-8e6d-1455ef1f9544.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bunnycore_Llama-3.2-3B-Della/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.2-3B-Della", - "id": "bunnycore/Llama-3.2-3B-Della", - "developer": "bunnycore", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.607 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3561 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3683 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0302 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.276 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3902 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - 
"dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2128 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bunnycore/Llama-3.2-3B-Long-Think/3ad2b31e-ce2a-4cb4-9b85-79cdebd5d364.json b/data/hfopenllm_v2/bunnycore/Llama-3.2-3B-Long-Think/3ad2b31e-ce2a-4cb4-9b85-79cdebd5d364.json deleted file mode 100644 index a4a40c69d..000000000 --- a/data/hfopenllm_v2/bunnycore/Llama-3.2-3B-Long-Think/3ad2b31e-ce2a-4cb4-9b85-79cdebd5d364.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bunnycore_Llama-3.2-3B-Long-Think/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.2-3B-Long-Think", - "id": "bunnycore/Llama-3.2-3B-Long-Think", - "developer": "bunnycore", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5473 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.461 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1458 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2609 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3396 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3048 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bunnycore/Llama-3.2-3B-Mix-Skill/9aff874c-1953-4b97-9bff-9e6120b0bfa7.json b/data/hfopenllm_v2/bunnycore/Llama-3.2-3B-Mix-Skill/9aff874c-1953-4b97-9bff-9e6120b0bfa7.json deleted file mode 100644 index 80946bb54..000000000 --- a/data/hfopenllm_v2/bunnycore/Llama-3.2-3B-Mix-Skill/9aff874c-1953-4b97-9bff-9e6120b0bfa7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bunnycore_Llama-3.2-3B-Mix-Skill/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.2-3B-Mix-Skill", - "id": "bunnycore/Llama-3.2-3B-Mix-Skill", - "developer": "bunnycore", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.607 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6404 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4582 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1473 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2617 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3396 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3121 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/bunnycore/Llama-3.2-3B-ProdigyPlus/45ae7f45-8c36-46c6-989d-bc672cdf8eff.json b/data/hfopenllm_v2/bunnycore/Llama-3.2-3B-ProdigyPlus/45ae7f45-8c36-46c6-989d-bc672cdf8eff.json deleted file mode 100644 index dfe4a54af..000000000 --- a/data/hfopenllm_v2/bunnycore/Llama-3.2-3B-ProdigyPlus/45ae7f45-8c36-46c6-989d-bc672cdf8eff.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bunnycore_Llama-3.2-3B-ProdigyPlus/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.2-3B-ProdigyPlus", - "id": "bunnycore/Llama-3.2-3B-ProdigyPlus", - "developer": "bunnycore", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.607 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4015 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4392 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1156 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2685 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.358 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2817 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bunnycore/Llama-3.2-3B-ProdigyPlusPlus/7d36e44e-a329-4b96-a891-365ad900f718.json b/data/hfopenllm_v2/bunnycore/Llama-3.2-3B-ProdigyPlusPlus/7d36e44e-a329-4b96-a891-365ad900f718.json deleted file mode 
100644 index 29f6d9a02..000000000 --- a/data/hfopenllm_v2/bunnycore/Llama-3.2-3B-ProdigyPlusPlus/7d36e44e-a329-4b96-a891-365ad900f718.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bunnycore_Llama-3.2-3B-ProdigyPlusPlus/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.2-3B-ProdigyPlusPlus", - "id": "bunnycore/Llama-3.2-3B-ProdigyPlusPlus", - "developer": "bunnycore", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.607 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1645 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.369 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0453 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2534 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3541 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.15 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bunnycore/Llama-3.2-3B-RP-DeepThink/a8c26325-1eec-43a6-a8ad-3bcb2e378924.json b/data/hfopenllm_v2/bunnycore/Llama-3.2-3B-RP-DeepThink/a8c26325-1eec-43a6-a8ad-3bcb2e378924.json deleted file mode 100644 index 467586c96..000000000 --- a/data/hfopenllm_v2/bunnycore/Llama-3.2-3B-RP-DeepThink/a8c26325-1eec-43a6-a8ad-3bcb2e378924.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - 
"evaluation_id": "hfopenllm_v2/bunnycore_Llama-3.2-3B-RP-DeepThink/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.2-3B-RP-DeepThink", - "id": "bunnycore/Llama-3.2-3B-RP-DeepThink", - "developer": "bunnycore", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.607 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7144 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4563 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1609 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2659 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3302 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3242 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bunnycore/Llama-3.2-3B-RRStock/bde1a879-6852-42ce-9217-f427af85a46a.json b/data/hfopenllm_v2/bunnycore/Llama-3.2-3B-RRStock/bde1a879-6852-42ce-9217-f427af85a46a.json deleted file mode 100644 index aa68c7658..000000000 --- a/data/hfopenllm_v2/bunnycore/Llama-3.2-3B-RRStock/bde1a879-6852-42ce-9217-f427af85a46a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bunnycore_Llama-3.2-3B-RRStock/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - 
"source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.2-3B-RRStock", - "id": "bunnycore/Llama-3.2-3B-RRStock", - "developer": "bunnycore", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.607 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6657 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4568 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1699 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2659 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3314 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3236 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bunnycore/Llama-3.2-3B-ToxicKod/dd7a0377-f4d6-4390-b9f2-bf50b05ec0f7.json b/data/hfopenllm_v2/bunnycore/Llama-3.2-3B-ToxicKod/dd7a0377-f4d6-4390-b9f2-bf50b05ec0f7.json deleted file mode 100644 index 69e6b8e9a..000000000 --- a/data/hfopenllm_v2/bunnycore/Llama-3.2-3B-ToxicKod/dd7a0377-f4d6-4390-b9f2-bf50b05ec0f7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bunnycore_Llama-3.2-3B-ToxicKod/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.2-3B-ToxicKod", - "id": "bunnycore/Llama-3.2-3B-ToxicKod", - "developer": "bunnycore", - 
"inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6319 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4525 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1699 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2659 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3475 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.288 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bunnycore/Llama-3.2-3b-RP-Toxic-Fuse/12cbf241-d6d4-4d25-ad3d-13a42d7adc74.json b/data/hfopenllm_v2/bunnycore/Llama-3.2-3b-RP-Toxic-Fuse/12cbf241-d6d4-4d25-ad3d-13a42d7adc74.json deleted file mode 100644 index 3cdd73e74..000000000 --- a/data/hfopenllm_v2/bunnycore/Llama-3.2-3b-RP-Toxic-Fuse/12cbf241-d6d4-4d25-ad3d-13a42d7adc74.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bunnycore_Llama-3.2-3b-RP-Toxic-Fuse/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.2-3b-RP-Toxic-Fuse", - "id": "bunnycore/Llama-3.2-3b-RP-Toxic-Fuse", - "developer": "bunnycore", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - 
"evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6834 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.465 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2402 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2777 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3954 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3106 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bunnycore/Maestro-S1k-7B-Sce/1f66fd7c-40ee-4249-8963-5c7bb93a3eaf.json b/data/hfopenllm_v2/bunnycore/Maestro-S1k-7B-Sce/1f66fd7c-40ee-4249-8963-5c7bb93a3eaf.json deleted file mode 100644 index 56e2b2c00..000000000 --- a/data/hfopenllm_v2/bunnycore/Maestro-S1k-7B-Sce/1f66fd7c-40ee-4249-8963-5c7bb93a3eaf.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bunnycore_Maestro-S1k-7B-Sce/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Maestro-S1k-7B-Sce", - "id": "bunnycore/Maestro-S1k-7B-Sce", - "developer": "bunnycore", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.613 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2523 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3104 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0279 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2609 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3768 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.117 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bunnycore/Phi-3.5-mini-TitanFusion-0.1/7076406b-7e0a-49c7-8150-2e6a243aa23b.json b/data/hfopenllm_v2/bunnycore/Phi-3.5-mini-TitanFusion-0.1/7076406b-7e0a-49c7-8150-2e6a243aa23b.json deleted file mode 100644 index 700136bdc..000000000 --- a/data/hfopenllm_v2/bunnycore/Phi-3.5-mini-TitanFusion-0.1/7076406b-7e0a-49c7-8150-2e6a243aa23b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bunnycore_Phi-3.5-mini-TitanFusion-0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Phi-3.5-mini-TitanFusion-0.1", - "id": "bunnycore/Phi-3.5-mini-TitanFusion-0.1", - "developer": "bunnycore", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Phi3ForCausalLM", - "params_billions": 3.821 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5228 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", 
- "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5374 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1186 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3314 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4453 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3807 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bunnycore/Phi-4-Model-Stock-v2/96c3fd80-a601-4629-a1ab-bf7f366a909a.json b/data/hfopenllm_v2/bunnycore/Phi-4-Model-Stock-v2/96c3fd80-a601-4629-a1ab-bf7f366a909a.json deleted file mode 100644 index 82650a142..000000000 --- a/data/hfopenllm_v2/bunnycore/Phi-4-Model-Stock-v2/96c3fd80-a601-4629-a1ab-bf7f366a909a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bunnycore_Phi-4-Model-Stock-v2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Phi-4-Model-Stock-v2", - "id": "bunnycore/Phi-4-Model-Stock-v2", - "developer": "bunnycore", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6375 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.6825 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3754 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.349 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4662 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5331 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bunnycore/Phi-4-Model-Stock-v3/1302c9a5-d35c-400c-b9f3-d990243e5d59.json b/data/hfopenllm_v2/bunnycore/Phi-4-Model-Stock-v3/1302c9a5-d35c-400c-b9f3-d990243e5d59.json deleted file mode 100644 index 2532056a9..000000000 --- a/data/hfopenllm_v2/bunnycore/Phi-4-Model-Stock-v3/1302c9a5-d35c-400c-b9f3-d990243e5d59.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bunnycore_Phi-4-Model-Stock-v3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Phi-4-Model-Stock-v3", - "id": "bunnycore/Phi-4-Model-Stock-v3", - "developer": "bunnycore", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5912 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6726 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - 
"metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4902 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2894 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4166 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5381 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bunnycore/Phi-4-Model-Stock-v4/c7f48bbf-6583-4ddd-ae4d-671c43218dae.json b/data/hfopenllm_v2/bunnycore/Phi-4-Model-Stock-v4/c7f48bbf-6583-4ddd-ae4d-671c43218dae.json deleted file mode 100644 index 6189289e4..000000000 --- a/data/hfopenllm_v2/bunnycore/Phi-4-Model-Stock-v4/c7f48bbf-6583-4ddd-ae4d-671c43218dae.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bunnycore_Phi-4-Model-Stock-v4/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Phi-4-Model-Stock-v4", - "id": "bunnycore/Phi-4-Model-Stock-v4", - "developer": "bunnycore", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.711 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6924 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3829 - } - }, - { - 
"evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3691 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4611 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5394 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bunnycore/Phi-4-Model-Stock/5f07e092-2eb0-44c2-b2ce-5f1b31a9ea99.json b/data/hfopenllm_v2/bunnycore/Phi-4-Model-Stock/5f07e092-2eb0-44c2-b2ce-5f1b31a9ea99.json deleted file mode 100644 index fd4a9988a..000000000 --- a/data/hfopenllm_v2/bunnycore/Phi-4-Model-Stock/5f07e092-2eb0-44c2-b2ce-5f1b31a9ea99.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bunnycore_Phi-4-Model-Stock/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Phi-4-Model-Stock", - "id": "bunnycore/Phi-4-Model-Stock", - "developer": "bunnycore", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6879 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.689 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4298 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3549 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4441 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5368 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bunnycore/Phi-4-RP-v0/15701682-97ce-46cf-8010-a6bdeaf8c7aa.json b/data/hfopenllm_v2/bunnycore/Phi-4-RP-v0/15701682-97ce-46cf-8010-a6bdeaf8c7aa.json deleted file mode 100644 index 695b852da..000000000 --- a/data/hfopenllm_v2/bunnycore/Phi-4-RP-v0/15701682-97ce-46cf-8010-a6bdeaf8c7aa.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bunnycore_Phi-4-RP-v0/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Phi-4-RP-v0", - "id": "bunnycore/Phi-4-RP-v0", - "developer": "bunnycore", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Phi3ForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6827 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6856 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3316 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3523 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4141 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5364 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bunnycore/Phi-4-RR-Shoup/c6eecf0b-fa16-484a-8eeb-d196203b3c3e.json b/data/hfopenllm_v2/bunnycore/Phi-4-RR-Shoup/c6eecf0b-fa16-484a-8eeb-d196203b3c3e.json deleted file mode 100644 index a4fe6d434..000000000 --- a/data/hfopenllm_v2/bunnycore/Phi-4-RR-Shoup/c6eecf0b-fa16-484a-8eeb-d196203b3c3e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bunnycore_Phi-4-RR-Shoup/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Phi-4-RR-Shoup", - "id": "bunnycore/Phi-4-RR-Shoup", - "developer": "bunnycore", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6587 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6947 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4992 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3372 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.444 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": 
"MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5429 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bunnycore/Phi-4-RStock-v0.1/4337b1c1-cc00-4a15-8148-e8d0739561b9.json b/data/hfopenllm_v2/bunnycore/Phi-4-RStock-v0.1/4337b1c1-cc00-4a15-8148-e8d0739561b9.json deleted file mode 100644 index 476936c1d..000000000 --- a/data/hfopenllm_v2/bunnycore/Phi-4-RStock-v0.1/4337b1c1-cc00-4a15-8148-e8d0739561b9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bunnycore_Phi-4-RStock-v0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Phi-4-RStock-v0.1", - "id": "bunnycore/Phi-4-RStock-v0.1", - "developer": "bunnycore", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7019 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6928 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.395 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3649 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4584 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.5401 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bunnycore/Phi-4-ReasoningRP/1151ee14-8fe9-4f97-808d-8103b353c2ec.json b/data/hfopenllm_v2/bunnycore/Phi-4-ReasoningRP/1151ee14-8fe9-4f97-808d-8103b353c2ec.json deleted file mode 100644 index c71d4c31e..000000000 --- a/data/hfopenllm_v2/bunnycore/Phi-4-ReasoningRP/1151ee14-8fe9-4f97-808d-8103b353c2ec.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bunnycore_Phi-4-ReasoningRP/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Phi-4-ReasoningRP", - "id": "bunnycore/Phi-4-ReasoningRP", - "developer": "bunnycore", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6736 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6922 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4569 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.344 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4491 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5421 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bunnycore/Phi-4-Sce-exp-v0.1/a2c18179-aca3-422c-b9f5-8345109cea13.json 
b/data/hfopenllm_v2/bunnycore/Phi-4-Sce-exp-v0.1/a2c18179-aca3-422c-b9f5-8345109cea13.json deleted file mode 100644 index ca8ff9b68..000000000 --- a/data/hfopenllm_v2/bunnycore/Phi-4-Sce-exp-v0.1/a2c18179-aca3-422c-b9f5-8345109cea13.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bunnycore_Phi-4-Sce-exp-v0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Phi-4-Sce-exp-v0.1", - "id": "bunnycore/Phi-4-Sce-exp-v0.1", - "developer": "bunnycore", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6595 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6943 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.503 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3356 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4441 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5423 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bunnycore/Phi-4-Stock-Ex/07495d34-1505-45a9-bb48-887af0da8a0c.json b/data/hfopenllm_v2/bunnycore/Phi-4-Stock-Ex/07495d34-1505-45a9-bb48-887af0da8a0c.json deleted file mode 100644 index d7f7a2004..000000000 --- a/data/hfopenllm_v2/bunnycore/Phi-4-Stock-Ex/07495d34-1505-45a9-bb48-887af0da8a0c.json +++ /dev/null @@ -1,132 +0,0 
@@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bunnycore_Phi-4-Stock-Ex/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Phi-4-Stock-Ex", - "id": "bunnycore/Phi-4-Stock-Ex", - "developer": "bunnycore", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6575 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6864 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4086 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3507 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4624 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5375 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bunnycore/Phi-4-Stock-RP/567baf6d-99f9-46a5-8c40-c6899986f1ff.json b/data/hfopenllm_v2/bunnycore/Phi-4-Stock-RP/567baf6d-99f9-46a5-8c40-c6899986f1ff.json deleted file mode 100644 index bfb812571..000000000 --- a/data/hfopenllm_v2/bunnycore/Phi-4-Stock-RP/567baf6d-99f9-46a5-8c40-c6899986f1ff.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bunnycore_Phi-4-Stock-RP/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - 
"source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Phi-4-Stock-RP", - "id": "bunnycore/Phi-4-Stock-RP", - "developer": "bunnycore", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6399 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.686 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3414 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3582 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4715 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5317 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bunnycore/Phi-4-Trim-Exp1/a337df3a-28ff-46c9-adae-4bc029937101.json b/data/hfopenllm_v2/bunnycore/Phi-4-Trim-Exp1/a337df3a-28ff-46c9-adae-4bc029937101.json deleted file mode 100644 index 803b9c23e..000000000 --- a/data/hfopenllm_v2/bunnycore/Phi-4-Trim-Exp1/a337df3a-28ff-46c9-adae-4bc029937101.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bunnycore_Phi-4-Trim-Exp1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Phi-4-Trim-Exp1", - "id": "bunnycore/Phi-4-Trim-Exp1", - "developer": "bunnycore", - "inference_platform": "unknown", - "additional_details": { - 
"precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 7.503 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1219 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2852 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0053 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.255 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4177 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1147 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bunnycore/Phi-Seek-4-Sce-V1/b201a849-44e9-4598-918b-ffa27c894ee9.json b/data/hfopenllm_v2/bunnycore/Phi-Seek-4-Sce-V1/b201a849-44e9-4598-918b-ffa27c894ee9.json deleted file mode 100644 index b2a61d834..000000000 --- a/data/hfopenllm_v2/bunnycore/Phi-Seek-4-Sce-V1/b201a849-44e9-4598-918b-ffa27c894ee9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bunnycore_Phi-Seek-4-Sce-V1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Phi-Seek-4-Sce-V1", - "id": "bunnycore/Phi-Seek-4-Sce-V1", - "developer": "bunnycore", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": 
"google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2935 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6459 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2145 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.276 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3982 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5123 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bunnycore/Qandora-2.5-7B-Creative/dd87ebf3-3088-43b1-851c-a97d12a68ea8.json b/data/hfopenllm_v2/bunnycore/Qandora-2.5-7B-Creative/dd87ebf3-3088-43b1-851c-a97d12a68ea8.json deleted file mode 100644 index 3f60d9d68..000000000 --- a/data/hfopenllm_v2/bunnycore/Qandora-2.5-7B-Creative/dd87ebf3-3088-43b1-851c-a97d12a68ea8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bunnycore_Qandora-2.5-7B-Creative/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qandora-2.5-7B-Creative", - "id": "bunnycore/Qandora-2.5-7B-Creative", - "developer": "bunnycore", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.6803 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5542 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3059 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3104 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4212 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.448 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bunnycore/QandoraExp-7B-Persona/1b3ef805-8b0c-44bf-b048-773a0dd94d0d.json b/data/hfopenllm_v2/bunnycore/QandoraExp-7B-Persona/1b3ef805-8b0c-44bf-b048-773a0dd94d0d.json deleted file mode 100644 index 3676e6214..000000000 --- a/data/hfopenllm_v2/bunnycore/QandoraExp-7B-Persona/1b3ef805-8b0c-44bf-b048-773a0dd94d0d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bunnycore_QandoraExp-7B-Persona/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "QandoraExp-7B-Persona", - "id": "bunnycore/QandoraExp-7B-Persona", - "developer": "bunnycore", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6247 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", 
- "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5558 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3104 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3146 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4372 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4407 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bunnycore/QandoraExp-7B-v2/220cb478-58c0-4028-b51a-ec5fe1050746.json b/data/hfopenllm_v2/bunnycore/QandoraExp-7B-v2/220cb478-58c0-4028-b51a-ec5fe1050746.json deleted file mode 100644 index 69de747fd..000000000 --- a/data/hfopenllm_v2/bunnycore/QandoraExp-7B-v2/220cb478-58c0-4028-b51a-ec5fe1050746.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bunnycore_QandoraExp-7B-v2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "QandoraExp-7B-v2", - "id": "bunnycore/QandoraExp-7B-v2", - "developer": "bunnycore", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5607 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5445 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": 
"hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4713 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3029 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4045 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3909 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bunnycore/QandoraExp-7B/17cb8ab1-e7ba-4daf-95d4-2cdbd2777434.json b/data/hfopenllm_v2/bunnycore/QandoraExp-7B/17cb8ab1-e7ba-4daf-95d4-2cdbd2777434.json deleted file mode 100644 index 7fc71a8b5..000000000 --- a/data/hfopenllm_v2/bunnycore/QandoraExp-7B/17cb8ab1-e7ba-4daf-95d4-2cdbd2777434.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bunnycore_QandoraExp-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "QandoraExp-7B", - "id": "bunnycore/QandoraExp-7B", - "developer": "bunnycore", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7509 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5478 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 
0.4743 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3104 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4312 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.441 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bunnycore/QwQen-3B-LCoT-R1/2b55023b-b8bc-42a2-aca8-dcaf39890232.json b/data/hfopenllm_v2/bunnycore/QwQen-3B-LCoT-R1/2b55023b-b8bc-42a2-aca8-dcaf39890232.json deleted file mode 100644 index cfea94b78..000000000 --- a/data/hfopenllm_v2/bunnycore/QwQen-3B-LCoT-R1/2b55023b-b8bc-42a2-aca8-dcaf39890232.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bunnycore_QwQen-3B-LCoT-R1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "QwQen-3B-LCoT-R1", - "id": "bunnycore/QwQen-3B-LCoT-R1", - "developer": "bunnycore", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.085 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5342 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4799 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3353 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2617 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4138 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3723 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bunnycore/QwQen-3B-LCoT/31736569-5992-4b1d-9d66-27a6c1620506.json b/data/hfopenllm_v2/bunnycore/QwQen-3B-LCoT/31736569-5992-4b1d-9d66-27a6c1620506.json deleted file mode 100644 index c7d4b48fe..000000000 --- a/data/hfopenllm_v2/bunnycore/QwQen-3B-LCoT/31736569-5992-4b1d-9d66-27a6c1620506.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bunnycore_QwQen-3B-LCoT/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "QwQen-3B-LCoT", - "id": "bunnycore/QwQen-3B-LCoT", - "developer": "bunnycore", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.397 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6025 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4899 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3618 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2668 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { 
- "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4178 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3699 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bunnycore/Qwen-2.5-7B-Deep-Sky-T1/630b37b5-351c-403c-ac76-ccb68ffc5d53.json b/data/hfopenllm_v2/bunnycore/Qwen-2.5-7B-Deep-Sky-T1/630b37b5-351c-403c-ac76-ccb68ffc5d53.json deleted file mode 100644 index e8e23a052..000000000 --- a/data/hfopenllm_v2/bunnycore/Qwen-2.5-7B-Deep-Sky-T1/630b37b5-351c-403c-ac76-ccb68ffc5d53.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bunnycore_Qwen-2.5-7B-Deep-Sky-T1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen-2.5-7B-Deep-Sky-T1", - "id": "bunnycore/Qwen-2.5-7B-Deep-Sky-T1", - "developer": "bunnycore", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.613 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4208 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.414 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0551 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.281 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4018 - } - }, - { - "evaluation_name": 
"MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2104 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bunnycore/Qwen-2.5-7B-Deep-Stock-v1/69cdef01-30dc-4f75-97fa-9daeebcec72f.json b/data/hfopenllm_v2/bunnycore/Qwen-2.5-7B-Deep-Stock-v1/69cdef01-30dc-4f75-97fa-9daeebcec72f.json deleted file mode 100644 index 949efbe5f..000000000 --- a/data/hfopenllm_v2/bunnycore/Qwen-2.5-7B-Deep-Stock-v1/69cdef01-30dc-4f75-97fa-9daeebcec72f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bunnycore_Qwen-2.5-7B-Deep-Stock-v1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen-2.5-7B-Deep-Stock-v1", - "id": "bunnycore/Qwen-2.5-7B-Deep-Stock-v1", - "developer": "bunnycore", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.613 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5695 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5361 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2644 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2777 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4109 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4066 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bunnycore/Qwen-2.5-7B-Deep-Stock-v4/9aa1acb0-c791-4dea-aa1e-c912cea69466.json b/data/hfopenllm_v2/bunnycore/Qwen-2.5-7B-Deep-Stock-v4/9aa1acb0-c791-4dea-aa1e-c912cea69466.json deleted file mode 100644 index 39d26d113..000000000 --- a/data/hfopenllm_v2/bunnycore/Qwen-2.5-7B-Deep-Stock-v4/9aa1acb0-c791-4dea-aa1e-c912cea69466.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bunnycore_Qwen-2.5-7B-Deep-Stock-v4/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen-2.5-7B-Deep-Stock-v4", - "id": "bunnycore/Qwen-2.5-7B-Deep-Stock-v4", - "developer": "bunnycore", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.613 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7753 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5453 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4894 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3003 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4127 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4342 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/bunnycore/Qwen-2.5-7B-Deep-Stock-v5/0c1d66f3-8fd7-47f2-8538-a1aa8985aebf.json b/data/hfopenllm_v2/bunnycore/Qwen-2.5-7B-Deep-Stock-v5/0c1d66f3-8fd7-47f2-8538-a1aa8985aebf.json deleted file mode 100644 index 7ef80c9ec..000000000 --- a/data/hfopenllm_v2/bunnycore/Qwen-2.5-7B-Deep-Stock-v5/0c1d66f3-8fd7-47f2-8538-a1aa8985aebf.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bunnycore_Qwen-2.5-7B-Deep-Stock-v5/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen-2.5-7B-Deep-Stock-v5", - "id": "bunnycore/Qwen-2.5-7B-Deep-Stock-v5", - "developer": "bunnycore", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.613 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4509 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4672 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1473 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2701 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3648 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2832 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bunnycore/Qwen-2.5-7B-Exp-Sce/2872dcd9-421b-4346-812c-b27bb32c6e86.json b/data/hfopenllm_v2/bunnycore/Qwen-2.5-7B-Exp-Sce/2872dcd9-421b-4346-812c-b27bb32c6e86.json deleted file mode 100644 
index e05a367ea..000000000 --- a/data/hfopenllm_v2/bunnycore/Qwen-2.5-7B-Exp-Sce/2872dcd9-421b-4346-812c-b27bb32c6e86.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bunnycore_Qwen-2.5-7B-Exp-Sce/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen-2.5-7B-Exp-Sce", - "id": "bunnycore/Qwen-2.5-7B-Exp-Sce", - "developer": "bunnycore", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.613 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7652 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5506 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3255 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2987 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.443 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4259 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bunnycore/Qwen-2.5-7B-R1-Stock/2f3e2fc0-f1e0-43cb-8a8c-6aadcc538646.json b/data/hfopenllm_v2/bunnycore/Qwen-2.5-7B-R1-Stock/2f3e2fc0-f1e0-43cb-8a8c-6aadcc538646.json deleted file mode 100644 index 126635626..000000000 --- a/data/hfopenllm_v2/bunnycore/Qwen-2.5-7B-R1-Stock/2f3e2fc0-f1e0-43cb-8a8c-6aadcc538646.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/bunnycore_Qwen-2.5-7B-R1-Stock/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen-2.5-7B-R1-Stock", - "id": "bunnycore/Qwen-2.5-7B-R1-Stock", - "developer": "bunnycore", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.613 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7573 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5393 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5008 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2995 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3994 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4294 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bunnycore/Qwen-2.5-7B-Stock-Deep-Bespoke/d0a76497-84b0-45b9-b748-04ffe9bc13a3.json b/data/hfopenllm_v2/bunnycore/Qwen-2.5-7B-Stock-Deep-Bespoke/d0a76497-84b0-45b9-b748-04ffe9bc13a3.json deleted file mode 100644 index 4940eaf52..000000000 --- a/data/hfopenllm_v2/bunnycore/Qwen-2.5-7B-Stock-Deep-Bespoke/d0a76497-84b0-45b9-b748-04ffe9bc13a3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bunnycore_Qwen-2.5-7B-Stock-Deep-Bespoke/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": 
"documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen-2.5-7B-Stock-Deep-Bespoke", - "id": "bunnycore/Qwen-2.5-7B-Stock-Deep-Bespoke", - "developer": "bunnycore", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.613 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5206 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.492 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1888 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.281 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4068 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.358 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bunnycore/Qwen-2.5-7b-S1k/185b6560-6790-417f-aeba-f7405fee808a.json b/data/hfopenllm_v2/bunnycore/Qwen-2.5-7b-S1k/185b6560-6790-417f-aeba-f7405fee808a.json deleted file mode 100644 index 4b2686474..000000000 --- a/data/hfopenllm_v2/bunnycore/Qwen-2.5-7b-S1k/185b6560-6790-417f-aeba-f7405fee808a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bunnycore_Qwen-2.5-7b-S1k/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen-2.5-7b-S1k", - "id": "bunnycore/Qwen-2.5-7b-S1k", - "developer": "bunnycore", - 
"inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.613 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7162 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5563 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4781 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2844 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4071 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4382 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bunnycore/Qwen2.5-1.5B-Model-Stock/30a8074e-df03-4866-9b8d-a5a7eece3c71.json b/data/hfopenllm_v2/bunnycore/Qwen2.5-1.5B-Model-Stock/30a8074e-df03-4866-9b8d-a5a7eece3c71.json deleted file mode 100644 index 3a805a02c..000000000 --- a/data/hfopenllm_v2/bunnycore/Qwen2.5-1.5B-Model-Stock/30a8074e-df03-4866-9b8d-a5a7eece3c71.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bunnycore_Qwen2.5-1.5B-Model-Stock/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-1.5B-Model-Stock", - "id": "bunnycore/Qwen2.5-1.5B-Model-Stock", - "developer": "bunnycore", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.776 - } - }, - "evaluation_results": [ - { - "evaluation_name": 
"IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1829 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2874 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2592 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3674 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.11 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bunnycore/Qwen2.5-3B-Model-Stock-v2/ac8874ae-d6d6-45d3-aabc-06a3852f68d0.json b/data/hfopenllm_v2/bunnycore/Qwen2.5-3B-Model-Stock-v2/ac8874ae-d6d6-45d3-aabc-06a3852f68d0.json deleted file mode 100644 index edae05519..000000000 --- a/data/hfopenllm_v2/bunnycore/Qwen2.5-3B-Model-Stock-v2/ac8874ae-d6d6-45d3-aabc-06a3852f68d0.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bunnycore_Qwen2.5-3B-Model-Stock-v2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-3B-Model-Stock-v2", - "id": "bunnycore/Qwen2.5-3B-Model-Stock-v2", - "developer": "bunnycore", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.396 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": 
false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.649 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4677 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3867 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2869 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3915 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.327 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bunnycore/Qwen2.5-3B-Model-Stock-v3.1/bc98b048-18d4-438e-80c4-0cd851798da5.json b/data/hfopenllm_v2/bunnycore/Qwen2.5-3B-Model-Stock-v3.1/bc98b048-18d4-438e-80c4-0cd851798da5.json deleted file mode 100644 index eae5a80e4..000000000 --- a/data/hfopenllm_v2/bunnycore/Qwen2.5-3B-Model-Stock-v3.1/bc98b048-18d4-438e-80c4-0cd851798da5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bunnycore_Qwen2.5-3B-Model-Stock-v3.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-3B-Model-Stock-v3.1", - "id": "bunnycore/Qwen2.5-3B-Model-Stock-v3.1", - "developer": "bunnycore", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.396 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6481 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": 
"BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4737 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3897 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2844 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3968 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.329 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bunnycore/Qwen2.5-3B-Model-Stock-v3.2/c88c011f-0a24-4e78-a104-035d25af2430.json b/data/hfopenllm_v2/bunnycore/Qwen2.5-3B-Model-Stock-v3.2/c88c011f-0a24-4e78-a104-035d25af2430.json deleted file mode 100644 index 6dedfe6ac..000000000 --- a/data/hfopenllm_v2/bunnycore/Qwen2.5-3B-Model-Stock-v3.2/c88c011f-0a24-4e78-a104-035d25af2430.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bunnycore_Qwen2.5-3B-Model-Stock-v3.2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-3B-Model-Stock-v3.2", - "id": "bunnycore/Qwen2.5-3B-Model-Stock-v3.2", - "developer": "bunnycore", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.396 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6353 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", 
- "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4727 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3754 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2836 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3928 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3294 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bunnycore/Qwen2.5-3B-Model-Stock-v4.1/f9e3c31c-02c0-4f5e-ad4f-3be0801a0f41.json b/data/hfopenllm_v2/bunnycore/Qwen2.5-3B-Model-Stock-v4.1/f9e3c31c-02c0-4f5e-ad4f-3be0801a0f41.json deleted file mode 100644 index e50f85cd0..000000000 --- a/data/hfopenllm_v2/bunnycore/Qwen2.5-3B-Model-Stock-v4.1/f9e3c31c-02c0-4f5e-ad4f-3be0801a0f41.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bunnycore_Qwen2.5-3B-Model-Stock-v4.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-3B-Model-Stock-v4.1", - "id": "bunnycore/Qwen2.5-3B-Model-Stock-v4.1", - "developer": "bunnycore", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.396 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6381 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.482 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - 
"source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3769 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2794 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3941 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3387 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bunnycore/Qwen2.5-3B-Model-Stock/5484405a-2ec8-4515-af75-76a5dd348d3d.json b/data/hfopenllm_v2/bunnycore/Qwen2.5-3B-Model-Stock/5484405a-2ec8-4515-af75-76a5dd348d3d.json deleted file mode 100644 index 1e65375e1..000000000 --- a/data/hfopenllm_v2/bunnycore/Qwen2.5-3B-Model-Stock/5484405a-2ec8-4515-af75-76a5dd348d3d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bunnycore_Qwen2.5-3B-Model-Stock/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-3B-Model-Stock", - "id": "bunnycore/Qwen2.5-3B-Model-Stock", - "developer": "bunnycore", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.396 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6381 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4712 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3799 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2886 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3942 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.325 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bunnycore/Qwen2.5-3B-RP-Mix/7dc117b9-c2a2-44c1-8471-f3bc8a116e3e.json b/data/hfopenllm_v2/bunnycore/Qwen2.5-3B-RP-Mix/7dc117b9-c2a2-44c1-8471-f3bc8a116e3e.json deleted file mode 100644 index 3bc4e4b91..000000000 --- a/data/hfopenllm_v2/bunnycore/Qwen2.5-3B-RP-Mix/7dc117b9-c2a2-44c1-8471-f3bc8a116e3e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bunnycore_Qwen2.5-3B-RP-Mix/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-3B-RP-Mix", - "id": "bunnycore/Qwen2.5-3B-RP-Mix", - "developer": "bunnycore", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.397 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5721 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4894 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2153 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2735 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4284 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3728 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bunnycore/Qwen2.5-3B-RP-Thinker-V2/e2d314dd-b5b3-49b5-8e64-1e3464f4b963.json b/data/hfopenllm_v2/bunnycore/Qwen2.5-3B-RP-Thinker-V2/e2d314dd-b5b3-49b5-8e64-1e3464f4b963.json deleted file mode 100644 index 8be20f53d..000000000 --- a/data/hfopenllm_v2/bunnycore/Qwen2.5-3B-RP-Thinker-V2/e2d314dd-b5b3-49b5-8e64-1e3464f4b963.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bunnycore_Qwen2.5-3B-RP-Thinker-V2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-3B-RP-Thinker-V2", - "id": "bunnycore/Qwen2.5-3B-RP-Thinker-V2", - "developer": "bunnycore", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.397 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.642 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4678 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3829 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2852 - } - }, - { - 
"evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3981 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3271 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bunnycore/Qwen2.5-3B-RP-Thinker/7ecb453b-1ba7-44ec-abfd-1f8be4c817fd.json b/data/hfopenllm_v2/bunnycore/Qwen2.5-3B-RP-Thinker/7ecb453b-1ba7-44ec-abfd-1f8be4c817fd.json deleted file mode 100644 index aa4332eb8..000000000 --- a/data/hfopenllm_v2/bunnycore/Qwen2.5-3B-RP-Thinker/7ecb453b-1ba7-44ec-abfd-1f8be4c817fd.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bunnycore_Qwen2.5-3B-RP-Thinker/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-3B-RP-Thinker", - "id": "bunnycore/Qwen2.5-3B-RP-Thinker", - "developer": "bunnycore", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.397 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5894 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4164 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3353 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2643 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3287 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.315 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bunnycore/Qwen2.5-7B-CyberRombos/d0a70e95-fc72-41c6-ac42-09b8f379b566.json b/data/hfopenllm_v2/bunnycore/Qwen2.5-7B-CyberRombos/d0a70e95-fc72-41c6-ac42-09b8f379b566.json deleted file mode 100644 index d0560f8b6..000000000 --- a/data/hfopenllm_v2/bunnycore/Qwen2.5-7B-CyberRombos/d0a70e95-fc72-41c6-ac42-09b8f379b566.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bunnycore_Qwen2.5-7B-CyberRombos/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-7B-CyberRombos", - "id": "bunnycore/Qwen2.5-7B-CyberRombos", - "developer": "bunnycore", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7518 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5465 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4962 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3045 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4125 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": 
"hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4391 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bunnycore/Qwen2.5-7B-Fuse-Exp/e2ef8ea6-b464-445e-81df-ef0779c1d0d4.json b/data/hfopenllm_v2/bunnycore/Qwen2.5-7B-Fuse-Exp/e2ef8ea6-b464-445e-81df-ef0779c1d0d4.json deleted file mode 100644 index 3450ede55..000000000 --- a/data/hfopenllm_v2/bunnycore/Qwen2.5-7B-Fuse-Exp/e2ef8ea6-b464-445e-81df-ef0779c1d0d4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bunnycore_Qwen2.5-7B-Fuse-Exp/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-7B-Fuse-Exp", - "id": "bunnycore/Qwen2.5-7B-Fuse-Exp", - "developer": "bunnycore", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5469 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5109 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3142 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.276 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4573 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { 
- "score": 0.3309 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bunnycore/Qwen2.5-7B-Instruct-Fusion/f3d7cca2-141c-4b84-abc4-396ad2d59e3c.json b/data/hfopenllm_v2/bunnycore/Qwen2.5-7B-Instruct-Fusion/f3d7cca2-141c-4b84-abc4-396ad2d59e3c.json deleted file mode 100644 index 96f112f7d..000000000 --- a/data/hfopenllm_v2/bunnycore/Qwen2.5-7B-Instruct-Fusion/f3d7cca2-141c-4b84-abc4-396ad2d59e3c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bunnycore_Qwen2.5-7B-Instruct-Fusion/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-7B-Instruct-Fusion", - "id": "bunnycore/Qwen2.5-7B-Instruct-Fusion", - "developer": "bunnycore", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6962 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5492 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3406 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3045 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4297 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4467 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bunnycore/Qwen2.5-7B-Instruct-Merge-Stock-v0.1/e3f48d7a-c8a3-4e75-99d6-7f2946696b12.json 
b/data/hfopenllm_v2/bunnycore/Qwen2.5-7B-Instruct-Merge-Stock-v0.1/e3f48d7a-c8a3-4e75-99d6-7f2946696b12.json deleted file mode 100644 index 192a551c4..000000000 --- a/data/hfopenllm_v2/bunnycore/Qwen2.5-7B-Instruct-Merge-Stock-v0.1/e3f48d7a-c8a3-4e75-99d6-7f2946696b12.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bunnycore_Qwen2.5-7B-Instruct-Merge-Stock-v0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-7B-Instruct-Merge-Stock-v0.1", - "id": "bunnycore/Qwen2.5-7B-Instruct-Merge-Stock-v0.1", - "developer": "bunnycore", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.613 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7509 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5529 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4894 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3037 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4231 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4383 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bunnycore/Qwen2.5-7B-MixStock-Sce-V0.3/3feb9449-49a2-427f-a317-c21e6d1ca66c.json b/data/hfopenllm_v2/bunnycore/Qwen2.5-7B-MixStock-Sce-V0.3/3feb9449-49a2-427f-a317-c21e6d1ca66c.json deleted file mode 100644 index ce27f83f1..000000000 
--- a/data/hfopenllm_v2/bunnycore/Qwen2.5-7B-MixStock-Sce-V0.3/3feb9449-49a2-427f-a317-c21e6d1ca66c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bunnycore_Qwen2.5-7B-MixStock-Sce-V0.3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-7B-MixStock-Sce-V0.3", - "id": "bunnycore/Qwen2.5-7B-MixStock-Sce-V0.3", - "developer": "bunnycore", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.613 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.212 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3479 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2576 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2576 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3714 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1779 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bunnycore/Qwen2.5-7B-MixStock-V0.1/6359e37e-0405-436b-903c-8f0e740dd6c7.json b/data/hfopenllm_v2/bunnycore/Qwen2.5-7B-MixStock-V0.1/6359e37e-0405-436b-903c-8f0e740dd6c7.json deleted file mode 100644 index cda1c1002..000000000 --- a/data/hfopenllm_v2/bunnycore/Qwen2.5-7B-MixStock-V0.1/6359e37e-0405-436b-903c-8f0e740dd6c7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/bunnycore_Qwen2.5-7B-MixStock-V0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-7B-MixStock-V0.1", - "id": "bunnycore/Qwen2.5-7B-MixStock-V0.1", - "developer": "bunnycore", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.613 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7673 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5479 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3172 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3003 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4416 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4256 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bunnycore/Qwen2.5-7B-R1-Bespoke-Stock/f5daed76-f6e5-4a7d-84d7-80537a046b83.json b/data/hfopenllm_v2/bunnycore/Qwen2.5-7B-R1-Bespoke-Stock/f5daed76-f6e5-4a7d-84d7-80537a046b83.json deleted file mode 100644 index 7a2c17b90..000000000 --- a/data/hfopenllm_v2/bunnycore/Qwen2.5-7B-R1-Bespoke-Stock/f5daed76-f6e5-4a7d-84d7-80537a046b83.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bunnycore_Qwen2.5-7B-R1-Bespoke-Stock/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": 
"documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-7B-R1-Bespoke-Stock", - "id": "bunnycore/Qwen2.5-7B-R1-Bespoke-Stock", - "developer": "bunnycore", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.613 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3726 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4822 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2047 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2785 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3926 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3472 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bunnycore/Qwen2.5-7B-R1-Bespoke-Task/03af2b1d-989f-4afc-ab13-8793093b9c50.json b/data/hfopenllm_v2/bunnycore/Qwen2.5-7B-R1-Bespoke-Task/03af2b1d-989f-4afc-ab13-8793093b9c50.json deleted file mode 100644 index c62431a02..000000000 --- a/data/hfopenllm_v2/bunnycore/Qwen2.5-7B-R1-Bespoke-Task/03af2b1d-989f-4afc-ab13-8793093b9c50.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bunnycore_Qwen2.5-7B-R1-Bespoke-Task/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-7B-R1-Bespoke-Task", - "id": 
"bunnycore/Qwen2.5-7B-R1-Bespoke-Task", - "developer": "bunnycore", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.613 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3787 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.415 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1782 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2534 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3569 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2688 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bunnycore/Qwen2.5-7B-RRP-1M-Thinker/5db7ec54-7feb-4c11-b2e0-042226ba1f94.json b/data/hfopenllm_v2/bunnycore/Qwen2.5-7B-RRP-1M-Thinker/5db7ec54-7feb-4c11-b2e0-042226ba1f94.json deleted file mode 100644 index 8ddd86827..000000000 --- a/data/hfopenllm_v2/bunnycore/Qwen2.5-7B-RRP-1M-Thinker/5db7ec54-7feb-4c11-b2e0-042226ba1f94.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bunnycore_Qwen2.5-7B-RRP-1M-Thinker/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-7B-RRP-1M-Thinker", - "id": "bunnycore/Qwen2.5-7B-RRP-1M-Thinker", - "developer": "bunnycore", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - 
"params_billions": 7.613 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2308 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3482 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2719 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2576 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3767 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1769 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bunnycore/Qwen2.5-7B-RRP-1M/f1f5615d-8a78-43c9-b5c6-edc180252381.json b/data/hfopenllm_v2/bunnycore/Qwen2.5-7B-RRP-1M/f1f5615d-8a78-43c9-b5c6-edc180252381.json deleted file mode 100644 index 80b3d9332..000000000 --- a/data/hfopenllm_v2/bunnycore/Qwen2.5-7B-RRP-1M/f1f5615d-8a78-43c9-b5c6-edc180252381.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bunnycore_Qwen2.5-7B-RRP-1M/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-7B-RRP-1M", - "id": "bunnycore/Qwen2.5-7B-RRP-1M", - "developer": "bunnycore", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.613 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": 
"Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7481 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5452 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3248 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3029 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4483 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4266 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bunnycore/Qwen2.5-7B-RRP-ID/9c89bf8f-4b8a-4c01-8685-fafc687c673e.json b/data/hfopenllm_v2/bunnycore/Qwen2.5-7B-RRP-ID/9c89bf8f-4b8a-4c01-8685-fafc687c673e.json deleted file mode 100644 index ebca5e44d..000000000 --- a/data/hfopenllm_v2/bunnycore/Qwen2.5-7B-RRP-ID/9c89bf8f-4b8a-4c01-8685-fafc687c673e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bunnycore_Qwen2.5-7B-RRP-ID/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-7B-RRP-ID", - "id": "bunnycore/Qwen2.5-7B-RRP-ID", - "developer": "bunnycore", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7473 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - 
"source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.548 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4864 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2827 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.418 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4387 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bunnycore/Qwen2.5-7B-Sky-R1-Mini/58b69c0f-826d-414f-915e-dd0b78d9298c.json b/data/hfopenllm_v2/bunnycore/Qwen2.5-7B-Sky-R1-Mini/58b69c0f-826d-414f-915e-dd0b78d9298c.json deleted file mode 100644 index 592f70f7e..000000000 --- a/data/hfopenllm_v2/bunnycore/Qwen2.5-7B-Sky-R1-Mini/58b69c0f-826d-414f-915e-dd0b78d9298c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bunnycore_Qwen2.5-7B-Sky-R1-Mini/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-7B-Sky-R1-Mini", - "id": "bunnycore/Qwen2.5-7B-Sky-R1-Mini", - "developer": "bunnycore", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2305 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - 
}, - "score_details": { - "score": 0.3503 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0295 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2894 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3448 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1253 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bunnycore/QwenMosaic-7B/101ea548-2ffe-4f47-b3b5-5fbe9a3854b4.json b/data/hfopenllm_v2/bunnycore/QwenMosaic-7B/101ea548-2ffe-4f47-b3b5-5fbe9a3854b4.json deleted file mode 100644 index c1f5056f6..000000000 --- a/data/hfopenllm_v2/bunnycore/QwenMosaic-7B/101ea548-2ffe-4f47-b3b5-5fbe9a3854b4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bunnycore_QwenMosaic-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "QwenMosaic-7B", - "id": "bunnycore/QwenMosaic-7B", - "developer": "bunnycore", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5819 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5564 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": 
"Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4441 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2609 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4164 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.431 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bunnycore/Smol-Llama-3.2-3B/259c4798-ff03-4f58-8fb4-59150710212b.json b/data/hfopenllm_v2/bunnycore/Smol-Llama-3.2-3B/259c4798-ff03-4f58-8fb4-59150710212b.json deleted file mode 100644 index 3ffc953ab..000000000 --- a/data/hfopenllm_v2/bunnycore/Smol-Llama-3.2-3B/259c4798-ff03-4f58-8fb4-59150710212b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bunnycore_Smol-Llama-3.2-3B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Smol-Llama-3.2-3B", - "id": "bunnycore/Smol-Llama-3.2-3B", - "developer": "bunnycore", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.607 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6679 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4539 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1382 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - 
"source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2768 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.346 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3228 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bunnycore/SmolLM2-1.7-Persona/f731caa1-f777-494a-8490-da0c815f0708.json b/data/hfopenllm_v2/bunnycore/SmolLM2-1.7-Persona/f731caa1-f777-494a-8490-da0c815f0708.json deleted file mode 100644 index 1cde2b7fa..000000000 --- a/data/hfopenllm_v2/bunnycore/SmolLM2-1.7-Persona/f731caa1-f777-494a-8490-da0c815f0708.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bunnycore_SmolLM2-1.7-Persona/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SmolLM2-1.7-Persona", - "id": "bunnycore/SmolLM2-1.7-Persona", - "developer": "bunnycore", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.711 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5465 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3623 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0566 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.2634 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3341 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1974 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bunnycore/SmolLM2-1.7B-roleplay-lora/d4d25d38-b21a-490e-9ca9-556504ec00ea.json b/data/hfopenllm_v2/bunnycore/SmolLM2-1.7B-roleplay-lora/d4d25d38-b21a-490e-9ca9-556504ec00ea.json deleted file mode 100644 index fc1b7c8b5..000000000 --- a/data/hfopenllm_v2/bunnycore/SmolLM2-1.7B-roleplay-lora/d4d25d38-b21a-490e-9ca9-556504ec00ea.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bunnycore_SmolLM2-1.7B-roleplay-lora/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SmolLM2-1.7B-roleplay-lora", - "id": "bunnycore/SmolLM2-1.7B-roleplay-lora", - "developer": "bunnycore", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "?", - "params_billions": 3.423 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5382 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.361 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0529 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2752 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3395 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1966 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/bunnycore/Tulu-3.1-8B-SuperNova/75bb85a3-40bb-4630-95a0-50e40b008412.json b/data/hfopenllm_v2/bunnycore/Tulu-3.1-8B-SuperNova/75bb85a3-40bb-4630-95a0-50e40b008412.json deleted file mode 100644 index c128cbf2c..000000000 --- a/data/hfopenllm_v2/bunnycore/Tulu-3.1-8B-SuperNova/75bb85a3-40bb-4630-95a0-50e40b008412.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/bunnycore_Tulu-3.1-8B-SuperNova/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Tulu-3.1-8B-SuperNova", - "id": "bunnycore/Tulu-3.1-8B-SuperNova", - "developer": "bunnycore", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8194 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5254 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2462 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.302 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3935 - } - }, - { - "evaluation_name": "MMLU-PRO", - 
"source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3814 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/byroneverson/Mistral-Small-Instruct-2409-abliterated/bb44f3ef-eefa-48ef-a257-2eb345c89a00.json b/data/hfopenllm_v2/byroneverson/Mistral-Small-Instruct-2409-abliterated/bb44f3ef-eefa-48ef-a257-2eb345c89a00.json deleted file mode 100644 index 096e9bd79..000000000 --- a/data/hfopenllm_v2/byroneverson/Mistral-Small-Instruct-2409-abliterated/bb44f3ef-eefa-48ef-a257-2eb345c89a00.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/byroneverson_Mistral-Small-Instruct-2409-abliterated/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral-Small-Instruct-2409-abliterated", - "id": "byroneverson/Mistral-Small-Instruct-2409-abliterated", - "developer": "byroneverson", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 22.247 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6971 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5238 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2477 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3331 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3697 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": 
"TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3923 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/byroneverson/Yi-1.5-9B-Chat-16K-abliterated/2dcf1771-3dbe-43ad-974c-54e2e2860bcc.json b/data/hfopenllm_v2/byroneverson/Yi-1.5-9B-Chat-16K-abliterated/2dcf1771-3dbe-43ad-974c-54e2e2860bcc.json deleted file mode 100644 index 7084716cd..000000000 --- a/data/hfopenllm_v2/byroneverson/Yi-1.5-9B-Chat-16K-abliterated/2dcf1771-3dbe-43ad-974c-54e2e2860bcc.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/byroneverson_Yi-1.5-9B-Chat-16K-abliterated/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Yi-1.5-9B-Chat-16K-abliterated", - "id": "byroneverson/Yi-1.5-9B-Chat-16K-abliterated", - "developer": "byroneverson", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.829 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5528 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5282 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1412 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3129 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4734 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3823 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/byroneverson/Yi-1.5-9B-Chat-abliterated/caa0c8df-5488-4bf9-a5b8-0fff831e6732.json b/data/hfopenllm_v2/byroneverson/Yi-1.5-9B-Chat-abliterated/caa0c8df-5488-4bf9-a5b8-0fff831e6732.json deleted file mode 100644 index 0098a2f51..000000000 --- a/data/hfopenllm_v2/byroneverson/Yi-1.5-9B-Chat-abliterated/caa0c8df-5488-4bf9-a5b8-0fff831e6732.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/byroneverson_Yi-1.5-9B-Chat-abliterated/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Yi-1.5-9B-Chat-abliterated", - "id": "byroneverson/Yi-1.5-9B-Chat-abliterated", - "developer": "byroneverson", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.829 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5723 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5401 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1662 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2919 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4389 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3715 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/c10x/Q-Pluse/c6f8e581-e849-4e28-b3a6-1838ee522770.json b/data/hfopenllm_v2/c10x/Q-Pluse/c6f8e581-e849-4e28-b3a6-1838ee522770.json deleted file mode 100644 index 10eb44f61..000000000 --- a/data/hfopenllm_v2/c10x/Q-Pluse/c6f8e581-e849-4e28-b3a6-1838ee522770.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/c10x_Q-Pluse/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Q-Pluse", - "id": "c10x/Q-Pluse", - "developer": "c10x", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1123 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2875 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2466 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3938 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1135 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/c10x/longthinker/f0c361a1-a3ac-4415-ab5d-069bdf27e7a3.json b/data/hfopenllm_v2/c10x/longthinker/f0c361a1-a3ac-4415-ab5d-069bdf27e7a3.json deleted file mode 100644 index c9bd5fd8a..000000000 --- a/data/hfopenllm_v2/c10x/longthinker/f0c361a1-a3ac-4415-ab5d-069bdf27e7a3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": 
"0.2.0", - "evaluation_id": "hfopenllm_v2/c10x_longthinker/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "longthinker", - "id": "c10x/longthinker", - "developer": "c10x", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3609 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4927 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2319 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2643 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.391 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3527 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/carsenk/flippa-v6/44129be7-f73d-4580-8375-e8ef324e73a8.json b/data/hfopenllm_v2/carsenk/flippa-v6/44129be7-f73d-4580-8375-e8ef324e73a8.json deleted file mode 100644 index 5fe1e9b7d..000000000 --- a/data/hfopenllm_v2/carsenk/flippa-v6/44129be7-f73d-4580-8375-e8ef324e73a8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/carsenk_flippa-v6/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - 
"model_info": { - "name": "flippa-v6", - "id": "carsenk/flippa-v6", - "developer": "carsenk", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "?", - "params_billions": 16.061 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3439 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5047 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1405 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2928 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4089 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3668 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/carsenk/phi3.5_mini_exp_825_uncensored/2925ecde-a9a5-4369-b391-d23a8605d35c.json b/data/hfopenllm_v2/carsenk/phi3.5_mini_exp_825_uncensored/2925ecde-a9a5-4369-b391-d23a8605d35c.json deleted file mode 100644 index e36d77eec..000000000 --- a/data/hfopenllm_v2/carsenk/phi3.5_mini_exp_825_uncensored/2925ecde-a9a5-4369-b391-d23a8605d35c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/carsenk_phi3.5_mini_exp_825_uncensored/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "phi3.5_mini_exp_825_uncensored", - "id": "carsenk/phi3.5_mini_exp_825_uncensored", - "developer": "carsenk", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": 
"LlamaForCausalLM", - "params_billions": 3.821 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1364 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2965 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0106 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2492 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3644 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1175 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/cat-searcher/gemma-2-9b-it-sppo-iter-1-evol-1/8409e464-fd16-4b41-b533-2f6cae4fe894.json b/data/hfopenllm_v2/cat-searcher/gemma-2-9b-it-sppo-iter-1-evol-1/8409e464-fd16-4b41-b533-2f6cae4fe894.json deleted file mode 100644 index 0abe1f2fd..000000000 --- a/data/hfopenllm_v2/cat-searcher/gemma-2-9b-it-sppo-iter-1-evol-1/8409e464-fd16-4b41-b533-2f6cae4fe894.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/cat-searcher_gemma-2-9b-it-sppo-iter-1-evol-1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "gemma-2-9b-it-sppo-iter-1-evol-1", - "id": "cat-searcher/gemma-2-9b-it-sppo-iter-1-evol-1", - "developer": "cat-searcher", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 9.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - 
"dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2942 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5939 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0853 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3406 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3926 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.38 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/cat-searcher/gemma-2-9b-it-sppo-iter-1/86f6c6eb-8b08-4e6c-a1bc-0d941a00f10b.json b/data/hfopenllm_v2/cat-searcher/gemma-2-9b-it-sppo-iter-1/86f6c6eb-8b08-4e6c-a1bc-0d941a00f10b.json deleted file mode 100644 index a688e85bd..000000000 --- a/data/hfopenllm_v2/cat-searcher/gemma-2-9b-it-sppo-iter-1/86f6c6eb-8b08-4e6c-a1bc-0d941a00f10b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/cat-searcher_gemma-2-9b-it-sppo-iter-1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "gemma-2-9b-it-sppo-iter-1", - "id": "cat-searcher/gemma-2-9b-it-sppo-iter-1", - "developer": "cat-searcher", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 9.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3015 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5972 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0831 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3448 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3927 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3854 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/cckm/tinymistral_950m/aa2e6df7-a0b0-42f7-8057-e2763fc34834.json b/data/hfopenllm_v2/cckm/tinymistral_950m/aa2e6df7-a0b0-42f7-8057-e2763fc34834.json deleted file mode 100644 index c5f1e1fe2..000000000 --- a/data/hfopenllm_v2/cckm/tinymistral_950m/aa2e6df7-a0b0-42f7-8057-e2763fc34834.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/cckm_tinymistral_950m/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "tinymistral_950m", - "id": "cckm/tinymistral_950m", - "developer": "cckm", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 0.955 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2395 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2969 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0053 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2601 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3554 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1096 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/cgato/TheSalt-L3-8b-v0.3.2/2bf9a06e-f3bf-4b55-804b-e553a722e0de.json b/data/hfopenllm_v2/cgato/TheSalt-L3-8b-v0.3.2/2bf9a06e-f3bf-4b55-804b-e553a722e0de.json deleted file mode 100644 index 56d6c2c5f..000000000 --- a/data/hfopenllm_v2/cgato/TheSalt-L3-8b-v0.3.2/2bf9a06e-f3bf-4b55-804b-e553a722e0de.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/cgato_TheSalt-L3-8b-v0.3.2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "TheSalt-L3-8b-v0.3.2", - "id": "cgato/TheSalt-L3-8b-v0.3.2", - "developer": "cgato", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2705 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2968 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - 
"dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0476 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2659 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3896 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1139 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/chargoddard/prometheus-2-llama-3-8b/b380a675-39ea-4950-ad0a-d9771f09ddde.json b/data/hfopenllm_v2/chargoddard/prometheus-2-llama-3-8b/b380a675-39ea-4950-ad0a-d9771f09ddde.json deleted file mode 100644 index fbec5b2fc..000000000 --- a/data/hfopenllm_v2/chargoddard/prometheus-2-llama-3-8b/b380a675-39ea-4950-ad0a-d9771f09ddde.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/chargoddard_prometheus-2-llama-3-8b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "prometheus-2-llama-3-8b", - "id": "chargoddard/prometheus-2-llama-3-8b", - "developer": "chargoddard", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5289 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4931 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0823 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2727 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3396 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3087 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/chujiezheng/Llama-3-Instruct-8B-SimPO-ExPO/482358eb-7d3b-4de0-b5d9-451308f104e2.json b/data/hfopenllm_v2/chujiezheng/Llama-3-Instruct-8B-SimPO-ExPO/482358eb-7d3b-4de0-b5d9-451308f104e2.json deleted file mode 100644 index 122ad0855..000000000 --- a/data/hfopenllm_v2/chujiezheng/Llama-3-Instruct-8B-SimPO-ExPO/482358eb-7d3b-4de0-b5d9-451308f104e2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/chujiezheng_Llama-3-Instruct-8B-SimPO-ExPO/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-Instruct-8B-SimPO-ExPO", - "id": "chujiezheng/Llama-3-Instruct-8B-SimPO-ExPO", - "developer": "chujiezheng", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6434 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4765 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0702 - } - }, - { - 
"evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2869 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.392 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3401 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/chujiezheng/Mistral7B-PairRM-SPPO-ExPO/ef04a83d-7b89-43ec-ba33-30e1006422dc.json b/data/hfopenllm_v2/chujiezheng/Mistral7B-PairRM-SPPO-ExPO/ef04a83d-7b89-43ec-ba33-30e1006422dc.json deleted file mode 100644 index c2d924a8a..000000000 --- a/data/hfopenllm_v2/chujiezheng/Mistral7B-PairRM-SPPO-ExPO/ef04a83d-7b89-43ec-ba33-30e1006422dc.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/chujiezheng_Mistral7B-PairRM-SPPO-ExPO/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral7B-PairRM-SPPO-ExPO", - "id": "chujiezheng/Mistral7B-PairRM-SPPO-ExPO", - "developer": "chujiezheng", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3673 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3882 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0181 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": 
"Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2768 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4055 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2552 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/cjvt/GaMS-1B/7b64cf2e-c7c6-4b48-8e51-ea2aa0914145.json b/data/hfopenllm_v2/cjvt/GaMS-1B/7b64cf2e-c7c6-4b48-8e51-ea2aa0914145.json deleted file mode 100644 index 9129e2209..000000000 --- a/data/hfopenllm_v2/cjvt/GaMS-1B/7b64cf2e-c7c6-4b48-8e51-ea2aa0914145.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/cjvt_GaMS-1B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "GaMS-1B", - "id": "cjvt/GaMS-1B", - "developer": "cjvt", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "OPTForCausalLM", - "params_billions": 1.54 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1635 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3075 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0136 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2584 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3684 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1149 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/cloudyu/Llama-3-70Bx2-MOE/52c8e3f4-1063-4d9c-80d9-fdd0a72fc98e.json b/data/hfopenllm_v2/cloudyu/Llama-3-70Bx2-MOE/52c8e3f4-1063-4d9c-80d9-fdd0a72fc98e.json deleted file mode 100644 index 91fc09dc7..000000000 --- a/data/hfopenllm_v2/cloudyu/Llama-3-70Bx2-MOE/52c8e3f4-1063-4d9c-80d9-fdd0a72fc98e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/cloudyu_Llama-3-70Bx2-MOE/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-70Bx2-MOE", - "id": "cloudyu/Llama-3-70Bx2-MOE", - "developer": "cloudyu", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MixtralForCausalLM", - "params_billions": 126.926 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5482 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6636 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2175 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3935 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4812 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - 
"dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5142 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/cloudyu/Llama-3.2-3Bx4/1f4a827d-31cd-42e6-871d-7c0cad010f58.json b/data/hfopenllm_v2/cloudyu/Llama-3.2-3Bx4/1f4a827d-31cd-42e6-871d-7c0cad010f58.json deleted file mode 100644 index 2d3c69d77..000000000 --- a/data/hfopenllm_v2/cloudyu/Llama-3.2-3Bx4/1f4a827d-31cd-42e6-871d-7c0cad010f58.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/cloudyu_Llama-3.2-3Bx4/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.2-3Bx4", - "id": "cloudyu/Llama-3.2-3Bx4", - "developer": "cloudyu", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MixtralForCausalLM", - "params_billions": 9.949 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5069 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4332 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1073 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2777 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3496 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.2985 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/cloudyu/Mixtral_11Bx2_MoE_19B/56d6d99c-fba1-42e7-aad4-631370b44da3.json b/data/hfopenllm_v2/cloudyu/Mixtral_11Bx2_MoE_19B/56d6d99c-fba1-42e7-aad4-631370b44da3.json deleted file mode 100644 index 1913318a3..000000000 --- a/data/hfopenllm_v2/cloudyu/Mixtral_11Bx2_MoE_19B/56d6d99c-fba1-42e7-aad4-631370b44da3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/cloudyu_Mixtral_11Bx2_MoE_19B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mixtral_11Bx2_MoE_19B", - "id": "cloudyu/Mixtral_11Bx2_MoE_19B", - "developer": "cloudyu", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MixtralForCausalLM", - "params_billions": 19.188 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3851 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5209 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0672 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2903 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4297 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3311 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/cloudyu/Mixtral_34Bx2_MoE_60B/006a0ac7-d6c3-42c1-b0cc-6a0bfe74f884.json 
b/data/hfopenllm_v2/cloudyu/Mixtral_34Bx2_MoE_60B/006a0ac7-d6c3-42c1-b0cc-6a0bfe74f884.json deleted file mode 100644 index 10f736b8c..000000000 --- a/data/hfopenllm_v2/cloudyu/Mixtral_34Bx2_MoE_60B/006a0ac7-d6c3-42c1-b0cc-6a0bfe74f884.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/cloudyu_Mixtral_34Bx2_MoE_60B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mixtral_34Bx2_MoE_60B", - "id": "cloudyu/Mixtral_34Bx2_MoE_60B", - "developer": "cloudyu", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MixtralForCausalLM", - "params_billions": 60.814 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4538 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.587 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.077 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3381 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4625 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4766 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/cloudyu/Mixtral_7Bx2_MoE/33a82686-6202-4a4d-ba34-bd4537105e5f.json b/data/hfopenllm_v2/cloudyu/Mixtral_7Bx2_MoE/33a82686-6202-4a4d-ba34-bd4537105e5f.json deleted file mode 100644 index 3cdeaa925..000000000 --- a/data/hfopenllm_v2/cloudyu/Mixtral_7Bx2_MoE/33a82686-6202-4a4d-ba34-bd4537105e5f.json +++ /dev/null @@ 
-1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/cloudyu_Mixtral_7Bx2_MoE/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mixtral_7Bx2_MoE", - "id": "cloudyu/Mixtral_7Bx2_MoE", - "developer": "cloudyu", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MixtralForCausalLM", - "params_billions": 12.879 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.448 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.516 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0687 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3054 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4473 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3044 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/cloudyu/S1-Llama-3.2-3Bx4-MoE/38d45554-44bd-4b40-b7c9-c0b7ba44b862.json b/data/hfopenllm_v2/cloudyu/S1-Llama-3.2-3Bx4-MoE/38d45554-44bd-4b40-b7c9-c0b7ba44b862.json deleted file mode 100644 index 7127e4c67..000000000 --- a/data/hfopenllm_v2/cloudyu/S1-Llama-3.2-3Bx4-MoE/38d45554-44bd-4b40-b7c9-c0b7ba44b862.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/cloudyu_S1-Llama-3.2-3Bx4-MoE/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": 
"documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "S1-Llama-3.2-3Bx4-MoE", - "id": "cloudyu/S1-Llama-3.2-3Bx4-MoE", - "developer": "cloudyu", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MixtralForCausalLM", - "params_billions": 9.555 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5302 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4358 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1201 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2936 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3456 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3044 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/cloudyu/Yi-34Bx2-MoE-60B-DPO/37d7e3ab-db9c-4ad7-81d1-933c030a6250.json b/data/hfopenllm_v2/cloudyu/Yi-34Bx2-MoE-60B-DPO/37d7e3ab-db9c-4ad7-81d1-933c030a6250.json deleted file mode 100644 index 44ea55fed..000000000 --- a/data/hfopenllm_v2/cloudyu/Yi-34Bx2-MoE-60B-DPO/37d7e3ab-db9c-4ad7-81d1-933c030a6250.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/cloudyu_Yi-34Bx2-MoE-60B-DPO/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Yi-34Bx2-MoE-60B-DPO", - "id": "cloudyu/Yi-34Bx2-MoE-60B-DPO", - "developer": "cloudyu", - 
"inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MixtralForCausalLM", - "params_billions": 60.814 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5319 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5168 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0702 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3221 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4375 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4677 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/cluebbers/Llama-3.1-8B-paraphrase-type-generation-apty-ipo/9cc49b3c-4e51-4f67-92ea-4ac8a3cbed43.json b/data/hfopenllm_v2/cluebbers/Llama-3.1-8B-paraphrase-type-generation-apty-ipo/9cc49b3c-4e51-4f67-92ea-4ac8a3cbed43.json deleted file mode 100644 index 3ef6239d5..000000000 --- a/data/hfopenllm_v2/cluebbers/Llama-3.1-8B-paraphrase-type-generation-apty-ipo/9cc49b3c-4e51-4f67-92ea-4ac8a3cbed43.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/cluebbers_Llama-3.1-8B-paraphrase-type-generation-apty-ipo/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.1-8B-paraphrase-type-generation-apty-ipo", - "id": "cluebbers/Llama-3.1-8B-paraphrase-type-generation-apty-ipo", - "developer": "cluebbers", - "inference_platform": "unknown", - "additional_details": { - 
"precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1327 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.38 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0249 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2634 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4332 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2591 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/cluebbers/Llama-3.1-8B-paraphrase-type-generation-apty-sigmoid/b6bd8515-4c95-40ce-b2d5-af8873d261ab.json b/data/hfopenllm_v2/cluebbers/Llama-3.1-8B-paraphrase-type-generation-apty-sigmoid/b6bd8515-4c95-40ce-b2d5-af8873d261ab.json deleted file mode 100644 index d349e2332..000000000 --- a/data/hfopenllm_v2/cluebbers/Llama-3.1-8B-paraphrase-type-generation-apty-sigmoid/b6bd8515-4c95-40ce-b2d5-af8873d261ab.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/cluebbers_Llama-3.1-8B-paraphrase-type-generation-apty-sigmoid/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.1-8B-paraphrase-type-generation-apty-sigmoid", - "id": "cluebbers/Llama-3.1-8B-paraphrase-type-generation-apty-sigmoid", - "developer": "cluebbers", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": 
"LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1318 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3789 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0264 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2685 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4306 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2562 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/cluebbers/Llama-3.1-8B-paraphrase-type-generation-etpc/d102e75d-3e20-482b-a243-bae3ec44e2bb.json b/data/hfopenllm_v2/cluebbers/Llama-3.1-8B-paraphrase-type-generation-etpc/d102e75d-3e20-482b-a243-bae3ec44e2bb.json deleted file mode 100644 index 8314442a4..000000000 --- a/data/hfopenllm_v2/cluebbers/Llama-3.1-8B-paraphrase-type-generation-etpc/d102e75d-3e20-482b-a243-bae3ec44e2bb.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/cluebbers_Llama-3.1-8B-paraphrase-type-generation-etpc/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.1-8B-paraphrase-type-generation-etpc", - "id": "cluebbers/Llama-3.1-8B-paraphrase-type-generation-etpc", - "developer": "cluebbers", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - 
"evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1209 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3781 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0196 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2651 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4319 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2556 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/cognitivecomputations/Dolphin3.0-Llama3.1-8B/68920da1-af71-4ccd-88b9-554e3c72c4dc.json b/data/hfopenllm_v2/cognitivecomputations/Dolphin3.0-Llama3.1-8B/68920da1-af71-4ccd-88b9-554e3c72c4dc.json deleted file mode 100644 index e9c8f9f33..000000000 --- a/data/hfopenllm_v2/cognitivecomputations/Dolphin3.0-Llama3.1-8B/68920da1-af71-4ccd-88b9-554e3c72c4dc.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/cognitivecomputations_Dolphin3.0-Llama3.1-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Dolphin3.0-Llama3.1-8B", - "id": "cognitivecomputations/Dolphin3.0-Llama3.1-8B", - "developer": "cognitivecomputations", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": 
{ - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7621 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4916 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1231 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2827 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3653 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2992 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/cognitivecomputations/Dolphin3.0-Llama3.2-1B/c0eb144f-c726-4a80-bce9-384fb7a641a7.json b/data/hfopenllm_v2/cognitivecomputations/Dolphin3.0-Llama3.2-1B/c0eb144f-c726-4a80-bce9-384fb7a641a7.json deleted file mode 100644 index feeca651a..000000000 --- a/data/hfopenllm_v2/cognitivecomputations/Dolphin3.0-Llama3.2-1B/c0eb144f-c726-4a80-bce9-384fb7a641a7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/cognitivecomputations_Dolphin3.0-Llama3.2-1B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Dolphin3.0-Llama3.2-1B", - "id": "cognitivecomputations/Dolphin3.0-Llama3.2-1B", - "developer": "cognitivecomputations", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.236 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.5428 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3122 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0279 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2299 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3249 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1375 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/cognitivecomputations/Dolphin3.0-Qwen2.5-0.5B/0b26f82d-36f6-4fd0-a0fd-05e4a1368a6e.json b/data/hfopenllm_v2/cognitivecomputations/Dolphin3.0-Qwen2.5-0.5B/0b26f82d-36f6-4fd0-a0fd-05e4a1368a6e.json deleted file mode 100644 index cf6e079cd..000000000 --- a/data/hfopenllm_v2/cognitivecomputations/Dolphin3.0-Qwen2.5-0.5B/0b26f82d-36f6-4fd0-a0fd-05e4a1368a6e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/cognitivecomputations_Dolphin3.0-Qwen2.5-0.5B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Dolphin3.0-Qwen2.5-0.5B", - "id": "cognitivecomputations/Dolphin3.0-Qwen2.5-0.5B", - "developer": "cognitivecomputations", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.494 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4697 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": 
"hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3114 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0514 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2349 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3555 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1413 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/cognitivecomputations/Dolphin3.0-R1-Mistral-24B/8fe4360a-0924-4386-b4cd-89069f7ff55f.json b/data/hfopenllm_v2/cognitivecomputations/Dolphin3.0-R1-Mistral-24B/8fe4360a-0924-4386-b4cd-89069f7ff55f.json deleted file mode 100644 index 5e1bed350..000000000 --- a/data/hfopenllm_v2/cognitivecomputations/Dolphin3.0-R1-Mistral-24B/8fe4360a-0924-4386-b4cd-89069f7ff55f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/cognitivecomputations_Dolphin3.0-R1-Mistral-24B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Dolphin3.0-R1-Mistral-24B", - "id": "cognitivecomputations/Dolphin3.0-R1-Mistral-24B", - "developer": "cognitivecomputations", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 23.572 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4068 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.536 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3119 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2945 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3952 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3005 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/cognitivecomputations/dolphin-2.9-llama3-8b/eeeb082b-7112-4a08-a87a-b2c9ae37efff.json b/data/hfopenllm_v2/cognitivecomputations/dolphin-2.9-llama3-8b/eeeb082b-7112-4a08-a87a-b2c9ae37efff.json deleted file mode 100644 index e996e7392..000000000 --- a/data/hfopenllm_v2/cognitivecomputations/dolphin-2.9-llama3-8b/eeeb082b-7112-4a08-a87a-b2c9ae37efff.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/cognitivecomputations_dolphin-2.9-llama3-8b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "dolphin-2.9-llama3-8b", - "id": "cognitivecomputations/dolphin-2.9-llama3-8b", - "developer": "cognitivecomputations", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.385 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.495 - } - }, - { - 
"evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0574 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2869 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4375 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2771 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/cognitivecomputations/dolphin-2.9.1-llama-3-70b/b8f933e9-867f-4934-9648-371d1e632116.json b/data/hfopenllm_v2/cognitivecomputations/dolphin-2.9.1-llama-3-70b/b8f933e9-867f-4934-9648-371d1e632116.json deleted file mode 100644 index 96525d9b7..000000000 --- a/data/hfopenllm_v2/cognitivecomputations/dolphin-2.9.1-llama-3-70b/b8f933e9-867f-4934-9648-371d1e632116.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/cognitivecomputations_dolphin-2.9.1-llama-3-70b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "dolphin-2.9.1-llama-3-70b", - "id": "cognitivecomputations/dolphin-2.9.1-llama-3-70b", - "developer": "cognitivecomputations", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 70.554 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.376 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5205 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": 
"DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.182 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3087 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4976 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.413 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/cognitivecomputations/dolphin-2.9.1-yi-1.5-34b/8d225023-4b7e-48cd-ae67-6d00b541f17d.json b/data/hfopenllm_v2/cognitivecomputations/dolphin-2.9.1-yi-1.5-34b/8d225023-4b7e-48cd-ae67-6d00b541f17d.json deleted file mode 100644 index e70bb20f7..000000000 --- a/data/hfopenllm_v2/cognitivecomputations/dolphin-2.9.1-yi-1.5-34b/8d225023-4b7e-48cd-ae67-6d00b541f17d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/cognitivecomputations_dolphin-2.9.1-yi-1.5-34b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "dolphin-2.9.1-yi-1.5-34b", - "id": "cognitivecomputations/dolphin-2.9.1-yi-1.5-34b", - "developer": "cognitivecomputations", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 34.389 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3853 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6076 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": 
false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1866 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3431 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4598 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4519 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/cognitivecomputations/dolphin-2.9.1-yi-1.5-9b/ee3b45e7-a5d6-4fa8-8abd-f6a77d5a6d5b.json b/data/hfopenllm_v2/cognitivecomputations/dolphin-2.9.1-yi-1.5-9b/ee3b45e7-a5d6-4fa8-8abd-f6a77d5a6d5b.json deleted file mode 100644 index d916c8a8c..000000000 --- a/data/hfopenllm_v2/cognitivecomputations/dolphin-2.9.1-yi-1.5-9b/ee3b45e7-a5d6-4fa8-8abd-f6a77d5a6d5b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/cognitivecomputations_dolphin-2.9.1-yi-1.5-9b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "dolphin-2.9.1-yi-1.5-9b", - "id": "cognitivecomputations/dolphin-2.9.1-yi-1.5-9b", - "developer": "cognitivecomputations", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.829 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4465 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5484 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1518 - } - }, - { - 
"evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3381 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4348 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3967 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/cognitivecomputations/dolphin-2.9.2-Phi-3-Medium-abliterated/177ef040-da5c-4a65-adac-efdc555bd110.json b/data/hfopenllm_v2/cognitivecomputations/dolphin-2.9.2-Phi-3-Medium-abliterated/177ef040-da5c-4a65-adac-efdc555bd110.json deleted file mode 100644 index 2947eef38..000000000 --- a/data/hfopenllm_v2/cognitivecomputations/dolphin-2.9.2-Phi-3-Medium-abliterated/177ef040-da5c-4a65-adac-efdc555bd110.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/cognitivecomputations_dolphin-2.9.2-Phi-3-Medium-abliterated/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "dolphin-2.9.2-Phi-3-Medium-abliterated", - "id": "cognitivecomputations/dolphin-2.9.2-Phi-3-Medium-abliterated", - "developer": "cognitivecomputations", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 13.96 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3613 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6123 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1239 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - 
"dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.328 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4112 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4494 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/cognitivecomputations/dolphin-2.9.2-Phi-3-Medium-abliterated/e9dc8337-eb35-4eb9-bca7-30ec1cd44092.json b/data/hfopenllm_v2/cognitivecomputations/dolphin-2.9.2-Phi-3-Medium-abliterated/e9dc8337-eb35-4eb9-bca7-30ec1cd44092.json deleted file mode 100644 index 118e42ec5..000000000 --- a/data/hfopenllm_v2/cognitivecomputations/dolphin-2.9.2-Phi-3-Medium-abliterated/e9dc8337-eb35-4eb9-bca7-30ec1cd44092.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/cognitivecomputations_dolphin-2.9.2-Phi-3-Medium-abliterated/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "dolphin-2.9.2-Phi-3-Medium-abliterated", - "id": "cognitivecomputations/dolphin-2.9.2-Phi-3-Medium-abliterated", - "developer": "cognitivecomputations", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 13.96 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4124 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6383 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.182 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - 
"hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3289 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4349 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4525 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/cognitivecomputations/dolphin-2.9.2-Phi-3-Medium/f4549a39-0b28-4e06-998a-774f5f02cfba.json b/data/hfopenllm_v2/cognitivecomputations/dolphin-2.9.2-Phi-3-Medium/f4549a39-0b28-4e06-998a-774f5f02cfba.json deleted file mode 100644 index a8dbae20b..000000000 --- a/data/hfopenllm_v2/cognitivecomputations/dolphin-2.9.2-Phi-3-Medium/f4549a39-0b28-4e06-998a-774f5f02cfba.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/cognitivecomputations_dolphin-2.9.2-Phi-3-Medium/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "dolphin-2.9.2-Phi-3-Medium", - "id": "cognitivecomputations/dolphin-2.9.2-Phi-3-Medium", - "developer": "cognitivecomputations", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": -1.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4248 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6457 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1828 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3272 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4191 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4555 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/cognitivecomputations/dolphin-2.9.2-qwen2-72b/a79af78a-adab-406f-995a-adb3893e1510.json b/data/hfopenllm_v2/cognitivecomputations/dolphin-2.9.2-qwen2-72b/a79af78a-adab-406f-995a-adb3893e1510.json deleted file mode 100644 index 027fcc0d4..000000000 --- a/data/hfopenllm_v2/cognitivecomputations/dolphin-2.9.2-qwen2-72b/a79af78a-adab-406f-995a-adb3893e1510.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/cognitivecomputations_dolphin-2.9.2-qwen2-72b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "dolphin-2.9.2-qwen2-72b", - "id": "cognitivecomputations/dolphin-2.9.2-qwen2-72b", - "developer": "cognitivecomputations", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 72.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6344 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6296 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2802 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.37 - } - }, - { - "evaluation_name": "MUSR", - 
"source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4521 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5471 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/cognitivecomputations/dolphin-2.9.2-qwen2-7b/4e8e457a-85eb-4afb-a9fe-8f8ce6eaf4d7.json b/data/hfopenllm_v2/cognitivecomputations/dolphin-2.9.2-qwen2-7b/4e8e457a-85eb-4afb-a9fe-8f8ce6eaf4d7.json deleted file mode 100644 index e8d10e1e5..000000000 --- a/data/hfopenllm_v2/cognitivecomputations/dolphin-2.9.2-qwen2-7b/4e8e457a-85eb-4afb-a9fe-8f8ce6eaf4d7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/cognitivecomputations_dolphin-2.9.2-qwen2-7b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "dolphin-2.9.2-qwen2-7b", - "id": "cognitivecomputations/dolphin-2.9.2-qwen2-7b", - "developer": "cognitivecomputations", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3535 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4894 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1344 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2903 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": 
"Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4191 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4051 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/cognitivecomputations/dolphin-2.9.3-Yi-1.5-34B-32k/eeb3a10a-d584-414a-90de-e018c47615c2.json b/data/hfopenllm_v2/cognitivecomputations/dolphin-2.9.3-Yi-1.5-34B-32k/eeb3a10a-d584-414a-90de-e018c47615c2.json deleted file mode 100644 index a64db73d8..000000000 --- a/data/hfopenllm_v2/cognitivecomputations/dolphin-2.9.3-Yi-1.5-34B-32k/eeb3a10a-d584-414a-90de-e018c47615c2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/cognitivecomputations_dolphin-2.9.3-Yi-1.5-34B-32k/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "dolphin-2.9.3-Yi-1.5-34B-32k", - "id": "cognitivecomputations/dolphin-2.9.3-Yi-1.5-34B-32k", - "developer": "cognitivecomputations", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 34.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3639 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6047 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1669 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3431 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, 
- "score_details": { - "score": 0.4311 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.463 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/cognitivecomputations/dolphin-2.9.3-mistral-7B-32k/e83dadb0-5092-48b8-b408-e6bb1ac8a0ba.json b/data/hfopenllm_v2/cognitivecomputations/dolphin-2.9.3-mistral-7B-32k/e83dadb0-5092-48b8-b408-e6bb1ac8a0ba.json deleted file mode 100644 index 88711de67..000000000 --- a/data/hfopenllm_v2/cognitivecomputations/dolphin-2.9.3-mistral-7B-32k/e83dadb0-5092-48b8-b408-e6bb1ac8a0ba.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/cognitivecomputations_dolphin-2.9.3-mistral-7B-32k/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "dolphin-2.9.3-mistral-7B-32k", - "id": "cognitivecomputations/dolphin-2.9.3-mistral-7B-32k", - "developer": "cognitivecomputations", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4126 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4813 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0506 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2852 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4643 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - 
"dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2821 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/cognitivecomputations/dolphin-2.9.3-mistral-nemo-12b/cebc7767-fbc9-45a2-808b-51e1a4f0f35c.json b/data/hfopenllm_v2/cognitivecomputations/dolphin-2.9.3-mistral-nemo-12b/cebc7767-fbc9-45a2-808b-51e1a4f0f35c.json deleted file mode 100644 index 470ec6033..000000000 --- a/data/hfopenllm_v2/cognitivecomputations/dolphin-2.9.3-mistral-nemo-12b/cebc7767-fbc9-45a2-808b-51e1a4f0f35c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/cognitivecomputations_dolphin-2.9.3-mistral-nemo-12b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "dolphin-2.9.3-mistral-nemo-12b", - "id": "cognitivecomputations/dolphin-2.9.3-mistral-nemo-12b", - "developer": "cognitivecomputations", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5601 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.548 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.074 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3154 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.443 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3377 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/cognitivecomputations/dolphin-2.9.4-gemma2-2b/b64b6416-b18b-47cc-a516-c613cd670b37.json b/data/hfopenllm_v2/cognitivecomputations/dolphin-2.9.4-gemma2-2b/b64b6416-b18b-47cc-a516-c613cd670b37.json deleted file mode 100644 index 859c42976..000000000 --- a/data/hfopenllm_v2/cognitivecomputations/dolphin-2.9.4-gemma2-2b/b64b6416-b18b-47cc-a516-c613cd670b37.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/cognitivecomputations_dolphin-2.9.4-gemma2-2b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "dolphin-2.9.4-gemma2-2b", - "id": "cognitivecomputations/dolphin-2.9.4-gemma2-2b", - "developer": "cognitivecomputations", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 2.614 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0896 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4081 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0491 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2844 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.418 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.2105 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/cognitivecomputations/dolphin-2.9.4-llama3.1-8b/64e96d56-72a9-413f-8903-45821b98f71e.json b/data/hfopenllm_v2/cognitivecomputations/dolphin-2.9.4-llama3.1-8b/64e96d56-72a9-413f-8903-45821b98f71e.json deleted file mode 100644 index f6d9ab604..000000000 --- a/data/hfopenllm_v2/cognitivecomputations/dolphin-2.9.4-llama3.1-8b/64e96d56-72a9-413f-8903-45821b98f71e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/cognitivecomputations_dolphin-2.9.4-llama3.1-8b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "dolphin-2.9.4-llama3.1-8b", - "id": "cognitivecomputations/dolphin-2.9.4-llama3.1-8b", - "developer": "cognitivecomputations", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2757 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3524 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0121 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2634 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3236 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1237 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/collaiborateorg/Collaiborator-MEDLLM-Llama-3-8B-v2/a3f44cfd-d1fc-4a3c-aa5b-a0f37fc4a192.json b/data/hfopenllm_v2/collaiborateorg/Collaiborator-MEDLLM-Llama-3-8B-v2/a3f44cfd-d1fc-4a3c-aa5b-a0f37fc4a192.json deleted file mode 100644 index faa327cca..000000000 --- a/data/hfopenllm_v2/collaiborateorg/Collaiborator-MEDLLM-Llama-3-8B-v2/a3f44cfd-d1fc-4a3c-aa5b-a0f37fc4a192.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/collaiborateorg_Collaiborator-MEDLLM-Llama-3-8B-v2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Collaiborator-MEDLLM-Llama-3-8B-v2", - "id": "collaiborateorg/Collaiborator-MEDLLM-Llama-3-8B-v2", - "developer": "collaiborateorg", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3809 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4648 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0566 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3331 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3434 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3481 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/cpayne1303/cp2024-instruct/79314f48-d92b-4992-b3c6-d31278c0867a.json 
b/data/hfopenllm_v2/cpayne1303/cp2024-instruct/79314f48-d92b-4992-b3c6-d31278c0867a.json deleted file mode 100644 index 0a74e80db..000000000 --- a/data/hfopenllm_v2/cpayne1303/cp2024-instruct/79314f48-d92b-4992-b3c6-d31278c0867a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/cpayne1303_cp2024-instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "cp2024-instruct", - "id": "cpayne1303/cp2024-instruct", - "developer": "cpayne1303", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 0.031 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1706 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2947 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2601 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3686 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1167 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/cpayne1303/cp2024/5a007612-c8e7-4f6b-baa9-a21af7e908c6.json b/data/hfopenllm_v2/cpayne1303/cp2024/5a007612-c8e7-4f6b-baa9-a21af7e908c6.json deleted file mode 100644 index 5ffe334d2..000000000 --- a/data/hfopenllm_v2/cpayne1303/cp2024/5a007612-c8e7-4f6b-baa9-a21af7e908c6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": 
"0.2.0", - "evaluation_id": "hfopenllm_v2/cpayne1303_cp2024/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "cp2024", - "id": "cpayne1303/cp2024", - "developer": "cpayne1303", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 0.031 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1658 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2985 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0053 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2559 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3383 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1101 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/cpayne1303/llama-43m-beta/fdefdd3e-2d83-4430-bd95-e16a1935dff1.json b/data/hfopenllm_v2/cpayne1303/llama-43m-beta/fdefdd3e-2d83-4430-bd95-e16a1935dff1.json deleted file mode 100644 index d8c1991cd..000000000 --- a/data/hfopenllm_v2/cpayne1303/llama-43m-beta/fdefdd3e-2d83-4430-bd95-e16a1935dff1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/cpayne1303_llama-43m-beta/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - 
"evaluator_relationship": "third_party" - }, - "model_info": { - "name": "llama-43m-beta", - "id": "cpayne1303/llama-43m-beta", - "developer": "cpayne1303", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 0.043 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1949 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2965 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0045 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2685 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3885 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1111 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/cpayne1303/llama-43m-beta/ffdd45bf-3409-4b92-909a-25a32ba27f82.json b/data/hfopenllm_v2/cpayne1303/llama-43m-beta/ffdd45bf-3409-4b92-909a-25a32ba27f82.json deleted file mode 100644 index 979e3c2d6..000000000 --- a/data/hfopenllm_v2/cpayne1303/llama-43m-beta/ffdd45bf-3409-4b92-909a-25a32ba27f82.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/cpayne1303_llama-43m-beta/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "llama-43m-beta", - "id": "cpayne1303/llama-43m-beta", - "developer": "cpayne1303", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": 
"LlamaForCausalLM", - "params_billions": 0.043 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1916 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2977 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2685 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3872 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1132 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/cpayne1303/smallcp2024/a78ab8ac-2c2e-405a-95ee-0d1d27cf533b.json b/data/hfopenllm_v2/cpayne1303/smallcp2024/a78ab8ac-2c2e-405a-95ee-0d1d27cf533b.json deleted file mode 100644 index fba1477ee..000000000 --- a/data/hfopenllm_v2/cpayne1303/smallcp2024/a78ab8ac-2c2e-405a-95ee-0d1d27cf533b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/cpayne1303_smallcp2024/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "smallcp2024", - "id": "cpayne1303/smallcp2024", - "developer": "cpayne1303", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 0.002 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on 
IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1582 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3027 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0053 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2307 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3425 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1114 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/crestf411/MN-Slush/d9d49bf7-f6f0-4c25-9182-d815454940e3.json b/data/hfopenllm_v2/crestf411/MN-Slush/d9d49bf7-f6f0-4c25-9182-d815454940e3.json deleted file mode 100644 index 5057d4ade..000000000 --- a/data/hfopenllm_v2/crestf411/MN-Slush/d9d49bf7-f6f0-4c25-9182-d815454940e3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/crestf411_MN-Slush/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MN-Slush", - "id": "crestf411/MN-Slush", - "developer": "crestf411", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4077 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.534 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1269 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3238 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3933 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3508 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/cstr/llama3.1-8b-spaetzle-v90/deb48e93-0378-482f-8a5d-7ec350497e0b.json b/data/hfopenllm_v2/cstr/llama3.1-8b-spaetzle-v90/deb48e93-0378-482f-8a5d-7ec350497e0b.json deleted file mode 100644 index cf3237c07..000000000 --- a/data/hfopenllm_v2/cstr/llama3.1-8b-spaetzle-v90/deb48e93-0378-482f-8a5d-7ec350497e0b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/cstr_llama3.1-8b-spaetzle-v90/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "llama3.1-8b-spaetzle-v90", - "id": "cstr/llama3.1-8b-spaetzle-v90", - "developer": "cstr", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7356 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5303 - } - }, - { - "evaluation_name": "MATH 
Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1495 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2827 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4134 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3731 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/cyberagent/calm3-22b-chat/302a9a47-8603-42d9-85fb-64c60e7c6f44.json b/data/hfopenllm_v2/cyberagent/calm3-22b-chat/302a9a47-8603-42d9-85fb-64c60e7c6f44.json deleted file mode 100644 index f7b5122d0..000000000 --- a/data/hfopenllm_v2/cyberagent/calm3-22b-chat/302a9a47-8603-42d9-85fb-64c60e7c6f44.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/cyberagent_calm3-22b-chat/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "calm3-22b-chat", - "id": "cyberagent/calm3-22b-chat", - "developer": "cyberagent", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 22.543 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5091 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4992 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0695 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2768 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4553 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.295 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/darkc0de/BuddyGlassNeverSleeps/28d52801-3998-421f-a37a-2b7b677d0eaa.json b/data/hfopenllm_v2/darkc0de/BuddyGlassNeverSleeps/28d52801-3998-421f-a37a-2b7b677d0eaa.json deleted file mode 100644 index 8fd65a779..000000000 --- a/data/hfopenllm_v2/darkc0de/BuddyGlassNeverSleeps/28d52801-3998-421f-a37a-2b7b677d0eaa.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/darkc0de_BuddyGlassNeverSleeps/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "BuddyGlassNeverSleeps", - "id": "darkc0de/BuddyGlassNeverSleeps", - "developer": "darkc0de", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4239 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4977 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0627 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": 
"Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2945 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3993 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3452 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/darkc0de/BuddyGlassUncensored2025.2/32b4e23b-9430-45a8-bfa2-eea2e89792c4.json b/data/hfopenllm_v2/darkc0de/BuddyGlassUncensored2025.2/32b4e23b-9430-45a8-bfa2-eea2e89792c4.json deleted file mode 100644 index 0e633e15d..000000000 --- a/data/hfopenllm_v2/darkc0de/BuddyGlassUncensored2025.2/32b4e23b-9430-45a8-bfa2-eea2e89792c4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/darkc0de_BuddyGlassUncensored2025.2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "BuddyGlassUncensored2025.2", - "id": "darkc0de/BuddyGlassUncensored2025.2", - "developer": "darkc0de", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 10.306 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7731 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6095 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2402 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.328 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4071 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4336 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/darkc0de/BuddyGlass_v0.3_Xortron7MethedUpSwitchedUp/0336e168-e313-44cb-a030-42e6d20e92df.json b/data/hfopenllm_v2/darkc0de/BuddyGlass_v0.3_Xortron7MethedUpSwitchedUp/0336e168-e313-44cb-a030-42e6d20e92df.json deleted file mode 100644 index d8470fccf..000000000 --- a/data/hfopenllm_v2/darkc0de/BuddyGlass_v0.3_Xortron7MethedUpSwitchedUp/0336e168-e313-44cb-a030-42e6d20e92df.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/darkc0de_BuddyGlass_v0.3_Xortron7MethedUpSwitchedUp/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "BuddyGlass_v0.3_Xortron7MethedUpSwitchedUp", - "id": "darkc0de/BuddyGlass_v0.3_Xortron7MethedUpSwitchedUp", - "developer": "darkc0de", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 0.007 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4358 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5243 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1284 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2987 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": 
"MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4143 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3673 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/databricks/dbrx-base/11bd8b5b-2ea4-4ec5-8fe6-654aedb40fc9.json b/data/hfopenllm_v2/databricks/dbrx-base/11bd8b5b-2ea4-4ec5-8fe6-654aedb40fc9.json deleted file mode 100644 index e2da8eba9..000000000 --- a/data/hfopenllm_v2/databricks/dbrx-base/11bd8b5b-2ea4-4ec5-8fe6-654aedb40fc9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/databricks_dbrx-base/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "dbrx-base", - "id": "databricks/dbrx-base", - "developer": "databricks", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Unknown", - "params_billions": 0.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0821 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5196 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3267 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4067 - } - }, - { - "evaluation_name": 
"MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.35 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/databricks/dbrx-instruct/6d97749c-3bfa-4c32-b581-a5e2b73303f3.json b/data/hfopenllm_v2/databricks/dbrx-instruct/6d97749c-3bfa-4c32-b581-a5e2b73303f3.json deleted file mode 100644 index 3b824e9c7..000000000 --- a/data/hfopenllm_v2/databricks/dbrx-instruct/6d97749c-3bfa-4c32-b581-a5e2b73303f3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/databricks_dbrx-instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "dbrx-instruct", - "id": "databricks/dbrx-instruct", - "developer": "databricks", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "DbrxForCausalLM", - "params_billions": 131.597 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5416 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5429 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0687 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3414 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4269 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 
0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3683 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/databricks/dolly-v1-6b/ec58907d-b67c-467e-a3dd-b9f9c10138f0.json b/data/hfopenllm_v2/databricks/dolly-v1-6b/ec58907d-b67c-467e-a3dd-b9f9c10138f0.json deleted file mode 100644 index ad7981890..000000000 --- a/data/hfopenllm_v2/databricks/dolly-v1-6b/ec58907d-b67c-467e-a3dd-b9f9c10138f0.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/databricks_dolly-v1-6b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "dolly-v1-6b", - "id": "databricks/dolly-v1-6b", - "developer": "databricks", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "GPTJForCausalLM", - "params_billions": 6.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2224 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3172 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0189 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2643 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4004 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1266 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/databricks/dolly-v2-12b/a7f09a3d-025c-48fa-9358-863b9ae382b1.json 
b/data/hfopenllm_v2/databricks/dolly-v2-12b/a7f09a3d-025c-48fa-9358-863b9ae382b1.json deleted file mode 100644 index 68d7dcd5a..000000000 --- a/data/hfopenllm_v2/databricks/dolly-v2-12b/a7f09a3d-025c-48fa-9358-863b9ae382b1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/databricks_dolly-v2-12b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "dolly-v2-12b", - "id": "databricks/dolly-v2-12b", - "developer": "databricks", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "GPTNeoXForCausalLM", - "params_billions": 12.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2355 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.332 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0136 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2408 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3739 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1129 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/databricks/dolly-v2-3b/bf2be2d5-58de-4550-b733-a5910bded48d.json b/data/hfopenllm_v2/databricks/dolly-v2-3b/bf2be2d5-58de-4550-b733-a5910bded48d.json deleted file mode 100644 index 0f29fbdd0..000000000 --- a/data/hfopenllm_v2/databricks/dolly-v2-3b/bf2be2d5-58de-4550-b733-a5910bded48d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": 
"0.2.0", - "evaluation_id": "hfopenllm_v2/databricks_dolly-v2-3b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "dolly-v2-3b", - "id": "databricks/dolly-v2-3b", - "developer": "databricks", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "GPTNeoXForCausalLM", - "params_billions": 3.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2247 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3079 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0151 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2609 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3338 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1145 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/databricks/dolly-v2-7b/52b32c1f-6189-4850-b3f4-de442eb2ccb5.json b/data/hfopenllm_v2/databricks/dolly-v2-7b/52b32c1f-6189-4850-b3f4-de442eb2ccb5.json deleted file mode 100644 index 99647f517..000000000 --- a/data/hfopenllm_v2/databricks/dolly-v2-7b/52b32c1f-6189-4850-b3f4-de442eb2ccb5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/databricks_dolly-v2-7b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - 
"evaluator_relationship": "third_party" - }, - "model_info": { - "name": "dolly-v2-7b", - "id": "databricks/dolly-v2-7b", - "developer": "databricks", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "GPTNeoXForCausalLM", - "params_billions": 7.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.201 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3173 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0144 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2685 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3553 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1149 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/davidkim205/Rhea-72b-v0.5/87b44160-c3dd-452d-8c15-c4f758f8db7b.json b/data/hfopenllm_v2/davidkim205/Rhea-72b-v0.5/87b44160-c3dd-452d-8c15-c4f758f8db7b.json deleted file mode 100644 index 6a35282cd..000000000 --- a/data/hfopenllm_v2/davidkim205/Rhea-72b-v0.5/87b44160-c3dd-452d-8c15-c4f758f8db7b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/davidkim205_Rhea-72b-v0.5/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Rhea-72b-v0.5", - "id": "davidkim205/Rhea-72b-v0.5", - "developer": "davidkim205", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": 
"LlamaForCausalLM", - "params_billions": 72.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0145 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3078 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1737 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2525 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4241 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1166 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/davidkim205/nox-solar-10.7b-v4/3e6814d3-54ea-493f-a9fc-85ae9eed1b05.json b/data/hfopenllm_v2/davidkim205/nox-solar-10.7b-v4/3e6814d3-54ea-493f-a9fc-85ae9eed1b05.json deleted file mode 100644 index 0227496f4..000000000 --- a/data/hfopenllm_v2/davidkim205/nox-solar-10.7b-v4/3e6814d3-54ea-493f-a9fc-85ae9eed1b05.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/davidkim205_nox-solar-10.7b-v4/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "nox-solar-10.7b-v4", - "id": "davidkim205/nox-solar-10.7b-v4", - "developer": "davidkim205", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 10.732 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3753 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4814 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0083 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.307 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4298 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3333 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/deepseek-ai/DeepSeek-R1-Distill-Llama-70B/35b7ff42-3825-4240-97bf-f8af7e8c23ff.json b/data/hfopenllm_v2/deepseek-ai/DeepSeek-R1-Distill-Llama-70B/35b7ff42-3825-4240-97bf-f8af7e8c23ff.json deleted file mode 100644 index d4fcf871a..000000000 --- a/data/hfopenllm_v2/deepseek-ai/DeepSeek-R1-Distill-Llama-70B/35b7ff42-3825-4240-97bf-f8af7e8c23ff.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/deepseek-ai_DeepSeek-R1-Distill-Llama-70B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "DeepSeek-R1-Distill-Llama-70B", - "id": "deepseek-ai/DeepSeek-R1-Distill-Llama-70B", - "developer": "deepseek-ai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 70.554 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.4336 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5635 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3074 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2651 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4342 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4748 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/deepseek-ai/DeepSeek-R1-Distill-Llama-8B/c108173e-1582-4c99-9291-46986d7ba1cf.json b/data/hfopenllm_v2/deepseek-ai/DeepSeek-R1-Distill-Llama-8B/c108173e-1582-4c99-9291-46986d7ba1cf.json deleted file mode 100644 index 50037a506..000000000 --- a/data/hfopenllm_v2/deepseek-ai/DeepSeek-R1-Distill-Llama-8B/c108173e-1582-4c99-9291-46986d7ba1cf.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/deepseek-ai_DeepSeek-R1-Distill-Llama-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "DeepSeek-R1-Distill-Llama-8B", - "id": "deepseek-ai/DeepSeek-R1-Distill-Llama-8B", - "developer": "deepseek-ai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3782 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": 
"SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3239 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2198 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.255 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.325 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2089 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B/6feb08b0-1c67-4fe2-a001-0b3b84529687.json b/data/hfopenllm_v2/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B/6feb08b0-1c67-4fe2-a001-0b3b84529687.json deleted file mode 100644 index 642a4c84c..000000000 --- a/data/hfopenllm_v2/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B/6feb08b0-1c67-4fe2-a001-0b3b84529687.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/deepseek-ai_DeepSeek-R1-Distill-Qwen-1.5B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "DeepSeek-R1-Distill-Qwen-1.5B", - "id": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B", - "developer": "deepseek-ai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.777 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3463 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.3241 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1692 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2559 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3635 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1187 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/deepseek-ai/DeepSeek-R1-Distill-Qwen-14B/d4ab3df2-109a-4eec-9742-dc3bb79d5a58.json b/data/hfopenllm_v2/deepseek-ai/DeepSeek-R1-Distill-Qwen-14B/d4ab3df2-109a-4eec-9742-dc3bb79d5a58.json deleted file mode 100644 index 6772800d9..000000000 --- a/data/hfopenllm_v2/deepseek-ai/DeepSeek-R1-Distill-Qwen-14B/d4ab3df2-109a-4eec-9742-dc3bb79d5a58.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/deepseek-ai_DeepSeek-R1-Distill-Qwen-14B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "DeepSeek-R1-Distill-Qwen-14B", - "id": "deepseek-ai/DeepSeek-R1-Distill-Qwen-14B", - "developer": "deepseek-ai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4382 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5906 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": 
"hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5702 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3876 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5366 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4667 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B/53ec995e-bcfd-4a72-bd9a-45d14da3f219.json b/data/hfopenllm_v2/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B/53ec995e-bcfd-4a72-bd9a-45d14da3f219.json deleted file mode 100644 index 69ee0e546..000000000 --- a/data/hfopenllm_v2/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B/53ec995e-bcfd-4a72-bd9a-45d14da3f219.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/deepseek-ai_DeepSeek-R1-Distill-Qwen-32B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "DeepSeek-R1-Distill-Qwen-32B", - "id": "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B", - "developer": "deepseek-ai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 32.764 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4186 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4197 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1707 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2844 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4526 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4687 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/deepseek-ai/DeepSeek-R1-Distill-Qwen-7B/299a0397-89c7-4329-9599-9fc29a52db87.json b/data/hfopenllm_v2/deepseek-ai/DeepSeek-R1-Distill-Qwen-7B/299a0397-89c7-4329-9599-9fc29a52db87.json deleted file mode 100644 index 8f7f946ab..000000000 --- a/data/hfopenllm_v2/deepseek-ai/DeepSeek-R1-Distill-Qwen-7B/299a0397-89c7-4329-9599-9fc29a52db87.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/deepseek-ai_DeepSeek-R1-Distill-Qwen-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "DeepSeek-R1-Distill-Qwen-7B", - "id": "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B", - "developer": "deepseek-ai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4038 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3443 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1956 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - 
"dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2794 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3663 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2321 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/deepseek-ai/deepseek-llm-67b-chat/41adbc32-6cdf-49ba-980c-6eb6f722b40b.json b/data/hfopenllm_v2/deepseek-ai/deepseek-llm-67b-chat/41adbc32-6cdf-49ba-980c-6eb6f722b40b.json deleted file mode 100644 index e02fd22ba..000000000 --- a/data/hfopenllm_v2/deepseek-ai/deepseek-llm-67b-chat/41adbc32-6cdf-49ba-980c-6eb6f722b40b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/deepseek-ai_deepseek-llm-67b-chat/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "deepseek-llm-67b-chat", - "id": "deepseek-ai/deepseek-llm-67b-chat", - "developer": "deepseek-ai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 67.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5587 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5243 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0929 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3163 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5059 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3944 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/deepseek-ai/deepseek-llm-7b-base/4236ece5-f2b2-44e7-9503-9731bff20155.json b/data/hfopenllm_v2/deepseek-ai/deepseek-llm-7b-base/4236ece5-f2b2-44e7-9503-9731bff20155.json deleted file mode 100644 index fa4327008..000000000 --- a/data/hfopenllm_v2/deepseek-ai/deepseek-llm-7b-base/4236ece5-f2b2-44e7-9503-9731bff20155.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/deepseek-ai_deepseek-llm-7b-base/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "deepseek-llm-7b-base", - "id": "deepseek-ai/deepseek-llm-7b-base", - "developer": "deepseek-ai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 7.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2179 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3503 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0196 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2735 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - 
}, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3738 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1806 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/deepseek-ai/deepseek-llm-7b-chat/b33d672c-4a96-4093-bc13-25c42303b918.json b/data/hfopenllm_v2/deepseek-ai/deepseek-llm-7b-chat/b33d672c-4a96-4093-bc13-25c42303b918.json deleted file mode 100644 index cd8ef9eab..000000000 --- a/data/hfopenllm_v2/deepseek-ai/deepseek-llm-7b-chat/b33d672c-4a96-4093-bc13-25c42303b918.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/deepseek-ai_deepseek-llm-7b-chat/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "deepseek-llm-7b-chat", - "id": "deepseek-ai/deepseek-llm-7b-chat", - "developer": "deepseek-ai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 7.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4171 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3632 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0204 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2659 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4668 - } - }, - { - 
"evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2133 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/deepseek-ai/deepseek-moe-16b-base/2b4f42fc-8b25-481c-98f7-911c52fdd242.json b/data/hfopenllm_v2/deepseek-ai/deepseek-moe-16b-base/2b4f42fc-8b25-481c-98f7-911c52fdd242.json deleted file mode 100644 index a987a5874..000000000 --- a/data/hfopenllm_v2/deepseek-ai/deepseek-moe-16b-base/2b4f42fc-8b25-481c-98f7-911c52fdd242.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/deepseek-ai_deepseek-moe-16b-base/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "deepseek-moe-16b-base", - "id": "deepseek-ai/deepseek-moe-16b-base", - "developer": "deepseek-ai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "DeepseekForCausalLM", - "params_billions": 16.376 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.245 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3409 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0242 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2542 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3658 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on 
MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1505 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/deepseek-ai/deepseek-moe-16b-chat/634b7a64-2bd3-48b8-b2f4-a93189801850.json b/data/hfopenllm_v2/deepseek-ai/deepseek-moe-16b-chat/634b7a64-2bd3-48b8-b2f4-a93189801850.json deleted file mode 100644 index 92b956141..000000000 --- a/data/hfopenllm_v2/deepseek-ai/deepseek-moe-16b-chat/634b7a64-2bd3-48b8-b2f4-a93189801850.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/deepseek-ai_deepseek-moe-16b-chat/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "deepseek-moe-16b-chat", - "id": "deepseek-ai/deepseek-moe-16b-chat", - "developer": "deepseek-ai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "DeepseekForCausalLM", - "params_billions": 16.376 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3663 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3275 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0257 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2248 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3808 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1964 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/dfurman/CalmeRys-78B-Orpo-v0.1/72a4bcc3-9dfc-4268-be4e-cda5837a3da2.json b/data/hfopenllm_v2/dfurman/CalmeRys-78B-Orpo-v0.1/72a4bcc3-9dfc-4268-be4e-cda5837a3da2.json deleted file mode 100644 index 152bc2222..000000000 --- a/data/hfopenllm_v2/dfurman/CalmeRys-78B-Orpo-v0.1/72a4bcc3-9dfc-4268-be4e-cda5837a3da2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/dfurman_CalmeRys-78B-Orpo-v0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "CalmeRys-78B-Orpo-v0.1", - "id": "dfurman/CalmeRys-78B-Orpo-v0.1", - "developer": "dfurman", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 77.965 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8163 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7262 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4063 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4002 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5902 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7012 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/dfurman/Llama-3-70B-Orpo-v0.1/78fa85f6-baff-4d95-ad3a-a0663f51b0a0.json b/data/hfopenllm_v2/dfurman/Llama-3-70B-Orpo-v0.1/78fa85f6-baff-4d95-ad3a-a0663f51b0a0.json deleted file mode 100644 index 2fb382007..000000000 --- 
a/data/hfopenllm_v2/dfurman/Llama-3-70B-Orpo-v0.1/78fa85f6-baff-4d95-ad3a-a0663f51b0a0.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/dfurman_Llama-3-70B-Orpo-v0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-70B-Orpo-v0.1", - "id": "dfurman/Llama-3-70B-Orpo-v0.1", - "developer": "dfurman", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 70.554 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2049 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4655 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1579 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2576 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4534 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3893 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/dfurman/Llama-3-8B-Orpo-v0.1/359231a5-6eb9-4f73-a6f1-d7fd7f35c7ed.json b/data/hfopenllm_v2/dfurman/Llama-3-8B-Orpo-v0.1/359231a5-6eb9-4f73-a6f1-d7fd7f35c7ed.json deleted file mode 100644 index cafc237a9..000000000 --- a/data/hfopenllm_v2/dfurman/Llama-3-8B-Orpo-v0.1/359231a5-6eb9-4f73-a6f1-d7fd7f35c7ed.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/dfurman_Llama-3-8B-Orpo-v0.1/1770682486.623709", - 
"retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-8B-Orpo-v0.1", - "id": "dfurman/Llama-3-8B-Orpo-v0.1", - "developer": "dfurman", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "?", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2835 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3842 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0521 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2609 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3566 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2298 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/dfurman/Llama-3-8B-Orpo-v0.1/79b81e37-f75e-4b18-b145-73c42625ced5.json b/data/hfopenllm_v2/dfurman/Llama-3-8B-Orpo-v0.1/79b81e37-f75e-4b18-b145-73c42625ced5.json deleted file mode 100644 index f0f0c7311..000000000 --- a/data/hfopenllm_v2/dfurman/Llama-3-8B-Orpo-v0.1/79b81e37-f75e-4b18-b145-73c42625ced5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/dfurman_Llama-3-8B-Orpo-v0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": 
"Llama-3-8B-Orpo-v0.1", - "id": "dfurman/Llama-3-8B-Orpo-v0.1", - "developer": "dfurman", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3853 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0415 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2617 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3579 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2281 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/dfurman/Qwen2-72B-Orpo-v0.1/2d99af7a-f67c-4e74-9ba2-f1401dfdf9fb.json b/data/hfopenllm_v2/dfurman/Qwen2-72B-Orpo-v0.1/2d99af7a-f67c-4e74-9ba2-f1401dfdf9fb.json deleted file mode 100644 index 561c35bcc..000000000 --- a/data/hfopenllm_v2/dfurman/Qwen2-72B-Orpo-v0.1/2d99af7a-f67c-4e74-9ba2-f1401dfdf9fb.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/dfurman_Qwen2-72B-Orpo-v0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2-72B-Orpo-v0.1", - "id": "dfurman/Qwen2-72B-Orpo-v0.1", - "developer": "dfurman", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 72.699 - } - }, - 
"evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.788 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6969 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4056 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3842 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4784 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5455 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/dicta-il/dictalm2.0-instruct/315fa815-fab0-47c9-8185-00bc597c0176.json b/data/hfopenllm_v2/dicta-il/dictalm2.0-instruct/315fa815-fab0-47c9-8185-00bc597c0176.json deleted file mode 100644 index 66466a5b8..000000000 --- a/data/hfopenllm_v2/dicta-il/dictalm2.0-instruct/315fa815-fab0-47c9-8185-00bc597c0176.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/dicta-il_dictalm2.0-instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "dictalm2.0-instruct", - "id": "dicta-il/dictalm2.0-instruct", - "developer": "dicta-il", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.251 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4412 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4256 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0227 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3029 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3946 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2605 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/dicta-il/dictalm2.0/0c1686db-b396-4ecf-86f1-e4e092491acd.json b/data/hfopenllm_v2/dicta-il/dictalm2.0/0c1686db-b396-4ecf-86f1-e4e092491acd.json deleted file mode 100644 index d0fee0d39..000000000 --- a/data/hfopenllm_v2/dicta-il/dictalm2.0/0c1686db-b396-4ecf-86f1-e4e092491acd.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/dicta-il_dictalm2.0/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "dictalm2.0", - "id": "dicta-il/dictalm2.0", - "developer": "dicta-il", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.251 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2413 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4018 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0181 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2919 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.382 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2605 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/distilbert/distilgpt2/57455fbc-b5a9-4a3b-9a30-7da0593fd778.json b/data/hfopenllm_v2/distilbert/distilgpt2/57455fbc-b5a9-4a3b-9a30-7da0593fd778.json deleted file mode 100644 index 4f626e3c3..000000000 --- a/data/hfopenllm_v2/distilbert/distilgpt2/57455fbc-b5a9-4a3b-9a30-7da0593fd778.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/distilbert_distilgpt2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "distilgpt2", - "id": "distilbert/distilgpt2", - "developer": "distilbert", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "GPT2LMHeadModel", - "params_billions": 0.088 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0611 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3038 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": 
"MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.006 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2592 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4207 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1187 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/divyanshukunwar/SASTRI_1_9B/a8f9d0e6-5a1a-4d09-ac78-47fd586384df.json b/data/hfopenllm_v2/divyanshukunwar/SASTRI_1_9B/a8f9d0e6-5a1a-4d09-ac78-47fd586384df.json deleted file mode 100644 index 419e211aa..000000000 --- a/data/hfopenllm_v2/divyanshukunwar/SASTRI_1_9B/a8f9d0e6-5a1a-4d09-ac78-47fd586384df.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/divyanshukunwar_SASTRI_1_9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SASTRI_1_9B", - "id": "divyanshukunwar/SASTRI_1_9B", - "developer": "divyanshukunwar", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 5.211 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4207 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.468 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.1156 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3213 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3831 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3187 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/djuna-test-lab/TEST-L3.2-ReWish-3B-ties-w-base/9d0d4eee-0b87-485c-843f-e32d08aa601b.json b/data/hfopenllm_v2/djuna-test-lab/TEST-L3.2-ReWish-3B-ties-w-base/9d0d4eee-0b87-485c-843f-e32d08aa601b.json deleted file mode 100644 index fd16ac893..000000000 --- a/data/hfopenllm_v2/djuna-test-lab/TEST-L3.2-ReWish-3B-ties-w-base/9d0d4eee-0b87-485c-843f-e32d08aa601b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/djuna-test-lab_TEST-L3.2-ReWish-3B-ties-w-base/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "TEST-L3.2-ReWish-3B-ties-w-base", - "id": "djuna-test-lab/TEST-L3.2-ReWish-3B-ties-w-base", - "developer": "djuna-test-lab", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6353 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4495 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1367 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": 
"GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2836 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3777 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3126 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/djuna-test-lab/TEST-L3.2-ReWish-3B/e47c83ff-9a16-488b-8ccf-4a2fad2b14fc.json b/data/hfopenllm_v2/djuna-test-lab/TEST-L3.2-ReWish-3B/e47c83ff-9a16-488b-8ccf-4a2fad2b14fc.json deleted file mode 100644 index 0a5268a87..000000000 --- a/data/hfopenllm_v2/djuna-test-lab/TEST-L3.2-ReWish-3B/e47c83ff-9a16-488b-8ccf-4a2fad2b14fc.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/djuna-test-lab_TEST-L3.2-ReWish-3B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "TEST-L3.2-ReWish-3B", - "id": "djuna-test-lab/TEST-L3.2-ReWish-3B", - "developer": "djuna-test-lab", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6368 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4495 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1367 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 
0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2836 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3777 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3126 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/djuna/G2-BigGSHT-27B-2/8c7e25df-884d-4940-8185-4c1b82fac8c5.json b/data/hfopenllm_v2/djuna/G2-BigGSHT-27B-2/8c7e25df-884d-4940-8185-4c1b82fac8c5.json deleted file mode 100644 index 294e1f1ce..000000000 --- a/data/hfopenllm_v2/djuna/G2-BigGSHT-27B-2/8c7e25df-884d-4940-8185-4c1b82fac8c5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/djuna_G2-BigGSHT-27B-2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "G2-BigGSHT-27B-2", - "id": "djuna/G2-BigGSHT-27B-2", - "developer": "djuna", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 27.227 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7974 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6415 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2349 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3633 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on 
MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4072 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4528 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/djuna/G2-GSHT/83611d50-01d0-4642-a104-daf77f1a0fe8.json b/data/hfopenllm_v2/djuna/G2-GSHT/83611d50-01d0-4642-a104-daf77f1a0fe8.json deleted file mode 100644 index efbf1c971..000000000 --- a/data/hfopenllm_v2/djuna/G2-GSHT/83611d50-01d0-4642-a104-daf77f1a0fe8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/djuna_G2-GSHT/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "G2-GSHT", - "id": "djuna/G2-GSHT", - "developer": "djuna", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.563 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.527 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1926 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3255 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4006 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.307 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/djuna/Gemma-2-gemmama-9b/5cbdafba-6071-4da1-8b19-3de612e9ff18.json b/data/hfopenllm_v2/djuna/Gemma-2-gemmama-9b/5cbdafba-6071-4da1-8b19-3de612e9ff18.json deleted file mode 100644 index 2777bf875..000000000 --- a/data/hfopenllm_v2/djuna/Gemma-2-gemmama-9b/5cbdafba-6071-4da1-8b19-3de612e9ff18.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/djuna_Gemma-2-gemmama-9b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gemma-2-gemmama-9b", - "id": "djuna/Gemma-2-gemmama-9b", - "developer": "djuna", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7703 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.542 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1926 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3356 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4031 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3109 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/djuna/L3.1-ForStHS/1c934cba-c94a-4aad-9645-84658e0b5588.json b/data/hfopenllm_v2/djuna/L3.1-ForStHS/1c934cba-c94a-4aad-9645-84658e0b5588.json deleted file mode 100644 index be9ca7b2a..000000000 --- a/data/hfopenllm_v2/djuna/L3.1-ForStHS/1c934cba-c94a-4aad-9645-84658e0b5588.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/djuna_L3.1-ForStHS/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "L3.1-ForStHS", - "id": "djuna/L3.1-ForStHS", - "developer": "djuna", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7813 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5203 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1503 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2911 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4026 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3735 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/djuna/L3.1-Promissum_Mane-8B-Della-1.5-calc/7aad3f6b-89d9-4c9e-9339-cf4111fc37c6.json b/data/hfopenllm_v2/djuna/L3.1-Promissum_Mane-8B-Della-1.5-calc/7aad3f6b-89d9-4c9e-9339-cf4111fc37c6.json deleted file mode 100644 index 5e6b32a51..000000000 --- 
a/data/hfopenllm_v2/djuna/L3.1-Promissum_Mane-8B-Della-1.5-calc/7aad3f6b-89d9-4c9e-9339-cf4111fc37c6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/djuna_L3.1-Promissum_Mane-8B-Della-1.5-calc/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "L3.1-Promissum_Mane-8B-Della-1.5-calc", - "id": "djuna/L3.1-Promissum_Mane-8B-Della-1.5-calc", - "developer": "djuna", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7235 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5433 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1639 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3146 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4253 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3904 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/djuna/L3.1-Promissum_Mane-8B-Della-calc/38d4a8ca-4273-4e6a-8a39-3b5ff20ec461.json b/data/hfopenllm_v2/djuna/L3.1-Promissum_Mane-8B-Della-calc/38d4a8ca-4273-4e6a-8a39-3b5ff20ec461.json deleted file mode 100644 index 2ab5da35c..000000000 --- a/data/hfopenllm_v2/djuna/L3.1-Promissum_Mane-8B-Della-calc/38d4a8ca-4273-4e6a-8a39-3b5ff20ec461.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - 
"evaluation_id": "hfopenllm_v2/djuna_L3.1-Promissum_Mane-8B-Della-calc/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "L3.1-Promissum_Mane-8B-Della-calc", - "id": "djuna/L3.1-Promissum_Mane-8B-Della-calc", - "developer": "djuna", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5442 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5486 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1843 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2995 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.423 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3802 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/djuna/L3.1-Purosani-2-8B/3d65fbc2-bf91-479c-a687-e9ef702794fb.json b/data/hfopenllm_v2/djuna/L3.1-Purosani-2-8B/3d65fbc2-bf91-479c-a687-e9ef702794fb.json deleted file mode 100644 index 14e62740c..000000000 --- a/data/hfopenllm_v2/djuna/L3.1-Purosani-2-8B/3d65fbc2-bf91-479c-a687-e9ef702794fb.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/djuna_L3.1-Purosani-2-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - 
"source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "L3.1-Purosani-2-8B", - "id": "djuna/L3.1-Purosani-2-8B", - "developer": "djuna", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4988 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5182 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1171 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3012 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3816 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3752 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/djuna/L3.1-Suze-Vume-calc/650cdbbb-e066-4581-8d61-77aa6a4c402c.json b/data/hfopenllm_v2/djuna/L3.1-Suze-Vume-calc/650cdbbb-e066-4581-8d61-77aa6a4c402c.json deleted file mode 100644 index 7bf4c3862..000000000 --- a/data/hfopenllm_v2/djuna/L3.1-Suze-Vume-calc/650cdbbb-e066-4581-8d61-77aa6a4c402c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/djuna_L3.1-Suze-Vume-calc/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "L3.1-Suze-Vume-calc", - "id": "djuna/L3.1-Suze-Vume-calc", - "developer": "djuna", - "inference_platform": "unknown", - "additional_details": { - 
"precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7297 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5164 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.114 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2819 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3843 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3515 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/djuna/MN-Chinofun-12B-2/05d566c5-1810-483c-8ce0-84635b9457dc.json b/data/hfopenllm_v2/djuna/MN-Chinofun-12B-2/05d566c5-1810-483c-8ce0-84635b9457dc.json deleted file mode 100644 index 5e17b60d7..000000000 --- a/data/hfopenllm_v2/djuna/MN-Chinofun-12B-2/05d566c5-1810-483c-8ce0-84635b9457dc.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/djuna_MN-Chinofun-12B-2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MN-Chinofun-12B-2", - "id": "djuna/MN-Chinofun-12B-2", - "developer": "djuna", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6171 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5037 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1307 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3054 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4268 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3615 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/djuna/MN-Chinofun-12B-3/37e3456a-92ff-4122-a697-ffbdc1c79555.json b/data/hfopenllm_v2/djuna/MN-Chinofun-12B-3/37e3456a-92ff-4122-a697-ffbdc1c79555.json deleted file mode 100644 index d1f440712..000000000 --- a/data/hfopenllm_v2/djuna/MN-Chinofun-12B-3/37e3456a-92ff-4122-a697-ffbdc1c79555.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/djuna_MN-Chinofun-12B-3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MN-Chinofun-12B-3", - "id": "djuna/MN-Chinofun-12B-3", - "developer": "djuna", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3053 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - 
"dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5348 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1005 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2659 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4198 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3026 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/djuna/MN-Chinofun-12B-4/70c908d4-f1bf-4553-9bf7-95eb593b4853.json b/data/hfopenllm_v2/djuna/MN-Chinofun-12B-4/70c908d4-f1bf-4553-9bf7-95eb593b4853.json deleted file mode 100644 index c46c8ff73..000000000 --- a/data/hfopenllm_v2/djuna/MN-Chinofun-12B-4/70c908d4-f1bf-4553-9bf7-95eb593b4853.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/djuna_MN-Chinofun-12B-4/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MN-Chinofun-12B-4", - "id": "djuna/MN-Chinofun-12B-4", - "developer": "djuna", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5404 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { 
- "score": 0.5348 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1118 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2953 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4307 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3497 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/djuna/MN-Chinofun/2ccc9c20-5414-4286-abcd-ad2b20f8652d.json b/data/hfopenllm_v2/djuna/MN-Chinofun/2ccc9c20-5414-4286-abcd-ad2b20f8652d.json deleted file mode 100644 index 3bb80844d..000000000 --- a/data/hfopenllm_v2/djuna/MN-Chinofun/2ccc9c20-5414-4286-abcd-ad2b20f8652d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/djuna_MN-Chinofun/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MN-Chinofun", - "id": "djuna/MN-Chinofun", - "developer": "djuna", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.611 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4953 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1307 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2961 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4084 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3603 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/djuna/Q2.5-Partron-7B/50f4560a-e172-42b9-b552-437aff158a38.json b/data/hfopenllm_v2/djuna/Q2.5-Partron-7B/50f4560a-e172-42b9-b552-437aff158a38.json deleted file mode 100644 index 0893e6c11..000000000 --- a/data/hfopenllm_v2/djuna/Q2.5-Partron-7B/50f4560a-e172-42b9-b552-437aff158a38.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/djuna_Q2.5-Partron-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Q2.5-Partron-7B", - "id": "djuna/Q2.5-Partron-7B", - "developer": "djuna", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.613 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7321 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5418 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4826 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2978 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4165 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4283 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/djuna/Q2.5-Veltha-14B-0.5/c6a3abac-8a34-4725-915b-c27c3d0bc484.json b/data/hfopenllm_v2/djuna/Q2.5-Veltha-14B-0.5/c6a3abac-8a34-4725-915b-c27c3d0bc484.json deleted file mode 100644 index 8d30205b4..000000000 --- a/data/hfopenllm_v2/djuna/Q2.5-Veltha-14B-0.5/c6a3abac-8a34-4725-915b-c27c3d0bc484.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/djuna_Q2.5-Veltha-14B-0.5/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Q2.5-Veltha-14B-0.5", - "id": "djuna/Q2.5-Veltha-14B-0.5", - "developer": "djuna", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7796 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6523 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4373 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3683 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": 
"MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4339 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5295 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/djuna/Q2.5-Veltha-14B/a8ed68ea-6463-4ff9-9dcd-034080272dec.json b/data/hfopenllm_v2/djuna/Q2.5-Veltha-14B/a8ed68ea-6463-4ff9-9dcd-034080272dec.json deleted file mode 100644 index b3a521555..000000000 --- a/data/hfopenllm_v2/djuna/Q2.5-Veltha-14B/a8ed68ea-6463-4ff9-9dcd-034080272dec.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/djuna_Q2.5-Veltha-14B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Q2.5-Veltha-14B", - "id": "djuna/Q2.5-Veltha-14B", - "developer": "djuna", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8292 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6484 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4789 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3591 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4194 - } - }, - 
{ - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5298 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/dnhkng/RYS-Llama-3-8B-Instruct/5799ce8b-c00d-49f6-96dc-f7dd057a268c.json b/data/hfopenllm_v2/dnhkng/RYS-Llama-3-8B-Instruct/5799ce8b-c00d-49f6-96dc-f7dd057a268c.json deleted file mode 100644 index 9ecc384c0..000000000 --- a/data/hfopenllm_v2/dnhkng/RYS-Llama-3-8B-Instruct/5799ce8b-c00d-49f6-96dc-f7dd057a268c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/dnhkng_RYS-Llama-3-8B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "RYS-Llama-3-8B-Instruct", - "id": "dnhkng/RYS-Llama-3-8B-Instruct", - "developer": "dnhkng", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6958 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4809 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0687 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2576 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3383 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3557 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/dnhkng/RYS-Llama-3-Huge-Instruct/0d261023-3e35-4160-98ca-241bbaee927e.json b/data/hfopenllm_v2/dnhkng/RYS-Llama-3-Huge-Instruct/0d261023-3e35-4160-98ca-241bbaee927e.json deleted file mode 100644 index f9db9541c..000000000 --- a/data/hfopenllm_v2/dnhkng/RYS-Llama-3-Huge-Instruct/0d261023-3e35-4160-98ca-241bbaee927e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/dnhkng_RYS-Llama-3-Huge-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "RYS-Llama-3-Huge-Instruct", - "id": "dnhkng/RYS-Llama-3-Huge-Instruct", - "developer": "dnhkng", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 99.646 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7686 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6481 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2289 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2609 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4208 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.511 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/dnhkng/RYS-Llama-3-Large-Instruct/f0454d3b-18b4-488a-94dd-fb24729996c7.json b/data/hfopenllm_v2/dnhkng/RYS-Llama-3-Large-Instruct/f0454d3b-18b4-488a-94dd-fb24729996c7.json deleted file mode 100644 index b5b981469..000000000 --- a/data/hfopenllm_v2/dnhkng/RYS-Llama-3-Large-Instruct/f0454d3b-18b4-488a-94dd-fb24729996c7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/dnhkng_RYS-Llama-3-Large-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "RYS-Llama-3-Large-Instruct", - "id": "dnhkng/RYS-Llama-3-Large-Instruct", - "developer": "dnhkng", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 73.976 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8051 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6525 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2304 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2894 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.418 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5137 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/dnhkng/RYS-Llama-3.1-8B-Instruct/6bafa7a7-3a2a-4141-9564-a762d1cdb1d0.json b/data/hfopenllm_v2/dnhkng/RYS-Llama-3.1-8B-Instruct/6bafa7a7-3a2a-4141-9564-a762d1cdb1d0.json deleted file mode 100644 index 
8a33c3dbf..000000000 --- a/data/hfopenllm_v2/dnhkng/RYS-Llama-3.1-8B-Instruct/6bafa7a7-3a2a-4141-9564-a762d1cdb1d0.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/dnhkng_RYS-Llama-3.1-8B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "RYS-Llama-3.1-8B-Instruct", - "id": "dnhkng/RYS-Llama-3.1-8B-Instruct", - "developer": "dnhkng", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "?", - "params_billions": 8.685 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7685 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5164 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1329 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2676 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3681 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3639 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/dnhkng/RYS-Llama3.1-Large/37f20f86-40ba-4f63-b29d-efff6cb0e09b.json b/data/hfopenllm_v2/dnhkng/RYS-Llama3.1-Large/37f20f86-40ba-4f63-b29d-efff6cb0e09b.json deleted file mode 100644 index bd2270331..000000000 --- a/data/hfopenllm_v2/dnhkng/RYS-Llama3.1-Large/37f20f86-40ba-4f63-b29d-efff6cb0e09b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/dnhkng_RYS-Llama3.1-Large/1770682486.623709", - 
"retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "RYS-Llama3.1-Large", - "id": "dnhkng/RYS-Llama3.1-Large", - "developer": "dnhkng", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 81.677 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8492 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6899 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3505 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3742 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4554 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5249 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/dnhkng/RYS-Medium/bf0e7ce4-09e9-4879-993a-eb50b2a421d7.json b/data/hfopenllm_v2/dnhkng/RYS-Medium/bf0e7ce4-09e9-4879-993a-eb50b2a421d7.json deleted file mode 100644 index f208cf7e5..000000000 --- a/data/hfopenllm_v2/dnhkng/RYS-Medium/bf0e7ce4-09e9-4879-993a-eb50b2a421d7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/dnhkng_RYS-Medium/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "RYS-Medium", - "id": "dnhkng/RYS-Medium", - 
"developer": "dnhkng", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Phi3ForCausalLM", - "params_billions": 18.731 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4406 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6285 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.108 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.328 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4069 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4326 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/dnhkng/RYS-Phi-3-medium-4k-instruct/bcbc29f7-ea03-4dbe-a83e-d4940b2c6bea.json b/data/hfopenllm_v2/dnhkng/RYS-Phi-3-medium-4k-instruct/bcbc29f7-ea03-4dbe-a83e-d4940b2c6bea.json deleted file mode 100644 index e893c8a74..000000000 --- a/data/hfopenllm_v2/dnhkng/RYS-Phi-3-medium-4k-instruct/bcbc29f7-ea03-4dbe-a83e-d4940b2c6bea.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/dnhkng_RYS-Phi-3-medium-4k-instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "RYS-Phi-3-medium-4k-instruct", - "id": "dnhkng/RYS-Phi-3-medium-4k-instruct", - "developer": "dnhkng", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Phi3ForCausalLM", - "params_billions": 17.709 - } - }, - "evaluation_results": [ - 
{ - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4391 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6226 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1609 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3549 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4253 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4846 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/dnhkng/RYS-XLarge-base/cbea8d66-0370-4998-8e3a-06fef0a60f0c.json b/data/hfopenllm_v2/dnhkng/RYS-XLarge-base/cbea8d66-0370-4998-8e3a-06fef0a60f0c.json deleted file mode 100644 index 7bb7b0a97..000000000 --- a/data/hfopenllm_v2/dnhkng/RYS-XLarge-base/cbea8d66-0370-4998-8e3a-06fef0a60f0c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/dnhkng_RYS-XLarge-base/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "RYS-XLarge-base", - "id": "dnhkng/RYS-XLarge-base", - "developer": "dnhkng", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 77.972 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 
0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.791 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7047 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3792 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3792 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4903 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5431 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/dnhkng/RYS-XLarge/ca48b670-b82e-46cc-beb9-2fd0f11d3585.json b/data/hfopenllm_v2/dnhkng/RYS-XLarge/ca48b670-b82e-46cc-beb9-2fd0f11d3585.json deleted file mode 100644 index 8542415fb..000000000 --- a/data/hfopenllm_v2/dnhkng/RYS-XLarge/ca48b670-b82e-46cc-beb9-2fd0f11d3585.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/dnhkng_RYS-XLarge/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "RYS-XLarge", - "id": "dnhkng/RYS-XLarge", - "developer": "dnhkng", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 77.965 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7996 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.705 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4252 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3842 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.497 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5428 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/dnhkng/RYS-XLarge2/d37f99f7-f9c3-48b6-84d3-7da5d77f5030.json b/data/hfopenllm_v2/dnhkng/RYS-XLarge2/d37f99f7-f9c3-48b6-84d3-7da5d77f5030.json deleted file mode 100644 index c6ee9080f..000000000 --- a/data/hfopenllm_v2/dnhkng/RYS-XLarge2/d37f99f7-f9c3-48b6-84d3-7da5d77f5030.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/dnhkng_RYS-XLarge2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "RYS-XLarge2", - "id": "dnhkng/RYS-XLarge2", - "developer": "dnhkng", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 77.965 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4902 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6574 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - 
"metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2749 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3742 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4508 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5378 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/dreamgen/WizardLM-2-7B/503c8a24-4ced-4dca-b9df-5733ce89c2ca.json b/data/hfopenllm_v2/dreamgen/WizardLM-2-7B/503c8a24-4ced-4dca-b9df-5733ce89c2ca.json deleted file mode 100644 index ea7995a74..000000000 --- a/data/hfopenllm_v2/dreamgen/WizardLM-2-7B/503c8a24-4ced-4dca-b9df-5733ce89c2ca.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/dreamgen_WizardLM-2-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "WizardLM-2-7B", - "id": "dreamgen/WizardLM-2-7B", - "developer": "dreamgen", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4583 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3487 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0332 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - 
"dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2869 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3941 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.266 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/dustinwloring1988/Reflexis-8b-chat-v1/5c5283a0-819f-4112-bb90-5277423d9c00.json b/data/hfopenllm_v2/dustinwloring1988/Reflexis-8b-chat-v1/5c5283a0-819f-4112-bb90-5277423d9c00.json deleted file mode 100644 index c68eb92a9..000000000 --- a/data/hfopenllm_v2/dustinwloring1988/Reflexis-8b-chat-v1/5c5283a0-819f-4112-bb90-5277423d9c00.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/dustinwloring1988_Reflexis-8b-chat-v1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Reflexis-8b-chat-v1", - "id": "dustinwloring1988/Reflexis-8b-chat-v1", - "developer": "dustinwloring1988", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3658 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4664 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1156 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2542 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3754 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3384 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/dustinwloring1988/Reflexis-8b-chat-v2/b636bc82-1625-49b1-beec-cadaf4e1b1a9.json b/data/hfopenllm_v2/dustinwloring1988/Reflexis-8b-chat-v2/b636bc82-1625-49b1-beec-cadaf4e1b1a9.json deleted file mode 100644 index 9bbf76e4e..000000000 --- a/data/hfopenllm_v2/dustinwloring1988/Reflexis-8b-chat-v2/b636bc82-1625-49b1-beec-cadaf4e1b1a9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/dustinwloring1988_Reflexis-8b-chat-v2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Reflexis-8b-chat-v2", - "id": "dustinwloring1988/Reflexis-8b-chat-v2", - "developer": "dustinwloring1988", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3912 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4724 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1163 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2701 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - 
"source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3526 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3378 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/dustinwloring1988/Reflexis-8b-chat-v3/00f481c1-0ef0-40bd-bd95-81dc9443a62c.json b/data/hfopenllm_v2/dustinwloring1988/Reflexis-8b-chat-v3/00f481c1-0ef0-40bd-bd95-81dc9443a62c.json deleted file mode 100644 index 4ff73b25c..000000000 --- a/data/hfopenllm_v2/dustinwloring1988/Reflexis-8b-chat-v3/00f481c1-0ef0-40bd-bd95-81dc9443a62c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/dustinwloring1988_Reflexis-8b-chat-v3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Reflexis-8b-chat-v3", - "id": "dustinwloring1988/Reflexis-8b-chat-v3", - "developer": "dustinwloring1988", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5367 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4658 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1224 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2424 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3512 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3548 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/dustinwloring1988/Reflexis-8b-chat-v4/7ea22fef-2d79-49ae-bf72-9153a4e239c5.json b/data/hfopenllm_v2/dustinwloring1988/Reflexis-8b-chat-v4/7ea22fef-2d79-49ae-bf72-9153a4e239c5.json deleted file mode 100644 index 6e5776566..000000000 --- a/data/hfopenllm_v2/dustinwloring1988/Reflexis-8b-chat-v4/7ea22fef-2d79-49ae-bf72-9153a4e239c5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/dustinwloring1988_Reflexis-8b-chat-v4/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Reflexis-8b-chat-v4", - "id": "dustinwloring1988/Reflexis-8b-chat-v4", - "developer": "dustinwloring1988", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4698 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4686 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1027 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2341 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3393 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": 
"hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.339 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/dustinwloring1988/Reflexis-8b-chat-v5/64f441df-1781-4d01-b73b-2156413ad403.json b/data/hfopenllm_v2/dustinwloring1988/Reflexis-8b-chat-v5/64f441df-1781-4d01-b73b-2156413ad403.json deleted file mode 100644 index 6d8fa5edd..000000000 --- a/data/hfopenllm_v2/dustinwloring1988/Reflexis-8b-chat-v5/64f441df-1781-4d01-b73b-2156413ad403.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/dustinwloring1988_Reflexis-8b-chat-v5/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Reflexis-8b-chat-v5", - "id": "dustinwloring1988/Reflexis-8b-chat-v5", - "developer": "dustinwloring1988", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4238 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4782 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1216 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.271 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3354 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 
0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3217 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/dustinwloring1988/Reflexis-8b-chat-v6/4e3676eb-8607-416e-986a-7098bc192820.json b/data/hfopenllm_v2/dustinwloring1988/Reflexis-8b-chat-v6/4e3676eb-8607-416e-986a-7098bc192820.json deleted file mode 100644 index be5afc0ca..000000000 --- a/data/hfopenllm_v2/dustinwloring1988/Reflexis-8b-chat-v6/4e3676eb-8607-416e-986a-7098bc192820.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/dustinwloring1988_Reflexis-8b-chat-v6/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Reflexis-8b-chat-v6", - "id": "dustinwloring1988/Reflexis-8b-chat-v6", - "developer": "dustinwloring1988", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4939 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.481 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1299 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2626 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3753 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3479 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/dustinwloring1988/Reflexis-8b-chat-v7/2101369c-5042-48f3-a8f2-f9f56e7b6ae7.json b/data/hfopenllm_v2/dustinwloring1988/Reflexis-8b-chat-v7/2101369c-5042-48f3-a8f2-f9f56e7b6ae7.json deleted file mode 100644 index 601437128..000000000 --- a/data/hfopenllm_v2/dustinwloring1988/Reflexis-8b-chat-v7/2101369c-5042-48f3-a8f2-f9f56e7b6ae7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/dustinwloring1988_Reflexis-8b-chat-v7/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Reflexis-8b-chat-v7", - "id": "dustinwloring1988/Reflexis-8b-chat-v7", - "developer": "dustinwloring1988", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.398 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.481 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1631 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2617 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3222 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3643 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/duyhv1411/Llama-3.2-1B-en-vi/c4b86264-3725-4742-91f0-3e01f8d965a4.json b/data/hfopenllm_v2/duyhv1411/Llama-3.2-1B-en-vi/c4b86264-3725-4742-91f0-3e01f8d965a4.json deleted file mode 
100644 index f91722bb5..000000000 --- a/data/hfopenllm_v2/duyhv1411/Llama-3.2-1B-en-vi/c4b86264-3725-4742-91f0-3e01f8d965a4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/duyhv1411_Llama-3.2-1B-en-vi/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.2-1B-en-vi", - "id": "duyhv1411/Llama-3.2-1B-en-vi", - "developer": "duyhv1411", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.236 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4788 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3291 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0287 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2768 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3197 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1341 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/duyhv1411/Llama-3.2-3B-en-vi/0308147c-dabb-46bb-8add-d332fcd5a800.json b/data/hfopenllm_v2/duyhv1411/Llama-3.2-3B-en-vi/0308147c-dabb-46bb-8add-d332fcd5a800.json deleted file mode 100644 index 7c38df979..000000000 --- a/data/hfopenllm_v2/duyhv1411/Llama-3.2-3B-en-vi/0308147c-dabb-46bb-8add-d332fcd5a800.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/duyhv1411_Llama-3.2-3B-en-vi/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.2-3B-en-vi", - "id": "duyhv1411/Llama-3.2-3B-en-vi", - "developer": "duyhv1411", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.236 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4852 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3272 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0227 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2752 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.321 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1359 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/dwikitheduck/gemma-2-2b-id-inst/a9977a0d-e199-488a-a26e-6269806fdb2b.json b/data/hfopenllm_v2/dwikitheduck/gemma-2-2b-id-inst/a9977a0d-e199-488a-a26e-6269806fdb2b.json deleted file mode 100644 index ebf7ac8d4..000000000 --- a/data/hfopenllm_v2/dwikitheduck/gemma-2-2b-id-inst/a9977a0d-e199-488a-a26e-6269806fdb2b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/dwikitheduck_gemma-2-2b-id-inst/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging 
Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "gemma-2-2b-id-inst", - "id": "dwikitheduck/gemma-2-2b-id-inst", - "developer": "dwikitheduck", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 2.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3879 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3962 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0453 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2995 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4154 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2173 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/dwikitheduck/gemma-2-2b-id-instruct/56b89ec8-90c5-4e1e-a458-1bb8b5b92be8.json b/data/hfopenllm_v2/dwikitheduck/gemma-2-2b-id-instruct/56b89ec8-90c5-4e1e-a458-1bb8b5b92be8.json deleted file mode 100644 index 83e3a462d..000000000 --- a/data/hfopenllm_v2/dwikitheduck/gemma-2-2b-id-instruct/56b89ec8-90c5-4e1e-a458-1bb8b5b92be8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/dwikitheduck_gemma-2-2b-id-instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "gemma-2-2b-id-instruct", - "id": "dwikitheduck/gemma-2-2b-id-instruct", - "developer": "dwikitheduck", - "inference_platform": "unknown", 
- "additional_details": { - "precision": "float16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 2.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3879 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3962 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0453 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2995 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4154 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2173 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/dwikitheduck/gemma-2-2b-id/4185c376-91c6-435d-ae3b-47cd85151049.json b/data/hfopenllm_v2/dwikitheduck/gemma-2-2b-id/4185c376-91c6-435d-ae3b-47cd85151049.json deleted file mode 100644 index f7fe9743d..000000000 --- a/data/hfopenllm_v2/dwikitheduck/gemma-2-2b-id/4185c376-91c6-435d-ae3b-47cd85151049.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/dwikitheduck_gemma-2-2b-id/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "gemma-2-2b-id", - "id": "dwikitheduck/gemma-2-2b-id", - "developer": "dwikitheduck", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 2.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - 
"hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3879 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3962 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0453 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2995 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4154 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2173 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/dwikitheduck/gen-inst-1/26e45f5d-1e3d-425f-ba4d-b444dcda7f74.json b/data/hfopenllm_v2/dwikitheduck/gen-inst-1/26e45f5d-1e3d-425f-ba4d-b444dcda7f74.json deleted file mode 100644 index 0250810c2..000000000 --- a/data/hfopenllm_v2/dwikitheduck/gen-inst-1/26e45f5d-1e3d-425f-ba4d-b444dcda7f74.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/dwikitheduck_gen-inst-1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "gen-inst-1", - "id": "dwikitheduck/gen-inst-1", - "developer": "dwikitheduck", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.775 - } - }, - { - "evaluation_name": "BBH", 
- "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.642 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4554 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3716 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4205 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5089 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/dwikitheduck/gen-try1-notemp/09be48ce-61f8-4ba9-b082-b9c475fa714d.json b/data/hfopenllm_v2/dwikitheduck/gen-try1-notemp/09be48ce-61f8-4ba9-b082-b9c475fa714d.json deleted file mode 100644 index ce2a46bd9..000000000 --- a/data/hfopenllm_v2/dwikitheduck/gen-try1-notemp/09be48ce-61f8-4ba9-b082-b9c475fa714d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/dwikitheduck_gen-try1-notemp/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "gen-try1-notemp", - "id": "dwikitheduck/gen-try1-notemp", - "developer": "dwikitheduck", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2627 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 
0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6263 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.318 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.354 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4714 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.521 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/dwikitheduck/gen-try1/27417bcb-fb2f-41d2-9dfa-9865a36f38d5.json b/data/hfopenllm_v2/dwikitheduck/gen-try1/27417bcb-fb2f-41d2-9dfa-9865a36f38d5.json deleted file mode 100644 index e7929b7e4..000000000 --- a/data/hfopenllm_v2/dwikitheduck/gen-try1/27417bcb-fb2f-41d2-9dfa-9865a36f38d5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/dwikitheduck_gen-try1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "gen-try1", - "id": "dwikitheduck/gen-try1", - "developer": "dwikitheduck", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7522 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6359 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - 
"evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4101 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3414 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4416 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5111 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/dzakwan/dzakwan-MoE-4x7b-Beta/7b6fc3c2-a67d-450e-858c-fa87be122376.json b/data/hfopenllm_v2/dzakwan/dzakwan-MoE-4x7b-Beta/7b6fc3c2-a67d-450e-858c-fa87be122376.json deleted file mode 100644 index 1562aefaf..000000000 --- a/data/hfopenllm_v2/dzakwan/dzakwan-MoE-4x7b-Beta/7b6fc3c2-a67d-450e-858c-fa87be122376.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/dzakwan_dzakwan-MoE-4x7b-Beta/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "dzakwan-MoE-4x7b-Beta", - "id": "dzakwan/dzakwan-MoE-4x7b-Beta", - "developer": "dzakwan", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MixtralForCausalLM", - "params_billions": 24.154 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4443 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.514 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0778 - } - }, - { - "evaluation_name": "GPQA", - 
"source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2861 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4267 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3108 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ehristoforu/Falcon3-8B-Franken-Basestruct/76b86418-5450-48c6-ae56-58a19016d055.json b/data/hfopenllm_v2/ehristoforu/Falcon3-8B-Franken-Basestruct/76b86418-5450-48c6-ae56-58a19016d055.json deleted file mode 100644 index 8426beecc..000000000 --- a/data/hfopenllm_v2/ehristoforu/Falcon3-8B-Franken-Basestruct/76b86418-5450-48c6-ae56-58a19016d055.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ehristoforu_Falcon3-8B-Franken-Basestruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Falcon3-8B-Franken-Basestruct", - "id": "ehristoforu/Falcon3-8B-Franken-Basestruct", - "developer": "ehristoforu", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.406 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1715 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5463 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", 
- "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3406 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3555 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3947 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ehristoforu/Falcon3-MoE-2x7B-Insruct/e06594e4-899a-4285-b130-f7b605e5a6b9.json b/data/hfopenllm_v2/ehristoforu/Falcon3-MoE-2x7B-Insruct/e06594e4-899a-4285-b130-f7b605e5a6b9.json deleted file mode 100644 index 02842cc41..000000000 --- a/data/hfopenllm_v2/ehristoforu/Falcon3-MoE-2x7B-Insruct/e06594e4-899a-4285-b130-f7b605e5a6b9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ehristoforu_Falcon3-MoE-2x7B-Insruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Falcon3-MoE-2x7B-Insruct", - "id": "ehristoforu/Falcon3-MoE-2x7B-Insruct", - "developer": "ehristoforu", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MixtralForCausalLM", - "params_billions": 13.401 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7643 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5648 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4124 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3121 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { 
- "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.484 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4095 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ehristoforu/Gemma2-9B-it-psy10k-mental_health/9efdc773-a5c7-4709-88c8-96a67d84a742.json b/data/hfopenllm_v2/ehristoforu/Gemma2-9B-it-psy10k-mental_health/9efdc773-a5c7-4709-88c8-96a67d84a742.json deleted file mode 100644 index 4328adf7f..000000000 --- a/data/hfopenllm_v2/ehristoforu/Gemma2-9B-it-psy10k-mental_health/9efdc773-a5c7-4709-88c8-96a67d84a742.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ehristoforu_Gemma2-9B-it-psy10k-mental_health/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gemma2-9B-it-psy10k-mental_health", - "id": "ehristoforu/Gemma2-9B-it-psy10k-mental_health", - "developer": "ehristoforu", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 9.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5887 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5539 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1631 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3372 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on 
MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4086 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3829 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ehristoforu/Gemma2-9b-it-train6/1fcc2f96-afc9-403f-b82e-8e1804506582.json b/data/hfopenllm_v2/ehristoforu/Gemma2-9b-it-train6/1fcc2f96-afc9-403f-b82e-8e1804506582.json deleted file mode 100644 index b1b1e648f..000000000 --- a/data/hfopenllm_v2/ehristoforu/Gemma2-9b-it-train6/1fcc2f96-afc9-403f-b82e-8e1804506582.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ehristoforu_Gemma2-9b-it-train6/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gemma2-9b-it-train6", - "id": "ehristoforu/Gemma2-9b-it-train6", - "developer": "ehristoforu", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 9.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7025 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5898 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1911 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3289 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4084 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": 
"MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3942 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ehristoforu/HappyLlama1/bee1e134-9a43-441a-b977-522c510dd1ce.json b/data/hfopenllm_v2/ehristoforu/HappyLlama1/bee1e134-9a43-441a-b977-522c510dd1ce.json deleted file mode 100644 index ee8741019..000000000 --- a/data/hfopenllm_v2/ehristoforu/HappyLlama1/bee1e134-9a43-441a-b977-522c510dd1ce.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ehristoforu_HappyLlama1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "HappyLlama1", - "id": "ehristoforu/HappyLlama1", - "developer": "ehristoforu", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7363 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4996 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1427 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2836 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4287 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.3546 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ehristoforu/QwenQwen2.5-7B-IT-Dare/b70e1089-d136-4b2f-a253-f361bcf8cdcc.json b/data/hfopenllm_v2/ehristoforu/QwenQwen2.5-7B-IT-Dare/b70e1089-d136-4b2f-a253-f361bcf8cdcc.json deleted file mode 100644 index db80ae6ad..000000000 --- a/data/hfopenllm_v2/ehristoforu/QwenQwen2.5-7B-IT-Dare/b70e1089-d136-4b2f-a253-f361bcf8cdcc.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ehristoforu_QwenQwen2.5-7B-IT-Dare/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "QwenQwen2.5-7B-IT-Dare", - "id": "ehristoforu/QwenQwen2.5-7B-IT-Dare", - "developer": "ehristoforu", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.613 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7509 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5398 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5091 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3037 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4034 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4289 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ehristoforu/QwenQwen2.5-7B-IT/8b7e9c34-a982-4f4d-b5dc-66a12578601f.json 
b/data/hfopenllm_v2/ehristoforu/QwenQwen2.5-7B-IT/8b7e9c34-a982-4f4d-b5dc-66a12578601f.json deleted file mode 100644 index a51f16a50..000000000 --- a/data/hfopenllm_v2/ehristoforu/QwenQwen2.5-7B-IT/8b7e9c34-a982-4f4d-b5dc-66a12578601f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ehristoforu_QwenQwen2.5-7B-IT/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "QwenQwen2.5-7B-IT", - "id": "ehristoforu/QwenQwen2.5-7B-IT", - "developer": "ehristoforu", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.613 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7518 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5398 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5091 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3037 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4034 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4289 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ehristoforu/RQwen-v0.1/0ccc36d0-f546-46d1-91d3-15a40c7bf6c1.json b/data/hfopenllm_v2/ehristoforu/RQwen-v0.1/0ccc36d0-f546-46d1-91d3-15a40c7bf6c1.json deleted file mode 100644 index 552e8c521..000000000 --- a/data/hfopenllm_v2/ehristoforu/RQwen-v0.1/0ccc36d0-f546-46d1-91d3-15a40c7bf6c1.json +++ /dev/null @@ -1,132 +0,0 
@@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ehristoforu_RQwen-v0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "RQwen-v0.1", - "id": "ehristoforu/RQwen-v0.1", - "developer": "ehristoforu", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7625 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6446 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4645 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3255 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4139 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5202 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ehristoforu/RQwen-v0.2/066abe97-2c6c-4f3b-9e5e-e144f130258a.json b/data/hfopenllm_v2/ehristoforu/RQwen-v0.2/066abe97-2c6c-4f3b-9e5e-e144f130258a.json deleted file mode 100644 index 9ac5c419c..000000000 --- a/data/hfopenllm_v2/ehristoforu/RQwen-v0.2/066abe97-2c6c-4f3b-9e5e-e144f130258a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ehristoforu_RQwen-v0.2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging 
Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "RQwen-v0.2", - "id": "ehristoforu/RQwen-v0.2", - "developer": "ehristoforu", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7504 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6427 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.327 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3372 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4207 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5159 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ehristoforu/SoRu-0009/a3af8f77-d915-4482-a2b6-c99744aada4b.json b/data/hfopenllm_v2/ehristoforu/SoRu-0009/a3af8f77-d915-4482-a2b6-c99744aada4b.json deleted file mode 100644 index 1af53918b..000000000 --- a/data/hfopenllm_v2/ehristoforu/SoRu-0009/a3af8f77-d915-4482-a2b6-c99744aada4b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ehristoforu_SoRu-0009/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SoRu-0009", - "id": "ehristoforu/SoRu-0009", - "developer": "ehristoforu", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - 
"params_billions": 0.494 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2582 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.315 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0211 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2609 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3369 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1239 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ehristoforu/coolqwen-3b-it/82cc8b37-e242-441e-ac74-1662bcc0a0e2.json b/data/hfopenllm_v2/ehristoforu/coolqwen-3b-it/82cc8b37-e242-441e-ac74-1662bcc0a0e2.json deleted file mode 100644 index fcf5a776a..000000000 --- a/data/hfopenllm_v2/ehristoforu/coolqwen-3b-it/82cc8b37-e242-441e-ac74-1662bcc0a0e2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ehristoforu_coolqwen-3b-it/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "coolqwen-3b-it", - "id": "ehristoforu/coolqwen-3b-it", - "developer": "ehristoforu", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.085 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy 
on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6473 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4851 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3671 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2827 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4125 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3601 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ehristoforu/della-70b-test-v1/1527c8bc-c1ec-45f4-9663-4cffbb808f94.json b/data/hfopenllm_v2/ehristoforu/della-70b-test-v1/1527c8bc-c1ec-45f4-9663-4cffbb808f94.json deleted file mode 100644 index 367d47be6..000000000 --- a/data/hfopenllm_v2/ehristoforu/della-70b-test-v1/1527c8bc-c1ec-45f4-9663-4cffbb808f94.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ehristoforu_della-70b-test-v1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "della-70b-test-v1", - "id": "ehristoforu/della-70b-test-v1", - "developer": "ehristoforu", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 70.554 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4979 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - 
"source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3029 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0098 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2525 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4555 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1575 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ehristoforu/falcon3-ultraset/337b8ce8-d697-47f6-94ac-7a420dd7d91b.json b/data/hfopenllm_v2/ehristoforu/falcon3-ultraset/337b8ce8-d697-47f6-94ac-7a420dd7d91b.json deleted file mode 100644 index e2151cd4e..000000000 --- a/data/hfopenllm_v2/ehristoforu/falcon3-ultraset/337b8ce8-d697-47f6-94ac-7a420dd7d91b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ehristoforu_falcon3-ultraset/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "falcon3-ultraset", - "id": "ehristoforu/falcon3-ultraset", - "developer": "ehristoforu", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 7.456 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7135 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": 
{ - "score": 0.5584 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2122 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3322 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4853 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3982 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ehristoforu/fd-lora-merged-16x32/3d6ed2bb-5be7-4838-abb7-49754f9c3bfe.json b/data/hfopenllm_v2/ehristoforu/fd-lora-merged-16x32/3d6ed2bb-5be7-4838-abb7-49754f9c3bfe.json deleted file mode 100644 index af627d759..000000000 --- a/data/hfopenllm_v2/ehristoforu/fd-lora-merged-16x32/3d6ed2bb-5be7-4838-abb7-49754f9c3bfe.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ehristoforu_fd-lora-merged-16x32/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "fd-lora-merged-16x32", - "id": "ehristoforu/fd-lora-merged-16x32", - "developer": "ehristoforu", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.776 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3481 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3308 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { 
- "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1707 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2534 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3514 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1205 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ehristoforu/fd-lora-merged-64x128/0a6c7056-1bce-479e-84b0-f4eeea0bd3cc.json b/data/hfopenllm_v2/ehristoforu/fd-lora-merged-64x128/0a6c7056-1bce-479e-84b0-f4eeea0bd3cc.json deleted file mode 100644 index 185158297..000000000 --- a/data/hfopenllm_v2/ehristoforu/fd-lora-merged-64x128/0a6c7056-1bce-479e-84b0-f4eeea0bd3cc.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ehristoforu_fd-lora-merged-64x128/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "fd-lora-merged-64x128", - "id": "ehristoforu/fd-lora-merged-64x128", - "developer": "ehristoforu", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.777 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3281 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3345 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1873 - } - }, - { - 
"evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.255 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3368 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1537 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ehristoforu/fp4-14b-it-v1/3e236ad8-3828-407f-9076-743b465b8d15.json b/data/hfopenllm_v2/ehristoforu/fp4-14b-it-v1/3e236ad8-3828-407f-9076-743b465b8d15.json deleted file mode 100644 index cd4c75104..000000000 --- a/data/hfopenllm_v2/ehristoforu/fp4-14b-it-v1/3e236ad8-3828-407f-9076-743b465b8d15.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ehristoforu_fp4-14b-it-v1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "fp4-14b-it-v1", - "id": "ehristoforu/fp4-14b-it-v1", - "developer": "ehristoforu", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2535 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.574 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0408 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2953 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3595 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4205 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ehristoforu/fp4-14b-v1-fix/9e90dcdf-ce2a-4a7c-8b89-6af8b7c2bcfe.json b/data/hfopenllm_v2/ehristoforu/fp4-14b-v1-fix/9e90dcdf-ce2a-4a7c-8b89-6af8b7c2bcfe.json deleted file mode 100644 index ba50dee2a..000000000 --- a/data/hfopenllm_v2/ehristoforu/fp4-14b-v1-fix/9e90dcdf-ce2a-4a7c-8b89-6af8b7c2bcfe.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ehristoforu_fp4-14b-v1-fix/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "fp4-14b-v1-fix", - "id": "ehristoforu/fp4-14b-v1-fix", - "developer": "ehristoforu", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6742 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6817 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4207 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.354 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4532 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5353 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ehristoforu/fq2.5-7b-it-normalize_false/940d88e9-085b-4065-b8c8-92ebe685deb0.json b/data/hfopenllm_v2/ehristoforu/fq2.5-7b-it-normalize_false/940d88e9-085b-4065-b8c8-92ebe685deb0.json deleted file mode 100644 index 06d2d501b..000000000 --- a/data/hfopenllm_v2/ehristoforu/fq2.5-7b-it-normalize_false/940d88e9-085b-4065-b8c8-92ebe685deb0.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ehristoforu_fq2.5-7b-it-normalize_false/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "fq2.5-7b-it-normalize_false", - "id": "ehristoforu/fq2.5-7b-it-normalize_false", - "developer": "ehristoforu", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7399 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.552 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4622 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.302 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4612 - 
} - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4413 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ehristoforu/fq2.5-7b-it-normalize_true/7fdcd616-2c72-4c44-9646-9c32344bfa0b.json b/data/hfopenllm_v2/ehristoforu/fq2.5-7b-it-normalize_true/7fdcd616-2c72-4c44-9646-9c32344bfa0b.json deleted file mode 100644 index 0dfb08d8b..000000000 --- a/data/hfopenllm_v2/ehristoforu/fq2.5-7b-it-normalize_true/7fdcd616-2c72-4c44-9646-9c32344bfa0b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ehristoforu_fq2.5-7b-it-normalize_true/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "fq2.5-7b-it-normalize_true", - "id": "ehristoforu/fq2.5-7b-it-normalize_true", - "developer": "ehristoforu", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7399 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.552 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4622 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.302 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4612 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4413 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ehristoforu/frqwen2.5-from7b-duable4layers-it/9d358f55-810c-4ac1-adc7-83f95bd74c11.json b/data/hfopenllm_v2/ehristoforu/frqwen2.5-from7b-duable4layers-it/9d358f55-810c-4ac1-adc7-83f95bd74c11.json deleted file mode 100644 index 073ffd258..000000000 --- a/data/hfopenllm_v2/ehristoforu/frqwen2.5-from7b-duable4layers-it/9d358f55-810c-4ac1-adc7-83f95bd74c11.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ehristoforu_frqwen2.5-from7b-duable4layers-it/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "frqwen2.5-from7b-duable4layers-it", - "id": "ehristoforu/frqwen2.5-from7b-duable4layers-it", - "developer": "ehristoforu", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 8.545 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7729 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5264 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4509 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2953 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4166 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, 
- "score_details": { - "score": 0.4126 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ehristoforu/frqwen2.5-from7b-it/9ba3fe31-772a-4cf7-aa13-3680b6ad51ba.json b/data/hfopenllm_v2/ehristoforu/frqwen2.5-from7b-it/9ba3fe31-772a-4cf7-aa13-3680b6ad51ba.json deleted file mode 100644 index 61c9293ae..000000000 --- a/data/hfopenllm_v2/ehristoforu/frqwen2.5-from7b-it/9ba3fe31-772a-4cf7-aa13-3680b6ad51ba.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ehristoforu_frqwen2.5-from7b-it/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "frqwen2.5-from7b-it", - "id": "ehristoforu/frqwen2.5-from7b-it", - "developer": "ehristoforu", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 13.206 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6532 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5143 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2923 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2903 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4086 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3977 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ehristoforu/mllama-3.1-8b-instruct/651a32b1-77fb-4acf-89bf-2d45b684944d.json 
b/data/hfopenllm_v2/ehristoforu/mllama-3.1-8b-instruct/651a32b1-77fb-4acf-89bf-2d45b684944d.json deleted file mode 100644 index 364558711..000000000 --- a/data/hfopenllm_v2/ehristoforu/mllama-3.1-8b-instruct/651a32b1-77fb-4acf-89bf-2d45b684944d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ehristoforu_mllama-3.1-8b-instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "mllama-3.1-8b-instruct", - "id": "ehristoforu/mllama-3.1-8b-instruct", - "developer": "ehristoforu", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3458 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4718 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3776 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2701 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.338 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2533 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ehristoforu/mllama-3.1-8b-it/192c4037-753a-4790-80d0-33c4d277102d.json b/data/hfopenllm_v2/ehristoforu/mllama-3.1-8b-it/192c4037-753a-4790-80d0-33c4d277102d.json deleted file mode 100644 index 601171444..000000000 --- 
a/data/hfopenllm_v2/ehristoforu/mllama-3.1-8b-it/192c4037-753a-4790-80d0-33c4d277102d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ehristoforu_mllama-3.1-8b-it/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "mllama-3.1-8b-it", - "id": "ehristoforu/mllama-3.1-8b-it", - "developer": "ehristoforu", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3879 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4868 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3799 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2768 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3349 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2622 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ehristoforu/moremerge-upscaled/679d66bf-244e-4080-9a42-0a0c6cfdc965.json b/data/hfopenllm_v2/ehristoforu/moremerge-upscaled/679d66bf-244e-4080-9a42-0a0c6cfdc965.json deleted file mode 100644 index 62db8ac19..000000000 --- a/data/hfopenllm_v2/ehristoforu/moremerge-upscaled/679d66bf-244e-4080-9a42-0a0c6cfdc965.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ehristoforu_moremerge-upscaled/1770682486.623709", - 
"retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "moremerge-upscaled", - "id": "ehristoforu/moremerge-upscaled", - "developer": "ehristoforu", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 8.545 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1979 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2698 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2466 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3593 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1041 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ehristoforu/moremerge/73b0ca8a-fb16-43eb-a9af-a01219cf6196.json b/data/hfopenllm_v2/ehristoforu/moremerge/73b0ca8a-fb16-43eb-a9af-a01219cf6196.json deleted file mode 100644 index f1591edbb..000000000 --- a/data/hfopenllm_v2/ehristoforu/moremerge/73b0ca8a-fb16-43eb-a9af-a01219cf6196.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ehristoforu_moremerge/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "moremerge", - "id": 
"ehristoforu/moremerge", - "developer": "ehristoforu", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.613 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2019 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2868 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2601 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3566 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1065 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ehristoforu/phi-4-25b/7f00ecbc-fcc8-43ae-867b-cb160e63a80c.json b/data/hfopenllm_v2/ehristoforu/phi-4-25b/7f00ecbc-fcc8-43ae-867b-cb160e63a80c.json deleted file mode 100644 index 524b66617..000000000 --- a/data/hfopenllm_v2/ehristoforu/phi-4-25b/7f00ecbc-fcc8-43ae-867b-cb160e63a80c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ehristoforu_phi-4-25b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "phi-4-25b", - "id": "ehristoforu/phi-4-25b", - "developer": "ehristoforu", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Phi3ForCausalLM", - "params_billions": 24.883 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - 
"dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6484 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6908 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4524 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3188 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4208 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5351 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ehristoforu/qwen2.5-test-32b-it/a8238bd4-3982-4e45-92e4-bab77e528e29.json b/data/hfopenllm_v2/ehristoforu/qwen2.5-test-32b-it/a8238bd4-3982-4e45-92e4-bab77e528e29.json deleted file mode 100644 index e044dab4c..000000000 --- a/data/hfopenllm_v2/ehristoforu/qwen2.5-test-32b-it/a8238bd4-3982-4e45-92e4-bab77e528e29.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ehristoforu_qwen2.5-test-32b-it/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "qwen2.5-test-32b-it", - "id": "ehristoforu/qwen2.5-test-32b-it", - "developer": "ehristoforu", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 32.764 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 
0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7889 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7081 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5974 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3641 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4578 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5765 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ehristoforu/qwen2.5-with-lora-think-3b-it/f87f9f08-e989-4e99-a254-a3650e7ab1b6.json b/data/hfopenllm_v2/ehristoforu/qwen2.5-with-lora-think-3b-it/f87f9f08-e989-4e99-a254-a3650e7ab1b6.json deleted file mode 100644 index c5897513d..000000000 --- a/data/hfopenllm_v2/ehristoforu/qwen2.5-with-lora-think-3b-it/f87f9f08-e989-4e99-a254-a3650e7ab1b6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ehristoforu_qwen2.5-with-lora-think-3b-it/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "qwen2.5-with-lora-think-3b-it", - "id": "ehristoforu/qwen2.5-with-lora-think-3b-it", - "developer": "ehristoforu", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.086 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5319 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": 
"hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4687 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2364 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2802 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.431 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3403 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ehristoforu/rmoe-v1/f40496a9-fb14-4b2d-8070-84f55e6417f6.json b/data/hfopenllm_v2/ehristoforu/rmoe-v1/f40496a9-fb14-4b2d-8070-84f55e6417f6.json deleted file mode 100644 index 622169295..000000000 --- a/data/hfopenllm_v2/ehristoforu/rmoe-v1/f40496a9-fb14-4b2d-8070-84f55e6417f6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ehristoforu_rmoe-v1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "rmoe-v1", - "id": "ehristoforu/rmoe-v1", - "developer": "ehristoforu", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2MoeForCausalLM", - "params_billions": 11.026 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.265 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2929 - } - }, - { - "evaluation_name": "MATH Level 
5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0015 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2584 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3663 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1125 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ehristoforu/rufalcon3-3b-it/cc52f59d-5669-44b0-b1af-e6fd0836e284.json b/data/hfopenllm_v2/ehristoforu/rufalcon3-3b-it/cc52f59d-5669-44b0-b1af-e6fd0836e284.json deleted file mode 100644 index 17d1c2ff2..000000000 --- a/data/hfopenllm_v2/ehristoforu/rufalcon3-3b-it/cc52f59d-5669-44b0-b1af-e6fd0836e284.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ehristoforu_rufalcon3-3b-it/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "rufalcon3-3b-it", - "id": "ehristoforu/rufalcon3-3b-it", - "developer": "ehristoforu", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.228 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5942 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4155 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1782 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2727 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3895 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2348 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ehristoforu/ruphi-4b/67525a37-f658-40e8-89a1-de8bf6275a00.json b/data/hfopenllm_v2/ehristoforu/ruphi-4b/67525a37-f658-40e8-89a1-de8bf6275a00.json deleted file mode 100644 index 9d1d684fd..000000000 --- a/data/hfopenllm_v2/ehristoforu/ruphi-4b/67525a37-f658-40e8-89a1-de8bf6275a00.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ehristoforu_ruphi-4b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ruphi-4b", - "id": "ehristoforu/ruphi-4b", - "developer": "ehristoforu", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Phi3ForCausalLM", - "params_billions": 3.821 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1752 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2906 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2399 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3512 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1126 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ehristoforu/testq-32b/3cb34886-7a93-42b9-a8fa-fab5f4bd8624.json b/data/hfopenllm_v2/ehristoforu/testq-32b/3cb34886-7a93-42b9-a8fa-fab5f4bd8624.json deleted file mode 100644 index 50e07975c..000000000 --- a/data/hfopenllm_v2/ehristoforu/testq-32b/3cb34886-7a93-42b9-a8fa-fab5f4bd8624.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ehristoforu_testq-32b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "testq-32b", - "id": "ehristoforu/testq-32b", - "developer": "ehristoforu", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 56.165 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1876 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2877 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.003 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2542 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": 
"hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3715 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1166 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ehristoforu/tmoe-v2/0dd1f9fc-cf54-47ff-8ccd-148b45f3c921.json b/data/hfopenllm_v2/ehristoforu/tmoe-v2/0dd1f9fc-cf54-47ff-8ccd-148b45f3c921.json deleted file mode 100644 index ed11e623d..000000000 --- a/data/hfopenllm_v2/ehristoforu/tmoe-v2/0dd1f9fc-cf54-47ff-8ccd-148b45f3c921.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ehristoforu_tmoe-v2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "tmoe-v2", - "id": "ehristoforu/tmoe-v2", - "developer": "ehristoforu", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2MoeForCausalLM", - "params_billions": 11.026 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1903 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2897 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0023 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2634 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4151 - } - }, - { - "evaluation_name": "MMLU-PRO", 
- "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.11 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ehristoforu/tmoe/7a05616e-7335-419a-914d-00fb287fe663.json b/data/hfopenllm_v2/ehristoforu/tmoe/7a05616e-7335-419a-914d-00fb287fe663.json deleted file mode 100644 index d62cd4fdb..000000000 --- a/data/hfopenllm_v2/ehristoforu/tmoe/7a05616e-7335-419a-914d-00fb287fe663.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ehristoforu_tmoe/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "tmoe", - "id": "ehristoforu/tmoe", - "developer": "ehristoforu", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2MoeForCausalLM", - "params_billions": 11.026 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1193 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3073 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0076 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2232 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3699 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.1191 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ehristoforu/trd-7b-it/070a21b5-4cd3-41b7-9653-0d2d2e4f273d.json b/data/hfopenllm_v2/ehristoforu/trd-7b-it/070a21b5-4cd3-41b7-9653-0d2d2e4f273d.json deleted file mode 100644 index bf0489323..000000000 --- a/data/hfopenllm_v2/ehristoforu/trd-7b-it/070a21b5-4cd3-41b7-9653-0d2d2e4f273d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ehristoforu_trd-7b-it/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "trd-7b-it", - "id": "ehristoforu/trd-7b-it", - "developer": "ehristoforu", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.613 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2185 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.299 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0317 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2701 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3794 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1179 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ehristoforu/ud-14b/5afc044a-3138-443f-89cf-74f1272cc632.json b/data/hfopenllm_v2/ehristoforu/ud-14b/5afc044a-3138-443f-89cf-74f1272cc632.json deleted file mode 100644 index 5bff4a74e..000000000 --- 
a/data/hfopenllm_v2/ehristoforu/ud-14b/5afc044a-3138-443f-89cf-74f1272cc632.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ehristoforu_ud-14b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ud-14b", - "id": "ehristoforu/ud-14b", - "developer": "ehristoforu", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4235 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3324 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1903 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2374 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4394 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2415 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/elinas/Chronos-Gold-12B-1.0/a6c1d914-647c-46b7-b0e1-712b8d506780.json b/data/hfopenllm_v2/elinas/Chronos-Gold-12B-1.0/a6c1d914-647c-46b7-b0e1-712b8d506780.json deleted file mode 100644 index 1bd60921e..000000000 --- a/data/hfopenllm_v2/elinas/Chronos-Gold-12B-1.0/a6c1d914-647c-46b7-b0e1-712b8d506780.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/elinas_Chronos-Gold-12B-1.0/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - 
"source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Chronos-Gold-12B-1.0", - "id": "elinas/Chronos-Gold-12B-1.0", - "developer": "elinas", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3166 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5515 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0695 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.318 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.474 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3518 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ell44ot/gemma-2b-def/43f35eac-0946-42f9-a128-eb8011c29588.json b/data/hfopenllm_v2/ell44ot/gemma-2b-def/43f35eac-0946-42f9-a128-eb8011c29588.json deleted file mode 100644 index 07020d3eb..000000000 --- a/data/hfopenllm_v2/ell44ot/gemma-2b-def/43f35eac-0946-42f9-a128-eb8011c29588.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ell44ot_gemma-2b-def/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "gemma-2b-def", - "id": "ell44ot/gemma-2b-def", - "developer": "ell44ot", - 
"inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "GemmaModel", - "params_billions": 1.546 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2693 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3159 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0242 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2735 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.367 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1572 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/euclaise/ReMask-3B/04c22be7-2cf4-4774-b479-863199c7c3a4.json b/data/hfopenllm_v2/euclaise/ReMask-3B/04c22be7-2cf4-4774-b479-863199c7c3a4.json deleted file mode 100644 index f550f9a02..000000000 --- a/data/hfopenllm_v2/euclaise/ReMask-3B/04c22be7-2cf4-4774-b479-863199c7c3a4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/euclaise_ReMask-3B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ReMask-3B", - "id": "euclaise/ReMask-3B", - "developer": "euclaise", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "StableLmForCausalLM", - "params_billions": 2.795 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": 
"google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2419 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3517 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0196 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2668 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3341 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1357 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/eworojoshua/vas-01/fc3d436b-ec61-4458-a3c6-1df41057ea70.json b/data/hfopenllm_v2/eworojoshua/vas-01/fc3d436b-ec61-4458-a3c6-1df41057ea70.json deleted file mode 100644 index d331211d6..000000000 --- a/data/hfopenllm_v2/eworojoshua/vas-01/fc3d436b-ec61-4458-a3c6-1df41057ea70.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/eworojoshua_vas-01/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "vas-01", - "id": "eworojoshua/vas-01", - "developer": "eworojoshua", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7612 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": 
"BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5418 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4736 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3096 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4432 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4348 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ewre324/Thinker-Llama-3.2-3B-Instruct-Reasoning/e3ed157f-f306-40fb-b3a1-d3434236759e.json b/data/hfopenllm_v2/ewre324/Thinker-Llama-3.2-3B-Instruct-Reasoning/e3ed157f-f306-40fb-b3a1-d3434236759e.json deleted file mode 100644 index 99fd3f7c5..000000000 --- a/data/hfopenllm_v2/ewre324/Thinker-Llama-3.2-3B-Instruct-Reasoning/e3ed157f-f306-40fb-b3a1-d3434236759e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ewre324_Thinker-Llama-3.2-3B-Instruct-Reasoning/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Thinker-Llama-3.2-3B-Instruct-Reasoning", - "id": "ewre324/Thinker-Llama-3.2-3B-Instruct-Reasoning", - "developer": "ewre324", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4439 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on 
BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4273 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0846 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2768 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3655 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2886 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ewre324/Thinker-Qwen2.5-0.5B-Instruct-Reasoning/8793b3e3-f409-499a-81f8-c250c8092841.json b/data/hfopenllm_v2/ewre324/Thinker-Qwen2.5-0.5B-Instruct-Reasoning/8793b3e3-f409-499a-81f8-c250c8092841.json deleted file mode 100644 index eba4ef4a3..000000000 --- a/data/hfopenllm_v2/ewre324/Thinker-Qwen2.5-0.5B-Instruct-Reasoning/8793b3e3-f409-499a-81f8-c250c8092841.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ewre324_Thinker-Qwen2.5-0.5B-Instruct-Reasoning/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Thinker-Qwen2.5-0.5B-Instruct-Reasoning", - "id": "ewre324/Thinker-Qwen2.5-0.5B-Instruct-Reasoning", - "developer": "ewre324", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.494 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2476 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 
0.3292 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0287 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2852 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3382 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1647 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ewre324/Thinker-SmolLM2-135M-Instruct-Reasoning/33572f63-15ba-4fbc-b1cf-56b978384d02.json b/data/hfopenllm_v2/ewre324/Thinker-SmolLM2-135M-Instruct-Reasoning/33572f63-15ba-4fbc-b1cf-56b978384d02.json deleted file mode 100644 index 3408e328d..000000000 --- a/data/hfopenllm_v2/ewre324/Thinker-SmolLM2-135M-Instruct-Reasoning/33572f63-15ba-4fbc-b1cf-56b978384d02.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ewre324_Thinker-SmolLM2-135M-Instruct-Reasoning/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Thinker-SmolLM2-135M-Instruct-Reasoning", - "id": "ewre324/Thinker-SmolLM2-135M-Instruct-Reasoning", - "developer": "ewre324", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 0.135 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2584 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3071 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", 
- "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0091 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2525 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3661 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1094 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ewre324/ewre324-R1-SmolLM2-135M-Distill/44c636ba-8303-4d75-bcb5-46e3c07a991a.json b/data/hfopenllm_v2/ewre324/ewre324-R1-SmolLM2-135M-Distill/44c636ba-8303-4d75-bcb5-46e3c07a991a.json deleted file mode 100644 index 5cd36efa3..000000000 --- a/data/hfopenllm_v2/ewre324/ewre324-R1-SmolLM2-135M-Distill/44c636ba-8303-4d75-bcb5-46e3c07a991a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ewre324_ewre324-R1-SmolLM2-135M-Distill/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ewre324-R1-SmolLM2-135M-Distill", - "id": "ewre324/ewre324-R1-SmolLM2-135M-Distill", - "developer": "ewre324", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 0.135 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1649 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3042 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0128 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2617 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3409 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1134 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/experiment-llm/exp-3-q-r/0a002444-3e5a-4fc8-acc6-72210a4181a9.json b/data/hfopenllm_v2/experiment-llm/exp-3-q-r/0a002444-3e5a-4fc8-acc6-72210a4181a9.json deleted file mode 100644 index 3553a3e14..000000000 --- a/data/hfopenllm_v2/experiment-llm/exp-3-q-r/0a002444-3e5a-4fc8-acc6-72210a4181a9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/experiment-llm_exp-3-q-r/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "exp-3-q-r", - "id": "experiment-llm/exp-3-q-r", - "developer": "experiment-llm", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6036 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5397 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2787 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2936 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4315 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4316 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/facebook/opt-1.3b/bbf936a5-3594-4d0a-b5af-7a01740d0c81.json b/data/hfopenllm_v2/facebook/opt-1.3b/bbf936a5-3594-4d0a-b5af-7a01740d0c81.json deleted file mode 100644 index fc192890e..000000000 --- a/data/hfopenllm_v2/facebook/opt-1.3b/bbf936a5-3594-4d0a-b5af-7a01740d0c81.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/facebook_opt-1.3b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "opt-1.3b", - "id": "facebook/opt-1.3b", - "developer": "facebook", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "OPTForCausalLM", - "params_billions": 1.3 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2383 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3094 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0091 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2424 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": 
"TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.342 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1107 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/facebook/opt-30b/1164abea-4cc2-46a7-a44b-f024a2ce40b4.json b/data/hfopenllm_v2/facebook/opt-30b/1164abea-4cc2-46a7-a44b-f024a2ce40b4.json deleted file mode 100644 index d05075cf6..000000000 --- a/data/hfopenllm_v2/facebook/opt-30b/1164abea-4cc2-46a7-a44b-f024a2ce40b4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/facebook_opt-30b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "opt-30b", - "id": "facebook/opt-30b", - "developer": "facebook", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "OPTForCausalLM", - "params_billions": 30.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2453 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.307 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0106 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2693 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3604 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - 
"source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1164 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/failspy/Llama-3-8B-Instruct-MopeyMule/bfd88bec-fcc2-4580-a5c7-4792a0300a5b.json b/data/hfopenllm_v2/failspy/Llama-3-8B-Instruct-MopeyMule/bfd88bec-fcc2-4580-a5c7-4792a0300a5b.json deleted file mode 100644 index c959c146e..000000000 --- a/data/hfopenllm_v2/failspy/Llama-3-8B-Instruct-MopeyMule/bfd88bec-fcc2-4580-a5c7-4792a0300a5b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/failspy_Llama-3-8B-Instruct-MopeyMule/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-8B-Instruct-MopeyMule", - "id": "failspy/Llama-3-8B-Instruct-MopeyMule", - "developer": "failspy", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.675 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3839 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0196 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2391 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3513 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1764 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/failspy/Llama-3-8B-Instruct-abliterated/7f49e582-a01f-481f-8345-1c384fc8b567.json b/data/hfopenllm_v2/failspy/Llama-3-8B-Instruct-abliterated/7f49e582-a01f-481f-8345-1c384fc8b567.json deleted file mode 100644 index b05984172..000000000 --- a/data/hfopenllm_v2/failspy/Llama-3-8B-Instruct-abliterated/7f49e582-a01f-481f-8345-1c384fc8b567.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/failspy_Llama-3-8B-Instruct-abliterated/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-8B-Instruct-abliterated", - "id": "failspy/Llama-3-8B-Instruct-abliterated", - "developer": "failspy", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5909 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4354 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0385 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.276 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4116 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2742 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/failspy/Meta-Llama-3-70B-Instruct-abliterated-v3.5/10937ed1-56e2-4aad-b717-5125bc8ac72a.json b/data/hfopenllm_v2/failspy/Meta-Llama-3-70B-Instruct-abliterated-v3.5/10937ed1-56e2-4aad-b717-5125bc8ac72a.json deleted file mode 100644 index 794c2e176..000000000 --- a/data/hfopenllm_v2/failspy/Meta-Llama-3-70B-Instruct-abliterated-v3.5/10937ed1-56e2-4aad-b717-5125bc8ac72a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/failspy_Meta-Llama-3-70B-Instruct-abliterated-v3.5/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Meta-Llama-3-70B-Instruct-abliterated-v3.5", - "id": "failspy/Meta-Llama-3-70B-Instruct-abliterated-v3.5", - "developer": "failspy", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 70.554 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7747 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5747 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1284 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.297 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3982 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4452 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/failspy/Meta-Llama-3-8B-Instruct-abliterated-v3/f4622539-c0ac-4e9f-86d4-00e3c826d03b.json 
b/data/hfopenllm_v2/failspy/Meta-Llama-3-8B-Instruct-abliterated-v3/f4622539-c0ac-4e9f-86d4-00e3c826d03b.json deleted file mode 100644 index eb2bf7ad8..000000000 --- a/data/hfopenllm_v2/failspy/Meta-Llama-3-8B-Instruct-abliterated-v3/f4622539-c0ac-4e9f-86d4-00e3c826d03b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/failspy_Meta-Llama-3-8B-Instruct-abliterated-v3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Meta-Llama-3-8B-Instruct-abliterated-v3", - "id": "failspy/Meta-Llama-3-8B-Instruct-abliterated-v3", - "developer": "failspy", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7245 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4925 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0959 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2643 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3622 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3654 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/failspy/Phi-3-medium-4k-instruct-abliterated-v3/6b13b2b1-68cd-4aae-8f2b-2400f40760d7.json b/data/hfopenllm_v2/failspy/Phi-3-medium-4k-instruct-abliterated-v3/6b13b2b1-68cd-4aae-8f2b-2400f40760d7.json deleted file mode 100644 index 
8fbcaddc2..000000000 --- a/data/hfopenllm_v2/failspy/Phi-3-medium-4k-instruct-abliterated-v3/6b13b2b1-68cd-4aae-8f2b-2400f40760d7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/failspy_Phi-3-medium-4k-instruct-abliterated-v3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Phi-3-medium-4k-instruct-abliterated-v3", - "id": "failspy/Phi-3-medium-4k-instruct-abliterated-v3", - "developer": "failspy", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Phi3ForCausalLM", - "params_billions": 13.96 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6319 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6305 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1594 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3171 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4604 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.44 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/failspy/llama-3-70B-Instruct-abliterated/5b02726c-ba3f-482b-9f10-87b8d69ffeb4.json b/data/hfopenllm_v2/failspy/llama-3-70B-Instruct-abliterated/5b02726c-ba3f-482b-9f10-87b8d69ffeb4.json deleted file mode 100644 index c9da5b3a7..000000000 --- a/data/hfopenllm_v2/failspy/llama-3-70B-Instruct-abliterated/5b02726c-ba3f-482b-9f10-87b8d69ffeb4.json +++ /dev/null @@ -1,132 +0,0 
@@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/failspy_llama-3-70B-Instruct-abliterated/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "llama-3-70B-Instruct-abliterated", - "id": "failspy/llama-3-70B-Instruct-abliterated", - "developer": "failspy", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 70.554 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8023 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6465 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2432 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2894 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4128 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5145 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/fblgit/TheBeagle-v2beta-32B-MGS/21d6f2dd-7bd6-42a9-b14e-c25777497890.json b/data/hfopenllm_v2/fblgit/TheBeagle-v2beta-32B-MGS/21d6f2dd-7bd6-42a9-b14e-c25777497890.json deleted file mode 100644 index bfc6fcad1..000000000 --- a/data/hfopenllm_v2/fblgit/TheBeagle-v2beta-32B-MGS/21d6f2dd-7bd6-42a9-b14e-c25777497890.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/fblgit_TheBeagle-v2beta-32B-MGS/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": 
"HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "TheBeagle-v2beta-32B-MGS", - "id": "fblgit/TheBeagle-v2beta-32B-MGS", - "developer": "fblgit", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 32.764 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5181 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7033 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4947 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3826 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5008 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5915 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/fblgit/TheBeagle-v2beta-32B-MGS/d0bc11cb-56ff-4c77-9446-e76e550e0919.json b/data/hfopenllm_v2/fblgit/TheBeagle-v2beta-32B-MGS/d0bc11cb-56ff-4c77-9446-e76e550e0919.json deleted file mode 100644 index 6080ab1e6..000000000 --- a/data/hfopenllm_v2/fblgit/TheBeagle-v2beta-32B-MGS/d0bc11cb-56ff-4c77-9446-e76e550e0919.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/fblgit_TheBeagle-v2beta-32B-MGS/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "TheBeagle-v2beta-32B-MGS", - "id": 
"fblgit/TheBeagle-v2beta-32B-MGS", - "developer": "fblgit", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 32.764 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4503 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7035 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3943 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.401 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5021 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5911 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/fblgit/UNA-SimpleSmaug-34b-v1beta/ff78dc97-e9cf-4215-a607-3e80892af82c.json b/data/hfopenllm_v2/fblgit/UNA-SimpleSmaug-34b-v1beta/ff78dc97-e9cf-4215-a607-3e80892af82c.json deleted file mode 100644 index a5cda2338..000000000 --- a/data/hfopenllm_v2/fblgit/UNA-SimpleSmaug-34b-v1beta/ff78dc97-e9cf-4215-a607-3e80892af82c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/fblgit_UNA-SimpleSmaug-34b-v1beta/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "UNA-SimpleSmaug-34b-v1beta", - "id": "fblgit/UNA-SimpleSmaug-34b-v1beta", - "developer": "fblgit", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 34.389 - } - }, - 
"evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4556 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5287 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0718 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3171 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4256 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.454 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/fblgit/UNA-TheBeagle-7b-v1/0ff1c6ff-5404-4d61-b6c6-f6ef7ae9ca8b.json b/data/hfopenllm_v2/fblgit/UNA-TheBeagle-7b-v1/0ff1c6ff-5404-4d61-b6c6-f6ef7ae9ca8b.json deleted file mode 100644 index 4d873390f..000000000 --- a/data/hfopenllm_v2/fblgit/UNA-TheBeagle-7b-v1/0ff1c6ff-5404-4d61-b6c6-f6ef7ae9ca8b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/fblgit_UNA-TheBeagle-7b-v1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "UNA-TheBeagle-7b-v1", - "id": "fblgit/UNA-TheBeagle-7b-v1", - "developer": "fblgit", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": 
false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3689 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5029 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.077 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2844 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4564 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3019 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/fblgit/UNA-ThePitbull-21.4B-v2/48837141-2556-4658-87e0-bb88cfcd562a.json b/data/hfopenllm_v2/fblgit/UNA-ThePitbull-21.4B-v2/48837141-2556-4658-87e0-bb88cfcd562a.json deleted file mode 100644 index a60c55895..000000000 --- a/data/hfopenllm_v2/fblgit/UNA-ThePitbull-21.4B-v2/48837141-2556-4658-87e0-bb88cfcd562a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/fblgit_UNA-ThePitbull-21.4B-v2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "UNA-ThePitbull-21.4B-v2", - "id": "fblgit/UNA-ThePitbull-21.4B-v2", - "developer": "fblgit", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 21.421 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.379 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - 
"hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.635 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1216 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.302 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3922 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3516 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/fblgit/cybertron-v4-qw7B-MGS/f2d6da5d-3685-43de-8ceb-5b798f88e24c.json b/data/hfopenllm_v2/fblgit/cybertron-v4-qw7B-MGS/f2d6da5d-3685-43de-8ceb-5b798f88e24c.json deleted file mode 100644 index 8b3f04151..000000000 --- a/data/hfopenllm_v2/fblgit/cybertron-v4-qw7B-MGS/f2d6da5d-3685-43de-8ceb-5b798f88e24c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/fblgit_cybertron-v4-qw7B-MGS/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "cybertron-v4-qw7B-MGS", - "id": "fblgit/cybertron-v4-qw7B-MGS", - "developer": "fblgit", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6264 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5592 - } - }, - { 
- "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3489 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3104 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4371 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4473 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/fblgit/cybertron-v4-qw7B-UNAMGS/9ec02ccd-329a-4d62-9f04-87de6fda5011.json b/data/hfopenllm_v2/fblgit/cybertron-v4-qw7B-UNAMGS/9ec02ccd-329a-4d62-9f04-87de6fda5011.json deleted file mode 100644 index 4e95a568a..000000000 --- a/data/hfopenllm_v2/fblgit/cybertron-v4-qw7B-UNAMGS/9ec02ccd-329a-4d62-9f04-87de6fda5011.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/fblgit_cybertron-v4-qw7B-UNAMGS/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "cybertron-v4-qw7B-UNAMGS", - "id": "fblgit/cybertron-v4-qw7B-UNAMGS", - "developer": "fblgit", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.609 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5643 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact 
Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3731 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3314 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4343 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.45 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/fblgit/juanako-7b-UNA/781d0332-e332-4ff7-8585-9c2d8395a147.json b/data/hfopenllm_v2/fblgit/juanako-7b-UNA/781d0332-e332-4ff7-8585-9c2d8395a147.json deleted file mode 100644 index 0a2dc5709..000000000 --- a/data/hfopenllm_v2/fblgit/juanako-7b-UNA/781d0332-e332-4ff7-8585-9c2d8395a147.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/fblgit_juanako-7b-UNA/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "juanako-7b-UNA", - "id": "fblgit/juanako-7b-UNA", - "developer": "fblgit", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4837 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.507 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.034 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": 
"Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2961 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4645 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2771 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/fblgit/miniclaus-qw1.5B-UNAMGS-GRPO/d6dd460e-c352-4d31-8941-183c6eabd0a7.json b/data/hfopenllm_v2/fblgit/miniclaus-qw1.5B-UNAMGS-GRPO/d6dd460e-c352-4d31-8941-183c6eabd0a7.json deleted file mode 100644 index 5152ad99c..000000000 --- a/data/hfopenllm_v2/fblgit/miniclaus-qw1.5B-UNAMGS-GRPO/d6dd460e-c352-4d31-8941-183c6eabd0a7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/fblgit_miniclaus-qw1.5B-UNAMGS-GRPO/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "miniclaus-qw1.5B-UNAMGS-GRPO", - "id": "fblgit/miniclaus-qw1.5B-UNAMGS-GRPO", - "developer": "fblgit", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.544 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3518 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4234 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1103 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.2978 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4254 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2945 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/fblgit/miniclaus-qw1.5B-UNAMGS/66bf6442-04ea-437b-88c4-e61afc6f7139.json b/data/hfopenllm_v2/fblgit/miniclaus-qw1.5B-UNAMGS/66bf6442-04ea-437b-88c4-e61afc6f7139.json deleted file mode 100644 index 5d5d165d7..000000000 --- a/data/hfopenllm_v2/fblgit/miniclaus-qw1.5B-UNAMGS/66bf6442-04ea-437b-88c4-e61afc6f7139.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/fblgit_miniclaus-qw1.5B-UNAMGS/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "miniclaus-qw1.5B-UNAMGS", - "id": "fblgit/miniclaus-qw1.5B-UNAMGS", - "developer": "fblgit", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.777 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3348 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4239 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1088 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2919 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": 
"Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4293 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2937 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/fblgit/pancho-v1-qw25-3B-UNAMGS/0d1911f5-a2e7-4511-a8d8-098cbf9207df.json b/data/hfopenllm_v2/fblgit/pancho-v1-qw25-3B-UNAMGS/0d1911f5-a2e7-4511-a8d8-098cbf9207df.json deleted file mode 100644 index 2dcb46652..000000000 --- a/data/hfopenllm_v2/fblgit/pancho-v1-qw25-3B-UNAMGS/0d1911f5-a2e7-4511-a8d8-098cbf9207df.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/fblgit_pancho-v1-qw25-3B-UNAMGS/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "pancho-v1-qw25-3B-UNAMGS", - "id": "fblgit/pancho-v1-qw25-3B-UNAMGS", - "developer": "fblgit", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.397 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5361 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4926 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1571 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.297 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4027 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - 
"dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3766 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/fblgit/una-cybertron-7b-v2-bf16/abc18648-ef96-4695-94d5-fa14be277431.json b/data/hfopenllm_v2/fblgit/una-cybertron-7b-v2-bf16/abc18648-ef96-4695-94d5-fa14be277431.json deleted file mode 100644 index 56fbad07e..000000000 --- a/data/hfopenllm_v2/fblgit/una-cybertron-7b-v2-bf16/abc18648-ef96-4695-94d5-fa14be277431.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/fblgit_una-cybertron-7b-v2-bf16/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "una-cybertron-7b-v2-bf16", - "id": "fblgit/una-cybertron-7b-v2-bf16", - "developer": "fblgit", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4737 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3973 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0408 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2978 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4473 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2443 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/fhai50032/RolePlayLake-7B/ff1e7aaa-3f29-4192-a0e0-80fcd11ba055.json b/data/hfopenllm_v2/fhai50032/RolePlayLake-7B/ff1e7aaa-3f29-4192-a0e0-80fcd11ba055.json deleted file mode 100644 index 5c8be692e..000000000 --- a/data/hfopenllm_v2/fhai50032/RolePlayLake-7B/ff1e7aaa-3f29-4192-a0e0-80fcd11ba055.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/fhai50032_RolePlayLake-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "RolePlayLake-7B", - "id": "fhai50032/RolePlayLake-7B", - "developer": "fhai50032", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5057 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5252 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0725 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3037 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4459 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.316 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/fhai50032/Unaligned-Thinker-PHI-4/cc8ef5bd-957f-4308-9539-00a696182056.json 
b/data/hfopenllm_v2/fhai50032/Unaligned-Thinker-PHI-4/cc8ef5bd-957f-4308-9539-00a696182056.json deleted file mode 100644 index 97e284a57..000000000 --- a/data/hfopenllm_v2/fhai50032/Unaligned-Thinker-PHI-4/cc8ef5bd-957f-4308-9539-00a696182056.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/fhai50032_Unaligned-Thinker-PHI-4/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Unaligned-Thinker-PHI-4", - "id": "fhai50032/Unaligned-Thinker-PHI-4", - "developer": "fhai50032", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0563 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6643 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3353 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3809 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4679 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5147 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/flammenai/Llama3.1-Flammades-70B/abc7652f-b88e-40ba-847c-c99dce9f2719.json b/data/hfopenllm_v2/flammenai/Llama3.1-Flammades-70B/abc7652f-b88e-40ba-847c-c99dce9f2719.json deleted file mode 100644 index 791c615fd..000000000 --- 
a/data/hfopenllm_v2/flammenai/Llama3.1-Flammades-70B/abc7652f-b88e-40ba-847c-c99dce9f2719.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/flammenai_Llama3.1-Flammades-70B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama3.1-Flammades-70B", - "id": "flammenai/Llama3.1-Flammades-70B", - "developer": "flammenai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 70.554 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7058 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.666 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2092 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.354 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4871 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4752 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/flammenai/Mahou-1.2a-llama3-8B/56e36294-e616-45a1-8dc9-2c14cf3ee8d0.json b/data/hfopenllm_v2/flammenai/Mahou-1.2a-llama3-8B/56e36294-e616-45a1-8dc9-2c14cf3ee8d0.json deleted file mode 100644 index b1967f57c..000000000 --- a/data/hfopenllm_v2/flammenai/Mahou-1.2a-llama3-8B/56e36294-e616-45a1-8dc9-2c14cf3ee8d0.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/flammenai_Mahou-1.2a-llama3-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mahou-1.2a-llama3-8B", - "id": "flammenai/Mahou-1.2a-llama3-8B", - "developer": "flammenai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5093 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5094 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0838 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2886 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3847 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3817 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/flammenai/Mahou-1.2a-mistral-7B/4b81caad-92ed-4bd5-98bd-58582854b5d8.json b/data/hfopenllm_v2/flammenai/Mahou-1.2a-mistral-7B/4b81caad-92ed-4bd5-98bd-58582854b5d8.json deleted file mode 100644 index cca5cbc09..000000000 --- a/data/hfopenllm_v2/flammenai/Mahou-1.2a-mistral-7B/4b81caad-92ed-4bd5-98bd-58582854b5d8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/flammenai_Mahou-1.2a-mistral-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": 
"Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mahou-1.2a-mistral-7B", - "id": "flammenai/Mahou-1.2a-mistral-7B", - "developer": "flammenai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4552 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5118 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0687 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2718 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3896 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3163 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/flammenai/Mahou-1.5-llama3.1-70B/2cef0040-6d4c-4c38-be40-5477911f3063.json b/data/hfopenllm_v2/flammenai/Mahou-1.5-llama3.1-70B/2cef0040-6d4c-4c38-be40-5477911f3063.json deleted file mode 100644 index 5ef38b931..000000000 --- a/data/hfopenllm_v2/flammenai/Mahou-1.5-llama3.1-70B/2cef0040-6d4c-4c38-be40-5477911f3063.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/flammenai_Mahou-1.5-llama3.1-70B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mahou-1.5-llama3.1-70B", - "id": "flammenai/Mahou-1.5-llama3.1-70B", - "developer": "flammenai", - "inference_platform": "unknown", - 
"additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 70.554 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7147 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6651 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.21 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.354 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.495 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4749 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/flammenai/Mahou-1.5-mistral-nemo-12B/4aeef94f-823e-4be5-b4f1-37463e052748.json b/data/hfopenllm_v2/flammenai/Mahou-1.5-mistral-nemo-12B/4aeef94f-823e-4be5-b4f1-37463e052748.json deleted file mode 100644 index 818a33929..000000000 --- a/data/hfopenllm_v2/flammenai/Mahou-1.5-mistral-nemo-12B/4aeef94f-823e-4be5-b4f1-37463e052748.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/flammenai_Mahou-1.5-mistral-nemo-12B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mahou-1.5-mistral-nemo-12B", - "id": "flammenai/Mahou-1.5-mistral-nemo-12B", - "developer": "flammenai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { 
- "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6751 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5522 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0869 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.276 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.452 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3602 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/flammenai/flammen15-gutenberg-DPO-v1-7B/3d367147-373f-4543-be19-55a6429558a2.json b/data/hfopenllm_v2/flammenai/flammen15-gutenberg-DPO-v1-7B/3d367147-373f-4543-be19-55a6429558a2.json deleted file mode 100644 index fd18593b8..000000000 --- a/data/hfopenllm_v2/flammenai/flammen15-gutenberg-DPO-v1-7B/3d367147-373f-4543-be19-55a6429558a2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/flammenai_flammen15-gutenberg-DPO-v1-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "flammen15-gutenberg-DPO-v1-7B", - "id": "flammenai/flammen15-gutenberg-DPO-v1-7B", - "developer": "flammenai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": 
false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4798 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5203 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0763 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2844 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4293 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3186 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/fluently-lm/FluentlyLM-Prinum/cb93091a-6c46-438a-b111-cbf7e2fac420.json b/data/hfopenllm_v2/fluently-lm/FluentlyLM-Prinum/cb93091a-6c46-438a-b111-cbf7e2fac420.json deleted file mode 100644 index 921cf029a..000000000 --- a/data/hfopenllm_v2/fluently-lm/FluentlyLM-Prinum/cb93091a-6c46-438a-b111-cbf7e2fac420.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/fluently-lm_FluentlyLM-Prinum/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "FluentlyLM-Prinum", - "id": "fluently-lm/FluentlyLM-Prinum", - "developer": "fluently-lm", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 32.764 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.809 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - 
"hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7144 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.54 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3867 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4471 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5808 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/fluently-lm/Llama-TI-8B-Instruct/ea6048f1-8be4-4ec8-a5d5-35ff1523d74a.json b/data/hfopenllm_v2/fluently-lm/Llama-TI-8B-Instruct/ea6048f1-8be4-4ec8-a5d5-35ff1523d74a.json deleted file mode 100644 index 9c0f198a8..000000000 --- a/data/hfopenllm_v2/fluently-lm/Llama-TI-8B-Instruct/ea6048f1-8be4-4ec8-a5d5-35ff1523d74a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/fluently-lm_Llama-TI-8B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-TI-8B-Instruct", - "id": "fluently-lm/Llama-TI-8B-Instruct", - "developer": "fluently-lm", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7716 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.5252 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2304 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2953 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3813 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3726 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/fluently-lm/Llama-TI-8B/f4dc1659-800f-49d2-a290-48e9d4b15581.json b/data/hfopenllm_v2/fluently-lm/Llama-TI-8B/f4dc1659-800f-49d2-a290-48e9d4b15581.json deleted file mode 100644 index bacff0c4e..000000000 --- a/data/hfopenllm_v2/fluently-lm/Llama-TI-8B/f4dc1659-800f-49d2-a290-48e9d4b15581.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/fluently-lm_Llama-TI-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-TI-8B", - "id": "fluently-lm/Llama-TI-8B", - "developer": "fluently-lm", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.288 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5201 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", 
- "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1964 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2961 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4103 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.344 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/fluently-sets/FalconThink3-10B-IT/d4d8a784-5bd5-4437-8e0d-75dcb967ae33.json b/data/hfopenllm_v2/fluently-sets/FalconThink3-10B-IT/d4d8a784-5bd5-4437-8e0d-75dcb967ae33.json deleted file mode 100644 index 9c1acfca1..000000000 --- a/data/hfopenllm_v2/fluently-sets/FalconThink3-10B-IT/d4d8a784-5bd5-4437-8e0d-75dcb967ae33.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/fluently-sets_FalconThink3-10B-IT/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "FalconThink3-10B-IT", - "id": "fluently-sets/FalconThink3-10B-IT", - "developer": "fluently-sets", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 10.306 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7326 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.62 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2447 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": 
"GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3347 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4479 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4435 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/fluently-sets/reasoning-1-1k-demo/91017e73-f33a-49f5-ac87-f6e6a178d885.json b/data/hfopenllm_v2/fluently-sets/reasoning-1-1k-demo/91017e73-f33a-49f5-ac87-f6e6a178d885.json deleted file mode 100644 index 3cfba9c12..000000000 --- a/data/hfopenllm_v2/fluently-sets/reasoning-1-1k-demo/91017e73-f33a-49f5-ac87-f6e6a178d885.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/fluently-sets_reasoning-1-1k-demo/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "reasoning-1-1k-demo", - "id": "fluently-sets/reasoning-1-1k-demo", - "developer": "fluently-sets", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7525 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6397 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4282 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.3356 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4061 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4774 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/formulae/mita-elite-sce-gen1.1-v1-7b-2-26-2025-exp/b7a75bca-6afe-448a-8e5c-53ebd577c964.json b/data/hfopenllm_v2/formulae/mita-elite-sce-gen1.1-v1-7b-2-26-2025-exp/b7a75bca-6afe-448a-8e5c-53ebd577c964.json deleted file mode 100644 index dc662457b..000000000 --- a/data/hfopenllm_v2/formulae/mita-elite-sce-gen1.1-v1-7b-2-26-2025-exp/b7a75bca-6afe-448a-8e5c-53ebd577c964.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/formulae_mita-elite-sce-gen1.1-v1-7b-2-26-2025-exp/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "mita-elite-sce-gen1.1-v1-7b-2-26-2025-exp", - "id": "formulae/mita-elite-sce-gen1.1-v1-7b-2-26-2025-exp", - "developer": "formulae", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1614 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2976 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0015 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2534 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { 
- "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4219 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1174 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/formulae/mita-elite-v1.1-7b-2-25-2025/8cdced5c-23bc-4426-a0c9-b9bf82913683.json b/data/hfopenllm_v2/formulae/mita-elite-v1.1-7b-2-25-2025/8cdced5c-23bc-4426-a0c9-b9bf82913683.json deleted file mode 100644 index 735daf408..000000000 --- a/data/hfopenllm_v2/formulae/mita-elite-v1.1-7b-2-25-2025/8cdced5c-23bc-4426-a0c9-b9bf82913683.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/formulae_mita-elite-v1.1-7b-2-25-2025/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "mita-elite-v1.1-7b-2-25-2025", - "id": "formulae/mita-elite-v1.1-7b-2-25-2025", - "developer": "formulae", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.125 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2867 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2483 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3487 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1098 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/formulae/mita-elite-v1.1-gen2-7b-2-25-2025/368784c8-6fc2-4340-8277-a6a9a9800a99.json b/data/hfopenllm_v2/formulae/mita-elite-v1.1-gen2-7b-2-25-2025/368784c8-6fc2-4340-8277-a6a9a9800a99.json deleted file mode 100644 index c6044805f..000000000 --- a/data/hfopenllm_v2/formulae/mita-elite-v1.1-gen2-7b-2-25-2025/368784c8-6fc2-4340-8277-a6a9a9800a99.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/formulae_mita-elite-v1.1-gen2-7b-2-25-2025/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "mita-elite-v1.1-gen2-7b-2-25-2025", - "id": "formulae/mita-elite-v1.1-gen2-7b-2-25-2025", - "developer": "formulae", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1411 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2924 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2525 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3541 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - 
"dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1101 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/formulae/mita-elite-v1.2-7b-2-26-2025/f7ddf26b-4b4c-404b-b9d3-6ceaf78d39aa.json b/data/hfopenllm_v2/formulae/mita-elite-v1.2-7b-2-26-2025/f7ddf26b-4b4c-404b-b9d3-6ceaf78d39aa.json deleted file mode 100644 index 218786c8e..000000000 --- a/data/hfopenllm_v2/formulae/mita-elite-v1.2-7b-2-26-2025/f7ddf26b-4b4c-404b-b9d3-6ceaf78d39aa.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/formulae_mita-elite-v1.2-7b-2-26-2025/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "mita-elite-v1.2-7b-2-26-2025", - "id": "formulae/mita-elite-v1.2-7b-2-26-2025", - "developer": "formulae", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.148 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.293 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0023 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2743 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4287 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1186 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/formulae/mita-gen3-7b-2-26-2025/f423b0d1-3536-4865-9615-f89b9d15b14c.json b/data/hfopenllm_v2/formulae/mita-gen3-7b-2-26-2025/f423b0d1-3536-4865-9615-f89b9d15b14c.json deleted file mode 100644 index 45585cf79..000000000 --- a/data/hfopenllm_v2/formulae/mita-gen3-7b-2-26-2025/f423b0d1-3536-4865-9615-f89b9d15b14c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/formulae_mita-gen3-7b-2-26-2025/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "mita-gen3-7b-2-26-2025", - "id": "formulae/mita-gen3-7b-2-26-2025", - "developer": "formulae", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1964 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2916 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0023 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2651 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3912 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1124 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/formulae/mita-gen3-v1.2-7b-2-26-2025/c7e8333d-1d79-4cfa-9833-fa42f9fcbb4b.json b/data/hfopenllm_v2/formulae/mita-gen3-v1.2-7b-2-26-2025/c7e8333d-1d79-4cfa-9833-fa42f9fcbb4b.json deleted file mode 100644 index edcc1062c..000000000 --- a/data/hfopenllm_v2/formulae/mita-gen3-v1.2-7b-2-26-2025/c7e8333d-1d79-4cfa-9833-fa42f9fcbb4b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/formulae_mita-gen3-v1.2-7b-2-26-2025/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "mita-gen3-v1.2-7b-2-26-2025", - "id": "formulae/mita-gen3-v1.2-7b-2-26-2025", - "developer": "formulae", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2044 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3058 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0023 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2592 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.39 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1128 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/formulae/mita-math-v2.3-2-25-2025/b6149d15-3e0f-43d2-ae90-eca290a94edb.json b/data/hfopenllm_v2/formulae/mita-math-v2.3-2-25-2025/b6149d15-3e0f-43d2-ae90-eca290a94edb.json deleted file 
mode 100644 index e68dfb118..000000000 --- a/data/hfopenllm_v2/formulae/mita-math-v2.3-2-25-2025/b6149d15-3e0f-43d2-ae90-eca290a94edb.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/formulae_mita-math-v2.3-2-25-2025/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "mita-math-v2.3-2-25-2025", - "id": "formulae/mita-math-v2.3-2-25-2025", - "developer": "formulae", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1373 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2949 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2508 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3698 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1118 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/formulae/mita-v1-7b/e21f5d83-6b71-488d-ad55-d23268fbd611.json b/data/hfopenllm_v2/formulae/mita-v1-7b/e21f5d83-6b71-488d-ad55-d23268fbd611.json deleted file mode 100644 index fd92b7206..000000000 --- a/data/hfopenllm_v2/formulae/mita-v1-7b/e21f5d83-6b71-488d-ad55-d23268fbd611.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/formulae_mita-v1-7b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "mita-v1-7b", - "id": "formulae/mita-v1-7b", - "developer": "formulae", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1972 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3003 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0023 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.25 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4152 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1147 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/formulae/mita-v1.1-7b-2-24-2025/68e1a42e-4318-4b5a-a45b-2607b7c2fe05.json b/data/hfopenllm_v2/formulae/mita-v1.1-7b-2-24-2025/68e1a42e-4318-4b5a-a45b-2607b7c2fe05.json deleted file mode 100644 index 354c32cae..000000000 --- a/data/hfopenllm_v2/formulae/mita-v1.1-7b-2-24-2025/68e1a42e-4318-4b5a-a45b-2607b7c2fe05.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/formulae_mita-v1.1-7b-2-24-2025/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - 
"evaluator_relationship": "third_party" - }, - "model_info": { - "name": "mita-v1.1-7b-2-24-2025", - "id": "formulae/mita-v1.1-7b-2-24-2025", - "developer": "formulae", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3412 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5442 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.435 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3146 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4557 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4524 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/formulae/mita-v1.2-7b-2-24-2025/12a03ffb-d66b-4d00-a43b-fd5be80e1b07.json b/data/hfopenllm_v2/formulae/mita-v1.2-7b-2-24-2025/12a03ffb-d66b-4d00-a43b-fd5be80e1b07.json deleted file mode 100644 index 514af2af5..000000000 --- a/data/hfopenllm_v2/formulae/mita-v1.2-7b-2-24-2025/12a03ffb-d66b-4d00-a43b-fd5be80e1b07.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/formulae_mita-v1.2-7b-2-24-2025/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "mita-v1.2-7b-2-24-2025", - "id": "formulae/mita-v1.2-7b-2-24-2025", - "developer": "formulae", - "inference_platform": "unknown", - "additional_details": { - 
"precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2564 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4919 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4879 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3062 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4344 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3359 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/frameai/Loxa-4B/adbad8dc-7d13-44cc-a5c6-e8da1de27c37.json b/data/hfopenllm_v2/frameai/Loxa-4B/adbad8dc-7d13-44cc-a5c6-e8da1de27c37.json deleted file mode 100644 index 3d74cbfc7..000000000 --- a/data/hfopenllm_v2/frameai/Loxa-4B/adbad8dc-7d13-44cc-a5c6-e8da1de27c37.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/frameai_Loxa-4B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Loxa-4B", - "id": "frameai/Loxa-4B", - "developer": "frameai", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 4.018 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy 
on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4765 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4217 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1095 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2836 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3377 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2802 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/freewheelin/free-evo-qwen72b-v0.8-re/7fb595e5-abbc-43ff-8135-c4bb4a2ea593.json b/data/hfopenllm_v2/freewheelin/free-evo-qwen72b-v0.8-re/7fb595e5-abbc-43ff-8135-c4bb4a2ea593.json deleted file mode 100644 index 86d886da9..000000000 --- a/data/hfopenllm_v2/freewheelin/free-evo-qwen72b-v0.8-re/7fb595e5-abbc-43ff-8135-c4bb4a2ea593.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/freewheelin_free-evo-qwen72b-v0.8-re/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "free-evo-qwen72b-v0.8-re", - "id": "freewheelin/free-evo-qwen72b-v0.8-re", - "developer": "freewheelin", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 72.288 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5331 - } - }, - { - "evaluation_name": "BBH", - 
"source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6127 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1805 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3565 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4872 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.487 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/freewheelin/free-solar-evo-v0.1/1bb09da7-1675-4e57-b46a-9791c888ce6f.json b/data/hfopenllm_v2/freewheelin/free-solar-evo-v0.1/1bb09da7-1675-4e57-b46a-9791c888ce6f.json deleted file mode 100644 index d246d43e9..000000000 --- a/data/hfopenllm_v2/freewheelin/free-solar-evo-v0.1/1bb09da7-1675-4e57-b46a-9791c888ce6f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/freewheelin_free-solar-evo-v0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "free-solar-evo-v0.1", - "id": "freewheelin/free-solar-evo-v0.1", - "developer": "freewheelin", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 10.732 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.205 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4502 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0083 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2911 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4946 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3414 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/freewheelin/free-solar-evo-v0.11/3ed7dd5a-e431-480a-91a7-5ccd915057e4.json b/data/hfopenllm_v2/freewheelin/free-solar-evo-v0.11/3ed7dd5a-e431-480a-91a7-5ccd915057e4.json deleted file mode 100644 index 75ddbaee6..000000000 --- a/data/hfopenllm_v2/freewheelin/free-solar-evo-v0.11/3ed7dd5a-e431-480a-91a7-5ccd915057e4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/freewheelin_free-solar-evo-v0.11/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "free-solar-evo-v0.11", - "id": "freewheelin/free-solar-evo-v0.11", - "developer": "freewheelin", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 10.732 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2027 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4545 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": 
"DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0083 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2852 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5052 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3467 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/freewheelin/free-solar-evo-v0.13/9cab35b6-d6a7-475e-b715-e4493d07cd92.json b/data/hfopenllm_v2/freewheelin/free-solar-evo-v0.13/9cab35b6-d6a7-475e-b715-e4493d07cd92.json deleted file mode 100644 index e1d46c7d8..000000000 --- a/data/hfopenllm_v2/freewheelin/free-solar-evo-v0.13/9cab35b6-d6a7-475e-b715-e4493d07cd92.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/freewheelin_free-solar-evo-v0.13/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "free-solar-evo-v0.13", - "id": "freewheelin/free-solar-evo-v0.13", - "developer": "freewheelin", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 10.732 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2321 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4555 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.0121 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2886 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5052 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.347 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/fulim/FineLlama-3.1-8B/ef7149ae-8d50-4890-89ae-fb561a86d130.json b/data/hfopenllm_v2/fulim/FineLlama-3.1-8B/ef7149ae-8d50-4890-89ae-fb561a86d130.json deleted file mode 100644 index 9613a08ac..000000000 --- a/data/hfopenllm_v2/fulim/FineLlama-3.1-8B/ef7149ae-8d50-4890-89ae-fb561a86d130.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/fulim_FineLlama-3.1-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "FineLlama-3.1-8B", - "id": "fulim/FineLlama-3.1-8B", - "developer": "fulim", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1439 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4569 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0476 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2928 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3867 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3167 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/gabrielmbmb/SmolLM-1.7B-Instruct-IFEval/3fa14e1f-82a5-4c04-9c76-2a3f6d56aa81.json b/data/hfopenllm_v2/gabrielmbmb/SmolLM-1.7B-Instruct-IFEval/3fa14e1f-82a5-4c04-9c76-2a3f6d56aa81.json deleted file mode 100644 index 690658385..000000000 --- a/data/hfopenllm_v2/gabrielmbmb/SmolLM-1.7B-Instruct-IFEval/3fa14e1f-82a5-4c04-9c76-2a3f6d56aa81.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/gabrielmbmb_SmolLM-1.7B-Instruct-IFEval/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SmolLM-1.7B-Instruct-IFEval", - "id": "gabrielmbmb/SmolLM-1.7B-Instruct-IFEval", - "developer": "gabrielmbmb", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.711 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2306 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3138 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0106 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2534 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - 
"dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3328 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1156 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/gaverfraxz/Meta-Llama-3.1-8B-Instruct-HalfAbliterated-DELLA/4418c7d1-72da-4ed3-9d5c-9d8520f6641c.json b/data/hfopenllm_v2/gaverfraxz/Meta-Llama-3.1-8B-Instruct-HalfAbliterated-DELLA/4418c7d1-72da-4ed3-9d5c-9d8520f6641c.json deleted file mode 100644 index f193f5a9b..000000000 --- a/data/hfopenllm_v2/gaverfraxz/Meta-Llama-3.1-8B-Instruct-HalfAbliterated-DELLA/4418c7d1-72da-4ed3-9d5c-9d8520f6641c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/gaverfraxz_Meta-Llama-3.1-8B-Instruct-HalfAbliterated-DELLA/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Meta-Llama-3.1-8B-Instruct-HalfAbliterated-DELLA", - "id": "gaverfraxz/Meta-Llama-3.1-8B-Instruct-HalfAbliterated-DELLA", - "developer": "gaverfraxz", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4009 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3985 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0196 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2844 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": 
"TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.365 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1654 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/gaverfraxz/Meta-Llama-3.1-8B-Instruct-HalfAbliterated-TIES/8fe13380-a045-4d63-96f8-ec977540478c.json b/data/hfopenllm_v2/gaverfraxz/Meta-Llama-3.1-8B-Instruct-HalfAbliterated-TIES/8fe13380-a045-4d63-96f8-ec977540478c.json deleted file mode 100644 index 280b09aa0..000000000 --- a/data/hfopenllm_v2/gaverfraxz/Meta-Llama-3.1-8B-Instruct-HalfAbliterated-TIES/8fe13380-a045-4d63-96f8-ec977540478c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/gaverfraxz_Meta-Llama-3.1-8B-Instruct-HalfAbliterated-TIES/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Meta-Llama-3.1-8B-Instruct-HalfAbliterated-TIES", - "id": "gaverfraxz/Meta-Llama-3.1-8B-Instruct-HalfAbliterated-TIES", - "developer": "gaverfraxz", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4551 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5044 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1299 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2668 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": 
"Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3738 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3679 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/gbueno86/Brinebreath-Llama-3.1-70B/6da42427-c7de-4830-b368-ca7757ee1d51.json b/data/hfopenllm_v2/gbueno86/Brinebreath-Llama-3.1-70B/6da42427-c7de-4830-b368-ca7757ee1d51.json deleted file mode 100644 index 700b6cf24..000000000 --- a/data/hfopenllm_v2/gbueno86/Brinebreath-Llama-3.1-70B/6da42427-c7de-4830-b368-ca7757ee1d51.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/gbueno86_Brinebreath-Llama-3.1-70B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Brinebreath-Llama-3.1-70B", - "id": "gbueno86/Brinebreath-Llama-3.1-70B", - "developer": "gbueno86", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 70.554 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5533 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6881 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2976 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3465 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4541 - } - }, - { - "evaluation_name": "MMLU-PRO", - 
"source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5196 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/gbueno86/Meta-LLama-3-Cat-Smaug-LLama-70b/5faf24b3-38af-4f3f-8377-bba70d75f8df.json b/data/hfopenllm_v2/gbueno86/Meta-LLama-3-Cat-Smaug-LLama-70b/5faf24b3-38af-4f3f-8377-bba70d75f8df.json deleted file mode 100644 index 8becc522b..000000000 --- a/data/hfopenllm_v2/gbueno86/Meta-LLama-3-Cat-Smaug-LLama-70b/5faf24b3-38af-4f3f-8377-bba70d75f8df.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/gbueno86_Meta-LLama-3-Cat-Smaug-LLama-70b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Meta-LLama-3-Cat-Smaug-LLama-70b", - "id": "gbueno86/Meta-LLama-3-Cat-Smaug-LLama-70b", - "developer": "gbueno86", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 70.554 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8072 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6674 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2938 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3272 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4368 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": 
"Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5075 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ghost-x/ghost-8b-beta-1608/9a26214c-2601-49be-b1b1-03796b704059.json b/data/hfopenllm_v2/ghost-x/ghost-8b-beta-1608/9a26214c-2601-49be-b1b1-03796b704059.json deleted file mode 100644 index e2d293c3a..000000000 --- a/data/hfopenllm_v2/ghost-x/ghost-8b-beta-1608/9a26214c-2601-49be-b1b1-03796b704059.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ghost-x_ghost-8b-beta-1608/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ghost-8b-beta-1608", - "id": "ghost-x/ghost-8b-beta-1608", - "developer": "ghost-x", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4273 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4517 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0695 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2584 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3516 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.284 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/glaiveai/Reflection-Llama-3.1-70B/fa71ed09-45d4-4a5b-bfb1-a61a359a8f0c.json b/data/hfopenllm_v2/glaiveai/Reflection-Llama-3.1-70B/fa71ed09-45d4-4a5b-bfb1-a61a359a8f0c.json deleted file mode 100644 index 3344ef2b5..000000000 --- a/data/hfopenllm_v2/glaiveai/Reflection-Llama-3.1-70B/fa71ed09-45d4-4a5b-bfb1-a61a359a8f0c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/glaiveai_Reflection-Llama-3.1-70B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Reflection-Llama-3.1-70B", - "id": "glaiveai/Reflection-Llama-3.1-70B", - "developer": "glaiveai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 69.5 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5991 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5681 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2757 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3146 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.438 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6341 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/gmonsoon/SahabatAI-Llama-11B-Test/25c5b304-46d3-4df3-9ac3-75ffa972849a.json b/data/hfopenllm_v2/gmonsoon/SahabatAI-Llama-11B-Test/25c5b304-46d3-4df3-9ac3-75ffa972849a.json deleted file mode 100644 index 
579c66afe..000000000 --- a/data/hfopenllm_v2/gmonsoon/SahabatAI-Llama-11B-Test/25c5b304-46d3-4df3-9ac3-75ffa972849a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/gmonsoon_SahabatAI-Llama-11B-Test/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SahabatAI-Llama-11B-Test", - "id": "gmonsoon/SahabatAI-Llama-11B-Test", - "developer": "gmonsoon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 11.52 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3376 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4728 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.031 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2819 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4001 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3182 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/gmonsoon/SahabatAI-MediChatIndo-8B-v1/88ed0272-39f8-4676-970a-525aee058991.json b/data/hfopenllm_v2/gmonsoon/SahabatAI-MediChatIndo-8B-v1/88ed0272-39f8-4676-970a-525aee058991.json deleted file mode 100644 index 4c3ee6998..000000000 --- a/data/hfopenllm_v2/gmonsoon/SahabatAI-MediChatIndo-8B-v1/88ed0272-39f8-4676-970a-525aee058991.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/gmonsoon_SahabatAI-MediChatIndo-8B-v1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SahabatAI-MediChatIndo-8B-v1", - "id": "gmonsoon/SahabatAI-MediChatIndo-8B-v1", - "developer": "gmonsoon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4163 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4509 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0619 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2827 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3754 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3108 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/gmonsoon/SahabatAI-Rebase-8B-Test/d8eff5d0-061b-4b83-b96a-04f9ba47ea6c.json b/data/hfopenllm_v2/gmonsoon/SahabatAI-Rebase-8B-Test/d8eff5d0-061b-4b83-b96a-04f9ba47ea6c.json deleted file mode 100644 index 58ed22e54..000000000 --- a/data/hfopenllm_v2/gmonsoon/SahabatAI-Rebase-8B-Test/d8eff5d0-061b-4b83-b96a-04f9ba47ea6c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/gmonsoon_SahabatAI-Rebase-8B-Test/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - 
"source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SahabatAI-Rebase-8B-Test", - "id": "gmonsoon/SahabatAI-Rebase-8B-Test", - "developer": "gmonsoon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5156 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.523 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1148 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2878 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4133 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3664 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/gmonsoon/StockSeaLLMs-7B-v1/dcb90e75-8709-4729-8c00-e756e6a9a49d.json b/data/hfopenllm_v2/gmonsoon/StockSeaLLMs-7B-v1/dcb90e75-8709-4729-8c00-e756e6a9a49d.json deleted file mode 100644 index 0e6375ecc..000000000 --- a/data/hfopenllm_v2/gmonsoon/StockSeaLLMs-7B-v1/dcb90e75-8709-4729-8c00-e756e6a9a49d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/gmonsoon_StockSeaLLMs-7B-v1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "StockSeaLLMs-7B-v1", - "id": "gmonsoon/StockSeaLLMs-7B-v1", - "developer": "gmonsoon", - "inference_platform": "unknown", - 
"additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4599 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5271 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1964 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3029 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4214 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3952 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/gmonsoon/gemma2-9b-sahabatai-v1-instruct-BaseTIES/81dcf3ca-f5c2-40a1-8871-b0188d5e9ceb.json b/data/hfopenllm_v2/gmonsoon/gemma2-9b-sahabatai-v1-instruct-BaseTIES/81dcf3ca-f5c2-40a1-8871-b0188d5e9ceb.json deleted file mode 100644 index 61f640923..000000000 --- a/data/hfopenllm_v2/gmonsoon/gemma2-9b-sahabatai-v1-instruct-BaseTIES/81dcf3ca-f5c2-40a1-8871-b0188d5e9ceb.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/gmonsoon_gemma2-9b-sahabatai-v1-instruct-BaseTIES/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "gemma2-9b-sahabatai-v1-instruct-BaseTIES", - "id": "gmonsoon/gemma2-9b-sahabatai-v1-instruct-BaseTIES", - "developer": "gmonsoon", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 9.242 - } - }, - 
"evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7378 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6077 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1994 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3205 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4778 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4347 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/godlikehhd/alpaca_data_full_2/0a0a4d32-c7a9-49c9-bba4-dae6b464a5b6.json b/data/hfopenllm_v2/godlikehhd/alpaca_data_full_2/0a0a4d32-c7a9-49c9-bba4-dae6b464a5b6.json deleted file mode 100644 index cf37f2ef2..000000000 --- a/data/hfopenllm_v2/godlikehhd/alpaca_data_full_2/0a0a4d32-c7a9-49c9-bba4-dae6b464a5b6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/godlikehhd_alpaca_data_full_2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "alpaca_data_full_2", - "id": "godlikehhd/alpaca_data_full_2", - "developer": "godlikehhd", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.544 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3178 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4217 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0929 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2978 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4052 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2854 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/godlikehhd/alpaca_data_full_3B/82a3a8ef-7e5f-48d0-a48e-41ea2c5b6452.json b/data/hfopenllm_v2/godlikehhd/alpaca_data_full_3B/82a3a8ef-7e5f-48d0-a48e-41ea2c5b6452.json deleted file mode 100644 index 4c1cc5329..000000000 --- a/data/hfopenllm_v2/godlikehhd/alpaca_data_full_3B/82a3a8ef-7e5f-48d0-a48e-41ea2c5b6452.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/godlikehhd_alpaca_data_full_3B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "alpaca_data_full_3B", - "id": "godlikehhd/alpaca_data_full_3B", - "developer": "godlikehhd", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.086 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3696 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": 
"hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4684 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1337 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2777 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4955 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3357 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/godlikehhd/alpaca_data_ifd_max_2600/e635e798-fa85-4430-bf1e-9d5ad7fe9f22.json b/data/hfopenllm_v2/godlikehhd/alpaca_data_ifd_max_2600/e635e798-fa85-4430-bf1e-9d5ad7fe9f22.json deleted file mode 100644 index dae6a5835..000000000 --- a/data/hfopenllm_v2/godlikehhd/alpaca_data_ifd_max_2600/e635e798-fa85-4430-bf1e-9d5ad7fe9f22.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/godlikehhd_alpaca_data_ifd_max_2600/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "alpaca_data_ifd_max_2600", - "id": "godlikehhd/alpaca_data_ifd_max_2600", - "developer": "godlikehhd", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.544 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3043 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 
1.0 - }, - "score_details": { - "score": 0.4029 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0989 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3029 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3509 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2916 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/godlikehhd/alpaca_data_ifd_max_2600_3B/7ccaa29a-4f73-4794-83a2-b925d755d91e.json b/data/hfopenllm_v2/godlikehhd/alpaca_data_ifd_max_2600_3B/7ccaa29a-4f73-4794-83a2-b925d755d91e.json deleted file mode 100644 index 3e8447d76..000000000 --- a/data/hfopenllm_v2/godlikehhd/alpaca_data_ifd_max_2600_3B/7ccaa29a-4f73-4794-83a2-b925d755d91e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/godlikehhd_alpaca_data_ifd_max_2600_3B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "alpaca_data_ifd_max_2600_3B", - "id": "godlikehhd/alpaca_data_ifd_max_2600_3B", - "developer": "godlikehhd", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.086 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2982 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4626 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": 
"DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1594 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2727 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4346 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3288 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/godlikehhd/alpaca_data_ifd_me_max_5200/ba8de8f6-c118-4bc3-ae8d-851e964684ed.json b/data/hfopenllm_v2/godlikehhd/alpaca_data_ifd_me_max_5200/ba8de8f6-c118-4bc3-ae8d-851e964684ed.json deleted file mode 100644 index 6e4a08360..000000000 --- a/data/hfopenllm_v2/godlikehhd/alpaca_data_ifd_me_max_5200/ba8de8f6-c118-4bc3-ae8d-851e964684ed.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/godlikehhd_alpaca_data_ifd_me_max_5200/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "alpaca_data_ifd_me_max_5200", - "id": "godlikehhd/alpaca_data_ifd_me_max_5200", - "developer": "godlikehhd", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.544 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3683 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4153 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 
0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0974 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2911 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3483 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2982 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/godlikehhd/alpaca_data_ifd_min_2600/4011975a-e2a0-466a-9b34-923e1b4f8733.json b/data/hfopenllm_v2/godlikehhd/alpaca_data_ifd_min_2600/4011975a-e2a0-466a-9b34-923e1b4f8733.json deleted file mode 100644 index bc8f77337..000000000 --- a/data/hfopenllm_v2/godlikehhd/alpaca_data_ifd_min_2600/4011975a-e2a0-466a-9b34-923e1b4f8733.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/godlikehhd_alpaca_data_ifd_min_2600/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "alpaca_data_ifd_min_2600", - "id": "godlikehhd/alpaca_data_ifd_min_2600", - "developer": "godlikehhd", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.544 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.375 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4219 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0967 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": 
"Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2919 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3656 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2893 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/godlikehhd/alpaca_data_ins_ans_max_5200/8a172205-39c6-4dd1-86b2-11b234b37e3c.json b/data/hfopenllm_v2/godlikehhd/alpaca_data_ins_ans_max_5200/8a172205-39c6-4dd1-86b2-11b234b37e3c.json deleted file mode 100644 index 8ecd5019c..000000000 --- a/data/hfopenllm_v2/godlikehhd/alpaca_data_ins_ans_max_5200/8a172205-39c6-4dd1-86b2-11b234b37e3c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/godlikehhd_alpaca_data_ins_ans_max_5200/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "alpaca_data_ins_ans_max_5200", - "id": "godlikehhd/alpaca_data_ins_ans_max_5200", - "developer": "godlikehhd", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.544 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3479 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4098 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1027 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 
- }, - "score_details": { - "score": 0.2911 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3602 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2901 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/godlikehhd/alpaca_data_ins_max_5200/495b2e8e-e2d8-4158-bc6e-7568604d44e9.json b/data/hfopenllm_v2/godlikehhd/alpaca_data_ins_max_5200/495b2e8e-e2d8-4158-bc6e-7568604d44e9.json deleted file mode 100644 index b1a0ba707..000000000 --- a/data/hfopenllm_v2/godlikehhd/alpaca_data_ins_max_5200/495b2e8e-e2d8-4158-bc6e-7568604d44e9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/godlikehhd_alpaca_data_ins_max_5200/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "alpaca_data_ins_max_5200", - "id": "godlikehhd/alpaca_data_ins_max_5200", - "developer": "godlikehhd", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.544 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3275 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4155 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0997 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2961 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3614 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2916 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/godlikehhd/alpaca_data_ins_min_2600/e6a97d0d-9dc3-43a5-a69f-8132e19f9c77.json b/data/hfopenllm_v2/godlikehhd/alpaca_data_ins_min_2600/e6a97d0d-9dc3-43a5-a69f-8132e19f9c77.json deleted file mode 100644 index 31d6adcd2..000000000 --- a/data/hfopenllm_v2/godlikehhd/alpaca_data_ins_min_2600/e6a97d0d-9dc3-43a5-a69f-8132e19f9c77.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/godlikehhd_alpaca_data_ins_min_2600/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "alpaca_data_ins_min_2600", - "id": "godlikehhd/alpaca_data_ins_min_2600", - "developer": "godlikehhd", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.544 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.333 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4187 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.111 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2978 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3853 - } - 
}, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.288 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/godlikehhd/alpaca_data_ins_min_5200/4aecfd45-f47b-4f02-a0ed-288cbef46a6f.json b/data/hfopenllm_v2/godlikehhd/alpaca_data_ins_min_5200/4aecfd45-f47b-4f02-a0ed-288cbef46a6f.json deleted file mode 100644 index b84f600b2..000000000 --- a/data/hfopenllm_v2/godlikehhd/alpaca_data_ins_min_5200/4aecfd45-f47b-4f02-a0ed-288cbef46a6f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/godlikehhd_alpaca_data_ins_min_5200/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "alpaca_data_ins_min_5200", - "id": "godlikehhd/alpaca_data_ins_min_5200", - "developer": "godlikehhd", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.544 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.336 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4289 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1035 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2869 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3906 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": 
"Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2949 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/godlikehhd/alpaca_data_sampled_ifd_5200/a6f7bc45-c2b5-47d8-a062-60f20c3d7ea4.json b/data/hfopenllm_v2/godlikehhd/alpaca_data_sampled_ifd_5200/a6f7bc45-c2b5-47d8-a062-60f20c3d7ea4.json deleted file mode 100644 index 64a2916b2..000000000 --- a/data/hfopenllm_v2/godlikehhd/alpaca_data_sampled_ifd_5200/a6f7bc45-c2b5-47d8-a062-60f20c3d7ea4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/godlikehhd_alpaca_data_sampled_ifd_5200/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "alpaca_data_sampled_ifd_5200", - "id": "godlikehhd/alpaca_data_sampled_ifd_5200", - "developer": "godlikehhd", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.544 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2924 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4033 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1254 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3087 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3521 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2896 - } - } - ] -} \ No 
newline at end of file diff --git a/data/hfopenllm_v2/godlikehhd/alpaca_data_sampled_ifd_new_5200/c85c79d6-28e0-4deb-ad84-901b725aeca8.json b/data/hfopenllm_v2/godlikehhd/alpaca_data_sampled_ifd_new_5200/c85c79d6-28e0-4deb-ad84-901b725aeca8.json deleted file mode 100644 index 081b3f249..000000000 --- a/data/hfopenllm_v2/godlikehhd/alpaca_data_sampled_ifd_new_5200/c85c79d6-28e0-4deb-ad84-901b725aeca8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/godlikehhd_alpaca_data_sampled_ifd_new_5200/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "alpaca_data_sampled_ifd_new_5200", - "id": "godlikehhd/alpaca_data_sampled_ifd_new_5200", - "developer": "godlikehhd", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.544 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3663 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4178 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0944 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2936 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3613 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2925 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/godlikehhd/alpaca_data_score_max_0.1_2600/73271472-d06f-405b-af9d-2da7c17e1eb0.json 
b/data/hfopenllm_v2/godlikehhd/alpaca_data_score_max_0.1_2600/73271472-d06f-405b-af9d-2da7c17e1eb0.json deleted file mode 100644 index 3d3313dd2..000000000 --- a/data/hfopenllm_v2/godlikehhd/alpaca_data_score_max_0.1_2600/73271472-d06f-405b-af9d-2da7c17e1eb0.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/godlikehhd_alpaca_data_score_max_0.1_2600/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "alpaca_data_score_max_0.1_2600", - "id": "godlikehhd/alpaca_data_score_max_0.1_2600", - "developer": "godlikehhd", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.544 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3288 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4252 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0989 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2911 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3706 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2923 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/godlikehhd/alpaca_data_score_max_0.3_2600/4e40bb43-c33d-4324-aa02-5bb7f88a5d1f.json b/data/hfopenllm_v2/godlikehhd/alpaca_data_score_max_0.3_2600/4e40bb43-c33d-4324-aa02-5bb7f88a5d1f.json deleted file mode 100644 index 90908a71a..000000000 --- 
a/data/hfopenllm_v2/godlikehhd/alpaca_data_score_max_0.3_2600/4e40bb43-c33d-4324-aa02-5bb7f88a5d1f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/godlikehhd_alpaca_data_score_max_0.3_2600/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "alpaca_data_score_max_0.3_2600", - "id": "godlikehhd/alpaca_data_score_max_0.3_2600", - "developer": "godlikehhd", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.544 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3375 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4151 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1035 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2894 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3759 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2913 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/godlikehhd/alpaca_data_score_max_0.7_2600/9b36e4c0-0d13-4988-8145-b9254da2e76e.json b/data/hfopenllm_v2/godlikehhd/alpaca_data_score_max_0.7_2600/9b36e4c0-0d13-4988-8145-b9254da2e76e.json deleted file mode 100644 index 97a5ad3b9..000000000 --- a/data/hfopenllm_v2/godlikehhd/alpaca_data_score_max_0.7_2600/9b36e4c0-0d13-4988-8145-b9254da2e76e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - 
"evaluation_id": "hfopenllm_v2/godlikehhd_alpaca_data_score_max_0.7_2600/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "alpaca_data_score_max_0.7_2600", - "id": "godlikehhd/alpaca_data_score_max_0.7_2600", - "developer": "godlikehhd", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.544 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.364 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4185 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1073 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3037 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3469 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2983 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/godlikehhd/alpaca_data_score_max_2500/6a464798-0111-4c71-b156-72a5aba1da63.json b/data/hfopenllm_v2/godlikehhd/alpaca_data_score_max_2500/6a464798-0111-4c71-b156-72a5aba1da63.json deleted file mode 100644 index 5e21ebf38..000000000 --- a/data/hfopenllm_v2/godlikehhd/alpaca_data_score_max_2500/6a464798-0111-4c71-b156-72a5aba1da63.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/godlikehhd_alpaca_data_score_max_2500/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM 
v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "alpaca_data_score_max_2500", - "id": "godlikehhd/alpaca_data_score_max_2500", - "developer": "godlikehhd", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.544 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3564 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.418 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0952 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2953 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3627 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.294 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/godlikehhd/alpaca_data_score_max_2600_3B/78252135-f15b-427d-86de-c32cd3dbcd0f.json b/data/hfopenllm_v2/godlikehhd/alpaca_data_score_max_2600_3B/78252135-f15b-427d-86de-c32cd3dbcd0f.json deleted file mode 100644 index 6e58fff0e..000000000 --- a/data/hfopenllm_v2/godlikehhd/alpaca_data_score_max_2600_3B/78252135-f15b-427d-86de-c32cd3dbcd0f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/godlikehhd_alpaca_data_score_max_2600_3B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": 
"alpaca_data_score_max_2600_3B", - "id": "godlikehhd/alpaca_data_score_max_2600_3B", - "developer": "godlikehhd", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.086 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3358 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4716 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1548 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2651 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4474 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3342 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/godlikehhd/alpaca_data_score_max_5200/c3b7bd57-9bc3-4d83-aad9-7d6315748c0a.json b/data/hfopenllm_v2/godlikehhd/alpaca_data_score_max_5200/c3b7bd57-9bc3-4d83-aad9-7d6315748c0a.json deleted file mode 100644 index 25483eeb0..000000000 --- a/data/hfopenllm_v2/godlikehhd/alpaca_data_score_max_5200/c3b7bd57-9bc3-4d83-aad9-7d6315748c0a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/godlikehhd_alpaca_data_score_max_5200/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "alpaca_data_score_max_5200", - "id": "godlikehhd/alpaca_data_score_max_5200", - "developer": "godlikehhd", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", 
- "architecture": "Qwen2ForCausalLM", - "params_billions": 1.544 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3445 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4242 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0974 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2978 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3878 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2945 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/godlikehhd/ifd_2500_qwen/bce17582-e807-4b91-b0e7-0a890bf5eb24.json b/data/hfopenllm_v2/godlikehhd/ifd_2500_qwen/bce17582-e807-4b91-b0e7-0a890bf5eb24.json deleted file mode 100644 index 3d6fa2b02..000000000 --- a/data/hfopenllm_v2/godlikehhd/ifd_2500_qwen/bce17582-e807-4b91-b0e7-0a890bf5eb24.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/godlikehhd_ifd_2500_qwen/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ifd_2500_qwen", - "id": "godlikehhd/ifd_2500_qwen", - "developer": "godlikehhd", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.544 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3365 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4298 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0982 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2953 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3615 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2921 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/godlikehhd/ifd_new_correct_all_sample_2500_qwen/f8371e81-f6d4-4441-bc6c-5d4a18da7d08.json b/data/hfopenllm_v2/godlikehhd/ifd_new_correct_all_sample_2500_qwen/f8371e81-f6d4-4441-bc6c-5d4a18da7d08.json deleted file mode 100644 index 9f6d40e63..000000000 --- a/data/hfopenllm_v2/godlikehhd/ifd_new_correct_all_sample_2500_qwen/f8371e81-f6d4-4441-bc6c-5d4a18da7d08.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/godlikehhd_ifd_new_correct_all_sample_2500_qwen/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ifd_new_correct_all_sample_2500_qwen", - "id": "godlikehhd/ifd_new_correct_all_sample_2500_qwen", - "developer": "godlikehhd", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.544 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 
1.0 - }, - "score_details": { - "score": 0.3376 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.402 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0959 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2903 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3562 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2889 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/godlikehhd/ifd_new_correct_sample_2500_qwen/78407b2e-1f44-46f0-bc21-76bdc68f8d9c.json b/data/hfopenllm_v2/godlikehhd/ifd_new_correct_sample_2500_qwen/78407b2e-1f44-46f0-bc21-76bdc68f8d9c.json deleted file mode 100644 index 72f6ab1df..000000000 --- a/data/hfopenllm_v2/godlikehhd/ifd_new_correct_sample_2500_qwen/78407b2e-1f44-46f0-bc21-76bdc68f8d9c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/godlikehhd_ifd_new_correct_sample_2500_qwen/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ifd_new_correct_sample_2500_qwen", - "id": "godlikehhd/ifd_new_correct_sample_2500_qwen", - "developer": "godlikehhd", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.544 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3397 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - 
"hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.411 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1042 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3079 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3627 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2932 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/godlikehhd/ifd_new_qwen_2500/bdb9e2d2-8d09-4994-a320-2f968bcb4898.json b/data/hfopenllm_v2/godlikehhd/ifd_new_qwen_2500/bdb9e2d2-8d09-4994-a320-2f968bcb4898.json deleted file mode 100644 index dd934c455..000000000 --- a/data/hfopenllm_v2/godlikehhd/ifd_new_qwen_2500/bdb9e2d2-8d09-4994-a320-2f968bcb4898.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/godlikehhd_ifd_new_qwen_2500/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ifd_new_qwen_2500", - "id": "godlikehhd/ifd_new_qwen_2500", - "developer": "godlikehhd", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.544 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.324 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.416 - } - }, - { - 
"evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1118 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3003 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.359 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2911 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/godlikehhd/qwen-2.5-1.5b-cherry/c57d15c8-9581-4bb5-89e4-2fea1e3c584e.json b/data/hfopenllm_v2/godlikehhd/qwen-2.5-1.5b-cherry/c57d15c8-9581-4bb5-89e4-2fea1e3c584e.json deleted file mode 100644 index 0eae1c7ae..000000000 --- a/data/hfopenllm_v2/godlikehhd/qwen-2.5-1.5b-cherry/c57d15c8-9581-4bb5-89e4-2fea1e3c584e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/godlikehhd_qwen-2.5-1.5b-cherry/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "qwen-2.5-1.5b-cherry", - "id": "godlikehhd/qwen-2.5-1.5b-cherry", - "developer": "godlikehhd", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.772 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2893 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4036 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match 
on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.102 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3003 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3456 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2923 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/godlikehhd/qwen_2.5-1.5b-cherry_new/550d5665-7a8a-437e-b318-000690dd250f.json b/data/hfopenllm_v2/godlikehhd/qwen_2.5-1.5b-cherry_new/550d5665-7a8a-437e-b318-000690dd250f.json deleted file mode 100644 index 8676d2e1f..000000000 --- a/data/hfopenllm_v2/godlikehhd/qwen_2.5-1.5b-cherry_new/550d5665-7a8a-437e-b318-000690dd250f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/godlikehhd_qwen_2.5-1.5b-cherry_new/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "qwen_2.5-1.5b-cherry_new", - "id": "godlikehhd/qwen_2.5-1.5b-cherry_new", - "developer": "godlikehhd", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.544 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.312 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.415 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0967 - } - }, - { - "evaluation_name": "GPQA", - 
"source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2978 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3496 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2894 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/godlikehhd/qwen_full_data_alpaca/a1922f33-32f5-4f99-8df6-e2080808d292.json b/data/hfopenllm_v2/godlikehhd/qwen_full_data_alpaca/a1922f33-32f5-4f99-8df6-e2080808d292.json deleted file mode 100644 index 0a3408038..000000000 --- a/data/hfopenllm_v2/godlikehhd/qwen_full_data_alpaca/a1922f33-32f5-4f99-8df6-e2080808d292.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/godlikehhd_qwen_full_data_alpaca/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "qwen_full_data_alpaca", - "id": "godlikehhd/qwen_full_data_alpaca", - "developer": "godlikehhd", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.544 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3136 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4229 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0921 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2928 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4052 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2851 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/godlikehhd/qwen_ins_ans_2500/6ccc376b-24a4-42cc-8ea0-823ef14336db.json b/data/hfopenllm_v2/godlikehhd/qwen_ins_ans_2500/6ccc376b-24a4-42cc-8ea0-823ef14336db.json deleted file mode 100644 index 79114530d..000000000 --- a/data/hfopenllm_v2/godlikehhd/qwen_ins_ans_2500/6ccc376b-24a4-42cc-8ea0-823ef14336db.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/godlikehhd_qwen_ins_ans_2500/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "qwen_ins_ans_2500", - "id": "godlikehhd/qwen_ins_ans_2500", - "developer": "godlikehhd", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.544 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2698 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4074 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.114 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2919 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3589 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2809 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/google/codegemma-1.1-2b/6547b6f3-63dd-4516-b294-62c4246c3dc7.json b/data/hfopenllm_v2/google/codegemma-1.1-2b/6547b6f3-63dd-4516-b294-62c4246c3dc7.json deleted file mode 100644 index 262dce559..000000000 --- a/data/hfopenllm_v2/google/codegemma-1.1-2b/6547b6f3-63dd-4516-b294-62c4246c3dc7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/google_codegemma-1.1-2b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "codegemma-1.1-2b", - "id": "google/codegemma-1.1-2b", - "developer": "google", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "GemmaForCausalLM", - "params_billions": 2.506 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2294 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3353 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0128 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2651 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3871 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - 
"dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1278 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/google/flan-t5-base/a58bf2d3-d209-41b8-a795-ba7a16e4a28f.json b/data/hfopenllm_v2/google/flan-t5-base/a58bf2d3-d209-41b8-a795-ba7a16e4a28f.json deleted file mode 100644 index 42df32f4d..000000000 --- a/data/hfopenllm_v2/google/flan-t5-base/a58bf2d3-d209-41b8-a795-ba7a16e4a28f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/google_flan-t5-base/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "flan-t5-base", - "id": "google/flan-t5-base", - "developer": "google", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "T5ForConditionalGeneration", - "params_billions": 0.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1891 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3526 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0106 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2383 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3671 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.1357 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/google/flan-t5-large/b15ad3b5-7ef2-439e-9acd-a85eab520d31.json b/data/hfopenllm_v2/google/flan-t5-large/b15ad3b5-7ef2-439e-9acd-a85eab520d31.json deleted file mode 100644 index 12a83a4b2..000000000 --- a/data/hfopenllm_v2/google/flan-t5-large/b15ad3b5-7ef2-439e-9acd-a85eab520d31.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/google_flan-t5-large/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "flan-t5-large", - "id": "google/flan-t5-large", - "developer": "google", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "T5ForConditionalGeneration", - "params_billions": 0.783 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2201 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4153 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0144 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2508 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4083 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1709 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/google/flan-t5-small/64da2654-9fdb-4a08-ad16-cf8793a30ed8.json b/data/hfopenllm_v2/google/flan-t5-small/64da2654-9fdb-4a08-ad16-cf8793a30ed8.json deleted file mode 100644 index 736e0e185..000000000 --- 
a/data/hfopenllm_v2/google/flan-t5-small/64da2654-9fdb-4a08-ad16-cf8793a30ed8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/google_flan-t5-small/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "flan-t5-small", - "id": "google/flan-t5-small", - "developer": "google", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "T5ForConditionalGeneration", - "params_billions": 0.077 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1524 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3283 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0076 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2609 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4123 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1233 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/google/flan-t5-xl/37080215-ee30-4e59-a407-b14695ac2a38.json b/data/hfopenllm_v2/google/flan-t5-xl/37080215-ee30-4e59-a407-b14695ac2a38.json deleted file mode 100644 index 5eaa5962e..000000000 --- a/data/hfopenllm_v2/google/flan-t5-xl/37080215-ee30-4e59-a407-b14695ac2a38.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/google_flan-t5-xl/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - 
"source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "flan-t5-xl", - "id": "google/flan-t5-xl", - "developer": "google", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "T5ForConditionalGeneration", - "params_billions": 2.85 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2237 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4531 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0076 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2525 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4181 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2147 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/google/flan-t5-xl/b83a0ce7-bf13-4a98-81f3-04e5a44105f7.json b/data/hfopenllm_v2/google/flan-t5-xl/b83a0ce7-bf13-4a98-81f3-04e5a44105f7.json deleted file mode 100644 index e4219b2cd..000000000 --- a/data/hfopenllm_v2/google/flan-t5-xl/b83a0ce7-bf13-4a98-81f3-04e5a44105f7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/google_flan-t5-xl/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "flan-t5-xl", - "id": "google/flan-t5-xl", - "developer": "google", - "inference_platform": "unknown", - 
"additional_details": { - "precision": "bfloat16", - "architecture": "T5ForConditionalGeneration", - "params_billions": 2.85 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2207 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4537 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0008 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2458 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.422 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2142 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/google/flan-t5-xxl/bb7bea21-5bc6-460d-98ff-b3ed02d5b215.json b/data/hfopenllm_v2/google/flan-t5-xxl/bb7bea21-5bc6-460d-98ff-b3ed02d5b215.json deleted file mode 100644 index e29cc1652..000000000 --- a/data/hfopenllm_v2/google/flan-t5-xxl/bb7bea21-5bc6-460d-98ff-b3ed02d5b215.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/google_flan-t5-xxl/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "flan-t5-xxl", - "id": "google/flan-t5-xxl", - "developer": "google", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "T5ForConditionalGeneration", - "params_billions": 11.267 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - 
}, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.22 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5066 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0106 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2701 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4218 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2343 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/google/flan-ul2/da9ddecc-43cf-4055-a19e-795b1ee98826.json b/data/hfopenllm_v2/google/flan-ul2/da9ddecc-43cf-4055-a19e-795b1ee98826.json deleted file mode 100644 index efa59a78c..000000000 --- a/data/hfopenllm_v2/google/flan-ul2/da9ddecc-43cf-4055-a19e-795b1ee98826.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/google_flan-ul2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "flan-ul2", - "id": "google/flan-ul2", - "developer": "google", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "T5ForConditionalGeneration", - "params_billions": 19.46 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2393 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": 
"hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5054 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0091 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2878 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3844 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2493 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/google/gemma-1.1-2b-it/a93ccb3f-f2d9-415d-8397-0c7fb765fada.json b/data/hfopenllm_v2/google/gemma-1.1-2b-it/a93ccb3f-f2d9-415d-8397-0c7fb765fada.json deleted file mode 100644 index f13cf1295..000000000 --- a/data/hfopenllm_v2/google/gemma-1.1-2b-it/a93ccb3f-f2d9-415d-8397-0c7fb765fada.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/google_gemma-1.1-2b-it/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "gemma-1.1-2b-it", - "id": "google/gemma-1.1-2b-it", - "developer": "google", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "GemmaForCausalLM", - "params_billions": 2.506 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3067 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3185 - } - }, - { - 
"evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0181 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2693 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3394 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1484 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/google/gemma-1.1-7b-it/d0f86765-bdb4-4367-986b-28303bbe1844.json b/data/hfopenllm_v2/google/gemma-1.1-7b-it/d0f86765-bdb4-4367-986b-28303bbe1844.json deleted file mode 100644 index 53d0c2f3f..000000000 --- a/data/hfopenllm_v2/google/gemma-1.1-7b-it/d0f86765-bdb4-4367-986b-28303bbe1844.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/google_gemma-1.1-7b-it/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "gemma-1.1-7b-it", - "id": "google/gemma-1.1-7b-it", - "developer": "google", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "GemmaForCausalLM", - "params_billions": 8.538 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5039 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3935 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0491 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2936 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.423 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2584 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/google/gemma-2-27b-it/693bb191-ae83-49dc-9df1-2f68b1b5fe4a.json b/data/hfopenllm_v2/google/gemma-2-27b-it/693bb191-ae83-49dc-9df1-2f68b1b5fe4a.json deleted file mode 100644 index 3ca551306..000000000 --- a/data/hfopenllm_v2/google/gemma-2-27b-it/693bb191-ae83-49dc-9df1-2f68b1b5fe4a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/google_gemma-2-27b-it/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "gemma-2-27b-it", - "id": "google/gemma-2-27b-it", - "developer": "google", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 27.227 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7978 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6451 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2387 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.375 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4033 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4451 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/google/gemma-2-27b/7b2c0b72-6421-4f33-8593-a4bbfd0c6d6b.json b/data/hfopenllm_v2/google/gemma-2-27b/7b2c0b72-6421-4f33-8593-a4bbfd0c6d6b.json deleted file mode 100644 index b5cad47da..000000000 --- a/data/hfopenllm_v2/google/gemma-2-27b/7b2c0b72-6421-4f33-8593-a4bbfd0c6d6b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/google_gemma-2-27b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "gemma-2-27b", - "id": "google/gemma-2-27b", - "developer": "google", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 27.227 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2475 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5643 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1662 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3507 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - 
"hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4396 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4371 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/google/gemma-2-2b-it/c4ee822f-fc8b-4523-95b6-7c3f12a334b3.json b/data/hfopenllm_v2/google/gemma-2-2b-it/c4ee822f-fc8b-4523-95b6-7c3f12a334b3.json deleted file mode 100644 index 9103c1efd..000000000 --- a/data/hfopenllm_v2/google/gemma-2-2b-it/c4ee822f-fc8b-4523-95b6-7c3f12a334b3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/google_gemma-2-2b-it/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "gemma-2-2b-it", - "id": "google/gemma-2-2b-it", - "developer": "google", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "InternLM2ForCausalLM", - "params_billions": 2.614 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5668 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4199 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0008 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2743 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3929 - } - }, - { - "evaluation_name": "MMLU-PRO", - 
"source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.255 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/google/gemma-2-2b-jpn-it/1810033a-185b-4c91-91d3-43b8f6c61443.json b/data/hfopenllm_v2/google/gemma-2-2b-jpn-it/1810033a-185b-4c91-91d3-43b8f6c61443.json deleted file mode 100644 index 4fde83e45..000000000 --- a/data/hfopenllm_v2/google/gemma-2-2b-jpn-it/1810033a-185b-4c91-91d3-43b8f6c61443.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/google_gemma-2-2b-jpn-it/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "gemma-2-2b-jpn-it", - "id": "google/gemma-2-2b-jpn-it", - "developer": "google", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 2.614 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5078 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4226 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0347 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2852 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3964 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.2578 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/google/gemma-2-2b-jpn-it/beb721ae-a35c-4f6b-a80f-aac4835d5f8d.json b/data/hfopenllm_v2/google/gemma-2-2b-jpn-it/beb721ae-a35c-4f6b-a80f-aac4835d5f8d.json deleted file mode 100644 index 1442b3bca..000000000 --- a/data/hfopenllm_v2/google/gemma-2-2b-jpn-it/beb721ae-a35c-4f6b-a80f-aac4835d5f8d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/google_gemma-2-2b-jpn-it/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "gemma-2-2b-jpn-it", - "id": "google/gemma-2-2b-jpn-it", - "developer": "google", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 2.614 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5288 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4178 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0476 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2752 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3728 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2467 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/google/gemma-2-2b/cf20e77a-340f-4d8d-b593-9645bdfc5877.json b/data/hfopenllm_v2/google/gemma-2-2b/cf20e77a-340f-4d8d-b593-9645bdfc5877.json 
deleted file mode 100644 index c8343d88b..000000000 --- a/data/hfopenllm_v2/google/gemma-2-2b/cf20e77a-340f-4d8d-b593-9645bdfc5877.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/google_gemma-2-2b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "gemma-2-2b", - "id": "google/gemma-2-2b", - "developer": "google", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "InternLM2ForCausalLM", - "params_billions": 2.614 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2018 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3709 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0302 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2626 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4219 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2217 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/google/gemma-2-2b/eec73e49-ac2b-42ed-a115-76e45007cd5d.json b/data/hfopenllm_v2/google/gemma-2-2b/eec73e49-ac2b-42ed-a115-76e45007cd5d.json deleted file mode 100644 index 96086b370..000000000 --- a/data/hfopenllm_v2/google/gemma-2-2b/eec73e49-ac2b-42ed-a115-76e45007cd5d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/google_gemma-2-2b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", 
- "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "gemma-2-2b", - "id": "google/gemma-2-2b", - "developer": "google", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "InternLM2ForCausalLM", - "params_billions": 2.614 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1993 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3656 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0287 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2626 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4232 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.218 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/google/gemma-2-9b-it/aa06d058-87f9-4fde-ad53-139b29a71448.json b/data/hfopenllm_v2/google/gemma-2-9b-it/aa06d058-87f9-4fde-ad53-139b29a71448.json deleted file mode 100644 index 8f9ad5f6e..000000000 --- a/data/hfopenllm_v2/google/gemma-2-9b-it/aa06d058-87f9-4fde-ad53-139b29a71448.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/google_gemma-2-9b-it/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "gemma-2-9b-it", - "id": "google/gemma-2-9b-it", - "developer": "google", - 
"inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 9.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7436 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.599 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1949 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3607 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4073 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3875 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/google/gemma-2-9b/3f1d571a-fc42-411b-88ab-4700d5861367.json b/data/hfopenllm_v2/google/gemma-2-9b/3f1d571a-fc42-411b-88ab-4700d5861367.json deleted file mode 100644 index c496a90c9..000000000 --- a/data/hfopenllm_v2/google/gemma-2-9b/3f1d571a-fc42-411b-88ab-4700d5861367.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/google_gemma-2-9b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "gemma-2-9b", - "id": "google/gemma-2-9b", - "developer": "google", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 9.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": 
"google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.204 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5377 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1344 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3289 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4461 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4103 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/google/gemma-2b-it/74a56080-aeb2-4cc6-a825-bbe4d9a5900a.json b/data/hfopenllm_v2/google/gemma-2b-it/74a56080-aeb2-4cc6-a825-bbe4d9a5900a.json deleted file mode 100644 index e1752b69f..000000000 --- a/data/hfopenllm_v2/google/gemma-2b-it/74a56080-aeb2-4cc6-a825-bbe4d9a5900a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/google_gemma-2b-it/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "gemma-2b-it", - "id": "google/gemma-2b-it", - "developer": "google", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "GemmaForCausalLM", - "params_billions": 2.506 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.269 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": 
"BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3151 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0204 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2785 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3341 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1353 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/google/gemma-2b/2eb433ba-5c93-4355-99dd-edcb65721603.json b/data/hfopenllm_v2/google/gemma-2b/2eb433ba-5c93-4355-99dd-edcb65721603.json deleted file mode 100644 index 3c6b79553..000000000 --- a/data/hfopenllm_v2/google/gemma-2b/2eb433ba-5c93-4355-99dd-edcb65721603.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/google_gemma-2b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "gemma-2b", - "id": "google/gemma-2b", - "developer": "google", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "GemmaForCausalLM", - "params_billions": 2.506 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2038 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3366 - } - }, - { - "evaluation_name": "MATH Level 
5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0302 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.255 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3978 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1366 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/google/gemma-7b-it/826fc3ab-6ff8-44fa-a745-a0b80bcb2db4.json b/data/hfopenllm_v2/google/gemma-7b-it/826fc3ab-6ff8-44fa-a745-a0b80bcb2db4.json deleted file mode 100644 index 47f028978..000000000 --- a/data/hfopenllm_v2/google/gemma-7b-it/826fc3ab-6ff8-44fa-a745-a0b80bcb2db4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/google_gemma-7b-it/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "gemma-7b-it", - "id": "google/gemma-7b-it", - "developer": "google", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "GemmaForCausalLM", - "params_billions": 8.538 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3868 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3646 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.0295 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2844 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4274 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1695 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/google/gemma-7b/6da54964-e3b5-4567-8ce4-7e0f279af84f.json b/data/hfopenllm_v2/google/gemma-7b/6da54964-e3b5-4567-8ce4-7e0f279af84f.json deleted file mode 100644 index 0bb149bf8..000000000 --- a/data/hfopenllm_v2/google/gemma-7b/6da54964-e3b5-4567-8ce4-7e0f279af84f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/google_gemma-7b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "gemma-7b", - "id": "google/gemma-7b", - "developer": "google", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "GemmaForCausalLM", - "params_billions": 8.538 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2659 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4362 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.074 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2869 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4062 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2948 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/google/mt5-base/a7dde688-a0ae-4731-909f-0bef0c6eeba9.json b/data/hfopenllm_v2/google/mt5-base/a7dde688-a0ae-4731-909f-0bef0c6eeba9.json deleted file mode 100644 index 3221afa06..000000000 --- a/data/hfopenllm_v2/google/mt5-base/a7dde688-a0ae-4731-909f-0bef0c6eeba9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/google_mt5-base/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "mt5-base", - "id": "google/mt5-base", - "developer": "google", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MT5ForConditionalGeneration", - "params_billions": 0.39 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1645 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2883 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0091 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2391 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3672 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.107 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/google/mt5-small/eb2a8a60-2240-4b08-9dc3-be0215aa7bfc.json b/data/hfopenllm_v2/google/mt5-small/eb2a8a60-2240-4b08-9dc3-be0215aa7bfc.json deleted file mode 100644 index f462ca442..000000000 --- a/data/hfopenllm_v2/google/mt5-small/eb2a8a60-2240-4b08-9dc3-be0215aa7bfc.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/google_mt5-small/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "mt5-small", - "id": "google/mt5-small", - "developer": "google", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MT5ForConditionalGeneration", - "params_billions": 0.17 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1718 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2766 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2424 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3857 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1123 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/google/mt5-xl/9b05919f-d7c1-4e04-9dd8-9ae70e0005e6.json b/data/hfopenllm_v2/google/mt5-xl/9b05919f-d7c1-4e04-9dd8-9ae70e0005e6.json deleted file mode 100644 index 53e4e6095..000000000 --- a/data/hfopenllm_v2/google/mt5-xl/9b05919f-d7c1-4e04-9dd8-9ae70e0005e6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/google_mt5-xl/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "mt5-xl", - "id": "google/mt5-xl", - "developer": "google", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MT5ForConditionalGeneration", - "params_billions": 3.23 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.196 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3047 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2643 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3795 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.112 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/google/mt5-xxl/6cd98538-74b6-4ac6-a3ac-9a311cfe47f6.json b/data/hfopenllm_v2/google/mt5-xxl/6cd98538-74b6-4ac6-a3ac-9a311cfe47f6.json deleted file mode 100644 index 151ffbc18..000000000 --- a/data/hfopenllm_v2/google/mt5-xxl/6cd98538-74b6-4ac6-a3ac-9a311cfe47f6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/google_mt5-xxl/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "mt5-xxl", - "id": "google/mt5-xxl", - "developer": "google", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "T5ForConditionalGeneration", - "params_billions": 11.9 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2358 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2959 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2416 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3689 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1089 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/google/recurrentgemma-2b-it/b0ca2dec-387f-4b27-9adb-772af1899832.json b/data/hfopenllm_v2/google/recurrentgemma-2b-it/b0ca2dec-387f-4b27-9adb-772af1899832.json deleted file mode 100644 index a93defb31..000000000 --- a/data/hfopenllm_v2/google/recurrentgemma-2b-it/b0ca2dec-387f-4b27-9adb-772af1899832.json +++ 
/dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/google_recurrentgemma-2b-it/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "recurrentgemma-2b-it", - "id": "google/recurrentgemma-2b-it", - "developer": "google", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "RecurrentGemmaForCausalLM", - "params_billions": 2.683 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2949 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.333 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0196 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2534 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3341 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1402 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/google/recurrentgemma-2b/53c4b397-b78e-4699-a01e-3535aa072225.json b/data/hfopenllm_v2/google/recurrentgemma-2b/53c4b397-b78e-4699-a01e-3535aa072225.json deleted file mode 100644 index ea66c049b..000000000 --- a/data/hfopenllm_v2/google/recurrentgemma-2b/53c4b397-b78e-4699-a01e-3535aa072225.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/google_recurrentgemma-2b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": 
"documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "recurrentgemma-2b", - "id": "google/recurrentgemma-2b", - "developer": "google", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "RecurrentGemmaForCausalLM", - "params_billions": 2.683 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3017 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3197 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0204 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2458 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3446 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1176 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/google/recurrentgemma-9b-it/f5b251f0-741c-4ad5-ab04-19c5202854ea.json b/data/hfopenllm_v2/google/recurrentgemma-9b-it/f5b251f0-741c-4ad5-ab04-19c5202854ea.json deleted file mode 100644 index 001b78f3b..000000000 --- a/data/hfopenllm_v2/google/recurrentgemma-9b-it/f5b251f0-741c-4ad5-ab04-19c5202854ea.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/google_recurrentgemma-9b-it/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "recurrentgemma-9b-it", - "id": "google/recurrentgemma-9b-it", - "developer": "google", - "inference_platform": 
"unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "RecurrentGemmaForCausalLM", - "params_billions": 9.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.501 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4367 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0665 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2701 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4379 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2843 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/google/recurrentgemma-9b/7b2ba13a-e01d-4442-9abe-d16df1a1668a.json b/data/hfopenllm_v2/google/recurrentgemma-9b/7b2ba13a-e01d-4442-9abe-d16df1a1668a.json deleted file mode 100644 index 549f07884..000000000 --- a/data/hfopenllm_v2/google/recurrentgemma-9b/7b2ba13a-e01d-4442-9abe-d16df1a1668a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/google_recurrentgemma-9b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "recurrentgemma-9b", - "id": "google/recurrentgemma-9b", - "developer": "google", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "RecurrentGemmaForCausalLM", - "params_billions": 9.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": 
"hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3116 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3956 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0665 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2852 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3803 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2605 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/google/switch-base-8/bf79f87c-3f14-49e8-acba-725e709d5f11.json b/data/hfopenllm_v2/google/switch-base-8/bf79f87c-3f14-49e8-acba-725e709d5f11.json deleted file mode 100644 index 1bc6517df..000000000 --- a/data/hfopenllm_v2/google/switch-base-8/bf79f87c-3f14-49e8-acba-725e709d5f11.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/google_switch-base-8/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "switch-base-8", - "id": "google/switch-base-8", - "developer": "google", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "SwitchTransformersForConditionalGeneration", - "params_billions": 0.62 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1585 - } - }, - { - 
"evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2876 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.25 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3517 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1098 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/google/umt5-base/3fbac7d4-cbbb-4b77-9db4-fd7e122cc90e.json b/data/hfopenllm_v2/google/umt5-base/3fbac7d4-cbbb-4b77-9db4-fd7e122cc90e.json deleted file mode 100644 index ee9655c66..000000000 --- a/data/hfopenllm_v2/google/umt5-base/3fbac7d4-cbbb-4b77-9db4-fd7e122cc90e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/google_umt5-base/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "umt5-base", - "id": "google/umt5-base", - "developer": "google", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "UMT5ForConditionalGeneration", - "params_billions": -1.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1746 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.2788 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0045 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2542 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3382 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1078 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/goulue5/merging_LLM/6efd0dbd-b8c1-4c66-bdf7-19055c16ca22.json b/data/hfopenllm_v2/goulue5/merging_LLM/6efd0dbd-b8c1-4c66-bdf7-19055c16ca22.json deleted file mode 100644 index 384e12c7d..000000000 --- a/data/hfopenllm_v2/goulue5/merging_LLM/6efd0dbd-b8c1-4c66-bdf7-19055c16ca22.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/goulue5_merging_LLM/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "merging_LLM", - "id": "goulue5/merging_LLM", - "developer": "goulue5", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.544 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3233 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4216 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0967 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2911 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4333 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2958 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/gradientai/Llama-3-8B-Instruct-Gradient-1048k/1388b8d4-c711-480c-8a06-a8b7bd8aa79c.json b/data/hfopenllm_v2/gradientai/Llama-3-8B-Instruct-Gradient-1048k/1388b8d4-c711-480c-8a06-a8b7bd8aa79c.json deleted file mode 100644 index 62b5a87b9..000000000 --- a/data/hfopenllm_v2/gradientai/Llama-3-8B-Instruct-Gradient-1048k/1388b8d4-c711-480c-8a06-a8b7bd8aa79c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/gradientai_Llama-3-8B-Instruct-Gradient-1048k/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-8B-Instruct-Gradient-1048k", - "id": "gradientai/Llama-3-8B-Instruct-Gradient-1048k", - "developer": "gradientai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4456 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4346 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0536 - } - }, - { 
- "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2777 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4298 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.294 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/grimjim/DeepSauerHuatuoSkywork-R1-o1-Llama-3.1-8B/03393ffd-1923-4767-ba14-d0e3e6751842.json b/data/hfopenllm_v2/grimjim/DeepSauerHuatuoSkywork-R1-o1-Llama-3.1-8B/03393ffd-1923-4767-ba14-d0e3e6751842.json deleted file mode 100644 index ff2605288..000000000 --- a/data/hfopenllm_v2/grimjim/DeepSauerHuatuoSkywork-R1-o1-Llama-3.1-8B/03393ffd-1923-4767-ba14-d0e3e6751842.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/grimjim_DeepSauerHuatuoSkywork-R1-o1-Llama-3.1-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "DeepSauerHuatuoSkywork-R1-o1-Llama-3.1-8B", - "id": "grimjim/DeepSauerHuatuoSkywork-R1-o1-Llama-3.1-8B", - "developer": "grimjim", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4797 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5269 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2221 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": 
"Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3381 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4408 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3957 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/grimjim/Gigantes-v1-gemma2-9b-it/b7d049dc-127d-4075-8067-22adac9a58c3.json b/data/hfopenllm_v2/grimjim/Gigantes-v1-gemma2-9b-it/b7d049dc-127d-4075-8067-22adac9a58c3.json deleted file mode 100644 index fac07281a..000000000 --- a/data/hfopenllm_v2/grimjim/Gigantes-v1-gemma2-9b-it/b7d049dc-127d-4075-8067-22adac9a58c3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/grimjim_Gigantes-v1-gemma2-9b-it/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gigantes-v1-gemma2-9b-it", - "id": "grimjim/Gigantes-v1-gemma2-9b-it", - "developer": "grimjim", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 9.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6925 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5978 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2145 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 
0.3532 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4555 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4225 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/grimjim/Gigantes-v2-gemma2-9b-it/89d79024-f4b8-4165-bd88-47f2b0010800.json b/data/hfopenllm_v2/grimjim/Gigantes-v2-gemma2-9b-it/89d79024-f4b8-4165-bd88-47f2b0010800.json deleted file mode 100644 index b451c7250..000000000 --- a/data/hfopenllm_v2/grimjim/Gigantes-v2-gemma2-9b-it/89d79024-f4b8-4165-bd88-47f2b0010800.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/grimjim_Gigantes-v2-gemma2-9b-it/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gigantes-v2-gemma2-9b-it", - "id": "grimjim/Gigantes-v2-gemma2-9b-it", - "developer": "grimjim", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 9.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7351 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5987 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2017 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3515 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", 
- "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4595 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4259 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/grimjim/Gigantes-v3-gemma2-9b-it/d2c0fb0d-6c0c-464a-b09f-6382a57b6afb.json b/data/hfopenllm_v2/grimjim/Gigantes-v3-gemma2-9b-it/d2c0fb0d-6c0c-464a-b09f-6382a57b6afb.json deleted file mode 100644 index 918da3c13..000000000 --- a/data/hfopenllm_v2/grimjim/Gigantes-v3-gemma2-9b-it/d2c0fb0d-6c0c-464a-b09f-6382a57b6afb.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/grimjim_Gigantes-v3-gemma2-9b-it/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gigantes-v3-gemma2-9b-it", - "id": "grimjim/Gigantes-v3-gemma2-9b-it", - "developer": "grimjim", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 9.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6976 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5984 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.21 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3565 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4608 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": 
"MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4226 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/grimjim/HuatuoSkywork-o1-Llama-3.1-8B/a891b28a-2dcc-4b8e-ad20-1f23d663b44b.json b/data/hfopenllm_v2/grimjim/HuatuoSkywork-o1-Llama-3.1-8B/a891b28a-2dcc-4b8e-ad20-1f23d663b44b.json deleted file mode 100644 index 6204ca20a..000000000 --- a/data/hfopenllm_v2/grimjim/HuatuoSkywork-o1-Llama-3.1-8B/a891b28a-2dcc-4b8e-ad20-1f23d663b44b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/grimjim_HuatuoSkywork-o1-Llama-3.1-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "HuatuoSkywork-o1-Llama-3.1-8B", - "id": "grimjim/HuatuoSkywork-o1-Llama-3.1-8B", - "developer": "grimjim", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3961 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4886 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3882 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2928 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3839 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3095 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/grimjim/Llama-3-Instruct-8B-SPPO-Iter3-SimPO-merge/55e274bb-1e2c-4402-b7ae-09ff7b1f9738.json b/data/hfopenllm_v2/grimjim/Llama-3-Instruct-8B-SPPO-Iter3-SimPO-merge/55e274bb-1e2c-4402-b7ae-09ff7b1f9738.json deleted file mode 100644 index bd55b41d5..000000000 --- a/data/hfopenllm_v2/grimjim/Llama-3-Instruct-8B-SPPO-Iter3-SimPO-merge/55e274bb-1e2c-4402-b7ae-09ff7b1f9738.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/grimjim_Llama-3-Instruct-8B-SPPO-Iter3-SimPO-merge/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-Instruct-8B-SPPO-Iter3-SimPO-merge", - "id": "grimjim/Llama-3-Instruct-8B-SPPO-Iter3-SimPO-merge", - "developer": "grimjim", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4271 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4962 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0997 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2903 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4043 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3625 - } - } - ] -} \ No 
newline at end of file diff --git a/data/hfopenllm_v2/grimjim/Llama-3-Instruct-8B-SimPO-SPPO-Iter3-merge/fe7a6940-fc4c-4345-84be-609c8155be57.json b/data/hfopenllm_v2/grimjim/Llama-3-Instruct-8B-SimPO-SPPO-Iter3-merge/fe7a6940-fc4c-4345-84be-609c8155be57.json deleted file mode 100644 index c0a6840ce..000000000 --- a/data/hfopenllm_v2/grimjim/Llama-3-Instruct-8B-SimPO-SPPO-Iter3-merge/fe7a6940-fc4c-4345-84be-609c8155be57.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/grimjim_Llama-3-Instruct-8B-SimPO-SPPO-Iter3-merge/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-Instruct-8B-SimPO-SPPO-Iter3-merge", - "id": "grimjim/Llama-3-Instruct-8B-SimPO-SPPO-Iter3-merge", - "developer": "grimjim", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6806 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5022 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0891 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2626 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3885 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3684 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/grimjim/Llama-3.1-8B-Instruct-abliterated_via_adapter/77eb2b0f-e3e3-474c-bb02-dabde2998ef0.json b/data/hfopenllm_v2/grimjim/Llama-3.1-8B-Instruct-abliterated_via_adapter/77eb2b0f-e3e3-474c-bb02-dabde2998ef0.json deleted file mode 100644 index b33246650..000000000 --- a/data/hfopenllm_v2/grimjim/Llama-3.1-8B-Instruct-abliterated_via_adapter/77eb2b0f-e3e3-474c-bb02-dabde2998ef0.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/grimjim_Llama-3.1-8B-Instruct-abliterated_via_adapter/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.1-8B-Instruct-abliterated_via_adapter", - "id": "grimjim/Llama-3.1-8B-Instruct-abliterated_via_adapter", - "developer": "grimjim", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.487 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5105 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1397 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3138 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.401 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3651 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/grimjim/Llama-3.1-Bonsaikraft-8B-Instruct/94d744be-5d28-490a-ba9a-8440cb97dce9.json 
b/data/hfopenllm_v2/grimjim/Llama-3.1-Bonsaikraft-8B-Instruct/94d744be-5d28-490a-ba9a-8440cb97dce9.json deleted file mode 100644 index 3c3c836ca..000000000 --- a/data/hfopenllm_v2/grimjim/Llama-3.1-Bonsaikraft-8B-Instruct/94d744be-5d28-490a-ba9a-8440cb97dce9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/grimjim_Llama-3.1-Bonsaikraft-8B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.1-Bonsaikraft-8B-Instruct", - "id": "grimjim/Llama-3.1-Bonsaikraft-8B-Instruct", - "developer": "grimjim", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.425 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5287 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1314 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3037 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4235 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3764 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/grimjim/Llama-Nephilim-Metamorphosis-v2-8B/2765061e-7506-4eb6-b63f-312f6290665a.json b/data/hfopenllm_v2/grimjim/Llama-Nephilim-Metamorphosis-v2-8B/2765061e-7506-4eb6-b63f-312f6290665a.json deleted file mode 100644 index ee870fb1c..000000000 --- 
a/data/hfopenllm_v2/grimjim/Llama-Nephilim-Metamorphosis-v2-8B/2765061e-7506-4eb6-b63f-312f6290665a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/grimjim_Llama-Nephilim-Metamorphosis-v2-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-Nephilim-Metamorphosis-v2-8B", - "id": "grimjim/Llama-Nephilim-Metamorphosis-v2-8B", - "developer": "grimjim", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4545 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5013 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1397 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.323 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4091 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3809 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/grimjim/Llama3.1-SuperNovaLite-HuatuoSkywork-o1-8B/167c937c-66c7-45a8-bbd9-97d98531bf7d.json b/data/hfopenllm_v2/grimjim/Llama3.1-SuperNovaLite-HuatuoSkywork-o1-8B/167c937c-66c7-45a8-bbd9-97d98531bf7d.json deleted file mode 100644 index 7a0c41480..000000000 --- a/data/hfopenllm_v2/grimjim/Llama3.1-SuperNovaLite-HuatuoSkywork-o1-8B/167c937c-66c7-45a8-bbd9-97d98531bf7d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - 
"schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/grimjim_Llama3.1-SuperNovaLite-HuatuoSkywork-o1-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama3.1-SuperNovaLite-HuatuoSkywork-o1-8B", - "id": "grimjim/Llama3.1-SuperNovaLite-HuatuoSkywork-o1-8B", - "developer": "grimjim", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4366 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5287 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3006 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3112 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3999 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3684 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/grimjim/Magnolia-v1-Gemma2-8k-9B/9587c35c-1def-46e7-8642-7acb0340be5e.json b/data/hfopenllm_v2/grimjim/Magnolia-v1-Gemma2-8k-9B/9587c35c-1def-46e7-8642-7acb0340be5e.json deleted file mode 100644 index 51f562ec4..000000000 --- a/data/hfopenllm_v2/grimjim/Magnolia-v1-Gemma2-8k-9B/9587c35c-1def-46e7-8642-7acb0340be5e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/grimjim_Magnolia-v1-Gemma2-8k-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - 
"source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Magnolia-v1-Gemma2-8k-9B", - "id": "grimjim/Magnolia-v1-Gemma2-8k-9B", - "developer": "grimjim", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 9.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3531 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5589 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1684 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3364 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4645 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4242 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/grimjim/Magnolia-v2-12B/1c9594fe-03d6-4ec1-9da5-99960da0dcd4.json b/data/hfopenllm_v2/grimjim/Magnolia-v2-12B/1c9594fe-03d6-4ec1-9da5-99960da0dcd4.json deleted file mode 100644 index 4765d94c9..000000000 --- a/data/hfopenllm_v2/grimjim/Magnolia-v2-12B/1c9594fe-03d6-4ec1-9da5-99960da0dcd4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/grimjim_Magnolia-v2-12B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Magnolia-v2-12B", - "id": "grimjim/Magnolia-v2-12B", 
- "developer": "grimjim", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3506 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.529 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1292 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3188 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4171 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3601 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/grimjim/Magnolia-v2-Gemma2-8k-9B/8ed2c4eb-bc72-4dde-a559-1afd1698d37d.json b/data/hfopenllm_v2/grimjim/Magnolia-v2-Gemma2-8k-9B/8ed2c4eb-bc72-4dde-a559-1afd1698d37d.json deleted file mode 100644 index 3274bce78..000000000 --- a/data/hfopenllm_v2/grimjim/Magnolia-v2-Gemma2-8k-9B/8ed2c4eb-bc72-4dde-a559-1afd1698d37d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/grimjim_Magnolia-v2-Gemma2-8k-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Magnolia-v2-Gemma2-8k-9B", - "id": "grimjim/Magnolia-v2-Gemma2-8k-9B", - "developer": "grimjim", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 9.242 - } - }, - "evaluation_results": [ - { - 
"evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7384 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6016 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2281 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3574 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4488 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4332 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/grimjim/Magnolia-v3-12B/a2f9536a-9266-4aee-be90-d04f4dcbe53c.json b/data/hfopenllm_v2/grimjim/Magnolia-v3-12B/a2f9536a-9266-4aee-be90-d04f4dcbe53c.json deleted file mode 100644 index 0565ceb88..000000000 --- a/data/hfopenllm_v2/grimjim/Magnolia-v3-12B/a2f9536a-9266-4aee-be90-d04f4dcbe53c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/grimjim_Magnolia-v3-12B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Magnolia-v3-12B", - "id": "grimjim/Magnolia-v3-12B", - "developer": "grimjim", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3965 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5327 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1352 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3255 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4184 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3615 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/grimjim/Magnolia-v3-Gemma2-8k-9B/7f116aaa-3880-4e53-948a-4b06e0d26cff.json b/data/hfopenllm_v2/grimjim/Magnolia-v3-Gemma2-8k-9B/7f116aaa-3880-4e53-948a-4b06e0d26cff.json deleted file mode 100644 index 5518b4113..000000000 --- a/data/hfopenllm_v2/grimjim/Magnolia-v3-Gemma2-8k-9B/7f116aaa-3880-4e53-948a-4b06e0d26cff.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/grimjim_Magnolia-v3-Gemma2-8k-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Magnolia-v3-Gemma2-8k-9B", - "id": "grimjim/Magnolia-v3-Gemma2-8k-9B", - "developer": "grimjim", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 9.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7378 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" 
- }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6015 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2319 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3565 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4488 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4337 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/grimjim/Magnolia-v4-12B/7cbe4516-2be2-421b-95f4-c9500ad64ca5.json b/data/hfopenllm_v2/grimjim/Magnolia-v4-12B/7cbe4516-2be2-421b-95f4-c9500ad64ca5.json deleted file mode 100644 index 53c5ccbfc..000000000 --- a/data/hfopenllm_v2/grimjim/Magnolia-v4-12B/7cbe4516-2be2-421b-95f4-c9500ad64ca5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/grimjim_Magnolia-v4-12B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Magnolia-v4-12B", - "id": "grimjim/Magnolia-v4-12B", - "developer": "grimjim", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3418 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5431 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": 
{ - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1314 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.328 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4211 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3672 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/grimjim/Magnolia-v5a-12B/07df565a-bc30-4a9d-b472-7a85f35938be.json b/data/hfopenllm_v2/grimjim/Magnolia-v5a-12B/07df565a-bc30-4a9d-b472-7a85f35938be.json deleted file mode 100644 index edba3c4ff..000000000 --- a/data/hfopenllm_v2/grimjim/Magnolia-v5a-12B/07df565a-bc30-4a9d-b472-7a85f35938be.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/grimjim_Magnolia-v5a-12B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Magnolia-v5a-12B", - "id": "grimjim/Magnolia-v5a-12B", - "developer": "grimjim", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4114 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5312 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 
0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1375 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3221 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4145 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3601 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/grimjim/Magot-v1-Gemma2-8k-9B/7545f7db-10bb-4d97-9b3f-4346f4f26bad.json b/data/hfopenllm_v2/grimjim/Magot-v1-Gemma2-8k-9B/7545f7db-10bb-4d97-9b3f-4346f4f26bad.json deleted file mode 100644 index e97c90ea3..000000000 --- a/data/hfopenllm_v2/grimjim/Magot-v1-Gemma2-8k-9B/7545f7db-10bb-4d97-9b3f-4346f4f26bad.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/grimjim_Magot-v1-Gemma2-8k-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Magot-v1-Gemma2-8k-9B", - "id": "grimjim/Magot-v1-Gemma2-8k-9B", - "developer": "grimjim", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 9.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2997 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6019 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0989 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3465 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4488 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4337 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/grimjim/Magot-v2-Gemma2-8k-9B/47384f10-ac6a-4629-92db-86f01a441f7f.json b/data/hfopenllm_v2/grimjim/Magot-v2-Gemma2-8k-9B/47384f10-ac6a-4629-92db-86f01a441f7f.json deleted file mode 100644 index f21dbc7f0..000000000 --- a/data/hfopenllm_v2/grimjim/Magot-v2-Gemma2-8k-9B/47384f10-ac6a-4629-92db-86f01a441f7f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/grimjim_Magot-v2-Gemma2-8k-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Magot-v2-Gemma2-8k-9B", - "id": "grimjim/Magot-v2-Gemma2-8k-9B", - "developer": "grimjim", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 9.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7347 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5897 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2017 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.354 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { 
- "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4344 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4223 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/grimjim/SauerHuatuoSkywork-o1-Llama-3.1-8B/3c9f022f-3e2b-48d6-acb9-07f066cfceb6.json b/data/hfopenllm_v2/grimjim/SauerHuatuoSkywork-o1-Llama-3.1-8B/3c9f022f-3e2b-48d6-acb9-07f066cfceb6.json deleted file mode 100644 index 8620f2a75..000000000 --- a/data/hfopenllm_v2/grimjim/SauerHuatuoSkywork-o1-Llama-3.1-8B/3c9f022f-3e2b-48d6-acb9-07f066cfceb6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/grimjim_SauerHuatuoSkywork-o1-Llama-3.1-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SauerHuatuoSkywork-o1-Llama-3.1-8B", - "id": "grimjim/SauerHuatuoSkywork-o1-Llama-3.1-8B", - "developer": "grimjim", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5219 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5222 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.173 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3213 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4527 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3991 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/grimjim/llama-3-Nephilim-v1-8B/1d851cfb-8624-4516-8204-85569c60dc67.json b/data/hfopenllm_v2/grimjim/llama-3-Nephilim-v1-8B/1d851cfb-8624-4516-8204-85569c60dc67.json deleted file mode 100644 index a154af168..000000000 --- a/data/hfopenllm_v2/grimjim/llama-3-Nephilim-v1-8B/1d851cfb-8624-4516-8204-85569c60dc67.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/grimjim_llama-3-Nephilim-v1-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "llama-3-Nephilim-v1-8B", - "id": "grimjim/llama-3-Nephilim-v1-8B", - "developer": "grimjim", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4277 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5132 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0906 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.302 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4136 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - 
"source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3796 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/grimjim/llama-3-Nephilim-v2-8B/a7990990-7498-4b74-a0aa-9c266910698e.json b/data/hfopenllm_v2/grimjim/llama-3-Nephilim-v2-8B/a7990990-7498-4b74-a0aa-9c266910698e.json deleted file mode 100644 index 961e8d5ae..000000000 --- a/data/hfopenllm_v2/grimjim/llama-3-Nephilim-v2-8B/a7990990-7498-4b74-a0aa-9c266910698e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/grimjim_llama-3-Nephilim-v2-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "llama-3-Nephilim-v2-8B", - "id": "grimjim/llama-3-Nephilim-v2-8B", - "developer": "grimjim", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3922 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5048 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1065 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2995 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3895 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, 
- "score_details": { - "score": 0.3641 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/grimjim/llama-3-Nephilim-v2.1-8B/0b41d37e-0728-4575-9662-c150e2e29bd0.json b/data/hfopenllm_v2/grimjim/llama-3-Nephilim-v2.1-8B/0b41d37e-0728-4575-9662-c150e2e29bd0.json deleted file mode 100644 index 2e431db78..000000000 --- a/data/hfopenllm_v2/grimjim/llama-3-Nephilim-v2.1-8B/0b41d37e-0728-4575-9662-c150e2e29bd0.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/grimjim_llama-3-Nephilim-v2.1-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "llama-3-Nephilim-v2.1-8B", - "id": "grimjim/llama-3-Nephilim-v2.1-8B", - "developer": "grimjim", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3895 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5095 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0997 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2995 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3935 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3644 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/grimjim/llama-3-Nephilim-v3-8B/c565a7e9-bd1b-41a5-bff3-3a349553f4e8.json 
b/data/hfopenllm_v2/grimjim/llama-3-Nephilim-v3-8B/c565a7e9-bd1b-41a5-bff3-3a349553f4e8.json deleted file mode 100644 index 123748d61..000000000 --- a/data/hfopenllm_v2/grimjim/llama-3-Nephilim-v3-8B/c565a7e9-bd1b-41a5-bff3-3a349553f4e8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/grimjim_llama-3-Nephilim-v3-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "llama-3-Nephilim-v3-8B", - "id": "grimjim/llama-3-Nephilim-v3-8B", - "developer": "grimjim", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4174 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5013 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0952 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2953 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3989 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3612 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/gupta-tanish/llama-7b-dpo-baseline/680a4507-755e-4014-877b-6032f0220270.json b/data/hfopenllm_v2/gupta-tanish/llama-7b-dpo-baseline/680a4507-755e-4014-877b-6032f0220270.json deleted file mode 100644 index 888dfc69c..000000000 --- 
a/data/hfopenllm_v2/gupta-tanish/llama-7b-dpo-baseline/680a4507-755e-4014-877b-6032f0220270.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/gupta-tanish_llama-7b-dpo-baseline/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "llama-7b-dpo-baseline", - "id": "gupta-tanish/llama-7b-dpo-baseline", - "developer": "gupta-tanish", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 6.738 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2693 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3897 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0196 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2626 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4456 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2028 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/gz987/qwen2.5-7b-cabs-v0.1/5ace8dc6-e348-4267-bb4a-f71a335d074e.json b/data/hfopenllm_v2/gz987/qwen2.5-7b-cabs-v0.1/5ace8dc6-e348-4267-bb4a-f71a335d074e.json deleted file mode 100644 index 711f9349b..000000000 --- a/data/hfopenllm_v2/gz987/qwen2.5-7b-cabs-v0.1/5ace8dc6-e348-4267-bb4a-f71a335d074e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/gz987_qwen2.5-7b-cabs-v0.1/1770682486.623709", 
- "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "qwen2.5-7b-cabs-v0.1", - "id": "gz987/qwen2.5-7b-cabs-v0.1", - "developer": "gz987", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7506 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5482 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4796 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3138 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4376 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4406 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/gz987/qwen2.5-7b-cabs-v0.2/07549821-db51-4b77-980a-056131b5dd29.json b/data/hfopenllm_v2/gz987/qwen2.5-7b-cabs-v0.2/07549821-db51-4b77-980a-056131b5dd29.json deleted file mode 100644 index 3ae09bf09..000000000 --- a/data/hfopenllm_v2/gz987/qwen2.5-7b-cabs-v0.2/07549821-db51-4b77-980a-056131b5dd29.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/gz987_qwen2.5-7b-cabs-v0.2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": 
"qwen2.5-7b-cabs-v0.2", - "id": "gz987/qwen2.5-7b-cabs-v0.2", - "developer": "gz987", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7418 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5516 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4902 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.307 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4429 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4397 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/gz987/qwen2.5-7b-cabs-v0.3/ff12a0a1-a913-441b-955c-bcbd50056acf.json b/data/hfopenllm_v2/gz987/qwen2.5-7b-cabs-v0.3/ff12a0a1-a913-441b-955c-bcbd50056acf.json deleted file mode 100644 index 9b5948568..000000000 --- a/data/hfopenllm_v2/gz987/qwen2.5-7b-cabs-v0.3/ff12a0a1-a913-441b-955c-bcbd50056acf.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/gz987_qwen2.5-7b-cabs-v0.3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "qwen2.5-7b-cabs-v0.3", - "id": "gz987/qwen2.5-7b-cabs-v0.3", - "developer": "gz987", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - 
"evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.757 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5494 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4932 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.307 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.443 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4402 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/gz987/qwen2.5-7b-cabs-v0.4/947cfc2b-b73c-40eb-9e57-be5278776711.json b/data/hfopenllm_v2/gz987/qwen2.5-7b-cabs-v0.4/947cfc2b-b73c-40eb-9e57-be5278776711.json deleted file mode 100644 index bc502245a..000000000 --- a/data/hfopenllm_v2/gz987/qwen2.5-7b-cabs-v0.4/947cfc2b-b73c-40eb-9e57-be5278776711.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/gz987_qwen2.5-7b-cabs-v0.4/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "qwen2.5-7b-cabs-v0.4", - "id": "gz987/qwen2.5-7b-cabs-v0.4", - "developer": "gz987", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": 
false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7583 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5524 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4849 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3079 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.443 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4396 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/h2oai/h2o-danube-1.8b-chat/53639078-c50a-4147-bab0-16993f1790b6.json b/data/hfopenllm_v2/h2oai/h2o-danube-1.8b-chat/53639078-c50a-4147-bab0-16993f1790b6.json deleted file mode 100644 index 47446d221..000000000 --- a/data/hfopenllm_v2/h2oai/h2o-danube-1.8b-chat/53639078-c50a-4147-bab0-16993f1790b6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/h2oai_h2o-danube-1.8b-chat/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "h2o-danube-1.8b-chat", - "id": "h2oai/h2o-danube-1.8b-chat", - "developer": "h2oai", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 1.831 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2199 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": 
"SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.322 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0136 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2542 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3989 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1314 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/h2oai/h2o-danube3-4b-base/b2cf96e0-382e-4200-a4a4-d66e8a188878.json b/data/hfopenllm_v2/h2oai/h2o-danube3-4b-base/b2cf96e0-382e-4200-a4a4-d66e8a188878.json deleted file mode 100644 index 0acd0b4f9..000000000 --- a/data/hfopenllm_v2/h2oai/h2o-danube3-4b-base/b2cf96e0-382e-4200-a4a4-d66e8a188878.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/h2oai_h2o-danube3-4b-base/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "h2o-danube3-4b-base", - "id": "h2oai/h2o-danube3-4b-base", - "developer": "h2oai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.962 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2338 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3599 - } - }, - { - "evaluation_name": "MATH 
Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0227 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2911 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3778 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2109 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/h2oai/h2o-danube3-4b-chat/d4ed3eb6-f569-4d4b-8da5-50eaaf824128.json b/data/hfopenllm_v2/h2oai/h2o-danube3-4b-chat/d4ed3eb6-f569-4d4b-8da5-50eaaf824128.json deleted file mode 100644 index c17f6d915..000000000 --- a/data/hfopenllm_v2/h2oai/h2o-danube3-4b-chat/d4ed3eb6-f569-4d4b-8da5-50eaaf824128.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/h2oai_h2o-danube3-4b-chat/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "h2o-danube3-4b-chat", - "id": "h2oai/h2o-danube3-4b-chat", - "developer": "h2oai", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.962 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3629 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3466 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0408 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2601 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3781 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2228 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/h2oai/h2o-danube3-500m-chat/210f7063-e0d9-424d-94f4-3645e4e1b401.json b/data/hfopenllm_v2/h2oai/h2o-danube3-500m-chat/210f7063-e0d9-424d-94f4-3645e4e1b401.json deleted file mode 100644 index fd3e0551c..000000000 --- a/data/hfopenllm_v2/h2oai/h2o-danube3-500m-chat/210f7063-e0d9-424d-94f4-3645e4e1b401.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/h2oai_h2o-danube3-500m-chat/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "h2o-danube3-500m-chat", - "id": "h2oai/h2o-danube3-500m-chat", - "developer": "h2oai", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 0.514 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2208 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3035 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0166 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2307 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3434 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1144 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/h2oai/h2o-danube3.1-4b-chat/4ecd26d8-8416-4dba-8d53-96f4013cfef0.json b/data/hfopenllm_v2/h2oai/h2o-danube3.1-4b-chat/4ecd26d8-8416-4dba-8d53-96f4013cfef0.json deleted file mode 100644 index 6f2847856..000000000 --- a/data/hfopenllm_v2/h2oai/h2o-danube3.1-4b-chat/4ecd26d8-8416-4dba-8d53-96f4013cfef0.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/h2oai_h2o-danube3.1-4b-chat/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "h2o-danube3.1-4b-chat", - "id": "h2oai/h2o-danube3.1-4b-chat", - "developer": "h2oai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.962 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5021 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3608 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0332 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2852 - } - }, - { - "evaluation_name": "MUSR", - 
"source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4102 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2719 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/haoranxu/ALMA-13B-R/15712b7d-e69f-4a4f-b13c-4e79ce859399.json b/data/hfopenllm_v2/haoranxu/ALMA-13B-R/15712b7d-e69f-4a4f-b13c-4e79ce859399.json deleted file mode 100644 index b4507647e..000000000 --- a/data/hfopenllm_v2/haoranxu/ALMA-13B-R/15712b7d-e69f-4a4f-b13c-4e79ce859399.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/haoranxu_ALMA-13B-R/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ALMA-13B-R", - "id": "haoranxu/ALMA-13B-R", - "developer": "haoranxu", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "?", - "params_billions": 13.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0039 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3457 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0174 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2576 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3528 - } - 
}, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1817 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/haoranxu/Llama-3-Instruct-8B-CPO-SimPO/9148c375-7c08-4c1c-82ed-5f935b2a4f04.json b/data/hfopenllm_v2/haoranxu/Llama-3-Instruct-8B-CPO-SimPO/9148c375-7c08-4c1c-82ed-5f935b2a4f04.json deleted file mode 100644 index e8754a2a8..000000000 --- a/data/hfopenllm_v2/haoranxu/Llama-3-Instruct-8B-CPO-SimPO/9148c375-7c08-4c1c-82ed-5f935b2a4f04.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/haoranxu_Llama-3-Instruct-8B-CPO-SimPO/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-Instruct-8B-CPO-SimPO", - "id": "haoranxu/Llama-3-Instruct-8B-CPO-SimPO", - "developer": "haoranxu", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7046 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5048 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1027 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2928 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3567 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3686 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/haoranxu/Llama-3-Instruct-8B-SimPO/fb93274b-b7d8-483a-a95d-96340535febc.json b/data/hfopenllm_v2/haoranxu/Llama-3-Instruct-8B-SimPO/fb93274b-b7d8-483a-a95d-96340535febc.json deleted file mode 100644 index e99c8aeb7..000000000 --- a/data/hfopenllm_v2/haoranxu/Llama-3-Instruct-8B-SimPO/fb93274b-b7d8-483a-a95d-96340535febc.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/haoranxu_Llama-3-Instruct-8B-SimPO/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-Instruct-8B-SimPO", - "id": "haoranxu/Llama-3-Instruct-8B-SimPO", - "developer": "haoranxu", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7347 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4979 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0876 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2903 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3566 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3733 - } - } - ] -} \ No newline 
at end of file diff --git a/data/hfopenllm_v2/hatemmahmoud/qwen2.5-1.5b-sft-raft-grpo-hra-doc/0818b755-ec49-457c-8635-73f01816f30b.json b/data/hfopenllm_v2/hatemmahmoud/qwen2.5-1.5b-sft-raft-grpo-hra-doc/0818b755-ec49-457c-8635-73f01816f30b.json deleted file mode 100644 index 86fc861f4..000000000 --- a/data/hfopenllm_v2/hatemmahmoud/qwen2.5-1.5b-sft-raft-grpo-hra-doc/0818b755-ec49-457c-8635-73f01816f30b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/hatemmahmoud_qwen2.5-1.5b-sft-raft-grpo-hra-doc/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "qwen2.5-1.5b-sft-raft-grpo-hra-doc", - "id": "hatemmahmoud/qwen2.5-1.5b-sft-raft-grpo-hra-doc", - "developer": "hatemmahmoud", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.544 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4196 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.427 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2175 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2676 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.361 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2776 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/hon9kon9ize/CantoneseLLMChat-v0.5/77962326-0160-49bd-9ef1-59b403b2bfce.json 
b/data/hfopenllm_v2/hon9kon9ize/CantoneseLLMChat-v0.5/77962326-0160-49bd-9ef1-59b403b2bfce.json deleted file mode 100644 index 4312be247..000000000 --- a/data/hfopenllm_v2/hon9kon9ize/CantoneseLLMChat-v0.5/77962326-0160-49bd-9ef1-59b403b2bfce.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/hon9kon9ize_CantoneseLLMChat-v0.5/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "CantoneseLLMChat-v0.5", - "id": "hon9kon9ize/CantoneseLLMChat-v0.5", - "developer": "hon9kon9ize", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 6.069 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3231 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4345 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0415 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2777 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4706 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2504 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/hon9kon9ize/CantoneseLLMChat-v1.0-7B/272abbe5-8b61-442f-9860-d7411e7fec99.json b/data/hfopenllm_v2/hon9kon9ize/CantoneseLLMChat-v1.0-7B/272abbe5-8b61-442f-9860-d7411e7fec99.json deleted file mode 100644 index 265d4e3f0..000000000 --- 
a/data/hfopenllm_v2/hon9kon9ize/CantoneseLLMChat-v1.0-7B/272abbe5-8b61-442f-9860-d7411e7fec99.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/hon9kon9ize_CantoneseLLMChat-v1.0-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "CantoneseLLMChat-v1.0-7B", - "id": "hon9kon9ize/CantoneseLLMChat-v1.0-7B", - "developer": "hon9kon9ize", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4455 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4866 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2107 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3221 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3883 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3785 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/hongbai12/li-0.4-pre/14d617a8-18c6-40a7-a4ba-19cf5fc5f4e3.json b/data/hfopenllm_v2/hongbai12/li-0.4-pre/14d617a8-18c6-40a7-a4ba-19cf5fc5f4e3.json deleted file mode 100644 index c385d6d79..000000000 --- a/data/hfopenllm_v2/hongbai12/li-0.4-pre/14d617a8-18c6-40a7-a4ba-19cf5fc5f4e3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/hongbai12_li-0.4-pre/1770682486.623709", - 
"retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "li-0.4-pre", - "id": "hongbai12/li-0.4-pre", - "developer": "hongbai12", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.52 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6298 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4924 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.323 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4513 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5015 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/hotmailuser/Deepseek-qwen-modelstock-2B/ef7b5e6d-b5b7-4c7b-9781-6f90eb1ff5dd.json b/data/hfopenllm_v2/hotmailuser/Deepseek-qwen-modelstock-2B/ef7b5e6d-b5b7-4c7b-9781-6f90eb1ff5dd.json deleted file mode 100644 index 2ecbdfd24..000000000 --- a/data/hfopenllm_v2/hotmailuser/Deepseek-qwen-modelstock-2B/ef7b5e6d-b5b7-4c7b-9781-6f90eb1ff5dd.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/hotmailuser_Deepseek-qwen-modelstock-2B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, 
- "model_info": { - "name": "Deepseek-qwen-modelstock-2B", - "id": "hotmailuser/Deepseek-qwen-modelstock-2B", - "developer": "hotmailuser", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.777 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2149 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3549 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3399 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2802 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3475 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1911 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/hotmailuser/Falcon3Slerp1-10B/1970e257-7c93-4342-9ff4-a96af21acc67.json b/data/hfopenllm_v2/hotmailuser/Falcon3Slerp1-10B/1970e257-7c93-4342-9ff4-a96af21acc67.json deleted file mode 100644 index 5c04c6bd5..000000000 --- a/data/hfopenllm_v2/hotmailuser/Falcon3Slerp1-10B/1970e257-7c93-4342-9ff4-a96af21acc67.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/hotmailuser_Falcon3Slerp1-10B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Falcon3Slerp1-10B", - "id": "hotmailuser/Falcon3Slerp1-10B", - "developer": "hotmailuser", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": 
"LlamaForCausalLM", - "params_billions": 10.306 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5694 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.617 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2598 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.344 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4318 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4402 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/hotmailuser/Falcon3Slerp2-10B/15d71696-4b21-41ff-a4c6-0aea92fb844a.json b/data/hfopenllm_v2/hotmailuser/Falcon3Slerp2-10B/15d71696-4b21-41ff-a4c6-0aea92fb844a.json deleted file mode 100644 index 7fc89171e..000000000 --- a/data/hfopenllm_v2/hotmailuser/Falcon3Slerp2-10B/15d71696-4b21-41ff-a4c6-0aea92fb844a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/hotmailuser_Falcon3Slerp2-10B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Falcon3Slerp2-10B", - "id": "hotmailuser/Falcon3Slerp2-10B", - "developer": "hotmailuser", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 10.306 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": 
{ - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6118 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6164 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2319 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3381 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4096 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4369 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/hotmailuser/Falcon3Slerp4-10B/ccb85394-5252-48d4-8980-8b3a6c67ab1a.json b/data/hfopenllm_v2/hotmailuser/Falcon3Slerp4-10B/ccb85394-5252-48d4-8980-8b3a6c67ab1a.json deleted file mode 100644 index ad19a7974..000000000 --- a/data/hfopenllm_v2/hotmailuser/Falcon3Slerp4-10B/ccb85394-5252-48d4-8980-8b3a6c67ab1a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/hotmailuser_Falcon3Slerp4-10B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Falcon3Slerp4-10B", - "id": "hotmailuser/Falcon3Slerp4-10B", - "developer": "hotmailuser", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 10.306 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6072 - } - }, - { - "evaluation_name": "BBH", - 
"source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6114 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2289 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3289 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4017 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4387 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/hotmailuser/FalconSlerp-3B/ea9837ff-f4c7-4bb0-b2af-7ae26371baf0.json b/data/hfopenllm_v2/hotmailuser/FalconSlerp-3B/ea9837ff-f4c7-4bb0-b2af-7ae26371baf0.json deleted file mode 100644 index e5f0a213e..000000000 --- a/data/hfopenllm_v2/hotmailuser/FalconSlerp-3B/ea9837ff-f4c7-4bb0-b2af-7ae26371baf0.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/hotmailuser_FalconSlerp-3B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "FalconSlerp-3B", - "id": "hotmailuser/FalconSlerp-3B", - "developer": "hotmailuser", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.228 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5695 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.4624 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.176 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2878 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3989 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2968 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/hotmailuser/FalconSlerp1-7B/fe9012a7-d07f-48d4-b460-eca256078d8b.json b/data/hfopenllm_v2/hotmailuser/FalconSlerp1-7B/fe9012a7-d07f-48d4-b460-eca256078d8b.json deleted file mode 100644 index 500ef8052..000000000 --- a/data/hfopenllm_v2/hotmailuser/FalconSlerp1-7B/fe9012a7-d07f-48d4-b460-eca256078d8b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/hotmailuser_FalconSlerp1-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "FalconSlerp1-7B", - "id": "hotmailuser/FalconSlerp1-7B", - "developer": "hotmailuser", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 7.456 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5395 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5355 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - 
"metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2379 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3196 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4452 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4129 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/hotmailuser/FalconSlerp2-7B/8e8d2071-8e7d-4dad-8536-4698b2d00316.json b/data/hfopenllm_v2/hotmailuser/FalconSlerp2-7B/8e8d2071-8e7d-4dad-8536-4698b2d00316.json deleted file mode 100644 index 8e8a72c09..000000000 --- a/data/hfopenllm_v2/hotmailuser/FalconSlerp2-7B/8e8d2071-8e7d-4dad-8536-4698b2d00316.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/hotmailuser_FalconSlerp2-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "FalconSlerp2-7B", - "id": "hotmailuser/FalconSlerp2-7B", - "developer": "hotmailuser", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 7.456 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.616 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5538 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2983 - } - }, - { - "evaluation_name": "GPQA", 
- "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3196 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4479 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4141 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/hotmailuser/FalconSlerp3-10B/dbcb41be-9ed6-4244-ada8-77f363c3487e.json b/data/hfopenllm_v2/hotmailuser/FalconSlerp3-10B/dbcb41be-9ed6-4244-ada8-77f363c3487e.json deleted file mode 100644 index 3031ed1a6..000000000 --- a/data/hfopenllm_v2/hotmailuser/FalconSlerp3-10B/dbcb41be-9ed6-4244-ada8-77f363c3487e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/hotmailuser_FalconSlerp3-10B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "FalconSlerp3-10B", - "id": "hotmailuser/FalconSlerp3-10B", - "developer": "hotmailuser", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 10.306 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6002 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.606 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2273 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 
0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3356 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4031 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4323 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/hotmailuser/FalconSlerp3-7B/e48e2d7e-6c14-4bb1-bd12-74d93a145ca3.json b/data/hfopenllm_v2/hotmailuser/FalconSlerp3-7B/e48e2d7e-6c14-4bb1-bd12-74d93a145ca3.json deleted file mode 100644 index 5165cca5b..000000000 --- a/data/hfopenllm_v2/hotmailuser/FalconSlerp3-7B/e48e2d7e-6c14-4bb1-bd12-74d93a145ca3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/hotmailuser_FalconSlerp3-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "FalconSlerp3-7B", - "id": "hotmailuser/FalconSlerp3-7B", - "developer": "hotmailuser", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 7.456 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6096 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5533 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3157 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3188 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4507 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4127 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/hotmailuser/FalconSlerp4-7B/30c2d908-3eaf-408a-a2b5-301e0cd9e052.json b/data/hfopenllm_v2/hotmailuser/FalconSlerp4-7B/30c2d908-3eaf-408a-a2b5-301e0cd9e052.json deleted file mode 100644 index 7cb786f59..000000000 --- a/data/hfopenllm_v2/hotmailuser/FalconSlerp4-7B/30c2d908-3eaf-408a-a2b5-301e0cd9e052.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/hotmailuser_FalconSlerp4-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "FalconSlerp4-7B", - "id": "hotmailuser/FalconSlerp4-7B", - "developer": "hotmailuser", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 7.456 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6285 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5524 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2213 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3322 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4585 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - 
"dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4032 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/hotmailuser/FalconSlerp6-7B/f7624d04-66d1-4c05-8c01-d015ecf8412c.json b/data/hfopenllm_v2/hotmailuser/FalconSlerp6-7B/f7624d04-66d1-4c05-8c01-d015ecf8412c.json deleted file mode 100644 index df78b9118..000000000 --- a/data/hfopenllm_v2/hotmailuser/FalconSlerp6-7B/f7624d04-66d1-4c05-8c01-d015ecf8412c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/hotmailuser_FalconSlerp6-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "FalconSlerp6-7B", - "id": "hotmailuser/FalconSlerp6-7B", - "developer": "hotmailuser", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 7.456 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6027 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5384 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2047 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.318 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4492 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.3995 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/hotmailuser/Gemma2Crono-27B/511e4aad-1e5a-4515-9433-46989fc3945b.json b/data/hfopenllm_v2/hotmailuser/Gemma2Crono-27B/511e4aad-1e5a-4515-9433-46989fc3945b.json deleted file mode 100644 index fbd307ade..000000000 --- a/data/hfopenllm_v2/hotmailuser/Gemma2Crono-27B/511e4aad-1e5a-4515-9433-46989fc3945b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/hotmailuser_Gemma2Crono-27B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gemma2Crono-27B", - "id": "hotmailuser/Gemma2Crono-27B", - "developer": "hotmailuser", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 27.227 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7086 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6505 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2424 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3708 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4567 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4633 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/hotmailuser/Gemma2SimPO-27B/863e71ec-03a4-47ed-8bc9-b064d5571162.json 
b/data/hfopenllm_v2/hotmailuser/Gemma2SimPO-27B/863e71ec-03a4-47ed-8bc9-b064d5571162.json deleted file mode 100644 index 710203d42..000000000 --- a/data/hfopenllm_v2/hotmailuser/Gemma2SimPO-27B/863e71ec-03a4-47ed-8bc9-b064d5571162.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/hotmailuser_Gemma2SimPO-27B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gemma2SimPO-27B", - "id": "hotmailuser/Gemma2SimPO-27B", - "developer": "hotmailuser", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 27.227 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7222 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6413 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2817 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3582 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4447 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4642 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/hotmailuser/Gemma2atlas-27B/6a6dfcb4-192b-44ff-a34f-76b31bbf5ad3.json b/data/hfopenllm_v2/hotmailuser/Gemma2atlas-27B/6a6dfcb4-192b-44ff-a34f-76b31bbf5ad3.json deleted file mode 100644 index b057bd05c..000000000 --- a/data/hfopenllm_v2/hotmailuser/Gemma2atlas-27B/6a6dfcb4-192b-44ff-a34f-76b31bbf5ad3.json +++ /dev/null @@ 
-1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/hotmailuser_Gemma2atlas-27B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gemma2atlas-27B", - "id": "hotmailuser/Gemma2atlas-27B", - "developer": "hotmailuser", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 27.227 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7214 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6545 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2145 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3557 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4445 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.475 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/hotmailuser/Gemma2magnum-27b/e0dbec0b-a154-448a-be23-ef9b764469ea.json b/data/hfopenllm_v2/hotmailuser/Gemma2magnum-27b/e0dbec0b-a154-448a-be23-ef9b764469ea.json deleted file mode 100644 index 29f7bb0c0..000000000 --- a/data/hfopenllm_v2/hotmailuser/Gemma2magnum-27b/e0dbec0b-a154-448a-be23-ef9b764469ea.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/hotmailuser_Gemma2magnum-27b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": 
"documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gemma2magnum-27b", - "id": "hotmailuser/Gemma2magnum-27b", - "developer": "hotmailuser", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 27.227 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5051 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.62 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2205 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3851 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4723 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4596 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/hotmailuser/Llama-Hermes-slerp-8B/ecd91300-b0cf-48ce-9e5c-253a7991f90e.json b/data/hfopenllm_v2/hotmailuser/Llama-Hermes-slerp-8B/ecd91300-b0cf-48ce-9e5c-253a7991f90e.json deleted file mode 100644 index 2af872ca1..000000000 --- a/data/hfopenllm_v2/hotmailuser/Llama-Hermes-slerp-8B/ecd91300-b0cf-48ce-9e5c-253a7991f90e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/hotmailuser_Llama-Hermes-slerp-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-Hermes-slerp-8B", - "id": "hotmailuser/Llama-Hermes-slerp-8B", - "developer": 
"hotmailuser", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.339 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.531 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0801 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2936 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4078 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3331 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/hotmailuser/Llama-Hermes-slerp2-8B/e3df71f1-63e1-40f1-918d-07cb3ec939cf.json b/data/hfopenllm_v2/hotmailuser/Llama-Hermes-slerp2-8B/e3df71f1-63e1-40f1-918d-07cb3ec939cf.json deleted file mode 100644 index c653551a2..000000000 --- a/data/hfopenllm_v2/hotmailuser/Llama-Hermes-slerp2-8B/e3df71f1-63e1-40f1-918d-07cb3ec939cf.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/hotmailuser_Llama-Hermes-slerp2-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-Hermes-slerp2-8B", - "id": "hotmailuser/Llama-Hermes-slerp2-8B", - "developer": "hotmailuser", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - 
"evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3728 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5265 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0974 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2961 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4248 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3379 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/hotmailuser/LlamaStock-8B/52066a23-9847-490e-90e3-57eee3c63276.json b/data/hfopenllm_v2/hotmailuser/LlamaStock-8B/52066a23-9847-490e-90e3-57eee3c63276.json deleted file mode 100644 index 8711f86d4..000000000 --- a/data/hfopenllm_v2/hotmailuser/LlamaStock-8B/52066a23-9847-490e-90e3-57eee3c63276.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/hotmailuser_LlamaStock-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "LlamaStock-8B", - "id": "hotmailuser/LlamaStock-8B", - "developer": "hotmailuser", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.425 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5329 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1699 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3272 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4129 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3807 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/hotmailuser/Mistral-modelstock-24B/91f15ba3-a062-4b01-8a61-6e51fdf5f8d4.json b/data/hfopenllm_v2/hotmailuser/Mistral-modelstock-24B/91f15ba3-a062-4b01-8a61-6e51fdf5f8d4.json deleted file mode 100644 index 656587aa6..000000000 --- a/data/hfopenllm_v2/hotmailuser/Mistral-modelstock-24B/91f15ba3-a062-4b01-8a61-6e51fdf5f8d4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/hotmailuser_Mistral-modelstock-24B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral-modelstock-24B", - "id": "hotmailuser/Mistral-modelstock-24B", - "developer": "hotmailuser", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 23.572 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3424 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": 
"SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6452 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1307 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4102 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.459 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.507 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/hotmailuser/Mistral-modelstock2-24B/323630ee-fbe0-49a7-aa11-816fde38ba2d.json b/data/hfopenllm_v2/hotmailuser/Mistral-modelstock2-24B/323630ee-fbe0-49a7-aa11-816fde38ba2d.json deleted file mode 100644 index 2a8edbdfe..000000000 --- a/data/hfopenllm_v2/hotmailuser/Mistral-modelstock2-24B/323630ee-fbe0-49a7-aa11-816fde38ba2d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/hotmailuser_Mistral-modelstock2-24B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral-modelstock2-24B", - "id": "hotmailuser/Mistral-modelstock2-24B", - "developer": "hotmailuser", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 23.572 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4318 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.6689 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2402 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3926 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4616 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5318 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/hotmailuser/Phi4-Slerp4-14B/e5c8f97d-1873-4c9d-8bed-50dc592543db.json b/data/hfopenllm_v2/hotmailuser/Phi4-Slerp4-14B/e5c8f97d-1873-4c9d-8bed-50dc592543db.json deleted file mode 100644 index 4377cee38..000000000 --- a/data/hfopenllm_v2/hotmailuser/Phi4-Slerp4-14B/e5c8f97d-1873-4c9d-8bed-50dc592543db.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/hotmailuser_Phi4-Slerp4-14B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Phi4-Slerp4-14B", - "id": "hotmailuser/Phi4-Slerp4-14B", - "developer": "hotmailuser", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0629 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6731 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - 
"evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3474 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3968 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5097 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5278 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/hotmailuser/Qwen2.5-HomerSlerp-7B/7ee2803c-b8f8-4156-8472-bab4baab8863.json b/data/hfopenllm_v2/hotmailuser/Qwen2.5-HomerSlerp-7B/7ee2803c-b8f8-4156-8472-bab4baab8863.json deleted file mode 100644 index 1e3e4c4b1..000000000 --- a/data/hfopenllm_v2/hotmailuser/Qwen2.5-HomerSlerp-7B/7ee2803c-b8f8-4156-8472-bab4baab8863.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/hotmailuser_Qwen2.5-HomerSlerp-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-HomerSlerp-7B", - "id": "hotmailuser/Qwen2.5-HomerSlerp-7B", - "developer": "hotmailuser", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4488 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5633 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3316 - } - }, - { - 
"evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3138 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4383 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4549 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/hotmailuser/QwenModelStock-1.8B/78573f63-3073-4be4-93a7-0ea00b1383fd.json b/data/hfopenllm_v2/hotmailuser/QwenModelStock-1.8B/78573f63-3073-4be4-93a7-0ea00b1383fd.json deleted file mode 100644 index 818c900fe..000000000 --- a/data/hfopenllm_v2/hotmailuser/QwenModelStock-1.8B/78573f63-3073-4be4-93a7-0ea00b1383fd.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/hotmailuser_QwenModelStock-1.8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "QwenModelStock-1.8B", - "id": "hotmailuser/QwenModelStock-1.8B", - "developer": "hotmailuser", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.777 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3263 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4188 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0989 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, 
- "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2869 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4359 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2959 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/hotmailuser/QwenSlerp-14B/42da7295-d78d-49a4-9279-8406063240c4.json b/data/hfopenllm_v2/hotmailuser/QwenSlerp-14B/42da7295-d78d-49a4-9279-8406063240c4.json deleted file mode 100644 index 448439df9..000000000 --- a/data/hfopenllm_v2/hotmailuser/QwenSlerp-14B/42da7295-d78d-49a4-9279-8406063240c4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/hotmailuser_QwenSlerp-14B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "QwenSlerp-14B", - "id": "hotmailuser/QwenSlerp-14B", - "developer": "hotmailuser", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7025 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6491 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3837 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3876 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4634 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.54 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/hotmailuser/QwenSlerp-3B/b61c5735-53ca-4dda-a223-79921eee7f3e.json b/data/hfopenllm_v2/hotmailuser/QwenSlerp-3B/b61c5735-53ca-4dda-a223-79921eee7f3e.json deleted file mode 100644 index ab9d3d420..000000000 --- a/data/hfopenllm_v2/hotmailuser/QwenSlerp-3B/b61c5735-53ca-4dda-a223-79921eee7f3e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/hotmailuser_QwenSlerp-3B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "QwenSlerp-3B", - "id": "hotmailuser/QwenSlerp-3B", - "developer": "hotmailuser", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.397 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4334 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4892 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2749 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2945 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4317 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - 
"dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3693 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/hotmailuser/QwenSlerp-7B/310124ef-e33f-49de-83eb-e665a5143aaa.json b/data/hfopenllm_v2/hotmailuser/QwenSlerp-7B/310124ef-e33f-49de-83eb-e665a5143aaa.json deleted file mode 100644 index 14cc9dac6..000000000 --- a/data/hfopenllm_v2/hotmailuser/QwenSlerp-7B/310124ef-e33f-49de-83eb-e665a5143aaa.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/hotmailuser_QwenSlerp-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "QwenSlerp-7B", - "id": "hotmailuser/QwenSlerp-7B", - "developer": "hotmailuser", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4673 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5636 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3444 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.318 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4409 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.4509 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/hotmailuser/QwenSlerp2-14B/c9b056df-8bbe-4959-ab44-85813157c95c.json b/data/hfopenllm_v2/hotmailuser/QwenSlerp2-14B/c9b056df-8bbe-4959-ab44-85813157c95c.json deleted file mode 100644 index 746637a2e..000000000 --- a/data/hfopenllm_v2/hotmailuser/QwenSlerp2-14B/c9b056df-8bbe-4959-ab44-85813157c95c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/hotmailuser_QwenSlerp2-14B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "QwenSlerp2-14B", - "id": "hotmailuser/QwenSlerp2-14B", - "developer": "hotmailuser", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7037 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6493 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3965 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3809 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4807 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5379 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/hotmailuser/QwenSlerp2-3B/7a60385f-48dd-4926-8b66-3d42a1631db3.json 
b/data/hfopenllm_v2/hotmailuser/QwenSlerp2-3B/7a60385f-48dd-4926-8b66-3d42a1631db3.json deleted file mode 100644 index 4a1951fcf..000000000 --- a/data/hfopenllm_v2/hotmailuser/QwenSlerp2-3B/7a60385f-48dd-4926-8b66-3d42a1631db3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/hotmailuser_QwenSlerp2-3B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "QwenSlerp2-3B", - "id": "hotmailuser/QwenSlerp2-3B", - "developer": "hotmailuser", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.397 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.428 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4802 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2606 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.297 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4252 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3742 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/hotmailuser/QwenSlerp3-14B/da365c7b-74d0-4a9f-a8fd-cf4049ec4de6.json b/data/hfopenllm_v2/hotmailuser/QwenSlerp3-14B/da365c7b-74d0-4a9f-a8fd-cf4049ec4de6.json deleted file mode 100644 index 812599df3..000000000 --- a/data/hfopenllm_v2/hotmailuser/QwenSlerp3-14B/da365c7b-74d0-4a9f-a8fd-cf4049ec4de6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - 
"schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/hotmailuser_QwenSlerp3-14B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "QwenSlerp3-14B", - "id": "hotmailuser/QwenSlerp3-14B", - "developer": "hotmailuser", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6632 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6267 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4305 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3666 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4808 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5263 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/hotmailuser/QwenSparse-7B/e2930715-b616-49a4-83bc-53e92fc3580f.json b/data/hfopenllm_v2/hotmailuser/QwenSparse-7B/e2930715-b616-49a4-83bc-53e92fc3580f.json deleted file mode 100644 index 8f33a3dbf..000000000 --- a/data/hfopenllm_v2/hotmailuser/QwenSparse-7B/e2930715-b616-49a4-83bc-53e92fc3580f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/hotmailuser_QwenSparse-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - 
"source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "QwenSparse-7B", - "id": "hotmailuser/QwenSparse-7B", - "developer": "hotmailuser", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1086 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2896 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0106 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2601 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3562 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1122 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/hotmailuser/QwenStock-0.5B/543f45e0-a158-4fdb-bbb1-8deb38f4515b.json b/data/hfopenllm_v2/hotmailuser/QwenStock-0.5B/543f45e0-a158-4fdb-bbb1-8deb38f4515b.json deleted file mode 100644 index be59d9fa4..000000000 --- a/data/hfopenllm_v2/hotmailuser/QwenStock-0.5B/543f45e0-a158-4fdb-bbb1-8deb38f4515b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/hotmailuser_QwenStock-0.5B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "QwenStock-0.5B", - "id": "hotmailuser/QwenStock-0.5B", - "developer": "hotmailuser", - "inference_platform": "unknown", - "additional_details": { - 
"precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2049 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2912 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2601 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3575 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1167 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/hotmailuser/QwenStock-1.7B/b96a20e0-d044-4a66-8909-437aeaef569c.json b/data/hfopenllm_v2/hotmailuser/QwenStock-1.7B/b96a20e0-d044-4a66-8909-437aeaef569c.json deleted file mode 100644 index efbdd247d..000000000 --- a/data/hfopenllm_v2/hotmailuser/QwenStock-1.7B/b96a20e0-d044-4a66-8909-437aeaef569c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/hotmailuser_QwenStock-1.7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "QwenStock-1.7B", - "id": "hotmailuser/QwenStock-1.7B", - "developer": "hotmailuser", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.777 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - 
}, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3214 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4188 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0997 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2878 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4412 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2955 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/hotmailuser/QwenStock1-14B/408742ff-4b21-46dc-b4d6-4c78d652d228.json b/data/hfopenllm_v2/hotmailuser/QwenStock1-14B/408742ff-4b21-46dc-b4d6-4c78d652d228.json deleted file mode 100644 index 2d5a4767d..000000000 --- a/data/hfopenllm_v2/hotmailuser/QwenStock1-14B/408742ff-4b21-46dc-b4d6-4c78d652d228.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/hotmailuser_QwenStock1-14B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "QwenStock1-14B", - "id": "hotmailuser/QwenStock1-14B", - "developer": "hotmailuser", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6693 - } - }, - { - "evaluation_name": "BBH", - 
"source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6502 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3701 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3859 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4781 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5416 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/hotmailuser/RombosBeagle-v2beta-MGS-32B/496a9fbe-376c-4546-bd90-b42f583924ce.json b/data/hfopenllm_v2/hotmailuser/RombosBeagle-v2beta-MGS-32B/496a9fbe-376c-4546-bd90-b42f583924ce.json deleted file mode 100644 index 3e888318d..000000000 --- a/data/hfopenllm_v2/hotmailuser/RombosBeagle-v2beta-MGS-32B/496a9fbe-376c-4546-bd90-b42f583924ce.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/hotmailuser_RombosBeagle-v2beta-MGS-32B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "RombosBeagle-v2beta-MGS-32B", - "id": "hotmailuser/RombosBeagle-v2beta-MGS-32B", - "developer": "hotmailuser", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 32.764 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5157 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7037 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4992 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.38 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5021 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5908 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/huggyllama/llama-13b/f32c07b4-21a8-4cd2-91f8-f0f26d0b1b38.json b/data/hfopenllm_v2/huggyllama/llama-13b/f32c07b4-21a8-4cd2-91f8-f0f26d0b1b38.json deleted file mode 100644 index a7410ce47..000000000 --- a/data/hfopenllm_v2/huggyllama/llama-13b/f32c07b4-21a8-4cd2-91f8-f0f26d0b1b38.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/huggyllama_llama-13b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "llama-13b", - "id": "huggyllama/llama-13b", - "developer": "huggyllama", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 13.016 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2411 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3988 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": 
"DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0204 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.255 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3462 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1952 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/huggyllama/llama-65b/cc36cc37-0f41-42aa-8051-54cc135820ef.json b/data/hfopenllm_v2/huggyllama/llama-65b/cc36cc37-0f41-42aa-8051-54cc135820ef.json deleted file mode 100644 index 89a0f62be..000000000 --- a/data/hfopenllm_v2/huggyllama/llama-65b/cc36cc37-0f41-42aa-8051-54cc135820ef.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/huggyllama_llama-65b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "llama-65b", - "id": "huggyllama/llama-65b", - "developer": "huggyllama", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 65.286 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2526 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4703 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.031 - } - }, - { - "evaluation_name": 
"GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.276 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3595 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3078 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/huggyllama/llama-7b/20d3dac4-9f8c-431c-b20f-364dd860e37f.json b/data/hfopenllm_v2/huggyllama/llama-7b/20d3dac4-9f8c-431c-b20f-364dd860e37f.json deleted file mode 100644 index 18ff58949..000000000 --- a/data/hfopenllm_v2/huggyllama/llama-7b/20d3dac4-9f8c-431c-b20f-364dd860e37f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/huggyllama_llama-7b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "llama-7b", - "id": "huggyllama/llama-7b", - "developer": "huggyllama", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 6.738 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2501 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3277 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0083 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": 
{ - "score": 0.2525 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3354 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1313 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/huihui-ai/DeepSeek-R1-Distill-Qwen-14B-abliterated-v2/89022ea8-2a5b-4eba-8d7a-320ba13d30a4.json b/data/hfopenllm_v2/huihui-ai/DeepSeek-R1-Distill-Qwen-14B-abliterated-v2/89022ea8-2a5b-4eba-8d7a-320ba13d30a4.json deleted file mode 100644 index 3494d0b6e..000000000 --- a/data/hfopenllm_v2/huihui-ai/DeepSeek-R1-Distill-Qwen-14B-abliterated-v2/89022ea8-2a5b-4eba-8d7a-320ba13d30a4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/huihui-ai_DeepSeek-R1-Distill-Qwen-14B-abliterated-v2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "DeepSeek-R1-Distill-Qwen-14B-abliterated-v2", - "id": "huihui-ai/DeepSeek-R1-Distill-Qwen-14B-abliterated-v2", - "developer": "huihui-ai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4211 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3487 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2205 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.276 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": 
"MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4701 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1915 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/huihui-ai/QwQ-32B-Coder-Fusion-7030/97bfd152-79c6-4c96-8d3e-588275339e41.json b/data/hfopenllm_v2/huihui-ai/QwQ-32B-Coder-Fusion-7030/97bfd152-79c6-4c96-8d3e-588275339e41.json deleted file mode 100644 index 20f0742c2..000000000 --- a/data/hfopenllm_v2/huihui-ai/QwQ-32B-Coder-Fusion-7030/97bfd152-79c6-4c96-8d3e-588275339e41.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/huihui-ai_QwQ-32B-Coder-Fusion-7030/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "QwQ-32B-Coder-Fusion-7030", - "id": "huihui-ai/QwQ-32B-Coder-Fusion-7030", - "developer": "huihui-ai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 32.764 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3865 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6178 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2795 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2844 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3922 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4368 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/huihui-ai/QwQ-32B-Coder-Fusion-8020/93061947-2bcf-482e-ab22-38ef8ee33bcf.json b/data/hfopenllm_v2/huihui-ai/QwQ-32B-Coder-Fusion-8020/93061947-2bcf-482e-ab22-38ef8ee33bcf.json deleted file mode 100644 index 10b206b42..000000000 --- a/data/hfopenllm_v2/huihui-ai/QwQ-32B-Coder-Fusion-8020/93061947-2bcf-482e-ab22-38ef8ee33bcf.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/huihui-ai_QwQ-32B-Coder-Fusion-8020/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "QwQ-32B-Coder-Fusion-8020", - "id": "huihui-ai/QwQ-32B-Coder-Fusion-8020", - "developer": "huihui-ai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 32.764 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6021 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6665 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4592 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3549 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4293 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - 
"hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5367 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/huihui-ai/QwQ-32B-Coder-Fusion-9010/8f65748b-1251-49f8-bfed-d1e4a937d5ba.json b/data/hfopenllm_v2/huihui-ai/QwQ-32B-Coder-Fusion-9010/8f65748b-1251-49f8-bfed-d1e4a937d5ba.json deleted file mode 100644 index 9d7bf036a..000000000 --- a/data/hfopenllm_v2/huihui-ai/QwQ-32B-Coder-Fusion-9010/8f65748b-1251-49f8-bfed-d1e4a937d5ba.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/huihui-ai_QwQ-32B-Coder-Fusion-9010/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "QwQ-32B-Coder-Fusion-9010", - "id": "huihui-ai/QwQ-32B-Coder-Fusion-9010", - "developer": "huihui-ai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 32.764 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5778 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6727 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5317 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3616 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4682 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - 
}, - "score_details": { - "score": 0.56 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/huihui-ai/Qwen2.5-14B-Instruct-abliterated-v2/4f278881-69d3-42b5-b72c-ff8627a6ef44.json b/data/hfopenllm_v2/huihui-ai/Qwen2.5-14B-Instruct-abliterated-v2/4f278881-69d3-42b5-b72c-ff8627a6ef44.json deleted file mode 100644 index ca8850291..000000000 --- a/data/hfopenllm_v2/huihui-ai/Qwen2.5-14B-Instruct-abliterated-v2/4f278881-69d3-42b5-b72c-ff8627a6ef44.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/huihui-ai_Qwen2.5-14B-Instruct-abliterated-v2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-14B-Instruct-abliterated-v2", - "id": "huihui-ai/Qwen2.5-14B-Instruct-abliterated-v2", - "developer": "huihui-ai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8328 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6324 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5302 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3339 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.422 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4962 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/huihui-ai/Qwen2.5-72B-Instruct-abliterated/d88e85c5-73df-46cc-9234-f0556592ad5a.json b/data/hfopenllm_v2/huihui-ai/Qwen2.5-72B-Instruct-abliterated/d88e85c5-73df-46cc-9234-f0556592ad5a.json deleted file mode 100644 index 716aaba17..000000000 --- a/data/hfopenllm_v2/huihui-ai/Qwen2.5-72B-Instruct-abliterated/d88e85c5-73df-46cc-9234-f0556592ad5a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/huihui-ai_Qwen2.5-72B-Instruct-abliterated/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-72B-Instruct-abliterated", - "id": "huihui-ai/Qwen2.5-72B-Instruct-abliterated", - "developer": "huihui-ai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 72.706 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8593 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.719 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6012 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3951 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4233 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5537 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/huihui-ai/Qwen2.5-7B-Instruct-abliterated-v2/44d2a20d-e867-4fa5-af3d-087f9c1b4067.json 
b/data/hfopenllm_v2/huihui-ai/Qwen2.5-7B-Instruct-abliterated-v2/44d2a20d-e867-4fa5-af3d-087f9c1b4067.json deleted file mode 100644 index da59c59b9..000000000 --- a/data/hfopenllm_v2/huihui-ai/Qwen2.5-7B-Instruct-abliterated-v2/44d2a20d-e867-4fa5-af3d-087f9c1b4067.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/huihui-ai_Qwen2.5-7B-Instruct-abliterated-v2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-7B-Instruct-abliterated-v2", - "id": "huihui-ai/Qwen2.5-7B-Instruct-abliterated-v2", - "developer": "huihui-ai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7606 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5377 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4637 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3087 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3981 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4208 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/huihui-ai/Qwen2.5-7B-Instruct-abliterated/e83b3e7e-dc34-4b06-bcfe-95b3ba28aab4.json b/data/hfopenllm_v2/huihui-ai/Qwen2.5-7B-Instruct-abliterated/e83b3e7e-dc34-4b06-bcfe-95b3ba28aab4.json deleted file mode 100644 index 092cf89ac..000000000 --- 
a/data/hfopenllm_v2/huihui-ai/Qwen2.5-7B-Instruct-abliterated/e83b3e7e-dc34-4b06-bcfe-95b3ba28aab4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/huihui-ai_Qwen2.5-7B-Instruct-abliterated/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-7B-Instruct-abliterated", - "id": "huihui-ai/Qwen2.5-7B-Instruct-abliterated", - "developer": "huihui-ai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7546 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5262 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4577 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3154 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3967 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.418 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/huu-ontocord/wide_3b_orpo_stage1.1-ss1-orpo3/44f2948c-4564-44cc-98d8-4f82a30e1f09.json b/data/hfopenllm_v2/huu-ontocord/wide_3b_orpo_stage1.1-ss1-orpo3/44f2948c-4564-44cc-98d8-4f82a30e1f09.json deleted file mode 100644 index a707ad657..000000000 --- a/data/hfopenllm_v2/huu-ontocord/wide_3b_orpo_stage1.1-ss1-orpo3/44f2948c-4564-44cc-98d8-4f82a30e1f09.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", 
- "evaluation_id": "hfopenllm_v2/huu-ontocord_wide_3b_orpo_stage1.1-ss1-orpo3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "wide_3b_orpo_stage1.1-ss1-orpo3", - "id": "huu-ontocord/wide_3b_orpo_stage1.1-ss1-orpo3", - "developer": "huu-ontocord", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.759 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1505 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2937 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0098 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2584 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3618 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1164 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/iFaz/llama31_8B_en_emo_v4/846cf1ff-62c3-44e7-b6dd-0135ec77451a.json b/data/hfopenllm_v2/iFaz/llama31_8B_en_emo_v4/846cf1ff-62c3-44e7-b6dd-0135ec77451a.json deleted file mode 100644 index 099d88b45..000000000 --- a/data/hfopenllm_v2/iFaz/llama31_8B_en_emo_v4/846cf1ff-62c3-44e7-b6dd-0135ec77451a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/iFaz_llama31_8B_en_emo_v4/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": 
"documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "llama31_8B_en_emo_v4", - "id": "iFaz/llama31_8B_en_emo_v4", - "developer": "iFaz", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "", - "params_billions": 4.777 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3043 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4916 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0884 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.297 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3643 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3049 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/iFaz/llama32_1B_en_emo_v1/d2054469-b38b-4b1d-bd40-7324319f8eca.json b/data/hfopenllm_v2/iFaz/llama32_1B_en_emo_v1/d2054469-b38b-4b1d-bd40-7324319f8eca.json deleted file mode 100644 index 03f4a931f..000000000 --- a/data/hfopenllm_v2/iFaz/llama32_1B_en_emo_v1/d2054469-b38b-4b1d-bd40-7324319f8eca.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/iFaz_llama32_1B_en_emo_v1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "llama32_1B_en_emo_v1", - "id": "iFaz/llama32_1B_en_emo_v1", - "developer": "iFaz", - "inference_platform": "unknown", - "additional_details": { - 
"precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 0.765 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4408 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.338 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0378 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.25 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3489 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1761 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/iFaz/llama32_3B_en_emo_1000_stp/ce60608d-5b52-49d4-bbce-4b20e8272cef.json b/data/hfopenllm_v2/iFaz/llama32_3B_en_emo_1000_stp/ce60608d-5b52-49d4-bbce-4b20e8272cef.json deleted file mode 100644 index d3b7b81e6..000000000 --- a/data/hfopenllm_v2/iFaz/llama32_3B_en_emo_1000_stp/ce60608d-5b52-49d4-bbce-4b20e8272cef.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/iFaz_llama32_3B_en_emo_1000_stp/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "llama32_3B_en_emo_1000_stp", - "id": "iFaz/llama32_3B_en_emo_1000_stp", - "developer": "iFaz", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.848 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - 
"hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7295 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4522 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1465 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2777 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3621 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3123 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/iFaz/llama32_3B_en_emo_2000_stp/f177bb70-fb7c-4b57-965d-acbcb4936bfa.json b/data/hfopenllm_v2/iFaz/llama32_3B_en_emo_2000_stp/f177bb70-fb7c-4b57-965d-acbcb4936bfa.json deleted file mode 100644 index 3a25847fc..000000000 --- a/data/hfopenllm_v2/iFaz/llama32_3B_en_emo_2000_stp/f177bb70-fb7c-4b57-965d-acbcb4936bfa.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/iFaz_llama32_3B_en_emo_2000_stp/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "llama32_3B_en_emo_2000_stp", - "id": "iFaz/llama32_3B_en_emo_2000_stp", - "developer": "iFaz", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.848 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 
0.7369 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4535 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1533 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2836 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3527 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3098 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/iFaz/llama32_3B_en_emo_300_stp/a5b2ab3d-1f12-4a5a-a110-2514185568b6.json b/data/hfopenllm_v2/iFaz/llama32_3B_en_emo_300_stp/a5b2ab3d-1f12-4a5a-a110-2514185568b6.json deleted file mode 100644 index b84be7a8b..000000000 --- a/data/hfopenllm_v2/iFaz/llama32_3B_en_emo_300_stp/a5b2ab3d-1f12-4a5a-a110-2514185568b6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/iFaz_llama32_3B_en_emo_300_stp/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "llama32_3B_en_emo_300_stp", - "id": "iFaz/llama32_3B_en_emo_300_stp", - "developer": "iFaz", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.848 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7256 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4505 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1601 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2743 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3621 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3148 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/iFaz/llama32_3B_en_emo_5000_stp/63b887a1-a0b9-46db-a563-b9bd67a0805a.json b/data/hfopenllm_v2/iFaz/llama32_3B_en_emo_5000_stp/63b887a1-a0b9-46db-a563-b9bd67a0805a.json deleted file mode 100644 index 2c79dd41d..000000000 --- a/data/hfopenllm_v2/iFaz/llama32_3B_en_emo_5000_stp/63b887a1-a0b9-46db-a563-b9bd67a0805a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/iFaz_llama32_3B_en_emo_5000_stp/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "llama32_3B_en_emo_5000_stp", - "id": "iFaz/llama32_3B_en_emo_5000_stp", - "developer": "iFaz", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.848 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.71 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4568 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 
5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1292 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2794 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3446 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3067 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/iFaz/llama32_3B_en_emo_v2/92d122f7-f29d-49e3-99da-bf20edf377a2.json b/data/hfopenllm_v2/iFaz/llama32_3B_en_emo_v2/92d122f7-f29d-49e3-99da-bf20edf377a2.json deleted file mode 100644 index bc0aeebe3..000000000 --- a/data/hfopenllm_v2/iFaz/llama32_3B_en_emo_v2/92d122f7-f29d-49e3-99da-bf20edf377a2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/iFaz_llama32_3B_en_emo_v2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "llama32_3B_en_emo_v2", - "id": "iFaz/llama32_3B_en_emo_v2", - "developer": "iFaz", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.848 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5454 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4284 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, 
- "score_details": { - "score": 0.1088 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2676 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3482 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3004 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/iFaz/llama32_3B_en_emo_v3/a0b71344-f3a8-4ad0-87c5-6393148488b1.json b/data/hfopenllm_v2/iFaz/llama32_3B_en_emo_v3/a0b71344-f3a8-4ad0-87c5-6393148488b1.json deleted file mode 100644 index 6e61f1e41..000000000 --- a/data/hfopenllm_v2/iFaz/llama32_3B_en_emo_v3/a0b71344-f3a8-4ad0-87c5-6393148488b1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/iFaz_llama32_3B_en_emo_v3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "llama32_3B_en_emo_v3", - "id": "iFaz/llama32_3B_en_emo_v3", - "developer": "iFaz", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.848 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5759 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4301 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.068 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2676 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3553 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.271 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/iRyanBell/ARC1-II/821ff784-c48a-4623-9fb5-b77b7114b625.json b/data/hfopenllm_v2/iRyanBell/ARC1-II/821ff784-c48a-4623-9fb5-b77b7114b625.json deleted file mode 100644 index 2ccabcdfe..000000000 --- a/data/hfopenllm_v2/iRyanBell/ARC1-II/821ff784-c48a-4623-9fb5-b77b7114b625.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/iRyanBell_ARC1-II/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ARC1-II", - "id": "iRyanBell/ARC1-II", - "developer": "iRyanBell", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1708 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3382 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0219 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2718 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4913 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1686 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/iRyanBell/ARC1/ed251513-4807-4e31-bc8e-3ab0217ae4f3.json b/data/hfopenllm_v2/iRyanBell/ARC1/ed251513-4807-4e31-bc8e-3ab0217ae4f3.json deleted file mode 100644 index 0ff922243..000000000 --- a/data/hfopenllm_v2/iRyanBell/ARC1/ed251513-4807-4e31-bc8e-3ab0217ae4f3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/iRyanBell_ARC1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ARC1", - "id": "iRyanBell/ARC1", - "developer": "iRyanBell", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4411 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4903 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0687 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2945 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3991 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": 
"TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3371 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ibivibiv/colossus_120b/e7fa3baa-07b4-4f10-aa9c-8424d8fea303.json b/data/hfopenllm_v2/ibivibiv/colossus_120b/e7fa3baa-07b4-4f10-aa9c-8424d8fea303.json deleted file mode 100644 index 7022b74fb..000000000 --- a/data/hfopenllm_v2/ibivibiv/colossus_120b/e7fa3baa-07b4-4f10-aa9c-8424d8fea303.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ibivibiv_colossus_120b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "colossus_120b", - "id": "ibivibiv/colossus_120b", - "developer": "ibivibiv", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 117.749 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4276 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6061 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0566 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3087 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4733 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3961 - } - } - ] -} \ No newline at end of file diff 
--git a/data/hfopenllm_v2/ibivibiv/multimaster-7b-v6/11dfd131-00bf-4561-a913-f1c0cb15bf9c.json b/data/hfopenllm_v2/ibivibiv/multimaster-7b-v6/11dfd131-00bf-4561-a913-f1c0cb15bf9c.json deleted file mode 100644 index 6b86a8133..000000000 --- a/data/hfopenllm_v2/ibivibiv/multimaster-7b-v6/11dfd131-00bf-4561-a913-f1c0cb15bf9c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ibivibiv_multimaster-7b-v6/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "multimaster-7b-v6", - "id": "ibivibiv/multimaster-7b-v6", - "developer": "ibivibiv", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MixtralForCausalLM", - "params_billions": 35.428 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4473 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5194 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0559 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3037 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4396 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3095 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ibm-granite/granite-3.0-1b-a400m-base/3ba34f38-2340-407f-a7b5-82749f8a0ee6.json b/data/hfopenllm_v2/ibm-granite/granite-3.0-1b-a400m-base/3ba34f38-2340-407f-a7b5-82749f8a0ee6.json deleted file mode 100644 index f51a2ca74..000000000 --- 
a/data/hfopenllm_v2/ibm-granite/granite-3.0-1b-a400m-base/3ba34f38-2340-407f-a7b5-82749f8a0ee6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ibm-granite_granite-3.0-1b-a400m-base/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "granite-3.0-1b-a400m-base", - "id": "ibm-granite/granite-3.0-1b-a400m-base", - "developer": "ibm-granite", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "GraniteForCausalLM", - "params_billions": 1.335 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2404 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3221 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0264 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2475 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3367 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1152 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ibm-granite/granite-3.0-1b-a400m-instruct/91b9649b-bdf6-4b15-a038-47edc2e79ef6.json b/data/hfopenllm_v2/ibm-granite/granite-3.0-1b-a400m-instruct/91b9649b-bdf6-4b15-a038-47edc2e79ef6.json deleted file mode 100644 index d041957dc..000000000 --- a/data/hfopenllm_v2/ibm-granite/granite-3.0-1b-a400m-instruct/91b9649b-bdf6-4b15-a038-47edc2e79ef6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/ibm-granite_granite-3.0-1b-a400m-instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "granite-3.0-1b-a400m-instruct", - "id": "ibm-granite/granite-3.0-1b-a400m-instruct", - "developer": "ibm-granite", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "GraniteForCausalLM", - "params_billions": 1.335 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3332 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3224 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0279 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2609 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3623 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1244 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ibm-granite/granite-3.0-2b-base/24670e63-32e1-4c5d-82fe-0d0c45a4e165.json b/data/hfopenllm_v2/ibm-granite/granite-3.0-2b-base/24670e63-32e1-4c5d-82fe-0d0c45a4e165.json deleted file mode 100644 index 875fbc380..000000000 --- a/data/hfopenllm_v2/ibm-granite/granite-3.0-2b-base/24670e63-32e1-4c5d-82fe-0d0c45a4e165.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ibm-granite_granite-3.0-2b-base/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", 
- "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "granite-3.0-2b-base", - "id": "ibm-granite/granite-3.0-2b-base", - "developer": "ibm-granite", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "GraniteForCausalLM", - "params_billions": 2.634 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3874 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4047 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0544 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2802 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3434 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2381 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ibm-granite/granite-3.0-2b-instruct/198d1441-1d13-468a-a998-c8cf9f1e7a57.json b/data/hfopenllm_v2/ibm-granite/granite-3.0-2b-instruct/198d1441-1d13-468a-a998-c8cf9f1e7a57.json deleted file mode 100644 index 3d240c282..000000000 --- a/data/hfopenllm_v2/ibm-granite/granite-3.0-2b-instruct/198d1441-1d13-468a-a998-c8cf9f1e7a57.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ibm-granite_granite-3.0-2b-instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "granite-3.0-2b-instruct", - "id": "ibm-granite/granite-3.0-2b-instruct", - "developer": 
"ibm-granite", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "GraniteForCausalLM", - "params_billions": 2.634 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.514 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4412 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0921 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2995 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3515 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2814 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ibm-granite/granite-3.0-3b-a800m-base/e9eb1499-835c-4a70-b531-4be5a9718c34.json b/data/hfopenllm_v2/ibm-granite/granite-3.0-3b-a800m-base/e9eb1499-835c-4a70-b531-4be5a9718c34.json deleted file mode 100644 index 226473a3b..000000000 --- a/data/hfopenllm_v2/ibm-granite/granite-3.0-3b-a800m-base/e9eb1499-835c-4a70-b531-4be5a9718c34.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ibm-granite_granite-3.0-3b-a800m-base/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "granite-3.0-3b-a800m-base", - "id": "ibm-granite/granite-3.0-3b-a800m-base", - "developer": "ibm-granite", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "GraniteForCausalLM", - "params_billions": 3.374 - } - }, - 
"evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2732 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3667 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0483 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2517 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.342 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1891 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ibm-granite/granite-3.0-3b-a800m-instruct/b1fd95ad-767d-4c13-a936-00b08c74ca3d.json b/data/hfopenllm_v2/ibm-granite/granite-3.0-3b-a800m-instruct/b1fd95ad-767d-4c13-a936-00b08c74ca3d.json deleted file mode 100644 index b7c39baf2..000000000 --- a/data/hfopenllm_v2/ibm-granite/granite-3.0-3b-a800m-instruct/b1fd95ad-767d-4c13-a936-00b08c74ca3d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ibm-granite_granite-3.0-3b-a800m-instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "granite-3.0-3b-a800m-instruct", - "id": "ibm-granite/granite-3.0-3b-a800m-instruct", - "developer": "ibm-granite", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "GraniteForCausalLM", - "params_billions": 3.374 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4298 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3753 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0702 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.281 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3487 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2152 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ibm-granite/granite-3.0-8b-base/f87bd357-535e-4450-b01d-b41e1b7571e0.json b/data/hfopenllm_v2/ibm-granite/granite-3.0-8b-base/f87bd357-535e-4450-b01d-b41e1b7571e0.json deleted file mode 100644 index fba09f2d6..000000000 --- a/data/hfopenllm_v2/ibm-granite/granite-3.0-8b-base/f87bd357-535e-4450-b01d-b41e1b7571e0.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ibm-granite_granite-3.0-8b-base/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "granite-3.0-8b-base", - "id": "ibm-granite/granite-3.0-8b-base", - "developer": "ibm-granite", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "GraniteForCausalLM", - "params_billions": 8.171 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4583 - } - }, - { - 
"evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4944 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1012 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3255 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4081 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3313 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ibm-granite/granite-3.0-8b-instruct/300fd27e-4dce-441f-91da-f38bd14ffe5e.json b/data/hfopenllm_v2/ibm-granite/granite-3.0-8b-instruct/300fd27e-4dce-441f-91da-f38bd14ffe5e.json deleted file mode 100644 index b5c6943a0..000000000 --- a/data/hfopenllm_v2/ibm-granite/granite-3.0-8b-instruct/300fd27e-4dce-441f-91da-f38bd14ffe5e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ibm-granite_granite-3.0-8b-instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "granite-3.0-8b-instruct", - "id": "ibm-granite/granite-3.0-8b-instruct", - "developer": "ibm-granite", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "GraniteForCausalLM", - "params_billions": 8.171 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.531 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5192 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.142 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3322 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3901 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3457 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ibm-granite/granite-3.1-1b-a400m-base/1fd9a2e5-856f-4303-8ac1-611311f3e7b5.json b/data/hfopenllm_v2/ibm-granite/granite-3.1-1b-a400m-base/1fd9a2e5-856f-4303-8ac1-611311f3e7b5.json deleted file mode 100644 index 81e9a2dd8..000000000 --- a/data/hfopenllm_v2/ibm-granite/granite-3.1-1b-a400m-base/1fd9a2e5-856f-4303-8ac1-611311f3e7b5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ibm-granite_granite-3.1-1b-a400m-base/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "granite-3.1-1b-a400m-base", - "id": "ibm-granite/granite-3.1-1b-a400m-base", - "developer": "ibm-granite", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "GraniteMoeForCausalLM", - "params_billions": 1.335 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2519 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3299 - } - }, - { - "evaluation_name": "MATH Level 5", - 
"source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0272 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2517 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3501 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1139 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ibm-granite/granite-3.1-1b-a400m-instruct/4c34d5c6-af1b-4519-8d08-67bd837e9b97.json b/data/hfopenllm_v2/ibm-granite/granite-3.1-1b-a400m-instruct/4c34d5c6-af1b-4519-8d08-67bd837e9b97.json deleted file mode 100644 index 18f994f0f..000000000 --- a/data/hfopenllm_v2/ibm-granite/granite-3.1-1b-a400m-instruct/4c34d5c6-af1b-4519-8d08-67bd837e9b97.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ibm-granite_granite-3.1-1b-a400m-instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "granite-3.1-1b-a400m-instruct", - "id": "ibm-granite/granite-3.1-1b-a400m-instruct", - "developer": "ibm-granite", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "GraniteMoeForCausalLM", - "params_billions": 1.335 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4686 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.328 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - 
"evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0453 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2399 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3302 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1217 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ibm-granite/granite-3.1-2b-base/ddc27df7-1c4c-4563-92b2-5a39380423a8.json b/data/hfopenllm_v2/ibm-granite/granite-3.1-2b-base/ddc27df7-1c4c-4563-92b2-5a39380423a8.json deleted file mode 100644 index d36351d16..000000000 --- a/data/hfopenllm_v2/ibm-granite/granite-3.1-2b-base/ddc27df7-1c4c-4563-92b2-5a39380423a8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ibm-granite_granite-3.1-2b-base/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "granite-3.1-2b-base", - "id": "ibm-granite/granite-3.1-2b-base", - "developer": "ibm-granite", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "GraniteForCausalLM", - "params_billions": 2.534 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3522 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4047 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0566 - } - }, - { - "evaluation_name": 
"GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2777 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3486 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2251 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ibm-granite/granite-3.1-2b-instruct/3e606ef8-9caa-43d4-81d6-8eae9936ab4c.json b/data/hfopenllm_v2/ibm-granite/granite-3.1-2b-instruct/3e606ef8-9caa-43d4-81d6-8eae9936ab4c.json deleted file mode 100644 index 8f8fd088d..000000000 --- a/data/hfopenllm_v2/ibm-granite/granite-3.1-2b-instruct/3e606ef8-9caa-43d4-81d6-8eae9936ab4c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ibm-granite_granite-3.1-2b-instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "granite-3.1-2b-instruct", - "id": "ibm-granite/granite-3.1-2b-instruct", - "developer": "ibm-granite", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "GraniteForCausalLM", - "params_billions": 2.534 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6286 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4409 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1526 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": 
false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2894 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3605 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2819 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ibm-granite/granite-3.1-3b-a800m-base/b9053559-3b90-4de0-981a-dbb49db38eb5.json b/data/hfopenllm_v2/ibm-granite/granite-3.1-3b-a800m-base/b9053559-3b90-4de0-981a-dbb49db38eb5.json deleted file mode 100644 index d42475062..000000000 --- a/data/hfopenllm_v2/ibm-granite/granite-3.1-3b-a800m-base/b9053559-3b90-4de0-981a-dbb49db38eb5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ibm-granite_granite-3.1-3b-a800m-base/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "granite-3.1-3b-a800m-base", - "id": "ibm-granite/granite-3.1-3b-a800m-base", - "developer": "ibm-granite", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "GraniteMoeForCausalLM", - "params_billions": 3.299 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2996 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3628 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0453 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2777 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - 
"dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3275 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1793 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ibm-granite/granite-3.1-3b-a800m-instruct/cea89bc6-b1a1-4b67-a136-45e097563a5b.json b/data/hfopenllm_v2/ibm-granite/granite-3.1-3b-a800m-instruct/cea89bc6-b1a1-4b67-a136-45e097563a5b.json deleted file mode 100644 index cd1998b51..000000000 --- a/data/hfopenllm_v2/ibm-granite/granite-3.1-3b-a800m-instruct/cea89bc6-b1a1-4b67-a136-45e097563a5b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ibm-granite_granite-3.1-3b-a800m-instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "granite-3.1-3b-a800m-instruct", - "id": "ibm-granite/granite-3.1-3b-a800m-instruct", - "developer": "ibm-granite", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "GraniteMoeForCausalLM", - "params_billions": 3.299 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5516 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4009 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.114 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2886 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3486 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2148 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ibm-granite/granite-3.1-8b-base/5eb16113-7d0d-47a0-91d8-ec7dab35efdd.json b/data/hfopenllm_v2/ibm-granite/granite-3.1-8b-base/5eb16113-7d0d-47a0-91d8-ec7dab35efdd.json deleted file mode 100644 index 0a47fdba2..000000000 --- a/data/hfopenllm_v2/ibm-granite/granite-3.1-8b-base/5eb16113-7d0d-47a0-91d8-ec7dab35efdd.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ibm-granite_granite-3.1-8b-base/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "granite-3.1-8b-base", - "id": "ibm-granite/granite-3.1-8b-base", - "developer": "ibm-granite", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "GraniteForCausalLM", - "params_billions": 8.171 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4221 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4777 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0944 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3213 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3922 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", 
- "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3232 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ibm-granite/granite-3.1-8b-instruct/45aa6545-d20a-4dfb-a8a6-01f2fd34c9f5.json b/data/hfopenllm_v2/ibm-granite/granite-3.1-8b-instruct/45aa6545-d20a-4dfb-a8a6-01f2fd34c9f5.json deleted file mode 100644 index 5014bf7fd..000000000 --- a/data/hfopenllm_v2/ibm-granite/granite-3.1-8b-instruct/45aa6545-d20a-4dfb-a8a6-01f2fd34c9f5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ibm-granite_granite-3.1-8b-instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "granite-3.1-8b-instruct", - "id": "ibm-granite/granite-3.1-8b-instruct", - "developer": "ibm-granite", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "GraniteForCausalLM", - "params_billions": 8.171 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7208 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5364 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2198 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3121 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4707 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3537 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ibm-granite/granite-3.2-2b-instruct/c94079d1-d8b1-4198-8129-8c5a11c310ca.json b/data/hfopenllm_v2/ibm-granite/granite-3.2-2b-instruct/c94079d1-d8b1-4198-8129-8c5a11c310ca.json deleted file mode 100644 index b97bdbfbb..000000000 --- a/data/hfopenllm_v2/ibm-granite/granite-3.2-2b-instruct/c94079d1-d8b1-4198-8129-8c5a11c310ca.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ibm-granite_granite-3.2-2b-instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "granite-3.2-2b-instruct", - "id": "ibm-granite/granite-3.2-2b-instruct", - "developer": "ibm-granite", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "GraniteForCausalLM", - "params_billions": 2.534 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6152 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4387 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1443 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2903 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3646 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2783 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/ibm-granite/granite-3.2-8b-instruct/cb45306a-096c-4ed5-a028-6d720b26afe9.json b/data/hfopenllm_v2/ibm-granite/granite-3.2-8b-instruct/cb45306a-096c-4ed5-a028-6d720b26afe9.json deleted file mode 100644 index 15fc669a2..000000000 --- a/data/hfopenllm_v2/ibm-granite/granite-3.2-8b-instruct/cb45306a-096c-4ed5-a028-6d720b26afe9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ibm-granite_granite-3.2-8b-instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "granite-3.2-8b-instruct", - "id": "ibm-granite/granite-3.2-8b-instruct", - "developer": "ibm-granite", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "GraniteForCausalLM", - "params_billions": 8.171 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7275 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5402 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2379 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3154 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4562 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3512 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ibm-granite/granite-7b-base/f301908e-474b-4ba2-a873-610ca1b6c2bd.json b/data/hfopenllm_v2/ibm-granite/granite-7b-base/f301908e-474b-4ba2-a873-610ca1b6c2bd.json deleted file mode 100644 
index 678678ab3..000000000 --- a/data/hfopenllm_v2/ibm-granite/granite-7b-base/f301908e-474b-4ba2-a873-610ca1b6c2bd.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ibm-granite_granite-7b-base/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "granite-7b-base", - "id": "ibm-granite/granite-7b-base", - "developer": "ibm-granite", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 6.738 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2414 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.348 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0159 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2458 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3555 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1834 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ibm-granite/granite-7b-instruct/06f5865d-a62a-48da-b33f-486fe29e3685.json b/data/hfopenllm_v2/ibm-granite/granite-7b-instruct/06f5865d-a62a-48da-b33f-486fe29e3685.json deleted file mode 100644 index d82e7bb12..000000000 --- a/data/hfopenllm_v2/ibm-granite/granite-7b-instruct/06f5865d-a62a-48da-b33f-486fe29e3685.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/ibm-granite_granite-7b-instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "granite-7b-instruct", - "id": "ibm-granite/granite-7b-instruct", - "developer": "ibm-granite", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 6.738 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2972 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3723 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0204 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2852 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.402 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2286 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ibm/PowerLM-3b/4f952c51-91dc-446e-bda1-43ed66e1ca3e.json b/data/hfopenllm_v2/ibm/PowerLM-3b/4f952c51-91dc-446e-bda1-43ed66e1ca3e.json deleted file mode 100644 index fb5d9a315..000000000 --- a/data/hfopenllm_v2/ibm/PowerLM-3b/4f952c51-91dc-446e-bda1-43ed66e1ca3e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ibm_PowerLM-3b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - 
"model_info": { - "name": "PowerLM-3b", - "id": "ibm/PowerLM-3b", - "developer": "ibm", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "GraniteForCausalLM", - "params_billions": 3.512 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3321 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3679 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0363 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2752 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3563 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2016 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ibm/merlinite-7b/dcba3a6f-8f4f-49f6-af74-541de16be435.json b/data/hfopenllm_v2/ibm/merlinite-7b/dcba3a6f-8f4f-49f6-af74-541de16be435.json deleted file mode 100644 index 240a78e34..000000000 --- a/data/hfopenllm_v2/ibm/merlinite-7b/dcba3a6f-8f4f-49f6-af74-541de16be435.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ibm_merlinite-7b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "merlinite-7b", - "id": "ibm/merlinite-7b", - "developer": "ibm", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - 
"source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2499 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5007 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0242 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.297 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4412 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3068 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/icefog72/Ice0.15-02.10-RP/b5d39bcb-dab4-4880-9cb1-68dbd20a3ce5.json b/data/hfopenllm_v2/icefog72/Ice0.15-02.10-RP/b5d39bcb-dab4-4880-9cb1-68dbd20a3ce5.json deleted file mode 100644 index 920868959..000000000 --- a/data/hfopenllm_v2/icefog72/Ice0.15-02.10-RP/b5d39bcb-dab4-4880-9cb1-68dbd20a3ce5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/icefog72_Ice0.15-02.10-RP/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Ice0.15-02.10-RP", - "id": "icefog72/Ice0.15-02.10-RP", - "developer": "icefog72", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.5343 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4976 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0574 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2777 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.432 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3066 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/icefog72/Ice0.16-02.10-RP/1e597e9b-4e75-4981-842b-dad6f1c15ed7.json b/data/hfopenllm_v2/icefog72/Ice0.16-02.10-RP/1e597e9b-4e75-4981-842b-dad6f1c15ed7.json deleted file mode 100644 index 6d89ee60d..000000000 --- a/data/hfopenllm_v2/icefog72/Ice0.16-02.10-RP/1e597e9b-4e75-4981-842b-dad6f1c15ed7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/icefog72_Ice0.16-02.10-RP/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Ice0.16-02.10-RP", - "id": "icefog72/Ice0.16-02.10-RP", - "developer": "icefog72", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5069 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": 
"Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4946 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0589 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2794 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4334 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3068 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/icefog72/Ice0.17-03.10-RP/18752dc4-76d1-40dc-9f43-62b8087b7a88.json b/data/hfopenllm_v2/icefog72/Ice0.17-03.10-RP/18752dc4-76d1-40dc-9f43-62b8087b7a88.json deleted file mode 100644 index 1e5cdb6bc..000000000 --- a/data/hfopenllm_v2/icefog72/Ice0.17-03.10-RP/18752dc4-76d1-40dc-9f43-62b8087b7a88.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/icefog72_Ice0.17-03.10-RP/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Ice0.17-03.10-RP", - "id": "icefog72/Ice0.17-03.10-RP", - "developer": "icefog72", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5124 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5007 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - 
"source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0612 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2819 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4334 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3085 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/icefog72/Ice0.27-06.11-RP/fa30c36e-20f1-41ee-a59d-0044f2b76dfb.json b/data/hfopenllm_v2/icefog72/Ice0.27-06.11-RP/fa30c36e-20f1-41ee-a59d-0044f2b76dfb.json deleted file mode 100644 index ce00faedd..000000000 --- a/data/hfopenllm_v2/icefog72/Ice0.27-06.11-RP/fa30c36e-20f1-41ee-a59d-0044f2b76dfb.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/icefog72_Ice0.27-06.11-RP/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Ice0.27-06.11-RP", - "id": "icefog72/Ice0.27-06.11-RP", - "developer": "icefog72", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4918 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5112 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.0566 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3121 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4328 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3154 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/icefog72/Ice0.29-06.11-RP/5391ae8f-41b0-41cb-9365-b5cb7649c8b7.json b/data/hfopenllm_v2/icefog72/Ice0.29-06.11-RP/5391ae8f-41b0-41cb-9365-b5cb7649c8b7.json deleted file mode 100644 index 8a1fccbff..000000000 --- a/data/hfopenllm_v2/icefog72/Ice0.29-06.11-RP/5391ae8f-41b0-41cb-9365-b5cb7649c8b7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/icefog72_Ice0.29-06.11-RP/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Ice0.29-06.11-RP", - "id": "icefog72/Ice0.29-06.11-RP", - "developer": "icefog72", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4861 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5088 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0566 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3029 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4459 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3093 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/icefog72/Ice0.31-08.11-RP/a95ab4cf-456f-4b3d-9bab-2b755649758d.json b/data/hfopenllm_v2/icefog72/Ice0.31-08.11-RP/a95ab4cf-456f-4b3d-9bab-2b755649758d.json deleted file mode 100644 index b2b7ce69b..000000000 --- a/data/hfopenllm_v2/icefog72/Ice0.31-08.11-RP/a95ab4cf-456f-4b3d-9bab-2b755649758d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/icefog72_Ice0.31-08.11-RP/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Ice0.31-08.11-RP", - "id": "icefog72/Ice0.31-08.11-RP", - "developer": "icefog72", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5146 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5032 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0612 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3079 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": 
"TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4277 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3131 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/icefog72/Ice0.32-10.11-RP/9840baa9-2ddf-4dd9-b3b0-3ec3075089bc.json b/data/hfopenllm_v2/icefog72/Ice0.32-10.11-RP/9840baa9-2ddf-4dd9-b3b0-3ec3075089bc.json deleted file mode 100644 index c07a2e5fe..000000000 --- a/data/hfopenllm_v2/icefog72/Ice0.32-10.11-RP/9840baa9-2ddf-4dd9-b3b0-3ec3075089bc.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/icefog72_Ice0.32-10.11-RP/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Ice0.32-10.11-RP", - "id": "icefog72/Ice0.32-10.11-RP", - "developer": "icefog72", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4915 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5048 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0514 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3121 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4382 - } - }, - { - "evaluation_name": 
"MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.31 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/icefog72/Ice0.34b-14.11-RP/26ff113c-95ca-4716-83f7-4792b46be246.json b/data/hfopenllm_v2/icefog72/Ice0.34b-14.11-RP/26ff113c-95ca-4716-83f7-4792b46be246.json deleted file mode 100644 index e450135ff..000000000 --- a/data/hfopenllm_v2/icefog72/Ice0.34b-14.11-RP/26ff113c-95ca-4716-83f7-4792b46be246.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/icefog72_Ice0.34b-14.11-RP/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Ice0.34b-14.11-RP", - "id": "icefog72/Ice0.34b-14.11-RP", - "developer": "icefog72", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4762 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5067 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.065 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3096 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.442 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3125 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/icefog72/Ice0.34n-14.11-RP/285e1d08-15a0-4d8b-a844-e4cad923ea9b.json b/data/hfopenllm_v2/icefog72/Ice0.34n-14.11-RP/285e1d08-15a0-4d8b-a844-e4cad923ea9b.json deleted file mode 100644 index 8fc9467cf..000000000 --- a/data/hfopenllm_v2/icefog72/Ice0.34n-14.11-RP/285e1d08-15a0-4d8b-a844-e4cad923ea9b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/icefog72_Ice0.34n-14.11-RP/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Ice0.34n-14.11-RP", - "id": "icefog72/Ice0.34n-14.11-RP", - "developer": "icefog72", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4787 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5091 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0725 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3138 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.438 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3124 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/icefog72/Ice0.37-18.11-RP/0462269d-94a3-4991-9af5-e55592f344e5.json 
b/data/hfopenllm_v2/icefog72/Ice0.37-18.11-RP/0462269d-94a3-4991-9af5-e55592f344e5.json deleted file mode 100644 index 11c77c634..000000000 --- a/data/hfopenllm_v2/icefog72/Ice0.37-18.11-RP/0462269d-94a3-4991-9af5-e55592f344e5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/icefog72_Ice0.37-18.11-RP/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Ice0.37-18.11-RP", - "id": "icefog72/Ice0.37-18.11-RP", - "developer": "icefog72", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4972 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5084 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0642 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3121 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4339 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3143 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/icefog72/Ice0.38-19.11-RP/c47c4cd6-90b6-42df-a3b9-4fc8f1b3c980.json b/data/hfopenllm_v2/icefog72/Ice0.38-19.11-RP/c47c4cd6-90b6-42df-a3b9-4fc8f1b3c980.json deleted file mode 100644 index d940d4450..000000000 --- a/data/hfopenllm_v2/icefog72/Ice0.38-19.11-RP/c47c4cd6-90b6-42df-a3b9-4fc8f1b3c980.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - 
"schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/icefog72_Ice0.38-19.11-RP/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Ice0.38-19.11-RP", - "id": "icefog72/Ice0.38-19.11-RP", - "developer": "icefog72", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4403 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5101 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0551 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3045 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4367 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.314 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/icefog72/Ice0.39-19.11-RP/0fecafe4-f8f0-4f97-ab2d-589a3856e1af.json b/data/hfopenllm_v2/icefog72/Ice0.39-19.11-RP/0fecafe4-f8f0-4f97-ab2d-589a3856e1af.json deleted file mode 100644 index 07a2c60d3..000000000 --- a/data/hfopenllm_v2/icefog72/Ice0.39-19.11-RP/0fecafe4-f8f0-4f97-ab2d-589a3856e1af.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/icefog72_Ice0.39-19.11-RP/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - 
"source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Ice0.39-19.11-RP", - "id": "icefog72/Ice0.39-19.11-RP", - "developer": "icefog72", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4757 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5093 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0498 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3104 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4341 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3127 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/icefog72/Ice0.40-20.11-RP/4b5529b9-0800-4cd6-b720-a905ab5e6c9a.json b/data/hfopenllm_v2/icefog72/Ice0.40-20.11-RP/4b5529b9-0800-4cd6-b720-a905ab5e6c9a.json deleted file mode 100644 index ef805d1b3..000000000 --- a/data/hfopenllm_v2/icefog72/Ice0.40-20.11-RP/4b5529b9-0800-4cd6-b720-a905ab5e6c9a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/icefog72_Ice0.40-20.11-RP/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Ice0.40-20.11-RP", - "id": "icefog72/Ice0.40-20.11-RP", - "developer": "icefog72", - "inference_platform": "unknown", - "additional_details": { - 
"precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4763 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5093 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0642 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.307 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4446 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3099 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/icefog72/Ice0.41-22.11-RP/84783e4d-5eed-474d-9463-a01a0890850e.json b/data/hfopenllm_v2/icefog72/Ice0.41-22.11-RP/84783e4d-5eed-474d-9463-a01a0890850e.json deleted file mode 100644 index baa053681..000000000 --- a/data/hfopenllm_v2/icefog72/Ice0.41-22.11-RP/84783e4d-5eed-474d-9463-a01a0890850e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/icefog72_Ice0.41-22.11-RP/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Ice0.41-22.11-RP", - "id": "icefog72/Ice0.41-22.11-RP", - "developer": "icefog72", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - 
}, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.462 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4723 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.031 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2869 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.456 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2618 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/icefog72/Ice0.50-16.01-RP/d9fe39c5-24a5-4240-bfc9-59860fcb3911.json b/data/hfopenllm_v2/icefog72/Ice0.50-16.01-RP/d9fe39c5-24a5-4240-bfc9-59860fcb3911.json deleted file mode 100644 index eb81cb8cd..000000000 --- a/data/hfopenllm_v2/icefog72/Ice0.50-16.01-RP/d9fe39c5-24a5-4240-bfc9-59860fcb3911.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/icefog72_Ice0.50-16.01-RP/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Ice0.50-16.01-RP", - "id": "icefog72/Ice0.50-16.01-RP", - "developer": "icefog72", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4385 - } - }, - { - "evaluation_name": "BBH", - "source_data": 
{ - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.498 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0468 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2995 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4381 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3069 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/icefog72/Ice0.50.1-16.01-RP/2ddf850e-36dc-41b2-92da-e2b45d1544c6.json b/data/hfopenllm_v2/icefog72/Ice0.50.1-16.01-RP/2ddf850e-36dc-41b2-92da-e2b45d1544c6.json deleted file mode 100644 index bcbafb408..000000000 --- a/data/hfopenllm_v2/icefog72/Ice0.50.1-16.01-RP/2ddf850e-36dc-41b2-92da-e2b45d1544c6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/icefog72_Ice0.50.1-16.01-RP/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Ice0.50.1-16.01-RP", - "id": "icefog72/Ice0.50.1-16.01-RP", - "developer": "icefog72", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4829 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 
- }, - "score_details": { - "score": 0.5107 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0612 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3096 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4327 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3132 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/icefog72/Ice0.51-16.01-RP/b10a9284-fa5e-4a4e-8240-edc98cea6d9c.json b/data/hfopenllm_v2/icefog72/Ice0.51-16.01-RP/b10a9284-fa5e-4a4e-8240-edc98cea6d9c.json deleted file mode 100644 index a158fdd3f..000000000 --- a/data/hfopenllm_v2/icefog72/Ice0.51-16.01-RP/b10a9284-fa5e-4a4e-8240-edc98cea6d9c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/icefog72_Ice0.51-16.01-RP/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Ice0.51-16.01-RP", - "id": "icefog72/Ice0.51-16.01-RP", - "developer": "icefog72", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4431 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5044 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - 
"evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0514 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3045 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4437 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.306 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/icefog72/Ice0.51.1-16.01-RP/2c51bd1d-ebe8-4de9-9749-5f42f7ba3d5a.json b/data/hfopenllm_v2/icefog72/Ice0.51.1-16.01-RP/2c51bd1d-ebe8-4de9-9749-5f42f7ba3d5a.json deleted file mode 100644 index 1cd3c840d..000000000 --- a/data/hfopenllm_v2/icefog72/Ice0.51.1-16.01-RP/2c51bd1d-ebe8-4de9-9749-5f42f7ba3d5a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/icefog72_Ice0.51.1-16.01-RP/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Ice0.51.1-16.01-RP", - "id": "icefog72/Ice0.51.1-16.01-RP", - "developer": "icefog72", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4573 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5121 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0642 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { 
- "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3062 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4394 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3104 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/icefog72/Ice0.52-16.01-RP/425e6f1e-50dd-444f-b0da-5a0c47d5bf06.json b/data/hfopenllm_v2/icefog72/Ice0.52-16.01-RP/425e6f1e-50dd-444f-b0da-5a0c47d5bf06.json deleted file mode 100644 index 10d868dab..000000000 --- a/data/hfopenllm_v2/icefog72/Ice0.52-16.01-RP/425e6f1e-50dd-444f-b0da-5a0c47d5bf06.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/icefog72_Ice0.52-16.01-RP/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Ice0.52-16.01-RP", - "id": "icefog72/Ice0.52-16.01-RP", - "developer": "icefog72", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4503 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5047 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0506 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.3087 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4396 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.308 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/icefog72/Ice0.52.1-16.01-RP/7e1fcf4e-9f64-4112-934c-4808f07d32b2.json b/data/hfopenllm_v2/icefog72/Ice0.52.1-16.01-RP/7e1fcf4e-9f64-4112-934c-4808f07d32b2.json deleted file mode 100644 index 51094c005..000000000 --- a/data/hfopenllm_v2/icefog72/Ice0.52.1-16.01-RP/7e1fcf4e-9f64-4112-934c-4808f07d32b2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/icefog72_Ice0.52.1-16.01-RP/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Ice0.52.1-16.01-RP", - "id": "icefog72/Ice0.52.1-16.01-RP", - "developer": "icefog72", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4549 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5106 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0627 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3037 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", 
- "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4394 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3105 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/icefog72/Ice0.53-16.01-RP/d3666566-09dc-4d53-9996-2301c6fb2721.json b/data/hfopenllm_v2/icefog72/Ice0.53-16.01-RP/d3666566-09dc-4d53-9996-2301c6fb2721.json deleted file mode 100644 index 2b83a20cc..000000000 --- a/data/hfopenllm_v2/icefog72/Ice0.53-16.01-RP/d3666566-09dc-4d53-9996-2301c6fb2721.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/icefog72_Ice0.53-16.01-RP/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Ice0.53-16.01-RP", - "id": "icefog72/Ice0.53-16.01-RP", - "developer": "icefog72", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4741 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5102 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0634 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3087 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4327 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - 
"hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.313 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/icefog72/Ice0.54-17.01-RP/36e5efb9-e3f0-4903-a9f1-3d51453bfdc4.json b/data/hfopenllm_v2/icefog72/Ice0.54-17.01-RP/36e5efb9-e3f0-4903-a9f1-3d51453bfdc4.json deleted file mode 100644 index aac00db76..000000000 --- a/data/hfopenllm_v2/icefog72/Ice0.54-17.01-RP/36e5efb9-e3f0-4903-a9f1-3d51453bfdc4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/icefog72_Ice0.54-17.01-RP/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Ice0.54-17.01-RP", - "id": "icefog72/Ice0.54-17.01-RP", - "developer": "icefog72", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4379 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4853 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0408 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3096 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4874 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2326 - } - } - ] -} \ No 
newline at end of file diff --git a/data/hfopenllm_v2/icefog72/Ice0.55-17.01-RP/a6dba337-81d2-40c6-89c2-aee6de82282e.json b/data/hfopenllm_v2/icefog72/Ice0.55-17.01-RP/a6dba337-81d2-40c6-89c2-aee6de82282e.json deleted file mode 100644 index d93982f8a..000000000 --- a/data/hfopenllm_v2/icefog72/Ice0.55-17.01-RP/a6dba337-81d2-40c6-89c2-aee6de82282e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/icefog72_Ice0.55-17.01-RP/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Ice0.55-17.01-RP", - "id": "icefog72/Ice0.55-17.01-RP", - "developer": "icefog72", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4961 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5077 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0604 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2869 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4725 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2658 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/icefog72/Ice0.57-17.01-RP/e44b8d9a-f270-45c8-b126-6a8911c35436.json b/data/hfopenllm_v2/icefog72/Ice0.57-17.01-RP/e44b8d9a-f270-45c8-b126-6a8911c35436.json deleted file mode 100644 index 100730827..000000000 --- 
a/data/hfopenllm_v2/icefog72/Ice0.57-17.01-RP/e44b8d9a-f270-45c8-b126-6a8911c35436.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/icefog72_Ice0.57-17.01-RP/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Ice0.57-17.01-RP", - "id": "icefog72/Ice0.57-17.01-RP", - "developer": "icefog72", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5152 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5064 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0514 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.297 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4686 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2651 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/icefog72/Ice0.60-18.01-RP/44d5e1ac-45d5-42aa-b9fa-f18112cf6676.json b/data/hfopenllm_v2/icefog72/Ice0.60-18.01-RP/44d5e1ac-45d5-42aa-b9fa-f18112cf6676.json deleted file mode 100644 index 79e0d8058..000000000 --- a/data/hfopenllm_v2/icefog72/Ice0.60-18.01-RP/44d5e1ac-45d5-42aa-b9fa-f18112cf6676.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/icefog72_Ice0.60-18.01-RP/1770682486.623709", - "retrieved_timestamp": 
"1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Ice0.60-18.01-RP", - "id": "icefog72/Ice0.60-18.01-RP", - "developer": "icefog72", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5374 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5094 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0536 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3045 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.467 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2837 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/icefog72/Ice0.60.1-18.01-RP/4246401d-9049-4c83-83d4-e2d9efa4dded.json b/data/hfopenllm_v2/icefog72/Ice0.60.1-18.01-RP/4246401d-9049-4c83-83d4-e2d9efa4dded.json deleted file mode 100644 index a75134cde..000000000 --- a/data/hfopenllm_v2/icefog72/Ice0.60.1-18.01-RP/4246401d-9049-4c83-83d4-e2d9efa4dded.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/icefog72_Ice0.60.1-18.01-RP/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Ice0.60.1-18.01-RP", - 
"id": "icefog72/Ice0.60.1-18.01-RP", - "developer": "icefog72", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5188 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.512 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0461 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.302 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4498 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2914 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/icefog72/Ice0.61-18.01-RP/26c4785a-0caf-4b01-be5d-1e421bfeb698.json b/data/hfopenllm_v2/icefog72/Ice0.61-18.01-RP/26c4785a-0caf-4b01-be5d-1e421bfeb698.json deleted file mode 100644 index de443599c..000000000 --- a/data/hfopenllm_v2/icefog72/Ice0.61-18.01-RP/26c4785a-0caf-4b01-be5d-1e421bfeb698.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/icefog72_Ice0.61-18.01-RP/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Ice0.61-18.01-RP", - "id": "icefog72/Ice0.61-18.01-RP", - "developer": "icefog72", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - 
"evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5441 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5105 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0468 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2987 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4697 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2709 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/icefog72/Ice0.62-18.01-RP/cc9b9a25-18f9-4cc3-a756-3975a3a3be7d.json b/data/hfopenllm_v2/icefog72/Ice0.62-18.01-RP/cc9b9a25-18f9-4cc3-a756-3975a3a3be7d.json deleted file mode 100644 index 1c76bab97..000000000 --- a/data/hfopenllm_v2/icefog72/Ice0.62-18.01-RP/cc9b9a25-18f9-4cc3-a756-3975a3a3be7d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/icefog72_Ice0.62-18.01-RP/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Ice0.62-18.01-RP", - "id": "icefog72/Ice0.62-18.01-RP", - "developer": "icefog72", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", 
- "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5367 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5103 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0574 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2978 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4538 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2877 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/icefog72/Ice0.62.1-24.01-RP/b4edb7f5-a675-4627-af96-7ed0909da1e5.json b/data/hfopenllm_v2/icefog72/Ice0.62.1-24.01-RP/b4edb7f5-a675-4627-af96-7ed0909da1e5.json deleted file mode 100644 index da0bcab1f..000000000 --- a/data/hfopenllm_v2/icefog72/Ice0.62.1-24.01-RP/b4edb7f5-a675-4627-af96-7ed0909da1e5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/icefog72_Ice0.62.1-24.01-RP/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Ice0.62.1-24.01-RP", - "id": "icefog72/Ice0.62.1-24.01-RP", - "developer": "icefog72", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5182 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5109 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0559 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3003 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4551 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2871 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/icefog72/Ice0.64-24.01-RP/461b6f40-6f19-48b1-857e-f0fb37f929f9.json b/data/hfopenllm_v2/icefog72/Ice0.64-24.01-RP/461b6f40-6f19-48b1-857e-f0fb37f929f9.json deleted file mode 100644 index 3cff42e22..000000000 --- a/data/hfopenllm_v2/icefog72/Ice0.64-24.01-RP/461b6f40-6f19-48b1-857e-f0fb37f929f9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/icefog72_Ice0.64-24.01-RP/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Ice0.64-24.01-RP", - "id": "icefog72/Ice0.64-24.01-RP", - "developer": "icefog72", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5441 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.506 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - 
"dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0627 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3037 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.462 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2933 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/icefog72/Ice0.64.1-24.01-RP/e924270d-a655-4093-91b2-f73b7f12eefd.json b/data/hfopenllm_v2/icefog72/Ice0.64.1-24.01-RP/e924270d-a655-4093-91b2-f73b7f12eefd.json deleted file mode 100644 index 63500c106..000000000 --- a/data/hfopenllm_v2/icefog72/Ice0.64.1-24.01-RP/e924270d-a655-4093-91b2-f73b7f12eefd.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/icefog72_Ice0.64.1-24.01-RP/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Ice0.64.1-24.01-RP", - "id": "icefog72/Ice0.64.1-24.01-RP", - "developer": "icefog72", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5447 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.506 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0627 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3037 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.462 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2933 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/icefog72/Ice0.65-25.01-RP/af8905e0-e969-45bd-8e09-e7316fff0914.json b/data/hfopenllm_v2/icefog72/Ice0.65-25.01-RP/af8905e0-e969-45bd-8e09-e7316fff0914.json deleted file mode 100644 index 6575a19cf..000000000 --- a/data/hfopenllm_v2/icefog72/Ice0.65-25.01-RP/af8905e0-e969-45bd-8e09-e7316fff0914.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/icefog72_Ice0.65-25.01-RP/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Ice0.65-25.01-RP", - "id": "icefog72/Ice0.65-25.01-RP", - "developer": "icefog72", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5029 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5096 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.065 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3045 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.434 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2997 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/icefog72/Ice0.66-25.01-RP/e92a6d31-2277-4093-8fae-b3dfaa2d47dd.json b/data/hfopenllm_v2/icefog72/Ice0.66-25.01-RP/e92a6d31-2277-4093-8fae-b3dfaa2d47dd.json deleted file mode 100644 index f0e31f665..000000000 --- a/data/hfopenllm_v2/icefog72/Ice0.66-25.01-RP/e92a6d31-2277-4093-8fae-b3dfaa2d47dd.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/icefog72_Ice0.66-25.01-RP/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Ice0.66-25.01-RP", - "id": "icefog72/Ice0.66-25.01-RP", - "developer": "icefog72", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5325 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5129 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0604 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.307 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": 
"MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4434 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3039 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/icefog72/Ice0.67-25.01-RP/47472cd9-36d3-4074-83d4-af53b9c23758.json b/data/hfopenllm_v2/icefog72/Ice0.67-25.01-RP/47472cd9-36d3-4074-83d4-af53b9c23758.json deleted file mode 100644 index 46ca836b5..000000000 --- a/data/hfopenllm_v2/icefog72/Ice0.67-25.01-RP/47472cd9-36d3-4074-83d4-af53b9c23758.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/icefog72_Ice0.67-25.01-RP/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Ice0.67-25.01-RP", - "id": "icefog72/Ice0.67-25.01-RP", - "developer": "icefog72", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5361 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5113 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0748 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2852 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.4279 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3097 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/icefog72/Ice0.68-25.01-RP/b922f4e1-1fd9-4a32-94ce-4784430cef51.json b/data/hfopenllm_v2/icefog72/Ice0.68-25.01-RP/b922f4e1-1fd9-4a32-94ce-4784430cef51.json deleted file mode 100644 index 02b0eca7c..000000000 --- a/data/hfopenllm_v2/icefog72/Ice0.68-25.01-RP/b922f4e1-1fd9-4a32-94ce-4784430cef51.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/icefog72_Ice0.68-25.01-RP/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Ice0.68-25.01-RP", - "id": "icefog72/Ice0.68-25.01-RP", - "developer": "icefog72", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5514 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.513 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0725 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3104 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4446 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3012 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/icefog72/Ice0.69-25.01-RP/5bb2e77f-7709-4eb8-bd08-3c8da4a56310.json b/data/hfopenllm_v2/icefog72/Ice0.69-25.01-RP/5bb2e77f-7709-4eb8-bd08-3c8da4a56310.json deleted file mode 100644 index 87263c5f0..000000000 --- a/data/hfopenllm_v2/icefog72/Ice0.69-25.01-RP/5bb2e77f-7709-4eb8-bd08-3c8da4a56310.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/icefog72_Ice0.69-25.01-RP/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Ice0.69-25.01-RP", - "id": "icefog72/Ice0.69-25.01-RP", - "developer": "icefog72", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5438 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5098 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0566 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3129 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4486 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2965 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/icefog72/Ice0.7-29.09-RP/35937213-bb16-4935-9d92-9fa8fd61aac3.json b/data/hfopenllm_v2/icefog72/Ice0.7-29.09-RP/35937213-bb16-4935-9d92-9fa8fd61aac3.json deleted file mode 100644 index 88704a4a5..000000000 --- a/data/hfopenllm_v2/icefog72/Ice0.7-29.09-RP/35937213-bb16-4935-9d92-9fa8fd61aac3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/icefog72_Ice0.7-29.09-RP/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Ice0.7-29.09-RP", - "id": "icefog72/Ice0.7-29.09-RP", - "developer": "icefog72", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5176 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5048 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0665 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2878 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4238 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3127 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/icefog72/Ice0.70-25.01-RP/04122d1b-929d-439c-bb8d-f08508f7a00e.json b/data/hfopenllm_v2/icefog72/Ice0.70-25.01-RP/04122d1b-929d-439c-bb8d-f08508f7a00e.json deleted file mode 100644 index 57317f5c1..000000000 --- 
a/data/hfopenllm_v2/icefog72/Ice0.70-25.01-RP/04122d1b-929d-439c-bb8d-f08508f7a00e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/icefog72_Ice0.70-25.01-RP/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Ice0.70-25.01-RP", - "id": "icefog72/Ice0.70-25.01-RP", - "developer": "icefog72", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5498 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5136 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0597 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3079 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4512 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2996 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/icefog72/Ice0.70.1-01.02-RP/03beb242-2628-4ea0-a2f3-c3ec43d379de.json b/data/hfopenllm_v2/icefog72/Ice0.70.1-01.02-RP/03beb242-2628-4ea0-a2f3-c3ec43d379de.json deleted file mode 100644 index 72031f33b..000000000 --- a/data/hfopenllm_v2/icefog72/Ice0.70.1-01.02-RP/03beb242-2628-4ea0-a2f3-c3ec43d379de.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/icefog72_Ice0.70.1-01.02-RP/1770682486.623709", - "retrieved_timestamp": 
"1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Ice0.70.1-01.02-RP", - "id": "icefog72/Ice0.70.1-01.02-RP", - "developer": "icefog72", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.507 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.506 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.034 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2978 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4599 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2749 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/icefog72/Ice0.73-01.02-RP/46d55b7b-1972-4cb0-97ca-e04d306282a7.json b/data/hfopenllm_v2/icefog72/Ice0.73-01.02-RP/46d55b7b-1972-4cb0-97ca-e04d306282a7.json deleted file mode 100644 index 6f985e2ba..000000000 --- a/data/hfopenllm_v2/icefog72/Ice0.73-01.02-RP/46d55b7b-1972-4cb0-97ca-e04d306282a7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/icefog72_Ice0.73-01.02-RP/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Ice0.73-01.02-RP", - "id": 
"icefog72/Ice0.73-01.02-RP", - "developer": "icefog72", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5292 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5103 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0385 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2911 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4664 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2702 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/icefog72/Ice0.74-02.02-RP/32730d82-cfac-481f-9a22-9cbe40646218.json b/data/hfopenllm_v2/icefog72/Ice0.74-02.02-RP/32730d82-cfac-481f-9a22-9cbe40646218.json deleted file mode 100644 index db1bba27d..000000000 --- a/data/hfopenllm_v2/icefog72/Ice0.74-02.02-RP/32730d82-cfac-481f-9a22-9cbe40646218.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/icefog72_Ice0.74-02.02-RP/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Ice0.74-02.02-RP", - "id": "icefog72/Ice0.74-02.02-RP", - "developer": "icefog72", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - 
"evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2935 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4646 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0015 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2961 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.428 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2143 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/icefog72/Ice0.76-02.02-RP/a290a75f-753b-489d-87a2-ce0637c09f41.json b/data/hfopenllm_v2/icefog72/Ice0.76-02.02-RP/a290a75f-753b-489d-87a2-ce0637c09f41.json deleted file mode 100644 index 9c170c9cd..000000000 --- a/data/hfopenllm_v2/icefog72/Ice0.76-02.02-RP/a290a75f-753b-489d-87a2-ce0637c09f41.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/icefog72_Ice0.76-02.02-RP/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Ice0.76-02.02-RP", - "id": "icefog72/Ice0.76-02.02-RP", - "developer": "icefog72", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", 
- "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4529 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5086 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0144 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2869 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4362 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2652 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/icefog72/Ice0.77-02.02-RP/54032eb0-c4cd-4c76-be2e-f0c81bd26365.json b/data/hfopenllm_v2/icefog72/Ice0.77-02.02-RP/54032eb0-c4cd-4c76-be2e-f0c81bd26365.json deleted file mode 100644 index d50d5a736..000000000 --- a/data/hfopenllm_v2/icefog72/Ice0.77-02.02-RP/54032eb0-c4cd-4c76-be2e-f0c81bd26365.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/icefog72_Ice0.77-02.02-RP/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Ice0.77-02.02-RP", - "id": "icefog72/Ice0.77-02.02-RP", - "developer": "icefog72", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.531 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5109 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0393 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2852 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4765 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2999 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/icefog72/Ice0.78-02.02-RP/73b59506-cc1d-413c-a28b-d25e0e6bf413.json b/data/hfopenllm_v2/icefog72/Ice0.78-02.02-RP/73b59506-cc1d-413c-a28b-d25e0e6bf413.json deleted file mode 100644 index cbf1db27b..000000000 --- a/data/hfopenllm_v2/icefog72/Ice0.78-02.02-RP/73b59506-cc1d-413c-a28b-d25e0e6bf413.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/icefog72_Ice0.78-02.02-RP/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Ice0.78-02.02-RP", - "id": "icefog72/Ice0.78-02.02-RP", - "developer": "icefog72", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4053 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5002 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - 
"dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0438 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2936 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4686 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2955 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/icefog72/Ice0.80-03.02-RP/bea2dcd6-4772-4aac-bcbc-4802cfb33495.json b/data/hfopenllm_v2/icefog72/Ice0.80-03.02-RP/bea2dcd6-4772-4aac-bcbc-4802cfb33495.json deleted file mode 100644 index 1c2303ea6..000000000 --- a/data/hfopenllm_v2/icefog72/Ice0.80-03.02-RP/bea2dcd6-4772-4aac-bcbc-4802cfb33495.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/icefog72_Ice0.80-03.02-RP/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Ice0.80-03.02-RP", - "id": "icefog72/Ice0.80-03.02-RP", - "developer": "icefog72", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5516 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5098 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0559 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2785 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4923 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2912 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/icefog72/IceCocoaRP-7b/66275215-28e6-42bc-bc22-5d152682ce53.json b/data/hfopenllm_v2/icefog72/IceCocoaRP-7b/66275215-28e6-42bc-bc22-5d152682ce53.json deleted file mode 100644 index 481d7723a..000000000 --- a/data/hfopenllm_v2/icefog72/IceCocoaRP-7b/66275215-28e6-42bc-bc22-5d152682ce53.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/icefog72_IceCocoaRP-7b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "IceCocoaRP-7b", - "id": "icefog72/IceCocoaRP-7b", - "developer": "icefog72", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4962 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4938 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0574 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": 
"Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2953 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4198 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3098 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/icefog72/IceCoffeeRP-7b/9015365c-400b-4fa3-85f2-a1033b030cf7.json b/data/hfopenllm_v2/icefog72/IceCoffeeRP-7b/9015365c-400b-4fa3-85f2-a1033b030cf7.json deleted file mode 100644 index dd02fc052..000000000 --- a/data/hfopenllm_v2/icefog72/IceCoffeeRP-7b/9015365c-400b-4fa3-85f2-a1033b030cf7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/icefog72_IceCoffeeRP-7b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "IceCoffeeRP-7b", - "id": "icefog72/IceCoffeeRP-7b", - "developer": "icefog72", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4959 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4889 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0544 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2852 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - 
"hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.416 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2975 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/icefog72/IceDrinkByFrankensteinV3RP/55d52914-0904-4e6e-8b37-c22b06f5f2bf.json b/data/hfopenllm_v2/icefog72/IceDrinkByFrankensteinV3RP/55d52914-0904-4e6e-8b37-c22b06f5f2bf.json deleted file mode 100644 index ccf1b184c..000000000 --- a/data/hfopenllm_v2/icefog72/IceDrinkByFrankensteinV3RP/55d52914-0904-4e6e-8b37-c22b06f5f2bf.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/icefog72_IceDrinkByFrankensteinV3RP/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "IceDrinkByFrankensteinV3RP", - "id": "icefog72/IceDrinkByFrankensteinV3RP", - "developer": "icefog72", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4975 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4833 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0506 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2617 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.4253 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2927 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/icefog72/IceDrinkNameGoesHereRP-7b-Model_Stock/3677260a-2fd5-41bf-9010-f1b31cedacbc.json b/data/hfopenllm_v2/icefog72/IceDrinkNameGoesHereRP-7b-Model_Stock/3677260a-2fd5-41bf-9010-f1b31cedacbc.json deleted file mode 100644 index 24025b247..000000000 --- a/data/hfopenllm_v2/icefog72/IceDrinkNameGoesHereRP-7b-Model_Stock/3677260a-2fd5-41bf-9010-f1b31cedacbc.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/icefog72_IceDrinkNameGoesHereRP-7b-Model_Stock/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "IceDrinkNameGoesHereRP-7b-Model_Stock", - "id": "icefog72/IceDrinkNameGoesHereRP-7b-Model_Stock", - "developer": "icefog72", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4968 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4658 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0408 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2685 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4067 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - 
"source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2817 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/icefog72/IceDrinkNameNotFoundRP-7b-Model_Stock/fc54f87a-2e4a-4f3f-b407-e268c4487d16.json b/data/hfopenllm_v2/icefog72/IceDrinkNameNotFoundRP-7b-Model_Stock/fc54f87a-2e4a-4f3f-b407-e268c4487d16.json deleted file mode 100644 index 2746a283f..000000000 --- a/data/hfopenllm_v2/icefog72/IceDrinkNameNotFoundRP-7b-Model_Stock/fc54f87a-2e4a-4f3f-b407-e268c4487d16.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/icefog72_IceDrinkNameNotFoundRP-7b-Model_Stock/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "IceDrinkNameNotFoundRP-7b-Model_Stock", - "id": "icefog72/IceDrinkNameNotFoundRP-7b-Model_Stock", - "developer": "icefog72", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.513 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5026 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0604 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2777 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4372 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3064 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/icefog72/IceDrunkCherryRP-7b/8d893736-1707-4c0b-860d-16c62ec26d78.json b/data/hfopenllm_v2/icefog72/IceDrunkCherryRP-7b/8d893736-1707-4c0b-860d-16c62ec26d78.json deleted file mode 100644 index fdf69bdb5..000000000 --- a/data/hfopenllm_v2/icefog72/IceDrunkCherryRP-7b/8d893736-1707-4c0b-860d-16c62ec26d78.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/icefog72_IceDrunkCherryRP-7b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "IceDrunkCherryRP-7b", - "id": "icefog72/IceDrunkCherryRP-7b", - "developer": "icefog72", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4898 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4847 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0612 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2768 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4292 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3009 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/icefog72/IceDrunkenCherryRP-7b/d3d2728f-74bf-4196-a909-43797d8b628a.json b/data/hfopenllm_v2/icefog72/IceDrunkenCherryRP-7b/d3d2728f-74bf-4196-a909-43797d8b628a.json deleted file mode 100644 index 2dca8d588..000000000 --- a/data/hfopenllm_v2/icefog72/IceDrunkenCherryRP-7b/d3d2728f-74bf-4196-a909-43797d8b628a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/icefog72_IceDrunkenCherryRP-7b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "IceDrunkenCherryRP-7b", - "id": "icefog72/IceDrunkenCherryRP-7b", - "developer": "icefog72", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4763 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5093 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0642 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.307 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4446 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3099 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/icefog72/IceEspressoRPv2-7b/ed241e67-8718-48be-a6e8-19e295a2b5cd.json b/data/hfopenllm_v2/icefog72/IceEspressoRPv2-7b/ed241e67-8718-48be-a6e8-19e295a2b5cd.json deleted file mode 100644 index 1ca2a66f9..000000000 --- 
a/data/hfopenllm_v2/icefog72/IceEspressoRPv2-7b/ed241e67-8718-48be-a6e8-19e295a2b5cd.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/icefog72_IceEspressoRPv2-7b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "IceEspressoRPv2-7b", - "id": "icefog72/IceEspressoRPv2-7b", - "developer": "icefog72", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4977 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5055 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0619 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2894 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4331 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3061 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/icefog72/IceLemonTeaRP-32k-7b/05aafad3-e07a-453b-a70b-f18fbd4eb218.json b/data/hfopenllm_v2/icefog72/IceLemonTeaRP-32k-7b/05aafad3-e07a-453b-a70b-f18fbd4eb218.json deleted file mode 100644 index 751ec205c..000000000 --- a/data/hfopenllm_v2/icefog72/IceLemonTeaRP-32k-7b/05aafad3-e07a-453b-a70b-f18fbd4eb218.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/icefog72_IceLemonTeaRP-32k-7b/1770682486.623709", - 
"retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "IceLemonTeaRP-32k-7b", - "id": "icefog72/IceLemonTeaRP-32k-7b", - "developer": "icefog72", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5212 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4997 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0544 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2903 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.429 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3068 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/icefog72/IceMartiniRP-7b/f79ac32e-ab83-40c3-9c18-35623f5ae1d4.json b/data/hfopenllm_v2/icefog72/IceMartiniRP-7b/f79ac32e-ab83-40c3-9c18-35623f5ae1d4.json deleted file mode 100644 index e94d36e4e..000000000 --- a/data/hfopenllm_v2/icefog72/IceMartiniRP-7b/f79ac32e-ab83-40c3-9c18-35623f5ae1d4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/icefog72_IceMartiniRP-7b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": 
"IceMartiniRP-7b", - "id": "icefog72/IceMartiniRP-7b", - "developer": "icefog72", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5045 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4972 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0665 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2794 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4345 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3073 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/icefog72/IceNalyvkaRP-7b/cec76b15-1069-4d37-b8bc-74dde28101f6.json b/data/hfopenllm_v2/icefog72/IceNalyvkaRP-7b/cec76b15-1069-4d37-b8bc-74dde28101f6.json deleted file mode 100644 index f70b98ac1..000000000 --- a/data/hfopenllm_v2/icefog72/IceNalyvkaRP-7b/cec76b15-1069-4d37-b8bc-74dde28101f6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/icefog72_IceNalyvkaRP-7b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "IceNalyvkaRP-7b", - "id": "icefog72/IceNalyvkaRP-7b", - "developer": "icefog72", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { 
- "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5498 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5136 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0597 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3079 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4512 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2996 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/icefog72/IceSakeRP-7b/e4ac0d0c-65ea-4b43-bb4b-7371c6cd5d61.json b/data/hfopenllm_v2/icefog72/IceSakeRP-7b/e4ac0d0c-65ea-4b43-bb4b-7371c6cd5d61.json deleted file mode 100644 index 70a43f1aa..000000000 --- a/data/hfopenllm_v2/icefog72/IceSakeRP-7b/e4ac0d0c-65ea-4b43-bb4b-7371c6cd5d61.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/icefog72_IceSakeRP-7b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "IceSakeRP-7b", - "id": "icefog72/IceSakeRP-7b", - "developer": "icefog72", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.5228 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5119 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0634 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2852 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.413 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3177 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/icefog72/IceSakeV4RP-7b/f8d629bf-df0b-4c6a-8c18-17dda002b089.json b/data/hfopenllm_v2/icefog72/IceSakeV4RP-7b/f8d629bf-df0b-4c6a-8c18-17dda002b089.json deleted file mode 100644 index 620f3fe56..000000000 --- a/data/hfopenllm_v2/icefog72/IceSakeV4RP-7b/f8d629bf-df0b-4c6a-8c18-17dda002b089.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/icefog72_IceSakeV4RP-7b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "IceSakeV4RP-7b", - "id": "icefog72/IceSakeV4RP-7b", - "developer": "icefog72", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4634 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.493 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0559 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2945 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4082 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3103 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/icefog72/IceSakeV6RP-7b/6739d8e3-f4bd-4fd5-98f3-887f5ed3f9c0.json b/data/hfopenllm_v2/icefog72/IceSakeV6RP-7b/6739d8e3-f4bd-4fd5-98f3-887f5ed3f9c0.json deleted file mode 100644 index 477e0dbce..000000000 --- a/data/hfopenllm_v2/icefog72/IceSakeV6RP-7b/6739d8e3-f4bd-4fd5-98f3-887f5ed3f9c0.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/icefog72_IceSakeV6RP-7b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "IceSakeV6RP-7b", - "id": "icefog72/IceSakeV6RP-7b", - "developer": "icefog72", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5033 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4976 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - 
"hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0619 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2911 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.42 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3093 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/icefog72/IceSakeV8RP-7b/a51722f4-29f4-47a5-acba-4c8b5355551b.json b/data/hfopenllm_v2/icefog72/IceSakeV8RP-7b/a51722f4-29f4-47a5-acba-4c8b5355551b.json deleted file mode 100644 index 7a5af5dc8..000000000 --- a/data/hfopenllm_v2/icefog72/IceSakeV8RP-7b/a51722f4-29f4-47a5-acba-4c8b5355551b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/icefog72_IceSakeV8RP-7b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "IceSakeV8RP-7b", - "id": "icefog72/IceSakeV8RP-7b", - "developer": "icefog72", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6086 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4885 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0597 - } - }, - 
{ - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.276 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3993 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.301 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/icefog72/IceTea21EnergyDrinkRPV13-DPOv3.5/06d0a21f-f6e4-4ca9-a679-8c4502aaaad1.json b/data/hfopenllm_v2/icefog72/IceTea21EnergyDrinkRPV13-DPOv3.5/06d0a21f-f6e4-4ca9-a679-8c4502aaaad1.json deleted file mode 100644 index bf984e1a4..000000000 --- a/data/hfopenllm_v2/icefog72/IceTea21EnergyDrinkRPV13-DPOv3.5/06d0a21f-f6e4-4ca9-a679-8c4502aaaad1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/icefog72_IceTea21EnergyDrinkRPV13-DPOv3.5/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "IceTea21EnergyDrinkRPV13-DPOv3.5", - "id": "icefog72/IceTea21EnergyDrinkRPV13-DPOv3.5", - "developer": "icefog72", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4871 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.44 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0363 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2844 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3964 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2498 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/icefog72/IceTea21EnergyDrinkRPV13-DPOv3/04a4dcc9-3784-4aea-9faf-9db49c2e4c43.json b/data/hfopenllm_v2/icefog72/IceTea21EnergyDrinkRPV13-DPOv3/04a4dcc9-3784-4aea-9faf-9db49c2e4c43.json deleted file mode 100644 index 87ff25f1f..000000000 --- a/data/hfopenllm_v2/icefog72/IceTea21EnergyDrinkRPV13-DPOv3/04a4dcc9-3784-4aea-9faf-9db49c2e4c43.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/icefog72_IceTea21EnergyDrinkRPV13-DPOv3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "IceTea21EnergyDrinkRPV13-DPOv3", - "id": "icefog72/IceTea21EnergyDrinkRPV13-DPOv3", - "developer": "icefog72", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5263 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.502 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0582 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2836 
- } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4372 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3056 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ifable/gemma-2-Ifable-9B/e4668365-d3dd-4996-9bb1-5b4e6f510264.json b/data/hfopenllm_v2/ifable/gemma-2-Ifable-9B/e4668365-d3dd-4996-9bb1-5b4e6f510264.json deleted file mode 100644 index 186f9833c..000000000 --- a/data/hfopenllm_v2/ifable/gemma-2-Ifable-9B/e4668365-d3dd-4996-9bb1-5b4e6f510264.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ifable_gemma-2-Ifable-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "gemma-2-Ifable-9B", - "id": "ifable/gemma-2-Ifable-9B", - "developer": "ifable", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 9.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2984 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5866 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1397 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3414 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4053 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4226 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ilsp/Llama-Krikri-8B-Instruct/4d743678-e14d-4866-b1bf-0d660787847b.json b/data/hfopenllm_v2/ilsp/Llama-Krikri-8B-Instruct/4d743678-e14d-4866-b1bf-0d660787847b.json deleted file mode 100644 index 4ae435b53..000000000 --- a/data/hfopenllm_v2/ilsp/Llama-Krikri-8B-Instruct/4d743678-e14d-4866-b1bf-0d660787847b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ilsp_Llama-Krikri-8B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-Krikri-8B-Instruct", - "id": "ilsp/Llama-Krikri-8B-Instruct", - "developer": "ilsp", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.202 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6079 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5047 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1178 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3029 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.408 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": 
"TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3313 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/inflatebot/MN-12B-Mag-Mell-R1/720b1476-876c-47d1-bf46-d037389b4b2f.json b/data/hfopenllm_v2/inflatebot/MN-12B-Mag-Mell-R1/720b1476-876c-47d1-bf46-d037389b4b2f.json deleted file mode 100644 index 848ed7790..000000000 --- a/data/hfopenllm_v2/inflatebot/MN-12B-Mag-Mell-R1/720b1476-876c-47d1-bf46-d037389b4b2f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/inflatebot_MN-12B-Mag-Mell-R1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MN-12B-Mag-Mell-R1", - "id": "inflatebot/MN-12B-Mag-Mell-R1", - "developer": "inflatebot", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4613 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5304 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1299 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3163 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4002 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3438 - } 
- } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/informatiker/Qwen2-7B-Instruct-abliterated/4e4f3b2d-5b17-486a-a2ab-c2e89194c765.json b/data/hfopenllm_v2/informatiker/Qwen2-7B-Instruct-abliterated/4e4f3b2d-5b17-486a-a2ab-c2e89194c765.json deleted file mode 100644 index 69b688c62..000000000 --- a/data/hfopenllm_v2/informatiker/Qwen2-7B-Instruct-abliterated/4e4f3b2d-5b17-486a-a2ab-c2e89194c765.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/informatiker_Qwen2-7B-Instruct-abliterated/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2-7B-Instruct-abliterated", - "id": "informatiker/Qwen2-7B-Instruct-abliterated", - "developer": "informatiker", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5822 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5534 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2636 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3012 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3888 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3873 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/insightfactory/Llama-3.2-3B-Instruct-unsloth-bnb-4bitlora_model/b738668e-3ac1-4a36-ad71-ad7d2a5256ae.json b/data/hfopenllm_v2/insightfactory/Llama-3.2-3B-Instruct-unsloth-bnb-4bitlora_model/b738668e-3ac1-4a36-ad71-ad7d2a5256ae.json deleted file mode 100644 index 35ea90e61..000000000 --- a/data/hfopenllm_v2/insightfactory/Llama-3.2-3B-Instruct-unsloth-bnb-4bitlora_model/b738668e-3ac1-4a36-ad71-ad7d2a5256ae.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/insightfactory_Llama-3.2-3B-Instruct-unsloth-bnb-4bitlora_model/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.2-3B-Instruct-unsloth-bnb-4bitlora_model", - "id": "insightfactory/Llama-3.2-3B-Instruct-unsloth-bnb-4bitlora_model", - "developer": "insightfactory", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "", - "params_billions": 1.933 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4588 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4146 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.105 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2718 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3499 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.296 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/instruction-pretrain/InstructLM-500M/623f1b73-1505-4527-b41c-dcb2b711226d.json b/data/hfopenllm_v2/instruction-pretrain/InstructLM-500M/623f1b73-1505-4527-b41c-dcb2b711226d.json deleted file mode 100644 index 250a5012f..000000000 --- a/data/hfopenllm_v2/instruction-pretrain/InstructLM-500M/623f1b73-1505-4527-b41c-dcb2b711226d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/instruction-pretrain_InstructLM-500M/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "InstructLM-500M", - "id": "instruction-pretrain/InstructLM-500M", - "developer": "instruction-pretrain", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 0.5 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1028 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2941 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2567 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3528 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1141 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/internlm/internlm2-1_8b/53f03454-9587-4208-bc01-21de62f59195.json b/data/hfopenllm_v2/internlm/internlm2-1_8b/53f03454-9587-4208-bc01-21de62f59195.json deleted file mode 100644 index 
112c6e318..000000000 --- a/data/hfopenllm_v2/internlm/internlm2-1_8b/53f03454-9587-4208-bc01-21de62f59195.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/internlm_internlm2-1_8b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "internlm2-1_8b", - "id": "internlm/internlm2-1_8b", - "developer": "internlm", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "InternLM2ForCausalLM", - "params_billions": 8.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2198 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.388 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0211 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2483 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3813 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1588 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/internlm/internlm2-7b/fb38d8b4-6320-4b8d-bf3d-e3d22bb0ed83.json b/data/hfopenllm_v2/internlm/internlm2-7b/fb38d8b4-6320-4b8d-bf3d-e3d22bb0ed83.json deleted file mode 100644 index 0954fe225..000000000 --- a/data/hfopenllm_v2/internlm/internlm2-7b/fb38d8b4-6320-4b8d-bf3d-e3d22bb0ed83.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/internlm_internlm2-7b/1770682486.623709", - "retrieved_timestamp": 
"1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "internlm2-7b", - "id": "internlm/internlm2-7b", - "developer": "internlm", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Unknown", - "params_billions": 0.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.228 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5825 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0857 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3367 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.44 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.19 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/internlm/internlm2-chat-1_8b/b127a923-3bf2-4cad-9225-d738efe800e3.json b/data/hfopenllm_v2/internlm/internlm2-chat-1_8b/b127a923-3bf2-4cad-9225-d738efe800e3.json deleted file mode 100644 index 0ea7fed94..000000000 --- a/data/hfopenllm_v2/internlm/internlm2-chat-1_8b/b127a923-3bf2-4cad-9225-d738efe800e3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/internlm_internlm2-chat-1_8b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "internlm2-chat-1_8b", - "id": 
"internlm/internlm2-chat-1_8b", - "developer": "internlm", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "InternLM2ForCausalLM", - "params_billions": 1.889 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2387 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4452 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0325 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2659 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3631 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1839 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/internlm/internlm2_5-1_8b-chat/a94ae52a-7936-4750-83f5-4740f23adf15.json b/data/hfopenllm_v2/internlm/internlm2_5-1_8b-chat/a94ae52a-7936-4750-83f5-4740f23adf15.json deleted file mode 100644 index 89f784e4a..000000000 --- a/data/hfopenllm_v2/internlm/internlm2_5-1_8b-chat/a94ae52a-7936-4750-83f5-4740f23adf15.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/internlm_internlm2_5-1_8b-chat/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "internlm2_5-1_8b-chat", - "id": "internlm/internlm2_5-1_8b-chat", - "developer": "internlm", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "InternLM2ForCausalLM", - "params_billions": 1.89 - } - }, - 
"evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3849 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4489 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1586 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2903 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3594 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1299 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/internlm/internlm2_5-20b-chat/95e689c6-cd19-4114-b3b5-1672ab849214.json b/data/hfopenllm_v2/internlm/internlm2_5-20b-chat/95e689c6-cd19-4114-b3b5-1672ab849214.json deleted file mode 100644 index c3237ef07..000000000 --- a/data/hfopenllm_v2/internlm/internlm2_5-20b-chat/95e689c6-cd19-4114-b3b5-1672ab849214.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/internlm_internlm2_5-20b-chat/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "internlm2_5-20b-chat", - "id": "internlm/internlm2_5-20b-chat", - "developer": "internlm", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "InternLM2ForCausalLM", - "params_billions": 19.86 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.701 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7474 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4079 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3213 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4558 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3998 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/internlm/internlm2_5-7b-chat/890a8414-bccf-4a66-8013-6c270d017965.json b/data/hfopenllm_v2/internlm/internlm2_5-7b-chat/890a8414-bccf-4a66-8013-6c270d017965.json deleted file mode 100644 index 39a2746e0..000000000 --- a/data/hfopenllm_v2/internlm/internlm2_5-7b-chat/890a8414-bccf-4a66-8013-6c270d017965.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/internlm_internlm2_5-7b-chat/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "internlm2_5-7b-chat", - "id": "internlm/internlm2_5-7b-chat", - "developer": "internlm", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "InternLM2ForCausalLM", - "params_billions": 7.738 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5539 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": 
"hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7073 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.253 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3473 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4594 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3777 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/intervitens/mini-magnum-12b-v1.1/0f8ce410-cf3b-4f78-81b9-a0a1fe91b963.json b/data/hfopenllm_v2/intervitens/mini-magnum-12b-v1.1/0f8ce410-cf3b-4f78-81b9-a0a1fe91b963.json deleted file mode 100644 index 37988d1da..000000000 --- a/data/hfopenllm_v2/intervitens/mini-magnum-12b-v1.1/0f8ce410-cf3b-4f78-81b9-a0a1fe91b963.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/intervitens_mini-magnum-12b-v1.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "mini-magnum-12b-v1.1", - "id": "intervitens/mini-magnum-12b-v1.1", - "developer": "intervitens", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5156 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.5062 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0619 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2886 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4004 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3291 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/inumulaisk/eval_model/121096cf-356b-4069-a0a3-8cf6aad52b81.json b/data/hfopenllm_v2/inumulaisk/eval_model/121096cf-356b-4069-a0a3-8cf6aad52b81.json deleted file mode 100644 index fd8c4c747..000000000 --- a/data/hfopenllm_v2/inumulaisk/eval_model/121096cf-356b-4069-a0a3-8cf6aad52b81.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/inumulaisk_eval_model/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "eval_model", - "id": "inumulaisk/eval_model", - "developer": "inumulaisk", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.777 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1931 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3512 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH 
Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2976 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2794 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.358 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1664 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/invalid-coder/Sakura-SOLAR-Instruct-CarbonVillain-en-10.7B-v2-slerp/fb0bcadf-32a0-4320-909f-2c38ba7d9372.json b/data/hfopenllm_v2/invalid-coder/Sakura-SOLAR-Instruct-CarbonVillain-en-10.7B-v2-slerp/fb0bcadf-32a0-4320-909f-2c38ba7d9372.json deleted file mode 100644 index 8bfc59d66..000000000 --- a/data/hfopenllm_v2/invalid-coder/Sakura-SOLAR-Instruct-CarbonVillain-en-10.7B-v2-slerp/fb0bcadf-32a0-4320-909f-2c38ba7d9372.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/invalid-coder_Sakura-SOLAR-Instruct-CarbonVillain-en-10.7B-v2-slerp/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Sakura-SOLAR-Instruct-CarbonVillain-en-10.7B-v2-slerp", - "id": "invalid-coder/Sakura-SOLAR-Instruct-CarbonVillain-en-10.7B-v2-slerp", - "developer": "invalid-coder", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 10.732 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4555 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5158 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0491 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3054 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3992 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3146 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/invisietch/EtherealRainbow-v0.2-8B/ab941c52-cf33-4b8e-87af-4a73930cf72a.json b/data/hfopenllm_v2/invisietch/EtherealRainbow-v0.2-8B/ab941c52-cf33-4b8e-87af-4a73930cf72a.json deleted file mode 100644 index a6bf367ef..000000000 --- a/data/hfopenllm_v2/invisietch/EtherealRainbow-v0.2-8B/ab941c52-cf33-4b8e-87af-4a73930cf72a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/invisietch_EtherealRainbow-v0.2-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "EtherealRainbow-v0.2-8B", - "id": "invisietch/EtherealRainbow-v0.2-8B", - "developer": "invisietch", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3903 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5102 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0823 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": 
"GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3029 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3827 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3653 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/invisietch/EtherealRainbow-v0.3-8B/08c242fd-0258-4817-970a-668584ed9385.json b/data/hfopenllm_v2/invisietch/EtherealRainbow-v0.3-8B/08c242fd-0258-4817-970a-668584ed9385.json deleted file mode 100644 index d5ccbfe08..000000000 --- a/data/hfopenllm_v2/invisietch/EtherealRainbow-v0.3-8B/08c242fd-0258-4817-970a-668584ed9385.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/invisietch_EtherealRainbow-v0.3-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "EtherealRainbow-v0.3-8B", - "id": "invisietch/EtherealRainbow-v0.3-8B", - "developer": "invisietch", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3682 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5097 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0763 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 
0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3045 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3904 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3626 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/invisietch/MiS-Firefly-v0.2-22B/2171af9a-be5e-4daf-8e67-a5239ccec7bd.json b/data/hfopenllm_v2/invisietch/MiS-Firefly-v0.2-22B/2171af9a-be5e-4daf-8e67-a5239ccec7bd.json deleted file mode 100644 index 501989bf4..000000000 --- a/data/hfopenllm_v2/invisietch/MiS-Firefly-v0.2-22B/2171af9a-be5e-4daf-8e67-a5239ccec7bd.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/invisietch_MiS-Firefly-v0.2-22B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MiS-Firefly-v0.2-22B", - "id": "invisietch/MiS-Firefly-v0.2-22B", - "developer": "invisietch", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 22.247 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5371 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5514 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1654 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3045 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4694 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.362 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/invisietch/Nimbus-Miqu-v0.1-70B/706f75a1-2f6b-47dd-809e-a830e739b574.json b/data/hfopenllm_v2/invisietch/Nimbus-Miqu-v0.1-70B/706f75a1-2f6b-47dd-809e-a830e739b574.json deleted file mode 100644 index 4802b7933..000000000 --- a/data/hfopenllm_v2/invisietch/Nimbus-Miqu-v0.1-70B/706f75a1-2f6b-47dd-809e-a830e739b574.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/invisietch_Nimbus-Miqu-v0.1-70B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Nimbus-Miqu-v0.1-70B", - "id": "invisietch/Nimbus-Miqu-v0.1-70B", - "developer": "invisietch", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 68.977 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4647 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.601 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0604 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3389 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4133 - } - }, - { - 
"evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3853 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/irahulpandey/mistralai-7B-slerp-v0.1/a9cd0399-4670-4f5c-8c64-c82dac97cd8c.json b/data/hfopenllm_v2/irahulpandey/mistralai-7B-slerp-v0.1/a9cd0399-4670-4f5c-8c64-c82dac97cd8c.json deleted file mode 100644 index 42cef2a10..000000000 --- a/data/hfopenllm_v2/irahulpandey/mistralai-7B-slerp-v0.1/a9cd0399-4670-4f5c-8c64-c82dac97cd8c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/irahulpandey_mistralai-7B-slerp-v0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "mistralai-7B-slerp-v0.1", - "id": "irahulpandey/mistralai-7B-slerp-v0.1", - "developer": "irahulpandey", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4966 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5011 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0514 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3037 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.455 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": 
"Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2951 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaredjoss/pythia-410m-roberta-lr_8e7-kl_01-steps_12000-rlhf-model/67cfd12d-0551-406d-bd1d-8ced75c69478.json b/data/hfopenllm_v2/jaredjoss/pythia-410m-roberta-lr_8e7-kl_01-steps_12000-rlhf-model/67cfd12d-0551-406d-bd1d-8ced75c69478.json deleted file mode 100644 index 6cad09ae3..000000000 --- a/data/hfopenllm_v2/jaredjoss/pythia-410m-roberta-lr_8e7-kl_01-steps_12000-rlhf-model/67cfd12d-0551-406d-bd1d-8ced75c69478.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaredjoss_pythia-410m-roberta-lr_8e7-kl_01-steps_12000-rlhf-model/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "pythia-410m-roberta-lr_8e7-kl_01-steps_12000-rlhf-model", - "id": "jaredjoss/pythia-410m-roberta-lr_8e7-kl_01-steps_12000-rlhf-model", - "developer": "jaredjoss", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "GPTNeoXForCausalLM", - "params_billions": 0.407 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1572 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2863 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2592 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3607 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1169 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/Auro-Kosmos-EVAA-v2-8B/0a31d2f0-196b-4508-861a-1ba7bd28ea23.json b/data/hfopenllm_v2/jaspionjader/Auro-Kosmos-EVAA-v2-8B/0a31d2f0-196b-4508-861a-1ba7bd28ea23.json deleted file mode 100644 index 8ebe97770..000000000 --- a/data/hfopenllm_v2/jaspionjader/Auro-Kosmos-EVAA-v2-8B/0a31d2f0-196b-4508-861a-1ba7bd28ea23.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_Auro-Kosmos-EVAA-v2-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Auro-Kosmos-EVAA-v2-8B", - "id": "jaspionjader/Auro-Kosmos-EVAA-v2-8B", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4778 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5447 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1412 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3154 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.425 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3858 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/jaspionjader/Auro-Kosmos-EVAA-v2.1-8B/57576999-2749-441a-91d6-5a976e83a658.json b/data/hfopenllm_v2/jaspionjader/Auro-Kosmos-EVAA-v2.1-8B/57576999-2749-441a-91d6-5a976e83a658.json deleted file mode 100644 index 315886d52..000000000 --- a/data/hfopenllm_v2/jaspionjader/Auro-Kosmos-EVAA-v2.1-8B/57576999-2749-441a-91d6-5a976e83a658.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_Auro-Kosmos-EVAA-v2.1-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Auro-Kosmos-EVAA-v2.1-8B", - "id": "jaspionjader/Auro-Kosmos-EVAA-v2.1-8B", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4666 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5444 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1458 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3087 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4317 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3826 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/Auro-Kosmos-EVAA-v2.2-8B/e44792e6-0329-4784-832b-3043478e70a4.json b/data/hfopenllm_v2/jaspionjader/Auro-Kosmos-EVAA-v2.2-8B/e44792e6-0329-4784-832b-3043478e70a4.json 
deleted file mode 100644 index 3152352b6..000000000 --- a/data/hfopenllm_v2/jaspionjader/Auro-Kosmos-EVAA-v2.2-8B/e44792e6-0329-4784-832b-3043478e70a4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_Auro-Kosmos-EVAA-v2.2-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Auro-Kosmos-EVAA-v2.2-8B", - "id": "jaspionjader/Auro-Kosmos-EVAA-v2.2-8B", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4268 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5431 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1412 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3104 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4251 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3798 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/Auro-Kosmos-EVAA-v2.3-8B/8b3789d6-51be-472a-95d3-2ae7c34ad140.json b/data/hfopenllm_v2/jaspionjader/Auro-Kosmos-EVAA-v2.3-8B/8b3789d6-51be-472a-95d3-2ae7c34ad140.json deleted file mode 100644 index f7a617f03..000000000 --- a/data/hfopenllm_v2/jaspionjader/Auro-Kosmos-EVAA-v2.3-8B/8b3789d6-51be-472a-95d3-2ae7c34ad140.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - 
"schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_Auro-Kosmos-EVAA-v2.3-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Auro-Kosmos-EVAA-v2.3-8B", - "id": "jaspionjader/Auro-Kosmos-EVAA-v2.3-8B", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4271 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5441 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1344 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3121 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4278 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3784 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/Kosmos-Aurora_faustus-8B/3f4765f2-551b-485f-9020-0cf17a36a887.json b/data/hfopenllm_v2/jaspionjader/Kosmos-Aurora_faustus-8B/3f4765f2-551b-485f-9020-0cf17a36a887.json deleted file mode 100644 index 0ae45d0a4..000000000 --- a/data/hfopenllm_v2/jaspionjader/Kosmos-Aurora_faustus-8B/3f4765f2-551b-485f-9020-0cf17a36a887.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_Kosmos-Aurora_faustus-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - 
"source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Kosmos-Aurora_faustus-8B", - "id": "jaspionjader/Kosmos-Aurora_faustus-8B", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4432 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.526 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1125 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2953 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4117 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3813 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-8B/6375a845-5d86-4dcf-bfd2-e836daa4ca11.json b/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-8B/6375a845-5d86-4dcf-bfd2-e836daa4ca11.json deleted file mode 100644 index f9aae6a67..000000000 --- a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-8B/6375a845-5d86-4dcf-bfd2-e836daa4ca11.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_Kosmos-EVAA-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Kosmos-EVAA-8B", - "id": 
"jaspionjader/Kosmos-EVAA-8B", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4405 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5312 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1178 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2995 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4237 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3818 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-Franken-Immersive-v39-8B/65a74446-6964-4f5f-8ea6-aeb1b09595ae.json b/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-Franken-Immersive-v39-8B/65a74446-6964-4f5f-8ea6-aeb1b09595ae.json deleted file mode 100644 index 79154e34e..000000000 --- a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-Franken-Immersive-v39-8B/65a74446-6964-4f5f-8ea6-aeb1b09595ae.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_Kosmos-EVAA-Franken-Immersive-v39-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Kosmos-EVAA-Franken-Immersive-v39-8B", - "id": "jaspionjader/Kosmos-EVAA-Franken-Immersive-v39-8B", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - 
"precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4378 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.519 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1292 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3154 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4236 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.39 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-Franken-v38-8B/dcba5998-3b84-4753-a4fa-2558ffe3e69b.json b/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-Franken-v38-8B/dcba5998-3b84-4753-a4fa-2558ffe3e69b.json deleted file mode 100644 index 394557379..000000000 --- a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-Franken-v38-8B/dcba5998-3b84-4753-a4fa-2558ffe3e69b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_Kosmos-EVAA-Franken-v38-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Kosmos-EVAA-Franken-v38-8B", - "id": "jaspionjader/Kosmos-EVAA-Franken-v38-8B", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - 
"dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4356 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.523 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1292 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3087 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4212 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.389 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-Fusion-8B/0af6b3c0-6638-4bd8-bdd9-349e2b9ca71c.json b/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-Fusion-8B/0af6b3c0-6638-4bd8-bdd9-349e2b9ca71c.json deleted file mode 100644 index d05e11595..000000000 --- a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-Fusion-8B/0af6b3c0-6638-4bd8-bdd9-349e2b9ca71c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_Kosmos-EVAA-Fusion-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Kosmos-EVAA-Fusion-8B", - "id": "jaspionjader/Kosmos-EVAA-Fusion-8B", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", 
- "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4345 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5419 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1292 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3087 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4277 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3854 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-Fusion-8B/4e332594-d0b9-4913-9950-208abe4faab7.json b/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-Fusion-8B/4e332594-d0b9-4913-9950-208abe4faab7.json deleted file mode 100644 index eb9755b5c..000000000 --- a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-Fusion-8B/4e332594-d0b9-4913-9950-208abe4faab7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_Kosmos-EVAA-Fusion-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Kosmos-EVAA-Fusion-8B", - "id": "jaspionjader/Kosmos-EVAA-Fusion-8B", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4418 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": 
"SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5406 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1352 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3062 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4277 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.386 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-PRP-8B/5ad2ad73-47ed-465d-b4c0-b358e6b6435f.json b/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-PRP-8B/5ad2ad73-47ed-465d-b4c0-b358e6b6435f.json deleted file mode 100644 index b95e94419..000000000 --- a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-PRP-8B/5ad2ad73-47ed-465d-b4c0-b358e6b6435f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_Kosmos-EVAA-PRP-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Kosmos-EVAA-PRP-8B", - "id": "jaspionjader/Kosmos-EVAA-PRP-8B", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3405 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5196 - } - 
}, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0884 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3129 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4301 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3647 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-PRP-light-8B/c9f716ef-0aa6-445f-8fc9-b102f3a0ea2a.json b/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-PRP-light-8B/c9f716ef-0aa6-445f-8fc9-b102f3a0ea2a.json deleted file mode 100644 index 8874fb70f..000000000 --- a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-PRP-light-8B/c9f716ef-0aa6-445f-8fc9-b102f3a0ea2a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_Kosmos-EVAA-PRP-light-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Kosmos-EVAA-PRP-light-8B", - "id": "jaspionjader/Kosmos-EVAA-PRP-light-8B", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3824 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5271 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - 
"metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1103 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3121 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4249 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3782 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-PRP-v23-8B/a2e32a77-867c-4921-ada4-c7b169efbebe.json b/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-PRP-v23-8B/a2e32a77-867c-4921-ada4-c7b169efbebe.json deleted file mode 100644 index 3407edf36..000000000 --- a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-PRP-v23-8B/a2e32a77-867c-4921-ada4-c7b169efbebe.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_Kosmos-EVAA-PRP-v23-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Kosmos-EVAA-PRP-v23-8B", - "id": "jaspionjader/Kosmos-EVAA-PRP-v23-8B", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4041 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.529 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 
0.1156 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3087 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4368 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3706 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-PRP-v24-8B/f76f759f-d05d-4eb6-a2b9-3b1dfbe840f0.json b/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-PRP-v24-8B/f76f759f-d05d-4eb6-a2b9-3b1dfbe840f0.json deleted file mode 100644 index 269ec5b3a..000000000 --- a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-PRP-v24-8B/f76f759f-d05d-4eb6-a2b9-3b1dfbe840f0.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_Kosmos-EVAA-PRP-v24-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Kosmos-EVAA-PRP-v24-8B", - "id": "jaspionjader/Kosmos-EVAA-PRP-v24-8B", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4259 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5276 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1103 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": 
"Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3104 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.429 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3779 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-PRP-v25-8B/ece0bd6b-4eec-485c-942b-e23f3295c2f8.json b/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-PRP-v25-8B/ece0bd6b-4eec-485c-942b-e23f3295c2f8.json deleted file mode 100644 index c4a81c92e..000000000 --- a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-PRP-v25-8B/ece0bd6b-4eec-485c-942b-e23f3295c2f8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_Kosmos-EVAA-PRP-v25-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Kosmos-EVAA-PRP-v25-8B", - "id": "jaspionjader/Kosmos-EVAA-PRP-v25-8B", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4421 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5291 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1186 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.318 - } - }, - { - "evaluation_name": "MUSR", - 
"source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4303 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3716 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-PRP-v26-8B/ada110bb-0988-4c19-9798-74577dde5ce9.json b/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-PRP-v26-8B/ada110bb-0988-4c19-9798-74577dde5ce9.json deleted file mode 100644 index fb137b1ab..000000000 --- a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-PRP-v26-8B/ada110bb-0988-4c19-9798-74577dde5ce9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_Kosmos-EVAA-PRP-v26-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Kosmos-EVAA-PRP-v26-8B", - "id": "jaspionjader/Kosmos-EVAA-PRP-v26-8B", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4414 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5271 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1133 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3045 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4264 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3793 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-PRP-v27-8B/ed4f994d-d196-40bd-8f8f-f6a7f07c3c90.json b/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-PRP-v27-8B/ed4f994d-d196-40bd-8f8f-f6a7f07c3c90.json deleted file mode 100644 index 3a868b97e..000000000 --- a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-PRP-v27-8B/ed4f994d-d196-40bd-8f8f-f6a7f07c3c90.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_Kosmos-EVAA-PRP-v27-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Kosmos-EVAA-PRP-v27-8B", - "id": "jaspionjader/Kosmos-EVAA-PRP-v27-8B", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4378 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.529 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1193 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3087 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4343 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - 
"source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3755 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-PRP-v28-8B/57395f9a-0534-453e-80fc-96e9dc5cd9c3.json b/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-PRP-v28-8B/57395f9a-0534-453e-80fc-96e9dc5cd9c3.json deleted file mode 100644 index 9e9f72bfb..000000000 --- a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-PRP-v28-8B/57395f9a-0534-453e-80fc-96e9dc5cd9c3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_Kosmos-EVAA-PRP-v28-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Kosmos-EVAA-PRP-v28-8B", - "id": "jaspionjader/Kosmos-EVAA-PRP-v28-8B", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4366 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5295 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1171 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.307 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.433 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 
0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.375 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-PRP-v29-8B/f8f70702-9ab4-4e1a-a11d-090627d58f02.json b/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-PRP-v29-8B/f8f70702-9ab4-4e1a-a11d-090627d58f02.json deleted file mode 100644 index 444db8839..000000000 --- a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-PRP-v29-8B/f8f70702-9ab4-4e1a-a11d-090627d58f02.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_Kosmos-EVAA-PRP-v29-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Kosmos-EVAA-PRP-v29-8B", - "id": "jaspionjader/Kosmos-EVAA-PRP-v29-8B", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4487 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5275 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1201 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3104 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4237 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3765 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-PRP-v30-8B/3cab8bda-bdf6-4345-b89e-18d34a8f6361.json b/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-PRP-v30-8B/3cab8bda-bdf6-4345-b89e-18d34a8f6361.json deleted file mode 100644 index 325f0d8d0..000000000 --- a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-PRP-v30-8B/3cab8bda-bdf6-4345-b89e-18d34a8f6361.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_Kosmos-EVAA-PRP-v30-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Kosmos-EVAA-PRP-v30-8B", - "id": "jaspionjader/Kosmos-EVAA-PRP-v30-8B", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4295 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5328 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1178 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3045 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4263 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3938 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-PRP-v31-8B/0955fc17-8878-401a-9ec3-149528ee51e1.json b/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-PRP-v31-8B/0955fc17-8878-401a-9ec3-149528ee51e1.json deleted file 
mode 100644 index 4cfc98e57..000000000 --- a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-PRP-v31-8B/0955fc17-8878-401a-9ec3-149528ee51e1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_Kosmos-EVAA-PRP-v31-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Kosmos-EVAA-PRP-v31-8B", - "id": "jaspionjader/Kosmos-EVAA-PRP-v31-8B", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4399 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5315 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1133 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3138 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4251 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3935 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-PRP-v32-8B/c63bf49a-e7d4-4853-8684-9cc03eaa7840.json b/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-PRP-v32-8B/c63bf49a-e7d4-4853-8684-9cc03eaa7840.json deleted file mode 100644 index ddd83d684..000000000 --- a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-PRP-v32-8B/c63bf49a-e7d4-4853-8684-9cc03eaa7840.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - 
"evaluation_id": "hfopenllm_v2/jaspionjader_Kosmos-EVAA-PRP-v32-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Kosmos-EVAA-PRP-v32-8B", - "id": "jaspionjader/Kosmos-EVAA-PRP-v32-8B", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4487 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5293 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1148 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3163 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4211 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3777 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-PRP-v33-8B/65e6a3b6-4291-4591-bc0b-576930061c68.json b/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-PRP-v33-8B/65e6a3b6-4291-4591-bc0b-576930061c68.json deleted file mode 100644 index 48e58506f..000000000 --- a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-PRP-v33-8B/65e6a3b6-4291-4591-bc0b-576930061c68.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_Kosmos-EVAA-PRP-v33-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": 
"documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Kosmos-EVAA-PRP-v33-8B", - "id": "jaspionjader/Kosmos-EVAA-PRP-v33-8B", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4302 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5321 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1178 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3129 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4184 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3909 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-PRP-v34-8B/1ddf9e02-4066-440e-a777-fcd3f96bc4b3.json b/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-PRP-v34-8B/1ddf9e02-4066-440e-a777-fcd3f96bc4b3.json deleted file mode 100644 index 89f19eb09..000000000 --- a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-PRP-v34-8B/1ddf9e02-4066-440e-a777-fcd3f96bc4b3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_Kosmos-EVAA-PRP-v34-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Kosmos-EVAA-PRP-v34-8B", - "id": "jaspionjader/Kosmos-EVAA-PRP-v34-8B", - 
"developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4563 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5333 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1125 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3112 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4237 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3927 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-TSN-8B/f9f96bb2-edbc-4112-97aa-a7420dea32a1.json b/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-TSN-8B/f9f96bb2-edbc-4112-97aa-a7420dea32a1.json deleted file mode 100644 index f496d05c5..000000000 --- a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-TSN-8B/f9f96bb2-edbc-4112-97aa-a7420dea32a1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_Kosmos-EVAA-TSN-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Kosmos-EVAA-TSN-8B", - "id": "jaspionjader/Kosmos-EVAA-TSN-8B", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - 
"evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4721 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5177 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1344 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3029 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4329 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3816 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-TSN-light-8B/3a24b30f-7698-4ecb-ac26-3537a0b38616.json b/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-TSN-light-8B/3a24b30f-7698-4ecb-ac26-3537a0b38616.json deleted file mode 100644 index 6959b532b..000000000 --- a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-TSN-light-8B/3a24b30f-7698-4ecb-ac26-3537a0b38616.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_Kosmos-EVAA-TSN-light-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Kosmos-EVAA-TSN-light-8B", - "id": "jaspionjader/Kosmos-EVAA-TSN-light-8B", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy 
on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4685 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5235 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1216 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3045 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4289 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3806 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-TSN-v19-8B/d4030df6-2be6-4f46-9c9b-ce3037b9a004.json b/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-TSN-v19-8B/d4030df6-2be6-4f46-9c9b-ce3037b9a004.json deleted file mode 100644 index e3426247a..000000000 --- a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-TSN-v19-8B/d4030df6-2be6-4f46-9c9b-ce3037b9a004.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_Kosmos-EVAA-TSN-v19-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Kosmos-EVAA-TSN-v19-8B", - "id": "jaspionjader/Kosmos-EVAA-TSN-v19-8B", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4564 - } - }, - { - "evaluation_name": "BBH", - "source_data": { 
- "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5316 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1156 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3054 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4277 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.379 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-TSN-v20-8B/ec234403-f43d-46a0-84a4-ab47673226b3.json b/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-TSN-v20-8B/ec234403-f43d-46a0-84a4-ab47673226b3.json deleted file mode 100644 index 3081940fd..000000000 --- a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-TSN-v20-8B/ec234403-f43d-46a0-84a4-ab47673226b3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_Kosmos-EVAA-TSN-v20-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Kosmos-EVAA-TSN-v20-8B", - "id": "jaspionjader/Kosmos-EVAA-TSN-v20-8B", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4423 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.525 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1246 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3138 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.421 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3936 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-TSN-v21-8B/805379f4-784f-4602-92e8-180df4da9fc3.json b/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-TSN-v21-8B/805379f4-784f-4602-92e8-180df4da9fc3.json deleted file mode 100644 index 3d18c5f8b..000000000 --- a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-TSN-v21-8B/805379f4-784f-4602-92e8-180df4da9fc3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_Kosmos-EVAA-TSN-v21-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Kosmos-EVAA-TSN-v21-8B", - "id": "jaspionjader/Kosmos-EVAA-TSN-v21-8B", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.467 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5248 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - 
"source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1193 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3121 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4343 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3816 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-TSN-v22-8B/9f3920aa-9400-46f1-bcfa-969f69b3335c.json b/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-TSN-v22-8B/9f3920aa-9400-46f1-bcfa-969f69b3335c.json deleted file mode 100644 index c54dd019f..000000000 --- a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-TSN-v22-8B/9f3920aa-9400-46f1-bcfa-969f69b3335c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_Kosmos-EVAA-TSN-v22-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Kosmos-EVAA-TSN-v22-8B", - "id": "jaspionjader/Kosmos-EVAA-TSN-v22-8B", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4673 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5246 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1133 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.307 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4303 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3812 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-gamma-8B/26cbf444-ab93-409a-b85d-e2bd267eae5e.json b/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-gamma-8B/26cbf444-ab93-409a-b85d-e2bd267eae5e.json deleted file mode 100644 index 92fa310e3..000000000 --- a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-gamma-8B/26cbf444-ab93-409a-b85d-e2bd267eae5e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_Kosmos-EVAA-gamma-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Kosmos-EVAA-gamma-8B", - "id": "jaspionjader/Kosmos-EVAA-gamma-8B", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4572 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5322 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.105 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - 
"hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3188 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4306 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3901 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-gamma-alt-8B/7c2b17a8-1de2-4441-a281-fe3fd043f831.json b/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-gamma-alt-8B/7c2b17a8-1de2-4441-a281-fe3fd043f831.json deleted file mode 100644 index 90cb19b6b..000000000 --- a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-gamma-alt-8B/7c2b17a8-1de2-4441-a281-fe3fd043f831.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_Kosmos-EVAA-gamma-alt-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Kosmos-EVAA-gamma-alt-8B", - "id": "jaspionjader/Kosmos-EVAA-gamma-alt-8B", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4542 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5298 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1095 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 
- }, - "score_details": { - "score": 0.3247 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4292 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3896 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-gamma-light-8B/94c5756c-cbde-46e2-90d2-207678373061.json b/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-gamma-light-8B/94c5756c-cbde-46e2-90d2-207678373061.json deleted file mode 100644 index 980ba2f9f..000000000 --- a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-gamma-light-8B/94c5756c-cbde-46e2-90d2-207678373061.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_Kosmos-EVAA-gamma-light-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Kosmos-EVAA-gamma-light-8B", - "id": "jaspionjader/Kosmos-EVAA-gamma-light-8B", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4581 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5376 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1103 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3163 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": 
"TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4291 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3943 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-gamma-light-alt-8B/e0048124-89bf-4327-88a8-00aa51ee29af.json b/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-gamma-light-alt-8B/e0048124-89bf-4327-88a8-00aa51ee29af.json deleted file mode 100644 index e1956821f..000000000 --- a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-gamma-light-alt-8B/e0048124-89bf-4327-88a8-00aa51ee29af.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_Kosmos-EVAA-gamma-light-alt-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Kosmos-EVAA-gamma-light-alt-8B", - "id": "jaspionjader/Kosmos-EVAA-gamma-light-alt-8B", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4454 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5327 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1133 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3138 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 
0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4305 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3923 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-gamma-ultra-light-8B/9d776307-43af-43bb-ab64-52fb7f331cfe.json b/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-gamma-ultra-light-8B/9d776307-43af-43bb-ab64-52fb7f331cfe.json deleted file mode 100644 index 2646a813c..000000000 --- a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-gamma-ultra-light-8B/9d776307-43af-43bb-ab64-52fb7f331cfe.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_Kosmos-EVAA-gamma-ultra-light-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Kosmos-EVAA-gamma-ultra-light-8B", - "id": "jaspionjader/Kosmos-EVAA-gamma-ultra-light-8B", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4563 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5316 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1178 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3163 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4197 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": 
"MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3915 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-gamma-v13-8B/d8d41981-a7c8-48e9-a63c-86520a0f23d5.json b/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-gamma-v13-8B/d8d41981-a7c8-48e9-a63c-86520a0f23d5.json deleted file mode 100644 index d1b28b8b8..000000000 --- a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-gamma-v13-8B/d8d41981-a7c8-48e9-a63c-86520a0f23d5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_Kosmos-EVAA-gamma-v13-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Kosmos-EVAA-gamma-v13-8B", - "id": "jaspionjader/Kosmos-EVAA-gamma-v13-8B", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4429 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5359 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1118 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3138 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4278 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.393 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-gamma-v14-8B/1355985c-fbcb-4eac-8435-417d6034f2f0.json b/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-gamma-v14-8B/1355985c-fbcb-4eac-8435-417d6034f2f0.json deleted file mode 100644 index 8087ad74c..000000000 --- a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-gamma-v14-8B/1355985c-fbcb-4eac-8435-417d6034f2f0.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_Kosmos-EVAA-gamma-v14-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Kosmos-EVAA-gamma-v14-8B", - "id": "jaspionjader/Kosmos-EVAA-gamma-v14-8B", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.438 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5363 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1103 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3129 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4277 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3931 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-gamma-v15-8B/44486b02-7bdd-4f59-8d4e-5c8deeb1fd60.json b/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-gamma-v15-8B/44486b02-7bdd-4f59-8d4e-5c8deeb1fd60.json deleted file mode 100644 index a02407d4d..000000000 --- a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-gamma-v15-8B/44486b02-7bdd-4f59-8d4e-5c8deeb1fd60.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_Kosmos-EVAA-gamma-v15-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Kosmos-EVAA-gamma-v15-8B", - "id": "jaspionjader/Kosmos-EVAA-gamma-v15-8B", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4654 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5343 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.111 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3112 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4277 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3941 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-gamma-v16-8B/45ae3dc3-6dc0-4d10-99cb-a7f330110906.json b/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-gamma-v16-8B/45ae3dc3-6dc0-4d10-99cb-a7f330110906.json 
deleted file mode 100644 index 703c586b6..000000000 --- a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-gamma-v16-8B/45ae3dc3-6dc0-4d10-99cb-a7f330110906.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_Kosmos-EVAA-gamma-v16-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Kosmos-EVAA-gamma-v16-8B", - "id": "jaspionjader/Kosmos-EVAA-gamma-v16-8B", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4557 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5344 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1171 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3154 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4264 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3917 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-gamma-v17-8B/6b54763a-6329-47fb-bf50-296604251b47.json b/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-gamma-v17-8B/6b54763a-6329-47fb-bf50-296604251b47.json deleted file mode 100644 index d04ea2204..000000000 --- a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-gamma-v17-8B/6b54763a-6329-47fb-bf50-296604251b47.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - 
"schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_Kosmos-EVAA-gamma-v17-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Kosmos-EVAA-gamma-v17-8B", - "id": "jaspionjader/Kosmos-EVAA-gamma-v17-8B", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4462 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5347 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.111 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3112 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4291 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3923 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-gamma-v18-8B/96a26bf3-b4b2-465f-8ce6-a2ef943c001a.json b/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-gamma-v18-8B/96a26bf3-b4b2-465f-8ce6-a2ef943c001a.json deleted file mode 100644 index c9226fc57..000000000 --- a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-gamma-v18-8B/96a26bf3-b4b2-465f-8ce6-a2ef943c001a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_Kosmos-EVAA-gamma-v18-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - 
"source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Kosmos-EVAA-gamma-v18-8B", - "id": "jaspionjader/Kosmos-EVAA-gamma-v18-8B", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4341 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5339 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.111 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3112 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4317 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3905 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-immersive-sof-v44-8B/655b047f-c3a8-4c9c-b864-81d318b2f506.json b/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-immersive-sof-v44-8B/655b047f-c3a8-4c9c-b864-81d318b2f506.json deleted file mode 100644 index 53ac1daaf..000000000 --- a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-immersive-sof-v44-8B/655b047f-c3a8-4c9c-b864-81d318b2f506.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_Kosmos-EVAA-immersive-sof-v44-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - 
"model_info": { - "name": "Kosmos-EVAA-immersive-sof-v44-8B", - "id": "jaspionjader/Kosmos-EVAA-immersive-sof-v44-8B", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4408 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5215 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1186 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3096 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4144 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3888 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-v10-8B/f62fed77-e166-422d-b5ce-c50b7bccbf4c.json b/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-v10-8B/f62fed77-e166-422d-b5ce-c50b7bccbf4c.json deleted file mode 100644 index 5c80d82d6..000000000 --- a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-v10-8B/f62fed77-e166-422d-b5ce-c50b7bccbf4c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_Kosmos-EVAA-v10-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Kosmos-EVAA-v10-8B", - "id": "jaspionjader/Kosmos-EVAA-v10-8B", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": 
"bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4262 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5376 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1246 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2995 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4224 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3831 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-v11-8B/7ffdabf3-0a8e-4316-b6bd-85b10a81db53.json b/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-v11-8B/7ffdabf3-0a8e-4316-b6bd-85b10a81db53.json deleted file mode 100644 index ce89b68ef..000000000 --- a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-v11-8B/7ffdabf3-0a8e-4316-b6bd-85b10a81db53.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_Kosmos-EVAA-v11-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Kosmos-EVAA-v11-8B", - "id": "jaspionjader/Kosmos-EVAA-v11-8B", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": 
"google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4426 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5359 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1322 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3154 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4184 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3836 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-v12-8B/2c93c987-b32d-4a02-8df4-949cc45b8eb2.json b/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-v12-8B/2c93c987-b32d-4a02-8df4-949cc45b8eb2.json deleted file mode 100644 index cc567f9c9..000000000 --- a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-v12-8B/2c93c987-b32d-4a02-8df4-949cc45b8eb2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_Kosmos-EVAA-v12-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Kosmos-EVAA-v12-8B", - "id": "jaspionjader/Kosmos-EVAA-v12-8B", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4378 - } 
- }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5349 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1367 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3096 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4211 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3836 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-v2-8B/02e7c1d6-9db1-4de8-b13e-afd752b3669a.json b/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-v2-8B/02e7c1d6-9db1-4de8-b13e-afd752b3669a.json deleted file mode 100644 index 308d1927e..000000000 --- a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-v2-8B/02e7c1d6-9db1-4de8-b13e-afd752b3669a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_Kosmos-EVAA-v2-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Kosmos-EVAA-v2-8B", - "id": "jaspionjader/Kosmos-EVAA-v2-8B", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4396 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": 
false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5341 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1322 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2978 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4211 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3826 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-v3-8B/580a3045-338a-47b2-8ed7-54c993d5aa90.json b/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-v3-8B/580a3045-338a-47b2-8ed7-54c993d5aa90.json deleted file mode 100644 index f03f0b4c8..000000000 --- a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-v3-8B/580a3045-338a-47b2-8ed7-54c993d5aa90.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_Kosmos-EVAA-v3-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Kosmos-EVAA-v3-8B", - "id": "jaspionjader/Kosmos-EVAA-v3-8B", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4411 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5331 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": 
"hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1329 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3054 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4224 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3821 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-v4-8B/e71d3be5-ea9d-4426-aa58-5806b7541aa6.json b/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-v4-8B/e71d3be5-ea9d-4426-aa58-5806b7541aa6.json deleted file mode 100644 index c7bfce252..000000000 --- a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-v4-8B/e71d3be5-ea9d-4426-aa58-5806b7541aa6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_Kosmos-EVAA-v4-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Kosmos-EVAA-v4-8B", - "id": "jaspionjader/Kosmos-EVAA-v4-8B", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4289 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5337 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 
1.0 - }, - "score_details": { - "score": 0.1254 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3003 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4197 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3817 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-v5-8B/1174683a-9488-4c6b-be6b-e5a96328a96f.json b/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-v5-8B/1174683a-9488-4c6b-be6b-e5a96328a96f.json deleted file mode 100644 index d9d42b842..000000000 --- a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-v5-8B/1174683a-9488-4c6b-be6b-e5a96328a96f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_Kosmos-EVAA-v5-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Kosmos-EVAA-v5-8B", - "id": "jaspionjader/Kosmos-EVAA-v5-8B", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.446 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5345 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1261 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3037 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4224 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3821 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-v6-8B/3789b37f-daf0-4c21-82b8-309cbf00312e.json b/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-v6-8B/3789b37f-daf0-4c21-82b8-309cbf00312e.json deleted file mode 100644 index e333bd78c..000000000 --- a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-v6-8B/3789b37f-daf0-4c21-82b8-309cbf00312e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_Kosmos-EVAA-v6-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Kosmos-EVAA-v6-8B", - "id": "jaspionjader/Kosmos-EVAA-v6-8B", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4396 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.538 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1292 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.302 - } - }, - { - "evaluation_name": "MUSR", - 
"source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4184 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3821 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-v7-8B/8586cdc1-dd4e-4112-a59c-f6bc2766701b.json b/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-v7-8B/8586cdc1-dd4e-4112-a59c-f6bc2766701b.json deleted file mode 100644 index 7a7e506dc..000000000 --- a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-v7-8B/8586cdc1-dd4e-4112-a59c-f6bc2766701b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_Kosmos-EVAA-v7-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Kosmos-EVAA-v7-8B", - "id": "jaspionjader/Kosmos-EVAA-v7-8B", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4277 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5335 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1337 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3054 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4171 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3836 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-v8-8B/946a7b16-dfa6-42ad-97c1-955bf8a40dae.json b/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-v8-8B/946a7b16-dfa6-42ad-97c1-955bf8a40dae.json deleted file mode 100644 index 5b504be25..000000000 --- a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-v8-8B/946a7b16-dfa6-42ad-97c1-955bf8a40dae.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_Kosmos-EVAA-v8-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Kosmos-EVAA-v8-8B", - "id": "jaspionjader/Kosmos-EVAA-v8-8B", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4383 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5359 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1307 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3037 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.421 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - 
}, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3827 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-v9-8B/d9a6cc31-57c4-4480-a019-25a34b31fcc8.json b/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-v9-8B/d9a6cc31-57c4-4480-a019-25a34b31fcc8.json deleted file mode 100644 index 8db1d146b..000000000 --- a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-v9-8B/d9a6cc31-57c4-4480-a019-25a34b31fcc8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_Kosmos-EVAA-v9-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Kosmos-EVAA-v9-8B", - "id": "jaspionjader/Kosmos-EVAA-v9-8B", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4369 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5361 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1276 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3062 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4184 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.382 - } - } - ] -} \ No 
newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-v9-TitanFusion-Mix-8B/279bd5fa-0ab1-411b-871b-bd9ff23853f6.json b/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-v9-TitanFusion-Mix-8B/279bd5fa-0ab1-411b-871b-bd9ff23853f6.json deleted file mode 100644 index 9e70660a3..000000000 --- a/data/hfopenllm_v2/jaspionjader/Kosmos-EVAA-v9-TitanFusion-Mix-8B/279bd5fa-0ab1-411b-871b-bd9ff23853f6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_Kosmos-EVAA-v9-TitanFusion-Mix-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Kosmos-EVAA-v9-TitanFusion-Mix-8B", - "id": "jaspionjader/Kosmos-EVAA-v9-TitanFusion-Mix-8B", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4284 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.554 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1148 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2878 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4354 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3836 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/Kosmos-Elusive-8b/c26fae10-e65a-49ac-a2da-2dbf024fd10d.json 
b/data/hfopenllm_v2/jaspionjader/Kosmos-Elusive-8b/c26fae10-e65a-49ac-a2da-2dbf024fd10d.json deleted file mode 100644 index ab43cc04d..000000000 --- a/data/hfopenllm_v2/jaspionjader/Kosmos-Elusive-8b/c26fae10-e65a-49ac-a2da-2dbf024fd10d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_Kosmos-Elusive-8b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Kosmos-Elusive-8b", - "id": "jaspionjader/Kosmos-Elusive-8b", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4169 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5339 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1261 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3079 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4078 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.376 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/Kosmos-Elusive-VENN-8B/6d37b2b4-630e-4471-b7a8-50f8a58902fe.json b/data/hfopenllm_v2/jaspionjader/Kosmos-Elusive-VENN-8B/6d37b2b4-630e-4471-b7a8-50f8a58902fe.json deleted file mode 100644 index 47b464069..000000000 --- 
a/data/hfopenllm_v2/jaspionjader/Kosmos-Elusive-VENN-8B/6d37b2b4-630e-4471-b7a8-50f8a58902fe.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_Kosmos-Elusive-VENN-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Kosmos-Elusive-VENN-8B", - "id": "jaspionjader/Kosmos-Elusive-VENN-8B", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4233 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5356 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1246 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2995 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4157 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3797 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/Kosmos-Elusive-VENN-Asymmetric-8B/de687865-4297-4130-bcfe-0c5116c9b0d1.json b/data/hfopenllm_v2/jaspionjader/Kosmos-Elusive-VENN-Asymmetric-8B/de687865-4297-4130-bcfe-0c5116c9b0d1.json deleted file mode 100644 index c8e3d1871..000000000 --- a/data/hfopenllm_v2/jaspionjader/Kosmos-Elusive-VENN-Asymmetric-8B/de687865-4297-4130-bcfe-0c5116c9b0d1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - 
"evaluation_id": "hfopenllm_v2/jaspionjader_Kosmos-Elusive-VENN-Asymmetric-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Kosmos-Elusive-VENN-Asymmetric-8B", - "id": "jaspionjader/Kosmos-Elusive-VENN-Asymmetric-8B", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4542 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5313 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1344 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2945 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4251 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3842 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/Kosmos-Elusive-VENN-Aurora_faustus-8B/ee1acad1-5dc4-4d8b-8aca-544af5dc2392.json b/data/hfopenllm_v2/jaspionjader/Kosmos-Elusive-VENN-Aurora_faustus-8B/ee1acad1-5dc4-4d8b-8aca-544af5dc2392.json deleted file mode 100644 index c80a27e40..000000000 --- a/data/hfopenllm_v2/jaspionjader/Kosmos-Elusive-VENN-Aurora_faustus-8B/ee1acad1-5dc4-4d8b-8aca-544af5dc2392.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_Kosmos-Elusive-VENN-Aurora_faustus-8B/1770682486.623709", - "retrieved_timestamp": 
"1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Kosmos-Elusive-VENN-Aurora_faustus-8B", - "id": "jaspionjader/Kosmos-Elusive-VENN-Aurora_faustus-8B", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4335 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5304 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1125 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2953 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.417 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3795 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/Kosmos-VENN-8B/52e3f1b1-5a1c-4cca-a36f-9f60284e1883.json b/data/hfopenllm_v2/jaspionjader/Kosmos-VENN-8B/52e3f1b1-5a1c-4cca-a36f-9f60284e1883.json deleted file mode 100644 index 4060c65ec..000000000 --- a/data/hfopenllm_v2/jaspionjader/Kosmos-VENN-8B/52e3f1b1-5a1c-4cca-a36f-9f60284e1883.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_Kosmos-VENN-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - 
"model_info": { - "name": "Kosmos-VENN-8B", - "id": "jaspionjader/Kosmos-VENN-8B", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4332 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5318 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1412 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2928 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4211 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3801 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/PRP-Kosmos-EVAA-8B/2d54c67e-fad5-4a61-b3ae-0393f16dc1ba.json b/data/hfopenllm_v2/jaspionjader/PRP-Kosmos-EVAA-8B/2d54c67e-fad5-4a61-b3ae-0393f16dc1ba.json deleted file mode 100644 index 331d553d5..000000000 --- a/data/hfopenllm_v2/jaspionjader/PRP-Kosmos-EVAA-8B/2d54c67e-fad5-4a61-b3ae-0393f16dc1ba.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_PRP-Kosmos-EVAA-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "PRP-Kosmos-EVAA-8B", - "id": "jaspionjader/PRP-Kosmos-EVAA-8B", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": 
"LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3633 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5237 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0959 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3096 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.425 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3766 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/PRP-Kosmos-EVAA-light-8B/5120e433-f5c7-45fa-be56-566101556271.json b/data/hfopenllm_v2/jaspionjader/PRP-Kosmos-EVAA-light-8B/5120e433-f5c7-45fa-be56-566101556271.json deleted file mode 100644 index a17b8d20f..000000000 --- a/data/hfopenllm_v2/jaspionjader/PRP-Kosmos-EVAA-light-8B/5120e433-f5c7-45fa-be56-566101556271.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_PRP-Kosmos-EVAA-light-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "PRP-Kosmos-EVAA-light-8B", - "id": "jaspionjader/PRP-Kosmos-EVAA-light-8B", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - 
"hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4321 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5275 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1103 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3221 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4235 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3631 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/TSN-Kosmos-EVAA-8B/7f4b4668-c3a0-4575-957d-ba321d55f420.json b/data/hfopenllm_v2/jaspionjader/TSN-Kosmos-EVAA-8B/7f4b4668-c3a0-4575-957d-ba321d55f420.json deleted file mode 100644 index c4d0d61b7..000000000 --- a/data/hfopenllm_v2/jaspionjader/TSN-Kosmos-EVAA-8B/7f4b4668-c3a0-4575-957d-ba321d55f420.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_TSN-Kosmos-EVAA-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "TSN-Kosmos-EVAA-8B", - "id": "jaspionjader/TSN-Kosmos-EVAA-8B", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 
0.4903 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5347 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.145 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3205 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4173 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3831 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/TSN-Kosmos-EVAA-v2-8B/9245b74d-4b9d-4158-a402-0c3742097eba.json b/data/hfopenllm_v2/jaspionjader/TSN-Kosmos-EVAA-v2-8B/9245b74d-4b9d-4158-a402-0c3742097eba.json deleted file mode 100644 index 7eb53a06b..000000000 --- a/data/hfopenllm_v2/jaspionjader/TSN-Kosmos-EVAA-v2-8B/9245b74d-4b9d-4158-a402-0c3742097eba.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_TSN-Kosmos-EVAA-v2-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "TSN-Kosmos-EVAA-v2-8B", - "id": "jaspionjader/TSN-Kosmos-EVAA-v2-8B", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4667 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy 
on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5343 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.108 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3221 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4186 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3762 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/bbb-1/29a5fcd3-9c22-424c-ab17-70cfe187aea1.json b/data/hfopenllm_v2/jaspionjader/bbb-1/29a5fcd3-9c22-424c-ab17-70cfe187aea1.json deleted file mode 100644 index 3ef1955b2..000000000 --- a/data/hfopenllm_v2/jaspionjader/bbb-1/29a5fcd3-9c22-424c-ab17-70cfe187aea1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_bbb-1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "bbb-1", - "id": "jaspionjader/bbb-1", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4864 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5376 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": 
"DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1367 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3138 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4171 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3897 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/bbb-2/af71bfa0-1077-4c96-a4c1-0aa28dc789bf.json b/data/hfopenllm_v2/jaspionjader/bbb-2/af71bfa0-1077-4c96-a4c1-0aa28dc789bf.json deleted file mode 100644 index 6b4a860f0..000000000 --- a/data/hfopenllm_v2/jaspionjader/bbb-2/af71bfa0-1077-4c96-a4c1-0aa28dc789bf.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_bbb-2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "bbb-2", - "id": "jaspionjader/bbb-2", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4077 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5067 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1125 - } - }, - { - "evaluation_name": "GPQA", - 
"source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.302 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4145 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3635 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/bbb-3/258ebe6d-191d-4804-b5e1-5cd6ce93ba88.json b/data/hfopenllm_v2/jaspionjader/bbb-3/258ebe6d-191d-4804-b5e1-5cd6ce93ba88.json deleted file mode 100644 index 2f10bb3d8..000000000 --- a/data/hfopenllm_v2/jaspionjader/bbb-3/258ebe6d-191d-4804-b5e1-5cd6ce93ba88.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_bbb-3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "bbb-3", - "id": "jaspionjader/bbb-3", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4168 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5158 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1405 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 
0.3112 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4265 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3856 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/bbb-4/4765f197-82ed-44b3-9a7c-7cbabc6ecd8e.json b/data/hfopenllm_v2/jaspionjader/bbb-4/4765f197-82ed-44b3-9a7c-7cbabc6ecd8e.json deleted file mode 100644 index 2e270c373..000000000 --- a/data/hfopenllm_v2/jaspionjader/bbb-4/4765f197-82ed-44b3-9a7c-7cbabc6ecd8e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_bbb-4/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "bbb-4", - "id": "jaspionjader/bbb-4", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4768 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5212 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1276 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2978 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.4092 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3773 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/bbb-5/a5d66f97-1f4b-43da-a83a-4a262e297fd9.json b/data/hfopenllm_v2/jaspionjader/bbb-5/a5d66f97-1f4b-43da-a83a-4a262e297fd9.json deleted file mode 100644 index e596a11bf..000000000 --- a/data/hfopenllm_v2/jaspionjader/bbb-5/a5d66f97-1f4b-43da-a83a-4a262e297fd9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_bbb-5/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "bbb-5", - "id": "jaspionjader/bbb-5", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4703 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5207 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1397 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3045 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3998 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3834 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/bbb-6/5d29cf73-65d6-4965-a504-4caf07108cc8.json b/data/hfopenllm_v2/jaspionjader/bbb-6/5d29cf73-65d6-4965-a504-4caf07108cc8.json deleted file mode 100644 index 33980d484..000000000 --- a/data/hfopenllm_v2/jaspionjader/bbb-6/5d29cf73-65d6-4965-a504-4caf07108cc8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_bbb-6/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "bbb-6", - "id": "jaspionjader/bbb-6", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.488 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5211 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.139 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3104 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4052 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3871 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/bbb-7/15ec04ae-30d3-4ffb-9b0c-54ba63410e3d.json 
b/data/hfopenllm_v2/jaspionjader/bbb-7/15ec04ae-30d3-4ffb-9b0c-54ba63410e3d.json deleted file mode 100644 index 36795088a..000000000 --- a/data/hfopenllm_v2/jaspionjader/bbb-7/15ec04ae-30d3-4ffb-9b0c-54ba63410e3d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_bbb-7/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "bbb-7", - "id": "jaspionjader/bbb-7", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4828 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5211 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1367 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3104 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4038 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.386 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/bh-1/2ed96c70-390b-44de-aa08-9883a2f33ff3.json b/data/hfopenllm_v2/jaspionjader/bh-1/2ed96c70-390b-44de-aa08-9883a2f33ff3.json deleted file mode 100644 index a17534661..000000000 --- a/data/hfopenllm_v2/jaspionjader/bh-1/2ed96c70-390b-44de-aa08-9883a2f33ff3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/jaspionjader_bh-1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "bh-1", - "id": "jaspionjader/bh-1", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4284 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.589 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0536 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2945 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4441 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3449 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/bh-10/67c95889-8a67-40fd-99e2-62e767c16416.json b/data/hfopenllm_v2/jaspionjader/bh-10/67c95889-8a67-40fd-99e2-62e767c16416.json deleted file mode 100644 index bef06887f..000000000 --- a/data/hfopenllm_v2/jaspionjader/bh-10/67c95889-8a67-40fd-99e2-62e767c16416.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_bh-10/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": 
"bh-10", - "id": "jaspionjader/bh-10", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4618 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5856 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1103 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3003 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4199 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3708 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/bh-11/a518f39d-e073-493d-9a4f-9af53fc71abf.json b/data/hfopenllm_v2/jaspionjader/bh-11/a518f39d-e073-493d-9a4f-9af53fc71abf.json deleted file mode 100644 index 517e4b745..000000000 --- a/data/hfopenllm_v2/jaspionjader/bh-11/a518f39d-e073-493d-9a4f-9af53fc71abf.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_bh-11/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "bh-11", - "id": "jaspionjader/bh-11", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - 
"dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4575 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5851 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1178 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.307 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4146 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3738 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/bh-12/24f0d9bc-d743-4f46-b5a6-e855e39a1daf.json b/data/hfopenllm_v2/jaspionjader/bh-12/24f0d9bc-d743-4f46-b5a6-e855e39a1daf.json deleted file mode 100644 index 1eb93e6a6..000000000 --- a/data/hfopenllm_v2/jaspionjader/bh-12/24f0d9bc-d743-4f46-b5a6-e855e39a1daf.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_bh-12/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "bh-12", - "id": "jaspionjader/bh-12", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4734 - } - }, - { 
- "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5802 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1186 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3003 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4145 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3737 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/bh-13/3d27f6d9-05a0-44bd-a225-6e6a0bf4a35b.json b/data/hfopenllm_v2/jaspionjader/bh-13/3d27f6d9-05a0-44bd-a225-6e6a0bf4a35b.json deleted file mode 100644 index 932f2640d..000000000 --- a/data/hfopenllm_v2/jaspionjader/bh-13/3d27f6d9-05a0-44bd-a225-6e6a0bf4a35b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_bh-13/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "bh-13", - "id": "jaspionjader/bh-13", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4698 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.5778 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1125 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.307 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4159 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.373 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/bh-15/ad28e7b8-69e6-4fb9-bec4-62c67fae6d58.json b/data/hfopenllm_v2/jaspionjader/bh-15/ad28e7b8-69e6-4fb9-bec4-62c67fae6d58.json deleted file mode 100644 index 87663e79d..000000000 --- a/data/hfopenllm_v2/jaspionjader/bh-15/ad28e7b8-69e6-4fb9-bec4-62c67fae6d58.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_bh-15/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "bh-15", - "id": "jaspionjader/bh-15", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4745 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5819 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1246 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2987 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4105 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3767 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/bh-16/0da639d4-181c-4ee1-808c-3de8003c2471.json b/data/hfopenllm_v2/jaspionjader/bh-16/0da639d4-181c-4ee1-808c-3de8003c2471.json deleted file mode 100644 index 5973a4f9d..000000000 --- a/data/hfopenllm_v2/jaspionjader/bh-16/0da639d4-181c-4ee1-808c-3de8003c2471.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_bh-16/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "bh-16", - "id": "jaspionjader/bh-16", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4731 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5783 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1193 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { 
- "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3029 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4159 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3776 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/bh-17/480bd62c-bc67-4379-bce0-b28a5d6bdf4f.json b/data/hfopenllm_v2/jaspionjader/bh-17/480bd62c-bc67-4379-bce0-b28a5d6bdf4f.json deleted file mode 100644 index cd717325d..000000000 --- a/data/hfopenllm_v2/jaspionjader/bh-17/480bd62c-bc67-4379-bce0-b28a5d6bdf4f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_bh-17/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "bh-17", - "id": "jaspionjader/bh-17", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4722 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5776 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1133 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2978 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - 
"hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4158 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3757 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/bh-18/dd94c18e-b2c3-4135-aa2d-5eb0248315d0.json b/data/hfopenllm_v2/jaspionjader/bh-18/dd94c18e-b2c3-4135-aa2d-5eb0248315d0.json deleted file mode 100644 index 0e961f484..000000000 --- a/data/hfopenllm_v2/jaspionjader/bh-18/dd94c18e-b2c3-4135-aa2d-5eb0248315d0.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_bh-18/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "bh-18", - "id": "jaspionjader/bh-18", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4725 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5824 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1186 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3003 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4185 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - 
"dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3757 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/bh-19/a2ae2953-e341-49be-8469-32bd41d780d7.json b/data/hfopenllm_v2/jaspionjader/bh-19/a2ae2953-e341-49be-8469-32bd41d780d7.json deleted file mode 100644 index 4406baf52..000000000 --- a/data/hfopenllm_v2/jaspionjader/bh-19/a2ae2953-e341-49be-8469-32bd41d780d7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_bh-19/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "bh-19", - "id": "jaspionjader/bh-19", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4584 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5766 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1193 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.297 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4171 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3775 - } 
- } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/bh-2/23bdd694-f250-46dd-9b8b-526fda47bc9e.json b/data/hfopenllm_v2/jaspionjader/bh-2/23bdd694-f250-46dd-9b8b-526fda47bc9e.json deleted file mode 100644 index 504f5d126..000000000 --- a/data/hfopenllm_v2/jaspionjader/bh-2/23bdd694-f250-46dd-9b8b-526fda47bc9e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_bh-2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "bh-2", - "id": "jaspionjader/bh-2", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4579 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5937 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1027 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3012 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4186 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3695 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/bh-20/d600a69d-1952-4e30-abe8-1769ab63ac29.json b/data/hfopenllm_v2/jaspionjader/bh-20/d600a69d-1952-4e30-abe8-1769ab63ac29.json deleted file mode 100644 index 47f8d0aaa..000000000 --- 
a/data/hfopenllm_v2/jaspionjader/bh-20/d600a69d-1952-4e30-abe8-1769ab63ac29.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_bh-20/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "bh-20", - "id": "jaspionjader/bh-20", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4727 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.575 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1201 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2878 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4105 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3768 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/bh-21/afc031d4-852e-4ead-9098-6ce30112b459.json b/data/hfopenllm_v2/jaspionjader/bh-21/afc031d4-852e-4ead-9098-6ce30112b459.json deleted file mode 100644 index 00f6374fa..000000000 --- a/data/hfopenllm_v2/jaspionjader/bh-21/afc031d4-852e-4ead-9098-6ce30112b459.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_bh-21/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM 
v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "bh-21", - "id": "jaspionjader/bh-21", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.47 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5738 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1216 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2978 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4158 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3776 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/bh-22/cb33e29f-e5e1-4bf5-9e20-86d9c3486d2d.json b/data/hfopenllm_v2/jaspionjader/bh-22/cb33e29f-e5e1-4bf5-9e20-86d9c3486d2d.json deleted file mode 100644 index 1e2ca8954..000000000 --- a/data/hfopenllm_v2/jaspionjader/bh-22/cb33e29f-e5e1-4bf5-9e20-86d9c3486d2d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_bh-22/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "bh-22", - "id": "jaspionjader/bh-22", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": 
"bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.46 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5793 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1186 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2961 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4172 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3764 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/bh-23/a4b93124-1151-4f69-8a5e-6b916e8cf11f.json b/data/hfopenllm_v2/jaspionjader/bh-23/a4b93124-1151-4f69-8a5e-6b916e8cf11f.json deleted file mode 100644 index 532376bf1..000000000 --- a/data/hfopenllm_v2/jaspionjader/bh-23/a4b93124-1151-4f69-8a5e-6b916e8cf11f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_bh-23/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "bh-23", - "id": "jaspionjader/bh-23", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy 
on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4658 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.57 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1201 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2945 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4197 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3796 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/bh-24/efe11d8f-65e6-4ba6-8148-fdd43c9346be.json b/data/hfopenllm_v2/jaspionjader/bh-24/efe11d8f-65e6-4ba6-8148-fdd43c9346be.json deleted file mode 100644 index 22202dd5f..000000000 --- a/data/hfopenllm_v2/jaspionjader/bh-24/efe11d8f-65e6-4ba6-8148-fdd43c9346be.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_bh-24/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "bh-24", - "id": "jaspionjader/bh-24", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4715 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5717 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1269 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2961 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4158 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3809 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/bh-25/923da7be-2ec8-46b2-8187-fe08eb86d5a0.json b/data/hfopenllm_v2/jaspionjader/bh-25/923da7be-2ec8-46b2-8187-fe08eb86d5a0.json deleted file mode 100644 index ae523eebf..000000000 --- a/data/hfopenllm_v2/jaspionjader/bh-25/923da7be-2ec8-46b2-8187-fe08eb86d5a0.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_bh-25/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "bh-25", - "id": "jaspionjader/bh-25", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4752 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5706 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - 
"source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1133 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2911 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4118 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3782 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/bh-26/1652b9fe-640a-48f9-b7a5-20ae28fb5985.json b/data/hfopenllm_v2/jaspionjader/bh-26/1652b9fe-640a-48f9-b7a5-20ae28fb5985.json deleted file mode 100644 index 4a003c489..000000000 --- a/data/hfopenllm_v2/jaspionjader/bh-26/1652b9fe-640a-48f9-b7a5-20ae28fb5985.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_bh-26/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "bh-26", - "id": "jaspionjader/bh-26", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4691 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5735 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1163 - } - }, 
- { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2995 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4277 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3772 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/bh-27/572463ed-f6b9-460d-9c38-0e0ee5327511.json b/data/hfopenllm_v2/jaspionjader/bh-27/572463ed-f6b9-460d-9c38-0e0ee5327511.json deleted file mode 100644 index 041f63301..000000000 --- a/data/hfopenllm_v2/jaspionjader/bh-27/572463ed-f6b9-460d-9c38-0e0ee5327511.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_bh-27/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "bh-27", - "id": "jaspionjader/bh-27", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4819 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5714 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1276 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - 
}, - "score_details": { - "score": 0.2961 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4091 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3799 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/bh-28/5f6bbbfd-16a8-4ea8-b9d9-b436a882700a.json b/data/hfopenllm_v2/jaspionjader/bh-28/5f6bbbfd-16a8-4ea8-b9d9-b436a882700a.json deleted file mode 100644 index 9d000a51b..000000000 --- a/data/hfopenllm_v2/jaspionjader/bh-28/5f6bbbfd-16a8-4ea8-b9d9-b436a882700a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_bh-28/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "bh-28", - "id": "jaspionjader/bh-28", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4785 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5703 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1231 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2987 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4131 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3812 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/bh-29/32322361-f18d-480d-9475-cd11a45bc4bc.json b/data/hfopenllm_v2/jaspionjader/bh-29/32322361-f18d-480d-9475-cd11a45bc4bc.json deleted file mode 100644 index 015e4ce5c..000000000 --- a/data/hfopenllm_v2/jaspionjader/bh-29/32322361-f18d-480d-9475-cd11a45bc4bc.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_bh-29/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "bh-29", - "id": "jaspionjader/bh-29", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4688 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.567 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1208 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2953 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4237 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": 
"Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3819 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/bh-3/f62d1aee-2d9e-466e-85e2-002fae5d2504.json b/data/hfopenllm_v2/jaspionjader/bh-3/f62d1aee-2d9e-466e-85e2-002fae5d2504.json deleted file mode 100644 index 7c0e9e2c4..000000000 --- a/data/hfopenllm_v2/jaspionjader/bh-3/f62d1aee-2d9e-466e-85e2-002fae5d2504.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_bh-3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "bh-3", - "id": "jaspionjader/bh-3", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4664 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5891 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1148 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.302 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4173 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3702 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/bh-30/af389bf1-da63-49a9-9e49-32613d8d05b8.json 
b/data/hfopenllm_v2/jaspionjader/bh-30/af389bf1-da63-49a9-9e49-32613d8d05b8.json deleted file mode 100644 index 8e6b682db..000000000 --- a/data/hfopenllm_v2/jaspionjader/bh-30/af389bf1-da63-49a9-9e49-32613d8d05b8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_bh-30/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "bh-30", - "id": "jaspionjader/bh-30", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4666 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5706 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1231 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2928 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4144 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3782 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/bh-31/ea13ae62-d050-4cc4-9cbe-99eedfc206e2.json b/data/hfopenllm_v2/jaspionjader/bh-31/ea13ae62-d050-4cc4-9cbe-99eedfc206e2.json deleted file mode 100644 index da3e5651c..000000000 --- a/data/hfopenllm_v2/jaspionjader/bh-31/ea13ae62-d050-4cc4-9cbe-99eedfc206e2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/jaspionjader_bh-31/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "bh-31", - "id": "jaspionjader/bh-31", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4727 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5665 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1284 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2936 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4104 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.382 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/bh-32/1e697620-36a7-459c-b88c-405febb57c3a.json b/data/hfopenllm_v2/jaspionjader/bh-32/1e697620-36a7-459c-b88c-405febb57c3a.json deleted file mode 100644 index ab338397a..000000000 --- a/data/hfopenllm_v2/jaspionjader/bh-32/1e697620-36a7-459c-b88c-405febb57c3a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_bh-32/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": 
"bh-32", - "id": "jaspionjader/bh-32", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4636 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5662 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1246 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.297 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4157 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3812 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/bh-33/532723e8-a9b7-4f72-a015-c2bd9363b5d8.json b/data/hfopenllm_v2/jaspionjader/bh-33/532723e8-a9b7-4f72-a015-c2bd9363b5d8.json deleted file mode 100644 index cb0a494fb..000000000 --- a/data/hfopenllm_v2/jaspionjader/bh-33/532723e8-a9b7-4f72-a015-c2bd9363b5d8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_bh-33/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "bh-33", - "id": "jaspionjader/bh-33", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - 
"dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4685 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5653 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1178 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2961 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4157 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3808 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/bh-34/be096a57-7d81-4999-919a-ed8a243012b2.json b/data/hfopenllm_v2/jaspionjader/bh-34/be096a57-7d81-4999-919a-ed8a243012b2.json deleted file mode 100644 index e3c52f212..000000000 --- a/data/hfopenllm_v2/jaspionjader/bh-34/be096a57-7d81-4999-919a-ed8a243012b2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_bh-34/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "bh-34", - "id": "jaspionjader/bh-34", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4624 - } - }, - 
{ - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5681 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1208 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2919 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4185 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3804 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/bh-35/cadeb016-e158-4a49-921c-efe0e4eb0cb2.json b/data/hfopenllm_v2/jaspionjader/bh-35/cadeb016-e158-4a49-921c-efe0e4eb0cb2.json deleted file mode 100644 index 3cf6f3543..000000000 --- a/data/hfopenllm_v2/jaspionjader/bh-35/cadeb016-e158-4a49-921c-efe0e4eb0cb2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_bh-35/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "bh-35", - "id": "jaspionjader/bh-35", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4721 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.564 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1246 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2953 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4183 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.383 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/bh-36/c606d7b9-3ea3-49d4-9ecc-9610ed4b4eac.json b/data/hfopenllm_v2/jaspionjader/bh-36/c606d7b9-3ea3-49d4-9ecc-9610ed4b4eac.json deleted file mode 100644 index c44481774..000000000 --- a/data/hfopenllm_v2/jaspionjader/bh-36/c606d7b9-3ea3-49d4-9ecc-9610ed4b4eac.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_bh-36/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "bh-36", - "id": "jaspionjader/bh-36", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4666 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5664 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1239 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.302 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4196 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3831 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/bh-37/04a5eed3-7eea-4d9f-acc6-5a96ec987e2b.json b/data/hfopenllm_v2/jaspionjader/bh-37/04a5eed3-7eea-4d9f-acc6-5a96ec987e2b.json deleted file mode 100644 index 00efc0c32..000000000 --- a/data/hfopenllm_v2/jaspionjader/bh-37/04a5eed3-7eea-4d9f-acc6-5a96ec987e2b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_bh-37/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "bh-37", - "id": "jaspionjader/bh-37", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.488 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5625 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1216 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2945 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4156 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3828 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/bh-38/a1c60d74-dabe-423d-9e40-3dd8112d7d8e.json b/data/hfopenllm_v2/jaspionjader/bh-38/a1c60d74-dabe-423d-9e40-3dd8112d7d8e.json deleted file mode 100644 index 8a76bd6cd..000000000 --- a/data/hfopenllm_v2/jaspionjader/bh-38/a1c60d74-dabe-423d-9e40-3dd8112d7d8e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_bh-38/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "bh-38", - "id": "jaspionjader/bh-38", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4618 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5658 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1239 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2978 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - 
"hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4117 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3811 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/bh-39/29c7bc9b-6833-497b-a553-2941026efea5.json b/data/hfopenllm_v2/jaspionjader/bh-39/29c7bc9b-6833-497b-a553-2941026efea5.json deleted file mode 100644 index 8f546e7ae..000000000 --- a/data/hfopenllm_v2/jaspionjader/bh-39/29c7bc9b-6833-497b-a553-2941026efea5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_bh-39/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "bh-39", - "id": "jaspionjader/bh-39", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4576 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5633 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1254 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3003 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4262 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - 
"dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3831 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/bh-4/09a60955-978e-4136-bdde-d5459e37ad2c.json b/data/hfopenllm_v2/jaspionjader/bh-4/09a60955-978e-4136-bdde-d5459e37ad2c.json deleted file mode 100644 index 76f667723..000000000 --- a/data/hfopenllm_v2/jaspionjader/bh-4/09a60955-978e-4136-bdde-d5459e37ad2c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_bh-4/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "bh-4", - "id": "jaspionjader/bh-4", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4673 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5892 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1095 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2961 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4173 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3705 - } - } 
- ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/bh-40/501744a2-070a-4378-9232-f7ccd9b2a67e.json b/data/hfopenllm_v2/jaspionjader/bh-40/501744a2-070a-4378-9232-f7ccd9b2a67e.json deleted file mode 100644 index e827aaca1..000000000 --- a/data/hfopenllm_v2/jaspionjader/bh-40/501744a2-070a-4378-9232-f7ccd9b2a67e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_bh-40/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "bh-40", - "id": "jaspionjader/bh-40", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4536 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5634 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1246 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2987 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4236 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3835 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/bh-41/369efdc6-6529-477c-b5f0-d229c8102491.json b/data/hfopenllm_v2/jaspionjader/bh-41/369efdc6-6529-477c-b5f0-d229c8102491.json deleted file mode 100644 index 559005960..000000000 --- 
a/data/hfopenllm_v2/jaspionjader/bh-41/369efdc6-6529-477c-b5f0-d229c8102491.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_bh-41/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "bh-41", - "id": "jaspionjader/bh-41", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.474 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5614 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1254 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2928 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4183 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3825 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/bh-42/906645f3-2041-4380-8118-ac26b92297ba.json b/data/hfopenllm_v2/jaspionjader/bh-42/906645f3-2041-4380-8118-ac26b92297ba.json deleted file mode 100644 index 8e371ce85..000000000 --- a/data/hfopenllm_v2/jaspionjader/bh-42/906645f3-2041-4380-8118-ac26b92297ba.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_bh-42/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM 
v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "bh-42", - "id": "jaspionjader/bh-42", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.466 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5646 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1269 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2961 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.421 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3812 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/bh-43/57fe8deb-02dc-43a8-8a92-14bdaf61dd67.json b/data/hfopenllm_v2/jaspionjader/bh-43/57fe8deb-02dc-43a8-8a92-14bdaf61dd67.json deleted file mode 100644 index 2479e939a..000000000 --- a/data/hfopenllm_v2/jaspionjader/bh-43/57fe8deb-02dc-43a8-8a92-14bdaf61dd67.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_bh-43/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "bh-43", - "id": "jaspionjader/bh-43", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": 
"bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.46 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5635 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1239 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2945 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4156 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.382 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/bh-44/95f2fa22-3da9-4876-ace3-50763f2b2453.json b/data/hfopenllm_v2/jaspionjader/bh-44/95f2fa22-3da9-4876-ace3-50763f2b2453.json deleted file mode 100644 index e04123e5c..000000000 --- a/data/hfopenllm_v2/jaspionjader/bh-44/95f2fa22-3da9-4876-ace3-50763f2b2453.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_bh-44/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "bh-44", - "id": "jaspionjader/bh-44", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy 
on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4706 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5643 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1216 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2961 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4249 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3834 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/bh-46/b2f9e38f-c2a1-4e5f-a7ce-4e33a05b503b.json b/data/hfopenllm_v2/jaspionjader/bh-46/b2f9e38f-c2a1-4e5f-a7ce-4e33a05b503b.json deleted file mode 100644 index b0fd30662..000000000 --- a/data/hfopenllm_v2/jaspionjader/bh-46/b2f9e38f-c2a1-4e5f-a7ce-4e33a05b503b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_bh-46/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "bh-46", - "id": "jaspionjader/bh-46", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4727 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5632 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1276 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2961 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4262 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3822 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/bh-47/b3173a2a-8309-498d-961b-0167d5d5dea6.json b/data/hfopenllm_v2/jaspionjader/bh-47/b3173a2a-8309-498d-961b-0167d5d5dea6.json deleted file mode 100644 index 12bf1bc95..000000000 --- a/data/hfopenllm_v2/jaspionjader/bh-47/b3173a2a-8309-498d-961b-0167d5d5dea6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_bh-47/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "bh-47", - "id": "jaspionjader/bh-47", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4652 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5546 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - 
"source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1276 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2945 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4156 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3855 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/bh-48/0d59dd75-c999-4a7e-919a-fd084202fc9c.json b/data/hfopenllm_v2/jaspionjader/bh-48/0d59dd75-c999-4a7e-919a-fd084202fc9c.json deleted file mode 100644 index 0b617599c..000000000 --- a/data/hfopenllm_v2/jaspionjader/bh-48/0d59dd75-c999-4a7e-919a-fd084202fc9c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_bh-48/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "bh-48", - "id": "jaspionjader/bh-48", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4688 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5541 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1254 - } - }, 
- { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2945 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4209 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.386 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/bh-49/639e91d9-ebbf-4ba2-bce3-6953e7c91e32.json b/data/hfopenllm_v2/jaspionjader/bh-49/639e91d9-ebbf-4ba2-bce3-6953e7c91e32.json deleted file mode 100644 index be5e9a125..000000000 --- a/data/hfopenllm_v2/jaspionjader/bh-49/639e91d9-ebbf-4ba2-bce3-6953e7c91e32.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_bh-49/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "bh-49", - "id": "jaspionjader/bh-49", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4725 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.554 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1201 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, 
- "score_details": { - "score": 0.2945 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4129 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3808 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/bh-5/56a5fb9b-a4b7-4290-9ec9-6864b3efaa82.json b/data/hfopenllm_v2/jaspionjader/bh-5/56a5fb9b-a4b7-4290-9ec9-6864b3efaa82.json deleted file mode 100644 index f712b9f05..000000000 --- a/data/hfopenllm_v2/jaspionjader/bh-5/56a5fb9b-a4b7-4290-9ec9-6864b3efaa82.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_bh-5/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "bh-5", - "id": "jaspionjader/bh-5", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4652 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5882 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1057 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2995 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4186 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3702 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/bh-50/d03fb481-be0b-4dfb-bb4d-54067e058e99.json b/data/hfopenllm_v2/jaspionjader/bh-50/d03fb481-be0b-4dfb-bb4d-54067e058e99.json deleted file mode 100644 index fab8832f6..000000000 --- a/data/hfopenllm_v2/jaspionjader/bh-50/d03fb481-be0b-4dfb-bb4d-54067e058e99.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_bh-50/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "bh-50", - "id": "jaspionjader/bh-50", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4725 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5553 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1208 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2936 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4169 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on 
MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3842 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/bh-51/d8fc3475-83e9-4790-a472-72b442087562.json b/data/hfopenllm_v2/jaspionjader/bh-51/d8fc3475-83e9-4790-a472-72b442087562.json deleted file mode 100644 index 82a672fc1..000000000 --- a/data/hfopenllm_v2/jaspionjader/bh-51/d8fc3475-83e9-4790-a472-72b442087562.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_bh-51/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "bh-51", - "id": "jaspionjader/bh-51", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.463 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5557 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1239 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2928 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4168 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3831 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/bh-52/57efd335-4873-4e01-bfc3-0d704b3d482a.json 
b/data/hfopenllm_v2/jaspionjader/bh-52/57efd335-4873-4e01-bfc3-0d704b3d482a.json deleted file mode 100644 index 5d9a2c2b6..000000000 --- a/data/hfopenllm_v2/jaspionjader/bh-52/57efd335-4873-4e01-bfc3-0d704b3d482a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_bh-52/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "bh-52", - "id": "jaspionjader/bh-52", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4536 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5444 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1201 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2919 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4169 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3843 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/bh-53/25fdcc8a-0e7d-4148-8508-2631ea6deb05.json b/data/hfopenllm_v2/jaspionjader/bh-53/25fdcc8a-0e7d-4148-8508-2631ea6deb05.json deleted file mode 100644 index c656925be..000000000 --- a/data/hfopenllm_v2/jaspionjader/bh-53/25fdcc8a-0e7d-4148-8508-2631ea6deb05.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/jaspionjader_bh-53/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "bh-53", - "id": "jaspionjader/bh-53", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.478 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5494 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1269 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2987 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4196 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3858 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/bh-54/f5f63d06-7e51-4b91-8814-ecbda604fe6b.json b/data/hfopenllm_v2/jaspionjader/bh-54/f5f63d06-7e51-4b91-8814-ecbda604fe6b.json deleted file mode 100644 index b9fb7dee3..000000000 --- a/data/hfopenllm_v2/jaspionjader/bh-54/f5f63d06-7e51-4b91-8814-ecbda604fe6b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_bh-54/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": 
"bh-54", - "id": "jaspionjader/bh-54", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4841 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5548 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1292 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2945 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4155 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3825 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/bh-55/5326c33b-6b8a-472a-9058-a9e9fe83b599.json b/data/hfopenllm_v2/jaspionjader/bh-55/5326c33b-6b8a-472a-9058-a9e9fe83b599.json deleted file mode 100644 index f9f52d25d..000000000 --- a/data/hfopenllm_v2/jaspionjader/bh-55/5326c33b-6b8a-472a-9058-a9e9fe83b599.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_bh-55/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "bh-55", - "id": "jaspionjader/bh-55", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - 
"dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4709 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.555 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1284 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3062 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4222 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3846 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/bh-56/28674053-e1b6-4f0a-a90e-5dd5082ec164.json b/data/hfopenllm_v2/jaspionjader/bh-56/28674053-e1b6-4f0a-a90e-5dd5082ec164.json deleted file mode 100644 index 4876a15a1..000000000 --- a/data/hfopenllm_v2/jaspionjader/bh-56/28674053-e1b6-4f0a-a90e-5dd5082ec164.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_bh-56/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "bh-56", - "id": "jaspionjader/bh-56", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.46 - } - }, - { - 
"evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5447 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1231 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3003 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4116 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3844 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/bh-57/fd27bfa7-11b3-46d3-915c-373ddf5a9865.json b/data/hfopenllm_v2/jaspionjader/bh-57/fd27bfa7-11b3-46d3-915c-373ddf5a9865.json deleted file mode 100644 index f2fb7ade4..000000000 --- a/data/hfopenllm_v2/jaspionjader/bh-57/fd27bfa7-11b3-46d3-915c-373ddf5a9865.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_bh-57/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "bh-57", - "id": "jaspionjader/bh-57", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4405 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.5425 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1261 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3037 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.421 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3896 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/bh-58/91f190ba-39c8-47af-8351-73d1f382dd99.json b/data/hfopenllm_v2/jaspionjader/bh-58/91f190ba-39c8-47af-8351-73d1f382dd99.json deleted file mode 100644 index 3b78d4b75..000000000 --- a/data/hfopenllm_v2/jaspionjader/bh-58/91f190ba-39c8-47af-8351-73d1f382dd99.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_bh-58/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "bh-58", - "id": "jaspionjader/bh-58", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.463 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5446 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1322 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3062 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4183 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3896 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/bh-59/b637b55c-dd05-4060-bf33-e63e9de7fac9.json b/data/hfopenllm_v2/jaspionjader/bh-59/b637b55c-dd05-4060-bf33-e63e9de7fac9.json deleted file mode 100644 index 40582b054..000000000 --- a/data/hfopenllm_v2/jaspionjader/bh-59/b637b55c-dd05-4060-bf33-e63e9de7fac9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_bh-59/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "bh-59", - "id": "jaspionjader/bh-59", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4341 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5512 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1541 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { 
- "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3154 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.417 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3838 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/bh-6/bcacef79-d7c0-46e7-9194-43541c2f01fc.json b/data/hfopenllm_v2/jaspionjader/bh-6/bcacef79-d7c0-46e7-9194-43541c2f01fc.json deleted file mode 100644 index 66016308e..000000000 --- a/data/hfopenllm_v2/jaspionjader/bh-6/bcacef79-d7c0-46e7-9194-43541c2f01fc.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_bh-6/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "bh-6", - "id": "jaspionjader/bh-6", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4621 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5891 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1088 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2995 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": 
"TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4199 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3698 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/bh-60/77a358c7-59fa-4b22-a190-dfca86c5166b.json b/data/hfopenllm_v2/jaspionjader/bh-60/77a358c7-59fa-4b22-a190-dfca86c5166b.json deleted file mode 100644 index 60b6bf47a..000000000 --- a/data/hfopenllm_v2/jaspionjader/bh-60/77a358c7-59fa-4b22-a190-dfca86c5166b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_bh-60/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "bh-60", - "id": "jaspionjader/bh-60", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4207 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5369 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1579 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3255 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4289 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": 
"MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3689 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/bh-61/ad4c8922-7079-4383-8f42-d3de6326a1e1.json b/data/hfopenllm_v2/jaspionjader/bh-61/ad4c8922-7079-4383-8f42-d3de6326a1e1.json deleted file mode 100644 index 1e6b5f4bd..000000000 --- a/data/hfopenllm_v2/jaspionjader/bh-61/ad4c8922-7079-4383-8f42-d3de6326a1e1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_bh-61/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "bh-61", - "id": "jaspionjader/bh-61", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4247 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5271 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1707 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3188 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4356 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3679 - } - } - ] -} \ 
No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/bh-62/7f89eded-e5fc-4b3b-9afd-dcd71b7b44d5.json b/data/hfopenllm_v2/jaspionjader/bh-62/7f89eded-e5fc-4b3b-9afd-dcd71b7b44d5.json deleted file mode 100644 index ba9efa108..000000000 --- a/data/hfopenllm_v2/jaspionjader/bh-62/7f89eded-e5fc-4b3b-9afd-dcd71b7b44d5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_bh-62/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "bh-62", - "id": "jaspionjader/bh-62", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.415 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5379 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1624 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3205 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4289 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3719 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/bh-63/07cb94ab-0aea-4ce2-89b0-4378cb892c7e.json b/data/hfopenllm_v2/jaspionjader/bh-63/07cb94ab-0aea-4ce2-89b0-4378cb892c7e.json deleted file mode 100644 index 61c277886..000000000 --- 
a/data/hfopenllm_v2/jaspionjader/bh-63/07cb94ab-0aea-4ce2-89b0-4378cb892c7e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_bh-63/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "bh-63", - "id": "jaspionjader/bh-63", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4308 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4917 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.111 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3029 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4313 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3248 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/bh-64/5fb04756-c7bb-4772-b209-0d9a300bbf7d.json b/data/hfopenllm_v2/jaspionjader/bh-64/5fb04756-c7bb-4772-b209-0d9a300bbf7d.json deleted file mode 100644 index 4fa30fb6a..000000000 --- a/data/hfopenllm_v2/jaspionjader/bh-64/5fb04756-c7bb-4772-b209-0d9a300bbf7d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_bh-64/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM 
v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "bh-64", - "id": "jaspionjader/bh-64", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.414 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.536 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1548 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3213 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4355 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3693 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/bh-7/0c02d1b6-2d31-4c54-b881-588cbfb0c686.json b/data/hfopenllm_v2/jaspionjader/bh-7/0c02d1b6-2d31-4c54-b881-588cbfb0c686.json deleted file mode 100644 index 898316818..000000000 --- a/data/hfopenllm_v2/jaspionjader/bh-7/0c02d1b6-2d31-4c54-b881-588cbfb0c686.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_bh-7/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "bh-7", - "id": "jaspionjader/bh-7", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - 
"architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4624 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5861 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.114 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3037 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4119 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3715 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/bh-8/a32e4d22-8096-4537-a68a-98ff9171ac8c.json b/data/hfopenllm_v2/jaspionjader/bh-8/a32e4d22-8096-4537-a68a-98ff9171ac8c.json deleted file mode 100644 index 2f29f8cdb..000000000 --- a/data/hfopenllm_v2/jaspionjader/bh-8/a32e4d22-8096-4537-a68a-98ff9171ac8c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_bh-8/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "bh-8", - "id": "jaspionjader/bh-8", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4597 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.59 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1178 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3012 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4265 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.372 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/bh-9/4e45b666-fa7e-4a38-8b6b-65846876c8d9.json b/data/hfopenllm_v2/jaspionjader/bh-9/4e45b666-fa7e-4a38-8b6b-65846876c8d9.json deleted file mode 100644 index 038a2b90e..000000000 --- a/data/hfopenllm_v2/jaspionjader/bh-9/4e45b666-fa7e-4a38-8b6b-65846876c8d9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_bh-9/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "bh-9", - "id": "jaspionjader/bh-9", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4509 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.585 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1156 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.302 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4146 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3703 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/dp-6-8b/d9cb1d13-2af5-4385-aa78-5c053e00e6c6.json b/data/hfopenllm_v2/jaspionjader/dp-6-8b/d9cb1d13-2af5-4385-aa78-5c053e00e6c6.json deleted file mode 100644 index d490d1352..000000000 --- a/data/hfopenllm_v2/jaspionjader/dp-6-8b/d9cb1d13-2af5-4385-aa78-5c053e00e6c6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_dp-6-8b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "dp-6-8b", - "id": "jaspionjader/dp-6-8b", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4806 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.53 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": 
"hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1329 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3079 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4434 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3897 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/dp-7-8b/6afaec07-ebb8-4f3f-af48-c679f38f4917.json b/data/hfopenllm_v2/jaspionjader/dp-7-8b/6afaec07-ebb8-4f3f-af48-c679f38f4917.json deleted file mode 100644 index 2aba7ace1..000000000 --- a/data/hfopenllm_v2/jaspionjader/dp-7-8b/6afaec07-ebb8-4f3f-af48-c679f38f4917.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_dp-7-8b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "dp-7-8b", - "id": "jaspionjader/dp-7-8b", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4498 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5291 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1261 - } - }, - { 
- "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3062 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4407 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3934 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/ek-6/bf8370c9-baed-4034-ac38-c6f796baca15.json b/data/hfopenllm_v2/jaspionjader/ek-6/bf8370c9-baed-4034-ac38-c6f796baca15.json deleted file mode 100644 index 3351de25f..000000000 --- a/data/hfopenllm_v2/jaspionjader/ek-6/bf8370c9-baed-4034-ac38-c6f796baca15.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_ek-6/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ek-6", - "id": "jaspionjader/ek-6", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4642 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5219 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1322 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.3087 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4144 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3861 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/ek-7/d397c078-6fe3-44a8-859c-a0f7c551dc3a.json b/data/hfopenllm_v2/jaspionjader/ek-7/d397c078-6fe3-44a8-859c-a0f7c551dc3a.json deleted file mode 100644 index 7e3d311e0..000000000 --- a/data/hfopenllm_v2/jaspionjader/ek-7/d397c078-6fe3-44a8-859c-a0f7c551dc3a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_ek-7/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ek-7", - "id": "jaspionjader/ek-7", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4767 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5194 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1329 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3163 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4171 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3887 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/f-1-8b/ed61cd6a-bbf0-45f2-9536-a7a262d5d6fb.json b/data/hfopenllm_v2/jaspionjader/f-1-8b/ed61cd6a-bbf0-45f2-9536-a7a262d5d6fb.json deleted file mode 100644 index 1d3bd64ed..000000000 --- a/data/hfopenllm_v2/jaspionjader/f-1-8b/ed61cd6a-bbf0-45f2-9536-a7a262d5d6fb.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_f-1-8b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "f-1-8b", - "id": "jaspionjader/f-1-8b", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4983 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5141 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1284 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3087 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4527 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on 
MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3907 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/f-2-8b/6be795f4-0784-44bf-8926-e3060ec37dcf.json b/data/hfopenllm_v2/jaspionjader/f-2-8b/6be795f4-0784-44bf-8926-e3060ec37dcf.json deleted file mode 100644 index cdc2207e8..000000000 --- a/data/hfopenllm_v2/jaspionjader/f-2-8b/6be795f4-0784-44bf-8926-e3060ec37dcf.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_f-2-8b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "f-2-8b", - "id": "jaspionjader/f-2-8b", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4824 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5294 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1171 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2995 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4501 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3962 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/f-3-8b/d4d808f5-3b79-43b5-8076-d3f785083789.json 
b/data/hfopenllm_v2/jaspionjader/f-3-8b/d4d808f5-3b79-43b5-8076-d3f785083789.json deleted file mode 100644 index 9a8bac39d..000000000 --- a/data/hfopenllm_v2/jaspionjader/f-3-8b/d4d808f5-3b79-43b5-8076-d3f785083789.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_f-3-8b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "f-3-8b", - "id": "jaspionjader/f-3-8b", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4803 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5275 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1216 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3138 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4421 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3954 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/f-4-8b/370f5923-91d7-40d2-bd06-bf2b657b8ef2.json b/data/hfopenllm_v2/jaspionjader/f-4-8b/370f5923-91d7-40d2-bd06-bf2b657b8ef2.json deleted file mode 100644 index 85c9a3a2a..000000000 --- a/data/hfopenllm_v2/jaspionjader/f-4-8b/370f5923-91d7-40d2-bd06-bf2b657b8ef2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/jaspionjader_f-4-8b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "f-4-8b", - "id": "jaspionjader/f-4-8b", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4797 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5289 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1148 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3087 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4514 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3956 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/f-5-8b/5334e5e4-d243-4c20-912c-d0ded74d6ea5.json b/data/hfopenllm_v2/jaspionjader/f-5-8b/5334e5e4-d243-4c20-912c-d0ded74d6ea5.json deleted file mode 100644 index dff79aa05..000000000 --- a/data/hfopenllm_v2/jaspionjader/f-5-8b/5334e5e4-d243-4c20-912c-d0ded74d6ea5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_f-5-8b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - 
"name": "f-5-8b", - "id": "jaspionjader/f-5-8b", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5044 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5313 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1239 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3087 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4461 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3949 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/f-6-8b/7306f2cd-4fd2-4dd4-b06b-8c9aa558388b.json b/data/hfopenllm_v2/jaspionjader/f-6-8b/7306f2cd-4fd2-4dd4-b06b-8c9aa558388b.json deleted file mode 100644 index 64a06c461..000000000 --- a/data/hfopenllm_v2/jaspionjader/f-6-8b/7306f2cd-4fd2-4dd4-b06b-8c9aa558388b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_f-6-8b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "f-6-8b", - "id": "jaspionjader/f-6-8b", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - 
"source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4846 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5241 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1193 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3079 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4474 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3939 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/f-7-8b/68cc19eb-423b-4d6d-a3bf-eac6f666bc4b.json b/data/hfopenllm_v2/jaspionjader/f-7-8b/68cc19eb-423b-4d6d-a3bf-eac6f666bc4b.json deleted file mode 100644 index e277f31b0..000000000 --- a/data/hfopenllm_v2/jaspionjader/f-7-8b/68cc19eb-423b-4d6d-a3bf-eac6f666bc4b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_f-7-8b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "f-7-8b", - "id": "jaspionjader/f-7-8b", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.4462 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5277 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1239 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3129 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4315 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3936 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/f-8-8b/59aa26a8-93b3-43fc-8c38-ef67cd8efd80.json b/data/hfopenllm_v2/jaspionjader/f-8-8b/59aa26a8-93b3-43fc-8c38-ef67cd8efd80.json deleted file mode 100644 index 5cbc566d2..000000000 --- a/data/hfopenllm_v2/jaspionjader/f-8-8b/59aa26a8-93b3-43fc-8c38-ef67cd8efd80.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_f-8-8b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "f-8-8b", - "id": "jaspionjader/f-8-8b", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4739 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 
0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5259 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1224 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3096 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4354 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.394 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/f-9-8b/220cd306-0613-4c8f-9848-4af812a1d37f.json b/data/hfopenllm_v2/jaspionjader/f-9-8b/220cd306-0613-4c8f-9848-4af812a1d37f.json deleted file mode 100644 index e31f7fa11..000000000 --- a/data/hfopenllm_v2/jaspionjader/f-9-8b/220cd306-0613-4c8f-9848-4af812a1d37f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_f-9-8b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "f-9-8b", - "id": "jaspionjader/f-9-8b", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4602 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5292 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": 
"Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1299 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3062 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4461 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3944 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/fct-14-8b/39a6a40c-3fa0-41ba-9d13-da9381263d4a.json b/data/hfopenllm_v2/jaspionjader/fct-14-8b/39a6a40c-3fa0-41ba-9d13-da9381263d4a.json deleted file mode 100644 index 4829d4d11..000000000 --- a/data/hfopenllm_v2/jaspionjader/fct-14-8b/39a6a40c-3fa0-41ba-9d13-da9381263d4a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_fct-14-8b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "fct-14-8b", - "id": "jaspionjader/fct-14-8b", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4129 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5206 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1201 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", 
- "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3163 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4186 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3875 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/fct-9-8b/4d037b71-5d03-41a1-bf23-c0aea0cdcbbb.json b/data/hfopenllm_v2/jaspionjader/fct-9-8b/4d037b71-5d03-41a1-bf23-c0aea0cdcbbb.json deleted file mode 100644 index 6ed9866d8..000000000 --- a/data/hfopenllm_v2/jaspionjader/fct-9-8b/4d037b71-5d03-41a1-bf23-c0aea0cdcbbb.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_fct-9-8b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "fct-9-8b", - "id": "jaspionjader/fct-9-8b", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4354 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5205 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1193 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3079 - } - }, - { - "evaluation_name": "MUSR", - 
"source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4291 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3932 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/fr-1-8b/16baf620-7dcc-49f3-a787-b431e11ad4f6.json b/data/hfopenllm_v2/jaspionjader/fr-1-8b/16baf620-7dcc-49f3-a787-b431e11ad4f6.json deleted file mode 100644 index e4d8bb492..000000000 --- a/data/hfopenllm_v2/jaspionjader/fr-1-8b/16baf620-7dcc-49f3-a787-b431e11ad4f6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_fr-1-8b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "fr-1-8b", - "id": "jaspionjader/fr-1-8b", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4211 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5142 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1118 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3054 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.4277 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.361 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/fr-10-8b/4745add2-7bcb-4c05-8b12-6bd30856890b.json b/data/hfopenllm_v2/jaspionjader/fr-10-8b/4745add2-7bcb-4c05-8b12-6bd30856890b.json deleted file mode 100644 index 9d8f40937..000000000 --- a/data/hfopenllm_v2/jaspionjader/fr-10-8b/4745add2-7bcb-4c05-8b12-6bd30856890b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_fr-10-8b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "fr-10-8b", - "id": "jaspionjader/fr-10-8b", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4402 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5207 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1224 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3171 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4119 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3863 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/fr-3-8b/f68b122d-4dec-4d5c-ac22-198da3d3e96b.json b/data/hfopenllm_v2/jaspionjader/fr-3-8b/f68b122d-4dec-4d5c-ac22-198da3d3e96b.json deleted file mode 100644 index 4ae5f5614..000000000 --- a/data/hfopenllm_v2/jaspionjader/fr-3-8b/f68b122d-4dec-4d5c-ac22-198da3d3e96b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_fr-3-8b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "fr-3-8b", - "id": "jaspionjader/fr-3-8b", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4326 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5255 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1133 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3054 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4198 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3863 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/gamma-Kosmos-EVAA-8B/2e20f780-ceab-4d1d-a1ab-35f4f0ac44aa.json 
b/data/hfopenllm_v2/jaspionjader/gamma-Kosmos-EVAA-8B/2e20f780-ceab-4d1d-a1ab-35f4f0ac44aa.json deleted file mode 100644 index 84a9fa596..000000000 --- a/data/hfopenllm_v2/jaspionjader/gamma-Kosmos-EVAA-8B/2e20f780-ceab-4d1d-a1ab-35f4f0ac44aa.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_gamma-Kosmos-EVAA-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "gamma-Kosmos-EVAA-8B", - "id": "jaspionjader/gamma-Kosmos-EVAA-8B", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.425 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5253 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0899 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3138 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4412 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3776 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/gamma-Kosmos-EVAA-v2-8B/f21bcd75-fc9f-4266-8976-3227b18b6b32.json b/data/hfopenllm_v2/jaspionjader/gamma-Kosmos-EVAA-v2-8B/f21bcd75-fc9f-4266-8976-3227b18b6b32.json deleted file mode 100644 index fdf1ccc99..000000000 --- 
a/data/hfopenllm_v2/jaspionjader/gamma-Kosmos-EVAA-v2-8B/f21bcd75-fc9f-4266-8976-3227b18b6b32.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_gamma-Kosmos-EVAA-v2-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "gamma-Kosmos-EVAA-v2-8B", - "id": "jaspionjader/gamma-Kosmos-EVAA-v2-8B", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4233 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5262 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1057 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3205 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4344 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3756 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/gamma-Kosmos-EVAA-v3-8B/7c1a81ec-1cb7-4858-8f1f-23b3ee49b73f.json b/data/hfopenllm_v2/jaspionjader/gamma-Kosmos-EVAA-v3-8B/7c1a81ec-1cb7-4858-8f1f-23b3ee49b73f.json deleted file mode 100644 index 9aa3757ce..000000000 --- a/data/hfopenllm_v2/jaspionjader/gamma-Kosmos-EVAA-v3-8B/7c1a81ec-1cb7-4858-8f1f-23b3ee49b73f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/jaspionjader_gamma-Kosmos-EVAA-v3-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "gamma-Kosmos-EVAA-v3-8B", - "id": "jaspionjader/gamma-Kosmos-EVAA-v3-8B", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4333 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5278 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.111 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3129 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4263 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3898 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/knf-2-8b/1cbfd1ad-237d-4cd3-8b5d-3135c194fcc0.json b/data/hfopenllm_v2/jaspionjader/knf-2-8b/1cbfd1ad-237d-4cd3-8b5d-3135c194fcc0.json deleted file mode 100644 index f0dcdddf2..000000000 --- a/data/hfopenllm_v2/jaspionjader/knf-2-8b/1cbfd1ad-237d-4cd3-8b5d-3135c194fcc0.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_knf-2-8b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - 
"evaluator_relationship": "third_party" - }, - "model_info": { - "name": "knf-2-8b", - "id": "jaspionjader/knf-2-8b", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.425 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5207 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1201 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3104 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4185 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3875 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/knfp-2-8b/ef5c1813-a74d-4b3d-9911-c27a46c1c84e.json b/data/hfopenllm_v2/jaspionjader/knfp-2-8b/ef5c1813-a74d-4b3d-9911-c27a46c1c84e.json deleted file mode 100644 index 2ee41f38d..000000000 --- a/data/hfopenllm_v2/jaspionjader/knfp-2-8b/ef5c1813-a74d-4b3d-9911-c27a46c1c84e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_knfp-2-8b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "knfp-2-8b", - "id": "jaspionjader/knfp-2-8b", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - 
"params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5327 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5305 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1427 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2928 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4185 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3726 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/knfp-3-8b/df50857d-c90e-4ec8-a9b6-96a6d2f894b1.json b/data/hfopenllm_v2/jaspionjader/knfp-3-8b/df50857d-c90e-4ec8-a9b6-96a6d2f894b1.json deleted file mode 100644 index 3ed3a2d85..000000000 --- a/data/hfopenllm_v2/jaspionjader/knfp-3-8b/df50857d-c90e-4ec8-a9b6-96a6d2f894b1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_knfp-3-8b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "knfp-3-8b", - "id": "jaspionjader/knfp-3-8b", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4946 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.52 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1224 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.307 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4171 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3881 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/kstc-1-8b/774d54fb-a445-4ed9-b79a-9c1346537e98.json b/data/hfopenllm_v2/jaspionjader/kstc-1-8b/774d54fb-a445-4ed9-b79a-9c1346537e98.json deleted file mode 100644 index 3c1fc3df6..000000000 --- a/data/hfopenllm_v2/jaspionjader/kstc-1-8b/774d54fb-a445-4ed9-b79a-9c1346537e98.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_kstc-1-8b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "kstc-1-8b", - "id": "jaspionjader/kstc-1-8b", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4643 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5209 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1171 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3171 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4158 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3892 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/kstc-11-8b/420b8be3-3560-48e8-8ab3-bb55338a9069.json b/data/hfopenllm_v2/jaspionjader/kstc-11-8b/420b8be3-3560-48e8-8ab3-bb55338a9069.json deleted file mode 100644 index cbbf1f3a0..000000000 --- a/data/hfopenllm_v2/jaspionjader/kstc-11-8b/420b8be3-3560-48e8-8ab3-bb55338a9069.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_kstc-11-8b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "kstc-11-8b", - "id": "jaspionjader/kstc-11-8b", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4757 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5189 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - 
"dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1201 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3029 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4118 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3879 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/kstc-4-8b/c118b75c-597f-48a7-a4eb-675af72c9930.json b/data/hfopenllm_v2/jaspionjader/kstc-4-8b/c118b75c-597f-48a7-a4eb-675af72c9930.json deleted file mode 100644 index 3b5882807..000000000 --- a/data/hfopenllm_v2/jaspionjader/kstc-4-8b/c118b75c-597f-48a7-a4eb-675af72c9930.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_kstc-4-8b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "kstc-4-8b", - "id": "jaspionjader/kstc-4-8b", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.477 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5216 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 
1.0 - }, - "score_details": { - "score": 0.1239 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3037 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4118 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3869 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/kstc-5-8b/e75534d3-b994-4e88-9274-7b62f61916cf.json b/data/hfopenllm_v2/jaspionjader/kstc-5-8b/e75534d3-b994-4e88-9274-7b62f61916cf.json deleted file mode 100644 index 4dec5cab3..000000000 --- a/data/hfopenllm_v2/jaspionjader/kstc-5-8b/e75534d3-b994-4e88-9274-7b62f61916cf.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_kstc-5-8b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "kstc-5-8b", - "id": "jaspionjader/kstc-5-8b", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4721 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5211 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1299 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3154 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4224 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3892 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/kstc-6-8b/770a1ff1-057f-49a7-9402-c6dd881ac03d.json b/data/hfopenllm_v2/jaspionjader/kstc-6-8b/770a1ff1-057f-49a7-9402-c6dd881ac03d.json deleted file mode 100644 index 2000e4ffd..000000000 --- a/data/hfopenllm_v2/jaspionjader/kstc-6-8b/770a1ff1-057f-49a7-9402-c6dd881ac03d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_kstc-6-8b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "kstc-6-8b", - "id": "jaspionjader/kstc-6-8b", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4944 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5231 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1246 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2995 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, 
- "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4105 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3857 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/kstc-8-8b/6cc9790d-9b02-437e-8ac7-be4152f5b17d.json b/data/hfopenllm_v2/jaspionjader/kstc-8-8b/6cc9790d-9b02-437e-8ac7-be4152f5b17d.json deleted file mode 100644 index 3637c9325..000000000 --- a/data/hfopenllm_v2/jaspionjader/kstc-8-8b/6cc9790d-9b02-437e-8ac7-be4152f5b17d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_kstc-8-8b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "kstc-8-8b", - "id": "jaspionjader/kstc-8-8b", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.491 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5239 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1307 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3054 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4211 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - 
"dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3889 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/kstc-9-8b/264f5b42-a3ac-4af1-8145-c5763b8e7fa6.json b/data/hfopenllm_v2/jaspionjader/kstc-9-8b/264f5b42-a3ac-4af1-8145-c5763b8e7fa6.json deleted file mode 100644 index 23d708f9b..000000000 --- a/data/hfopenllm_v2/jaspionjader/kstc-9-8b/264f5b42-a3ac-4af1-8145-c5763b8e7fa6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_kstc-9-8b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "kstc-9-8b", - "id": "jaspionjader/kstc-9-8b", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4861 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5238 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.136 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3012 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4118 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": 
{ - "score": 0.3872 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/slu-10/549db368-437a-4982-ba5b-5c4d7bf203ae.json b/data/hfopenllm_v2/jaspionjader/slu-10/549db368-437a-4982-ba5b-5c4d7bf203ae.json deleted file mode 100644 index bde42f6e8..000000000 --- a/data/hfopenllm_v2/jaspionjader/slu-10/549db368-437a-4982-ba5b-5c4d7bf203ae.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_slu-10/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "slu-10", - "id": "jaspionjader/slu-10", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.436 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5096 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0974 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3138 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.392 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3664 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/slu-11/0d098a19-7e8f-4a52-8466-729be91388d8.json b/data/hfopenllm_v2/jaspionjader/slu-11/0d098a19-7e8f-4a52-8466-729be91388d8.json deleted file mode 100644 index 00f33a8c5..000000000 --- 
a/data/hfopenllm_v2/jaspionjader/slu-11/0d098a19-7e8f-4a52-8466-729be91388d8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_slu-11/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "slu-11", - "id": "jaspionjader/slu-11", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3725 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.489 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0559 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3037 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3919 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3382 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/slu-13/83335f65-25a4-4bec-a901-587567ed0e99.json b/data/hfopenllm_v2/jaspionjader/slu-13/83335f65-25a4-4bec-a901-587567ed0e99.json deleted file mode 100644 index 7dba81194..000000000 --- a/data/hfopenllm_v2/jaspionjader/slu-13/83335f65-25a4-4bec-a901-587567ed0e99.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_slu-13/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF 
Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "slu-13", - "id": "jaspionjader/slu-13", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4378 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5097 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0808 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3079 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3814 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.358 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/slu-14/02fb24c3-927f-4c21-bd47-b883521162a3.json b/data/hfopenllm_v2/jaspionjader/slu-14/02fb24c3-927f-4c21-bd47-b883521162a3.json deleted file mode 100644 index ff8e6a9e1..000000000 --- a/data/hfopenllm_v2/jaspionjader/slu-14/02fb24c3-927f-4c21-bd47-b883521162a3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_slu-14/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "slu-14", - "id": "jaspionjader/slu-14", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - 
"precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4107 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5089 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0974 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3079 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.396 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3627 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/slu-17/2a6507c7-44c1-4416-9ff1-36abd6af3b73.json b/data/hfopenllm_v2/jaspionjader/slu-17/2a6507c7-44c1-4416-9ff1-36abd6af3b73.json deleted file mode 100644 index 5214d57f6..000000000 --- a/data/hfopenllm_v2/jaspionjader/slu-17/2a6507c7-44c1-4416-9ff1-36abd6af3b73.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_slu-17/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "slu-17", - "id": "jaspionjader/slu-17", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4217 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5071 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0853 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3087 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3761 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3619 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/slu-2/327a146a-8cfd-4480-8342-46afde530677.json b/data/hfopenllm_v2/jaspionjader/slu-2/327a146a-8cfd-4480-8342-46afde530677.json deleted file mode 100644 index 746324375..000000000 --- a/data/hfopenllm_v2/jaspionjader/slu-2/327a146a-8cfd-4480-8342-46afde530677.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_slu-2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "slu-2", - "id": "jaspionjader/slu-2", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4016 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - 
"hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5008 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0634 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2987 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3959 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3506 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/slu-20/0700fb7a-e722-432f-a64d-c040bba4deee.json b/data/hfopenllm_v2/jaspionjader/slu-20/0700fb7a-e722-432f-a64d-c040bba4deee.json deleted file mode 100644 index b57d6ab6a..000000000 --- a/data/hfopenllm_v2/jaspionjader/slu-20/0700fb7a-e722-432f-a64d-c040bba4deee.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_slu-20/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "slu-20", - "id": "jaspionjader/slu-20", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4393 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5061 - } - }, - { - "evaluation_name": "MATH Level 5", - 
"source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0869 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3087 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3933 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3665 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/slu-22/131d3a7e-43dd-4189-8466-6562703b3bdd.json b/data/hfopenllm_v2/jaspionjader/slu-22/131d3a7e-43dd-4189-8466-6562703b3bdd.json deleted file mode 100644 index 3b02267ec..000000000 --- a/data/hfopenllm_v2/jaspionjader/slu-22/131d3a7e-43dd-4189-8466-6562703b3bdd.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_slu-22/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "slu-22", - "id": "jaspionjader/slu-22", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4321 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5082 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.0793 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3163 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3893 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.365 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/slu-23/8f6d7008-b8de-4a76-94aa-bbecc93ef3f7.json b/data/hfopenllm_v2/jaspionjader/slu-23/8f6d7008-b8de-4a76-94aa-bbecc93ef3f7.json deleted file mode 100644 index 12d59c50d..000000000 --- a/data/hfopenllm_v2/jaspionjader/slu-23/8f6d7008-b8de-4a76-94aa-bbecc93ef3f7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_slu-23/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "slu-23", - "id": "jaspionjader/slu-23", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4478 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5132 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0944 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": 
false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3045 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4092 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3725 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/slu-25/aadb0ce5-a1aa-4b0d-bec4-8bb0e8e54a1d.json b/data/hfopenllm_v2/jaspionjader/slu-25/aadb0ce5-a1aa-4b0d-bec4-8bb0e8e54a1d.json deleted file mode 100644 index f8a9592be..000000000 --- a/data/hfopenllm_v2/jaspionjader/slu-25/aadb0ce5-a1aa-4b0d-bec4-8bb0e8e54a1d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_slu-25/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "slu-25", - "id": "jaspionjader/slu-25", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.45 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5095 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0838 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3087 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3946 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3684 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/slu-29/a73250f1-399a-4afa-bf83-4036dce78ef3.json b/data/hfopenllm_v2/jaspionjader/slu-29/a73250f1-399a-4afa-bf83-4036dce78ef3.json deleted file mode 100644 index 56321b0e4..000000000 --- a/data/hfopenllm_v2/jaspionjader/slu-29/a73250f1-399a-4afa-bf83-4036dce78ef3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_slu-29/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "slu-29", - "id": "jaspionjader/slu-29", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4431 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5096 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0869 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.307 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3933 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": 
"hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3669 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/slu-32/f68bf680-9626-4952-b95e-12a18fd60820.json b/data/hfopenllm_v2/jaspionjader/slu-32/f68bf680-9626-4952-b95e-12a18fd60820.json deleted file mode 100644 index 3e62ecf10..000000000 --- a/data/hfopenllm_v2/jaspionjader/slu-32/f68bf680-9626-4952-b95e-12a18fd60820.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_slu-32/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "slu-32", - "id": "jaspionjader/slu-32", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4516 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5167 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1073 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.302 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4039 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3766 - } - } - ] -} \ No newline at end of file 
diff --git a/data/hfopenllm_v2/jaspionjader/slu-33/d6a78a5c-4a2e-4370-88f2-d8627a94f1ea.json b/data/hfopenllm_v2/jaspionjader/slu-33/d6a78a5c-4a2e-4370-88f2-d8627a94f1ea.json deleted file mode 100644 index c25031e6e..000000000 --- a/data/hfopenllm_v2/jaspionjader/slu-33/d6a78a5c-4a2e-4370-88f2-d8627a94f1ea.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_slu-33/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "slu-33", - "id": "jaspionjader/slu-33", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4457 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5081 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0997 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3121 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3867 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3679 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/slu-34/7b5eab2e-fba3-47d5-9839-02249c2568c5.json b/data/hfopenllm_v2/jaspionjader/slu-34/7b5eab2e-fba3-47d5-9839-02249c2568c5.json deleted file mode 100644 index 53e4bfd25..000000000 --- 
a/data/hfopenllm_v2/jaspionjader/slu-34/7b5eab2e-fba3-47d5-9839-02249c2568c5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_slu-34/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "slu-34", - "id": "jaspionjader/slu-34", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4351 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5077 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0997 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3079 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.388 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.372 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/slu-35/2acee2c3-4322-4152-8151-c1d571475b7c.json b/data/hfopenllm_v2/jaspionjader/slu-35/2acee2c3-4322-4152-8151-c1d571475b7c.json deleted file mode 100644 index 3720fab6c..000000000 --- a/data/hfopenllm_v2/jaspionjader/slu-35/2acee2c3-4322-4152-8151-c1d571475b7c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_slu-35/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF 
Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "slu-35", - "id": "jaspionjader/slu-35", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4242 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5103 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1012 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3121 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3946 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3676 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/slu-36/67ffb2de-0410-44a2-aad7-4a32e2c49c7d.json b/data/hfopenllm_v2/jaspionjader/slu-36/67ffb2de-0410-44a2-aad7-4a32e2c49c7d.json deleted file mode 100644 index 54b88f06a..000000000 --- a/data/hfopenllm_v2/jaspionjader/slu-36/67ffb2de-0410-44a2-aad7-4a32e2c49c7d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_slu-36/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "slu-36", - "id": "jaspionjader/slu-36", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - 
"precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4518 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5087 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0906 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3121 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3933 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3711 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/slu-37/2923aeb3-982f-400d-9588-707583c75a1d.json b/data/hfopenllm_v2/jaspionjader/slu-37/2923aeb3-982f-400d-9588-707583c75a1d.json deleted file mode 100644 index c1501c97b..000000000 --- a/data/hfopenllm_v2/jaspionjader/slu-37/2923aeb3-982f-400d-9588-707583c75a1d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_slu-37/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "slu-37", - "id": "jaspionjader/slu-37", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4534 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.51 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0974 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.307 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3946 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3695 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/slu-6/b6a622da-5ce8-4ea5-a82a-f3a2a299ddf2.json b/data/hfopenllm_v2/jaspionjader/slu-6/b6a622da-5ce8-4ea5-a82a-f3a2a299ddf2.json deleted file mode 100644 index bdbe422b6..000000000 --- a/data/hfopenllm_v2/jaspionjader/slu-6/b6a622da-5ce8-4ea5-a82a-f3a2a299ddf2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_slu-6/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "slu-6", - "id": "jaspionjader/slu-6", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4117 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": 
"SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5099 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0944 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3029 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4066 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3611 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/slu-mix-1/7b06ac17-bfc6-43d5-99e6-d2b7a31290fb.json b/data/hfopenllm_v2/jaspionjader/slu-mix-1/7b06ac17-bfc6-43d5-99e6-d2b7a31290fb.json deleted file mode 100644 index d2cd73f98..000000000 --- a/data/hfopenllm_v2/jaspionjader/slu-mix-1/7b06ac17-bfc6-43d5-99e6-d2b7a31290fb.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_slu-mix-1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "slu-mix-1", - "id": "jaspionjader/slu-mix-1", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4569 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.524 - } - }, - { - "evaluation_name": "MATH Level 5", - 
"source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1118 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3003 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4277 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.393 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/sof-1/fd481b93-55b2-4831-9be9-1b1b2886fda3.json b/data/hfopenllm_v2/jaspionjader/sof-1/fd481b93-55b2-4831-9be9-1b1b2886fda3.json deleted file mode 100644 index 23e9adbfe..000000000 --- a/data/hfopenllm_v2/jaspionjader/sof-1/fd481b93-55b2-4831-9be9-1b1b2886fda3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_sof-1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "sof-1", - "id": "jaspionjader/sof-1", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4314 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.501 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - 
}, - "score_details": { - "score": 0.114 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3029 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4082 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3674 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/sof-10/f159748f-234e-4962-b582-cd5805448f33.json b/data/hfopenllm_v2/jaspionjader/sof-10/f159748f-234e-4962-b582-cd5805448f33.json deleted file mode 100644 index a665ef820..000000000 --- a/data/hfopenllm_v2/jaspionjader/sof-10/f159748f-234e-4962-b582-cd5805448f33.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_sof-10/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "sof-10", - "id": "jaspionjader/sof-10", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4648 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5197 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1239 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3062 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4091 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3874 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/sof-3/044d53dd-d134-4959-a70c-46f11cc0b300.json b/data/hfopenllm_v2/jaspionjader/sof-3/044d53dd-d134-4959-a70c-46f11cc0b300.json deleted file mode 100644 index 02de1aae3..000000000 --- a/data/hfopenllm_v2/jaspionjader/sof-3/044d53dd-d134-4959-a70c-46f11cc0b300.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_sof-3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "sof-3", - "id": "jaspionjader/sof-3", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4637 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5206 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1276 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.297 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on 
MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4131 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3812 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/sof-6/f05501fd-7c06-46d5-bc20-a9d0cc5c2e0f.json b/data/hfopenllm_v2/jaspionjader/sof-6/f05501fd-7c06-46d5-bc20-a9d0cc5c2e0f.json deleted file mode 100644 index f3a8226cf..000000000 --- a/data/hfopenllm_v2/jaspionjader/sof-6/f05501fd-7c06-46d5-bc20-a9d0cc5c2e0f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_sof-6/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "sof-6", - "id": "jaspionjader/sof-6", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4354 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5209 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1299 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2995 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4171 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3844 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/test-10/5c44a2f2-23e3-4c9f-9b7c-9012ca8b15e9.json b/data/hfopenllm_v2/jaspionjader/test-10/5c44a2f2-23e3-4c9f-9b7c-9012ca8b15e9.json deleted file mode 100644 index 6d72f6368..000000000 --- a/data/hfopenllm_v2/jaspionjader/test-10/5c44a2f2-23e3-4c9f-9b7c-9012ca8b15e9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_test-10/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "test-10", - "id": "jaspionjader/test-10", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 4.015 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4578 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5316 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.114 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3196 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4251 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3936 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/jaspionjader/test-11/80e5134b-0733-41cc-8b4f-ef32fbe57066.json b/data/hfopenllm_v2/jaspionjader/test-11/80e5134b-0733-41cc-8b4f-ef32fbe57066.json deleted file mode 100644 index 91ee0a934..000000000 --- a/data/hfopenllm_v2/jaspionjader/test-11/80e5134b-0733-41cc-8b4f-ef32fbe57066.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_test-11/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "test-11", - "id": "jaspionjader/test-11", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 4.015 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4541 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.535 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1201 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3238 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.429 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3939 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/test-12/61123e41-7b2a-40da-9f7f-b830c27d7f12.json b/data/hfopenllm_v2/jaspionjader/test-12/61123e41-7b2a-40da-9f7f-b830c27d7f12.json deleted file mode 100644 index f704e5a09..000000000 --- 
a/data/hfopenllm_v2/jaspionjader/test-12/61123e41-7b2a-40da-9f7f-b830c27d7f12.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_test-12/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "test-12", - "id": "jaspionjader/test-12", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 4.015 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4368 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5347 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.108 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3188 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.425 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3935 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/test-13/b93c31d7-54c3-47b9-a267-3f8fdb796805.json b/data/hfopenllm_v2/jaspionjader/test-13/b93c31d7-54c3-47b9-a267-3f8fdb796805.json deleted file mode 100644 index d5aea42d0..000000000 --- a/data/hfopenllm_v2/jaspionjader/test-13/b93c31d7-54c3-47b9-a267-3f8fdb796805.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_test-13/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - 
"source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "test-13", - "id": "jaspionjader/test-13", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 4.015 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4581 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5318 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1057 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3163 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4264 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3935 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/test-14/b3eaa4c5-7abc-4e2d-9c11-c70ecb8a843b.json b/data/hfopenllm_v2/jaspionjader/test-14/b3eaa4c5-7abc-4e2d-9c11-c70ecb8a843b.json deleted file mode 100644 index d1c730295..000000000 --- a/data/hfopenllm_v2/jaspionjader/test-14/b3eaa4c5-7abc-4e2d-9c11-c70ecb8a843b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_test-14/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "test-14", - "id": "jaspionjader/test-14", - "developer": "jaspionjader", - "inference_platform": "unknown", - 
"additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 4.015 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4444 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5323 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1103 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3146 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4317 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.393 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/test-15/3b06f75e-3d22-4428-8d4f-2e704b96961e.json b/data/hfopenllm_v2/jaspionjader/test-15/3b06f75e-3d22-4428-8d4f-2e704b96961e.json deleted file mode 100644 index 1f7a65dff..000000000 --- a/data/hfopenllm_v2/jaspionjader/test-15/3b06f75e-3d22-4428-8d4f-2e704b96961e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_test-15/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "test-15", - "id": "jaspionjader/test-15", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 4.015 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4365 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5328 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1118 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3121 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4264 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.393 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/test-16/dfda4aab-f8d4-49ee-b141-78539b69007c.json b/data/hfopenllm_v2/jaspionjader/test-16/dfda4aab-f8d4-49ee-b141-78539b69007c.json deleted file mode 100644 index e35aa96fb..000000000 --- a/data/hfopenllm_v2/jaspionjader/test-16/dfda4aab-f8d4-49ee-b141-78539b69007c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_test-16/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "test-16", - "id": "jaspionjader/test-16", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 4.015 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4599 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - 
"source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.533 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1095 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3138 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4225 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.393 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/test-17/690f3c19-c148-458d-b4c5-87761d72b851.json b/data/hfopenllm_v2/jaspionjader/test-17/690f3c19-c148-458d-b4c5-87761d72b851.json deleted file mode 100644 index ecdd22a51..000000000 --- a/data/hfopenllm_v2/jaspionjader/test-17/690f3c19-c148-458d-b4c5-87761d72b851.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_test-17/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "test-17", - "id": "jaspionjader/test-17", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 4.015 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4267 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5329 - } - }, - { - 
"evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1103 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3129 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.429 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3929 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/test-18/b6a18246-776d-463f-80d5-140df74e9704.json b/data/hfopenllm_v2/jaspionjader/test-18/b6a18246-776d-463f-80d5-140df74e9704.json deleted file mode 100644 index ae471a895..000000000 --- a/data/hfopenllm_v2/jaspionjader/test-18/b6a18246-776d-463f-80d5-140df74e9704.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_test-18/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "test-18", - "id": "jaspionjader/test-18", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4392 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5317 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1148 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3121 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4251 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.393 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/test-19/9831abdc-ad08-48c0-8384-86240e7350b5.json b/data/hfopenllm_v2/jaspionjader/test-19/9831abdc-ad08-48c0-8384-86240e7350b5.json deleted file mode 100644 index acc7c5b32..000000000 --- a/data/hfopenllm_v2/jaspionjader/test-19/9831abdc-ad08-48c0-8384-86240e7350b5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_test-19/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "test-19", - "id": "jaspionjader/test-19", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4401 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5319 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1095 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": 
"Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3096 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4264 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3929 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jaspionjader/test-20/96a572e5-4751-46ce-9202-deb223ef4dfe.json b/data/hfopenllm_v2/jaspionjader/test-20/96a572e5-4751-46ce-9202-deb223ef4dfe.json deleted file mode 100644 index 98974db50..000000000 --- a/data/hfopenllm_v2/jaspionjader/test-20/96a572e5-4751-46ce-9202-deb223ef4dfe.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jaspionjader_test-20/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "test-20", - "id": "jaspionjader/test-20", - "developer": "jaspionjader", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4529 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5327 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1118 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3138 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": 
"TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4251 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.392 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jayasuryajsk/Qwen2.5-3B-reasoner/f4320b1e-ea4f-4aea-8dab-cdb221ce53e5.json b/data/hfopenllm_v2/jayasuryajsk/Qwen2.5-3B-reasoner/f4320b1e-ea4f-4aea-8dab-cdb221ce53e5.json deleted file mode 100644 index b9587c6d6..000000000 --- a/data/hfopenllm_v2/jayasuryajsk/Qwen2.5-3B-reasoner/f4320b1e-ea4f-4aea-8dab-cdb221ce53e5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jayasuryajsk_Qwen2.5-3B-reasoner/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-3B-reasoner", - "id": "jayasuryajsk/Qwen2.5-3B-reasoner", - "developer": "jayasuryajsk", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.086 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.416 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4651 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2085 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.302 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4123 - 
} - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3482 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jeanmichela/o-distil-qwen/8376c0bf-f9c3-4529-b13c-c57106182d15.json b/data/hfopenllm_v2/jeanmichela/o-distil-qwen/8376c0bf-f9c3-4529-b13c-c57106182d15.json deleted file mode 100644 index 214d189b4..000000000 --- a/data/hfopenllm_v2/jeanmichela/o-distil-qwen/8376c0bf-f9c3-4529-b13c-c57106182d15.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jeanmichela_o-distil-qwen/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "o-distil-qwen", - "id": "jeanmichela/o-distil-qwen", - "developer": "jeanmichela", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4482 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.59 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.565 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3935 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.534 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4658 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jebcarter/psyonic-cetacean-20B/97a80145-e621-4603-8ff8-2cc4bd74190a.json b/data/hfopenllm_v2/jebcarter/psyonic-cetacean-20B/97a80145-e621-4603-8ff8-2cc4bd74190a.json deleted file mode 100644 index bb13bcad2..000000000 --- a/data/hfopenllm_v2/jebcarter/psyonic-cetacean-20B/97a80145-e621-4603-8ff8-2cc4bd74190a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jebcarter_psyonic-cetacean-20B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "psyonic-cetacean-20B", - "id": "jebcarter/psyonic-cetacean-20B", - "developer": "jebcarter", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 19.994 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2544 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4907 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0181 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2735 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4661 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2886 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/jebish7/Llama-3-Nanda-10B-Chat/99a7881c-cca0-43d6-96f5-ce5292ed60a0.json b/data/hfopenllm_v2/jebish7/Llama-3-Nanda-10B-Chat/99a7881c-cca0-43d6-96f5-ce5292ed60a0.json deleted file mode 100644 index f63ea2032..000000000 --- a/data/hfopenllm_v2/jebish7/Llama-3-Nanda-10B-Chat/99a7881c-cca0-43d6-96f5-ce5292ed60a0.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jebish7_Llama-3-Nanda-10B-Chat/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-Nanda-10B-Chat", - "id": "jebish7/Llama-3-Nanda-10B-Chat", - "developer": "jebish7", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 9.985 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2953 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4959 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0559 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3012 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4356 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3157 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jebish7/Llama-3.1-8B-Instruct/60ca8f7e-1c20-4adb-bb84-892bad3c0d63.json b/data/hfopenllm_v2/jebish7/Llama-3.1-8B-Instruct/60ca8f7e-1c20-4adb-bb84-892bad3c0d63.json deleted file mode 100644 index d88b01af6..000000000 --- 
a/data/hfopenllm_v2/jebish7/Llama-3.1-8B-Instruct/60ca8f7e-1c20-4adb-bb84-892bad3c0d63.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jebish7_Llama-3.1-8B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.1-8B-Instruct", - "id": "jebish7/Llama-3.1-8B-Instruct", - "developer": "jebish7", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5058 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5088 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1548 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3213 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3998 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3777 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jebish7/Nemotron-4-Mini-Hindi-4B-Base/4a0f8dc7-9446-4dda-bf49-8cca4851746c.json b/data/hfopenllm_v2/jebish7/Nemotron-4-Mini-Hindi-4B-Base/4a0f8dc7-9446-4dda-bf49-8cca4851746c.json deleted file mode 100644 index 9ffd03cf9..000000000 --- a/data/hfopenllm_v2/jebish7/Nemotron-4-Mini-Hindi-4B-Base/4a0f8dc7-9446-4dda-bf49-8cca4851746c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/jebish7_Nemotron-4-Mini-Hindi-4B-Base/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Nemotron-4-Mini-Hindi-4B-Base", - "id": "jebish7/Nemotron-4-Mini-Hindi-4B-Base", - "developer": "jebish7", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "NemotronForCausalLM", - "params_billions": 4.191 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2285 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3924 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0272 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2836 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4249 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2503 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jebish7/Nemotron-4-Mini-Hindi-4B-Instruct/6eb3a040-8234-4d31-8274-6987b0e4e3b4.json b/data/hfopenllm_v2/jebish7/Nemotron-4-Mini-Hindi-4B-Instruct/6eb3a040-8234-4d31-8274-6987b0e4e3b4.json deleted file mode 100644 index bd15b91c4..000000000 --- a/data/hfopenllm_v2/jebish7/Nemotron-4-Mini-Hindi-4B-Instruct/6eb3a040-8234-4d31-8274-6987b0e4e3b4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jebish7_Nemotron-4-Mini-Hindi-4B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - 
"source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Nemotron-4-Mini-Hindi-4B-Instruct", - "id": "jebish7/Nemotron-4-Mini-Hindi-4B-Instruct", - "developer": "jebish7", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "NemotronForCausalLM", - "params_billions": 4.191 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3345 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4041 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0287 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3087 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4153 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2595 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jebish7/Nemotron-Mini-4B-Instruct/16053077-38fd-4136-81a5-fea0d4cd927a.json b/data/hfopenllm_v2/jebish7/Nemotron-Mini-4B-Instruct/16053077-38fd-4136-81a5-fea0d4cd927a.json deleted file mode 100644 index b0a9fe425..000000000 --- a/data/hfopenllm_v2/jebish7/Nemotron-Mini-4B-Instruct/16053077-38fd-4136-81a5-fea0d4cd927a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jebish7_Nemotron-Mini-4B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Nemotron-Mini-4B-Instruct", - "id": 
"jebish7/Nemotron-Mini-4B-Instruct", - "developer": "jebish7", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "NemotronForCausalLM", - "params_billions": 4.191 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3709 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4244 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0325 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.276 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4727 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2783 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jebish7/aya-expanse-8b/25abb99f-536e-4638-8611-a1db5dee931d.json b/data/hfopenllm_v2/jebish7/aya-expanse-8b/25abb99f-536e-4638-8611-a1db5dee931d.json deleted file mode 100644 index d66b00bb7..000000000 --- a/data/hfopenllm_v2/jebish7/aya-expanse-8b/25abb99f-536e-4638-8611-a1db5dee931d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jebish7_aya-expanse-8b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "aya-expanse-8b", - "id": "jebish7/aya-expanse-8b", - "developer": "jebish7", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "CohereForCausalLM", - "params_billions": 8.028 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", 
- "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3791 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4969 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0816 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2836 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3869 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3103 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jebish7/gemma-2-2b-it/aaf0e5bd-b033-455e-bb23-b12b6f7c4520.json b/data/hfopenllm_v2/jebish7/gemma-2-2b-it/aaf0e5bd-b033-455e-bb23-b12b6f7c4520.json deleted file mode 100644 index d43d0aaca..000000000 --- a/data/hfopenllm_v2/jebish7/gemma-2-2b-it/aaf0e5bd-b033-455e-bb23-b12b6f7c4520.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jebish7_gemma-2-2b-it/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "gemma-2-2b-it", - "id": "jebish7/gemma-2-2b-it", - "developer": "jebish7", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 2.614 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.1272 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4395 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.034 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.297 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4244 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2715 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jebish7/gemma-2-9b-it/b3a46478-c5f4-4c74-9bf0-d1ba616ae24c.json b/data/hfopenllm_v2/jebish7/gemma-2-9b-it/b3a46478-c5f4-4c74-9bf0-d1ba616ae24c.json deleted file mode 100644 index 2343609a8..000000000 --- a/data/hfopenllm_v2/jebish7/gemma-2-9b-it/b3a46478-c5f4-4c74-9bf0-d1ba616ae24c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jebish7_gemma-2-9b-it/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "gemma-2-9b-it", - "id": "jebish7/gemma-2-9b-it", - "developer": "jebish7", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 9.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1557 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5949 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0846 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3473 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4554 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4143 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jebish7/qwen2.5-0.5B-IHA-Hin/169fb05f-5201-47b8-a06e-7d01e574c689.json b/data/hfopenllm_v2/jebish7/qwen2.5-0.5B-IHA-Hin/169fb05f-5201-47b8-a06e-7d01e574c689.json deleted file mode 100644 index 8d24c633c..000000000 --- a/data/hfopenllm_v2/jebish7/qwen2.5-0.5B-IHA-Hin/169fb05f-5201-47b8-a06e-7d01e574c689.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jebish7_qwen2.5-0.5B-IHA-Hin/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "qwen2.5-0.5B-IHA-Hin", - "id": "jebish7/qwen2.5-0.5B-IHA-Hin", - "developer": "jebish7", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.494 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1416 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2989 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - 
"hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2525 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3475 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1094 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jeffmeloy/Qwen-7B-nerd-uncensored-v1.0/db076309-32e5-4d46-9786-ff14f8daf5d2.json b/data/hfopenllm_v2/jeffmeloy/Qwen-7B-nerd-uncensored-v1.0/db076309-32e5-4d46-9786-ff14f8daf5d2.json deleted file mode 100644 index 88151c7d8..000000000 --- a/data/hfopenllm_v2/jeffmeloy/Qwen-7B-nerd-uncensored-v1.0/db076309-32e5-4d46-9786-ff14f8daf5d2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jeffmeloy_Qwen-7B-nerd-uncensored-v1.0/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen-7B-nerd-uncensored-v1.0", - "id": "jeffmeloy/Qwen-7B-nerd-uncensored-v1.0", - "developer": "jeffmeloy", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6136 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5421 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.287 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.328 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4793 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4363 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jeffmeloy/Qwen2.5-7B-minperplexity-2/cde914dc-7d57-425f-9787-e4b8d36d61cf.json b/data/hfopenllm_v2/jeffmeloy/Qwen2.5-7B-minperplexity-2/cde914dc-7d57-425f-9787-e4b8d36d61cf.json deleted file mode 100644 index 470df5e03..000000000 --- a/data/hfopenllm_v2/jeffmeloy/Qwen2.5-7B-minperplexity-2/cde914dc-7d57-425f-9787-e4b8d36d61cf.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jeffmeloy_Qwen2.5-7B-minperplexity-2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-7B-minperplexity-2", - "id": "jeffmeloy/Qwen2.5-7B-minperplexity-2", - "developer": "jeffmeloy", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5097 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5524 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3014 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - 
"hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3112 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4625 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4346 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jeffmeloy/Qwen2.5-7B-nerd-uncensored-v0.9/5d793ce3-a7fd-4ee3-b32c-c9da63ec0566.json b/data/hfopenllm_v2/jeffmeloy/Qwen2.5-7B-nerd-uncensored-v0.9/5d793ce3-a7fd-4ee3-b32c-c9da63ec0566.json deleted file mode 100644 index b70182eb1..000000000 --- a/data/hfopenllm_v2/jeffmeloy/Qwen2.5-7B-nerd-uncensored-v0.9/5d793ce3-a7fd-4ee3-b32c-c9da63ec0566.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jeffmeloy_Qwen2.5-7B-nerd-uncensored-v0.9/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-7B-nerd-uncensored-v0.9", - "id": "jeffmeloy/Qwen2.5-7B-nerd-uncensored-v0.9", - "developer": "jeffmeloy", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6048 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.547 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2946 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 
0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.323 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.482 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4363 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jeffmeloy/Qwen2.5-7B-nerd-uncensored-v1.0/8c645c9f-02f6-44a5-b295-d6364ed49464.json b/data/hfopenllm_v2/jeffmeloy/Qwen2.5-7B-nerd-uncensored-v1.0/8c645c9f-02f6-44a5-b295-d6364ed49464.json deleted file mode 100644 index 5f5c44741..000000000 --- a/data/hfopenllm_v2/jeffmeloy/Qwen2.5-7B-nerd-uncensored-v1.0/8c645c9f-02f6-44a5-b295-d6364ed49464.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jeffmeloy_Qwen2.5-7B-nerd-uncensored-v1.0/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-7B-nerd-uncensored-v1.0", - "id": "jeffmeloy/Qwen2.5-7B-nerd-uncensored-v1.0", - "developer": "jeffmeloy", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7695 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5418 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4713 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2903 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": 
"hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4551 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4254 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jeffmeloy/Qwen2.5-7B-nerd-uncensored-v1.1/97bb5519-e2d3-44d5-abf4-b5263c2b3245.json b/data/hfopenllm_v2/jeffmeloy/Qwen2.5-7B-nerd-uncensored-v1.1/97bb5519-e2d3-44d5-abf4-b5263c2b3245.json deleted file mode 100644 index 87afc83a8..000000000 --- a/data/hfopenllm_v2/jeffmeloy/Qwen2.5-7B-nerd-uncensored-v1.1/97bb5519-e2d3-44d5-abf4-b5263c2b3245.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jeffmeloy_Qwen2.5-7B-nerd-uncensored-v1.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-7B-nerd-uncensored-v1.1", - "id": "jeffmeloy/Qwen2.5-7B-nerd-uncensored-v1.1", - "developer": "jeffmeloy", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6626 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4864 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1329 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2869 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", 
- "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3843 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.385 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jeffmeloy/Qwen2.5-7B-nerd-uncensored-v1.2/bd3d78d3-3ff1-4a92-a316-e4e30787a331.json b/data/hfopenllm_v2/jeffmeloy/Qwen2.5-7B-nerd-uncensored-v1.2/bd3d78d3-3ff1-4a92-a316-e4e30787a331.json deleted file mode 100644 index edab0f5d7..000000000 --- a/data/hfopenllm_v2/jeffmeloy/Qwen2.5-7B-nerd-uncensored-v1.2/bd3d78d3-3ff1-4a92-a316-e4e30787a331.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jeffmeloy_Qwen2.5-7B-nerd-uncensored-v1.2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-7B-nerd-uncensored-v1.2", - "id": "jeffmeloy/Qwen2.5-7B-nerd-uncensored-v1.2", - "developer": "jeffmeloy", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4965 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4946 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1208 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3037 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4172 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": 
"MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3969 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jeffmeloy/Qwen2.5-7B-nerd-uncensored-v1.3/d8951ed7-f4ef-49ce-891e-8d8509e9cf93.json b/data/hfopenllm_v2/jeffmeloy/Qwen2.5-7B-nerd-uncensored-v1.3/d8951ed7-f4ef-49ce-891e-8d8509e9cf93.json deleted file mode 100644 index ae6f30f79..000000000 --- a/data/hfopenllm_v2/jeffmeloy/Qwen2.5-7B-nerd-uncensored-v1.3/d8951ed7-f4ef-49ce-891e-8d8509e9cf93.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jeffmeloy_Qwen2.5-7B-nerd-uncensored-v1.3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-7B-nerd-uncensored-v1.3", - "id": "jeffmeloy/Qwen2.5-7B-nerd-uncensored-v1.3", - "developer": "jeffmeloy", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4995 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5026 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1231 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3129 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4187 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": 
false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4016 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jeffmeloy/Qwen2.5-7B-nerd-uncensored-v1.4/e1772d6c-fd26-43a7-82b3-7997d8a6809f.json b/data/hfopenllm_v2/jeffmeloy/Qwen2.5-7B-nerd-uncensored-v1.4/e1772d6c-fd26-43a7-82b3-7997d8a6809f.json deleted file mode 100644 index 130b5f1a1..000000000 --- a/data/hfopenllm_v2/jeffmeloy/Qwen2.5-7B-nerd-uncensored-v1.4/e1772d6c-fd26-43a7-82b3-7997d8a6809f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jeffmeloy_Qwen2.5-7B-nerd-uncensored-v1.4/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-7B-nerd-uncensored-v1.4", - "id": "jeffmeloy/Qwen2.5-7B-nerd-uncensored-v1.4", - "developer": "jeffmeloy", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6079 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5467 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.281 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3238 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4714 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4419 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/jeffmeloy/Qwen2.5-7B-nerd-uncensored-v1.5/febaf893-6aaf-4c87-89fc-cc865ebf2859.json b/data/hfopenllm_v2/jeffmeloy/Qwen2.5-7B-nerd-uncensored-v1.5/febaf893-6aaf-4c87-89fc-cc865ebf2859.json deleted file mode 100644 index 41e373c25..000000000 --- a/data/hfopenllm_v2/jeffmeloy/Qwen2.5-7B-nerd-uncensored-v1.5/febaf893-6aaf-4c87-89fc-cc865ebf2859.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jeffmeloy_Qwen2.5-7B-nerd-uncensored-v1.5/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-7B-nerd-uncensored-v1.5", - "id": "jeffmeloy/Qwen2.5-7B-nerd-uncensored-v1.5", - "developer": "jeffmeloy", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.565 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5523 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2757 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3272 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4982 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4448 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jeffmeloy/Qwen2.5-7B-nerd-uncensored-v1.7/0ad591f4-c846-4fd1-8536-a169e0a7e4ab.json 
b/data/hfopenllm_v2/jeffmeloy/Qwen2.5-7B-nerd-uncensored-v1.7/0ad591f4-c846-4fd1-8536-a169e0a7e4ab.json deleted file mode 100644 index adea07aff..000000000 --- a/data/hfopenllm_v2/jeffmeloy/Qwen2.5-7B-nerd-uncensored-v1.7/0ad591f4-c846-4fd1-8536-a169e0a7e4ab.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jeffmeloy_Qwen2.5-7B-nerd-uncensored-v1.7/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-7B-nerd-uncensored-v1.7", - "id": "jeffmeloy/Qwen2.5-7B-nerd-uncensored-v1.7", - "developer": "jeffmeloy", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4202 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5392 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2915 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3238 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4848 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.428 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jeffmeloy/Qwen2.5-7B-nerd-uncensored-v1.8/0a318ebd-7bbb-456b-a6e4-9b480a858b5e.json b/data/hfopenllm_v2/jeffmeloy/Qwen2.5-7B-nerd-uncensored-v1.8/0a318ebd-7bbb-456b-a6e4-9b480a858b5e.json deleted file mode 100644 index b0715c0fe..000000000 --- 
a/data/hfopenllm_v2/jeffmeloy/Qwen2.5-7B-nerd-uncensored-v1.8/0a318ebd-7bbb-456b-a6e4-9b480a858b5e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jeffmeloy_Qwen2.5-7B-nerd-uncensored-v1.8/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-7B-nerd-uncensored-v1.8", - "id": "jeffmeloy/Qwen2.5-7B-nerd-uncensored-v1.8", - "developer": "jeffmeloy", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6256 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5447 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2704 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3238 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4767 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4343 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jeffmeloy/Qwen2.5-7B-olm-v1.0/e1cfdc32-3c5e-4f4b-a205-f416c96cf5e6.json b/data/hfopenllm_v2/jeffmeloy/Qwen2.5-7B-olm-v1.0/e1cfdc32-3c5e-4f4b-a205-f416c96cf5e6.json deleted file mode 100644 index ff3474fac..000000000 --- a/data/hfopenllm_v2/jeffmeloy/Qwen2.5-7B-olm-v1.0/e1cfdc32-3c5e-4f4b-a205-f416c96cf5e6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/jeffmeloy_Qwen2.5-7B-olm-v1.0/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-7B-olm-v1.0", - "id": "jeffmeloy/Qwen2.5-7B-olm-v1.0", - "developer": "jeffmeloy", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5331 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.566 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2863 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3205 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4278 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4566 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jeffmeloy/Qwen2.5-7B-olm-v1.1/85426280-8138-46d0-a111-b59b0d7c86c8.json b/data/hfopenllm_v2/jeffmeloy/Qwen2.5-7B-olm-v1.1/85426280-8138-46d0-a111-b59b0d7c86c8.json deleted file mode 100644 index 9cbcd9f74..000000000 --- a/data/hfopenllm_v2/jeffmeloy/Qwen2.5-7B-olm-v1.1/85426280-8138-46d0-a111-b59b0d7c86c8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jeffmeloy_Qwen2.5-7B-olm-v1.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", 
- "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-7B-olm-v1.1", - "id": "jeffmeloy/Qwen2.5-7B-olm-v1.1", - "developer": "jeffmeloy", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4329 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5478 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3829 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3087 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4808 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4354 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jeffmeloy/Qwen2.5-7B-olm-v1.2/32bbd26e-05e7-4a0f-a491-8f54cea9f3d3.json b/data/hfopenllm_v2/jeffmeloy/Qwen2.5-7B-olm-v1.2/32bbd26e-05e7-4a0f-a491-8f54cea9f3d3.json deleted file mode 100644 index ed920c47e..000000000 --- a/data/hfopenllm_v2/jeffmeloy/Qwen2.5-7B-olm-v1.2/32bbd26e-05e7-4a0f-a491-8f54cea9f3d3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jeffmeloy_Qwen2.5-7B-olm-v1.2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-7B-olm-v1.2", - "id": "jeffmeloy/Qwen2.5-7B-olm-v1.2", - "developer": "jeffmeloy", - "inference_platform": "unknown", - "additional_details": { - "precision": 
"bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4203 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5533 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2847 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3171 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4688 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4387 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jeffmeloy/Qwen2.5-7B-olm-v1.3/86ed6833-ae85-4a8e-b840-b0c9540083ce.json b/data/hfopenllm_v2/jeffmeloy/Qwen2.5-7B-olm-v1.3/86ed6833-ae85-4a8e-b840-b0c9540083ce.json deleted file mode 100644 index c0cbb0d3b..000000000 --- a/data/hfopenllm_v2/jeffmeloy/Qwen2.5-7B-olm-v1.3/86ed6833-ae85-4a8e-b840-b0c9540083ce.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jeffmeloy_Qwen2.5-7B-olm-v1.3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-7B-olm-v1.3", - "id": "jeffmeloy/Qwen2.5-7B-olm-v1.3", - "developer": "jeffmeloy", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": 
"google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4219 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5532 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3104 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3213 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4701 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.447 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jeffmeloy/Qwen2.5-7B-olm-v1.4/2f751ac3-5ca5-4d0d-9ad4-48155e51468a.json b/data/hfopenllm_v2/jeffmeloy/Qwen2.5-7B-olm-v1.4/2f751ac3-5ca5-4d0d-9ad4-48155e51468a.json deleted file mode 100644 index 120c4b77c..000000000 --- a/data/hfopenllm_v2/jeffmeloy/Qwen2.5-7B-olm-v1.4/2f751ac3-5ca5-4d0d-9ad4-48155e51468a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jeffmeloy_Qwen2.5-7B-olm-v1.4/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-7B-olm-v1.4", - "id": "jeffmeloy/Qwen2.5-7B-olm-v1.4", - "developer": "jeffmeloy", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4545 - } - }, - { - 
"evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5582 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2923 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3121 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4622 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4457 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jeffmeloy/Qwen2.5-7B-olm-v1.5/9677e68d-afda-4917-825c-83318219ff59.json b/data/hfopenllm_v2/jeffmeloy/Qwen2.5-7B-olm-v1.5/9677e68d-afda-4917-825c-83318219ff59.json deleted file mode 100644 index c40b034ac..000000000 --- a/data/hfopenllm_v2/jeffmeloy/Qwen2.5-7B-olm-v1.5/9677e68d-afda-4917-825c-83318219ff59.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jeffmeloy_Qwen2.5-7B-olm-v1.5/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-7B-olm-v1.5", - "id": "jeffmeloy/Qwen2.5-7B-olm-v1.5", - "developer": "jeffmeloy", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4547 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5544 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2817 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3398 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4539 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4399 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jeffmeloy/jeffmeloy_Qwen2.5-7B-minperplexity-1/23cd57c2-bf7f-440a-ab3e-edfdede5e8cd.json b/data/hfopenllm_v2/jeffmeloy/jeffmeloy_Qwen2.5-7B-minperplexity-1/23cd57c2-bf7f-440a-ab3e-edfdede5e8cd.json deleted file mode 100644 index a05b1551c..000000000 --- a/data/hfopenllm_v2/jeffmeloy/jeffmeloy_Qwen2.5-7B-minperplexity-1/23cd57c2-bf7f-440a-ab3e-edfdede5e8cd.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jeffmeloy_jeffmeloy_Qwen2.5-7B-minperplexity-1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "jeffmeloy_Qwen2.5-7B-minperplexity-1", - "id": "jeffmeloy/jeffmeloy_Qwen2.5-7B-minperplexity-1", - "developer": "jeffmeloy", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3757 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5582 - } - }, - { - "evaluation_name": 
"MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2915 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3322 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.429 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4368 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jeonsworld/CarbonVillain-en-10.7B-v4/bec23315-f98a-4211-81a0-c49f395e66c9.json b/data/hfopenllm_v2/jeonsworld/CarbonVillain-en-10.7B-v4/bec23315-f98a-4211-81a0-c49f395e66c9.json deleted file mode 100644 index 186c8ce3d..000000000 --- a/data/hfopenllm_v2/jeonsworld/CarbonVillain-en-10.7B-v4/bec23315-f98a-4211-81a0-c49f395e66c9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jeonsworld_CarbonVillain-en-10.7B-v4/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "CarbonVillain-en-10.7B-v4", - "id": "jeonsworld/CarbonVillain-en-10.7B-v4", - "developer": "jeonsworld", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 10.732 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4579 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5168 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": 
"Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0468 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3062 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3965 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3142 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jiangxinyang-shanda/Homer-LLama3-8B/1ac5faef-7fa0-4b58-a6ba-0c444a2023a8.json b/data/hfopenllm_v2/jiangxinyang-shanda/Homer-LLama3-8B/1ac5faef-7fa0-4b58-a6ba-0c444a2023a8.json deleted file mode 100644 index 122a2bdfb..000000000 --- a/data/hfopenllm_v2/jiangxinyang-shanda/Homer-LLama3-8B/1ac5faef-7fa0-4b58-a6ba-0c444a2023a8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jiangxinyang-shanda_Homer-LLama3-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Homer-LLama3-8B", - "id": "jiangxinyang-shanda/Homer-LLama3-8B", - "developer": "jiangxinyang-shanda", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3992 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5173 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0861 - } - }, - { - "evaluation_name": 
"GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.297 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4056 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3139 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jieliu/Storm-7B/39327803-11e7-4b28-8750-81feb027e8f3.json b/data/hfopenllm_v2/jieliu/Storm-7B/39327803-11e7-4b28-8750-81feb027e8f3.json deleted file mode 100644 index eaad1d2e4..000000000 --- a/data/hfopenllm_v2/jieliu/Storm-7B/39327803-11e7-4b28-8750-81feb027e8f3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jieliu_Storm-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Storm-7B", - "id": "jieliu/Storm-7B", - "developer": "jieliu", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3424 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5187 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0612 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3079 
- } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4429 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3119 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jiviai/medX_v2/ce2b6874-0fc8-4364-a526-7b25b101e1e3.json b/data/hfopenllm_v2/jiviai/medX_v2/ce2b6874-0fc8-4364-a526-7b25b101e1e3.json deleted file mode 100644 index 57f4909ab..000000000 --- a/data/hfopenllm_v2/jiviai/medX_v2/ce2b6874-0fc8-4364-a526-7b25b101e1e3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jiviai_medX_v2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "medX_v2", - "id": "jiviai/medX_v2", - "developer": "jiviai", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3743 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4509 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0544 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.323 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.3498 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3428 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jlzhou/Qwen2.5-3B-Infinity-Instruct-0625/9f9ebc90-31f9-45c1-b9c2-07b727b12f3d.json b/data/hfopenllm_v2/jlzhou/Qwen2.5-3B-Infinity-Instruct-0625/9f9ebc90-31f9-45c1-b9c2-07b727b12f3d.json deleted file mode 100644 index bacdbbca7..000000000 --- a/data/hfopenllm_v2/jlzhou/Qwen2.5-3B-Infinity-Instruct-0625/9f9ebc90-31f9-45c1-b9c2-07b727b12f3d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jlzhou_Qwen2.5-3B-Infinity-Instruct-0625/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-3B-Infinity-Instruct-0625", - "id": "jlzhou/Qwen2.5-3B-Infinity-Instruct-0625", - "developer": "jlzhou", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.086 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3558 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4774 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1367 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2693 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3981 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": 
"TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3199 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_breadcrumbs-density-0.1-gamma-0.01/d189a2fc-71f5-4bc9-a0b1-7e744a19921f.json b/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_breadcrumbs-density-0.1-gamma-0.01/d189a2fc-71f5-4bc9-a0b1-7e744a19921f.json deleted file mode 100644 index c7139a751..000000000 --- a/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_breadcrumbs-density-0.1-gamma-0.01/d189a2fc-71f5-4bc9-a0b1-7e744a19921f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/johnsutor_Llama-3-8B-Instruct_breadcrumbs-density-0.1-gamma-0.01/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-8B-Instruct_breadcrumbs-density-0.1-gamma-0.01", - "id": "johnsutor/Llama-3-8B-Instruct_breadcrumbs-density-0.1-gamma-0.01", - "developer": "johnsutor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4271 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5036 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0453 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3221 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4638 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3739 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_breadcrumbs-density-0.1-gamma-0.1/1eb697fe-9dd4-4a41-aa47-33456df39e2d.json b/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_breadcrumbs-density-0.1-gamma-0.1/1eb697fe-9dd4-4a41-aa47-33456df39e2d.json deleted file mode 100644 index a10daf5d6..000000000 --- a/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_breadcrumbs-density-0.1-gamma-0.1/1eb697fe-9dd4-4a41-aa47-33456df39e2d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/johnsutor_Llama-3-8B-Instruct_breadcrumbs-density-0.1-gamma-0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-8B-Instruct_breadcrumbs-density-0.1-gamma-0.1", - "id": "johnsutor/Llama-3-8B-Instruct_breadcrumbs-density-0.1-gamma-0.1", - "developer": "johnsutor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4253 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5019 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0967 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3012 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.415 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3724 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_breadcrumbs-density-0.3-gamma-0.01/5f10df7b-cd2c-44ca-b13a-2852483c71f8.json b/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_breadcrumbs-density-0.3-gamma-0.01/5f10df7b-cd2c-44ca-b13a-2852483c71f8.json deleted file mode 100644 index c3e92c793..000000000 --- a/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_breadcrumbs-density-0.3-gamma-0.01/5f10df7b-cd2c-44ca-b13a-2852483c71f8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/johnsutor_Llama-3-8B-Instruct_breadcrumbs-density-0.3-gamma-0.01/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-8B-Instruct_breadcrumbs-density-0.3-gamma-0.01", - "id": "johnsutor/Llama-3-8B-Instruct_breadcrumbs-density-0.3-gamma-0.01", - "developer": "johnsutor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3377 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4917 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0106 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3121 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5018 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on 
MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3533 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_breadcrumbs-density-0.3-gamma-0.1/3abbb4b6-8050-44fd-b066-0f061ce2f4d7.json b/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_breadcrumbs-density-0.3-gamma-0.1/3abbb4b6-8050-44fd-b066-0f061ce2f4d7.json deleted file mode 100644 index 8e8a04411..000000000 --- a/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_breadcrumbs-density-0.3-gamma-0.1/3abbb4b6-8050-44fd-b066-0f061ce2f4d7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/johnsutor_Llama-3-8B-Instruct_breadcrumbs-density-0.3-gamma-0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-8B-Instruct_breadcrumbs-density-0.3-gamma-0.1", - "id": "johnsutor/Llama-3-8B-Instruct_breadcrumbs-density-0.3-gamma-0.1", - "developer": "johnsutor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4274 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5126 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0808 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3087 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4226 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3739 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_breadcrumbs-density-0.5-gamma-0.01/5f47e65d-293f-469e-a18f-5627ca1adf44.json b/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_breadcrumbs-density-0.5-gamma-0.01/5f47e65d-293f-469e-a18f-5627ca1adf44.json deleted file mode 100644 index f7941ba70..000000000 --- a/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_breadcrumbs-density-0.5-gamma-0.01/5f47e65d-293f-469e-a18f-5627ca1adf44.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/johnsutor_Llama-3-8B-Instruct_breadcrumbs-density-0.5-gamma-0.01/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-8B-Instruct_breadcrumbs-density-0.5-gamma-0.01", - "id": "johnsutor/Llama-3-8B-Instruct_breadcrumbs-density-0.5-gamma-0.01", - "developer": "johnsutor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3204 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4884 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0038 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.302 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5098 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3344 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_breadcrumbs-density-0.5-gamma-0.1/b753c1aa-8a0c-4600-99ec-8eb51ab50da7.json b/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_breadcrumbs-density-0.5-gamma-0.1/b753c1aa-8a0c-4600-99ec-8eb51ab50da7.json deleted file mode 100644 index 9fd6df7ed..000000000 --- a/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_breadcrumbs-density-0.5-gamma-0.1/b753c1aa-8a0c-4600-99ec-8eb51ab50da7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/johnsutor_Llama-3-8B-Instruct_breadcrumbs-density-0.5-gamma-0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-8B-Instruct_breadcrumbs-density-0.5-gamma-0.1", - "id": "johnsutor/Llama-3-8B-Instruct_breadcrumbs-density-0.5-gamma-0.1", - "developer": "johnsutor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4396 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.514 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0801 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.307 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4398 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.3696 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_breadcrumbs-density-0.7-gamma-0.01/15c21655-9af8-4bee-9884-b047683e9adf.json b/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_breadcrumbs-density-0.7-gamma-0.01/15c21655-9af8-4bee-9884-b047683e9adf.json deleted file mode 100644 index 2d9d72344..000000000 --- a/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_breadcrumbs-density-0.7-gamma-0.01/15c21655-9af8-4bee-9884-b047683e9adf.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/johnsutor_Llama-3-8B-Instruct_breadcrumbs-density-0.7-gamma-0.01/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-8B-Instruct_breadcrumbs-density-0.7-gamma-0.01", - "id": "johnsutor/Llama-3-8B-Instruct_breadcrumbs-density-0.7-gamma-0.01", - "developer": "johnsutor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2814 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4854 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0023 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2903 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5163 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3295 - 
} - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_breadcrumbs-density-0.7-gamma-0.1/f642de95-218a-4db0-807f-1bb97618b4f6.json b/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_breadcrumbs-density-0.7-gamma-0.1/f642de95-218a-4db0-807f-1bb97618b4f6.json deleted file mode 100644 index 0a6e5d576..000000000 --- a/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_breadcrumbs-density-0.7-gamma-0.1/f642de95-218a-4db0-807f-1bb97618b4f6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/johnsutor_Llama-3-8B-Instruct_breadcrumbs-density-0.7-gamma-0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-8B-Instruct_breadcrumbs-density-0.7-gamma-0.1", - "id": "johnsutor/Llama-3-8B-Instruct_breadcrumbs-density-0.7-gamma-0.1", - "developer": "johnsutor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4302 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5157 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0627 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3079 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4332 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3663 - } - } - ] -} \ No newline at end of file 
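Every record deleted in this patch shares the schema-0.2.0 shape visible in the hunks above: top-level source_metadata and model_info blocks, plus an evaluation_results array in which each entry pairs a metric_config with a score_details.score. As a minimal sketch of how one of these files could be read back from a checkout predating this commit (Python, standard library only; load_scores is a hypothetical helper, not repository tooling, and the unweighted mean is purely illustrative — the leaderboard's own aggregate may be computed differently):

    import json
    from pathlib import Path
    from statistics import mean

    def load_scores(path: Path) -> dict[str, float]:
        """Map evaluation_name -> score for one schema-0.2.0 record."""
        record = json.loads(path.read_text())
        assert record["schema_version"] == "0.2.0"
        return {
            result["evaluation_name"]: result["score_details"]["score"]
            for result in record["evaluation_results"]
        }

    # Using the f642de95 record removed just above as an example:
    path = Path(
        "data/hfopenllm_v2/johnsutor/"
        "Llama-3-8B-Instruct_breadcrumbs-density-0.7-gamma-0.1/"
        "f642de95-218a-4db0-807f-1bb97618b4f6.json"
    )
    scores = load_scores(path)
    print(scores["MMLU-PRO"])               # 0.3663 in this record
    print(round(mean(scores.values()), 4))  # 0.3527, plain mean of the six benchmarks

Keying the result by evaluation_name makes the lookup independent of array order. For that record the six scores are IFEval 0.4302, BBH 0.5157, MATH Level 5 0.0627, GPQA 0.3079, MUSR 0.4332, and MMLU-PRO 0.3663, so the unweighted mean comes out to 0.3527.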
diff --git a/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_breadcrumbs-density-0.9-gamma-0.01/01443b06-9ad3-41f5-ae0d-bc84086e0a0d.json b/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_breadcrumbs-density-0.9-gamma-0.01/01443b06-9ad3-41f5-ae0d-bc84086e0a0d.json deleted file mode 100644 index 960cb299c..000000000 --- a/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_breadcrumbs-density-0.9-gamma-0.01/01443b06-9ad3-41f5-ae0d-bc84086e0a0d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/johnsutor_Llama-3-8B-Instruct_breadcrumbs-density-0.9-gamma-0.01/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-8B-Instruct_breadcrumbs-density-0.9-gamma-0.01", - "id": "johnsutor/Llama-3-8B-Instruct_breadcrumbs-density-0.9-gamma-0.01", - "developer": "johnsutor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.279 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4861 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0015 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2945 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.515 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3305 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_breadcrumbs-density-0.9-gamma-0.1/1ee8c377-2236-4225-942f-ef8ce5770741.json b/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_breadcrumbs-density-0.9-gamma-0.1/1ee8c377-2236-4225-942f-ef8ce5770741.json deleted file mode 100644 index 25f0a4caa..000000000 --- a/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_breadcrumbs-density-0.9-gamma-0.1/1ee8c377-2236-4225-942f-ef8ce5770741.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/johnsutor_Llama-3-8B-Instruct_breadcrumbs-density-0.9-gamma-0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-8B-Instruct_breadcrumbs-density-0.9-gamma-0.1", - "id": "johnsutor/Llama-3-8B-Instruct_breadcrumbs-density-0.9-gamma-0.1", - "developer": "johnsutor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4223 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5154 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.074 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3079 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4384 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.365 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_breadcrumbs_ties-density-0.1-gamma-0.01/4ee9aa78-d9eb-4a1c-91c4-f29f093b95d3.json b/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_breadcrumbs_ties-density-0.1-gamma-0.01/4ee9aa78-d9eb-4a1c-91c4-f29f093b95d3.json deleted file mode 100644 index 9cbebea51..000000000 --- a/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_breadcrumbs_ties-density-0.1-gamma-0.01/4ee9aa78-d9eb-4a1c-91c4-f29f093b95d3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/johnsutor_Llama-3-8B-Instruct_breadcrumbs_ties-density-0.1-gamma-0.01/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-8B-Instruct_breadcrumbs_ties-density-0.1-gamma-0.01", - "id": "johnsutor/Llama-3-8B-Instruct_breadcrumbs_ties-density-0.1-gamma-0.01", - "developer": "johnsutor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4359 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5041 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0483 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3104 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4532 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3762 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_breadcrumbs_ties-density-0.1-gamma-0.1/419c6631-805f-43ba-9db8-5296f8d221ec.json b/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_breadcrumbs_ties-density-0.1-gamma-0.1/419c6631-805f-43ba-9db8-5296f8d221ec.json deleted file mode 100644 index bd49fe5af..000000000 --- a/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_breadcrumbs_ties-density-0.1-gamma-0.1/419c6631-805f-43ba-9db8-5296f8d221ec.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/johnsutor_Llama-3-8B-Instruct_breadcrumbs_ties-density-0.1-gamma-0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-8B-Instruct_breadcrumbs_ties-density-0.1-gamma-0.1", - "id": "johnsutor/Llama-3-8B-Instruct_breadcrumbs_ties-density-0.1-gamma-0.1", - "developer": "johnsutor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4202 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5011 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0982 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3003 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.415 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3699 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_breadcrumbs_ties-density-0.3-gamma-0.01/3fc1822f-4a43-4a3b-90d7-fc163491c90a.json b/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_breadcrumbs_ties-density-0.3-gamma-0.01/3fc1822f-4a43-4a3b-90d7-fc163491c90a.json deleted file mode 100644 index e230b5938..000000000 --- a/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_breadcrumbs_ties-density-0.3-gamma-0.01/3fc1822f-4a43-4a3b-90d7-fc163491c90a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/johnsutor_Llama-3-8B-Instruct_breadcrumbs_ties-density-0.3-gamma-0.01/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-8B-Instruct_breadcrumbs_ties-density-0.3-gamma-0.01", - "id": "johnsutor/Llama-3-8B-Instruct_breadcrumbs_ties-density-0.3-gamma-0.01", - "developer": "johnsutor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3518 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4999 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0234 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3062 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4871 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3611 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_breadcrumbs_ties-density-0.3-gamma-0.1/76b4037b-c5d0-435f-966a-bd88b1665dad.json b/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_breadcrumbs_ties-density-0.3-gamma-0.1/76b4037b-c5d0-435f-966a-bd88b1665dad.json deleted file mode 100644 index 449e51cc0..000000000 --- a/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_breadcrumbs_ties-density-0.3-gamma-0.1/76b4037b-c5d0-435f-966a-bd88b1665dad.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/johnsutor_Llama-3-8B-Instruct_breadcrumbs_ties-density-0.3-gamma-0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-8B-Instruct_breadcrumbs_ties-density-0.3-gamma-0.1", - "id": "johnsutor/Llama-3-8B-Instruct_breadcrumbs_ties-density-0.3-gamma-0.1", - "developer": "johnsutor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4204 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5107 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0876 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3045 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4279 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.371 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_breadcrumbs_ties-density-0.5-gamma-0.01/757b85e7-84c8-429f-aeb4-870852fa8959.json b/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_breadcrumbs_ties-density-0.5-gamma-0.01/757b85e7-84c8-429f-aeb4-870852fa8959.json deleted file mode 100644 index 3c0a95ced..000000000 --- a/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_breadcrumbs_ties-density-0.5-gamma-0.01/757b85e7-84c8-429f-aeb4-870852fa8959.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/johnsutor_Llama-3-8B-Instruct_breadcrumbs_ties-density-0.5-gamma-0.01/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-8B-Instruct_breadcrumbs_ties-density-0.5-gamma-0.01", - "id": "johnsutor/Llama-3-8B-Instruct_breadcrumbs_ties-density-0.5-gamma-0.01", - "developer": "johnsutor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3454 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4984 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0219 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.297 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4911 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3531 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_breadcrumbs_ties-density-0.5-gamma-0.1/acab4982-1205-4362-803e-306b1e2371bf.json b/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_breadcrumbs_ties-density-0.5-gamma-0.1/acab4982-1205-4362-803e-306b1e2371bf.json deleted file mode 100644 index 07aaad644..000000000 --- a/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_breadcrumbs_ties-density-0.5-gamma-0.1/acab4982-1205-4362-803e-306b1e2371bf.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/johnsutor_Llama-3-8B-Instruct_breadcrumbs_ties-density-0.5-gamma-0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-8B-Instruct_breadcrumbs_ties-density-0.5-gamma-0.1", - "id": "johnsutor/Llama-3-8B-Instruct_breadcrumbs_ties-density-0.5-gamma-0.1", - "developer": "johnsutor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4092 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5137 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0808 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2953 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4357 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3669 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_breadcrumbs_ties-density-0.7-gamma-0.01/0e549b5d-c1d9-443d-9a80-8dd34dadd22e.json b/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_breadcrumbs_ties-density-0.7-gamma-0.01/0e549b5d-c1d9-443d-9a80-8dd34dadd22e.json deleted file mode 100644 index a9858ea11..000000000 --- a/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_breadcrumbs_ties-density-0.7-gamma-0.01/0e549b5d-c1d9-443d-9a80-8dd34dadd22e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/johnsutor_Llama-3-8B-Instruct_breadcrumbs_ties-density-0.7-gamma-0.01/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-8B-Instruct_breadcrumbs_ties-density-0.7-gamma-0.01", - "id": "johnsutor/Llama-3-8B-Instruct_breadcrumbs_ties-density-0.7-gamma-0.01", - "developer": "johnsutor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2904 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4967 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0144 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2995 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4991 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.349 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_breadcrumbs_ties-density-0.7-gamma-0.1/d3d4eccc-8792-40e5-91cf-22885f4cbaf5.json b/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_breadcrumbs_ties-density-0.7-gamma-0.1/d3d4eccc-8792-40e5-91cf-22885f4cbaf5.json deleted file mode 100644 index 0de836a5f..000000000 --- a/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_breadcrumbs_ties-density-0.7-gamma-0.1/d3d4eccc-8792-40e5-91cf-22885f4cbaf5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/johnsutor_Llama-3-8B-Instruct_breadcrumbs_ties-density-0.7-gamma-0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-8B-Instruct_breadcrumbs_ties-density-0.7-gamma-0.1", - "id": "johnsutor/Llama-3-8B-Instruct_breadcrumbs_ties-density-0.7-gamma-0.1", - "developer": "johnsutor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4199 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5147 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0808 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2987 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4358 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3615 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_breadcrumbs_ties-density-0.9-gamma-0.01/708aded5-6252-44e3-bf0d-08bf3e7f32e0.json b/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_breadcrumbs_ties-density-0.9-gamma-0.01/708aded5-6252-44e3-bf0d-08bf3e7f32e0.json deleted file mode 100644 index 987fd3b28..000000000 --- a/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_breadcrumbs_ties-density-0.9-gamma-0.01/708aded5-6252-44e3-bf0d-08bf3e7f32e0.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/johnsutor_Llama-3-8B-Instruct_breadcrumbs_ties-density-0.9-gamma-0.01/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-8B-Instruct_breadcrumbs_ties-density-0.9-gamma-0.01", - "id": "johnsutor/Llama-3-8B-Instruct_breadcrumbs_ties-density-0.9-gamma-0.01", - "developer": "johnsutor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2913 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4918 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0106 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3003 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4977 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3454 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_breadcrumbs_ties-density-0.9-gamma-0.1/ce6d31f2-f38e-4af3-85a3-d2f6c80f71f1.json b/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_breadcrumbs_ties-density-0.9-gamma-0.1/ce6d31f2-f38e-4af3-85a3-d2f6c80f71f1.json deleted file mode 100644 index 50c1000ff..000000000 --- a/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_breadcrumbs_ties-density-0.9-gamma-0.1/ce6d31f2-f38e-4af3-85a3-d2f6c80f71f1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/johnsutor_Llama-3-8B-Instruct_breadcrumbs_ties-density-0.9-gamma-0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-8B-Instruct_breadcrumbs_ties-density-0.9-gamma-0.1", - "id": "johnsutor/Llama-3-8B-Instruct_breadcrumbs_ties-density-0.9-gamma-0.1", - "developer": "johnsutor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4162 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5139 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0778 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.297 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4317 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3625 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_dare_linear/5efcc291-ca9a-4ca9-b2ed-dab37dce5f5a.json b/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_dare_linear/5efcc291-ca9a-4ca9-b2ed-dab37dce5f5a.json deleted file mode 100644 index 4cb808ae2..000000000 --- a/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_dare_linear/5efcc291-ca9a-4ca9-b2ed-dab37dce5f5a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/johnsutor_Llama-3-8B-Instruct_dare_linear/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-8B-Instruct_dare_linear", - "id": "johnsutor/Llama-3-8B-Instruct_dare_linear", - "developer": "johnsutor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2145 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4283 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2961 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4979 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2414 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_dare_ties-density-0.1/47320824-8064-40d4-a08c-810faafbba77.json 
b/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_dare_ties-density-0.1/47320824-8064-40d4-a08c-810faafbba77.json deleted file mode 100644 index 2aa16cb5b..000000000 --- a/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_dare_ties-density-0.1/47320824-8064-40d4-a08c-810faafbba77.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/johnsutor_Llama-3-8B-Instruct_dare_ties-density-0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-8B-Instruct_dare_ties-density-0.1", - "id": "johnsutor/Llama-3-8B-Instruct_dare_ties-density-0.1", - "developer": "johnsutor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1891 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4119 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0008 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2718 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4658 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2265 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_dare_ties-density-0.3/8baeef58-0ba6-4723-8f23-7a4c386f2cad.json b/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_dare_ties-density-0.3/8baeef58-0ba6-4723-8f23-7a4c386f2cad.json 
deleted file mode 100644 index 0f28a208f..000000000 --- a/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_dare_ties-density-0.3/8baeef58-0ba6-4723-8f23-7a4c386f2cad.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/johnsutor_Llama-3-8B-Instruct_dare_ties-density-0.3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-8B-Instruct_dare_ties-density-0.3", - "id": "johnsutor/Llama-3-8B-Instruct_dare_ties-density-0.3", - "developer": "johnsutor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2113 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4559 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0015 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.297 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5069 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.304 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_dare_ties-density-0.7/0387ca63-1e31-4eaa-ac7c-35d417548c54.json b/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_dare_ties-density-0.7/0387ca63-1e31-4eaa-ac7c-35d417548c54.json deleted file mode 100644 index d8989fbe5..000000000 --- 
a/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_dare_ties-density-0.7/0387ca63-1e31-4eaa-ac7c-35d417548c54.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/johnsutor_Llama-3-8B-Instruct_dare_ties-density-0.7/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-8B-Instruct_dare_ties-density-0.7", - "id": "johnsutor/Llama-3-8B-Instruct_dare_ties-density-0.7", - "developer": "johnsutor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2034 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4723 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.003 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3037 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.511 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3148 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_dare_ties-density-0.9/733983fe-4b9c-47e6-963d-c57829b6f1af.json b/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_dare_ties-density-0.9/733983fe-4b9c-47e6-963d-c57829b6f1af.json deleted file mode 100644 index ad8724889..000000000 --- a/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_dare_ties-density-0.9/733983fe-4b9c-47e6-963d-c57829b6f1af.json +++ 
/dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/johnsutor_Llama-3-8B-Instruct_dare_ties-density-0.9/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-8B-Instruct_dare_ties-density-0.9", - "id": "johnsutor/Llama-3-8B-Instruct_dare_ties-density-0.9", - "developer": "johnsutor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2161 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4664 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0015 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3079 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.523 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3143 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_linear/80c4859d-8016-4650-939f-100ba2e6d808.json b/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_linear/80c4859d-8016-4650-939f-100ba2e6d808.json deleted file mode 100644 index 011100dc4..000000000 --- a/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_linear/80c4859d-8016-4650-939f-100ba2e6d808.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/johnsutor_Llama-3-8B-Instruct_linear/1770682486.623709", - 
"retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-8B-Instruct_linear", - "id": "johnsutor/Llama-3-8B-Instruct_linear", - "developer": "johnsutor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4308 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5031 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1005 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2953 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4097 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3712 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_ties-density-0.1/21724d3a-cc6c-43eb-9d69-46d8d91c97f8.json b/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_ties-density-0.1/21724d3a-cc6c-43eb-9d69-46d8d91c97f8.json deleted file mode 100644 index 9778173ee..000000000 --- a/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_ties-density-0.1/21724d3a-cc6c-43eb-9d69-46d8d91c97f8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/johnsutor_Llama-3-8B-Instruct_ties-density-0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": 
"Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-8B-Instruct_ties-density-0.1", - "id": "johnsutor/Llama-3-8B-Instruct_ties-density-0.1", - "developer": "johnsutor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4116 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5021 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0793 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2886 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4174 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.36 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_ties-density-0.3/d781945e-e9df-4136-90cd-632f0bed6246.json b/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_ties-density-0.3/d781945e-e9df-4136-90cd-632f0bed6246.json deleted file mode 100644 index 386af3f39..000000000 --- a/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_ties-density-0.3/d781945e-e9df-4136-90cd-632f0bed6246.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/johnsutor_Llama-3-8B-Instruct_ties-density-0.3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-8B-Instruct_ties-density-0.3", - "id": 
"johnsutor/Llama-3-8B-Instruct_ties-density-0.3", - "developer": "johnsutor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3626 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4906 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0672 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2961 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4025 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3321 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_ties-density-0.5/8f146bb5-dd4d-49ce-ac60-76f66321feb8.json b/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_ties-density-0.5/8f146bb5-dd4d-49ce-ac60-76f66321feb8.json deleted file mode 100644 index dfb672445..000000000 --- a/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_ties-density-0.5/8f146bb5-dd4d-49ce-ac60-76f66321feb8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/johnsutor_Llama-3-8B-Instruct_ties-density-0.5/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-8B-Instruct_ties-density-0.5", - "id": "johnsutor/Llama-3-8B-Instruct_ties-density-0.5", - "developer": "johnsutor", - "inference_platform": "unknown", - "additional_details": { - 
"precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3797 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4793 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0612 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3045 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.388 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3175 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_ties-density-0.7/89bfba6d-c622-445e-b0b9-512aadcea7cf.json b/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_ties-density-0.7/89bfba6d-c622-445e-b0b9-512aadcea7cf.json deleted file mode 100644 index 5246939f3..000000000 --- a/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_ties-density-0.7/89bfba6d-c622-445e-b0b9-512aadcea7cf.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/johnsutor_Llama-3-8B-Instruct_ties-density-0.7/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-8B-Instruct_ties-density-0.7", - "id": "johnsutor/Llama-3-8B-Instruct_ties-density-0.7", - "developer": "johnsutor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - 
"evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3681 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4738 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0672 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3096 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3881 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3152 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_ties-density-0.9/9c27f2e6-ebbe-4fac-bc51-74455d3a6512.json b/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_ties-density-0.9/9c27f2e6-ebbe-4fac-bc51-74455d3a6512.json deleted file mode 100644 index 248669cbb..000000000 --- a/data/hfopenllm_v2/johnsutor/Llama-3-8B-Instruct_ties-density-0.9/9c27f2e6-ebbe-4fac-bc51-74455d3a6512.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/johnsutor_Llama-3-8B-Instruct_ties-density-0.9/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-8B-Instruct_ties-density-0.9", - "id": "johnsutor/Llama-3-8B-Instruct_ties-density-0.9", - "developer": "johnsutor", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3858 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4735 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0619 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2995 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.388 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3182 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jpacifico/Chocolatine-14B-Instruct-4k-DPO/455ef1e0-bdf2-49bf-a53d-2c9e3d00d5f3.json b/data/hfopenllm_v2/jpacifico/Chocolatine-14B-Instruct-4k-DPO/455ef1e0-bdf2-49bf-a53d-2c9e3d00d5f3.json deleted file mode 100644 index 300e25443..000000000 --- a/data/hfopenllm_v2/jpacifico/Chocolatine-14B-Instruct-4k-DPO/455ef1e0-bdf2-49bf-a53d-2c9e3d00d5f3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jpacifico_Chocolatine-14B-Instruct-4k-DPO/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Chocolatine-14B-Instruct-4k-DPO", - "id": "jpacifico/Chocolatine-14B-Instruct-4k-DPO", - "developer": "jpacifico", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Phi3ForCausalLM", - "params_billions": 13.96 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.4689 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.63 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1782 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3414 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4439 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4764 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jpacifico/Chocolatine-14B-Instruct-DPO-v1.2/e04a76a6-ac22-43b2-bbf9-196a08de2949.json b/data/hfopenllm_v2/jpacifico/Chocolatine-14B-Instruct-DPO-v1.2/e04a76a6-ac22-43b2-bbf9-196a08de2949.json deleted file mode 100644 index ec8836b29..000000000 --- a/data/hfopenllm_v2/jpacifico/Chocolatine-14B-Instruct-DPO-v1.2/e04a76a6-ac22-43b2-bbf9-196a08de2949.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jpacifico_Chocolatine-14B-Instruct-DPO-v1.2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Chocolatine-14B-Instruct-DPO-v1.2", - "id": "jpacifico/Chocolatine-14B-Instruct-DPO-v1.2", - "developer": "jpacifico", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Phi3ForCausalLM", - "params_billions": 13.96 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6852 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": 
"SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6438 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2092 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3255 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4268 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4697 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jpacifico/Chocolatine-14B-Instruct-DPO-v1.3/2fcb74f0-add1-4d46-8a0f-8578a616dbed.json b/data/hfopenllm_v2/jpacifico/Chocolatine-14B-Instruct-DPO-v1.3/2fcb74f0-add1-4d46-8a0f-8578a616dbed.json deleted file mode 100644 index e3fa490e4..000000000 --- a/data/hfopenllm_v2/jpacifico/Chocolatine-14B-Instruct-DPO-v1.3/2fcb74f0-add1-4d46-8a0f-8578a616dbed.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jpacifico_Chocolatine-14B-Instruct-DPO-v1.3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Chocolatine-14B-Instruct-DPO-v1.3", - "id": "jpacifico/Chocolatine-14B-Instruct-DPO-v1.3", - "developer": "jpacifico", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Phi3ForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.704 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 
0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6846 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5619 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3414 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4234 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5374 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jpacifico/Chocolatine-2-14B-Instruct-DPO-v2.0b1/51530638-ef76-43ce-9396-8a0d07988712.json b/data/hfopenllm_v2/jpacifico/Chocolatine-2-14B-Instruct-DPO-v2.0b1/51530638-ef76-43ce-9396-8a0d07988712.json deleted file mode 100644 index a7ff62b04..000000000 --- a/data/hfopenllm_v2/jpacifico/Chocolatine-2-14B-Instruct-DPO-v2.0b1/51530638-ef76-43ce-9396-8a0d07988712.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jpacifico_Chocolatine-2-14B-Instruct-DPO-v2.0b1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Chocolatine-2-14B-Instruct-DPO-v2.0b1", - "id": "jpacifico/Chocolatine-2-14B-Instruct-DPO-v2.0b1", - "developer": "jpacifico", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1033 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6696 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - 
"dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2757 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3758 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4467 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5124 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jpacifico/Chocolatine-2-14B-Instruct-v2.0.1/74d99e4d-0e6f-4804-aa52-0dc76d37fac3.json b/data/hfopenllm_v2/jpacifico/Chocolatine-2-14B-Instruct-v2.0.1/74d99e4d-0e6f-4804-aa52-0dc76d37fac3.json deleted file mode 100644 index 1b5b639c2..000000000 --- a/data/hfopenllm_v2/jpacifico/Chocolatine-2-14B-Instruct-v2.0.1/74d99e4d-0e6f-4804-aa52-0dc76d37fac3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jpacifico_Chocolatine-2-14B-Instruct-v2.0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Chocolatine-2-14B-Instruct-v2.0.1", - "id": "jpacifico/Chocolatine-2-14B-Instruct-v2.0.1", - "developer": "jpacifico", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0742 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6736 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - 
"evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4796 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3918 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5008 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5299 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jpacifico/Chocolatine-2-14B-Instruct-v2.0.3/80e8b9f0-b507-4927-9d24-1c793e3783cc.json b/data/hfopenllm_v2/jpacifico/Chocolatine-2-14B-Instruct-v2.0.3/80e8b9f0-b507-4927-9d24-1c793e3783cc.json deleted file mode 100644 index a34d92883..000000000 --- a/data/hfopenllm_v2/jpacifico/Chocolatine-2-14B-Instruct-v2.0.3/80e8b9f0-b507-4927-9d24-1c793e3783cc.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jpacifico_Chocolatine-2-14B-Instruct-v2.0.3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Chocolatine-2-14B-Instruct-v2.0.3", - "id": "jpacifico/Chocolatine-2-14B-Instruct-v2.0.3", - "developer": "jpacifico", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7037 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6548 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.4207 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3792 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4768 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5374 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jpacifico/Chocolatine-2-14B-Instruct-v2.0/7b037520-a5e9-4b58-80f3-f0ecc5957c67.json b/data/hfopenllm_v2/jpacifico/Chocolatine-2-14B-Instruct-v2.0/7b037520-a5e9-4b58-80f3-f0ecc5957c67.json deleted file mode 100644 index 157f2eb32..000000000 --- a/data/hfopenllm_v2/jpacifico/Chocolatine-2-14B-Instruct-v2.0/7b037520-a5e9-4b58-80f3-f0ecc5957c67.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jpacifico_Chocolatine-2-14B-Instruct-v2.0/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Chocolatine-2-14B-Instruct-v2.0", - "id": "jpacifico/Chocolatine-2-14B-Instruct-v2.0", - "developer": "jpacifico", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0885 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.677 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4804 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": 
"Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3876 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5021 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5302 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jpacifico/Chocolatine-2-14B-Instruct-v2.0b2/10b88d05-62d2-4603-9d04-b0854e39ed40.json b/data/hfopenllm_v2/jpacifico/Chocolatine-2-14B-Instruct-v2.0b2/10b88d05-62d2-4603-9d04-b0854e39ed40.json deleted file mode 100644 index eb93d587d..000000000 --- a/data/hfopenllm_v2/jpacifico/Chocolatine-2-14B-Instruct-v2.0b2/10b88d05-62d2-4603-9d04-b0854e39ed40.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jpacifico_Chocolatine-2-14B-Instruct-v2.0b2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Chocolatine-2-14B-Instruct-v2.0b2", - "id": "jpacifico/Chocolatine-2-14B-Instruct-v2.0b2", - "developer": "jpacifico", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7241 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6476 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.395 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 
0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3834 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4808 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5369 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jpacifico/Chocolatine-2-14B-Instruct-v2.0b3/4b693f41-d811-4b64-892c-d840eee5ace4.json b/data/hfopenllm_v2/jpacifico/Chocolatine-2-14B-Instruct-v2.0b3/4b693f41-d811-4b64-892c-d840eee5ace4.json deleted file mode 100644 index d9d2d02b9..000000000 --- a/data/hfopenllm_v2/jpacifico/Chocolatine-2-14B-Instruct-v2.0b3/4b693f41-d811-4b64-892c-d840eee5ace4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jpacifico_Chocolatine-2-14B-Instruct-v2.0b3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Chocolatine-2-14B-Instruct-v2.0b3", - "id": "jpacifico/Chocolatine-2-14B-Instruct-v2.0b3", - "developer": "jpacifico", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7323 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6469 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4109 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3792 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - 
"source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4781 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5337 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jpacifico/Chocolatine-3B-Instruct-DPO-Revised/90d86c8c-3aa6-42ba-a94f-75c961e65c41.json b/data/hfopenllm_v2/jpacifico/Chocolatine-3B-Instruct-DPO-Revised/90d86c8c-3aa6-42ba-a94f-75c961e65c41.json deleted file mode 100644 index 5f96bd801..000000000 --- a/data/hfopenllm_v2/jpacifico/Chocolatine-3B-Instruct-DPO-Revised/90d86c8c-3aa6-42ba-a94f-75c961e65c41.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jpacifico_Chocolatine-3B-Instruct-DPO-Revised/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Chocolatine-3B-Instruct-DPO-Revised", - "id": "jpacifico/Chocolatine-3B-Instruct-DPO-Revised", - "developer": "jpacifico", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Phi3ForCausalLM", - "params_billions": 3.821 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5623 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.554 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1805 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3221 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": 
false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4453 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3989 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jpacifico/Chocolatine-3B-Instruct-DPO-v1.0/8318ae52-6ae3-45ce-82db-73f8cb5ad7c7.json b/data/hfopenllm_v2/jpacifico/Chocolatine-3B-Instruct-DPO-v1.0/8318ae52-6ae3-45ce-82db-73f8cb5ad7c7.json deleted file mode 100644 index 8e9903c3a..000000000 --- a/data/hfopenllm_v2/jpacifico/Chocolatine-3B-Instruct-DPO-v1.0/8318ae52-6ae3-45ce-82db-73f8cb5ad7c7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jpacifico_Chocolatine-3B-Instruct-DPO-v1.0/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Chocolatine-3B-Instruct-DPO-v1.0", - "id": "jpacifico/Chocolatine-3B-Instruct-DPO-v1.0", - "developer": "jpacifico", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Phi3ForCausalLM", - "params_billions": 3.821 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3737 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5471 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1782 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3154 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4755 - } - }, - { - "evaluation_name": "MMLU-PRO", - 
"source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3937 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jpacifico/Chocolatine-3B-Instruct-DPO-v1.2/b20a1d13-2f14-42e4-bdde-49f053cef325.json b/data/hfopenllm_v2/jpacifico/Chocolatine-3B-Instruct-DPO-v1.2/b20a1d13-2f14-42e4-bdde-49f053cef325.json deleted file mode 100644 index ae3e1be3f..000000000 --- a/data/hfopenllm_v2/jpacifico/Chocolatine-3B-Instruct-DPO-v1.2/b20a1d13-2f14-42e4-bdde-49f053cef325.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jpacifico_Chocolatine-3B-Instruct-DPO-v1.2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Chocolatine-3B-Instruct-DPO-v1.2", - "id": "jpacifico/Chocolatine-3B-Instruct-DPO-v1.2", - "developer": "jpacifico", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Phi3ForCausalLM", - "params_billions": 3.821 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5455 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5487 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2047 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3389 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4154 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": 
"Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3877 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jpacifico/Distilucie-7B-Math-Instruct-DPO-v0.1/51521dfb-d4b5-45df-ac2a-54190aed0b9f.json b/data/hfopenllm_v2/jpacifico/Distilucie-7B-Math-Instruct-DPO-v0.1/51521dfb-d4b5-45df-ac2a-54190aed0b9f.json deleted file mode 100644 index e3a74e372..000000000 --- a/data/hfopenllm_v2/jpacifico/Distilucie-7B-Math-Instruct-DPO-v0.1/51521dfb-d4b5-45df-ac2a-54190aed0b9f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jpacifico_Distilucie-7B-Math-Instruct-DPO-v0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Distilucie-7B-Math-Instruct-DPO-v0.1", - "id": "jpacifico/Distilucie-7B-Math-Instruct-DPO-v0.1", - "developer": "jpacifico", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 6.707 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3048 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3835 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0257 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2995 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3644 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { 
- "score": 0.1809 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jpacifico/Lucie-7B-Instruct-DPO-v1.1.3/997a1ceb-185a-4e6c-8383-eb5a6f976771.json b/data/hfopenllm_v2/jpacifico/Lucie-7B-Instruct-DPO-v1.1.3/997a1ceb-185a-4e6c-8383-eb5a6f976771.json deleted file mode 100644 index 524704add..000000000 --- a/data/hfopenllm_v2/jpacifico/Lucie-7B-Instruct-DPO-v1.1.3/997a1ceb-185a-4e6c-8383-eb5a6f976771.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jpacifico_Lucie-7B-Instruct-DPO-v1.1.3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Lucie-7B-Instruct-DPO-v1.1.3", - "id": "jpacifico/Lucie-7B-Instruct-DPO-v1.1.3", - "developer": "jpacifico", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 6.707 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3045 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3819 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0242 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2861 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3818 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1764 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jpacifico/Lucie-7B-Instruct-DPO-v1.1/22101998-c3d3-414f-9ed1-99330cdbe3b2.json 
b/data/hfopenllm_v2/jpacifico/Lucie-7B-Instruct-DPO-v1.1/22101998-c3d3-414f-9ed1-99330cdbe3b2.json deleted file mode 100644 index da8506358..000000000 --- a/data/hfopenllm_v2/jpacifico/Lucie-7B-Instruct-DPO-v1.1/22101998-c3d3-414f-9ed1-99330cdbe3b2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jpacifico_Lucie-7B-Instruct-DPO-v1.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Lucie-7B-Instruct-DPO-v1.1", - "id": "jpacifico/Lucie-7B-Instruct-DPO-v1.1", - "developer": "jpacifico", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 6.707 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3121 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3781 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0234 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2878 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4016 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1838 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jpacifico/Lucie-7B-Instruct-Merged-Model_Stock-v1.0/a2408953-a7eb-449c-b80c-3620915d44d0.json b/data/hfopenllm_v2/jpacifico/Lucie-7B-Instruct-Merged-Model_Stock-v1.0/a2408953-a7eb-449c-b80c-3620915d44d0.json deleted file mode 100644 index 95eaddbfc..000000000 --- 
a/data/hfopenllm_v2/jpacifico/Lucie-7B-Instruct-Merged-Model_Stock-v1.0/a2408953-a7eb-449c-b80c-3620915d44d0.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jpacifico_Lucie-7B-Instruct-Merged-Model_Stock-v1.0/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Lucie-7B-Instruct-Merged-Model_Stock-v1.0", - "id": "jpacifico/Lucie-7B-Instruct-Merged-Model_Stock-v1.0", - "developer": "jpacifico", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 6.707 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3234 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3802 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0242 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2886 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3844 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1871 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jpacifico/Lucie-7B-Instruct-Merged-Model_Stock-v1.1/d65e5b08-7d3c-4c0d-85fa-496db65a235c.json b/data/hfopenllm_v2/jpacifico/Lucie-7B-Instruct-Merged-Model_Stock-v1.1/d65e5b08-7d3c-4c0d-85fa-496db65a235c.json deleted file mode 100644 index 6a31392cf..000000000 --- a/data/hfopenllm_v2/jpacifico/Lucie-7B-Instruct-Merged-Model_Stock-v1.1/d65e5b08-7d3c-4c0d-85fa-496db65a235c.json 
+++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jpacifico_Lucie-7B-Instruct-Merged-Model_Stock-v1.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Lucie-7B-Instruct-Merged-Model_Stock-v1.1", - "id": "jpacifico/Lucie-7B-Instruct-Merged-Model_Stock-v1.1", - "developer": "jpacifico", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 6.707 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3014 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3808 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0279 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2827 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.375 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1862 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jpacifico/Lucie-Boosted-7B-Instruct/ce2c9614-46d2-481d-ac25-3cc71a93bd5e.json b/data/hfopenllm_v2/jpacifico/Lucie-Boosted-7B-Instruct/ce2c9614-46d2-481d-ac25-3cc71a93bd5e.json deleted file mode 100644 index b3296bdfe..000000000 --- a/data/hfopenllm_v2/jpacifico/Lucie-Boosted-7B-Instruct/ce2c9614-46d2-481d-ac25-3cc71a93bd5e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jpacifico_Lucie-Boosted-7B-Instruct/1770682486.623709", - 
"retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Lucie-Boosted-7B-Instruct", - "id": "jpacifico/Lucie-Boosted-7B-Instruct", - "developer": "jpacifico", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 6.707 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2566 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3465 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0128 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2668 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3699 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.163 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jsfs11/L3-8B-Stheno-slerp/e9ba998d-8147-4046-afae-9ee7d544e98d.json b/data/hfopenllm_v2/jsfs11/L3-8B-Stheno-slerp/e9ba998d-8147-4046-afae-9ee7d544e98d.json deleted file mode 100644 index 09c46c3d4..000000000 --- a/data/hfopenllm_v2/jsfs11/L3-8B-Stheno-slerp/e9ba998d-8147-4046-afae-9ee7d544e98d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jsfs11_L3-8B-Stheno-slerp/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - 
"name": "L3-8B-Stheno-slerp", - "id": "jsfs11/L3-8B-Stheno-slerp", - "developer": "jsfs11", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6752 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5326 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0989 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2852 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3725 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3649 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jsfs11/MixtureofMerges-MoE-4x7b-v4/c44f1012-1123-42c8-b110-5735dc756fd5.json b/data/hfopenllm_v2/jsfs11/MixtureofMerges-MoE-4x7b-v4/c44f1012-1123-42c8-b110-5735dc756fd5.json deleted file mode 100644 index b881157bf..000000000 --- a/data/hfopenllm_v2/jsfs11/MixtureofMerges-MoE-4x7b-v4/c44f1012-1123-42c8-b110-5735dc756fd5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jsfs11_MixtureofMerges-MoE-4x7b-v4/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MixtureofMerges-MoE-4x7b-v4", - "id": "jsfs11/MixtureofMerges-MoE-4x7b-v4", - "developer": "jsfs11", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MixtralForCausalLM", 
- "params_billions": 24.154 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.403 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5169 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0634 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2861 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4386 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3032 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/jsfs11/MixtureofMerges-MoE-4x7b-v5/5088f6a6-2acf-4d10-8b78-0d5bd4126ab5.json b/data/hfopenllm_v2/jsfs11/MixtureofMerges-MoE-4x7b-v5/5088f6a6-2acf-4d10-8b78-0d5bd4126ab5.json deleted file mode 100644 index fe751bdd0..000000000 --- a/data/hfopenllm_v2/jsfs11/MixtureofMerges-MoE-4x7b-v5/5088f6a6-2acf-4d10-8b78-0d5bd4126ab5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/jsfs11_MixtureofMerges-MoE-4x7b-v5/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MixtureofMerges-MoE-4x7b-v5", - "id": "jsfs11/MixtureofMerges-MoE-4x7b-v5", - "developer": "jsfs11", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MixtralForCausalLM", - "params_billions": 24.154 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4199 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5198 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0755 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2844 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4305 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3098 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/kaist-ai/janus-7b/b4d96088-5cc0-4ebc-8b8b-8c7e9f90420b.json b/data/hfopenllm_v2/kaist-ai/janus-7b/b4d96088-5cc0-4ebc-8b8b-8c7e9f90420b.json deleted file mode 100644 index 7009871f9..000000000 --- a/data/hfopenllm_v2/kaist-ai/janus-7b/b4d96088-5cc0-4ebc-8b8b-8c7e9f90420b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/kaist-ai_janus-7b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "janus-7b", - "id": "kaist-ai/janus-7b", - "developer": "kaist-ai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3775 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": 
"hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4694 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0408 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2727 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4401 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2874 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/kaist-ai/janus-dpo-7b/529dba11-53af-4045-ae46-04e1b9838d4a.json b/data/hfopenllm_v2/kaist-ai/janus-dpo-7b/529dba11-53af-4045-ae46-04e1b9838d4a.json deleted file mode 100644 index c70f9f282..000000000 --- a/data/hfopenllm_v2/kaist-ai/janus-dpo-7b/529dba11-53af-4045-ae46-04e1b9838d4a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/kaist-ai_janus-dpo-7b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "janus-dpo-7b", - "id": "kaist-ai/janus-dpo-7b", - "developer": "kaist-ai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4003 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4773 - } - }, - { - "evaluation_name": 
"MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0415 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2819 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4387 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2976 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/kaist-ai/janus-rm-7b/391f6d6c-418f-44be-910a-fb90b5712649.json b/data/hfopenllm_v2/kaist-ai/janus-rm-7b/391f6d6c-418f-44be-910a-fb90b5712649.json deleted file mode 100644 index 979e94c2a..000000000 --- a/data/hfopenllm_v2/kaist-ai/janus-rm-7b/391f6d6c-418f-44be-910a-fb90b5712649.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/kaist-ai_janus-rm-7b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "janus-rm-7b", - "id": "kaist-ai/janus-rm-7b", - "developer": "kaist-ai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LLMForSequenceRegression", - "params_billions": 7.111 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1778 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3056 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", 
- "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2517 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3883 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1126 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/kaist-ai/mistral-orpo-capybara-7k/2ccccb4b-7260-4a1a-9426-117e359c7c5c.json b/data/hfopenllm_v2/kaist-ai/mistral-orpo-capybara-7k/2ccccb4b-7260-4a1a-9426-117e359c7c5c.json deleted file mode 100644 index 41fa69358..000000000 --- a/data/hfopenllm_v2/kaist-ai/mistral-orpo-capybara-7k/2ccccb4b-7260-4a1a-9426-117e359c7c5c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/kaist-ai_mistral-orpo-capybara-7k/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "mistral-orpo-capybara-7k", - "id": "kaist-ai/mistral-orpo-capybara-7k", - "developer": "kaist-ai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5367 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4489 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0393 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": 
"Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2861 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3964 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2971 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/kavonalds/BunderMaxx-0710/84afecec-453d-491c-9f5a-de31d8fba43e.json b/data/hfopenllm_v2/kavonalds/BunderMaxx-0710/84afecec-453d-491c-9f5a-de31d8fba43e.json deleted file mode 100644 index 7d5fbcff2..000000000 --- a/data/hfopenllm_v2/kavonalds/BunderMaxx-0710/84afecec-453d-491c-9f5a-de31d8fba43e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/kavonalds_BunderMaxx-0710/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "BunderMaxx-0710", - "id": "kavonalds/BunderMaxx-0710", - "developer": "kavonalds", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.236 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3283 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6651 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.068 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2609 - } - }, - { - "evaluation_name": 
"MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3393 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1314 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/kavonalds/BunderMaxx-0710/dba3a3a4-cd23-44c9-823f-0bd88cf6465b.json b/data/hfopenllm_v2/kavonalds/BunderMaxx-0710/dba3a3a4-cd23-44c9-823f-0bd88cf6465b.json deleted file mode 100644 index a38c701e1..000000000 --- a/data/hfopenllm_v2/kavonalds/BunderMaxx-0710/dba3a3a4-cd23-44c9-823f-0bd88cf6465b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/kavonalds_BunderMaxx-0710/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "BunderMaxx-0710", - "id": "kavonalds/BunderMaxx-0710", - "developer": "kavonalds", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.236 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2701 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5566 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.068 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2802 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.3682 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1449 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/kavonalds/BunderMaxx-1010/1179bcce-558e-40ad-8537-c74c59557975.json b/data/hfopenllm_v2/kavonalds/BunderMaxx-1010/1179bcce-558e-40ad-8537-c74c59557975.json deleted file mode 100644 index a63fbd6f4..000000000 --- a/data/hfopenllm_v2/kavonalds/BunderMaxx-1010/1179bcce-558e-40ad-8537-c74c59557975.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/kavonalds_BunderMaxx-1010/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "BunderMaxx-1010", - "id": "kavonalds/BunderMaxx-1010", - "developer": "kavonalds", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.236 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2981 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.702 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.105 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2609 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3484 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1224 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/kavonalds/Lancer-1-1b-Instruct/fe0a5c17-6c8d-4f06-a58e-47648ef9ecec.json b/data/hfopenllm_v2/kavonalds/Lancer-1-1b-Instruct/fe0a5c17-6c8d-4f06-a58e-47648ef9ecec.json deleted file mode 100644 index 13c6651be..000000000 --- a/data/hfopenllm_v2/kavonalds/Lancer-1-1b-Instruct/fe0a5c17-6c8d-4f06-a58e-47648ef9ecec.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/kavonalds_Lancer-1-1b-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Lancer-1-1b-Instruct", - "id": "kavonalds/Lancer-1-1b-Instruct", - "developer": "kavonalds", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.236 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5546 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3253 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0393 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2617 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3144 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1568 - } - } - ] -} \ No newline at end of file diff 
--git a/data/hfopenllm_v2/kayfour/T3Q-Qwen2.5-7B-it-KOR-Safe/81cf8cbd-33bc-44ab-930a-65242e1ae7b2.json b/data/hfopenllm_v2/kayfour/T3Q-Qwen2.5-7B-it-KOR-Safe/81cf8cbd-33bc-44ab-930a-65242e1ae7b2.json deleted file mode 100644 index 773475a94..000000000 --- a/data/hfopenllm_v2/kayfour/T3Q-Qwen2.5-7B-it-KOR-Safe/81cf8cbd-33bc-44ab-930a-65242e1ae7b2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/kayfour_T3Q-Qwen2.5-7B-it-KOR-Safe/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "T3Q-Qwen2.5-7B-it-KOR-Safe", - "id": "kayfour/T3Q-Qwen2.5-7B-it-KOR-Safe", - "developer": "kayfour", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6081 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.555 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3761 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3213 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4277 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4464 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/keeeeenw/MicroLlama/173bb053-e817-4551-b169-c3f71163650a.json b/data/hfopenllm_v2/keeeeenw/MicroLlama/173bb053-e817-4551-b169-c3f71163650a.json deleted file mode 100644 index 
50d40f2d1..000000000 --- a/data/hfopenllm_v2/keeeeenw/MicroLlama/173bb053-e817-4551-b169-c3f71163650a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/keeeeenw_MicroLlama/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MicroLlama", - "id": "keeeeenw/MicroLlama", - "developer": "keeeeenw", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 0.305 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1985 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3007 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0113 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2609 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3698 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1138 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/kekmodel/StopCarbon-10.7B-v5/b7e6a86f-340c-48ed-a828-2e80a13aa515.json b/data/hfopenllm_v2/kekmodel/StopCarbon-10.7B-v5/b7e6a86f-340c-48ed-a828-2e80a13aa515.json deleted file mode 100644 index 087787728..000000000 --- a/data/hfopenllm_v2/kekmodel/StopCarbon-10.7B-v5/b7e6a86f-340c-48ed-a828-2e80a13aa515.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/kekmodel_StopCarbon-10.7B-v5/1770682486.623709", - "retrieved_timestamp": 
"1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "StopCarbon-10.7B-v5", - "id": "kekmodel/StopCarbon-10.7B-v5", - "developer": "kekmodel", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 10.732 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4728 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5178 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0559 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3062 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4019 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3157 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/kevin009/llamaRAGdrama/bd221eee-7aa8-4d6f-a6be-89ee5568e729.json b/data/hfopenllm_v2/kevin009/llamaRAGdrama/bd221eee-7aa8-4d6f-a6be-89ee5568e729.json deleted file mode 100644 index 9b012321f..000000000 --- a/data/hfopenllm_v2/kevin009/llamaRAGdrama/bd221eee-7aa8-4d6f-a6be-89ee5568e729.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/kevin009_llamaRAGdrama/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "llamaRAGdrama", - "id": 
"kevin009/llamaRAGdrama", - "developer": "kevin009", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2598 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4007 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0431 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2643 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4316 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2724 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/khoantap/cheap-moe-merge/8727a325-a515-4456-ba34-65c30f84644a.json b/data/hfopenllm_v2/khoantap/cheap-moe-merge/8727a325-a515-4456-ba34-65c30f84644a.json deleted file mode 100644 index 20cf2da74..000000000 --- a/data/hfopenllm_v2/khoantap/cheap-moe-merge/8727a325-a515-4456-ba34-65c30f84644a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/khoantap_cheap-moe-merge/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "cheap-moe-merge", - "id": "khoantap/cheap-moe-merge", - "developer": "khoantap", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2MoeForCausalLM", - "params_billions": 19.305 - } - }, - "evaluation_results": [ - { - "evaluation_name": 
"IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4557 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5131 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0921 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2953 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4103 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3339 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/khoantap/llama-3-8b-stock-merge/3e4011fa-d480-4c16-9371-2025bc834358.json b/data/hfopenllm_v2/khoantap/llama-3-8b-stock-merge/3e4011fa-d480-4c16-9371-2025bc834358.json deleted file mode 100644 index df818dab9..000000000 --- a/data/hfopenllm_v2/khoantap/llama-3-8b-stock-merge/3e4011fa-d480-4c16-9371-2025bc834358.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/khoantap_llama-3-8b-stock-merge/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "llama-3-8b-stock-merge", - "id": "khoantap/llama-3-8b-stock-merge", - "developer": "khoantap", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4812 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5162 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1616 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.318 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3946 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.38 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/khoantap/llama-breadcrumbs-ties-merge/867499a7-589b-4564-b04d-a004b7c0abb4.json b/data/hfopenllm_v2/khoantap/llama-breadcrumbs-ties-merge/867499a7-589b-4564-b04d-a004b7c0abb4.json deleted file mode 100644 index ce7273c9a..000000000 --- a/data/hfopenllm_v2/khoantap/llama-breadcrumbs-ties-merge/867499a7-589b-4564-b04d-a004b7c0abb4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/khoantap_llama-breadcrumbs-ties-merge/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "llama-breadcrumbs-ties-merge", - "id": "khoantap/llama-breadcrumbs-ties-merge", - "developer": "khoantap", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2205 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": 
"hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5416 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1125 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2659 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4434 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3172 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/khoantap/llama-evolve-ties-best-merge/52f1fb51-fc7e-4cc2-918a-7c7226ae2ce5.json b/data/hfopenllm_v2/khoantap/llama-evolve-ties-best-merge/52f1fb51-fc7e-4cc2-918a-7c7226ae2ce5.json deleted file mode 100644 index 8af7559fa..000000000 --- a/data/hfopenllm_v2/khoantap/llama-evolve-ties-best-merge/52f1fb51-fc7e-4cc2-918a-7c7226ae2ce5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/khoantap_llama-evolve-ties-best-merge/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "llama-evolve-ties-best-merge", - "id": "khoantap/llama-evolve-ties-best-merge", - "developer": "khoantap", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6744 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.5414 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1563 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3171 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3946 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.386 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/khoantap/llama-linear-0.5-0.5-1-merge/5f4a8fb6-b22d-4eb2-aaef-da05ca45fbeb.json b/data/hfopenllm_v2/khoantap/llama-linear-0.5-0.5-1-merge/5f4a8fb6-b22d-4eb2-aaef-da05ca45fbeb.json deleted file mode 100644 index 75bc7bcd6..000000000 --- a/data/hfopenllm_v2/khoantap/llama-linear-0.5-0.5-1-merge/5f4a8fb6-b22d-4eb2-aaef-da05ca45fbeb.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/khoantap_llama-linear-0.5-0.5-1-merge/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "llama-linear-0.5-0.5-1-merge", - "id": "khoantap/llama-linear-0.5-0.5-1-merge", - "developer": "khoantap", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4812 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5643 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - 
"hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2054 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.307 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4143 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3833 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/khoantap/llama-linear-0.5-1-0.5-merge/3278855d-7bd1-4e7e-b27b-b1393006e7e7.json b/data/hfopenllm_v2/khoantap/llama-linear-0.5-1-0.5-merge/3278855d-7bd1-4e7e-b27b-b1393006e7e7.json deleted file mode 100644 index 28ab86f9c..000000000 --- a/data/hfopenllm_v2/khoantap/llama-linear-0.5-1-0.5-merge/3278855d-7bd1-4e7e-b27b-b1393006e7e7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/khoantap_llama-linear-0.5-1-0.5-merge/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "llama-linear-0.5-1-0.5-merge", - "id": "khoantap/llama-linear-0.5-1-0.5-merge", - "developer": "khoantap", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5032 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5951 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.148 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2936 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4172 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.369 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/khoantap/llama-linear-1-0.5-0.5-merge/5193ab4d-1627-43b5-bfb7-89e08ea1f810.json b/data/hfopenllm_v2/khoantap/llama-linear-1-0.5-0.5-merge/5193ab4d-1627-43b5-bfb7-89e08ea1f810.json deleted file mode 100644 index e1f8e572b..000000000 --- a/data/hfopenllm_v2/khoantap/llama-linear-1-0.5-0.5-merge/5193ab4d-1627-43b5-bfb7-89e08ea1f810.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/khoantap_llama-linear-1-0.5-0.5-merge/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "llama-linear-1-0.5-0.5-merge", - "id": "khoantap/llama-linear-1-0.5-0.5-merge", - "developer": "khoantap", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4515 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5526 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2477 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - 
"hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2928 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4118 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3635 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/khoantap/llama-slerp-merge/598faeda-48fb-43a8-aaa9-849d5dfcea79.json b/data/hfopenllm_v2/khoantap/llama-slerp-merge/598faeda-48fb-43a8-aaa9-849d5dfcea79.json deleted file mode 100644 index b429d2b76..000000000 --- a/data/hfopenllm_v2/khoantap/llama-slerp-merge/598faeda-48fb-43a8-aaa9-849d5dfcea79.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/khoantap_llama-slerp-merge/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "llama-slerp-merge", - "id": "khoantap/llama-slerp-merge", - "developer": "khoantap", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.498 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5783 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0831 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3029 - } - }, - { - 
"evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4053 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3678 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/khoantap/moe-out-merge/d1afa2fb-1256-4dd3-b13b-802917bf481b.json b/data/hfopenllm_v2/khoantap/moe-out-merge/d1afa2fb-1256-4dd3-b13b-802917bf481b.json deleted file mode 100644 index 7471ab85d..000000000 --- a/data/hfopenllm_v2/khoantap/moe-out-merge/d1afa2fb-1256-4dd3-b13b-802917bf481b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/khoantap_moe-out-merge/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "moe-out-merge", - "id": "khoantap/moe-out-merge", - "developer": "khoantap", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2MoeForCausalLM", - "params_billions": 19.305 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4505 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5151 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0929 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2886 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.4063 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3348 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/khulaifi95/Llama-3.1-8B-Reason-Blend-888k/397c9bc3-0af5-453c-9b68-5360783dfbf7.json b/data/hfopenllm_v2/khulaifi95/Llama-3.1-8B-Reason-Blend-888k/397c9bc3-0af5-453c-9b68-5360783dfbf7.json deleted file mode 100644 index b28191893..000000000 --- a/data/hfopenllm_v2/khulaifi95/Llama-3.1-8B-Reason-Blend-888k/397c9bc3-0af5-453c-9b68-5360783dfbf7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/khulaifi95_Llama-3.1-8B-Reason-Blend-888k/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.1-8B-Reason-Blend-888k", - "id": "khulaifi95/Llama-3.1-8B-Reason-Blend-888k", - "developer": "khulaifi95", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5832 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.479 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1156 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2794 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3379 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": 
"hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.31 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/kms7530/chemeng_llama-3-8b-Instruct-bnb-4bit_24_1_100_1/9bb39652-c79a-42bf-b6d8-c4ed6174a4c7.json b/data/hfopenllm_v2/kms7530/chemeng_llama-3-8b-Instruct-bnb-4bit_24_1_100_1/9bb39652-c79a-42bf-b6d8-c4ed6174a4c7.json deleted file mode 100644 index 06712c0c5..000000000 --- a/data/hfopenllm_v2/kms7530/chemeng_llama-3-8b-Instruct-bnb-4bit_24_1_100_1/9bb39652-c79a-42bf-b6d8-c4ed6174a4c7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/kms7530_chemeng_llama-3-8b-Instruct-bnb-4bit_24_1_100_1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "chemeng_llama-3-8b-Instruct-bnb-4bit_24_1_100_1", - "id": "kms7530/chemeng_llama-3-8b-Instruct-bnb-4bit_24_1_100_1", - "developer": "kms7530", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "?", - "params_billions": 9.3 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5455 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4289 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0619 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2701 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3821 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy 
on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2798 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/kms7530/chemeng_phi-3-mini-4k-instruct-bnb-4bit_16_4_100_1_nonmath/7e793244-b746-4aa4-a401-dcf5884f61a4.json b/data/hfopenllm_v2/kms7530/chemeng_phi-3-mini-4k-instruct-bnb-4bit_16_4_100_1_nonmath/7e793244-b746-4aa4-a401-dcf5884f61a4.json deleted file mode 100644 index a2ee08c81..000000000 --- a/data/hfopenllm_v2/kms7530/chemeng_phi-3-mini-4k-instruct-bnb-4bit_16_4_100_1_nonmath/7e793244-b746-4aa4-a401-dcf5884f61a4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/kms7530_chemeng_phi-3-mini-4k-instruct-bnb-4bit_16_4_100_1_nonmath/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "chemeng_phi-3-mini-4k-instruct-bnb-4bit_16_4_100_1_nonmath", - "id": "kms7530/chemeng_phi-3-mini-4k-instruct-bnb-4bit_16_4_100_1_nonmath", - "developer": "kms7530", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "?", - "params_billions": 4.132 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4863 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4987 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.108 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3104 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3983 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, 
- "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3481 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/kms7530/chemeng_qwen-math-7b_24_1_100_1/26a8da03-debd-41e3-8ee1-2827d76b26ca.json b/data/hfopenllm_v2/kms7530/chemeng_qwen-math-7b_24_1_100_1/26a8da03-debd-41e3-8ee1-2827d76b26ca.json deleted file mode 100644 index 969ace9a0..000000000 --- a/data/hfopenllm_v2/kms7530/chemeng_qwen-math-7b_24_1_100_1/26a8da03-debd-41e3-8ee1-2827d76b26ca.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/kms7530_chemeng_qwen-math-7b_24_1_100_1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "chemeng_qwen-math-7b_24_1_100_1", - "id": "kms7530/chemeng_qwen-math-7b_24_1_100_1", - "developer": "kms7530", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "?", - "params_billions": 8.911 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2111 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3578 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2243 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2441 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3687 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2158 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/kms7530/chemeng_qwen-math-7b_24_1_100_1_nonmath/e214c326-dd84-4915-bba1-faaafbb026b2.json b/data/hfopenllm_v2/kms7530/chemeng_qwen-math-7b_24_1_100_1_nonmath/e214c326-dd84-4915-bba1-faaafbb026b2.json deleted file mode 100644 index ce14e36aa..000000000 --- a/data/hfopenllm_v2/kms7530/chemeng_qwen-math-7b_24_1_100_1_nonmath/e214c326-dd84-4915-bba1-faaafbb026b2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/kms7530_chemeng_qwen-math-7b_24_1_100_1_nonmath/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "chemeng_qwen-math-7b_24_1_100_1_nonmath", - "id": "kms7530/chemeng_qwen-math-7b_24_1_100_1_nonmath", - "developer": "kms7530", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "?", - "params_billions": 15.231 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2584 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3893 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3097 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2903 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4087 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2452 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/kno10/ende-chat-0.0.5/98a5ea0a-6e45-48f8-8219-32099b9fa9d0.json 
b/data/hfopenllm_v2/kno10/ende-chat-0.0.5/98a5ea0a-6e45-48f8-8219-32099b9fa9d0.json deleted file mode 100644 index dfb3a0c4e..000000000 --- a/data/hfopenllm_v2/kno10/ende-chat-0.0.5/98a5ea0a-6e45-48f8-8219-32099b9fa9d0.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/kno10_ende-chat-0.0.5/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ende-chat-0.0.5", - "id": "kno10/ende-chat-0.0.5", - "developer": "kno10", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 7.891 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3404 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3604 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0204 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2651 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3938 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.179 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/kno10/ende-chat-0.0.7/40d7d17d-2d41-4d23-83c1-ab5f3320e36e.json b/data/hfopenllm_v2/kno10/ende-chat-0.0.7/40d7d17d-2d41-4d23-83c1-ab5f3320e36e.json deleted file mode 100644 index eb7bd3e9e..000000000 --- a/data/hfopenllm_v2/kno10/ende-chat-0.0.7/40d7d17d-2d41-4d23-83c1-ab5f3320e36e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - 
"evaluation_id": "hfopenllm_v2/kno10_ende-chat-0.0.7/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ende-chat-0.0.7", - "id": "kno10/ende-chat-0.0.7", - "developer": "kno10", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 7.891 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4401 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3792 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0174 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.281 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3861 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1966 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/kyutai/helium-1-preview-2b/d881a83a-9ba8-4919-8b89-45f5a7220621.json b/data/hfopenllm_v2/kyutai/helium-1-preview-2b/d881a83a-9ba8-4919-8b89-45f5a7220621.json deleted file mode 100644 index 9db17dbd8..000000000 --- a/data/hfopenllm_v2/kyutai/helium-1-preview-2b/d881a83a-9ba8-4919-8b89-45f5a7220621.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/kyutai_helium-1-preview-2b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - 
"evaluator_relationship": "third_party" - }, - "model_info": { - "name": "helium-1-preview-2b", - "id": "kyutai/helium-1-preview-2b", - "developer": "kyutai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "HeliumForCausalLM", - "params_billions": 2.173 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2614 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3638 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0136 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2785 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.355 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1873 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/kz919/QwQ-0.5B-Distilled-SFT/d6c966a1-7927-424a-9886-b98688d27e6f.json b/data/hfopenllm_v2/kz919/QwQ-0.5B-Distilled-SFT/d6c966a1-7927-424a-9886-b98688d27e6f.json deleted file mode 100644 index 34900ada9..000000000 --- a/data/hfopenllm_v2/kz919/QwQ-0.5B-Distilled-SFT/d6c966a1-7927-424a-9886-b98688d27e6f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/kz919_QwQ-0.5B-Distilled-SFT/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "QwQ-0.5B-Distilled-SFT", - "id": "kz919/QwQ-0.5B-Distilled-SFT", - "developer": "kz919", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - 
"architecture": "Qwen2ForCausalLM", - "params_billions": 0.494 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3077 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3256 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.074 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2609 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3409 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1587 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ladydaina/ECE-FDF/c09fe163-a7f7-4b6b-b407-ee8d698b2ee8.json b/data/hfopenllm_v2/ladydaina/ECE-FDF/c09fe163-a7f7-4b6b-b407-ee8d698b2ee8.json deleted file mode 100644 index 298900d6a..000000000 --- a/data/hfopenllm_v2/ladydaina/ECE-FDF/c09fe163-a7f7-4b6b-b407-ee8d698b2ee8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ladydaina_ECE-FDF/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ECE-FDF", - "id": "ladydaina/ECE-FDF", - "developer": "ladydaina", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3728 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.515 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0816 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2827 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4504 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3007 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/laislemke/LLaMA-2-vicuna-7b-slerp/b3979c7f-0596-4a24-b264-73a17ba19821.json b/data/hfopenllm_v2/laislemke/LLaMA-2-vicuna-7b-slerp/b3979c7f-0596-4a24-b264-73a17ba19821.json deleted file mode 100644 index d0f53a22e..000000000 --- a/data/hfopenllm_v2/laislemke/LLaMA-2-vicuna-7b-slerp/b3979c7f-0596-4a24-b264-73a17ba19821.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/laislemke_LLaMA-2-vicuna-7b-slerp/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "LLaMA-2-vicuna-7b-slerp", - "id": "laislemke/LLaMA-2-vicuna-7b-slerp", - "developer": "laislemke", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 6.738 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2932 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", 
- "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2986 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0113 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2735 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3833 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1342 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/lalainy/ECE-PRYMMAL-0.5B-FT-V5-MUSR/f6156893-92e7-4c4f-bff4-8b6d774ecbd8.json b/data/hfopenllm_v2/lalainy/ECE-PRYMMAL-0.5B-FT-V5-MUSR/f6156893-92e7-4c4f-bff4-8b6d774ecbd8.json deleted file mode 100644 index d16b8f1ef..000000000 --- a/data/hfopenllm_v2/lalainy/ECE-PRYMMAL-0.5B-FT-V5-MUSR/f6156893-92e7-4c4f-bff4-8b6d774ecbd8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/lalainy_ECE-PRYMMAL-0.5B-FT-V5-MUSR/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ECE-PRYMMAL-0.5B-FT-V5-MUSR", - "id": "lalainy/ECE-PRYMMAL-0.5B-FT-V5-MUSR", - "developer": "lalainy", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.494 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2138 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 
0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3269 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0453 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2743 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3262 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1533 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/lalainy/ECE-PRYMMAL-0.5B-SLERP-V4/8b1c19e0-8b47-46ae-8bf3-f84c7d3a9c0e.json b/data/hfopenllm_v2/lalainy/ECE-PRYMMAL-0.5B-SLERP-V4/8b1c19e0-8b47-46ae-8bf3-f84c7d3a9c0e.json deleted file mode 100644 index bf518c502..000000000 --- a/data/hfopenllm_v2/lalainy/ECE-PRYMMAL-0.5B-SLERP-V4/8b1c19e0-8b47-46ae-8bf3-f84c7d3a9c0e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/lalainy_ECE-PRYMMAL-0.5B-SLERP-V4/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ECE-PRYMMAL-0.5B-SLERP-V4", - "id": "lalainy/ECE-PRYMMAL-0.5B-SLERP-V4", - "developer": "lalainy", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.494 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1564 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2894 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": 
"DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2626 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3789 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1169 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/lalainy/ECE-PRYMMAL-YL-0.5B-SLERP-BIS-V1/6221102e-4e8c-46dd-8c03-fa9e92b7e4ea.json b/data/hfopenllm_v2/lalainy/ECE-PRYMMAL-YL-0.5B-SLERP-BIS-V1/6221102e-4e8c-46dd-8c03-fa9e92b7e4ea.json deleted file mode 100644 index e44f2c10f..000000000 --- a/data/hfopenllm_v2/lalainy/ECE-PRYMMAL-YL-0.5B-SLERP-BIS-V1/6221102e-4e8c-46dd-8c03-fa9e92b7e4ea.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/lalainy_ECE-PRYMMAL-YL-0.5B-SLERP-BIS-V1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ECE-PRYMMAL-YL-0.5B-SLERP-BIS-V1", - "id": "lalainy/ECE-PRYMMAL-YL-0.5B-SLERP-BIS-V1", - "developer": "lalainy", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.494 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1437 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3032 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0008 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2349 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3646 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1121 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/lalainy/ECE-PRYMMAL-YL-1B-SLERP-V3/329e5e91-10ba-4795-ae86-dda95e698b4f.json b/data/hfopenllm_v2/lalainy/ECE-PRYMMAL-YL-1B-SLERP-V3/329e5e91-10ba-4795-ae86-dda95e698b4f.json deleted file mode 100644 index 97784607c..000000000 --- a/data/hfopenllm_v2/lalainy/ECE-PRYMMAL-YL-1B-SLERP-V3/329e5e91-10ba-4795-ae86-dda95e698b4f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/lalainy_ECE-PRYMMAL-YL-1B-SLERP-V3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ECE-PRYMMAL-YL-1B-SLERP-V3", - "id": "lalainy/ECE-PRYMMAL-YL-1B-SLERP-V3", - "developer": "lalainy", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.544 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.325 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4225 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0974 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": 
"Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2945 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4213 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2931 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/lalainy/ECE-PRYMMAL-YL-1B-SLERP-V4/3fe89b13-135d-4790-871d-74e7a28ea2e9.json b/data/hfopenllm_v2/lalainy/ECE-PRYMMAL-YL-1B-SLERP-V4/3fe89b13-135d-4790-871d-74e7a28ea2e9.json deleted file mode 100644 index e1cc670e9..000000000 --- a/data/hfopenllm_v2/lalainy/ECE-PRYMMAL-YL-1B-SLERP-V4/3fe89b13-135d-4790-871d-74e7a28ea2e9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/lalainy_ECE-PRYMMAL-YL-1B-SLERP-V4/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ECE-PRYMMAL-YL-1B-SLERP-V4", - "id": "lalainy/ECE-PRYMMAL-YL-1B-SLERP-V4", - "developer": "lalainy", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.544 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3324 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4171 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1005 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.2861 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4306 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2893 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/lalainy/ECE-PRYMMAL-YL-6B-SLERP-V1/4b807741-f1b9-4964-9bc9-bb93f9b34217.json b/data/hfopenllm_v2/lalainy/ECE-PRYMMAL-YL-6B-SLERP-V1/4b807741-f1b9-4964-9bc9-bb93f9b34217.json deleted file mode 100644 index fb7527528..000000000 --- a/data/hfopenllm_v2/lalainy/ECE-PRYMMAL-YL-6B-SLERP-V1/4b807741-f1b9-4964-9bc9-bb93f9b34217.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/lalainy_ECE-PRYMMAL-YL-6B-SLERP-V1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ECE-PRYMMAL-YL-6B-SLERP-V1", - "id": "lalainy/ECE-PRYMMAL-YL-6B-SLERP-V1", - "developer": "lalainy", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 6.061 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3264 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4629 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1269 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2886 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": 
"Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4864 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3214 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/lalainy/ECE-PRYMMAL-YL-6B-SLERP-V2/c52a8a4d-be91-4a0d-8cd5-8473a42f0978.json b/data/hfopenllm_v2/lalainy/ECE-PRYMMAL-YL-6B-SLERP-V2/c52a8a4d-be91-4a0d-8cd5-8473a42f0978.json deleted file mode 100644 index c4e7de98b..000000000 --- a/data/hfopenllm_v2/lalainy/ECE-PRYMMAL-YL-6B-SLERP-V2/c52a8a4d-be91-4a0d-8cd5-8473a42f0978.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/lalainy_ECE-PRYMMAL-YL-6B-SLERP-V2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ECE-PRYMMAL-YL-6B-SLERP-V2", - "id": "lalainy/ECE-PRYMMAL-YL-6B-SLERP-V2", - "developer": "lalainy", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 6.061 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3249 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4629 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1269 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2886 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4864 - } - }, - { - "evaluation_name": "MMLU-PRO", - 
"source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3214 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/langgptai/Qwen-las-v0.1/f6e157c4-0ce9-41c9-b885-9222d894ff0c.json b/data/hfopenllm_v2/langgptai/Qwen-las-v0.1/f6e157c4-0ce9-41c9-b885-9222d894ff0c.json deleted file mode 100644 index 0046e64f5..000000000 --- a/data/hfopenllm_v2/langgptai/Qwen-las-v0.1/f6e157c4-0ce9-41c9-b885-9222d894ff0c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/langgptai_Qwen-las-v0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen-las-v0.1", - "id": "langgptai/Qwen-las-v0.1", - "developer": "langgptai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "?", - "params_billions": 7.901 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3301 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3893 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.037 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2466 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3701 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.2325 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/langgptai/qwen1.5-7b-chat-sa-v0.1/fe52a94a-5324-4b59-accc-dfd1f9d4aead.json b/data/hfopenllm_v2/langgptai/qwen1.5-7b-chat-sa-v0.1/fe52a94a-5324-4b59-accc-dfd1f9d4aead.json deleted file mode 100644 index 17240757f..000000000 --- a/data/hfopenllm_v2/langgptai/qwen1.5-7b-chat-sa-v0.1/fe52a94a-5324-4b59-accc-dfd1f9d4aead.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/langgptai_qwen1.5-7b-chat-sa-v0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "qwen1.5-7b-chat-sa-v0.1", - "id": "langgptai/qwen1.5-7b-chat-sa-v0.1", - "developer": "langgptai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "?", - "params_billions": 15.443 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4268 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4325 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0302 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3121 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3551 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2993 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/lars1234/Mistral-Small-24B-Instruct-2501-writer/1241f5e3-54eb-429e-b109-a5e163e39eda.json 
b/data/hfopenllm_v2/lars1234/Mistral-Small-24B-Instruct-2501-writer/1241f5e3-54eb-429e-b109-a5e163e39eda.json deleted file mode 100644 index 92337f309..000000000 --- a/data/hfopenllm_v2/lars1234/Mistral-Small-24B-Instruct-2501-writer/1241f5e3-54eb-429e-b109-a5e163e39eda.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/lars1234_Mistral-Small-24B-Instruct-2501-writer/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral-Small-24B-Instruct-2501-writer", - "id": "lars1234/Mistral-Small-24B-Instruct-2501-writer", - "developer": "lars1234", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 23.572 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6565 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6733 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3557 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3893 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4645 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5448 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/leafspark/Llama-3.1-8B-MultiReflection-Instruct/8ccc7c8c-1d14-45bb-9a6b-f8f69e506139.json b/data/hfopenllm_v2/leafspark/Llama-3.1-8B-MultiReflection-Instruct/8ccc7c8c-1d14-45bb-9a6b-f8f69e506139.json deleted file mode 100644 
index b9d1ba6c5..000000000 --- a/data/hfopenllm_v2/leafspark/Llama-3.1-8B-MultiReflection-Instruct/8ccc7c8c-1d14-45bb-9a6b-f8f69e506139.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/leafspark_Llama-3.1-8B-MultiReflection-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.1-8B-MultiReflection-Instruct", - "id": "leafspark/Llama-3.1-8B-MultiReflection-Instruct", - "developer": "leafspark", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7125 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5009 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1707 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2928 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3682 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3724 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/lemon07r/Gemma-2-Ataraxy-9B/5531b59e-24c0-41af-ab6b-d6a5e38b0a98.json b/data/hfopenllm_v2/lemon07r/Gemma-2-Ataraxy-9B/5531b59e-24c0-41af-ab6b-d6a5e38b0a98.json deleted file mode 100644 index 1a22b28f0..000000000 --- a/data/hfopenllm_v2/lemon07r/Gemma-2-Ataraxy-9B/5531b59e-24c0-41af-ab6b-d6a5e38b0a98.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": 
"0.2.0", - "evaluation_id": "hfopenllm_v2/lemon07r_Gemma-2-Ataraxy-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gemma-2-Ataraxy-9B", - "id": "lemon07r/Gemma-2-Ataraxy-9B", - "developer": "lemon07r", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3009 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5931 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0853 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3347 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4424 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4226 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/lemon07r/Gemma-2-Ataraxy-Advanced-9B/63e82cb3-2f6f-4617-abb7-ae093bc27830.json b/data/hfopenllm_v2/lemon07r/Gemma-2-Ataraxy-Advanced-9B/63e82cb3-2f6f-4617-abb7-ae093bc27830.json deleted file mode 100644 index d0d68741f..000000000 --- a/data/hfopenllm_v2/lemon07r/Gemma-2-Ataraxy-Advanced-9B/63e82cb3-2f6f-4617-abb7-ae093bc27830.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/lemon07r_Gemma-2-Ataraxy-Advanced-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": 
"documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gemma-2-Ataraxy-Advanced-9B", - "id": "lemon07r/Gemma-2-Ataraxy-Advanced-9B", - "developer": "lemon07r", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5516 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5889 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1979 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3356 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3761 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4244 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/lemon07r/Gemma-2-Ataraxy-Remix-9B/0feb74e6-40d4-472d-9233-27faa2d3f802.json b/data/hfopenllm_v2/lemon07r/Gemma-2-Ataraxy-Remix-9B/0feb74e6-40d4-472d-9233-27faa2d3f802.json deleted file mode 100644 index 3e6258067..000000000 --- a/data/hfopenllm_v2/lemon07r/Gemma-2-Ataraxy-Remix-9B/0feb74e6-40d4-472d-9233-27faa2d3f802.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/lemon07r_Gemma-2-Ataraxy-Remix-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gemma-2-Ataraxy-Remix-9B", - "id": "lemon07r/Gemma-2-Ataraxy-Remix-9B", - 
"developer": "lemon07r", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7083 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5892 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2017 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3389 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4372 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4239 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/lemon07r/Gemma-2-Ataraxy-v2-9B/e74dd005-c9b5-45c9-b7f5-455c3110e09b.json b/data/hfopenllm_v2/lemon07r/Gemma-2-Ataraxy-v2-9B/e74dd005-c9b5-45c9-b7f5-455c3110e09b.json deleted file mode 100644 index 48d6b6abd..000000000 --- a/data/hfopenllm_v2/lemon07r/Gemma-2-Ataraxy-v2-9B/e74dd005-c9b5-45c9-b7f5-455c3110e09b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/lemon07r_Gemma-2-Ataraxy-v2-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gemma-2-Ataraxy-v2-9B", - "id": "lemon07r/Gemma-2-Ataraxy-v2-9B", - "developer": "lemon07r", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": 
"IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2136 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5766 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0846 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3423 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3484 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4221 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/lemon07r/Gemma-2-Ataraxy-v2a-9B/d094bf6f-9952-45c7-995e-d7eda07f4668.json b/data/hfopenllm_v2/lemon07r/Gemma-2-Ataraxy-v2a-9B/d094bf6f-9952-45c7-995e-d7eda07f4668.json deleted file mode 100644 index 9dfdf8d74..000000000 --- a/data/hfopenllm_v2/lemon07r/Gemma-2-Ataraxy-v2a-9B/d094bf6f-9952-45c7-995e-d7eda07f4668.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/lemon07r_Gemma-2-Ataraxy-v2a-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gemma-2-Ataraxy-v2a-9B", - "id": "lemon07r/Gemma-2-Ataraxy-v2a-9B", - "developer": "lemon07r", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1595 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5182 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0612 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3398 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3165 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3515 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/lemon07r/Gemma-2-Ataraxy-v2f-9B/0e5f3393-8a6a-4f2f-948a-a37ae4d8fdeb.json b/data/hfopenllm_v2/lemon07r/Gemma-2-Ataraxy-v2f-9B/0e5f3393-8a6a-4f2f-948a-a37ae4d8fdeb.json deleted file mode 100644 index a5ae93322..000000000 --- a/data/hfopenllm_v2/lemon07r/Gemma-2-Ataraxy-v2f-9B/0e5f3393-8a6a-4f2f-948a-a37ae4d8fdeb.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/lemon07r_Gemma-2-Ataraxy-v2f-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gemma-2-Ataraxy-v2f-9B", - "id": "lemon07r/Gemma-2-Ataraxy-v2f-9B", - "developer": "lemon07r", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3791 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - 
"hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5193 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1163 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3389 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3231 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3503 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/lemon07r/Gemma-2-Ataraxy-v3-Advanced-9B/f91982ac-0cab-415a-8503-e090d195bd05.json b/data/hfopenllm_v2/lemon07r/Gemma-2-Ataraxy-v3-Advanced-9B/f91982ac-0cab-415a-8503-e090d195bd05.json deleted file mode 100644 index c793e8539..000000000 --- a/data/hfopenllm_v2/lemon07r/Gemma-2-Ataraxy-v3-Advanced-9B/f91982ac-0cab-415a-8503-e090d195bd05.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/lemon07r_Gemma-2-Ataraxy-v3-Advanced-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gemma-2-Ataraxy-v3-Advanced-9B", - "id": "lemon07r/Gemma-2-Ataraxy-v3-Advanced-9B", - "developer": "lemon07r", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6602 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.5935 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1873 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3364 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.445 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4196 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/lemon07r/Gemma-2-Ataraxy-v3b-9B/fb1af66e-7828-495b-8277-5cff77c3070e.json b/data/hfopenllm_v2/lemon07r/Gemma-2-Ataraxy-v3b-9B/fb1af66e-7828-495b-8277-5cff77c3070e.json deleted file mode 100644 index 5ef333acb..000000000 --- a/data/hfopenllm_v2/lemon07r/Gemma-2-Ataraxy-v3b-9B/fb1af66e-7828-495b-8277-5cff77c3070e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/lemon07r_Gemma-2-Ataraxy-v3b-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gemma-2-Ataraxy-v3b-9B", - "id": "lemon07r/Gemma-2-Ataraxy-v3b-9B", - "developer": "lemon07r", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 9.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6809 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5908 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": 
"DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2153 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3331 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4489 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4205 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/lemon07r/Gemma-2-Ataraxy-v3i-9B/ac84c157-4d11-43c1-8731-b1e5cfa91668.json b/data/hfopenllm_v2/lemon07r/Gemma-2-Ataraxy-v3i-9B/ac84c157-4d11-43c1-8731-b1e5cfa91668.json deleted file mode 100644 index fb3af6e05..000000000 --- a/data/hfopenllm_v2/lemon07r/Gemma-2-Ataraxy-v3i-9B/ac84c157-4d11-43c1-8731-b1e5cfa91668.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/lemon07r_Gemma-2-Ataraxy-v3i-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gemma-2-Ataraxy-v3i-9B", - "id": "lemon07r/Gemma-2-Ataraxy-v3i-9B", - "developer": "lemon07r", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 9.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4203 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5626 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.1533 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.328 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3181 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4166 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/lemon07r/Gemma-2-Ataraxy-v3j-9B/bbc812dd-9a9c-4f99-b813-50361025eea3.json b/data/hfopenllm_v2/lemon07r/Gemma-2-Ataraxy-v3j-9B/bbc812dd-9a9c-4f99-b813-50361025eea3.json deleted file mode 100644 index 627159167..000000000 --- a/data/hfopenllm_v2/lemon07r/Gemma-2-Ataraxy-v3j-9B/bbc812dd-9a9c-4f99-b813-50361025eea3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/lemon07r_Gemma-2-Ataraxy-v3j-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gemma-2-Ataraxy-v3j-9B", - "id": "lemon07r/Gemma-2-Ataraxy-v3j-9B", - "developer": "lemon07r", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 9.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4169 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5632 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1692 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.328 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.318 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4134 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/lemon07r/Gemma-2-Ataraxy-v4-Advanced-9B/fc818799-49d5-4fca-b131-ebe8d5d831f1.json b/data/hfopenllm_v2/lemon07r/Gemma-2-Ataraxy-v4-Advanced-9B/fc818799-49d5-4fca-b131-ebe8d5d831f1.json deleted file mode 100644 index 5762b4771..000000000 --- a/data/hfopenllm_v2/lemon07r/Gemma-2-Ataraxy-v4-Advanced-9B/fc818799-49d5-4fca-b131-ebe8d5d831f1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/lemon07r_Gemma-2-Ataraxy-v4-Advanced-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gemma-2-Ataraxy-v4-Advanced-9B", - "id": "lemon07r/Gemma-2-Ataraxy-v4-Advanced-9B", - "developer": "lemon07r", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7015 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6024 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2153 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3389 
- } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4581 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4367 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/lemon07r/Gemma-2-Ataraxy-v4a-Advanced-9B/33349989-8573-4d71-ae0f-99691fdaffc3.json b/data/hfopenllm_v2/lemon07r/Gemma-2-Ataraxy-v4a-Advanced-9B/33349989-8573-4d71-ae0f-99691fdaffc3.json deleted file mode 100644 index f1c4a9342..000000000 --- a/data/hfopenllm_v2/lemon07r/Gemma-2-Ataraxy-v4a-Advanced-9B/33349989-8573-4d71-ae0f-99691fdaffc3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/lemon07r_Gemma-2-Ataraxy-v4a-Advanced-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gemma-2-Ataraxy-v4a-Advanced-9B", - "id": "lemon07r/Gemma-2-Ataraxy-v4a-Advanced-9B", - "developer": "lemon07r", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7135 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5988 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2115 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.344 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4489 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4309 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/lemon07r/Gemma-2-Ataraxy-v4b-9B/91551de5-d8ac-4c0d-b9b4-3627db947f0e.json b/data/hfopenllm_v2/lemon07r/Gemma-2-Ataraxy-v4b-9B/91551de5-d8ac-4c0d-b9b4-3627db947f0e.json deleted file mode 100644 index de6307f19..000000000 --- a/data/hfopenllm_v2/lemon07r/Gemma-2-Ataraxy-v4b-9B/91551de5-d8ac-4c0d-b9b4-3627db947f0e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/lemon07r_Gemma-2-Ataraxy-v4b-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gemma-2-Ataraxy-v4b-9B", - "id": "lemon07r/Gemma-2-Ataraxy-v4b-9B", - "developer": "lemon07r", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6878 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6039 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2334 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3406 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4555 - } - }, - { - "evaluation_name": "MMLU-PRO", 
- "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4357 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/lemon07r/Gemma-2-Ataraxy-v4c-9B/c2d2c1f4-aaab-45f1-b3f6-5b4ea56b696e.json b/data/hfopenllm_v2/lemon07r/Gemma-2-Ataraxy-v4c-9B/c2d2c1f4-aaab-45f1-b3f6-5b4ea56b696e.json deleted file mode 100644 index ddb47c621..000000000 --- a/data/hfopenllm_v2/lemon07r/Gemma-2-Ataraxy-v4c-9B/c2d2c1f4-aaab-45f1-b3f6-5b4ea56b696e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/lemon07r_Gemma-2-Ataraxy-v4c-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gemma-2-Ataraxy-v4c-9B", - "id": "lemon07r/Gemma-2-Ataraxy-v4c-9B", - "developer": "lemon07r", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6945 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6084 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2266 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3339 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4528 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4395 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/lemon07r/Gemma-2-Ataraxy-v4d-9B/36821a8b-af18-4631-b4b0-7e4b37bb194b.json b/data/hfopenllm_v2/lemon07r/Gemma-2-Ataraxy-v4d-9B/36821a8b-af18-4631-b4b0-7e4b37bb194b.json deleted file mode 100644 index 911a9ca74..000000000 --- a/data/hfopenllm_v2/lemon07r/Gemma-2-Ataraxy-v4d-9B/36821a8b-af18-4631-b4b0-7e4b37bb194b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/lemon07r_Gemma-2-Ataraxy-v4d-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gemma-2-Ataraxy-v4d-9B", - "id": "lemon07r/Gemma-2-Ataraxy-v4d-9B", - "developer": "lemon07r", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.725 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6054 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2334 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3473 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4541 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4346 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/lemon07r/Llama-3-RedMagic4-8B/e402d129-f4f1-4b95-b079-4f30936119aa.json b/data/hfopenllm_v2/lemon07r/Llama-3-RedMagic4-8B/e402d129-f4f1-4b95-b079-4f30936119aa.json deleted file mode 100644 index 4d4ff0851..000000000 --- a/data/hfopenllm_v2/lemon07r/Llama-3-RedMagic4-8B/e402d129-f4f1-4b95-b079-4f30936119aa.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/lemon07r_Llama-3-RedMagic4-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-RedMagic4-8B", - "id": "lemon07r/Llama-3-RedMagic4-8B", - "developer": "lemon07r", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4864 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4256 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0899 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2903 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3766 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3676 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/lemon07r/llama-3-NeuralMahou-8b/814e1ea7-a639-4b05-9208-0bf537ea5479.json b/data/hfopenllm_v2/lemon07r/llama-3-NeuralMahou-8b/814e1ea7-a639-4b05-9208-0bf537ea5479.json deleted file mode 100644 index 27784d919..000000000 --- 
a/data/hfopenllm_v2/lemon07r/llama-3-NeuralMahou-8b/814e1ea7-a639-4b05-9208-0bf537ea5479.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/lemon07r_llama-3-NeuralMahou-8b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "llama-3-NeuralMahou-8b", - "id": "lemon07r/llama-3-NeuralMahou-8b", - "developer": "lemon07r", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4901 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4184 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.102 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2886 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3873 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.369 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/lesubra/ECE-EIFFEL-3B/35a50d36-31d0-454b-a13c-80ca26945f94.json b/data/hfopenllm_v2/lesubra/ECE-EIFFEL-3B/35a50d36-31d0-454b-a13c-80ca26945f94.json deleted file mode 100644 index 83e427178..000000000 --- a/data/hfopenllm_v2/lesubra/ECE-EIFFEL-3B/35a50d36-31d0-454b-a13c-80ca26945f94.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/lesubra_ECE-EIFFEL-3B/1770682486.623709", - "retrieved_timestamp": 
"1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ECE-EIFFEL-3B", - "id": "lesubra/ECE-EIFFEL-3B", - "developer": "lesubra", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Phi3ForCausalLM", - "params_billions": 3.821 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3469 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5102 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1216 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3314 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4362 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3821 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/lesubra/ECE-EIFFEL-3Bv2/87347017-4ff1-4bd3-a1d7-8f3999061209.json b/data/hfopenllm_v2/lesubra/ECE-EIFFEL-3Bv2/87347017-4ff1-4bd3-a1d7-8f3999061209.json deleted file mode 100644 index 3d51d39fa..000000000 --- a/data/hfopenllm_v2/lesubra/ECE-EIFFEL-3Bv2/87347017-4ff1-4bd3-a1d7-8f3999061209.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/lesubra_ECE-EIFFEL-3Bv2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ECE-EIFFEL-3Bv2", - "id": "lesubra/ECE-EIFFEL-3Bv2", - 
"developer": "lesubra", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Phi3ForCausalLM", - "params_billions": 3.821 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3013 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5424 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1186 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3356 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4443 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3999 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/lesubra/ECE-EIFFEL-3Bv3/976184ed-c4ed-4898-83c7-521a8a8309ac.json b/data/hfopenllm_v2/lesubra/ECE-EIFFEL-3Bv3/976184ed-c4ed-4898-83c7-521a8a8309ac.json deleted file mode 100644 index 33d7b6db8..000000000 --- a/data/hfopenllm_v2/lesubra/ECE-EIFFEL-3Bv3/976184ed-c4ed-4898-83c7-521a8a8309ac.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/lesubra_ECE-EIFFEL-3Bv3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ECE-EIFFEL-3Bv3", - "id": "lesubra/ECE-EIFFEL-3Bv3", - "developer": "lesubra", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Phi3ForCausalLM", - "params_billions": 3.821 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": 
"IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3786 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5469 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1669 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3297 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4675 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3975 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/lesubra/ECE-PRYMMAL-3B-SLERP-V1/fa52f072-7725-4a4e-b728-042e5897a1bd.json b/data/hfopenllm_v2/lesubra/ECE-PRYMMAL-3B-SLERP-V1/fa52f072-7725-4a4e-b728-042e5897a1bd.json deleted file mode 100644 index 692fe6956..000000000 --- a/data/hfopenllm_v2/lesubra/ECE-PRYMMAL-3B-SLERP-V1/fa52f072-7725-4a4e-b728-042e5897a1bd.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/lesubra_ECE-PRYMMAL-3B-SLERP-V1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ECE-PRYMMAL-3B-SLERP-V1", - "id": "lesubra/ECE-PRYMMAL-3B-SLERP-V1", - "developer": "lesubra", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Phi3ForCausalLM", - "params_billions": 3.821 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.2933 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5341 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1662 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3171 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4595 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.39 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/lesubra/ECE-PRYMMAL-3B-SLERP-V2/6374dcee-301c-4f28-9316-82ed8e693089.json b/data/hfopenllm_v2/lesubra/ECE-PRYMMAL-3B-SLERP-V2/6374dcee-301c-4f28-9316-82ed8e693089.json deleted file mode 100644 index 91ea20f99..000000000 --- a/data/hfopenllm_v2/lesubra/ECE-PRYMMAL-3B-SLERP-V2/6374dcee-301c-4f28-9316-82ed8e693089.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/lesubra_ECE-PRYMMAL-3B-SLERP-V2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ECE-PRYMMAL-3B-SLERP-V2", - "id": "lesubra/ECE-PRYMMAL-3B-SLERP-V2", - "developer": "lesubra", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Phi3ForCausalLM", - "params_billions": 3.821 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2933 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5341 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1662 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3171 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4595 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.39 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/lesubra/ECE-PRYMMAL-3B-SLERP_2-V1/b7c95cb4-f32f-466e-a28c-32afd9ec5578.json b/data/hfopenllm_v2/lesubra/ECE-PRYMMAL-3B-SLERP_2-V1/b7c95cb4-f32f-466e-a28c-32afd9ec5578.json deleted file mode 100644 index 7c02e49b7..000000000 --- a/data/hfopenllm_v2/lesubra/ECE-PRYMMAL-3B-SLERP_2-V1/b7c95cb4-f32f-466e-a28c-32afd9ec5578.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/lesubra_ECE-PRYMMAL-3B-SLERP_2-V1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ECE-PRYMMAL-3B-SLERP_2-V1", - "id": "lesubra/ECE-PRYMMAL-3B-SLERP_2-V1", - "developer": "lesubra", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Phi3ForCausalLM", - "params_billions": 3.821 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3649 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5411 - } - }, - { - "evaluation_name": "MATH 
Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1677 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3213 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4661 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.399 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/lesubra/ECE-PRYMMAL-3B-SLERP_2-V2/bddd742b-f7c9-44aa-ad2f-83f51a4625be.json b/data/hfopenllm_v2/lesubra/ECE-PRYMMAL-3B-SLERP_2-V2/bddd742b-f7c9-44aa-ad2f-83f51a4625be.json deleted file mode 100644 index 7ac9d72fd..000000000 --- a/data/hfopenllm_v2/lesubra/ECE-PRYMMAL-3B-SLERP_2-V2/bddd742b-f7c9-44aa-ad2f-83f51a4625be.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/lesubra_ECE-PRYMMAL-3B-SLERP_2-V2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ECE-PRYMMAL-3B-SLERP_2-V2", - "id": "lesubra/ECE-PRYMMAL-3B-SLERP_2-V2", - "developer": "lesubra", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Phi3ForCausalLM", - "params_billions": 3.821 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3664 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5411 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 
5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1677 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3213 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4661 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.399 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/lesubra/merge-test/099af0ee-c06b-4435-8f97-27681f3eddff.json b/data/hfopenllm_v2/lesubra/merge-test/099af0ee-c06b-4435-8f97-27681f3eddff.json deleted file mode 100644 index 0105f3bb4..000000000 --- a/data/hfopenllm_v2/lesubra/merge-test/099af0ee-c06b-4435-8f97-27681f3eddff.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/lesubra_merge-test/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "merge-test", - "id": "lesubra/merge-test", - "developer": "lesubra", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Phi3ForCausalLM", - "params_billions": 3.821 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5383 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.524 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1208 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": 
{ - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3221 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4419 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3874 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/lightblue/suzume-llama-3-8B-multilingual-orpo-borda-full/fa826f3a-8688-4518-8d44-68189abb47ba.json b/data/hfopenllm_v2/lightblue/suzume-llama-3-8B-multilingual-orpo-borda-full/fa826f3a-8688-4518-8d44-68189abb47ba.json deleted file mode 100644 index 122789813..000000000 --- a/data/hfopenllm_v2/lightblue/suzume-llama-3-8B-multilingual-orpo-borda-full/fa826f3a-8688-4518-8d44-68189abb47ba.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/lightblue_suzume-llama-3-8B-multilingual-orpo-borda-full/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "suzume-llama-3-8B-multilingual-orpo-borda-full", - "id": "lightblue/suzume-llama-3-8B-multilingual-orpo-borda-full", - "developer": "lightblue", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5817 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4714 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0763 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2592 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3222 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.331 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/lightblue/suzume-llama-3-8B-multilingual-orpo-borda-half/10d29dc0-3486-40df-9933-1ce8f0fabaa2.json b/data/hfopenllm_v2/lightblue/suzume-llama-3-8B-multilingual-orpo-borda-half/10d29dc0-3486-40df-9933-1ce8f0fabaa2.json deleted file mode 100644 index b4ab0a93e..000000000 --- a/data/hfopenllm_v2/lightblue/suzume-llama-3-8B-multilingual-orpo-borda-half/10d29dc0-3486-40df-9933-1ce8f0fabaa2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/lightblue_suzume-llama-3-8B-multilingual-orpo-borda-half/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "suzume-llama-3-8B-multilingual-orpo-borda-half", - "id": "lightblue/suzume-llama-3-8B-multilingual-orpo-borda-half", - "developer": "lightblue", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6249 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4707 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0906 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.245 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3516 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3614 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/lightblue/suzume-llama-3-8B-multilingual-orpo-borda-top25/741ff375-3392-461e-a9b0-e0dab4e6e9f8.json b/data/hfopenllm_v2/lightblue/suzume-llama-3-8B-multilingual-orpo-borda-top25/741ff375-3392-461e-a9b0-e0dab4e6e9f8.json deleted file mode 100644 index 9a8eccba8..000000000 --- a/data/hfopenllm_v2/lightblue/suzume-llama-3-8B-multilingual-orpo-borda-top25/741ff375-3392-461e-a9b0-e0dab4e6e9f8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/lightblue_suzume-llama-3-8B-multilingual-orpo-borda-top25/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "suzume-llama-3-8B-multilingual-orpo-borda-top25", - "id": "lightblue/suzume-llama-3-8B-multilingual-orpo-borda-top25", - "developer": "lightblue", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6637 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4865 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1042 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2727 - } - }, - { - "evaluation_name": "MUSR", - 
"source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3566 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3684 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/lightblue/suzume-llama-3-8B-multilingual-orpo-borda-top75/c3d709de-118d-40c2-ab89-040efedd7fdb.json b/data/hfopenllm_v2/lightblue/suzume-llama-3-8B-multilingual-orpo-borda-top75/c3d709de-118d-40c2-ab89-040efedd7fdb.json deleted file mode 100644 index 669b89390..000000000 --- a/data/hfopenllm_v2/lightblue/suzume-llama-3-8B-multilingual-orpo-borda-top75/c3d709de-118d-40c2-ab89-040efedd7fdb.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/lightblue_suzume-llama-3-8B-multilingual-orpo-borda-top75/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "suzume-llama-3-8B-multilingual-orpo-borda-top75", - "id": "lightblue/suzume-llama-3-8B-multilingual-orpo-borda-top75", - "developer": "lightblue", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6687 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4833 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0785 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2727 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - 
"hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3817 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3769 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/lightblue/suzume-llama-3-8B-multilingual/9be3dd27-93fa-49e9-a628-5a77a8a3bb9a.json b/data/hfopenllm_v2/lightblue/suzume-llama-3-8B-multilingual/9be3dd27-93fa-49e9-a628-5a77a8a3bb9a.json deleted file mode 100644 index 919b41cbc..000000000 --- a/data/hfopenllm_v2/lightblue/suzume-llama-3-8B-multilingual/9be3dd27-93fa-49e9-a628-5a77a8a3bb9a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/lightblue_suzume-llama-3-8B-multilingual/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "suzume-llama-3-8B-multilingual", - "id": "lightblue/suzume-llama-3-8B-multilingual", - "developer": "lightblue", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6678 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.495 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0944 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2836 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.3977 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3383 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/lkoenig/BBAI_145_/be850d1b-bf75-4c34-830f-8881792ac842.json b/data/hfopenllm_v2/lkoenig/BBAI_145_/be850d1b-bf75-4c34-830f-8881792ac842.json deleted file mode 100644 index 302154c6b..000000000 --- a/data/hfopenllm_v2/lkoenig/BBAI_145_/be850d1b-bf75-4c34-830f-8881792ac842.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/lkoenig_BBAI_145_/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "BBAI_145_", - "id": "lkoenig/BBAI_145_", - "developer": "lkoenig", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.445 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5567 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.361 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3163 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4382 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": 
false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.449 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/lkoenig/BBAI_200_Gemma/6b644b97-4fc3-4826-9ea9-68be1dc8e947.json b/data/hfopenllm_v2/lkoenig/BBAI_200_Gemma/6b644b97-4fc3-4826-9ea9-68be1dc8e947.json deleted file mode 100644 index 7865c5bfc..000000000 --- a/data/hfopenllm_v2/lkoenig/BBAI_200_Gemma/6b644b97-4fc3-4826-9ea9-68be1dc8e947.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/lkoenig_BBAI_200_Gemma/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "BBAI_200_Gemma", - "id": "lkoenig/BBAI_200_Gemma", - "developer": "lkoenig", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 19.3 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0705 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3449 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2668 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3631 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1679 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/lkoenig/BBAI_212_QwenLawLo/861d41f1-6d33-4e07-96ea-2c39a36c4b63.json 
b/data/hfopenllm_v2/lkoenig/BBAI_212_QwenLawLo/861d41f1-6d33-4e07-96ea-2c39a36c4b63.json deleted file mode 100644 index 2d76a0d19..000000000 --- a/data/hfopenllm_v2/lkoenig/BBAI_212_QwenLawLo/861d41f1-6d33-4e07-96ea-2c39a36c4b63.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/lkoenig_BBAI_212_QwenLawLo/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "BBAI_212_QwenLawLo", - "id": "lkoenig/BBAI_212_QwenLawLo", - "developer": "lkoenig", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4566 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5574 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3603 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3163 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.437 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4489 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/lkoenig/BBAI_212_Qwencore/7501b038-4847-45bc-8b92-6800d7a58c1e.json b/data/hfopenllm_v2/lkoenig/BBAI_212_Qwencore/7501b038-4847-45bc-8b92-6800d7a58c1e.json deleted file mode 100644 index 3b77ae393..000000000 --- a/data/hfopenllm_v2/lkoenig/BBAI_212_Qwencore/7501b038-4847-45bc-8b92-6800d7a58c1e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ 
- "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/lkoenig_BBAI_212_Qwencore/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "BBAI_212_Qwencore", - "id": "lkoenig/BBAI_212_Qwencore", - "developer": "lkoenig", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.613 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4384 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5569 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3489 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3163 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4343 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.449 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/lkoenig/BBAI_230_Xiaqwen/db48206d-700b-45f3-b597-8752110113b5.json b/data/hfopenllm_v2/lkoenig/BBAI_230_Xiaqwen/db48206d-700b-45f3-b597-8752110113b5.json deleted file mode 100644 index af9941d74..000000000 --- a/data/hfopenllm_v2/lkoenig/BBAI_230_Xiaqwen/db48206d-700b-45f3-b597-8752110113b5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/lkoenig_BBAI_230_Xiaqwen/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": 
"Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "BBAI_230_Xiaqwen", - "id": "lkoenig/BBAI_230_Xiaqwen", - "developer": "lkoenig", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4649 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5578 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3663 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3138 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4422 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4481 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/lkoenig/BBAI_375_QwenDyancabs/b52b76e4-9dec-4336-88b1-d98b95b95d2a.json b/data/hfopenllm_v2/lkoenig/BBAI_375_QwenDyancabs/b52b76e4-9dec-4336-88b1-d98b95b95d2a.json deleted file mode 100644 index 687f6c068..000000000 --- a/data/hfopenllm_v2/lkoenig/BBAI_375_QwenDyancabs/b52b76e4-9dec-4336-88b1-d98b95b95d2a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/lkoenig_BBAI_375_QwenDyancabs/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "BBAI_375_QwenDyancabs", - "id": "lkoenig/BBAI_375_QwenDyancabs", - "developer": "lkoenig", - "inference_platform": "unknown", - "additional_details": { - 
"precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4566 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5571 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3776 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3129 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4462 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4476 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/lkoenig/BBAI_456_QwenKoen/ba9ec2ea-2bce-4999-9e48-e1d0795b31d0.json b/data/hfopenllm_v2/lkoenig/BBAI_456_QwenKoen/ba9ec2ea-2bce-4999-9e48-e1d0795b31d0.json deleted file mode 100644 index b1a7b3d3e..000000000 --- a/data/hfopenllm_v2/lkoenig/BBAI_456_QwenKoen/ba9ec2ea-2bce-4999-9e48-e1d0795b31d0.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/lkoenig_BBAI_456_QwenKoen/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "BBAI_456_QwenKoen", - "id": "lkoenig/BBAI_456_QwenKoen", - "developer": "lkoenig", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, 
- "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4529 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5553 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3686 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3129 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4395 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4469 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/lkoenig/BBAI_7B_KoenQwenDyan/724221ce-d7b2-43cb-8e16-72ac529a7b60.json b/data/hfopenllm_v2/lkoenig/BBAI_7B_KoenQwenDyan/724221ce-d7b2-43cb-8e16-72ac529a7b60.json deleted file mode 100644 index 41f147ae3..000000000 --- a/data/hfopenllm_v2/lkoenig/BBAI_7B_KoenQwenDyan/724221ce-d7b2-43cb-8e16-72ac529a7b60.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/lkoenig_BBAI_7B_KoenQwenDyan/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "BBAI_7B_KoenQwenDyan", - "id": "lkoenig/BBAI_7B_KoenQwenDyan", - "developer": "lkoenig", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5807 - } - }, - { - "evaluation_name": "BBH", 
- "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5537 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3739 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.318 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4369 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.446 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/lkoenig/BBAI_7B_Qwen2.5koen/552f3814-d071-4d00-a895-b739dffdcb2d.json b/data/hfopenllm_v2/lkoenig/BBAI_7B_Qwen2.5koen/552f3814-d071-4d00-a895-b739dffdcb2d.json deleted file mode 100644 index b762f6c72..000000000 --- a/data/hfopenllm_v2/lkoenig/BBAI_7B_Qwen2.5koen/552f3814-d071-4d00-a895-b739dffdcb2d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/lkoenig_BBAI_7B_Qwen2.5koen/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "BBAI_7B_Qwen2.5koen", - "id": "lkoenig/BBAI_7B_Qwen2.5koen", - "developer": "lkoenig", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.46 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.5544 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3656 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3129 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4369 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4485 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/lkoenig/BBAI_7B_QwenDyanKoenLo/d3819133-bae8-493d-9a86-aee67da5d115.json b/data/hfopenllm_v2/lkoenig/BBAI_7B_QwenDyanKoenLo/d3819133-bae8-493d-9a86-aee67da5d115.json deleted file mode 100644 index 536de7261..000000000 --- a/data/hfopenllm_v2/lkoenig/BBAI_7B_QwenDyanKoenLo/d3819133-bae8-493d-9a86-aee67da5d115.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/lkoenig_BBAI_7B_QwenDyanKoenLo/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "BBAI_7B_QwenDyanKoenLo", - "id": "lkoenig/BBAI_7B_QwenDyanKoenLo", - "developer": "lkoenig", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4663 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5562 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": 
"DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.364 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3188 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4343 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4465 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/lkoenig/BBAI_7B_QwenDyancabsLAW/5c3a022f-7221-4b4f-ab67-d5b69c558434.json b/data/hfopenllm_v2/lkoenig/BBAI_7B_QwenDyancabsLAW/5c3a022f-7221-4b4f-ab67-d5b69c558434.json deleted file mode 100644 index 9a62f605c..000000000 --- a/data/hfopenllm_v2/lkoenig/BBAI_7B_QwenDyancabsLAW/5c3a022f-7221-4b4f-ab67-d5b69c558434.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/lkoenig_BBAI_7B_QwenDyancabsLAW/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "BBAI_7B_QwenDyancabsLAW", - "id": "lkoenig/BBAI_7B_QwenDyancabsLAW", - "developer": "lkoenig", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.555 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5579 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.3678 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3188 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4461 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4471 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/llmat/Mistral-v0.3-7B-ORPO/c161b868-746f-4d88-9f41-eb8283a7b87a.json b/data/hfopenllm_v2/llmat/Mistral-v0.3-7B-ORPO/c161b868-746f-4d88-9f41-eb8283a7b87a.json deleted file mode 100644 index 240e3a159..000000000 --- a/data/hfopenllm_v2/llmat/Mistral-v0.3-7B-ORPO/c161b868-746f-4d88-9f41-eb8283a7b87a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/llmat_Mistral-v0.3-7B-ORPO/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral-v0.3-7B-ORPO", - "id": "llmat/Mistral-v0.3-7B-ORPO", - "developer": "llmat", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.377 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3978 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0242 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2668 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3555 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2278 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/llmat/Mistral-v0.3-7B-ORPO/f79a76fc-09ff-48c8-b0e7-5f18e0750e6d.json b/data/hfopenllm_v2/llmat/Mistral-v0.3-7B-ORPO/f79a76fc-09ff-48c8-b0e7-5f18e0750e6d.json deleted file mode 100644 index ffd455c9c..000000000 --- a/data/hfopenllm_v2/llmat/Mistral-v0.3-7B-ORPO/f79a76fc-09ff-48c8-b0e7-5f18e0750e6d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/llmat_Mistral-v0.3-7B-ORPO/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral-v0.3-7B-ORPO", - "id": "llmat/Mistral-v0.3-7B-ORPO", - "developer": "llmat", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 7.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.364 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4005 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0015 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2693 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - 
"hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3529 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2301 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/llnYou/ECE-PRYMMAL-YL-1B-SLERP-V5/39f4d1ab-fd42-4746-b949-9666ce32f9d1.json b/data/hfopenllm_v2/llnYou/ECE-PRYMMAL-YL-1B-SLERP-V5/39f4d1ab-fd42-4746-b949-9666ce32f9d1.json deleted file mode 100644 index cc9cfb3e3..000000000 --- a/data/hfopenllm_v2/llnYou/ECE-PRYMMAL-YL-1B-SLERP-V5/39f4d1ab-fd42-4746-b949-9666ce32f9d1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/llnYou_ECE-PRYMMAL-YL-1B-SLERP-V5/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ECE-PRYMMAL-YL-1B-SLERP-V5", - "id": "llnYou/ECE-PRYMMAL-YL-1B-SLERP-V5", - "developer": "llnYou", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.544 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3313 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4233 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.111 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2861 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { 
- "score": 0.3868 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2931 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/llnYou/ECE-PRYMMAL-YL-1B-SLERP-V6/8348f316-9109-4229-9fee-edc02431befa.json b/data/hfopenllm_v2/llnYou/ECE-PRYMMAL-YL-1B-SLERP-V6/8348f316-9109-4229-9fee-edc02431befa.json deleted file mode 100644 index 48d4a3877..000000000 --- a/data/hfopenllm_v2/llnYou/ECE-PRYMMAL-YL-1B-SLERP-V6/8348f316-9109-4229-9fee-edc02431befa.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/llnYou_ECE-PRYMMAL-YL-1B-SLERP-V6/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ECE-PRYMMAL-YL-1B-SLERP-V6", - "id": "llnYou/ECE-PRYMMAL-YL-1B-SLERP-V6", - "developer": "llnYou", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.357 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1388 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3944 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0023 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2903 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3928 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.235 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/llnYou/ECE-PRYMMAL-YL-3B-SLERP-V1/6b2346c6-5fbf-4195-b3bb-66bbd446ca53.json b/data/hfopenllm_v2/llnYou/ECE-PRYMMAL-YL-3B-SLERP-V1/6b2346c6-5fbf-4195-b3bb-66bbd446ca53.json deleted file mode 100644 index 914529969..000000000 --- a/data/hfopenllm_v2/llnYou/ECE-PRYMMAL-YL-3B-SLERP-V1/6b2346c6-5fbf-4195-b3bb-66bbd446ca53.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/llnYou_ECE-PRYMMAL-YL-3B-SLERP-V1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ECE-PRYMMAL-YL-3B-SLERP-V1", - "id": "llnYou/ECE-PRYMMAL-YL-3B-SLERP-V1", - "developer": "llnYou", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 2.81 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2346 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4018 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0091 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2936 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3364 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.285 - } - } - ] -} \ No newline at end 
of file diff --git a/data/hfopenllm_v2/llnYou/ECE-PRYMMAL-YL-3B-SLERP-V2/8645ffc1-6487-4205-b8b0-e980e094ac6c.json b/data/hfopenllm_v2/llnYou/ECE-PRYMMAL-YL-3B-SLERP-V2/8645ffc1-6487-4205-b8b0-e980e094ac6c.json deleted file mode 100644 index dd572deaa..000000000 --- a/data/hfopenllm_v2/llnYou/ECE-PRYMMAL-YL-3B-SLERP-V2/8645ffc1-6487-4205-b8b0-e980e094ac6c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/llnYou_ECE-PRYMMAL-YL-3B-SLERP-V2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ECE-PRYMMAL-YL-3B-SLERP-V2", - "id": "llnYou/ECE-PRYMMAL-YL-3B-SLERP-V2", - "developer": "llnYou", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 2.81 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2309 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.399 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0128 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2768 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3588 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.29 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/llnYou/ECE-PRYMMAL-YL-3B-SLERP-V3/2c6d1e57-7673-4a86-808e-6ff6a7146a11.json b/data/hfopenllm_v2/llnYou/ECE-PRYMMAL-YL-3B-SLERP-V3/2c6d1e57-7673-4a86-808e-6ff6a7146a11.json deleted file 
mode 100644 index b5bc635f3..000000000 --- a/data/hfopenllm_v2/llnYou/ECE-PRYMMAL-YL-3B-SLERP-V3/2c6d1e57-7673-4a86-808e-6ff6a7146a11.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/llnYou_ECE-PRYMMAL-YL-3B-SLERP-V3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ECE-PRYMMAL-YL-3B-SLERP-V3", - "id": "llnYou/ECE-PRYMMAL-YL-3B-SLERP-V3", - "developer": "llnYou", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Phi3ForCausalLM", - "params_billions": 3.821 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3581 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5473 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1299 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3045 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4361 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4043 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/lmsys/vicuna-13b-v1.3/64ab8b1a-62be-4561-8f0c-e42f1fe37178.json b/data/hfopenllm_v2/lmsys/vicuna-13b-v1.3/64ab8b1a-62be-4561-8f0c-e42f1fe37178.json deleted file mode 100644 index 51fcf6393..000000000 --- a/data/hfopenllm_v2/lmsys/vicuna-13b-v1.3/64ab8b1a-62be-4561-8f0c-e42f1fe37178.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/lmsys_vicuna-13b-v1.3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "vicuna-13b-v1.3", - "id": "lmsys/vicuna-13b-v1.3", - "developer": "lmsys", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 13.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3344 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3384 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0144 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2676 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3727 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2243 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/lmsys/vicuna-7b-v1.3/3eb22885-eb7c-4c85-b79f-cd47ffacd551.json b/data/hfopenllm_v2/lmsys/vicuna-7b-v1.3/3eb22885-eb7c-4c85-b79f-cd47ffacd551.json deleted file mode 100644 index adbfd12b9..000000000 --- a/data/hfopenllm_v2/lmsys/vicuna-7b-v1.3/3eb22885-eb7c-4c85-b79f-cd47ffacd551.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/lmsys_vicuna-7b-v1.3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - 
"model_info": { - "name": "vicuna-7b-v1.3", - "id": "lmsys/vicuna-7b-v1.3", - "developer": "lmsys", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 7.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2909 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3298 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0128 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2424 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3793 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1838 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/lmsys/vicuna-7b-v1.5/8956d608-c627-469b-943d-bfad6c7382af.json b/data/hfopenllm_v2/lmsys/vicuna-7b-v1.5/8956d608-c627-469b-943d-bfad6c7382af.json deleted file mode 100644 index 54b24d075..000000000 --- a/data/hfopenllm_v2/lmsys/vicuna-7b-v1.5/8956d608-c627-469b-943d-bfad6c7382af.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/lmsys_vicuna-7b-v1.5/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "vicuna-7b-v1.5", - "id": "lmsys/vicuna-7b-v1.5", - "developer": "lmsys", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 7.0 - } - }, - "evaluation_results": [ - { - 
"evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2352 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3947 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0136 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2584 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4231 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2147 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/lodrick-the-lafted/llama-3.1-8b-instruct-ortho-v7/9ff060c8-d4fa-4880-a0cd-9581f5c2f574.json b/data/hfopenllm_v2/lodrick-the-lafted/llama-3.1-8b-instruct-ortho-v7/9ff060c8-d4fa-4880-a0cd-9581f5c2f574.json deleted file mode 100644 index 2ad864b95..000000000 --- a/data/hfopenllm_v2/lodrick-the-lafted/llama-3.1-8b-instruct-ortho-v7/9ff060c8-d4fa-4880-a0cd-9581f5c2f574.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/lodrick-the-lafted_llama-3.1-8b-instruct-ortho-v7/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "llama-3.1-8b-instruct-ortho-v7", - "id": "lodrick-the-lafted/llama-3.1-8b-instruct-ortho-v7", - "developer": "lodrick-the-lafted", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": 
"google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3515 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3907 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0272 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2727 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3616 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1974 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/lordjia/Llama-3-Cantonese-8B-Instruct/e3d6b3d7-a231-40c1-bac9-0b7fcb478bca.json b/data/hfopenllm_v2/lordjia/Llama-3-Cantonese-8B-Instruct/e3d6b3d7-a231-40c1-bac9-0b7fcb478bca.json deleted file mode 100644 index eef3e165a..000000000 --- a/data/hfopenllm_v2/lordjia/Llama-3-Cantonese-8B-Instruct/e3d6b3d7-a231-40c1-bac9-0b7fcb478bca.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/lordjia_Llama-3-Cantonese-8B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-Cantonese-8B-Instruct", - "id": "lordjia/Llama-3-Cantonese-8B-Instruct", - "developer": "lordjia", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.6669 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4814 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0891 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2936 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4046 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3515 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/lordjia/Qwen2-Cantonese-7B-Instruct/20acb302-3a74-4425-af4c-a1d719b90a88.json b/data/hfopenllm_v2/lordjia/Qwen2-Cantonese-7B-Instruct/20acb302-3a74-4425-af4c-a1d719b90a88.json deleted file mode 100644 index 95ef13bda..000000000 --- a/data/hfopenllm_v2/lordjia/Qwen2-Cantonese-7B-Instruct/20acb302-3a74-4425-af4c-a1d719b90a88.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/lordjia_Qwen2-Cantonese-7B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2-Cantonese-7B-Instruct", - "id": "lordjia/Qwen2-Cantonese-7B-Instruct", - "developer": "lordjia", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5435 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5215 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.256 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2953 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4004 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3843 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/lt-asset/nova-1.3b/a8613588-687d-4291-ae5a-57688501cffd.json b/data/hfopenllm_v2/lt-asset/nova-1.3b/a8613588-687d-4291-ae5a-57688501cffd.json deleted file mode 100644 index 80085f690..000000000 --- a/data/hfopenllm_v2/lt-asset/nova-1.3b/a8613588-687d-4291-ae5a-57688501cffd.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/lt-asset_nova-1.3b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "nova-1.3b", - "id": "lt-asset/nova-1.3b", - "developer": "lt-asset", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "NovaForCausalLM", - "params_billions": 1.347 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1214 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.317 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": 
"hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0121 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2492 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3698 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1142 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/lunahr/thea-3b-50r-u1/83dd67cb-5508-4aa5-9435-d5585b7f3d52.json b/data/hfopenllm_v2/lunahr/thea-3b-50r-u1/83dd67cb-5508-4aa5-9435-d5585b7f3d52.json deleted file mode 100644 index 00e74d58e..000000000 --- a/data/hfopenllm_v2/lunahr/thea-3b-50r-u1/83dd67cb-5508-4aa5-9435-d5585b7f3d52.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/lunahr_thea-3b-50r-u1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "thea-3b-50r-u1", - "id": "lunahr/thea-3b-50r-u1", - "developer": "lunahr", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.603 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4105 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1042 - } - 
}, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2836 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3182 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2808 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/lunahr/thea-v2-3b-50r/26d981bb-f2e5-4195-8d6f-594bb0b26f4a.json b/data/hfopenllm_v2/lunahr/thea-v2-3b-50r/26d981bb-f2e5-4195-8d6f-594bb0b26f4a.json deleted file mode 100644 index bc6aaf04b..000000000 --- a/data/hfopenllm_v2/lunahr/thea-v2-3b-50r/26d981bb-f2e5-4195-8d6f-594bb0b26f4a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/lunahr_thea-v2-3b-50r/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "thea-v2-3b-50r", - "id": "lunahr/thea-v2-3b-50r", - "developer": "lunahr", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3704 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4194 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0242 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, 
- "max_score": 1.0 - }, - "score_details": { - "score": 0.2609 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3222 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2409 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/m42-health/Llama3-Med42-70B/df06c977-b54c-4668-837f-eb583ef24d29.json b/data/hfopenllm_v2/m42-health/Llama3-Med42-70B/df06c977-b54c-4668-837f-eb583ef24d29.json deleted file mode 100644 index 8b02603c5..000000000 --- a/data/hfopenllm_v2/m42-health/Llama3-Med42-70B/df06c977-b54c-4668-837f-eb583ef24d29.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/m42-health_Llama3-Med42-70B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama3-Med42-70B", - "id": "m42-health/Llama3-Med42-70B", - "developer": "m42-health", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 70.554 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6291 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6688 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2258 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3473 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4629 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4963 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/macadeliccc/Samantha-Qwen-2-7B/31a8ac03-f58b-46e3-9f17-53311b1fd506.json b/data/hfopenllm_v2/macadeliccc/Samantha-Qwen-2-7B/31a8ac03-f58b-46e3-9f17-53311b1fd506.json deleted file mode 100644 index c21abe404..000000000 --- a/data/hfopenllm_v2/macadeliccc/Samantha-Qwen-2-7B/31a8ac03-f58b-46e3-9f17-53311b1fd506.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/macadeliccc_Samantha-Qwen-2-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Samantha-Qwen-2-7B", - "id": "macadeliccc/Samantha-Qwen-2-7B", - "developer": "macadeliccc", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4377 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5082 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2115 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2727 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4799 - } - }, - { - "evaluation_name": "MMLU-PRO", - 
"source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3779 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/macadeliccc/magistrate-3.2-3b-base/3e4a7141-7a82-421a-a107-bbac3cbafc9b.json b/data/hfopenllm_v2/macadeliccc/magistrate-3.2-3b-base/3e4a7141-7a82-421a-a107-bbac3cbafc9b.json deleted file mode 100644 index 552af16da..000000000 --- a/data/hfopenllm_v2/macadeliccc/magistrate-3.2-3b-base/3e4a7141-7a82-421a-a107-bbac3cbafc9b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/macadeliccc_magistrate-3.2-3b-base/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "magistrate-3.2-3b-base", - "id": "macadeliccc/magistrate-3.2-3b-base", - "developer": "macadeliccc", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1159 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3343 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0113 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2609 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3976 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, 
- "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1689 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/macadeliccc/magistrate-3.2-3b-it/9a3069f2-81ed-484a-b6e6-a45a259e9a43.json b/data/hfopenllm_v2/macadeliccc/magistrate-3.2-3b-it/9a3069f2-81ed-484a-b6e6-a45a259e9a43.json deleted file mode 100644 index 55e0d15e4..000000000 --- a/data/hfopenllm_v2/macadeliccc/magistrate-3.2-3b-it/9a3069f2-81ed-484a-b6e6-a45a259e9a43.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/macadeliccc_magistrate-3.2-3b-it/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "magistrate-3.2-3b-it", - "id": "macadeliccc/magistrate-3.2-3b-it", - "developer": "macadeliccc", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2292 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3257 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0196 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2475 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3763 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1592 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/magnifi/Phi3_intent_v56_3_w_unknown_5_lr_0.002/c0a3d0c3-c541-4606-a925-4100b062284f.json b/data/hfopenllm_v2/magnifi/Phi3_intent_v56_3_w_unknown_5_lr_0.002/c0a3d0c3-c541-4606-a925-4100b062284f.json deleted file mode 100644 index 0f570e9e4..000000000 --- a/data/hfopenllm_v2/magnifi/Phi3_intent_v56_3_w_unknown_5_lr_0.002/c0a3d0c3-c541-4606-a925-4100b062284f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/magnifi_Phi3_intent_v56_3_w_unknown_5_lr_0.002/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Phi3_intent_v56_3_w_unknown_5_lr_0.002", - "id": "magnifi/Phi3_intent_v56_3_w_unknown_5_lr_0.002", - "developer": "magnifi", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 3.821 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2018 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3282 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2643 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4123 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1472 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/maldv/Awqward2.5-32B-Instruct/20685a4b-686f-4cd4-b49d-3067a005256d.json 
b/data/hfopenllm_v2/maldv/Awqward2.5-32B-Instruct/20685a4b-686f-4cd4-b49d-3067a005256d.json deleted file mode 100644 index 3a60298fd..000000000 --- a/data/hfopenllm_v2/maldv/Awqward2.5-32B-Instruct/20685a4b-686f-4cd4-b49d-3067a005256d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/maldv_Awqward2.5-32B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Awqward2.5-32B-Instruct", - "id": "maldv/Awqward2.5-32B-Instruct", - "developer": "maldv", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 32.764 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8255 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6974 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6231 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3406 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4275 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5723 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/maldv/Lytta2.5-32B-Instruct/85a91293-cd51-4f79-8b98-2f4bc67d78c1.json b/data/hfopenllm_v2/maldv/Lytta2.5-32B-Instruct/85a91293-cd51-4f79-8b98-2f4bc67d78c1.json deleted file mode 100644 index 4697190aa..000000000 --- a/data/hfopenllm_v2/maldv/Lytta2.5-32B-Instruct/85a91293-cd51-4f79-8b98-2f4bc67d78c1.json +++ 
/dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/maldv_Lytta2.5-32B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Lytta2.5-32B-Instruct", - "id": "maldv/Lytta2.5-32B-Instruct", - "developer": "maldv", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 32.764 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2508 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.56 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3444 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2668 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3769 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5048 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/maldv/Qwentile2.5-32B-Instruct/d2e3a6c2-4e67-4150-b9a8-fec979fb1658.json b/data/hfopenllm_v2/maldv/Qwentile2.5-32B-Instruct/d2e3a6c2-4e67-4150-b9a8-fec979fb1658.json deleted file mode 100644 index 345ca800e..000000000 --- a/data/hfopenllm_v2/maldv/Qwentile2.5-32B-Instruct/d2e3a6c2-4e67-4150-b9a8-fec979fb1658.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/maldv_Qwentile2.5-32B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - 
"source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwentile2.5-32B-Instruct", - "id": "maldv/Qwentile2.5-32B-Instruct", - "developer": "maldv", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 32.764 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7393 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6963 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5219 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3842 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4682 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5879 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/maldv/badger-kappa-llama-3-8b/c4d686f2-2af1-4271-9556-09380f07ba5f.json b/data/hfopenllm_v2/maldv/badger-kappa-llama-3-8b/c4d686f2-2af1-4271-9556-09380f07ba5f.json deleted file mode 100644 index 65f56c77f..000000000 --- a/data/hfopenllm_v2/maldv/badger-kappa-llama-3-8b/c4d686f2-2af1-4271-9556-09380f07ba5f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/maldv_badger-kappa-llama-3-8b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "badger-kappa-llama-3-8b", - "id": "maldv/badger-kappa-llama-3-8b", - "developer": "maldv", 
- "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4695 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5085 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0861 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3029 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3765 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3695 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/maldv/badger-lambda-llama-3-8b/93167303-b38e-43f0-a552-72c26ccb4339.json b/data/hfopenllm_v2/maldv/badger-lambda-llama-3-8b/93167303-b38e-43f0-a552-72c26ccb4339.json deleted file mode 100644 index 84491b944..000000000 --- a/data/hfopenllm_v2/maldv/badger-lambda-llama-3-8b/93167303-b38e-43f0-a552-72c26ccb4339.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/maldv_badger-lambda-llama-3-8b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "badger-lambda-llama-3-8b", - "id": "maldv/badger-lambda-llama-3-8b", - "developer": "maldv", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - 
"dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4861 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4963 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0944 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2819 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3754 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3767 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/maldv/badger-mu-llama-3-8b/b52a176f-f369-4791-a7e3-88a72709c868.json b/data/hfopenllm_v2/maldv/badger-mu-llama-3-8b/b52a176f-f369-4791-a7e3-88a72709c868.json deleted file mode 100644 index 006838490..000000000 --- a/data/hfopenllm_v2/maldv/badger-mu-llama-3-8b/b52a176f-f369-4791-a7e3-88a72709c868.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/maldv_badger-mu-llama-3-8b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "badger-mu-llama-3-8b", - "id": "maldv/badger-mu-llama-3-8b", - "developer": "maldv", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.4919 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5143 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0559 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2592 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3555 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3674 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/maldv/badger-writer-llama-3-8b/b6310012-17f1-4ee0-abd0-0079a9299350.json b/data/hfopenllm_v2/maldv/badger-writer-llama-3-8b/b6310012-17f1-4ee0-abd0-0079a9299350.json deleted file mode 100644 index 08b482ae6..000000000 --- a/data/hfopenllm_v2/maldv/badger-writer-llama-3-8b/b6310012-17f1-4ee0-abd0-0079a9299350.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/maldv_badger-writer-llama-3-8b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "badger-writer-llama-3-8b", - "id": "maldv/badger-writer-llama-3-8b", - "developer": "maldv", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5303 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": 
"Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4864 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0755 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2894 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3581 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.376 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/marcuscedricridia/Cheng-1/f581e832-0f77-496e-bcd3-6cfec51ef594.json b/data/hfopenllm_v2/marcuscedricridia/Cheng-1/f581e832-0f77-496e-bcd3-6cfec51ef594.json deleted file mode 100644 index db31c8ed3..000000000 --- a/data/hfopenllm_v2/marcuscedricridia/Cheng-1/f581e832-0f77-496e-bcd3-6cfec51ef594.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/marcuscedricridia_Cheng-1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Cheng-1", - "id": "marcuscedricridia/Cheng-1", - "developer": "marcuscedricridia", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.613 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7789 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5525 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - 
"source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4894 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2961 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4073 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4349 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/marcuscedricridia/Cheng-2-v1.1/47b47c89-b13b-4099-98b2-854feae05f63.json b/data/hfopenllm_v2/marcuscedricridia/Cheng-2-v1.1/47b47c89-b13b-4099-98b2-854feae05f63.json deleted file mode 100644 index a97a44973..000000000 --- a/data/hfopenllm_v2/marcuscedricridia/Cheng-2-v1.1/47b47c89-b13b-4099-98b2-854feae05f63.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/marcuscedricridia_Cheng-2-v1.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Cheng-2-v1.1", - "id": "marcuscedricridia/Cheng-2-v1.1", - "developer": "marcuscedricridia", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.827 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.651 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, 
- "max_score": 1.0 - }, - "score_details": { - "score": 0.5393 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3431 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4167 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5076 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/marcuscedricridia/Cheng-2/8d51ae58-7b20-4fa4-b234-2abb9cdeaad4.json b/data/hfopenllm_v2/marcuscedricridia/Cheng-2/8d51ae58-7b20-4fa4-b234-2abb9cdeaad4.json deleted file mode 100644 index d2b1be907..000000000 --- a/data/hfopenllm_v2/marcuscedricridia/Cheng-2/8d51ae58-7b20-4fa4-b234-2abb9cdeaad4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/marcuscedricridia_Cheng-2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Cheng-2", - "id": "marcuscedricridia/Cheng-2", - "developer": "marcuscedricridia", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8337 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6499 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5438 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": 
"Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3456 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4193 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5013 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/marcuscedricridia/Hush-Qwen2.5-7B-MST-v1.1/4d4d5679-8ec6-49b8-a5d7-2a76497b44b7.json b/data/hfopenllm_v2/marcuscedricridia/Hush-Qwen2.5-7B-MST-v1.1/4d4d5679-8ec6-49b8-a5d7-2a76497b44b7.json deleted file mode 100644 index 2a3ec93a3..000000000 --- a/data/hfopenllm_v2/marcuscedricridia/Hush-Qwen2.5-7B-MST-v1.1/4d4d5679-8ec6-49b8-a5d7-2a76497b44b7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/marcuscedricridia_Hush-Qwen2.5-7B-MST-v1.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Hush-Qwen2.5-7B-MST-v1.1", - "id": "marcuscedricridia/Hush-Qwen2.5-7B-MST-v1.1", - "developer": "marcuscedricridia", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.613 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7445 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5559 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4653 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3062 - } - }, 
- { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4073 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4299 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/marcuscedricridia/Hush-Qwen2.5-7B-MST-v1.3/0bdb6574-69e2-4858-b7aa-a90a5fadf741.json b/data/hfopenllm_v2/marcuscedricridia/Hush-Qwen2.5-7B-MST-v1.3/0bdb6574-69e2-4858-b7aa-a90a5fadf741.json deleted file mode 100644 index 5208a426c..000000000 --- a/data/hfopenllm_v2/marcuscedricridia/Hush-Qwen2.5-7B-MST-v1.3/0bdb6574-69e2-4858-b7aa-a90a5fadf741.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/marcuscedricridia_Hush-Qwen2.5-7B-MST-v1.3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Hush-Qwen2.5-7B-MST-v1.3", - "id": "marcuscedricridia/Hush-Qwen2.5-7B-MST-v1.3", - "developer": "marcuscedricridia", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.613 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7043 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5516 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4758 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3146 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4311 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.444 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/marcuscedricridia/Hush-Qwen2.5-7B-MST/fa1a92bb-ad25-4be2-a35f-7fdebbeeeba8.json b/data/hfopenllm_v2/marcuscedricridia/Hush-Qwen2.5-7B-MST/fa1a92bb-ad25-4be2-a35f-7fdebbeeeba8.json deleted file mode 100644 index 637b7439c..000000000 --- a/data/hfopenllm_v2/marcuscedricridia/Hush-Qwen2.5-7B-MST/fa1a92bb-ad25-4be2-a35f-7fdebbeeeba8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/marcuscedricridia_Hush-Qwen2.5-7B-MST/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Hush-Qwen2.5-7B-MST", - "id": "marcuscedricridia/Hush-Qwen2.5-7B-MST", - "developer": "marcuscedricridia", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.613 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7488 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5458 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4245 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3037 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3914 - } - }, - { 
- "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4163 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/marcuscedricridia/Hush-Qwen2.5-7B-Preview/d62ea0a1-cc9d-41b7-8d60-479b8e2262b5.json b/data/hfopenllm_v2/marcuscedricridia/Hush-Qwen2.5-7B-Preview/d62ea0a1-cc9d-41b7-8d60-479b8e2262b5.json deleted file mode 100644 index af57fa32c..000000000 --- a/data/hfopenllm_v2/marcuscedricridia/Hush-Qwen2.5-7B-Preview/d62ea0a1-cc9d-41b7-8d60-479b8e2262b5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/marcuscedricridia_Hush-Qwen2.5-7B-Preview/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Hush-Qwen2.5-7B-Preview", - "id": "marcuscedricridia/Hush-Qwen2.5-7B-Preview", - "developer": "marcuscedricridia", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.613 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7962 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5431 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3754 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3112 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4298 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4364 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/marcuscedricridia/Hush-Qwen2.5-7B-RP-v1.4-1M/912446e3-efdf-4ed0-80bd-261c6c87a3d0.json b/data/hfopenllm_v2/marcuscedricridia/Hush-Qwen2.5-7B-RP-v1.4-1M/912446e3-efdf-4ed0-80bd-261c6c87a3d0.json deleted file mode 100644 index 50261cfbe..000000000 --- a/data/hfopenllm_v2/marcuscedricridia/Hush-Qwen2.5-7B-RP-v1.4-1M/912446e3-efdf-4ed0-80bd-261c6c87a3d0.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/marcuscedricridia_Hush-Qwen2.5-7B-RP-v1.4-1M/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Hush-Qwen2.5-7B-RP-v1.4-1M", - "id": "marcuscedricridia/Hush-Qwen2.5-7B-RP-v1.4-1M", - "developer": "marcuscedricridia", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.613 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7728 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5295 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3369 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2987 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4433 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.4135 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/marcuscedricridia/Hush-Qwen2.5-7B-v1.1/5e86dc31-ae3e-4ef7-858e-41e29b3a8031.json b/data/hfopenllm_v2/marcuscedricridia/Hush-Qwen2.5-7B-v1.1/5e86dc31-ae3e-4ef7-858e-41e29b3a8031.json deleted file mode 100644 index 91ec9fbad..000000000 --- a/data/hfopenllm_v2/marcuscedricridia/Hush-Qwen2.5-7B-v1.1/5e86dc31-ae3e-4ef7-858e-41e29b3a8031.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/marcuscedricridia_Hush-Qwen2.5-7B-v1.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Hush-Qwen2.5-7B-v1.1", - "id": "marcuscedricridia/Hush-Qwen2.5-7B-v1.1", - "developer": "marcuscedricridia", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.613 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7889 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5384 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4381 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3163 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4179 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4227 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/marcuscedricridia/Hush-Qwen2.5-7B-v1.2/80680e5e-ab83-4a59-aeec-9d4166509c47.json 
b/data/hfopenllm_v2/marcuscedricridia/Hush-Qwen2.5-7B-v1.2/80680e5e-ab83-4a59-aeec-9d4166509c47.json deleted file mode 100644 index db41151dd..000000000 --- a/data/hfopenllm_v2/marcuscedricridia/Hush-Qwen2.5-7B-v1.2/80680e5e-ab83-4a59-aeec-9d4166509c47.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/marcuscedricridia_Hush-Qwen2.5-7B-v1.2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Hush-Qwen2.5-7B-v1.2", - "id": "marcuscedricridia/Hush-Qwen2.5-7B-v1.2", - "developer": "marcuscedricridia", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.613 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7865 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5403 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4403 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3146 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4219 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4197 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/marcuscedricridia/Hush-Qwen2.5-7B-v1.3/c5bc9c92-8469-4174-aafd-67bb61aaccf2.json b/data/hfopenllm_v2/marcuscedricridia/Hush-Qwen2.5-7B-v1.3/c5bc9c92-8469-4174-aafd-67bb61aaccf2.json deleted file mode 100644 index b018cf076..000000000 --- 
a/data/hfopenllm_v2/marcuscedricridia/Hush-Qwen2.5-7B-v1.3/c5bc9c92-8469-4174-aafd-67bb61aaccf2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/marcuscedricridia_Hush-Qwen2.5-7B-v1.3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Hush-Qwen2.5-7B-v1.3", - "id": "marcuscedricridia/Hush-Qwen2.5-7B-v1.3", - "developer": "marcuscedricridia", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.613 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7856 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5327 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3323 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3121 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4246 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4345 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/marcuscedricridia/Hush-Qwen2.5-7B-v1.4/1d67b792-178b-4baa-a108-2362f658bd4e.json b/data/hfopenllm_v2/marcuscedricridia/Hush-Qwen2.5-7B-v1.4/1d67b792-178b-4baa-a108-2362f658bd4e.json deleted file mode 100644 index 142cdd945..000000000 --- a/data/hfopenllm_v2/marcuscedricridia/Hush-Qwen2.5-7B-v1.4/1d67b792-178b-4baa-a108-2362f658bd4e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/marcuscedricridia_Hush-Qwen2.5-7B-v1.4/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Hush-Qwen2.5-7B-v1.4", - "id": "marcuscedricridia/Hush-Qwen2.5-7B-v1.4", - "developer": "marcuscedricridia", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.613 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7835 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5423 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.426 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3112 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4232 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4195 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/marcuscedricridia/Qwen2.5-7B-Preview/eb0c87b0-4795-4029-82c1-57ce37ba8259.json b/data/hfopenllm_v2/marcuscedricridia/Qwen2.5-7B-Preview/eb0c87b0-4795-4029-82c1-57ce37ba8259.json deleted file mode 100644 index 9e2e24bb9..000000000 --- a/data/hfopenllm_v2/marcuscedricridia/Qwen2.5-7B-Preview/eb0c87b0-4795-4029-82c1-57ce37ba8259.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/marcuscedricridia_Qwen2.5-7B-Preview/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": 
"documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-7B-Preview", - "id": "marcuscedricridia/Qwen2.5-7B-Preview", - "developer": "marcuscedricridia", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.613 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7679 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.536 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3444 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3238 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.414 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4258 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/marcuscedricridia/Yell-Qwen2.5-7B-Preview-v1.1/dc9b2300-7ab0-4e92-9d23-15fe9ca52994.json b/data/hfopenllm_v2/marcuscedricridia/Yell-Qwen2.5-7B-Preview-v1.1/dc9b2300-7ab0-4e92-9d23-15fe9ca52994.json deleted file mode 100644 index f754c4b96..000000000 --- a/data/hfopenllm_v2/marcuscedricridia/Yell-Qwen2.5-7B-Preview-v1.1/dc9b2300-7ab0-4e92-9d23-15fe9ca52994.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/marcuscedricridia_Yell-Qwen2.5-7B-Preview-v1.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": 
"Yell-Qwen2.5-7B-Preview-v1.1", - "id": "marcuscedricridia/Yell-Qwen2.5-7B-Preview-v1.1", - "developer": "marcuscedricridia", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.613 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5757 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5348 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1896 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2861 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4059 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3831 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/marcuscedricridia/Yell-Qwen2.5-7B-Preview/e005624d-c822-4be1-9477-873642aae228.json b/data/hfopenllm_v2/marcuscedricridia/Yell-Qwen2.5-7B-Preview/e005624d-c822-4be1-9477-873642aae228.json deleted file mode 100644 index 0967ce44f..000000000 --- a/data/hfopenllm_v2/marcuscedricridia/Yell-Qwen2.5-7B-Preview/e005624d-c822-4be1-9477-873642aae228.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/marcuscedricridia_Yell-Qwen2.5-7B-Preview/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Yell-Qwen2.5-7B-Preview", - "id": "marcuscedricridia/Yell-Qwen2.5-7B-Preview", - "developer": "marcuscedricridia", - "inference_platform": "unknown", - 
"additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.613 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5839 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5371 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1926 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.281 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4046 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3798 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/marcuscedricridia/absolute-o1-7b/e9756d91-b9e2-4dd0-bf08-c6154c7d1f2e.json b/data/hfopenllm_v2/marcuscedricridia/absolute-o1-7b/e9756d91-b9e2-4dd0-bf08-c6154c7d1f2e.json deleted file mode 100644 index 6b56c9588..000000000 --- a/data/hfopenllm_v2/marcuscedricridia/absolute-o1-7b/e9756d91-b9e2-4dd0-bf08-c6154c7d1f2e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/marcuscedricridia_absolute-o1-7b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "absolute-o1-7b", - "id": "marcuscedricridia/absolute-o1-7b", - "developer": "marcuscedricridia", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.613 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": 
"IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7516 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5469 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5083 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3196 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4114 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4413 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/marcuscedricridia/cursa-o1-7b-2-28-2025/704598c3-c5d6-4ce0-bab3-0fa98118e16a.json b/data/hfopenllm_v2/marcuscedricridia/cursa-o1-7b-2-28-2025/704598c3-c5d6-4ce0-bab3-0fa98118e16a.json deleted file mode 100644 index f7db34598..000000000 --- a/data/hfopenllm_v2/marcuscedricridia/cursa-o1-7b-2-28-2025/704598c3-c5d6-4ce0-bab3-0fa98118e16a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/marcuscedricridia_cursa-o1-7b-2-28-2025/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "cursa-o1-7b-2-28-2025", - "id": "marcuscedricridia/cursa-o1-7b-2-28-2025", - "developer": "marcuscedricridia", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.613 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7467 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5384 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4811 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.307 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4273 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4365 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/marcuscedricridia/cursa-o1-7b-v1.1/fafc9463-d725-4827-8bc1-5cd9e83814b6.json b/data/hfopenllm_v2/marcuscedricridia/cursa-o1-7b-v1.1/fafc9463-d725-4827-8bc1-5cd9e83814b6.json deleted file mode 100644 index 542f287db..000000000 --- a/data/hfopenllm_v2/marcuscedricridia/cursa-o1-7b-v1.1/fafc9463-d725-4827-8bc1-5cd9e83814b6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/marcuscedricridia_cursa-o1-7b-v1.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "cursa-o1-7b-v1.1", - "id": "marcuscedricridia/cursa-o1-7b-v1.1", - "developer": "marcuscedricridia", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.613 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7528 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": 
"hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5493 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4985 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.307 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4259 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4392 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/marcuscedricridia/cursa-o1-7b-v1.2-normalize-false/109820e0-ee00-449c-9ae5-58a7bf1da5f8.json b/data/hfopenllm_v2/marcuscedricridia/cursa-o1-7b-v1.2-normalize-false/109820e0-ee00-449c-9ae5-58a7bf1da5f8.json deleted file mode 100644 index 37f93dd1f..000000000 --- a/data/hfopenllm_v2/marcuscedricridia/cursa-o1-7b-v1.2-normalize-false/109820e0-ee00-449c-9ae5-58a7bf1da5f8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/marcuscedricridia_cursa-o1-7b-v1.2-normalize-false/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "cursa-o1-7b-v1.2-normalize-false", - "id": "marcuscedricridia/cursa-o1-7b-v1.2-normalize-false", - "developer": "marcuscedricridia", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.613 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7616 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5492 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4992 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.307 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4273 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4436 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/marcuscedricridia/cursa-o1-7b/37f29d5b-d803-4195-9ce0-75e45e32c160.json b/data/hfopenllm_v2/marcuscedricridia/cursa-o1-7b/37f29d5b-d803-4195-9ce0-75e45e32c160.json deleted file mode 100644 index ad5a42f1a..000000000 --- a/data/hfopenllm_v2/marcuscedricridia/cursa-o1-7b/37f29d5b-d803-4195-9ce0-75e45e32c160.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/marcuscedricridia_cursa-o1-7b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "cursa-o1-7b", - "id": "marcuscedricridia/cursa-o1-7b", - "developer": "marcuscedricridia", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.613 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7628 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5466 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - 
"source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4955 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.307 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4301 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4392 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/marcuscedricridia/cursor-o1-7b/43546f48-8c46-4481-b1e5-f4b1ad2535be.json b/data/hfopenllm_v2/marcuscedricridia/cursor-o1-7b/43546f48-8c46-4481-b1e5-f4b1ad2535be.json deleted file mode 100644 index 69bdd676a..000000000 --- a/data/hfopenllm_v2/marcuscedricridia/cursor-o1-7b/43546f48-8c46-4481-b1e5-f4b1ad2535be.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/marcuscedricridia_cursor-o1-7b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "cursor-o1-7b", - "id": "marcuscedricridia/cursor-o1-7b", - "developer": "marcuscedricridia", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4107 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5007 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, 
- "max_score": 1.0 - }, - "score_details": { - "score": 0.1412 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.281 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4101 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3251 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/marcuscedricridia/cursorr-o1.2-7b/ec81e0ff-9cb4-4d43-9f78-1d5f4edc9103.json b/data/hfopenllm_v2/marcuscedricridia/cursorr-o1.2-7b/ec81e0ff-9cb4-4d43-9f78-1d5f4edc9103.json deleted file mode 100644 index ebc368fd4..000000000 --- a/data/hfopenllm_v2/marcuscedricridia/cursorr-o1.2-7b/ec81e0ff-9cb4-4d43-9f78-1d5f4edc9103.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/marcuscedricridia_cursorr-o1.2-7b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "cursorr-o1.2-7b", - "id": "marcuscedricridia/cursorr-o1.2-7b", - "developer": "marcuscedricridia", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.166 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3068 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2542 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3538 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.108 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/marcuscedricridia/etr1o-explicit-v1.1/9290c86f-40b0-4520-b8aa-3460de62c396.json b/data/hfopenllm_v2/marcuscedricridia/etr1o-explicit-v1.1/9290c86f-40b0-4520-b8aa-3460de62c396.json deleted file mode 100644 index 3d23d32e5..000000000 --- a/data/hfopenllm_v2/marcuscedricridia/etr1o-explicit-v1.1/9290c86f-40b0-4520-b8aa-3460de62c396.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/marcuscedricridia_etr1o-explicit-v1.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "etr1o-explicit-v1.1", - "id": "marcuscedricridia/etr1o-explicit-v1.1", - "developer": "marcuscedricridia", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.613 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.288 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3132 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0045 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 
0.2777 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4111 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1195 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/marcuscedricridia/etr1o-explicit-v1.2/a4bf576e-9556-4956-8dcb-4d8906d45db0.json b/data/hfopenllm_v2/marcuscedricridia/etr1o-explicit-v1.2/a4bf576e-9556-4956-8dcb-4d8906d45db0.json deleted file mode 100644 index 22ae724ac..000000000 --- a/data/hfopenllm_v2/marcuscedricridia/etr1o-explicit-v1.2/a4bf576e-9556-4956-8dcb-4d8906d45db0.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/marcuscedricridia_etr1o-explicit-v1.2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "etr1o-explicit-v1.2", - "id": "marcuscedricridia/etr1o-explicit-v1.2", - "developer": "marcuscedricridia", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.613 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1504 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.295 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2609 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4031 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1126 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/marcuscedricridia/etr1o-v1.1/320a5c00-3307-4bc3-9f47-9befb88e461c.json b/data/hfopenllm_v2/marcuscedricridia/etr1o-v1.1/320a5c00-3307-4bc3-9f47-9befb88e461c.json deleted file mode 100644 index a0ab10b5b..000000000 --- a/data/hfopenllm_v2/marcuscedricridia/etr1o-v1.1/320a5c00-3307-4bc3-9f47-9befb88e461c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/marcuscedricridia_etr1o-v1.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "etr1o-v1.1", - "id": "marcuscedricridia/etr1o-v1.1", - "developer": "marcuscedricridia", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.613 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1597 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.31 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2567 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4017 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - 
"dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1157 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/marcuscedricridia/etr1o-v1.2/844d1556-6bc6-467e-a145-f92646770727.json b/data/hfopenllm_v2/marcuscedricridia/etr1o-v1.2/844d1556-6bc6-467e-a145-f92646770727.json deleted file mode 100644 index c2db8d268..000000000 --- a/data/hfopenllm_v2/marcuscedricridia/etr1o-v1.2/844d1556-6bc6-467e-a145-f92646770727.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/marcuscedricridia_etr1o-v1.2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "etr1o-v1.2", - "id": "marcuscedricridia/etr1o-v1.2", - "developer": "marcuscedricridia", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7287 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6349 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3588 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3758 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4714 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.5316 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/marcuscedricridia/fan-o1-7b/78923f4b-c2e7-4472-8398-10a0a8453ec5.json b/data/hfopenllm_v2/marcuscedricridia/fan-o1-7b/78923f4b-c2e7-4472-8398-10a0a8453ec5.json deleted file mode 100644 index 63b85c22d..000000000 --- a/data/hfopenllm_v2/marcuscedricridia/fan-o1-7b/78923f4b-c2e7-4472-8398-10a0a8453ec5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/marcuscedricridia_fan-o1-7b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "fan-o1-7b", - "id": "marcuscedricridia/fan-o1-7b", - "developer": "marcuscedricridia", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.613 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4456 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4849 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1616 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2844 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3834 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3274 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/marcuscedricridia/olmner-7b/17abe1bf-2e97-409e-88e3-4f661861a195.json 
b/data/hfopenllm_v2/marcuscedricridia/olmner-7b/17abe1bf-2e97-409e-88e3-4f661861a195.json deleted file mode 100644 index 116baa15b..000000000 --- a/data/hfopenllm_v2/marcuscedricridia/olmner-7b/17abe1bf-2e97-409e-88e3-4f661861a195.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/marcuscedricridia_olmner-7b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "olmner-7b", - "id": "marcuscedricridia/olmner-7b", - "developer": "marcuscedricridia", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7254 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5472 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.463 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3079 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.438 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4309 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/marcuscedricridia/olmner-della-7b/756978e5-1dfe-433e-ba88-339004a50ea7.json b/data/hfopenllm_v2/marcuscedricridia/olmner-della-7b/756978e5-1dfe-433e-ba88-339004a50ea7.json deleted file mode 100644 index 41cca1ed9..000000000 --- a/data/hfopenllm_v2/marcuscedricridia/olmner-della-7b/756978e5-1dfe-433e-ba88-339004a50ea7.json +++ 
/dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/marcuscedricridia_olmner-della-7b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "olmner-della-7b", - "id": "marcuscedricridia/olmner-della-7b", - "developer": "marcuscedricridia", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7637 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5491 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4962 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3012 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4208 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4386 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/marcuscedricridia/olmner-o1-7b/a889ae3a-5d86-4454-bfb9-332c4b61b836.json b/data/hfopenllm_v2/marcuscedricridia/olmner-o1-7b/a889ae3a-5d86-4454-bfb9-332c4b61b836.json deleted file mode 100644 index 963ff040a..000000000 --- a/data/hfopenllm_v2/marcuscedricridia/olmner-o1-7b/a889ae3a-5d86-4454-bfb9-332c4b61b836.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/marcuscedricridia_olmner-o1-7b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": 
"HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "olmner-o1-7b", - "id": "marcuscedricridia/olmner-o1-7b", - "developer": "marcuscedricridia", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.613 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7528 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5481 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4924 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3012 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4299 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4386 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/marcuscedricridia/olmner-sbr-7b/2c5e1086-03b7-4cdd-801e-03fb26183076.json b/data/hfopenllm_v2/marcuscedricridia/olmner-sbr-7b/2c5e1086-03b7-4cdd-801e-03fb26183076.json deleted file mode 100644 index 634719136..000000000 --- a/data/hfopenllm_v2/marcuscedricridia/olmner-sbr-7b/2c5e1086-03b7-4cdd-801e-03fb26183076.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/marcuscedricridia_olmner-sbr-7b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "olmner-sbr-7b", - "id": "marcuscedricridia/olmner-sbr-7b", - 
"developer": "marcuscedricridia", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.613 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.76 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5462 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4947 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3087 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4154 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4412 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/marcuscedricridia/post-cursa-o1/d9578847-b732-4c75-b246-9cdf03674fe0.json b/data/hfopenllm_v2/marcuscedricridia/post-cursa-o1/d9578847-b732-4c75-b246-9cdf03674fe0.json deleted file mode 100644 index 2d244c450..000000000 --- a/data/hfopenllm_v2/marcuscedricridia/post-cursa-o1/d9578847-b732-4c75-b246-9cdf03674fe0.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/marcuscedricridia_post-cursa-o1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "post-cursa-o1", - "id": "marcuscedricridia/post-cursa-o1", - "developer": "marcuscedricridia", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.613 - } - }, - "evaluation_results": [ - { - 
"evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7628 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.548 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4872 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3096 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4351 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4361 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/marcuscedricridia/pre-cursa-o1-v1.2/4c6f83fe-7896-4cf3-9434-b5f8d499f5ba.json b/data/hfopenllm_v2/marcuscedricridia/pre-cursa-o1-v1.2/4c6f83fe-7896-4cf3-9434-b5f8d499f5ba.json deleted file mode 100644 index 57d91dea9..000000000 --- a/data/hfopenllm_v2/marcuscedricridia/pre-cursa-o1-v1.2/4c6f83fe-7896-4cf3-9434-b5f8d499f5ba.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/marcuscedricridia_pre-cursa-o1-v1.2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "pre-cursa-o1-v1.2", - "id": "marcuscedricridia/pre-cursa-o1-v1.2", - "developer": "marcuscedricridia", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.613 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", 
- "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7549 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5487 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5068 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3129 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4272 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4402 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/marcuscedricridia/pre-cursa-o1-v1.3/619037af-d528-4579-b7e3-58628468d8fb.json b/data/hfopenllm_v2/marcuscedricridia/pre-cursa-o1-v1.3/619037af-d528-4579-b7e3-58628468d8fb.json deleted file mode 100644 index ba339407b..000000000 --- a/data/hfopenllm_v2/marcuscedricridia/pre-cursa-o1-v1.3/619037af-d528-4579-b7e3-58628468d8fb.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/marcuscedricridia_pre-cursa-o1-v1.3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "pre-cursa-o1-v1.3", - "id": "marcuscedricridia/pre-cursa-o1-v1.3", - "developer": "marcuscedricridia", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.613 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7507 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - 
"dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5455 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5076 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3129 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4271 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.442 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/marcuscedricridia/pre-cursa-o1-v1.4/5113b737-8d9f-4321-9a67-91f1aabb40a1.json b/data/hfopenllm_v2/marcuscedricridia/pre-cursa-o1-v1.4/5113b737-8d9f-4321-9a67-91f1aabb40a1.json deleted file mode 100644 index 4cb1c2ce4..000000000 --- a/data/hfopenllm_v2/marcuscedricridia/pre-cursa-o1-v1.4/5113b737-8d9f-4321-9a67-91f1aabb40a1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/marcuscedricridia_pre-cursa-o1-v1.4/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "pre-cursa-o1-v1.4", - "id": "marcuscedricridia/pre-cursa-o1-v1.4", - "developer": "marcuscedricridia", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.613 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7488 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5493 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4834 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3054 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4285 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4436 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/marcuscedricridia/pre-cursa-o1-v1.6/641ac372-2e5a-4b44-b22e-a17600a6a868.json b/data/hfopenllm_v2/marcuscedricridia/pre-cursa-o1-v1.6/641ac372-2e5a-4b44-b22e-a17600a6a868.json deleted file mode 100644 index f3a0dee4d..000000000 --- a/data/hfopenllm_v2/marcuscedricridia/pre-cursa-o1-v1.6/641ac372-2e5a-4b44-b22e-a17600a6a868.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/marcuscedricridia_pre-cursa-o1-v1.6/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "pre-cursa-o1-v1.6", - "id": "marcuscedricridia/pre-cursa-o1-v1.6", - "developer": "marcuscedricridia", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.613 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7528 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5473 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - 
"source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3205 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4234 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4413 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/marcuscedricridia/pre-cursa-o1/7cbb0b08-871d-48fc-bf3e-86267f5ef19d.json b/data/hfopenllm_v2/marcuscedricridia/pre-cursa-o1/7cbb0b08-871d-48fc-bf3e-86267f5ef19d.json deleted file mode 100644 index 2cf7d9f19..000000000 --- a/data/hfopenllm_v2/marcuscedricridia/pre-cursa-o1/7cbb0b08-871d-48fc-bf3e-86267f5ef19d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/marcuscedricridia_pre-cursa-o1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "pre-cursa-o1", - "id": "marcuscedricridia/pre-cursa-o1", - "developer": "marcuscedricridia", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.613 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7409 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5462 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.5038 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3096 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.426 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4424 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/marcuscedricridia/r1o-et/c82e887c-c8ab-4221-aa0b-e8b7a86e7c46.json b/data/hfopenllm_v2/marcuscedricridia/r1o-et/c82e887c-c8ab-4221-aa0b-e8b7a86e7c46.json deleted file mode 100644 index 57d5458d0..000000000 --- a/data/hfopenllm_v2/marcuscedricridia/r1o-et/c82e887c-c8ab-4221-aa0b-e8b7a86e7c46.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/marcuscedricridia_r1o-et/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "r1o-et", - "id": "marcuscedricridia/r1o-et", - "developer": "marcuscedricridia", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.613 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3597 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4209 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0793 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on 
GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2727 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3579 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.258 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/marcuscedricridia/sbr-o1-7b/50c65a83-9d08-4155-ad2c-5a2f8ffc8743.json b/data/hfopenllm_v2/marcuscedricridia/sbr-o1-7b/50c65a83-9d08-4155-ad2c-5a2f8ffc8743.json deleted file mode 100644 index dab48a238..000000000 --- a/data/hfopenllm_v2/marcuscedricridia/sbr-o1-7b/50c65a83-9d08-4155-ad2c-5a2f8ffc8743.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/marcuscedricridia_sbr-o1-7b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "sbr-o1-7b", - "id": "marcuscedricridia/sbr-o1-7b", - "developer": "marcuscedricridia", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.613 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7455 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5479 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4985 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3104 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": 
"hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4404 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4355 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/marcuscedricridia/stray-r1o-et/99d97aef-bb6b-471b-8ed7-f6f92f75842c.json b/data/hfopenllm_v2/marcuscedricridia/stray-r1o-et/99d97aef-bb6b-471b-8ed7-f6f92f75842c.json deleted file mode 100644 index b8ff455f6..000000000 --- a/data/hfopenllm_v2/marcuscedricridia/stray-r1o-et/99d97aef-bb6b-471b-8ed7-f6f92f75842c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/marcuscedricridia_stray-r1o-et/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "stray-r1o-et", - "id": "marcuscedricridia/stray-r1o-et", - "developer": "marcuscedricridia", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.613 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1562 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2967 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0045 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2617 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.4086 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1094 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/matouLeLoup/ECE-PRYMMAL-0.5B-FT-EnhancedMUSREnsembleV3/b98504a0-f1d6-4872-b748-2ca8199c5328.json b/data/hfopenllm_v2/matouLeLoup/ECE-PRYMMAL-0.5B-FT-EnhancedMUSREnsembleV3/b98504a0-f1d6-4872-b748-2ca8199c5328.json deleted file mode 100644 index 524853f69..000000000 --- a/data/hfopenllm_v2/matouLeLoup/ECE-PRYMMAL-0.5B-FT-EnhancedMUSREnsembleV3/b98504a0-f1d6-4872-b748-2ca8199c5328.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/matouLeLoup_ECE-PRYMMAL-0.5B-FT-EnhancedMUSREnsembleV3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ECE-PRYMMAL-0.5B-FT-EnhancedMUSREnsembleV3", - "id": "matouLeLoup/ECE-PRYMMAL-0.5B-FT-EnhancedMUSREnsembleV3", - "developer": "matouLeLoup", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.494 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1873 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3239 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0264 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2609 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3752 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - 
"dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.172 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/matouLeLoup/ECE-PRYMMAL-0.5B-FT-MUSR-ENSEMBLE-V2Mathis/5a159667-7460-4a97-884e-6a96df59873b.json b/data/hfopenllm_v2/matouLeLoup/ECE-PRYMMAL-0.5B-FT-MUSR-ENSEMBLE-V2Mathis/5a159667-7460-4a97-884e-6a96df59873b.json deleted file mode 100644 index 9c8242ed9..000000000 --- a/data/hfopenllm_v2/matouLeLoup/ECE-PRYMMAL-0.5B-FT-MUSR-ENSEMBLE-V2Mathis/5a159667-7460-4a97-884e-6a96df59873b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/matouLeLoup_ECE-PRYMMAL-0.5B-FT-MUSR-ENSEMBLE-V2Mathis/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ECE-PRYMMAL-0.5B-FT-MUSR-ENSEMBLE-V2Mathis", - "id": "matouLeLoup/ECE-PRYMMAL-0.5B-FT-MUSR-ENSEMBLE-V2Mathis", - "developer": "matouLeLoup", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.494 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1873 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3239 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0264 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2609 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3752 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, 
- "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.172 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/matouLeLoup/ECE-PRYMMAL-0.5B-FT-V4-MUSR-ENSEMBLE-Mathis/16a2eceb-073d-4dc3-87a7-a15c641c5ebb.json b/data/hfopenllm_v2/matouLeLoup/ECE-PRYMMAL-0.5B-FT-V4-MUSR-ENSEMBLE-Mathis/16a2eceb-073d-4dc3-87a7-a15c641c5ebb.json deleted file mode 100644 index 4054a3100..000000000 --- a/data/hfopenllm_v2/matouLeLoup/ECE-PRYMMAL-0.5B-FT-V4-MUSR-ENSEMBLE-Mathis/16a2eceb-073d-4dc3-87a7-a15c641c5ebb.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/matouLeLoup_ECE-PRYMMAL-0.5B-FT-V4-MUSR-ENSEMBLE-Mathis/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ECE-PRYMMAL-0.5B-FT-V4-MUSR-ENSEMBLE-Mathis", - "id": "matouLeLoup/ECE-PRYMMAL-0.5B-FT-V4-MUSR-ENSEMBLE-Mathis", - "developer": "matouLeLoup", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.494 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1873 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3239 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0264 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2609 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3752 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.172 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/matouLeLoup/ECE-PRYMMAL-0.5B-FT-V4-MUSR-Mathis/e8e2d04b-21db-43dc-8b8f-7fa3bba87abc.json b/data/hfopenllm_v2/matouLeLoup/ECE-PRYMMAL-0.5B-FT-V4-MUSR-Mathis/e8e2d04b-21db-43dc-8b8f-7fa3bba87abc.json deleted file mode 100644 index a118652bb..000000000 --- a/data/hfopenllm_v2/matouLeLoup/ECE-PRYMMAL-0.5B-FT-V4-MUSR-Mathis/e8e2d04b-21db-43dc-8b8f-7fa3bba87abc.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/matouLeLoup_ECE-PRYMMAL-0.5B-FT-V4-MUSR-Mathis/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ECE-PRYMMAL-0.5B-FT-V4-MUSR-Mathis", - "id": "matouLeLoup/ECE-PRYMMAL-0.5B-FT-V4-MUSR-Mathis", - "developer": "matouLeLoup", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.494 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1882 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3233 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0272 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2634 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3685 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.172 - } - } - 
] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/matouLeLoup/ECE-PRYMMAL-0.5B-FT-V5-MUSR-Mathis/acbb93b3-f8fc-479d-9610-392efd7d4ecc.json b/data/hfopenllm_v2/matouLeLoup/ECE-PRYMMAL-0.5B-FT-V5-MUSR-Mathis/acbb93b3-f8fc-479d-9610-392efd7d4ecc.json deleted file mode 100644 index e00a6160a..000000000 --- a/data/hfopenllm_v2/matouLeLoup/ECE-PRYMMAL-0.5B-FT-V5-MUSR-Mathis/acbb93b3-f8fc-479d-9610-392efd7d4ecc.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/matouLeLoup_ECE-PRYMMAL-0.5B-FT-V5-MUSR-Mathis/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ECE-PRYMMAL-0.5B-FT-V5-MUSR-Mathis", - "id": "matouLeLoup/ECE-PRYMMAL-0.5B-FT-V5-MUSR-Mathis", - "developer": "matouLeLoup", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "?", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1652 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3024 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0189 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2567 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4273 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1116 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/mattshumer/Reflection-Llama-3.1-70B/6d0589bd-1f05-44ee-afa5-3657b960d7c9.json 
b/data/hfopenllm_v2/mattshumer/Reflection-Llama-3.1-70B/6d0589bd-1f05-44ee-afa5-3657b960d7c9.json deleted file mode 100644 index e8d1ce9f3..000000000 --- a/data/hfopenllm_v2/mattshumer/Reflection-Llama-3.1-70B/6d0589bd-1f05-44ee-afa5-3657b960d7c9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/mattshumer_Reflection-Llama-3.1-70B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Reflection-Llama-3.1-70B", - "id": "mattshumer/Reflection-Llama-3.1-70B", - "developer": "mattshumer", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 70.554 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0045 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.645 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2145 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3633 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4577 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4955 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/mattshumer/ref_70_e3/134663d8-05a8-4336-90e2-68e7cba5f1df.json b/data/hfopenllm_v2/mattshumer/ref_70_e3/134663d8-05a8-4336-90e2-68e7cba5f1df.json deleted file mode 100644 index d4167a969..000000000 --- a/data/hfopenllm_v2/mattshumer/ref_70_e3/134663d8-05a8-4336-90e2-68e7cba5f1df.json +++ 
/dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/mattshumer_ref_70_e3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ref_70_e3", - "id": "mattshumer/ref_70_e3", - "developer": "mattshumer", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 70.554 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6294 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6501 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2795 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3356 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4328 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5303 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/maywell/Qwen2-7B-Multilingual-RP/3bfced28-b06e-46ab-a6aa-171b0c424337.json b/data/hfopenllm_v2/maywell/Qwen2-7B-Multilingual-RP/3bfced28-b06e-46ab-a6aa-171b0c424337.json deleted file mode 100644 index 4a43aa1ea..000000000 --- a/data/hfopenllm_v2/maywell/Qwen2-7B-Multilingual-RP/3bfced28-b06e-46ab-a6aa-171b0c424337.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/maywell_Qwen2-7B-Multilingual-RP/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - 
"source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2-7B-Multilingual-RP", - "id": "maywell/Qwen2-7B-Multilingual-RP", - "developer": "maywell", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4347 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5062 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2243 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.297 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3696 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3859 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/meditsolutions/Llama-3.1-MedIT-SUN-8B/b6a83b82-6b05-4437-a076-e2a3982f6169.json b/data/hfopenllm_v2/meditsolutions/Llama-3.1-MedIT-SUN-8B/b6a83b82-6b05-4437-a076-e2a3982f6169.json deleted file mode 100644 index a6245e049..000000000 --- a/data/hfopenllm_v2/meditsolutions/Llama-3.1-MedIT-SUN-8B/b6a83b82-6b05-4437-a076-e2a3982f6169.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/meditsolutions_Llama-3.1-MedIT-SUN-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.1-MedIT-SUN-8B", - "id": 
"meditsolutions/Llama-3.1-MedIT-SUN-8B", - "developer": "meditsolutions", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7837 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5187 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2092 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3087 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4056 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3916 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/meditsolutions/Llama-3.2-SUN-1B-Instruct/f621201b-f571-4487-9f1e-b767675c659d.json b/data/hfopenllm_v2/meditsolutions/Llama-3.2-SUN-1B-Instruct/f621201b-f571-4487-9f1e-b767675c659d.json deleted file mode 100644 index c6b47ea7c..000000000 --- a/data/hfopenllm_v2/meditsolutions/Llama-3.2-SUN-1B-Instruct/f621201b-f571-4487-9f1e-b767675c659d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/meditsolutions_Llama-3.2-SUN-1B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.2-SUN-1B-Instruct", - "id": "meditsolutions/Llama-3.2-SUN-1B-Instruct", - "developer": "meditsolutions", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": 
"LlamaMedITForCausalLM", - "params_billions": 1.498 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6413 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3474 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.071 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2424 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3514 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1781 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/meditsolutions/Llama-3.2-SUN-1B-chat/710fdb79-fba4-42da-8e26-45b4caf75207.json b/data/hfopenllm_v2/meditsolutions/Llama-3.2-SUN-1B-chat/710fdb79-fba4-42da-8e26-45b4caf75207.json deleted file mode 100644 index 0f33814a2..000000000 --- a/data/hfopenllm_v2/meditsolutions/Llama-3.2-SUN-1B-chat/710fdb79-fba4-42da-8e26-45b4caf75207.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/meditsolutions_Llama-3.2-SUN-1B-chat/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.2-SUN-1B-chat", - "id": "meditsolutions/Llama-3.2-SUN-1B-chat", - "developer": "meditsolutions", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.498 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - 
"hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5482 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3514 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0642 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2617 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3249 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1838 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/meditsolutions/Llama-3.2-SUN-2.4B-checkpoint-26000/35fa7a5e-8866-4ce3-9899-8737e908f34f.json b/data/hfopenllm_v2/meditsolutions/Llama-3.2-SUN-2.4B-checkpoint-26000/35fa7a5e-8866-4ce3-9899-8737e908f34f.json deleted file mode 100644 index b58b7f561..000000000 --- a/data/hfopenllm_v2/meditsolutions/Llama-3.2-SUN-2.4B-checkpoint-26000/35fa7a5e-8866-4ce3-9899-8737e908f34f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/meditsolutions_Llama-3.2-SUN-2.4B-checkpoint-26000/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.2-SUN-2.4B-checkpoint-26000", - "id": "meditsolutions/Llama-3.2-SUN-2.4B-checkpoint-26000", - "developer": "meditsolutions", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 2.209 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2814 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3018 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0181 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2777 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4103 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1345 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/meditsolutions/Llama-3.2-SUN-2.4B-checkpoint-34800/2b24b69b-15dc-4666-83f3-c77db545bdbd.json b/data/hfopenllm_v2/meditsolutions/Llama-3.2-SUN-2.4B-checkpoint-34800/2b24b69b-15dc-4666-83f3-c77db545bdbd.json deleted file mode 100644 index 3c289add0..000000000 --- a/data/hfopenllm_v2/meditsolutions/Llama-3.2-SUN-2.4B-checkpoint-34800/2b24b69b-15dc-4666-83f3-c77db545bdbd.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/meditsolutions_Llama-3.2-SUN-2.4B-checkpoint-34800/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.2-SUN-2.4B-checkpoint-34800", - "id": "meditsolutions/Llama-3.2-SUN-2.4B-checkpoint-34800", - "developer": "meditsolutions", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 2.209 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { 
- "score": 0.2501 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3161 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0106 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2861 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4022 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1357 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/meditsolutions/Llama-3.2-SUN-2.4B-v1.0.0/0d00d849-2147-4fc1-9e5f-d42a95be6ca5.json b/data/hfopenllm_v2/meditsolutions/Llama-3.2-SUN-2.4B-v1.0.0/0d00d849-2147-4fc1-9e5f-d42a95be6ca5.json deleted file mode 100644 index 92bc18771..000000000 --- a/data/hfopenllm_v2/meditsolutions/Llama-3.2-SUN-2.4B-v1.0.0/0d00d849-2147-4fc1-9e5f-d42a95be6ca5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/meditsolutions_Llama-3.2-SUN-2.4B-v1.0.0/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.2-SUN-2.4B-v1.0.0", - "id": "meditsolutions/Llama-3.2-SUN-2.4B-v1.0.0", - "developer": "meditsolutions", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 2.472 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5637 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3391 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0627 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2576 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3209 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1543 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/meditsolutions/Llama-3.2-SUN-2.5B-chat/f45135b0-3c26-44b5-9922-a6c0817a172d.json b/data/hfopenllm_v2/meditsolutions/Llama-3.2-SUN-2.5B-chat/f45135b0-3c26-44b5-9922-a6c0817a172d.json deleted file mode 100644 index 03c4a4de9..000000000 --- a/data/hfopenllm_v2/meditsolutions/Llama-3.2-SUN-2.5B-chat/f45135b0-3c26-44b5-9922-a6c0817a172d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/meditsolutions_Llama-3.2-SUN-2.5B-chat/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.2-SUN-2.5B-chat", - "id": "meditsolutions/Llama-3.2-SUN-2.5B-chat", - "developer": "meditsolutions", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 2.472 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5604 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.3575 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.071 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2592 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3155 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1813 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/meditsolutions/Llama-3.2-SUN-HDIC-1B-Instruct/67eb0d6c-9086-4c80-8506-c3e1489f2673.json b/data/hfopenllm_v2/meditsolutions/Llama-3.2-SUN-HDIC-1B-Instruct/67eb0d6c-9086-4c80-8506-c3e1489f2673.json deleted file mode 100644 index 9ccd525a8..000000000 --- a/data/hfopenllm_v2/meditsolutions/Llama-3.2-SUN-HDIC-1B-Instruct/67eb0d6c-9086-4c80-8506-c3e1489f2673.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/meditsolutions_Llama-3.2-SUN-HDIC-1B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.2-SUN-HDIC-1B-Instruct", - "id": "meditsolutions/Llama-3.2-SUN-HDIC-1B-Instruct", - "developer": "meditsolutions", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.498 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6827 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3508 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - 
"hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0619 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2366 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3594 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1687 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/meditsolutions/MSH-Lite-7B-v1-Bielik-v2.3-Instruct-Llama-Prune/79d3dc85-08f6-475c-ac2c-1ff32f5a089f.json b/data/hfopenllm_v2/meditsolutions/MSH-Lite-7B-v1-Bielik-v2.3-Instruct-Llama-Prune/79d3dc85-08f6-475c-ac2c-1ff32f5a089f.json deleted file mode 100644 index 47159802e..000000000 --- a/data/hfopenllm_v2/meditsolutions/MSH-Lite-7B-v1-Bielik-v2.3-Instruct-Llama-Prune/79d3dc85-08f6-475c-ac2c-1ff32f5a089f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/meditsolutions_MSH-Lite-7B-v1-Bielik-v2.3-Instruct-Llama-Prune/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MSH-Lite-7B-v1-Bielik-v2.3-Instruct-Llama-Prune", - "id": "meditsolutions/MSH-Lite-7B-v1-Bielik-v2.3-Instruct-Llama-Prune", - "developer": "meditsolutions", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.646 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3655 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4035 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" 
- }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0264 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3029 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4253 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.219 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/meditsolutions/MSH-v1-Bielik-v2.3-Instruct-MedIT-merge/4e9b3fa2-d3d2-4e4c-a1fa-c812f481f64a.json b/data/hfopenllm_v2/meditsolutions/MSH-v1-Bielik-v2.3-Instruct-MedIT-merge/4e9b3fa2-d3d2-4e4c-a1fa-c812f481f64a.json deleted file mode 100644 index 555723348..000000000 --- a/data/hfopenllm_v2/meditsolutions/MSH-v1-Bielik-v2.3-Instruct-MedIT-merge/4e9b3fa2-d3d2-4e4c-a1fa-c812f481f64a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/meditsolutions_MSH-v1-Bielik-v2.3-Instruct-MedIT-merge/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MSH-v1-Bielik-v2.3-Instruct-MedIT-merge", - "id": "meditsolutions/MSH-v1-Bielik-v2.3-Instruct-MedIT-merge", - "developer": "meditsolutions", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 11.169 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5814 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5672 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2077 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3456 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4385 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.35 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/meditsolutions/MedIT-Mesh-3B-Instruct/6e62a8a0-0bdf-4b6c-93de-593423dadd3a.json b/data/hfopenllm_v2/meditsolutions/MedIT-Mesh-3B-Instruct/6e62a8a0-0bdf-4b6c-93de-593423dadd3a.json deleted file mode 100644 index af35e21a0..000000000 --- a/data/hfopenllm_v2/meditsolutions/MedIT-Mesh-3B-Instruct/6e62a8a0-0bdf-4b6c-93de-593423dadd3a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/meditsolutions_MedIT-Mesh-3B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MedIT-Mesh-3B-Instruct", - "id": "meditsolutions/MedIT-Mesh-3B-Instruct", - "developer": "meditsolutions", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Phi3ForCausalLM", - "params_billions": 3.821 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5814 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5576 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2032 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - 
"dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3238 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4048 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4012 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/meditsolutions/SmolLM2-MedIT-Upscale-2B/871131c1-295d-40a0-a396-09d24b880064.json b/data/hfopenllm_v2/meditsolutions/SmolLM2-MedIT-Upscale-2B/871131c1-295d-40a0-a396-09d24b880064.json deleted file mode 100644 index 870bf4c0c..000000000 --- a/data/hfopenllm_v2/meditsolutions/SmolLM2-MedIT-Upscale-2B/871131c1-295d-40a0-a396-09d24b880064.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/meditsolutions_SmolLM2-MedIT-Upscale-2B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SmolLM2-MedIT-Upscale-2B", - "id": "meditsolutions/SmolLM2-MedIT-Upscale-2B", - "developer": "meditsolutions", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 2.114 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6429 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3551 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0559 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, 
- "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2643 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3314 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1971 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/meetkai/functionary-small-v3.1/44eefbb2-22d4-4dff-889d-a87fc40b2eea.json b/data/hfopenllm_v2/meetkai/functionary-small-v3.1/44eefbb2-22d4-4dff-889d-a87fc40b2eea.json deleted file mode 100644 index f08479499..000000000 --- a/data/hfopenllm_v2/meetkai/functionary-small-v3.1/44eefbb2-22d4-4dff-889d-a87fc40b2eea.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/meetkai_functionary-small-v3.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "functionary-small-v3.1", - "id": "meetkai/functionary-small-v3.1", - "developer": "meetkai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6275 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4982 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1571 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2886 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - 
"hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3834 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3349 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/meraGPT/mera-mix-4x7B/cd1de470-a174-4c08-9efe-a06d493dc4b2.json b/data/hfopenllm_v2/meraGPT/mera-mix-4x7B/cd1de470-a174-4c08-9efe-a06d493dc4b2.json deleted file mode 100644 index b69c8564f..000000000 --- a/data/hfopenllm_v2/meraGPT/mera-mix-4x7B/cd1de470-a174-4c08-9efe-a06d493dc4b2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/meraGPT_mera-mix-4x7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "mera-mix-4x7B", - "id": "meraGPT/mera-mix-4x7B", - "developer": "meraGPT", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MixtralForCausalLM", - "params_billions": 24.154 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4832 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4019 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0536 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3045 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4057 - } - }, - { - "evaluation_name": "MMLU-PRO", - 
"source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2748 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/mergekit-community/JAJUKA-WEWILLNEVERFORGETYOU-3B/fdb55a14-0697-4775-8358-fed202498b4f.json b/data/hfopenllm_v2/mergekit-community/JAJUKA-WEWILLNEVERFORGETYOU-3B/fdb55a14-0697-4775-8358-fed202498b4f.json deleted file mode 100644 index a962d2f5e..000000000 --- a/data/hfopenllm_v2/mergekit-community/JAJUKA-WEWILLNEVERFORGETYOU-3B/fdb55a14-0697-4775-8358-fed202498b4f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/mergekit-community_JAJUKA-WEWILLNEVERFORGETYOU-3B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "JAJUKA-WEWILLNEVERFORGETYOU-3B", - "id": "mergekit-community/JAJUKA-WEWILLNEVERFORGETYOU-3B", - "developer": "mergekit-community", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4941 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.437 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1246 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2928 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3656 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3033 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/mergekit-community/SuperQwen-2.5-1.5B/c069a224-638a-4cad-a9ad-e4f8579e8c15.json b/data/hfopenllm_v2/mergekit-community/SuperQwen-2.5-1.5B/c069a224-638a-4cad-a9ad-e4f8579e8c15.json deleted file mode 100644 index 1f4baa35d..000000000 --- a/data/hfopenllm_v2/mergekit-community/SuperQwen-2.5-1.5B/c069a224-638a-4cad-a9ad-e4f8579e8c15.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/mergekit-community_SuperQwen-2.5-1.5B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SuperQwen-2.5-1.5B", - "id": "mergekit-community/SuperQwen-2.5-1.5B", - "developer": "mergekit-community", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.777 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1336 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2907 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0196 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2542 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3355 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.1075 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/mergekit-community/VirtuosoSmall-InstructModelStock/10e5c103-f25f-45bb-bfe6-a22876cffe87.json b/data/hfopenllm_v2/mergekit-community/VirtuosoSmall-InstructModelStock/10e5c103-f25f-45bb-bfe6-a22876cffe87.json deleted file mode 100644 index 7af838cdf..000000000 --- a/data/hfopenllm_v2/mergekit-community/VirtuosoSmall-InstructModelStock/10e5c103-f25f-45bb-bfe6-a22876cffe87.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/mergekit-community_VirtuosoSmall-InstructModelStock/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "VirtuosoSmall-InstructModelStock", - "id": "mergekit-community/VirtuosoSmall-InstructModelStock", - "developer": "mergekit-community", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5238 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6518 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4094 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3826 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4756 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5421 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/mergekit-community/diabolic6045_ELN-AOC-CAIN/a9ecca9a-c5d4-45b2-a403-e74a98a46322.json b/data/hfopenllm_v2/mergekit-community/diabolic6045_ELN-AOC-CAIN/a9ecca9a-c5d4-45b2-a403-e74a98a46322.json deleted file mode 100644 index 2b83b90c9..000000000 --- a/data/hfopenllm_v2/mergekit-community/diabolic6045_ELN-AOC-CAIN/a9ecca9a-c5d4-45b2-a403-e74a98a46322.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/mergekit-community_diabolic6045_ELN-AOC-CAIN/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "diabolic6045_ELN-AOC-CAIN", - "id": "mergekit-community/diabolic6045_ELN-AOC-CAIN", - "developer": "mergekit-community", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.236 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0862 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3126 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0121 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2634 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3658 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1191 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/mergekit-community/mergekit-dare_ties-ajgjgea/630d8a60-03b7-4550-82f4-e879b2e01c6c.json 
b/data/hfopenllm_v2/mergekit-community/mergekit-dare_ties-ajgjgea/630d8a60-03b7-4550-82f4-e879b2e01c6c.json deleted file mode 100644 index afddca479..000000000 --- a/data/hfopenllm_v2/mergekit-community/mergekit-dare_ties-ajgjgea/630d8a60-03b7-4550-82f4-e879b2e01c6c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/mergekit-community_mergekit-dare_ties-ajgjgea/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "mergekit-dare_ties-ajgjgea", - "id": "mergekit-community/mergekit-dare_ties-ajgjgea", - "developer": "mergekit-community", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.498 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5263 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3495 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0642 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2643 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3289 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1744 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/mergekit-community/mergekit-della-zgowfmf/206b5a96-ae07-41fd-822f-436d49c57dcb.json b/data/hfopenllm_v2/mergekit-community/mergekit-della-zgowfmf/206b5a96-ae07-41fd-822f-436d49c57dcb.json deleted file mode 100644 index 160d88a78..000000000 
--- a/data/hfopenllm_v2/mergekit-community/mergekit-della-zgowfmf/206b5a96-ae07-41fd-822f-436d49c57dcb.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/mergekit-community_mergekit-della-zgowfmf/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "mergekit-della-zgowfmf", - "id": "mergekit-community/mergekit-della-zgowfmf", - "developer": "mergekit-community", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4828 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6591 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3618 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3901 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4834 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5415 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/mergekit-community/mergekit-model_stock-azgztvm/702d2120-5301-4e03-bb0f-1f8ab19e522a.json b/data/hfopenllm_v2/mergekit-community/mergekit-model_stock-azgztvm/702d2120-5301-4e03-bb0f-1f8ab19e522a.json deleted file mode 100644 index 46f675fa0..000000000 --- a/data/hfopenllm_v2/mergekit-community/mergekit-model_stock-azgztvm/702d2120-5301-4e03-bb0f-1f8ab19e522a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - 
"schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/mergekit-community_mergekit-model_stock-azgztvm/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "mergekit-model_stock-azgztvm", - "id": "mergekit-community/mergekit-model_stock-azgztvm", - "developer": "mergekit-community", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5062 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6543 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4373 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3817 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.473 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5406 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/mergekit-community/mergekit-slerp-fmrazcr/61e39700-c237-49fc-baef-3fa573b3b0c6.json b/data/hfopenllm_v2/mergekit-community/mergekit-slerp-fmrazcr/61e39700-c237-49fc-baef-3fa573b3b0c6.json deleted file mode 100644 index 0b718a9e2..000000000 --- a/data/hfopenllm_v2/mergekit-community/mergekit-slerp-fmrazcr/61e39700-c237-49fc-baef-3fa573b3b0c6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/mergekit-community_mergekit-slerp-fmrazcr/1770682486.623709", - "retrieved_timestamp": 
"1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "mergekit-slerp-fmrazcr", - "id": "mergekit-community/mergekit-slerp-fmrazcr", - "developer": "mergekit-community", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4174 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5342 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1193 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3112 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4105 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3777 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/mergekit-community/mergekit-ties-rraxdhv/8892ab84-750d-494f-9f87-ad28e73cf364.json b/data/hfopenllm_v2/mergekit-community/mergekit-ties-rraxdhv/8892ab84-750d-494f-9f87-ad28e73cf364.json deleted file mode 100644 index a58721bfd..000000000 --- a/data/hfopenllm_v2/mergekit-community/mergekit-ties-rraxdhv/8892ab84-750d-494f-9f87-ad28e73cf364.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/mergekit-community_mergekit-ties-rraxdhv/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - 
"evaluator_relationship": "third_party" - }, - "model_info": { - "name": "mergekit-ties-rraxdhv", - "id": "mergekit-community/mergekit-ties-rraxdhv", - "developer": "mergekit-community", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 9.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1123 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5184 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.04 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3079 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4202 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.391 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/mergekit-community/mergekit-ties-ykqemwr/538a2eb7-34e4-4e78-a382-60a13710096e.json b/data/hfopenllm_v2/mergekit-community/mergekit-ties-ykqemwr/538a2eb7-34e4-4e78-a382-60a13710096e.json deleted file mode 100644 index 929e2d82b..000000000 --- a/data/hfopenllm_v2/mergekit-community/mergekit-ties-ykqemwr/538a2eb7-34e4-4e78-a382-60a13710096e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/mergekit-community_mergekit-ties-ykqemwr/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "mergekit-ties-ykqemwr", - "id": "mergekit-community/mergekit-ties-ykqemwr", - "developer": 
"mergekit-community", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.36 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5455 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1224 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3221 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4198 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3734 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/mergekit-community/sexeh_time_testing/a041629e-8ed8-4a6c-95ee-98e759501e19.json b/data/hfopenllm_v2/mergekit-community/sexeh_time_testing/a041629e-8ed8-4a6c-95ee-98e759501e19.json deleted file mode 100644 index 959317552..000000000 --- a/data/hfopenllm_v2/mergekit-community/sexeh_time_testing/a041629e-8ed8-4a6c-95ee-98e759501e19.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/mergekit-community_sexeh_time_testing/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "sexeh_time_testing", - "id": "mergekit-community/sexeh_time_testing", - "developer": "mergekit-community", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - 
"evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7329 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5241 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0899 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2911 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3619 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3667 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/meta-llama/Llama-2-13b-chat-hf/09f05984-5815-4b3d-bc73-83ea1e5ecc27.json b/data/hfopenllm_v2/meta-llama/Llama-2-13b-chat-hf/09f05984-5815-4b3d-bc73-83ea1e5ecc27.json deleted file mode 100644 index 29fbff778..000000000 --- a/data/hfopenllm_v2/meta-llama/Llama-2-13b-chat-hf/09f05984-5815-4b3d-bc73-83ea1e5ecc27.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/meta-llama_Llama-2-13b-chat-hf/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-2-13b-chat-hf", - "id": "meta-llama/Llama-2-13b-chat-hf", - "developer": "meta-llama", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 13.016 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", 
- "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3985 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3343 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0136 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2315 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4007 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1923 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/meta-llama/Llama-2-13b-hf/6535524e-f8cf-4f2f-9d89-9ba70aedac91.json b/data/hfopenllm_v2/meta-llama/Llama-2-13b-hf/6535524e-f8cf-4f2f-9d89-9ba70aedac91.json deleted file mode 100644 index 50a1407eb..000000000 --- a/data/hfopenllm_v2/meta-llama/Llama-2-13b-hf/6535524e-f8cf-4f2f-9d89-9ba70aedac91.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/meta-llama_Llama-2-13b-hf/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-2-13b-hf", - "id": "meta-llama/Llama-2-13b-hf", - "developer": "meta-llama", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 13.016 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2482 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": 
"SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4126 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0151 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.281 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3538 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2378 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/meta-llama/Llama-2-70b-chat-hf/08ea4f9d-0e3c-4a8b-85e6-075290d30ba4.json b/data/hfopenllm_v2/meta-llama/Llama-2-70b-chat-hf/08ea4f9d-0e3c-4a8b-85e6-075290d30ba4.json deleted file mode 100644 index 741023f42..000000000 --- a/data/hfopenllm_v2/meta-llama/Llama-2-70b-chat-hf/08ea4f9d-0e3c-4a8b-85e6-075290d30ba4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/meta-llama_Llama-2-70b-chat-hf/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-2-70b-chat-hf", - "id": "meta-llama/Llama-2-70b-chat-hf", - "developer": "meta-llama", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 68.977 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4958 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3042 - } - }, - 
{ - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0295 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2643 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3687 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2433 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/meta-llama/Llama-2-70b-hf/631f0a1f-a6f5-46f6-9aa0-31ac9764c086.json b/data/hfopenllm_v2/meta-llama/Llama-2-70b-hf/631f0a1f-a6f5-46f6-9aa0-31ac9764c086.json deleted file mode 100644 index d8c229621..000000000 --- a/data/hfopenllm_v2/meta-llama/Llama-2-70b-hf/631f0a1f-a6f5-46f6-9aa0-31ac9764c086.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/meta-llama_Llama-2-70b-hf/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-2-70b-hf", - "id": "meta-llama/Llama-2-70b-hf", - "developer": "meta-llama", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 68.977 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2407 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5473 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0325 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3029 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4124 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3718 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/meta-llama/Llama-2-7b-chat-hf/b771f6db-7516-4423-9010-3467db0e26e3.json b/data/hfopenllm_v2/meta-llama/Llama-2-7b-chat-hf/b771f6db-7516-4423-9010-3467db0e26e3.json deleted file mode 100644 index 4c366b7ee..000000000 --- a/data/hfopenllm_v2/meta-llama/Llama-2-7b-chat-hf/b771f6db-7516-4423-9010-3467db0e26e3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/meta-llama_Llama-2-7b-chat-hf/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-2-7b-chat-hf", - "id": "meta-llama/Llama-2-7b-chat-hf", - "developer": "meta-llama", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 6.738 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3986 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3114 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0196 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": 
"hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2534 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3676 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1688 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/meta-llama/Llama-2-7b-hf/cf580dfb-2924-4c4b-9352-394275b959bd.json b/data/hfopenllm_v2/meta-llama/Llama-2-7b-hf/cf580dfb-2924-4c4b-9352-394275b959bd.json deleted file mode 100644 index 009ce4f17..000000000 --- a/data/hfopenllm_v2/meta-llama/Llama-2-7b-hf/cf580dfb-2924-4c4b-9352-394275b959bd.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/meta-llama_Llama-2-7b-hf/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-2-7b-hf", - "id": "meta-llama/Llama-2-7b-hf", - "developer": "meta-llama", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 6.738 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2519 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3496 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0174 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2668 - } - }, - { - 
"evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3701 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1861 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/meta-llama/Llama-3.1-70B-Instruct/ba549fe6-7718-4abf-a610-7e0f48611483.json b/data/hfopenllm_v2/meta-llama/Llama-3.1-70B-Instruct/ba549fe6-7718-4abf-a610-7e0f48611483.json deleted file mode 100644 index 772844f47..000000000 --- a/data/hfopenllm_v2/meta-llama/Llama-3.1-70B-Instruct/ba549fe6-7718-4abf-a610-7e0f48611483.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/meta-llama_Llama-3.1-70B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.1-70B-Instruct", - "id": "meta-llama/Llama-3.1-70B-Instruct", - "developer": "meta-llama", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 70.554 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8669 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6917 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3807 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3565 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4581 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5309 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/meta-llama/Llama-3.1-70B/b92440b1-78a9-4288-a432-f057f2b04a2f.json b/data/hfopenllm_v2/meta-llama/Llama-3.1-70B/b92440b1-78a9-4288-a432-f057f2b04a2f.json deleted file mode 100644 index a94baa730..000000000 --- a/data/hfopenllm_v2/meta-llama/Llama-3.1-70B/b92440b1-78a9-4288-a432-f057f2b04a2f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/meta-llama_Llama-3.1-70B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.1-70B", - "id": "meta-llama/Llama-3.1-70B", - "developer": "meta-llama", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 70.554 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1684 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.626 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1843 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3876 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4572 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": 
"TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4654 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/meta-llama/Llama-3.1-8B-Instruct/838f3932-edf2-4f72-9238-981d1aadc771.json b/data/hfopenllm_v2/meta-llama/Llama-3.1-8B-Instruct/838f3932-edf2-4f72-9238-981d1aadc771.json deleted file mode 100644 index f3e5eb24a..000000000 --- a/data/hfopenllm_v2/meta-llama/Llama-3.1-8B-Instruct/838f3932-edf2-4f72-9238-981d1aadc771.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/meta-llama_Llama-3.1-8B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.1-8B-Instruct", - "id": "meta-llama/Llama-3.1-8B-Instruct", - "developer": "meta-llama", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4922 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5087 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1556 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3154 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3972 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.3798 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/meta-llama/Llama-3.1-8B/61e933b2-5cd1-4f08-8a9e-5b06ef54b6d5.json b/data/hfopenllm_v2/meta-llama/Llama-3.1-8B/61e933b2-5cd1-4f08-8a9e-5b06ef54b6d5.json deleted file mode 100644 index b2fd3607e..000000000 --- a/data/hfopenllm_v2/meta-llama/Llama-3.1-8B/61e933b2-5cd1-4f08-8a9e-5b06ef54b6d5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/meta-llama_Llama-3.1-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.1-8B", - "id": "meta-llama/Llama-3.1-8B", - "developer": "meta-llama", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1246 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.466 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0657 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3104 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3812 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3288 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/meta-llama/Llama-3.2-1B-Instruct/0b307c78-94c7-418f-bc47-5106b81c30de.json b/data/hfopenllm_v2/meta-llama/Llama-3.2-1B-Instruct/0b307c78-94c7-418f-bc47-5106b81c30de.json deleted file mode 100644 
index f3ef3bdee..000000000 --- a/data/hfopenllm_v2/meta-llama/Llama-3.2-1B-Instruct/0b307c78-94c7-418f-bc47-5106b81c30de.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/meta-llama_Llama-3.2-1B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.2-1B-Instruct", - "id": "meta-llama/Llama-3.2-1B-Instruct", - "developer": "meta-llama", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.24 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5698 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3497 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0702 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2752 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3329 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1682 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/meta-llama/Llama-3.2-1B/18783694-3e7b-4d06-9378-5a3fa4a7a0a2.json b/data/hfopenllm_v2/meta-llama/Llama-3.2-1B/18783694-3e7b-4d06-9378-5a3fa4a7a0a2.json deleted file mode 100644 index c0d6f09c5..000000000 --- a/data/hfopenllm_v2/meta-llama/Llama-3.2-1B/18783694-3e7b-4d06-9378-5a3fa4a7a0a2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/meta-llama_Llama-3.2-1B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.2-1B", - "id": "meta-llama/Llama-3.2-1B", - "developer": "meta-llama", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.24 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1478 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3115 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0121 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2282 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3447 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1203 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/meta-llama/Llama-3.2-3B-Instruct/dab922e5-1b46-4a90-b75c-1b26cd6cc6d3.json b/data/hfopenllm_v2/meta-llama/Llama-3.2-3B-Instruct/dab922e5-1b46-4a90-b75c-1b26cd6cc6d3.json deleted file mode 100644 index cad18f268..000000000 --- a/data/hfopenllm_v2/meta-llama/Llama-3.2-3B-Instruct/dab922e5-1b46-4a90-b75c-1b26cd6cc6d3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/meta-llama_Llama-3.2-3B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - 
"evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.2-3B-Instruct", - "id": "meta-llama/Llama-3.2-3B-Instruct", - "developer": "meta-llama", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7393 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.461 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1767 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2785 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3529 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3195 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/meta-llama/Llama-3.2-3B/8cfa1f00-3b26-4d75-9b0a-0dea65e2e352.json b/data/hfopenllm_v2/meta-llama/Llama-3.2-3B/8cfa1f00-3b26-4d75-9b0a-0dea65e2e352.json deleted file mode 100644 index b00c2e441..000000000 --- a/data/hfopenllm_v2/meta-llama/Llama-3.2-3B/8cfa1f00-3b26-4d75-9b0a-0dea65e2e352.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/meta-llama_Llama-3.2-3B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.2-3B", - "id": "meta-llama/Llama-3.2-3B", - "developer": "meta-llama", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": 
"LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1337 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3905 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0189 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2676 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3577 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2488 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/meta-llama/Llama-3.3-70B-Instruct/f74d26e6-9dfb-4e81-8522-8309b27760cf.json b/data/hfopenllm_v2/meta-llama/Llama-3.3-70B-Instruct/f74d26e6-9dfb-4e81-8522-8309b27760cf.json deleted file mode 100644 index fd07a860b..000000000 --- a/data/hfopenllm_v2/meta-llama/Llama-3.3-70B-Instruct/f74d26e6-9dfb-4e81-8522-8309b27760cf.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/meta-llama_Llama-3.3-70B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.3-70B-Instruct", - "id": "meta-llama/Llama-3.3-70B-Instruct", - "developer": "meta-llama", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 70.554 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": 
"google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8998 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6919 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4834 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3289 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4461 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5332 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/meta-llama/Meta-Llama-3-70B-Instruct/2022bcf3-a057-4b0a-aa33-6cf074ffc714.json b/data/hfopenllm_v2/meta-llama/Meta-Llama-3-70B-Instruct/2022bcf3-a057-4b0a-aa33-6cf074ffc714.json deleted file mode 100644 index dc54a58af..000000000 --- a/data/hfopenllm_v2/meta-llama/Meta-Llama-3-70B-Instruct/2022bcf3-a057-4b0a-aa33-6cf074ffc714.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/meta-llama_Meta-Llama-3-70B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Meta-Llama-3-70B-Instruct", - "id": "meta-llama/Meta-Llama-3-70B-Instruct", - "developer": "meta-llama", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 70.554 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.8099 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6547 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2447 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2869 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4154 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5207 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/meta-llama/Meta-Llama-3-70B/a6e79d12-42f6-47ad-95fa-ba03fa4d3a06.json b/data/hfopenllm_v2/meta-llama/Meta-Llama-3-70B/a6e79d12-42f6-47ad-95fa-ba03fa4d3a06.json deleted file mode 100644 index faa64a96e..000000000 --- a/data/hfopenllm_v2/meta-llama/Meta-Llama-3-70B/a6e79d12-42f6-47ad-95fa-ba03fa4d3a06.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/meta-llama_Meta-Llama-3-70B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Meta-Llama-3-70B", - "id": "meta-llama/Meta-Llama-3-70B", - "developer": "meta-llama", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 70.554 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1603 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6461 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1858 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3977 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4518 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4709 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/meta-llama/Meta-Llama-3-8B-Instruct/24d850fe-1817-4041-8767-085f4bd2bac3.json b/data/hfopenllm_v2/meta-llama/Meta-Llama-3-8B-Instruct/24d850fe-1817-4041-8767-085f4bd2bac3.json deleted file mode 100644 index ac1995522..000000000 --- a/data/hfopenllm_v2/meta-llama/Meta-Llama-3-8B-Instruct/24d850fe-1817-4041-8767-085f4bd2bac3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/meta-llama_Meta-Llama-3-8B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Meta-Llama-3-8B-Instruct", - "id": "meta-llama/Meta-Llama-3-8B-Instruct", - "developer": "meta-llama", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7408 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4989 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - 
"dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0869 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2592 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3568 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3664 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/meta-llama/Meta-Llama-3-8B-Instruct/610a3be1-1032-4079-ba37-d6c2c5f9fd55.json b/data/hfopenllm_v2/meta-llama/Meta-Llama-3-8B-Instruct/610a3be1-1032-4079-ba37-d6c2c5f9fd55.json deleted file mode 100644 index 7d274c62c..000000000 --- a/data/hfopenllm_v2/meta-llama/Meta-Llama-3-8B-Instruct/610a3be1-1032-4079-ba37-d6c2c5f9fd55.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/meta-llama_Meta-Llama-3-8B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Meta-Llama-3-8B-Instruct", - "id": "meta-llama/Meta-Llama-3-8B-Instruct", - "developer": "meta-llama", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4782 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.491 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0914 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2928 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3805 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3591 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/meta-llama/Meta-Llama-3-8B/857bb10e-1b43-4714-a758-0cef5816ba02.json b/data/hfopenllm_v2/meta-llama/Meta-Llama-3-8B/857bb10e-1b43-4714-a758-0cef5816ba02.json deleted file mode 100644 index e2bd15b0b..000000000 --- a/data/hfopenllm_v2/meta-llama/Meta-Llama-3-8B/857bb10e-1b43-4714-a758-0cef5816ba02.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/meta-llama_Meta-Llama-3-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Meta-Llama-3-8B", - "id": "meta-llama/Meta-Llama-3-8B", - "developer": "meta-llama", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1455 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4598 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0453 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - 
"hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3054 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3614 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.321 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/mhl1/Qwen2.5-0.5B-cinstruct-stage1/cdabdd54-6101-471c-9bd8-446953be986b.json b/data/hfopenllm_v2/mhl1/Qwen2.5-0.5B-cinstruct-stage1/cdabdd54-6101-471c-9bd8-446953be986b.json deleted file mode 100644 index a4956c1b7..000000000 --- a/data/hfopenllm_v2/mhl1/Qwen2.5-0.5B-cinstruct-stage1/cdabdd54-6101-471c-9bd8-446953be986b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/mhl1_Qwen2.5-0.5B-cinstruct-stage1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-0.5B-cinstruct-stage1", - "id": "mhl1/Qwen2.5-0.5B-cinstruct-stage1", - "developer": "mhl1", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1482 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3256 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0128 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.2651 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.35 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1139 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/microsoft/DialoGPT-medium/8029cb75-8d3b-411d-b0eb-74539b8ecb2f.json b/data/hfopenllm_v2/microsoft/DialoGPT-medium/8029cb75-8d3b-411d-b0eb-74539b8ecb2f.json deleted file mode 100644 index 93673afea..000000000 --- a/data/hfopenllm_v2/microsoft/DialoGPT-medium/8029cb75-8d3b-411d-b0eb-74539b8ecb2f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/microsoft_DialoGPT-medium/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "DialoGPT-medium", - "id": "microsoft/DialoGPT-medium", - "developer": "microsoft", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "GPT2LMHeadModel", - "params_billions": 0.345 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1479 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3014 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2542 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4287 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1119 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/microsoft/Orca-2-13b/65d10996-2c5b-4e11-9a07-319c2446a237.json b/data/hfopenllm_v2/microsoft/Orca-2-13b/65d10996-2c5b-4e11-9a07-319c2446a237.json deleted file mode 100644 index 346756a08..000000000 --- a/data/hfopenllm_v2/microsoft/Orca-2-13b/65d10996-2c5b-4e11-9a07-319c2446a237.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/microsoft_Orca-2-13b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Orca-2-13b", - "id": "microsoft/Orca-2-13b", - "developer": "microsoft", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 13.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3128 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4884 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0317 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2802 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.513 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, 
- "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2749 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/microsoft/Orca-2-7b/ef21d739-b122-4ab8-a8ff-a7cfecad5c8e.json b/data/hfopenllm_v2/microsoft/Orca-2-7b/ef21d739-b122-4ab8-a8ff-a7cfecad5c8e.json deleted file mode 100644 index b1da27202..000000000 --- a/data/hfopenllm_v2/microsoft/Orca-2-7b/ef21d739-b122-4ab8-a8ff-a7cfecad5c8e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/microsoft_Orca-2-7b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Orca-2-7b", - "id": "microsoft/Orca-2-7b", - "developer": "microsoft", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 7.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2183 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4452 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0196 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2609 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5026 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2319 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/microsoft/Phi-3-medium-128k-instruct/45f3b963-497b-4d89-ac66-9ff0ba8dadf8.json b/data/hfopenllm_v2/microsoft/Phi-3-medium-128k-instruct/45f3b963-497b-4d89-ac66-9ff0ba8dadf8.json deleted file mode 100644 index a60530782..000000000 --- a/data/hfopenllm_v2/microsoft/Phi-3-medium-128k-instruct/45f3b963-497b-4d89-ac66-9ff0ba8dadf8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/microsoft_Phi-3-medium-128k-instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Phi-3-medium-128k-instruct", - "id": "microsoft/Phi-3-medium-128k-instruct", - "developer": "microsoft", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Phi3ForCausalLM", - "params_billions": 13.96 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.604 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6382 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1918 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3364 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4129 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4712 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/microsoft/Phi-3-medium-4k-instruct/4173435b-d907-4ac5-a8bd-dfa2759f3fb6.json b/data/hfopenllm_v2/microsoft/Phi-3-medium-4k-instruct/4173435b-d907-4ac5-a8bd-dfa2759f3fb6.json deleted file 
mode 100644 index 5255ec1d7..000000000 --- a/data/hfopenllm_v2/microsoft/Phi-3-medium-4k-instruct/4173435b-d907-4ac5-a8bd-dfa2759f3fb6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/microsoft_Phi-3-medium-4k-instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Phi-3-medium-4k-instruct", - "id": "microsoft/Phi-3-medium-4k-instruct", - "developer": "microsoft", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Phi3ForCausalLM", - "params_billions": 13.96 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6423 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6412 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1956 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3364 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4258 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4676 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/microsoft/Phi-3-mini-128k-instruct/b4a79f30-3a04-4f78-861e-1571316a0642.json b/data/hfopenllm_v2/microsoft/Phi-3-mini-128k-instruct/b4a79f30-3a04-4f78-861e-1571316a0642.json deleted file mode 100644 index 9e63038b5..000000000 --- a/data/hfopenllm_v2/microsoft/Phi-3-mini-128k-instruct/b4a79f30-3a04-4f78-861e-1571316a0642.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - 
"evaluation_id": "hfopenllm_v2/microsoft_Phi-3-mini-128k-instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Phi-3-mini-128k-instruct", - "id": "microsoft/Phi-3-mini-128k-instruct", - "developer": "microsoft", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Phi3ForCausalLM", - "params_billions": 3.821 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5976 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5575 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1405 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.318 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3937 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3734 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/microsoft/Phi-3-mini-4k-instruct/53426038-df38-45ba-b621-34231c9cad7f.json b/data/hfopenllm_v2/microsoft/Phi-3-mini-4k-instruct/53426038-df38-45ba-b621-34231c9cad7f.json deleted file mode 100644 index bbd15d9d0..000000000 --- a/data/hfopenllm_v2/microsoft/Phi-3-mini-4k-instruct/53426038-df38-45ba-b621-34231c9cad7f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/microsoft_Phi-3-mini-4k-instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - 
"source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Phi-3-mini-4k-instruct", - "id": "microsoft/Phi-3-mini-4k-instruct", - "developer": "microsoft", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Phi3ForCausalLM", - "params_billions": 3.821 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5477 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5491 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1639 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3322 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4284 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4022 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/microsoft/Phi-3-mini-4k-instruct/fa758fe5-21ec-45cc-941f-5cb5ca0612b1.json b/data/hfopenllm_v2/microsoft/Phi-3-mini-4k-instruct/fa758fe5-21ec-45cc-941f-5cb5ca0612b1.json deleted file mode 100644 index 14dce77a3..000000000 --- a/data/hfopenllm_v2/microsoft/Phi-3-mini-4k-instruct/fa758fe5-21ec-45cc-941f-5cb5ca0612b1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/microsoft_Phi-3-mini-4k-instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Phi-3-mini-4k-instruct", - "id": "microsoft/Phi-3-mini-4k-instruct", - "developer": "microsoft", - 
"inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Phi3ForCausalLM", - "params_billions": 3.821 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5613 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5676 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1163 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3196 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.395 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3866 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/microsoft/Phi-3-small-128k-instruct/d2a92a62-3bd0-4cb2-897b-742ea0d5203f.json b/data/hfopenllm_v2/microsoft/Phi-3-small-128k-instruct/d2a92a62-3bd0-4cb2-897b-742ea0d5203f.json deleted file mode 100644 index 4fa6eea19..000000000 --- a/data/hfopenllm_v2/microsoft/Phi-3-small-128k-instruct/d2a92a62-3bd0-4cb2-897b-742ea0d5203f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/microsoft_Phi-3-small-128k-instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Phi-3-small-128k-instruct", - "id": "microsoft/Phi-3-small-128k-instruct", - "developer": "microsoft", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Phi3SmallForCausalLM", - "params_billions": 7.392 - } - }, - "evaluation_results": [ - { - 
"evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6368 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6202 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2026 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3171 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4378 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4491 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/microsoft/Phi-3-small-8k-instruct/8b752519-63d4-4638-b56e-1c45c7f4694e.json b/data/hfopenllm_v2/microsoft/Phi-3-small-8k-instruct/8b752519-63d4-4638-b56e-1c45c7f4694e.json deleted file mode 100644 index ac0af8542..000000000 --- a/data/hfopenllm_v2/microsoft/Phi-3-small-8k-instruct/8b752519-63d4-4638-b56e-1c45c7f4694e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/microsoft_Phi-3-small-8k-instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Phi-3-small-8k-instruct", - "id": "microsoft/Phi-3-small-8k-instruct", - "developer": "microsoft", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Phi3SmallForCausalLM", - "params_billions": 7.392 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6497 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6208 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1887 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3121 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4558 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4506 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/microsoft/Phi-3.5-MoE-instruct/8da71b7c-7b73-453f-998b-84e70b54e471.json b/data/hfopenllm_v2/microsoft/Phi-3.5-MoE-instruct/8da71b7c-7b73-453f-998b-84e70b54e471.json deleted file mode 100644 index abd80b617..000000000 --- a/data/hfopenllm_v2/microsoft/Phi-3.5-MoE-instruct/8da71b7c-7b73-453f-998b-84e70b54e471.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/microsoft_Phi-3.5-MoE-instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Phi-3.5-MoE-instruct", - "id": "microsoft/Phi-3.5-MoE-instruct", - "developer": "microsoft", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Phi3ForCausalLM", - "params_billions": 42.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6925 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": 
"hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6408 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3119 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3557 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4565 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4658 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/microsoft/Phi-3.5-mini-instruct/2b7b1216-3ea7-48f1-89f6-e5d84fef2b32.json b/data/hfopenllm_v2/microsoft/Phi-3.5-mini-instruct/2b7b1216-3ea7-48f1-89f6-e5d84fef2b32.json deleted file mode 100644 index 5a22577ff..000000000 --- a/data/hfopenllm_v2/microsoft/Phi-3.5-mini-instruct/2b7b1216-3ea7-48f1-89f6-e5d84fef2b32.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/microsoft_Phi-3.5-mini-instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Phi-3.5-mini-instruct", - "id": "microsoft/Phi-3.5-mini-instruct", - "developer": "microsoft", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Phi3ForCausalLM", - "params_billions": 3.821 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5775 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.5518 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1964 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3398 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4021 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3962 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/microsoft/Phi-4-mini-instruct/37e19712-3197-42da-a8f2-ae1f36c2b06c.json b/data/hfopenllm_v2/microsoft/Phi-4-mini-instruct/37e19712-3197-42da-a8f2-ae1f36c2b06c.json deleted file mode 100644 index 7a263b228..000000000 --- a/data/hfopenllm_v2/microsoft/Phi-4-mini-instruct/37e19712-3197-42da-a8f2-ae1f36c2b06c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/microsoft_Phi-4-mini-instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Phi-4-mini-instruct", - "id": "microsoft/Phi-4-mini-instruct", - "developer": "microsoft", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Phi3ForCausalLM", - "params_billions": 3.836 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7378 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5689 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - 
"evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1699 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3096 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3873 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3932 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/microsoft/phi-1/c6ae6691-64ec-443d-8d76-af614c8cc7f9.json b/data/hfopenllm_v2/microsoft/phi-1/c6ae6691-64ec-443d-8d76-af614c8cc7f9.json deleted file mode 100644 index 84c69a4be..000000000 --- a/data/hfopenllm_v2/microsoft/phi-1/c6ae6691-64ec-443d-8d76-af614c8cc7f9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/microsoft_phi-1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "phi-1", - "id": "microsoft/phi-1", - "developer": "microsoft", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "PhiForCausalLM", - "params_billions": 1.418 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2068 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3139 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0098 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": 
"Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2651 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3525 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1162 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/microsoft/phi-1_5/80567722-8c6b-41b9-8103-3bdaedfdb8ee.json b/data/hfopenllm_v2/microsoft/phi-1_5/80567722-8c6b-41b9-8103-3bdaedfdb8ee.json deleted file mode 100644 index 2fc19d941..000000000 --- a/data/hfopenllm_v2/microsoft/phi-1_5/80567722-8c6b-41b9-8103-3bdaedfdb8ee.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/microsoft_phi-1_5/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "phi-1_5", - "id": "microsoft/phi-1_5", - "developer": "microsoft", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "PhiForCausalLM", - "params_billions": 1.418 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2033 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.336 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0181 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2676 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - 
"source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3404 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1691 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/microsoft/phi-2/20192dc4-ea3a-4413-8457-18a592fa0c64.json b/data/hfopenllm_v2/microsoft/phi-2/20192dc4-ea3a-4413-8457-18a592fa0c64.json deleted file mode 100644 index 428cbe527..000000000 --- a/data/hfopenllm_v2/microsoft/phi-2/20192dc4-ea3a-4413-8457-18a592fa0c64.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/microsoft_phi-2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "phi-2", - "id": "microsoft/phi-2", - "developer": "microsoft", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "PhiForCausalLM", - "params_billions": 2.78 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2739 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4881 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0295 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2718 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4099 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": 
{ - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2628 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/microsoft/phi-4/8c878c05-86f7-4d61-81d7-9bb286516581.json b/data/hfopenllm_v2/microsoft/phi-4/8c878c05-86f7-4d61-81d7-9bb286516581.json deleted file mode 100644 index 5d88bde05..000000000 --- a/data/hfopenllm_v2/microsoft/phi-4/8c878c05-86f7-4d61-81d7-9bb286516581.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/microsoft_phi-4/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "phi-4", - "id": "microsoft/phi-4", - "developer": "microsoft", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Phi3ForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0585 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6691 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3165 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.406 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5034 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5287 - } - } - ] -} \ 
No newline at end of file diff --git a/data/hfopenllm_v2/microsoft/phi-4/fa753be0-4a98-4ec3-9cc9-3bf7b380ad17.json b/data/hfopenllm_v2/microsoft/phi-4/fa753be0-4a98-4ec3-9cc9-3bf7b380ad17.json deleted file mode 100644 index 8eaf9faa9..000000000 --- a/data/hfopenllm_v2/microsoft/phi-4/fa753be0-4a98-4ec3-9cc9-3bf7b380ad17.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/microsoft_phi-4/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "phi-4", - "id": "microsoft/phi-4", - "developer": "microsoft", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Phi3ForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0488 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6703 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2787 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.401 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5034 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5295 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/migtissera/Llama-3-70B-Synthia-v3.5/0516b46b-a957-413f-aadc-58f4339dc60a.json b/data/hfopenllm_v2/migtissera/Llama-3-70B-Synthia-v3.5/0516b46b-a957-413f-aadc-58f4339dc60a.json deleted file mode 100644 index 2429a8f70..000000000 --- 
a/data/hfopenllm_v2/migtissera/Llama-3-70B-Synthia-v3.5/0516b46b-a957-413f-aadc-58f4339dc60a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/migtissera_Llama-3-70B-Synthia-v3.5/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-70B-Synthia-v3.5", - "id": "migtissera/Llama-3-70B-Synthia-v3.5", - "developer": "migtissera", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 70.554 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6076 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6489 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2115 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3876 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4922 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4658 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/migtissera/Llama-3-8B-Synthia-v3.5/97200dd7-7ed0-4a7b-ace9-31c173f017f1.json b/data/hfopenllm_v2/migtissera/Llama-3-8B-Synthia-v3.5/97200dd7-7ed0-4a7b-ace9-31c173f017f1.json deleted file mode 100644 index 30146a9c1..000000000 --- a/data/hfopenllm_v2/migtissera/Llama-3-8B-Synthia-v3.5/97200dd7-7ed0-4a7b-ace9-31c173f017f1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/migtissera_Llama-3-8B-Synthia-v3.5/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-8B-Synthia-v3.5", - "id": "migtissera/Llama-3-8B-Synthia-v3.5", - "developer": "migtissera", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.507 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4888 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0657 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2718 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4044 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.303 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/migtissera/Tess-3-7B-SFT/758f8332-ffa8-4059-ac6f-400f9367bb23.json b/data/hfopenllm_v2/migtissera/Tess-3-7B-SFT/758f8332-ffa8-4059-ac6f-400f9367bb23.json deleted file mode 100644 index 45ec18a45..000000000 --- a/data/hfopenllm_v2/migtissera/Tess-3-7B-SFT/758f8332-ffa8-4059-ac6f-400f9367bb23.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/migtissera_Tess-3-7B-SFT/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - 
"evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Tess-3-7B-SFT", - "id": "migtissera/Tess-3-7B-SFT", - "developer": "migtissera", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3946 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4607 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.04 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.271 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4113 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3034 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/migtissera/Tess-3-Mistral-Nemo-12B/b1103662-055c-471e-ace8-dd75f607491d.json b/data/hfopenllm_v2/migtissera/Tess-3-Mistral-Nemo-12B/b1103662-055c-471e-ace8-dd75f607491d.json deleted file mode 100644 index ada1e45ae..000000000 --- a/data/hfopenllm_v2/migtissera/Tess-3-Mistral-Nemo-12B/b1103662-055c-471e-ace8-dd75f607491d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/migtissera_Tess-3-Mistral-Nemo-12B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Tess-3-Mistral-Nemo-12B", - "id": "migtissera/Tess-3-Mistral-Nemo-12B", - "developer": "migtissera", - "inference_platform": "unknown", - "additional_details": { - 
"precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3355 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4899 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0574 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2508 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4458 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2565 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/migtissera/Tess-v2.5-Phi-3-medium-128k-14B/27b0d675-498f-4351-b92f-7c0d1a3c83bd.json b/data/hfopenllm_v2/migtissera/Tess-v2.5-Phi-3-medium-128k-14B/27b0d675-498f-4351-b92f-7c0d1a3c83bd.json deleted file mode 100644 index 81b7d3cb0..000000000 --- a/data/hfopenllm_v2/migtissera/Tess-v2.5-Phi-3-medium-128k-14B/27b0d675-498f-4351-b92f-7c0d1a3c83bd.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/migtissera_Tess-v2.5-Phi-3-medium-128k-14B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Tess-v2.5-Phi-3-medium-128k-14B", - "id": "migtissera/Tess-v2.5-Phi-3-medium-128k-14B", - "developer": "migtissera", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Phi3ForCausalLM", - "params_billions": 13.96 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - 
"source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4539 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6207 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0506 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3079 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4113 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3732 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/migtissera/Tess-v2.5.2-Qwen2-72B/3f1f88d4-2908-4f28-b8d3-4f9ded18ba0e.json b/data/hfopenllm_v2/migtissera/Tess-v2.5.2-Qwen2-72B/3f1f88d4-2908-4f28-b8d3-4f9ded18ba0e.json deleted file mode 100644 index 82ec5a874..000000000 --- a/data/hfopenllm_v2/migtissera/Tess-v2.5.2-Qwen2-72B/3f1f88d4-2908-4f28-b8d3-4f9ded18ba0e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/migtissera_Tess-v2.5.2-Qwen2-72B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Tess-v2.5.2-Qwen2-72B", - "id": "migtissera/Tess-v2.5.2-Qwen2-72B", - "developer": "migtissera", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 72.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4494 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6647 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2938 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3507 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4188 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5561 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/migtissera/Trinity-2-Codestral-22B-v0.2/3883b0d3-e442-42d3-adc6-ed959c902dd3.json b/data/hfopenllm_v2/migtissera/Trinity-2-Codestral-22B-v0.2/3883b0d3-e442-42d3-adc6-ed959c902dd3.json deleted file mode 100644 index 3c3025654..000000000 --- a/data/hfopenllm_v2/migtissera/Trinity-2-Codestral-22B-v0.2/3883b0d3-e442-42d3-adc6-ed959c902dd3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/migtissera_Trinity-2-Codestral-22B-v0.2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Trinity-2-Codestral-22B-v0.2", - "id": "migtissera/Trinity-2-Codestral-22B-v0.2", - "developer": "migtissera", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 22.247 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4345 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - 
"source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5686 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0838 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3003 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4045 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.334 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/migtissera/Trinity-2-Codestral-22B-v0.2/da172cdb-1388-42f5-97b1-ae8e15291631.json b/data/hfopenllm_v2/migtissera/Trinity-2-Codestral-22B-v0.2/da172cdb-1388-42f5-97b1-ae8e15291631.json deleted file mode 100644 index 5c049da80..000000000 --- a/data/hfopenllm_v2/migtissera/Trinity-2-Codestral-22B-v0.2/da172cdb-1388-42f5-97b1-ae8e15291631.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/migtissera_Trinity-2-Codestral-22B-v0.2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Trinity-2-Codestral-22B-v0.2", - "id": "migtissera/Trinity-2-Codestral-22B-v0.2", - "developer": "migtissera", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 22.247 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.443 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5706 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0869 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3079 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4031 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3354 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/migtissera/Trinity-2-Codestral-22B/7c94dbfa-4b3a-43fd-9f2c-b3d63d8ef700.json b/data/hfopenllm_v2/migtissera/Trinity-2-Codestral-22B/7c94dbfa-4b3a-43fd-9f2c-b3d63d8ef700.json deleted file mode 100644 index 186299b68..000000000 --- a/data/hfopenllm_v2/migtissera/Trinity-2-Codestral-22B/7c94dbfa-4b3a-43fd-9f2c-b3d63d8ef700.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/migtissera_Trinity-2-Codestral-22B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Trinity-2-Codestral-22B", - "id": "migtissera/Trinity-2-Codestral-22B", - "developer": "migtissera", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 22.247 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4202 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5593 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - 
"source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0967 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3146 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4111 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3308 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/mindw96/DeepSeek-llama3.3-Bllossom-8B-DACON-LLM3/7cdd1de0-767d-4527-a024-c67166bb8b20.json b/data/hfopenllm_v2/mindw96/DeepSeek-llama3.3-Bllossom-8B-DACON-LLM3/7cdd1de0-767d-4527-a024-c67166bb8b20.json deleted file mode 100644 index 036d983ff..000000000 --- a/data/hfopenllm_v2/mindw96/DeepSeek-llama3.3-Bllossom-8B-DACON-LLM3/7cdd1de0-767d-4527-a024-c67166bb8b20.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/mindw96_DeepSeek-llama3.3-Bllossom-8B-DACON-LLM3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "DeepSeek-llama3.3-Bllossom-8B-DACON-LLM3", - "id": "mindw96/DeepSeek-llama3.3-Bllossom-8B-DACON-LLM3", - "developer": "mindw96", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1388 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3068 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": 
"Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0083 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2508 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3792 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1106 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/minghaowu/Qwen1.5-1.8B-OpenHermes-2.5/d4702278-54c4-42e8-a901-dfe5c7f2004a.json b/data/hfopenllm_v2/minghaowu/Qwen1.5-1.8B-OpenHermes-2.5/d4702278-54c4-42e8-a901-dfe5c7f2004a.json deleted file mode 100644 index fb2414062..000000000 --- a/data/hfopenllm_v2/minghaowu/Qwen1.5-1.8B-OpenHermes-2.5/d4702278-54c4-42e8-a901-dfe5c7f2004a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/minghaowu_Qwen1.5-1.8B-OpenHermes-2.5/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen1.5-1.8B-OpenHermes-2.5", - "id": "minghaowu/Qwen1.5-1.8B-OpenHermes-2.5", - "developer": "minghaowu", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.837 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2778 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3375 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0242 - } - }, - { - 
"evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2836 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3529 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1792 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ministral/Ministral-3b-instruct/149f8ee5-4376-4fcc-8f87-7412a3083570.json b/data/hfopenllm_v2/ministral/Ministral-3b-instruct/149f8ee5-4376-4fcc-8f87-7412a3083570.json deleted file mode 100644 index d2bace4f0..000000000 --- a/data/hfopenllm_v2/ministral/Ministral-3b-instruct/149f8ee5-4376-4fcc-8f87-7412a3083570.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ministral_Ministral-3b-instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Ministral-3b-instruct", - "id": "ministral/Ministral-3b-instruct", - "developer": "ministral", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 3.316 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1358 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3192 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0083 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": 
false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2517 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3382 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1093 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/mistral-community/Mistral-7B-v0.2/de82b746-c5d7-450a-bc2b-1b2859d91d6b.json b/data/hfopenllm_v2/mistral-community/Mistral-7B-v0.2/de82b746-c5d7-450a-bc2b-1b2859d91d6b.json deleted file mode 100644 index 0f1e81e70..000000000 --- a/data/hfopenllm_v2/mistral-community/Mistral-7B-v0.2/de82b746-c5d7-450a-bc2b-1b2859d91d6b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/mistral-community_Mistral-7B-v0.2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral-7B-v0.2", - "id": "mistral-community/Mistral-7B-v0.2", - "developer": "mistral-community", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2266 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.451 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0302 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2919 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - 
"source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4032 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2953 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/mistral-community/Mixtral-8x22B-v0.1/d2a916a6-288a-4761-a3fd-ca674edb67c1.json b/data/hfopenllm_v2/mistral-community/Mixtral-8x22B-v0.1/d2a916a6-288a-4761-a3fd-ca674edb67c1.json deleted file mode 100644 index 944c1169a..000000000 --- a/data/hfopenllm_v2/mistral-community/Mixtral-8x22B-v0.1/d2a916a6-288a-4761-a3fd-ca674edb67c1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/mistral-community_Mixtral-8x22B-v0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mixtral-8x22B-v0.1", - "id": "mistral-community/Mixtral-8x22B-v0.1", - "developer": "mistral-community", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Unknown", - "params_billions": 0.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3167 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.38 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1543 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.33 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.3533 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.36 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/mistral-community/mixtral-8x22B-v0.3/cda497f9-c7f9-48d6-944b-0167476e5e5c.json b/data/hfopenllm_v2/mistral-community/mixtral-8x22B-v0.3/cda497f9-c7f9-48d6-944b-0167476e5e5c.json deleted file mode 100644 index 036d6208b..000000000 --- a/data/hfopenllm_v2/mistral-community/mixtral-8x22B-v0.3/cda497f9-c7f9-48d6-944b-0167476e5e5c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/mistral-community_mixtral-8x22B-v0.3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "mixtral-8x22B-v0.3", - "id": "mistral-community/mixtral-8x22B-v0.3", - "developer": "mistral-community", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MixtralForCausalLM", - "params_billions": 140.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2583 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.625 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1835 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3775 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4037 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": 
"TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4639 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/mistralai/Codestral-22B-v0.1/b56c6c01-a226-4090-9332-330535d79e24.json b/data/hfopenllm_v2/mistralai/Codestral-22B-v0.1/b56c6c01-a226-4090-9332-330535d79e24.json deleted file mode 100644 index d2f9f4474..000000000 --- a/data/hfopenllm_v2/mistralai/Codestral-22B-v0.1/b56c6c01-a226-4090-9332-330535d79e24.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/mistralai_Codestral-22B-v0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Codestral-22B-v0.1", - "id": "mistralai/Codestral-22B-v0.1", - "developer": "mistralai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 22.247 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5772 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5139 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1005 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2987 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4187 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3156 - } - } - 
] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/mistralai/Ministral-8B-Instruct-2410/0ddc8e10-9cc5-48eb-b5b0-a2c2f071862b.json b/data/hfopenllm_v2/mistralai/Ministral-8B-Instruct-2410/0ddc8e10-9cc5-48eb-b5b0-a2c2f071862b.json deleted file mode 100644 index 3393c5511..000000000 --- a/data/hfopenllm_v2/mistralai/Ministral-8B-Instruct-2410/0ddc8e10-9cc5-48eb-b5b0-a2c2f071862b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/mistralai_Ministral-8B-Instruct-2410/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Ministral-8B-Instruct-2410", - "id": "mistralai/Ministral-8B-Instruct-2410", - "developer": "mistralai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 8.02 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5896 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4762 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1956 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2844 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4138 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3291 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/mistralai/Mistral-7B-Instruct-v0.1/2917c469-7e22-497e-8d62-9b9972266658.json 
b/data/hfopenllm_v2/mistralai/Mistral-7B-Instruct-v0.1/2917c469-7e22-497e-8d62-9b9972266658.json deleted file mode 100644 index 46b328f13..000000000 --- a/data/hfopenllm_v2/mistralai/Mistral-7B-Instruct-v0.1/2917c469-7e22-497e-8d62-9b9972266658.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/mistralai_Mistral-7B-Instruct-v0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral-7B-Instruct-v0.1", - "id": "mistralai/Mistral-7B-Instruct-v0.1", - "developer": "mistralai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4487 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3355 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0227 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.25 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3848 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2414 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/mistralai/Mistral-7B-Instruct-v0.2/2424d85c-e092-4e7c-bf4f-ae014d08a159.json b/data/hfopenllm_v2/mistralai/Mistral-7B-Instruct-v0.2/2424d85c-e092-4e7c-bf4f-ae014d08a159.json deleted file mode 100644 index 3c886d188..000000000 --- 
a/data/hfopenllm_v2/mistralai/Mistral-7B-Instruct-v0.2/2424d85c-e092-4e7c-bf4f-ae014d08a159.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/mistralai_Mistral-7B-Instruct-v0.2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral-7B-Instruct-v0.2", - "id": "mistralai/Mistral-7B-Instruct-v0.2", - "developer": "mistralai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5496 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.446 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0302 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.276 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3966 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2717 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/mistralai/Mistral-7B-Instruct-v0.3/90278363-1d8f-47ca-a7dc-c51c6b511dc9.json b/data/hfopenllm_v2/mistralai/Mistral-7B-Instruct-v0.3/90278363-1d8f-47ca-a7dc-c51c6b511dc9.json deleted file mode 100644 index e6172895f..000000000 --- a/data/hfopenllm_v2/mistralai/Mistral-7B-Instruct-v0.3/90278363-1d8f-47ca-a7dc-c51c6b511dc9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/mistralai_Mistral-7B-Instruct-v0.3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral-7B-Instruct-v0.3", - "id": "mistralai/Mistral-7B-Instruct-v0.3", - "developer": "mistralai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5465 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4722 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0385 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2794 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3739 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3075 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/mistralai/Mistral-7B-v0.1/3c3197ee-675d-4bb7-874d-28104d2a3cae.json b/data/hfopenllm_v2/mistralai/Mistral-7B-v0.1/3c3197ee-675d-4bb7-874d-28104d2a3cae.json deleted file mode 100644 index 59e48b5f6..000000000 --- a/data/hfopenllm_v2/mistralai/Mistral-7B-v0.1/3c3197ee-675d-4bb7-874d-28104d2a3cae.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/mistralai_Mistral-7B-v0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging 
Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral-7B-v0.1", - "id": "mistralai/Mistral-7B-v0.1", - "developer": "mistralai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2386 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4419 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0295 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2919 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4139 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3013 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/mistralai/Mistral-7B-v0.3/eb5a8679-bfdd-40f2-9a32-55c04a65ae7e.json b/data/hfopenllm_v2/mistralai/Mistral-7B-v0.3/eb5a8679-bfdd-40f2-9a32-55c04a65ae7e.json deleted file mode 100644 index ab5aca3af..000000000 --- a/data/hfopenllm_v2/mistralai/Mistral-7B-v0.3/eb5a8679-bfdd-40f2-9a32-55c04a65ae7e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/mistralai_Mistral-7B-v0.3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral-7B-v0.3", - "id": "mistralai/Mistral-7B-v0.3", - "developer": "mistralai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - 
"architecture": "MistralForCausalLM", - "params_billions": 7.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2266 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4517 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0302 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2919 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4032 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2953 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/mistralai/Mistral-Large-Instruct-2411/d770f88d-b110-4f27-85e9-e52217c11798.json b/data/hfopenllm_v2/mistralai/Mistral-Large-Instruct-2411/d770f88d-b110-4f27-85e9-e52217c11798.json deleted file mode 100644 index 439c483ad..000000000 --- a/data/hfopenllm_v2/mistralai/Mistral-Large-Instruct-2411/d770f88d-b110-4f27-85e9-e52217c11798.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/mistralai_Mistral-Large-Instruct-2411/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral-Large-Instruct-2411", - "id": "mistralai/Mistral-Large-Instruct-2411", - "developer": "mistralai", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 122.61 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - 
"source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8401 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6747 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4955 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4371 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.454 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5562 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/mistralai/Mistral-Nemo-Base-2407/364328ce-5de7-401f-ad84-0c76e3c1dc91.json b/data/hfopenllm_v2/mistralai/Mistral-Nemo-Base-2407/364328ce-5de7-401f-ad84-0c76e3c1dc91.json deleted file mode 100644 index 9a51045d8..000000000 --- a/data/hfopenllm_v2/mistralai/Mistral-Nemo-Base-2407/364328ce-5de7-401f-ad84-0c76e3c1dc91.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/mistralai_Mistral-Nemo-Base-2407/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral-Nemo-Base-2407", - "id": "mistralai/Mistral-Nemo-Base-2407", - "developer": "mistralai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 11.58 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 
1.0 - }, - "score_details": { - "score": 0.163 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5035 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0597 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2936 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3921 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3472 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/mistralai/Mistral-Nemo-Instruct-2407/f7dcfdbb-ff12-4692-9702-712de3d0b7ba.json b/data/hfopenllm_v2/mistralai/Mistral-Nemo-Instruct-2407/f7dcfdbb-ff12-4692-9702-712de3d0b7ba.json deleted file mode 100644 index 3522fc93d..000000000 --- a/data/hfopenllm_v2/mistralai/Mistral-Nemo-Instruct-2407/f7dcfdbb-ff12-4692-9702-712de3d0b7ba.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/mistralai_Mistral-Nemo-Instruct-2407/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral-Nemo-Instruct-2407", - "id": "mistralai/Mistral-Nemo-Instruct-2407", - "developer": "mistralai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.638 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5037 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1269 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2903 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.39 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3517 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/mistralai/Mistral-Small-24B-Base-2501/d641aa88-9981-4a25-90d5-fcc4564ede52.json b/data/hfopenllm_v2/mistralai/Mistral-Small-24B-Base-2501/d641aa88-9981-4a25-90d5-fcc4564ede52.json deleted file mode 100644 index f8fd7098d..000000000 --- a/data/hfopenllm_v2/mistralai/Mistral-Small-24B-Base-2501/d641aa88-9981-4a25-90d5-fcc4564ede52.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/mistralai_Mistral-Small-24B-Base-2501/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral-Small-24B-Base-2501", - "id": "mistralai/Mistral-Small-24B-Base-2501", - "developer": "mistralai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 23.572 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1672 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.6442 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1971 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3876 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4237 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5406 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/mistralai/Mistral-Small-Instruct-2409/8915e742-df2e-41bc-b83f-3e111edfd257.json b/data/hfopenllm_v2/mistralai/Mistral-Small-Instruct-2409/8915e742-df2e-41bc-b83f-3e111edfd257.json deleted file mode 100644 index b9bddd2f0..000000000 --- a/data/hfopenllm_v2/mistralai/Mistral-Small-Instruct-2409/8915e742-df2e-41bc-b83f-3e111edfd257.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/mistralai_Mistral-Small-Instruct-2409/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral-Small-Instruct-2409", - "id": "mistralai/Mistral-Small-Instruct-2409", - "developer": "mistralai", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 22.247 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6283 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.583 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": 
"DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2039 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3331 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4063 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4099 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/mistralai/Mistral-Small-Instruct-2409/e29a5e35-8677-4e53-83fd-85e919b4366a.json b/data/hfopenllm_v2/mistralai/Mistral-Small-Instruct-2409/e29a5e35-8677-4e53-83fd-85e919b4366a.json deleted file mode 100644 index 883513689..000000000 --- a/data/hfopenllm_v2/mistralai/Mistral-Small-Instruct-2409/e29a5e35-8677-4e53-83fd-85e919b4366a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/mistralai_Mistral-Small-Instruct-2409/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral-Small-Instruct-2409", - "id": "mistralai/Mistral-Small-Instruct-2409", - "developer": "mistralai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 22.05 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.667 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5213 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 
0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1435 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3238 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3632 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.396 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/mistralai/Mixtral-8x22B-Instruct-v0.1/e5c55d38-dc04-42b4-9aca-ae7be436ebe0.json b/data/hfopenllm_v2/mistralai/Mixtral-8x22B-Instruct-v0.1/e5c55d38-dc04-42b4-9aca-ae7be436ebe0.json deleted file mode 100644 index 95271fa45..000000000 --- a/data/hfopenllm_v2/mistralai/Mixtral-8x22B-Instruct-v0.1/e5c55d38-dc04-42b4-9aca-ae7be436ebe0.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/mistralai_Mixtral-8x22B-Instruct-v0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mixtral-8x22B-Instruct-v0.1", - "id": "mistralai/Mixtral-8x22B-Instruct-v0.1", - "developer": "mistralai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MixtralForCausalLM", - "params_billions": 140.621 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7184 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6125 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1873 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - 
"hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3733 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4311 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4483 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/mistralai/Mixtral-8x22B-v0.1/504baceb-6684-430d-a532-b7b5b0b061fe.json b/data/hfopenllm_v2/mistralai/Mixtral-8x22B-v0.1/504baceb-6684-430d-a532-b7b5b0b061fe.json deleted file mode 100644 index e967e9002..000000000 --- a/data/hfopenllm_v2/mistralai/Mixtral-8x22B-v0.1/504baceb-6684-430d-a532-b7b5b0b061fe.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/mistralai_Mixtral-8x22B-v0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mixtral-8x22B-v0.1", - "id": "mistralai/Mixtral-8x22B-v0.1", - "developer": "mistralai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MixtralForCausalLM", - "params_billions": 140.621 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2583 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.624 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1835 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3758 - } - 
}, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4037 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4639 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/mistralai/Mixtral-8x7B-Instruct-v0.1/31fcd34a-af1e-4eab-bd9a-5ec17eb572d2.json b/data/hfopenllm_v2/mistralai/Mixtral-8x7B-Instruct-v0.1/31fcd34a-af1e-4eab-bd9a-5ec17eb572d2.json deleted file mode 100644 index 659f51d7a..000000000 --- a/data/hfopenllm_v2/mistralai/Mixtral-8x7B-Instruct-v0.1/31fcd34a-af1e-4eab-bd9a-5ec17eb572d2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/mistralai_Mixtral-8x7B-Instruct-v0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mixtral-8x7B-Instruct-v0.1", - "id": "mistralai/Mixtral-8x7B-Instruct-v0.1", - "developer": "mistralai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MixtralForCausalLM", - "params_billions": 46.703 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5599 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4962 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0914 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3029 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": 
"Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4203 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3692 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/mistralai/Mixtral-8x7B-v0.1/01ab0a3e-393a-497a-9b32-8af790b7581a.json b/data/hfopenllm_v2/mistralai/Mixtral-8x7B-v0.1/01ab0a3e-393a-497a-9b32-8af790b7581a.json deleted file mode 100644 index b2775e111..000000000 --- a/data/hfopenllm_v2/mistralai/Mixtral-8x7B-v0.1/01ab0a3e-393a-497a-9b32-8af790b7581a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/mistralai_Mixtral-8x7B-v0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mixtral-8x7B-v0.1", - "id": "mistralai/Mixtral-8x7B-v0.1", - "developer": "mistralai", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MixtralForCausalLM", - "params_billions": 46.703 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2326 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5098 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0937 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3205 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4413 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - 
"source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3871 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/mistralai/Mixtral-8x7B-v0.1/541967a6-b856-4dc9-958a-9335197fba99.json b/data/hfopenllm_v2/mistralai/Mixtral-8x7B-v0.1/541967a6-b856-4dc9-958a-9335197fba99.json deleted file mode 100644 index 2f9aa7f43..000000000 --- a/data/hfopenllm_v2/mistralai/Mixtral-8x7B-v0.1/541967a6-b856-4dc9-958a-9335197fba99.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/mistralai_Mixtral-8x7B-v0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mixtral-8x7B-v0.1", - "id": "mistralai/Mixtral-8x7B-v0.1", - "developer": "mistralai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MixtralForCausalLM", - "params_billions": 46.703 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2415 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5087 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.102 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3138 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4321 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.385 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/mixtao/MixTAO-7Bx2-MoE-v8.1/ee31c801-67cb-46a3-9e39-02e842c0473f.json b/data/hfopenllm_v2/mixtao/MixTAO-7Bx2-MoE-v8.1/ee31c801-67cb-46a3-9e39-02e842c0473f.json deleted file mode 100644 index f557e9ecc..000000000 --- a/data/hfopenllm_v2/mixtao/MixTAO-7Bx2-MoE-v8.1/ee31c801-67cb-46a3-9e39-02e842c0473f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/mixtao_MixTAO-7Bx2-MoE-v8.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MixTAO-7Bx2-MoE-v8.1", - "id": "mixtao/MixTAO-7Bx2-MoE-v8.1", - "developer": "mixtao", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MixtralForCausalLM", - "params_billions": 12.879 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4162 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5189 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0906 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2844 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4463 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3123 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/mkurman/llama-3.2-MEDIT-3B-o1/65fabe8b-05af-461e-b804-fcff3492da34.json 
b/data/hfopenllm_v2/mkurman/llama-3.2-MEDIT-3B-o1/65fabe8b-05af-461e-b804-fcff3492da34.json deleted file mode 100644 index 3b0161b0b..000000000 --- a/data/hfopenllm_v2/mkurman/llama-3.2-MEDIT-3B-o1/65fabe8b-05af-461e-b804-fcff3492da34.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/mkurman_llama-3.2-MEDIT-3B-o1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "llama-3.2-MEDIT-3B-o1", - "id": "mkurman/llama-3.2-MEDIT-3B-o1", - "developer": "mkurman", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.607 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4382 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.44 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1307 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2659 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3565 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2741 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/mkurman/phi-4-MedIT-11B-exp-1/7e1a7121-2c9f-4196-bbdd-48aea257f384.json b/data/hfopenllm_v2/mkurman/phi-4-MedIT-11B-exp-1/7e1a7121-2c9f-4196-bbdd-48aea257f384.json deleted file mode 100644 index f017555d6..000000000 --- a/data/hfopenllm_v2/mkurman/phi-4-MedIT-11B-exp-1/7e1a7121-2c9f-4196-bbdd-48aea257f384.json +++ 
/dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/mkurman_phi-4-MedIT-11B-exp-1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "phi-4-MedIT-11B-exp-1", - "id": "mkurman/phi-4-MedIT-11B-exp-1", - "developer": "mkurman", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Phi3ForCausalLM", - "params_billions": 11.514 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5948 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5414 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0899 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3012 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3848 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3825 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/mkurman/phi4-MedIT-10B-o1/dd32609c-316e-4511-8791-fcae33a1a506.json b/data/hfopenllm_v2/mkurman/phi4-MedIT-10B-o1/dd32609c-316e-4511-8791-fcae33a1a506.json deleted file mode 100644 index 5207ed657..000000000 --- a/data/hfopenllm_v2/mkurman/phi4-MedIT-10B-o1/dd32609c-316e-4511-8791-fcae33a1a506.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/mkurman_phi4-MedIT-10B-o1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": 
"documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "phi4-MedIT-10B-o1", - "id": "mkurman/phi4-MedIT-10B-o1", - "developer": "mkurman", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaMedITForCausalLM", - "params_billions": 10.255 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3463 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5198 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1148 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2458 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3968 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3507 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/mkxu/llama-3-8b-instruct-fpo/d95d7058-49eb-47d7-b790-3a253291d22b.json b/data/hfopenllm_v2/mkxu/llama-3-8b-instruct-fpo/d95d7058-49eb-47d7-b790-3a253291d22b.json deleted file mode 100644 index 9ef45d0ae..000000000 --- a/data/hfopenllm_v2/mkxu/llama-3-8b-instruct-fpo/d95d7058-49eb-47d7-b790-3a253291d22b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/mkxu_llama-3-8b-instruct-fpo/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "llama-3-8b-instruct-fpo", - "id": "mkxu/llama-3-8b-instruct-fpo", - "developer": "mkxu", - "inference_platform": 
"unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.679 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4959 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0733 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2777 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3658 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3605 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/mkxu/llama-3-8b-po1/37cbc3d6-1198-4e23-b86c-1fd979eacd9a.json b/data/hfopenllm_v2/mkxu/llama-3-8b-po1/37cbc3d6-1198-4e23-b86c-1fd979eacd9a.json deleted file mode 100644 index db56d51b6..000000000 --- a/data/hfopenllm_v2/mkxu/llama-3-8b-po1/37cbc3d6-1198-4e23-b86c-1fd979eacd9a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/mkxu_llama-3-8b-po1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "llama-3-8b-po1", - "id": "mkxu/llama-3-8b-po1", - "developer": "mkxu", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, 
- "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4081 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4976 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0702 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.297 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3804 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3562 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/mlabonne/AlphaMonarch-7B/76d0d338-e502-4638-adad-c4c4df00c26f.json b/data/hfopenllm_v2/mlabonne/AlphaMonarch-7B/76d0d338-e502-4638-adad-c4c4df00c26f.json deleted file mode 100644 index e4a056c61..000000000 --- a/data/hfopenllm_v2/mlabonne/AlphaMonarch-7B/76d0d338-e502-4638-adad-c4c4df00c26f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/mlabonne_AlphaMonarch-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "AlphaMonarch-7B", - "id": "mlabonne/AlphaMonarch-7B", - "developer": "mlabonne", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4939 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - 
"dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4626 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0408 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2701 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4121 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2473 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/mlabonne/Beyonder-4x7B-v3/f47375bd-547a-4d0b-8c96-bbe2bc1ac445.json b/data/hfopenllm_v2/mlabonne/Beyonder-4x7B-v3/f47375bd-547a-4d0b-8c96-bbe2bc1ac445.json deleted file mode 100644 index ae8e73867..000000000 --- a/data/hfopenllm_v2/mlabonne/Beyonder-4x7B-v3/f47375bd-547a-4d0b-8c96-bbe2bc1ac445.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/mlabonne_Beyonder-4x7B-v3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Beyonder-4x7B-v3", - "id": "mlabonne/Beyonder-4x7B-v3", - "developer": "mlabonne", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MixtralForCausalLM", - "params_billions": 24.154 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5608 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.4671 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0536 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2852 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4045 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2512 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/mlabonne/BigQwen2.5-52B-Instruct/6b1ed68c-3099-4bd7-892b-cdc36c90ccfe.json b/data/hfopenllm_v2/mlabonne/BigQwen2.5-52B-Instruct/6b1ed68c-3099-4bd7-892b-cdc36c90ccfe.json deleted file mode 100644 index 7401e378b..000000000 --- a/data/hfopenllm_v2/mlabonne/BigQwen2.5-52B-Instruct/6b1ed68c-3099-4bd7-892b-cdc36c90ccfe.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/mlabonne_BigQwen2.5-52B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "BigQwen2.5-52B-Instruct", - "id": "mlabonne/BigQwen2.5-52B-Instruct", - "developer": "mlabonne", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 52.268 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7913 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7121 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - 
"metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5476 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.302 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4113 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5519 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/mlabonne/BigQwen2.5-Echo-47B-Instruct/0e59c8ca-cde0-4482-ab03-3309bcb8737c.json b/data/hfopenllm_v2/mlabonne/BigQwen2.5-Echo-47B-Instruct/0e59c8ca-cde0-4482-ab03-3309bcb8737c.json deleted file mode 100644 index 015017389..000000000 --- a/data/hfopenllm_v2/mlabonne/BigQwen2.5-Echo-47B-Instruct/0e59c8ca-cde0-4482-ab03-3309bcb8737c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/mlabonne_BigQwen2.5-Echo-47B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "BigQwen2.5-Echo-47B-Instruct", - "id": "mlabonne/BigQwen2.5-Echo-47B-Instruct", - "developer": "mlabonne", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 47.392 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7357 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6125 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": 
{ - "score": 0.4381 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3146 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4125 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4734 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/mlabonne/ChimeraLlama-3-8B-v2/d7e900e2-0574-44cd-a68a-0dd2715cf48c.json b/data/hfopenllm_v2/mlabonne/ChimeraLlama-3-8B-v2/d7e900e2-0574-44cd-a68a-0dd2715cf48c.json deleted file mode 100644 index e8c365a9a..000000000 --- a/data/hfopenllm_v2/mlabonne/ChimeraLlama-3-8B-v2/d7e900e2-0574-44cd-a68a-0dd2715cf48c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/mlabonne_ChimeraLlama-3-8B-v2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ChimeraLlama-3-8B-v2", - "id": "mlabonne/ChimeraLlama-3-8B-v2", - "developer": "mlabonne", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4469 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5046 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0906 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2852 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3791 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3569 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/mlabonne/ChimeraLlama-3-8B-v3/fd626c3f-566d-4193-9a85-e7c9a89e671c.json b/data/hfopenllm_v2/mlabonne/ChimeraLlama-3-8B-v3/fd626c3f-566d-4193-9a85-e7c9a89e671c.json deleted file mode 100644 index e51cf0145..000000000 --- a/data/hfopenllm_v2/mlabonne/ChimeraLlama-3-8B-v3/fd626c3f-566d-4193-9a85-e7c9a89e671c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/mlabonne_ChimeraLlama-3-8B-v3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ChimeraLlama-3-8B-v3", - "id": "mlabonne/ChimeraLlama-3-8B-v3", - "developer": "mlabonne", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4408 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4978 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0884 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2919 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": 
"hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4004 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3669 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/mlabonne/Daredevil-8B-abliterated/196b04ae-fd53-400f-9f08-19edd4959f6e.json b/data/hfopenllm_v2/mlabonne/Daredevil-8B-abliterated/196b04ae-fd53-400f-9f08-19edd4959f6e.json deleted file mode 100644 index 8e430f3b6..000000000 --- a/data/hfopenllm_v2/mlabonne/Daredevil-8B-abliterated/196b04ae-fd53-400f-9f08-19edd4959f6e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/mlabonne_Daredevil-8B-abliterated/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Daredevil-8B-abliterated", - "id": "mlabonne/Daredevil-8B-abliterated", - "developer": "mlabonne", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4426 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4254 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0944 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2903 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.407 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3701 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/mlabonne/Daredevil-8B/57177299-076a-4506-89a7-ce54af08df4f.json b/data/hfopenllm_v2/mlabonne/Daredevil-8B/57177299-076a-4506-89a7-ce54af08df4f.json deleted file mode 100644 index f05fd6db8..000000000 --- a/data/hfopenllm_v2/mlabonne/Daredevil-8B/57177299-076a-4506-89a7-ce54af08df4f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/mlabonne_Daredevil-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Daredevil-8B", - "id": "mlabonne/Daredevil-8B", - "developer": "mlabonne", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4548 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5194 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1065 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3079 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3939 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": 
false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3831 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/mlabonne/Hermes-3-Llama-3.1-70B-lorablated/d3bdf36f-7f89-4b5a-b6cb-847b49200b5b.json b/data/hfopenllm_v2/mlabonne/Hermes-3-Llama-3.1-70B-lorablated/d3bdf36f-7f89-4b5a-b6cb-847b49200b5b.json deleted file mode 100644 index dced4ed52..000000000 --- a/data/hfopenllm_v2/mlabonne/Hermes-3-Llama-3.1-70B-lorablated/d3bdf36f-7f89-4b5a-b6cb-847b49200b5b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/mlabonne_Hermes-3-Llama-3.1-70B-lorablated/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Hermes-3-Llama-3.1-70B-lorablated", - "id": "mlabonne/Hermes-3-Llama-3.1-70B-lorablated", - "developer": "mlabonne", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 70.554 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3424 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6693 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2243 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3658 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5029 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4679 - } - } - ] -} \ No newline at end of file diff 
--git a/data/hfopenllm_v2/mlabonne/Meta-Llama-3.1-8B-Instruct-abliterated/92619b9e-dacf-4d0a-9f8b-6e131af74fa4.json b/data/hfopenllm_v2/mlabonne/Meta-Llama-3.1-8B-Instruct-abliterated/92619b9e-dacf-4d0a-9f8b-6e131af74fa4.json deleted file mode 100644 index 2110b6907..000000000 --- a/data/hfopenllm_v2/mlabonne/Meta-Llama-3.1-8B-Instruct-abliterated/92619b9e-dacf-4d0a-9f8b-6e131af74fa4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/mlabonne_Meta-Llama-3.1-8B-Instruct-abliterated/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Meta-Llama-3.1-8B-Instruct-abliterated", - "id": "mlabonne/Meta-Llama-3.1-8B-Instruct-abliterated", - "developer": "mlabonne", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7329 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4874 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0687 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2567 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3649 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3503 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/mlabonne/NeuralBeagle14-7B/cbb408ea-ced6-4f47-9066-d4ff6d604b1e.json 
b/data/hfopenllm_v2/mlabonne/NeuralBeagle14-7B/cbb408ea-ced6-4f47-9066-d4ff6d604b1e.json deleted file mode 100644 index 6fc2a6175..000000000 --- a/data/hfopenllm_v2/mlabonne/NeuralBeagle14-7B/cbb408ea-ced6-4f47-9066-d4ff6d604b1e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/mlabonne_NeuralBeagle14-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "NeuralBeagle14-7B", - "id": "mlabonne/NeuralBeagle14-7B", - "developer": "mlabonne", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4935 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4628 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0521 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2819 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4319 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2601 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/mlabonne/NeuralDaredevil-8B-abliterated/6999bb02-29fd-4c59-886f-184362afa06e.json b/data/hfopenllm_v2/mlabonne/NeuralDaredevil-8B-abliterated/6999bb02-29fd-4c59-886f-184362afa06e.json deleted file mode 100644 index 6a7692af4..000000000 --- 
a/data/hfopenllm_v2/mlabonne/NeuralDaredevil-8B-abliterated/6999bb02-29fd-4c59-886f-184362afa06e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/mlabonne_NeuralDaredevil-8B-abliterated/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "NeuralDaredevil-8B-abliterated", - "id": "mlabonne/NeuralDaredevil-8B-abliterated", - "developer": "mlabonne", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7561 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5111 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0906 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3062 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4019 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3841 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/mlabonne/NeuralDaredevil-8B-abliterated/913d1d8e-0b02-4ce5-9b7c-403143a8c880.json b/data/hfopenllm_v2/mlabonne/NeuralDaredevil-8B-abliterated/913d1d8e-0b02-4ce5-9b7c-403143a8c880.json deleted file mode 100644 index 24cd13aa3..000000000 --- a/data/hfopenllm_v2/mlabonne/NeuralDaredevil-8B-abliterated/913d1d8e-0b02-4ce5-9b7c-403143a8c880.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/mlabonne_NeuralDaredevil-8B-abliterated/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "NeuralDaredevil-8B-abliterated", - "id": "mlabonne/NeuralDaredevil-8B-abliterated", - "developer": "mlabonne", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4162 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5124 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0853 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3029 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.415 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3802 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/mlabonne/OrpoLlama-3-8B/82c87bc0-29cf-4150-92f5-c80fb0028ea6.json b/data/hfopenllm_v2/mlabonne/OrpoLlama-3-8B/82c87bc0-29cf-4150-92f5-c80fb0028ea6.json deleted file mode 100644 index e19f78336..000000000 --- a/data/hfopenllm_v2/mlabonne/OrpoLlama-3-8B/82c87bc0-29cf-4150-92f5-c80fb0028ea6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/mlabonne_OrpoLlama-3-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging 
Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "OrpoLlama-3-8B", - "id": "mlabonne/OrpoLlama-3-8B", - "developer": "mlabonne", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3653 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4424 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0559 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2794 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3579 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2705 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/mlabonne/phixtral-2x2_8/a18834ad-6143-4ce2-9842-471817a60a39.json b/data/hfopenllm_v2/mlabonne/phixtral-2x2_8/a18834ad-6143-4ce2-9842-471817a60a39.json deleted file mode 100644 index f6430bd4f..000000000 --- a/data/hfopenllm_v2/mlabonne/phixtral-2x2_8/a18834ad-6143-4ce2-9842-471817a60a39.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/mlabonne_phixtral-2x2_8/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "phixtral-2x2_8", - "id": "mlabonne/phixtral-2x2_8", - "developer": "mlabonne", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "PhiForCausalLM", 
- "params_billions": 4.458 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3431 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4889 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0355 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2651 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3644 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2551 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/mlx-community/Josiefied-Qwen2.5-0.5B-Instruct-abliterated-v1-float32/be900bcf-8ec9-484f-81db-0e83975c1ecd.json b/data/hfopenllm_v2/mlx-community/Josiefied-Qwen2.5-0.5B-Instruct-abliterated-v1-float32/be900bcf-8ec9-484f-81db-0e83975c1ecd.json deleted file mode 100644 index da8271107..000000000 --- a/data/hfopenllm_v2/mlx-community/Josiefied-Qwen2.5-0.5B-Instruct-abliterated-v1-float32/be900bcf-8ec9-484f-81db-0e83975c1ecd.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/mlx-community_Josiefied-Qwen2.5-0.5B-Instruct-abliterated-v1-float32/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Josiefied-Qwen2.5-0.5B-Instruct-abliterated-v1-float32", - "id": "mlx-community/Josiefied-Qwen2.5-0.5B-Instruct-abliterated-v1-float32", - "developer": "mlx-community", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - 
"params_billions": 0.494 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3369 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3292 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0846 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2576 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3249 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1638 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/mlx-community/Mistral-Small-24B-Instruct-2501-bf16/d226ccf6-674b-44c6-8b11-d782b59a961a.json b/data/hfopenllm_v2/mlx-community/Mistral-Small-24B-Instruct-2501-bf16/d226ccf6-674b-44c6-8b11-d782b59a961a.json deleted file mode 100644 index b1e204eb5..000000000 --- a/data/hfopenllm_v2/mlx-community/Mistral-Small-24B-Instruct-2501-bf16/d226ccf6-674b-44c6-8b11-d782b59a961a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/mlx-community_Mistral-Small-24B-Instruct-2501-bf16/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral-Small-24B-Instruct-2501-bf16", - "id": "mlx-community/Mistral-Small-24B-Instruct-2501-bf16", - "developer": "mlx-community", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 23.572 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - 
"dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6283 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6713 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3225 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3951 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4618 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5395 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/mmnga/Llama-3-70B-japanese-suzume-vector-v0.1/d8839a1a-8d07-4e0b-bd44-2668c84f750c.json b/data/hfopenllm_v2/mmnga/Llama-3-70B-japanese-suzume-vector-v0.1/d8839a1a-8d07-4e0b-bd44-2668c84f750c.json deleted file mode 100644 index c98ce35a8..000000000 --- a/data/hfopenllm_v2/mmnga/Llama-3-70B-japanese-suzume-vector-v0.1/d8839a1a-8d07-4e0b-bd44-2668c84f750c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/mmnga_Llama-3-70B-japanese-suzume-vector-v0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-70B-japanese-suzume-vector-v0.1", - "id": "mmnga/Llama-3-70B-japanese-suzume-vector-v0.1", - "developer": "mmnga", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 70.554 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy 
on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4649 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6542 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2326 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2861 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4141 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5224 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/mobiuslabsgmbh/DeepSeek-R1-ReDistill-Llama3-8B-v1.1/e90b04db-2eb3-483a-ab0e-ea8aef821d84.json b/data/hfopenllm_v2/mobiuslabsgmbh/DeepSeek-R1-ReDistill-Llama3-8B-v1.1/e90b04db-2eb3-483a-ab0e-ea8aef821d84.json deleted file mode 100644 index f696f25a8..000000000 --- a/data/hfopenllm_v2/mobiuslabsgmbh/DeepSeek-R1-ReDistill-Llama3-8B-v1.1/e90b04db-2eb3-483a-ab0e-ea8aef821d84.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/mobiuslabsgmbh_DeepSeek-R1-ReDistill-Llama3-8B-v1.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "DeepSeek-R1-ReDistill-Llama3-8B-v1.1", - "id": "mobiuslabsgmbh/DeepSeek-R1-ReDistill-Llama3-8B-v1.1", - "developer": "mobiuslabsgmbh", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.3704 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3473 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3285 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.271 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3396 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2198 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/mobiuslabsgmbh/DeepSeek-R1-ReDistill-Qwen-7B-v1.1/900921ae-fbb2-4488-ab19-18987c1d008d.json b/data/hfopenllm_v2/mobiuslabsgmbh/DeepSeek-R1-ReDistill-Qwen-7B-v1.1/900921ae-fbb2-4488-ab19-18987c1d008d.json deleted file mode 100644 index 195277705..000000000 --- a/data/hfopenllm_v2/mobiuslabsgmbh/DeepSeek-R1-ReDistill-Qwen-7B-v1.1/900921ae-fbb2-4488-ab19-18987c1d008d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/mobiuslabsgmbh_DeepSeek-R1-ReDistill-Qwen-7B-v1.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "DeepSeek-R1-ReDistill-Qwen-7B-v1.1", - "id": "mobiuslabsgmbh/DeepSeek-R1-ReDistill-Qwen-7B-v1.1", - "developer": "mobiuslabsgmbh", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3473 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - 
"source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3698 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3497 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2651 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4009 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2326 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/moeru-ai/L3.1-Moe-2x8B-v0.2/0da0a7cd-c075-4bc0-8e88-8acc7212e5c3.json b/data/hfopenllm_v2/moeru-ai/L3.1-Moe-2x8B-v0.2/0da0a7cd-c075-4bc0-8e88-8acc7212e5c3.json deleted file mode 100644 index 3c3e590ae..000000000 --- a/data/hfopenllm_v2/moeru-ai/L3.1-Moe-2x8B-v0.2/0da0a7cd-c075-4bc0-8e88-8acc7212e5c3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/moeru-ai_L3.1-Moe-2x8B-v0.2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "L3.1-Moe-2x8B-v0.2", - "id": "moeru-ai/L3.1-Moe-2x8B-v0.2", - "developer": "moeru-ai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MixtralForCausalLM", - "params_billions": 13.668 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7348 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { 
- "score": 0.5256 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1699 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3003 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4199 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3858 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/moeru-ai/L3.1-Moe-4x8B-v0.1/b50a49cd-2909-4dbe-9c9f-c150abb99845.json b/data/hfopenllm_v2/moeru-ai/L3.1-Moe-4x8B-v0.1/b50a49cd-2909-4dbe-9c9f-c150abb99845.json deleted file mode 100644 index e9612c059..000000000 --- a/data/hfopenllm_v2/moeru-ai/L3.1-Moe-4x8B-v0.1/b50a49cd-2909-4dbe-9c9f-c150abb99845.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/moeru-ai_L3.1-Moe-4x8B-v0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "L3.1-Moe-4x8B-v0.1", - "id": "moeru-ai/L3.1-Moe-4x8B-v0.1", - "developer": "moeru-ai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MixtralForCausalLM", - "params_billions": 24.942 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4332 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4939 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": 
"Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1299 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2592 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3609 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3454 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/moeru-ai/L3.1-Moe-4x8B-v0.2/13831d81-a9dd-43c7-bce1-240aad42fbc6.json b/data/hfopenllm_v2/moeru-ai/L3.1-Moe-4x8B-v0.2/13831d81-a9dd-43c7-bce1-240aad42fbc6.json deleted file mode 100644 index 5c203bb82..000000000 --- a/data/hfopenllm_v2/moeru-ai/L3.1-Moe-4x8B-v0.2/13831d81-a9dd-43c7-bce1-240aad42fbc6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/moeru-ai_L3.1-Moe-4x8B-v0.2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "L3.1-Moe-4x8B-v0.2", - "id": "moeru-ai/L3.1-Moe-4x8B-v0.2", - "developer": "moeru-ai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MixtralForCausalLM", - "params_billions": 24.942 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5407 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4466 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1035 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": 
"GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2668 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3234 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2763 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/monsterapi/Llama-3_1-8B-Instruct-orca-ORPO/56ea7cb3-3a1e-477a-bac8-26a0fde6297a.json b/data/hfopenllm_v2/monsterapi/Llama-3_1-8B-Instruct-orca-ORPO/56ea7cb3-3a1e-477a-bac8-26a0fde6297a.json deleted file mode 100644 index 4f353589f..000000000 --- a/data/hfopenllm_v2/monsterapi/Llama-3_1-8B-Instruct-orca-ORPO/56ea7cb3-3a1e-477a-bac8-26a0fde6297a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/monsterapi_Llama-3_1-8B-Instruct-orca-ORPO/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3_1-8B-Instruct-orca-ORPO", - "id": "monsterapi/Llama-3_1-8B-Instruct-orca-ORPO", - "developer": "monsterapi", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "?", - "params_billions": 16.061 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2273 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2865 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2492 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3445 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1168 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/monsterapi/gemma-2-2b-LoRA-MonsterInstruct/8ce19b33-4f2b-4b8d-80bd-1ed399a5e9dd.json b/data/hfopenllm_v2/monsterapi/gemma-2-2b-LoRA-MonsterInstruct/8ce19b33-4f2b-4b8d-80bd-1ed399a5e9dd.json deleted file mode 100644 index 1294499b7..000000000 --- a/data/hfopenllm_v2/monsterapi/gemma-2-2b-LoRA-MonsterInstruct/8ce19b33-4f2b-4b8d-80bd-1ed399a5e9dd.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/monsterapi_gemma-2-2b-LoRA-MonsterInstruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "gemma-2-2b-LoRA-MonsterInstruct", - "id": "monsterapi/gemma-2-2b-LoRA-MonsterInstruct", - "developer": "monsterapi", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 2.614 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3903 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.365 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0506 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2701 - } - }, - { - "evaluation_name": "MUSR", - "source_data": 
{ - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3644 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1987 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/mosaicml/mpt-7b/18ab167d-b72e-4fa9-94a8-09edc641c73f.json b/data/hfopenllm_v2/mosaicml/mpt-7b/18ab167d-b72e-4fa9-94a8-09edc641c73f.json deleted file mode 100644 index 80c3fe30d..000000000 --- a/data/hfopenllm_v2/mosaicml/mpt-7b/18ab167d-b72e-4fa9-94a8-09edc641c73f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/mosaicml_mpt-7b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "mpt-7b", - "id": "mosaicml/mpt-7b", - "developer": "mosaicml", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MPTForCausalLM", - "params_billions": 7.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2152 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.33 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0159 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2601 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3672 - } - }, - { - "evaluation_name": 
"MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1206 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/mosama/Qwen2.5-1.5B-Instruct-CoT-Reflection/7df237ea-29c0-4d0a-9092-c41df4c13aca.json b/data/hfopenllm_v2/mosama/Qwen2.5-1.5B-Instruct-CoT-Reflection/7df237ea-29c0-4d0a-9092-c41df4c13aca.json deleted file mode 100644 index 553a8fd2a..000000000 --- a/data/hfopenllm_v2/mosama/Qwen2.5-1.5B-Instruct-CoT-Reflection/7df237ea-29c0-4d0a-9092-c41df4c13aca.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/mosama_Qwen2.5-1.5B-Instruct-CoT-Reflection/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-1.5B-Instruct-CoT-Reflection", - "id": "mosama/Qwen2.5-1.5B-Instruct-CoT-Reflection", - "developer": "mosama", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.544 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.287 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4109 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0272 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2617 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3212 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2651 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/mrdayl/OpenCogito/e5dc8caa-2d86-4ff0-af8d-22d85c8faeb0.json b/data/hfopenllm_v2/mrdayl/OpenCogito/e5dc8caa-2d86-4ff0-af8d-22d85c8faeb0.json deleted file mode 100644 index f7201e897..000000000 --- a/data/hfopenllm_v2/mrdayl/OpenCogito/e5dc8caa-2d86-4ff0-af8d-22d85c8faeb0.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/mrdayl_OpenCogito/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "OpenCogito", - "id": "mrdayl/OpenCogito", - "developer": "mrdayl", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.086 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3934 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.472 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2183 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3003 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.424 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3452 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/mrdayl/OpenCognito-r1/01591bb6-9daf-40fb-b802-0a007f4cc388.json b/data/hfopenllm_v2/mrdayl/OpenCognito-r1/01591bb6-9daf-40fb-b802-0a007f4cc388.json deleted file mode 100644 index 745fb3a4c..000000000 --- a/data/hfopenllm_v2/mrdayl/OpenCognito-r1/01591bb6-9daf-40fb-b802-0a007f4cc388.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/mrdayl_OpenCognito-r1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "OpenCognito-r1", - "id": "mrdayl/OpenCognito-r1", - "developer": "mrdayl", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.086 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4241 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4673 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1903 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2995 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4241 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3475 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/mrdayl/OpenCognito-r2/f6c32abf-bbae-4827-9ce2-29ce20c9463e.json b/data/hfopenllm_v2/mrdayl/OpenCognito-r2/f6c32abf-bbae-4827-9ce2-29ce20c9463e.json deleted file mode 100644 index 45eaf201c..000000000 --- 
a/data/hfopenllm_v2/mrdayl/OpenCognito-r2/f6c32abf-bbae-4827-9ce2-29ce20c9463e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/mrdayl_OpenCognito-r2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "OpenCognito-r2", - "id": "mrdayl/OpenCognito-r2", - "developer": "mrdayl", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.086 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3959 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4688 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2024 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3062 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4202 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3462 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/mrdayl/OpenCognito/74a6605d-3557-4458-bef5-cc9420434e68.json b/data/hfopenllm_v2/mrdayl/OpenCognito/74a6605d-3557-4458-bef5-cc9420434e68.json deleted file mode 100644 index 6d8c6acce..000000000 --- a/data/hfopenllm_v2/mrdayl/OpenCognito/74a6605d-3557-4458-bef5-cc9420434e68.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/mrdayl_OpenCognito/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - 
"source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "OpenCognito", - "id": "mrdayl/OpenCognito", - "developer": "mrdayl", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.086 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4062 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4706 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2115 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2978 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4293 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3443 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/mrdayl/OpenThink/dbe6e126-d35c-4634-a544-adf374ed5d00.json b/data/hfopenllm_v2/mrdayl/OpenThink/dbe6e126-d35c-4634-a544-adf374ed5d00.json deleted file mode 100644 index 383e8e8c7..000000000 --- a/data/hfopenllm_v2/mrdayl/OpenThink/dbe6e126-d35c-4634-a544-adf374ed5d00.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/mrdayl_OpenThink/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "OpenThink", - "id": "mrdayl/OpenThink", - "developer": "mrdayl", - "inference_platform": "unknown", - "additional_details": { - 
"precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.777 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2054 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.346 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2885 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2827 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3289 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.185 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/mrm8488/phi-4-14B-grpo-gsm8k-3e/d68681c1-01e4-4af0-9a81-e0aaed0ae865.json b/data/hfopenllm_v2/mrm8488/phi-4-14B-grpo-gsm8k-3e/d68681c1-01e4-4af0-9a81-e0aaed0ae865.json deleted file mode 100644 index 0b5f7c44b..000000000 --- a/data/hfopenllm_v2/mrm8488/phi-4-14B-grpo-gsm8k-3e/d68681c1-01e4-4af0-9a81-e0aaed0ae865.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/mrm8488_phi-4-14B-grpo-gsm8k-3e/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "phi-4-14B-grpo-gsm8k-3e", - "id": "mrm8488/phi-4-14B-grpo-gsm8k-3e", - "developer": "mrm8488", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", 
- "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6885 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6805 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4524 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3356 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3994 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5268 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/mrm8488/phi-4-14B-grpo-limo/de9620b8-7112-436f-8941-fae2c5e7f9e0.json b/data/hfopenllm_v2/mrm8488/phi-4-14B-grpo-limo/de9620b8-7112-436f-8941-fae2c5e7f9e0.json deleted file mode 100644 index 2b68d13b9..000000000 --- a/data/hfopenllm_v2/mrm8488/phi-4-14B-grpo-limo/de9620b8-7112-436f-8941-fae2c5e7f9e0.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/mrm8488_phi-4-14B-grpo-limo/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "phi-4-14B-grpo-limo", - "id": "mrm8488/phi-4-14B-grpo-limo", - "developer": "mrm8488", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6812 - } - }, - { - 
"evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6785 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4569 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3364 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3981 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5261 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/mukaj/Llama-3.1-Hawkish-8B/cafee7ac-deb6-4c4b-af8f-81548648cb14.json b/data/hfopenllm_v2/mukaj/Llama-3.1-Hawkish-8B/cafee7ac-deb6-4c4b-af8f-81548648cb14.json deleted file mode 100644 index 0f6acc63a..000000000 --- a/data/hfopenllm_v2/mukaj/Llama-3.1-Hawkish-8B/cafee7ac-deb6-4c4b-af8f-81548648cb14.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/mukaj_Llama-3.1-Hawkish-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.1-Hawkish-8B", - "id": "mukaj/Llama-3.1-Hawkish-8B", - "developer": "mukaj", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.672 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4884 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2432 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2903 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3967 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3331 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/natong19/Mistral-Nemo-Instruct-2407-abliterated/3e3cb617-6f19-4731-b31a-b1f4d88237d5.json b/data/hfopenllm_v2/natong19/Mistral-Nemo-Instruct-2407-abliterated/3e3cb617-6f19-4731-b31a-b1f4d88237d5.json deleted file mode 100644 index 8280be34b..000000000 --- a/data/hfopenllm_v2/natong19/Mistral-Nemo-Instruct-2407-abliterated/3e3cb617-6f19-4731-b31a-b1f4d88237d5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/natong19_Mistral-Nemo-Instruct-2407-abliterated/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral-Nemo-Instruct-2407-abliterated", - "id": "natong19/Mistral-Nemo-Instruct-2407-abliterated", - "developer": "natong19", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6392 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5048 - } - }, - { - "evaluation_name": "MATH Level 5", - 
"source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1322 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2869 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4033 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3518 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/natong19/Qwen2-7B-Instruct-abliterated/3c2c2c14-d065-4d6c-8c98-44ba8f2ca461.json b/data/hfopenllm_v2/natong19/Qwen2-7B-Instruct-abliterated/3c2c2c14-d065-4d6c-8c98-44ba8f2ca461.json deleted file mode 100644 index f21a3cf8b..000000000 --- a/data/hfopenllm_v2/natong19/Qwen2-7B-Instruct-abliterated/3c2c2c14-d065-4d6c-8c98-44ba8f2ca461.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/natong19_Qwen2-7B-Instruct-abliterated/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2-7B-Instruct-abliterated", - "id": "natong19/Qwen2-7B-Instruct-abliterated", - "developer": "natong19", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5837 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5553 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact 
Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2764 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3012 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4034 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3842 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nazimali/Mistral-Nemo-Kurdish-Instruct/8909f916-401b-4457-ab8f-2691696049c6.json b/data/hfopenllm_v2/nazimali/Mistral-Nemo-Kurdish-Instruct/8909f916-401b-4457-ab8f-2691696049c6.json deleted file mode 100644 index 642a4f5d2..000000000 --- a/data/hfopenllm_v2/nazimali/Mistral-Nemo-Kurdish-Instruct/8909f916-401b-4457-ab8f-2691696049c6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nazimali_Mistral-Nemo-Kurdish-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral-Nemo-Kurdish-Instruct", - "id": "nazimali/Mistral-Nemo-Kurdish-Instruct", - "developer": "nazimali", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4964 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4699 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0045 - } - }, - { - 
"evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2827 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3979 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3063 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nazimali/Mistral-Nemo-Kurdish-Instruct/ae191508-7dad-4cac-ad4a-af95d7a15b5d.json b/data/hfopenllm_v2/nazimali/Mistral-Nemo-Kurdish-Instruct/ae191508-7dad-4cac-ad4a-af95d7a15b5d.json deleted file mode 100644 index 030f58d03..000000000 --- a/data/hfopenllm_v2/nazimali/Mistral-Nemo-Kurdish-Instruct/ae191508-7dad-4cac-ad4a-af95d7a15b5d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nazimali_Mistral-Nemo-Kurdish-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral-Nemo-Kurdish-Instruct", - "id": "nazimali/Mistral-Nemo-Kurdish-Instruct", - "developer": "nazimali", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.486 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4721 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0846 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": 
"Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2844 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4006 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3087 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nazimali/Mistral-Nemo-Kurdish/507f5047-fac3-415f-b9fa-aae4311fa837.json b/data/hfopenllm_v2/nazimali/Mistral-Nemo-Kurdish/507f5047-fac3-415f-b9fa-aae4311fa837.json deleted file mode 100644 index ef98c1d8b..000000000 --- a/data/hfopenllm_v2/nazimali/Mistral-Nemo-Kurdish/507f5047-fac3-415f-b9fa-aae4311fa837.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nazimali_Mistral-Nemo-Kurdish/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral-Nemo-Kurdish", - "id": "nazimali/Mistral-Nemo-Kurdish", - "developer": "nazimali", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3401 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5133 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0959 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3012 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": 
"MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4116 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3235 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nbeerbower/BigKartoffel-mistral-nemo-20B/0ee8716c-74f0-41b4-94a2-efc715150293.json b/data/hfopenllm_v2/nbeerbower/BigKartoffel-mistral-nemo-20B/0ee8716c-74f0-41b4-94a2-efc715150293.json deleted file mode 100644 index b7f28eb06..000000000 --- a/data/hfopenllm_v2/nbeerbower/BigKartoffel-mistral-nemo-20B/0ee8716c-74f0-41b4-94a2-efc715150293.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nbeerbower_BigKartoffel-mistral-nemo-20B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "BigKartoffel-mistral-nemo-20B", - "id": "nbeerbower/BigKartoffel-mistral-nemo-20B", - "developer": "nbeerbower", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 20.427 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5857 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5515 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0264 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2869 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.428 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.353 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nbeerbower/DoppelKartoffel-Mistral-Nemo-23B/fcf491f4-cf57-4c95-9de1-4702ab5d54c7.json b/data/hfopenllm_v2/nbeerbower/DoppelKartoffel-Mistral-Nemo-23B/fcf491f4-cf57-4c95-9de1-4702ab5d54c7.json deleted file mode 100644 index 59fb04144..000000000 --- a/data/hfopenllm_v2/nbeerbower/DoppelKartoffel-Mistral-Nemo-23B/fcf491f4-cf57-4c95-9de1-4702ab5d54c7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nbeerbower_DoppelKartoffel-Mistral-Nemo-23B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "DoppelKartoffel-Mistral-Nemo-23B", - "id": "nbeerbower/DoppelKartoffel-Mistral-Nemo-23B", - "developer": "nbeerbower", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 23.153 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5191 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5218 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.031 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2752 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3795 - } - }, - { - "evaluation_name": "MMLU-PRO", - 
"source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.308 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nbeerbower/DoublePotato-Mistral-Nemo-13B/4fd20259-c7c7-4da5-9013-ae2feb2175b1.json b/data/hfopenllm_v2/nbeerbower/DoublePotato-Mistral-Nemo-13B/4fd20259-c7c7-4da5-9013-ae2feb2175b1.json deleted file mode 100644 index 27b871961..000000000 --- a/data/hfopenllm_v2/nbeerbower/DoublePotato-Mistral-Nemo-13B/4fd20259-c7c7-4da5-9013-ae2feb2175b1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nbeerbower_DoublePotato-Mistral-Nemo-13B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "DoublePotato-Mistral-Nemo-13B", - "id": "nbeerbower/DoublePotato-Mistral-Nemo-13B", - "developer": "nbeerbower", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 13.338 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6796 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5438 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.04 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3012 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.46 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on 
MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3596 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nbeerbower/Dumpling-Qwen2.5-1.5B/a7c8c345-cade-48fd-93c0-0f344044d2b5.json b/data/hfopenllm_v2/nbeerbower/Dumpling-Qwen2.5-1.5B/a7c8c345-cade-48fd-93c0-0f344044d2b5.json deleted file mode 100644 index cc0a30ad9..000000000 --- a/data/hfopenllm_v2/nbeerbower/Dumpling-Qwen2.5-1.5B/a7c8c345-cade-48fd-93c0-0f344044d2b5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nbeerbower_Dumpling-Qwen2.5-1.5B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Dumpling-Qwen2.5-1.5B", - "id": "nbeerbower/Dumpling-Qwen2.5-1.5B", - "developer": "nbeerbower", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.544 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3699 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.416 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1171 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2685 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3728 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2772 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/nbeerbower/Dumpling-Qwen2.5-14B/7a8e3986-7688-4a26-a74c-a9bb47cd3e8d.json b/data/hfopenllm_v2/nbeerbower/Dumpling-Qwen2.5-14B/7a8e3986-7688-4a26-a74c-a9bb47cd3e8d.json deleted file mode 100644 index 043eb77ba..000000000 --- a/data/hfopenllm_v2/nbeerbower/Dumpling-Qwen2.5-14B/7a8e3986-7688-4a26-a74c-a9bb47cd3e8d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nbeerbower_Dumpling-Qwen2.5-14B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Dumpling-Qwen2.5-14B", - "id": "nbeerbower/Dumpling-Qwen2.5-14B", - "developer": "nbeerbower", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6064 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6451 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3097 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3012 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4354 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.517 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nbeerbower/Dumpling-Qwen2.5-7B-1k-r16/7a2ffb4d-1135-42a1-b28b-3b4e4d014979.json b/data/hfopenllm_v2/nbeerbower/Dumpling-Qwen2.5-7B-1k-r16/7a2ffb4d-1135-42a1-b28b-3b4e4d014979.json deleted file mode 100644 index 
c9e94af62..000000000 --- a/data/hfopenllm_v2/nbeerbower/Dumpling-Qwen2.5-7B-1k-r16/7a2ffb4d-1135-42a1-b28b-3b4e4d014979.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nbeerbower_Dumpling-Qwen2.5-7B-1k-r16/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Dumpling-Qwen2.5-7B-1k-r16", - "id": "nbeerbower/Dumpling-Qwen2.5-7B-1k-r16", - "developer": "nbeerbower", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.486 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5214 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2364 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2701 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.423 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3959 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nbeerbower/Dumpling-Qwen2.5-7B-1k-r64-2e-5/25468720-93d7-4f10-a534-30c4976657e8.json b/data/hfopenllm_v2/nbeerbower/Dumpling-Qwen2.5-7B-1k-r64-2e-5/25468720-93d7-4f10-a534-30c4976657e8.json deleted file mode 100644 index f310a40a3..000000000 --- a/data/hfopenllm_v2/nbeerbower/Dumpling-Qwen2.5-7B-1k-r64-2e-5/25468720-93d7-4f10-a534-30c4976657e8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": 
"0.2.0", - "evaluation_id": "hfopenllm_v2/nbeerbower_Dumpling-Qwen2.5-7B-1k-r64-2e-5/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Dumpling-Qwen2.5-7B-1k-r64-2e-5", - "id": "nbeerbower/Dumpling-Qwen2.5-7B-1k-r64-2e-5", - "developer": "nbeerbower", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4179 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5301 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2115 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2701 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4486 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4122 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nbeerbower/EVA-abliterated-TIES-Qwen2.5-1.5B/5ba1d617-9d9a-4c3b-b9cc-3224ace129b3.json b/data/hfopenllm_v2/nbeerbower/EVA-abliterated-TIES-Qwen2.5-1.5B/5ba1d617-9d9a-4c3b-b9cc-3224ace129b3.json deleted file mode 100644 index d59eb6c26..000000000 --- a/data/hfopenllm_v2/nbeerbower/EVA-abliterated-TIES-Qwen2.5-1.5B/5ba1d617-9d9a-4c3b-b9cc-3224ace129b3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nbeerbower_EVA-abliterated-TIES-Qwen2.5-1.5B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - 
"source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "EVA-abliterated-TIES-Qwen2.5-1.5B", - "id": "nbeerbower/EVA-abliterated-TIES-Qwen2.5-1.5B", - "developer": "nbeerbower", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.777 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4115 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3997 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1375 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2651 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3502 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2712 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nbeerbower/EVA-abliterated-TIES-Qwen2.5-14B/27b2b46f-1323-4ddd-9f65-d8fcd9cd6508.json b/data/hfopenllm_v2/nbeerbower/EVA-abliterated-TIES-Qwen2.5-14B/27b2b46f-1323-4ddd-9f65-d8fcd9cd6508.json deleted file mode 100644 index af7d95ee5..000000000 --- a/data/hfopenllm_v2/nbeerbower/EVA-abliterated-TIES-Qwen2.5-14B/27b2b46f-1323-4ddd-9f65-d8fcd9cd6508.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nbeerbower_EVA-abliterated-TIES-Qwen2.5-14B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - 
"evaluator_relationship": "third_party" - }, - "model_info": { - "name": "EVA-abliterated-TIES-Qwen2.5-14B", - "id": "nbeerbower/EVA-abliterated-TIES-Qwen2.5-14B", - "developer": "nbeerbower", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7836 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6372 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5045 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3549 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4407 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5211 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nbeerbower/Flammades-Mistral-Nemo-12B/65917125-bb7c-4d64-ba5f-b5e4f67ec332.json b/data/hfopenllm_v2/nbeerbower/Flammades-Mistral-Nemo-12B/65917125-bb7c-4d64-ba5f-b5e4f67ec332.json deleted file mode 100644 index 1fe39793f..000000000 --- a/data/hfopenllm_v2/nbeerbower/Flammades-Mistral-Nemo-12B/65917125-bb7c-4d64-ba5f-b5e4f67ec332.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nbeerbower_Flammades-Mistral-Nemo-12B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Flammades-Mistral-Nemo-12B", - "id": "nbeerbower/Flammades-Mistral-Nemo-12B", - "developer": "nbeerbower", - 
"inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3842 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.53 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0755 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3037 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4806 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3661 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nbeerbower/Gemma2-Gutenberg-Doppel-9B/30bf22d8-b93a-4775-8073-30e14e15e35d.json b/data/hfopenllm_v2/nbeerbower/Gemma2-Gutenberg-Doppel-9B/30bf22d8-b93a-4775-8073-30e14e15e35d.json deleted file mode 100644 index 4dfbf3259..000000000 --- a/data/hfopenllm_v2/nbeerbower/Gemma2-Gutenberg-Doppel-9B/30bf22d8-b93a-4775-8073-30e14e15e35d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nbeerbower_Gemma2-Gutenberg-Doppel-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gemma2-Gutenberg-Doppel-9B", - "id": "nbeerbower/Gemma2-Gutenberg-Doppel-9B", - "developer": "nbeerbower", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 9.242 - } - }, - "evaluation_results": [ - { - 
"evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7171 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.587 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1979 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3297 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4608 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4127 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nbeerbower/Gutensuppe-mistral-nemo-12B/ff510365-a13d-4e44-9709-59a56e864991.json b/data/hfopenllm_v2/nbeerbower/Gutensuppe-mistral-nemo-12B/ff510365-a13d-4e44-9709-59a56e864991.json deleted file mode 100644 index 0a1e89534..000000000 --- a/data/hfopenllm_v2/nbeerbower/Gutensuppe-mistral-nemo-12B/ff510365-a13d-4e44-9709-59a56e864991.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nbeerbower_Gutensuppe-mistral-nemo-12B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gutensuppe-mistral-nemo-12B", - "id": "nbeerbower/Gutensuppe-mistral-nemo-12B", - "developer": "nbeerbower", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": 
"Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2916 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5487 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1329 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3372 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.429 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.368 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nbeerbower/Hermes2-Gutenberg2-Mistral-7B/6d1eebc4-228b-43f3-b31c-3d5b1591ae2d.json b/data/hfopenllm_v2/nbeerbower/Hermes2-Gutenberg2-Mistral-7B/6d1eebc4-228b-43f3-b31c-3d5b1591ae2d.json deleted file mode 100644 index edbb8c718..000000000 --- a/data/hfopenllm_v2/nbeerbower/Hermes2-Gutenberg2-Mistral-7B/6d1eebc4-228b-43f3-b31c-3d5b1591ae2d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nbeerbower_Hermes2-Gutenberg2-Mistral-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Hermes2-Gutenberg2-Mistral-7B", - "id": "nbeerbower/Hermes2-Gutenberg2-Mistral-7B", - "developer": "nbeerbower", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3721 - } - }, - { - 
"evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4981 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0574 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2894 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4623 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2993 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nbeerbower/Kartoffel-Deepfry-12B/f1e8cdbb-14b7-4959-a053-fb1b37629aff.json b/data/hfopenllm_v2/nbeerbower/Kartoffel-Deepfry-12B/f1e8cdbb-14b7-4959-a053-fb1b37629aff.json deleted file mode 100644 index 318400868..000000000 --- a/data/hfopenllm_v2/nbeerbower/Kartoffel-Deepfry-12B/f1e8cdbb-14b7-4959-a053-fb1b37629aff.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nbeerbower_Kartoffel-Deepfry-12B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Kartoffel-Deepfry-12B", - "id": "nbeerbower/Kartoffel-Deepfry-12B", - "developer": "nbeerbower", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5022 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": 
false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5365 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0604 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2961 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4792 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3582 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nbeerbower/Llama-3.1-Nemotron-lorablated-70B/4145d1a0-8d6a-4d64-8a45-a89cf343ac46.json b/data/hfopenllm_v2/nbeerbower/Llama-3.1-Nemotron-lorablated-70B/4145d1a0-8d6a-4d64-8a45-a89cf343ac46.json deleted file mode 100644 index 254d39ab3..000000000 --- a/data/hfopenllm_v2/nbeerbower/Llama-3.1-Nemotron-lorablated-70B/4145d1a0-8d6a-4d64-8a45-a89cf343ac46.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nbeerbower_Llama-3.1-Nemotron-lorablated-70B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.1-Nemotron-lorablated-70B", - "id": "nbeerbower/Llama-3.1-Nemotron-lorablated-70B", - "developer": "nbeerbower", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 70.554 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7229 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6825 - } - }, - { - "evaluation_name": 
"MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3338 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3909 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4682 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5343 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nbeerbower/Llama3.1-Gutenberg-Doppel-70B/d6966190-e254-4902-8472-cac59bfbdbe0.json b/data/hfopenllm_v2/nbeerbower/Llama3.1-Gutenberg-Doppel-70B/d6966190-e254-4902-8472-cac59bfbdbe0.json deleted file mode 100644 index 348fd047b..000000000 --- a/data/hfopenllm_v2/nbeerbower/Llama3.1-Gutenberg-Doppel-70B/d6966190-e254-4902-8472-cac59bfbdbe0.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nbeerbower_Llama3.1-Gutenberg-Doppel-70B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama3.1-Gutenberg-Doppel-70B", - "id": "nbeerbower/Llama3.1-Gutenberg-Doppel-70B", - "developer": "nbeerbower", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 70.554 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7092 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6661 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - 
"evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2122 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3448 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4897 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4737 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nbeerbower/Lyra-Gutenberg-mistral-nemo-12B/5fdb5437-f413-451d-9800-42036cda7686.json b/data/hfopenllm_v2/nbeerbower/Lyra-Gutenberg-mistral-nemo-12B/5fdb5437-f413-451d-9800-42036cda7686.json deleted file mode 100644 index 16901bff0..000000000 --- a/data/hfopenllm_v2/nbeerbower/Lyra-Gutenberg-mistral-nemo-12B/5fdb5437-f413-451d-9800-42036cda7686.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nbeerbower_Lyra-Gutenberg-mistral-nemo-12B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Lyra-Gutenberg-mistral-nemo-12B", - "id": "nbeerbower/Lyra-Gutenberg-mistral-nemo-12B", - "developer": "nbeerbower", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3495 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5586 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.1012 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3339 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4357 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3628 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nbeerbower/Lyra4-Gutenberg-12B/347577a4-2768-4472-ba48-9b174ad89724.json b/data/hfopenllm_v2/nbeerbower/Lyra4-Gutenberg-12B/347577a4-2768-4472-ba48-9b174ad89724.json deleted file mode 100644 index fb5f4a0df..000000000 --- a/data/hfopenllm_v2/nbeerbower/Lyra4-Gutenberg-12B/347577a4-2768-4472-ba48-9b174ad89724.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nbeerbower_Lyra4-Gutenberg-12B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Lyra4-Gutenberg-12B", - "id": "nbeerbower/Lyra4-Gutenberg-12B", - "developer": "nbeerbower", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2212 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5387 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1299 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3188 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4038 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3571 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nbeerbower/Lyra4-Gutenberg2-12B/33af440e-837d-4454-9340-af0d3ee74f77.json b/data/hfopenllm_v2/nbeerbower/Lyra4-Gutenberg2-12B/33af440e-837d-4454-9340-af0d3ee74f77.json deleted file mode 100644 index 6dfbf9b42..000000000 --- a/data/hfopenllm_v2/nbeerbower/Lyra4-Gutenberg2-12B/33af440e-837d-4454-9340-af0d3ee74f77.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nbeerbower_Lyra4-Gutenberg2-12B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Lyra4-Gutenberg2-12B", - "id": "nbeerbower/Lyra4-Gutenberg2-12B", - "developer": "nbeerbower", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2585 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5345 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1171 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3129 - } - }, - { - "evaluation_name": "MUSR", - 
"source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3972 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3565 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nbeerbower/Mahou-1.5-mistral-nemo-12B-lorablated/1a1f4709-8d05-4905-8105-0c3606d5ef5b.json b/data/hfopenllm_v2/nbeerbower/Mahou-1.5-mistral-nemo-12B-lorablated/1a1f4709-8d05-4905-8105-0c3606d5ef5b.json deleted file mode 100644 index 5eb89a5fa..000000000 --- a/data/hfopenllm_v2/nbeerbower/Mahou-1.5-mistral-nemo-12B-lorablated/1a1f4709-8d05-4905-8105-0c3606d5ef5b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nbeerbower_Mahou-1.5-mistral-nemo-12B-lorablated/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mahou-1.5-mistral-nemo-12B-lorablated", - "id": "nbeerbower/Mahou-1.5-mistral-nemo-12B-lorablated", - "developer": "nbeerbower", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6825 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5496 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0891 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2794 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4522 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3574 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nbeerbower/Mistral-Gutenberg-Doppel-7B-FFT/28421948-089b-4487-bb71-a06e5ce74402.json b/data/hfopenllm_v2/nbeerbower/Mistral-Gutenberg-Doppel-7B-FFT/28421948-089b-4487-bb71-a06e5ce74402.json deleted file mode 100644 index dd9733de3..000000000 --- a/data/hfopenllm_v2/nbeerbower/Mistral-Gutenberg-Doppel-7B-FFT/28421948-089b-4487-bb71-a06e5ce74402.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nbeerbower_Mistral-Gutenberg-Doppel-7B-FFT/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral-Gutenberg-Doppel-7B-FFT", - "id": "nbeerbower/Mistral-Gutenberg-Doppel-7B-FFT", - "developer": "nbeerbower", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5717 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4076 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0249 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2836 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": 
{ - "score": 0.4059 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2729 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nbeerbower/Mistral-Nemo-Gutenberg-Doppel-12B-v2/3fa0c783-9226-4fc8-b3a0-6e960684f43d.json b/data/hfopenllm_v2/nbeerbower/Mistral-Nemo-Gutenberg-Doppel-12B-v2/3fa0c783-9226-4fc8-b3a0-6e960684f43d.json deleted file mode 100644 index 40710b344..000000000 --- a/data/hfopenllm_v2/nbeerbower/Mistral-Nemo-Gutenberg-Doppel-12B-v2/3fa0c783-9226-4fc8-b3a0-6e960684f43d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nbeerbower_Mistral-Nemo-Gutenberg-Doppel-12B-v2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral-Nemo-Gutenberg-Doppel-12B-v2", - "id": "nbeerbower/Mistral-Nemo-Gutenberg-Doppel-12B-v2", - "developer": "nbeerbower", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6536 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5374 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1156 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.271 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4233 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": 
"hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3546 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nbeerbower/Mistral-Nemo-Gutenberg-Doppel-12B/743b7fe2-f998-408c-98b1-af02d9c1ee2a.json b/data/hfopenllm_v2/nbeerbower/Mistral-Nemo-Gutenberg-Doppel-12B/743b7fe2-f998-408c-98b1-af02d9c1ee2a.json deleted file mode 100644 index 0895b9113..000000000 --- a/data/hfopenllm_v2/nbeerbower/Mistral-Nemo-Gutenberg-Doppel-12B/743b7fe2-f998-408c-98b1-af02d9c1ee2a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nbeerbower_Mistral-Nemo-Gutenberg-Doppel-12B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral-Nemo-Gutenberg-Doppel-12B", - "id": "nbeerbower/Mistral-Nemo-Gutenberg-Doppel-12B", - "developer": "nbeerbower", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3567 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5275 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1216 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3163 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4132 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, 
- "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3579 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nbeerbower/Mistral-Nemo-Moderne-12B-FFT-experimental/0039c88b-a881-4ce0-9a0a-a10f1a8cbc70.json b/data/hfopenllm_v2/nbeerbower/Mistral-Nemo-Moderne-12B-FFT-experimental/0039c88b-a881-4ce0-9a0a-a10f1a8cbc70.json deleted file mode 100644 index 361dfbd4d..000000000 --- a/data/hfopenllm_v2/nbeerbower/Mistral-Nemo-Moderne-12B-FFT-experimental/0039c88b-a881-4ce0-9a0a-a10f1a8cbc70.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nbeerbower_Mistral-Nemo-Moderne-12B-FFT-experimental/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral-Nemo-Moderne-12B-FFT-experimental", - "id": "nbeerbower/Mistral-Nemo-Moderne-12B-FFT-experimental", - "developer": "nbeerbower", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3352 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5234 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.077 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.281 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3715 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 
0.3455 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nbeerbower/Mistral-Nemo-Prism-12B-v2/87c7fbd9-7648-4d0d-ac9e-8ba85860e335.json b/data/hfopenllm_v2/nbeerbower/Mistral-Nemo-Prism-12B-v2/87c7fbd9-7648-4d0d-ac9e-8ba85860e335.json deleted file mode 100644 index 85ba56e3d..000000000 --- a/data/hfopenllm_v2/nbeerbower/Mistral-Nemo-Prism-12B-v2/87c7fbd9-7648-4d0d-ac9e-8ba85860e335.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nbeerbower_Mistral-Nemo-Prism-12B-v2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral-Nemo-Prism-12B-v2", - "id": "nbeerbower/Mistral-Nemo-Prism-12B-v2", - "developer": "nbeerbower", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6974 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5492 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0891 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3054 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.46 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3567 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nbeerbower/Mistral-Nemo-Prism-12B-v7/6ca3ab87-c05e-46b5-879d-4fc8bf75417b.json 
b/data/hfopenllm_v2/nbeerbower/Mistral-Nemo-Prism-12B-v7/6ca3ab87-c05e-46b5-879d-4fc8bf75417b.json deleted file mode 100644 index 4e58c2e14..000000000 --- a/data/hfopenllm_v2/nbeerbower/Mistral-Nemo-Prism-12B-v7/6ca3ab87-c05e-46b5-879d-4fc8bf75417b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nbeerbower_Mistral-Nemo-Prism-12B-v7/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral-Nemo-Prism-12B-v7", - "id": "nbeerbower/Mistral-Nemo-Prism-12B-v7", - "developer": "nbeerbower", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6962 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5521 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0869 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2995 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4639 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.359 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nbeerbower/Mistral-Nemo-Prism-12B/525f1b9f-88a2-459d-bb4a-7c01a0107968.json b/data/hfopenllm_v2/nbeerbower/Mistral-Nemo-Prism-12B/525f1b9f-88a2-459d-bb4a-7c01a0107968.json deleted file mode 100644 index 8eb277f61..000000000 --- 
a/data/hfopenllm_v2/nbeerbower/Mistral-Nemo-Prism-12B/525f1b9f-88a2-459d-bb4a-7c01a0107968.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nbeerbower_Mistral-Nemo-Prism-12B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral-Nemo-Prism-12B", - "id": "nbeerbower/Mistral-Nemo-Prism-12B", - "developer": "nbeerbower", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6858 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5475 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0869 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3079 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4626 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3581 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nbeerbower/Mistral-Small-Drummer-22B/503f79be-7f05-4464-ac9f-0f284f1e7965.json b/data/hfopenllm_v2/nbeerbower/Mistral-Small-Drummer-22B/503f79be-7f05-4464-ac9f-0f284f1e7965.json deleted file mode 100644 index f6ad8dc3b..000000000 --- a/data/hfopenllm_v2/nbeerbower/Mistral-Small-Drummer-22B/503f79be-7f05-4464-ac9f-0f284f1e7965.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/nbeerbower_Mistral-Small-Drummer-22B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral-Small-Drummer-22B", - "id": "nbeerbower/Mistral-Small-Drummer-22B", - "developer": "nbeerbower", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 22.247 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6331 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5793 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1888 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3431 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4064 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4095 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nbeerbower/Mistral-Small-Gutenberg-Doppel-22B/86ec7d95-6f6d-4ca6-97d5-7a910f42a06d.json b/data/hfopenllm_v2/nbeerbower/Mistral-Small-Gutenberg-Doppel-22B/86ec7d95-6f6d-4ca6-97d5-7a910f42a06d.json deleted file mode 100644 index ade724a65..000000000 --- a/data/hfopenllm_v2/nbeerbower/Mistral-Small-Gutenberg-Doppel-22B/86ec7d95-6f6d-4ca6-97d5-7a910f42a06d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nbeerbower_Mistral-Small-Gutenberg-Doppel-22B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open 
LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral-Small-Gutenberg-Doppel-22B", - "id": "nbeerbower/Mistral-Small-Gutenberg-Doppel-22B", - "developer": "nbeerbower", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 22.247 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4893 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5859 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2183 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3465 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3971 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4124 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nbeerbower/Nemo-Loony-12B-experimental/d472ba79-6592-4f8a-a99c-ec3f71468d3e.json b/data/hfopenllm_v2/nbeerbower/Nemo-Loony-12B-experimental/d472ba79-6592-4f8a-a99c-ec3f71468d3e.json deleted file mode 100644 index cdcb586d6..000000000 --- a/data/hfopenllm_v2/nbeerbower/Nemo-Loony-12B-experimental/d472ba79-6592-4f8a-a99c-ec3f71468d3e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nbeerbower_Nemo-Loony-12B-experimental/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": 
"Nemo-Loony-12B-experimental", - "id": "nbeerbower/Nemo-Loony-12B-experimental", - "developer": "nbeerbower", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3734 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3822 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0151 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2701 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3341 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1589 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nbeerbower/Nemoties-ChatML-12B/6ddc052c-6bda-4d8e-ad97-20d881c8cfb7.json b/data/hfopenllm_v2/nbeerbower/Nemoties-ChatML-12B/6ddc052c-6bda-4d8e-ad97-20d881c8cfb7.json deleted file mode 100644 index 2c18c82d6..000000000 --- a/data/hfopenllm_v2/nbeerbower/Nemoties-ChatML-12B/6ddc052c-6bda-4d8e-ad97-20d881c8cfb7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nbeerbower_Nemoties-ChatML-12B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Nemoties-ChatML-12B", - "id": "nbeerbower/Nemoties-ChatML-12B", - "developer": "nbeerbower", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", 
- "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6382 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.547 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0785 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.297 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4509 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3551 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nbeerbower/Qwen2.5-Gutenberg-Doppel-14B/76d1aed8-80fe-4b4f-bd81-ea0d6bf085c4.json b/data/hfopenllm_v2/nbeerbower/Qwen2.5-Gutenberg-Doppel-14B/76d1aed8-80fe-4b4f-bd81-ea0d6bf085c4.json deleted file mode 100644 index c994ceaa2..000000000 --- a/data/hfopenllm_v2/nbeerbower/Qwen2.5-Gutenberg-Doppel-14B/76d1aed8-80fe-4b4f-bd81-ea0d6bf085c4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nbeerbower_Qwen2.5-Gutenberg-Doppel-14B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-Gutenberg-Doppel-14B", - "id": "nbeerbower/Qwen2.5-Gutenberg-Doppel-14B", - "developer": "nbeerbower", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": 
"google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8091 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6382 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5415 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3331 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4101 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4921 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nbeerbower/SmolNemo-12B-FFT-experimental/d2845d6e-65dd-4448-901d-d554b3e741f3.json b/data/hfopenllm_v2/nbeerbower/SmolNemo-12B-FFT-experimental/d2845d6e-65dd-4448-901d-d554b3e741f3.json deleted file mode 100644 index baa82798c..000000000 --- a/data/hfopenllm_v2/nbeerbower/SmolNemo-12B-FFT-experimental/d2845d6e-65dd-4448-901d-d554b3e741f3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nbeerbower_SmolNemo-12B-FFT-experimental/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SmolNemo-12B-FFT-experimental", - "id": "nbeerbower/SmolNemo-12B-FFT-experimental", - "developer": "nbeerbower", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.3348 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3336 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0128 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2601 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3847 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1217 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nbeerbower/Stella-mistral-nemo-12B-v2/f7dd203f-24d8-4875-878a-12ed99e20cd3.json b/data/hfopenllm_v2/nbeerbower/Stella-mistral-nemo-12B-v2/f7dd203f-24d8-4875-878a-12ed99e20cd3.json deleted file mode 100644 index 2d27ecb57..000000000 --- a/data/hfopenllm_v2/nbeerbower/Stella-mistral-nemo-12B-v2/f7dd203f-24d8-4875-878a-12ed99e20cd3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nbeerbower_Stella-mistral-nemo-12B-v2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Stella-mistral-nemo-12B-v2", - "id": "nbeerbower/Stella-mistral-nemo-12B-v2", - "developer": "nbeerbower", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3274 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": 
"SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5484 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1163 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3322 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4304 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3684 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nbeerbower/gemma2-gutenberg-27B/287ae246-bee5-4fae-b78f-203491aa8df2.json b/data/hfopenllm_v2/nbeerbower/gemma2-gutenberg-27B/287ae246-bee5-4fae-b78f-203491aa8df2.json deleted file mode 100644 index 883e02d15..000000000 --- a/data/hfopenllm_v2/nbeerbower/gemma2-gutenberg-27B/287ae246-bee5-4fae-b78f-203491aa8df2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nbeerbower_gemma2-gutenberg-27B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "gemma2-gutenberg-27B", - "id": "nbeerbower/gemma2-gutenberg-27B", - "developer": "nbeerbower", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 27.227 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2947 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3797 - 
} - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0189 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2727 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3727 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1982 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nbeerbower/gemma2-gutenberg-9B/9ee493f7-e031-4593-beae-65be17678e00.json b/data/hfopenllm_v2/nbeerbower/gemma2-gutenberg-9B/9ee493f7-e031-4593-beae-65be17678e00.json deleted file mode 100644 index 49d199adb..000000000 --- a/data/hfopenllm_v2/nbeerbower/gemma2-gutenberg-9B/9ee493f7-e031-4593-beae-65be17678e00.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nbeerbower_gemma2-gutenberg-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "gemma2-gutenberg-9B", - "id": "nbeerbower/gemma2-gutenberg-9B", - "developer": "nbeerbower", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 9.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2796 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5951 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": 
"Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0808 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3381 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4595 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4192 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nbeerbower/llama-3-gutenberg-8B/86b10c6f-41c6-4d0a-ae59-f90e204e466c.json b/data/hfopenllm_v2/nbeerbower/llama-3-gutenberg-8B/86b10c6f-41c6-4d0a-ae59-f90e204e466c.json deleted file mode 100644 index 787cd3d0f..000000000 --- a/data/hfopenllm_v2/nbeerbower/llama-3-gutenberg-8B/86b10c6f-41c6-4d0a-ae59-f90e204e466c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nbeerbower_llama-3-gutenberg-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "llama-3-gutenberg-8B", - "id": "nbeerbower/llama-3-gutenberg-8B", - "developer": "nbeerbower", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4372 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4994 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0785 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - 
"dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3012 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4073 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3831 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nbeerbower/llama3.1-cc-8B/043e3533-7d5c-4d45-bcd8-0dbcc8ca4819.json b/data/hfopenllm_v2/nbeerbower/llama3.1-cc-8B/043e3533-7d5c-4d45-bcd8-0dbcc8ca4819.json deleted file mode 100644 index 17baa63e8..000000000 --- a/data/hfopenllm_v2/nbeerbower/llama3.1-cc-8B/043e3533-7d5c-4d45-bcd8-0dbcc8ca4819.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nbeerbower_llama3.1-cc-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "llama3.1-cc-8B", - "id": "nbeerbower/llama3.1-cc-8B", - "developer": "nbeerbower", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5068 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4871 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.071 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.2852 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3885 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3347 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nbeerbower/llama3.1-kartoffeldes-70B/1b3269fb-4b16-42b6-80c0-3d54bc2b4fed.json b/data/hfopenllm_v2/nbeerbower/llama3.1-kartoffeldes-70B/1b3269fb-4b16-42b6-80c0-3d54bc2b4fed.json deleted file mode 100644 index ed92bd808..000000000 --- a/data/hfopenllm_v2/nbeerbower/llama3.1-kartoffeldes-70B/1b3269fb-4b16-42b6-80c0-3d54bc2b4fed.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nbeerbower_llama3.1-kartoffeldes-70B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "llama3.1-kartoffeldes-70B", - "id": "nbeerbower/llama3.1-kartoffeldes-70B", - "developer": "nbeerbower", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 70.554 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.823 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6894 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3218 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3515 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4646 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4988 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nbeerbower/mistral-nemo-bophades-12B/ee625c29-62c4-49da-9790-e7e67233157d.json b/data/hfopenllm_v2/nbeerbower/mistral-nemo-bophades-12B/ee625c29-62c4-49da-9790-e7e67233157d.json deleted file mode 100644 index 75703bbb6..000000000 --- a/data/hfopenllm_v2/nbeerbower/mistral-nemo-bophades-12B/ee625c29-62c4-49da-9790-e7e67233157d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nbeerbower_mistral-nemo-bophades-12B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "mistral-nemo-bophades-12B", - "id": "nbeerbower/mistral-nemo-bophades-12B", - "developer": "nbeerbower", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6794 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4988 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1231 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2852 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 
0.4178 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3501 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nbeerbower/mistral-nemo-bophades3-12B/02b16bf2-62bb-401e-9726-2135d8d610be.json b/data/hfopenllm_v2/nbeerbower/mistral-nemo-bophades3-12B/02b16bf2-62bb-401e-9726-2135d8d610be.json deleted file mode 100644 index c34d6ed08..000000000 --- a/data/hfopenllm_v2/nbeerbower/mistral-nemo-bophades3-12B/02b16bf2-62bb-401e-9726-2135d8d610be.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nbeerbower_mistral-nemo-bophades3-12B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "mistral-nemo-bophades3-12B", - "id": "nbeerbower/mistral-nemo-bophades3-12B", - "developer": "nbeerbower", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6578 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5449 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0846 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3121 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4604 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3371 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nbeerbower/mistral-nemo-cc-12B/db10c6f9-2962-46cc-aa4e-4c99c4b494d1.json b/data/hfopenllm_v2/nbeerbower/mistral-nemo-cc-12B/db10c6f9-2962-46cc-aa4e-4c99c4b494d1.json deleted file mode 100644 index 25981a535..000000000 --- a/data/hfopenllm_v2/nbeerbower/mistral-nemo-cc-12B/db10c6f9-2962-46cc-aa4e-4c99c4b494d1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nbeerbower_mistral-nemo-cc-12B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "mistral-nemo-cc-12B", - "id": "nbeerbower/mistral-nemo-cc-12B", - "developer": "nbeerbower", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1435 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5399 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0257 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3154 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4424 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3598 - } - } - ] -} \ No newline at end of file diff 
--git a/data/hfopenllm_v2/nbeerbower/mistral-nemo-gutades-12B/aa37bda0-2e0a-4361-a5b4-468154d8ac72.json b/data/hfopenllm_v2/nbeerbower/mistral-nemo-gutades-12B/aa37bda0-2e0a-4361-a5b4-468154d8ac72.json deleted file mode 100644 index 4cebf3ef7..000000000 --- a/data/hfopenllm_v2/nbeerbower/mistral-nemo-gutades-12B/aa37bda0-2e0a-4361-a5b4-468154d8ac72.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nbeerbower_mistral-nemo-gutades-12B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "mistral-nemo-gutades-12B", - "id": "nbeerbower/mistral-nemo-gutades-12B", - "developer": "nbeerbower", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3425 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5407 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1178 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3154 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.404 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3561 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nbeerbower/mistral-nemo-gutenberg-12B-v2/d9a6565c-5a0b-4893-b6e0-1fc52ec55bf5.json 
b/data/hfopenllm_v2/nbeerbower/mistral-nemo-gutenberg-12B-v2/d9a6565c-5a0b-4893-b6e0-1fc52ec55bf5.json deleted file mode 100644 index f920c7857..000000000 --- a/data/hfopenllm_v2/nbeerbower/mistral-nemo-gutenberg-12B-v2/d9a6565c-5a0b-4893-b6e0-1fc52ec55bf5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nbeerbower_mistral-nemo-gutenberg-12B-v2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "mistral-nemo-gutenberg-12B-v2", - "id": "nbeerbower/mistral-nemo-gutenberg-12B-v2", - "developer": "nbeerbower", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6203 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5397 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1088 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2777 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4287 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3499 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nbeerbower/mistral-nemo-gutenberg-12B-v3/becf9805-83a9-4137-a938-81a61a10e4f0.json b/data/hfopenllm_v2/nbeerbower/mistral-nemo-gutenberg-12B-v3/becf9805-83a9-4137-a938-81a61a10e4f0.json deleted file mode 100644 index b760c9b8c..000000000 --- 
a/data/hfopenllm_v2/nbeerbower/mistral-nemo-gutenberg-12B-v3/becf9805-83a9-4137-a938-81a61a10e4f0.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nbeerbower_mistral-nemo-gutenberg-12B-v3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "mistral-nemo-gutenberg-12B-v3", - "id": "nbeerbower/mistral-nemo-gutenberg-12B-v3", - "developer": "nbeerbower", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2183 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5441 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0597 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3146 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.445 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3644 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nbeerbower/mistral-nemo-gutenberg-12B-v4/6e848120-bc31-4628-af05-30707a6dcc41.json b/data/hfopenllm_v2/nbeerbower/mistral-nemo-gutenberg-12B-v4/6e848120-bc31-4628-af05-30707a6dcc41.json deleted file mode 100644 index ab9d229a4..000000000 --- a/data/hfopenllm_v2/nbeerbower/mistral-nemo-gutenberg-12B-v4/6e848120-bc31-4628-af05-30707a6dcc41.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - 
"evaluation_id": "hfopenllm_v2/nbeerbower_mistral-nemo-gutenberg-12B-v4/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "mistral-nemo-gutenberg-12B-v4", - "id": "nbeerbower/mistral-nemo-gutenberg-12B-v4", - "developer": "nbeerbower", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2379 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5269 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1261 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3163 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4104 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3575 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nbeerbower/mistral-nemo-gutenberg-12B/864af855-71b0-4b11-ae3f-56294a7d0db9.json b/data/hfopenllm_v2/nbeerbower/mistral-nemo-gutenberg-12B/864af855-71b0-4b11-ae3f-56294a7d0db9.json deleted file mode 100644 index c29fbdb15..000000000 --- a/data/hfopenllm_v2/nbeerbower/mistral-nemo-gutenberg-12B/864af855-71b0-4b11-ae3f-56294a7d0db9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nbeerbower_mistral-nemo-gutenberg-12B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM 
v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "mistral-nemo-gutenberg-12B", - "id": "nbeerbower/mistral-nemo-gutenberg-12B", - "developer": "nbeerbower", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3504 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5281 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1163 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.307 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4171 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3562 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nbeerbower/mistral-nemo-gutenberg2-12B-test/285bd390-1dd9-4db2-af45-68dea557da3c.json b/data/hfopenllm_v2/nbeerbower/mistral-nemo-gutenberg2-12B-test/285bd390-1dd9-4db2-af45-68dea557da3c.json deleted file mode 100644 index dc74a419a..000000000 --- a/data/hfopenllm_v2/nbeerbower/mistral-nemo-gutenberg2-12B-test/285bd390-1dd9-4db2-af45-68dea557da3c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nbeerbower_mistral-nemo-gutenberg2-12B-test/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": 
"mistral-nemo-gutenberg2-12B-test", - "id": "nbeerbower/mistral-nemo-gutenberg2-12B-test", - "developer": "nbeerbower", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3385 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5255 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1163 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3171 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4157 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3555 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nbeerbower/mistral-nemo-kartoffel-12B/459e2375-1a15-4129-bee0-dc8852d531e2.json b/data/hfopenllm_v2/nbeerbower/mistral-nemo-kartoffel-12B/459e2375-1a15-4129-bee0-dc8852d531e2.json deleted file mode 100644 index 0d4219fba..000000000 --- a/data/hfopenllm_v2/nbeerbower/mistral-nemo-kartoffel-12B/459e2375-1a15-4129-bee0-dc8852d531e2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nbeerbower_mistral-nemo-kartoffel-12B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "mistral-nemo-kartoffel-12B", - "id": "nbeerbower/mistral-nemo-kartoffel-12B", - "developer": "nbeerbower", - "inference_platform": "unknown", - "additional_details": { - "precision": 
"bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7032 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5484 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0853 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3045 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4653 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3585 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nbeerbower/mistral-nemo-narwhal-12B/7b4c7d92-f581-4057-bec9-e3a8c6a5386e.json b/data/hfopenllm_v2/nbeerbower/mistral-nemo-narwhal-12B/7b4c7d92-f581-4057-bec9-e3a8c6a5386e.json deleted file mode 100644 index d17ad6414..000000000 --- a/data/hfopenllm_v2/nbeerbower/mistral-nemo-narwhal-12B/7b4c7d92-f581-4057-bec9-e3a8c6a5386e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nbeerbower_mistral-nemo-narwhal-12B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "mistral-nemo-narwhal-12B", - "id": "nbeerbower/mistral-nemo-narwhal-12B", - "developer": "nbeerbower", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - 
"source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5549 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5057 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0582 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.271 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3847 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3483 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nbeerbower/mistral-nemo-wissenschaft-12B/7ceab841-f9a3-455b-9314-243d8fc3cd11.json b/data/hfopenllm_v2/nbeerbower/mistral-nemo-wissenschaft-12B/7ceab841-f9a3-455b-9314-243d8fc3cd11.json deleted file mode 100644 index 5c64e8342..000000000 --- a/data/hfopenllm_v2/nbeerbower/mistral-nemo-wissenschaft-12B/7ceab841-f9a3-455b-9314-243d8fc3cd11.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nbeerbower_mistral-nemo-wissenschaft-12B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "mistral-nemo-wissenschaft-12B", - "id": "nbeerbower/mistral-nemo-wissenschaft-12B", - "developer": "nbeerbower", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.652 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.504 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1216 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2928 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4178 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3532 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nbrahme/IndusQ/c1e2fb45-22d8-4eb4-8971-ce89c3048b9e.json b/data/hfopenllm_v2/nbrahme/IndusQ/c1e2fb45-22d8-4eb4-8971-ce89c3048b9e.json deleted file mode 100644 index 21a04901a..000000000 --- a/data/hfopenllm_v2/nbrahme/IndusQ/c1e2fb45-22d8-4eb4-8971-ce89c3048b9e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nbrahme_IndusQ/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "IndusQ", - "id": "nbrahme/IndusQ", - "developer": "nbrahme", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "GPT2LMHeadModel", - "params_billions": 1.176 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.244 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": 
false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3062 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0008 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2651 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3366 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.112 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/necva/IE-cont-Llama3.1-8B/68cb2ca1-1648-41a2-92b7-969bccdca4ee.json b/data/hfopenllm_v2/necva/IE-cont-Llama3.1-8B/68cb2ca1-1648-41a2-92b7-969bccdca4ee.json deleted file mode 100644 index baefc2323..000000000 --- a/data/hfopenllm_v2/necva/IE-cont-Llama3.1-8B/68cb2ca1-1648-41a2-92b7-969bccdca4ee.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/necva_IE-cont-Llama3.1-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "IE-cont-Llama3.1-8B", - "id": "necva/IE-cont-Llama3.1-8B", - "developer": "necva", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2049 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2912 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": 
"DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2601 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3575 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1167 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/necva/replica-IEPile/5f285d61-5e4b-4c5c-8960-c10313d76ae3.json b/data/hfopenllm_v2/necva/replica-IEPile/5f285d61-5e4b-4c5c-8960-c10313d76ae3.json deleted file mode 100644 index 065eb3646..000000000 --- a/data/hfopenllm_v2/necva/replica-IEPile/5f285d61-5e4b-4c5c-8960-c10313d76ae3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/necva_replica-IEPile/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "replica-IEPile", - "id": "necva/replica-IEPile", - "developer": "necva", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 4.65 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4678 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4779 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1239 - } - }, - { - "evaluation_name": "GPQA", - 
"source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3062 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3998 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3561 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/neopolita/jessi-v0.1-bf16-falcon3-7b-instruct/3af19898-8590-4aec-b324-46c7fbf596d3.json b/data/hfopenllm_v2/neopolita/jessi-v0.1-bf16-falcon3-7b-instruct/3af19898-8590-4aec-b324-46c7fbf596d3.json deleted file mode 100644 index a9a8b3aca..000000000 --- a/data/hfopenllm_v2/neopolita/jessi-v0.1-bf16-falcon3-7b-instruct/3af19898-8590-4aec-b324-46c7fbf596d3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/neopolita_jessi-v0.1-bf16-falcon3-7b-instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "jessi-v0.1-bf16-falcon3-7b-instruct", - "id": "neopolita/jessi-v0.1-bf16-falcon3-7b-instruct", - "developer": "neopolita", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 7.456 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7527 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5516 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3807 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3029 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4825 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3924 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/neopolita/jessi-v0.1-falcon3-10b-instruct/e8472266-6d03-439f-bd6b-e3ac5ef2cf09.json b/data/hfopenllm_v2/neopolita/jessi-v0.1-falcon3-10b-instruct/e8472266-6d03-439f-bd6b-e3ac5ef2cf09.json deleted file mode 100644 index e86ad0154..000000000 --- a/data/hfopenllm_v2/neopolita/jessi-v0.1-falcon3-10b-instruct/e8472266-6d03-439f-bd6b-e3ac5ef2cf09.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/neopolita_jessi-v0.1-falcon3-10b-instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "jessi-v0.1-falcon3-10b-instruct", - "id": "neopolita/jessi-v0.1-falcon3-10b-instruct", - "developer": "neopolita", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 10.306 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7552 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5953 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2002 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.3188 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4279 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4188 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/neopolita/jessi-v0.1-qwen2.5-7b-instruct/3f578b45-48f9-4022-991c-32a71706aba3.json b/data/hfopenllm_v2/neopolita/jessi-v0.1-qwen2.5-7b-instruct/3f578b45-48f9-4022-991c-32a71706aba3.json deleted file mode 100644 index 428e15bc6..000000000 --- a/data/hfopenllm_v2/neopolita/jessi-v0.1-qwen2.5-7b-instruct/3f578b45-48f9-4022-991c-32a71706aba3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/neopolita_jessi-v0.1-qwen2.5-7b-instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "jessi-v0.1-qwen2.5-7b-instruct", - "id": "neopolita/jessi-v0.1-qwen2.5-7b-instruct", - "developer": "neopolita", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7327 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5292 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4086 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.297 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3914 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4228 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/neopolita/jessi-v0.1-virtuoso-small/ef8c22a7-3898-422e-88e2-1a8c14ab5bf2.json b/data/hfopenllm_v2/neopolita/jessi-v0.1-virtuoso-small/ef8c22a7-3898-422e-88e2-1a8c14ab5bf2.json deleted file mode 100644 index d6a3717c7..000000000 --- a/data/hfopenllm_v2/neopolita/jessi-v0.1-virtuoso-small/ef8c22a7-3898-422e-88e2-1a8c14ab5bf2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/neopolita_jessi-v0.1-virtuoso-small/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "jessi-v0.1-virtuoso-small", - "id": "neopolita/jessi-v0.1-virtuoso-small", - "developer": "neopolita", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7959 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6443 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3399 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3305 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4362 - } 
- }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.513 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/neopolita/jessi-v0.2-falcon3-10b-instruct/81630ea2-d496-4872-92b7-e476badaf50d.json b/data/hfopenllm_v2/neopolita/jessi-v0.2-falcon3-10b-instruct/81630ea2-d496-4872-92b7-e476badaf50d.json deleted file mode 100644 index 26642a60f..000000000 --- a/data/hfopenllm_v2/neopolita/jessi-v0.2-falcon3-10b-instruct/81630ea2-d496-4872-92b7-e476badaf50d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/neopolita_jessi-v0.2-falcon3-10b-instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "jessi-v0.2-falcon3-10b-instruct", - "id": "neopolita/jessi-v0.2-falcon3-10b-instruct", - "developer": "neopolita", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 10.306 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7768 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6205 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2122 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3289 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4281 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4354 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/neopolita/jessi-v0.2-falcon3-7b-instruct/9436d04a-9c81-47ad-a7b8-496e14058627.json b/data/hfopenllm_v2/neopolita/jessi-v0.2-falcon3-7b-instruct/9436d04a-9c81-47ad-a7b8-496e14058627.json deleted file mode 100644 index b8f742e94..000000000 --- a/data/hfopenllm_v2/neopolita/jessi-v0.2-falcon3-7b-instruct/9436d04a-9c81-47ad-a7b8-496e14058627.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/neopolita_jessi-v0.2-falcon3-7b-instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "jessi-v0.2-falcon3-7b-instruct", - "id": "neopolita/jessi-v0.2-falcon3-7b-instruct", - "developer": "neopolita", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 7.456 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5771 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5363 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2538 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3171 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4479 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.3905 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/neopolita/jessi-v0.3-falcon3-7b-instruct/f1e6e54e-cb97-4980-8957-2190ee5c4c34.json b/data/hfopenllm_v2/neopolita/jessi-v0.3-falcon3-7b-instruct/f1e6e54e-cb97-4980-8957-2190ee5c4c34.json deleted file mode 100644 index ed27b7620..000000000 --- a/data/hfopenllm_v2/neopolita/jessi-v0.3-falcon3-7b-instruct/f1e6e54e-cb97-4980-8957-2190ee5c4c34.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/neopolita_jessi-v0.3-falcon3-7b-instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "jessi-v0.3-falcon3-7b-instruct", - "id": "neopolita/jessi-v0.3-falcon3-7b-instruct", - "developer": "neopolita", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 7.456 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7509 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5388 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1888 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3196 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4692 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.397 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/neopolita/jessi-v0.4-falcon3-7b-instruct/30914dd3-c857-4aaf-b6b9-d1c7e4917e89.json b/data/hfopenllm_v2/neopolita/jessi-v0.4-falcon3-7b-instruct/30914dd3-c857-4aaf-b6b9-d1c7e4917e89.json deleted file mode 100644 index b113a0091..000000000 --- a/data/hfopenllm_v2/neopolita/jessi-v0.4-falcon3-7b-instruct/30914dd3-c857-4aaf-b6b9-d1c7e4917e89.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/neopolita_jessi-v0.4-falcon3-7b-instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "jessi-v0.4-falcon3-7b-instruct", - "id": "neopolita/jessi-v0.4-falcon3-7b-instruct", - "developer": "neopolita", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 7.456 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7604 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5522 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3769 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3029 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4971 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4004 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/neopolita/jessi-v0.5-falcon3-7b-instruct/1c389a32-68b3-47c0-a6b8-2c2291293002.json 
b/data/hfopenllm_v2/neopolita/jessi-v0.5-falcon3-7b-instruct/1c389a32-68b3-47c0-a6b8-2c2291293002.json deleted file mode 100644 index d7d351e7b..000000000 --- a/data/hfopenllm_v2/neopolita/jessi-v0.5-falcon3-7b-instruct/1c389a32-68b3-47c0-a6b8-2c2291293002.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/neopolita_jessi-v0.5-falcon3-7b-instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "jessi-v0.5-falcon3-7b-instruct", - "id": "neopolita/jessi-v0.5-falcon3-7b-instruct", - "developer": "neopolita", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 7.456 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7412 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.559 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3739 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3112 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4865 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3966 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/neopolita/jessi-v0.6-falcon3-7b-instruct/e759a217-6571-446d-9bf9-d1512793f307.json b/data/hfopenllm_v2/neopolita/jessi-v0.6-falcon3-7b-instruct/e759a217-6571-446d-9bf9-d1512793f307.json deleted file mode 100644 index 02cb96cd3..000000000 --- 
a/data/hfopenllm_v2/neopolita/jessi-v0.6-falcon3-7b-instruct/e759a217-6571-446d-9bf9-d1512793f307.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/neopolita_jessi-v0.6-falcon3-7b-instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "jessi-v0.6-falcon3-7b-instruct", - "id": "neopolita/jessi-v0.6-falcon3-7b-instruct", - "developer": "neopolita", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 7.456 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7402 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5509 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3565 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3003 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4904 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3957 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/neopolita/loki-v0.1-virtuoso/753f3b21-7365-4117-b2a0-a91f03ec3d39.json b/data/hfopenllm_v2/neopolita/loki-v0.1-virtuoso/753f3b21-7365-4117-b2a0-a91f03ec3d39.json deleted file mode 100644 index 21bbf2ac9..000000000 --- a/data/hfopenllm_v2/neopolita/loki-v0.1-virtuoso/753f3b21-7365-4117-b2a0-a91f03ec3d39.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/neopolita_loki-v0.1-virtuoso/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "loki-v0.1-virtuoso", - "id": "neopolita/loki-v0.1-virtuoso", - "developer": "neopolita", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7819 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6467 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3391 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3507 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4375 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5129 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/netcat420/DeepSeek-R1-Distill-Qwen-MFANN-Slerp-7b/297ef102-67c1-4e9c-b418-fed026bb1f8a.json b/data/hfopenllm_v2/netcat420/DeepSeek-R1-Distill-Qwen-MFANN-Slerp-7b/297ef102-67c1-4e9c-b418-fed026bb1f8a.json deleted file mode 100644 index 504b55a12..000000000 --- a/data/hfopenllm_v2/netcat420/DeepSeek-R1-Distill-Qwen-MFANN-Slerp-7b/297ef102-67c1-4e9c-b418-fed026bb1f8a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/netcat420_DeepSeek-R1-Distill-Qwen-MFANN-Slerp-7b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - 
"source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "DeepSeek-R1-Distill-Qwen-MFANN-Slerp-7b", - "id": "netcat420/DeepSeek-R1-Distill-Qwen-MFANN-Slerp-7b", - "developer": "netcat420", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.115 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2877 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0015 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2643 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3724 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.109 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/netcat420/DeepSeek-R1-MFANN-TIES-unretrained-7b/9fbf73d7-7d67-4d6c-a5b9-efc627cd1b2b.json b/data/hfopenllm_v2/netcat420/DeepSeek-R1-MFANN-TIES-unretrained-7b/9fbf73d7-7d67-4d6c-a5b9-efc627cd1b2b.json deleted file mode 100644 index 3a61410cd..000000000 --- a/data/hfopenllm_v2/netcat420/DeepSeek-R1-MFANN-TIES-unretrained-7b/9fbf73d7-7d67-4d6c-a5b9-efc627cd1b2b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/netcat420_DeepSeek-R1-MFANN-TIES-unretrained-7b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - 
"model_info": { - "name": "DeepSeek-R1-MFANN-TIES-unretrained-7b", - "id": "netcat420/DeepSeek-R1-MFANN-TIES-unretrained-7b", - "developer": "netcat420", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2587 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3086 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0121 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.255 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3527 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1145 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/netcat420/Llama3.1-MFANN-8b/b1446577-f13f-434a-a0b4-916091395d4a.json b/data/hfopenllm_v2/netcat420/Llama3.1-MFANN-8b/b1446577-f13f-434a-a0b4-916091395d4a.json deleted file mode 100644 index 4ba8de1e7..000000000 --- a/data/hfopenllm_v2/netcat420/Llama3.1-MFANN-8b/b1446577-f13f-434a-a0b4-916091395d4a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/netcat420_Llama3.1-MFANN-8b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama3.1-MFANN-8b", - "id": "netcat420/Llama3.1-MFANN-8b", - "developer": "netcat420", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": 
"LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.297 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4281 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0295 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2878 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3379 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2725 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/netcat420/MFANN-Llama3.1-Abliterated-SLERP-TIES-V2/fc8946aa-8b04-482c-8c05-d026d2af07be.json b/data/hfopenllm_v2/netcat420/MFANN-Llama3.1-Abliterated-SLERP-TIES-V2/fc8946aa-8b04-482c-8c05-d026d2af07be.json deleted file mode 100644 index db28a161a..000000000 --- a/data/hfopenllm_v2/netcat420/MFANN-Llama3.1-Abliterated-SLERP-TIES-V2/fc8946aa-8b04-482c-8c05-d026d2af07be.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/netcat420_MFANN-Llama3.1-Abliterated-SLERP-TIES-V2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MFANN-Llama3.1-Abliterated-SLERP-TIES-V2", - "id": "netcat420/MFANN-Llama3.1-Abliterated-SLERP-TIES-V2", - "developer": "netcat420", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - 
"source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.421 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4924 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0763 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.297 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3728 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3522 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/netcat420/MFANN-Llama3.1-Abliterated-SLERP-TIES-V3/fabe3784-948c-4618-9cf0-c76a3ddd3820.json b/data/hfopenllm_v2/netcat420/MFANN-Llama3.1-Abliterated-SLERP-TIES-V3/fabe3784-948c-4618-9cf0-c76a3ddd3820.json deleted file mode 100644 index fe6d32e73..000000000 --- a/data/hfopenllm_v2/netcat420/MFANN-Llama3.1-Abliterated-SLERP-TIES-V3/fabe3784-948c-4618-9cf0-c76a3ddd3820.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/netcat420_MFANN-Llama3.1-Abliterated-SLERP-TIES-V3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MFANN-Llama3.1-Abliterated-SLERP-TIES-V3", - "id": "netcat420/MFANN-Llama3.1-Abliterated-SLERP-TIES-V3", - "developer": "netcat420", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4238 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4914 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0755 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.297 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3741 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.349 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/netcat420/MFANN-Llama3.1-Abliterated-SLERP-V4/736dcf09-6a19-4e88-a790-7a7ee74d8717.json b/data/hfopenllm_v2/netcat420/MFANN-Llama3.1-Abliterated-SLERP-V4/736dcf09-6a19-4e88-a790-7a7ee74d8717.json deleted file mode 100644 index ca9c619d8..000000000 --- a/data/hfopenllm_v2/netcat420/MFANN-Llama3.1-Abliterated-SLERP-V4/736dcf09-6a19-4e88-a790-7a7ee74d8717.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/netcat420_MFANN-Llama3.1-Abliterated-SLERP-V4/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MFANN-Llama3.1-Abliterated-SLERP-V4", - "id": "netcat420/MFANN-Llama3.1-Abliterated-SLERP-V4", - "developer": "netcat420", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.4169 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4909 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.068 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3054 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3821 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3516 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/netcat420/MFANN-Llama3.1-Abliterated-SLERP-V5/75b4c750-1570-4825-a04a-965c06861fd4.json b/data/hfopenllm_v2/netcat420/MFANN-Llama3.1-Abliterated-SLERP-V5/75b4c750-1570-4825-a04a-965c06861fd4.json deleted file mode 100644 index 2cffceba6..000000000 --- a/data/hfopenllm_v2/netcat420/MFANN-Llama3.1-Abliterated-SLERP-V5/75b4c750-1570-4825-a04a-965c06861fd4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/netcat420_MFANN-Llama3.1-Abliterated-SLERP-V5/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MFANN-Llama3.1-Abliterated-SLERP-V5", - "id": "netcat420/MFANN-Llama3.1-Abliterated-SLERP-V5", - "developer": "netcat420", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4329 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - 
"source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4952 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0816 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2936 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3781 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3445 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/netcat420/MFANN-Llama3.1-Abliterated-Slerp-TIES/b7f8b678-2aea-4d41-ba21-2083fc472574.json b/data/hfopenllm_v2/netcat420/MFANN-Llama3.1-Abliterated-Slerp-TIES/b7f8b678-2aea-4d41-ba21-2083fc472574.json deleted file mode 100644 index 4f314f567..000000000 --- a/data/hfopenllm_v2/netcat420/MFANN-Llama3.1-Abliterated-Slerp-TIES/b7f8b678-2aea-4d41-ba21-2083fc472574.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/netcat420_MFANN-Llama3.1-Abliterated-Slerp-TIES/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MFANN-Llama3.1-Abliterated-Slerp-TIES", - "id": "netcat420/MFANN-Llama3.1-Abliterated-Slerp-TIES", - "developer": "netcat420", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4293 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4968 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0665 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2919 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3687 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3531 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/netcat420/MFANN-Llama3.1-Abliterated-Slerp-V3.2/a8010630-58de-448c-af08-70b8ffec431b.json b/data/hfopenllm_v2/netcat420/MFANN-Llama3.1-Abliterated-Slerp-V3.2/a8010630-58de-448c-af08-70b8ffec431b.json deleted file mode 100644 index 3eb29349d..000000000 --- a/data/hfopenllm_v2/netcat420/MFANN-Llama3.1-Abliterated-Slerp-V3.2/a8010630-58de-448c-af08-70b8ffec431b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/netcat420_MFANN-Llama3.1-Abliterated-Slerp-V3.2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MFANN-Llama3.1-Abliterated-Slerp-V3.2", - "id": "netcat420/MFANN-Llama3.1-Abliterated-Slerp-V3.2", - "developer": "netcat420", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4128 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4978 - 
} - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0702 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2878 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3754 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3527 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/netcat420/MFANN-SFT/4a0c2ce5-a4b4-4d35-b65d-bbc6e36a649b.json b/data/hfopenllm_v2/netcat420/MFANN-SFT/4a0c2ce5-a4b4-4d35-b65d-bbc6e36a649b.json deleted file mode 100644 index dde4c2499..000000000 --- a/data/hfopenllm_v2/netcat420/MFANN-SFT/4a0c2ce5-a4b4-4d35-b65d-bbc6e36a649b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/netcat420_MFANN-SFT/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MFANN-SFT", - "id": "netcat420/MFANN-SFT", - "developer": "netcat420", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3682 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4852 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0597 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3163 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3725 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3336 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/netcat420/MFANN-abliterated-phi2-merge-unretrained/1132251a-59c7-402e-9957-f9288864508f.json b/data/hfopenllm_v2/netcat420/MFANN-abliterated-phi2-merge-unretrained/1132251a-59c7-402e-9957-f9288864508f.json deleted file mode 100644 index d5dbf98ee..000000000 --- a/data/hfopenllm_v2/netcat420/MFANN-abliterated-phi2-merge-unretrained/1132251a-59c7-402e-9957-f9288864508f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/netcat420_MFANN-abliterated-phi2-merge-unretrained/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MFANN-abliterated-phi2-merge-unretrained", - "id": "netcat420/MFANN-abliterated-phi2-merge-unretrained", - "developer": "netcat420", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "PhiForCausalLM", - "params_billions": 2.775 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3005 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4104 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0287 - } - }, - { 
- "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2609 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3183 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1478 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/netcat420/MFANN-llama3.1-Abliterated-SLERP/e2fac049-8f9f-4b71-bcd3-5746b7d90150.json b/data/hfopenllm_v2/netcat420/MFANN-llama3.1-Abliterated-SLERP/e2fac049-8f9f-4b71-bcd3-5746b7d90150.json deleted file mode 100644 index 3a55b4765..000000000 --- a/data/hfopenllm_v2/netcat420/MFANN-llama3.1-Abliterated-SLERP/e2fac049-8f9f-4b71-bcd3-5746b7d90150.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/netcat420_MFANN-llama3.1-Abliterated-SLERP/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MFANN-llama3.1-Abliterated-SLERP", - "id": "netcat420/MFANN-llama3.1-Abliterated-SLERP", - "developer": "netcat420", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2591 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4574 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0483 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2735 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3809 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2928 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/netcat420/MFANN-llama3.1-abliterated-SLERP-v3.1/d891a1e1-ad65-498f-9ee8-59523c1bfd19.json b/data/hfopenllm_v2/netcat420/MFANN-llama3.1-abliterated-SLERP-v3.1/d891a1e1-ad65-498f-9ee8-59523c1bfd19.json deleted file mode 100644 index ea3ef3c69..000000000 --- a/data/hfopenllm_v2/netcat420/MFANN-llama3.1-abliterated-SLERP-v3.1/d891a1e1-ad65-498f-9ee8-59523c1bfd19.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/netcat420_MFANN-llama3.1-abliterated-SLERP-v3.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MFANN-llama3.1-abliterated-SLERP-v3.1", - "id": "netcat420/MFANN-llama3.1-abliterated-SLERP-v3.1", - "developer": "netcat420", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4202 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4921 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0695 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 
1.0 - }, - "score_details": { - "score": 0.2928 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3686 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3543 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/netcat420/MFANN-llama3.1-abliterated-SLERP-v3/9dd3103f-6c4f-4077-ac27-3a9b0f4a5882.json b/data/hfopenllm_v2/netcat420/MFANN-llama3.1-abliterated-SLERP-v3/9dd3103f-6c4f-4077-ac27-3a9b0f4a5882.json deleted file mode 100644 index cf3326495..000000000 --- a/data/hfopenllm_v2/netcat420/MFANN-llama3.1-abliterated-SLERP-v3/9dd3103f-6c4f-4077-ac27-3a9b0f4a5882.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/netcat420_MFANN-llama3.1-abliterated-SLERP-v3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MFANN-llama3.1-abliterated-SLERP-v3", - "id": "netcat420/MFANN-llama3.1-abliterated-SLERP-v3", - "developer": "netcat420", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3799 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4931 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0642 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2911 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": 
"hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.366 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3531 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/netcat420/MFANN-llama3.1-abliterated-v2/ca031f70-5785-46d1-8a58-b279d8340776.json b/data/hfopenllm_v2/netcat420/MFANN-llama3.1-abliterated-v2/ca031f70-5785-46d1-8a58-b279d8340776.json deleted file mode 100644 index 27620df52..000000000 --- a/data/hfopenllm_v2/netcat420/MFANN-llama3.1-abliterated-v2/ca031f70-5785-46d1-8a58-b279d8340776.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/netcat420_MFANN-llama3.1-abliterated-v2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MFANN-llama3.1-abliterated-v2", - "id": "netcat420/MFANN-llama3.1-abliterated-v2", - "developer": "netcat420", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4429 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4941 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.074 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2928 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 
0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3845 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3491 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/netcat420/MFANN-phigments-slerp-V2/18457711-92b8-4c27-a89a-928fecdf5724.json b/data/hfopenllm_v2/netcat420/MFANN-phigments-slerp-V2/18457711-92b8-4c27-a89a-928fecdf5724.json deleted file mode 100644 index f7adf105b..000000000 --- a/data/hfopenllm_v2/netcat420/MFANN-phigments-slerp-V2/18457711-92b8-4c27-a89a-928fecdf5724.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/netcat420_MFANN-phigments-slerp-V2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MFANN-phigments-slerp-V2", - "id": "netcat420/MFANN-phigments-slerp-V2", - "developer": "netcat420", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "PhiForCausalLM", - "params_billions": 2.78 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3232 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4827 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0317 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2727 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4037 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": 
"TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2717 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/netcat420/MFANN-phigments-slerp-V3.2/3398aeb8-08a8-4be9-a24c-efeabcaa2139.json b/data/hfopenllm_v2/netcat420/MFANN-phigments-slerp-V3.2/3398aeb8-08a8-4be9-a24c-efeabcaa2139.json deleted file mode 100644 index fd602beef..000000000 --- a/data/hfopenllm_v2/netcat420/MFANN-phigments-slerp-V3.2/3398aeb8-08a8-4be9-a24c-efeabcaa2139.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/netcat420_MFANN-phigments-slerp-V3.2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MFANN-phigments-slerp-V3.2", - "id": "netcat420/MFANN-phigments-slerp-V3.2", - "developer": "netcat420", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "PhiForCausalLM", - "params_billions": 2.78 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3524 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4809 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0332 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2836 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3708 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.2705 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/netcat420/MFANN-phigments-slerp-V3.3/707bc006-4318-41bc-b91b-aa43ca7cba6f.json b/data/hfopenllm_v2/netcat420/MFANN-phigments-slerp-V3.3/707bc006-4318-41bc-b91b-aa43ca7cba6f.json deleted file mode 100644 index bc28c5ad6..000000000 --- a/data/hfopenllm_v2/netcat420/MFANN-phigments-slerp-V3.3/707bc006-4318-41bc-b91b-aa43ca7cba6f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/netcat420_MFANN-phigments-slerp-V3.3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MFANN-phigments-slerp-V3.3", - "id": "netcat420/MFANN-phigments-slerp-V3.3", - "developer": "netcat420", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "PhiForCausalLM", - "params_billions": 2.78 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3691 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4895 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0332 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2752 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3892 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2803 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/netcat420/MFANN3b/7bfda919-13be-4b68-8655-99fe6a4605a2.json 
b/data/hfopenllm_v2/netcat420/MFANN3b/7bfda919-13be-4b68-8655-99fe6a4605a2.json deleted file mode 100644 index 1d0c531da..000000000 --- a/data/hfopenllm_v2/netcat420/MFANN3b/7bfda919-13be-4b68-8655-99fe6a4605a2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/netcat420_MFANN3b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MFANN3b", - "id": "netcat420/MFANN3b", - "developer": "netcat420", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "PhiForCausalLM", - "params_billions": 2.78 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2524 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4433 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0219 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2919 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3606 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2306 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/netcat420/MFANN3bv0.15/f844e739-5f0d-4db4-ba66-bd33b1290571.json b/data/hfopenllm_v2/netcat420/MFANN3bv0.15/f844e739-5f0d-4db4-ba66-bd33b1290571.json deleted file mode 100644 index da81b800a..000000000 --- a/data/hfopenllm_v2/netcat420/MFANN3bv0.15/f844e739-5f0d-4db4-ba66-bd33b1290571.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/netcat420_MFANN3bv0.15/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MFANN3bv0.15", - "id": "netcat420/MFANN3bv0.15", - "developer": "netcat420", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "PhiForCausalLM", - "params_billions": 2.78 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2012 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4539 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0264 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2517 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3958 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2468 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/netcat420/MFANN3bv0.18/0cde6639-6a89-4682-bb3e-a2a24a1bc8ab.json b/data/hfopenllm_v2/netcat420/MFANN3bv0.18/0cde6639-6a89-4682-bb3e-a2a24a1bc8ab.json deleted file mode 100644 index 691ede716..000000000 --- a/data/hfopenllm_v2/netcat420/MFANN3bv0.18/0cde6639-6a89-4682-bb3e-a2a24a1bc8ab.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/netcat420_MFANN3bv0.18/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - 
"model_info": { - "name": "MFANN3bv0.18", - "id": "netcat420/MFANN3bv0.18", - "developer": "netcat420", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "PhiForCausalLM", - "params_billions": 2.78 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2206 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4514 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0249 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2576 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4024 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.25 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/netcat420/MFANN3bv0.19/87652005-4404-4c45-bd4f-5f63c44adf63.json b/data/hfopenllm_v2/netcat420/MFANN3bv0.19/87652005-4404-4c45-bd4f-5f63c44adf63.json deleted file mode 100644 index b5b586fea..000000000 --- a/data/hfopenllm_v2/netcat420/MFANN3bv0.19/87652005-4404-4c45-bd4f-5f63c44adf63.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/netcat420_MFANN3bv0.19/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MFANN3bv0.19", - "id": "netcat420/MFANN3bv0.19", - "developer": "netcat420", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "PhiForCausalLM", - "params_billions": 2.78 - } - }, - "evaluation_results": [ - { - 
"evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2258 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4516 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0227 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2576 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4024 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.252 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/netcat420/MFANN3bv0.20/a7e0bc2d-784d-4719-ac08-d8fa0c29d178.json b/data/hfopenllm_v2/netcat420/MFANN3bv0.20/a7e0bc2d-784d-4719-ac08-d8fa0c29d178.json deleted file mode 100644 index 560868144..000000000 --- a/data/hfopenllm_v2/netcat420/MFANN3bv0.20/a7e0bc2d-784d-4719-ac08-d8fa0c29d178.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/netcat420_MFANN3bv0.20/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MFANN3bv0.20", - "id": "netcat420/MFANN3bv0.20", - "developer": "netcat420", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "PhiForCausalLM", - "params_billions": 2.78 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.2193 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4493 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0264 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2592 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4077 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.25 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/netcat420/MFANN3bv0.21/e8ba93e6-6f90-4169-8403-381b7f9e26ab.json b/data/hfopenllm_v2/netcat420/MFANN3bv0.21/e8ba93e6-6f90-4169-8403-381b7f9e26ab.json deleted file mode 100644 index 12ac93cd4..000000000 --- a/data/hfopenllm_v2/netcat420/MFANN3bv0.21/e8ba93e6-6f90-4169-8403-381b7f9e26ab.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/netcat420_MFANN3bv0.21/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MFANN3bv0.21", - "id": "netcat420/MFANN3bv0.21", - "developer": "netcat420", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "PhiForCausalLM", - "params_billions": 2.78 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1909 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.447 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0317 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2643 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3759 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2393 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/netcat420/MFANN3bv0.22/ea86b542-3d06-4e71-b49d-17cdd362b465.json b/data/hfopenllm_v2/netcat420/MFANN3bv0.22/ea86b542-3d06-4e71-b49d-17cdd362b465.json deleted file mode 100644 index 48ebc0f8a..000000000 --- a/data/hfopenllm_v2/netcat420/MFANN3bv0.22/ea86b542-3d06-4e71-b49d-17cdd362b465.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/netcat420_MFANN3bv0.22/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MFANN3bv0.22", - "id": "netcat420/MFANN3bv0.22", - "developer": "netcat420", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "PhiForCausalLM", - "params_billions": 2.78 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1979 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4485 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": 
"DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0264 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2617 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3521 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2517 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/netcat420/MFANN3bv0.23/15615d2c-46a1-47c7-a273-697e97bdf9f2.json b/data/hfopenllm_v2/netcat420/MFANN3bv0.23/15615d2c-46a1-47c7-a273-697e97bdf9f2.json deleted file mode 100644 index 0a80010af..000000000 --- a/data/hfopenllm_v2/netcat420/MFANN3bv0.23/15615d2c-46a1-47c7-a273-697e97bdf9f2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/netcat420_MFANN3bv0.23/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MFANN3bv0.23", - "id": "netcat420/MFANN3bv0.23", - "developer": "netcat420", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "PhiForCausalLM", - "params_billions": 2.78 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2048 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4495 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0249 - } - }, - { - 
"evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2517 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3427 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2418 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/netcat420/MFANN3bv0.24/a2b8da3f-c99e-4dba-b4a2-23739281eaf2.json b/data/hfopenllm_v2/netcat420/MFANN3bv0.24/a2b8da3f-c99e-4dba-b4a2-23739281eaf2.json deleted file mode 100644 index 95741769c..000000000 --- a/data/hfopenllm_v2/netcat420/MFANN3bv0.24/a2b8da3f-c99e-4dba-b4a2-23739281eaf2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/netcat420_MFANN3bv0.24/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MFANN3bv0.24", - "id": "netcat420/MFANN3bv0.24", - "developer": "netcat420", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "PhiForCausalLM", - "params_billions": 2.78 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.22 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4407 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0279 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.2584 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3521 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2352 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/netcat420/MFANN3bv1.1/76f3fa3a-1629-4cdd-b457-3a108784b427.json b/data/hfopenllm_v2/netcat420/MFANN3bv1.1/76f3fa3a-1629-4cdd-b457-3a108784b427.json deleted file mode 100644 index 2c3b57cd6..000000000 --- a/data/hfopenllm_v2/netcat420/MFANN3bv1.1/76f3fa3a-1629-4cdd-b457-3a108784b427.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/netcat420_MFANN3bv1.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MFANN3bv1.1", - "id": "netcat420/MFANN3bv1.1", - "developer": "netcat420", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "PhiForCausalLM", - "params_billions": 2.775 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2507 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3397 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0204 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2668 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3223 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1159 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/netcat420/MFANN3bv1.2/c9e979e1-4433-4a38-8fd4-c14895e74f44.json b/data/hfopenllm_v2/netcat420/MFANN3bv1.2/c9e979e1-4433-4a38-8fd4-c14895e74f44.json deleted file mode 100644 index f3b97324c..000000000 --- a/data/hfopenllm_v2/netcat420/MFANN3bv1.2/c9e979e1-4433-4a38-8fd4-c14895e74f44.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/netcat420_MFANN3bv1.2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MFANN3bv1.2", - "id": "netcat420/MFANN3bv1.2", - "developer": "netcat420", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "PhiForCausalLM", - "params_billions": 2.775 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2686 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.366 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0264 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2634 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3156 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" 
- }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.145 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/netcat420/MFANN3bv1.3/3f2effba-1ab8-476d-b228-ed9491e83adf.json b/data/hfopenllm_v2/netcat420/MFANN3bv1.3/3f2effba-1ab8-476d-b228-ed9491e83adf.json deleted file mode 100644 index 5ea688265..000000000 --- a/data/hfopenllm_v2/netcat420/MFANN3bv1.3/3f2effba-1ab8-476d-b228-ed9491e83adf.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/netcat420_MFANN3bv1.3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MFANN3bv1.3", - "id": "netcat420/MFANN3bv1.3", - "developer": "netcat420", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "PhiForCausalLM", - "params_billions": 2.78 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2547 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4456 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0211 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2576 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3299 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2276 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/netcat420/MFANN3bv1.4/a5f0fb1b-27a7-495f-a010-3307afdb8949.json b/data/hfopenllm_v2/netcat420/MFANN3bv1.4/a5f0fb1b-27a7-495f-a010-3307afdb8949.json deleted file mode 100644 index 6b3a33345..000000000 --- a/data/hfopenllm_v2/netcat420/MFANN3bv1.4/a5f0fb1b-27a7-495f-a010-3307afdb8949.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/netcat420_MFANN3bv1.4/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MFANN3bv1.4", - "id": "netcat420/MFANN3bv1.4", - "developer": "netcat420", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "PhiForCausalLM", - "params_billions": 2.78 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3524 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4809 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.037 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2827 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3708 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2705 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/netcat420/MFANNv0.19/22f2aa1d-fff1-430a-9c20-3b32859d9665.json b/data/hfopenllm_v2/netcat420/MFANNv0.19/22f2aa1d-fff1-430a-9c20-3b32859d9665.json deleted file mode 100644 index 4621646f1..000000000 --- 
a/data/hfopenllm_v2/netcat420/MFANNv0.19/22f2aa1d-fff1-430a-9c20-3b32859d9665.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/netcat420_MFANNv0.19/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MFANNv0.19", - "id": "netcat420/MFANNv0.19", - "developer": "netcat420", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3057 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4731 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0415 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.307 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3527 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2473 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/netcat420/MFANNv0.20/daff0e6f-d29f-4861-855f-902a0cd9a469.json b/data/hfopenllm_v2/netcat420/MFANNv0.20/daff0e6f-d29f-4861-855f-902a0cd9a469.json deleted file mode 100644 index b815945e1..000000000 --- a/data/hfopenllm_v2/netcat420/MFANNv0.20/daff0e6f-d29f-4861-855f-902a0cd9a469.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/netcat420_MFANNv0.20/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - 
"source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MFANNv0.20", - "id": "netcat420/MFANNv0.20", - "developer": "netcat420", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3479 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4574 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0498 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2903 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3874 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3202 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/netcat420/MFANNv0.21/0f5cb926-b691-4d57-87f5-290235fd250a.json b/data/hfopenllm_v2/netcat420/MFANNv0.21/0f5cb926-b691-4d57-87f5-290235fd250a.json deleted file mode 100644 index 0d6243929..000000000 --- a/data/hfopenllm_v2/netcat420/MFANNv0.21/0f5cb926-b691-4d57-87f5-290235fd250a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/netcat420_MFANNv0.21/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MFANNv0.21", - "id": "netcat420/MFANNv0.21", - "developer": "netcat420", - "inference_platform": "unknown", - 
"additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3233 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4576 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0574 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2785 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3993 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3031 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/netcat420/MFANNv0.22.1/d9e813da-2966-4901-99f9-c7627c64fc52.json b/data/hfopenllm_v2/netcat420/MFANNv0.22.1/d9e813da-2966-4901-99f9-c7627c64fc52.json deleted file mode 100644 index 7b869bbb5..000000000 --- a/data/hfopenllm_v2/netcat420/MFANNv0.22.1/d9e813da-2966-4901-99f9-c7627c64fc52.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/netcat420_MFANNv0.22.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MFANNv0.22.1", - "id": "netcat420/MFANNv0.22.1", - "developer": "netcat420", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" 
- }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3089 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4661 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0536 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.276 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3753 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3343 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/netcat420/MFANNv0.23/4cb98a5b-3eb7-4fa8-adfd-17add38d3332.json b/data/hfopenllm_v2/netcat420/MFANNv0.23/4cb98a5b-3eb7-4fa8-adfd-17add38d3332.json deleted file mode 100644 index 80ebfd3ef..000000000 --- a/data/hfopenllm_v2/netcat420/MFANNv0.23/4cb98a5b-3eb7-4fa8-adfd-17add38d3332.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/netcat420_MFANNv0.23/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MFANNv0.23", - "id": "netcat420/MFANNv0.23", - "developer": "netcat420", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3127 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - 
"source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4898 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0498 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2844 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3768 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3388 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/netcat420/MFANNv0.24/f7494fd4-d248-46a6-a46d-f9d8db560aae.json b/data/hfopenllm_v2/netcat420/MFANNv0.24/f7494fd4-d248-46a6-a46d-f9d8db560aae.json deleted file mode 100644 index 89be208aa..000000000 --- a/data/hfopenllm_v2/netcat420/MFANNv0.24/f7494fd4-d248-46a6-a46d-f9d8db560aae.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/netcat420_MFANNv0.24/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MFANNv0.24", - "id": "netcat420/MFANNv0.24", - "developer": "netcat420", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3162 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.479 - } - }, - { - 
"evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0612 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2844 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3754 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3348 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/netcat420/MFANNv0.25/4b8533d1-7770-435f-ba76-a5c658aabd8f.json b/data/hfopenllm_v2/netcat420/MFANNv0.25/4b8533d1-7770-435f-ba76-a5c658aabd8f.json deleted file mode 100644 index c62b5f82d..000000000 --- a/data/hfopenllm_v2/netcat420/MFANNv0.25/4b8533d1-7770-435f-ba76-a5c658aabd8f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/netcat420_MFANNv0.25/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MFANNv0.25", - "id": "netcat420/MFANNv0.25", - "developer": "netcat420", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3467 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4794 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0582 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2802 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3688 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3343 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/netcat420/Qwen2.5-7B-nerd-uncensored-v0.9-MFANN/309c7906-0010-4f17-848f-185062d96a26.json b/data/hfopenllm_v2/netcat420/Qwen2.5-7B-nerd-uncensored-v0.9-MFANN/309c7906-0010-4f17-848f-185062d96a26.json deleted file mode 100644 index f9f6fbeee..000000000 --- a/data/hfopenllm_v2/netcat420/Qwen2.5-7B-nerd-uncensored-v0.9-MFANN/309c7906-0010-4f17-848f-185062d96a26.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/netcat420_Qwen2.5-7B-nerd-uncensored-v0.9-MFANN/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-7B-nerd-uncensored-v0.9-MFANN", - "id": "netcat420/Qwen2.5-7B-nerd-uncensored-v0.9-MFANN", - "developer": "netcat420", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5878 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5237 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3376 - } - }, - { - "evaluation_name": "GPQA", - 
"source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.281 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3926 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3904 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/netcat420/Qwen2.5-7b-MFANN-slerp/f18ab2ab-098b-4e46-8f8d-433b52cdb81b.json b/data/hfopenllm_v2/netcat420/Qwen2.5-7b-MFANN-slerp/f18ab2ab-098b-4e46-8f8d-433b52cdb81b.json deleted file mode 100644 index 25a7240c6..000000000 --- a/data/hfopenllm_v2/netcat420/Qwen2.5-7b-MFANN-slerp/f18ab2ab-098b-4e46-8f8d-433b52cdb81b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/netcat420_Qwen2.5-7b-MFANN-slerp/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-7b-MFANN-slerp", - "id": "netcat420/Qwen2.5-7b-MFANN-slerp", - "developer": "netcat420", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6532 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5089 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.287 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2953 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4073 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3417 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/netcat420/Qwen2.5-7b-nerd-uncensored-MFANN-slerp/b4a70c71-dfac-4888-937e-d5220b491b0e.json b/data/hfopenllm_v2/netcat420/Qwen2.5-7b-nerd-uncensored-MFANN-slerp/b4a70c71-dfac-4888-937e-d5220b491b0e.json deleted file mode 100644 index 07ded6ed7..000000000 --- a/data/hfopenllm_v2/netcat420/Qwen2.5-7b-nerd-uncensored-MFANN-slerp/b4a70c71-dfac-4888-937e-d5220b491b0e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/netcat420_Qwen2.5-7b-nerd-uncensored-MFANN-slerp/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-7b-nerd-uncensored-MFANN-slerp", - "id": "netcat420/Qwen2.5-7b-nerd-uncensored-MFANN-slerp", - "developer": "netcat420", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1564 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.292 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2601 - } - }, - { - "evaluation_name": "MUSR", 
- "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3792 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.11 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/netcat420/Qwen2.5-Coder-Scholar-7B-Abliterated-MFANN-Slerp-Unretrained/b879a534-6b24-4873-a0e4-e18453540121.json b/data/hfopenllm_v2/netcat420/Qwen2.5-Coder-Scholar-7B-Abliterated-MFANN-Slerp-Unretrained/b879a534-6b24-4873-a0e4-e18453540121.json deleted file mode 100644 index 6830a0165..000000000 --- a/data/hfopenllm_v2/netcat420/Qwen2.5-Coder-Scholar-7B-Abliterated-MFANN-Slerp-Unretrained/b879a534-6b24-4873-a0e4-e18453540121.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/netcat420_Qwen2.5-Coder-Scholar-7B-Abliterated-MFANN-Slerp-Unretrained/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-Coder-Scholar-7B-Abliterated-MFANN-Slerp-Unretrained", - "id": "netcat420/Qwen2.5-Coder-Scholar-7B-Abliterated-MFANN-Slerp-Unretrained", - "developer": "netcat420", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6486 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5066 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2991 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2987 - } - }, - { - "evaluation_name": "MUSR", - 
"source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4152 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3432 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/netcat420/Qwen2.5-Coder-Scholar-7B-Abliterated-MFANN/c67ae8f2-596b-4dab-8c4f-768b2f0608b4.json b/data/hfopenllm_v2/netcat420/Qwen2.5-Coder-Scholar-7B-Abliterated-MFANN/c67ae8f2-596b-4dab-8c4f-768b2f0608b4.json deleted file mode 100644 index 2e55a4ead..000000000 --- a/data/hfopenllm_v2/netcat420/Qwen2.5-Coder-Scholar-7B-Abliterated-MFANN/c67ae8f2-596b-4dab-8c4f-768b2f0608b4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/netcat420_Qwen2.5-Coder-Scholar-7B-Abliterated-MFANN/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-Coder-Scholar-7B-Abliterated-MFANN", - "id": "netcat420/Qwen2.5-Coder-Scholar-7B-Abliterated-MFANN", - "developer": "netcat420", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5742 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5071 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2568 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2928 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4058 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3157 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/netcat420/Qwen2.5-DeepSeek-R1-MFANN-Slerp-7b/7766c638-b4dc-4b2d-8c14-becdb1b709ef.json b/data/hfopenllm_v2/netcat420/Qwen2.5-DeepSeek-R1-MFANN-Slerp-7b/7766c638-b4dc-4b2d-8c14-becdb1b709ef.json deleted file mode 100644 index a0253b996..000000000 --- a/data/hfopenllm_v2/netcat420/Qwen2.5-DeepSeek-R1-MFANN-Slerp-7b/7766c638-b4dc-4b2d-8c14-becdb1b709ef.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/netcat420_Qwen2.5-DeepSeek-R1-MFANN-Slerp-7b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-DeepSeek-R1-MFANN-Slerp-7b", - "id": "netcat420/Qwen2.5-DeepSeek-R1-MFANN-Slerp-7b", - "developer": "netcat420", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2676 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3789 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0181 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2324 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.3528 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1677 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/netcat420/Qwen2.5-MFANN-7b/dd211bef-3940-4d78-8f7b-a67da81d605b.json b/data/hfopenllm_v2/netcat420/Qwen2.5-MFANN-7b/dd211bef-3940-4d78-8f7b-a67da81d605b.json deleted file mode 100644 index 1928a3f0f..000000000 --- a/data/hfopenllm_v2/netcat420/Qwen2.5-MFANN-7b/dd211bef-3940-4d78-8f7b-a67da81d605b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/netcat420_Qwen2.5-MFANN-7b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-MFANN-7b", - "id": "netcat420/Qwen2.5-MFANN-7b", - "developer": "netcat420", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6097 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5054 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2787 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2861 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4021 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3233 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/netcat420/qwen2.5-MFANN-7b-SLERP-V1.2/87e20b7a-85c8-4845-94b0-ace1e18814cb.json b/data/hfopenllm_v2/netcat420/qwen2.5-MFANN-7b-SLERP-V1.2/87e20b7a-85c8-4845-94b0-ace1e18814cb.json deleted file mode 100644 index 51f2c598a..000000000 --- a/data/hfopenllm_v2/netcat420/qwen2.5-MFANN-7b-SLERP-V1.2/87e20b7a-85c8-4845-94b0-ace1e18814cb.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/netcat420_qwen2.5-MFANN-7b-SLERP-V1.2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "qwen2.5-MFANN-7b-SLERP-V1.2", - "id": "netcat420/qwen2.5-MFANN-7b-SLERP-V1.2", - "developer": "netcat420", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6606 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5111 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.287 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.297 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4259 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3438 - } - } - ] 
-} \ No newline at end of file diff --git a/data/hfopenllm_v2/netcat420/qwen2.5-MFANN-7b-SLERPv1.1/9ab01db6-3154-4c5b-b6a2-35479538d332.json b/data/hfopenllm_v2/netcat420/qwen2.5-MFANN-7b-SLERPv1.1/9ab01db6-3154-4c5b-b6a2-35479538d332.json deleted file mode 100644 index 6d923911a..000000000 --- a/data/hfopenllm_v2/netcat420/qwen2.5-MFANN-7b-SLERPv1.1/9ab01db6-3154-4c5b-b6a2-35479538d332.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/netcat420_qwen2.5-MFANN-7b-SLERPv1.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "qwen2.5-MFANN-7b-SLERPv1.1", - "id": "netcat420/qwen2.5-MFANN-7b-SLERPv1.1", - "developer": "netcat420", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6555 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5075 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2968 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2903 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4126 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3448 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/netcat420/qwen2.5-MFANN-7b-v1.1/9d35316a-011d-4e45-ae57-317b53de621f.json 
b/data/hfopenllm_v2/netcat420/qwen2.5-MFANN-7b-v1.1/9d35316a-011d-4e45-ae57-317b53de621f.json deleted file mode 100644 index 6d09d9e25..000000000 --- a/data/hfopenllm_v2/netcat420/qwen2.5-MFANN-7b-v1.1/9d35316a-011d-4e45-ae57-317b53de621f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/netcat420_qwen2.5-MFANN-7b-v1.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "qwen2.5-MFANN-7b-v1.1", - "id": "netcat420/qwen2.5-MFANN-7b-v1.1", - "developer": "netcat420", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6088 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4967 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2825 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.276 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4114 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3248 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/netease-youdao/Confucius-o1-14B/c9e7fec0-b244-4ca1-a117-a52fdd4671a5.json b/data/hfopenllm_v2/netease-youdao/Confucius-o1-14B/c9e7fec0-b244-4ca1-a117-a52fdd4671a5.json deleted file mode 100644 index d488ce1e9..000000000 --- 
a/data/hfopenllm_v2/netease-youdao/Confucius-o1-14B/c9e7fec0-b244-4ca1-a117-a52fdd4671a5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/netease-youdao_Confucius-o1-14B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Confucius-o1-14B", - "id": "netease-youdao/Confucius-o1-14B", - "developer": "netease-youdao", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6378 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.63 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4313 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3649 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4338 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5265 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/newsbang/Homer-7B-v0.1/0659cb01-0d52-42cb-9e3a-2d8cac01692e.json b/data/hfopenllm_v2/newsbang/Homer-7B-v0.1/0659cb01-0d52-42cb-9e3a-2d8cac01692e.json deleted file mode 100644 index 889543f52..000000000 --- a/data/hfopenllm_v2/newsbang/Homer-7B-v0.1/0659cb01-0d52-42cb-9e3a-2d8cac01692e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/newsbang_Homer-7B-v0.1/1770682486.623709", - "retrieved_timestamp": 
"1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Homer-7B-v0.1", - "id": "newsbang/Homer-7B-v0.1", - "developer": "newsbang", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6109 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5601 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.386 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3247 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4357 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4475 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/newsbang/Homer-7B-v0.2/98490bb1-70f0-4e7a-8fd6-698ec9fcbd5a.json b/data/hfopenllm_v2/newsbang/Homer-7B-v0.2/98490bb1-70f0-4e7a-8fd6-698ec9fcbd5a.json deleted file mode 100644 index 0e6b4cdcc..000000000 --- a/data/hfopenllm_v2/newsbang/Homer-7B-v0.2/98490bb1-70f0-4e7a-8fd6-698ec9fcbd5a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/newsbang_Homer-7B-v0.2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Homer-7B-v0.2", - "id": "newsbang/Homer-7B-v0.2", - 
"developer": "newsbang", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7494 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5517 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2477 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3322 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4298 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.441 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/newsbang/Homer-v0.3-Qwen2.5-7B/6e0f7e7e-8927-436e-95a7-5a7c626ca241.json b/data/hfopenllm_v2/newsbang/Homer-v0.3-Qwen2.5-7B/6e0f7e7e-8927-436e-95a7-5a7c626ca241.json deleted file mode 100644 index b533392a6..000000000 --- a/data/hfopenllm_v2/newsbang/Homer-v0.3-Qwen2.5-7B/6e0f7e7e-8927-436e-95a7-5a7c626ca241.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/newsbang_Homer-v0.3-Qwen2.5-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Homer-v0.3-Qwen2.5-7B", - "id": "newsbang/Homer-v0.3-Qwen2.5-7B", - "developer": "newsbang", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": 
"IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5154 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5481 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3089 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3339 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4744 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4456 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/newsbang/Homer-v0.4-Qwen2.5-7B/9c5b3f4d-6e0b-482b-b142-dd7b387cae22.json b/data/hfopenllm_v2/newsbang/Homer-v0.4-Qwen2.5-7B/9c5b3f4d-6e0b-482b-b142-dd7b387cae22.json deleted file mode 100644 index c5e3960e9..000000000 --- a/data/hfopenllm_v2/newsbang/Homer-v0.4-Qwen2.5-7B/9c5b3f4d-6e0b-482b-b142-dd7b387cae22.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/newsbang_Homer-v0.4-Qwen2.5-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Homer-v0.4-Qwen2.5-7B", - "id": "newsbang/Homer-v0.4-Qwen2.5-7B", - "developer": "newsbang", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7999 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5533 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2779 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3154 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4311 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4363 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/newsbang/Homer-v0.5-Qwen2.5-7B/04840708-a4cc-407c-8b2a-876b382920a1.json b/data/hfopenllm_v2/newsbang/Homer-v0.5-Qwen2.5-7B/04840708-a4cc-407c-8b2a-876b382920a1.json deleted file mode 100644 index 63f957333..000000000 --- a/data/hfopenllm_v2/newsbang/Homer-v0.5-Qwen2.5-7B/04840708-a4cc-407c-8b2a-876b382920a1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/newsbang_Homer-v0.5-Qwen2.5-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Homer-v0.5-Qwen2.5-7B", - "id": "newsbang/Homer-v0.5-Qwen2.5-7B", - "developer": "newsbang", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7881 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": 
"SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.554 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3724 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3029 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4193 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4369 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/newsbang/Homer-v1.0-Qwen2.5-72B/83b0844c-70fe-4b63-8ed2-4147390518ee.json b/data/hfopenllm_v2/newsbang/Homer-v1.0-Qwen2.5-72B/83b0844c-70fe-4b63-8ed2-4147390518ee.json deleted file mode 100644 index a97243437..000000000 --- a/data/hfopenllm_v2/newsbang/Homer-v1.0-Qwen2.5-72B/83b0844c-70fe-4b63-8ed2-4147390518ee.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/newsbang_Homer-v1.0-Qwen2.5-72B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Homer-v1.0-Qwen2.5-72B", - "id": "newsbang/Homer-v1.0-Qwen2.5-72B", - "developer": "newsbang", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 72.706 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7628 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.731 - } 
- }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4902 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4161 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4677 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6145 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/newsbang/Homer-v1.0-Qwen2.5-7B/9cf10c60-bee1-4f4f-9e03-c3c10287bded.json b/data/hfopenllm_v2/newsbang/Homer-v1.0-Qwen2.5-7B/9cf10c60-bee1-4f4f-9e03-c3c10287bded.json deleted file mode 100644 index c299cf546..000000000 --- a/data/hfopenllm_v2/newsbang/Homer-v1.0-Qwen2.5-7B/9cf10c60-bee1-4f4f-9e03-c3c10287bded.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/newsbang_Homer-v1.0-Qwen2.5-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Homer-v1.0-Qwen2.5-7B", - "id": "newsbang/Homer-v1.0-Qwen2.5-7B", - "developer": "newsbang", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6393 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5655 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact 
Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3323 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3221 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4278 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4535 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nguyentd/FinancialAdvice-Qwen2.5-7B/8e92dd9e-a68c-46ef-9b03-955c06a21437.json b/data/hfopenllm_v2/nguyentd/FinancialAdvice-Qwen2.5-7B/8e92dd9e-a68c-46ef-9b03-955c06a21437.json deleted file mode 100644 index 332c8a2d4..000000000 --- a/data/hfopenllm_v2/nguyentd/FinancialAdvice-Qwen2.5-7B/8e92dd9e-a68c-46ef-9b03-955c06a21437.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nguyentd_FinancialAdvice-Qwen2.5-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "FinancialAdvice-Qwen2.5-7B", - "id": "nguyentd/FinancialAdvice-Qwen2.5-7B", - "developer": "nguyentd", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4496 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4731 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1148 - } - }, - { - "evaluation_name": "GPQA", - 
"source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2945 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4025 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3752 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ngxson/MiniThinky-1B-Llama-3.2/dd1139d8-2b44-4516-b24a-1219826f5482.json b/data/hfopenllm_v2/ngxson/MiniThinky-1B-Llama-3.2/dd1139d8-2b44-4516-b24a-1219826f5482.json deleted file mode 100644 index 2e0f2504c..000000000 --- a/data/hfopenllm_v2/ngxson/MiniThinky-1B-Llama-3.2/dd1139d8-2b44-4516-b24a-1219826f5482.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ngxson_MiniThinky-1B-Llama-3.2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MiniThinky-1B-Llama-3.2", - "id": "ngxson/MiniThinky-1B-Llama-3.2", - "developer": "ngxson", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.236 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2771 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3142 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0574 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2391 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3434 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1147 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ngxson/MiniThinky-v2-1B-Llama-3.2/e37e86f7-b67b-4f0a-b1bd-92f30842b303.json b/data/hfopenllm_v2/ngxson/MiniThinky-v2-1B-Llama-3.2/e37e86f7-b67b-4f0a-b1bd-92f30842b303.json deleted file mode 100644 index 34a2fa568..000000000 --- a/data/hfopenllm_v2/ngxson/MiniThinky-v2-1B-Llama-3.2/e37e86f7-b67b-4f0a-b1bd-92f30842b303.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ngxson_MiniThinky-v2-1B-Llama-3.2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MiniThinky-v2-1B-Llama-3.2", - "id": "ngxson/MiniThinky-v2-1B-Llama-3.2", - "developer": "ngxson", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.236 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2963 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3205 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0287 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2399 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": 
"TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3356 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1116 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nhyha/N3N_Delirium-v1_1030_0227/bc3b55d5-35ca-48b5-832e-8544e145b1b1.json b/data/hfopenllm_v2/nhyha/N3N_Delirium-v1_1030_0227/bc3b55d5-35ca-48b5-832e-8544e145b1b1.json deleted file mode 100644 index c0a263780..000000000 --- a/data/hfopenllm_v2/nhyha/N3N_Delirium-v1_1030_0227/bc3b55d5-35ca-48b5-832e-8544e145b1b1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nhyha_N3N_Delirium-v1_1030_0227/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "N3N_Delirium-v1_1030_0227", - "id": "nhyha/N3N_Delirium-v1_1030_0227", - "developer": "nhyha", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8023 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5891 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2107 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3372 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4098 - 
} - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.415 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nhyha/N3N_Llama-3.1-8B-Instruct_1028_0216/5757cd3d-c64e-4743-8200-5e610e24bf95.json b/data/hfopenllm_v2/nhyha/N3N_Llama-3.1-8B-Instruct_1028_0216/5757cd3d-c64e-4743-8200-5e610e24bf95.json deleted file mode 100644 index 0ece90da0..000000000 --- a/data/hfopenllm_v2/nhyha/N3N_Llama-3.1-8B-Instruct_1028_0216/5757cd3d-c64e-4743-8200-5e610e24bf95.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nhyha_N3N_Llama-3.1-8B-Instruct_1028_0216/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "N3N_Llama-3.1-8B-Instruct_1028_0216", - "id": "nhyha/N3N_Llama-3.1-8B-Instruct_1028_0216", - "developer": "nhyha", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4796 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5054 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1707 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3062 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.405 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3638 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nhyha/N3N_gemma-2-9b-it_20241029_1532/ae8cd3ad-ce7b-41f4-8e4a-f11002af2e58.json b/data/hfopenllm_v2/nhyha/N3N_gemma-2-9b-it_20241029_1532/ae8cd3ad-ce7b-41f4-8e4a-f11002af2e58.json deleted file mode 100644 index 9ba31eded..000000000 --- a/data/hfopenllm_v2/nhyha/N3N_gemma-2-9b-it_20241029_1532/ae8cd3ad-ce7b-41f4-8e4a-f11002af2e58.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nhyha_N3N_gemma-2-9b-it_20241029_1532/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "N3N_gemma-2-9b-it_20241029_1532", - "id": "nhyha/N3N_gemma-2-9b-it_20241029_1532", - "developer": "nhyha", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6752 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5863 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2122 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3406 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4594 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.4122 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nhyha/N3N_gemma-2-9b-it_20241110_2026/bee54048-ebb2-4051-a18f-aa85b0f2ce27.json b/data/hfopenllm_v2/nhyha/N3N_gemma-2-9b-it_20241110_2026/bee54048-ebb2-4051-a18f-aa85b0f2ce27.json deleted file mode 100644 index 3720f36fb..000000000 --- a/data/hfopenllm_v2/nhyha/N3N_gemma-2-9b-it_20241110_2026/bee54048-ebb2-4051-a18f-aa85b0f2ce27.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nhyha_N3N_gemma-2-9b-it_20241110_2026/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "N3N_gemma-2-9b-it_20241110_2026", - "id": "nhyha/N3N_gemma-2-9b-it_20241110_2026", - "developer": "nhyha", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6283 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5867 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1609 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3364 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4073 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.402 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nhyha/merge_Qwen2.5-7B-Instruct_20241023_0314/2f98c85b-5a2e-467e-9626-b1bdefe7bdd7.json 
b/data/hfopenllm_v2/nhyha/merge_Qwen2.5-7B-Instruct_20241023_0314/2f98c85b-5a2e-467e-9626-b1bdefe7bdd7.json deleted file mode 100644 index f76d4c073..000000000 --- a/data/hfopenllm_v2/nhyha/merge_Qwen2.5-7B-Instruct_20241023_0314/2f98c85b-5a2e-467e-9626-b1bdefe7bdd7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nhyha_merge_Qwen2.5-7B-Instruct_20241023_0314/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "merge_Qwen2.5-7B-Instruct_20241023_0314", - "id": "nhyha/merge_Qwen2.5-7B-Instruct_20241023_0314", - "developer": "nhyha", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5695 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5559 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3542 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3213 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4251 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4542 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nidum/Nidum-Limitless-Gemma-2B/2c530a3b-888e-4a61-b97b-ea875b30ec9c.json b/data/hfopenllm_v2/nidum/Nidum-Limitless-Gemma-2B/2c530a3b-888e-4a61-b97b-ea875b30ec9c.json deleted file mode 100644 index e6ba40bf6..000000000 --- 
a/data/hfopenllm_v2/nidum/Nidum-Limitless-Gemma-2B/2c530a3b-888e-4a61-b97b-ea875b30ec9c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nidum_Nidum-Limitless-Gemma-2B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Nidum-Limitless-Gemma-2B", - "id": "nidum/Nidum-Limitless-Gemma-2B", - "developer": "nidum", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "GemmaForCausalLM", - "params_billions": 2.506 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2424 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3079 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0136 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2643 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.374 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1174 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nisten/franqwenstein-35b/4c9fb322-735e-4644-8121-088d00f78c5f.json b/data/hfopenllm_v2/nisten/franqwenstein-35b/4c9fb322-735e-4644-8121-088d00f78c5f.json deleted file mode 100644 index d683ac73a..000000000 --- a/data/hfopenllm_v2/nisten/franqwenstein-35b/4c9fb322-735e-4644-8121-088d00f78c5f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nisten_franqwenstein-35b/1770682486.623709", - "retrieved_timestamp": 
"1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "franqwenstein-35b", - "id": "nisten/franqwenstein-35b", - "developer": "nisten", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 34.714 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3799 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6647 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3406 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4035 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.494 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5731 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nisten/franqwenstein-35b/e7e7733f-682b-4e68-8f07-85f3ba7a7ae1.json b/data/hfopenllm_v2/nisten/franqwenstein-35b/e7e7733f-682b-4e68-8f07-85f3ba7a7ae1.json deleted file mode 100644 index 12f77c7a5..000000000 --- a/data/hfopenllm_v2/nisten/franqwenstein-35b/e7e7733f-682b-4e68-8f07-85f3ba7a7ae1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nisten_franqwenstein-35b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "franqwenstein-35b", - "id": 
"nisten/franqwenstein-35b", - "developer": "nisten", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 34.714 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3914 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6591 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3044 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3591 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4681 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5611 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nisten/tqwendo-36b/e9a4e1e2-bd55-4c3d-99eb-8fafd8f6ec44.json b/data/hfopenllm_v2/nisten/tqwendo-36b/e9a4e1e2-bd55-4c3d-99eb-8fafd8f6ec44.json deleted file mode 100644 index 5712a649d..000000000 --- a/data/hfopenllm_v2/nisten/tqwendo-36b/e9a4e1e2-bd55-4c3d-99eb-8fafd8f6ec44.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nisten_tqwendo-36b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "tqwendo-36b", - "id": "nisten/tqwendo-36b", - "developer": "nisten", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 35.69 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": 
"IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6778 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6432 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4154 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3314 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.443 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4381 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nlpguy/Lion-Lamarck-v.1.0.8/42ed92b3-63bc-4fa1-bc16-c19bfb73368f.json b/data/hfopenllm_v2/nlpguy/Lion-Lamarck-v.1.0.8/42ed92b3-63bc-4fa1-bc16-c19bfb73368f.json deleted file mode 100644 index b018debaf..000000000 --- a/data/hfopenllm_v2/nlpguy/Lion-Lamarck-v.1.0.8/42ed92b3-63bc-4fa1-bc16-c19bfb73368f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nlpguy_Lion-Lamarck-v.1.0.8/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Lion-Lamarck-v.1.0.8", - "id": "nlpguy/Lion-Lamarck-v.1.0.8", - "developer": "nlpguy", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.4509 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5869 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5544 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3582 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4673 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4643 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nlpguy/Lion-Lamarck-v.1.0.9/915ae579-786a-4eb2-a1bb-107a12c9c40d.json b/data/hfopenllm_v2/nlpguy/Lion-Lamarck-v.1.0.9/915ae579-786a-4eb2-a1bb-107a12c9c40d.json deleted file mode 100644 index e62e07a5e..000000000 --- a/data/hfopenllm_v2/nlpguy/Lion-Lamarck-v.1.0.9/915ae579-786a-4eb2-a1bb-107a12c9c40d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nlpguy_Lion-Lamarck-v.1.0.9/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Lion-Lamarck-v.1.0.9", - "id": "nlpguy/Lion-Lamarck-v.1.0.9", - "developer": "nlpguy", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3409 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5918 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5642 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3901 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.53 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4704 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nlpguy/Lion-Lamarck-v.1.1.0/3489ffea-a607-4f3d-a0c2-bd17147f244f.json b/data/hfopenllm_v2/nlpguy/Lion-Lamarck-v.1.1.0/3489ffea-a607-4f3d-a0c2-bd17147f244f.json deleted file mode 100644 index 4a3058b4b..000000000 --- a/data/hfopenllm_v2/nlpguy/Lion-Lamarck-v.1.1.0/3489ffea-a607-4f3d-a0c2-bd17147f244f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nlpguy_Lion-Lamarck-v.1.1.0/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Lion-Lamarck-v.1.1.0", - "id": "nlpguy/Lion-Lamarck-v.1.1.0", - "developer": "nlpguy", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3658 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5962 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": 
"hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5755 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3926 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5325 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4631 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nlpguy/Miisce-one/7b5ba8a8-16c3-4169-b97d-13dd5d4f8395.json b/data/hfopenllm_v2/nlpguy/Miisce-one/7b5ba8a8-16c3-4169-b97d-13dd5d4f8395.json deleted file mode 100644 index 39e5855db..000000000 --- a/data/hfopenllm_v2/nlpguy/Miisce-one/7b5ba8a8-16c3-4169-b97d-13dd5d4f8395.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nlpguy_Miisce-one/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Miisce-one", - "id": "nlpguy/Miisce-one", - "developer": "nlpguy", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6066 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6505 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4169 - } - }, - { - 
"evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3859 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.482 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5412 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nlpguy/Mistral-NeMo-Minitron-Upscale-v1/6411c44a-b2b3-4fe3-8ba4-9422a0a0b31e.json b/data/hfopenllm_v2/nlpguy/Mistral-NeMo-Minitron-Upscale-v1/6411c44a-b2b3-4fe3-8ba4-9422a0a0b31e.json deleted file mode 100644 index 823d8c02e..000000000 --- a/data/hfopenllm_v2/nlpguy/Mistral-NeMo-Minitron-Upscale-v1/6411c44a-b2b3-4fe3-8ba4-9422a0a0b31e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nlpguy_Mistral-NeMo-Minitron-Upscale-v1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral-NeMo-Minitron-Upscale-v1", - "id": "nlpguy/Mistral-NeMo-Minitron-Upscale-v1", - "developer": "nlpguy", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.451 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1648 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4468 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0144 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2802 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3804 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2537 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nlpguy/Mistral-NeMo-Minitron-Upscale-v2/fe344f84-7428-45af-940f-736275bc4d50.json b/data/hfopenllm_v2/nlpguy/Mistral-NeMo-Minitron-Upscale-v2/fe344f84-7428-45af-940f-736275bc4d50.json deleted file mode 100644 index 5887f8230..000000000 --- a/data/hfopenllm_v2/nlpguy/Mistral-NeMo-Minitron-Upscale-v2/fe344f84-7428-45af-940f-736275bc4d50.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nlpguy_Mistral-NeMo-Minitron-Upscale-v2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral-NeMo-Minitron-Upscale-v2", - "id": "nlpguy/Mistral-NeMo-Minitron-Upscale-v2", - "developer": "nlpguy", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.451 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1573 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.395 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0128 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 
0.2735 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3791 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1927 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nlpguy/Mistral-NeMo-Minitron-Upscale-v3/60956ea2-8b0b-4e4b-801a-d0689f9d46f4.json b/data/hfopenllm_v2/nlpguy/Mistral-NeMo-Minitron-Upscale-v3/60956ea2-8b0b-4e4b-801a-d0689f9d46f4.json deleted file mode 100644 index f652b2940..000000000 --- a/data/hfopenllm_v2/nlpguy/Mistral-NeMo-Minitron-Upscale-v3/60956ea2-8b0b-4e4b-801a-d0689f9d46f4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nlpguy_Mistral-NeMo-Minitron-Upscale-v3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral-NeMo-Minitron-Upscale-v3", - "id": "nlpguy/Mistral-NeMo-Minitron-Upscale-v3", - "developer": "nlpguy", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.451 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1412 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3052 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0113 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2592 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4098 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1171 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nlpguy/StableProse/1ad54bdc-419a-4dd9-9fbb-d7b7ee7038d1.json b/data/hfopenllm_v2/nlpguy/StableProse/1ad54bdc-419a-4dd9-9fbb-d7b7ee7038d1.json deleted file mode 100644 index 7c16d0d96..000000000 --- a/data/hfopenllm_v2/nlpguy/StableProse/1ad54bdc-419a-4dd9-9fbb-d7b7ee7038d1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nlpguy_StableProse/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "StableProse", - "id": "nlpguy/StableProse", - "developer": "nlpguy", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1972 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5117 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.065 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3029 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4067 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": 
"hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3468 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nlpguy/StarFusion-alpha1/2ab375f0-2477-48a5-a5d9-0b5d0d7d0a84.json b/data/hfopenllm_v2/nlpguy/StarFusion-alpha1/2ab375f0-2477-48a5-a5d9-0b5d0d7d0a84.json deleted file mode 100644 index 3dae9ae39..000000000 --- a/data/hfopenllm_v2/nlpguy/StarFusion-alpha1/2ab375f0-2477-48a5-a5d9-0b5d0d7d0a84.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nlpguy_StarFusion-alpha1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "StarFusion-alpha1", - "id": "nlpguy/StarFusion-alpha1", - "developer": "nlpguy", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.566 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4429 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0718 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2953 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4081 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3191 - } - } - 
] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/noname0202/Llama-3.2-4x3B-Instruct/e0525a52-d38c-4b2f-b59b-048b4bf71cb2.json b/data/hfopenllm_v2/noname0202/Llama-3.2-4x3B-Instruct/e0525a52-d38c-4b2f-b59b-048b4bf71cb2.json deleted file mode 100644 index d9ea08253..000000000 --- a/data/hfopenllm_v2/noname0202/Llama-3.2-4x3B-Instruct/e0525a52-d38c-4b2f-b59b-048b4bf71cb2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/noname0202_Llama-3.2-4x3B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.2-4x3B-Instruct", - "id": "noname0202/Llama-3.2-4x3B-Instruct", - "developer": "noname0202", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MixtralForCausalLM", - "params_billions": 9.949 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7067 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4647 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1586 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2727 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3674 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3285 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/noname0202/gemma-2-2b-it-ties/01bc964f-552b-4cda-9ed0-cf720f0c8de4.json 
b/data/hfopenllm_v2/noname0202/gemma-2-2b-it-ties/01bc964f-552b-4cda-9ed0-cf720f0c8de4.json deleted file mode 100644 index 51a2199d0..000000000 --- a/data/hfopenllm_v2/noname0202/gemma-2-2b-it-ties/01bc964f-552b-4cda-9ed0-cf720f0c8de4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/noname0202_gemma-2-2b-it-ties/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "gemma-2-2b-it-ties", - "id": "noname0202/gemma-2-2b-it-ties", - "developer": "noname0202", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 2.614 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1266 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4206 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0242 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2701 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3929 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2561 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/noname0202/gemma-2-9b-sft-jp-en-zh-v1/c9e95c55-978e-485b-8a77-ab2e668e3254.json b/data/hfopenllm_v2/noname0202/gemma-2-9b-sft-jp-en-zh-v1/c9e95c55-978e-485b-8a77-ab2e668e3254.json deleted file mode 100644 index 088f2166d..000000000 --- 
a/data/hfopenllm_v2/noname0202/gemma-2-9b-sft-jp-en-zh-v1/c9e95c55-978e-485b-8a77-ab2e668e3254.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/noname0202_gemma-2-9b-sft-jp-en-zh-v1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "gemma-2-9b-sft-jp-en-zh-v1", - "id": "noname0202/gemma-2-9b-sft-jp-en-zh-v1", - "developer": "noname0202", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 9.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2988 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4519 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0891 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.307 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.408 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3125 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/noname0202/gemma-2-9b-sft-jp-en-zh-v2/c71c606b-ccb7-48e9-a6c8-b72205ec6c06.json b/data/hfopenllm_v2/noname0202/gemma-2-9b-sft-jp-en-zh-v2/c71c606b-ccb7-48e9-a6c8-b72205ec6c06.json deleted file mode 100644 index 26c8c9012..000000000 --- a/data/hfopenllm_v2/noname0202/gemma-2-9b-sft-jp-en-zh-v2/c71c606b-ccb7-48e9-a6c8-b72205ec6c06.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/noname0202_gemma-2-9b-sft-jp-en-zh-v2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "gemma-2-9b-sft-jp-en-zh-v2", - "id": "noname0202/gemma-2-9b-sft-jp-en-zh-v2", - "developer": "noname0202", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 9.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3993 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4515 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1042 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2878 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3612 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3675 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/noname0202/llama-math-1b-r16-0to512tokens-test/ae1801cb-d112-4d1a-895d-c6743779846a.json b/data/hfopenllm_v2/noname0202/llama-math-1b-r16-0to512tokens-test/ae1801cb-d112-4d1a-895d-c6743779846a.json deleted file mode 100644 index ff9746f84..000000000 --- a/data/hfopenllm_v2/noname0202/llama-math-1b-r16-0to512tokens-test/ae1801cb-d112-4d1a-895d-c6743779846a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/noname0202_llama-math-1b-r16-0to512tokens-test/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF 
Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "llama-math-1b-r16-0to512tokens-test", - "id": "noname0202/llama-math-1b-r16-0to512tokens-test", - "developer": "noname0202", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.236 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.547 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3488 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0816 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2668 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3143 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1728 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/noname0202/llama-math-1b-r32-0to512tokens-test/008e3601-dfc4-4bc1-bf8b-f5cef43ae098.json b/data/hfopenllm_v2/noname0202/llama-math-1b-r32-0to512tokens-test/008e3601-dfc4-4bc1-bf8b-f5cef43ae098.json deleted file mode 100644 index d00eac878..000000000 --- a/data/hfopenllm_v2/noname0202/llama-math-1b-r32-0to512tokens-test/008e3601-dfc4-4bc1-bf8b-f5cef43ae098.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/noname0202_llama-math-1b-r32-0to512tokens-test/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - 
"model_info": { - "name": "llama-math-1b-r32-0to512tokens-test", - "id": "noname0202/llama-math-1b-r32-0to512tokens-test", - "developer": "noname0202", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.236 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5683 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3495 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0906 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2651 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3209 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.176 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/noname0202/llama-math-1b-r32-test/379b315d-96fb-4edb-b2d6-3dc113a10c17.json b/data/hfopenllm_v2/noname0202/llama-math-1b-r32-test/379b315d-96fb-4edb-b2d6-3dc113a10c17.json deleted file mode 100644 index 442e75821..000000000 --- a/data/hfopenllm_v2/noname0202/llama-math-1b-r32-test/379b315d-96fb-4edb-b2d6-3dc113a10c17.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/noname0202_llama-math-1b-r32-test/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "llama-math-1b-r32-test", - "id": "noname0202/llama-math-1b-r32-test", - "developer": "noname0202", - "inference_platform": "unknown", - "additional_details": { - 
"precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.236 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5819 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3486 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0725 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2617 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3156 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1781 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/noname0202/llama-math-1b-r8-512tokens-test/8cd36aa1-6f87-4d4d-a1bf-adc87e0a26c6.json b/data/hfopenllm_v2/noname0202/llama-math-1b-r8-512tokens-test/8cd36aa1-6f87-4d4d-a1bf-adc87e0a26c6.json deleted file mode 100644 index 982f70f06..000000000 --- a/data/hfopenllm_v2/noname0202/llama-math-1b-r8-512tokens-test/8cd36aa1-6f87-4d4d-a1bf-adc87e0a26c6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/noname0202_llama-math-1b-r8-512tokens-test/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "llama-math-1b-r8-512tokens-test", - "id": "noname0202/llama-math-1b-r8-512tokens-test", - "developer": "noname0202", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.236 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - 
"source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5792 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3496 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0816 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2685 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3169 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1753 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/notbdq/Qwen2.5-14B-Instruct-1M-GRPO-Reasoning/f76ce244-29f7-44f0-9850-7291f8e4cbf1.json b/data/hfopenllm_v2/notbdq/Qwen2.5-14B-Instruct-1M-GRPO-Reasoning/f76ce244-29f7-44f0-9850-7291f8e4cbf1.json deleted file mode 100644 index 92f477fa6..000000000 --- a/data/hfopenllm_v2/notbdq/Qwen2.5-14B-Instruct-1M-GRPO-Reasoning/f76ce244-29f7-44f0-9850-7291f8e4cbf1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/notbdq_Qwen2.5-14B-Instruct-1M-GRPO-Reasoning/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-14B-Instruct-1M-GRPO-Reasoning", - "id": "notbdq/Qwen2.5-14B-Instruct-1M-GRPO-Reasoning", - "developer": "notbdq", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8414 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6198 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5302 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3431 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.418 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.485 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nothingiisreal/L3.1-8B-Celeste-V1.5/506871f1-0c87-4e8c-a270-eed7b5da2599.json b/data/hfopenllm_v2/nothingiisreal/L3.1-8B-Celeste-V1.5/506871f1-0c87-4e8c-a270-eed7b5da2599.json deleted file mode 100644 index 6a557a511..000000000 --- a/data/hfopenllm_v2/nothingiisreal/L3.1-8B-Celeste-V1.5/506871f1-0c87-4e8c-a270-eed7b5da2599.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nothingiisreal_L3.1-8B-Celeste-V1.5/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "L3.1-8B-Celeste-V1.5", - "id": "nothingiisreal/L3.1-8B-Celeste-V1.5", - "developer": "nothingiisreal", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7327 - } - }, - { - 
"evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5012 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1465 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2844 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3749 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3704 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nothingiisreal/MN-12B-Starcannon-v2/c20264fd-b1f9-4e0f-9f6e-1d58f1c18cda.json b/data/hfopenllm_v2/nothingiisreal/MN-12B-Starcannon-v2/c20264fd-b1f9-4e0f-9f6e-1d58f1c18cda.json deleted file mode 100644 index b14841c8a..000000000 --- a/data/hfopenllm_v2/nothingiisreal/MN-12B-Starcannon-v2/c20264fd-b1f9-4e0f-9f6e-1d58f1c18cda.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nothingiisreal_MN-12B-Starcannon-v2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MN-12B-Starcannon-v2", - "id": "nothingiisreal/MN-12B-Starcannon-v2", - "developer": "nothingiisreal", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3925 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5004 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0597 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2785 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3978 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3128 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nothingiisreal/MN-12B-Starcannon-v3/59f14dca-923a-41f1-b443-cc3551063f45.json b/data/hfopenllm_v2/nothingiisreal/MN-12B-Starcannon-v3/59f14dca-923a-41f1-b443-cc3551063f45.json deleted file mode 100644 index 9f9ddf54f..000000000 --- a/data/hfopenllm_v2/nothingiisreal/MN-12B-Starcannon-v3/59f14dca-923a-41f1-b443-cc3551063f45.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nothingiisreal_MN-12B-Starcannon-v3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MN-12B-Starcannon-v3", - "id": "nothingiisreal/MN-12B-Starcannon-v3", - "developer": "nothingiisreal", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3807 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5171 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { 
- "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0778 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2735 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4046 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3265 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nvidia/AceInstruct-1.5B/a1ba054f-b0a1-4827-b7ea-3988aa4cf1f1.json b/data/hfopenllm_v2/nvidia/AceInstruct-1.5B/a1ba054f-b0a1-4827-b7ea-3988aa4cf1f1.json deleted file mode 100644 index e9741ba03..000000000 --- a/data/hfopenllm_v2/nvidia/AceInstruct-1.5B/a1ba054f-b0a1-4827-b7ea-3988aa4cf1f1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nvidia_AceInstruct-1.5B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "AceInstruct-1.5B", - "id": "nvidia/AceInstruct-1.5B", - "developer": "nvidia", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.777 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3948 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3932 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.3127 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2718 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.346 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2574 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nvidia/AceInstruct-72B/51d8f53f-ad7e-4dae-9e2a-0895729ff790.json b/data/hfopenllm_v2/nvidia/AceInstruct-72B/51d8f53f-ad7e-4dae-9e2a-0895729ff790.json deleted file mode 100644 index 589e694b8..000000000 --- a/data/hfopenllm_v2/nvidia/AceInstruct-72B/51d8f53f-ad7e-4dae-9e2a-0895729ff790.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nvidia_AceInstruct-72B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "AceInstruct-72B", - "id": "nvidia/AceInstruct-72B", - "developer": "nvidia", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 72.706 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7119 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6139 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6261 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3213 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4206 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4874 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nvidia/AceInstruct-7B/421119ea-0da8-4b26-a335-f2e720618c44.json b/data/hfopenllm_v2/nvidia/AceInstruct-7B/421119ea-0da8-4b26-a335-f2e720618c44.json deleted file mode 100644 index 50b4c8bff..000000000 --- a/data/hfopenllm_v2/nvidia/AceInstruct-7B/421119ea-0da8-4b26-a335-f2e720618c44.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nvidia_AceInstruct-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "AceInstruct-7B", - "id": "nvidia/AceInstruct-7B", - "developer": "nvidia", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5422 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5501 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5295 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.307 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4255 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4177 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nvidia/AceMath-1.5B-Instruct/b0e6bfb2-a8d4-4b1d-859a-aa821f646e57.json b/data/hfopenllm_v2/nvidia/AceMath-1.5B-Instruct/b0e6bfb2-a8d4-4b1d-859a-aa821f646e57.json deleted file mode 100644 index cd6ae73f6..000000000 --- a/data/hfopenllm_v2/nvidia/AceMath-1.5B-Instruct/b0e6bfb2-a8d4-4b1d-859a-aa821f646e57.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nvidia_AceMath-1.5B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "AceMath-1.5B-Instruct", - "id": "nvidia/AceMath-1.5B-Instruct", - "developer": "nvidia", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.777 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3212 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4024 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5287 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2743 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3607 - } - }, - { - "evaluation_name": "MMLU-PRO", 
- "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2064 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nvidia/AceMath-72B-Instruct/7c4c2ccf-7d7b-4d24-802e-20c182290d07.json b/data/hfopenllm_v2/nvidia/AceMath-72B-Instruct/7c4c2ccf-7d7b-4d24-802e-20c182290d07.json deleted file mode 100644 index 8c57ec762..000000000 --- a/data/hfopenllm_v2/nvidia/AceMath-72B-Instruct/7c4c2ccf-7d7b-4d24-802e-20c182290d07.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nvidia_AceMath-72B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "AceMath-72B-Instruct", - "id": "nvidia/AceMath-72B-Instruct", - "developer": "nvidia", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 72.706 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.495 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6402 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7145 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.271 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4062 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4411 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nvidia/AceMath-72B-RM/95212a55-f382-4869-9e11-cfa201ba865b.json b/data/hfopenllm_v2/nvidia/AceMath-72B-RM/95212a55-f382-4869-9e11-cfa201ba865b.json deleted file mode 100644 index a94d6d7b0..000000000 --- a/data/hfopenllm_v2/nvidia/AceMath-72B-RM/95212a55-f382-4869-9e11-cfa201ba865b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nvidia_AceMath-72B-RM/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "AceMath-72B-RM", - "id": "nvidia/AceMath-72B-RM", - "developer": "nvidia", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForSequenceClassification", - "params_billions": 71.461 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1413 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2717 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2341 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3351 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1179 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nvidia/AceMath-7B-Instruct/a7da2118-063c-489f-bb31-40f1b7beeefe.json 
b/data/hfopenllm_v2/nvidia/AceMath-7B-Instruct/a7da2118-063c-489f-bb31-40f1b7beeefe.json deleted file mode 100644 index 66e2a33e8..000000000 --- a/data/hfopenllm_v2/nvidia/AceMath-7B-Instruct/a7da2118-063c-489f-bb31-40f1b7beeefe.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nvidia_AceMath-7B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "AceMath-7B-Instruct", - "id": "nvidia/AceMath-7B-Instruct", - "developer": "nvidia", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4532 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4994 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6337 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2919 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4193 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3383 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nvidia/AceMath-7B-RM/9a75ae18-8f9a-40a5-8a7b-0c38df34e9dd.json b/data/hfopenllm_v2/nvidia/AceMath-7B-RM/9a75ae18-8f9a-40a5-8a7b-0c38df34e9dd.json deleted file mode 100644 index 9eb39d05c..000000000 --- a/data/hfopenllm_v2/nvidia/AceMath-7B-RM/9a75ae18-8f9a-40a5-8a7b-0c38df34e9dd.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - 
"schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nvidia_AceMath-7B-RM/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "AceMath-7B-RM", - "id": "nvidia/AceMath-7B-RM", - "developer": "nvidia", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForSequenceClassification", - "params_billions": 7.071 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1494 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2423 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2458 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.358 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1139 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nvidia/Hymba-1.5B-Base/a85d4a1f-fbd9-4d21-9700-9e55e30c1391.json b/data/hfopenllm_v2/nvidia/Hymba-1.5B-Base/a85d4a1f-fbd9-4d21-9700-9e55e30c1391.json deleted file mode 100644 index f9cc991e3..000000000 --- a/data/hfopenllm_v2/nvidia/Hymba-1.5B-Base/a85d4a1f-fbd9-4d21-9700-9e55e30c1391.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nvidia_Hymba-1.5B-Base/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging 
Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Hymba-1.5B-Base", - "id": "nvidia/Hymba-1.5B-Base", - "developer": "nvidia", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "HymbaForCausalLM", - "params_billions": 1.523 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2295 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3256 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0136 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2559 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3566 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1922 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nvidia/Hymba-1.5B-Instruct/2fd1c45e-209c-43da-ae85-d60887513a96.json b/data/hfopenllm_v2/nvidia/Hymba-1.5B-Instruct/2fd1c45e-209c-43da-ae85-d60887513a96.json deleted file mode 100644 index 9057e3697..000000000 --- a/data/hfopenllm_v2/nvidia/Hymba-1.5B-Instruct/2fd1c45e-209c-43da-ae85-d60887513a96.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nvidia_Hymba-1.5B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Hymba-1.5B-Instruct", - "id": "nvidia/Hymba-1.5B-Instruct", - "developer": "nvidia", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": 
"HymbaForCausalLM", - "params_billions": 1.523 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6009 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3067 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0272 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2886 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3316 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.204 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nvidia/Llama-3.1-Minitron-4B-Depth-Base/91e0e6aa-b933-4a02-a28d-8d69e698c60a.json b/data/hfopenllm_v2/nvidia/Llama-3.1-Minitron-4B-Depth-Base/91e0e6aa-b933-4a02-a28d-8d69e698c60a.json deleted file mode 100644 index cafbeee97..000000000 --- a/data/hfopenllm_v2/nvidia/Llama-3.1-Minitron-4B-Depth-Base/91e0e6aa-b933-4a02-a28d-8d69e698c60a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nvidia_Llama-3.1-Minitron-4B-Depth-Base/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.1-Minitron-4B-Depth-Base", - "id": "nvidia/Llama-3.1-Minitron-4B-Depth-Base", - "developer": "nvidia", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 4.02 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": 
"hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1607 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4171 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0196 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2634 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4011 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2798 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nvidia/Llama-3.1-Nemotron-70B-Instruct-HF/6f3f3d06-2937-4c55-9b95-a62ae5253571.json b/data/hfopenllm_v2/nvidia/Llama-3.1-Nemotron-70B-Instruct-HF/6f3f3d06-2937-4c55-9b95-a62ae5253571.json deleted file mode 100644 index 666f668f2..000000000 --- a/data/hfopenllm_v2/nvidia/Llama-3.1-Nemotron-70B-Instruct-HF/6f3f3d06-2937-4c55-9b95-a62ae5253571.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nvidia_Llama-3.1-Nemotron-70B-Instruct-HF/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.1-Nemotron-70B-Instruct-HF", - "id": "nvidia/Llama-3.1-Nemotron-70B-Instruct-HF", - "developer": "nvidia", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 70.554 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7381 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6316 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4267 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2584 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4328 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4919 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nvidia/Minitron-4B-Base/9b3ffdd3-ac18-4084-9e83-1bfc61db0ec2.json b/data/hfopenllm_v2/nvidia/Minitron-4B-Base/9b3ffdd3-ac18-4084-9e83-1bfc61db0ec2.json deleted file mode 100644 index 2e2aa65ef..000000000 --- a/data/hfopenllm_v2/nvidia/Minitron-4B-Base/9b3ffdd3-ac18-4084-9e83-1bfc61db0ec2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nvidia_Minitron-4B-Base/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Minitron-4B-Base", - "id": "nvidia/Minitron-4B-Base", - "developer": "nvidia", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "NemotronForCausalLM", - "params_billions": 4.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2218 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4084 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0196 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2693 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4134 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.262 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nvidia/Minitron-8B-Base/60077cbd-87af-4a00-a359-9235acb011ed.json b/data/hfopenllm_v2/nvidia/Minitron-8B-Base/60077cbd-87af-4a00-a359-9235acb011ed.json deleted file mode 100644 index e6699dedf..000000000 --- a/data/hfopenllm_v2/nvidia/Minitron-8B-Base/60077cbd-87af-4a00-a359-9235acb011ed.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nvidia_Minitron-8B-Base/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Minitron-8B-Base", - "id": "nvidia/Minitron-8B-Base", - "developer": "nvidia", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "NemotronForCausalLM", - "params_billions": 7.22 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2424 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4395 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH 
Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0257 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2735 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4026 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3181 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nvidia/Mistral-NeMo-Minitron-8B-Base/577936a8-b450-4233-b633-064565b3d1a4.json b/data/hfopenllm_v2/nvidia/Mistral-NeMo-Minitron-8B-Base/577936a8-b450-4233-b633-064565b3d1a4.json deleted file mode 100644 index b690a4bd6..000000000 --- a/data/hfopenllm_v2/nvidia/Mistral-NeMo-Minitron-8B-Base/577936a8-b450-4233-b633-064565b3d1a4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nvidia_Mistral-NeMo-Minitron-8B-Base/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral-NeMo-Minitron-8B-Base", - "id": "nvidia/Mistral-NeMo-Minitron-8B-Base", - "developer": "nvidia", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.88 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1946 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5219 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, 
- "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0483 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3255 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4092 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3796 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nvidia/Mistral-NeMo-Minitron-8B-Instruct/470b9413-2cc8-4bf4-9e7c-0b8e99929568.json b/data/hfopenllm_v2/nvidia/Mistral-NeMo-Minitron-8B-Instruct/470b9413-2cc8-4bf4-9e7c-0b8e99929568.json deleted file mode 100644 index 964990218..000000000 --- a/data/hfopenllm_v2/nvidia/Mistral-NeMo-Minitron-8B-Instruct/470b9413-2cc8-4bf4-9e7c-0b8e99929568.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nvidia_Mistral-NeMo-Minitron-8B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral-NeMo-Minitron-8B-Instruct", - "id": "nvidia/Mistral-NeMo-Minitron-8B-Instruct", - "developer": "nvidia", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 8.414 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5004 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5321 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1163 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - 
"dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2878 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3886 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3991 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nvidia/Nemotron-Mini-4B-Instruct/3cbf9c73-0dc8-402e-bc94-c6d52b9f1af7.json b/data/hfopenllm_v2/nvidia/Nemotron-Mini-4B-Instruct/3cbf9c73-0dc8-402e-bc94-c6d52b9f1af7.json deleted file mode 100644 index 95835d5c7..000000000 --- a/data/hfopenllm_v2/nvidia/Nemotron-Mini-4B-Instruct/3cbf9c73-0dc8-402e-bc94-c6d52b9f1af7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nvidia_Nemotron-Mini-4B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Nemotron-Mini-4B-Instruct", - "id": "nvidia/Nemotron-Mini-4B-Instruct", - "developer": "nvidia", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "NemotronForCausalLM", - "params_billions": 4.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6669 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3865 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0257 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2802 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3767 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2626 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nvidia/OpenMath2-Llama3.1-8B/3fccb1d0-5ae1-427a-adae-37004ecbacaa.json b/data/hfopenllm_v2/nvidia/OpenMath2-Llama3.1-8B/3fccb1d0-5ae1-427a-adae-37004ecbacaa.json deleted file mode 100644 index 2fec4ab85..000000000 --- a/data/hfopenllm_v2/nvidia/OpenMath2-Llama3.1-8B/3fccb1d0-5ae1-427a-adae-37004ecbacaa.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nvidia_OpenMath2-Llama3.1-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "OpenMath2-Llama3.1-8B", - "id": "nvidia/OpenMath2-Llama3.1-8B", - "developer": "nvidia", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2331 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4096 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2674 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2651 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3436 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1553 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/nxmwxm/Beast-Soul-new/6463183f-4043-4b96-b4d1-0bd41b4d6876.json b/data/hfopenllm_v2/nxmwxm/Beast-Soul-new/6463183f-4043-4b96-b4d1-0bd41b4d6876.json deleted file mode 100644 index 63096f4f3..000000000 --- a/data/hfopenllm_v2/nxmwxm/Beast-Soul-new/6463183f-4043-4b96-b4d1-0bd41b4d6876.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/nxmwxm_Beast-Soul-new/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Beast-Soul-new", - "id": "nxmwxm/Beast-Soul-new", - "developer": "nxmwxm", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4869 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5227 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.074 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2819 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4459 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - 
"source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3102 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/occiglot/occiglot-7b-es-en-instruct/0b102423-1a06-4e5b-a287-710695658b63.json b/data/hfopenllm_v2/occiglot/occiglot-7b-es-en-instruct/0b102423-1a06-4e5b-a287-710695658b63.json deleted file mode 100644 index 14b07b1d6..000000000 --- a/data/hfopenllm_v2/occiglot/occiglot-7b-es-en-instruct/0b102423-1a06-4e5b-a287-710695658b63.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/occiglot_occiglot-7b-es-en-instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "occiglot-7b-es-en-instruct", - "id": "occiglot/occiglot-7b-es-en-instruct", - "developer": "occiglot", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3485 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4111 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0242 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2592 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3738 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2311 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/odyssey-labs/Astral-1-10B/b7e4ffd8-2a5a-4364-844a-a308dd7c899c.json b/data/hfopenllm_v2/odyssey-labs/Astral-1-10B/b7e4ffd8-2a5a-4364-844a-a308dd7c899c.json deleted file mode 100644 index 81a00bf26..000000000 --- a/data/hfopenllm_v2/odyssey-labs/Astral-1-10B/b7e4ffd8-2a5a-4364-844a-a308dd7c899c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/odyssey-labs_Astral-1-10B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Astral-1-10B", - "id": "odyssey-labs/Astral-1-10B", - "developer": "odyssey-labs", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 10.732 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3878 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4873 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0347 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3054 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.428 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2985 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/olabs-ai/reflection_model/3fa2e3ef-a375-4ca5-9f85-7cb986313d53.json 
b/data/hfopenllm_v2/olabs-ai/reflection_model/3fa2e3ef-a375-4ca5-9f85-7cb986313d53.json deleted file mode 100644 index 619a40e89..000000000 --- a/data/hfopenllm_v2/olabs-ai/reflection_model/3fa2e3ef-a375-4ca5-9f85-7cb986313d53.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/olabs-ai_reflection_model/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "reflection_model", - "id": "olabs-ai/reflection_model", - "developer": "olabs-ai", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "?", - "params_billions": 9.3 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1599 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4713 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0514 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3003 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3508 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3311 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ontocord/Llama_3.2_1b-autoredteam_helpfulness-train/abd48d9d-0443-40be-a23a-68922771e14f.json b/data/hfopenllm_v2/ontocord/Llama_3.2_1b-autoredteam_helpfulness-train/abd48d9d-0443-40be-a23a-68922771e14f.json deleted file mode 100644 index 03e54e74a..000000000 --- 
a/data/hfopenllm_v2/ontocord/Llama_3.2_1b-autoredteam_helpfulness-train/abd48d9d-0443-40be-a23a-68922771e14f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ontocord_Llama_3.2_1b-autoredteam_helpfulness-train/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama_3.2_1b-autoredteam_helpfulness-train", - "id": "ontocord/Llama_3.2_1b-autoredteam_helpfulness-train", - "developer": "ontocord", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.498 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2765 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3115 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0166 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2592 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3459 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1132 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ontocord/RedPajama-3B-v1-AutoRedteam-Harmless-only/436ff0a4-9907-4e56-a5f2-c97f1b13f81a.json b/data/hfopenllm_v2/ontocord/RedPajama-3B-v1-AutoRedteam-Harmless-only/436ff0a4-9907-4e56-a5f2-c97f1b13f81a.json deleted file mode 100644 index 070fd99d6..000000000 --- a/data/hfopenllm_v2/ontocord/RedPajama-3B-v1-AutoRedteam-Harmless-only/436ff0a4-9907-4e56-a5f2-c97f1b13f81a.json +++ 
/dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ontocord_RedPajama-3B-v1-AutoRedteam-Harmless-only/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "RedPajama-3B-v1-AutoRedteam-Harmless-only", - "id": "ontocord/RedPajama-3B-v1-AutoRedteam-Harmless-only", - "developer": "ontocord", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "GPTNeoXForCausalLM", - "params_billions": 2.776 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1525 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3124 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.006 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2315 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3661 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.11 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ontocord/RedPajama-3B-v1-AutoRedteam/7a654100-b206-4011-828e-fb386df27d0c.json b/data/hfopenllm_v2/ontocord/RedPajama-3B-v1-AutoRedteam/7a654100-b206-4011-828e-fb386df27d0c.json deleted file mode 100644 index 20317204f..000000000 --- a/data/hfopenllm_v2/ontocord/RedPajama-3B-v1-AutoRedteam/7a654100-b206-4011-828e-fb386df27d0c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ontocord_RedPajama-3B-v1-AutoRedteam/1770682486.623709", - 
"retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "RedPajama-3B-v1-AutoRedteam", - "id": "ontocord/RedPajama-3B-v1-AutoRedteam", - "developer": "ontocord", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "GPTNeoXForCausalLM", - "params_billions": 2.776 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1343 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3026 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0091 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2424 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3661 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1108 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ontocord/RedPajama3b_v1-autoredteam_helpfulness-train/2f0e262c-a099-41f4-89f1-8b251708a960.json b/data/hfopenllm_v2/ontocord/RedPajama3b_v1-autoredteam_helpfulness-train/2f0e262c-a099-41f4-89f1-8b251708a960.json deleted file mode 100644 index b83d49b55..000000000 --- a/data/hfopenllm_v2/ontocord/RedPajama3b_v1-autoredteam_helpfulness-train/2f0e262c-a099-41f4-89f1-8b251708a960.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ontocord_RedPajama3b_v1-autoredteam_helpfulness-train/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", 
- "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "RedPajama3b_v1-autoredteam_helpfulness-train", - "id": "ontocord/RedPajama3b_v1-autoredteam_helpfulness-train", - "developer": "ontocord", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "GPTNeoXForCausalLM", - "params_billions": 2.776 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2848 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3093 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0068 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2458 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.358 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1107 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ontocord/merged_0.2_expert_0.8-stack_2x/7bf3e9ca-7d6f-4d43-b8fe-aceb8d60c7c6.json b/data/hfopenllm_v2/ontocord/merged_0.2_expert_0.8-stack_2x/7bf3e9ca-7d6f-4d43-b8fe-aceb8d60c7c6.json deleted file mode 100644 index 14485961f..000000000 --- a/data/hfopenllm_v2/ontocord/merged_0.2_expert_0.8-stack_2x/7bf3e9ca-7d6f-4d43-b8fe-aceb8d60c7c6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ontocord_merged_0.2_expert_0.8-stack_2x/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "merged_0.2_expert_0.8-stack_2x", - 
"id": "ontocord/merged_0.2_expert_0.8-stack_2x", - "developer": "ontocord", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 6.512 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1796 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3006 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0249 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2626 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3541 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1103 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ontocord/merged_0.2_expert_0.8/8703dbdd-12ef-457b-8cda-f570c8f5c890.json b/data/hfopenllm_v2/ontocord/merged_0.2_expert_0.8/8703dbdd-12ef-457b-8cda-f570c8f5c890.json deleted file mode 100644 index 4f9707321..000000000 --- a/data/hfopenllm_v2/ontocord/merged_0.2_expert_0.8/8703dbdd-12ef-457b-8cda-f570c8f5c890.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ontocord_merged_0.2_expert_0.8/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "merged_0.2_expert_0.8", - "id": "ontocord/merged_0.2_expert_0.8", - "developer": "ontocord", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.759 - } - }, - 
"evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1743 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3046 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0264 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2617 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3621 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1111 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ontocord/merged_0.5_expert_0.5/d77f3e8f-1eea-478e-babd-ba873d2d427c.json b/data/hfopenllm_v2/ontocord/merged_0.5_expert_0.5/d77f3e8f-1eea-478e-babd-ba873d2d427c.json deleted file mode 100644 index 3c97d6dc5..000000000 --- a/data/hfopenllm_v2/ontocord/merged_0.5_expert_0.5/d77f3e8f-1eea-478e-babd-ba873d2d427c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ontocord_merged_0.5_expert_0.5/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "merged_0.5_expert_0.5", - "id": "ontocord/merged_0.5_expert_0.5", - "developer": "ontocord", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.759 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", 
- "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1787 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3017 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0196 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2643 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3542 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1108 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ontocord/ontocord_wide_3b-stage1_shuf_sample1_jsonl-pretrained-autoredteam_helpful-0.25_helpful/783a4385-c802-4bb3-9a21-90629d16efc7.json b/data/hfopenllm_v2/ontocord/ontocord_wide_3b-stage1_shuf_sample1_jsonl-pretrained-autoredteam_helpful-0.25_helpful/783a4385-c802-4bb3-9a21-90629d16efc7.json deleted file mode 100644 index b166ddfd7..000000000 --- a/data/hfopenllm_v2/ontocord/ontocord_wide_3b-stage1_shuf_sample1_jsonl-pretrained-autoredteam_helpful-0.25_helpful/783a4385-c802-4bb3-9a21-90629d16efc7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ontocord_ontocord_wide_3b-stage1_shuf_sample1_jsonl-pretrained-autoredteam_helpful-0.25_helpful/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ontocord_wide_3b-stage1_shuf_sample1_jsonl-pretrained-autoredteam_helpful-0.25_helpful", - "id": "ontocord/ontocord_wide_3b-stage1_shuf_sample1_jsonl-pretrained-autoredteam_helpful-0.25_helpful", - "developer": "ontocord", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.759 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - 
"dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1318 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3004 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0106 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2676 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3631 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1142 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ontocord/ontocord_wide_7b-stacked-stage1-instruct/bb4ff51e-ce3a-42f5-871e-3e5e8977bc42.json b/data/hfopenllm_v2/ontocord/ontocord_wide_7b-stacked-stage1-instruct/bb4ff51e-ce3a-42f5-871e-3e5e8977bc42.json deleted file mode 100644 index 3d829740d..000000000 --- a/data/hfopenllm_v2/ontocord/ontocord_wide_7b-stacked-stage1-instruct/bb4ff51e-ce3a-42f5-871e-3e5e8977bc42.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ontocord_ontocord_wide_7b-stacked-stage1-instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ontocord_wide_7b-stacked-stage1-instruct", - "id": "ontocord/ontocord_wide_7b-stacked-stage1-instruct", - "developer": "ontocord", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.888 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.153 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2854 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0068 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2466 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3538 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1117 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ontocord/ontocord_wide_7b-stacked-stage1/e80d25b5-3f4b-45a7-9472-09f98db03bf0.json b/data/hfopenllm_v2/ontocord/ontocord_wide_7b-stacked-stage1/e80d25b5-3f4b-45a7-9472-09f98db03bf0.json deleted file mode 100644 index 0a16462de..000000000 --- a/data/hfopenllm_v2/ontocord/ontocord_wide_7b-stacked-stage1/e80d25b5-3f4b-45a7-9472-09f98db03bf0.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ontocord_ontocord_wide_7b-stacked-stage1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ontocord_wide_7b-stacked-stage1", - "id": "ontocord/ontocord_wide_7b-stacked-stage1", - "developer": "ontocord", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.888 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 
0.1485 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2897 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0091 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2534 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3604 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1105 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ontocord/starcoder2-29b-ls/7fed0b1d-0d79-4784-8fd6-42f8611b1751.json b/data/hfopenllm_v2/ontocord/starcoder2-29b-ls/7fed0b1d-0d79-4784-8fd6-42f8611b1751.json deleted file mode 100644 index 3cb91cf0f..000000000 --- a/data/hfopenllm_v2/ontocord/starcoder2-29b-ls/7fed0b1d-0d79-4784-8fd6-42f8611b1751.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ontocord_starcoder2-29b-ls/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "starcoder2-29b-ls", - "id": "ontocord/starcoder2-29b-ls", - "developer": "ontocord", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Starcoder2ForCausalLM", - "params_billions": 29.009 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2149 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3735 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0189 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2735 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.37 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1869 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ontocord/starcoder2_3b-AutoRedteam/be534cd3-8245-4370-ba6c-9687b431ee8d.json b/data/hfopenllm_v2/ontocord/starcoder2_3b-AutoRedteam/be534cd3-8245-4370-ba6c-9687b431ee8d.json deleted file mode 100644 index ad59c53df..000000000 --- a/data/hfopenllm_v2/ontocord/starcoder2_3b-AutoRedteam/be534cd3-8245-4370-ba6c-9687b431ee8d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ontocord_starcoder2_3b-AutoRedteam/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "starcoder2_3b-AutoRedteam", - "id": "ontocord/starcoder2_3b-AutoRedteam", - "developer": "ontocord", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Starcoder2ForCausalLM", - "params_billions": 3.181 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1574 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3498 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - 
"source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0106 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2517 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3646 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1336 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ontocord/wide_3b-merge_test/e98967b7-3aff-4baa-92eb-eff86bf09797.json b/data/hfopenllm_v2/ontocord/wide_3b-merge_test/e98967b7-3aff-4baa-92eb-eff86bf09797.json deleted file mode 100644 index e0c7c037c..000000000 --- a/data/hfopenllm_v2/ontocord/wide_3b-merge_test/e98967b7-3aff-4baa-92eb-eff86bf09797.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ontocord_wide_3b-merge_test/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "wide_3b-merge_test", - "id": "ontocord/wide_3b-merge_test", - "developer": "ontocord", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.759 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1763 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3011 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 
- }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2399 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.342 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1066 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ontocord/wide_3b-stage1_shuf_sample1_jsonl-pretrained/8736a22a-f980-4a01-953d-217f27050129.json b/data/hfopenllm_v2/ontocord/wide_3b-stage1_shuf_sample1_jsonl-pretrained/8736a22a-f980-4a01-953d-217f27050129.json deleted file mode 100644 index abe0c08b1..000000000 --- a/data/hfopenllm_v2/ontocord/wide_3b-stage1_shuf_sample1_jsonl-pretrained/8736a22a-f980-4a01-953d-217f27050129.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ontocord_wide_3b-stage1_shuf_sample1_jsonl-pretrained/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "wide_3b-stage1_shuf_sample1_jsonl-pretrained", - "id": "ontocord/wide_3b-stage1_shuf_sample1_jsonl-pretrained", - "developer": "ontocord", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.759 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1395 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3004 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0166 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - 
"dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2659 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3632 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.114 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ontocord/wide_3b_sft_stag1.2-lyrical_law_news_software_howto_formattedtext_math_wiki-merge/75a2b5c9-7c73-4bb4-8e99-af4a3a27589d.json b/data/hfopenllm_v2/ontocord/wide_3b_sft_stag1.2-lyrical_law_news_software_howto_formattedtext_math_wiki-merge/75a2b5c9-7c73-4bb4-8e99-af4a3a27589d.json deleted file mode 100644 index 129aad1c1..000000000 --- a/data/hfopenllm_v2/ontocord/wide_3b_sft_stag1.2-lyrical_law_news_software_howto_formattedtext_math_wiki-merge/75a2b5c9-7c73-4bb4-8e99-af4a3a27589d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ontocord_wide_3b_sft_stag1.2-lyrical_law_news_software_howto_formattedtext_math_wiki-merge/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "wide_3b_sft_stag1.2-lyrical_law_news_software_howto_formattedtext_math_wiki-merge", - "id": "ontocord/wide_3b_sft_stag1.2-lyrical_law_news_software_howto_formattedtext_math_wiki-merge", - "developer": "ontocord", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.759 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1664 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3031 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.0113 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2601 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3845 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1111 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ontocord/wide_3b_sft_stag1.2-lyrical_news_software_howto_formattedtext-merge/0e0ebdc7-a5bd-4314-9bd7-fc8a11541a4e.json b/data/hfopenllm_v2/ontocord/wide_3b_sft_stag1.2-lyrical_news_software_howto_formattedtext-merge/0e0ebdc7-a5bd-4314-9bd7-fc8a11541a4e.json deleted file mode 100644 index 2b49af2c2..000000000 --- a/data/hfopenllm_v2/ontocord/wide_3b_sft_stag1.2-lyrical_news_software_howto_formattedtext-merge/0e0ebdc7-a5bd-4314-9bd7-fc8a11541a4e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ontocord_wide_3b_sft_stag1.2-lyrical_news_software_howto_formattedtext-merge/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "wide_3b_sft_stag1.2-lyrical_news_software_howto_formattedtext-merge", - "id": "ontocord/wide_3b_sft_stag1.2-lyrical_news_software_howto_formattedtext-merge", - "developer": "ontocord", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.759 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1697 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2975 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0136 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2601 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3778 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1125 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ontocord/wide_3b_sft_stage1.1-ss1-no_redteam_skg_poem.no_issue/f8579305-003b-4727-b904-bad4f363a616.json b/data/hfopenllm_v2/ontocord/wide_3b_sft_stage1.1-ss1-no_redteam_skg_poem.no_issue/f8579305-003b-4727-b904-bad4f363a616.json deleted file mode 100644 index b29b83734..000000000 --- a/data/hfopenllm_v2/ontocord/wide_3b_sft_stage1.1-ss1-no_redteam_skg_poem.no_issue/f8579305-003b-4727-b904-bad4f363a616.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ontocord_wide_3b_sft_stage1.1-ss1-no_redteam_skg_poem.no_issue/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "wide_3b_sft_stage1.1-ss1-no_redteam_skg_poem.no_issue", - "id": "ontocord/wide_3b_sft_stage1.1-ss1-no_redteam_skg_poem.no_issue", - "developer": "ontocord", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.759 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.148 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3095 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 
1.0 - }, - "score_details": { - "score": 0.0204 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2701 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3579 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1108 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ontocord/wide_3b_sft_stage1.1-ss1-with_generics_intr.no_issue/3103f36a-4a88-4a39-8261-0b597f8d6db4.json b/data/hfopenllm_v2/ontocord/wide_3b_sft_stage1.1-ss1-with_generics_intr.no_issue/3103f36a-4a88-4a39-8261-0b597f8d6db4.json deleted file mode 100644 index e2c5d63f0..000000000 --- a/data/hfopenllm_v2/ontocord/wide_3b_sft_stage1.1-ss1-with_generics_intr.no_issue/3103f36a-4a88-4a39-8261-0b597f8d6db4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ontocord_wide_3b_sft_stage1.1-ss1-with_generics_intr.no_issue/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "wide_3b_sft_stage1.1-ss1-with_generics_intr.no_issue", - "id": "ontocord/wide_3b_sft_stage1.1-ss1-with_generics_intr.no_issue", - "developer": "ontocord", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.759 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1237 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.306 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0106 - } - }, 
- { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2743 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3673 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1111 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ontocord/wide_3b_sft_stage1.1-ss1-with_generics_intr_math.no_issue/eda9de3b-ae53-4102-b203-eddadbc50464.json b/data/hfopenllm_v2/ontocord/wide_3b_sft_stage1.1-ss1-with_generics_intr_math.no_issue/eda9de3b-ae53-4102-b203-eddadbc50464.json deleted file mode 100644 index d30f5ad9b..000000000 --- a/data/hfopenllm_v2/ontocord/wide_3b_sft_stage1.1-ss1-with_generics_intr_math.no_issue/eda9de3b-ae53-4102-b203-eddadbc50464.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ontocord_wide_3b_sft_stage1.1-ss1-with_generics_intr_math.no_issue/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "wide_3b_sft_stage1.1-ss1-with_generics_intr_math.no_issue", - "id": "ontocord/wide_3b_sft_stage1.1-ss1-with_generics_intr_math.no_issue", - "developer": "ontocord", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.759 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1192 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2956 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0068 - } - }, - { - "evaluation_name": 
"GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2643 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3553 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1183 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ontocord/wide_3b_sft_stage1.1-ss1-with_generics_intr_math_stories.no_issue/b7de4fa8-d97d-400f-bc3f-ecb1963a03ed.json b/data/hfopenllm_v2/ontocord/wide_3b_sft_stage1.1-ss1-with_generics_intr_math_stories.no_issue/b7de4fa8-d97d-400f-bc3f-ecb1963a03ed.json deleted file mode 100644 index 9fef62a26..000000000 --- a/data/hfopenllm_v2/ontocord/wide_3b_sft_stage1.1-ss1-with_generics_intr_math_stories.no_issue/b7de4fa8-d97d-400f-bc3f-ecb1963a03ed.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ontocord_wide_3b_sft_stage1.1-ss1-with_generics_intr_math_stories.no_issue/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "wide_3b_sft_stage1.1-ss1-with_generics_intr_math_stories.no_issue", - "id": "ontocord/wide_3b_sft_stage1.1-ss1-with_generics_intr_math_stories.no_issue", - "developer": "ontocord", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.759 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1128 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3171 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0113 - } - }, - 
{ - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2685 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.346 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1129 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ontocord/wide_3b_sft_stage1.1-ss1-with_generics_intr_math_stories.no_issue/fa6ecaf9-457e-4135-ad25-4790ebc27737.json b/data/hfopenllm_v2/ontocord/wide_3b_sft_stage1.1-ss1-with_generics_intr_math_stories.no_issue/fa6ecaf9-457e-4135-ad25-4790ebc27737.json deleted file mode 100644 index 053d679ba..000000000 --- a/data/hfopenllm_v2/ontocord/wide_3b_sft_stage1.1-ss1-with_generics_intr_math_stories.no_issue/fa6ecaf9-457e-4135-ad25-4790ebc27737.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ontocord_wide_3b_sft_stage1.1-ss1-with_generics_intr_math_stories.no_issue/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "wide_3b_sft_stage1.1-ss1-with_generics_intr_math_stories.no_issue", - "id": "ontocord/wide_3b_sft_stage1.1-ss1-with_generics_intr_math_stories.no_issue", - "developer": "ontocord", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.759 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1162 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3184 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.0076 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2634 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3447 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1124 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ontocord/wide_3b_sft_stage1.1-ss1-with_generics_intr_math_stories_no_orig_instr.no_issue/ebaa99c4-ff66-421d-8ba7-dae2c5fa274c.json b/data/hfopenllm_v2/ontocord/wide_3b_sft_stage1.1-ss1-with_generics_intr_math_stories_no_orig_instr.no_issue/ebaa99c4-ff66-421d-8ba7-dae2c5fa274c.json deleted file mode 100644 index 4832c4eb6..000000000 --- a/data/hfopenllm_v2/ontocord/wide_3b_sft_stage1.1-ss1-with_generics_intr_math_stories_no_orig_instr.no_issue/ebaa99c4-ff66-421d-8ba7-dae2c5fa274c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ontocord_wide_3b_sft_stage1.1-ss1-with_generics_intr_math_stories_no_orig_instr.no_issue/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "wide_3b_sft_stage1.1-ss1-with_generics_intr_math_stories_no_orig_instr.no_issue", - "id": "ontocord/wide_3b_sft_stage1.1-ss1-with_generics_intr_math_stories_no_orig_instr.no_issue", - "developer": "ontocord", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.759 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1317 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3064 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0091 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2651 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3446 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1144 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ontocord/wide_3b_sft_stage1.1-ss1-with_generics_intr_stories.no_issue/e388c707-8b35-49a4-94eb-f32e983fe33e.json b/data/hfopenllm_v2/ontocord/wide_3b_sft_stage1.1-ss1-with_generics_intr_stories.no_issue/e388c707-8b35-49a4-94eb-f32e983fe33e.json deleted file mode 100644 index b73dce637..000000000 --- a/data/hfopenllm_v2/ontocord/wide_3b_sft_stage1.1-ss1-with_generics_intr_stories.no_issue/e388c707-8b35-49a4-94eb-f32e983fe33e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ontocord_wide_3b_sft_stage1.1-ss1-with_generics_intr_stories.no_issue/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "wide_3b_sft_stage1.1-ss1-with_generics_intr_stories.no_issue", - "id": "ontocord/wide_3b_sft_stage1.1-ss1-with_generics_intr_stories.no_issue", - "developer": "ontocord", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.759 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1182 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3037 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0083 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2659 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3567 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1162 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ontocord/wide_3b_sft_stage1.1-ss1-with_generics_math.no_issue/f6273192-31cf-4ee1-af45-c2f62de05330.json b/data/hfopenllm_v2/ontocord/wide_3b_sft_stage1.1-ss1-with_generics_math.no_issue/f6273192-31cf-4ee1-af45-c2f62de05330.json deleted file mode 100644 index f93ee1cb2..000000000 --- a/data/hfopenllm_v2/ontocord/wide_3b_sft_stage1.1-ss1-with_generics_math.no_issue/f6273192-31cf-4ee1-af45-c2f62de05330.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ontocord_wide_3b_sft_stage1.1-ss1-with_generics_math.no_issue/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "wide_3b_sft_stage1.1-ss1-with_generics_math.no_issue", - "id": "ontocord/wide_3b_sft_stage1.1-ss1-with_generics_math.no_issue", - "developer": "ontocord", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.759 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.124 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3032 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0076 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2584 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3487 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1128 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ontocord/wide_3b_sft_stage1.1-ss1-with_math.no_issue/105650e6-d9cf-4106-9d55-6f3c08f2f1cf.json b/data/hfopenllm_v2/ontocord/wide_3b_sft_stage1.1-ss1-with_math.no_issue/105650e6-d9cf-4106-9d55-6f3c08f2f1cf.json deleted file mode 100644 index 635e7e0ec..000000000 --- a/data/hfopenllm_v2/ontocord/wide_3b_sft_stage1.1-ss1-with_math.no_issue/105650e6-d9cf-4106-9d55-6f3c08f2f1cf.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ontocord_wide_3b_sft_stage1.1-ss1-with_math.no_issue/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "wide_3b_sft_stage1.1-ss1-with_math.no_issue", - "id": "ontocord/wide_3b_sft_stage1.1-ss1-with_math.no_issue", - "developer": "ontocord", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.759 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1298 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3052 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0159 - } - }, - { - 
"evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2601 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3928 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1147 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ontocord/wide_3b_sft_stage1.1-ss1-with_r1_generics_intr_math_stories.no_issue/a1d23749-40c0-4ccb-a104-bf0de63bc2bd.json b/data/hfopenllm_v2/ontocord/wide_3b_sft_stage1.1-ss1-with_r1_generics_intr_math_stories.no_issue/a1d23749-40c0-4ccb-a104-bf0de63bc2bd.json deleted file mode 100644 index 22c12a778..000000000 --- a/data/hfopenllm_v2/ontocord/wide_3b_sft_stage1.1-ss1-with_r1_generics_intr_math_stories.no_issue/a1d23749-40c0-4ccb-a104-bf0de63bc2bd.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ontocord_wide_3b_sft_stage1.1-ss1-with_r1_generics_intr_math_stories.no_issue/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "wide_3b_sft_stage1.1-ss1-with_r1_generics_intr_math_stories.no_issue", - "id": "ontocord/wide_3b_sft_stage1.1-ss1-with_r1_generics_intr_math_stories.no_issue", - "developer": "ontocord", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.759 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2049 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2912 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2601 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3575 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1167 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ontocord/wide_3b_sft_stage1.2-ss1-expert_fictional_lyrical/4e4b4cf9-48d5-4ff6-92c0-1e9d7b874b6b.json b/data/hfopenllm_v2/ontocord/wide_3b_sft_stage1.2-ss1-expert_fictional_lyrical/4e4b4cf9-48d5-4ff6-92c0-1e9d7b874b6b.json deleted file mode 100644 index b7dea3b9d..000000000 --- a/data/hfopenllm_v2/ontocord/wide_3b_sft_stage1.2-ss1-expert_fictional_lyrical/4e4b4cf9-48d5-4ff6-92c0-1e9d7b874b6b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ontocord_wide_3b_sft_stage1.2-ss1-expert_fictional_lyrical/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "wide_3b_sft_stage1.2-ss1-expert_fictional_lyrical", - "id": "ontocord/wide_3b_sft_stage1.2-ss1-expert_fictional_lyrical", - "developer": "ontocord", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.759 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1461 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2998 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0136 - } - }, - { - "evaluation_name": 
"GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2643 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3926 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1141 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ontocord/wide_3b_sft_stage1.2-ss1-expert_formatted_text/3c4713a3-3973-4a04-9c4a-a6782251734e.json b/data/hfopenllm_v2/ontocord/wide_3b_sft_stage1.2-ss1-expert_formatted_text/3c4713a3-3973-4a04-9c4a-a6782251734e.json deleted file mode 100644 index 7106f073b..000000000 --- a/data/hfopenllm_v2/ontocord/wide_3b_sft_stage1.2-ss1-expert_formatted_text/3c4713a3-3973-4a04-9c4a-a6782251734e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ontocord_wide_3b_sft_stage1.2-ss1-expert_formatted_text/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "wide_3b_sft_stage1.2-ss1-expert_formatted_text", - "id": "ontocord/wide_3b_sft_stage1.2-ss1-expert_formatted_text", - "developer": "ontocord", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.759 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1487 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3069 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0121 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - 
"hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2617 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3474 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1146 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ontocord/wide_3b_sft_stage1.2-ss1-expert_how-to/de70c700-a007-4e87-a3db-941ee285eb1f.json b/data/hfopenllm_v2/ontocord/wide_3b_sft_stage1.2-ss1-expert_how-to/de70c700-a007-4e87-a3db-941ee285eb1f.json deleted file mode 100644 index 0860e6dcd..000000000 --- a/data/hfopenllm_v2/ontocord/wide_3b_sft_stage1.2-ss1-expert_how-to/de70c700-a007-4e87-a3db-941ee285eb1f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ontocord_wide_3b_sft_stage1.2-ss1-expert_how-to/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "wide_3b_sft_stage1.2-ss1-expert_how-to", - "id": "ontocord/wide_3b_sft_stage1.2-ss1-expert_how-to", - "developer": "ontocord", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.759 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1245 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3047 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0144 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2592 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3658 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1153 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ontocord/wide_3b_sft_stage1.2-ss1-expert_math/a1324a7f-1911-4fa9-8d83-be891f752a61.json b/data/hfopenllm_v2/ontocord/wide_3b_sft_stage1.2-ss1-expert_math/a1324a7f-1911-4fa9-8d83-be891f752a61.json deleted file mode 100644 index ffb80060b..000000000 --- a/data/hfopenllm_v2/ontocord/wide_3b_sft_stage1.2-ss1-expert_math/a1324a7f-1911-4fa9-8d83-be891f752a61.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ontocord_wide_3b_sft_stage1.2-ss1-expert_math/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "wide_3b_sft_stage1.2-ss1-expert_math", - "id": "ontocord/wide_3b_sft_stage1.2-ss1-expert_math", - "developer": "ontocord", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.759 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1915 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.306 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0279 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2592 - } - }, - { - "evaluation_name": "MUSR", 
- "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.37 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1092 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ontocord/wide_3b_sft_stage1.2-ss1-expert_news/9c4af0df-f538-4755-8cd0-eec6b2b26524.json b/data/hfopenllm_v2/ontocord/wide_3b_sft_stage1.2-ss1-expert_news/9c4af0df-f538-4755-8cd0-eec6b2b26524.json deleted file mode 100644 index 1ef6d19ad..000000000 --- a/data/hfopenllm_v2/ontocord/wide_3b_sft_stage1.2-ss1-expert_news/9c4af0df-f538-4755-8cd0-eec6b2b26524.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ontocord_wide_3b_sft_stage1.2-ss1-expert_news/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "wide_3b_sft_stage1.2-ss1-expert_news", - "id": "ontocord/wide_3b_sft_stage1.2-ss1-expert_news", - "developer": "ontocord", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.759 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1658 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2926 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0166 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2676 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3621 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1111 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ontocord/wide_3b_sft_stage1.2-ss1-expert_software/fde650a6-a5d1-4edc-bd64-8be806663263.json b/data/hfopenllm_v2/ontocord/wide_3b_sft_stage1.2-ss1-expert_software/fde650a6-a5d1-4edc-bd64-8be806663263.json deleted file mode 100644 index 511c00854..000000000 --- a/data/hfopenllm_v2/ontocord/wide_3b_sft_stage1.2-ss1-expert_software/fde650a6-a5d1-4edc-bd64-8be806663263.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ontocord_wide_3b_sft_stage1.2-ss1-expert_software/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "wide_3b_sft_stage1.2-ss1-expert_software", - "id": "ontocord/wide_3b_sft_stage1.2-ss1-expert_software", - "developer": "ontocord", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.759 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1734 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.298 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0159 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2584 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.3569 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.114 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ontocord/wide_6.6b_sft_stag1.2-lyrical_law_news_software_howto_formattedtext_math_wiki-merge-stacked/96dd1a08-b166-4d8e-ac31-5e948adf931b.json b/data/hfopenllm_v2/ontocord/wide_6.6b_sft_stag1.2-lyrical_law_news_software_howto_formattedtext_math_wiki-merge-stacked/96dd1a08-b166-4d8e-ac31-5e948adf931b.json deleted file mode 100644 index a1a9a9d89..000000000 --- a/data/hfopenllm_v2/ontocord/wide_6.6b_sft_stag1.2-lyrical_law_news_software_howto_formattedtext_math_wiki-merge-stacked/96dd1a08-b166-4d8e-ac31-5e948adf931b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ontocord_wide_6.6b_sft_stag1.2-lyrical_law_news_software_howto_formattedtext_math_wiki-merge-stacked/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "wide_6.6b_sft_stag1.2-lyrical_law_news_software_howto_formattedtext_math_wiki-merge-stacked", - "id": "ontocord/wide_6.6b_sft_stag1.2-lyrical_law_news_software_howto_formattedtext_math_wiki-merge-stacked", - "developer": "ontocord", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.888 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1244 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3026 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0144 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2659 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - 
"hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3686 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1115 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/oobabooga/CodeBooga-34B-v0.1/3b90b9db-a68e-4ee9-bd4d-a18cec357753.json b/data/hfopenllm_v2/oobabooga/CodeBooga-34B-v0.1/3b90b9db-a68e-4ee9-bd4d-a18cec357753.json deleted file mode 100644 index e386ca80a..000000000 --- a/data/hfopenllm_v2/oobabooga/CodeBooga-34B-v0.1/3b90b9db-a68e-4ee9-bd4d-a18cec357753.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/oobabooga_CodeBooga-34B-v0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "CodeBooga-34B-v0.1", - "id": "oobabooga/CodeBooga-34B-v0.1", - "developer": "oobabooga", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 33.744 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.525 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3427 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0393 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2567 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.431 - } - }, - { 
- "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.236 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/oopere/Llama-FinSent-S/444a6ace-77d4-4d93-b80b-ff5c7e2f6888.json b/data/hfopenllm_v2/oopere/Llama-FinSent-S/444a6ace-77d4-4d93-b80b-ff5c7e2f6888.json deleted file mode 100644 index 466bcc8a4..000000000 --- a/data/hfopenllm_v2/oopere/Llama-FinSent-S/444a6ace-77d4-4d93-b80b-ff5c7e2f6888.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/oopere_Llama-FinSent-S/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-FinSent-S", - "id": "oopere/Llama-FinSent-S", - "developer": "oopere", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 0.914 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2119 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3156 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0181 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2567 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3832 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.113 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/oopere/Llama-FinSent-S/7e11a778-fccf-4a91-81cf-c06f1a5c77c4.json b/data/hfopenllm_v2/oopere/Llama-FinSent-S/7e11a778-fccf-4a91-81cf-c06f1a5c77c4.json deleted file mode 100644 index d92a91932..000000000 --- a/data/hfopenllm_v2/oopere/Llama-FinSent-S/7e11a778-fccf-4a91-81cf-c06f1a5c77c4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/oopere_Llama-FinSent-S/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-FinSent-S", - "id": "oopere/Llama-FinSent-S", - "developer": "oopere", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 0.914 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2164 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3169 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0128 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2584 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3832 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1134 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/oopere/pruned10-llama-3.2-3B/e5d126d7-e0bf-43dc-95c0-184ea1d586ea.json 
b/data/hfopenllm_v2/oopere/pruned10-llama-3.2-3B/e5d126d7-e0bf-43dc-95c0-184ea1d586ea.json deleted file mode 100644 index afabeb373..000000000 --- a/data/hfopenllm_v2/oopere/pruned10-llama-3.2-3B/e5d126d7-e0bf-43dc-95c0-184ea1d586ea.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/oopere_pruned10-llama-3.2-3B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "pruned10-llama-3.2-3B", - "id": "oopere/pruned10-llama-3.2-3B", - "developer": "oopere", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.001 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1776 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.334 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0196 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2668 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3722 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.164 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/oopere/pruned20-llama-1b/d05b129c-6b9e-4e6b-80fc-af65db620c5d.json b/data/hfopenllm_v2/oopere/pruned20-llama-1b/d05b129c-6b9e-4e6b-80fc-af65db620c5d.json deleted file mode 100644 index d15232953..000000000 --- a/data/hfopenllm_v2/oopere/pruned20-llama-1b/d05b129c-6b9e-4e6b-80fc-af65db620c5d.json +++ /dev/null @@ -1,132 +0,0 
@@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/oopere_pruned20-llama-1b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "pruned20-llama-1b", - "id": "oopere/pruned20-llama-1b", - "developer": "oopere", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.075 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1994 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3031 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0106 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.25 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3631 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1123 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/oopere/pruned20-llama-3.2-3b/d9792fac-29c1-45b2-b649-cdebb6830e2f.json b/data/hfopenllm_v2/oopere/pruned20-llama-3.2-3b/d9792fac-29c1-45b2-b649-cdebb6830e2f.json deleted file mode 100644 index bc86d9513..000000000 --- a/data/hfopenllm_v2/oopere/pruned20-llama-3.2-3b/d9792fac-29c1-45b2-b649-cdebb6830e2f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/oopere_pruned20-llama-3.2-3b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - 
"source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "pruned20-llama-3.2-3b", - "id": "oopere/pruned20-llama-3.2-3b", - "developer": "oopere", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 2.79 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1789 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3248 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0159 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2659 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3418 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.128 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/oopere/pruned40-llama-1b/fcc2f06a-e6c8-4c28-bf22-4ee582392912.json b/data/hfopenllm_v2/oopere/pruned40-llama-1b/fcc2f06a-e6c8-4c28-bf22-4ee582392912.json deleted file mode 100644 index 2afb1508c..000000000 --- a/data/hfopenllm_v2/oopere/pruned40-llama-1b/fcc2f06a-e6c8-4c28-bf22-4ee582392912.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/oopere_pruned40-llama-1b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "pruned40-llama-1b", - "id": "oopere/pruned40-llama-1b", - "developer": "oopere", - "inference_platform": "unknown", - "additional_details": { - 
"precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 0.914 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2284 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2969 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0076 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2433 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4287 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1082 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/oopere/pruned40-llama-3.2-1B/c6e13327-90b3-440d-9367-dbcec54dd6cc.json b/data/hfopenllm_v2/oopere/pruned40-llama-3.2-1B/c6e13327-90b3-440d-9367-dbcec54dd6cc.json deleted file mode 100644 index ee1ba1f96..000000000 --- a/data/hfopenllm_v2/oopere/pruned40-llama-3.2-1B/c6e13327-90b3-440d-9367-dbcec54dd6cc.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/oopere_pruned40-llama-3.2-1B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "pruned40-llama-3.2-1B", - "id": "oopere/pruned40-llama-3.2-1B", - "developer": "oopere", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 0.914 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": 
"google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2266 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2982 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0083 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2542 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4352 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1115 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/oopere/pruned40-llama-3.2-3b/30b02429-350c-4d86-aded-ba8597bec4d5.json b/data/hfopenllm_v2/oopere/pruned40-llama-3.2-3b/30b02429-350c-4d86-aded-ba8597bec4d5.json deleted file mode 100644 index 343aa660b..000000000 --- a/data/hfopenllm_v2/oopere/pruned40-llama-3.2-3b/30b02429-350c-4d86-aded-ba8597bec4d5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/oopere_pruned40-llama-3.2-3b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "pruned40-llama-3.2-3b", - "id": "oopere/pruned40-llama-3.2-3b", - "developer": "oopere", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 2.367 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2183 - } - }, - { - 
"evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3167 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0128 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2299 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3539 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1177 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/oopere/pruned60-llama-1b/7d1ee802-106e-4313-ba1d-72d5a0676c88.json b/data/hfopenllm_v2/oopere/pruned60-llama-1b/7d1ee802-106e-4313-ba1d-72d5a0676c88.json deleted file mode 100644 index 116617b5f..000000000 --- a/data/hfopenllm_v2/oopere/pruned60-llama-1b/7d1ee802-106e-4313-ba1d-72d5a0676c88.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/oopere_pruned60-llama-1b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "pruned60-llama-1b", - "id": "oopere/pruned60-llama-1b", - "developer": "oopere", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 0.753 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1829 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 
0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3016 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0023 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2492 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4088 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1173 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/oopere/pruned60-llama-3.2-3b/1b3af020-f65e-44b8-a9a2-ad60fa686427.json b/data/hfopenllm_v2/oopere/pruned60-llama-3.2-3b/1b3af020-f65e-44b8-a9a2-ad60fa686427.json deleted file mode 100644 index 321e3d3d4..000000000 --- a/data/hfopenllm_v2/oopere/pruned60-llama-3.2-3b/1b3af020-f65e-44b8-a9a2-ad60fa686427.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/oopere_pruned60-llama-3.2-3b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "pruned60-llama-3.2-3b", - "id": "oopere/pruned60-llama-3.2-3b", - "developer": "oopere", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.944 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1825 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3166 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - 
}, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0038 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2701 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3633 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1131 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/open-atlas/Atlas-Flash-1.5B-Preview/6e40871d-bc23-4f1c-a005-f5b8eb096f84.json b/data/hfopenllm_v2/open-atlas/Atlas-Flash-1.5B-Preview/6e40871d-bc23-4f1c-a005-f5b8eb096f84.json deleted file mode 100644 index 2ba09d639..000000000 --- a/data/hfopenllm_v2/open-atlas/Atlas-Flash-1.5B-Preview/6e40871d-bc23-4f1c-a005-f5b8eb096f84.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/open-atlas_Atlas-Flash-1.5B-Preview/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Atlas-Flash-1.5B-Preview", - "id": "open-atlas/Atlas-Flash-1.5B-Preview", - "developer": "open-atlas", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.777 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.327 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3215 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.2213 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2525 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3488 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1374 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/open-atlas/Atlas-Flash-7B-Preview/1ab33ed2-ea3b-4c6f-a2ac-2465ddd844f4.json b/data/hfopenllm_v2/open-atlas/Atlas-Flash-7B-Preview/1ab33ed2-ea3b-4c6f-a2ac-2465ddd844f4.json deleted file mode 100644 index 5c4b96098..000000000 --- a/data/hfopenllm_v2/open-atlas/Atlas-Flash-7B-Preview/1ab33ed2-ea3b-4c6f-a2ac-2465ddd844f4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/open-atlas_Atlas-Flash-7B-Preview/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Atlas-Flash-7B-Preview", - "id": "open-atlas/Atlas-Flash-7B-Preview", - "developer": "open-atlas", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3908 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3542 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2576 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": 
"Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2886 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3836 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2784 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/open-neo/Kyro-n1-3B/ec601f5d-bf19-4407-ac41-6b9272d94735.json b/data/hfopenllm_v2/open-neo/Kyro-n1-3B/ec601f5d-bf19-4407-ac41-6b9272d94735.json deleted file mode 100644 index eab4e7f5e..000000000 --- a/data/hfopenllm_v2/open-neo/Kyro-n1-3B/ec601f5d-bf19-4407-ac41-6b9272d94735.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/open-neo_Kyro-n1-3B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Kyro-n1-3B", - "id": "open-neo/Kyro-n1-3B", - "developer": "open-neo", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.086 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4595 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4685 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2855 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2819 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" 
- }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4088 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3423 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/open-neo/Kyro-n1-7B/87e53761-e8b7-4032-ae7a-c3a91704d115.json b/data/hfopenllm_v2/open-neo/Kyro-n1-7B/87e53761-e8b7-4032-ae7a-c3a91704d115.json deleted file mode 100644 index 7c4002489..000000000 --- a/data/hfopenllm_v2/open-neo/Kyro-n1-7B/87e53761-e8b7-4032-ae7a-c3a91704d115.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/open-neo_Kyro-n1-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Kyro-n1-7B", - "id": "open-neo/Kyro-n1-7B", - "developer": "open-neo", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5573 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5387 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3897 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2609 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3884 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": 
"MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4333 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/open-thoughts/OpenThinker-7B/59492d86-4b85-4865-84e9-84ab4ace630c.json b/data/hfopenllm_v2/open-thoughts/OpenThinker-7B/59492d86-4b85-4865-84e9-84ab4ace630c.json deleted file mode 100644 index b02b2a84c..000000000 --- a/data/hfopenllm_v2/open-thoughts/OpenThinker-7B/59492d86-4b85-4865-84e9-84ab4ace630c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/open-thoughts_OpenThinker-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "OpenThinker-7B", - "id": "open-thoughts/OpenThinker-7B", - "developer": "open-thoughts", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4089 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5343 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.426 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2567 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.382 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, 
- "score_details": { - "score": 0.4165 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/openai-community/gpt2-large/cc082df2-259c-44c1-abe4-ef349056a2a9.json b/data/hfopenllm_v2/openai-community/gpt2-large/cc082df2-259c-44c1-abe4-ef349056a2a9.json deleted file mode 100644 index f23bbd9c9..000000000 --- a/data/hfopenllm_v2/openai-community/gpt2-large/cc082df2-259c-44c1-abe4-ef349056a2a9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/openai-community_gpt2-large/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "gpt2-large", - "id": "openai-community/gpt2-large", - "developer": "openai-community", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "GPT2LMHeadModel", - "params_billions": 0.812 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2048 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3069 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0121 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2592 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3789 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1142 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/openai-community/gpt2-medium/3f069053-b24e-4242-9302-d46b82e511aa.json 
b/data/hfopenllm_v2/openai-community/gpt2-medium/3f069053-b24e-4242-9302-d46b82e511aa.json deleted file mode 100644 index 3800ef51a..000000000 --- a/data/hfopenllm_v2/openai-community/gpt2-medium/3f069053-b24e-4242-9302-d46b82e511aa.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/openai-community_gpt2-medium/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "gpt2-medium", - "id": "openai-community/gpt2-medium", - "developer": "openai-community", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "GPT2LMHeadModel", - "params_billions": 0.38 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2208 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.305 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0076 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2626 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3884 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1182 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/openai-community/gpt2-xl/62cd9bcb-a74c-40b9-be84-a0077235ae3c.json b/data/hfopenllm_v2/openai-community/gpt2-xl/62cd9bcb-a74c-40b9-be84-a0077235ae3c.json deleted file mode 100644 index 4094508d3..000000000 --- a/data/hfopenllm_v2/openai-community/gpt2-xl/62cd9bcb-a74c-40b9-be84-a0077235ae3c.json +++ /dev/null @@ -1,132 +0,0 
@@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/openai-community_gpt2-xl/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "gpt2-xl", - "id": "openai-community/gpt2-xl", - "developer": "openai-community", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "GPT2LMHeadModel", - "params_billions": 1.608 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2039 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3009 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0098 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2584 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.371 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1131 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/openai-community/gpt2/b4cd25f1-87d5-4173-a4d3-928444f6cb37.json b/data/hfopenllm_v2/openai-community/gpt2/b4cd25f1-87d5-4173-a4d3-928444f6cb37.json deleted file mode 100644 index 172402db8..000000000 --- a/data/hfopenllm_v2/openai-community/gpt2/b4cd25f1-87d5-4173-a4d3-928444f6cb37.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/openai-community_gpt2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging 
Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "gpt2", - "id": "openai-community/gpt2", - "developer": "openai-community", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "GPT2LMHeadModel", - "params_billions": 0.137 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1793 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3036 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0023 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2584 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4471 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1159 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/openai-community/gpt2/ddd4716e-d8ae-46a1-8fb4-c27e2da40e6e.json b/data/hfopenllm_v2/openai-community/gpt2/ddd4716e-d8ae-46a1-8fb4-c27e2da40e6e.json deleted file mode 100644 index 040f869b3..000000000 --- a/data/hfopenllm_v2/openai-community/gpt2/ddd4716e-d8ae-46a1-8fb4-c27e2da40e6e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/openai-community_gpt2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "gpt2", - "id": "openai-community/gpt2", - "developer": "openai-community", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "GPT2LMHeadModel", - 
"params_billions": 0.137 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.178 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3017 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0053 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2584 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.439 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1165 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/openbmb/MiniCPM-S-1B-sft-llama-format/1e5b62a3-018b-429a-b2b4-325545ee99dc.json b/data/hfopenllm_v2/openbmb/MiniCPM-S-1B-sft-llama-format/1e5b62a3-018b-429a-b2b4-325545ee99dc.json deleted file mode 100644 index 404c67964..000000000 --- a/data/hfopenllm_v2/openbmb/MiniCPM-S-1B-sft-llama-format/1e5b62a3-018b-429a-b2b4-325545ee99dc.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/openbmb_MiniCPM-S-1B-sft-llama-format/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MiniCPM-S-1B-sft-llama-format", - "id": "openbmb/MiniCPM-S-1B-sft-llama-format", - "developer": "openbmb", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - 
}, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3329 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3049 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.031 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.271 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3317 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1858 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/openchat/openchat-3.5-0106/958d410e-ce43-44c0-8a56-685c0a618408.json b/data/hfopenllm_v2/openchat/openchat-3.5-0106/958d410e-ce43-44c0-8a56-685c0a618408.json deleted file mode 100644 index c3931b7fa..000000000 --- a/data/hfopenllm_v2/openchat/openchat-3.5-0106/958d410e-ce43-44c0-8a56-685c0a618408.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/openchat_openchat-3.5-0106/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "openchat-3.5-0106", - "id": "openchat/openchat-3.5-0106", - "developer": "openchat", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5967 - } - }, - { - "evaluation_name": "BBH", - 
"source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4617 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0763 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3079 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4254 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3291 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/openchat/openchat-3.5-1210/57c53f20-aa32-49fd-926a-f26c9d0759d4.json b/data/hfopenllm_v2/openchat/openchat-3.5-1210/57c53f20-aa32-49fd-926a-f26c9d0759d4.json deleted file mode 100644 index 2d6ea71b7..000000000 --- a/data/hfopenllm_v2/openchat/openchat-3.5-1210/57c53f20-aa32-49fd-926a-f26c9d0759d4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/openchat_openchat-3.5-1210/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "openchat-3.5-1210", - "id": "openchat/openchat-3.5-1210", - "developer": "openchat", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6037 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.4535 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0785 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3012 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4414 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3142 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/openchat/openchat-3.6-8b-20240522/76def522-6fe1-458f-bfbf-99b50ece3367.json b/data/hfopenllm_v2/openchat/openchat-3.6-8b-20240522/76def522-6fe1-458f-bfbf-99b50ece3367.json deleted file mode 100644 index 162c9b8e8..000000000 --- a/data/hfopenllm_v2/openchat/openchat-3.6-8b-20240522/76def522-6fe1-458f-bfbf-99b50ece3367.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/openchat_openchat-3.6-8b-20240522/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "openchat-3.6-8b-20240522", - "id": "openchat/openchat-3.6-8b-20240522", - "developer": "openchat", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5343 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5338 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": 
"DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0997 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.318 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3999 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3229 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/openchat/openchat_3.5/c467bc88-6769-48ac-abd4-867ee38bbe57.json b/data/hfopenllm_v2/openchat/openchat_3.5/c467bc88-6769-48ac-abd4-867ee38bbe57.json deleted file mode 100644 index 652c57fc3..000000000 --- a/data/hfopenllm_v2/openchat/openchat_3.5/c467bc88-6769-48ac-abd4-867ee38bbe57.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/openchat_openchat_3.5/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "openchat_3.5", - "id": "openchat/openchat_3.5", - "developer": "openchat", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5931 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4426 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0725 - } - }, - { - "evaluation_name": 
"GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2987 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4229 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3153 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/openchat/openchat_v3.2/801681eb-66f4-46e0-bb2b-7ba4b46679af.json b/data/hfopenllm_v2/openchat/openchat_v3.2/801681eb-66f4-46e0-bb2b-7ba4b46679af.json deleted file mode 100644 index 781672e47..000000000 --- a/data/hfopenllm_v2/openchat/openchat_v3.2/801681eb-66f4-46e0-bb2b-7ba4b46679af.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/openchat_openchat_v3.2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "openchat_v3.2", - "id": "openchat/openchat_v3.2", - "developer": "openchat", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 13.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2981 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4331 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0128 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, 
- "score_details": { - "score": 0.2701 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4336 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2422 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/openchat/openchat_v3.2_super/cdd0ea1c-b17a-4816-953c-1d7164c64114.json b/data/hfopenllm_v2/openchat/openchat_v3.2_super/cdd0ea1c-b17a-4816-953c-1d7164c64114.json deleted file mode 100644 index a8f2b40b0..000000000 --- a/data/hfopenllm_v2/openchat/openchat_v3.2_super/cdd0ea1c-b17a-4816-953c-1d7164c64114.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/openchat_openchat_v3.2_super/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "openchat_v3.2_super", - "id": "openchat/openchat_v3.2_super", - "developer": "openchat", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 13.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2862 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4221 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0211 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2643 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on 
MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4161 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2425 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/orai-nlp/Llama-eus-8B/b2060893-1f7d-4e7a-a458-3623147ac118.json b/data/hfopenllm_v2/orai-nlp/Llama-eus-8B/b2060893-1f7d-4e7a-a458-3623147ac118.json deleted file mode 100644 index 5964aa7f2..000000000 --- a/data/hfopenllm_v2/orai-nlp/Llama-eus-8B/b2060893-1f7d-4e7a-a458-3623147ac118.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/orai-nlp_Llama-eus-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-eus-8B", - "id": "orai-nlp/Llama-eus-8B", - "developer": "orai-nlp", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2161 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4418 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0468 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2894 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3919 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": 
"TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3058 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/oxyapi/oxy-1-small/cf8aac35-679a-4ebb-bca8-6e0f2d42e71b.json b/data/hfopenllm_v2/oxyapi/oxy-1-small/cf8aac35-679a-4ebb-bca8-6e0f2d42e71b.json deleted file mode 100644 index eae996b8a..000000000 --- a/data/hfopenllm_v2/oxyapi/oxy-1-small/cf8aac35-679a-4ebb-bca8-6e0f2d42e71b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/oxyapi_oxy-1-small/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "oxy-1-small", - "id": "oxyapi/oxy-1-small", - "developer": "oxyapi", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6245 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5885 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3603 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3716 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4487 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5001 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/ozone-ai/0x-lite/34bfe887-5a3a-4626-997e-c35d3a0ec341.json b/data/hfopenllm_v2/ozone-ai/0x-lite/34bfe887-5a3a-4626-997e-c35d3a0ec341.json deleted file mode 100644 index b6787efe4..000000000 --- a/data/hfopenllm_v2/ozone-ai/0x-lite/34bfe887-5a3a-4626-997e-c35d3a0ec341.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ozone-ai_0x-lite/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "0x-lite", - "id": "ozone-ai/0x-lite", - "developer": "ozone-ai", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.774 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6341 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5045 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3196 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4221 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5184 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ozone-research/Chirp-01/b81acc47-6fd5-4f89-8c70-f8f14b677e04.json b/data/hfopenllm_v2/ozone-research/Chirp-01/b81acc47-6fd5-4f89-8c70-f8f14b677e04.json deleted file mode 100644 index 92d7d8fbf..000000000 --- a/data/hfopenllm_v2/ozone-research/Chirp-01/b81acc47-6fd5-4f89-8c70-f8f14b677e04.json +++ /dev/null 
@@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ozone-research_Chirp-01/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Chirp-01", - "id": "ozone-research/Chirp-01", - "developer": "ozone-research", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.086 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6348 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.465 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3467 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2718 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4487 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3508 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/paloalma/ECE-TW3-JRGL-V1/30b977a8-7882-49be-8621-9ee3fce270ec.json b/data/hfopenllm_v2/paloalma/ECE-TW3-JRGL-V1/30b977a8-7882-49be-8621-9ee3fce270ec.json deleted file mode 100644 index fa0ba291d..000000000 --- a/data/hfopenllm_v2/paloalma/ECE-TW3-JRGL-V1/30b977a8-7882-49be-8621-9ee3fce270ec.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/paloalma_ECE-TW3-JRGL-V1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - 
"source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ECE-TW3-JRGL-V1", - "id": "paloalma/ECE-TW3-JRGL-V1", - "developer": "paloalma", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 68.977 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5535 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6284 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1314 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3473 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4621 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4221 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/paloalma/ECE-TW3-JRGL-V2/3367fd79-713c-4691-80cd-4abb6b2818ef.json b/data/hfopenllm_v2/paloalma/ECE-TW3-JRGL-V2/3367fd79-713c-4691-80cd-4abb6b2818ef.json deleted file mode 100644 index 503c32c60..000000000 --- a/data/hfopenllm_v2/paloalma/ECE-TW3-JRGL-V2/3367fd79-713c-4691-80cd-4abb6b2818ef.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/paloalma_ECE-TW3-JRGL-V2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ECE-TW3-JRGL-V2", - "id": "paloalma/ECE-TW3-JRGL-V2", - "developer": "paloalma", - "inference_platform": "unknown", - "additional_details": { - "precision": 
"bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 72.288 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2255 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6031 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.185 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3314 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4793 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4588 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/paloalma/ECE-TW3-JRGL-V5/add899b8-f3e6-4d87-8846-8254f4dfbd5f.json b/data/hfopenllm_v2/paloalma/ECE-TW3-JRGL-V5/add899b8-f3e6-4d87-8846-8254f4dfbd5f.json deleted file mode 100644 index 9ec62858b..000000000 --- a/data/hfopenllm_v2/paloalma/ECE-TW3-JRGL-V5/add899b8-f3e6-4d87-8846-8254f4dfbd5f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/paloalma_ECE-TW3-JRGL-V5/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ECE-TW3-JRGL-V5", - "id": "paloalma/ECE-TW3-JRGL-V5", - "developer": "paloalma", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 72.289 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4553 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6025 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1835 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3414 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4621 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4648 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/paloalma/Le_Triomphant-ECE-TW3/53829ec0-f233-4b61-a672-6a467823caaa.json b/data/hfopenllm_v2/paloalma/Le_Triomphant-ECE-TW3/53829ec0-f233-4b61-a672-6a467823caaa.json deleted file mode 100644 index bd04d24ff..000000000 --- a/data/hfopenllm_v2/paloalma/Le_Triomphant-ECE-TW3/53829ec0-f233-4b61-a672-6a467823caaa.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/paloalma_Le_Triomphant-ECE-TW3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Le_Triomphant-ECE-TW3", - "id": "paloalma/Le_Triomphant-ECE-TW3", - "developer": "paloalma", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 72.289 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5402 - } - }, - { - 
"evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6112 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1949 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.349 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4725 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4763 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/paloalma/TW3-JRGL-v2/e2b41200-bff2-4835-a0ea-27ff56937570.json b/data/hfopenllm_v2/paloalma/TW3-JRGL-v2/e2b41200-bff2-4835-a0ea-27ff56937570.json deleted file mode 100644 index f94c8540b..000000000 --- a/data/hfopenllm_v2/paloalma/TW3-JRGL-v2/e2b41200-bff2-4835-a0ea-27ff56937570.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/paloalma_TW3-JRGL-v2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "TW3-JRGL-v2", - "id": "paloalma/TW3-JRGL-v2", - "developer": "paloalma", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 72.289 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5316 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 
- }, - "score_details": { - "score": 0.6138 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.179 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3591 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4858 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4858 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/pankajmathur/Al_Dente_v1_8b/3d33f26d-72be-451e-bcf0-501e0bc2f1db.json b/data/hfopenllm_v2/pankajmathur/Al_Dente_v1_8b/3d33f26d-72be-451e-bcf0-501e0bc2f1db.json deleted file mode 100644 index 4ce9999c5..000000000 --- a/data/hfopenllm_v2/pankajmathur/Al_Dente_v1_8b/3d33f26d-72be-451e-bcf0-501e0bc2f1db.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/pankajmathur_Al_Dente_v1_8b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Al_Dente_v1_8b", - "id": "pankajmathur/Al_Dente_v1_8b", - "developer": "pankajmathur", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3694 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4835 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - 
"evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0408 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2995 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3987 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.286 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/pankajmathur/model_007_13b_v2/3b4c05fc-2ccf-46db-8d64-045508f6614b.json b/data/hfopenllm_v2/pankajmathur/model_007_13b_v2/3b4c05fc-2ccf-46db-8d64-045508f6614b.json deleted file mode 100644 index 8fcac09dd..000000000 --- a/data/hfopenllm_v2/pankajmathur/model_007_13b_v2/3b4c05fc-2ccf-46db-8d64-045508f6614b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/pankajmathur_model_007_13b_v2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "model_007_13b_v2", - "id": "pankajmathur/model_007_13b_v2", - "developer": "pankajmathur", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 13.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3056 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4702 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0211 - } - }, - { - "evaluation_name": "GPQA", - 
"source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2836 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4611 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2461 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/pankajmathur/orca_mini_3b/af83a91c-3b07-48c6-9726-5bd77347f810.json b/data/hfopenllm_v2/pankajmathur/orca_mini_3b/af83a91c-3b07-48c6-9726-5bd77347f810.json deleted file mode 100644 index ba7ea5916..000000000 --- a/data/hfopenllm_v2/pankajmathur/orca_mini_3b/af83a91c-3b07-48c6-9726-5bd77347f810.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/pankajmathur_orca_mini_3b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "orca_mini_3b", - "id": "pankajmathur/orca_mini_3b", - "developer": "pankajmathur", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.426 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0742 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3196 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0083 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 
1.0 - }, - "score_details": { - "score": 0.2458 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3349 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1145 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/pankajmathur/orca_mini_7b/48759b07-9aea-42bd-8d73-9c4208d2789f.json b/data/hfopenllm_v2/pankajmathur/orca_mini_7b/48759b07-9aea-42bd-8d73-9c4208d2789f.json deleted file mode 100644 index 9f6551021..000000000 --- a/data/hfopenllm_v2/pankajmathur/orca_mini_7b/48759b07-9aea-42bd-8d73-9c4208d2789f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/pankajmathur_orca_mini_7b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "orca_mini_7b", - "id": "pankajmathur/orca_mini_7b", - "developer": "pankajmathur", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 7.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0412 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3332 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0128 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2542 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3698 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1246 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/pankajmathur/orca_mini_phi-4/68820679-55f4-494d-91a0-0db1bccb8983.json b/data/hfopenllm_v2/pankajmathur/orca_mini_phi-4/68820679-55f4-494d-91a0-0db1bccb8983.json deleted file mode 100644 index cf14929d8..000000000 --- a/data/hfopenllm_v2/pankajmathur/orca_mini_phi-4/68820679-55f4-494d-91a0-0db1bccb8983.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/pankajmathur_orca_mini_phi-4/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "orca_mini_phi-4", - "id": "pankajmathur/orca_mini_phi-4", - "developer": "pankajmathur", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7781 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6856 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2953 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3742 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4703 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": 
"hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5255 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/pankajmathur/orca_mini_v2_7b/029774ac-a63d-4acc-a37c-4194e4afdecc.json b/data/hfopenllm_v2/pankajmathur/orca_mini_v2_7b/029774ac-a63d-4acc-a37c-4194e4afdecc.json deleted file mode 100644 index 37a85929b..000000000 --- a/data/hfopenllm_v2/pankajmathur/orca_mini_v2_7b/029774ac-a63d-4acc-a37c-4194e4afdecc.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/pankajmathur_orca_mini_v2_7b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "orca_mini_v2_7b", - "id": "pankajmathur/orca_mini_v2_7b", - "developer": "pankajmathur", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 7.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1358 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3536 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0113 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2492 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3593 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.1542 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/pankajmathur/orca_mini_v3_13b/146df856-e2c8-41eb-b860-ceb78c126e55.json b/data/hfopenllm_v2/pankajmathur/orca_mini_v3_13b/146df856-e2c8-41eb-b860-ceb78c126e55.json deleted file mode 100644 index d255ddd83..000000000 --- a/data/hfopenllm_v2/pankajmathur/orca_mini_v3_13b/146df856-e2c8-41eb-b860-ceb78c126e55.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/pankajmathur_orca_mini_v3_13b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "orca_mini_v3_13b", - "id": "pankajmathur/orca_mini_v3_13b", - "developer": "pankajmathur", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 13.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2897 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4711 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0211 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2651 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4598 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2305 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/pankajmathur/orca_mini_v3_70b/74c6bea7-ad16-4f08-a2b7-9c894b9ce207.json 
b/data/hfopenllm_v2/pankajmathur/orca_mini_v3_70b/74c6bea7-ad16-4f08-a2b7-9c894b9ce207.json deleted file mode 100644 index 39384845f..000000000 --- a/data/hfopenllm_v2/pankajmathur/orca_mini_v3_70b/74c6bea7-ad16-4f08-a2b7-9c894b9ce207.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/pankajmathur_orca_mini_v3_70b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "orca_mini_v3_70b", - "id": "pankajmathur/orca_mini_v3_70b", - "developer": "pankajmathur", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 70.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4015 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5949 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0385 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.318 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5079 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3757 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/pankajmathur/orca_mini_v3_7b/b5e97b2d-d8a2-485a-8b0a-71590e4a376e.json b/data/hfopenllm_v2/pankajmathur/orca_mini_v3_7b/b5e97b2d-d8a2-485a-8b0a-71590e4a376e.json deleted file mode 100644 index 33c79ebe0..000000000 --- a/data/hfopenllm_v2/pankajmathur/orca_mini_v3_7b/b5e97b2d-d8a2-485a-8b0a-71590e4a376e.json +++ 
/dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/pankajmathur_orca_mini_v3_7b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "orca_mini_v3_7b", - "id": "pankajmathur/orca_mini_v3_7b", - "developer": "pankajmathur", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 7.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2821 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4095 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0106 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2466 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4982 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2084 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/pankajmathur/orca_mini_v5_8b/e79d0a8c-caec-4dec-b119-3229ffa69a73.json b/data/hfopenllm_v2/pankajmathur/orca_mini_v5_8b/e79d0a8c-caec-4dec-b119-3229ffa69a73.json deleted file mode 100644 index 73bf37a29..000000000 --- a/data/hfopenllm_v2/pankajmathur/orca_mini_v5_8b/e79d0a8c-caec-4dec-b119-3229ffa69a73.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/pankajmathur_orca_mini_v5_8b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - 
"source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "orca_mini_v5_8b", - "id": "pankajmathur/orca_mini_v5_8b", - "developer": "pankajmathur", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4806 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5064 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0989 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2869 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3076 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/pankajmathur/orca_mini_v5_8b_dpo/2c760893-b52a-40a9-9420-fb193a62a5c3.json b/data/hfopenllm_v2/pankajmathur/orca_mini_v5_8b_dpo/2c760893-b52a-40a9-9420-fb193a62a5c3.json deleted file mode 100644 index 1fb80ed95..000000000 --- a/data/hfopenllm_v2/pankajmathur/orca_mini_v5_8b_dpo/2c760893-b52a-40a9-9420-fb193a62a5c3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/pankajmathur_orca_mini_v5_8b_dpo/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "orca_mini_v5_8b_dpo", - "id": "pankajmathur/orca_mini_v5_8b_dpo", - "developer": 
"pankajmathur", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4896 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5075 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0974 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2743 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3894 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3116 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/pankajmathur/orca_mini_v5_8b_orpo/ef9b84e0-68b0-4caa-9980-96ea5e7f440b.json b/data/hfopenllm_v2/pankajmathur/orca_mini_v5_8b_orpo/ef9b84e0-68b0-4caa-9980-96ea5e7f440b.json deleted file mode 100644 index b10b829f3..000000000 --- a/data/hfopenllm_v2/pankajmathur/orca_mini_v5_8b_orpo/ef9b84e0-68b0-4caa-9980-96ea5e7f440b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/pankajmathur_orca_mini_v5_8b_orpo/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "orca_mini_v5_8b_orpo", - "id": "pankajmathur/orca_mini_v5_8b_orpo", - "developer": "pankajmathur", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.0 - } - }, - "evaluation_results": [ - { - 
"evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0824 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4964 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0665 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2844 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4131 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2947 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/pankajmathur/orca_mini_v6_8b/fb48aff8-3f6b-4934-9fb8-d72bf8614d6f.json b/data/hfopenllm_v2/pankajmathur/orca_mini_v6_8b/fb48aff8-3f6b-4934-9fb8-d72bf8614d6f.json deleted file mode 100644 index 0c0674af7..000000000 --- a/data/hfopenllm_v2/pankajmathur/orca_mini_v6_8b/fb48aff8-3f6b-4934-9fb8-d72bf8614d6f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/pankajmathur_orca_mini_v6_8b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "orca_mini_v6_8b", - "id": "pankajmathur/orca_mini_v6_8b", - "developer": "pankajmathur", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0111 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3029 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0038 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2383 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3555 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1125 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/pankajmathur/orca_mini_v6_8b_dpo/9450acd9-16b6-49a2-9b73-cf1161b96df3.json b/data/hfopenllm_v2/pankajmathur/orca_mini_v6_8b_dpo/9450acd9-16b6-49a2-9b73-cf1161b96df3.json deleted file mode 100644 index 92b4ebda8..000000000 --- a/data/hfopenllm_v2/pankajmathur/orca_mini_v6_8b_dpo/9450acd9-16b6-49a2-9b73-cf1161b96df3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/pankajmathur_orca_mini_v6_8b_dpo/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "orca_mini_v6_8b_dpo", - "id": "pankajmathur/orca_mini_v6_8b_dpo", - "developer": "pankajmathur", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3883 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - 
"hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5203 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0612 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3012 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.409 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3596 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/pankajmathur/orca_mini_v7_72b/0d50ec2d-5dd4-487e-80cb-9533246a9876.json b/data/hfopenllm_v2/pankajmathur/orca_mini_v7_72b/0d50ec2d-5dd4-487e-80cb-9533246a9876.json deleted file mode 100644 index 475ec2f73..000000000 --- a/data/hfopenllm_v2/pankajmathur/orca_mini_v7_72b/0d50ec2d-5dd4-487e-80cb-9533246a9876.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/pankajmathur_orca_mini_v7_72b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "orca_mini_v7_72b", - "id": "pankajmathur/orca_mini_v7_72b", - "developer": "pankajmathur", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 72.706 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.593 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6842 - } - 
}, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0937 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3851 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.507 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5622 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/pankajmathur/orca_mini_v7_7b/f6e6827d-fbf8-49cd-bdad-e8c7ea87550a.json b/data/hfopenllm_v2/pankajmathur/orca_mini_v7_7b/f6e6827d-fbf8-49cd-bdad-e8c7ea87550a.json deleted file mode 100644 index 7c8b93f98..000000000 --- a/data/hfopenllm_v2/pankajmathur/orca_mini_v7_7b/f6e6827d-fbf8-49cd-bdad-e8c7ea87550a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/pankajmathur_orca_mini_v7_7b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "orca_mini_v7_7b", - "id": "pankajmathur/orca_mini_v7_7b", - "developer": "pankajmathur", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4388 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5275 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH 
Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1208 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2961 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.436 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4167 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/pankajmathur/orca_mini_v8_1_70b/c5e48fd8-0eea-46a9-8790-1745923561d3.json b/data/hfopenllm_v2/pankajmathur/orca_mini_v8_1_70b/c5e48fd8-0eea-46a9-8790-1745923561d3.json deleted file mode 100644 index 2d067df48..000000000 --- a/data/hfopenllm_v2/pankajmathur/orca_mini_v8_1_70b/c5e48fd8-0eea-46a9-8790-1745923561d3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/pankajmathur_orca_mini_v8_1_70b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "orca_mini_v8_1_70b", - "id": "pankajmathur/orca_mini_v8_1_70b", - "developer": "pankajmathur", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 70.554 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8571 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6781 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3527 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": 
"GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4329 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4437 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4983 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/pankajmathur/orca_mini_v9_0_3B-Instruct/870c7739-8886-47df-8e20-09bfae03b9c5.json b/data/hfopenllm_v2/pankajmathur/orca_mini_v9_0_3B-Instruct/870c7739-8886-47df-8e20-09bfae03b9c5.json deleted file mode 100644 index 1af69e1fb..000000000 --- a/data/hfopenllm_v2/pankajmathur/orca_mini_v9_0_3B-Instruct/870c7739-8886-47df-8e20-09bfae03b9c5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/pankajmathur_orca_mini_v9_0_3B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "orca_mini_v9_0_3B-Instruct", - "id": "pankajmathur/orca_mini_v9_0_3B-Instruct", - "developer": "pankajmathur", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5754 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4413 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1465 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3012 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3659 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2603 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/pankajmathur/orca_mini_v9_1_1B-Instruct/d8eb5fd1-f1d4-481d-85af-88a11d7b6f6f.json b/data/hfopenllm_v2/pankajmathur/orca_mini_v9_1_1B-Instruct/d8eb5fd1-f1d4-481d-85af-88a11d7b6f6f.json deleted file mode 100644 index fd6a47751..000000000 --- a/data/hfopenllm_v2/pankajmathur/orca_mini_v9_1_1B-Instruct/d8eb5fd1-f1d4-481d-85af-88a11d7b6f6f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/pankajmathur_orca_mini_v9_1_1B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "orca_mini_v9_1_1B-Instruct", - "id": "pankajmathur/orca_mini_v9_1_1B-Instruct", - "developer": "pankajmathur", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.236 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3629 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3205 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0461 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2567 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - 
"source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3381 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1374 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/pankajmathur/orca_mini_v9_2_14B/6625b2e0-1f65-4dc5-9913-ceb0e82e6439.json b/data/hfopenllm_v2/pankajmathur/orca_mini_v9_2_14B/6625b2e0-1f65-4dc5-9913-ceb0e82e6439.json deleted file mode 100644 index a42075775..000000000 --- a/data/hfopenllm_v2/pankajmathur/orca_mini_v9_2_14B/6625b2e0-1f65-4dc5-9913-ceb0e82e6439.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/pankajmathur_orca_mini_v9_2_14B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "orca_mini_v9_2_14B", - "id": "pankajmathur/orca_mini_v9_2_14B", - "developer": "pankajmathur", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7781 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6856 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2953 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3742 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.4703 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5255 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/pankajmathur/orca_mini_v9_2_70b/24e7df20-e046-48f7-909e-502d0c70216a.json b/data/hfopenllm_v2/pankajmathur/orca_mini_v9_2_70b/24e7df20-e046-48f7-909e-502d0c70216a.json deleted file mode 100644 index 7a7d9e13a..000000000 --- a/data/hfopenllm_v2/pankajmathur/orca_mini_v9_2_70b/24e7df20-e046-48f7-909e-502d0c70216a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/pankajmathur_orca_mini_v9_2_70b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "orca_mini_v9_2_70b", - "id": "pankajmathur/orca_mini_v9_2_70b", - "developer": "pankajmathur", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 70.554 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8383 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6745 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2938 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3733 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.471 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4821 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/pankajmathur/orca_mini_v9_4_70B/7920f562-9e7f-4a64-85f4-584b13af44de.json b/data/hfopenllm_v2/pankajmathur/orca_mini_v9_4_70B/7920f562-9e7f-4a64-85f4-584b13af44de.json deleted file mode 100644 index 2d1a1c562..000000000 --- a/data/hfopenllm_v2/pankajmathur/orca_mini_v9_4_70B/7920f562-9e7f-4a64-85f4-584b13af44de.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/pankajmathur_orca_mini_v9_4_70B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "orca_mini_v9_4_70B", - "id": "pankajmathur/orca_mini_v9_4_70B", - "developer": "pankajmathur", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 70.554 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8015 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6419 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3263 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3658 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4647 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4536 - } - } - ] -} \ No newline at end of file 
diff --git a/data/hfopenllm_v2/pankajmathur/orca_mini_v9_5_1B-Instruct/c6620817-69fe-40e2-bb0a-1e9c739ab65d.json b/data/hfopenllm_v2/pankajmathur/orca_mini_v9_5_1B-Instruct/c6620817-69fe-40e2-bb0a-1e9c739ab65d.json deleted file mode 100644 index e8976fd6e..000000000 --- a/data/hfopenllm_v2/pankajmathur/orca_mini_v9_5_1B-Instruct/c6620817-69fe-40e2-bb0a-1e9c739ab65d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/pankajmathur_orca_mini_v9_5_1B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "orca_mini_v9_5_1B-Instruct", - "id": "pankajmathur/orca_mini_v9_5_1B-Instruct", - "developer": "pankajmathur", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.236 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4638 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3337 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0302 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2701 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3182 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.137 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/pankajmathur/orca_mini_v9_5_1B-Instruct_preview/520e2d66-4143-493b-8533-64f86c6d676e.json 
b/data/hfopenllm_v2/pankajmathur/orca_mini_v9_5_1B-Instruct_preview/520e2d66-4143-493b-8533-64f86c6d676e.json deleted file mode 100644 index ede7abcd5..000000000 --- a/data/hfopenllm_v2/pankajmathur/orca_mini_v9_5_1B-Instruct_preview/520e2d66-4143-493b-8533-64f86c6d676e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/pankajmathur_orca_mini_v9_5_1B-Instruct_preview/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "orca_mini_v9_5_1B-Instruct_preview", - "id": "pankajmathur/orca_mini_v9_5_1B-Instruct_preview", - "developer": "pankajmathur", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.236 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3936 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3277 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0385 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2634 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3395 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1327 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/pankajmathur/orca_mini_v9_5_3B-Instruct/993bdfd2-3a88-4de3-9ed9-9b7b63c0f4f5.json b/data/hfopenllm_v2/pankajmathur/orca_mini_v9_5_3B-Instruct/993bdfd2-3a88-4de3-9ed9-9b7b63c0f4f5.json deleted file mode 100644 index 
067bf5772..000000000 --- a/data/hfopenllm_v2/pankajmathur/orca_mini_v9_5_3B-Instruct/993bdfd2-3a88-4de3-9ed9-9b7b63c0f4f5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/pankajmathur_orca_mini_v9_5_3B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "orca_mini_v9_5_3B-Instruct", - "id": "pankajmathur/orca_mini_v9_5_3B-Instruct", - "developer": "pankajmathur", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7207 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4496 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1322 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2869 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.427 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2882 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/pankajmathur/orca_mini_v9_6_1B-Instruct/4e1be694-cc4d-4943-a8e4-74913cfb2ebe.json b/data/hfopenllm_v2/pankajmathur/orca_mini_v9_6_1B-Instruct/4e1be694-cc4d-4943-a8e4-74913cfb2ebe.json deleted file mode 100644 index 6331cdee7..000000000 --- a/data/hfopenllm_v2/pankajmathur/orca_mini_v9_6_1B-Instruct/4e1be694-cc4d-4943-a8e4-74913cfb2ebe.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": 
"0.2.0", - "evaluation_id": "hfopenllm_v2/pankajmathur_orca_mini_v9_6_1B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "orca_mini_v9_6_1B-Instruct", - "id": "pankajmathur/orca_mini_v9_6_1B-Instruct", - "developer": "pankajmathur", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.236 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6086 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3561 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.077 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2685 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3396 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1809 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/pankajmathur/orca_mini_v9_6_3B-Instruct/42c174d1-6211-4438-bb9a-24f3cf386a6d.json b/data/hfopenllm_v2/pankajmathur/orca_mini_v9_6_3B-Instruct/42c174d1-6211-4438-bb9a-24f3cf386a6d.json deleted file mode 100644 index 4727738e9..000000000 --- a/data/hfopenllm_v2/pankajmathur/orca_mini_v9_6_3B-Instruct/42c174d1-6211-4438-bb9a-24f3cf386a6d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/pankajmathur_orca_mini_v9_6_3B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": 
"HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "orca_mini_v9_6_3B-Instruct", - "id": "pankajmathur/orca_mini_v9_6_3B-Instruct", - "developer": "pankajmathur", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7316 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4568 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1329 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2936 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4068 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2851 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/pankajmathur/orca_mini_v9_7_1B-Instruct/625bf39b-a118-4ec6-82d0-5405cf70ba53.json b/data/hfopenllm_v2/pankajmathur/orca_mini_v9_7_1B-Instruct/625bf39b-a118-4ec6-82d0-5405cf70ba53.json deleted file mode 100644 index ad5c217d2..000000000 --- a/data/hfopenllm_v2/pankajmathur/orca_mini_v9_7_1B-Instruct/625bf39b-a118-4ec6-82d0-5405cf70ba53.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/pankajmathur_orca_mini_v9_7_1B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": 
"orca_mini_v9_7_1B-Instruct", - "id": "pankajmathur/orca_mini_v9_7_1B-Instruct", - "developer": "pankajmathur", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.236 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.561 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3182 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0446 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2727 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3527 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1345 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/pankajmathur/orca_mini_v9_7_3B-Instruct/e09cb198-d259-42ea-a356-6efe61b1e12b.json b/data/hfopenllm_v2/pankajmathur/orca_mini_v9_7_3B-Instruct/e09cb198-d259-42ea-a356-6efe61b1e12b.json deleted file mode 100644 index 57b5c8eb9..000000000 --- a/data/hfopenllm_v2/pankajmathur/orca_mini_v9_7_3B-Instruct/e09cb198-d259-42ea-a356-6efe61b1e12b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/pankajmathur_orca_mini_v9_7_3B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "orca_mini_v9_7_3B-Instruct", - "id": "pankajmathur/orca_mini_v9_7_3B-Instruct", - "developer": "pankajmathur", - "inference_platform": "unknown", - "additional_details": { - "precision": 
"bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5618 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3297 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0619 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2617 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3619 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1375 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/paulml/ECE-ILAB-Q1/5838b130-c2e6-400c-80b7-6822efb5db2c.json b/data/hfopenllm_v2/paulml/ECE-ILAB-Q1/5838b130-c2e6-400c-80b7-6822efb5db2c.json deleted file mode 100644 index 4666ebf13..000000000 --- a/data/hfopenllm_v2/paulml/ECE-ILAB-Q1/5838b130-c2e6-400c-80b7-6822efb5db2c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/paulml_ECE-ILAB-Q1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ECE-ILAB-Q1", - "id": "paulml/ECE-ILAB-Q1", - "developer": "paulml", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 72.706 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": 
"Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7865 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6718 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3557 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3867 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4614 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5505 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/pints-ai/1.5-Pints-16K-v0.1/52b51638-64cd-4b19-8fc7-c223d50bc549.json b/data/hfopenllm_v2/pints-ai/1.5-Pints-16K-v0.1/52b51638-64cd-4b19-8fc7-c223d50bc549.json deleted file mode 100644 index e89688fb3..000000000 --- a/data/hfopenllm_v2/pints-ai/1.5-Pints-16K-v0.1/52b51638-64cd-4b19-8fc7-c223d50bc549.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/pints-ai_1.5-Pints-16K-v0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "1.5-Pints-16K-v0.1", - "id": "pints-ai/1.5-Pints-16K-v0.1", - "developer": "pints-ai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.566 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1636 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - 
"source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3133 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0144 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2357 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3579 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1119 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/pints-ai/1.5-Pints-2K-v0.1/28b3178b-c963-4267-9649-3f7fc10fba3c.json b/data/hfopenllm_v2/pints-ai/1.5-Pints-2K-v0.1/28b3178b-c963-4267-9649-3f7fc10fba3c.json deleted file mode 100644 index 9f6e80b6f..000000000 --- a/data/hfopenllm_v2/pints-ai/1.5-Pints-2K-v0.1/28b3178b-c963-4267-9649-3f7fc10fba3c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/pints-ai_1.5-Pints-2K-v0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "1.5-Pints-2K-v0.1", - "id": "pints-ai/1.5-Pints-2K-v0.1", - "developer": "pints-ai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.566 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1762 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.298 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0128 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2483 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3502 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1104 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/piotr25691/thea-3b-25r/748298a2-5042-4636-ac7e-051c28916f3a.json b/data/hfopenllm_v2/piotr25691/thea-3b-25r/748298a2-5042-4636-ac7e-051c28916f3a.json deleted file mode 100644 index b4477c2ac..000000000 --- a/data/hfopenllm_v2/piotr25691/thea-3b-25r/748298a2-5042-4636-ac7e-051c28916f3a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/piotr25691_thea-3b-25r/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "thea-3b-25r", - "id": "piotr25691/thea-3b-25r", - "developer": "piotr25691", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7344 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4484 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1782 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2676 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3315 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3182 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/piotr25691/thea-c-3b-25r/03bcd4e6-1620-424a-9200-c0cf4b73bbd2.json b/data/hfopenllm_v2/piotr25691/thea-c-3b-25r/03bcd4e6-1620-424a-9200-c0cf4b73bbd2.json deleted file mode 100644 index add3fcd04..000000000 --- a/data/hfopenllm_v2/piotr25691/thea-c-3b-25r/03bcd4e6-1620-424a-9200-c0cf4b73bbd2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/piotr25691_thea-c-3b-25r/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "thea-c-3b-25r", - "id": "piotr25691/thea-c-3b-25r", - "developer": "piotr25691", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7402 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4532 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1526 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": 
"Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2651 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3315 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3178 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/piotr25691/thea-rp-3b-25r/c7fba530-63cc-4ece-a171-4a2919aa8057.json b/data/hfopenllm_v2/piotr25691/thea-rp-3b-25r/c7fba530-63cc-4ece-a171-4a2919aa8057.json deleted file mode 100644 index 2c66084d4..000000000 --- a/data/hfopenllm_v2/piotr25691/thea-rp-3b-25r/c7fba530-63cc-4ece-a171-4a2919aa8057.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/piotr25691_thea-rp-3b-25r/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "thea-rp-3b-25r", - "id": "piotr25691/thea-rp-3b-25r", - "developer": "piotr25691", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6578 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.439 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1322 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2743 - } - }, - { - "evaluation_name": 
"MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3819 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.306 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/postbot/gpt2-medium-emailgen/c25c1046-a8d5-4f4b-9a72-c4591cfb4023.json b/data/hfopenllm_v2/postbot/gpt2-medium-emailgen/c25c1046-a8d5-4f4b-9a72-c4591cfb4023.json deleted file mode 100644 index e0f2e52de..000000000 --- a/data/hfopenllm_v2/postbot/gpt2-medium-emailgen/c25c1046-a8d5-4f4b-9a72-c4591cfb4023.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/postbot_gpt2-medium-emailgen/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "gpt2-medium-emailgen", - "id": "postbot/gpt2-medium-emailgen", - "developer": "postbot", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "GPT2LMHeadModel", - "params_billions": 0.38 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1492 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.313 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2601 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 
0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3911 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1147 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prince-canuma/Ministral-8B-Instruct-2410-HF/c3800a5c-310b-41cb-9b07-cfc1f1b13256.json b/data/hfopenllm_v2/prince-canuma/Ministral-8B-Instruct-2410-HF/c3800a5c-310b-41cb-9b07-cfc1f1b13256.json deleted file mode 100644 index 6b36039e0..000000000 --- a/data/hfopenllm_v2/prince-canuma/Ministral-8B-Instruct-2410-HF/c3800a5c-310b-41cb-9b07-cfc1f1b13256.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prince-canuma_Ministral-8B-Instruct-2410-HF/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Ministral-8B-Instruct-2410-HF", - "id": "prince-canuma/Ministral-8B-Instruct-2410-HF", - "developer": "prince-canuma", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 8.02 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5912 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4586 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1918 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.281 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4138 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", 
- "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3298 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/princeton-nlp/Llama-3-8B-ProLong-512k-Base/e8e2b99f-cf83-4776-9117-aa2b5d9c8068.json b/data/hfopenllm_v2/princeton-nlp/Llama-3-8B-ProLong-512k-Base/e8e2b99f-cf83-4776-9117-aa2b5d9c8068.json deleted file mode 100644 index f4645a2cd..000000000 --- a/data/hfopenllm_v2/princeton-nlp/Llama-3-8B-ProLong-512k-Base/e8e2b99f-cf83-4776-9117-aa2b5d9c8068.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/princeton-nlp_Llama-3-8B-ProLong-512k-Base/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-8B-ProLong-512k-Base", - "id": "princeton-nlp/Llama-3-8B-ProLong-512k-Base", - "developer": "princeton-nlp", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5322 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5033 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0687 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2617 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4223 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, 
- "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3329 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/princeton-nlp/Llama-3-8B-ProLong-512k-Instruct/2da19e45-117f-446b-b956-b35a20bb7411.json b/data/hfopenllm_v2/princeton-nlp/Llama-3-8B-ProLong-512k-Instruct/2da19e45-117f-446b-b956-b35a20bb7411.json deleted file mode 100644 index f3f5626a4..000000000 --- a/data/hfopenllm_v2/princeton-nlp/Llama-3-8B-ProLong-512k-Instruct/2da19e45-117f-446b-b956-b35a20bb7411.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/princeton-nlp_Llama-3-8B-ProLong-512k-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-8B-ProLong-512k-Instruct", - "id": "princeton-nlp/Llama-3-8B-ProLong-512k-Instruct", - "developer": "princeton-nlp", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5508 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5028 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0529 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2861 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4266 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3231 - } - } - ] -} \ No newline at 
end of file diff --git a/data/hfopenllm_v2/princeton-nlp/Llama-3-8B-ProLong-512k-Instruct/9e982a33-19cb-4381-8560-884bc8946a2b.json b/data/hfopenllm_v2/princeton-nlp/Llama-3-8B-ProLong-512k-Instruct/9e982a33-19cb-4381-8560-884bc8946a2b.json deleted file mode 100644 index 62d796011..000000000 --- a/data/hfopenllm_v2/princeton-nlp/Llama-3-8B-ProLong-512k-Instruct/9e982a33-19cb-4381-8560-884bc8946a2b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/princeton-nlp_Llama-3-8B-ProLong-512k-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-8B-ProLong-512k-Instruct", - "id": "princeton-nlp/Llama-3-8B-ProLong-512k-Instruct", - "developer": "princeton-nlp", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3978 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4983 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0582 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.281 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.425 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3246 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/princeton-nlp/Llama-3-8B-ProLong-64k-Base/9130a862-cfd7-47ce-a92a-f60438739491.json 
b/data/hfopenllm_v2/princeton-nlp/Llama-3-8B-ProLong-64k-Base/9130a862-cfd7-47ce-a92a-f60438739491.json deleted file mode 100644 index 41ea67f01..000000000 --- a/data/hfopenllm_v2/princeton-nlp/Llama-3-8B-ProLong-64k-Base/9130a862-cfd7-47ce-a92a-f60438739491.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/princeton-nlp_Llama-3-8B-ProLong-64k-Base/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-8B-ProLong-64k-Base", - "id": "princeton-nlp/Llama-3-8B-ProLong-64k-Base", - "developer": "princeton-nlp", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5201 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4927 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.065 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2651 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4341 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3348 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/princeton-nlp/Llama-3-8B-ProLong-64k-Instruct/858d3717-fcb2-45d9-8eaa-1b00ae0ca918.json b/data/hfopenllm_v2/princeton-nlp/Llama-3-8B-ProLong-64k-Instruct/858d3717-fcb2-45d9-8eaa-1b00ae0ca918.json deleted file mode 100644 index b83c173d0..000000000 --- 
a/data/hfopenllm_v2/princeton-nlp/Llama-3-8B-ProLong-64k-Instruct/858d3717-fcb2-45d9-8eaa-1b00ae0ca918.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/princeton-nlp_Llama-3-8B-ProLong-64k-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-8B-ProLong-64k-Instruct", - "id": "princeton-nlp/Llama-3-8B-ProLong-64k-Instruct", - "developer": "princeton-nlp", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5563 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5083 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.065 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2953 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4397 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3275 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/princeton-nlp/Llama-3-Base-8B-SFT-CPO/5f1f137b-cb2f-4ee6-8bc9-5e0b94939f35.json b/data/hfopenllm_v2/princeton-nlp/Llama-3-Base-8B-SFT-CPO/5f1f137b-cb2f-4ee6-8bc9-5e0b94939f35.json deleted file mode 100644 index 25691f45b..000000000 --- a/data/hfopenllm_v2/princeton-nlp/Llama-3-Base-8B-SFT-CPO/5f1f137b-cb2f-4ee6-8bc9-5e0b94939f35.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - 
"evaluation_id": "hfopenllm_v2/princeton-nlp_Llama-3-Base-8B-SFT-CPO/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-Base-8B-SFT-CPO", - "id": "princeton-nlp/Llama-3-Base-8B-SFT-CPO", - "developer": "princeton-nlp", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3703 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4595 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0544 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2743 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3609 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2976 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/princeton-nlp/Llama-3-Base-8B-SFT-DPO/6feca911-7a6e-43a2-b59d-7cb48070fe8e.json b/data/hfopenllm_v2/princeton-nlp/Llama-3-Base-8B-SFT-DPO/6feca911-7a6e-43a2-b59d-7cb48070fe8e.json deleted file mode 100644 index 84c86e76e..000000000 --- a/data/hfopenllm_v2/princeton-nlp/Llama-3-Base-8B-SFT-DPO/6feca911-7a6e-43a2-b59d-7cb48070fe8e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/princeton-nlp_Llama-3-Base-8B-SFT-DPO/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - 
"source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-Base-8B-SFT-DPO", - "id": "princeton-nlp/Llama-3-Base-8B-SFT-DPO", - "developer": "princeton-nlp", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4111 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4666 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0415 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3104 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3867 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3078 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/princeton-nlp/Llama-3-Base-8B-SFT-IPO/d3ad9813-273e-47de-be16-312cc67ac64f.json b/data/hfopenllm_v2/princeton-nlp/Llama-3-Base-8B-SFT-IPO/d3ad9813-273e-47de-be16-312cc67ac64f.json deleted file mode 100644 index 95f0c784c..000000000 --- a/data/hfopenllm_v2/princeton-nlp/Llama-3-Base-8B-SFT-IPO/d3ad9813-273e-47de-be16-312cc67ac64f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/princeton-nlp_Llama-3-Base-8B-SFT-IPO/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-Base-8B-SFT-IPO", - "id": 
"princeton-nlp/Llama-3-Base-8B-SFT-IPO", - "developer": "princeton-nlp", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4487 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.469 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0393 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2978 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3919 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3115 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/princeton-nlp/Llama-3-Base-8B-SFT-KTO/317205ee-2cc6-4523-9662-be6508314b08.json b/data/hfopenllm_v2/princeton-nlp/Llama-3-Base-8B-SFT-KTO/317205ee-2cc6-4523-9662-be6508314b08.json deleted file mode 100644 index 1afb80262..000000000 --- a/data/hfopenllm_v2/princeton-nlp/Llama-3-Base-8B-SFT-KTO/317205ee-2cc6-4523-9662-be6508314b08.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/princeton-nlp_Llama-3-Base-8B-SFT-KTO/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-Base-8B-SFT-KTO", - "id": "princeton-nlp/Llama-3-Base-8B-SFT-KTO", - "developer": "princeton-nlp", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - 
"params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4523 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4693 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0529 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3054 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3842 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3054 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/princeton-nlp/Llama-3-Base-8B-SFT-ORPO/3b5fe65a-50a1-4036-b81a-86117356cab9.json b/data/hfopenllm_v2/princeton-nlp/Llama-3-Base-8B-SFT-ORPO/3b5fe65a-50a1-4036-b81a-86117356cab9.json deleted file mode 100644 index 4c1233c1f..000000000 --- a/data/hfopenllm_v2/princeton-nlp/Llama-3-Base-8B-SFT-ORPO/3b5fe65a-50a1-4036-b81a-86117356cab9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/princeton-nlp_Llama-3-Base-8B-SFT-ORPO/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-Base-8B-SFT-ORPO", - "id": "princeton-nlp/Llama-3-Base-8B-SFT-ORPO", - "developer": "princeton-nlp", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": 
"google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4517 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4734 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0468 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3138 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3707 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3083 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/princeton-nlp/Llama-3-Base-8B-SFT-RDPO/812ac262-97f4-485e-93de-f8d420b8658e.json b/data/hfopenllm_v2/princeton-nlp/Llama-3-Base-8B-SFT-RDPO/812ac262-97f4-485e-93de-f8d420b8658e.json deleted file mode 100644 index 21691d320..000000000 --- a/data/hfopenllm_v2/princeton-nlp/Llama-3-Base-8B-SFT-RDPO/812ac262-97f4-485e-93de-f8d420b8658e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/princeton-nlp_Llama-3-Base-8B-SFT-RDPO/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-Base-8B-SFT-RDPO", - "id": "princeton-nlp/Llama-3-Base-8B-SFT-RDPO", - "developer": "princeton-nlp", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, 
- "score_details": { - "score": 0.448 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4662 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0574 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3062 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4027 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3014 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/princeton-nlp/Llama-3-Base-8B-SFT-RRHF/39cd7eb0-781e-47b6-8eaa-c72e702f778f.json b/data/hfopenllm_v2/princeton-nlp/Llama-3-Base-8B-SFT-RRHF/39cd7eb0-781e-47b6-8eaa-c72e702f778f.json deleted file mode 100644 index 564369fd0..000000000 --- a/data/hfopenllm_v2/princeton-nlp/Llama-3-Base-8B-SFT-RRHF/39cd7eb0-781e-47b6-8eaa-c72e702f778f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/princeton-nlp_Llama-3-Base-8B-SFT-RRHF/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-Base-8B-SFT-RRHF", - "id": "princeton-nlp/Llama-3-Base-8B-SFT-RRHF", - "developer": "princeton-nlp", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3357 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.452 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0453 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3054 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3722 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2889 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/princeton-nlp/Llama-3-Base-8B-SFT-SLiC-HF/9411a8a4-306e-43da-96d7-c93eb3aac398.json b/data/hfopenllm_v2/princeton-nlp/Llama-3-Base-8B-SFT-SLiC-HF/9411a8a4-306e-43da-96d7-c93eb3aac398.json deleted file mode 100644 index 3220421bd..000000000 --- a/data/hfopenllm_v2/princeton-nlp/Llama-3-Base-8B-SFT-SLiC-HF/9411a8a4-306e-43da-96d7-c93eb3aac398.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/princeton-nlp_Llama-3-Base-8B-SFT-SLiC-HF/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-Base-8B-SFT-SLiC-HF", - "id": "princeton-nlp/Llama-3-Base-8B-SFT-SLiC-HF", - "developer": "princeton-nlp", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.489 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.4704 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0506 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2869 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4091 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3063 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/princeton-nlp/Llama-3-Base-8B-SFT-SimPO/c93feb32-0526-44ac-b3ed-95f08c37cc9f.json b/data/hfopenllm_v2/princeton-nlp/Llama-3-Base-8B-SFT-SimPO/c93feb32-0526-44ac-b3ed-95f08c37cc9f.json deleted file mode 100644 index 5ed4540df..000000000 --- a/data/hfopenllm_v2/princeton-nlp/Llama-3-Base-8B-SFT-SimPO/c93feb32-0526-44ac-b3ed-95f08c37cc9f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/princeton-nlp_Llama-3-Base-8B-SFT-SimPO/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-Base-8B-SFT-SimPO", - "id": "princeton-nlp/Llama-3-Base-8B-SFT-SimPO", - "developer": "princeton-nlp", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4685 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4741 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": 
"DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0551 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2886 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4127 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3105 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/princeton-nlp/Llama-3-Base-8B-SFT/1a3b0f7a-afb6-4002-9321-23a86f000c5c.json b/data/hfopenllm_v2/princeton-nlp/Llama-3-Base-8B-SFT/1a3b0f7a-afb6-4002-9321-23a86f000c5c.json deleted file mode 100644 index d4c8acc9b..000000000 --- a/data/hfopenllm_v2/princeton-nlp/Llama-3-Base-8B-SFT/1a3b0f7a-afb6-4002-9321-23a86f000c5c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/princeton-nlp_Llama-3-Base-8B-SFT/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-Base-8B-SFT", - "id": "princeton-nlp/Llama-3-Base-8B-SFT", - "developer": "princeton-nlp", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2796 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4643 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, 
- "score_details": { - "score": 0.04 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2978 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4118 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3093 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/princeton-nlp/Llama-3-Instruct-8B-CPO-v0.2/8d29363d-3096-4c54-a40e-acf4a7318a04.json b/data/hfopenllm_v2/princeton-nlp/Llama-3-Instruct-8B-CPO-v0.2/8d29363d-3096-4c54-a40e-acf4a7318a04.json deleted file mode 100644 index 6799069cb..000000000 --- a/data/hfopenllm_v2/princeton-nlp/Llama-3-Instruct-8B-CPO-v0.2/8d29363d-3096-4c54-a40e-acf4a7318a04.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/princeton-nlp_Llama-3-Instruct-8B-CPO-v0.2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-Instruct-8B-CPO-v0.2", - "id": "princeton-nlp/Llama-3-Instruct-8B-CPO-v0.2", - "developer": "princeton-nlp", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7506 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5027 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.108 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": 
"Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2609 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3619 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3706 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/princeton-nlp/Llama-3-Instruct-8B-CPO/8cea452d-63b8-4e82-9511-64c94f8e140d.json b/data/hfopenllm_v2/princeton-nlp/Llama-3-Instruct-8B-CPO/8cea452d-63b8-4e82-9511-64c94f8e140d.json deleted file mode 100644 index acd408cf8..000000000 --- a/data/hfopenllm_v2/princeton-nlp/Llama-3-Instruct-8B-CPO/8cea452d-63b8-4e82-9511-64c94f8e140d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/princeton-nlp_Llama-3-Instruct-8B-CPO/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-Instruct-8B-CPO", - "id": "princeton-nlp/Llama-3-Instruct-8B-CPO", - "developer": "princeton-nlp", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7293 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4999 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0989 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.2601 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3514 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3652 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/princeton-nlp/Llama-3-Instruct-8B-DPO-v0.2/5e5b5424-1d48-4a5e-8775-52c75609c338.json b/data/hfopenllm_v2/princeton-nlp/Llama-3-Instruct-8B-DPO-v0.2/5e5b5424-1d48-4a5e-8775-52c75609c338.json deleted file mode 100644 index 68024676a..000000000 --- a/data/hfopenllm_v2/princeton-nlp/Llama-3-Instruct-8B-DPO-v0.2/5e5b5424-1d48-4a5e-8775-52c75609c338.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/princeton-nlp_Llama-3-Instruct-8B-DPO-v0.2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-Instruct-8B-DPO-v0.2", - "id": "princeton-nlp/Llama-3-Instruct-8B-DPO-v0.2", - "developer": "princeton-nlp", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7208 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5056 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0899 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2869 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": 
"TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3844 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3769 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/princeton-nlp/Llama-3-Instruct-8B-DPO/73787033-ed1d-4d2e-b7b2-e886ef6f1036.json b/data/hfopenllm_v2/princeton-nlp/Llama-3-Instruct-8B-DPO/73787033-ed1d-4d2e-b7b2-e886ef6f1036.json deleted file mode 100644 index 9471420c1..000000000 --- a/data/hfopenllm_v2/princeton-nlp/Llama-3-Instruct-8B-DPO/73787033-ed1d-4d2e-b7b2-e886ef6f1036.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/princeton-nlp_Llama-3-Instruct-8B-DPO/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-Instruct-8B-DPO", - "id": "princeton-nlp/Llama-3-Instruct-8B-DPO", - "developer": "princeton-nlp", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6757 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4991 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0846 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2718 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.3738 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3665 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/princeton-nlp/Llama-3-Instruct-8B-KTO-v0.2/54c9403f-2525-45c0-a585-9ff598f95f6b.json b/data/hfopenllm_v2/princeton-nlp/Llama-3-Instruct-8B-KTO-v0.2/54c9403f-2525-45c0-a585-9ff598f95f6b.json deleted file mode 100644 index 8762059d7..000000000 --- a/data/hfopenllm_v2/princeton-nlp/Llama-3-Instruct-8B-KTO-v0.2/54c9403f-2525-45c0-a585-9ff598f95f6b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/princeton-nlp_Llama-3-Instruct-8B-KTO-v0.2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-Instruct-8B-KTO-v0.2", - "id": "princeton-nlp/Llama-3-Instruct-8B-KTO-v0.2", - "developer": "princeton-nlp", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.729 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.508 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0997 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2601 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3777 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - 
"hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3668 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/princeton-nlp/Llama-3-Instruct-8B-KTO/77d0d88d-7ca8-4f3e-8b79-295f53140635.json b/data/hfopenllm_v2/princeton-nlp/Llama-3-Instruct-8B-KTO/77d0d88d-7ca8-4f3e-8b79-295f53140635.json deleted file mode 100644 index 0d6769c0d..000000000 --- a/data/hfopenllm_v2/princeton-nlp/Llama-3-Instruct-8B-KTO/77d0d88d-7ca8-4f3e-8b79-295f53140635.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/princeton-nlp_Llama-3-Instruct-8B-KTO/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-Instruct-8B-KTO", - "id": "princeton-nlp/Llama-3-Instruct-8B-KTO", - "developer": "princeton-nlp", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6864 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4982 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0725 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.276 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3698 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.3599 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/princeton-nlp/Llama-3-Instruct-8B-ORPO-v0.2/727f27e3-2a3f-4572-8db5-87e498c4b6ca.json b/data/hfopenllm_v2/princeton-nlp/Llama-3-Instruct-8B-ORPO-v0.2/727f27e3-2a3f-4572-8db5-87e498c4b6ca.json deleted file mode 100644 index 26a0731e0..000000000 --- a/data/hfopenllm_v2/princeton-nlp/Llama-3-Instruct-8B-ORPO-v0.2/727f27e3-2a3f-4572-8db5-87e498c4b6ca.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/princeton-nlp_Llama-3-Instruct-8B-ORPO-v0.2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-Instruct-8B-ORPO-v0.2", - "id": "princeton-nlp/Llama-3-Instruct-8B-ORPO-v0.2", - "developer": "princeton-nlp", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7633 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5078 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.102 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2836 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.378 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3731 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/princeton-nlp/Llama-3-Instruct-8B-ORPO/b6e0cc97-27cf-4082-a908-95d5c39014b8.json b/data/hfopenllm_v2/princeton-nlp/Llama-3-Instruct-8B-ORPO/b6e0cc97-27cf-4082-a908-95d5c39014b8.json deleted file mode 100644 index 64f178681..000000000 --- a/data/hfopenllm_v2/princeton-nlp/Llama-3-Instruct-8B-ORPO/b6e0cc97-27cf-4082-a908-95d5c39014b8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/princeton-nlp_Llama-3-Instruct-8B-ORPO/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-Instruct-8B-ORPO", - "id": "princeton-nlp/Llama-3-Instruct-8B-ORPO", - "developer": "princeton-nlp", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7128 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5001 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0785 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2584 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3502 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3646 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/princeton-nlp/Llama-3-Instruct-8B-RDPO-v0.2/3b77ec51-fd47-4bc7-9e96-ed46202fef7c.json 
b/data/hfopenllm_v2/princeton-nlp/Llama-3-Instruct-8B-RDPO-v0.2/3b77ec51-fd47-4bc7-9e96-ed46202fef7c.json deleted file mode 100644 index bac88e0ae..000000000 --- a/data/hfopenllm_v2/princeton-nlp/Llama-3-Instruct-8B-RDPO-v0.2/3b77ec51-fd47-4bc7-9e96-ed46202fef7c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/princeton-nlp_Llama-3-Instruct-8B-RDPO-v0.2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-Instruct-8B-RDPO-v0.2", - "id": "princeton-nlp/Llama-3-Instruct-8B-RDPO-v0.2", - "developer": "princeton-nlp", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7077 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5049 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0869 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2928 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3804 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3774 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/princeton-nlp/Llama-3-Instruct-8B-RDPO/b24cdd3f-3e44-4ebe-b2b4-209ee0bbfbd3.json b/data/hfopenllm_v2/princeton-nlp/Llama-3-Instruct-8B-RDPO/b24cdd3f-3e44-4ebe-b2b4-209ee0bbfbd3.json deleted file mode 100644 index 03be08868..000000000 --- 
a/data/hfopenllm_v2/princeton-nlp/Llama-3-Instruct-8B-RDPO/b24cdd3f-3e44-4ebe-b2b4-209ee0bbfbd3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/princeton-nlp_Llama-3-Instruct-8B-RDPO/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-Instruct-8B-RDPO", - "id": "princeton-nlp/Llama-3-Instruct-8B-RDPO", - "developer": "princeton-nlp", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.666 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5034 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0846 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2827 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3752 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3607 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/princeton-nlp/Llama-3-Instruct-8B-RRHF-v0.2/e47a3cab-dfef-47f6-9377-9ee32489bab6.json b/data/hfopenllm_v2/princeton-nlp/Llama-3-Instruct-8B-RRHF-v0.2/e47a3cab-dfef-47f6-9377-9ee32489bab6.json deleted file mode 100644 index d4d35dc63..000000000 --- a/data/hfopenllm_v2/princeton-nlp/Llama-3-Instruct-8B-RRHF-v0.2/e47a3cab-dfef-47f6-9377-9ee32489bab6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - 
"evaluation_id": "hfopenllm_v2/princeton-nlp_Llama-3-Instruct-8B-RRHF-v0.2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-Instruct-8B-RRHF-v0.2", - "id": "princeton-nlp/Llama-3-Instruct-8B-RRHF-v0.2", - "developer": "princeton-nlp", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7125 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4984 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0876 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2601 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3738 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3482 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/princeton-nlp/Llama-3-Instruct-8B-RRHF/1e4481fe-458b-4c23-8a6c-55439fb8b4fd.json b/data/hfopenllm_v2/princeton-nlp/Llama-3-Instruct-8B-RRHF/1e4481fe-458b-4c23-8a6c-55439fb8b4fd.json deleted file mode 100644 index e44593e69..000000000 --- a/data/hfopenllm_v2/princeton-nlp/Llama-3-Instruct-8B-RRHF/1e4481fe-458b-4c23-8a6c-55439fb8b4fd.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/princeton-nlp_Llama-3-Instruct-8B-RRHF/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF 
Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-Instruct-8B-RRHF", - "id": "princeton-nlp/Llama-3-Instruct-8B-RRHF", - "developer": "princeton-nlp", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7275 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4911 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0967 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2802 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3476 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3644 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/princeton-nlp/Llama-3-Instruct-8B-SLiC-HF-v0.2/6421e9dc-e7ca-4e1c-9f4f-1d1ac409c4d1.json b/data/hfopenllm_v2/princeton-nlp/Llama-3-Instruct-8B-SLiC-HF-v0.2/6421e9dc-e7ca-4e1c-9f4f-1d1ac409c4d1.json deleted file mode 100644 index 7c04e216a..000000000 --- a/data/hfopenllm_v2/princeton-nlp/Llama-3-Instruct-8B-SLiC-HF-v0.2/6421e9dc-e7ca-4e1c-9f4f-1d1ac409c4d1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/princeton-nlp_Llama-3-Instruct-8B-SLiC-HF-v0.2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - 
"name": "Llama-3-Instruct-8B-SLiC-HF-v0.2", - "id": "princeton-nlp/Llama-3-Instruct-8B-SLiC-HF-v0.2", - "developer": "princeton-nlp", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.711 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4984 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0876 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2601 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3738 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3482 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/princeton-nlp/Llama-3-Instruct-8B-SLiC-HF/55f43b53-6ed9-4c16-bf75-c968999a6f36.json b/data/hfopenllm_v2/princeton-nlp/Llama-3-Instruct-8B-SLiC-HF/55f43b53-6ed9-4c16-bf75-c968999a6f36.json deleted file mode 100644 index abb7ea88f..000000000 --- a/data/hfopenllm_v2/princeton-nlp/Llama-3-Instruct-8B-SLiC-HF/55f43b53-6ed9-4c16-bf75-c968999a6f36.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/princeton-nlp_Llama-3-Instruct-8B-SLiC-HF/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-Instruct-8B-SLiC-HF", - "id": "princeton-nlp/Llama-3-Instruct-8B-SLiC-HF", - "developer": "princeton-nlp", - "inference_platform": "unknown", - 
"additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.74 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5029 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0974 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2861 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3723 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3585 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/princeton-nlp/Llama-3-Instruct-8B-SimPO-v0.2/6ce93e70-04b1-46b8-b3e3-7eb0df35e1c1.json b/data/hfopenllm_v2/princeton-nlp/Llama-3-Instruct-8B-SimPO-v0.2/6ce93e70-04b1-46b8-b3e3-7eb0df35e1c1.json deleted file mode 100644 index ffe9fbf5e..000000000 --- a/data/hfopenllm_v2/princeton-nlp/Llama-3-Instruct-8B-SimPO-v0.2/6ce93e70-04b1-46b8-b3e3-7eb0df35e1c1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/princeton-nlp_Llama-3-Instruct-8B-SimPO-v0.2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-Instruct-8B-SimPO-v0.2", - "id": "princeton-nlp/Llama-3-Instruct-8B-SimPO-v0.2", - "developer": "princeton-nlp", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - 
"evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6809 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5038 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.074 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3012 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3988 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3622 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/princeton-nlp/Llama-3-Instruct-8B-SimPO/95096a89-2baf-4b14-bc6e-1f30e920c086.json b/data/hfopenllm_v2/princeton-nlp/Llama-3-Instruct-8B-SimPO/95096a89-2baf-4b14-bc6e-1f30e920c086.json deleted file mode 100644 index c46b79969..000000000 --- a/data/hfopenllm_v2/princeton-nlp/Llama-3-Instruct-8B-SimPO/95096a89-2baf-4b14-bc6e-1f30e920c086.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/princeton-nlp_Llama-3-Instruct-8B-SimPO/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-Instruct-8B-SimPO", - "id": "princeton-nlp/Llama-3-Instruct-8B-SimPO", - "developer": "princeton-nlp", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": 
"Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6504 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4845 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0861 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2936 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3948 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3489 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/princeton-nlp/Mistral-7B-Base-SFT-CPO/f1651632-2787-47cf-b471-89d1b89a6b01.json b/data/hfopenllm_v2/princeton-nlp/Mistral-7B-Base-SFT-CPO/f1651632-2787-47cf-b471-89d1b89a6b01.json deleted file mode 100644 index 625ad8634..000000000 --- a/data/hfopenllm_v2/princeton-nlp/Mistral-7B-Base-SFT-CPO/f1651632-2787-47cf-b471-89d1b89a6b01.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/princeton-nlp_Mistral-7B-Base-SFT-CPO/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral-7B-Base-SFT-CPO", - "id": "princeton-nlp/Mistral-7B-Base-SFT-CPO", - "developer": "princeton-nlp", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4655 - } - }, - { - "evaluation_name": 
"BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4382 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0279 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2919 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4071 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2651 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/princeton-nlp/Mistral-7B-Base-SFT-DPO/e1fb2ac9-8f60-4dc1-9e0d-99fcb91a53a9.json b/data/hfopenllm_v2/princeton-nlp/Mistral-7B-Base-SFT-DPO/e1fb2ac9-8f60-4dc1-9e0d-99fcb91a53a9.json deleted file mode 100644 index a960ca923..000000000 --- a/data/hfopenllm_v2/princeton-nlp/Mistral-7B-Base-SFT-DPO/e1fb2ac9-8f60-4dc1-9e0d-99fcb91a53a9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/princeton-nlp_Mistral-7B-Base-SFT-DPO/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral-7B-Base-SFT-DPO", - "id": "princeton-nlp/Mistral-7B-Base-SFT-DPO", - "developer": "princeton-nlp", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4403 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.435 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0211 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2727 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4122 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2645 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/princeton-nlp/Mistral-7B-Base-SFT-IPO/d3accbc1-d698-4357-ab08-0b98fb49b4ed.json b/data/hfopenllm_v2/princeton-nlp/Mistral-7B-Base-SFT-IPO/d3accbc1-d698-4357-ab08-0b98fb49b4ed.json deleted file mode 100644 index 908f14ac8..000000000 --- a/data/hfopenllm_v2/princeton-nlp/Mistral-7B-Base-SFT-IPO/d3accbc1-d698-4357-ab08-0b98fb49b4ed.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/princeton-nlp_Mistral-7B-Base-SFT-IPO/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral-7B-Base-SFT-IPO", - "id": "princeton-nlp/Mistral-7B-Base-SFT-IPO", - "developer": "princeton-nlp", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.483 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4458 - } - }, - { - "evaluation_name": "MATH Level 5", - 
"source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0287 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2802 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3776 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2792 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/princeton-nlp/Mistral-7B-Base-SFT-KTO/5388a25a-5780-4ae1-999f-172b558a7b52.json b/data/hfopenllm_v2/princeton-nlp/Mistral-7B-Base-SFT-KTO/5388a25a-5780-4ae1-999f-172b558a7b52.json deleted file mode 100644 index 31e5cf5b4..000000000 --- a/data/hfopenllm_v2/princeton-nlp/Mistral-7B-Base-SFT-KTO/5388a25a-5780-4ae1-999f-172b558a7b52.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/princeton-nlp_Mistral-7B-Base-SFT-KTO/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral-7B-Base-SFT-KTO", - "id": "princeton-nlp/Mistral-7B-Base-SFT-KTO", - "developer": "princeton-nlp", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4785 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4476 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact 
Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0393 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2903 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4368 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2872 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/princeton-nlp/Mistral-7B-Base-SFT-RDPO/9e4143ff-d461-4fdb-8bc7-86f959f69e68.json b/data/hfopenllm_v2/princeton-nlp/Mistral-7B-Base-SFT-RDPO/9e4143ff-d461-4fdb-8bc7-86f959f69e68.json deleted file mode 100644 index 56f6acce0..000000000 --- a/data/hfopenllm_v2/princeton-nlp/Mistral-7B-Base-SFT-RDPO/9e4143ff-d461-4fdb-8bc7-86f959f69e68.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/princeton-nlp_Mistral-7B-Base-SFT-RDPO/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral-7B-Base-SFT-RDPO", - "id": "princeton-nlp/Mistral-7B-Base-SFT-RDPO", - "developer": "princeton-nlp", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4606 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.444 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0219 - } - }, - { - 
"evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2777 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3579 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2777 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/princeton-nlp/Mistral-7B-Base-SFT-RRHF/5d843bd7-b34b-41d4-92ff-c25a709b4930.json b/data/hfopenllm_v2/princeton-nlp/Mistral-7B-Base-SFT-RRHF/5d843bd7-b34b-41d4-92ff-c25a709b4930.json deleted file mode 100644 index 8699b54e7..000000000 --- a/data/hfopenllm_v2/princeton-nlp/Mistral-7B-Base-SFT-RRHF/5d843bd7-b34b-41d4-92ff-c25a709b4930.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/princeton-nlp_Mistral-7B-Base-SFT-RRHF/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral-7B-Base-SFT-RRHF", - "id": "princeton-nlp/Mistral-7B-Base-SFT-RRHF", - "developer": "princeton-nlp", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4407 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4281 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0249 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": 
"Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2903 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4187 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2398 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/princeton-nlp/Mistral-7B-Base-SFT-SLiC-HF/87975b2f-298b-4297-8f4d-e5bb1bf5d113.json b/data/hfopenllm_v2/princeton-nlp/Mistral-7B-Base-SFT-SLiC-HF/87975b2f-298b-4297-8f4d-e5bb1bf5d113.json deleted file mode 100644 index 09f698c9e..000000000 --- a/data/hfopenllm_v2/princeton-nlp/Mistral-7B-Base-SFT-SLiC-HF/87975b2f-298b-4297-8f4d-e5bb1bf5d113.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/princeton-nlp_Mistral-7B-Base-SFT-SLiC-HF/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral-7B-Base-SFT-SLiC-HF", - "id": "princeton-nlp/Mistral-7B-Base-SFT-SLiC-HF", - "developer": "princeton-nlp", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5127 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4422 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0355 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2919 - } - }, - { 
- "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4261 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2781 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/princeton-nlp/Mistral-7B-Base-SFT-SimPO/41bb8174-f3d6-4862-b892-dbc9f6e2e696.json b/data/hfopenllm_v2/princeton-nlp/Mistral-7B-Base-SFT-SimPO/41bb8174-f3d6-4862-b892-dbc9f6e2e696.json deleted file mode 100644 index 5f30fecdb..000000000 --- a/data/hfopenllm_v2/princeton-nlp/Mistral-7B-Base-SFT-SimPO/41bb8174-f3d6-4862-b892-dbc9f6e2e696.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/princeton-nlp_Mistral-7B-Base-SFT-SimPO/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral-7B-Base-SFT-SimPO", - "id": "princeton-nlp/Mistral-7B-Base-SFT-SimPO", - "developer": "princeton-nlp", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4701 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4398 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0144 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2836 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3971 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2702 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/princeton-nlp/Mistral-7B-Instruct-CPO/683ad2cd-5e39-4088-b98b-94d89dda7b88.json b/data/hfopenllm_v2/princeton-nlp/Mistral-7B-Instruct-CPO/683ad2cd-5e39-4088-b98b-94d89dda7b88.json deleted file mode 100644 index 256520f89..000000000 --- a/data/hfopenllm_v2/princeton-nlp/Mistral-7B-Instruct-CPO/683ad2cd-5e39-4088-b98b-94d89dda7b88.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/princeton-nlp_Mistral-7B-Instruct-CPO/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral-7B-Instruct-CPO", - "id": "princeton-nlp/Mistral-7B-Instruct-CPO", - "developer": "princeton-nlp", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4203 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4069 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0204 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2659 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4178 - } - }, - 
{ - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2701 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/princeton-nlp/Mistral-7B-Instruct-DPO/08ffd7ab-ccca-4258-be6d-cbc151cc43aa.json b/data/hfopenllm_v2/princeton-nlp/Mistral-7B-Instruct-DPO/08ffd7ab-ccca-4258-be6d-cbc151cc43aa.json deleted file mode 100644 index c489fb059..000000000 --- a/data/hfopenllm_v2/princeton-nlp/Mistral-7B-Instruct-DPO/08ffd7ab-ccca-4258-be6d-cbc151cc43aa.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/princeton-nlp_Mistral-7B-Instruct-DPO/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral-7B-Instruct-DPO", - "id": "princeton-nlp/Mistral-7B-Instruct-DPO", - "developer": "princeton-nlp", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5176 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.406 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.031 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2685 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3833 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2749 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/princeton-nlp/Mistral-7B-Instruct-IPO/4b6efad4-c697-4f0a-8d24-75dc49d8ec06.json b/data/hfopenllm_v2/princeton-nlp/Mistral-7B-Instruct-IPO/4b6efad4-c697-4f0a-8d24-75dc49d8ec06.json deleted file mode 100644 index 2ed1a6808..000000000 --- a/data/hfopenllm_v2/princeton-nlp/Mistral-7B-Instruct-IPO/4b6efad4-c697-4f0a-8d24-75dc49d8ec06.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/princeton-nlp_Mistral-7B-Instruct-IPO/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral-7B-Instruct-IPO", - "id": "princeton-nlp/Mistral-7B-Instruct-IPO", - "developer": "princeton-nlp", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4929 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4322 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0204 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2735 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4324 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2708 - } - 
} - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/princeton-nlp/Mistral-7B-Instruct-KTO/4986c30a-85b0-4263-9be4-d69c9b067e0c.json b/data/hfopenllm_v2/princeton-nlp/Mistral-7B-Instruct-KTO/4986c30a-85b0-4263-9be4-d69c9b067e0c.json deleted file mode 100644 index 9d84a30d3..000000000 --- a/data/hfopenllm_v2/princeton-nlp/Mistral-7B-Instruct-KTO/4986c30a-85b0-4263-9be4-d69c9b067e0c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/princeton-nlp_Mistral-7B-Instruct-KTO/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral-7B-Instruct-KTO", - "id": "princeton-nlp/Mistral-7B-Instruct-KTO", - "developer": "princeton-nlp", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4908 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.414 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0264 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2735 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3953 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2812 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/princeton-nlp/Mistral-7B-Instruct-ORPO/47b5a878-1a4a-425f-ae6f-ac286f681cca.json 
b/data/hfopenllm_v2/princeton-nlp/Mistral-7B-Instruct-ORPO/47b5a878-1a4a-425f-ae6f-ac286f681cca.json deleted file mode 100644 index ef91d0d7f..000000000 --- a/data/hfopenllm_v2/princeton-nlp/Mistral-7B-Instruct-ORPO/47b5a878-1a4a-425f-ae6f-ac286f681cca.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/princeton-nlp_Mistral-7B-Instruct-ORPO/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral-7B-Instruct-ORPO", - "id": "princeton-nlp/Mistral-7B-Instruct-ORPO", - "developer": "princeton-nlp", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.472 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4104 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0295 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2743 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3912 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2662 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/princeton-nlp/Mistral-7B-Instruct-RDPO/992a6862-46b9-415e-858f-2eff8709ca81.json b/data/hfopenllm_v2/princeton-nlp/Mistral-7B-Instruct-RDPO/992a6862-46b9-415e-858f-2eff8709ca81.json deleted file mode 100644 index 99e611d5b..000000000 --- 
a/data/hfopenllm_v2/princeton-nlp/Mistral-7B-Instruct-RDPO/992a6862-46b9-415e-858f-2eff8709ca81.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/princeton-nlp_Mistral-7B-Instruct-RDPO/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral-7B-Instruct-RDPO", - "id": "princeton-nlp/Mistral-7B-Instruct-RDPO", - "developer": "princeton-nlp", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4887 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.405 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0249 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2802 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3873 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2777 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/princeton-nlp/Mistral-7B-Instruct-RRHF/c6391381-c973-4068-b72c-af08762d9e5c.json b/data/hfopenllm_v2/princeton-nlp/Mistral-7B-Instruct-RRHF/c6391381-c973-4068-b72c-af08762d9e5c.json deleted file mode 100644 index d0f2bc97e..000000000 --- a/data/hfopenllm_v2/princeton-nlp/Mistral-7B-Instruct-RRHF/c6391381-c973-4068-b72c-af08762d9e5c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/princeton-nlp_Mistral-7B-Instruct-RRHF/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral-7B-Instruct-RRHF", - "id": "princeton-nlp/Mistral-7B-Instruct-RRHF", - "developer": "princeton-nlp", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.496 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.419 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0279 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.276 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3979 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2651 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/princeton-nlp/Mistral-7B-Instruct-SLiC-HF/0f6e18e6-1b0f-43f4-a9af-6632f6ce63cc.json b/data/hfopenllm_v2/princeton-nlp/Mistral-7B-Instruct-SLiC-HF/0f6e18e6-1b0f-43f4-a9af-6632f6ce63cc.json deleted file mode 100644 index 43e0e4f1d..000000000 --- a/data/hfopenllm_v2/princeton-nlp/Mistral-7B-Instruct-SLiC-HF/0f6e18e6-1b0f-43f4-a9af-6632f6ce63cc.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/princeton-nlp_Mistral-7B-Instruct-SLiC-HF/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - 
"source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral-7B-Instruct-SLiC-HF", - "id": "princeton-nlp/Mistral-7B-Instruct-SLiC-HF", - "developer": "princeton-nlp", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5115 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.404 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0174 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2727 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3913 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2715 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/princeton-nlp/Mistral-7B-Instruct-SimPO/56d9ee92-6774-4c9b-9861-c5f0a9945e7c.json b/data/hfopenllm_v2/princeton-nlp/Mistral-7B-Instruct-SimPO/56d9ee92-6774-4c9b-9861-c5f0a9945e7c.json deleted file mode 100644 index b5dab8271..000000000 --- a/data/hfopenllm_v2/princeton-nlp/Mistral-7B-Instruct-SimPO/56d9ee92-6774-4c9b-9861-c5f0a9945e7c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/princeton-nlp_Mistral-7B-Instruct-SimPO/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral-7B-Instruct-SimPO", 
- "id": "princeton-nlp/Mistral-7B-Instruct-SimPO", - "developer": "princeton-nlp", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4687 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4507 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0287 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2785 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4098 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2797 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/princeton-nlp/Sheared-LLaMA-1.3B/d3e753cc-37fc-4d77-8b2d-da90a7843d60.json b/data/hfopenllm_v2/princeton-nlp/Sheared-LLaMA-1.3B/d3e753cc-37fc-4d77-8b2d-da90a7843d60.json deleted file mode 100644 index e3ac0d5ad..000000000 --- a/data/hfopenllm_v2/princeton-nlp/Sheared-LLaMA-1.3B/d3e753cc-37fc-4d77-8b2d-da90a7843d60.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/princeton-nlp_Sheared-LLaMA-1.3B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Sheared-LLaMA-1.3B", - "id": "princeton-nlp/Sheared-LLaMA-1.3B", - "developer": "princeton-nlp", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - 
"params_billions": 1.3 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2198 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3197 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0128 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2399 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3713 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1171 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/princeton-nlp/Sheared-LLaMA-2.7B/eb08ef6f-6631-47c4-8f52-bf9454ad34b6.json b/data/hfopenllm_v2/princeton-nlp/Sheared-LLaMA-2.7B/eb08ef6f-6631-47c4-8f52-bf9454ad34b6.json deleted file mode 100644 index 8203b5fb5..000000000 --- a/data/hfopenllm_v2/princeton-nlp/Sheared-LLaMA-2.7B/eb08ef6f-6631-47c4-8f52-bf9454ad34b6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/princeton-nlp_Sheared-LLaMA-2.7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Sheared-LLaMA-2.7B", - "id": "princeton-nlp/Sheared-LLaMA-2.7B", - "developer": "princeton-nlp", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 2.7 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2417 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3259 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0128 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2752 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3567 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1187 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/princeton-nlp/gemma-2-9b-it-DPO/2207b154-c5d4-4e5a-ade0-271e62d6345f.json b/data/hfopenllm_v2/princeton-nlp/gemma-2-9b-it-DPO/2207b154-c5d4-4e5a-ade0-271e62d6345f.json deleted file mode 100644 index fd91acee8..000000000 --- a/data/hfopenllm_v2/princeton-nlp/gemma-2-9b-it-DPO/2207b154-c5d4-4e5a-ade0-271e62d6345f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/princeton-nlp_gemma-2-9b-it-DPO/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "gemma-2-9b-it-DPO", - "id": "princeton-nlp/gemma-2-9b-it-DPO", - "developer": "princeton-nlp", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 9.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2769 - } - }, - { - "evaluation_name": "BBH", - 
"source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5941 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0831 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3356 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.382 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3723 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/princeton-nlp/gemma-2-9b-it-SimPO/f4161154-7777-4261-9275-a3002a1305d8.json b/data/hfopenllm_v2/princeton-nlp/gemma-2-9b-it-SimPO/f4161154-7777-4261-9275-a3002a1305d8.json deleted file mode 100644 index 0d667a77a..000000000 --- a/data/hfopenllm_v2/princeton-nlp/gemma-2-9b-it-SimPO/f4161154-7777-4261-9275-a3002a1305d8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/princeton-nlp_gemma-2-9b-it-SimPO/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "gemma-2-9b-it-SimPO", - "id": "princeton-nlp/gemma-2-9b-it-SimPO", - "developer": "princeton-nlp", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 9.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3207 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5839 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.071 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3356 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4123 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3975 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/Bellatrix-1.5B-xElite/8523812d-1db6-4a9d-b06b-ac904191789d.json b/data/hfopenllm_v2/prithivMLmods/Bellatrix-1.5B-xElite/8523812d-1db6-4a9d-b06b-ac904191789d.json deleted file mode 100644 index a73f99ed2..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Bellatrix-1.5B-xElite/8523812d-1db6-4a9d-b06b-ac904191789d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_Bellatrix-1.5B-xElite/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Bellatrix-1.5B-xElite", - "id": "prithivMLmods/Bellatrix-1.5B-xElite", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.777 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1964 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3501 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - 
"source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.287 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2785 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3619 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1657 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/Bellatrix-Tiny-1.5B-R1/6cd9ea81-618d-444e-a892-d4f9819daa67.json b/data/hfopenllm_v2/prithivMLmods/Bellatrix-Tiny-1.5B-R1/6cd9ea81-618d-444e-a892-d4f9819daa67.json deleted file mode 100644 index ed7c9d8c6..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Bellatrix-Tiny-1.5B-R1/6cd9ea81-618d-444e-a892-d4f9819daa67.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_Bellatrix-Tiny-1.5B-R1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Bellatrix-Tiny-1.5B-R1", - "id": "prithivMLmods/Bellatrix-Tiny-1.5B-R1", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.544 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3352 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4022 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0604 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2987 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3683 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2751 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/Bellatrix-Tiny-1B-v2/2217326d-377a-4503-8180-206c12c87436.json b/data/hfopenllm_v2/prithivMLmods/Bellatrix-Tiny-1B-v2/2217326d-377a-4503-8180-206c12c87436.json deleted file mode 100644 index e1aaf8729..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Bellatrix-Tiny-1B-v2/2217326d-377a-4503-8180-206c12c87436.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_Bellatrix-Tiny-1B-v2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Bellatrix-Tiny-1B-v2", - "id": "prithivMLmods/Bellatrix-Tiny-1B-v2", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.236 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.151 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3268 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0287 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": 
"hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2727 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.343 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1493 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/Blaze-14B-xElite/3bbb10fc-e3b9-4c6a-ac35-ee5de9ecd330.json b/data/hfopenllm_v2/prithivMLmods/Blaze-14B-xElite/3bbb10fc-e3b9-4c6a-ac35-ee5de9ecd330.json deleted file mode 100644 index b456222e3..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Blaze-14B-xElite/3bbb10fc-e3b9-4c6a-ac35-ee5de9ecd330.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_Blaze-14B-xElite/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Blaze-14B-xElite", - "id": "prithivMLmods/Blaze-14B-xElite", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0363 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6628 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3693 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { 
- "score": 0.3943 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4625 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5111 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/COCO-7B-Instruct-1M/01124f11-b739-422b-97f7-062074b8d0fb.json b/data/hfopenllm_v2/prithivMLmods/COCO-7B-Instruct-1M/01124f11-b739-422b-97f7-062074b8d0fb.json deleted file mode 100644 index 7b3bedda6..000000000 --- a/data/hfopenllm_v2/prithivMLmods/COCO-7B-Instruct-1M/01124f11-b739-422b-97f7-062074b8d0fb.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_COCO-7B-Instruct-1M/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "COCO-7B-Instruct-1M", - "id": "prithivMLmods/COCO-7B-Instruct-1M", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4743 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.541 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3497 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3079 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": 
"Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4382 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4186 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/Calcium-Opus-14B-Elite-1M/7cc4c93b-7c43-4bed-84a3-fa1cd9130abb.json b/data/hfopenllm_v2/prithivMLmods/Calcium-Opus-14B-Elite-1M/7cc4c93b-7c43-4bed-84a3-fa1cd9130abb.json deleted file mode 100644 index f174ff65b..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Calcium-Opus-14B-Elite-1M/7cc4c93b-7c43-4bed-84a3-fa1cd9130abb.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_Calcium-Opus-14B-Elite-1M/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Calcium-Opus-14B-Elite-1M", - "id": "prithivMLmods/Calcium-Opus-14B-Elite-1M", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5613 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6329 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4456 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3523 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4676 - } - }, - { - 
"evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5152 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/Calcium-Opus-14B-Elite-Stock/bf3aa551-f9c6-4203-b2d4-55cf9e6e2872.json b/data/hfopenllm_v2/prithivMLmods/Calcium-Opus-14B-Elite-Stock/bf3aa551-f9c6-4203-b2d4-55cf9e6e2872.json deleted file mode 100644 index 5f39e061c..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Calcium-Opus-14B-Elite-Stock/bf3aa551-f9c6-4203-b2d4-55cf9e6e2872.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_Calcium-Opus-14B-Elite-Stock/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Calcium-Opus-14B-Elite-Stock", - "id": "prithivMLmods/Calcium-Opus-14B-Elite-Stock", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6143 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6329 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4668 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3683 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4808 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": 
{ - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5284 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/Calcium-Opus-14B-Elite/2eae8905-5338-4a78-86e7-d354d06efa23.json b/data/hfopenllm_v2/prithivMLmods/Calcium-Opus-14B-Elite/2eae8905-5338-4a78-86e7-d354d06efa23.json deleted file mode 100644 index ead3156a2..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Calcium-Opus-14B-Elite/2eae8905-5338-4a78-86e7-d354d06efa23.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_Calcium-Opus-14B-Elite/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Calcium-Opus-14B-Elite", - "id": "prithivMLmods/Calcium-Opus-14B-Elite", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6064 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6296 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3708 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3733 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4873 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5307 - } - } - 
] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/Calcium-Opus-14B-Elite/9dcc4121-e046-49c7-969e-7255b0c32d3d.json b/data/hfopenllm_v2/prithivMLmods/Calcium-Opus-14B-Elite/9dcc4121-e046-49c7-969e-7255b0c32d3d.json deleted file mode 100644 index 4683966d3..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Calcium-Opus-14B-Elite/9dcc4121-e046-49c7-969e-7255b0c32d3d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_Calcium-Opus-14B-Elite/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Calcium-Opus-14B-Elite", - "id": "prithivMLmods/Calcium-Opus-14B-Elite", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6052 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6317 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4789 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3742 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.486 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5302 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/Calcium-Opus-14B-Elite2-R1/dd7d4acd-549a-467b-b461-0eba5b019122.json 
b/data/hfopenllm_v2/prithivMLmods/Calcium-Opus-14B-Elite2-R1/dd7d4acd-549a-467b-b461-0eba5b019122.json deleted file mode 100644 index 4c691d1f5..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Calcium-Opus-14B-Elite2-R1/dd7d4acd-549a-467b-b461-0eba5b019122.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_Calcium-Opus-14B-Elite2-R1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Calcium-Opus-14B-Elite2-R1", - "id": "prithivMLmods/Calcium-Opus-14B-Elite2-R1", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6326 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6362 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3338 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3909 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.49 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5248 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/Calcium-Opus-14B-Elite2/159969cc-32c5-4f6f-b586-8e6d44180b44.json b/data/hfopenllm_v2/prithivMLmods/Calcium-Opus-14B-Elite2/159969cc-32c5-4f6f-b586-8e6d44180b44.json deleted file mode 100644 index 48a8d72bd..000000000 --- 
a/data/hfopenllm_v2/prithivMLmods/Calcium-Opus-14B-Elite2/159969cc-32c5-4f6f-b586-8e6d44180b44.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_Calcium-Opus-14B-Elite2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Calcium-Opus-14B-Elite2", - "id": "prithivMLmods/Calcium-Opus-14B-Elite2", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6176 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6318 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.469 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.37 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.494 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5301 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/Calcium-Opus-14B-Elite3/b80e559d-e519-4678-8abc-ee5591b81fac.json b/data/hfopenllm_v2/prithivMLmods/Calcium-Opus-14B-Elite3/b80e559d-e519-4678-8abc-ee5591b81fac.json deleted file mode 100644 index 25816e9c4..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Calcium-Opus-14B-Elite3/b80e559d-e519-4678-8abc-ee5591b81fac.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/prithivMLmods_Calcium-Opus-14B-Elite3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Calcium-Opus-14B-Elite3", - "id": "prithivMLmods/Calcium-Opus-14B-Elite3", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5428 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.635 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4705 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3708 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4795 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5335 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/Calcium-Opus-14B-Elite4/90c137c9-939d-4e77-9fcc-9e33551a6121.json b/data/hfopenllm_v2/prithivMLmods/Calcium-Opus-14B-Elite4/90c137c9-939d-4e77-9fcc-9e33551a6121.json deleted file mode 100644 index 2a82505c9..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Calcium-Opus-14B-Elite4/90c137c9-939d-4e77-9fcc-9e33551a6121.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_Calcium-Opus-14B-Elite4/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": 
"documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Calcium-Opus-14B-Elite4", - "id": "prithivMLmods/Calcium-Opus-14B-Elite4", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6112 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6195 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3625 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3557 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4687 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5149 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/Calcium-Opus-14B-Merge/f25d6fef-d337-4cf7-ba05-ca6ff5eccd52.json b/data/hfopenllm_v2/prithivMLmods/Calcium-Opus-14B-Merge/f25d6fef-d337-4cf7-ba05-ca6ff5eccd52.json deleted file mode 100644 index 7ea9119d9..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Calcium-Opus-14B-Merge/f25d6fef-d337-4cf7-ba05-ca6ff5eccd52.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_Calcium-Opus-14B-Merge/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Calcium-Opus-14B-Merge", - "id": 
"prithivMLmods/Calcium-Opus-14B-Merge", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4949 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6319 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4637 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3708 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4861 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5356 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/Calcium-Opus-20B-v1/c6f92306-dcdc-4549-bfc2-feb62a3a6ef6.json b/data/hfopenllm_v2/prithivMLmods/Calcium-Opus-20B-v1/c6f92306-dcdc-4549-bfc2-feb62a3a6ef6.json deleted file mode 100644 index 84bf99e76..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Calcium-Opus-20B-v1/c6f92306-dcdc-4549-bfc2-feb62a3a6ef6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_Calcium-Opus-20B-v1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Calcium-Opus-20B-v1", - "id": "prithivMLmods/Calcium-Opus-20B-v1", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 
19.173 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3093 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.599 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3618 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3532 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4943 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4734 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/Codepy-Deepthink-3B/96c64d23-d23d-486c-83a4-4c0ab4f09d60.json b/data/hfopenllm_v2/prithivMLmods/Codepy-Deepthink-3B/96c64d23-d23d-486c-83a4-4c0ab4f09d60.json deleted file mode 100644 index 8b565a25a..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Codepy-Deepthink-3B/96c64d23-d23d-486c-83a4-4c0ab4f09d60.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_Codepy-Deepthink-3B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Codepy-Deepthink-3B", - "id": "prithivMLmods/Codepy-Deepthink-3B", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4327 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4259 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1156 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2794 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.331 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.309 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/Coma-II-14B/243abf0b-0f88-4b4f-ab51-6c8aebaf19be.json b/data/hfopenllm_v2/prithivMLmods/Coma-II-14B/243abf0b-0f88-4b4f-ab51-6c8aebaf19be.json deleted file mode 100644 index 8f5675e29..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Coma-II-14B/243abf0b-0f88-4b4f-ab51-6c8aebaf19be.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_Coma-II-14B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Coma-II-14B", - "id": "prithivMLmods/Coma-II-14B", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4168 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": 
"BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6321 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5514 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4002 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5351 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.504 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/Condor-Opus-14B-Exp/438fb728-d6ad-4c28-a43c-ff82d522cd50.json b/data/hfopenllm_v2/prithivMLmods/Condor-Opus-14B-Exp/438fb728-d6ad-4c28-a43c-ff82d522cd50.json deleted file mode 100644 index 8ad6643ba..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Condor-Opus-14B-Exp/438fb728-d6ad-4c28-a43c-ff82d522cd50.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_Condor-Opus-14B-Exp/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Condor-Opus-14B-Exp", - "id": "prithivMLmods/Condor-Opus-14B-Exp", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4043 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.6154 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5227 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3918 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5194 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5014 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/Cygnus-II-14B/94b45b8d-b754-4fb4-843d-b7ffeafc4f1b.json b/data/hfopenllm_v2/prithivMLmods/Cygnus-II-14B/94b45b8d-b754-4fb4-843d-b7ffeafc4f1b.json deleted file mode 100644 index e0650e76a..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Cygnus-II-14B/94b45b8d-b754-4fb4-843d-b7ffeafc4f1b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_Cygnus-II-14B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Cygnus-II-14B", - "id": "prithivMLmods/Cygnus-II-14B", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6184 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6661 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - 
"metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4396 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3876 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4688 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5391 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/Deepthink-Llama-3-8B-Preview/5618fc82-d455-4261-8e34-1190d70fd3f3.json b/data/hfopenllm_v2/prithivMLmods/Deepthink-Llama-3-8B-Preview/5618fc82-d455-4261-8e34-1190d70fd3f3.json deleted file mode 100644 index 78544d542..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Deepthink-Llama-3-8B-Preview/5618fc82-d455-4261-8e34-1190d70fd3f3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_Deepthink-Llama-3-8B-Preview/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Deepthink-Llama-3-8B-Preview", - "id": "prithivMLmods/Deepthink-Llama-3-8B-Preview", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2955 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4665 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 
1.0 - }, - "score_details": { - "score": 0.355 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3163 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3707 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2739 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/Deepthink-Reasoning-14B/395f6339-3fca-4f4d-befc-2d231008efdd.json b/data/hfopenllm_v2/prithivMLmods/Deepthink-Reasoning-14B/395f6339-3fca-4f4d-befc-2d231008efdd.json deleted file mode 100644 index 3bf404b51..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Deepthink-Reasoning-14B/395f6339-3fca-4f4d-befc-2d231008efdd.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_Deepthink-Reasoning-14B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Deepthink-Reasoning-14B", - "id": "prithivMLmods/Deepthink-Reasoning-14B", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5424 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6334 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.423 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, 
- "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3666 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4732 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5296 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/Deepthink-Reasoning-7B/b22696ac-7074-44f2-b72f-c59ca0a41ce6.json b/data/hfopenllm_v2/prithivMLmods/Deepthink-Reasoning-7B/b22696ac-7074-44f2-b72f-c59ca0a41ce6.json deleted file mode 100644 index 4a2741947..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Deepthink-Reasoning-7B/b22696ac-7074-44f2-b72f-c59ca0a41ce6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_Deepthink-Reasoning-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Deepthink-Reasoning-7B", - "id": "prithivMLmods/Deepthink-Reasoning-7B", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.484 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5505 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3346 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 
0.2995 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4432 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4349 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/Dinobot-Opus-14B-Exp/6856f8b6-a719-4f69-be71-4df582015f28.json b/data/hfopenllm_v2/prithivMLmods/Dinobot-Opus-14B-Exp/6856f8b6-a719-4f69-be71-4df582015f28.json deleted file mode 100644 index e70ce3f68..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Dinobot-Opus-14B-Exp/6856f8b6-a719-4f69-be71-4df582015f28.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_Dinobot-Opus-14B-Exp/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Dinobot-Opus-14B-Exp", - "id": "prithivMLmods/Dinobot-Opus-14B-Exp", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.824 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.637 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5317 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3247 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy 
on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.426 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4979 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/Elita-0.1-Distilled-R1-abliterated/f2c0ea2b-76ae-4469-832e-84c0b79fa283.json b/data/hfopenllm_v2/prithivMLmods/Elita-0.1-Distilled-R1-abliterated/f2c0ea2b-76ae-4469-832e-84c0b79fa283.json deleted file mode 100644 index 588c887dc..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Elita-0.1-Distilled-R1-abliterated/f2c0ea2b-76ae-4469-832e-84c0b79fa283.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_Elita-0.1-Distilled-R1-abliterated/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Elita-0.1-Distilled-R1-abliterated", - "id": "prithivMLmods/Elita-0.1-Distilled-R1-abliterated", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3542 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3828 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3066 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2659 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.366 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2758 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/Elita-1/5619e3cb-eb3e-4420-a156-6f7b2a5d372d.json b/data/hfopenllm_v2/prithivMLmods/Elita-1/5619e3cb-eb3e-4420-a156-6f7b2a5d372d.json deleted file mode 100644 index 54138e141..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Elita-1/5619e3cb-eb3e-4420-a156-6f7b2a5d372d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_Elita-1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Elita-1", - "id": "prithivMLmods/Elita-1", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4906 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.652 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3429 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3758 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4834 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5381 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/Epimetheus-14B-Axo/9d5e329f-491a-4608-bcac-1ee63046b34a.json b/data/hfopenllm_v2/prithivMLmods/Epimetheus-14B-Axo/9d5e329f-491a-4608-bcac-1ee63046b34a.json deleted file mode 100644 index 188b84ba8..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Epimetheus-14B-Axo/9d5e329f-491a-4608-bcac-1ee63046b34a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_Epimetheus-14B-Axo/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Epimetheus-14B-Axo", - "id": "prithivMLmods/Epimetheus-14B-Axo", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5546 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6613 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4101 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3926 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.482 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5304 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/prithivMLmods/Equuleus-Opus-14B-Exp/80953f08-6530-4bab-a375-cc542081aabb.json b/data/hfopenllm_v2/prithivMLmods/Equuleus-Opus-14B-Exp/80953f08-6530-4bab-a375-cc542081aabb.json deleted file mode 100644 index 31a9c3f6f..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Equuleus-Opus-14B-Exp/80953f08-6530-4bab-a375-cc542081aabb.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_Equuleus-Opus-14B-Exp/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Equuleus-Opus-14B-Exp", - "id": "prithivMLmods/Equuleus-Opus-14B-Exp", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7001 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6434 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4585 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3867 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4952 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5374 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/Eridanus-Opus-14B-r999/0b8691a8-f394-4da3-a67b-faa1af9b42c9.json b/data/hfopenllm_v2/prithivMLmods/Eridanus-Opus-14B-r999/0b8691a8-f394-4da3-a67b-faa1af9b42c9.json deleted file 
mode 100644 index 4e0e78a3e..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Eridanus-Opus-14B-r999/0b8691a8-f394-4da3-a67b-faa1af9b42c9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_Eridanus-Opus-14B-r999/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Eridanus-Opus-14B-r999", - "id": "prithivMLmods/Eridanus-Opus-14B-r999", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6386 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6584 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.386 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3943 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4769 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5362 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/Evac-Opus-14B-Exp/fb541a2b-d9bd-4aa2-8b83-da62a3b77731.json b/data/hfopenllm_v2/prithivMLmods/Evac-Opus-14B-Exp/fb541a2b-d9bd-4aa2-8b83-da62a3b77731.json deleted file mode 100644 index 29b96ac56..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Evac-Opus-14B-Exp/fb541a2b-d9bd-4aa2-8b83-da62a3b77731.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - 
"evaluation_id": "hfopenllm_v2/prithivMLmods_Evac-Opus-14B-Exp/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Evac-Opus-14B-Exp", - "id": "prithivMLmods/Evac-Opus-14B-Exp", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5916 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6475 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4215 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3884 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4728 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5317 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/FastThink-0.5B-Tiny/c20d1c62-d3e0-4e30-b0d3-4c62a6585d23.json b/data/hfopenllm_v2/prithivMLmods/FastThink-0.5B-Tiny/c20d1c62-d3e0-4e30-b0d3-4c62a6585d23.json deleted file mode 100644 index 6a08d3a8b..000000000 --- a/data/hfopenllm_v2/prithivMLmods/FastThink-0.5B-Tiny/c20d1c62-d3e0-4e30-b0d3-4c62a6585d23.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_FastThink-0.5B-Tiny/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - 
"source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "FastThink-0.5B-Tiny", - "id": "prithivMLmods/FastThink-0.5B-Tiny", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.494 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.258 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3206 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0204 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2609 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3566 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1649 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/GWQ-9B-Preview/8a10eeb6-7178-4c78-8940-68fad78e389b.json b/data/hfopenllm_v2/prithivMLmods/GWQ-9B-Preview/8a10eeb6-7178-4c78-8940-68fad78e389b.json deleted file mode 100644 index 5fd930e8e..000000000 --- a/data/hfopenllm_v2/prithivMLmods/GWQ-9B-Preview/8a10eeb6-7178-4c78-8940-68fad78e389b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_GWQ-9B-Preview/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "GWQ-9B-Preview", - "id": "prithivMLmods/GWQ-9B-Preview", - "developer": "prithivMLmods", - "inference_platform": "unknown", - 
"additional_details": { - "precision": "float16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 9.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5066 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5806 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2266 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3398 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4951 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3984 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/GWQ-9B-Preview2/f0bb774c-a842-4261-b817-b169ce65a493.json b/data/hfopenllm_v2/prithivMLmods/GWQ-9B-Preview2/f0bb774c-a842-4261-b817-b169ce65a493.json deleted file mode 100644 index 99ceb5f7b..000000000 --- a/data/hfopenllm_v2/prithivMLmods/GWQ-9B-Preview2/f0bb774c-a842-4261-b817-b169ce65a493.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_GWQ-9B-Preview2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "GWQ-9B-Preview2", - "id": "prithivMLmods/GWQ-9B-Preview2", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 9.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - 
"source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5209 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5797 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2372 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3263 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.486 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3997 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/GWQ2b/59afe234-3a7f-49bb-873c-df6cf793e5e5.json b/data/hfopenllm_v2/prithivMLmods/GWQ2b/59afe234-3a7f-49bb-873c-df6cf793e5e5.json deleted file mode 100644 index 814634ab3..000000000 --- a/data/hfopenllm_v2/prithivMLmods/GWQ2b/59afe234-3a7f-49bb-873c-df6cf793e5e5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_GWQ2b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "GWQ2b", - "id": "prithivMLmods/GWQ2b", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 2.614 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4115 - } - }, - { - "evaluation_name": 
"BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4143 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0627 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2827 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4311 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2473 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/Gaea-Opus-14B-Exp/4074081a-66a6-42e4-994f-72541f90888b.json b/data/hfopenllm_v2/prithivMLmods/Gaea-Opus-14B-Exp/4074081a-66a6-42e4-994f-72541f90888b.json deleted file mode 100644 index 72bbcb358..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Gaea-Opus-14B-Exp/4074081a-66a6-42e4-994f-72541f90888b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_Gaea-Opus-14B-Exp/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gaea-Opus-14B-Exp", - "id": "prithivMLmods/Gaea-Opus-14B-Exp", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5956 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.656 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4275 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3909 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4859 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5401 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/Galactic-Qwen-14B-Exp1/6a618ec8-c029-49ec-9ea5-da52b5231280.json b/data/hfopenllm_v2/prithivMLmods/Galactic-Qwen-14B-Exp1/6a618ec8-c029-49ec-9ea5-da52b5231280.json deleted file mode 100644 index 9d9a90116..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Galactic-Qwen-14B-Exp1/6a618ec8-c029-49ec-9ea5-da52b5231280.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_Galactic-Qwen-14B-Exp1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Galactic-Qwen-14B-Exp1", - "id": "prithivMLmods/Galactic-Qwen-14B-Exp1", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5832 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6582 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - 
"source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4018 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3935 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4781 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5396 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/Galactic-Qwen-14B-Exp2/edc8f510-c961-4c1f-9757-e80c4247f275.json b/data/hfopenllm_v2/prithivMLmods/Galactic-Qwen-14B-Exp2/edc8f510-c961-4c1f-9757-e80c4247f275.json deleted file mode 100644 index 26d244447..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Galactic-Qwen-14B-Exp2/edc8f510-c961-4c1f-9757-e80c4247f275.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_Galactic-Qwen-14B-Exp2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Galactic-Qwen-14B-Exp2", - "id": "prithivMLmods/Galactic-Qwen-14B-Exp2", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.662 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7203 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3474 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3993 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5354 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5691 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/Gauss-Opus-14B-R999/aaa5d1e6-5aca-4471-87ea-7195610a6c1d.json b/data/hfopenllm_v2/prithivMLmods/Gauss-Opus-14B-R999/aaa5d1e6-5aca-4471-87ea-7195610a6c1d.json deleted file mode 100644 index 7057ef00e..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Gauss-Opus-14B-R999/aaa5d1e6-5aca-4471-87ea-7195610a6c1d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_Gauss-Opus-14B-R999/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gauss-Opus-14B-R999", - "id": "prithivMLmods/Gauss-Opus-14B-R999", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3907 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6228 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5755 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": 
"hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3918 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5338 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5007 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/Jolt-v0.1/89b45e8b-9979-4c7f-8aa6-c6ab7009cab0.json b/data/hfopenllm_v2/prithivMLmods/Jolt-v0.1/89b45e8b-9979-4c7f-8aa6-c6ab7009cab0.json deleted file mode 100644 index 96783fe07..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Jolt-v0.1/89b45e8b-9979-4c7f-8aa6-c6ab7009cab0.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_Jolt-v0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Jolt-v0.1", - "id": "prithivMLmods/Jolt-v0.1", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5092 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6521 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3565 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.38 - } - }, - { - 
"evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4847 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5386 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/Lacerta-Opus-14B-Elite8/41000c74-8b29-4369-996f-cf3a2fd09f63.json b/data/hfopenllm_v2/prithivMLmods/Lacerta-Opus-14B-Elite8/41000c74-8b29-4369-996f-cf3a2fd09f63.json deleted file mode 100644 index 4a4f258fd..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Lacerta-Opus-14B-Elite8/41000c74-8b29-4369-996f-cf3a2fd09f63.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_Lacerta-Opus-14B-Elite8/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Lacerta-Opus-14B-Elite8", - "id": "prithivMLmods/Lacerta-Opus-14B-Elite8", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6141 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6401 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3648 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3784 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on 
MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4635 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5322 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/Llama-3.1-5B-Instruct/a1765846-74e1-440a-8851-12a571444059.json b/data/hfopenllm_v2/prithivMLmods/Llama-3.1-5B-Instruct/a1765846-74e1-440a-8851-12a571444059.json deleted file mode 100644 index ae7d80588..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Llama-3.1-5B-Instruct/a1765846-74e1-440a-8851-12a571444059.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_Llama-3.1-5B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.1-5B-Instruct", - "id": "prithivMLmods/Llama-3.1-5B-Instruct", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 5.413 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1407 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3051 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0151 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2643 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.354 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { 
- "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1184 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/Llama-3.1-8B-Open-SFT/9c6b594f-387a-42a3-9e40-3b26363e6071.json b/data/hfopenllm_v2/prithivMLmods/Llama-3.1-8B-Open-SFT/9c6b594f-387a-42a3-9e40-3b26363e6071.json deleted file mode 100644 index 41ba66370..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Llama-3.1-8B-Open-SFT/9c6b594f-387a-42a3-9e40-3b26363e6071.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_Llama-3.1-8B-Open-SFT/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.1-8B-Open-SFT", - "id": "prithivMLmods/Llama-3.1-8B-Open-SFT", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4123 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4968 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1216 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3096 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3904 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3522 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/Llama-3.2-3B-Math-Oct/2b910401-457a-45dd-920a-559f4595897b.json b/data/hfopenllm_v2/prithivMLmods/Llama-3.2-3B-Math-Oct/2b910401-457a-45dd-920a-559f4595897b.json deleted file mode 100644 index 7b7a839ae..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Llama-3.2-3B-Math-Oct/2b910401-457a-45dd-920a-559f4595897b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_Llama-3.2-3B-Math-Oct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.2-3B-Math-Oct", - "id": "prithivMLmods/Llama-3.2-3B-Math-Oct", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4585 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4372 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1156 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2584 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.347 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2911 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/prithivMLmods/Llama-3.2-6B-AlgoCode/90b7be49-53a0-4d7f-8995-cbc52fe3a70f.json b/data/hfopenllm_v2/prithivMLmods/Llama-3.2-6B-AlgoCode/90b7be49-53a0-4d7f-8995-cbc52fe3a70f.json deleted file mode 100644 index 8570bf857..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Llama-3.2-6B-AlgoCode/90b7be49-53a0-4d7f-8995-cbc52fe3a70f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_Llama-3.2-6B-AlgoCode/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.2-6B-AlgoCode", - "id": "prithivMLmods/Llama-3.2-6B-AlgoCode", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 6.339 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2136 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3748 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0136 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2869 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4013 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1798 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/Llama-8B-Distill-CoT/5e8854ba-7147-4fdd-a568-1ea58e79e7d8.json b/data/hfopenllm_v2/prithivMLmods/Llama-8B-Distill-CoT/5e8854ba-7147-4fdd-a568-1ea58e79e7d8.json deleted file mode 
100644 index 8bc00a642..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Llama-8B-Distill-CoT/5e8854ba-7147-4fdd-a568-1ea58e79e7d8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_Llama-8B-Distill-CoT/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-8B-Distill-CoT", - "id": "prithivMLmods/Llama-8B-Distill-CoT", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3342 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4298 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4003 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2894 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.372 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2732 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/Llama-Deepsync-1B/df6e0cfb-d720-428a-a5ad-b1529faa07c0.json b/data/hfopenllm_v2/prithivMLmods/Llama-Deepsync-1B/df6e0cfb-d720-428a-a5ad-b1529faa07c0.json deleted file mode 100644 index 193336b1d..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Llama-Deepsync-1B/df6e0cfb-d720-428a-a5ad-b1529faa07c0.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/prithivMLmods_Llama-Deepsync-1B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-Deepsync-1B", - "id": "prithivMLmods/Llama-Deepsync-1B", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.236 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.357 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3386 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0438 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2601 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3565 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1738 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/Llama-Deepsync-3B/a88a6e6f-2253-4b67-9527-55ab6153e40f.json b/data/hfopenllm_v2/prithivMLmods/Llama-Deepsync-3B/a88a6e6f-2253-4b67-9527-55ab6153e40f.json deleted file mode 100644 index 702c1642b..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Llama-Deepsync-3B/a88a6e6f-2253-4b67-9527-55ab6153e40f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_Llama-Deepsync-3B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": 
"Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-Deepsync-3B", - "id": "prithivMLmods/Llama-Deepsync-3B", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4302 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4292 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1178 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2718 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3324 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3031 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/Llama-Express.1-Math/00c66a37-b46b-47e8-a098-ce12433c1135.json b/data/hfopenllm_v2/prithivMLmods/Llama-Express.1-Math/00c66a37-b46b-47e8-a098-ce12433c1135.json deleted file mode 100644 index 3d5e5f05f..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Llama-Express.1-Math/00c66a37-b46b-47e8-a098-ce12433c1135.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_Llama-Express.1-Math/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-Express.1-Math", - "id": "prithivMLmods/Llama-Express.1-Math", - "developer": "prithivMLmods", - "inference_platform": 
"unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.236 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5084 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3364 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0559 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2634 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3143 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.161 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/LwQ-10B-Instruct/6ad5483c-13dc-4e79-a719-66af383d195a.json b/data/hfopenllm_v2/prithivMLmods/LwQ-10B-Instruct/6ad5483c-13dc-4e79-a719-66af383d195a.json deleted file mode 100644 index d8310afc7..000000000 --- a/data/hfopenllm_v2/prithivMLmods/LwQ-10B-Instruct/6ad5483c-13dc-4e79-a719-66af383d195a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_LwQ-10B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "LwQ-10B-Instruct", - "id": "prithivMLmods/LwQ-10B-Instruct", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 10.732 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": 
"IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3935 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5122 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.04 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3121 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4544 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3318 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/LwQ-Reasoner-10B/9fa6813a-7acb-4c08-9912-6dc0d356a7e2.json b/data/hfopenllm_v2/prithivMLmods/LwQ-Reasoner-10B/9fa6813a-7acb-4c08-9912-6dc0d356a7e2.json deleted file mode 100644 index e4c1d9c13..000000000 --- a/data/hfopenllm_v2/prithivMLmods/LwQ-Reasoner-10B/9fa6813a-7acb-4c08-9912-6dc0d356a7e2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_LwQ-Reasoner-10B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "LwQ-Reasoner-10B", - "id": "prithivMLmods/LwQ-Reasoner-10B", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 10.306 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 
- }, - "score_details": { - "score": 0.2941 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5866 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.358 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3465 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4079 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4147 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/Magellanic-Opus-14B-Exp/3880e3bf-6ff0-4eef-a519-2649014254e1.json b/data/hfopenllm_v2/prithivMLmods/Magellanic-Opus-14B-Exp/3880e3bf-6ff0-4eef-a519-2649014254e1.json deleted file mode 100644 index 227d602c1..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Magellanic-Opus-14B-Exp/3880e3bf-6ff0-4eef-a519-2649014254e1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_Magellanic-Opus-14B-Exp/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Magellanic-Opus-14B-Exp", - "id": "prithivMLmods/Magellanic-Opus-14B-Exp", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6866 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6383 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3799 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3742 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4926 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5273 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/Magellanic-Qwen-25B-R999/e77efb9d-b1fc-4833-8e7f-8da683019018.json b/data/hfopenllm_v2/prithivMLmods/Magellanic-Qwen-25B-R999/e77efb9d-b1fc-4833-8e7f-8da683019018.json deleted file mode 100644 index e8a7e420c..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Magellanic-Qwen-25B-R999/e77efb9d-b1fc-4833-8e7f-8da683019018.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_Magellanic-Qwen-25B-R999/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Magellanic-Qwen-25B-R999", - "id": "prithivMLmods/Magellanic-Qwen-25B-R999", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 24.962 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1873 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.2608 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0053 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2508 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3831 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.13 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/Megatron-Corpus-14B-Exp.v2/2bcc02df-8d27-412a-8b58-c331df98e4d4.json b/data/hfopenllm_v2/prithivMLmods/Megatron-Corpus-14B-Exp.v2/2bcc02df-8d27-412a-8b58-c331df98e4d4.json deleted file mode 100644 index 1ccbd4ee2..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Megatron-Corpus-14B-Exp.v2/2bcc02df-8d27-412a-8b58-c331df98e4d4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_Megatron-Corpus-14B-Exp.v2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Megatron-Corpus-14B-Exp.v2", - "id": "prithivMLmods/Megatron-Corpus-14B-Exp.v2", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.487 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6321 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": 
"DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2591 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3423 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.449 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.481 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/Megatron-Corpus-14B-Exp/622531d5-03f8-42cf-974e-94291aa1e515.json b/data/hfopenllm_v2/prithivMLmods/Megatron-Corpus-14B-Exp/622531d5-03f8-42cf-974e-94291aa1e515.json deleted file mode 100644 index 7f5ceece4..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Megatron-Corpus-14B-Exp/622531d5-03f8-42cf-974e-94291aa1e515.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_Megatron-Corpus-14B-Exp/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Megatron-Corpus-14B-Exp", - "id": "prithivMLmods/Megatron-Corpus-14B-Exp", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4983 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6355 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.3429 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3633 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4767 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.526 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/Megatron-Opus-14B-2.0/b772f20f-afbd-496c-9f94-e5fd30d54466.json b/data/hfopenllm_v2/prithivMLmods/Megatron-Opus-14B-2.0/b772f20f-afbd-496c-9f94-e5fd30d54466.json deleted file mode 100644 index 05306c382..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Megatron-Opus-14B-2.0/b772f20f-afbd-496c-9f94-e5fd30d54466.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_Megatron-Opus-14B-2.0/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Megatron-Opus-14B-2.0", - "id": "prithivMLmods/Megatron-Opus-14B-2.0", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6694 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6871 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2779 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - 
}, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3591 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.414 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.517 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/Megatron-Opus-14B-2.1/169d5ad3-ae4a-42de-b951-f264d85bf623.json b/data/hfopenllm_v2/prithivMLmods/Megatron-Opus-14B-2.1/169d5ad3-ae4a-42de-b951-f264d85bf623.json deleted file mode 100644 index 0def4383d..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Megatron-Opus-14B-2.1/169d5ad3-ae4a-42de-b951-f264d85bf623.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_Megatron-Opus-14B-2.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Megatron-Opus-14B-2.1", - "id": "prithivMLmods/Megatron-Opus-14B-2.1", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0246 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6727 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2998 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3834 - 
} - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4928 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5174 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/Megatron-Opus-14B-Exp/e84c3b50-4ea9-4f41-be11-50c6aa3d4656.json b/data/hfopenllm_v2/prithivMLmods/Megatron-Opus-14B-Exp/e84c3b50-4ea9-4f41-be11-50c6aa3d4656.json deleted file mode 100644 index 6ccd36216..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Megatron-Opus-14B-Exp/e84c3b50-4ea9-4f41-be11-50c6aa3d4656.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_Megatron-Opus-14B-Exp/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Megatron-Opus-14B-Exp", - "id": "prithivMLmods/Megatron-Opus-14B-Exp", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4979 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6516 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3535 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.375 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on 
MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4887 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5401 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/Megatron-Opus-14B-Stock/594780dc-d969-4a6b-b90b-1cc32f40c452.json b/data/hfopenllm_v2/prithivMLmods/Megatron-Opus-14B-Stock/594780dc-d969-4a6b-b90b-1cc32f40c452.json deleted file mode 100644 index 5f0f9b228..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Megatron-Opus-14B-Stock/594780dc-d969-4a6b-b90b-1cc32f40c452.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_Megatron-Opus-14B-Stock/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Megatron-Opus-14B-Stock", - "id": "prithivMLmods/Megatron-Opus-14B-Stock", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5174 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6412 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3346 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.375 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.482 - } - }, - { - "evaluation_name": "MMLU-PRO", - 
"source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5293 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/Megatron-Opus-7B-Exp/4ff7c238-d69c-4b92-83d0-69cacdfa0fe6.json b/data/hfopenllm_v2/prithivMLmods/Megatron-Opus-7B-Exp/4ff7c238-d69c-4b92-83d0-69cacdfa0fe6.json deleted file mode 100644 index 781a02264..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Megatron-Opus-7B-Exp/4ff7c238-d69c-4b92-83d0-69cacdfa0fe6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_Megatron-Opus-7B-Exp/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Megatron-Opus-7B-Exp", - "id": "prithivMLmods/Megatron-Opus-7B-Exp", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 7.456 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6017 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5367 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1971 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3112 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4186 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, 
- "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.39 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/Messier-Opus-14B-Elite7/bb576dc9-eede-48d6-b438-732da91a4d29.json b/data/hfopenllm_v2/prithivMLmods/Messier-Opus-14B-Elite7/bb576dc9-eede-48d6-b438-732da91a4d29.json deleted file mode 100644 index a719fbf21..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Messier-Opus-14B-Elite7/bb576dc9-eede-48d6-b438-732da91a4d29.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_Messier-Opus-14B-Elite7/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Messier-Opus-14B-Elite7", - "id": "prithivMLmods/Messier-Opus-14B-Elite7", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7113 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6499 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4071 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3909 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4886 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5404 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/prithivMLmods/Omni-Reasoner-Merged/0fb2fe17-b55d-4802-ad48-bd4d711e1e0f.json b/data/hfopenllm_v2/prithivMLmods/Omni-Reasoner-Merged/0fb2fe17-b55d-4802-ad48-bd4d711e1e0f.json deleted file mode 100644 index 86bda99c0..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Omni-Reasoner-Merged/0fb2fe17-b55d-4802-ad48-bd4d711e1e0f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_Omni-Reasoner-Merged/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Omni-Reasoner-Merged", - "id": "prithivMLmods/Omni-Reasoner-Merged", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4599 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5508 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3331 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3037 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4616 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4364 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/Omni-Reasoner3-Merged/03d59002-dc98-467f-b2a9-605ef8d9b763.json b/data/hfopenllm_v2/prithivMLmods/Omni-Reasoner3-Merged/03d59002-dc98-467f-b2a9-605ef8d9b763.json deleted file mode 
100644 index 12f798f0d..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Omni-Reasoner3-Merged/03d59002-dc98-467f-b2a9-605ef8d9b763.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_Omni-Reasoner3-Merged/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Omni-Reasoner3-Merged", - "id": "prithivMLmods/Omni-Reasoner3-Merged", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4935 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4388 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1088 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2643 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3522 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.295 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/Pegasus-Opus-14B-Exp/8a7034fd-7027-4a87-9cac-c95b745935d0.json b/data/hfopenllm_v2/prithivMLmods/Pegasus-Opus-14B-Exp/8a7034fd-7027-4a87-9cac-c95b745935d0.json deleted file mode 100644 index 8f2a0236b..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Pegasus-Opus-14B-Exp/8a7034fd-7027-4a87-9cac-c95b745935d0.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - 
"evaluation_id": "hfopenllm_v2/prithivMLmods_Pegasus-Opus-14B-Exp/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Pegasus-Opus-14B-Exp", - "id": "prithivMLmods/Pegasus-Opus-14B-Exp", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6982 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6548 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4086 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3951 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.486 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5412 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/Phi-4-Empathetic/717f745f-1eae-4277-8a31-dbed140ef3e8.json b/data/hfopenllm_v2/prithivMLmods/Phi-4-Empathetic/717f745f-1eae-4277-8a31-dbed140ef3e8.json deleted file mode 100644 index 6b7f41a83..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Phi-4-Empathetic/717f745f-1eae-4277-8a31-dbed140ef3e8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_Phi-4-Empathetic/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - 
"source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Phi-4-Empathetic", - "id": "prithivMLmods/Phi-4-Empathetic", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0497 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6727 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2621 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.38 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4991 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5066 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/Phi-4-Math-IO/2dc78735-c0c3-4dd7-8e97-52c92785e623.json b/data/hfopenllm_v2/prithivMLmods/Phi-4-Math-IO/2dc78735-c0c3-4dd7-8e97-52c92785e623.json deleted file mode 100644 index 6dc544cef..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Phi-4-Math-IO/2dc78735-c0c3-4dd7-8e97-52c92785e623.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_Phi-4-Math-IO/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Phi-4-Math-IO", - "id": "prithivMLmods/Phi-4-Math-IO", - "developer": "prithivMLmods", - "inference_platform": "unknown", - 
"additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.059 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6668 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4577 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3985 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4873 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5205 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/Phi-4-QwQ/e9ab98ff-5cf0-4437-9cf3-c77ecb546c84.json b/data/hfopenllm_v2/prithivMLmods/Phi-4-QwQ/e9ab98ff-5cf0-4437-9cf3-c77ecb546c84.json deleted file mode 100644 index 6915cdbb1..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Phi-4-QwQ/e9ab98ff-5cf0-4437-9cf3-c77ecb546c84.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_Phi-4-QwQ/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Phi-4-QwQ", - "id": "prithivMLmods/Phi-4-QwQ", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": 
"google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0559 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6696 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4577 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3909 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4651 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5275 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/Phi-4-Super-1/6303d73e-4129-472a-a6fd-c64cb3de7204.json b/data/hfopenllm_v2/prithivMLmods/Phi-4-Super-1/6303d73e-4129-472a-a6fd-c64cb3de7204.json deleted file mode 100644 index 6500288f7..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Phi-4-Super-1/6303d73e-4129-472a-a6fd-c64cb3de7204.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_Phi-4-Super-1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Phi-4-Super-1", - "id": "prithivMLmods/Phi-4-Super-1", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0418 - } - }, - { - 
"evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6729 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.352 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3935 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5017 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5235 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/Phi-4-Super-o1/8a689e8f-19cc-45b7-80be-ce861a549af7.json b/data/hfopenllm_v2/prithivMLmods/Phi-4-Super-o1/8a689e8f-19cc-45b7-80be-ce861a549af7.json deleted file mode 100644 index 6a3f46611..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Phi-4-Super-o1/8a689e8f-19cc-45b7-80be-ce861a549af7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_Phi-4-Super-o1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Phi-4-Super-o1", - "id": "prithivMLmods/Phi-4-Super-o1", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0418 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6729 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.352 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3935 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5017 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5235 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/Phi-4-Super/84881315-55a4-4f05-a115-cf82f850090d.json b/data/hfopenllm_v2/prithivMLmods/Phi-4-Super/84881315-55a4-4f05-a115-cf82f850090d.json deleted file mode 100644 index 659daae2b..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Phi-4-Super/84881315-55a4-4f05-a115-cf82f850090d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_Phi-4-Super/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Phi-4-Super", - "id": "prithivMLmods/Phi-4-Super", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0481 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.672 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": 
"DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3489 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3943 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5044 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5266 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/Phi-4-o1/970dc71c-42be-4d50-86ac-f7301ec969ca.json b/data/hfopenllm_v2/prithivMLmods/Phi-4-o1/970dc71c-42be-4d50-86ac-f7301ec969ca.json deleted file mode 100644 index f575a083a..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Phi-4-o1/970dc71c-42be-4d50-86ac-f7301ec969ca.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_Phi-4-o1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Phi-4-o1", - "id": "prithivMLmods/Phi-4-o1", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.029 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6689 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3995 - } - }, - { - 
"evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3826 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4978 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5174 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/Phi4-Super/c02e1fcf-a837-4b8a-a42d-63837c56128d.json b/data/hfopenllm_v2/prithivMLmods/Phi4-Super/c02e1fcf-a837-4b8a-a42d-63837c56128d.json deleted file mode 100644 index af8b3653b..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Phi4-Super/c02e1fcf-a837-4b8a-a42d-63837c56128d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_Phi4-Super/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Phi4-Super", - "id": "prithivMLmods/Phi4-Super", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0481 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.672 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3489 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3943 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5044 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5266 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/Porpoise-Opus-14B-Exp/37280340-5b9a-47d9-aa37-9299d9025518.json b/data/hfopenllm_v2/prithivMLmods/Porpoise-Opus-14B-Exp/37280340-5b9a-47d9-aa37-9299d9025518.json deleted file mode 100644 index 71bc4e12e..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Porpoise-Opus-14B-Exp/37280340-5b9a-47d9-aa37-9299d9025518.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_Porpoise-Opus-14B-Exp/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Porpoise-Opus-14B-Exp", - "id": "prithivMLmods/Porpoise-Opus-14B-Exp", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7098 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6519 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4041 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3935 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - 
"hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4926 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5396 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/Primal-Opus-14B-Optimus-v1/46e7ad9b-b774-46b9-933c-913d1b307f7a.json b/data/hfopenllm_v2/prithivMLmods/Primal-Opus-14B-Optimus-v1/46e7ad9b-b774-46b9-933c-913d1b307f7a.json deleted file mode 100644 index bb915353c..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Primal-Opus-14B-Optimus-v1/46e7ad9b-b774-46b9-933c-913d1b307f7a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_Primal-Opus-14B-Optimus-v1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Primal-Opus-14B-Optimus-v1", - "id": "prithivMLmods/Primal-Opus-14B-Optimus-v1", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5013 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6419 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3384 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3725 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.4847 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5259 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/Primal-Opus-14B-Optimus-v2/c154d3f5-39dc-43c0-85ea-2e43b08494b4.json b/data/hfopenllm_v2/prithivMLmods/Primal-Opus-14B-Optimus-v2/c154d3f5-39dc-43c0-85ea-2e43b08494b4.json deleted file mode 100644 index ff98c5def..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Primal-Opus-14B-Optimus-v2/c154d3f5-39dc-43c0-85ea-2e43b08494b4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_Primal-Opus-14B-Optimus-v2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Primal-Opus-14B-Optimus-v2", - "id": "prithivMLmods/Primal-Opus-14B-Optimus-v2", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6404 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6544 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4207 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3918 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.49 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": 
"hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5422 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/QwQ-LCoT-14B-Conversational/abd830e4-2b7f-4895-8262-75926edbafd9.json b/data/hfopenllm_v2/prithivMLmods/QwQ-LCoT-14B-Conversational/abd830e4-2b7f-4895-8262-75926edbafd9.json deleted file mode 100644 index 370d5eafa..000000000 --- a/data/hfopenllm_v2/prithivMLmods/QwQ-LCoT-14B-Conversational/abd830e4-2b7f-4895-8262-75926edbafd9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_QwQ-LCoT-14B-Conversational/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "QwQ-LCoT-14B-Conversational", - "id": "prithivMLmods/QwQ-LCoT-14B-Conversational", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4047 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.624 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4653 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3498 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4847 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5278 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/QwQ-LCoT-3B-Instruct/2c945021-72e3-4e7a-9c6f-81efb27b2206.json b/data/hfopenllm_v2/prithivMLmods/QwQ-LCoT-3B-Instruct/2c945021-72e3-4e7a-9c6f-81efb27b2206.json deleted file mode 100644 index 183db0968..000000000 --- a/data/hfopenllm_v2/prithivMLmods/QwQ-LCoT-3B-Instruct/2c945021-72e3-4e7a-9c6f-81efb27b2206.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_QwQ-LCoT-3B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "QwQ-LCoT-3B-Instruct", - "id": "prithivMLmods/QwQ-LCoT-3B-Instruct", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.086 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4354 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4763 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2825 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2819 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4358 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3582 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/prithivMLmods/QwQ-LCoT-7B-Instruct/5f0ea694-7f73-45fa-b54f-49fc06d1a6d9.json b/data/hfopenllm_v2/prithivMLmods/QwQ-LCoT-7B-Instruct/5f0ea694-7f73-45fa-b54f-49fc06d1a6d9.json deleted file mode 100644 index 97c5d8499..000000000 --- a/data/hfopenllm_v2/prithivMLmods/QwQ-LCoT-7B-Instruct/5f0ea694-7f73-45fa-b54f-49fc06d1a6d9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_QwQ-LCoT-7B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "QwQ-LCoT-7B-Instruct", - "id": "prithivMLmods/QwQ-LCoT-7B-Instruct", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4987 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5466 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3716 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.302 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4802 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4334 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/QwQ-LCoT1-Merged/6c73f6ae-8ffd-4948-8071-33eab07437a6.json b/data/hfopenllm_v2/prithivMLmods/QwQ-LCoT1-Merged/6c73f6ae-8ffd-4948-8071-33eab07437a6.json deleted file mode 100644 index 
5f5612428..000000000 --- a/data/hfopenllm_v2/prithivMLmods/QwQ-LCoT1-Merged/6c73f6ae-8ffd-4948-8071-33eab07437a6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_QwQ-LCoT1-Merged/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "QwQ-LCoT1-Merged", - "id": "prithivMLmods/QwQ-LCoT1-Merged", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4751 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5481 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3731 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.307 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4696 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4358 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/QwQ-LCoT2-7B-Instruct/fbf71df3-b9c3-4f9c-b538-e4ccf097e81c.json b/data/hfopenllm_v2/prithivMLmods/QwQ-LCoT2-7B-Instruct/fbf71df3-b9c3-4f9c-b538-e4ccf097e81c.json deleted file mode 100644 index 32ba1bb53..000000000 --- a/data/hfopenllm_v2/prithivMLmods/QwQ-LCoT2-7B-Instruct/fbf71df3-b9c3-4f9c-b538-e4ccf097e81c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/prithivMLmods_QwQ-LCoT2-7B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "QwQ-LCoT2-7B-Instruct", - "id": "prithivMLmods/QwQ-LCoT2-7B-Instruct", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5561 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5425 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.327 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2978 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4564 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4342 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/QwQ-MathOct-7B/e3dcfd94-ca04-4cd3-ada5-e701a8b776da.json b/data/hfopenllm_v2/prithivMLmods/QwQ-MathOct-7B/e3dcfd94-ca04-4cd3-ada5-e701a8b776da.json deleted file mode 100644 index 380be752b..000000000 --- a/data/hfopenllm_v2/prithivMLmods/QwQ-MathOct-7B/e3dcfd94-ca04-4cd3-ada5-e701a8b776da.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_QwQ-MathOct-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": 
"Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "QwQ-MathOct-7B", - "id": "prithivMLmods/QwQ-MathOct-7B", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4684 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5486 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2953 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3029 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4601 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.433 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/QwQ-R1-Distill-1.5B-CoT/9278bcf2-bfab-437f-bd64-7496b24fb8cf.json b/data/hfopenllm_v2/prithivMLmods/QwQ-R1-Distill-1.5B-CoT/9278bcf2-bfab-437f-bd64-7496b24fb8cf.json deleted file mode 100644 index cd87f5d54..000000000 --- a/data/hfopenllm_v2/prithivMLmods/QwQ-R1-Distill-1.5B-CoT/9278bcf2-bfab-437f-bd64-7496b24fb8cf.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_QwQ-R1-Distill-1.5B-CoT/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "QwQ-R1-Distill-1.5B-CoT", - "id": "prithivMLmods/QwQ-R1-Distill-1.5B-CoT", - "developer": "prithivMLmods", - 
"inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.777 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2194 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3666 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3346 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2861 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3434 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1913 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/QwQ-R1-Distill-7B-CoT/633aa068-5613-41d8-a194-aebc9ce1586f.json b/data/hfopenllm_v2/prithivMLmods/QwQ-R1-Distill-7B-CoT/633aa068-5613-41d8-a194-aebc9ce1586f.json deleted file mode 100644 index 3e840aa4b..000000000 --- a/data/hfopenllm_v2/prithivMLmods/QwQ-R1-Distill-7B-CoT/633aa068-5613-41d8-a194-aebc9ce1586f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_QwQ-R1-Distill-7B-CoT/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "QwQ-R1-Distill-7B-CoT", - "id": "prithivMLmods/QwQ-R1-Distill-7B-CoT", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": 
"IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.35 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4388 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4683 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2936 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3779 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2804 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/Qwen-7B-Distill-Reasoner/d3c1a922-a453-4c7b-b33b-52934e7bf72b.json b/data/hfopenllm_v2/prithivMLmods/Qwen-7B-Distill-Reasoner/d3c1a922-a453-4c7b-b33b-52934e7bf72b.json deleted file mode 100644 index d23b96a42..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Qwen-7B-Distill-Reasoner/d3c1a922-a453-4c7b-b33b-52934e7bf72b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_Qwen-7B-Distill-Reasoner/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen-7B-Distill-Reasoner", - "id": "prithivMLmods/Qwen-7B-Distill-Reasoner", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3396 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4409 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.395 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3272 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.366 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2818 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/Qwen2.5-1.5B-DeepSeek-R1-Instruct/3a27b2a6-5eea-450b-91c7-1dc006229985.json b/data/hfopenllm_v2/prithivMLmods/Qwen2.5-1.5B-DeepSeek-R1-Instruct/3a27b2a6-5eea-450b-91c7-1dc006229985.json deleted file mode 100644 index 5d9c9b69d..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Qwen2.5-1.5B-DeepSeek-R1-Instruct/3a27b2a6-5eea-450b-91c7-1dc006229985.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_Qwen2.5-1.5B-DeepSeek-R1-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-1.5B-DeepSeek-R1-Instruct", - "id": "prithivMLmods/Qwen2.5-1.5B-DeepSeek-R1-Instruct", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.777 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1397 - 
} - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2824 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.276 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3724 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1123 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/Qwen2.5-14B-DeepSeek-R1-1M/395e37ae-005d-47c0-9cf5-919460e34350.json b/data/hfopenllm_v2/prithivMLmods/Qwen2.5-14B-DeepSeek-R1-1M/395e37ae-005d-47c0-9cf5-919460e34350.json deleted file mode 100644 index d92c0c284..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Qwen2.5-14B-DeepSeek-R1-1M/395e37ae-005d-47c0-9cf5-919460e34350.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_Qwen2.5-14B-DeepSeek-R1-1M/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-14B-DeepSeek-R1-1M", - "id": "prithivMLmods/Qwen2.5-14B-DeepSeek-R1-1M", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4193 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5935 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5128 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3322 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4606 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4899 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/Qwen2.5-7B-DeepSeek-R1-1M/b03b7c7a-f263-4712-bcf4-2e32ca4bd237.json b/data/hfopenllm_v2/prithivMLmods/Qwen2.5-7B-DeepSeek-R1-1M/b03b7c7a-f263-4712-bcf4-2e32ca4bd237.json deleted file mode 100644 index 5a7f8d146..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Qwen2.5-7B-DeepSeek-R1-1M/b03b7c7a-f263-4712-bcf4-2e32ca4bd237.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_Qwen2.5-7B-DeepSeek-R1-1M/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-7B-DeepSeek-R1-1M", - "id": "prithivMLmods/Qwen2.5-7B-DeepSeek-R1-1M", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1861 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3126 - 
} - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0151 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2617 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3417 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1201 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/SmolLM2-CoT-360M/452ab810-6921-4922-9446-f2a5c081dc61.json b/data/hfopenllm_v2/prithivMLmods/SmolLM2-CoT-360M/452ab810-6921-4922-9446-f2a5c081dc61.json deleted file mode 100644 index b9e15197c..000000000 --- a/data/hfopenllm_v2/prithivMLmods/SmolLM2-CoT-360M/452ab810-6921-4922-9446-f2a5c081dc61.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_SmolLM2-CoT-360M/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SmolLM2-CoT-360M", - "id": "prithivMLmods/SmolLM2-CoT-360M", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 0.362 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2216 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3135 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact 
Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0204 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2366 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3794 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1085 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/Sombrero-Opus-14B-Elite5/1abba5a0-f1a3-4f39-a81c-f4cd641d33ac.json b/data/hfopenllm_v2/prithivMLmods/Sombrero-Opus-14B-Elite5/1abba5a0-f1a3-4f39-a81c-f4cd641d33ac.json deleted file mode 100644 index 29c5fd248..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Sombrero-Opus-14B-Elite5/1abba5a0-f1a3-4f39-a81c-f4cd641d33ac.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_Sombrero-Opus-14B-Elite5/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Sombrero-Opus-14B-Elite5", - "id": "prithivMLmods/Sombrero-Opus-14B-Elite5", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7881 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6502 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5355 - } - }, - { - 
"evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3364 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4287 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.52 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/Sombrero-Opus-14B-Elite6/b2eefd3a-795c-4dc0-a10e-924bece05ea5.json b/data/hfopenllm_v2/prithivMLmods/Sombrero-Opus-14B-Elite6/b2eefd3a-795c-4dc0-a10e-924bece05ea5.json deleted file mode 100644 index 625a33f9d..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Sombrero-Opus-14B-Elite6/b2eefd3a-795c-4dc0-a10e-924bece05ea5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_Sombrero-Opus-14B-Elite6/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Sombrero-Opus-14B-Elite6", - "id": "prithivMLmods/Sombrero-Opus-14B-Elite6", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7226 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6488 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4079 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": 
"Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3935 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4886 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.539 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/Sombrero-Opus-14B-Sm1/008cc919-f156-4a2e-af4b-eed015ca91f6.json b/data/hfopenllm_v2/prithivMLmods/Sombrero-Opus-14B-Sm1/008cc919-f156-4a2e-af4b-eed015ca91f6.json deleted file mode 100644 index f425d01b1..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Sombrero-Opus-14B-Sm1/008cc919-f156-4a2e-af4b-eed015ca91f6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_Sombrero-Opus-14B-Sm1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Sombrero-Opus-14B-Sm1", - "id": "prithivMLmods/Sombrero-Opus-14B-Sm1", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3813 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6355 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5665 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4035 - } - }, - { - "evaluation_name": "MUSR", - 
"source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5299 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5125 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/Sombrero-Opus-14B-Sm2/9d56082f-5e46-4d7a-8f06-cb44fc983b3f.json b/data/hfopenllm_v2/prithivMLmods/Sombrero-Opus-14B-Sm2/9d56082f-5e46-4d7a-8f06-cb44fc983b3f.json deleted file mode 100644 index 333fc6de0..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Sombrero-Opus-14B-Sm2/9d56082f-5e46-4d7a-8f06-cb44fc983b3f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_Sombrero-Opus-14B-Sm2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Sombrero-Opus-14B-Sm2", - "id": "prithivMLmods/Sombrero-Opus-14B-Sm2", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4272 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6609 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4864 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3884 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5088 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5345 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/Sombrero-Opus-14B-Sm4/7ea26e73-a501-40bf-8f01-81ab8e850a91.json b/data/hfopenllm_v2/prithivMLmods/Sombrero-Opus-14B-Sm4/7ea26e73-a501-40bf-8f01-81ab8e850a91.json deleted file mode 100644 index 9a2d75354..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Sombrero-Opus-14B-Sm4/7ea26e73-a501-40bf-8f01-81ab8e850a91.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_Sombrero-Opus-14B-Sm4/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Sombrero-Opus-14B-Sm4", - "id": "prithivMLmods/Sombrero-Opus-14B-Sm4", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4347 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6613 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4879 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3951 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5192 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - 
"source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.53 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/Sombrero-Opus-14B-Sm5/e3343130-cf4f-4e5c-b2d3-5dda13d575b9.json b/data/hfopenllm_v2/prithivMLmods/Sombrero-Opus-14B-Sm5/e3343130-cf4f-4e5c-b2d3-5dda13d575b9.json deleted file mode 100644 index 2d767adc0..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Sombrero-Opus-14B-Sm5/e3343130-cf4f-4e5c-b2d3-5dda13d575b9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_Sombrero-Opus-14B-Sm5/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Sombrero-Opus-14B-Sm5", - "id": "prithivMLmods/Sombrero-Opus-14B-Sm5", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6852 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6564 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4094 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3867 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4806 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.54 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/Sqweeks-7B-Instruct/ba1965f8-b59f-4d71-920c-e3b401ca0534.json b/data/hfopenllm_v2/prithivMLmods/Sqweeks-7B-Instruct/ba1965f8-b59f-4d71-920c-e3b401ca0534.json deleted file mode 100644 index 55de0790e..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Sqweeks-7B-Instruct/ba1965f8-b59f-4d71-920c-e3b401ca0534.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_Sqweeks-7B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Sqweeks-7B-Instruct", - "id": "prithivMLmods/Sqweeks-7B-Instruct", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2158 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4667 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5144 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.307 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4476 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3133 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/prithivMLmods/Tadpole-Opus-14B-Exp/6dc87410-a39e-41b1-8759-68c1556c8419.json b/data/hfopenllm_v2/prithivMLmods/Tadpole-Opus-14B-Exp/6dc87410-a39e-41b1-8759-68c1556c8419.json deleted file mode 100644 index dc0362947..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Tadpole-Opus-14B-Exp/6dc87410-a39e-41b1-8759-68c1556c8419.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_Tadpole-Opus-14B-Exp/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Tadpole-Opus-14B-Exp", - "id": "prithivMLmods/Tadpole-Opus-14B-Exp", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.575 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6369 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3134 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3859 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4728 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5322 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/Taurus-Opus-7B/c4ebe788-fb60-453b-914b-56bf87dd6374.json b/data/hfopenllm_v2/prithivMLmods/Taurus-Opus-7B/c4ebe788-fb60-453b-914b-56bf87dd6374.json deleted file mode 100644 index 
02432070f..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Taurus-Opus-7B/c4ebe788-fb60-453b-914b-56bf87dd6374.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_Taurus-Opus-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Taurus-Opus-7B", - "id": "prithivMLmods/Taurus-Opus-7B", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 7.456 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4223 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5367 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2168 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3263 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4399 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3951 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/Triangulum-10B/45a44cc8-a550-4d2f-b0f4-37b4aac6a2b5.json b/data/hfopenllm_v2/prithivMLmods/Triangulum-10B/45a44cc8-a550-4d2f-b0f4-37b4aac6a2b5.json deleted file mode 100644 index 880fb2e12..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Triangulum-10B/45a44cc8-a550-4d2f-b0f4-37b4aac6a2b5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/prithivMLmods_Triangulum-10B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Triangulum-10B", - "id": "prithivMLmods/Triangulum-10B", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 10.306 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3229 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5968 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.355 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.354 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4172 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4178 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/Triangulum-5B/10593c13-3b30-4605-8063-c6a6526fc9d9.json b/data/hfopenllm_v2/prithivMLmods/Triangulum-5B/10593c13-3b30-4605-8063-c6a6526fc9d9.json deleted file mode 100644 index 6aff0ab5e..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Triangulum-5B/10593c13-3b30-4605-8063-c6a6526fc9d9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_Triangulum-5B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - 
"evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Triangulum-5B", - "id": "prithivMLmods/Triangulum-5B", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 5.413 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1283 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3124 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0106 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.255 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3445 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1223 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/Triangulum-v2-10B/12b8f4d7-2ae8-492c-8756-f7cb21a58c76.json b/data/hfopenllm_v2/prithivMLmods/Triangulum-v2-10B/12b8f4d7-2ae8-492c-8756-f7cb21a58c76.json deleted file mode 100644 index dc5fbfdfd..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Triangulum-v2-10B/12b8f4d7-2ae8-492c-8756-f7cb21a58c76.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_Triangulum-v2-10B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Triangulum-v2-10B", - "id": "prithivMLmods/Triangulum-v2-10B", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": 
"bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 10.306 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6705 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6065 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2447 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3372 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4281 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4466 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/Tucana-Opus-14B-r999/96d9b675-c299-4138-a381-fb4de36287e5.json b/data/hfopenllm_v2/prithivMLmods/Tucana-Opus-14B-r999/96d9b675-c299-4138-a381-fb4de36287e5.json deleted file mode 100644 index 044957de8..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Tucana-Opus-14B-r999/96d9b675-c299-4138-a381-fb4de36287e5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_Tucana-Opus-14B-r999/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Tucana-Opus-14B-r999", - "id": "prithivMLmods/Tucana-Opus-14B-r999", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": 
"hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6067 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6557 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4063 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3918 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.473 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5384 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/Tulu-MathLingo-8B/17fffa9b-8ed4-44c7-87ea-7ee2c1f28e6a.json b/data/hfopenllm_v2/prithivMLmods/Tulu-MathLingo-8B/17fffa9b-8ed4-44c7-87ea-7ee2c1f28e6a.json deleted file mode 100644 index 075ca9a4b..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Tulu-MathLingo-8B/17fffa9b-8ed4-44c7-87ea-7ee2c1f28e6a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_Tulu-MathLingo-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Tulu-MathLingo-8B", - "id": "prithivMLmods/Tulu-MathLingo-8B", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.5589 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4659 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.145 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2903 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3864 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3044 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/Viper-Coder-7B-Elite14/8999a5f3-f421-4663-835e-7626cebd2282.json b/data/hfopenllm_v2/prithivMLmods/Viper-Coder-7B-Elite14/8999a5f3-f421-4663-835e-7626cebd2282.json deleted file mode 100644 index df24efe05..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Viper-Coder-7B-Elite14/8999a5f3-f421-4663-835e-7626cebd2282.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_Viper-Coder-7B-Elite14/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Viper-Coder-7B-Elite14", - "id": "prithivMLmods/Viper-Coder-7B-Elite14", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1488 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2829 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0106 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.255 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3422 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1089 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/Viper-Coder-Hybrid-v1.2/951e1a4f-ed6c-49ca-b648-6086989e333f.json b/data/hfopenllm_v2/prithivMLmods/Viper-Coder-Hybrid-v1.2/951e1a4f-ed6c-49ca-b648-6086989e333f.json deleted file mode 100644 index 82f43e5e6..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Viper-Coder-Hybrid-v1.2/951e1a4f-ed6c-49ca-b648-6086989e333f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_Viper-Coder-Hybrid-v1.2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Viper-Coder-Hybrid-v1.2", - "id": "prithivMLmods/Viper-Coder-Hybrid-v1.2", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6736 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 
0.6391 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3331 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3742 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4822 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5243 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/Viper-Coder-Hybrid-v1.3/2acc0666-e0ff-4760-a74a-227a02775344.json b/data/hfopenllm_v2/prithivMLmods/Viper-Coder-Hybrid-v1.3/2acc0666-e0ff-4760-a74a-227a02775344.json deleted file mode 100644 index 0c7fa3197..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Viper-Coder-Hybrid-v1.3/2acc0666-e0ff-4760-a74a-227a02775344.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_Viper-Coder-Hybrid-v1.3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Viper-Coder-Hybrid-v1.3", - "id": "prithivMLmods/Viper-Coder-Hybrid-v1.3", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7555 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6471 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - 
"metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4517 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3381 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4403 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5097 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/Viper-Coder-HybridMini-v1.3/3196c71d-0e0a-4d29-8bca-c31ba3d99dfd.json b/data/hfopenllm_v2/prithivMLmods/Viper-Coder-HybridMini-v1.3/3196c71d-0e0a-4d29-8bca-c31ba3d99dfd.json deleted file mode 100644 index 37eb769a1..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Viper-Coder-HybridMini-v1.3/3196c71d-0e0a-4d29-8bca-c31ba3d99dfd.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_Viper-Coder-HybridMini-v1.3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Viper-Coder-HybridMini-v1.3", - "id": "prithivMLmods/Viper-Coder-HybridMini-v1.3", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6104 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5365 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - 
}, - "score_details": { - "score": 0.463 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3171 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4505 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4352 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/Viper-Coder-v0.1/e858aa6c-c424-447e-b512-7dcf794f9f0f.json b/data/hfopenllm_v2/prithivMLmods/Viper-Coder-v0.1/e858aa6c-c424-447e-b512-7dcf794f9f0f.json deleted file mode 100644 index f8a0e8b52..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Viper-Coder-v0.1/e858aa6c-c424-447e-b512-7dcf794f9f0f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_Viper-Coder-v0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Viper-Coder-v0.1", - "id": "prithivMLmods/Viper-Coder-v0.1", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5521 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6143 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.327 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.354 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4394 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3928 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/Viper-Coder-v1.1/8773eac5-205e-4264-981b-58f1a25f872a.json b/data/hfopenllm_v2/prithivMLmods/Viper-Coder-v1.1/8773eac5-205e-4264-981b-58f1a25f872a.json deleted file mode 100644 index 781f52345..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Viper-Coder-v1.1/8773eac5-205e-4264-981b-58f1a25f872a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_Viper-Coder-v1.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Viper-Coder-v1.1", - "id": "prithivMLmods/Viper-Coder-v1.1", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4432 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6492 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5461 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.401 - } - }, - { - "evaluation_name": "MUSR", - 
"source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5219 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5232 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/Viper-Coder-v1.6-r999/c26ae286-a9b8-499f-b886-4b75be0cf2da.json b/data/hfopenllm_v2/prithivMLmods/Viper-Coder-v1.6-r999/c26ae286-a9b8-499f-b886-4b75be0cf2da.json deleted file mode 100644 index 852cfb91f..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Viper-Coder-v1.6-r999/c26ae286-a9b8-499f-b886-4b75be0cf2da.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_Viper-Coder-v1.6-r999/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Viper-Coder-v1.6-r999", - "id": "prithivMLmods/Viper-Coder-v1.6-r999", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4433 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6492 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5657 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.401 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5219 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5232 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/Viper-Coder-v1.7-Vsm6/d3a61998-2d41-4349-bd15-ce29143cc910.json b/data/hfopenllm_v2/prithivMLmods/Viper-Coder-v1.7-Vsm6/d3a61998-2d41-4349-bd15-ce29143cc910.json deleted file mode 100644 index 0353bf927..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Viper-Coder-v1.7-Vsm6/d3a61998-2d41-4349-bd15-ce29143cc910.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_Viper-Coder-v1.7-Vsm6/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Viper-Coder-v1.7-Vsm6", - "id": "prithivMLmods/Viper-Coder-v1.7-Vsm6", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5004 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6502 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4645 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3968 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4768 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - 
"source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5288 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/Viper-OneCoder-UIGEN/56b66428-2751-4c62-b98c-6c60e58c45ca.json b/data/hfopenllm_v2/prithivMLmods/Viper-OneCoder-UIGEN/56b66428-2751-4c62-b98c-6c60e58c45ca.json deleted file mode 100644 index 022eaef25..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Viper-OneCoder-UIGEN/56b66428-2751-4c62-b98c-6c60e58c45ca.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_Viper-OneCoder-UIGEN/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Viper-OneCoder-UIGEN", - "id": "prithivMLmods/Viper-OneCoder-UIGEN", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4692 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6047 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3867 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3423 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4514 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 
0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3904 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/Volans-Opus-14B-Exp/9b2ec4af-4a7c-4cf7-8b7d-79b6cc219880.json b/data/hfopenllm_v2/prithivMLmods/Volans-Opus-14B-Exp/9b2ec4af-4a7c-4cf7-8b7d-79b6cc219880.json deleted file mode 100644 index ee55e09c5..000000000 --- a/data/hfopenllm_v2/prithivMLmods/Volans-Opus-14B-Exp/9b2ec4af-4a7c-4cf7-8b7d-79b6cc219880.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_Volans-Opus-14B-Exp/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Volans-Opus-14B-Exp", - "id": "prithivMLmods/Volans-Opus-14B-Exp", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5868 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6521 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4252 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3851 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4872 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5385 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/prithivMLmods/WebMind-7B-v0.1/5855a920-428f-4699-becc-73d4422f706f.json 
b/data/hfopenllm_v2/prithivMLmods/WebMind-7B-v0.1/5855a920-428f-4699-becc-73d4422f706f.json deleted file mode 100644 index 5e3113aff..000000000 --- a/data/hfopenllm_v2/prithivMLmods/WebMind-7B-v0.1/5855a920-428f-4699-becc-73d4422f706f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/prithivMLmods_WebMind-7B-v0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "WebMind-7B-v0.1", - "id": "prithivMLmods/WebMind-7B-v0.1", - "developer": "prithivMLmods", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5278 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5434 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3648 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3171 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4537 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4279 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/pszemraj/Llama-3-6.3b-v0.1/f1004f08-7f46-4eb1-8f60-66893fca7180.json b/data/hfopenllm_v2/pszemraj/Llama-3-6.3b-v0.1/f1004f08-7f46-4eb1-8f60-66893fca7180.json deleted file mode 100644 index 00c890719..000000000 --- a/data/hfopenllm_v2/pszemraj/Llama-3-6.3b-v0.1/f1004f08-7f46-4eb1-8f60-66893fca7180.json +++ /dev/null @@ 
-1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/pszemraj_Llama-3-6.3b-v0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-6.3b-v0.1", - "id": "pszemraj/Llama-3-6.3b-v0.1", - "developer": "pszemraj", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 6.3 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1044 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4197 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0211 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2836 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3908 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.284 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/pszemraj/Mistral-v0.3-6B/97db158a-3035-45d3-8d92-a08c9e605493.json b/data/hfopenllm_v2/pszemraj/Mistral-v0.3-6B/97db158a-3035-45d3-8d92-a08c9e605493.json deleted file mode 100644 index 73d3cc391..000000000 --- a/data/hfopenllm_v2/pszemraj/Mistral-v0.3-6B/97db158a-3035-45d3-8d92-a08c9e605493.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/pszemraj_Mistral-v0.3-6B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - 
"source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral-v0.3-6B", - "id": "pszemraj/Mistral-v0.3-6B", - "developer": "pszemraj", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 5.939 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2454 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3774 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0136 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2651 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3908 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2143 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/qingy2019/LLaMa_3.2_3B_Catalysts/0d81b928-2a24-4eb4-93d5-224e3c505532.json b/data/hfopenllm_v2/qingy2019/LLaMa_3.2_3B_Catalysts/0d81b928-2a24-4eb4-93d5-224e3c505532.json deleted file mode 100644 index 077f5175f..000000000 --- a/data/hfopenllm_v2/qingy2019/LLaMa_3.2_3B_Catalysts/0d81b928-2a24-4eb4-93d5-224e3c505532.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/qingy2019_LLaMa_3.2_3B_Catalysts/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "LLaMa_3.2_3B_Catalysts", - "id": "qingy2019/LLaMa_3.2_3B_Catalysts", - "developer": "qingy2019", - "inference_platform": 
"unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4992 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4468 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1292 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2886 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3788 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3008 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/qingy2019/OpenMath2-Llama3.1-8B/bf4cc7ee-cad4-42af-8638-6b371577ec68.json b/data/hfopenllm_v2/qingy2019/OpenMath2-Llama3.1-8B/bf4cc7ee-cad4-42af-8638-6b371577ec68.json deleted file mode 100644 index 8d52de0fa..000000000 --- a/data/hfopenllm_v2/qingy2019/OpenMath2-Llama3.1-8B/bf4cc7ee-cad4-42af-8638-6b371577ec68.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/qingy2019_OpenMath2-Llama3.1-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "OpenMath2-Llama3.1-8B", - "id": "qingy2019/OpenMath2-Llama3.1-8B", - "developer": "qingy2019", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": 
"IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2331 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4096 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2674 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2651 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3436 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1553 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/qingy2019/Oracle-14B/5b574dda-0d85-47aa-9ebc-7f8581d402ca.json b/data/hfopenllm_v2/qingy2019/Oracle-14B/5b574dda-0d85-47aa-9ebc-7f8581d402ca.json deleted file mode 100644 index 8c2258503..000000000 --- a/data/hfopenllm_v2/qingy2019/Oracle-14B/5b574dda-0d85-47aa-9ebc-7f8581d402ca.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/qingy2019_Oracle-14B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Oracle-14B", - "id": "qingy2019/Oracle-14B", - "developer": "qingy2019", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MixtralForCausalLM", - "params_billions": 13.668 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2401 - } - }, - 
{ - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4622 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0725 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2609 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3703 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2379 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/qingy2019/Oracle-14B/6043830f-8a9d-4a03-9de5-4805724a9ae8.json b/data/hfopenllm_v2/qingy2019/Oracle-14B/6043830f-8a9d-4a03-9de5-4805724a9ae8.json deleted file mode 100644 index 3296a5d2c..000000000 --- a/data/hfopenllm_v2/qingy2019/Oracle-14B/6043830f-8a9d-4a03-9de5-4805724a9ae8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/qingy2019_Oracle-14B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Oracle-14B", - "id": "qingy2019/Oracle-14B", - "developer": "qingy2019", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MixtralForCausalLM", - "params_billions": 13.668 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2358 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.4612 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0642 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2576 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3717 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2382 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/qingy2019/Qwen2.5-Math-14B-Instruct-Alpha/9d5fdb25-0d6a-4d5c-bcfb-0903504e620a.json b/data/hfopenllm_v2/qingy2019/Qwen2.5-Math-14B-Instruct-Alpha/9d5fdb25-0d6a-4d5c-bcfb-0903504e620a.json deleted file mode 100644 index 7da78d24a..000000000 --- a/data/hfopenllm_v2/qingy2019/Qwen2.5-Math-14B-Instruct-Alpha/9d5fdb25-0d6a-4d5c-bcfb-0903504e620a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/qingy2019_Qwen2.5-Math-14B-Instruct-Alpha/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-Math-14B-Instruct-Alpha", - "id": "qingy2019/Qwen2.5-Math-14B-Instruct-Alpha", - "developer": "qingy2019", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5981 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6375 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - 
"source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3142 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.37 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4649 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5331 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/qingy2019/Qwen2.5-Math-14B-Instruct-Pro/217819b0-2c4b-4c26-823b-1ea14f893e01.json b/data/hfopenllm_v2/qingy2019/Qwen2.5-Math-14B-Instruct-Pro/217819b0-2c4b-4c26-823b-1ea14f893e01.json deleted file mode 100644 index 302221eda..000000000 --- a/data/hfopenllm_v2/qingy2019/Qwen2.5-Math-14B-Instruct-Pro/217819b0-2c4b-4c26-823b-1ea14f893e01.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/qingy2019_Qwen2.5-Math-14B-Instruct-Pro/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-Math-14B-Instruct-Pro", - "id": "qingy2019/Qwen2.5-Math-14B-Instruct-Pro", - "developer": "qingy2019", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1922 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5319 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": 
false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.284 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3112 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.374 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3558 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/qingy2019/Qwen2.5-Math-14B-Instruct/0f844855-fb46-4b53-82c2-f36e5721c385.json b/data/hfopenllm_v2/qingy2019/Qwen2.5-Math-14B-Instruct/0f844855-fb46-4b53-82c2-f36e5721c385.json deleted file mode 100644 index 792719c86..000000000 --- a/data/hfopenllm_v2/qingy2019/Qwen2.5-Math-14B-Instruct/0f844855-fb46-4b53-82c2-f36e5721c385.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/qingy2019_Qwen2.5-Math-14B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-Math-14B-Instruct", - "id": "qingy2019/Qwen2.5-Math-14B-Instruct", - "developer": "qingy2019", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6005 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6356 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2764 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - 
"source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3691 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4757 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5339 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/qingy2019/Qwen2.5-Math-14B-Instruct/59aaa7ed-27d4-4765-b115-90570ad86c77.json b/data/hfopenllm_v2/qingy2019/Qwen2.5-Math-14B-Instruct/59aaa7ed-27d4-4765-b115-90570ad86c77.json deleted file mode 100644 index 5ff1d5b79..000000000 --- a/data/hfopenllm_v2/qingy2019/Qwen2.5-Math-14B-Instruct/59aaa7ed-27d4-4765-b115-90570ad86c77.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/qingy2019_Qwen2.5-Math-14B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-Math-14B-Instruct", - "id": "qingy2019/Qwen2.5-Math-14B-Instruct", - "developer": "qingy2019", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6066 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.635 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3716 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.3725 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4757 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5331 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/qingy2019/Qwen2.5-Ultimate-14B-Instruct/4478c5ff-3b51-4be2-abce-3fb6a951b6e7.json b/data/hfopenllm_v2/qingy2019/Qwen2.5-Ultimate-14B-Instruct/4478c5ff-3b51-4be2-abce-3fb6a951b6e7.json deleted file mode 100644 index 12db4718f..000000000 --- a/data/hfopenllm_v2/qingy2019/Qwen2.5-Ultimate-14B-Instruct/4478c5ff-3b51-4be2-abce-3fb6a951b6e7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/qingy2019_Qwen2.5-Ultimate-14B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-Ultimate-14B-Instruct", - "id": "qingy2019/Qwen2.5-Ultimate-14B-Instruct", - "developer": "qingy2019", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3938 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5842 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2893 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3565 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - 
"hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4135 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4929 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/qingy2024/Benchmaxx-Llama-3.2-1B-Instruct/9202146d-5889-49fd-9025-e03153ba9093.json b/data/hfopenllm_v2/qingy2024/Benchmaxx-Llama-3.2-1B-Instruct/9202146d-5889-49fd-9025-e03153ba9093.json deleted file mode 100644 index e323ed5cd..000000000 --- a/data/hfopenllm_v2/qingy2024/Benchmaxx-Llama-3.2-1B-Instruct/9202146d-5889-49fd-9025-e03153ba9093.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/qingy2024_Benchmaxx-Llama-3.2-1B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Benchmaxx-Llama-3.2-1B-Instruct", - "id": "qingy2024/Benchmaxx-Llama-3.2-1B-Instruct", - "developer": "qingy2024", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.236 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2014 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8269 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4804 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2836 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 
0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3446 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1113 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/qingy2024/Eyas-17B-Instruct/94257d3e-2b1e-47a1-bbd1-7fc696a574b3.json b/data/hfopenllm_v2/qingy2024/Eyas-17B-Instruct/94257d3e-2b1e-47a1-bbd1-7fc696a574b3.json deleted file mode 100644 index a3811ca19..000000000 --- a/data/hfopenllm_v2/qingy2024/Eyas-17B-Instruct/94257d3e-2b1e-47a1-bbd1-7fc696a574b3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/qingy2024_Eyas-17B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Eyas-17B-Instruct", - "id": "qingy2024/Eyas-17B-Instruct", - "developer": "qingy2024", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 17.431 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6575 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6085 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.247 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3146 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4522 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4343 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/qingy2024/Falcon3-2x10B-MoE-Instruct/2245cf71-fb8d-44ca-b58d-06608312ee8c.json b/data/hfopenllm_v2/qingy2024/Falcon3-2x10B-MoE-Instruct/2245cf71-fb8d-44ca-b58d-06608312ee8c.json deleted file mode 100644 index 97d1ad31b..000000000 --- a/data/hfopenllm_v2/qingy2024/Falcon3-2x10B-MoE-Instruct/2245cf71-fb8d-44ca-b58d-06608312ee8c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/qingy2024_Falcon3-2x10B-MoE-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Falcon3-2x10B-MoE-Instruct", - "id": "qingy2024/Falcon3-2x10B-MoE-Instruct", - "developer": "qingy2024", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MixtralForCausalLM", - "params_billions": 18.799 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.785 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6185 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2795 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3305 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4284 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4423 - } - } - ] 
-} \ No newline at end of file diff --git a/data/hfopenllm_v2/qingy2024/Fusion-14B-Instruct/9a823fde-7802-4876-b72c-d8f73cd17236.json b/data/hfopenllm_v2/qingy2024/Fusion-14B-Instruct/9a823fde-7802-4876-b72c-d8f73cd17236.json deleted file mode 100644 index 7fb915e5f..000000000 --- a/data/hfopenllm_v2/qingy2024/Fusion-14B-Instruct/9a823fde-7802-4876-b72c-d8f73cd17236.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/qingy2024_Fusion-14B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Fusion-14B-Instruct", - "id": "qingy2024/Fusion-14B-Instruct", - "developer": "qingy2024", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.726 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6396 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3369 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3549 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.44 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5044 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/qingy2024/Fusion2-14B-Instruct/ede99239-ef8f-49eb-a48b-0ec2553c99e5.json b/data/hfopenllm_v2/qingy2024/Fusion2-14B-Instruct/ede99239-ef8f-49eb-a48b-0ec2553c99e5.json deleted file mode 
100644 index 130eb0036..000000000 --- a/data/hfopenllm_v2/qingy2024/Fusion2-14B-Instruct/ede99239-ef8f-49eb-a48b-0ec2553c99e5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/qingy2024_Fusion2-14B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Fusion2-14B-Instruct", - "id": "qingy2024/Fusion2-14B-Instruct", - "developer": "qingy2024", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6064 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6119 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3127 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3448 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4634 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5051 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/qingy2024/Fusion4-14B-Instruct/4a307570-994f-491c-87a7-ad90b7965b8b.json b/data/hfopenllm_v2/qingy2024/Fusion4-14B-Instruct/4a307570-994f-491c-87a7-ad90b7965b8b.json deleted file mode 100644 index 144b32ba3..000000000 --- a/data/hfopenllm_v2/qingy2024/Fusion4-14B-Instruct/4a307570-994f-491c-87a7-ad90b7965b8b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/qingy2024_Fusion4-14B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Fusion4-14B-Instruct", - "id": "qingy2024/Fusion4-14B-Instruct", - "developer": "qingy2024", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7649 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6543 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3882 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3305 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4326 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5194 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/qingy2024/OwO-14B-Instruct/eb448d78-6417-4533-8458-99c1869a74ae.json b/data/hfopenllm_v2/qingy2024/OwO-14B-Instruct/eb448d78-6417-4533-8458-99c1869a74ae.json deleted file mode 100644 index 2f6ab406d..000000000 --- a/data/hfopenllm_v2/qingy2024/OwO-14B-Instruct/eb448d78-6417-4533-8458-99c1869a74ae.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/qingy2024_OwO-14B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - 
"evaluator_relationship": "third_party" - }, - "model_info": { - "name": "OwO-14B-Instruct", - "id": "qingy2024/OwO-14B-Instruct", - "developer": "qingy2024", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1383 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6165 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4162 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3641 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4407 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5181 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/qingy2024/QwEnlarge-16B-Instruct/e1b8e4ad-4327-46b9-b957-fbd02e57c87e.json b/data/hfopenllm_v2/qingy2024/QwEnlarge-16B-Instruct/e1b8e4ad-4327-46b9-b957-fbd02e57c87e.json deleted file mode 100644 index 64c399092..000000000 --- a/data/hfopenllm_v2/qingy2024/QwEnlarge-16B-Instruct/e1b8e4ad-4327-46b9-b957-fbd02e57c87e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/qingy2024_QwEnlarge-16B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "QwEnlarge-16B-Instruct", - "id": "qingy2024/QwEnlarge-16B-Instruct", - "developer": "qingy2024", - "inference_platform": "unknown", - "additional_details": { - 
"precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 15.871 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7802 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5949 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.46 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3331 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4101 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4476 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/qingy2024/QwQ-14B-Math-v0.2/aab6b224-b948-4fb1-84b7-0dbe5c46d527.json b/data/hfopenllm_v2/qingy2024/QwQ-14B-Math-v0.2/aab6b224-b948-4fb1-84b7-0dbe5c46d527.json deleted file mode 100644 index 1d8168654..000000000 --- a/data/hfopenllm_v2/qingy2024/QwQ-14B-Math-v0.2/aab6b224-b948-4fb1-84b7-0dbe5c46d527.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/qingy2024_QwQ-14B-Math-v0.2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "QwQ-14B-Math-v0.2", - "id": "qingy2024/QwQ-14B-Math-v0.2", - "developer": "qingy2024", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": 
"google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3391 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5731 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4811 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2626 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4021 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.48 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/qingy2024/Qwarkstar-4B-Instruct-Preview/2e5cd1de-6109-4f76-b722-abbd4b207f4d.json b/data/hfopenllm_v2/qingy2024/Qwarkstar-4B-Instruct-Preview/2e5cd1de-6109-4f76-b722-abbd4b207f4d.json deleted file mode 100644 index dd72818b2..000000000 --- a/data/hfopenllm_v2/qingy2024/Qwarkstar-4B-Instruct-Preview/2e5cd1de-6109-4f76-b722-abbd4b207f4d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/qingy2024_Qwarkstar-4B-Instruct-Preview/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwarkstar-4B-Instruct-Preview", - "id": "qingy2024/Qwarkstar-4B-Instruct-Preview", - "developer": "qingy2024", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 4.473 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 
- }, - "score_details": { - "score": 0.5324 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4358 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1284 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2802 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3896 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2502 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/qingy2024/Qwarkstar-4B/767d1296-4971-478f-8d78-1d63d162ae5b.json b/data/hfopenllm_v2/qingy2024/Qwarkstar-4B/767d1296-4971-478f-8d78-1d63d162ae5b.json deleted file mode 100644 index 4fa365663..000000000 --- a/data/hfopenllm_v2/qingy2024/Qwarkstar-4B/767d1296-4971-478f-8d78-1d63d162ae5b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/qingy2024_Qwarkstar-4B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwarkstar-4B", - "id": "qingy2024/Qwarkstar-4B", - "developer": "qingy2024", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 4.473 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1994 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": 
false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4015 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0861 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3247 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4428 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2425 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/qingy2024/Qwen2.5-4B/eab74e3b-de61-4fa9-87c2-56e69b70349a.json b/data/hfopenllm_v2/qingy2024/Qwen2.5-4B/eab74e3b-de61-4fa9-87c2-56e69b70349a.json deleted file mode 100644 index 9a1241837..000000000 --- a/data/hfopenllm_v2/qingy2024/Qwen2.5-4B/eab74e3b-de61-4fa9-87c2-56e69b70349a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/qingy2024_Qwen2.5-4B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-4B", - "id": "qingy2024/Qwen2.5-4B", - "developer": "qingy2024", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 4.168 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2158 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4269 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": 
"DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0514 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2911 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.461 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2525 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/qingy2024/Qwen2.5-Coder-Draft-1.5B-Instruct/3219d563-3bfb-4618-8cb3-e9b198d5b11f.json b/data/hfopenllm_v2/qingy2024/Qwen2.5-Coder-Draft-1.5B-Instruct/3219d563-3bfb-4618-8cb3-e9b198d5b11f.json deleted file mode 100644 index 56cfd6f64..000000000 --- a/data/hfopenllm_v2/qingy2024/Qwen2.5-Coder-Draft-1.5B-Instruct/3219d563-3bfb-4618-8cb3-e9b198d5b11f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/qingy2024_Qwen2.5-Coder-Draft-1.5B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-Coder-Draft-1.5B-Instruct", - "id": "qingy2024/Qwen2.5-Coder-Draft-1.5B-Instruct", - "developer": "qingy2024", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.544 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4125 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3837 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1579 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2601 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.358 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2244 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/qingy2024/Qwen2.5-Math-14B-Instruct-Alpha/233fd27c-561e-4c9e-a917-cbc5b08c055a.json b/data/hfopenllm_v2/qingy2024/Qwen2.5-Math-14B-Instruct-Alpha/233fd27c-561e-4c9e-a917-cbc5b08c055a.json deleted file mode 100644 index 140c278d3..000000000 --- a/data/hfopenllm_v2/qingy2024/Qwen2.5-Math-14B-Instruct-Alpha/233fd27c-561e-4c9e-a917-cbc5b08c055a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/qingy2024_Qwen2.5-Math-14B-Instruct-Alpha/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-Math-14B-Instruct-Alpha", - "id": "qingy2024/Qwen2.5-Math-14B-Instruct-Alpha", - "developer": "qingy2024", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7704 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6465 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.429 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - 
"dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.349 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4021 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4966 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/qingy2024/Qwen2.5-Math-14B-Instruct-Preview/a875e8f7-a4e6-4c17-abbc-b8d4b73b7501.json b/data/hfopenllm_v2/qingy2024/Qwen2.5-Math-14B-Instruct-Preview/a875e8f7-a4e6-4c17-abbc-b8d4b73b7501.json deleted file mode 100644 index f9376abc8..000000000 --- a/data/hfopenllm_v2/qingy2024/Qwen2.5-Math-14B-Instruct-Preview/a875e8f7-a4e6-4c17-abbc-b8d4b73b7501.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/qingy2024_Qwen2.5-Math-14B-Instruct-Preview/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-Math-14B-Instruct-Preview", - "id": "qingy2024/Qwen2.5-Math-14B-Instruct-Preview", - "developer": "qingy2024", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7826 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6294 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4758 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3406 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4115 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4993 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/qingy2024/Qwen2.6-14B-Instruct/4b68ba49-6681-4add-9197-2cd711701e15.json b/data/hfopenllm_v2/qingy2024/Qwen2.6-14B-Instruct/4b68ba49-6681-4add-9197-2cd711701e15.json deleted file mode 100644 index c2c1c3606..000000000 --- a/data/hfopenllm_v2/qingy2024/Qwen2.6-14B-Instruct/4b68ba49-6681-4add-9197-2cd711701e15.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/qingy2024_Qwen2.6-14B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.6-14B-Instruct", - "id": "qingy2024/Qwen2.6-14B-Instruct", - "developer": "qingy2024", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5811 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6394 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3051 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3792 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - 
"source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4569 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5285 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/qingy2024/Qwen2.6-Math-14B-Instruct/5679ca73-3d5f-4bc7-bea2-5e9e713db0cc.json b/data/hfopenllm_v2/qingy2024/Qwen2.6-Math-14B-Instruct/5679ca73-3d5f-4bc7-bea2-5e9e713db0cc.json deleted file mode 100644 index ee424cd53..000000000 --- a/data/hfopenllm_v2/qingy2024/Qwen2.6-Math-14B-Instruct/5679ca73-3d5f-4bc7-bea2-5e9e713db0cc.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/qingy2024_Qwen2.6-Math-14B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.6-Math-14B-Instruct", - "id": "qingy2024/Qwen2.6-Math-14B-Instruct", - "developer": "qingy2024", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3862 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6324 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.429 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.37 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.4759 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5241 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/qq8933/OpenLongCoT-Base-Gemma2-2B/a6c631f6-890c-4199-abee-18b012bc48df.json b/data/hfopenllm_v2/qq8933/OpenLongCoT-Base-Gemma2-2B/a6c631f6-890c-4199-abee-18b012bc48df.json deleted file mode 100644 index 62c5fe7c9..000000000 --- a/data/hfopenllm_v2/qq8933/OpenLongCoT-Base-Gemma2-2B/a6c631f6-890c-4199-abee-18b012bc48df.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/qq8933_OpenLongCoT-Base-Gemma2-2B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "OpenLongCoT-Base-Gemma2-2B", - "id": "qq8933/OpenLongCoT-Base-Gemma2-2B", - "developer": "qq8933", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 3.204 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1965 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3106 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0234 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2626 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3222 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - 
}, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1316 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/raphgg/test-2.5-72B/1edc3610-40fc-467d-8410-26d4b6adebce.json b/data/hfopenllm_v2/raphgg/test-2.5-72B/1edc3610-40fc-467d-8410-26d4b6adebce.json deleted file mode 100644 index f9c3617d5..000000000 --- a/data/hfopenllm_v2/raphgg/test-2.5-72B/1edc3610-40fc-467d-8410-26d4b6adebce.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/raphgg_test-2.5-72B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "test-2.5-72B", - "id": "raphgg/test-2.5-72B", - "developer": "raphgg", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 72.706 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8437 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7266 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4109 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3893 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4812 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5837 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/rasyosef/Mistral-NeMo-Minitron-8B-Chat/42c773ba-8fb4-4b3c-8ac7-0688519bb55c.json b/data/hfopenllm_v2/rasyosef/Mistral-NeMo-Minitron-8B-Chat/42c773ba-8fb4-4b3c-8ac7-0688519bb55c.json deleted file mode 100644 index 33c5db79a..000000000 --- a/data/hfopenllm_v2/rasyosef/Mistral-NeMo-Minitron-8B-Chat/42c773ba-8fb4-4b3c-8ac7-0688519bb55c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/rasyosef_Mistral-NeMo-Minitron-8B-Chat/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral-NeMo-Minitron-8B-Chat", - "id": "rasyosef/Mistral-NeMo-Minitron-8B-Chat", - "developer": "rasyosef", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 8.414 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4452 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4759 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0272 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.276 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4304 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2404 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/rasyosef/Phi-1_5-Instruct-v0.1/1a371df5-447f-4fd8-8fe8-dbf9a1dc079a.json b/data/hfopenllm_v2/rasyosef/Phi-1_5-Instruct-v0.1/1a371df5-447f-4fd8-8fe8-dbf9a1dc079a.json 
deleted file mode 100644 index a55a5da1a..000000000 --- a/data/hfopenllm_v2/rasyosef/Phi-1_5-Instruct-v0.1/1a371df5-447f-4fd8-8fe8-dbf9a1dc079a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/rasyosef_Phi-1_5-Instruct-v0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Phi-1_5-Instruct-v0.1", - "id": "rasyosef/Phi-1_5-Instruct-v0.1", - "developer": "rasyosef", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "PhiForCausalLM", - "params_billions": 1.415 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2402 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3118 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0136 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2601 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3422 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1562 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/rasyosef/phi-2-instruct-apo/821a21a0-6fd7-438a-933d-5e31b2dd2adc.json b/data/hfopenllm_v2/rasyosef/phi-2-instruct-apo/821a21a0-6fd7-438a-933d-5e31b2dd2adc.json deleted file mode 100644 index 02df295de..000000000 --- a/data/hfopenllm_v2/rasyosef/phi-2-instruct-apo/821a21a0-6fd7-438a-933d-5e31b2dd2adc.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/rasyosef_phi-2-instruct-apo/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "phi-2-instruct-apo", - "id": "rasyosef/phi-2-instruct-apo", - "developer": "rasyosef", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "PhiForCausalLM", - "params_billions": 2.775 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3146 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4445 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0302 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2701 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3342 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2155 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/rasyosef/phi-2-instruct-v0.1/781a4cc6-a69d-4106-81aa-06e114f7c897.json b/data/hfopenllm_v2/rasyosef/phi-2-instruct-v0.1/781a4cc6-a69d-4106-81aa-06e114f7c897.json deleted file mode 100644 index 16d481683..000000000 --- a/data/hfopenllm_v2/rasyosef/phi-2-instruct-v0.1/781a4cc6-a69d-4106-81aa-06e114f7c897.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/rasyosef_phi-2-instruct-v0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - 
"evaluator_relationship": "third_party" - }, - "model_info": { - "name": "phi-2-instruct-v0.1", - "id": "rasyosef/phi-2-instruct-v0.1", - "developer": "rasyosef", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "PhiForCausalLM", - "params_billions": 2.775 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3681 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4726 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2743 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3524 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2247 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/realtreetune/rho-1b-sft-MATH/e49c98b4-46f4-406e-9eeb-7072bf72b9a3.json b/data/hfopenllm_v2/realtreetune/rho-1b-sft-MATH/e49c98b4-46f4-406e-9eeb-7072bf72b9a3.json deleted file mode 100644 index 56ba23460..000000000 --- a/data/hfopenllm_v2/realtreetune/rho-1b-sft-MATH/e49c98b4-46f4-406e-9eeb-7072bf72b9a3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/realtreetune_rho-1b-sft-MATH/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "rho-1b-sft-MATH", - "id": "realtreetune/rho-1b-sft-MATH", - "developer": "realtreetune", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - 
"architecture": "LlamaForCausalLM", - "params_billions": 1.1 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2121 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3144 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0347 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2525 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3458 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1117 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/recoilme/Gemma-2-Ataraxy-Gemmasutra-9B-slerp/3b7524a8-d17b-4788-93f2-11076df464a7.json b/data/hfopenllm_v2/recoilme/Gemma-2-Ataraxy-Gemmasutra-9B-slerp/3b7524a8-d17b-4788-93f2-11076df464a7.json deleted file mode 100644 index b8063cb4c..000000000 --- a/data/hfopenllm_v2/recoilme/Gemma-2-Ataraxy-Gemmasutra-9B-slerp/3b7524a8-d17b-4788-93f2-11076df464a7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/recoilme_Gemma-2-Ataraxy-Gemmasutra-9B-slerp/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gemma-2-Ataraxy-Gemmasutra-9B-slerp", - "id": "recoilme/Gemma-2-Ataraxy-Gemmasutra-9B-slerp", - "developer": "recoilme", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - 
"dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2854 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5984 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1005 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3297 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4607 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4162 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/recoilme/Gemma-2-Ataraxy-Gemmasutra-9B-slerp/6188a57f-4bc3-42a5-ad18-c59774e40407.json b/data/hfopenllm_v2/recoilme/Gemma-2-Ataraxy-Gemmasutra-9B-slerp/6188a57f-4bc3-42a5-ad18-c59774e40407.json deleted file mode 100644 index eda35f4dd..000000000 --- a/data/hfopenllm_v2/recoilme/Gemma-2-Ataraxy-Gemmasutra-9B-slerp/6188a57f-4bc3-42a5-ad18-c59774e40407.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/recoilme_Gemma-2-Ataraxy-Gemmasutra-9B-slerp/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gemma-2-Ataraxy-Gemmasutra-9B-slerp", - "id": "recoilme/Gemma-2-Ataraxy-Gemmasutra-9B-slerp", - "developer": "recoilme", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on 
IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7649 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5974 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0174 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3305 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4245 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4207 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/recoilme/recoilme-gemma-2-9B-v0.1/28689805-7c4c-438e-8431-f4a6aceb5e94.json b/data/hfopenllm_v2/recoilme/recoilme-gemma-2-9B-v0.1/28689805-7c4c-438e-8431-f4a6aceb5e94.json deleted file mode 100644 index fb6c7b792..000000000 --- a/data/hfopenllm_v2/recoilme/recoilme-gemma-2-9B-v0.1/28689805-7c4c-438e-8431-f4a6aceb5e94.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/recoilme_recoilme-gemma-2-9B-v0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "recoilme-gemma-2-9B-v0.1", - "id": "recoilme/recoilme-gemma-2-9B-v0.1", - "developer": "recoilme", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7515 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - 
"dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5995 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2039 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3389 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4191 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4159 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/recoilme/recoilme-gemma-2-9B-v0.2/7c156689-9668-4ded-bacc-c88a03ad1526.json b/data/hfopenllm_v2/recoilme/recoilme-gemma-2-9B-v0.2/7c156689-9668-4ded-bacc-c88a03ad1526.json deleted file mode 100644 index 8be1d1216..000000000 --- a/data/hfopenllm_v2/recoilme/recoilme-gemma-2-9B-v0.2/7c156689-9668-4ded-bacc-c88a03ad1526.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/recoilme_recoilme-gemma-2-9B-v0.2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "recoilme-gemma-2-9B-v0.2", - "id": "recoilme/recoilme-gemma-2-9B-v0.2", - "developer": "recoilme", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7592 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6026 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0529 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3289 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4099 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4163 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/recoilme/recoilme-gemma-2-9B-v0.2/7e43f187-1959-4dfe-802f-094ba88f3b0d.json b/data/hfopenllm_v2/recoilme/recoilme-gemma-2-9B-v0.2/7e43f187-1959-4dfe-802f-094ba88f3b0d.json deleted file mode 100644 index 2194c56b4..000000000 --- a/data/hfopenllm_v2/recoilme/recoilme-gemma-2-9B-v0.2/7e43f187-1959-4dfe-802f-094ba88f3b0d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/recoilme_recoilme-gemma-2-9B-v0.2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "recoilme-gemma-2-9B-v0.2", - "id": "recoilme/recoilme-gemma-2-9B-v0.2", - "developer": "recoilme", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2747 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6031 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - 
"hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0831 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3305 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4686 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4122 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/recoilme/recoilme-gemma-2-9B-v0.3/a6170173-ef17-4cfa-a76e-8e51cb8cb970.json b/data/hfopenllm_v2/recoilme/recoilme-gemma-2-9B-v0.3/a6170173-ef17-4cfa-a76e-8e51cb8cb970.json deleted file mode 100644 index b07a872e1..000000000 --- a/data/hfopenllm_v2/recoilme/recoilme-gemma-2-9B-v0.3/a6170173-ef17-4cfa-a76e-8e51cb8cb970.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/recoilme_recoilme-gemma-2-9B-v0.3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "recoilme-gemma-2-9B-v0.3", - "id": "recoilme/recoilme-gemma-2-9B-v0.3", - "developer": "recoilme", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7439 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5993 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.0876 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3238 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4204 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4072 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/recoilme/recoilme-gemma-2-9B-v0.3/e998d52b-dd94-4ef2-9cfc-5034ded0105a.json b/data/hfopenllm_v2/recoilme/recoilme-gemma-2-9B-v0.3/e998d52b-dd94-4ef2-9cfc-5034ded0105a.json deleted file mode 100644 index f0948a794..000000000 --- a/data/hfopenllm_v2/recoilme/recoilme-gemma-2-9B-v0.3/e998d52b-dd94-4ef2-9cfc-5034ded0105a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/recoilme_recoilme-gemma-2-9B-v0.3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "recoilme-gemma-2-9B-v0.3", - "id": "recoilme/recoilme-gemma-2-9B-v0.3", - "developer": "recoilme", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5761 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.602 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1888 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3372 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4632 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4039 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/recoilme/recoilme-gemma-2-9B-v0.4/a3ac60bd-8fb3-47d9-b378-1f0c4d74fed2.json b/data/hfopenllm_v2/recoilme/recoilme-gemma-2-9B-v0.4/a3ac60bd-8fb3-47d9-b378-1f0c4d74fed2.json deleted file mode 100644 index 4927c66a1..000000000 --- a/data/hfopenllm_v2/recoilme/recoilme-gemma-2-9B-v0.4/a3ac60bd-8fb3-47d9-b378-1f0c4d74fed2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/recoilme_recoilme-gemma-2-9B-v0.4/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "recoilme-gemma-2-9B-v0.4", - "id": "recoilme/recoilme-gemma-2-9B-v0.4", - "developer": "recoilme", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2562 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5967 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0846 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3406 - } - }, - { 
- "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4727 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4406 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/recoilme/recoilme-gemma-2-9B-v0.5/0f69217c-74ed-4398-8d1b-53d1a43be890.json b/data/hfopenllm_v2/recoilme/recoilme-gemma-2-9B-v0.5/0f69217c-74ed-4398-8d1b-53d1a43be890.json deleted file mode 100644 index 31bf944b2..000000000 --- a/data/hfopenllm_v2/recoilme/recoilme-gemma-2-9B-v0.5/0f69217c-74ed-4398-8d1b-53d1a43be890.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/recoilme_recoilme-gemma-2-9B-v0.5/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "recoilme-gemma-2-9B-v0.5", - "id": "recoilme/recoilme-gemma-2-9B-v0.5", - "developer": "recoilme", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7664 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5981 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2115 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3364 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4232 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.42 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/redrix/AngelSlayer-12B-Unslop-Mell-RPMax-DARKNESS/b973adcc-769c-4009-87c5-5f5af02a5d3a.json b/data/hfopenllm_v2/redrix/AngelSlayer-12B-Unslop-Mell-RPMax-DARKNESS/b973adcc-769c-4009-87c5-5f5af02a5d3a.json deleted file mode 100644 index f90aef9ac..000000000 --- a/data/hfopenllm_v2/redrix/AngelSlayer-12B-Unslop-Mell-RPMax-DARKNESS/b973adcc-769c-4009-87c5-5f5af02a5d3a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/redrix_AngelSlayer-12B-Unslop-Mell-RPMax-DARKNESS/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "AngelSlayer-12B-Unslop-Mell-RPMax-DARKNESS", - "id": "redrix/AngelSlayer-12B-Unslop-Mell-RPMax-DARKNESS", - "developer": "redrix", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.536 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5129 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1133 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3154 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.3818 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.318 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/redrix/patricide-12B-Unslop-Mell/4b30f11e-a2b9-40e9-b080-9d7484a5d048.json b/data/hfopenllm_v2/redrix/patricide-12B-Unslop-Mell/4b30f11e-a2b9-40e9-b080-9d7484a5d048.json deleted file mode 100644 index 98a7c58db..000000000 --- a/data/hfopenllm_v2/redrix/patricide-12B-Unslop-Mell/4b30f11e-a2b9-40e9-b080-9d7484a5d048.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/redrix_patricide-12B-Unslop-Mell/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "patricide-12B-Unslop-Mell", - "id": "redrix/patricide-12B-Unslop-Mell", - "developer": "redrix", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4074 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5399 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1314 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3238 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4026 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.357 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/refuelai/Llama-3-Refueled/befdae09-4caa-4996-a3ac-fe36310aaf01.json b/data/hfopenllm_v2/refuelai/Llama-3-Refueled/befdae09-4caa-4996-a3ac-fe36310aaf01.json deleted file mode 100644 index d68f26a1b..000000000 --- a/data/hfopenllm_v2/refuelai/Llama-3-Refueled/befdae09-4caa-4996-a3ac-fe36310aaf01.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/refuelai_Llama-3-Refueled/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-Refueled", - "id": "refuelai/Llama-3-Refueled", - "developer": "refuelai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.462 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5871 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0665 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2995 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4454 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3095 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/rhplus0831/maid-yuzu-v7/8cd7fc1b-2873-4154-9de7-c0b8e5f4f5e9.json b/data/hfopenllm_v2/rhplus0831/maid-yuzu-v7/8cd7fc1b-2873-4154-9de7-c0b8e5f4f5e9.json deleted file mode 100644 index 92de2faa1..000000000 --- a/data/hfopenllm_v2/rhplus0831/maid-yuzu-v7/8cd7fc1b-2873-4154-9de7-c0b8e5f4f5e9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/rhplus0831_maid-yuzu-v7/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "maid-yuzu-v7", - "id": "rhplus0831/maid-yuzu-v7", - "developer": "rhplus0831", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MixtralForCausalLM", - "params_billions": 46.703 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6462 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4805 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.102 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3096 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4136 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.354 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/rhymes-ai/Aria/7f6e5858-f5d4-41cf-9bb7-c3c82a55c392.json b/data/hfopenllm_v2/rhymes-ai/Aria/7f6e5858-f5d4-41cf-9bb7-c3c82a55c392.json deleted file mode 100644 index f9084404d..000000000 --- 
a/data/hfopenllm_v2/rhymes-ai/Aria/7f6e5858-f5d4-41cf-9bb7-c3c82a55c392.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/rhymes-ai_Aria/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Aria", - "id": "rhymes-ai/Aria", - "developer": "rhymes-ai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "AriaForConditionalGeneration", - "params_billions": 25.307 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4773 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5695 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1934 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3624 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4338 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4405 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/rhysjones/phi-2-orange-v2/7b8bf84f-4101-41a1-b6ff-9cadbb5f84a3.json b/data/hfopenllm_v2/rhysjones/phi-2-orange-v2/7b8bf84f-4101-41a1-b6ff-9cadbb5f84a3.json deleted file mode 100644 index c708415c7..000000000 --- a/data/hfopenllm_v2/rhysjones/phi-2-orange-v2/7b8bf84f-4101-41a1-b6ff-9cadbb5f84a3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/rhysjones_phi-2-orange-v2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - 
"source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "phi-2-orange-v2", - "id": "rhysjones/phi-2-orange-v2", - "developer": "rhysjones", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "PhiForCausalLM", - "params_billions": 2.78 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.367 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.477 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0408 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2617 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.363 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2532 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/riaz/FineLlama-3.1-8B/1f3a733d-a6d3-453b-9763-61992cd514b0.json b/data/hfopenllm_v2/riaz/FineLlama-3.1-8B/1f3a733d-a6d3-453b-9763-61992cd514b0.json deleted file mode 100644 index 8ee7040c9..000000000 --- a/data/hfopenllm_v2/riaz/FineLlama-3.1-8B/1f3a733d-a6d3-453b-9763-61992cd514b0.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/riaz_FineLlama-3.1-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "FineLlama-3.1-8B", - "id": "riaz/FineLlama-3.1-8B", - "developer": "riaz", - "inference_platform": 
"unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4373 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4586 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0514 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2752 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3763 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2964 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/riaz/FineLlama-3.1-8B/d0eed3c1-2226-48c5-a314-e429f66c5053.json b/data/hfopenllm_v2/riaz/FineLlama-3.1-8B/d0eed3c1-2226-48c5-a314-e429f66c5053.json deleted file mode 100644 index d57a902b6..000000000 --- a/data/hfopenllm_v2/riaz/FineLlama-3.1-8B/d0eed3c1-2226-48c5-a314-e429f66c5053.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/riaz_FineLlama-3.1-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "FineLlama-3.1-8B", - "id": "riaz/FineLlama-3.1-8B", - "developer": "riaz", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": 
"google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4137 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4565 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0453 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.276 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3776 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2978 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/rmdhirr/Gluon-8B/957f02f1-45c7-4cce-b5aa-86bb5e485ad3.json b/data/hfopenllm_v2/rmdhirr/Gluon-8B/957f02f1-45c7-4cce-b5aa-86bb5e485ad3.json deleted file mode 100644 index 37d3c1fbe..000000000 --- a/data/hfopenllm_v2/rmdhirr/Gluon-8B/957f02f1-45c7-4cce-b5aa-86bb5e485ad3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/rmdhirr_Gluon-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gluon-8B", - "id": "rmdhirr/Gluon-8B", - "developer": "rmdhirr", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5053 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - 
"source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5153 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1443 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3121 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4039 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3808 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/rombodawg/Rombos-Coder-V2.5-Qwen-14b/55a01e8e-318a-4609-a862-bab4d62b3e7a.json b/data/hfopenllm_v2/rombodawg/Rombos-Coder-V2.5-Qwen-14b/55a01e8e-318a-4609-a862-bab4d62b3e7a.json deleted file mode 100644 index c74361cf2..000000000 --- a/data/hfopenllm_v2/rombodawg/Rombos-Coder-V2.5-Qwen-14b/55a01e8e-318a-4609-a862-bab4d62b3e7a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/rombodawg_Rombos-Coder-V2.5-Qwen-14b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Rombos-Coder-V2.5-Qwen-14b", - "id": "rombodawg/Rombos-Coder-V2.5-Qwen-14b", - "developer": "rombodawg", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7047 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 
0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6165 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3301 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3029 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3915 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3939 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/rombodawg/Rombos-Coder-V2.5-Qwen-7b/cbdcd76f-be8f-42fe-89ed-d1d09d9d785f.json b/data/hfopenllm_v2/rombodawg/Rombos-Coder-V2.5-Qwen-7b/cbdcd76f-be8f-42fe-89ed-d1d09d9d785f.json deleted file mode 100644 index e153878a8..000000000 --- a/data/hfopenllm_v2/rombodawg/Rombos-Coder-V2.5-Qwen-7b/cbdcd76f-be8f-42fe-89ed-d1d09d9d785f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/rombodawg_Rombos-Coder-V2.5-Qwen-7b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Rombos-Coder-V2.5-Qwen-7b", - "id": "rombodawg/Rombos-Coder-V2.5-Qwen-7b", - "developer": "rombodawg", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.621 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5077 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - 
"hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3338 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2836 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3979 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3398 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/rombodawg/Rombos-LLM-V2.5-Qwen-0.5b/c7b6515e-6f96-468b-8bc0-15212c31e790.json b/data/hfopenllm_v2/rombodawg/Rombos-LLM-V2.5-Qwen-0.5b/c7b6515e-6f96-468b-8bc0-15212c31e790.json deleted file mode 100644 index 65683c672..000000000 --- a/data/hfopenllm_v2/rombodawg/Rombos-LLM-V2.5-Qwen-0.5b/c7b6515e-6f96-468b-8bc0-15212c31e790.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/rombodawg_Rombos-LLM-V2.5-Qwen-0.5b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Rombos-LLM-V2.5-Qwen-0.5b", - "id": "rombodawg/Rombos-LLM-V2.5-Qwen-0.5b", - "developer": "rombodawg", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2847 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3294 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.068 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2668 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3236 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1866 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/rombodawg/Rombos-LLM-V2.5-Qwen-1.5b/f27f3a1d-c19a-42b2-8b49-64ecfe5d3405.json b/data/hfopenllm_v2/rombodawg/Rombos-LLM-V2.5-Qwen-1.5b/f27f3a1d-c19a-42b2-8b49-64ecfe5d3405.json deleted file mode 100644 index c6d4a9338..000000000 --- a/data/hfopenllm_v2/rombodawg/Rombos-LLM-V2.5-Qwen-1.5b/f27f3a1d-c19a-42b2-8b49-64ecfe5d3405.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/rombodawg_Rombos-LLM-V2.5-Qwen-1.5b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Rombos-LLM-V2.5-Qwen-1.5b", - "id": "rombodawg/Rombos-LLM-V2.5-Qwen-1.5b", - "developer": "rombodawg", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.777 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3402 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4257 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0853 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - 
}, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2886 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4186 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2922 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/rombodawg/Rombos-LLM-V2.5-Qwen-14b/994aa481-627a-4bed-8719-9e874373cbc6.json b/data/hfopenllm_v2/rombodawg/Rombos-LLM-V2.5-Qwen-14b/994aa481-627a-4bed-8719-9e874373cbc6.json deleted file mode 100644 index 85be3f982..000000000 --- a/data/hfopenllm_v2/rombodawg/Rombos-LLM-V2.5-Qwen-14b/994aa481-627a-4bed-8719-9e874373cbc6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/rombodawg_Rombos-LLM-V2.5-Qwen-14b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Rombos-LLM-V2.5-Qwen-14b", - "id": "rombodawg/Rombos-LLM-V2.5-Qwen-14b", - "developer": "rombodawg", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.584 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6481 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4554 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3716 - } - 
}, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4717 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5376 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/rombodawg/Rombos-LLM-V2.5-Qwen-32b/9f5cd849-20b1-4e8d-9deb-f286dcfd9d6e.json b/data/hfopenllm_v2/rombodawg/Rombos-LLM-V2.5-Qwen-32b/9f5cd849-20b1-4e8d-9deb-f286dcfd9d6e.json deleted file mode 100644 index 85c6ec04a..000000000 --- a/data/hfopenllm_v2/rombodawg/Rombos-LLM-V2.5-Qwen-32b/9f5cd849-20b1-4e8d-9deb-f286dcfd9d6e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/rombodawg_Rombos-LLM-V2.5-Qwen-32b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Rombos-LLM-V2.5-Qwen-32b", - "id": "rombodawg/Rombos-LLM-V2.5-Qwen-32b", - "developer": "rombodawg", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 32.764 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6827 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7046 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4955 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3968 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5034 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5916 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/rombodawg/Rombos-LLM-V2.5-Qwen-3b/c4dd34f2-7acc-4a94-a9aa-3c6aeeae8a8c.json b/data/hfopenllm_v2/rombodawg/Rombos-LLM-V2.5-Qwen-3b/c4dd34f2-7acc-4a94-a9aa-3c6aeeae8a8c.json deleted file mode 100644 index 272c24433..000000000 --- a/data/hfopenllm_v2/rombodawg/Rombos-LLM-V2.5-Qwen-3b/c4dd34f2-7acc-4a94-a9aa-3c6aeeae8a8c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/rombodawg_Rombos-LLM-V2.5-Qwen-3b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Rombos-LLM-V2.5-Qwen-3b", - "id": "rombodawg/Rombos-LLM-V2.5-Qwen-3b", - "developer": "rombodawg", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.397 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5342 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4809 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2795 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3079 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4042 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": 
"MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3761 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/rombodawg/Rombos-LLM-V2.5-Qwen-72b/e908b473-a015-4156-8e88-d67153479cb9.json b/data/hfopenllm_v2/rombodawg/Rombos-LLM-V2.5-Qwen-72b/e908b473-a015-4156-8e88-d67153479cb9.json deleted file mode 100644 index 04db7b85b..000000000 --- a/data/hfopenllm_v2/rombodawg/Rombos-LLM-V2.5-Qwen-72b/e908b473-a015-4156-8e88-d67153479cb9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/rombodawg_Rombos-LLM-V2.5-Qwen-72b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Rombos-LLM-V2.5-Qwen-72b", - "id": "rombodawg/Rombos-LLM-V2.5-Qwen-72b", - "developer": "rombodawg", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 72.706 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7155 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.723 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5423 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3985 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4599 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5935 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/rombodawg/Rombos-LLM-V2.5-Qwen-7b/173af77d-7a51-4d5a-8fd3-366aaa5d78a0.json b/data/hfopenllm_v2/rombodawg/Rombos-LLM-V2.5-Qwen-7b/173af77d-7a51-4d5a-8fd3-366aaa5d78a0.json deleted file mode 100644 index 5fba641ae..000000000 --- a/data/hfopenllm_v2/rombodawg/Rombos-LLM-V2.5-Qwen-7b/173af77d-7a51-4d5a-8fd3-366aaa5d78a0.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/rombodawg_Rombos-LLM-V2.5-Qwen-7b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Rombos-LLM-V2.5-Qwen-7b", - "id": "rombodawg/Rombos-LLM-V2.5-Qwen-7b", - "developer": "rombodawg", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6237 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5544 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3814 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.318 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4291 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4469 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/rombodawg/Rombos-LLM-V2.5.1-Qwen-3b/0bb65f09-323d-485f-886e-5a35c8bcd342.json b/data/hfopenllm_v2/rombodawg/Rombos-LLM-V2.5.1-Qwen-3b/0bb65f09-323d-485f-886e-5a35c8bcd342.json deleted file mode 100644 index 18b32860b..000000000 --- a/data/hfopenllm_v2/rombodawg/Rombos-LLM-V2.5.1-Qwen-3b/0bb65f09-323d-485f-886e-5a35c8bcd342.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/rombodawg_Rombos-LLM-V2.5.1-Qwen-3b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Rombos-LLM-V2.5.1-Qwen-3b", - "id": "rombodawg/Rombos-LLM-V2.5.1-Qwen-3b", - "developer": "rombodawg", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.397 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2566 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.39 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1208 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2626 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3991 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2741 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/rombodawg/Rombos-LLM-V2.5.1-Qwen-3b/86b4c877-ef2d-4563-93a2-92d7e77eab5c.json b/data/hfopenllm_v2/rombodawg/Rombos-LLM-V2.5.1-Qwen-3b/86b4c877-ef2d-4563-93a2-92d7e77eab5c.json deleted file mode 
100644 index df1c02b1d..000000000 --- a/data/hfopenllm_v2/rombodawg/Rombos-LLM-V2.5.1-Qwen-3b/86b4c877-ef2d-4563-93a2-92d7e77eab5c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/rombodawg_Rombos-LLM-V2.5.1-Qwen-3b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Rombos-LLM-V2.5.1-Qwen-3b", - "id": "rombodawg/Rombos-LLM-V2.5.1-Qwen-3b", - "developer": "rombodawg", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.397 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2595 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3884 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0914 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2743 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3991 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2719 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/rombodawg/Rombos-LLM-V2.6-Nemotron-70b/be2ee3f6-37ee-4895-821a-3d3c7eb04eac.json b/data/hfopenllm_v2/rombodawg/Rombos-LLM-V2.6-Nemotron-70b/be2ee3f6-37ee-4895-821a-3d3c7eb04eac.json deleted file mode 100644 index 945721456..000000000 --- a/data/hfopenllm_v2/rombodawg/Rombos-LLM-V2.6-Nemotron-70b/be2ee3f6-37ee-4895-821a-3d3c7eb04eac.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - 
"evaluation_id": "hfopenllm_v2/rombodawg_Rombos-LLM-V2.6-Nemotron-70b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Rombos-LLM-V2.6-Nemotron-70b", - "id": "rombodawg/Rombos-LLM-V2.6-Nemotron-70b", - "developer": "rombodawg", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 70.554 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7527 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6938 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3331 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.406 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4669 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5329 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/rombodawg/Rombos-LLM-V2.6-Qwen-14b/e574af17-dd3b-4c09-8689-ea598d44e562.json b/data/hfopenllm_v2/rombodawg/Rombos-LLM-V2.6-Qwen-14b/e574af17-dd3b-4c09-8689-ea598d44e562.json deleted file mode 100644 index a382429d5..000000000 --- a/data/hfopenllm_v2/rombodawg/Rombos-LLM-V2.6-Qwen-14b/e574af17-dd3b-4c09-8689-ea598d44e562.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/rombodawg_Rombos-LLM-V2.6-Qwen-14b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - 
"source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Rombos-LLM-V2.6-Qwen-14b", - "id": "rombodawg/Rombos-LLM-V2.6-Qwen-14b", - "developer": "rombodawg", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8432 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6442 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5211 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3339 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4221 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4961 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/rombodawg/rombos_Replete-Coder-Instruct-8b-Merged/83958185-047a-4356-918d-2f45f273c08a.json b/data/hfopenllm_v2/rombodawg/rombos_Replete-Coder-Instruct-8b-Merged/83958185-047a-4356-918d-2f45f273c08a.json deleted file mode 100644 index 3c606dff2..000000000 --- a/data/hfopenllm_v2/rombodawg/rombos_Replete-Coder-Instruct-8b-Merged/83958185-047a-4356-918d-2f45f273c08a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/rombodawg_rombos_Replete-Coder-Instruct-8b-Merged/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": 
"rombos_Replete-Coder-Instruct-8b-Merged", - "id": "rombodawg/rombos_Replete-Coder-Instruct-8b-Merged", - "developer": "rombodawg", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5388 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4462 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0778 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2693 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.366 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1809 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/rombodawg/rombos_Replete-Coder-Llama3-8B/d04c6e84-0b63-4de1-9278-aa37c9d2c8e3.json b/data/hfopenllm_v2/rombodawg/rombos_Replete-Coder-Llama3-8B/d04c6e84-0b63-4de1-9278-aa37c9d2c8e3.json deleted file mode 100644 index c4f4fe486..000000000 --- a/data/hfopenllm_v2/rombodawg/rombos_Replete-Coder-Llama3-8B/d04c6e84-0b63-4de1-9278-aa37c9d2c8e3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/rombodawg_rombos_Replete-Coder-Llama3-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "rombos_Replete-Coder-Llama3-8B", - "id": "rombodawg/rombos_Replete-Coder-Llama3-8B", - "developer": "rombodawg", - "inference_platform": "unknown", - 
"additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4714 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3276 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0393 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2668 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3966 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1335 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/rootxhacker/Apollo-70B/a218e260-7f56-4676-af58-254bd84d0327.json b/data/hfopenllm_v2/rootxhacker/Apollo-70B/a218e260-7f56-4676-af58-254bd84d0327.json deleted file mode 100644 index c04a8ad40..000000000 --- a/data/hfopenllm_v2/rootxhacker/Apollo-70B/a218e260-7f56-4676-af58-254bd84d0327.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/rootxhacker_Apollo-70B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Apollo-70B", - "id": "rootxhacker/Apollo-70B", - "developer": "rootxhacker", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 70.554 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": 
"google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5099 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6804 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5612 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4572 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4948 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5279 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/rootxhacker/Apollo_v2-32B/f21fb2c8-4abe-40de-ab2c-9d23e95ee281.json b/data/hfopenllm_v2/rootxhacker/Apollo_v2-32B/f21fb2c8-4abe-40de-ab2c-9d23e95ee281.json deleted file mode 100644 index c4804ceaf..000000000 --- a/data/hfopenllm_v2/rootxhacker/Apollo_v2-32B/f21fb2c8-4abe-40de-ab2c-9d23e95ee281.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/rootxhacker_Apollo_v2-32B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Apollo_v2-32B", - "id": "rootxhacker/Apollo_v2-32B", - "developer": "rootxhacker", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 32.764 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.428 - } - }, - { - "evaluation_name": 
"BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7072 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4275 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3784 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4994 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5869 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/rootxhacker/apollo-7B/da5774b2-8a6f-4f2d-8267-beb25490b06a.json b/data/hfopenllm_v2/rootxhacker/apollo-7B/da5774b2-8a6f-4f2d-8267-beb25490b06a.json deleted file mode 100644 index eac1bfc10..000000000 --- a/data/hfopenllm_v2/rootxhacker/apollo-7B/da5774b2-8a6f-4f2d-8267-beb25490b06a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/rootxhacker_apollo-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "apollo-7B", - "id": "rootxhacker/apollo-7B", - "developer": "rootxhacker", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2953 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.3636 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0257 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2785 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4131 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1748 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/rsh345/mistral-ft-optimized-1218-NeuralHermes-2.5-Mistral-7B/274705bd-8eb6-4863-8998-f5d67c4ac827.json b/data/hfopenllm_v2/rsh345/mistral-ft-optimized-1218-NeuralHermes-2.5-Mistral-7B/274705bd-8eb6-4863-8998-f5d67c4ac827.json deleted file mode 100644 index cf9929e17..000000000 --- a/data/hfopenllm_v2/rsh345/mistral-ft-optimized-1218-NeuralHermes-2.5-Mistral-7B/274705bd-8eb6-4863-8998-f5d67c4ac827.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/rsh345_mistral-ft-optimized-1218-NeuralHermes-2.5-Mistral-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "mistral-ft-optimized-1218-NeuralHermes-2.5-Mistral-7B", - "id": "rsh345/mistral-ft-optimized-1218-NeuralHermes-2.5-Mistral-7B", - "developer": "rsh345", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3892 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5188 - } - }, - { - 
"evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0733 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3029 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4672 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3054 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/rubenroy/Geneva-12B-GCv2-5m/5b95cc2f-3378-45e7-9f56-6bb7e1ce4826.json b/data/hfopenllm_v2/rubenroy/Geneva-12B-GCv2-5m/5b95cc2f-3378-45e7-9f56-6bb7e1ce4826.json deleted file mode 100644 index ca4143a6b..000000000 --- a/data/hfopenllm_v2/rubenroy/Geneva-12B-GCv2-5m/5b95cc2f-3378-45e7-9f56-6bb7e1ce4826.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/rubenroy_Geneva-12B-GCv2-5m/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Geneva-12B-GCv2-5m", - "id": "rubenroy/Geneva-12B-GCv2-5m", - "developer": "rubenroy", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2586 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5278 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0801 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2878 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3525 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.325 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/rubenroy/Gilgamesh-72B/6918d1a3-e547-46b7-9062-274057c1f513.json b/data/hfopenllm_v2/rubenroy/Gilgamesh-72B/6918d1a3-e547-46b7-9062-274057c1f513.json deleted file mode 100644 index 5a7ac28c6..000000000 --- a/data/hfopenllm_v2/rubenroy/Gilgamesh-72B/6918d1a3-e547-46b7-9062-274057c1f513.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/rubenroy_Gilgamesh-72B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gilgamesh-72B", - "id": "rubenroy/Gilgamesh-72B", - "developer": "rubenroy", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 72.706 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8486 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7253 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4381 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - 
}, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3943 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4626 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5802 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/rubenroy/Zurich-14B-GCv2-5m/599deb3c-49f9-4c0b-af8d-78f9e166820b.json b/data/hfopenllm_v2/rubenroy/Zurich-14B-GCv2-5m/599deb3c-49f9-4c0b-af8d-78f9e166820b.json deleted file mode 100644 index fc4afde1a..000000000 --- a/data/hfopenllm_v2/rubenroy/Zurich-14B-GCv2-5m/599deb3c-49f9-4c0b-af8d-78f9e166820b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/rubenroy_Zurich-14B-GCv2-5m/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Zurich-14B-GCv2-5m", - "id": "rubenroy/Zurich-14B-GCv2-5m", - "developer": "rubenroy", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6164 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6308 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3074 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3616 - } - }, - { - "evaluation_name": "MUSR", - 
"source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4874 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5233 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ruizhe1217/sft-s1-qwen-0.5b/b4ea3f14-3787-434b-8f26-20ff640c0146.json b/data/hfopenllm_v2/ruizhe1217/sft-s1-qwen-0.5b/b4ea3f14-3787-434b-8f26-20ff640c0146.json deleted file mode 100644 index 739248056..000000000 --- a/data/hfopenllm_v2/ruizhe1217/sft-s1-qwen-0.5b/b4ea3f14-3787-434b-8f26-20ff640c0146.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ruizhe1217_sft-s1-qwen-0.5b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "sft-s1-qwen-0.5b", - "id": "ruizhe1217/sft-s1-qwen-0.5b", - "developer": "ruizhe1217", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.494 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2749 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3301 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0619 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.271 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.3196 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1892 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/rwitz/go-bruins-v2/6952c527-ca23-494a-910c-1c027e4a5a29.json b/data/hfopenllm_v2/rwitz/go-bruins-v2/6952c527-ca23-494a-910c-1c027e4a5a29.json deleted file mode 100644 index 9ab1d084b..000000000 --- a/data/hfopenllm_v2/rwitz/go-bruins-v2/6952c527-ca23-494a-910c-1c027e4a5a29.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/rwitz_go-bruins-v2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "go-bruins-v2", - "id": "rwitz/go-bruins-v2", - "developer": "rwitz", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4096 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3799 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0672 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2626 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4138 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2761 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sabersaleh/Llama2-7B-CPO/3f12e79c-dd1b-428d-9094-10a047205e3e.json b/data/hfopenllm_v2/sabersaleh/Llama2-7B-CPO/3f12e79c-dd1b-428d-9094-10a047205e3e.json deleted file mode 100644 index 63c7d223e..000000000 --- a/data/hfopenllm_v2/sabersaleh/Llama2-7B-CPO/3f12e79c-dd1b-428d-9094-10a047205e3e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sabersaleh_Llama2-7B-CPO/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama2-7B-CPO", - "id": "sabersaleh/Llama2-7B-CPO", - "developer": "sabersaleh", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 7.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1545 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3458 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0136 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2676 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4048 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1606 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sabersaleh/Llama2-7B-DPO/d508da29-0288-4a0a-b727-fc5355515c5e.json 
b/data/hfopenllm_v2/sabersaleh/Llama2-7B-DPO/d508da29-0288-4a0a-b727-fc5355515c5e.json deleted file mode 100644 index 86e3966a1..000000000 --- a/data/hfopenllm_v2/sabersaleh/Llama2-7B-DPO/d508da29-0288-4a0a-b727-fc5355515c5e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sabersaleh_Llama2-7B-DPO/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama2-7B-DPO", - "id": "sabersaleh/Llama2-7B-DPO", - "developer": "sabersaleh", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 7.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1453 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3512 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0159 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2685 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4114 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1626 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sabersaleh/Llama2-7B-IPO/48cf5a8a-70c6-4c55-8959-32d773d6dbcf.json b/data/hfopenllm_v2/sabersaleh/Llama2-7B-IPO/48cf5a8a-70c6-4c55-8959-32d773d6dbcf.json deleted file mode 100644 index 2dd58f3e1..000000000 --- a/data/hfopenllm_v2/sabersaleh/Llama2-7B-IPO/48cf5a8a-70c6-4c55-8959-32d773d6dbcf.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - 
"schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sabersaleh_Llama2-7B-IPO/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama2-7B-IPO", - "id": "sabersaleh/Llama2-7B-IPO", - "developer": "sabersaleh", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 7.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1769 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3475 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0159 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2676 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4048 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1617 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sabersaleh/Llama2-7B-KTO/4bb7d331-f305-4c08-a073-87ba7b2cbde2.json b/data/hfopenllm_v2/sabersaleh/Llama2-7B-KTO/4bb7d331-f305-4c08-a073-87ba7b2cbde2.json deleted file mode 100644 index 1848858c0..000000000 --- a/data/hfopenllm_v2/sabersaleh/Llama2-7B-KTO/4bb7d331-f305-4c08-a073-87ba7b2cbde2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sabersaleh_Llama2-7B-KTO/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": 
"Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama2-7B-KTO", - "id": "sabersaleh/Llama2-7B-KTO", - "developer": "sabersaleh", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 7.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1528 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3501 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0189 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2676 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4167 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1636 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sabersaleh/Llama2-7B-SPO/94639454-c525-4e6f-af27-d92d45a9ac40.json b/data/hfopenllm_v2/sabersaleh/Llama2-7B-SPO/94639454-c525-4e6f-af27-d92d45a9ac40.json deleted file mode 100644 index 108215f39..000000000 --- a/data/hfopenllm_v2/sabersaleh/Llama2-7B-SPO/94639454-c525-4e6f-af27-d92d45a9ac40.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sabersaleh_Llama2-7B-SPO/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama2-7B-SPO", - "id": "sabersaleh/Llama2-7B-SPO", - "developer": "sabersaleh", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": 
"LlamaForCausalLM", - "params_billions": 7.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1567 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3383 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0196 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2768 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3874 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1757 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sabersaleh/Llama2-7B-SimPO/9fa81bb7-7abc-4764-9465-d61217590da5.json b/data/hfopenllm_v2/sabersaleh/Llama2-7B-SimPO/9fa81bb7-7abc-4764-9465-d61217590da5.json deleted file mode 100644 index 7ee98a8ff..000000000 --- a/data/hfopenllm_v2/sabersaleh/Llama2-7B-SimPO/9fa81bb7-7abc-4764-9465-d61217590da5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sabersaleh_Llama2-7B-SimPO/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama2-7B-SimPO", - "id": "sabersaleh/Llama2-7B-SimPO", - "developer": "sabersaleh", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 7.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1659 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3489 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0159 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.271 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4007 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1641 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sabersaleh/Llama3/9a683492-4057-4de4-a30a-aa66becffb13.json b/data/hfopenllm_v2/sabersaleh/Llama3/9a683492-4057-4de4-a30a-aa66becffb13.json deleted file mode 100644 index 8c46cb2b7..000000000 --- a/data/hfopenllm_v2/sabersaleh/Llama3/9a683492-4057-4de4-a30a-aa66becffb13.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sabersaleh_Llama3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama3", - "id": "sabersaleh/Llama3", - "developer": "sabersaleh", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3321 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": 
"SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4782 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0566 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3104 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3933 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3162 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sabersalehk/Llama3-001-300/b917df45-62f2-4c3b-943a-ad6c98ef8bc1.json b/data/hfopenllm_v2/sabersalehk/Llama3-001-300/b917df45-62f2-4c3b-943a-ad6c98ef8bc1.json deleted file mode 100644 index aa57d205c..000000000 --- a/data/hfopenllm_v2/sabersalehk/Llama3-001-300/b917df45-62f2-4c3b-943a-ad6c98ef8bc1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sabersalehk_Llama3-001-300/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama3-001-300", - "id": "sabersalehk/Llama3-001-300", - "developer": "sabersalehk", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3179 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4745 - } - }, - { - "evaluation_name": 
"MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0529 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2995 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4064 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3158 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sabersalehk/Llama3-SimPO/ba658bc7-b89d-4fb7-a794-f48bd3715a49.json b/data/hfopenllm_v2/sabersalehk/Llama3-SimPO/ba658bc7-b89d-4fb7-a794-f48bd3715a49.json deleted file mode 100644 index ec3a89b1c..000000000 --- a/data/hfopenllm_v2/sabersalehk/Llama3-SimPO/ba658bc7-b89d-4fb7-a794-f48bd3715a49.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sabersalehk_Llama3-SimPO/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama3-SimPO", - "id": "sabersalehk/Llama3-SimPO", - "developer": "sabersalehk", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3642 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4874 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0574 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3079 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4046 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3157 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sabersalehk/Llama3_001_200/93f79cdc-ffd7-4299-9876-c0c7bed55ae5.json b/data/hfopenllm_v2/sabersalehk/Llama3_001_200/93f79cdc-ffd7-4299-9876-c0c7bed55ae5.json deleted file mode 100644 index 71b6274f4..000000000 --- a/data/hfopenllm_v2/sabersalehk/Llama3_001_200/93f79cdc-ffd7-4299-9876-c0c7bed55ae5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sabersalehk_Llama3_001_200/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama3_001_200", - "id": "sabersalehk/Llama3_001_200", - "developer": "sabersalehk", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3218 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4728 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0514 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3037 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4037 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3183 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sabersalehk/Llama3_01_300/5a91b0bf-b043-41d2-960d-5f0e78abc400.json b/data/hfopenllm_v2/sabersalehk/Llama3_01_300/5a91b0bf-b043-41d2-960d-5f0e78abc400.json deleted file mode 100644 index 5ecbad865..000000000 --- a/data/hfopenllm_v2/sabersalehk/Llama3_01_300/5a91b0bf-b043-41d2-960d-5f0e78abc400.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sabersalehk_Llama3_01_300/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama3_01_300", - "id": "sabersalehk/Llama3_01_300", - "developer": "sabersalehk", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2959 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4691 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0498 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3079 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - 
"dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4065 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3124 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/saishf/Fimbulvetr-Kuro-Lotus-10.7B/263f56e5-b578-475a-9bc4-b5ffc142f9e2.json b/data/hfopenllm_v2/saishf/Fimbulvetr-Kuro-Lotus-10.7B/263f56e5-b578-475a-9bc4-b5ffc142f9e2.json deleted file mode 100644 index 603e3b142..000000000 --- a/data/hfopenllm_v2/saishf/Fimbulvetr-Kuro-Lotus-10.7B/263f56e5-b578-475a-9bc4-b5ffc142f9e2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/saishf_Fimbulvetr-Kuro-Lotus-10.7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Fimbulvetr-Kuro-Lotus-10.7B", - "id": "saishf/Fimbulvetr-Kuro-Lotus-10.7B", - "developer": "saishf", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 10.732 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4939 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4342 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0536 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3012 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", 
- "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4445 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3389 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/saishf/Neural-SOVLish-Devil-8B-L3/9219ff66-73ba-45d8-99a0-23d23b3555ba.json b/data/hfopenllm_v2/saishf/Neural-SOVLish-Devil-8B-L3/9219ff66-73ba-45d8-99a0-23d23b3555ba.json deleted file mode 100644 index 11c0ba6c8..000000000 --- a/data/hfopenllm_v2/saishf/Neural-SOVLish-Devil-8B-L3/9219ff66-73ba-45d8-99a0-23d23b3555ba.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/saishf_Neural-SOVLish-Devil-8B-L3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Neural-SOVLish-Devil-8B-L3", - "id": "saishf/Neural-SOVLish-Devil-8B-L3", - "developer": "saishf", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4199 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5142 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0891 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3079 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.411 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": 
"TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3807 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/saishshinde15/TethysAI_Base_Reasoning/b2328396-e9b2-464d-94e4-f03db19144ea.json b/data/hfopenllm_v2/saishshinde15/TethysAI_Base_Reasoning/b2328396-e9b2-464d-94e4-f03db19144ea.json deleted file mode 100644 index 75ae575a6..000000000 --- a/data/hfopenllm_v2/saishshinde15/TethysAI_Base_Reasoning/b2328396-e9b2-464d-94e4-f03db19144ea.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/saishshinde15_TethysAI_Base_Reasoning/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "TethysAI_Base_Reasoning", - "id": "saishshinde15/TethysAI_Base_Reasoning", - "developer": "saishshinde15", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.086 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6369 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4519 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3142 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2861 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4075 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, 
- "score_details": { - "score": 0.3236 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/saishshinde15/TethysAI_Vortex/3f895edf-8f54-48ff-a731-666144af0fda.json b/data/hfopenllm_v2/saishshinde15/TethysAI_Vortex/3f895edf-8f54-48ff-a731-666144af0fda.json deleted file mode 100644 index ce5a5eb6e..000000000 --- a/data/hfopenllm_v2/saishshinde15/TethysAI_Vortex/3f895edf-8f54-48ff-a731-666144af0fda.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/saishshinde15_TethysAI_Vortex/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "TethysAI_Vortex", - "id": "saishshinde15/TethysAI_Vortex", - "developer": "saishshinde15", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.086 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4298 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4749 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.315 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3054 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4458 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3241 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/saishshinde15/TethysAI_Vortex_Reasoning/b48b8e16-a555-466b-8b1c-246137223311.json 
b/data/hfopenllm_v2/saishshinde15/TethysAI_Vortex_Reasoning/b48b8e16-a555-466b-8b1c-246137223311.json deleted file mode 100644 index 5425463aa..000000000 --- a/data/hfopenllm_v2/saishshinde15/TethysAI_Vortex_Reasoning/b48b8e16-a555-466b-8b1c-246137223311.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/saishshinde15_TethysAI_Vortex_Reasoning/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "TethysAI_Vortex_Reasoning", - "id": "saishshinde15/TethysAI_Vortex_Reasoning", - "developer": "saishshinde15", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.086 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4021 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4694 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2145 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3045 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4084 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3381 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sakaltcommunity/novablast-preview/5fdcb98f-4c50-4cdb-bd99-dd32efc6d6f3.json b/data/hfopenllm_v2/sakaltcommunity/novablast-preview/5fdcb98f-4c50-4cdb-bd99-dd32efc6d6f3.json deleted file mode 100644 index 899a6a5a1..000000000 --- 
a/data/hfopenllm_v2/sakaltcommunity/novablast-preview/5fdcb98f-4c50-4cdb-bd99-dd32efc6d6f3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sakaltcommunity_novablast-preview/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "novablast-preview", - "id": "sakaltcommunity/novablast-preview", - "developer": "sakaltcommunity", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 32.764 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.453 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7043 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4894 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3817 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5021 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5915 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sakaltcommunity/sakaltum-7b/d49c5e72-0dd0-4663-a310-9cd9bf1f5150.json b/data/hfopenllm_v2/sakaltcommunity/sakaltum-7b/d49c5e72-0dd0-4663-a310-9cd9bf1f5150.json deleted file mode 100644 index 3cafb3d05..000000000 --- a/data/hfopenllm_v2/sakaltcommunity/sakaltum-7b/d49c5e72-0dd0-4663-a310-9cd9bf1f5150.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sakaltcommunity_sakaltum-7b/1770682486.623709", - 
"retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "sakaltum-7b", - "id": "sakaltcommunity/sakaltum-7b", - "developer": "sakaltcommunity", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2604 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4575 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0295 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2727 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3775 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2769 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sakhan10/quantized_open_llama_3b_v2/0176903f-e6ca-4f21-b98a-00bc443bf244.json b/data/hfopenllm_v2/sakhan10/quantized_open_llama_3b_v2/0176903f-e6ca-4f21-b98a-00bc443bf244.json deleted file mode 100644 index 19cf5433b..000000000 --- a/data/hfopenllm_v2/sakhan10/quantized_open_llama_3b_v2/0176903f-e6ca-4f21-b98a-00bc443bf244.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sakhan10_quantized_open_llama_3b_v2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - 
}, - "model_info": { - "name": "quantized_open_llama_3b_v2", - "id": "sakhan10/quantized_open_llama_3b_v2", - "developer": "sakhan10", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1872 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.302 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2768 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3682 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1095 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/saltlux/luxia-21.4b-alignment-v1.0/11f32afc-95c1-4531-ae45-5a0974d36b3a.json b/data/hfopenllm_v2/saltlux/luxia-21.4b-alignment-v1.0/11f32afc-95c1-4531-ae45-5a0974d36b3a.json deleted file mode 100644 index 30efb3c9e..000000000 --- a/data/hfopenllm_v2/saltlux/luxia-21.4b-alignment-v1.0/11f32afc-95c1-4531-ae45-5a0974d36b3a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/saltlux_luxia-21.4b-alignment-v1.0/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "luxia-21.4b-alignment-v1.0", - "id": "saltlux/luxia-21.4b-alignment-v1.0", - "developer": "saltlux", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - 
"architecture": "LlamaForCausalLM", - "params_billions": 21.421 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3693 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6373 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0974 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3012 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4328 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3403 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/saltlux/luxia-21.4b-alignment-v1.2/70657dd7-63cf-40f4-92a0-1097fc1ce9ae.json b/data/hfopenllm_v2/saltlux/luxia-21.4b-alignment-v1.2/70657dd7-63cf-40f4-92a0-1097fc1ce9ae.json deleted file mode 100644 index 1f660f4e7..000000000 --- a/data/hfopenllm_v2/saltlux/luxia-21.4b-alignment-v1.2/70657dd7-63cf-40f4-92a0-1097fc1ce9ae.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/saltlux_luxia-21.4b-alignment-v1.2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "luxia-21.4b-alignment-v1.2", - "id": "saltlux/luxia-21.4b-alignment-v1.2", - "developer": "saltlux", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 21.421 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - 
"hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4115 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6371 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0846 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3079 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4459 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3473 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sam-paech/Darkest-muse-v1/53cf325b-6f32-4791-8f95-8b982ea03b23.json b/data/hfopenllm_v2/sam-paech/Darkest-muse-v1/53cf325b-6f32-4791-8f95-8b982ea03b23.json deleted file mode 100644 index 3a8e6c998..000000000 --- a/data/hfopenllm_v2/sam-paech/Darkest-muse-v1/53cf325b-6f32-4791-8f95-8b982ea03b23.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sam-paech_Darkest-muse-v1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Darkest-muse-v1", - "id": "sam-paech/Darkest-muse-v1", - "developer": "sam-paech", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7344 - } - }, - { - 
"evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5968 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2145 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.344 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4502 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4184 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sam-paech/Delirium-v1/8c50491b-6ed4-4f38-9d3f-d5168600cf4f.json b/data/hfopenllm_v2/sam-paech/Delirium-v1/8c50491b-6ed4-4f38-9d3f-d5168600cf4f.json deleted file mode 100644 index 2fdabfe78..000000000 --- a/data/hfopenllm_v2/sam-paech/Delirium-v1/8c50491b-6ed4-4f38-9d3f-d5168600cf4f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sam-paech_Delirium-v1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Delirium-v1", - "id": "sam-paech/Delirium-v1", - "developer": "sam-paech", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 9.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7208 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.5962 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2107 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3431 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4514 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.419 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sam-paech/Quill-v1/7adf79de-a51d-4b87-989a-c218ec6d99e3.json b/data/hfopenllm_v2/sam-paech/Quill-v1/7adf79de-a51d-4b87-989a-c218ec6d99e3.json deleted file mode 100644 index cd6582f9b..000000000 --- a/data/hfopenllm_v2/sam-paech/Quill-v1/7adf79de-a51d-4b87-989a-c218ec6d99e3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sam-paech_Quill-v1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Quill-v1", - "id": "sam-paech/Quill-v1", - "developer": "sam-paech", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 9.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7122 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5969 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match 
on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2122 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3398 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4555 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4171 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sarvamai/OpenHathi-7B-Hi-v0.1-Base/92358e5a-5e73-4747-9e92-e5ac003b97f7.json b/data/hfopenllm_v2/sarvamai/OpenHathi-7B-Hi-v0.1-Base/92358e5a-5e73-4747-9e92-e5ac003b97f7.json deleted file mode 100644 index 10e232e86..000000000 --- a/data/hfopenllm_v2/sarvamai/OpenHathi-7B-Hi-v0.1-Base/92358e5a-5e73-4747-9e92-e5ac003b97f7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sarvamai_OpenHathi-7B-Hi-v0.1-Base/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "OpenHathi-7B-Hi-v0.1-Base", - "id": "sarvamai/OpenHathi-7B-Hi-v0.1-Base", - "developer": "sarvamai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 6.87 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1804 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3354 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0083 - } - }, - { - "evaluation_name": "GPQA", - "source_data": 
{ - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2534 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3658 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1543 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/schnapss/testmerge-7b/f1636512-b98f-4fe4-adf3-abd556dd0ab9.json b/data/hfopenllm_v2/schnapss/testmerge-7b/f1636512-b98f-4fe4-adf3-abd556dd0ab9.json deleted file mode 100644 index 5594f41d4..000000000 --- a/data/hfopenllm_v2/schnapss/testmerge-7b/f1636512-b98f-4fe4-adf3-abd556dd0ab9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/schnapss_testmerge-7b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "testmerge-7b", - "id": "schnapss/testmerge-7b", - "developer": "schnapss", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3922 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5187 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0687 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.2961 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4686 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.306 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sci-m-wang/Mistral-7B-Instruct-sa-v0.1/9333afdd-4866-412b-b11b-dfb118a06db9.json b/data/hfopenllm_v2/sci-m-wang/Mistral-7B-Instruct-sa-v0.1/9333afdd-4866-412b-b11b-dfb118a06db9.json deleted file mode 100644 index 792d4db80..000000000 --- a/data/hfopenllm_v2/sci-m-wang/Mistral-7B-Instruct-sa-v0.1/9333afdd-4866-412b-b11b-dfb118a06db9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sci-m-wang_Mistral-7B-Instruct-sa-v0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral-7B-Instruct-sa-v0.1", - "id": "sci-m-wang/Mistral-7B-Instruct-sa-v0.1", - "developer": "sci-m-wang", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "?", - "params_billions": 14.483 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4335 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3273 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0144 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2592 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.39 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2362 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sci-m-wang/Phi-3-mini-4k-instruct-sa-v0.1/840c0e19-6d75-47a2-b64b-f9c51cb1dcff.json b/data/hfopenllm_v2/sci-m-wang/Phi-3-mini-4k-instruct-sa-v0.1/840c0e19-6d75-47a2-b64b-f9c51cb1dcff.json deleted file mode 100644 index 4019e2d95..000000000 --- a/data/hfopenllm_v2/sci-m-wang/Phi-3-mini-4k-instruct-sa-v0.1/840c0e19-6d75-47a2-b64b-f9c51cb1dcff.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sci-m-wang_Phi-3-mini-4k-instruct-sa-v0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Phi-3-mini-4k-instruct-sa-v0.1", - "id": "sci-m-wang/Phi-3-mini-4k-instruct-sa-v0.1", - "developer": "sci-m-wang", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "?", - "params_billions": 7.642 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5021 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5502 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.148 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3289 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4073 - } - 
}, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3985 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sci-m-wang/deepseek-llm-7b-chat-sa-v0.1/071b49f2-8e23-47b1-9858-78d676d9905e.json b/data/hfopenllm_v2/sci-m-wang/deepseek-llm-7b-chat-sa-v0.1/071b49f2-8e23-47b1-9858-78d676d9905e.json deleted file mode 100644 index ea4d21a1e..000000000 --- a/data/hfopenllm_v2/sci-m-wang/deepseek-llm-7b-chat-sa-v0.1/071b49f2-8e23-47b1-9858-78d676d9905e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sci-m-wang_deepseek-llm-7b-chat-sa-v0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "deepseek-llm-7b-chat-sa-v0.1", - "id": "sci-m-wang/deepseek-llm-7b-chat-sa-v0.1", - "developer": "sci-m-wang", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "?", - "params_billions": 7.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4036 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3718 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0264 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2567 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4173 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2209 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/securin/Securin-LLM-V2.5-Qwen-1.5B/d3821f53-87aa-470a-a403-c8e3cd100ae1.json b/data/hfopenllm_v2/securin/Securin-LLM-V2.5-Qwen-1.5B/d3821f53-87aa-470a-a403-c8e3cd100ae1.json deleted file mode 100644 index 4a45da063..000000000 --- a/data/hfopenllm_v2/securin/Securin-LLM-V2.5-Qwen-1.5B/d3821f53-87aa-470a-a403-c8e3cd100ae1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/securin_Securin-LLM-V2.5-Qwen-1.5B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Securin-LLM-V2.5-Qwen-1.5B", - "id": "securin/Securin-LLM-V2.5-Qwen-1.5B", - "developer": "securin", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.543 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1492 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3158 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0249 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.25 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3606 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1615 - } - } - ] -} \ No newline 
at end of file diff --git a/data/hfopenllm_v2/senseable/WestLake-7B-v2/389dbaba-c9cd-4e6b-afb3-f2ee3951faa0.json b/data/hfopenllm_v2/senseable/WestLake-7B-v2/389dbaba-c9cd-4e6b-afb3-f2ee3951faa0.json deleted file mode 100644 index 60607b3dd..000000000 --- a/data/hfopenllm_v2/senseable/WestLake-7B-v2/389dbaba-c9cd-4e6b-afb3-f2ee3951faa0.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/senseable_WestLake-7B-v2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "WestLake-7B-v2", - "id": "senseable/WestLake-7B-v2", - "developer": "senseable", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4419 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4073 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0483 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2768 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3937 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2764 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sequelbox/Llama3.1-70B-PlumChat/5f78f39a-42cc-4cf6-bb27-e2160765bf24.json b/data/hfopenllm_v2/sequelbox/Llama3.1-70B-PlumChat/5f78f39a-42cc-4cf6-bb27-e2160765bf24.json deleted file mode 100644 index e8f20a830..000000000 --- 
a/data/hfopenllm_v2/sequelbox/Llama3.1-70B-PlumChat/5f78f39a-42cc-4cf6-bb27-e2160765bf24.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sequelbox_Llama3.1-70B-PlumChat/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama3.1-70B-PlumChat", - "id": "sequelbox/Llama3.1-70B-PlumChat", - "developer": "sequelbox", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 70.554 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5616 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6753 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3029 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3909 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4774 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5164 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sequelbox/Llama3.1-8B-MOTH/b6e3d811-bf9d-474e-b82d-358a44e0dfc9.json b/data/hfopenllm_v2/sequelbox/Llama3.1-8B-MOTH/b6e3d811-bf9d-474e-b82d-358a44e0dfc9.json deleted file mode 100644 index fa67f2014..000000000 --- a/data/hfopenllm_v2/sequelbox/Llama3.1-8B-MOTH/b6e3d811-bf9d-474e-b82d-358a44e0dfc9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sequelbox_Llama3.1-8B-MOTH/1770682486.623709", - 
"retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama3.1-8B-MOTH", - "id": "sequelbox/Llama3.1-8B-MOTH", - "developer": "sequelbox", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5245 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4902 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1216 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2685 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3689 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3339 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sequelbox/Llama3.1-8B-PlumChat/bef1cbad-4f75-4dde-b467-6145f72a87f4.json b/data/hfopenllm_v2/sequelbox/Llama3.1-8B-PlumChat/bef1cbad-4f75-4dde-b467-6145f72a87f4.json deleted file mode 100644 index 7e037a4de..000000000 --- a/data/hfopenllm_v2/sequelbox/Llama3.1-8B-PlumChat/bef1cbad-4f75-4dde-b467-6145f72a87f4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sequelbox_Llama3.1-8B-PlumChat/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - 
"name": "Llama3.1-8B-PlumChat", - "id": "sequelbox/Llama3.1-8B-PlumChat", - "developer": "sequelbox", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4243 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3873 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0363 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2651 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3755 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2127 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sequelbox/Llama3.1-8B-PlumCode/654bebe0-b461-427e-a4cf-06386e9272d8.json b/data/hfopenllm_v2/sequelbox/Llama3.1-8B-PlumCode/654bebe0-b461-427e-a4cf-06386e9272d8.json deleted file mode 100644 index cef3dea6d..000000000 --- a/data/hfopenllm_v2/sequelbox/Llama3.1-8B-PlumCode/654bebe0-b461-427e-a4cf-06386e9272d8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sequelbox_Llama3.1-8B-PlumCode/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama3.1-8B-PlumCode", - "id": "sequelbox/Llama3.1-8B-PlumCode", - "developer": "sequelbox", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - 
"params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2045 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3368 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0272 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.276 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3773 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2335 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sequelbox/Llama3.1-8B-PlumMath/37ef4e34-58f8-463a-950f-48b3a6833d54.json b/data/hfopenllm_v2/sequelbox/Llama3.1-8B-PlumMath/37ef4e34-58f8-463a-950f-48b3a6833d54.json deleted file mode 100644 index 553cddc4a..000000000 --- a/data/hfopenllm_v2/sequelbox/Llama3.1-8B-PlumMath/37ef4e34-58f8-463a-950f-48b3a6833d54.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sequelbox_Llama3.1-8B-PlumMath/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama3.1-8B-PlumMath", - "id": "sequelbox/Llama3.1-8B-PlumMath", - "developer": "sequelbox", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2242 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4032 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0476 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.318 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3919 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2975 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sequelbox/gemma-2-9B-MOTH/20687086-8aab-40f1-aec6-03917f4f9bf5.json b/data/hfopenllm_v2/sequelbox/gemma-2-9B-MOTH/20687086-8aab-40f1-aec6-03917f4f9bf5.json deleted file mode 100644 index 1c49b7861..000000000 --- a/data/hfopenllm_v2/sequelbox/gemma-2-9B-MOTH/20687086-8aab-40f1-aec6-03917f4f9bf5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sequelbox_gemma-2-9B-MOTH/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "gemma-2-9B-MOTH", - "id": "sequelbox/gemma-2-9B-MOTH", - "developer": "sequelbox", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 9.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2059 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": 
"BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.308 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0106 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2601 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3409 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.114 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sethuiyer/Llama-3.1-8B-Experimental-1206-Instruct/53a0a998-a0a6-4800-80bf-bfd83123f2f6.json b/data/hfopenllm_v2/sethuiyer/Llama-3.1-8B-Experimental-1206-Instruct/53a0a998-a0a6-4800-80bf-bfd83123f2f6.json deleted file mode 100644 index 4beac0c7e..000000000 --- a/data/hfopenllm_v2/sethuiyer/Llama-3.1-8B-Experimental-1206-Instruct/53a0a998-a0a6-4800-80bf-bfd83123f2f6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sethuiyer_Llama-3.1-8B-Experimental-1206-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.1-8B-Experimental-1206-Instruct", - "id": "sethuiyer/Llama-3.1-8B-Experimental-1206-Instruct", - "developer": "sethuiyer", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6967 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": 
"Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5104 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1118 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2995 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3966 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3529 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sethuiyer/Llama-3.1-8B-Experimental-1208-Instruct/4ee8df1c-e8ff-4a56-816c-0c2258a226e7.json b/data/hfopenllm_v2/sethuiyer/Llama-3.1-8B-Experimental-1208-Instruct/4ee8df1c-e8ff-4a56-816c-0c2258a226e7.json deleted file mode 100644 index a0e1dafd6..000000000 --- a/data/hfopenllm_v2/sethuiyer/Llama-3.1-8B-Experimental-1208-Instruct/4ee8df1c-e8ff-4a56-816c-0c2258a226e7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sethuiyer_Llama-3.1-8B-Experimental-1208-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.1-8B-Experimental-1208-Instruct", - "id": "sethuiyer/Llama-3.1-8B-Experimental-1208-Instruct", - "developer": "sethuiyer", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.61 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.4964 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0891 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2961 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.379 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3511 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sethuiyer/LlamaZero-3.1-8B-Experimental-1208/42c8d84d-c8b8-42c6-8f49-4e971df173d7.json b/data/hfopenllm_v2/sethuiyer/LlamaZero-3.1-8B-Experimental-1208/42c8d84d-c8b8-42c6-8f49-4e971df173d7.json deleted file mode 100644 index 27df7dbc2..000000000 --- a/data/hfopenllm_v2/sethuiyer/LlamaZero-3.1-8B-Experimental-1208/42c8d84d-c8b8-42c6-8f49-4e971df173d7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sethuiyer_LlamaZero-3.1-8B-Experimental-1208/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "LlamaZero-3.1-8B-Experimental-1208", - "id": "sethuiyer/LlamaZero-3.1-8B-Experimental-1208", - "developer": "sethuiyer", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6051 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4981 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": 
"hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.108 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2685 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.382 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sethuiyer/Llamaverse-3.1-8B-Instruct/77b57dea-22e1-48a6-b8ae-9e474f08ad5f.json b/data/hfopenllm_v2/sethuiyer/Llamaverse-3.1-8B-Instruct/77b57dea-22e1-48a6-b8ae-9e474f08ad5f.json deleted file mode 100644 index 38d270dc1..000000000 --- a/data/hfopenllm_v2/sethuiyer/Llamaverse-3.1-8B-Instruct/77b57dea-22e1-48a6-b8ae-9e474f08ad5f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sethuiyer_Llamaverse-3.1-8B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llamaverse-3.1-8B-Instruct", - "id": "sethuiyer/Llamaverse-3.1-8B-Instruct", - "developer": "sethuiyer", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6185 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5414 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1858 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2911 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3762 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3523 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sethuiyer/Llamazing-3.1-8B-Instruct/a9ed5d04-57d2-4566-91df-b798be939fdb.json b/data/hfopenllm_v2/sethuiyer/Llamazing-3.1-8B-Instruct/a9ed5d04-57d2-4566-91df-b798be939fdb.json deleted file mode 100644 index f6ecfe05e..000000000 --- a/data/hfopenllm_v2/sethuiyer/Llamazing-3.1-8B-Instruct/a9ed5d04-57d2-4566-91df-b798be939fdb.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sethuiyer_Llamazing-3.1-8B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llamazing-3.1-8B-Instruct", - "id": "sethuiyer/Llamazing-3.1-8B-Instruct", - "developer": "sethuiyer", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5711 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5291 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0544 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": 
"Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3121 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3976 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3606 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sethuiyer/Qwen2.5-7B-Anvita/bad4ec47-fe84-4518-b072-6955938f0c86.json b/data/hfopenllm_v2/sethuiyer/Qwen2.5-7B-Anvita/bad4ec47-fe84-4518-b072-6955938f0c86.json deleted file mode 100644 index de9d6f292..000000000 --- a/data/hfopenllm_v2/sethuiyer/Qwen2.5-7B-Anvita/bad4ec47-fe84-4518-b072-6955938f0c86.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sethuiyer_Qwen2.5-7B-Anvita/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-7B-Anvita", - "id": "sethuiyer/Qwen2.5-7B-Anvita", - "developer": "sethuiyer", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.648 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5466 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2017 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3272 - } - }, - { - 
"evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4337 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4166 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/shadowml/BeagSake-7B/497e585c-059a-4e18-9a8f-bdaa066f59ea.json b/data/hfopenllm_v2/shadowml/BeagSake-7B/497e585c-059a-4e18-9a8f-bdaa066f59ea.json deleted file mode 100644 index 3c7a54074..000000000 --- a/data/hfopenllm_v2/shadowml/BeagSake-7B/497e585c-059a-4e18-9a8f-bdaa066f59ea.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/shadowml_BeagSake-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "BeagSake-7B", - "id": "shadowml/BeagSake-7B", - "developer": "shadowml", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5216 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4711 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0506 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.281 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 
1.0 - }, - "score_details": { - "score": 0.4124 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2585 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/shadowml/Mixolar-4x7b/e24b2a4e-83e4-4a79-bc41-03a54af00595.json b/data/hfopenllm_v2/shadowml/Mixolar-4x7b/e24b2a4e-83e4-4a79-bc41-03a54af00595.json deleted file mode 100644 index 193aa7115..000000000 --- a/data/hfopenllm_v2/shadowml/Mixolar-4x7b/e24b2a4e-83e4-4a79-bc41-03a54af00595.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/shadowml_Mixolar-4x7b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mixolar-4x7b", - "id": "shadowml/Mixolar-4x7b", - "developer": "shadowml", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MixtralForCausalLM", - "params_billions": 36.099 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3893 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5216 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0582 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2928 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4258 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3305 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/shastraai/Shastra-LLAMA2-Math-Commonsense-SFT/15e39361-585b-4870-b91a-64dce4fb37ec.json b/data/hfopenllm_v2/shastraai/Shastra-LLAMA2-Math-Commonsense-SFT/15e39361-585b-4870-b91a-64dce4fb37ec.json deleted file mode 100644 index 015b23d62..000000000 --- a/data/hfopenllm_v2/shastraai/Shastra-LLAMA2-Math-Commonsense-SFT/15e39361-585b-4870-b91a-64dce4fb37ec.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/shastraai_Shastra-LLAMA2-Math-Commonsense-SFT/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Shastra-LLAMA2-Math-Commonsense-SFT", - "id": "shastraai/Shastra-LLAMA2-Math-Commonsense-SFT", - "developer": "shastraai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 6.738 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3042 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3843 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0174 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2592 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3604 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1997 - } - } - ] 
-} \ No newline at end of file diff --git a/data/hfopenllm_v2/shivam9980/NEPALI-LLM/96efd11b-e9f2-4bf1-90f9-561714137edf.json b/data/hfopenllm_v2/shivam9980/NEPALI-LLM/96efd11b-e9f2-4bf1-90f9-561714137edf.json deleted file mode 100644 index 60b5094a5..000000000 --- a/data/hfopenllm_v2/shivam9980/NEPALI-LLM/96efd11b-e9f2-4bf1-90f9-561714137edf.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/shivam9980_NEPALI-LLM/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "NEPALI-LLM", - "id": "shivam9980/NEPALI-LLM", - "developer": "shivam9980", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.273 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0417 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3828 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0091 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2617 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4122 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2064 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/shivam9980/mistral-7b-news-cnn-merged/98e9936d-d376-4c72-80a6-0a28cf722ac4.json b/data/hfopenllm_v2/shivam9980/mistral-7b-news-cnn-merged/98e9936d-d376-4c72-80a6-0a28cf722ac4.json deleted file mode 100644 index aa2c40613..000000000 
--- a/data/hfopenllm_v2/shivam9980/mistral-7b-news-cnn-merged/98e9936d-d376-4c72-80a6-0a28cf722ac4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/shivam9980_mistral-7b-news-cnn-merged/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "mistral-7b-news-cnn-merged", - "id": "shivam9980/mistral-7b-news-cnn-merged", - "developer": "shivam9980", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "?", - "params_billions": 7.723 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4634 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3635 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0189 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3087 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4523 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2827 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/shivank21/mistral_dpo_self/7ada9c83-7851-4da2-b9d1-d744b174b777.json b/data/hfopenllm_v2/shivank21/mistral_dpo_self/7ada9c83-7851-4da2-b9d1-d744b174b777.json deleted file mode 100644 index ab32e42b6..000000000 --- a/data/hfopenllm_v2/shivank21/mistral_dpo_self/7ada9c83-7851-4da2-b9d1-d744b174b777.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/shivank21_mistral_dpo_self/1770682486.623709", 
- "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "mistral_dpo_self", - "id": "shivank21/mistral_dpo_self", - "developer": "shivank21", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "", - "params_billions": 7.913 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3403 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3216 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0219 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2408 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3247 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2214 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/shuttleai/shuttle-3/a6ed72b7-14f1-464c-a7f5-590791982696.json b/data/hfopenllm_v2/shuttleai/shuttle-3/a6ed72b7-14f1-464c-a7f5-590791982696.json deleted file mode 100644 index b5153a72b..000000000 --- a/data/hfopenllm_v2/shuttleai/shuttle-3/a6ed72b7-14f1-464c-a7f5-590791982696.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/shuttleai_shuttle-3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "shuttle-3", - "id": "shuttleai/shuttle-3", - 
"developer": "shuttleai", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 72.706 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8154 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.742 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.46 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4119 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4377 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5716 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/shyamieee/Padma-v7.0/79e3f38d-ae2b-44a7-be0d-024adad6bcd6.json b/data/hfopenllm_v2/shyamieee/Padma-v7.0/79e3f38d-ae2b-44a7-be0d-024adad6bcd6.json deleted file mode 100644 index 17b7fc494..000000000 --- a/data/hfopenllm_v2/shyamieee/Padma-v7.0/79e3f38d-ae2b-44a7-be0d-024adad6bcd6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/shyamieee_Padma-v7.0/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Padma-v7.0", - "id": "shyamieee/Padma-v7.0", - "developer": "shyamieee", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - 
"source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3841 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5119 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0702 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2861 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4386 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3029 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/silma-ai/SILMA-9B-Instruct-v1.0/ef13bdea-cf73-4ead-b6d7-73a155fa9a79.json b/data/hfopenllm_v2/silma-ai/SILMA-9B-Instruct-v1.0/ef13bdea-cf73-4ead-b6d7-73a155fa9a79.json deleted file mode 100644 index d7f107192..000000000 --- a/data/hfopenllm_v2/silma-ai/SILMA-9B-Instruct-v1.0/ef13bdea-cf73-4ead-b6d7-73a155fa9a79.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/silma-ai_SILMA-9B-Instruct-v1.0/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SILMA-9B-Instruct-v1.0", - "id": "silma-ai/SILMA-9B-Instruct-v1.0", - "developer": "silma-ai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 9.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - 
}, - "score_details": { - "score": 0.5842 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5219 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1163 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3054 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4637 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.392 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/silma-ai/SILMA-Kashif-2B-Instruct-v1.0/2663884f-941c-4e16-8029-b38e3a543733.json b/data/hfopenllm_v2/silma-ai/SILMA-Kashif-2B-Instruct-v1.0/2663884f-941c-4e16-8029-b38e3a543733.json deleted file mode 100644 index 31c0700a3..000000000 --- a/data/hfopenllm_v2/silma-ai/SILMA-Kashif-2B-Instruct-v1.0/2663884f-941c-4e16-8029-b38e3a543733.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/silma-ai_SILMA-Kashif-2B-Instruct-v1.0/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SILMA-Kashif-2B-Instruct-v1.0", - "id": "silma-ai/SILMA-Kashif-2B-Instruct-v1.0", - "developer": "silma-ai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 2.614 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1181 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, 
- "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3793 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0113 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2701 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4043 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2258 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/siqi00/Mistral-7B-DFT/ca7af645-4796-4b31-ae7d-2cbebe5a369b.json b/data/hfopenllm_v2/siqi00/Mistral-7B-DFT/ca7af645-4796-4b31-ae7d-2cbebe5a369b.json deleted file mode 100644 index 6c0b09631..000000000 --- a/data/hfopenllm_v2/siqi00/Mistral-7B-DFT/ca7af645-4796-4b31-ae7d-2cbebe5a369b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/siqi00_Mistral-7B-DFT/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral-7B-DFT", - "id": "siqi00/Mistral-7B-DFT", - "developer": "siqi00", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5569 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4665 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - 
"dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0378 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3045 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4191 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2963 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/siqi00/Mistral-7B-DFT2/f95e098c-d320-4db1-887d-8c3252bbaf77.json b/data/hfopenllm_v2/siqi00/Mistral-7B-DFT2/f95e098c-d320-4db1-887d-8c3252bbaf77.json deleted file mode 100644 index e31ef75fe..000000000 --- a/data/hfopenllm_v2/siqi00/Mistral-7B-DFT2/f95e098c-d320-4db1-887d-8c3252bbaf77.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/siqi00_Mistral-7B-DFT2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral-7B-DFT2", - "id": "siqi00/Mistral-7B-DFT2", - "developer": "siqi00", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5804 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3968 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.0453 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2995 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4401 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2852 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/skumar9/Llama-medx_v2/2bbf6dc9-8dd5-4dee-908e-d4a8fc03bc84.json b/data/hfopenllm_v2/skumar9/Llama-medx_v2/2bbf6dc9-8dd5-4dee-908e-d4a8fc03bc84.json deleted file mode 100644 index 675a3eeee..000000000 --- a/data/hfopenllm_v2/skumar9/Llama-medx_v2/2bbf6dc9-8dd5-4dee-908e-d4a8fc03bc84.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/skumar9_Llama-medx_v2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-medx_v2", - "id": "skumar9/Llama-medx_v2", - "developer": "skumar9", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4462 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4909 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0914 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3054 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3661 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3463 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/skymizer/Llama2-7b-sft-chat-custom-template-dpo/5f4edfdb-a62c-4410-83a3-1ceb15d2e7b0.json b/data/hfopenllm_v2/skymizer/Llama2-7b-sft-chat-custom-template-dpo/5f4edfdb-a62c-4410-83a3-1ceb15d2e7b0.json deleted file mode 100644 index 69c7e0621..000000000 --- a/data/hfopenllm_v2/skymizer/Llama2-7b-sft-chat-custom-template-dpo/5f4edfdb-a62c-4410-83a3-1ceb15d2e7b0.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/skymizer_Llama2-7b-sft-chat-custom-template-dpo/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama2-7b-sft-chat-custom-template-dpo", - "id": "skymizer/Llama2-7b-sft-chat-custom-template-dpo", - "developer": "skymizer", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 6.738 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2353 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3688 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0144 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2391 
- } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4429 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1946 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/someon98/qwen-CoMa-0.5b/aadfae06-73b6-4306-b056-0a733b9bd8f4.json b/data/hfopenllm_v2/someon98/qwen-CoMa-0.5b/aadfae06-73b6-4306-b056-0a733b9bd8f4.json deleted file mode 100644 index 5b36f35c5..000000000 --- a/data/hfopenllm_v2/someon98/qwen-CoMa-0.5b/aadfae06-73b6-4306-b056-0a733b9bd8f4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/someon98_qwen-CoMa-0.5b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "qwen-CoMa-0.5b", - "id": "someon98/qwen-CoMa-0.5b", - "developer": "someon98", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2277 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2953 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0045 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2399 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4046 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1099 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sometimesanotion/ChocoTrio-14B-v1/cfecbfbc-46c3-4dd3-8bd9-afe4cd386973.json b/data/hfopenllm_v2/sometimesanotion/ChocoTrio-14B-v1/cfecbfbc-46c3-4dd3-8bd9-afe4cd386973.json deleted file mode 100644 index 9b58091a6..000000000 --- a/data/hfopenllm_v2/sometimesanotion/ChocoTrio-14B-v1/cfecbfbc-46c3-4dd3-8bd9-afe4cd386973.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sometimesanotion_ChocoTrio-14B-v1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ChocoTrio-14B-v1", - "id": "sometimesanotion/ChocoTrio-14B-v1", - "developer": "sometimesanotion", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7089 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6506 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3973 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3851 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4821 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": 
"TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.537 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sometimesanotion/IF-reasoning-experiment-40/97640dd1-d415-4b56-818c-cdcede3c52fd.json b/data/hfopenllm_v2/sometimesanotion/IF-reasoning-experiment-40/97640dd1-d415-4b56-818c-cdcede3c52fd.json deleted file mode 100644 index 3e9ffdabd..000000000 --- a/data/hfopenllm_v2/sometimesanotion/IF-reasoning-experiment-40/97640dd1-d415-4b56-818c-cdcede3c52fd.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sometimesanotion_IF-reasoning-experiment-40/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "IF-reasoning-experiment-40", - "id": "sometimesanotion/IF-reasoning-experiment-40", - "developer": "sometimesanotion", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.633 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6112 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3716 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.38 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5194 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5025 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sometimesanotion/IF-reasoning-experiment-80/b750c460-ef70-4abf-b77d-118a82039598.json b/data/hfopenllm_v2/sometimesanotion/IF-reasoning-experiment-80/b750c460-ef70-4abf-b77d-118a82039598.json deleted file mode 100644 index f511af28e..000000000 --- a/data/hfopenllm_v2/sometimesanotion/IF-reasoning-experiment-80/b750c460-ef70-4abf-b77d-118a82039598.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sometimesanotion_IF-reasoning-experiment-80/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "IF-reasoning-experiment-80", - "id": "sometimesanotion/IF-reasoning-experiment-80", - "developer": "sometimesanotion", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.383 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5463 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.421 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0989 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2844 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5025 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3368 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/sometimesanotion/KytheraMix-7B-v0.2/f4c20519-9e33-4698-a17a-07e5fe7d2707.json b/data/hfopenllm_v2/sometimesanotion/KytheraMix-7B-v0.2/f4c20519-9e33-4698-a17a-07e5fe7d2707.json deleted file mode 100644 index 99248458d..000000000 --- a/data/hfopenllm_v2/sometimesanotion/KytheraMix-7B-v0.2/f4c20519-9e33-4698-a17a-07e5fe7d2707.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sometimesanotion_KytheraMix-7B-v0.2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "KytheraMix-7B-v0.2", - "id": "sometimesanotion/KytheraMix-7B-v0.2", - "developer": "sometimesanotion", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.613 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6129 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5635 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2923 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3356 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4594 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4505 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sometimesanotion/Lamarck-14B-v0.1-experimental/0f204733-55b4-4c06-bd12-dbc2e2593abd.json 
b/data/hfopenllm_v2/sometimesanotion/Lamarck-14B-v0.1-experimental/0f204733-55b4-4c06-bd12-dbc2e2593abd.json deleted file mode 100644 index f40410446..000000000 --- a/data/hfopenllm_v2/sometimesanotion/Lamarck-14B-v0.1-experimental/0f204733-55b4-4c06-bd12-dbc2e2593abd.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sometimesanotion_Lamarck-14B-v0.1-experimental/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Lamarck-14B-v0.1-experimental", - "id": "sometimesanotion/Lamarck-14B-v0.1-experimental", - "developer": "sometimesanotion", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5354 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6583 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.358 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3817 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4728 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5408 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sometimesanotion/Lamarck-14B-v0.3/0bb226ed-fe88-4678-9b50-f77883ceb708.json b/data/hfopenllm_v2/sometimesanotion/Lamarck-14B-v0.3/0bb226ed-fe88-4678-9b50-f77883ceb708.json deleted file mode 100644 index 27f7815dc..000000000 --- 
a/data/hfopenllm_v2/sometimesanotion/Lamarck-14B-v0.3/0bb226ed-fe88-4678-9b50-f77883ceb708.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sometimesanotion_Lamarck-14B-v0.3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Lamarck-14B-v0.3", - "id": "sometimesanotion/Lamarck-14B-v0.3", - "developer": "sometimesanotion", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5032 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6611 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3406 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3884 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4688 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5411 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sometimesanotion/Lamarck-14B-v0.4-Qwenvergence/fb297e45-9e14-4853-8384-75c187b28a9b.json b/data/hfopenllm_v2/sometimesanotion/Lamarck-14B-v0.4-Qwenvergence/fb297e45-9e14-4853-8384-75c187b28a9b.json deleted file mode 100644 index 96418c26f..000000000 --- a/data/hfopenllm_v2/sometimesanotion/Lamarck-14B-v0.4-Qwenvergence/fb297e45-9e14-4853-8384-75c187b28a9b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/sometimesanotion_Lamarck-14B-v0.4-Qwenvergence/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Lamarck-14B-v0.4-Qwenvergence", - "id": "sometimesanotion/Lamarck-14B-v0.4-Qwenvergence", - "developer": "sometimesanotion", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4906 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6535 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3399 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3784 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4847 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5406 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sometimesanotion/Lamarck-14B-v0.6-002-model_stock/4f6eba27-2ab4-4b33-9568-814d15fbd6b9.json b/data/hfopenllm_v2/sometimesanotion/Lamarck-14B-v0.6-002-model_stock/4f6eba27-2ab4-4b33-9568-814d15fbd6b9.json deleted file mode 100644 index d6ddf3a3c..000000000 --- a/data/hfopenllm_v2/sometimesanotion/Lamarck-14B-v0.6-002-model_stock/4f6eba27-2ab4-4b33-9568-814d15fbd6b9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sometimesanotion_Lamarck-14B-v0.6-002-model_stock/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - 
"source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Lamarck-14B-v0.6-002-model_stock", - "id": "sometimesanotion/Lamarck-14B-v0.6-002-model_stock", - "developer": "sometimesanotion", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6692 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6143 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3776 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3742 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.518 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5054 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sometimesanotion/Lamarck-14B-v0.6-model_stock/c3bc3d69-a987-4dd0-b6a5-e0ecc50034fb.json b/data/hfopenllm_v2/sometimesanotion/Lamarck-14B-v0.6-model_stock/c3bc3d69-a987-4dd0-b6a5-e0ecc50034fb.json deleted file mode 100644 index 509a05a43..000000000 --- a/data/hfopenllm_v2/sometimesanotion/Lamarck-14B-v0.6-model_stock/c3bc3d69-a987-4dd0-b6a5-e0ecc50034fb.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sometimesanotion_Lamarck-14B-v0.6-model_stock/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - 
"evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Lamarck-14B-v0.6-model_stock", - "id": "sometimesanotion/Lamarck-14B-v0.6-model_stock", - "developer": "sometimesanotion", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.679 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6269 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4245 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3842 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5007 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5198 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sometimesanotion/Lamarck-14B-v0.6/5d02ba78-cf8b-44ee-a1b3-e51ecf437d89.json b/data/hfopenllm_v2/sometimesanotion/Lamarck-14B-v0.6/5d02ba78-cf8b-44ee-a1b3-e51ecf437d89.json deleted file mode 100644 index 0f6ebd239..000000000 --- a/data/hfopenllm_v2/sometimesanotion/Lamarck-14B-v0.6/5d02ba78-cf8b-44ee-a1b3-e51ecf437d89.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sometimesanotion_Lamarck-14B-v0.6/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Lamarck-14B-v0.6", - "id": "sometimesanotion/Lamarck-14B-v0.6", - "developer": "sometimesanotion", - "inference_platform": 
"unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6973 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.646 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4041 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3893 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4847 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.54 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sometimesanotion/Lamarck-14B-v0.7-Fusion/4a43fa67-2438-4c2a-b17b-9d2f221e5a86.json b/data/hfopenllm_v2/sometimesanotion/Lamarck-14B-v0.7-Fusion/4a43fa67-2438-4c2a-b17b-9d2f221e5a86.json deleted file mode 100644 index 1e00d79b8..000000000 --- a/data/hfopenllm_v2/sometimesanotion/Lamarck-14B-v0.7-Fusion/4a43fa67-2438-4c2a-b17b-9d2f221e5a86.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sometimesanotion_Lamarck-14B-v0.7-Fusion/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Lamarck-14B-v0.7-Fusion", - "id": "sometimesanotion/Lamarck-14B-v0.7-Fusion", - "developer": "sometimesanotion", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - 
"evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6821 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6544 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4041 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.401 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4991 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5391 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sometimesanotion/Lamarck-14B-v0.7-rc1/2c044767-1169-48c6-9e37-e9d1e35f4cfe.json b/data/hfopenllm_v2/sometimesanotion/Lamarck-14B-v0.7-rc1/2c044767-1169-48c6-9e37-e9d1e35f4cfe.json deleted file mode 100644 index 0e64b71d7..000000000 --- a/data/hfopenllm_v2/sometimesanotion/Lamarck-14B-v0.7-rc1/2c044767-1169-48c6-9e37-e9d1e35f4cfe.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sometimesanotion_Lamarck-14B-v0.7-rc1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Lamarck-14B-v0.7-rc1", - "id": "sometimesanotion/Lamarck-14B-v0.7-rc1", - "developer": "sometimesanotion", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy 
on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7305 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6486 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3852 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3893 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4715 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5416 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sometimesanotion/Lamarck-14B-v0.7-rc4/bad67b35-d9ef-417a-955b-9c33e87cb927.json b/data/hfopenllm_v2/sometimesanotion/Lamarck-14B-v0.7-rc4/bad67b35-d9ef-417a-955b-9c33e87cb927.json deleted file mode 100644 index ebc21eef2..000000000 --- a/data/hfopenllm_v2/sometimesanotion/Lamarck-14B-v0.7-rc4/bad67b35-d9ef-417a-955b-9c33e87cb927.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sometimesanotion_Lamarck-14B-v0.7-rc4/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Lamarck-14B-v0.7-rc4", - "id": "sometimesanotion/Lamarck-14B-v0.7-rc4", - "developer": "sometimesanotion", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7211 - } - }, - { - "evaluation_name": "BBH", - 
"source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.651 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4026 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3893 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4912 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.54 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sometimesanotion/LamarckInfusion-14B-v1/60eaa315-f489-405d-a67d-7f1312e90cab.json b/data/hfopenllm_v2/sometimesanotion/LamarckInfusion-14B-v1/60eaa315-f489-405d-a67d-7f1312e90cab.json deleted file mode 100644 index 1c1c6efe8..000000000 --- a/data/hfopenllm_v2/sometimesanotion/LamarckInfusion-14B-v1/60eaa315-f489-405d-a67d-7f1312e90cab.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sometimesanotion_LamarckInfusion-14B-v1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "LamarckInfusion-14B-v1", - "id": "sometimesanotion/LamarckInfusion-14B-v1", - "developer": "sometimesanotion", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7198 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6539 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4169 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3909 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4899 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5376 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sometimesanotion/LamarckInfusion-14B-v2-hi/50de312a-293d-41a4-8bee-4feb0c148b90.json b/data/hfopenllm_v2/sometimesanotion/LamarckInfusion-14B-v2-hi/50de312a-293d-41a4-8bee-4feb0c148b90.json deleted file mode 100644 index a6c8039e6..000000000 --- a/data/hfopenllm_v2/sometimesanotion/LamarckInfusion-14B-v2-hi/50de312a-293d-41a4-8bee-4feb0c148b90.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sometimesanotion_LamarckInfusion-14B-v2-hi/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "LamarckInfusion-14B-v2-hi", - "id": "sometimesanotion/LamarckInfusion-14B-v2-hi", - "developer": "sometimesanotion", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6855 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6555 - } - }, - { - 
"evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.423 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3884 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4847 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5405 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sometimesanotion/LamarckInfusion-14B-v2-lo/56f24cac-394c-4439-8f2e-8270e7519bda.json b/data/hfopenllm_v2/sometimesanotion/LamarckInfusion-14B-v2-lo/56f24cac-394c-4439-8f2e-8270e7519bda.json deleted file mode 100644 index b52d3f3f1..000000000 --- a/data/hfopenllm_v2/sometimesanotion/LamarckInfusion-14B-v2-lo/56f24cac-394c-4439-8f2e-8270e7519bda.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sometimesanotion_LamarckInfusion-14B-v2-lo/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "LamarckInfusion-14B-v2-lo", - "id": "sometimesanotion/LamarckInfusion-14B-v2-lo", - "developer": "sometimesanotion", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6788 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6528 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" 
- }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4237 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3859 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4991 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5397 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sometimesanotion/LamarckInfusion-14B-v2/8efa1423-0a39-4674-a94d-3d92448010d6.json b/data/hfopenllm_v2/sometimesanotion/LamarckInfusion-14B-v2/8efa1423-0a39-4674-a94d-3d92448010d6.json deleted file mode 100644 index c108f183f..000000000 --- a/data/hfopenllm_v2/sometimesanotion/LamarckInfusion-14B-v2/8efa1423-0a39-4674-a94d-3d92448010d6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sometimesanotion_LamarckInfusion-14B-v2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "LamarckInfusion-14B-v2", - "id": "sometimesanotion/LamarckInfusion-14B-v2", - "developer": "sometimesanotion", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6812 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6564 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, 
- "score_details": { - "score": 0.4388 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3876 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4993 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5416 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sometimesanotion/LamarckInfusion-14B-v3/350b3491-cba8-46b4-a07f-3d1277270530.json b/data/hfopenllm_v2/sometimesanotion/LamarckInfusion-14B-v3/350b3491-cba8-46b4-a07f-3d1277270530.json deleted file mode 100644 index 54ff84883..000000000 --- a/data/hfopenllm_v2/sometimesanotion/LamarckInfusion-14B-v3/350b3491-cba8-46b4-a07f-3d1277270530.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sometimesanotion_LamarckInfusion-14B-v3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "LamarckInfusion-14B-v3", - "id": "sometimesanotion/LamarckInfusion-14B-v3", - "developer": "sometimesanotion", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7131 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6518 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4124 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": 
"Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3867 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.482 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5407 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sometimesanotion/Qwen-14B-ProseStock-v4/0741ead7-24f3-49b0-9967-f726df84f78a.json b/data/hfopenllm_v2/sometimesanotion/Qwen-14B-ProseStock-v4/0741ead7-24f3-49b0-9967-f726df84f78a.json deleted file mode 100644 index 37d385cfe..000000000 --- a/data/hfopenllm_v2/sometimesanotion/Qwen-14B-ProseStock-v4/0741ead7-24f3-49b0-9967-f726df84f78a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sometimesanotion_Qwen-14B-ProseStock-v4/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen-14B-ProseStock-v4", - "id": "sometimesanotion/Qwen-14B-ProseStock-v4", - "developer": "sometimesanotion", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4942 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6498 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.364 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - 
}, - "score_details": { - "score": 0.3884 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4938 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5386 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sometimesanotion/Qwen-2.5-14B-Virmarckeoso/1ea4d10e-e099-4967-8c43-e84acaeb40be.json b/data/hfopenllm_v2/sometimesanotion/Qwen-2.5-14B-Virmarckeoso/1ea4d10e-e099-4967-8c43-e84acaeb40be.json deleted file mode 100644 index fb27ac194..000000000 --- a/data/hfopenllm_v2/sometimesanotion/Qwen-2.5-14B-Virmarckeoso/1ea4d10e-e099-4967-8c43-e84acaeb40be.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sometimesanotion_Qwen-2.5-14B-Virmarckeoso/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen-2.5-14B-Virmarckeoso", - "id": "sometimesanotion/Qwen-2.5-14B-Virmarckeoso", - "developer": "sometimesanotion", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4813 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.657 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3565 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3792 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - 
"hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4794 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5377 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sometimesanotion/Qwen2.5-14B-Vimarckoso-v2/6c78d9f7-a61e-4f65-ac57-61597f735541.json b/data/hfopenllm_v2/sometimesanotion/Qwen2.5-14B-Vimarckoso-v2/6c78d9f7-a61e-4f65-ac57-61597f735541.json deleted file mode 100644 index 916ea88f9..000000000 --- a/data/hfopenllm_v2/sometimesanotion/Qwen2.5-14B-Vimarckoso-v2/6c78d9f7-a61e-4f65-ac57-61597f735541.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sometimesanotion_Qwen2.5-14B-Vimarckoso-v2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-14B-Vimarckoso-v2", - "id": "sometimesanotion/Qwen2.5-14B-Vimarckoso-v2", - "developer": "sometimesanotion", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4505 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.655 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.358 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3826 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4819 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.538 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sometimesanotion/Qwen2.5-14B-Vimarckoso-v3-IF-Variant/e9bcfb1f-c688-4e7a-918a-e697adaf7aa5.json b/data/hfopenllm_v2/sometimesanotion/Qwen2.5-14B-Vimarckoso-v3-IF-Variant/e9bcfb1f-c688-4e7a-918a-e697adaf7aa5.json deleted file mode 100644 index 5757a1b7a..000000000 --- a/data/hfopenllm_v2/sometimesanotion/Qwen2.5-14B-Vimarckoso-v3-IF-Variant/e9bcfb1f-c688-4e7a-918a-e697adaf7aa5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sometimesanotion_Qwen2.5-14B-Vimarckoso-v3-IF-Variant/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-14B-Vimarckoso-v3-IF-Variant", - "id": "sometimesanotion/Qwen2.5-14B-Vimarckoso-v3-IF-Variant", - "developer": "sometimesanotion", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6413 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5521 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2545 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3473 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5319 - } - }, - { - 
"evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4589 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sometimesanotion/Qwen2.5-14B-Vimarckoso-v3-Prose01/153cfe7f-c27a-40b8-b8d2-54351f26f583.json b/data/hfopenllm_v2/sometimesanotion/Qwen2.5-14B-Vimarckoso-v3-Prose01/153cfe7f-c27a-40b8-b8d2-54351f26f583.json deleted file mode 100644 index 874b6da06..000000000 --- a/data/hfopenllm_v2/sometimesanotion/Qwen2.5-14B-Vimarckoso-v3-Prose01/153cfe7f-c27a-40b8-b8d2-54351f26f583.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sometimesanotion_Qwen2.5-14B-Vimarckoso-v3-Prose01/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-14B-Vimarckoso-v3-Prose01", - "id": "sometimesanotion/Qwen2.5-14B-Vimarckoso-v3-Prose01", - "developer": "sometimesanotion", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6872 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6359 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3995 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3867 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4807 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - 
"hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5275 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sometimesanotion/Qwen2.5-14B-Vimarckoso-v3-model_stock/b58372cd-5d55-4f42-a5da-2970e55b44b0.json b/data/hfopenllm_v2/sometimesanotion/Qwen2.5-14B-Vimarckoso-v3-model_stock/b58372cd-5d55-4f42-a5da-2970e55b44b0.json deleted file mode 100644 index b0e9a12fb..000000000 --- a/data/hfopenllm_v2/sometimesanotion/Qwen2.5-14B-Vimarckoso-v3-model_stock/b58372cd-5d55-4f42-a5da-2970e55b44b0.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sometimesanotion_Qwen2.5-14B-Vimarckoso-v3-model_stock/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-14B-Vimarckoso-v3-model_stock", - "id": "sometimesanotion/Qwen2.5-14B-Vimarckoso-v3-model_stock", - "developer": "sometimesanotion", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7162 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6421 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4245 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.38 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4781 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on 
MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5316 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sometimesanotion/Qwen2.5-14B-Vimarckoso-v3/34a028ac-2002-480c-a1af-5b945ffe872e.json b/data/hfopenllm_v2/sometimesanotion/Qwen2.5-14B-Vimarckoso-v3/34a028ac-2002-480c-a1af-5b945ffe872e.json deleted file mode 100644 index f888babcd..000000000 --- a/data/hfopenllm_v2/sometimesanotion/Qwen2.5-14B-Vimarckoso-v3/34a028ac-2002-480c-a1af-5b945ffe872e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sometimesanotion_Qwen2.5-14B-Vimarckoso-v3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-14B-Vimarckoso-v3", - "id": "sometimesanotion/Qwen2.5-14B-Vimarckoso-v3", - "developer": "sometimesanotion", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7257 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6415 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4003 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.38 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4807 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5343 - } - } - ] -} \ No 
newline at end of file diff --git a/data/hfopenllm_v2/sometimesanotion/Qwen2.5-14B-Vimarckoso/065ffc51-154c-4a93-a342-0dd476fda473.json b/data/hfopenllm_v2/sometimesanotion/Qwen2.5-14B-Vimarckoso/065ffc51-154c-4a93-a342-0dd476fda473.json deleted file mode 100644 index c071dc2d8..000000000 --- a/data/hfopenllm_v2/sometimesanotion/Qwen2.5-14B-Vimarckoso/065ffc51-154c-4a93-a342-0dd476fda473.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sometimesanotion_Qwen2.5-14B-Vimarckoso/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-14B-Vimarckoso", - "id": "sometimesanotion/Qwen2.5-14B-Vimarckoso", - "developer": "sometimesanotion", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4574 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6446 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3384 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3926 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4859 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5329 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sometimesanotion/Qwen2.5-7B-Gordion-v0.1-Prose/ebc74f4f-157d-4ee4-8b99-9fb5b685afd5.json 
b/data/hfopenllm_v2/sometimesanotion/Qwen2.5-7B-Gordion-v0.1-Prose/ebc74f4f-157d-4ee4-8b99-9fb5b685afd5.json deleted file mode 100644 index 580c363ac..000000000 --- a/data/hfopenllm_v2/sometimesanotion/Qwen2.5-7B-Gordion-v0.1-Prose/ebc74f4f-157d-4ee4-8b99-9fb5b685afd5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sometimesanotion_Qwen2.5-7B-Gordion-v0.1-Prose/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-7B-Gordion-v0.1-Prose", - "id": "sometimesanotion/Qwen2.5-7B-Gordion-v0.1-Prose", - "developer": "sometimesanotion", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.613 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5347 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5599 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2893 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3205 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4502 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4525 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sometimesanotion/Qwen2.5-7B-Gordion-v0.1-Reason/91004d26-7b8b-4c0a-bd8c-8880654dc93a.json b/data/hfopenllm_v2/sometimesanotion/Qwen2.5-7B-Gordion-v0.1-Reason/91004d26-7b8b-4c0a-bd8c-8880654dc93a.json deleted file mode 100644 index 
8e9f53078..000000000 --- a/data/hfopenllm_v2/sometimesanotion/Qwen2.5-7B-Gordion-v0.1-Reason/91004d26-7b8b-4c0a-bd8c-8880654dc93a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sometimesanotion_Qwen2.5-7B-Gordion-v0.1-Reason/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-7B-Gordion-v0.1-Reason", - "id": "sometimesanotion/Qwen2.5-7B-Gordion-v0.1-Reason", - "developer": "sometimesanotion", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.613 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4917 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5498 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2621 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3406 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4434 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4307 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sometimesanotion/Qwen2.5-7B-Gordion-v0.1/5eb1aa92-a031-40d4-ad64-552075dae68a.json b/data/hfopenllm_v2/sometimesanotion/Qwen2.5-7B-Gordion-v0.1/5eb1aa92-a031-40d4-ad64-552075dae68a.json deleted file mode 100644 index 806d212aa..000000000 --- a/data/hfopenllm_v2/sometimesanotion/Qwen2.5-7B-Gordion-v0.1/5eb1aa92-a031-40d4-ad64-552075dae68a.json +++ /dev/null @@ -1,132 
+0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sometimesanotion_Qwen2.5-7B-Gordion-v0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-7B-Gordion-v0.1", - "id": "sometimesanotion/Qwen2.5-7B-Gordion-v0.1", - "developer": "sometimesanotion", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.613 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7482 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5524 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2915 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3079 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4016 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.43 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sometimesanotion/Qwentessential-14B-v1/3ebc147d-58f2-4605-a011-a71c591fac0e.json b/data/hfopenllm_v2/sometimesanotion/Qwentessential-14B-v1/3ebc147d-58f2-4605-a011-a71c591fac0e.json deleted file mode 100644 index 22b1d0f94..000000000 --- a/data/hfopenllm_v2/sometimesanotion/Qwentessential-14B-v1/3ebc147d-58f2-4605-a011-a71c591fac0e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sometimesanotion_Qwentessential-14B-v1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - 
"source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwentessential-14B-v1", - "id": "sometimesanotion/Qwentessential-14B-v1", - "developer": "sometimesanotion", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6279 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6545 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4071 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3876 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4873 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5381 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sometimesanotion/Qwentinuum-14B-v013/01795776-e909-46d3-8b6c-0989334e3d0e.json b/data/hfopenllm_v2/sometimesanotion/Qwentinuum-14B-v013/01795776-e909-46d3-8b6c-0989334e3d0e.json deleted file mode 100644 index 29ab8f101..000000000 --- a/data/hfopenllm_v2/sometimesanotion/Qwentinuum-14B-v013/01795776-e909-46d3-8b6c-0989334e3d0e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sometimesanotion_Qwentinuum-14B-v013/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { 
- "name": "Qwentinuum-14B-v013", - "id": "sometimesanotion/Qwentinuum-14B-v013", - "developer": "sometimesanotion", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6711 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6087 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3708 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3574 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5154 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4991 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sometimesanotion/Qwentinuum-14B-v1/00dffa94-31f9-4b5c-b032-03dd20fc2e8d.json b/data/hfopenllm_v2/sometimesanotion/Qwentinuum-14B-v1/00dffa94-31f9-4b5c-b032-03dd20fc2e8d.json deleted file mode 100644 index a544e7948..000000000 --- a/data/hfopenllm_v2/sometimesanotion/Qwentinuum-14B-v1/00dffa94-31f9-4b5c-b032-03dd20fc2e8d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sometimesanotion_Qwentinuum-14B-v1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwentinuum-14B-v1", - "id": "sometimesanotion/Qwentinuum-14B-v1", - "developer": "sometimesanotion", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - 
"architecture": "Qwen2ForCausalLM", - "params_billions": 14.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5032 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6573 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3603 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3826 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4781 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.541 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sometimesanotion/Qwentinuum-14B-v2/736249d0-cea9-46c6-9677-ecae4b410af4.json b/data/hfopenllm_v2/sometimesanotion/Qwentinuum-14B-v2/736249d0-cea9-46c6-9677-ecae4b410af4.json deleted file mode 100644 index f73c3f45f..000000000 --- a/data/hfopenllm_v2/sometimesanotion/Qwentinuum-14B-v2/736249d0-cea9-46c6-9677-ecae4b410af4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sometimesanotion_Qwentinuum-14B-v2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwentinuum-14B-v2", - "id": "sometimesanotion/Qwentinuum-14B-v2", - "developer": "sometimesanotion", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - 
"hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5378 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6555 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3754 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3884 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4714 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5409 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sometimesanotion/Qwentinuum-14B-v3/ef602cfe-3453-4189-b583-292cf05421d1.json b/data/hfopenllm_v2/sometimesanotion/Qwentinuum-14B-v3/ef602cfe-3453-4189-b583-292cf05421d1.json deleted file mode 100644 index fadcebc67..000000000 --- a/data/hfopenllm_v2/sometimesanotion/Qwentinuum-14B-v3/ef602cfe-3453-4189-b583-292cf05421d1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sometimesanotion_Qwentinuum-14B-v3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwentinuum-14B-v3", - "id": "sometimesanotion/Qwentinuum-14B-v3", - "developer": "sometimesanotion", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.6158 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6539 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3535 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3876 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.486 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5413 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sometimesanotion/Qwentinuum-14B-v5/559af2c1-deca-4c35-b83a-004c22ac958a.json b/data/hfopenllm_v2/sometimesanotion/Qwentinuum-14B-v5/559af2c1-deca-4c35-b83a-004c22ac958a.json deleted file mode 100644 index a63f22447..000000000 --- a/data/hfopenllm_v2/sometimesanotion/Qwentinuum-14B-v5/559af2c1-deca-4c35-b83a-004c22ac958a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sometimesanotion_Qwentinuum-14B-v5/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwentinuum-14B-v5", - "id": "sometimesanotion/Qwentinuum-14B-v5", - "developer": "sometimesanotion", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6286 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.655 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3444 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3876 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4874 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5418 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sometimesanotion/Qwentinuum-14B-v6-Prose/8d66d895-626a-477f-91b6-2195f35aacb3.json b/data/hfopenllm_v2/sometimesanotion/Qwentinuum-14B-v6-Prose/8d66d895-626a-477f-91b6-2195f35aacb3.json deleted file mode 100644 index 6b40b100b..000000000 --- a/data/hfopenllm_v2/sometimesanotion/Qwentinuum-14B-v6-Prose/8d66d895-626a-477f-91b6-2195f35aacb3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sometimesanotion_Qwentinuum-14B-v6-Prose/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwentinuum-14B-v6-Prose", - "id": "sometimesanotion/Qwentinuum-14B-v6-Prose", - "developer": "sometimesanotion", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5643 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 
0.6545 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3701 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3884 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4913 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5392 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sometimesanotion/Qwentinuum-14B-v6/004df803-70da-4e59-b3ad-f210c790f29e.json b/data/hfopenllm_v2/sometimesanotion/Qwentinuum-14B-v6/004df803-70da-4e59-b3ad-f210c790f29e.json deleted file mode 100644 index ff2133b12..000000000 --- a/data/hfopenllm_v2/sometimesanotion/Qwentinuum-14B-v6/004df803-70da-4e59-b3ad-f210c790f29e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sometimesanotion_Qwentinuum-14B-v6/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwentinuum-14B-v6", - "id": "sometimesanotion/Qwentinuum-14B-v6", - "developer": "sometimesanotion", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6304 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6545 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - 
"evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3603 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3867 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.49 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.54 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sometimesanotion/Qwentinuum-14B-v7/bb2972ca-e673-4be5-bc7e-2689adeac3a9.json b/data/hfopenllm_v2/sometimesanotion/Qwentinuum-14B-v7/bb2972ca-e673-4be5-bc7e-2689adeac3a9.json deleted file mode 100644 index 86db2ec7b..000000000 --- a/data/hfopenllm_v2/sometimesanotion/Qwentinuum-14B-v7/bb2972ca-e673-4be5-bc7e-2689adeac3a9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sometimesanotion_Qwentinuum-14B-v7/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwentinuum-14B-v7", - "id": "sometimesanotion/Qwentinuum-14B-v7", - "developer": "sometimesanotion", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6109 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6551 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3573 - } - }, - { - 
"evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3909 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.482 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.541 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sometimesanotion/Qwentinuum-14B-v8/eacf2411-a0ea-41fd-8363-e565fce0f26f.json b/data/hfopenllm_v2/sometimesanotion/Qwentinuum-14B-v8/eacf2411-a0ea-41fd-8363-e565fce0f26f.json deleted file mode 100644 index 14b1b0dfd..000000000 --- a/data/hfopenllm_v2/sometimesanotion/Qwentinuum-14B-v8/eacf2411-a0ea-41fd-8363-e565fce0f26f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sometimesanotion_Qwentinuum-14B-v8/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwentinuum-14B-v8", - "id": "sometimesanotion/Qwentinuum-14B-v8", - "developer": "sometimesanotion", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5412 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6534 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3912 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3834 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4873 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5412 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sometimesanotion/Qwentinuum-14B-v9/4eefe3cd-ff42-4d4c-89c6-c3e48d8c85e9.json b/data/hfopenllm_v2/sometimesanotion/Qwentinuum-14B-v9/4eefe3cd-ff42-4d4c-89c6-c3e48d8c85e9.json deleted file mode 100644 index 470b2bb89..000000000 --- a/data/hfopenllm_v2/sometimesanotion/Qwentinuum-14B-v9/4eefe3cd-ff42-4d4c-89c6-c3e48d8c85e9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sometimesanotion_Qwentinuum-14B-v9/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwentinuum-14B-v9", - "id": "sometimesanotion/Qwentinuum-14B-v9", - "developer": "sometimesanotion", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5107 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.658 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3482 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3859 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": 
"MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4781 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5421 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sometimesanotion/Qwenvergence-14B-qv256/f19dab38-48ed-438e-8a62-86e4d111f6c8.json b/data/hfopenllm_v2/sometimesanotion/Qwenvergence-14B-qv256/f19dab38-48ed-438e-8a62-86e4d111f6c8.json deleted file mode 100644 index 17591ed25..000000000 --- a/data/hfopenllm_v2/sometimesanotion/Qwenvergence-14B-qv256/f19dab38-48ed-438e-8a62-86e4d111f6c8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sometimesanotion_Qwenvergence-14B-qv256/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwenvergence-14B-qv256", - "id": "sometimesanotion/Qwenvergence-14B-qv256", - "developer": "sometimesanotion", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7006 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6312 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3897 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3784 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4926 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5178 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sometimesanotion/Qwenvergence-14B-v0.6-004-model_stock/ff4b6d28-62e2-4671-8df9-690ce7f13f0b.json b/data/hfopenllm_v2/sometimesanotion/Qwenvergence-14B-v0.6-004-model_stock/ff4b6d28-62e2-4671-8df9-690ce7f13f0b.json deleted file mode 100644 index 10044a8d2..000000000 --- a/data/hfopenllm_v2/sometimesanotion/Qwenvergence-14B-v0.6-004-model_stock/ff4b6d28-62e2-4671-8df9-690ce7f13f0b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sometimesanotion_Qwenvergence-14B-v0.6-004-model_stock/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwenvergence-14B-v0.6-004-model_stock", - "id": "sometimesanotion/Qwenvergence-14B-v0.6-004-model_stock", - "developer": "sometimesanotion", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.686 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6249 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4094 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3834 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5033 - 
} - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5193 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sometimesanotion/Qwenvergence-14B-v10/9c05a7e4-f495-41d0-a7f0-1959e7434ba2.json b/data/hfopenllm_v2/sometimesanotion/Qwenvergence-14B-v10/9c05a7e4-f495-41d0-a7f0-1959e7434ba2.json deleted file mode 100644 index 1827b725f..000000000 --- a/data/hfopenllm_v2/sometimesanotion/Qwenvergence-14B-v10/9c05a7e4-f495-41d0-a7f0-1959e7434ba2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sometimesanotion_Qwenvergence-14B-v10/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwenvergence-14B-v10", - "id": "sometimesanotion/Qwenvergence-14B-v10", - "developer": "sometimesanotion", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6757 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6316 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4789 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3792 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4991 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5239 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sometimesanotion/Qwenvergence-14B-v11/404e3d61-26d3-4f95-9847-064f0c7c6970.json b/data/hfopenllm_v2/sometimesanotion/Qwenvergence-14B-v11/404e3d61-26d3-4f95-9847-064f0c7c6970.json deleted file mode 100644 index df64804c6..000000000 --- a/data/hfopenllm_v2/sometimesanotion/Qwenvergence-14B-v11/404e3d61-26d3-4f95-9847-064f0c7c6970.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sometimesanotion_Qwenvergence-14B-v11/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwenvergence-14B-v11", - "id": "sometimesanotion/Qwenvergence-14B-v11", - "developer": "sometimesanotion", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7192 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6368 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4645 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3725 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4754 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5327 - } - } 
- ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sometimesanotion/Qwenvergence-14B-v12-Prose-DS/0b4574f2-1b71-427f-9923-17db449be191.json b/data/hfopenllm_v2/sometimesanotion/Qwenvergence-14B-v12-Prose-DS/0b4574f2-1b71-427f-9923-17db449be191.json deleted file mode 100644 index 12b28bc82..000000000 --- a/data/hfopenllm_v2/sometimesanotion/Qwenvergence-14B-v12-Prose-DS/0b4574f2-1b71-427f-9923-17db449be191.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sometimesanotion_Qwenvergence-14B-v12-Prose-DS/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwenvergence-14B-v12-Prose-DS", - "id": "sometimesanotion/Qwenvergence-14B-v12-Prose-DS", - "developer": "sometimesanotion", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6173 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6507 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4305 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3943 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5151 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5369 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/sometimesanotion/Qwenvergence-14B-v12-Prose/775b88cd-98e8-4d93-acca-e294f68f2da2.json b/data/hfopenllm_v2/sometimesanotion/Qwenvergence-14B-v12-Prose/775b88cd-98e8-4d93-acca-e294f68f2da2.json deleted file mode 100644 index c5c86aec1..000000000 --- a/data/hfopenllm_v2/sometimesanotion/Qwenvergence-14B-v12-Prose/775b88cd-98e8-4d93-acca-e294f68f2da2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sometimesanotion_Qwenvergence-14B-v12-Prose/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwenvergence-14B-v12-Prose", - "id": "sometimesanotion/Qwenvergence-14B-v12-Prose", - "developer": "sometimesanotion", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5412 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6504 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3535 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3867 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4991 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5381 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sometimesanotion/Qwenvergence-14B-v13-Prose-DS/89464568-47cb-4659-af37-8b061d3f0c8c.json 
b/data/hfopenllm_v2/sometimesanotion/Qwenvergence-14B-v13-Prose-DS/89464568-47cb-4659-af37-8b061d3f0c8c.json deleted file mode 100644 index 84c78b460..000000000 --- a/data/hfopenllm_v2/sometimesanotion/Qwenvergence-14B-v13-Prose-DS/89464568-47cb-4659-af37-8b061d3f0c8c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sometimesanotion_Qwenvergence-14B-v13-Prose-DS/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwenvergence-14B-v13-Prose-DS", - "id": "sometimesanotion/Qwenvergence-14B-v13-Prose-DS", - "developer": "sometimesanotion", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7178 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6405 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.386 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3834 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4927 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5349 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sometimesanotion/Qwenvergence-14B-v15-Prose-MS/9fad9d73-acbf-4ffc-886c-551c1fe1ed45.json b/data/hfopenllm_v2/sometimesanotion/Qwenvergence-14B-v15-Prose-MS/9fad9d73-acbf-4ffc-886c-551c1fe1ed45.json deleted file mode 100644 index 
5ec413f82..000000000 --- a/data/hfopenllm_v2/sometimesanotion/Qwenvergence-14B-v15-Prose-MS/9fad9d73-acbf-4ffc-886c-551c1fe1ed45.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sometimesanotion_Qwenvergence-14B-v15-Prose-MS/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwenvergence-14B-v15-Prose-MS", - "id": "sometimesanotion/Qwenvergence-14B-v15-Prose-MS", - "developer": "sometimesanotion", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5032 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.655 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3633 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3951 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4913 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5393 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sometimesanotion/Qwenvergence-14B-v2-Prose/c1882335-0df5-4df2-bfa1-c16126c328fb.json b/data/hfopenllm_v2/sometimesanotion/Qwenvergence-14B-v2-Prose/c1882335-0df5-4df2-bfa1-c16126c328fb.json deleted file mode 100644 index a6c3cb27c..000000000 --- a/data/hfopenllm_v2/sometimesanotion/Qwenvergence-14B-v2-Prose/c1882335-0df5-4df2-bfa1-c16126c328fb.json +++ /dev/null @@ -1,132 
+0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sometimesanotion_Qwenvergence-14B-v2-Prose/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwenvergence-14B-v2-Prose", - "id": "sometimesanotion/Qwenvergence-14B-v2-Prose", - "developer": "sometimesanotion", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4705 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6519 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3557 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3935 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4926 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5372 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sometimesanotion/Qwenvergence-14B-v3-Prose/291471ed-3b7c-4bd4-91bb-c27cd74ec460.json b/data/hfopenllm_v2/sometimesanotion/Qwenvergence-14B-v3-Prose/291471ed-3b7c-4bd4-91bb-c27cd74ec460.json deleted file mode 100644 index 556f2a8cf..000000000 --- a/data/hfopenllm_v2/sometimesanotion/Qwenvergence-14B-v3-Prose/291471ed-3b7c-4bd4-91bb-c27cd74ec460.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sometimesanotion_Qwenvergence-14B-v3-Prose/1770682486.623709", - "retrieved_timestamp": 
"1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwenvergence-14B-v3-Prose", - "id": "sometimesanotion/Qwenvergence-14B-v3-Prose", - "developer": "sometimesanotion", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4918 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6513 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3648 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3951 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4939 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.537 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sometimesanotion/Qwenvergence-14B-v3-Reason/53565fe4-0368-477b-9916-ac9a4b8a9c7b.json b/data/hfopenllm_v2/sometimesanotion/Qwenvergence-14B-v3-Reason/53565fe4-0368-477b-9916-ac9a4b8a9c7b.json deleted file mode 100644 index 0e8401765..000000000 --- a/data/hfopenllm_v2/sometimesanotion/Qwenvergence-14B-v3-Reason/53565fe4-0368-477b-9916-ac9a4b8a9c7b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sometimesanotion_Qwenvergence-14B-v3-Reason/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - 
"evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwenvergence-14B-v3-Reason", - "id": "sometimesanotion/Qwenvergence-14B-v3-Reason", - "developer": "sometimesanotion", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5278 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6557 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3119 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3842 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4754 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5396 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sometimesanotion/Qwenvergence-14B-v3-Reason/f6cb5e9d-c4c9-44a2-9adf-7fa5639d84d9.json b/data/hfopenllm_v2/sometimesanotion/Qwenvergence-14B-v3-Reason/f6cb5e9d-c4c9-44a2-9adf-7fa5639d84d9.json deleted file mode 100644 index 49b6488af..000000000 --- a/data/hfopenllm_v2/sometimesanotion/Qwenvergence-14B-v3-Reason/f6cb5e9d-c4c9-44a2-9adf-7fa5639d84d9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sometimesanotion_Qwenvergence-14B-v3-Reason/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwenvergence-14B-v3-Reason", - "id": "sometimesanotion/Qwenvergence-14B-v3-Reason", - 
"developer": "sometimesanotion", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5367 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6561 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.358 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3867 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.474 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5395 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sometimesanotion/Qwenvergence-14B-v3/e51fee25-7648-49d9-a8da-b8dbc68a722b.json b/data/hfopenllm_v2/sometimesanotion/Qwenvergence-14B-v3/e51fee25-7648-49d9-a8da-b8dbc68a722b.json deleted file mode 100644 index 6b7f693f2..000000000 --- a/data/hfopenllm_v2/sometimesanotion/Qwenvergence-14B-v3/e51fee25-7648-49d9-a8da-b8dbc68a722b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sometimesanotion_Qwenvergence-14B-v3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwenvergence-14B-v3", - "id": "sometimesanotion/Qwenvergence-14B-v3", - "developer": "sometimesanotion", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.0 - } - }, - 
"evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5044 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6548 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3693 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3842 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4886 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5386 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sometimesanotion/Qwenvergence-14B-v6-Prose-model_stock/6acdc96b-cfde-439f-b6b3-a66257b3fcde.json b/data/hfopenllm_v2/sometimesanotion/Qwenvergence-14B-v6-Prose-model_stock/6acdc96b-cfde-439f-b6b3-a66257b3fcde.json deleted file mode 100644 index 1fad180ca..000000000 --- a/data/hfopenllm_v2/sometimesanotion/Qwenvergence-14B-v6-Prose-model_stock/6acdc96b-cfde-439f-b6b3-a66257b3fcde.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sometimesanotion_Qwenvergence-14B-v6-Prose-model_stock/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwenvergence-14B-v6-Prose-model_stock", - "id": "sometimesanotion/Qwenvergence-14B-v6-Prose-model_stock", - "developer": "sometimesanotion", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": 
"IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4811 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.653 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3603 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3935 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4899 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5387 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sometimesanotion/Qwenvergence-14B-v6-Prose/850da8de-ca13-4f15-bb9f-68b910355cfd.json b/data/hfopenllm_v2/sometimesanotion/Qwenvergence-14B-v6-Prose/850da8de-ca13-4f15-bb9f-68b910355cfd.json deleted file mode 100644 index a338fb9a8..000000000 --- a/data/hfopenllm_v2/sometimesanotion/Qwenvergence-14B-v6-Prose/850da8de-ca13-4f15-bb9f-68b910355cfd.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sometimesanotion_Qwenvergence-14B-v6-Prose/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwenvergence-14B-v6-Prose", - "id": "sometimesanotion/Qwenvergence-14B-v6-Prose", - "developer": "sometimesanotion", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": 
false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.599 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6544 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3565 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3884 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4887 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5371 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sometimesanotion/Qwenvergence-14B-v8/542fbb7a-d4eb-4cbf-b63a-4305cb108361.json b/data/hfopenllm_v2/sometimesanotion/Qwenvergence-14B-v8/542fbb7a-d4eb-4cbf-b63a-4305cb108361.json deleted file mode 100644 index fd2e91afc..000000000 --- a/data/hfopenllm_v2/sometimesanotion/Qwenvergence-14B-v8/542fbb7a-d4eb-4cbf-b63a-4305cb108361.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sometimesanotion_Qwenvergence-14B-v8/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwenvergence-14B-v8", - "id": "sometimesanotion/Qwenvergence-14B-v8", - "developer": "sometimesanotion", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5913 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", 
- "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6522 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4048 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3809 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4768 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5435 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sometimesanotion/Qwenvergence-14B-v9/1dbb8206-6a86-4e2c-8ee0-d80fed014a69.json b/data/hfopenllm_v2/sometimesanotion/Qwenvergence-14B-v9/1dbb8206-6a86-4e2c-8ee0-d80fed014a69.json deleted file mode 100644 index e4304c3e3..000000000 --- a/data/hfopenllm_v2/sometimesanotion/Qwenvergence-14B-v9/1dbb8206-6a86-4e2c-8ee0-d80fed014a69.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sometimesanotion_Qwenvergence-14B-v9/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwenvergence-14B-v9", - "id": "sometimesanotion/Qwenvergence-14B-v9", - "developer": "sometimesanotion", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6598 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6166 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4139 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3683 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5141 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5111 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sometimesanotion/lamarck-14b-prose-model_stock/6341de3c-8d4c-4af8-8f0d-c81e948bacd6.json b/data/hfopenllm_v2/sometimesanotion/lamarck-14b-prose-model_stock/6341de3c-8d4c-4af8-8f0d-c81e948bacd6.json deleted file mode 100644 index 404e5cee8..000000000 --- a/data/hfopenllm_v2/sometimesanotion/lamarck-14b-prose-model_stock/6341de3c-8d4c-4af8-8f0d-c81e948bacd6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sometimesanotion_lamarck-14b-prose-model_stock/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "lamarck-14b-prose-model_stock", - "id": "sometimesanotion/lamarck-14b-prose-model_stock", - "developer": "sometimesanotion", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4276 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6488 - } - }, - { - "evaluation_name": "MATH Level 5", - 
"source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3414 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3935 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4846 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5354 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sometimesanotion/lamarck-14b-reason-model_stock/e6cb6a87-6db8-4aee-bede-ce8a60dc8f4a.json b/data/hfopenllm_v2/sometimesanotion/lamarck-14b-reason-model_stock/e6cb6a87-6db8-4aee-bede-ce8a60dc8f4a.json deleted file mode 100644 index 92a2a8d29..000000000 --- a/data/hfopenllm_v2/sometimesanotion/lamarck-14b-reason-model_stock/e6cb6a87-6db8-4aee-bede-ce8a60dc8f4a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sometimesanotion_lamarck-14b-reason-model_stock/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "lamarck-14b-reason-model_stock", - "id": "sometimesanotion/lamarck-14b-reason-model_stock", - "developer": "sometimesanotion", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4965 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6569 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - 
"metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.358 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3842 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4741 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5402 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sonthenguyen/ft-unsloth-zephyr-sft-bnb-4bit-20241014-161415/5113439d-1394-46f2-a38e-34b54e94a9e6.json b/data/hfopenllm_v2/sonthenguyen/ft-unsloth-zephyr-sft-bnb-4bit-20241014-161415/5113439d-1394-46f2-a38e-34b54e94a9e6.json deleted file mode 100644 index d02f279fc..000000000 --- a/data/hfopenllm_v2/sonthenguyen/ft-unsloth-zephyr-sft-bnb-4bit-20241014-161415/5113439d-1394-46f2-a38e-34b54e94a9e6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sonthenguyen_ft-unsloth-zephyr-sft-bnb-4bit-20241014-161415/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ft-unsloth-zephyr-sft-bnb-4bit-20241014-161415", - "id": "sonthenguyen/ft-unsloth-zephyr-sft-bnb-4bit-20241014-161415", - "developer": "sonthenguyen", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "?", - "params_billions": 7.723 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2893 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3804 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0113 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2466 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3861 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1401 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sonthenguyen/ft-unsloth-zephyr-sft-bnb-4bit-20241014-164205/a03d88aa-7ccd-4f8a-9a1e-c9469d3ae559.json b/data/hfopenllm_v2/sonthenguyen/ft-unsloth-zephyr-sft-bnb-4bit-20241014-164205/a03d88aa-7ccd-4f8a-9a1e-c9469d3ae559.json deleted file mode 100644 index 374cc5708..000000000 --- a/data/hfopenllm_v2/sonthenguyen/ft-unsloth-zephyr-sft-bnb-4bit-20241014-164205/a03d88aa-7ccd-4f8a-9a1e-c9469d3ae559.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sonthenguyen_ft-unsloth-zephyr-sft-bnb-4bit-20241014-164205/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ft-unsloth-zephyr-sft-bnb-4bit-20241014-164205", - "id": "sonthenguyen/ft-unsloth-zephyr-sft-bnb-4bit-20241014-164205", - "developer": "sonthenguyen", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "?", - "params_billions": 7.723 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3199 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3959 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.0083 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.276 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4272 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2124 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sonthenguyen/ft-unsloth-zephyr-sft-bnb-4bit-20241014-170522/1cfb40a7-7373-417c-aa1c-f6ab63ecb3b8.json b/data/hfopenllm_v2/sonthenguyen/ft-unsloth-zephyr-sft-bnb-4bit-20241014-170522/1cfb40a7-7373-417c-aa1c-f6ab63ecb3b8.json deleted file mode 100644 index 37b361fb2..000000000 --- a/data/hfopenllm_v2/sonthenguyen/ft-unsloth-zephyr-sft-bnb-4bit-20241014-170522/1cfb40a7-7373-417c-aa1c-f6ab63ecb3b8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sonthenguyen_ft-unsloth-zephyr-sft-bnb-4bit-20241014-170522/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ft-unsloth-zephyr-sft-bnb-4bit-20241014-170522", - "id": "sonthenguyen/ft-unsloth-zephyr-sft-bnb-4bit-20241014-170522", - "developer": "sonthenguyen", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "?", - "params_billions": 7.723 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3764 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3828 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0091 - } - }, - { - 
"evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2651 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4404 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2055 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sonthenguyen/zephyr-sft-bnb-4bit-DPO-mtbc-213steps/446ac93f-d47c-4207-bf32-0cd94e88a931.json b/data/hfopenllm_v2/sonthenguyen/zephyr-sft-bnb-4bit-DPO-mtbc-213steps/446ac93f-d47c-4207-bf32-0cd94e88a931.json deleted file mode 100644 index a216eeafc..000000000 --- a/data/hfopenllm_v2/sonthenguyen/zephyr-sft-bnb-4bit-DPO-mtbc-213steps/446ac93f-d47c-4207-bf32-0cd94e88a931.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sonthenguyen_zephyr-sft-bnb-4bit-DPO-mtbc-213steps/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "zephyr-sft-bnb-4bit-DPO-mtbc-213steps", - "id": "sonthenguyen/zephyr-sft-bnb-4bit-DPO-mtbc-213steps", - "developer": "sonthenguyen", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4275 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4197 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0257 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": 
"Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2617 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4086 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2709 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sonthenguyen/zephyr-sft-bnb-4bit-DPO-mtbo-180steps/7e4ba4f8-2768-4e7b-a11d-75ad22a47c45.json b/data/hfopenllm_v2/sonthenguyen/zephyr-sft-bnb-4bit-DPO-mtbo-180steps/7e4ba4f8-2768-4e7b-a11d-75ad22a47c45.json deleted file mode 100644 index 51e507e7a..000000000 --- a/data/hfopenllm_v2/sonthenguyen/zephyr-sft-bnb-4bit-DPO-mtbo-180steps/7e4ba4f8-2768-4e7b-a11d-75ad22a47c45.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sonthenguyen_zephyr-sft-bnb-4bit-DPO-mtbo-180steps/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "zephyr-sft-bnb-4bit-DPO-mtbo-180steps", - "id": "sonthenguyen/zephyr-sft-bnb-4bit-DPO-mtbo-180steps", - "developer": "sonthenguyen", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4087 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4323 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0234 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, 
- "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.276 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3885 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2748 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sonthenguyen/zephyr-sft-bnb-4bit-DPO-mtbr-180steps/ca77f821-4722-45b1-b731-7d774232acb4.json b/data/hfopenllm_v2/sonthenguyen/zephyr-sft-bnb-4bit-DPO-mtbr-180steps/ca77f821-4722-45b1-b731-7d774232acb4.json deleted file mode 100644 index c5738810a..000000000 --- a/data/hfopenllm_v2/sonthenguyen/zephyr-sft-bnb-4bit-DPO-mtbr-180steps/ca77f821-4722-45b1-b731-7d774232acb4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sonthenguyen_zephyr-sft-bnb-4bit-DPO-mtbr-180steps/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "zephyr-sft-bnb-4bit-DPO-mtbr-180steps", - "id": "sonthenguyen/zephyr-sft-bnb-4bit-DPO-mtbr-180steps", - "developer": "sonthenguyen", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4032 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4305 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0249 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2802 - } - 
}, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4258 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2711 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sophosympatheia/Midnight-Miqu-70B-v1.5/f32d2a11-edd3-4662-aed7-88c6820b2c2e.json b/data/hfopenllm_v2/sophosympatheia/Midnight-Miqu-70B-v1.5/f32d2a11-edd3-4662-aed7-88c6820b2c2e.json deleted file mode 100644 index fddda10e0..000000000 --- a/data/hfopenllm_v2/sophosympatheia/Midnight-Miqu-70B-v1.5/f32d2a11-edd3-4662-aed7-88c6820b2c2e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sophosympatheia_Midnight-Miqu-70B-v1.5/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Midnight-Miqu-70B-v1.5", - "id": "sophosympatheia/Midnight-Miqu-70B-v1.5", - "developer": "sophosympatheia", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 68.977 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6118 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5606 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0702 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2961 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4244 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3825 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/speakleash/Bielik-11B-v2.0-Instruct/71c56883-dd14-4f16-b839-5ce607a4aadb.json b/data/hfopenllm_v2/speakleash/Bielik-11B-v2.0-Instruct/71c56883-dd14-4f16-b839-5ce607a4aadb.json deleted file mode 100644 index 12049af61..000000000 --- a/data/hfopenllm_v2/speakleash/Bielik-11B-v2.0-Instruct/71c56883-dd14-4f16-b839-5ce607a4aadb.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/speakleash_Bielik-11B-v2.0-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Bielik-11B-v2.0-Instruct", - "id": "speakleash/Bielik-11B-v2.0-Instruct", - "developer": "speakleash", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 11.169 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5252 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5362 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1186 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3171 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4467 - } - }, - { - 
"evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3351 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/speakleash/Bielik-11B-v2.1-Instruct/639004c2-81a5-410d-bd61-e3e263f55335.json b/data/hfopenllm_v2/speakleash/Bielik-11B-v2.1-Instruct/639004c2-81a5-410d-bd61-e3e263f55335.json deleted file mode 100644 index e5d5e6b4f..000000000 --- a/data/hfopenllm_v2/speakleash/Bielik-11B-v2.1-Instruct/639004c2-81a5-410d-bd61-e3e263f55335.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/speakleash_Bielik-11B-v2.1-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Bielik-11B-v2.1-Instruct", - "id": "speakleash/Bielik-11B-v2.1-Instruct", - "developer": "speakleash", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 11.169 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.509 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.553 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2666 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3372 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4185 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy 
on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3447 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/speakleash/Bielik-11B-v2.2-Instruct/5f232a99-07c9-4df7-9d3b-837966ea6de5.json b/data/hfopenllm_v2/speakleash/Bielik-11B-v2.2-Instruct/5f232a99-07c9-4df7-9d3b-837966ea6de5.json deleted file mode 100644 index 29b63348b..000000000 --- a/data/hfopenllm_v2/speakleash/Bielik-11B-v2.2-Instruct/5f232a99-07c9-4df7-9d3b-837966ea6de5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/speakleash_Bielik-11B-v2.2-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Bielik-11B-v2.2-Instruct", - "id": "speakleash/Bielik-11B-v2.2-Instruct", - "developer": "speakleash", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 11.169 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5552 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5597 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2681 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3314 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4171 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3487 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/speakleash/Bielik-11B-v2.3-Instruct/482e34ee-8974-46c6-b3f4-4cc9872ef562.json b/data/hfopenllm_v2/speakleash/Bielik-11B-v2.3-Instruct/482e34ee-8974-46c6-b3f4-4cc9872ef562.json deleted file mode 100644 index d3532936e..000000000 --- a/data/hfopenllm_v2/speakleash/Bielik-11B-v2.3-Instruct/482e34ee-8974-46c6-b3f4-4cc9872ef562.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/speakleash_Bielik-11B-v2.3-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Bielik-11B-v2.3-Instruct", - "id": "speakleash/Bielik-11B-v2.3-Instruct", - "developer": "speakleash", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 11.169 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5583 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5663 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2085 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3406 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4518 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3444 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/speakleash/Bielik-11B-v2/13743252-3ba3-406d-8e95-5a4cd3ac3772.json b/data/hfopenllm_v2/speakleash/Bielik-11B-v2/13743252-3ba3-406d-8e95-5a4cd3ac3772.json deleted file mode 100644 index 
0c4687f31..000000000 --- a/data/hfopenllm_v2/speakleash/Bielik-11B-v2/13743252-3ba3-406d-8e95-5a4cd3ac3772.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/speakleash_Bielik-11B-v2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Bielik-11B-v2", - "id": "speakleash/Bielik-11B-v2", - "developer": "speakleash", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 11.169 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2381 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4931 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0785 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2886 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3924 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3137 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/spmurrayzzz/Mistral-Syndicate-7B/ff25cb66-ed6f-421a-a038-1feb24666645.json b/data/hfopenllm_v2/spmurrayzzz/Mistral-Syndicate-7B/ff25cb66-ed6f-421a-a038-1feb24666645.json deleted file mode 100644 index 7a760f4b4..000000000 --- a/data/hfopenllm_v2/spmurrayzzz/Mistral-Syndicate-7B/ff25cb66-ed6f-421a-a038-1feb24666645.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/spmurrayzzz_Mistral-Syndicate-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral-Syndicate-7B", - "id": "spmurrayzzz/Mistral-Syndicate-7B", - "developer": "spmurrayzzz", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2496 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4245 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.034 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.276 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4386 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2631 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/spow12/ChatWaifu_12B_v2.0/843f0d9a-04e8-4cea-bb18-94651a814d1f.json b/data/hfopenllm_v2/spow12/ChatWaifu_12B_v2.0/843f0d9a-04e8-4cea-bb18-94651a814d1f.json deleted file mode 100644 index 4c523e210..000000000 --- a/data/hfopenllm_v2/spow12/ChatWaifu_12B_v2.0/843f0d9a-04e8-4cea-bb18-94651a814d1f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/spow12_ChatWaifu_12B_v2.0/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - 
"evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ChatWaifu_12B_v2.0", - "id": "spow12/ChatWaifu_12B_v2.0", - "developer": "spow12", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4768 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5208 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.071 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2768 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4432 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3388 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/spow12/ChatWaifu_22B_v2.0_preview/fa3ccf4a-9b26-4a76-a974-3a776adec7c2.json b/data/hfopenllm_v2/spow12/ChatWaifu_22B_v2.0_preview/fa3ccf4a-9b26-4a76-a974-3a776adec7c2.json deleted file mode 100644 index f32e11888..000000000 --- a/data/hfopenllm_v2/spow12/ChatWaifu_22B_v2.0_preview/fa3ccf4a-9b26-4a76-a974-3a776adec7c2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/spow12_ChatWaifu_22B_v2.0_preview/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ChatWaifu_22B_v2.0_preview", - "id": "spow12/ChatWaifu_22B_v2.0_preview", - "developer": "spow12", - "inference_platform": "unknown", - "additional_details": { - 
"precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 22.247 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6745 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.617 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1888 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3154 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3685 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3988 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/spow12/ChatWaifu_v1.4/ef4ac8ab-4ff5-4fce-94b6-443b1ef7964f.json b/data/hfopenllm_v2/spow12/ChatWaifu_v1.4/ef4ac8ab-4ff5-4fce-94b6-443b1ef7964f.json deleted file mode 100644 index 4403618f1..000000000 --- a/data/hfopenllm_v2/spow12/ChatWaifu_v1.4/ef4ac8ab-4ff5-4fce-94b6-443b1ef7964f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/spow12_ChatWaifu_v1.4/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ChatWaifu_v1.4", - "id": "spow12/ChatWaifu_v1.4", - "developer": "spow12", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": 
{ - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5691 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5176 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1057 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.307 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4743 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3475 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/spow12/ChatWaifu_v2.0_22B/468bbea7-6dee-4a1a-84b3-e44b0f3ab95a.json b/data/hfopenllm_v2/spow12/ChatWaifu_v2.0_22B/468bbea7-6dee-4a1a-84b3-e44b0f3ab95a.json deleted file mode 100644 index 83ea96c1c..000000000 --- a/data/hfopenllm_v2/spow12/ChatWaifu_v2.0_22B/468bbea7-6dee-4a1a-84b3-e44b0f3ab95a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/spow12_ChatWaifu_v2.0_22B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ChatWaifu_v2.0_22B", - "id": "spow12/ChatWaifu_v2.0_22B", - "developer": "spow12", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 22.247 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6511 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": 
"BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5926 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1858 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3247 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3842 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3836 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/spow12/ChatWaifu_v2.0_22B/bd8fdfa5-bda1-402b-9010-94bf78b0127b.json b/data/hfopenllm_v2/spow12/ChatWaifu_v2.0_22B/bd8fdfa5-bda1-402b-9010-94bf78b0127b.json deleted file mode 100644 index 699016bfe..000000000 --- a/data/hfopenllm_v2/spow12/ChatWaifu_v2.0_22B/bd8fdfa5-bda1-402b-9010-94bf78b0127b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/spow12_ChatWaifu_v2.0_22B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ChatWaifu_v2.0_22B", - "id": "spow12/ChatWaifu_v2.0_22B", - "developer": "spow12", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 22.247 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6517 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.5908 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2032 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3238 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3842 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3812 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ssmits/Qwen2.5-95B-Instruct/a0b34b40-3e68-463f-a7fa-3c58c15aa16d.json b/data/hfopenllm_v2/ssmits/Qwen2.5-95B-Instruct/a0b34b40-3e68-463f-a7fa-3c58c15aa16d.json deleted file mode 100644 index 21deb3a14..000000000 --- a/data/hfopenllm_v2/ssmits/Qwen2.5-95B-Instruct/a0b34b40-3e68-463f-a7fa-3c58c15aa16d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ssmits_Qwen2.5-95B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-95B-Instruct", - "id": "ssmits/Qwen2.5-95B-Instruct", - "developer": "ssmits", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 94.648 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8431 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7038 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": 
"Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5302 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3641 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4284 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5217 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/stabilityai/StableBeluga2/dbf4fbac-cd99-426d-b725-600e60af00d2.json b/data/hfopenllm_v2/stabilityai/StableBeluga2/dbf4fbac-cd99-426d-b725-600e60af00d2.json deleted file mode 100644 index e8054b7c1..000000000 --- a/data/hfopenllm_v2/stabilityai/StableBeluga2/dbf4fbac-cd99-426d-b725-600e60af00d2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/stabilityai_StableBeluga2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "StableBeluga2", - "id": "stabilityai/StableBeluga2", - "developer": "stabilityai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 68.977 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3787 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5824 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0438 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - 
"source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3163 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.473 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3326 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/stabilityai/stablelm-2-12b-chat/f793c471-1638-476a-a050-455a32368e29.json b/data/hfopenllm_v2/stabilityai/stablelm-2-12b-chat/f793c471-1638-476a-a050-455a32368e29.json deleted file mode 100644 index 53354e46f..000000000 --- a/data/hfopenllm_v2/stabilityai/stablelm-2-12b-chat/f793c471-1638-476a-a050-455a32368e29.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/stabilityai_stablelm-2-12b-chat/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "stablelm-2-12b-chat", - "id": "stabilityai/stablelm-2-12b-chat", - "developer": "stabilityai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "StableLmForCausalLM", - "params_billions": 12.143 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4082 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4672 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0536 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - 
}, - "score_details": { - "score": 0.2668 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3914 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2734 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/stabilityai/stablelm-2-12b/1d9c1beb-f84b-4eb7-9c1e-ce5a70afabfb.json b/data/hfopenllm_v2/stabilityai/stablelm-2-12b/1d9c1beb-f84b-4eb7-9c1e-ce5a70afabfb.json deleted file mode 100644 index 864befec9..000000000 --- a/data/hfopenllm_v2/stabilityai/stablelm-2-12b/1d9c1beb-f84b-4eb7-9c1e-ce5a70afabfb.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/stabilityai_stablelm-2-12b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "stablelm-2-12b", - "id": "stabilityai/stablelm-2-12b", - "developer": "stabilityai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "StableLmForCausalLM", - "params_billions": 12.143 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1569 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4509 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0431 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2785 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on 
MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4479 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3072 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/stabilityai/stablelm-2-1_6b-chat/99396d97-d875-4cd9-a8a1-a9aec5c43bfc.json b/data/hfopenllm_v2/stabilityai/stablelm-2-1_6b-chat/99396d97-d875-4cd9-a8a1-a9aec5c43bfc.json deleted file mode 100644 index 28f4bd994..000000000 --- a/data/hfopenllm_v2/stabilityai/stablelm-2-1_6b-chat/99396d97-d875-4cd9-a8a1-a9aec5c43bfc.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/stabilityai_stablelm-2-1_6b-chat/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "stablelm-2-1_6b-chat", - "id": "stabilityai/stablelm-2-1_6b-chat", - "developer": "stabilityai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "StableLmForCausalLM", - "params_billions": 1.645 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.306 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.339 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0249 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2475 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.358 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - 
"dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1622 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/stabilityai/stablelm-2-1_6b/82a44b46-156f-4232-92e4-6a08d7a4f197.json b/data/hfopenllm_v2/stabilityai/stablelm-2-1_6b/82a44b46-156f-4232-92e4-6a08d7a4f197.json deleted file mode 100644 index c8c4b25df..000000000 --- a/data/hfopenllm_v2/stabilityai/stablelm-2-1_6b/82a44b46-156f-4232-92e4-6a08d7a4f197.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/stabilityai_stablelm-2-1_6b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "stablelm-2-1_6b", - "id": "stabilityai/stablelm-2-1_6b", - "developer": "stabilityai", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "StableLmForCausalLM", - "params_billions": 1.645 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1157 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3385 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0076 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2483 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3882 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.1464 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/stabilityai/stablelm-2-zephyr-1_6b/3b40defd-5a2e-4d6e-838f-dbbbf12236fb.json b/data/hfopenllm_v2/stabilityai/stablelm-2-zephyr-1_6b/3b40defd-5a2e-4d6e-838f-dbbbf12236fb.json deleted file mode 100644 index c7a4b2631..000000000 --- a/data/hfopenllm_v2/stabilityai/stablelm-2-zephyr-1_6b/3b40defd-5a2e-4d6e-838f-dbbbf12236fb.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/stabilityai_stablelm-2-zephyr-1_6b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "stablelm-2-zephyr-1_6b", - "id": "stabilityai/stablelm-2-zephyr-1_6b", - "developer": "stabilityai", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "StableLmForCausalLM", - "params_billions": 1.645 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3279 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3352 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0332 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2433 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3511 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1714 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/stabilityai/stablelm-3b-4e1t/dde41cd5-e6d1-43a9-9593-1a5751bc5f44.json 
b/data/hfopenllm_v2/stabilityai/stablelm-3b-4e1t/dde41cd5-e6d1-43a9-9593-1a5751bc5f44.json deleted file mode 100644 index 5c466e1f8..000000000 --- a/data/hfopenllm_v2/stabilityai/stablelm-3b-4e1t/dde41cd5-e6d1-43a9-9593-1a5751bc5f44.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/stabilityai_stablelm-3b-4e1t/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "stablelm-3b-4e1t", - "id": "stabilityai/stablelm-3b-4e1t", - "developer": "stabilityai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "StableLmForCausalLM", - "params_billions": 2.795 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2203 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3504 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0106 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2374 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3778 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1669 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/stabilityai/stablelm-zephyr-3b/1cffcbeb-ef81-4efe-b883-0a8540a799e7.json b/data/hfopenllm_v2/stabilityai/stablelm-zephyr-3b/1cffcbeb-ef81-4efe-b883-0a8540a799e7.json deleted file mode 100644 index 736b9c1ac..000000000 --- a/data/hfopenllm_v2/stabilityai/stablelm-zephyr-3b/1cffcbeb-ef81-4efe-b883-0a8540a799e7.json +++ 
/dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/stabilityai_stablelm-zephyr-3b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "stablelm-zephyr-3b", - "id": "stabilityai/stablelm-zephyr-3b", - "developer": "stabilityai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "StableLmForCausalLM", - "params_billions": 2.795 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3683 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3866 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0431 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2391 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4183 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1768 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sthenno-com/miscii-14b-0130/033ef96e-3d2d-49a4-bbff-8bc815a1b40e.json b/data/hfopenllm_v2/sthenno-com/miscii-14b-0130/033ef96e-3d2d-49a4-bbff-8bc815a1b40e.json deleted file mode 100644 index 436ce33be..000000000 --- a/data/hfopenllm_v2/sthenno-com/miscii-14b-0130/033ef96e-3d2d-49a4-bbff-8bc815a1b40e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sthenno-com_miscii-14b-0130/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - 
"source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "miscii-14b-0130", - "id": "sthenno-com/miscii-14b-0130", - "developer": "sthenno-com", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6647 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6505 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.432 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3817 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4912 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5363 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sthenno-com/miscii-14b-0218/bfe654b8-cb79-4845-bf14-85012207ce90.json b/data/hfopenllm_v2/sthenno-com/miscii-14b-0218/bfe654b8-cb79-4845-bf14-85012207ce90.json deleted file mode 100644 index 19981a9c3..000000000 --- a/data/hfopenllm_v2/sthenno-com/miscii-14b-0218/bfe654b8-cb79-4845-bf14-85012207ce90.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sthenno-com_miscii-14b-0218/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "miscii-14b-0218", - "id": "sthenno-com/miscii-14b-0218", - "developer": "sthenno-com", - 
"inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7656 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6559 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5144 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3834 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4273 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5298 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sthenno-com/miscii-14b-1028/5c4efc23-9591-447b-aecc-4c82797d7d01.json b/data/hfopenllm_v2/sthenno-com/miscii-14b-1028/5c4efc23-9591-447b-aecc-4c82797d7d01.json deleted file mode 100644 index 49b055e2c..000000000 --- a/data/hfopenllm_v2/sthenno-com/miscii-14b-1028/5c4efc23-9591-447b-aecc-4c82797d7d01.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sthenno-com_miscii-14b-1028/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "miscii-14b-1028", - "id": "sthenno-com/miscii-14b-1028", - "developer": "sthenno-com", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": 
"IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8237 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6448 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.503 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3565 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4182 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5153 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sthenno-com/miscii-14b-1225/a5fe3fab-95d9-41ac-a95f-66205e489dae.json b/data/hfopenllm_v2/sthenno-com/miscii-14b-1225/a5fe3fab-95d9-41ac-a95f-66205e489dae.json deleted file mode 100644 index 4635e6e86..000000000 --- a/data/hfopenllm_v2/sthenno-com/miscii-14b-1225/a5fe3fab-95d9-41ac-a95f-66205e489dae.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sthenno-com_miscii-14b-1225/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "miscii-14b-1225", - "id": "sthenno-com/miscii-14b-1225", - "developer": "sthenno-com", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.7878 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6572 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4517 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3775 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4366 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5272 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sthenno/tempesthenno-0120/c0bf8ffb-444a-43a3-9514-76aa92c5f5b7.json b/data/hfopenllm_v2/sthenno/tempesthenno-0120/c0bf8ffb-444a-43a3-9514-76aa92c5f5b7.json deleted file mode 100644 index c30b2e881..000000000 --- a/data/hfopenllm_v2/sthenno/tempesthenno-0120/c0bf8ffb-444a-43a3-9514-76aa92c5f5b7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sthenno_tempesthenno-0120/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "tempesthenno-0120", - "id": "sthenno/tempesthenno-0120", - "developer": "sthenno", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.539 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6373 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3353 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3943 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4633 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.529 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sthenno/tempesthenno-fusion-0309/3d556d9f-036b-4368-bb4a-18ad6b444bdf.json b/data/hfopenllm_v2/sthenno/tempesthenno-fusion-0309/3d556d9f-036b-4368-bb4a-18ad6b444bdf.json deleted file mode 100644 index 78afda82b..000000000 --- a/data/hfopenllm_v2/sthenno/tempesthenno-fusion-0309/3d556d9f-036b-4368-bb4a-18ad6b444bdf.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sthenno_tempesthenno-fusion-0309/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "tempesthenno-fusion-0309", - "id": "sthenno/tempesthenno-fusion-0309", - "developer": "sthenno", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7692 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6581 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": 
"MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4766 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.37 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4325 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5258 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sthenno/tempesthenno-kto-0205-ckpt80/92905e27-1033-4423-b87d-23236f9be964.json b/data/hfopenllm_v2/sthenno/tempesthenno-kto-0205-ckpt80/92905e27-1033-4423-b87d-23236f9be964.json deleted file mode 100644 index 588277559..000000000 --- a/data/hfopenllm_v2/sthenno/tempesthenno-kto-0205-ckpt80/92905e27-1033-4423-b87d-23236f9be964.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sthenno_tempesthenno-kto-0205-ckpt80/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "tempesthenno-kto-0205-ckpt80", - "id": "sthenno/tempesthenno-kto-0205-ckpt80", - "developer": "sthenno", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8054 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6543 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": 
false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4592 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3482 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4248 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5286 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sthenno/tempesthenno-nuslerp-001/17326bb0-42c2-469a-ac19-6a4b75d9e6e2.json b/data/hfopenllm_v2/sthenno/tempesthenno-nuslerp-001/17326bb0-42c2-469a-ac19-6a4b75d9e6e2.json deleted file mode 100644 index 752a4f6d2..000000000 --- a/data/hfopenllm_v2/sthenno/tempesthenno-nuslerp-001/17326bb0-42c2-469a-ac19-6a4b75d9e6e2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sthenno_tempesthenno-nuslerp-001/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "tempesthenno-nuslerp-001", - "id": "sthenno/tempesthenno-nuslerp-001", - "developer": "sthenno", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7926 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6578 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4758 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": 
"hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3733 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.43 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5257 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sthenno/tempesthenno-nuslerp-0124/11574f56-6c34-48e4-8fb5-c58d42f07330.json b/data/hfopenllm_v2/sthenno/tempesthenno-nuslerp-0124/11574f56-6c34-48e4-8fb5-c58d42f07330.json deleted file mode 100644 index 26b47b3b3..000000000 --- a/data/hfopenllm_v2/sthenno/tempesthenno-nuslerp-0124/11574f56-6c34-48e4-8fb5-c58d42f07330.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sthenno_tempesthenno-nuslerp-0124/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "tempesthenno-nuslerp-0124", - "id": "sthenno/tempesthenno-nuslerp-0124", - "developer": "sthenno", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7004 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6469 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4116 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.3901 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4859 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5352 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sthenno/tempesthenno-ppo-ckpt40/8f728c51-15f9-422d-bbdb-4d976961ab9d.json b/data/hfopenllm_v2/sthenno/tempesthenno-ppo-ckpt40/8f728c51-15f9-422d-bbdb-4d976961ab9d.json deleted file mode 100644 index e24916e51..000000000 --- a/data/hfopenllm_v2/sthenno/tempesthenno-ppo-ckpt40/8f728c51-15f9-422d-bbdb-4d976961ab9d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sthenno_tempesthenno-ppo-ckpt40/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "tempesthenno-ppo-ckpt40", - "id": "sthenno/tempesthenno-ppo-ckpt40", - "developer": "sthenno", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7923 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.655 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4736 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3775 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4352 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5292 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sthenno/tempesthenno-sft-0309-ckpt10/8d6e4b5e-ad17-4390-bc6b-ab6581a62442.json b/data/hfopenllm_v2/sthenno/tempesthenno-sft-0309-ckpt10/8d6e4b5e-ad17-4390-bc6b-ab6581a62442.json deleted file mode 100644 index 884df38c1..000000000 --- a/data/hfopenllm_v2/sthenno/tempesthenno-sft-0309-ckpt10/8d6e4b5e-ad17-4390-bc6b-ab6581a62442.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sthenno_tempesthenno-sft-0309-ckpt10/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "tempesthenno-sft-0309-ckpt10", - "id": "sthenno/tempesthenno-sft-0309-ckpt10", - "developer": "sthenno", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7744 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6552 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4721 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3716 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4364 - } - }, - { - 
"evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5258 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sthenno/tempesthenno-sft-0314-stage1-ckpt50/5e33bf05-6c67-4ecc-982d-7590e9953145.json b/data/hfopenllm_v2/sthenno/tempesthenno-sft-0314-stage1-ckpt50/5e33bf05-6c67-4ecc-982d-7590e9953145.json deleted file mode 100644 index bdf205c44..000000000 --- a/data/hfopenllm_v2/sthenno/tempesthenno-sft-0314-stage1-ckpt50/5e33bf05-6c67-4ecc-982d-7590e9953145.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sthenno_tempesthenno-sft-0314-stage1-ckpt50/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "tempesthenno-sft-0314-stage1-ckpt50", - "id": "sthenno/tempesthenno-sft-0314-stage1-ckpt50", - "developer": "sthenno", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7394 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6601 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4683 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3733 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4429 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5302 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sthenno/tempestissimo-14b-0309/f55ae879-bd95-409c-a8a3-9a57cd615a31.json b/data/hfopenllm_v2/sthenno/tempestissimo-14b-0309/f55ae879-bd95-409c-a8a3-9a57cd615a31.json deleted file mode 100644 index 63af9669e..000000000 --- a/data/hfopenllm_v2/sthenno/tempestissimo-14b-0309/f55ae879-bd95-409c-a8a3-9a57cd615a31.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sthenno_tempestissimo-14b-0309/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "tempestissimo-14b-0309", - "id": "sthenno/tempestissimo-14b-0309", - "developer": "sthenno", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7549 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6587 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4796 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3666 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4312 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5281 - } - } - ] -} \ No newline 
at end of file diff --git a/data/hfopenllm_v2/streamerbtw1002/Nexuim-R1-7B-Instruct/b8426ac9-14f1-4e07-9c7e-b50cb2c7a1e3.json b/data/hfopenllm_v2/streamerbtw1002/Nexuim-R1-7B-Instruct/b8426ac9-14f1-4e07-9c7e-b50cb2c7a1e3.json deleted file mode 100644 index c89456ac8..000000000 --- a/data/hfopenllm_v2/streamerbtw1002/Nexuim-R1-7B-Instruct/b8426ac9-14f1-4e07-9c7e-b50cb2c7a1e3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/streamerbtw1002_Nexuim-R1-7B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Nexuim-R1-7B-Instruct", - "id": "streamerbtw1002/Nexuim-R1-7B-Instruct", - "developer": "streamerbtw1002", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6934 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5175 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4456 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2592 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3356 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4138 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/stupidity-ai/Llama-3-8B-Instruct-MultiMoose/51fd90b0-0d5a-4199-ba5b-ff29eeeab06b.json 
b/data/hfopenllm_v2/stupidity-ai/Llama-3-8B-Instruct-MultiMoose/51fd90b0-0d5a-4199-ba5b-ff29eeeab06b.json deleted file mode 100644 index 09f45ca03..000000000 --- a/data/hfopenllm_v2/stupidity-ai/Llama-3-8B-Instruct-MultiMoose/51fd90b0-0d5a-4199-ba5b-ff29eeeab06b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/stupidity-ai_Llama-3-8B-Instruct-MultiMoose/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-8B-Instruct-MultiMoose", - "id": "stupidity-ai/Llama-3-8B-Instruct-MultiMoose", - "developer": "stupidity-ai", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2318 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2823 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2534 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3485 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1094 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/suayptalha/Clarus-7B-v0.1/c46e4fa1-afae-4b68-a13e-034b5cd2b779.json b/data/hfopenllm_v2/suayptalha/Clarus-7B-v0.1/c46e4fa1-afae-4b68-a13e-034b5cd2b779.json deleted file mode 100644 index 516714298..000000000 --- 
a/data/hfopenllm_v2/suayptalha/Clarus-7B-v0.1/c46e4fa1-afae-4b68-a13e-034b5cd2b779.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/suayptalha_Clarus-7B-v0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Clarus-7B-v0.1", - "id": "suayptalha/Clarus-7B-v0.1", - "developer": "suayptalha", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7454 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5497 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4924 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.307 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.443 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4387 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/suayptalha/Clarus-7B-v0.2/42cc06ed-20fc-4e84-836f-3d7243ec336d.json b/data/hfopenllm_v2/suayptalha/Clarus-7B-v0.2/42cc06ed-20fc-4e84-836f-3d7243ec336d.json deleted file mode 100644 index c5b09072a..000000000 --- a/data/hfopenllm_v2/suayptalha/Clarus-7B-v0.2/42cc06ed-20fc-4e84-836f-3d7243ec336d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/suayptalha_Clarus-7B-v0.2/1770682486.623709", - "retrieved_timestamp": 
"1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Clarus-7B-v0.2", - "id": "suayptalha/Clarus-7B-v0.2", - "developer": "suayptalha", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.613 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7679 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.549 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4856 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.302 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4417 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.44 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/suayptalha/Clarus-7B-v0.3/aaa53387-af33-4454-95f0-3af85f4778c0.json b/data/hfopenllm_v2/suayptalha/Clarus-7B-v0.3/aaa53387-af33-4454-95f0-3af85f4778c0.json deleted file mode 100644 index d58a5c307..000000000 --- a/data/hfopenllm_v2/suayptalha/Clarus-7B-v0.3/aaa53387-af33-4454-95f0-3af85f4778c0.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/suayptalha_Clarus-7B-v0.3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Clarus-7B-v0.3", - "id": 
"suayptalha/Clarus-7B-v0.3", - "developer": "suayptalha", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7509 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5526 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4879 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3121 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4402 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4385 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/suayptalha/DeepSeek-R1-Distill-Llama-3B/465bca6d-b32a-4d34-9916-fc8b3166faa0.json b/data/hfopenllm_v2/suayptalha/DeepSeek-R1-Distill-Llama-3B/465bca6d-b32a-4d34-9916-fc8b3166faa0.json deleted file mode 100644 index c2621e7be..000000000 --- a/data/hfopenllm_v2/suayptalha/DeepSeek-R1-Distill-Llama-3B/465bca6d-b32a-4d34-9916-fc8b3166faa0.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/suayptalha_DeepSeek-R1-Distill-Llama-3B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "DeepSeek-R1-Distill-Llama-3B", - "id": "suayptalha/DeepSeek-R1-Distill-Llama-3B", - "developer": "suayptalha", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - 
"params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7093 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4452 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2092 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2609 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3396 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2978 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/suayptalha/Falcon3-Jessi-v0.4-7B-Slerp/bf138f3d-09d9-4dea-aa43-5efc804bc775.json b/data/hfopenllm_v2/suayptalha/Falcon3-Jessi-v0.4-7B-Slerp/bf138f3d-09d9-4dea-aa43-5efc804bc775.json deleted file mode 100644 index 21408cebb..000000000 --- a/data/hfopenllm_v2/suayptalha/Falcon3-Jessi-v0.4-7B-Slerp/bf138f3d-09d9-4dea-aa43-5efc804bc775.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/suayptalha_Falcon3-Jessi-v0.4-7B-Slerp/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Falcon3-Jessi-v0.4-7B-Slerp", - "id": "suayptalha/Falcon3-Jessi-v0.4-7B-Slerp", - "developer": "suayptalha", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 7.456 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": 
"google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7676 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5591 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3965 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3121 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4812 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.406 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/suayptalha/HomerCreativeAnvita-Mix-Qw7B/cb4e944c-66f6-49f2-b1e0-d90454e34315.json b/data/hfopenllm_v2/suayptalha/HomerCreativeAnvita-Mix-Qw7B/cb4e944c-66f6-49f2-b1e0-d90454e34315.json deleted file mode 100644 index 52dfc2c70..000000000 --- a/data/hfopenllm_v2/suayptalha/HomerCreativeAnvita-Mix-Qw7B/cb4e944c-66f6-49f2-b1e0-d90454e34315.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/suayptalha_HomerCreativeAnvita-Mix-Qw7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "HomerCreativeAnvita-Mix-Qw7B", - "id": "suayptalha/HomerCreativeAnvita-Mix-Qw7B", - "developer": "suayptalha", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 
- }, - "score_details": { - "score": 0.7808 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5565 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.361 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3146 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4416 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4445 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/suayptalha/Komodo-Llama-3.2-3B-v2-fp16/b2b6bc49-bda1-4a3e-a071-ec0a0bdc1313.json b/data/hfopenllm_v2/suayptalha/Komodo-Llama-3.2-3B-v2-fp16/b2b6bc49-bda1-4a3e-a071-ec0a0bdc1313.json deleted file mode 100644 index 6760ae662..000000000 --- a/data/hfopenllm_v2/suayptalha/Komodo-Llama-3.2-3B-v2-fp16/b2b6bc49-bda1-4a3e-a071-ec0a0bdc1313.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/suayptalha_Komodo-Llama-3.2-3B-v2-fp16/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Komodo-Llama-3.2-3B-v2-fp16", - "id": "suayptalha/Komodo-Llama-3.2-3B-v2-fp16", - "developer": "suayptalha", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6341 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4355 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1065 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2777 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3406 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2852 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/suayptalha/Lamarckvergence-14B/933f3d40-8726-418f-be2f-1f9686e9ab02.json b/data/hfopenllm_v2/suayptalha/Lamarckvergence-14B/933f3d40-8726-418f-be2f-1f9686e9ab02.json deleted file mode 100644 index e8329fe1f..000000000 --- a/data/hfopenllm_v2/suayptalha/Lamarckvergence-14B/933f3d40-8726-418f-be2f-1f9686e9ab02.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/suayptalha_Lamarckvergence-14B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Lamarckvergence-14B", - "id": "suayptalha/Lamarckvergence-14B", - "developer": "suayptalha", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7656 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6517 - } - }, - { - "evaluation_name": 
"MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.54 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3633 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4422 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5283 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/suayptalha/Lix-14B-v0.1/af1bf15c-7c5f-46fa-ba3a-821b521e86f4.json b/data/hfopenllm_v2/suayptalha/Lix-14B-v0.1/af1bf15c-7c5f-46fa-ba3a-821b521e86f4.json deleted file mode 100644 index 14484ddbd..000000000 --- a/data/hfopenllm_v2/suayptalha/Lix-14B-v0.1/af1bf15c-7c5f-46fa-ba3a-821b521e86f4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/suayptalha_Lix-14B-v0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Lix-14B-v0.1", - "id": "suayptalha/Lix-14B-v0.1", - "developer": "suayptalha", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7813 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6608 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5295 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.37 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4338 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5314 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/suayptalha/Luminis-phi-4/43df4336-1eb8-4df7-8309-1199aafc07b1.json b/data/hfopenllm_v2/suayptalha/Luminis-phi-4/43df4336-1eb8-4df7-8309-1199aafc07b1.json deleted file mode 100644 index 1b20a9640..000000000 --- a/data/hfopenllm_v2/suayptalha/Luminis-phi-4/43df4336-1eb8-4df7-8309-1199aafc07b1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/suayptalha_Luminis-phi-4/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Luminis-phi-4", - "id": "suayptalha/Luminis-phi-4", - "developer": "suayptalha", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.69 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.692 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4637 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3515 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4572 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5424 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/suayptalha/Maestro-10B/44ae222d-407c-4c8b-9b67-75440631f848.json b/data/hfopenllm_v2/suayptalha/Maestro-10B/44ae222d-407c-4c8b-9b67-75440631f848.json deleted file mode 100644 index 352d09f29..000000000 --- a/data/hfopenllm_v2/suayptalha/Maestro-10B/44ae222d-407c-4c8b-9b67-75440631f848.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/suayptalha_Maestro-10B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Maestro-10B", - "id": "suayptalha/Maestro-10B", - "developer": "suayptalha", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 10.306 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7768 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5746 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1911 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3331 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - 
"source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4397 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4218 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/suayptalha/Rombos-2.5-T.E-8.1/a87db0fe-3727-4ff1-875f-9edd3109f3a2.json b/data/hfopenllm_v2/suayptalha/Rombos-2.5-T.E-8.1/a87db0fe-3727-4ff1-875f-9edd3109f3a2.json deleted file mode 100644 index e6ce27e84..000000000 --- a/data/hfopenllm_v2/suayptalha/Rombos-2.5-T.E-8.1/a87db0fe-3727-4ff1-875f-9edd3109f3a2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/suayptalha_Rombos-2.5-T.E-8.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Rombos-2.5-T.E-8.1", - "id": "suayptalha/Rombos-2.5-T.E-8.1", - "developer": "suayptalha", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6925 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5515 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4924 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3112 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.4166 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4446 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sumink/Qmerft/0c73e33a-7f6f-4925-970b-db289069d5ca.json b/data/hfopenllm_v2/sumink/Qmerft/0c73e33a-7f6f-4925-970b-db289069d5ca.json deleted file mode 100644 index bcee9059c..000000000 --- a/data/hfopenllm_v2/sumink/Qmerft/0c73e33a-7f6f-4925-970b-db289069d5ca.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sumink_Qmerft/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qmerft", - "id": "sumink/Qmerft", - "developer": "sumink", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.777 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1564 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2939 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0023 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2525 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3688 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1157 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sumink/Qwenftmodel/02bc7f5c-dc2f-4d8c-adcb-a89a34ff5549.json b/data/hfopenllm_v2/sumink/Qwenftmodel/02bc7f5c-dc2f-4d8c-adcb-a89a34ff5549.json deleted file mode 100644 index 4e22ca5b7..000000000 --- a/data/hfopenllm_v2/sumink/Qwenftmodel/02bc7f5c-dc2f-4d8c-adcb-a89a34ff5549.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sumink_Qwenftmodel/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwenftmodel", - "id": "sumink/Qwenftmodel", - "developer": "sumink", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.544 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1729 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3823 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0891 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2567 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3617 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2339 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sumink/Qwenmplus/590c031c-2aa6-48e6-9b3f-68b1a585dd39.json b/data/hfopenllm_v2/sumink/Qwenmplus/590c031c-2aa6-48e6-9b3f-68b1a585dd39.json deleted file mode 
100644 index 9502b1af4..000000000 --- a/data/hfopenllm_v2/sumink/Qwenmplus/590c031c-2aa6-48e6-9b3f-68b1a585dd39.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sumink_Qwenmplus/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwenmplus", - "id": "sumink/Qwenmplus", - "developer": "sumink", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.543 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.204 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3676 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0249 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2852 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3828 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1992 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sumink/Qwensci/970c9fb8-c217-444b-a025-f4d9acdd679d.json b/data/hfopenllm_v2/sumink/Qwensci/970c9fb8-c217-444b-a025-f4d9acdd679d.json deleted file mode 100644 index 0a21c2f79..000000000 --- a/data/hfopenllm_v2/sumink/Qwensci/970c9fb8-c217-444b-a025-f4d9acdd679d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sumink_Qwensci/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - 
"source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwensci", - "id": "sumink/Qwensci", - "developer": "sumink", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.543 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.174 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3282 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0204 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2584 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3609 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.126 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sumink/bbhqwen/07a08dd7-822b-49ac-859b-d2fc75b9c88d.json b/data/hfopenllm_v2/sumink/bbhqwen/07a08dd7-822b-49ac-859b-d2fc75b9c88d.json deleted file mode 100644 index c216d7b5c..000000000 --- a/data/hfopenllm_v2/sumink/bbhqwen/07a08dd7-822b-49ac-859b-d2fc75b9c88d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sumink_bbhqwen/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "bbhqwen", - "id": "sumink/bbhqwen", - "developer": "sumink", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - 
"architecture": "Qwen2ForCausalLM", - "params_billions": 3.086 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1809 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3388 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0106 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2576 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4352 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1617 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sumink/bbhqwen2/0c0e9250-b75a-4549-9fb2-2b5c9ac2ef49.json b/data/hfopenllm_v2/sumink/bbhqwen2/0c0e9250-b75a-4549-9fb2-2b5c9ac2ef49.json deleted file mode 100644 index 1dc18132d..000000000 --- a/data/hfopenllm_v2/sumink/bbhqwen2/0c0e9250-b75a-4549-9fb2-2b5c9ac2ef49.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sumink_bbhqwen2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "bbhqwen2", - "id": "sumink/bbhqwen2", - "developer": "sumink", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.086 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1533 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3066 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.006 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2626 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4431 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1149 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sumink/bbhqwen3/2ae306b1-5409-4418-b5e4-50feff9dafe7.json b/data/hfopenllm_v2/sumink/bbhqwen3/2ae306b1-5409-4418-b5e4-50feff9dafe7.json deleted file mode 100644 index d43a52330..000000000 --- a/data/hfopenllm_v2/sumink/bbhqwen3/2ae306b1-5409-4418-b5e4-50feff9dafe7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sumink_bbhqwen3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "bbhqwen3", - "id": "sumink/bbhqwen3", - "developer": "sumink", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.086 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1943 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2951 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2576 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3796 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1166 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sumink/bbhqwen4/44bf5d75-afb2-48fa-a0fa-96d283b0ae94.json b/data/hfopenllm_v2/sumink/bbhqwen4/44bf5d75-afb2-48fa-a0fa-96d283b0ae94.json deleted file mode 100644 index 7b63e0384..000000000 --- a/data/hfopenllm_v2/sumink/bbhqwen4/44bf5d75-afb2-48fa-a0fa-96d283b0ae94.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sumink_bbhqwen4/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "bbhqwen4", - "id": "sumink/bbhqwen4", - "developer": "sumink", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.086 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1449 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3199 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": 
"DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.006 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2441 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4029 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1509 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sumink/bbhqwen5/e3860bb2-b2e4-4fdf-91cb-3343ad6440d7.json b/data/hfopenllm_v2/sumink/bbhqwen5/e3860bb2-b2e4-4fdf-91cb-3343ad6440d7.json deleted file mode 100644 index d06e6e52f..000000000 --- a/data/hfopenllm_v2/sumink/bbhqwen5/e3860bb2-b2e4-4fdf-91cb-3343ad6440d7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sumink_bbhqwen5/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "bbhqwen5", - "id": "sumink/bbhqwen5", - "developer": "sumink", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.086 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1522 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2913 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0023 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - 
"dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2601 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4019 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1131 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sumink/bbhqwen6/6369fceb-148f-4491-9488-420182a9838f.json b/data/hfopenllm_v2/sumink/bbhqwen6/6369fceb-148f-4491-9488-420182a9838f.json deleted file mode 100644 index 80f188511..000000000 --- a/data/hfopenllm_v2/sumink/bbhqwen6/6369fceb-148f-4491-9488-420182a9838f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sumink_bbhqwen6/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "bbhqwen6", - "id": "sumink/bbhqwen6", - "developer": "sumink", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.086 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1893 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2782 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0008 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2584 - } - }, - { - 
"evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.358 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1153 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sumink/flflmillama/045c814e-a30f-4b6b-b4f4-382dee4063b7.json b/data/hfopenllm_v2/sumink/flflmillama/045c814e-a30f-4b6b-b4f4-382dee4063b7.json deleted file mode 100644 index 05e8e4568..000000000 --- a/data/hfopenllm_v2/sumink/flflmillama/045c814e-a30f-4b6b-b4f4-382dee4063b7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sumink_flflmillama/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "flflmillama", - "id": "sumink/flflmillama", - "developer": "sumink", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1676 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3851 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0196 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2919 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.3591 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2096 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sumink/ftgpt/59d2b375-5696-47d0-9c96-1a826c08bea0.json b/data/hfopenllm_v2/sumink/ftgpt/59d2b375-5696-47d0-9c96-1a826c08bea0.json deleted file mode 100644 index 697ef4b76..000000000 --- a/data/hfopenllm_v2/sumink/ftgpt/59d2b375-5696-47d0-9c96-1a826c08bea0.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sumink_ftgpt/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ftgpt", - "id": "sumink/ftgpt", - "developer": "sumink", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "GPT2LMHeadModel", - "params_billions": 0.124 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0787 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2919 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2643 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4138 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, 
- "max_score": 1.0 - }, - "score_details": { - "score": 0.1172 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sumink/llamaft/ff601b4f-24a1-4376-8c5e-5bda2ea88f65.json b/data/hfopenllm_v2/sumink/llamaft/ff601b4f-24a1-4376-8c5e-5bda2ea88f65.json deleted file mode 100644 index 69e1ab691..000000000 --- a/data/hfopenllm_v2/sumink/llamaft/ff601b4f-24a1-4376-8c5e-5bda2ea88f65.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sumink_llamaft/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "llamaft", - "id": "sumink/llamaft", - "developer": "sumink", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1609 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3763 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0166 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.271 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3498 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2114 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sumink/llamamerge/8c043ba8-f7dd-4cc8-a3b1-7201042b8dc8.json b/data/hfopenllm_v2/sumink/llamamerge/8c043ba8-f7dd-4cc8-a3b1-7201042b8dc8.json deleted file mode 100644 index f90932f9e..000000000 --- 
a/data/hfopenllm_v2/sumink/llamamerge/8c043ba8-f7dd-4cc8-a3b1-7201042b8dc8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sumink_llamamerge/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "llamamerge", - "id": "sumink/llamamerge", - "developer": "sumink", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 13.016 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2672 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4632 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0151 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2987 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.424 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.259 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sumink/llftfl7/ce27dff4-9ca7-47cb-bc18-b5dd167c72a2.json b/data/hfopenllm_v2/sumink/llftfl7/ce27dff4-9ca7-47cb-bc18-b5dd167c72a2.json deleted file mode 100644 index 454849f8b..000000000 --- a/data/hfopenllm_v2/sumink/llftfl7/ce27dff4-9ca7-47cb-bc18-b5dd167c72a2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sumink_llftfl7/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - 
"source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "llftfl7", - "id": "sumink/llftfl7", - "developer": "sumink", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1714 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3786 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0106 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.281 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3632 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1743 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sumink/llmer/d69ecbfa-5036-48b8-8fed-f9162e2857f5.json b/data/hfopenllm_v2/sumink/llmer/d69ecbfa-5036-48b8-8fed-f9162e2857f5.json deleted file mode 100644 index 2b5e1209f..000000000 --- a/data/hfopenllm_v2/sumink/llmer/d69ecbfa-5036-48b8-8fed-f9162e2857f5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sumink_llmer/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "llmer", - "id": "sumink/llmer", - "developer": "sumink", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - 
"params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3191 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4885 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.065 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2978 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4039 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3529 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sumink/qwft/b5924329-c182-482a-bee8-22fcb348281d.json b/data/hfopenllm_v2/sumink/qwft/b5924329-c182-482a-bee8-22fcb348281d.json deleted file mode 100644 index 734732a61..000000000 --- a/data/hfopenllm_v2/sumink/qwft/b5924329-c182-482a-bee8-22fcb348281d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sumink_qwft/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "qwft", - "id": "sumink/qwft", - "developer": "sumink", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.1197 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3002 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2525 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3581 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1129 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sumink/qwmer/a6a6b6f2-ac28-4c4a-806e-8abe8c7f9190.json b/data/hfopenllm_v2/sumink/qwmer/a6a6b6f2-ac28-4c4a-806e-8abe8c7f9190.json deleted file mode 100644 index 88f04b8bd..000000000 --- a/data/hfopenllm_v2/sumink/qwmer/a6a6b6f2-ac28-4c4a-806e-8abe8c7f9190.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sumink_qwmer/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "qwmer", - "id": "sumink/qwmer", - "developer": "sumink", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2212 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 
0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4299 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0008 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2869 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4032 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2215 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sumink/solarmer3/b904301c-d0c0-41a4-b92e-92b2d7c9c13a.json b/data/hfopenllm_v2/sumink/solarmer3/b904301c-d0c0-41a4-b92e-92b2d7c9c13a.json deleted file mode 100644 index ac7e81ef1..000000000 --- a/data/hfopenllm_v2/sumink/solarmer3/b904301c-d0c0-41a4-b92e-92b2d7c9c13a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sumink_solarmer3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "solarmer3", - "id": "sumink/solarmer3", - "developer": "sumink", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 10.732 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3741 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5266 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on 
MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0582 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2911 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4401 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3323 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sumink/somer/b5de0218-91dc-487a-be90-70f8bcb64803.json b/data/hfopenllm_v2/sumink/somer/b5de0218-91dc-487a-be90-70f8bcb64803.json deleted file mode 100644 index 3dadaebd8..000000000 --- a/data/hfopenllm_v2/sumink/somer/b5de0218-91dc-487a-be90-70f8bcb64803.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sumink_somer/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "somer", - "id": "sumink/somer", - "developer": "sumink", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 10.732 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.299 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5194 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0415 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2987 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.465 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3447 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sumink/somer2/3870f65b-3429-45c2-846f-6af30155a78b.json b/data/hfopenllm_v2/sumink/somer2/3870f65b-3429-45c2-846f-6af30155a78b.json deleted file mode 100644 index f13d9325b..000000000 --- a/data/hfopenllm_v2/sumink/somer2/3870f65b-3429-45c2-846f-6af30155a78b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sumink_somer2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "somer2", - "id": "sumink/somer2", - "developer": "sumink", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 10.732 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3132 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5167 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0468 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3037 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4663 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3433 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sumink/somerft/d6c33a51-be09-4cb5-9942-4348668d3e5e.json b/data/hfopenllm_v2/sumink/somerft/d6c33a51-be09-4cb5-9942-4348668d3e5e.json deleted file mode 100644 index fd584a2ba..000000000 --- a/data/hfopenllm_v2/sumink/somerft/d6c33a51-be09-4cb5-9942-4348668d3e5e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sumink_somerft/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "somerft", - "id": "sumink/somerft", - "developer": "sumink", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.543 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1431 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3093 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0144 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2483 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4045 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - 
"hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1117 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/sunbaby/BrainCog-8B-0.1-Instruct/1ccd36ee-445a-4861-8835-d602973148fc.json b/data/hfopenllm_v2/sunbaby/BrainCog-8B-0.1-Instruct/1ccd36ee-445a-4861-8835-d602973148fc.json deleted file mode 100644 index af3751a29..000000000 --- a/data/hfopenllm_v2/sunbaby/BrainCog-8B-0.1-Instruct/1ccd36ee-445a-4861-8835-d602973148fc.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/sunbaby_BrainCog-8B-0.1-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "BrainCog-8B-0.1-Instruct", - "id": "sunbaby/BrainCog-8B-0.1-Instruct", - "developer": "sunbaby", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4253 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4618 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0967 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3012 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3656 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": 
{ - "score": 0.2858 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/swap-uniba/LLaMAntino-3-ANITA-8B-Inst-DPO-ITA/4c7ef4ee-3a7e-4f15-8a4a-c5853b1c6a47.json b/data/hfopenllm_v2/swap-uniba/LLaMAntino-3-ANITA-8B-Inst-DPO-ITA/4c7ef4ee-3a7e-4f15-8a4a-c5853b1c6a47.json deleted file mode 100644 index 8d0457f9c..000000000 --- a/data/hfopenllm_v2/swap-uniba/LLaMAntino-3-ANITA-8B-Inst-DPO-ITA/4c7ef4ee-3a7e-4f15-8a4a-c5853b1c6a47.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/swap-uniba_LLaMAntino-3-ANITA-8B-Inst-DPO-ITA/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "LLaMAntino-3-ANITA-8B-Inst-DPO-ITA", - "id": "swap-uniba/LLaMAntino-3-ANITA-8B-Inst-DPO-ITA", - "developer": "swap-uniba", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4815 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4936 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0483 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2987 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4387 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3723 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/synergetic/FrankenQwen2.5-14B/6a69202c-1c68-43e4-bd45-bbc2ff2db743.json b/data/hfopenllm_v2/synergetic/FrankenQwen2.5-14B/6a69202c-1c68-43e4-bd45-bbc2ff2db743.json deleted file mode 100644 index 5de286878..000000000 --- a/data/hfopenllm_v2/synergetic/FrankenQwen2.5-14B/6a69202c-1c68-43e4-bd45-bbc2ff2db743.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/synergetic_FrankenQwen2.5-14B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "FrankenQwen2.5-14B", - "id": "synergetic/FrankenQwen2.5-14B", - "developer": "synergetic", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 16.972 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1869 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6048 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2701 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3843 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4382 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/talha2001/Beast-Soul-new/a053d6a3-05d4-4d0b-a9b8-7865cf7ac612.json b/data/hfopenllm_v2/talha2001/Beast-Soul-new/a053d6a3-05d4-4d0b-a9b8-7865cf7ac612.json deleted file mode 100644 index 3ca4e52f3..000000000 --- 
a/data/hfopenllm_v2/talha2001/Beast-Soul-new/a053d6a3-05d4-4d0b-a9b8-7865cf7ac612.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/talha2001_Beast-Soul-new/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Beast-Soul-new", - "id": "talha2001/Beast-Soul-new", - "developer": "talha2001", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4854 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5227 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.074 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2819 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4459 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3102 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/tangledgroup/tangled-llama-pints-1.5b-v0.1-instruct/f76d3d30-4fce-48a9-a26b-7d714fff1d29.json b/data/hfopenllm_v2/tangledgroup/tangled-llama-pints-1.5b-v0.1-instruct/f76d3d30-4fce-48a9-a26b-7d714fff1d29.json deleted file mode 100644 index 2cab635d2..000000000 --- a/data/hfopenllm_v2/tangledgroup/tangled-llama-pints-1.5b-v0.1-instruct/f76d3d30-4fce-48a9-a26b-7d714fff1d29.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/tangledgroup_tangled-llama-pints-1.5b-v0.1-instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "tangled-llama-pints-1.5b-v0.1-instruct", - "id": "tangledgroup/tangled-llama-pints-1.5b-v0.1-instruct", - "developer": "tangledgroup", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.5 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1509 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3143 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0121 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2399 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3761 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1109 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/tangledgroup/tangled-llama-pints-1.5b-v0.2-instruct/eb38a092-1b56-4348-8188-baa2243f7046.json b/data/hfopenllm_v2/tangledgroup/tangled-llama-pints-1.5b-v0.2-instruct/eb38a092-1b56-4348-8188-baa2243f7046.json deleted file mode 100644 index 306584ab5..000000000 --- a/data/hfopenllm_v2/tangledgroup/tangled-llama-pints-1.5b-v0.2-instruct/eb38a092-1b56-4348-8188-baa2243f7046.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/tangledgroup_tangled-llama-pints-1.5b-v0.2-instruct/1770682486.623709", - "retrieved_timestamp": 
"1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "tangled-llama-pints-1.5b-v0.2-instruct", - "id": "tangledgroup/tangled-llama-pints-1.5b-v0.2-instruct", - "developer": "tangledgroup", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.5 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1724 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3158 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0128 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2416 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3643 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1117 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/tanliboy/lambda-gemma-2-9b-dpo/1c4cfb94-fc66-4fe2-9879-78683abe654f.json b/data/hfopenllm_v2/tanliboy/lambda-gemma-2-9b-dpo/1c4cfb94-fc66-4fe2-9879-78683abe654f.json deleted file mode 100644 index c4123e916..000000000 --- a/data/hfopenllm_v2/tanliboy/lambda-gemma-2-9b-dpo/1c4cfb94-fc66-4fe2-9879-78683abe654f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/tanliboy_lambda-gemma-2-9b-dpo/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" 
- }, - "model_info": { - "name": "lambda-gemma-2-9b-dpo", - "id": "tanliboy/lambda-gemma-2-9b-dpo", - "developer": "tanliboy", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 9.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4501 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5472 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0944 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3138 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4017 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3792 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/tanliboy/lambda-gemma-2-9b-dpo/2deef730-c37b-46ca-82b7-de38ae724fd4.json b/data/hfopenllm_v2/tanliboy/lambda-gemma-2-9b-dpo/2deef730-c37b-46ca-82b7-de38ae724fd4.json deleted file mode 100644 index a0e743c57..000000000 --- a/data/hfopenllm_v2/tanliboy/lambda-gemma-2-9b-dpo/2deef730-c37b-46ca-82b7-de38ae724fd4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/tanliboy_lambda-gemma-2-9b-dpo/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "lambda-gemma-2-9b-dpo", - "id": "tanliboy/lambda-gemma-2-9b-dpo", - "developer": "tanliboy", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": 
"Gemma2ForCausalLM", - "params_billions": 9.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1829 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5488 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3104 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4056 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3805 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/tanliboy/lambda-qwen2.5-14b-dpo-test/13a92beb-a8a4-4853-b2f5-1b09d3e2a64a.json b/data/hfopenllm_v2/tanliboy/lambda-qwen2.5-14b-dpo-test/13a92beb-a8a4-4853-b2f5-1b09d3e2a64a.json deleted file mode 100644 index 0ef9f7333..000000000 --- a/data/hfopenllm_v2/tanliboy/lambda-qwen2.5-14b-dpo-test/13a92beb-a8a4-4853-b2f5-1b09d3e2a64a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/tanliboy_lambda-qwen2.5-14b-dpo-test/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "lambda-qwen2.5-14b-dpo-test", - "id": "tanliboy/lambda-qwen2.5-14b-dpo-test", - "developer": "tanliboy", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": 
"google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8231 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6394 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5461 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3624 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.426 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4848 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/tanliboy/lambda-qwen2.5-32b-dpo-test/36cf5b59-5369-4baf-80c1-3a47678eb5cb.json b/data/hfopenllm_v2/tanliboy/lambda-qwen2.5-32b-dpo-test/36cf5b59-5369-4baf-80c1-3a47678eb5cb.json deleted file mode 100644 index ca6c94a3a..000000000 --- a/data/hfopenllm_v2/tanliboy/lambda-qwen2.5-32b-dpo-test/36cf5b59-5369-4baf-80c1-3a47678eb5cb.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/tanliboy_lambda-qwen2.5-32b-dpo-test/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "lambda-qwen2.5-32b-dpo-test", - "id": "tanliboy/lambda-qwen2.5-32b-dpo-test", - "developer": "tanliboy", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 32.764 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.8084 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6764 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6103 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3565 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4274 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5657 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/tannedbum/Ellaria-9B/fced3ef1-fb69-47fe-bf68-3efe72db3142.json b/data/hfopenllm_v2/tannedbum/Ellaria-9B/fced3ef1-fb69-47fe-bf68-3efe72db3142.json deleted file mode 100644 index 769518cca..000000000 --- a/data/hfopenllm_v2/tannedbum/Ellaria-9B/fced3ef1-fb69-47fe-bf68-3efe72db3142.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/tannedbum_Ellaria-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Ellaria-9B", - "id": "tannedbum/Ellaria-9B", - "developer": "tannedbum", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7826 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5942 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2077 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3331 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4151 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4205 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/tannedbum/L3-Nymeria-Maid-8B/7a83d75a-332e-476a-b0f7-986b2ec9cc5d.json b/data/hfopenllm_v2/tannedbum/L3-Nymeria-Maid-8B/7a83d75a-332e-476a-b0f7-986b2ec9cc5d.json deleted file mode 100644 index f95541ce4..000000000 --- a/data/hfopenllm_v2/tannedbum/L3-Nymeria-Maid-8B/7a83d75a-332e-476a-b0f7-986b2ec9cc5d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/tannedbum_L3-Nymeria-Maid-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "L3-Nymeria-Maid-8B", - "id": "tannedbum/L3-Nymeria-Maid-8B", - "developer": "tannedbum", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.725 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5146 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": 
"DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0937 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2961 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3751 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3747 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/tannedbum/L3-Nymeria-v2-8B/6f413d72-cd9f-435c-b13e-9cec14edeb5c.json b/data/hfopenllm_v2/tannedbum/L3-Nymeria-v2-8B/6f413d72-cd9f-435c-b13e-9cec14edeb5c.json deleted file mode 100644 index c7b09040b..000000000 --- a/data/hfopenllm_v2/tannedbum/L3-Nymeria-v2-8B/6f413d72-cd9f-435c-b13e-9cec14edeb5c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/tannedbum_L3-Nymeria-v2-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "L3-Nymeria-v2-8B", - "id": "tannedbum/L3-Nymeria-v2-8B", - "developer": "tannedbum", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7168 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5224 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0921 - } 
- }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2903 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3699 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3753 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/tannedbum/L3-Rhaenys-8B/a7822bbf-bc23-437d-8e5b-32fb06d3a9ec.json b/data/hfopenllm_v2/tannedbum/L3-Rhaenys-8B/a7822bbf-bc23-437d-8e5b-32fb06d3a9ec.json deleted file mode 100644 index c71fb151f..000000000 --- a/data/hfopenllm_v2/tannedbum/L3-Rhaenys-8B/a7822bbf-bc23-437d-8e5b-32fb06d3a9ec.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/tannedbum_L3-Rhaenys-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "L3-Rhaenys-8B", - "id": "tannedbum/L3-Rhaenys-8B", - "developer": "tannedbum", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7363 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5299 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0876 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2978 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3725 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3799 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/teknium/CollectiveCognition-v1.1-Mistral-7B/0b19508c-4996-4fb7-b0e0-9fa952854fa3.json b/data/hfopenllm_v2/teknium/CollectiveCognition-v1.1-Mistral-7B/0b19508c-4996-4fb7-b0e0-9fa952854fa3.json deleted file mode 100644 index df3839754..000000000 --- a/data/hfopenllm_v2/teknium/CollectiveCognition-v1.1-Mistral-7B/0b19508c-4996-4fb7-b0e0-9fa952854fa3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/teknium_CollectiveCognition-v1.1-Mistral-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "CollectiveCognition-v1.1-Mistral-7B", - "id": "teknium/CollectiveCognition-v1.1-Mistral-7B", - "developer": "teknium", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 7.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.279 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4493 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.031 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2869 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": 
"MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3869 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2837 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/teknium/OpenHermes-13B/447c22c1-8929-420f-b59b-01ab32a22281.json b/data/hfopenllm_v2/teknium/OpenHermes-13B/447c22c1-8929-420f-b59b-01ab32a22281.json deleted file mode 100644 index 217a4b6f0..000000000 --- a/data/hfopenllm_v2/teknium/OpenHermes-13B/447c22c1-8929-420f-b59b-01ab32a22281.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/teknium_OpenHermes-13B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "OpenHermes-13B", - "id": "teknium/OpenHermes-13B", - "developer": "teknium", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 13.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2668 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4206 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0121 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2727 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4043 - } - 
}, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2389 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/teknium/OpenHermes-2-Mistral-7B/ab3dbe43-658e-4c8a-a399-b3d070d467ba.json b/data/hfopenllm_v2/teknium/OpenHermes-2-Mistral-7B/ab3dbe43-658e-4c8a-a399-b3d070d467ba.json deleted file mode 100644 index 75e18465b..000000000 --- a/data/hfopenllm_v2/teknium/OpenHermes-2-Mistral-7B/ab3dbe43-658e-4c8a-a399-b3d070d467ba.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/teknium_OpenHermes-2-Mistral-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "OpenHermes-2-Mistral-7B", - "id": "teknium/OpenHermes-2-Mistral-7B", - "developer": "teknium", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5286 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4948 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0453 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2836 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.452 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2931 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/teknium/OpenHermes-2.5-Mistral-7B/ee5c87a4-aa06-4728-a9bf-2fc35284b987.json b/data/hfopenllm_v2/teknium/OpenHermes-2.5-Mistral-7B/ee5c87a4-aa06-4728-a9bf-2fc35284b987.json deleted file mode 100644 index 1bb9bc4d1..000000000 --- a/data/hfopenllm_v2/teknium/OpenHermes-2.5-Mistral-7B/ee5c87a4-aa06-4728-a9bf-2fc35284b987.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/teknium_OpenHermes-2.5-Mistral-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "OpenHermes-2.5-Mistral-7B", - "id": "teknium/OpenHermes-2.5-Mistral-7B", - "developer": "teknium", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5571 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.487 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0506 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2836 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4242 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3054 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/teknium/OpenHermes-7B/6a1a58f6-e399-4ac3-a516-f02a37b6ff68.json b/data/hfopenllm_v2/teknium/OpenHermes-7B/6a1a58f6-e399-4ac3-a516-f02a37b6ff68.json deleted file mode 100644 index 58a599dbe..000000000 --- a/data/hfopenllm_v2/teknium/OpenHermes-7B/6a1a58f6-e399-4ac3-a516-f02a37b6ff68.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/teknium_OpenHermes-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "OpenHermes-7B", - "id": "teknium/OpenHermes-7B", - "developer": "teknium", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 7.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1813 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.362 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0159 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2693 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4324 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1933 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/tensopolis/falcon3-10b-tensopolis-v1/9e2bfd77-b73e-436f-ad50-ccfd379cd3f2.json b/data/hfopenllm_v2/tensopolis/falcon3-10b-tensopolis-v1/9e2bfd77-b73e-436f-ad50-ccfd379cd3f2.json deleted file mode 100644 index 052c9b9d5..000000000 --- 
a/data/hfopenllm_v2/tensopolis/falcon3-10b-tensopolis-v1/9e2bfd77-b73e-436f-ad50-ccfd379cd3f2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/tensopolis_falcon3-10b-tensopolis-v1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "falcon3-10b-tensopolis-v1", - "id": "tensopolis/falcon3-10b-tensopolis-v1", - "developer": "tensopolis", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 10.306 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7817 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6182 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2749 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3297 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4375 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.442 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/tensopolis/falcon3-10b-tensopolis-v2/100cf60a-c43c-4b3a-a667-a45cffdd562a.json b/data/hfopenllm_v2/tensopolis/falcon3-10b-tensopolis-v2/100cf60a-c43c-4b3a-a667-a45cffdd562a.json deleted file mode 100644 index 67f93fc67..000000000 --- a/data/hfopenllm_v2/tensopolis/falcon3-10b-tensopolis-v2/100cf60a-c43c-4b3a-a667-a45cffdd562a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/tensopolis_falcon3-10b-tensopolis-v2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "falcon3-10b-tensopolis-v2", - "id": "tensopolis/falcon3-10b-tensopolis-v2", - "developer": "tensopolis", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 10.306 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7792 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6182 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2666 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3272 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4297 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4424 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/tensopolis/lamarckvergence-14b-tensopolis-v1/2088fca7-11d7-47de-808d-d47da0caad0f.json b/data/hfopenllm_v2/tensopolis/lamarckvergence-14b-tensopolis-v1/2088fca7-11d7-47de-808d-d47da0caad0f.json deleted file mode 100644 index 4284ac283..000000000 --- a/data/hfopenllm_v2/tensopolis/lamarckvergence-14b-tensopolis-v1/2088fca7-11d7-47de-808d-d47da0caad0f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/tensopolis_lamarckvergence-14b-tensopolis-v1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM 
v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "lamarckvergence-14b-tensopolis-v1", - "id": "tensopolis/lamarckvergence-14b-tensopolis-v1", - "developer": "tensopolis", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7604 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6561 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5166 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3607 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4475 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.525 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/tensopolis/mistral-small-2501-tensopolis-v1/bf0b3560-9d38-406a-ad30-5fd157f0fe43.json b/data/hfopenllm_v2/tensopolis/mistral-small-2501-tensopolis-v1/bf0b3560-9d38-406a-ad30-5fd157f0fe43.json deleted file mode 100644 index 26e8accb5..000000000 --- a/data/hfopenllm_v2/tensopolis/mistral-small-2501-tensopolis-v1/bf0b3560-9d38-406a-ad30-5fd157f0fe43.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/tensopolis_mistral-small-2501-tensopolis-v1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": 
"mistral-small-2501-tensopolis-v1", - "id": "tensopolis/mistral-small-2501-tensopolis-v1", - "developer": "tensopolis", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 23.572 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7762 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6475 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4441 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3574 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.428 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4465 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/tensopolis/mistral-small-r1-tensopolis/9ce12fbc-00f7-4cc8-bd9d-67ead83a0801.json b/data/hfopenllm_v2/tensopolis/mistral-small-r1-tensopolis/9ce12fbc-00f7-4cc8-bd9d-67ead83a0801.json deleted file mode 100644 index 1b98fd82a..000000000 --- a/data/hfopenllm_v2/tensopolis/mistral-small-r1-tensopolis/9ce12fbc-00f7-4cc8-bd9d-67ead83a0801.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/tensopolis_mistral-small-r1-tensopolis/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "mistral-small-r1-tensopolis", - "id": "tensopolis/mistral-small-r1-tensopolis", - "developer": "tensopolis", - "inference_platform": "unknown", - "additional_details": { - 
"precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 23.572 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4622 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5436 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2908 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2819 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3738 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4035 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/tensopolis/phi-4-tensopolis-v1/14501de3-dac0-44af-8c17-7abcd9bbba8b.json b/data/hfopenllm_v2/tensopolis/phi-4-tensopolis-v1/14501de3-dac0-44af-8c17-7abcd9bbba8b.json deleted file mode 100644 index 498b1170f..000000000 --- a/data/hfopenllm_v2/tensopolis/phi-4-tensopolis-v1/14501de3-dac0-44af-8c17-7abcd9bbba8b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/tensopolis_phi-4-tensopolis-v1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "phi-4-tensopolis-v1", - "id": "tensopolis/phi-4-tensopolis-v1", - "developer": "tensopolis", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", 
- "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6767 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6872 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.494 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3347 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4141 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5384 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/tensopolis/qwen2.5-14b-tensopolis-v1/c9db8ce4-6f0d-4c13-8484-6fca9e9c3798.json b/data/hfopenllm_v2/tensopolis/qwen2.5-14b-tensopolis-v1/c9db8ce4-6f0d-4c13-8484-6fca9e9c3798.json deleted file mode 100644 index 32b5e1f0e..000000000 --- a/data/hfopenllm_v2/tensopolis/qwen2.5-14b-tensopolis-v1/c9db8ce4-6f0d-4c13-8484-6fca9e9c3798.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/tensopolis_qwen2.5-14b-tensopolis-v1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "qwen2.5-14b-tensopolis-v1", - "id": "tensopolis/qwen2.5-14b-tensopolis-v1", - "developer": "tensopolis", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, 
- "score_details": { - "score": 0.799 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6364 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5295 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3347 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4193 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4911 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/tensopolis/qwen2.5-3b-or1-tensopolis/8c6c06be-bbc6-4307-ba5b-336dc2bb466f.json b/data/hfopenllm_v2/tensopolis/qwen2.5-3b-or1-tensopolis/8c6c06be-bbc6-4307-ba5b-336dc2bb466f.json deleted file mode 100644 index a7e6952b8..000000000 --- a/data/hfopenllm_v2/tensopolis/qwen2.5-3b-or1-tensopolis/8c6c06be-bbc6-4307-ba5b-336dc2bb466f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/tensopolis_qwen2.5-3b-or1-tensopolis/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "qwen2.5-3b-or1-tensopolis", - "id": "tensopolis/qwen2.5-3b-or1-tensopolis", - "developer": "tensopolis", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.086 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.354 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4421 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.173 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2945 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3749 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3197 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/tensopolis/qwen2.5-7b-tensopolis-v1/1326ff61-d0b4-46eb-9bcf-f978166e622b.json b/data/hfopenllm_v2/tensopolis/qwen2.5-7b-tensopolis-v1/1326ff61-d0b4-46eb-9bcf-f978166e622b.json deleted file mode 100644 index 2517471e4..000000000 --- a/data/hfopenllm_v2/tensopolis/qwen2.5-7b-tensopolis-v1/1326ff61-d0b4-46eb-9bcf-f978166e622b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/tensopolis_qwen2.5-7b-tensopolis-v1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "qwen2.5-7b-tensopolis-v1", - "id": "tensopolis/qwen2.5-7b-tensopolis-v1", - "developer": "tensopolis", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7661 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5379 - } - 
}, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4562 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2961 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4339 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4269 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/tensopolis/qwen2.5-7b-tensopolis-v2/4c9e829f-7a99-4d61-8730-7457215a4fd6.json b/data/hfopenllm_v2/tensopolis/qwen2.5-7b-tensopolis-v2/4c9e829f-7a99-4d61-8730-7457215a4fd6.json deleted file mode 100644 index eb25ba8f2..000000000 --- a/data/hfopenllm_v2/tensopolis/qwen2.5-7b-tensopolis-v2/4c9e829f-7a99-4d61-8730-7457215a4fd6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/tensopolis_qwen2.5-7b-tensopolis-v2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "qwen2.5-7b-tensopolis-v2", - "id": "tensopolis/qwen2.5-7b-tensopolis-v2", - "developer": "tensopolis", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7521 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5415 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - 
"evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4819 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2903 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4246 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4243 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/tensopolis/virtuoso-lite-tensopolis-v1/afc24d42-6d25-4036-8f22-fcf944b481b7.json b/data/hfopenllm_v2/tensopolis/virtuoso-lite-tensopolis-v1/afc24d42-6d25-4036-8f22-fcf944b481b7.json deleted file mode 100644 index 03ab187f1..000000000 --- a/data/hfopenllm_v2/tensopolis/virtuoso-lite-tensopolis-v1/afc24d42-6d25-4036-8f22-fcf944b481b7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/tensopolis_virtuoso-lite-tensopolis-v1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "virtuoso-lite-tensopolis-v1", - "id": "tensopolis/virtuoso-lite-tensopolis-v1", - "developer": "tensopolis", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 10.306 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8069 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6102 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 
0.2545 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3448 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4582 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4435 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/tensopolis/virtuoso-lite-tensopolis-v2/6f6db681-991e-408b-8d4e-71fff9e1c974.json b/data/hfopenllm_v2/tensopolis/virtuoso-lite-tensopolis-v2/6f6db681-991e-408b-8d4e-71fff9e1c974.json deleted file mode 100644 index 21561cd85..000000000 --- a/data/hfopenllm_v2/tensopolis/virtuoso-lite-tensopolis-v2/6f6db681-991e-408b-8d4e-71fff9e1c974.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/tensopolis_virtuoso-lite-tensopolis-v2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "virtuoso-lite-tensopolis-v2", - "id": "tensopolis/virtuoso-lite-tensopolis-v2", - "developer": "tensopolis", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 10.306 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8029 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.61 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.25 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3431 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4595 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.444 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/tensopolis/virtuoso-small-tensopolis-v1/f3fa76bf-f11c-4dee-9b9f-00f1ec793dac.json b/data/hfopenllm_v2/tensopolis/virtuoso-small-tensopolis-v1/f3fa76bf-f11c-4dee-9b9f-00f1ec793dac.json deleted file mode 100644 index 636b8d846..000000000 --- a/data/hfopenllm_v2/tensopolis/virtuoso-small-tensopolis-v1/f3fa76bf-f11c-4dee-9b9f-00f1ec793dac.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/tensopolis_virtuoso-small-tensopolis-v1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "virtuoso-small-tensopolis-v1", - "id": "tensopolis/virtuoso-small-tensopolis-v1", - "developer": "tensopolis", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7856 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6415 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3527 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.328 - 
} - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4326 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4968 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/tensopolis/virtuoso-small-tensopolis-v2/77b457d9-4957-4f0d-a8d3-e005ae382239.json b/data/hfopenllm_v2/tensopolis/virtuoso-small-tensopolis-v2/77b457d9-4957-4f0d-a8d3-e005ae382239.json deleted file mode 100644 index 243e5f94d..000000000 --- a/data/hfopenllm_v2/tensopolis/virtuoso-small-tensopolis-v2/77b457d9-4957-4f0d-a8d3-e005ae382239.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/tensopolis_virtuoso-small-tensopolis-v2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "virtuoso-small-tensopolis-v2", - "id": "tensopolis/virtuoso-small-tensopolis-v2", - "developer": "tensopolis", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.802 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6516 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3875 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3289 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4352 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5154 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/tensopolis/virtuoso-small-v2-tensopolis-v1/11474a7a-73a6-4a3f-8bcb-bef783e12a2b.json b/data/hfopenllm_v2/tensopolis/virtuoso-small-v2-tensopolis-v1/11474a7a-73a6-4a3f-8bcb-bef783e12a2b.json deleted file mode 100644 index 24a089280..000000000 --- a/data/hfopenllm_v2/tensopolis/virtuoso-small-v2-tensopolis-v1/11474a7a-73a6-4a3f-8bcb-bef783e12a2b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/tensopolis_virtuoso-small-v2-tensopolis-v1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "virtuoso-small-v2-tensopolis-v1", - "id": "tensopolis/virtuoso-small-v2-tensopolis-v1", - "developer": "tensopolis", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8419 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6545 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4524 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3465 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { 
- "score": 0.4509 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5175 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/tensoropera/Fox-1-1.6B/23cc1e7f-0994-43a5-8403-5361a2976285.json b/data/hfopenllm_v2/tensoropera/Fox-1-1.6B/23cc1e7f-0994-43a5-8403-5361a2976285.json deleted file mode 100644 index fa1deaf6f..000000000 --- a/data/hfopenllm_v2/tensoropera/Fox-1-1.6B/23cc1e7f-0994-43a5-8403-5361a2976285.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/tensoropera_Fox-1-1.6B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Fox-1-1.6B", - "id": "tensoropera/Fox-1-1.6B", - "developer": "tensoropera", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.665 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2766 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3307 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0174 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2634 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.355 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1371 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/tenyx/Llama3-TenyxChat-70B/88c257d3-d5c1-4e1f-bbc8-9fc6bd65e15e.json b/data/hfopenllm_v2/tenyx/Llama3-TenyxChat-70B/88c257d3-d5c1-4e1f-bbc8-9fc6bd65e15e.json deleted file mode 100644 index f42d9a03a..000000000 --- a/data/hfopenllm_v2/tenyx/Llama3-TenyxChat-70B/88c257d3-d5c1-4e1f-bbc8-9fc6bd65e15e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/tenyx_Llama3-TenyxChat-70B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama3-TenyxChat-70B", - "id": "tenyx/Llama3-TenyxChat-70B", - "developer": "tenyx", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 70.554 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8087 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6511 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2356 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3012 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.426 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.521 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/theo77186/Qwen2.5-Coder-7B-Instruct-20241106/ec4c2032-8fc0-448a-a7c4-ee9b35b642db.json b/data/hfopenllm_v2/theo77186/Qwen2.5-Coder-7B-Instruct-20241106/ec4c2032-8fc0-448a-a7c4-ee9b35b642db.json deleted file mode 100644 index 21cc408b5..000000000 --- a/data/hfopenllm_v2/theo77186/Qwen2.5-Coder-7B-Instruct-20241106/ec4c2032-8fc0-448a-a7c4-ee9b35b642db.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/theo77186_Qwen2.5-Coder-7B-Instruct-20241106/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-Coder-7B-Instruct-20241106", - "id": "theo77186/Qwen2.5-Coder-7B-Instruct-20241106", - "developer": "theo77186", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6101 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5008 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3882 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2919 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4073 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3353 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/theprint/Boptruth-Agatha-7B/3c7ac4de-1456-4afb-b7ac-07beb6cb4d39.json 
b/data/hfopenllm_v2/theprint/Boptruth-Agatha-7B/3c7ac4de-1456-4afb-b7ac-07beb6cb4d39.json deleted file mode 100644 index 096a756a8..000000000 --- a/data/hfopenllm_v2/theprint/Boptruth-Agatha-7B/3c7ac4de-1456-4afb-b7ac-07beb6cb4d39.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/theprint_Boptruth-Agatha-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Boptruth-Agatha-7B", - "id": "theprint/Boptruth-Agatha-7B", - "developer": "theprint", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3124 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4984 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0551 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2995 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4277 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2861 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/theprint/CleverBoi-7B-v2/a06ad94f-13ee-466c-b25f-87cd87012678.json b/data/hfopenllm_v2/theprint/CleverBoi-7B-v2/a06ad94f-13ee-466c-b25f-87cd87012678.json deleted file mode 100644 index 1d1af6ab3..000000000 --- a/data/hfopenllm_v2/theprint/CleverBoi-7B-v2/a06ad94f-13ee-466c-b25f-87cd87012678.json +++ /dev/null @@ -1,132 +0,0 @@ 
-{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/theprint_CleverBoi-7B-v2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "CleverBoi-7B-v2", - "id": "theprint/CleverBoi-7B-v2", - "developer": "theprint", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "?", - "params_billions": 7.736 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.217 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4532 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0264 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2886 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4695 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2709 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/theprint/CleverBoi-7B-v3/9e1ca6d0-d2b2-48c5-acc2-ad299ce02e1f.json b/data/hfopenllm_v2/theprint/CleverBoi-7B-v3/9e1ca6d0-d2b2-48c5-acc2-ad299ce02e1f.json deleted file mode 100644 index 15f7a9b45..000000000 --- a/data/hfopenllm_v2/theprint/CleverBoi-7B-v3/9e1ca6d0-d2b2-48c5-acc2-ad299ce02e1f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/theprint_CleverBoi-7B-v3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", 
- "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "CleverBoi-7B-v3", - "id": "theprint/CleverBoi-7B-v3", - "developer": "theprint", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "?", - "params_billions": 7.736 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2382 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4414 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.04 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2659 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4072 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2868 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/theprint/CleverBoi-Llama-3.1-8B-Instruct/7dcd6e37-3685-4b08-b983-b2a711aeaf73.json b/data/hfopenllm_v2/theprint/CleverBoi-Llama-3.1-8B-Instruct/7dcd6e37-3685-4b08-b983-b2a711aeaf73.json deleted file mode 100644 index a63c67af4..000000000 --- a/data/hfopenllm_v2/theprint/CleverBoi-Llama-3.1-8B-Instruct/7dcd6e37-3685-4b08-b983-b2a711aeaf73.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/theprint_CleverBoi-Llama-3.1-8B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "CleverBoi-Llama-3.1-8B-Instruct", - "id": "theprint/CleverBoi-Llama-3.1-8B-Instruct", - "developer": "theprint", - "inference_platform": "unknown", - 
"additional_details": { - "precision": "bfloat16", - "architecture": "?", - "params_billions": 16.061 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1682 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.456 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0491 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3003 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4014 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3075 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/theprint/CleverBoi-Llama-3.1-8B-v2/b1ae6801-0139-41d3-85dc-102ad5cc4c6a.json b/data/hfopenllm_v2/theprint/CleverBoi-Llama-3.1-8B-v2/b1ae6801-0139-41d3-85dc-102ad5cc4c6a.json deleted file mode 100644 index f1cf9de8b..000000000 --- a/data/hfopenllm_v2/theprint/CleverBoi-Llama-3.1-8B-v2/b1ae6801-0139-41d3-85dc-102ad5cc4c6a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/theprint_CleverBoi-Llama-3.1-8B-v2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "CleverBoi-Llama-3.1-8B-v2", - "id": "theprint/CleverBoi-Llama-3.1-8B-v2", - "developer": "theprint", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "?", - "params_billions": 9.3 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": 
"hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1961 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4668 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0529 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2861 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3735 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3188 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/theprint/CleverBoi-Nemo-12B-v2/4cc037a2-d952-4566-a575-015f8e3a5925.json b/data/hfopenllm_v2/theprint/CleverBoi-Nemo-12B-v2/4cc037a2-d952-4566-a575-015f8e3a5925.json deleted file mode 100644 index 76b2de2d9..000000000 --- a/data/hfopenllm_v2/theprint/CleverBoi-Nemo-12B-v2/4cc037a2-d952-4566-a575-015f8e3a5925.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/theprint_CleverBoi-Nemo-12B-v2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "CleverBoi-Nemo-12B-v2", - "id": "theprint/CleverBoi-Nemo-12B-v2", - "developer": "theprint", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "?", - "params_billions": 13.933 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 
0.2046 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5241 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1035 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3138 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4187 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3228 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/theprint/Code-Llama-Bagel-8B/a1eaadae-8601-4c18-ab0c-4f6d80d3307b.json b/data/hfopenllm_v2/theprint/Code-Llama-Bagel-8B/a1eaadae-8601-4c18-ab0c-4f6d80d3307b.json deleted file mode 100644 index b1815116f..000000000 --- a/data/hfopenllm_v2/theprint/Code-Llama-Bagel-8B/a1eaadae-8601-4c18-ab0c-4f6d80d3307b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/theprint_Code-Llama-Bagel-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Code-Llama-Bagel-8B", - "id": "theprint/Code-Llama-Bagel-8B", - "developer": "theprint", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.253 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, 
- "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4697 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0612 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.276 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.368 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2822 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/theprint/Conversely-Mistral-7B/40e452df-8f0a-4473-a3d1-41f9c288c12f.json b/data/hfopenllm_v2/theprint/Conversely-Mistral-7B/40e452df-8f0a-4473-a3d1-41f9c288c12f.json deleted file mode 100644 index 71ebe1082..000000000 --- a/data/hfopenllm_v2/theprint/Conversely-Mistral-7B/40e452df-8f0a-4473-a3d1-41f9c288c12f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/theprint_Conversely-Mistral-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Conversely-Mistral-7B", - "id": "theprint/Conversely-Mistral-7B", - "developer": "theprint", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "?", - "params_billions": 14.496 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2608 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4672 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - 
"hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0279 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2852 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4189 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2826 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/theprint/Llama-3.2-3B-VanRossum/216020ac-276b-436e-815b-d6968eb83770.json b/data/hfopenllm_v2/theprint/Llama-3.2-3B-VanRossum/216020ac-276b-436e-815b-d6968eb83770.json deleted file mode 100644 index aa1889b1c..000000000 --- a/data/hfopenllm_v2/theprint/Llama-3.2-3B-VanRossum/216020ac-276b-436e-815b-d6968eb83770.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/theprint_Llama-3.2-3B-VanRossum/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.2-3B-VanRossum", - "id": "theprint/Llama-3.2-3B-VanRossum", - "developer": "theprint", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "?", - "params_billions": 3.696 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4783 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4279 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.0974 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2676 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3442 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.277 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/theprint/ReWiz-7B/1bb4aeac-a5e1-4fd7-9e70-64fdcfc600cd.json b/data/hfopenllm_v2/theprint/ReWiz-7B/1bb4aeac-a5e1-4fd7-9e70-64fdcfc600cd.json deleted file mode 100644 index 28eca97a3..000000000 --- a/data/hfopenllm_v2/theprint/ReWiz-7B/1bb4aeac-a5e1-4fd7-9e70-64fdcfc600cd.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/theprint_ReWiz-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ReWiz-7B", - "id": "theprint/ReWiz-7B", - "developer": "theprint", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "?", - "params_billions": 7.736 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4048 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4564 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0408 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 
0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2752 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4612 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.267 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/theprint/ReWiz-Llama-3.1-8B-v2/25739611-f690-41b4-87de-9f4ea8b3d815.json b/data/hfopenllm_v2/theprint/ReWiz-Llama-3.1-8B-v2/25739611-f690-41b4-87de-9f4ea8b3d815.json deleted file mode 100644 index f45735570..000000000 --- a/data/hfopenllm_v2/theprint/ReWiz-Llama-3.1-8B-v2/25739611-f690-41b4-87de-9f4ea8b3d815.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/theprint_ReWiz-Llama-3.1-8B-v2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ReWiz-Llama-3.1-8B-v2", - "id": "theprint/ReWiz-Llama-3.1-8B-v2", - "developer": "theprint", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "?", - "params_billions": 9.3 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2379 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4632 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0574 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3029 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3814 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.331 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/theprint/ReWiz-Llama-3.2-3B/b8c27fdd-5b35-41ab-8a35-b5a48f27cceb.json b/data/hfopenllm_v2/theprint/ReWiz-Llama-3.2-3B/b8c27fdd-5b35-41ab-8a35-b5a48f27cceb.json deleted file mode 100644 index e0c5c3d8e..000000000 --- a/data/hfopenllm_v2/theprint/ReWiz-Llama-3.2-3B/b8c27fdd-5b35-41ab-8a35-b5a48f27cceb.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/theprint_ReWiz-Llama-3.2-3B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ReWiz-Llama-3.2-3B", - "id": "theprint/ReWiz-Llama-3.2-3B", - "developer": "theprint", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4649 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4343 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1095 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2836 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3614 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - 
"dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2887 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/theprint/ReWiz-Nemo-12B-Instruct/fa237949-c3ac-482a-8a54-5a2019f24016.json b/data/hfopenllm_v2/theprint/ReWiz-Nemo-12B-Instruct/fa237949-c3ac-482a-8a54-5a2019f24016.json deleted file mode 100644 index 121036620..000000000 --- a/data/hfopenllm_v2/theprint/ReWiz-Nemo-12B-Instruct/fa237949-c3ac-482a-8a54-5a2019f24016.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/theprint_ReWiz-Nemo-12B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ReWiz-Nemo-12B-Instruct", - "id": "theprint/ReWiz-Nemo-12B-Instruct", - "developer": "theprint", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 12.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1062 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5092 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1042 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3238 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4096 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3339 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/theprint/ReWiz-Qwen-2.5-14B/b60dd828-a3e7-46a8-b4c2-322aeca42faf.json b/data/hfopenllm_v2/theprint/ReWiz-Qwen-2.5-14B/b60dd828-a3e7-46a8-b4c2-322aeca42faf.json deleted file mode 100644 index c794f72d9..000000000 --- a/data/hfopenllm_v2/theprint/ReWiz-Qwen-2.5-14B/b60dd828-a3e7-46a8-b4c2-322aeca42faf.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/theprint_ReWiz-Qwen-2.5-14B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ReWiz-Qwen-2.5-14B", - "id": "theprint/ReWiz-Qwen-2.5-14B", - "developer": "theprint", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "?", - "params_billions": 16.743 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2785 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6179 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2923 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.38 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4539 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5092 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/theprint/ReWiz-Worldbuilder-7B/5de9f914-333f-4181-a93f-79257a3daf54.json 
b/data/hfopenllm_v2/theprint/ReWiz-Worldbuilder-7B/5de9f914-333f-4181-a93f-79257a3daf54.json deleted file mode 100644 index 499adb749..000000000 --- a/data/hfopenllm_v2/theprint/ReWiz-Worldbuilder-7B/5de9f914-333f-4181-a93f-79257a3daf54.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/theprint_ReWiz-Worldbuilder-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ReWiz-Worldbuilder-7B", - "id": "theprint/ReWiz-Worldbuilder-7B", - "developer": "theprint", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 7.248 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.251 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4636 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.037 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2693 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4572 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2971 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/theprint/RuDolph-Hermes-7B/e2d23da4-226a-4a02-8390-e8edaea4b65b.json b/data/hfopenllm_v2/theprint/RuDolph-Hermes-7B/e2d23da4-226a-4a02-8390-e8edaea4b65b.json deleted file mode 100644 index b8a758ee2..000000000 --- a/data/hfopenllm_v2/theprint/RuDolph-Hermes-7B/e2d23da4-226a-4a02-8390-e8edaea4b65b.json +++ 
/dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/theprint_RuDolph-Hermes-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "RuDolph-Hermes-7B", - "id": "theprint/RuDolph-Hermes-7B", - "developer": "theprint", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3604 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5053 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0514 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3121 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4226 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3073 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/theprint/WorldBuilder-12B/c64c7470-dcf9-46f8-b789-cab7e902739d.json b/data/hfopenllm_v2/theprint/WorldBuilder-12B/c64c7470-dcf9-46f8-b789-cab7e902739d.json deleted file mode 100644 index 70af1d5c7..000000000 --- a/data/hfopenllm_v2/theprint/WorldBuilder-12B/c64c7470-dcf9-46f8-b789-cab7e902739d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/theprint_WorldBuilder-12B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": 
"documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "WorldBuilder-12B", - "id": "theprint/WorldBuilder-12B", - "developer": "theprint", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "?", - "params_billions": 13.933 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1374 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.501 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0446 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.297 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4066 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3192 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/theprint/phi-3-mini-4k-python/f6d727a3-19dc-4173-a88f-2c47449896aa.json b/data/hfopenllm_v2/theprint/phi-3-mini-4k-python/f6d727a3-19dc-4173-a88f-2c47449896aa.json deleted file mode 100644 index e5805f869..000000000 --- a/data/hfopenllm_v2/theprint/phi-3-mini-4k-python/f6d727a3-19dc-4173-a88f-2c47449896aa.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/theprint_phi-3-mini-4k-python/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "phi-3-mini-4k-python", - "id": "theprint/phi-3-mini-4k-python", - "developer": "theprint", - "inference_platform": "unknown", - 
"additional_details": { - "precision": "bfloat16", - "architecture": "?", - "params_billions": 4.132 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2409 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4938 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.105 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2911 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3922 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3577 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/thinkcoder/llama3-8b-instruct-lora-8-sft/490d14c8-2cb0-4328-9f41-6074b28d6fdc.json b/data/hfopenllm_v2/thinkcoder/llama3-8b-instruct-lora-8-sft/490d14c8-2cb0-4328-9f41-6074b28d6fdc.json deleted file mode 100644 index 440a5ef62..000000000 --- a/data/hfopenllm_v2/thinkcoder/llama3-8b-instruct-lora-8-sft/490d14c8-2cb0-4328-9f41-6074b28d6fdc.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/thinkcoder_llama3-8b-instruct-lora-8-sft/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "llama3-8b-instruct-lora-8-sft", - "id": "thinkcoder/llama3-8b-instruct-lora-8-sft", - "developer": "thinkcoder", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - 
"source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.648 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4865 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.102 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2668 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3235 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3476 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/thirdeyeai/elevate360m/9351b079-7ef5-42ec-bb83-f0d8ec7de479.json b/data/hfopenllm_v2/thirdeyeai/elevate360m/9351b079-7ef5-42ec-bb83-f0d8ec7de479.json deleted file mode 100644 index 2f88fa8a2..000000000 --- a/data/hfopenllm_v2/thirdeyeai/elevate360m/9351b079-7ef5-42ec-bb83-f0d8ec7de479.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/thirdeyeai_elevate360m/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "elevate360m", - "id": "thirdeyeai/elevate360m", - "developer": "thirdeyeai", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 0.362 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.0445 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2963 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0159 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2408 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3462 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1077 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/thomas-yanxin/XinYuan-Qwen2-1_5B/852d5adb-f422-4102-8114-082ab0b3c07d.json b/data/hfopenllm_v2/thomas-yanxin/XinYuan-Qwen2-1_5B/852d5adb-f422-4102-8114-082ab0b3c07d.json deleted file mode 100644 index d16c6d65e..000000000 --- a/data/hfopenllm_v2/thomas-yanxin/XinYuan-Qwen2-1_5B/852d5adb-f422-4102-8114-082ab0b3c07d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/thomas-yanxin_XinYuan-Qwen2-1_5B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "XinYuan-Qwen2-1_5B", - "id": "thomas-yanxin/XinYuan-Qwen2-1_5B", - "developer": "thomas-yanxin", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.777 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2986 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3635 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0672 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2701 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3634 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2357 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/thomas-yanxin/XinYuan-Qwen2-7B-0917/c64e98cd-c022-4834-a3e0-3949416d1fb1.json b/data/hfopenllm_v2/thomas-yanxin/XinYuan-Qwen2-7B-0917/c64e98cd-c022-4834-a3e0-3949416d1fb1.json deleted file mode 100644 index 075ddb870..000000000 --- a/data/hfopenllm_v2/thomas-yanxin/XinYuan-Qwen2-7B-0917/c64e98cd-c022-4834-a3e0-3949416d1fb1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/thomas-yanxin_XinYuan-Qwen2-7B-0917/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "XinYuan-Qwen2-7B-0917", - "id": "thomas-yanxin/XinYuan-Qwen2-7B-0917", - "developer": "thomas-yanxin", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3719 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5169 - } - }, - { - 
"evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1979 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3096 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4401 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4245 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/thomas-yanxin/XinYuan-Qwen2-7B/f101bd15-ac61-49d4-beac-c89bc889b34b.json b/data/hfopenllm_v2/thomas-yanxin/XinYuan-Qwen2-7B/f101bd15-ac61-49d4-beac-c89bc889b34b.json deleted file mode 100644 index eee3e1e34..000000000 --- a/data/hfopenllm_v2/thomas-yanxin/XinYuan-Qwen2-7B/f101bd15-ac61-49d4-beac-c89bc889b34b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/thomas-yanxin_XinYuan-Qwen2-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "XinYuan-Qwen2-7B", - "id": "thomas-yanxin/XinYuan-Qwen2-7B", - "developer": "thomas-yanxin", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4438 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4937 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on 
MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1458 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2911 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4058 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3925 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/thomas-yanxin/XinYuan-Qwen2.5-7B-0917/11caf1c1-e2a0-4abb-bb0e-d06853a06e4d.json b/data/hfopenllm_v2/thomas-yanxin/XinYuan-Qwen2.5-7B-0917/11caf1c1-e2a0-4abb-bb0e-d06853a06e4d.json deleted file mode 100644 index ff36d37cb..000000000 --- a/data/hfopenllm_v2/thomas-yanxin/XinYuan-Qwen2.5-7B-0917/11caf1c1-e2a0-4abb-bb0e-d06853a06e4d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/thomas-yanxin_XinYuan-Qwen2.5-7B-0917/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "XinYuan-Qwen2.5-7B-0917", - "id": "thomas-yanxin/XinYuan-Qwen2.5-7B-0917", - "developer": "thomas-yanxin", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3577 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5184 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1934 - } - }, - { - "evaluation_name": "GPQA", - 
"source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.281 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3676 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3882 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/tianyil1/MistralForCausalLM_Cal_DPO/f0b57a60-8402-4430-93f3-b846a94113f2.json b/data/hfopenllm_v2/tianyil1/MistralForCausalLM_Cal_DPO/f0b57a60-8402-4430-93f3-b846a94113f2.json deleted file mode 100644 index 06e058942..000000000 --- a/data/hfopenllm_v2/tianyil1/MistralForCausalLM_Cal_DPO/f0b57a60-8402-4430-93f3-b846a94113f2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/tianyil1_MistralForCausalLM_Cal_DPO/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MistralForCausalLM_Cal_DPO", - "id": "tianyil1/MistralForCausalLM_Cal_DPO", - "developer": "tianyil1", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5328 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4381 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0287 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.276 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3977 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2763 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/tiiuae/Falcon3-10B-Base/50aa8077-4493-47a9-9cec-014c56343ecf.json b/data/hfopenllm_v2/tiiuae/Falcon3-10B-Base/50aa8077-4493-47a9-9cec-014c56343ecf.json deleted file mode 100644 index 35ab332bf..000000000 --- a/data/hfopenllm_v2/tiiuae/Falcon3-10B-Base/50aa8077-4493-47a9-9cec-014c56343ecf.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/tiiuae_Falcon3-10B-Base/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Falcon3-10B-Base", - "id": "tiiuae/Falcon3-10B-Base", - "developer": "tiiuae", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 10.306 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3648 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.595 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2492 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3456 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { 
- "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4398 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.424 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/tiiuae/Falcon3-10B-Instruct/5e70d00b-c822-4ad6-afe8-3756a7038c57.json b/data/hfopenllm_v2/tiiuae/Falcon3-10B-Instruct/5e70d00b-c822-4ad6-afe8-3756a7038c57.json deleted file mode 100644 index 968a3c904..000000000 --- a/data/hfopenllm_v2/tiiuae/Falcon3-10B-Instruct/5e70d00b-c822-4ad6-afe8-3756a7038c57.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/tiiuae_Falcon3-10B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Falcon3-10B-Instruct", - "id": "tiiuae/Falcon3-10B-Instruct", - "developer": "tiiuae", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 10.306 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7817 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.617 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2764 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3289 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4323 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - 
"dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4429 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/tiiuae/Falcon3-1B-Base/8162ba41-e630-470f-a297-72fb9f2110fd.json b/data/hfopenllm_v2/tiiuae/Falcon3-1B-Base/8162ba41-e630-470f-a297-72fb9f2110fd.json deleted file mode 100644 index e1cc14037..000000000 --- a/data/hfopenllm_v2/tiiuae/Falcon3-1B-Base/8162ba41-e630-470f-a297-72fb9f2110fd.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/tiiuae_Falcon3-1B-Base/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Falcon3-1B-Base", - "id": "tiiuae/Falcon3-1B-Base", - "developer": "tiiuae", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.669 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2428 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3571 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0332 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2794 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4147 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.1608 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/tiiuae/Falcon3-1B-Instruct/60dd9d02-476f-459d-a41c-f89f82116dc3.json b/data/hfopenllm_v2/tiiuae/Falcon3-1B-Instruct/60dd9d02-476f-459d-a41c-f89f82116dc3.json deleted file mode 100644 index 3a40efcbb..000000000 --- a/data/hfopenllm_v2/tiiuae/Falcon3-1B-Instruct/60dd9d02-476f-459d-a41c-f89f82116dc3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/tiiuae_Falcon3-1B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Falcon3-1B-Instruct", - "id": "tiiuae/Falcon3-1B-Instruct", - "developer": "tiiuae", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.669 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5557 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3745 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0634 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2668 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4189 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1838 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/tiiuae/Falcon3-3B-Base/73e89f21-5799-4835-a0e0-a6664c0483da.json b/data/hfopenllm_v2/tiiuae/Falcon3-3B-Base/73e89f21-5799-4835-a0e0-a6664c0483da.json 
deleted file mode 100644 index 81b06da6b..000000000 --- a/data/hfopenllm_v2/tiiuae/Falcon3-3B-Base/73e89f21-5799-4835-a0e0-a6664c0483da.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/tiiuae_Falcon3-3B-Base/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Falcon3-3B-Base", - "id": "tiiuae/Falcon3-3B-Base", - "developer": "tiiuae", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.228 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2765 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4421 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1178 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.297 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.375 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2879 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/tiiuae/Falcon3-3B-Instruct/7f355ad4-9156-486d-8cf4-723117da3bb8.json b/data/hfopenllm_v2/tiiuae/Falcon3-3B-Instruct/7f355ad4-9156-486d-8cf4-723117da3bb8.json deleted file mode 100644 index 2d79d95ab..000000000 --- a/data/hfopenllm_v2/tiiuae/Falcon3-3B-Instruct/7f355ad4-9156-486d-8cf4-723117da3bb8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/tiiuae_Falcon3-3B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Falcon3-3B-Instruct", - "id": "tiiuae/Falcon3-3B-Instruct", - "developer": "tiiuae", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.228 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6977 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4754 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.25 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2886 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4136 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3005 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/tiiuae/Falcon3-7B-Base/4ccc6026-b639-488d-867f-d98ea49cf1b6.json b/data/hfopenllm_v2/tiiuae/Falcon3-7B-Base/4ccc6026-b639-488d-867f-d98ea49cf1b6.json deleted file mode 100644 index e683aaba0..000000000 --- a/data/hfopenllm_v2/tiiuae/Falcon3-7B-Base/4ccc6026-b639-488d-867f-d98ea49cf1b6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/tiiuae_Falcon3-7B-Base/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": 
"third_party" - }, - "model_info": { - "name": "Falcon3-7B-Base", - "id": "tiiuae/Falcon3-7B-Base", - "developer": "tiiuae", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 7.456 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3416 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5099 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1941 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3465 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4702 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.391 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/tiiuae/Falcon3-7B-Instruct/3cf2e68e-4de0-436e-935e-86935e11f72f.json b/data/hfopenllm_v2/tiiuae/Falcon3-7B-Instruct/3cf2e68e-4de0-436e-935e-86935e11f72f.json deleted file mode 100644 index b188ad079..000000000 --- a/data/hfopenllm_v2/tiiuae/Falcon3-7B-Instruct/3cf2e68e-4de0-436e-935e-86935e11f72f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/tiiuae_Falcon3-7B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Falcon3-7B-Instruct", - "id": "tiiuae/Falcon3-7B-Instruct", - "developer": "tiiuae", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - 
"params_billions": 7.456 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7612 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5632 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4086 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3104 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4827 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4087 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/tiiuae/Falcon3-Mamba-7B-Base/e9e4ae5d-0dd1-463c-9f15-47cb21efb409.json b/data/hfopenllm_v2/tiiuae/Falcon3-Mamba-7B-Base/e9e4ae5d-0dd1-463c-9f15-47cb21efb409.json deleted file mode 100644 index 98e4d109f..000000000 --- a/data/hfopenllm_v2/tiiuae/Falcon3-Mamba-7B-Base/e9e4ae5d-0dd1-463c-9f15-47cb21efb409.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/tiiuae_Falcon3-Mamba-7B-Base/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Falcon3-Mamba-7B-Base", - "id": "tiiuae/Falcon3-Mamba-7B-Base", - "developer": "tiiuae", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "FalconMambaForCausalLM", - "params_billions": 7.273 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2891 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4699 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1941 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3096 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3431 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3038 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/tiiuae/Falcon3-Mamba-7B-Instruct/c57eb23a-5998-4ab9-9a98-39b1338f5ba6.json b/data/hfopenllm_v2/tiiuae/Falcon3-Mamba-7B-Instruct/c57eb23a-5998-4ab9-9a98-39b1338f5ba6.json deleted file mode 100644 index cfbd00438..000000000 --- a/data/hfopenllm_v2/tiiuae/Falcon3-Mamba-7B-Instruct/c57eb23a-5998-4ab9-9a98-39b1338f5ba6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/tiiuae_Falcon3-Mamba-7B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Falcon3-Mamba-7B-Instruct", - "id": "tiiuae/Falcon3-Mamba-7B-Instruct", - "developer": "tiiuae", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "FalconMambaForCausalLM", - "params_billions": 7.273 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7165 - } - }, - { - "evaluation_name": 
"BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4679 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3006 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3037 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3869 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3369 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/tiiuae/falcon-11B/94fb625d-f58c-4f2e-8268-1dc4472c1cce.json b/data/hfopenllm_v2/tiiuae/falcon-11B/94fb625d-f58c-4f2e-8268-1dc4472c1cce.json deleted file mode 100644 index 50405922d..000000000 --- a/data/hfopenllm_v2/tiiuae/falcon-11B/94fb625d-f58c-4f2e-8268-1dc4472c1cce.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/tiiuae_falcon-11B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "falcon-11B", - "id": "tiiuae/falcon-11B", - "developer": "tiiuae", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "FalconForCausalLM", - "params_billions": 11.103 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3261 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.4392 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0279 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.271 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3986 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2389 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/tiiuae/falcon-40b-instruct/4481ddef-2bef-4284-b56d-21054f5a9a97.json b/data/hfopenllm_v2/tiiuae/falcon-40b-instruct/4481ddef-2bef-4284-b56d-21054f5a9a97.json deleted file mode 100644 index fcd44b235..000000000 --- a/data/hfopenllm_v2/tiiuae/falcon-40b-instruct/4481ddef-2bef-4284-b56d-21054f5a9a97.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/tiiuae_falcon-40b-instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "falcon-40b-instruct", - "id": "tiiuae/falcon-40b-instruct", - "developer": "tiiuae", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "FalconForCausalLM", - "params_billions": 40.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2454 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4054 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match 
on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0196 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.25 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3762 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2261 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/tiiuae/falcon-40b/80048c4b-e97b-45c7-aa04-70ce69481a97.json b/data/hfopenllm_v2/tiiuae/falcon-40b/80048c4b-e97b-45c7-aa04-70ce69481a97.json deleted file mode 100644 index a3ef17532..000000000 --- a/data/hfopenllm_v2/tiiuae/falcon-40b/80048c4b-e97b-45c7-aa04-70ce69481a97.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/tiiuae_falcon-40b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "falcon-40b", - "id": "tiiuae/falcon-40b", - "developer": "tiiuae", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "FalconForCausalLM", - "params_billions": 40.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2496 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4019 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0181 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2735 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3631 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2505 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/tiiuae/falcon-7b-instruct/d21a2557-2348-4087-b2a6-6e1c0101bccc.json b/data/hfopenllm_v2/tiiuae/falcon-7b-instruct/d21a2557-2348-4087-b2a6-6e1c0101bccc.json deleted file mode 100644 index 556a2f3e8..000000000 --- a/data/hfopenllm_v2/tiiuae/falcon-7b-instruct/d21a2557-2348-4087-b2a6-6e1c0101bccc.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/tiiuae_falcon-7b-instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "falcon-7b-instruct", - "id": "tiiuae/falcon-7b-instruct", - "developer": "tiiuae", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "FalconForCausalLM", - "params_billions": 7.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1969 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3203 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0121 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2475 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - 
"dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3634 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1155 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/tiiuae/falcon-7b/76290d4b-5526-400b-8ca4-24d220f7c02d.json b/data/hfopenllm_v2/tiiuae/falcon-7b/76290d4b-5526-400b-8ca4-24d220f7c02d.json deleted file mode 100644 index 24fa62ffd..000000000 --- a/data/hfopenllm_v2/tiiuae/falcon-7b/76290d4b-5526-400b-8ca4-24d220f7c02d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/tiiuae_falcon-7b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "falcon-7b", - "id": "tiiuae/falcon-7b", - "developer": "tiiuae", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "FalconForCausalLM", - "params_billions": 7.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1821 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3285 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0098 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.245 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3778 - } - }, - { - 
"evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1125 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/tiiuae/falcon-mamba-7b/3a146535-09b3-4246-8bd8-0e984e0905b1.json b/data/hfopenllm_v2/tiiuae/falcon-mamba-7b/3a146535-09b3-4246-8bd8-0e984e0905b1.json deleted file mode 100644 index 78cc692cd..000000000 --- a/data/hfopenllm_v2/tiiuae/falcon-mamba-7b/3a146535-09b3-4246-8bd8-0e984e0905b1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/tiiuae_falcon-mamba-7b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "falcon-mamba-7b", - "id": "tiiuae/falcon-mamba-7b", - "developer": "tiiuae", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "FalconMambaForCausalLM", - "params_billions": 7.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3336 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4285 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0446 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3104 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.421 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2302 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/tinycompany/BiBo-v0.3/6683f95c-f97f-4117-b3c5-c1ed9587289e.json b/data/hfopenllm_v2/tinycompany/BiBo-v0.3/6683f95c-f97f-4117-b3c5-c1ed9587289e.json deleted file mode 100644 index 731470475..000000000 --- a/data/hfopenllm_v2/tinycompany/BiBo-v0.3/6683f95c-f97f-4117-b3c5-c1ed9587289e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/tinycompany_BiBo-v0.3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "BiBo-v0.3", - "id": "tinycompany/BiBo-v0.3", - "developer": "tinycompany", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 2.943 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5184 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4642 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0876 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2676 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.395 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2995 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/tinycompany/BiBo-v0.7/bbe74b2b-9e13-4c13-92c8-618078667248.json 
b/data/hfopenllm_v2/tinycompany/BiBo-v0.7/bbe74b2b-9e13-4c13-92c8-618078667248.json deleted file mode 100644 index 9034a4865..000000000 --- a/data/hfopenllm_v2/tinycompany/BiBo-v0.7/bbe74b2b-9e13-4c13-92c8-618078667248.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/tinycompany_BiBo-v0.7/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "BiBo-v0.7", - "id": "tinycompany/BiBo-v0.7", - "developer": "tinycompany", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 2.943 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3738 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4311 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0823 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2768 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4044 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.265 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/tinycompany/ShawtyIsBad-bgem3/61876ce3-acc4-4619-b0c2-78ac4dff48ea.json b/data/hfopenllm_v2/tinycompany/ShawtyIsBad-bgem3/61876ce3-acc4-4619-b0c2-78ac4dff48ea.json deleted file mode 100644 index 1ef8a0185..000000000 --- a/data/hfopenllm_v2/tinycompany/ShawtyIsBad-bgem3/61876ce3-acc4-4619-b0c2-78ac4dff48ea.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - 
"schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/tinycompany_ShawtyIsBad-bgem3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ShawtyIsBad-bgem3", - "id": "tinycompany/ShawtyIsBad-bgem3", - "developer": "tinycompany", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.436 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2608 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3853 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0483 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3054 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3695 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2583 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/tinycompany/ShawtyIsBad-e5-large/b304baee-c9de-4982-801d-2b9e7f1a7334.json b/data/hfopenllm_v2/tinycompany/ShawtyIsBad-e5-large/b304baee-c9de-4982-801d-2b9e7f1a7334.json deleted file mode 100644 index 6ba06459f..000000000 --- a/data/hfopenllm_v2/tinycompany/ShawtyIsBad-e5-large/b304baee-c9de-4982-801d-2b9e7f1a7334.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/tinycompany_ShawtyIsBad-e5-large/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": 
"documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ShawtyIsBad-e5-large", - "id": "tinycompany/ShawtyIsBad-e5-large", - "developer": "tinycompany", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.436 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2468 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3873 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0453 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.302 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.372 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2569 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/tinycompany/ShawtyIsBad-ib/6f27e746-1bdd-4cec-a955-c27f2f9900ef.json b/data/hfopenllm_v2/tinycompany/ShawtyIsBad-ib/6f27e746-1bdd-4cec-a955-c27f2f9900ef.json deleted file mode 100644 index 44928ae48..000000000 --- a/data/hfopenllm_v2/tinycompany/ShawtyIsBad-ib/6f27e746-1bdd-4cec-a955-c27f2f9900ef.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/tinycompany_ShawtyIsBad-ib/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ShawtyIsBad-ib", - "id": "tinycompany/ShawtyIsBad-ib", - "developer": "tinycompany", - "inference_platform": 
"unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.436 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2565 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.388 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0491 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2987 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3641 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2581 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/tinycompany/ShawtyIsBad-nomic-moe/30637c5d-1bc0-49dc-8afd-335a9a66f196.json b/data/hfopenllm_v2/tinycompany/ShawtyIsBad-nomic-moe/30637c5d-1bc0-49dc-8afd-335a9a66f196.json deleted file mode 100644 index 27d6d6ef9..000000000 --- a/data/hfopenllm_v2/tinycompany/ShawtyIsBad-nomic-moe/30637c5d-1bc0-49dc-8afd-335a9a66f196.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/tinycompany_ShawtyIsBad-nomic-moe/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ShawtyIsBad-nomic-moe", - "id": "tinycompany/ShawtyIsBad-nomic-moe", - "developer": "tinycompany", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.436 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - 
"dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2608 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3878 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0431 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.307 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3747 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2572 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/tinycompany/ShawtyIsBad-nomic1.5/169e29b6-50d8-456d-aa20-3fe2f3b19a1e.json b/data/hfopenllm_v2/tinycompany/ShawtyIsBad-nomic1.5/169e29b6-50d8-456d-aa20-3fe2f3b19a1e.json deleted file mode 100644 index 354caa8ce..000000000 --- a/data/hfopenllm_v2/tinycompany/ShawtyIsBad-nomic1.5/169e29b6-50d8-456d-aa20-3fe2f3b19a1e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/tinycompany_ShawtyIsBad-nomic1.5/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ShawtyIsBad-nomic1.5", - "id": "tinycompany/ShawtyIsBad-nomic1.5", - "developer": "tinycompany", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.436 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2544 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3874 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0431 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3112 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3628 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2567 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/tinycompany/SigmaBoi-base/427d32f7-190b-4005-b02c-6a8ce089dbbf.json b/data/hfopenllm_v2/tinycompany/SigmaBoi-base/427d32f7-190b-4005-b02c-6a8ce089dbbf.json deleted file mode 100644 index 13015d742..000000000 --- a/data/hfopenllm_v2/tinycompany/SigmaBoi-base/427d32f7-190b-4005-b02c-6a8ce089dbbf.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/tinycompany_SigmaBoi-base/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SigmaBoi-base", - "id": "tinycompany/SigmaBoi-base", - "developer": "tinycompany", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 2.943 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2447 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4314 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0778 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2936 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4343 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2817 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/tinycompany/SigmaBoi-bge-m3/de7551a8-63b1-4de3-899f-9d98cb985005.json b/data/hfopenllm_v2/tinycompany/SigmaBoi-bge-m3/de7551a8-63b1-4de3-899f-9d98cb985005.json deleted file mode 100644 index e2bc5d9e6..000000000 --- a/data/hfopenllm_v2/tinycompany/SigmaBoi-bge-m3/de7551a8-63b1-4de3-899f-9d98cb985005.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/tinycompany_SigmaBoi-bge-m3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SigmaBoi-bge-m3", - "id": "tinycompany/SigmaBoi-bge-m3", - "developer": "tinycompany", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 2.943 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.245 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4351 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - 
"dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0763 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2945 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4383 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2819 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/tinycompany/SigmaBoi-bgem3/eff6f456-906d-4320-8e6f-667fbbf0574a.json b/data/hfopenllm_v2/tinycompany/SigmaBoi-bgem3/eff6f456-906d-4320-8e6f-667fbbf0574a.json deleted file mode 100644 index 942fd8495..000000000 --- a/data/hfopenllm_v2/tinycompany/SigmaBoi-bgem3/eff6f456-906d-4320-8e6f-667fbbf0574a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/tinycompany_SigmaBoi-bgem3/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SigmaBoi-bgem3", - "id": "tinycompany/SigmaBoi-bgem3", - "developer": "tinycompany", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 2.943 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.245 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4351 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0763 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2945 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4383 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2819 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/tinycompany/SigmaBoi-ib/6cbd9a3a-7e06-4eee-af9e-6db4ff35c36a.json b/data/hfopenllm_v2/tinycompany/SigmaBoi-ib/6cbd9a3a-7e06-4eee-af9e-6db4ff35c36a.json deleted file mode 100644 index 4fc792af1..000000000 --- a/data/hfopenllm_v2/tinycompany/SigmaBoi-ib/6cbd9a3a-7e06-4eee-af9e-6db4ff35c36a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/tinycompany_SigmaBoi-ib/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SigmaBoi-ib", - "id": "tinycompany/SigmaBoi-ib", - "developer": "tinycompany", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 2.943 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2477 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4344 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.074 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": 
"Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2878 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.429 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2824 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/tinycompany/SigmaBoi-nomic-moe/7e3d3803-c8d4-4025-8d12-c4c29c49c059.json b/data/hfopenllm_v2/tinycompany/SigmaBoi-nomic-moe/7e3d3803-c8d4-4025-8d12-c4c29c49c059.json deleted file mode 100644 index 954ab673e..000000000 --- a/data/hfopenllm_v2/tinycompany/SigmaBoi-nomic-moe/7e3d3803-c8d4-4025-8d12-c4c29c49c059.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/tinycompany_SigmaBoi-nomic-moe/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SigmaBoi-nomic-moe", - "id": "tinycompany/SigmaBoi-nomic-moe", - "developer": "tinycompany", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 2.943 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2474 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4334 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0718 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2928 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": 
"MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4316 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2837 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/tinycompany/SigmaBoi-nomic1.5-fp32/a43a6ca9-3543-44bc-8511-ee5c45552070.json b/data/hfopenllm_v2/tinycompany/SigmaBoi-nomic1.5-fp32/a43a6ca9-3543-44bc-8511-ee5c45552070.json deleted file mode 100644 index 5ad5da701..000000000 --- a/data/hfopenllm_v2/tinycompany/SigmaBoi-nomic1.5-fp32/a43a6ca9-3543-44bc-8511-ee5c45552070.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/tinycompany_SigmaBoi-nomic1.5-fp32/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SigmaBoi-nomic1.5-fp32", - "id": "tinycompany/SigmaBoi-nomic1.5-fp32", - "developer": "tinycompany", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 2.943 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2462 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4371 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0831 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2961 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 
0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4316 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2841 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/tinycompany/SigmaBoi-nomic1.5/83f6fdec-9592-45a1-acdf-0ebbb400c8a4.json b/data/hfopenllm_v2/tinycompany/SigmaBoi-nomic1.5/83f6fdec-9592-45a1-acdf-0ebbb400c8a4.json deleted file mode 100644 index 323eb5e1d..000000000 --- a/data/hfopenllm_v2/tinycompany/SigmaBoi-nomic1.5/83f6fdec-9592-45a1-acdf-0ebbb400c8a4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/tinycompany_SigmaBoi-nomic1.5/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SigmaBoi-nomic1.5", - "id": "tinycompany/SigmaBoi-nomic1.5", - "developer": "tinycompany", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 2.943 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2447 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4371 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0831 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2961 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4316 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2841 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/tinycompany/Tamed-Shawty/6e2d4174-303f-437b-9abb-26667b1dd04c.json b/data/hfopenllm_v2/tinycompany/Tamed-Shawty/6e2d4174-303f-437b-9abb-26667b1dd04c.json deleted file mode 100644 index 7acd05352..000000000 --- a/data/hfopenllm_v2/tinycompany/Tamed-Shawty/6e2d4174-303f-437b-9abb-26667b1dd04c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/tinycompany_Tamed-Shawty/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Tamed-Shawty", - "id": "tinycompany/Tamed-Shawty", - "developer": "tinycompany", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.562 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3831 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3837 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0718 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2626 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3501 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2601 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/tklohj/WindyFloLLM/955e93d0-bec1-483c-b3f0-258e13d5cb16.json b/data/hfopenllm_v2/tklohj/WindyFloLLM/955e93d0-bec1-483c-b3f0-258e13d5cb16.json deleted file mode 100644 index 7110c6cac..000000000 --- a/data/hfopenllm_v2/tklohj/WindyFloLLM/955e93d0-bec1-483c-b3f0-258e13d5cb16.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/tklohj_WindyFloLLM/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "WindyFloLLM", - "id": "tklohj/WindyFloLLM", - "developer": "tklohj", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 13.016 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2669 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4637 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0159 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2752 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4253 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2581 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/togethercomputer/GPT-JT-6B-v1/3065ca79-c5e9-4875-9f81-4231e971d818.json b/data/hfopenllm_v2/togethercomputer/GPT-JT-6B-v1/3065ca79-c5e9-4875-9f81-4231e971d818.json deleted file mode 100644 index 3b3a90f5a..000000000 --- 
a/data/hfopenllm_v2/togethercomputer/GPT-JT-6B-v1/3065ca79-c5e9-4875-9f81-4231e971d818.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/togethercomputer_GPT-JT-6B-v1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "GPT-JT-6B-v1", - "id": "togethercomputer/GPT-JT-6B-v1", - "developer": "togethercomputer", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "GPTJForCausalLM", - "params_billions": 6.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2061 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3303 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0106 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2609 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3737 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1626 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/togethercomputer/GPT-NeoXT-Chat-Base-20B/fc7e485f-a416-420b-b43c-e45e502c4a8f.json b/data/hfopenllm_v2/togethercomputer/GPT-NeoXT-Chat-Base-20B/fc7e485f-a416-420b-b43c-e45e502c4a8f.json deleted file mode 100644 index c6293a07a..000000000 --- a/data/hfopenllm_v2/togethercomputer/GPT-NeoXT-Chat-Base-20B/fc7e485f-a416-420b-b43c-e45e502c4a8f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/togethercomputer_GPT-NeoXT-Chat-Base-20B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "GPT-NeoXT-Chat-Base-20B", - "id": "togethercomputer/GPT-NeoXT-Chat-Base-20B", - "developer": "togethercomputer", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "GPTNeoXForCausalLM", - "params_billions": 20.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.183 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3321 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0234 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.25 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3461 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1145 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/togethercomputer/LLaMA-2-7B-32K/53e882c6-6eb5-4202-a8d0-3a313556c9f4.json b/data/hfopenllm_v2/togethercomputer/LLaMA-2-7B-32K/53e882c6-6eb5-4202-a8d0-3a313556c9f4.json deleted file mode 100644 index f637076fd..000000000 --- a/data/hfopenllm_v2/togethercomputer/LLaMA-2-7B-32K/53e882c6-6eb5-4202-a8d0-3a313556c9f4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/togethercomputer_LLaMA-2-7B-32K/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - 
"source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "LLaMA-2-7B-32K", - "id": "togethercomputer/LLaMA-2-7B-32K", - "developer": "togethercomputer", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 7.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1865 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.34 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0144 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.25 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3754 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1768 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/togethercomputer/Llama-2-7B-32K-Instruct/ba715669-c0ed-471f-80a6-b67453fb4930.json b/data/hfopenllm_v2/togethercomputer/Llama-2-7B-32K-Instruct/ba715669-c0ed-471f-80a6-b67453fb4930.json deleted file mode 100644 index 37237a2f5..000000000 --- a/data/hfopenllm_v2/togethercomputer/Llama-2-7B-32K-Instruct/ba715669-c0ed-471f-80a6-b67453fb4930.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/togethercomputer_Llama-2-7B-32K-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-2-7B-32K-Instruct", - "id": "togethercomputer/Llama-2-7B-32K-Instruct", - 
"developer": "togethercomputer", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 7.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.213 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3443 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0159 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2517 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4056 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1781 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/togethercomputer/RedPajama-INCITE-7B-Base/316cab27-5cac-4d26-90ae-05d1fc3bd14a.json b/data/hfopenllm_v2/togethercomputer/RedPajama-INCITE-7B-Base/316cab27-5cac-4d26-90ae-05d1fc3bd14a.json deleted file mode 100644 index 26801a949..000000000 --- a/data/hfopenllm_v2/togethercomputer/RedPajama-INCITE-7B-Base/316cab27-5cac-4d26-90ae-05d1fc3bd14a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/togethercomputer_RedPajama-INCITE-7B-Base/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "RedPajama-INCITE-7B-Base", - "id": "togethercomputer/RedPajama-INCITE-7B-Base", - "developer": "togethercomputer", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "GPTNeoXForCausalLM", - 
"params_billions": 7.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2082 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3195 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0159 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.255 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.362 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1197 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/togethercomputer/RedPajama-INCITE-7B-Chat/d2b0a35a-ea72-42f4-9f71-fffa1480bc22.json b/data/hfopenllm_v2/togethercomputer/RedPajama-INCITE-7B-Chat/d2b0a35a-ea72-42f4-9f71-fffa1480bc22.json deleted file mode 100644 index 56a01fe9b..000000000 --- a/data/hfopenllm_v2/togethercomputer/RedPajama-INCITE-7B-Chat/d2b0a35a-ea72-42f4-9f71-fffa1480bc22.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/togethercomputer_RedPajama-INCITE-7B-Chat/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "RedPajama-INCITE-7B-Chat", - "id": "togethercomputer/RedPajama-INCITE-7B-Chat", - "developer": "togethercomputer", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "GPTNeoXForCausalLM", - "params_billions": 7.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - 
"hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1558 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3175 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0068 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2525 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3448 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1121 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/togethercomputer/RedPajama-INCITE-7B-Instruct/bf3eabff-fbf7-421c-9e04-548accc7678c.json b/data/hfopenllm_v2/togethercomputer/RedPajama-INCITE-7B-Instruct/bf3eabff-fbf7-421c-9e04-548accc7678c.json deleted file mode 100644 index ac9fb012b..000000000 --- a/data/hfopenllm_v2/togethercomputer/RedPajama-INCITE-7B-Instruct/bf3eabff-fbf7-421c-9e04-548accc7678c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/togethercomputer_RedPajama-INCITE-7B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "RedPajama-INCITE-7B-Instruct", - "id": "togethercomputer/RedPajama-INCITE-7B-Instruct", - "developer": "togethercomputer", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "GPTNeoXForCausalLM", - "params_billions": 7.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2055 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3377 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0211 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2508 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3685 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1272 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/togethercomputer/RedPajama-INCITE-Base-3B-v1/b7eeedd8-33ef-46b3-a3fb-6ac87247bc4e.json b/data/hfopenllm_v2/togethercomputer/RedPajama-INCITE-Base-3B-v1/b7eeedd8-33ef-46b3-a3fb-6ac87247bc4e.json deleted file mode 100644 index dff22f907..000000000 --- a/data/hfopenllm_v2/togethercomputer/RedPajama-INCITE-Base-3B-v1/b7eeedd8-33ef-46b3-a3fb-6ac87247bc4e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/togethercomputer_RedPajama-INCITE-Base-3B-v1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "RedPajama-INCITE-Base-3B-v1", - "id": "togethercomputer/RedPajama-INCITE-Base-3B-v1", - "developer": "togethercomputer", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "GPTNeoXForCausalLM", - "params_billions": 3.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2294 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - 
"dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.306 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0144 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2433 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3739 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1111 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/togethercomputer/RedPajama-INCITE-Chat-3B-v1/b1c41abe-e7f6-4229-b776-8ed0b5f91bd4.json b/data/hfopenllm_v2/togethercomputer/RedPajama-INCITE-Chat-3B-v1/b1c41abe-e7f6-4229-b776-8ed0b5f91bd4.json deleted file mode 100644 index 28081c01d..000000000 --- a/data/hfopenllm_v2/togethercomputer/RedPajama-INCITE-Chat-3B-v1/b1c41abe-e7f6-4229-b776-8ed0b5f91bd4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/togethercomputer_RedPajama-INCITE-Chat-3B-v1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "RedPajama-INCITE-Chat-3B-v1", - "id": "togethercomputer/RedPajama-INCITE-Chat-3B-v1", - "developer": "togethercomputer", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "GPTNeoXForCausalLM", - "params_billions": 3.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1652 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3217 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0091 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2441 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3684 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1127 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/togethercomputer/RedPajama-INCITE-Instruct-3B-v1/5b769770-3b63-4863-a723-95212e2be40e.json b/data/hfopenllm_v2/togethercomputer/RedPajama-INCITE-Instruct-3B-v1/5b769770-3b63-4863-a723-95212e2be40e.json deleted file mode 100644 index 503ea0a24..000000000 --- a/data/hfopenllm_v2/togethercomputer/RedPajama-INCITE-Instruct-3B-v1/5b769770-3b63-4863-a723-95212e2be40e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/togethercomputer_RedPajama-INCITE-Instruct-3B-v1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "RedPajama-INCITE-Instruct-3B-v1", - "id": "togethercomputer/RedPajama-INCITE-Instruct-3B-v1", - "developer": "togethercomputer", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "GPTNeoXForCausalLM", - "params_billions": 3.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2124 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 
0.3146 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0128 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2475 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3886 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.111 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/tokyotech-llm/Llama-3-Swallow-8B-Instruct-v0.1/f2264b41-efa5-4278-91fd-2f454aa91c61.json b/data/hfopenllm_v2/tokyotech-llm/Llama-3-Swallow-8B-Instruct-v0.1/f2264b41-efa5-4278-91fd-2f454aa91c61.json deleted file mode 100644 index d7ea093eb..000000000 --- a/data/hfopenllm_v2/tokyotech-llm/Llama-3-Swallow-8B-Instruct-v0.1/f2264b41-efa5-4278-91fd-2f454aa91c61.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/tokyotech-llm_Llama-3-Swallow-8B-Instruct-v0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-Swallow-8B-Instruct-v0.1", - "id": "tokyotech-llm/Llama-3-Swallow-8B-Instruct-v0.1", - "developer": "tokyotech-llm", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5508 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5009 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - 
"hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0748 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2894 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4357 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3088 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/tomasmcm/sky-t1-coder-32b-flash/5c3484b4-6faa-47fd-a1a2-881898450f79.json b/data/hfopenllm_v2/tomasmcm/sky-t1-coder-32b-flash/5c3484b4-6faa-47fd-a1a2-881898450f79.json deleted file mode 100644 index 49195f496..000000000 --- a/data/hfopenllm_v2/tomasmcm/sky-t1-coder-32b-flash/5c3484b4-6faa-47fd-a1a2-881898450f79.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/tomasmcm_sky-t1-coder-32b-flash/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "sky-t1-coder-32b-flash", - "id": "tomasmcm/sky-t1-coder-32b-flash", - "developer": "tomasmcm", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 32.764 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.778 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6822 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, 
- "score_details": { - "score": 0.5423 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3683 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4233 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5782 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/trthminh1112/autotrain-llama32-1b-finetune/326b95f8-9eae-4064-a261-077a957e233c.json b/data/hfopenllm_v2/trthminh1112/autotrain-llama32-1b-finetune/326b95f8-9eae-4064-a261-077a957e233c.json deleted file mode 100644 index 2a5e33192..000000000 --- a/data/hfopenllm_v2/trthminh1112/autotrain-llama32-1b-finetune/326b95f8-9eae-4064-a261-077a957e233c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/trthminh1112_autotrain-llama32-1b-finetune/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "autotrain-llama32-1b-finetune", - "id": "trthminh1112/autotrain-llama32-1b-finetune", - "developer": "trthminh1112", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.1 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1769 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2996 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0151 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": 
"Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2567 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3513 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1099 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/tugstugi/Qwen2.5-7B-Instruct-QwQ-v0.1/c1c7336e-b8bf-4a69-a586-c1a224ba8a65.json b/data/hfopenllm_v2/tugstugi/Qwen2.5-7B-Instruct-QwQ-v0.1/c1c7336e-b8bf-4a69-a586-c1a224ba8a65.json deleted file mode 100644 index 7f871c221..000000000 --- a/data/hfopenllm_v2/tugstugi/Qwen2.5-7B-Instruct-QwQ-v0.1/c1c7336e-b8bf-4a69-a586-c1a224ba8a65.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/tugstugi_Qwen2.5-7B-Instruct-QwQ-v0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-7B-Instruct-QwQ-v0.1", - "id": "tugstugi/Qwen2.5-7B-Instruct-QwQ-v0.1", - "developer": "tugstugi", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6017 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5101 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3814 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.2685 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3794 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4081 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/universalml/NepaliGPT-2.0/89e55482-b762-4f5d-a021-211048719bdc.json b/data/hfopenllm_v2/universalml/NepaliGPT-2.0/89e55482-b762-4f5d-a021-211048719bdc.json deleted file mode 100644 index ae1d50c1b..000000000 --- a/data/hfopenllm_v2/universalml/NepaliGPT-2.0/89e55482-b762-4f5d-a021-211048719bdc.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/universalml_NepaliGPT-2.0/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "NepaliGPT-2.0", - "id": "universalml/NepaliGPT-2.0", - "developer": "universalml", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0365 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.466 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0045 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.281 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4657 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.33 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/unsloth/Llama-3.2-1B-Instruct-no-system-message/81018e12-63f8-4ad8-87c4-181a13202497.json b/data/hfopenllm_v2/unsloth/Llama-3.2-1B-Instruct-no-system-message/81018e12-63f8-4ad8-87c4-181a13202497.json deleted file mode 100644 index 71f6ae6c9..000000000 --- a/data/hfopenllm_v2/unsloth/Llama-3.2-1B-Instruct-no-system-message/81018e12-63f8-4ad8-87c4-181a13202497.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/unsloth_Llama-3.2-1B-Instruct-no-system-message/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.2-1B-Instruct-no-system-message", - "id": "unsloth/Llama-3.2-1B-Instruct-no-system-message", - "developer": "unsloth", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.236 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.565 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3544 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0755 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2727 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3341 - } 
- }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1669 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/unsloth/Llama-3.2-1B-Instruct/5b09e8cb-aaf1-48fd-a2f4-11a8d4bc9a4d.json b/data/hfopenllm_v2/unsloth/Llama-3.2-1B-Instruct/5b09e8cb-aaf1-48fd-a2f4-11a8d4bc9a4d.json deleted file mode 100644 index 1bdf24114..000000000 --- a/data/hfopenllm_v2/unsloth/Llama-3.2-1B-Instruct/5b09e8cb-aaf1-48fd-a2f4-11a8d4bc9a4d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/unsloth_Llama-3.2-1B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.2-1B-Instruct", - "id": "unsloth/Llama-3.2-1B-Instruct", - "developer": "unsloth", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.236 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.581 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3485 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0823 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2676 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3196 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1742 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/unsloth/Phi-3-mini-4k-instruct/8b344f21-9038-4b15-aba8-308aa62e4b39.json b/data/hfopenllm_v2/unsloth/Phi-3-mini-4k-instruct/8b344f21-9038-4b15-aba8-308aa62e4b39.json deleted file mode 100644 index 0e0bd0c1a..000000000 --- a/data/hfopenllm_v2/unsloth/Phi-3-mini-4k-instruct/8b344f21-9038-4b15-aba8-308aa62e4b39.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/unsloth_Phi-3-mini-4k-instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Phi-3-mini-4k-instruct", - "id": "unsloth/Phi-3-mini-4k-instruct", - "developer": "unsloth", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 3.821 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.544 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.55 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1639 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.323 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4284 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4031 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/unsloth/phi-4-bnb-4bit/68ca8f7c-88c2-4ede-bcb7-d4ae23429d8f.json b/data/hfopenllm_v2/unsloth/phi-4-bnb-4bit/68ca8f7c-88c2-4ede-bcb7-d4ae23429d8f.json deleted file mode 100644 index 04ac53e78..000000000 --- a/data/hfopenllm_v2/unsloth/phi-4-bnb-4bit/68ca8f7c-88c2-4ede-bcb7-d4ae23429d8f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/unsloth_phi-4-bnb-4bit/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "phi-4-bnb-4bit", - "id": "unsloth/phi-4-bnb-4bit", - "developer": "unsloth", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.058 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.673 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.677 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4607 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3381 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4007 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5256 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/unsloth/phi-4-unsloth-bnb-4bit/df557f25-5505-49dd-a0cb-88fff601c6e2.json b/data/hfopenllm_v2/unsloth/phi-4-unsloth-bnb-4bit/df557f25-5505-49dd-a0cb-88fff601c6e2.json deleted file mode 100644 index 5a64d5d96..000000000 --- 
a/data/hfopenllm_v2/unsloth/phi-4-unsloth-bnb-4bit/df557f25-5505-49dd-a0cb-88fff601c6e2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/unsloth_phi-4-unsloth-bnb-4bit/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "phi-4-unsloth-bnb-4bit", - "id": "unsloth/phi-4-unsloth-bnb-4bit", - "developer": "unsloth", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.483 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6794 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6791 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4562 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3364 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4034 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5286 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/unsloth/phi-4/a50bf387-bf34-490f-979a-b6217a85a1bd.json b/data/hfopenllm_v2/unsloth/phi-4/a50bf387-bf34-490f-979a-b6217a85a1bd.json deleted file mode 100644 index cbdc7e8b4..000000000 --- a/data/hfopenllm_v2/unsloth/phi-4/a50bf387-bf34-490f-979a-b6217a85a1bd.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/unsloth_phi-4/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { 
- "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "phi-4", - "id": "unsloth/phi-4", - "developer": "unsloth", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 14.66 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6882 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6886 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3364 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4114 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5378 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/upstage/SOLAR-10.7B-Instruct-v1.0/89264aa0-3bed-41d3-b171-2a5434cc990f.json b/data/hfopenllm_v2/upstage/SOLAR-10.7B-Instruct-v1.0/89264aa0-3bed-41d3-b171-2a5434cc990f.json deleted file mode 100644 index efc06b49f..000000000 --- a/data/hfopenllm_v2/upstage/SOLAR-10.7B-Instruct-v1.0/89264aa0-3bed-41d3-b171-2a5434cc990f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/upstage_SOLAR-10.7B-Instruct-v1.0/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SOLAR-10.7B-Instruct-v1.0", - "id": "upstage/SOLAR-10.7B-Instruct-v1.0", - 
"developer": "upstage", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 10.732 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4737 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5162 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0566 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3087 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3899 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3138 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/upstage/SOLAR-10.7B-v1.0/a3272caf-a292-4dc7-8932-636a4099ca6b.json b/data/hfopenllm_v2/upstage/SOLAR-10.7B-v1.0/a3272caf-a292-4dc7-8932-636a4099ca6b.json deleted file mode 100644 index 711e8ad03..000000000 --- a/data/hfopenllm_v2/upstage/SOLAR-10.7B-v1.0/a3272caf-a292-4dc7-8932-636a4099ca6b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/upstage_SOLAR-10.7B-v1.0/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SOLAR-10.7B-v1.0", - "id": "upstage/SOLAR-10.7B-v1.0", - "developer": "upstage", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 10.732 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - 
"dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2421 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5094 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0264 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.281 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4372 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.34 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/upstage/solar-pro-preview-instruct/c4ade77e-628f-457d-bbe1-3e5a0cb19d04.json b/data/hfopenllm_v2/upstage/solar-pro-preview-instruct/c4ade77e-628f-457d-bbe1-3e5a0cb19d04.json deleted file mode 100644 index 86a94ac3f..000000000 --- a/data/hfopenllm_v2/upstage/solar-pro-preview-instruct/c4ade77e-628f-457d-bbe1-3e5a0cb19d04.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/upstage_solar-pro-preview-instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "solar-pro-preview-instruct", - "id": "upstage/solar-pro-preview-instruct", - "developer": "upstage", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "SolarForCausalLM", - "params_billions": 22.14 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", 
- "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8416 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6817 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2205 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3708 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4417 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5273 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/utkmst/chimera-beta-test2-lora-merged/b030646c-5f5c-43ab-bbc4-405f82992265.json b/data/hfopenllm_v2/utkmst/chimera-beta-test2-lora-merged/b030646c-5f5c-43ab-bbc4-405f82992265.json deleted file mode 100644 index 794273d51..000000000 --- a/data/hfopenllm_v2/utkmst/chimera-beta-test2-lora-merged/b030646c-5f5c-43ab-bbc4-405f82992265.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/utkmst_chimera-beta-test2-lora-merged/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "chimera-beta-test2-lora-merged", - "id": "utkmst/chimera-beta-test2-lora-merged", - "developer": "utkmst", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6054 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - 
"hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4796 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0952 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3037 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4118 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2992 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/uukuguy/speechless-code-mistral-7b-v1.0/399e516c-d8c8-4511-a746-76c81f72b36a.json b/data/hfopenllm_v2/uukuguy/speechless-code-mistral-7b-v1.0/399e516c-d8c8-4511-a746-76c81f72b36a.json deleted file mode 100644 index 8a74f1d3d..000000000 --- a/data/hfopenllm_v2/uukuguy/speechless-code-mistral-7b-v1.0/399e516c-d8c8-4511-a746-76c81f72b36a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/uukuguy_speechless-code-mistral-7b-v1.0/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "speechless-code-mistral-7b-v1.0", - "id": "uukuguy/speechless-code-mistral-7b-v1.0", - "developer": "uukuguy", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3665 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.4572 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0521 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2844 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4502 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3146 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/uukuguy/speechless-codellama-34b-v2.0/bd8e4424-7903-43e7-8105-269de734582e.json b/data/hfopenllm_v2/uukuguy/speechless-codellama-34b-v2.0/bd8e4424-7903-43e7-8105-269de734582e.json deleted file mode 100644 index eea43c67b..000000000 --- a/data/hfopenllm_v2/uukuguy/speechless-codellama-34b-v2.0/bd8e4424-7903-43e7-8105-269de734582e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/uukuguy_speechless-codellama-34b-v2.0/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "speechless-codellama-34b-v2.0", - "id": "uukuguy/speechless-codellama-34b-v2.0", - "developer": "uukuguy", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 34.0 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4604 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4813 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - 
"hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0431 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2693 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3787 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2542 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/uukuguy/speechless-coder-ds-6.7b/9126e939-3a87-4774-9606-084c5b56e933.json b/data/hfopenllm_v2/uukuguy/speechless-coder-ds-6.7b/9126e939-3a87-4774-9606-084c5b56e933.json deleted file mode 100644 index 9ccbd840a..000000000 --- a/data/hfopenllm_v2/uukuguy/speechless-coder-ds-6.7b/9126e939-3a87-4774-9606-084c5b56e933.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/uukuguy_speechless-coder-ds-6.7b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "speechless-coder-ds-6.7b", - "id": "uukuguy/speechless-coder-ds-6.7b", - "developer": "uukuguy", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 6.7 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2505 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4036 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - 
}, - "score_details": { - "score": 0.0211 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2643 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3819 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1719 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/uukuguy/speechless-instruct-mistral-7b-v0.2/be2ef197-738e-422d-9a88-cafd124584b7.json b/data/hfopenllm_v2/uukuguy/speechless-instruct-mistral-7b-v0.2/be2ef197-738e-422d-9a88-cafd124584b7.json deleted file mode 100644 index 06f2df151..000000000 --- a/data/hfopenllm_v2/uukuguy/speechless-instruct-mistral-7b-v0.2/be2ef197-738e-422d-9a88-cafd124584b7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/uukuguy_speechless-instruct-mistral-7b-v0.2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "speechless-instruct-mistral-7b-v0.2", - "id": "uukuguy/speechless-instruct-mistral-7b-v0.2", - "developer": "uukuguy", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3261 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4607 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0491 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - 
"hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2819 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4902 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2902 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/uukuguy/speechless-llama2-hermes-orca-platypus-wizardlm-13b/ee22e6c5-8529-4987-86d0-4abf3b525f90.json b/data/hfopenllm_v2/uukuguy/speechless-llama2-hermes-orca-platypus-wizardlm-13b/ee22e6c5-8529-4987-86d0-4abf3b525f90.json deleted file mode 100644 index 776e410d9..000000000 --- a/data/hfopenllm_v2/uukuguy/speechless-llama2-hermes-orca-platypus-wizardlm-13b/ee22e6c5-8529-4987-86d0-4abf3b525f90.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/uukuguy_speechless-llama2-hermes-orca-platypus-wizardlm-13b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "speechless-llama2-hermes-orca-platypus-wizardlm-13b", - "id": "uukuguy/speechless-llama2-hermes-orca-platypus-wizardlm-13b", - "developer": "uukuguy", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 13.016 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4562 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4846 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0204 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2701 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4655 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2559 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/uukuguy/speechless-mistral-dolphin-orca-platypus-samantha-7b/50f0ddc2-fccd-447c-ab50-a086ccb4cd3a.json b/data/hfopenllm_v2/uukuguy/speechless-mistral-dolphin-orca-platypus-samantha-7b/50f0ddc2-fccd-447c-ab50-a086ccb4cd3a.json deleted file mode 100644 index c79782429..000000000 --- a/data/hfopenllm_v2/uukuguy/speechless-mistral-dolphin-orca-platypus-samantha-7b/50f0ddc2-fccd-447c-ab50-a086ccb4cd3a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/uukuguy_speechless-mistral-dolphin-orca-platypus-samantha-7b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "speechless-mistral-dolphin-orca-platypus-samantha-7b", - "id": "uukuguy/speechless-mistral-dolphin-orca-platypus-samantha-7b", - "developer": "uukuguy", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.37 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4983 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0295 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2836 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4361 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.299 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/uukuguy/speechless-zephyr-code-functionary-7b/83294141-a70f-40da-b3f8-21b367098cce.json b/data/hfopenllm_v2/uukuguy/speechless-zephyr-code-functionary-7b/83294141-a70f-40da-b3f8-21b367098cce.json deleted file mode 100644 index 947584201..000000000 --- a/data/hfopenllm_v2/uukuguy/speechless-zephyr-code-functionary-7b/83294141-a70f-40da-b3f8-21b367098cce.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/uukuguy_speechless-zephyr-code-functionary-7b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "speechless-zephyr-code-functionary-7b", - "id": "uukuguy/speechless-zephyr-code-functionary-7b", - "developer": "uukuguy", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2696 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4664 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0423 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3003 - } - }, - 
{ - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4268 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3094 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/v000000/L3-8B-Stheno-v3.2-abliterated/303ae3d2-fdf5-404d-83ca-8e6071e13e6b.json b/data/hfopenllm_v2/v000000/L3-8B-Stheno-v3.2-abliterated/303ae3d2-fdf5-404d-83ca-8e6071e13e6b.json deleted file mode 100644 index ef4193f8a..000000000 --- a/data/hfopenllm_v2/v000000/L3-8B-Stheno-v3.2-abliterated/303ae3d2-fdf5-404d-83ca-8e6071e13e6b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/v000000_L3-8B-Stheno-v3.2-abliterated/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "L3-8B-Stheno-v3.2-abliterated", - "id": "v000000/L3-8B-Stheno-v3.2-abliterated", - "developer": "v000000", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6718 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5141 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0695 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3096 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy 
on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.362 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3604 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/v000000/L3.1-Niitorm-8B-DPO-t0.0001/1b13d76d-259f-41f2-baba-ce96ef0cb937.json b/data/hfopenllm_v2/v000000/L3.1-Niitorm-8B-DPO-t0.0001/1b13d76d-259f-41f2-baba-ce96ef0cb937.json deleted file mode 100644 index 4b7d05df3..000000000 --- a/data/hfopenllm_v2/v000000/L3.1-Niitorm-8B-DPO-t0.0001/1b13d76d-259f-41f2-baba-ce96ef0cb937.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/v000000_L3.1-Niitorm-8B-DPO-t0.0001/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "L3.1-Niitorm-8B-DPO-t0.0001", - "id": "v000000/L3.1-Niitorm-8B-DPO-t0.0001", - "developer": "v000000", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7689 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5134 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1624 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2945 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.388 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { 
- "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3866 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/v000000/L3.1-Storniitova-8B/b644a420-0a70-4b3d-9a5a-ff91911c857b.json b/data/hfopenllm_v2/v000000/L3.1-Storniitova-8B/b644a420-0a70-4b3d-9a5a-ff91911c857b.json deleted file mode 100644 index 45240285c..000000000 --- a/data/hfopenllm_v2/v000000/L3.1-Storniitova-8B/b644a420-0a70-4b3d-9a5a-ff91911c857b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/v000000_L3.1-Storniitova-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "L3.1-Storniitova-8B", - "id": "v000000/L3.1-Storniitova-8B", - "developer": "v000000", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7817 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5151 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1465 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2894 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4029 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.3776 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/v000000/Qwen2.5-14B-Gutenberg-1e-Delta/33aaa60f-eb69-4d36-917c-6862121a223e.json b/data/hfopenllm_v2/v000000/Qwen2.5-14B-Gutenberg-1e-Delta/33aaa60f-eb69-4d36-917c-6862121a223e.json deleted file mode 100644 index c7a731adf..000000000 --- a/data/hfopenllm_v2/v000000/Qwen2.5-14B-Gutenberg-1e-Delta/33aaa60f-eb69-4d36-917c-6862121a223e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/v000000_Qwen2.5-14B-Gutenberg-1e-Delta/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-14B-Gutenberg-1e-Delta", - "id": "v000000/Qwen2.5-14B-Gutenberg-1e-Delta", - "developer": "v000000", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8045 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6398 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5264 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3289 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4073 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.493 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/v000000/Qwen2.5-14B-Gutenberg-Instruct-Slerpeno/a1d2e571-6de0-4bd7-bdcf-8b3921b450f6.json b/data/hfopenllm_v2/v000000/Qwen2.5-14B-Gutenberg-Instruct-Slerpeno/a1d2e571-6de0-4bd7-bdcf-8b3921b450f6.json deleted file mode 100644 index 98637a74d..000000000 --- a/data/hfopenllm_v2/v000000/Qwen2.5-14B-Gutenberg-Instruct-Slerpeno/a1d2e571-6de0-4bd7-bdcf-8b3921b450f6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/v000000_Qwen2.5-14B-Gutenberg-Instruct-Slerpeno/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-14B-Gutenberg-Instruct-Slerpeno", - "id": "v000000/Qwen2.5-14B-Gutenberg-Instruct-Slerpeno", - "developer": "v000000", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8197 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.639 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5325 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3314 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4114 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4924 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/v000000/Qwen2.5-Lumen-14B/ad93274e-3ca0-40cb-9f65-e6e6c66a8008.json 
b/data/hfopenllm_v2/v000000/Qwen2.5-Lumen-14B/ad93274e-3ca0-40cb-9f65-e6e6c66a8008.json deleted file mode 100644 index 13c590e0a..000000000 --- a/data/hfopenllm_v2/v000000/Qwen2.5-Lumen-14B/ad93274e-3ca0-40cb-9f65-e6e6c66a8008.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/v000000_Qwen2.5-Lumen-14B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-Lumen-14B", - "id": "v000000/Qwen2.5-Lumen-14B", - "developer": "v000000", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8064 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6391 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5363 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.328 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4114 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4903 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/vhab10/Llama-3.1-8B-Base-Instruct-SLERP/b8043d04-c3ab-4d6a-97eb-44b195a52710.json b/data/hfopenllm_v2/vhab10/Llama-3.1-8B-Base-Instruct-SLERP/b8043d04-c3ab-4d6a-97eb-44b195a52710.json deleted file mode 100644 index 84ee49bc2..000000000 --- 
a/data/hfopenllm_v2/vhab10/Llama-3.1-8B-Base-Instruct-SLERP/b8043d04-c3ab-4d6a-97eb-44b195a52710.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/vhab10_Llama-3.1-8B-Base-Instruct-SLERP/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.1-8B-Base-Instruct-SLERP", - "id": "vhab10/Llama-3.1-8B-Base-Instruct-SLERP", - "developer": "vhab10", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2907 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5057 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1201 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2961 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4011 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3621 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/vhab10/Llama-3.2-Instruct-3B-TIES/c6bff6da-382f-4423-ba3a-d987839132e0.json b/data/hfopenllm_v2/vhab10/Llama-3.2-Instruct-3B-TIES/c6bff6da-382f-4423-ba3a-d987839132e0.json deleted file mode 100644 index 44b89e714..000000000 --- a/data/hfopenllm_v2/vhab10/Llama-3.2-Instruct-3B-TIES/c6bff6da-382f-4423-ba3a-d987839132e0.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/vhab10_Llama-3.2-Instruct-3B-TIES/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.2-Instruct-3B-TIES", - "id": "vhab10/Llama-3.2-Instruct-3B-TIES", - "developer": "vhab10", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.848 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4727 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4332 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0982 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2693 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3497 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2916 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/vhab10/llama-3-8b-merged-linear/f3574ad1-a6d7-47fb-86e7-69c256452dea.json b/data/hfopenllm_v2/vhab10/llama-3-8b-merged-linear/f3574ad1-a6d7-47fb-86e7-69c256452dea.json deleted file mode 100644 index a70dfa80f..000000000 --- a/data/hfopenllm_v2/vhab10/llama-3-8b-merged-linear/f3574ad1-a6d7-47fb-86e7-69c256452dea.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/vhab10_llama-3-8b-merged-linear/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - 
"source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "llama-3-8b-merged-linear", - "id": "vhab10/llama-3-8b-merged-linear", - "developer": "vhab10", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 4.65 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5917 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4937 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0816 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2995 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4191 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3704 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/vicgalle/CarbonBeagle-11B-truthy/f2e47267-6c40-4d70-8420-295c95b318f3.json b/data/hfopenllm_v2/vicgalle/CarbonBeagle-11B-truthy/f2e47267-6c40-4d70-8420-295c95b318f3.json deleted file mode 100644 index 702223e17..000000000 --- a/data/hfopenllm_v2/vicgalle/CarbonBeagle-11B-truthy/f2e47267-6c40-4d70-8420-295c95b318f3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/vicgalle_CarbonBeagle-11B-truthy/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "CarbonBeagle-11B-truthy", - "id": "vicgalle/CarbonBeagle-11B-truthy", - "developer": "vicgalle", - 
"inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 10.732 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5212 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5348 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0491 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2995 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.374 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3357 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/vicgalle/CarbonBeagle-11B/395f246e-34c6-40e6-bfeb-b047aa12cf90.json b/data/hfopenllm_v2/vicgalle/CarbonBeagle-11B/395f246e-34c6-40e6-bfeb-b047aa12cf90.json deleted file mode 100644 index d8ca9e059..000000000 --- a/data/hfopenllm_v2/vicgalle/CarbonBeagle-11B/395f246e-34c6-40e6-bfeb-b047aa12cf90.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/vicgalle_CarbonBeagle-11B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "CarbonBeagle-11B", - "id": "vicgalle/CarbonBeagle-11B", - "developer": "vicgalle", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 10.732 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", 
- "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5415 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5294 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0619 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.302 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.402 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3276 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/vicgalle/Configurable-Hermes-2-Pro-Llama-3-8B/3a91f8bb-c132-45b3-b8b4-d2ecc9f03f3a.json b/data/hfopenllm_v2/vicgalle/Configurable-Hermes-2-Pro-Llama-3-8B/3a91f8bb-c132-45b3-b8b4-d2ecc9f03f3a.json deleted file mode 100644 index ea5259667..000000000 --- a/data/hfopenllm_v2/vicgalle/Configurable-Hermes-2-Pro-Llama-3-8B/3a91f8bb-c132-45b3-b8b4-d2ecc9f03f3a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/vicgalle_Configurable-Hermes-2-Pro-Llama-3-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Configurable-Hermes-2-Pro-Llama-3-8B", - "id": "vicgalle/Configurable-Hermes-2-Pro-Llama-3-8B", - "developer": "vicgalle", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.031 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5763 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5055 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0763 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.297 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4184 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3098 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/vicgalle/Configurable-Llama-3.1-8B-Instruct/97c92043-9bed-460a-8d7b-70ab3584c75b.json b/data/hfopenllm_v2/vicgalle/Configurable-Llama-3.1-8B-Instruct/97c92043-9bed-460a-8d7b-70ab3584c75b.json deleted file mode 100644 index 86af2304f..000000000 --- a/data/hfopenllm_v2/vicgalle/Configurable-Llama-3.1-8B-Instruct/97c92043-9bed-460a-8d7b-70ab3584c75b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/vicgalle_Configurable-Llama-3.1-8B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Configurable-Llama-3.1-8B-Instruct", - "id": "vicgalle/Configurable-Llama-3.1-8B-Instruct", - "developer": "vicgalle", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8312 - } - }, - { - 
"evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5045 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.173 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2743 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3845 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3592 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/vicgalle/Configurable-Yi-1.5-9B-Chat/ab2ce171-bfcf-49ea-a341-2a52b2bd803a.json b/data/hfopenllm_v2/vicgalle/Configurable-Yi-1.5-9B-Chat/ab2ce171-bfcf-49ea-a341-2a52b2bd803a.json deleted file mode 100644 index 263e7fc5d..000000000 --- a/data/hfopenllm_v2/vicgalle/Configurable-Yi-1.5-9B-Chat/ab2ce171-bfcf-49ea-a341-2a52b2bd803a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/vicgalle_Configurable-Yi-1.5-9B-Chat/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Configurable-Yi-1.5-9B-Chat", - "id": "vicgalle/Configurable-Yi-1.5-9B-Chat", - "developer": "vicgalle", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.829 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4323 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5452 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2047 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3431 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4271 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4015 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/vicgalle/ConfigurableBeagle-11B/f9bbd9cc-dc6a-466f-b777-eaea4a15b874.json b/data/hfopenllm_v2/vicgalle/ConfigurableBeagle-11B/f9bbd9cc-dc6a-466f-b777-eaea4a15b874.json deleted file mode 100644 index 562396548..000000000 --- a/data/hfopenllm_v2/vicgalle/ConfigurableBeagle-11B/f9bbd9cc-dc6a-466f-b777-eaea4a15b874.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/vicgalle_ConfigurableBeagle-11B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ConfigurableBeagle-11B", - "id": "vicgalle/ConfigurableBeagle-11B", - "developer": "vicgalle", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 10.732 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5834 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5287 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH 
Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0431 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.302 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3953 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3374 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/vicgalle/ConfigurableHermes-7B/cd0aefa3-b0c9-4683-872f-f9f9d285e6c3.json b/data/hfopenllm_v2/vicgalle/ConfigurableHermes-7B/cd0aefa3-b0c9-4683-872f-f9f9d285e6c3.json deleted file mode 100644 index 4374da041..000000000 --- a/data/hfopenllm_v2/vicgalle/ConfigurableHermes-7B/cd0aefa3-b0c9-4683-872f-f9f9d285e6c3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/vicgalle_ConfigurableHermes-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ConfigurableHermes-7B", - "id": "vicgalle/ConfigurableHermes-7B", - "developer": "vicgalle", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5411 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4573 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0476 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2768 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4057 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3025 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/vicgalle/ConfigurableSOLAR-10.7B/c42db2ab-dbc4-48e4-9c16-7b8a5f8492c3.json b/data/hfopenllm_v2/vicgalle/ConfigurableSOLAR-10.7B/c42db2ab-dbc4-48e4-9c16-7b8a5f8492c3.json deleted file mode 100644 index 3fecdf2c2..000000000 --- a/data/hfopenllm_v2/vicgalle/ConfigurableSOLAR-10.7B/c42db2ab-dbc4-48e4-9c16-7b8a5f8492c3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/vicgalle_ConfigurableSOLAR-10.7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ConfigurableSOLAR-10.7B", - "id": "vicgalle/ConfigurableSOLAR-10.7B", - "developer": "vicgalle", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 10.732 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.51 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4867 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0665 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - 
}, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2987 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3805 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3173 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/vicgalle/Humanish-RP-Llama-3.1-8B/1b32c387-97a7-42ff-892c-d3bacebbf050.json b/data/hfopenllm_v2/vicgalle/Humanish-RP-Llama-3.1-8B/1b32c387-97a7-42ff-892c-d3bacebbf050.json deleted file mode 100644 index 99c2ecb52..000000000 --- a/data/hfopenllm_v2/vicgalle/Humanish-RP-Llama-3.1-8B/1b32c387-97a7-42ff-892c-d3bacebbf050.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/vicgalle_Humanish-RP-Llama-3.1-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Humanish-RP-Llama-3.1-8B", - "id": "vicgalle/Humanish-RP-Llama-3.1-8B", - "developer": "vicgalle", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6669 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.51 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1518 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2869 - } - }, - { - 
"evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3952 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3477 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/vicgalle/Merge-Mistral-Prometheus-7B/cbea057c-b0f9-48ac-a075-eb28ebbaf358.json b/data/hfopenllm_v2/vicgalle/Merge-Mistral-Prometheus-7B/cbea057c-b0f9-48ac-a075-eb28ebbaf358.json deleted file mode 100644 index f60131bd5..000000000 --- a/data/hfopenllm_v2/vicgalle/Merge-Mistral-Prometheus-7B/cbea057c-b0f9-48ac-a075-eb28ebbaf358.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/vicgalle_Merge-Mistral-Prometheus-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Merge-Mistral-Prometheus-7B", - "id": "vicgalle/Merge-Mistral-Prometheus-7B", - "developer": "vicgalle", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4848 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4201 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0181 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2634 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on 
MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.41 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2717 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/vicgalle/Merge-Mixtral-Prometheus-8x7B/0b1bb876-9dc7-47d5-855a-f028fb7f2df6.json b/data/hfopenllm_v2/vicgalle/Merge-Mixtral-Prometheus-8x7B/0b1bb876-9dc7-47d5-855a-f028fb7f2df6.json deleted file mode 100644 index 956f520a3..000000000 --- a/data/hfopenllm_v2/vicgalle/Merge-Mixtral-Prometheus-8x7B/0b1bb876-9dc7-47d5-855a-f028fb7f2df6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/vicgalle_Merge-Mixtral-Prometheus-8x7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Merge-Mixtral-Prometheus-8x7B", - "id": "vicgalle/Merge-Mixtral-Prometheus-8x7B", - "developer": "vicgalle", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MixtralForCausalLM", - "params_billions": 46.703 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5744 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5351 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0929 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3087 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4098 - } - }, - { - "evaluation_name": 
"MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3684 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/vicgalle/Roleplay-Llama-3-8B/a86678ad-344c-430f-80c7-02d634b0cd5b.json b/data/hfopenllm_v2/vicgalle/Roleplay-Llama-3-8B/a86678ad-344c-430f-80c7-02d634b0cd5b.json deleted file mode 100644 index 3b3b5675b..000000000 --- a/data/hfopenllm_v2/vicgalle/Roleplay-Llama-3-8B/a86678ad-344c-430f-80c7-02d634b0cd5b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/vicgalle_Roleplay-Llama-3-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Roleplay-Llama-3-8B", - "id": "vicgalle/Roleplay-Llama-3-8B", - "developer": "vicgalle", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.732 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5012 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0914 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2609 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3529 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3708 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/viettelsecurity-ai/security-llama3.2-3b/827f3236-74fa-432b-8177-8785ac25ad76.json b/data/hfopenllm_v2/viettelsecurity-ai/security-llama3.2-3b/827f3236-74fa-432b-8177-8785ac25ad76.json deleted file mode 100644 index 088a8c8aa..000000000 --- a/data/hfopenllm_v2/viettelsecurity-ai/security-llama3.2-3b/827f3236-74fa-432b-8177-8785ac25ad76.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/viettelsecurity-ai_security-llama3.2-3b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "security-llama3.2-3b", - "id": "viettelsecurity-ai/security-llama3.2-3b", - "developer": "viettelsecurity-ai", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5909 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4401 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1261 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2743 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3379 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2837 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/vihangd/smart-dan-sft-v0.1/7f694687-77e5-41d2-923b-f2d5f231729b.json b/data/hfopenllm_v2/vihangd/smart-dan-sft-v0.1/7f694687-77e5-41d2-923b-f2d5f231729b.json deleted file mode 100644 index ee3f7a212..000000000 --- a/data/hfopenllm_v2/vihangd/smart-dan-sft-v0.1/7f694687-77e5-41d2-923b-f2d5f231729b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/vihangd_smart-dan-sft-v0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "smart-dan-sft-v0.1", - "id": "vihangd/smart-dan-sft-v0.1", - "developer": "vihangd", - "inference_platform": "unknown", - "additional_details": { - "precision": "4bit", - "architecture": "LlamaForCausalLM", - "params_billions": 0.379 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1576 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3062 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0098 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.255 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3502 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1142 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/voidful/smol-360m-ft/daa9d03e-63b0-4c08-ae72-e11041200ac7.json b/data/hfopenllm_v2/voidful/smol-360m-ft/daa9d03e-63b0-4c08-ae72-e11041200ac7.json deleted file mode 100644 index b88c7efbe..000000000 --- 
a/data/hfopenllm_v2/voidful/smol-360m-ft/daa9d03e-63b0-4c08-ae72-e11041200ac7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/voidful_smol-360m-ft/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "smol-360m-ft", - "id": "voidful/smol-360m-ft", - "developer": "voidful", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 0.362 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2013 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3012 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0083 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2458 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3714 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1087 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/vonjack/MobileLLM-125M-HF/1539822f-acc4-4dae-9e61-133da97ebcbe.json b/data/hfopenllm_v2/vonjack/MobileLLM-125M-HF/1539822f-acc4-4dae-9e61-133da97ebcbe.json deleted file mode 100644 index 3d05087f3..000000000 --- a/data/hfopenllm_v2/vonjack/MobileLLM-125M-HF/1539822f-acc4-4dae-9e61-133da97ebcbe.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/vonjack_MobileLLM-125M-HF/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - 
"source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MobileLLM-125M-HF", - "id": "vonjack/MobileLLM-125M-HF", - "developer": "vonjack", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 0.125 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2107 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3027 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0091 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2601 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3782 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1164 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/vonjack/Phi-3-mini-4k-instruct-LLaMAfied/eec80fda-ce2f-4ef4-94d3-9e7b90f7f2e5.json b/data/hfopenllm_v2/vonjack/Phi-3-mini-4k-instruct-LLaMAfied/eec80fda-ce2f-4ef4-94d3-9e7b90f7f2e5.json deleted file mode 100644 index adf71cb9a..000000000 --- a/data/hfopenllm_v2/vonjack/Phi-3-mini-4k-instruct-LLaMAfied/eec80fda-ce2f-4ef4-94d3-9e7b90f7f2e5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/vonjack_Phi-3-mini-4k-instruct-LLaMAfied/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": 
"Phi-3-mini-4k-instruct-LLaMAfied", - "id": "vonjack/Phi-3-mini-4k-instruct-LLaMAfied", - "developer": "vonjack", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.821 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5787 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5741 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1382 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3305 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3924 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3885 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/vonjack/Phi-3.5-mini-instruct-hermes-fc-json/448cac5f-a7d3-41fb-9b49-666758037eb4.json b/data/hfopenllm_v2/vonjack/Phi-3.5-mini-instruct-hermes-fc-json/448cac5f-a7d3-41fb-9b49-666758037eb4.json deleted file mode 100644 index 745062dbb..000000000 --- a/data/hfopenllm_v2/vonjack/Phi-3.5-mini-instruct-hermes-fc-json/448cac5f-a7d3-41fb-9b49-666758037eb4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/vonjack_Phi-3.5-mini-instruct-hermes-fc-json/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Phi-3.5-mini-instruct-hermes-fc-json", - "id": "vonjack/Phi-3.5-mini-instruct-hermes-fc-json", - "developer": "vonjack", - "inference_platform": "unknown", - 
"additional_details": { - "precision": "float16", - "architecture": "?", - "params_billions": 4.132 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1416 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2975 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0076 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2542 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4041 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1139 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/vonjack/Qwen2.5-Coder-0.5B-Merged/5d7c5ac1-84c3-4fd1-ac51-4c00ed8c59c7.json b/data/hfopenllm_v2/vonjack/Qwen2.5-Coder-0.5B-Merged/5d7c5ac1-84c3-4fd1-ac51-4c00ed8c59c7.json deleted file mode 100644 index 7da1c30b9..000000000 --- a/data/hfopenllm_v2/vonjack/Qwen2.5-Coder-0.5B-Merged/5d7c5ac1-84c3-4fd1-ac51-4c00ed8c59c7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/vonjack_Qwen2.5-Coder-0.5B-Merged/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-Coder-0.5B-Merged", - "id": "vonjack/Qwen2.5-Coder-0.5B-Merged", - "developer": "vonjack", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 0.63 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - 
"source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.31 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3076 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0378 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2534 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3303 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1202 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/vonjack/SmolLM2-1.7B-Merged/7e1741cc-f9ea-4940-9b6b-d7a515cfce31.json b/data/hfopenllm_v2/vonjack/SmolLM2-1.7B-Merged/7e1741cc-f9ea-4940-9b6b-d7a515cfce31.json deleted file mode 100644 index a43b36871..000000000 --- a/data/hfopenllm_v2/vonjack/SmolLM2-1.7B-Merged/7e1741cc-f9ea-4940-9b6b-d7a515cfce31.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/vonjack_SmolLM2-1.7B-Merged/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SmolLM2-1.7B-Merged", - "id": "vonjack/SmolLM2-1.7B-Merged", - "developer": "vonjack", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 1.711 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.3698 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3587 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0627 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2794 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3408 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2048 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/vonjack/SmolLM2-135M-Merged/ec4d21be-b1a6-47a9-84a4-1a25249c1768.json b/data/hfopenllm_v2/vonjack/SmolLM2-135M-Merged/ec4d21be-b1a6-47a9-84a4-1a25249c1768.json deleted file mode 100644 index 409a63ac3..000000000 --- a/data/hfopenllm_v2/vonjack/SmolLM2-135M-Merged/ec4d21be-b1a6-47a9-84a4-1a25249c1768.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/vonjack_SmolLM2-135M-Merged/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SmolLM2-135M-Merged", - "id": "vonjack/SmolLM2-135M-Merged", - "developer": "vonjack", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 0.135 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2483 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": 
false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.31 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0113 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2383 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3662 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1112 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/vonjack/SmolLM2-360M-Merged/c6b03539-04b3-4ef2-909d-8036a7ea2ae1.json b/data/hfopenllm_v2/vonjack/SmolLM2-360M-Merged/c6b03539-04b3-4ef2-909d-8036a7ea2ae1.json deleted file mode 100644 index 0d45953b7..000000000 --- a/data/hfopenllm_v2/vonjack/SmolLM2-360M-Merged/c6b03539-04b3-4ef2-909d-8036a7ea2ae1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/vonjack_SmolLM2-360M-Merged/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SmolLM2-360M-Merged", - "id": "vonjack/SmolLM2-360M-Merged", - "developer": "vonjack", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 0.362 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3206 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3155 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - 
"hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0174 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2559 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3527 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1098 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/w4r10ck/SOLAR-10.7B-Instruct-v1.0-uncensored/f156ac38-056e-4ef1-bdbe-e83c299a683b.json b/data/hfopenllm_v2/w4r10ck/SOLAR-10.7B-Instruct-v1.0-uncensored/f156ac38-056e-4ef1-bdbe-e83c299a683b.json deleted file mode 100644 index 1c4c50f83..000000000 --- a/data/hfopenllm_v2/w4r10ck/SOLAR-10.7B-Instruct-v1.0-uncensored/f156ac38-056e-4ef1-bdbe-e83c299a683b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/w4r10ck_SOLAR-10.7B-Instruct-v1.0-uncensored/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SOLAR-10.7B-Instruct-v1.0-uncensored", - "id": "w4r10ck/SOLAR-10.7B-Instruct-v1.0-uncensored", - "developer": "w4r10ck", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 10.732 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3884 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5302 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": 
false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0657 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2945 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4639 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3344 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/wanlige/li-14b-v0.4-slerp/11d3c8db-300c-4e02-b729-7adba6844ad2.json b/data/hfopenllm_v2/wanlige/li-14b-v0.4-slerp/11d3c8db-300c-4e02-b729-7adba6844ad2.json deleted file mode 100644 index 11b77c784..000000000 --- a/data/hfopenllm_v2/wanlige/li-14b-v0.4-slerp/11d3c8db-300c-4e02-b729-7adba6844ad2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/wanlige_li-14b-v0.4-slerp/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "li-14b-v0.4-slerp", - "id": "wanlige/li-14b-v0.4-slerp", - "developer": "wanlige", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4606 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6587 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4192 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - 
}, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4002 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4768 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5372 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/wanlige/li-14b-v0.4-slerp0.1/fc75a820-fc0b-4e50-9304-61f0e93795c0.json b/data/hfopenllm_v2/wanlige/li-14b-v0.4-slerp0.1/fc75a820-fc0b-4e50-9304-61f0e93795c0.json deleted file mode 100644 index c132d7290..000000000 --- a/data/hfopenllm_v2/wanlige/li-14b-v0.4-slerp0.1/fc75a820-fc0b-4e50-9304-61f0e93795c0.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/wanlige_li-14b-v0.4-slerp0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "li-14b-v0.4-slerp0.1", - "id": "wanlige/li-14b-v0.4-slerp0.1", - "developer": "wanlige", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7923 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6572 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5332 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3591 - } - }, - { - "evaluation_name": 
"MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4207 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5294 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/wanlige/li-14b-v0.4/bb66896f-799c-4e17-8b54-af5e795699fa.json b/data/hfopenllm_v2/wanlige/li-14b-v0.4/bb66896f-799c-4e17-8b54-af5e795699fa.json deleted file mode 100644 index 3c975e261..000000000 --- a/data/hfopenllm_v2/wanlige/li-14b-v0.4/bb66896f-799c-4e17-8b54-af5e795699fa.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/wanlige_li-14b-v0.4/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "li-14b-v0.4", - "id": "wanlige/li-14b-v0.4", - "developer": "wanlige", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8133 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6544 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5574 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3389 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": 
{ - "score": 0.446 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5167 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/wannaphong/KhanomTanLLM-Instruct/30a1a786-7478-401f-85ae-57037ada3d32.json b/data/hfopenllm_v2/wannaphong/KhanomTanLLM-Instruct/30a1a786-7478-401f-85ae-57037ada3d32.json deleted file mode 100644 index 2276ead59..000000000 --- a/data/hfopenllm_v2/wannaphong/KhanomTanLLM-Instruct/30a1a786-7478-401f-85ae-57037ada3d32.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/wannaphong_KhanomTanLLM-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "KhanomTanLLM-Instruct", - "id": "wannaphong/KhanomTanLLM-Instruct", - "developer": "wannaphong", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.447 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1621 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3093 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0136 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2634 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3701 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1119 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/waqasali1707/Beast-Soul-new/05430b16-07b6-41a1-ade9-6211cdf8ccf1.json b/data/hfopenllm_v2/waqasali1707/Beast-Soul-new/05430b16-07b6-41a1-ade9-6211cdf8ccf1.json deleted file mode 100644 index 7fbd3aeba..000000000 --- a/data/hfopenllm_v2/waqasali1707/Beast-Soul-new/05430b16-07b6-41a1-ade9-6211cdf8ccf1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/waqasali1707_Beast-Soul-new/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Beast-Soul-new", - "id": "waqasali1707/Beast-Soul-new", - "developer": "waqasali1707", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.503 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5225 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0702 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2827 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4486 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3108 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/wave-on-discord/qwent-7b/09bc4d5a-f104-4a36-999c-11e2532eef1e.json b/data/hfopenllm_v2/wave-on-discord/qwent-7b/09bc4d5a-f104-4a36-999c-11e2532eef1e.json deleted file mode 100644 index 3152ab264..000000000 --- a/data/hfopenllm_v2/wave-on-discord/qwent-7b/09bc4d5a-f104-4a36-999c-11e2532eef1e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/wave-on-discord_qwent-7b/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "qwent-7b", - "id": "wave-on-discord/qwent-7b", - "developer": "wave-on-discord", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2015 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4228 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0038 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2651 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3817 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1603 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/weathermanj/Menda-3B-500/a92cfff6-6caf-4bf1-913a-9d7dd2d8d449.json b/data/hfopenllm_v2/weathermanj/Menda-3B-500/a92cfff6-6caf-4bf1-913a-9d7dd2d8d449.json deleted file mode 100644 index cc66f0e58..000000000 --- 
a/data/hfopenllm_v2/weathermanj/Menda-3B-500/a92cfff6-6caf-4bf1-913a-9d7dd2d8d449.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/weathermanj_Menda-3B-500/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Menda-3B-500", - "id": "weathermanj/Menda-3B-500", - "developer": "weathermanj", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.086 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6353 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4766 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3724 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2878 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3968 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3475 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/weathermanj/Menda-3b-750/8972e92c-ebbe-4dc4-8a8c-6f7a42ab5c11.json b/data/hfopenllm_v2/weathermanj/Menda-3b-750/8972e92c-ebbe-4dc4-8a8c-6f7a42ab5c11.json deleted file mode 100644 index ed1d22611..000000000 --- a/data/hfopenllm_v2/weathermanj/Menda-3b-750/8972e92c-ebbe-4dc4-8a8c-6f7a42ab5c11.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/weathermanj_Menda-3b-750/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - 
"source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Menda-3b-750", - "id": "weathermanj/Menda-3b-750", - "developer": "weathermanj", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.086 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6335 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4737 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3716 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2878 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3942 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3506 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/weathermanj/Menda-3b-Optim-100/e4f39815-9704-4d0a-8d9b-39359367adcc.json b/data/hfopenllm_v2/weathermanj/Menda-3b-Optim-100/e4f39815-9704-4d0a-8d9b-39359367adcc.json deleted file mode 100644 index d3fdfc367..000000000 --- a/data/hfopenllm_v2/weathermanj/Menda-3b-Optim-100/e4f39815-9704-4d0a-8d9b-39359367adcc.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/weathermanj_Menda-3b-Optim-100/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Menda-3b-Optim-100", - "id": 
"weathermanj/Menda-3b-Optim-100", - "developer": "weathermanj", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.086 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6398 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4735 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3716 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2894 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3993 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3461 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/weathermanj/Menda-3b-Optim-200/f40df456-eb9a-46f8-8fb0-b6ad2748f3c2.json b/data/hfopenllm_v2/weathermanj/Menda-3b-Optim-200/f40df456-eb9a-46f8-8fb0-b6ad2748f3c2.json deleted file mode 100644 index dc2afd5e8..000000000 --- a/data/hfopenllm_v2/weathermanj/Menda-3b-Optim-200/f40df456-eb9a-46f8-8fb0-b6ad2748f3c2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/weathermanj_Menda-3b-Optim-200/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Menda-3b-Optim-200", - "id": "weathermanj/Menda-3b-Optim-200", - "developer": "weathermanj", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.086 - } - }, - 
"evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6375 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4746 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3731 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2827 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4033 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3484 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/win10/ArliAI-RPMax-v1.3-merge-13.3B/398996d9-299b-4120-a757-e2fe14e779ee.json b/data/hfopenllm_v2/win10/ArliAI-RPMax-v1.3-merge-13.3B/398996d9-299b-4120-a757-e2fe14e779ee.json deleted file mode 100644 index 8283795f0..000000000 --- a/data/hfopenllm_v2/win10/ArliAI-RPMax-v1.3-merge-13.3B/398996d9-299b-4120-a757-e2fe14e779ee.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/win10_ArliAI-RPMax-v1.3-merge-13.3B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ArliAI-RPMax-v1.3-merge-13.3B", - "id": "win10/ArliAI-RPMax-v1.3-merge-13.3B", - "developer": "win10", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 13.265 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3038 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4581 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0393 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2743 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4325 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.32 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/win10/Breeze-13B-32k-Instruct-v1_0/4398633e-77b0-4b61-ae85-29b0e5aad38b.json b/data/hfopenllm_v2/win10/Breeze-13B-32k-Instruct-v1_0/4398633e-77b0-4b61-ae85-29b0e5aad38b.json deleted file mode 100644 index 4e29ec855..000000000 --- a/data/hfopenllm_v2/win10/Breeze-13B-32k-Instruct-v1_0/4398633e-77b0-4b61-ae85-29b0e5aad38b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/win10_Breeze-13B-32k-Instruct-v1_0/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Breeze-13B-32k-Instruct-v1_0", - "id": "win10/Breeze-13B-32k-Instruct-v1_0", - "developer": "win10", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 12.726 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3584 - } - }, - { - 
"evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4611 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0128 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2643 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4202 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2568 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/win10/EVA-Norns-Qwen2.5-v0.1/1bc60148-512f-4830-b541-f30535cf74bf.json b/data/hfopenllm_v2/win10/EVA-Norns-Qwen2.5-v0.1/1bc60148-512f-4830-b541-f30535cf74bf.json deleted file mode 100644 index 2e96decff..000000000 --- a/data/hfopenllm_v2/win10/EVA-Norns-Qwen2.5-v0.1/1bc60148-512f-4830-b541-f30535cf74bf.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/win10_EVA-Norns-Qwen2.5-v0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "EVA-Norns-Qwen2.5-v0.1", - "id": "win10/EVA-Norns-Qwen2.5-v0.1", - "developer": "win10", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.622 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5072 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2613 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2852 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4045 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3425 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/win10/Llama-3.2-3B-Instruct-24-9-29/a9dfb20a-13e0-4419-a747-7c001b2e9435.json b/data/hfopenllm_v2/win10/Llama-3.2-3B-Instruct-24-9-29/a9dfb20a-13e0-4419-a747-7c001b2e9435.json deleted file mode 100644 index e291ffe11..000000000 --- a/data/hfopenllm_v2/win10/Llama-3.2-3B-Instruct-24-9-29/a9dfb20a-13e0-4419-a747-7c001b2e9435.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/win10_Llama-3.2-3B-Instruct-24-9-29/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.2-3B-Instruct-24-9-29", - "id": "win10/Llama-3.2-3B-Instruct-24-9-29", - "developer": "win10", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7332 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4614 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - 
"source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1707 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2743 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3555 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3228 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/win10/Norns-Qwen2.5-12B/388e3559-a3b6-4738-9843-9bdd048bae09.json b/data/hfopenllm_v2/win10/Norns-Qwen2.5-12B/388e3559-a3b6-4738-9843-9bdd048bae09.json deleted file mode 100644 index f5e98a734..000000000 --- a/data/hfopenllm_v2/win10/Norns-Qwen2.5-12B/388e3559-a3b6-4738-9843-9bdd048bae09.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/win10_Norns-Qwen2.5-12B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Norns-Qwen2.5-12B", - "id": "win10/Norns-Qwen2.5-12B", - "developer": "win10", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 12.277 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4897 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4619 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.0838 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2836 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3555 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.266 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/win10/Norns-Qwen2.5-7B/994a6930-42d5-463a-9e7c-0a3070144211.json b/data/hfopenllm_v2/win10/Norns-Qwen2.5-7B/994a6930-42d5-463a-9e7c-0a3070144211.json deleted file mode 100644 index a0a55bfe7..000000000 --- a/data/hfopenllm_v2/win10/Norns-Qwen2.5-7B/994a6930-42d5-463a-9e7c-0a3070144211.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/win10_Norns-Qwen2.5-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Norns-Qwen2.5-7B", - "id": "win10/Norns-Qwen2.5-7B", - "developer": "win10", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6122 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5073 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2628 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2844 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4085 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3413 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/win10/Qwen2.5-2B-Instruct/cce46320-9794-443a-831a-92e2a21515b0.json b/data/hfopenllm_v2/win10/Qwen2.5-2B-Instruct/cce46320-9794-443a-831a-92e2a21515b0.json deleted file mode 100644 index 0e10fdee0..000000000 --- a/data/hfopenllm_v2/win10/Qwen2.5-2B-Instruct/cce46320-9794-443a-831a-92e2a21515b0.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/win10_Qwen2.5-2B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-2B-Instruct", - "id": "win10/Qwen2.5-2B-Instruct", - "developer": "win10", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 2.9 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2273 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3706 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0227 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2676 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - 
"metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4378 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1934 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/win10/llama3-13.45b-Instruct/988f4cc0-ebfb-43a9-8a7f-3dd1f1c1e342.json b/data/hfopenllm_v2/win10/llama3-13.45b-Instruct/988f4cc0-ebfb-43a9-8a7f-3dd1f1c1e342.json deleted file mode 100644 index 4c991522b..000000000 --- a/data/hfopenllm_v2/win10/llama3-13.45b-Instruct/988f4cc0-ebfb-43a9-8a7f-3dd1f1c1e342.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/win10_llama3-13.45b-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "llama3-13.45b-Instruct", - "id": "win10/llama3-13.45b-Instruct", - "developer": "win10", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 13.265 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4144 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4865 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0242 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2584 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3848 - } - }, - { - "evaluation_name": 
"MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3345 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/win10/miscii-14b-1M-0128/3c675148-5d09-4778-baad-9295ef8cfc79.json b/data/hfopenllm_v2/win10/miscii-14b-1M-0128/3c675148-5d09-4778-baad-9295ef8cfc79.json deleted file mode 100644 index be52a6670..000000000 --- a/data/hfopenllm_v2/win10/miscii-14b-1M-0128/3c675148-5d09-4778-baad-9295ef8cfc79.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/win10_miscii-14b-1M-0128/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "miscii-14b-1M-0128", - "id": "win10/miscii-14b-1M-0128", - "developer": "win10", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.766 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4181 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5742 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4773 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3826 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5431 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 
0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4491 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/winglian/Llama-3-8b-64k-PoSE/620b80ba-81ab-4504-9f42-4965014f3cd1.json b/data/hfopenllm_v2/winglian/Llama-3-8b-64k-PoSE/620b80ba-81ab-4504-9f42-4965014f3cd1.json deleted file mode 100644 index 6c33d53ee..000000000 --- a/data/hfopenllm_v2/winglian/Llama-3-8b-64k-PoSE/620b80ba-81ab-4504-9f42-4965014f3cd1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/winglian_Llama-3-8b-64k-PoSE/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-8b-64k-PoSE", - "id": "winglian/Llama-3-8b-64k-PoSE", - "developer": "winglian", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2857 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3702 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0415 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2609 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3396 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2467 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/winglian/llama-3-8b-256k-PoSE/b6c68fc1-c2c1-4cdf-91ef-2007becd7ade.json 
b/data/hfopenllm_v2/winglian/llama-3-8b-256k-PoSE/b6c68fc1-c2c1-4cdf-91ef-2007becd7ade.json deleted file mode 100644 index e3d0f5b20..000000000 --- a/data/hfopenllm_v2/winglian/llama-3-8b-256k-PoSE/b6c68fc1-c2c1-4cdf-91ef-2007becd7ade.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/winglian_llama-3-8b-256k-PoSE/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "llama-3-8b-256k-PoSE", - "id": "winglian/llama-3-8b-256k-PoSE", - "developer": "winglian", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2909 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3157 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0196 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2576 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3316 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1116 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/wzhouad/gemma-2-9b-it-WPO-HB/19279c18-c2f7-4f75-a9c5-a121b2d4bcff.json b/data/hfopenllm_v2/wzhouad/gemma-2-9b-it-WPO-HB/19279c18-c2f7-4f75-a9c5-a121b2d4bcff.json deleted file mode 100644 index 7af55e4ae..000000000 --- a/data/hfopenllm_v2/wzhouad/gemma-2-9b-it-WPO-HB/19279c18-c2f7-4f75-a9c5-a121b2d4bcff.json +++ 
/dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/wzhouad_gemma-2-9b-it-WPO-HB/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "gemma-2-9b-it-WPO-HB", - "id": "wzhouad/gemma-2-9b-it-WPO-HB", - "developer": "wzhouad", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 9.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5437 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5629 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1533 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3498 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3675 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.336 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/x0000001/Deepseek-Lumen-R1-Qwen2.5-14B/7966789d-8ace-4b39-9093-96bbb8e641d8.json b/data/hfopenllm_v2/x0000001/Deepseek-Lumen-R1-Qwen2.5-14B/7966789d-8ace-4b39-9093-96bbb8e641d8.json deleted file mode 100644 index 1d76659b4..000000000 --- a/data/hfopenllm_v2/x0000001/Deepseek-Lumen-R1-Qwen2.5-14B/7966789d-8ace-4b39-9093-96bbb8e641d8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/x0000001_Deepseek-Lumen-R1-Qwen2.5-14B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - 
"source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Deepseek-Lumen-R1-Qwen2.5-14B", - "id": "x0000001/Deepseek-Lumen-R1-Qwen2.5-14B", - "developer": "x0000001", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 14.77 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4436 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4569 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2779 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2852 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.474 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4379 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/xMaulana/FinMatcha-3B-Instruct/5e1d849d-0342-4de9-a7d8-dd5cd5960fac.json b/data/hfopenllm_v2/xMaulana/FinMatcha-3B-Instruct/5e1d849d-0342-4de9-a7d8-dd5cd5960fac.json deleted file mode 100644 index ef5a60545..000000000 --- a/data/hfopenllm_v2/xMaulana/FinMatcha-3B-Instruct/5e1d849d-0342-4de9-a7d8-dd5cd5960fac.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/xMaulana_FinMatcha-3B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "FinMatcha-3B-Instruct", - "id": 
"xMaulana/FinMatcha-3B-Instruct", - "developer": "xMaulana", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7548 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4536 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1435 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2693 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3633 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3182 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/xinchen9/Llama3.1_8B_Instruct_CoT/a17563e3-0369-4042-8006-2ec781653f63.json b/data/hfopenllm_v2/xinchen9/Llama3.1_8B_Instruct_CoT/a17563e3-0369-4042-8006-2ec781653f63.json deleted file mode 100644 index 6fe27572a..000000000 --- a/data/hfopenllm_v2/xinchen9/Llama3.1_8B_Instruct_CoT/a17563e3-0369-4042-8006-2ec781653f63.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/xinchen9_Llama3.1_8B_Instruct_CoT/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama3.1_8B_Instruct_CoT", - "id": "xinchen9/Llama3.1_8B_Instruct_CoT", - "developer": "xinchen9", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - 
"evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2974 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4398 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0604 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.302 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4371 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2879 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/xinchen9/Llama3.1_CoT/68369110-e371-4112-ae0a-14f7fe9fc40f.json b/data/hfopenllm_v2/xinchen9/Llama3.1_CoT/68369110-e371-4112-ae0a-14f7fe9fc40f.json deleted file mode 100644 index 9bdeb1364..000000000 --- a/data/hfopenllm_v2/xinchen9/Llama3.1_CoT/68369110-e371-4112-ae0a-14f7fe9fc40f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/xinchen9_Llama3.1_CoT/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama3.1_CoT", - "id": "xinchen9/Llama3.1_CoT", - "developer": "xinchen9", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2246 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4341 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0385 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2886 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4305 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2739 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/xinchen9/Llama3.1_CoT_V1/2a6925d3-992f-4c4f-a57b-3eb41062743b.json b/data/hfopenllm_v2/xinchen9/Llama3.1_CoT_V1/2a6925d3-992f-4c4f-a57b-3eb41062743b.json deleted file mode 100644 index cd826b22d..000000000 --- a/data/hfopenllm_v2/xinchen9/Llama3.1_CoT_V1/2a6925d3-992f-4c4f-a57b-3eb41062743b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/xinchen9_Llama3.1_CoT_V1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama3.1_CoT_V1", - "id": "xinchen9/Llama3.1_CoT_V1", - "developer": "xinchen9", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2453 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4376 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0332 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2794 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4572 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2805 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/xinchen9/Mistral-7B-CoT/28290ea9-9ce5-4605-ac5b-aa2d606994d8.json b/data/hfopenllm_v2/xinchen9/Mistral-7B-CoT/28290ea9-9ce5-4605-ac5b-aa2d606994d8.json deleted file mode 100644 index 678ee6b2d..000000000 --- a/data/hfopenllm_v2/xinchen9/Mistral-7B-CoT/28290ea9-9ce5-4605-ac5b-aa2d606994d8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/xinchen9_Mistral-7B-CoT/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Mistral-7B-CoT", - "id": "xinchen9/Mistral-7B-CoT", - "developer": "xinchen9", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2783 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3873 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH 
Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0249 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2492 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3994 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2284 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/xinchen9/llama3-b8-ft-dis/eb2ed6eb-4789-400d-aea5-841547a20cd7.json b/data/hfopenllm_v2/xinchen9/llama3-b8-ft-dis/eb2ed6eb-4789-400d-aea5-841547a20cd7.json deleted file mode 100644 index bb558ab87..000000000 --- a/data/hfopenllm_v2/xinchen9/llama3-b8-ft-dis/eb2ed6eb-4789-400d-aea5-841547a20cd7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/xinchen9_llama3-b8-ft-dis/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "llama3-b8-ft-dis", - "id": "xinchen9/llama3-b8-ft-dis", - "developer": "xinchen9", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1546 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4626 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 
- }, - "score_details": { - "score": 0.0393 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3129 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3654 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3244 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/xkp24/Llama-3-8B-Instruct-SPPO-Iter2_bt_2b-table/873218a0-7ddb-4287-88ce-8c8214e85c85.json b/data/hfopenllm_v2/xkp24/Llama-3-8B-Instruct-SPPO-Iter2_bt_2b-table/873218a0-7ddb-4287-88ce-8c8214e85c85.json deleted file mode 100644 index 9463f18e7..000000000 --- a/data/hfopenllm_v2/xkp24/Llama-3-8B-Instruct-SPPO-Iter2_bt_2b-table/873218a0-7ddb-4287-88ce-8c8214e85c85.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/xkp24_Llama-3-8B-Instruct-SPPO-Iter2_bt_2b-table/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-8B-Instruct-SPPO-Iter2_bt_2b-table", - "id": "xkp24/Llama-3-8B-Instruct-SPPO-Iter2_bt_2b-table", - "developer": "xkp24", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6375 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4912 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0921 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - 
"source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2592 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.382 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3686 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/xkp24/Llama-3-8B-Instruct-SPPO-Iter2_bt_8b-table/e4c32b92-46b4-431a-83f2-11499f587534.json b/data/hfopenllm_v2/xkp24/Llama-3-8B-Instruct-SPPO-Iter2_bt_8b-table/e4c32b92-46b4-431a-83f2-11499f587534.json deleted file mode 100644 index ea401b3b0..000000000 --- a/data/hfopenllm_v2/xkp24/Llama-3-8B-Instruct-SPPO-Iter2_bt_8b-table/e4c32b92-46b4-431a-83f2-11499f587534.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/xkp24_Llama-3-8B-Instruct-SPPO-Iter2_bt_8b-table/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-8B-Instruct-SPPO-Iter2_bt_8b-table", - "id": "xkp24/Llama-3-8B-Instruct-SPPO-Iter2_bt_8b-table", - "developer": "xkp24", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7275 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5057 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0846 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2601 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3819 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3697 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/xkp24/Llama-3-8B-Instruct-SPPO-Iter2_gp_2b-table/a05681a0-07e4-4206-ae89-dee4e9706467.json b/data/hfopenllm_v2/xkp24/Llama-3-8B-Instruct-SPPO-Iter2_gp_2b-table/a05681a0-07e4-4206-ae89-dee4e9706467.json deleted file mode 100644 index a2cc2910a..000000000 --- a/data/hfopenllm_v2/xkp24/Llama-3-8B-Instruct-SPPO-Iter2_gp_2b-table/a05681a0-07e4-4206-ae89-dee4e9706467.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/xkp24_Llama-3-8B-Instruct-SPPO-Iter2_gp_2b-table/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-8B-Instruct-SPPO-Iter2_gp_2b-table", - "id": "xkp24/Llama-3-8B-Instruct-SPPO-Iter2_gp_2b-table", - "developer": "xkp24", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6569 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4952 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0891 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 
0.2592 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3594 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3702 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/xkp24/Llama-3-8B-Instruct-SPPO-Iter2_gp_8b-table/b078f823-d603-4030-81a2-a3ca1a1117f9.json b/data/hfopenllm_v2/xkp24/Llama-3-8B-Instruct-SPPO-Iter2_gp_8b-table/b078f823-d603-4030-81a2-a3ca1a1117f9.json deleted file mode 100644 index 0aa1684a6..000000000 --- a/data/hfopenllm_v2/xkp24/Llama-3-8B-Instruct-SPPO-Iter2_gp_8b-table/b078f823-d603-4030-81a2-a3ca1a1117f9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/xkp24_Llama-3-8B-Instruct-SPPO-Iter2_gp_8b-table/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-8B-Instruct-SPPO-Iter2_gp_8b-table", - "id": "xkp24/Llama-3-8B-Instruct-SPPO-Iter2_gp_8b-table", - "developer": "xkp24", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6621 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5004 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0861 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2592 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - 
"hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3805 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.36 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/xkp24/Llama-3-8B-Instruct-SPPO-score-Iter2_bt_2b-table-0.001/26625158-6720-47c7-8c28-46ca7b4b947e.json b/data/hfopenllm_v2/xkp24/Llama-3-8B-Instruct-SPPO-score-Iter2_bt_2b-table-0.001/26625158-6720-47c7-8c28-46ca7b4b947e.json deleted file mode 100644 index 296a0d099..000000000 --- a/data/hfopenllm_v2/xkp24/Llama-3-8B-Instruct-SPPO-score-Iter2_bt_2b-table-0.001/26625158-6720-47c7-8c28-46ca7b4b947e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/xkp24_Llama-3-8B-Instruct-SPPO-score-Iter2_bt_2b-table-0.001/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-8B-Instruct-SPPO-score-Iter2_bt_2b-table-0.001", - "id": "xkp24/Llama-3-8B-Instruct-SPPO-score-Iter2_bt_2b-table-0.001", - "developer": "xkp24", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6042 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4936 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0997 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2592 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3793 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3708 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/xkp24/Llama-3-8B-Instruct-SPPO-score-Iter2_bt_8b-table-0.002/5e3e8dec-f14b-4b7a-ace1-1e1728395e84.json b/data/hfopenllm_v2/xkp24/Llama-3-8B-Instruct-SPPO-score-Iter2_bt_8b-table-0.002/5e3e8dec-f14b-4b7a-ace1-1e1728395e84.json deleted file mode 100644 index ca115a61a..000000000 --- a/data/hfopenllm_v2/xkp24/Llama-3-8B-Instruct-SPPO-score-Iter2_bt_8b-table-0.002/5e3e8dec-f14b-4b7a-ace1-1e1728395e84.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/xkp24_Llama-3-8B-Instruct-SPPO-score-Iter2_bt_8b-table-0.002/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-8B-Instruct-SPPO-score-Iter2_bt_8b-table-0.002", - "id": "xkp24/Llama-3-8B-Instruct-SPPO-score-Iter2_bt_8b-table-0.002", - "developer": "xkp24", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7132 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4996 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0853 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2584 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3872 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3664 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/xkp24/Llama-3-8B-Instruct-SPPO-score-Iter2_gp_2b-table-0.001/35b4378e-52cd-4ae1-985b-c8e2c00dc61a.json b/data/hfopenllm_v2/xkp24/Llama-3-8B-Instruct-SPPO-score-Iter2_gp_2b-table-0.001/35b4378e-52cd-4ae1-985b-c8e2c00dc61a.json deleted file mode 100644 index fbbc12e22..000000000 --- a/data/hfopenllm_v2/xkp24/Llama-3-8B-Instruct-SPPO-score-Iter2_gp_2b-table-0.001/35b4378e-52cd-4ae1-985b-c8e2c00dc61a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/xkp24_Llama-3-8B-Instruct-SPPO-score-Iter2_gp_2b-table-0.001/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-8B-Instruct-SPPO-score-Iter2_gp_2b-table-0.001", - "id": "xkp24/Llama-3-8B-Instruct-SPPO-score-Iter2_gp_2b-table-0.001", - "developer": "xkp24", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5947 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4899 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1073 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2592 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3581 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3704 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/xkp24/Llama-3-8B-Instruct-SPPO-score-Iter2_gp_8b-table-0.002/4d99a55e-39c0-41c7-9ef0-494f739ceaec.json b/data/hfopenllm_v2/xkp24/Llama-3-8B-Instruct-SPPO-score-Iter2_gp_8b-table-0.002/4d99a55e-39c0-41c7-9ef0-494f739ceaec.json deleted file mode 100644 index 60810c2c1..000000000 --- a/data/hfopenllm_v2/xkp24/Llama-3-8B-Instruct-SPPO-score-Iter2_gp_8b-table-0.002/4d99a55e-39c0-41c7-9ef0-494f739ceaec.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/xkp24_Llama-3-8B-Instruct-SPPO-score-Iter2_gp_8b-table-0.002/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-8B-Instruct-SPPO-score-Iter2_gp_8b-table-0.002", - "id": "xkp24/Llama-3-8B-Instruct-SPPO-score-Iter2_gp_8b-table-0.002", - "developer": "xkp24", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6453 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4951 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0937 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2601 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { 
- "score": 0.3939 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.353 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/xukp20/Llama-3-8B-Instruct-SPPO-Iter3_bt_2b-table/f3c7bacd-e231-45fd-b503-ee4d34caf4e8.json b/data/hfopenllm_v2/xukp20/Llama-3-8B-Instruct-SPPO-Iter3_bt_2b-table/f3c7bacd-e231-45fd-b503-ee4d34caf4e8.json deleted file mode 100644 index 9670d371e..000000000 --- a/data/hfopenllm_v2/xukp20/Llama-3-8B-Instruct-SPPO-Iter3_bt_2b-table/f3c7bacd-e231-45fd-b503-ee4d34caf4e8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/xukp20_Llama-3-8B-Instruct-SPPO-Iter3_bt_2b-table/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-8B-Instruct-SPPO-Iter3_bt_2b-table", - "id": "xukp20/Llama-3-8B-Instruct-SPPO-Iter3_bt_2b-table", - "developer": "xukp20", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5756 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4901 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0997 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2592 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.366 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": 
"hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3659 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/xukp20/Llama-3-8B-Instruct-SPPO-Iter3_bt_8b-table/1bb87d8f-2d66-42b2-a744-1a7cbc2c17dc.json b/data/hfopenllm_v2/xukp20/Llama-3-8B-Instruct-SPPO-Iter3_bt_8b-table/1bb87d8f-2d66-42b2-a744-1a7cbc2c17dc.json deleted file mode 100644 index 527102943..000000000 --- a/data/hfopenllm_v2/xukp20/Llama-3-8B-Instruct-SPPO-Iter3_bt_8b-table/1bb87d8f-2d66-42b2-a744-1a7cbc2c17dc.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/xukp20_Llama-3-8B-Instruct-SPPO-Iter3_bt_8b-table/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-8B-Instruct-SPPO-Iter3_bt_8b-table", - "id": "xukp20/Llama-3-8B-Instruct-SPPO-Iter3_bt_8b-table", - "developer": "xukp20", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7034 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5092 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0967 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2592 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3739 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3693 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/xukp20/Llama-3-8B-Instruct-SPPO-Iter3_gp_2b-table/ae10fd26-e648-4fa0-ae24-dfaaf4ff510d.json b/data/hfopenllm_v2/xukp20/Llama-3-8B-Instruct-SPPO-Iter3_gp_2b-table/ae10fd26-e648-4fa0-ae24-dfaaf4ff510d.json deleted file mode 100644 index dd8c0822b..000000000 --- a/data/hfopenllm_v2/xukp20/Llama-3-8B-Instruct-SPPO-Iter3_gp_2b-table/ae10fd26-e648-4fa0-ae24-dfaaf4ff510d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/xukp20_Llama-3-8B-Instruct-SPPO-Iter3_gp_2b-table/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-8B-Instruct-SPPO-Iter3_gp_2b-table", - "id": "xukp20/Llama-3-8B-Instruct-SPPO-Iter3_gp_2b-table", - "developer": "xukp20", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6024 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.497 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1042 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2592 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3674 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.3658 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/xukp20/Llama-3-8B-Instruct-SPPO-Iter3_gp_8b-table/0af58746-0492-4ba7-8a17-c0a5c43d0700.json b/data/hfopenllm_v2/xukp20/Llama-3-8B-Instruct-SPPO-Iter3_gp_8b-table/0af58746-0492-4ba7-8a17-c0a5c43d0700.json deleted file mode 100644 index 27ec692c2..000000000 --- a/data/hfopenllm_v2/xukp20/Llama-3-8B-Instruct-SPPO-Iter3_gp_8b-table/0af58746-0492-4ba7-8a17-c0a5c43d0700.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/xukp20_Llama-3-8B-Instruct-SPPO-Iter3_gp_8b-table/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-8B-Instruct-SPPO-Iter3_gp_8b-table", - "id": "xukp20/Llama-3-8B-Instruct-SPPO-Iter3_gp_8b-table", - "developer": "xukp20", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.662 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0937 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2592 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3818 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3615 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/xukp20/Llama-3-8B-Instruct-SPPO-score-Iter3_bt_2b-table-0.001/88fff9f5-7aa7-463a-87e0-5fd2f5bacf09.json b/data/hfopenllm_v2/xukp20/Llama-3-8B-Instruct-SPPO-score-Iter3_bt_2b-table-0.001/88fff9f5-7aa7-463a-87e0-5fd2f5bacf09.json deleted file mode 100644 index 0f5aeda25..000000000 --- a/data/hfopenllm_v2/xukp20/Llama-3-8B-Instruct-SPPO-score-Iter3_bt_2b-table-0.001/88fff9f5-7aa7-463a-87e0-5fd2f5bacf09.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/xukp20_Llama-3-8B-Instruct-SPPO-score-Iter3_bt_2b-table-0.001/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-8B-Instruct-SPPO-score-Iter3_bt_2b-table-0.001", - "id": "xukp20/Llama-3-8B-Instruct-SPPO-score-Iter3_bt_2b-table-0.001", - "developer": "xukp20", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5336 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4915 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0982 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2592 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.378 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3625 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/xukp20/Llama-3-8B-Instruct-SPPO-score-Iter3_bt_8b-table-0.002/bc79527d-ae58-4b17-afd8-df931562dbf3.json b/data/hfopenllm_v2/xukp20/Llama-3-8B-Instruct-SPPO-score-Iter3_bt_8b-table-0.002/bc79527d-ae58-4b17-afd8-df931562dbf3.json deleted file mode 100644 index 8d52c8dfa..000000000 --- a/data/hfopenllm_v2/xukp20/Llama-3-8B-Instruct-SPPO-score-Iter3_bt_8b-table-0.002/bc79527d-ae58-4b17-afd8-df931562dbf3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/xukp20_Llama-3-8B-Instruct-SPPO-score-Iter3_bt_8b-table-0.002/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-8B-Instruct-SPPO-score-Iter3_bt_8b-table-0.002", - "id": "xukp20/Llama-3-8B-Instruct-SPPO-score-Iter3_bt_8b-table-0.002", - "developer": "xukp20", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6852 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5075 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0718 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2584 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3832 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3621 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/xukp20/Llama-3-8B-Instruct-SPPO-score-Iter3_gp_2b-table-0.001/3e7423d5-ad7e-48e2-bd25-a4946d443c24.json b/data/hfopenllm_v2/xukp20/Llama-3-8B-Instruct-SPPO-score-Iter3_gp_2b-table-0.001/3e7423d5-ad7e-48e2-bd25-a4946d443c24.json deleted file mode 100644 index 922c5078c..000000000 --- a/data/hfopenllm_v2/xukp20/Llama-3-8B-Instruct-SPPO-score-Iter3_gp_2b-table-0.001/3e7423d5-ad7e-48e2-bd25-a4946d443c24.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/xukp20_Llama-3-8B-Instruct-SPPO-score-Iter3_gp_2b-table-0.001/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-8B-Instruct-SPPO-score-Iter3_gp_2b-table-0.001", - "id": "xukp20/Llama-3-8B-Instruct-SPPO-score-Iter3_gp_2b-table-0.001", - "developer": "xukp20", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5482 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4887 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0891 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2609 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3633 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3671 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/xukp20/llama-3-8b-instruct-sppo-iter1-gp-2b-tau01-table/7979fd6a-a886-41cc-987b-356b7c452bff.json b/data/hfopenllm_v2/xukp20/llama-3-8b-instruct-sppo-iter1-gp-2b-tau01-table/7979fd6a-a886-41cc-987b-356b7c452bff.json deleted file mode 100644 index 4feb8bd7f..000000000 --- a/data/hfopenllm_v2/xukp20/llama-3-8b-instruct-sppo-iter1-gp-2b-tau01-table/7979fd6a-a886-41cc-987b-356b7c452bff.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/xukp20_llama-3-8b-instruct-sppo-iter1-gp-2b-tau01-table/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "llama-3-8b-instruct-sppo-iter1-gp-2b-tau01-table", - "id": "xukp20/llama-3-8b-instruct-sppo-iter1-gp-2b-tau01-table", - "developer": "xukp20", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.69 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4978 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.105 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2592 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3673 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3716 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/xwen-team/Xwen-7B-Chat/2be6bc34-1e61-426f-b963-6e096b5418fb.json 
b/data/hfopenllm_v2/xwen-team/Xwen-7B-Chat/2be6bc34-1e61-426f-b963-6e096b5418fb.json deleted file mode 100644 index fb936d3e3..000000000 --- a/data/hfopenllm_v2/xwen-team/Xwen-7B-Chat/2be6bc34-1e61-426f-b963-6e096b5418fb.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/xwen-team_Xwen-7B-Chat/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Xwen-7B-Chat", - "id": "xwen-team/Xwen-7B-Chat", - "developer": "xwen-team", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 7.616 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6864 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5068 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4509 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2609 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3914 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.429 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/xxx777xxxASD/L3.1-ClaudeMaid-4x8B/c4f69339-be6b-4bb4-8faf-a1f40e73d4b0.json b/data/hfopenllm_v2/xxx777xxxASD/L3.1-ClaudeMaid-4x8B/c4f69339-be6b-4bb4-8faf-a1f40e73d4b0.json deleted file mode 100644 index e7afb7649..000000000 --- a/data/hfopenllm_v2/xxx777xxxASD/L3.1-ClaudeMaid-4x8B/c4f69339-be6b-4bb4-8faf-a1f40e73d4b0.json +++ /dev/null @@ -1,132 +0,0 @@ 
-{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/xxx777xxxASD_L3.1-ClaudeMaid-4x8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "L3.1-ClaudeMaid-4x8B", - "id": "xxx777xxxASD/L3.1-ClaudeMaid-4x8B", - "developer": "xxx777xxxASD", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MixtralForCausalLM", - "params_billions": 24.942 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6696 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5071 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1412 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2911 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4289 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.358 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/yam-peleg/Hebrew-Gemma-11B-Instruct/c845eb10-a028-4cc2-8f64-25d75480c0d5.json b/data/hfopenllm_v2/yam-peleg/Hebrew-Gemma-11B-Instruct/c845eb10-a028-4cc2-8f64-25d75480c0d5.json deleted file mode 100644 index cb76ed21b..000000000 --- a/data/hfopenllm_v2/yam-peleg/Hebrew-Gemma-11B-Instruct/c845eb10-a028-4cc2-8f64-25d75480c0d5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/yam-peleg_Hebrew-Gemma-11B-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open 
LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Hebrew-Gemma-11B-Instruct", - "id": "yam-peleg/Hebrew-Gemma-11B-Instruct", - "developer": "yam-peleg", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "GemmaForCausalLM", - "params_billions": 10.475 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3021 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4036 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0657 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.276 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4089 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2554 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/yam-peleg/Hebrew-Mistral-7B-200K/377e7223-4876-49b6-8057-b1831d7f129b.json b/data/hfopenllm_v2/yam-peleg/Hebrew-Mistral-7B-200K/377e7223-4876-49b6-8057-b1831d7f129b.json deleted file mode 100644 index 7e2ef6384..000000000 --- a/data/hfopenllm_v2/yam-peleg/Hebrew-Mistral-7B-200K/377e7223-4876-49b6-8057-b1831d7f129b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/yam-peleg_Hebrew-Mistral-7B-200K/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Hebrew-Mistral-7B-200K", - "id": 
"yam-peleg/Hebrew-Mistral-7B-200K", - "developer": "yam-peleg", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 7.504 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1856 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4149 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0234 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.276 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3765 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2573 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/yam-peleg/Hebrew-Mistral-7B-200K/4ddb9ed6-0599-482e-b12e-bcb01975cc85.json b/data/hfopenllm_v2/yam-peleg/Hebrew-Mistral-7B-200K/4ddb9ed6-0599-482e-b12e-bcb01975cc85.json deleted file mode 100644 index d83bce8f2..000000000 --- a/data/hfopenllm_v2/yam-peleg/Hebrew-Mistral-7B-200K/4ddb9ed6-0599-482e-b12e-bcb01975cc85.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/yam-peleg_Hebrew-Mistral-7B-200K/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Hebrew-Mistral-7B-200K", - "id": "yam-peleg/Hebrew-Mistral-7B-200K", - "developer": "yam-peleg", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.504 - } - }, - 
"evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.177 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3411 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.031 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2534 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.374 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2529 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/yam-peleg/Hebrew-Mistral-7B/9d5af106-be69-4b62-99c1-fcfb6091d080.json b/data/hfopenllm_v2/yam-peleg/Hebrew-Mistral-7B/9d5af106-be69-4b62-99c1-fcfb6091d080.json deleted file mode 100644 index 93e4556ea..000000000 --- a/data/hfopenllm_v2/yam-peleg/Hebrew-Mistral-7B/9d5af106-be69-4b62-99c1-fcfb6091d080.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/yam-peleg_Hebrew-Mistral-7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Hebrew-Mistral-7B", - "id": "yam-peleg/Hebrew-Mistral-7B", - "developer": "yam-peleg", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MistralForCausalLM", - "params_billions": 7.504 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2328 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4334 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0498 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2794 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3977 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.278 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/yanng1242/Marcoro14-7B-slerp/2f2d7a55-2838-446d-9487-a6cfa0c03356.json b/data/hfopenllm_v2/yanng1242/Marcoro14-7B-slerp/2f2d7a55-2838-446d-9487-a6cfa0c03356.json deleted file mode 100644 index 348a5fe37..000000000 --- a/data/hfopenllm_v2/yanng1242/Marcoro14-7B-slerp/2f2d7a55-2838-446d-9487-a6cfa0c03356.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/yanng1242_Marcoro14-7B-slerp/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Marcoro14-7B-slerp", - "id": "yanng1242/Marcoro14-7B-slerp", - "developer": "yanng1242", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MistralForCausalLM", - "params_billions": 7.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.406 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", 
- "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5252 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0748 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3146 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4686 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3168 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/yasserrmd/Coder-GRPO-3B/65d20d45-f63b-4b09-b66d-5f53297c0c20.json b/data/hfopenllm_v2/yasserrmd/Coder-GRPO-3B/65d20d45-f63b-4b09-b66d-5f53297c0c20.json deleted file mode 100644 index f3af930c7..000000000 --- a/data/hfopenllm_v2/yasserrmd/Coder-GRPO-3B/65d20d45-f63b-4b09-b66d-5f53297c0c20.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/yasserrmd_Coder-GRPO-3B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Coder-GRPO-3B", - "id": "yasserrmd/Coder-GRPO-3B", - "developer": "yasserrmd", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 3.086 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6208 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4469 - } - }, - { - "evaluation_name": "MATH 
Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3202 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2777 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4115 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3197 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/yasserrmd/Text2SQL-1.5B/4712953f-0777-4b97-8f13-f7309f19f0dc.json b/data/hfopenllm_v2/yasserrmd/Text2SQL-1.5B/4712953f-0777-4b97-8f13-f7309f19f0dc.json deleted file mode 100644 index eca8489e8..000000000 --- a/data/hfopenllm_v2/yasserrmd/Text2SQL-1.5B/4712953f-0777-4b97-8f13-f7309f19f0dc.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/yasserrmd_Text2SQL-1.5B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Text2SQL-1.5B", - "id": "yasserrmd/Text2SQL-1.5B", - "developer": "yasserrmd", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.544 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2857 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3858 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.068 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2878 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3942 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2363 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ycros/BagelMIsteryTour-v2-8x7B/84382308-04b5-439f-b486-b26d20da605a.json b/data/hfopenllm_v2/ycros/BagelMIsteryTour-v2-8x7B/84382308-04b5-439f-b486-b26d20da605a.json deleted file mode 100644 index e45b7fe88..000000000 --- a/data/hfopenllm_v2/ycros/BagelMIsteryTour-v2-8x7B/84382308-04b5-439f-b486-b26d20da605a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ycros_BagelMIsteryTour-v2-8x7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "BagelMIsteryTour-v2-8x7B", - "id": "ycros/BagelMIsteryTour-v2-8x7B", - "developer": "ycros", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MixtralForCausalLM", - "params_billions": 46.703 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6262 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5142 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0937 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": 
"Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3079 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4138 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3481 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ycros/BagelMIsteryTour-v2-8x7B/e82be06f-14ed-45e8-a273-d28c50f5212b.json b/data/hfopenllm_v2/ycros/BagelMIsteryTour-v2-8x7B/e82be06f-14ed-45e8-a273-d28c50f5212b.json deleted file mode 100644 index 7ac95c1dd..000000000 --- a/data/hfopenllm_v2/ycros/BagelMIsteryTour-v2-8x7B/e82be06f-14ed-45e8-a273-d28c50f5212b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ycros_BagelMIsteryTour-v2-8x7B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "BagelMIsteryTour-v2-8x7B", - "id": "ycros/BagelMIsteryTour-v2-8x7B", - "developer": "ycros", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "MixtralForCausalLM", - "params_billions": 46.703 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5994 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5159 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0785 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3045 - } - 
}, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4203 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3473 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/yfzp/Llama-3-8B-Instruct-SPPO-Iter1_bt_2b-table/5815ba55-40fc-4f8e-ae0b-b329c42fd503.json b/data/hfopenllm_v2/yfzp/Llama-3-8B-Instruct-SPPO-Iter1_bt_2b-table/5815ba55-40fc-4f8e-ae0b-b329c42fd503.json deleted file mode 100644 index fd37fa6d3..000000000 --- a/data/hfopenllm_v2/yfzp/Llama-3-8B-Instruct-SPPO-Iter1_bt_2b-table/5815ba55-40fc-4f8e-ae0b-b329c42fd503.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/yfzp_Llama-3-8B-Instruct-SPPO-Iter1_bt_2b-table/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-8B-Instruct-SPPO-Iter1_bt_2b-table", - "id": "yfzp/Llama-3-8B-Instruct-SPPO-Iter1_bt_2b-table", - "developer": "yfzp", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6709 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4987 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1118 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2592 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" 
- }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3727 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3716 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/yfzp/Llama-3-8B-Instruct-SPPO-Iter1_bt_8b-table/e58eceb3-b501-4924-9d0d-98d7da3c16c5.json b/data/hfopenllm_v2/yfzp/Llama-3-8B-Instruct-SPPO-Iter1_bt_8b-table/e58eceb3-b501-4924-9d0d-98d7da3c16c5.json deleted file mode 100644 index 3a674e52d..000000000 --- a/data/hfopenllm_v2/yfzp/Llama-3-8B-Instruct-SPPO-Iter1_bt_8b-table/e58eceb3-b501-4924-9d0d-98d7da3c16c5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/yfzp_Llama-3-8B-Instruct-SPPO-Iter1_bt_8b-table/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-8B-Instruct-SPPO-Iter1_bt_8b-table", - "id": "yfzp/Llama-3-8B-Instruct-SPPO-Iter1_bt_8b-table", - "developer": "yfzp", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7333 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.508 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1035 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2601 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3806 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3748 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/yfzp/Llama-3-8B-Instruct-SPPO-Iter1_gp_2b-table/5a88455c-7699-4c49-8a12-76cda15d878c.json b/data/hfopenllm_v2/yfzp/Llama-3-8B-Instruct-SPPO-Iter1_gp_2b-table/5a88455c-7699-4c49-8a12-76cda15d878c.json deleted file mode 100644 index 6b39085c6..000000000 --- a/data/hfopenllm_v2/yfzp/Llama-3-8B-Instruct-SPPO-Iter1_gp_2b-table/5a88455c-7699-4c49-8a12-76cda15d878c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/yfzp_Llama-3-8B-Instruct-SPPO-Iter1_gp_2b-table/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-8B-Instruct-SPPO-Iter1_gp_2b-table", - "id": "yfzp/Llama-3-8B-Instruct-SPPO-Iter1_gp_2b-table", - "developer": "yfzp", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6785 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4941 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1125 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2592 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3647 - } - }, - { - "evaluation_name": "MMLU-PRO", - 
"source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3718 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/yfzp/Llama-3-8B-Instruct-SPPO-Iter1_gp_8b-table/122b4c1e-6e6c-4db5-8991-b091361c3ecf.json b/data/hfopenllm_v2/yfzp/Llama-3-8B-Instruct-SPPO-Iter1_gp_8b-table/122b4c1e-6e6c-4db5-8991-b091361c3ecf.json deleted file mode 100644 index f983fdb6a..000000000 --- a/data/hfopenllm_v2/yfzp/Llama-3-8B-Instruct-SPPO-Iter1_gp_8b-table/122b4c1e-6e6c-4db5-8991-b091361c3ecf.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/yfzp_Llama-3-8B-Instruct-SPPO-Iter1_gp_8b-table/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-8B-Instruct-SPPO-Iter1_gp_8b-table", - "id": "yfzp/Llama-3-8B-Instruct-SPPO-Iter1_gp_8b-table", - "developer": "yfzp", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7132 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5025 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0989 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2592 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3713 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3683 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/yfzp/Llama-3-8B-Instruct-SPPO-score-Iter1_bt_2b-table-0.001/6abeb0e4-32ee-4dbb-9902-b19cc96a2aa7.json b/data/hfopenllm_v2/yfzp/Llama-3-8B-Instruct-SPPO-score-Iter1_bt_2b-table-0.001/6abeb0e4-32ee-4dbb-9902-b19cc96a2aa7.json deleted file mode 100644 index 269e54be1..000000000 --- a/data/hfopenllm_v2/yfzp/Llama-3-8B-Instruct-SPPO-score-Iter1_bt_2b-table-0.001/6abeb0e4-32ee-4dbb-9902-b19cc96a2aa7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/yfzp_Llama-3-8B-Instruct-SPPO-score-Iter1_bt_2b-table-0.001/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-8B-Instruct-SPPO-score-Iter1_bt_2b-table-0.001", - "id": "yfzp/Llama-3-8B-Instruct-SPPO-score-Iter1_bt_2b-table-0.001", - "developer": "yfzp", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6496 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4979 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1012 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2592 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.378 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.372 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/yfzp/Llama-3-8B-Instruct-SPPO-score-Iter1_bt_8b-table-0.002/679f214f-e03f-47a9-8a11-91adbf1c4880.json b/data/hfopenllm_v2/yfzp/Llama-3-8B-Instruct-SPPO-score-Iter1_bt_8b-table-0.002/679f214f-e03f-47a9-8a11-91adbf1c4880.json deleted file mode 100644 index 7cceda8ce..000000000 --- a/data/hfopenllm_v2/yfzp/Llama-3-8B-Instruct-SPPO-score-Iter1_bt_8b-table-0.002/679f214f-e03f-47a9-8a11-91adbf1c4880.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/yfzp_Llama-3-8B-Instruct-SPPO-score-Iter1_bt_8b-table-0.002/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-8B-Instruct-SPPO-score-Iter1_bt_8b-table-0.002", - "id": "yfzp/Llama-3-8B-Instruct-SPPO-score-Iter1_bt_8b-table-0.002", - "developer": "yfzp", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7196 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5045 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0876 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2601 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3831 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3734 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/yfzp/Llama-3-8B-Instruct-SPPO-score-Iter1_gp_2b-table-0.001/680e77b8-9c64-4c52-aa83-55236039cef1.json b/data/hfopenllm_v2/yfzp/Llama-3-8B-Instruct-SPPO-score-Iter1_gp_2b-table-0.001/680e77b8-9c64-4c52-aa83-55236039cef1.json deleted file mode 100644 index 78466338e..000000000 --- a/data/hfopenllm_v2/yfzp/Llama-3-8B-Instruct-SPPO-score-Iter1_gp_2b-table-0.001/680e77b8-9c64-4c52-aa83-55236039cef1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/yfzp_Llama-3-8B-Instruct-SPPO-score-Iter1_gp_2b-table-0.001/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-8B-Instruct-SPPO-score-Iter1_gp_2b-table-0.001", - "id": "yfzp/Llama-3-8B-Instruct-SPPO-score-Iter1_gp_2b-table-0.001", - "developer": "yfzp", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6504 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4958 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0937 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2592 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.366 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.3703 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/yfzp/Llama-3-8B-Instruct-SPPO-score-Iter1_gp_8b-table-0.002/c24c471c-14b3-462e-8b81-6548b27e5ffc.json b/data/hfopenllm_v2/yfzp/Llama-3-8B-Instruct-SPPO-score-Iter1_gp_8b-table-0.002/c24c471c-14b3-462e-8b81-6548b27e5ffc.json deleted file mode 100644 index 3d58afca0..000000000 --- a/data/hfopenllm_v2/yfzp/Llama-3-8B-Instruct-SPPO-score-Iter1_gp_8b-table-0.002/c24c471c-14b3-462e-8b81-6548b27e5ffc.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/yfzp_Llama-3-8B-Instruct-SPPO-score-Iter1_gp_8b-table-0.002/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-8B-Instruct-SPPO-score-Iter1_gp_8b-table-0.002", - "id": "yfzp/Llama-3-8B-Instruct-SPPO-score-Iter1_gp_8b-table-0.002", - "developer": "yfzp", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7016 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4992 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0869 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2592 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3779 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3669 - } - } - ] -} \ No newline at end of file diff 
--git a/data/hfopenllm_v2/yifAI/Llama-3-8B-Instruct-SPPO-score-Iter3_gp_8b-table-0.002/efa7fa62-2e8b-403c-b345-eef876b48dbd.json b/data/hfopenllm_v2/yifAI/Llama-3-8B-Instruct-SPPO-score-Iter3_gp_8b-table-0.002/efa7fa62-2e8b-403c-b345-eef876b48dbd.json deleted file mode 100644 index 183ccc754..000000000 --- a/data/hfopenllm_v2/yifAI/Llama-3-8B-Instruct-SPPO-score-Iter3_gp_8b-table-0.002/efa7fa62-2e8b-403c-b345-eef876b48dbd.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/yifAI_Llama-3-8B-Instruct-SPPO-score-Iter3_gp_8b-table-0.002/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3-8B-Instruct-SPPO-score-Iter3_gp_8b-table-0.002", - "id": "yifAI/Llama-3-8B-Instruct-SPPO-score-Iter3_gp_8b-table-0.002", - "developer": "yifAI", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.649 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4915 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0755 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2617 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3899 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.352 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/ylalain/ECE-PRYMMAL-YL-1B-SLERP-V8/40bae762-65bd-4b4c-b422-ffd0fd3790a9.json b/data/hfopenllm_v2/ylalain/ECE-PRYMMAL-YL-1B-SLERP-V8/40bae762-65bd-4b4c-b422-ffd0fd3790a9.json deleted file mode 100644 index 1a855481e..000000000 --- a/data/hfopenllm_v2/ylalain/ECE-PRYMMAL-YL-1B-SLERP-V8/40bae762-65bd-4b4c-b422-ffd0fd3790a9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ylalain_ECE-PRYMMAL-YL-1B-SLERP-V8/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ECE-PRYMMAL-YL-1B-SLERP-V8", - "id": "ylalain/ECE-PRYMMAL-YL-1B-SLERP-V8", - "developer": "ylalain", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 1.357 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1505 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3976 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0045 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2894 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3875 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2384 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ymcki/Llama-3.1-8B-GRPO-Instruct/596957cc-719c-44c7-8284-06a9ba0d1a30.json b/data/hfopenllm_v2/ymcki/Llama-3.1-8B-GRPO-Instruct/596957cc-719c-44c7-8284-06a9ba0d1a30.json deleted file mode 100644 
index a68709211..000000000 --- a/data/hfopenllm_v2/ymcki/Llama-3.1-8B-GRPO-Instruct/596957cc-719c-44c7-8284-06a9ba0d1a30.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ymcki_Llama-3.1-8B-GRPO-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.1-8B-GRPO-Instruct", - "id": "ymcki/Llama-3.1-8B-GRPO-Instruct", - "developer": "ymcki", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7445 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5132 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2024 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2945 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3817 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3738 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ymcki/Llama-3.1-8B-SFT-GRPO-Instruct/706bbc09-f867-4327-bc4d-b5ede41ebd93.json b/data/hfopenllm_v2/ymcki/Llama-3.1-8B-SFT-GRPO-Instruct/706bbc09-f867-4327-bc4d-b5ede41ebd93.json deleted file mode 100644 index 3032b3ce7..000000000 --- a/data/hfopenllm_v2/ymcki/Llama-3.1-8B-SFT-GRPO-Instruct/706bbc09-f867-4327-bc4d-b5ede41ebd93.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/ymcki_Llama-3.1-8B-SFT-GRPO-Instruct/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama-3.1-8B-SFT-GRPO-Instruct", - "id": "ymcki/Llama-3.1-8B-SFT-GRPO-Instruct", - "developer": "ymcki", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3354 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3126 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.04 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2534 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3526 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1098 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ymcki/gemma-2-2b-ORPO-jpn-it-abliterated-18-merge/8962e9be-75bf-4f57-8ce2-b29523740851.json b/data/hfopenllm_v2/ymcki/gemma-2-2b-ORPO-jpn-it-abliterated-18-merge/8962e9be-75bf-4f57-8ce2-b29523740851.json deleted file mode 100644 index 58ab0bf05..000000000 --- a/data/hfopenllm_v2/ymcki/gemma-2-2b-ORPO-jpn-it-abliterated-18-merge/8962e9be-75bf-4f57-8ce2-b29523740851.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ymcki_gemma-2-2b-ORPO-jpn-it-abliterated-18-merge/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": 
"HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "gemma-2-2b-ORPO-jpn-it-abliterated-18-merge", - "id": "ymcki/gemma-2-2b-ORPO-jpn-it-abliterated-18-merge", - "developer": "ymcki", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 2.614 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5218 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4147 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0544 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2836 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3514 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2461 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ymcki/gemma-2-2b-ORPO-jpn-it-abliterated-18/014f4838-22ff-4802-a887-4d2de01a9256.json b/data/hfopenllm_v2/ymcki/gemma-2-2b-ORPO-jpn-it-abliterated-18/014f4838-22ff-4802-a887-4d2de01a9256.json deleted file mode 100644 index ff7346956..000000000 --- a/data/hfopenllm_v2/ymcki/gemma-2-2b-ORPO-jpn-it-abliterated-18/014f4838-22ff-4802-a887-4d2de01a9256.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ymcki_gemma-2-2b-ORPO-jpn-it-abliterated-18/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - 
"model_info": { - "name": "gemma-2-2b-ORPO-jpn-it-abliterated-18", - "id": "ymcki/gemma-2-2b-ORPO-jpn-it-abliterated-18", - "developer": "ymcki", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 2.614 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4631 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4053 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0431 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2886 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3754 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2345 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ymcki/gemma-2-2b-jpn-it-abliterated-17-18-24/5c6eac9c-0ec6-4364-a86b-dcd894d69f0b.json b/data/hfopenllm_v2/ymcki/gemma-2-2b-jpn-it-abliterated-17-18-24/5c6eac9c-0ec6-4364-a86b-dcd894d69f0b.json deleted file mode 100644 index 758147784..000000000 --- a/data/hfopenllm_v2/ymcki/gemma-2-2b-jpn-it-abliterated-17-18-24/5c6eac9c-0ec6-4364-a86b-dcd894d69f0b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ymcki_gemma-2-2b-jpn-it-abliterated-17-18-24/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "gemma-2-2b-jpn-it-abliterated-17-18-24", - "id": "ymcki/gemma-2-2b-jpn-it-abliterated-17-18-24", - "developer": "ymcki", - 
"inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 2.614 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5055 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3812 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0257 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.281 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3502 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2282 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ymcki/gemma-2-2b-jpn-it-abliterated-17-ORPO-alpaca/09b81cf2-3b79-448c-ab8e-87e378c804bb.json b/data/hfopenllm_v2/ymcki/gemma-2-2b-jpn-it-abliterated-17-ORPO-alpaca/09b81cf2-3b79-448c-ab8e-87e378c804bb.json deleted file mode 100644 index d9039ff8d..000000000 --- a/data/hfopenllm_v2/ymcki/gemma-2-2b-jpn-it-abliterated-17-ORPO-alpaca/09b81cf2-3b79-448c-ab8e-87e378c804bb.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ymcki_gemma-2-2b-jpn-it-abliterated-17-ORPO-alpaca/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "gemma-2-2b-jpn-it-abliterated-17-ORPO-alpaca", - "id": "ymcki/gemma-2-2b-jpn-it-abliterated-17-ORPO-alpaca", - "developer": "ymcki", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": 
"Gemma2ForCausalLM", - "params_billions": 2.614 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3065 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4072 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0325 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2693 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3969 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2249 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ymcki/gemma-2-2b-jpn-it-abliterated-17-ORPO/28b9977a-db3d-4f38-b1f7-bd0cdcab5504.json b/data/hfopenllm_v2/ymcki/gemma-2-2b-jpn-it-abliterated-17-ORPO/28b9977a-db3d-4f38-b1f7-bd0cdcab5504.json deleted file mode 100644 index 89ebbecdf..000000000 --- a/data/hfopenllm_v2/ymcki/gemma-2-2b-jpn-it-abliterated-17-ORPO/28b9977a-db3d-4f38-b1f7-bd0cdcab5504.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ymcki_gemma-2-2b-jpn-it-abliterated-17-ORPO/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "gemma-2-2b-jpn-it-abliterated-17-ORPO", - "id": "ymcki/gemma-2-2b-jpn-it-abliterated-17-ORPO", - "developer": "ymcki", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 2.614 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": 
"IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4748 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3898 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0619 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2743 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3768 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2191 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ymcki/gemma-2-2b-jpn-it-abliterated-17/845ea162-cfa1-47f4-8914-d81d9bf1bb7d.json b/data/hfopenllm_v2/ymcki/gemma-2-2b-jpn-it-abliterated-17/845ea162-cfa1-47f4-8914-d81d9bf1bb7d.json deleted file mode 100644 index b0dc22bac..000000000 --- a/data/hfopenllm_v2/ymcki/gemma-2-2b-jpn-it-abliterated-17/845ea162-cfa1-47f4-8914-d81d9bf1bb7d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ymcki_gemma-2-2b-jpn-it-abliterated-17/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "gemma-2-2b-jpn-it-abliterated-17", - "id": "ymcki/gemma-2-2b-jpn-it-abliterated-17", - "developer": "ymcki", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 2.614 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5082 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4076 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0385 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2718 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3701 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2455 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ymcki/gemma-2-2b-jpn-it-abliterated-18-ORPO/706737c7-cd1a-4958-9ffc-2655f0b50178.json b/data/hfopenllm_v2/ymcki/gemma-2-2b-jpn-it-abliterated-18-ORPO/706737c7-cd1a-4958-9ffc-2655f0b50178.json deleted file mode 100644 index 1d4acd4a2..000000000 --- a/data/hfopenllm_v2/ymcki/gemma-2-2b-jpn-it-abliterated-18-ORPO/706737c7-cd1a-4958-9ffc-2655f0b50178.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ymcki_gemma-2-2b-jpn-it-abliterated-18-ORPO/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "gemma-2-2b-jpn-it-abliterated-18-ORPO", - "id": "ymcki/gemma-2-2b-jpn-it-abliterated-18-ORPO", - "developer": "ymcki", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 2.614 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4742 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - 
"dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4039 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0468 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2617 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3953 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2185 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ymcki/gemma-2-2b-jpn-it-abliterated-18/5acd58cd-8dfb-4fb7-8832-6bc151e0b1a1.json b/data/hfopenllm_v2/ymcki/gemma-2-2b-jpn-it-abliterated-18/5acd58cd-8dfb-4fb7-8832-6bc151e0b1a1.json deleted file mode 100644 index 902673fd5..000000000 --- a/data/hfopenllm_v2/ymcki/gemma-2-2b-jpn-it-abliterated-18/5acd58cd-8dfb-4fb7-8832-6bc151e0b1a1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ymcki_gemma-2-2b-jpn-it-abliterated-18/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "gemma-2-2b-jpn-it-abliterated-18", - "id": "ymcki/gemma-2-2b-jpn-it-abliterated-18", - "developer": "ymcki", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 2.614 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5175 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4132 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0446 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2735 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3742 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2505 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/ymcki/gemma-2-2b-jpn-it-abliterated-24/d374a68d-b985-47c2-b087-500bffa93c80.json b/data/hfopenllm_v2/ymcki/gemma-2-2b-jpn-it-abliterated-24/d374a68d-b985-47c2-b087-500bffa93c80.json deleted file mode 100644 index 7ae15b60f..000000000 --- a/data/hfopenllm_v2/ymcki/gemma-2-2b-jpn-it-abliterated-24/d374a68d-b985-47c2-b087-500bffa93c80.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/ymcki_gemma-2-2b-jpn-it-abliterated-24/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "gemma-2-2b-jpn-it-abliterated-24", - "id": "ymcki/gemma-2-2b-jpn-it-abliterated-24", - "developer": "ymcki", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 2.614 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4979 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.411 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - 
"dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0438 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2777 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3915 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2473 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/yuchenxie/ArlowGPT-3B-Multilingual/23fbceb0-b646-4945-b17f-66dde24a0e43.json b/data/hfopenllm_v2/yuchenxie/ArlowGPT-3B-Multilingual/23fbceb0-b646-4945-b17f-66dde24a0e43.json deleted file mode 100644 index 266d9bf50..000000000 --- a/data/hfopenllm_v2/yuchenxie/ArlowGPT-3B-Multilingual/23fbceb0-b646-4945-b17f-66dde24a0e43.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/yuchenxie_ArlowGPT-3B-Multilingual/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ArlowGPT-3B-Multilingual", - "id": "yuchenxie/ArlowGPT-3B-Multilingual", - "developer": "yuchenxie", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 3.213 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6395 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4301 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": 
false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1125 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2802 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3727 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2817 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/yuchenxie/ArlowGPT-8B/73d9e204-e829-4159-b340-6d9581c6f0e1.json b/data/hfopenllm_v2/yuchenxie/ArlowGPT-8B/73d9e204-e829-4159-b340-6d9581c6f0e1.json deleted file mode 100644 index 4cc6ee856..000000000 --- a/data/hfopenllm_v2/yuchenxie/ArlowGPT-8B/73d9e204-e829-4159-b340-6d9581c6f0e1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/yuchenxie_ArlowGPT-8B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ArlowGPT-8B", - "id": "yuchenxie/ArlowGPT-8B", - "developer": "yuchenxie", - "inference_platform": "unknown", - "additional_details": { - "precision": "float16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7847 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.508 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2039 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2936 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3882 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3787 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/yuvraj17/Llama3-8B-SuperNova-Spectrum-Hermes-DPO/a6979dda-fba6-4104-b153-3b0a89de8585.json b/data/hfopenllm_v2/yuvraj17/Llama3-8B-SuperNova-Spectrum-Hermes-DPO/a6979dda-fba6-4104-b153-3b0a89de8585.json deleted file mode 100644 index 9cefceabb..000000000 --- a/data/hfopenllm_v2/yuvraj17/Llama3-8B-SuperNova-Spectrum-Hermes-DPO/a6979dda-fba6-4104-b153-3b0a89de8585.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/yuvraj17_Llama3-8B-SuperNova-Spectrum-Hermes-DPO/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama3-8B-SuperNova-Spectrum-Hermes-DPO", - "id": "yuvraj17/Llama3-8B-SuperNova-Spectrum-Hermes-DPO", - "developer": "yuvraj17", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4691 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.44 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0566 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.302 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4012 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2635 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/yuvraj17/Llama3-8B-SuperNova-Spectrum-dare_ties/62e04968-0c5c-4aad-a434-d9d24bccbdb8.json b/data/hfopenllm_v2/yuvraj17/Llama3-8B-SuperNova-Spectrum-dare_ties/62e04968-0c5c-4aad-a434-d9d24bccbdb8.json deleted file mode 100644 index 76030c29d..000000000 --- a/data/hfopenllm_v2/yuvraj17/Llama3-8B-SuperNova-Spectrum-dare_ties/62e04968-0c5c-4aad-a434-d9d24bccbdb8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/yuvraj17_Llama3-8B-SuperNova-Spectrum-dare_ties/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama3-8B-SuperNova-Spectrum-dare_ties", - "id": "yuvraj17/Llama3-8B-SuperNova-Spectrum-dare_ties", - "developer": "yuvraj17", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4013 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4616 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0846 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2752 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": 
"MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4211 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3574 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/yuvraj17/Llama3-8B-abliterated-Spectrum-slerp/bae4064e-b10f-4082-876d-e4168ca1a8cc.json b/data/hfopenllm_v2/yuvraj17/Llama3-8B-abliterated-Spectrum-slerp/bae4064e-b10f-4082-876d-e4168ca1a8cc.json deleted file mode 100644 index 95dd6c5ed..000000000 --- a/data/hfopenllm_v2/yuvraj17/Llama3-8B-abliterated-Spectrum-slerp/bae4064e-b10f-4082-876d-e4168ca1a8cc.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/yuvraj17_Llama3-8B-abliterated-Spectrum-slerp/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Llama3-8B-abliterated-Spectrum-slerp", - "id": "yuvraj17/Llama3-8B-abliterated-Spectrum-slerp", - "developer": "yuvraj17", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "LlamaForCausalLM", - "params_billions": 8.03 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2885 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4978 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0604 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3012 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3998 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3257 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/zake7749/gemma-2-2b-it-chinese-kyara-dpo/0040b48c-0f54-4c9b-97ee-1ca833c68e36.json b/data/hfopenllm_v2/zake7749/gemma-2-2b-it-chinese-kyara-dpo/0040b48c-0f54-4c9b-97ee-1ca833c68e36.json deleted file mode 100644 index c07d9eb08..000000000 --- a/data/hfopenllm_v2/zake7749/gemma-2-2b-it-chinese-kyara-dpo/0040b48c-0f54-4c9b-97ee-1ca833c68e36.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/zake7749_gemma-2-2b-it-chinese-kyara-dpo/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "gemma-2-2b-it-chinese-kyara-dpo", - "id": "zake7749/gemma-2-2b-it-chinese-kyara-dpo", - "developer": "zake7749", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 2.614 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5382 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4257 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0838 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2668 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4576 - } - }, - { - "evaluation_name": 
"MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2573 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/zake7749/gemma-2-9b-it-chinese-kyara/6050e969-bcde-4594-8e53-05fa74c7287d.json b/data/hfopenllm_v2/zake7749/gemma-2-9b-it-chinese-kyara/6050e969-bcde-4594-8e53-05fa74c7287d.json deleted file mode 100644 index 0bf944e0e..000000000 --- a/data/hfopenllm_v2/zake7749/gemma-2-9b-it-chinese-kyara/6050e969-bcde-4594-8e53-05fa74c7287d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/zake7749_gemma-2-9b-it-chinese-kyara/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "gemma-2-9b-it-chinese-kyara", - "id": "zake7749/gemma-2-9b-it-chinese-kyara", - "developer": "zake7749", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 9.242 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1764 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5954 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.105 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3381 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4242 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4179 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/zelk12/Gemma-2-TM-9B/3aaee358-bf3e-4d91-91bf-bd42e0a7c61e.json b/data/hfopenllm_v2/zelk12/Gemma-2-TM-9B/3aaee358-bf3e-4d91-91bf-bd42e0a7c61e.json deleted file mode 100644 index c972fae87..000000000 --- a/data/hfopenllm_v2/zelk12/Gemma-2-TM-9B/3aaee358-bf3e-4d91-91bf-bd42e0a7c61e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/zelk12_Gemma-2-TM-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Gemma-2-TM-9B", - "id": "zelk12/Gemma-2-TM-9B", - "developer": "zelk12", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8045 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5987 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2024 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3465 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4152 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4088 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/zelk12/MT-Gen1-gemma-2-9B/ef5f4fb2-f409-49dc-b3f0-f3e19585cd8a.json 
b/data/hfopenllm_v2/zelk12/MT-Gen1-gemma-2-9B/ef5f4fb2-f409-49dc-b3f0-f3e19585cd8a.json deleted file mode 100644 index 14766f899..000000000 --- a/data/hfopenllm_v2/zelk12/MT-Gen1-gemma-2-9B/ef5f4fb2-f409-49dc-b3f0-f3e19585cd8a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/zelk12_MT-Gen1-gemma-2-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MT-Gen1-gemma-2-9B", - "id": "zelk12/MT-Gen1-gemma-2-9B", - "developer": "zelk12", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7886 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.61 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2221 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3465 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4217 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4381 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/zelk12/MT-Gen2-GI-gemma-2-9B/4048fa60-7427-4f7e-9939-e270aa5e8b51.json b/data/hfopenllm_v2/zelk12/MT-Gen2-GI-gemma-2-9B/4048fa60-7427-4f7e-9939-e270aa5e8b51.json deleted file mode 100644 index aab0e6abb..000000000 --- a/data/hfopenllm_v2/zelk12/MT-Gen2-GI-gemma-2-9B/4048fa60-7427-4f7e-9939-e270aa5e8b51.json +++ /dev/null @@ -1,132 +0,0 
@@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/zelk12_MT-Gen2-GI-gemma-2-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MT-Gen2-GI-gemma-2-9B", - "id": "zelk12/MT-Gen2-GI-gemma-2-9B", - "developer": "zelk12", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7914 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6096 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2205 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3507 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4283 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4356 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/zelk12/MT-Gen2-gemma-2-9B/f5c9baea-f2cf-414a-937a-6a43f55a1c1d.json b/data/hfopenllm_v2/zelk12/MT-Gen2-gemma-2-9B/f5c9baea-f2cf-414a-937a-6a43f55a1c1d.json deleted file mode 100644 index 1b3fd4238..000000000 --- a/data/hfopenllm_v2/zelk12/MT-Gen2-gemma-2-9B/f5c9baea-f2cf-414a-937a-6a43f55a1c1d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/zelk12_MT-Gen2-gemma-2-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - 
"source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MT-Gen2-gemma-2-9B", - "id": "zelk12/MT-Gen2-gemma-2-9B", - "developer": "zelk12", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7907 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.61 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.219 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3465 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4323 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4387 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/zelk12/MT-Gen3-gemma-2-9B/1da70796-d40b-4f2a-8ce3-b304f414a6d5.json b/data/hfopenllm_v2/zelk12/MT-Gen3-gemma-2-9B/1da70796-d40b-4f2a-8ce3-b304f414a6d5.json deleted file mode 100644 index 9a41654cb..000000000 --- a/data/hfopenllm_v2/zelk12/MT-Gen3-gemma-2-9B/1da70796-d40b-4f2a-8ce3-b304f414a6d5.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/zelk12_MT-Gen3-gemma-2-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MT-Gen3-gemma-2-9B", - "id": "zelk12/MT-Gen3-gemma-2-9B", - "developer": "zelk12", - "inference_platform": "unknown", - "additional_details": { - 
"precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.802 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6097 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2296 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.349 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4217 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4356 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/zelk12/MT-Gen4-gemma-2-9B/de476f79-2539-4f9e-a1d2-901c6c4342d4.json b/data/hfopenllm_v2/zelk12/MT-Gen4-gemma-2-9B/de476f79-2539-4f9e-a1d2-901c6c4342d4.json deleted file mode 100644 index 6c3bf327c..000000000 --- a/data/hfopenllm_v2/zelk12/MT-Gen4-gemma-2-9B/de476f79-2539-4f9e-a1d2-901c6c4342d4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/zelk12_MT-Gen4-gemma-2-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MT-Gen4-gemma-2-9B", - "id": "zelk12/MT-Gen4-gemma-2-9B", - "developer": "zelk12", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - 
}, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7883 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.611 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2236 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3549 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4228 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4387 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/zelk12/MT-Gen5-gemma-2-9B/80aee542-c894-46b6-a6ed-9f3400aefa9e.json b/data/hfopenllm_v2/zelk12/MT-Gen5-gemma-2-9B/80aee542-c894-46b6-a6ed-9f3400aefa9e.json deleted file mode 100644 index 758fecb7b..000000000 --- a/data/hfopenllm_v2/zelk12/MT-Gen5-gemma-2-9B/80aee542-c894-46b6-a6ed-9f3400aefa9e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/zelk12_MT-Gen5-gemma-2-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MT-Gen5-gemma-2-9B", - "id": "zelk12/MT-Gen5-gemma-2-9B", - "developer": "zelk12", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7923 - } - }, - { - "evaluation_name": "BBH", - 
"source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6133 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2153 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3515 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4202 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4402 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/zelk12/MT-Gen6-gemma-2-9B/5c9d4eaf-0985-4f9e-8007-08b4081bb19d.json b/data/hfopenllm_v2/zelk12/MT-Gen6-gemma-2-9B/5c9d4eaf-0985-4f9e-8007-08b4081bb19d.json deleted file mode 100644 index d42787489..000000000 --- a/data/hfopenllm_v2/zelk12/MT-Gen6-gemma-2-9B/5c9d4eaf-0985-4f9e-8007-08b4081bb19d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/zelk12_MT-Gen6-gemma-2-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MT-Gen6-gemma-2-9B", - "id": "zelk12/MT-Gen6-gemma-2-9B", - "developer": "zelk12", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1616 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 
1.0 - }, - "score_details": { - "score": 0.5845 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0823 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3331 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4069 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4166 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/zelk12/MT-Gen6fix-gemma-2-9B/4b019824-8454-4ce8-aa49-d122a2491f9c.json b/data/hfopenllm_v2/zelk12/MT-Gen6fix-gemma-2-9B/4b019824-8454-4ce8-aa49-d122a2491f9c.json deleted file mode 100644 index ea018b3ba..000000000 --- a/data/hfopenllm_v2/zelk12/MT-Gen6fix-gemma-2-9B/4b019824-8454-4ce8-aa49-d122a2491f9c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/zelk12_MT-Gen6fix-gemma-2-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MT-Gen6fix-gemma-2-9B", - "id": "zelk12/MT-Gen6fix-gemma-2-9B", - "developer": "zelk12", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1576 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5917 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - 
"metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0816 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3372 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4084 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.412 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/zelk12/MT-Gen7-gemma-2-9B/0dfcd13c-f057-4aec-82ad-b5cf2b266502.json b/data/hfopenllm_v2/zelk12/MT-Gen7-gemma-2-9B/0dfcd13c-f057-4aec-82ad-b5cf2b266502.json deleted file mode 100644 index dd5c80820..000000000 --- a/data/hfopenllm_v2/zelk12/MT-Gen7-gemma-2-9B/0dfcd13c-f057-4aec-82ad-b5cf2b266502.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/zelk12_MT-Gen7-gemma-2-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MT-Gen7-gemma-2-9B", - "id": "zelk12/MT-Gen7-gemma-2-9B", - "developer": "zelk12", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1664 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5935 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0891 - } - }, - { - "evaluation_name": "GPQA", - 
"source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3356 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4098 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4122 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/zelk12/MT-Max-Merge_02012025163610-gemma-2-9B/927589bf-f6a0-4155-a24b-120231bbf029.json b/data/hfopenllm_v2/zelk12/MT-Max-Merge_02012025163610-gemma-2-9B/927589bf-f6a0-4155-a24b-120231bbf029.json deleted file mode 100644 index 57eea20ea..000000000 --- a/data/hfopenllm_v2/zelk12/MT-Max-Merge_02012025163610-gemma-2-9B/927589bf-f6a0-4155-a24b-120231bbf029.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/zelk12_MT-Max-Merge_02012025163610-gemma-2-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MT-Max-Merge_02012025163610-gemma-2-9B", - "id": "zelk12/MT-Max-Merge_02012025163610-gemma-2-9B", - "developer": "zelk12", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7907 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6142 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2213 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3515 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4228 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4396 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/zelk12/MT-Merge-gemma-2-9B/1a2740cb-c541-434e-89a1-7a9fd2c4cabd.json b/data/hfopenllm_v2/zelk12/MT-Merge-gemma-2-9B/1a2740cb-c541-434e-89a1-7a9fd2c4cabd.json deleted file mode 100644 index 3dbd1e532..000000000 --- a/data/hfopenllm_v2/zelk12/MT-Merge-gemma-2-9B/1a2740cb-c541-434e-89a1-7a9fd2c4cabd.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/zelk12_MT-Merge-gemma-2-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MT-Merge-gemma-2-9B", - "id": "zelk12/MT-Merge-gemma-2-9B", - "developer": "zelk12", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8035 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6118 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2205 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3482 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - 
"dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4256 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4362 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/zelk12/MT-Merge1-gemma-2-9B/0110d1c9-755e-4f09-888b-0c9c1a263639.json b/data/hfopenllm_v2/zelk12/MT-Merge1-gemma-2-9B/0110d1c9-755e-4f09-888b-0c9c1a263639.json deleted file mode 100644 index d496666d2..000000000 --- a/data/hfopenllm_v2/zelk12/MT-Merge1-gemma-2-9B/0110d1c9-755e-4f09-888b-0c9c1a263639.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/zelk12_MT-Merge1-gemma-2-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MT-Merge1-gemma-2-9B", - "id": "zelk12/MT-Merge1-gemma-2-9B", - "developer": "zelk12", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7901 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.61 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2289 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3515 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, 
- "score_details": { - "score": 0.4244 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4374 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/zelk12/MT-Merge2-MU-gemma-2-MTg2MT1g2-9B/cda65781-494c-45bd-8c32-7b1fe987f31c.json b/data/hfopenllm_v2/zelk12/MT-Merge2-MU-gemma-2-MTg2MT1g2-9B/cda65781-494c-45bd-8c32-7b1fe987f31c.json deleted file mode 100644 index 66adad61e..000000000 --- a/data/hfopenllm_v2/zelk12/MT-Merge2-MU-gemma-2-MTg2MT1g2-9B/cda65781-494c-45bd-8c32-7b1fe987f31c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/zelk12_MT-Merge2-MU-gemma-2-MTg2MT1g2-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MT-Merge2-MU-gemma-2-MTg2MT1g2-9B", - "id": "zelk12/MT-Merge2-MU-gemma-2-MTg2MT1g2-9B", - "developer": "zelk12", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7956 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6084 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2183 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3507 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4322 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": 
"TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4373 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/zelk12/MT-Merge2-gemma-2-9B/2fd7de02-f8d9-45c1-9bb5-db5134bd4862.json b/data/hfopenllm_v2/zelk12/MT-Merge2-gemma-2-9B/2fd7de02-f8d9-45c1-9bb5-db5134bd4862.json deleted file mode 100644 index 6d090a24c..000000000 --- a/data/hfopenllm_v2/zelk12/MT-Merge2-gemma-2-9B/2fd7de02-f8d9-45c1-9bb5-db5134bd4862.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/zelk12_MT-Merge2-gemma-2-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MT-Merge2-gemma-2-9B", - "id": "zelk12/MT-Merge2-gemma-2-9B", - "developer": "zelk12", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7877 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6107 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2349 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3507 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4217 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4382 - } - } - ] -} \ 
No newline at end of file diff --git a/data/hfopenllm_v2/zelk12/MT-Merge3-gemma-2-9B/acf07f51-5acd-4375-bafa-7a1a244db3c6.json b/data/hfopenllm_v2/zelk12/MT-Merge3-gemma-2-9B/acf07f51-5acd-4375-bafa-7a1a244db3c6.json deleted file mode 100644 index 78c6f3258..000000000 --- a/data/hfopenllm_v2/zelk12/MT-Merge3-gemma-2-9B/acf07f51-5acd-4375-bafa-7a1a244db3c6.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/zelk12_MT-Merge3-gemma-2-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MT-Merge3-gemma-2-9B", - "id": "zelk12/MT-Merge3-gemma-2-9B", - "developer": "zelk12", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7859 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6102 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2205 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.349 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4258 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4373 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/zelk12/MT-Merge4-gemma-2-9B/ff985193-ba26-45d3-97be-b7d3b17ab4d7.json b/data/hfopenllm_v2/zelk12/MT-Merge4-gemma-2-9B/ff985193-ba26-45d3-97be-b7d3b17ab4d7.json deleted file mode 100644 index 
f73843da3..000000000 --- a/data/hfopenllm_v2/zelk12/MT-Merge4-gemma-2-9B/ff985193-ba26-45d3-97be-b7d3b17ab4d7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/zelk12_MT-Merge4-gemma-2-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MT-Merge4-gemma-2-9B", - "id": "zelk12/MT-Merge4-gemma-2-9B", - "developer": "zelk12", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7807 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6118 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2168 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3523 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4294 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.439 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/zelk12/MT-Merge5-gemma-2-9B/21dbea2c-5cb1-431c-a496-af9b932b3440.json b/data/hfopenllm_v2/zelk12/MT-Merge5-gemma-2-9B/21dbea2c-5cb1-431c-a496-af9b932b3440.json deleted file mode 100644 index 9e45d9643..000000000 --- a/data/hfopenllm_v2/zelk12/MT-Merge5-gemma-2-9B/21dbea2c-5cb1-431c-a496-af9b932b3440.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/zelk12_MT-Merge5-gemma-2-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MT-Merge5-gemma-2-9B", - "id": "zelk12/MT-Merge5-gemma-2-9B", - "developer": "zelk12", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7844 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6123 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2183 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3532 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4281 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4387 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/zelk12/MT-Merge6-gemma-2-9B/1143955c-c32c-4b41-8484-2c77e72f4946.json b/data/hfopenllm_v2/zelk12/MT-Merge6-gemma-2-9B/1143955c-c32c-4b41-8484-2c77e72f4946.json deleted file mode 100644 index 348e60c56..000000000 --- a/data/hfopenllm_v2/zelk12/MT-Merge6-gemma-2-9B/1143955c-c32c-4b41-8484-2c77e72f4946.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/zelk12_MT-Merge6-gemma-2-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - 
"evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MT-Merge6-gemma-2-9B", - "id": "zelk12/MT-Merge6-gemma-2-9B", - "developer": "zelk12", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1695 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5949 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0801 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3289 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4098 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4115 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/zelk12/MT-gemma-2-9B/94824ceb-08c3-415c-8003-b70a0d9af09d.json b/data/hfopenllm_v2/zelk12/MT-gemma-2-9B/94824ceb-08c3-415c-8003-b70a0d9af09d.json deleted file mode 100644 index 2c62b725d..000000000 --- a/data/hfopenllm_v2/zelk12/MT-gemma-2-9B/94824ceb-08c3-415c-8003-b70a0d9af09d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/zelk12_MT-gemma-2-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MT-gemma-2-9B", - "id": "zelk12/MT-gemma-2-9B", - "developer": "zelk12", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - 
"params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7968 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6064 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2054 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3456 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4071 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4224 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/zelk12/MT1-Gen1-gemma-2-9B/bf2903cb-b954-4870-98c3-116a96aa49fb.json b/data/hfopenllm_v2/zelk12/MT1-Gen1-gemma-2-9B/bf2903cb-b954-4870-98c3-116a96aa49fb.json deleted file mode 100644 index 8e7b2f927..000000000 --- a/data/hfopenllm_v2/zelk12/MT1-Gen1-gemma-2-9B/bf2903cb-b954-4870-98c3-116a96aa49fb.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/zelk12_MT1-Gen1-gemma-2-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MT1-Gen1-gemma-2-9B", - "id": "zelk12/MT1-Gen1-gemma-2-9B", - "developer": "zelk12", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": 
"Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7974 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6118 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2243 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.344 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.431 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4376 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/zelk12/MT1-Gen2-gemma-2-9B/b089c439-a38c-438d-bdad-1c68a1265d95.json b/data/hfopenllm_v2/zelk12/MT1-Gen2-gemma-2-9B/b089c439-a38c-438d-bdad-1c68a1265d95.json deleted file mode 100644 index 714f19ac3..000000000 --- a/data/hfopenllm_v2/zelk12/MT1-Gen2-gemma-2-9B/b089c439-a38c-438d-bdad-1c68a1265d95.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/zelk12_MT1-Gen2-gemma-2-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MT1-Gen2-gemma-2-9B", - "id": "zelk12/MT1-Gen2-gemma-2-9B", - "developer": "zelk12", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7984 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": 
"hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6096 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2251 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3523 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4284 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4355 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/zelk12/MT1-Gen3-gemma-2-9B/c988815b-50e5-47e4-a418-bbbcdf1eb4a0.json b/data/hfopenllm_v2/zelk12/MT1-Gen3-gemma-2-9B/c988815b-50e5-47e4-a418-bbbcdf1eb4a0.json deleted file mode 100644 index e7e79fa7b..000000000 --- a/data/hfopenllm_v2/zelk12/MT1-Gen3-gemma-2-9B/c988815b-50e5-47e4-a418-bbbcdf1eb4a0.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/zelk12_MT1-Gen3-gemma-2-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MT1-Gen3-gemma-2-9B", - "id": "zelk12/MT1-Gen3-gemma-2-9B", - "developer": "zelk12", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.796 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6102 - } - 
}, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2243 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.349 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4243 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4349 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/zelk12/MT1-Gen4-gemma-2-9B/fa11d66c-7ebc-4b81-83b7-d35a4ff23d3f.json b/data/hfopenllm_v2/zelk12/MT1-Gen4-gemma-2-9B/fa11d66c-7ebc-4b81-83b7-d35a4ff23d3f.json deleted file mode 100644 index b6d5e1c4b..000000000 --- a/data/hfopenllm_v2/zelk12/MT1-Gen4-gemma-2-9B/fa11d66c-7ebc-4b81-83b7-d35a4ff23d3f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/zelk12_MT1-Gen4-gemma-2-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MT1-Gen4-gemma-2-9B", - "id": "zelk12/MT1-Gen4-gemma-2-9B", - "developer": "zelk12", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7941 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6058 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.216 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3473 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4231 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4286 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/zelk12/MT1-Gen5-IF-gemma-2-S2DMv1-9B/1c81787b-594e-4bb6-aee1-7f193a628b16.json b/data/hfopenllm_v2/zelk12/MT1-Gen5-IF-gemma-2-S2DMv1-9B/1c81787b-594e-4bb6-aee1-7f193a628b16.json deleted file mode 100644 index c2e0e4354..000000000 --- a/data/hfopenllm_v2/zelk12/MT1-Gen5-IF-gemma-2-S2DMv1-9B/1c81787b-594e-4bb6-aee1-7f193a628b16.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/zelk12_MT1-Gen5-IF-gemma-2-S2DMv1-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MT1-Gen5-IF-gemma-2-S2DMv1-9B", - "id": "zelk12/MT1-Gen5-IF-gemma-2-S2DMv1-9B", - "developer": "zelk12", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7929 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2032 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - 
"dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.344 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4245 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4218 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/zelk12/MT1-Gen5-gemma-2-9B/fd9ce37e-d43d-4ec2-94ec-0eb42e3cc685.json b/data/hfopenllm_v2/zelk12/MT1-Gen5-gemma-2-9B/fd9ce37e-d43d-4ec2-94ec-0eb42e3cc685.json deleted file mode 100644 index c299e1f9b..000000000 --- a/data/hfopenllm_v2/zelk12/MT1-Gen5-gemma-2-9B/fd9ce37e-d43d-4ec2-94ec-0eb42e3cc685.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/zelk12_MT1-Gen5-gemma-2-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MT1-Gen5-gemma-2-9B", - "id": "zelk12/MT1-Gen5-gemma-2-9B", - "developer": "zelk12", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7795 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6017 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2077 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.3465 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4191 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4222 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/zelk12/MT1-Gen6-gemma-2-9B/0625f09a-3e02-410b-963b-49b83dfc5c8f.json b/data/hfopenllm_v2/zelk12/MT1-Gen6-gemma-2-9B/0625f09a-3e02-410b-963b-49b83dfc5c8f.json deleted file mode 100644 index 550cda232..000000000 --- a/data/hfopenllm_v2/zelk12/MT1-Gen6-gemma-2-9B/0625f09a-3e02-410b-963b-49b83dfc5c8f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/zelk12_MT1-Gen6-gemma-2-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MT1-Gen6-gemma-2-9B", - "id": "zelk12/MT1-Gen6-gemma-2-9B", - "developer": "zelk12", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1634 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5944 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0808 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.328 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4044 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4133 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/zelk12/MT1-Gen7-gemma-2-9B/50c1399e-b409-4dff-b4d6-9be01dbb02c7.json b/data/hfopenllm_v2/zelk12/MT1-Gen7-gemma-2-9B/50c1399e-b409-4dff-b4d6-9be01dbb02c7.json deleted file mode 100644 index 2e79a59e4..000000000 --- a/data/hfopenllm_v2/zelk12/MT1-Gen7-gemma-2-9B/50c1399e-b409-4dff-b4d6-9be01dbb02c7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/zelk12_MT1-Gen7-gemma-2-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MT1-Gen7-gemma-2-9B", - "id": "zelk12/MT1-Gen7-gemma-2-9B", - "developer": "zelk12", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1634 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5938 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0831 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.328 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4111 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", 
- "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4145 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/zelk12/MT1-Max-Merge_02012025163610-gemma-2-9B/402bdb4a-b258-40a4-ac9f-de74026c02f3.json b/data/hfopenllm_v2/zelk12/MT1-Max-Merge_02012025163610-gemma-2-9B/402bdb4a-b258-40a4-ac9f-de74026c02f3.json deleted file mode 100644 index 09acf9468..000000000 --- a/data/hfopenllm_v2/zelk12/MT1-Max-Merge_02012025163610-gemma-2-9B/402bdb4a-b258-40a4-ac9f-de74026c02f3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/zelk12_MT1-Max-Merge_02012025163610-gemma-2-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MT1-Max-Merge_02012025163610-gemma-2-9B", - "id": "zelk12/MT1-Max-Merge_02012025163610-gemma-2-9B", - "developer": "zelk12", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7929 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6123 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2228 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3549 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4255 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4382 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/zelk12/MT1-gemma-2-9B/65dcf458-db0f-45cd-a8a4-e16108e51161.json b/data/hfopenllm_v2/zelk12/MT1-gemma-2-9B/65dcf458-db0f-45cd-a8a4-e16108e51161.json deleted file mode 100644 index f7d6bc79e..000000000 --- a/data/hfopenllm_v2/zelk12/MT1-gemma-2-9B/65dcf458-db0f-45cd-a8a4-e16108e51161.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/zelk12_MT1-gemma-2-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MT1-gemma-2-9B", - "id": "zelk12/MT1-gemma-2-9B", - "developer": "zelk12", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7947 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6109 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2236 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3456 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4322 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4358 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/zelk12/MT2-Gen1-gemma-2-9B/f1346b1a-0e66-4d80-bfad-ccbe0a8e2abf.json 
b/data/hfopenllm_v2/zelk12/MT2-Gen1-gemma-2-9B/f1346b1a-0e66-4d80-bfad-ccbe0a8e2abf.json deleted file mode 100644 index 44374c9d5..000000000 --- a/data/hfopenllm_v2/zelk12/MT2-Gen1-gemma-2-9B/f1346b1a-0e66-4d80-bfad-ccbe0a8e2abf.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/zelk12_MT2-Gen1-gemma-2-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MT2-Gen1-gemma-2-9B", - "id": "zelk12/MT2-Gen1-gemma-2-9B", - "developer": "zelk12", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7856 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6101 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2213 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3431 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4243 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4377 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/zelk12/MT2-Gen2-gemma-2-9B/11e7b55a-d872-474a-98a6-fc82ce5a863e.json b/data/hfopenllm_v2/zelk12/MT2-Gen2-gemma-2-9B/11e7b55a-d872-474a-98a6-fc82ce5a863e.json deleted file mode 100644 index 3faf801d3..000000000 --- a/data/hfopenllm_v2/zelk12/MT2-Gen2-gemma-2-9B/11e7b55a-d872-474a-98a6-fc82ce5a863e.json +++ /dev/null @@ -1,132 +0,0 
@@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/zelk12_MT2-Gen2-gemma-2-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MT2-Gen2-gemma-2-9B", - "id": "zelk12/MT2-Gen2-gemma-2-9B", - "developer": "zelk12", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7889 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6093 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2183 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3465 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.427 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4388 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/zelk12/MT2-Gen3-gemma-2-9B/19688633-fa6c-412a-8dbc-c16fc49b3276.json b/data/hfopenllm_v2/zelk12/MT2-Gen3-gemma-2-9B/19688633-fa6c-412a-8dbc-c16fc49b3276.json deleted file mode 100644 index b70f067a8..000000000 --- a/data/hfopenllm_v2/zelk12/MT2-Gen3-gemma-2-9B/19688633-fa6c-412a-8dbc-c16fc49b3276.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/zelk12_MT2-Gen3-gemma-2-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - 
"source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MT2-Gen3-gemma-2-9B", - "id": "zelk12/MT2-Gen3-gemma-2-9B", - "developer": "zelk12", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.781 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6105 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2107 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3465 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4231 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4374 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/zelk12/MT2-Gen4-gemma-2-9B/7d67eb9c-a4d8-4b86-8c24-928ebbe58de7.json b/data/hfopenllm_v2/zelk12/MT2-Gen4-gemma-2-9B/7d67eb9c-a4d8-4b86-8c24-928ebbe58de7.json deleted file mode 100644 index 70dd1549f..000000000 --- a/data/hfopenllm_v2/zelk12/MT2-Gen4-gemma-2-9B/7d67eb9c-a4d8-4b86-8c24-928ebbe58de7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/zelk12_MT2-Gen4-gemma-2-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MT2-Gen4-gemma-2-9B", - "id": "zelk12/MT2-Gen4-gemma-2-9B", - "developer": "zelk12", - "inference_platform": "unknown", - "additional_details": 
{ - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7896 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6097 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2236 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3456 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4125 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4321 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/zelk12/MT2-Gen5-gemma-2-9B/447f880c-643f-4041-8cdb-87697d798085.json b/data/hfopenllm_v2/zelk12/MT2-Gen5-gemma-2-9B/447f880c-643f-4041-8cdb-87697d798085.json deleted file mode 100644 index 2ef223d58..000000000 --- a/data/hfopenllm_v2/zelk12/MT2-Gen5-gemma-2-9B/447f880c-643f-4041-8cdb-87697d798085.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/zelk12_MT2-Gen5-gemma-2-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MT2-Gen5-gemma-2-9B", - "id": "zelk12/MT2-Gen5-gemma-2-9B", - "developer": "zelk12", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": 
"google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7749 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6064 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2107 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3515 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4244 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4302 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/zelk12/MT2-Gen6-gemma-2-9B/653d459e-f8b7-48bc-a9db-779e515532cf.json b/data/hfopenllm_v2/zelk12/MT2-Gen6-gemma-2-9B/653d459e-f8b7-48bc-a9db-779e515532cf.json deleted file mode 100644 index 828f27f8f..000000000 --- a/data/hfopenllm_v2/zelk12/MT2-Gen6-gemma-2-9B/653d459e-f8b7-48bc-a9db-779e515532cf.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/zelk12_MT2-Gen6-gemma-2-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MT2-Gen6-gemma-2-9B", - "id": "zelk12/MT2-Gen6-gemma-2-9B", - "developer": "zelk12", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1664 - } - }, - { - 
"evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.596 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0846 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3381 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4137 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.421 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/zelk12/MT2-Gen7-gemma-2-9B/4e56faf6-dbde-4059-b502-32c76bdbed2d.json b/data/hfopenllm_v2/zelk12/MT2-Gen7-gemma-2-9B/4e56faf6-dbde-4059-b502-32c76bdbed2d.json deleted file mode 100644 index 64fc4a08f..000000000 --- a/data/hfopenllm_v2/zelk12/MT2-Gen7-gemma-2-9B/4e56faf6-dbde-4059-b502-32c76bdbed2d.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/zelk12_MT2-Gen7-gemma-2-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MT2-Gen7-gemma-2-9B", - "id": "zelk12/MT2-Gen7-gemma-2-9B", - "developer": "zelk12", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1762 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6079 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.102 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3549 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4203 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4311 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/zelk12/MT2-Max-Merge_02012025163610-gemma-2-9B/f161df97-3cc6-48d3-bfc5-d3f01108ecbb.json b/data/hfopenllm_v2/zelk12/MT2-Max-Merge_02012025163610-gemma-2-9B/f161df97-3cc6-48d3-bfc5-d3f01108ecbb.json deleted file mode 100644 index 565a40eb8..000000000 --- a/data/hfopenllm_v2/zelk12/MT2-Max-Merge_02012025163610-gemma-2-9B/f161df97-3cc6-48d3-bfc5-d3f01108ecbb.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/zelk12_MT2-Max-Merge_02012025163610-gemma-2-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MT2-Max-Merge_02012025163610-gemma-2-9B", - "id": "zelk12/MT2-Max-Merge_02012025163610-gemma-2-9B", - "developer": "zelk12", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7901 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6108 - } - }, - { - "evaluation_name": "MATH Level 5", - 
"source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2243 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3515 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4228 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4391 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/zelk12/MT2-gemma-2-9B/7d08412d-e987-497f-a6ec-ce0affe0f80f.json b/data/hfopenllm_v2/zelk12/MT2-gemma-2-9B/7d08412d-e987-497f-a6ec-ce0affe0f80f.json deleted file mode 100644 index adfbad27f..000000000 --- a/data/hfopenllm_v2/zelk12/MT2-gemma-2-9B/7d08412d-e987-497f-a6ec-ce0affe0f80f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/zelk12_MT2-gemma-2-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MT2-gemma-2-9B", - "id": "zelk12/MT2-gemma-2-9B", - "developer": "zelk12", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7886 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6115 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 
0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2213 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3473 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4217 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4368 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/zelk12/MT3-Gen1-gemma-2-9B/f042f897-cfe8-4d8c-b75b-bbfca44505ea.json b/data/hfopenllm_v2/zelk12/MT3-Gen1-gemma-2-9B/f042f897-cfe8-4d8c-b75b-bbfca44505ea.json deleted file mode 100644 index 4eb9a7a77..000000000 --- a/data/hfopenllm_v2/zelk12/MT3-Gen1-gemma-2-9B/f042f897-cfe8-4d8c-b75b-bbfca44505ea.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/zelk12_MT3-Gen1-gemma-2-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MT3-Gen1-gemma-2-9B", - "id": "zelk12/MT3-Gen1-gemma-2-9B", - "developer": "zelk12", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7838 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6107 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2145 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - 
"evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3465 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4151 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4327 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/zelk12/MT3-Gen2-gemma-2-9B/f24ab334-c022-4e34-a930-3fed6ee18793.json b/data/hfopenllm_v2/zelk12/MT3-Gen2-gemma-2-9B/f24ab334-c022-4e34-a930-3fed6ee18793.json deleted file mode 100644 index 84a45fd26..000000000 --- a/data/hfopenllm_v2/zelk12/MT3-Gen2-gemma-2-9B/f24ab334-c022-4e34-a930-3fed6ee18793.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/zelk12_MT3-Gen2-gemma-2-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MT3-Gen2-gemma-2-9B", - "id": "zelk12/MT3-Gen2-gemma-2-9B", - "developer": "zelk12", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7843 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6091 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2236 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3574 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - 
"dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4111 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4333 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/zelk12/MT3-Gen3-gemma-2-9B/2bd3c620-780f-452d-92d7-d01a04539939.json b/data/hfopenllm_v2/zelk12/MT3-Gen3-gemma-2-9B/2bd3c620-780f-452d-92d7-d01a04539939.json deleted file mode 100644 index 03053bdd1..000000000 --- a/data/hfopenllm_v2/zelk12/MT3-Gen3-gemma-2-9B/2bd3c620-780f-452d-92d7-d01a04539939.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/zelk12_MT3-Gen3-gemma-2-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MT3-Gen3-gemma-2-9B", - "id": "zelk12/MT3-Gen3-gemma-2-9B", - "developer": "zelk12", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7856 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6089 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2153 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3515 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.4258 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4303 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/zelk12/MT3-Gen4-gemma-2-9B/234042bd-237f-4cc5-8c5d-1eacd2e8bfaa.json b/data/hfopenllm_v2/zelk12/MT3-Gen4-gemma-2-9B/234042bd-237f-4cc5-8c5d-1eacd2e8bfaa.json deleted file mode 100644 index 283c8ee00..000000000 --- a/data/hfopenllm_v2/zelk12/MT3-Gen4-gemma-2-9B/234042bd-237f-4cc5-8c5d-1eacd2e8bfaa.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/zelk12_MT3-Gen4-gemma-2-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MT3-Gen4-gemma-2-9B", - "id": "zelk12/MT3-Gen4-gemma-2-9B", - "developer": "zelk12", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7737 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6101 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2062 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3473 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4476 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on 
MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4387 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/zelk12/MT3-Gen5-gemma-2-9B/d8e0a32e-f307-4056-b450-47a12a0a7b15.json b/data/hfopenllm_v2/zelk12/MT3-Gen5-gemma-2-9B/d8e0a32e-f307-4056-b450-47a12a0a7b15.json deleted file mode 100644 index e636eb2e9..000000000 --- a/data/hfopenllm_v2/zelk12/MT3-Gen5-gemma-2-9B/d8e0a32e-f307-4056-b450-47a12a0a7b15.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/zelk12_MT3-Gen5-gemma-2-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MT3-Gen5-gemma-2-9B", - "id": "zelk12/MT3-Gen5-gemma-2-9B", - "developer": "zelk12", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.799 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6099 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2266 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3532 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4191 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4317 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/zelk12/MT3-Gen5-gemma-2-9B_v1/9dc3c4f5-8974-4496-8a6e-daa4fe3e3c2a.json b/data/hfopenllm_v2/zelk12/MT3-Gen5-gemma-2-9B_v1/9dc3c4f5-8974-4496-8a6e-daa4fe3e3c2a.json deleted file mode 100644 index 90b69beb0..000000000 --- a/data/hfopenllm_v2/zelk12/MT3-Gen5-gemma-2-9B_v1/9dc3c4f5-8974-4496-8a6e-daa4fe3e3c2a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/zelk12_MT3-Gen5-gemma-2-9B_v1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MT3-Gen5-gemma-2-9B_v1", - "id": "zelk12/MT3-Gen5-gemma-2-9B_v1", - "developer": "zelk12", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7996 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6113 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2228 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.349 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4204 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4359 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/zelk12/MT3-Gen6-gemma-2-9B/037787fb-9c61-4c56-a7fc-704c04b519f7.json b/data/hfopenllm_v2/zelk12/MT3-Gen6-gemma-2-9B/037787fb-9c61-4c56-a7fc-704c04b519f7.json deleted file mode 100644 index bff72f6ad..000000000 --- 
a/data/hfopenllm_v2/zelk12/MT3-Gen6-gemma-2-9B/037787fb-9c61-4c56-a7fc-704c04b519f7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/zelk12_MT3-Gen6-gemma-2-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MT3-Gen6-gemma-2-9B", - "id": "zelk12/MT3-Gen6-gemma-2-9B", - "developer": "zelk12", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1762 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.602 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0884 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3431 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4126 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4102 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/zelk12/MT3-Max-Merge_02012025163610-gemma-2-9B/5df3dd8f-4921-4916-8163-8651b796e478.json b/data/hfopenllm_v2/zelk12/MT3-Max-Merge_02012025163610-gemma-2-9B/5df3dd8f-4921-4916-8163-8651b796e478.json deleted file mode 100644 index 71138b3be..000000000 --- a/data/hfopenllm_v2/zelk12/MT3-Max-Merge_02012025163610-gemma-2-9B/5df3dd8f-4921-4916-8163-8651b796e478.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/zelk12_MT3-Max-Merge_02012025163610-gemma-2-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MT3-Max-Merge_02012025163610-gemma-2-9B", - "id": "zelk12/MT3-Max-Merge_02012025163610-gemma-2-9B", - "developer": "zelk12", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1762 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6123 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1012 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3507 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4255 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4389 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/zelk12/MT3-gemma-2-9B/50463593-3a53-4b3f-9621-d05670309b7e.json b/data/hfopenllm_v2/zelk12/MT3-gemma-2-9B/50463593-3a53-4b3f-9621-d05670309b7e.json deleted file mode 100644 index 16582b121..000000000 --- a/data/hfopenllm_v2/zelk12/MT3-gemma-2-9B/50463593-3a53-4b3f-9621-d05670309b7e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/zelk12_MT3-gemma-2-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - 
"source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MT3-gemma-2-9B", - "id": "zelk12/MT3-gemma-2-9B", - "developer": "zelk12", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7786 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6131 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2168 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3448 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4243 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4327 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/zelk12/MT4-Gen1-gemma-2-9B/d7fef356-36c7-488f-8f49-997682a2c01a.json b/data/hfopenllm_v2/zelk12/MT4-Gen1-gemma-2-9B/d7fef356-36c7-488f-8f49-997682a2c01a.json deleted file mode 100644 index 7bdf9dc8f..000000000 --- a/data/hfopenllm_v2/zelk12/MT4-Gen1-gemma-2-9B/d7fef356-36c7-488f-8f49-997682a2c01a.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/zelk12_MT4-Gen1-gemma-2-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MT4-Gen1-gemma-2-9B", - "id": "zelk12/MT4-Gen1-gemma-2-9B", - "developer": "zelk12", - "inference_platform": "unknown", - "additional_details": { - 
"precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7895 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6094 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2198 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.344 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4322 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4389 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/zelk12/MT4-Gen2-gemma-2-9B/42e7abc6-eaa2-4971-90ee-e4d9dbb97ddb.json b/data/hfopenllm_v2/zelk12/MT4-Gen2-gemma-2-9B/42e7abc6-eaa2-4971-90ee-e4d9dbb97ddb.json deleted file mode 100644 index 3b4ba860e..000000000 --- a/data/hfopenllm_v2/zelk12/MT4-Gen2-gemma-2-9B/42e7abc6-eaa2-4971-90ee-e4d9dbb97ddb.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/zelk12_MT4-Gen2-gemma-2-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MT4-Gen2-gemma-2-9B", - "id": "zelk12/MT4-Gen2-gemma-2-9B", - "developer": "zelk12", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": 
"google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8051 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6108 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2326 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3456 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4257 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4368 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/zelk12/MT4-Gen3-gemma-2-9B/b1cf06a6-d270-41ae-bb9b-443bdc5446f3.json b/data/hfopenllm_v2/zelk12/MT4-Gen3-gemma-2-9B/b1cf06a6-d270-41ae-bb9b-443bdc5446f3.json deleted file mode 100644 index 5a8bcf436..000000000 --- a/data/hfopenllm_v2/zelk12/MT4-Gen3-gemma-2-9B/b1cf06a6-d270-41ae-bb9b-443bdc5446f3.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/zelk12_MT4-Gen3-gemma-2-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MT4-Gen3-gemma-2-9B", - "id": "zelk12/MT4-Gen3-gemma-2-9B", - "developer": "zelk12", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7841 - } - }, - { - 
"evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6087 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.219 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.344 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4243 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4381 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/zelk12/MT4-Gen4-gemma-2-9B/e40ea476-bcc5-4d3b-bf8e-e5048d9cbe42.json b/data/hfopenllm_v2/zelk12/MT4-Gen4-gemma-2-9B/e40ea476-bcc5-4d3b-bf8e-e5048d9cbe42.json deleted file mode 100644 index fd69e6724..000000000 --- a/data/hfopenllm_v2/zelk12/MT4-Gen4-gemma-2-9B/e40ea476-bcc5-4d3b-bf8e-e5048d9cbe42.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/zelk12_MT4-Gen4-gemma-2-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MT4-Gen4-gemma-2-9B", - "id": "zelk12/MT4-Gen4-gemma-2-9B", - "developer": "zelk12", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7874 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6076 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2145 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3523 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4244 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4323 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/zelk12/MT4-Gen5-gemma-2-9B/731a5f85-a59e-40af-870c-00e519ca0e7e.json b/data/hfopenllm_v2/zelk12/MT4-Gen5-gemma-2-9B/731a5f85-a59e-40af-870c-00e519ca0e7e.json deleted file mode 100644 index bc85cdb46..000000000 --- a/data/hfopenllm_v2/zelk12/MT4-Gen5-gemma-2-9B/731a5f85-a59e-40af-870c-00e519ca0e7e.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/zelk12_MT4-Gen5-gemma-2-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MT4-Gen5-gemma-2-9B", - "id": "zelk12/MT4-Gen5-gemma-2-9B", - "developer": "zelk12", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7789 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6107 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": 
"DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2266 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3565 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4268 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4384 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/zelk12/MT4-Max-Merge_02012025163610-gemma-2-9B/38d93ae8-90ec-473c-8570-33d52c46770b.json b/data/hfopenllm_v2/zelk12/MT4-Max-Merge_02012025163610-gemma-2-9B/38d93ae8-90ec-473c-8570-33d52c46770b.json deleted file mode 100644 index 197b1a326..000000000 --- a/data/hfopenllm_v2/zelk12/MT4-Max-Merge_02012025163610-gemma-2-9B/38d93ae8-90ec-473c-8570-33d52c46770b.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/zelk12_MT4-Max-Merge_02012025163610-gemma-2-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MT4-Max-Merge_02012025163610-gemma-2-9B", - "id": "zelk12/MT4-Max-Merge_02012025163610-gemma-2-9B", - "developer": "zelk12", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1771 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.612 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": 
false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0952 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3515 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4228 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4391 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/zelk12/MT4-gemma-2-9B/9072fd28-040b-44df-bd58-6e3f59398189.json b/data/hfopenllm_v2/zelk12/MT4-gemma-2-9B/9072fd28-040b-44df-bd58-6e3f59398189.json deleted file mode 100644 index 7380725c6..000000000 --- a/data/hfopenllm_v2/zelk12/MT4-gemma-2-9B/9072fd28-040b-44df-bd58-6e3f59398189.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/zelk12_MT4-gemma-2-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MT4-gemma-2-9B", - "id": "zelk12/MT4-gemma-2-9B", - "developer": "zelk12", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7762 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6073 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2085 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": 
{ - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3381 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4309 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4366 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/zelk12/MT5-Gen1-gemma-2-9B/14827e00-09c5-4ebd-93cb-8e026ac73d20.json b/data/hfopenllm_v2/zelk12/MT5-Gen1-gemma-2-9B/14827e00-09c5-4ebd-93cb-8e026ac73d20.json deleted file mode 100644 index 045a5d93e..000000000 --- a/data/hfopenllm_v2/zelk12/MT5-Gen1-gemma-2-9B/14827e00-09c5-4ebd-93cb-8e026ac73d20.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/zelk12_MT5-Gen1-gemma-2-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MT5-Gen1-gemma-2-9B", - "id": "zelk12/MT5-Gen1-gemma-2-9B", - "developer": "zelk12", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7831 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.611 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2213 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3473 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - 
"dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4204 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4368 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/zelk12/MT5-Gen2-gemma-2-9B/11e76d74-b8e0-408f-b429-566faa5d60a2.json b/data/hfopenllm_v2/zelk12/MT5-Gen2-gemma-2-9B/11e76d74-b8e0-408f-b429-566faa5d60a2.json deleted file mode 100644 index 0398ea5dd..000000000 --- a/data/hfopenllm_v2/zelk12/MT5-Gen2-gemma-2-9B/11e76d74-b8e0-408f-b429-566faa5d60a2.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/zelk12_MT5-Gen2-gemma-2-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MT5-Gen2-gemma-2-9B", - "id": "zelk12/MT5-Gen2-gemma-2-9B", - "developer": "zelk12", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7962 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6105 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2205 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3515 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.4163 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4379 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/zelk12/MT5-Gen3-gemma-2-9B/944c84d8-231d-47ef-85f4-23c0286a4a02.json b/data/hfopenllm_v2/zelk12/MT5-Gen3-gemma-2-9B/944c84d8-231d-47ef-85f4-23c0286a4a02.json deleted file mode 100644 index 1735bd1eb..000000000 --- a/data/hfopenllm_v2/zelk12/MT5-Gen3-gemma-2-9B/944c84d8-231d-47ef-85f4-23c0286a4a02.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/zelk12_MT5-Gen3-gemma-2-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MT5-Gen3-gemma-2-9B", - "id": "zelk12/MT5-Gen3-gemma-2-9B", - "developer": "zelk12", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7825 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.609 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2168 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3515 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4231 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on 
MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4375 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/zelk12/MT5-Gen4-gemma-2-9B/47c8da1d-8ce3-4d19-b8b8-6b5e68e2e8ab.json b/data/hfopenllm_v2/zelk12/MT5-Gen4-gemma-2-9B/47c8da1d-8ce3-4d19-b8b8-6b5e68e2e8ab.json deleted file mode 100644 index 4db3a4fe8..000000000 --- a/data/hfopenllm_v2/zelk12/MT5-Gen4-gemma-2-9B/47c8da1d-8ce3-4d19-b8b8-6b5e68e2e8ab.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/zelk12_MT5-Gen4-gemma-2-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MT5-Gen4-gemma-2-9B", - "id": "zelk12/MT5-Gen4-gemma-2-9B", - "developer": "zelk12", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7835 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6131 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2243 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3532 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4228 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4397 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/zelk12/MT5-Gen5-gemma-2-9B/ca54a8d4-153b-4169-b6ee-133461a9bedd.json b/data/hfopenllm_v2/zelk12/MT5-Gen5-gemma-2-9B/ca54a8d4-153b-4169-b6ee-133461a9bedd.json deleted file mode 100644 index 08cfd2a70..000000000 --- a/data/hfopenllm_v2/zelk12/MT5-Gen5-gemma-2-9B/ca54a8d4-153b-4169-b6ee-133461a9bedd.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/zelk12_MT5-Gen5-gemma-2-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MT5-Gen5-gemma-2-9B", - "id": "zelk12/MT5-Gen5-gemma-2-9B", - "developer": "zelk12", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7947 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6112 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2258 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3482 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4191 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4329 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/zelk12/MT5-Max-Merge_02012025163610-gemma-2-9B/652359ec-14f2-4f94-a694-b7dc98819bfc.json b/data/hfopenllm_v2/zelk12/MT5-Max-Merge_02012025163610-gemma-2-9B/652359ec-14f2-4f94-a694-b7dc98819bfc.json deleted file mode 100644 index 
5dfae6c49..000000000 --- a/data/hfopenllm_v2/zelk12/MT5-Max-Merge_02012025163610-gemma-2-9B/652359ec-14f2-4f94-a694-b7dc98819bfc.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/zelk12_MT5-Max-Merge_02012025163610-gemma-2-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MT5-Max-Merge_02012025163610-gemma-2-9B", - "id": "zelk12/MT5-Max-Merge_02012025163610-gemma-2-9B", - "developer": "zelk12", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1762 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6127 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0982 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3515 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4228 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.439 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/zelk12/MT5-gemma-2-9B/b34f3335-c7a3-431f-b2c8-6f0731a81378.json b/data/hfopenllm_v2/zelk12/MT5-gemma-2-9B/b34f3335-c7a3-431f-b2c8-6f0731a81378.json deleted file mode 100644 index f6eb0dcef..000000000 --- a/data/hfopenllm_v2/zelk12/MT5-gemma-2-9B/b34f3335-c7a3-431f-b2c8-6f0731a81378.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/zelk12_MT5-gemma-2-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MT5-gemma-2-9B", - "id": "zelk12/MT5-gemma-2-9B", - "developer": "zelk12", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8048 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6112 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2258 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3431 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4204 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4367 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/zelk12/MTM-Merge-gemma-2-9B/077306f9-5d40-40dc-9df4-b5ca559af5c7.json b/data/hfopenllm_v2/zelk12/MTM-Merge-gemma-2-9B/077306f9-5d40-40dc-9df4-b5ca559af5c7.json deleted file mode 100644 index 5b5954b79..000000000 --- a/data/hfopenllm_v2/zelk12/MTM-Merge-gemma-2-9B/077306f9-5d40-40dc-9df4-b5ca559af5c7.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/zelk12_MTM-Merge-gemma-2-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": 
"third_party" - }, - "model_info": { - "name": "MTM-Merge-gemma-2-9B", - "id": "zelk12/MTM-Merge-gemma-2-9B", - "developer": "zelk12", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7798 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6133 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2175 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3549 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4268 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4388 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/zelk12/MTMaMe-Merge_02012025163610-gemma-2-9B/e0f0fe87-8ed3-4398-8683-65aa042d01d9.json b/data/hfopenllm_v2/zelk12/MTMaMe-Merge_02012025163610-gemma-2-9B/e0f0fe87-8ed3-4398-8683-65aa042d01d9.json deleted file mode 100644 index 15b286068..000000000 --- a/data/hfopenllm_v2/zelk12/MTMaMe-Merge_02012025163610-gemma-2-9B/e0f0fe87-8ed3-4398-8683-65aa042d01d9.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/zelk12_MTMaMe-Merge_02012025163610-gemma-2-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MTMaMe-Merge_02012025163610-gemma-2-9B", - "id": "zelk12/MTMaMe-Merge_02012025163610-gemma-2-9B", - "developer": "zelk12", - 
"inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1786 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6117 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0959 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3523 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4241 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4382 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/zelk12/Rv0.4DMv1t0.25-gemma-2-9B/2d968d3e-a3df-4bdf-86a4-034087c0d7fc.json b/data/hfopenllm_v2/zelk12/Rv0.4DMv1t0.25-gemma-2-9B/2d968d3e-a3df-4bdf-86a4-034087c0d7fc.json deleted file mode 100644 index eeccbef87..000000000 --- a/data/hfopenllm_v2/zelk12/Rv0.4DMv1t0.25-gemma-2-9B/2d968d3e-a3df-4bdf-86a4-034087c0d7fc.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/zelk12_Rv0.4DMv1t0.25-gemma-2-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Rv0.4DMv1t0.25-gemma-2-9B", - "id": "zelk12/Rv0.4DMv1t0.25-gemma-2-9B", - "developer": "zelk12", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - 
"source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7497 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.607 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2258 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3456 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4309 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4401 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/zelk12/Rv0.4DMv1t0.25Tt0.25-gemma-2-9B/db476911-87fb-433f-b164-4435718dab46.json b/data/hfopenllm_v2/zelk12/Rv0.4DMv1t0.25Tt0.25-gemma-2-9B/db476911-87fb-433f-b164-4435718dab46.json deleted file mode 100644 index 48c1ee534..000000000 --- a/data/hfopenllm_v2/zelk12/Rv0.4DMv1t0.25Tt0.25-gemma-2-9B/db476911-87fb-433f-b164-4435718dab46.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/zelk12_Rv0.4DMv1t0.25Tt0.25-gemma-2-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Rv0.4DMv1t0.25Tt0.25-gemma-2-9B", - "id": "zelk12/Rv0.4DMv1t0.25Tt0.25-gemma-2-9B", - "developer": "zelk12", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7646 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6098 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2069 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3423 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4283 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4347 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/zelk12/Rv0.4MT4g2-gemma-2-9B/75a967f6-a8ab-435f-999b-4889e8217dce.json b/data/hfopenllm_v2/zelk12/Rv0.4MT4g2-gemma-2-9B/75a967f6-a8ab-435f-999b-4889e8217dce.json deleted file mode 100644 index eb1701501..000000000 --- a/data/hfopenllm_v2/zelk12/Rv0.4MT4g2-gemma-2-9B/75a967f6-a8ab-435f-999b-4889e8217dce.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/zelk12_Rv0.4MT4g2-gemma-2-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Rv0.4MT4g2-gemma-2-9B", - "id": "zelk12/Rv0.4MT4g2-gemma-2-9B", - "developer": "zelk12", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.732 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": 
"hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6041 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1949 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3532 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4231 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4417 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/zelk12/T31122024203920-gemma-2-9B/e072997b-2f79-4d25-b8dc-ebf15ac311e1.json b/data/hfopenllm_v2/zelk12/T31122024203920-gemma-2-9B/e072997b-2f79-4d25-b8dc-ebf15ac311e1.json deleted file mode 100644 index 2fd27ce58..000000000 --- a/data/hfopenllm_v2/zelk12/T31122024203920-gemma-2-9B/e072997b-2f79-4d25-b8dc-ebf15ac311e1.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/zelk12_T31122024203920-gemma-2-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "T31122024203920-gemma-2-9B", - "id": "zelk12/T31122024203920-gemma-2-9B", - "developer": "zelk12", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7676 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.6096 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2054 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3507 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4322 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4373 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/zelk12/Test01012025155054/6d681a29-0d1a-4054-8250-5246993509f8.json b/data/hfopenllm_v2/zelk12/Test01012025155054/6d681a29-0d1a-4054-8250-5246993509f8.json deleted file mode 100644 index 5d2d4db3f..000000000 --- a/data/hfopenllm_v2/zelk12/Test01012025155054/6d681a29-0d1a-4054-8250-5246993509f8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/zelk12_Test01012025155054/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Test01012025155054", - "id": "zelk12/Test01012025155054", - "developer": "zelk12", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 3.817 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1555 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.283 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - 
"evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2416 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.367 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.109 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/zelk12/Test01012025155054t0.5_gemma-2/2a6af4ce-e45c-4721-a23c-03071a5e774f.json b/data/hfopenllm_v2/zelk12/Test01012025155054t0.5_gemma-2/2a6af4ce-e45c-4721-a23c-03071a5e774f.json deleted file mode 100644 index e5cfc1c87..000000000 --- a/data/hfopenllm_v2/zelk12/Test01012025155054t0.5_gemma-2/2a6af4ce-e45c-4721-a23c-03071a5e774f.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/zelk12_Test01012025155054t0.5_gemma-2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Test01012025155054t0.5_gemma-2", - "id": "zelk12/Test01012025155054t0.5_gemma-2", - "developer": "zelk12", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 3.817 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1555 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.283 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, 
- { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2416 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.367 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.109 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/zelk12/gemma-2-S2MTM-9B/5ae5ddff-714d-4a20-b1d3-3eeb95fd858c.json b/data/hfopenllm_v2/zelk12/gemma-2-S2MTM-9B/5ae5ddff-714d-4a20-b1d3-3eeb95fd858c.json deleted file mode 100644 index 18e9a3529..000000000 --- a/data/hfopenllm_v2/zelk12/gemma-2-S2MTM-9B/5ae5ddff-714d-4a20-b1d3-3eeb95fd858c.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/zelk12_gemma-2-S2MTM-9B/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "gemma-2-S2MTM-9B", - "id": "zelk12/gemma-2-S2MTM-9B", - "developer": "zelk12", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7823 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6061 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2047 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3456 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4218 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4297 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/zelk12/recoilme-gemma-2-Ataraxy-9B-v0.1-t0.25/60052d34-f6a7-4204-baea-532f5ba29880.json b/data/hfopenllm_v2/zelk12/recoilme-gemma-2-Ataraxy-9B-v0.1-t0.25/60052d34-f6a7-4204-baea-532f5ba29880.json deleted file mode 100644 index 2e6a12b87..000000000 --- a/data/hfopenllm_v2/zelk12/recoilme-gemma-2-Ataraxy-9B-v0.1-t0.25/60052d34-f6a7-4204-baea-532f5ba29880.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/zelk12_recoilme-gemma-2-Ataraxy-9B-v0.1-t0.25/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "recoilme-gemma-2-Ataraxy-9B-v0.1-t0.25", - "id": "zelk12/recoilme-gemma-2-Ataraxy-9B-v0.1-t0.25", - "developer": "zelk12", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7707 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6075 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2145 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3431 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - 
"dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4323 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.44 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/zelk12/recoilme-gemma-2-Ataraxy-9B-v0.1-t0.75/e1ddd882-f8a1-48d0-bb2a-878f43095895.json b/data/hfopenllm_v2/zelk12/recoilme-gemma-2-Ataraxy-9B-v0.1-t0.75/e1ddd882-f8a1-48d0-bb2a-878f43095895.json deleted file mode 100644 index c498d42d2..000000000 --- a/data/hfopenllm_v2/zelk12/recoilme-gemma-2-Ataraxy-9B-v0.1-t0.75/e1ddd882-f8a1-48d0-bb2a-878f43095895.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/zelk12_recoilme-gemma-2-Ataraxy-9B-v0.1-t0.75/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "recoilme-gemma-2-Ataraxy-9B-v0.1-t0.75", - "id": "zelk12/recoilme-gemma-2-Ataraxy-9B-v0.1-t0.75", - "developer": "zelk12", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7208 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5995 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2017 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3498 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on 
MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3951 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4141 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/zelk12/recoilme-gemma-2-Ataraxy-9B-v0.1/d2c3edec-38d8-48e3-9f6d-e26a63442af8.json b/data/hfopenllm_v2/zelk12/recoilme-gemma-2-Ataraxy-9B-v0.1/d2c3edec-38d8-48e3-9f6d-e26a63442af8.json deleted file mode 100644 index eeaa69af5..000000000 --- a/data/hfopenllm_v2/zelk12/recoilme-gemma-2-Ataraxy-9B-v0.1/d2c3edec-38d8-48e3-9f6d-e26a63442af8.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/zelk12_recoilme-gemma-2-Ataraxy-9B-v0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "recoilme-gemma-2-Ataraxy-9B-v0.1", - "id": "zelk12/recoilme-gemma-2-Ataraxy-9B-v0.1", - "developer": "zelk12", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7649 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6075 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2281 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3498 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4136 - } - }, - { - "evaluation_name": 
"MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4321 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/zelk12/recoilme-gemma-2-Ataraxy-9B-v0.2/dcfafe94-dacb-4e7a-9365-8bb39ecb79ec.json b/data/hfopenllm_v2/zelk12/recoilme-gemma-2-Ataraxy-9B-v0.2/dcfafe94-dacb-4e7a-9365-8bb39ecb79ec.json deleted file mode 100644 index d481e65cf..000000000 --- a/data/hfopenllm_v2/zelk12/recoilme-gemma-2-Ataraxy-9B-v0.2/dcfafe94-dacb-4e7a-9365-8bb39ecb79ec.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/zelk12_recoilme-gemma-2-Ataraxy-9B-v0.2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "recoilme-gemma-2-Ataraxy-9B-v0.2", - "id": "zelk12/recoilme-gemma-2-Ataraxy-9B-v0.2", - "developer": "zelk12", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.76 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6066 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2228 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3482 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.411 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": 
"Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4323 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/zelk12/recoilme-gemma-2-Gutenberg-Doppel-9B-v0.1/8ca0e602-bf6b-4d15-95c2-a0d47e78ded0.json b/data/hfopenllm_v2/zelk12/recoilme-gemma-2-Gutenberg-Doppel-9B-v0.1/8ca0e602-bf6b-4d15-95c2-a0d47e78ded0.json deleted file mode 100644 index 93f6e8d52..000000000 --- a/data/hfopenllm_v2/zelk12/recoilme-gemma-2-Gutenberg-Doppel-9B-v0.1/8ca0e602-bf6b-4d15-95c2-a0d47e78ded0.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/zelk12_recoilme-gemma-2-Gutenberg-Doppel-9B-v0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "recoilme-gemma-2-Gutenberg-Doppel-9B-v0.1", - "id": "zelk12/recoilme-gemma-2-Gutenberg-Doppel-9B-v0.1", - "developer": "zelk12", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7615 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6099 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.21 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3414 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.431 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.4315 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/zelk12/recoilme-gemma-2-Ifable-9B-v0.1/fc262523-dcde-4b45-80ba-2922e66d42c4.json b/data/hfopenllm_v2/zelk12/recoilme-gemma-2-Ifable-9B-v0.1/fc262523-dcde-4b45-80ba-2922e66d42c4.json deleted file mode 100644 index 5aa27b1eb..000000000 --- a/data/hfopenllm_v2/zelk12/recoilme-gemma-2-Ifable-9B-v0.1/fc262523-dcde-4b45-80ba-2922e66d42c4.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/zelk12_recoilme-gemma-2-Ifable-9B-v0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "recoilme-gemma-2-Ifable-9B-v0.1", - "id": "zelk12/recoilme-gemma-2-Ifable-9B-v0.1", - "developer": "zelk12", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7944 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6064 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2205 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3515 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4202 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4323 - } - } - ] -} \ No newline at end of file diff --git 
a/data/hfopenllm_v2/zelk12/recoilme-gemma-2-psy10k-mental_healt-9B-v0.1/f8d745da-9867-4348-bace-d8052c3b4025.json b/data/hfopenllm_v2/zelk12/recoilme-gemma-2-psy10k-mental_healt-9B-v0.1/f8d745da-9867-4348-bace-d8052c3b4025.json deleted file mode 100644 index 283796808..000000000 --- a/data/hfopenllm_v2/zelk12/recoilme-gemma-2-psy10k-mental_healt-9B-v0.1/f8d745da-9867-4348-bace-d8052c3b4025.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/zelk12_recoilme-gemma-2-psy10k-mental_healt-9B-v0.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "recoilme-gemma-2-psy10k-mental_healt-9B-v0.1", - "id": "zelk12/recoilme-gemma-2-psy10k-mental_healt-9B-v0.1", - "developer": "zelk12", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Gemma2ForCausalLM", - "params_billions": 10.159 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7445 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5978 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1888 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.344 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4295 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4181 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/zetasepic/Qwen2.5-32B-Instruct-abliterated-v2/3d410f0f-6b24-4e86-a353-6142c51b1ecc.json 
b/data/hfopenllm_v2/zetasepic/Qwen2.5-32B-Instruct-abliterated-v2/3d410f0f-6b24-4e86-a353-6142c51b1ecc.json deleted file mode 100644 index 165cc965d..000000000 --- a/data/hfopenllm_v2/zetasepic/Qwen2.5-32B-Instruct-abliterated-v2/3d410f0f-6b24-4e86-a353-6142c51b1ecc.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/zetasepic_Qwen2.5-32B-Instruct-abliterated-v2/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-32B-Instruct-abliterated-v2", - "id": "zetasepic/Qwen2.5-32B-Instruct-abliterated-v2", - "developer": "zetasepic", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 32.764 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8334 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6934 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5952 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3674 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4354 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5622 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/zetasepic/Qwen2.5-72B-Instruct-abliterated/46329fc3-974f-4d04-be9e-ba85b3816efc.json b/data/hfopenllm_v2/zetasepic/Qwen2.5-72B-Instruct-abliterated/46329fc3-974f-4d04-be9e-ba85b3816efc.json deleted file mode 100644 index 
5b3165605..000000000 --- a/data/hfopenllm_v2/zetasepic/Qwen2.5-72B-Instruct-abliterated/46329fc3-974f-4d04-be9e-ba85b3816efc.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "hfopenllm_v2/zetasepic_Qwen2.5-72B-Instruct-abliterated/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen2.5-72B-Instruct-abliterated", - "id": "zetasepic/Qwen2.5-72B-Instruct-abliterated", - "developer": "zetasepic", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "Qwen2ForCausalLM", - "params_billions": 72.706 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7153 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7152 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5242 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4069 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4719 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5872 - } - } - ] -} \ No newline at end of file diff --git a/data/hfopenllm_v2/zhengr/MixTAO-7Bx2-MoE-v8.1/b964d0a4-7c44-4ea2-894e-3e1ca30321e0.json b/data/hfopenllm_v2/zhengr/MixTAO-7Bx2-MoE-v8.1/b964d0a4-7c44-4ea2-894e-3e1ca30321e0.json deleted file mode 100644 index b0791265c..000000000 --- a/data/hfopenllm_v2/zhengr/MixTAO-7Bx2-MoE-v8.1/b964d0a4-7c44-4ea2-894e-3e1ca30321e0.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"hfopenllm_v2/zhengr_MixTAO-7Bx2-MoE-v8.1/1770682486.623709", - "retrieved_timestamp": "1770682486.623709", - "source_metadata": { - "source_name": "HF Open LLM v2", - "source_type": "documentation", - "source_organization_name": "Hugging Face", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "MixTAO-7Bx2-MoE-v8.1", - "id": "zhengr/MixTAO-7Bx2-MoE-v8.1", - "developer": "zhengr", - "inference_platform": "unknown", - "additional_details": { - "precision": "bfloat16", - "architecture": "MixtralForCausalLM", - "params_billions": 12.879 - } - }, - "evaluation_results": [ - { - "evaluation_name": "IFEval", - "source_data": { - "dataset_name": "IFEval", - "source_type": "hf_dataset", - "hf_repo": "google/IFEval" - }, - "metric_config": { - "evaluation_description": "Accuracy on IFEval", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4188 - } - }, - { - "evaluation_name": "BBH", - "source_data": { - "dataset_name": "BBH", - "source_type": "hf_dataset", - "hf_repo": "SaylorTwift/bbh" - }, - "metric_config": { - "evaluation_description": "Accuracy on BBH", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4202 - } - }, - { - "evaluation_name": "MATH Level 5", - "source_data": { - "dataset_name": "MATH Level 5", - "source_type": "hf_dataset", - "hf_repo": "DigitalLearningGmbH/MATH-lighteval" - }, - "metric_config": { - "evaluation_description": "Exact Match on MATH Level 5", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0604 - } - }, - { - "evaluation_name": "GPQA", - "source_data": { - "dataset_name": "GPQA", - "source_type": "hf_dataset", - "hf_repo": "Idavidrein/gpqa" - }, - "metric_config": { - "evaluation_description": "Accuracy on GPQA", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2987 - } - }, - { - "evaluation_name": "MUSR", - "source_data": { - "dataset_name": "MUSR", - "source_type": "hf_dataset", - "hf_repo": "TAUR-Lab/MuSR" - }, - "metric_config": { - "evaluation_description": "Accuracy on MUSR", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3976 - } - }, - { - "evaluation_name": "MMLU-PRO", - "source_data": { - "dataset_name": "MMLU-PRO", - "source_type": "hf_dataset", - "hf_repo": "TIGER-Lab/MMLU-Pro" - }, - "metric_config": { - "evaluation_description": "Accuracy on MMLU-PRO", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2847 - } - } - ] -} \ No newline at end of file diff --git a/data/livecodebenchpro/alibaba/qwen3-235b-a22b-thinking-2507/126326f3-6521-45d1-aa14-5c51335c1929.json b/data/livecodebenchpro/alibaba/qwen3-235b-a22b-thinking-2507/126326f3-6521-45d1-aa14-5c51335c1929.json deleted file mode 100644 index f3d8ed859..000000000 --- a/data/livecodebenchpro/alibaba/qwen3-235b-a22b-thinking-2507/126326f3-6521-45d1-aa14-5c51335c1929.json +++ /dev/null @@ -1,79 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "livecodebenchpro/qwen3-235b-a22b-thinking-2507/1760492095.8105888", - "retrieved_timestamp": "1760492095.8105888", - "source_metadata": { - "source_organization_name": "New York University, Princeton University, 
University of California San Diego, University of Washington and Canyon Crest Academy", - "evaluator_relationship": "third_party", - "source_name": "Live Code Bench Pro", - "source_type": "documentation" - }, - "model_info": { - "name": "qwen3-235b-a22b-thinking-2507", - "developer": "Alibaba", - "inference_platform": "aliyun", - "id": "alibaba/qwen3-235b-a22b-thinking-2507" - }, - "evaluation_results": [ - { - "evaluation_name": "Hard Problems", - "metric_config": { - "evaluation_description": "Pass@1 on Hard Problems", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0, - "max_score": 1 - }, - "score_details": { - "score": 0.0 - }, - "source_data": { - "dataset_name": "Hard Problems", - "source_type": "url", - "url": [ - "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=hard&benchmark_mode=live" - ] - } - }, - { - "evaluation_name": "Medium Problems", - "metric_config": { - "evaluation_description": "Pass@1 on Medium Problems", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0, - "max_score": 1 - }, - "score_details": { - "score": 0.1267605633802817 - }, - "source_data": { - "dataset_name": "Medium Problems", - "source_type": "url", - "url": [ - "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=medium&benchmark_mode=live" - ] - } - }, - { - "evaluation_name": "Easy Problems", - "metric_config": { - "evaluation_description": "Pass@1 on Easy Problems", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0, - "max_score": 1 - }, - "score_details": { - "score": 0.7605633802816901 - }, - "source_data": { - "dataset_name": "Easy Problems", - "source_type": "url", - "url": [ - "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=easy&benchmark_mode=live" - ] - } - } - ] -} diff --git a/data/livecodebenchpro/alibaba/qwen3-30b-a3b/b3f5937a-1489-417b-8162-6c62dea0703d.json b/data/livecodebenchpro/alibaba/qwen3-30b-a3b/b3f5937a-1489-417b-8162-6c62dea0703d.json deleted file mode 100644 index 86221fbb2..000000000 --- a/data/livecodebenchpro/alibaba/qwen3-30b-a3b/b3f5937a-1489-417b-8162-6c62dea0703d.json +++ /dev/null @@ -1,79 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "livecodebenchpro/qwen3-30b-a3b/1760492095.8105888", - "retrieved_timestamp": "1760492095.8105888", - "source_metadata": { - "source_organization_name": "New York University, Princeton University, University of California San Diego, University of Washington and Canyon Crest Academy", - "evaluator_relationship": "third_party", - "source_name": "Live Code Bench Pro", - "source_type": "documentation" - }, - "model_info": { - "name": "qwen3-30b-a3b", - "developer": "Alibaba", - "inference_platform": "aliyun", - "id": "alibaba/qwen3-30b-a3b" - }, - "evaluation_results": [ - { - "evaluation_name": "Hard Problems", - "metric_config": { - "evaluation_description": "Pass@1 on Hard Problems", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0, - "max_score": 1 - }, - "score_details": { - "score": 0.0 - }, - "source_data": { - "dataset_name": "Hard Problems", - "source_type": "url", - "url": [ - "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=hard&benchmark_mode=live" - ] - } - }, - { - "evaluation_name": "Medium Problems", - "metric_config": { - "evaluation_description": "Pass@1 on Medium Problems", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0, - "max_score": 1 - }, - "score_details": { - "score": 
0.028169014084507043 - }, - "source_data": { - "dataset_name": "Medium Problems", - "source_type": "url", - "url": [ - "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=medium&benchmark_mode=live" - ] - } - }, - { - "evaluation_name": "Easy Problems", - "metric_config": { - "evaluation_description": "Pass@1 on Easy Problems", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0, - "max_score": 1 - }, - "score_details": { - "score": 0.5774647887323944 - }, - "source_data": { - "dataset_name": "Easy Problems", - "source_type": "url", - "url": [ - "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=easy&benchmark_mode=live" - ] - } - } - ] -} diff --git a/data/livecodebenchpro/alibaba/qwen3-max/f06d6c4c-b2c4-4c48-9702-f0bf08af62c4.json b/data/livecodebenchpro/alibaba/qwen3-max/f06d6c4c-b2c4-4c48-9702-f0bf08af62c4.json deleted file mode 100644 index f517719a6..000000000 --- a/data/livecodebenchpro/alibaba/qwen3-max/f06d6c4c-b2c4-4c48-9702-f0bf08af62c4.json +++ /dev/null @@ -1,79 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "livecodebenchpro/alibaba/qwen3-max/1760492095.8105888", - "retrieved_timestamp": "1760492095.8105888", - "source_metadata": { - "source_organization_name": "New York University, Princeton University, University of California San Diego, University of Washington and Canyon Crest Academy", - "evaluator_relationship": "third_party", - "source_name": "Live Code Bench Pro", - "source_type": "documentation" - }, - "model_info": { - "name": "alibaba/qwen3-max", - "developer": "Alibaba", - "inference_platform": "openrouter", - "id": "alibaba/qwen3-max" - }, - "evaluation_results": [ - { - "evaluation_name": "Hard Problems", - "metric_config": { - "evaluation_description": "Pass@1 on Hard Problems", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0, - "max_score": 1 - }, - "score_details": { - "score": 0.0 - }, - "source_data": { - "dataset_name": "Hard Problems", - "source_type": "url", - "url": [ - "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=hard&benchmark_mode=live" - ] - } - }, - { - "evaluation_name": "Medium Problems", - "metric_config": { - "evaluation_description": "Pass@1 on Medium Problems", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0, - "max_score": 1 - }, - "score_details": { - "score": 0.04225352112676056 - }, - "source_data": { - "dataset_name": "Medium Problems", - "source_type": "url", - "url": [ - "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=medium&benchmark_mode=live" - ] - } - }, - { - "evaluation_name": "Easy Problems", - "metric_config": { - "evaluation_description": "Pass@1 on Easy Problems", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0, - "max_score": 1 - }, - "score_details": { - "score": 0.36619718309859156 - }, - "source_data": { - "dataset_name": "Easy Problems", - "source_type": "url", - "url": [ - "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=easy&benchmark_mode=live" - ] - } - } - ] -} diff --git a/data/livecodebenchpro/alibaba/qwen3-next-80b-a3b-thinking/809a1503-a161-4532-afd3-fdbd6551eb63.json b/data/livecodebenchpro/alibaba/qwen3-next-80b-a3b-thinking/809a1503-a161-4532-afd3-fdbd6551eb63.json deleted file mode 100644 index 3255f5bca..000000000 --- a/data/livecodebenchpro/alibaba/qwen3-next-80b-a3b-thinking/809a1503-a161-4532-afd3-fdbd6551eb63.json +++ /dev/null @@ -1,79 +0,0 @@ -{ - "schema_version": 
"0.2.0", - "evaluation_id": "livecodebenchpro/qwen3-next-80b-a3b-thinking/1760492095.8105888", - "retrieved_timestamp": "1760492095.8105888", - "source_metadata": { - "source_organization_name": "New York University, Princeton University, University of California San Diego, University of Washington and Canyon Crest Academy", - "evaluator_relationship": "third_party", - "source_name": "Live Code Bench Pro", - "source_type": "documentation" - }, - "model_info": { - "name": "qwen3-next-80b-a3b-thinking", - "developer": "Alibaba", - "inference_platform": "aliyun", - "id": "alibaba/qwen3-next-80b-a3b-thinking" - }, - "evaluation_results": [ - { - "evaluation_name": "Hard Problems", - "metric_config": { - "evaluation_description": "Pass@1 on Hard Problems", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0, - "max_score": 1 - }, - "score_details": { - "score": 0.0 - }, - "source_data": { - "dataset_name": "Hard Problems", - "source_type": "url", - "url": [ - "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=hard&benchmark_mode=live" - ] - } - }, - { - "evaluation_name": "Medium Problems", - "metric_config": { - "evaluation_description": "Pass@1 on Medium Problems", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0, - "max_score": 1 - }, - "score_details": { - "score": 0.14084507042253522 - }, - "source_data": { - "dataset_name": "Medium Problems", - "source_type": "url", - "url": [ - "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=medium&benchmark_mode=live" - ] - } - }, - { - "evaluation_name": "Easy Problems", - "metric_config": { - "evaluation_description": "Pass@1 on Easy Problems", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0, - "max_score": 1 - }, - "score_details": { - "score": 0.7464788732394366 - }, - "source_data": { - "dataset_name": "Easy Problems", - "source_type": "url", - "url": [ - "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=easy&benchmark_mode=live" - ] - } - } - ] -} diff --git a/data/livecodebenchpro/aliyun/qwen3-next-80b-a3b-thinking/808ca8e4-9b14-48ba-bb39-e3b6a5672c80.json b/data/livecodebenchpro/aliyun/qwen3-next-80b-a3b-thinking/808ca8e4-9b14-48ba-bb39-e3b6a5672c80.json deleted file mode 100644 index 78b03c308..000000000 --- a/data/livecodebenchpro/aliyun/qwen3-next-80b-a3b-thinking/808ca8e4-9b14-48ba-bb39-e3b6a5672c80.json +++ /dev/null @@ -1,79 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "livecodebenchpro/qwen3-next-80b-a3b-thinking/1770683238.099205", - "retrieved_timestamp": "1770683238.099205", - "source_metadata": { - "source_name": "Live Code Bench Pro", - "source_type": "documentation", - "source_organization_name": "New York University, Princeton University, University of California San Diego, University of Washington and Canyon Crest Academy", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "qwen3-next-80b-a3b-thinking", - "id": "aliyun/qwen3-next-80b-a3b-thinking", - "developer": "aliyun", - "inference_platform": "aliyun" - }, - "evaluation_results": [ - { - "evaluation_name": "Hard Problems", - "source_data": { - "dataset_name": "Hard Problems", - "source_type": "url", - "url": [ - "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=hard&benchmark_mode=live" - ] - }, - "metric_config": { - "evaluation_description": "Pass@1 on Hard Problems", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - 
}, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "Medium Problems", - "source_data": { - "dataset_name": "Medium Problems", - "source_type": "url", - "url": [ - "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=medium&benchmark_mode=live" - ] - }, - "metric_config": { - "evaluation_description": "Pass@1 on Medium Problems", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0704 - } - }, - { - "evaluation_name": "Easy Problems", - "source_data": { - "dataset_name": "Easy Problems", - "source_type": "url", - "url": [ - "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=easy&benchmark_mode=live" - ] - }, - "metric_config": { - "evaluation_description": "Pass@1 on Easy Problems", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6901 - } - } - ] -} \ No newline at end of file diff --git a/data/livecodebenchpro/anthropic/claude-3-7-sonnet-20250219/be076445-eb88-49b0-a855-2e0cb1551bab.json b/data/livecodebenchpro/anthropic/claude-3-7-sonnet-20250219/be076445-eb88-49b0-a855-2e0cb1551bab.json deleted file mode 100644 index 6816347c8..000000000 --- a/data/livecodebenchpro/anthropic/claude-3-7-sonnet-20250219/be076445-eb88-49b0-a855-2e0cb1551bab.json +++ /dev/null @@ -1,79 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "livecodebenchpro/claude-3-7-sonnet-20250219/1760492095.8105888", - "retrieved_timestamp": "1760492095.8105888", - "source_metadata": { - "source_organization_name": "New York University, Princeton University, University of California San Diego, University of Washington and Canyon Crest Academy", - "evaluator_relationship": "third_party", - "source_name": "Live Code Bench Pro", - "source_type": "documentation" - }, - "model_info": { - "name": "claude-3-7-sonnet-20250219", - "developer": "Anthropic", - "inference_platform": "anthropic", - "id": "anthropic/claude-3-7-sonnet-20250219" - }, - "evaluation_results": [ - { - "evaluation_name": "Hard Problems", - "metric_config": { - "evaluation_description": "Pass@1 on Hard Problems", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0, - "max_score": 1 - }, - "score_details": { - "score": 0.0 - }, - "source_data": { - "dataset_name": "Hard Problems", - "source_type": "url", - "url": [ - "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=hard&benchmark_mode=live" - ] - } - }, - { - "evaluation_name": "Medium Problems", - "metric_config": { - "evaluation_description": "Pass@1 on Medium Problems", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0, - "max_score": 1 - }, - "score_details": { - "score": 0.0 - }, - "source_data": { - "dataset_name": "Medium Problems", - "source_type": "url", - "url": [ - "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=medium&benchmark_mode=live" - ] - } - }, - { - "evaluation_name": "Easy Problems", - "metric_config": { - "evaluation_description": "Pass@1 on Easy Problems", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0, - "max_score": 1 - }, - "score_details": { - "score": 0.28169014084507044 - }, - "source_data": { - "dataset_name": "Easy Problems", - "source_type": "url", - "url": [ - "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=easy&benchmark_mode=live" - ] - } - } - ] -} diff --git 
a/data/livecodebenchpro/anthropic/claude-3.7-sonnet/69210faf-04a8-46d4-b92b-94f2ca521c09.json b/data/livecodebenchpro/anthropic/claude-3.7-sonnet/69210faf-04a8-46d4-b92b-94f2ca521c09.json deleted file mode 100644 index 586366674..000000000 --- a/data/livecodebenchpro/anthropic/claude-3.7-sonnet/69210faf-04a8-46d4-b92b-94f2ca521c09.json +++ /dev/null @@ -1,79 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "livecodebenchpro/anthropic/claude-3.7-sonnet/1760492095.8105888", - "retrieved_timestamp": "1760492095.8105888", - "source_metadata": { - "source_organization_name": "New York University, Princeton University, University of California San Diego, University of Washington and Canyon Crest Academy", - "evaluator_relationship": "third_party", - "source_name": "Live Code Bench Pro", - "source_type": "documentation" - }, - "model_info": { - "name": "anthropic/claude-3.7-sonnet", - "developer": "Anthropic", - "inference_platform": "openrouter", - "id": "anthropic/claude-3.7-sonnet" - }, - "evaluation_results": [ - { - "evaluation_name": "Hard Problems", - "metric_config": { - "evaluation_description": "Pass@1 on Hard Problems", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0, - "max_score": 1 - }, - "score_details": { - "score": 0.0 - }, - "source_data": { - "dataset_name": "Hard Problems", - "source_type": "url", - "url": [ - "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=hard&benchmark_mode=live" - ] - } - }, - { - "evaluation_name": "Medium Problems", - "metric_config": { - "evaluation_description": "Pass@1 on Medium Problems", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0, - "max_score": 1 - }, - "score_details": { - "score": 0.014084507042253521 - }, - "source_data": { - "dataset_name": "Medium Problems", - "source_type": "url", - "url": [ - "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=medium&benchmark_mode=live" - ] - } - }, - { - "evaluation_name": "Easy Problems", - "metric_config": { - "evaluation_description": "Pass@1 on Easy Problems", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0, - "max_score": 1 - }, - "score_details": { - "score": 0.15492957746478872 - }, - "source_data": { - "dataset_name": "Easy Problems", - "source_type": "url", - "url": [ - "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=easy&benchmark_mode=live" - ] - } - } - ] -} diff --git a/data/livecodebenchpro/anthropic/claude-sonnet-4-5-20250929/ed293aa1-f64e-429d-bddf-91a35a4203d1.json b/data/livecodebenchpro/anthropic/claude-sonnet-4-5-20250929/ed293aa1-f64e-429d-bddf-91a35a4203d1.json deleted file mode 100644 index 304dcadde..000000000 --- a/data/livecodebenchpro/anthropic/claude-sonnet-4-5-20250929/ed293aa1-f64e-429d-bddf-91a35a4203d1.json +++ /dev/null @@ -1,79 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "livecodebenchpro/claude-sonnet-4-5-20250929/1770683238.099205", - "retrieved_timestamp": "1770683238.099205", - "source_metadata": { - "source_name": "Live Code Bench Pro", - "source_type": "documentation", - "source_organization_name": "New York University, Princeton University, University of California San Diego, University of Washington and Canyon Crest Academy", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "claude-sonnet-4-5-20250929", - "id": "anthropic/claude-sonnet-4-5-20250929", - "developer": "anthropic", - "inference_platform": "anthropic" - }, - "evaluation_results": [ - { - 
"evaluation_name": "Hard Problems", - "source_data": { - "dataset_name": "Hard Problems", - "source_type": "url", - "url": [ - "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=hard&benchmark_mode=live" - ] - }, - "metric_config": { - "evaluation_description": "Pass@1 on Hard Problems", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "Medium Problems", - "source_data": { - "dataset_name": "Medium Problems", - "source_type": "url", - "url": [ - "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=medium&benchmark_mode=live" - ] - }, - "metric_config": { - "evaluation_description": "Pass@1 on Medium Problems", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "Easy Problems", - "source_data": { - "dataset_name": "Easy Problems", - "source_type": "url", - "url": [ - "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=easy&benchmark_mode=live" - ] - }, - "metric_config": { - "evaluation_description": "Pass@1 on Easy Problems", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5352 - } - } - ] -} \ No newline at end of file diff --git a/data/livecodebenchpro/ark/ep-20250603132404-cgpjm/2bddd388-5e9a-423e-8767-37d6f9f69032.json b/data/livecodebenchpro/ark/ep-20250603132404-cgpjm/2bddd388-5e9a-423e-8767-37d6f9f69032.json deleted file mode 100644 index 5c18e44af..000000000 --- a/data/livecodebenchpro/ark/ep-20250603132404-cgpjm/2bddd388-5e9a-423e-8767-37d6f9f69032.json +++ /dev/null @@ -1,79 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "livecodebenchpro/ep-20250603132404-cgpjm/1770683238.099205", - "retrieved_timestamp": "1770683238.099205", - "source_metadata": { - "source_name": "Live Code Bench Pro", - "source_type": "documentation", - "source_organization_name": "New York University, Princeton University, University of California San Diego, University of Washington and Canyon Crest Academy", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ep-20250603132404-cgpjm", - "id": "ark/ep-20250603132404-cgpjm", - "developer": "ark", - "inference_platform": "ark" - }, - "evaluation_results": [ - { - "evaluation_name": "Hard Problems", - "source_data": { - "dataset_name": "Hard Problems", - "source_type": "url", - "url": [ - "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=hard&benchmark_mode=live" - ] - }, - "metric_config": { - "evaluation_description": "Pass@1 on Hard Problems", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - } - }, - { - "evaluation_name": "Medium Problems", - "source_data": { - "dataset_name": "Medium Problems", - "source_type": "url", - "url": [ - "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=medium&benchmark_mode=live" - ] - }, - "metric_config": { - "evaluation_description": "Pass@1 on Medium Problems", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0141 - } - }, - { - "evaluation_name": "Easy Problems", - "source_data": { - "dataset_name": "Easy Problems", - "source_type": "url", - "url": [ - 
"https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=easy&benchmark_mode=live" - ] - }, - "metric_config": { - "evaluation_description": "Pass@1 on Easy Problems", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.507 - } - } - ] -} \ No newline at end of file diff --git a/data/livecodebenchpro/bytedance/doubao-seed-1-6-thinking-250615/bfd991ca-13e9-4716-b389-11e0d2afe286.json b/data/livecodebenchpro/bytedance/doubao-seed-1-6-thinking-250615/bfd991ca-13e9-4716-b389-11e0d2afe286.json deleted file mode 100644 index 14a043adb..000000000 --- a/data/livecodebenchpro/bytedance/doubao-seed-1-6-thinking-250615/bfd991ca-13e9-4716-b389-11e0d2afe286.json +++ /dev/null @@ -1,79 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "livecodebenchpro/doubao-seed-1-6-thinking-250615/1760492095.8105888", - "retrieved_timestamp": "1760492095.8105888", - "source_metadata": { - "source_organization_name": "New York University, Princeton University, University of California San Diego, University of Washington and Canyon Crest Academy", - "evaluator_relationship": "third_party", - "source_name": "Live Code Bench Pro", - "source_type": "documentation" - }, - "model_info": { - "name": "doubao-seed-1-6-thinking-250615", - "developer": "ByteDance", - "inference_platform": "ark", - "id": "bytedance/doubao-seed-1-6-thinking-250615" - }, - "evaluation_results": [ - { - "evaluation_name": "Hard Problems", - "metric_config": { - "evaluation_description": "Pass@1 on Hard Problems", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0, - "max_score": 1 - }, - "score_details": { - "score": 0.0 - }, - "source_data": { - "dataset_name": "Hard Problems", - "source_type": "url", - "url": [ - "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=hard&benchmark_mode=live" - ] - } - }, - { - "evaluation_name": "Medium Problems", - "metric_config": { - "evaluation_description": "Pass@1 on Medium Problems", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0, - "max_score": 1 - }, - "score_details": { - "score": 0.07042253521126761 - }, - "source_data": { - "dataset_name": "Medium Problems", - "source_type": "url", - "url": [ - "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=medium&benchmark_mode=live" - ] - } - }, - { - "evaluation_name": "Easy Problems", - "metric_config": { - "evaluation_description": "Pass@1 on Easy Problems", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0, - "max_score": 1 - }, - "score_details": { - "score": 0.5774647887323944 - }, - "source_data": { - "dataset_name": "Easy Problems", - "source_type": "url", - "url": [ - "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=easy&benchmark_mode=live" - ] - } - } - ] -} diff --git a/data/livecodebenchpro/deepseek/chat-v3-0324/b29b7c8e-759e-45fe-a9d3-1054f19af617.json b/data/livecodebenchpro/deepseek/chat-v3-0324/b29b7c8e-759e-45fe-a9d3-1054f19af617.json deleted file mode 100644 index 88ca0a5d0..000000000 --- a/data/livecodebenchpro/deepseek/chat-v3-0324/b29b7c8e-759e-45fe-a9d3-1054f19af617.json +++ /dev/null @@ -1,79 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "livecodebenchpro/deepseek/chat-v3-0324/1760492095.8105888", - "retrieved_timestamp": "1760492095.8105888", - "source_metadata": { - "source_organization_name": "New York University, Princeton University, University of California San Diego, 
University of Washington and Canyon Crest Academy", - "evaluator_relationship": "third_party", - "source_name": "Live Code Bench Pro", - "source_type": "documentation" - }, - "model_info": { - "name": "deepseek/chat-v3-0324", - "developer": "DeepSeek", - "inference_platform": "openrouter", - "id": "deepseek/chat-v3-0324" - }, - "evaluation_results": [ - { - "evaluation_name": "Hard Problems", - "metric_config": { - "evaluation_description": "Pass@1 on Hard Problems", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0, - "max_score": 1 - }, - "score_details": { - "score": 0.0 - }, - "source_data": { - "dataset_name": "Hard Problems", - "source_type": "url", - "url": [ - "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=hard&benchmark_mode=live" - ] - } - }, - { - "evaluation_name": "Medium Problems", - "metric_config": { - "evaluation_description": "Pass@1 on Medium Problems", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0, - "max_score": 1 - }, - "score_details": { - "score": 0.0 - }, - "source_data": { - "dataset_name": "Medium Problems", - "source_type": "url", - "url": [ - "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=medium&benchmark_mode=live" - ] - } - }, - { - "evaluation_name": "Easy Problems", - "metric_config": { - "evaluation_description": "Pass@1 on Easy Problems", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0, - "max_score": 1 - }, - "score_details": { - "score": 0.19718309859154928 - }, - "source_data": { - "dataset_name": "Easy Problems", - "source_type": "url", - "url": [ - "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=easy&benchmark_mode=live" - ] - } - } - ] -} diff --git a/data/livecodebenchpro/deepseek/ep-20250214004308-p7n89/801d2dc6-17e7-47f1-a54f-87b94a59b508.json b/data/livecodebenchpro/deepseek/ep-20250214004308-p7n89/801d2dc6-17e7-47f1-a54f-87b94a59b508.json deleted file mode 100644 index 078e0a459..000000000 --- a/data/livecodebenchpro/deepseek/ep-20250214004308-p7n89/801d2dc6-17e7-47f1-a54f-87b94a59b508.json +++ /dev/null @@ -1,79 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "livecodebenchpro/ep-20250214004308-p7n89/1760492095.8105888", - "retrieved_timestamp": "1760492095.8105888", - "source_metadata": { - "source_organization_name": "New York University, Princeton University, University of California San Diego, University of Washington and Canyon Crest Academy", - "evaluator_relationship": "third_party", - "source_name": "Live Code Bench Pro", - "source_type": "documentation" - }, - "model_info": { - "name": "ep-20250214004308-p7n89", - "developer": "DeepSeek", - "inference_platform": "ark", - "id": "deepseek/ep-20250214004308-p7n89" - }, - "evaluation_results": [ - { - "evaluation_name": "Hard Problems", - "metric_config": { - "evaluation_description": "Pass@1 on Hard Problems", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0, - "max_score": 1 - }, - "score_details": { - "score": 0.0 - }, - "source_data": { - "dataset_name": "Hard Problems", - "source_type": "url", - "url": [ - "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=hard&benchmark_mode=live" - ] - } - }, - { - "evaluation_name": "Medium Problems", - "metric_config": { - "evaluation_description": "Pass@1 on Medium Problems", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0, - "max_score": 1 - }, - "score_details": { - "score": 0.014084507042253521 - }, 
- "source_data": { - "dataset_name": "Medium Problems", - "source_type": "url", - "url": [ - "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=medium&benchmark_mode=live" - ] - } - }, - { - "evaluation_name": "Easy Problems", - "metric_config": { - "evaluation_description": "Pass@1 on Easy Problems", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0, - "max_score": 1 - }, - "score_details": { - "score": 0.4225352112676056 - }, - "source_data": { - "dataset_name": "Easy Problems", - "source_type": "url", - "url": [ - "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=easy&benchmark_mode=live" - ] - } - } - ] -} diff --git a/data/livecodebenchpro/deepseek/ep-20250228232227-z44x5/def0b2e3-cf5f-4dfd-8f1c-827f98d1626a.json b/data/livecodebenchpro/deepseek/ep-20250228232227-z44x5/def0b2e3-cf5f-4dfd-8f1c-827f98d1626a.json deleted file mode 100644 index 4ea07e2df..000000000 --- a/data/livecodebenchpro/deepseek/ep-20250228232227-z44x5/def0b2e3-cf5f-4dfd-8f1c-827f98d1626a.json +++ /dev/null @@ -1,79 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "livecodebenchpro/ep-20250228232227-z44x5/1760492095.8105888", - "retrieved_timestamp": "1760492095.8105888", - "source_metadata": { - "source_organization_name": "New York University, Princeton University, University of California San Diego, University of Washington and Canyon Crest Academy", - "evaluator_relationship": "third_party", - "source_name": "Live Code Bench Pro", - "source_type": "documentation" - }, - "model_info": { - "name": "ep-20250228232227-z44x5", - "developer": "DeepSeek", - "inference_platform": "ark", - "id": "deepseek/ep-20250228232227-z44x5" - }, - "evaluation_results": [ - { - "evaluation_name": "Hard Problems", - "metric_config": { - "evaluation_description": "Pass@1 on Hard Problems", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0, - "max_score": 1 - }, - "score_details": { - "score": 0.0 - }, - "source_data": { - "dataset_name": "Hard Problems", - "source_type": "url", - "url": [ - "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=hard&benchmark_mode=live" - ] - } - }, - { - "evaluation_name": "Medium Problems", - "metric_config": { - "evaluation_description": "Pass@1 on Medium Problems", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0, - "max_score": 1 - }, - "score_details": { - "score": 0.0 - }, - "source_data": { - "dataset_name": "Medium Problems", - "source_type": "url", - "url": [ - "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=medium&benchmark_mode=live" - ] - } - }, - { - "evaluation_name": "Easy Problems", - "metric_config": { - "evaluation_description": "Pass@1 on Easy Problems", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0, - "max_score": 1 - }, - "score_details": { - "score": 0.1267605633802817 - }, - "source_data": { - "dataset_name": "Easy Problems", - "source_type": "url", - "url": [ - "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=easy&benchmark_mode=live" - ] - } - } - ] -} diff --git a/data/livecodebenchpro/deepseek/ep-20250603132404-cgpjm/157dd68b-fcc2-416f-a2c0-c9781020e6af.json b/data/livecodebenchpro/deepseek/ep-20250603132404-cgpjm/157dd68b-fcc2-416f-a2c0-c9781020e6af.json deleted file mode 100644 index 114e45638..000000000 --- a/data/livecodebenchpro/deepseek/ep-20250603132404-cgpjm/157dd68b-fcc2-416f-a2c0-c9781020e6af.json +++ /dev/null @@ -1,79 +0,0 @@ -{ - 
"schema_version": "0.2.0", - "evaluation_id": "livecodebenchpro/ep-20250603132404-cgpjm/1760492095.8105888", - "retrieved_timestamp": "1760492095.8105888", - "source_metadata": { - "source_organization_name": "New York University, Princeton University, University of California San Diego, University of Washington and Canyon Crest Academy", - "evaluator_relationship": "third_party", - "source_name": "Live Code Bench Pro", - "source_type": "documentation" - }, - "model_info": { - "name": "ep-20250603132404-cgpjm", - "developer": "DeepSeek", - "inference_platform": "ark", - "id": "deepseek/ep-20250603132404-cgpjm" - }, - "evaluation_results": [ - { - "evaluation_name": "Hard Problems", - "metric_config": { - "evaluation_description": "Pass@1 on Hard Problems", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0, - "max_score": 1 - }, - "score_details": { - "score": 0.0 - }, - "source_data": { - "dataset_name": "Hard Problems", - "source_type": "url", - "url": [ - "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=hard&benchmark_mode=live" - ] - } - }, - { - "evaluation_name": "Medium Problems", - "metric_config": { - "evaluation_description": "Pass@1 on Medium Problems", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0, - "max_score": 1 - }, - "score_details": { - "score": 0.08450704225352113 - }, - "source_data": { - "dataset_name": "Medium Problems", - "source_type": "url", - "url": [ - "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=medium&benchmark_mode=live" - ] - } - }, - { - "evaluation_name": "Easy Problems", - "metric_config": { - "evaluation_description": "Pass@1 on Easy Problems", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0, - "max_score": 1 - }, - "score_details": { - "score": 0.5774647887323944 - }, - "source_data": { - "dataset_name": "Easy Problems", - "source_type": "url", - "url": [ - "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=easy&benchmark_mode=live" - ] - } - } - ] -} diff --git a/data/livecodebenchpro/google/gemini-2.5-flash/174f0e23-84f1-43d0-bcdf-11b83c37025a.json b/data/livecodebenchpro/google/gemini-2.5-flash/174f0e23-84f1-43d0-bcdf-11b83c37025a.json deleted file mode 100644 index 57f7f41bd..000000000 --- a/data/livecodebenchpro/google/gemini-2.5-flash/174f0e23-84f1-43d0-bcdf-11b83c37025a.json +++ /dev/null @@ -1,79 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "livecodebenchpro/google/gemini-2.5-flash/1760492095.8105888", - "retrieved_timestamp": "1760492095.8105888", - "source_metadata": { - "source_organization_name": "New York University, Princeton University, University of California San Diego, University of Washington and Canyon Crest Academy", - "evaluator_relationship": "third_party", - "source_name": "Live Code Bench Pro", - "source_type": "documentation" - }, - "model_info": { - "name": "google/gemini-2.5-flash", - "developer": "Google", - "inference_platform": "openrouter", - "id": "google/gemini-2.5-flash" - }, - "evaluation_results": [ - { - "evaluation_name": "Hard Problems", - "metric_config": { - "evaluation_description": "Pass@1 on Hard Problems", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0, - "max_score": 1 - }, - "score_details": { - "score": 0.0 - }, - "source_data": { - "dataset_name": "Hard Problems", - "source_type": "url", - "url": [ - "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=hard&benchmark_mode=live" - ] - } - 
}, - { - "evaluation_name": "Medium Problems", - "metric_config": { - "evaluation_description": "Pass@1 on Medium Problems", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0, - "max_score": 1 - }, - "score_details": { - "score": 0.028169014084507043 - }, - "source_data": { - "dataset_name": "Medium Problems", - "source_type": "url", - "url": [ - "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=medium&benchmark_mode=live" - ] - } - }, - { - "evaluation_name": "Easy Problems", - "metric_config": { - "evaluation_description": "Pass@1 on Easy Problems", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0, - "max_score": 1 - }, - "score_details": { - "score": 0.38028169014084506 - }, - "source_data": { - "dataset_name": "Easy Problems", - "source_type": "url", - "url": [ - "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=easy&benchmark_mode=live" - ] - } - } - ] -} diff --git a/data/livecodebenchpro/google/gemini-2.5-pro/bef7254b-549f-4e6b-b5c8-31b84dc6acda.json b/data/livecodebenchpro/google/gemini-2.5-pro/bef7254b-549f-4e6b-b5c8-31b84dc6acda.json deleted file mode 100644 index a5be78bce..000000000 --- a/data/livecodebenchpro/google/gemini-2.5-pro/bef7254b-549f-4e6b-b5c8-31b84dc6acda.json +++ /dev/null @@ -1,79 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "livecodebenchpro/gemini-2.5-pro/1760492095.8105888", - "retrieved_timestamp": "1760492095.8105888", - "source_metadata": { - "source_organization_name": "New York University, Princeton University, University of California San Diego, University of Washington and Canyon Crest Academy", - "evaluator_relationship": "third_party", - "source_name": "Live Code Bench Pro", - "source_type": "documentation" - }, - "model_info": { - "name": "gemini-2.5-pro", - "developer": "Google", - "inference_platform": "google", - "id": "google/gemini-2.5-pro" - }, - "evaluation_results": [ - { - "evaluation_name": "Hard Problems", - "metric_config": { - "evaluation_description": "Pass@1 on Hard Problems", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0, - "max_score": 1 - }, - "score_details": { - "score": 0.014084507042253521 - }, - "source_data": { - "dataset_name": "Hard Problems", - "source_type": "url", - "url": [ - "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=hard&benchmark_mode=live" - ] - } - }, - { - "evaluation_name": "Medium Problems", - "metric_config": { - "evaluation_description": "Pass@1 on Medium Problems", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0, - "max_score": 1 - }, - "score_details": { - "score": 0.2112676056338028 - }, - "source_data": { - "dataset_name": "Medium Problems", - "source_type": "url", - "url": [ - "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=medium&benchmark_mode=live" - ] - } - }, - { - "evaluation_name": "Easy Problems", - "metric_config": { - "evaluation_description": "Pass@1 on Easy Problems", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0, - "max_score": 1 - }, - "score_details": { - "score": 0.7183098591549296 - }, - "source_data": { - "dataset_name": "Easy Problems", - "source_type": "url", - "url": [ - "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=easy&benchmark_mode=live" - ] - } - } - ] -} diff --git a/data/livecodebenchpro/kuaishou/kwaipilot-40b-0604/aa236b03-b81f-431b-b049-7101cea165f2.json 
b/data/livecodebenchpro/kuaishou/kwaipilot-40b-0604/aa236b03-b81f-431b-b049-7101cea165f2.json deleted file mode 100644 index 2cbd5d730..000000000 --- a/data/livecodebenchpro/kuaishou/kwaipilot-40b-0604/aa236b03-b81f-431b-b049-7101cea165f2.json +++ /dev/null @@ -1,79 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "livecodebenchpro/kwaipilot-40b-0604/1760492095.8105888", - "retrieved_timestamp": "1760492095.8105888", - "source_metadata": { - "source_organization_name": "New York University, Princeton University, University of California San Diego, University of Washington and Canyon Crest Academy", - "evaluator_relationship": "third_party", - "source_name": "Live Code Bench Pro", - "source_type": "documentation" - }, - "model_info": { - "name": "kwaipilot-40b-0604", - "developer": "Kuaishou", - "inference_platform": "kuaishou", - "id": "kuaishou/kwaipilot-40b-0604" - }, - "evaluation_results": [ - { - "evaluation_name": "Hard Problems", - "metric_config": { - "evaluation_description": "Pass@1 on Hard Problems", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0, - "max_score": 1 - }, - "score_details": { - "score": 0.0 - }, - "source_data": { - "dataset_name": "Hard Problems", - "source_type": "url", - "url": [ - "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=hard&benchmark_mode=live" - ] - } - }, - { - "evaluation_name": "Medium Problems", - "metric_config": { - "evaluation_description": "Pass@1 on Medium Problems", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0, - "max_score": 1 - }, - "score_details": { - "score": 0.07042253521126761 - }, - "source_data": { - "dataset_name": "Medium Problems", - "source_type": "url", - "url": [ - "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=medium&benchmark_mode=live" - ] - } - }, - { - "evaluation_name": "Easy Problems", - "metric_config": { - "evaluation_description": "Pass@1 on Easy Problems", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0, - "max_score": 1 - }, - "score_details": { - "score": 0.056338028169014086 - }, - "source_data": { - "dataset_name": "Easy Problems", - "source_type": "url", - "url": [ - "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=easy&benchmark_mode=live" - ] - } - } - ] -} diff --git a/data/livecodebenchpro/meta/llama-4-maverick/abc37028-a362-4e02-8499-1bb7497e0293.json b/data/livecodebenchpro/meta/llama-4-maverick/abc37028-a362-4e02-8499-1bb7497e0293.json deleted file mode 100644 index 949352df3..000000000 --- a/data/livecodebenchpro/meta/llama-4-maverick/abc37028-a362-4e02-8499-1bb7497e0293.json +++ /dev/null @@ -1,79 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "livecodebenchpro/meta/llama-4-maverick/1760492095.8105888", - "retrieved_timestamp": "1760492095.8105888", - "source_metadata": { - "source_organization_name": "New York University, Princeton University, University of California San Diego, University of Washington and Canyon Crest Academy", - "evaluator_relationship": "third_party", - "source_name": "Live Code Bench Pro", - "source_type": "documentation" - }, - "model_info": { - "name": "meta/llama-4-maverick", - "developer": "Meta", - "inference_platform": "openrouter", - "id": "meta/llama-4-maverick" - }, - "evaluation_results": [ - { - "evaluation_name": "Hard Problems", - "metric_config": { - "evaluation_description": "Pass@1 on Hard Problems", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0, - 
"max_score": 1 - }, - "score_details": { - "score": 0.0 - }, - "source_data": { - "dataset_name": "Hard Problems", - "source_type": "url", - "url": [ - "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=hard&benchmark_mode=live" - ] - } - }, - { - "evaluation_name": "Medium Problems", - "metric_config": { - "evaluation_description": "Pass@1 on Medium Problems", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0, - "max_score": 1 - }, - "score_details": { - "score": 0.0 - }, - "source_data": { - "dataset_name": "Medium Problems", - "source_type": "url", - "url": [ - "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=medium&benchmark_mode=live" - ] - } - }, - { - "evaluation_name": "Easy Problems", - "metric_config": { - "evaluation_description": "Pass@1 on Easy Problems", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0, - "max_score": 1 - }, - "score_details": { - "score": 0.09859154929577464 - }, - "source_data": { - "dataset_name": "Easy Problems", - "source_type": "url", - "url": [ - "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=easy&benchmark_mode=live" - ] - } - } - ] -} diff --git a/data/livecodebenchpro/openai/gpt-4.1/ba46ef91-d157-4984-b3df-ce33d8d97f8e.json b/data/livecodebenchpro/openai/gpt-4.1/ba46ef91-d157-4984-b3df-ce33d8d97f8e.json deleted file mode 100644 index 28d6a0f6c..000000000 --- a/data/livecodebenchpro/openai/gpt-4.1/ba46ef91-d157-4984-b3df-ce33d8d97f8e.json +++ /dev/null @@ -1,79 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "livecodebenchpro/openai/gpt-4.1/1760492095.8105888", - "retrieved_timestamp": "1760492095.8105888", - "source_metadata": { - "source_organization_name": "New York University, Princeton University, University of California San Diego, University of Washington and Canyon Crest Academy", - "evaluator_relationship": "third_party", - "source_name": "Live Code Bench Pro", - "source_type": "documentation" - }, - "model_info": { - "name": "openai/gpt-4.1", - "developer": "OpenAI", - "inference_platform": "openrouter", - "id": "openai/gpt-4.1" - }, - "evaluation_results": [ - { - "evaluation_name": "Hard Problems", - "metric_config": { - "evaluation_description": "Pass@1 on Hard Problems", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0, - "max_score": 1 - }, - "score_details": { - "score": 0.0 - }, - "source_data": { - "dataset_name": "Hard Problems", - "source_type": "url", - "url": [ - "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=hard&benchmark_mode=live" - ] - } - }, - { - "evaluation_name": "Medium Problems", - "metric_config": { - "evaluation_description": "Pass@1 on Medium Problems", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0, - "max_score": 1 - }, - "score_details": { - "score": 0.0 - }, - "source_data": { - "dataset_name": "Medium Problems", - "source_type": "url", - "url": [ - "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=medium&benchmark_mode=live" - ] - } - }, - { - "evaluation_name": "Easy Problems", - "metric_config": { - "evaluation_description": "Pass@1 on Easy Problems", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0, - "max_score": 1 - }, - "score_details": { - "score": 0.19718309859154928 - }, - "source_data": { - "dataset_name": "Easy Problems", - "source_type": "url", - "url": [ - 
"https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=easy&benchmark_mode=live" - ] - } - } - ] -} diff --git a/data/livecodebenchpro/openai/gpt-4o-2024-11-20/e70acf51-30ef-4c20-b7cc-51704d114d70.json b/data/livecodebenchpro/openai/gpt-4o-2024-11-20/e70acf51-30ef-4c20-b7cc-51704d114d70.json deleted file mode 100644 index e67250be3..000000000 --- a/data/livecodebenchpro/openai/gpt-4o-2024-11-20/e70acf51-30ef-4c20-b7cc-51704d114d70.json +++ /dev/null @@ -1,79 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "livecodebenchpro/openai/gpt-4o-2024-11-20/1760492095.8105888", - "retrieved_timestamp": "1760492095.8105888", - "source_metadata": { - "source_organization_name": "New York University, Princeton University, University of California San Diego, University of Washington and Canyon Crest Academy", - "evaluator_relationship": "third_party", - "source_name": "Live Code Bench Pro", - "source_type": "documentation" - }, - "model_info": { - "name": "openai/gpt-4o-2024-11-20", - "developer": "OpenAI", - "inference_platform": "openrouter", - "id": "openai/gpt-4o-2024-11-20" - }, - "evaluation_results": [ - { - "evaluation_name": "Hard Problems", - "metric_config": { - "evaluation_description": "Pass@1 on Hard Problems", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0, - "max_score": 1 - }, - "score_details": { - "score": 0.0 - }, - "source_data": { - "dataset_name": "Hard Problems", - "source_type": "url", - "url": [ - "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=hard&benchmark_mode=live" - ] - } - }, - { - "evaluation_name": "Medium Problems", - "metric_config": { - "evaluation_description": "Pass@1 on Medium Problems", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0, - "max_score": 1 - }, - "score_details": { - "score": 0.0 - }, - "source_data": { - "dataset_name": "Medium Problems", - "source_type": "url", - "url": [ - "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=medium&benchmark_mode=live" - ] - } - }, - { - "evaluation_name": "Easy Problems", - "metric_config": { - "evaluation_description": "Pass@1 on Easy Problems", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0, - "max_score": 1 - }, - "score_details": { - "score": 0.07042253521126761 - }, - "source_data": { - "dataset_name": "Easy Problems", - "source_type": "url", - "url": [ - "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=easy&benchmark_mode=live" - ] - } - } - ] -} diff --git a/data/livecodebenchpro/openai/gpt-5-2025-08-07/0e57aa1f-48c6-42b7-9aee-43a29d21b83f.json b/data/livecodebenchpro/openai/gpt-5-2025-08-07/0e57aa1f-48c6-42b7-9aee-43a29d21b83f.json deleted file mode 100644 index cf3bb7a63..000000000 --- a/data/livecodebenchpro/openai/gpt-5-2025-08-07/0e57aa1f-48c6-42b7-9aee-43a29d21b83f.json +++ /dev/null @@ -1,79 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "livecodebenchpro/gpt-5-2025-08-07/1760492095.8105888", - "retrieved_timestamp": "1760492095.8105888", - "source_metadata": { - "source_organization_name": "New York University, Princeton University, University of California San Diego, University of Washington and Canyon Crest Academy", - "evaluator_relationship": "third_party", - "source_name": "Live Code Bench Pro", - "source_type": "documentation" - }, - "model_info": { - "name": "gpt-5-2025-08-07", - "developer": "OpenAI", - "inference_platform": "openai", - "id": "openai/gpt-5-2025-08-07" - }, - "evaluation_results": 
[ - { - "evaluation_name": "Hard Problems", - "metric_config": { - "evaluation_description": "Pass@1 on Hard Problems", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0, - "max_score": 1 - }, - "score_details": { - "score": 0.04225352112676056 - }, - "source_data": { - "dataset_name": "Hard Problems", - "source_type": "url", - "url": [ - "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=hard&benchmark_mode=live" - ] - } - }, - { - "evaluation_name": "Medium Problems", - "metric_config": { - "evaluation_description": "Pass@1 on Medium Problems", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0, - "max_score": 1 - }, - "score_details": { - "score": 0.4084507042253521 - }, - "source_data": { - "dataset_name": "Medium Problems", - "source_type": "url", - "url": [ - "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=medium&benchmark_mode=live" - ] - } - }, - { - "evaluation_name": "Easy Problems", - "metric_config": { - "evaluation_description": "Pass@1 on Easy Problems", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0, - "max_score": 1 - }, - "score_details": { - "score": 0.8873239436619719 - }, - "source_data": { - "dataset_name": "Easy Problems", - "source_type": "url", - "url": [ - "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=easy&benchmark_mode=live" - ] - } - } - ] -} diff --git a/data/livecodebenchpro/openai/gpt-5-2025-08-07/de66cc70-b456-4165-a827-5193dd77e84d.json b/data/livecodebenchpro/openai/gpt-5-2025-08-07/de66cc70-b456-4165-a827-5193dd77e84d.json deleted file mode 100644 index 348bade22..000000000 --- a/data/livecodebenchpro/openai/gpt-5-2025-08-07/de66cc70-b456-4165-a827-5193dd77e84d.json +++ /dev/null @@ -1,79 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "livecodebenchpro/gpt-5-2025-08-07/1770683238.099205", - "retrieved_timestamp": "1770683238.099205", - "source_metadata": { - "source_name": "Live Code Bench Pro", - "source_type": "documentation", - "source_organization_name": "New York University, Princeton University, University of California San Diego, University of Washington and Canyon Crest Academy", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "gpt-5-2025-08-07", - "id": "openai/gpt-5-2025-08-07", - "developer": "openai", - "inference_platform": "openai" - }, - "evaluation_results": [ - { - "evaluation_name": "Hard Problems", - "source_data": { - "dataset_name": "Hard Problems", - "source_type": "url", - "url": [ - "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=hard&benchmark_mode=live" - ] - }, - "metric_config": { - "evaluation_description": "Pass@1 on Hard Problems", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0423 - } - }, - { - "evaluation_name": "Medium Problems", - "source_data": { - "dataset_name": "Medium Problems", - "source_type": "url", - "url": [ - "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=medium&benchmark_mode=live" - ] - }, - "metric_config": { - "evaluation_description": "Pass@1 on Medium Problems", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4085 - } - }, - { - "evaluation_name": "Easy Problems", - "source_data": { - "dataset_name": "Easy Problems", - "source_type": "url", - "url": [ - 
"https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=easy&benchmark_mode=live" - ] - }, - "metric_config": { - "evaluation_description": "Pass@1 on Easy Problems", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9014 - } - } - ] -} \ No newline at end of file diff --git a/data/livecodebenchpro/openai/gpt-5.2-2025-12-11/e9139c52-ada0-4d1c-ae82-7852aacdb6ea.json b/data/livecodebenchpro/openai/gpt-5.2-2025-12-11/e9139c52-ada0-4d1c-ae82-7852aacdb6ea.json deleted file mode 100644 index 8996fcf9e..000000000 --- a/data/livecodebenchpro/openai/gpt-5.2-2025-12-11/e9139c52-ada0-4d1c-ae82-7852aacdb6ea.json +++ /dev/null @@ -1,79 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "livecodebenchpro/gpt-5.2-2025-12-11/1770683238.099205", - "retrieved_timestamp": "1770683238.099205", - "source_metadata": { - "source_name": "Live Code Bench Pro", - "source_type": "documentation", - "source_organization_name": "New York University, Princeton University, University of California San Diego, University of Washington and Canyon Crest Academy", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "gpt-5.2-2025-12-11", - "id": "openai/gpt-5.2-2025-12-11", - "developer": "openai", - "inference_platform": "openai" - }, - "evaluation_results": [ - { - "evaluation_name": "Hard Problems", - "source_data": { - "dataset_name": "Hard Problems", - "source_type": "url", - "url": [ - "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=hard&benchmark_mode=live" - ] - }, - "metric_config": { - "evaluation_description": "Pass@1 on Hard Problems", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1594 - } - }, - { - "evaluation_name": "Medium Problems", - "source_data": { - "dataset_name": "Medium Problems", - "source_type": "url", - "url": [ - "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=medium&benchmark_mode=live" - ] - }, - "metric_config": { - "evaluation_description": "Pass@1 on Medium Problems", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5211 - } - }, - { - "evaluation_name": "Easy Problems", - "source_data": { - "dataset_name": "Easy Problems", - "source_type": "url", - "url": [ - "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=easy&benchmark_mode=live" - ] - }, - "metric_config": { - "evaluation_description": "Pass@1 on Easy Problems", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9014 - } - } - ] -} \ No newline at end of file diff --git a/data/livecodebenchpro/openai/gpt-oss-120b/1dd8c827-72af-4c8f-9ead-989de7105590.json b/data/livecodebenchpro/openai/gpt-oss-120b/1dd8c827-72af-4c8f-9ead-989de7105590.json deleted file mode 100644 index d9a8cbc70..000000000 --- a/data/livecodebenchpro/openai/gpt-oss-120b/1dd8c827-72af-4c8f-9ead-989de7105590.json +++ /dev/null @@ -1,79 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "livecodebenchpro/openai/gpt-oss-120b/1760492095.8105888", - "retrieved_timestamp": "1760492095.8105888", - "source_metadata": { - "source_organization_name": "New York University, Princeton University, University of California San Diego, University of Washington and Canyon Crest Academy", - "evaluator_relationship": 
"third_party", - "source_name": "Live Code Bench Pro", - "source_type": "documentation" - }, - "model_info": { - "name": "openai/gpt-oss-120b", - "developer": "OpenAI", - "inference_platform": "openrouter", - "id": "openai/gpt-oss-120b" - }, - "evaluation_results": [ - { - "evaluation_name": "Hard Problems", - "metric_config": { - "evaluation_description": "Pass@1 on Hard Problems", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0, - "max_score": 1 - }, - "score_details": { - "score": 0.0 - }, - "source_data": { - "dataset_name": "Hard Problems", - "source_type": "url", - "url": [ - "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=hard&benchmark_mode=live" - ] - } - }, - { - "evaluation_name": "Medium Problems", - "metric_config": { - "evaluation_description": "Pass@1 on Medium Problems", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0, - "max_score": 1 - }, - "score_details": { - "score": 0.11267605633802817 - }, - "source_data": { - "dataset_name": "Medium Problems", - "source_type": "url", - "url": [ - "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=medium&benchmark_mode=live" - ] - } - }, - { - "evaluation_name": "Easy Problems", - "metric_config": { - "evaluation_description": "Pass@1 on Easy Problems", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0, - "max_score": 1 - }, - "score_details": { - "score": 0.6619718309859155 - }, - "source_data": { - "dataset_name": "Easy Problems", - "source_type": "url", - "url": [ - "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=easy&benchmark_mode=live" - ] - } - } - ] -} diff --git a/data/livecodebenchpro/openai/gpt-oss-20b/ead39f61-b408-42b2-808f-8421a3200c89.json b/data/livecodebenchpro/openai/gpt-oss-20b/ead39f61-b408-42b2-808f-8421a3200c89.json deleted file mode 100644 index fd7123119..000000000 --- a/data/livecodebenchpro/openai/gpt-oss-20b/ead39f61-b408-42b2-808f-8421a3200c89.json +++ /dev/null @@ -1,79 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "livecodebenchpro/openai/gpt-oss-20b/1760492095.8105888", - "retrieved_timestamp": "1760492095.8105888", - "source_metadata": { - "source_organization_name": "New York University, Princeton University, University of California San Diego, University of Washington and Canyon Crest Academy", - "evaluator_relationship": "third_party", - "source_name": "Live Code Bench Pro", - "source_type": "documentation" - }, - "model_info": { - "name": "openai/gpt-oss-20b", - "developer": "OpenAI", - "inference_platform": "openrouter", - "id": "openai/gpt-oss-20b" - }, - "evaluation_results": [ - { - "evaluation_name": "Hard Problems", - "metric_config": { - "evaluation_description": "Pass@1 on Hard Problems", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0, - "max_score": 1 - }, - "score_details": { - "score": 0.0 - }, - "source_data": { - "dataset_name": "Hard Problems", - "source_type": "url", - "url": [ - "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=hard&benchmark_mode=live" - ] - } - }, - { - "evaluation_name": "Medium Problems", - "metric_config": { - "evaluation_description": "Pass@1 on Medium Problems", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0, - "max_score": 1 - }, - "score_details": { - "score": 0.056338028169014086 - }, - "source_data": { - "dataset_name": "Medium Problems", - "source_type": "url", - "url": [ - 
"https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=medium&benchmark_mode=live" - ] - } - }, - { - "evaluation_name": "Easy Problems", - "metric_config": { - "evaluation_description": "Pass@1 on Easy Problems", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0, - "max_score": 1 - }, - "score_details": { - "score": 0.5070422535211268 - }, - "source_data": { - "dataset_name": "Easy Problems", - "source_type": "url", - "url": [ - "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=easy&benchmark_mode=live" - ] - } - } - ] -} diff --git a/data/livecodebenchpro/openai/o3-2025-04-16/f96bdb35-4d61-4fde-8d91-edf55f13dc03.json b/data/livecodebenchpro/openai/o3-2025-04-16/f96bdb35-4d61-4fde-8d91-edf55f13dc03.json deleted file mode 100644 index 5fc307953..000000000 --- a/data/livecodebenchpro/openai/o3-2025-04-16/f96bdb35-4d61-4fde-8d91-edf55f13dc03.json +++ /dev/null @@ -1,79 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "livecodebenchpro/o3-2025-04-16/1760492095.8105888", - "retrieved_timestamp": "1760492095.8105888", - "source_metadata": { - "source_organization_name": "New York University, Princeton University, University of California San Diego, University of Washington and Canyon Crest Academy", - "evaluator_relationship": "third_party", - "source_name": "Live Code Bench Pro", - "source_type": "documentation" - }, - "model_info": { - "name": "o3-2025-04-16", - "developer": "OpenAI", - "inference_platform": "openai", - "id": "openai/o3-2025-04-16" - }, - "evaluation_results": [ - { - "evaluation_name": "Hard Problems", - "metric_config": { - "evaluation_description": "Pass@1 on Hard Problems", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0, - "max_score": 1 - }, - "score_details": { - "score": 0.0 - }, - "source_data": { - "dataset_name": "Hard Problems", - "source_type": "url", - "url": [ - "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=hard&benchmark_mode=live" - ] - } - }, - { - "evaluation_name": "Medium Problems", - "metric_config": { - "evaluation_description": "Pass@1 on Medium Problems", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0, - "max_score": 1 - }, - "score_details": { - "score": 0.22535211267605634 - }, - "source_data": { - "dataset_name": "Medium Problems", - "source_type": "url", - "url": [ - "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=medium&benchmark_mode=live" - ] - } - }, - { - "evaluation_name": "Easy Problems", - "metric_config": { - "evaluation_description": "Pass@1 on Easy Problems", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0, - "max_score": 1 - }, - "score_details": { - "score": 0.7183098591549296 - }, - "source_data": { - "dataset_name": "Easy Problems", - "source_type": "url", - "url": [ - "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=easy&benchmark_mode=live" - ] - } - } - ] -} diff --git a/data/livecodebenchpro/openai/o4-mini-2025-04-16/5516f77c-932a-4eaa-ac31-dda9260ce82d.json b/data/livecodebenchpro/openai/o4-mini-2025-04-16/5516f77c-932a-4eaa-ac31-dda9260ce82d.json deleted file mode 100644 index 21df96195..000000000 --- a/data/livecodebenchpro/openai/o4-mini-2025-04-16/5516f77c-932a-4eaa-ac31-dda9260ce82d.json +++ /dev/null @@ -1,79 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "livecodebenchpro/o4-mini-2025-04-16/1770683238.099205", - "retrieved_timestamp": "1770683238.099205", - 
"source_metadata": { - "source_name": "Live Code Bench Pro", - "source_type": "documentation", - "source_organization_name": "New York University, Princeton University, University of California San Diego, University of Washington and Canyon Crest Academy", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "o4-mini-2025-04-16", - "id": "openai/o4-mini-2025-04-16", - "developer": "openai", - "inference_platform": "openai" - }, - "evaluation_results": [ - { - "evaluation_name": "Hard Problems", - "source_data": { - "dataset_name": "Hard Problems", - "source_type": "url", - "url": [ - "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=hard&benchmark_mode=live" - ] - }, - "metric_config": { - "evaluation_description": "Pass@1 on Hard Problems", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0143 - } - }, - { - "evaluation_name": "Medium Problems", - "source_data": { - "dataset_name": "Medium Problems", - "source_type": "url", - "url": [ - "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=medium&benchmark_mode=live" - ] - }, - "metric_config": { - "evaluation_description": "Pass@1 on Medium Problems", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2923 - } - }, - { - "evaluation_name": "Easy Problems", - "source_data": { - "dataset_name": "Easy Problems", - "source_type": "url", - "url": [ - "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=easy&benchmark_mode=live" - ] - }, - "metric_config": { - "evaluation_description": "Pass@1 on Easy Problems", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8571 - } - } - ] -} \ No newline at end of file diff --git a/data/livecodebenchpro/openai/o4-mini-2025-04-16/8992cef5-df7e-40a1-b099-331532c3deb0.json b/data/livecodebenchpro/openai/o4-mini-2025-04-16/8992cef5-df7e-40a1-b099-331532c3deb0.json deleted file mode 100644 index 824e5dc57..000000000 --- a/data/livecodebenchpro/openai/o4-mini-2025-04-16/8992cef5-df7e-40a1-b099-331532c3deb0.json +++ /dev/null @@ -1,79 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "livecodebenchpro/o4-mini-2025-04-16/1760492095.8105888", - "retrieved_timestamp": "1760492095.8105888", - "source_metadata": { - "source_organization_name": "New York University, Princeton University, University of California San Diego, University of Washington and Canyon Crest Academy", - "evaluator_relationship": "third_party", - "source_name": "Live Code Bench Pro", - "source_type": "documentation" - }, - "model_info": { - "name": "o4-mini-2025-04-16", - "developer": "OpenAI", - "inference_platform": "openai", - "id": "openai/o4-mini-2025-04-16" - }, - "evaluation_results": [ - { - "evaluation_name": "Hard Problems", - "metric_config": { - "evaluation_description": "Pass@1 on Hard Problems", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0, - "max_score": 1 - }, - "score_details": { - "score": 0.014084507042253521 - }, - "source_data": { - "dataset_name": "Hard Problems", - "source_type": "url", - "url": [ - "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=hard&benchmark_mode=live" - ] - } - }, - { - "evaluation_name": "Medium Problems", - "metric_config": { - "evaluation_description": "Pass@1 on Medium Problems", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0, - "max_score": 1 - }, - "score_details": { - "score": 0.30985915492957744 - }, - "source_data": { - "dataset_name": "Medium Problems", - "source_type": "url", - "url": [ - "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=medium&benchmark_mode=live" - ] - } - }, - { - "evaluation_name": "Easy Problems", - "metric_config": { - "evaluation_description": "Pass@1 on Easy Problems", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0, - "max_score": 1 - }, - "score_details": { - "score": 0.8873239436619719 - }, - "source_data": { - "dataset_name": "Easy Problems", - "source_type": "url", - "url": [ - "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=easy&benchmark_mode=live" - ] - } - } - ] -} diff --git a/data/livecodebenchpro/z-ai/glm-4.5/a77c08d6-a782-440c-b545-c60b6169712d.json b/data/livecodebenchpro/z-ai/glm-4.5/a77c08d6-a782-440c-b545-c60b6169712d.json deleted file mode 100644 index 013991ae1..000000000 --- a/data/livecodebenchpro/z-ai/glm-4.5/a77c08d6-a782-440c-b545-c60b6169712d.json +++ /dev/null @@ -1,79 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "livecodebenchpro/z-ai/glm-4.5/1760492095.8105888", - "retrieved_timestamp": "1760492095.8105888", - "source_metadata": { - "source_organization_name": "New York University, Princeton University, University of California San Diego, University of Washington and Canyon Crest Academy", - "evaluator_relationship": "third_party", - "source_name": "Live Code Bench Pro", - "source_type": "documentation" - }, - "model_info": { - "name": "z-ai/glm-4.5", - "developer": "Z.AI", - "inference_platform": "openrouter", - "id": "z-ai/glm-4.5" - }, - "evaluation_results": [ - { - "evaluation_name": "Hard Problems", - "metric_config": { - "evaluation_description": "Pass@1 on Hard Problems", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0, - "max_score": 1 - }, - "score_details": { - "score": 0.0 - }, - "source_data": { - "dataset_name": "Hard Problems", - "source_type": "url", - "url": [ - "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=hard&benchmark_mode=live" - ] - } - }, - { - "evaluation_name": "Medium Problems", - "metric_config": { - "evaluation_description": "Pass@1 on Medium Problems", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0, - "max_score": 1 - }, - "score_details": { - "score": 0.028169014084507043 - }, - "source_data": { - "dataset_name": "Medium Problems", - "source_type": "url", - "url": [ - "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=medium&benchmark_mode=live" - ] - } - }, - { - "evaluation_name": "Easy Problems", - "metric_config": { - "evaluation_description": "Pass@1 on Easy Problems", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0, - "max_score": 1 - }, - "score_details": { - "score": 0.1267605633802817 - }, - "source_data": { - "dataset_name": "Easy Problems", - "source_type": "url", - "url": [ - "https://webhook.cp-bench.orzzh.com/leaderboard/llm/difficulty?difficulty=easy&benchmark_mode=live" - ] - } - } - ] -} diff --git a/data/reward-bench/0-hero/Matter-0.1-7B-DPO-preview/623bae1f-19e9-47f9-bc7b-80a859218d07.json b/data/reward-bench/0-hero/Matter-0.1-7B-DPO-preview/623bae1f-19e9-47f9-bc7b-80a859218d07.json deleted file mode 100644 index cef912137..000000000 --- 
a/data/reward-bench/0-hero/Matter-0.1-7B-DPO-preview/623bae1f-19e9-47f9-bc7b-80a859218d07.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/0-hero_Matter-0.1-7B-DPO-preview/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "0-hero/Matter-0.1-7B-DPO-preview", - "id": "0-hero/Matter-0.1-7B-DPO-preview", - "developer": "0-hero", - "additional_details": { - "model_type": "DPO" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7247 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8939 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5768 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6378 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8854 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5348 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/0-hero/Matter-0.1-7B-boost-DPO-preview/fbba98c5-5d56-4837-9044-d4e5ac610c2c.json b/data/reward-bench/0-hero/Matter-0.1-7B-boost-DPO-preview/fbba98c5-5d56-4837-9044-d4e5ac610c2c.json deleted file mode 100644 index 2e9c3f43d..000000000 --- 
a/data/reward-bench/0-hero/Matter-0.1-7B-boost-DPO-preview/fbba98c5-5d56-4837-9044-d4e5ac610c2c.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/0-hero_Matter-0.1-7B-boost-DPO-preview/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "0-hero/Matter-0.1-7B-boost-DPO-preview", - "id": "0-hero/Matter-0.1-7B-boost-DPO-preview", - "developer": "0-hero", - "additional_details": { - "model_type": "DPO" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7448 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9106 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6096 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7135 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8395 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5566 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/Ahjeong/MMPO_Gemma_7b/dc6e1164-c9d7-4dd5-b8dc-fbc4e3f45011.json b/data/reward-bench/Ahjeong/MMPO_Gemma_7b/dc6e1164-c9d7-4dd5-b8dc-fbc4e3f45011.json deleted file mode 100644 index 89456cf7f..000000000 --- a/data/reward-bench/Ahjeong/MMPO_Gemma_7b/dc6e1164-c9d7-4dd5-b8dc-fbc4e3f45011.json +++ 
/dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/Ahjeong_MMPO_Gemma_7b/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Ahjeong/MMPO_Gemma_7b", - "id": "Ahjeong/MMPO_Gemma_7b", - "developer": "Ahjeong", - "additional_details": { - "model_type": "DPO" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7587 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9693 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.614 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7135 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7756 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6831 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/Ahjeong/MMPO_Gemma_7b_gamma1.1_epoch3/c62a913b-3101-4ce3-a5c5-a1ac844e55f8.json b/data/reward-bench/Ahjeong/MMPO_Gemma_7b_gamma1.1_epoch3/c62a913b-3101-4ce3-a5c5-a1ac844e55f8.json deleted file mode 100644 index f147a68de..000000000 --- a/data/reward-bench/Ahjeong/MMPO_Gemma_7b_gamma1.1_epoch3/c62a913b-3101-4ce3-a5c5-a1ac844e55f8.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"reward-bench/Ahjeong_MMPO_Gemma_7b_gamma1.1_epoch3/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Ahjeong/MMPO_Gemma_7b_gamma1.1_epoch3", - "id": "Ahjeong/MMPO_Gemma_7b_gamma1.1_epoch3", - "developer": "Ahjeong", - "additional_details": { - "model_type": "DPO" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7652 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9721 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6338 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7635 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7284 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6913 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/Anthropic/claude-3-5-sonnet-20240620/3101726d-fd51-436d-8adf-cbdf0d534834.json b/data/reward-bench/Anthropic/claude-3-5-sonnet-20240620/3101726d-fd51-436d-8adf-cbdf0d534834.json deleted file mode 100644 index 9aaa4ec32..000000000 --- a/data/reward-bench/Anthropic/claude-3-5-sonnet-20240620/3101726d-fd51-436d-8adf-cbdf0d534834.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"reward-bench-2/anthropic_claude-3-5-sonnet-20240620/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "anthropic/claude-3-5-sonnet-20240620", - "id": "anthropic/claude-3-5-sonnet-20240620", - "developer": "anthropic", - "additional_details": { - "model_type": "Generative RM" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6466 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5284 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3875 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5683 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8519 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8697 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.674 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git 
a/data/reward-bench/Anthropic/claude-3-5-sonnet-20240620/f878a52a-fa80-4113-ae7d-0cb11e3ef9fd.json b/data/reward-bench/Anthropic/claude-3-5-sonnet-20240620/f878a52a-fa80-4113-ae7d-0cb11e3ef9fd.json deleted file mode 100644 index 4e001bb6c..000000000 --- a/data/reward-bench/Anthropic/claude-3-5-sonnet-20240620/f878a52a-fa80-4113-ae7d-0cb11e3ef9fd.json +++ /dev/null @@ -1,112 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/Anthropic_claude-3-5-sonnet-20240620/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Anthropic/claude-3-5-sonnet-20240620", - "id": "Anthropic/claude-3-5-sonnet-20240620", - "developer": "Anthropic", - "additional_details": { - "model_type": "Generative" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8417 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9637 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7401 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8162 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8469 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/Anthropic/claude-3-7-sonnet-20250219/904c6359-bd7b-4448-9f16-bc115d0629c4.json b/data/reward-bench/Anthropic/claude-3-7-sonnet-20250219/904c6359-bd7b-4448-9f16-bc115d0629c4.json deleted file mode 100644 index 47b1297ca..000000000 --- a/data/reward-bench/Anthropic/claude-3-7-sonnet-20250219/904c6359-bd7b-4448-9f16-bc115d0629c4.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"reward-bench-2/anthropic_claude-3-7-sonnet-20250219/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "anthropic/claude-3-7-sonnet-20250219", - "id": "anthropic/claude-3-7-sonnet-20250219", - "developer": "anthropic", - "additional_details": { - "model_type": "Generative RM" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7539 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7326 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5437 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.75 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9033 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9212 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6723 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git 
a/data/reward-bench/Anthropic/claude-3-haiku-20240307/49511052-6881-4151-9b46-686c75f73c22.json b/data/reward-bench/Anthropic/claude-3-haiku-20240307/49511052-6881-4151-9b46-686c75f73c22.json deleted file mode 100644 index 6e0f3e1b3..000000000 --- a/data/reward-bench/Anthropic/claude-3-haiku-20240307/49511052-6881-4151-9b46-686c75f73c22.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/anthropic_claude-3-haiku-20240307/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "anthropic/claude-3-haiku-20240307", - "id": "anthropic/claude-3-haiku-20240307", - "developer": "anthropic", - "additional_details": { - "model_type": "Generative RM" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3711 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4042 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2812 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3552 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.595 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.501 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - 
"metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0899 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/Anthropic/claude-3-haiku-20240307/b289e2e6-d57b-4a2b-aa61-e2974d193909.json b/data/reward-bench/Anthropic/claude-3-haiku-20240307/b289e2e6-d57b-4a2b-aa61-e2974d193909.json deleted file mode 100644 index 16656cf8a..000000000 --- a/data/reward-bench/Anthropic/claude-3-haiku-20240307/b289e2e6-d57b-4a2b-aa61-e2974d193909.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/Anthropic_claude-3-haiku-20240307/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Anthropic/claude-3-haiku-20240307", - "id": "Anthropic/claude-3-haiku-20240307", - "developer": "Anthropic", - "additional_details": { - "model_type": "Generative" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7289 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9274 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5197 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7953 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.706 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score 
(weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6635 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/Anthropic/claude-3-opus-20240229/aeeca919-71a1-42a0-a6d0-6779d77750e6.json b/data/reward-bench/Anthropic/claude-3-opus-20240229/aeeca919-71a1-42a0-a6d0-6779d77750e6.json deleted file mode 100644 index dd51285f1..000000000 --- a/data/reward-bench/Anthropic/claude-3-opus-20240229/aeeca919-71a1-42a0-a6d0-6779d77750e6.json +++ /dev/null @@ -1,112 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/Anthropic_claude-3-opus-20240229/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Anthropic/claude-3-opus-20240229", - "id": "Anthropic/claude-3-opus-20240229", - "developer": "Anthropic", - "additional_details": { - "model_type": "Generative" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8008 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9469 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6031 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8662 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7868 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/Anthropic/claude-3-opus-20240229/db29538d-f40e-42d0-b3c0-e622f92112d2.json 
b/data/reward-bench/Anthropic/claude-3-opus-20240229/db29538d-f40e-42d0-b3c0-e622f92112d2.json deleted file mode 100644 index 1c912c7c4..000000000 --- a/data/reward-bench/Anthropic/claude-3-opus-20240229/db29538d-f40e-42d0-b3c0-e622f92112d2.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/anthropic_claude-3-opus-20240229/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "anthropic/claude-3-opus-20240229", - "id": "anthropic/claude-3-opus-20240229", - "developer": "anthropic", - "additional_details": { - "model_type": "Generative RM" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5744 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5389 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3312 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5137 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8378 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6646 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5601 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/Anthropic/claude-3-sonnet-20240229/ab0cdc4f-47dd-4dcc-b506-982ce3924105.json b/data/reward-bench/Anthropic/claude-3-sonnet-20240229/ab0cdc4f-47dd-4dcc-b506-982ce3924105.json deleted file mode 100644 index 3721b0f48..000000000 --- a/data/reward-bench/Anthropic/claude-3-sonnet-20240229/ab0cdc4f-47dd-4dcc-b506-982ce3924105.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/Anthropic_claude-3-sonnet-20240229/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Anthropic/claude-3-sonnet-20240229", - "id": "Anthropic/claude-3-sonnet-20240229", - "developer": "Anthropic", - "additional_details": { - "model_type": "Generative" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7458 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9344 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5658 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8169 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6907 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", 
- "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6963 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/Anthropic/claude-opus-4-20250514/44da63b6-d934-4330-bc20-33464bae61dd.json b/data/reward-bench/Anthropic/claude-opus-4-20250514/44da63b6-d934-4330-bc20-33464bae61dd.json deleted file mode 100644 index 8dffb9139..000000000 --- a/data/reward-bench/Anthropic/claude-opus-4-20250514/44da63b6-d934-4330-bc20-33464bae61dd.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/anthropic_claude-opus-4-20250514/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "anthropic/claude-opus-4-20250514", - "id": "anthropic/claude-opus-4-20250514", - "developer": "anthropic", - "additional_details": { - "model_type": "Generative RM" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7648 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8267 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4188 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7491 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8954 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 
- }, - "score_details": { - "score": 0.8616 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8375 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/Anthropic/claude-sonnet-4-20250514/c930cbe0-f429-4b61-9abe-86dcb7266cf7.json b/data/reward-bench/Anthropic/claude-sonnet-4-20250514/c930cbe0-f429-4b61-9abe-86dcb7266cf7.json deleted file mode 100644 index 45d756c32..000000000 --- a/data/reward-bench/Anthropic/claude-sonnet-4-20250514/c930cbe0-f429-4b61-9abe-86dcb7266cf7.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/anthropic_claude-sonnet-4-20250514/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "anthropic/claude-sonnet-4-20250514", - "id": "anthropic/claude-sonnet-4-20250514", - "developer": "anthropic", - "additional_details": { - "model_type": "Generative RM" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7117 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7612 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3594 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7049 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.8909 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7596 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7939 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/AtlaAI/Selene-1-Mini-Llama-3.1-8B/c84b27b2-2dd9-48ee-9a53-ec27ae62ae7a.json b/data/reward-bench/AtlaAI/Selene-1-Mini-Llama-3.1-8B/c84b27b2-2dd9-48ee-9a53-ec27ae62ae7a.json deleted file mode 100644 index e1c13041b..000000000 --- a/data/reward-bench/AtlaAI/Selene-1-Mini-Llama-3.1-8B/c84b27b2-2dd9-48ee-9a53-ec27ae62ae7a.json +++ /dev/null @@ -1,112 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/AtlaAI_Selene-1-Mini-Llama-3.1-8B/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "AtlaAI/Selene-1-Mini-Llama-3.1-8B", - "id": "AtlaAI/Selene-1-Mini-Llama-3.1-8B", - "developer": "AtlaAI", - "additional_details": { - "model_type": "Generative" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8913 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9358 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7939 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8926 - }, - "source_data": { - "dataset_name": 
"RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9429 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/AtlaAI/Selene-1/73ee9408-e669-4b8a-9419-76bd6051ce8d.json b/data/reward-bench/AtlaAI/Selene-1/73ee9408-e669-4b8a-9419-76bd6051ce8d.json deleted file mode 100644 index e90407d26..000000000 --- a/data/reward-bench/AtlaAI/Selene-1/73ee9408-e669-4b8a-9419-76bd6051ce8d.json +++ /dev/null @@ -1,112 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/AtlaAI_Selene-1/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "AtlaAI/Selene-1", - "id": "AtlaAI/Selene-1", - "developer": "AtlaAI", - "additional_details": { - "model_type": "Generative" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9241 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9777 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8399 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9216 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9572 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/CIR-AMS/BTRM_Qwen2_7b_0613/0deed2f4-770e-4033-a65d-e1da19e00611.json 
b/data/reward-bench/CIR-AMS/BTRM_Qwen2_7b_0613/0deed2f4-770e-4033-a65d-e1da19e00611.json deleted file mode 100644 index dd6dc0bf7..000000000 --- a/data/reward-bench/CIR-AMS/BTRM_Qwen2_7b_0613/0deed2f4-770e-4033-a65d-e1da19e00611.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/CIR-AMS_BTRM_Qwen2_7b_0613/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "CIR-AMS/BTRM_Qwen2_7b_0613", - "id": "CIR-AMS/BTRM_Qwen2_7b_0613", - "developer": "CIR-AMS", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5736 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5347 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3563 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6066 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7178 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5737 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6527 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/CIR-AMS/BTRM_Qwen2_7b_0613/e727cb77-f229-4aaa-909f-99c7aa06676b.json b/data/reward-bench/CIR-AMS/BTRM_Qwen2_7b_0613/e727cb77-f229-4aaa-909f-99c7aa06676b.json deleted file mode 100644 index 7179fab0e..000000000 --- a/data/reward-bench/CIR-AMS/BTRM_Qwen2_7b_0613/e727cb77-f229-4aaa-909f-99c7aa06676b.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/CIR-AMS_BTRM_Qwen2_7b_0613/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "CIR-AMS/BTRM_Qwen2_7b_0613", - "id": "CIR-AMS/BTRM_Qwen2_7b_0613", - "developer": "CIR-AMS", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8172 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9749 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5724 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9014 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8775 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.7029 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/CohereForAI/c4ai-command-r-plus/da9264cd-2fa3-4121-81de-eef994e15993.json b/data/reward-bench/CohereForAI/c4ai-command-r-plus/da9264cd-2fa3-4121-81de-eef994e15993.json deleted file mode 100644 index 10fb8ed6c..000000000 --- a/data/reward-bench/CohereForAI/c4ai-command-r-plus/da9264cd-2fa3-4121-81de-eef994e15993.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/CohereForAI_c4ai-command-r-plus/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "CohereForAI/c4ai-command-r-plus", - "id": "CohereForAI/c4ai-command-r-plus", - "developer": "CohereForAI", - "additional_details": { - "model_type": "Generative" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7057 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9511 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5757 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5986 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.704 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6924 - }, - "source_data": { - "dataset_name": "RewardBench", - 
"source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/ContextualAI/LMUnit-llama3.1-70b/79cc5cd4-bfed-466d-9fbe-2f27e8aab175.json b/data/reward-bench/ContextualAI/LMUnit-llama3.1-70b/79cc5cd4-bfed-466d-9fbe-2f27e8aab175.json deleted file mode 100644 index ff7ea07a0..000000000 --- a/data/reward-bench/ContextualAI/LMUnit-llama3.1-70b/79cc5cd4-bfed-466d-9fbe-2f27e8aab175.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/ContextualAI_LMUnit-llama3.1-70b/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ContextualAI/LMUnit-llama3.1-70b", - "id": "ContextualAI/LMUnit-llama3.1-70b", - "developer": "ContextualAI", - "additional_details": { - "model_type": "Generative RM" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8054 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8463 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4875 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7158 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9067 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9697 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - 
"hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9063 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/ContextualAI/LMUnit-qwen2.5-72b/28c35831-679d-489a-b2c4-fd2c7f333fbc.json b/data/reward-bench/ContextualAI/LMUnit-qwen2.5-72b/28c35831-679d-489a-b2c4-fd2c7f333fbc.json deleted file mode 100644 index 8597afb51..000000000 --- a/data/reward-bench/ContextualAI/LMUnit-qwen2.5-72b/28c35831-679d-489a-b2c4-fd2c7f333fbc.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/ContextualAI_LMUnit-qwen2.5-72b/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ContextualAI/LMUnit-qwen2.5-72b", - "id": "ContextualAI/LMUnit-qwen2.5-72b", - "developer": "ContextualAI", - "additional_details": { - "model_type": "Generative RM" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8208 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8716 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5437 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7268 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9133 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": 
"allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9677 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9014 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/ContextualAI/archangel_sft-dpo_llama13b/9db7907d-7b22-480c-86a5-f88ec2b302e7.json b/data/reward-bench/ContextualAI/archangel_sft-dpo_llama13b/9db7907d-7b22-480c-86a5-f88ec2b302e7.json deleted file mode 100644 index 4aa411ea6..000000000 --- a/data/reward-bench/ContextualAI/archangel_sft-dpo_llama13b/9db7907d-7b22-480c-86a5-f88ec2b302e7.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/ContextualAI_archangel_sft-dpo_llama13b/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ContextualAI/archangel_sft-dpo_llama13b", - "id": "ContextualAI/archangel_sft-dpo_llama13b", - "developer": "ContextualAI", - "additional_details": { - "model_type": "DPO" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.54 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7123 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4298 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5649 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": 
"Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4401 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5656 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/ContextualAI/archangel_sft-dpo_llama30b/2faddf79-41e6-47e9-9c26-17bc987bc870.json b/data/reward-bench/ContextualAI/archangel_sft-dpo_llama30b/2faddf79-41e6-47e9-9c26-17bc987bc870.json deleted file mode 100644 index fefd98c33..000000000 --- a/data/reward-bench/ContextualAI/archangel_sft-dpo_llama30b/2faddf79-41e6-47e9-9c26-17bc987bc870.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/ContextualAI_archangel_sft-dpo_llama30b/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ContextualAI/archangel_sft-dpo_llama30b", - "id": "ContextualAI/archangel_sft-dpo_llama30b", - "developer": "ContextualAI", - "additional_details": { - "model_type": "DPO" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5618 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6927 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4474 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6284 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - 
"evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4745 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5705 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/ContextualAI/archangel_sft-dpo_llama7b/20989a47-6556-4e3b-8909-d0a419cb159b.json b/data/reward-bench/ContextualAI/archangel_sft-dpo_llama7b/20989a47-6556-4e3b-8909-d0a419cb159b.json deleted file mode 100644 index 69dc72884..000000000 --- a/data/reward-bench/ContextualAI/archangel_sft-dpo_llama7b/20989a47-6556-4e3b-8909-d0a419cb159b.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/ContextualAI_archangel_sft-dpo_llama7b/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ContextualAI/archangel_sft-dpo_llama7b", - "id": "ContextualAI/archangel_sft-dpo_llama7b", - "developer": "ContextualAI", - "additional_details": { - "model_type": "DPO" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5304 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5782 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4452 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5203 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - 
includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5658 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5544 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/ContextualAI/archangel_sft-dpo_pythia1-4b/f3d0010f-efed-4f87-9582-b9c87b4de99a.json b/data/reward-bench/ContextualAI/archangel_sft-dpo_pythia1-4b/f3d0010f-efed-4f87-9582-b9c87b4de99a.json deleted file mode 100644 index c640a7456..000000000 --- a/data/reward-bench/ContextualAI/archangel_sft-dpo_pythia1-4b/f3d0010f-efed-4f87-9582-b9c87b4de99a.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/ContextualAI_archangel_sft-dpo_pythia1-4b/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ContextualAI/archangel_sft-dpo_pythia1-4b", - "id": "ContextualAI/archangel_sft-dpo_pythia1-4b", - "developer": "ContextualAI", - "additional_details": { - "model_type": "DPO" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5233 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6397 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3728 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5041 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math 
subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5672 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5427 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/ContextualAI/archangel_sft-dpo_pythia12-0b/a0ce3ed6-2a2c-46ad-be86-6f6701533e36.json b/data/reward-bench/ContextualAI/archangel_sft-dpo_pythia12-0b/a0ce3ed6-2a2c-46ad-be86-6f6701533e36.json deleted file mode 100644 index ed72e23b9..000000000 --- a/data/reward-bench/ContextualAI/archangel_sft-dpo_pythia12-0b/a0ce3ed6-2a2c-46ad-be86-6f6701533e36.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/ContextualAI_archangel_sft-dpo_pythia12-0b/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ContextualAI/archangel_sft-dpo_pythia12-0b", - "id": "ContextualAI/archangel_sft-dpo_pythia12-0b", - "developer": "ContextualAI", - "additional_details": { - "model_type": "DPO" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5009 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6676 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.364 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5432 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4139 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5303 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/ContextualAI/archangel_sft-dpo_pythia2-8b/d54c4830-23c8-4c12-aea1-4f5b5245464f.json b/data/reward-bench/ContextualAI/archangel_sft-dpo_pythia2-8b/d54c4830-23c8-4c12-aea1-4f5b5245464f.json deleted file mode 100644 index 10908a053..000000000 --- a/data/reward-bench/ContextualAI/archangel_sft-dpo_pythia2-8b/d54c4830-23c8-4c12-aea1-4f5b5245464f.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/ContextualAI_archangel_sft-dpo_pythia2-8b/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ContextualAI/archangel_sft-dpo_pythia2-8b", - "id": "ContextualAI/archangel_sft-dpo_pythia2-8b", - "developer": "ContextualAI", - "additional_details": { - "model_type": "DPO" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5286 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8073 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3355 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4473 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5135 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5501 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/ContextualAI/archangel_sft-dpo_pythia6-9b/b5853278-edd9-4bc8-bbeb-d6dab515b562.json b/data/reward-bench/ContextualAI/archangel_sft-dpo_pythia6-9b/b5853278-edd9-4bc8-bbeb-d6dab515b562.json deleted file mode 100644 index 40f3a091a..000000000 --- a/data/reward-bench/ContextualAI/archangel_sft-dpo_pythia6-9b/b5853278-edd9-4bc8-bbeb-d6dab515b562.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/ContextualAI_archangel_sft-dpo_pythia6-9b/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ContextualAI/archangel_sft-dpo_pythia6-9b", - "id": "ContextualAI/archangel_sft-dpo_pythia6-9b", - "developer": "ContextualAI", - "additional_details": { - "model_type": "DPO" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5263 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7486 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3421 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5176 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4847 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.551 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/ContextualAI/archangel_sft-kto_llama13b/74188e30-1e49-47d8-af01-b80e430dafa0.json b/data/reward-bench/ContextualAI/archangel_sft-kto_llama13b/74188e30-1e49-47d8-af01-b80e430dafa0.json deleted file mode 100644 index 22b4b63bd..000000000 --- a/data/reward-bench/ContextualAI/archangel_sft-kto_llama13b/74188e30-1e49-47d8-af01-b80e430dafa0.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/ContextualAI_archangel_sft-kto_llama13b/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ContextualAI/archangel_sft-kto_llama13b", - "id": "ContextualAI/archangel_sft-kto_llama13b", - "developer": "ContextualAI", - "additional_details": { - "model_type": "DPO" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5952 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8408 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3772 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4649 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.7077 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.576 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/ContextualAI/archangel_sft-kto_llama30b/93974286-0497-46a2-a2e8-404c1e89dba0.json b/data/reward-bench/ContextualAI/archangel_sft-kto_llama30b/93974286-0497-46a2-a2e8-404c1e89dba0.json deleted file mode 100644 index ca6ff0f55..000000000 --- a/data/reward-bench/ContextualAI/archangel_sft-kto_llama30b/93974286-0497-46a2-a2e8-404c1e89dba0.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/ContextualAI_archangel_sft-kto_llama30b/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ContextualAI/archangel_sft-kto_llama30b", - "id": "ContextualAI/archangel_sft-kto_llama30b", - "developer": "ContextualAI", - "additional_details": { - "model_type": "DPO" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5901 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8436 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4057 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6054 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5075 - }, - 
"source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5862 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/ContextualAI/archangel_sft-kto_llama7b/02c0020c-7d69-4701-a606-4bc79ad87afd.json b/data/reward-bench/ContextualAI/archangel_sft-kto_llama7b/02c0020c-7d69-4701-a606-4bc79ad87afd.json deleted file mode 100644 index ac5acffb2..000000000 --- a/data/reward-bench/ContextualAI/archangel_sft-kto_llama7b/02c0020c-7d69-4701-a606-4bc79ad87afd.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/ContextualAI_archangel_sft-kto_llama7b/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ContextualAI/archangel_sft-kto_llama7b", - "id": "ContextualAI/archangel_sft-kto_llama7b", - "developer": "ContextualAI", - "additional_details": { - "model_type": "DPO" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5388 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5587 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4364 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4568 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6941 - }, - "source_data": { - "dataset_name": 
"RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5575 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/ContextualAI/archangel_sft-kto_pythia1-4b/5dcb7c54-64e7-4f76-8903-8f57b35cdb0c.json b/data/reward-bench/ContextualAI/archangel_sft-kto_pythia1-4b/5dcb7c54-64e7-4f76-8903-8f57b35cdb0c.json deleted file mode 100644 index 36044c710..000000000 --- a/data/reward-bench/ContextualAI/archangel_sft-kto_pythia1-4b/5dcb7c54-64e7-4f76-8903-8f57b35cdb0c.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/ContextualAI_archangel_sft-kto_pythia1-4b/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ContextualAI/archangel_sft-kto_pythia1-4b", - "id": "ContextualAI/archangel_sft-kto_pythia1-4b", - "developer": "ContextualAI", - "additional_details": { - "model_type": "DPO" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5581 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6844 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3794 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5257 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6447 - }, - "source_data": { - "dataset_name": "RewardBench", - 
"source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5546 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/ContextualAI/archangel_sft-kto_pythia12-0b/4887256e-0545-40dd-9756-ff850e003a29.json b/data/reward-bench/ContextualAI/archangel_sft-kto_pythia12-0b/4887256e-0545-40dd-9756-ff850e003a29.json deleted file mode 100644 index 16ed21233..000000000 --- a/data/reward-bench/ContextualAI/archangel_sft-kto_pythia12-0b/4887256e-0545-40dd-9756-ff850e003a29.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/ContextualAI_archangel_sft-kto_pythia12-0b/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ContextualAI/archangel_sft-kto_pythia12-0b", - "id": "ContextualAI/archangel_sft-kto_pythia12-0b", - "developer": "ContextualAI", - "additional_details": { - "model_type": "DPO" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5053 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7486 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3618 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4757 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4127 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": 
"hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.55 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/ContextualAI/archangel_sft-kto_pythia2-8b/d2b70870-9cbc-4666-bbd4-097fcebe716e.json b/data/reward-bench/ContextualAI/archangel_sft-kto_pythia2-8b/d2b70870-9cbc-4666-bbd4-097fcebe716e.json deleted file mode 100644 index 4c1047aa1..000000000 --- a/data/reward-bench/ContextualAI/archangel_sft-kto_pythia2-8b/d2b70870-9cbc-4666-bbd4-097fcebe716e.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/ContextualAI_archangel_sft-kto_pythia2-8b/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ContextualAI/archangel_sft-kto_pythia2-8b", - "id": "ContextualAI/archangel_sft-kto_pythia2-8b", - "developer": "ContextualAI", - "additional_details": { - "model_type": "DPO" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5497 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.757 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3421 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4743 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6216 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": 
"allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.557 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/ContextualAI/archangel_sft-kto_pythia6-9b/f420f432-2291-40a9-8ebd-b91241970113.json b/data/reward-bench/ContextualAI/archangel_sft-kto_pythia6-9b/f420f432-2291-40a9-8ebd-b91241970113.json deleted file mode 100644 index 521c30c11..000000000 --- a/data/reward-bench/ContextualAI/archangel_sft-kto_pythia6-9b/f420f432-2291-40a9-8ebd-b91241970113.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/ContextualAI_archangel_sft-kto_pythia6-9b/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ContextualAI/archangel_sft-kto_pythia6-9b", - "id": "ContextualAI/archangel_sft-kto_pythia6-9b", - "developer": "ContextualAI", - "additional_details": { - "model_type": "DPO" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5561 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7765 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3618 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5365 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5415 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, 
- { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5723 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/Databricks-Mosaic-Research/PGRM/02e68d1b-86f3-4344-ad8d-45df878b744c.json b/data/reward-bench/Databricks-Mosaic-Research/PGRM/02e68d1b-86f3-4344-ad8d-45df878b744c.json deleted file mode 100644 index 5d22f5c2e..000000000 --- a/data/reward-bench/Databricks-Mosaic-Research/PGRM/02e68d1b-86f3-4344-ad8d-45df878b744c.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/Databricks-Mosaic-Research_PGRM/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Databricks-Mosaic-Research/PGRM", - "id": "Databricks-Mosaic-Research/PGRM", - "developer": "Databricks-Mosaic-Research", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8002 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7937 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5062 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7404 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9289 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - 
} - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9424 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8893 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/HFXM/RAMO-Llama3.1-8B/f712ab4a-1127-44ba-b6b9-7a40290f3322.json b/data/reward-bench/HFXM/RAMO-Llama3.1-8B/f712ab4a-1127-44ba-b6b9-7a40290f3322.json deleted file mode 100644 index 7ccfc23f1..000000000 --- a/data/reward-bench/HFXM/RAMO-Llama3.1-8B/f712ab4a-1127-44ba-b6b9-7a40290f3322.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/HFXM_RAMO-Llama3.1-8B/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "HFXM/RAMO-Llama3.1-8B", - "id": "HFXM/RAMO-Llama3.1-8B", - "developer": "HFXM", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6917 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6547 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.375 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5628 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": 
"Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9756 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9071 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6752 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/HuggingFaceH4/starchat2-15b-v0.1/b4175f0f-f9f4-4418-b4aa-a31e7f1f93f4.json b/data/reward-bench/HuggingFaceH4/starchat2-15b-v0.1/b4175f0f-f9f4-4418-b4aa-a31e7f1f93f4.json deleted file mode 100644 index a6b1abca8..000000000 --- a/data/reward-bench/HuggingFaceH4/starchat2-15b-v0.1/b4175f0f-f9f4-4418-b4aa-a31e7f1f93f4.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/HuggingFaceH4_starchat2-15b-v0.1/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "HuggingFaceH4/starchat2-15b-v0.1", - "id": "HuggingFaceH4/starchat2-15b-v0.1", - "developer": "HuggingFaceH4", - "additional_details": { - "model_type": "DPO" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7322 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9385 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5548 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7095 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8159 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5525 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/HuggingFaceH4/zephyr-7b-alpha/9879e9a7-ddbc-4338-abc7-e3bc394869e9.json b/data/reward-bench/HuggingFaceH4/zephyr-7b-alpha/9879e9a7-ddbc-4338-abc7-e3bc394869e9.json deleted file mode 100644 index b313fb87e..000000000 --- a/data/reward-bench/HuggingFaceH4/zephyr-7b-alpha/9879e9a7-ddbc-4338-abc7-e3bc394869e9.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/HuggingFaceH4_zephyr-7b-alpha/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "HuggingFaceH4/zephyr-7b-alpha", - "id": "HuggingFaceH4/zephyr-7b-alpha", - "developer": "HuggingFaceH4", - "additional_details": { - "model_type": "DPO" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7392 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9162 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.625 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7662 - }, 
- "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7514 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5353 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/HuggingFaceH4/zephyr-7b-beta/d7d8a5cb-e295-4ced-b528-d99d814ff008.json b/data/reward-bench/HuggingFaceH4/zephyr-7b-beta/d7d8a5cb-e295-4ced-b528-d99d814ff008.json deleted file mode 100644 index 7d0709109..000000000 --- a/data/reward-bench/HuggingFaceH4/zephyr-7b-beta/d7d8a5cb-e295-4ced-b528-d99d814ff008.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/HuggingFaceH4_zephyr-7b-beta/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "HuggingFaceH4/zephyr-7b-beta", - "id": "HuggingFaceH4/zephyr-7b-beta", - "developer": "HuggingFaceH4", - "additional_details": { - "model_type": "DPO" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7281 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9525 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6272 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6568 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": 
"allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7789 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5216 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/HuggingFaceH4/zephyr-7b-gemma-v0.1/bff86a1f-71c3-4f27-aeae-bba6d03635ef.json b/data/reward-bench/HuggingFaceH4/zephyr-7b-gemma-v0.1/bff86a1f-71c3-4f27-aeae-bba6d03635ef.json deleted file mode 100644 index 89a96432c..000000000 --- a/data/reward-bench/HuggingFaceH4/zephyr-7b-gemma-v0.1/bff86a1f-71c3-4f27-aeae-bba6d03635ef.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/HuggingFaceH4_zephyr-7b-gemma-v0.1/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "HuggingFaceH4/zephyr-7b-gemma-v0.1", - "id": "HuggingFaceH4/zephyr-7b-gemma-v0.1", - "developer": "HuggingFaceH4", - "additional_details": { - "model_type": "DPO" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6758 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9581 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4956 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5824 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - 
"metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7463 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5171 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/IDEA-CCNL/Ziya-LLaMA-7B-Reward/723281f8-54b7-4db6-8253-5a6dcf4f3d4a.json b/data/reward-bench/IDEA-CCNL/Ziya-LLaMA-7B-Reward/723281f8-54b7-4db6-8253-5a6dcf4f3d4a.json deleted file mode 100644 index 73199f618..000000000 --- a/data/reward-bench/IDEA-CCNL/Ziya-LLaMA-7B-Reward/723281f8-54b7-4db6-8253-5a6dcf4f3d4a.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/IDEA-CCNL_Ziya-LLaMA-7B-Reward/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "IDEA-CCNL/Ziya-LLaMA-7B-Reward", - "id": "IDEA-CCNL/Ziya-LLaMA-7B-Reward", - "developer": "IDEA-CCNL", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6378 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8687 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4605 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6405 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and 
math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5775 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6461 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/LxzGordon/URM-LLaMa-3-8B/0ce7dc54-f608-4985-9904-75cee09b6288.json b/data/reward-bench/LxzGordon/URM-LLaMa-3-8B/0ce7dc54-f608-4985-9904-75cee09b6288.json deleted file mode 100644 index 2f1093e30..000000000 --- a/data/reward-bench/LxzGordon/URM-LLaMa-3-8B/0ce7dc54-f608-4985-9904-75cee09b6288.json +++ /dev/null @@ -1,112 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/LxzGordon_URM-LLaMa-3-8B/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "LxzGordon/URM-LLaMa-3-8B", - "id": "LxzGordon/URM-LLaMa-3-8B", - "developer": "LxzGordon", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8991 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9693 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7873 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8824 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.9574 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/LxzGordon/URM-LLaMa-3.1-8B/5bb0aaa4-2cc5-4622-8235-993bc4178f12.json b/data/reward-bench/LxzGordon/URM-LLaMa-3.1-8B/5bb0aaa4-2cc5-4622-8235-993bc4178f12.json deleted file mode 100644 index d56005468..000000000 --- a/data/reward-bench/LxzGordon/URM-LLaMa-3.1-8B/5bb0aaa4-2cc5-4622-8235-993bc4178f12.json +++ /dev/null @@ -1,112 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/LxzGordon_URM-LLaMa-3.1-8B/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "LxzGordon/URM-LLaMa-3.1-8B", - "id": "LxzGordon/URM-LLaMa-3.1-8B", - "developer": "LxzGordon", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9294 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9553 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8816 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9108 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9698 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/LxzGordon/URM-LLaMa-3.1-8B/85ab22b8-0587-4e2b-857f-3d6d84d571a4.json b/data/reward-bench/LxzGordon/URM-LLaMa-3.1-8B/85ab22b8-0587-4e2b-857f-3d6d84d571a4.json deleted file mode 100644 index a57e5caa3..000000000 --- a/data/reward-bench/LxzGordon/URM-LLaMa-3.1-8B/85ab22b8-0587-4e2b-857f-3d6d84d571a4.json +++ /dev/null @@ -1,148 +0,0 @@ 
-{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/LxzGordon_URM-LLaMa-3.1-8B/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "LxzGordon/URM-LLaMa-3.1-8B", - "id": "LxzGordon/URM-LLaMa-3.1-8B", - "developer": "LxzGordon", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7394 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6884 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.45 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6393 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9178 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9758 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7653 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git 
a/data/reward-bench/NCSOFT/Llama-3-OffsetBias-8B/37aa6702-b2fa-43bf-b5a9-36740f627217.json b/data/reward-bench/NCSOFT/Llama-3-OffsetBias-8B/37aa6702-b2fa-43bf-b5a9-36740f627217.json deleted file mode 100644 index a9c1eb53c..000000000 --- a/data/reward-bench/NCSOFT/Llama-3-OffsetBias-8B/37aa6702-b2fa-43bf-b5a9-36740f627217.json +++ /dev/null @@ -1,112 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/NCSOFT_Llama-3-OffsetBias-8B/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "NCSOFT/Llama-3-OffsetBias-8B", - "id": "NCSOFT/Llama-3-OffsetBias-8B", - "developer": "NCSOFT", - "additional_details": { - "model_type": "Generative" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8397 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9246 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8026 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8676 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7639 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/NCSOFT/Llama-3-OffsetBias-RM-8B/57f48d0c-e424-410d-b9ee-4707e2add036.json b/data/reward-bench/NCSOFT/Llama-3-OffsetBias-RM-8B/57f48d0c-e424-410d-b9ee-4707e2add036.json deleted file mode 100644 index f38bf29f7..000000000 --- a/data/reward-bench/NCSOFT/Llama-3-OffsetBias-RM-8B/57f48d0c-e424-410d-b9ee-4707e2add036.json +++ /dev/null @@ -1,112 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/NCSOFT_Llama-3-OffsetBias-RM-8B/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - 
"source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "NCSOFT/Llama-3-OffsetBias-RM-8B", - "id": "NCSOFT/Llama-3-OffsetBias-RM-8B", - "developer": "NCSOFT", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8942 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9721 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.818 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8676 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9192 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/NCSOFT/Llama-3-OffsetBias-RM-8B/8643b4dd-e18c-442c-adb5-84ef756534f8.json b/data/reward-bench/NCSOFT/Llama-3-OffsetBias-RM-8B/8643b4dd-e18c-442c-adb5-84ef756534f8.json deleted file mode 100644 index ec0f3756c..000000000 --- a/data/reward-bench/NCSOFT/Llama-3-OffsetBias-RM-8B/8643b4dd-e18c-442c-adb5-84ef756534f8.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/NCSOFT_Llama-3-OffsetBias-RM-8B/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "NCSOFT/Llama-3-OffsetBias-RM-8B", - "id": "NCSOFT/Llama-3-OffsetBias-RM-8B", - "developer": "NCSOFT", - "additional_details": { - "model_type": "Seq. 
Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.648 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6084 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5191 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7222 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9596 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6786 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/Nexusflow/Starling-RM-34B/2f3d2e46-1f9e-4b1c-9729-ab0a93cc245c.json b/data/reward-bench/Nexusflow/Starling-RM-34B/2f3d2e46-1f9e-4b1c-9729-ab0a93cc245c.json deleted file mode 100644 index 3b7921590..000000000 --- a/data/reward-bench/Nexusflow/Starling-RM-34B/2f3d2e46-1f9e-4b1c-9729-ab0a93cc245c.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/Nexusflow_Starling-RM-34B/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": 
"documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Nexusflow/Starling-RM-34B", - "id": "Nexusflow/Starling-RM-34B", - "developer": "Nexusflow", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8133 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9693 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5724 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.877 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8845 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7137 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/Nexusflow/Starling-RM-34B/4aec78d3-a38c-48e0-b9e2-b6dc063bd37e.json b/data/reward-bench/Nexusflow/Starling-RM-34B/4aec78d3-a38c-48e0-b9e2-b6dc063bd37e.json deleted file mode 100644 index a2c665080..000000000 --- a/data/reward-bench/Nexusflow/Starling-RM-34B/4aec78d3-a38c-48e0-b9e2-b6dc063bd37e.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/Nexusflow_Starling-RM-34B/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": 
"https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Nexusflow/Starling-RM-34B", - "id": "Nexusflow/Starling-RM-34B", - "developer": "Nexusflow", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4553 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4589 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3187 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6175 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7556 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4808 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1004 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/NousResearch/Hermes-3-Llama-3.1-70B/f9b60945-8b14-4564-9d44-3eb6db675ab9.json b/data/reward-bench/NousResearch/Hermes-3-Llama-3.1-70B/f9b60945-8b14-4564-9d44-3eb6db675ab9.json deleted file mode 100644 index 734675894..000000000 --- 
a/data/reward-bench/NousResearch/Hermes-3-Llama-3.1-70B/f9b60945-8b14-4564-9d44-3eb6db675ab9.json +++ /dev/null @@ -1,112 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/NousResearch_Hermes-3-Llama-3.1-70B/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "NousResearch/Hermes-3-Llama-3.1-70B", - "id": "NousResearch/Hermes-3-Llama-3.1-70B", - "developer": "NousResearch", - "additional_details": { - "model_type": "Generative" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7847 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9623 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5669 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.823 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7867 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/NousResearch/Nous-Hermes-2-Mistral-7B-DPO/56703c11-eccb-4f66-af13-60f972a5068f.json b/data/reward-bench/NousResearch/Nous-Hermes-2-Mistral-7B-DPO/56703c11-eccb-4f66-af13-60f972a5068f.json deleted file mode 100644 index 0e770043c..000000000 --- a/data/reward-bench/NousResearch/Nous-Hermes-2-Mistral-7B-DPO/56703c11-eccb-4f66-af13-60f972a5068f.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/NousResearch_Nous-Hermes-2-Mistral-7B-DPO/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - 
"source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "NousResearch/Nous-Hermes-2-Mistral-7B-DPO", - "id": "NousResearch/Nous-Hermes-2-Mistral-7B-DPO", - "developer": "NousResearch", - "additional_details": { - "model_type": "DPO" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7481 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9218 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6053 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8243 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7375 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.555 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO/fbd8be7e-5670-4729-a77d-83472510b734.json b/data/reward-bench/NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO/fbd8be7e-5670-4729-a77d-83472510b734.json deleted file mode 100644 index 99623d8ff..000000000 --- a/data/reward-bench/NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO/fbd8be7e-5670-4729-a77d-83472510b734.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/NousResearch_Nous-Hermes-2-Mixtral-8x7B-DPO/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - 
"source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO", - "id": "NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO", - "developer": "NousResearch", - "additional_details": { - "model_type": "DPO" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7138 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9162 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6053 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8149 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6126 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5266 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/OpenAssistant/oasst-rm-2-pythia-6.9b-epoch-1/2e18ee77-9c46-4cf9-9521-303ad15e5be4.json b/data/reward-bench/OpenAssistant/oasst-rm-2-pythia-6.9b-epoch-1/2e18ee77-9c46-4cf9-9521-303ad15e5be4.json deleted file mode 100644 index 31e05eb58..000000000 --- a/data/reward-bench/OpenAssistant/oasst-rm-2-pythia-6.9b-epoch-1/2e18ee77-9c46-4cf9-9521-303ad15e5be4.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/OpenAssistant_oasst-rm-2-pythia-6.9b-epoch-1/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - 
"source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "OpenAssistant/oasst-rm-2-pythia-6.9b-epoch-1", - "id": "OpenAssistant/oasst-rm-2-pythia-6.9b-epoch-1", - "developer": "OpenAssistant", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.615 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9246 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3728 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5446 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5855 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6801 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/OpenAssistant/oasst-rm-2-pythia-6.9b-epoch-1/ec5b296e-03e8-4371-a8c1-eca0b0b9759d.json b/data/reward-bench/OpenAssistant/oasst-rm-2-pythia-6.9b-epoch-1/ec5b296e-03e8-4371-a8c1-eca0b0b9759d.json deleted file mode 100644 index dfed08cd8..000000000 --- a/data/reward-bench/OpenAssistant/oasst-rm-2-pythia-6.9b-epoch-1/ec5b296e-03e8-4371-a8c1-eca0b0b9759d.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/OpenAssistant_oasst-rm-2-pythia-6.9b-epoch-1/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for 
AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "OpenAssistant/oasst-rm-2-pythia-6.9b-epoch-1", - "id": "OpenAssistant/oasst-rm-2-pythia-6.9b-epoch-1", - "developer": "OpenAssistant", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2653 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3979 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2875 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.377 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3289 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1535 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.047 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/OpenAssistant/oasst-rm-2.1-pythia-1.4b-epoch-2.5/07b61a55-a8e3-4a6f-9806-a4100f8d5297.json b/data/reward-bench/OpenAssistant/oasst-rm-2.1-pythia-1.4b-epoch-2.5/07b61a55-a8e3-4a6f-9806-a4100f8d5297.json deleted file mode 100644 index 
85e007109..000000000 --- a/data/reward-bench/OpenAssistant/oasst-rm-2.1-pythia-1.4b-epoch-2.5/07b61a55-a8e3-4a6f-9806-a4100f8d5297.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/OpenAssistant_oasst-rm-2.1-pythia-1.4b-epoch-2.5/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "OpenAssistant/oasst-rm-2.1-pythia-1.4b-epoch-2.5", - "id": "OpenAssistant/oasst-rm-2.1-pythia-1.4b-epoch-2.5", - "developer": "OpenAssistant", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6901 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8855 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4868 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6311 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7752 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6533 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/OpenAssistant/oasst-rm-2.1-pythia-1.4b-epoch-2.5/3d534c25-5016-44de-9c47-24b7d7399b0f.json b/data/reward-bench/OpenAssistant/oasst-rm-2.1-pythia-1.4b-epoch-2.5/3d534c25-5016-44de-9c47-24b7d7399b0f.json 
deleted file mode 100644 index 38eca68e4..000000000 --- a/data/reward-bench/OpenAssistant/oasst-rm-2.1-pythia-1.4b-epoch-2.5/3d534c25-5016-44de-9c47-24b7d7399b0f.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/OpenAssistant_oasst-rm-2.1-pythia-1.4b-epoch-2.5/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "OpenAssistant/oasst-rm-2.1-pythia-1.4b-epoch-2.5", - "id": "OpenAssistant/oasst-rm-2.1-pythia-1.4b-epoch-2.5", - "developer": "OpenAssistant", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2648 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3179 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2625 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3934 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3244 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2707 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0198 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/OpenAssistant/reward-model-deberta-v3-large-v2/4de91433-05b3-4f88-9d0f-66691c671f62.json b/data/reward-bench/OpenAssistant/reward-model-deberta-v3-large-v2/4de91433-05b3-4f88-9d0f-66691c671f62.json deleted file mode 100644 index 365667594..000000000 --- a/data/reward-bench/OpenAssistant/reward-model-deberta-v3-large-v2/4de91433-05b3-4f88-9d0f-66691c671f62.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/OpenAssistant_reward-model-deberta-v3-large-v2/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "OpenAssistant/reward-model-deberta-v3-large-v2", - "id": "OpenAssistant/reward-model-deberta-v3-large-v2", - "developer": "OpenAssistant", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.32 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3853 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2687 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5027 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3667 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus 
score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2768 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.12 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/OpenAssistant/reward-model-deberta-v3-large-v2/dc71f1ba-f4b8-4231-ac72-0acf9a22d73e.json b/data/reward-bench/OpenAssistant/reward-model-deberta-v3-large-v2/dc71f1ba-f4b8-4231-ac72-0acf9a22d73e.json deleted file mode 100644 index 4712b7f92..000000000 --- a/data/reward-bench/OpenAssistant/reward-model-deberta-v3-large-v2/dc71f1ba-f4b8-4231-ac72-0acf9a22d73e.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/OpenAssistant_reward-model-deberta-v3-large-v2/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "OpenAssistant/reward-model-deberta-v3-large-v2", - "id": "OpenAssistant/reward-model-deberta-v3-large-v2", - "developer": "OpenAssistant", - "additional_details": { - "model_type": "Seq. 
Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6126 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8939 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4518 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7338 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3855 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5836 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/PKU-Alignment/beaver-7b-v1.0-cost/36c4adc9-c2fb-4bc3-81ba-88478d30332e.json b/data/reward-bench/PKU-Alignment/beaver-7b-v1.0-cost/36c4adc9-c2fb-4bc3-81ba-88478d30332e.json deleted file mode 100644 index ae927c59e..000000000 --- a/data/reward-bench/PKU-Alignment/beaver-7b-v1.0-cost/36c4adc9-c2fb-4bc3-81ba-88478d30332e.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/PKU-Alignment_beaver-7b-v1.0-cost/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "PKU-Alignment/beaver-7b-v1.0-cost", - "id": "PKU-Alignment/beaver-7b-v1.0-cost", - "developer": "PKU-Alignment", - "additional_details": { - "model_type": "Seq. 
Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5798 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6173 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4232 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7351 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5482 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.57 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/PKU-Alignment/beaver-7b-v1.0-cost/f0827b15-20d0-4986-b5a0-bb4bc9be768e.json b/data/reward-bench/PKU-Alignment/beaver-7b-v1.0-cost/f0827b15-20d0-4986-b5a0-bb4bc9be768e.json deleted file mode 100644 index 9e4c95ee2..000000000 --- a/data/reward-bench/PKU-Alignment/beaver-7b-v1.0-cost/f0827b15-20d0-4986-b5a0-bb4bc9be768e.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/PKU-Alignment_beaver-7b-v1.0-cost/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "PKU-Alignment/beaver-7b-v1.0-cost", - "id": "PKU-Alignment/beaver-7b-v1.0-cost", - "developer": "PKU-Alignment", - "additional_details": { - "model_type": "Seq. 
Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3332 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3263 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2313 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3989 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7589 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2939 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -0.01 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/PKU-Alignment/beaver-7b-v1.0-reward/aeaa8b33-e327-4c65-9641-5dfc63feee3b.json b/data/reward-bench/PKU-Alignment/beaver-7b-v1.0-reward/aeaa8b33-e327-4c65-9641-5dfc63feee3b.json deleted file mode 100644 index fc41926be..000000000 --- a/data/reward-bench/PKU-Alignment/beaver-7b-v1.0-reward/aeaa8b33-e327-4c65-9641-5dfc63feee3b.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/PKU-Alignment_beaver-7b-v1.0-reward/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - 
"source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "PKU-Alignment/beaver-7b-v1.0-reward", - "id": "PKU-Alignment/beaver-7b-v1.0-reward", - "developer": "PKU-Alignment", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1606 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2105 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2938 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2623 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1422 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0646 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -0.01 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/PKU-Alignment/beaver-7b-v1.0-reward/c97c79f3-fd92-49db-9131-5e45834a7eaf.json 
b/data/reward-bench/PKU-Alignment/beaver-7b-v1.0-reward/c97c79f3-fd92-49db-9131-5e45834a7eaf.json deleted file mode 100644 index d06d08a3c..000000000 --- a/data/reward-bench/PKU-Alignment/beaver-7b-v1.0-reward/c97c79f3-fd92-49db-9131-5e45834a7eaf.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/PKU-Alignment_beaver-7b-v1.0-reward/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "PKU-Alignment/beaver-7b-v1.0-reward", - "id": "PKU-Alignment/beaver-7b-v1.0-reward", - "developer": "PKU-Alignment", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4727 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8184 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2873 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3757 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.346 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5993 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/PKU-Alignment/beaver-7b-v2.0-cost/687099cb-c1bf-49ec-a902-329c2b818369.json 
b/data/reward-bench/PKU-Alignment/beaver-7b-v2.0-cost/687099cb-c1bf-49ec-a902-329c2b818369.json deleted file mode 100644 index 3868ab64e..000000000 --- a/data/reward-bench/PKU-Alignment/beaver-7b-v2.0-cost/687099cb-c1bf-49ec-a902-329c2b818369.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/PKU-Alignment_beaver-7b-v2.0-cost/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "PKU-Alignment/beaver-7b-v2.0-cost", - "id": "PKU-Alignment/beaver-7b-v2.0-cost", - "developer": "PKU-Alignment", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3326 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3789 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.275 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3333 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7356 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2828 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", 
- "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -0.01 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/PKU-Alignment/beaver-7b-v2.0-cost/8da4f5eb-6264-4503-b9bc-fcf843b638be.json b/data/reward-bench/PKU-Alignment/beaver-7b-v2.0-cost/8da4f5eb-6264-4503-b9bc-fcf843b638be.json deleted file mode 100644 index 4af7bc7a5..000000000 --- a/data/reward-bench/PKU-Alignment/beaver-7b-v2.0-cost/8da4f5eb-6264-4503-b9bc-fcf843b638be.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/PKU-Alignment_beaver-7b-v2.0-cost/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "PKU-Alignment/beaver-7b-v2.0-cost", - "id": "PKU-Alignment/beaver-7b-v2.0-cost", - "developer": "PKU-Alignment", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5957 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5726 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4561 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7608 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6211 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5397 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/PKU-Alignment/beaver-7b-v2.0-reward/28a68b87-5412-4374-9e61-896b0fff7669.json b/data/reward-bench/PKU-Alignment/beaver-7b-v2.0-reward/28a68b87-5412-4374-9e61-896b0fff7669.json deleted file mode 100644 index 376b686c0..000000000 --- a/data/reward-bench/PKU-Alignment/beaver-7b-v2.0-reward/28a68b87-5412-4374-9e61-896b0fff7669.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/PKU-Alignment_beaver-7b-v2.0-reward/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "PKU-Alignment/beaver-7b-v2.0-reward", - "id": "PKU-Alignment/beaver-7b-v2.0-reward", - "developer": "PKU-Alignment", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2544 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2168 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2562 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3825 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3156 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", 
- "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2606 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0944 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/PKU-Alignment/beaver-7b-v2.0-reward/3209c869-03c5-4801-8e4b-4c8bcde3d58f.json b/data/reward-bench/PKU-Alignment/beaver-7b-v2.0-reward/3209c869-03c5-4801-8e4b-4c8bcde3d58f.json deleted file mode 100644 index 49fb24c89..000000000 --- a/data/reward-bench/PKU-Alignment/beaver-7b-v2.0-reward/3209c869-03c5-4801-8e4b-4c8bcde3d58f.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/PKU-Alignment_beaver-7b-v2.0-reward/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "PKU-Alignment/beaver-7b-v2.0-reward", - "id": "PKU-Alignment/beaver-7b-v2.0-reward", - "developer": "PKU-Alignment", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6366 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8994 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.364 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6041 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.6887 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6171 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/PoLL/gpt-3.5-turbo-0125_claude-3-sonnet-2024022.../9d1e124c-e133-41d3-8ac7-5c8c5027aa02.json b/data/reward-bench/PoLL/gpt-3.5-turbo-0125_claude-3-sonnet-2024022.../9d1e124c-e133-41d3-8ac7-5c8c5027aa02.json deleted file mode 100644 index f9da09026..000000000 --- a/data/reward-bench/PoLL/gpt-3.5-turbo-0125_claude-3-sonnet-2024022.../9d1e124c-e133-41d3-8ac7-5c8c5027aa02.json +++ /dev/null @@ -1,112 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/PoLL_gpt-3.5-turbo-0125_claude-3-sonnet-2024022.../1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "PoLL/gpt-3.5-turbo-0125_claude-3-sonnet-2024022...", - "id": "PoLL/gpt-3.5-turbo-0125_claude-3-sonnet-2024022...", - "developer": "PoLL", - "additional_details": { - "model_type": "Generative" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7578 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9525 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5406 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8034 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.7346 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/Qwen/Qwen1.5-0.5B-Chat/633d499b-58bd-4fca-9b56-0f005a5a21b8.json b/data/reward-bench/Qwen/Qwen1.5-0.5B-Chat/633d499b-58bd-4fca-9b56-0f005a5a21b8.json deleted file mode 100644 index 8753b5ea8..000000000 --- a/data/reward-bench/Qwen/Qwen1.5-0.5B-Chat/633d499b-58bd-4fca-9b56-0f005a5a21b8.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/Qwen_Qwen1.5-0.5B-Chat/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen/Qwen1.5-0.5B-Chat", - "id": "Qwen/Qwen1.5-0.5B-Chat", - "developer": "Qwen", - "additional_details": { - "model_type": "DPO" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5298 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3547 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6294 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5703 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5984 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4629 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - 
"hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/Qwen/Qwen1.5-1.8B-Chat/5c4f3caf-6af3-48c6-83e2-4710d31e6acf.json b/data/reward-bench/Qwen/Qwen1.5-1.8B-Chat/5c4f3caf-6af3-48c6-83e2-4710d31e6acf.json deleted file mode 100644 index 48dfa65fd..000000000 --- a/data/reward-bench/Qwen/Qwen1.5-1.8B-Chat/5c4f3caf-6af3-48c6-83e2-4710d31e6acf.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/Qwen_Qwen1.5-1.8B-Chat/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen/Qwen1.5-1.8B-Chat", - "id": "Qwen/Qwen1.5-1.8B-Chat", - "developer": "Qwen", - "additional_details": { - "model_type": "DPO" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.589 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5615 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6031 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4838 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7793 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4453 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/Qwen/Qwen1.5-14B-Chat/77d1edc1-fb54-4371-bf7c-baebbb351163.json 
b/data/reward-bench/Qwen/Qwen1.5-14B-Chat/77d1edc1-fb54-4371-bf7c-baebbb351163.json deleted file mode 100644 index f34eee3d4..000000000 --- a/data/reward-bench/Qwen/Qwen1.5-14B-Chat/77d1edc1-fb54-4371-bf7c-baebbb351163.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/Qwen_Qwen1.5-14B-Chat/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen/Qwen1.5-14B-Chat", - "id": "Qwen/Qwen1.5-14B-Chat", - "developer": "Qwen", - "additional_details": { - "model_type": "DPO" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6864 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5726 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7018 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7122 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8961 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4123 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/Qwen/Qwen1.5-4B-Chat/e7eecdb0-bc17-4d9f-b3e8-9ee777d2f595.json b/data/reward-bench/Qwen/Qwen1.5-4B-Chat/e7eecdb0-bc17-4d9f-b3e8-9ee777d2f595.json deleted file mode 100644 index 85d507824..000000000 --- 
a/data/reward-bench/Qwen/Qwen1.5-4B-Chat/e7eecdb0-bc17-4d9f-b3e8-9ee777d2f595.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/Qwen_Qwen1.5-4B-Chat/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen/Qwen1.5-4B-Chat", - "id": "Qwen/Qwen1.5-4B-Chat", - "developer": "Qwen", - "additional_details": { - "model_type": "DPO" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5477 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3883 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6272 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5568 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6689 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.447 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/Qwen/Qwen1.5-72B-Chat/3f3915b3-0d6e-451c-9185-fa4372b93f2b.json b/data/reward-bench/Qwen/Qwen1.5-72B-Chat/3f3915b3-0d6e-451c-9185-fa4372b93f2b.json deleted file mode 100644 index f3cc894c3..000000000 --- a/data/reward-bench/Qwen/Qwen1.5-72B-Chat/3f3915b3-0d6e-451c-9185-fa4372b93f2b.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - 
"evaluation_id": "reward-bench/Qwen_Qwen1.5-72B-Chat/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen/Qwen1.5-72B-Chat", - "id": "Qwen/Qwen1.5-72B-Chat", - "developer": "Qwen", - "additional_details": { - "model_type": "DPO" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6723 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6229 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6601 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6757 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8554 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4226 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/Qwen/Qwen1.5-7B-Chat/e534d37b-3009-4a7d-82d8-d7c85b95649e.json b/data/reward-bench/Qwen/Qwen1.5-7B-Chat/e534d37b-3009-4a7d-82d8-d7c85b95649e.json deleted file mode 100644 index 2373972cd..000000000 --- a/data/reward-bench/Qwen/Qwen1.5-7B-Chat/e534d37b-3009-4a7d-82d8-d7c85b95649e.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/Qwen_Qwen1.5-7B-Chat/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": 
"RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen/Qwen1.5-7B-Chat", - "id": "Qwen/Qwen1.5-7B-Chat", - "developer": "Qwen", - "additional_details": { - "model_type": "DPO" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.675 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5363 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6908 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6919 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9041 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4288 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/Qwen/Qwen1.5-MoE-A2.7B-Chat/bd8f0ed1-75fc-48c1-996e-655d205c027c.json b/data/reward-bench/Qwen/Qwen1.5-MoE-A2.7B-Chat/bd8f0ed1-75fc-48c1-996e-655d205c027c.json deleted file mode 100644 index 7daa3735e..000000000 --- a/data/reward-bench/Qwen/Qwen1.5-MoE-A2.7B-Chat/bd8f0ed1-75fc-48c1-996e-655d205c027c.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/Qwen_Qwen1.5-MoE-A2.7B-Chat/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": 
"https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen/Qwen1.5-MoE-A2.7B-Chat", - "id": "Qwen/Qwen1.5-MoE-A2.7B-Chat", - "developer": "Qwen", - "additional_details": { - "model_type": "DPO" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6644 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7291 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6316 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6284 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.774 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4536 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/Qwen/WorldPM-72B/e9effaf6-e48b-4b35-b035-430be81b316b.json b/data/reward-bench/Qwen/WorldPM-72B/e9effaf6-e48b-4b35-b035-430be81b316b.json deleted file mode 100644 index 6ee54b6e7..000000000 --- a/data/reward-bench/Qwen/WorldPM-72B/e9effaf6-e48b-4b35-b035-430be81b316b.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/Qwen_WorldPM-72B/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Qwen/WorldPM-72B", - "id": "Qwen/WorldPM-72B", - "developer": 
"Qwen", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6333 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7074 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3125 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6557 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8533 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9172 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3535 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/R-I-S-E/RISE-Judge-Qwen2.5-32B/d2132eea-eb88-41e5-b8e6-2e8e8a623ed1.json b/data/reward-bench/R-I-S-E/RISE-Judge-Qwen2.5-32B/d2132eea-eb88-41e5-b8e6-2e8e8a623ed1.json deleted file mode 100644 index d48e6bfec..000000000 --- a/data/reward-bench/R-I-S-E/RISE-Judge-Qwen2.5-32B/d2132eea-eb88-41e5-b8e6-2e8e8a623ed1.json +++ /dev/null @@ -1,112 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/R-I-S-E_RISE-Judge-Qwen2.5-32B/1766412838.146816", - "retrieved_timestamp": 
"1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "R-I-S-E/RISE-Judge-Qwen2.5-32B", - "id": "R-I-S-E/RISE-Judge-Qwen2.5-32B", - "developer": "R-I-S-E", - "additional_details": { - "model_type": "Generative" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9266 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9665 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8333 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9189 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9877 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/R-I-S-E/RISE-Judge-Qwen2.5-7B/ffd05bc7-3724-40ba-85b9-c25ebe71fba2.json b/data/reward-bench/R-I-S-E/RISE-Judge-Qwen2.5-7B/ffd05bc7-3724-40ba-85b9-c25ebe71fba2.json deleted file mode 100644 index 2418db79b..000000000 --- a/data/reward-bench/R-I-S-E/RISE-Judge-Qwen2.5-7B/ffd05bc7-3724-40ba-85b9-c25ebe71fba2.json +++ /dev/null @@ -1,112 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/R-I-S-E_RISE-Judge-Qwen2.5-7B/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "R-I-S-E/RISE-Judge-Qwen2.5-7B", - "id": "R-I-S-E/RISE-Judge-Qwen2.5-7B", - "developer": "R-I-S-E", - "additional_details": { - "model_type": "Generative" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - 
"metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8819 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9218 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7654 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8797 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9608 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/RLHFlow/ArmoRM-Llama3-8B-v0.1/43f0e93d-f0b8-46af-a549-e1ac315d96ea.json b/data/reward-bench/RLHFlow/ArmoRM-Llama3-8B-v0.1/43f0e93d-f0b8-46af-a549-e1ac315d96ea.json deleted file mode 100644 index 7370b5a14..000000000 --- a/data/reward-bench/RLHFlow/ArmoRM-Llama3-8B-v0.1/43f0e93d-f0b8-46af-a549-e1ac315d96ea.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/RLHFlow_ArmoRM-Llama3-8B-v0.1/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "RLHFlow/ArmoRM-Llama3-8B-v0.1", - "id": "RLHFlow/ArmoRM-Llama3-8B-v0.1", - "developer": "RLHFlow", - "additional_details": { - "model_type": "Custom Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6646 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual 
accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6568 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4188 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6612 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8222 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7657 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6629 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/RLHFlow/ArmoRM-Llama3-8B-v0.1/9ccab7bd-d2ed-4ab3-ad81-656650c29a3b.json b/data/reward-bench/RLHFlow/ArmoRM-Llama3-8B-v0.1/9ccab7bd-d2ed-4ab3-ad81-656650c29a3b.json deleted file mode 100644 index 41532e2cf..000000000 --- a/data/reward-bench/RLHFlow/ArmoRM-Llama3-8B-v0.1/9ccab7bd-d2ed-4ab3-ad81-656650c29a3b.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/RLHFlow_ArmoRM-Llama3-8B-v0.1/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "RLHFlow/ArmoRM-Llama3-8B-v0.1", - "id": "RLHFlow/ArmoRM-Llama3-8B-v0.1", - "developer": "RLHFlow", - "additional_details": { - "model_type": "Custom Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.886 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9693 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7675 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9054 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9735 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7429 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/RLHFlow/LLaMA3-iterative-DPO-final/c10d4213-f1fa-41e6-92d9-0d5337c1362b.json b/data/reward-bench/RLHFlow/LLaMA3-iterative-DPO-final/c10d4213-f1fa-41e6-92d9-0d5337c1362b.json deleted file mode 100644 index e8c6cfc5b..000000000 --- a/data/reward-bench/RLHFlow/LLaMA3-iterative-DPO-final/c10d4213-f1fa-41e6-92d9-0d5337c1362b.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/RLHFlow_LLaMA3-iterative-DPO-final/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "RLHFlow/LLaMA3-iterative-DPO-final", - "id": "RLHFlow/LLaMA3-iterative-DPO-final", - "developer": "RLHFlow", - "additional_details": { - "model_type": "DPO" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 
0.6783 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.838 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5921 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7865 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6161 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4392 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/RLHFlow/RewardModel-Mistral-7B-for-DPA-v1/63b08ba0-eeb9-48ae-a5d1-d7d3792aa1c0.json b/data/reward-bench/RLHFlow/RewardModel-Mistral-7B-for-DPA-v1/63b08ba0-eeb9-48ae-a5d1-d7d3792aa1c0.json deleted file mode 100644 index 35d7a9ebf..000000000 --- a/data/reward-bench/RLHFlow/RewardModel-Mistral-7B-for-DPA-v1/63b08ba0-eeb9-48ae-a5d1-d7d3792aa1c0.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/RLHFlow_RewardModel-Mistral-7B-for-DPA-v1/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "RLHFlow/RewardModel-Mistral-7B-for-DPA-v1", - "id": "RLHFlow/RewardModel-Mistral-7B-for-DPA-v1", - "developer": "RLHFlow", - "additional_details": { - "model_type": "Seq. 
Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6633 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8799 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4978 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7068 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5971 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6068 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/RLHFlow/pair-preference-model-LLaMA3-8B/d724076d-509f-4ad4-894c-976b0472de85.json b/data/reward-bench/RLHFlow/pair-preference-model-LLaMA3-8B/d724076d-509f-4ad4-894c-976b0472de85.json deleted file mode 100644 index ee0769a51..000000000 --- a/data/reward-bench/RLHFlow/pair-preference-model-LLaMA3-8B/d724076d-509f-4ad4-894c-976b0472de85.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/RLHFlow_pair-preference-model-LLaMA3-8B/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "RLHFlow/pair-preference-model-LLaMA3-8B", - "id": "RLHFlow/pair-preference-model-LLaMA3-8B", - "developer": "RLHFlow", - "additional_details": { - "model_type": "Custom Classifier" - } - }, - 
"evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8575 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9832 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6579 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8973 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9473 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7458 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/Ray2333/GRM-Gemma-2B-rewardmodel-ft/54d34f25-1cd9-4995-8e56-c36981842fc8.json b/data/reward-bench/Ray2333/GRM-Gemma-2B-rewardmodel-ft/54d34f25-1cd9-4995-8e56-c36981842fc8.json deleted file mode 100644 index 05c7b0ff2..000000000 --- a/data/reward-bench/Ray2333/GRM-Gemma-2B-rewardmodel-ft/54d34f25-1cd9-4995-8e56-c36981842fc8.json +++ /dev/null @@ -1,112 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/Ray2333_GRM-Gemma-2B-rewardmodel-ft/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Ray2333/GRM-Gemma-2B-rewardmodel-ft", - "id": "Ray2333/GRM-Gemma-2B-rewardmodel-ft", - "developer": "Ray2333", - "additional_details": { - "model_type": "Seq. 
Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8447 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8939 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7522 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8446 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8881 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/Ray2333/GRM-Gemma-2B-sftreg/63ae1c75-fd4d-4f40-afd0-b9f91d700014.json b/data/reward-bench/Ray2333/GRM-Gemma-2B-sftreg/63ae1c75-fd4d-4f40-afd0-b9f91d700014.json deleted file mode 100644 index 8a5814b51..000000000 --- a/data/reward-bench/Ray2333/GRM-Gemma-2B-sftreg/63ae1c75-fd4d-4f40-afd0-b9f91d700014.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/Ray2333_GRM-Gemma-2B-sftreg/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Ray2333/GRM-Gemma-2B-sftreg", - "id": "Ray2333/GRM-Gemma-2B-sftreg", - "developer": "Ray2333", - "additional_details": { - "model_type": "Seq. 
Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7451 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9553 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4868 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7932 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7684 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6983 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/Ray2333/GRM-Llama3-8B-rewardmodel-ft/1d5ebbce-8cfe-446b-82c0-a227d4e9247f.json b/data/reward-bench/Ray2333/GRM-Llama3-8B-rewardmodel-ft/1d5ebbce-8cfe-446b-82c0-a227d4e9247f.json deleted file mode 100644 index ae24803ba..000000000 --- a/data/reward-bench/Ray2333/GRM-Llama3-8B-rewardmodel-ft/1d5ebbce-8cfe-446b-82c0-a227d4e9247f.json +++ /dev/null @@ -1,112 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/Ray2333_GRM-Llama3-8B-rewardmodel-ft/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Ray2333/GRM-Llama3-8B-rewardmodel-ft", - "id": "Ray2333/GRM-Llama3-8B-rewardmodel-ft", - "developer": "Ray2333", - "additional_details": { - "model_type": "Seq. 
Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9154 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9553 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8618 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9081 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9362 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/Ray2333/GRM-Llama3-8B-rewardmodel-ft/3f9c81ac-5c76-43b4-a27d-7eaa055139c4.json b/data/reward-bench/Ray2333/GRM-Llama3-8B-rewardmodel-ft/3f9c81ac-5c76-43b4-a27d-7eaa055139c4.json deleted file mode 100644 index 2f035232b..000000000 --- a/data/reward-bench/Ray2333/GRM-Llama3-8B-rewardmodel-ft/3f9c81ac-5c76-43b4-a27d-7eaa055139c4.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/Ray2333_GRM-Llama3-8B-rewardmodel-ft/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Ray2333/GRM-Llama3-8B-rewardmodel-ft", - "id": "Ray2333/GRM-Llama3-8B-rewardmodel-ft", - "developer": "Ray2333", - "additional_details": { - "model_type": "Seq. 
Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6766 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6274 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.35 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5847 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9222 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8929 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6824 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/Ray2333/GRM-gemma2-2B-rewardmodel-ft/680098fb-76cf-47b6-a0ea-a1a06ca46dca.json b/data/reward-bench/Ray2333/GRM-gemma2-2B-rewardmodel-ft/680098fb-76cf-47b6-a0ea-a1a06ca46dca.json deleted file mode 100644 index fa7e8dccb..000000000 --- a/data/reward-bench/Ray2333/GRM-gemma2-2B-rewardmodel-ft/680098fb-76cf-47b6-a0ea-a1a06ca46dca.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/Ray2333_GRM-gemma2-2B-rewardmodel-ft/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - 
"source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Ray2333/GRM-gemma2-2B-rewardmodel-ft", - "id": "Ray2333/GRM-gemma2-2B-rewardmodel-ft", - "developer": "Ray2333", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5966 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5305 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3125 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5902 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9222 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7455 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4788 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/Ray2333/GRM-gemma2-2B-rewardmodel-ft/6ec21338-9908-4ce4-a1f2-dac14c5e27ab.json 
b/data/reward-bench/Ray2333/GRM-gemma2-2B-rewardmodel-ft/6ec21338-9908-4ce4-a1f2-dac14c5e27ab.json deleted file mode 100644 index 79c0a560c..000000000 --- a/data/reward-bench/Ray2333/GRM-gemma2-2B-rewardmodel-ft/6ec21338-9908-4ce4-a1f2-dac14c5e27ab.json +++ /dev/null @@ -1,112 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/Ray2333_GRM-gemma2-2B-rewardmodel-ft/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Ray2333/GRM-gemma2-2B-rewardmodel-ft", - "id": "Ray2333/GRM-gemma2-2B-rewardmodel-ft", - "developer": "Ray2333", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8839 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9302 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7719 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9216 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.912 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/Ray2333/GRM-llama3-8B-distill/592ad1e3-8a48-4c39-8013-81d7c731780f.json b/data/reward-bench/Ray2333/GRM-llama3-8B-distill/592ad1e3-8a48-4c39-8013-81d7c731780f.json deleted file mode 100644 index 3c94abda8..000000000 --- a/data/reward-bench/Ray2333/GRM-llama3-8B-distill/592ad1e3-8a48-4c39-8013-81d7c731780f.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/Ray2333_GRM-llama3-8B-distill/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - 
"source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Ray2333/GRM-llama3-8B-distill", - "id": "Ray2333/GRM-llama3-8B-distill", - "developer": "Ray2333", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8464 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9832 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6842 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8676 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9133 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7209 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/Ray2333/GRM-llama3-8B-distill/5b36f0af-7ff6-4564-9714-08fbf41d261f.json b/data/reward-bench/Ray2333/GRM-llama3-8B-distill/5b36f0af-7ff6-4564-9714-08fbf41d261f.json deleted file mode 100644 index 7a518a591..000000000 --- a/data/reward-bench/Ray2333/GRM-llama3-8B-distill/5b36f0af-7ff6-4564-9714-08fbf41d261f.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/Ray2333_GRM-llama3-8B-distill/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - 
"source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Ray2333/GRM-llama3-8B-distill", - "id": "Ray2333/GRM-llama3-8B-distill", - "developer": "Ray2333", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.589 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5874 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3875 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5902 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7222 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6727 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5743 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/Ray2333/GRM-llama3-8B-sftreg/04f120c6-b648-4c83-81d8-05118efb0904.json b/data/reward-bench/Ray2333/GRM-llama3-8B-sftreg/04f120c6-b648-4c83-81d8-05118efb0904.json deleted file mode 100644 index fd63fed4b..000000000 --- 
a/data/reward-bench/Ray2333/GRM-llama3-8B-sftreg/04f120c6-b648-4c83-81d8-05118efb0904.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/Ray2333_GRM-llama3-8B-sftreg/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Ray2333/GRM-llama3-8B-sftreg", - "id": "Ray2333/GRM-llama3-8B-sftreg", - "developer": "Ray2333", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8542 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.986 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6776 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8919 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9229 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7309 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/Ray2333/GRM-llama3-8B-sftreg/c907e494-ab2e-4a28-a28d-aeb68eb818ed.json b/data/reward-bench/Ray2333/GRM-llama3-8B-sftreg/c907e494-ab2e-4a28-a28d-aeb68eb818ed.json deleted file mode 100644 index c42486675..000000000 --- a/data/reward-bench/Ray2333/GRM-llama3-8B-sftreg/c907e494-ab2e-4a28-a28d-aeb68eb818ed.json +++ 
/dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/Ray2333_GRM-llama3-8B-sftreg/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Ray2333/GRM-llama3-8B-sftreg", - "id": "Ray2333/GRM-llama3-8B-sftreg", - "developer": "Ray2333", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6089 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6189 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3875 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5792 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7867 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6828 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5981 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": 
"allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/Ray2333/GRM-llama3.2-3B-rewardmodel-ft/d9eed240-ebbe-482f-8dae-c5251ed6d067.json b/data/reward-bench/Ray2333/GRM-llama3.2-3B-rewardmodel-ft/d9eed240-ebbe-482f-8dae-c5251ed6d067.json deleted file mode 100644 index ef4f104a0..000000000 --- a/data/reward-bench/Ray2333/GRM-llama3.2-3B-rewardmodel-ft/d9eed240-ebbe-482f-8dae-c5251ed6d067.json +++ /dev/null @@ -1,112 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/Ray2333_GRM-llama3.2-3B-rewardmodel-ft/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Ray2333/GRM-llama3.2-3B-rewardmodel-ft", - "id": "Ray2333/GRM-llama3.2-3B-rewardmodel-ft", - "developer": "Ray2333", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9092 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9162 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8487 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.927 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.945 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/Ray2333/Gemma-2B-rewardmodel-baseline/670865e1-f219-465b-9fbe-6da6f73ac9e6.json b/data/reward-bench/Ray2333/Gemma-2B-rewardmodel-baseline/670865e1-f219-465b-9fbe-6da6f73ac9e6.json deleted file mode 100644 index 429660d5b..000000000 --- a/data/reward-bench/Ray2333/Gemma-2B-rewardmodel-baseline/670865e1-f219-465b-9fbe-6da6f73ac9e6.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": 
"0.2.0", - "evaluation_id": "reward-bench/Ray2333_Gemma-2B-rewardmodel-baseline/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Ray2333/Gemma-2B-rewardmodel-baseline", - "id": "Ray2333/Gemma-2B-rewardmodel-baseline", - "developer": "Ray2333", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.729 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9413 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4693 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7865 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7384 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6897 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/Ray2333/Gemma-2B-rewardmodel-ft/88953298-b63e-499f-a31e-f0f586c4772d.json b/data/reward-bench/Ray2333/Gemma-2B-rewardmodel-ft/88953298-b63e-499f-a31e-f0f586c4772d.json deleted file mode 100644 index f7eece540..000000000 --- a/data/reward-bench/Ray2333/Gemma-2B-rewardmodel-ft/88953298-b63e-499f-a31e-f0f586c4772d.json +++ /dev/null @@ -1,112 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"reward-bench/Ray2333_Gemma-2B-rewardmodel-ft/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Ray2333/Gemma-2B-rewardmodel-ft", - "id": "Ray2333/Gemma-2B-rewardmodel-ft", - "developer": "Ray2333", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8048 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7793 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7478 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8527 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8393 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/Ray2333/reward-model-Mistral-7B-instruct-Unifie.../3acb690c-ffc0-4e67-8ae1-e79bcee4f824.json b/data/reward-bench/Ray2333/reward-model-Mistral-7B-instruct-Unifie.../3acb690c-ffc0-4e67-8ae1-e79bcee4f824.json deleted file mode 100644 index 9a4e578d4..000000000 --- a/data/reward-bench/Ray2333/reward-model-Mistral-7B-instruct-Unifie.../3acb690c-ffc0-4e67-8ae1-e79bcee4f824.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/Ray2333_reward-model-Mistral-7B-instruct-Unifie.../1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Ray2333/reward-model-Mistral-7B-instruct-Unifie...", 
- "id": "Ray2333/reward-model-Mistral-7B-instruct-Unifie...", - "developer": "Ray2333", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7661 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9777 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5066 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8527 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7389 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7434 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/SF-Foundation/TextEval-Llama3.1-70B/6ad2cb6a-f9a3-424e-aed2-9493899872e3.json b/data/reward-bench/SF-Foundation/TextEval-Llama3.1-70B/6ad2cb6a-f9a3-424e-aed2-9493899872e3.json deleted file mode 100644 index c50e15fdc..000000000 --- a/data/reward-bench/SF-Foundation/TextEval-Llama3.1-70B/6ad2cb6a-f9a3-424e-aed2-9493899872e3.json +++ /dev/null @@ -1,112 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/SF-Foundation_TextEval-Llama3.1-70B/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SF-Foundation/TextEval-Llama3.1-70B", - "id": 
"SF-Foundation/TextEval-Llama3.1-70B", - "developer": "SF-Foundation", - "additional_details": { - "model_type": "Generative" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9348 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9413 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9013 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9324 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9641 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/SF-Foundation/TextEval-OffsetBias-12B/1892bf75-916b-4d4f-96ab-fda36872ae5d.json b/data/reward-bench/SF-Foundation/TextEval-OffsetBias-12B/1892bf75-916b-4d4f-96ab-fda36872ae5d.json deleted file mode 100644 index b71080064..000000000 --- a/data/reward-bench/SF-Foundation/TextEval-OffsetBias-12B/1892bf75-916b-4d4f-96ab-fda36872ae5d.json +++ /dev/null @@ -1,112 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/SF-Foundation_TextEval-OffsetBias-12B/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SF-Foundation/TextEval-OffsetBias-12B", - "id": "SF-Foundation/TextEval-OffsetBias-12B", - "developer": "SF-Foundation", - "additional_details": { - "model_type": "Generative" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9105 - }, - "source_data": { - "dataset_name": "RewardBench", - 
"source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.919 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8662 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9203 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9365 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/Salesforce/SFR-LLaMa-3.1-70B-Judge-r/e06e1863-c28f-4c96-a672-b1073c80aa71.json b/data/reward-bench/Salesforce/SFR-LLaMa-3.1-70B-Judge-r/e06e1863-c28f-4c96-a672-b1073c80aa71.json deleted file mode 100644 index 49c043587..000000000 --- a/data/reward-bench/Salesforce/SFR-LLaMa-3.1-70B-Judge-r/e06e1863-c28f-4c96-a672-b1073c80aa71.json +++ /dev/null @@ -1,112 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/Salesforce_SFR-LLaMa-3.1-70B-Judge-r/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Salesforce/SFR-LLaMa-3.1-70B-Judge-r", - "id": "Salesforce/SFR-LLaMa-3.1-70B-Judge-r", - "developer": "Salesforce", - "additional_details": { - "model_type": "Generative" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9272 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9693 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - 
"evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8476 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9162 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9757 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/Salesforce/SFR-LLaMa-3.1-8B-Judge-r/d923f7aa-a9d4-406a-b5d7-bdab508f04f7.json b/data/reward-bench/Salesforce/SFR-LLaMa-3.1-8B-Judge-r/d923f7aa-a9d4-406a-b5d7-bdab508f04f7.json deleted file mode 100644 index deced96e1..000000000 --- a/data/reward-bench/Salesforce/SFR-LLaMa-3.1-8B-Judge-r/d923f7aa-a9d4-406a-b5d7-bdab508f04f7.json +++ /dev/null @@ -1,112 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/Salesforce_SFR-LLaMa-3.1-8B-Judge-r/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Salesforce/SFR-LLaMa-3.1-8B-Judge-r", - "id": "Salesforce/SFR-LLaMa-3.1-8B-Judge-r", - "developer": "Salesforce", - "additional_details": { - "model_type": "Generative" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8865 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9553 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7774 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": 
"Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8622 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9513 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/Salesforce/SFR-nemo-12B-Judge-r/5c5e40b1-e86a-4d30-b93c-f8f9e73cdca8.json b/data/reward-bench/Salesforce/SFR-nemo-12B-Judge-r/5c5e40b1-e86a-4d30-b93c-f8f9e73cdca8.json deleted file mode 100644 index 616e9bc30..000000000 --- a/data/reward-bench/Salesforce/SFR-nemo-12B-Judge-r/5c5e40b1-e86a-4d30-b93c-f8f9e73cdca8.json +++ /dev/null @@ -1,112 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/Salesforce_SFR-nemo-12B-Judge-r/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Salesforce/SFR-nemo-12B-Judge-r", - "id": "Salesforce/SFR-nemo-12B-Judge-r", - "developer": "Salesforce", - "additional_details": { - "model_type": "Generative" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9027 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9721 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8224 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8649 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9513 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/Schrieffer/Llama-SARM-4B/59299d8c-e468-490f-8a52-eef49b0aaeea.json b/data/reward-bench/Schrieffer/Llama-SARM-4B/59299d8c-e468-490f-8a52-eef49b0aaeea.json deleted file mode 100644 index 4492a4262..000000000 --- a/data/reward-bench/Schrieffer/Llama-SARM-4B/59299d8c-e468-490f-8a52-eef49b0aaeea.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/Schrieffer_Llama-SARM-4B/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Schrieffer/Llama-SARM-4B", - "id": "Schrieffer/Llama-SARM-4B", - "developer": "Schrieffer", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7379 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6874 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4281 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6448 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9178 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.9556 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7939 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/ShikaiChen/LDL-Reward-Gemma-2-27B-v0.1/3ce9612f-9b57-476e-9fa4-6e63f14568a7.json b/data/reward-bench/ShikaiChen/LDL-Reward-Gemma-2-27B-v0.1/3ce9612f-9b57-476e-9fa4-6e63f14568a7.json deleted file mode 100644 index 6723992e6..000000000 --- a/data/reward-bench/ShikaiChen/LDL-Reward-Gemma-2-27B-v0.1/3ce9612f-9b57-476e-9fa4-6e63f14568a7.json +++ /dev/null @@ -1,112 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/ShikaiChen_LDL-Reward-Gemma-2-27B-v0.1/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ShikaiChen/LDL-Reward-Gemma-2-27B-v0.1", - "id": "ShikaiChen/LDL-Reward-Gemma-2-27B-v0.1", - "developer": "ShikaiChen", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9499 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9637 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9079 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9378 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9903 - }, - "source_data": { - 
"dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/ShikaiChen/LDL-Reward-Gemma-2-27B-v0.1/9c605bf1-2533-43db-a610-e71c0aaecdb5.json b/data/reward-bench/ShikaiChen/LDL-Reward-Gemma-2-27B-v0.1/9c605bf1-2533-43db-a610-e71c0aaecdb5.json deleted file mode 100644 index e51beb588..000000000 --- a/data/reward-bench/ShikaiChen/LDL-Reward-Gemma-2-27B-v0.1/9c605bf1-2533-43db-a610-e71c0aaecdb5.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/ShikaiChen_LDL-Reward-Gemma-2-27B-v0.1/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ShikaiChen/LDL-Reward-Gemma-2-27B-v0.1", - "id": "ShikaiChen/LDL-Reward-Gemma-2-27B-v0.1", - "developer": "ShikaiChen", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7249 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7558 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.35 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6448 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9222 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9131 - }, - "source_data": { - 
"dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7633 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/Skywork/Skywork-Critic-Llama-3.1-70B/c289f778-92b8-44df-a079-3bced33c8ab5.json b/data/reward-bench/Skywork/Skywork-Critic-Llama-3.1-70B/c289f778-92b8-44df-a079-3bced33c8ab5.json deleted file mode 100644 index 7f469a316..000000000 --- a/data/reward-bench/Skywork/Skywork-Critic-Llama-3.1-70B/c289f778-92b8-44df-a079-3bced33c8ab5.json +++ /dev/null @@ -1,112 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/Skywork_Skywork-Critic-Llama-3.1-70B/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Skywork/Skywork-Critic-Llama-3.1-70B", - "id": "Skywork/Skywork-Critic-Llama-3.1-70B", - "developer": "Skywork", - "additional_details": { - "model_type": "Generative" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9331 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9665 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8794 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9311 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9554 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - 
"hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/Skywork/Skywork-Critic-Llama-3.1-8B/329d4101-e740-490c-9fbc-1708f76a2f61.json b/data/reward-bench/Skywork/Skywork-Critic-Llama-3.1-8B/329d4101-e740-490c-9fbc-1708f76a2f61.json deleted file mode 100644 index cf3327493..000000000 --- a/data/reward-bench/Skywork/Skywork-Critic-Llama-3.1-8B/329d4101-e740-490c-9fbc-1708f76a2f61.json +++ /dev/null @@ -1,112 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/Skywork_Skywork-Critic-Llama-3.1-8B/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Skywork/Skywork-Critic-Llama-3.1-8B", - "id": "Skywork/Skywork-Critic-Llama-3.1-8B", - "developer": "Skywork", - "additional_details": { - "model_type": "Generative" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8896 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9358 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8136 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9108 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.898 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/Skywork/Skywork-Reward-Gemma-2-27B-v0.2/3e87f52e-b136-4cb3-8cbb-d8d8a8571051.json b/data/reward-bench/Skywork/Skywork-Reward-Gemma-2-27B-v0.2/3e87f52e-b136-4cb3-8cbb-d8d8a8571051.json deleted file mode 100644 index 4ac0f4414..000000000 --- a/data/reward-bench/Skywork/Skywork-Reward-Gemma-2-27B-v0.2/3e87f52e-b136-4cb3-8cbb-d8d8a8571051.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - 
"evaluation_id": "reward-bench-2/Skywork_Skywork-Reward-Gemma-2-27B-v0.2/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Skywork/Skywork-Reward-Gemma-2-27B-v0.2", - "id": "Skywork/Skywork-Reward-Gemma-2-27B-v0.2", - "developer": "Skywork", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7531 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7674 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.375 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6721 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9689 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9172 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8182 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git 
a/data/reward-bench/Skywork/Skywork-Reward-Gemma-2-27B-v0.2/62b9adca-db38-46c0-a68a-ed7a8e735035.json b/data/reward-bench/Skywork/Skywork-Reward-Gemma-2-27B-v0.2/62b9adca-db38-46c0-a68a-ed7a8e735035.json deleted file mode 100644 index 5c04a1152..000000000 --- a/data/reward-bench/Skywork/Skywork-Reward-Gemma-2-27B-v0.2/62b9adca-db38-46c0-a68a-ed7a8e735035.json +++ /dev/null @@ -1,112 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/Skywork_Skywork-Reward-Gemma-2-27B-v0.2/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Skywork/Skywork-Reward-Gemma-2-27B-v0.2", - "id": "Skywork/Skywork-Reward-Gemma-2-27B-v0.2", - "developer": "Skywork", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9426 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9609 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8991 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9297 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9807 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/Skywork/Skywork-Reward-Gemma-2-27B/4d2f43eb-e6f3-4686-a9d9-6b6c6b68b86c.json b/data/reward-bench/Skywork/Skywork-Reward-Gemma-2-27B/4d2f43eb-e6f3-4686-a9d9-6b6c6b68b86c.json deleted file mode 100644 index 08b4c8323..000000000 --- a/data/reward-bench/Skywork/Skywork-Reward-Gemma-2-27B/4d2f43eb-e6f3-4686-a9d9-6b6c6b68b86c.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"reward-bench-2/Skywork_Skywork-Reward-Gemma-2-27B/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Skywork/Skywork-Reward-Gemma-2-27B", - "id": "Skywork/Skywork-Reward-Gemma-2-27B", - "developer": "Skywork", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7576 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7368 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4031 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7049 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9422 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9323 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8261 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git 
a/data/reward-bench/Skywork/Skywork-Reward-Gemma-2-27B/830df3fd-d479-4af8-a92b-93d82e804fec.json b/data/reward-bench/Skywork/Skywork-Reward-Gemma-2-27B/830df3fd-d479-4af8-a92b-93d82e804fec.json deleted file mode 100644 index 22de7e431..000000000 --- a/data/reward-bench/Skywork/Skywork-Reward-Gemma-2-27B/830df3fd-d479-4af8-a92b-93d82e804fec.json +++ /dev/null @@ -1,112 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/Skywork_Skywork-Reward-Gemma-2-27B/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Skywork/Skywork-Reward-Gemma-2-27B", - "id": "Skywork/Skywork-Reward-Gemma-2-27B", - "developer": "Skywork", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.938 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9581 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9145 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9189 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9606 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/Skywork/Skywork-Reward-Llama-3.1-8B-v0.2/0e6d85b8-aa37-448c-adb2-0da2bd13e322.json b/data/reward-bench/Skywork/Skywork-Reward-Llama-3.1-8B-v0.2/0e6d85b8-aa37-448c-adb2-0da2bd13e322.json deleted file mode 100644 index e5a811527..000000000 --- a/data/reward-bench/Skywork/Skywork-Reward-Llama-3.1-8B-v0.2/0e6d85b8-aa37-448c-adb2-0da2bd13e322.json +++ /dev/null @@ -1,112 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"reward-bench/Skywork_Skywork-Reward-Llama-3.1-8B-v0.2/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Skywork/Skywork-Reward-Llama-3.1-8B-v0.2", - "id": "Skywork/Skywork-Reward-Llama-3.1-8B-v0.2", - "developer": "Skywork", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9313 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9469 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8838 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.927 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9675 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/Skywork/Skywork-Reward-Llama-3.1-8B-v0.2/45f0bd9c-e939-4b83-a623-1db61f431500.json b/data/reward-bench/Skywork/Skywork-Reward-Llama-3.1-8B-v0.2/45f0bd9c-e939-4b83-a623-1db61f431500.json deleted file mode 100644 index 1941ebc04..000000000 --- a/data/reward-bench/Skywork/Skywork-Reward-Llama-3.1-8B-v0.2/45f0bd9c-e939-4b83-a623-1db61f431500.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/Skywork_Skywork-Reward-Llama-3.1-8B-v0.2/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Skywork/Skywork-Reward-Llama-3.1-8B-v0.2", - "id": 
"Skywork/Skywork-Reward-Llama-3.1-8B-v0.2", - "developer": "Skywork", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7175 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6968 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4062 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6011 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9422 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9414 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7169 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/Skywork/Skywork-Reward-Llama-3.1-8B/0f710903-7dd8-44ea-914d-d43bbfe894f1.json b/data/reward-bench/Skywork/Skywork-Reward-Llama-3.1-8B/0f710903-7dd8-44ea-914d-d43bbfe894f1.json deleted file mode 100644 index ba30f4f9f..000000000 --- a/data/reward-bench/Skywork/Skywork-Reward-Llama-3.1-8B/0f710903-7dd8-44ea-914d-d43bbfe894f1.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"reward-bench-2/Skywork_Skywork-Reward-Llama-3.1-8B/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Skywork/Skywork-Reward-Llama-3.1-8B", - "id": "Skywork/Skywork-Reward-Llama-3.1-8B", - "developer": "Skywork", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7314 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6989 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.425 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6284 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9333 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9616 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.741 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git 
a/data/reward-bench/Skywork/Skywork-Reward-Llama-3.1-8B/b9ddd960-f6f7-4962-8297-88ec7fbbbd1f.json b/data/reward-bench/Skywork/Skywork-Reward-Llama-3.1-8B/b9ddd960-f6f7-4962-8297-88ec7fbbbd1f.json deleted file mode 100644 index 03903b4ec..000000000 --- a/data/reward-bench/Skywork/Skywork-Reward-Llama-3.1-8B/b9ddd960-f6f7-4962-8297-88ec7fbbbd1f.json +++ /dev/null @@ -1,112 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/Skywork_Skywork-Reward-Llama-3.1-8B/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Skywork/Skywork-Reward-Llama-3.1-8B", - "id": "Skywork/Skywork-Reward-Llama-3.1-8B", - "developer": "Skywork", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9252 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9581 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8728 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9081 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.962 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/Skywork/Skywork-Reward-V2-Llama-3.1-8B/25a4520b-c780-45fc-a00f-36db1776c6a8.json b/data/reward-bench/Skywork/Skywork-Reward-V2-Llama-3.1-8B/25a4520b-c780-45fc-a00f-36db1776c6a8.json deleted file mode 100644 index b19a61534..000000000 --- a/data/reward-bench/Skywork/Skywork-Reward-V2-Llama-3.1-8B/25a4520b-c780-45fc-a00f-36db1776c6a8.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"reward-bench-2/Skywork_Skywork-Reward-V2-Llama-3.1-8B/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Skywork/Skywork-Reward-V2-Llama-3.1-8B", - "id": "Skywork/Skywork-Reward-V2-Llama-3.1-8B", - "developer": "Skywork", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8413 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8463 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6625 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.776 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9667 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9838 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8124 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git 
a/data/reward-bench/Skywork/Skywork-Reward-V2-Llama-3.2-1B/96d7e5c1-2f43-4f09-9702-0af090afa141.json b/data/reward-bench/Skywork/Skywork-Reward-V2-Llama-3.2-1B/96d7e5c1-2f43-4f09-9702-0af090afa141.json deleted file mode 100644 index 2ff90cff2..000000000 --- a/data/reward-bench/Skywork/Skywork-Reward-V2-Llama-3.2-1B/96d7e5c1-2f43-4f09-9702-0af090afa141.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/Skywork_Skywork-Reward-V2-Llama-3.2-1B/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Skywork/Skywork-Reward-V2-Llama-3.2-1B", - "id": "Skywork/Skywork-Reward-V2-Llama-3.2-1B", - "developer": "Skywork", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6438 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6084 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4562 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6011 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8733 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8929 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - 
"evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4306 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/Skywork/Skywork-Reward-V2-Llama-3.2-3B/5a47f8bd-401a-4b6b-91b0-9593b36e5996.json b/data/reward-bench/Skywork/Skywork-Reward-V2-Llama-3.2-3B/5a47f8bd-401a-4b6b-91b0-9593b36e5996.json deleted file mode 100644 index 9f8069f50..000000000 --- a/data/reward-bench/Skywork/Skywork-Reward-V2-Llama-3.2-3B/5a47f8bd-401a-4b6b-91b0-9593b36e5996.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/Skywork_Skywork-Reward-V2-Llama-3.2-3B/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Skywork/Skywork-Reward-V2-Llama-3.2-3B", - "id": "Skywork/Skywork-Reward-V2-Llama-3.2-3B", - "developer": "Skywork", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7466 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7621 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4562 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.694 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9311 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - 
} - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9596 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6768 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/Skywork/Skywork-Reward-V2-Qwen3-0.6B/c27e98d4-f5ea-48f9-babc-3ccda2d21d2a.json b/data/reward-bench/Skywork/Skywork-Reward-V2-Qwen3-0.6B/c27e98d4-f5ea-48f9-babc-3ccda2d21d2a.json deleted file mode 100644 index 44ea9887d..000000000 --- a/data/reward-bench/Skywork/Skywork-Reward-V2-Qwen3-0.6B/c27e98d4-f5ea-48f9-babc-3ccda2d21d2a.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/Skywork_Skywork-Reward-V2-Qwen3-0.6B/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Skywork/Skywork-Reward-V2-Qwen3-0.6B", - "id": "Skywork/Skywork-Reward-V2-Qwen3-0.6B", - "developer": "Skywork", - "additional_details": { - "model_type": "Seq. 
Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6125 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.58 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7158 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8444 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7949 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3397 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/Skywork/Skywork-Reward-V2-Qwen3-1.7B/060bf847-e7b5-4e30-934f-5306d01c499a.json b/data/reward-bench/Skywork/Skywork-Reward-V2-Qwen3-1.7B/060bf847-e7b5-4e30-934f-5306d01c499a.json deleted file mode 100644 index f670ad051..000000000 --- a/data/reward-bench/Skywork/Skywork-Reward-V2-Qwen3-1.7B/060bf847-e7b5-4e30-934f-5306d01c499a.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/Skywork_Skywork-Reward-V2-Qwen3-1.7B/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - 
"source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Skywork/Skywork-Reward-V2-Qwen3-1.7B", - "id": "Skywork/Skywork-Reward-V2-Qwen3-1.7B", - "developer": "Skywork", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6818 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6568 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4437 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7268 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8911 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8848 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4872 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/Skywork/Skywork-Reward-V2-Qwen3-4B/e648e6c2-18bb-49d7-b08f-47ce41a67d4f.json 
b/data/reward-bench/Skywork/Skywork-Reward-V2-Qwen3-4B/e648e6c2-18bb-49d7-b08f-47ce41a67d4f.json deleted file mode 100644 index 6f6900c68..000000000 --- a/data/reward-bench/Skywork/Skywork-Reward-V2-Qwen3-4B/e648e6c2-18bb-49d7-b08f-47ce41a67d4f.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/Skywork_Skywork-Reward-V2-Qwen3-4B/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Skywork/Skywork-Reward-V2-Qwen3-4B", - "id": "Skywork/Skywork-Reward-V2-Qwen3-4B", - "developer": "Skywork", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7551 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7737 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4625 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7322 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9222 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9657 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", 
- "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6743 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/Skywork/Skywork-Reward-V2-Qwen3-8B/537e92cb-25db-47f5-916a-6f666e14639a.json b/data/reward-bench/Skywork/Skywork-Reward-V2-Qwen3-8B/537e92cb-25db-47f5-916a-6f666e14639a.json deleted file mode 100644 index 1c01babb8..000000000 --- a/data/reward-bench/Skywork/Skywork-Reward-V2-Qwen3-8B/537e92cb-25db-47f5-916a-6f666e14639a.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/Skywork_Skywork-Reward-V2-Qwen3-8B/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Skywork/Skywork-Reward-V2-Qwen3-8B", - "id": "Skywork/Skywork-Reward-V2-Qwen3-8B", - "developer": "Skywork", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7837 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7989 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7705 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.94 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9636 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7294 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/Skywork/Skywork-VL-Reward-7B/e59ca33f-c6ce-44d4-9cb4-2fd65608313b.json b/data/reward-bench/Skywork/Skywork-VL-Reward-7B/e59ca33f-c6ce-44d4-9cb4-2fd65608313b.json deleted file mode 100644 index 47757e3b6..000000000 --- a/data/reward-bench/Skywork/Skywork-VL-Reward-7B/e59ca33f-c6ce-44d4-9cb4-2fd65608313b.json +++ /dev/null @@ -1,112 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/Skywork_Skywork-VL-Reward-7B/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Skywork/Skywork-VL-Reward-7B", - "id": "Skywork/Skywork-VL-Reward-7B", - "developer": "Skywork", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9007 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8994 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.875 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9108 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.9176 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/Skywork/Skywork-VL-Reward-7B/fc99848b-82c7-459e-8327-1867a332ff28.json b/data/reward-bench/Skywork/Skywork-VL-Reward-7B/fc99848b-82c7-459e-8327-1867a332ff28.json deleted file mode 100644 index adb50e622..000000000 --- a/data/reward-bench/Skywork/Skywork-VL-Reward-7B/fc99848b-82c7-459e-8327-1867a332ff28.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/Skywork_Skywork-VL-Reward-7B/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Skywork/Skywork-VL-Reward-7B", - "id": "Skywork/Skywork-VL-Reward-7B", - "developer": "Skywork", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6885 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6063 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.35 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6339 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8911 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8909 - }, - "source_data": { - 
"dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7586 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/SultanR/SmolTulu-1.7b-RM/357f4f03-9542-495f-b575-4274111bbe1f.json b/data/reward-bench/SultanR/SmolTulu-1.7b-RM/357f4f03-9542-495f-b575-4274111bbe1f.json deleted file mode 100644 index 16bfc7b82..000000000 --- a/data/reward-bench/SultanR/SmolTulu-1.7b-RM/357f4f03-9542-495f-b575-4274111bbe1f.json +++ /dev/null @@ -1,112 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/SultanR_SmolTulu-1.7b-RM/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "SultanR/SmolTulu-1.7b-RM", - "id": "SultanR/SmolTulu-1.7b-RM", - "developer": "SultanR", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5094 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.743 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4408 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5716 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2821 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git 
a/data/reward-bench/ZiyiYe/Con-J-Qwen2-7B/d78c42d6-fc0d-4719-bbb6-7a53dbb0d017.json b/data/reward-bench/ZiyiYe/Con-J-Qwen2-7B/d78c42d6-fc0d-4719-bbb6-7a53dbb0d017.json deleted file mode 100644 index 5ae0638d6..000000000 --- a/data/reward-bench/ZiyiYe/Con-J-Qwen2-7B/d78c42d6-fc0d-4719-bbb6-7a53dbb0d017.json +++ /dev/null @@ -1,112 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/ZiyiYe_Con-J-Qwen2-7B/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ZiyiYe/Con-J-Qwen2-7B", - "id": "ZiyiYe/Con-J-Qwen2-7B", - "developer": "ZiyiYe", - "additional_details": { - "model_type": "Generative" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8712 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.919 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8026 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8824 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8808 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/ai2/llama-2-chat-7b-nectar-3.8m.json/c94ddbe5-2bc0-4a33-b06b-10671fb22b70.json b/data/reward-bench/ai2/llama-2-chat-7b-nectar-3.8m.json/c94ddbe5-2bc0-4a33-b06b-10671fb22b70.json deleted file mode 100644 index 400642b9b..000000000 --- a/data/reward-bench/ai2/llama-2-chat-7b-nectar-3.8m.json/c94ddbe5-2bc0-4a33-b06b-10671fb22b70.json +++ /dev/null @@ -1,94 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/ai2_llama-2-chat-7b-nectar-3.8m.json/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - 
"source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ai2/llama-2-chat-7b-nectar-3.8m.json", - "id": "ai2/llama-2-chat-7b-nectar-3.8m.json", - "developer": "ai2", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5843 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8631 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2654 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6243 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/ai2/llama-2-chat-nectar-180k.json/cc2ac405-1710-46fa-aeba-dd86797c666c.json b/data/reward-bench/ai2/llama-2-chat-nectar-180k.json/cc2ac405-1710-46fa-aeba-dd86797c666c.json deleted file mode 100644 index 26ba58fae..000000000 --- a/data/reward-bench/ai2/llama-2-chat-nectar-180k.json/cc2ac405-1710-46fa-aeba-dd86797c666c.json +++ /dev/null @@ -1,94 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/ai2_llama-2-chat-nectar-180k.json/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ai2/llama-2-chat-nectar-180k.json", - "id": "ai2/llama-2-chat-nectar-180k.json", - "developer": "ai2", - "additional_details": { - "model_type": "Seq. 
Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5235 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8827 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2851 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4027 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/ai2/llama-2-chat-ultrafeedback-60k.jsonl/49fcb3e2-2883-4c3d-b519-d511c6b10162.json b/data/reward-bench/ai2/llama-2-chat-ultrafeedback-60k.jsonl/49fcb3e2-2883-4c3d-b519-d511c6b10162.json deleted file mode 100644 index 4b539edf5..000000000 --- a/data/reward-bench/ai2/llama-2-chat-ultrafeedback-60k.jsonl/49fcb3e2-2883-4c3d-b519-d511c6b10162.json +++ /dev/null @@ -1,94 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/ai2_llama-2-chat-ultrafeedback-60k.jsonl/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ai2/llama-2-chat-ultrafeedback-60k.jsonl", - "id": "ai2/llama-2-chat-ultrafeedback-60k.jsonl", - "developer": "ai2", - "additional_details": { - "model_type": "Seq. 
Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.644 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9441 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4539 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5338 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check.../0ba5ce6c-f311-4b02-a67a-d49539119a8e.json b/data/reward-bench/ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check.../0ba5ce6c-f311-4b02-a67a-d49539119a8e.json deleted file mode 100644 index 277116c3d..000000000 --- a/data/reward-bench/ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check.../0ba5ce6c-f311-4b02-a67a-d49539119a8e.json +++ /dev/null @@ -1,94 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/ai2_tulu-2-7b-rm-v0-nectar-binarized-3.8m-check.../1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check...", - "id": "ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check...", - "developer": "ai2", - "additional_details": { - "model_type": "Seq. 
Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7058 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9525 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3947 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7703 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check.../49029c9e-a831-4219-8e26-df20862ad3e1.json b/data/reward-bench/ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check.../49029c9e-a831-4219-8e26-df20862ad3e1.json deleted file mode 100644 index dfc9008b5..000000000 --- a/data/reward-bench/ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check.../49029c9e-a831-4219-8e26-df20862ad3e1.json +++ /dev/null @@ -1,94 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/ai2_tulu-2-7b-rm-v0-nectar-binarized-3.8m-check.../1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check...", - "id": "ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check...", - "developer": "ai2", - "additional_details": { - "model_type": "Seq. 
Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7004 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9413 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3882 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7716 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check.../6dedd117-eab0-4c31-b50b-4890099d9904.json b/data/reward-bench/ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check.../6dedd117-eab0-4c31-b50b-4890099d9904.json deleted file mode 100644 index 974cd9980..000000000 --- a/data/reward-bench/ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check.../6dedd117-eab0-4c31-b50b-4890099d9904.json +++ /dev/null @@ -1,94 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/ai2_tulu-2-7b-rm-v0-nectar-binarized-3.8m-check.../1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check...", - "id": "ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check...", - "developer": "ai2", - "additional_details": { - "model_type": "Seq. 
Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6905 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9441 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3596 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7676 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check.../71c20c06-efb8-428e-9e9d-e4fedf11041a.json b/data/reward-bench/ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check.../71c20c06-efb8-428e-9e9d-e4fedf11041a.json deleted file mode 100644 index fb2652a50..000000000 --- a/data/reward-bench/ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check.../71c20c06-efb8-428e-9e9d-e4fedf11041a.json +++ /dev/null @@ -1,94 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/ai2_tulu-2-7b-rm-v0-nectar-binarized-3.8m-check.../1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check...", - "id": "ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check...", - "developer": "ai2", - "additional_details": { - "model_type": "Seq. 
Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6945 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9385 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3706 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7743 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check.../862f3d57-8f5f-4372-b6fb-876fb35efba4.json b/data/reward-bench/ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check.../862f3d57-8f5f-4372-b6fb-876fb35efba4.json deleted file mode 100644 index a8d6993af..000000000 --- a/data/reward-bench/ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check.../862f3d57-8f5f-4372-b6fb-876fb35efba4.json +++ /dev/null @@ -1,94 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/ai2_tulu-2-7b-rm-v0-nectar-binarized-3.8m-check.../1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check...", - "id": "ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check...", - "developer": "ai2", - "additional_details": { - "model_type": "Seq. 
Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6808 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9302 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3596 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7527 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check.../93ea2bfa-e058-42d5-afac-0d3fc50fce91.json b/data/reward-bench/ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check.../93ea2bfa-e058-42d5-afac-0d3fc50fce91.json deleted file mode 100644 index 4d645ea3b..000000000 --- a/data/reward-bench/ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check.../93ea2bfa-e058-42d5-afac-0d3fc50fce91.json +++ /dev/null @@ -1,94 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/ai2_tulu-2-7b-rm-v0-nectar-binarized-3.8m-check.../1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check...", - "id": "ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check...", - "developer": "ai2", - "additional_details": { - "model_type": "Seq. 
Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6895 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9385 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3706 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7595 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check.../c1331fa1-7793-4526-b24b-02261bb4437f.json b/data/reward-bench/ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check.../c1331fa1-7793-4526-b24b-02261bb4437f.json deleted file mode 100644 index f0096c309..000000000 --- a/data/reward-bench/ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check.../c1331fa1-7793-4526-b24b-02261bb4437f.json +++ /dev/null @@ -1,94 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/ai2_tulu-2-7b-rm-v0-nectar-binarized-3.8m-check.../1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check...", - "id": "ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check...", - "developer": "ai2", - "additional_details": { - "model_type": "Seq. 
Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7019 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9497 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.375 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7811 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check.../c3cab72a-47b3-47ec-bb2d-986903ab8c26.json b/data/reward-bench/ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check.../c3cab72a-47b3-47ec-bb2d-986903ab8c26.json deleted file mode 100644 index 7ccfca2e6..000000000 --- a/data/reward-bench/ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check.../c3cab72a-47b3-47ec-bb2d-986903ab8c26.json +++ /dev/null @@ -1,94 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/ai2_tulu-2-7b-rm-v0-nectar-binarized-3.8m-check.../1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check...", - "id": "ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check...", - "developer": "ai2", - "additional_details": { - "model_type": "Seq. 
Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7008 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9385 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3882 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7757 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check.../cd0452a7-0370-4024-a51f-b3deff290db9.json b/data/reward-bench/ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check.../cd0452a7-0370-4024-a51f-b3deff290db9.json deleted file mode 100644 index dddd173cb..000000000 --- a/data/reward-bench/ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check.../cd0452a7-0370-4024-a51f-b3deff290db9.json +++ /dev/null @@ -1,94 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/ai2_tulu-2-7b-rm-v0-nectar-binarized-3.8m-check.../1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check...", - "id": "ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check...", - "developer": "ai2", - "additional_details": { - "model_type": "Seq. 
Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6924 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9441 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3575 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7757 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/ai2/tulu-2-7b-rm-v0-nectar-binarized-700k.json/6fd85045-d600-451f-8d27-da637add4081.json b/data/reward-bench/ai2/tulu-2-7b-rm-v0-nectar-binarized-700k.json/6fd85045-d600-451f-8d27-da637add4081.json deleted file mode 100644 index fbf11359b..000000000 --- a/data/reward-bench/ai2/tulu-2-7b-rm-v0-nectar-binarized-700k.json/6fd85045-d600-451f-8d27-da637add4081.json +++ /dev/null @@ -1,94 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/ai2_tulu-2-7b-rm-v0-nectar-binarized-700k.json/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ai2/tulu-2-7b-rm-v0-nectar-binarized-700k.json", - "id": "ai2/tulu-2-7b-rm-v0-nectar-binarized-700k.json", - "developer": "ai2", - "additional_details": { - "model_type": "Seq. 
Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7127 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9358 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4079 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7946 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/ai2/tulu-2-7b-rm-v0-nectar-binarized.json/a15ca8c3-fd90-4ef9-80c5-40eeac60d785.json b/data/reward-bench/ai2/tulu-2-7b-rm-v0-nectar-binarized.json/a15ca8c3-fd90-4ef9-80c5-40eeac60d785.json deleted file mode 100644 index 3770ce48e..000000000 --- a/data/reward-bench/ai2/tulu-2-7b-rm-v0-nectar-binarized.json/a15ca8c3-fd90-4ef9-80c5-40eeac60d785.json +++ /dev/null @@ -1,94 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/ai2_tulu-2-7b-rm-v0-nectar-binarized.json/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ai2/tulu-2-7b-rm-v0-nectar-binarized.json", - "id": "ai2/tulu-2-7b-rm-v0-nectar-binarized.json", - "developer": "ai2", - "additional_details": { - "model_type": "Seq. 
Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6756 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9134 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3904 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.723 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/ai2/tulu-2-7b-rm-v0.json/5f43832f-14fa-49e1-a851-949163aec826.json b/data/reward-bench/ai2/tulu-2-7b-rm-v0.json/5f43832f-14fa-49e1-a851-949163aec826.json deleted file mode 100644 index ffe63e3e4..000000000 --- a/data/reward-bench/ai2/tulu-2-7b-rm-v0.json/5f43832f-14fa-49e1-a851-949163aec826.json +++ /dev/null @@ -1,94 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/ai2_tulu-2-7b-rm-v0.json/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "ai2/tulu-2-7b-rm-v0.json", - "id": "ai2/tulu-2-7b-rm-v0.json", - "developer": "ai2", - "additional_details": { - "model_type": "Seq. 
Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6655 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.933 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4539 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6095 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/allenai/Llama-3.1-70B-Instruct-RM-RB2/1f8869e7-e434-469e-906d-d34621582cba.json b/data/reward-bench/allenai/Llama-3.1-70B-Instruct-RM-RB2/1f8869e7-e434-469e-906d-d34621582cba.json deleted file mode 100644 index fef53f2e4..000000000 --- a/data/reward-bench/allenai/Llama-3.1-70B-Instruct-RM-RB2/1f8869e7-e434-469e-906d-d34621582cba.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_Llama-3.1-70B-Instruct-RM-RB2/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/Llama-3.1-70B-Instruct-RM-RB2", - "id": "allenai/Llama-3.1-70B-Instruct-RM-RB2", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. 
Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7606 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8126 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4188 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6995 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8844 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8646 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8835 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/Llama-3.1-70B-Instruct-RM-RB2/8f9d05db-9bb0-4998-bc75-96dbfa695548.json b/data/reward-bench/allenai/Llama-3.1-70B-Instruct-RM-RB2/8f9d05db-9bb0-4998-bc75-96dbfa695548.json deleted file mode 100644 index 419aa0a24..000000000 --- a/data/reward-bench/allenai/Llama-3.1-70B-Instruct-RM-RB2/8f9d05db-9bb0-4998-bc75-96dbfa695548.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/allenai_Llama-3.1-70B-Instruct-RM-RB2/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { 
- "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/Llama-3.1-70B-Instruct-RM-RB2", - "id": "allenai/Llama-3.1-70B-Instruct-RM-RB2", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9021 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9665 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8355 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9095 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8969 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/allenai/Llama-3.1-8B-Base-RM-RB2/2681e475-da0a-48a9-ab68-e0bf59240f90.json b/data/reward-bench/allenai/Llama-3.1-8B-Base-RM-RB2/2681e475-da0a-48a9-ab68-e0bf59240f90.json deleted file mode 100644 index 77a854ced..000000000 --- a/data/reward-bench/allenai/Llama-3.1-8B-Base-RM-RB2/2681e475-da0a-48a9-ab68-e0bf59240f90.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_Llama-3.1-8B-Base-RM-RB2/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", 
- "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/Llama-3.1-8B-Base-RM-RB2", - "id": "allenai/Llama-3.1-8B-Base-RM-RB2", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.649 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.72 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3625 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.612 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8267 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8323 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5406 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/Llama-3.1-8B-Base-RM-RB2/e2986d78-100d-417a-9f38-9a570a335d95.json b/data/reward-bench/allenai/Llama-3.1-8B-Base-RM-RB2/e2986d78-100d-417a-9f38-9a570a335d95.json deleted file mode 100644 index a01839a2f..000000000 --- 
a/data/reward-bench/allenai/Llama-3.1-8B-Base-RM-RB2/e2986d78-100d-417a-9f38-9a570a335d95.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/allenai_Llama-3.1-8B-Base-RM-RB2/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/Llama-3.1-8B-Base-RM-RB2", - "id": "allenai/Llama-3.1-8B-Base-RM-RB2", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8463 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.933 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7785 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8851 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7886 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/allenai/Llama-3.1-8B-Instruct-RM-RB2/1bc5cd51-5a3a-46ea-bc78-56f9b3081f69.json b/data/reward-bench/allenai/Llama-3.1-8B-Instruct-RM-RB2/1bc5cd51-5a3a-46ea-bc78-56f9b3081f69.json deleted file mode 100644 index 7175068fd..000000000 --- 
a/data/reward-bench/allenai/Llama-3.1-8B-Instruct-RM-RB2/1bc5cd51-5a3a-46ea-bc78-56f9b3081f69.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/allenai_Llama-3.1-8B-Instruct-RM-RB2/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/Llama-3.1-8B-Instruct-RM-RB2", - "id": "allenai/Llama-3.1-8B-Instruct-RM-RB2", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8885 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9581 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8158 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8932 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.887 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/allenai/Llama-3.1-8B-Instruct-RM-RB2/1d1127ee-7a0e-4915-b8bf-0b22f8ba338b.json b/data/reward-bench/allenai/Llama-3.1-8B-Instruct-RM-RB2/1d1127ee-7a0e-4915-b8bf-0b22f8ba338b.json deleted file mode 100644 index 095adf95a..000000000 --- 
a/data/reward-bench/allenai/Llama-3.1-8B-Instruct-RM-RB2/1d1127ee-7a0e-4915-b8bf-0b22f8ba338b.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_Llama-3.1-8B-Instruct-RM-RB2/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/Llama-3.1-8B-Instruct-RM-RB2", - "id": "allenai/Llama-3.1-8B-Instruct-RM-RB2", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7285 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7432 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4437 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6175 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8956 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9071 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7638 - }, 
- "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/Llama-3.1-Tulu-3-70B-SFT-RM-RB2/4bb55ff5-5adf-407f-a9d6-910c6c9d2770.json b/data/reward-bench/allenai/Llama-3.1-Tulu-3-70B-SFT-RM-RB2/4bb55ff5-5adf-407f-a9d6-910c6c9d2770.json deleted file mode 100644 index f3bf51149..000000000 --- a/data/reward-bench/allenai/Llama-3.1-Tulu-3-70B-SFT-RM-RB2/4bb55ff5-5adf-407f-a9d6-910c6c9d2770.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_Llama-3.1-Tulu-3-70B-SFT-RM-RB2/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/Llama-3.1-Tulu-3-70B-SFT-RM-RB2", - "id": "allenai/Llama-3.1-Tulu-3-70B-SFT-RM-RB2", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.722 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8084 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3688 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6776 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8689 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.7778 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8308 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/Llama-3.1-Tulu-3-70B-SFT-RM-RB2/daebee0b-3856-4270-94c6-c14bd84f5cf5.json b/data/reward-bench/allenai/Llama-3.1-Tulu-3-70B-SFT-RM-RB2/daebee0b-3856-4270-94c6-c14bd84f5cf5.json deleted file mode 100644 index d0492cb5e..000000000 --- a/data/reward-bench/allenai/Llama-3.1-Tulu-3-70B-SFT-RM-RB2/daebee0b-3856-4270-94c6-c14bd84f5cf5.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/allenai_Llama-3.1-Tulu-3-70B-SFT-RM-RB2/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/Llama-3.1-Tulu-3-70B-SFT-RM-RB2", - "id": "allenai/Llama-3.1-Tulu-3-70B-SFT-RM-RB2", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8892 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9693 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8268 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9027 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8583 - }, - "source_data": { 
- "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/allenai/Llama-3.1-Tulu-3-8B-DPO-RM-RB2/1be99417-352e-4a94-8108-b43123553667.json b/data/reward-bench/allenai/Llama-3.1-Tulu-3-8B-DPO-RM-RB2/1be99417-352e-4a94-8108-b43123553667.json deleted file mode 100644 index 042aac2cb..000000000 --- a/data/reward-bench/allenai/Llama-3.1-Tulu-3-8B-DPO-RM-RB2/1be99417-352e-4a94-8108-b43123553667.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_Llama-3.1-Tulu-3-8B-DPO-RM-RB2/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/Llama-3.1-Tulu-3-8B-DPO-RM-RB2", - "id": "allenai/Llama-3.1-Tulu-3-8B-DPO-RM-RB2", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.687 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7516 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3875 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6284 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.86 - }, - 
"source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8545 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6397 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/Llama-3.1-Tulu-3-8B-DPO-RM-RB2/8d3fbc68-2ee7-4989-a40c-f4a45e579b5c.json b/data/reward-bench/allenai/Llama-3.1-Tulu-3-8B-DPO-RM-RB2/8d3fbc68-2ee7-4989-a40c-f4a45e579b5c.json deleted file mode 100644 index 8adafbc18..000000000 --- a/data/reward-bench/allenai/Llama-3.1-Tulu-3-8B-DPO-RM-RB2/8d3fbc68-2ee7-4989-a40c-f4a45e579b5c.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/allenai_Llama-3.1-Tulu-3-8B-DPO-RM-RB2/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/Llama-3.1-Tulu-3-8B-DPO-RM-RB2", - "id": "allenai/Llama-3.1-Tulu-3-8B-DPO-RM-RB2", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. 
Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8431 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9553 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.761 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8662 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7898 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/allenai/Llama-3.1-Tulu-3-8B-RL-RM-RB2/9533891f-c2f7-4e82-9f39-131768dbc28a.json b/data/reward-bench/allenai/Llama-3.1-Tulu-3-8B-RL-RM-RB2/9533891f-c2f7-4e82-9f39-131768dbc28a.json deleted file mode 100644 index 98a6ce817..000000000 --- a/data/reward-bench/allenai/Llama-3.1-Tulu-3-8B-RL-RM-RB2/9533891f-c2f7-4e82-9f39-131768dbc28a.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/allenai_Llama-3.1-Tulu-3-8B-RL-RM-RB2/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/Llama-3.1-Tulu-3-8B-RL-RM-RB2", - "id": "allenai/Llama-3.1-Tulu-3-8B-RL-RM-RB2", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. 
Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8369 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9469 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7588 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8703 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7715 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/allenai/Llama-3.1-Tulu-3-8B-RL-RM-RB2/b8a47660-f0a5-4136-a743-979863c53e3a.json b/data/reward-bench/allenai/Llama-3.1-Tulu-3-8B-RL-RM-RB2/b8a47660-f0a5-4136-a743-979863c53e3a.json deleted file mode 100644 index d3b513f5d..000000000 --- a/data/reward-bench/allenai/Llama-3.1-Tulu-3-8B-RL-RM-RB2/b8a47660-f0a5-4136-a743-979863c53e3a.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_Llama-3.1-Tulu-3-8B-RL-RM-RB2/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/Llama-3.1-Tulu-3-8B-RL-RM-RB2", - "id": "allenai/Llama-3.1-Tulu-3-8B-RL-RM-RB2", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. 
Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6871 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7642 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6175 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8644 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8485 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6281 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/Llama-3.1-Tulu-3-8B-RM/2673bea2-42eb-42a5-9dc2-13d43341c9b2.json b/data/reward-bench/allenai/Llama-3.1-Tulu-3-8B-RM/2673bea2-42eb-42a5-9dc2-13d43341c9b2.json deleted file mode 100644 index e3e043728..000000000 --- a/data/reward-bench/allenai/Llama-3.1-Tulu-3-8B-RM/2673bea2-42eb-42a5-9dc2-13d43341c9b2.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_Llama-3.1-Tulu-3-8B-RM/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": 
"RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/Llama-3.1-Tulu-3-8B-RM", - "id": "allenai/Llama-3.1-Tulu-3-8B-RM", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.59 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7453 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3469 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6448 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7422 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5364 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5243 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/Llama-3.1-Tulu-3-8B-SFT-RM-RB2/6f5555c2-588a-48d1-811c-be53634bbdef.json 
b/data/reward-bench/allenai/Llama-3.1-Tulu-3-8B-SFT-RM-RB2/6f5555c2-588a-48d1-811c-be53634bbdef.json deleted file mode 100644 index 44e1a6e59..000000000 --- a/data/reward-bench/allenai/Llama-3.1-Tulu-3-8B-SFT-RM-RB2/6f5555c2-588a-48d1-811c-be53634bbdef.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/allenai_Llama-3.1-Tulu-3-8B-SFT-RM-RB2/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/Llama-3.1-Tulu-3-8B-SFT-RM-RB2", - "id": "allenai/Llama-3.1-Tulu-3-8B-SFT-RM-RB2", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8551 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9497 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7917 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8784 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8005 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/allenai/Llama-3.1-Tulu-3-8B-SFT-RM-RB2/9c96fa7b-52e8-4aed-9fdd-f389091d5e6f.json 
b/data/reward-bench/allenai/Llama-3.1-Tulu-3-8B-SFT-RM-RB2/9c96fa7b-52e8-4aed-9fdd-f389091d5e6f.json deleted file mode 100644 index 674d59e88..000000000 --- a/data/reward-bench/allenai/Llama-3.1-Tulu-3-8B-SFT-RM-RB2/9c96fa7b-52e8-4aed-9fdd-f389091d5e6f.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_Llama-3.1-Tulu-3-8B-SFT-RM-RB2/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/Llama-3.1-Tulu-3-8B-SFT-RM-RB2", - "id": "allenai/Llama-3.1-Tulu-3-8B-SFT-RM-RB2", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6821 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7326 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3875 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5792 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8978 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8889 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to 
identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6063 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/OLMo-7B-Instruct/0519d9fb-f220-40ab-8257-f20ed98a8b47.json b/data/reward-bench/allenai/OLMo-7B-Instruct/0519d9fb-f220-40ab-8257-f20ed98a8b47.json deleted file mode 100644 index f8ff8a104..000000000 --- a/data/reward-bench/allenai/OLMo-7B-Instruct/0519d9fb-f220-40ab-8257-f20ed98a8b47.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/allenai_OLMo-7B-Instruct/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/OLMo-7B-Instruct", - "id": "allenai/OLMo-7B-Instruct", - "developer": "allenai", - "additional_details": { - "model_type": "DPO" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6727 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8966 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5066 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6486 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7168 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.5173 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/allenai/llama-3-tulu-2-70b-uf-mean-rm/ece70375-447f-41e8-aa03-8f4b26abea73.json b/data/reward-bench/allenai/llama-3-tulu-2-70b-uf-mean-rm/ece70375-447f-41e8-aa03-8f4b26abea73.json deleted file mode 100644 index 5110dfc2e..000000000 --- a/data/reward-bench/allenai/llama-3-tulu-2-70b-uf-mean-rm/ece70375-447f-41e8-aa03-8f4b26abea73.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/allenai_llama-3-tulu-2-70b-uf-mean-rm/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/llama-3-tulu-2-70b-uf-mean-rm", - "id": "allenai/llama-3-tulu-2-70b-uf-mean-rm", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7019 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8631 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5614 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6095 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8268 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5957 - }, - 
"source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/allenai/llama-3-tulu-2-8b-uf-mean-rm/7bbaffdd-f822-48cf-a0f2-e66b16db678d.json b/data/reward-bench/allenai/llama-3-tulu-2-8b-uf-mean-rm/7bbaffdd-f822-48cf-a0f2-e66b16db678d.json deleted file mode 100644 index e1917bdfd..000000000 --- a/data/reward-bench/allenai/llama-3-tulu-2-8b-uf-mean-rm/7bbaffdd-f822-48cf-a0f2-e66b16db678d.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/allenai_llama-3-tulu-2-8b-uf-mean-rm/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/llama-3-tulu-2-8b-uf-mean-rm", - "id": "allenai/llama-3-tulu-2-8b-uf-mean-rm", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7342 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9525 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5921 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6162 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8212 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6434 - }, - "source_data": { - "dataset_name": "RewardBench", - 
"source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/allenai/llama-3-tulu-2-dpo-70b/27c5c441-64ce-41dd-8384-f84c8f6ccc14.json b/data/reward-bench/allenai/llama-3-tulu-2-dpo-70b/27c5c441-64ce-41dd-8384-f84c8f6ccc14.json deleted file mode 100644 index bef93ec9e..000000000 --- a/data/reward-bench/allenai/llama-3-tulu-2-dpo-70b/27c5c441-64ce-41dd-8384-f84c8f6ccc14.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/allenai_llama-3-tulu-2-dpo-70b/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/llama-3-tulu-2-dpo-70b", - "id": "allenai/llama-3-tulu-2-dpo-70b", - "developer": "allenai", - "additional_details": { - "model_type": "DPO" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7496 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9637 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5746 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7486 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.802 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5687 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git 
a/data/reward-bench/allenai/llama-3-tulu-2-dpo-8b/38a14e6a-2094-4e0b-be22-45181ede2a63.json b/data/reward-bench/allenai/llama-3-tulu-2-dpo-8b/38a14e6a-2094-4e0b-be22-45181ede2a63.json deleted file mode 100644 index a54ed9cc6..000000000 --- a/data/reward-bench/allenai/llama-3-tulu-2-dpo-8b/38a14e6a-2094-4e0b-be22-45181ede2a63.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/allenai_llama-3-tulu-2-dpo-8b/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/llama-3-tulu-2-dpo-8b", - "id": "allenai/llama-3-tulu-2-dpo-8b", - "developer": "allenai", - "additional_details": { - "model_type": "DPO" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7275 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9525 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5351 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6649 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8663 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5097 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git 
a/data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1739590997/cee37c2c-2766-47b7-9192-a141e5d22f2d.json b/data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1739590997/cee37c2c-2766-47b7-9192-a141e5d22f2d.json deleted file mode 100644 index 264f422e1..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1739590997/cee37c2c-2766-47b7-9192-a141e5d22f2d.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-reward_modeling__1__1739590997/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-reward_modeling__1__1739590997", - "id": "allenai/open_instruct_dev-reward_modeling__1__1739590997", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6004 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7032 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.375 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.623 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7867 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.598 - }, - "source_data": { - "dataset_name": 
"RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5165 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1739871066/d1d69392-8717-462d-9ce0-c7ddf5faf97d.json b/data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1739871066/d1d69392-8717-462d-9ce0-c7ddf5faf97d.json deleted file mode 100644 index 1d7e43d9e..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1739871066/d1d69392-8717-462d-9ce0-c7ddf5faf97d.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-reward_modeling__1__1739871066/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-reward_modeling__1__1739871066", - "id": "allenai/open_instruct_dev-reward_modeling__1__1739871066", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6012 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6989 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.425 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6284 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7978 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.604 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4527 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1739925892/72071bb1-57c0-4727-8100-ba24d8da10f5.json b/data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1739925892/72071bb1-57c0-4727-8100-ba24d8da10f5.json deleted file mode 100644 index ccb6f9252..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1739925892/72071bb1-57c0-4727-8100-ba24d8da10f5.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-reward_modeling__1__1739925892/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-reward_modeling__1__1739925892", - "id": "allenai/open_instruct_dev-reward_modeling__1__1739925892", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. 
Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6345 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7432 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3563 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.623 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8111 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7131 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5606 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1739943850/7626c158-edaf-48f3-9ac3-1188be0c6032.json b/data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1739943850/7626c158-edaf-48f3-9ac3-1188be0c6032.json deleted file mode 100644 index 8e0cbdf9e..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1739943850/7626c158-edaf-48f3-9ac3-1188be0c6032.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"reward-bench-2/allenai_open_instruct_dev-reward_modeling__1__1739943850/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-reward_modeling__1__1739943850", - "id": "allenai/open_instruct_dev-reward_modeling__1__1739943850", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4978 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5726 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3125 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5191 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6489 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6222 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3114 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": 
"allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1739943881/c37be7a8-dc10-4fea-962b-202986a4581e.json b/data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1739943881/c37be7a8-dc10-4fea-962b-202986a4581e.json deleted file mode 100644 index b0fd4a9be..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1739943881/c37be7a8-dc10-4fea-962b-202986a4581e.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-reward_modeling__1__1739943881/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-reward_modeling__1__1739943881", - "id": "allenai/open_instruct_dev-reward_modeling__1__1739943881", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5998 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7032 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3187 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5792 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8222 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.6727 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5025 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1739943972/223dc616-b20f-4065-91a7-3c35bfd11c94.json b/data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1739943972/223dc616-b20f-4065-91a7-3c35bfd11c94.json deleted file mode 100644 index c0ad13c6c..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1739943972/223dc616-b20f-4065-91a7-3c35bfd11c94.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-reward_modeling__1__1739943972/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-reward_modeling__1__1739943972", - "id": "allenai/open_instruct_dev-reward_modeling__1__1739943972", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5289 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6168 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.375 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5738 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6844 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5657 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3577 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1739957701/4236b0a9-9d1e-41f6-8364-a7e8ebf51635.json b/data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1739957701/4236b0a9-9d1e-41f6-8364-a7e8ebf51635.json deleted file mode 100644 index 6eb775d20..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1739957701/4236b0a9-9d1e-41f6-8364-a7e8ebf51635.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-reward_modeling__1__1739957701/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-reward_modeling__1__1739957701", - "id": "allenai/open_instruct_dev-reward_modeling__1__1739957701", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. 
Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6194 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6779 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3563 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6011 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8022 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.697 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5822 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1739971507/c8030a87-0cdf-4918-b0d5-d1fb0e284656.json b/data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1739971507/c8030a87-0cdf-4918-b0d5-d1fb0e284656.json deleted file mode 100644 index c409d89eb..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1739971507/c8030a87-0cdf-4918-b0d5-d1fb0e284656.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"reward-bench-2/allenai_open_instruct_dev-reward_modeling__1__1739971507/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-reward_modeling__1__1739971507", - "id": "allenai/open_instruct_dev-reward_modeling__1__1739971507", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5717 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.68 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.375 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6066 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7667 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5475 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4545 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": 
"allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1739971529/e6ecc1eb-7ff1-46aa-bf03-37bad1b391b7.json b/data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1739971529/e6ecc1eb-7ff1-46aa-bf03-37bad1b391b7.json deleted file mode 100644 index c8518a2e2..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1739971529/e6ecc1eb-7ff1-46aa-bf03-37bad1b391b7.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-reward_modeling__1__1739971529/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-reward_modeling__1__1739971529", - "id": "allenai/open_instruct_dev-reward_modeling__1__1739971529", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5564 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6568 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3563 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5956 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7533 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.5737 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4027 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1739998765/64872b1a-1eae-4171-95ec-a80c782b69f0.json b/data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1739998765/64872b1a-1eae-4171-95ec-a80c782b69f0.json deleted file mode 100644 index b41f9844e..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1739998765/64872b1a-1eae-4171-95ec-a80c782b69f0.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-reward_modeling__1__1739998765/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-reward_modeling__1__1739998765", - "id": "allenai/open_instruct_dev-reward_modeling__1__1739998765", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6008 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7095 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4125 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6066 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8022 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5859 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4883 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1740005072/37484401-c7fe-469d-889a-e70f7cadbf82.json b/data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1740005072/37484401-c7fe-469d-889a-e70f7cadbf82.json deleted file mode 100644 index 04e0eceb6..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1740005072/37484401-c7fe-469d-889a-e70f7cadbf82.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-reward_modeling__1__1740005072/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-reward_modeling__1__1740005072", - "id": "allenai/open_instruct_dev-reward_modeling__1__1740005072", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. 
Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6097 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7137 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3937 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6339 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7778 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6343 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5047 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1740129284/8cf36288-3add-4fcd-a012-0df9eae2a059.json b/data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1740129284/8cf36288-3add-4fcd-a012-0df9eae2a059.json deleted file mode 100644 index a39194f60..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1740129284/8cf36288-3add-4fcd-a012-0df9eae2a059.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"reward-bench-2/allenai_open_instruct_dev-reward_modeling__1__1740129284/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-reward_modeling__1__1740129284", - "id": "allenai/open_instruct_dev-reward_modeling__1__1740129284", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6129 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7116 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4437 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6448 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8022 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6101 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4652 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": 
"allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1741286813/f2c8f979-c331-4b9b-b0a7-5efa82c17d3b.json b/data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1741286813/f2c8f979-c331-4b9b-b0a7-5efa82c17d3b.json deleted file mode 100644 index 6657f9ea6..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1741286813/f2c8f979-c331-4b9b-b0a7-5efa82c17d3b.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-reward_modeling__1__1741286813/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-reward_modeling__1__1741286813", - "id": "allenai/open_instruct_dev-reward_modeling__1__1741286813", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6557 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6295 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4188 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.612 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9111 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.8263 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5365 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1741287363/de409ce8-fb68-4113-8879-23712769cbde.json b/data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1741287363/de409ce8-fb68-4113-8879-23712769cbde.json deleted file mode 100644 index 56ed0daae..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1741287363/de409ce8-fb68-4113-8879-23712769cbde.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-reward_modeling__1__1741287363/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-reward_modeling__1__1741287363", - "id": "allenai/open_instruct_dev-reward_modeling__1__1741287363", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6672 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6295 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.375 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6066 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.88 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9374 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5748 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1741292911/264f20d7-1574-448c-8917-eb3f20810819.json b/data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1741292911/264f20d7-1574-448c-8917-eb3f20810819.json deleted file mode 100644 index d217f0d3d..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1741292911/264f20d7-1574-448c-8917-eb3f20810819.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-reward_modeling__1__1741292911/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-reward_modeling__1__1741292911", - "id": "allenai/open_instruct_dev-reward_modeling__1__1741292911", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. 
Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6607 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6589 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6066 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9089 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8869 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5028 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1742338142/0ebaec42-9190-4326-95dd-5ecb48bf1a72.json b/data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1742338142/0ebaec42-9190-4326-95dd-5ecb48bf1a72.json deleted file mode 100644 index 4897a8825..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1742338142/0ebaec42-9190-4326-95dd-5ecb48bf1a72.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"reward-bench-2/allenai_open_instruct_dev-reward_modeling__1__1742338142/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-reward_modeling__1__1742338142", - "id": "allenai/open_instruct_dev-reward_modeling__1__1742338142", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6344 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7326 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3812 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7049 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.88 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6323 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.475 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": 
"allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1742519610/29515933-c60b-4686-b475-70ef53d75457.json b/data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1742519610/29515933-c60b-4686-b475-70ef53d75457.json deleted file mode 100644 index e2f564fd7..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1742519610/29515933-c60b-4686-b475-70ef53d75457.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-reward_modeling__1__1742519610/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-reward_modeling__1__1742519610", - "id": "allenai/open_instruct_dev-reward_modeling__1__1742519610", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6361 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7074 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3812 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6721 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.82 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.6444 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5915 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1742519628/414174a9-7e44-4f7b-94ce-0757639f5af7.json b/data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1742519628/414174a9-7e44-4f7b-94ce-0757639f5af7.json deleted file mode 100644 index e92fdfc16..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1742519628/414174a9-7e44-4f7b-94ce-0757639f5af7.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-reward_modeling__1__1742519628/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-reward_modeling__1__1742519628", - "id": "allenai/open_instruct_dev-reward_modeling__1__1742519628", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5609 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5179 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3563 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.623 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8356 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5071 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5254 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_1_100pctflipped__1__1744241455/48513083-f854-455e-8455-ddbd2698ec03.json b/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_1_100pctflipped__1__1744241455/48513083-f854-455e-8455-ddbd2698ec03.json deleted file mode 100644 index c75b5a4dd..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_1_100pctflipped__1__1744241455/48513083-f854-455e-8455-ddbd2698ec03.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_1e-6_1_100pctflipped__1__1744241455/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_1e-6_1_100pctflipped__1__1744241455", - "id": "allenai/open_instruct_dev-rm_1e-6_1_100pctflipped__1__1744241455", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. 
Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0576 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.04 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1313 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0546 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0489 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0808 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -0.01 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_1_10pctflipped__1__1743295511/0b373560-854f-4482-81d0-6c984e130144.json b/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_1_10pctflipped__1__1743295511/0b373560-854f-4482-81d0-6c984e130144.json deleted file mode 100644 index d30d94d18..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_1_10pctflipped__1__1743295511/0b373560-854f-4482-81d0-6c984e130144.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"reward-bench-2/allenai_open_instruct_dev-rm_1e-6_1_10pctflipped__1__1743295511/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_1e-6_1_10pctflipped__1__1743295511", - "id": "allenai/open_instruct_dev-rm_1e-6_1_10pctflipped__1__1743295511", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5499 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6821 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3937 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5956 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7356 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5212 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3711 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": 
"allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_1_20pctflipped__1__1743295406/1a021cab-d569-4077-af5e-1643f45de03d.json b/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_1_20pctflipped__1__1743295406/1a021cab-d569-4077-af5e-1643f45de03d.json deleted file mode 100644 index d5744d0a2..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_1_20pctflipped__1__1743295406/1a021cab-d569-4077-af5e-1643f45de03d.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_1e-6_1_20pctflipped__1__1743295406/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_1e-6_1_20pctflipped__1__1743295406", - "id": "allenai/open_instruct_dev-rm_1e-6_1_20pctflipped__1__1743295406", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5054 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6358 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3688 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6066 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6867 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.4424 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2922 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_1_30pctflipped__1__1743325136/e26e230d-59b3-4243-a6c4-3845ab74b89b.json b/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_1_30pctflipped__1__1743325136/e26e230d-59b3-4243-a6c4-3845ab74b89b.json deleted file mode 100644 index 1263be4bc..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_1_30pctflipped__1__1743325136/e26e230d-59b3-4243-a6c4-3845ab74b89b.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_1e-6_1_30pctflipped__1__1743325136/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_1e-6_1_30pctflipped__1__1743325136", - "id": "allenai/open_instruct_dev-rm_1e-6_1_30pctflipped__1__1743325136", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. 
Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.478 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6442 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3563 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.612 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6356 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2707 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3496 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_1_50pctflipped__1__1744241398/aa0991d0-9c5e-4f94-bc12-3342ca389e99.json b/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_1_50pctflipped__1__1744241398/aa0991d0-9c5e-4f94-bc12-3342ca389e99.json deleted file mode 100644 index 8bd48e77c..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_1_50pctflipped__1__1744241398/aa0991d0-9c5e-4f94-bc12-3342ca389e99.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"reward-bench-2/allenai_open_instruct_dev-rm_1e-6_1_50pctflipped__1__1744241398/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_1e-6_1_50pctflipped__1__1744241398", - "id": "allenai/open_instruct_dev-rm_1e-6_1_50pctflipped__1__1744241398", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.219 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2484 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2812 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2623 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3422 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1717 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.008 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": 
"allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_1_5pctflipped__1__1743444535/397abe47-d5e9-487d-b883-ec49db16c584.json b/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_1_5pctflipped__1__1743444535/397abe47-d5e9-487d-b883-ec49db16c584.json deleted file mode 100644 index a81128f70..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_1_5pctflipped__1__1743444535/397abe47-d5e9-487d-b883-ec49db16c584.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_1e-6_1_5pctflipped__1__1743444535/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_1e-6_1_5pctflipped__1__1743444535", - "id": "allenai/open_instruct_dev-rm_1e-6_1_5pctflipped__1__1743444535", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5625 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6821 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4062 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6011 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7511 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.5313 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.403 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_1_dpo__1__1743550054/82f52a35-41b5-4b9c-bb3e-4bf18eed0b92.json b/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_1_dpo__1__1743550054/82f52a35-41b5-4b9c-bb3e-4bf18eed0b92.json deleted file mode 100644 index 53333e181..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_1_dpo__1__1743550054/82f52a35-41b5-4b9c-bb3e-4bf18eed0b92.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_1e-6_1_dpo__1__1743550054/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_1e-6_1_dpo__1__1743550054", - "id": "allenai/open_instruct_dev-rm_1e-6_1_dpo__1__1743550054", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5759 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7074 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.375 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.623 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - 
measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7578 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5333 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.459 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_1_dpo_skyworks__1__1744530271/670382ab-a8a1-43f3-a572-b9a5aeae23ef.json b/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_1_dpo_skyworks__1__1744530271/670382ab-a8a1-43f3-a572-b9a5aeae23ef.json deleted file mode 100644 index ba5b508f6..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_1_dpo_skyworks__1__1744530271/670382ab-a8a1-43f3-a572-b9a5aeae23ef.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_1e-6_1_dpo_skyworks__1__1744530271/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_1e-6_1_dpo_skyworks__1__1744530271", - "id": "allenai/open_instruct_dev-rm_1e-6_1_dpo_skyworks__1__1744530271", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. 
Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6057 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5053 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.375 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5902 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8422 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7798 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5419 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_1_dpo_skyworkstulufull__1__1743550181/a4b3c031-7c01-4f7a-8cfe-52b3260d6ecc.json b/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_1_dpo_skyworkstulufull__1__1743550181/a4b3c031-7c01-4f7a-8cfe-52b3260d6ecc.json deleted file mode 100644 index 55a7568d2..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_1_dpo_skyworkstulufull__1__1743550181/a4b3c031-7c01-4f7a-8cfe-52b3260d6ecc.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"reward-bench-2/allenai_open_instruct_dev-rm_1e-6_1_dpo_skyworkstulufull__1__1743550181/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_1e-6_1_dpo_skyworkstulufull__1__1743550181", - "id": "allenai/open_instruct_dev-rm_1e-6_1_dpo_skyworkstulufull__1__1743550181", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6535 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7137 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3812 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6175 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8244 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7737 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6101 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": 
"hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_1_rl__1__1743551221/7fcd3fce-2296-4b5c-8362-24b1c70ccb8f.json b/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_1_rl__1__1743551221/7fcd3fce-2296-4b5c-8362-24b1c70ccb8f.json deleted file mode 100644 index b781deca8..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_1_rl__1__1743551221/7fcd3fce-2296-4b5c-8362-24b1c70ccb8f.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_1e-6_1_rl__1__1743551221/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_1e-6_1_rl__1__1743551221", - "id": "allenai/open_instruct_dev-rm_1e-6_1_rl__1__1743551221", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5799 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7116 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3812 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6284 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.76 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.5374 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.461 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_1_rl_skyworks__1__1744530262/4f164e8b-55a1-498f-b586-cf78da7d0b57.json b/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_1_rl_skyworks__1__1744530262/4f164e8b-55a1-498f-b586-cf78da7d0b57.json deleted file mode 100644 index 9f032475e..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_1_rl_skyworks__1__1744530262/4f164e8b-55a1-498f-b586-cf78da7d0b57.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_1e-6_1_rl_skyworks__1__1744530262/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_1e-6_1_rl_skyworks__1__1744530262", - "id": "allenai/open_instruct_dev-rm_1e-6_1_rl_skyworks__1__1744530262", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5903 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4863 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3625 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5738 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - 
"evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8489 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7778 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4926 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_1_rl_skyworkstulufull__1__1743551523/a84d3d61-6e05-4d4d-bc89-7f663e9667fb.json b/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_1_rl_skyworkstulufull__1__1743551523/a84d3d61-6e05-4d4d-bc89-7f663e9667fb.json deleted file mode 100644 index 9d860b297..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_1_rl_skyworkstulufull__1__1743551523/a84d3d61-6e05-4d4d-bc89-7f663e9667fb.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_1e-6_1_rl_skyworkstulufull__1__1743551523/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_1e-6_1_rl_skyworkstulufull__1__1743551523", - "id": "allenai/open_instruct_dev-rm_1e-6_1_rl_skyworkstulufull__1__1743551523", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. 
Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6483 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7074 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3625 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6175 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8222 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7758 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6044 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_1_skyworkstulumix__1__1743205750/7aa98f71-8262-4c1f-a71c-1ef36f2ef04c.json b/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_1_skyworkstulumix__1__1743205750/7aa98f71-8262-4c1f-a71c-1ef36f2ef04c.json deleted file mode 100644 index 7961d2bec..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_1_skyworkstulumix__1__1743205750/7aa98f71-8262-4c1f-a71c-1ef36f2ef04c.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"reward-bench-2/allenai_open_instruct_dev-rm_1e-6_1_skyworkstulumix__1__1743205750/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_1e-6_1_skyworkstulumix__1__1743205750", - "id": "allenai/open_instruct_dev-rm_1e-6_1_skyworkstulumix__1__1743205750", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5157 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6084 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3688 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6066 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7089 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4222 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3791 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - 
"hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_2_10pctflipped__1__1743295427/93398c1f-3129-4be4-83b5-62a4a45c6b84.json b/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_2_10pctflipped__1__1743295427/93398c1f-3129-4be4-83b5-62a4a45c6b84.json deleted file mode 100644 index 9a7aa8751..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_2_10pctflipped__1__1743295427/93398c1f-3129-4be4-83b5-62a4a45c6b84.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_1e-6_2_10pctflipped__1__1743295427/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_1e-6_2_10pctflipped__1__1743295427", - "id": "allenai/open_instruct_dev-rm_1e-6_2_10pctflipped__1__1743295427", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6009 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7263 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.375 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5902 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7933 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 
0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7273 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3931 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_2_20pctflipped__1__1743295446/62493784-f899-4736-bdce-2107ec99a752.json b/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_2_20pctflipped__1__1743295446/62493784-f899-4736-bdce-2107ec99a752.json deleted file mode 100644 index 0629539f9..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_2_20pctflipped__1__1743295446/62493784-f899-4736-bdce-2107ec99a752.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_1e-6_2_20pctflipped__1__1743295446/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_1e-6_2_20pctflipped__1__1743295446", - "id": "allenai/open_instruct_dev-rm_1e-6_2_20pctflipped__1__1743295446", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. 
Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5716 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6779 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3937 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5464 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7533 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7051 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3534 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_2_30pctflipped__1__1743325094/9b68ecaa-cf9d-414e-9cf1-c662c765bb5c.json b/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_2_30pctflipped__1__1743325094/9b68ecaa-cf9d-414e-9cf1-c662c765bb5c.json deleted file mode 100644 index bca56ed2b..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_2_30pctflipped__1__1743325094/9b68ecaa-cf9d-414e-9cf1-c662c765bb5c.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"reward-bench-2/allenai_open_instruct_dev-rm_1e-6_2_30pctflipped__1__1743325094/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_1e-6_2_30pctflipped__1__1743325094", - "id": "allenai/open_instruct_dev-rm_1e-6_2_30pctflipped__1__1743325094", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5151 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6484 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3312 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5574 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7289 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4889 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3357 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": 
"allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_2_5pctflipped__1__1743444636/76f3d0bd-2b71-4406-a0d4-b01b6c91c4ff.json b/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_2_5pctflipped__1__1743444636/76f3d0bd-2b71-4406-a0d4-b01b6c91c4ff.json deleted file mode 100644 index 54fc2bfbe..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_2_5pctflipped__1__1743444636/76f3d0bd-2b71-4406-a0d4-b01b6c91c4ff.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_1e-6_2_5pctflipped__1__1743444636/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_1e-6_2_5pctflipped__1__1743444636", - "id": "allenai/open_instruct_dev-rm_1e-6_2_5pctflipped__1__1743444636", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6119 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.72 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4062 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6284 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8067 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 
1.0 - }, - "score_details": { - "score": 0.6889 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.421 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_2_dpo__1__1743549325/2dc5ab6f-2427-42ae-9582-a0e6139f451a.json b/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_2_dpo__1__1743549325/2dc5ab6f-2427-42ae-9582-a0e6139f451a.json deleted file mode 100644 index 55ddbdfe8..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_2_dpo__1__1743549325/2dc5ab6f-2427-42ae-9582-a0e6139f451a.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_1e-6_2_dpo__1__1743549325/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_1e-6_2_dpo__1__1743549325", - "id": "allenai/open_instruct_dev-rm_1e-6_2_dpo__1__1743549325", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6008 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7179 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.35 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5956 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures 
safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6707 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4707 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_2_rl__1__1743551238/0db97be6-6562-47d8-bd1a-5b469250e54b.json b/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_2_rl__1__1743551238/0db97be6-6562-47d8-bd1a-5b469250e54b.json deleted file mode 100644 index 4b4ef4368..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_2_rl__1__1743551238/0db97be6-6562-47d8-bd1a-5b469250e54b.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_1e-6_2_rl__1__1743551238/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_1e-6_2_rl__1__1743551238", - "id": "allenai/open_instruct_dev-rm_1e-6_2_rl__1__1743551238", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. 
Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5965 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7095 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3438 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.612 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8044 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6566 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.453 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_2_skyworkstulumix__1__1743205906/228e4dc4-e517-4023-b690-7f0c321286b2.json b/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_2_skyworkstulumix__1__1743205906/228e4dc4-e517-4023-b690-7f0c321286b2.json deleted file mode 100644 index 2cf126b25..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_2_skyworkstulumix__1__1743205906/228e4dc4-e517-4023-b690-7f0c321286b2.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"reward-bench-2/allenai_open_instruct_dev-rm_1e-6_2_skyworkstulumix__1__1743205906/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_1e-6_2_skyworkstulumix__1__1743205906", - "id": "allenai/open_instruct_dev-rm_1e-6_2_skyworkstulumix__1__1743205906", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5574 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6526 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3937 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6011 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7711 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5051 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4208 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - 
"hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_2e-5_1_100pctflipped__1__1744241529/9442b27c-c94d-41c0-a752-3bd82385272d.json b/data/reward-bench/allenai/open_instruct_dev-rm_2e-5_1_100pctflipped__1__1744241529/9442b27c-c94d-41c0-a752-3bd82385272d.json deleted file mode 100644 index 8dc1a9073..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_2e-5_1_100pctflipped__1__1744241529/9442b27c-c94d-41c0-a752-3bd82385272d.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_2e-5_1_100pctflipped__1__1744241529/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_2e-5_1_100pctflipped__1__1744241529", - "id": "allenai/open_instruct_dev-rm_2e-5_1_100pctflipped__1__1744241529", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0719 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0421 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2062 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0601 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0378 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0949 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -0.01 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_2e-5_1_10pctflipped__1__1743295305/561039ac-b156-40eb-bf53-21a275b858ca.json b/data/reward-bench/allenai/open_instruct_dev-rm_2e-5_1_10pctflipped__1__1743295305/561039ac-b156-40eb-bf53-21a275b858ca.json deleted file mode 100644 index 5f62f67ab..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_2e-5_1_10pctflipped__1__1743295305/561039ac-b156-40eb-bf53-21a275b858ca.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_2e-5_1_10pctflipped__1__1743295305/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_2e-5_1_10pctflipped__1__1743295305", - "id": "allenai/open_instruct_dev-rm_2e-5_1_10pctflipped__1__1743295305", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. 
Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.553 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6674 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3563 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6284 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6733 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5697 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4227 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_2e-5_1_20pctflipped__1__1743324778/d801d700-7b4d-4a62-883b-3d85b05385ea.json b/data/reward-bench/allenai/open_instruct_dev-rm_2e-5_1_20pctflipped__1__1743324778/d801d700-7b4d-4a62-883b-3d85b05385ea.json deleted file mode 100644 index 431df7a47..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_2e-5_1_20pctflipped__1__1743324778/d801d700-7b4d-4a62-883b-3d85b05385ea.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"reward-bench-2/allenai_open_instruct_dev-rm_2e-5_1_20pctflipped__1__1743324778/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_2e-5_1_20pctflipped__1__1743324778", - "id": "allenai/open_instruct_dev-rm_2e-5_1_20pctflipped__1__1743324778", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4955 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6189 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.325 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5792 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6378 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5657 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2466 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": 
"allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_2e-5_1_30pctflipped__1__1743326459/b8f24058-4441-4d19-898e-80470cc7b685.json b/data/reward-bench/allenai/open_instruct_dev-rm_2e-5_1_30pctflipped__1__1743326459/b8f24058-4441-4d19-898e-80470cc7b685.json deleted file mode 100644 index 6552eeea3..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_2e-5_1_30pctflipped__1__1743326459/b8f24058-4441-4d19-898e-80470cc7b685.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_2e-5_1_30pctflipped__1__1743326459/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_2e-5_1_30pctflipped__1__1743326459", - "id": "allenai/open_instruct_dev-rm_2e-5_1_30pctflipped__1__1743326459", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4198 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5747 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3375 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5464 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4933 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.3596 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2073 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_2e-5_1_5pctflipped__1__1743443747/1f372e00-e7a8-43ef-8e14-ef1b08e5e957.json b/data/reward-bench/allenai/open_instruct_dev-rm_2e-5_1_5pctflipped__1__1743443747/1f372e00-e7a8-43ef-8e14-ef1b08e5e957.json deleted file mode 100644 index 6c4f2b7cc..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_2e-5_1_5pctflipped__1__1743443747/1f372e00-e7a8-43ef-8e14-ef1b08e5e957.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_2e-5_1_5pctflipped__1__1743443747/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_2e-5_1_5pctflipped__1__1743443747", - "id": "allenai/open_instruct_dev-rm_2e-5_1_5pctflipped__1__1743443747", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. 
Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5465 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6821 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.375 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.612 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7333 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5051 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3713 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_2e-5_1_skyworkstulumix__1__1743205935/0200a1b3-71f1-4633-96a5-4ca9883a67a7.json b/data/reward-bench/allenai/open_instruct_dev-rm_2e-5_1_skyworkstulumix__1__1743205935/0200a1b3-71f1-4633-96a5-4ca9883a67a7.json deleted file mode 100644 index aabf71c3b..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_2e-5_1_skyworkstulumix__1__1743205935/0200a1b3-71f1-4633-96a5-4ca9883a67a7.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"reward-bench-2/allenai_open_instruct_dev-rm_2e-5_1_skyworkstulumix__1__1743205935/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_2e-5_1_skyworkstulumix__1__1743205935", - "id": "allenai/open_instruct_dev-rm_2e-5_1_skyworkstulumix__1__1743205935", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5197 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6126 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3375 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5847 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7333 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4646 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3855 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - 
"hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_2e-5_2_10pctflipped__1__1743295360/55479901-aec7-4875-b792-ba73b54aa37a.json b/data/reward-bench/allenai/open_instruct_dev-rm_2e-5_2_10pctflipped__1__1743295360/55479901-aec7-4875-b792-ba73b54aa37a.json deleted file mode 100644 index a1a7dd25e..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_2e-5_2_10pctflipped__1__1743295360/55479901-aec7-4875-b792-ba73b54aa37a.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_2e-5_2_10pctflipped__1__1743295360/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_2e-5_2_10pctflipped__1__1743295360", - "id": "allenai/open_instruct_dev-rm_2e-5_2_10pctflipped__1__1743295360", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4555 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5495 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3063 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4262 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5711 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 
0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6101 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2696 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_2e-5_2_20pctflipped__1__1743295366/872597b2-4392-4f23-b5b2-41d418b6cf89.json b/data/reward-bench/allenai/open_instruct_dev-rm_2e-5_2_20pctflipped__1__1743295366/872597b2-4392-4f23-b5b2-41d418b6cf89.json deleted file mode 100644 index f5aa5f436..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_2e-5_2_20pctflipped__1__1743295366/872597b2-4392-4f23-b5b2-41d418b6cf89.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_2e-5_2_20pctflipped__1__1743295366/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_2e-5_2_20pctflipped__1__1743295366", - "id": "allenai/open_instruct_dev-rm_2e-5_2_20pctflipped__1__1743295366", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. 
Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4422 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5053 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3375 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4044 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5422 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6646 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1991 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_2e-5_2_30pctflipped__1__1743326352/5cb437b5-5993-418d-bd9f-81dea71d9edf.json b/data/reward-bench/allenai/open_instruct_dev-rm_2e-5_2_30pctflipped__1__1743326352/5cb437b5-5993-418d-bd9f-81dea71d9edf.json deleted file mode 100644 index a0af647f5..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_2e-5_2_30pctflipped__1__1743326352/5cb437b5-5993-418d-bd9f-81dea71d9edf.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"reward-bench-2/allenai_open_instruct_dev-rm_2e-5_2_30pctflipped__1__1743326352/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_2e-5_2_30pctflipped__1__1743326352", - "id": "allenai/open_instruct_dev-rm_2e-5_2_30pctflipped__1__1743326352", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.341 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4674 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2875 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3333 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3711 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3919 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.195 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": 
"allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_2e-5_2_5pctflipped__1__1743444634/c471cdf7-73f9-48c9-a970-baa66b609093.json b/data/reward-bench/allenai/open_instruct_dev-rm_2e-5_2_5pctflipped__1__1743444634/c471cdf7-73f9-48c9-a970-baa66b609093.json deleted file mode 100644 index c1a864b6a..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_2e-5_2_5pctflipped__1__1743444634/c471cdf7-73f9-48c9-a970-baa66b609093.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_2e-5_2_5pctflipped__1__1743444634/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_2e-5_2_5pctflipped__1__1743444634", - "id": "allenai/open_instruct_dev-rm_2e-5_2_5pctflipped__1__1743444634", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4698 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5853 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2562 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5027 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6489 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.5697 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2562 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_2e-5_2_skyworkstulumix__1__1743205988/794a71b4-8a43-4c69-a663-369eea6a84a3.json b/data/reward-bench/allenai/open_instruct_dev-rm_2e-5_2_skyworkstulumix__1__1743205988/794a71b4-8a43-4c69-a663-369eea6a84a3.json deleted file mode 100644 index 4f624253c..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_2e-5_2_skyworkstulumix__1__1743205988/794a71b4-8a43-4c69-a663-369eea6a84a3.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_2e-5_2_skyworkstulumix__1__1743205988/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_2e-5_2_skyworkstulumix__1__1743205988", - "id": "allenai/open_instruct_dev-rm_2e-5_2_skyworkstulumix__1__1743205988", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. 
Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4791 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6421 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3125 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.541 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6911 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4182 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.27 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_100pctflipped__1__1744242103/2ad22375-4ed8-4be6-a012-a6f6799581e2.json b/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_100pctflipped__1__1744242103/2ad22375-4ed8-4be6-a012-a6f6799581e2.json deleted file mode 100644 index da7381bc1..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_100pctflipped__1__1744242103/2ad22375-4ed8-4be6-a012-a6f6799581e2.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"reward-bench-2/allenai_open_instruct_dev-rm_3e-6_1_100pctflipped__1__1744242103/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_3e-6_1_100pctflipped__1__1744242103", - "id": "allenai/open_instruct_dev-rm_3e-6_1_100pctflipped__1__1744242103", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0607 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0274 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1625 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0656 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.04 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0788 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": -0.01 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": 
"allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_10pctflipped__1__1743324835/a8df0dc2-d16c-4e1a-b0b5-abe2a4a1d803.json b/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_10pctflipped__1__1743324835/a8df0dc2-d16c-4e1a-b0b5-abe2a4a1d803.json deleted file mode 100644 index a04867707..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_10pctflipped__1__1743324835/a8df0dc2-d16c-4e1a-b0b5-abe2a4a1d803.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_3e-6_1_10pctflipped__1__1743324835/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_3e-6_1_10pctflipped__1__1743324835", - "id": "allenai/open_instruct_dev-rm_3e-6_1_10pctflipped__1__1743324835", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6089 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7284 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4375 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.612 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7622 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.6444 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4686 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_1pctflipped__1__1743445221/ca0a010a-fe3a-4b87-8c80-4a8d3e2597fb.json b/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_1pctflipped__1__1743445221/ca0a010a-fe3a-4b87-8c80-4a8d3e2597fb.json deleted file mode 100644 index de5939755..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_1pctflipped__1__1743445221/ca0a010a-fe3a-4b87-8c80-4a8d3e2597fb.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_3e-6_1_1pctflipped__1__1743445221/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_3e-6_1_1pctflipped__1__1743445221", - "id": "allenai/open_instruct_dev-rm_3e-6_1_1pctflipped__1__1743445221", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. 
Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6032 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7158 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4062 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6284 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7778 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5859 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5051 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_20pctflipped__1__1743324826/5d1c166c-6a22-4afb-b1b1-f7db9ec38bd8.json b/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_20pctflipped__1__1743324826/5d1c166c-6a22-4afb-b1b1-f7db9ec38bd8.json deleted file mode 100644 index 7f0d231a3..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_20pctflipped__1__1743324826/5d1c166c-6a22-4afb-b1b1-f7db9ec38bd8.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"reward-bench-2/allenai_open_instruct_dev-rm_3e-6_1_20pctflipped__1__1743324826/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_3e-6_1_20pctflipped__1__1743324826", - "id": "allenai/open_instruct_dev-rm_3e-6_1_20pctflipped__1__1743324826", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5831 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6947 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4188 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.623 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.74 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5758 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4465 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": 
"allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_30pctflipped__1__1743326363/10a432fa-dfef-4c9c-bdf7-ce0f81fd1895.json b/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_30pctflipped__1__1743326363/10a432fa-dfef-4c9c-bdf7-ce0f81fd1895.json deleted file mode 100644 index 25ae9470b..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_30pctflipped__1__1743326363/10a432fa-dfef-4c9c-bdf7-ce0f81fd1895.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_3e-6_1_30pctflipped__1__1743326363/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_3e-6_1_30pctflipped__1__1743326363", - "id": "allenai/open_instruct_dev-rm_3e-6_1_30pctflipped__1__1743326363", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5268 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.68 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3688 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5792 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7178 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.4343 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3809 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_5pctflipped__1__1743444498/a550663c-2a04-4dfb-8663-b177a7181f3d.json b/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_5pctflipped__1__1743444498/a550663c-2a04-4dfb-8663-b177a7181f3d.json deleted file mode 100644 index 0676aafc3..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_5pctflipped__1__1743444498/a550663c-2a04-4dfb-8663-b177a7181f3d.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_3e-6_1_5pctflipped__1__1743444498/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_3e-6_1_5pctflipped__1__1743444498", - "id": "allenai/open_instruct_dev-rm_3e-6_1_5pctflipped__1__1743444498", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. 
Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6093 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7326 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4313 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6339 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7578 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5859 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5143 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1__2__1743897475/72b6196e-0a2b-4ec9-80a3-a7eb14f7be09.json b/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1__2__1743897475/72b6196e-0a2b-4ec9-80a3-a7eb14f7be09.json deleted file mode 100644 index 42e9ce0e8..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1__2__1743897475/72b6196e-0a2b-4ec9-80a3-a7eb14f7be09.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_3e-6_1__2__1743897475/1766412838.146816", - 
"retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_3e-6_1__2__1743897475", - "id": "allenai/open_instruct_dev-rm_3e-6_1__2__1743897475", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6122 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7368 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.623 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8044 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.602 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5071 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git 
a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1__3__1744311421/5e41f068-f009-4e32-bac1-9de5220a2ce2.json b/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1__3__1744311421/5e41f068-f009-4e32-bac1-9de5220a2ce2.json deleted file mode 100644 index c9eb27faa..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1__3__1744311421/5e41f068-f009-4e32-bac1-9de5220a2ce2.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_3e-6_1__3__1744311421/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_3e-6_1__3__1744311421", - "id": "allenai/open_instruct_dev-rm_3e-6_1__3__1744311421", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5995 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7179 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3375 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6066 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6323 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": 
"hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.503 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_dpo__1__1743549903/eca1331f-6503-481a-b77b-3d96791f54e8.json b/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_dpo__1__1743549903/eca1331f-6503-481a-b77b-3d96791f54e8.json deleted file mode 100644 index 86624d603..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_dpo__1__1743549903/eca1331f-6503-481a-b77b-3d96791f54e8.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_3e-6_1_dpo__1__1743549903/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_3e-6_1_dpo__1__1743549903", - "id": "allenai/open_instruct_dev-rm_3e-6_1_dpo__1__1743549903", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6154 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7326 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4375 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6339 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.7778 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6061 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5043 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_dpo_skyworks__1__1744530368/69def7de-a916-4d23-984b-e676e91e1d8c.json b/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_dpo_skyworks__1__1744530368/69def7de-a916-4d23-984b-e676e91e1d8c.json deleted file mode 100644 index ade9d6695..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_dpo_skyworks__1__1744530368/69def7de-a916-4d23-984b-e676e91e1d8c.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_3e-6_1_dpo_skyworks__1__1744530368/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_3e-6_1_dpo_skyworks__1__1744530368", - "id": "allenai/open_instruct_dev-rm_3e-6_1_dpo_skyworks__1__1744530368", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. 
Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6604 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6316 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3937 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5792 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9044 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8929 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5604 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_dpo_skyworkstulufull__1__1743550182/679c6e0b-9e0b-4224-b1e3-59df149739a0.json b/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_dpo_skyworkstulufull__1__1743550182/679c6e0b-9e0b-4224-b1e3-59df149739a0.json deleted file mode 100644 index c5d2892d2..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_dpo_skyworkstulufull__1__1743550182/679c6e0b-9e0b-4224-b1e3-59df149739a0.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"reward-bench-2/allenai_open_instruct_dev-rm_3e-6_1_dpo_skyworkstulufull__1__1743550182/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_3e-6_1_dpo_skyworkstulufull__1__1743550182", - "id": "allenai/open_instruct_dev-rm_3e-6_1_dpo_skyworkstulufull__1__1743550182", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6783 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7705 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6066 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.84 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8101 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6427 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": 
"hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_no_if__2__1744316012/2335433d-37c6-47f0-ad3b-5e0a42e9488f.json b/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_no_if__2__1744316012/2335433d-37c6-47f0-ad3b-5e0a42e9488f.json deleted file mode 100644 index e2e75967a..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_no_if__2__1744316012/2335433d-37c6-47f0-ad3b-5e0a42e9488f.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_3e-6_1_no_if__2__1744316012/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_3e-6_1_no_if__2__1744316012", - "id": "allenai/open_instruct_dev-rm_3e-6_1_no_if__2__1744316012", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5911 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7347 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6284 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.74 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.604 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4392 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_no_if__3__1744315765/fe84f8a3-5fe9-4385-b6d4-0436fb7e5197.json b/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_no_if__3__1744315765/fe84f8a3-5fe9-4385-b6d4-0436fb7e5197.json deleted file mode 100644 index 2d3cbe13a..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_no_if__3__1744315765/fe84f8a3-5fe9-4385-b6d4-0436fb7e5197.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_3e-6_1_no_if__3__1744315765/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_3e-6_1_no_if__3__1744315765", - "id": "allenai/open_instruct_dev-rm_3e-6_1_no_if__3__1744315765", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5926 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7263 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3563 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.623 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures 
safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7889 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5879 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4733 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_rl__1__1743551527/70d2697e-0df5-40ae-9268-b906c9cabd9d.json b/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_rl__1__1743551527/70d2697e-0df5-40ae-9268-b906c9cabd9d.json deleted file mode 100644 index d305acb78..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_rl__1__1743551527/70d2697e-0df5-40ae-9268-b906c9cabd9d.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_3e-6_1_rl__1__1743551527/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_3e-6_1_rl__1__1743551527", - "id": "allenai/open_instruct_dev-rm_3e-6_1_rl__1__1743551527", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. 
Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6126 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7411 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.425 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.623 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7822 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5939 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5104 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_rl_skyworks__1__1744530236/0a30fd70-2381-4a4b-89aa-dbd169c856f0.json b/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_rl_skyworks__1__1744530236/0a30fd70-2381-4a4b-89aa-dbd169c856f0.json deleted file mode 100644 index bf0b750c2..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_rl_skyworks__1__1744530236/0a30fd70-2381-4a4b-89aa-dbd169c856f0.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"reward-bench-2/allenai_open_instruct_dev-rm_3e-6_1_rl_skyworks__1__1744530236/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_3e-6_1_rl_skyworks__1__1744530236", - "id": "allenai/open_instruct_dev-rm_3e-6_1_rl_skyworks__1__1744530236", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6525 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6021 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3875 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5792 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8933 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8626 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.59 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": 
"allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_rl_skyworkstulufull__1__1743551530/b9c787f9-3bcd-4215-a157-7fcfa2df82cc.json b/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_rl_skyworkstulufull__1__1743551530/b9c787f9-3bcd-4215-a157-7fcfa2df82cc.json deleted file mode 100644 index bf056a3a5..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_rl_skyworkstulufull__1__1743551530/b9c787f9-3bcd-4215-a157-7fcfa2df82cc.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_3e-6_1_rl_skyworkstulufull__1__1743551530/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_3e-6_1_rl_skyworkstulufull__1__1743551530", - "id": "allenai/open_instruct_dev-rm_3e-6_1_rl_skyworkstulufull__1__1743551530", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6849 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7453 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3812 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.612 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8422 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8404 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6885 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_skyworkstulu75__1__1743534417/bdd98f27-fbfd-4de7-bd4e-3b8c3e4e7cc0.json b/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_skyworkstulu75__1__1743534417/bdd98f27-fbfd-4de7-bd4e-3b8c3e4e7cc0.json deleted file mode 100644 index b8d8f17e6..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_skyworkstulu75__1__1743534417/bdd98f27-fbfd-4de7-bd4e-3b8c3e4e7cc0.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_3e-6_1_skyworkstulu75__1__1743534417/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_3e-6_1_skyworkstulu75__1__1743534417", - "id": "allenai/open_instruct_dev-rm_3e-6_1_skyworkstulu75__1__1743534417", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. 
Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.586 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6632 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.425 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6557 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7778 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5172 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.477 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_skyworkstulufull__1__1743446486/44b20109-d534-4aa9-867d-fa59935ef6d0.json b/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_skyworkstulufull__1__1743446486/44b20109-d534-4aa9-867d-fa59935ef6d0.json deleted file mode 100644 index 5bc8800b1..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_skyworkstulufull__1__1743446486/44b20109-d534-4aa9-867d-fa59935ef6d0.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"reward-bench-2/allenai_open_instruct_dev-rm_3e-6_1_skyworkstulufull__1__1743446486/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_3e-6_1_skyworkstulufull__1__1743446486", - "id": "allenai/open_instruct_dev-rm_3e-6_1_skyworkstulufull__1__1743446486", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6773 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7432 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.612 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8422 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.804 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6626 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - 
"hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_skyworkstulufull__2__1744314745/d1196312-4153-4a38-aa46-2940d63d7924.json b/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_skyworkstulufull__2__1744314745/d1196312-4153-4a38-aa46-2940d63d7924.json deleted file mode 100644 index 620c1403f..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_skyworkstulufull__2__1744314745/d1196312-4153-4a38-aa46-2940d63d7924.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_3e-6_1_skyworkstulufull__2__1744314745/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_3e-6_1_skyworkstulufull__2__1744314745", - "id": "allenai/open_instruct_dev-rm_3e-6_1_skyworkstulufull__2__1744314745", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6793 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7558 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4062 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6284 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8311 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8061 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6485 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_skyworkstulufull__3__1744311661/4b1e3070-04ef-47e7-b720-739320194e7b.json b/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_skyworkstulufull__3__1744311661/4b1e3070-04ef-47e7-b720-739320194e7b.json deleted file mode 100644 index 93ad54ca5..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_skyworkstulufull__3__1744311661/4b1e3070-04ef-47e7-b720-739320194e7b.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_3e-6_1_skyworkstulufull__3__1744311661/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_3e-6_1_skyworkstulufull__3__1744311661", - "id": "allenai/open_instruct_dev-rm_3e-6_1_skyworkstulufull__3__1744311661", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. 
Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6611 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.72 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3563 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6393 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8444 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7636 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6428 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_skyworkstulumix__1__1743204472/247f400e-dca8-4dab-bebf-092f778f02c9.json b/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_skyworkstulumix__1__1743204472/247f400e-dca8-4dab-bebf-092f778f02c9.json deleted file mode 100644 index 12de82f8f..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_skyworkstulumix__1__1743204472/247f400e-dca8-4dab-bebf-092f778f02c9.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"reward-bench-2/allenai_open_instruct_dev-rm_3e-6_1_skyworkstulumix__1__1743204472/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_3e-6_1_skyworkstulumix__1__1743204472", - "id": "allenai/open_instruct_dev-rm_3e-6_1_skyworkstulumix__1__1743204472", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5778 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6674 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3875 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6011 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7933 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5172 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5003 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - 
"hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_2_10pctflipped__1__1743295267/d043ad21-102b-49f0-9e8e-6daef7cc3a2e.json b/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_2_10pctflipped__1__1743295267/d043ad21-102b-49f0-9e8e-6daef7cc3a2e.json deleted file mode 100644 index 7f64660cc..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_2_10pctflipped__1__1743295267/d043ad21-102b-49f0-9e8e-6daef7cc3a2e.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_3e-6_2_10pctflipped__1__1743295267/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_3e-6_2_10pctflipped__1__1743295267", - "id": "allenai/open_instruct_dev-rm_3e-6_2_10pctflipped__1__1743295267", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5746 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6505 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.35 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5082 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7844 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 
0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7414 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4128 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_2_1pctflipped__1__1743445759/d45ec8b8-1ee6-49bb-9237-a7271ba9d13c.json b/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_2_1pctflipped__1__1743445759/d45ec8b8-1ee6-49bb-9237-a7271ba9d13c.json deleted file mode 100644 index cd842e3b9..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_2_1pctflipped__1__1743445759/d45ec8b8-1ee6-49bb-9237-a7271ba9d13c.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_3e-6_2_1pctflipped__1__1743445759/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_3e-6_2_1pctflipped__1__1743445759", - "id": "allenai/open_instruct_dev-rm_3e-6_2_1pctflipped__1__1743445759", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. 
Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6065 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7116 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.35 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5792 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8178 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7152 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.465 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_2_20pctflipped__1__1743324905/05a4c6aa-9af2-44f0-8c55-8aeed2e75eaf.json b/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_2_20pctflipped__1__1743324905/05a4c6aa-9af2-44f0-8c55-8aeed2e75eaf.json deleted file mode 100644 index eb5fdd21c..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_2_20pctflipped__1__1743324905/05a4c6aa-9af2-44f0-8c55-8aeed2e75eaf.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"reward-bench-2/allenai_open_instruct_dev-rm_3e-6_2_20pctflipped__1__1743324905/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_3e-6_2_20pctflipped__1__1743324905", - "id": "allenai/open_instruct_dev-rm_3e-6_2_20pctflipped__1__1743324905", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5305 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5832 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3312 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.459 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7178 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7071 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3849 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": 
"allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_2_30pctflipped__1__1743326363/a6ef712e-014e-470e-8d5b-f3b51f677aee.json b/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_2_30pctflipped__1__1743326363/a6ef712e-014e-470e-8d5b-f3b51f677aee.json deleted file mode 100644 index f42f0f831..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_2_30pctflipped__1__1743326363/a6ef712e-014e-470e-8d5b-f3b51f677aee.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_3e-6_2_30pctflipped__1__1743326363/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_3e-6_2_30pctflipped__1__1743326363", - "id": "allenai/open_instruct_dev-rm_3e-6_2_30pctflipped__1__1743326363", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4436 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5411 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3312 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3115 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6267 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.5414 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.31 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_2_5pctflipped__1__1743444505/35a039ba-06be-4ec2-9bde-a6a6db2eefec.json b/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_2_5pctflipped__1__1743444505/35a039ba-06be-4ec2-9bde-a6a6db2eefec.json deleted file mode 100644 index f6b4dff52..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_2_5pctflipped__1__1743444505/35a039ba-06be-4ec2-9bde-a6a6db2eefec.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_3e-6_2_5pctflipped__1__1743444505/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_3e-6_2_5pctflipped__1__1743444505", - "id": "allenai/open_instruct_dev-rm_3e-6_2_5pctflipped__1__1743444505", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. 
Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5925 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.68 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3688 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5519 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.78 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7434 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.431 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_2_dpo__1__1743550180/97cb96f8-ce4c-403f-bfbc-386d3c611c81.json b/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_2_dpo__1__1743550180/97cb96f8-ce4c-403f-bfbc-386d3c611c81.json deleted file mode 100644 index d291b202f..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_2_dpo__1__1743550180/97cb96f8-ce4c-403f-bfbc-386d3c611c81.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_3e-6_2_dpo__1__1743550180/1766412838.146816", - 
"retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_3e-6_2_dpo__1__1743550180", - "id": "allenai/open_instruct_dev-rm_3e-6_2_dpo__1__1743550180", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6198 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7263 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3312 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6339 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8133 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7232 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4908 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git 
a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_2_dpo_skyworkstulufull__1__1743550187/3a1621e9-75ee-4b34-9c0d-ae15399b1dab.json b/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_2_dpo_skyworkstulufull__1__1743550187/3a1621e9-75ee-4b34-9c0d-ae15399b1dab.json deleted file mode 100644 index 0d4d4902f..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_2_dpo_skyworkstulufull__1__1743550187/3a1621e9-75ee-4b34-9c0d-ae15399b1dab.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_3e-6_2_dpo_skyworkstulufull__1__1743550187/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_3e-6_2_dpo_skyworkstulufull__1__1743550187", - "id": "allenai/open_instruct_dev-rm_3e-6_2_dpo_skyworkstulufull__1__1743550187", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6763 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7411 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.375 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.612 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8844 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 
- }, - "score_details": { - "score": 0.8545 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5908 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_2_rl__1__1743551509/237218ac-4c74-4647-82b1-700360ddfdbd.json b/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_2_rl__1__1743551509/237218ac-4c74-4647-82b1-700360ddfdbd.json deleted file mode 100644 index 6717d04eb..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_2_rl__1__1743551509/237218ac-4c74-4647-82b1-700360ddfdbd.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_3e-6_2_rl__1__1743551509/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_3e-6_2_rl__1__1743551509", - "id": "allenai/open_instruct_dev-rm_3e-6_2_rl__1__1743551509", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6245 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7242 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.35 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6175 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety 
awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8178 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7253 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5124 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_2_rl_skyworkstulufull__1__1743551498/2858d126-d2ef-4512-8fc8-c39faf24b908.json b/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_2_rl_skyworkstulufull__1__1743551498/2858d126-d2ef-4512-8fc8-c39faf24b908.json deleted file mode 100644 index 7e1595151..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_2_rl_skyworkstulufull__1__1743551498/2858d126-d2ef-4512-8fc8-c39faf24b908.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_3e-6_2_rl_skyworkstulufull__1__1743551498/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_3e-6_2_rl_skyworkstulufull__1__1743551498", - "id": "allenai/open_instruct_dev-rm_3e-6_2_rl_skyworkstulufull__1__1743551498", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. 
Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6673 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7326 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3438 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6175 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8622 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8566 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5911 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_2_skyworkstulu75__1__1743548926/d118ddb1-aafc-4ddf-b5c7-f3ff921bbe0c.json b/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_2_skyworkstulu75__1__1743548926/d118ddb1-aafc-4ddf-b5c7-f3ff921bbe0c.json deleted file mode 100644 index 599698f0e..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_2_skyworkstulu75__1__1743548926/d118ddb1-aafc-4ddf-b5c7-f3ff921bbe0c.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"reward-bench-2/allenai_open_instruct_dev-rm_3e-6_2_skyworkstulu75__1__1743548926/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_3e-6_2_skyworkstulu75__1__1743548926", - "id": "allenai/open_instruct_dev-rm_3e-6_2_skyworkstulu75__1__1743548926", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5863 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6674 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3937 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6284 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5515 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4768 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": 
"allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_2_skyworkstulumix__1__1743205661/379ec82f-a6a7-4976-a4a6-ab80cb9da293.json b/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_2_skyworkstulumix__1__1743205661/379ec82f-a6a7-4976-a4a6-ab80cb9da293.json deleted file mode 100644 index aa704ecb8..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_2_skyworkstulumix__1__1743205661/379ec82f-a6a7-4976-a4a6-ab80cb9da293.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_3e-6_2_skyworkstulumix__1__1743205661/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_3e-6_2_skyworkstulumix__1__1743205661", - "id": "allenai/open_instruct_dev-rm_3e-6_2_skyworkstulumix__1__1743205661", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.589 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6842 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3688 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6393 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7867 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - 
"min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6081 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.447 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_llama70b_skyworkstulufull__4__1747266598/c4df42d1-a838-4717-a814-40559fcd7342.json b/data/reward-bench/allenai/open_instruct_dev-rm_llama70b_skyworkstulufull__4__1747266598/c4df42d1-a838-4717-a814-40559fcd7342.json deleted file mode 100644 index 1c2c98ef1..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_llama70b_skyworkstulufull__4__1747266598/c4df42d1-a838-4717-a814-40559fcd7342.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_llama70b_skyworkstulufull__4__1747266598/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_llama70b_skyworkstulufull__4__1747266598", - "id": "allenai/open_instruct_dev-rm_llama70b_skyworkstulufull__4__1747266598", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. 
Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7306 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7474 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.375 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.694 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8622 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8061 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8992 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_llama70b_skyworkstulufull__8__1745387923/f022d826-3252-4def-b37b-3ce44d78f4ce.json b/data/reward-bench/allenai/open_instruct_dev-rm_llama70b_skyworkstulufull__8__1745387923/f022d826-3252-4def-b37b-3ce44d78f4ce.json deleted file mode 100644 index a60528170..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_llama70b_skyworkstulufull__8__1745387923/f022d826-3252-4def-b37b-3ce44d78f4ce.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"reward-bench-2/allenai_open_instruct_dev-rm_llama70b_skyworkstulufull__8__1745387923/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_llama70b_skyworkstulufull__8__1745387923", - "id": "allenai/open_instruct_dev-rm_llama70b_skyworkstulufull__8__1745387923", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7573 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8168 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4125 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7049 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8733 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8545 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8814 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": 
"hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_llama_1e-6_1__1__1743896628/cecc321b-efbd-434e-8a31-a97bbb8bbb3b.json b/data/reward-bench/allenai/open_instruct_dev-rm_llama_1e-6_1__1__1743896628/cecc321b-efbd-434e-8a31-a97bbb8bbb3b.json deleted file mode 100644 index 7523cf126..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_llama_1e-6_1__1__1743896628/cecc321b-efbd-434e-8a31-a97bbb8bbb3b.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_llama_1e-6_1__1__1743896628/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_llama_1e-6_1__1__1743896628", - "id": "allenai/open_instruct_dev-rm_llama_1e-6_1__1__1743896628", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6637 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6947 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4062 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6284 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8422 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - 
}, - "score_details": { - "score": 0.7273 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6834 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_llama_1e-6_1_skyworks__1__1744062999/278c2132-3415-48f4-a839-ed09d71e9240.json b/data/reward-bench/allenai/open_instruct_dev-rm_llama_1e-6_1_skyworks__1__1744062999/278c2132-3415-48f4-a839-ed09d71e9240.json deleted file mode 100644 index 884f2c1d7..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_llama_1e-6_1_skyworks__1__1744062999/278c2132-3415-48f4-a839-ed09d71e9240.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_llama_1e-6_1_skyworks__1__1744062999/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_llama_1e-6_1_skyworks__1__1744062999", - "id": "allenai/open_instruct_dev-rm_llama_1e-6_1_skyworks__1__1744062999", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. 
Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6665 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5979 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3688 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6339 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8956 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8606 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6422 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_llama_1e-6_1_skyworkstulufull__1__1743712777/92bbda1a-ecb1-493d-aa39-a29522c1a11e.json b/data/reward-bench/allenai/open_instruct_dev-rm_llama_1e-6_1_skyworkstulufull__1__1743712777/92bbda1a-ecb1-493d-aa39-a29522c1a11e.json deleted file mode 100644 index 58d203adf..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_llama_1e-6_1_skyworkstulufull__1__1743712777/92bbda1a-ecb1-493d-aa39-a29522c1a11e.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"reward-bench-2/allenai_open_instruct_dev-rm_llama_1e-6_1_skyworkstulufull__1__1743712777/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_llama_1e-6_1_skyworkstulufull__1__1743712777", - "id": "allenai/open_instruct_dev-rm_llama_1e-6_1_skyworkstulufull__1__1743712777", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7038 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6947 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3937 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6557 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8867 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8586 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7331 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": 
"hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_llama_1e-6_2__1__1743896638/f43b2dff-9e73-4779-86e0-b2cc30ae8b40.json b/data/reward-bench/allenai/open_instruct_dev-rm_llama_1e-6_2__1__1743896638/f43b2dff-9e73-4779-86e0-b2cc30ae8b40.json deleted file mode 100644 index eb2f9451c..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_llama_1e-6_2__1__1743896638/f43b2dff-9e73-4779-86e0-b2cc30ae8b40.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_llama_1e-6_2__1__1743896638/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_llama_1e-6_2__1__1743896638", - "id": "allenai/open_instruct_dev-rm_llama_1e-6_2__1__1743896638", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6754 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6716 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6339 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8756 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, 
- "score_details": { - "score": 0.7737 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6976 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_llama_1e-6_2_skyworkstulufull__1__1743800938/59a98f5d-d017-4b1a-a563-5abd113337e9.json b/data/reward-bench/allenai/open_instruct_dev-rm_llama_1e-6_2_skyworkstulufull__1__1743800938/59a98f5d-d017-4b1a-a563-5abd113337e9.json deleted file mode 100644 index b741da56e..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_llama_1e-6_2_skyworkstulufull__1__1743800938/59a98f5d-d017-4b1a-a563-5abd113337e9.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_llama_1e-6_2_skyworkstulufull__1__1743800938/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_llama_1e-6_2_skyworkstulufull__1__1743800938", - "id": "allenai/open_instruct_dev-rm_llama_1e-6_2_skyworkstulufull__1__1743800938", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. 
Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7241 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7305 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6667 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9422 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9414 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6635 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_llama_2e-5_1_skyworkstulufull__1__1743712885/a41597ed-fbab-41af-9625-c277ca988546.json b/data/reward-bench/allenai/open_instruct_dev-rm_llama_2e-5_1_skyworkstulufull__1__1743712885/a41597ed-fbab-41af-9625-c277ca988546.json deleted file mode 100644 index 4c6ec4ab3..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_llama_2e-5_1_skyworkstulufull__1__1743712885/a41597ed-fbab-41af-9625-c277ca988546.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"reward-bench-2/allenai_open_instruct_dev-rm_llama_2e-5_1_skyworkstulufull__1__1743712885/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_llama_2e-5_1_skyworkstulufull__1__1743712885", - "id": "allenai/open_instruct_dev-rm_llama_2e-5_1_skyworkstulufull__1__1743712885", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6716 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6632 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3688 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6284 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.82 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8303 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.719 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": 
"hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_llama_2e-5_2_skyworkstulufull__1__1743800773/e311eb59-f217-4bc2-b69b-dcea434797a8.json b/data/reward-bench/allenai/open_instruct_dev-rm_llama_2e-5_2_skyworkstulufull__1__1743800773/e311eb59-f217-4bc2-b69b-dcea434797a8.json deleted file mode 100644 index 92fb6052e..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_llama_2e-5_2_skyworkstulufull__1__1743800773/e311eb59-f217-4bc2-b69b-dcea434797a8.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_llama_2e-5_2_skyworkstulufull__1__1743800773/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_llama_2e-5_2_skyworkstulufull__1__1743800773", - "id": "allenai/open_instruct_dev-rm_llama_2e-5_2_skyworkstulufull__1__1743800773", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6207 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6358 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.375 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5902 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8267 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response 
focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.802 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4948 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_llama_2e-6_1_skyworkstulufull__1__1743893867/69b037c3-bae2-4889-b10d-e732c45851e9.json b/data/reward-bench/allenai/open_instruct_dev-rm_llama_2e-6_1_skyworkstulufull__1__1743893867/69b037c3-bae2-4889-b10d-e732c45851e9.json deleted file mode 100644 index 5c7a0911d..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_llama_2e-6_1_skyworkstulufull__1__1743893867/69b037c3-bae2-4889-b10d-e732c45851e9.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_llama_2e-6_1_skyworkstulufull__1__1743893867/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_llama_2e-6_1_skyworkstulufull__1__1743893867", - "id": "allenai/open_instruct_dev-rm_llama_2e-6_1_skyworkstulufull__1__1743893867", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. 
Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.719 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7263 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3875 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6393 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8956 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9273 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.738 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_llama_3e-6_1__1__1743929424/adeee000-0b62-4a0c-afaa-5e8c5f29ff6d.json b/data/reward-bench/allenai/open_instruct_dev-rm_llama_3e-6_1__1__1743929424/adeee000-0b62-4a0c-afaa-5e8c5f29ff6d.json deleted file mode 100644 index 8b895898b..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_llama_3e-6_1__1__1743929424/adeee000-0b62-4a0c-afaa-5e8c5f29ff6d.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"reward-bench-2/allenai_open_instruct_dev-rm_llama_3e-6_1__1__1743929424/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_llama_3e-6_1__1__1743929424", - "id": "allenai/open_instruct_dev-rm_llama_3e-6_1__1__1743929424", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6572 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7305 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3688 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6284 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8289 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.703 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6837 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": 
"allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_llama_3e-6_1__2__1744311395/4464d588-62b2-440b-8188-2450bd7a94c5.json b/data/reward-bench/allenai/open_instruct_dev-rm_llama_3e-6_1__2__1744311395/4464d588-62b2-440b-8188-2450bd7a94c5.json deleted file mode 100644 index cba9daba8..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_llama_3e-6_1__2__1744311395/4464d588-62b2-440b-8188-2450bd7a94c5.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_llama_3e-6_1__2__1744311395/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_llama_3e-6_1__2__1744311395", - "id": "allenai/open_instruct_dev-rm_llama_3e-6_1__2__1744311395", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6938 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7537 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.45 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6393 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8667 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.7616 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6913 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_llama_3e-6_1__3__1744311491/bf358648-a41d-43ee-8c14-f8b8eef41871.json b/data/reward-bench/allenai/open_instruct_dev-rm_llama_3e-6_1__3__1744311491/bf358648-a41d-43ee-8c14-f8b8eef41871.json deleted file mode 100644 index acca1c710..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_llama_3e-6_1__3__1744311491/bf358648-a41d-43ee-8c14-f8b8eef41871.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_llama_3e-6_1__3__1744311491/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_llama_3e-6_1__3__1744311491", - "id": "allenai/open_instruct_dev-rm_llama_3e-6_1__3__1744311491", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6754 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7242 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4062 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6284 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8422 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7535 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6976 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_llama_3e-6_1_skyworks__1__1744062787/afd99f12-f739-40d3-aa11-ef3a45316931.json b/data/reward-bench/allenai/open_instruct_dev-rm_llama_3e-6_1_skyworks__1__1744062787/afd99f12-f739-40d3-aa11-ef3a45316931.json deleted file mode 100644 index d8abd1886..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_llama_3e-6_1_skyworks__1__1744062787/afd99f12-f739-40d3-aa11-ef3a45316931.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_llama_3e-6_1_skyworks__1__1744062787/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_llama_3e-6_1_skyworks__1__1744062787", - "id": "allenai/open_instruct_dev-rm_llama_3e-6_1_skyworks__1__1744062787", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. 
Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7045 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6253 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3812 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6667 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.92 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9232 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7109 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_llama_3e-6_1_skyworkstulufull__2__1744311461/49b4a24b-ddf1-47f0-ba39-9366892a1213.json b/data/reward-bench/allenai/open_instruct_dev-rm_llama_3e-6_1_skyworkstulufull__2__1744311461/49b4a24b-ddf1-47f0-ba39-9366892a1213.json deleted file mode 100644 index 68019510c..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_llama_3e-6_1_skyworkstulufull__2__1744311461/49b4a24b-ddf1-47f0-ba39-9366892a1213.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"reward-bench-2/allenai_open_instruct_dev-rm_llama_3e-6_1_skyworkstulufull__2__1744311461/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_llama_3e-6_1_skyworkstulufull__2__1744311461", - "id": "allenai/open_instruct_dev-rm_llama_3e-6_1_skyworkstulufull__2__1744311461", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7189 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7305 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3937 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6066 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8978 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9374 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7475 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": 
"hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_llama_3e-6_1_skyworkstulufull__3__1744311780/ea14a487-39c3-488b-b52b-998e57135487.json b/data/reward-bench/allenai/open_instruct_dev-rm_llama_3e-6_1_skyworkstulufull__3__1744311780/ea14a487-39c3-488b-b52b-998e57135487.json deleted file mode 100644 index cb7ea678a..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_llama_3e-6_1_skyworkstulufull__3__1744311780/ea14a487-39c3-488b-b52b-998e57135487.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_llama_3e-6_1_skyworkstulufull__3__1744311780/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_llama_3e-6_1_skyworkstulufull__3__1744311780", - "id": "allenai/open_instruct_dev-rm_llama_3e-6_1_skyworkstulufull__3__1744311780", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7172 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7242 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4313 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6175 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8778 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response 
focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.897 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7555 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_llama_3e-6_2__1__1743896489/02f74b6a-7f63-484e-a7c1-0c53bd801b87.json b/data/reward-bench/allenai/open_instruct_dev-rm_llama_3e-6_2__1__1743896489/02f74b6a-7f63-484e-a7c1-0c53bd801b87.json deleted file mode 100644 index 462dccb75..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_llama_3e-6_2__1__1743896489/02f74b6a-7f63-484e-a7c1-0c53bd801b87.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_llama_3e-6_2__1__1743896489/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_llama_3e-6_2__1__1743896489", - "id": "allenai/open_instruct_dev-rm_llama_3e-6_2__1__1743896489", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. 
Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6813 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7137 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4437 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6284 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8644 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7596 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6781 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_llama_3e-6_2_skyworkstulufull__1__1743800713/e492c59d-4b03-4dce-983e-a8724de35a60.json b/data/reward-bench/allenai/open_instruct_dev-rm_llama_3e-6_2_skyworkstulufull__1__1743800713/e492c59d-4b03-4dce-983e-a8724de35a60.json deleted file mode 100644 index 1f6674a88..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_llama_3e-6_2_skyworkstulufull__1__1743800713/e492c59d-4b03-4dce-983e-a8724de35a60.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"reward-bench-2/allenai_open_instruct_dev-rm_llama_3e-6_2_skyworkstulufull__1__1743800713/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_llama_3e-6_2_skyworkstulufull__1__1743800713", - "id": "allenai/open_instruct_dev-rm_llama_3e-6_2_skyworkstulufull__1__1743800713", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7209 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7116 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3875 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6612 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9067 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9172 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7414 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": 
"hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_llama_4e-6_1_skyworkstulufull__1__1743893911/53de0394-8516-4882-b2bc-c7e62e3d8ef0.json b/data/reward-bench/allenai/open_instruct_dev-rm_llama_4e-6_1_skyworkstulufull__1__1743893911/53de0394-8516-4882-b2bc-c7e62e3d8ef0.json deleted file mode 100644 index f774c416d..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_llama_4e-6_1_skyworkstulufull__1__1743893911/53de0394-8516-4882-b2bc-c7e62e3d8ef0.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_llama_4e-6_1_skyworkstulufull__1__1743893911/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_llama_4e-6_1_skyworkstulufull__1__1743893911", - "id": "allenai/open_instruct_dev-rm_llama_4e-6_1_skyworkstulufull__1__1743893911", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7266 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7347 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4313 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6339 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8933 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response 
focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.897 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7697 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_llamabase_1e-6_1_skyworkstulufull__1__1745386412/56d4c1c5-5238-45dc-8331-64a14b830779.json b/data/reward-bench/allenai/open_instruct_dev-rm_llamabase_1e-6_1_skyworkstulufull__1__1745386412/56d4c1c5-5238-45dc-8331-64a14b830779.json deleted file mode 100644 index 41b8b3c61..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_llamabase_1e-6_1_skyworkstulufull__1__1745386412/56d4c1c5-5238-45dc-8331-64a14b830779.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_llamabase_1e-6_1_skyworkstulufull__1__1745386412/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_llamabase_1e-6_1_skyworkstulufull__1__1745386412", - "id": "allenai/open_instruct_dev-rm_llamabase_1e-6_1_skyworkstulufull__1__1745386412", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. 
Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5342 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6042 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.275 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6284 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7222 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5818 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3935 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_llamabase_1e-6_2_skyworkstulufull__1__1745441922/7003c9d4-c758-4373-a7a3-04822978bf35.json b/data/reward-bench/allenai/open_instruct_dev-rm_llamabase_1e-6_2_skyworkstulufull__1__1745441922/7003c9d4-c758-4373-a7a3-04822978bf35.json deleted file mode 100644 index 19e1f2b82..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_llamabase_1e-6_2_skyworkstulufull__1__1745441922/7003c9d4-c758-4373-a7a3-04822978bf35.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"reward-bench-2/allenai_open_instruct_dev-rm_llamabase_1e-6_2_skyworkstulufull__1__1745441922/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_llamabase_1e-6_2_skyworkstulufull__1__1745441922", - "id": "allenai/open_instruct_dev-rm_llamabase_1e-6_2_skyworkstulufull__1__1745441922", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6111 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6884 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3063 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.623 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8289 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7576 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4628 - }, - "source_data": { - "dataset_name": "RewardBench 2", - 
"source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_llamabase_2e-5_1_skyworkstulufull__1__1745386495/75a7dcb6-789c-49de-b209-4cf7d27465e4.json b/data/reward-bench/allenai/open_instruct_dev-rm_llamabase_2e-5_1_skyworkstulufull__1__1745386495/75a7dcb6-789c-49de-b209-4cf7d27465e4.json deleted file mode 100644 index 504f0108c..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_llamabase_2e-5_1_skyworkstulufull__1__1745386495/75a7dcb6-789c-49de-b209-4cf7d27465e4.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_llamabase_2e-5_1_skyworkstulufull__1__1745386495/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_llamabase_2e-5_1_skyworkstulufull__1__1745386495", - "id": "allenai/open_instruct_dev-rm_llamabase_2e-5_1_skyworkstulufull__1__1745386495", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5825 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6379 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.325 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5355 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8222 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": 
"Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7051 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4691 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_llamabase_2e-5_2_skyworkstulufull__1__1745386507/e91d3910-4f20-4e82-b1fb-8605f5d2b8ac.json b/data/reward-bench/allenai/open_instruct_dev-rm_llamabase_2e-5_2_skyworkstulufull__1__1745386507/e91d3910-4f20-4e82-b1fb-8605f5d2b8ac.json deleted file mode 100644 index b50170dc8..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_llamabase_2e-5_2_skyworkstulufull__1__1745386507/e91d3910-4f20-4e82-b1fb-8605f5d2b8ac.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_llamabase_2e-5_2_skyworkstulufull__1__1745386507/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_llamabase_2e-5_2_skyworkstulufull__1__1745386507", - "id": "allenai/open_instruct_dev-rm_llamabase_2e-5_2_skyworkstulufull__1__1745386507", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. 
Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5598 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5495 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3563 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5902 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.76 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7273 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3754 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_llamabase_3e-6_1_skyworkstulufull__1__1745386507/f18bfd44-3097-4eb8-a09c-2372c3ecd738.json b/data/reward-bench/allenai/open_instruct_dev-rm_llamabase_3e-6_1_skyworkstulufull__1__1745386507/f18bfd44-3097-4eb8-a09c-2372c3ecd738.json deleted file mode 100644 index 2ce4a6752..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_llamabase_3e-6_1_skyworkstulufull__1__1745386507/f18bfd44-3097-4eb8-a09c-2372c3ecd738.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"reward-bench-2/allenai_open_instruct_dev-rm_llamabase_3e-6_1_skyworkstulufull__1__1745386507/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_llamabase_3e-6_1_skyworkstulufull__1__1745386507", - "id": "allenai/open_instruct_dev-rm_llamabase_3e-6_1_skyworkstulufull__1__1745386507", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6101 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6632 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.35 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6175 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7778 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7111 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5408 - }, - "source_data": { - "dataset_name": "RewardBench 2", - 
"source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_qwen32b_1e-6_skyworkstulufull__8__1748235917/9ca974b9-c5fb-4fc4-ab3e-1246e31ecdb2.json b/data/reward-bench/allenai/open_instruct_dev-rm_qwen32b_1e-6_skyworkstulufull__8__1748235917/9ca974b9-c5fb-4fc4-ab3e-1246e31ecdb2.json deleted file mode 100644 index aef503803..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_qwen32b_1e-6_skyworkstulufull__8__1748235917/9ca974b9-c5fb-4fc4-ab3e-1246e31ecdb2.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_qwen32b_1e-6_skyworkstulufull__8__1748235917/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_qwen32b_1e-6_skyworkstulufull__8__1748235917", - "id": "allenai/open_instruct_dev-rm_qwen32b_1e-6_skyworkstulufull__8__1748235917", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7185 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7305 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4125 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7158 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7933 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures 
response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8545 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.804 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_qwen32b_3e-6_skyworkstulufull__8__1748288961/fb1ab5e0-18db-4e5f-add3-2352d9a1f260.json b/data/reward-bench/allenai/open_instruct_dev-rm_qwen32b_3e-6_skyworkstulufull__8__1748288961/fb1ab5e0-18db-4e5f-add3-2352d9a1f260.json deleted file mode 100644 index 3c5aab68c..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_qwen32b_3e-6_skyworkstulufull__8__1748288961/fb1ab5e0-18db-4e5f-add3-2352d9a1f260.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_qwen32b_3e-6_skyworkstulufull__8__1748288961/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_qwen32b_3e-6_skyworkstulufull__8__1748288961", - "id": "allenai/open_instruct_dev-rm_qwen32b_3e-6_skyworkstulufull__8__1748288961", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. 
Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7325 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7474 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4437 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7158 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7978 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8141 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8763 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_qwen_1e-6_1_skyworks__1__1744062830/60ba1f0d-7e85-49e4-8c73-330d74de6707.json b/data/reward-bench/allenai/open_instruct_dev-rm_qwen_1e-6_1_skyworks__1__1744062830/60ba1f0d-7e85-49e4-8c73-330d74de6707.json deleted file mode 100644 index 4f30313c2..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_qwen_1e-6_1_skyworks__1__1744062830/60ba1f0d-7e85-49e4-8c73-330d74de6707.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"reward-bench-2/allenai_open_instruct_dev-rm_qwen_1e-6_1_skyworks__1__1744062830/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_qwen_1e-6_1_skyworks__1__1744062830", - "id": "allenai/open_instruct_dev-rm_qwen_1e-6_1_skyworks__1__1744062830", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6022 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5284 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.325 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.694 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7556 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7616 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5486 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": 
"allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_qwen_1e-6_1_skyworks__2__1744576024/29d1c194-8b87-466c-8701-e0fcf267665c.json b/data/reward-bench/allenai/open_instruct_dev-rm_qwen_1e-6_1_skyworks__2__1744576024/29d1c194-8b87-466c-8701-e0fcf267665c.json deleted file mode 100644 index e82661177..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_qwen_1e-6_1_skyworks__2__1744576024/29d1c194-8b87-466c-8701-e0fcf267665c.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_qwen_1e-6_1_skyworks__2__1744576024/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_qwen_1e-6_1_skyworks__2__1744576024", - "id": "allenai/open_instruct_dev-rm_qwen_1e-6_1_skyworks__2__1744576024", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5948 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5579 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2875 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6776 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.72 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.7394 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5863 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_qwen_1e-6_1_skyworkstulufull__1__1743712914/31e8f616-7b64-4d1a-b395-20bf8bb4629c.json b/data/reward-bench/allenai/open_instruct_dev-rm_qwen_1e-6_1_skyworkstulufull__1__1743712914/31e8f616-7b64-4d1a-b395-20bf8bb4629c.json deleted file mode 100644 index 0ba320e56..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_qwen_1e-6_1_skyworkstulufull__1__1743712914/31e8f616-7b64-4d1a-b395-20bf8bb4629c.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_qwen_1e-6_1_skyworkstulufull__1__1743712914/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_qwen_1e-6_1_skyworkstulufull__1__1743712914", - "id": "allenai/open_instruct_dev-rm_qwen_1e-6_1_skyworkstulufull__1__1743712914", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. 
Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6492 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6084 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.35 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6776 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.76 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.699 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_qwen_2e-5_1_skyworkstulufull__1__1743713091/cc3f315d-3cea-47e4-83b4-b5045e778c5e.json b/data/reward-bench/allenai/open_instruct_dev-rm_qwen_2e-5_1_skyworkstulufull__1__1743713091/cc3f315d-3cea-47e4-83b4-b5045e778c5e.json deleted file mode 100644 index 4d27acb60..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_qwen_2e-5_1_skyworkstulufull__1__1743713091/cc3f315d-3cea-47e4-83b4-b5045e778c5e.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"reward-bench-2/allenai_open_instruct_dev-rm_qwen_2e-5_1_skyworkstulufull__1__1743713091/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_qwen_2e-5_1_skyworkstulufull__1__1743713091", - "id": "allenai/open_instruct_dev-rm_qwen_2e-5_1_skyworkstulufull__1__1743713091", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6764 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7074 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6885 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8622 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.802 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6984 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": 
"hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_qwen_3e-6_1_skyworks__1__1744062829/5d20dbf8-bb14-46af-adcd-b7ba05f8352c.json b/data/reward-bench/allenai/open_instruct_dev-rm_qwen_3e-6_1_skyworks__1__1744062829/5d20dbf8-bb14-46af-adcd-b7ba05f8352c.json deleted file mode 100644 index 68ba40d15..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_qwen_3e-6_1_skyworks__1__1744062829/5d20dbf8-bb14-46af-adcd-b7ba05f8352c.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_qwen_3e-6_1_skyworks__1__1744062829/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_qwen_3e-6_1_skyworks__1__1744062829", - "id": "allenai/open_instruct_dev-rm_qwen_3e-6_1_skyworks__1__1744062829", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6408 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6337 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3063 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6831 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8467 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8222 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5529 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_qwen_3e-6_1_skyworks__2__1744576050/06f2cb33-3937-4fde-84e2-6b5467f051c6.json b/data/reward-bench/allenai/open_instruct_dev-rm_qwen_3e-6_1_skyworks__2__1744576050/06f2cb33-3937-4fde-84e2-6b5467f051c6.json deleted file mode 100644 index 69c6a7c36..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_qwen_3e-6_1_skyworks__2__1744576050/06f2cb33-3937-4fde-84e2-6b5467f051c6.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_qwen_3e-6_1_skyworks__2__1744576050/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_qwen_3e-6_1_skyworks__2__1744576050", - "id": "allenai/open_instruct_dev-rm_qwen_3e-6_1_skyworks__2__1744576050", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. 
Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6452 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6063 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3187 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7158 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8356 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8343 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5603 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_qwen_3e-6_1_skyworkstulufull__1__1743712916/f35c4efa-3767-4a0e-8769-06230cda2512.json b/data/reward-bench/allenai/open_instruct_dev-rm_qwen_3e-6_1_skyworkstulufull__1__1743712916/f35c4efa-3767-4a0e-8769-06230cda2512.json deleted file mode 100644 index 3817a0a83..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_qwen_3e-6_1_skyworkstulufull__1__1743712916/f35c4efa-3767-4a0e-8769-06230cda2512.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"reward-bench-2/allenai_open_instruct_dev-rm_qwen_3e-6_1_skyworkstulufull__1__1743712916/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_qwen_3e-6_1_skyworkstulufull__1__1743712916", - "id": "allenai/open_instruct_dev-rm_qwen_3e-6_1_skyworkstulufull__1__1743712916", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7013 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7263 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3438 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6995 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8222 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8444 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7714 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": 
"hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_qwen_3e-6_2__1__1743023576/6cb65d6a-6c46-4991-8154-f28b101954f6.json b/data/reward-bench/allenai/open_instruct_dev-rm_qwen_3e-6_2__1__1743023576/6cb65d6a-6c46-4991-8154-f28b101954f6.json deleted file mode 100644 index 05ff3f527..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_qwen_3e-6_2__1__1743023576/6cb65d6a-6c46-4991-8154-f28b101954f6.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_qwen_3e-6_2__1__1743023576/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_qwen_3e-6_2__1__1743023576", - "id": "allenai/open_instruct_dev-rm_qwen_3e-6_2__1__1743023576", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6369 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6905 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3187 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6448 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7844 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.7596 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6236 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_qwen_3e-6_3__1__1743023619/6e15a49b-7dc4-4d69-965e-cb962c084e4a.json b/data/reward-bench/allenai/open_instruct_dev-rm_qwen_3e-6_3__1__1743023619/6e15a49b-7dc4-4d69-965e-cb962c084e4a.json deleted file mode 100644 index 2295c0011..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_qwen_3e-6_3__1__1743023619/6e15a49b-7dc4-4d69-965e-cb962c084e4a.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_qwen_3e-6_3__1__1743023619/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_qwen_3e-6_3__1__1743023619", - "id": "allenai/open_instruct_dev-rm_qwen_3e-6_3__1__1743023619", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6221 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6674 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.325 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.612 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety 
awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7978 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7455 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5852 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_qwenbase_1e-6_1_skyworkstulufull__1__1745388583/9f5591f4-751d-48d3-a348-4bb59f6bb1a3.json b/data/reward-bench/allenai/open_instruct_dev-rm_qwenbase_1e-6_1_skyworkstulufull__1__1745388583/9f5591f4-751d-48d3-a348-4bb59f6bb1a3.json deleted file mode 100644 index afd3a2b0e..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_qwenbase_1e-6_1_skyworkstulufull__1__1745388583/9f5591f4-751d-48d3-a348-4bb59f6bb1a3.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_qwenbase_1e-6_1_skyworkstulufull__1__1745388583/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_qwenbase_1e-6_1_skyworkstulufull__1__1745388583", - "id": "allenai/open_instruct_dev-rm_qwenbase_1e-6_1_skyworkstulufull__1__1745388583", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. 
Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5735 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5895 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2625 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6448 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6889 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6727 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5823 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_qwenbase_1e-6_2_skyworkstulufull__1__1745388604/b609c002-fa0a-46a8-b5a1-9213ee89606c.json b/data/reward-bench/allenai/open_instruct_dev-rm_qwenbase_1e-6_2_skyworkstulufull__1__1745388604/b609c002-fa0a-46a8-b5a1-9213ee89606c.json deleted file mode 100644 index ce70a17a6..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_qwenbase_1e-6_2_skyworkstulufull__1__1745388604/b609c002-fa0a-46a8-b5a1-9213ee89606c.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"reward-bench-2/allenai_open_instruct_dev-rm_qwenbase_1e-6_2_skyworkstulufull__1__1745388604/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_qwenbase_1e-6_2_skyworkstulufull__1__1745388604", - "id": "allenai/open_instruct_dev-rm_qwenbase_1e-6_2_skyworkstulufull__1__1745388604", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6336 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6337 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3063 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6885 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7244 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.802 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6465 - }, - "source_data": { - "dataset_name": "RewardBench 2", - 
"source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_qwenbase_2e-5_1_skyworkstulufull__1__1745388738/b147fc7f-0e31-49ca-abfd-ba990a925097.json b/data/reward-bench/allenai/open_instruct_dev-rm_qwenbase_2e-5_1_skyworkstulufull__1__1745388738/b147fc7f-0e31-49ca-abfd-ba990a925097.json deleted file mode 100644 index 23d89ab11..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_qwenbase_2e-5_1_skyworkstulufull__1__1745388738/b147fc7f-0e31-49ca-abfd-ba990a925097.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_qwenbase_2e-5_1_skyworkstulufull__1__1745388738/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_qwenbase_2e-5_1_skyworkstulufull__1__1745388738", - "id": "allenai/open_instruct_dev-rm_qwenbase_2e-5_1_skyworkstulufull__1__1745388738", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6824 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6989 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3625 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6831 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8311 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": 
"Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8081 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7107 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_qwenbase_2e-5_2_skyworkstulufull__1__1745388191/e4fbfe23-2b70-459e-821b-db0116d43d8c.json b/data/reward-bench/allenai/open_instruct_dev-rm_qwenbase_2e-5_2_skyworkstulufull__1__1745388191/e4fbfe23-2b70-459e-821b-db0116d43d8c.json deleted file mode 100644 index b6f1f989c..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_qwenbase_2e-5_2_skyworkstulufull__1__1745388191/e4fbfe23-2b70-459e-821b-db0116d43d8c.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_qwenbase_2e-5_2_skyworkstulufull__1__1745388191/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_qwenbase_2e-5_2_skyworkstulufull__1__1745388191", - "id": "allenai/open_instruct_dev-rm_qwenbase_2e-5_2_skyworkstulufull__1__1745388191", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. 
Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6392 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6589 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3312 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6995 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7933 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7717 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5804 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_qwenbase_3e-6_1_skyworkstulufull__1__1745388737/2ab7dc14-af3e-4fb2-8c0c-fe0e14100321.json b/data/reward-bench/allenai/open_instruct_dev-rm_qwenbase_3e-6_1_skyworkstulufull__1__1745388737/2ab7dc14-af3e-4fb2-8c0c-fe0e14100321.json deleted file mode 100644 index 9f16f2dae..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_qwenbase_3e-6_1_skyworkstulufull__1__1745388737/2ab7dc14-af3e-4fb2-8c0c-fe0e14100321.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"reward-bench-2/allenai_open_instruct_dev-rm_qwenbase_3e-6_1_skyworkstulufull__1__1745388737/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_qwenbase_3e-6_1_skyworkstulufull__1__1745388737", - "id": "allenai/open_instruct_dev-rm_qwenbase_3e-6_1_skyworkstulufull__1__1745388737", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.664 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6821 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3312 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6448 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8133 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8061 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7066 - }, - "source_data": { - "dataset_name": "RewardBench 2", - 
"source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_qwenbase_3e-6_2_skyworkstulufull__1__1745388138/aca2c665-79f2-4226-b806-307be277ed08.json b/data/reward-bench/allenai/open_instruct_dev-rm_qwenbase_3e-6_2_skyworkstulufull__1__1745388138/aca2c665-79f2-4226-b806-307be277ed08.json deleted file mode 100644 index d9c996fb0..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_qwenbase_3e-6_2_skyworkstulufull__1__1745388138/aca2c665-79f2-4226-b806-307be277ed08.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_qwenbase_3e-6_2_skyworkstulufull__1__1745388138/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_qwenbase_3e-6_2_skyworkstulufull__1__1745388138", - "id": "allenai/open_instruct_dev-rm_qwenbase_3e-6_2_skyworkstulufull__1__1745388138", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6678 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6505 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3312 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6831 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7978 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": 
"Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8808 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6632 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_tulu3_70b_1__8__1742924455/d37a63df-6d38-4083-bf87-11064162efde.json b/data/reward-bench/allenai/open_instruct_dev-rm_tulu3_70b_1__8__1742924455/d37a63df-6d38-4083-bf87-11064162efde.json deleted file mode 100644 index 61f3caac6..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_tulu3_70b_1__8__1742924455/d37a63df-6d38-4083-bf87-11064162efde.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_tulu3_70b_1__8__1742924455/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_tulu3_70b_1__8__1742924455", - "id": "allenai/open_instruct_dev-rm_tulu3_70b_1__8__1742924455", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. 
Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6618 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7958 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.325 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6557 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8311 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6323 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7311 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_tulu3_70b_2__8__1742982964/16e550cc-e59d-4aaa-b221-8cf71e1b26d2.json b/data/reward-bench/allenai/open_instruct_dev-rm_tulu3_70b_2__8__1742982964/16e550cc-e59d-4aaa-b221-8cf71e1b26d2.json deleted file mode 100644 index 2b5278d3d..000000000 --- a/data/reward-bench/allenai/open_instruct_dev-rm_tulu3_70b_2__8__1742982964/16e550cc-e59d-4aaa-b221-8cf71e1b26d2.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"reward-bench-2/allenai_open_instruct_dev-rm_tulu3_70b_2__8__1742982964/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/open_instruct_dev-rm_tulu3_70b_2__8__1742982964", - "id": "allenai/open_instruct_dev-rm_tulu3_70b_2__8__1742982964", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6605 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7789 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3688 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6448 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8844 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6667 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6195 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": 
"allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/allenai/tulu-2-dpo-13b/47058e2a-dc41-45f8-8c32-bc496a8d3bc5.json b/data/reward-bench/allenai/tulu-2-dpo-13b/47058e2a-dc41-45f8-8c32-bc496a8d3bc5.json deleted file mode 100644 index 16dc6eef6..000000000 --- a/data/reward-bench/allenai/tulu-2-dpo-13b/47058e2a-dc41-45f8-8c32-bc496a8d3bc5.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/allenai_tulu-2-dpo-13b/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/tulu-2-dpo-13b", - "id": "allenai/tulu-2-dpo-13b", - "developer": "allenai", - "additional_details": { - "model_type": "DPO" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7368 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9581 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5833 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7946 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7323 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4947 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/allenai/tulu-2-dpo-70b/7199c8b3-8346-4200-b07e-4362ad13a7db.json 
b/data/reward-bench/allenai/tulu-2-dpo-70b/7199c8b3-8346-4200-b07e-4362ad13a7db.json deleted file mode 100644 index d68ff3ff8..000000000 --- a/data/reward-bench/allenai/tulu-2-dpo-70b/7199c8b3-8346-4200-b07e-4362ad13a7db.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/allenai_tulu-2-dpo-70b/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/tulu-2-dpo-70b", - "id": "allenai/tulu-2-dpo-70b", - "developer": "allenai", - "additional_details": { - "model_type": "DPO" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7621 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9749 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6053 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8446 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7407 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5278 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/allenai/tulu-2-dpo-7b/de7e59d5-e2ce-4479-bbd9-ab9deb3beed3.json b/data/reward-bench/allenai/tulu-2-dpo-7b/de7e59d5-e2ce-4479-bbd9-ab9deb3beed3.json deleted file mode 100644 index bef43cd19..000000000 --- 
a/data/reward-bench/allenai/tulu-2-dpo-7b/de7e59d5-e2ce-4479-bbd9-ab9deb3beed3.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/allenai_tulu-2-dpo-7b/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/tulu-2-dpo-7b", - "id": "allenai/tulu-2-dpo-7b", - "developer": "allenai", - "additional_details": { - "model_type": "DPO" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7212 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9749 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5614 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7527 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7176 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4774 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/allenai/tulu-v2.5-13b-preference-mix-rm/17e011c3-1a53-40ae-b7b4-cb24c23df3de.json b/data/reward-bench/allenai/tulu-v2.5-13b-preference-mix-rm/17e011c3-1a53-40ae-b7b4-cb24c23df3de.json deleted file mode 100644 index 15c29fb58..000000000 --- a/data/reward-bench/allenai/tulu-v2.5-13b-preference-mix-rm/17e011c3-1a53-40ae-b7b4-cb24c23df3de.json +++ /dev/null @@ 
-1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/allenai_tulu-v2.5-13b-preference-mix-rm/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/tulu-v2.5-13b-preference-mix-rm", - "id": "allenai/tulu-v2.5-13b-preference-mix-rm", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8027 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9358 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.682 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.773 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.885 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6724 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/allenai/tulu-v2.5-13b-uf-rm/1125dd05-2f0d-48ca-825c-f5efa18564aa.json b/data/reward-bench/allenai/tulu-v2.5-13b-uf-rm/1125dd05-2f0d-48ca-825c-f5efa18564aa.json deleted file mode 100644 index 817d26686..000000000 --- a/data/reward-bench/allenai/tulu-v2.5-13b-uf-rm/1125dd05-2f0d-48ca-825c-f5efa18564aa.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"reward-bench/allenai_tulu-v2.5-13b-uf-rm/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/tulu-v2.5-13b-uf-rm", - "id": "allenai/tulu-v2.5-13b-uf-rm", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4806 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3939 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4232 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5554 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4737 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6326 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/allenai/tulu-v2.5-70b-preference-mix-rm/88014e0d-e89b-4fed-9eb6-5276bd7658df.json b/data/reward-bench/allenai/tulu-v2.5-70b-preference-mix-rm/88014e0d-e89b-4fed-9eb6-5276bd7658df.json deleted file mode 100644 index 9fc720998..000000000 --- a/data/reward-bench/allenai/tulu-v2.5-70b-preference-mix-rm/88014e0d-e89b-4fed-9eb6-5276bd7658df.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"reward-bench/allenai_tulu-v2.5-70b-preference-mix-rm/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/tulu-v2.5-70b-preference-mix-rm", - "id": "allenai/tulu-v2.5-70b-preference-mix-rm", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6516 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7737 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5921 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8486 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4138 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6079 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/allenai/tulu-v2.5-70b-uf-rm/7cc9bfc2-570d-456c-918f-68fd4b711f05.json b/data/reward-bench/allenai/tulu-v2.5-70b-uf-rm/7cc9bfc2-570d-456c-918f-68fd4b711f05.json deleted file mode 100644 index b30d36361..000000000 --- a/data/reward-bench/allenai/tulu-v2.5-70b-uf-rm/7cc9bfc2-570d-456c-918f-68fd4b711f05.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/allenai_tulu-v2.5-70b-uf-rm/1766412838.146816", - 
"retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "allenai/tulu-v2.5-70b-uf-rm", - "id": "allenai/tulu-v2.5-70b-uf-rm", - "developer": "allenai", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7398 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8659 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7171 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7014 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.757 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5757 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/berkeley-nest/Starling-RM-7B-alpha/77b0957f-8779-4dbe-a6ea-cff50c4ee73b.json b/data/reward-bench/berkeley-nest/Starling-RM-7B-alpha/77b0957f-8779-4dbe-a6ea-cff50c4ee73b.json deleted file mode 100644 index 66d40e95d..000000000 --- a/data/reward-bench/berkeley-nest/Starling-RM-7B-alpha/77b0957f-8779-4dbe-a6ea-cff50c4ee73b.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/berkeley-nest_Starling-RM-7B-alpha/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - 
"source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "berkeley-nest/Starling-RM-7B-alpha", - "id": "berkeley-nest/Starling-RM-7B-alpha", - "developer": "berkeley-nest", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7113 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9804 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4561 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8446 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.58 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6794 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/facebook/Self-taught-Llama-3-70B/ba0ce7ce-a755-4337-bfec-0391680d3625.json b/data/reward-bench/facebook/Self-taught-Llama-3-70B/ba0ce7ce-a755-4337-bfec-0391680d3625.json deleted file mode 100644 index 96835a8ce..000000000 --- a/data/reward-bench/facebook/Self-taught-Llama-3-70B/ba0ce7ce-a755-4337-bfec-0391680d3625.json +++ /dev/null @@ -1,112 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/facebook_Self-taught-Llama-3-70B/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - 
"source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "facebook/Self-taught-Llama-3-70B", - "id": "facebook/Self-taught-Llama-3-70B", - "developer": "facebook", - "additional_details": { - "model_type": "Generative" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8863 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9693 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8399 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9108 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8251 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/facebook/Self-taught-evaluator-llama3.1-70B/4eb460eb-b3ad-4e0d-b131-5b59ef54015c.json b/data/reward-bench/facebook/Self-taught-evaluator-llama3.1-70B/4eb460eb-b3ad-4e0d-b131-5b59ef54015c.json deleted file mode 100644 index a0b337292..000000000 --- a/data/reward-bench/facebook/Self-taught-evaluator-llama3.1-70B/4eb460eb-b3ad-4e0d-b131-5b59ef54015c.json +++ /dev/null @@ -1,112 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/facebook_Self-taught-evaluator-llama3.1-70B/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "facebook/Self-taught-evaluator-llama3.1-70B", - "id": "facebook/Self-taught-evaluator-llama3.1-70B", - "developer": "facebook", - "additional_details": { - "model_type": "Generative" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - 
"evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9001 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9693 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8509 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8959 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8844 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/general-preference/GPM-Gemma-2B/6868a1e5-ee86-4f89-8452-5e939ac19169.json b/data/reward-bench/general-preference/GPM-Gemma-2B/6868a1e5-ee86-4f89-8452-5e939ac19169.json deleted file mode 100644 index 30c26da89..000000000 --- a/data/reward-bench/general-preference/GPM-Gemma-2B/6868a1e5-ee86-4f89-8452-5e939ac19169.json +++ /dev/null @@ -1,112 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/general-preference_GPM-Gemma-2B/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "general-preference/GPM-Gemma-2B", - "id": "general-preference/GPM-Gemma-2B", - "developer": "general-preference", - "additional_details": { - "model_type": "Custom Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7449 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7151 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6974 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8122 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.755 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/general-preference/GPM-Llama-3.1-8B/4a151d43-5fac-4afe-9c23-ba0e86a60849.json b/data/reward-bench/general-preference/GPM-Llama-3.1-8B/4a151d43-5fac-4afe-9c23-ba0e86a60849.json deleted file mode 100644 index d66a7ae70..000000000 --- a/data/reward-bench/general-preference/GPM-Llama-3.1-8B/4a151d43-5fac-4afe-9c23-ba0e86a60849.json +++ /dev/null @@ -1,112 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/general-preference_GPM-Llama-3.1-8B/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "general-preference/GPM-Llama-3.1-8B", - "id": "general-preference/GPM-Llama-3.1-8B", - "developer": "general-preference", - "additional_details": { - "model_type": "Custom Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9224 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.933 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.886 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9108 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9597 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/google/flame-1.0-24B-july-2024/5f16d574-adef-4016-abcf-9e7936771ba7.json b/data/reward-bench/google/flame-1.0-24B-july-2024/5f16d574-adef-4016-abcf-9e7936771ba7.json deleted file mode 100644 index 4f1439052..000000000 --- a/data/reward-bench/google/flame-1.0-24B-july-2024/5f16d574-adef-4016-abcf-9e7936771ba7.json +++ /dev/null @@ -1,112 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/google_flame-1.0-24B-july-2024/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "google/flame-1.0-24B-july-2024", - "id": "google/flame-1.0-24B-july-2024", - "developer": "google", - "additional_details": { - "model_type": "Generative" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8781 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9218 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7566 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8959 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", 
- "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.938 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/google/gemini-1.5-flash-001/f3e0300f-39ed-4cfd-bd03-218904836037.json b/data/reward-bench/google/gemini-1.5-flash-001/f3e0300f-39ed-4cfd-bd03-218904836037.json deleted file mode 100644 index 0c59d6494..000000000 --- a/data/reward-bench/google/gemini-1.5-flash-001/f3e0300f-39ed-4cfd-bd03-218904836037.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/google_gemini-1.5-flash-001/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "google/gemini-1.5-flash-001", - "id": "google/gemini-1.5-flash-001", - "developer": "google", - "additional_details": { - "model_type": "Generative" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8054 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9218 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6349 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8696 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8512 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - 
"evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6937 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/google/gemini-1.5-flash-8b/42c82c00-b74e-4152-a222-15d481a13e0c.json b/data/reward-bench/google/gemini-1.5-flash-8b/42c82c00-b74e-4152-a222-15d481a13e0c.json deleted file mode 100644 index 11fe9a9c7..000000000 --- a/data/reward-bench/google/gemini-1.5-flash-8b/42c82c00-b74e-4152-a222-15d481a13e0c.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/google_gemini-1.5-flash-8b/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "google/gemini-1.5-flash-8b", - "id": "google/gemini-1.5-flash-8b", - "developer": "google", - "additional_details": { - "model_type": "Generative RM" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4851 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4611 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3625 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5082 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6622 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response 
focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6747 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2421 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/google/gemini-1.5-pro-0514/68096be8-c49f-4a23-824e-1275248369f7.json b/data/reward-bench/google/gemini-1.5-pro-0514/68096be8-c49f-4a23-824e-1275248369f7.json deleted file mode 100644 index 1faa0442f..000000000 --- a/data/reward-bench/google/gemini-1.5-pro-0514/68096be8-c49f-4a23-824e-1275248369f7.json +++ /dev/null @@ -1,112 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/google_gemini-1.5-pro-0514/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "google/gemini-1.5-pro-0514", - "id": "google/gemini-1.5-pro-0514", - "developer": "google", - "additional_details": { - "model_type": "Generative" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.882 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9232 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8059 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8791 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.9199 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/google/gemini-1.5-pro-0924/c91270bd-3731-452a-b429-6cd4943d1194.json b/data/reward-bench/google/gemini-1.5-pro-0924/c91270bd-3731-452a-b429-6cd4943d1194.json deleted file mode 100644 index 2eb44d882..000000000 --- a/data/reward-bench/google/gemini-1.5-pro-0924/c91270bd-3731-452a-b429-6cd4943d1194.json +++ /dev/null @@ -1,112 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/google_gemini-1.5-pro-0924/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "google/gemini-1.5-pro-0924", - "id": "google/gemini-1.5-pro-0924", - "developer": "google", - "additional_details": { - "model_type": "Generative" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8678 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9413 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7697 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8581 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9022 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/google/gemini-2.5-flash-preview-04-17/337c7a43-46a7-4acb-b7f1-936e1f2cf46f.json b/data/reward-bench/google/gemini-2.5-flash-preview-04-17/337c7a43-46a7-4acb-b7f1-936e1f2cf46f.json deleted file mode 100644 index 8abf7e861..000000000 --- a/data/reward-bench/google/gemini-2.5-flash-preview-04-17/337c7a43-46a7-4acb-b7f1-936e1f2cf46f.json +++ 
/dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/google_gemini-2.5-flash-preview-04-17/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "google/gemini-2.5-flash-preview-04-17", - "id": "google/gemini-2.5-flash-preview-04-17", - "developer": "google", - "additional_details": { - "model_type": "Generative RM" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7721 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6574 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5531 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8115 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9094 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8672 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8341 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": 
"allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/google/gemini-2.5-flash/3b00f881-8f73-4608-8cbb-846fe7d1cfea.json b/data/reward-bench/google/gemini-2.5-flash/3b00f881-8f73-4608-8cbb-846fe7d1cfea.json deleted file mode 100644 index 5a4a0577c..000000000 --- a/data/reward-bench/google/gemini-2.5-flash/3b00f881-8f73-4608-8cbb-846fe7d1cfea.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/google_gemini-2.5-flash/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "google/gemini-2.5-flash", - "id": "google/gemini-2.5-flash", - "developer": "google", - "additional_details": { - "model_type": "Generative RM" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7767 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.674 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.575 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.852 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.909 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.841 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - 
"metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.809 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/google/gemini-2.5-pro-preview-05-06/2821dfdc-291b-405e-bd81-cf536c802885.json b/data/reward-bench/google/gemini-2.5-pro-preview-05-06/2821dfdc-291b-405e-bd81-cf536c802885.json deleted file mode 100644 index a4d4ee1dd..000000000 --- a/data/reward-bench/google/gemini-2.5-pro-preview-05-06/2821dfdc-291b-405e-bd81-cf536c802885.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/google_gemini-2.5-pro-preview-05-06/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "google/gemini-2.5-pro-preview-05-06", - "id": "google/gemini-2.5-pro-preview-05-06", - "developer": "google", - "additional_details": { - "model_type": "Generative RM" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6775 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6532 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4688 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5342 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8806 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - 
"metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8308 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6973 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/google/gemini-2.5-pro/7d441240-7e85-4776-b51c-3c1bc84456ba.json b/data/reward-bench/google/gemini-2.5-pro/7d441240-7e85-4776-b51c-3c1bc84456ba.json deleted file mode 100644 index f67d63bbb..000000000 --- a/data/reward-bench/google/gemini-2.5-pro/7d441240-7e85-4776-b51c-3c1bc84456ba.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/google_gemini-2.5-pro/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "google/gemini-2.5-pro", - "id": "google/gemini-2.5-pro", - "developer": "google", - "additional_details": { - "model_type": "Generative RM" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7948 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.755 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.619 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.898 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - 
"lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.881 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.805 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.811 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/google/gemma-2-27b-it/840d35d9-441e-4ba3-bbc3-1f4ff2627517.json b/data/reward-bench/google/gemma-2-27b-it/840d35d9-441e-4ba3-bbc3-1f4ff2627517.json deleted file mode 100644 index 5bc50f14c..000000000 --- a/data/reward-bench/google/gemma-2-27b-it/840d35d9-441e-4ba3-bbc3-1f4ff2627517.json +++ /dev/null @@ -1,112 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/google_gemma-2-27b-it/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "google/gemma-2-27b-it", - "id": "google/gemma-2-27b-it", - "developer": "google", - "additional_details": { - "model_type": "Generative" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.809 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9483 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.591 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8635 - }, - "source_data": { - 
"dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.833 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/hendrydong/Mistral-RM-for-RAFT-GSHF-v0/0127f3c5-9657-4eb6-a77a-5a6476a8fc79.json b/data/reward-bench/hendrydong/Mistral-RM-for-RAFT-GSHF-v0/0127f3c5-9657-4eb6-a77a-5a6476a8fc79.json deleted file mode 100644 index b2d697ddd..000000000 --- a/data/reward-bench/hendrydong/Mistral-RM-for-RAFT-GSHF-v0/0127f3c5-9657-4eb6-a77a-5a6476a8fc79.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/hendrydong_Mistral-RM-for-RAFT-GSHF-v0/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "hendrydong/Mistral-RM-for-RAFT-GSHF-v0", - "id": "hendrydong/Mistral-RM-for-RAFT-GSHF-v0", - "developer": "hendrydong", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7847 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9832 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5789 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.85 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7434 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": 
"hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7508 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/hendrydong/Mistral-RM-for-RAFT-GSHF-v0/b72e2988-75e4-4d26-9a47-daae4786b02f.json b/data/reward-bench/hendrydong/Mistral-RM-for-RAFT-GSHF-v0/b72e2988-75e4-4d26-9a47-daae4786b02f.json deleted file mode 100644 index 53579af56..000000000 --- a/data/reward-bench/hendrydong/Mistral-RM-for-RAFT-GSHF-v0/b72e2988-75e4-4d26-9a47-daae4786b02f.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/hendrydong_Mistral-RM-for-RAFT-GSHF-v0/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "hendrydong/Mistral-RM-for-RAFT-GSHF-v0", - "id": "hendrydong/Mistral-RM-for-RAFT-GSHF-v0", - "developer": "hendrydong", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5851 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5779 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3625 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6011 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6956 - }, - "source_data": { - "dataset_name": 
"RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6747 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5988 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/infly/INF-ORM-Llama3.1-70B/643cf5a3-8992-4126-87c9-814887314266.json b/data/reward-bench/infly/INF-ORM-Llama3.1-70B/643cf5a3-8992-4126-87c9-814887314266.json deleted file mode 100644 index 4bce79497..000000000 --- a/data/reward-bench/infly/INF-ORM-Llama3.1-70B/643cf5a3-8992-4126-87c9-814887314266.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/infly_INF-ORM-Llama3.1-70B/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "infly/INF-ORM-Llama3.1-70B", - "id": "infly/INF-ORM-Llama3.1-70B", - "developer": "infly", - "additional_details": { - "model_type": "Seq. 
Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7648 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7411 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4188 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6995 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9644 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.903 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8622 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/infly/INF-ORM-Llama3.1-70B/f81f1f67-6506-481f-87ce-a17a6a7578f3.json b/data/reward-bench/infly/INF-ORM-Llama3.1-70B/f81f1f67-6506-481f-87ce-a17a6a7578f3.json deleted file mode 100644 index d0e17fefb..000000000 --- a/data/reward-bench/infly/INF-ORM-Llama3.1-70B/f81f1f67-6506-481f-87ce-a17a6a7578f3.json +++ /dev/null @@ -1,112 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/infly_INF-ORM-Llama3.1-70B/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - 
"source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "infly/INF-ORM-Llama3.1-70B", - "id": "infly/INF-ORM-Llama3.1-70B", - "developer": "infly", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9511 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9665 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9101 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9365 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9912 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/internlm/internlm2-1_8b-reward/32b35218-a099-410e-8a65-a0d6e2f380a6.json b/data/reward-bench/internlm/internlm2-1_8b-reward/32b35218-a099-410e-8a65-a0d6e2f380a6.json deleted file mode 100644 index 009a9841f..000000000 --- a/data/reward-bench/internlm/internlm2-1_8b-reward/32b35218-a099-410e-8a65-a0d6e2f380a6.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/internlm_internlm2-1_8b-reward/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "internlm/internlm2-1_8b-reward", - "id": "internlm/internlm2-1_8b-reward", - "developer": "internlm", - "additional_details": { - "model_type": "Seq. 
Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3902 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2758 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3625 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4426 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4711 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.596 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1934 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/internlm/internlm2-1_8b-reward/deec1e7c-0cb8-4e6f-b3ac-d37790b709f3.json b/data/reward-bench/internlm/internlm2-1_8b-reward/deec1e7c-0cb8-4e6f-b3ac-d37790b709f3.json deleted file mode 100644 index c3fbd28d8..000000000 --- a/data/reward-bench/internlm/internlm2-1_8b-reward/deec1e7c-0cb8-4e6f-b3ac-d37790b709f3.json +++ /dev/null @@ -1,112 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/internlm_internlm2-1_8b-reward/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": 
"RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "internlm/internlm2-1_8b-reward", - "id": "internlm/internlm2-1_8b-reward", - "developer": "internlm", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8217 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9358 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6623 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8162 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8724 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/internlm/internlm2-20b-reward/e42a9986-4dcc-4017-be97-8135646c7424.json b/data/reward-bench/internlm/internlm2-20b-reward/e42a9986-4dcc-4017-be97-8135646c7424.json deleted file mode 100644 index 332851441..000000000 --- a/data/reward-bench/internlm/internlm2-20b-reward/e42a9986-4dcc-4017-be97-8135646c7424.json +++ /dev/null @@ -1,112 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/internlm_internlm2-20b-reward/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "internlm/internlm2-20b-reward", - "id": "internlm/internlm2-20b-reward", - "developer": "internlm", - "additional_details": { - "model_type": "Seq. 
Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9016 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9888 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7654 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8946 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9576 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/internlm/internlm2-20b-reward/ffc92063-606a-4f31-bfdd-5683aa748ccc.json b/data/reward-bench/internlm/internlm2-20b-reward/ffc92063-606a-4f31-bfdd-5683aa748ccc.json deleted file mode 100644 index ceaeec27a..000000000 --- a/data/reward-bench/internlm/internlm2-20b-reward/ffc92063-606a-4f31-bfdd-5683aa748ccc.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/internlm_internlm2-20b-reward/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "internlm/internlm2-20b-reward", - "id": "internlm/internlm2-20b-reward", - "developer": "internlm", - "additional_details": { - "model_type": "Seq. 
Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5628 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5558 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3625 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5738 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6111 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7253 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5483 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/internlm/internlm2-7b-reward/23a5398c-0911-4a66-930d-abada12bf985.json b/data/reward-bench/internlm/internlm2-7b-reward/23a5398c-0911-4a66-930d-abada12bf985.json deleted file mode 100644 index 3c136cd53..000000000 --- a/data/reward-bench/internlm/internlm2-7b-reward/23a5398c-0911-4a66-930d-abada12bf985.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/internlm_internlm2-7b-reward/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", 
- "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "internlm/internlm2-7b-reward", - "id": "internlm/internlm2-7b-reward", - "developer": "internlm", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5335 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4211 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5628 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5956 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7051 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5164 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/internlm/internlm2-7b-reward/80b0bbcb-a57a-453c-8fff-502646520b1d.json b/data/reward-bench/internlm/internlm2-7b-reward/80b0bbcb-a57a-453c-8fff-502646520b1d.json deleted file mode 100644 index 
2273cb29a..000000000 --- a/data/reward-bench/internlm/internlm2-7b-reward/80b0bbcb-a57a-453c-8fff-502646520b1d.json +++ /dev/null @@ -1,112 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/internlm_internlm2-7b-reward/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "internlm/internlm2-7b-reward", - "id": "internlm/internlm2-7b-reward", - "developer": "internlm", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8759 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9916 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6952 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8716 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9453 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/jondurbin/bagel-dpo-34b-v0.5/e383c939-b952-4fdd-94e3-eb3716691860.json b/data/reward-bench/jondurbin/bagel-dpo-34b-v0.5/e383c939-b952-4fdd-94e3-eb3716691860.json deleted file mode 100644 index 090fb16d8..000000000 --- a/data/reward-bench/jondurbin/bagel-dpo-34b-v0.5/e383c939-b952-4fdd-94e3-eb3716691860.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/jondurbin_bagel-dpo-34b-v0.5/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - 
"evaluator_relationship": "third_party" - }, - "model_info": { - "name": "jondurbin/bagel-dpo-34b-v0.5", - "id": "jondurbin/bagel-dpo-34b-v0.5", - "developer": "jondurbin", - "additional_details": { - "model_type": "DPO" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7215 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9385 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5504 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6446 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8889 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4487 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/llm-blender/PairRM-hf/daf873f9-ab03-49df-96cb-a0f5a8613048.json b/data/reward-bench/llm-blender/PairRM-hf/daf873f9-ab03-49df-96cb-a0f5a8613048.json deleted file mode 100644 index ed400926f..000000000 --- a/data/reward-bench/llm-blender/PairRM-hf/daf873f9-ab03-49df-96cb-a0f5a8613048.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/llm-blender_PairRM-hf/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "llm-blender/PairRM-hf", - "id": "llm-blender/PairRM-hf", - 
"developer": "llm-blender", - "additional_details": { - "model_type": "Custom Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6087 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9022 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5219 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.477 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4898 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6961 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/mattshumer/Reflection-70B/f4cff132-3b2f-4e03-bb49-098b16d87cef.json b/data/reward-bench/mattshumer/Reflection-70B/f4cff132-3b2f-4e03-bb49-098b16d87cef.json deleted file mode 100644 index 48058a174..000000000 --- a/data/reward-bench/mattshumer/Reflection-70B/f4cff132-3b2f-4e03-bb49-098b16d87cef.json +++ /dev/null @@ -1,112 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/mattshumer_Reflection-70B/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "mattshumer/Reflection-70B", - "id": "mattshumer/Reflection-70B", - "developer": "mattshumer", - "additional_details": { - "model_type": "Generative" - } - }, - "evaluation_results": 
[ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8422 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9749 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7061 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8318 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8562 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/meta-llama/Meta-Llama-3-70B-Instruct/f80685de-058c-4ab8-aa35-dc7321d1cea6.json b/data/reward-bench/meta-llama/Meta-Llama-3-70B-Instruct/f80685de-058c-4ab8-aa35-dc7321d1cea6.json deleted file mode 100644 index d29586253..000000000 --- a/data/reward-bench/meta-llama/Meta-Llama-3-70B-Instruct/f80685de-058c-4ab8-aa35-dc7321d1cea6.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/meta-llama_Meta-Llama-3-70B-Instruct/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "meta-llama/Meta-Llama-3-70B-Instruct", - "id": "meta-llama/Meta-Llama-3-70B-Instruct", - "developer": "meta-llama", - "additional_details": { - "model_type": "Generative" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7627 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - 
includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9763 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5888 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7297 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7854 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7035 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/meta-llama/Meta-Llama-3-8B-Instruct/c8e4349d-a084-4eb5-990f-403ba930a9ad.json b/data/reward-bench/meta-llama/Meta-Llama-3-8B-Instruct/c8e4349d-a084-4eb5-990f-403ba930a9ad.json deleted file mode 100644 index 21b28918d..000000000 --- a/data/reward-bench/meta-llama/Meta-Llama-3-8B-Instruct/c8e4349d-a084-4eb5-990f-403ba930a9ad.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/meta-llama_Meta-Llama-3-8B-Instruct/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "meta-llama/Meta-Llama-3-8B-Instruct", - "id": "meta-llama/Meta-Llama-3-8B-Instruct", - "developer": "meta-llama", - "additional_details": { - "model_type": "Generative" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.645 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8547 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4156 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6797 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6482 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6082 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo/729ca9c0-0680-49f1-97b9-5581be17a352.json b/data/reward-bench/meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo/729ca9c0-0680-49f1-97b9-5581be17a352.json deleted file mode 100644 index 2f6cdcbbc..000000000 --- a/data/reward-bench/meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo/729ca9c0-0680-49f1-97b9-5581be17a352.json +++ /dev/null @@ -1,112 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/meta-llama_Meta-Llama-3.1-405B-Instruct-Turbo/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo", - "id": "meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo", - "developer": "meta-llama", - "additional_details": { - "model_type": "Generative" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8412 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9721 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7456 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7757 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8715 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo/fdd4add5-b44d-46f9-8c98-da3120df4161.json b/data/reward-bench/meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo/fdd4add5-b44d-46f9-8c98-da3120df4161.json deleted file mode 100644 index 1181fd3d1..000000000 --- a/data/reward-bench/meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo/fdd4add5-b44d-46f9-8c98-da3120df4161.json +++ /dev/null @@ -1,112 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/meta-llama_Meta-Llama-3.1-70B-Instruct-Turbo/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo", - "id": "meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo", - "developer": "meta-llama", - "additional_details": { - "model_type": "Generative" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7808 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8757 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6689 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7507 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.828 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/meta-llama/Meta-Llama-3.1-70B-Instruct/6b5ef643-30dd-4381-b66f-e9ecd6b0d06e.json b/data/reward-bench/meta-llama/Meta-Llama-3.1-70B-Instruct/6b5ef643-30dd-4381-b66f-e9ecd6b0d06e.json deleted file mode 100644 index 2f41e3adc..000000000 --- a/data/reward-bench/meta-llama/Meta-Llama-3.1-70B-Instruct/6b5ef643-30dd-4381-b66f-e9ecd6b0d06e.json +++ /dev/null @@ -1,112 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/meta-llama_Meta-Llama-3.1-70B-Instruct/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "meta-llama/Meta-Llama-3.1-70B-Instruct", - "id": "meta-llama/Meta-Llama-3.1-70B-Instruct", - "developer": "meta-llama", - "additional_details": { - "model_type": "Generative" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8405 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9721 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7018 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.8284 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8599 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo/95271b8c-4135-48bf-bbad-ae94baa37640.json b/data/reward-bench/meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo/95271b8c-4135-48bf-bbad-ae94baa37640.json deleted file mode 100644 index 47e395707..000000000 --- a/data/reward-bench/meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo/95271b8c-4135-48bf-bbad-ae94baa37640.json +++ /dev/null @@ -1,112 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/meta-llama_Meta-Llama-3.1-8B-Instruct-Turbo/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo", - "id": "meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo", - "developer": "meta-llama", - "additional_details": { - "model_type": "Generative" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6565 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8073 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4978 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6399 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.6811 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/meta-metrics/MetaMetrics-RM-v1.0/f437e790-efe1-4dc5-8ccc-5b0bfd800069.json b/data/reward-bench/meta-metrics/MetaMetrics-RM-v1.0/f437e790-efe1-4dc5-8ccc-5b0bfd800069.json deleted file mode 100644 index 5a185daab..000000000 --- a/data/reward-bench/meta-metrics/MetaMetrics-RM-v1.0/f437e790-efe1-4dc5-8ccc-5b0bfd800069.json +++ /dev/null @@ -1,112 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/meta-metrics_MetaMetrics-RM-v1.0/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "meta-metrics/MetaMetrics-RM-v1.0", - "id": "meta-metrics/MetaMetrics-RM-v1.0", - "developer": "meta-metrics", - "additional_details": { - "model_type": "Custom Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9342 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9832 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.864 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9081 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9816 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/mightbe/Better-PairRM/7d0f761a-2650-4029-b1e9-13af2f0cc69d.json b/data/reward-bench/mightbe/Better-PairRM/7d0f761a-2650-4029-b1e9-13af2f0cc69d.json deleted file mode 100644 index 46baee169..000000000 --- a/data/reward-bench/mightbe/Better-PairRM/7d0f761a-2650-4029-b1e9-13af2f0cc69d.json +++ /dev/null @@ -1,130 +0,0 
@@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/mightbe_Better-PairRM/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "mightbe/Better-PairRM", - "id": "mightbe/Better-PairRM", - "developer": "mightbe", - "additional_details": { - "model_type": "Custom Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.673 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9553 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3925 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8203 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4983 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.724 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/mistralai/Mixtral-8x7B-Instruct-v0.1/49fc601e-4ac6-4672-a53d-0e89f19959c1.json b/data/reward-bench/mistralai/Mixtral-8x7B-Instruct-v0.1/49fc601e-4ac6-4672-a53d-0e89f19959c1.json deleted file mode 100644 index 83daa1359..000000000 --- a/data/reward-bench/mistralai/Mixtral-8x7B-Instruct-v0.1/49fc601e-4ac6-4672-a53d-0e89f19959c1.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": 
"reward-bench/mistralai_Mixtral-8x7B-Instruct-v0.1/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "mistralai/Mixtral-8x7B-Instruct-v0.1", - "id": "mistralai/Mixtral-8x7B-Instruct-v0.1", - "developer": "mistralai", - "additional_details": { - "model_type": "DPO" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7455 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9497 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6404 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7257 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7872 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5033 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/my_model/6195e81a-d5a5-40af-96f6-259252009ad7.json b/data/reward-bench/my_model/6195e81a-d5a5-40af-96f6-259252009ad7.json deleted file mode 100644 index 366e80763..000000000 --- a/data/reward-bench/my_model/6195e81a-d5a5-40af-96f6-259252009ad7.json +++ /dev/null @@ -1,112 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/my_model_/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - 
"source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "my_model/", - "id": "my_model/", - "developer": "my_model", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5267 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4553 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5592 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4392 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6532 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/nicolinho/QRM-Gemma-2-27B/2dec0f50-d374-4af3-9d27-80fcf50dac2c.json b/data/reward-bench/nicolinho/QRM-Gemma-2-27B/2dec0f50-d374-4af3-9d27-80fcf50dac2c.json deleted file mode 100644 index e2fe3a9e8..000000000 --- a/data/reward-bench/nicolinho/QRM-Gemma-2-27B/2dec0f50-d374-4af3-9d27-80fcf50dac2c.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/nicolinho_QRM-Gemma-2-27B/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "nicolinho/QRM-Gemma-2-27B", - "id": "nicolinho/QRM-Gemma-2-27B", - "developer": "nicolinho", - "additional_details": { - "model_type": "Seq. 
Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7667 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7853 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3719 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6995 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9578 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9535 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8321 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/nicolinho/QRM-Gemma-2-27B/96722888-0cc9-4dfd-b38d-91f4118c0be2.json b/data/reward-bench/nicolinho/QRM-Gemma-2-27B/96722888-0cc9-4dfd-b38d-91f4118c0be2.json deleted file mode 100644 index d61d3e0af..000000000 --- a/data/reward-bench/nicolinho/QRM-Gemma-2-27B/96722888-0cc9-4dfd-b38d-91f4118c0be2.json +++ /dev/null @@ -1,112 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/nicolinho_QRM-Gemma-2-27B/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - 
"source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "nicolinho/QRM-Gemma-2-27B", - "id": "nicolinho/QRM-Gemma-2-27B", - "developer": "nicolinho", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9444 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9665 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9013 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.927 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9826 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/nicolinho/QRM-Llama3-8B/683abc2a-fce0-4d3d-bdcc-5cac2c76a46a.json b/data/reward-bench/nicolinho/QRM-Llama3-8B/683abc2a-fce0-4d3d-bdcc-5cac2c76a46a.json deleted file mode 100644 index 52e654851..000000000 --- a/data/reward-bench/nicolinho/QRM-Llama3-8B/683abc2a-fce0-4d3d-bdcc-5cac2c76a46a.json +++ /dev/null @@ -1,112 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/nicolinho_QRM-Llama3-8B/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "nicolinho/QRM-Llama3-8B", - "id": "nicolinho/QRM-Llama3-8B", - "developer": "nicolinho", - "additional_details": { - "model_type": "Seq. 
Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.911 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9581 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8114 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8986 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9758 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/nicolinho/QRM-Llama3.1-8B-v2/121344ec-61ef-49c5-a74b-b86f605d513e.json b/data/reward-bench/nicolinho/QRM-Llama3.1-8B-v2/121344ec-61ef-49c5-a74b-b86f605d513e.json deleted file mode 100644 index aef21f0ec..000000000 --- a/data/reward-bench/nicolinho/QRM-Llama3.1-8B-v2/121344ec-61ef-49c5-a74b-b86f605d513e.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/nicolinho_QRM-Llama3.1-8B-v2/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "nicolinho/QRM-Llama3.1-8B-v2", - "id": "nicolinho/QRM-Llama3.1-8B-v2", - "developer": "nicolinho", - "additional_details": { - "model_type": "Seq. 
Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7074 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6653 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4062 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.612 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9467 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8909 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7234 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/nicolinho/QRM-Llama3.1-8B-v2/8594f86b-a7f2-4046-a3a7-830d7ac20690.json b/data/reward-bench/nicolinho/QRM-Llama3.1-8B-v2/8594f86b-a7f2-4046-a3a7-830d7ac20690.json deleted file mode 100644 index 45f32ccdc..000000000 --- a/data/reward-bench/nicolinho/QRM-Llama3.1-8B-v2/8594f86b-a7f2-4046-a3a7-830d7ac20690.json +++ /dev/null @@ -1,112 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/nicolinho_QRM-Llama3.1-8B-v2/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - 
"source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "nicolinho/QRM-Llama3.1-8B-v2", - "id": "nicolinho/QRM-Llama3.1-8B-v2", - "developer": "nicolinho", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9314 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9637 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8684 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9257 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9677 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/nicolinho/QRM-Llama3.1-8B/c0c5e5e1-801c-48fd-a994-a4a69c0b1213.json b/data/reward-bench/nicolinho/QRM-Llama3.1-8B/c0c5e5e1-801c-48fd-a994-a4a69c0b1213.json deleted file mode 100644 index d0517b2db..000000000 --- a/data/reward-bench/nicolinho/QRM-Llama3.1-8B/c0c5e5e1-801c-48fd-a994-a4a69c0b1213.json +++ /dev/null @@ -1,112 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/nicolinho_QRM-Llama3.1-8B/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "nicolinho/QRM-Llama3.1-8B", - "id": "nicolinho/QRM-Llama3.1-8B", - "developer": "nicolinho", - "additional_details": { - "model_type": "Seq. 
Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9306 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9441 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8969 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.923 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9583 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/nvidia/Llama-3.1-Nemotron-70B-Reward/0411ac30-1536-4639-8350-fc11d53298e3.json b/data/reward-bench/nvidia/Llama-3.1-Nemotron-70B-Reward/0411ac30-1536-4639-8350-fc11d53298e3.json deleted file mode 100644 index 639ea033b..000000000 --- a/data/reward-bench/nvidia/Llama-3.1-Nemotron-70B-Reward/0411ac30-1536-4639-8350-fc11d53298e3.json +++ /dev/null @@ -1,112 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/nvidia_Llama-3.1-Nemotron-70B-Reward/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "nvidia/Llama-3.1-Nemotron-70B-Reward", - "id": "nvidia/Llama-3.1-Nemotron-70B-Reward", - "developer": "nvidia", - "additional_details": { - "model_type": "Custom Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9411 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": 
{ - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9749 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8575 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9514 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9807 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/nvidia/Llama3-70B-SteerLM-RM/92281e58-4160-4d76-9119-b38fb47ffd8f.json b/data/reward-bench/nvidia/Llama3-70B-SteerLM-RM/92281e58-4160-4d76-9119-b38fb47ffd8f.json deleted file mode 100644 index 34cb0b116..000000000 --- a/data/reward-bench/nvidia/Llama3-70B-SteerLM-RM/92281e58-4160-4d76-9119-b38fb47ffd8f.json +++ /dev/null @@ -1,112 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/nvidia_Llama3-70B-SteerLM-RM/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "nvidia/Llama3-70B-SteerLM-RM", - "id": "nvidia/Llama3-70B-SteerLM-RM", - "developer": "nvidia", - "additional_details": { - "model_type": "Custom Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8877 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9134 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8026 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9284 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9064 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/nvidia/Nemotron-4-340B-Reward/43687871-2e19-4d2b-9754-1cb6527496c1.json b/data/reward-bench/nvidia/Nemotron-4-340B-Reward/43687871-2e19-4d2b-9754-1cb6527496c1.json deleted file mode 100644 index 81a8ec028..000000000 --- a/data/reward-bench/nvidia/Nemotron-4-340B-Reward/43687871-2e19-4d2b-9754-1cb6527496c1.json +++ /dev/null @@ -1,112 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/nvidia_Nemotron-4-340B-Reward/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "nvidia/Nemotron-4-340B-Reward", - "id": "nvidia/Nemotron-4-340B-Reward", - "developer": "nvidia", - "additional_details": { - "model_type": "Custom Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.92 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9581 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8706 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9149 - }, - "source_data": { - 
"dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9363 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/openai/gpt-3.5-turbo-0125/1debe1de-b394-4856-a946-9d14bd867bf6.json b/data/reward-bench/openai/gpt-3.5-turbo-0125/1debe1de-b394-4856-a946-9d14bd867bf6.json deleted file mode 100644 index 53333618b..000000000 --- a/data/reward-bench/openai/gpt-3.5-turbo-0125/1debe1de-b394-4856-a946-9d14bd867bf6.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/openai_gpt-3.5-turbo-0125/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "openai/gpt-3.5-turbo-0125", - "id": "openai/gpt-3.5-turbo-0125", - "developer": "openai", - "additional_details": { - "model_type": "Generative" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6534 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9218 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4452 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6547 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5912 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior 
Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6548 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/openai/gpt-4-0125-preview/80c589d2-c1eb-4dcf-8be8-042f4f66b7eb.json b/data/reward-bench/openai/gpt-4-0125-preview/80c589d2-c1eb-4dcf-8be8-042f4f66b7eb.json deleted file mode 100644 index 24b1269d5..000000000 --- a/data/reward-bench/openai/gpt-4-0125-preview/80c589d2-c1eb-4dcf-8be8-042f4f66b7eb.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/openai_gpt-4-0125-preview/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "openai/gpt-4-0125-preview", - "id": "openai/gpt-4-0125-preview", - "developer": "openai", - "additional_details": { - "model_type": "Generative" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8434 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9525 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7434 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8757 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8692 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", 
- "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7085 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/openai/gpt-4-turbo-2024-04-09/62478772-bb85-4d3f-a916-c3d17db3ee61.json b/data/reward-bench/openai/gpt-4-turbo-2024-04-09/62478772-bb85-4d3f-a916-c3d17db3ee61.json deleted file mode 100644 index 8fbd65118..000000000 --- a/data/reward-bench/openai/gpt-4-turbo-2024-04-09/62478772-bb85-4d3f-a916-c3d17db3ee61.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/openai_gpt-4-turbo-2024-04-09/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "openai/gpt-4-turbo-2024-04-09", - "id": "openai/gpt-4-turbo-2024-04-09", - "developer": "openai", - "additional_details": { - "model_type": "Generative" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8395 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9525 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7544 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8757 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.827 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, 
- "score_details": { - "score": 0.7363 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/openai/gpt-4.1-2025-04-14/a070bae2-c927-418b-91cc-161781c4f5b7.json b/data/reward-bench/openai/gpt-4.1-2025-04-14/a070bae2-c927-418b-91cc-161781c4f5b7.json deleted file mode 100644 index 7ad659dc4..000000000 --- a/data/reward-bench/openai/gpt-4.1-2025-04-14/a070bae2-c927-418b-91cc-161781c4f5b7.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/openai_gpt-4.1-2025-04-14/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "openai/gpt-4.1-2025-04-14", - "id": "openai/gpt-4.1-2025-04-14", - "developer": "openai", - "additional_details": { - "model_type": "Generative RM" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7232 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8289 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3974 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6521 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8726 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7338 - }, - "source_data": { - "dataset_name": 
"RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8542 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/openai/gpt-4.1-mini-2025-04-14/b884c919-a272-4f67-9a09-3d232f56d083.json b/data/reward-bench/openai/gpt-4.1-mini-2025-04-14/b884c919-a272-4f67-9a09-3d232f56d083.json deleted file mode 100644 index fe081bf47..000000000 --- a/data/reward-bench/openai/gpt-4.1-mini-2025-04-14/b884c919-a272-4f67-9a09-3d232f56d083.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/openai_gpt-4.1-mini-2025-04-14/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "openai/gpt-4.1-mini-2025-04-14", - "id": "openai/gpt-4.1-mini-2025-04-14", - "developer": "openai", - "additional_details": { - "model_type": "Generative RM" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6573 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6084 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4125 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7213 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7265 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": 
"hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7354 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.74 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/openai/gpt-4.1-nano-2025-04-14/deac33dd-187b-4406-a76a-b33caf417380.json b/data/reward-bench/openai/gpt-4.1-nano-2025-04-14/deac33dd-187b-4406-a76a-b33caf417380.json deleted file mode 100644 index 9236ca4a4..000000000 --- a/data/reward-bench/openai/gpt-4.1-nano-2025-04-14/deac33dd-187b-4406-a76a-b33caf417380.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/openai_gpt-4.1-nano-2025-04-14/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "openai/gpt-4.1-nano-2025-04-14", - "id": "openai/gpt-4.1-nano-2025-04-14", - "developer": "openai", - "additional_details": { - "model_type": "Generative RM" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4849 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4646 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2578 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5041 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": 
"allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7156 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.466 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5015 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/openai/gpt-4o-2024-05-13/185bd742-d7d4-4600-86bd-bcda75ed2ebc.json b/data/reward-bench/openai/gpt-4o-2024-05-13/185bd742-d7d4-4600-86bd-bcda75ed2ebc.json deleted file mode 100644 index e598746ee..000000000 --- a/data/reward-bench/openai/gpt-4o-2024-05-13/185bd742-d7d4-4600-86bd-bcda75ed2ebc.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/openai_gpt-4o-2024-05-13/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "openai/gpt-4o-2024-05-13", - "id": "openai/gpt-4o-2024-05-13", - "developer": "openai", - "additional_details": { - "model_type": "Generative" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8327 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9665 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7039 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - 
includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8649 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8487 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7262 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/openai/gpt-4o-2024-08-06/901e4de6-3ef6-4c2a-873c-cdcc47201974.json b/data/reward-bench/openai/gpt-4o-2024-08-06/901e4de6-3ef6-4c2a-873c-cdcc47201974.json deleted file mode 100644 index 92a4b0914..000000000 --- a/data/reward-bench/openai/gpt-4o-2024-08-06/901e4de6-3ef6-4c2a-873c-cdcc47201974.json +++ /dev/null @@ -1,112 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/openai_gpt-4o-2024-08-06/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "openai/gpt-4o-2024-08-06", - "id": "openai/gpt-4o-2024-08-06", - "developer": "openai", - "additional_details": { - "model_type": "Generative" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8673 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9609 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.761 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.8811 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8661 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/openai/gpt-4o-2024-08-06/a051d5d6-18e6-483d-a000-4a52a06de676.json b/data/reward-bench/openai/gpt-4o-2024-08-06/a051d5d6-18e6-483d-a000-4a52a06de676.json deleted file mode 100644 index 44c5bcc27..000000000 --- a/data/reward-bench/openai/gpt-4o-2024-08-06/a051d5d6-18e6-483d-a000-4a52a06de676.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/openai_gpt-4o-2024-08-06/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "openai/gpt-4o-2024-08-06", - "id": "openai/gpt-4o-2024-08-06", - "developer": "openai", - "additional_details": { - "model_type": "Generative RM" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6493 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5684 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3312 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.623 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8619 - }, - "source_data": { - "dataset_name": 
"RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7293 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7819 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/openai/gpt-4o-mini-2024-07-18/94d77182-8952-4a63-b02b-3d8bd8a8dead.json b/data/reward-bench/openai/gpt-4o-mini-2024-07-18/94d77182-8952-4a63-b02b-3d8bd8a8dead.json deleted file mode 100644 index 653bf1c3b..000000000 --- a/data/reward-bench/openai/gpt-4o-mini-2024-07-18/94d77182-8952-4a63-b02b-3d8bd8a8dead.json +++ /dev/null @@ -1,112 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/openai_gpt-4o-mini-2024-07-18/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "openai/gpt-4o-mini-2024-07-18", - "id": "openai/gpt-4o-mini-2024-07-18", - "developer": "openai", - "additional_details": { - "model_type": "Generative" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8007 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9497 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6075 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8081 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - 
"evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8374 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/openai/gpt-4o-mini-2024-07-18/9a48d808-0280-4175-a28a-7e9ba8ac6deb.json b/data/reward-bench/openai/gpt-4o-mini-2024-07-18/9a48d808-0280-4175-a28a-7e9ba8ac6deb.json deleted file mode 100644 index 4e0668e59..000000000 --- a/data/reward-bench/openai/gpt-4o-mini-2024-07-18/9a48d808-0280-4175-a28a-7e9ba8ac6deb.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/openai_gpt-4o-mini-2024-07-18/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "openai/gpt-4o-mini-2024-07-18", - "id": "openai/gpt-4o-mini-2024-07-18", - "developer": "openai", - "additional_details": { - "model_type": "Generative RM" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5796 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4105 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3438 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5191 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7667 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - 
"metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7414 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6962 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/openbmb/Eurus-7b-kto/f0d9f57d-d552-44ea-a91c-751854133316.json b/data/reward-bench/openbmb/Eurus-7b-kto/f0d9f57d-d552-44ea-a91c-751854133316.json deleted file mode 100644 index f4c92b674..000000000 --- a/data/reward-bench/openbmb/Eurus-7b-kto/f0d9f57d-d552-44ea-a91c-751854133316.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/openbmb_Eurus-7b-kto/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "openbmb/Eurus-7b-kto", - "id": "openbmb/Eurus-7b-kto", - "developer": "openbmb", - "additional_details": { - "model_type": "DPO" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.69 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9525 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5373 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6054 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - 
"max_score": 1.0 - }, - "score_details": { - "score": 0.7467 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5261 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/openbmb/Eurus-RM-7b/561cfba1-856d-4809-b5c7-41481735e1d6.json b/data/reward-bench/openbmb/Eurus-RM-7b/561cfba1-856d-4809-b5c7-41481735e1d6.json deleted file mode 100644 index 2ceca85c7..000000000 --- a/data/reward-bench/openbmb/Eurus-RM-7b/561cfba1-856d-4809-b5c7-41481735e1d6.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/openbmb_Eurus-RM-7b/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "openbmb/Eurus-RM-7b", - "id": "openbmb/Eurus-RM-7b", - "developer": "openbmb", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5806 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3438 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5683 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6267 - }, - "source_data": { - 
"dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7475 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5972 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/openbmb/Eurus-RM-7b/995d1caf-b735-44dd-adff-875e3203aa46.json b/data/reward-bench/openbmb/Eurus-RM-7b/995d1caf-b735-44dd-adff-875e3203aa46.json deleted file mode 100644 index 83d393244..000000000 --- a/data/reward-bench/openbmb/Eurus-RM-7b/995d1caf-b735-44dd-adff-875e3203aa46.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/openbmb_Eurus-RM-7b/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "openbmb/Eurus-RM-7b", - "id": "openbmb/Eurus-RM-7b", - "developer": "openbmb", - "additional_details": { - "model_type": "Seq. 
Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8159 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9804 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6557 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8135 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8633 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7172 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/openbmb/MiniCPM-2B-dpo-fp32/81767043-23c2-4229-b3b5-1c24e470d52a.json b/data/reward-bench/openbmb/MiniCPM-2B-dpo-fp32/81767043-23c2-4229-b3b5-1c24e470d52a.json deleted file mode 100644 index 94f6c97ac..000000000 --- a/data/reward-bench/openbmb/MiniCPM-2B-dpo-fp32/81767043-23c2-4229-b3b5-1c24e470d52a.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/openbmb_MiniCPM-2B-dpo-fp32/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "openbmb/MiniCPM-2B-dpo-fp32", - "id": "openbmb/MiniCPM-2B-dpo-fp32", - "developer": "openbmb", - "additional_details": { - "model_type": "DPO" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - 
"evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.673 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8911 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4934 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.573 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8233 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4958 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/openbmb/UltraRM-13b/4f6344bc-af30-46f9-b6f8-41ff925d064e.json b/data/reward-bench/openbmb/UltraRM-13b/4f6344bc-af30-46f9-b6f8-41ff925d064e.json deleted file mode 100644 index 2c68b9ca5..000000000 --- a/data/reward-bench/openbmb/UltraRM-13b/4f6344bc-af30-46f9-b6f8-41ff925d064e.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/openbmb_UltraRM-13b/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "openbmb/UltraRM-13b", - "id": "openbmb/UltraRM-13b", - "developer": "openbmb", - "additional_details": { - "model_type": "Seq. 
Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6903 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9637 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5548 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5986 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6244 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7294 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/openbmb/UltraRM-13b/abac8640-40be-4eb5-9035-2bf6fd436a7a.json b/data/reward-bench/openbmb/UltraRM-13b/abac8640-40be-4eb5-9035-2bf6fd436a7a.json deleted file mode 100644 index 0b4c10b89..000000000 --- a/data/reward-bench/openbmb/UltraRM-13b/abac8640-40be-4eb5-9035-2bf6fd436a7a.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/openbmb_UltraRM-13b/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "openbmb/UltraRM-13b", - "id": "openbmb/UltraRM-13b", - "developer": "openbmb", - "additional_details": { - "model_type": "Seq. 
Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4683 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5063 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3312 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5519 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5089 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6081 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3036 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/opencompass/CompassJudger-1-1.5B-Instruct/6fd972ab-c45f-4ccd-a5cf-4aac5e703342.json b/data/reward-bench/opencompass/CompassJudger-1-1.5B-Instruct/6fd972ab-c45f-4ccd-a5cf-4aac5e703342.json deleted file mode 100644 index 4c9d57828..000000000 --- a/data/reward-bench/opencompass/CompassJudger-1-1.5B-Instruct/6fd972ab-c45f-4ccd-a5cf-4aac5e703342.json +++ /dev/null @@ -1,112 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/opencompass_CompassJudger-1-1.5B-Instruct/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - 
"source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "opencompass/CompassJudger-1-1.5B-Instruct", - "id": "opencompass/CompassJudger-1-1.5B-Instruct", - "developer": "opencompass", - "additional_details": { - "model_type": "Generative" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7344 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9637 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4923 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7818 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6999 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/opencompass/CompassJudger-1-14B-Instruct/8eb1bcf2-a6bd-467c-bc37-090fdb7a9460.json b/data/reward-bench/opencompass/CompassJudger-1-14B-Instruct/8eb1bcf2-a6bd-467c-bc37-090fdb7a9460.json deleted file mode 100644 index 4299d154c..000000000 --- a/data/reward-bench/opencompass/CompassJudger-1-14B-Instruct/8eb1bcf2-a6bd-467c-bc37-090fdb7a9460.json +++ /dev/null @@ -1,112 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/opencompass_CompassJudger-1-14B-Instruct/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "opencompass/CompassJudger-1-14B-Instruct", - "id": "opencompass/CompassJudger-1-14B-Instruct", - "developer": "opencompass", - "additional_details": { - "model_type": "Generative" - } - }, 
- "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8409 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9749 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6228 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8392 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9268 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/opencompass/CompassJudger-1-32B-Instruct/5ad53725-ed5a-41f3-8ff6-7404f3f981db.json b/data/reward-bench/opencompass/CompassJudger-1-32B-Instruct/5ad53725-ed5a-41f3-8ff6-7404f3f981db.json deleted file mode 100644 index 49134a927..000000000 --- a/data/reward-bench/opencompass/CompassJudger-1-32B-Instruct/5ad53725-ed5a-41f3-8ff6-7404f3f981db.json +++ /dev/null @@ -1,112 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/opencompass_CompassJudger-1-32B-Instruct/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "opencompass/CompassJudger-1-32B-Instruct", - "id": "opencompass/CompassJudger-1-32B-Instruct", - "developer": "opencompass", - "additional_details": { - "model_type": "Generative" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8522 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - 
"metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9804 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6513 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8527 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9244 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/opencompass/CompassJudger-1-7B-Instruct/ae2d05b4-5e80-4b00-af67-b94609b073eb.json b/data/reward-bench/opencompass/CompassJudger-1-7B-Instruct/ae2d05b4-5e80-4b00-af67-b94609b073eb.json deleted file mode 100644 index 145d5b3e9..000000000 --- a/data/reward-bench/opencompass/CompassJudger-1-7B-Instruct/ae2d05b4-5e80-4b00-af67-b94609b073eb.json +++ /dev/null @@ -1,112 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/opencompass_CompassJudger-1-7B-Instruct/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "opencompass/CompassJudger-1-7B-Instruct", - "id": "opencompass/CompassJudger-1-7B-Instruct", - "developer": "opencompass", - "additional_details": { - "model_type": "Generative" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8317 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9777 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard 
accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6096 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8446 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8948 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/prometheus-eval/prometheus-7b-v2.0/592f2811-c197-423e-89d4-e25ee5a324fb.json b/data/reward-bench/prometheus-eval/prometheus-7b-v2.0/592f2811-c197-423e-89d4-e25ee5a324fb.json deleted file mode 100644 index e934ef88d..000000000 --- a/data/reward-bench/prometheus-eval/prometheus-7b-v2.0/592f2811-c197-423e-89d4-e25ee5a324fb.json +++ /dev/null @@ -1,112 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/prometheus-eval_prometheus-7b-v2.0/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "prometheus-eval/prometheus-7b-v2.0", - "id": "prometheus-eval/prometheus-7b-v2.0", - "developer": "prometheus-eval", - "additional_details": { - "model_type": "Generative" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7204 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8547 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4912 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": 
"continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7709 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7648 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/prometheus-eval/prometheus-8x7b-v2.0/17795e7b-e912-440f-a80e-63233d3b6d8c.json b/data/reward-bench/prometheus-eval/prometheus-8x7b-v2.0/17795e7b-e912-440f-a80e-63233d3b6d8c.json deleted file mode 100644 index d2deb1a71..000000000 --- a/data/reward-bench/prometheus-eval/prometheus-8x7b-v2.0/17795e7b-e912-440f-a80e-63233d3b6d8c.json +++ /dev/null @@ -1,112 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/prometheus-eval_prometheus-8x7b-v2.0/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "prometheus-eval/prometheus-8x7b-v2.0", - "id": "prometheus-eval/prometheus-8x7b-v2.0", - "developer": "prometheus-eval", - "additional_details": { - "model_type": "Generative" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7451 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9302 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4715 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8047 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - 
"score_details": { - "score": 0.774 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/sfairXC/FsfairX-LLaMA3-RM-v0.1/375cf55f-64f6-42f6-a947-1487feffb196.json b/data/reward-bench/sfairXC/FsfairX-LLaMA3-RM-v0.1/375cf55f-64f6-42f6-a947-1487feffb196.json deleted file mode 100644 index 494a96669..000000000 --- a/data/reward-bench/sfairXC/FsfairX-LLaMA3-RM-v0.1/375cf55f-64f6-42f6-a947-1487feffb196.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/sfairXC_FsfairX-LLaMA3-RM-v0.1/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "sfairXC/FsfairX-LLaMA3-RM-v0.1", - "id": "sfairXC/FsfairX-LLaMA3-RM-v0.1", - "developer": "sfairXC", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8338 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9944 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6513 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8676 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8644 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7492 - }, - "source_data": { - "dataset_name": 
"RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/sfairXC/FsfairX-LLaMA3-RM-v0.1/94d2eddd-f7db-4360-ac58-0af39ce66935.json b/data/reward-bench/sfairXC/FsfairX-LLaMA3-RM-v0.1/94d2eddd-f7db-4360-ac58-0af39ce66935.json deleted file mode 100644 index 8dad45261..000000000 --- a/data/reward-bench/sfairXC/FsfairX-LLaMA3-RM-v0.1/94d2eddd-f7db-4360-ac58-0af39ce66935.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/sfairXC_FsfairX-LLaMA3-RM-v0.1/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "sfairXC/FsfairX-LLaMA3-RM-v0.1", - "id": "sfairXC/FsfairX-LLaMA3-RM-v0.1", - "developer": "sfairXC", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6292 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5916 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4188 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6284 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7667 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7051 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - 
"hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6647 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/stabilityai/stable-code-instruct-3b/996ca604-e01c-4a95-9286-60b6dc04f67d.json b/data/reward-bench/stabilityai/stable-code-instruct-3b/996ca604-e01c-4a95-9286-60b6dc04f67d.json deleted file mode 100644 index 74acfeed3..000000000 --- a/data/reward-bench/stabilityai/stable-code-instruct-3b/996ca604-e01c-4a95-9286-60b6dc04f67d.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/stabilityai_stable-code-instruct-3b/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "stabilityai/stable-code-instruct-3b", - "id": "stabilityai/stable-code-instruct-3b", - "developer": "stabilityai", - "additional_details": { - "model_type": "DPO" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6216 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5782 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5855 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6554 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7528 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior 
Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4506 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/stabilityai/stablelm-2-12b-chat/b6f0089f-d04b-4bcd-be84-ce3bc0d6c2b9.json b/data/reward-bench/stabilityai/stablelm-2-12b-chat/b6f0089f-d04b-4bcd-be84-ce3bc0d6c2b9.json deleted file mode 100644 index 491861cbd..000000000 --- a/data/reward-bench/stabilityai/stablelm-2-12b-chat/b6f0089f-d04b-4bcd-be84-ce3bc0d6c2b9.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/stabilityai_stablelm-2-12b-chat/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "stabilityai/stablelm-2-12b-chat", - "id": "stabilityai/stablelm-2-12b-chat", - "developer": "stabilityai", - "additional_details": { - "model_type": "DPO" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7642 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9665 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5548 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7811 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8945 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score 
(weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4839 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/stabilityai/stablelm-2-zephyr-1_6b/83e15cba-4fec-48f2-9be4-78decbd96f66.json b/data/reward-bench/stabilityai/stablelm-2-zephyr-1_6b/83e15cba-4fec-48f2-9be4-78decbd96f66.json deleted file mode 100644 index 3406eee45..000000000 --- a/data/reward-bench/stabilityai/stablelm-2-zephyr-1_6b/83e15cba-4fec-48f2-9be4-78decbd96f66.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/stabilityai_stablelm-2-zephyr-1_6b/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "stabilityai/stablelm-2-zephyr-1_6b", - "id": "stabilityai/stablelm-2-zephyr-1_6b", - "developer": "stabilityai", - "additional_details": { - "model_type": "DPO" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6574 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9665 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4671 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6027 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6784 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - 
"score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4868 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/stabilityai/stablelm-zephyr-3b/493617c0-37eb-4c83-b175-2507a3647b5e.json b/data/reward-bench/stabilityai/stablelm-zephyr-3b/493617c0-37eb-4c83-b175-2507a3647b5e.json deleted file mode 100644 index 47f135b1d..000000000 --- a/data/reward-bench/stabilityai/stablelm-zephyr-3b/493617c0-37eb-4c83-b175-2507a3647b5e.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/stabilityai_stablelm-zephyr-3b/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "stabilityai/stablelm-zephyr-3b", - "id": "stabilityai/stablelm-zephyr-3b", - "developer": "stabilityai", - "additional_details": { - "model_type": "DPO" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7146 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8631 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6009 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7405 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7573 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - 
"score": 0.5075 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/stanfordnlp/SteamSHP-flan-t5-large/97f494ce-3c9c-4a19-a237-d458be611a0a.json b/data/reward-bench/stanfordnlp/SteamSHP-flan-t5-large/97f494ce-3c9c-4a19-a237-d458be611a0a.json deleted file mode 100644 index 47e44c557..000000000 --- a/data/reward-bench/stanfordnlp/SteamSHP-flan-t5-large/97f494ce-3c9c-4a19-a237-d458be611a0a.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/stanfordnlp_SteamSHP-flan-t5-large/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "stanfordnlp/SteamSHP-flan-t5-large", - "id": "stanfordnlp/SteamSHP-flan-t5-large", - "developer": "stanfordnlp", - "additional_details": { - "model_type": "Custom Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4962 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8575 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3311 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3743 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3563 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6273 - }, - "source_data": { - "dataset_name": 
"RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/stanfordnlp/SteamSHP-flan-t5-xl/f8bf1e92-3cc3-4c7e-9770-485a3074e85f.json b/data/reward-bench/stanfordnlp/SteamSHP-flan-t5-xl/f8bf1e92-3cc3-4c7e-9770-485a3074e85f.json deleted file mode 100644 index 6a0de9161..000000000 --- a/data/reward-bench/stanfordnlp/SteamSHP-flan-t5-xl/f8bf1e92-3cc3-4c7e-9770-485a3074e85f.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/stanfordnlp_SteamSHP-flan-t5-xl/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "stanfordnlp/SteamSHP-flan-t5-xl", - "id": "stanfordnlp/SteamSHP-flan-t5-xl", - "developer": "stanfordnlp", - "additional_details": { - "model_type": "Custom Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5135 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8547 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3684 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3784 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3841 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6498 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": 
"allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/unknown/Cohere March 2024/5bf73fba-520f-4a2f-9296-8240847eb8ec.json b/data/reward-bench/unknown/Cohere March 2024/5bf73fba-520f-4a2f-9296-8240847eb8ec.json deleted file mode 100644 index b2e8c1248..000000000 --- a/data/reward-bench/unknown/Cohere March 2024/5bf73fba-520f-4a2f-9296-8240847eb8ec.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/Cohere March 2024/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Cohere March 2024", - "id": "Cohere March 2024", - "developer": "unknown", - "additional_details": { - "model_type": "Custom Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8511 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9469 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6513 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.877 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9817 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7458 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/unknown/Cohere May 2024/3dd2c89f-64f5-4bbc-a621-791a9f0538b2.json 
b/data/reward-bench/unknown/Cohere May 2024/3dd2c89f-64f5-4bbc-a621-791a9f0538b2.json deleted file mode 100644 index a50e32313..000000000 --- a/data/reward-bench/unknown/Cohere May 2024/3dd2c89f-64f5-4bbc-a621-791a9f0538b2.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/Cohere May 2024/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "Cohere May 2024", - "id": "Cohere May 2024", - "developer": "unknown", - "additional_details": { - "model_type": "Custom Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8816 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9637 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7127 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.923 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9768 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.782 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/unknown/gemini-1.5-flash-8b/ef987556-7277-48d8-ac07-532586773a3a.json b/data/reward-bench/unknown/gemini-1.5-flash-8b/ef987556-7277-48d8-ac07-532586773a3a.json deleted file mode 100644 index dd83d6018..000000000 --- 
a/data/reward-bench/unknown/gemini-1.5-flash-8b/ef987556-7277-48d8-ac07-532586773a3a.json +++ /dev/null @@ -1,112 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/gemini-1.5-flash-8b/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "gemini-1.5-flash-8b", - "id": "gemini-1.5-flash-8b", - "developer": "unknown", - "additional_details": { - "model_type": "Generative" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7601 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9441 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5987 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7399 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7575 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/upstage/SOLAR-10.7B-Instruct-v1.0/add7eddb-7a8b-4c78-9864-c4316a97ce5e.json b/data/reward-bench/upstage/SOLAR-10.7B-Instruct-v1.0/add7eddb-7a8b-4c78-9864-c4316a97ce5e.json deleted file mode 100644 index 5df4ce7b8..000000000 --- a/data/reward-bench/upstage/SOLAR-10.7B-Instruct-v1.0/add7eddb-7a8b-4c78-9864-c4316a97ce5e.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/upstage_SOLAR-10.7B-Instruct-v1.0/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { 
- "name": "upstage/SOLAR-10.7B-Instruct-v1.0", - "id": "upstage/SOLAR-10.7B-Instruct-v1.0", - "developer": "upstage", - "additional_details": { - "model_type": "DPO" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7391 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8156 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6864 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8514 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7252 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4949 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/wenbopan/Faro-Yi-9B-DPO/caf02954-1eed-44eb-b5f4-df47c90828d7.json b/data/reward-bench/wenbopan/Faro-Yi-9B-DPO/caf02954-1eed-44eb-b5f4-df47c90828d7.json deleted file mode 100644 index 78244f3ff..000000000 --- a/data/reward-bench/wenbopan/Faro-Yi-9B-DPO/caf02954-1eed-44eb-b5f4-df47c90828d7.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/wenbopan_Faro-Yi-9B-DPO/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "wenbopan/Faro-Yi-9B-DPO", - "id": "wenbopan/Faro-Yi-9B-DPO", - "developer": "wenbopan", - "additional_details": 
{ - "model_type": "DPO" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6461 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9218 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5307 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5514 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5839 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6395 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/weqweasdas/RM-Gemma-2B/00798930-daa2-4e79-82c6-2cccf1c3a0cb.json b/data/reward-bench/weqweasdas/RM-Gemma-2B/00798930-daa2-4e79-82c6-2cccf1c3a0cb.json deleted file mode 100644 index 8856c5e7a..000000000 --- a/data/reward-bench/weqweasdas/RM-Gemma-2B/00798930-daa2-4e79-82c6-2cccf1c3a0cb.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/weqweasdas_RM-Gemma-2B/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "weqweasdas/RM-Gemma-2B", - "id": "weqweasdas/RM-Gemma-2B", - "developer": "weqweasdas", - "additional_details": { - "model_type": "Seq. 
Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6549 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9441 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4079 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4986 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7637 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6652 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/weqweasdas/RM-Gemma-2B/71658cf8-0189-49dc-847f-b9a9b5faee4a.json b/data/reward-bench/weqweasdas/RM-Gemma-2B/71658cf8-0189-49dc-847f-b9a9b5faee4a.json deleted file mode 100644 index c3e2d4a3c..000000000 --- a/data/reward-bench/weqweasdas/RM-Gemma-2B/71658cf8-0189-49dc-847f-b9a9b5faee4a.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/weqweasdas_RM-Gemma-2B/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "weqweasdas/RM-Gemma-2B", - "id": "weqweasdas/RM-Gemma-2B", - "developer": "weqweasdas", - "additional_details": { - "model_type": "Seq. 
Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3057 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3705 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2812 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4317 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3311 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2343 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.1851 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/weqweasdas/RM-Gemma-7B-4096/3d506b91-5b0d-47e3-a3a0-bc09808bf5b5.json b/data/reward-bench/weqweasdas/RM-Gemma-7B-4096/3d506b91-5b0d-47e3-a3a0-bc09808bf5b5.json deleted file mode 100644 index 81934bf6a..000000000 --- a/data/reward-bench/weqweasdas/RM-Gemma-7B-4096/3d506b91-5b0d-47e3-a3a0-bc09808bf5b5.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/weqweasdas_RM-Gemma-7B-4096/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - 
"source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "weqweasdas/RM-Gemma-7B-4096", - "id": "weqweasdas/RM-Gemma-7B-4096", - "developer": "weqweasdas", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6922 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9497 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5022 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5608 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7511 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7024 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/weqweasdas/RM-Gemma-7B/04c71231-2025-4e1a-b7ed-56b245868089.json b/data/reward-bench/weqweasdas/RM-Gemma-7B/04c71231-2025-4e1a-b7ed-56b245868089.json deleted file mode 100644 index 02ba525c2..000000000 --- a/data/reward-bench/weqweasdas/RM-Gemma-7B/04c71231-2025-4e1a-b7ed-56b245868089.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/weqweasdas_RM-Gemma-7B/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": 
"https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "weqweasdas/RM-Gemma-7B", - "id": "weqweasdas/RM-Gemma-7B", - "developer": "weqweasdas", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6967 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9693 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4978 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5784 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7362 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7069 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/weqweasdas/RM-Gemma-7B/08b2edd0-f8e9-47cd-b19d-53fdc7209917.json b/data/reward-bench/weqweasdas/RM-Gemma-7B/08b2edd0-f8e9-47cd-b19d-53fdc7209917.json deleted file mode 100644 index ed03af39d..000000000 --- a/data/reward-bench/weqweasdas/RM-Gemma-7B/08b2edd0-f8e9-47cd-b19d-53fdc7209917.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/weqweasdas_RM-Gemma-7B/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "weqweasdas/RM-Gemma-7B", - 
"id": "weqweasdas/RM-Gemma-7B", - "developer": "weqweasdas", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4826 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4926 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3937 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6066 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4822 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.497 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4232 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/weqweasdas/RM-Mistral-7B/79a43841-4032-4a20-8b5a-83b4b446d107.json b/data/reward-bench/weqweasdas/RM-Mistral-7B/79a43841-4032-4a20-8b5a-83b4b446d107.json deleted file mode 100644 index 7abde633d..000000000 --- a/data/reward-bench/weqweasdas/RM-Mistral-7B/79a43841-4032-4a20-8b5a-83b4b446d107.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/weqweasdas_RM-Mistral-7B/1766412838.146816", - 
"retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "weqweasdas/RM-Mistral-7B", - "id": "weqweasdas/RM-Mistral-7B", - "developer": "weqweasdas", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7982 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.9665 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6053 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8703 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7736 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.753 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/weqweasdas/RM-Mistral-7B/a2c16ab8-1098-490a-8d0a-392d835427e0.json b/data/reward-bench/weqweasdas/RM-Mistral-7B/a2c16ab8-1098-490a-8d0a-392d835427e0.json deleted file mode 100644 index d53f1986e..000000000 --- a/data/reward-bench/weqweasdas/RM-Mistral-7B/a2c16ab8-1098-490a-8d0a-392d835427e0.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/weqweasdas_RM-Mistral-7B/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": 
"documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "weqweasdas/RM-Mistral-7B", - "id": "weqweasdas/RM-Mistral-7B", - "developer": "weqweasdas", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.596 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5937 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3438 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5956 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6911 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.7293 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6226 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -} diff --git a/data/reward-bench/weqweasdas/hh_rlhf_rm_open_llama_3b/0aa12860-7ebe-49c2-a5af-1926d23e34f8.json b/data/reward-bench/weqweasdas/hh_rlhf_rm_open_llama_3b/0aa12860-7ebe-49c2-a5af-1926d23e34f8.json deleted file mode 100644 index 
dbe32c629..000000000 --- a/data/reward-bench/weqweasdas/hh_rlhf_rm_open_llama_3b/0aa12860-7ebe-49c2-a5af-1926d23e34f8.json +++ /dev/null @@ -1,130 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench/weqweasdas_hh_rlhf_rm_open_llama_3b/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "weqweasdas/hh_rlhf_rm_open_llama_3b", - "id": "weqweasdas/hh_rlhf_rm_open_llama_3b", - "developer": "weqweasdas", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench Score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.5027 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat", - "metric_config": { - "evaluation_description": "Chat accuracy - includes easy chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.8184 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Chat Hard", - "metric_config": { - "evaluation_description": "Chat Hard accuracy - includes hard chat subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3728 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety accuracy - includes safety subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.4149 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Reasoning", - "metric_config": { - "evaluation_description": "Reasoning accuracy - includes code and math subsets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3281 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - }, - { - "evaluation_name": "Prior Sets (0.5 weight)", - "metric_config": { - "evaluation_description": "Prior Sets score (weighted 0.5) - includes test sets", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.6564 - }, - "source_data": { - "dataset_name": "RewardBench", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench" - } - } - ] -} diff --git a/data/reward-bench/weqweasdas/hh_rlhf_rm_open_llama_3b/796d3ec1-9c26-4ead-87cb-4eb866209120.json b/data/reward-bench/weqweasdas/hh_rlhf_rm_open_llama_3b/796d3ec1-9c26-4ead-87cb-4eb866209120.json deleted file mode 100644 index 7e050faee..000000000 --- 
a/data/reward-bench/weqweasdas/hh_rlhf_rm_open_llama_3b/796d3ec1-9c26-4ead-87cb-4eb866209120.json +++ /dev/null @@ -1,148 +0,0 @@ -{ - "schema_version": "0.2.0", - "evaluation_id": "reward-bench-2/weqweasdas_hh_rlhf_rm_open_llama_3b/1766412838.146816", - "retrieved_timestamp": "1766412838.146816", - "source_metadata": { - "source_name": "RewardBench 2", - "source_type": "documentation", - "source_organization_name": "Allen Institute for AI", - "source_organization_url": "https://allenai.org", - "evaluator_relationship": "third_party" - }, - "model_info": { - "name": "weqweasdas/hh_rlhf_rm_open_llama_3b", - "id": "weqweasdas/hh_rlhf_rm_open_llama_3b", - "developer": "weqweasdas", - "additional_details": { - "model_type": "Seq. Classifier" - } - }, - "evaluation_results": [ - { - "evaluation_name": "Score", - "metric_config": { - "evaluation_description": "Overall RewardBench 2 Score (mean of all metrics)", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2498 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Factuality", - "metric_config": { - "evaluation_description": "Factuality score - measures factual accuracy", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3642 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Precise IF", - "metric_config": { - "evaluation_description": "Precise Instruction Following score", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.275 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Math", - "metric_config": { - "evaluation_description": "Math score - measures mathematical reasoning", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.3497 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Safety", - "metric_config": { - "evaluation_description": "Safety score - measures safety awareness", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.24 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Focus", - "metric_config": { - "evaluation_description": "Focus score - measures response focus", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.2384 - }, - "source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - }, - { - "evaluation_name": "Ties", - "metric_config": { - "evaluation_description": "Ties score - ability to identify tie cases", - "lower_is_better": false, - "score_type": "continuous", - "min_score": 0.0, - "max_score": 1.0 - }, - "score_details": { - "score": 0.0315 - }, - 
"source_data": { - "dataset_name": "RewardBench 2", - "source_type": "hf_dataset", - "hf_repo": "allenai/reward-bench-2-results" - } - } - ] -}